summaryrefslogtreecommitdiffstats
path: root/man2
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-15 19:40:15 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-15 19:40:15 +0000
commit399644e47874bff147afb19c89228901ac39340e (patch)
tree1c4c0b733f4c16b5783b41bebb19194a9ef62ad1 /man2
parentInitial commit. (diff)
downloadmanpages-399644e47874bff147afb19c89228901ac39340e.tar.xz
manpages-399644e47874bff147afb19c89228901ac39340e.zip
Adding upstream version 6.05.01.upstream/6.05.01
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'man2')
-rw-r--r--man2/_Exit.21
-rw-r--r--man2/__clone2.21
-rw-r--r--man2/_exit.2138
-rw-r--r--man2/_llseek.21
-rw-r--r--man2/_newselect.21
-rw-r--r--man2/_syscall.2171
-rw-r--r--man2/_sysctl.21
-rw-r--r--man2/accept.2347
-rw-r--r--man2/accept4.21
-rw-r--r--man2/access.2447
-rw-r--r--man2/acct.2136
-rw-r--r--man2/add_key.2298
-rw-r--r--man2/adjtimex.2596
-rw-r--r--man2/afs_syscall.21
-rw-r--r--man2/alarm.281
-rw-r--r--man2/alloc_hugepages.2135
-rw-r--r--man2/arch_prctl.2176
-rw-r--r--man2/arm_fadvise.21
-rw-r--r--man2/arm_fadvise64_64.21
-rw-r--r--man2/arm_sync_file_range.21
-rw-r--r--man2/bdflush.2103
-rw-r--r--man2/bind.2286
-rw-r--r--man2/bpf.21273
-rw-r--r--man2/break.21
-rw-r--r--man2/brk.2153
-rw-r--r--man2/cacheflush.2143
-rw-r--r--man2/capget.2260
-rw-r--r--man2/capset.21
-rw-r--r--man2/chdir.2127
-rw-r--r--man2/chmod.2347
-rw-r--r--man2/chown.2471
-rw-r--r--man2/chown32.21
-rw-r--r--man2/chroot.2166
-rw-r--r--man2/clock_adjtime.21
-rw-r--r--man2/clock_getres.2524
-rw-r--r--man2/clock_gettime.21
-rw-r--r--man2/clock_nanosleep.2253
-rw-r--r--man2/clock_settime.21
-rw-r--r--man2/clone.21944
-rw-r--r--man2/clone2.21
-rw-r--r--man2/clone3.21
-rw-r--r--man2/close.2266
-rw-r--r--man2/close_range.2273
-rw-r--r--man2/connect.2251
-rw-r--r--man2/copy_file_range.2307
-rw-r--r--man2/creat.21
-rw-r--r--man2/create_module.272
-rw-r--r--man2/delete_module.2205
-rw-r--r--man2/dup.2284
-rw-r--r--man2/dup2.21
-rw-r--r--man2/dup3.21
-rw-r--r--man2/epoll_create.2144
-rw-r--r--man2/epoll_create1.21
-rw-r--r--man2/epoll_ctl.2429
-rw-r--r--man2/epoll_pwait.21
-rw-r--r--man2/epoll_pwait2.21
-rw-r--r--man2/epoll_wait.2288
-rw-r--r--man2/eventfd.2443
-rw-r--r--man2/eventfd2.21
-rw-r--r--man2/execve.2884
-rw-r--r--man2/execveat.2220
-rw-r--r--man2/exit.21
-rw-r--r--man2/exit_group.238
-rw-r--r--man2/faccessat.21
-rw-r--r--man2/faccessat2.21
-rw-r--r--man2/fadvise64.21
-rw-r--r--man2/fadvise64_64.21
-rw-r--r--man2/fallocate.2481
-rw-r--r--man2/fanotify_init.2542
-rw-r--r--man2/fanotify_mark.2843
-rw-r--r--man2/fattach.21
-rw-r--r--man2/fchdir.21
-rw-r--r--man2/fchmod.21
-rw-r--r--man2/fchmodat.21
-rw-r--r--man2/fchown.21
-rw-r--r--man2/fchown32.21
-rw-r--r--man2/fchownat.21
-rw-r--r--man2/fcntl.22111
-rw-r--r--man2/fcntl64.21
-rw-r--r--man2/fdatasync.21
-rw-r--r--man2/fdetach.21
-rw-r--r--man2/fgetxattr.21
-rw-r--r--man2/finit_module.21
-rw-r--r--man2/flistxattr.21
-rw-r--r--man2/flock.2267
-rw-r--r--man2/fork.2348
-rw-r--r--man2/free_hugepages.21
-rw-r--r--man2/fremovexattr.21
-rw-r--r--man2/fsetxattr.21
-rw-r--r--man2/fstat.21
-rw-r--r--man2/fstat64.21
-rw-r--r--man2/fstatat.21
-rw-r--r--man2/fstatat64.21
-rw-r--r--man2/fstatfs.21
-rw-r--r--man2/fstatfs64.21
-rw-r--r--man2/fsync.2195
-rw-r--r--man2/ftruncate.21
-rw-r--r--man2/ftruncate64.21
-rw-r--r--man2/futex.21976
-rw-r--r--man2/futimesat.2128
-rw-r--r--man2/get_kernel_syms.288
-rw-r--r--man2/get_mempolicy.2235
-rw-r--r--man2/get_robust_list.2156
-rw-r--r--man2/get_thread_area.21
-rw-r--r--man2/getcpu.2147
-rw-r--r--man2/getcwd.22
-rw-r--r--man2/getdents.2319
-rw-r--r--man2/getdents64.21
-rw-r--r--man2/getdomainname.2122
-rw-r--r--man2/getegid.21
-rw-r--r--man2/getegid32.21
-rw-r--r--man2/geteuid.21
-rw-r--r--man2/geteuid32.21
-rw-r--r--man2/getgid.270
-rw-r--r--man2/getgid32.21
-rw-r--r--man2/getgroups.2219
-rw-r--r--man2/getgroups32.21
-rw-r--r--man2/gethostname.2176
-rw-r--r--man2/getitimer.2278
-rw-r--r--man2/getmsg.21
-rw-r--r--man2/getpagesize.289
-rw-r--r--man2/getpeername.2116
-rw-r--r--man2/getpgid.21
-rw-r--r--man2/getpgrp.21
-rw-r--r--man2/getpid.2150
-rw-r--r--man2/getpmsg.21
-rw-r--r--man2/getppid.21
-rw-r--r--man2/getpriority.2209
-rw-r--r--man2/getrandom.2295
-rw-r--r--man2/getresgid.21
-rw-r--r--man2/getresgid32.21
-rw-r--r--man2/getresuid.270
-rw-r--r--man2/getresuid32.21
-rw-r--r--man2/getrlimit.2854
-rw-r--r--man2/getrusage.2254
-rw-r--r--man2/getsid.275
-rw-r--r--man2/getsockname.285
-rw-r--r--man2/getsockopt.2172
-rw-r--r--man2/gettid.274
-rw-r--r--man2/gettimeofday.2296
-rw-r--r--man2/getuid.280
-rw-r--r--man2/getuid32.21
-rw-r--r--man2/getunwind.287
-rw-r--r--man2/getxattr.2143
-rw-r--r--man2/gtty.21
-rw-r--r--man2/idle.244
-rw-r--r--man2/inb.21
-rw-r--r--man2/inb_p.21
-rw-r--r--man2/init_module.2342
-rw-r--r--man2/inl.21
-rw-r--r--man2/inl_p.21
-rw-r--r--man2/inotify_add_watch.2135
-rw-r--r--man2/inotify_init.297
-rw-r--r--man2/inotify_init1.21
-rw-r--r--man2/inotify_rm_watch.260
-rw-r--r--man2/insb.21
-rw-r--r--man2/insl.21
-rw-r--r--man2/insw.21
-rw-r--r--man2/intro.2115
-rw-r--r--man2/inw.21
-rw-r--r--man2/inw_p.21
-rw-r--r--man2/io_cancel.2106
-rw-r--r--man2/io_destroy.297
-rw-r--r--man2/io_getevents.2137
-rw-r--r--man2/io_setup.2114
-rw-r--r--man2/io_submit.2289
-rw-r--r--man2/ioctl.2185
-rw-r--r--man2/ioctl_console.2903
-rw-r--r--man2/ioctl_fat.2489
-rw-r--r--man2/ioctl_ficlone.21
-rw-r--r--man2/ioctl_ficlonerange.2129
-rw-r--r--man2/ioctl_fideduperange.2200
-rw-r--r--man2/ioctl_fslabel.272
-rw-r--r--man2/ioctl_getfsmap.2351
-rw-r--r--man2/ioctl_iflags.2202
-rw-r--r--man2/ioctl_ns.2342
-rw-r--r--man2/ioctl_pipe.264
-rw-r--r--man2/ioctl_tty.2913
-rw-r--r--man2/ioctl_userfaultfd.2906
-rw-r--r--man2/ioperm.2105
-rw-r--r--man2/iopl.292
-rw-r--r--man2/ioprio_get.21
-rw-r--r--man2/ioprio_set.2362
-rw-r--r--man2/ipc.263
-rw-r--r--man2/isastream.21
-rw-r--r--man2/kcmp.2420
-rw-r--r--man2/kexec_file_load.21
-rw-r--r--man2/kexec_load.2331
-rw-r--r--man2/keyctl.22297
-rw-r--r--man2/kill.2165
-rw-r--r--man2/landlock_add_rule.2131
-rw-r--r--man2/landlock_create_ruleset.2124
-rw-r--r--man2/landlock_restrict_self.2116
-rw-r--r--man2/lchown.21
-rw-r--r--man2/lchown32.21
-rw-r--r--man2/lgetxattr.21
-rw-r--r--man2/link.2425
-rw-r--r--man2/linkat.21
-rw-r--r--man2/listen.2155
-rw-r--r--man2/listxattr.2322
-rw-r--r--man2/llistxattr.21
-rw-r--r--man2/llseek.292
-rw-r--r--man2/lock.21
-rw-r--r--man2/lookup_dcookie.286
-rw-r--r--man2/lremovexattr.21
-rw-r--r--man2/lseek.2252
-rw-r--r--man2/lsetxattr.21
-rw-r--r--man2/lstat.21
-rw-r--r--man2/lstat64.21
-rw-r--r--man2/madvise.2898
-rw-r--r--man2/madvise1.21
-rw-r--r--man2/mbind.2486
-rw-r--r--man2/membarrier.2460
-rw-r--r--man2/memfd_create.2545
-rw-r--r--man2/memfd_secret.2204
-rw-r--r--man2/migrate_pages.2174
-rw-r--r--man2/mincore.2158
-rw-r--r--man2/mkdir.2250
-rw-r--r--man2/mkdirat.21
-rw-r--r--man2/mknod.2302
-rw-r--r--man2/mknodat.21
-rw-r--r--man2/mlock.2507
-rw-r--r--man2/mlock2.21
-rw-r--r--man2/mlockall.21
-rw-r--r--man2/mmap.21035
-rw-r--r--man2/mmap2.285
-rw-r--r--man2/modify_ldt.2196
-rw-r--r--man2/mount.2971
-rw-r--r--man2/mount_setattr.21055
-rw-r--r--man2/move_pages.2253
-rw-r--r--man2/mprotect.2363
-rw-r--r--man2/mpx.21
-rw-r--r--man2/mq_getsetattr.233
-rw-r--r--man2/mq_notify.22
-rw-r--r--man2/mq_open.22
-rw-r--r--man2/mq_timedreceive.22
-rw-r--r--man2/mq_timedsend.22
-rw-r--r--man2/mq_unlink.22
-rw-r--r--man2/mremap.2352
-rw-r--r--man2/msgctl.2424
-rw-r--r--man2/msgget.2217
-rw-r--r--man2/msgop.2684
-rw-r--r--man2/msgrcv.21
-rw-r--r--man2/msgsnd.21
-rw-r--r--man2/msync.2140
-rw-r--r--man2/munlock.21
-rw-r--r--man2/munlockall.21
-rw-r--r--man2/munmap.21
-rw-r--r--man2/name_to_handle_at.21
-rw-r--r--man2/nanosleep.2220
-rw-r--r--man2/newfstatat.21
-rw-r--r--man2/nfsservctl.270
-rw-r--r--man2/nice.2118
-rw-r--r--man2/oldfstat.21
-rw-r--r--man2/oldlstat.21
-rw-r--r--man2/oldolduname.21
-rw-r--r--man2/oldstat.21
-rw-r--r--man2/olduname.21
-rw-r--r--man2/open.21934
-rw-r--r--man2/open_by_handle_at.2751
-rw-r--r--man2/openat.21
-rw-r--r--man2/openat2.2582
-rw-r--r--man2/outb.284
-rw-r--r--man2/outb_p.21
-rw-r--r--man2/outl.21
-rw-r--r--man2/outl_p.21
-rw-r--r--man2/outsb.21
-rw-r--r--man2/outsl.21
-rw-r--r--man2/outsw.21
-rw-r--r--man2/outw.21
-rw-r--r--man2/outw_p.21
-rw-r--r--man2/pause.250
-rw-r--r--man2/pciconfig_iobase.21
-rw-r--r--man2/pciconfig_read.2122
-rw-r--r--man2/pciconfig_write.21
-rw-r--r--man2/perf_event_open.23989
-rw-r--r--man2/perfmonctl.2193
-rw-r--r--man2/personality.2296
-rw-r--r--man2/phys.21
-rw-r--r--man2/pidfd_getfd.2144
-rw-r--r--man2/pidfd_open.2269
-rw-r--r--man2/pidfd_send_signal.2240
-rw-r--r--man2/pipe.2304
-rw-r--r--man2/pipe2.21
-rw-r--r--man2/pivot_root.2409
-rw-r--r--man2/pkey_alloc.2115
-rw-r--r--man2/pkey_free.21
-rw-r--r--man2/pkey_mprotect.21
-rw-r--r--man2/poll.2649
-rw-r--r--man2/posix_fadvise.2227
-rw-r--r--man2/ppoll.21
-rw-r--r--man2/prctl.22544
-rw-r--r--man2/pread.2146
-rw-r--r--man2/pread64.21
-rw-r--r--man2/preadv.21
-rw-r--r--man2/preadv2.21
-rw-r--r--man2/prlimit.21
-rw-r--r--man2/prlimit64.21
-rw-r--r--man2/process_madvise.2209
-rw-r--r--man2/process_vm_readv.2314
-rw-r--r--man2/process_vm_writev.21
-rw-r--r--man2/prof.21
-rw-r--r--man2/pselect.21
-rw-r--r--man2/pselect6.21
-rw-r--r--man2/ptrace.22974
-rw-r--r--man2/putmsg.21
-rw-r--r--man2/putpmsg.21
-rw-r--r--man2/pwrite.21
-rw-r--r--man2/pwrite64.21
-rw-r--r--man2/pwritev.21
-rw-r--r--man2/pwritev2.21
-rw-r--r--man2/query_module.2194
-rw-r--r--man2/quotactl.2806
-rw-r--r--man2/read.2245
-rw-r--r--man2/readahead.299
-rw-r--r--man2/readdir.2116
-rw-r--r--man2/readlink.2331
-rw-r--r--man2/readlinkat.21
-rw-r--r--man2/readv.2427
-rw-r--r--man2/reboot.2236
-rw-r--r--man2/recv.2563
-rw-r--r--man2/recvfrom.21
-rw-r--r--man2/recvmmsg.2276
-rw-r--r--man2/recvmsg.21
-rw-r--r--man2/remap_file_pages.2170
-rw-r--r--man2/removexattr.2100
-rw-r--r--man2/rename.2549
-rw-r--r--man2/renameat.21
-rw-r--r--man2/renameat2.21
-rw-r--r--man2/request_key.2562
-rw-r--r--man2/restart_syscall.2123
-rw-r--r--man2/rmdir.2128
-rw-r--r--man2/rt_sigaction.21
-rw-r--r--man2/rt_sigpending.21
-rw-r--r--man2/rt_sigprocmask.21
-rw-r--r--man2/rt_sigqueueinfo.2195
-rw-r--r--man2/rt_sigreturn.21
-rw-r--r--man2/rt_sigsuspend.21
-rw-r--r--man2/rt_sigtimedwait.21
-rw-r--r--man2/rt_tgsigqueueinfo.21
-rw-r--r--man2/s390_guarded_storage.2162
-rw-r--r--man2/s390_pci_mmio_read.21
-rw-r--r--man2/s390_pci_mmio_write.294
-rw-r--r--man2/s390_runtime_instr.2104
-rw-r--r--man2/s390_sthyi.2133
-rw-r--r--man2/sbrk.21
-rw-r--r--man2/sched_get_priority_max.2112
-rw-r--r--man2/sched_get_priority_min.21
-rw-r--r--man2/sched_getaffinity.21
-rw-r--r--man2/sched_getattr.21
-rw-r--r--man2/sched_getparam.21
-rw-r--r--man2/sched_getscheduler.21
-rw-r--r--man2/sched_rr_get_interval.2110
-rw-r--r--man2/sched_setaffinity.2427
-rw-r--r--man2/sched_setattr.2447
-rw-r--r--man2/sched_setparam.2121
-rw-r--r--man2/sched_setscheduler.2232
-rw-r--r--man2/sched_yield.276
-rw-r--r--man2/seccomp.21245
-rw-r--r--man2/seccomp_unotify.22011
-rw-r--r--man2/security.21
-rw-r--r--man2/select.2765
-rw-r--r--man2/select_tut.2638
-rw-r--r--man2/semctl.2623
-rw-r--r--man2/semget.2434
-rw-r--r--man2/semop.2523
-rw-r--r--man2/semtimedop.21
-rw-r--r--man2/send.2506
-rw-r--r--man2/sendfile.2236
-rw-r--r--man2/sendfile64.21
-rw-r--r--man2/sendmmsg.2232
-rw-r--r--man2/sendmsg.21
-rw-r--r--man2/sendto.21
-rw-r--r--man2/set_mempolicy.2325
-rw-r--r--man2/set_robust_list.21
-rw-r--r--man2/set_thread_area.2229
-rw-r--r--man2/set_tid_address.297
-rw-r--r--man2/setdomainname.21
-rw-r--r--man2/setegid.21
-rw-r--r--man2/seteuid.2134
-rw-r--r--man2/setfsgid.2109
-rw-r--r--man2/setfsgid32.21
-rw-r--r--man2/setfsuid.2127
-rw-r--r--man2/setfsuid32.21
-rw-r--r--man2/setgid.292
-rw-r--r--man2/setgid32.21
-rw-r--r--man2/setgroups.21
-rw-r--r--man2/setgroups32.21
-rw-r--r--man2/sethostname.21
-rw-r--r--man2/setitimer.21
-rw-r--r--man2/setns.2419
-rw-r--r--man2/setpgid.2329
-rw-r--r--man2/setpgrp.21
-rw-r--r--man2/setpriority.21
-rw-r--r--man2/setregid.21
-rw-r--r--man2/setregid32.21
-rw-r--r--man2/setresgid.21
-rw-r--r--man2/setresgid32.21
-rw-r--r--man2/setresuid.2147
-rw-r--r--man2/setresuid32.21
-rw-r--r--man2/setreuid.2193
-rw-r--r--man2/setreuid32.21
-rw-r--r--man2/setrlimit.21
-rw-r--r--man2/setsid.2100
-rw-r--r--man2/setsockopt.21
-rw-r--r--man2/settimeofday.21
-rw-r--r--man2/setuid.2156
-rw-r--r--man2/setuid32.21
-rw-r--r--man2/setup.255
-rw-r--r--man2/setxattr.2159
-rw-r--r--man2/sgetmask.270
-rw-r--r--man2/shmat.21
-rw-r--r--man2/shmctl.2490
-rw-r--r--man2/shmdt.21
-rw-r--r--man2/shmget.2410
-rw-r--r--man2/shmop.2507
-rw-r--r--man2/shutdown.298
-rw-r--r--man2/sigaction.21203
-rw-r--r--man2/sigaltstack.2363
-rw-r--r--man2/signal.2280
-rw-r--r--man2/signalfd.2521
-rw-r--r--man2/signalfd4.21
-rw-r--r--man2/sigpending.2110
-rw-r--r--man2/sigprocmask.2224
-rw-r--r--man2/sigreturn.2151
-rw-r--r--man2/sigsuspend.2131
-rw-r--r--man2/sigtimedwait.21
-rw-r--r--man2/sigwaitinfo.2231
-rw-r--r--man2/socket.2493
-rw-r--r--man2/socketcall.2185
-rw-r--r--man2/socketpair.2116
-rw-r--r--man2/splice.2266
-rw-r--r--man2/spu_create.2276
-rw-r--r--man2/spu_run.2260
-rw-r--r--man2/ssetmask.21
-rw-r--r--man2/stat.2539
-rw-r--r--man2/stat64.21
-rw-r--r--man2/statfs.2389
-rw-r--r--man2/statfs64.21
-rw-r--r--man2/statx.2614
-rw-r--r--man2/stime.273
-rw-r--r--man2/stty.21
-rw-r--r--man2/subpage_prot.2118
-rw-r--r--man2/swapoff.21
-rw-r--r--man2/swapon.2197
-rw-r--r--man2/symlink.2265
-rw-r--r--man2/symlinkat.21
-rw-r--r--man2/sync.2146
-rw-r--r--man2/sync_file_range.2213
-rw-r--r--man2/sync_file_range2.21
-rw-r--r--man2/syncfs.21
-rw-r--r--man2/syscall.2367
-rw-r--r--man2/syscalls.21168
-rw-r--r--man2/sysctl.2158
-rw-r--r--man2/sysfs.297
-rw-r--r--man2/sysinfo.2106
-rw-r--r--man2/syslog.2378
-rw-r--r--man2/tee.2199
-rw-r--r--man2/tgkill.21
-rw-r--r--man2/time.2117
-rw-r--r--man2/timer_create.2487
-rw-r--r--man2/timer_delete.258
-rw-r--r--man2/timer_getoverrun.2134
-rw-r--r--man2/timer_gettime.21
-rw-r--r--man2/timer_settime.2187
-rw-r--r--man2/timerfd_create.2700
-rw-r--r--man2/timerfd_gettime.21
-rw-r--r--man2/timerfd_settime.21
-rw-r--r--man2/times.2222
-rw-r--r--man2/tkill.2130
-rw-r--r--man2/truncate.2251
-rw-r--r--man2/truncate64.21
-rw-r--r--man2/tuxcall.21
-rw-r--r--man2/ugetrlimit.21
-rw-r--r--man2/umask.2149
-rw-r--r--man2/umount.2214
-rw-r--r--man2/umount2.21
-rw-r--r--man2/uname.2134
-rw-r--r--man2/unimplemented.248
-rw-r--r--man2/unlink.2298
-rw-r--r--man2/unlinkat.21
-rw-r--r--man2/unshare.2572
-rw-r--r--man2/uselib.2106
-rw-r--r--man2/userfaultfd.2943
-rw-r--r--man2/ustat.2104
-rw-r--r--man2/utime.2179
-rw-r--r--man2/utimensat.2613
-rw-r--r--man2/utimes.21
-rw-r--r--man2/vfork.2316
-rw-r--r--man2/vhangup.258
-rw-r--r--man2/vm86.258
-rw-r--r--man2/vm86old.21
-rw-r--r--man2/vmsplice.2162
-rw-r--r--man2/vserver.21
-rw-r--r--man2/wait.2720
-rw-r--r--man2/wait3.21
-rw-r--r--man2/wait4.2169
-rw-r--r--man2/waitid.21
-rw-r--r--man2/waitpid.21
-rw-r--r--man2/write.2329
-rw-r--r--man2/writev.21
501 files changed, 99127 insertions, 0 deletions
diff --git a/man2/_Exit.2 b/man2/_Exit.2
new file mode 100644
index 0000000..9f9d2e7
--- /dev/null
+++ b/man2/_Exit.2
@@ -0,0 +1 @@
+.so man2/_exit.2
diff --git a/man2/__clone2.2 b/man2/__clone2.2
new file mode 100644
index 0000000..68f41a5
--- /dev/null
+++ b/man2/__clone2.2
@@ -0,0 +1 @@
+.so man2/clone.2
diff --git a/man2/_exit.2 b/man2/_exit.2
new file mode 100644
index 0000000..22cccd9
--- /dev/null
+++ b/man2/_exit.2
@@ -0,0 +1,138 @@
+.\" This manpage is Copyright (C) 1992 Drew Eckhardt;
+.\" and Copyright (C) 1993 Michael Haardt, Ian Jackson.
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified Wed Jul 21 23:02:38 1993 by Rik Faith <faith@cs.unc.edu>
+.\" Modified 2001-11-17, aeb
+.\"
+.TH _exit 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+_exit, _Exit \- terminate the calling process
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <unistd.h>
+.PP
+.BI "[[noreturn]] void _exit(int " status );
+.PP
+.B #include <stdlib.h>
+.PP
+.BI "[[noreturn]] void _Exit(int " status );
+.fi
+.PP
+.RS -4
+Feature Test Macro Requirements for glibc (see
+.BR feature_test_macros (7)):
+.RE
+.PP
+.BR _Exit ():
+.nf
+ _ISOC99_SOURCE || _POSIX_C_SOURCE >= 200112L
+.fi
+.SH DESCRIPTION
+.BR _exit ()
+terminates the calling process "immediately".
+Any open file descriptors belonging to the process are closed.
+Any children of the process are inherited by
+.BR init (1)
+(or by the nearest "subreaper" process as defined through the use of the
+.BR prctl (2)
+.B PR_SET_CHILD_SUBREAPER
+operation).
+The process's parent is sent a
+.B SIGCHLD
+signal.
+.PP
+The value
+.I "status & 0xFF"
+is returned to the parent process as the process's exit status, and
+can be collected by the parent using one of the
+.BR wait (2)
+family of calls.
+.PP
+The function
+.BR _Exit ()
+is equivalent to
+.BR _exit ().
+.SH RETURN VALUE
+These functions do not return.
+.SH STANDARDS
+.TP
+.BR _exit ()
+POSIX.1-2008.
+.TP
+.BR _Exit ()
+C11, POSIX.1-2008.
+.SH HISTORY
+POSIX.1-2001, SVr4, 4.3BSD.
+.PP
+.BR _Exit ()
+was introduced by C99.
+.SH NOTES
+For a discussion on the effects of an exit, the transmission of
+exit status, zombie processes, signals sent, and so on, see
+.BR exit (3).
+.PP
+The function
+.BR _exit ()
+is like
+.BR exit (3),
+but does not call any
+functions registered with
+.BR atexit (3)
+or
+.BR on_exit (3).
+Open
+.BR stdio (3)
+streams are not flushed.
+On the other hand,
+.BR _exit ()
+does close open file descriptors, and this may cause an unknown delay,
+waiting for pending output to finish.
+If the delay is undesired,
+it may be useful to call functions like
+.BR tcflush (3)
+before calling
+.BR _exit ().
+Whether any pending I/O is canceled, and which pending I/O may be
+canceled upon
+.BR _exit (),
+is implementation-dependent.
+.SS C library/kernel differences
+The text above in DESCRIPTION describes the traditional effect of
+.BR _exit (),
+which is to terminate a process,
+and these are the semantics specified by POSIX.1 and implemented
+by the C library wrapper function.
+On modern systems, this means termination of all threads in the process.
+.PP
+By contrast with the C library wrapper function, the raw Linux
+.BR _exit ()
+system call terminates only the calling thread, and actions such as
+reparenting child processes or sending
+.B SIGCHLD
+to the parent process are performed only if this is
+the last thread in the thread group.
+.\" _exit() is used by pthread_exit() to terminate the calling thread
+.PP
+Before glibc 2.3, the
+.BR _exit ()
+wrapper function invoked the kernel system call of the same name.
+Since glibc 2.3, the wrapper function invokes
+.BR exit_group (2),
+in order to terminate all of the threads in a process.
+.SH SEE ALSO
+.BR execve (2),
+.BR exit_group (2),
+.BR fork (2),
+.BR kill (2),
+.BR wait (2),
+.BR wait4 (2),
+.BR waitpid (2),
+.BR atexit (3),
+.BR exit (3),
+.BR on_exit (3),
+.BR termios (3)
diff --git a/man2/_llseek.2 b/man2/_llseek.2
new file mode 100644
index 0000000..d15dbee
--- /dev/null
+++ b/man2/_llseek.2
@@ -0,0 +1 @@
+.so man2/llseek.2
diff --git a/man2/_newselect.2 b/man2/_newselect.2
new file mode 100644
index 0000000..e177843
--- /dev/null
+++ b/man2/_newselect.2
@@ -0,0 +1 @@
+.so man2/select.2
diff --git a/man2/_syscall.2 b/man2/_syscall.2
new file mode 100644
index 0000000..ef6542f
--- /dev/null
+++ b/man2/_syscall.2
@@ -0,0 +1,171 @@
+.\" Copyright (c) 1993 Michael Haardt (michael@moria.de),
+.\" Fri Apr 2 11:32:09 MET DST 1993
+.\"
+.\" SPDX-License-Identifier: GPL-2.0-or-later
+.\"
+.\" Tue Jul 6 12:42:46 MDT 1993 <dminer@nyx.cs.du.edu>
+.\" Added "Calling Directly" and supporting paragraphs
+.\"
+.\" Modified Sat Jul 24 15:19:12 1993 by Rik Faith <faith@cs.unc.edu>
+.\"
+.\" Modified 21 Aug 1994 by Michael Chastain <mec@shell.portal.com>:
+.\" Added explanation of arg stacking when 6 or more args.
+.\"
+.\" Modified 10 June 1995 by Andries Brouwer <aeb@cwi.nl>
+.\"
+.\" 2007-10-23 mtk: created as a new page, by taking the content
+.\" specific to the _syscall() macros from intro(2).
+.\"
+.TH _syscall 2 2023-05-03 "Linux man-pages 6.05.01"
+.SH NAME
+_syscall \- invoking a system call without library support (OBSOLETE)
+.SH SYNOPSIS
+.nf
+.B #include <linux/unistd.h>
+.PP
+A _syscall macro
+.PP
+desired system call
+.fi
+.SH DESCRIPTION
+The important thing to know about a system call is its prototype.
+You need to know how many arguments, their types,
+and the function return type.
+There are seven macros that make the actual call into the system easier.
+They have the form:
+.PP
+.in +4n
+.EX
+.RI _syscall X ( type , name , type1 , arg1 , type2 , arg2 ,...)
+.EE
+.in
+.PP
+where
+.IP
+.I X
+is 0\[en]6, which is the number of arguments taken by the
+system call
+.IP
+.I type
+is the return type of the system call
+.IP
+.I name
+is the name of the system call
+.IP
+.I typeN
+is the Nth argument's type
+.IP
+.I argN
+is the name of the Nth argument
+.PP
+These macros create a function called
+.I name
+with the arguments you
+specify.
+Once you include the _syscall() in your source file,
+you call the system call by
+.IR name .
+.SH FILES
+.I /usr/include/linux/unistd.h
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Starting around Linux 2.6.18, the _syscall macros were removed
+from header files supplied to user space.
+Use
+.BR syscall (2)
+instead.
+(Some architectures, notably ia64, never provided the _syscall macros;
+on those architectures,
+.BR syscall (2)
+was always required.)
+.SH NOTES
+The _syscall() macros
+.I "do not"
+produce a prototype.
+You may have to
+create one, especially for C++ users.
+.PP
+System calls are not required to return only positive or negative error
+codes.
+You need to read the source to be sure how it will return errors.
+Usually, it is the negative of a standard error code,
+for example,
+.RI \- EPERM .
+The _syscall() macros will return the result
+.I r
+of the system call
+when
+.I r
+is nonnegative, but will return \-1 and set the variable
+.I errno
+to
+.RI \- r
+when
+.I r
+is negative.
+For the error codes, see
+.BR errno (3).
+.PP
+When defining a system call, the argument types
+.I must
+be
+passed by-value or by-pointer (for aggregates like structs).
+.\" The preferred way to invoke system calls that glibc does not know
+.\" about yet is via
+.\" .BR syscall (2).
+.\" However, this mechanism can be used only if using a libc
+.\" (such as glibc) that supports
+.\" .BR syscall (2),
+.\" and if the
+.\" .I <sys/syscall.h>
+.\" header file contains the required SYS_foo definition.
+.\" Otherwise, the use of a _syscall macro is required.
+.\"
+.SH EXAMPLES
+.\" [[deprecated]] SRC BEGIN (_syscall.c)
+.EX
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <linux/unistd.h> /* for _syscallX macros/related stuff */
+#include <linux/kernel.h> /* for struct sysinfo */
+\&
+_syscall1(int, sysinfo, struct sysinfo *, info);
+\&
+int
+main(void)
+{
+ struct sysinfo s_info;
+ int error;
+\&
+ error = sysinfo(&s_info);
+ printf("code error = %d\en", error);
+ printf("Uptime = %lds\enLoad: 1 min %lu / 5 min %lu / 15 min %lu\en"
+ "RAM: total %lu / free %lu / shared %lu\en"
+ "Memory in buffers = %lu\enSwap: total %lu / free %lu\en"
+ "Number of processes = %d\en",
+ s_info.uptime, s_info.loads[0],
+ s_info.loads[1], s_info.loads[2],
+ s_info.totalram, s_info.freeram,
+ s_info.sharedram, s_info.bufferram,
+ s_info.totalswap, s_info.freeswap,
+ s_info.procs);
+ exit(EXIT_SUCCESS);
+}
+.EE
+.\" SRC END
+.SS Sample output
+.EX
+code error = 0
+Uptime = 502034s
+Load: 1 min 13376 / 5 min 5504 / 15 min 1152
+RAM: total 15343616 / free 827392 / shared 8237056
+Memory in buffers = 5066752
+Swap: total 27881472 / free 24698880
+Number of processes = 40
+.EE
+.SH SEE ALSO
+.BR intro (2),
+.BR syscall (2),
+.BR errno (3)
diff --git a/man2/_sysctl.2 b/man2/_sysctl.2
new file mode 100644
index 0000000..9e14d4b
--- /dev/null
+++ b/man2/_sysctl.2
@@ -0,0 +1 @@
+.so man2/sysctl.2
diff --git a/man2/accept.2 b/man2/accept.2
new file mode 100644
index 0000000..340fdb8
--- /dev/null
+++ b/man2/accept.2
@@ -0,0 +1,347 @@
+.\" Copyright (c) 1983, 1990, 1991 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" SPDX-License-Identifier: BSD-4-Clause-UC
+.\"
+.\" Modified 1993-07-24 by Rik Faith <faith@cs.unc.edu>
+.\" Modified 1996-10-21 by Eric S. Raymond <esr@thyrsus.com>
+.\" Modified 1998-2000 by Andi Kleen to match Linux 2.2 reality
+.\" Modified 2002-04-23 by Roger Luethi <rl@hellgate.ch>
+.\" Modified 2004-06-17 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\" 2008-12-04, mtk, Add documentation of accept4()
+.\"
+.TH accept 2 2023-04-05 "Linux man-pages 6.05.01"
+.SH NAME
+accept, accept4 \- accept a connection on a socket
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/socket.h>
+.PP
+.BI "int accept(int " sockfd ", struct sockaddr *_Nullable restrict " addr ,
+.BI " socklen_t *_Nullable restrict " addrlen );
+.PP
+.BR "#define _GNU_SOURCE" " /* See feature_test_macros(7) */"
+.B #include <sys/socket.h>
+.PP
+.BI "int accept4(int " sockfd ", struct sockaddr *_Nullable restrict " addr ,
+.BI " socklen_t *_Nullable restrict " addrlen ", int " flags );
+.fi
+.SH DESCRIPTION
+The
+.BR accept ()
+system call is used with connection-based socket types
+.RB ( SOCK_STREAM ,
+.BR SOCK_SEQPACKET ).
+It extracts the first connection request on the queue of pending
+connections for the listening socket,
+.IR sockfd ,
+creates a new connected socket, and returns a new file
+descriptor referring to that socket.
+The newly created socket is not in the listening state.
+The original socket
+.I sockfd
+is unaffected by this call.
+.PP
+The argument
+.I sockfd
+is a socket that has been created with
+.BR socket (2),
+bound to a local address with
+.BR bind (2),
+and is listening for connections after a
+.BR listen (2).
+.PP
+The argument
+.I addr
+is a pointer to a
+.I sockaddr
+structure.
+This structure is filled in with the address of the peer socket,
+as known to the communications layer.
+The exact format of the address returned
+.I addr
+is determined by the socket's address family (see
+.BR socket (2)
+and the respective protocol man pages).
+When
+.I addr
+is NULL, nothing is filled in; in this case,
+.I addrlen
+is not used, and should also be NULL.
+.PP
+The
+.I addrlen
+argument is a value-result argument:
+the caller must initialize it to contain the
+size (in bytes) of the structure pointed to by
+.IR addr ;
+on return it will contain the actual size of the peer address.
+.PP
+The returned address is truncated if the buffer provided is too small;
+in this case,
+.I addrlen
+will return a value greater than was supplied to the call.
+.PP
+If no pending
+connections are present on the queue, and the socket is not marked as
+nonblocking,
+.BR accept ()
+blocks the caller until a connection is present.
+If the socket is marked
+nonblocking and no pending connections are present on the queue,
+.BR accept ()
+fails with the error
+.B EAGAIN
+or
+.BR EWOULDBLOCK .
+.PP
+In order to be notified of incoming connections on a socket, you can use
+.BR select (2),
+.BR poll (2),
+or
+.BR epoll (7).
+A readable event will be delivered when a new connection is attempted and you
+may then call
+.BR accept ()
+to get a socket for that connection.
+Alternatively, you can set the socket to deliver
+.B SIGIO
+when activity occurs on a socket; see
+.BR socket (7)
+for details.
+.PP
+If
+.I flags
+is 0, then
+.BR accept4 ()
+is the same as
+.BR accept ().
+The following values can be bitwise ORed in
+.I flags
+to obtain different behavior:
+.TP 16
+.B SOCK_NONBLOCK
+Set the
+.B O_NONBLOCK
+file status flag on the open file description (see
+.BR open (2))
+referred to by the new file descriptor.
+Using this flag saves extra calls to
+.BR fcntl (2)
+to achieve the same result.
+.TP
+.B SOCK_CLOEXEC
+Set the close-on-exec
+.RB ( FD_CLOEXEC )
+flag on the new file descriptor.
+See the description of the
+.B O_CLOEXEC
+flag in
+.BR open (2)
+for reasons why this may be useful.
+.SH RETURN VALUE
+On success,
+these system calls return a file descriptor
+for the accepted socket (a nonnegative integer).
+On error, \-1 is returned,
+.I errno
+is set to indicate the error, and
+.I addrlen
+is left unchanged.
+.SS Error handling
+Linux
+.BR accept ()
+(and
+.BR accept4 ())
+passes already-pending network errors on the new socket
+as an error code from
+.BR accept ().
+This behavior differs from other BSD socket
+implementations.
+For reliable operation the application should detect
+the network errors defined for the protocol after
+.BR accept ()
+and treat
+them like
+.B EAGAIN
+by retrying.
+In the case of TCP/IP, these are
+.BR ENETDOWN ,
+.BR EPROTO ,
+.BR ENOPROTOOPT ,
+.BR EHOSTDOWN ,
+.BR ENONET ,
+.BR EHOSTUNREACH ,
+.BR EOPNOTSUPP ,
+and
+.BR ENETUNREACH .
+.SH ERRORS
+.TP
+.BR EAGAIN " or " EWOULDBLOCK
+.\" Actually EAGAIN on Linux
+The socket is marked nonblocking and no connections are
+present to be accepted.
+POSIX.1-2001 and POSIX.1-2008
+allow either error to be returned for this case,
+and do not require these constants to have the same value,
+so a portable application should check for both possibilities.
+.TP
+.B EBADF
+.I sockfd
+is not an open file descriptor.
+.TP
+.B ECONNABORTED
+A connection has been aborted.
+.TP
+.B EFAULT
+The
+.I addr
+argument is not in a writable part of the user address space.
+.TP
+.B EINTR
+The system call was interrupted by a signal that was caught
+before a valid connection arrived; see
+.BR signal (7).
+.TP
+.B EINVAL
+Socket is not listening for connections, or
+.I addrlen
+is invalid (e.g., is negative).
+.TP
+.B EINVAL
+.RB ( accept4 ())
+invalid value in
+.IR flags .
+.TP
+.B EMFILE
+The per-process limit on the number of open file descriptors has been reached.
+.TP
+.B ENFILE
+The system-wide limit on the total number of open files has been reached.
+.TP
+.BR ENOBUFS ", " ENOMEM
+Not enough free memory.
+This often means that the memory allocation is limited by the socket buffer
+limits, not by the system memory.
+.TP
+.B ENOTSOCK
+The file descriptor
+.I sockfd
+does not refer to a socket.
+.TP
+.B EOPNOTSUPP
+The referenced socket is not of type
+.BR SOCK_STREAM .
+.TP
+.B EPERM
+Firewall rules forbid connection.
+.TP
+.B EPROTO
+Protocol error.
+.PP
+In addition, network errors for the new socket and as defined
+for the protocol may be returned.
+Various Linux kernels can
+return other errors such as
+.BR ENOSR ,
+.BR ESOCKTNOSUPPORT ,
+.BR EPROTONOSUPPORT ,
+.BR ETIMEDOUT .
+The value
+.B ERESTARTSYS
+may be seen during a trace.
+.SH VERSIONS
+On Linux, the new socket returned by
+.BR accept ()
+does \fInot\fP inherit file status flags such as
+.B O_NONBLOCK
+and
+.B O_ASYNC
+from the listening socket.
+This behavior differs from the canonical BSD sockets implementation.
+.\" Some testing seems to show that Tru64 5.1 and HP-UX 11 also
+.\" do not inherit file status flags -- MTK Jun 05
+Portable programs should not rely on inheritance or noninheritance
+of file status flags and always explicitly set all required flags on
+the socket returned from
+.BR accept ().
+.SH STANDARDS
+.TP
+.BR accept ()
+POSIX.1-2008.
+.TP
+.BR accept4 ()
+Linux.
+.SH HISTORY
+.TP
+.BR accept ()
+POSIX.1-2001, SVr4, 4.4BSD
+.RB ( accept ()
+first appeared in 4.2BSD).
+.\" The BSD man page documents five possible error returns
+.\" (EBADF, ENOTSOCK, EOPNOTSUPP, EWOULDBLOCK, EFAULT).
+.\" POSIX.1-2001 documents errors
+.\" EAGAIN, EBADF, ECONNABORTED, EINTR, EINVAL, EMFILE,
+.\" ENFILE, ENOBUFS, ENOMEM, ENOTSOCK, EOPNOTSUPP, EPROTO, EWOULDBLOCK.
+.\" In addition, SUSv2 documents EFAULT and ENOSR.
+.TP
+.BR accept4 ()
+Linux 2.6.28,
+glibc 2.10.
+.SH NOTES
+There may not always be a connection waiting after a
+.B SIGIO
+is delivered or
+.BR select (2),
+.BR poll (2),
+or
+.BR epoll (7)
+return a readability event because the connection might have been
+removed by an asynchronous network error or another thread before
+.BR accept ()
+is called.
+If this happens, then the call will block waiting for the next
+connection to arrive.
+To ensure that
+.BR accept ()
+never blocks, the passed socket
+.I sockfd
+needs to have the
+.B O_NONBLOCK
+flag set (see
+.BR socket (7)).
+.PP
+For certain protocols which require an explicit confirmation,
+such as DECnet,
+.BR accept ()
+can be thought of as merely dequeuing the next connection request and not
+implying confirmation.
+Confirmation can be implied by
+a normal read or write on the new file descriptor, and rejection can be
+implied by closing the new socket.
+Currently, only DECnet has these semantics on Linux.
+.\"
+.SS The socklen_t type
+In the original BSD sockets implementation (and on other older systems)
+.\" such as Linux libc4 and libc5, SunOS 4, SGI
+the third argument of
+.BR accept ()
+was declared as an \fIint\ *\fP.
+A POSIX.1g draft
+standard wanted to change it into a \fIsize_t\ *\fP;
+.\" SunOS 5 has 'size_t *'
+later POSIX standards and glibc 2.x have
+.IR "socklen_t\ * ".
+.SH EXAMPLES
+See
+.BR bind (2).
+.SH SEE ALSO
+.BR bind (2),
+.BR connect (2),
+.BR listen (2),
+.BR select (2),
+.BR socket (2),
+.BR socket (7)
diff --git a/man2/accept4.2 b/man2/accept4.2
new file mode 100644
index 0000000..963dfb5
--- /dev/null
+++ b/man2/accept4.2
@@ -0,0 +1 @@
+.so man2/accept.2
diff --git a/man2/access.2 b/man2/access.2
new file mode 100644
index 0000000..3f492d2
--- /dev/null
+++ b/man2/access.2
@@ -0,0 +1,447 @@
+.\" This manpage is Copyright (C) 1992 Drew Eckhardt;
+.\" and Copyright (C) 1993 Michael Haardt, Ian Jackson.
+.\" and Copyright (C) 2004, 2006, 2007, 2014 Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified 1993-07-21 Rik Faith (faith@cs.unc.edu)
+.\" Modified 1994-08-21 by Michael Chastain (mec@shell.portal.com):
+.\" Removed note about old kernel (pre-1.1.44) using wrong id on path.
+.\" Modified 1996-03-18 by Martin Schulze (joey@infodrom.north.de):
+.\" Stated more clearly how it behaves with symbolic links.
+.\" Added correction due to Nick Duffek (nsd@bbc.com), aeb, 960426
+.\" Modified 1996-09-07 by Michael Haardt:
+.\" Restrictions for NFS
+.\" Modified 1997-09-09 by Joseph S. Myers <jsm28@cam.ac.uk>
+.\" Modified 1998-01-13 by Michael Haardt:
+.\" Using access is often insecure
+.\" Modified 2001-10-16 by aeb
+.\" Modified 2002-04-23 by Roger Luethi <rl@hellgate.ch>
+.\" Modified 2004-06-23 by Michael Kerrisk
+.\" 2007-06-10, mtk, various parts rewritten, and added BUGS section.
+.\"
+.TH access 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+access, faccessat, faccessat2 \- check user's permissions for a file
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <unistd.h>
+.PP
+.BI "int access(const char *" pathname ", int " mode );
+.PP
+.BR "#include <fcntl.h>" " /* Definition of " AT_* " constants */"
+.B #include <unistd.h>
+.PP
+.BI "int faccessat(int " dirfd ", const char *" pathname ", int " \
+mode ", int " flags );
+ /* But see C library/kernel differences, below */
+.PP
+.BR "#include <fcntl.h>" " /* Definition of " AT_* " constants */"
+.BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */"
+.B #include <unistd.h>
+.PP
+.B int syscall(SYS_faccessat2,
+.BI " int " dirfd ", const char *" pathname ", int " mode \
+", int " flags );
+.fi
+.PP
+.RS -4
+Feature Test Macro Requirements for glibc (see
+.BR feature_test_macros (7)):
+.RE
+.PP
+.BR faccessat ():
+.nf
+ Since glibc 2.10:
+ _POSIX_C_SOURCE >= 200809L
+ Before glibc 2.10:
+ _ATFILE_SOURCE
+.fi
+.SH DESCRIPTION
+.BR access ()
+checks whether the calling process can access the file
+.IR pathname .
+If
+.I pathname
+is a symbolic link, it is dereferenced.
+.PP
+The
+.I mode
+specifies the accessibility check(s) to be performed,
+and is either the value
+.BR F_OK ,
+.\" F_OK is defined as 0 on every system that I know of.
+or a mask consisting of the bitwise OR of one or more of
+.BR R_OK ", " W_OK ", and " X_OK .
+.B F_OK
+tests for the existence of the file.
+.BR R_OK ", " W_OK ", and " X_OK
+test whether the file exists and grants read, write, and
+execute permissions, respectively.
+.PP
+The check is done using the calling process's
+.I real
+UID and GID, rather than the effective IDs as is done when
+actually attempting an operation (e.g.,
+.BR open (2))
+on the file.
+Similarly, for the root user, the check uses the set of
+permitted capabilities rather than the set of effective
+capabilities; and for non-root users, the check uses an empty set
+of capabilities.
+.PP
+This allows set-user-ID programs and capability-endowed programs
+to easily determine the invoking user's authority.
+In other words,
+.BR access ()
+does not answer the "can I read/write/execute this file?" question.
+It answers a slightly different question:
+"(assuming I'm a setuid binary) can
+.I the user who invoked me
+read/write/execute this file?",
+which gives set-user-ID programs the possibility to
+prevent malicious users from causing them to read files
+which users shouldn't be able to read.
+.PP
+If the calling process is privileged (i.e., its real UID is zero),
+then an
+.B X_OK
+check is successful for a regular file if execute permission
+is enabled for any of the file owner, group, or other.
+.SS faccessat()
+.BR faccessat ()
+operates in exactly the same way as
+.BR access (),
+except for the differences described here.
+.PP
+If the pathname given in
+.I pathname
+is relative, then it is interpreted relative to the directory
+referred to by the file descriptor
+.I dirfd
+(rather than relative to the current working directory of
+the calling process, as is done by
+.BR access ()
+for a relative pathname).
+.PP
+If
+.I pathname
+is relative and
+.I dirfd
+is the special value
+.BR AT_FDCWD ,
+then
+.I pathname
+is interpreted relative to the current working
+directory of the calling process (like
+.BR access ()).
+.PP
+If
+.I pathname
+is absolute, then
+.I dirfd
+is ignored.
+.PP
+.I flags
+is constructed by ORing together zero or more of the following values:
+.TP
+.B AT_EACCESS
+Perform access checks using the effective user and group IDs.
+By default,
+.BR faccessat ()
+uses the real IDs (like
+.BR access ()).
+.TP
+.B AT_SYMLINK_NOFOLLOW
+If
+.I pathname
+is a symbolic link, do not dereference it:
+instead return information about the link itself.
+.PP
+See
+.BR openat (2)
+for an explanation of the need for
+.BR faccessat ().
+.\"
+.SS faccessat2()
+The description of
+.BR faccessat ()
+given above corresponds to POSIX.1 and
+to the implementation provided by glibc.
+However, the glibc implementation was an imperfect emulation (see BUGS)
+that papered over the fact that the raw Linux
+.BR faccessat ()
+system call does not have a
+.I flags
+argument.
+To allow for a proper implementation, Linux 5.8 added the
+.BR faccessat2 ()
+system call, which supports the
+.I flags
+argument and allows a correct implementation of the
+.BR faccessat ()
+wrapper function.
+.SH RETURN VALUE
+On success (all requested permissions granted, or
+.I mode
+is
+.B F_OK
+and the file exists), zero is returned.
+On error (at least one bit in
+.I mode
+asked for a permission that is denied, or
+.I mode
+is
+.B F_OK
+and the file does not exist, or some other error occurred),
+\-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EACCES
+The requested access would be denied to the file, or search permission
+is denied for one of the directories in the path prefix of
+.IR pathname .
+(See also
+.BR path_resolution (7).)
+.TP
+.B EBADF
+.RB ( faccessat ())
+.I pathname
+is relative but
+.I dirfd
+is neither
+.B AT_FDCWD
+nor a valid file descriptor.
+.TP
+.B EFAULT
+.I pathname
+points outside your accessible address space.
+.TP
+.B EINVAL
+.I mode
+was incorrectly specified.
+.TP
+.B EINVAL
+.RB ( faccessat ())
+Invalid flag specified in
+.IR flags .
+.TP
+.B EIO
+An I/O error occurred.
+.TP
+.B ELOOP
+Too many symbolic links were encountered in resolving
+.IR pathname .
+.TP
+.B ENAMETOOLONG
+.I pathname
+is too long.
+.TP
+.B ENOENT
+A component of
+.I pathname
+does not exist or is a dangling symbolic link.
+.TP
+.B ENOMEM
+Insufficient kernel memory was available.
+.TP
+.B ENOTDIR
+A component used as a directory in
+.I pathname
+is not, in fact, a directory.
+.TP
+.B ENOTDIR
+.RB ( faccessat ())
+.I pathname
+is relative and
+.I dirfd
+is a file descriptor referring to a file other than a directory.
+.TP
+.B EPERM
+Write permission was requested to a file that has the immutable flag set.
+See also
+.BR ioctl_iflags (2).
+.TP
+.B EROFS
+Write permission was requested for a file on a read-only filesystem.
+.TP
+.B ETXTBSY
+Write access was requested to an executable which is being
+executed.
+.SH VERSIONS
+If the calling process has appropriate privileges (i.e., is superuser),
+POSIX.1-2001 permits an implementation to indicate success for an
+.B X_OK
+check even if none of the execute file permission bits are set.
+.\" HP-UX 11 and Tru64 5.1 do this.
+Linux does not do this.
+.\"
+.SS C library/kernel differences
+The raw
+.BR faccessat ()
+system call takes only the first three arguments.
+The
+.B AT_EACCESS
+and
+.B AT_SYMLINK_NOFOLLOW
+flags are actually implemented within the glibc wrapper function for
+.BR faccessat ().
+If either of these flags is specified, then the wrapper function employs
+.BR fstatat (2)
+to determine access permissions, but see BUGS.
+.\"
+.SS glibc notes
+On older kernels where
+.BR faccessat ()
+is unavailable (and when the
+.B AT_EACCESS
+and
+.B AT_SYMLINK_NOFOLLOW
+flags are not specified),
+the glibc wrapper function falls back to the use of
+.BR access ().
+When
+.I pathname
+is a relative pathname,
+glibc constructs a pathname based on the symbolic link in
+.I /proc/self/fd
+that corresponds to the
+.I dirfd
+argument.
+.SH STANDARDS
+.TP
+.BR access ()
+.TQ
+.BR faccessat ()
+POSIX.1-2008.
+.TP
+.BR faccessat2 ()
+Linux.
+.SH HISTORY
+.TP
+.BR access ()
+SVr4, 4.3BSD, POSIX.1-2001.
+.TP
+.BR faccessat ()
+Linux 2.6.16,
+glibc 2.4.
+.TP
+.BR faccessat2 ()
+Linux 5.8.
+.SH NOTES
+.BR Warning :
+Using these calls to check if a user is authorized to, for example,
+open a file before actually doing so using
+.BR open (2)
+creates a security hole, because the user might exploit the short time
+interval between checking and opening the file to manipulate it.
+.BR "For this reason, the use of this system call should be avoided" .
+(In the example just described,
+a safer alternative would be to temporarily switch the process's
+effective user ID to the real ID and then call
+.BR open (2).)
+.PP
+.BR access ()
+always dereferences symbolic links.
+If you need to check the permissions on a symbolic link, use
+.BR faccessat ()
+with the flag
+.BR AT_SYMLINK_NOFOLLOW .
+.PP
+These calls return an error if any of the access types in
+.I mode
+is denied, even if some of the other access types in
+.I mode
+are permitted.
+.PP
+A file is accessible only if the permissions on each of the
+directories in the path prefix of
+.I pathname
+grant search (i.e., execute) access.
+If any directory is inaccessible, then the
+.BR access ()
+call fails, regardless of the permissions on the file itself.
+.PP
+Only access bits are checked, not the file type or contents.
+Therefore, if a directory is found to be writable,
+it probably means that files can be created in the directory,
+and not that the directory can be written as a file.
+Similarly, a DOS file may be reported as executable, but the
+.BR execve (2)
+call will still fail.
+.PP
+These calls
+may not work correctly on NFSv2 filesystems with UID mapping enabled,
+because UID mapping is done on the server and hidden from the client,
+which checks permissions.
+(NFS versions 3 and higher perform the check on the server.)
+Similar problems can occur with FUSE mounts.
+.\"
+.SH BUGS
+Because the Linux kernel's
+.BR faccessat ()
+system call does not support a
+.I flags
+argument, the glibc
+.BR faccessat ()
+wrapper function provided in glibc 2.32 and earlier
+emulates the required functionality using
+a combination of the
+.BR faccessat ()
+system call and
+.BR fstatat (2).
+However, this emulation does not take ACLs into account.
+Starting with glibc 2.33, the wrapper function avoids this bug
+by making use of the
+.BR faccessat2 ()
+system call where it is provided by the underlying kernel.
+.PP
+In Linux 2.4 (and earlier) there is some strangeness in the handling of
+.B X_OK
+tests for superuser.
+If all categories of execute permission are disabled
+for a nondirectory file, then the only
+.BR access ()
+test that returns \-1 is when
+.I mode
+is specified as just
+.BR X_OK ;
+if
+.B R_OK
+or
+.B W_OK
+is also specified in
+.IR mode ,
+then
+.BR access ()
+returns 0 for such files.
+.\" This behavior appears to have been an implementation accident.
+Early Linux 2.6 (up to and including Linux 2.6.3)
+also behaved in the same way as Linux 2.4.
+.PP
+Before Linux 2.6.20,
+these calls ignored the effect of the
+.B MS_NOEXEC
+flag if it was used to
+.BR mount (2)
+the underlying filesystem.
+Since Linux 2.6.20, the
+.B MS_NOEXEC
+flag is honored.
+.SH SEE ALSO
+.BR chmod (2),
+.BR chown (2),
+.BR open (2),
+.BR setgid (2),
+.BR setuid (2),
+.BR stat (2),
+.BR euidaccess (3),
+.BR credentials (7),
+.BR path_resolution (7),
+.BR symlink (7)
diff --git a/man2/acct.2 b/man2/acct.2
new file mode 100644
index 0000000..de03a0e
--- /dev/null
+++ b/man2/acct.2
@@ -0,0 +1,136 @@
+.\" Copyright (c) 1993 Michael Haardt
+.\" (michael@moria.de),
+.\" Fri Apr 2 11:32:09 MET DST 1993
+.\"
+.\" SPDX-License-Identifier: GPL-2.0-or-later
+.\"
+.\" Modified 1993-07-22 by Rik Faith <faith@cs.unc.edu>
+.\" Modified 1993-08-10 by Alan Cox <iiitac@pyramid.swansea.ac.uk>
+.\" Modified 1998-11-04 by Tigran Aivazian <tigran@sco.com>
+.\" Modified 2004-05-27, 2004-06-17, 2004-06-23 by Michael Kerrisk
+.\"
+.TH acct 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+acct \- switch process accounting on or off
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <unistd.h>
+.PP
+.BI "int acct(const char *_Nullable " filename );
+.fi
+.PP
+.RS -4
+Feature Test Macro Requirements for glibc (see
+.BR feature_test_macros (7)):
+.RE
+.PP
+.BR acct ():
+.nf
+ Since glibc 2.21:
+.\" commit 266865c0e7b79d4196e2cc393693463f03c90bd8
+ _DEFAULT_SOURCE
+ In glibc 2.19 and 2.20:
+ _DEFAULT_SOURCE || (_XOPEN_SOURCE && _XOPEN_SOURCE < 500)
+ Up to and including glibc 2.19:
+ _BSD_SOURCE || (_XOPEN_SOURCE && _XOPEN_SOURCE < 500)
+.fi
+.SH DESCRIPTION
+The
+.BR acct ()
+system call enables or disables process accounting.
+If called with the name of an existing file as its argument,
+accounting is turned on,
+and records for each terminating process are appended to
+.I filename
+as it terminates.
+An argument of NULL causes accounting to be turned off.
+.SH RETURN VALUE
+On success, zero is returned.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EACCES
+Write permission is denied for the specified file,
+or search permission is denied for one of the directories
+in the path prefix of
+.I filename
+(see also
+.BR path_resolution (7)),
+or
+.I filename
+is not a regular file.
+.TP
+.B EFAULT
+.I filename
+points outside your accessible address space.
+.TP
+.B EIO
+Error writing to the file
+.IR filename .
+.TP
+.B EISDIR
+.I filename
+is a directory.
+.TP
+.B ELOOP
+Too many symbolic links were encountered in resolving
+.IR filename .
+.TP
+.B ENAMETOOLONG
+.I filename
+was too long.
+.TP
+.B ENFILE
+The system-wide limit on the total number of open files has been reached.
+.TP
+.B ENOENT
+The specified file does not exist.
+.TP
+.B ENOMEM
+Out of memory.
+.TP
+.B ENOSYS
+BSD process accounting was not enabled when the operating system
+kernel was compiled.
+The kernel configuration parameter controlling this feature is
+.BR CONFIG_BSD_PROCESS_ACCT .
+.TP
+.B ENOTDIR
+A component used as a directory in
+.I filename
+is not in fact a directory.
+.TP
+.B EPERM
+The calling process has insufficient privilege to enable process accounting.
+On Linux, the
+.B CAP_SYS_PACCT
+capability is required.
+.TP
+.B EROFS
+.I filename
+refers to a file on a read-only filesystem.
+.TP
+.B EUSERS
+There are no more free file structures or we ran out of memory.
+.SH STANDARDS
+None.
+.SH HISTORY
+SVr4, 4.3BSD.
+.\" SVr4 documents an EBUSY error condition, but no EISDIR or ENOSYS.
+.\" Also AIX and HP-UX document EBUSY (attempt is made
+.\" to enable accounting when it is already enabled), as does Solaris
+.\" (attempt is made to enable accounting using the same file that is
+.\" currently being used).
+.SH NOTES
+No accounting is produced for programs running when a system crash occurs.
+In particular, nonterminating processes are never accounted for.
+.PP
+The structure of the records written to the accounting file is described in
+.BR acct (5).
+.SH SEE ALSO
+.BR acct (5)
diff --git a/man2/add_key.2 b/man2/add_key.2
new file mode 100644
index 0000000..570db11
--- /dev/null
+++ b/man2/add_key.2
@@ -0,0 +1,298 @@
+.\" Copyright (C) 2006 Red Hat, Inc. All Rights Reserved.
+.\" Written by David Howells (dhowells@redhat.com)
+.\" and Copyright (C) 2016 Michael Kerrisk <mtk.man-pages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: GPL-2.0-or-later
+.\"
+.TH add_key 2 2023-05-03 "Linux man-pages 6.05.01"
+.SH NAME
+add_key \- add a key to the kernel's key management facility
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <keyutils.h>
+.PP
+.BI "key_serial_t add_key(const char *" type ", const char *" description ,
+.BI " const void " payload [. plen "], size_t " plen ,
+.BI " key_serial_t " keyring ");"
+.fi
+.PP
+.IR Note :
+There is no glibc wrapper for this system call; see NOTES.
+.SH DESCRIPTION
+.BR add_key ()
+creates or updates a key of the given
+.I type
+and
+.IR description ,
+instantiates it with the
+.I payload
+of length
+.IR plen ,
+attaches it to the nominated
+.IR keyring ,
+and returns the key's serial number.
+.PP
+The key may be rejected if the provided data is in the wrong format or
+it is invalid in some other way.
+.PP
+If the destination
+.I keyring
+already contains a key that matches the specified
+.I type
+and
+.IR description ,
+then, if the key type supports it,
+.\" FIXME The aforementioned phrase begs the question:
+.\" which key types support this?
+that key will be updated rather than a new key being created;
+if not, a new key (with a different ID) will be created
+and it will displace the link to the extant key from the keyring.
+.\" FIXME Perhaps elaborate the implications here? Namely, the new
+.\" key will have a new ID, and if the old key was a keyring that
+.\" is consequently unlinked, then keys that it was anchoring
+.\" will have their reference count decreased by one (and may
+.\" consequently be garbage collected). Is this all correct?
+.PP
+The destination
+.I keyring
+serial number may be that of a valid keyring for which the caller has
+.I write
+permission.
+Alternatively, it may be one of the following special keyring IDs:
+.\" FIXME . Perhaps have a separate page describing special keyring IDs?
+.TP
+.B KEY_SPEC_THREAD_KEYRING
+This specifies the caller's thread-specific keyring
+.RB ( thread\-keyring (7)).
+.TP
+.B KEY_SPEC_PROCESS_KEYRING
+This specifies the caller's process-specific keyring
+.RB ( process\-keyring (7)).
+.TP
+.B KEY_SPEC_SESSION_KEYRING
+This specifies the caller's session-specific keyring
+.RB ( session\-keyring (7)).
+.TP
+.B KEY_SPEC_USER_KEYRING
+This specifies the caller's UID-specific keyring
+.RB ( user\-keyring (7)).
+.TP
+.B KEY_SPEC_USER_SESSION_KEYRING
+This specifies the caller's UID-session keyring
+.RB ( user\-session\-keyring (7)).
+.SS Key types
+The key
+.I type
+is a string that specifies the key's type.
+Internally, the kernel defines a number of key types that are
+available in the core key management code.
+Among the types that are available for user-space use
+and can be specified as the
+.I type
+argument to
+.BR add_key ()
+are the following:
+.TP
+.I """keyring"""
+Keyrings are special key types that may contain links to sequences of other
+keys of any type.
+If this interface is used to create a keyring, then
+.I payload
+should be NULL and
+.I plen
+should be zero.
+.TP
+.I """user"""
+This is a general purpose key type whose payload may be read and updated
+by user-space applications.
+The key is kept entirely within kernel memory.
+The payload for keys of this type is a blob of arbitrary data
+of up to 32,767 bytes.
+.TP
+.IR """logon""" " (since Linux 3.3)"
+.\" commit 9f6ed2ca257fa8650b876377833e6f14e272848b
+This key type is essentially the same as
+.IR """user""" ,
+but it does not permit the key to be read.
+This is suitable for storing payloads
+that you do not want to be readable from user space.
+.PP
+This key type vets the
+.I description
+to ensure that it is qualified by a "service" prefix,
+by checking to ensure that the
+.I description
+contains a ':' that is preceded by other characters.
+.TP
+.IR """big_key""" " (since Linux 3.13)"
+.\" commit ab3c3587f8cda9083209a61dbe3a4407d3cada10
+This key type is similar to
+.IR """user""" ,
+but may hold a payload of up to 1\ MiB.
+If the key payload is large enough,
+then it may be stored encrypted in tmpfs
+(which can be swapped out) rather than kernel memory.
+.PP
+For further details on these key types, see
+.BR keyrings (7).
+.SH RETURN VALUE
+On success,
+.BR add_key ()
+returns the serial number of the key it created or updated.
+On error, \-1 is returned and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EACCES
+The keyring wasn't available for modification by the user.
+.TP
+.B EDQUOT
+The key quota for this user would be exceeded by creating this key or linking
+it to the keyring.
+.TP
+.B EFAULT
+One or more of
+.IR type ,
+.IR description ,
+and
+.I payload
+points outside the process's accessible address space.
+.TP
+.B EINVAL
+The size of the string (including the terminating null byte) specified in
+.I type
+or
+.I description
+exceeded the limit (32 bytes and 4096 bytes respectively).
+.TP
+.B EINVAL
+The payload data was invalid.
+.TP
+.B EINVAL
+.I type
+was
+.I """logon"""
+and the
+.I description
+was not qualified with a prefix string of the form
+.IR """service:""" .
+.TP
+.B EKEYEXPIRED
+The keyring has expired.
+.TP
+.B EKEYREVOKED
+The keyring has been revoked.
+.TP
+.B ENOKEY
+The keyring doesn't exist.
+.TP
+.B ENOMEM
+Insufficient memory to create a key.
+.TP
+.B EPERM
+The
+.I type
+started with a period (\[aq].\[aq]).
+Key types that begin with a period are reserved to the implementation.
+.TP
+.B EPERM
+.I type
+was
+.I """keyring"""
+and the
+.I description
+started with a period (\[aq].\[aq]).
+Keyrings with descriptions (names)
+that begin with a period are reserved to the implementation.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 2.6.10.
+.SH NOTES
+glibc does not provide a wrapper for this system call.
+A wrapper is provided in the
+.I libkeyutils
+library.
+(The accompanying package provides the
+.I <keyutils.h>
+header file.)
+When employing the wrapper in that library, link with
+.IR \-lkeyutils .
+.SH EXAMPLES
+The program below creates a key with the type, description, and payload
+specified in its command-line arguments,
+and links that key into the session keyring.
+The following shell session demonstrates the use of the program:
+.PP
+.in +4n
+.EX
+$ \fB./a.out user mykey "Some payload"\fP
+Key ID is 64a4dca
+$ \fBgrep \[aq]64a4dca\[aq] /proc/keys\fP
+064a4dca I\-\-Q\-\-\- 1 perm 3f010000 1000 1000 user mykey: 12
+.EE
+.in
+.SS Program source
+\&
+.\" SRC BEGIN (add_key.c)
+.EX
+#include <keyutils.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+\&
+int
+main(int argc, char *argv[])
+{
+ key_serial_t key;
+\&
+ if (argc != 4) {
+ fprintf(stderr, "Usage: %s type description payload\en",
+ argv[0]);
+ exit(EXIT_FAILURE);
+ }
+\&
+ key = add_key(argv[1], argv[2], argv[3], strlen(argv[3]),
+ KEY_SPEC_SESSION_KEYRING);
+ if (key == \-1) {
+ perror("add_key");
+ exit(EXIT_FAILURE);
+ }
+\&
+ printf("Key ID is %jx\en", (uintmax_t) key);
+\&
+ exit(EXIT_SUCCESS);
+}
+.EE
+.\" SRC END
+.SH SEE ALSO
+.ad l
+.nh
+.BR keyctl (1),
+.BR keyctl (2),
+.BR request_key (2),
+.BR keyctl (3),
+.BR keyrings (7),
+.BR keyutils (7),
+.BR persistent\-keyring (7),
+.BR process\-keyring (7),
+.BR session\-keyring (7),
+.BR thread\-keyring (7),
+.BR user\-keyring (7),
+.BR user\-session\-keyring (7)
+.PP
+The kernel source files
+.I Documentation/security/keys/core.rst
+and
+.I Documentation/security/keys/request\-key.rst
+(or, before Linux 4.13, in the files
+.\" commit b68101a1e8f0263dbc7b8375d2a7c57c6216fb76
+.I Documentation/security/keys.txt
+and
+.\" commit 3db38ed76890565772fcca3279cc8d454ea6176b
+.IR Documentation/security/keys\-request\-key.txt ).
diff --git a/man2/adjtimex.2 b/man2/adjtimex.2
new file mode 100644
index 0000000..c850a3d
--- /dev/null
+++ b/man2/adjtimex.2
@@ -0,0 +1,596 @@
+'\" t
+.\" Copyright (c) 1995 Michael Chastain (mec@shell.portal.com), 15 April 1995.
+.\" and Copyright (C) 2014, 2016 Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: GPL-2.0-or-later
+.\"
+.\" Modified 1997-01-31 by Eric S. Raymond <esr@thyrsus.com>
+.\" Modified 1997-07-30 by Paul Slootman <paul@wurtel.demon.nl>
+.\" Modified 2004-05-27 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.TH adjtimex 2 2023-07-20 "Linux man-pages 6.05.01"
+.SH NAME
+adjtimex, clock_adjtime, ntp_adjtime \- tune kernel clock
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/timex.h>
+.PP
+.BI "int adjtimex(struct timex *" "buf" );
+.PP
+.BI "int clock_adjtime(clockid_t " clk_id ", struct timex *" buf );
+.PP
+.BI "int ntp_adjtime(struct timex *" buf );
+.fi
+.SH DESCRIPTION
+Linux uses David L.\& Mills' clock adjustment algorithm (see RFC\ 5905).
+The system call
+.BR adjtimex ()
+reads and optionally sets adjustment parameters for this algorithm.
+It takes a pointer to a
+.I timex
+structure, updates kernel parameters from (selected) field values,
+and returns the same structure updated with the current kernel values.
+This structure is declared as follows:
+.PP
+.in +4n
+.EX
+struct timex {
+ int modes; /* Mode selector */
+ long offset; /* Time offset; nanoseconds, if STA_NANO
+ status flag is set, otherwise
+ microseconds */
+ long freq; /* Frequency offset; see NOTES for units */
+ long maxerror; /* Maximum error (microseconds) */
+ long esterror; /* Estimated error (microseconds) */
+ int status; /* Clock command/status */
+ long constant; /* PLL (phase\-locked loop) time constant */
+ long precision; /* Clock precision
+ (microseconds, read\-only) */
+ long tolerance; /* Clock frequency tolerance (read\-only);
+ see NOTES for units */
+ struct timeval time;
+ /* Current time (read\-only, except for
+ ADJ_SETOFFSET); upon return, time.tv_usec
+ contains nanoseconds, if STA_NANO status
+ flag is set, otherwise microseconds */
+ long tick; /* Microseconds between clock ticks */
+ long ppsfreq; /* PPS (pulse per second) frequency
+ (read\-only); see NOTES for units */
+ long jitter; /* PPS jitter (read\-only); nanoseconds, if
+ STA_NANO status flag is set, otherwise
+ microseconds */
+ int shift; /* PPS interval duration
+ (seconds, read\-only) */
+ long stabil; /* PPS stability (read\-only);
+ see NOTES for units */
+ long jitcnt; /* PPS count of jitter limit exceeded
+ events (read\-only) */
+ long calcnt; /* PPS count of calibration intervals
+ (read\-only) */
+ long errcnt; /* PPS count of calibration errors
+ (read\-only) */
+ long stbcnt; /* PPS count of stability limit exceeded
+ events (read\-only) */
+ int tai; /* TAI offset, as set by previous ADJ_TAI
+ operation (seconds, read\-only,
+ since Linux 2.6.26) */
+ /* Further padding bytes to allow for future expansion */
+};
+.EE
+.in
+.PP
+The
+.I modes
+field determines which parameters, if any, to set.
+(As described later in this page,
+the constants used for
+.BR ntp_adjtime ()
+are equivalent but differently named.)
+It is a bit mask containing a
+bitwise OR
+combination of zero or more of the following bits:
+.TP
+.B ADJ_OFFSET
+Set time offset from
+.IR buf.offset .
+Since Linux 2.6.26,
+.\" commit 074b3b87941c99bc0ce35385b5817924b1ed0c23
+the supplied value is clamped to the range (\-0.5s, +0.5s).
+In older kernels, an
+.B EINVAL
+error occurs if the supplied value is out of range.
+.TP
+.B ADJ_FREQUENCY
+Set frequency offset from
+.IR buf.freq .
+Since Linux 2.6.26,
+.\" commit 074b3b87941c99bc0ce35385b5817924b1ed0c23
+the supplied value is clamped to the range (\-32768000, +32768000).
+In older kernels, an
+.B EINVAL
+error occurs if the supplied value is out of range.
+.TP
+.B ADJ_MAXERROR
+Set maximum time error from
+.IR buf.maxerror .
+.TP
+.B ADJ_ESTERROR
+Set estimated time error from
+.IR buf.esterror .
+.TP
+.B ADJ_STATUS
+Set clock status bits from
+.IR buf.status .
+A description of these bits is provided below.
+.TP
+.B ADJ_TIMECONST
+Set PLL time constant from
+.IR buf.constant .
+If the
+.B STA_NANO
+status flag (see below) is clear, the kernel adds 4 to this value.
+.TP
+.BR ADJ_SETOFFSET " (since Linux 2.6.39)"
+.\" commit 094aa1881fdc1b8889b442eb3511b31f3ec2b762
+.\" Author: Richard Cochran <richardcochran@gmail.com>
+Add
+.I buf.time
+to the current time.
+If
+.I buf.modes
+includes the
+.B ADJ_NANO
+flag, then
+.I buf.time.tv_usec
+is interpreted as a nanosecond value;
+otherwise it is interpreted as microseconds.
+.IP
+The value of
+.I buf.time
+is the sum of its two fields, but the
+field
+.I buf.time.tv_usec
+must always be nonnegative.
+The following example shows how to
+normalize a
+.I timeval
+with nanosecond resolution.
+.IP
+.in +4n
+.EX
+while (buf.time.tv_usec < 0) {
+ buf.time.tv_sec \-= 1;
+ buf.time.tv_usec += 1000000000;
+}
+.EE
+.in
+.TP
+.BR ADJ_MICRO " (since Linux 2.6.26)"
+.\" commit eea83d896e318bda54be2d2770d2c5d6668d11db
+.\" Author: Roman Zippel <zippel@linux-m68k.org>
+Select microsecond resolution.
+.TP
+.BR ADJ_NANO " (since Linux 2.6.26)"
+.\" commit eea83d896e318bda54be2d2770d2c5d6668d11db
+.\" Author: Roman Zippel <zippel@linux-m68k.org>
+Select nanosecond resolution.
+Only one of
+.B ADJ_MICRO
+and
+.B ADJ_NANO
+should be specified.
+.TP
+.BR ADJ_TAI " (since Linux 2.6.26)"
+.\" commit 153b5d054ac2d98ea0d86504884326b6777f683d
+Set TAI (International Atomic Time) offset from
+.IR buf.constant .
+.IP
+.B ADJ_TAI
+should not be used in conjunction with
+.BR ADJ_TIMECONST ,
+since the latter mode also employs the
+.I buf.constant
+field.
+.IP
+For a complete explanation of TAI
+and the difference between TAI and UTC, see
+.UR http://www.bipm.org/en/bipm/tai/tai.html
+.I BIPM
+.UE
+.TP
+.B ADJ_TICK
+Set tick value from
+.IR buf.tick .
+.PP
+Alternatively,
+.I modes
+can be specified as either of the following (multibit mask) values,
+in which case other bits should not be specified in
+.IR modes :
+.\" In general, the other bits are ignored, but ADJ_OFFSET_SINGLESHOT 0x8001
+.\" ORed with ADJ_NANO (0x2000) gives 0xa001 == ADJ_OFFSET_SS_READ!!
+.TP
+.B ADJ_OFFSET_SINGLESHOT
+.\" In user space, ADJ_OFFSET_SINGLESHOT is 0x8001
+.\" In kernel space it is 0x0001, and must be ORed with ADJ_ADJTIME (0x8000)
+Old-fashioned
+.BR adjtime (3):
+(gradually) adjust time by value specified in
+.IR buf.offset ,
+which specifies an adjustment in microseconds.
+.TP
+.BR ADJ_OFFSET_SS_READ " (functional since Linux 2.6.28)"
+.\" In user space, ADJ_OFFSET_SS_READ is 0xa001
+.\" In kernel space there is ADJ_OFFSET_READONLY (0x2000) ORed with
+.\" ADJ_ADJTIME (0x8000) and ADJ_OFFSET_SINGLESHOT (0x0001) to give 0xa001
+Return (in
+.IR buf.offset )
+the remaining amount of time to be adjusted after an earlier
+.B ADJ_OFFSET_SINGLESHOT
+operation.
+This feature was added in Linux 2.6.24,
+.\" commit 52bfb36050c8529d9031d2c2513b281a360922ec
+but did not work correctly
+.\" commit 916c7a855174e3b53d182b97a26b2e27a29726a1
+until Linux 2.6.28.
+.PP
+Ordinary users are restricted to a value of either 0 or
+.B ADJ_OFFSET_SS_READ
+for
+.IR modes .
+Only the superuser may set any parameters.
+.PP
+The
+.I buf.status
+field is a bit mask that is used to set and/or retrieve status
+bits associated with the NTP implementation.
+Some bits in the mask are both readable and settable,
+while others are read-only.
+.TP
+.BR STA_PLL " (read-write)"
+Enable phase-locked loop (PLL) updates via
+.BR ADJ_OFFSET .
+.TP
+.BR STA_PPSFREQ " (read-write)"
+Enable PPS (pulse-per-second) frequency discipline.
+.TP
+.BR STA_PPSTIME " (read-write)"
+Enable PPS time discipline.
+.TP
+.BR STA_FLL " (read-write)"
+Select frequency-locked loop (FLL) mode.
+.TP
+.BR STA_INS " (read-write)"
+Insert a leap second after the last second of the UTC day,
+thus extending the last minute of the day by one second.
+Leap-second insertion will occur each day, so long as this flag remains set.
+.\" John Stultz;
+.\" Usually this is written as extending the day by one second,
+.\" which is represented as:
+.\" 23:59:59
+.\" 23:59:60
+.\" 00:00:00
+.\"
+.\" But since posix cannot represent 23:59:60, we repeat the last second:
+.\" 23:59:59 + TIME_INS
+.\" 23:59:59 + TIME_OOP
+.\" 00:00:00 + TIME_WAIT
+.\"
+.TP
+.BR STA_DEL " (read-write)"
+Delete a leap second at the last second of the UTC day.
+.\" John Stultz:
+.\" Similarly the progression here is:
+.\" 23:59:57 + TIME_DEL
+.\" 23:59:58 + TIME_DEL
+.\" 00:00:00 + TIME_WAIT
+Leap-second deletion will occur each day, so long as this flag
+remains set.
+.\" FIXME Does there need to be a statement that it is nonsensical
+.\" to set both STA_INS and STA_DEL?
+.TP
+.BR STA_UNSYNC " (read-write)"
+Clock unsynchronized.
+.TP
+.BR STA_FREQHOLD " (read-write)"
+Hold frequency.
+.\" Following text from John Stultz:
+Normally adjustments made via
+.B ADJ_OFFSET
+result in dampened frequency adjustments also being made.
+So a single call corrects the current offset,
+but as offsets in the same direction are made repeatedly,
+the small frequency adjustments will accumulate to fix the long-term skew.
+.IP
+This flag prevents the small frequency adjustment from being made
+when correcting for an
+.B ADJ_OFFSET
+value.
+.\" According to the Kernel Application Program Interface document,
+.\" STA_FREQHOLD is not used by the NTP version 4 daemon
+.TP
+.BR STA_PPSSIGNAL " (read-only)"
+A valid PPS (pulse-per-second) signal is present.
+.TP
+.BR STA_PPSJITTER " (read-only)"
+PPS signal jitter exceeded.
+.TP
+.BR STA_PPSWANDER " (read-only)"
+PPS signal wander exceeded.
+.TP
+.BR STA_PPSERROR " (read-only)"
+PPS signal calibration error.
+.TP
+.BR STA_CLOCKERR " (read-only)"
+Clock hardware fault.
+.\" Not set in current kernel (4.5), but checked in a few places
+.TP
+.BR STA_NANO " (read-only; since Linux 2.6.26)"
+.\" commit eea83d896e318bda54be2d2770d2c5d6668d11db
+.\" Author: Roman Zippel <zippel@linux-m68k.org>
+Resolution (0 = microseconds, 1 = nanoseconds).
+Set via
+.BR ADJ_NANO ,
+cleared via
+.BR ADJ_MICRO .
+.TP
+.BR STA_MODE " (since Linux 2.6.26)"
+.\" commit eea83d896e318bda54be2d2770d2c5d6668d11db
+.\" Author: Roman Zippel <zippel@linux-m68k.org>
+Mode (0 = Phase Locked Loop, 1 = Frequency Locked Loop).
+.TP
+.BR STA_CLK " (read-only; since Linux 2.6.26)"
+.\" commit eea83d896e318bda54be2d2770d2c5d6668d11db
+.\" Author: Roman Zippel <zippel@linux-m68k.org>
+Clock source (0 = A, 1 = B); currently unused.
+.PP
+Attempts to set read-only
+.I status
+bits are silently ignored.
+.\"
+.SS clock_adjtime ()
+The
+.BR clock_adjtime ()
+system call (added in Linux 2.6.39) behaves like
+.BR adjtimex ()
+but takes an additional
+.I clk_id
+argument to specify the particular clock on which to act.
+.SS ntp_adjtime ()
+The
+.BR ntp_adjtime ()
+library function
+(described in the NTP "Kernel Application Program API", KAPI)
+is a more portable interface for performing the same task as
+.BR adjtimex ().
+Other than the following points, it is identical to
+.BR adjtimex ():
+.IP \[bu] 3
+The constants used in
+.I modes
+are prefixed with "MOD_" rather than "ADJ_", and have the same suffixes (thus,
+.BR MOD_OFFSET ,
+.BR MOD_FREQUENCY ,
+and so on), other than the exceptions noted in the following points.
+.IP \[bu]
+.B MOD_CLKA
+is the synonym for
+.BR ADJ_OFFSET_SINGLESHOT .
+.IP \[bu]
+.B MOD_CLKB
+is the synonym for
+.BR ADJ_TICK .
+.IP \[bu]
+There is no synonym for
+.BR ADJ_OFFSET_SS_READ ,
+which is not described in the KAPI.
+.SH RETURN VALUE
+On success,
+.BR adjtimex ()
+and
+.BR ntp_adjtime ()
+return the clock state; that is, one of the following values:
+.TP 12
+.B TIME_OK
+Clock synchronized, no leap second adjustment pending.
+.TP
+.B TIME_INS
+Indicates that a leap second will be added at the end of the UTC day.
+.TP
+.B TIME_DEL
+Indicates that a leap second will be deleted at the end of the UTC day.
+.TP
+.B TIME_OOP
+Insertion of a leap second is in progress.
+.TP
+.B TIME_WAIT
+A leap-second insertion or deletion has been completed.
+This value will be returned until the next
+.B ADJ_STATUS
+operation clears the
+.B STA_INS
+and
+.B STA_DEL
+flags.
+.TP
+.B TIME_ERROR
+The system clock is not synchronized to a reliable server.
+This value is returned when any of the following holds true:
+.RS
+.IP \[bu] 3
+Either
+.B STA_UNSYNC
+or
+.B STA_CLOCKERR
+is set.
+.IP \[bu]
+.B STA_PPSSIGNAL
+is clear and either
+.B STA_PPSFREQ
+or
+.B STA_PPSTIME
+is set.
+.IP \[bu]
+.B STA_PPSTIME
+and
+.B STA_PPSJITTER
+are both set.
+.IP \[bu]
+.B STA_PPSFREQ
+is set and either
+.B STA_PPSWANDER
+or
+.B STA_PPSJITTER
+is set.
+.RE
+.IP
+The symbolic name
+.B TIME_BAD
+is a synonym for
+.BR TIME_ERROR ,
+provided for backward compatibility.
+.PP
+Note that starting with Linux 3.4,
+.\" commit 6b43ae8a619d17c4935c3320d2ef9e92bdeed05d changed to asynchronous
+.\" operation, so we can no longer rely on the return code.
+the call operates asynchronously and the return value usually will
+not reflect a state change caused by the call itself.
+.PP
+On failure, these calls return \-1 and set
+.I errno
+to indicate the error.
+.SH ERRORS
+.TP
+.B EFAULT
+.I buf
+does not point to writable memory.
+.TP
+.BR EINVAL " (before Linux 2.6.26)"
+An attempt was made to set
+.I buf.freq
+to a value outside the range (\-33554432, +33554432).
+.\" From a quick glance, it appears there was no clamping or range check
+.\" for buf.freq before Linux 2.0
+.TP
+.BR EINVAL " (before Linux 2.6.26)"
+An attempt was made to set
+.I buf.offset
+to a value outside the permitted range.
+Before Linux 2.0, the permitted range was (\-131072, +131072).
+From Linux 2.0 onwards, the permitted range was (\-512000, +512000).
+.TP
+.B EINVAL
+An attempt was made to set
+.I buf.status
+to a value other than those listed above.
+.TP
+.B EINVAL
+The
+.I clk_id
+given to
+.BR clock_adjtime ()
+is invalid for one of two reasons.
+Either the System-V style hard-coded
+positive clock ID value is out of range, or the dynamic
+.I clk_id
+does not refer to a valid instance of a clock object.
+See
+.BR clock_gettime (2)
+for a discussion of dynamic clocks.
+.TP
+.B EINVAL
+An attempt was made to set
+.I buf.tick
+to a value outside the range
+.RB 900000/ HZ
+to
+.RB 1100000/ HZ ,
+where
+.B HZ
+is the system timer interrupt frequency.
+.TP
+.B ENODEV
+The hot-pluggable device (USB, for example) represented by a
+dynamic
+.I clk_id
+has disappeared after its character device was opened.
+See
+.BR clock_gettime (2)
+for a discussion of dynamic clocks.
+.TP
+.B EOPNOTSUPP
+The given
+.I clk_id
+does not support adjustment.
+.TP
+.B EPERM
+.I buf.modes
+is neither 0 nor
+.BR ADJ_OFFSET_SS_READ ,
+and the caller does not have sufficient privilege.
+Under Linux, the
+.B CAP_SYS_TIME
+capability is required.
+.SH ATTRIBUTES
+For an explanation of the terms used in this section, see
+.BR attributes (7).
+.TS
+allbox;
+lbx lb lb
+l l l.
+Interface Attribute Value
+T{
+.na
+.nh
+.BR \%ntp_adjtime ()
+T} Thread safety MT-Safe
+.TE
+.sp 1
+.SH STANDARDS
+.TP
+.BR adjtimex ()
+.TQ
+.BR clock_adjtime ()
+Linux.
+.PP
+The preferred API for the NTP daemon is
+.BR ntp_adjtime ().
+.SH NOTES
+In struct
+.IR timex ,
+.IR freq ,
+.IR ppsfreq ,
+and
+.I stabil
+are ppm (parts per million) with a 16-bit fractional part,
+which means that a value of 1 in one of those fields
+actually means 2\[ha]\-16 ppm, and 2\[ha]16=65536 is 1 ppm.
+This is the case for both input values (in the case of
+.IR freq )
+and output values.
+.PP
+The leap-second processing triggered by
+.B STA_INS
+and
+.B STA_DEL
+is done by the kernel in timer context.
+Thus, it will take one tick into the second
+for the leap second to be inserted or deleted.
+.SH SEE ALSO
+.BR clock_gettime (2),
+.BR clock_settime (2),
+.BR settimeofday (2),
+.BR adjtime (3),
+.BR ntp_gettime (3),
+.BR capabilities (7),
+.BR time (7),
+.BR adjtimex (8),
+.BR hwclock (8)
+.PP
+.ad l
+.UR http://www.slac.stanford.edu/comp/unix/\:package/\:rtems/\:src/\:ssrlApps/\:ntpNanoclock/\:api.htm
+NTP "Kernel Application Program Interface"
+.UE
diff --git a/man2/afs_syscall.2 b/man2/afs_syscall.2
new file mode 100644
index 0000000..5d25ea6
--- /dev/null
+++ b/man2/afs_syscall.2
@@ -0,0 +1 @@
+.so man2/unimplemented.2
diff --git a/man2/alarm.2 b/man2/alarm.2
new file mode 100644
index 0000000..cae0890
--- /dev/null
+++ b/man2/alarm.2
@@ -0,0 +1,81 @@
+.\" This manpage is Copyright (C) 1992 Drew Eckhardt;
+.\" and Copyright (C) 1993 Michael Haardt, Ian Jackson.
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified Wed Jul 21 19:42:57 1993 by Rik Faith <faith@cs.unc.edu>
+.\" Modified Sun Jul 21 21:25:26 1996 by Andries Brouwer <aeb@cwi.nl>
+.\" Modified Wed Nov 6 03:46:05 1996 by Eric S. Raymond <esr@thyrsus.com>
+.\"
+.TH alarm 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+alarm \- set an alarm clock for delivery of a signal
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <unistd.h>
+.PP
+.BI "unsigned int alarm(unsigned int " seconds );
+.fi
+.SH DESCRIPTION
+.BR alarm ()
+arranges for a
+.B SIGALRM
+signal to be delivered to the calling process in
+.I seconds
+seconds.
+.PP
+If
+.I seconds
+is zero, any pending alarm is canceled.
+.PP
+In any event any previously set
+.BR alarm ()
+is canceled.
+.SH RETURN VALUE
+.BR alarm ()
+returns the number of seconds remaining until any previously scheduled
+alarm was due to be delivered, or zero if there was no previously
+scheduled alarm.
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+POSIX.1-2001, SVr4, 4.3BSD.
+.SH NOTES
+.BR alarm ()
+and
+.BR setitimer (2)
+share the same timer; calls to one will interfere with use of the
+other.
+.PP
+Alarms created by
+.BR alarm ()
+are preserved across
+.BR execve (2)
+and are not inherited by children created via
+.BR fork (2).
+.PP
+.BR sleep (3)
+may be implemented using
+.BR SIGALRM ;
+mixing calls to
+.BR alarm ()
+and
+.BR sleep (3)
+is a bad idea.
+.PP
+Scheduling delays can, as ever, cause the execution of the process to
+be delayed by an arbitrary amount of time.
+.SH SEE ALSO
+.BR gettimeofday (2),
+.BR pause (2),
+.BR select (2),
+.BR setitimer (2),
+.BR sigaction (2),
+.BR signal (2),
+.BR timer_create (2),
+.BR timerfd_create (2),
+.BR sleep (3),
+.BR time (7)
diff --git a/man2/alloc_hugepages.2 b/man2/alloc_hugepages.2
new file mode 100644
index 0000000..33671da
--- /dev/null
+++ b/man2/alloc_hugepages.2
@@ -0,0 +1,135 @@
+.\" Copyright 2003 Andries E. Brouwer (aeb@cwi.nl)
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.TH alloc_hugepages 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+alloc_hugepages, free_hugepages \- allocate or free huge pages
+.SH SYNOPSIS
+.nf
+.BI "void *syscall(SYS_alloc_hugepages, int " key ", void " addr [. len "], \
+size_t " len ,
+.BI " int " prot ", int " flag );
+.\" asmlinkage unsigned long sys_alloc_hugepages(int key, unsigned long addr,
+.\" unsigned long len, int prot, int flag);
+.BI "int syscall(SYS_free_hugepages, void *" addr );
+.\" asmlinkage int sys_free_hugepages(unsigned long addr);
+.fi
+.PP
+.IR Note :
+glibc provides no wrappers for these system calls,
+necessitating the use of
+.BR syscall (2).
+.SH DESCRIPTION
+The system calls
+.BR alloc_hugepages ()
+and
+.BR free_hugepages ()
+were introduced in Linux 2.5.36 and removed again in Linux 2.5.54.
+They existed only on i386 and ia64 (when built with
+.BR CONFIG_HUGETLB_PAGE ).
+In Linux 2.4.20, the syscall numbers exist,
+but the calls fail with the error
+.BR ENOSYS .
+.PP
+On i386 the memory management hardware knows about ordinary pages (4\ KiB)
+and huge pages (2 or 4\ MiB).
+Similarly ia64 knows about huge pages of
+several sizes.
+These system calls serve to map huge pages into the
+process's memory or to free them again.
+Huge pages are locked into memory, and are not swapped.
+.PP
+The
+.I key
+argument is an identifier.
+When zero the pages are private, and
+not inherited by children.
+When positive the pages are shared with other applications using the same
+.IR key ,
+and inherited by child processes.
+.PP
+The
+.I addr
+argument of
+.BR free_hugepages ()
+tells which page is being freed: it was the return value of a
+call to
+.BR alloc_hugepages ().
+(The memory is actually freed only when all users have released it.)
+The
+.I addr
+argument of
+.BR alloc_hugepages ()
+is a hint that the kernel may or may not follow.
+Addresses must be properly aligned.
+.PP
+The
+.I len
+argument is the length of the required segment.
+It must be a multiple of the huge page size.
+.PP
+The
+.I prot
+argument specifies the memory protection of the segment.
+It is one of
+.BR PROT_READ ,
+.BR PROT_WRITE ,
+.BR PROT_EXEC .
+.PP
+The
+.I flag
+argument is ignored, unless
+.I key
+is positive.
+In that case, if
+.I flag
+is
+.BR IPC_CREAT ,
+then a new huge page segment is created when none
+with the given key existed.
+If this flag is not set, then
+.B ENOENT
+is returned when no segment with the given key exists.
+.SH RETURN VALUE
+On success,
+.BR alloc_hugepages ()
+returns the allocated virtual address, and
+.BR free_hugepages ()
+returns zero.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B ENOSYS
+The system call is not supported on this kernel.
+.SH FILES
+.TP
+.I /proc/sys/vm/nr_hugepages
+Number of configured hugetlb pages.
+This can be read and written.
+.TP
+.I /proc/meminfo
+Gives info on the number of configured hugetlb pages and on their size
+in the three variables HugePages_Total, HugePages_Free, Hugepagesize.
+.SH STANDARDS
+Linux on Intel processors.
+.SH HISTORY
+These system calls are gone;
+they existed only in Linux 2.5.36 through to Linux 2.5.54.
+.SH NOTES
+Now the hugetlbfs filesystem can be used instead.
+Memory backed by huge pages (if the CPU supports them) is obtained by
+using
+.BR mmap (2)
+to map files in this virtual filesystem.
+.PP
+The maximal number of huge pages can be specified using the
+.B hugepages=
+boot parameter.
+.\".PP
+.\" requires CONFIG_HUGETLB_PAGE (under "Processor type and features")
+.\" and CONFIG_HUGETLBFS (under "Filesystems").
+.\" mount \-t hugetlbfs hugetlbfs /huge
+.\" SHM_HUGETLB
diff --git a/man2/arch_prctl.2 b/man2/arch_prctl.2
new file mode 100644
index 0000000..04a3633
--- /dev/null
+++ b/man2/arch_prctl.2
@@ -0,0 +1,176 @@
+.\" Copyright (C) 2003 Andi Kleen
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.TH arch_prctl 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+arch_prctl \- set architecture-specific thread state
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#include <asm/prctl.h>" " /* Definition of " ARCH_* " constants */"
+.BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */"
+.B #include <unistd.h>
+.PP
+.BI "int syscall(SYS_arch_prctl, int " code ", unsigned long " addr );
+.BI "int syscall(SYS_arch_prctl, int " code ", unsigned long *" addr );
+.fi
+.PP
+.IR Note :
+glibc provides no wrapper for
+.BR arch_prctl (),
+necessitating the use of
+.BR syscall (2).
+.SH DESCRIPTION
+.BR arch_prctl ()
+sets architecture-specific process or thread state.
+.I code
+selects a subfunction
+and passes argument
+.I addr
+to it;
+.I addr
+is interpreted as either an
+.I "unsigned long"
+for the "set" operations, or as an
+.IR "unsigned long\ *" ,
+for the "get" operations.
+.PP
+Subfunctions for both x86 and x86-64 are:
+.TP
+.BR ARCH_SET_CPUID " (since Linux 4.12)"
+.\" commit e9ea1e7f53b852147cbd568b0568c7ad97ec21a3
+Enable
+.RI ( "addr != 0" )
+or disable
+.RI ( "addr == 0" )
+the
+.I cpuid
+instruction for the calling thread.
+The instruction is enabled by default.
+If disabled, any execution of a
+.I cpuid
+instruction will instead generate a
+.B SIGSEGV
+signal.
+This feature can be used to emulate
+.I cpuid
+results that differ from what the underlying
+hardware would have produced (e.g., in a paravirtualization setting).
+.IP
+The
+.B ARCH_SET_CPUID
+setting is preserved across
+.BR fork (2)
+and
+.BR clone (2)
+but reset to the default (i.e.,
+.I cpuid
+enabled) on
+.BR execve (2).
+.TP
+.BR ARCH_GET_CPUID " (since Linux 4.12)"
+Return the setting of the flag manipulated by
+.B ARCH_SET_CPUID
+as the result of the system call (1 for enabled, 0 for disabled).
+.I addr
+is ignored.
+.TP
+Subfunctions for x86-64 only are:
+.TP
+.B ARCH_SET_FS
+Set the 64-bit base for the
+.I FS
+register to
+.IR addr .
+.TP
+.B ARCH_GET_FS
+Return the 64-bit base value for the
+.I FS
+register of the calling thread in the
+.I unsigned long
+pointed to by
+.IR addr .
+.TP
+.B ARCH_SET_GS
+Set the 64-bit base for the
+.I GS
+register to
+.IR addr .
+.TP
+.B ARCH_GET_GS
+Return the 64-bit base value for the
+.I GS
+register of the calling thread in the
+.I unsigned long
+pointed to by
+.IR addr .
+.SH RETURN VALUE
+On success,
+.BR arch_prctl ()
+returns 0; on error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EFAULT
+.I addr
+points to an unmapped address or is outside the process address space.
+.TP
+.B EINVAL
+.I code
+is not a valid subcommand.
+.TP
+.B ENODEV
+.B ARCH_SET_CPUID
+was requested, but the underlying hardware does not support CPUID faulting.
+.TP
+.B EPERM
+.I addr
+is outside the process address space.
+.\" .SH AUTHOR
+.\" Man page written by Andi Kleen.
+.SH STANDARDS
+Linux/x86-64.
+.SH NOTES
+.BR arch_prctl ()
+is supported only on Linux/x86-64 for 64-bit programs currently.
+.PP
+The 64-bit base changes when a new 32-bit segment selector is loaded.
+.PP
+.B ARCH_SET_GS
+is disabled in some kernels.
+.PP
+Context switches for 64-bit segment bases are rather expensive.
+As an optimization, if a 32-bit TLS base address is used,
+.BR arch_prctl ()
+may use a real TLS entry as if
+.BR set_thread_area (2)
+had been called, instead of manipulating the segment base register directly.
+Memory in the first 2\ GB of address space can be allocated by using
+.BR mmap (2)
+with the
+.B MAP_32BIT
+flag.
+.PP
+Because of the aforementioned optimization, using
+.BR arch_prctl ()
+and
+.BR set_thread_area (2)
+in the same thread is dangerous, as they may overwrite each other's
+TLS entries.
+.PP
+.I FS
+may be already used by the threading library.
+Programs that use
+.B ARCH_SET_FS
+directly are very likely to crash.
+.SH SEE ALSO
+.BR mmap (2),
+.BR modify_ldt (2),
+.BR prctl (2),
+.BR set_thread_area (2)
+.PP
+AMD X86-64 Programmer's manual
diff --git a/man2/arm_fadvise.2 b/man2/arm_fadvise.2
new file mode 100644
index 0000000..53f54a1
--- /dev/null
+++ b/man2/arm_fadvise.2
@@ -0,0 +1 @@
+.so man2/posix_fadvise.2
diff --git a/man2/arm_fadvise64_64.2 b/man2/arm_fadvise64_64.2
new file mode 100644
index 0000000..53f54a1
--- /dev/null
+++ b/man2/arm_fadvise64_64.2
@@ -0,0 +1 @@
+.so man2/posix_fadvise.2
diff --git a/man2/arm_sync_file_range.2 b/man2/arm_sync_file_range.2
new file mode 100644
index 0000000..ad7a1e6
--- /dev/null
+++ b/man2/arm_sync_file_range.2
@@ -0,0 +1 @@
+.so man2/sync_file_range.2
diff --git a/man2/bdflush.2 b/man2/bdflush.2
new file mode 100644
index 0000000..d97949e
--- /dev/null
+++ b/man2/bdflush.2
@@ -0,0 +1,103 @@
+.\" Copyright (c) 1995 Michael Chastain (mec@shell.portal.com), 15 April 1995.
+.\"
+.\" SPDX-License-Identifier: GPL-2.0-or-later
+.\"
+.\" Modified 1997-01-31 by Eric S. Raymond <esr@thyrsus.com>
+.\" Modified 2004-06-17 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.TH bdflush 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+bdflush \- start, flush, or tune buffer-dirty-flush daemon
+.SH SYNOPSIS
+.nf
+.B #include <sys/kdaemon.h>
+.PP
+.BI "[[deprecated]] int bdflush(int " func ", long *" address );
+.BI "[[deprecated]] int bdflush(int " func ", long " data );
+.fi
+.SH DESCRIPTION
+.IR Note :
+Since Linux 2.6,
+.\" As noted in changes in the 2.5.12 source
+this system call is deprecated and does nothing.
+It is likely to disappear altogether in a future kernel release.
+Nowadays, the task performed by
+.BR bdflush ()
+is handled by the kernel
+.I pdflush
+thread.
+.PP
+.BR bdflush ()
+starts, flushes, or tunes the buffer-dirty-flush daemon.
+Only a privileged process (one with the
+.B CAP_SYS_ADMIN
+capability) may call
+.BR bdflush ().
+.PP
+If
+.I func
+is negative or 0, and no daemon has been started, then
+.BR bdflush ()
+enters the daemon code and never returns.
+.PP
+If
+.I func
+is 1,
+some dirty buffers are written to disk.
+.PP
+If
+.I func
+is 2 or more and is even (low bit is 0), then
+.I address
+is the address of a long word,
+and the tuning parameter numbered
+.RI "(" "func" "\-2)/2"
+is returned to the caller in that address.
+.PP
+If
+.I func
+is 3 or more and is odd (low bit is 1), then
+.I data
+is a long word,
+and the kernel sets tuning parameter numbered
+.RI "(" "func" "\-3)/2"
+to that value.
+.PP
+The set of parameters, their values, and their valid ranges
+are defined in the Linux kernel source file
+.IR fs/buffer.c .
+.SH RETURN VALUE
+If
+.I func
+is negative or 0 and the daemon successfully starts,
+.BR bdflush ()
+never returns.
+Otherwise, the return value is 0 on success and \-1 on failure, with
+.I errno
+set to indicate the error.
+.SH ERRORS
+.TP
+.B EBUSY
+An attempt was made to enter the daemon code after
+another process has already entered.
+.TP
+.B EFAULT
+.I address
+points outside your accessible address space.
+.TP
+.B EINVAL
+An attempt was made to read or write an invalid parameter number,
+or to write an invalid value to a parameter.
+.TP
+.B EPERM
+Caller does not have the
+.B CAP_SYS_ADMIN
+capability.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Since glibc 2.23, glibc no longer supports this obsolete system call.
+.SH SEE ALSO
+.BR sync (1),
+.BR fsync (2),
+.BR sync (2)
diff --git a/man2/bind.2 b/man2/bind.2
new file mode 100644
index 0000000..6288c41
--- /dev/null
+++ b/man2/bind.2
@@ -0,0 +1,286 @@
+.\" Copyright 1993 Rickard E. Faith (faith@cs.unc.edu)
+.\" and Copyright 2005-2007, Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Portions extracted from /usr/include/sys/socket.h, which does not have
+.\" any authorship information in it. It is probably available under the GPL.
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\"
+.\" Other portions are from the 6.9 (Berkeley) 3/10/91 man page:
+.\"
+.\" Copyright (c) 1983 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" SPDX-License-Identifier: BSD-4-Clause-UC
+.\"
+.\" Modified Mon Oct 21 23:05:29 EDT 1996 by Eric S. Raymond <esr@thyrsus.com>
+.\" Modified 1998 by Andi Kleen
+.\" $Id: bind.2,v 1.3 1999/04/23 19:56:07 freitag Exp $
+.\" Modified 2004-06-23 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.TH bind 2 2023-05-03 "Linux man-pages 6.05.01"
+.SH NAME
+bind \- bind a name to a socket
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/socket.h>
+.PP
+.BI "int bind(int " sockfd ", const struct sockaddr *" addr ,
+.BI " socklen_t " addrlen );
+.fi
+.SH DESCRIPTION
+When a socket is created with
+.BR socket (2),
+it exists in a name space (address family) but has no address assigned to it.
+.BR bind ()
+assigns the address specified by
+.I addr
+to the socket referred to by the file descriptor
+.IR sockfd .
+.I addrlen
+specifies the size, in bytes, of the address structure pointed to by
+.IR addr .
+Traditionally, this operation is called \[lq]assigning a name to a socket\[rq].
+.PP
+It is normally necessary to assign a local address using
+.BR bind ()
+before a
+.B SOCK_STREAM
+socket may receive connections (see
+.BR accept (2)).
+.PP
+The rules used in name binding vary between address families.
+Consult the manual entries in Section 7 for detailed information.
+For
+.BR AF_INET ,
+see
+.BR ip (7);
+for
+.BR AF_INET6 ,
+see
+.BR ipv6 (7);
+for
+.BR AF_UNIX ,
+see
+.BR unix (7);
+for
+.BR AF_APPLETALK ,
+see
+.BR ddp (7);
+for
+.BR AF_PACKET ,
+see
+.BR packet (7);
+for
+.BR AF_X25 ,
+see
+.BR x25 (7);
+and for
+.BR AF_NETLINK ,
+see
+.BR netlink (7).
+.PP
+The actual structure passed for the
+.I addr
+argument will depend on the address family.
+The
+.I sockaddr
+structure is defined as something like:
+.PP
+.in +4n
+.EX
+struct sockaddr {
+ sa_family_t sa_family;
+ char sa_data[14];
+};
+.EE
+.in
+.PP
+The only purpose of this structure is to cast the structure
+pointer passed in
+.I addr
+in order to avoid compiler warnings.
+See EXAMPLES below.
+.SH RETURN VALUE
+On success, zero is returned.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EACCES
+.\" e.g., privileged port in AF_INET domain
+The address is protected, and the user is not the superuser.
+.TP
+.B EADDRINUSE
+The given address is already in use.
+.TP
+.B EADDRINUSE
+(Internet domain sockets)
+The port number was specified as zero in the socket address structure,
+but, upon attempting to bind to an ephemeral port,
+it was determined that all port numbers in the ephemeral port range
+are currently in use.
+See the discussion of
+.IR /proc/sys/net/ipv4/ip_local_port_range " in"
+.BR ip (7).
+.TP
+.B EBADF
+.I sockfd
+is not a valid file descriptor.
+.TP
+.B EINVAL
+The socket is already bound to an address.
+.\" This may change in the future: see
+.\" .I linux/unix/sock.c for details.
+.TP
+.B EINVAL
+.I addrlen
+is wrong, or
+.I addr
+is not a valid address for this socket's domain.
+.TP
+.B ENOTSOCK
+The file descriptor
+.I sockfd
+does not refer to a socket.
+.PP
+The following errors are specific to UNIX domain
+.RB ( AF_UNIX )
+sockets:
+.TP
+.B EACCES
+Search permission is denied on a component of the path prefix.
+(See also
+.BR path_resolution (7).)
+.TP
+.B EADDRNOTAVAIL
+A nonexistent interface was requested or the requested
+address was not local.
+.TP
+.B EFAULT
+.I addr
+points outside the user's accessible address space.
+.TP
+.B ELOOP
+Too many symbolic links were encountered in resolving
+.IR addr .
+.TP
+.B ENAMETOOLONG
+.I addr
+is too long.
+.TP
+.B ENOENT
+A component in the directory prefix of the socket pathname does not exist.
+.TP
+.B ENOMEM
+Insufficient kernel memory was available.
+.TP
+.B ENOTDIR
+A component of the path prefix is not a directory.
+.TP
+.B EROFS
+The socket inode would reside on a read-only filesystem.
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+POSIX.1-2001, SVr4, 4.4BSD
+.RB ( bind ()
+first appeared in 4.2BSD).
+.\" SVr4 documents an additional
+.\" .B ENOSR
+.\" general error condition, and
+.\" additional
+.\" .B EIO
+.\" and
+.\" .B EISDIR
+.\" UNIX-domain error conditions.
+.SH BUGS
+The transparent proxy options are not described.
+.\" FIXME Document transparent proxy options
+.SH EXAMPLES
+An example of the use of
+.BR bind ()
+with Internet domain sockets can be found in
+.BR getaddrinfo (3).
+.PP
+The following example shows how to bind a stream socket in the UNIX
+.RB ( AF_UNIX )
+domain, and accept connections:
+.\" listen.2 refers to this example.
+.\" accept.2 refers to this example.
+.\" unix.7 refers to this example.
+.PP
+.\" SRC BEGIN (bind.c)
+.EX
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <unistd.h>
+\&
+#define MY_SOCK_PATH "/somepath"
+#define LISTEN_BACKLOG 50
+\&
+#define handle_error(msg) \e
+ do { perror(msg); exit(EXIT_FAILURE); } while (0)
+\&
+int
+main(void)
+{
+ int sfd, cfd;
+ socklen_t peer_addr_size;
+ struct sockaddr_un my_addr, peer_addr;
+\&
+ sfd = socket(AF_UNIX, SOCK_STREAM, 0);
+ if (sfd == \-1)
+ handle_error("socket");
+\&
+ memset(&my_addr, 0, sizeof(my_addr));
+ my_addr.sun_family = AF_UNIX;
+ strncpy(my_addr.sun_path, MY_SOCK_PATH,
+ sizeof(my_addr.sun_path) \- 1);
+\&
+ if (bind(sfd, (struct sockaddr *) &my_addr,
+ sizeof(my_addr)) == \-1)
+ handle_error("bind");
+\&
+ if (listen(sfd, LISTEN_BACKLOG) == \-1)
+ handle_error("listen");
+\&
+ /* Now we can accept incoming connections one
+ at a time using accept(2). */
+\&
+ peer_addr_size = sizeof(peer_addr);
+ cfd = accept(sfd, (struct sockaddr *) &peer_addr,
+ &peer_addr_size);
+ if (cfd == \-1)
+ handle_error("accept");
+\&
+ /* Code to deal with incoming connection(s)... */
+\&
+ if (close(sfd) == \-1)
+ handle_error("close");
+\&
+ if (unlink(MY_SOCK_PATH) == \-1)
+ handle_error("unlink");
+}
+.EE
+.\" SRC END
+.SH SEE ALSO
+.BR accept (2),
+.BR connect (2),
+.BR getsockname (2),
+.BR listen (2),
+.BR socket (2),
+.BR getaddrinfo (3),
+.BR getifaddrs (3),
+.BR ip (7),
+.BR ipv6 (7),
+.BR path_resolution (7),
+.BR socket (7),
+.BR unix (7)
diff --git a/man2/bpf.2 b/man2/bpf.2
new file mode 100644
index 0000000..4df108d
--- /dev/null
+++ b/man2/bpf.2
@@ -0,0 +1,1273 @@
+.\" Copyright (C) 2015 Alexei Starovoitov <ast@kernel.org>
+.\" and Copyright (C) 2015 Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.TH bpf 2 2023-07-28 "Linux man-pages 6.05.01"
+.SH NAME
+bpf \- perform a command on an extended BPF map or program
+.SH SYNOPSIS
+.nf
+.B #include <linux/bpf.h>
+.PP
+.BI "int bpf(int " cmd ", union bpf_attr *" attr ", unsigned int " size );
+.fi
+.SH DESCRIPTION
+The
+.BR bpf ()
+system call performs a range of operations related to extended
+Berkeley Packet Filters.
+Extended BPF (or eBPF) is similar to
+the original ("classic") BPF (cBPF) used to filter network packets.
+For both cBPF and eBPF programs,
+the kernel statically analyzes the programs before loading them,
+in order to ensure that they cannot harm the running system.
+.PP
+eBPF extends cBPF in multiple ways, including the ability to call
+a fixed set of in-kernel helper functions
+.\" See 'enum bpf_func_id' in include/uapi/linux/bpf.h
+(via the
+.B BPF_CALL
+opcode extension provided by eBPF)
+and access shared data structures such as eBPF maps.
+.\"
+.SS Extended BPF Design/Architecture
+eBPF maps are a generic data structure for storage of different data types.
+Data types are generally treated as binary blobs, so a user just specifies
+the size of the key and the size of the value at map-creation time.
+In other words, a key/value for a given map can have an arbitrary structure.
+.PP
+A user process can create multiple maps (with key/value-pairs being
+opaque bytes of data) and access them via file descriptors.
+Different eBPF programs can access the same maps in parallel.
+It's up to the user process and eBPF program to decide what they store
+inside maps.
+.PP
+There's one special map type, called a program array.
+This type of map stores file descriptors referring to other eBPF programs.
+When a lookup in the map is performed, the program flow is
+redirected in-place to the beginning of another eBPF program and does not
+return back to the calling program.
+The level of nesting has a fixed limit of 32,
+.\" Defined by the kernel constant MAX_TAIL_CALL_CNT in include/linux/bpf.h
+so that infinite loops cannot be crafted.
+At run time, the program file descriptors stored in the map can be modified,
+so program functionality can be altered based on specific requirements.
+All programs referred to in a program-array map must
+have been previously loaded into the kernel via
+.BR bpf ().
+If a map lookup fails, the current program continues its execution.
+See
+.B BPF_MAP_TYPE_PROG_ARRAY
+below for further details.
+.PP
+Generally, eBPF programs are loaded by the user process and automatically
+unloaded when the process exits.
+In some cases, for example,
+.BR tc\-bpf (8),
+the program will continue to stay alive inside the kernel even after the
+process that loaded the program exits.
+In that case,
+the tc subsystem holds a reference to the eBPF program after the
+file descriptor has been closed by the user-space program.
+Thus, whether a specific program continues to live inside the kernel
+depends on how it is further attached to a given kernel subsystem
+after it was loaded via
+.BR bpf ().
+.PP
+Each eBPF program is a set of instructions that is safe to run until
+its completion.
+An in-kernel verifier statically determines that the eBPF program
+terminates and is safe to execute.
+During verification, the kernel increments reference counts for each of
+the maps that the eBPF program uses,
+so that the attached maps can't be removed until the program is unloaded.
+.PP
+eBPF programs can be attached to different events.
+These events can be the arrival of network packets, tracing
+events, classification events by network queueing disciplines
+(for eBPF programs attached to a
+.BR tc (8)
+classifier), and other types that may be added in the future.
+A new event triggers execution of the eBPF program, which
+may store information about the event in eBPF maps.
+Beyond storing data, eBPF programs may call a fixed set of
+in-kernel helper functions.
+.PP
+The same eBPF program can be attached to multiple events and different
+eBPF programs can access the same map:
+.PP
+.in +4n
+.EX
+tracing tracing tracing packet packet packet
+event A event B event C on eth0 on eth1 on eth2
+ | | | | | \[ha]
+ | | | | v |
+ \-\-> tracing <\-\- tracing socket tc ingress tc egress
+ prog_1 prog_2 prog_3 classifier action
+ | | | | prog_4 prog_5
+ |\-\-\- \-\-\-\-\-| |\-\-\-\-\-\-| map_3 | |
+ map_1 map_2 \-\-| map_4 |\-\-
+.EE
+.in
+.\"
+.SS Arguments
+The operation to be performed by the
+.BR bpf ()
+system call is determined by the
+.I cmd
+argument.
+Each operation takes an accompanying argument,
+provided via
+.IR attr ,
+which is a pointer to a union of type
+.I bpf_attr
+(see below).
+The unused fields and padding must be zeroed out before the call.
+The
+.I size
+argument is the size of the union pointed to by
+.IR attr .
+.PP
+The value provided in
+.I cmd
+is one of the following:
+.TP
+.B BPF_MAP_CREATE
+Create a map and return a file descriptor that refers to the map.
+The close-on-exec file descriptor flag (see
+.BR fcntl (2))
+is automatically enabled for the new file descriptor.
+.TP
+.B BPF_MAP_LOOKUP_ELEM
+Look up an element by key in a specified map and return its value.
+.TP
+.B BPF_MAP_UPDATE_ELEM
+Create or update an element (key/value pair) in a specified map.
+.TP
+.B BPF_MAP_DELETE_ELEM
+Look up and delete an element by key in a specified map.
+.TP
+.B BPF_MAP_GET_NEXT_KEY
+Look up an element by key in a specified map and return the key
+of the next element.
+.TP
+.B BPF_PROG_LOAD
+Verify and load an eBPF program,
+returning a new file descriptor associated with the program.
+The close-on-exec file descriptor flag (see
+.BR fcntl (2))
+is automatically enabled for the new file descriptor.
+.IP
+The
+.I bpf_attr
+union consists of various anonymous structures that are used by different
+.BR bpf ()
+commands:
+.PP
+.in +4n
+.EX
+union bpf_attr {
+ struct { /* Used by BPF_MAP_CREATE */
+ __u32 map_type;
+ __u32 key_size; /* size of key in bytes */
+ __u32 value_size; /* size of value in bytes */
+ __u32 max_entries; /* maximum number of entries
+ in a map */
+ };
+\&
+ struct { /* Used by BPF_MAP_*_ELEM and BPF_MAP_GET_NEXT_KEY
+ commands */
+ __u32 map_fd;
+ __aligned_u64 key;
+ union {
+ __aligned_u64 value;
+ __aligned_u64 next_key;
+ };
+ __u64 flags;
+ };
+\&
+ struct { /* Used by BPF_PROG_LOAD */
+ __u32 prog_type;
+ __u32 insn_cnt;
+ __aligned_u64 insns; /* \[aq]const struct bpf_insn *\[aq] */
+ __aligned_u64 license; /* \[aq]const char *\[aq] */
+ __u32 log_level; /* verbosity level of verifier */
+ __u32 log_size; /* size of user buffer */
+ __aligned_u64 log_buf; /* user supplied \[aq]char *\[aq]
+ buffer */
+ __u32 kern_version;
+ /* checked when prog_type=kprobe
+ (since Linux 4.1) */
+.\" commit 2541517c32be2531e0da59dfd7efc1ce844644f5
+ };
+} __attribute__((aligned(8)));
+.EE
+.in
+.\"
+.SS eBPF maps
+Maps are a generic data structure for storage of different types of data.
+They allow sharing of data between eBPF kernel programs,
+and also between kernel and user-space applications.
+.PP
+Each map type has the following attributes:
+.IP \[bu] 3
+type
+.IP \[bu]
+maximum number of elements
+.IP \[bu]
+key size in bytes
+.IP \[bu]
+value size in bytes
+.PP
+The following wrapper functions demonstrate how various
+.BR bpf ()
+commands can be used to access the maps.
+The functions use the
+.I cmd
+argument to invoke different operations.
+.TP
+.B BPF_MAP_CREATE
+The
+.B BPF_MAP_CREATE
+command creates a new map,
+returning a new file descriptor that refers to the map.
+.IP
+.in +4n
+.EX
+int
+bpf_create_map(enum bpf_map_type map_type,
+ unsigned int key_size,
+ unsigned int value_size,
+ unsigned int max_entries)
+{
+ union bpf_attr attr = {
+ .map_type = map_type,
+ .key_size = key_size,
+ .value_size = value_size,
+ .max_entries = max_entries
+ };
+\&
+ return bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
+}
+.EE
+.in
+.IP
+The new map has the type specified by
+.IR map_type ,
+and attributes as specified in
+.IR key_size ,
+.IR value_size ,
+and
+.IR max_entries .
+On success, this operation returns a file descriptor.
+On error, \-1 is returned and
+.I errno
+is set to
+.BR EINVAL ,
+.BR EPERM ,
+or
+.BR ENOMEM .
+.IP
+The
+.I key_size
+and
+.I value_size
+attributes will be used by the verifier during program loading
+to check that the program is calling
+.BR bpf_map_*_elem ()
+helper functions with a correctly initialized
+.I key
+and to check that the program doesn't access the map element
+.I value
+beyond the specified
+.IR value_size .
+For example, when a map is created with a
+.I key_size
+of 8 and the eBPF program calls
+.IP
+.in +4n
+.EX
+bpf_map_lookup_elem(map_fd, fp \- 4)
+.EE
+.in
+.IP
+the program will be rejected,
+since the in-kernel helper function
+.IP
+.in +4n
+.EX
+bpf_map_lookup_elem(map_fd, void *key)
+.EE
+.in
+.IP
+expects to read 8 bytes from the location pointed to by
+.IR key ,
+but the
+.I fp\ \-\ 4
+(where
+.I fp
+is the top of the stack)
+starting address will cause out-of-bounds stack access.
+.IP
+Similarly, when a map is created with a
+.I value_size
+of 1 and the eBPF program contains
+.IP
+.in +4n
+.EX
+value = bpf_map_lookup_elem(...);
+*(u32 *) value = 1;
+.EE
+.in
+.IP
+the program will be rejected, since it accesses the
+.I value
+pointer beyond the specified 1 byte
+.I value_size
+limit.
+.IP
+Currently, the following values are supported for
+.IR map_type :
+.IP
+.in +4n
+.EX
+enum bpf_map_type {
+ BPF_MAP_TYPE_UNSPEC, /* Reserve 0 as invalid map type */
+ BPF_MAP_TYPE_HASH,
+ BPF_MAP_TYPE_ARRAY,
+ BPF_MAP_TYPE_PROG_ARRAY,
+ BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+ BPF_MAP_TYPE_PERCPU_HASH,
+ BPF_MAP_TYPE_PERCPU_ARRAY,
+ BPF_MAP_TYPE_STACK_TRACE,
+ BPF_MAP_TYPE_CGROUP_ARRAY,
+ BPF_MAP_TYPE_LRU_HASH,
+ BPF_MAP_TYPE_LRU_PERCPU_HASH,
+ BPF_MAP_TYPE_LPM_TRIE,
+ BPF_MAP_TYPE_ARRAY_OF_MAPS,
+ BPF_MAP_TYPE_HASH_OF_MAPS,
+ BPF_MAP_TYPE_DEVMAP,
+ BPF_MAP_TYPE_SOCKMAP,
+ BPF_MAP_TYPE_CPUMAP,
+ BPF_MAP_TYPE_XSKMAP,
+ BPF_MAP_TYPE_SOCKHASH,
+ BPF_MAP_TYPE_CGROUP_STORAGE,
+ BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
+ BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE,
+ BPF_MAP_TYPE_QUEUE,
+ BPF_MAP_TYPE_STACK,
+ /* See /usr/include/linux/bpf.h for the full list. */
+};
+.EE
+.in
+.IP
+.I map_type
+selects one of the available map implementations in the kernel.
+.\" FIXME We need an explanation of why one might choose each of
+.\" these map implementations
+For all map types,
+eBPF programs access maps with the same
+.BR bpf_map_lookup_elem ()
+and
+.BR bpf_map_update_elem ()
+helper functions.
+Further details of the various map types are given below.
+.TP
+.B BPF_MAP_LOOKUP_ELEM
+The
+.B BPF_MAP_LOOKUP_ELEM
+command looks up an element with a given
+.I key
+in the map referred to by the file descriptor
+.IR fd .
+.IP
+.in +4n
+.EX
+int
+bpf_lookup_elem(int fd, const void *key, void *value)
+{
+ union bpf_attr attr = {
+ .map_fd = fd,
+ .key = ptr_to_u64(key),
+ .value = ptr_to_u64(value),
+ };
+\&
+ return bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr));
+}
+.EE
+.in
+.IP
+If an element is found,
+the operation returns zero and stores the element's value into
+.IR value ,
+which must point to a buffer of
+.I value_size
+bytes.
+.IP
+If no element is found, the operation returns \-1 and sets
+.I errno
+to
+.BR ENOENT .
+.TP
+.B BPF_MAP_UPDATE_ELEM
+The
+.B BPF_MAP_UPDATE_ELEM
+command
+creates or updates an element with a given
+.I key/value
+in the map referred to by the file descriptor
+.IR fd .
+.IP
+.in +4n
+.EX
+int
+bpf_update_elem(int fd, const void *key, const void *value,
+ uint64_t flags)
+{
+ union bpf_attr attr = {
+ .map_fd = fd,
+ .key = ptr_to_u64(key),
+ .value = ptr_to_u64(value),
+ .flags = flags,
+ };
+\&
+ return bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
+}
+.EE
+.in
+.IP
+The
+.I flags
+argument should be specified as one of the following:
+.RS
+.TP
+.B BPF_ANY
+Create a new element or update an existing element.
+.TP
+.B BPF_NOEXIST
+Create a new element only if it did not exist.
+.TP
+.B BPF_EXIST
+Update an existing element.
+.RE
+.IP
+On success, the operation returns zero.
+On error, \-1 is returned and
+.I errno
+is set to
+.BR EINVAL ,
+.BR EPERM ,
+.BR ENOMEM ,
+or
+.BR E2BIG .
+.B E2BIG
+indicates that the number of elements in the map reached the
+.I max_entries
+limit specified at map creation time.
+.B EEXIST
+will be returned if
+.I flags
+specifies
+.B BPF_NOEXIST
+and the element with
+.I key
+already exists in the map.
+.B ENOENT
+will be returned if
+.I flags
+specifies
+.B BPF_EXIST
+and the element with
+.I key
+doesn't exist in the map.
+.TP
+.B BPF_MAP_DELETE_ELEM
+The
+.B BPF_MAP_DELETE_ELEM
+command
+deletes the element whose key is
+.I key
+from the map referred to by the file descriptor
+.IR fd .
+.IP
+.in +4n
+.EX
+int
+bpf_delete_elem(int fd, const void *key)
+{
+ union bpf_attr attr = {
+ .map_fd = fd,
+ .key = ptr_to_u64(key),
+ };
+\&
+ return bpf(BPF_MAP_DELETE_ELEM, &attr, sizeof(attr));
+}
+.EE
+.in
+.IP
+On success, zero is returned.
+If the element is not found, \-1 is returned and
+.I errno
+is set to
+.BR ENOENT .
+.TP
+.B BPF_MAP_GET_NEXT_KEY
+The
+.B BPF_MAP_GET_NEXT_KEY
+command looks up an element by
+.I key
+in the map referred to by the file descriptor
+.I fd
+and sets the
+.I next_key
+pointer to the key of the next element.
+.IP
+.in +4n
+.EX
+int
+bpf_get_next_key(int fd, const void *key, void *next_key)
+{
+ union bpf_attr attr = {
+ .map_fd = fd,
+ .key = ptr_to_u64(key),
+ .next_key = ptr_to_u64(next_key),
+ };
+\&
+ return bpf(BPF_MAP_GET_NEXT_KEY, &attr, sizeof(attr));
+}
+.EE
+.in
+.IP
+If
+.I key
+is found, the operation returns zero and sets the
+.I next_key
+pointer to the key of the next element.
+If
+.I key
+is not found, the operation returns zero and sets the
+.I next_key
+pointer to the key of the first element.
+If
+.I key
+is the last element, \-1 is returned and
+.I errno
+is set to
+.BR ENOENT .
+Other possible
+.I errno
+values are
+.BR ENOMEM ,
+.BR EFAULT ,
+.BR EPERM ,
+and
+.BR EINVAL .
+This method can be used to iterate over all elements in the map.
+.TP
+.B close(map_fd)
+Delete the map referred to by the file descriptor
+.IR map_fd .
+When the user-space program that created a map exits, all maps will
+be deleted automatically (but see NOTES).
+.\"
+.SS eBPF map types
+The following map types are supported:
+.TP
+.B BPF_MAP_TYPE_HASH
+.\" commit 0f8e4bd8a1fc8c4185f1630061d0a1f2d197a475
+Hash-table maps have the following characteristics:
+.RS
+.IP \[bu] 3
+Maps are created and destroyed by user-space programs.
+Both user-space and eBPF programs
+can perform lookup, update, and delete operations.
+.IP \[bu]
+The kernel takes care of allocating and freeing key/value pairs.
+.IP \[bu]
+The
+.BR map_update_elem ()
+helper will fail to insert a new element when the
+.I max_entries
+limit is reached.
+(This ensures that eBPF programs cannot exhaust memory.)
+.IP \[bu]
+.BR map_update_elem ()
+replaces existing elements atomically.
+.RE
+.IP
+Hash-table maps are
+optimized for speed of lookup.
+.TP
+.B BPF_MAP_TYPE_ARRAY
+.\" commit 28fbcfa08d8ed7c5a50d41a0433aad222835e8e3
+Array maps have the following characteristics:
+.RS
+.IP \[bu] 3
+Optimized for fastest possible lookup.
+In the future the verifier/JIT compiler
+may recognize lookup() operations that employ a constant key
+and optimize them into a constant pointer.
+It is possible to optimize a non-constant
+key into direct pointer arithmetic as well, since pointers and
+.I value_size
+are constant for the life of the eBPF program.
+In other words,
+.BR array_map_lookup_elem ()
+may be 'inlined' by the verifier/JIT compiler
+while preserving concurrent access to this map from user space.
+.IP \[bu]
+All array elements are pre-allocated and zero-initialized at init time.
+.IP \[bu]
+The key is an array index, and must be exactly four bytes.
+.IP \[bu]
+.BR map_delete_elem ()
+fails with the error
+.BR EINVAL ,
+since elements cannot be deleted.
+.IP \[bu]
+.BR map_update_elem ()
+replaces elements in a
+.B nonatomic
+fashion;
+for atomic updates, a hash-table map should be used instead.
+There is however one special case that can also be used with arrays:
+the atomic built-in
+.B __sync_fetch_and_add()
+can be used on 32 and 64 bit atomic counters.
+For example, it can be
+applied on the whole value itself if it represents a single counter,
+or in case of a structure containing multiple counters, it could be
+used on individual counters.
+This is quite often useful for aggregation and accounting of events.
+.RE
+.IP
+Among the uses for array maps are the following:
+.RS
+.IP \[bu] 3
+As "global" eBPF variables: an array of 1 element whose key is (index) 0
+and where the value is a collection of 'global' variables which
+eBPF programs can use to keep state between events.
+.IP \[bu]
+Aggregation of tracing events into a fixed set of buckets.
+.IP \[bu]
+Accounting of networking events, for example, number of packets and packet
+sizes.
+.RE
+.TP
+.BR BPF_MAP_TYPE_PROG_ARRAY " (since Linux 4.2)"
+A program array map is a special kind of array map whose map values
+contain only file descriptors referring to other eBPF programs.
+Thus, both the
+.I key_size
+and
+.I value_size
+must be exactly four bytes.
+This map is used in conjunction with the
+.BR bpf_tail_call ()
+helper.
+.IP
+This means that an eBPF program with a program array map attached to it
+can call from kernel side into
+.IP
+.in +4n
+.EX
+void bpf_tail_call(void *context, void *prog_map,
+ unsigned int index);
+.EE
+.in
+.IP
+and therefore replace its own program flow with the one from the program
+at the given program array slot, if present.
+This can be regarded as kind of a jump table to a different eBPF program.
+The invoked program will then reuse the same stack.
+When a jump into the new program has been performed,
+it won't return to the old program anymore.
+.IP
+If no eBPF program is found at the given index of the program array
+(because the map slot doesn't contain a valid program file descriptor,
+the specified lookup index/key is out of bounds,
+or the limit of 32
+.\" MAX_TAIL_CALL_CNT
+nested calls has been exceeded),
+execution continues with the current eBPF program.
+This can be used as a fall-through for default cases.
+.IP
+A program array map is useful, for example, in tracing or networking, to
+handle individual system calls or protocols in their own subprograms and
+use their identifiers as an individual map index.
+This approach may result in performance benefits,
+and also makes it possible to overcome the maximum
+instruction limit of a single eBPF program.
+In dynamic environments,
+a user-space daemon might atomically replace individual subprograms
+at run-time with newer versions to alter overall program behavior,
+for instance, if global policies change.
+.\"
+.SS eBPF programs
+The
+.B BPF_PROG_LOAD
+command is used to load an eBPF program into the kernel.
+The return value for this command is a new file descriptor associated
+with this eBPF program.
+.PP
+.in +4n
+.EX
+char bpf_log_buf[LOG_BUF_SIZE];
+\&
+int
+bpf_prog_load(enum bpf_prog_type type,
+ const struct bpf_insn *insns, int insn_cnt,
+ const char *license)
+{
+ union bpf_attr attr = {
+ .prog_type = type,
+ .insns = ptr_to_u64(insns),
+ .insn_cnt = insn_cnt,
+ .license = ptr_to_u64(license),
+ .log_buf = ptr_to_u64(bpf_log_buf),
+ .log_size = LOG_BUF_SIZE,
+ .log_level = 1,
+ };
+\&
+ return bpf(BPF_PROG_LOAD, &attr, sizeof(attr));
+}
+.EE
+.in
+.PP
+.I prog_type
+is one of the available program types:
+.IP
+.in +4n
+.EX
+enum bpf_prog_type {
+ BPF_PROG_TYPE_UNSPEC, /* Reserve 0 as invalid
+ program type */
+ BPF_PROG_TYPE_SOCKET_FILTER,
+ BPF_PROG_TYPE_KPROBE,
+ BPF_PROG_TYPE_SCHED_CLS,
+ BPF_PROG_TYPE_SCHED_ACT,
+ BPF_PROG_TYPE_TRACEPOINT,
+ BPF_PROG_TYPE_XDP,
+ BPF_PROG_TYPE_PERF_EVENT,
+ BPF_PROG_TYPE_CGROUP_SKB,
+ BPF_PROG_TYPE_CGROUP_SOCK,
+ BPF_PROG_TYPE_LWT_IN,
+ BPF_PROG_TYPE_LWT_OUT,
+ BPF_PROG_TYPE_LWT_XMIT,
+ BPF_PROG_TYPE_SOCK_OPS,
+ BPF_PROG_TYPE_SK_SKB,
+ BPF_PROG_TYPE_CGROUP_DEVICE,
+ BPF_PROG_TYPE_SK_MSG,
+ BPF_PROG_TYPE_RAW_TRACEPOINT,
+ BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
+ BPF_PROG_TYPE_LWT_SEG6LOCAL,
+ BPF_PROG_TYPE_LIRC_MODE2,
+ BPF_PROG_TYPE_SK_REUSEPORT,
+ BPF_PROG_TYPE_FLOW_DISSECTOR,
+ /* See /usr/include/linux/bpf.h for the full list. */
+};
+.EE
+.in
+.PP
+For further details of eBPF program types, see below.
+.PP
+The remaining fields of
+.I bpf_attr
+are set as follows:
+.IP \[bu] 3
+.I insns
+is an array of
+.I "struct bpf_insn"
+instructions.
+.IP \[bu]
+.I insn_cnt
+is the number of instructions in the program referred to by
+.IR insns .
+.IP \[bu]
+.I license
+is a license string, which must be GPL compatible to call helper functions
+marked
+.IR gpl_only .
+(The licensing rules are the same as for kernel modules,
+so that also dual licenses, such as "Dual BSD/GPL", may be used.)
+.IP \[bu]
+.I log_buf
+is a pointer to a caller-allocated buffer in which the in-kernel
+verifier can store the verification log.
+This log is a multi-line string that can be checked by
+the program author in order to understand how the verifier came to
+the conclusion that the eBPF program is unsafe.
+The format of the output can change at any time as the verifier evolves.
+.IP \[bu]
+.I log_size
+is the size of the buffer pointed to by
+.IR log_buf .
+If the size of the buffer is not large enough to store all
+verifier messages, \-1 is returned and
+.I errno
+is set to
+.BR ENOSPC .
+.IP \[bu]
+.I log_level
+is the verbosity level of the verifier.
+A value of zero means that the verifier will not provide a log;
+in this case,
+.I log_buf
+must be a NULL pointer, and
+.I log_size
+must be zero.
+.PP
+Applying
+.BR close (2)
+to the file descriptor returned by
+.B BPF_PROG_LOAD
+will unload the eBPF program (but see NOTES).
+.PP
+Maps are accessible from eBPF programs and are used to exchange data between
+eBPF programs and between eBPF programs and user-space programs.
+For example,
+eBPF programs can process various events (like kprobe, packets) and
+store their data into a map,
+and user-space programs can then fetch data from the map.
+Conversely, user-space programs can use a map as a configuration mechanism,
+populating the map with values checked by the eBPF program,
+which then modifies its behavior on the fly according to those values.
+.\"
+.\"
+.SS eBPF program types
+The eBPF program type
+.RI ( prog_type )
+determines the subset of kernel helper functions that the program
+may call.
+The program type also determines the program input (context)\[em]the
+format of
+.I "struct bpf_context"
+(which is the data blob passed into the eBPF program as the first argument).
+.\"
+.\" FIXME
+.\" Somewhere in this page we need a general introduction to the
+.\" bpf_context. For example, how does a BPF program access the
+.\" context?
+.PP
+For example, a tracing program does not have the exact same
+subset of helper functions as a socket filter program
+(though they may have some helpers in common).
+Similarly,
+the input (context) for a tracing program is a set of register values,
+while for a socket filter it is a network packet.
+.PP
+The set of functions available to eBPF programs of a given type may increase
+in the future.
+.PP
+The following program types are supported:
+.TP
+.BR BPF_PROG_TYPE_SOCKET_FILTER " (since Linux 3.19)"
+Currently, the set of functions for
+.B BPF_PROG_TYPE_SOCKET_FILTER
+is:
+.IP
+.in +4n
+.EX
+bpf_map_lookup_elem(map_fd, void *key)
+ /* look up key in a map_fd */
+bpf_map_update_elem(map_fd, void *key, void *value)
+ /* update key/value */
+bpf_map_delete_elem(map_fd, void *key)
+ /* delete key in a map_fd */
+.EE
+.in
+.IP
+The
+.I bpf_context
+argument is a pointer to a
+.IR "struct __sk_buff" .
+.\" FIXME: We need some text here to explain how the program
+.\" accesses __sk_buff.
+.\" See 'struct __sk_buff' and commit 9bac3d6d548e5
+.\"
+.\" Alexei commented:
+.\" Actually now in case of SOCKET_FILTER, SCHED_CLS, SCHED_ACT
+.\" the program can now access skb fields.
+.\"
+.TP
+.BR BPF_PROG_TYPE_KPROBE " (since Linux 4.1)"
+.\" commit 2541517c32be2531e0da59dfd7efc1ce844644f5
+[To be documented]
+.\" FIXME Document this program type
+.\" Describe allowed helper functions for this program type
+.\" Describe bpf_context for this program type
+.\"
+.\" FIXME We need text here to describe 'kern_version'
+.TP
+.BR BPF_PROG_TYPE_SCHED_CLS " (since Linux 4.1)"
+.\" commit 96be4325f443dbbfeb37d2a157675ac0736531a1
+.\" commit e2e9b6541dd4b31848079da80fe2253daaafb549
+[To be documented]
+.\" FIXME Document this program type
+.\" Describe allowed helper functions for this program type
+.\" Describe bpf_context for this program type
+.TP
+.BR BPF_PROG_TYPE_SCHED_ACT " (since Linux 4.1)"
+.\" commit 94caee8c312d96522bcdae88791aaa9ebcd5f22c
+.\" commit a8cb5f556b567974d75ea29c15181c445c541b1f
+[To be documented]
+.\" FIXME Document this program type
+.\" Describe allowed helper functions for this program type
+.\" Describe bpf_context for this program type
+.SS Events
+Once a program is loaded, it can be attached to an event.
+Various kernel subsystems have different ways to do so.
+.PP
+Since Linux 3.19,
+.\" commit 89aa075832b0da4402acebd698d0411dcc82d03e
+the following call will attach the program
+.I prog_fd
+to the socket
+.IR sockfd ,
+which was created by an earlier call to
+.BR socket (2):
+.PP
+.in +4n
+.EX
+setsockopt(sockfd, SOL_SOCKET, SO_ATTACH_BPF,
+ &prog_fd, sizeof(prog_fd));
+.EE
+.in
+.PP
+Since Linux 4.1,
+.\" commit 2541517c32be2531e0da59dfd7efc1ce844644f5
+the following call may be used to attach
+the eBPF program referred to by the file descriptor
+.I prog_fd
+to a perf event file descriptor,
+.IR event_fd ,
+that was created by a previous call to
+.BR perf_event_open (2):
+.PP
+.in +4n
+.EX
+ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
+.EE
+.in
+.\"
+.\"
+.SH RETURN VALUE
+For a successful call, the return value depends on the operation:
+.TP
+.B BPF_MAP_CREATE
+The new file descriptor associated with the eBPF map.
+.TP
+.B BPF_PROG_LOAD
+The new file descriptor associated with the eBPF program.
+.TP
+All other commands
+Zero.
+.PP
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B E2BIG
+The eBPF program is too large or a map reached the
+.I max_entries
+limit (maximum number of elements).
+.TP
+.B EACCES
+For
+.BR BPF_PROG_LOAD ,
+even though all program instructions are valid, the program has been
+rejected because it was deemed unsafe.
+This may be because it may have
+accessed a disallowed memory region or an uninitialized stack/register or
+because the function constraints don't match the actual types or because
+there was a misaligned memory access.
+In this case, it is recommended to call
+.BR bpf ()
+again with
+.I log_level = 1
+and examine
+.I log_buf
+for the specific reason provided by the verifier.
+.TP
+.B EAGAIN
+For
+.BR BPF_PROG_LOAD ,
+indicates that needed resources are blocked.
+This happens when the verifier detects pending signals
+while it is checking the validity of the bpf program.
+In this case, just call
+.BR bpf ()
+again with the same parameters.
+.TP
+.B EBADF
+.I fd
+is not an open file descriptor.
+.TP
+.B EFAULT
+One of the pointers
+.RI ( key
+or
+.I value
+or
+.I log_buf
+or
+.IR insns )
+is outside the accessible address space.
+.TP
+.B EINVAL
+The value specified in
+.I cmd
+is not recognized by this kernel.
+.TP
+.B EINVAL
+For
+.BR BPF_MAP_CREATE ,
+either
+.I map_type
+or attributes are invalid.
+.TP
+.B EINVAL
+For
+.B BPF_MAP_*_ELEM
+commands,
+some of the fields of
+.I "union bpf_attr"
+that are not used by this command
+are not set to zero.
+.TP
+.B EINVAL
+For
+.BR BPF_PROG_LOAD ,
+indicates an attempt to load an invalid program.
+eBPF programs can be deemed
+invalid due to unrecognized instructions, the use of reserved fields, jumps
+out of range, infinite loops or calls of unknown functions.
+.TP
+.B ENOENT
+For
+.B BPF_MAP_LOOKUP_ELEM
+or
+.BR BPF_MAP_DELETE_ELEM ,
+indicates that the element with the given
+.I key
+was not found.
+.TP
+.B ENOMEM
+Cannot allocate sufficient memory.
+.TP
+.B EPERM
+The call was made without sufficient privilege
+(without the
+.B CAP_SYS_ADMIN
+capability).
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 3.18.
+.SH NOTES
+Prior to Linux 4.4, all
+.BR bpf ()
+commands require the caller to have the
+.B CAP_SYS_ADMIN
+capability.
+From Linux 4.4 onwards,
+.\" commit 1be7f75d1668d6296b80bf35dcf6762393530afc
+an unprivileged user may create limited programs of type
+.B BPF_PROG_TYPE_SOCKET_FILTER
+and associated maps.
+However, they may not store kernel pointers within
+the maps and are presently limited to the following helper functions:
+.\" [Linux 5.6] mtk: The list of available functions is, I think, governed
+.\" by the check in net/core/filter.c::bpf_base_func_proto().
+.IP \[bu] 3
+get_random
+.PD 0
+.IP \[bu]
+get_smp_processor_id
+.IP \[bu]
+tail_call
+.IP \[bu]
+ktime_get_ns
+.PD
+.PP
+Unprivileged access may be blocked by writing the value 1 to the file
+.IR /proc/sys/kernel/unprivileged_bpf_disabled .
+.PP
+eBPF objects (maps and programs) can be shared between processes.
+For example, after
+.BR fork (2),
+the child inherits file descriptors referring to the same eBPF objects.
+In addition, file descriptors referring to eBPF objects can be
+transferred over UNIX domain sockets.
+File descriptors referring to eBPF objects can be duplicated
+in the usual way, using
+.BR dup (2)
+and similar calls.
+An eBPF object is deallocated only after all file descriptors
+referring to the object have been closed.
+.PP
+eBPF programs can be written in a restricted C that is compiled (using the
+.B clang
+compiler) into eBPF bytecode.
+Various features are omitted from this restricted C, such as loops,
+global variables, variadic functions, floating-point numbers,
+and passing structures as function arguments.
+Some examples can be found in the
+.I samples/bpf/*_kern.c
+files in the kernel source tree.
+.\" There are also examples for the tc classifier, in the iproute2
+.\" project, in examples/bpf
+.PP
+The kernel contains a just-in-time (JIT) compiler that translates
+eBPF bytecode into native machine code for better performance.
+Before Linux 4.15,
+the JIT compiler is disabled by default,
+but its operation can be controlled by writing one of the
+following integer strings to the file
+.IR /proc/sys/net/core/bpf_jit_enable :
+.TP
+.B 0
+Disable JIT compilation (default).
+.TP
+.B 1
+Normal compilation.
+.TP
+.B 2
+Debugging mode.
+The generated opcodes are dumped in hexadecimal into the kernel log.
+These opcodes can then be disassembled using the program
+.I tools/net/bpf_jit_disasm.c
+provided in the kernel source tree.
+.PP
+Since Linux 4.15,
+.\" commit 290af86629b25ffd1ed6232c4e9107da031705cb
+the kernel may be configured with the
+.B CONFIG_BPF_JIT_ALWAYS_ON
+option.
+In this case, the JIT compiler is always enabled, and the
+.I bpf_jit_enable
+is initialized to 1 and is immutable.
+(This kernel configuration option was provided as a mitigation for
+one of the Spectre attacks against the BPF interpreter.)
+.PP
+The JIT compiler for eBPF is currently
+.\" Last reviewed in Linux 4.18-rc by grepping for BPF_ALU64 in arch/
+.\" and by checking the documentation for bpf_jit_enable in
+.\" Documentation/sysctl/net.txt
+available for the following architectures:
+.IP \[bu] 3
+x86-64 (since Linux 3.18; cBPF since Linux 3.0);
+.\" commit 0a14842f5a3c0e88a1e59fac5c3025db39721f74
+.PD 0
+.IP \[bu]
+ARM32 (since Linux 3.18; cBPF since Linux 3.4);
+.\" commit ddecdfcea0ae891f782ae853771c867ab51024c2
+.IP \[bu]
+SPARC 32 (since Linux 3.18; cBPF since Linux 3.5);
+.\" commit 2809a2087cc44b55e4377d7b9be3f7f5d2569091
+.IP \[bu]
+ARM-64 (since Linux 3.18);
+.\" commit e54bcde3d69d40023ae77727213d14f920eb264a
+.IP \[bu]
+s390 (since Linux 4.1; cBPF since Linux 3.7);
+.\" commit c10302efe569bfd646b4c22df29577a4595b4580
+.IP \[bu]
+PowerPC 64 (since Linux 4.8; cBPF since Linux 3.1);
+.\" commit 0ca87f05ba8bdc6791c14878464efc901ad71e99
+.\" commit 156d0e290e969caba25f1851c52417c14d141b24
+.IP \[bu]
+SPARC 64 (since Linux 4.12);
+.\" commit 7a12b5031c6b947cc13918237ae652b536243b76
+.IP \[bu]
+x86-32 (since Linux 4.18);
+.\" commit 03f5781be2c7b7e728d724ac70ba10799cc710d7
+.IP \[bu]
+MIPS 64 (since Linux 4.18; cBPF since Linux 3.16);
+.\" commit c6610de353da5ca6eee5b8960e838a87a90ead0c
+.\" commit f381bf6d82f032b7410185b35d000ea370ac706b
+.IP \[bu]
+riscv (since Linux 5.1).
+.\" commit 2353ecc6f91fd15b893fa01bf85a1c7a823ee4f2
+.PD
+.SH EXAMPLES
+.\" [[FIXME]] SRC BEGIN (bpf.c)
+.EX
+/* bpf+sockets example:
+ * 1. create array map of 256 elements
+ * 2. load program that counts number of packets received
+ * r0 = skb\->data[ETH_HLEN + offsetof(struct iphdr, protocol)]
+ * map[r0]++
+ * 3. attach prog_fd to raw socket via setsockopt()
+ * 4. print number of received TCP/UDP packets every second
+ */
+int
+main(int argc, char *argv[])
+{
+ int sock, map_fd, prog_fd, key;
+ long long value = 0, tcp_cnt, udp_cnt;
+\&
+ map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(key),
+ sizeof(value), 256);
+ if (map_fd < 0) {
+ printf("failed to create map \[aq]%s\[aq]\en", strerror(errno));
+ /* likely not run as root */
+ return 1;
+ }
+\&
+ struct bpf_insn prog[] = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), /* r6 = r1 */
+ BPF_LD_ABS(BPF_B, ETH_HLEN + offsetof(struct iphdr, protocol)),
+ /* r0 = ip\->proto */
+ BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, \-4),
+ /* *(u32 *)(fp \- 4) = r0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), /* r2 = fp */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, \-4), /* r2 = r2 \- 4 */
+ BPF_LD_MAP_FD(BPF_REG_1, map_fd), /* r1 = map_fd */
+ BPF_CALL_FUNC(BPF_FUNC_map_lookup_elem),
+ /* r0 = map_lookup(r1, r2) */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
+ /* if (r0 == 0) goto pc+2 */
+ BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */
+ BPF_XADD(BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0),
+ /* lock *(u64 *) r0 += r1 */
+.\" == atomic64_add
+ BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
+ BPF_EXIT_INSN(), /* return r0 */
+ };
+\&
+ prog_fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, prog,
+ sizeof(prog) / sizeof(prog[0]), "GPL");
+\&
+ sock = open_raw_sock("lo");
+\&
+ assert(setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd,
+ sizeof(prog_fd)) == 0);
+\&
+ for (;;) {
+ key = IPPROTO_TCP;
+ assert(bpf_lookup_elem(map_fd, &key, &tcp_cnt) == 0);
+ key = IPPROTO_UDP;
+ assert(bpf_lookup_elem(map_fd, &key, &udp_cnt) == 0);
+ printf("TCP %lld UDP %lld packets\en", tcp_cnt, udp_cnt);
+ sleep(1);
+ }
+\&
+ return 0;
+}
+.EE
+.\" SRC END
+.PP
+Some complete working code can be found in the
+.I samples/bpf
+directory in the kernel source tree.
+.SH SEE ALSO
+.BR seccomp (2),
+.BR bpf\-helpers (7),
+.BR socket (7),
+.BR tc (8),
+.BR tc\-bpf (8)
+.PP
+Both classic and extended BPF are explained in the kernel source file
+.IR Documentation/networking/filter.txt .
diff --git a/man2/break.2 b/man2/break.2
new file mode 100644
index 0000000..5d25ea6
--- /dev/null
+++ b/man2/break.2
@@ -0,0 +1 @@
+.so man2/unimplemented.2
diff --git a/man2/brk.2 b/man2/brk.2
new file mode 100644
index 0000000..2cc61a9
--- /dev/null
+++ b/man2/brk.2
@@ -0,0 +1,153 @@
+.\" Copyright (c) 1993 Michael Haardt, (michael@moria.de)
+.\" and Copyright 2006, 2008, Michael Kerrisk <tmk.manpages@gmail.com>
+.\" Fri Apr 2 11:32:09 MET DST 1993
+.\"
+.\" SPDX-License-Identifier: GPL-2.0-or-later
+.\"
+.\" Modified Wed Jul 21 19:52:58 1993 by Rik Faith <faith@cs.unc.edu>
+.\" Modified Sun Aug 21 17:40:38 1994 by Rik Faith <faith@cs.unc.edu>
+.\"
+.TH brk 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+brk, sbrk \- change data segment size
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <unistd.h>
+.PP
+.BI "int brk(void *" addr );
+.BI "void *sbrk(intptr_t " increment );
+.fi
+.PP
+.RS -4
+Feature Test Macro Requirements for glibc (see
+.BR feature_test_macros (7)):
+.RE
+.PP
+.BR brk (),
+.BR sbrk ():
+.nf
+ Since glibc 2.19:
+ _DEFAULT_SOURCE
+ || ((_XOPEN_SOURCE >= 500) &&
+ ! (_POSIX_C_SOURCE >= 200112L))
+.\" (_XOPEN_SOURCE >= 500 ||
+.\" _XOPEN_SOURCE && _XOPEN_SOURCE_EXTENDED) &&
+ From glibc 2.12 to glibc 2.19:
+ _BSD_SOURCE || _SVID_SOURCE
+ || ((_XOPEN_SOURCE >= 500) &&
+ ! (_POSIX_C_SOURCE >= 200112L))
+.\" (_XOPEN_SOURCE >= 500 ||
+.\" _XOPEN_SOURCE && _XOPEN_SOURCE_EXTENDED) &&
+ Before glibc 2.12:
+ _BSD_SOURCE || _SVID_SOURCE || _XOPEN_SOURCE >= 500
+.\" || _XOPEN_SOURCE && _XOPEN_SOURCE_EXTENDED
+.fi
+.SH DESCRIPTION
+.BR brk ()
+and
+.BR sbrk ()
+change the location of the
+.IR "program break" ,
+which defines the end of the process's data segment
+(i.e., the program break is the first location after the end of the
+uninitialized data segment).
+Increasing the program break has the effect of
+allocating memory to the process;
+decreasing the break deallocates memory.
+.PP
+.BR brk ()
+sets the end of the data segment to the value specified by
+.IR addr ,
+when that value is reasonable, the system has enough memory,
+and the process does not exceed its maximum data size (see
+.BR setrlimit (2)).
+.PP
+.BR sbrk ()
+increments the program's data space by
+.I increment
+bytes.
+Calling
+.BR sbrk ()
+with an
+.I increment
+of 0 can be used to find the current location of the program break.
+.SH RETURN VALUE
+On success,
+.BR brk ()
+returns zero.
+On error, \-1 is returned, and
+.I errno
+is set to
+.BR ENOMEM .
+.PP
+On success,
+.BR sbrk ()
+returns the previous program break.
+(If the break was increased,
+then this value is a pointer to the start of the newly allocated memory.)
+On error,
+.I "(void\ *)\ \-1"
+is returned, and
+.I errno
+is set to
+.BR ENOMEM .
+.SH STANDARDS
+None.
+.SH HISTORY
+4.3BSD; SUSv1, marked LEGACY in SUSv2, removed in POSIX.1-2001.
+.\"
+.\" .BR brk ()
+.\" and
+.\" .BR sbrk ()
+.\" are not defined in the C Standard and are deliberately excluded from the
+.\" POSIX.1-1990 standard (see paragraphs B.1.1.1.3 and B.8.3.3).
+.SH NOTES
+Avoid using
+.BR brk ()
+and
+.BR sbrk ():
+the
+.BR malloc (3)
+memory allocation package is the
+portable and comfortable way of allocating memory.
+.PP
+Various systems use various types for the argument of
+.BR sbrk ().
+Common are \fIint\fP, \fIssize_t\fP, \fIptrdiff_t\fP, \fIintptr_t\fP.
+.\" One sees
+.\" \fIint\fP (e.g., XPGv4, DU 4.0, HP-UX 11, FreeBSD 4.0, OpenBSD 3.2),
+.\" \fIssize_t\fP (OSF1 2.0, Irix 5.3, 6.5),
+.\" \fIptrdiff_t\fP (libc4, libc5, ulibc, glibc 2.0, 2.1),
+.\" \fIintptr_t\fP (e.g., XPGv5, AIX, SunOS 5.8, 5.9, FreeBSD 4.7, NetBSD 1.6,
+.\" Tru64 5.1, glibc2.2).
+.SS C library/kernel differences
+The return value described above for
+.BR brk ()
+is the behavior provided by the glibc wrapper function for the Linux
+.BR brk ()
+system call.
+(On most other implementations, the return value from
+.BR brk ()
+is the same; this return value was also specified in SUSv2.)
+However,
+the actual Linux system call returns the new program break on success.
+On failure, the system call returns the current break.
+The glibc wrapper function does some work
+(i.e., checks whether the new break is less than
+.IR addr )
+to provide the 0 and \-1 return values described above.
+.PP
+On Linux,
+.BR sbrk ()
+is implemented as a library function that uses the
+.BR brk ()
+system call, and does some internal bookkeeping so that it can
+return the old break value.
+.SH SEE ALSO
+.BR execve (2),
+.BR getrlimit (2),
+.BR end (3),
+.BR malloc (3)
diff --git a/man2/cacheflush.2 b/man2/cacheflush.2
new file mode 100644
index 0000000..733462e
--- /dev/null
+++ b/man2/cacheflush.2
@@ -0,0 +1,143 @@
+.\" Written by Ralf Baechle (ralf@waldorf-gmbh.de),
+.\" Copyright (c) 1994, 1995 Waldorf GMBH
+.\"
+.\" SPDX-License-Identifier: GPL-2.0-or-later
+.\"
+.TH cacheflush 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+cacheflush \- flush contents of instruction and/or data cache
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/cachectl.h>
+.PP
+.BI "int cacheflush(void " addr [. nbytes "], int "nbytes ", int "cache );
+.fi
+.PP
+.IR Note :
+On some architectures,
+there is no glibc wrapper for this system call; see NOTES.
+.SH DESCRIPTION
+.BR cacheflush ()
+flushes the contents of the indicated cache(s) for the
+user addresses in the range
+.I addr
+to
+.IR (addr+nbytes\-1) .
+.I cache
+may be one of:
+.TP
+.B ICACHE
+Flush the instruction cache.
+.TP
+.B DCACHE
+Write back to memory and invalidate the affected valid cache lines.
+.TP
+.B BCACHE
+Same as
+.BR (ICACHE|DCACHE) .
+.SH RETURN VALUE
+.BR cacheflush ()
+returns 0 on success.
+On error, it returns \-1 and sets
+.I errno
+to indicate the error.
+.SH ERRORS
+.TP
+.B EFAULT
+Some or all of the address range
+.I addr
+to
+.I (addr+nbytes\-1)
+is not accessible.
+.TP
+.B EINVAL
+.I cache
+is not one of
+.BR ICACHE ,
+.BR DCACHE ,
+or
+.B BCACHE
+(but see BUGS).
+.SH VERSIONS
+.BR cacheflush ()
+should not be used in programs intended to be portable.
+On Linux, this call first appeared on the MIPS architecture;
+nowadays, Linux also provides a
+.BR cacheflush ()
+system call on some other architectures, but with different arguments.
+.SS Architecture-specific variants
+glibc provides a wrapper for this system call,
+with the prototype shown in SYNOPSIS,
+for the following architectures:
+ARC, CSKY, MIPS, and NIOS2.
+.PP
+On some other architectures,
+Linux provides this system call, with different arguments:
+.TP
+M68K:
+.nf
+.BI "int cacheflush(unsigned long " addr ", int " scope ", int " cache ,
+.BI " unsigned long " len );
+.fi
+.TP
+SH:
+.nf
+.BI "int cacheflush(unsigned long " addr ", unsigned long " len ", int " op );
+.fi
+.TP
+NDS32:
+.nf
+.BI "int cacheflush(unsigned int " start ", unsigned int " end ", int " cache );
+.fi
+.PP
+On the above architectures,
+glibc does not provide a wrapper for this system call; call it using
+.BR syscall (2).
+.SS GCC alternative
+Unless you need the finer grained control that this system call provides,
+you probably want to use the GCC built-in function
+.BR __builtin___clear_cache (),
+which provides a portable interface
+across platforms supported by GCC and compatible compilers:
+.PP
+.in +4n
+.EX
+.BI "void __builtin___clear_cache(void *" begin ", void *" end );
+.EE
+.in
+.PP
+On platforms that don't require instruction cache flushes,
+.BR __builtin___clear_cache ()
+has no effect.
+.PP
+.IR Note :
+On some GCC-compatible compilers,
+the prototype for this built-in function uses
+.I char *
+instead of
+.I void *
+for the parameters.
+.SH STANDARDS
+Historically, this system call was available on all MIPS UNIX variants
+including RISC/os, IRIX, Ultrix, NetBSD, OpenBSD, and FreeBSD
+(and also on some non-UNIX MIPS operating systems), so that
+the existence of this call in MIPS operating systems is a de-facto
+standard.
+.SH BUGS
+Linux kernels older than Linux 2.6.11 ignore the
+.I addr
+and
+.I nbytes
+arguments and always flush the whole cache,
+making this function fairly expensive.
+.PP
+This function always behaves as if
+.B BCACHE
+has been passed for the
+.I cache
+argument and does not do any error checking on the
+.I cache
+argument.
diff --git a/man2/capget.2 b/man2/capget.2
new file mode 100644
index 0000000..9c4ba7d
--- /dev/null
+++ b/man2/capget.2
@@ -0,0 +1,260 @@
+.\" Copyright: written by Andrew Morgan <morgan@kernel.org>
+.\" and Copyright 2006, 2008, Michael Kerrisk <tmk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: GPL-1.0-or-later
+.\"
+.\" Modified by David A. Wheeler <dwheeler@ida.org>
+.\" Modified 2004-05-27, mtk
+.\" Modified 2004-06-21, aeb
+.\" Modified 2008-04-28, morgan of kernel.org
+.\" Update in line with addition of file capabilities and
+.\" 64-bit capability sets in Linux 2.6.2[45].
+.\" Modified 2009-01-26, andi kleen
+.\"
+.TH capget 2 2023-05-03 "Linux man-pages 6.05.01"
+.SH NAME
+capget, capset \- set/get capabilities of thread(s)
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#include <linux/capability.h>" " /* Definition of " CAP_* " and"
+.BR " _LINUX_CAPABILITY_*" " constants */"
+.BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */"
+.B #include <unistd.h>
+.PP
+.BI "int syscall(SYS_capget, cap_user_header_t " hdrp ,
+.BI " cap_user_data_t " datap );
+.BI "int syscall(SYS_capset, cap_user_header_t " hdrp ,
+.BI " const cap_user_data_t " datap );
+.fi
+.PP
+.IR Note :
+glibc provides no wrappers for these system calls,
+necessitating the use of
+.BR syscall (2).
+.SH DESCRIPTION
+These two system calls are the raw kernel interface for getting and
+setting thread capabilities.
+Not only are these system calls specific to Linux,
+but the kernel API is likely to change and use of
+these system calls (in particular the format of the
+.I cap_user_*_t
+types) is subject to extension with each kernel revision,
+but old programs will keep working.
+.PP
+The portable interfaces are
+.BR cap_set_proc (3)
+and
+.BR cap_get_proc (3);
+if possible, you should use those interfaces in applications; see NOTES.
+.\"
+.SS Current details
+Now that you have been warned, some current kernel details.
+The structures are defined as follows.
+.PP
+.in +4n
+.EX
+#define _LINUX_CAPABILITY_VERSION_1 0x19980330
+#define _LINUX_CAPABILITY_U32S_1 1
+\&
+ /* V2 added in Linux 2.6.25; deprecated */
+#define _LINUX_CAPABILITY_VERSION_2 0x20071026
+.\" commit e338d263a76af78fe8f38a72131188b58fceb591
+.\" Added 64 bit capability support
+#define _LINUX_CAPABILITY_U32S_2 2
+\&
+ /* V3 added in Linux 2.6.26 */
+#define _LINUX_CAPABILITY_VERSION_3 0x20080522
+.\" commit ca05a99a54db1db5bca72eccb5866d2a86f8517f
+#define _LINUX_CAPABILITY_U32S_3 2
+\&
+typedef struct __user_cap_header_struct {
+ __u32 version;
+ int pid;
+} *cap_user_header_t;
+\&
+typedef struct __user_cap_data_struct {
+ __u32 effective;
+ __u32 permitted;
+ __u32 inheritable;
+} *cap_user_data_t;
+.EE
+.in
+.PP
+The
+.IR effective ,
+.IR permitted ,
+and
+.I inheritable
+fields are bit masks of the capabilities defined in
+.BR capabilities (7).
+Note that the
+.B CAP_*
+values are bit indexes and need to be bit-shifted before ORing into
+the bit fields.
+To define the structures for passing to the system call, you have to use the
+.I struct __user_cap_header_struct
+and
+.I struct __user_cap_data_struct
+names because the typedefs are only pointers.
+.PP
+Kernels prior to Linux 2.6.25 prefer
+32-bit capabilities with version
+.BR _LINUX_CAPABILITY_VERSION_1 .
+Linux 2.6.25 added 64-bit capability sets, with version
+.BR _LINUX_CAPABILITY_VERSION_2 .
+There was, however, an API glitch, and Linux 2.6.26 added
+.B _LINUX_CAPABILITY_VERSION_3
+to fix the problem.
+.PP
+Note that 64-bit capabilities use
+.I datap[0]
+and
+.IR datap[1] ,
+whereas 32-bit capabilities use only
+.IR datap[0] .
+.PP
+On kernels that support file capabilities (VFS capabilities support),
+these system calls behave slightly differently.
+This support was added as an option in Linux 2.6.24,
+and became fixed (nonoptional) in Linux 2.6.33.
+.PP
+For
+.BR capget ()
+calls, one can probe the capabilities of any process by specifying its
+process ID with the
+.I hdrp\->pid
+field value.
+.PP
+For details on the data, see
+.BR capabilities (7).
+.\"
+.SS With VFS capabilities support
+VFS capabilities employ a file extended attribute (see
+.BR xattr (7))
+to allow capabilities to be attached to executables.
+This privilege model obsoletes kernel support for one process
+asynchronously setting the capabilities of another.
+That is, on kernels that have VFS capabilities support, when calling
+.BR capset (),
+the only permitted values for
+.I hdrp\->pid
+are 0 or, equivalently, the value returned by
+.BR gettid (2).
+.\"
+.SS Without VFS capabilities support
+On older kernels that do not provide VFS capabilities support,
+.BR capset ()
+can, if the caller has the
+.B CAP_SETPCAP
+capability, be used to change not only the caller's own capabilities,
+but also the capabilities of other threads.
+The call operates on the capabilities of the thread specified by the
+.I pid
+field of
+.I hdrp
+when that is nonzero, or on the capabilities of the calling thread if
+.I pid
+is 0.
+If
+.I pid
+refers to a single-threaded process, then
+.I pid
+can be specified as a traditional process ID;
+operating on a thread of a multithreaded process requires a thread ID
+of the type returned by
+.BR gettid (2).
+For
+.BR capset (),
+.I pid
+can also be: \-1, meaning perform the change on all threads except the
+caller and
+.BR init (1);
+or a value less than \-1, in which case the change is applied
+to all members of the process group whose ID is \-\fIpid\fP.
+.SH RETURN VALUE
+On success, zero is returned.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.PP
+The calls fail with the error
+.BR EINVAL ,
+and set the
+.I version
+field of
+.I hdrp
+to the kernel preferred value of
+.B _LINUX_CAPABILITY_VERSION_?
+when an unsupported
+.I version
+value is specified.
+In this way, one can probe what the current
+preferred capability revision is.
+.SH ERRORS
+.TP
+.B EFAULT
+Bad memory address.
+.I hdrp
+must not be NULL.
+.I datap
+may be NULL only when the user is trying to determine the preferred
+capability version format supported by the kernel.
+.TP
+.B EINVAL
+One of the arguments was invalid.
+.TP
+.B EPERM
+An attempt was made to add a capability to the permitted set, or to set
+a capability in the effective set that is not in the
+permitted set.
+.TP
+.B EPERM
+An attempt was made to add a capability to the inheritable set, and either:
+.RS
+.IP \[bu] 3
+that capability was not in the caller's bounding set; or
+.IP \[bu]
+the capability was not in the caller's permitted set
+and the caller lacked the
+.B CAP_SETPCAP
+capability in its effective set.
+.RE
+.TP
+.B EPERM
+The caller attempted to use
+.BR capset ()
+to modify the capabilities of a thread other than itself,
+but lacked sufficient privilege.
+For kernels supporting VFS
+capabilities, this is never permitted.
+For kernels lacking VFS
+support, the
+.B CAP_SETPCAP
+capability is required.
+(A bug in kernels before Linux 2.6.11 meant that this error could also
+occur if a thread without this capability tried to change its
+own capabilities by specifying the
+.I pid
+field as a nonzero value (i.e., the value returned by
+.BR getpid (2))
+instead of 0.)
+.TP
+.B ESRCH
+No such thread.
+.SH STANDARDS
+Linux.
+.SH NOTES
+The portable interface to the capability querying and setting
+functions is provided by the
+.I libcap
+library and is available here:
+.br
+.UR http://git.kernel.org/cgit\:/linux\:/kernel\:/git\:/morgan\:\:/libcap.git
+.UE
+.SH SEE ALSO
+.BR clone (2),
+.BR gettid (2),
+.BR capabilities (7)
diff --git a/man2/capset.2 b/man2/capset.2
new file mode 100644
index 0000000..9e829cb
--- /dev/null
+++ b/man2/capset.2
@@ -0,0 +1 @@
+.so man2/capget.2
diff --git a/man2/chdir.2 b/man2/chdir.2
new file mode 100644
index 0000000..5aca7bf
--- /dev/null
+++ b/man2/chdir.2
@@ -0,0 +1,127 @@
+.\" Copyright (c) 1992 Drew Eckhardt (drew@cs.colorado.edu), March 28, 1992
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified by Michael Haardt <michael@moria.de>
+.\" Modified 1993-07-21 by Rik Faith <faith@cs.unc.edu>
+.\" Modified 1995-04-15 by Michael Chastain <mec@shell.portal.com>:
+.\" Added 'fchdir'. Fixed bugs in error section.
+.\" Modified 1996-10-21 by Eric S. Raymond <esr@thyrsus.com>
+.\" Modified 1997-08-21 by Joseph S. Myers <jsm28@cam.ac.uk>
+.\" Modified 2004-06-23 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.TH chdir 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+chdir, fchdir \- change working directory
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <unistd.h>
+.PP
+.BI "int chdir(const char *" path );
+.BI "int fchdir(int " fd );
+.fi
+.PP
+.RS -4
+Feature Test Macro Requirements for glibc (see
+.BR feature_test_macros (7)):
+.RE
+.PP
+.BR fchdir ():
+.nf
+ _XOPEN_SOURCE >= 500
+.\" || _XOPEN_SOURCE && _XOPEN_SOURCE_EXTENDED
+ || /* Since glibc 2.12: */ _POSIX_C_SOURCE >= 200809L
+ || /* glibc up to and including 2.19: */ _BSD_SOURCE
+.fi
+.SH DESCRIPTION
+.BR chdir ()
+changes the current working directory of the calling process to the
+directory specified in
+.IR path .
+.PP
+.BR fchdir ()
+is identical to
+.BR chdir ();
+the only difference is that the directory is given as an
+open file descriptor.
+.SH RETURN VALUE
+On success, zero is returned.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+Depending on the filesystem, other errors can be returned.
+The more
+general errors for
+.BR chdir ()
+are listed below:
+.TP
+.B EACCES
+Search permission is denied for one of the components of
+.IR path .
+(See also
+.BR path_resolution (7).)
+.TP
+.B EFAULT
+.I path
+points outside your accessible address space.
+.TP
+.B EIO
+An I/O error occurred.
+.TP
+.B ELOOP
+Too many symbolic links were encountered in resolving
+.IR path .
+.TP
+.B ENAMETOOLONG
+.I path
+is too long.
+.TP
+.B ENOENT
+The directory specified in
+.I path
+does not exist.
+.TP
+.B ENOMEM
+Insufficient kernel memory was available.
+.TP
+.B ENOTDIR
+A component of
+.I path
+is not a directory.
+.PP
+The general errors for
+.BR fchdir ()
+are listed below:
+.TP
+.B EACCES
+Search permission was denied on the directory open on
+.IR fd .
+.TP
+.B EBADF
+.I fd
+is not a valid file descriptor.
+.TP
+.B ENOTDIR
+.I fd
+does not refer to a directory.
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+POSIX.1-2001, SVr4, 4.4BSD.
+.SH NOTES
+The current working directory is the starting point for interpreting
+relative pathnames (those not starting with \[aq]/\[aq]).
+.PP
+A child process created via
+.BR fork (2)
+inherits its parent's current working directory.
+The current working directory is left unchanged by
+.BR execve (2).
+.SH SEE ALSO
+.BR chroot (2),
+.BR getcwd (3),
+.BR path_resolution (7)
diff --git a/man2/chmod.2 b/man2/chmod.2
new file mode 100644
index 0000000..b1c130e
--- /dev/null
+++ b/man2/chmod.2
@@ -0,0 +1,347 @@
+.\" Copyright (c) 1992 Drew Eckhardt (drew@cs.colorado.edu), March 28, 1992
+.\" and Copyright (C) 2006, 2014 Michael Kerrisk
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified by Michael Haardt <michael@moria.de>
+.\" Modified 1993-07-21 by Rik Faith <faith@cs.unc.edu>
+.\" Modified 1997-01-12 by Michael Haardt
+.\" <michael@cantor.informatik.rwth-aachen.de>: NFS details
+.\" Modified 2004-06-23 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.TH chmod 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+chmod, fchmod, fchmodat \- change permissions of a file
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/stat.h>
+.PP
+.BI "int chmod(const char *" pathname ", mode_t " mode );
+.BI "int fchmod(int " fd ", mode_t " mode );
+.PP
+.BR "#include <fcntl.h>" " /* Definition of AT_* constants */"
+.B #include <sys/stat.h>
+.PP
+.BI "int fchmodat(int " dirfd ", const char *" pathname ", mode_t " \
+mode ", int " flags );
+.fi
+.PP
+.RS -4
+Feature Test Macro Requirements for glibc (see
+.BR feature_test_macros (7)):
+.RE
+.PP
+.nf
+.BR fchmod ():
+ Since glibc 2.24:
+ _POSIX_C_SOURCE >= 199309L
+.\" || (_XOPEN_SOURCE && _XOPEN_SOURCE_EXTENDED)
+ glibc 2.19 to glibc 2.23:
+ _POSIX_C_SOURCE
+ glibc 2.16 to glibc 2.19:
+ _BSD_SOURCE || _POSIX_C_SOURCE
+ glibc 2.12 to glibc 2.16:
+ _BSD_SOURCE || _XOPEN_SOURCE >= 500
+ || _POSIX_C_SOURCE >= 200809L
+ glibc 2.11 and earlier:
+ _BSD_SOURCE || _XOPEN_SOURCE >= 500
+.\" || (_XOPEN_SOURCE && _XOPEN_SOURCE_EXTENDED)
+.fi
+.PP
+.BR fchmodat ():
+.nf
+ Since glibc 2.10:
+ _POSIX_C_SOURCE >= 200809L
+ Before glibc 2.10:
+ _ATFILE_SOURCE
+.fi
+.SH DESCRIPTION
+The
+.BR chmod ()
+and
+.BR fchmod ()
+system calls change a file's mode bits.
+(The file mode consists of the file permission bits plus the set-user-ID,
+set-group-ID, and sticky bits.)
+These system calls differ only in how the file is specified:
+.IP \[bu] 3
+.BR chmod ()
+changes the mode of the file whose pathname is given in
+.IR pathname ,
+which is dereferenced if it is a symbolic link.
+.IP \[bu]
+.BR fchmod ()
+changes the mode of the file referred to by the open file descriptor
+.IR fd .
+.PP
+The new file mode is specified in
+.IR mode ,
+which is a bit mask created by ORing together zero or
+more of the following:
+.TP 18
+.BR S_ISUID " (04000)"
+set-user-ID (set process effective user ID on
+.BR execve (2))
+.TP
+.BR S_ISGID " (02000)"
+set-group-ID (set process effective group ID on
+.BR execve (2);
+mandatory locking, as described in
+.BR fcntl (2);
+take a new file's group from parent directory, as described in
+.BR chown (2)
+and
+.BR mkdir (2))
+.TP
+.BR S_ISVTX " (01000)"
+sticky bit (restricted deletion flag, as described in
+.BR unlink (2))
+.TP
+.BR S_IRUSR " (00400)"
+read by owner
+.TP
+.BR S_IWUSR " (00200)"
+write by owner
+.TP
+.BR S_IXUSR " (00100)"
+execute/search by owner ("search" applies for directories,
+and means that entries within the directory can be accessed)
+.TP
+.BR S_IRGRP " (00040)"
+read by group
+.TP
+.BR S_IWGRP " (00020)"
+write by group
+.TP
+.BR S_IXGRP " (00010)"
+execute/search by group
+.TP
+.BR S_IROTH " (00004)"
+read by others
+.TP
+.BR S_IWOTH " (00002)"
+write by others
+.TP
+.BR S_IXOTH " (00001)"
+execute/search by others
+.PP
+The effective UID of the calling process must match the owner of the file,
+or the process must be privileged (Linux: it must have the
+.B CAP_FOWNER
+capability).
+.PP
+If the calling process is not privileged (Linux: does not have the
+.B CAP_FSETID
+capability), and the group of the file does not match
+the effective group ID of the process or one of its
+supplementary group IDs, the
+.B S_ISGID
+bit will be turned off,
+but this will not cause an error to be returned.
+.PP
+As a security measure, depending on the filesystem,
+the set-user-ID and set-group-ID execution bits
+may be turned off if a file is written.
+(On Linux, this occurs if the writing process does not have the
+.B CAP_FSETID
+capability.)
+On some filesystems, only the superuser can set the sticky bit,
+which may have a special meaning.
+For the sticky bit, and for set-user-ID and set-group-ID bits on
+directories, see
+.BR inode (7).
+.PP
+On NFS filesystems, restricting the permissions will immediately influence
+already open files, because the access control is done on the server, but
+open files are maintained by the client.
+Widening the permissions may be
+delayed for other clients if attribute caching is enabled on them.
+.\"
+.\"
+.SS fchmodat()
+The
+.BR fchmodat ()
+system call operates in exactly the same way as
+.BR chmod (),
+except for the differences described here.
+.PP
+If the pathname given in
+.I pathname
+is relative, then it is interpreted relative to the directory
+referred to by the file descriptor
+.I dirfd
+(rather than relative to the current working directory of
+the calling process, as is done by
+.BR chmod ()
+for a relative pathname).
+.PP
+If
+.I pathname
+is relative and
+.I dirfd
+is the special value
+.BR AT_FDCWD ,
+then
+.I pathname
+is interpreted relative to the current working
+directory of the calling process (like
+.BR chmod ()).
+.PP
+If
+.I pathname
+is absolute, then
+.I dirfd
+is ignored.
+.PP
+.I flags
+can either be 0, or include the following flag:
+.TP
+.B AT_SYMLINK_NOFOLLOW
+If
+.I pathname
+is a symbolic link, do not dereference it:
+instead operate on the link itself.
+This flag is not currently implemented.
+.PP
+See
+.BR openat (2)
+for an explanation of the need for
+.BR fchmodat ().
+.SH RETURN VALUE
+On success, zero is returned.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+Depending on the filesystem,
+errors other than those listed below can be returned.
+.PP
+The more general errors for
+.BR chmod ()
+are listed below:
+.TP
+.B EACCES
+Search permission is denied on a component of the path prefix.
+(See also
+.BR path_resolution (7).)
+.TP
+.B EBADF
+.RB ( fchmod ())
+The file descriptor
+.I fd
+is not valid.
+.TP
+.B EBADF
+.RB ( fchmodat ())
+.I pathname
+is relative but
+.I dirfd
+is neither
+.B AT_FDCWD
+nor a valid file descriptor.
+.TP
+.B EFAULT
+.I pathname
+points outside your accessible address space.
+.TP
+.B EINVAL
+.RB ( fchmodat ())
+Invalid flag specified in
+.IR flags .
+.TP
+.B EIO
+An I/O error occurred.
+.TP
+.B ELOOP
+Too many symbolic links were encountered in resolving
+.IR pathname .
+.TP
+.B ENAMETOOLONG
+.I pathname
+is too long.
+.TP
+.B ENOENT
+The file does not exist.
+.TP
+.B ENOMEM
+Insufficient kernel memory was available.
+.TP
+.B ENOTDIR
+A component of the path prefix is not a directory.
+.TP
+.B ENOTDIR
+.RB ( fchmodat ())
+.I pathname
+is relative and
+.I dirfd
+is a file descriptor referring to a file other than a directory.
+.TP
+.B ENOTSUP
+.RB ( fchmodat ())
+.I flags
+specified
+.BR AT_SYMLINK_NOFOLLOW ,
+which is not supported.
+.TP
+.B EPERM
+The effective UID does not match the owner of the file,
+and the process is not privileged (Linux: it does not have the
+.B CAP_FOWNER
+capability).
+.TP
+.B EPERM
+The file is marked immutable or append-only.
+(See
+.BR ioctl_iflags (2).)
+.TP
+.B EROFS
+The named file resides on a read-only filesystem.
+.SH VERSIONS
+.SS C library/kernel differences
+The GNU C library
+.BR fchmodat ()
+wrapper function implements the POSIX-specified
+interface described in this page.
+This interface differs from the underlying Linux system call, which does
+.I not
+have a
+.I flags
+argument.
+.SS glibc notes
+On older kernels where
+.BR fchmodat ()
+is unavailable, the glibc wrapper function falls back to the use of
+.BR chmod ().
+When
+.I pathname
+is a relative pathname,
+glibc constructs a pathname based on the symbolic link in
+.I /proc/self/fd
+that corresponds to the
+.I dirfd
+argument.
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+.TP
+.BR chmod ()
+.TQ
+.BR fchmod ()
+4.4BSD, SVr4, POSIX.1-2001.
+.TP
+.BR fchmodat ()
+POSIX.1-2008.
+Linux 2.6.16,
+glibc 2.4.
+.SH SEE ALSO
+.BR chmod (1),
+.BR chown (2),
+.BR execve (2),
+.BR open (2),
+.BR stat (2),
+.BR inode (7),
+.BR path_resolution (7),
+.BR symlink (7)
diff --git a/man2/chown.2 b/man2/chown.2
new file mode 100644
index 0000000..ff7c6dd
--- /dev/null
+++ b/man2/chown.2
@@ -0,0 +1,471 @@
+.\" Copyright (c) 1992 Drew Eckhardt (drew@cs.colorado.edu), March 28, 1992
+.\" and Copyright (c) 1998 Andries Brouwer (aeb@cwi.nl)
+.\" and Copyright (c) 2006, 2007, 2008, 2014 Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified by Michael Haardt <michael@moria.de>
+.\" Modified 1993-07-21 by Rik Faith <faith@cs.unc.edu>
+.\" Modified 1996-07-09 by Andries Brouwer <aeb@cwi.nl>
+.\" Modified 1996-11-06 by Eric S. Raymond <esr@thyrsus.com>
+.\" Modified 1997-05-18 by Michael Haardt <michael@cantor.informatik.rwth-aachen.de>
+.\" Modified 2004-06-23 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\" 2007-07-08, mtk, added an example program; updated SYNOPSIS
+.\" 2008-05-08, mtk, Describe rules governing ownership of new files
+.\" (bsdgroups versus sysvgroups, and the effect of the parent
+.\" directory's set-group-ID mode bit).
+.\"
+.TH chown 2 2023-05-03 "Linux man-pages 6.05.01"
+.SH NAME
+chown, fchown, lchown, fchownat \- change ownership of a file
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <unistd.h>
+.PP
+.BI "int chown(const char *" pathname ", uid_t " owner ", gid_t " group );
+.BI "int fchown(int " fd ", uid_t " owner ", gid_t " group );
+.BI "int lchown(const char *" pathname ", uid_t " owner ", gid_t " group );
+.PP
+.BR "#include <fcntl.h> " "/* Definition of AT_* constants */"
+.B #include <unistd.h>
+.PP
+.BI "int fchownat(int " dirfd ", const char *" pathname ,
+.BI " uid_t " owner ", gid_t " group ", int " flags );
+.fi
+.PP
+.RS -4
+Feature Test Macro Requirements for glibc (see
+.BR feature_test_macros (7)):
+.RE
+.PP
+.BR fchown (),
+.BR lchown ():
+.nf
+ /* Since glibc 2.12: */ _POSIX_C_SOURCE >= 200809L
+ || _XOPEN_SOURCE >= 500
+.\" || _XOPEN_SOURCE && _XOPEN_SOURCE_EXTENDED
+ || /* glibc <= 2.19: */ _BSD_SOURCE
+.fi
+.PP
+.BR fchownat ():
+.nf
+ Since glibc 2.10:
+ _POSIX_C_SOURCE >= 200809L
+ Before glibc 2.10:
+ _ATFILE_SOURCE
+.fi
+.SH DESCRIPTION
+These system calls change the owner and group of a file.
+The
+.BR chown (),
+.BR fchown (),
+and
+.BR lchown ()
+system calls differ only in how the file is specified:
+.IP \[bu] 3
+.BR chown ()
+changes the ownership of the file specified by
+.IR pathname ,
+which is dereferenced if it is a symbolic link.
+.IP \[bu]
+.BR fchown ()
+changes the ownership of the file referred to by the open file descriptor
+.IR fd .
+.IP \[bu]
+.BR lchown ()
+is like
+.BR chown (),
+but does not dereference symbolic links.
+.PP
+Only a privileged process (Linux: one with the
+.B CAP_CHOWN
+capability) may change the owner of a file.
+The owner of a file may change the group of the file
+to any group of which that owner is a member.
+A privileged process (Linux: with
+.BR CAP_CHOWN )
+may change the group arbitrarily.
+.PP
+If the
+.I owner
+or
+.I group
+is specified as \-1, then that ID is not changed.
+.PP
+When the owner or group of an executable file is
+changed by an unprivileged user, the
+.B S_ISUID
+and
+.B S_ISGID
+mode bits are cleared.
+POSIX does not specify whether
+this also should happen when root does the
+.BR chown ();
+the Linux behavior depends on the kernel version,
+and since Linux 2.2.13, root is treated like other users.
+.\" In Linux 2.0 kernels, superuser was like everyone else
+.\" In Linux 2.2, up to Linux 2.2.12, these bits were not cleared for superuser.
+.\" Since Linux 2.2.13, superuser is once more like everyone else.
+In case of a non-group-executable file (i.e., one for which the
+.B S_IXGRP
+bit is not set) the
+.B S_ISGID
+bit indicates mandatory locking, and is not cleared by a
+.BR chown ().
+.PP
+When the owner or group of an executable file is changed (by any user),
+all capability sets for the file are cleared.
+.\"
+.SS fchownat()
+The
+.BR fchownat ()
+system call operates in exactly the same way as
+.BR chown (),
+except for the differences described here.
+.PP
+If the pathname given in
+.I pathname
+is relative, then it is interpreted relative to the directory
+referred to by the file descriptor
+.I dirfd
+(rather than relative to the current working directory of
+the calling process, as is done by
+.BR chown ()
+for a relative pathname).
+.PP
+If
+.I pathname
+is relative and
+.I dirfd
+is the special value
+.BR AT_FDCWD ,
+then
+.I pathname
+is interpreted relative to the current working
+directory of the calling process (like
+.BR chown ()).
+.PP
+If
+.I pathname
+is absolute, then
+.I dirfd
+is ignored.
+.PP
+The
+.I flags
+argument is a bit mask created by ORing together
+0 or more of the following values:
+.TP
+.BR AT_EMPTY_PATH " (since Linux 2.6.39)"
+.\" commit 65cfc6722361570bfe255698d9cd4dccaf47570d
+If
+.I pathname
+is an empty string, operate on the file referred to by
+.I dirfd
+(which may have been obtained using the
+.BR open (2)
+.B O_PATH
+flag).
+In this case,
+.I dirfd
+can refer to any type of file, not just a directory.
+If
+.I dirfd
+is
+.BR AT_FDCWD ,
+the call operates on the current working directory.
+This flag is Linux-specific; define
+.B _GNU_SOURCE
+.\" Before glibc 2.16, defining _ATFILE_SOURCE sufficed
+to obtain its definition.
+.TP
+.B AT_SYMLINK_NOFOLLOW
+If
+.I pathname
+is a symbolic link, do not dereference it:
+instead operate on the link itself, like
+.BR lchown ().
+(By default,
+.BR fchownat ()
+dereferences symbolic links, like
+.BR chown ().)
+.PP
+See
+.BR openat (2)
+for an explanation of the need for
+.BR fchownat ().
+.SH RETURN VALUE
+On success, zero is returned.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+Depending on the filesystem,
+errors other than those listed below can be returned.
+.PP
+The more general errors for
+.BR chown ()
+are listed below.
+.TP
+.B EACCES
+Search permission is denied on a component of the path prefix.
+(See also
+.BR path_resolution (7).)
+.TP
+.B EBADF
+.RB ( fchown ())
+.I fd
+is not a valid open file descriptor.
+.TP
+.B EBADF
+.RB ( fchownat ())
+.I pathname
+is relative but
+.I dirfd
+is neither
+.B AT_FDCWD
+nor a valid file descriptor.
+.TP
+.B EFAULT
+.I pathname
+points outside your accessible address space.
+.TP
+.B EINVAL
+.RB ( fchownat ())
+Invalid flag specified in
+.IR flags .
+.TP
+.B EIO
+.RB ( fchown ())
+A low-level I/O error occurred while modifying the inode.
+.TP
+.B ELOOP
+Too many symbolic links were encountered in resolving
+.IR pathname .
+.TP
+.B ENAMETOOLONG
+.I pathname
+is too long.
+.TP
+.B ENOENT
+The file does not exist.
+.TP
+.B ENOMEM
+Insufficient kernel memory was available.
+.TP
+.B ENOTDIR
+A component of the path prefix is not a directory.
+.TP
+.B ENOTDIR
+.RB ( fchownat ())
+.I pathname
+is relative and
+.I dirfd
+is a file descriptor referring to a file other than a directory.
+.TP
+.B EPERM
+The calling process did not have the required permissions
+(see above) to change owner and/or group.
+.TP
+.B EPERM
+The file is marked immutable or append-only.
+(See
+.BR ioctl_iflags (2).)
+.TP
+.B EROFS
+The named file resides on a read-only filesystem.
+.SH VERSIONS
+The 4.4BSD version can be
+used only by the superuser (that is, ordinary users cannot give away files).
+.\" chown():
+.\" SVr4 documents EINVAL, EINTR, ENOLINK and EMULTIHOP returns, but no
+.\" ENOMEM. POSIX.1 does not document ENOMEM or ELOOP error conditions.
+.\" fchown():
+.\" SVr4 documents additional EINVAL, EIO, EINTR, and ENOLINK
+.\" error conditions.
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+.TP
+.BR chown ()
+.TQ
+.BR fchown ()
+.TQ
+.BR lchown ()
+4.4BSD, SVr4, POSIX.1-2001.
+.TP
+.BR fchownat ()
+POSIX.1-2008.
+Linux 2.6.16,
+glibc 2.4.
+.SH NOTES
+.SS Ownership of new files
+When a new file is created (by, for example,
+.BR open (2)
+or
+.BR mkdir (2)),
+its owner is made the same as the filesystem user ID of the
+creating process.
+The group of the file depends on a range of factors,
+including the type of filesystem,
+the options used to mount the filesystem,
+and whether or not the set-group-ID mode bit is enabled
+on the parent directory.
+If the filesystem supports the
+.B "\-o\ grpid"
+(or, synonymously
+.BR "\-o\ bsdgroups" )
+and
+.B "\-o\ nogrpid"
+(or, synonymously
+.BR "\-o\ sysvgroups" )
+.BR mount (8)
+options, then the rules are as follows:
+.IP \[bu] 3
+If the filesystem is mounted with
+.BR "\-o\ grpid" ,
+then the group of a new file is made
+the same as that of the parent directory.
+.IP \[bu]
+If the filesystem is mounted with
+.B \-o\ nogrpid
+and the set-group-ID bit is disabled on the parent directory,
+then the group of a new file is made the same as the
+process's filesystem GID.
+.IP \[bu]
+If the filesystem is mounted with
+.B \-o\ nogrpid
+and the set-group-ID bit is enabled on the parent directory,
+then the group of a new file is made
+the same as that of the parent directory.
+.PP
+As at Linux 4.12,
+the
+.B \-o\ grpid
+and
+.B \-o\ nogrpid
+mount options are supported by ext2, ext3, ext4, and XFS.
+Filesystems that don't support these mount options follow the
+.B \-o\ nogrpid
+rules.
+.SS glibc notes
+On older kernels where
+.BR fchownat ()
+is unavailable, the glibc wrapper function falls back to the use of
+.BR chown ()
+and
+.BR lchown ().
+When
+.I pathname
+is a relative pathname,
+glibc constructs a pathname based on the symbolic link in
+.I /proc/self/fd
+that corresponds to the
+.I dirfd
+argument.
+.SS NFS
+The
+.BR chown ()
+semantics are deliberately violated on NFS filesystems
+which have UID mapping enabled.
+Additionally, the semantics of all system
+calls which access the file contents are violated, because
+.BR chown ()
+may cause immediate access revocation on already open files.
+Client-side
+caching may lead to a delay between the time when ownership has
+been changed to allow access for a user and the time when the file can
+actually be accessed by the user on other clients.
+.SS Historical details
+The original Linux
+.BR chown (),
+.BR fchown (),
+and
+.BR lchown ()
+system calls supported only 16-bit user and group IDs.
+Subsequently, Linux 2.4 added
+.BR chown32 (),
+.BR fchown32 (),
+and
+.BR lchown32 (),
+supporting 32-bit IDs.
+The glibc
+.BR chown (),
+.BR fchown (),
+and
+.BR lchown ()
+wrapper functions transparently deal with the variations across kernel versions.
+.PP
+Before Linux 2.1.81 (except 2.1.46),
+.BR chown ()
+did not follow symbolic links.
+Since Linux 2.1.81,
+.BR chown ()
+does follow symbolic links, and there is a new system call
+.BR lchown ()
+that does not follow symbolic links.
+Since Linux 2.1.86, this new call (which has the same semantics
+as the old
+.BR chown ())
+has taken over the old syscall number, and
+.BR chown ()
+has been given the newly introduced number.
+.SH EXAMPLES
+The following program changes the ownership of the file named in
+its second command-line argument to the value specified in its
+first command-line argument.
+The new owner can be specified either as a numeric user ID,
+or as a username (which is converted to a user ID by using
+.BR getpwnam (3)
+to perform a lookup in the system password file).
+.SS Program source
+.\" SRC BEGIN (chown.c)
+.EX
+#include <pwd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+\&
+int
+main(int argc, char *argv[])
+{
+ char *endptr;
+ uid_t uid;
+ struct passwd *pwd;
+\&
+ if (argc != 3 || argv[1][0] == \[aq]\e0\[aq]) {
+ fprintf(stderr, "%s <owner> <file>\en", argv[0]);
+ exit(EXIT_FAILURE);
+ }
+\&
+ uid = strtol(argv[1], &endptr, 10); /* Allow a numeric string */
+\&
+ if (*endptr != \[aq]\e0\[aq]) { /* Was not pure numeric string */
+ pwd = getpwnam(argv[1]); /* Try getting UID for username */
+ if (pwd == NULL) {
+ perror("getpwnam");
+ exit(EXIT_FAILURE);
+ }
+\&
+ uid = pwd\->pw_uid;
+ }
+\&
+ if (chown(argv[2], uid, \-1) == \-1) {
+ perror("chown");
+ exit(EXIT_FAILURE);
+ }
+\&
+ exit(EXIT_SUCCESS);
+}
+.EE
+.\" SRC END
+.SH SEE ALSO
+.BR chgrp (1),
+.BR chown (1),
+.BR chmod (2),
+.BR flock (2),
+.BR path_resolution (7),
+.BR symlink (7)
diff --git a/man2/chown32.2 b/man2/chown32.2
new file mode 100644
index 0000000..f0a5635
--- /dev/null
+++ b/man2/chown32.2
@@ -0,0 +1 @@
+.so man2/chown.2
diff --git a/man2/chroot.2 b/man2/chroot.2
new file mode 100644
index 0000000..d872b8a
--- /dev/null
+++ b/man2/chroot.2
@@ -0,0 +1,166 @@
+.\" Copyright (c) 1992 Drew Eckhardt (drew@cs.colorado.edu), March 28, 1992
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified by Michael Haardt <michael@moria.de>
+.\" Modified 1993-07-21 by Rik Faith <faith@cs.unc.edu>
+.\" Modified 1994-08-21 by Michael Chastain <mec@shell.portal.com>
+.\" Modified 1996-06-13 by aeb
+.\" Modified 1996-11-06 by Eric S. Raymond <esr@thyrsus.com>
+.\" Modified 1997-08-21 by Joseph S. Myers <jsm28@cam.ac.uk>
+.\" Modified 2004-06-23 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.TH chroot 2 2023-04-03 "Linux man-pages 6.05.01"
+.SH NAME
+chroot \- change root directory
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <unistd.h>
+.PP
+.BI "int chroot(const char *" path );
+.fi
+.PP
+.RS -4
+Feature Test Macro Requirements for glibc (see
+.BR feature_test_macros (7)):
+.RE
+.PP
+.BR chroot ():
+.nf
+ Since glibc 2.2.2:
+ _XOPEN_SOURCE && ! (_POSIX_C_SOURCE >= 200112L)
+ || /* Since glibc 2.20: */ _DEFAULT_SOURCE
+ || /* glibc <= 2.19: */ _BSD_SOURCE
+ Before glibc 2.2.2:
+ none
+.fi
+.SH DESCRIPTION
+.BR chroot ()
+changes the root directory of the calling process to that specified in
+.IR path .
+This directory will be used for pathnames beginning with \fI/\fP.
+The root directory is inherited by all children of the calling process.
+.PP
+Only a privileged process (Linux: one with the
+.B CAP_SYS_CHROOT
+capability in its user namespace) may call
+.BR chroot ().
+.PP
+This call changes an ingredient in the pathname resolution process
+and does nothing else.
+In particular, it is not intended to be used
+for any kind of security purpose, neither to fully sandbox a process nor
+to restrict filesystem system calls.
+In the past,
+.BR chroot ()
+has been used by daemons to restrict themselves prior to passing paths
+supplied by untrusted users to system calls such as
+.BR open (2).
+However, if a folder is moved out of the chroot directory, an attacker
+can exploit that to get out of the chroot directory as well.
+The easiest way to do that is to
+.BR chdir (2)
+to the to-be-moved directory, wait for it to be moved out, then open a
+path like ../../../etc/passwd.
+.PP
+.\" This is how the "slightly trickier variation" works:
+.\" https://github.com/QubesOS/qubes-secpack/blob/master/QSBs/qsb-014-2015.txt#L142
+A slightly
+trickier variation also works under some circumstances if
+.BR chdir (2)
+is not permitted.
+If a daemon allows a "chroot directory" to be specified,
+that usually means that if you want to prevent remote users from accessing
+files outside the chroot directory, you must ensure that folders are never
+moved out of it.
+.PP
+This call does not change the current working directory,
+so that after the call \[aq]\fI.\fP\[aq] can
+be outside the tree rooted at \[aq]\fI/\fP\[aq].
+In particular, the superuser can escape from a "chroot jail"
+by doing:
+.PP
+.in +4n
+.EX
+mkdir foo; chroot foo; cd ..
+.EE
+.in
+.PP
+This call does not close open file descriptors, and such file
+descriptors may allow access to files outside the chroot tree.
+.SH RETURN VALUE
+On success, zero is returned.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+Depending on the filesystem, other errors can be returned.
+The more general errors are listed below:
+.TP
+.B EACCES
+Search permission is denied on a component of the path prefix.
+(See also
+.BR path_resolution (7).)
+.\" Also search permission is required on the final component,
+.\" maybe just to guarantee that it is a directory?
+.TP
+.B EFAULT
+.I path
+points outside your accessible address space.
+.TP
+.B EIO
+An I/O error occurred.
+.TP
+.B ELOOP
+Too many symbolic links were encountered in resolving
+.IR path .
+.TP
+.B ENAMETOOLONG
+.I path
+is too long.
+.TP
+.B ENOENT
+The file does not exist.
+.TP
+.B ENOMEM
+Insufficient kernel memory was available.
+.TP
+.B ENOTDIR
+A component of
+.I path
+is not a directory.
+.TP
+.B EPERM
+The caller has insufficient privilege.
+.SH STANDARDS
+None.
+.SH HISTORY
+SVr4, 4.4BSD, SUSv2 (marked LEGACY).
+This function is not part of POSIX.1-2001.
+.\" SVr4 documents additional EINTR, ENOLINK and EMULTIHOP error conditions.
+.\" X/OPEN does not document EIO, ENOMEM or EFAULT error conditions.
+.SH NOTES
+A child process created via
+.BR fork (2)
+inherits its parent's root directory.
+The root directory is left unchanged by
+.BR execve (2).
+.PP
+The magic symbolic link,
+.IR /proc/ pid /root ,
+can be used to discover a process's root directory; see
+.BR proc (5)
+for details.
+.PP
+FreeBSD has a stronger
+.BR jail ()
+system call.
+.SH SEE ALSO
+.BR chroot (1),
+.BR chdir (2),
+.BR pivot_root (2),
+.BR path_resolution (7),
+.BR switch_root (8)
diff --git a/man2/clock_adjtime.2 b/man2/clock_adjtime.2
new file mode 100644
index 0000000..b08b9c8
--- /dev/null
+++ b/man2/clock_adjtime.2
@@ -0,0 +1 @@
+.so man2/adjtimex.2
diff --git a/man2/clock_getres.2 b/man2/clock_getres.2
new file mode 100644
index 0000000..170215d
--- /dev/null
+++ b/man2/clock_getres.2
@@ -0,0 +1,524 @@
+'\" t
+.\" Copyright (c) 2003 Nick Clifford (zaf@nrc.co.nz), Jan 25, 2003
+.\" Copyright (c) 2003 Andries Brouwer (aeb@cwi.nl), Aug 24, 2003
+.\" Copyright (c) 2020 Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" 2003-08-23 Martin Schulze <joey@infodrom.org> improvements
+.\" 2003-08-24 aeb, large parts rewritten
+.\" 2004-08-06 Christoph Lameter <clameter@sgi.com>, SMP note
+.\"
+.TH clock_getres 2 2023-07-20 "Linux man-pages 6.05.01"
+.SH NAME
+clock_getres, clock_gettime, clock_settime \- clock and time functions
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc ),
+since glibc 2.17
+.PP
+Before glibc 2.17,
+Real-time library
+.RI ( librt ", " \-lrt )
+.SH SYNOPSIS
+.nf
+.B #include <time.h>
+.PP
+.BI "int clock_getres(clockid_t " clockid ", struct timespec *_Nullable " res );
+.PP
+.BI "int clock_gettime(clockid_t " clockid ", struct timespec *" tp );
+.BI "int clock_settime(clockid_t " clockid ", const struct timespec *" tp );
+.fi
+.PP
+.RS -4
+Feature Test Macro Requirements for glibc (see
+.BR feature_test_macros (7)):
+.RE
+.PP
+.BR clock_getres (),
+.BR clock_gettime (),
+.BR clock_settime ():
+.nf
+ _POSIX_C_SOURCE >= 199309L
+.fi
+.SH DESCRIPTION
+The function
+.BR clock_getres ()
+finds the resolution (precision) of the specified clock
+.IR clockid ,
+and, if
+.I res
+is non-NULL, stores it in the \fIstruct timespec\fP pointed to by
+.IR res .
+The resolution of clocks depends on the implementation and cannot be
+configured by a particular process.
+If the time value pointed to by the argument
+.I tp
+of
+.BR clock_settime ()
+is not a multiple of
+.IR res ,
+then it is truncated to a multiple of
+.IR res .
+.PP
+The functions
+.BR clock_gettime ()
+and
+.BR clock_settime ()
+retrieve and set the time of the specified clock
+.IR clockid .
+.PP
+The
+.I res
+and
+.I tp
+arguments are
+.BR timespec (3)
+structures.
+.PP
+The
+.I clockid
+argument is the identifier of the particular clock on which to act.
+A clock may be system-wide and hence visible for all processes, or
+per-process if it measures time only within a single process.
+.PP
+All implementations support the system-wide real-time clock,
+which is identified by
+.BR CLOCK_REALTIME .
+Its time represents seconds and nanoseconds since the Epoch.
+When its time is changed, timers for a relative interval are
+unaffected, but timers for an absolute point in time are affected.
+.PP
+More clocks may be implemented.
+The interpretation of the
+corresponding time values and the effect on timers is unspecified.
+.PP
+Sufficiently recent versions of glibc and the Linux kernel
+support the following clocks:
+.TP
+.B CLOCK_REALTIME
+A settable system-wide clock that measures real (i.e., wall-clock) time.
+Setting this clock requires appropriate privileges.
+This clock is affected by discontinuous jumps in the system time
+(e.g., if the system administrator manually changes the clock),
+and by the incremental adjustments performed by
+.BR adjtime (3)
+and NTP.
+.TP
+.BR CLOCK_REALTIME_ALARM " (since Linux 3.0; Linux-specific)"
+Like
+.BR CLOCK_REALTIME ,
+but not settable.
+See
+.BR timer_create (2)
+for further details.
+.TP
+.BR CLOCK_REALTIME_COARSE " (since Linux 2.6.32; Linux-specific)"
+.\" Added in commit da15cfdae03351c689736f8d142618592e3cebc3
+A faster but less precise version of
+.BR CLOCK_REALTIME .
+This clock is not settable.
+Use when you need very fast, but not fine-grained timestamps.
+Requires per-architecture support,
+and probably also architecture support for this flag in the
+.BR vdso (7).
+.TP
+.BR CLOCK_TAI " (since Linux 3.10; Linux-specific)"
+.\" commit 1ff3c9677bff7e468e0c487d0ffefe4e901d33f4
+A nonsettable system-wide clock derived from wall-clock time
+but ignoring leap seconds.
+This clock does
+not experience discontinuities and backwards jumps caused by NTP
+inserting leap seconds as
+.B CLOCK_REALTIME
+does.
+.IP
+The acronym TAI refers to International Atomic Time.
+.TP
+.B CLOCK_MONOTONIC
+A nonsettable system-wide clock that
+represents monotonic time since\[em]as described
+by POSIX\[em]"some unspecified point in the past".
+On Linux, the clock's value corresponds to the number of seconds that the
+system has been running since it was booted.
+.IP
+The
+.B CLOCK_MONOTONIC
+clock is not affected by discontinuous jumps in the system time
+(e.g., if the system administrator manually changes the clock),
+but is affected by the incremental adjustments performed by
+.BR adjtime (3)
+and NTP.
+This clock does not count time that the system is suspended.
+All
+.B CLOCK_MONOTONIC
+variants guarantee that the time returned by consecutive calls will not go
+backwards, but successive calls may\[em]depending on the architecture\[em]return
+identical (not-increased) time values.
+.TP
+.BR CLOCK_MONOTONIC_COARSE " (since Linux 2.6.32; Linux-specific)"
+.\" Added in commit da15cfdae03351c689736f8d142618592e3cebc3
+A faster but less precise version of
+.BR CLOCK_MONOTONIC .
+Use when you need very fast, but not fine-grained timestamps.
+Requires per-architecture support,
+and probably also architecture support for this flag in the
+.BR vdso (7).
+.TP
+.BR CLOCK_MONOTONIC_RAW " (since Linux 2.6.28; Linux-specific)"
+.\" Added in commit 2d42244ae71d6c7b0884b5664cf2eda30fb2ae68, John Stultz
+Similar to
+.BR CLOCK_MONOTONIC ,
+but provides access to a raw hardware-based time
+that is not subject to NTP adjustments or
+the incremental adjustments performed by
+.BR adjtime (3).
+This clock does not count time that the system is suspended.
+.TP
+.BR CLOCK_BOOTTIME " (since Linux 2.6.39; Linux-specific)"
+.\" commit 7fdd7f89006dd5a4c702fa0ce0c272345fa44ae0
+.\" commit 70a08cca1227dc31c784ec930099a4417a06e7d0
+A nonsettable system-wide clock that is identical to
+.BR CLOCK_MONOTONIC ,
+except that it also includes any time that the system is suspended.
+This allows applications to get a suspend-aware monotonic clock
+without having to deal with the complications of
+.BR CLOCK_REALTIME ,
+which may have discontinuities if the time is changed using
+.BR settimeofday (2)
+or similar.
+.TP
+.BR CLOCK_BOOTTIME_ALARM " (since Linux 3.0; Linux-specific)"
+Like
+.BR CLOCK_BOOTTIME .
+See
+.BR timer_create (2)
+for further details.
+.TP
+.BR CLOCK_PROCESS_CPUTIME_ID " (since Linux 2.6.12)"
+This is a clock that measures CPU time consumed by this process
+(i.e., CPU time consumed by all threads in the process).
+On Linux, this clock is not settable.
+.TP
+.BR CLOCK_THREAD_CPUTIME_ID " (since Linux 2.6.12)"
+This is a clock that measures CPU time consumed by this thread.
+On Linux, this clock is not settable.
+.PP
+Linux also implements dynamic clock instances as described below.
+.SS Dynamic clocks
+In addition to the hard-coded System-V style clock IDs described above,
+Linux also supports
+POSIX clock operations on certain character devices.
+Such devices are
+called "dynamic" clocks, and are supported since Linux 2.6.39.
+.PP
+Using the appropriate macros, open file
+descriptors may be converted into clock IDs and passed to
+.BR clock_gettime (),
+.BR clock_settime (),
+and
+.BR clock_adjtime (2).
+The following example shows how to convert a file descriptor into a
+dynamic clock ID.
+.PP
+.in +4n
+.EX
+#define CLOCKFD 3
+#define FD_TO_CLOCKID(fd) ((\[ti](clockid_t) (fd) << 3) | CLOCKFD)
+#define CLOCKID_TO_FD(clk) ((unsigned int) \[ti]((clk) >> 3))
+\&
+struct timespec ts;
+clockid_t clkid;
+int fd;
+\&
+fd = open("/dev/ptp0", O_RDWR);
+clkid = FD_TO_CLOCKID(fd);
+clock_gettime(clkid, &ts);
+.EE
+.in
+.SH RETURN VALUE
+.BR clock_gettime (),
+.BR clock_settime (),
+and
+.BR clock_getres ()
+return 0 for success.
+On error, \-1 is returned and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EACCES
+.BR clock_settime ()
+does not have write permission for the dynamic POSIX
+clock device indicated.
+.TP
+.B EFAULT
+.I tp
+points outside the accessible address space.
+.TP
+.B EINVAL
+The
+.I clockid
+specified is invalid for one of two reasons.
+Either the System-V style
+hard-coded positive value is out of range, or the dynamic clock ID
+does not refer to a valid instance of a clock object.
+.\" Linux also gives this error on attempts to set CLOCK_PROCESS_CPUTIME_ID
+.\" and CLOCK_THREAD_CPUTIME_ID, when probably the proper error should be
+.\" EPERM.
+.TP
+.B EINVAL
+.RB ( clock_settime ()):
+.I tp.tv_sec
+is negative or
+.I tp.tv_nsec
+is outside the range [0, 999,999,999].
+.TP
+.B EINVAL
+The
+.I clockid
+specified in a call to
+.BR clock_settime ()
+is not a settable clock.
+.TP
+.BR EINVAL " (since Linux 4.3)"
+.\" commit e1d7ba8735551ed79c7a0463a042353574b96da3
+A call to
+.BR clock_settime ()
+with a
+.I clockid
+of
+.B CLOCK_REALTIME
+attempted to set the time to a value less than
+the current value of the
+.B CLOCK_MONOTONIC
+clock.
+.TP
+.B ENODEV
+The hot-pluggable device (like USB for example) represented by a
+dynamic
+.I clockid
+has disappeared after its character device was opened.
+.TP
+.B ENOTSUP
+The operation is not supported by the dynamic POSIX clock device
+specified.
+.TP
+.B EPERM
+.BR clock_settime ()
+does not have permission to set the clock indicated.
+.SH ATTRIBUTES
+For an explanation of the terms used in this section, see
+.BR attributes (7).
+.TS
+allbox;
+lbx lb lb
+l l l.
+Interface Attribute Value
+T{
+.na
+.nh
+.BR clock_getres (),
+.BR clock_gettime (),
+.BR clock_settime ()
+T} Thread safety MT-Safe
+.TE
+.sp 1
+.SH VERSIONS
+POSIX.1 specifies the following:
+.RS
+.PP
+Setting the value of the
+.B CLOCK_REALTIME
+clock via
+.BR clock_settime ()
+shall have no effect on threads that are blocked waiting for a relative time
+service based upon this clock, including the
+.BR nanosleep ()
+function; nor on the expiration of relative timers based upon this clock.
+Consequently, these time services shall expire when the requested relative
+interval elapses, independently of the new or old value of the clock.
+.RE
+.PP
+According to POSIX.1-2001, a process with "appropriate privileges" may set the
+.B CLOCK_PROCESS_CPUTIME_ID
+and
+.B CLOCK_THREAD_CPUTIME_ID
+clocks using
+.BR clock_settime ().
+On Linux, these clocks are not settable
+(i.e., no process has "appropriate privileges").
+.\" See http://bugzilla.kernel.org/show_bug.cgi?id=11972
+.SS C library/kernel differences
+On some architectures, an implementation of
+.BR clock_gettime ()
+is provided in the
+.BR vdso (7).
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+POSIX.1-2001, SUSv2.
+Linux 2.6.
+.PP
+On POSIX systems on which these functions are available, the symbol
+.B _POSIX_TIMERS
+is defined in \fI<unistd.h>\fP to a value greater than 0.
+The symbols
+.BR _POSIX_MONOTONIC_CLOCK ,
+.BR _POSIX_CPUTIME ,
+.B _POSIX_THREAD_CPUTIME
+indicate that
+.BR CLOCK_MONOTONIC ,
+.BR CLOCK_PROCESS_CPUTIME_ID ,
+.B CLOCK_THREAD_CPUTIME_ID
+are available.
+(See also
+.BR sysconf (3).)
+POSIX.1-2008 makes these APIs mandatory.
+.\"
+.SS Historical note for SMP systems
+Before Linux added kernel support for
+.B CLOCK_PROCESS_CPUTIME_ID
+and
+.BR CLOCK_THREAD_CPUTIME_ID ,
+glibc implemented these clocks on many platforms using timer
+registers from the CPUs
+(TSC on i386, AR.ITC on Itanium).
+These registers may differ between CPUs and as a consequence
+these clocks may return
+.B bogus results
+if a process is migrated to another CPU.
+.PP
+If the CPUs in an SMP system have different clock sources, then
+there is no way to maintain a correlation between the timer registers since
+each CPU will run at a slightly different frequency.
+If that is the case, then
+.I clock_getcpuclockid(0)
+will return
+.B ENOENT
+to signify this condition.
+The two clocks will then be useful only if it
+can be ensured that a process stays on a certain CPU.
+.PP
+The processors in an SMP system do not start all at exactly the same
+time and therefore the timer registers are typically running at an offset.
+Some architectures include code that attempts to limit these offsets on bootup.
+However, the code cannot guarantee to accurately tune the offsets.
+glibc contains no provisions to deal with these offsets (unlike the Linux
+kernel).
+Typically these offsets are small and therefore the effects may be
+negligible in most cases.
+.PP
+Since glibc 2.4,
+the wrapper functions for the system calls described in this page avoid
+the abovementioned problems by employing the kernel implementation of
+.B CLOCK_PROCESS_CPUTIME_ID
+and
+.BR CLOCK_THREAD_CPUTIME_ID ,
+on systems that provide such an implementation
+(i.e., Linux 2.6.12 and later).
+.SH EXAMPLES
+The program below demonstrates the use of
+.BR clock_gettime ()
+and
+.BR clock_getres ()
+with various clocks.
+This is an example of what we might see when running the program:
+.PP
+.in +4n
+.EX
+$ \fB./clock_times x\fP
+CLOCK_REALTIME : 1585985459.446 (18356 days + 7h 30m 59s)
+ resolution: 0.000000001
+CLOCK_TAI : 1585985496.447 (18356 days + 7h 31m 36s)
+ resolution: 0.000000001
+CLOCK_MONOTONIC: 52395.722 (14h 33m 15s)
+ resolution: 0.000000001
+CLOCK_BOOTTIME : 72691.019 (20h 11m 31s)
+ resolution: 0.000000001
+.EE
+.in
+.SS Program source
+\&
+.\" SRC BEGIN (clock_getres.c)
+.EX
+/* clock_times.c
+\&
+ Licensed under GNU General Public License v2 or later.
+*/
+#define _XOPEN_SOURCE 600
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+\&
+#define SECS_IN_DAY (24 * 60 * 60)
+\&
+static void
+displayClock(clockid_t clock, const char *name, bool showRes)
+{
+ long days;
+ struct timespec ts;
+\&
+ if (clock_gettime(clock, &ts) == \-1) {
+ perror("clock_gettime");
+ exit(EXIT_FAILURE);
+ }
+\&
+ printf("%\-15s: %10jd.%03ld (", name,
+ (intmax_t) ts.tv_sec, ts.tv_nsec / 1000000);
+\&
+ days = ts.tv_sec / SECS_IN_DAY;
+ if (days > 0)
+ printf("%ld days + ", days);
+\&
+ printf("%2dh %2dm %2ds",
+ (int) (ts.tv_sec % SECS_IN_DAY) / 3600,
+ (int) (ts.tv_sec % 3600) / 60,
+ (int) ts.tv_sec % 60);
+ printf(")\en");
+\&
+ if (clock_getres(clock, &ts) == \-1) {
+ perror("clock_getres");
+ exit(EXIT_FAILURE);
+ }
+\&
+ if (showRes)
+ printf(" resolution: %10jd.%09ld\en",
+ (intmax_t) ts.tv_sec, ts.tv_nsec);
+}
+\&
+int
+main(int argc, char *argv[])
+{
+ bool showRes = argc > 1;
+\&
+ displayClock(CLOCK_REALTIME, "CLOCK_REALTIME", showRes);
+#ifdef CLOCK_TAI
+ displayClock(CLOCK_TAI, "CLOCK_TAI", showRes);
+#endif
+ displayClock(CLOCK_MONOTONIC, "CLOCK_MONOTONIC", showRes);
+#ifdef CLOCK_BOOTTIME
+ displayClock(CLOCK_BOOTTIME, "CLOCK_BOOTTIME", showRes);
+#endif
+ exit(EXIT_SUCCESS);
+}
+.EE
+.\" SRC END
+.SH SEE ALSO
+.BR date (1),
+.BR gettimeofday (2),
+.BR settimeofday (2),
+.BR time (2),
+.BR adjtime (3),
+.BR clock_getcpuclockid (3),
+.BR ctime (3),
+.BR ftime (3),
+.BR pthread_getcpuclockid (3),
+.BR sysconf (3),
+.BR timespec (3),
+.BR time (7),
+.BR time_namespaces (7),
+.BR vdso (7),
+.BR hwclock (8)
diff --git a/man2/clock_gettime.2 b/man2/clock_gettime.2
new file mode 100644
index 0000000..5a599b4
--- /dev/null
+++ b/man2/clock_gettime.2
@@ -0,0 +1 @@
+.so man2/clock_getres.2
diff --git a/man2/clock_nanosleep.2 b/man2/clock_nanosleep.2
new file mode 100644
index 0000000..d1e53a6
--- /dev/null
+++ b/man2/clock_nanosleep.2
@@ -0,0 +1,253 @@
+.\" Copyright (c) 2008, Linux Foundation, written by Michael Kerrisk
+.\" <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.TH clock_nanosleep 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+clock_nanosleep \- high-resolution sleep with specifiable clock
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc ),
+since glibc 2.17
+.PP
+Before glibc 2.17,
+Real-time library
+.RI ( librt ", " \-lrt )
+.SH SYNOPSIS
+.B #include <time.h>
+.nf
+.PP
+.BI "int clock_nanosleep(clockid_t " clockid ", int " flags ,
+.BI " const struct timespec *" request ,
+.BI " struct timespec *_Nullable " remain );
+.fi
+.PP
+.RS -4
+Feature Test Macro Requirements for glibc (see
+.BR feature_test_macros (7)):
+.RE
+.PP
+.BR clock_nanosleep ():
+.nf
+ _POSIX_C_SOURCE >= 200112L
+.fi
+.SH DESCRIPTION
+Like
+.BR nanosleep (2),
+.BR clock_nanosleep ()
+allows the calling thread to sleep for an interval specified
+with nanosecond precision.
+It differs in allowing the caller to select the clock against
+which the sleep interval is to be measured,
+and in allowing the sleep interval to be specified as
+either an absolute or a relative value.
+.PP
+The time values passed to and returned by this call are specified using
+.BR timespec (3)
+structures.
+.PP
+The
+.I clockid
+argument specifies the clock against which the sleep interval
+is to be measured.
+This argument can have one of the following values:
+.\" Look in time/posix-timers.c (kernel 5.6 sources) for the
+.\" 'struct k_clock' structures that have an 'nsleep' method
+.TP
+.B CLOCK_REALTIME
+A settable system-wide real-time clock.
+.TP
+.BR CLOCK_TAI " (since Linux 3.10)"
+A system-wide clock derived from wall-clock time but ignoring leap seconds.
+.TP
+.B CLOCK_MONOTONIC
+A nonsettable, monotonically increasing clock that measures time
+since some unspecified point in the past that does not change after
+system startup.
+.\" On Linux this clock measures time since boot.
+.TP
+.BR CLOCK_BOOTTIME " (since Linux 2.6.39)"
+Identical to
+.BR CLOCK_MONOTONIC ,
+except that it also includes any time that the system is suspended.
+.TP
+.B CLOCK_PROCESS_CPUTIME_ID
+A settable per-process clock that measures CPU time consumed
+by all threads in the process.
+.\" There is some trickery between glibc and the kernel
+.\" to deal with the CLOCK_PROCESS_CPUTIME_ID case.
+.PP
+See
+.BR clock_getres (2)
+for further details on these clocks.
+In addition, the CPU clock IDs returned by
+.BR clock_getcpuclockid (3)
+and
+.BR pthread_getcpuclockid (3)
+can also be passed in
+.IR clockid .
+.\" Sleeping against CLOCK_REALTIME_ALARM and CLOCK_BOOTTIME_ALARM
+.\" is also possible (tested), with CAP_WAKE_ALARM, but I'm not
+.\" sure if this is useful or needs to be documented.
+.PP
+If
+.I flags
+is 0, then the value specified in
+.I request
+is interpreted as an interval relative to the current
+value of the clock specified by
+.IR clockid .
+.PP
+If
+.I flags
+is
+.BR TIMER_ABSTIME ,
+then
+.I request
+is interpreted as an absolute time as measured by the clock,
+.IR clockid .
+If
+.I request
+is less than or equal to the current value of the clock,
+then
+.BR clock_nanosleep ()
+returns immediately without suspending the calling thread.
+.PP
+.BR clock_nanosleep ()
+suspends the execution of the calling thread
+until either at least the time specified by
+.I request
+has elapsed,
+or a signal is delivered that causes a signal handler to be called or
+that terminates the process.
+.PP
+If the call is interrupted by a signal handler,
+.BR clock_nanosleep ()
+fails with the error
+.BR EINTR .
+In addition, if
+.I remain
+is not NULL, and
+.I flags
+was not
+.BR TIMER_ABSTIME ,
+it returns the remaining unslept time in
+.IR remain .
+This value can then be used to call
+.BR clock_nanosleep ()
+again and complete a (relative) sleep.
+.SH RETURN VALUE
+On successfully sleeping for the requested interval,
+.BR clock_nanosleep ()
+returns 0.
+If the call is interrupted by a signal handler or encounters an error,
+then it returns one of the positive error numbers listed in ERRORS.
+.SH ERRORS
+.TP
+.B EFAULT
+.I request
+or
+.I remain
+specified an invalid address.
+.TP
+.B EINTR
+The sleep was interrupted by a signal handler; see
+.BR signal (7).
+.TP
+.B EINVAL
+The value in the
+.I tv_nsec
+field was not in the range [0, 999999999] or
+.I tv_sec
+was negative.
+.TP
+.B EINVAL
+.I clockid
+was invalid.
+.RB ( CLOCK_THREAD_CPUTIME_ID
+is not a permitted value for
+.IR clockid .)
+.TP
+.B ENOTSUP
+The kernel does not support sleeping against this
+.IR clockid .
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+POSIX.1-2001.
+Linux 2.6,
+glibc 2.1.
+.SH NOTES
+If the interval specified in
+.I request
+is not an exact multiple of the granularity of the underlying clock (see
+.BR time (7)),
+then the interval will be rounded up to the next multiple.
+Furthermore, after the sleep completes, there may still be a delay before
+the CPU becomes free to once again execute the calling thread.
+.PP
+Using an absolute timer is useful for preventing
+timer drift problems of the type described in
+.BR nanosleep (2).
+(Such problems are exacerbated in programs that try to restart
+a relative sleep that is repeatedly interrupted by signals.)
+To perform a relative sleep that avoids these problems, call
+.BR clock_gettime (2)
+for the desired clock,
+add the desired interval to the returned time value,
+and then call
+.BR clock_nanosleep ()
+with the
+.B TIMER_ABSTIME
+flag.
+.PP
+.BR clock_nanosleep ()
+is never restarted after being interrupted by a signal handler,
+regardless of the use of the
+.BR sigaction (2)
+.B SA_RESTART
+flag.
+.PP
+The
+.I remain
+argument is unused, and unnecessary, when
+.I flags
+is
+.BR TIMER_ABSTIME .
+(An absolute sleep can be restarted using the same
+.I request
+argument.)
+.PP
+POSIX.1 specifies that
+.BR clock_nanosleep ()
+has no effect on signal dispositions or the signal mask.
+.PP
+POSIX.1 specifies that after changing the value of the
+.B CLOCK_REALTIME
+clock via
+.BR clock_settime (2),
+the new clock value shall be used to determine the time
+at which a thread blocked on an absolute
+.BR clock_nanosleep ()
+will wake up;
+if the new clock value falls past the end of the sleep interval, then the
+.BR clock_nanosleep ()
+call will return immediately.
+.PP
+POSIX.1 specifies that
+changing the value of the
+.B CLOCK_REALTIME
+clock via
+.BR clock_settime (2)
+shall have no effect on a thread that is blocked on a relative
+.BR clock_nanosleep ().
+.SH SEE ALSO
+.BR clock_getres (2),
+.BR nanosleep (2),
+.BR restart_syscall (2),
+.BR timer_create (2),
+.BR sleep (3),
+.BR timespec (3),
+.BR usleep (3),
+.BR time (7)
diff --git a/man2/clock_settime.2 b/man2/clock_settime.2
new file mode 100644
index 0000000..5a599b4
--- /dev/null
+++ b/man2/clock_settime.2
@@ -0,0 +1 @@
+.so man2/clock_getres.2
diff --git a/man2/clone.2 b/man2/clone.2
new file mode 100644
index 0000000..38d2b90
--- /dev/null
+++ b/man2/clone.2
@@ -0,0 +1,1944 @@
+'\" t
+.\" Copyright (c) 1992 Drew Eckhardt <drew@cs.colorado.edu>, March 28, 1992
+.\" and Copyright (c) Michael Kerrisk, 2001, 2002, 2005, 2013, 2019
+.\"
+.\" SPDX-License-Identifier: GPL-1.0-or-later
+.\"
+.\" Modified by Michael Haardt <michael@moria.de>
+.\" Modified 24 Jul 1993 by Rik Faith <faith@cs.unc.edu>
+.\" Modified 21 Aug 1994 by Michael Chastain <mec@shell.portal.com>:
+.\" New man page (copied from 'fork.2').
+.\" Modified 10 June 1995 by Andries Brouwer <aeb@cwi.nl>
+.\" Modified 25 April 1998 by Xavier Leroy <Xavier.Leroy@inria.fr>
+.\" Modified 26 Jun 2001 by Michael Kerrisk
+.\" Mostly upgraded to Linux 2.4.x
+.\" Added prototype for sys_clone() plus description
+.\" Added CLONE_THREAD with a brief description of thread groups
+.\" Added CLONE_PARENT and revised entire page remove ambiguity
+.\" between "calling process" and "parent process"
+.\" Added CLONE_PTRACE and CLONE_VFORK
+.\" Added EPERM and EINVAL error codes
+.\" Renamed "__clone" to "clone" (which is the prototype in <sched.h>)
+.\" various other minor tidy ups and clarifications.
+.\" Modified 26 Jun 2001 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Updated notes for 2.4.7+ behavior of CLONE_THREAD
+.\" Modified 15 Oct 2002 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Added description for CLONE_NEWNS, which was added in Linux 2.4.19
+.\" Slightly rephrased, aeb.
+.\" Modified 1 Feb 2003 - added CLONE_SIGHAND restriction, aeb.
+.\" Modified 1 Jan 2004 - various updates, aeb
+.\" Modified 2004-09-10 - added CLONE_PARENT_SETTID etc. - aeb.
+.\" 2005-04-12, mtk, noted the PID caching behavior of NPTL's getpid()
+.\" wrapper under BUGS.
+.\" 2005-05-10, mtk, added CLONE_SYSVSEM, CLONE_UNTRACED, CLONE_STOPPED.
+.\" 2005-05-17, mtk, Substantially enhanced discussion of CLONE_THREAD.
+.\" 2008-11-18, mtk, order CLONE_* flags alphabetically
+.\" 2008-11-18, mtk, document CLONE_NEWPID
+.\" 2008-11-19, mtk, document CLONE_NEWUTS
+.\" 2008-11-19, mtk, document CLONE_NEWIPC
+.\" 2008-11-19, Jens Axboe, mtk, document CLONE_IO
+.\"
+.TH clone 2 2023-05-03 "Linux man-pages 6.05.01"
+.SH NAME
+clone, __clone2, clone3 \- create a child process
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+/* Prototype for the glibc wrapper function */
+.PP
+.B #define _GNU_SOURCE
+.B #include <sched.h>
+.PP
+.BI "int clone(int (*" "fn" ")(void *_Nullable), void *" stack \
+", int " flags ,
+.BI " void *_Nullable " "arg" ", ..." \
+" \fR/*\fP" " pid_t *_Nullable " parent_tid ,
+.BI " void *_Nullable " tls ,
+.BI " pid_t *_Nullable " child_tid " \fR*/\fP );"
+.PP
+/* For the prototype of the raw clone() system call, see NOTES */
+.PP
+.BR "#include <linux/sched.h>" " /* Definition of " "struct clone_args" " */"
+.BR "#include <sched.h>" " /* Definition of " CLONE_* " constants */"
+.BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */"
+.B #include <unistd.h>
+.PP
+.BI "long syscall(SYS_clone3, struct clone_args *" cl_args ", size_t " size );
+.fi
+.PP
+.IR Note :
+glibc provides no wrapper for
+.BR clone3 (),
+necessitating the use of
+.BR syscall (2).
+.SH DESCRIPTION
+These system calls
+create a new ("child") process, in a manner similar to
+.BR fork (2).
+.PP
+By contrast with
+.BR fork (2),
+these system calls provide more precise control over what pieces of execution
+context are shared between the calling process and the child process.
+For example, using these system calls, the caller can control whether
+or not the two processes share the virtual address space,
+the table of file descriptors, and the table of signal handlers.
+These system calls also allow the new child process to be placed
+in separate
+.BR namespaces (7).
+.PP
+Note that in this manual
+page, "calling process" normally corresponds to "parent process".
+But see the descriptions of
+.B CLONE_PARENT
+and
+.B CLONE_THREAD
+below.
+.PP
+This page describes the following interfaces:
+.IP \[bu] 3
+The glibc
+.BR clone ()
+wrapper function and the underlying system call on which it is based.
+The main text describes the wrapper function;
+the differences for the raw system call
+are described toward the end of this page.
+.IP \[bu]
+The newer
+.BR clone3 ()
+system call.
+.PP
+In the remainder of this page, the terminology "the clone call" is used
+when noting details that apply to all of these interfaces.
+.\"
+.SS The clone() wrapper function
+When the child process is created with the
+.BR clone ()
+wrapper function,
+it commences execution by calling the function pointed to by the argument
+.IR fn .
+(This differs from
+.BR fork (2),
+where execution continues in the child from the point
+of the
+.BR fork (2)
+call.)
+The
+.I arg
+argument is passed as the argument of the function
+.IR fn .
+.PP
+When the
+.IR fn ( arg )
+function returns, the child process terminates.
+The integer returned by
+.I fn
+is the exit status for the child process.
+The child process may also terminate explicitly by calling
+.BR exit (2)
+or after receiving a fatal signal.
+.PP
+The
+.I stack
+argument specifies the location of the stack used by the child process.
+Since the child and calling process may share memory,
+it is not possible for the child process to execute in the
+same stack as the calling process.
+The calling process must therefore
+set up memory space for the child stack and pass a pointer to this
+space to
+.BR clone ().
+Stacks grow downward on all processors that run Linux
+(except the HP PA processors), so
+.I stack
+usually points to the topmost address of the memory space set up for
+the child stack.
+Note that
+.BR clone ()
+does not provide a means whereby the caller can inform the kernel of the
+size of the stack area.
+.PP
+The remaining arguments to
+.BR clone ()
+are discussed below.
+.\"
+.SS clone3()
+The
+.BR clone3 ()
+system call provides a superset of the functionality of the older
+.BR clone ()
+interface.
+It also provides a number of API improvements, including:
+space for additional flags bits;
+cleaner separation in the use of various arguments;
+and the ability to specify the size of the child's stack area.
+.PP
+As with
+.BR fork (2),
+.BR clone3 ()
+returns in both the parent and the child.
+It returns 0 in the child process and returns the PID of the child
+in the parent.
+.PP
+The
+.I cl_args
+argument of
+.BR clone3 ()
+is a structure of the following form:
+.PP
+.in +4n
+.EX
+struct clone_args {
+ u64 flags; /* Flags bit mask */
+ u64 pidfd; /* Where to store PID file descriptor
+ (\fIint *\fP) */
+ u64 child_tid; /* Where to store child TID,
+ in child\[aq]s memory (\fIpid_t *\fP) */
+ u64 parent_tid; /* Where to store child TID,
+ in parent\[aq]s memory (\fIpid_t *\fP) */
+ u64 exit_signal; /* Signal to deliver to parent on
+ child termination */
+ u64 stack; /* Pointer to lowest byte of stack */
+ u64 stack_size; /* Size of stack */
+ u64 tls; /* Location of new TLS */
+ u64 set_tid; /* Pointer to a \fIpid_t\fP array
+ (since Linux 5.5) */
+ u64 set_tid_size; /* Number of elements in \fIset_tid\fP
+ (since Linux 5.5) */
+ u64 cgroup; /* File descriptor for target cgroup
+ of child (since Linux 5.7) */
+};
+.EE
+.in
+.PP
+The
+.I size
+argument that is supplied to
+.BR clone3 ()
+should be initialized to the size of this structure.
+(The existence of the
+.I size
+argument permits future extensions to the
+.I clone_args
+structure.)
+.PP
+The stack for the child process is specified via
+.IR cl_args.stack ,
+which points to the lowest byte of the stack area,
+and
+.IR cl_args.stack_size ,
+which specifies the size of the stack in bytes.
+In the case where the
+.B CLONE_VM
+flag (see below) is specified, a stack must be explicitly allocated
+and specified.
+Otherwise, these two fields can be specified as NULL and 0,
+which causes the child to use the same stack area as the parent
+(in the child's own virtual address space).
+.PP
+The remaining fields in the
+.I cl_args
+argument are discussed below.
+.\"
+.SS Equivalence between clone() and clone3() arguments
+Unlike the older
+.BR clone ()
+interface, where arguments are passed individually, in the newer
+.BR clone3 ()
+interface the arguments are packaged into the
+.I clone_args
+structure shown above.
+This structure allows for a superset of the information passed via the
+.BR clone ()
+arguments.
+.PP
+The following table shows the equivalence between the arguments of
+.BR clone ()
+and the fields in the
+.I clone_args
+argument supplied to
+.BR clone3 ():
+.RS 4
+.TS
+lb lb lb
+l l l
+li li l.
+clone() clone3() Notes
+ \fIcl_args\fP field
+flags & \[ti]0xff flags T{
+For most flags; details below
+T}
+parent_tid pidfd See CLONE_PIDFD
+child_tid child_tid See CLONE_CHILD_SETTID
+parent_tid parent_tid See CLONE_PARENT_SETTID
+flags & 0xff exit_signal
+stack stack
+\fP---\fP stack_size
+tls tls See CLONE_SETTLS
+\fP---\fP set_tid See below for details
+\fP---\fP set_tid_size
+\fP---\fP cgroup See CLONE_INTO_CGROUP
+.TE
+.RE
+.\"
+.SS The child termination signal
+When the child process terminates, a signal may be sent to the parent.
+The termination signal is specified in the low byte of
+.I flags
+.RB ( clone ())
+or in
+.I cl_args.exit_signal
+.RB ( clone3 ()).
+If this signal is specified as anything other than
+.BR SIGCHLD ,
+then the parent process must specify the
+.B __WALL
+or
+.B __WCLONE
+options when waiting for the child with
+.BR wait (2).
+If no signal (i.e., zero) is specified, then the parent process is not signaled
+when the child terminates.
+.\"
+.SS The set_tid array
+By default, the kernel chooses the next sequential PID for the new
+process in each of the PID namespaces where it is present.
+When creating a process with
+.BR clone3 (),
+the
+.I set_tid
+array (available since Linux 5.5)
+can be used to select specific PIDs for the process in some
+or all of the PID namespaces where it is present.
+If the PID of the newly created process should be set only for the current
+PID namespace or in the newly created PID namespace (if
+.I flags
+contains
+.BR CLONE_NEWPID )
+then the first element in the
+.I set_tid
+array has to be the desired PID and
+.I set_tid_size
+needs to be 1.
+.PP
+If the PID of the newly created process should have a certain value in
+multiple PID namespaces, then the
+.I set_tid
+array can have multiple entries.
+The first entry defines the PID in the most
+deeply nested PID namespace and each of the following entries contains
+the PID in the
+corresponding ancestor PID namespace.
+The number of PID namespaces in which a PID
+should be set is defined by
+.I set_tid_size
+which cannot be larger than the number of currently nested PID namespaces.
+.PP
+To create a process with the following PIDs in a PID namespace hierarchy:
+.RS 4
+.TS
+lb lb lb
+l l l.
+PID NS level Requested PID Notes
+0 31496 Outermost PID namespace
+1 42
+2 7 Innermost PID namespace
+.TE
+.RE
+.PP
+Set the array to:
+.PP
+.in +4n
+.EX
+set_tid[0] = 7;
+set_tid[1] = 42;
+set_tid[2] = 31496;
+set_tid_size = 3;
+.EE
+.in
+.PP
+If only the PIDs in the two innermost PID namespaces
+need to be specified, set the array to:
+.PP
+.in +4n
+.EX
+set_tid[0] = 7;
+set_tid[1] = 42;
+set_tid_size = 2;
+.EE
+.in
+.PP
+The PID in the PID namespaces outside the two innermost PID namespaces
+is selected the same way as any other PID is selected.
+.PP
+The
+.I set_tid
+feature requires
+.B CAP_SYS_ADMIN
+or
+(since Linux 5.9)
+.\" commit 124ea650d3072b005457faed69909221c2905a1f
+.\" commit 1caef81da05a84a40dbf02110e967ce6d1135ff6
+.B CAP_CHECKPOINT_RESTORE
+in all owning user namespaces of the target PID namespaces.
+.PP
+Callers may only choose a PID greater than 1 in a given PID namespace
+if an
+.B init
+process (i.e., a process with PID 1) already exists in that namespace.
+Otherwise the PID
+entry for this PID namespace must be 1.
+.\"
+.SS The flags mask
+Both
+.BR clone ()
+and
+.BR clone3 ()
+allow a flags bit mask that modifies their behavior
+and allows the caller to specify what is shared between the calling process
+and the child process.
+This bit mask\[em]the
+.I flags
+argument of
+.BR clone ()
+or the
+.I cl_args.flags
+field passed to
+.BR clone3 ()\[em]is
+referred to as the
+.I flags
+mask in the remainder of this page.
+.PP
+The
+.I flags
+mask is specified as a bitwise OR of zero or more of
+the constants listed below.
+Except as noted below, these flags are available
+(and have the same effect) in both
+.BR clone ()
+and
+.BR clone3 ().
+.TP
+.BR CLONE_CHILD_CLEARTID " (since Linux 2.5.49)"
+Clear (zero) the child thread ID at the location pointed to by
+.I child_tid
+.RB ( clone ())
+or
+.I cl_args.child_tid
+.RB ( clone3 ())
+in child memory when the child exits, and do a wakeup on the futex
+at that address.
+The address involved may be changed by the
+.BR set_tid_address (2)
+system call.
+This is used by threading libraries.
+.TP
+.BR CLONE_CHILD_SETTID " (since Linux 2.5.49)"
+Store the child thread ID at the location pointed to by
+.I child_tid
+.RB ( clone ())
+or
+.I cl_args.child_tid
+.RB ( clone3 ())
+in the child's memory.
+The store operation completes before the clone call
+returns control to user space in the child process.
+(Note that the store operation may not have completed before the clone call
+returns in the parent process, which is relevant if the
+.B CLONE_VM
+flag is also employed.)
+.TP
+.BR CLONE_CLEAR_SIGHAND " (since Linux 5.5)"
+.\" commit b612e5df4587c934bd056bf05f4a1deca4de4f75
+By default, signal dispositions in the child thread are the same as
+in the parent.
+If this flag is specified,
+then all signals that are handled in the parent
+are reset to their default dispositions
+.RB ( SIG_DFL )
+in the child.
+.IP
+Specifying this flag together with
+.B CLONE_SIGHAND
+is nonsensical and disallowed.
+.TP
+.BR CLONE_DETACHED " (historical)"
+For a while (during the Linux 2.5 development series)
+.\" added in Linux 2.5.32; removed in Linux 2.6.0-test4
+there was a
+.B CLONE_DETACHED
+flag,
+which caused the parent not to receive a signal when the child terminated.
+Ultimately, the effect of this flag was subsumed under the
+.B CLONE_THREAD
+flag and by the time Linux 2.6.0 was released, this flag had no effect.
+Starting in Linux 2.6.2, the need to give this flag together with
+.B CLONE_THREAD
+disappeared.
+.IP
+This flag is still defined, but it is usually ignored when calling
+.BR clone ().
+However, see the description of
+.B CLONE_PIDFD
+for some exceptions.
+.TP
+.BR CLONE_FILES " (since Linux 2.0)"
+If
+.B CLONE_FILES
+is set, the calling process and the child process share the same file
+descriptor table.
+Any file descriptor created by the calling process or by the child
+process is also valid in the other process.
+Similarly, if one of the processes closes a file descriptor,
+or changes its associated flags (using the
+.BR fcntl (2)
+.B F_SETFD
+operation), the other process is also affected.
+If a process sharing a file descriptor table calls
+.BR execve (2),
+its file descriptor table is duplicated (unshared).
+.IP
+If
+.B CLONE_FILES
+is not set, the child process inherits a copy of all file descriptors
+opened in the calling process at the time of the clone call.
+Subsequent operations that open or close file descriptors,
+or change file descriptor flags,
+performed by either the calling
+process or the child process do not affect the other process.
+Note, however,
+that the duplicated file descriptors in the child refer to the same
+open file descriptions as the corresponding file descriptors
+in the calling process,
+and thus share file offsets and file status flags (see
+.BR open (2)).
+.TP
+.BR CLONE_FS " (since Linux 2.0)"
+If
+.B CLONE_FS
+is set, the caller and the child process share the same filesystem
+information.
+This includes the root of the filesystem, the current
+working directory, and the umask.
+Any call to
+.BR chroot (2),
+.BR chdir (2),
+or
+.BR umask (2)
+performed by the calling process or the child process also affects the
+other process.
+.IP
+If
+.B CLONE_FS
+is not set, the child process works on a copy of the filesystem
+information of the calling process at the time of the clone call.
+Calls to
+.BR chroot (2),
+.BR chdir (2),
+or
+.BR umask (2)
+performed later by one of the processes do not affect the other process.
+.TP
+.BR CLONE_INTO_CGROUP " (since Linux 5.7)"
+.\" commit ef2c41cf38a7559bbf91af42d5b6a4429db8fc68
+By default, a child process is placed in the same version 2
+cgroup as its parent.
+The
+.B CLONE_INTO_CGROUP
+flag allows the child process to be created in a different version 2 cgroup.
+(Note that
+.B CLONE_INTO_CGROUP
+has effect only for version 2 cgroups.)
+.IP
+In order to place the child process in a different cgroup,
+the caller specifies
+.B CLONE_INTO_CGROUP
+in
+.I cl_args.flags
+and passes a file descriptor that refers to a version 2 cgroup in the
+.I cl_args.cgroup
+field.
+(This file descriptor can be obtained by opening a cgroup v2 directory
+using either the
+.B O_RDONLY
+or the
+.B O_PATH
+flag.)
+Note that all of the usual restrictions (described in
+.BR cgroups (7))
+on placing a process into a version 2 cgroup apply.
+.IP
+Among the possible use cases for
+.B CLONE_INTO_CGROUP
+are the following:
+.RS
+.IP \[bu] 3
+Spawning a process into a cgroup different from the parent's cgroup
+makes it possible for a service manager to directly spawn new
+services into dedicated cgroups.
+This eliminates the accounting
+jitter that would be caused if the child process was first created in the
+same cgroup as the parent and then
+moved into the target cgroup.
+Furthermore, spawning the child process directly into a target cgroup
+is significantly cheaper than moving the child process into
+the target cgroup after it has been created.
+.IP \[bu]
+The
+.B CLONE_INTO_CGROUP
+flag also allows the creation of
+frozen child processes by spawning them into a frozen cgroup.
+(See
+.BR cgroups (7)
+for a description of the freezer controller.)
+.IP \[bu]
+For threaded applications (or even thread implementations which
+make use of cgroups to limit individual threads), it is possible to
+establish a fixed cgroup layout before spawning each thread
+directly into its target cgroup.
+.RE
+.TP
+.BR CLONE_IO " (since Linux 2.6.25)"
+If
+.B CLONE_IO
+is set, then the new process shares an I/O context with
+the calling process.
+If this flag is not set, then (as with
+.BR fork (2))
+the new process has its own I/O context.
+.IP
+.\" The following based on text from Jens Axboe
+The I/O context is the I/O scope of the disk scheduler (i.e.,
+what the I/O scheduler uses to model scheduling of a process's I/O).
+If processes share the same I/O context,
+they are treated as one by the I/O scheduler.
+As a consequence, they get to share disk time.
+For some I/O schedulers,
+.\" the anticipatory and CFQ scheduler
+if two processes share an I/O context,
+they will be allowed to interleave their disk access.
+If several threads are doing I/O on behalf of the same process
+.RB ( aio_read (3),
+for instance), they should employ
+.B CLONE_IO
+to get better I/O performance.
+.\" with CFQ and AS.
+.IP
+If the kernel is not configured with the
+.B CONFIG_BLOCK
+option, this flag is a no-op.
+.TP
+.BR CLONE_NEWCGROUP " (since Linux 4.6)"
+Create the process in a new cgroup namespace.
+If this flag is not set, then (as with
+.BR fork (2))
+the process is created in the same cgroup namespace as the calling process.
+.IP
+For further information on cgroup namespaces, see
+.BR cgroup_namespaces (7).
+.IP
+Only a privileged process
+.RB ( CAP_SYS_ADMIN )
+can employ
+.BR CLONE_NEWCGROUP .
+.\"
+.TP
+.BR CLONE_NEWIPC " (since Linux 2.6.19)"
+If
+.B CLONE_NEWIPC
+is set, then create the process in a new IPC namespace.
+If this flag is not set, then (as with
+.BR fork (2)),
+the process is created in the same IPC namespace as
+the calling process.
+.IP
+For further information on IPC namespaces, see
+.BR ipc_namespaces (7).
+.IP
+Only a privileged process
+.RB ( CAP_SYS_ADMIN )
+can employ
+.BR CLONE_NEWIPC .
+This flag can't be specified in conjunction with
+.BR CLONE_SYSVSEM .
+.TP
+.BR CLONE_NEWNET " (since Linux 2.6.24)"
+(The implementation of this flag was completed only
+by about Linux 2.6.29.)
+.IP
+If
+.B CLONE_NEWNET
+is set, then create the process in a new network namespace.
+If this flag is not set, then (as with
+.BR fork (2))
+the process is created in the same network namespace as
+the calling process.
+.IP
+For further information on network namespaces, see
+.BR network_namespaces (7).
+.IP
+Only a privileged process
+.RB ( CAP_SYS_ADMIN )
+can employ
+.BR CLONE_NEWNET .
+.TP
+.BR CLONE_NEWNS " (since Linux 2.4.19)"
+If
+.B CLONE_NEWNS
+is set, the cloned child is started in a new mount namespace,
+initialized with a copy of the namespace of the parent.
+If
+.B CLONE_NEWNS
+is not set, the child lives in the same mount
+namespace as the parent.
+.IP
+For further information on mount namespaces, see
+.BR namespaces (7)
+and
+.BR mount_namespaces (7).
+.IP
+Only a privileged process
+.RB ( CAP_SYS_ADMIN )
+can employ
+.BR CLONE_NEWNS .
+It is not permitted to specify both
+.B CLONE_NEWNS
+and
+.B CLONE_FS
+.\" See https://lwn.net/Articles/543273/
+in the same clone call.
+.TP
+.BR CLONE_NEWPID " (since Linux 2.6.24)"
+.\" This explanation draws a lot of details from
+.\" http://lwn.net/Articles/259217/
+.\" Authors: Pavel Emelyanov <xemul@openvz.org>
+.\" and Kir Kolyshkin <kir@openvz.org>
+.\"
+.\" The primary kernel commit is 30e49c263e36341b60b735cbef5ca37912549264
+.\" Author: Pavel Emelyanov <xemul@openvz.org>
+If
+.B CLONE_NEWPID
+is set, then create the process in a new PID namespace.
+If this flag is not set, then (as with
+.BR fork (2))
+the process is created in the same PID namespace as
+the calling process.
+.IP
+For further information on PID namespaces, see
+.BR namespaces (7)
+and
+.BR pid_namespaces (7).
+.IP
+Only a privileged process
+.RB ( CAP_SYS_ADMIN )
+can employ
+.BR CLONE_NEWPID .
+This flag can't be specified in conjunction with
+.B CLONE_THREAD
+or
+.BR CLONE_PARENT .
+.TP
+.B CLONE_NEWUSER
+(This flag first became meaningful for
+.BR clone ()
+in Linux 2.6.23,
+the current
+.BR clone ()
+semantics were merged in Linux 3.5,
+and the final pieces to make the user namespaces completely usable were
+merged in Linux 3.8.)
+.IP
+If
+.B CLONE_NEWUSER
+is set, then create the process in a new user namespace.
+If this flag is not set, then (as with
+.BR fork (2))
+the process is created in the same user namespace as the calling process.
+.IP
+For further information on user namespaces, see
+.BR namespaces (7)
+and
+.BR user_namespaces (7).
+.IP
+Before Linux 3.8, use of
+.B CLONE_NEWUSER
+required that the caller have three capabilities:
+.BR CAP_SYS_ADMIN ,
+.BR CAP_SETUID ,
+and
+.BR CAP_SETGID .
+.\" Before Linux 2.6.29, it appears that only CAP_SYS_ADMIN was needed
+Starting with Linux 3.8,
+no privileges are needed to create a user namespace.
+.IP
+This flag can't be specified in conjunction with
+.B CLONE_THREAD
+or
+.BR CLONE_PARENT .
+For security reasons,
+.\" commit e66eded8309ebf679d3d3c1f5820d1f2ca332c71
+.\" https://lwn.net/Articles/543273/
+.\" The fix actually went into Linux 3.9 and into Linux 3.8.3. However, user namespaces
+.\" were, for practical purposes, unusable in earlier Linux 3.8.x because of the
+.\" various filesystems that didn't support userns.
+.B CLONE_NEWUSER
+cannot be specified in conjunction with
+.BR CLONE_FS .
+.TP
+.BR CLONE_NEWUTS " (since Linux 2.6.19)"
+If
+.B CLONE_NEWUTS
+is set, then create the process in a new UTS namespace,
+whose identifiers are initialized by duplicating the identifiers
+from the UTS namespace of the calling process.
+If this flag is not set, then (as with
+.BR fork (2))
+the process is created in the same UTS namespace as
+the calling process.
+.IP
+For further information on UTS namespaces, see
+.BR uts_namespaces (7).
+.IP
+Only a privileged process
+.RB ( CAP_SYS_ADMIN )
+can employ
+.BR CLONE_NEWUTS .
+.TP
+.BR CLONE_PARENT " (since Linux 2.3.12)"
+If
+.B CLONE_PARENT
+is set, then the parent of the new child (as returned by
+.BR getppid (2))
+will be the same as that of the calling process.
+.IP
+If
+.B CLONE_PARENT
+is not set, then (as with
+.BR fork (2))
+the child's parent is the calling process.
+.IP
+Note that it is the parent process, as returned by
+.BR getppid (2),
+which is signaled when the child terminates, so that
+if
+.B CLONE_PARENT
+is set, then the parent of the calling process, rather than the
+calling process itself, is signaled.
+.IP
+The
+.B CLONE_PARENT
+flag can't be used in clone calls by the
+global init process (PID 1 in the initial PID namespace)
+and init processes in other PID namespaces.
+This restriction prevents the creation of multi-rooted process trees
+as well as the creation of unreapable zombies in the initial PID namespace.
+.TP
+.BR CLONE_PARENT_SETTID " (since Linux 2.5.49)"
+Store the child thread ID at the location pointed to by
+.I parent_tid
+.RB ( clone ())
+or
+.I cl_args.parent_tid
+.RB ( clone3 ())
+in the parent's memory.
+(In Linux 2.5.32-2.5.48 there was a flag
+.B CLONE_SETTID
+that did this.)
+The store operation completes before the clone call
+returns control to user space.
+.TP
+.BR CLONE_PID " (Linux 2.0 to Linux 2.5.15)"
+If
+.B CLONE_PID
+is set, the child process is created with the same process ID as
+the calling process.
+This is good for hacking the system, but otherwise
+of not much use.
+From Linux 2.3.21 onward, this flag could be
+specified only by the system boot process (PID 0).
+The flag disappeared completely from the kernel sources in Linux 2.5.16.
+Subsequently, the kernel silently ignored this bit if it was specified in the
+.I flags
+mask.
+Much later, the same bit was recycled for use as the
+.B CLONE_PIDFD
+flag.
+.TP
+.BR CLONE_PIDFD " (since Linux 5.2)"
+.\" commit b3e5838252665ee4cfa76b82bdf1198dca81e5be
+If this flag is specified,
+a PID file descriptor referring to the child process is allocated
+and placed at a specified location in the parent's memory.
+The close-on-exec flag is set on this new file descriptor.
+PID file descriptors can be used for the purposes described in
+.BR pidfd_open (2).
+.RS
+.IP \[bu] 3
+When using
+.BR clone3 (),
+the PID file descriptor is placed at the location pointed to by
+.IR cl_args.pidfd .
+.IP \[bu]
+When using
+.BR clone (),
+the PID file descriptor is placed at the location pointed to by
+.IR parent_tid .
+Since the
+.I parent_tid
+argument is used to return the PID file descriptor,
+.B CLONE_PIDFD
+cannot be used with
+.B CLONE_PARENT_SETTID
+when calling
+.BR clone ().
+.RE
+.IP
+It is currently not possible to use this flag together with
+.BR CLONE_THREAD .
+This means that the process identified by the PID file descriptor
+will always be a thread group leader.
+.IP
+If the obsolete
+.B CLONE_DETACHED
+flag is specified alongside
+.B CLONE_PIDFD
+when calling
+.BR clone (),
+an error is returned.
+An error also results if
+.B CLONE_DETACHED
+is specified when calling
+.BR clone3 ().
+This error behavior ensures that the bit corresponding to
+.B CLONE_DETACHED
+can be reused for further PID file descriptor features in the future.
+.TP
+.BR CLONE_PTRACE " (since Linux 2.2)"
+If
+.B CLONE_PTRACE
+is specified, and the calling process is being traced,
+then trace the child also (see
+.BR ptrace (2)).
+.TP
+.BR CLONE_SETTLS " (since Linux 2.5.32)"
+The TLS (Thread Local Storage) descriptor is set to
+.IR tls .
+.IP
+The interpretation of
+.I tls
+and the resulting effect is architecture dependent.
+On x86,
+.I tls
+is interpreted as a
+.I struct user_desc\~*
+(see
+.BR set_thread_area (2)).
+On x86-64 it is the new value to be set for the %fs base register
+(see the
+.B ARCH_SET_FS
+argument to
+.BR arch_prctl (2)).
+On architectures with a dedicated TLS register, it is the new value
+of that register.
+.IP
+Use of this flag requires detailed knowledge and generally it
+should not be used except in libraries implementing threading.
+.TP
+.BR CLONE_SIGHAND " (since Linux 2.0)"
+If
+.B CLONE_SIGHAND
+is set, the calling process and the child process share the same table of
+signal handlers.
+If the calling process or child process calls
+.BR sigaction (2)
+to change the behavior associated with a signal, the behavior is
+changed in the other process as well.
+However, the calling process and child
+processes still have distinct signal masks and sets of pending
+signals.
+So, one of them may block or unblock signals using
+.BR sigprocmask (2)
+without affecting the other process.
+.IP
+If
+.B CLONE_SIGHAND
+is not set, the child process inherits a copy of the signal handlers
+of the calling process at the time of the clone call.
+Calls to
+.BR sigaction (2)
+performed later by one of the processes have no effect on the other
+process.
+.IP
+Since Linux 2.6.0,
+.\" Precisely: Linux 2.6.0-test6
+the
+.I flags
+mask must also include
+.B CLONE_VM
+if
+.B CLONE_SIGHAND
+is specified.
+.TP
+.BR CLONE_STOPPED " (since Linux 2.6.0)"
+.\" Precisely: Linux 2.6.0-test2
+If
+.B CLONE_STOPPED
+is set, then the child is initially stopped (as though it was sent a
+.B SIGSTOP
+signal), and must be resumed by sending it a
+.B SIGCONT
+signal.
+.IP
+This flag was
+.I deprecated
+from Linux 2.6.25 onward,
+and was
+.I removed
+altogether in Linux 2.6.38.
+Since then, the kernel silently ignores it without error.
+.\" glibc 2.8 removed this defn from bits/sched.h
+Starting with Linux 4.6, the same bit was reused for the
+.B CLONE_NEWCGROUP
+flag.
+.TP
+.BR CLONE_SYSVSEM " (since Linux 2.5.10)"
+If
+.B CLONE_SYSVSEM
+is set, then the child and the calling process share
+a single list of System V semaphore adjustment
+.RI ( semadj )
+values (see
+.BR semop (2)).
+In this case, the shared list accumulates
+.I semadj
+values across all processes sharing the list,
+and semaphore adjustments are performed only when the last process
+that is sharing the list terminates (or ceases sharing the list using
+.BR unshare (2)).
+If this flag is not set, then the child has a separate
+.I semadj
+list that is initially empty.
+.TP
+.BR CLONE_THREAD " (since Linux 2.4.0)"
+.\" Precisely: Linux 2.6.0-test8
+If
+.B CLONE_THREAD
+is set, the child is placed in the same thread group as the calling process.
+To make the remainder of the discussion of
+.B CLONE_THREAD
+more readable, the term "thread" is used to refer to the
+processes within a thread group.
+.IP
+Thread groups were a feature added in Linux 2.4 to support the
+POSIX threads notion of a set of threads that share a single PID.
+Internally, this shared PID is the so-called
+thread group identifier (TGID) for the thread group.
+Since Linux 2.4, calls to
+.BR getpid (2)
+return the TGID of the caller.
+.IP
+The threads within a group can be distinguished by their (system-wide)
+unique thread IDs (TID).
+A new thread's TID is available as the function result
+returned to the caller,
+and a thread can obtain
+its own TID using
+.BR gettid (2).
+.IP
+When a clone call is made without specifying
+.BR CLONE_THREAD ,
+then the resulting thread is placed in a new thread group
+whose TGID is the same as the thread's TID.
+This thread is the
+.I leader
+of the new thread group.
+.IP
+A new thread created with
+.B CLONE_THREAD
+has the same parent process as the process that made the clone call
+(i.e., like
+.BR CLONE_PARENT ),
+so that calls to
+.BR getppid (2)
+return the same value for all of the threads in a thread group.
+When a
+.B CLONE_THREAD
+thread terminates, the thread that created it is not sent a
+.B SIGCHLD
+(or other termination) signal;
+nor can the status of such a thread be obtained
+using
+.BR wait (2).
+(The thread is said to be
+.IR detached .)
+.IP
+After all of the threads in a thread group terminate,
+the parent process of the thread group is sent a
+.B SIGCHLD
+(or other termination) signal.
+.IP
+If any of the threads in a thread group performs an
+.BR execve (2),
+then all threads other than the thread group leader are terminated,
+and the new program is executed in the thread group leader.
+.IP
+If one of the threads in a thread group creates a child using
+.BR fork (2),
+then any thread in the group can
+.BR wait (2)
+for that child.
+.IP
+Since Linux 2.5.35, the
+.I flags
+mask must also include
+.B CLONE_SIGHAND
+if
+.B CLONE_THREAD
+is specified
+(and note that, since Linux 2.6.0,
+.\" Precisely: Linux 2.6.0-test6
+.B CLONE_SIGHAND
+also requires
+.B CLONE_VM
+to be included).
+.IP
+Signal dispositions and actions are process-wide:
+if an unhandled signal is delivered to a thread, then
+it will affect (terminate, stop, continue, be ignored in)
+all members of the thread group.
+.IP
+Each thread has its own signal mask, as set by
+.BR sigprocmask (2).
+.IP
+A signal may be process-directed or thread-directed.
+A process-directed signal is targeted at a thread group (i.e., a TGID),
+and is delivered to an arbitrarily selected thread from among those
+that are not blocking the signal.
+A signal may be process-directed because it was generated by the kernel
+for reasons other than a hardware exception, or because it was sent using
+.BR kill (2)
+or
+.BR sigqueue (3).
+A thread-directed signal is targeted at (i.e., delivered to)
+a specific thread.
+A signal may be thread-directed because it was sent using
+.BR tgkill (2)
+or
+.BR pthread_sigqueue (3),
+or because the thread executed a machine language instruction that triggered
+a hardware exception
+(e.g., invalid memory access triggering
+.B SIGSEGV
+or a floating-point exception triggering
+.BR SIGFPE ).
+.IP
+A call to
+.BR sigpending (2)
+returns a signal set that is the union of the pending process-directed
+signals and the signals that are pending for the calling thread.
+.IP
+If a process-directed signal is delivered to a thread group,
+and the thread group has installed a handler for the signal, then
+the handler is invoked in exactly one, arbitrarily selected
+member of the thread group that has not blocked the signal.
+If multiple threads in a group are waiting to accept the same signal using
+.BR sigwaitinfo (2),
+the kernel will arbitrarily select one of these threads
+to receive the signal.
+.TP
+.BR CLONE_UNTRACED " (since Linux 2.5.46)"
+If
+.B CLONE_UNTRACED
+is specified, then a tracing process cannot force
+.B CLONE_PTRACE
+on this child process.
+.TP
+.BR CLONE_VFORK " (since Linux 2.2)"
+If
+.B CLONE_VFORK
+is set, the execution of the calling process is suspended
+until the child releases its virtual memory
+resources via a call to
+.BR execve (2)
+or
+.BR _exit (2)
+(as with
+.BR vfork (2)).
+.IP
+If
+.B CLONE_VFORK
+is not set, then both the calling process and the child are schedulable
+after the call, and an application should not rely on execution occurring
+in any particular order.
+.TP
+.BR CLONE_VM " (since Linux 2.0)"
+If
+.B CLONE_VM
+is set, the calling process and the child process run in the same memory
+space.
+In particular, memory writes performed by the calling process
+or by the child process are also visible in the other process.
+Moreover, any memory mapping or unmapping performed with
+.BR mmap (2)
+or
+.BR munmap (2)
+by the child or calling process also affects the other process.
+.IP
+If
+.B CLONE_VM
+is not set, the child process runs in a separate copy of the memory
+space of the calling process at the time of the clone call.
+Memory writes or file mappings/unmappings performed by one of the
+processes do not affect the other, as with
+.BR fork (2).
+.IP
+If the
+.B CLONE_VM
+flag is specified and the
+.B CLONE_VFORK
+flag is not specified,
+then any alternate signal stack that was established by
+.BR sigaltstack (2)
+is cleared in the child process.
+.SH RETURN VALUE
+.\" gettid(2) returns current->pid;
+.\" getpid(2) returns current->tgid;
+On success, the thread ID of the child process is returned
+in the caller's thread of execution.
+On failure, \-1 is returned
+in the caller's context, no child process is created, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.BR EACCES " (" clone3 "() only)"
+.B CLONE_INTO_CGROUP
+was specified in
+.IR cl_args.flags ,
+but the restrictions (described in
+.BR cgroups (7))
+on placing the child process into the version 2 cgroup referred to by
+.I cl_args.cgroup
+are not met.
+.TP
+.B EAGAIN
+Too many processes are already running; see
+.BR fork (2).
+.TP
+.BR EBUSY " (" clone3 "() only)"
+.B CLONE_INTO_CGROUP
+was specified in
+.IR cl_args.flags ,
+but the file descriptor specified in
+.I cl_args.cgroup
+refers to a version 2 cgroup in which a domain controller is enabled.
+.TP
+.BR EEXIST " (" clone3 "() only)"
+One (or more) of the PIDs specified in
+.I set_tid
+already exists in the corresponding PID namespace.
+.TP
+.B EINVAL
+Both
+.B CLONE_SIGHAND
+and
+.B CLONE_CLEAR_SIGHAND
+were specified in the
+.I flags
+mask.
+.TP
+.B EINVAL
+.B CLONE_SIGHAND
+was specified in the
+.I flags
+mask, but
+.B CLONE_VM
+was not.
+(Since Linux 2.6.0.)
+.\" Precisely: Linux 2.6.0-test6
+.TP
+.B EINVAL
+.B CLONE_THREAD
+was specified in the
+.I flags
+mask, but
+.B CLONE_SIGHAND
+was not.
+(Since Linux 2.5.35.)
+.\" .TP
+.\" .B EINVAL
+.\" Precisely one of
+.\" .B CLONE_DETACHED
+.\" and
+.\" .B CLONE_THREAD
+.\" was specified.
+.\" (Since Linux 2.6.0-test6.)
+.TP
+.B EINVAL
+.B CLONE_THREAD
+was specified in the
+.I flags
+mask, but the current process previously called
+.BR unshare (2)
+with the
+.B CLONE_NEWPID
+flag or used
+.BR setns (2)
+to reassociate itself with a PID namespace.
+.TP
+.B EINVAL
+.\" commit e66eded8309ebf679d3d3c1f5820d1f2ca332c71
+Both
+.B CLONE_FS
+and
+.B CLONE_NEWNS
+were specified in the
+.I flags
+mask.
+.TP
+.BR EINVAL " (since Linux 3.9)"
+Both
+.B CLONE_NEWUSER
+and
+.B CLONE_FS
+were specified in the
+.I flags
+mask.
+.TP
+.B EINVAL
+Both
+.B CLONE_NEWIPC
+and
+.B CLONE_SYSVSEM
+were specified in the
+.I flags
+mask.
+.TP
+.B EINVAL
+One (or both) of
+.B CLONE_NEWPID
+or
+.B CLONE_NEWUSER
+and one (or both) of
+.B CLONE_THREAD
+or
+.B CLONE_PARENT
+were specified in the
+.I flags
+mask.
+.TP
+.BR EINVAL " (since Linux 2.6.32)"
+.\" commit 123be07b0b399670a7cc3d82fef0cb4f93ef885c
+.B CLONE_PARENT
+was specified, and the caller is an init process.
+.TP
+.B EINVAL
+Returned by the glibc
+.BR clone ()
+wrapper function when
+.I fn
+or
+.I stack
+is specified as NULL.
+.TP
+.B EINVAL
+.B CLONE_NEWIPC
+was specified in the
+.I flags
+mask,
+but the kernel was not configured with the
+.B CONFIG_SYSVIPC
+and
+.B CONFIG_IPC_NS
+options.
+.TP
+.B EINVAL
+.B CLONE_NEWNET
+was specified in the
+.I flags
+mask,
+but the kernel was not configured with the
+.B CONFIG_NET_NS
+option.
+.TP
+.B EINVAL
+.B CLONE_NEWPID
+was specified in the
+.I flags
+mask,
+but the kernel was not configured with the
+.B CONFIG_PID_NS
+option.
+.TP
+.B EINVAL
+.B CLONE_NEWUSER
+was specified in the
+.I flags
+mask,
+but the kernel was not configured with the
+.B CONFIG_USER_NS
+option.
+.TP
+.B EINVAL
+.B CLONE_NEWUTS
+was specified in the
+.I flags
+mask,
+but the kernel was not configured with the
+.B CONFIG_UTS_NS
+option.
+.TP
+.B EINVAL
+.I stack
+is not aligned to a suitable boundary for this architecture.
+For example, on aarch64,
+.I stack
+must be a multiple of 16.
+.TP
+.BR EINVAL " (" clone3 "() only)"
+.B CLONE_DETACHED
+was specified in the
+.I flags
+mask.
+.TP
+.BR EINVAL " (" clone "() only)"
+.B CLONE_PIDFD
+was specified together with
+.B CLONE_DETACHED
+in the
+.I flags
+mask.
+.TP
+.B EINVAL
+.B CLONE_PIDFD
+was specified together with
+.B CLONE_THREAD
+in the
+.I flags
+mask.
+.TP
+.BR EINVAL " (" clone "() only)"
+.B CLONE_PIDFD
+was specified together with
+.B CLONE_PARENT_SETTID
+in the
+.I flags
+mask.
+.TP
+.BR EINVAL " (" clone3 "() only)"
+.I set_tid_size
+is greater than the number of nested PID namespaces.
+.TP
+.BR EINVAL " (" clone3 "() only)"
+One of the PIDs specified in
+.I set_tid
+was invalid.
+.TP
+.BR EINVAL " (" clone3 "() only)"
+.\" commit 7f192e3cd316ba58c88dfa26796cf77789dd9872
+.B CLONE_THREAD
+or
+.B CLONE_PARENT
+was specified in the
+.I flags
+mask, but a signal was specified in
+.IR exit_signal .
+.TP
+.BR EINVAL " (AArch64 only, Linux 4.6 and earlier)"
+.I stack
+was not aligned to a 128-bit boundary.
+.TP
+.B ENOMEM
+Cannot allocate sufficient memory to allocate a task structure for the
+child, or to copy those parts of the caller's context that need to be
+copied.
+.TP
+.BR ENOSPC " (since Linux 3.7)"
+.\" commit f2302505775fd13ba93f034206f1e2a587017929
+.B CLONE_NEWPID
+was specified in the
+.I flags
+mask,
+but the limit on the nesting depth of PID namespaces
+would have been exceeded; see
+.BR pid_namespaces (7).
+.TP
+.BR ENOSPC " (since Linux 4.9; beforehand " EUSERS )
+.B CLONE_NEWUSER
+was specified in the
+.I flags
+mask, and the call would cause the limit on the number of
+nested user namespaces to be exceeded.
+See
+.BR user_namespaces (7).
+.IP
+From Linux 3.11 to Linux 4.8, the error diagnosed in this case was
+.BR EUSERS .
+.TP
+.BR ENOSPC " (since Linux 4.9)"
+One of the values in the
+.I flags
+mask specified the creation of a new user namespace,
+but doing so would have caused the limit defined by the corresponding file in
+.I /proc/sys/user
+to be exceeded.
+For further details, see
+.BR namespaces (7).
+.TP
+.BR EOPNOTSUPP " (" clone3 "() only)"
+.B CLONE_INTO_CGROUP
+was specified in
+.IR cl_args.flags ,
+but the file descriptor specified in
+.I cl_args.cgroup
+refers to a version 2 cgroup that is in the
+.I domain invalid
+state.
+.TP
+.B EPERM
+.BR CLONE_NEWCGROUP ,
+.BR CLONE_NEWIPC ,
+.BR CLONE_NEWNET ,
+.BR CLONE_NEWNS ,
+.BR CLONE_NEWPID ,
+or
+.B CLONE_NEWUTS
+was specified by an unprivileged process (process without \fBCAP_SYS_ADMIN\fP).
+.TP
+.B EPERM
+.B CLONE_PID
+was specified by a process other than process 0.
+(This error occurs only on Linux 2.5.15 and earlier.)
+.TP
+.B EPERM
+.B CLONE_NEWUSER
+was specified in the
+.I flags
+mask,
+but either the effective user ID or the effective group ID of the caller
+does not have a mapping in the parent namespace (see
+.BR user_namespaces (7)).
+.TP
+.BR EPERM " (since Linux 3.9)"
+.\" commit 3151527ee007b73a0ebd296010f1c0454a919c7d
+.B CLONE_NEWUSER
+was specified in the
+.I flags
+mask and the caller is in a chroot environment
+.\" FIXME What is the rationale for this restriction?
+(i.e., the caller's root directory does not match the root directory
+of the mount namespace in which it resides).
+.TP
+.BR EPERM " (" clone3 "() only)"
+.I set_tid_size
+was greater than zero, and the caller lacks the
+.B CAP_SYS_ADMIN
+capability in one or more of the user namespaces that own the
+corresponding PID namespaces.
+.TP
+.BR ERESTARTNOINTR " (since Linux 2.6.17)"
+.\" commit 4a2c7a7837da1b91468e50426066d988050e4d56
+System call was interrupted by a signal and will be restarted.
+(This can be seen only during a trace.)
+.TP
+.BR EUSERS " (Linux 3.11 to Linux 4.8)"
+.B CLONE_NEWUSER
+was specified in the
+.I flags
+mask,
+and the limit on the number of nested user namespaces would be exceeded.
+See the discussion of the
+.B ENOSPC
+error above.
+.SH VERSIONS
+The glibc
+.BR clone ()
+wrapper function makes some changes
+in the memory pointed to by
+.I stack
+(changes required to set the stack up correctly for the child)
+.I before
+invoking the
+.BR clone ()
+system call.
+So, in cases where
+.BR clone ()
+is used to recursively create children,
+do not use the buffer employed for the parent's stack
+as the stack of the child.
+.PP
+On i386,
+.BR clone ()
+should not be called through vsyscall, but directly through
+.IR "int $0x80" .
+.SS C library/kernel differences
+The raw
+.BR clone ()
+system call corresponds more closely to
+.BR fork (2)
+in that execution in the child continues from the point of the
+call.
+As such, the
+.I fn
+and
+.I arg
+arguments of the
+.BR clone ()
+wrapper function are omitted.
+.PP
+In contrast to the glibc wrapper, the raw
+.BR clone ()
+system call accepts NULL as a
+.I stack
+argument (and
+.BR clone3 ()
+likewise allows
+.I cl_args.stack
+to be NULL).
+In this case, the child uses a duplicate of the parent's stack.
+(Copy-on-write semantics ensure that the child gets separate copies
+of stack pages when either process modifies the stack.)
+In this case, for correct operation, the
+.B CLONE_VM
+option should not be specified.
+(If the child
+.I shares
+the parent's memory because of the use of the
+.B CLONE_VM
+flag,
+then no copy-on-write duplication occurs and chaos is likely to result.)
+.PP
+The order of the arguments also differs in the raw system call,
+and there are variations in the arguments across architectures,
+as detailed in the following paragraphs.
+.PP
+The raw system call interface on x86-64 and some other architectures
+(including sh, tile, and alpha) is:
+.PP
+.in +4n
+.EX
+.BI "long clone(unsigned long " flags ", void *" stack ,
+.BI " int *" parent_tid ", int *" child_tid ,
+.BI " unsigned long " tls );
+.EE
+.in
+.PP
+On x86-32, and several other common architectures
+(including score, ARM, ARM 64, PA-RISC, arc, Power PC, xtensa,
+and MIPS),
+.\" CONFIG_CLONE_BACKWARDS
+the order of the last two arguments is reversed:
+.PP
+.in +4n
+.EX
+.BI "long clone(unsigned long " flags ", void *" stack ,
+.BI " int *" parent_tid ", unsigned long " tls ,
+.BI " int *" child_tid );
+.EE
+.in
+.PP
+On the cris and s390 architectures,
+.\" CONFIG_CLONE_BACKWARDS2
+the order of the first two arguments is reversed:
+.PP
+.in +4n
+.EX
+.BI "long clone(void *" stack ", unsigned long " flags ,
+.BI " int *" parent_tid ", int *" child_tid ,
+.BI " unsigned long " tls );
+.EE
+.in
+.PP
+On the microblaze architecture,
+.\" CONFIG_CLONE_BACKWARDS3
+an additional argument is supplied:
+.PP
+.in +4n
+.EX
+.BI "long clone(unsigned long " flags ", void *" stack ,
+.BI " int " stack_size , "\fR /* Size of stack */"
+.BI " int *" parent_tid ", int *" child_tid ,
+.BI " unsigned long " tls );
+.EE
+.in
+.\"
+.SS blackfin, m68k, and sparc
+.\" Mike Frysinger noted in a 2013 mail:
+.\" these arches don't define __ARCH_WANT_SYS_CLONE:
+.\" blackfin ia64 m68k sparc
+The argument-passing conventions on
+blackfin, m68k, and sparc are different from the descriptions above.
+For details, see the kernel (and glibc) source.
+.SS ia64
+On ia64, a different interface is used:
+.PP
+.in +4n
+.EX
+.BI "int __clone2(int (*" "fn" ")(void *),"
+.BI " void *" stack_base ", size_t " stack_size ,
+.BI " int " flags ", void *" "arg" ", ..."
+.BI " /* pid_t *" parent_tid ", struct user_desc *" tls ,
+.BI " pid_t *" child_tid " */ );"
+.EE
+.in
+.PP
+The prototype shown above is for the glibc wrapper function;
+for the system call itself,
+the prototype can be described as follows (it is identical to the
+.BR clone ()
+prototype on microblaze):
+.PP
+.in +4n
+.EX
+.BI "long clone2(unsigned long " flags ", void *" stack_base ,
+.BI " int " stack_size , "\fR /* Size of stack */"
+.BI " int *" parent_tid ", int *" child_tid ,
+.BI " unsigned long " tls );
+.EE
+.in
+.PP
+.BR __clone2 ()
+operates in the same way as
+.BR clone (),
+except that
+.I stack_base
+points to the lowest address of the child's stack area,
+and
+.I stack_size
+specifies the size of the stack pointed to by
+.IR stack_base .
+.SH STANDARDS
+Linux.
+.SH HISTORY
+.TP
+.BR clone3 ()
+Linux 5.3.
+.\" There is no entry for
+.\" .BR clone ()
+.\" in libc5.
+.\" glibc2 provides
+.\" .BR clone ()
+.\" as described in this manual page.
+.SS Linux 2.4 and earlier
+In the Linux 2.4.x series,
+.B CLONE_THREAD
+generally does not make the parent of the new thread the same
+as the parent of the calling process.
+However, from Linux 2.4.7 to Linux 2.4.18 the
+.B CLONE_THREAD
+flag implied the
+.B CLONE_PARENT
+flag (as in Linux 2.6.0 and later).
+.PP
+In Linux 2.4 and earlier,
+.BR clone ()
+does not take arguments
+.IR parent_tid ,
+.IR tls ,
+and
+.IR child_tid .
+.SH NOTES
+One use of these system calls
+is to implement threads: multiple flows of control in a program that
+run concurrently in a shared address space.
+.PP
+The
+.BR kcmp (2)
+system call can be used to test whether two processes share various
+resources such as a file descriptor table,
+System V semaphore undo operations, or a virtual address space.
+.PP
+Handlers registered using
+.BR pthread_atfork (3)
+are not executed during a clone call.
+.SH BUGS
+GNU C library versions 2.3.4 up to and including 2.24
+contained a wrapper function for
+.BR getpid (2)
+that performed caching of PIDs.
+This caching relied on support in the glibc wrapper for
+.BR clone (),
+but limitations in the implementation
+meant that the cache was not up to date in some circumstances.
+In particular,
+if a signal was delivered to the child immediately after the
+.BR clone ()
+call, then a call to
+.BR getpid (2)
+in a handler for the signal could return the PID
+of the calling process ("the parent"),
+if the clone wrapper had not yet had a chance to update the PID
+cache in the child.
+(This discussion ignores the case where the child was created using
+.BR CLONE_THREAD ,
+when
+.BR getpid (2)
+.I should
+return the same value in the child and in the process that called
+.BR clone (),
+since the caller and the child are in the same thread group.
+The stale-cache problem also does not occur if the
+.I flags
+argument includes
+.BR CLONE_VM .)
+To get the truth, it was sometimes necessary to use code such as the following:
+.PP
+.in +4n
+.EX
+#include <syscall.h>
+\&
+pid_t mypid;
+\&
+mypid = syscall(SYS_getpid);
+.EE
+.in
+.\" See also the following bug reports
+.\" https://bugzilla.redhat.com/show_bug.cgi?id=417521
+.\" http://sourceware.org/bugzilla/show_bug.cgi?id=6910
+.PP
+Because of the stale-cache problem, as well as other problems noted in
+.BR getpid (2),
+the PID caching feature was removed in glibc 2.25.
+.SH EXAMPLES
+The following program demonstrates the use of
+.BR clone ()
+to create a child process that executes in a separate UTS namespace.
+The child changes the hostname in its UTS namespace.
+Both parent and child then display the system hostname,
+making it possible to see that the hostname
+differs in the UTS namespaces of the parent and child.
+For an example of the use of this program, see
+.BR setns (2).
+.PP
+Within the sample program, we allocate the memory that is to
+be used for the child's stack using
+.BR mmap (2)
+rather than
+.BR malloc (3)
+for the following reasons:
+.IP \[bu] 3
+.BR mmap (2)
+allocates a block of memory that starts on a page
+boundary and is a multiple of the page size.
+This is useful if we want to establish a guard page (a page with protection
+.BR PROT_NONE )
+at the end of the stack using
+.BR mprotect (2).
+.IP \[bu]
+We can specify the
+.B MAP_STACK
+flag to request a mapping that is suitable for a stack.
+For the moment, this flag is a no-op on Linux,
+but it exists and has effect on some other systems,
+so we should include it for portability.
+.SS Program source
+.\" SRC BEGIN (clone.c)
+.EX
+#define _GNU_SOURCE
+#include <err.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/utsname.h>
+#include <sys/wait.h>
+#include <unistd.h>
+\&
+static int /* Start function for cloned child */
+childFunc(void *arg)
+{
+ struct utsname uts;
+\&
+ /* Change hostname in UTS namespace of child. */
+\&
+ if (sethostname(arg, strlen(arg)) == \-1)
+ err(EXIT_FAILURE, "sethostname");
+\&
+ /* Retrieve and display hostname. */
+\&
+ if (uname(&uts) == \-1)
+ err(EXIT_FAILURE, "uname");
+ printf("uts.nodename in child: %s\en", uts.nodename);
+\&
+ /* Keep the namespace open for a while, by sleeping.
+ This allows some experimentation\-\-for example, another
+ process might join the namespace. */
+\&
+ sleep(200);
+\&
+ return 0; /* Child terminates now */
+}
+\&
+#define STACK_SIZE (1024 * 1024) /* Stack size for cloned child */
+\&
+int
+main(int argc, char *argv[])
+{
+ char *stack; /* Start of stack buffer */
+ char *stackTop; /* End of stack buffer */
+ pid_t pid;
+ struct utsname uts;
+\&
+ if (argc < 2) {
+ fprintf(stderr, "Usage: %s <child\-hostname>\en", argv[0]);
+ exit(EXIT_SUCCESS);
+ }
+\&
+ /* Allocate memory to be used for the stack of the child. */
+\&
+ stack = mmap(NULL, STACK_SIZE, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, \-1, 0);
+ if (stack == MAP_FAILED)
+ err(EXIT_FAILURE, "mmap");
+\&
+ stackTop = stack + STACK_SIZE; /* Assume stack grows downward */
+\&
+ /* Create child that has its own UTS namespace;
+ child commences execution in childFunc(). */
+\&
+ pid = clone(childFunc, stackTop, CLONE_NEWUTS | SIGCHLD, argv[1]);
+ if (pid == \-1)
+ err(EXIT_FAILURE, "clone");
+ printf("clone() returned %jd\en", (intmax_t) pid);
+\&
+ /* Parent falls through to here */
+\&
+ sleep(1); /* Give child time to change its hostname */
+\&
+ /* Display hostname in parent\[aq]s UTS namespace. This will be
+ different from hostname in child\[aq]s UTS namespace. */
+\&
+ if (uname(&uts) == \-1)
+ err(EXIT_FAILURE, "uname");
+ printf("uts.nodename in parent: %s\en", uts.nodename);
+\&
+ if (waitpid(pid, NULL, 0) == \-1) /* Wait for child */
+ err(EXIT_FAILURE, "waitpid");
+ printf("child has terminated\en");
+\&
+ exit(EXIT_SUCCESS);
+}
+.EE
+.\" SRC END
+.SH SEE ALSO
+.BR fork (2),
+.BR futex (2),
+.BR getpid (2),
+.BR gettid (2),
+.BR kcmp (2),
+.BR mmap (2),
+.BR pidfd_open (2),
+.BR set_thread_area (2),
+.BR set_tid_address (2),
+.BR setns (2),
+.BR tkill (2),
+.BR unshare (2),
+.BR wait (2),
+.BR capabilities (7),
+.BR namespaces (7),
+.BR pthreads (7)
diff --git a/man2/clone2.2 b/man2/clone2.2
new file mode 100644
index 0000000..68f41a5
--- /dev/null
+++ b/man2/clone2.2
@@ -0,0 +1 @@
+.so man2/clone.2
diff --git a/man2/clone3.2 b/man2/clone3.2
new file mode 100644
index 0000000..68f41a5
--- /dev/null
+++ b/man2/clone3.2
@@ -0,0 +1 @@
+.so man2/clone.2
diff --git a/man2/close.2 b/man2/close.2
new file mode 100644
index 0000000..239979b
--- /dev/null
+++ b/man2/close.2
@@ -0,0 +1,266 @@
+.\" This manpage is Copyright (C) 1992 Drew Eckhardt;
+.\" and Copyright (C) 1993 Michael Haardt, Ian Jackson.
+.\" and Copyright (C) 2016 Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified Wed Jul 21 22:40:25 1993 by Rik Faith <faith@cs.unc.edu>
+.\" Modified Sat Feb 18 15:27:48 1995 by Michael Haardt
+.\" Modified Sun Apr 14 11:40:50 1996 by Andries Brouwer <aeb@cwi.nl>:
+.\" corrected description of effect on locks (thanks to
+.\" Tigran Aivazian <tigran@sco.com>).
+.\" Modified Fri Jan 31 16:21:46 1997 by Eric S. Raymond <esr@thyrsus.com>
+.\" Modified 2000-07-22 by Nicolás Lichtmaier <nick@debian.org>
+.\" added note about close(2) not guaranteeing that data is safe on close.
+.\"
+.TH close 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+close \- close a file descriptor
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <unistd.h>
+.PP
+.BI "int close(int " fd );
+.fi
+.SH DESCRIPTION
+.BR close ()
+closes a file descriptor, so that it no longer refers to any file and
+may be reused.
+Any record locks (see
+.BR fcntl (2))
+held on the file it was associated with,
+and owned by the process, are removed (regardless of the file
+descriptor that was used to obtain the lock).
+.PP
+If
+.I fd
+is the last file descriptor referring to the underlying
+open file description (see
+.BR open (2)),
+the resources associated with the open file description are freed;
+if the file descriptor was the last reference to a file which has been
+removed using
+.BR unlink (2),
+the file is deleted.
+.SH RETURN VALUE
+.BR close ()
+returns zero on success.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EBADF
+.I fd
+isn't a valid open file descriptor.
+.TP
+.B EINTR
+.\" Though, it's in doubt whether this error can ever occur; see
+.\" https://lwn.net/Articles/576478/ "Returning EINTR from close()"
+The
+.BR close ()
+call was interrupted by a signal; see
+.BR signal (7).
+.TP
+.B EIO
+An I/O error occurred.
+.TP
+.BR ENOSPC ", " EDQUOT
+On NFS, these errors are not normally reported against the first write
+which exceeds the available storage space, but instead against a
+subsequent
+.BR write (2),
+.BR fsync (2),
+or
+.BR close ().
+.PP
+See NOTES for a discussion of why
+.BR close ()
+should not be retried after an error.
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+POSIX.1-2001, SVr4, 4.3BSD.
+.\" SVr4 documents an additional ENOLINK error condition.
+.SH NOTES
+A successful close does not guarantee that the data has been successfully
+saved to disk, as the kernel uses the buffer cache to defer writes.
+Typically, filesystems do not flush buffers when a file is closed.
+If you need to be sure that
+the data is physically stored on the underlying disk, use
+.BR fsync (2).
+(It will depend on the disk hardware at this point.)
+.PP
+The close-on-exec file descriptor flag can be used to ensure
+that a file descriptor is automatically closed upon a successful
+.BR execve (2);
+see
+.BR fcntl (2)
+for details.
+.\"
+.SS Multithreaded processes and close()
+It is probably unwise to close file descriptors while
+they may be in use by system calls in
+other threads in the same process.
+Since a file descriptor may be reused,
+there are some obscure race conditions
+that may cause unintended side effects.
+.\" Date: Tue, 4 Sep 2007 13:57:35 +0200
+.\" From: Fredrik Noring <noring@nocrew.org>
+.\" One such race involves signals and ERESTARTSYS. If a file descriptor
+.\" in use by a system call is closed and then reused by e.g. an
+.\" independent open() in some unrelated thread, before the original system
+.\" call has restarted after ERESTARTSYS, the original system call will
+.\" later restart with the reused file descriptor. This is most likely a
+.\" serious programming error.
+.PP
+Furthermore, consider the following scenario where two threads are
+performing operations on the same file descriptor:
+.IP (1) 5
+One thread is blocked in an I/O system call on the file descriptor.
+For example, it is trying to
+.BR write (2)
+to a pipe that is already full, or trying to
+.BR read (2)
+from a stream socket which currently has no available data.
+.IP (2)
+Another thread closes the file descriptor.
+.PP
+The behavior in this situation varies across systems.
+On some systems, when the file descriptor is closed,
+the blocking system call returns immediately with an error.
+.PP
+On Linux (and possibly some other systems), the behavior is different:
+the blocking I/O system call holds a reference to the underlying
+open file description, and this reference keeps the description open
+until the I/O system call completes.
+.\" 'struct file' in kernel-speak
+(See
+.BR open (2)
+for a discussion of open file descriptions.)
+Thus, the blocking system call in the first thread may successfully
+complete after the
+.BR close ()
+in the second thread.
+.\"
+.SS Dealing with error returns from close()
+A careful programmer will check the return value of
+.BR close (),
+since it is quite possible that errors on a previous
+.BR write (2)
+operation are reported only on the final
+.BR close ()
+that releases the open file description.
+Failing to check the return value when closing a file may lead to
+.I silent
+loss of data.
+This can especially be observed with NFS and with disk quota.
+.PP
+Note, however, that a failure return should be used only for
+diagnostic purposes (i.e., a warning to the application that there
+may still be I/O pending or there may have been failed I/O)
+or remedial purposes
+(e.g., writing the file once more or creating a backup).
+.PP
+Retrying the
+.BR close ()
+after a failure return is the wrong thing to do,
+.\" The file descriptor is released early in close();
+.\" close() ==> __close_fd():
+.\" __put_unused_fd() ==> __clear_open_fd()
+.\" return filp_close(file, files);
+.\"
+.\" The errors are returned by filp_close() after the FD has been
+.\" cleared for re-use.
+since this may cause a reused file descriptor
+from another thread to be closed.
+This can occur because the Linux kernel
+.I always
+releases the file descriptor early in the close
+operation, freeing it for reuse;
+the steps that may return an error,
+.\" filp_close()
+such as flushing data to the filesystem or device,
+occur only later in the close operation.
+.PP
+Many other implementations similarly always close the file descriptor
+.\" FreeBSD documents this explicitly. From the look of the source code
+.\" SVR4, ancient SunOS, later Solaris, and AIX all do this.
+(except in the case of
+.BR EBADF ,
+meaning that the file descriptor was invalid)
+even if they subsequently report an error on return from
+.BR close ().
+POSIX.1 is currently silent on this point,
+but there are plans to mandate this behavior in the next major release
+.\" Issue 8
+of the standard.
+.PP
+A careful programmer who wants to know about I/O errors may precede
+.BR close ()
+with a call to
+.BR fsync (2).
+.PP
+The
+.B EINTR
+error is a somewhat special case.
+Regarding the
+.B EINTR
+error, POSIX.1-2008 says:
+.PP
+.RS
+If
+.BR close ()
+is interrupted by a signal that is to be caught, it shall return \-1 with
+.I errno
+set to
+.B EINTR
+and the state of
+.I fildes
+is unspecified.
+.RE
+.PP
+This permits the behavior that occurs on Linux and
+many other implementations, where,
+as with other errors that may be reported by
+.BR close (),
+the file descriptor is guaranteed to be closed.
+However, it also permits another possibility:
+that the implementation returns an
+.B EINTR
+error and keeps the file descriptor open.
+(According to its documentation, HP-UX's
+.BR close ()
+does this.)
+The caller must then once more use
+.BR close ()
+to close the file descriptor, to avoid file descriptor leaks.
+This divergence in implementation behaviors provides
+a difficult hurdle for portable applications, since on many implementations,
+.BR close ()
+must not be called again after an
+.B EINTR
+error, and on at least one,
+.BR close ()
+must be called again.
+There are plans to address this conundrum for
+the next major release of the POSIX.1 standard.
+.\" FIXME . for later review when Issue 8 is one day released...
+.\" POSIX proposes further changes for EINTR
+.\" http://austingroupbugs.net/tag_view_page.php?tag_id=8
+.\" http://austingroupbugs.net/view.php?id=529
+.\"
+.\" FIXME .
+.\" Review the following glibc bug later
+.\" https://sourceware.org/bugzilla/show_bug.cgi?id=14627
+.SH SEE ALSO
+.BR close_range (2),
+.BR fcntl (2),
+.BR fsync (2),
+.BR open (2),
+.BR shutdown (2),
+.BR unlink (2),
+.BR fclose (3)
diff --git a/man2/close_range.2 b/man2/close_range.2
new file mode 100644
index 0000000..c1aa3db
--- /dev/null
+++ b/man2/close_range.2
@@ -0,0 +1,273 @@
+.\" Copyright (c) 2020 Stephen Kitt <steve@sk2.org>
+.\" and Copyright (c) 2021 Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.TH close_range 2 2023-05-03 "Linux man-pages 6.05.01"
+.SH NAME
+close_range \- close all file descriptors in a given range
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <linux/close_range.h>
+.PP
+.BI "int close_range(unsigned int " first ", unsigned int " last ,
+.BI " unsigned int " flags );
+.fi
+.SH DESCRIPTION
+The
+.BR close_range ()
+system call closes all open file descriptors from
+.I first
+to
+.I last
+(included).
+.PP
+Errors closing a given file descriptor are currently ignored.
+.PP
+.I flags
+is a bit mask containing 0 or more of the following:
+.TP
+.BR CLOSE_RANGE_CLOEXEC " (since Linux 5.11)"
+Set the close-on-exec flag on the specified file descriptors,
+rather than immediately closing them.
+.TP
+.B CLOSE_RANGE_UNSHARE
+Unshare the specified file descriptors from any other processes
+before closing them,
+avoiding races with other threads sharing the file descriptor table.
+.SH RETURN VALUE
+On success,
+.BR close_range ()
+returns 0.
+On error, \-1 is returned and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EINVAL
+.I flags
+is not valid, or
+.I first
+is greater than
+.IR last .
+.PP
+The following can occur with
+.B CLOSE_RANGE_UNSHARE
+(when constructing the new descriptor table):
+.TP
+.B EMFILE
+The number of open file descriptors exceeds the limit specified in
+.I /proc/sys/fs/nr_open
+(see
+.BR proc (5)).
+This error can occur in situations where that limit was lowered before
+a call to
+.BR close_range ()
+where the
+.B CLOSE_RANGE_UNSHARE
+flag is specified.
+.TP
+.B ENOMEM
+Insufficient kernel memory was available.
+.SH STANDARDS
+None.
+.SH HISTORY
+FreeBSD.
+Linux 5.9,
+glibc 2.34.
+.SH NOTES
+.SS Closing all open file descriptors
+.\" 278a5fbaed89dacd04e9d052f4594ffd0e0585de
+To avoid blindly closing file descriptors
+in the range of possible file descriptors,
+this is sometimes implemented (on Linux)
+by listing open file descriptors in
+.I /proc/self/fd/
+and calling
+.BR close (2)
+on each one.
+.BR close_range ()
+can take care of this without requiring
+.I /proc
+and within a single system call,
+which provides significant performance benefits.
+.SS Closing file descriptors before exec
+.\" 60997c3d45d9a67daf01c56d805ae4fec37e0bd8
+File descriptors can be closed safely using
+.PP
+.in +4n
+.EX
+/* we don't want anything past stderr here */
+close_range(3, \[ti]0U, CLOSE_RANGE_UNSHARE);
+execve(....);
+.EE
+.in
+.PP
+.B CLOSE_RANGE_UNSHARE
+is conceptually equivalent to
+.PP
+.in +4n
+.EX
+unshare(CLONE_FILES);
+close_range(first, last, 0);
+.EE
+.in
+.PP
+but can be more efficient:
+if the unshared range extends past
+the current maximum number of file descriptors allocated
+in the caller's file descriptor table
+(the common case when
+.I last
+is \[ti]0U),
+the kernel will unshare a new file descriptor table for the caller up to
+.IR first ,
+copying as few file descriptors as possible.
+This avoids subsequent
+.BR close (2)
+calls entirely;
+the whole operation is complete once the table is unshared.
+.SS Closing files on \fBexec\fP
+.\" 582f1fb6b721facf04848d2ca57f34468da1813e
+This is particularly useful in cases where multiple
+.RB pre- exec
+setup steps risk conflicting with each other.
+For example, setting up a
+.BR seccomp (2)
+profile can conflict with a
+.BR close_range ()
+call:
+if the file descriptors are closed before the
+.BR seccomp (2)
+profile is set up,
+the profile setup can't use them itself,
+or control their closure;
+if the file descriptors are closed afterwards,
+the seccomp profile can't block the
+.BR close_range ()
+call or any fallbacks.
+Using
+.B CLOSE_RANGE_CLOEXEC
+avoids this:
+the descriptors can be marked before the
+.BR seccomp (2)
+profile is set up,
+and the profile can control access to
+.BR close_range ()
+without affecting the calling process.
+.SH EXAMPLES
+The program shown below opens the files named in its command-line arguments,
+displays the list of files that it has opened
+(by iterating through the entries in
+.IR /proc/PID/fd ),
+uses
+.BR close_range ()
+to close all file descriptors greater than or equal to 3,
+and then once more displays the process's list of open files.
+The following example demonstrates the use of the program:
+.PP
+.in +4n
+.EX
+$ \fBtouch /tmp/a /tmp/b /tmp/c\fP
+$ \fB./a.out /tmp/a /tmp/b /tmp/c\fP
+/tmp/a opened as FD 3
+/tmp/b opened as FD 4
+/tmp/c opened as FD 5
+/proc/self/fd/0 ==> /dev/pts/1
+/proc/self/fd/1 ==> /dev/pts/1
+/proc/self/fd/2 ==> /dev/pts/1
+/proc/self/fd/3 ==> /tmp/a
+/proc/self/fd/4 ==> /tmp/b
+/proc/self/fd/5 ==> /tmp/c
+/proc/self/fd/6 ==> /proc/9005/fd
+========= About to call close_range() =======
+/proc/self/fd/0 ==> /dev/pts/1
+/proc/self/fd/1 ==> /dev/pts/1
+/proc/self/fd/2 ==> /dev/pts/1
+/proc/self/fd/3 ==> /proc/9005/fd
+.EE
+.in
+.PP
+Note that the lines showing the pathname
+.I /proc/9005/fd
+result from the calls to
+.BR opendir (3).
+.SS Program source
+\&
+.\" SRC BEGIN (close_range.c)
+.EX
+#define _GNU_SOURCE
+#include <dirent.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+\&
+/* Show the contents of the symbolic links in /proc/self/fd.
+   A link that cannot be read is reported on stderr and skipped. */
+\&
+static void
+show_fds(void)
+{
+    DIR *dirp;
+    char path[PATH_MAX], target[PATH_MAX];
+    ssize_t len;
+    struct dirent *dp;
+\&
+    dirp = opendir("/proc/self/fd");
+    if (dirp == NULL) {
+        perror("opendir");
+        exit(EXIT_FAILURE);
+    }
+\&
+    for (;;) {
+        dp = readdir(dirp);
+        if (dp == NULL)
+            break;
+\&
+        if (dp\->d_type == DT_LNK) {
+            snprintf(path, sizeof(path), "/proc/self/fd/%s",
+                     dp\->d_name);
+\&
+            /* readlink() does not append a terminating null byte and
+               returns \-1 on error. A negative precision is treated
+               by printf() as if it were omitted, which would make it
+               read past the end of the unterminated buffer. */
+\&
+            len = readlink(path, target, sizeof(target));
+            if (len == \-1) {
+                perror(path);
+                continue;
+            }
+            printf("%s ==> %.*s\en", path, (int) len, target);
+        }
+    }
+\&
+    closedir(dirp);
+}
+\&
+int
+main(int argc, char *argv[])
+{
+    int fd;
+\&
+    /* Open every file named on the command line. The descriptors
+       are left open here; close_range() closes them below. */
+\&
+    for (size_t j = 1; j < argc; j++) {
+        fd = open(argv[j], O_RDONLY);
+        if (fd == \-1) {
+            perror(argv[j]);
+            exit(EXIT_FAILURE);
+        }
+        printf("%s opened as FD %d\en", argv[j], fd);
+    }
+\&
+    show_fds();
+\&
+    printf("========= About to call close_range() =======\en");
+\&
+    /* Close every file descriptor numbered 3 or higher */
+\&
+    if (syscall(SYS_close_range, 3, \[ti]0U, 0) == \-1) {
+        perror("close_range");
+        exit(EXIT_FAILURE);
+    }
+\&
+    show_fds();
+\&
+    /* Everything succeeded, so report success to the caller */
+\&
+    exit(EXIT_SUCCESS);
+}
+.EE
+.\" SRC END
+.SH SEE ALSO
+.BR close (2)
diff --git a/man2/connect.2 b/man2/connect.2
new file mode 100644
index 0000000..abd9e87
--- /dev/null
+++ b/man2/connect.2
@@ -0,0 +1,251 @@
+.\" Copyright 1993 Rickard E. Faith (faith@cs.unc.edu)
+.\" Portions extracted from /usr/include/sys/socket.h, which does not have
+.\" any authorship information in it. It is probably available under the GPL.
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\"
+.\" Other portions are from the 6.9 (Berkeley) 3/10/91 man page:
+.\"
+.\" Copyright (c) 1983 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" SPDX-License-Identifier: BSD-4-Clause-UC
+.\"
+.\" Modified 1997-01-31 by Eric S. Raymond <esr@thyrsus.com>
+.\" Modified 1998, 1999 by Andi Kleen
+.\" Modified 2004-06-23 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.TH connect 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+connect \- initiate a connection on a socket
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/socket.h>
+.PP
+.BI "int connect(int " sockfd ", const struct sockaddr *" addr ,
+.BI " socklen_t " addrlen );
+.fi
+.SH DESCRIPTION
+The
+.BR connect ()
+system call connects the socket referred to by the file descriptor
+.I sockfd
+to the address specified by
+.IR addr .
+The
+.I addrlen
+argument specifies the size of
+.IR addr .
+The format of the address in
+.I addr
+is determined by the address space of the socket
+.IR sockfd ;
+see
+.BR socket (2)
+for further details.
+.PP
+If the socket
+.I sockfd
+is of type
+.BR SOCK_DGRAM ,
+then
+.I addr
+is the address to which datagrams are sent by default, and the only
+address from which datagrams are received.
+If the socket is of type
+.B SOCK_STREAM
+or
+.BR SOCK_SEQPACKET ,
+this call attempts to make a connection to the socket that is bound
+to the address specified by
+.IR addr .
+.PP
+Some protocol sockets (e.g., UNIX domain stream sockets)
+may successfully
+.BR connect ()
+only once.
+.PP
+Some protocol sockets
+(e.g., datagram sockets in the UNIX and Internet domains)
+may use
+.BR connect ()
+multiple times to change their association.
+.PP
+Some protocol sockets
+(e.g., TCP sockets as well as datagram sockets in the UNIX and
+Internet domains)
+may dissolve the association by connecting to an address with the
+.I sa_family
+member of
+.I sockaddr
+set to
+.BR AF_UNSPEC ;
+thereafter, the socket can be connected to another address.
+.RB ( AF_UNSPEC
+is supported since Linux 2.2.)
+.SH RETURN VALUE
+If the connection or binding succeeds, zero is returned.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+The following are general socket errors only.
+There may be other domain-specific error codes.
+.TP
+.B EACCES
+For UNIX domain sockets, which are identified by pathname:
+Write permission is denied on the socket file,
+or search permission is denied for one of the directories
+in the path prefix.
+(See also
+.BR path_resolution (7).)
+.TP
+.BR EACCES ", " EPERM
+The user tried to connect to a broadcast address without having the socket
+broadcast flag enabled or the connection request failed because of a local
+firewall rule.
+.TP
+.B EACCES
+It can also be returned if an SELinux policy denied a connection (for
+example, if there is a policy saying that an HTTP proxy can only
+connect to ports associated with HTTP servers, and the proxy tries to
+connect to a different port).
+.TP
+.B EADDRINUSE
+Local address is already in use.
+.TP
+.B EADDRNOTAVAIL
+(Internet domain sockets)
+The socket referred to by
+.I sockfd
+had not previously been bound to an address and,
+upon attempting to bind it to an ephemeral port,
+it was determined that all port numbers in the ephemeral port range
+are currently in use.
+See the discussion of
+.I /proc/sys/net/ipv4/ip_local_port_range
+in
+.BR ip (7).
+.TP
+.B EAFNOSUPPORT
+The passed address didn't have the correct address family in its
+.I sa_family
+field.
+.TP
+.B EAGAIN
+For UNIX domain sockets, the socket is nonblocking, and the
+connection cannot be completed immediately.
+For other socket families, there are insufficient entries in the routing cache.
+.TP
+.B EALREADY
+The socket is nonblocking and a previous connection attempt has not yet
+been completed.
+.TP
+.B EBADF
+.I sockfd
+is not a valid open file descriptor.
+.TP
+.B ECONNREFUSED
+A
+.BR connect ()
+on a stream socket found no one listening on the remote address.
+.TP
+.B EFAULT
+The socket structure address is outside the user's address space.
+.TP
+.B EINPROGRESS
+The socket is nonblocking and the connection cannot be completed immediately.
+(UNIX domain sockets fail with
+.B EAGAIN
+instead.)
+It is possible to
+.BR select (2)
+or
+.BR poll (2)
+for completion by selecting the socket for writing.
+After
+.BR select (2)
+indicates writability, use
+.BR getsockopt (2)
+to read the
+.B SO_ERROR
+option at level
+.B SOL_SOCKET
+to determine whether
+.BR connect ()
+completed successfully
+.RB ( SO_ERROR
+is zero) or unsuccessfully
+.RB ( SO_ERROR
+is one of the usual error codes listed here,
+explaining the reason for the failure).
+.TP
+.B EINTR
+The system call was interrupted by a signal that was caught; see
+.BR signal (7).
+.\" For TCP, the connection will complete asynchronously.
+.\" See http://lkml.org/lkml/2005/7/12/254
+.TP
+.B EISCONN
+The socket is already connected.
+.TP
+.B ENETUNREACH
+Network is unreachable.
+.TP
+.B ENOTSOCK
+The file descriptor
+.I sockfd
+does not refer to a socket.
+.TP
+.B EPROTOTYPE
+The socket type does not support the requested communications protocol.
+This error can occur, for example,
+on an attempt to connect a UNIX domain datagram socket to a stream socket.
+.TP
+.B ETIMEDOUT
+Timeout while attempting connection.
+The server may be too
+busy to accept new connections.
+Note that for IP sockets the timeout may
+be very long when syncookies are enabled on the server.
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+POSIX.1-2001, SVr4, 4.4BSD,
+.RB ( connect ()
+first appeared in 4.2BSD).
+.\" SVr4 documents the additional
+.\" general error codes
+.\" .BR EADDRNOTAVAIL ,
+.\" .BR EINVAL ,
+.\" .BR EAFNOSUPPORT ,
+.\" .BR EALREADY ,
+.\" .BR EINTR ,
+.\" .BR EPROTOTYPE ,
+.\" and
+.\" .BR ENOSR .
+.\" It also
+.\" documents many additional error conditions not described here.
+.SH NOTES
+If
+.BR connect ()
+fails, consider the state of the socket as unspecified.
+Portable applications should close the socket and create a new one for
+reconnecting.
+.SH EXAMPLES
+An example of the use of
+.BR connect ()
+is shown in
+.BR getaddrinfo (3).
+.SH SEE ALSO
+.BR accept (2),
+.BR bind (2),
+.BR getsockname (2),
+.BR listen (2),
+.BR socket (2),
+.BR path_resolution (7),
+.BR selinux (8)
diff --git a/man2/copy_file_range.2 b/man2/copy_file_range.2
new file mode 100644
index 0000000..8bea2e8
--- /dev/null
+++ b/man2/copy_file_range.2
@@ -0,0 +1,307 @@
+.\"This manpage is Copyright (C) 2015 Anna Schumaker <Anna.Schumaker@Netapp.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.TH copy_file_range 2 2023-07-15 "Linux man-pages 6.05.01"
+.SH NAME
+copy_file_range \- Copy a range of data from one file to another
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #define _GNU_SOURCE
+.B #define _FILE_OFFSET_BITS 64
+.B #include <unistd.h>
+.PP
+.BI "ssize_t copy_file_range(int " fd_in ", off_t *_Nullable " off_in ,
+.BI " int " fd_out ", off_t *_Nullable " off_out ,
+.BI " size_t " len ", unsigned int " flags );
+.fi
+.SH DESCRIPTION
+The
+.BR copy_file_range ()
+system call performs an in-kernel copy between two file descriptors
+without the additional cost of transferring data from the kernel to user space
+and then back into the kernel.
+It copies up to
+.I len
+bytes of data from the source file descriptor
+.I fd_in
+to the target file descriptor
+.IR fd_out ,
+overwriting any data that exists within the requested range of the target file.
+.PP
+The following semantics apply for
+.IR off_in ,
+and similar statements apply to
+.IR off_out :
+.IP \[bu] 3
+If
+.I off_in
+is NULL, then bytes are read from
+.I fd_in
+starting from the file offset, and the file offset is
+adjusted by the number of bytes copied.
+.IP \[bu]
+If
+.I off_in
+is not NULL, then
+.I off_in
+must point to a buffer that specifies the starting
+offset where bytes from
+.I fd_in
+will be read.
+The file offset of
+.I fd_in
+is not changed, but
+.I off_in
+is adjusted appropriately.
+.PP
+.I fd_in
+and
+.I fd_out
+can refer to the same file.
+If they refer to the same file, then the source and target ranges are not
+allowed to overlap.
+.PP
+The
+.I flags
+argument is provided to allow for future extensions
+and currently must be set to 0.
+.SH RETURN VALUE
+Upon successful completion,
+.BR copy_file_range ()
+will return the number of bytes copied between files.
+This could be less than the length originally requested.
+If the file offset of
+.I fd_in
+is at or past the end of file, no bytes are copied, and
+.BR copy_file_range ()
+returns zero.
+.PP
+On error,
+.BR copy_file_range ()
+returns \-1 and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EBADF
+One or more file descriptors are not valid.
+.TP
+.B EBADF
+.I fd_in
+is not open for reading; or
+.I fd_out
+is not open for writing.
+.TP
+.B EBADF
+The
+.B O_APPEND
+flag is set for the open file description (see
+.BR open (2))
+referred to by the file descriptor
+.IR fd_out .
+.TP
+.B EFBIG
+An attempt was made to write at a position past the maximum file offset the
+kernel supports.
+.TP
+.B EFBIG
+An attempt was made to write a range that exceeds the allowed maximum file size.
+The maximum file size differs between filesystem implementations and can be
+different from the maximum allowed file offset.
+.TP
+.B EFBIG
+An attempt was made to write beyond the process's file size resource limit.
+This may also result in the process receiving a
+.B SIGXFSZ
+signal.
+.TP
+.B EINVAL
+The
+.I flags
+argument is not 0.
+.TP
+.B EINVAL
+.I fd_in
+and
+.I fd_out
+refer to the same file and the source and target ranges overlap.
+.TP
+.B EINVAL
+Either
+.I fd_in
+or
+.I fd_out
+is not a regular file.
+.TP
+.B EIO
+A low-level I/O error occurred while copying.
+.TP
+.B EISDIR
+Either
+.I fd_in
+or
+.I fd_out
+refers to a directory.
+.TP
+.B ENOMEM
+Out of memory.
+.TP
+.B ENOSPC
+There is not enough space on the target filesystem to complete the copy.
+.TP
+.BR EOPNOTSUPP " (since Linux 5.19)"
+.\" commit 868f9f2f8e004bfe0d3935b1976f625b2924893b
+The filesystem does not support this operation.
+.TP
+.B EOVERFLOW
+The requested source or destination range is too large to represent in the
+specified data types.
+.TP
+.B EPERM
+.I fd_out
+refers to an immutable file.
+.TP
+.B ETXTBSY
+Either
+.I fd_in
+or
+.I fd_out
+refers to an active swap file.
+.TP
+.BR EXDEV " (before Linux 5.3)"
+.\" commit 5dae222a5ff0c269730393018a5539cc970a4726
+The files referred to by
+.IR fd_in " and " fd_out
+are not on the same filesystem.
+.TP
+.BR EXDEV " (since Linux 5.19)"
+.\" commit 868f9f2f8e004bfe0d3935b1976f625b2924893b
+The files referred to by
+.IR fd_in " and " fd_out
+are not on the same filesystem,
+and the source and target filesystems are not of the same type,
+or do not support cross-filesystem copy.
+.SH VERSIONS
+A major rework of the kernel implementation occurred in Linux 5.3.
+Areas of the API that weren't clearly defined were clarified and the API bounds
+are much more strictly checked than on earlier kernels.
+.PP
+Since Linux 5.19,
+cross-filesystem copies can be achieved
+when both filesystems are of the same type,
+and that filesystem implements support for it.
+See BUGS for behavior prior to Linux 5.19.
+.PP
+Applications should target the behaviour and requirements of Linux 5.19,
+which were also backported to earlier stable kernels.
+.SH STANDARDS
+Linux, GNU.
+.SH HISTORY
+Linux 4.5,
+but glibc 2.27 provides a user-space
+emulation when it is not available.
+.\" https://sourceware.org/git/?p=glibc.git;a=commit;f=posix/unistd.h;h=bad7a0c81f501fbbcc79af9eaa4b8254441c4a1f
+.SH NOTES
+If
+.I fd_in
+is a sparse file, then
+.BR copy_file_range ()
+may expand any holes existing in the requested range.
+Users may benefit from calling
+.BR copy_file_range ()
+in a loop, and using the
+.BR lseek (2)
+.B SEEK_DATA
+and
+.B SEEK_HOLE
+operations to find the locations of data segments.
+.PP
+.BR copy_file_range ()
+gives filesystems an opportunity to implement "copy acceleration" techniques,
+such as the use of reflinks (i.e., two or more inodes that share
+pointers to the same copy-on-write disk blocks)
+or server-side-copy (in the case of NFS).
+.PP
+.B _FILE_OFFSET_BITS
+should be defined to be 64 in code that uses non-null
+.I off_in
+or
+.I off_out
+or that takes the address of
+.BR copy_file_range ,
+if the code is intended to be portable
+to traditional 32-bit x86 and ARM platforms where
+.BR off_t 's
+width defaults to 32 bits.
+.SH BUGS
+In Linux 5.3 to Linux 5.18,
+cross-filesystem copies were implemented by the kernel,
+if the operation was not supported by individual filesystems.
+However, on some virtual filesystems,
+the call failed to copy, while still reporting success.
+.SH EXAMPLES
+.\" SRC BEGIN (copy_file_range.c)
+.EX
+#define _GNU_SOURCE
+#define _FILE_OFFSET_BITS 64
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <unistd.h>
+\&
+int
+main(int argc, char *argv[])
+{
+    int fd_in, fd_out;
+    off_t len, ret;
+    struct stat stat;
+\&
+    if (argc != 3) {
+        fprintf(stderr, "Usage: %s <source> <destination>\en", argv[0]);
+        exit(EXIT_FAILURE);
+    }
+\&
+    fd_in = open(argv[1], O_RDONLY);
+    if (fd_in == \-1) {
+        perror("open (argv[1])");
+        exit(EXIT_FAILURE);
+    }
+\&
+    /* The size of the source file is the number of bytes to copy */
+\&
+    if (fstat(fd_in, &stat) == \-1) {
+        perror("fstat");
+        exit(EXIT_FAILURE);
+    }
+\&
+    len = stat.st_size;
+\&
+    fd_out = open(argv[2], O_CREAT | O_WRONLY | O_TRUNC, 0644);
+    if (fd_out == \-1) {
+        perror("open (argv[2])");
+        exit(EXIT_FAILURE);
+    }
+\&
+    /* A single call may copy fewer bytes than requested, so loop
+       until everything has been copied or the call returns 0
+       (end of the source file). Both file offsets are advanced
+       by the kernel because the offset arguments are NULL. */
+\&
+    do {
+        ret = copy_file_range(fd_in, NULL, fd_out, NULL, len, 0);
+        if (ret == \-1) {
+            perror("copy_file_range");
+            exit(EXIT_FAILURE);
+        }
+\&
+        len \-= ret;
+    } while (len > 0 && ret > 0);
+\&
+    close(fd_in);
+\&
+    /* Errors from the preceding writes may be reported only when the
+       output file is closed, so a failure here means the copy may
+       be incomplete. */
+\&
+    if (close(fd_out) == \-1) {
+        perror("close");
+        exit(EXIT_FAILURE);
+    }
+\&
+    exit(EXIT_SUCCESS);
+}
+.EE
+.\" SRC END
+.SH SEE ALSO
+.BR lseek (2),
+.BR sendfile (2),
+.BR splice (2)
diff --git a/man2/creat.2 b/man2/creat.2
new file mode 100644
index 0000000..604e121
--- /dev/null
+++ b/man2/creat.2
@@ -0,0 +1 @@
+.so man2/open.2
diff --git a/man2/create_module.2 b/man2/create_module.2
new file mode 100644
index 0000000..d159cb1
--- /dev/null
+++ b/man2/create_module.2
@@ -0,0 +1,72 @@
+.\" Copyright (C) 1996 Free Software Foundation, Inc.
+.\"
+.\" SPDX-License-Identifier: GPL-1.0-or-later
+.\"
+.\" 2006-02-09, some reformatting by Luc Van Oostenryck; some
+.\" reformatting and rewordings by mtk
+.\"
+.TH create_module 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+create_module \- create a loadable module entry
+.SH SYNOPSIS
+.nf
+.B #include <linux/module.h>
+.PP
+.BI "[[deprecated]] caddr_t create_module(const char *" name ", size_t " size );
+.fi
+.SH DESCRIPTION
+.IR Note :
+This system call is present only before Linux 2.6.
+.PP
+.BR create_module ()
+attempts to create a loadable module entry and reserve the kernel memory
+that will be needed to hold the module.
+This system call requires privilege.
+.SH RETURN VALUE
+On success, returns the kernel address at which the module will reside.
+On error, \-1 is returned and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EEXIST
+A module by that name already exists.
+.TP
+.B EFAULT
+.I name
+is outside the program's accessible address space.
+.TP
+.B EINVAL
+The requested size is too small even for the module header information.
+.TP
+.B ENOMEM
+The kernel could not allocate a contiguous block of memory large
+enough for the module.
+.TP
+.B ENOSYS
+.BR create_module ()
+is not supported in this version of the kernel
+(e.g., Linux 2.6 or later).
+.TP
+.B EPERM
+The caller was not privileged
+(did not have the
+.B CAP_SYS_MODULE
+capability).
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Removed in Linux 2.6.
+.\" Removed in Linux 2.5.48
+.PP
+This obsolete system call is not supported by glibc.
+No declaration is provided in glibc headers, but, through a quirk of history,
+glibc versions before glibc 2.23 did export an ABI for this system call.
+Therefore, in order to employ this system call,
+it was sufficient to manually declare the interface in your code;
+alternatively, you could invoke the system call using
+.BR syscall (2).
+.SH SEE ALSO
+.BR delete_module (2),
+.BR init_module (2),
+.BR query_module (2)
diff --git a/man2/delete_module.2 b/man2/delete_module.2
new file mode 100644
index 0000000..a909729
--- /dev/null
+++ b/man2/delete_module.2
@@ -0,0 +1,205 @@
+.\" Copyright (C) 2012 Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.TH delete_module 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+delete_module \- unload a kernel module
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#include <fcntl.h>" " /* Definition of " O_* " constants */"
+.BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */"
+.B #include <unistd.h>
+.PP
+.BI "int syscall(SYS_delete_module, const char *" name ", unsigned int " flags );
+.fi
+.PP
+.IR Note :
+glibc provides no wrapper for
+.BR delete_module (),
+necessitating the use of
+.BR syscall (2).
+.SH DESCRIPTION
+The
+.BR delete_module ()
+system call attempts to remove the unused loadable module entry
+identified by
+.IR name .
+If the module has an
+.I exit
+function, then that function is executed before unloading the module.
+The
+.I flags
+argument is used to modify the behavior of the system call,
+as described below.
+This system call requires privilege.
+.PP
+Module removal is attempted according to the following rules:
+.IP (1) 5
+If there are other loaded modules that depend on
+(i.e., refer to symbols defined in) this module,
+then the call fails.
+.IP (2)
+Otherwise, if the reference count for the module
+(i.e., the number of processes currently using the module)
+is zero, then the module is immediately unloaded.
+.IP (3)
+If a module has a nonzero reference count,
+then the behavior depends on the bits set in
+.IR flags .
+In normal usage (see NOTES), the
+.B O_NONBLOCK
+flag is always specified, and the
+.B O_TRUNC
+flag may additionally be specified.
+.\" O_TRUNC == KMOD_REMOVE_FORCE in kmod library
+.\" O_NONBLOCK == KMOD_REMOVE_NOWAIT in kmod library
+.IP
+The various combinations for
+.I flags
+have the following effect:
+.RS
+.TP
+.B flags == O_NONBLOCK
+The call returns immediately, with an error.
+.TP
+.B flags == (O_NONBLOCK | O_TRUNC)
+The module is unloaded immediately,
+regardless of whether it has a nonzero reference count.
+.TP
+.B (flags & O_NONBLOCK) == 0
+If
+.I flags
+does not specify
+.BR O_NONBLOCK ,
+the following steps occur:
+.RS
+.IP \[bu] 3
+The module is marked so that no new references are permitted.
+.IP \[bu]
+If the module's reference count is nonzero,
+the caller is placed in an uninterruptible sleep state
+.RB ( TASK_UNINTERRUPTIBLE )
+until the reference count is zero, at which point the call unblocks.
+.IP \[bu]
+The module is unloaded in the usual way.
+.RE
+.RE
+.PP
+The
+.B O_TRUNC
+flag has one further effect on the rules described above.
+By default, if a module has an
+.I init
+function but no
+.I exit
+function, then an attempt to remove the module fails.
+However, if
+.B O_TRUNC
+was specified, this requirement is bypassed.
+.PP
+Using the
+.B O_TRUNC
+flag is dangerous!
+If the kernel was not built with
+.BR CONFIG_MODULE_FORCE_UNLOAD ,
+this flag is silently ignored.
+(Normally,
+.B CONFIG_MODULE_FORCE_UNLOAD
+is enabled.)
+Using this flag taints the kernel (TAINT_FORCED_RMMOD).
+.SH RETURN VALUE
+On success, zero is returned.
+On error, \-1 is returned and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EBUSY
+The module is not "live"
+(i.e., it is still being initialized or is already marked for removal);
+or, the module has
+an
+.I init
+function but has no
+.I exit
+function, and
+.B O_TRUNC
+was not specified in
+.IR flags .
+.TP
+.B EFAULT
+.I name
+refers to a location outside the process's accessible address space.
+.TP
+.B ENOENT
+No module by that name exists.
+.TP
+.B EPERM
+The caller was not privileged
+(did not have the
+.B CAP_SYS_MODULE
+capability),
+or module unloading is disabled
+(see
+.I /proc/sys/kernel/modules_disabled
+in
+.BR proc (5)).
+.TP
+.B EWOULDBLOCK
+Other modules depend on this module;
+or,
+.B O_NONBLOCK
+was specified in
+.IR flags ,
+but the reference count of this module is nonzero and
+.B O_TRUNC
+was not specified in
+.IR flags .
+.SH STANDARDS
+Linux.
+.SH HISTORY
+The
+.BR delete_module ()
+system call is not supported by glibc.
+No declaration is provided in glibc headers, but, through a quirk of history,
+glibc versions before glibc 2.23 did export an ABI for this system call.
+Therefore, in order to employ this system call,
+it is (before glibc 2.23) sufficient to
+manually declare the interface in your code;
+alternatively, you can invoke the system call using
+.BR syscall (2).
+.SS Linux 2.4 and earlier
+In Linux 2.4 and earlier, the system call took only one argument:
+.PP
+.BI " int delete_module(const char *" name );
+.PP
+If
+.I name
+is NULL, all unused modules marked auto-clean are removed.
+.PP
+Some further details of differences in the behavior of
+.BR delete_module ()
+in Linux 2.4 and earlier are
+.I not
+currently explained in this manual page.
+.SH NOTES
+The uninterruptible sleep that may occur if
+.B O_NONBLOCK
+is omitted from
+.I flags
+is considered undesirable, because the sleeping process is left
+in an unkillable state.
+As at Linux 3.7, specifying
+.B O_NONBLOCK
+is optional, but in future kernels it is likely to become mandatory.
+.SH SEE ALSO
+.BR create_module (2),
+.BR init_module (2),
+.BR query_module (2),
+.BR lsmod (8),
+.BR modprobe (8),
+.BR rmmod (8)
diff --git a/man2/dup.2 b/man2/dup.2
new file mode 100644
index 0000000..b7187ed
--- /dev/null
+++ b/man2/dup.2
@@ -0,0 +1,284 @@
+.\" This manpage is Copyright (C) 1992 Drew Eckhardt;
+.\" and Copyright (C) 1993 Michael Haardt, Ian Jackson.
+.\" and Copyright (C) 2005, 2008 Michael Kerrisk <mtk.manpages@gmail.com>
+.\" and Copyright (C) 2014 Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified 1993-07-21, Rik Faith <faith@cs.unc.edu>
+.\" Modified 1994-08-21, Michael Chastain <mec@shell.portal.com>:
+.\" Fixed typos.
+.\" Modified 1997-01-31, Eric S. Raymond <esr@thyrsus.com>
+.\" Modified 2002-09-28, aeb
+.\" 2009-01-12, mtk, reordered text in DESCRIPTION and added some
+.\" details for dup2().
+.\" 2008-10-09, mtk: add description of dup3()
+.\"
+.TH dup 2 2023-05-03 "Linux man-pages 6.05.01"
+.SH NAME
+dup, dup2, dup3 \- duplicate a file descriptor
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <unistd.h>
+.PP
+.BI "int dup(int " oldfd );
+.BI "int dup2(int " oldfd ", int " newfd );
+.PP
+.BR "#define _GNU_SOURCE" " /* See feature_test_macros(7) */"
+.BR "#include <fcntl.h>" " /* Definition of " O_* " constants */"
+.B #include <unistd.h>
+.PP
+.BI "int dup3(int " oldfd ", int " newfd ", int " flags );
+.fi
+.SH DESCRIPTION
+The
+.BR dup ()
+system call allocates a new file descriptor that refers to the same
+open file description as the descriptor
+.IR oldfd .
+(For an explanation of open file descriptions, see
+.BR open (2).)
+The new file descriptor number is guaranteed to be the lowest-numbered
+file descriptor that was unused in the calling process.
+.PP
+After a successful return,
+the old and new file descriptors may be used interchangeably.
+Since the two file descriptors refer to the same open file description,
+they share file offset and file status flags;
+for example, if the file offset is modified by using
+.BR lseek (2)
+on one of the file descriptors,
+the offset is also changed for the other file descriptor.
+.PP
+The two file descriptors do not share file descriptor flags
+(the close-on-exec flag).
+The close-on-exec flag
+.RB ( FD_CLOEXEC ;
+see
+.BR fcntl (2))
+for the duplicate descriptor is off.
+.\"
+.SS dup2()
+The
+.BR dup2 ()
+system call performs the same task as
+.BR dup (),
+but instead of using the lowest-numbered unused file descriptor,
+it uses the file descriptor number specified in
+.IR newfd .
+In other words,
+the file descriptor
+.I newfd
+is adjusted so that it now refers to the same open file description as
+.IR oldfd .
+.PP
+If the file descriptor
+.I newfd
+was previously open, it is closed before being reused;
+the close is performed silently
+(i.e., any errors during the close are not reported by
+.BR dup2 ()).
+.PP
+The steps of closing and reusing the file descriptor
+.I newfd
+are performed
+.IR atomically .
+This is important, because trying to implement equivalent functionality using
+.BR close (2)
+and
+.BR dup ()
+would be
+subject to race conditions, whereby
+.I newfd
+might be reused between the two steps.
+Such reuse could happen because the main program is interrupted
+by a signal handler that allocates a file descriptor,
+or because a parallel thread allocates a file descriptor.
+.PP
+Note the following points:
+.IP \[bu] 3
+If
+.I oldfd
+is not a valid file descriptor, then the call fails, and
+.I newfd
+is not closed.
+.IP \[bu]
+If
+.I oldfd
+is a valid file descriptor, and
+.I newfd
+has the same value as
+.IR oldfd ,
+then
+.BR dup2 ()
+does nothing, and returns
+.IR newfd .
+.\"
+.SS dup3()
+.BR dup3 ()
+is the same as
+.BR dup2 (),
+except that:
+.IP \[bu] 3
+The caller can force the close-on-exec flag to be set
+for the new file descriptor by specifying
+.B O_CLOEXEC
+in
+.IR flags .
+See the description of the same flag in
+.BR open (2)
+for reasons why this may be useful.
+.IP \[bu]
+.\" Ulrich Drepper, LKML, 2008-10-09:
+.\" We deliberately decided on this change. Otherwise, what is the
+.\" result of dup3(fd, fd, O_CLOEXEC)?
+If
+.I oldfd
+equals
+.IR newfd ,
+then
+.BR dup3 ()
+fails with the error
+.BR EINVAL .
+.SH RETURN VALUE
+On success, these system calls
+return the new file descriptor.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EBADF
+.I oldfd
+isn't an open file descriptor.
+.TP
+.B EBADF
+.I newfd
+is out of the allowed range for file descriptors (see the discussion of
+.B RLIMIT_NOFILE
+in
+.BR getrlimit (2)).
+.TP
+.B EBUSY
+(Linux only) This may be returned by
+.BR dup2 ()
+or
+.BR dup3 ()
+during a race condition with
+.BR open (2)
+and
+.BR dup ().
+.TP
+.B EINTR
+The
+.BR dup2 ()
+or
+.BR dup3 ()
+call was interrupted by a signal; see
+.BR signal (7).
+.TP
+.B EINVAL
+.RB ( dup3 ())
+.I flags
+contains an invalid value.
+.TP
+.B EINVAL
+.RB ( dup3 ())
+.I oldfd
+was equal to
+.IR newfd .
+.TP
+.B EMFILE
+The per-process limit on the number of open file descriptors has been reached
+(see the discussion of
+.B RLIMIT_NOFILE
+in
+.BR getrlimit (2)).
+.SH STANDARDS
+.TP
+.BR dup ()
+.TQ
+.BR dup2 ()
+POSIX.1-2008.
+.TP
+.BR dup3 ()
+Linux.
+.SH HISTORY
+.TP
+.BR dup ()
+.TQ
+.BR dup2 ()
+POSIX.1-2001, SVr4, 4.3BSD.
+.\" SVr4 documents additional
+.\" EINTR and ENOLINK error conditions. POSIX.1 adds EINTR.
+.\" The EBUSY return is Linux-specific.
+.TP
+.BR dup3 ()
+Linux 2.6.27,
+glibc 2.9.
+.SH NOTES
+The error returned by
+.BR dup2 ()
+is different from that returned by
+.BR fcntl( "..., " F_DUPFD ", ..." )
+when
+.I newfd
+is out of range.
+On some systems,
+.BR dup2 ()
+also sometimes returns
+.B EINVAL
+like
+.BR F_DUPFD .
+.PP
+If
+.I newfd
+was open, any errors that would have been reported at
+.BR close (2)
+time are lost.
+If this is of concern,
+then\[em]unless the program is single-threaded and does not allocate
+file descriptors in signal handlers\[em]the correct approach is
+.I not
+to close
+.I newfd
+before calling
+.BR dup2 (),
+because of the race condition described above.
+Instead, code such as the following could be used:
+.PP
+.in +4n
+.EX
+/* Obtain a duplicate of \[aq]newfd\[aq] that can subsequently
+ be used to check for close() errors; an EBADF error
+ means that \[aq]newfd\[aq] was not open. */
+\&
+tmpfd = dup(newfd);
+if (tmpfd == \-1 && errno != EBADF) {
+ /* Handle unexpected dup() error. */
+}
+\&
+/* Atomically duplicate \[aq]oldfd\[aq] on \[aq]newfd\[aq]. */
+\&
+if (dup2(oldfd, newfd) == \-1) {
+ /* Handle dup2() error. */
+}
+\&
+/* Now check for close() errors on the file originally
+ referred to by \[aq]newfd\[aq]. */
+\&
+if (tmpfd != \-1) {
+ if (close(tmpfd) == \-1) {
+ /* Handle errors from close. */
+ }
+}
+.EE
+.in
+.SH SEE ALSO
+.BR close (2),
+.BR fcntl (2),
+.BR open (2),
+.BR pidfd_getfd (2)
diff --git a/man2/dup2.2 b/man2/dup2.2
new file mode 100644
index 0000000..49a65c6
--- /dev/null
+++ b/man2/dup2.2
@@ -0,0 +1 @@
+.so man2/dup.2
diff --git a/man2/dup3.2 b/man2/dup3.2
new file mode 100644
index 0000000..49a65c6
--- /dev/null
+++ b/man2/dup3.2
@@ -0,0 +1 @@
+.so man2/dup.2
diff --git a/man2/epoll_create.2 b/man2/epoll_create.2
new file mode 100644
index 0000000..8d2c0be
--- /dev/null
+++ b/man2/epoll_create.2
@@ -0,0 +1,144 @@
+.\" Copyright (C) 2003 Davide Libenzi
+.\" and Copyright 2008, 2009, 2012 Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Davide Libenzi <davidel@xmailserver.org>
+.\"
+.\" SPDX-License-Identifier: GPL-2.0-or-later
+.\"
+.\" Modified 2004-06-17 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Modified 2005-04-04 by Marko Kohtala <marko.kohtala@gmail.com>
+.\" 2008-10-10, mtk: add description of epoll_create1()
+.\"
+.TH epoll_create 2 2023-07-16 "Linux man-pages 6.05.01"
+.SH NAME
+epoll_create, epoll_create1 \- open an epoll file descriptor
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/epoll.h>
+.PP
+.BI "int epoll_create(int " size );
+.BI "int epoll_create1(int " flags );
+.fi
+.SH DESCRIPTION
+.BR epoll_create ()
+creates a new
+.BR epoll (7)
+instance.
+Since Linux 2.6.8, the
+.I size
+argument is ignored, but must be greater than zero; see HISTORY.
+.PP
+.BR epoll_create ()
+returns a file descriptor referring to the new epoll instance.
+This file descriptor is used for all the subsequent calls to the
+.B epoll
+interface.
+When no longer required, the file descriptor returned by
+.BR epoll_create ()
+should be closed by using
+.BR close (2).
+When all file descriptors referring to an epoll instance have been closed,
+the kernel destroys the instance
+and releases the associated resources for reuse.
+.SS epoll_create1()
+If
+.I flags
+is 0, then, other than the fact that the obsolete
+.I size
+argument is dropped,
+.BR epoll_create1 ()
+is the same as
+.BR epoll_create ().
+The following value can be included in
+.I flags
+to obtain different behavior:
+.TP
+.B EPOLL_CLOEXEC
+Set the close-on-exec
+.RB ( FD_CLOEXEC )
+flag on the new file descriptor.
+See the description of the
+.B O_CLOEXEC
+flag in
+.BR open (2)
+for reasons why this may be useful.
+.SH RETURN VALUE
+On success,
+these system calls
+return a file descriptor (a nonnegative integer).
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EINVAL
+.I size
+is not positive.
+.TP
+.B EINVAL
+.RB ( epoll_create1 ())
+Invalid value specified in
+.IR flags .
+.TP
+.B EMFILE
+The per-process limit on the number of open file descriptors has been reached.
+.TP
+.B ENFILE
+The system-wide limit on the total number of open files has been reached.
+.TP
+.B ENOMEM
+There was insufficient memory to create the kernel object.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+.TP
+.BR epoll_create ()
+Linux 2.6,
+glibc 2.3.2.
+.\" To be precise: kernel 2.5.44.
+.\" The interface should be finalized by Linux kernel 2.5.66.
+.TP
+.BR epoll_create1 ()
+Linux 2.6.27,
+glibc 2.9.
+.PP
+In the initial
+.BR epoll_create ()
+implementation, the
+.I size
+argument informed the kernel of the number of file descriptors
+that the caller expected to add to the
+.B epoll
+instance.
+The kernel used this information as a hint for the amount of
+space to initially allocate in internal data structures describing events.
+(If necessary, the kernel would allocate more space
+if the caller's usage exceeded the hint given in
+.IR size .)
+Nowadays,
+this hint is no longer required
+(the kernel dynamically sizes the required data structures
+without needing the hint), but
+.I size
+must still be greater than zero,
+in order to ensure backward compatibility when new
+.B epoll
+applications are run on older kernels.
+.PP
+Prior to Linux 2.6.29,
+.\" commit 9df04e1f25effde823a600e755b51475d438f56b
+a
+.I /proc/sys/fs/epoll/max_user_instances
+kernel parameter limited live epolls for each real user ID,
+and caused
+.BR epoll_create ()
+to fail with
+.B EMFILE
+on overrun.
+.SH SEE ALSO
+.BR close (2),
+.BR epoll_ctl (2),
+.BR epoll_wait (2),
+.BR epoll (7)
diff --git a/man2/epoll_create1.2 b/man2/epoll_create1.2
new file mode 100644
index 0000000..69605b6
--- /dev/null
+++ b/man2/epoll_create1.2
@@ -0,0 +1 @@
+.so man2/epoll_create.2
diff --git a/man2/epoll_ctl.2 b/man2/epoll_ctl.2
new file mode 100644
index 0000000..e8ee1e6
--- /dev/null
+++ b/man2/epoll_ctl.2
@@ -0,0 +1,429 @@
+.\" Copyright (C) 2003 Davide Libenzi
+.\" Davide Libenzi <davidel@xmailserver.org>
+.\" and Copyright 2009, 2014, 2016, 2018, 2019 Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: GPL-2.0-or-later
+.\"
+.TH epoll_ctl 2 2023-04-03 "Linux man-pages 6.05.01"
+.SH NAME
+epoll_ctl \- control interface for an epoll file descriptor
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/epoll.h>
+.PP
+.BI "int epoll_ctl(int " epfd ", int " op ", int " fd ,
+.BI " struct epoll_event *_Nullable " event );
+.fi
+.SH DESCRIPTION
+This system call is used to add, modify, or remove
+entries in the interest list of the
+.BR epoll (7)
+instance
+referred to by the file descriptor
+.IR epfd .
+It requests that the operation
+.I op
+be performed for the target file descriptor,
+.IR fd .
+.PP
+Valid values for the
+.I op
+argument are:
+.TP
+.B EPOLL_CTL_ADD
+Add an entry to the interest list of the epoll file descriptor,
+.IR epfd .
+The entry includes the file descriptor,
+.IR fd ,
+a reference to the corresponding open file description (see
+.BR epoll (7)
+and
+.BR open (2)),
+and the settings specified in
+.IR event .
+.TP
+.B EPOLL_CTL_MOD
+Change the settings associated with
+.I fd
+in the interest list to the new settings specified in
+.IR event .
+.TP
+.B EPOLL_CTL_DEL
+Remove (deregister) the target file descriptor
+.I fd
+from the interest list.
+The
+.I event
+argument is ignored and can be NULL (but see BUGS below).
+.PP
+The
+.I event
+argument describes the object linked to the file descriptor
+.IR fd .
+The
+.I struct epoll_event
+is described in
+.BR epoll_event (3type).
+.PP
+The
+.I data
+member of the
+.I epoll_event
+structure specifies data that the kernel should save and then return (via
+.BR epoll_wait (2))
+when this file descriptor becomes ready.
+.PP
+The
+.I events
+member of the
+.I epoll_event
+structure is a bit mask composed by ORing together zero or more event types,
+returned by
+.BR epoll_wait (2),
+and input flags, which affect its behavior, but aren't returned.
+The available event types are:
+.TP
+.B EPOLLIN
+The associated file is available for
+.BR read (2)
+operations.
+.TP
+.B EPOLLOUT
+The associated file is available for
+.BR write (2)
+operations.
+.TP
+.BR EPOLLRDHUP " (since Linux 2.6.17)"
+Stream socket peer closed connection,
+or shut down writing half of connection.
+(This flag is especially useful for writing simple code to detect
+peer shutdown when using edge-triggered monitoring.)
+.TP
+.B EPOLLPRI
+There is an exceptional condition on the file descriptor.
+See the discussion of
+.B POLLPRI
+in
+.BR poll (2).
+.TP
+.B EPOLLERR
+Error condition happened on the associated file descriptor.
+This event is also reported for the write end of a pipe when the read end
+has been closed.
+.IP
+.BR epoll_wait (2)
+will always report this event; it is not necessary to set it in
+.I events
+when calling
+.BR epoll_ctl ().
+.TP
+.B EPOLLHUP
+Hang up happened on the associated file descriptor.
+.IP
+.BR epoll_wait (2)
+will always wait for this event; it is not necessary to set it in
+.I events
+when calling
+.BR epoll_ctl ().
+.IP
+Note that when reading from a channel such as a pipe or a stream socket,
+this event merely indicates that the peer closed its end of the channel.
+Subsequent reads from the channel will return 0 (end of file)
+only after all outstanding data in the channel has been consumed.
+.PP
+And the available input flags are:
+.TP
+.B EPOLLET
+Requests edge-triggered notification for the associated file descriptor.
+The default behavior for
+.B epoll
+is level-triggered.
+See
+.BR epoll (7)
+for more detailed information about edge-triggered and
+level-triggered notification.
+.TP
+.BR EPOLLONESHOT " (since Linux 2.6.2)"
+Requests one-shot notification for the associated file descriptor.
+This means that after an event is reported for the file descriptor by
+.BR epoll_wait (2),
+the file descriptor is disabled in the interest list and no other events
+will be reported by the
+.B epoll
+interface.
+The user must call
+.BR epoll_ctl ()
+with
+.B EPOLL_CTL_MOD
+to rearm the file descriptor with a new event mask.
+.TP
+.BR EPOLLWAKEUP " (since Linux 3.5)"
+.\" commit 4d7e30d98939a0340022ccd49325a3d70f7e0238
+If
+.B EPOLLONESHOT
+and
+.B EPOLLET
+are clear and the process has the
+.B CAP_BLOCK_SUSPEND
+capability,
+ensure that the system does not enter "suspend" or
+"hibernate" while this event is pending or being processed.
+The event is considered as being "processed" from the time
+when it is returned by a call to
+.BR epoll_wait (2)
+until the next call to
+.BR epoll_wait (2)
+on the same
+.BR epoll (7)
+file descriptor,
+the closure of that file descriptor,
+the removal of the event file descriptor with
+.BR EPOLL_CTL_DEL ,
+or the clearing of
+.B EPOLLWAKEUP
+for the event file descriptor with
+.BR EPOLL_CTL_MOD .
+See also BUGS.
+.TP
+.BR EPOLLEXCLUSIVE " (since Linux 4.5)"
+Sets an exclusive wakeup mode for the epoll file descriptor that is being
+attached to the target file descriptor,
+.IR fd .
+When a wakeup event occurs and multiple epoll file descriptors
+are attached to the same target file using
+.BR EPOLLEXCLUSIVE ,
+one or more of the epoll file descriptors will receive an event with
+.BR epoll_wait (2).
+The default in this scenario (when
+.B EPOLLEXCLUSIVE
+is not set) is for all epoll file descriptors to receive an event.
+.B EPOLLEXCLUSIVE
+is thus useful for avoiding thundering herd problems in certain scenarios.
+.IP
+If the same file descriptor is in multiple epoll instances,
+some with the
+.B EPOLLEXCLUSIVE
+flag, and others without, then events will be provided to all epoll
+instances that did not specify
+.BR EPOLLEXCLUSIVE ,
+and at least one of the epoll instances that did specify
+.BR EPOLLEXCLUSIVE .
+.IP
+The following values may be specified in conjunction with
+.BR EPOLLEXCLUSIVE :
+.BR EPOLLIN ,
+.BR EPOLLOUT ,
+.BR EPOLLWAKEUP ,
+and
+.BR EPOLLET .
+.B EPOLLHUP
+and
+.B EPOLLERR
+can also be specified, but this is not required:
+as usual, these events are always reported if they occur,
+regardless of whether they are specified in
+.IR events .
+Attempts to specify other values in
+.I events
+yield the error
+.BR EINVAL .
+.IP
+.B EPOLLEXCLUSIVE
+may be used only in an
+.B EPOLL_CTL_ADD
+operation; attempts to employ it with
+.B EPOLL_CTL_MOD
+yield an error.
+If
+.B EPOLLEXCLUSIVE
+has been set using
+.BR epoll_ctl (),
+then a subsequent
+.B EPOLL_CTL_MOD
+on the same
+.IR epfd ,\~ fd
+pair yields an error.
+A call to
+.BR epoll_ctl ()
+that specifies
+.B EPOLLEXCLUSIVE
+in
+.I events
+and specifies the target file descriptor
+.I fd
+as an epoll instance will likewise fail.
+The error in all of these cases is
+.BR EINVAL .
+.SH RETURN VALUE
+When successful,
+.BR epoll_ctl ()
+returns zero.
+When an error occurs,
+.BR epoll_ctl ()
+returns \-1 and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EBADF
+.I epfd
+or
+.I fd
+is not a valid file descriptor.
+.TP
+.B EEXIST
+.I op
+was
+.BR EPOLL_CTL_ADD ,
+and the supplied file descriptor
+.I fd
+is already registered with this epoll instance.
+.TP
+.B EINVAL
+.I epfd
+is not an
+.B epoll
+file descriptor,
+or
+.I fd
+is the same as
+.IR epfd ,
+or the requested operation
+.I op
+is not supported by this interface.
+.TP
+.B EINVAL
+An invalid event type was specified along with
+.B EPOLLEXCLUSIVE
+in
+.IR events .
+.TP
+.B EINVAL
+.I op
+was
+.B EPOLL_CTL_MOD
+and
+.I events
+included
+.BR EPOLLEXCLUSIVE .
+.TP
+.B EINVAL
+.I op
+was
+.B EPOLL_CTL_MOD
+and the
+.B EPOLLEXCLUSIVE
+flag has previously been applied to this
+.IR epfd ,\~ fd
+pair.
+.TP
+.B EINVAL
+.B EPOLLEXCLUSIVE
+was specified in
+.I events
+and
+.I fd
+refers to an epoll instance.
+.TP
+.B ELOOP
+.I fd
+refers to an epoll instance and this
+.B EPOLL_CTL_ADD
+operation would result in a circular loop of epoll instances
+monitoring one another or a nesting depth of epoll instances
+greater than 5.
+.TP
+.B ENOENT
+.I op
+was
+.B EPOLL_CTL_MOD
+or
+.BR EPOLL_CTL_DEL ,
+and
+.I fd
+is not registered with this epoll instance.
+.TP
+.B ENOMEM
+There was insufficient memory to handle the requested
+.I op
+control operation.
+.TP
+.B ENOSPC
+The limit imposed by
+.I /proc/sys/fs/epoll/max_user_watches
+was encountered while trying to register
+.RB ( EPOLL_CTL_ADD )
+a new file descriptor on an epoll instance.
+See
+.BR epoll (7)
+for further details.
+.TP
+.B EPERM
+The target file
+.I fd
+does not support
+.BR epoll .
+This error can occur if
+.I fd
+refers to, for example, a regular file or a directory.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 2.6,
+.\" To be precise: Linux 2.5.44.
+.\" The interface should be finalized by Linux 2.5.66.
+glibc 2.3.2.
+.SH NOTES
+The
+.B epoll
+interface supports all file descriptors that support
+.BR poll (2).
+.SH BUGS
+Before Linux 2.6.9, the
+.B EPOLL_CTL_DEL
+operation required a non-null pointer in
+.IR event ,
+even though this argument is ignored.
+Since Linux 2.6.9,
+.I event
+can be specified as NULL
+when using
+.BR EPOLL_CTL_DEL .
+Applications that need to be portable to kernels before Linux 2.6.9
+should specify a non-null pointer in
+.IR event .
+.PP
+If
+.B EPOLLWAKEUP
+is specified in
+.IR events ,
+but the caller does not have the
+.B CAP_BLOCK_SUSPEND
+capability, then the
+.B EPOLLWAKEUP
+flag is
+.IR "silently ignored" .
+This unfortunate behavior is necessary because no validity
+checks were performed on the
+.I events
+field in the original implementation, and the addition of the
+.B EPOLLWAKEUP
+flag with a check that caused the call to fail if the caller did not have the
+.B CAP_BLOCK_SUSPEND
+capability caused a breakage in at least one existing user-space
+application that happened to randomly (and uselessly) specify this bit.
+.\" commit a8159414d7e3af7233e7a5a82d1c5d85379bd75c (behavior change)
+.\" https://lwn.net/Articles/520198/
+A robust application should therefore double check that it has the
+.B CAP_BLOCK_SUSPEND
+capability if attempting to use the
+.B EPOLLWAKEUP
+flag.
+.SH SEE ALSO
+.BR epoll_create (2),
+.BR epoll_wait (2),
+.BR poll (2),
+.BR epoll (7)
diff --git a/man2/epoll_pwait.2 b/man2/epoll_pwait.2
new file mode 100644
index 0000000..9282a70
--- /dev/null
+++ b/man2/epoll_pwait.2
@@ -0,0 +1 @@
+.so man2/epoll_wait.2
diff --git a/man2/epoll_pwait2.2 b/man2/epoll_pwait2.2
new file mode 100644
index 0000000..9282a70
--- /dev/null
+++ b/man2/epoll_pwait2.2
@@ -0,0 +1 @@
+.so man2/epoll_wait.2
diff --git a/man2/epoll_wait.2 b/man2/epoll_wait.2
new file mode 100644
index 0000000..5efaada
--- /dev/null
+++ b/man2/epoll_wait.2
@@ -0,0 +1,288 @@
+.\" Copyright (C) 2003 Davide Libenzi
+.\" Davide Libenzi <davidel@xmailserver.org>
+.\" and Copyright 2007, 2012, 2014, 2018 Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: GPL-2.0-or-later
+.\"
+.\" 2007-04-30: mtk, Added description of epoll_pwait()
+.\"
+.TH epoll_wait 2 2023-05-03 "Linux man-pages 6.05.01"
+.SH NAME
+epoll_wait, epoll_pwait, epoll_pwait2 \-
+wait for an I/O event on an epoll file descriptor
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/epoll.h>
+.PP
+.BI "int epoll_wait(int " epfd ", struct epoll_event *" events ,
+.BI " int " maxevents ", int " timeout );
+.BI "int epoll_pwait(int " epfd ", struct epoll_event *" events ,
+.BI " int " maxevents ", int " timeout ,
+.BI " const sigset_t *_Nullable " sigmask );
+.BI "int epoll_pwait2(int " epfd ", struct epoll_event *" events ,
+.BI " int " maxevents ", \
+const struct timespec *_Nullable " timeout ,
+.BI " const sigset_t *_Nullable " sigmask );
+.fi
+.SH DESCRIPTION
+The
+.BR epoll_wait ()
+system call waits for events on the
+.BR epoll (7)
+instance referred to by the file descriptor
+.IR epfd .
+The buffer pointed to by
+.I events
+is used to return information from the ready list
+about file descriptors in the interest list that
+have some events available.
+Up to
+.I maxevents
+are returned by
+.BR epoll_wait ().
+The
+.I maxevents
+argument must be greater than zero.
+.PP
+The
+.I timeout
+argument specifies the number of milliseconds that
+.BR epoll_wait ()
+will block.
+Time is measured against the
+.B CLOCK_MONOTONIC
+clock.
+.PP
+A call to
+.BR epoll_wait ()
+will block until either:
+.IP \[bu] 3
+a file descriptor delivers an event;
+.IP \[bu]
+the call is interrupted by a signal handler; or
+.IP \[bu]
+the timeout expires.
+.PP
+Note that the
+.I timeout
+interval will be rounded up to the system clock granularity,
+and kernel scheduling delays mean that the blocking interval
+may overrun by a small amount.
+Specifying a
+.I timeout
+of \-1 causes
+.BR epoll_wait ()
+to block indefinitely, while specifying a
+.I timeout
+equal to zero causes
+.BR epoll_wait ()
+to return immediately, even if no events are available.
+.PP
+The
+.I struct epoll_event
+is described in
+.BR epoll_event (3type).
+.PP
+The
+.I data
+field of each returned
+.I epoll_event
+structure contains the same data as was specified
+in the most recent call to
+.BR epoll_ctl (2)
+.RB ( EPOLL_CTL_ADD ", " EPOLL_CTL_MOD )
+for the corresponding open file descriptor.
+.PP
+The
+.I events
+field is a bit mask that indicates the events that have occurred for the
+corresponding open file description.
+See
+.BR epoll_ctl (2)
+for a list of the bits that may appear in this mask.
+.\"
+.SS epoll_pwait()
+The relationship between
+.BR epoll_wait ()
+and
+.BR epoll_pwait ()
+is analogous to the relationship between
+.BR select (2)
+and
+.BR pselect (2):
+like
+.BR pselect (2),
+.BR epoll_pwait ()
+allows an application to safely wait until either a file descriptor
+becomes ready or until a signal is caught.
+.PP
+The following
+.BR epoll_pwait ()
+call:
+.PP
+.in +4n
+.EX
+ready = epoll_pwait(epfd, &events, maxevents, timeout, &sigmask);
+.EE
+.in
+.PP
+is equivalent to
+.I atomically
+executing the following calls:
+.PP
+.in +4n
+.EX
+sigset_t origmask;
+\&
+pthread_sigmask(SIG_SETMASK, &sigmask, &origmask);
+ready = epoll_wait(epfd, &events, maxevents, timeout);
+pthread_sigmask(SIG_SETMASK, &origmask, NULL);
+.EE
+.in
+.PP
+The
+.I sigmask
+argument may be specified as NULL, in which case
+.BR epoll_pwait ()
+is equivalent to
+.BR epoll_wait ().
+.\"
+.SS epoll_pwait2()
+The
+.BR epoll_pwait2 ()
+system call is equivalent to
+.BR epoll_pwait ()
+except for the
+.I timeout
+argument.
+It takes an argument of type
+.I timespec
+so that the timeout can be specified with nanosecond resolution.
+This argument functions the same as in
+.BR pselect (2)
+and
+.BR ppoll (2).
+If
+.I timeout
+is NULL, then
+.BR epoll_pwait2 ()
+can block indefinitely.
+.SH RETURN VALUE
+On success,
+.BR epoll_wait ()
+returns the number of file descriptors ready for the requested I/O, or zero
+if no file descriptor became ready during the requested
+.I timeout
+milliseconds.
+On failure,
+.BR epoll_wait ()
+returns \-1 and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EBADF
+.I epfd
+is not a valid file descriptor.
+.TP
+.B EFAULT
+The memory area pointed to by
+.I events
+is not accessible with write permissions.
+.TP
+.B EINTR
+The call was interrupted by a signal handler before either (1) any of the
+requested events occurred or (2) the
+.I timeout
+expired; see
+.BR signal (7).
+.TP
+.B EINVAL
+.I epfd
+is not an
+.B epoll
+file descriptor, or
+.I maxevents
+is less than or equal to zero.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+.TP
+.BR epoll_wait ()
+Linux 2.6,
+.\" To be precise: Linux 2.5.44.
+.\" The interface should be finalized by Linux 2.5.66.
+glibc 2.3.2.
+.TP
+.BR epoll_pwait ()
+Linux 2.6.19,
+glibc 2.6.
+.TP
+.BR epoll_pwait2 ()
+Linux 5.11.
+.SH NOTES
+While one thread is blocked in a call to
+.BR epoll_wait (),
+it is possible for another thread to add a file descriptor to the waited-upon
+.B epoll
+instance.
+If the new file descriptor becomes ready,
+it will cause the
+.BR epoll_wait ()
+call to unblock.
+.PP
+If more than
+.I maxevents
+file descriptors are ready when
+.BR epoll_wait ()
+is called, then successive
+.BR epoll_wait ()
+calls will round robin through the set of ready file descriptors.
+This behavior helps avoid starvation scenarios,
+where a process fails to notice that additional file descriptors
+are ready because it focuses on a set of file descriptors that
+are already known to be ready.
+.PP
+Note that it is possible to call
+.BR epoll_wait ()
+on an
+.B epoll
+instance whose interest list is currently empty
+(or whose interest list becomes empty because file descriptors are closed
+or removed from the interest list in another thread).
+The call will block until some file descriptor is later added to the
+interest list (in another thread) and that file descriptor becomes ready.
+.SS C library/kernel differences
+The raw
+.BR epoll_pwait ()
+and
+.BR epoll_pwait2 ()
+system calls have a sixth argument,
+.IR "size_t sigsetsize" ,
+which specifies the size in bytes of the
+.I sigmask
+argument.
+The glibc
+.BR epoll_pwait ()
+wrapper function specifies this argument as a fixed value
+(equal to
+.IR sizeof(sigset_t) ).
+.SH BUGS
+Before Linux 2.6.37, a
+.I timeout
+value larger than approximately
+.I LONG_MAX / HZ
+milliseconds is treated as \-1 (i.e., infinity).
+Thus, for example, on a system where
+.I sizeof(long)
+is 4 and the kernel
+.I HZ
+value is 1000,
+this means that timeouts greater than 35.79 minutes are treated as infinity.
+.SH SEE ALSO
+.BR epoll_create (2),
+.BR epoll_ctl (2),
+.BR epoll (7)
diff --git a/man2/eventfd.2 b/man2/eventfd.2
new file mode 100644
index 0000000..7be6a3a
--- /dev/null
+++ b/man2/eventfd.2
@@ -0,0 +1,443 @@
+'\" t
+.\" Copyright (C) 2008 Michael Kerrisk <mtk.manpages@gmail.com>
+.\" starting from a version by Davide Libenzi <davidel@xmailserver.org>
+.\"
+.\" SPDX-License-Identifier: GPL-2.0-or-later
+.\"
+.\" 2008-10-10, mtk: describe eventfd2(), and EFD_NONBLOCK and EFD_CLOEXEC
+.\"
+.TH eventfd 2 2023-07-20 "Linux man-pages 6.05.01"
+.SH NAME
+eventfd \- create a file descriptor for event notification
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/eventfd.h>
+.PP
+.BI "int eventfd(unsigned int " initval ", int " flags );
+.fi
+.SH DESCRIPTION
+.BR eventfd ()
+creates an "eventfd object" that can be used as
+an event wait/notify mechanism by user-space applications,
+and by the kernel to notify user-space applications of events.
+The object contains an unsigned 64-bit integer
+.RI ( uint64_t )
+counter that is maintained by the kernel.
+This counter is initialized with the value specified in the argument
+.IR initval .
+.PP
+As its return value,
+.BR eventfd ()
+returns a new file descriptor that can be used to refer to the
+eventfd object.
+.PP
+The following values may be bitwise ORed in
+.I flags
+to change the behavior of
+.BR eventfd ():
+.TP
+.BR EFD_CLOEXEC " (since Linux 2.6.27)"
+Set the close-on-exec
+.RB ( FD_CLOEXEC )
+flag on the new file descriptor.
+See the description of the
+.B O_CLOEXEC
+flag in
+.BR open (2)
+for reasons why this may be useful.
+.TP
+.BR EFD_NONBLOCK " (since Linux 2.6.27)"
+Set the
+.B O_NONBLOCK
+file status flag on the open file description (see
+.BR open (2))
+referred to by the new file descriptor.
+Using this flag saves extra calls to
+.BR fcntl (2)
+to achieve the same result.
+.TP
+.BR EFD_SEMAPHORE " (since Linux 2.6.30)"
+Provide semaphore-like semantics for reads from the new file descriptor.
+See below.
+.PP
+Up to Linux 2.6.26, the
+.I flags
+argument is unused, and must be specified as zero.
+.PP
+The following operations can be performed on the file descriptor returned by
+.BR eventfd ():
+.TP
+.BR read (2)
+Each successful
+.BR read (2)
+returns an 8-byte integer.
+A
+.BR read (2)
+fails with the error
+.B EINVAL
+if the size of the supplied buffer is less than 8 bytes.
+.IP
+The value returned by
+.BR read (2)
+is in host byte order\[em]that is,
+the native byte order for integers on the host machine.
+.IP
+The semantics of
+.BR read (2)
+depend on whether the eventfd counter currently has a nonzero value
+and whether the
+.B EFD_SEMAPHORE
+flag was specified when creating the eventfd file descriptor:
+.RS
+.IP \[bu] 3
+If
+.B EFD_SEMAPHORE
+was not specified and the eventfd counter has a nonzero value, then a
+.BR read (2)
+returns 8 bytes containing that value,
+and the counter's value is reset to zero.
+.IP \[bu]
+If
+.B EFD_SEMAPHORE
+was specified and the eventfd counter has a nonzero value, then a
+.BR read (2)
+returns 8 bytes containing the value 1,
+and the counter's value is decremented by 1.
+.IP \[bu]
+If the eventfd counter is zero at the time of the call to
+.BR read (2),
+then the call either blocks until the counter becomes nonzero
+(at which time, the
+.BR read (2)
+proceeds as described above)
+or fails with the error
+.B EAGAIN
+if the file descriptor has been made nonblocking.
+.RE
+.TP
+.BR write (2)
+A
+.BR write (2)
+call adds the 8-byte integer value supplied in its
+buffer to the counter.
+The maximum value that may be stored in the counter is the largest
+unsigned 64-bit value minus 1 (i.e., 0xfffffffffffffffe).
+If the addition would cause the counter's value to exceed
+the maximum, then the
+.BR write (2)
+either blocks until a
+.BR read (2)
+is performed on the file descriptor,
+or fails with the error
+.B EAGAIN
+if the file descriptor has been made nonblocking.
+.IP
+A
+.BR write (2)
+fails with the error
+.B EINVAL
+if the size of the supplied buffer is less than 8 bytes,
+or if an attempt is made to write the value 0xffffffffffffffff.
+.TP
+.BR poll "(2), " select "(2) (and similar)"
+The returned file descriptor supports
+.BR poll (2)
+(and analogously
+.BR epoll (7))
+and
+.BR select (2),
+as follows:
+.RS
+.IP \[bu] 3
+The file descriptor is readable
+(the
+.BR select (2)
+.I readfds
+argument; the
+.BR poll (2)
+.B POLLIN
+flag)
+if the counter has a value greater than 0.
+.IP \[bu]
+The file descriptor is writable
+(the
+.BR select (2)
+.I writefds
+argument; the
+.BR poll (2)
+.B POLLOUT
+flag)
+if it is possible to write a value of at least "1" without blocking.
+.IP \[bu]
+If an overflow of the counter value was detected,
+then
+.BR select (2)
+indicates the file descriptor as being both readable and writable, and
+.BR poll (2)
+returns a
+.B POLLERR
+event.
+As noted above,
+.BR write (2)
+can never overflow the counter.
+However, an overflow can occur if 2\[ha]64
+eventfd "signal posts" were performed by the KAIO
+subsystem (theoretically possible, but practically unlikely).
+If an overflow has occurred, then
+.BR read (2)
+will return that maximum
+.I uint64_t
+value (i.e., 0xffffffffffffffff).
+.RE
+.IP
+The eventfd file descriptor also supports the other file-descriptor
+multiplexing APIs:
+.BR pselect (2)
+and
+.BR ppoll (2).
+.TP
+.BR close (2)
+When the file descriptor is no longer required it should be closed.
+When all file descriptors associated with the same eventfd object
+have been closed, the resources for the object are freed by the kernel.
+.PP
+A copy of the file descriptor created by
+.BR eventfd ()
+is inherited by the child produced by
+.BR fork (2).
+The duplicate file descriptor is associated with the same
+eventfd object.
+File descriptors created by
+.BR eventfd ()
+are preserved across
+.BR execve (2),
+unless the close-on-exec flag has been set.
+.SH RETURN VALUE
+On success,
+.BR eventfd ()
+returns a new eventfd file descriptor.
+On error, \-1 is returned and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EINVAL
+An unsupported value was specified in
+.IR flags .
+.TP
+.B EMFILE
+The per-process limit on the number of open file descriptors has been reached.
+.TP
+.B ENFILE
+The system-wide limit on the total number of open files has been
+reached.
+.TP
+.B ENODEV
+.\" Note from Davide:
+.\" The ENODEV error is basically never going to happen if
+.\" the kernel boots correctly. That error happen only if during
+.\" the kernel initialization, some error occur in the anonymous
+.\" inode source initialization.
+Could not mount (internal) anonymous inode device.
+.TP
+.B ENOMEM
+There was insufficient memory to create a new
+eventfd file descriptor.
+.SH ATTRIBUTES
+For an explanation of the terms used in this section, see
+.BR attributes (7).
+.TS
+allbox;
+lbx lb lb
+l l l.
+Interface Attribute Value
+T{
+.na
+.nh
+.BR eventfd ()
+T} Thread safety MT-Safe
+.TE
+.sp 1
+.SH VERSIONS
+.SS C library/kernel differences
+There are two underlying Linux system calls:
+.BR eventfd ()
+and the more recent
+.BR eventfd2 ().
+The former system call does not implement a
+.I flags
+argument.
+The latter system call implements the
+.I flags
+values described above.
+The glibc wrapper function will use
+.BR eventfd2 ()
+where it is available.
+.SS Additional glibc features
+The GNU C library defines an additional type,
+and two functions that attempt to abstract some of the details of
+reading and writing on an eventfd file descriptor:
+.PP
+.in +4n
+.EX
+typedef uint64_t eventfd_t;
+\&
+int eventfd_read(int fd, eventfd_t *value);
+int eventfd_write(int fd, eventfd_t value);
+.EE
+.in
+.PP
+The functions perform the read and write operations on an
+eventfd file descriptor,
+returning 0 if the correct number of bytes was transferred,
+or \-1 otherwise.
+.SH STANDARDS
+Linux, GNU.
+.SH HISTORY
+.TP
+.BR eventfd ()
+Linux 2.6.22,
+glibc 2.8.
+.\" eventfd() is in glibc 2.7, but reportedly does not build
+.TP
+.BR eventfd2 ()
+Linux 2.6.27 (see VERSIONS).
+Since glibc 2.9, the
+.BR eventfd ()
+wrapper will employ the
+.BR eventfd2 ()
+system call, if it is supported by the kernel.
+.SH NOTES
+Applications can use an eventfd file descriptor instead of a pipe (see
+.BR pipe (2))
+in all cases where a pipe is used simply to signal events.
+The kernel overhead of an eventfd file descriptor
+is much lower than that of a pipe,
+and only one file descriptor is
+required (versus the two required for a pipe).
+.PP
+When used in the kernel, an eventfd
+file descriptor can provide a bridge from kernel to user space, allowing,
+for example, functionalities like KAIO (kernel AIO)
+.\" or eventually syslets/threadlets
+to signal to a file descriptor that some operation is complete.
+.PP
+A key point about an eventfd file descriptor is that it can be
+monitored just like any other file descriptor using
+.BR select (2),
+.BR poll (2),
+or
+.BR epoll (7).
+This means that an application can simultaneously monitor the
+readiness of "traditional" files and the readiness of other
+kernel mechanisms that support the eventfd interface.
+(Without the
+.BR eventfd ()
+interface, these mechanisms could not be multiplexed via
+.BR select (2),
+.BR poll (2),
+or
+.BR epoll (7).)
+.PP
+The current value of an eventfd counter can be viewed
+via the entry for the corresponding file descriptor in the process's
+.IR /proc/ pid /fdinfo
+directory.
+See
+.BR proc (5)
+for further details.
+.SH EXAMPLES
+The following program creates an eventfd file descriptor
+and then forks to create a child process.
+While the parent briefly sleeps,
+the child writes each of the integers supplied in the program's
+command-line arguments to the eventfd file descriptor.
+When the parent has finished sleeping,
+it reads from the eventfd file descriptor.
+.PP
+The following shell session shows a sample run of the program:
+.PP
+.in +4n
+.EX
+.RB "$" " ./a.out 1 2 4 7 14"
+Child writing 1 to efd
+Child writing 2 to efd
+Child writing 4 to efd
+Child writing 7 to efd
+Child writing 14 to efd
+Child completed write loop
+Parent about to read
+Parent read 28 (0x1c) from efd
+.EE
+.in
+.SS Program source
+\&
+.\" SRC BEGIN (eventfd.c)
+.EX
+#include <err.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/eventfd.h>
+#include <unistd.h>
+\&
+int
+main(int argc, char *argv[])
+{
+ int efd;
+ uint64_t u;
+ ssize_t s;
+\&
+ if (argc < 2) {
+ fprintf(stderr, "Usage: %s <num>...\en", argv[0]);
+ exit(EXIT_FAILURE);
+ }
+\&
+ efd = eventfd(0, 0);
+ if (efd == \-1)
+ err(EXIT_FAILURE, "eventfd");
+\&
+ switch (fork()) {
+ case 0:
+ for (size_t j = 1; j < argc; j++) {
+ printf("Child writing %s to efd\en", argv[j]);
+ u = strtoull(argv[j], NULL, 0);
+ /* strtoull() allows various bases */
+ s = write(efd, &u, sizeof(uint64_t));
+ if (s != sizeof(uint64_t))
+ err(EXIT_FAILURE, "write");
+ }
+ printf("Child completed write loop\en");
+\&
+ exit(EXIT_SUCCESS);
+\&
+ default:
+ sleep(2);
+\&
+ printf("Parent about to read\en");
+ s = read(efd, &u, sizeof(uint64_t));
+ if (s != sizeof(uint64_t))
+ err(EXIT_FAILURE, "read");
+ printf("Parent read %"PRIu64" (%#"PRIx64") from efd\en", u, u);
+ exit(EXIT_SUCCESS);
+\&
+ case \-1:
+ err(EXIT_FAILURE, "fork");
+ }
+}
+.EE
+.\" SRC END
+.SH SEE ALSO
+.BR futex (2),
+.BR pipe (2),
+.BR poll (2),
+.BR read (2),
+.BR select (2),
+.BR signalfd (2),
+.BR timerfd_create (2),
+.BR write (2),
+.BR epoll (7),
+.BR sem_overview (7)
diff --git a/man2/eventfd2.2 b/man2/eventfd2.2
new file mode 100644
index 0000000..eddfaa8
--- /dev/null
+++ b/man2/eventfd2.2
@@ -0,0 +1 @@
+.so man2/eventfd.2
diff --git a/man2/execve.2 b/man2/execve.2
new file mode 100644
index 0000000..ae1863c
--- /dev/null
+++ b/man2/execve.2
@@ -0,0 +1,884 @@
+.\" Copyright (c) 1992 Drew Eckhardt (drew@cs.colorado.edu), March 28, 1992
+.\" and Copyright (c) 2006 Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified by Michael Haardt <michael@moria.de>
+.\" Modified 1993-07-21 by Rik Faith <faith@cs.unc.edu>
+.\" Modified 1994-08-21 by Michael Chastain <mec@shell.portal.com>:
+.\" Modified 1997-01-31 by Eric S. Raymond <esr@thyrsus.com>
+.\" Modified 1999-11-12 by Urs Thuermann <urs@isnogud.escape.de>
+.\" Modified 2004-06-23 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\" 2006-09-04 Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Added list of process attributes that are not preserved on exec().
+.\" 2007-09-14 Ollie Wild <aaw@google.com>, mtk
+.\" Add text describing limits on command-line arguments + environment
+.\"
+.TH execve 2 2023-05-03 "Linux man-pages 6.05.01"
+.SH NAME
+execve \- execute program
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <unistd.h>
+.PP
+.BI "int execve(const char *" pathname ", char *const _Nullable " argv [],
+.BI " char *const _Nullable " envp []);
+.fi
+.SH DESCRIPTION
+.BR execve ()
+executes the program referred to by \fIpathname\fP.
+This causes the program that is currently being run by the calling process
+to be replaced with a new program, with newly initialized stack, heap,
+and (initialized and uninitialized) data segments.
+.PP
+\fIpathname\fP must be either a binary executable, or a script
+starting with a line of the form:
+.PP
+.in +4n
+.EX
+\fB#!\fP\fIinterpreter \fP[optional-arg]
+.EE
+.in
+.PP
+For details of the latter case, see "Interpreter scripts" below.
+.PP
+.I argv
+is an array of pointers to strings passed to the new program
+as its command-line arguments.
+By convention, the first of these strings (i.e.,
+.IR argv[0] )
+should contain the filename associated with the file being executed.
+The
+.I argv
+array must be terminated by a NULL pointer.
+(Thus, in the new program,
+.I argv[argc]
+will be NULL.)
+.PP
+.I envp
+is an array of pointers to strings, conventionally of the form
+.BR key=value ,
+which are passed as the environment of the new program.
+The
+.I envp
+array must be terminated by a NULL pointer.
+.PP
+This manual page describes the Linux system call in detail;
+for an overview of the nomenclature and the many, often preferable,
+standardised variants of this function provided by libc,
+including ones that search the
+.B PATH
+environment variable, see
+.BR exec (3).
+.PP
+The argument vector and environment can be accessed by the
+new program's main function, when it is defined as:
+.PP
+.in +4n
+.EX
+int main(int argc, char *argv[], char *envp[])
+.EE
+.in
+.PP
+Note, however, that the use of a third argument to the main function
+is not specified in POSIX.1;
+according to POSIX.1,
+the environment should be accessed via the external variable
+.BR environ (7).
+.PP
+.BR execve ()
+does not return on success, and the text, initialized data,
+uninitialized data (bss), and stack of the calling process are overwritten
+according to the contents of the newly loaded program.
+.PP
+If the current program is being ptraced, a \fBSIGTRAP\fP signal is sent to it
+after a successful
+.BR execve ().
+.PP
+If the set-user-ID bit is set on the program file referred to by
+\fIpathname\fP,
+then the effective user ID of the calling process is changed
+to that of the owner of the program file.
+Similarly, if the set-group-ID bit is set on the program file,
+then the effective group ID of the calling
+process is set to the group of the program file.
+.PP
+The aforementioned transformations of the effective IDs are
+.I not
+performed (i.e., the set-user-ID and set-group-ID bits are ignored)
+if any of the following is true:
+.IP \[bu] 3
+the
+.I no_new_privs
+attribute is set for the calling thread (see
+.BR prctl (2));
+.IP \[bu]
+the underlying filesystem is mounted
+.I nosuid
+(the
+.B MS_NOSUID
+flag for
+.BR mount (2));
+or
+.IP \[bu]
+the calling process is being ptraced.
+.PP
+The capabilities of the program file (see
+.BR capabilities (7))
+are also ignored if any of the above are true.
+.PP
+The effective user ID of the process is copied to the saved set-user-ID;
+similarly, the effective group ID is copied to the saved set-group-ID.
+This copying takes place after any effective ID changes that occur
+because of the set-user-ID and set-group-ID mode bits.
+.PP
+The process's real UID and real GID, as well as its supplementary group IDs,
+are unchanged by a call to
+.BR execve ().
+.PP
+If the executable is an a.out dynamically linked
+binary executable containing
+shared-library stubs, the Linux dynamic linker
+.BR ld.so (8)
+is called at the start of execution to bring
+needed shared objects into memory
+and link the executable with them.
+.PP
+If the executable is a dynamically linked ELF executable, the
+interpreter named in the PT_INTERP segment is used to load the needed
+shared objects.
+This interpreter is typically
+.I /lib/ld\-linux.so.2
+for binaries linked with glibc (see
+.BR ld\-linux.so (8)).
+.\"
+.SS Effect on process attributes
+All process attributes are preserved during an
+.BR execve (),
+except the following:
+.IP \[bu] 3
+The dispositions of any signals that are being caught are
+reset to the default
+.RB ( signal (7)).
+.IP \[bu]
+Any alternate signal stack is not preserved
+.RB ( sigaltstack (2)).
+.IP \[bu]
+Memory mappings are not preserved
+.RB ( mmap (2)).
+.IP \[bu]
+Attached System\ V shared memory segments are detached
+.RB ( shmat (2)).
+.IP \[bu]
+POSIX shared memory regions are unmapped
+.RB ( shm_open (3)).
+.IP \[bu]
+Open POSIX message queue descriptors are closed
+.RB ( mq_overview (7)).
+.IP \[bu]
+Any open POSIX named semaphores are closed
+.RB ( sem_overview (7)).
+.IP \[bu]
+POSIX timers are not preserved
+.RB ( timer_create (2)).
+.IP \[bu]
+Any open directory streams are closed
+.RB ( opendir (3)).
+.IP \[bu]
+Memory locks are not preserved
+.RB ( mlock (2),
+.BR mlockall (2)).
+.IP \[bu]
+Exit handlers are not preserved
+.RB ( atexit (3),
+.BR on_exit (3)).
+.IP \[bu]
+The floating-point environment is reset to the default (see
+.BR fenv (3)).
+.PP
+The process attributes in the preceding list are all specified
+in POSIX.1.
+The following Linux-specific process attributes are also
+not preserved during an
+.BR execve ():
+.IP \[bu] 3
+The process's "dumpable" attribute is set to the value 1,
+unless a set-user-ID program, a set-group-ID program,
+or a program with capabilities is being executed,
+in which case the dumpable flag may instead be reset to the value in
+.IR /proc/sys/fs/suid_dumpable ,
+in the circumstances described under
+.B PR_SET_DUMPABLE
+in
+.BR prctl (2).
+Note that changes to the "dumpable" attribute may cause ownership
+of files in the process's
+.IR /proc/ pid
+directory to change to
+.IR root:root ,
+as described in
+.BR proc (5).
+.IP \[bu]
+The
+.BR prctl (2)
+.B PR_SET_KEEPCAPS
+flag is cleared.
+.IP \[bu]
+(Since Linux 2.4.36 / 2.6.23)
+If a set-user-ID or set-group-ID program is being executed,
+then the parent death signal set by
+.BR prctl (2)
+.B PR_SET_PDEATHSIG
+is cleared.
+.IP \[bu]
+The process name, as set by
+.BR prctl (2)
+.B PR_SET_NAME
+(and displayed by
+.IR "ps\ \-o comm" ),
+is reset to the name of the new executable file.
+.IP \[bu]
+The
+.B SECBIT_KEEP_CAPS
+.I securebits
+flag is cleared.
+See
+.BR capabilities (7).
+.IP \[bu]
+The termination signal is reset to
+.B SIGCHLD
+(see
+.BR clone (2)).
+.IP \[bu]
+The file descriptor table is unshared, undoing the effect of the
+.B CLONE_FILES
+flag of
+.BR clone (2).
+.PP
+Note the following further points:
+.IP \[bu] 3
+All threads other than the calling thread are destroyed during an
+.BR execve ().
+Mutexes, condition variables, and other pthreads objects are not preserved.
+.IP \[bu]
+The equivalent of \fIsetlocale(LC_ALL, "C")\fP
+is executed at program start-up.
+.IP \[bu]
+POSIX.1 specifies that the dispositions of any signals that
+are ignored or set to the default are left unchanged.
+POSIX.1 specifies one exception: if
+.B SIGCHLD
+is being ignored,
+then an implementation may leave the disposition unchanged or
+reset it to the default; Linux does the former.
+.IP \[bu]
+Any outstanding asynchronous I/O operations are canceled
+.RB ( aio_read (3),
+.BR aio_write (3)).
+.IP \[bu]
+For the handling of capabilities during
+.BR execve (),
+see
+.BR capabilities (7).
+.IP \[bu]
+By default, file descriptors remain open across an
+.BR execve ().
+File descriptors that are marked close-on-exec are closed;
+see the description of
+.B FD_CLOEXEC
+in
+.BR fcntl (2).
+(If a file descriptor is closed, this will cause the release
+of all record locks obtained on the underlying file by this process.
+See
+.BR fcntl (2)
+for details.)
+POSIX.1 says that if file descriptors 0, 1, and 2 would
+otherwise be closed after a successful
+.BR execve (),
+and the process would gain privilege because the set-user-ID or
+set-group-ID mode bit was set on the executed file,
+then the system may open an unspecified file for each of these
+file descriptors.
+As a general principle, no portable program, whether privileged or not,
+can assume that these three file descriptors will remain
+closed across an
+.BR execve ().
+.\" On Linux it appears that these file descriptors are
+.\" always open after an execve(), and it looks like
+.\" Solaris 8 and FreeBSD 6.1 are the same. -- mtk, 30 Apr 2007
+.SS Interpreter scripts
+An interpreter script is a text file that has execute
+permission enabled and whose first line is of the form:
+.PP
+.in +4n
+.EX
+\fB#!\fP\fIinterpreter \fP[optional-arg]
+.EE
+.in
+.PP
+The
+.I interpreter
+must be a valid pathname for an executable file.
+.PP
+If the
+.I pathname
+argument of
+.BR execve ()
+specifies an interpreter script, then
+.I interpreter
+will be invoked with the following arguments:
+.PP
+.in +4n
+.EX
+\fIinterpreter\fP [optional-arg] \fIpathname\fP arg...
+.EE
+.in
+.PP
+where
+.I pathname
+is the pathname of the file specified as the first argument of
+.BR execve (),
+and
+.I arg...
+is the series of words pointed to by the
+.I argv
+argument of
+.BR execve (),
+starting at
+.IR argv[1] .
+Note that there is no way to get the
+.I argv[0]
+that was passed to the
+.BR execve ()
+call.
+.\" See the P - preserve-argv[0] option.
+.\" Documentation/admin-guide/binfmt-misc.rst
+.\" https://www.kernel.org/doc/html/latest/admin-guide/binfmt-misc.html
+.PP
+For portable use,
+.I optional-arg
+should either be absent, or be specified as a single word (i.e., it
+should not contain white space); see NOTES below.
+.PP
+Since Linux 2.6.28,
+.\" commit bf2a9a39639b8b51377905397a5005f444e9a892
+the kernel permits the interpreter of a script to itself be a script.
+This permission is recursive, up to a limit of four recursions,
+so that the interpreter may be a script which is interpreted by a script,
+and so on.
+.SS Limits on size of arguments and environment
+Most UNIX implementations impose some limit on the total size
+of the command-line argument
+.RI ( argv )
+and environment
+.RI ( envp )
+strings that may be passed to a new program.
+POSIX.1 allows an implementation to advertise this limit using the
+.B ARG_MAX
+constant (either defined in
+.I <limits.h>
+or available at run time using the call
+.IR "sysconf(_SC_ARG_MAX)" ).
+.PP
+Before Linux 2.6.23, the memory used to store the
+environment and argument strings was limited to 32 pages
+(defined by the kernel constant
+.BR MAX_ARG_PAGES ).
+On architectures with a 4-kB page size,
+this yields a maximum size of 128\ kB.
+.PP
+On Linux 2.6.23 and later, most architectures support a size limit
+derived from the soft
+.B RLIMIT_STACK
+resource limit (see
+.BR getrlimit (2))
+that is in force at the time of the
+.BR execve ()
+call.
+(Architectures with no memory management unit are excepted:
+they maintain the limit that was in effect before Linux 2.6.23.)
+This change allows programs to have a much larger
+argument and/or environment list.
+.\" For some background on the changes to ARG_MAX in Linux 2.6.23 and
+.\" Linux 2.6.25, see:
+.\" http://sourceware.org/bugzilla/show_bug.cgi?id=5786
+.\" http://bugzilla.kernel.org/show_bug.cgi?id=10095
+.\" http://thread.gmane.org/gmane.linux.kernel/646709/focus=648101,
+.\" checked into Linux 2.6.25 as commit a64e715fc74b1a7dcc5944f848acc38b2c4d4ee2.
+For these architectures, the total size is limited to 1/4 of the allowed
+stack size.
+(Imposing the 1/4-limit
+ensures that the new program always has some stack space.)
+.\" Ollie: That doesn't include the lists of pointers, though,
+.\" so the actual usage is a bit higher (1 pointer per argument).
+Additionally, the total size is limited to 3/4 of the value
+of the kernel constant
+.B _STK_LIM
+(8 MiB).
+Since Linux 2.6.25,
+the kernel also places a floor of 32 pages on this size limit,
+so that, even when
+.B RLIMIT_STACK
+is set very low,
+applications are guaranteed to have at least as much argument and
+environment space as was provided by Linux 2.6.22 and earlier.
+(This guarantee was not provided in Linux 2.6.23 and 2.6.24.)
+Additionally, the limit per string is 32 pages (the kernel constant
+.BR MAX_ARG_STRLEN ),
+and the maximum number of strings is 0x7FFFFFFF.
+.SH RETURN VALUE
+On success,
+.BR execve ()
+does not return; on error, \-1 is returned and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B E2BIG
+The total number of bytes in the environment
+.RI ( envp )
+and argument list
+.RI ( argv )
+is too large.
+.TP
+.B EACCES
+Search permission is denied on a component of the path prefix of
+.I pathname
+or the name of a script interpreter.
+(See also
+.BR path_resolution (7).)
+.TP
+.B EACCES
+The file or a script interpreter is not a regular file.
+.TP
+.B EACCES
+Execute permission is denied for the file or a script or ELF interpreter.
+.TP
+.B EACCES
+The filesystem is mounted
+.IR noexec .
+.TP
+.BR EAGAIN " (since Linux 3.1)"
+.\" commit 72fa59970f8698023045ab0713d66f3f4f96945c
+Having changed its real UID using one of the
+.BR set*uid ()
+calls, the caller was\[em]and is now still\[em]above its
+.B RLIMIT_NPROC
+resource limit (see
+.BR setrlimit (2)).
+For a more detailed explanation of this error, see NOTES.
+.TP
+.B EFAULT
+.I pathname
+or one of the pointers in the vectors
+.I argv
+or
+.I envp
+points outside your accessible address space.
+.TP
+.B EINVAL
+An ELF executable had more than one PT_INTERP segment (i.e., tried to
+name more than one interpreter).
+.TP
+.B EIO
+An I/O error occurred.
+.TP
+.B EISDIR
+An ELF interpreter was a directory.
+.TP
+.B ELIBBAD
+An ELF interpreter was not in a recognized format.
+.TP
+.B ELOOP
+Too many symbolic links were encountered in resolving
+.I pathname
+or the name of a script or ELF interpreter.
+.TP
+.B ELOOP
+The maximum recursion limit was reached during recursive script
+interpretation (see "Interpreter scripts", above).
+Before Linux 3.8,
+.\" commit d740269867021faf4ce38a449353d2b986c34a67
+the error produced for this case was
+.BR ENOEXEC .
+.TP
+.B EMFILE
+The per-process limit on the number of open file descriptors has been reached.
+.TP
+.B ENAMETOOLONG
+.I pathname
+is too long.
+.TP
+.B ENFILE
+The system-wide limit on the total number of open files has been reached.
+.TP
+.B ENOENT
+The file
+.I pathname
+or a script or ELF interpreter does not exist.
+.TP
+.B ENOEXEC
+An executable is not in a recognized format, is for the wrong
+architecture, or has some other format error that means it cannot be
+executed.
+.TP
+.B ENOMEM
+Insufficient kernel memory was available.
+.TP
+.B ENOTDIR
+A component of the path prefix of
+.I pathname
+or a script or ELF interpreter is not a directory.
+.TP
+.B EPERM
+The filesystem is mounted
+.IR nosuid ,
+the user is not the superuser,
+and the file has the set-user-ID or set-group-ID bit set.
+.TP
+.B EPERM
+The process is being traced, the user is not the superuser and the
+file has the set-user-ID or set-group-ID bit set.
+.TP
+.B EPERM
+A "capability-dumb" application would not obtain the full set of
+permitted capabilities granted by the executable file.
+See
+.BR capabilities (7).
+.TP
+.B ETXTBSY
+The specified executable was open for writing by one or more processes.
+.SH VERSIONS
+POSIX does not document the #! behavior, but it exists
+(with some variations) on other UNIX systems.
+.PP
+On Linux,
+.I argv
+and
+.I envp
+can be specified as NULL.
+In both cases, this has the same effect as specifying the argument
+as a pointer to a list containing a single null pointer.
+.B "Do not take advantage of this nonstandard and nonportable misfeature!"
+On many other UNIX systems, specifying
+.I argv
+as NULL will result in an error
+.RB ( EFAULT ).
+.I Some
+other UNIX systems treat the
+.I envp==NULL
+case the same as Linux.
+.\" e.g., EFAULT on Solaris 8 and FreeBSD 6.1; but
+.\" HP-UX 11 is like Linux -- mtk, Apr 2007
+.\" Bug filed 30 Apr 2007: http://bugzilla.kernel.org/show_bug.cgi?id=8408
+.\" Bug rejected (because fix would constitute an ABI change).
+.\"
+.PP
+POSIX.1 says that values returned by
+.BR sysconf (3)
+should be invariant over the lifetime of a process.
+However, since Linux 2.6.23, if the
+.B RLIMIT_STACK
+resource limit changes, then the value reported by
+.B _SC_ARG_MAX
+will also change,
+to reflect the fact that the limit on space for holding
+command-line arguments and environment variables has changed.
+.\"
+.SS Interpreter scripts
+The kernel imposes a maximum length on the text that follows the
+"#!" characters at the start of a script;
+characters beyond the limit are ignored.
+Before Linux 5.1, the limit is 127 characters.
+Since Linux 5.1,
+.\" commit 6eb3c3d0a52dca337e327ae8868ca1f44a712e02
+the limit is 255 characters.
+.PP
+The semantics of the
+.I optional-arg
+argument of an interpreter script vary across implementations.
+On Linux, the entire string following the
+.I interpreter
+name is passed as a single argument to the interpreter,
+and this string can include white space.
+However, behavior differs on some other systems.
+Some systems
+.\" e.g., Solaris 8
+use the first white space to terminate
+.IR optional-arg .
+On some systems,
+.\" e.g., FreeBSD before 6.0, but not FreeBSD 6.0 onward
+an interpreter script can have multiple arguments,
+and white spaces in
+.I optional-arg
+are used to delimit the arguments.
+.PP
+Linux (like most other modern UNIX systems)
+ignores the set-user-ID and set-group-ID bits on scripts.
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+POSIX.1-2001, SVr4, 4.3BSD.
+.\" SVr4 documents additional error
+.\" conditions EAGAIN, EINTR, ELIBACC, ENOLINK, EMULTIHOP; POSIX does not
+.\" document ETXTBSY, EPERM, EFAULT, ELOOP, EIO, ENFILE, EMFILE, EINVAL,
+.\" EISDIR or ELIBBAD error conditions.
+.PP
+With UNIX\ V6, the argument list of an
+.BR exec ()
+call was ended by 0,
+while the argument list of
+.I main
+was ended by \-1.
+Thus, this argument list was not directly usable in a further
+.BR exec ()
+call.
+Since UNIX\ V7, both are NULL.
+.SH NOTES
+One sometimes sees
+.BR execve ()
+(and the related functions described in
+.BR exec (3))
+described as "executing a
+.I new
+process" (or similar).
+This is a highly misleading description:
+there is no new process;
+many attributes of the calling process remain unchanged
+(in particular, its PID).
+All that
+.BR execve ()
+does is arrange for an existing process (the calling process)
+to execute a new program.
+.PP
+Set-user-ID and set-group-ID processes cannot be
+.BR ptrace (2)d.
+.PP
+The result of mounting a filesystem
+.I nosuid
+varies across Linux kernel versions:
+some will refuse execution of set-user-ID and set-group-ID
+executables when this would
+give the user powers they did not have already (and return
+.BR EPERM ),
+some will just ignore the set-user-ID and set-group-ID bits and
+.BR exec ()
+successfully.
+.PP
+In most cases where
+.BR execve ()
+fails, control returns to the original executable image,
+and the caller of
+.BR execve ()
+can then handle the error.
+However, in (rare) cases (typically caused by resource exhaustion),
+failure may occur past the point of no return:
+the original executable image has been torn down,
+but the new image could not be completely built.
+In such cases, the kernel kills the process with a
+.\" commit 19d860a140beac48a1377f179e693abe86a9dac9
+.B SIGSEGV
+.RB ( SIGKILL
+until Linux 3.17)
+signal.
+.SS execve() and EAGAIN
+A more detailed explanation of the
+.B EAGAIN
+error that can occur (since Linux 3.1) when calling
+.BR execve ()
+is as follows.
+.PP
+The
+.B EAGAIN
+error can occur when a
+.I preceding
+call to
+.BR setuid (2),
+.BR setreuid (2),
+or
+.BR setresuid (2)
+caused the real user ID of the process to change,
+and that change caused the process to exceed its
+.B RLIMIT_NPROC
+resource limit (i.e., the number of processes belonging
+to the new real UID exceeds the resource limit).
+From Linux 2.6.0 to Linux 3.0, this caused the
+.BR set*uid ()
+call to fail.
+(Before Linux 2.6,
+.\" commit 909cc4ae86f3380152a18e2a3c44523893ee11c4
+the resource limit was not imposed on processes that
+changed their user IDs.)
+.PP
+Since Linux 3.1, the scenario just described no longer causes the
+.BR set*uid ()
+call to fail,
+because it too often led to security holes where buggy applications
+didn't check the return status and assumed
+that\[em]if the caller had root privileges\[em]the call would always succeed.
+Instead, the
+.BR set*uid ()
+calls now successfully change the real UID,
+but the kernel sets an internal flag, named
+.BR PF_NPROC_EXCEEDED ,
+to note that the
+.B RLIMIT_NPROC
+resource limit has been exceeded.
+If the
+.B PF_NPROC_EXCEEDED
+flag is set and the resource limit is still
+exceeded at the time of a subsequent
+.BR execve ()
+call, that call fails with the error
+.BR EAGAIN .
+This kernel logic ensures that the
+.B RLIMIT_NPROC
+resource limit is still enforced for the
+common privileged daemon workflow\[em]namely,
+.BR fork (2)
++
+.BR set*uid ()
++
+.BR execve ().
+.PP
+If the resource limit is no longer exceeded at the time of the
+.BR execve ()
+call
+(because other processes belonging to this real UID terminated between the
+.BR set*uid ()
+call and the
+.BR execve ()
+call), then the
+.BR execve ()
+call succeeds and the kernel clears the
+.B PF_NPROC_EXCEEDED
+process flag.
+The flag is also cleared if a subsequent call to
+.BR fork (2)
+by this process succeeds.
+.\"
+.\" .SH BUGS
+.\" Some Linux versions have failed to check permissions on ELF
+.\" interpreters. This is a security hole, because it allows users to
+.\" open any file, such as a rewinding tape device, for reading. Some
+.\" Linux versions have also had other security holes in
+.\" .BR execve ()
+.\" that could be exploited for denial of service by a suitably crafted
+.\" ELF binary. There are no known problems with Linux 2.0.34 or Linux 2.2.15.
+.SH EXAMPLES
+The following program is designed to be execed by the second program below.
+It just echoes its command-line arguments, one per line.
+.PP
+.in +4n
+.\" SRC BEGIN (myecho.c)
+.EX
+/* myecho.c */
+\&
+#include <stdio.h>
+#include <stdlib.h>
+\&
+int
+main(int argc, char *argv[])
+{
+ for (size_t j = 0; j < argc; j++)
+ printf("argv[%zu]: %s\en", j, argv[j]);
+\&
+ exit(EXIT_SUCCESS);
+}
+.EE
+.\" SRC END
+.in
+.PP
+This program can be used to exec the program named in its command-line
+argument:
+.PP
+.in +4n
+.\" SRC BEGIN (execve.c)
+.EX
+/* execve.c */
+\&
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+\&
+int
+main(int argc, char *argv[])
+{
+ static char *newargv[] = { NULL, "hello", "world", NULL };
+ static char *newenviron[] = { NULL };
+\&
+ if (argc != 2) {
+ fprintf(stderr, "Usage: %s <file\-to\-exec>\en", argv[0]);
+ exit(EXIT_FAILURE);
+ }
+\&
+ newargv[0] = argv[1];
+\&
+ execve(argv[1], newargv, newenviron);
+ perror("execve"); /* execve() returns only on error */
+ exit(EXIT_FAILURE);
+}
+.EE
+.\" SRC END
+.in
+.PP
+We can use the second program to exec the first as follows:
+.PP
+.in +4n
+.EX
+.RB "$" " cc myecho.c \-o myecho"
+.RB "$" " cc execve.c \-o execve"
+.RB "$" " ./execve ./myecho"
+argv[0]: ./myecho
+argv[1]: hello
+argv[2]: world
+.EE
+.in
+.PP
+We can also use these programs to demonstrate the use of a script
+interpreter.
+To do this we create a script whose "interpreter" is our
+.I myecho
+program:
+.PP
+.in +4n
+.EX
+.RB "$" " cat > script"
+.B #!./myecho script\-arg
+.B \[ha]D
+.RB "$" " chmod +x script"
+.EE
+.in
+.PP
+We can then use our program to exec the script:
+.PP
+.in +4n
+.EX
+.RB "$" " ./execve ./script"
+argv[0]: ./myecho
+argv[1]: script\-arg
+argv[2]: ./script
+argv[3]: hello
+argv[4]: world
+.EE
+.in
+.SH SEE ALSO
+.BR chmod (2),
+.BR execveat (2),
+.BR fork (2),
+.BR get_robust_list (2),
+.BR ptrace (2),
+.BR exec (3),
+.BR fexecve (3),
+.BR getauxval (3),
+.BR getopt (3),
+.BR system (3),
+.BR capabilities (7),
+.BR credentials (7),
+.BR environ (7),
+.BR path_resolution (7),
+.BR ld.so (8)
diff --git a/man2/execveat.2 b/man2/execveat.2
new file mode 100644
index 0000000..22c468a
--- /dev/null
+++ b/man2/execveat.2
@@ -0,0 +1,220 @@
+.\" Copyright (c) 2014 Google, Inc., written by David Drysdale
+.\" and Copyright (c) 2015, Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.TH execveat 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+execveat \- execute program relative to a directory file descriptor
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#include <linux/fcntl.h>" " /* Definition of " AT_* " constants */"
+.B #include <unistd.h>
+.PP
+.BI "int execveat(int " dirfd ", const char *" pathname ,
+.BI " char *const _Nullable " argv [],
+.BI " char *const _Nullable " envp [],
+.BI " int " flags );
+.fi
+.SH DESCRIPTION
+.\" commit 51f39a1f0cea1cacf8c787f652f26dfee9611874
+The
+.BR execveat ()
+system call executes the program referred to by the combination of
+.I dirfd
+and
+.IR pathname .
+It operates in exactly the same way as
+.BR execve (2),
+except for the differences described in this manual page.
+.PP
+If the pathname given in
+.I pathname
+is relative, then it is interpreted relative to the directory
+referred to by the file descriptor
+.I dirfd
+(rather than relative to the current working directory of
+the calling process, as is done by
+.BR execve (2)
+for a relative pathname).
+.PP
+If
+.I pathname
+is relative and
+.I dirfd
+is the special value
+.BR AT_FDCWD ,
+then
+.I pathname
+is interpreted relative to the current working
+directory of the calling process (like
+.BR execve (2)).
+.PP
+If
+.I pathname
+is absolute, then
+.I dirfd
+is ignored.
+.PP
+If
+.I pathname
+is an empty string and the
+.B AT_EMPTY_PATH
+flag is specified, then the file descriptor
+.I dirfd
+specifies the file to be executed (i.e.,
+.I dirfd
+refers to an executable file, rather than a directory).
+.PP
+The
+.I flags
+argument is a bit mask that can include zero or more of the following flags:
+.TP
+.B AT_EMPTY_PATH
+If
+.I pathname
+is an empty string, operate on the file referred to by
+.I dirfd
+(which may have been obtained using the
+.BR open (2)
+.B O_PATH
+flag).
+.TP
+.B AT_SYMLINK_NOFOLLOW
+If the file identified by
+.I dirfd
+and a non-NULL
+.I pathname
+is a symbolic link, then the call fails with the error
+.BR ELOOP .
+.SH RETURN VALUE
+On success,
+.BR execveat ()
+does not return.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+The same errors that occur for
+.BR execve (2)
+can also occur for
+.BR execveat ().
+The following additional errors can occur for
+.BR execveat ():
+.TP
+.B EBADF
+.I pathname
+is relative but
+.I dirfd
+is neither
+.B AT_FDCWD
+nor a valid file descriptor.
+.TP
+.B EINVAL
+Invalid flag specified in
+.IR flags .
+.TP
+.B ELOOP
+.I flags
+includes
+.B AT_SYMLINK_NOFOLLOW
+and the file identified by
+.I dirfd
+and a non-NULL
+.I pathname
+is a symbolic link.
+.TP
+.B ENOENT
+The program identified by
+.I dirfd
+and
+.I pathname
+requires the use of an interpreter program
+(such as a script starting with "#!"), but the file descriptor
+.I dirfd
+was opened with the
+.B O_CLOEXEC
+flag, with the result that
+the program file is inaccessible to the launched interpreter.
+See BUGS.
+.TP
+.B ENOTDIR
+.I pathname
+is relative and
+.I dirfd
+is a file descriptor referring to a file other than a directory.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 3.19,
+glibc 2.34.
+.SH NOTES
+In addition to the reasons explained in
+.BR openat (2),
+the
+.BR execveat ()
+system call is also needed to allow
+.BR fexecve (3)
+to be implemented on systems that do not have the
+.I /proc
+filesystem mounted.
+.PP
+When asked to execute a script file, the
+.I argv[0]
+that is passed to the script interpreter is a string of the form
+.I /dev/fd/N
+or
+.IR /dev/fd/N/P ,
+where
+.I N
+is the number of the file descriptor passed via the
+.I dirfd
+argument.
+A string of the first form occurs when
+.B AT_EMPTY_PATH
+is employed.
+A string of the second form occurs when the script is specified via both
+.I dirfd
+and
+.IR pathname ;
+in this case,
+.I P
+is the value given in
+.IR pathname .
+.PP
+For the same reasons described in
+.BR fexecve (3),
+the natural idiom when using
+.BR execveat ()
+is to set the close-on-exec flag on
+.IR dirfd .
+(But see BUGS.)
+.SH BUGS
+The
+.B ENOENT
+error described above means that it is not possible to set the
+close-on-exec flag on the file descriptor given to a call of the form:
+.PP
+.in +4n
+.EX
+execveat(fd, "", argv, envp, AT_EMPTY_PATH);
+.EE
+.in
+.PP
+However, the inability to set the close-on-exec flag means that a file
+descriptor referring to the script leaks through to the script itself.
+As well as wasting a file descriptor,
+this leakage can lead to file-descriptor exhaustion in scenarios
+where scripts recursively employ
+.BR execveat ().
+.\" For an example, see Michael Kerrisk's 2015-01-10 reply in this LKML
+.\" thread (http://thread.gmane.org/gmane.linux.kernel/1836105/focus=20229):
+.\"
+.\" Subject: [PATCHv10 man-pages 5/5] execveat.2: initial man page for execveat(2)
+.\" Date: Mon, 24 Nov 2014 11:53:59 +0000
+.SH SEE ALSO
+.BR execve (2),
+.BR openat (2),
+.BR fexecve (3)
diff --git a/man2/exit.2 b/man2/exit.2
new file mode 100644
index 0000000..9f9d2e7
--- /dev/null
+++ b/man2/exit.2
@@ -0,0 +1 @@
+.so man2/_exit.2
diff --git a/man2/exit_group.2 b/man2/exit_group.2
new file mode 100644
index 0000000..3515406
--- /dev/null
+++ b/man2/exit_group.2
@@ -0,0 +1,38 @@
+.\" Copyright (C) 2004 Andries Brouwer (aeb@cwi.nl)
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.TH exit_group 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+exit_group \- exit all threads in a process
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */"
+.B #include <unistd.h>
+.PP
+.BI "[[noreturn]] void syscall(SYS_exit_group, int " status );
+.fi
+.PP
+.IR Note :
+glibc provides no wrapper for
+.BR exit_group (),
+necessitating the use of
+.BR syscall (2).
+.SH DESCRIPTION
+This system call terminates all threads
+in the calling process's thread group.
+.SH RETURN VALUE
+This system call does not return.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 2.5.35.
+.SH NOTES
+Since glibc 2.3, this is the system call invoked when the
+.BR _exit (2)
+wrapper function is called.
+.SH SEE ALSO
+.BR _exit (2)
diff --git a/man2/faccessat.2 b/man2/faccessat.2
new file mode 100644
index 0000000..9d4f76e
--- /dev/null
+++ b/man2/faccessat.2
@@ -0,0 +1 @@
+.so man2/access.2
diff --git a/man2/faccessat2.2 b/man2/faccessat2.2
new file mode 100644
index 0000000..9d4f76e
--- /dev/null
+++ b/man2/faccessat2.2
@@ -0,0 +1 @@
+.so man2/access.2
diff --git a/man2/fadvise64.2 b/man2/fadvise64.2
new file mode 100644
index 0000000..53f54a1
--- /dev/null
+++ b/man2/fadvise64.2
@@ -0,0 +1 @@
+.so man2/posix_fadvise.2
diff --git a/man2/fadvise64_64.2 b/man2/fadvise64_64.2
new file mode 100644
index 0000000..53f54a1
--- /dev/null
+++ b/man2/fadvise64_64.2
@@ -0,0 +1 @@
+.so man2/posix_fadvise.2
diff --git a/man2/fallocate.2 b/man2/fallocate.2
new file mode 100644
index 0000000..e462658
--- /dev/null
+++ b/man2/fallocate.2
@@ -0,0 +1,481 @@
+.\" Copyright (c) 2007 Silicon Graphics, Inc. All Rights Reserved
+.\" Written by Dave Chinner <dgc@sgi.com>
+.\"
+.\" SPDX-License-Identifier: GPL-2.0-only
+.\"
+.\" 2011-09-19: Added FALLOC_FL_PUNCH_HOLE
+.\" 2011-09-19: Substantial restructuring of the page
+.\"
+.TH fallocate 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+fallocate \- manipulate file space
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#define _GNU_SOURCE" " /* See feature_test_macros(7) */"
+.B #include <fcntl.h>
+.PP
+.BI "int fallocate(int " fd ", int " mode ", off_t " offset \
+", off_t " len ");"
+.fi
+.SH DESCRIPTION
+This is a nonportable, Linux-specific system call.
+For the portable, POSIX.1-specified method of ensuring that space
+is allocated for a file, see
+.BR posix_fallocate (3).
+.PP
+.BR fallocate ()
+allows the caller to directly manipulate the allocated disk space
+for the file referred to by
+.I fd
+for the byte range starting at
+.I offset
+and continuing for
+.I len
+bytes.
+.PP
+The
+.I mode
+argument determines the operation to be performed on the given range.
+Details of the supported operations are given in the subsections below.
+.SS Allocating disk space
+The default operation (i.e.,
+.I mode
+is zero) of
+.BR fallocate ()
+allocates the disk space within the range specified by
+.I offset
+and
+.IR len .
+The file size (as reported by
+.BR stat (2))
+will be changed if
+.IR offset + len
+is greater than the file size.
+Any subregion within the range specified by
+.I offset
+and
+.I len
+that did not contain data before the call will be initialized to zero.
+This default behavior closely resembles the behavior of the
+.BR posix_fallocate (3)
+library function,
+and is intended as a method of optimally implementing that function.
+.PP
+After a successful call, subsequent writes into the range specified by
+.I offset
+and
+.I len
+are guaranteed not to fail because of lack of disk space.
+.PP
+If the
+.B FALLOC_FL_KEEP_SIZE
+flag is specified in
+.IR mode ,
+the behavior of the call is similar,
+but the file size will not be changed even if
+.IR offset + len
+is greater than the file size.
+Preallocating zeroed blocks beyond the end of the file in this manner
+is useful for optimizing append workloads.
+.PP
+If the
+.B FALLOC_FL_UNSHARE_RANGE
+flag is specified in
+.IR mode ,
+shared file data extents will be made private to the file to guarantee
+that a subsequent write will not fail due to lack of space.
+Typically, this will be done by performing a copy-on-write operation on
+all shared data in the file.
+This flag may not be supported by all filesystems.
+.PP
+Because allocation is done in block size chunks,
+.BR fallocate ()
+may allocate a larger range of disk space than was specified.
+.SS Deallocating file space
+Specifying the
+.B FALLOC_FL_PUNCH_HOLE
+flag (available since Linux 2.6.38) in
+.I mode
+deallocates space (i.e., creates a hole)
+in the byte range starting at
+.I offset
+and continuing for
+.I len
+bytes.
+Within the specified range, partial filesystem blocks are zeroed,
+and whole filesystem blocks are removed from the file.
+After a successful call,
+subsequent reads from this range will return zeros.
+.PP
+The
+.B FALLOC_FL_PUNCH_HOLE
+flag must be ORed with
+.B FALLOC_FL_KEEP_SIZE
+in
+.IR mode ;
+in other words, even when punching off the end of the file, the file size
+(as reported by
+.BR stat (2))
+does not change.
+.PP
+Not all filesystems support
+.BR FALLOC_FL_PUNCH_HOLE ;
+if a filesystem doesn't support the operation, an error is returned.
+The operation is supported on at least the following filesystems:
+.IP \[bu] 3
+XFS (since Linux 2.6.38)
+.IP \[bu]
+ext4 (since Linux 3.0)
+.\" commit a4bb6b64e39abc0e41ca077725f2a72c868e7622
+.IP \[bu]
+Btrfs (since Linux 3.7)
+.IP \[bu]
+.BR tmpfs (5)
+(since Linux 3.5)
+.\" commit 83e4fa9c16e4af7122e31be3eca5d57881d236fe
+.IP \[bu]
+.BR gfs2 (5)
+(since Linux 4.16)
+.\" commit 4e56a6411fbce6f859566e17298114c2434391a4
+.SS Collapsing file space
+.\" commit 00f5e61998dd17f5375d9dfc01331f104b83f841
+Specifying the
+.B FALLOC_FL_COLLAPSE_RANGE
+flag (available since Linux 3.15) in
+.I mode
+removes a byte range from a file, without leaving a hole.
+The byte range to be collapsed starts at
+.I offset
+and continues for
+.I len
+bytes.
+At the completion of the operation,
+the contents of the file starting at the location
+.IR offset + len
+will be appended at the location
+.IR offset ,
+and the file will be
+.I len
+bytes smaller.
+.PP
+A filesystem may place limitations on the granularity of the operation,
+in order to ensure efficient implementation.
+Typically,
+.I offset
+and
+.I len
+must be a multiple of the filesystem logical block size,
+which varies according to the filesystem type and configuration.
+If a filesystem has such a requirement,
+.BR fallocate ()
+fails with the error
+.B EINVAL
+if this requirement is violated.
+.PP
+If the region specified by
+.I offset
+plus
+.I len
+reaches or passes the end of file, an error is returned;
+instead, use
+.BR ftruncate (2)
+to truncate a file.
+.PP
+No other flags may be specified in
+.I mode
+in conjunction with
+.BR FALLOC_FL_COLLAPSE_RANGE .
+.PP
+As at Linux 3.15,
+.B FALLOC_FL_COLLAPSE_RANGE
+is supported by
+ext4 (only for extent-based files)
+.\" commit 9eb79482a97152930b113b51dff530aba9e28c8e
+and XFS.
+.\" commit e1d8fb88a64c1f8094b9f6c3b6d2d9e6719c970d
+.SS Zeroing file space
+Specifying the
+.B FALLOC_FL_ZERO_RANGE
+flag (available since Linux 3.15)
+.\" commit 409332b65d3ed8cfa7a8030f1e9d52f372219642
+in
+.I mode
+zeros space in the byte range starting at
+.I offset
+and continuing for
+.I len
+bytes.
+Within the specified range, blocks are preallocated for the regions
+that span the holes in the file.
+After a successful call, subsequent
+reads from this range will return zeros.
+.PP
+Zeroing is done within the filesystem preferably by converting the range into
+unwritten extents.
+This approach means that the specified range will not be physically zeroed
+out on the device (except for partial blocks at either end of the range),
+and I/O is (otherwise) required only to update metadata.
+.PP
+If the
+.B FALLOC_FL_KEEP_SIZE
+flag is additionally specified in
+.IR mode ,
+the behavior of the call is similar,
+but the file size will not be changed even if
+.IR offset + len
+is greater than the file size.
+This behavior is the same as when preallocating space with
+.B FALLOC_FL_KEEP_SIZE
+specified.
+.PP
+Not all filesystems support
+.BR FALLOC_FL_ZERO_RANGE ;
+if a filesystem doesn't support the operation, an error is returned.
+The operation is supported on at least the following filesystems:
+.IP \[bu] 3
+XFS (since Linux 3.15)
+.\" commit 376ba313147b4172f3e8cf620b9fb591f3e8cdfa
+.IP \[bu]
+ext4, for extent-based files (since Linux 3.15)
+.\" commit b8a8684502a0fc852afa0056c6bb2a9273f6fcc0
+.IP \[bu]
+SMB3 (since Linux 3.17)
+.\" commit 30175628bf7f521e9ee31ac98fa6d6fe7441a556
+.IP \[bu]
+Btrfs (since Linux 4.16)
+.\" commit f27451f229966874a8793995b8e6b74326d125df
+.SS Increasing file space
+Specifying the
+.B FALLOC_FL_INSERT_RANGE
+flag
+(available since Linux 4.1)
+.\" commit dd46c787788d5bf5b974729d43e4c405814a4c7d
+in
+.I mode
+increases the file space by inserting a hole within the file size without
+overwriting any existing data.
+The hole will start at
+.I offset
+and continue for
+.I len
+bytes.
+When inserting the hole inside a file, the contents of the file starting at
+.I offset
+will be shifted upward (i.e., to a higher file offset) by
+.I len
+bytes.
+Inserting a hole inside a file increases the file size by
+.I len
+bytes.
+.PP
+This mode has the same limitations as
+.B FALLOC_FL_COLLAPSE_RANGE
+regarding the granularity of the operation.
+If the granularity requirements are not met,
+.BR fallocate ()
+fails with the error
+.BR EINVAL .
+If the
+.I offset
+is equal to or greater than the end of file, an error is returned.
+For such operations (i.e., inserting a hole at the end of file),
+.BR ftruncate (2)
+should be used.
+.PP
+No other flags may be specified in
+.I mode
+in conjunction with
+.BR FALLOC_FL_INSERT_RANGE .
+.PP
+.B FALLOC_FL_INSERT_RANGE
+requires filesystem support.
+Filesystems that support this operation include
+XFS (since Linux 4.1)
+.\" commit a904b1ca5751faf5ece8600e18cd3b674afcca1b
+and ext4 (since Linux 4.2).
+.\" commit 331573febb6a224bc50322e3670da326cb7f4cfc
+.\" f2fs also has support since Linux 4.2
+.\" commit f62185d0e283e9d311e3ac1020f159d95f0aab39
+.SH RETURN VALUE
+On success,
+.BR fallocate ()
+returns zero.
+On error, \-1 is returned and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EBADF
+.I fd
+is not a valid file descriptor, or is not opened for writing.
+.TP
+.B EFBIG
+.IR offset + len
+exceeds the maximum file size.
+.TP
+.B EFBIG
+.I mode
+is
+.BR FALLOC_FL_INSERT_RANGE ,
+and the current file size+\fIlen\fP exceeds the maximum file size.
+.TP
+.B EINTR
+A signal was caught during execution; see
+.BR signal (7).
+.TP
+.B EINVAL
+.I offset
+was less than 0, or
+.I len
+.\" FIXME . (raise a kernel bug) Probably the len==0 case should be
+.\" a no-op, rather than an error. That would be consistent with
+.\" similar APIs for the len==0 case.
+.\" See "Re: [PATCH] fallocate.2: add FALLOC_FL_PUNCH_HOLE flag definition"
+.\" 21 Sep 2012
+.\" http://thread.gmane.org/gmane.linux.file-systems/48331/focus=1193526
+was less than or equal to 0.
+.TP
+.B EINVAL
+.I mode
+is
+.B FALLOC_FL_COLLAPSE_RANGE
+and the range specified by
+.I offset
+plus
+.I len
+reaches or passes the end of the file.
+.TP
+.B EINVAL
+.I mode
+is
+.B FALLOC_FL_INSERT_RANGE
+and the range specified by
+.I offset
+reaches or passes the end of the file.
+.TP
+.B EINVAL
+.I mode
+is
+.B FALLOC_FL_COLLAPSE_RANGE
+or
+.BR FALLOC_FL_INSERT_RANGE ,
+but either
+.I offset
+or
+.I len
+is not a multiple of the filesystem block size.
+.TP
+.B EINVAL
+.I mode
+contains one of
+.B FALLOC_FL_COLLAPSE_RANGE
+or
+.B FALLOC_FL_INSERT_RANGE
+and also other flags;
+no other flags are permitted with
+.B FALLOC_FL_COLLAPSE_RANGE
+or
+.BR FALLOC_FL_INSERT_RANGE .
+.TP
+.B EINVAL
+.I mode
+is
+.BR FALLOC_FL_COLLAPSE_RANGE ,
+.BR FALLOC_FL_ZERO_RANGE ,
+or
+.BR FALLOC_FL_INSERT_RANGE ,
+but the file referred to by
+.I fd
+is not a regular file.
+.\" There was an inconsistency in Linux 3.15-rc1, that should be resolved so that all
+.\" filesystems use this error for this case. (Tytso says ext4 will change.)
+.\" http://thread.gmane.org/gmane.comp.file-systems.xfs.general/60485/focus=5521
+.\" From: Michael Kerrisk (man-pages <mtk.manpages@...>
+.\" Subject: Re: [PATCH v5 10/10] manpage: update FALLOC_FL_COLLAPSE_RANGE flag in fallocate
+.\" Newsgroups: gmane.linux.man, gmane.linux.file-systems
+.\" Date: 2014-04-17 13:40:05 GMT
+.TP
+.B EIO
+An I/O error occurred while reading from or writing to a filesystem.
+.TP
+.B ENODEV
+.I fd
+does not refer to a regular file or a directory.
+(If
+.I fd
+is a pipe or FIFO, a different error results.)
+.TP
+.B ENOSPC
+There is not enough space left on the device containing the file
+referred to by
+.IR fd .
+.TP
+.B ENOSYS
+This kernel does not implement
+.BR fallocate ().
+.TP
+.B EOPNOTSUPP
+The filesystem containing the file referred to by
+.I fd
+does not support this operation;
+or the
+.I mode
+is not supported by the filesystem containing the file referred to by
+.IR fd .
+.TP
+.B EPERM
+The file referred to by
+.I fd
+is marked immutable (see
+.BR chattr (1)).
+.TP
+.B EPERM
+.I mode
+specifies
+.BR FALLOC_FL_PUNCH_HOLE ,
+.BR FALLOC_FL_COLLAPSE_RANGE ,
+or
+.B FALLOC_FL_INSERT_RANGE
+and
+the file referred to by
+.I fd
+is marked append-only
+(see
+.BR chattr (1)).
+.TP
+.B EPERM
+The operation was prevented by a file seal; see
+.BR fcntl (2).
+.TP
+.B ESPIPE
+.I fd
+refers to a pipe or FIFO.
+.TP
+.B ETXTBSY
+.I mode
+specifies
+.B FALLOC_FL_COLLAPSE_RANGE
+or
+.BR FALLOC_FL_INSERT_RANGE ,
+but the file referred to by
+.I fd
+is currently being executed.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+.TP
+.BR fallocate ()
+Linux 2.6.23,
+glibc 2.10.
+.TP
+.B FALLOC_FL_*
+glibc 2.18.
+.\" See http://sourceware.org/bugzilla/show_bug.cgi?id=14964
+.SH SEE ALSO
+.BR fallocate (1),
+.BR ftruncate (2),
+.BR posix_fadvise (2),
+.BR posix_fallocate (3)
diff --git a/man2/fanotify_init.2 b/man2/fanotify_init.2
new file mode 100644
index 0000000..f48e43a
--- /dev/null
+++ b/man2/fanotify_init.2
@@ -0,0 +1,542 @@
+.\" Copyright (C) 2013, Heinrich Schuchardt <xypron.glpk@gmx.de>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.TH fanotify_init 2 2023-07-08 "Linux man-pages 6.05.01"
+.SH NAME
+fanotify_init \- create and initialize fanotify group
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#include <fcntl.h>" " /* Definition of " O_* " constants */"
+.B #include <sys/fanotify.h>
+.PP
+.BI "int fanotify_init(unsigned int " flags ", unsigned int " event_f_flags );
+.fi
+.SH DESCRIPTION
+For an overview of the fanotify API, see
+.BR fanotify (7).
+.PP
+.BR fanotify_init ()
+initializes a new fanotify group and returns a file descriptor for the event
+queue associated with the group.
+.PP
+The file descriptor is used in calls to
+.BR fanotify_mark (2)
+to specify the files, directories, mounts, or filesystems for which fanotify
+events shall be created.
+These events are received by reading from the file descriptor.
+Some events are only informative, indicating that a file has been accessed.
+Other events can be used to determine whether
+another application is permitted to access a file or directory.
+Permission to access filesystem objects is granted by writing to the file
+descriptor.
+.PP
+Multiple programs may be using the fanotify interface at the same time to
+monitor the same files.
+.PP
+The number of fanotify groups per user is limited.
+See
+.BR fanotify (7)
+for details about this limit.
+.PP
+The
+.I flags
+argument contains a multi-bit field defining the notification class of the
+listening application and further single bit fields specifying the behavior
+of the file descriptor.
+.PP
+If multiple listeners for permission events exist,
+the notification class is used to establish the sequence
+in which the listeners receive the events.
+.PP
+Only one of the following notification classes may be specified in
+.IR flags :
+.TP
+.B FAN_CLASS_PRE_CONTENT
+This value allows the receipt of events notifying that a file has been
+accessed and events for permission decisions if a file may be accessed.
+It is intended for event listeners that need to access files before they
+contain their final data.
+This notification class might be used by hierarchical storage managers,
+for example.
+Use of this flag requires the
+.B CAP_SYS_ADMIN
+capability.
+.TP
+.B FAN_CLASS_CONTENT
+This value allows the receipt of events notifying that a file has been
+accessed and events for permission decisions if a file may be accessed.
+It is intended for event listeners that need to access files when they
+already contain their final content.
+This notification class might be used by malware detection programs, for
+example.
+Use of this flag requires the
+.B CAP_SYS_ADMIN
+capability.
+.TP
+.B FAN_CLASS_NOTIF
+This is the default value.
+It does not need to be specified.
+This value only allows the receipt of events notifying that a file has been
+accessed.
+Permission decisions before the file is accessed are not possible.
+.PP
+Listeners with different notification classes will receive events in the
+order
+.BR FAN_CLASS_PRE_CONTENT ,
+.BR FAN_CLASS_CONTENT ,
+.BR FAN_CLASS_NOTIF .
+The order of notification for listeners in the same notification class
+is undefined.
+.PP
+The following bits can additionally be set in
+.IR flags :
+.TP
+.B FAN_CLOEXEC
+Set the close-on-exec flag
+.RB ( FD_CLOEXEC )
+on the new file descriptor.
+See the description of the
+.B O_CLOEXEC
+flag in
+.BR open (2).
+.TP
+.B FAN_NONBLOCK
+Enable the nonblocking flag
+.RB ( O_NONBLOCK )
+for the file descriptor.
+Reading from the file descriptor will not block.
+Instead, if no data is available,
+.BR read (2)
+fails with the error
+.BR EAGAIN .
+.TP
+.B FAN_UNLIMITED_QUEUE
+Remove the limit on the number of events in the event queue.
+See
+.BR fanotify (7)
+for details about this limit.
+Use of this flag requires the
+.B CAP_SYS_ADMIN
+capability.
+.TP
+.B FAN_UNLIMITED_MARKS
+Remove the limit on the number of fanotify marks per user.
+See
+.BR fanotify (7)
+for details about this limit.
+Use of this flag requires the
+.B CAP_SYS_ADMIN
+capability.
+.TP
+.BR FAN_REPORT_TID " (since Linux 4.20)"
+.\" commit d0a6a87e40da49cfc7954c491d3065a25a641b29
+Report thread ID (TID) instead of process ID (PID)
+in the
+.I pid
+field of the
+.I "struct fanotify_event_metadata"
+supplied to
+.BR read (2)
+(see
+.BR fanotify (7)).
+Use of this flag requires the
+.B CAP_SYS_ADMIN
+capability.
+.TP
+.BR FAN_ENABLE_AUDIT " (since Linux 4.15)"
+.\" commit de8cd83e91bc3ee212b3e6ec6e4283af9e4ab269
+Enable generation of audit log records about access mediation performed by
+permission events.
+The permission event response has to be marked with the
+.B FAN_AUDIT
+flag for an audit log record to be generated.
+Use of this flag requires the
+.B CAP_AUDIT_WRITE
+capability.
+.TP
+.BR FAN_REPORT_FID " (since Linux 5.1)"
+.\" commit a8b13aa20afb69161b5123b4f1acc7ea0a03d360
+This value allows the receipt of events which contain additional information
+about the underlying filesystem object correlated to an event.
+An additional record of type
+.B FAN_EVENT_INFO_TYPE_FID
+encapsulates the information about the object and is included alongside the
+generic event metadata structure.
+The file descriptor that is used to represent the object correlated to an
+event is instead substituted with a file handle.
+It is intended for applications that may find the use of a file handle to
+identify an object more suitable than a file descriptor.
+Additionally, it may be used for applications monitoring a directory or a
+filesystem that are interested in the directory entry modification events
+.BR FAN_CREATE ,
+.BR FAN_DELETE ,
+.BR FAN_MOVE ,
+and
+.BR FAN_RENAME ,
+or in events such as
+.BR FAN_ATTRIB ,
+.BR FAN_DELETE_SELF ,
+and
+.BR FAN_MOVE_SELF .
+All the events above require an fanotify group that identifies filesystem
+objects by file handles.
+Note that without the flag
+.BR FAN_REPORT_TARGET_FID ,
+for the directory entry modification events,
+there is an information record that identifies the modified directory
+and not the created/deleted/moved child object.
+The use of
+.B FAN_CLASS_CONTENT
+or
+.B FAN_CLASS_PRE_CONTENT
+is not permitted with this flag and will result in the error
+.BR EINVAL .
+See
+.BR fanotify (7)
+for additional details.
+.TP
+.BR FAN_REPORT_DIR_FID " (since Linux 5.9)"
+.\" commit 83b7a59896dd24015a34b7f00027f0ff3747972f
+Events for fanotify groups initialized with this flag will contain
+(see exceptions below) additional information about a directory object
+correlated to an event.
+An additional record of type
+.B FAN_EVENT_INFO_TYPE_DFID
+encapsulates the information about the directory object and is included
+alongside the generic event metadata structure.
+For events that occur on a non-directory object, the additional structure
+includes a file handle that identifies the parent directory filesystem object.
+Note that there is no guarantee that the directory filesystem object will be
+found at the location described by the file handle information at the time
+the event is received.
+When combined with the flag
+.BR FAN_REPORT_FID ,
+two records may be reported with events that occur on a non-directory object,
+one to identify the non-directory object itself and one to identify the parent
+directory object.
+Note that in some cases, a filesystem object does not have a parent,
+for example, when an event occurs on an unlinked but open file.
+In that case, with the
+.B FAN_REPORT_FID
+flag, the event will be reported with only one record to identify the
+non-directory object itself, because there is no directory associated with
+the event.
+Without the
+.B FAN_REPORT_FID
+flag, no event will be reported.
+See
+.BR fanotify (7)
+for additional details.
+.TP
+.BR FAN_REPORT_NAME " (since Linux 5.9)"
+.\" commit 929943b38daf817f2e6d303ea04401651fc3bc05
+Events for fanotify groups initialized with this flag will contain additional
+information about the name of the directory entry correlated to an event.
+This flag must be provided in conjunction with the flag
+.BR FAN_REPORT_DIR_FID .
+Providing this flag value without
+.B FAN_REPORT_DIR_FID
+will result in the error
+.BR EINVAL .
+This flag may be combined with the flag
+.BR FAN_REPORT_FID .
+An additional record of type
+.BR FAN_EVENT_INFO_TYPE_DFID_NAME ,
+which encapsulates the information about the directory entry, is included
+alongside the generic event metadata structure and substitutes the additional
+information record of type
+.BR FAN_EVENT_INFO_TYPE_DFID .
+The additional record includes a file handle that identifies a directory
+filesystem object followed by a name that identifies an entry in that
+directory.
+For the directory entry modification events
+.BR FAN_CREATE ,
+.BR FAN_DELETE ,
+and
+.BR FAN_MOVE ,
+the reported name is that of the created/deleted/moved directory entry.
+The event
+.B FAN_RENAME
+may contain two information records.
+One of type
+.B FAN_EVENT_INFO_TYPE_OLD_DFID_NAME
+identifying the old directory entry,
+and another of type
+.B FAN_EVENT_INFO_TYPE_NEW_DFID_NAME
+identifying the new directory entry.
+For other events that occur on a directory object, the reported file handle
+is that of the directory object itself and the reported name is '.'.
+For other events that occur on a non-directory object, the reported file handle
+is that of the parent directory object and the reported name is the name of a
+directory entry where the object was located at the time of the event.
+The rationale behind this logic is that the reported directory file handle can
+be passed to
+.BR open_by_handle_at (2)
+to get an open directory file descriptor and that file descriptor along with
+the reported name can be used to call
+.BR fstatat (2).
+The same rule that applies to record type
+.B FAN_EVENT_INFO_TYPE_DFID
+also applies to record type
+.BR FAN_EVENT_INFO_TYPE_DFID_NAME :
+if a non-directory object has no parent, either the event will not be reported
+or it will be reported without the directory entry information.
+Note that there is no guarantee that the filesystem object will be found at the
+location described by the directory entry information at the time the event is
+received.
+See
+.BR fanotify (7)
+for additional details.
+.TP
+.B FAN_REPORT_DFID_NAME
+This is a synonym for
+.RB ( FAN_REPORT_DIR_FID | FAN_REPORT_NAME ).
+.TP
+.BR FAN_REPORT_TARGET_FID " (since Linux 5.17)"
+.\" commit d61fd650e9d206a71fda789f02a1ced4b19944c4
+Events for fanotify groups initialized with this flag
+will contain additional information about the child
+correlated with directory entry modification events.
+This flag must be provided in conjunction with the flags
+.BR FAN_REPORT_FID ,
+.B FAN_REPORT_DIR_FID
+and
+.BR FAN_REPORT_NAME ,
+or else the error
+.B EINVAL
+will be returned.
+For the directory entry modification events
+.BR FAN_CREATE ,
+.BR FAN_DELETE ,
+.BR FAN_MOVE ,
+and
+.BR FAN_RENAME ,
+an additional record of type
+.B FAN_EVENT_INFO_TYPE_FID
+is reported in addition to the information records of type
+.BR FAN_EVENT_INFO_TYPE_DFID ,
+.BR FAN_EVENT_INFO_TYPE_DFID_NAME ,
+.BR FAN_EVENT_INFO_TYPE_OLD_DFID_NAME ,
+and
+.BR FAN_EVENT_INFO_TYPE_NEW_DFID_NAME .
+The additional record includes a file handle
+that identifies the filesystem child object
+that the directory entry is referring to.
+.TP
+.B FAN_REPORT_DFID_NAME_TARGET
+This is a synonym for
+.RB ( FAN_REPORT_DFID_NAME | FAN_REPORT_FID | FAN_REPORT_TARGET_FID ).
+.TP
+.BR FAN_REPORT_PIDFD " (since Linux 5.15)"
+.\" commit af579beb666aefb17e9a335c12c788c92932baf1
+Events for fanotify groups initialized with this flag will contain
+an additional information record alongside the generic
+.I fanotify_event_metadata
+structure.
+This information record will be of type
+.B FAN_EVENT_INFO_TYPE_PIDFD
+and will contain a pidfd for the process that
+was responsible for generating an event.
+A pidfd returned in this information record object is
+no different to the pidfd that is returned when calling
+.BR pidfd_open (2).
+This information record is intended for applications that
+may be interested in reliably determining whether
+the process responsible for generating an event
+has been recycled or terminated.
+The use of the
+.B FAN_REPORT_TID
+flag along with
+.B FAN_REPORT_PIDFD
+is currently not supported and
+attempting to do so will result in the error
+.B EINVAL
+being returned.
+This limitation is currently imposed by the pidfd API
+as it currently only supports
+the creation of pidfds for thread-group leaders.
+Creating pidfds for non-thread-group leaders
+may be supported at some point in the future,
+so this restriction may eventually be lifted.
+For more details on information records,
+see
+.BR fanotify (7).
+.PP
+The
+.I event_f_flags
+argument
+defines the file status flags that will be set on the open file descriptions
+that are created for fanotify events.
+For details of these flags, see the description of the
+.I flags
+values in
+.BR open (2).
+.I event_f_flags
+includes a multi-bit field for the access mode.
+This field can take the following values:
+.TP
+.B O_RDONLY
+This value allows only read access.
+.TP
+.B O_WRONLY
+This value allows only write access.
+.TP
+.B O_RDWR
+This value allows read and write access.
+.PP
+Additional bits can be set in
+.IR event_f_flags .
+The most useful values are:
+.TP
+.B O_LARGEFILE
+Enable support for files exceeding 2\ GB.
+Failing to set this flag will result in an
+.B EOVERFLOW
+error when trying to open a large file which is monitored by
+an fanotify group on a 32-bit system.
+.TP
+.BR O_CLOEXEC " (since Linux 3.18)"
+.\" commit 0b37e097a648aa71d4db1ad108001e95b69a2da4
+Enable the close-on-exec flag for the file descriptor.
+See the description of the
+.B O_CLOEXEC
+flag in
+.BR open (2)
+for reasons why this may be useful.
+.PP
+The following are also allowable:
+.BR O_APPEND ,
+.BR O_DSYNC ,
+.BR O_NOATIME ,
+.BR O_NONBLOCK ,
+and
+.BR O_SYNC .
+Specifying any other flag in
+.I event_f_flags
+yields the error
+.B EINVAL
+(but see BUGS).
+.SH RETURN VALUE
+On success,
+.BR fanotify_init ()
+returns a new file descriptor.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EINVAL
+An invalid value was passed in
+.I flags
+or
+.IR event_f_flags .
+.B FAN_ALL_INIT_FLAGS
+(deprecated since Linux 4.20)
+.\" commit 23c9deeb3285d34fd243abb3d6b9f07db60c3cf4
+defines all allowable bits for
+.IR flags .
+.TP
+.B EMFILE
+The number of fanotify groups for this user exceeds the limit.
+See
+.BR fanotify (7)
+for details about this limit.
+.TP
+.B EMFILE
+The per-process limit on the number of open file descriptors has been reached.
+.TP
+.B ENOMEM
+The allocation of memory for the notification group failed.
+.TP
+.B ENOSYS
+This kernel does not implement
+.BR fanotify_init ().
+The fanotify API is available only if the kernel was configured with
+.BR CONFIG_FANOTIFY .
+.TP
+.B EPERM
+The operation is not permitted because the caller lacks a required capability.
+.SH VERSIONS
+Prior to Linux 5.13,
+.\" commit 7cea2a3c505e87a9d6afc78be4a7f7be636a73a7
+calling
+.BR fanotify_init ()
+required the
+.B CAP_SYS_ADMIN
+capability.
+Since Linux 5.13,
+.\" commit 7cea2a3c505e87a9d6afc78be4a7f7be636a73a7
+users may call
+.BR fanotify_init ()
+without the
+.B CAP_SYS_ADMIN
+capability to create and initialize
+an fanotify group with limited functionality.
+.PP
+The limitations imposed on an event listener created by a user without the
+.B CAP_SYS_ADMIN
+capability are as follows:
+.RS
+.IP \[bu] 3
+The user cannot request an unlimited event queue by using
+.BR FAN_UNLIMITED_QUEUE .
+.IP \[bu]
+The user cannot request an unlimited number of marks by using
+.BR FAN_UNLIMITED_MARKS .
+.IP \[bu]
+The user cannot request to use either of the notification classes
+.B FAN_CLASS_CONTENT
+or
+.BR FAN_CLASS_PRE_CONTENT .
+This means that the user cannot request permission events.
+.IP \[bu]
+The user is required to create a group that identifies filesystem objects by
+file handles, for example, by providing the
+.B FAN_REPORT_FID
+flag.
+.IP \[bu]
+The user is limited to marking only inodes.
+The ability to mark a mount or filesystem via
+.BR fanotify_mark (2)
+through the use of
+.B FAN_MARK_MOUNT
+or
+.B FAN_MARK_FILESYSTEM
+is not permitted.
+.IP \[bu]
+The event object in the event queue is limited in terms of the information
+that is made available to the unprivileged user.
+A user will also not receive the pid that generated the event, unless the
+listening process itself generated the event.
+.RE
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 2.6.37.
+.\" was introduced in Linux 2.6.36 and enabled in Linux 2.6.37.
+.SH BUGS
+The following bug was present before Linux 3.18:
+.IP \[bu] 3
+.\" Fixed by commit 0b37e097a648aa71d4db1ad108001e95b69a2da4
+The
+.B O_CLOEXEC
+flag is ignored when passed in
+.IR event_f_flags .
+.PP
+The following bug was present before Linux 3.14:
+.IP \[bu] 3
+.\" Fixed by commit 48149e9d3a7e924010a0daab30a6197b7d7b6580
+The
+.I event_f_flags
+argument is not checked for invalid flags.
+Flags that are intended only for internal use,
+such as
+.BR FMODE_EXEC ,
+can be set, and will consequently be set for the file descriptors
+returned when reading from the fanotify file descriptor.
+.SH SEE ALSO
+.BR fanotify_mark (2),
+.BR fanotify (7)
diff --git a/man2/fanotify_mark.2 b/man2/fanotify_mark.2
new file mode 100644
index 0000000..d1f7eec
--- /dev/null
+++ b/man2/fanotify_mark.2
@@ -0,0 +1,843 @@
+.\" Copyright (C) 2013, Heinrich Schuchardt <xypron.glpk@gmx.de>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.TH fanotify_mark 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+fanotify_mark \- add, remove, or modify an fanotify mark on a filesystem
+object
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/fanotify.h>
+.PP
+.BI "int fanotify_mark(int " fanotify_fd ", unsigned int " flags ,
+.BI " uint64_t " mask ", int " dirfd ,
+.BI " const char *_Nullable " pathname );
+.fi
+.SH DESCRIPTION
+For an overview of the fanotify API, see
+.BR fanotify (7).
+.PP
+.BR fanotify_mark ()
+adds, removes, or modifies an fanotify mark on a filesystem object.
+The caller must have read permission on the filesystem object that
+is to be marked.
+.PP
+The
+.I fanotify_fd
+argument is a file descriptor returned by
+.BR fanotify_init (2).
+.PP
+.I flags
+is a bit mask describing the modification to perform.
+It must include exactly one of the following values:
+.TP
+.B FAN_MARK_ADD
+The events in
+.I mask
+will be added to the mark mask (or to the ignore mask).
+.I mask
+must be nonempty or the error
+.B EINVAL
+will occur.
+.TP
+.B FAN_MARK_REMOVE
+The events in argument
+.I mask
+will be removed from the mark mask (or from the ignore mask).
+.I mask
+must be nonempty or the error
+.B EINVAL
+will occur.
+.TP
+.B FAN_MARK_FLUSH
+Remove either all marks for filesystems, all marks for mounts, or all
+marks for directories and files from the fanotify group.
+If
+.I flags
+contains
+.BR FAN_MARK_MOUNT ,
+all marks for mounts are removed from the group.
+If
+.I flags
+contains
+.BR FAN_MARK_FILESYSTEM ,
+all marks for filesystems are removed from the group.
+Otherwise, all marks for directories and files are removed.
+No flag other than, and at most one of, the flags
+.B FAN_MARK_MOUNT
+or
+.B FAN_MARK_FILESYSTEM
+can be used in conjunction with
+.BR FAN_MARK_FLUSH .
+.I mask
+is ignored.
+.PP
+If none of the values above is specified, or more than one is specified,
+the call fails with the error
+.BR EINVAL .
+.PP
+In addition,
+zero or more of the following values may be ORed into
+.IR flags :
+.TP
+.B FAN_MARK_DONT_FOLLOW
+If
+.I pathname
+is a symbolic link, mark the link itself, rather than the file to which it
+refers.
+(By default,
+.BR fanotify_mark ()
+dereferences
+.I pathname
+if it is a symbolic link.)
+.TP
+.B FAN_MARK_ONLYDIR
+If the filesystem object to be marked is not a directory, the error
+.B ENOTDIR
+shall be raised.
+.TP
+.B FAN_MARK_MOUNT
+Mark the mount specified by
+.IR pathname .
+If
+.I pathname
+is not itself a mount point, the mount containing
+.I pathname
+will be marked.
+All directories, subdirectories, and the contained files of the mount
+will be monitored.
+The events which require that filesystem objects are identified by file handles,
+such as
+.BR FAN_CREATE ,
+.BR FAN_ATTRIB ,
+.BR FAN_MOVE ,
+and
+.BR FAN_DELETE_SELF ,
+cannot be provided as a
+.I mask
+when
+.I flags
+contains
+.BR FAN_MARK_MOUNT .
+Attempting to do so will result in the error
+.B EINVAL
+being returned.
+Use of this flag requires the
+.B CAP_SYS_ADMIN
+capability.
+.TP
+.BR FAN_MARK_FILESYSTEM " (since Linux 4.20)"
+.\" commit d54f4fba889b205e9cd8239182ca5d27d0ac3bc2
+Mark the filesystem specified by
+.IR pathname .
+The filesystem containing
+.I pathname
+will be marked.
+All the contained files and directories of the filesystem from any mount
+point will be monitored.
+Use of this flag requires the
+.B CAP_SYS_ADMIN
+capability.
+.TP
+.B FAN_MARK_IGNORED_MASK
+The events in
+.I mask
+shall be added to or removed from the ignore mask.
+Note that the flags
+.B FAN_ONDIR
+and
+.B FAN_EVENT_ON_CHILD
+have no effect when provided with this flag.
+The effect of setting the flags
+.B FAN_ONDIR
+and
+.B FAN_EVENT_ON_CHILD
+in the mark mask
+on the events that are set in the ignore mask
+is undefined and depends on the Linux kernel version.
+Specifically, prior to Linux 5.9,
+.\" commit 497b0c5a7c0688c1b100a9c2e267337f677c198e
+setting a mark mask on a file
+and a mark with ignore mask on its parent directory
+would not result in ignoring events on the file,
+regardless of the
+.B FAN_EVENT_ON_CHILD
+flag in the parent directory's mark mask.
+When the ignore mask is updated with the
+.B FAN_MARK_IGNORED_MASK
+flag
+on a mark that was previously updated with the
+.B FAN_MARK_IGNORE
+flag,
+the update fails with
+.B EEXIST
+error.
+.TP
+.BR FAN_MARK_IGNORE " (since Linux 6.0)"
+.\" commit e252f2ed1c8c6c3884ab5dd34e003ed21f1fe6e0
+This flag has an effect similar to setting the
+.B FAN_MARK_IGNORED_MASK
+flag.
+The events in
+.I mask
+shall be added to or removed from the ignore mask.
+Unlike the
+.B FAN_MARK_IGNORED_MASK
+flag,
+this flag also has the effect that the
+.B FAN_ONDIR
+and
+.B FAN_EVENT_ON_CHILD
+flags take effect on the ignore mask.
+Specifically, unless the
+.B FAN_ONDIR
+flag is set with
+.BR FAN_MARK_IGNORE ,
+events on directories will not be ignored.
+If the flag
+.B FAN_EVENT_ON_CHILD
+is set with
+.BR FAN_MARK_IGNORE ,
+events on children will be ignored.
+For example,
+a mark on a directory with combination of
+a mask with
+.B FAN_CREATE
+event
+and
+.B FAN_ONDIR
+flag
+and an ignore mask with
+.B FAN_CREATE
+event
+and without
+.B FAN_ONDIR
+flag,
+will result in getting only
+the events for creation of sub-directories.
+When using the
+.B FAN_MARK_IGNORE
+flag to add to an ignore mask
+of a mount,
+filesystem,
+or directory inode mark,
+the
+.B FAN_MARK_IGNORED_SURV_MODIFY
+flag must be specified.
+Failure to do so will result in the
+.B EINVAL
+or
+.B EISDIR
+error.
+.TP
+.B FAN_MARK_IGNORED_SURV_MODIFY
+The ignore mask shall survive modify events.
+If this flag is not set,
+the ignore mask is cleared when a modify event occurs
+on the marked object.
+Omitting this flag is typically used to suppress events
+(e.g.,
+.BR FAN_OPEN )
+for a specific file,
+until that specific file's content has been modified.
+It is far less useful to suppress events
+on an entire filesystem,
+or mount,
+or on all files inside a directory,
+until some file's content has been modified.
+For this reason,
+the
+.B FAN_MARK_IGNORE
+flag requires the
+.B FAN_MARK_IGNORED_SURV_MODIFY
+flag on a mount,
+filesystem,
+or directory inode mark.
+This flag cannot be removed from a mark once set.
+When the ignore mask is updated without this flag
+on a mark that was previously updated with the
+.B FAN_MARK_IGNORE
+and
+.B FAN_MARK_IGNORED_SURV_MODIFY
+flags,
+the update fails with
+.B EEXIST
+error.
+.TP
+.B FAN_MARK_IGNORE_SURV
+This is a synonym for
+.RB ( FAN_MARK_IGNORE | FAN_MARK_IGNORED_SURV_MODIFY ).
+.TP
+.BR FAN_MARK_EVICTABLE " (since Linux 5.19)"
+.\" commit 5f9d3bd520261fd7a850818c71809fd580e0f30c
+When an inode mark is created with this flag,
+the inode object will not be pinned to the inode cache,
+therefore,
+allowing the inode object to be evicted from the inode cache
+when the memory pressure on the system is high.
+The eviction of the inode object
+results in the evictable mark also being lost.
+When the mask of an evictable inode mark is updated
+without using the
+.B FAN_MARK_EVICTABLE
+flag,
+the marked inode is pinned to inode cache
+and the mark is no longer evictable.
+When the mask of a non-evictable inode mark is updated
+with the
+.B FAN_MARK_EVICTABLE
+flag,
+the inode mark remains non-evictable
+and the update fails with
+.B EEXIST
+error.
+Mounts and filesystems are not evictable objects,
+therefore,
+an attempt to create a mount mark or a filesystem mark
+with the
+.B FAN_MARK_EVICTABLE
+flag,
+will result in the error
+.BR EINVAL .
+For example,
+inode marks can be used in combination with mount marks
+to reduce the number of events from noninteresting paths.
+The event listener reads events,
+checks if the path reported in the event is of interest,
+and if it is not,
+the listener sets a mark with an ignore mask on the directory.
+Evictable inode marks allow using this method for a large number of directories
+without the concern of pinning all inodes and exhausting the system's memory.
+.PP
+.I mask
+defines which events shall be listened for (or which shall be ignored).
+It is a bit mask composed of the following values:
+.TP
+.B FAN_ACCESS
+Create an event when a file or directory (but see BUGS) is accessed (read).
+.TP
+.B FAN_MODIFY
+Create an event when a file is modified (write).
+.TP
+.B FAN_CLOSE_WRITE
+Create an event when a writable file is closed.
+.TP
+.B FAN_CLOSE_NOWRITE
+Create an event when a read-only file or directory is closed.
+.TP
+.B FAN_OPEN
+Create an event when a file or directory is opened.
+.TP
+.BR FAN_OPEN_EXEC " (since Linux 5.0)"
+.\" commit 9b076f1c0f4869b838a1b7aa0edb5664d47ec8aa
+Create an event when a file is opened with the intent to be executed.
+See NOTES for additional details.
+.TP
+.BR FAN_ATTRIB " (since Linux 5.1)"
+.\" commit 235328d1fa4251c6dcb32351219bb553a58838d2
+Create an event when the metadata for a file or directory has changed.
+An fanotify group that identifies filesystem objects by file handles
+is required.
+.TP
+.BR FAN_CREATE " (since Linux 5.1)"
+.\" commit 235328d1fa4251c6dcb32351219bb553a58838d2
+Create an event when a file or directory has been created in a marked
+parent directory.
+An fanotify group that identifies filesystem objects by file handles
+is required.
+.TP
+.BR FAN_DELETE " (since Linux 5.1)"
+.\" commit 235328d1fa4251c6dcb32351219bb553a58838d2
+Create an event when a file or directory has been deleted in a marked
+parent directory.
+An fanotify group that identifies filesystem objects by file handles
+is required.
+.TP
+.BR FAN_DELETE_SELF " (since Linux 5.1)"
+.\" commit 235328d1fa4251c6dcb32351219bb553a58838d2
+Create an event when a marked file or directory itself is deleted.
+An fanotify group that identifies filesystem objects by file handles
+is required.
+.TP
+.BR FAN_FS_ERROR " (since Linux 5.16)"
+.\" commit 9709bd548f11a092d124698118013f66e1740f9b
+Create an event when a filesystem error
+leading to inconsistent filesystem metadata is detected.
+An additional information record of type
+.B FAN_EVENT_INFO_TYPE_ERROR
+is returned for each event in the read buffer.
+An fanotify group that identifies filesystem objects by file handles
+is required.
+.IP
+Events of such type are dependent on support
+from the underlying filesystem.
+At the time of writing,
+only the
+.B ext4
+filesystem reports
+.B FAN_FS_ERROR
+events.
+.IP
+See
+.BR fanotify (7)
+for additional details.
+.TP
+.BR FAN_MOVED_FROM " (since Linux 5.1)"
+.\" commit 235328d1fa4251c6dcb32351219bb553a58838d2
+Create an event when a file or directory has been moved from a marked
+parent directory.
+An fanotify group that identifies filesystem objects by file handles
+is required.
+.TP
+.BR FAN_MOVED_TO " (since Linux 5.1)"
+.\" commit 235328d1fa4251c6dcb32351219bb553a58838d2
+Create an event when a file or directory has been moved to a marked parent
+directory.
+An fanotify group that identifies filesystem objects by file handles
+is required.
+.TP
+.BR FAN_RENAME " (since Linux 5.17)"
+.\" commit 8cc3b1ccd930fe6971e1527f0c4f1bdc8cb56026
+This event contains the same information provided by events
+.B FAN_MOVED_FROM
+and
+.BR FAN_MOVED_TO ,
+but is represented by a single event with up to two information records.
+An fanotify group that identifies filesystem objects by file handles
+is required.
+If the filesystem object to be marked is not a directory, the error
+.B ENOTDIR
+shall be raised.
+.TP
+.BR FAN_MOVE_SELF " (since Linux 5.1)"
+.\" commit 235328d1fa4251c6dcb32351219bb553a58838d2
+Create an event when a marked file or directory itself has been moved.
+An fanotify group that identifies filesystem objects by file handles
+is required.
+.TP
+.B FAN_OPEN_PERM
+Create an event when a permission to open a file or directory is requested.
+An fanotify file descriptor created with
+.B FAN_CLASS_PRE_CONTENT
+or
+.B FAN_CLASS_CONTENT
+is required.
+.TP
+.BR FAN_OPEN_EXEC_PERM " (since Linux 5.0)"
+.\" commit 66917a3130f218dcef9eeab4fd11a71cd00cd7c9
+Create an event when a permission to open a file for execution is
+requested.
+An fanotify file descriptor created with
+.B FAN_CLASS_PRE_CONTENT
+or
+.B FAN_CLASS_CONTENT
+is required.
+See NOTES for additional details.
+.TP
+.B FAN_ACCESS_PERM
+Create an event when a permission to read a file or directory is requested.
+An fanotify file descriptor created with
+.B FAN_CLASS_PRE_CONTENT
+or
+.B FAN_CLASS_CONTENT
+is required.
+.TP
+.B FAN_ONDIR
+Create events for directories\[em]for example, when
+.BR opendir (3),
+.BR readdir (3)
+(but see BUGS), and
+.BR closedir (3)
+are called.
+Without this flag, events are created only for files.
+In the context of directory entry events, such as
+.BR FAN_CREATE ,
+.BR FAN_DELETE ,
+.BR FAN_MOVED_FROM ,
+and
+.BR FAN_MOVED_TO ,
+specifying the flag
+.B FAN_ONDIR
+is required in order to create events when subdirectory entries are
+modified (i.e.,
+.BR mkdir (2)/
+.BR rmdir (2)).
+.TP
+.B FAN_EVENT_ON_CHILD
+Events for the immediate children of marked directories shall be created.
+The flag has no effect when marking mounts and filesystems.
+Note that events are not generated for children of the subdirectories
+of marked directories.
+More specifically, the directory entry modification events
+.BR FAN_CREATE ,
+.BR FAN_DELETE ,
+.BR FAN_MOVED_FROM ,
+and
+.B FAN_MOVED_TO
+are not generated for any entry modifications performed inside subdirectories
+of marked directories.
+Note that the events
+.B FAN_DELETE_SELF
+and
+.B FAN_MOVE_SELF
+are not generated for children of marked directories.
+To monitor complete directory trees it is necessary to mark the relevant
+mount or filesystem.
+.PP
+The following composed values are defined:
+.TP
+.B FAN_CLOSE
+A file is closed
+.RB ( FAN_CLOSE_WRITE | FAN_CLOSE_NOWRITE ).
+.TP
+.B FAN_MOVE
+A file or directory has been moved
+.RB ( FAN_MOVED_FROM | FAN_MOVED_TO ).
+.PP
+The filesystem object to be marked is determined by the file descriptor
+.I dirfd
+and the pathname specified in
+.IR pathname :
+.IP \[bu] 3
+If
+.I pathname
+is NULL,
+.I dirfd
+defines the filesystem object to be marked.
+.IP \[bu]
+If
+.I pathname
+is NULL, and
+.I dirfd
+takes the special value
+.BR AT_FDCWD ,
+the current working directory is to be marked.
+.IP \[bu]
+If
+.I pathname
+is absolute, it defines the filesystem object to be marked, and
+.I dirfd
+is ignored.
+.IP \[bu]
+If
+.I pathname
+is relative, and
+.I dirfd
+does not have the value
+.BR AT_FDCWD ,
+then the filesystem object to be marked is determined by interpreting
+.I pathname
+relative to the directory referred to by
+.IR dirfd .
+.IP \[bu]
+If
+.I pathname
+is relative, and
+.I dirfd
+has the value
+.BR AT_FDCWD ,
+then the filesystem object to be marked is determined by interpreting
+.I pathname
+relative to the current working directory.
+(See
+.BR openat (2)
+for an explanation of why the
+.I dirfd
+argument is useful.)
+.SH RETURN VALUE
+On success,
+.BR fanotify_mark ()
+returns 0.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EBADF
+An invalid file descriptor was passed in
+.IR fanotify_fd .
+.TP
+.B EBADF
+.I pathname
+is relative but
+.I dirfd
+is neither
+.B AT_FDCWD
+nor a valid file descriptor.
+.TP
+.B EEXIST
+The filesystem object indicated by
+.I dirfd
+and
+.I pathname
+has a mark that was updated without the
+.B FAN_MARK_EVICTABLE
+flag,
+and the user attempted to update the mark with the
+.B FAN_MARK_EVICTABLE
+flag.
+.TP
+.B EEXIST
+The filesystem object indicated by
+.I dirfd
+and
+.I pathname
+has a mark that was updated with the
+.B FAN_MARK_IGNORE
+flag,
+and the user attempted to update the mark with the
+.B FAN_MARK_IGNORED_MASK
+flag.
+.TP
+.B EEXIST
+The filesystem object indicated by
+.I dirfd
+and
+.I pathname
+has a mark that was updated with the
+.B FAN_MARK_IGNORE
+and
+.B FAN_MARK_IGNORED_SURV_MODIFY
+flags,
+and the user attempted to update the mark only with the
+.B FAN_MARK_IGNORE
+flag.
+.TP
+.B EINVAL
+An invalid value was passed in
+.I flags
+or
+.IR mask ,
+or
+.I fanotify_fd
+was not an fanotify file descriptor.
+.TP
+.B EINVAL
+The fanotify file descriptor was opened with
+.B FAN_CLASS_NOTIF
+or the fanotify group identifies filesystem objects by file handles
+and
+.I mask
+contains a flag for permission events
+.RB ( FAN_OPEN_PERM
+or
+.BR FAN_ACCESS_PERM ).
+.TP
+.B EINVAL
+The group was initialized without
+.B FAN_REPORT_FID
+but one or more event types specified in the
+.I mask
+require it.
+.TP
+.B EINVAL
+.I flags
+contains
+.BR FAN_MARK_IGNORE ,
+and either
+.B FAN_MARK_MOUNT
+or
+.BR FAN_MARK_FILESYSTEM ,
+but does not contain
+.BR FAN_MARK_IGNORED_SURV_MODIFY .
+.TP
+.B EISDIR
+.I flags
+contains
+.BR FAN_MARK_IGNORE ,
+but does not contain
+.BR FAN_MARK_IGNORED_SURV_MODIFY ,
+and
+.I dirfd
+and
+.I pathname
+specify a directory.
+.TP
+.B ENODEV
+The filesystem object indicated by
+.I dirfd
+and
+.I pathname
+is not associated with a filesystem that supports
+.I fsid
+(e.g.,
+.BR fuse (4)).
+.BR tmpfs (5)
+did not support
+.I fsid
+prior to Linux 5.13.
+.\" commit 59cda49ecf6c9a32fae4942420701b6e087204f6
+This error can be returned only with an fanotify group that identifies
+filesystem objects by file handles.
+.TP
+.B ENOENT
+The filesystem object indicated by
+.I dirfd
+and
+.I pathname
+does not exist.
+This error also occurs when trying to remove a mark from an object
+which is not marked.
+.TP
+.B ENOMEM
+The necessary memory could not be allocated.
+.TP
+.B ENOSPC
+The number of marks for this user exceeds the limit and the
+.B FAN_UNLIMITED_MARKS
+flag was not specified when the fanotify file descriptor was created with
+.BR fanotify_init (2).
+See
+.BR fanotify (7)
+for details about this limit.
+.TP
+.B ENOSYS
+This kernel does not implement
+.BR fanotify_mark ().
+The fanotify API is available only if the kernel was configured with
+.BR CONFIG_FANOTIFY .
+.TP
+.B ENOTDIR
+.I flags
+contains
+.BR FAN_MARK_ONLYDIR ,
+and
+.I dirfd
+and
+.I pathname
+do not specify a directory.
+.TP
+.B ENOTDIR
+.I mask
+contains
+.BR FAN_RENAME ,
+and
+.I dirfd
+and
+.I pathname
+do not specify a directory.
+.TP
+.B ENOTDIR
+.I flags
+contains
+.BR FAN_MARK_IGNORE ,
+or the fanotify group was initialized with flag
+.BR FAN_REPORT_TARGET_FID ,
+and
+.I mask
+contains directory entry modification events
+(e.g.,
+.BR FAN_CREATE ,
+.BR FAN_DELETE ),
+or directory event flags
+(e.g.,
+.BR FAN_ONDIR ,
+.BR FAN_EVENT_ON_CHILD ),
+and
+.I dirfd
+and
+.I pathname
+do not specify a directory.
+.TP
+.B EOPNOTSUPP
+The object indicated by
+.I pathname
+is associated with a filesystem that does not support the encoding of file
+handles.
+This error can be returned only with an fanotify group that identifies
+filesystem objects by file handles.
+.TP
+.B EPERM
+The operation is not permitted because the caller lacks a required capability.
+.TP
+.B EXDEV
+The filesystem object indicated by
+.I pathname
+resides within a filesystem subvolume (e.g.,
+.BR btrfs (5))
+which uses a different
+.I fsid
+than its root superblock.
+This error can be returned only with an fanotify group that identifies
+filesystem objects by file handles.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 2.6.37.
+.\" was introduced in Linux 2.6.36 and enabled in Linux 2.6.37.
+.SH NOTES
+.SS FAN_OPEN_EXEC and FAN_OPEN_EXEC_PERM
+When using either
+.B FAN_OPEN_EXEC
+or
+.B FAN_OPEN_EXEC_PERM
+within the
+.IR mask ,
+events of these types will be returned only when the direct execution of a
+program occurs.
+More specifically, this means that events of these types will be generated
+for files that are opened using
+.BR execve (2),
+.BR execveat (2),
+or
+.BR uselib (2).
+Events of these types will not be raised in the situation where an
+interpreter is passed (or reads) a file for interpretation.
+.PP
+Additionally, if a mark has also been placed on the Linux dynamic
+linker, a user should also expect to receive an event for it when
+an ELF object has been successfully opened using
+.BR execve (2)
+or
+.BR execveat (2).
+.PP
+For example, if the following ELF binary were to be invoked and a
+.B FAN_OPEN_EXEC
+mark has been placed on /:
+.PP
+.in +4n
+.EX
+$ /bin/echo foo
+.EE
+.in
+.PP
+The listening application in this case would receive
+.B FAN_OPEN_EXEC
+events for both the ELF binary and interpreter, respectively:
+.PP
+.in +4n
+.EX
+/bin/echo
+/lib64/ld\-linux\-x86\-64.so.2
+.EE
+.in
+.SH BUGS
+The following bugs were present before Linux 3.16:
+.IP \[bu] 3
+.\" Fixed by commit 0a8dd2db579f7a0ac7033d6b857c3d5dbaa77563
+If
+.I flags
+contains
+.BR FAN_MARK_FLUSH ,
+.IR dirfd ,
+and
+.I pathname
+must specify a valid filesystem object, even though this object is not used.
+.IP \[bu]
+.\" Fixed by commit d4c7cf6cffb1bc711a833b5e304ba5bcfe76398b
+.BR readdir (2)
+does not generate a
+.B FAN_ACCESS
+event.
+.IP \[bu]
+.\" Fixed by commit cc299a98eb13a9853675a9cbb90b30b4011e1406
+If
+.BR fanotify_mark ()
+is called with
+.BR FAN_MARK_FLUSH ,
+.I flags
+is not checked for invalid values.
+.SH SEE ALSO
+.BR fanotify_init (2),
+.BR fanotify (7)
diff --git a/man2/fattach.2 b/man2/fattach.2
new file mode 100644
index 0000000..5d25ea6
--- /dev/null
+++ b/man2/fattach.2
@@ -0,0 +1 @@
+.so man2/unimplemented.2
diff --git a/man2/fchdir.2 b/man2/fchdir.2
new file mode 100644
index 0000000..60b9685
--- /dev/null
+++ b/man2/fchdir.2
@@ -0,0 +1 @@
+.so man2/chdir.2
diff --git a/man2/fchmod.2 b/man2/fchmod.2
new file mode 100644
index 0000000..92647d2
--- /dev/null
+++ b/man2/fchmod.2
@@ -0,0 +1 @@
+.so man2/chmod.2
diff --git a/man2/fchmodat.2 b/man2/fchmodat.2
new file mode 100644
index 0000000..92647d2
--- /dev/null
+++ b/man2/fchmodat.2
@@ -0,0 +1 @@
+.so man2/chmod.2
diff --git a/man2/fchown.2 b/man2/fchown.2
new file mode 100644
index 0000000..f0a5635
--- /dev/null
+++ b/man2/fchown.2
@@ -0,0 +1 @@
+.so man2/chown.2
diff --git a/man2/fchown32.2 b/man2/fchown32.2
new file mode 100644
index 0000000..b8b9452
--- /dev/null
+++ b/man2/fchown32.2
@@ -0,0 +1 @@
+.so man2/fchown.2
diff --git a/man2/fchownat.2 b/man2/fchownat.2
new file mode 100644
index 0000000..f0a5635
--- /dev/null
+++ b/man2/fchownat.2
@@ -0,0 +1 @@
+.so man2/chown.2
diff --git a/man2/fcntl.2 b/man2/fcntl.2
new file mode 100644
index 0000000..9362044
--- /dev/null
+++ b/man2/fcntl.2
@@ -0,0 +1,2111 @@
+.\" This manpage is Copyright (C) 1992 Drew Eckhardt;
+.\" and Copyright (C) 1993 Michael Haardt, Ian Jackson;
+.\" and Copyright (C) 1998 Jamie Lokier;
+.\" and Copyright (C) 2002-2010, 2014 Michael Kerrisk;
+.\" and Copyright (C) 2014 Jeff Layton
+.\" and Copyright (C) 2014 David Herrmann
+.\" and Copyright (C) 2017 Jens Axboe
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified 1993-07-24 by Rik Faith <faith@cs.unc.edu>
+.\" Modified 1995-09-26 by Andries Brouwer <aeb@cwi.nl>
+.\" and again on 960413 and 980804 and 981223.
+.\" Modified 1998-12-11 by Jamie Lokier <jamie@imbolc.ucc.ie>
+.\" Applied correction by Christian Ehrhardt - aeb, 990712
+.\" Modified 2002-04-23 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Added note on F_SETFL and O_DIRECT
+.\" Complete rewrite + expansion of material on file locking
+.\" Incorporated description of F_NOTIFY, drawing on
+.\" Stephen Rothwell's notes in Documentation/dnotify.txt.
+.\" Added description of F_SETLEASE and F_GETLEASE
+.\" Corrected and polished, aeb, 020527.
+.\" Modified 2004-03-03 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Modified description of file leases: fixed some errors of detail
+.\" Replaced the term "lease contestant" by "lease breaker"
+.\" Modified, 27 May 2004, Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Added notes on capability requirements
+.\" Modified 2004-12-08, added O_NOATIME after note from Martin Pool
+.\" 2004-12-10, mtk, noted F_GETOWN bug after suggestion from aeb.
+.\" 2005-04-08 Jamie Lokier <jamie@shareable.org>, mtk
+.\" Described behavior of F_SETOWN/F_SETSIG in
+.\" multithreaded processes, and generally cleaned
+.\" up the discussion of F_SETOWN.
+.\" 2005-05-20, Johannes Nicolai <johannes.nicolai@hpi.uni-potsdam.de>,
+.\" mtk: Noted F_SETOWN bug for socket file descriptor in Linux 2.4
+.\" and earlier. Added text on permissions required to send signal.
+.\" 2009-09-30, Michael Kerrisk
+.\" Note obsolete F_SETOWN behavior with threads.
+.\" Document F_SETOWN_EX and F_GETOWN_EX
+.\" 2010-06-17, Michael Kerrisk
+.\" Document F_SETPIPE_SZ and F_GETPIPE_SZ.
+.\" 2014-07-08, David Herrmann <dh.herrmann@gmail.com>
+.\" Document F_ADD_SEALS and F_GET_SEALS
+.\" 2017-06-26, Jens Axboe <axboe@kernel.dk>
+.\" Document F_{GET,SET}_RW_HINT and F_{GET,SET}_FILE_RW_HINT
+.\"
+.TH fcntl 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+fcntl \- manipulate file descriptor
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <fcntl.h>
+.PP
+.BI "int fcntl(int " fd ", int " cmd ", ... /* " arg " */ );"
+.fi
+.SH DESCRIPTION
+.BR fcntl ()
+performs one of the operations described below on the open file descriptor
+.IR fd .
+The operation is determined by
+.IR cmd .
+.PP
+.BR fcntl ()
+can take an optional third argument.
+Whether or not this argument is required is determined by
+.IR cmd .
+The required argument type is indicated in parentheses after each
+.I cmd
+name (in most cases, the required type is
+.IR int ,
+and we identify the argument using the name
+.IR arg ),
+or
+.I void
+is specified if the argument is not required.
+.PP
+Certain of the operations below are supported only since a particular
+Linux kernel version.
+The preferred method of checking whether the host kernel supports
+a particular operation is to invoke
+.BR fcntl ()
+with the desired
+.I cmd
+value and then test whether the call failed with
+.BR EINVAL ,
+indicating that the kernel does not recognize this value.
+.SS Duplicating a file descriptor
+.TP
+.BR F_DUPFD " (\fIint\fP)"
+Duplicate the file descriptor
+.I fd
+using the lowest-numbered available file descriptor greater than or equal to
+.IR arg .
+This is different from
+.BR dup2 (2),
+which uses exactly the file descriptor specified.
+.IP
+On success, the new file descriptor is returned.
+.IP
+See
+.BR dup (2)
+for further details.
+.TP
+.BR F_DUPFD_CLOEXEC " (\fIint\fP; since Linux 2.6.24)"
+As for
+.BR F_DUPFD ,
+but additionally set the
+close-on-exec flag for the duplicate file descriptor.
+Specifying this flag permits a program to avoid an additional
+.BR fcntl ()
+.B F_SETFD
+operation to set the
+.B FD_CLOEXEC
+flag.
+For an explanation of why this flag is useful,
+see the description of
+.B O_CLOEXEC
+in
+.BR open (2).
+.SS File descriptor flags
+The following commands manipulate the flags associated with
+a file descriptor.
+Currently, only one such flag is defined:
+.BR FD_CLOEXEC ,
+the close-on-exec flag.
+If the
+.B FD_CLOEXEC
+bit is set,
+the file descriptor will automatically be closed during a successful
+.BR execve (2).
+(If the
+.BR execve (2)
+fails, the file descriptor is left open.)
+If the
+.B FD_CLOEXEC
+bit is not set, the file descriptor will remain open across an
+.BR execve (2).
+.TP
+.BR F_GETFD " (\fIvoid\fP)"
+Return (as the function result) the file descriptor flags;
+.I arg
+is ignored.
+.TP
+.BR F_SETFD " (\fIint\fP)"
+Set the file descriptor flags to the value specified by
+.IR arg .
+.PP
+In multithreaded programs, using
+.BR fcntl ()
+.B F_SETFD
+to set the close-on-exec flag at the same time as another thread performs a
+.BR fork (2)
+plus
+.BR execve (2)
+is vulnerable to a race condition that may unintentionally leak
+the file descriptor to the program executed in the child process.
+See the discussion of the
+.B O_CLOEXEC
+flag in
+.BR open (2)
+for details and a remedy to the problem.
+.SS File status flags
+Each open file description has certain associated status flags,
+initialized by
+.BR open (2)
+.\" or
+.\" .BR creat (2),
+and possibly modified by
+.BR fcntl ().
+Duplicated file descriptors
+(made with
+.BR dup (2),
+.BR fcntl (F_DUPFD),
+.BR fork (2),
+etc.) refer to the same open file description, and thus
+share the same file status flags.
+.PP
+The file status flags and their semantics are described in
+.BR open (2).
+.TP
+.BR F_GETFL " (\fIvoid\fP)"
+Return (as the function result)
+the file access mode and the file status flags;
+.I arg
+is ignored.
+.TP
+.BR F_SETFL " (\fIint\fP)"
+Set the file status flags to the value specified by
+.IR arg .
+File access mode
+.RB ( O_RDONLY ", " O_WRONLY ", " O_RDWR )
+and file creation flags
+(i.e.,
+.BR O_CREAT ", " O_EXCL ", " O_NOCTTY ", " O_TRUNC )
+in
+.I arg
+are ignored.
+On Linux, this command can change only the
+.BR O_APPEND ,
+.BR O_ASYNC ,
+.BR O_DIRECT ,
+.BR O_NOATIME ,
+and
+.B O_NONBLOCK
+flags.
+It is not possible to change the
+.B O_DSYNC
+and
+.B O_SYNC
+flags; see BUGS, below.
+.SS Advisory record locking
+Linux implements traditional ("process-associated") UNIX record locks,
+as standardized by POSIX.
+For a Linux-specific alternative with better semantics,
+see the discussion of open file description locks below.
+.PP
+.BR F_SETLK ,
+.BR F_SETLKW ,
+and
+.B F_GETLK
+are used to acquire, release, and test for the existence of record
+locks (also known as byte-range, file-segment, or file-region locks).
+The third argument,
+.IR lock ,
+is a pointer to a structure that has at least the following fields
+(in unspecified order).
+.PP
+.in +4n
+.EX
+struct flock {
+ ...
+ short l_type; /* Type of lock: F_RDLCK,
+ F_WRLCK, F_UNLCK */
+ short l_whence; /* How to interpret l_start:
+ SEEK_SET, SEEK_CUR, SEEK_END */
+ off_t l_start; /* Starting offset for lock */
+ off_t l_len; /* Number of bytes to lock */
+ pid_t l_pid; /* PID of process blocking our lock
+ (set by F_GETLK and F_OFD_GETLK) */
+ ...
+};
+.EE
+.in
+.PP
+The
+.IR l_whence ", " l_start ", and " l_len
+fields of this structure specify the range of bytes we wish to lock.
+Bytes past the end of the file may be locked,
+but not bytes before the start of the file.
+.PP
+.I l_start
+is the starting offset for the lock, and is interpreted
+relative to either:
+the start of the file (if
+.I l_whence
+is
+.BR SEEK_SET );
+the current file offset (if
+.I l_whence
+is
+.BR SEEK_CUR );
+or the end of the file (if
+.I l_whence
+is
+.BR SEEK_END ).
+In the final two cases,
+.I l_start
+can be a negative number provided the
+offset does not lie before the start of the file.
+.PP
+.I l_len
+specifies the number of bytes to be locked.
+If
+.I l_len
+is positive, then the range to be locked covers bytes
+.I l_start
+up to and including
+.IR l_start + l_len \-1.
+Specifying 0 for
+.I l_len
+has the special meaning: lock all bytes starting at the
+location specified by
+.IR l_whence " and " l_start
+through to the end of file, no matter how large the file grows.
+.PP
+POSIX.1-2001 allows (but does not require)
+an implementation to support a negative
+.I l_len
+value; if
+.I l_len
+is negative, the interval described by
+.I lock
+covers bytes
+.IR l_start + l_len
+up to and including
+.IR l_start \-1.
+This is supported since Linux 2.4.21 and Linux 2.5.49.
+.PP
+The
+.I l_type
+field can be used to place a read
+.RB ( F_RDLCK )
+or a write
+.RB ( F_WRLCK )
+lock on a file.
+Any number of processes may hold a read lock (shared lock)
+on a file region, but only one process may hold a write lock
+(exclusive lock).
+An exclusive lock excludes all other locks,
+both shared and exclusive.
+A single process can hold only one type of lock on a file region;
+if a new lock is applied to an already-locked region,
+then the existing lock is converted to the new lock type.
+(Such conversions may involve splitting, shrinking, or coalescing with
+an existing lock if the byte range specified by the new lock does not
+precisely coincide with the range of the existing lock.)
+.TP
+.BR F_SETLK " (\fIstruct flock *\fP)"
+Acquire a lock (when
+.I l_type
+is
+.B F_RDLCK
+or
+.BR F_WRLCK )
+or release a lock (when
+.I l_type
+is
+.BR F_UNLCK )
+on the bytes specified by the
+.IR l_whence ", " l_start ", and " l_len
+fields of
+.IR lock .
+If a conflicting lock is held by another process,
+this call returns \-1 and sets
+.I errno
+to
+.B EACCES
+or
+.BR EAGAIN .
+(The error returned in this case differs across implementations,
+so POSIX requires a portable application to check for both errors.)
+.TP
+.BR F_SETLKW " (\fIstruct flock *\fP)"
+As for
+.BR F_SETLK ,
+but if a conflicting lock is held on the file, then wait for that
+lock to be released.
+If a signal is caught while waiting, then the call is interrupted
+and (after the signal handler has returned)
+returns immediately (with return value \-1 and
+.I errno
+set to
+.BR EINTR ;
+see
+.BR signal (7)).
+.TP
+.BR F_GETLK " (\fIstruct flock *\fP)"
+On input to this call,
+.I lock
+describes a lock we would like to place on the file.
+If the lock could be placed,
+.BR fcntl ()
+does not actually place it, but returns
+.B F_UNLCK
+in the
+.I l_type
+field of
+.I lock
+and leaves the other fields of the structure unchanged.
+.IP
+If one or more incompatible locks would prevent
+this lock being placed, then
+.BR fcntl ()
+returns details about one of those locks in the
+.IR l_type ", " l_whence ", " l_start ", and " l_len
+fields of
+.IR lock .
+If the conflicting lock is a traditional (process-associated) record lock,
+then the
+.I l_pid
+field is set to the PID of the process holding that lock.
+If the conflicting lock is an open file description lock, then
+.I l_pid
+is set to \-1.
+Note that the returned information
+may already be out of date by the time the caller inspects it.
+.PP
+In order to place a read lock,
+.I fd
+must be open for reading.
+In order to place a write lock,
+.I fd
+must be open for writing.
+To place both types of lock, open a file read-write.
+.PP
+When placing locks with
+.BR F_SETLKW ,
+the kernel detects
+.IR deadlocks ,
+whereby two or more processes have their
+lock requests mutually blocked by locks held by the other processes.
+For example, suppose process A holds a write lock on byte 100 of a file,
+and process B holds a write lock on byte 200.
+If each process then attempts to lock the byte already
+locked by the other process using
+.BR F_SETLKW ,
+then, without deadlock detection,
+both processes would remain blocked indefinitely.
+When the kernel detects such deadlocks,
+it causes one of the blocking lock requests to immediately fail with the error
+.BR EDEADLK ;
+an application that encounters such an error should release
+some of its locks to allow other applications to proceed before
+attempting to regain the locks that it requires.
+Circular deadlocks involving more than two processes are also detected.
+Note, however, that there are limitations to the kernel's
+deadlock-detection algorithm; see BUGS.
+.PP
+As well as being removed by an explicit
+.BR F_UNLCK ,
+record locks are automatically released when the process terminates.
+.PP
+Record locks are not inherited by a child created via
+.BR fork (2),
+but are preserved across an
+.BR execve (2).
+.PP
+Because of the buffering performed by the
+.BR stdio (3)
+library, the use of record locking with routines in that package
+should be avoided; use
+.BR read (2)
+and
+.BR write (2)
+instead.
+.PP
+The record locks described above are associated with the process
+(unlike the open file description locks described below).
+This has some unfortunate consequences:
+.IP \[bu] 3
+If a process closes
+.I any
+file descriptor referring to a file,
+then all of the process's locks on that file are released,
+regardless of the file descriptor(s) on which the locks were obtained.
+.\" (Additional file descriptors referring to the same file
+.\" may have been obtained by calls to
+.\" .BR open "(2), " dup "(2), " dup2 "(2), or " fcntl ().)
+This is bad: it means that a process can lose its locks on
+a file such as
+.I /etc/passwd
+or
+.I /etc/mtab
+when for some reason a library function decides to open, read,
+and close the same file.
+.IP \[bu]
+The threads in a process share locks.
+In other words,
+a multithreaded program can't use record locking to ensure
+that threads don't simultaneously access the same region of a file.
+.PP
+Open file description locks solve both of these problems.
+.SS Open file description locks (non-POSIX)
+Open file description locks are advisory byte-range locks whose operation is
+in most respects identical to the traditional record locks described above.
+This lock type is Linux-specific,
+and available since Linux 3.15.
+(There is a proposal with the Austin Group
+.\" FIXME . Review progress into POSIX
+.\" http://austingroupbugs.net/view.php?id=768
+to include this lock type in the next revision of POSIX.1.)
+For an explanation of open file descriptions, see
+.BR open (2).
+.PP
+The principal difference between the two lock types
+is that whereas traditional record locks
+are associated with a process,
+open file description locks are associated with the
+open file description on which they are acquired,
+much like locks acquired with
+.BR flock (2).
+Consequently (and unlike traditional advisory record locks),
+open file description locks are inherited across
+.BR fork (2)
+(and
+.BR clone (2)
+with
+.BR CLONE_FILES ),
+and are only automatically released on the last close
+of the open file description,
+instead of being released on any close of the file.
+.PP
+Conflicting lock combinations
+(i.e., a read lock and a write lock or two write locks)
+where one lock is an open file description lock and the other
+is a traditional record lock conflict
+even when they are acquired by the same process on the same file descriptor.
+.PP
+Open file description locks placed via the same open file description
+(i.e., via the same file descriptor,
+or via a duplicate of the file descriptor created by
+.BR fork (2),
+.BR dup (2),
+.BR fcntl ()
+.BR F_DUPFD ,
+and so on) are always compatible:
+if a new lock is placed on an already locked region,
+then the existing lock is converted to the new lock type.
+(Such conversions may result in splitting, shrinking, or coalescing with
+an existing lock as discussed above.)
+.PP
+On the other hand, open file description locks may conflict with
+each other when they are acquired via different open file descriptions.
+Thus, the threads in a multithreaded program can use
+open file description locks to synchronize access to a file region
+by having each thread perform its own
+.BR open (2)
+on the file and applying locks via the resulting file descriptor.
+.PP
+As with traditional advisory locks, the third argument to
+.BR fcntl (),
+.IR lock ,
+is a pointer to an
+.I flock
+structure.
+By contrast with traditional record locks, the
+.I l_pid
+field of that structure must be set to zero
+when using the commands described below.
+.PP
+The commands for working with open file description locks are analogous
+to those used with traditional locks:
+.TP
+.BR F_OFD_SETLK " (\fIstruct flock *\fP)"
+Acquire an open file description lock (when
+.I l_type
+is
+.B F_RDLCK
+or
+.BR F_WRLCK )
+or release an open file description lock (when
+.I l_type
+is
+.BR F_UNLCK )
+on the bytes specified by the
+.IR l_whence ", " l_start ", and " l_len
+fields of
+.IR lock .
+If a conflicting lock is held by another process,
+this call returns \-1 and sets
+.I errno
+to
+.BR EAGAIN .
+.TP
+.BR F_OFD_SETLKW " (\fIstruct flock *\fP)"
+As for
+.BR F_OFD_SETLK ,
+but if a conflicting lock is held on the file, then wait for that lock to be
+released.
+If a signal is caught while waiting, then the call is interrupted
+and (after the signal handler has returned) returns immediately
+(with return value \-1 and
+.I errno
+set to
+.BR EINTR ;
+see
+.BR signal (7)).
+.TP
+.BR F_OFD_GETLK " (\fIstruct flock *\fP)"
+On input to this call,
+.I lock
+describes an open file description lock we would like to place on the file.
+If the lock could be placed,
+.BR fcntl ()
+does not actually place it, but returns
+.B F_UNLCK
+in the
+.I l_type
+field of
+.I lock
+and leaves the other fields of the structure unchanged.
+If one or more incompatible locks would prevent this lock being placed,
+then details about one of these locks are returned via
+.IR lock ,
+as described above for
+.BR F_GETLK .
+.PP
+In the current implementation,
+.\" commit 57b65325fe34ec4c917bc4e555144b4a94d9e1f7
+no deadlock detection is performed for open file description locks.
+(This contrasts with process-associated record locks,
+for which the kernel does perform deadlock detection.)
+.\"
+.SS Mandatory locking
+.IR Warning :
+the Linux implementation of mandatory locking is unreliable.
+See BUGS below.
+Because of these bugs,
+and the fact that the feature is believed to be little used,
+since Linux 4.5, mandatory locking has been made an optional feature,
+governed by a configuration option
+.RB ( CONFIG_MANDATORY_FILE_LOCKING ).
+This feature is no longer supported at all in Linux 5.15 and above.
+.PP
+By default, both traditional (process-associated) and open file description
+record locks are advisory.
+Advisory locks are not enforced and are useful only between
+cooperating processes.
+.PP
+Both lock types can also be mandatory.
+Mandatory locks are enforced for all processes.
+If a process tries to perform an incompatible access (e.g.,
+.BR read (2)
+or
+.BR write (2))
+on a file region that has an incompatible mandatory lock,
+then the result depends upon whether the
+.B O_NONBLOCK
+flag is enabled for its open file description.
+If the
+.B O_NONBLOCK
+flag is not enabled, then
+the system call is blocked until the lock is removed
+or converted to a mode that is compatible with the access.
+If the
+.B O_NONBLOCK
+flag is enabled, then the system call fails with the error
+.BR EAGAIN .
+.PP
+To make use of mandatory locks, mandatory locking must be enabled
+both on the filesystem that contains the file to be locked,
+and on the file itself.
+Mandatory locking is enabled on a filesystem
+using the "\-o mand" option to
+.BR mount (8),
+or the
+.B MS_MANDLOCK
+flag for
+.BR mount (2).
+Mandatory locking is enabled on a file by disabling
+group execute permission on the file and enabling the set-group-ID
+permission bit (see
+.BR chmod (1)
+and
+.BR chmod (2)).
+.PP
+Mandatory locking is not specified by POSIX.
+Some other systems also support mandatory locking,
+although the details of how to enable it vary across systems.
+.\"
+.SS Lost locks
+When an advisory lock is obtained on a networked filesystem such as
+NFS it is possible that the lock might get lost.
+This may happen due to administrative action on the server, or due to a
+network partition (i.e., loss of network connectivity with the server)
+which lasts long enough for the server to assume
+that the client is no longer functioning.
+.PP
+When the filesystem determines that a lock has been lost, future
+.BR read (2)
+or
+.BR write (2)
+requests may fail with the error
+.BR EIO .
+This error will persist until the lock is removed or the file
+descriptor is closed.
+Since Linux 3.12,
+.\" commit ef1820f9be27b6ad158f433ab38002ab8131db4d
+this happens at least for NFSv4 (including all minor versions).
+.PP
+Some versions of UNIX send a signal
+.RB ( SIGLOST )
+in this circumstance.
+Linux does not define this signal, and does not provide any
+asynchronous notification of lost locks.
+.\"
+.SS Managing signals
+.BR F_GETOWN ,
+.BR F_SETOWN ,
+.BR F_GETOWN_EX ,
+.BR F_SETOWN_EX ,
+.BR F_GETSIG ,
+and
+.B F_SETSIG
+are used to manage I/O availability signals:
+.TP
+.BR F_GETOWN " (\fIvoid\fP)"
+Return (as the function result)
+the process ID or process group ID currently receiving
+.B SIGIO
+and
+.B SIGURG
+signals for events on file descriptor
+.IR fd .
+Process IDs are returned as positive values;
+process group IDs are returned as negative values (but see BUGS below).
+.I arg
+is ignored.
+.TP
+.BR F_SETOWN " (\fIint\fP)"
+Set the process ID or process group ID that will receive
+.B SIGIO
+and
+.B SIGURG
+signals for events on the file descriptor
+.IR fd .
+The target process or process group ID is specified in
+.IR arg .
+A process ID is specified as a positive value;
+a process group ID is specified as a negative value.
+Most commonly, the calling process specifies itself as the owner
+(that is,
+.I arg
+is specified as
+.BR getpid (2)).
+.IP
+As well as setting the file descriptor owner,
+one must also enable generation of signals on the file descriptor.
+This is done by using the
+.BR fcntl ()
+.B F_SETFL
+command to set the
+.B O_ASYNC
+file status flag on the file descriptor.
+Subsequently, a
+.B SIGIO
+signal is sent whenever input or output becomes possible
+on the file descriptor.
+The
+.BR fcntl ()
+.B F_SETSIG
+command can be used to obtain delivery of a signal other than
+.BR SIGIO .
+.IP
+Sending a signal to the owner process (group) specified by
+.B F_SETOWN
+is subject to the same permissions checks as are described for
+.BR kill (2),
+where the sending process is the one that employs
+.B F_SETOWN
+(but see BUGS below).
+If this permission check fails, then the signal is
+silently discarded.
+.IR Note :
+The
+.B F_SETOWN
+operation records the caller's credentials at the time of the
+.BR fcntl ()
+call,
+and it is these saved credentials that are used for the permission checks.
+.IP
+If the file descriptor
+.I fd
+refers to a socket,
+.B F_SETOWN
+also selects
+the recipient of
+.B SIGURG
+signals that are delivered when out-of-band
+data arrives on that socket.
+.RB ( SIGURG
+is sent in any situation where
+.BR select (2)
+would report the socket as having an "exceptional condition".)
+.\" The following appears to be rubbish. It doesn't seem to
+.\" be true according to the kernel source, and I can write
+.\" a program that gets a terminal-generated SIGIO even though
+.\" it is not the foreground process group of the terminal.
+.\" -- MTK, 8 Apr 05
+.\"
+.\" If the file descriptor
+.\" .I fd
+.\" refers to a terminal device, then SIGIO
+.\" signals are sent to the foreground process group of the terminal.
+.IP
+The following was true in Linux 2.6.x up to and including Linux 2.6.11:
+.RS
+.IP
+If a nonzero value is given to
+.B F_SETSIG
+in a multithreaded process running with a threading library
+that supports thread groups (e.g., NPTL),
+then a positive value given to
+.B F_SETOWN
+has a different meaning:
+.\" The relevant place in the (2.6) kernel source is the
+.\" 'switch' in fs/fcntl.c::send_sigio_to_task() -- MTK, Apr 2005
+instead of being a process ID identifying a whole process,
+it is a thread ID identifying a specific thread within a process.
+Consequently, it may be necessary to pass
+.B F_SETOWN
+the result of
+.BR gettid (2)
+instead of
+.BR getpid (2)
+to get sensible results when
+.B F_SETSIG
+is used.
+(In current Linux threading implementations,
+a main thread's thread ID is the same as its process ID.
+This means that a single-threaded program can equally use
+.BR gettid (2)
+or
+.BR getpid (2)
+in this scenario.)
+Note, however, that the statements in this paragraph do not apply
+to the
+.B SIGURG
+signal generated for out-of-band data on a socket:
+this signal is always sent to either a process or a process group,
+depending on the value given to
+.BR F_SETOWN .
+.\" send_sigurg()/send_sigurg_to_task() bypasses
+.\" kill_fasync()/send_sigio()/send_sigio_to_task()
+.\" to directly call send_group_sig_info()
+.\" -- MTK, Apr 2005 (kernel 2.6.11)
+.RE
+.IP
+The above behavior was accidentally dropped in Linux 2.6.12,
+and won't be restored.
+From Linux 2.6.32 onward, use
+.B F_SETOWN_EX
+to target
+.B SIGIO
+and
+.B SIGURG
+signals at a particular thread.
+.TP
+.BR F_GETOWN_EX " (\fIstruct f_owner_ex *\fP) (since Linux 2.6.32)"
+Return the current file descriptor owner settings
+as defined by a previous
+.B F_SETOWN_EX
+operation.
+The information is returned in the structure pointed to by
+.IR arg ,
+which has the following form:
+.IP
+.in +4n
+.EX
+struct f_owner_ex {
+ int type;
+ pid_t pid;
+};
+.EE
+.in
+.IP
+The
+.I type
+field will have one of the values
+.BR F_OWNER_TID ,
+.BR F_OWNER_PID ,
+or
+.BR F_OWNER_PGRP .
+The
+.I pid
+field is a positive integer representing a thread ID, process ID,
+or process group ID.
+See
+.B F_SETOWN_EX
+for more details.
+.TP
+.BR F_SETOWN_EX " (\fIstruct f_owner_ex *\fP) (since Linux 2.6.32)"
+This operation performs a similar task to
+.BR F_SETOWN .
+It allows the caller to direct I/O availability signals
+to a specific thread, process, or process group.
+The caller specifies the target of signals via
+.IR arg ,
+which is a pointer to an
+.I f_owner_ex
+structure.
+The
+.I type
+field has one of the following values, which define how
+.I pid
+is interpreted:
+.RS
+.TP
+.B F_OWNER_TID
+Send the signal to the thread whose thread ID
+(the value returned by a call to
+.BR clone (2)
+or
+.BR gettid (2))
+is specified in
+.IR pid .
+.TP
+.B F_OWNER_PID
+Send the signal to the process whose ID
+is specified in
+.IR pid .
+.TP
+.B F_OWNER_PGRP
+Send the signal to the process group whose ID
+is specified in
+.IR pid .
+(Note that, unlike with
+.BR F_SETOWN ,
+a process group ID is specified as a positive value here.)
+.RE
+.TP
+.BR F_GETSIG " (\fIvoid\fP)"
+Return (as the function result)
+the signal sent when input or output becomes possible.
+A value of zero means
+.B SIGIO
+is sent.
+Any other value (including
+.BR SIGIO )
+is the
+signal sent instead, and in this case additional info is available to
+the signal handler if installed with
+.BR SA_SIGINFO .
+.I arg
+is ignored.
+.TP
+.BR F_SETSIG " (\fIint\fP)"
+Set the signal sent when input or output becomes possible
+to the value given in
+.IR arg .
+A value of zero means to send the default
+.B SIGIO
+signal.
+Any other value (including
+.BR SIGIO )
+is the signal to send instead, and in this case additional info
+is available to the signal handler if installed with
+.BR SA_SIGINFO .
+.\"
+.\" The following was true only up until Linux 2.6.11:
+.\"
+.\" Additionally, passing a nonzero value to
+.\" .B F_SETSIG
+.\" changes the signal recipient from a whole process to a specific thread
+.\" within a process.
+.\" See the description of
+.\" .B F_SETOWN
+.\" for more details.
+.IP
+By using
+.B F_SETSIG
+with a nonzero value, and setting
+.B SA_SIGINFO
+for the
+signal handler (see
+.BR sigaction (2)),
+extra information about I/O events is passed to
+the handler in a
+.I siginfo_t
+structure.
+If the
+.I si_code
+field indicates the source is
+.BR SI_SIGIO ,
+the
+.I si_fd
+field gives the file descriptor associated with the event.
+Otherwise,
+there is no indication which file descriptors are pending, and you
+should use the usual mechanisms
+.RB ( select (2),
+.BR poll (2),
+.BR read (2)
+with
+.B O_NONBLOCK
+set, etc.) to determine which file descriptors are available for I/O.
+.IP
+Note that the file descriptor provided in
+.I si_fd
+is the one that was specified during the
+.B F_SETSIG
+operation.
+This can lead to an unusual corner case.
+If the file descriptor is duplicated
+.RB ( dup (2)
+or similar), and the original file descriptor is closed,
+then I/O events will continue to be generated, but the
+.I si_fd
+field will contain the number of the now closed file descriptor.
+.IP
+By selecting a real-time signal (value >=
+.BR SIGRTMIN ),
+multiple I/O events may be queued using the same signal numbers.
+(Queuing is dependent on available memory.)
+Extra information is available
+if
+.B SA_SIGINFO
+is set for the signal handler, as above.
+.IP
+Note that Linux imposes a limit on the
+number of real-time signals that may be queued to a
+process (see
+.BR getrlimit (2)
+and
+.BR signal (7))
+and if this limit is reached, then the kernel reverts to
+delivering
+.BR SIGIO ,
+and this signal is delivered to the entire
+process rather than to a specific thread.
+.\" See fs/fcntl.c::send_sigio_to_task() (2.4/2.6) sources -- MTK, Apr 05
+.PP
+Using these mechanisms, a program can implement fully asynchronous I/O
+without using
+.BR select (2)
+or
+.BR poll (2)
+most of the time.
+.PP
+The use of
+.B O_ASYNC
+is specific to BSD and Linux.
+The only use of
+.B F_GETOWN
+and
+.B F_SETOWN
+specified in POSIX.1 is in conjunction with the use of the
+.B SIGURG
+signal on sockets.
+(POSIX does not specify the
+.B SIGIO
+signal.)
+.BR F_GETOWN_EX ,
+.BR F_SETOWN_EX ,
+.BR F_GETSIG ,
+and
+.B F_SETSIG
+are Linux-specific.
+POSIX has asynchronous I/O and the
+.I aio_sigevent
+structure to achieve similar things; these are also available
+in Linux as part of the GNU C Library (glibc).
+.SS Leases
+.B F_SETLEASE
+and
+.B F_GETLEASE
+(Linux 2.4 onward) are used to establish a new lease,
+and retrieve the current lease, on the open file description
+referred to by the file descriptor
+.IR fd .
+A file lease provides a mechanism whereby the process holding
+the lease (the "lease holder") is notified (via delivery of a signal)
+when a process (the "lease breaker") tries to
+.BR open (2)
+or
+.BR truncate (2)
+the file referred to by that file descriptor.
+.TP
+.BR F_SETLEASE " (\fIint\fP)"
+Set or remove a file lease according to which of the following
+values is specified in the integer
+.IR arg :
+.RS
+.TP
+.B F_RDLCK
+Take out a read lease.
+This will cause the calling process to be notified when
+the file is opened for writing or is truncated.
+.\" The following became true in Linux 2.6.10:
+.\" See the man-pages-2.09 Changelog for further info.
+A read lease can be placed only on a file descriptor that
+is opened read-only.
+.TP
+.B F_WRLCK
+Take out a write lease.
+This will cause the caller to be notified when
+the file is opened for reading or writing or is truncated.
+A write lease may be placed on a file only if there are no
+other open file descriptors for the file.
+.TP
+.B F_UNLCK
+Remove our lease from the file.
+.RE
+.PP
+Leases are associated with an open file description (see
+.BR open (2)).
+This means that duplicate file descriptors (created by, for example,
+.BR fork (2)
+or
+.BR dup (2))
+refer to the same lease, and this lease may be modified
+or released using any of these descriptors.
+Furthermore, the lease is released by either an explicit
+.B F_UNLCK
+operation on any of these duplicate file descriptors, or when all
+such file descriptors have been closed.
+.PP
+Leases may be taken out only on regular files.
+An unprivileged process may take out a lease only on a file whose
+UID (owner) matches the filesystem UID of the process.
+A process with the
+.B CAP_LEASE
+capability may take out leases on arbitrary files.
+.TP
+.BR F_GETLEASE " (\fIvoid\fP)"
+Indicates what type of lease is associated with the file descriptor
+.I fd
+by returning either
+.BR F_RDLCK ", " F_WRLCK ", or " F_UNLCK ,
+indicating, respectively, a read lease, a write lease, or no lease.
+.I arg
+is ignored.
+.PP
+When a process (the "lease breaker") performs an
+.BR open (2)
+or
+.BR truncate (2)
+that conflicts with a lease established via
+.BR F_SETLEASE ,
+the system call is blocked by the kernel and
+the kernel notifies the lease holder by sending it a signal
+.RB ( SIGIO
+by default).
+The lease holder should respond to receipt of this signal by doing
+whatever cleanup is required in preparation for the file to be
+accessed by another process (e.g., flushing cached buffers) and
+then either remove or downgrade its lease.
+A lease is removed by performing an
+.B F_SETLEASE
+command specifying
+.I arg
+as
+.BR F_UNLCK .
+If the lease holder currently holds a write lease on the file,
+and the lease breaker is opening the file for reading,
+then it is sufficient for the lease holder to downgrade
+the lease to a read lease.
+This is done by performing an
+.B F_SETLEASE
+command specifying
+.I arg
+as
+.BR F_RDLCK .
+.PP
+If the lease holder fails to downgrade or remove the lease within
+the number of seconds specified in
+.IR /proc/sys/fs/lease\-break\-time ,
+then the kernel forcibly removes or downgrades the lease holder's lease.
+.PP
+Once a lease break has been initiated,
+.B F_GETLEASE
+returns the target lease type (either
+.B F_RDLCK
+or
+.BR F_UNLCK ,
+depending on what would be compatible with the lease breaker)
+until the lease holder voluntarily downgrades or removes the lease or
+the kernel forcibly does so after the lease break timer expires.
+.PP
+Once the lease has been voluntarily or forcibly removed or downgraded,
+and assuming the lease breaker has not unblocked its system call,
+the kernel permits the lease breaker's system call to proceed.
+.PP
+If the lease breaker's blocked
+.BR open (2)
+or
+.BR truncate (2)
+is interrupted by a signal handler,
+then the system call fails with the error
+.BR EINTR ,
+but the other steps still occur as described above.
+If the lease breaker is killed by a signal while blocked in
+.BR open (2)
+or
+.BR truncate (2),
+then the other steps still occur as described above.
+If the lease breaker specifies the
+.B O_NONBLOCK
+flag when calling
+.BR open (2),
+then the call immediately fails with the error
+.BR EWOULDBLOCK ,
+but the other steps still occur as described above.
+.PP
+The default signal used to notify the lease holder is
+.BR SIGIO ,
+but this can be changed using the
+.B F_SETSIG
+command to
+.BR fcntl ().
+If an
+.B F_SETSIG
+command is performed (even one specifying
+.BR SIGIO ),
+and the signal
+handler is established using
+.BR SA_SIGINFO ,
+then the handler will receive a
+.I siginfo_t
+structure as its second argument, and the
+.I si_fd
+field of this argument will hold the file descriptor of the leased file
+that has been accessed by another process.
+(This is useful if the caller holds leases against multiple files.)
+.SS File and directory change notification (dnotify)
+.TP
+.BR F_NOTIFY " (\fIint\fP)"
+(Linux 2.4 onward)
+Provide notification when the directory referred to by
+.I fd
+or any of the files that it contains is changed.
+The events to be notified are specified in
+.IR arg ,
+which is a bit mask specified by ORing together zero or more of
+the following bits:
+.PP
+.RS
+.PD 0
+.TP
+.B DN_ACCESS
+A file was accessed
+.RB ( read (2),
+.BR pread (2),
+.BR readv (2),
+and similar).
+.TP
+.B DN_MODIFY
+A file was modified
+.RB ( write (2),
+.BR pwrite (2),
+.BR writev (2),
+.BR truncate (2),
+.BR ftruncate (2),
+and similar).
+.TP
+.B DN_CREATE
+A file was created
+.RB ( open (2),
+.BR creat (2),
+.BR mknod (2),
+.BR mkdir (2),
+.BR link (2),
+.BR symlink (2),
+.BR rename (2)
+into this directory).
+.TP
+.B DN_DELETE
+A file was unlinked
+.RB ( unlink (2),
+.BR rename (2)
+to another directory,
+.BR rmdir (2)).
+.TP
+.B DN_RENAME
+A file was renamed within this directory
+.RB ( rename (2)).
+.TP
+.B DN_ATTRIB
+The attributes of a file were changed
+.RB ( chown (2),
+.BR chmod (2),
+.BR utime (2),
+.BR utimensat (2),
+and similar).
+.PD
+.RE
+.IP
+(In order to obtain these definitions, the
+.B _GNU_SOURCE
+feature test macro must be defined before including
+.I any
+header files.)
+.IP
+Directory notifications are normally "one-shot", and the application
+must reregister to receive further notifications.
+Alternatively, if
+.B DN_MULTISHOT
+is included in
+.IR arg ,
+then notification will remain in effect until explicitly removed.
+.IP
+.\" The following does seem a poor API-design choice...
+A series of
+.B F_NOTIFY
+requests is cumulative, with the events in
+.I arg
+being added to the set already monitored.
+To disable notification of all events, make an
+.B F_NOTIFY
+call specifying
+.I arg
+as 0.
+.IP
+Notification occurs via delivery of a signal.
+The default signal is
+.BR SIGIO ,
+but this can be changed using the
+.B F_SETSIG
+command to
+.BR fcntl ().
+(Note that
+.B SIGIO
+is one of the nonqueuing standard signals;
+switching to the use of a real-time signal means that
+multiple notifications can be queued to the process.)
+In the latter case, the signal handler receives a
+.I siginfo_t
+structure as its second argument (if the handler was
+established using
+.BR SA_SIGINFO )
+and the
+.I si_fd
+field of this structure contains the file descriptor which
+generated the notification (useful when establishing notification
+on multiple directories).
+.IP
+Especially when using
+.BR DN_MULTISHOT ,
+a real-time signal should be used for notification,
+so that multiple notifications can be queued.
+.IP
+.B NOTE:
+New applications should use the
+.I inotify
+interface (available since Linux 2.6.13),
+which provides a much superior interface for obtaining notifications of
+filesystem events.
+See
+.BR inotify (7).
+.SS Changing the capacity of a pipe
+.TP
+.BR F_SETPIPE_SZ " (\fIint\fP; since Linux 2.6.35)"
+Change the capacity of the pipe referred to by
+.I fd
+to be at least
+.I arg
+bytes.
+An unprivileged process can adjust the pipe capacity to any value
+between the system page size and the limit defined in
+.I /proc/sys/fs/pipe\-max\-size
+(see
+.BR proc (5)).
+Attempts to set the pipe capacity below the page size are silently
+rounded up to the page size.
+Attempts by an unprivileged process to set the pipe capacity above the limit in
+.I /proc/sys/fs/pipe\-max\-size
+yield the error
+.BR EPERM ;
+a privileged process
+.RB ( CAP_SYS_RESOURCE )
+can override the limit.
+.IP
+When allocating the buffer for the pipe,
+the kernel may use a capacity larger than
+.IR arg ,
+if that is convenient for the implementation.
+(In the current implementation,
+the allocation is the next higher power-of-two page-size multiple
+of the requested size.)
+The actual capacity (in bytes) that is set is returned as the function result.
+.IP
+Attempting to set the pipe capacity smaller than the amount
+of buffer space currently used to store data produces the error
+.BR EBUSY .
+.IP
+Note that because of the way the pages of the pipe buffer
+are employed when data is written to the pipe,
+the number of bytes that can be written may be less than the nominal size,
+depending on the size of the writes.
+.TP
+.BR F_GETPIPE_SZ " (\fIvoid\fP; since Linux 2.6.35)"
+Return (as the function result) the capacity of the pipe referred to by
+.IR fd .
+.\"
+.SS File Sealing
+File seals limit the set of allowed operations on a given file.
+For each seal that is set on a file,
+a specific set of operations will fail with
+.B EPERM
+on this file from now on.
+The file is said to be sealed.
+The default set of seals depends on the type of the underlying
+file and filesystem.
+For an overview of file sealing, a discussion of its purpose,
+and some code examples, see
+.BR memfd_create (2).
+.PP
+Currently,
+file seals can be applied only to a file descriptor returned by
+.BR memfd_create (2)
+(if the
+.B MFD_ALLOW_SEALING
+flag was employed).
+On other filesystems, all
+.BR fcntl ()
+operations that operate on seals will return
+.BR EINVAL .
+.PP
+Seals are a property of an inode.
+Thus, all open file descriptors referring to the same inode share
+the same set of seals.
+Furthermore, seals can never be removed, only added.
+.TP
+.BR F_ADD_SEALS " (\fIint\fP; since Linux 3.17)"
+Add the seals given in the bit-mask argument
+.I arg
+to the set of seals of the inode referred to by the file descriptor
+.IR fd .
+Seals cannot be removed again.
+Once this call succeeds, the seals are enforced by the kernel immediately.
+If the current set of seals includes
+.B F_SEAL_SEAL
+(see below), then this call will be rejected with
+.BR EPERM .
+Adding a seal that is already set is a no-op, provided that
+.B F_SEAL_SEAL
+is not set already.
+In order to place a seal, the file descriptor
+.I fd
+must be writable.
+.TP
+.BR F_GET_SEALS " (\fIvoid\fP; since Linux 3.17)"
+Return (as the function result) the current set of seals
+of the inode referred to by
+.IR fd .
+If no seals are set, 0 is returned.
+If the file does not support sealing, \-1 is returned and
+.I errno
+is set to
+.BR EINVAL .
+.PP
+The following seals are available:
+.TP
+.B F_SEAL_SEAL
+If this seal is set, any further call to
+.BR fcntl ()
+with
+.B F_ADD_SEALS
+fails with the error
+.BR EPERM .
+Therefore, this seal prevents any modifications to the set of seals itself.
+If the initial set of seals of a file includes
+.BR F_SEAL_SEAL ,
+then this effectively causes the set of seals to be constant and locked.
+.TP
+.B F_SEAL_SHRINK
+If this seal is set, the file in question cannot be reduced in size.
+This affects
+.BR open (2)
+with the
+.B O_TRUNC
+flag as well as
+.BR truncate (2)
+and
+.BR ftruncate (2).
+Those calls fail with
+.B EPERM
+if you try to shrink the file in question.
+Increasing the file size is still possible.
+.TP
+.B F_SEAL_GROW
+If this seal is set, the size of the file in question cannot be increased.
+This affects
+.BR write (2)
+beyond the end of the file,
+.BR truncate (2),
+.BR ftruncate (2),
+and
+.BR fallocate (2).
+These calls fail with
+.B EPERM
+if you use them to increase the file size.
+If you keep the size or shrink it, those calls still work as expected.
+.TP
+.B F_SEAL_WRITE
+If this seal is set, you cannot modify the contents of the file.
+Note that shrinking or growing the size of the file is
+still possible and allowed.
+.\" One or more other seals are typically used with F_SEAL_WRITE
+.\" because, given a file with the F_SEAL_WRITE seal set, then,
+.\" while it would no longer be possible to (say) write zeros into
+.\" the last 100 bytes of a file, it would still be possible
+.\" to (say) shrink the file by 100 bytes using ftruncate(), and
+.\" then increase the file size by 100 bytes, which would have
+.\" the effect of replacing the last hundred bytes by zeros.
+.\"
+Thus, this seal is normally used in combination with one of the other seals.
+This seal affects
+.BR write (2)
+and
+.BR fallocate (2)
+(only in combination with the
+.B FALLOC_FL_PUNCH_HOLE
+flag).
+Those calls fail with
+.B EPERM
+if this seal is set.
+Furthermore, trying to create new shared, writable memory-mappings via
+.BR mmap (2)
+will also fail with
+.BR EPERM .
+.IP
+Using the
+.B F_ADD_SEALS
+operation to set the
+.B F_SEAL_WRITE
+seal fails with
+.B EBUSY
+if any writable, shared mapping exists.
+Such mappings must be unmapped before you can add this seal.
+Furthermore, if there are any asynchronous I/O operations
+.RB ( io_submit (2))
+pending on the file,
+all outstanding writes will be discarded.
+.TP
+.BR F_SEAL_FUTURE_WRITE " (since Linux 5.1)"
+The effect of this seal is similar to
+.BR F_SEAL_WRITE ,
+but the contents of the file can still be modified via
+shared writable mappings that were created prior to the seal being set.
+Any attempt to create a new writable mapping on the file via
+.BR mmap (2)
+will fail with
+.BR EPERM .
+Likewise, an attempt to write to the file via
+.BR write (2)
+will fail with
+.BR EPERM .
+.IP
+Using this seal,
+one process can create a memory buffer that it can continue to modify
+while sharing that buffer on a "read-only" basis with other processes.
+.\"
+.SS File read/write hints
+Write lifetime hints can be used to inform the kernel about the relative
+expected lifetime of writes on a given inode or
+via a particular open file description.
+(See
+.BR open (2)
+for an explanation of open file descriptions.)
+In this context, the term "write lifetime" means
+the expected time the data will live on media, before
+being overwritten or erased.
+.PP
+An application may use the different hint values specified below to
+separate writes into different write classes,
+so that multiple users or applications running on a single storage back-end
+can aggregate their I/O patterns in a consistent manner.
+However, there are no functional semantics implied by these flags,
+and different I/O classes can use the write lifetime hints
+in arbitrary ways, so long as the hints are used consistently.
+.PP
+The following operations can be applied to the file descriptor,
+.IR fd :
+.TP
+.BR F_GET_RW_HINT " (\fIuint64_t *\fP; since Linux 4.13)"
+Returns the value of the read/write hint associated with the underlying inode
+referred to by
+.IR fd .
+.TP
+.BR F_SET_RW_HINT " (\fIuint64_t *\fP; since Linux 4.13)"
+Sets the read/write hint value associated with the
+underlying inode referred to by
+.IR fd .
+This hint persists until either it is explicitly modified or
+the underlying filesystem is unmounted.
+.TP
+.BR F_GET_FILE_RW_HINT " (\fIuint64_t *\fP; since Linux 4.13)"
+Returns the value of the read/write hint associated with
+the open file description referred to by
+.IR fd .
+.TP
+.BR F_SET_FILE_RW_HINT " (\fIuint64_t *\fP; since Linux 4.13)"
+Sets the read/write hint value associated with the open file description
+referred to by
+.IR fd .
+.PP
+If an open file description has not been assigned a read/write hint,
+then it shall use the value assigned to the inode, if any.
+.PP
+The following read/write
+hints are valid since Linux 4.13:
+.TP
+.B RWH_WRITE_LIFE_NOT_SET
+No specific hint has been set.
+This is the default value.
+.TP
+.B RWH_WRITE_LIFE_NONE
+No specific write lifetime is associated with this file or inode.
+.TP
+.B RWH_WRITE_LIFE_SHORT
+Data written to this inode or via this open file description
+is expected to have a short lifetime.
+.TP
+.B RWH_WRITE_LIFE_MEDIUM
+Data written to this inode or via this open file description
+is expected to have a lifetime longer than
+data written with
+.BR RWH_WRITE_LIFE_SHORT .
+.TP
+.B RWH_WRITE_LIFE_LONG
+Data written to this inode or via this open file description
+is expected to have a lifetime longer than
+data written with
+.BR RWH_WRITE_LIFE_MEDIUM .
+.TP
+.B RWH_WRITE_LIFE_EXTREME
+Data written to this inode or via this open file description
+is expected to have a lifetime longer than
+data written with
+.BR RWH_WRITE_LIFE_LONG .
+.PP
+All the write-specific hints are relative to each other,
+and no individual absolute meaning should be attributed to them.
+.SH RETURN VALUE
+For a successful call, the return value depends on the operation:
+.TP
+.B F_DUPFD
+The new file descriptor.
+.TP
+.B F_GETFD
+Value of file descriptor flags.
+.TP
+.B F_GETFL
+Value of file status flags.
+.TP
+.B F_GETLEASE
+Type of lease held on file descriptor.
+.TP
+.B F_GETOWN
+Value of file descriptor owner.
+.TP
+.B F_GETSIG
+Value of signal sent when read or write becomes possible, or zero
+for traditional
+.B SIGIO
+behavior.
+.TP
+.BR F_GETPIPE_SZ ", " F_SETPIPE_SZ
+The pipe capacity.
+.TP
+.B F_GET_SEALS
+A bit mask identifying the seals that have been set
+for the inode referred to by
+.IR fd .
+.TP
+All other commands
+Zero.
+.PP
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.BR EACCES " or " EAGAIN
+Operation is prohibited by locks held by other processes.
+.TP
+.B EAGAIN
+The operation is prohibited because the file has been memory-mapped by
+another process.
+.TP
+.B EBADF
+.I fd
+is not an open file descriptor.
+.TP
+.B EBADF
+.I cmd
+is
+.B F_SETLK
+or
+.B F_SETLKW
+and the file descriptor open mode doesn't match with the
+type of lock requested.
+.TP
+.B EBUSY
+.I cmd
+is
+.B F_SETPIPE_SZ
+and the new pipe capacity specified in
+.I arg
+is smaller than the amount of buffer space currently
+used to store data in the pipe.
+.TP
+.B EBUSY
+.I cmd
+is
+.BR F_ADD_SEALS ,
+.I arg
+includes
+.BR F_SEAL_WRITE ,
+and there exists a writable, shared mapping on the file referred to by
+.IR fd .
+.TP
+.B EDEADLK
+It was detected that the specified
+.B F_SETLKW
+command would cause a deadlock.
+.TP
+.B EFAULT
+.I lock
+is outside your accessible address space.
+.TP
+.B EINTR
+.I cmd
+is
+.B F_SETLKW
+or
+.B F_OFD_SETLKW
+and the operation was interrupted by a signal; see
+.BR signal (7).
+.TP
+.B EINTR
+.I cmd
+is
+.BR F_GETLK ,
+.BR F_SETLK ,
+.BR F_OFD_GETLK ,
+or
+.BR F_OFD_SETLK ,
+and the operation was interrupted by a signal before the lock was checked or
+acquired.
+This is most likely when locking a remote file (e.g., locking over
+NFS), but can sometimes happen locally.
+.TP
+.B EINVAL
+The value specified in
+.I cmd
+is not recognized by this kernel.
+.TP
+.B EINVAL
+.I cmd
+is
+.B F_ADD_SEALS
+and
+.I arg
+includes an unrecognized sealing bit.
+.TP
+.B EINVAL
+.I cmd
+is
+.B F_ADD_SEALS
+or
+.B F_GET_SEALS
+and the filesystem containing the inode referred to by
+.I fd
+does not support sealing.
+.TP
+.B EINVAL
+.I cmd
+is
+.B F_DUPFD
+and
+.I arg
+is negative or is greater than the maximum allowable value
+(see the discussion of
+.B RLIMIT_NOFILE
+in
+.BR getrlimit (2)).
+.TP
+.B EINVAL
+.I cmd
+is
+.B F_SETSIG
+and
+.I arg
+is not an allowable signal number.
+.TP
+.B EINVAL
+.I cmd
+is
+.BR F_OFD_SETLK ,
+.BR F_OFD_SETLKW ,
+or
+.BR F_OFD_GETLK ,
+and
+.I l_pid
+was not specified as zero.
+.TP
+.B EMFILE
+.I cmd
+is
+.B F_DUPFD
+and the per-process limit on the number of open file descriptors
+has been reached.
+.TP
+.B ENOLCK
+Too many segment locks open, lock table is full, or a remote locking
+protocol failed (e.g., locking over NFS).
+.TP
+.B ENOTDIR
+.B F_NOTIFY
+was specified in
+.IR cmd ,
+but
+.I fd
+does not refer to a directory.
+.TP
+.B EPERM
+.I cmd
+is
+.B F_SETPIPE_SZ
+and the soft or hard user pipe limit has been reached; see
+.BR pipe (7).
+.TP
+.B EPERM
+Attempted to clear the
+.B O_APPEND
+flag on a file that has the append-only attribute set.
+.TP
+.B EPERM
+.I cmd
+is
+.BR F_ADD_SEALS ,
+but
+.I fd
+is not open for writing
+or the current set of seals on the file already includes
+.BR F_SEAL_SEAL .
+.SH STANDARDS
+POSIX.1-2008.
+.PP
+.BR F_GETOWN_EX ,
+.BR F_SETOWN_EX ,
+.BR F_SETPIPE_SZ ,
+.BR F_GETPIPE_SZ ,
+.BR F_GETSIG ,
+.BR F_SETSIG ,
+.BR F_NOTIFY ,
+.BR F_GETLEASE ,
+and
+.B F_SETLEASE
+are Linux-specific.
+(Define the
+.B _GNU_SOURCE
+macro to obtain these definitions.)
+.\" .PP
+.\" SVr4 documents additional EIO, ENOLINK and EOVERFLOW error conditions.
+.PP
+.BR F_OFD_SETLK ,
+.BR F_OFD_SETLKW ,
+and
+.B F_OFD_GETLK
+are Linux-specific (and one must define
+.B _GNU_SOURCE
+to obtain their definitions),
+but work is being done to have them included in the next version of POSIX.1.
+.PP
+.B F_ADD_SEALS
+and
+.B F_GET_SEALS
+are Linux-specific.
+.\" FIXME . Once glibc adds support, add a note about FTM requirements
+.SH HISTORY
+SVr4, 4.3BSD, POSIX.1-2001.
+.PP
+Only the operations
+.BR F_DUPFD ,
+.BR F_GETFD ,
+.BR F_SETFD ,
+.BR F_GETFL ,
+.BR F_SETFL ,
+.BR F_GETLK ,
+.BR F_SETLK ,
+and
+.B F_SETLKW
+are specified in POSIX.1-2001.
+.PP
+.B F_GETOWN
+and
+.B F_SETOWN
+are specified in POSIX.1-2001.
+(To get their definitions, define either
+.\" .BR _BSD_SOURCE ,
+.\" or
+.B _XOPEN_SOURCE
+with the value 500 or greater, or
+.B _POSIX_C_SOURCE
+with the value 200809L or greater.)
+.PP
+.B F_DUPFD_CLOEXEC
+is specified in POSIX.1-2008.
+(To get this definition, define
+.B _POSIX_C_SOURCE
+with the value 200809L or greater, or
+.B _XOPEN_SOURCE
+with the value 700 or greater.)
+.SH NOTES
+The errors returned by
+.BR dup2 (2)
+are different from those returned by
+.BR F_DUPFD .
+.\"
+.SS File locking
+The original Linux
+.BR fcntl ()
+system call was not designed to handle large file offsets
+(in the
+.I flock
+structure).
+Consequently, an
+.BR fcntl64 ()
+system call was added in Linux 2.4.
+The newer system call employs a different structure for file locking,
+.IR flock64 ,
+and corresponding commands,
+.BR F_GETLK64 ,
+.BR F_SETLK64 ,
+and
+.BR F_SETLKW64 .
+However, these details can be ignored by applications using glibc, whose
+.BR fcntl ()
+wrapper function transparently employs the more recent system call
+where it is available.
+.\"
+.SS Record locks
+Since Linux 2.0, there is no interaction between the types of lock
+placed by
+.BR flock (2)
+and
+.BR fcntl ().
+.PP
+Several systems have more fields in
+.I "struct flock"
+such as, for example,
+.I l_sysid
+(to identify the machine where the lock is held).
+.\" e.g., Solaris 8 documents this field in fcntl(2), and Irix 6.5
+.\" documents it in fcntl(5). mtk, May 2007
+.\" Also, FreeBSD documents it (Apr 2014).
+Clearly,
+.I l_pid
+alone is not going to be very useful if the process holding the lock
+may live on a different machine;
+on Linux, while present on some architectures (such as MIPS32),
+this field is not used.
+.PP
+The original Linux
+.BR fcntl ()
+system call was not designed to handle large file offsets
+(in the
+.I flock
+structure).
+Consequently, an
+.BR fcntl64 ()
+system call was added in Linux 2.4.
+The newer system call employs a different structure for file locking,
+.IR flock64 ,
+and corresponding commands,
+.BR F_GETLK64 ,
+.BR F_SETLK64 ,
+and
+.BR F_SETLKW64 .
+However, these details can be ignored by applications using glibc, whose
+.BR fcntl ()
+wrapper function transparently employs the more recent system call
+where it is available.
+.SS Record locking and NFS
+Before Linux 3.12, if an NFSv4 client
+loses contact with the server for a period of time
+(defined as more than 90 seconds with no communication),
+.\"
+.\" Neil Brown: With NFSv3 the failure mode is the reverse. If
+.\" the server loses contact with a client then any lock stays in place
+.\" indefinitely ("why can't I read my mail"... I remember it well).
+.\"
+it might lose and regain a lock without ever being aware of the fact.
+(The period of time after which contact is assumed lost is known as
+the NFSv4 leasetime.
+On a Linux NFS server, this can be determined by looking at
+.IR /proc/fs/nfsd/nfsv4leasetime ,
+which expresses the period in seconds.
+The default value for this file is 90.)
+.\"
+.\" Jeff Layton:
+.\" Note that this is not a firm timeout. The server runs a job
+.\" periodically to clean out expired stateful objects, and it's likely
+.\" that there is some time (maybe even up to another whole lease period)
+.\" between when the timeout expires and the job actually runs. If the
+.\" client gets a RENEW in there within that window, its lease will be
+.\" renewed and its state preserved.
+.\"
+This scenario potentially risks data corruption,
+since another process might acquire a lock in the intervening period
+and perform file I/O.
+.PP
+Since Linux 3.12,
+.\" commit ef1820f9be27b6ad158f433ab38002ab8131db4d
+if an NFSv4 client loses contact with the server,
+any I/O to the file by a process which "thinks" it holds
+a lock will fail until that process closes and reopens the file.
+A kernel parameter,
+.IR nfs.recover_lost_locks ,
+can be set to 1 to obtain the pre-3.12 behavior,
+whereby the client will attempt to recover lost locks
+when contact is reestablished with the server.
+Because of the attendant risk of data corruption,
+.\" commit f6de7a39c181dfb8a2c534661a53c73afb3081cd
+this parameter defaults to 0 (disabled).
+.SH BUGS
+.SS F_SETFL
+It is not possible to use
+.B F_SETFL
+to change the state of the
+.B O_DSYNC
+and
+.B O_SYNC
+flags.
+.\" FIXME . According to POSIX.1-2001, O_SYNC should also be modifiable
+.\" via fcntl(2), but currently Linux does not permit this
+.\" See http://bugzilla.kernel.org/show_bug.cgi?id=5994
+Attempts to change the state of these flags are silently ignored.
+.SS F_GETOWN
+A limitation of the Linux system call conventions on some
+architectures (notably i386) means that if a (negative)
+process group ID to be returned by
+.B F_GETOWN
+falls in the range \-1 to \-4095, then the return value is wrongly
+interpreted by glibc as an error in the system call;
+.\" glibc source: sysdeps/unix/sysv/linux/i386/sysdep.h
+that is, the return value of
+.BR fcntl ()
+will be \-1, and
+.I errno
+will contain the (positive) process group ID.
+The Linux-specific
+.B F_GETOWN_EX
+operation avoids this problem.
+.\" mtk, Dec 04: some limited testing on alpha and ia64 seems to
+.\" indicate that ANY negative PGID value will cause F_GETOWN
+.\" to misinterpret the return as an error. Some other architectures
+.\" seem to have the same range check as i386.
+Since glibc 2.11, glibc makes the kernel
+.B F_GETOWN
+problem invisible by implementing
+.B F_GETOWN
+using
+.BR F_GETOWN_EX .
+.SS F_SETOWN
+In Linux 2.4 and earlier, there is a bug that can occur
+when an unprivileged process uses
+.B F_SETOWN
+to specify the owner
+of a socket file descriptor
+as a process (group) other than the caller.
+In this case,
+.BR fcntl ()
+can return \-1 with
+.I errno
+set to
+.BR EPERM ,
+even when the owner process (group) is one that the caller
+has permission to send signals to.
+Despite this error return, the file descriptor owner is set,
+and signals will be sent to the owner.
+.\"
+.SS Deadlock detection
+The deadlock-detection algorithm employed by the kernel when dealing with
+.B F_SETLKW
+requests can yield both
+false negatives (failures to detect deadlocks,
+leaving a set of deadlocked processes blocked indefinitely)
+and false positives
+.RB ( EDEADLK
+errors when there is no deadlock).
+For example,
+the kernel limits the lock depth of its dependency search to 10 steps,
+meaning that circular deadlock chains that exceed
+that size will not be detected.
+In addition, the kernel may falsely indicate a deadlock
+when two or more processes created using the
+.BR clone (2)
+.B CLONE_FILES
+flag place locks that appear (to the kernel) to conflict.
+.\"
+.SS Mandatory locking
+The Linux implementation of mandatory locking
+is subject to race conditions which render it unreliable:
+.\" http://marc.info/?l=linux-kernel&m=119013491707153&w=2
+.\"
+.\" Reconfirmed by Jeff Layton
+.\" From: Jeff Layton <jlayton <at> redhat.com>
+.\" Subject: Re: Status of fcntl() mandatory locking
+.\" Newsgroups: gmane.linux.file-systems
+.\" Date: 2014-04-28 10:07:57 GMT
+.\" http://thread.gmane.org/gmane.linux.file-systems/84481/focus=84518
+a
+.BR write (2)
+call that overlaps with a lock may modify data after the mandatory lock is
+acquired;
+a
+.BR read (2)
+call that overlaps with a lock may detect changes to data that were made
+only after a write lock was acquired.
+Similar races exist between mandatory locks and
+.BR mmap (2).
+It is therefore inadvisable to rely on mandatory locking.
+.SH SEE ALSO
+.BR dup2 (2),
+.BR flock (2),
+.BR open (2),
+.BR socket (2),
+.BR lockf (3),
+.BR capabilities (7),
+.BR feature_test_macros (7),
+.BR lslocks (8)
+.PP
+.IR locks.txt ,
+.IR mandatory\-locking.txt ,
+and
+.I dnotify.txt
+in the Linux kernel source directory
+.I Documentation/filesystems/
+(on older kernels, these files are directly under the
+.I Documentation/
+directory, and
+.I mandatory\-locking.txt
+is called
+.IR mandatory.txt )
diff --git a/man2/fcntl64.2 b/man2/fcntl64.2
new file mode 100644
index 0000000..fc8ddc1
--- /dev/null
+++ b/man2/fcntl64.2
@@ -0,0 +1 @@
+.so man2/fcntl.2
diff --git a/man2/fdatasync.2 b/man2/fdatasync.2
new file mode 100644
index 0000000..3c7494f
--- /dev/null
+++ b/man2/fdatasync.2
@@ -0,0 +1 @@
+.so man2/fsync.2
diff --git a/man2/fdetach.2 b/man2/fdetach.2
new file mode 100644
index 0000000..5d25ea6
--- /dev/null
+++ b/man2/fdetach.2
@@ -0,0 +1 @@
+.so man2/unimplemented.2
diff --git a/man2/fgetxattr.2 b/man2/fgetxattr.2
new file mode 100644
index 0000000..d9e5d90
--- /dev/null
+++ b/man2/fgetxattr.2
@@ -0,0 +1 @@
+.so man2/getxattr.2
diff --git a/man2/finit_module.2 b/man2/finit_module.2
new file mode 100644
index 0000000..20c5c51
--- /dev/null
+++ b/man2/finit_module.2
@@ -0,0 +1 @@
+.so man2/init_module.2
diff --git a/man2/flistxattr.2 b/man2/flistxattr.2
new file mode 100644
index 0000000..117bd2b
--- /dev/null
+++ b/man2/flistxattr.2
@@ -0,0 +1 @@
+.so man2/listxattr.2
diff --git a/man2/flock.2 b/man2/flock.2
new file mode 100644
index 0000000..5f2917f
--- /dev/null
+++ b/man2/flock.2
@@ -0,0 +1,267 @@
+.\" Copyright 1993 Rickard E. Faith (faith@cs.unc.edu) and
+.\" and Copyright 2002 Michael Kerrisk
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified Fri Jan 31 16:26:07 1997 by Eric S. Raymond <esr@thyrsus.com>
+.\" Modified Fri Dec 11 17:57:27 1998 by Jamie Lokier <jamie@imbolc.ucc.ie>
+.\" Modified 24 Apr 2002 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Substantial rewrites and additions
+.\" 2005-05-10 mtk, noted that lock conversions are not atomic.
+.\"
+.\" FIXME Maybe document LOCK_MAND, LOCK_RW, LOCK_READ, LOCK_WRITE
+.\" which only have effect for SAMBA.
+.\"
+.TH flock 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+flock \- apply or remove an advisory lock on an open file
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/file.h>
+.PP
+.BI "int flock(int " fd ", int " operation );
+.fi
+.SH DESCRIPTION
+Apply or remove an advisory lock on the open file specified by
+.IR fd .
+The argument
+.I operation
+is one of the following:
+.RS 4
+.TP 9
+.B LOCK_SH
+Place a shared lock.
+More than one process may hold a shared lock for a given file
+at a given time.
+.TP
+.B LOCK_EX
+Place an exclusive lock.
+Only one process may hold an exclusive lock for a given
+file at a given time.
+.TP
+.B LOCK_UN
+Remove an existing lock held by this process.
+.RE
+.PP
+A call to
+.BR flock ()
+may block if an incompatible lock is held by another process.
+To make a nonblocking request, include
+.B LOCK_NB
+(by ORing)
+with any of the above operations.
+.PP
+A single file may not simultaneously have both shared and exclusive locks.
+.PP
+Locks created by
+.BR flock ()
+are associated with an open file description (see
+.BR open (2)).
+This means that duplicate file descriptors (created by, for example,
+.BR fork (2)
+or
+.BR dup (2))
+refer to the same lock, and this lock may be modified
+or released using any of these file descriptors.
+Furthermore, the lock is released either by an explicit
+.B LOCK_UN
+operation on any of these duplicate file descriptors, or when all
+such file descriptors have been closed.
+.PP
+If a process uses
+.BR open (2)
+(or similar) to obtain more than one file descriptor for the same file,
+these file descriptors are treated independently by
+.BR flock ().
+An attempt to lock the file using one of these file descriptors
+may be denied by a lock that the calling process has
+already placed via another file descriptor.
+.PP
+A process may hold only one type of lock (shared or exclusive)
+on a file.
+Subsequent
+.BR flock ()
+calls on an already locked file will convert an existing lock to the new
+lock mode.
+.PP
+Locks created by
+.BR flock ()
+are preserved across an
+.BR execve (2).
+.PP
+A shared or exclusive lock can be placed on a file regardless of the
+mode in which the file was opened.
+.SH RETURN VALUE
+On success, zero is returned.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EBADF
+.I fd
+is not an open file descriptor.
+.TP
+.B EINTR
+While waiting to acquire a lock, the call was interrupted by
+delivery of a signal caught by a handler; see
+.BR signal (7).
+.TP
+.B EINVAL
+.I operation
+is invalid.
+.TP
+.B ENOLCK
+The kernel ran out of memory for allocating lock records.
+.TP
+.B EWOULDBLOCK
+The file is locked and the
+.B LOCK_NB
+flag was selected.
+.SH VERSIONS
+Since Linux 2.0,
+.BR flock ()
+is implemented as a system call in its own right rather
+than being emulated in the GNU C library as a call to
+.BR fcntl (2).
+With this implementation,
+there is no interaction between the types of lock
+placed by
+.BR flock ()
+and
+.BR fcntl (2),
+and
+.BR flock ()
+does not detect deadlock.
+(Note, however, that on some systems, such as the modern BSDs,
+.\" E.g., according to the flock(2) man page, FreeBSD since at least 5.3
+.BR flock ()
+and
+.BR fcntl (2)
+locks
+.I do
+interact with one another.)
+.SS CIFS details
+Up to Linux 5.4,
+.BR flock ()
+is not propagated over SMB.
+A file with such locks will not appear locked for remote clients.
+.PP
+Since Linux 5.5,
+.BR flock ()
+locks are emulated with SMB byte-range locks on the entire file.
+Similarly to NFS, this means that
+.BR fcntl (2)
+and
+.BR flock ()
+locks interact with one another.
+Another important side effect is that the locks are not advisory anymore:
+any I/O on a locked file will always fail with
+.B EACCES
+when done from a separate file descriptor.
+This difference originates from the design of locks in the SMB protocol,
+which provides mandatory locking semantics.
+.PP
+Remote and mandatory locking semantics may vary with
+SMB protocol, mount options and server type.
+See
+.BR mount.cifs (8)
+for additional information.
+.SH STANDARDS
+BSD.
+.SH HISTORY
+4.4BSD (the
+.BR flock ()
+call first appeared in 4.2BSD).
+A version of
+.BR flock (),
+possibly implemented in terms of
+.BR fcntl (2),
+appears on most UNIX systems.
+.SS NFS details
+Up to Linux 2.6.11,
+.BR flock ()
+did not lock files over NFS
+(i.e., the scope of locks was limited to the local system).
+Instead, one could use
+.BR fcntl (2)
+byte-range locking, which does work over NFS,
+given a sufficiently recent version of
+Linux and a server which supports locking.
+.PP
+Since Linux 2.6.12, NFS clients support
+.BR flock ()
+locks by emulating them as
+.BR fcntl (2)
+byte-range locks on the entire file.
+This means that
+.BR fcntl (2)
+and
+.BR flock ()
+locks
+.I do
+interact with one another over NFS.
+It also means that in order to place an exclusive lock,
+the file must be opened for writing.
+.PP
+Since Linux 2.6.37,
+.\" commit 5eebde23223aeb0ad2d9e3be6590ff8bbfab0fc2
+the kernel supports a compatibility mode that allows
+.BR flock ()
+locks (and also
+.BR fcntl (2)
+byte region locks) to be treated as local;
+see the discussion of the
+.I "local_lock"
+option in
+.BR nfs (5).
+.SH NOTES
+.BR flock ()
+places advisory locks only; given suitable permissions on a file,
+a process is free to ignore the use of
+.BR flock ()
+and perform I/O on the file.
+.PP
+.BR flock ()
+and
+.BR fcntl (2)
+locks have different semantics with respect to forked processes and
+.BR dup (2).
+On systems that implement
+.BR flock ()
+using
+.BR fcntl (2),
+the semantics of
+.BR flock ()
+will be different from those described in this manual page.
+.PP
+Converting a lock
+(shared to exclusive, or vice versa) is not guaranteed to be atomic:
+the existing lock is first removed, and then a new lock is established.
+Between these two steps,
+a pending lock request by another process may be granted,
+with the result that the conversion either blocks, or fails if
+.B LOCK_NB
+was specified.
+(This is the original BSD behavior,
+and occurs on many other implementations.)
+.\" Kernel 2.5.21 changed things a little: during lock conversion
+.\" it is now the highest priority process that will get the lock -- mtk
+.SH SEE ALSO
+.BR flock (1),
+.BR close (2),
+.BR dup (2),
+.BR execve (2),
+.BR fcntl (2),
+.BR fork (2),
+.BR open (2),
+.BR lockf (3),
+.BR lslocks (8)
+.PP
+.I Documentation/filesystems/locks.txt
+in the Linux kernel source tree
+.RI ( Documentation/locks.txt
+in older kernels)
diff --git a/man2/fork.2 b/man2/fork.2
new file mode 100644
index 0000000..607a86b
--- /dev/null
+++ b/man2/fork.2
@@ -0,0 +1,348 @@
+.\" Copyright (C) 2006 Michael Kerrisk <mtk.manpages@gmail.com>
+.\" A few fragments remain from an earlier (1992) page by
+.\" Drew Eckhardt (drew@cs.colorado.edu),
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified by Michael Haardt (michael@moria.de)
+.\" Modified Sat Jul 24 13:22:07 1993 by Rik Faith (faith@cs.unc.edu)
+.\" Modified 21 Aug 1994 by Michael Chastain (mec@shell.portal.com):
+.\" Referenced 'clone(2)'.
+.\" Modified 1995-06-10, 1996-04-18, 1999-11-01, 2000-12-24
+.\" by Andries Brouwer (aeb@cwi.nl)
+.\" Modified, 27 May 2004, Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Added notes on capability requirements
+.\" 2006-09-04, Michael Kerrisk
+.\" Greatly expanded, to describe all attributes that differ
+.\" between parent and child.
+.\"
+.TH fork 2 2023-05-03 "Linux man-pages 6.05.01"
+.SH NAME
+fork \- create a child process
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <unistd.h>
+.PP
+.B pid_t fork(void);
+.fi
+.SH DESCRIPTION
+.BR fork ()
+creates a new process by duplicating the calling process.
+The new process is referred to as the
+.I child
+process.
+The calling process is referred to as the
+.I parent
+process.
+.PP
+The child process and the parent process run in separate memory spaces.
+At the time of
+.BR fork ()
+both memory spaces have the same content.
+Memory writes, file mappings
+.RB ( mmap (2)),
+and unmappings
+.RB ( munmap (2))
+performed by one of the processes do not affect the other.
+.PP
+The child process is an exact duplicate of the parent
+process except for the following points:
+.IP \[bu] 3
+The child has its own unique process ID,
+and this PID does not match the ID of any existing process group
+.RB ( setpgid (2))
+or session.
+.IP \[bu]
+The child's parent process ID is the same as the parent's process ID.
+.IP \[bu]
+The child does not inherit its parent's memory locks
+.RB ( mlock (2),
+.BR mlockall (2)).
+.IP \[bu]
+Process resource utilizations
+.RB ( getrusage (2))
+and CPU time counters
+.RB ( times (2))
+are reset to zero in the child.
+.IP \[bu]
+The child's set of pending signals is initially empty
+.RB ( sigpending (2)).
+.IP \[bu]
+The child does not inherit semaphore adjustments from its parent
+.RB ( semop (2)).
+.IP \[bu]
+The child does not inherit process-associated record locks from its parent
+.RB ( fcntl (2)).
+(On the other hand, it does inherit
+.BR fcntl (2)
+open file description locks and
+.BR flock (2)
+locks from its parent.)
+.IP \[bu]
+The child does not inherit timers from its parent
+.RB ( setitimer (2),
+.BR alarm (2),
+.BR timer_create (2)).
+.IP \[bu]
+The child does not inherit outstanding asynchronous I/O operations
+from its parent
+.RB ( aio_read (3),
+.BR aio_write (3)),
+nor does it inherit any asynchronous I/O contexts from its parent (see
+.BR io_setup (2)).
+.PP
+The process attributes in the preceding list are all specified
+in POSIX.1.
+The parent and child also differ with respect to the following
+Linux-specific process attributes:
+.IP \[bu] 3
+The child does not inherit directory change notifications (dnotify)
+from its parent
+(see the description of
+.B F_NOTIFY
+in
+.BR fcntl (2)).
+.IP \[bu]
+The
+.BR prctl (2)
+.B PR_SET_PDEATHSIG
+setting is reset so that the child does not receive a signal
+when its parent terminates.
+.IP \[bu]
+The default timer slack value is set to the parent's
+current timer slack value.
+See the description of
+.B PR_SET_TIMERSLACK
+in
+.BR prctl (2).
+.IP \[bu]
+Memory mappings that have been marked with the
+.BR madvise (2)
+.B MADV_DONTFORK
+flag are not inherited across a
+.BR fork ().
+.IP \[bu]
+Memory in address ranges that have been marked with the
+.BR madvise (2)
+.B MADV_WIPEONFORK
+flag is zeroed in the child after a
+.BR fork ().
+(The
+.B MADV_WIPEONFORK
+setting remains in place for those address ranges in the child.)
+.IP \[bu]
+The termination signal of the child is always
+.B SIGCHLD
+(see
+.BR clone (2)).
+.IP \[bu]
+The port access permission bits set by
+.BR ioperm (2)
+are not inherited by the child;
+the child must turn on any bits that it requires using
+.BR ioperm (2).
+.PP
+Note the following further points:
+.IP \[bu] 3
+The child process is created with a single thread\[em]the
+one that called
+.BR fork ().
+The entire virtual address space of the parent is replicated in the child,
+including the states of mutexes, condition variables,
+and other pthreads objects; the use of
+.BR pthread_atfork (3)
+may be helpful for dealing with problems that this can cause.
+.IP \[bu]
+After a
+.BR fork ()
+in a multithreaded program,
+the child can safely call only async-signal-safe functions (see
+.BR signal\-safety (7))
+until such time as it calls
+.BR execve (2).
+.IP \[bu]
+The child inherits copies of the parent's set of open file descriptors.
+Each file descriptor in the child refers to the same
+open file description (see
+.BR open (2))
+as the corresponding file descriptor in the parent.
+This means that the two file descriptors share open file status flags,
+file offset,
+and signal-driven I/O attributes (see the description of
+.B F_SETOWN
+and
+.B F_SETSIG
+in
+.BR fcntl (2)).
+.IP \[bu]
+The child inherits copies of the parent's set of open message
+queue descriptors (see
+.BR mq_overview (7)).
+Each file descriptor in the child refers to the same
+open message queue description
+as the corresponding file descriptor in the parent.
+This means that the two file descriptors share the same flags
+.RI ( mq_flags ).
+.IP \[bu]
+The child inherits copies of the parent's set of open directory streams (see
+.BR opendir (3)).
+POSIX.1 says that the corresponding directory streams
+in the parent and child
+.I may
+share the directory stream positioning;
+on Linux/glibc they do not.
+.SH RETURN VALUE
+On success, the PID of the child process is returned in the parent,
+and 0 is returned in the child.
+On failure, \-1 is returned in the parent,
+no child process is created, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EAGAIN
+.\" NOTE! The following should match the description in pthread_create(3)
+A system-imposed limit on the number of threads was encountered.
+There are a number of limits that may trigger this error:
+.RS
+.IP \[bu] 3
+the
+.B RLIMIT_NPROC
+soft resource limit (set via
+.BR setrlimit (2)),
+which limits the number of processes and threads for a real user ID,
+was reached;
+.IP \[bu]
+the kernel's system-wide limit on the number of processes and threads,
+.IR /proc/sys/kernel/threads\-max ,
+was reached (see
+.BR proc (5));
+.IP \[bu]
+the maximum number of PIDs,
+.IR /proc/sys/kernel/pid_max ,
+was reached (see
+.BR proc (5));
+or
+.IP \[bu]
+the PID limit
+.RI ( pids.max )
+imposed by the cgroup "process number" (PIDs) controller was reached.
+.RE
+.TP
+.B EAGAIN
+The caller is operating under the
+.B SCHED_DEADLINE
+scheduling policy and does not have the reset-on-fork flag set.
+See
+.BR sched (7).
+.TP
+.B ENOMEM
+.BR fork ()
+failed to allocate the necessary kernel structures because memory is tight.
+.TP
+.B ENOMEM
+An attempt was made to create a child process in a PID namespace
+whose "init" process has terminated.
+See
+.BR pid_namespaces (7).
+.TP
+.B ENOSYS
+.BR fork ()
+is not supported on this platform (for example,
+.\" e.g., arm (optionally), blackfin, c6x, frv, h8300, microblaze, xtensa
+hardware without a Memory-Management Unit).
+.TP
+.BR ERESTARTNOINTR " (since Linux 2.6.17)"
+.\" commit 4a2c7a7837da1b91468e50426066d988050e4d56
+System call was interrupted by a signal and will be restarted.
+(This can be seen only during a trace.)
+.SH VERSIONS
+.SS C library/kernel differences
+Since glibc 2.3.3,
+.\" nptl/sysdeps/unix/sysv/linux/fork.c
+rather than invoking the kernel's
+.BR fork ()
+system call,
+the glibc
+.BR fork ()
+wrapper that is provided as part of the
+NPTL threading implementation invokes
+.BR clone (2)
+with flags that provide the same effect as the traditional system call.
+(A call to
+.BR fork ()
+is equivalent to a call to
+.BR clone (2)
+specifying
+.I flags
+as just
+.BR SIGCHLD .)
+The glibc wrapper invokes any fork handlers that have been
+established using
+.BR pthread_atfork (3).
+.\" and does some magic to ensure that getpid(2) returns the right value.
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+POSIX.1-2001, SVr4, 4.3BSD.
+.SH NOTES
+Under Linux,
+.BR fork ()
+is implemented using copy-on-write pages, so the only penalty that it incurs
+is the time and memory required to duplicate the parent's page tables,
+and to create a unique task structure for the child.
+.SH EXAMPLES
+See
+.BR pipe (2)
+and
+.BR wait (2)
+for more examples.
+.PP
+.\" SRC BEGIN (fork.c)
+.EX
+#include <signal.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+\&
+int
+main(void)
+{
+ pid_t pid;
+\&
+ if (signal(SIGCHLD, SIG_IGN) == SIG_ERR) {
+ perror("signal");
+ exit(EXIT_FAILURE);
+ }
+ pid = fork();
+ switch (pid) {
+ case \-1:
+ perror("fork");
+ exit(EXIT_FAILURE);
+ case 0:
+ puts("Child exiting.");
+ exit(EXIT_SUCCESS);
+ default:
+ printf("Child is PID %jd\en", (intmax_t) pid);
+ puts("Parent exiting.");
+ exit(EXIT_SUCCESS);
+ }
+}
+.EE
+.\" SRC END
+.SH SEE ALSO
+.BR clone (2),
+.BR execve (2),
+.BR exit (2),
+.BR setrlimit (2),
+.BR unshare (2),
+.BR vfork (2),
+.BR wait (2),
+.BR daemon (3),
+.BR pthread_atfork (3),
+.BR capabilities (7),
+.BR credentials (7)
diff --git a/man2/free_hugepages.2 b/man2/free_hugepages.2
new file mode 100644
index 0000000..d4b906a
--- /dev/null
+++ b/man2/free_hugepages.2
@@ -0,0 +1 @@
+.so man2/alloc_hugepages.2
diff --git a/man2/fremovexattr.2 b/man2/fremovexattr.2
new file mode 100644
index 0000000..38d01cc
--- /dev/null
+++ b/man2/fremovexattr.2
@@ -0,0 +1 @@
+.so man2/removexattr.2
diff --git a/man2/fsetxattr.2 b/man2/fsetxattr.2
new file mode 100644
index 0000000..dc07807
--- /dev/null
+++ b/man2/fsetxattr.2
@@ -0,0 +1 @@
+.so man2/setxattr.2
diff --git a/man2/fstat.2 b/man2/fstat.2
new file mode 100644
index 0000000..b1a86c1
--- /dev/null
+++ b/man2/fstat.2
@@ -0,0 +1 @@
+.so man2/stat.2
diff --git a/man2/fstat64.2 b/man2/fstat64.2
new file mode 100644
index 0000000..2b9971d
--- /dev/null
+++ b/man2/fstat64.2
@@ -0,0 +1 @@
+.so man2/fstat.2
diff --git a/man2/fstatat.2 b/man2/fstatat.2
new file mode 100644
index 0000000..b1a86c1
--- /dev/null
+++ b/man2/fstatat.2
@@ -0,0 +1 @@
+.so man2/stat.2
diff --git a/man2/fstatat64.2 b/man2/fstatat64.2
new file mode 100644
index 0000000..7791269
--- /dev/null
+++ b/man2/fstatat64.2
@@ -0,0 +1 @@
+.so man2/fstatat.2
diff --git a/man2/fstatfs.2 b/man2/fstatfs.2
new file mode 100644
index 0000000..923d3c0
--- /dev/null
+++ b/man2/fstatfs.2
@@ -0,0 +1 @@
+.so man2/statfs.2
diff --git a/man2/fstatfs64.2 b/man2/fstatfs64.2
new file mode 100644
index 0000000..fde2b22
--- /dev/null
+++ b/man2/fstatfs64.2
@@ -0,0 +1 @@
+.so man2/fstatfs.2
diff --git a/man2/fsync.2 b/man2/fsync.2
new file mode 100644
index 0000000..623e7ca
--- /dev/null
+++ b/man2/fsync.2
@@ -0,0 +1,195 @@
+.\" Copyright 1993 Rickard E. Faith (faith@cs.unc.edu) and
+.\" and Copyright 2006 Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified 21 Aug 1994 by Michael Chastain <mec@shell.portal.com>:
+.\" Removed note about old libc (pre-4.5.26) translating to 'sync'.
+.\" Modified 15 Apr 1995 by Michael Chastain <mec@shell.portal.com>:
+.\" Added `see also' section.
+.\" Modified 13 Apr 1996 by Markus Kuhn <mskuhn@cip.informatik.uni-erlangen.de>
+.\" Added remarks about fdatasync.
+.\" Modified 31 Jan 1997 by Eric S. Raymond <esr@thyrsus.com>
+.\" Modified 18 Apr 2001 by Andi Kleen
+.\" Fix description to describe what it really does; add a few caveats.
+.\" 2006-04-28, mtk, substantial rewrite of various parts.
+.\" 2012-02-27 Various changes by Christoph Hellwig <hch@lst.de>
+.\"
+.TH fsync 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+fsync, fdatasync \- synchronize a file's in-core state with storage device
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <unistd.h>
+.PP
+.BI "int fsync(int " fd );
+.PP
+.BI "int fdatasync(int " fd );
+.fi
+.PP
+.RS -4
+Feature Test Macro Requirements for glibc (see
+.BR feature_test_macros (7)):
+.RE
+.PP
+.nf
+.BR fsync ():
+ glibc 2.16 and later:
+ No feature test macros need be defined
+ glibc up to and including 2.15:
+ _BSD_SOURCE || _XOPEN_SOURCE
+ || /* Since glibc 2.8: */ _POSIX_C_SOURCE >= 200112L
+.fi
+.PP
+.BR fdatasync ():
+.nf
+ _POSIX_C_SOURCE >= 199309L || _XOPEN_SOURCE >= 500
+.fi
+.SH DESCRIPTION
+.BR fsync ()
+transfers ("flushes") all modified in-core data of
+(i.e., modified buffer cache pages for) the
+file referred to by the file descriptor
+.I fd
+to the disk device (or other permanent storage device) so that all
+changed information can be retrieved even if the system crashes or
+is rebooted.
+This includes writing through or flushing a disk cache if present.
+The call blocks until the device reports that the transfer has completed.
+.PP
+As well as flushing the file data,
+.BR fsync ()
+also flushes the metadata information associated with the file (see
+.BR inode (7)).
+.PP
+Calling
+.BR fsync ()
+does not necessarily ensure
+that the entry in the directory containing the file has also reached disk.
+For that an explicit
+.BR fsync ()
+on a file descriptor for the directory is also needed.
+.PP
+.BR fdatasync ()
+is similar to
+.BR fsync (),
+but does not flush modified metadata unless that metadata
+is needed in order to allow a subsequent data retrieval to be
+correctly handled.
+For example, changes to
+.I st_atime
+or
+.I st_mtime
+(respectively, time of last access and
+time of last modification; see
+.BR inode (7))
+do not require flushing because they are not necessary for
+a subsequent data read to be handled correctly.
+On the other hand, a change to the file size
+.RI ( st_size ,
+as made by, say,
+.BR ftruncate (2)),
+would require a metadata flush.
+.PP
+The aim of
+.BR fdatasync ()
+is to reduce disk activity for applications that do not
+require all metadata to be synchronized with the disk.
+.SH RETURN VALUE
+On success, these system calls return zero.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EBADF
+.I fd
+is not a valid open file descriptor.
+.TP
+.B EINTR
+The function was interrupted by a signal; see
+.BR signal (7).
+.TP
+.B EIO
+An error occurred during synchronization.
+This error may relate to data written to some other file descriptor
+on the same file.
+Since Linux 4.13,
+.\" commit 088737f44bbf6378745f5b57b035e57ee3dc4750
+errors from write-back will be reported to
+all file descriptors that might have written the data which triggered
+the error.
+Some filesystems (e.g., NFS) keep close track of which data
+came through which file descriptor, and give more precise reporting.
+Other filesystems (e.g., most local filesystems) will report errors to
+all file descriptors that were open on the file when the error was recorded.
+.TP
+.B ENOSPC
+Disk space was exhausted while synchronizing.
+.TP
+.BR EROFS ", " EINVAL
+.I fd
+is bound to a special file (e.g., a pipe, FIFO, or socket)
+which does not support synchronization.
+.TP
+.BR ENOSPC ", " EDQUOT
+.I fd
+is bound to a file on NFS or another filesystem which does not allocate
+space at the time of a
+.BR write (2)
+system call, and some previous write failed due to insufficient
+storage space.
+.SH VERSIONS
+On POSIX systems on which
+.BR fdatasync ()
+is available,
+.B _POSIX_SYNCHRONIZED_IO
+is defined in
+.I <unistd.h>
+to a value greater than 0.
+(See also
+.BR sysconf (3).)
+.\" POSIX.1-2001: It shall be defined to -1 or 0 or 200112L.
+.\" -1: unavailable, 0: ask using sysconf().
+.\" glibc defines them to 1.
+.PP
+On some UNIX systems (but not Linux),
+.I fd
+must be a
+.I writable
+file descriptor.
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+POSIX.1-2001, 4.3BSD.
+.PP
+In Linux 2.2 and earlier,
+.BR fdatasync ()
+is equivalent to
+.BR fsync (),
+and so has no performance advantage.
+.PP
+The
+.BR fsync ()
+implementations in older kernels and lesser-used filesystems
+do not know how to flush disk caches.
+In these cases disk caches need to be disabled using
+.BR hdparm (8)
+or
+.BR sdparm (8)
+to guarantee safe operation.
+.SH SEE ALSO
+.BR sync (1),
+.BR bdflush (2),
+.BR open (2),
+.BR posix_fadvise (2),
+.BR pwritev (2),
+.BR sync (2),
+.BR sync_file_range (2),
+.BR fflush (3),
+.BR fileno (3),
+.BR hdparm (8),
+.BR mount (8)
diff --git a/man2/ftruncate.2 b/man2/ftruncate.2
new file mode 100644
index 0000000..2ed34f1
--- /dev/null
+++ b/man2/ftruncate.2
@@ -0,0 +1 @@
+.so man2/truncate.2
diff --git a/man2/ftruncate64.2 b/man2/ftruncate64.2
new file mode 100644
index 0000000..a8862d3
--- /dev/null
+++ b/man2/ftruncate64.2
@@ -0,0 +1 @@
+.so man2/ftruncate.2
diff --git a/man2/futex.2 b/man2/futex.2
new file mode 100644
index 0000000..43b1075
--- /dev/null
+++ b/man2/futex.2
@@ -0,0 +1,1976 @@
+.\" Page by b.hubert
+.\" and Copyright (C) 2015, Thomas Gleixner <tglx@linutronix.de>
+.\" and Copyright (C) 2015, Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" %%%LICENSE_START(FREELY_REDISTRIBUTABLE)
+.\" may be freely modified and distributed
+.\" %%%LICENSE_END
+.\"
+.\" Niki A. Rahimi (LTC Security Development, narahimi@us.ibm.com)
+.\" added ERRORS section.
+.\"
+.\" Modified 2004-06-17 mtk
+.\" Modified 2004-10-07 aeb, added FUTEX_REQUEUE, FUTEX_CMP_REQUEUE
+.\"
+.\" FIXME Still to integrate are some points from Torvald Riegel's mail of
+.\" 2015-01-23:
+.\" http://thread.gmane.org/gmane.linux.kernel/1703405/focus=7977
+.\"
+.\" FIXME Do we need to add some text regarding Torvald Riegel's 2015-01-24 mail
+.\" http://thread.gmane.org/gmane.linux.kernel/1703405/focus=1873242
+.\"
+.TH futex 2 2023-05-03 "Linux man-pages 6.05.01"
+.SH NAME
+futex \- fast user-space locking
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.PP
+.BR "#include <linux/futex.h>" " /* Definition of " FUTEX_* " constants */"
+.BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */"
+.B #include <unistd.h>
+.PP
+.BI "long syscall(SYS_futex, uint32_t *" uaddr ", int " futex_op \
+", uint32_t " val ,
+.BI " const struct timespec *" timeout , \
+" \fR /* or: \fBuint32_t \fIval2\fP */"
+.BI " uint32_t *" uaddr2 ", uint32_t " val3 );
+.fi
+.PP
+.IR Note :
+glibc provides no wrapper for
+.BR futex (),
+necessitating the use of
+.BR syscall (2).
+.SH DESCRIPTION
+The
+.BR futex ()
+system call provides a method for waiting until a certain condition becomes
+true.
+It is typically used as a blocking construct in the context of
+shared-memory synchronization.
+When using futexes, the majority of
+the synchronization operations are performed in user space.
+A user-space program employs the
+.BR futex ()
+system call only when it is likely that the program has to block for
+a longer time until the condition becomes true.
+Other
+.BR futex ()
+operations can be used to wake any processes or threads waiting
+for a particular condition.
+.PP
+A futex is a 32-bit value\[em]referred to below as a
+.IR "futex word" \[em]whose
+address is supplied to the
+.BR futex ()
+system call.
+(Futexes are 32 bits in size on all platforms, including 64-bit systems.)
+All futex operations are governed by this value.
+In order to share a futex between processes,
+the futex is placed in a region of shared memory,
+created using (for example)
+.BR mmap (2)
+or
+.BR shmat (2).
+(Thus, the futex word may have different
+virtual addresses in different processes,
+but these addresses all refer to the same location in physical memory.)
+In a multithreaded program, it is sufficient to place the futex word
+in a global variable shared by all threads.
+.PP
+When executing a futex operation that requests to block a thread,
+the kernel will block only if the futex word has the value that the
+calling thread supplied (as one of the arguments of the
+.BR futex ()
+call) as the expected value of the futex word.
+The loading of the futex word's value,
+the comparison of that value with the expected value,
+and the actual blocking will happen atomically and will be totally ordered
+with respect to concurrent operations performed by other threads
+on the same futex word.
+.\" Notes from Darren Hart (Dec 2015):
+.\" Totally ordered with respect futex operations refers to semantics
+.\" of the ACQUIRE/RELEASE operations and how they impact ordering of
+.\" memory reads and writes. The kernel futex operations are protected
+.\" by spinlocks, which ensure that all operations are serialized
+.\" with respect to one another.
+.\"
+.\" This is a lot to attempt to define in this document. Perhaps a
+.\" reference to linux/Documentation/memory-barriers.txt as a footnote
+.\" would be sufficient? Or perhaps for this manual, "serialized" would
+.\" be sufficient, with a footnote regarding "totally ordered" and a
+.\" pointer to the memory-barrier documentation?
+Thus, the futex word is used to connect the synchronization in user space
+with the implementation of blocking by the kernel.
+Analogously to an atomic
+compare-and-exchange operation that potentially changes shared memory,
+blocking via a futex is an atomic compare-and-block operation.
+.\" FIXME(Torvald Riegel):
+.\" Eventually we want to have some text in NOTES to satisfy
+.\" the reference in the following sentence
+.\" See NOTES for a detailed specification of
+.\" the synchronization semantics.
+.PP
+One use of futexes is for implementing locks.
+The state of the lock (i.e., acquired or not acquired)
+can be represented as an atomically accessed flag in shared memory.
+In the uncontended case,
+a thread can access or modify the lock state with atomic instructions,
+for example atomically changing it from not acquired to acquired
+using an atomic compare-and-exchange instruction.
+(Such instructions are performed entirely in user mode,
+and the kernel maintains no information about the lock state.)
+On the other hand, a thread may be unable to acquire a lock because
+it is already acquired by another thread.
+It then may pass the lock's flag as a futex word and the value
+representing the acquired state as the expected value to a
+.BR futex ()
+wait operation.
+This
+.BR futex ()
+operation will block if and only if the lock is still acquired
+(i.e., the value in the futex word still matches the "acquired state").
+When releasing the lock, a thread has to first reset the
+lock state to not acquired and then execute a futex
+operation that wakes threads blocked on the lock flag used as a futex word
+(this can be further optimized to avoid unnecessary wake-ups).
+See
+.BR futex (7)
+for more detail on how to use futexes.
+.PP
+Besides the basic wait and wake-up futex functionality, there are further
+futex operations aimed at supporting more complex use cases.
+.PP
+Note that
+no explicit initialization or destruction is necessary to use futexes;
+the kernel maintains a futex
+(i.e., the kernel-internal implementation artifact)
+only while operations such as
+.BR FUTEX_WAIT ,
+described below, are being performed on a particular futex word.
+.\"
+.SS Arguments
+The
+.I uaddr
+argument points to the futex word.
+On all platforms, futexes are four-byte
+integers that must be aligned on a four-byte boundary.
+The operation to perform on the futex is specified in the
+.I futex_op
+argument;
+.I val
+is a value whose meaning and purpose depend on
+.IR futex_op .
+.PP
+The remaining arguments
+.RI ( timeout ,
+.IR uaddr2 ,
+and
+.IR val3 )
+are required only for certain of the futex operations described below.
+Where one of these arguments is not required, it is ignored.
+.PP
+For several blocking operations, the
+.I timeout
+argument is a pointer to a
+.I timespec
+structure that specifies a timeout for the operation.
+However, notwithstanding the prototype shown above, for some operations,
+the least significant four bytes of this argument are instead
+used as an integer whose meaning is determined by the operation.
+For these operations, the kernel casts the
+.I timeout
+value first to
+.IR "unsigned long" ,
+then to
+.IR uint32_t ,
+and in the remainder of this page, this argument is referred to as
+.I val2
+when interpreted in this fashion.
+.PP
+Where it is required, the
+.I uaddr2
+argument is a pointer to a second futex word that is employed
+by the operation.
+.PP
+The interpretation of the final integer argument,
+.IR val3 ,
+depends on the operation.
+.\"
+.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+.\"
+.SS Futex operations
+The
+.I futex_op
+argument consists of two parts:
+a command that specifies the operation to be performed,
+bitwise ORed with zero or more options that
+modify the behavior of the operation.
+The options that may be included in
+.I futex_op
+are as follows:
+.TP
+.BR FUTEX_PRIVATE_FLAG " (since Linux 2.6.22)"
+.\" commit 34f01cc1f512fa783302982776895c73714ebbc2
+This option bit can be employed with all futex operations.
+It tells the kernel that the futex is process-private and not shared
+with another process (i.e., it is being used for synchronization
+only between threads of the same process).
+This allows the kernel to make some additional performance optimizations.
+.\" I.e., it allows the kernel to choose the fast path for validating
+.\" the user-space address and avoids expensive VMA lookups,
+.\" taking reference counts on file backing store, and so on.
+.IP
+As a convenience,
+.I <linux/futex.h>
+defines a set of constants with the suffix
+.B _PRIVATE
+that are equivalents of all of the operations listed below,
+.\" except the obsolete FUTEX_FD, for which the "private" flag was
+.\" meaningless
+but with the
+.B FUTEX_PRIVATE_FLAG
+ORed into the constant value.
+Thus, there are
+.BR FUTEX_WAIT_PRIVATE ,
+.BR FUTEX_WAKE_PRIVATE ,
+and so on.
+.TP
+.BR FUTEX_CLOCK_REALTIME " (since Linux 2.6.28)"
+.\" commit 1acdac104668a0834cfa267de9946fac7764d486
+This option bit can be employed only with the
+.BR FUTEX_WAIT_BITSET ,
+.BR FUTEX_WAIT_REQUEUE_PI ,
+(since Linux 4.5)
+.\" commit 337f13046ff03717a9e99675284a817527440a49
+.BR FUTEX_WAIT ,
+and
+(since Linux 5.14)
+.\" commit bf22a6976897977b0a3f1aeba6823c959fc4fdae
+.B FUTEX_LOCK_PI2
+operations.
+.IP
+If this option is set, the kernel measures the
+.I timeout
+against the
+.B CLOCK_REALTIME
+clock.
+.IP
+If this option is not set, the kernel measures the
+.I timeout
+against the
+.B CLOCK_MONOTONIC
+clock.
+.PP
+The operation specified in
+.I futex_op
+is one of the following:
+.\"
+.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+.\"
+.TP
+.BR FUTEX_WAIT " (since Linux 2.6.0)"
+.\" Strictly speaking, since some time in Linux 2.5.x
+This operation tests that the value at the
+futex word pointed to by the address
+.I uaddr
+still contains the expected value
+.IR val ,
+and if so, then sleeps waiting for a
+.B FUTEX_WAKE
+operation on the futex word.
+The load of the value of the futex word is an atomic memory
+access (i.e., using atomic machine instructions of the respective
+architecture).
+This load, the comparison with the expected value, and
+starting to sleep are performed atomically
+.\" FIXME: Torvald, I think we may need to add some explanation of
+.\" "totally ordered" here.
+and totally ordered
+with respect to other futex operations on the same futex word.
+If the thread starts to sleep,
+it is considered a waiter on this futex word.
+If the futex value does not match
+.IR val ,
+then the call fails immediately with the error
+.BR EAGAIN .
+.IP
+The purpose of the comparison with the expected value is to prevent lost
+wake-ups.
+If another thread changed the value of the futex word after the
+calling thread decided to block based on the prior value,
+and if the other thread executed a
+.B FUTEX_WAKE
+operation (or similar wake-up) after the value change and before this
+.B FUTEX_WAIT
+operation, then the calling thread will observe the
+value change and will not start to sleep.
+.IP
+If the
+.I timeout
+is not NULL, the structure it points to specifies a
+timeout for the wait.
+(This interval will be rounded up to the system clock granularity,
+and is guaranteed not to expire early.)
+The timeout is by default measured according to the
+.B CLOCK_MONOTONIC
+clock, but, since Linux 4.5, the
+.B CLOCK_REALTIME
+clock can be selected by specifying
+.B FUTEX_CLOCK_REALTIME
+in
+.IR futex_op .
+If
+.I timeout
+is NULL, the call blocks indefinitely.
+.IP
+.IR Note :
+for
+.BR FUTEX_WAIT ,
+.I timeout
+is interpreted as a
+.I relative
+value.
+This differs from other futex operations, where
+.I timeout
+is interpreted as an absolute value.
+To obtain the equivalent of
+.B FUTEX_WAIT
+with an absolute timeout, employ
+.B FUTEX_WAIT_BITSET
+with
+.I val3
+specified as
+.BR FUTEX_BITSET_MATCH_ANY .
+.IP
+The arguments
+.I uaddr2
+and
+.I val3
+are ignored.
+.\" FIXME . (Torvald) I think we should remove this. Or maybe adapt to a
+.\" different example.
+.\"
+.\" For
+.\" .BR futex (7),
+.\" this call is executed if decrementing the count gave a negative value
+.\" (indicating contention),
+.\" and will sleep until another process or thread releases
+.\" the futex and executes the
+.\" .B FUTEX_WAKE
+.\" operation.
+.\"
+.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+.\"
+.TP
+.BR FUTEX_WAKE " (since Linux 2.6.0)"
+.\" Strictly speaking, since Linux 2.5.x
+This operation wakes at most
+.I val
+of the waiters that are waiting (e.g., inside
+.BR FUTEX_WAIT )
+on the futex word at the address
+.IR uaddr .
+Most commonly,
+.I val
+is specified as either 1 (wake up a single waiter) or
+.B INT_MAX
+(wake up all waiters).
+No guarantee is provided about which waiters are awoken
+(e.g., a waiter with a higher scheduling priority is not guaranteed
+to be awoken in preference to a waiter with a lower priority).
+.IP
+The arguments
+.IR timeout ,
+.IR uaddr2 ,
+and
+.I val3
+are ignored.
+.\" FIXME . (Torvald) I think we should remove this. Or maybe adapt to
+.\" a different example.
+.\"
+.\" For
+.\" .BR futex (7),
+.\" this is executed if incrementing the count showed that
+.\" there were waiters,
+.\" once the futex value has been set to 1
+.\" (indicating that it is available).
+.\"
+.\" How does "incrementing the count show that there were waiters"?
+.\"
+.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+.\"
+.TP
+.BR FUTEX_FD " (from Linux 2.6.0 up to and including Linux 2.6.25)"
+.\" Strictly speaking, from Linux 2.5.x to Linux 2.6.25
+This operation creates a file descriptor that is associated with
+the futex at
+.IR uaddr .
+The caller must close the returned file descriptor after use.
+When another process or thread performs a
+.B FUTEX_WAKE
+on the futex word, the file descriptor is indicated as being readable by
+.BR select (2),
+.BR poll (2),
+and
+.BR epoll (7).
+.IP
+The file descriptor can be used to obtain asynchronous notifications: if
+.I val
+is nonzero, then, when another process or thread executes a
+.BR FUTEX_WAKE ,
+the caller will receive the signal number that was passed in
+.IR val .
+.IP
+The arguments
+.IR timeout ,
+.IR uaddr2 ,
+and
+.I val3
+are ignored.
+.IP
+Because it was inherently racy,
+.B FUTEX_FD
+has been removed
+.\" commit 82af7aca56c67061420d618cc5a30f0fd4106b80
+from Linux 2.6.26 onward.
+.\"
+.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+.\"
+.TP
+.BR FUTEX_REQUEUE " (since Linux 2.6.0)"
+This operation performs the same task as
+.B FUTEX_CMP_REQUEUE
+(see below), except that no check is made using the value in
+.IR val3 .
+(The argument
+.I val3
+is ignored.)
+.\"
+.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+.\"
+.TP
+.BR FUTEX_CMP_REQUEUE " (since Linux 2.6.7)"
+This operation first checks whether the location
+.I uaddr
+still contains the value
+.IR val3 .
+If not, the operation fails with the error
+.BR EAGAIN .
+Otherwise, the operation wakes up a maximum of
+.I val
+waiters that are waiting on the futex at
+.IR uaddr .
+If there are more than
+.I val
+waiters, then the remaining waiters are removed
+from the wait queue of the source futex at
+.I uaddr
+and added to the wait queue of the target futex at
+.IR uaddr2 .
+The
+.I val2
+argument specifies an upper limit on the number of waiters
+that are requeued to the futex at
+.IR uaddr2 .
+.IP
+.\" FIXME(Torvald) Is the following correct? Or is just the decision
+.\" which threads to wake or requeue part of the atomic operation?
+The load from
+.I uaddr
+is an atomic memory access (i.e., using atomic machine instructions of
+the respective architecture).
+This load, the comparison with
+.IR val3 ,
+and the requeueing of any waiters are performed atomically and totally
+ordered with respect to other operations on the same futex word.
+.\" Notes from a f2f conversation with Thomas Gleixner (Aug 2015): ###
+.\" The operation is serialized with respect to operations on both
+.\" source and target futex. No other waiter can enqueue itself
+.\" for waiting and no other waiter can dequeue itself because of
+.\" a timeout or signal.
+.IP
+Typical values to specify for
+.I val
+are 0 or 1.
+(Specifying
+.B INT_MAX
+is not useful, because it would make the
+.B FUTEX_CMP_REQUEUE
+operation equivalent to
+.BR FUTEX_WAKE .)
+The limit value specified via
+.I val2
+is typically either 1 or
+.BR INT_MAX .
+(Specifying the argument as 0 is not useful, because then no waiters
+would be requeued, which would make the
+.B FUTEX_CMP_REQUEUE
+operation equivalent to
+.BR FUTEX_WAKE .)
+.IP
+The
+.B FUTEX_CMP_REQUEUE
+operation was added as a replacement for the earlier
+.BR FUTEX_REQUEUE .
+The difference is that the check of the value at
+.I uaddr
+can be used to ensure that requeueing happens only under certain
+conditions, which allows race conditions to be avoided in certain use cases.
+.\" But, as Rich Felker points out, there remain valid use cases for
+.\" FUTEX_REQUEUE, for example, when the calling thread is requeuing
+.\" the target(s) to a lock that the calling thread owns
+.\" From: Rich Felker <dalias@libc.org>
+.\" Date: Wed, 29 Oct 2014 22:43:17 -0400
+.\" To: Darren Hart <dvhart@infradead.org>
+.\" CC: libc-alpha@sourceware.org, ...
+.\" Subject: Re: Add futex wrapper to glibc?
+.IP
+Both
+.B FUTEX_REQUEUE
+and
+.B FUTEX_CMP_REQUEUE
+can be used to avoid "thundering herd" wake-ups that could occur when using
+.B FUTEX_WAKE
+in cases where all of the waiters that are woken need to acquire
+another futex.
+Consider the following scenario,
+where multiple waiter threads are waiting on B,
+a wait queue implemented using a futex:
+.IP
+.in +4n
+.EX
+lock(A)
+while (!check_value(V)) {
+ unlock(A);
+ block_on(B);
+ lock(A);
+};
+unlock(A);
+.EE
+.in
+.IP
+If a waker thread used
+.BR FUTEX_WAKE ,
+then all waiters waiting on B would be woken up,
+and they would all try to acquire lock A.
+However, waking all of the threads in this manner would be pointless because
+all except one of the threads would immediately block on lock A again.
+By contrast, a requeue operation wakes just one waiter and moves
+the other waiters to lock A,
+and when the woken waiter unlocks A then the next waiter can proceed.
+.\"
+.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+.\"
+.TP
+.BR FUTEX_WAKE_OP " (since Linux 2.6.14)"
+.\" commit 4732efbeb997189d9f9b04708dc26bf8613ed721
+.\" Author: Jakub Jelinek <jakub@redhat.com>
+.\" Date: Tue Sep 6 15:16:25 2005 -0700
+.\" FIXME. (Torvald) The glibc condvar implementation is currently being
+.\" revised (e.g., to not use an internal lock anymore).
+.\" It is probably more future-proof to remove this paragraph.
+.\" [Torvald, do you have an update here?]
+This operation was added to support some user-space use cases
+where more than one futex must be handled at the same time.
+The most notable example is the implementation of
+.BR pthread_cond_signal (3),
+which requires operations on two futexes,
+the one used to implement the mutex and the one used in the implementation
+of the wait queue associated with the condition variable.
+.B FUTEX_WAKE_OP
+allows such cases to be implemented without leading to
+high rates of contention and context switching.
+.IP
+The
+.B FUTEX_WAKE_OP
+operation is equivalent to executing the following code atomically
+and totally ordered with respect to other futex operations on
+either of the two supplied futex words:
+.IP
+.in +4n
+.EX
+uint32_t oldval = *(uint32_t *) uaddr2;
+*(uint32_t *) uaddr2 = oldval \fIop\fP \fIoparg\fP;
+futex(uaddr, FUTEX_WAKE, val, 0, 0, 0);
+if (oldval \fIcmp\fP \fIcmparg\fP)
+ futex(uaddr2, FUTEX_WAKE, val2, 0, 0, 0);
+.EE
+.in
+.IP
+In other words,
+.B FUTEX_WAKE_OP
+does the following:
+.RS
+.IP \[bu] 3
+saves the original value of the futex word at
+.I uaddr2
+and performs an operation to modify the value of the futex at
+.IR uaddr2 ;
+this is an atomic read-modify-write memory access (i.e., using atomic
+machine instructions of the respective architecture)
+.IP \[bu]
+wakes up a maximum of
+.I val
+waiters on the futex for the futex word at
+.IR uaddr ;
+and
+.IP \[bu]
+dependent on the results of a test of the original value of the
+futex word at
+.IR uaddr2 ,
+wakes up a maximum of
+.I val2
+waiters on the futex for the futex word at
+.IR uaddr2 .
+.RE
+.IP
+The operation and comparison that are to be performed are encoded
+in the bits of the argument
+.IR val3 .
+Pictorially, the encoding is:
+.IP
+.in +4n
+.EX
++---+---+-----------+-----------+
+|op |cmp| oparg | cmparg |
++---+---+-----------+-----------+
+ 4 4 12 12 <== # of bits
+.EE
+.in
+.IP
+Expressed in code, the encoding is:
+.IP
+.in +4n
+.EX
+#define FUTEX_OP(op, oparg, cmp, cmparg) \e
+ (((op & 0xf) << 28) | \e
+ ((cmp & 0xf) << 24) | \e
+ ((oparg & 0xfff) << 12) | \e
+ (cmparg & 0xfff))
+.EE
+.in
+.IP
+In the above,
+.I op
+and
+.I cmp
+are each one of the codes listed below.
+The
+.I oparg
+and
+.I cmparg
+components are literal numeric values, except as noted below.
+.IP
+The
+.I op
+component has one of the following values:
+.IP
+.in +4n
+.EX
+FUTEX_OP_SET 0 /* uaddr2 = oparg; */
+FUTEX_OP_ADD 1 /* uaddr2 += oparg; */
+FUTEX_OP_OR 2 /* uaddr2 |= oparg; */
+FUTEX_OP_ANDN 3 /* uaddr2 &= \[ti]oparg; */
+FUTEX_OP_XOR 4 /* uaddr2 \[ha]= oparg; */
+.EE
+.in
+.IP
+In addition, bitwise ORing the following value into
+.I op
+causes
+.I (1\~<<\~oparg)
+to be used as the operand:
+.IP
+.in +4n
+.EX
+FUTEX_OP_ARG_SHIFT 8 /* Use (1 << oparg) as operand */
+.EE
+.in
+.IP
+The
+.I cmp
+field is one of the following:
+.IP
+.in +4n
+.EX
+FUTEX_OP_CMP_EQ 0 /* if (oldval == cmparg) wake */
+FUTEX_OP_CMP_NE 1 /* if (oldval != cmparg) wake */
+FUTEX_OP_CMP_LT 2 /* if (oldval < cmparg) wake */
+FUTEX_OP_CMP_LE 3 /* if (oldval <= cmparg) wake */
+FUTEX_OP_CMP_GT 4 /* if (oldval > cmparg) wake */
+FUTEX_OP_CMP_GE 5 /* if (oldval >= cmparg) wake */
+.EE
+.in
+.IP
+The return value of
+.B FUTEX_WAKE_OP
+is the sum of the number of waiters woken on the futex
+.I uaddr
+plus the number of waiters woken on the futex
+.IR uaddr2 .
+.\"
+.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+.\"
+.TP
+.BR FUTEX_WAIT_BITSET " (since Linux 2.6.25)"
+.\" commit cd689985cf49f6ff5c8eddc48d98b9d581d9475d
+This operation is like
+.B FUTEX_WAIT
+except that
+.I val3
+is used to provide a 32-bit bit mask to the kernel.
+This bit mask, in which at least one bit must be set,
+is stored in the kernel-internal state of the waiter.
+See the description of
+.B FUTEX_WAKE_BITSET
+for further details.
+.IP
+If
+.I timeout
+is not NULL, the structure it points to specifies
+an absolute timeout for the wait operation.
+If
+.I timeout
+is NULL, the operation can block indefinitely.
+.IP
+The
+.I uaddr2
+argument is ignored.
+.\"
+.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+.\"
+.TP
+.BR FUTEX_WAKE_BITSET " (since Linux 2.6.25)"
+.\" commit cd689985cf49f6ff5c8eddc48d98b9d581d9475d
+This operation is the same as
+.B FUTEX_WAKE
+except that the
+.I val3
+argument is used to provide a 32-bit bit mask to the kernel.
+This bit mask, in which at least one bit must be set,
+is used to select which waiters should be woken up.
+The selection is done by a bitwise AND of the "wake" bit mask
+(i.e., the value in
+.IR val3 )
+and the bit mask which is stored in the kernel-internal
+state of the waiter (the "wait" bit mask that is set using
+.BR FUTEX_WAIT_BITSET ).
+All of the waiters for which the result of the AND is nonzero are woken up;
+the remaining waiters are left sleeping.
+.IP
+The effect of
+.B FUTEX_WAIT_BITSET
+and
+.B FUTEX_WAKE_BITSET
+is to allow selective wake-ups among multiple waiters that are blocked
+on the same futex.
+However, note that, depending on the use case,
+employing this bit-mask multiplexing feature on a
+futex can be less efficient than simply using multiple futexes,
+because employing bit-mask multiplexing requires the kernel
+to check all waiters on a futex,
+including those that are not interested in being woken up
+(i.e., they do not have the relevant bit set in their "wait" bit mask).
+.\" According to http://locklessinc.com/articles/futex_cheat_sheet/:
+.\"
+.\" "The original reason for the addition of these extensions
+.\" was to improve the performance of pthread read-write locks
+.\" in glibc. However, the pthreads library no longer uses the
+.\" same locking algorithm, and these extensions are not used
+.\" without the bitset parameter being all ones.
+.\"
+.\" The page goes on to note that the FUTEX_WAIT_BITSET operation
+.\" is nevertheless used (with a bit mask of all ones) in order to
+.\" obtain the absolute timeout functionality that is useful
+.\" for efficiently implementing Pthreads APIs (which use absolute
+.\" timeouts); FUTEX_WAIT provides only relative timeouts.
+.IP
+The constant
+.BR FUTEX_BITSET_MATCH_ANY ,
+which corresponds to all 32 bits set in the bit mask, can be used as the
+.I val3
+argument for
+.B FUTEX_WAIT_BITSET
+and
+.BR FUTEX_WAKE_BITSET .
+Other than differences in the handling of the
+.I timeout
+argument, the
+.B FUTEX_WAIT
+operation is equivalent to
+.B FUTEX_WAIT_BITSET
+with
+.I val3
+specified as
+.BR FUTEX_BITSET_MATCH_ANY ;
+that is, allow a wake-up by any waker.
+The
+.B FUTEX_WAKE
+operation is equivalent to
+.B FUTEX_WAKE_BITSET
+with
+.I val3
+specified as
+.BR FUTEX_BITSET_MATCH_ANY ;
+that is, wake up any waiter(s).
+.IP
+The
+.I uaddr2
+and
+.I timeout
+arguments are ignored.
+.\"
+.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+.\"
+.SS Priority-inheritance futexes
+Linux supports priority-inheritance (PI) futexes in order to handle
+priority-inversion problems that can be encountered with
+normal futex locks.
+Priority inversion is the problem that occurs when a high-priority
+task is blocked waiting to acquire a lock held by a low-priority task,
+while tasks at an intermediate priority continuously preempt
+the low-priority task from the CPU.
+Consequently, the low-priority task makes no progress toward
+releasing the lock, and the high-priority task remains blocked.
+.PP
+Priority inheritance is a mechanism for dealing with
+the priority-inversion problem.
+With this mechanism, when a high-priority task becomes blocked
+by a lock held by a low-priority task,
+the priority of the low-priority task is temporarily raised
+to that of the high-priority task,
+so that it is not preempted by any intermediate level tasks,
+and can thus make progress toward releasing the lock.
+To be effective, priority inheritance must be transitive,
+meaning that if a high-priority task blocks on a lock
+held by a lower-priority task that is itself blocked by a lock
+held by another intermediate-priority task
+(and so on, for chains of arbitrary length),
+then both of those tasks
+(or more generally, all of the tasks in a lock chain)
+have their priorities raised to be the same as the high-priority task.
+.PP
+From a user-space perspective,
+what makes a futex PI-aware is a policy agreement (described below)
+between user space and the kernel about the value of the futex word,
+coupled with the use of the PI-futex operations described below.
+(Unlike the other futex operations described above,
+the PI-futex operations are designed
+for the implementation of very specific IPC mechanisms.)
+.\"
+.\" Quoting Darren Hart:
+.\" These opcodes paired with the PI futex value policy (described below)
+.\" defines a "futex" as PI aware. These were created very specifically
+.\" in support of PI pthread_mutexes, so it makes a lot more sense to
+.\" talk about a PI aware pthread_mutex, than a PI aware futex, since
+.\" there is a lot of policy and scaffolding that has to be built up
+.\" around it to use it properly (this is what a PI pthread_mutex is).
+.PP
+.\" mtk: The following text is drawn from the Hart/Guniguntala paper
+.\" (listed in SEE ALSO), but I have reworded some pieces
+.\" significantly.
+.\"
+The PI-futex operations described below differ from the other
+futex operations in that they impose policy on the use of the value of the
+futex word:
+.IP \[bu] 3
+If the lock is not acquired, the futex word's value shall be 0.
+.IP \[bu]
+If the lock is acquired, the futex word's value shall
+be the thread ID (TID;
+see
+.BR gettid (2))
+of the owning thread.
+.IP \[bu]
+If the lock is owned and there are threads contending for the lock,
+then the
+.B FUTEX_WAITERS
+bit shall be set in the futex word's value; in other words, this value is:
+.IP
+.in +4n
+.EX
+FUTEX_WAITERS | TID
+.EE
+.in
+.IP
+(Note that it is invalid for a PI futex word to have no owner and
+.B FUTEX_WAITERS
+set.)
+.PP
+With this policy in place,
+a user-space application can acquire an unacquired
+lock or release a lock using atomic instructions executed in user mode
+(e.g., a compare-and-swap operation such as
+.I cmpxchg
+on the x86 architecture).
+Acquiring a lock simply consists of using compare-and-swap to atomically
+set the futex word's value to the caller's TID if its previous value was 0.
+Releasing a lock requires using compare-and-swap to set the futex word's
+value to 0 if the previous value was the expected TID.
+.PP
+If a futex is already acquired (i.e., has a nonzero value),
+waiters must employ the
+.B FUTEX_LOCK_PI
+or
+.B FUTEX_LOCK_PI2
+operations to acquire the lock.
+If other threads are waiting for the lock, then the
+.B FUTEX_WAITERS
+bit is set in the futex value;
+in this case, the lock owner must employ the
+.B FUTEX_UNLOCK_PI
+operation to release the lock.
+.PP
+In the cases where callers are forced into the kernel
+(i.e., required to perform a
+.BR futex ()
+call),
+they then deal directly with a so-called RT-mutex,
+a kernel locking mechanism which implements the required
+priority-inheritance semantics.
+After the RT-mutex is acquired, the futex value is updated accordingly,
+before the calling thread returns to user space.
+.PP
+It is important to note
+.\" tglx (July 2015):
+.\" If there are multiple waiters on a pi futex then a wake pi operation
+.\" will wake the first waiter and hand over the lock to this waiter. This
+.\" includes handing over the rtmutex which represents the futex in the
+.\" kernel. The strict requirement is that the futex owner and the rtmutex
+.\" owner must be the same, except for the update period which is
+.\" serialized by the futex internal locking. That means the kernel must
+.\" update the user-space value prior to returning to user space
+that the kernel will update the futex word's value prior
+to returning to user space.
+(This prevents the possibility of the futex word's value ending
+up in an invalid state, such as having an owner but the value being 0,
+or having waiters but not having the
+.B FUTEX_WAITERS
+bit set.)
+.PP
+If a futex has an associated RT-mutex in the kernel
+(i.e., there are blocked waiters)
+and the owner of the futex/RT-mutex dies unexpectedly,
+then the kernel cleans up the RT-mutex and hands it over to the next waiter.
+This in turn requires that the user-space value is updated accordingly.
+To indicate that this is required, the kernel sets the
+.B FUTEX_OWNER_DIED
+bit in the futex word along with the thread ID of the new owner.
+User space can detect this situation via the presence of the
+.B FUTEX_OWNER_DIED
+bit and is then responsible for cleaning up the stale state left over by
+the dead owner.
+.\" tglx (July 2015):
+.\" The FUTEX_OWNER_DIED bit can also be set on uncontended futexes, where
+.\" the kernel has no state associated. This happens via the robust futex
+.\" mechanism. In that case the futex value will be set to
+.\" FUTEX_OWNER_DIED. The robust futex mechanism is also available for non
+.\" PI futexes.
+.PP
+PI futexes are operated on by specifying one of the values listed below in
+.IR futex_op .
+Note that the PI futex operations must be used as paired operations
+and are subject to some additional requirements:
+.IP \[bu] 3
+.BR FUTEX_LOCK_PI ,
+.BR FUTEX_LOCK_PI2 ,
+and
+.B FUTEX_TRYLOCK_PI
+pair with
+.BR FUTEX_UNLOCK_PI .
+.B FUTEX_UNLOCK_PI
+must be called only on a futex owned by the calling thread,
+as defined by the value policy, otherwise the error
+.B EPERM
+results.
+.IP \[bu]
+.B FUTEX_WAIT_REQUEUE_PI
+pairs with
+.BR FUTEX_CMP_REQUEUE_PI .
+This must be performed from a non-PI futex to a distinct PI futex
+(or the error
+.B EINVAL
+results).
+Additionally,
+.I val
+(the number of waiters to be woken) must be 1
+(or the error
+.B EINVAL
+results).
+.PP
+The PI futex operations are as follows:
+.\"
+.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+.\"
+.TP
+.BR FUTEX_LOCK_PI " (since Linux 2.6.18)"
+.\" commit c87e2837be82df479a6bae9f155c43516d2feebc
+This operation is used after an attempt to acquire
+the lock via an atomic user-mode instruction failed
+because the futex word has a nonzero value\[em]specifically,
+because it contained the (PID-namespace-specific) TID of the lock owner.
+.IP
+The operation checks the value of the futex word at the address
+.IR uaddr .
+If the value is 0, then the kernel tries to atomically set
+the futex value to the caller's TID.
+If the futex word's value is nonzero,
+the kernel atomically sets the
+.B FUTEX_WAITERS
+bit, which signals the futex owner that it cannot unlock the futex in
+user space atomically by setting the futex value to 0.
+.\" tglx (July 2015):
+.\" The operation here is similar to the FUTEX_WAIT logic. When the user
+.\" space atomic acquire does not succeed because the futex value was non
+.\" zero, then the waiter goes into the kernel, takes the kernel internal
+.\" lock and retries the acquisition under the lock. If the acquisition
+.\" does not succeed either, then it sets the FUTEX_WAITERS bit, to signal
+.\" the lock owner that it needs to go into the kernel. Here is the pseudo
+.\" code:
+.\"
+.\" lock(kernel_lock);
+.\" retry:
+.\"
+.\" /*
+.\" * Owner might have unlocked in user space before we
+.\" * were able to set the waiter bit.
+.\" */
+.\" if (atomic_acquire(futex) == SUCCESS) {
+.\" unlock(kernel_lock());
+.\" return 0;
+.\" }
+.\"
+.\" /*
+.\" * Owner might have unlocked after the above atomic_acquire()
+.\" * attempt.
+.\" */
+.\" if (atomic_set_waiters_bit(futex) != SUCCESS)
+.\" goto retry;
+.\"
+.\" queue_waiter();
+.\" unlock(kernel_lock);
+.\" block();
+.\"
+After that, the kernel:
+.RS
+.IP (1) 5
+Tries to find the thread which is associated with the owner TID.
+.IP (2)
+Creates or reuses kernel state on behalf of the owner.
+(If this is the first waiter, there is no kernel state for this
+futex, so kernel state is created by locking the RT-mutex
+and the futex owner is made the owner of the RT-mutex.
+If there are existing waiters, then the existing state is reused.)
+.IP (3)
+Attaches the waiter to the futex
+(i.e., the waiter is enqueued on the RT-mutex waiter list).
+.RE
+.IP
+If more than one waiter exists,
+the enqueueing of the waiter is in descending priority order.
+(For information on priority ordering, see the discussion of the
+.BR SCHED_DEADLINE ,
+.BR SCHED_FIFO ,
+and
+.B SCHED_RR
+scheduling policies in
+.BR sched (7).)
+The owner inherits either the waiter's CPU bandwidth
+(if the waiter is scheduled under the
+.B SCHED_DEADLINE
+policy) or the waiter's priority (if the waiter is scheduled under the
+.B SCHED_RR
+or
+.B SCHED_FIFO
+policy).
+.\" August 2015:
+.\" mtk: If the realm is restricted purely to SCHED_OTHER (SCHED_NORMAL)
+.\" processes, does the nice value come into play also?
+.\"
+.\" tglx: No. SCHED_OTHER/NORMAL tasks are handled in FIFO order
+This inheritance follows the lock chain in the case of nested locking
+.\" (i.e., task 1 blocks on lock A, held by task 2,
+.\" while task 2 blocks on lock B, held by task 3)
+and performs deadlock detection.
+.IP
+The
+.I timeout
+argument provides a timeout for the lock attempt.
+If
+.I timeout
+is not NULL, the structure it points to specifies
+an absolute timeout, measured against the
+.B CLOCK_REALTIME
+clock.
+.\" 2016-07-07 response from Thomas Gleixner on LKML:
+.\" From: Thomas Gleixner <tglx@linutronix.de>
+.\" Date: 6 July 2016 at 20:57
+.\" Subject: Re: futex: Allow FUTEX_CLOCK_REALTIME with FUTEX_WAIT op
+.\"
+.\" On Thu, 23 Jun 2016, Michael Kerrisk (man-pages) wrote:
+.\" > On 06/23/2016 08:28 PM, Darren Hart wrote:
+.\" > > And as a follow-on, what is the reason for FUTEX_LOCK_PI only using
+.\" > > CLOCK_REALTIME? It seems reasonable to me that a user may want to wait a
+.\" > > specific amount of time, regardless of wall time.
+.\" >
+.\" > Yes, that's another weird inconsistency.
+.\"
+.\" The reason is that pthread_mutex_timedlock() uses absolute timeouts based on
+.\" CLOCK_REALTIME. glibc folks asked to make that the default behaviour back
+.\" then when we added LOCK_PI.
+If
+.I timeout
+is NULL, the operation will block indefinitely.
+.IP
+The
+.IR uaddr2 ,
+.IR val ,
+and
+.I val3
+arguments are ignored.
+.\"
+.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+.\"
+.TP
+.BR FUTEX_LOCK_PI2 " (since Linux 5.14)"
+.\" commit bf22a6976897977b0a3f1aeba6823c959fc4fdae
+This operation is the same as
+.BR FUTEX_LOCK_PI ,
+except that the clock against which
+.I timeout
+is measured is selectable.
+By default, the (absolute) timeout specified in
+.I timeout
+is measured against the
+.B CLOCK_MONOTONIC
+clock, but if the
+.B FUTEX_CLOCK_REALTIME
+flag is specified in
+.IR futex_op ,
+then the timeout is measured against the
+.B CLOCK_REALTIME
+clock.
+.\"
+.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+.\"
+.TP
+.BR FUTEX_TRYLOCK_PI " (since Linux 2.6.18)"
+.\" commit c87e2837be82df479a6bae9f155c43516d2feebc
+This operation tries to acquire the lock at
+.IR uaddr .
+It is invoked when a user-space atomic acquire did not
+succeed because the futex word was not 0.
+.IP
+Because the kernel has access to more state information than user space,
+acquisition of the lock might succeed if performed by the
+kernel in cases where the futex word
+(i.e., the state information accessible to user space) contains stale state
+.RB ( FUTEX_WAITERS
+and/or
+.BR FUTEX_OWNER_DIED ).
+This can happen when the owner of the futex died.
+User space cannot handle this condition in a race-free manner,
+but the kernel can fix this up and acquire the futex.
+.\" Paraphrasing a f2f conversation with Thomas Gleixner about the
+.\" above point (Aug 2015): ###
+.\" There is a rare possibility of a race condition involving an
+.\" uncontended futex with no owner, but with waiters. The
+.\" kernel-user-space contract is that if a futex is nonzero, you must
+.\" go into kernel. The futex was owned by a task, and that task dies
+.\" but there are no waiters, so the futex value is non zero.
+.\" Therefore, the next locker has to go into the kernel,
+.\" so that the kernel has a chance to clean up. (CMPXCHG on zero
+.\" in user space would fail, so kernel has to clean up.)
+.\" Darren Hart (Oct 2015):
+.\" The trylock in the kernel has more state, so it can independently
+.\" verify the flags that user space must trust implicitly.
+.IP
+The
+.IR uaddr2 ,
+.IR val ,
+.IR timeout ,
+and
+.I val3
+arguments are ignored.
+.\"
+.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+.\"
+.TP
+.BR FUTEX_UNLOCK_PI " (since Linux 2.6.18)"
+.\" commit c87e2837be82df479a6bae9f155c43516d2feebc
+This operation wakes the top priority waiter that is waiting in
+.B FUTEX_LOCK_PI
+or
+.B FUTEX_LOCK_PI2
+on the futex address provided by the
+.I uaddr
+argument.
+.IP
+This is called when the user-space value at
+.I uaddr
+cannot be changed atomically from a TID (of the owner) to 0.
+.IP
+The
+.IR uaddr2 ,
+.IR val ,
+.IR timeout ,
+and
+.I val3
+arguments are ignored.
+.\"
+.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+.\"
+.TP
+.BR FUTEX_CMP_REQUEUE_PI " (since Linux 2.6.31)"
+.\" commit 52400ba946759af28442dee6265c5c0180ac7122
+This operation is a PI-aware variant of
+.BR FUTEX_CMP_REQUEUE .
+It requeues waiters that are blocked via
+.B FUTEX_WAIT_REQUEUE_PI
+on
+.I uaddr
+from a non-PI source futex
+.RI ( uaddr )
+to a PI target futex
+.RI ( uaddr2 ).
+.IP
+As with
+.BR FUTEX_CMP_REQUEUE ,
+this operation wakes up a maximum of
+.I val
+waiters that are waiting on the futex at
+.IR uaddr .
+However, for
+.BR FUTEX_CMP_REQUEUE_PI ,
+.I val
+is required to be 1
+(since the main point is to avoid a thundering herd).
+The remaining waiters are removed from the wait queue of the source futex at
+.I uaddr
+and added to the wait queue of the target futex at
+.IR uaddr2 .
+.IP
+The
+.I val2
+.\" val2 is the cap on the number of requeued waiters.
+.\" In the glibc pthread_cond_broadcast() implementation, this argument
+.\" is specified as INT_MAX, and for pthread_cond_signal() it is 0.
+and
+.I val3
+arguments serve the same purposes as for
+.BR FUTEX_CMP_REQUEUE .
+.\"
+.\" The page at http://locklessinc.com/articles/futex_cheat_sheet/
+.\" notes that "priority-inheritance Futex to priority-inheritance
+.\" Futex requeues are currently unsupported". However, probably
+.\" the page does not need to say anything about this, since
+.\" Thomas Gleixner commented (July 2015): "they never will be
+.\" supported because they make no sense at all"
+.\"
+.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+.\"
+.TP
+.BR FUTEX_WAIT_REQUEUE_PI " (since Linux 2.6.31)"
+.\" commit 52400ba946759af28442dee6265c5c0180ac7122
+.\"
+Wait on a non-PI futex at
+.I uaddr
+and potentially be requeued (via a
+.B FUTEX_CMP_REQUEUE_PI
+operation in another task) onto a PI futex at
+.IR uaddr2 .
+The wait operation on
+.I uaddr
+is the same as for
+.BR FUTEX_WAIT .
+.IP
+The waiter can be removed from the wait on
+.I uaddr
+without requeueing on
+.I uaddr2
+via a
+.B FUTEX_WAKE
+operation in another task.
+In this case, the
+.B FUTEX_WAIT_REQUEUE_PI
+operation fails with the error
+.BR EAGAIN .
+.IP
+If
+.I timeout
+is not NULL, the structure it points to specifies
+an absolute timeout for the wait operation.
+If
+.I timeout
+is NULL, the operation can block indefinitely.
+.IP
+The
+.I val3
+argument is ignored.
+.IP
+The
+.B FUTEX_WAIT_REQUEUE_PI
+and
+.B FUTEX_CMP_REQUEUE_PI
+operations were added to support a fairly specific use case:
+support for priority-inheritance-aware POSIX threads condition variables.
+The idea is that these operations should always be paired,
+in order to ensure that user space and the kernel remain in sync.
+Thus, in the
+.B FUTEX_WAIT_REQUEUE_PI
+operation, the user-space application pre-specifies the target
+of the requeue that takes place in the
+.B FUTEX_CMP_REQUEUE_PI
+operation.
+.\"
+.\" Darren Hart notes that a patch to allow glibc to fully support
+.\" PI-aware pthreads condition variables has not yet been accepted into
+.\" glibc. The story is complex, and can be found at
+.\" https://sourceware.org/bugzilla/show_bug.cgi?id=11588
+.\" Darren notes that in the meantime, the patch is shipped with various
+.\" PREEMPT_RT-enabled Linux systems.
+.\"
+.\" Related to the preceding, Darren proposed that somewhere, man-pages
+.\" should document the following point:
+.\"
+.\" While the Linux kernel, since Linux 2.6.31, supports requeueing of
+.\" priority-inheritance (PI) aware mutexes via the
+.\" FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI futex operations,
+.\" the glibc implementation does not yet take full advantage of this.
+.\" Specifically, the condvar internal data lock remains a non-PI aware
+.\" mutex, regardless of the type of the pthread_mutex associated with
+.\" the condvar. This can lead to an unbounded priority inversion on
+.\" the internal data lock even when associating a PI aware
+.\" pthread_mutex with a condvar during a pthread_cond*_wait
+.\" operation. For this reason, it is not recommended to rely on
+.\" priority inheritance when using pthread condition variables.
+.\"
+.\" The problem is that the obvious location for this text is
+.\" the pthread_cond*wait(3) man page. However, such a man page
+.\" does not currently exist.
+.\"
+.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+.\"
+.SH RETURN VALUE
+In the event of an error (and assuming that
+.BR futex ()
+was invoked via
+.BR syscall (2)),
+all operations return \-1 and set
+.I errno
+to indicate the error.
+.PP
+The return value on success depends on the operation,
+as described in the following list:
+.TP
+.B FUTEX_WAIT
+Returns 0 if the caller was woken up.
+Note that a wake-up can also be caused by common futex usage patterns
+in unrelated code that happened to have previously used the futex word's
+memory location (e.g., typical futex-based implementations of
+Pthreads mutexes can cause this under some conditions).
+Therefore, callers should always conservatively assume that a return
+value of 0 can mean a spurious wake-up, and use the futex word's value
+(i.e., the user-space synchronization scheme)
+to decide whether to continue to block or not.
+.TP
+.B FUTEX_WAKE
+Returns the number of waiters that were woken up.
+.TP
+.B FUTEX_FD
+Returns the new file descriptor associated with the futex.
+.TP
+.B FUTEX_REQUEUE
+Returns the number of waiters that were woken up.
+.TP
+.B FUTEX_CMP_REQUEUE
+Returns the total number of waiters that were woken up or
+requeued to the futex for the futex word at
+.IR uaddr2 .
+If this value is greater than
+.IR val ,
+then the difference is the number of waiters requeued to the futex for the
+futex word at
+.IR uaddr2 .
+.TP
+.B FUTEX_WAKE_OP
+Returns the total number of waiters that were woken up.
+This is the sum of the woken waiters on the two futexes for
+the futex words at
+.I uaddr
+and
+.IR uaddr2 .
+.TP
+.B FUTEX_WAIT_BITSET
+Returns 0 if the caller was woken up.
+See
+.B FUTEX_WAIT
+for how to interpret this correctly in practice.
+.TP
+.B FUTEX_WAKE_BITSET
+Returns the number of waiters that were woken up.
+.TP
+.B FUTEX_LOCK_PI
+Returns 0 if the futex was successfully locked.
+.TP
+.B FUTEX_LOCK_PI2
+Returns 0 if the futex was successfully locked.
+.TP
+.B FUTEX_TRYLOCK_PI
+Returns 0 if the futex was successfully locked.
+.TP
+.B FUTEX_UNLOCK_PI
+Returns 0 if the futex was successfully unlocked.
+.TP
+.B FUTEX_CMP_REQUEUE_PI
+Returns the total number of waiters that were woken up or
+requeued to the futex for the futex word at
+.IR uaddr2 .
+If this value is greater than
+.IR val ,
+then the difference is the number of waiters requeued to the futex for
+the futex word at
+.IR uaddr2 .
+.TP
+.B FUTEX_WAIT_REQUEUE_PI
+Returns 0 if the caller was successfully requeued to the futex for
+the futex word at
+.IR uaddr2 .
+.\"
+.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+.\"
+.SH ERRORS
+.TP
+.B EACCES
+No read access to the memory of a futex word.
+.TP
+.B EAGAIN
+.RB ( FUTEX_WAIT ,
+.BR FUTEX_WAIT_BITSET ,
+.BR FUTEX_WAIT_REQUEUE_PI )
+The value pointed to by
+.I uaddr
+was not equal to the expected value
+.I val
+at the time of the call.
+.IP
+.BR Note :
+on Linux, the symbolic names
+.B EAGAIN
+and
+.B EWOULDBLOCK
+(both of which appear in different parts of the kernel futex code)
+have the same value.
+.TP
+.B EAGAIN
+.RB ( FUTEX_CMP_REQUEUE ,
+.BR FUTEX_CMP_REQUEUE_PI )
+The value pointed to by
+.I uaddr
+is not equal to the expected value
+.IR val3 .
+.TP
+.B EAGAIN
+.RB ( FUTEX_LOCK_PI ,
+.BR FUTEX_LOCK_PI2 ,
+.BR FUTEX_TRYLOCK_PI ,
+.BR FUTEX_CMP_REQUEUE_PI )
+The futex owner thread ID of
+.I uaddr
+(for
+.BR FUTEX_CMP_REQUEUE_PI :
+.IR uaddr2 )
+is about to exit,
+but has not yet handled the internal state cleanup.
+Try again.
+.TP
+.B EDEADLK
+.RB ( FUTEX_LOCK_PI ,
+.BR FUTEX_LOCK_PI2 ,
+.BR FUTEX_TRYLOCK_PI ,
+.BR FUTEX_CMP_REQUEUE_PI )
+The futex word at
+.I uaddr
+is already locked by the caller.
+.TP
+.B EDEADLK
+.\" FIXME . I see that kernel/locking/rtmutex.c uses EDEADLK in some
+.\" places, and EDEADLOCK in others. On almost all architectures
+.\" these constants are synonymous. Is there a reason that both
+.\" names are used?
+.\"
+.\" tglx (July 2015): "No. We should probably fix that."
+.\"
+.RB ( FUTEX_CMP_REQUEUE_PI )
+While requeueing a waiter to the PI futex for the futex word at
+.IR uaddr2 ,
+the kernel detected a deadlock.
+.TP
+.B EFAULT
+A required pointer argument (i.e.,
+.IR uaddr ,
+.IR uaddr2 ,
+or
+.IR timeout )
+did not point to a valid user-space address.
+.TP
+.B EINTR
+A
+.B FUTEX_WAIT
+or
+.B FUTEX_WAIT_BITSET
+operation was interrupted by a signal (see
+.BR signal (7)).
+Before Linux 2.6.22, this error could also be returned for
+a spurious wakeup; since Linux 2.6.22, this no longer happens.
+.TP
+.B EINVAL
+The operation in
+.I futex_op
+is one of those that employs a timeout, but the supplied
+.I timeout
+argument was invalid
+.RI ( tv_sec
+was less than zero, or
+.I tv_nsec
+was not less than 1,000,000,000).
+.TP
+.B EINVAL
+The operation specified in
+.I futex_op
+employs one or both of the pointers
+.I uaddr
+and
+.IR uaddr2 ,
+but one of these does not point to a valid object\[em]that is,
+the address is not four-byte-aligned.
+.TP
+.B EINVAL
+.RB ( FUTEX_WAIT_BITSET ,
+.BR FUTEX_WAKE_BITSET )
+The bit mask supplied in
+.I val3
+is zero.
+.TP
+.B EINVAL
+.RB ( FUTEX_CMP_REQUEUE_PI )
+.I uaddr
+equals
+.I uaddr2
+(i.e., an attempt was made to requeue to the same futex).
+.TP
+.B EINVAL
+.RB ( FUTEX_FD )
+The signal number supplied in
+.I val
+is invalid.
+.TP
+.B EINVAL
+.RB ( FUTEX_WAKE ,
+.BR FUTEX_WAKE_OP ,
+.BR FUTEX_WAKE_BITSET ,
+.BR FUTEX_REQUEUE ,
+.BR FUTEX_CMP_REQUEUE )
+The kernel detected an inconsistency between the user-space state at
+.I uaddr
+and the kernel state\[em]that is, it detected a waiter which waits in
+.B FUTEX_LOCK_PI
+or
+.B FUTEX_LOCK_PI2
+on
+.IR uaddr .
+.TP
+.B EINVAL
+.RB ( FUTEX_LOCK_PI ,
+.BR FUTEX_LOCK_PI2 ,
+.BR FUTEX_TRYLOCK_PI ,
+.BR FUTEX_UNLOCK_PI )
+The kernel detected an inconsistency between the user-space state at
+.I uaddr
+and the kernel state.
+This indicates either state corruption
+or that the kernel found a waiter on
+.I uaddr
+which is waiting via
+.B FUTEX_WAIT
+or
+.BR FUTEX_WAIT_BITSET .
+.TP
+.B EINVAL
+.RB ( FUTEX_CMP_REQUEUE_PI )
+The kernel detected an inconsistency between the user-space state at
+.I uaddr2
+and the kernel state;
+.\" From a conversation with Thomas Gleixner (Aug 2015): ###
+.\" The kernel sees: I have non PI state for a futex you tried to
+.\" tell me was PI
+that is, the kernel detected a waiter which waits via
+.B FUTEX_WAIT
+or
+.B FUTEX_WAIT_BITSET
+on
+.IR uaddr2 .
+.TP
+.B EINVAL
+.RB ( FUTEX_CMP_REQUEUE_PI )
+The kernel detected an inconsistency between the user-space state at
+.I uaddr
+and the kernel state;
+that is, the kernel detected a waiter which waits via
+.B FUTEX_WAIT
+or
+.B FUTEX_WAIT_BITSET
+on
+.IR uaddr .
+.TP
+.B EINVAL
+.RB ( FUTEX_CMP_REQUEUE_PI )
+The kernel detected an inconsistency between the user-space state at
+.I uaddr
+and the kernel state;
+that is, the kernel detected a waiter which waits on
+.I uaddr
+via
+.B FUTEX_LOCK_PI
+or
+.B FUTEX_LOCK_PI2
+(instead of
+.BR FUTEX_WAIT_REQUEUE_PI ).
+.TP
+.B EINVAL
+.RB ( FUTEX_CMP_REQUEUE_PI )
+.\" This deals with the case:
+.\" wait_requeue_pi(A, B);
+.\" requeue_pi(A, C);
+An attempt was made to requeue a waiter to a futex other than that
+specified by the matching
+.B FUTEX_WAIT_REQUEUE_PI
+call for that waiter.
+.TP
+.B EINVAL
+.RB ( FUTEX_CMP_REQUEUE_PI )
+The
+.I val
+argument is not 1.
+.TP
+.B EINVAL
+Invalid argument.
+.TP
+.B ENFILE
+.RB ( FUTEX_FD )
+The system-wide limit on the total number of open files has been reached.
+.TP
+.B ENOMEM
+.RB ( FUTEX_LOCK_PI ,
+.BR FUTEX_LOCK_PI2 ,
+.BR FUTEX_TRYLOCK_PI ,
+.BR FUTEX_CMP_REQUEUE_PI )
+The kernel could not allocate memory to hold state information.
+.TP
+.B ENOSYS
+Invalid operation specified in
+.IR futex_op .
+.TP
+.B ENOSYS
+The
+.B FUTEX_CLOCK_REALTIME
+option was specified in
+.IR futex_op ,
+but the accompanying operation was neither
+.BR FUTEX_WAIT ,
+.BR FUTEX_WAIT_BITSET ,
+.BR FUTEX_WAIT_REQUEUE_PI ,
+nor
+.BR FUTEX_LOCK_PI2 .
+.TP
+.B ENOSYS
+.RB ( FUTEX_LOCK_PI ,
+.BR FUTEX_LOCK_PI2 ,
+.BR FUTEX_TRYLOCK_PI ,
+.BR FUTEX_UNLOCK_PI ,
+.BR FUTEX_CMP_REQUEUE_PI ,
+.BR FUTEX_WAIT_REQUEUE_PI )
+A run-time check determined that the operation is not available.
+The PI-futex operations are not implemented on all architectures and
+are not supported on some CPU variants.
+.TP
+.B EPERM
+.RB ( FUTEX_LOCK_PI ,
+.BR FUTEX_LOCK_PI2 ,
+.BR FUTEX_TRYLOCK_PI ,
+.BR FUTEX_CMP_REQUEUE_PI )
+The caller is not allowed to attach itself to the futex at
+.I uaddr
+(for
+.BR FUTEX_CMP_REQUEUE_PI :
+the futex at
+.IR uaddr2 ).
+(This may be caused by a state corruption in user space.)
+.TP
+.B EPERM
+.RB ( FUTEX_UNLOCK_PI )
+The caller does not own the lock represented by the futex word.
+.TP
+.B ESRCH
+.RB ( FUTEX_LOCK_PI ,
+.BR FUTEX_LOCK_PI2 ,
+.BR FUTEX_TRYLOCK_PI ,
+.BR FUTEX_CMP_REQUEUE_PI )
+The thread ID in the futex word at
+.I uaddr
+does not exist.
+.TP
+.B ESRCH
+.RB ( FUTEX_CMP_REQUEUE_PI )
+The thread ID in the futex word at
+.I uaddr2
+does not exist.
+.TP
+.B ETIMEDOUT
+The operation in
+.I futex_op
+employed the timeout specified in
+.IR timeout ,
+and the timeout expired before the operation completed.
+.\"
+.\""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+.\"
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 2.6.0.
+.PP
+Initial futex support was merged in Linux 2.5.7 but with different
+semantics from what was described above.
+A four-argument system call with the semantics
+described in this page was introduced in Linux 2.5.40.
+A fifth argument was added in Linux 2.5.70,
+and a sixth argument was added in Linux 2.6.7.
+.SH EXAMPLES
+The program below demonstrates use of futexes in a program where a parent
+process and a child process use a pair of futexes located inside a
+shared anonymous mapping to synchronize access to a shared resource:
+the terminal.
+The two processes each write
+.I nloops
+(a command-line argument that defaults to 5 if omitted)
+messages to the terminal and employ a synchronization protocol
+that ensures that they alternate in writing messages.
+Upon running this program we see output such as the following:
+.PP
+.in +4n
+.EX
+$ \fB./futex_demo\fP
+Parent (18534) 0
+Child (18535) 0
+Parent (18534) 1
+Child (18535) 1
+Parent (18534) 2
+Child (18535) 2
+Parent (18534) 3
+Child (18535) 3
+Parent (18534) 4
+Child (18535) 4
+.EE
+.in
+.SS Program source
+\&
+.\" SRC BEGIN (futex.c)
+.EX
+/* futex_demo.c
+\&
+ Usage: futex_demo [nloops]
+ (Default: 5)
+\&
+ Demonstrate the use of futexes in a program where parent and child
+ use a pair of futexes located inside a shared anonymous mapping to
+ synchronize access to a shared resource: the terminal. The two
+ processes each write \[aq]nloops\[aq] messages to the terminal and employ
+ a synchronization protocol that ensures that they alternate in
+ writing messages.
+*/
+#define _GNU_SOURCE
+#include <err.h>
+#include <errno.h>
+#include <linux/futex.h>
+#include <stdatomic.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <sys/syscall.h>
+#include <sys/time.h>
+#include <sys/wait.h>
+#include <unistd.h>
+\&
+static uint32_t *futex1, *futex2, *iaddr;
+\&
+static int
+futex(uint32_t *uaddr, int futex_op, uint32_t val,
+ const struct timespec *timeout, uint32_t *uaddr2, uint32_t val3)
+{
+ return syscall(SYS_futex, uaddr, futex_op, val,
+ timeout, uaddr2, val3);
+}
+\&
+/* Acquire the futex pointed to by \[aq]futexp\[aq]: wait for its value to
+ become 1, and then set the value to 0. */
+\&
+static void
+fwait(uint32_t *futexp)
+{
+ long s;
+ const uint32_t one = 1;
+\&
+ /* atomic_compare_exchange_strong(ptr, oldval, newval)
+ atomically performs the equivalent of:
+\&
+ if (*ptr == *oldval)
+ *ptr = newval;
+\&
+ It returns true if the test yielded true and *ptr was updated. */
+\&
+ while (1) {
+\&
+ /* Is the futex available? */
+ if (atomic_compare_exchange_strong(futexp, &one, 0))
+ break; /* Yes */
+\&
+ /* Futex is not available; wait. */
+\&
+ s = futex(futexp, FUTEX_WAIT, 0, NULL, NULL, 0);
+ if (s == \-1 && errno != EAGAIN)
+ err(EXIT_FAILURE, "futex\-FUTEX_WAIT");
+ }
+}
+\&
+/* Release the futex pointed to by \[aq]futexp\[aq]: if the futex currently
+ has the value 0, set its value to 1 and then wake any futex waiters,
+ so that if the peer is blocked in fwait(), it can proceed. */
+\&
+static void
+fpost(uint32_t *futexp)
+{
+ long s;
+ const uint32_t zero = 0;
+\&
+ /* atomic_compare_exchange_strong() was described
+ in comments above. */
+\&
+ if (atomic_compare_exchange_strong(futexp, &zero, 1)) {
+ s = futex(futexp, FUTEX_WAKE, 1, NULL, NULL, 0);
+ if (s == \-1)
+ err(EXIT_FAILURE, "futex\-FUTEX_WAKE");
+ }
+}
+\&
+int
+main(int argc, char *argv[])
+{
+ pid_t childPid;
+ unsigned int nloops;
+\&
+ setbuf(stdout, NULL);
+\&
+ nloops = (argc > 1) ? atoi(argv[1]) : 5;
+\&
+ /* Create a shared anonymous mapping that will hold the futexes.
+ Since the futexes are being shared between processes, we
+ subsequently use the "shared" futex operations (i.e., not the
+ ones suffixed "_PRIVATE"). */
+\&
+ iaddr = mmap(NULL, sizeof(*iaddr) * 2, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_SHARED, \-1, 0);
+ if (iaddr == MAP_FAILED)
+ err(EXIT_FAILURE, "mmap");
+\&
+ futex1 = &iaddr[0];
+ futex2 = &iaddr[1];
+\&
+ *futex1 = 0; /* State: unavailable */
+ *futex2 = 1; /* State: available */
+\&
+ /* Create a child process that inherits the shared anonymous
+ mapping. */
+\&
+ childPid = fork();
+ if (childPid == \-1)
+ err(EXIT_FAILURE, "fork");
+\&
+ if (childPid == 0) { /* Child */
+ for (unsigned int j = 0; j < nloops; j++) {
+ fwait(futex1);
+ printf("Child (%jd) %u\en", (intmax_t) getpid(), j);
+ fpost(futex2);
+ }
+\&
+ exit(EXIT_SUCCESS);
+ }
+\&
+ /* Parent falls through to here. */
+\&
+ for (unsigned int j = 0; j < nloops; j++) {
+ fwait(futex2);
+ printf("Parent (%jd) %u\en", (intmax_t) getpid(), j);
+ fpost(futex1);
+ }
+\&
+ wait(NULL);
+\&
+ exit(EXIT_SUCCESS);
+}
+.EE
+.\" SRC END
+.SH SEE ALSO
+.ad l
+.BR get_robust_list (2),
+.BR restart_syscall (2),
+.BR pthread_mutexattr_getprotocol (3),
+.BR futex (7),
+.BR sched (7)
+.PP
+The following kernel source files:
+.IP \[bu] 3
+.I Documentation/pi\-futex.txt
+.IP \[bu]
+.I Documentation/futex\-requeue\-pi.txt
+.IP \[bu]
+.I Documentation/locking/rt\-mutex.txt
+.IP \[bu]
+.I Documentation/locking/rt\-mutex\-design.txt
+.IP \[bu]
+.I Documentation/robust\-futex\-ABI.txt
+.PP
+Franke, H., Russell, R., and Kirkwood, M., 2002.
+\fIFuss, Futexes and Furwocks: Fast Userlevel Locking in Linux\fP
+(from proceedings of the Ottawa Linux Symposium 2002),
+.br
+.UR http://kernel.org\:/doc\:/ols\:/2002\:/ols2002\-pages\-479\-495.pdf
+.UE
+.PP
+Hart, D., 2009. \fIA futex overview and update\fP,
+.UR http://lwn.net/Articles/360699/
+.UE
+.PP
+Hart, D.\& and Guniguntala, D., 2009.
+\fIRequeue-PI: Making glibc Condvars PI-Aware\fP
+(from proceedings of the 2009 Real-Time Linux Workshop),
+.UR http://lwn.net/images/conf/rtlws11/papers/proc/p10.pdf
+.UE
+.PP
+Drepper, U., 2011. \fIFutexes Are Tricky\fP,
+.UR http://www.akkadia.org/drepper/futex.pdf
+.UE
+.PP
+Futex example library, futex\-*.tar.bz2 at
+.br
+.UR https://mirrors.kernel.org\:/pub\:/linux\:/kernel\:/people\:/rusty/
+.UE
+.\"
+.\" FIXME(Torvald) We should probably refer to the glibc code here, in
+.\" particular the glibc-internal futex wrapper functions that are
+.\" WIP, and the generic pthread_mutex_t and perhaps condvar
+.\" implementations.
diff --git a/man2/futimesat.2 b/man2/futimesat.2
new file mode 100644
index 0000000..4a120cd
--- /dev/null
+++ b/man2/futimesat.2
@@ -0,0 +1,128 @@
+.\" This manpage is Copyright (C) 2006, Michael Kerrisk
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.TH futimesat 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+futimesat \- change timestamps of a file relative to a \
+directory file descriptor
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#include <fcntl.h>" " /* Definition of " AT_* " constants */"
+.B #include <sys/time.h>
+.PP
+.BI "[[deprecated]] int futimesat(int " dirfd ", const char *" pathname ,
+.BI " const struct timeval " times [2]);
+.fi
+.PP
+.RS -4
+Feature Test Macro Requirements for glibc (see
+.BR feature_test_macros (7)):
+.RE
+.PP
+.BR futimesat ():
+.nf
+ _GNU_SOURCE
+.fi
+.SH DESCRIPTION
+This system call is obsolete.
+Use
+.BR utimensat (2)
+instead.
+.PP
+The
+.BR futimesat ()
+system call operates in exactly the same way as
+.BR utimes (2),
+except for the differences described in this manual page.
+.PP
+If the pathname given in
+.I pathname
+is relative, then it is interpreted relative to the directory
+referred to by the file descriptor
+.I dirfd
+(rather than relative to the current working directory of
+the calling process, as is done by
+.BR utimes (2)
+for a relative pathname).
+.PP
+If
+.I pathname
+is relative and
+.I dirfd
+is the special value
+.BR AT_FDCWD ,
+then
+.I pathname
+is interpreted relative to the current working
+directory of the calling process (like
+.BR utimes (2)).
+.PP
+If
+.I pathname
+is absolute, then
+.I dirfd
+is ignored.
+(See
+.BR openat (2)
+for an explanation of why the
+.I dirfd
+argument is useful.)
+.SH RETURN VALUE
+On success,
+.BR futimesat ()
+returns 0.
+On error, \-1 is returned and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+The same errors that occur for
+.BR utimes (2)
+can also occur for
+.BR futimesat ().
+The following additional errors can occur for
+.BR futimesat ():
+.TP
+.B EBADF
+.I pathname
+is relative but
+.I dirfd
+is neither
+.B AT_FDCWD
+nor a valid file descriptor.
+.TP
+.B ENOTDIR
+.I pathname
+is relative and
+.I dirfd
+is a file descriptor referring to a file other than a directory.
+.SH VERSIONS
+.SS glibc
+If
+.I pathname
+is NULL, then the glibc
+.BR futimesat ()
+wrapper function updates the times for the file referred to by
+.IR dirfd .
+.\" The Solaris futimesat() also has this strangeness.
+.SH STANDARDS
+None.
+.SH HISTORY
+Linux 2.6.16,
+glibc 2.4.
+.PP
+It was implemented from a specification that was proposed for POSIX.1,
+but that specification was replaced by the one for
+.BR utimensat (2).
+.PP
+A similar system call exists on Solaris.
+.SH NOTES
+.SH SEE ALSO
+.BR stat (2),
+.BR utimensat (2),
+.BR utimes (2),
+.BR futimes (3),
+.BR path_resolution (7)
diff --git a/man2/get_kernel_syms.2 b/man2/get_kernel_syms.2
new file mode 100644
index 0000000..307d9ca
--- /dev/null
+++ b/man2/get_kernel_syms.2
@@ -0,0 +1,88 @@
+.\" Copyright (C) 1996 Free Software Foundation, Inc.
+.\"
+.\" SPDX-License-Identifier: GPL-1.0-or-later
+.\"
+.\" 2006-02-09, some reformatting by Luc Van Oostenryck; some
+.\" reformatting and rewordings by mtk
+.\"
+.TH get_kernel_syms 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+get_kernel_syms \- retrieve exported kernel and module symbols
+.SH SYNOPSIS
+.nf
+.B #include <linux/module.h>
+.PP
+.BI "[[deprecated]] int get_kernel_syms(struct kernel_sym *" table );
+.fi
+.SH DESCRIPTION
+.BR Note :
+This system call is present only before Linux 2.6.
+.PP
+If
+.I table
+is NULL,
+.BR get_kernel_syms ()
+returns the number of symbols available for query.
+Otherwise, it fills in a table of structures:
+.PP
+.in +4n
+.EX
+struct kernel_sym {
+ unsigned long value;
+ char name[60];
+};
+.EE
+.in
+.PP
+The symbols are interspersed with magic symbols of the form
+.BI # module-name
+with the kernel having an empty name.
+The value associated with a symbol of this form is the address at
+which the module is loaded.
+.PP
+The symbols exported from each module follow their magic module tag
+and the modules are returned in the reverse of the
+order in which they were loaded.
+.SH RETURN VALUE
+On success, returns the number of symbols copied to
+.IR table .
+On error, \-1 is returned and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+There is only one possible error return:
+.TP
+.B ENOSYS
+.BR get_kernel_syms ()
+is not supported in this version of the kernel.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Removed in Linux 2.6.
+.\" Removed in Linux 2.5.48
+.PP
+This obsolete system call is not supported by glibc.
+No declaration is provided in glibc headers, but, through a quirk of history,
+glibc versions before glibc 2.23 did export an ABI for this system call.
+Therefore, in order to employ this system call,
+it was sufficient to manually declare the interface in your code;
+alternatively, you could invoke the system call using
+.BR syscall (2).
+.SH BUGS
+There is no way to indicate the size of the buffer allocated for
+.IR table .
+If symbols have been added to the kernel since the
+program queried for the symbol table size, memory will be corrupted.
+.PP
+The length of exported symbol names is limited to 59 characters.
+.PP
+Because of these limitations, this system call is deprecated in
+favor of
+.BR query_module (2)
+(which is itself nowadays deprecated
+in favor of other interfaces described on its manual page).
+.SH SEE ALSO
+.BR create_module (2),
+.BR delete_module (2),
+.BR init_module (2),
+.BR query_module (2)
diff --git a/man2/get_mempolicy.2 b/man2/get_mempolicy.2
new file mode 100644
index 0000000..45b8f2f
--- /dev/null
+++ b/man2/get_mempolicy.2
@@ -0,0 +1,235 @@
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft-var
+.\"
+.\" Copyright 2003,2004 Andi Kleen, SuSE Labs.
+.\" and Copyright 2007 Lee Schermerhorn, Hewlett Packard
+.\"
+.\" 2006-02-03, mtk, substantial wording changes and other improvements
+.\" 2007-08-27, Lee Schermerhorn <Lee.Schermerhorn@hp.com>
+.\" more precise specification of behavior.
+.\"
+.TH get_mempolicy 2 2023-07-16 "Linux man-pages 6.05.01"
+.SH NAME
+get_mempolicy \- retrieve NUMA memory policy for a thread
+.SH LIBRARY
+NUMA (Non-Uniform Memory Access) policy library
+.RI ( libnuma ", " \-lnuma )
+.SH SYNOPSIS
+.B "#include <numaif.h>"
+.nf
+.PP
+.BI "long get_mempolicy(int *" mode ,
+.BI " unsigned long " nodemask [(. maxnode " + ULONG_WIDTH - 1)"
+.B " / ULONG_WIDTH],"
+.BI " unsigned long " maxnode ", void *" addr ,
+.BI " unsigned long " flags );
+.fi
+.SH DESCRIPTION
+.BR get_mempolicy ()
+retrieves the NUMA policy of the calling thread or of a memory address,
+depending on the setting of
+.IR flags .
+.PP
+A NUMA machine has different
+memory controllers with different distances to specific CPUs.
+The memory policy defines from which node memory is allocated for
+the thread.
+.PP
+If
+.I flags
+is specified as 0,
+then information about the calling thread's default policy
+(as set by
+.BR set_mempolicy (2))
+is returned, in the buffers pointed to by
+.I mode
+and
+.IR nodemask .
+The value returned in these arguments
+may be used to restore the thread's policy to its state at
+the time of the call to
+.BR get_mempolicy ()
+using
+.BR set_mempolicy (2).
+When
+.I flags
+is 0,
+.I addr
+must be specified as NULL.
+.PP
+If
+.I flags
+specifies
+.B MPOL_F_MEMS_ALLOWED
+(available since Linux 2.6.24), the
+.I mode
+argument is ignored and the set of nodes (memories) that the
+thread is allowed to specify in subsequent calls to
+.BR mbind (2)
+or
+.BR set_mempolicy (2)
+(in the absence of any
+.IR "mode flags" )
+is returned in
+.IR nodemask .
+It is not permitted to combine
+.B MPOL_F_MEMS_ALLOWED
+with either
+.B MPOL_F_ADDR
+or
+.BR MPOL_F_NODE .
+.PP
+If
+.I flags
+specifies
+.BR MPOL_F_ADDR ,
+then information is returned about the policy governing the memory
+address given in
+.IR addr .
+This policy may be different from the thread's default policy if
+.BR mbind (2)
+or one of the helper functions described in
+.BR numa (3)
+has been used to establish a policy for the memory range containing
+.IR addr .
+.PP
+If the
+.I mode
+argument is not NULL, then
+.BR get_mempolicy ()
+will store the policy mode and any optional
+.I "mode flags"
+of the requested NUMA policy in the location pointed to by this argument.
+If
+.I nodemask
+is not NULL, then the nodemask associated with the policy will be stored
+in the location pointed to by this argument.
+.I maxnode
+specifies the number of node IDs
+that can be stored into
+.IR nodemask \[em]that
+is, the maximum node ID plus one.
+The value specified by
+.I maxnode
+is always rounded to a multiple of
+.IR "sizeof(unsigned\ long)*8" .
+.PP
+If
+.I flags
+specifies both
+.B MPOL_F_NODE
+and
+.BR MPOL_F_ADDR ,
+.BR get_mempolicy ()
+will return the node ID of the node on which the address
+.I addr
+is allocated into the location pointed to by
+.IR mode .
+If no page has yet been allocated for the specified address,
+.BR get_mempolicy ()
+will allocate a page as if the thread had performed a read
+(load) access to that address, and return the ID of the node
+where that page was allocated.
+.PP
+If
+.I flags
+specifies
+.BR MPOL_F_NODE ,
+but not
+.BR MPOL_F_ADDR ,
+and the thread's current policy is
+.BR MPOL_INTERLEAVE ,
+then
+.BR get_mempolicy ()
+will return in the location pointed to by a non-NULL
+.I mode
+argument,
+the node ID of the next node that will be used for
+interleaving of internal kernel pages allocated on behalf of the thread.
+.\" Note: code returns next interleave node via 'mode' argument -Lee Schermerhorn
+These allocations include pages for memory-mapped files in
+process memory ranges mapped using the
+.BR mmap (2)
+call with the
+.B MAP_PRIVATE
+flag for read accesses, and in memory ranges mapped with the
+.B MAP_SHARED
+flag for all accesses.
+.PP
+Other flag values are reserved.
+.PP
+For an overview of the possible policies see
+.BR set_mempolicy (2).
+.SH RETURN VALUE
+On success,
+.BR get_mempolicy ()
+returns 0;
+on error, \-1 is returned and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EFAULT
+Part or all of the memory range specified by
+.I nodemask
+and
+.I maxnode
+points outside your accessible address space.
+.TP
+.B EINVAL
+The value specified by
+.I maxnode
+is less than the number of node IDs supported by the system.
+Or
+.I flags
+specified values other than
+.BR MPOL_F_NODE ,
+.BR MPOL_F_ADDR ", or"
+.BR MPOL_F_MEMS_ALLOWED ;
+or
+.I flags
+specified
+.B MPOL_F_ADDR
+and
+.I addr
+is NULL,
+or
+.I flags
+did not specify
+.B MPOL_F_ADDR
+and
+.I addr
+is not NULL.
+Or,
+.I flags
+specified
+.B MPOL_F_NODE
+but not
+.B MPOL_F_ADDR
+and the current thread policy is not
+.BR MPOL_INTERLEAVE .
+Or,
+.I flags
+specified
+.B MPOL_F_MEMS_ALLOWED
+with either
+.B MPOL_F_ADDR
+or
+.BR MPOL_F_NODE .
+(And there are other
+.B EINVAL
+cases.)
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 2.6.7.
+.SH NOTES
+For information on library support, see
+.BR numa (7).
+.SH SEE ALSO
+.BR getcpu (2),
+.BR mbind (2),
+.BR mmap (2),
+.BR set_mempolicy (2),
+.BR numa (3),
+.BR numa (7),
+.BR numactl (8)
diff --git a/man2/get_robust_list.2 b/man2/get_robust_list.2
new file mode 100644
index 0000000..7ca4817
--- /dev/null
+++ b/man2/get_robust_list.2
@@ -0,0 +1,156 @@
+.\" Copyright (C) 2006 Red Hat, Inc. All Rights Reserved.
+.\" Written by Ivana Varekova <varekova@redhat.com>
+.\" and Copyright (c) 2017, Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" FIXME Something could be added to this page (or exit(2))
+.\" about exit_robust_list processing
+.\"
+.TH get_robust_list 2 2022-10-30 "Linux man-pages 6.05.01"
+.SH NAME
+get_robust_list, set_robust_list \- get/set list of robust futexes
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#include <linux/futex.h>" \
+" /* Definition of " "struct robust_list_head" " */"
+.BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */"
+.B #include <unistd.h>
+.PP
+.BI "long syscall(SYS_get_robust_list, int " pid ,
+.BI " struct robust_list_head **" head_ptr ", size_t *" len_ptr );
+.B long syscall(SYS_set_robust_list,
+.BI " struct robust_list_head *" head ", size_t " len );
+.fi
+.PP
+.IR Note :
+glibc provides no wrappers for these system calls,
+necessitating the use of
+.BR syscall (2).
+.SH DESCRIPTION
+These system calls deal with per-thread robust futex lists.
+These lists are managed in user space:
+the kernel knows only about the location of the head of the list.
+A thread can inform the kernel of the location of its robust futex list using
+.BR set_robust_list ().
+The address of a thread's robust futex list can be obtained using
+.BR get_robust_list ().
+.PP
+The purpose of the robust futex list is to ensure that if a thread
+accidentally fails to unlock a futex before terminating or calling
+.BR execve (2),
+another thread that is waiting on that futex is notified that
+the former owner of the futex has died.
+This notification consists of two pieces: the
+.B FUTEX_OWNER_DIED
+bit is set in the futex word, and the kernel performs a
+.BR futex (2)
+.B FUTEX_WAKE
+operation on one of the threads waiting on the futex.
+.PP
+The
+.BR get_robust_list ()
+system call returns the head of the robust futex list of the thread
+whose thread ID is specified in
+.IR pid .
+If
+.I pid
+is 0,
+the head of the list for the calling thread is returned.
+The list head is stored in the location pointed to by
+.IR head_ptr .
+The size of the object pointed to by
+.I *head_ptr
+is stored in
+.IR *len_ptr .
+.PP
+Permission to employ
+.BR get_robust_list ()
+is governed by a ptrace access mode
+.B PTRACE_MODE_READ_REALCREDS
+check; see
+.BR ptrace (2).
+.PP
+The
+.BR set_robust_list ()
+system call requests the kernel to record the head of the list of
+robust futexes owned by the calling thread.
+The
+.I head
+argument is the list head to record.
+The
+.I len
+argument should be
+.IR sizeof(*head) .
+.SH RETURN VALUE
+The
+.BR set_robust_list ()
+and
+.BR get_robust_list ()
+system calls return zero when the operation is successful,
+an error code otherwise.
+.SH ERRORS
+The
+.BR set_robust_list ()
+system call can fail with the following error:
+.TP
+.B EINVAL
+.I len
+does not equal
+.IR "sizeof(struct\ robust_list_head)" .
+.PP
+The
+.BR get_robust_list ()
+system call can fail with the following errors:
+.TP
+.B EFAULT
+The head of the robust futex list can't be stored at the location
+.IR head_ptr .
+.TP
+.B EPERM
+The calling process does not have permission to see the robust futex list of
+the thread with the thread ID
+.IR pid ,
+and does not have the
+.B CAP_SYS_PTRACE
+capability.
+.TP
+.B ESRCH
+No thread with the thread ID
+.I pid
+could be found.
+.SH VERSIONS
+These system calls were added in Linux 2.6.17.
+.SH NOTES
+These system calls are not needed by normal applications.
+.PP
+A thread can have only one robust futex list;
+therefore applications that wish
+to use this functionality should use the robust mutexes provided by glibc.
+.PP
+In the initial implementation,
+a thread waiting on a futex was notified that the owner had died
+only if the owner terminated.
+Starting with Linux 2.6.28,
+.\" commit 8141c7f3e7aee618312fa1c15109e1219de784a7
+notification was extended to include the case where the owner performs an
+.BR execve (2).
+.PP
+The thread IDs mentioned in the main text are
+.I kernel
+thread IDs of the kind returned by
+.BR clone (2)
+and
+.BR gettid (2).
+.SH SEE ALSO
+.BR futex (2),
+.BR pthread_mutexattr_setrobust (3)
+.PP
+.I Documentation/robust\-futexes.txt
+and
+.I Documentation/robust\-futex\-ABI.txt
+in the Linux kernel source tree
+.\" http://lwn.net/Articles/172149/
diff --git a/man2/get_thread_area.2 b/man2/get_thread_area.2
new file mode 100644
index 0000000..a03fe54
--- /dev/null
+++ b/man2/get_thread_area.2
@@ -0,0 +1 @@
+.so man2/set_thread_area.2
diff --git a/man2/getcpu.2 b/man2/getcpu.2
new file mode 100644
index 0000000..f90401f
--- /dev/null
+++ b/man2/getcpu.2
@@ -0,0 +1,147 @@
+.\" SPDX-License-Identifier: Linux-man-pages-1-para
+.\"
+.\" This man page is Copyright (C) 2006 Andi Kleen <ak@muc.de>.
+.\"
+.\" 2008, mtk, various edits
+.\"
+.TH getcpu 2 2023-07-15 "Linux man-pages 6.05.01"
+.SH NAME
+getcpu \- determine CPU and NUMA node on which the calling thread is running
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#define _GNU_SOURCE" " /* See feature_test_macros(7) */"
+.B #include <sched.h>
+.PP
+.BI "int getcpu(unsigned int *_Nullable " cpu ", \
+unsigned int *_Nullable " node );
+.fi
+.SH DESCRIPTION
+The
+.BR getcpu ()
+system call identifies the processor and node on which the calling
+thread or process is currently running and writes them into the
+integers pointed to by the
+.I cpu
+and
+.I node
+arguments.
+The processor is a unique small integer identifying a CPU.
+The node is a unique small identifier identifying a NUMA node.
+When either
+.I cpu
+or
+.I node
+is NULL, nothing is written to the respective pointer.
+.PP
+The information placed in
+.I cpu
+is guaranteed to be current only at the time of the call:
+unless the CPU affinity has been fixed using
+.BR sched_setaffinity (2),
+the kernel might change the CPU at any time.
+(Normally this does not happen
+because the scheduler tries to minimize movements between CPUs to
+keep caches hot, but it is possible.)
+The caller must allow for the possibility that the information returned in
+.I cpu
+and
+.I node
+is no longer current by the time the call returns.
+.SH RETURN VALUE
+On success, 0 is returned.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EFAULT
+Arguments point outside the calling process's address space.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 2.6.19 (x86-64 and i386),
+glibc 2.29.
+.\"
+.SS C library/kernel differences
+The kernel system call has a third argument:
+.PP
+.in +4n
+.nf
+.BI "int getcpu(unsigned int *" cpu ", unsigned int *" node ,
+.BI " struct getcpu_cache *" tcache );
+.fi
+.in
+.PP
+The
+.I tcache
+argument is unused since Linux 2.6.24,
+and (when invoking the system call directly)
+should be specified as NULL,
+unless portability to Linux 2.6.23 or earlier is required.
+.PP
+.\" commit 4307d1e5ada595c87f9a4d16db16ba5edb70dcb1
+.\" Author: Ingo Molnar <mingo@elte.hu>
+.\" Date: Wed Nov 7 18:37:48 2007 +0100
+.\" x86: ignore the sys_getcpu() tcache parameter
+In Linux 2.6.23 and earlier, if the
+.I tcache
+argument was non-NULL,
+then it specified a pointer to a caller-allocated buffer in thread-local
+storage that was used to provide a caching mechanism for
+.BR getcpu ().
+Use of the cache could speed
+.BR getcpu ()
+calls, at the cost that there was a very small chance that
+the returned information would be out of date.
+The caching mechanism was considered to cause problems when
+migrating threads between CPUs, and so the argument is now ignored.
+.\"
+.\" ===== Before Linux 2.6.24: =====
+.\" .I tcache
+.\" is a pointer to a
+.\" .IR "struct getcpu_cache"
+.\" that is used as a cache by
+.\" .BR getcpu ().
+.\" The caller should put the cache into a thread-local variable
+.\" if the process is multithreaded,
+.\" because the cache cannot be shared between different threads.
+.\" .I tcache
+.\" can be NULL.
+.\" If it is not NULL
+.\" .BR getcpu ()
+.\" will use it to speed up operation.
+.\" The information inside the cache is private to the system call
+.\" and should not be accessed by the user program.
+.\" The information placed in the cache can change between Linux releases.
+.\"
+.\" When no cache is specified
+.\" .BR getcpu ()
+.\" will be slower,
+.\" but always retrieve the current CPU and node information.
+.\" With a cache
+.\" .BR getcpu ()
+.\" is faster.
+.\" However, the cached information is updated only once per jiffy (see
+.\" .BR time (7)).
+.\" This means that the information could theoretically be out of date,
+.\" although in practice the scheduler's attempt to maintain
+.\" soft CPU affinity means that the information is unlikely to change
+.\" over the course of the caching interval.
+.SH NOTES
+Linux makes a best effort to make this call as fast as possible.
+(On some architectures, this is done via an implementation in the
+.BR vdso (7).)
+The intention of
+.BR getcpu ()
+is to allow programs to make optimizations with per-CPU data
+or for NUMA optimization.
+.SH SEE ALSO
+.BR mbind (2),
+.BR sched_setaffinity (2),
+.BR set_mempolicy (2),
+.BR sched_getcpu (3),
+.BR cpuset (7),
+.BR vdso (7)
diff --git a/man2/getcwd.2 b/man2/getcwd.2
new file mode 100644
index 0000000..f080be0
--- /dev/null
+++ b/man2/getcwd.2
@@ -0,0 +1,2 @@
+.so man3/getcwd.3
+.\" Because getcwd(3) is layered on a system call of the same name
diff --git a/man2/getdents.2 b/man2/getdents.2
new file mode 100644
index 0000000..604a6ef
--- /dev/null
+++ b/man2/getdents.2
@@ -0,0 +1,319 @@
+.\" Copyright (C) 1995 Andries Brouwer (aeb@cwi.nl)
+.\" and Copyright 2008, 2015 Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Written 11 June 1995 by Andries Brouwer <aeb@cwi.nl>
+.\" Modified 22 July 1995 by Michael Chastain <mec@duracef.shout.net>:
+.\" Derived from 'readdir.2'.
+.\" Modified Tue Oct 22 08:11:14 EDT 1996 by Eric S. Raymond <esr@thyrsus.com>
+.\"
+.TH getdents 2 2023-05-03 "Linux man-pages 6.05.01"
+.SH NAME
+getdents, getdents64 \- get directory entries
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */"
+.B #include <unistd.h>
+.PP
+.BI "long syscall(SYS_getdents, unsigned int " fd \
+", struct linux_dirent *" dirp ,
+.BI " unsigned int " count );
+.PP
+.BR "#define _GNU_SOURCE" " /* See feature_test_macros(7) */"
+.B #include <dirent.h>
+.PP
+.BI "ssize_t getdents64(int " fd ", void " dirp [. count "], size_t " count );
+.fi
+.PP
+.IR Note :
+glibc provides no wrapper for
+.BR getdents (),
+necessitating the use of
+.BR syscall (2).
+.PP
+.IR Note :
+There is no definition of
+.I struct linux_dirent
+in glibc; see NOTES.
+.SH DESCRIPTION
+These are not the interfaces you are interested in.
+Look at
+.BR readdir (3)
+for the POSIX-conforming C library interface.
+This page documents the bare kernel system call interfaces.
+.SS getdents()
+The system call
+.BR getdents ()
+reads several
+.I linux_dirent
+structures from the directory
+referred to by the open file descriptor
+.I fd
+into the buffer pointed to by
+.IR dirp .
+The argument
+.I count
+specifies the size of that buffer.
+.PP
+The
+.I linux_dirent
+structure is declared as follows:
+.PP
+.in +4n
+.EX
+struct linux_dirent {
+ unsigned long d_ino; /* Inode number */
+ unsigned long d_off; /* Offset to next \fIlinux_dirent\fP */
+ unsigned short d_reclen; /* Length of this \fIlinux_dirent\fP */
+ char d_name[]; /* Filename (null\-terminated) */
+ /* length is actually (d_reclen \- 2 \-
+ offsetof(struct linux_dirent, d_name)) */
+ /*
+ char pad; // Zero padding byte
+ char d_type; // File type (only since Linux
+ // 2.6.4); offset is (d_reclen \- 1)
+ */
+};
+.EE
+.in
+.PP
+.I d_ino
+is an inode number.
+.I d_off
+is the distance from the start of the directory to the start of the next
+.IR linux_dirent .
+.I d_reclen
+is the size of this entire
+.IR linux_dirent .
+.I d_name
+is a null-terminated filename.
+.PP
+.I d_type
+is a byte at the end of the structure that indicates the file type.
+It contains one of the following values (defined in
+.IR <dirent.h> ):
+.TP 12
+.B DT_BLK
+This is a block device.
+.TP
+.B DT_CHR
+This is a character device.
+.TP
+.B DT_DIR
+This is a directory.
+.TP
+.B DT_FIFO
+This is a named pipe (FIFO).
+.TP
+.B DT_LNK
+This is a symbolic link.
+.TP
+.B DT_REG
+This is a regular file.
+.TP
+.B DT_SOCK
+This is a UNIX domain socket.
+.TP
+.B DT_UNKNOWN
+The file type is unknown.
+.PP
+The
+.I d_type
+field is implemented since Linux 2.6.4.
+It occupies a space that was previously a zero-filled padding byte in the
+.I linux_dirent
+structure.
+Thus, on kernels up to and including Linux 2.6.3,
+attempting to access this field always provides the value 0
+.RB ( DT_UNKNOWN ).
+.PP
+Currently,
+.\" kernel 2.6.27
+.\" The same sentence is in readdir.2
+only some filesystems (among them: Btrfs, ext2, ext3, and ext4)
+have full support for returning the file type in
+.IR d_type .
+All applications must properly handle a return of
+.BR DT_UNKNOWN .
+.SS getdents64()
+The original Linux
+.BR getdents ()
+system call did not handle large filesystems and large file offsets.
+Consequently, Linux 2.4 added
+.BR getdents64 (),
+with wider types for the
+.I d_ino
+and
+.I d_off
+fields.
+In addition,
+.BR getdents64 ()
+supports an explicit
+.I d_type
+field.
+.PP
+The
+.BR getdents64 ()
+system call is like
+.BR getdents (),
+except that its second argument is a pointer to a buffer containing
+structures of the following type:
+.PP
+.in +4n
+.EX
+struct linux_dirent64 {
+ ino64_t d_ino; /* 64\-bit inode number */
+ off64_t d_off; /* 64\-bit offset to next structure */
+ unsigned short d_reclen; /* Size of this dirent */
+ unsigned char d_type; /* File type */
+ char d_name[]; /* Filename (null\-terminated) */
+};
+.EE
+.in
+.SH RETURN VALUE
+On success, the number of bytes read is returned.
+On end of directory, 0 is returned.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EBADF
+Invalid file descriptor
+.IR fd .
+.TP
+.B EFAULT
+Argument points outside the calling process's address space.
+.TP
+.B EINVAL
+Result buffer is too small.
+.TP
+.B ENOENT
+No such directory.
+.TP
+.B ENOTDIR
+File descriptor does not refer to a directory.
+.SH STANDARDS
+None.
+.SH HISTORY
+SVr4.
+.\" SVr4 documents additional ENOLINK, EIO error conditions.
+.TP
+.BR getdents64 ()
+glibc 2.30.
+.SH NOTES
+glibc does not provide a wrapper for
+.BR getdents ();
+call
+.BR getdents ()
+using
+.BR syscall (2).
+In that case you will need to define the
+.I linux_dirent
+or
+.I linux_dirent64
+structure yourself.
+.PP
+Probably, you want to use
+.BR readdir (3)
+instead of these system calls.
+.PP
+These calls supersede
+.BR readdir (2).
+.SH EXAMPLES
+.\" FIXME The example program needs to be revised, since it uses the older
+.\" getdents() system call and the structure with smaller field widths.
+The program below demonstrates the use of
+.BR getdents ().
+The following output shows an example of what we see when running this
+program on an ext2 directory:
+.PP
+.in +4n
+.EX
+.RB "$" " ./a.out /testfs/"
+\-\-\-\-\-\-\-\-\-\-\-\-\-\-\- nread=120 \-\-\-\-\-\-\-\-\-\-\-\-\-\-\-
+inode# file type d_reclen d_off d_name
+ 2 directory 16 12 .
+ 2 directory 16 24 ..
+ 11 directory 24 44 lost+found
+ 12 regular 16 56 a
+ 228929 directory 16 68 sub
+ 16353 directory 16 80 sub2
+ 130817 directory 16 4096 sub3
+.EE
+.in
+.SS Program source
+\&
+.\" SRC BEGIN (getdents.c)
+.EX
+#define _GNU_SOURCE
+#include <dirent.h> /* Defines DT_* constants */
+#include <err.h>
+#include <fcntl.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+\&
+struct linux_dirent {
+ unsigned long d_ino;
+ off_t d_off;
+ unsigned short d_reclen;
+ char d_name[];
+};
+\&
+#define BUF_SIZE 1024
+\&
+int
+main(int argc, char *argv[])
+{
+ int fd;
+ char d_type;
+ char buf[BUF_SIZE];
+ long nread;
+ struct linux_dirent *d;
+\&
+ fd = open(argc > 1 ? argv[1] : ".", O_RDONLY | O_DIRECTORY);
+ if (fd == \-1)
+ err(EXIT_FAILURE, "open");
+\&
+ for (;;) {
+ nread = syscall(SYS_getdents, fd, buf, BUF_SIZE);
+ if (nread == \-1)
+ err(EXIT_FAILURE, "getdents");
+\&
+ if (nread == 0)
+ break;
+\&
+ printf("\-\-\-\-\-\-\-\-\-\-\-\-\-\-\- nread=%ld \-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\en", nread);
+ printf("inode# file type d_reclen d_off d_name\en");
+ for (size_t bpos = 0; bpos < nread;) {
+ d = (struct linux_dirent *) (buf + bpos);
+ printf("%8lu ", d\->d_ino);
+ d_type = *(buf + bpos + d\->d_reclen \- 1);
+ printf("%\-10s ", (d_type == DT_REG) ? "regular" :
+ (d_type == DT_DIR) ? "directory" :
+ (d_type == DT_FIFO) ? "FIFO" :
+ (d_type == DT_SOCK) ? "socket" :
+ (d_type == DT_LNK) ? "symlink" :
+ (d_type == DT_BLK) ? "block dev" :
+ (d_type == DT_CHR) ? "char dev" : "???");
+ printf("%4d %10jd %s\en", d\->d_reclen,
+ (intmax_t) d\->d_off, d\->d_name);
+ bpos += d\->d_reclen;
+ }
+ }
+\&
+ exit(EXIT_SUCCESS);
+}
+.EE
+.\" SRC END
+.SH SEE ALSO
+.BR readdir (2),
+.BR readdir (3),
+.BR inode (7)
diff --git a/man2/getdents64.2 b/man2/getdents64.2
new file mode 100644
index 0000000..f3674ba
--- /dev/null
+++ b/man2/getdents64.2
@@ -0,0 +1 @@
+.so man2/getdents.2
diff --git a/man2/getdomainname.2 b/man2/getdomainname.2
new file mode 100644
index 0000000..b65cbfb
--- /dev/null
+++ b/man2/getdomainname.2
@@ -0,0 +1,122 @@
+.\" Copyright 1993 Rickard E. Faith (faith@cs.unc.edu)
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified 1997-08-25 by Nicolás Lichtmaier <nick@debian.org>
+.\" Modified 2004-06-17 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Modified 2008-11-27 by mtk
+.\"
+.TH getdomainname 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+getdomainname, setdomainname \- get/set NIS domain name
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <unistd.h>
+.PP
+.BI "int getdomainname(char *" name ", size_t " len );
+.BI "int setdomainname(const char *" name ", size_t " len );
+.fi
+.PP
+.RS -4
+Feature Test Macro Requirements for glibc (see
+.BR feature_test_macros (7)):
+.RE
+.PP
+.BR getdomainname (),
+.BR setdomainname ():
+.nf
+ Since glibc 2.21:
+.\" commit 266865c0e7b79d4196e2cc393693463f03c90bd8
+ _DEFAULT_SOURCE
+ In glibc 2.19 and 2.20:
+ _DEFAULT_SOURCE || (_XOPEN_SOURCE && _XOPEN_SOURCE < 500)
+ Up to and including glibc 2.19:
+ _BSD_SOURCE || (_XOPEN_SOURCE && _XOPEN_SOURCE < 500)
+.fi
+.SH DESCRIPTION
+These functions are used to access or to change the NIS domain name of the
+host system.
+More precisely, they operate on the NIS domain name associated with the calling
+process's UTS namespace.
+.PP
+.BR setdomainname ()
+sets the domain name to the value given in the character array
+.IR name .
+The
+.I len
+argument specifies the number of bytes in
+.IR name .
+(Thus,
+.I name
+does not require a terminating null byte.)
+.PP
+.BR getdomainname ()
+returns the null-terminated domain name in the character array
+.IR name ,
+which has a length of
+.I len
+bytes.
+If the null-terminated domain name requires more than \fIlen\fP bytes,
+.BR getdomainname ()
+returns the first \fIlen\fP bytes (glibc) or gives an error (libc).
+.SH RETURN VALUE
+On success, zero is returned.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.BR setdomainname ()
+can fail with the following errors:
+.TP
+.B EFAULT
+.I name
+pointed outside of user address space.
+.TP
+.B EINVAL
+.I len
+was negative or too large.
+.TP
+.B EPERM
+The caller did not have the
+.B CAP_SYS_ADMIN
+capability in the user namespace associated with its UTS namespace (see
+.BR namespaces (7)).
+.PP
+.BR getdomainname ()
+can fail with the following errors:
+.TP
+.B EINVAL
+For
+.BR getdomainname ()
+under libc:
+.I name
+is NULL or
+.I name
+is longer than
+.I len
+bytes.
+.SH VERSIONS
+On most Linux architectures (including x86),
+there is no
+.BR getdomainname ()
+system call; instead, glibc implements
+.BR getdomainname ()
+as a library function that returns a copy of the
+.I domainname
+field returned from a call to
+.BR uname (2).
+.SH STANDARDS
+None.
+.\" But they appear on most systems...
+.SH HISTORY
+Since Linux 1.0, the limit on the length of a domain name,
+including the terminating null byte, is 64 bytes.
+In older kernels, it was 8 bytes.
+.SH SEE ALSO
+.BR gethostname (2),
+.BR sethostname (2),
+.BR uname (2),
+.BR uts_namespaces (7)
diff --git a/man2/getegid.2 b/man2/getegid.2
new file mode 100644
index 0000000..d9b10e7
--- /dev/null
+++ b/man2/getegid.2
@@ -0,0 +1 @@
+.so man2/getgid.2
diff --git a/man2/getegid32.2 b/man2/getegid32.2
new file mode 100644
index 0000000..d7da708
--- /dev/null
+++ b/man2/getegid32.2
@@ -0,0 +1 @@
+.so man2/getegid.2
diff --git a/man2/geteuid.2 b/man2/geteuid.2
new file mode 100644
index 0000000..165cfe1
--- /dev/null
+++ b/man2/geteuid.2
@@ -0,0 +1 @@
+.so man2/getuid.2
diff --git a/man2/geteuid32.2 b/man2/geteuid32.2
new file mode 100644
index 0000000..8e60b77
--- /dev/null
+++ b/man2/geteuid32.2
@@ -0,0 +1 @@
+.so man2/geteuid.2
diff --git a/man2/getgid.2 b/man2/getgid.2
new file mode 100644
index 0000000..a1f64e0
--- /dev/null
+++ b/man2/getgid.2
@@ -0,0 +1,70 @@
+.\" Copyright 1993 Rickard E. Faith (faith@cs.unc.edu)
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.TH getgid 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+getgid, getegid \- get group identity
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <unistd.h>
+.PP
+.B gid_t getgid(void);
+.B gid_t getegid(void);
+.fi
+.SH DESCRIPTION
+.BR getgid ()
+returns the real group ID of the calling process.
+.PP
+.BR getegid ()
+returns the effective group ID of the calling process.
+.SH ERRORS
+These functions are always successful
+and never modify
+.\" https://www.austingroupbugs.net/view.php?id=511
+.\" 0000511: getuid and friends should not modify errno
+.IR errno .
+.SH VERSIONS
+On Alpha, instead of a pair of
+.BR getgid ()
+and
+.BR getegid ()
+system calls, a single
+.BR getxgid ()
+system call is provided, which returns a pair of real and effective GIDs.
+The glibc
+.BR getgid ()
+and
+.BR getegid ()
+wrapper functions transparently deal with this.
+See
+.BR syscall (2)
+for details regarding register mapping.
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+POSIX.1-2001, 4.3BSD.
+.PP
+The original Linux
+.BR getgid ()
+and
+.BR getegid ()
+system calls supported only 16-bit group IDs.
+Subsequently, Linux 2.4 added
+.BR getgid32 ()
+and
+.BR getegid32 (),
+supporting 32-bit IDs.
+The glibc
+.BR getgid ()
+and
+.BR getegid ()
+wrapper functions transparently deal with the variations across kernel versions.
+.SH SEE ALSO
+.BR getresgid (2),
+.BR setgid (2),
+.BR setregid (2),
+.BR credentials (7)
diff --git a/man2/getgid32.2 b/man2/getgid32.2
new file mode 100644
index 0000000..d9b10e7
--- /dev/null
+++ b/man2/getgid32.2
@@ -0,0 +1 @@
+.so man2/getgid.2
diff --git a/man2/getgroups.2 b/man2/getgroups.2
new file mode 100644
index 0000000..eb282b9
--- /dev/null
+++ b/man2/getgroups.2
@@ -0,0 +1,219 @@
+.\" Copyright 1993 Rickard E. Faith (faith@cs.unc.edu)
+.\" and Copyright (C) 2008, 2010, 2015, Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified Thu Oct 31 12:04:29 1996 by Eric S. Raymond <esr@thyrsus.com>
+.\" Modified, 27 May 2004, Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Added notes on capability requirements
+.\" 2008-05-03, mtk, expanded and rewrote parts of DESCRIPTION and RETURN
+.\" VALUE, made style of page more consistent with man-pages style.
+.\"
+.TH getgroups 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+getgroups, setgroups \- get/set list of supplementary group IDs
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <unistd.h>
+.PP
+.BI "int getgroups(int " size ", gid_t " list []);
+.PP
+.B #include <grp.h>
+.PP
+.BI "int setgroups(size_t " size ", const gid_t *_Nullable " list );
+.fi
+.PP
+.RS -4
+Feature Test Macro Requirements for glibc (see
+.BR feature_test_macros (7)):
+.RE
+.PP
+.BR setgroups ():
+.nf
+ Since glibc 2.19:
+ _DEFAULT_SOURCE
+ glibc 2.19 and earlier:
+ _BSD_SOURCE
+.fi
+.SH DESCRIPTION
+.BR getgroups ()
+returns the supplementary group IDs of the calling process in
+.IR list .
+The argument
+.I size
+should be set to the maximum number of items that can be stored in the
+buffer pointed to by
+.IR list .
+If the calling process is a member of more than
+.I size
+supplementary groups, then an error results.
+.PP
+It is unspecified whether the effective group ID of the calling process
+is included in the returned list.
+(Thus, an application should also call
+.BR getegid (2)
+and add or remove the resulting value.)
+.PP
+If
+.I size
+is zero,
+.I list
+is not modified, but the total number of supplementary group IDs for the
+process is returned.
+This allows the caller to determine the size of a dynamically allocated
+.I list
+to be used in a further call to
+.BR getgroups ().
+.PP
+.BR setgroups ()
+sets the supplementary group IDs for the calling process.
+Appropriate privileges are required (see the description of the
+.B EPERM
+error, below).
+The
+.I size
+argument specifies the number of supplementary group IDs
+in the buffer pointed to by
+.IR list .
+A process can drop all of its supplementary groups with the call:
+.PP
+.in +4n
+.EX
+setgroups(0, NULL);
+.EE
+.in
+.SH RETURN VALUE
+On success,
+.BR getgroups ()
+returns the number of supplementary group IDs.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.PP
+On success,
+.BR setgroups ()
+returns 0.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EFAULT
+.I list
+has an invalid address.
+.PP
+.BR getgroups ()
+can additionally fail with the following error:
+.TP
+.B EINVAL
+.I size
+is less than the number of supplementary group IDs, but is not zero.
+.PP
+.BR setgroups ()
+can additionally fail with the following errors:
+.TP
+.B EINVAL
+.I size
+is greater than
+.B NGROUPS_MAX
+(32 before Linux 2.6.4; 65536 since Linux 2.6.4).
+.TP
+.B ENOMEM
+Out of memory.
+.TP
+.B EPERM
+The calling process has insufficient privilege
+(the caller does not have the
+.B CAP_SETGID
+capability in the user namespace in which it resides).
+.TP
+.BR EPERM " (since Linux 3.19)"
+The use of
+.BR setgroups ()
+is denied in this user namespace.
+See the description of
+.IR /proc/ pid /setgroups
+in
+.BR user_namespaces (7).
+.SH VERSIONS
+.SS C library/kernel differences
+At the kernel level, user IDs and group IDs are a per-thread attribute.
+However, POSIX requires that all threads in a process
+share the same credentials.
+The NPTL threading implementation handles the POSIX requirements by
+providing wrapper functions for
+the various system calls that change process UIDs and GIDs.
+These wrapper functions (including the one for
+.BR setgroups ())
+employ a signal-based technique to ensure
+that when one thread changes credentials,
+all of the other threads in the process also change their credentials.
+For details, see
+.BR nptl (7).
+.SH STANDARDS
+.TP
+.BR getgroups ()
+POSIX.1-2008.
+.TP
+.BR setgroups ()
+None.
+.SH HISTORY
+.TP
+.BR getgroups ()
+SVr4, 4.3BSD, POSIX.1-2001.
+.TP
+.BR setgroups ()
+SVr4, 4.3BSD.
+Since
+.BR setgroups ()
+requires privilege, it is not covered by POSIX.1.
+.PP
+The original Linux
+.BR getgroups ()
+system call supported only 16-bit group IDs.
+Subsequently, Linux 2.4 added
+.BR getgroups32 (),
+supporting 32-bit IDs.
+The glibc
+.BR getgroups ()
+wrapper function transparently deals with the variation across kernel versions.
+.SH NOTES
+A process can have up to
+.B NGROUPS_MAX
+supplementary group IDs
+in addition to the effective group ID.
+The constant
+.B NGROUPS_MAX
+is defined in
+.IR <limits.h> .
+The set of supplementary group IDs
+is inherited from the parent process, and preserved across an
+.BR execve (2).
+.PP
+The maximum number of supplementary group IDs can be found at run time using
+.BR sysconf (3):
+.PP
+.in +4n
+.EX
+long ngroups_max;
+ngroups_max = sysconf(_SC_NGROUPS_MAX);
+.EE
+.in
+.PP
+The maximum return value of
+.BR getgroups ()
+cannot be larger than one more than this value.
+Since Linux 2.6.4, the maximum number of supplementary group IDs is also
+exposed via the Linux-specific read-only file,
+.IR /proc/sys/kernel/ngroups_max .
+.SH SEE ALSO
+.BR getgid (2),
+.BR setgid (2),
+.BR getgrouplist (3),
+.BR group_member (3),
+.BR initgroups (3),
+.BR capabilities (7),
+.BR credentials (7)
diff --git a/man2/getgroups32.2 b/man2/getgroups32.2
new file mode 100644
index 0000000..0ae4cc0
--- /dev/null
+++ b/man2/getgroups32.2
@@ -0,0 +1 @@
+.so man2/getgroups.2
diff --git a/man2/gethostname.2 b/man2/gethostname.2
new file mode 100644
index 0000000..26b8df3
--- /dev/null
+++ b/man2/gethostname.2
@@ -0,0 +1,176 @@
+.\" Copyright 1993 Rickard E. Faith (faith@cs.unc.edu)
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified 1995-07-22 by Michael Chastain <mec@duracef.shout.net>:
+.\" 'gethostname' is real system call on Linux/Alpha.
+.\" Modified 1997-01-31 by Eric S. Raymond <esr@thyrsus.com>
+.\" Modified 2000-06-04, 2001-12-15 by aeb
+.\" Modified 2004-06-17 by mtk
+.\" Modified 2008-11-27 by mtk
+.\"
+.TH gethostname 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+gethostname, sethostname \- get/set hostname
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <unistd.h>
+.PP
+.BI "int gethostname(char *" name ", size_t " len );
+.BI "int sethostname(const char *" name ", size_t " len );
+.fi
+.PP
+.RS -4
+Feature Test Macro Requirements for glibc (see
+.BR feature_test_macros (7)):
+.RE
+.PP
+.BR gethostname ():
+.nf
+ _XOPEN_SOURCE >= 500 || _POSIX_C_SOURCE >= 200112L
+ || /* glibc 2.19 and earlier */ _BSD_SOURCE
+.\" The above is something of a simplification
+.\" also before glibc 2.3 there was a bit of churn
+.fi
+.PP
+.BR sethostname ():
+.nf
+ Since glibc 2.21:
+.\" commit 266865c0e7b79d4196e2cc393693463f03c90bd8
+ _DEFAULT_SOURCE
+ In glibc 2.19 and 2.20:
+ _DEFAULT_SOURCE || (_XOPEN_SOURCE && _XOPEN_SOURCE < 500)
+ Up to and including glibc 2.19:
+ _BSD_SOURCE || (_XOPEN_SOURCE && _XOPEN_SOURCE < 500)
+.fi
+.SH DESCRIPTION
+These system calls are used to access or to change the system hostname.
+More precisely, they operate on the hostname associated with the calling
+process's UTS namespace.
+.PP
+.BR sethostname ()
+sets the hostname to the value given in the character array
+.IR name .
+The
+.I len
+argument specifies the number of bytes in
+.IR name .
+(Thus,
+.I name
+does not require a terminating null byte.)
+.PP
+.BR gethostname ()
+returns the null-terminated hostname in the character array
+.IR name ,
+which has a length of
+.I len
+bytes.
+If the null-terminated hostname is too large to fit,
+then the name is truncated, and no error is returned (but see NOTES below).
+POSIX.1 says that if such truncation occurs,
+then it is unspecified whether the returned buffer
+includes a terminating null byte.
+.SH RETURN VALUE
+On success, zero is returned.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EFAULT
+.I name
+is an invalid address.
+.TP
+.B EINVAL
+.I len
+is negative
+.\" Can't occur for gethostname() wrapper, since 'len' has an
+.\" unsigned type; can occur for the underlying system call.
+or, for
+.BR sethostname (),
+.I len
+is larger than the maximum allowed size.
+.TP
+.B ENAMETOOLONG
+.RB "(glibc " gethostname ())
+.I len
+is smaller than the actual size.
+(Before glibc 2.1, glibc uses
+.B EINVAL
+for this case.)
+.TP
+.B EPERM
+For
+.BR sethostname (),
+the caller did not have the
+.B CAP_SYS_ADMIN
+capability in the user namespace associated with its UTS namespace (see
+.BR namespaces (7)).
+.SH VERSIONS
+SUSv2 guarantees that "Host names are limited to 255 bytes".
+POSIX.1 guarantees that "Host names (not including
+the terminating null byte) are limited to
+.B HOST_NAME_MAX
+bytes".
+On Linux,
+.B HOST_NAME_MAX
+is defined with the value 64, which has been the limit since Linux 1.0
+(earlier kernels imposed a limit of 8 bytes).
+.SS C library/kernel differences
+The GNU C library does not employ the
+.BR gethostname ()
+system call; instead, it implements
+.BR gethostname ()
+as a library function that calls
+.BR uname (2)
+and copies up to
+.I len
+bytes from the returned
+.I nodename
+field into
+.IR name .
+Having performed the copy, the function then checks if the length of the
+.I nodename
+was greater than or equal to
+.IR len ,
+and if it is, then the function returns \-1 with
+.I errno
+set to
+.BR ENAMETOOLONG ;
+in this case, a terminating null byte is not included in the returned
+.IR name .
+.SH STANDARDS
+.TP
+.BR gethostname ()
+POSIX.1-2008.
+.TP
+.BR sethostname ()
+None.
+.SH HISTORY
+SVr4, 4.4BSD (these interfaces first appeared in 4.2BSD).
+POSIX.1-2001 and POSIX.1-2008 specify
+.BR gethostname ()
+but not
+.BR sethostname ().
+.PP
+Versions of glibc before glibc 2.2
+.\" At least glibc 2.0 and glibc 2.1, older versions not checked
+handle the case where the length of the
+.I nodename
+was greater than or equal to
+.I len
+differently: nothing is copied into
+.I name
+and the function returns \-1 with
+.I errno
+set to
+.BR ENAMETOOLONG .
+.SH SEE ALSO
+.BR hostname (1),
+.BR getdomainname (2),
+.BR setdomainname (2),
+.BR uname (2),
+.BR uts_namespaces (7)
diff --git a/man2/getitimer.2 b/man2/getitimer.2
new file mode 100644
index 0000000..422a04e
--- /dev/null
+++ b/man2/getitimer.2
@@ -0,0 +1,278 @@
+.\" Copyright 7/93 by Darren Senn <sinster@scintilla.santa-clara.ca.us>
+.\" and Copyright (C) 2016, Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Based on a similar page Copyright 1992 by Rick Faith
+.\"
+.\" %%%LICENSE_START(FREELY_REDISTRIBUTABLE)
+.\" May be freely distributed and modified
+.\" %%%LICENSE_END
+.\"
+.\" Modified Tue Oct 22 00:22:35 EDT 1996 by Eric S. Raymond <esr@thyrsus.com>
+.\" 2005-04-06 mtk, Matthias Lang <matthias@corelatus.se>
+.\" Noted MAX_SEC_IN_JIFFIES ceiling
+.\"
+.TH getitimer 2 2023-05-03 "Linux man-pages 6.05.01"
+.SH NAME
+getitimer, setitimer \- get or set value of an interval timer
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/time.h>
+.PP
+.BI "int getitimer(int " which ", struct itimerval *" curr_value );
+.BI "int setitimer(int " which ", const struct itimerval *restrict " new_value ,
+.BI " struct itimerval *_Nullable restrict " old_value );
+.fi
+.SH DESCRIPTION
+These system calls provide access to interval timers, that is,
+timers that initially expire at some point in the future,
+and (optionally) at regular intervals after that.
+When a timer expires, a signal is generated for the calling process,
+and the timer is reset to the specified interval
+(if the interval is nonzero).
+.PP
+Three types of timers\[em]specified via the
+.I which
+argument\[em]are provided,
+each of which counts against a different clock and
+generates a different signal on timer expiration:
+.TP
+.B ITIMER_REAL
+This timer counts down in real (i.e., wall clock) time.
+At each expiration, a
+.B SIGALRM
+signal is generated.
+.TP
+.B ITIMER_VIRTUAL
+This timer counts down against the user-mode CPU time consumed by the process.
+(The measurement includes CPU time consumed by all threads in the process.)
+At each expiration, a
+.B SIGVTALRM
+signal is generated.
+.TP
+.B ITIMER_PROF
+This timer counts down against the total (i.e., both user and system)
+CPU time consumed by the process.
+(The measurement includes CPU time consumed by all threads in the process.)
+At each expiration, a
+.B SIGPROF
+signal is generated.
+.IP
+In conjunction with
+.BR ITIMER_VIRTUAL ,
+this timer can be used to profile user and system CPU time
+consumed by the process.
+.PP
+A process has only one of each of the three types of timers.
+.PP
+Timer values are defined by the following structures:
+.PP
+.in +4n
+.EX
+struct itimerval {
+ struct timeval it_interval; /* Interval for periodic timer */
+ struct timeval it_value; /* Time until next expiration */
+};
+\&
+struct timeval {
+ time_t tv_sec; /* seconds */
+ suseconds_t tv_usec; /* microseconds */
+};
+.EE
+.in
+.\"
+.SS getitimer()
+The function
+.BR getitimer ()
+places the current value of the timer specified by
+.I which
+in the buffer pointed to by
+.IR curr_value .
+.PP
+The
+.I it_value
+substructure is populated with the amount of time remaining until
+the next expiration of the specified timer.
+This value changes as the timer counts down, and will be reset to
+.I it_interval
+when the timer expires.
+If both fields of
+.I it_value
+are zero, then this timer is currently disarmed (inactive).
+.PP
+The
+.I it_interval
+substructure is populated with the timer interval.
+If both fields of
+.I it_interval
+are zero, then this is a single-shot timer (i.e., it expires just once).
+.SS setitimer()
+The function
+.BR setitimer ()
+arms or disarms the timer specified by
+.IR which ,
+by setting the timer to the value specified by
+.IR new_value .
+If
+.I old_value
+is non-NULL,
+the buffer it points to is used to return the previous value of the timer
+(i.e., the same information that is returned by
+.BR getitimer ()).
+.PP
+If either field in
+.I new_value.it_value
+is nonzero,
+then the timer is armed to initially expire at the specified time.
+If both fields in
+.I new_value.it_value
+are zero, then the timer is disarmed.
+.PP
+The
+.I new_value.it_interval
+field specifies the new interval for the timer;
+if both of its subfields are zero, the timer is single-shot.
+.SH RETURN VALUE
+On success, zero is returned.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EFAULT
+.IR new_value ,
+.IR old_value ,
+or
+.I curr_value
+is not a valid pointer.
+.TP
+.B EINVAL
+.I which
+is not one of
+.BR ITIMER_REAL ,
+.BR ITIMER_VIRTUAL ,
+or
+.BR ITIMER_PROF ;
+or (since Linux 2.6.22) one of the
+.I tv_usec
+fields in the structure pointed to by
+.I new_value
+contains a value outside the range [0, 999999].
+.SH VERSIONS
+The standards are silent on the meaning of the call:
+.PP
+.in +4n
+.EX
+setitimer(which, NULL, &old_value);
+.EE
+.in
+.PP
+Many systems (Solaris, the BSDs, and perhaps others)
+treat this as equivalent to:
+.PP
+.in +4n
+.EX
+getitimer(which, &old_value);
+.EE
+.in
+.PP
+In Linux, this is treated as being equivalent to a call in which the
+.I new_value
+fields are zero; that is, the timer is disabled.
+.IR "Don't use this Linux misfeature" :
+it is nonportable and unnecessary.
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+POSIX.1-2001, SVr4, 4.4BSD (this call first appeared in 4.2BSD).
+POSIX.1-2008 marks
+.BR getitimer ()
+and
+.BR setitimer ()
+obsolete, recommending the use of the POSIX timers API
+.RB ( timer_gettime (2),
+.BR timer_settime (2),
+etc.) instead.
+.SH NOTES
+Timers will never expire before the requested time,
+but may expire some (short) time afterward, which depends
+on the system timer resolution and on the system load; see
+.BR time (7).
+(But see BUGS below.)
+If the timer expires while the process is active (always true for
+.BR ITIMER_VIRTUAL ),
+the signal will be delivered immediately when generated.
+.PP
+A child created via
+.BR fork (2)
+does not inherit its parent's interval timers.
+Interval timers are preserved across an
+.BR execve (2).
+.PP
+POSIX.1 leaves the
+interaction between
+.BR setitimer ()
+and the three interfaces
+.BR alarm (2),
+.BR sleep (3),
+and
+.BR usleep (3)
+unspecified.
+.SH BUGS
+The generation and delivery of a signal are distinct, and
+only one instance of each of the signals listed above may be pending
+for a process.
+Under very heavy loading, an
+.B ITIMER_REAL
+timer may expire before the signal from a previous expiration
+has been delivered.
+The second signal in such an event will be lost.
+.PP
+Before Linux 2.6.16, timer values are represented in jiffies.
+If a request is made to set a timer with a value whose jiffies
+representation exceeds
+.B MAX_SEC_IN_JIFFIES
+(defined in
+.IR include/linux/jiffies.h ),
+then the timer is silently truncated to this ceiling value.
+On Linux/i386 (where, since Linux 2.6.13,
+the default jiffy is 0.004 seconds),
+this means that the ceiling value for a timer is
+approximately 99.42 days.
+Since Linux 2.6.16,
+the kernel uses a different internal representation for times,
+and this ceiling is removed.
+.PP
+On certain systems (including i386),
+Linux kernels before Linux 2.6.12 have a bug which will produce
+premature timer expirations of up to one jiffy under some circumstances.
+This bug is fixed in Linux 2.6.12.
+.\" 4 Jul 2005: It looks like this bug may remain in Linux 2.4.x.
+.\" http://lkml.org/lkml/2005/7/1/165
+.PP
+POSIX.1-2001 says that
+.BR setitimer ()
+should fail if a
+.I tv_usec
+value is specified that is outside of the range [0, 999999].
+However, up to and including Linux 2.6.21,
+Linux does not give an error, but instead silently
+adjusts the corresponding seconds value for the timer.
+From Linux 2.6.22 onward,
+this nonconformance has been repaired:
+an improper
+.I tv_usec
+value results in an
+.B EINVAL
+error.
+.\" Bugzilla report 25 Apr 2006:
+.\" http://bugzilla.kernel.org/show_bug.cgi?id=6443
+.\" "setitimer() should reject noncanonical arguments"
+.SH SEE ALSO
+.BR gettimeofday (2),
+.BR sigaction (2),
+.BR signal (2),
+.BR timer_create (2),
+.BR timerfd_create (2),
+.BR time (7)
diff --git a/man2/getmsg.2 b/man2/getmsg.2
new file mode 100644
index 0000000..5d25ea6
--- /dev/null
+++ b/man2/getmsg.2
@@ -0,0 +1 @@
+.so man2/unimplemented.2
diff --git a/man2/getpagesize.2 b/man2/getpagesize.2
new file mode 100644
index 0000000..6f3b54b
--- /dev/null
+++ b/man2/getpagesize.2
@@ -0,0 +1,89 @@
+.\" Copyright (C) 2001 Andries Brouwer <aeb@cwi.nl>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.TH getpagesize 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+getpagesize \- get memory page size
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <unistd.h>
+.PP
+.B int getpagesize(void);
+.fi
+.PP
+.RS -4
+Feature Test Macro Requirements for glibc (see
+.BR feature_test_macros (7)):
+.RE
+.PP
+.BR getpagesize ():
+.nf
+ Since glibc 2.20:
+ _DEFAULT_SOURCE || ! (_POSIX_C_SOURCE >= 200112L)
+ glibc 2.12 to glibc 2.19:
+ _BSD_SOURCE || ! (_POSIX_C_SOURCE >= 200112L)
+ Before glibc 2.12:
+ _BSD_SOURCE || _XOPEN_SOURCE >= 500
+.\" || _XOPEN_SOURCE && _XOPEN_SOURCE_EXTENDED
+.fi
+.SH DESCRIPTION
+The function
+.BR getpagesize ()
+returns the number of bytes in a memory page,
+where "page" is a fixed-length block,
+the unit for memory allocation and file mapping performed by
+.BR mmap (2).
+.SH STANDARDS
+None.
+.SH HISTORY
+This call first appeared in 4.2BSD.
+SVr4, 4.4BSD, SUSv2.
+In SUSv2 the
+.BR getpagesize ()
+call is labeled LEGACY, and in POSIX.1-2001
+it has been dropped;
+HP-UX does not have this call.
+.SH NOTES
+Portable applications should employ
+.I sysconf(_SC_PAGESIZE)
+instead of
+.BR getpagesize ():
+.PP
+.in +4n
+.EX
+#include <unistd.h>
+long sz = sysconf(_SC_PAGESIZE);
+.EE
+.in
+.PP
+(Most systems allow the synonym
+.B _SC_PAGE_SIZE
+for
+.BR _SC_PAGESIZE .)
+.PP
+Whether
+.BR getpagesize ()
+is present as a Linux system call depends on the architecture.
+If it is, it returns the kernel symbol
+.BR PAGE_SIZE ,
+whose value depends on the architecture and machine model.
+Generally, one uses binaries that are dependent on the architecture but not
+on the machine model, in order to have a single binary
+distribution per architecture.
+This means that a user program
+should not find
+.B PAGE_SIZE
+at compile time from a header file,
+but use an actual system call, at least for those architectures
+(like sun4) where this dependency exists.
+Here glibc 2.0 fails because its
+.BR getpagesize ()
+returns a statically derived value, and does not use a system call.
+Things are OK in glibc 2.1.
+.SH SEE ALSO
+.BR mmap (2),
+.BR sysconf (3)
diff --git a/man2/getpeername.2 b/man2/getpeername.2
new file mode 100644
index 0000000..d150617
--- /dev/null
+++ b/man2/getpeername.2
@@ -0,0 +1,116 @@
+.\" Copyright (c) 1983, 1991 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" SPDX-License-Identifier: BSD-4-Clause-UC
+.\"
+.\" @(#)getpeername.2 6.5 (Berkeley) 3/10/91
+.\"
+.\" Modified Sat Jul 24 16:37:50 1993 by Rik Faith <faith@cs.unc.edu>
+.\" Modified Thu Jul 30 14:37:50 1993 by Martin Schulze <joey@debian.org>
+.\" Modified Sun Mar 28 21:26:46 1999 by Andries Brouwer <aeb@cwi.nl>
+.\" Modified 17 Jul 2002, Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Added 'socket' to NAME, so that "man -k socket" will show this page.
+.\"
+.TH getpeername 2 2023-04-03 "Linux man-pages 6.05.01"
+.SH NAME
+getpeername \- get name of connected peer socket
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/socket.h>
+.PP
+.BI "int getpeername(int " sockfd ", struct sockaddr *restrict " addr ,
+.BI " socklen_t *restrict " addrlen );
+.fi
+.SH DESCRIPTION
+.BR getpeername ()
+returns the address of the peer connected to the socket
+.IR sockfd ,
+in the buffer pointed to by
+.IR addr .
+The
+.I addrlen
+argument should be initialized to indicate the amount of space pointed to
+by
+.IR addr .
+On return it contains the actual size of the name returned (in bytes).
+The name is truncated if the buffer provided is too small.
+.PP
+The returned address is truncated if the buffer provided is too small;
+in this case,
+.I addrlen
+will return a value greater than was supplied to the call.
+.SH RETURN VALUE
+On success, zero is returned.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EBADF
+The argument
+.I sockfd
+is not a valid file descriptor.
+.TP
+.B EFAULT
+The
+.I addr
+argument points to memory not in a valid part of the
+process address space.
+.TP
+.B EINVAL
+.I addrlen
+is invalid (e.g., is negative).
+.TP
+.B ENOBUFS
+Insufficient resources were available in the system
+to perform the operation.
+.TP
+.B ENOTCONN
+The socket is not connected.
+.TP
+.B ENOTSOCK
+The file descriptor
+.I sockfd
+does not refer to a socket.
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+POSIX.1-2001, SVr4, 4.4BSD
+(first appeared in 4.2BSD).
+.SH NOTES
+For stream sockets, once a
+.BR connect (2)
+has been performed, either socket can call
+.BR getpeername ()
+to obtain the address of the peer socket.
+On the other hand, datagram sockets are connectionless.
+Calling
+.BR connect (2)
+on a datagram socket merely sets the peer address for outgoing
+datagrams sent with
+.BR write (2)
+or
+.BR send (2).
+The caller of
+.BR connect (2)
+can use
+.BR getpeername ()
+to obtain the peer address that it earlier set for the socket.
+However, the peer socket is unaware of this information, and calling
+.BR getpeername ()
+on the peer socket will return no useful information (unless a
+.BR connect (2)
+call was also executed on the peer).
+Note also that the receiver of a datagram can obtain
+the address of the sender when using
+.BR recvfrom (2).
+.SH SEE ALSO
+.BR accept (2),
+.BR bind (2),
+.BR getsockname (2),
+.BR ip (7),
+.BR socket (7),
+.BR unix (7)
diff --git a/man2/getpgid.2 b/man2/getpgid.2
new file mode 100644
index 0000000..d6b107a
--- /dev/null
+++ b/man2/getpgid.2
@@ -0,0 +1 @@
+.so man2/setpgid.2
diff --git a/man2/getpgrp.2 b/man2/getpgrp.2
new file mode 100644
index 0000000..d6b107a
--- /dev/null
+++ b/man2/getpgrp.2
@@ -0,0 +1 @@
+.so man2/setpgid.2
diff --git a/man2/getpid.2 b/man2/getpid.2
new file mode 100644
index 0000000..3b78823
--- /dev/null
+++ b/man2/getpid.2
@@ -0,0 +1,150 @@
+.\" Copyright 1993 Rickard E. Faith (faith@cs.unc.edu)
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.TH getpid 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+getpid, getppid \- get process identification
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <unistd.h>
+.PP
+.B pid_t getpid(void);
+.B pid_t getppid(void);
+.fi
+.SH DESCRIPTION
+.BR getpid ()
+returns the process ID (PID) of the calling process.
+(This is often used by
+routines that generate unique temporary filenames.)
+.PP
+.BR getppid ()
+returns the process ID of the parent of the calling process.
+This will be either the ID of the process that created this process using
+.BR fork (2),
+or, if that process has already terminated,
+the ID of the process to which this process has been reparented (either
+.BR init (1)
+or a "subreaper" process defined via the
+.BR prctl (2)
+.B PR_SET_CHILD_SUBREAPER
+operation).
+.SH ERRORS
+These functions are always successful.
+.SH VERSIONS
+On Alpha, instead of a pair of
+.BR getpid ()
+and
+.BR getppid ()
+system calls, a single
+.BR getxpid ()
+system call is provided, which returns a pair of PID and parent PID.
+The glibc
+.BR getpid ()
+and
+.BR getppid ()
+wrapper functions transparently deal with this.
+See
+.BR syscall (2)
+for details regarding register mapping.
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+POSIX.1-2001, 4.3BSD, SVr4.
+.SS C library/kernel differences
+From glibc 2.3.4 up to and including glibc 2.24,
+the glibc wrapper function for
+.BR getpid ()
+cached PIDs,
+with the goal of avoiding additional system calls when a process calls
+.BR getpid ()
+repeatedly.
+Normally this caching was invisible,
+but its correct operation relied on support in the wrapper functions for
+.BR fork (2),
+.BR vfork (2),
+and
+.BR clone (2):
+if an application bypassed the glibc wrappers for these system calls by using
+.BR syscall (2),
+then a call to
+.BR getpid ()
+in the child would return the wrong value
+(to be precise: it would return the PID of the parent process).
+.\" The following program demonstrates this "feature":
+.\"
+.\" #define _GNU_SOURCE
+.\" #include <sys/syscall.h>
+.\" #include <sys/wait.h>
+.\" #include <stdint.h>
+.\" #include <stdio.h>
+.\" #include <stdlib.h>
+.\" #include <unistd.h>
+.\"
+.\" int
+.\" main(int argc, char *argv[])
+.\" {
+.\" /* The following statement fills the getpid() cache */
+.\"
+.\" printf("parent PID = %jd\n", (intmax_t) getpid());
+.\"
+.\" if (syscall(SYS_fork) == 0) {
+.\" if (getpid() != syscall(SYS_getpid))
+.\" printf("child getpid() mismatch: getpid()=%jd; "
+.\" "syscall(SYS_getpid)=%ld\n",
+.\" (intmax_t) getpid(), (long) syscall(SYS_getpid));
+.\" exit(EXIT_SUCCESS);
+.\" }
+.\" wait(NULL);
+.\"}
+In addition, there were cases where
+.BR getpid ()
+could return the wrong value even when invoking
+.BR clone (2)
+via the glibc wrapper function.
+(For a discussion of one such case, see BUGS in
+.BR clone (2).)
+Furthermore, the complexity of the caching code had been
+the source of a few bugs within glibc over the years.
+.PP
+Because of the aforementioned problems,
+since glibc 2.25, the PID cache is removed:
+.\" commit c579f48edba88380635ab98cb612030e3ed8691e
+.\" https://sourceware.org/glibc/wiki/Release/2.25#pid_cache_removal
+calls to
+.BR getpid ()
+always invoke the actual system call, rather than returning a cached value.
+.\" FIXME .
+.\" Review progress of https://bugzilla.redhat.com/show_bug.cgi?id=1469757
+.SH NOTES
+If the caller's parent is in a different PID namespace (see
+.BR pid_namespaces (7)),
+.BR getppid ()
+returns 0.
+.PP
+From a kernel perspective,
+the PID (which is shared by all of the threads in a multithreaded process)
+is sometimes also known as the thread group ID (TGID).
+This contrasts with the kernel thread ID (TID),
+which is unique for each thread.
+For further details, see
+.BR gettid (2)
+and the discussion of the
+.B CLONE_THREAD
+flag in
+.BR clone (2).
+.SH SEE ALSO
+.BR clone (2),
+.BR fork (2),
+.BR gettid (2),
+.BR kill (2),
+.BR exec (3),
+.BR mkstemp (3),
+.BR tempnam (3),
+.BR tmpfile (3),
+.BR tmpnam (3),
+.BR credentials (7),
+.BR pid_namespaces (7)
diff --git a/man2/getpmsg.2 b/man2/getpmsg.2
new file mode 100644
index 0000000..5d25ea6
--- /dev/null
+++ b/man2/getpmsg.2
@@ -0,0 +1 @@
+.so man2/unimplemented.2
diff --git a/man2/getppid.2 b/man2/getppid.2
new file mode 100644
index 0000000..fca885e
--- /dev/null
+++ b/man2/getppid.2
@@ -0,0 +1 @@
+.so man2/getpid.2
diff --git a/man2/getpriority.2 b/man2/getpriority.2
new file mode 100644
index 0000000..723d3d4
--- /dev/null
+++ b/man2/getpriority.2
@@ -0,0 +1,209 @@
+.\" Copyright (c) 1980, 1991 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" SPDX-License-Identifier: BSD-4-Clause-UC
+.\"
+.\" @(#)getpriority.2 6.9 (Berkeley) 3/10/91
+.\"
+.\" Modified 1993-07-24 by Rik Faith <faith@cs.unc.edu>
+.\" Modified 1996-07-01 by Andries Brouwer <aeb@cwi.nl>
+.\" Modified 1996-11-06 by Eric S. Raymond <esr@thyrsus.com>
+.\" Modified 2001-10-21 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Corrected statement under EPERM to clarify privileges required
+.\" Modified 2002-06-21 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Clarified meaning of 0 value for 'who' argument
+.\" Modified 2004-05-27 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.TH getpriority 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+getpriority, setpriority \- get/set program scheduling priority
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/resource.h>
+.PP
+.BI "int getpriority(int " which ", id_t " who );
+.BI "int setpriority(int " which ", id_t " who ", int " prio );
+.fi
+.SH DESCRIPTION
+The scheduling priority of the process, process group, or user, as
+indicated by
+.I which
+and
+.I who
+is obtained with the
+.BR getpriority ()
+call and set with the
+.BR setpriority ()
+call.
+The process attribute dealt with by these system calls is
+the same attribute (also known as the "nice" value) that is dealt with by
+.BR nice (2).
+.PP
+The value
+.I which
+is one of
+.BR PRIO_PROCESS ,
+.BR PRIO_PGRP ,
+or
+.BR PRIO_USER ,
+and
+.I who
+is interpreted relative to
+.I which
+(a process identifier for
+.BR PRIO_PROCESS ,
+process group
+identifier for
+.BR PRIO_PGRP ,
+and a user ID for
+.BR PRIO_USER ).
+A zero value for
+.I who
+denotes (respectively) the calling process, the process group of the
+calling process, or the real user ID of the calling process.
+.PP
+The
+.I prio
+argument is a value in the range \-20 to 19 (but see NOTES below),
+with \-20 being the highest priority and 19 being the lowest priority.
+Attempts to set a priority outside this range
+are silently clamped to the range.
+The default priority is 0;
+lower values give a process a higher scheduling priority.
+.PP
+The
+.BR getpriority ()
+call returns the highest priority (lowest numerical value)
+enjoyed by any of the specified processes.
+The
+.BR setpriority ()
+call sets the priorities of all of the specified processes
+to the specified value.
+.PP
+Traditionally, only a privileged process could lower the nice value
+(i.e., set a higher priority).
+However, since Linux 2.6.12, an unprivileged process can decrease
+the nice value of a target process that has a suitable
+.B RLIMIT_NICE
+soft limit; see
+.BR getrlimit (2)
+for details.
+.SH RETURN VALUE
+On success,
+.BR getpriority ()
+returns the calling thread's nice value, which may be a negative number.
+On error, it returns \-1 and sets
+.I errno
+to indicate the error.
+.PP
+Since a successful call to
+.BR getpriority ()
+can legitimately return the value \-1, it is necessary
+to clear
+.I errno
+prior to the
+call, then check
+.I errno
+afterward to determine
+if \-1 is an error or a legitimate value.
+.PP
+.BR setpriority ()
+returns 0 on success.
+On failure, it returns \-1 and sets
+.I errno
+to indicate the error.
+.SH ERRORS
+.TP
+.B EACCES
+The caller attempted to set a lower nice value
+(i.e., a higher process priority), but did not
+have the required privilege (on Linux: did not have the
+.B CAP_SYS_NICE
+capability).
+.TP
+.B EINVAL
+.I which
+was not one of
+.BR PRIO_PROCESS ,
+.BR PRIO_PGRP ,
+or
+.BR PRIO_USER .
+.TP
+.B EPERM
+A process was located, but its effective user ID did not match
+either the effective or the real user ID of the caller,
+and the caller was not privileged (on Linux: did not have the
+.B CAP_SYS_NICE
+capability).
+But see NOTES below.
+.TP
+.B ESRCH
+No process was located using the
+.I which
+and
+.I who
+values specified.
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+POSIX.1-2001,
+SVr4, 4.4BSD (these interfaces first appeared in 4.2BSD).
+.SH NOTES
+For further details on the nice value, see
+.BR sched (7).
+.PP
+.IR Note :
+the addition of the "autogroup" feature in Linux 2.6.38 means that
+the nice value no longer has its traditional effect in many circumstances.
+For details, see
+.BR sched (7).
+.PP
+A child created by
+.BR fork (2)
+inherits its parent's nice value.
+The nice value is preserved across
+.BR execve (2).
+.PP
+The details on the condition for
+.B EPERM
+depend on the system.
+The above description is what POSIX.1-2001 says, and seems to be followed on
+all System\ V-like systems.
+Linux kernels before Linux 2.6.12 required the real or
+effective user ID of the caller to match
+the real user ID of the process \fIwho\fP (instead of its effective user ID).
+Linux 2.6.12 and later require
+the effective user ID of the caller to match
+the real or effective user ID of the process \fIwho\fP.
+All BSD-like systems (SunOS 4.1.3, Ultrix 4.2,
+4.3BSD, FreeBSD 4.3, OpenBSD-2.5, ...) behave in the same
+manner as Linux 2.6.12 and later.
+.\"
+.SS C library/kernel differences
+The getpriority system call returns nice values translated to the range 40..1,
+since a negative return value would be interpreted as an error.
+The glibc wrapper function for
+.BR getpriority ()
+translates the value back according to the formula
+.I unice\~=\~20\~\-\~knice
+(thus, the 40..1 range returned by the kernel
+corresponds to the range \-20..19 as seen by user space).
+.SH BUGS
+According to POSIX, the nice value is a per-process setting.
+However, under the current Linux/NPTL implementation of POSIX threads,
+the nice value is a per-thread attribute:
+different threads in the same process can have different nice values.
+Portable applications should avoid relying on the Linux behavior,
+which may be made standards conformant in the future.
+.SH SEE ALSO
+.BR nice (1),
+.BR renice (1),
+.BR fork (2),
+.BR capabilities (7),
+.BR sched (7)
+.PP
+.I Documentation/scheduler/sched\-nice\-design.txt
+in the Linux kernel source tree (since Linux 2.6.23)
diff --git a/man2/getrandom.2 b/man2/getrandom.2
new file mode 100644
index 0000000..565763b
--- /dev/null
+++ b/man2/getrandom.2
@@ -0,0 +1,295 @@
+.\" Copyright (C) 2014, Theodore Ts'o <tytso@mit.edu>
+.\" Copyright (C) 2014,2015 Heinrich Schuchardt <xypron.glpk@gmx.de>
+.\" Copyright (C) 2015, Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.TH getrandom 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+getrandom \- obtain a series of random bytes
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/random.h>
+.PP
+.BI "ssize_t getrandom(void " buf [. buflen "], size_t " buflen ", \
+unsigned int " flags );
+.fi
+.SH DESCRIPTION
+The
+.BR getrandom ()
+system call fills the buffer pointed to by
+.I buf
+with up to
+.I buflen
+random bytes.
+These bytes can be used to seed user-space random number generators
+or for cryptographic purposes.
+.PP
+By default,
+.BR getrandom ()
+draws entropy from the
+.I urandom
+source (i.e., the same source as the
+.I /dev/urandom
+device).
+This behavior can be changed via the
+.I flags
+argument.
+.PP
+If the
+.I urandom
+source has been initialized,
+reads of up to 256 bytes will always return as many bytes as
+requested and will not be interrupted by signals.
+No such guarantees apply for larger buffer sizes.
+For example, if the call is interrupted by a signal handler,
+it may return a partially filled buffer, or fail with the error
+.BR EINTR .
+.PP
+If the
+.I urandom
+source has not yet been initialized, then
+.BR getrandom ()
+will block, unless
+.B GRND_NONBLOCK
+is specified in
+.IR flags .
+.PP
+The
+.I flags
+argument is a bit mask that can contain zero or more of the following values
+ORed together:
+.TP
+.B GRND_RANDOM
+If this bit is set, then random bytes are drawn from the
+.I random
+source
+(i.e., the same source as the
+.I /dev/random
+device)
+instead of the
+.I urandom
+source.
+The
+.I random
+source is limited based on the entropy that can be obtained from environmental
+noise.
+If the number of available bytes in the
+.I random
+source is less than requested in
+.IR buflen ,
+the call returns just the available random bytes.
+If no random bytes are available, the behavior depends on the presence of
+.B GRND_NONBLOCK
+in the
+.I flags
+argument.
+.TP
+.B GRND_NONBLOCK
+By default, when reading from the
+.I random
+source,
+.BR getrandom ()
+blocks if no random bytes are available,
+and when reading from the
+.I urandom
+source, it blocks if the entropy pool has not yet been initialized.
+If the
+.B GRND_NONBLOCK
+flag is set, then
+.BR getrandom ()
+does not block in these cases, but instead immediately returns \-1 with
+.I errno
+set to
+.BR EAGAIN .
+.SH RETURN VALUE
+On success,
+.BR getrandom ()
+returns the number of bytes that were copied to the buffer
+.IR buf .
+This may be less than the number of bytes requested via
+.I buflen
+if either
+.B GRND_RANDOM
+was specified in
+.I flags
+and insufficient entropy was present in the
+.I random
+source or the system call was interrupted by a signal.
+.PP
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EAGAIN
+The requested entropy was not available, and
+.BR getrandom ()
+would have blocked if the
+.B GRND_NONBLOCK
+flag was not set.
+.TP
+.B EFAULT
+The address referred to by
+.I buf
+is outside the accessible address space.
+.TP
+.B EINTR
+The call was interrupted by a signal
+handler; see the description of how interrupted
+.BR read (2)
+calls on "slow" devices are handled with and without the
+.B SA_RESTART
+flag in the
+.BR signal (7)
+man page.
+.TP
+.B EINVAL
+An invalid flag was specified in
+.IR flags .
+.TP
+.B ENOSYS
+The glibc wrapper function for
+.BR getrandom ()
+determined that the underlying kernel does not implement this system call.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 3.17,
+glibc 2.25.
+.SH NOTES
+For an overview and comparison of the various interfaces that
+can be used to obtain randomness, see
+.BR random (7).
+.PP
+Unlike
+.I /dev/random
+and
+.IR /dev/urandom ,
+.BR getrandom ()
+does not involve the use of pathnames or file descriptors.
+Thus,
+.BR getrandom ()
+can be useful in cases where
+.BR chroot (2)
+makes
+.I /dev
+pathnames invisible,
+and where an application (e.g., a daemon during start-up)
+closes a file descriptor for one of these files
+that was opened by a library.
+.\"
+.SS Maximum number of bytes returned
+As of Linux 3.19 the following limits apply:
+.IP \[bu] 3
+When reading from the
+.I urandom
+source, a maximum of 32Mi-1 bytes is returned by a single call to
+.BR getrandom ()
+on systems where
+.I int
+has a size of 32 bits.
+.IP \[bu]
+When reading from the
+.I random
+source, a maximum of 512 bytes is returned.
+.SS Interruption by a signal handler
+When reading from the
+.I urandom
+source
+.RB ( GRND_RANDOM
+is not set),
+.BR getrandom ()
+will block until the entropy pool has been initialized
+(unless the
+.B GRND_NONBLOCK
+flag was specified).
+If a request is made to read a large number of bytes (more than 256),
+.BR getrandom ()
+will block until those bytes have been generated and transferred
+from kernel memory to
+.IR buf .
+When reading from the
+.I random
+source
+.RB ( GRND_RANDOM
+is set),
+.BR getrandom ()
+will block until some random bytes become available
+(unless the
+.B GRND_NONBLOCK
+flag was specified).
+.PP
+The behavior when a call to
+.BR getrandom ()
+that is blocked while reading from the
+.I urandom
+source is interrupted by a signal handler
+depends on the initialization state of the entropy pool
+and on the request size,
+.IR buflen .
+If the entropy pool is not yet initialized, then the call fails with the
+.B EINTR
+error.
+If the entropy pool has been initialized
+and the request size is large
+.RI ( buflen "\ >\ 256),"
+the call either succeeds, returning a partially filled buffer,
+or fails with the error
+.BR EINTR .
+If the entropy pool has been initialized and the request size is small
+.RI ( buflen "\ <=\ 256),"
+then
+.BR getrandom ()
+will not fail with
+.BR EINTR .
+Instead, it will return all of the bytes that have been requested.
+.PP
+When reading from the
+.I random
+source, blocking requests of any size can be interrupted by a signal handler
+(the call fails with the error
+.BR EINTR ).
+.PP
+Using
+.BR getrandom ()
+to read small buffers (<=\ 256 bytes) from the
+.I urandom
+source is the preferred mode of usage.
+.PP
+The special treatment of small values of
+.I buflen
+was designed for compatibility with
+OpenBSD's
+.BR getentropy (3),
+which is nowadays supported by glibc.
+.PP
+The user of
+.BR getrandom ()
+.I must
+always check the return value,
+to determine whether either an error occurred
+or fewer bytes than requested were returned.
+In the case where
+.B GRND_RANDOM
+is not specified and
+.I buflen
+is less than or equal to 256,
+a return of fewer bytes than requested should never happen,
+but the careful programmer will check for this anyway!
+.SH BUGS
+As of Linux 3.19, the following bug exists:
+.\" FIXME patch proposed https://lkml.org/lkml/2014/11/29/16
+.IP \[bu] 3
+Depending on CPU load,
+.BR getrandom ()
+does not react to interrupts before reading all bytes requested.
+.SH SEE ALSO
+.BR getentropy (3),
+.BR random (4),
+.BR urandom (4),
+.BR random (7),
+.BR signal (7)
diff --git a/man2/getresgid.2 b/man2/getresgid.2
new file mode 100644
index 0000000..ac4fb7c
--- /dev/null
+++ b/man2/getresgid.2
@@ -0,0 +1 @@
+.so man2/getresuid.2
diff --git a/man2/getresgid32.2 b/man2/getresgid32.2
new file mode 100644
index 0000000..2b3240f
--- /dev/null
+++ b/man2/getresgid32.2
@@ -0,0 +1 @@
+.so man2/getresgid.2
diff --git a/man2/getresuid.2 b/man2/getresuid.2
new file mode 100644
index 0000000..d5cadd7
--- /dev/null
+++ b/man2/getresuid.2
@@ -0,0 +1,70 @@
+.\" Copyright (C) 1997 Andries Brouwer (aeb@cwi.nl)
+.\" and Copyright (c) 2007, Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified, 2003-05-26, Michael Kerrisk, <mtk.manpages@gmail.com>
+.\"
+.TH getresuid 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+getresuid, getresgid \- get real, effective, and saved user/group IDs
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#define _GNU_SOURCE" " /* See feature_test_macros(7) */"
+.B #include <unistd.h>
+.PP
+.BI "int getresuid(uid_t *" ruid ", uid_t *" euid ", uid_t *" suid );
+.BI "int getresgid(gid_t *" rgid ", gid_t *" egid ", gid_t *" sgid );
+.fi
+.SH DESCRIPTION
+.BR getresuid ()
+returns the real UID, the effective UID, and the saved set-user-ID
+of the calling process, in the arguments
+.IR ruid ,
+.IR euid ,
+and
+.IR suid ,
+respectively.
+.BR getresgid ()
+performs the analogous task for the process's group IDs.
+.SH RETURN VALUE
+On success, zero is returned.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EFAULT
+One of the arguments specified an address outside the calling program's
+address space.
+.SH STANDARDS
+None.
+These calls also appear on HP-UX and some of the BSDs.
+.SH HISTORY
+Linux 2.1.44,
+glibc 2.3.2.
+.PP
+The original Linux
+.BR getresuid ()
+and
+.BR getresgid ()
+system calls supported only 16-bit user and group IDs.
+Subsequently, Linux 2.4 added
+.BR getresuid32 ()
+and
+.BR getresgid32 (),
+supporting 32-bit IDs.
+The glibc
+.BR getresuid ()
+and
+.BR getresgid ()
+wrapper functions transparently deal with the variations across kernel versions.
+.SH SEE ALSO
+.BR getuid (2),
+.BR setresuid (2),
+.BR setreuid (2),
+.BR setuid (2),
+.BR credentials (7)
diff --git a/man2/getresuid32.2 b/man2/getresuid32.2
new file mode 100644
index 0000000..ac4fb7c
--- /dev/null
+++ b/man2/getresuid32.2
@@ -0,0 +1 @@
+.so man2/getresuid.2
diff --git a/man2/getrlimit.2 b/man2/getrlimit.2
new file mode 100644
index 0000000..afc2c22
--- /dev/null
+++ b/man2/getrlimit.2
@@ -0,0 +1,854 @@
+'\" t
+.\" Copyright (c) 1992 Drew Eckhardt, March 28, 1992
+.\" and Copyright (c) 2002, 2004, 2005, 2008, 2010 Michael Kerrisk
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified by Michael Haardt <michael@moria.de>
+.\" Modified 1993-07-23 by Rik Faith <faith@cs.unc.edu>
+.\" Modified 1996-01-13 by Arnt Gulbrandsen <agulbra@troll.no>
+.\" Modified 1996-01-22 by aeb, following a remark by
+.\" Tigran Aivazian <tigran@sco.com>
+.\" Modified 1996-04-14 by aeb, following a remark by
+.\" Robert Bihlmeyer <robbe@orcus.ping.at>
+.\" Modified 1996-10-22 by Eric S. Raymond <esr@thyrsus.com>
+.\" Modified 2001-05-04 by aeb, following a remark by
+.\" Håvard Lygre <hklygre@online.no>
+.\" Modified 2001-04-17 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Modified 2002-06-13 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Added note on nonstandard behavior when SIGCHLD is ignored.
+.\" Modified 2002-07-09 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Enhanced descriptions of 'resource' values
+.\" Modified 2003-11-28 by aeb, added RLIMIT_CORE
+.\" Modified 2004-03-26 by aeb, added RLIMIT_AS
+.\" Modified 2004-06-16 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Added notes on CAP_SYS_RESOURCE
+.\"
+.\" 2004-11-16 -- mtk: the getrlimit.2 page, which formerly included
+.\" coverage of getrusage(2), has been split, so that the latter
+.\" is now covered in its own getrusage.2.
+.\"
+.\" Modified 2004-11-16, mtk: A few other minor changes
+.\" Modified 2004-11-23, mtk
+.\" Added notes on RLIMIT_MEMLOCK, RLIMIT_NPROC, and RLIMIT_RSS
+.\" to "CONFORMING TO"
+.\" Modified 2004-11-25, mtk
+.\" Rewrote discussion on RLIMIT_MEMLOCK to incorporate kernel
+.\" 2.6.9 changes.
+.\" Added note on RLIMIT_CPU error in older kernels
+.\" 2004-11-03, mtk, Added RLIMIT_SIGPENDING
+.\" 2005-07-13, mtk, documented RLIMIT_MSGQUEUE limit.
+.\" 2005-07-28, mtk, Added descriptions of RLIMIT_NICE and RLIMIT_RTPRIO
+.\" 2008-05-07, mtk / Peter Zijlstra, Added description of RLIMIT_RTTIME
+.\" 2010-11-06, mtk: Added documentation of prlimit()
+.\"
+.TH getrlimit 2 2023-07-20 "Linux man-pages 6.05.01"
+.SH NAME
+getrlimit, setrlimit, prlimit \- get/set resource limits
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/resource.h>
+.PP
+.BI "int getrlimit(int " resource ", struct rlimit *" rlim );
+.BI "int setrlimit(int " resource ", const struct rlimit *" rlim );
+.PP
+.BI "int prlimit(pid_t " pid ", int " resource ,
+.BI " const struct rlimit *_Nullable " new_limit ,
+.BI " struct rlimit *_Nullable " old_limit );
+.fi
+.PP
+.RS -4
+Feature Test Macro Requirements for glibc (see
+.BR feature_test_macros (7)):
+.RE
+.PP
+.BR prlimit ():
+.nf
+ _GNU_SOURCE
+.fi
+.SH DESCRIPTION
+The
+.BR getrlimit ()
+and
+.BR setrlimit ()
+system calls get and set resource limits.
+Each resource has an associated soft and hard limit, as defined by the
+.I rlimit
+structure:
+.PP
+.in +4n
+.EX
+struct rlimit {
+ rlim_t rlim_cur; /* Soft limit */
+ rlim_t rlim_max; /* Hard limit (ceiling for rlim_cur) */
+};
+.EE
+.in
+.PP
+The soft limit is the value that the kernel enforces for the
+corresponding resource.
+The hard limit acts as a ceiling for the soft limit:
+an unprivileged process may set only its soft limit to a value in the
+range from 0 up to the hard limit, and (irreversibly) lower its hard limit.
+A privileged process (under Linux: one with the
+.B CAP_SYS_RESOURCE
+capability in the initial user namespace)
+may make arbitrary changes to either limit value.
+.PP
+The value
+.B RLIM_INFINITY
+denotes no limit on a resource (both in the structure returned by
+.BR getrlimit ()
+and in the structure passed to
+.BR setrlimit ()).
+.PP
+The
+.I resource
+argument must be one of:
+.TP
+.B RLIMIT_AS
+This is the maximum size of the process's virtual memory
+(address space).
+The limit is specified in bytes, and is rounded down to the system page size.
+.\" since Linux 2.0.27 / Linux 2.1.12
+This limit affects calls to
+.BR brk (2),
+.BR mmap (2),
+and
+.BR mremap (2),
+which fail with the error
+.B ENOMEM
+upon exceeding this limit.
+In addition, automatic stack expansion fails
+(and generates a
+.B SIGSEGV
+that kills the process if no alternate stack
+has been made available via
+.BR sigaltstack (2)).
+Since the value is a \fIlong\fP, on machines with a 32-bit \fIlong\fP
+either this limit is at most 2\ GiB, or this resource is unlimited.
+.TP
+.B RLIMIT_CORE
+This is the maximum size of a
+.I core
+file (see
+.BR core (5))
+in bytes that the process may dump.
+When 0, no core dump files are created.
+When nonzero, larger dumps are truncated to this size.
+.TP
+.B RLIMIT_CPU
+This is a limit, in seconds,
+on the amount of CPU time that the process can consume.
+When the process reaches the soft limit, it is sent a
+.B SIGXCPU
+signal.
+The default action for this signal is to terminate the process.
+However, the signal can be caught, and the handler can return control to
+the main program.
+If the process continues to consume CPU time, it will be sent
+.B SIGXCPU
+once per second until the hard limit is reached, at which time
+it is sent
+.BR SIGKILL .
+(This latter point describes Linux behavior.
+Implementations vary in how they treat processes which continue to
+consume CPU time after reaching the soft limit.
+Portable applications that need to catch this signal should
+perform an orderly termination upon first receipt of
+.BR SIGXCPU .)
+.TP
+.B RLIMIT_DATA
+This is the maximum size
+of the process's data segment (initialized data,
+uninitialized data, and heap).
+The limit is specified in bytes, and is rounded down to the system page size.
+This limit affects calls to
+.BR brk (2),
+.BR sbrk (2),
+and (since Linux 4.7)
+.BR mmap (2),
+.\" commits 84638335900f1995495838fe1bd4870c43ec1f67
+.\" ("mm: rework virtual memory accounting"),
+.\" f4fcd55841fc9e46daac553b39361572453c2b88
+.\" (mm: enable RLIMIT_DATA by default with workaround for valgrind).
+which fail with the error
+.B ENOMEM
+upon encountering the soft limit of this resource.
+.TP
+.B RLIMIT_FSIZE
+This is the maximum size in bytes of files that the process may create.
+Attempts to extend a file beyond this limit result in delivery of a
+.B SIGXFSZ
+signal.
+By default, this signal terminates a process, but a process can
+catch this signal instead, in which case the relevant system call (e.g.,
+.BR write (2),
+.BR truncate (2))
+fails with the error
+.BR EFBIG .
+.TP
+.BR RLIMIT_LOCKS " (Linux 2.4.0 to Linux 2.4.24)"
+.\" to be precise: Linux 2.4.0-test9; no longer in Linux 2.4.25 / Linux 2.5.65
+This is a limit on the combined number of
+.BR flock (2)
+locks and
+.BR fcntl (2)
+leases that this process may establish.
+.TP
+.B RLIMIT_MEMLOCK
+This is the maximum number of bytes of memory that may be locked
+into RAM.
+This limit is in effect rounded down to the nearest multiple
+of the system page size.
+This limit affects
+.BR mlock (2),
+.BR mlockall (2),
+and the
+.BR mmap (2)
+.B MAP_LOCKED
+operation.
+Since Linux 2.6.9, it also affects the
+.BR shmctl (2)
+.B SHM_LOCK
+operation, where it sets a maximum on the total bytes in
+shared memory segments (see
+.BR shmget (2))
+that may be locked by the real user ID of the calling process.
+The
+.BR shmctl (2)
+.B SHM_LOCK
+locks are accounted for separately from the per-process memory
+locks established by
+.BR mlock (2),
+.BR mlockall (2),
+and
+.BR mmap (2)
+.BR MAP_LOCKED ;
+a process can lock bytes up to this limit in each of these
+two categories.
+.IP
+Before Linux 2.6.9, this limit controlled the amount of
+memory that could be locked by a privileged process.
+Since Linux 2.6.9, no limits are placed on the amount of memory
+that a privileged process may lock, and this limit instead governs
+the amount of memory that an unprivileged process may lock.
+.TP
+.BR RLIMIT_MSGQUEUE " (since Linux 2.6.8)"
+This is a limit on the number of bytes that can be allocated
+for POSIX message queues for the real user ID of the calling process.
+This limit is enforced for
+.BR mq_open (3).
+Each message queue that the user creates counts (until it is removed)
+against this limit according to the formula:
+.RS 4
+.IP
+Since Linux 3.5:
+.IP
+.in +4n
+.EX
+bytes = attr.mq_maxmsg * sizeof(struct msg_msg) +
+ MIN(attr.mq_maxmsg, MQ_PRIO_MAX) *
+ sizeof(struct posix_msg_tree_node)+
+ /* For overhead */
+ attr.mq_maxmsg * attr.mq_msgsize;
+ /* For message data */
+.EE
+.in
+.IP
+Linux 3.4 and earlier:
+.IP
+.in +4n
+.EX
+bytes = attr.mq_maxmsg * sizeof(struct msg_msg *) +
+ /* For overhead */
+ attr.mq_maxmsg * attr.mq_msgsize;
+ /* For message data */
+.EE
+.in
+.RE
+.IP
+where
+.I attr
+is the
+.I mq_attr
+structure specified as the fourth argument to
+.BR mq_open (3),
+and the
+.I msg_msg
+and
+.I posix_msg_tree_node
+structures are kernel-internal structures.
+.IP
+The "overhead" addend in the formula accounts for overhead
+bytes required by the implementation
+and ensures that the user cannot
+create an unlimited number of zero-length messages (such messages
+nevertheless each consume some system memory for bookkeeping overhead).
+.TP
+.BR RLIMIT_NICE " (since Linux 2.6.12, but see BUGS below)"
+This specifies a ceiling to which the process's nice value can be raised using
+.BR setpriority (2)
+or
+.BR nice (2).
+The actual ceiling for the nice value is calculated as
+.IR "20\ \-\ rlim_cur" .
+The useful range for this limit is thus from 1
+(corresponding to a nice value of 19) to 40
+(corresponding to a nice value of \-20).
+This unusual choice of range was necessary
+because negative numbers cannot be specified
+as resource limit values, since they typically have special meanings.
+For example,
+.B RLIM_INFINITY
+typically is the same as \-1.
+For more detail on the nice value, see
+.BR sched (7).
+.TP
+.B RLIMIT_NOFILE
+This specifies a value one greater than the maximum file descriptor number
+that can be opened by this process.
+Attempts
+.RB ( open (2),
+.BR pipe (2),
+.BR dup (2),
+etc.)
+to exceed this limit yield the error
+.BR EMFILE .
+(Historically, this limit was named
+.B RLIMIT_OFILE
+on BSD.)
+.IP
+Since Linux 4.5,
+this limit also defines the maximum number of file descriptors that
+an unprivileged process (one without the
+.B CAP_SYS_RESOURCE
+capability) may have "in flight" to other processes,
+by being passed across UNIX domain sockets.
+This limit applies to the
+.BR sendmsg (2)
+system call.
+For further details, see
+.BR unix (7).
+.TP
+.B RLIMIT_NPROC
+This is a limit on the number of extant processes
+(or, more precisely on Linux, threads)
+for the real user ID of the calling process.
+So long as the current number of processes belonging to this
+process's real user ID is greater than or equal to this limit,
+.BR fork (2)
+fails with the error
+.BR EAGAIN .
+.IP
+The
+.B RLIMIT_NPROC
+limit is not enforced for processes that have either the
+.B CAP_SYS_ADMIN
+or the
+.B CAP_SYS_RESOURCE
+capability,
+or run with real user ID 0.
+.TP
+.B RLIMIT_RSS
+This is a limit (in bytes) on the process's resident set
+(the number of virtual pages resident in RAM).
+This limit has effect only in Linux 2.4.x, x < 30, and there
+affects only calls to
+.BR madvise (2)
+specifying
+.BR MADV_WILLNEED .
+.\" As at Linux 2.6.12, this limit still does nothing in Linux 2.6 though
+.\" talk of making it do something has surfaced from time to time in LKML
+.\" -- MTK, Jul 05
+.TP
+.BR RLIMIT_RTPRIO " (since Linux 2.6.12, but see BUGS)"
+This specifies a ceiling on the real-time priority that may be set for
+this process using
+.BR sched_setscheduler (2)
+and
+.BR sched_setparam (2).
+.IP
+For further details on real-time scheduling policies, see
+.BR sched (7).
+.TP
+.BR RLIMIT_RTTIME " (since Linux 2.6.25)"
+This is a limit (in microseconds)
+on the amount of CPU time that a process scheduled
+under a real-time scheduling policy may consume without making a blocking
+system call.
+For the purpose of this limit,
+each time a process makes a blocking system call,
+the count of its consumed CPU time is reset to zero.
+The CPU time count is not reset if the process continues trying to
+use the CPU but is preempted, its time slice expires, or it calls
+.BR sched_yield (2).
+.IP
+Upon reaching the soft limit, the process is sent a
+.B SIGXCPU
+signal.
+If the process catches or ignores this signal and
+continues consuming CPU time, then
+.B SIGXCPU
+will be generated once each second until the hard limit is reached,
+at which point the process is sent a
+.B SIGKILL
+signal.
+.IP
+The intended use of this limit is to stop a runaway
+real-time process from locking up the system.
+.IP
+For further details on real-time scheduling policies, see
+.BR sched (7).
+.TP
+.BR RLIMIT_SIGPENDING " (since Linux 2.6.8)"
+This is a limit on the number of signals
+that may be queued for the real user ID of the calling process.
+Both standard and real-time signals are counted for the purpose of
+checking this limit.
+However, the limit is enforced only for
+.BR sigqueue (3);
+it is always possible to use
+.BR kill (2)
+to queue one instance of any of the signals that are not already
+queued to the process.
+.\" This replaces the /proc/sys/kernel/rtsig-max system-wide limit
+.\" that was present in Linux <= 2.6.7. MTK Dec 04
+.TP
+.B RLIMIT_STACK
+This is the maximum size of the process stack, in bytes.
+Upon reaching this limit, a
+.B SIGSEGV
+signal is generated.
+To handle this signal, a process must employ an alternate signal stack
+.RB ( sigaltstack (2)).
+.IP
+Since Linux 2.6.23,
+this limit also determines the amount of space used for the process's
+command-line arguments and environment variables; for details, see
+.BR execve (2).
+.SS prlimit()
+.\" commit c022a0acad534fd5f5d5f17280f6d4d135e74e81
+.\" Author: Jiri Slaby <jslaby@suse.cz>
+.\" Date: Tue May 4 18:03:50 2010 +0200
+.\"
+.\" rlimits: implement prlimit64 syscall
+.\"
+.\" commit 6a1d5e2c85d06da35cdfd93f1a27675bfdc3ad8c
+.\" Author: Jiri Slaby <jslaby@suse.cz>
+.\" Date: Wed Mar 24 17:06:58 2010 +0100
+.\"
+.\" rlimits: add rlimit64 structure
+.\"
+The Linux-specific
+.BR prlimit ()
+system call combines and extends the functionality of
+.BR setrlimit ()
+and
+.BR getrlimit ().
+It can be used to both set and get the resource limits of an arbitrary process.
+.PP
+The
+.I resource
+argument has the same meaning as for
+.BR setrlimit ()
+and
+.BR getrlimit ().
+.PP
+If the
+.I new_limit
+argument is not NULL, then the
+.I rlimit
+structure to which it points is used to set new values for
+the soft and hard limits for
+.IR resource .
+If the
+.I old_limit
+argument is not NULL, then a successful call to
+.BR prlimit ()
+places the previous soft and hard limits for
+.I resource
+in the
+.I rlimit
+structure pointed to by
+.IR old_limit .
+.PP
+The
+.I pid
+argument specifies the ID of the process on which the call is to operate.
+If
+.I pid
+is 0, then the call applies to the calling process.
+To set or get the resources of a process other than itself,
+the caller must have the
+.B CAP_SYS_RESOURCE
+capability in the user namespace of the process
+whose resource limits are being changed, or the
+real, effective, and saved set user IDs of the target process
+must match the real user ID of the caller
+.I and
+the real, effective, and saved set group IDs of the target process
+must match the real group ID of the caller.
+.\" FIXME . this permission check is strange
+.\" Asked about this on LKML, 7 Nov 2010
+.\" "Inconsistent credential checking in prlimit() syscall"
+.SH RETURN VALUE
+On success, these system calls return 0.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EFAULT
+A pointer argument points to a location
+outside the accessible address space.
+.TP
+.B EINVAL
+The value specified in
+.I resource
+is not valid;
+or, for
+.BR setrlimit ()
+or
+.BR prlimit ():
+.I rlim\->rlim_cur
+was greater than
+.IR rlim\->rlim_max .
+.TP
+.B EPERM
+An unprivileged process tried to raise the hard limit; the
+.B CAP_SYS_RESOURCE
+capability is required to do this.
+.TP
+.B EPERM
+The caller tried to increase the hard
+.B RLIMIT_NOFILE
+limit above the maximum defined by
+.I /proc/sys/fs/nr_open
+(see
+.BR proc (5)).
+.TP
+.B EPERM
+.RB ( prlimit ())
+The calling process did not have permission to set limits
+for the process specified by
+.IR pid .
+.TP
+.B ESRCH
+Could not find a process with the ID specified in
+.IR pid .
+.SH ATTRIBUTES
+For an explanation of the terms used in this section, see
+.BR attributes (7).
+.TS
+allbox;
+lbx lb lb
+l l l.
+Interface Attribute Value
+T{
+.na
+.nh
+.BR getrlimit (),
+.BR setrlimit (),
+.BR prlimit ()
+T} Thread safety MT-Safe
+.TE
+.sp 1
+.SH STANDARDS
+.TP
+.BR getrlimit ()
+.TQ
+.BR setrlimit ()
+POSIX.1-2008.
+.TP
+.BR prlimit ()
+Linux.
+.PP
+.B RLIMIT_MEMLOCK
+and
+.B RLIMIT_NPROC
+derive from BSD and are not specified in POSIX.1;
+they are present on the BSDs and Linux, but on few other implementations.
+.B RLIMIT_RSS
+derives from BSD and is not specified in POSIX.1;
+it is nevertheless present on most implementations.
+.BR \%RLIMIT_MSGQUEUE ,
+.BR RLIMIT_NICE ,
+.BR RLIMIT_RTPRIO ,
+.BR RLIMIT_RTTIME ,
+and
+.B \%RLIMIT_SIGPENDING
+are Linux-specific.
+.SH HISTORY
+.TP
+.BR getrlimit ()
+.TQ
+.BR setrlimit ()
+POSIX.1-2001, SVr4, 4.3BSD.
+.TP
+.BR prlimit ()
+Linux 2.6.36,
+glibc 2.13.
+.SH NOTES
+A child process created via
+.BR fork (2)
+inherits its parent's resource limits.
+Resource limits are preserved across
+.BR execve (2).
+.PP
+Resource limits are per-process attributes that are shared
+by all of the threads in a process.
+.PP
+Lowering the soft limit for a resource below the process's
+current consumption of that resource will succeed
+(but will prevent the process from further increasing
+its consumption of the resource).
+.PP
+One can set the resource limits of the shell using the built-in
+.I ulimit
+command
+.RI ( limit
+in
+.BR csh (1)).
+The shell's resource limits are inherited by the processes that
+it creates to execute commands.
+.PP
+Since Linux 2.6.24, the resource limits of any process can be inspected via
+.IR /proc/ pid /limits ;
+see
+.BR proc (5).
+.PP
+Ancient systems provided a
+.BR vlimit ()
+function with a similar purpose to
+.BR setrlimit ().
+For backward compatibility, glibc also provides
+.BR vlimit ().
+All new applications should be written using
+.BR setrlimit ().
+.SS C library/kernel ABI differences
+Since glibc 2.13, the glibc
+.BR getrlimit ()
+and
+.BR setrlimit ()
+wrapper functions no longer invoke the corresponding system calls,
+but instead employ
+.BR prlimit (),
+for the reasons described in BUGS.
+.PP
+The name of the glibc wrapper function is
+.BR prlimit ();
+the underlying system call is
+.BR prlimit64 ().
+.SH BUGS
+In older Linux kernels, the
+.B SIGXCPU
+and
+.B SIGKILL
+signals delivered when a process encountered the soft and hard
+.B RLIMIT_CPU
+limits were delivered one (CPU) second later than they should have been.
+This was fixed in Linux 2.6.8.
+.PP
+In Linux 2.6.x kernels before Linux 2.6.17, a
+.B RLIMIT_CPU
+limit of 0 is wrongly treated as "no limit" (like
+.BR RLIM_INFINITY ).
+Since Linux 2.6.17, setting a limit of 0 does have an effect,
+but is actually treated as a limit of 1 second.
+.\" see http://marc.theaimsgroup.com/?l=linux-kernel&m=114008066530167&w=2
+.PP
+A kernel bug means that
+.\" See https://lwn.net/Articles/145008/
+.B RLIMIT_RTPRIO
+does not work in Linux 2.6.12; the problem is fixed in Linux 2.6.13.
+.PP
+In Linux 2.6.12, there was an off-by-one mismatch
+between the priority ranges returned by
+.BR getpriority (2)
+and
+.BR RLIMIT_NICE .
+This had the effect that the actual ceiling for the nice value
+was calculated as
+.IR "19\ \-\ rlim_cur" .
+This was fixed in Linux 2.6.13.
+.\" see http://marc.theaimsgroup.com/?l=linux-kernel&m=112256338703880&w=2
+.PP
+Since Linux 2.6.12,
+.\" The relevant patch, sent to LKML, seems to be
+.\" http://thread.gmane.org/gmane.linux.kernel/273462
+.\" From: Roland McGrath <roland <at> redhat.com>
+.\" Subject: [PATCH 7/7] make RLIMIT_CPU/SIGXCPU per-process
+.\" Date: 2005-01-23 23:27:46 GMT
+if a process reaches its soft
+.B RLIMIT_CPU
+limit and has a handler installed for
+.BR SIGXCPU ,
+then, in addition to invoking the signal handler,
+the kernel increases the soft limit by one second.
+This behavior repeats if the process continues to consume CPU time,
+until the hard limit is reached,
+at which point the process is killed.
+Other implementations
+.\" Tested Solaris 10, FreeBSD 9, OpenBSD 5.0
+do not change the
+.B RLIMIT_CPU
+soft limit in this manner,
+and the Linux behavior is probably not standards conformant;
+portable applications should avoid relying on this Linux-specific behavior.
+.\" FIXME . https://bugzilla.kernel.org/show_bug.cgi?id=50951
+The Linux-specific
+.B RLIMIT_RTTIME
+limit exhibits the same behavior when the soft limit is encountered.
+.PP
+Kernels before Linux 2.4.22 did not diagnose the error
+.B EINVAL
+for
+.BR setrlimit ()
+when
+.I rlim\->rlim_cur
+was greater than
+.IR rlim\->rlim_max .
+.\" d3561f78fd379a7110e46c87964ba7aa4120235c
+.PP
+Linux doesn't return an error when an attempt to set
+.B RLIMIT_CPU
+has failed, for compatibility reasons.
+.\"
+.SS Representation of """large""" resource limit values on 32-bit platforms
+The glibc
+.BR getrlimit ()
+and
+.BR setrlimit ()
+wrapper functions use a 64-bit
+.I rlim_t
+data type, even on 32-bit platforms.
+However, the
+.I rlim_t
+data type used in the
+.BR getrlimit ()
+and
+.BR setrlimit ()
+system calls is a (32-bit)
+.IR "unsigned long" .
+.\" Linux still uses long for limits internally:
+.\" c022a0acad534fd5f5d5f17280f6d4d135e74e81
+.\" kernel/sys.c:do_prlimit() still uses struct rlimit which
+.\" uses kernel_ulong_t for its members, i.e. 32-bit on 32-bit kernel.
+Furthermore, in Linux,
+the kernel represents resource limits on 32-bit platforms as
+.IR "unsigned long" .
+However, a 32-bit data type is not wide enough.
+.\" https://bugzilla.kernel.org/show_bug.cgi?id=5042
+.\" https://www.sourceware.org/bugzilla/show_bug.cgi?id=12201
+The most pertinent limit here is
+.BR \%RLIMIT_FSIZE ,
+which specifies the maximum size to which a file can grow:
+to be useful, this limit must be represented using a type
+that is as wide as the type used to
+represent file offsets\[em]that is, as wide as a 64-bit
+.B off_t
+(assuming a program compiled with
+.IR _FILE_OFFSET_BITS=64 ).
+.PP
+To work around this kernel limitation,
+if a program tried to set a resource limit to a value larger than
+can be represented in a 32-bit
+.IR "unsigned long" ,
+then the glibc
+.BR setrlimit ()
+wrapper function silently converted the limit value to
+.BR RLIM_INFINITY .
+In other words, the requested resource limit setting was silently ignored.
+.PP
+Since glibc 2.13,
+.\" https://www.sourceware.org/bugzilla/show_bug.cgi?id=12201
+glibc works around the limitations of the
+.BR \%getrlimit ()
+and
+.BR setrlimit ()
+system calls by implementing
+.BR setrlimit ()
+and
+.BR \%getrlimit ()
+as wrapper functions that call
+.BR prlimit ().
+.SH EXAMPLES
+The program below demonstrates the use of
+.BR prlimit ().
+.PP
+.\" SRC BEGIN (getrlimit.c)
+.EX
+#define _GNU_SOURCE
+#define _FILE_OFFSET_BITS 64
+#include <err.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/resource.h>
+#include <time.h>
+\&
+int
+main(int argc, char *argv[])
+{
+ pid_t pid;
+ struct rlimit old, new;
+ struct rlimit *newp;
+\&
+ if (!(argc == 2 || argc == 4)) {
+ fprintf(stderr, "Usage: %s <pid> [<new\-soft\-limit> "
+ "<new\-hard\-limit>]\en", argv[0]);
+ exit(EXIT_FAILURE);
+ }
+\&
+ pid = atoi(argv[1]); /* PID of target process */
+\&
+ newp = NULL;
+ if (argc == 4) {
+ new.rlim_cur = atoi(argv[2]);
+ new.rlim_max = atoi(argv[3]);
+ newp = &new;
+ }
+\&
+ /* Set CPU time limit of target process; retrieve and display
+ previous limit */
+\&
+ if (prlimit(pid, RLIMIT_CPU, newp, &old) == \-1)
+ err(EXIT_FAILURE, "prlimit\-1");
+ printf("Previous limits: soft=%jd; hard=%jd\en",
+ (intmax_t) old.rlim_cur, (intmax_t) old.rlim_max);
+\&
+ /* Retrieve and display new CPU time limit */
+\&
+ if (prlimit(pid, RLIMIT_CPU, NULL, &old) == \-1)
+ err(EXIT_FAILURE, "prlimit\-2");
+ printf("New limits: soft=%jd; hard=%jd\en",
+ (intmax_t) old.rlim_cur, (intmax_t) old.rlim_max);
+\&
+ exit(EXIT_SUCCESS);
+}
+.EE
+.\" SRC END
+.SH SEE ALSO
+.BR prlimit (1),
+.BR dup (2),
+.BR fcntl (2),
+.BR fork (2),
+.BR getrusage (2),
+.BR mlock (2),
+.BR mmap (2),
+.BR open (2),
+.BR quotactl (2),
+.BR sbrk (2),
+.BR shmctl (2),
+.BR malloc (3),
+.BR sigqueue (3),
+.BR ulimit (3),
+.BR core (5),
+.BR capabilities (7),
+.BR cgroups (7),
+.BR credentials (7),
+.BR signal (7)
diff --git a/man2/getrusage.2 b/man2/getrusage.2
new file mode 100644
index 0000000..8966295
--- /dev/null
+++ b/man2/getrusage.2
@@ -0,0 +1,254 @@
+'\" t
+.\" Copyright (c) 1992 Drew Eckhardt, March 28, 1992
+.\" and Copyright (c) 2002 Michael Kerrisk
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" 2004-11-16 -- mtk: the getrlimit.2 page, which formerly included
+.\" coverage of getrusage(2), has been split, so that the latter is
+.\" now covered in its own getrusage.2. For older details of change
+.\" history, etc., see getrlimit.2
+.\"
+.\" Modified 2004-11-16, mtk, Noted that the nonconformance
+.\" when SIGCHLD is being ignored is fixed in Linux 2.6.9.
+.\" 2008-02-22, Sripathi Kodi <sripathik@in.ibm.com>: Document RUSAGE_THREAD
+.\" 2008-05-25, mtk, clarify RUSAGE_CHILDREN + other clean-ups.
+.\" 2010-05-24, Mark Hills <mark@pogo.org.uk>: Description of fields,
+.\" document ru_maxrss
+.\" 2010-05-24, mtk, enhanced description of various fields
+.\"
+.TH getrusage 2 2023-07-20 "Linux man-pages 6.05.01"
+.SH NAME
+getrusage \- get resource usage
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/resource.h>
+.PP
+.BI "int getrusage(int " who ", struct rusage *" usage );
+.fi
+.SH DESCRIPTION
+.BR getrusage ()
+returns resource usage measures for
+.IR who ,
+which can be one of the following:
+.TP
+.B RUSAGE_SELF
+Return resource usage statistics for the calling process,
+which is the sum of resources used by all threads in the process.
+.TP
+.B RUSAGE_CHILDREN
+Return resource usage statistics for all children of the
+calling process that have terminated and been waited for.
+These statistics will include the resources used by grandchildren,
+and further removed descendants,
+if all of the intervening descendants waited on their terminated children.
+.TP
+.BR RUSAGE_THREAD " (since Linux 2.6.26)"
+Return resource usage statistics for the calling thread.
+The
+.B _GNU_SOURCE
+feature test macro must be defined (before including
+.I any
+header file)
+in order to obtain the definition of this constant from
+.IR <sys/resource.h> .
+.PP
+The resource usages are returned in the structure pointed to by
+.IR usage ,
+which has the following form:
+.PP
+.in +4n
+.EX
+struct rusage {
+ struct timeval ru_utime; /* user CPU time used */
+ struct timeval ru_stime; /* system CPU time used */
+ long ru_maxrss; /* maximum resident set size */
+ long ru_ixrss; /* integral shared memory size */
+ long ru_idrss; /* integral unshared data size */
+ long ru_isrss; /* integral unshared stack size */
+ long ru_minflt; /* page reclaims (soft page faults) */
+ long ru_majflt; /* page faults (hard page faults) */
+ long ru_nswap; /* swaps */
+ long ru_inblock; /* block input operations */
+ long ru_oublock; /* block output operations */
+ long ru_msgsnd; /* IPC messages sent */
+ long ru_msgrcv; /* IPC messages received */
+ long ru_nsignals; /* signals received */
+ long ru_nvcsw; /* voluntary context switches */
+ long ru_nivcsw; /* involuntary context switches */
+};
+.EE
+.in
+.PP
+Not all fields are completed;
+unmaintained fields are set to zero by the kernel.
+(The unmaintained fields are provided for compatibility with other systems,
+and because they may one day be supported on Linux.)
+The fields are interpreted as follows:
+.TP
+.I ru_utime
+This is the total amount of time spent executing in user mode,
+expressed in a
+.I timeval
+structure (seconds plus microseconds).
+.TP
+.I ru_stime
+This is the total amount of time spent executing in kernel mode,
+expressed in a
+.I timeval
+structure (seconds plus microseconds).
+.TP
+.IR ru_maxrss " (since Linux 2.6.32)"
+This is the maximum resident set size used (in kilobytes).
+For
+.BR RUSAGE_CHILDREN ,
+this is the resident set size of the largest child, not the maximum
+resident set size of the process tree.
+.TP
+.IR ru_ixrss " (unmaintained)"
+This field is currently unused on Linux.
+.\" On some systems,
+.\" this is the integral of the text segment memory consumption,
+.\" expressed in kilobyte-seconds.
+.TP
+.IR ru_idrss " (unmaintained)"
+This field is currently unused on Linux.
+.\" On some systems, this is the integral of the data segment memory consumption,
+.\" expressed in kilobyte-seconds.
+.TP
+.IR ru_isrss " (unmaintained)"
+This field is currently unused on Linux.
+.\" On some systems, this is the integral of the stack memory consumption,
+.\" expressed in kilobyte-seconds.
+.TP
+.I ru_minflt
+The number of page faults serviced without any I/O activity; here
+I/O activity is avoided by \*(lqreclaiming\*(rq a page frame from
+the list of pages awaiting reallocation.
+.TP
+.I ru_majflt
+The number of page faults serviced that required I/O activity.
+.TP
+.IR ru_nswap " (unmaintained)"
+This field is currently unused on Linux.
+.\" On some systems, this is the number of swaps out of physical memory.
+.TP
+.IR ru_inblock " (since Linux 2.6.22)"
+The number of times the filesystem had to perform input.
+.TP
+.IR ru_oublock " (since Linux 2.6.22)"
+The number of times the filesystem had to perform output.
+.TP
+.IR ru_msgsnd " (unmaintained)"
+This field is currently unused on Linux.
+.\" On FreeBSD 6.2, this appears to measure messages sent over sockets
+.\" On some systems,
+.\" this field records the number of messages sent over sockets.
+.TP
+.IR ru_msgrcv " (unmaintained)"
+This field is currently unused on Linux.
+.\" On FreeBSD 6.2, this appears to measure messages received over sockets
+.\" On some systems,
+.\" this field records the number of messages received over sockets.
+.TP
+.IR ru_nsignals " (unmaintained)"
+This field is currently unused on Linux.
+.\" On some systems, this field records the number of signals received.
+.TP
+.IR ru_nvcsw " (since Linux 2.6)"
+The number of times a context switch resulted due to a process
+voluntarily giving up the processor before its time slice was
+completed (usually to await availability of a resource).
+.TP
+.IR ru_nivcsw " (since Linux 2.6)"
+The number of times a context switch resulted due to a higher
+priority process becoming runnable or because the current process
+exceeded its time slice.
+.SH RETURN VALUE
+On success, zero is returned.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EFAULT
+.I usage
+points outside the accessible address space.
+.TP
+.B EINVAL
+.I who
+is invalid.
+.SH ATTRIBUTES
+For an explanation of the terms used in this section, see
+.BR attributes (7).
+.TS
+allbox;
+lbx lb lb
+l l l.
+Interface Attribute Value
+T{
+.na
+.nh
+.BR getrusage ()
+T} Thread safety MT-Safe
+.TE
+.sp 1
+.SH STANDARDS
+POSIX.1-2008.
+.PP
+POSIX.1 specifies
+.BR getrusage (),
+but specifies only the fields
+.I ru_utime
+and
+.IR ru_stime .
+.PP
+.B RUSAGE_THREAD
+is Linux-specific.
+.SH HISTORY
+POSIX.1-2001, SVr4, 4.3BSD.
+.PP
+Before Linux 2.6.9, if the disposition of
+.B SIGCHLD
+is set to
+.B SIG_IGN
+then the resource usages of child processes
+are automatically included in the value returned by
+.BR RUSAGE_CHILDREN ,
+although POSIX.1-2001 explicitly prohibits this.
+This nonconformance is rectified in Linux 2.6.9 and later.
+.\" See the description of getrusage() in XSH.
+.\" A similar statement was also in SUSv2.
+.PP
+The structure definition shown at the start of this page
+was taken from 4.3BSD Reno.
+.PP
+Ancient systems provided a
+.BR vtimes ()
+function with a similar purpose to
+.BR getrusage ().
+For backward compatibility, glibc (up until version 2.32) also provides
+.BR vtimes ().
+All new applications should be written using
+.BR getrusage ().
+(Since version 2.33, glibc no longer provides a
+.BR vtimes ()
+implementation.)
+.SH NOTES
+Resource usage metrics are preserved across an
+.BR execve (2).
+.PP
+See also the description of
+.IR /proc/ pid /stat
+in
+.BR proc (5).
+.SH SEE ALSO
+.BR clock_gettime (2),
+.BR getrlimit (2),
+.BR times (2),
+.BR wait (2),
+.BR wait4 (2),
+.BR clock (3)
diff --git a/man2/getsid.2 b/man2/getsid.2
new file mode 100644
index 0000000..842c980
--- /dev/null
+++ b/man2/getsid.2
@@ -0,0 +1,75 @@
+.\" Copyright (C) 1996 Andries Brouwer (aeb@cwi.nl)
+.\" and Copyright (C) 2016 Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: GPL-2.0-or-later
+.\"
+.\" Modified Thu Oct 31 14:18:40 1996 by Eric S. Raymond <esr@thyrsus.com>
+.\" Modified 2001-12-17, aeb
+.TH getsid 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+getsid \- get session ID
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <unistd.h>
+.PP
+.BI "pid_t getsid(pid_t" " pid" );
+.fi
+.PP
+.RS -4
+Feature Test Macro Requirements for glibc (see
+.BR feature_test_macros (7)):
+.RE
+.PP
+.BR getsid ():
+.nf
+ _XOPEN_SOURCE >= 500
+.\" || _XOPEN_SOURCE && _XOPEN_SOURCE_EXTENDED
+ || /* Since glibc 2.12: */ _POSIX_C_SOURCE >= 200809L
+.fi
+.SH DESCRIPTION
+.BR getsid ()
+returns the session ID of the process with process ID
+.IR pid .
+If
+.I pid
+is 0,
+.BR getsid ()
+returns the session ID of the calling process.
+.SH RETURN VALUE
+On success, a session ID is returned.
+On error, \fI(pid_t)\ \-1\fP is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EPERM
+A process with process ID
+.I pid
+exists, but it is not in the same session as the calling process,
+and the implementation considers this an error.
+.TP
+.B ESRCH
+No process with process ID
+.I pid
+was found.
+.SH VERSIONS
+Linux does not return
+.BR EPERM .
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+POSIX.1-2001, SVr4.
+Linux 2.0.
+.\" Linux has this system call since Linux 1.3.44.
+.\" There is libc support since libc 5.2.19.
+.SH NOTES
+See
+.BR credentials (7)
+for a description of sessions and session IDs.
+.SH SEE ALSO
+.BR getpgid (2),
+.BR setsid (2),
+.BR credentials (7)
diff --git a/man2/getsockname.2 b/man2/getsockname.2
new file mode 100644
index 0000000..e2cc11e
--- /dev/null
+++ b/man2/getsockname.2
@@ -0,0 +1,85 @@
+.\" Copyright (c) 1983, 1991 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" SPDX-License-Identifier: BSD-4-Clause-UC
+.\"
+.\" @(#)getsockname.2 6.4 (Berkeley) 3/10/91
+.\"
+.\" Modified Sat Jul 24 16:30:29 1993 by Rik Faith <faith@cs.unc.edu>
+.\" Modified Tue Oct 22 00:22:35 EDT 1996 by Eric S. Raymond <esr@thyrsus.com>
+.\" Modified Sun Mar 28 21:26:46 1999 by Andries Brouwer <aeb@cwi.nl>
+.\"
+.TH getsockname 2 2023-04-03 "Linux man-pages 6.05.01"
+.SH NAME
+getsockname \- get socket name
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/socket.h>
+.PP
+.BI "int getsockname(int " sockfd ", struct sockaddr *restrict " addr ,
+.BI " socklen_t *restrict " addrlen );
+.fi
+.SH DESCRIPTION
+.BR getsockname ()
+returns the current address to which the socket
+.I sockfd
+is bound, in the buffer pointed to by
+.IR addr .
+The
+.I addrlen
+argument should be initialized to indicate
+the amount of space (in bytes) pointed to by
+.IR addr .
+On return it contains the actual size of the socket address.
+.PP
+The returned address is truncated if the buffer provided is too small;
+in this case,
+.I addrlen
+will return a value greater than was supplied to the call.
+.SH RETURN VALUE
+On success, zero is returned.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EBADF
+The argument
+.I sockfd
+is not a valid file descriptor.
+.TP
+.B EFAULT
+The
+.I addr
+argument points to memory not in a valid part of the
+process address space.
+.TP
+.B EINVAL
+.I addrlen
+is invalid (e.g., is negative).
+.TP
+.B ENOBUFS
+Insufficient resources were available in the system
+to perform the operation.
+.TP
+.B ENOTSOCK
+The file descriptor
+.I sockfd
+does not refer to a socket.
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+POSIX.1-2001, SVr4, 4.4BSD
+(first appeared in 4.2BSD).
+.\" SVr4 documents additional ENOMEM
+.\" and ENOSR error codes.
+.SH SEE ALSO
+.BR bind (2),
+.BR socket (2),
+.BR getifaddrs (3),
+.BR ip (7),
+.BR socket (7),
+.BR unix (7)
diff --git a/man2/getsockopt.2 b/man2/getsockopt.2
new file mode 100644
index 0000000..f80900a
--- /dev/null
+++ b/man2/getsockopt.2
@@ -0,0 +1,172 @@
+.\" Copyright (c) 1983, 1991 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" SPDX-License-Identifier: BSD-4-Clause-UC
+.\"
+.\" $Id: getsockopt.2,v 1.1 1999/05/24 14:57:04 freitag Exp $
+.\"
+.\" Modified Sat Jul 24 16:19:32 1993 by Rik Faith (faith@cs.unc.edu)
+.\" Modified Mon Apr 22 02:29:06 1996 by Martin Schulze (joey@infodrom.north.de)
+.\" Modified Tue Aug 27 10:52:51 1996 by Andries Brouwer (aeb@cwi.nl)
+.\" Modified Thu Jan 23 13:29:34 1997 by Andries Brouwer (aeb@cwi.nl)
+.\" Modified Sun Mar 28 21:26:46 1999 by Andries Brouwer (aeb@cwi.nl)
+.\" Modified 1999 by Andi Kleen <ak@muc.de>.
+.\" Removed most stuff because it is in socket.7 now.
+.\"
+.TH getsockopt 2 2023-04-03 "Linux man-pages 6.05.01"
+.SH NAME
+getsockopt, setsockopt \- get and set options on sockets
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/socket.h>
+.PP
+.BI "int getsockopt(int " sockfd ", int " level ", int " optname ,
+.BI " void " optval "[restrict *." optlen ],
+.BI " socklen_t *restrict " optlen );
+.BI "int setsockopt(int " sockfd ", int " level ", int " optname ,
+.BI " const void " optval [. optlen ],
+.BI " socklen_t " optlen );
+.fi
+.SH DESCRIPTION
+.BR getsockopt ()
+and
+.BR setsockopt ()
+manipulate options for the socket referred to by the file descriptor
+.IR sockfd .
+Options may exist at multiple
+protocol levels; they are always present at the uppermost
+socket level.
+.PP
+When manipulating socket options, the level at which the
+option resides and the name of the option must be specified.
+To manipulate options at the sockets API level,
+.I level
+is specified as
+.BR SOL_SOCKET .
+To manipulate options at any
+other level the protocol number of the appropriate protocol
+controlling the option is supplied.
+For example,
+to indicate that an option is to be interpreted by the
+.B TCP
+protocol,
+.I level
+should be set to the protocol number of
+.BR TCP ;
+see
+.BR getprotoent (3).
+.PP
+The arguments
+.I optval
+and
+.I optlen
+are used to access option values for
+.BR setsockopt ().
+For
+.BR getsockopt ()
+they identify a buffer in which the value(s) for the
+requested option(s) are to be returned.
+For
+.BR getsockopt (),
+.I optlen
+is a value-result argument, initially containing the
+size of the buffer pointed to by
+.IR optval ,
+and modified on return to indicate the actual size of
+the value returned.
+If no option value is to be supplied or returned,
+.I optval
+may be NULL.
+.PP
+.I Optname
+and any specified options are passed uninterpreted to the appropriate
+protocol module for interpretation.
+The include file
+.I <sys/socket.h>
+contains definitions for socket level options, described below.
+Options at
+other protocol levels vary in format and name; consult the appropriate
+entries in section 7 of the manual.
+.PP
+Most socket-level options utilize an
+.I int
+argument for
+.IR optval .
+For
+.BR setsockopt (),
+the argument should be nonzero to enable a boolean option, or zero if the
+option is to be disabled.
+.PP
+For a description of the available socket options see
+.BR socket (7)
+and the appropriate protocol man pages.
+.SH RETURN VALUE
+On success, zero is returned for the standard options.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.PP
+Netfilter allows the programmer
+to define custom socket options with associated handlers; for such
+options, the return value on success is the value returned by the handler.
+.SH ERRORS
+.TP
+.B EBADF
+The argument
+.I sockfd
+is not a valid file descriptor.
+.TP
+.B EFAULT
+The address pointed to by
+.I optval
+is not in a valid part of the process address space.
+For
+.BR getsockopt (),
+this error may also be returned if
+.I optlen
+is not in a valid part of the process address space.
+.TP
+.B EINVAL
+.I optlen
+is invalid in
+.BR setsockopt ().
+In some cases this error can also occur for an invalid value in
+.I optval
+(e.g., for the
+.B IP_ADD_MEMBERSHIP
+option described in
+.BR ip (7)).
+.TP
+.B ENOPROTOOPT
+The option is unknown at the level indicated.
+.TP
+.B ENOTSOCK
+The file descriptor
+.I sockfd
+does not refer to a socket.
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+POSIX.1-2001,
+SVr4, 4.4BSD (first appeared in 4.2BSD).
+.\" SVr4 documents additional ENOMEM and ENOSR error codes, but does
+.\" not document the
+.\" .BR SO_SNDLOWAT ", " SO_RCVLOWAT ", " SO_SNDTIMEO ", " SO_RCVTIMEO
+.\" options
+.SH BUGS
+Several of the socket options should be handled at lower levels of the
+system.
+.SH SEE ALSO
+.BR ioctl (2),
+.BR socket (2),
+.BR getprotoent (3),
+.BR protocols (5),
+.BR ip (7),
+.BR packet (7),
+.BR socket (7),
+.BR tcp (7),
+.BR udp (7),
+.BR unix (7)
diff --git a/man2/gettid.2 b/man2/gettid.2
new file mode 100644
index 0000000..d4ca4d0
--- /dev/null
+++ b/man2/gettid.2
@@ -0,0 +1,74 @@
+.\" Copyright 2003 Abhijit Menon-Sen <ams@wiw.org>
+.\" and Copyright (C) 2008 Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.TH gettid 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+gettid \- get thread identification
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #define _GNU_SOURCE
+.B #include <unistd.h>
+.PP
+.B pid_t gettid(void);
+.fi
+.SH DESCRIPTION
+.BR gettid ()
+returns the caller's thread ID (TID).
+In a single-threaded process, the thread ID
+is equal to the process ID (PID, as returned by
+.BR getpid (2)).
+In a multithreaded process, all threads
+have the same PID, but each one has a unique TID.
+For further details, see the discussion of
+.B CLONE_THREAD
+in
+.BR clone (2).
+.SH RETURN VALUE
+On success, returns the thread ID of the calling thread.
+.SH ERRORS
+This call is always successful.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 2.4.11,
+glibc 2.30.
+.SH NOTES
+The thread ID returned by this call is not the same thing as a
+POSIX thread ID (i.e., the opaque value returned by
+.BR pthread_self (3)).
+.PP
+In a new thread group created by a
+.BR clone (2)
+call that does not specify the
+.B CLONE_THREAD
+flag (or, equivalently, a new process created by
+.BR fork (2)),
+the new process is a thread group leader,
+and its thread group ID (the value returned by
+.BR getpid (2))
+is the same as its thread ID (the value returned by
+.BR gettid ()).
+.SH SEE ALSO
+.BR capget (2),
+.BR clone (2),
+.BR fcntl (2),
+.BR fork (2),
+.BR get_robust_list (2),
+.BR getpid (2),
+.\" .BR kcmp (2),
+.BR ioprio_set (2),
+.\" .BR move_pages (2),
+.\" .BR migrate_pages (2),
+.BR perf_event_open (2),
+.\" .BR process_vm_readv (2),
+.\" .BR ptrace (2),
+.BR sched_setaffinity (2),
+.BR sched_setparam (2),
+.BR sched_setscheduler (2),
+.BR tgkill (2),
+.BR timer_create (2)
diff --git a/man2/gettimeofday.2 b/man2/gettimeofday.2
new file mode 100644
index 0000000..8381cc0
--- /dev/null
+++ b/man2/gettimeofday.2
@@ -0,0 +1,296 @@
+.\" Copyright (c) 1992 Drew Eckhardt (drew@cs.colorado.edu), March 28, 1992
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified by Michael Haardt (michael@moria.de)
+.\" Modified 1993-07-23 by Rik Faith (faith@cs.unc.edu)
+.\" Modified 1994-08-21 by Michael Chastain (mec@shell.portal.com):
+.\" Fixed necessary '#include' lines.
+.\" Modified 1995-04-15 by Michael Chastain (mec@shell.portal.com):
+.\" Added reference to adjtimex.
+.\" Removed some nonsense lines pointed out by Urs Thuermann,
+.\" (urs@isnogud.escape.de), aeb, 950722.
+.\" Modified 1997-01-14 by Austin Donnelly (and1000@debian.org):
+.\" Added return values section, and bit on EFAULT
+.\" Added clarification on timezone, aeb, 971210.
+.\" Removed "#include <unistd.h>", aeb, 010316.
+.\" Modified, 2004-05-27 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Added notes on capability requirement.
+.\"
+.TH gettimeofday 2 2023-07-28 "Linux man-pages 6.05.01"
+.SH NAME
+gettimeofday, settimeofday \- get / set time
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/time.h>
+.PP
+.BI "int gettimeofday(struct timeval *restrict " tv ,
+.BI " struct timezone *_Nullable restrict " tz );
+.BI "int settimeofday(const struct timeval *" tv ,
+.BI " const struct timezone *_Nullable " tz );
+.fi
+.PP
+.RS -4
+Feature Test Macro Requirements for glibc (see
+.BR feature_test_macros (7)):
+.RE
+.PP
+.BR settimeofday ():
+.nf
+ Since glibc 2.19:
+ _DEFAULT_SOURCE
+ glibc 2.19 and earlier:
+ _BSD_SOURCE
+.fi
+.SH DESCRIPTION
+The functions
+.BR gettimeofday ()
+and
+.BR settimeofday ()
+can get and set the time as well as a timezone.
+.PP
+The
+.I tv
+argument is a
+.I struct timeval
+(as specified in
+.IR <sys/time.h> ):
+.PP
+.in +4n
+.EX
+struct timeval {
+ time_t tv_sec; /* seconds */
+ suseconds_t tv_usec; /* microseconds */
+};
+.EE
+.in
+.PP
+and gives the number of seconds and microseconds since the Epoch (see
+.BR time (2)).
+.PP
+The
+.I tz
+argument is a
+.IR "struct timezone" :
+.PP
+.in +4n
+.EX
+struct timezone {
+ int tz_minuteswest; /* minutes west of Greenwich */
+ int tz_dsttime; /* type of DST correction */
+};
+.EE
+.in
+.PP
+If either
+.I tv
+or
+.I tz
+is NULL, the corresponding structure is not set or returned.
+.\" FIXME . The compilation warning looks to be going away in glibc 2.17
+.\" see glibc commit 4b7634a5e03b0da6f8875de9d3f74c1cf6f2a6e8
+(However, compilation warnings will result if
+.I tv
+is NULL.)
+.\" The following is covered under EPERM below:
+.\" .PP
+.\" Only the superuser may use
+.\" .BR settimeofday ().
+.PP
+The use of the
+.I timezone
+structure is obsolete; the
+.I tz
+argument should normally be specified as NULL.
+(See NOTES below.)
+.PP
+Under Linux, there are some peculiar "warp clock" semantics associated
+with the
+.BR settimeofday ()
+system call if on the very first call (after booting)
+that has a non-NULL
+.I tz
+argument, the
+.I tv
+argument is NULL and the
+.I tz_minuteswest
+field is nonzero.
+(The
+.I tz_dsttime
+field should be zero for this case.)
+In such a case it is assumed that the CMOS clock
+is on local time, and that it has to be incremented by this amount
+to get UTC system time.
+No doubt it is a bad idea to use this feature.
+.SH RETURN VALUE
+.BR gettimeofday ()
+and
+.BR settimeofday ()
+return 0 for success.
+On error, \-1 is returned and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EFAULT
+One of
+.I tv
+or
+.I tz
+pointed outside the accessible address space.
+.TP
+.B EINVAL
+.RB ( settimeofday ()):
+.I timezone
+is invalid.
+.TP
+.B EINVAL
+.RB ( settimeofday ()):
+.I tv.tv_sec
+is negative or
+.I tv.tv_usec
+is outside the range [0, 999,999].
+.TP
+.BR EINVAL " (since Linux 4.3)"
+.\" commit e1d7ba8735551ed79c7a0463a042353574b96da3
+.RB ( settimeofday ()):
+An attempt was made to set the time to a value less than
+the current value of the
+.B CLOCK_MONOTONIC
+clock (see
+.BR clock_gettime (2)).
+.TP
+.B EPERM
+The calling process has insufficient privilege to call
+.BR settimeofday ();
+under Linux the
+.B CAP_SYS_TIME
+capability is required.
+.SH VERSIONS
+.SS C library/kernel differences
+On some architectures, an implementation of
+.BR gettimeofday ()
+is provided in the
+.BR vdso (7).
+.PP
+The kernel accepts NULL for both
+.I tv
+and
+.IR tz .
+The timezone argument is ignored by glibc and musl,
+and not passed to/from the kernel.
+Android's bionic passes the timezone argument to/from the kernel,
+but Android does not update the kernel timezone
+based on the device timezone in Settings,
+so the kernel's timezone is typically UTC.
+.SH STANDARDS
+.TP
+.BR gettimeofday ()
+POSIX.1-2008 (obsolete).
+.TP
+.BR settimeofday ()
+None.
+.SH HISTORY
+SVr4, 4.3BSD.
+POSIX.1-2001 describes
+.BR gettimeofday ()
+but not
+.BR settimeofday ().
+POSIX.1-2008 marks
+.BR gettimeofday ()
+as obsolete, recommending the use of
+.BR clock_gettime (2)
+instead.
+.PP
+Traditionally, the fields of
+.I struct timeval
+were of type
+.IR long .
+.\"
+.SS The tz_dsttime field
+On a non-Linux kernel, with glibc, the
+.I tz_dsttime
+field of
+.I struct timezone
+will be set to a nonzero value by
+.BR gettimeofday ()
+if the current timezone has ever had or will have a daylight saving
+rule applied.
+In this sense it exactly mirrors the meaning of
+.BR daylight (3)
+for the current zone.
+On Linux, with glibc, the setting of the
+.I tz_dsttime
+field of
+.I struct timezone
+has never been used by
+.BR settimeofday ()
+or
+.BR gettimeofday ().
+.\" it has not
+.\" been and will not be supported by libc or glibc.
+.\" Each and every occurrence of this field in the kernel source
+.\" (other than the declaration) is a bug.
+Thus, the following is purely of historical interest.
+.PP
+On old systems, the field
+.I tz_dsttime
+contains a symbolic constant (values are given below)
+that indicates in which part of the year Daylight Saving Time
+is in force.
+(Note: this value is constant throughout the year:
+it does not indicate that DST is in force, it just selects an
+algorithm.)
+The daylight saving time algorithms defined are as follows:
+.PP
+.in +4n
+.EX
+\fBDST_NONE\fP /* not on DST */
+\fBDST_USA\fP /* USA style DST */
+\fBDST_AUST\fP /* Australian style DST */
+\fBDST_WET\fP /* Western European DST */
+\fBDST_MET\fP /* Middle European DST */
+\fBDST_EET\fP /* Eastern European DST */
+\fBDST_CAN\fP /* Canada */
+\fBDST_GB\fP /* Great Britain and Eire */
+\fBDST_RUM\fP /* Romania */
+\fBDST_TUR\fP /* Turkey */
+\fBDST_AUSTALT\fP /* Australian style with shift in 1986 */
+.EE
+.in
+.PP
+Of course it turned out that the period in which
+Daylight Saving Time is in force cannot be given
+by a simple algorithm, one per country; indeed,
+this period is determined by unpredictable political
+decisions.
+So this method of representing timezones
+has been abandoned.
+.SH NOTES
+The time returned by
+.BR gettimeofday ()
+.I is
+affected by discontinuous jumps in the system time
+(e.g., if the system administrator manually changes the system time).
+If you need a monotonically increasing clock, see
+.BR clock_gettime (2).
+.PP
+Macros for operating on
+.I timeval
+structures are described in
+.BR timeradd (3).
+.SH SEE ALSO
+.BR date (1),
+.BR adjtimex (2),
+.BR clock_gettime (2),
+.BR time (2),
+.BR ctime (3),
+.BR ftime (3),
+.BR timeradd (3),
+.BR capabilities (7),
+.BR time (7),
+.BR vdso (7),
+.BR hwclock (8)
diff --git a/man2/getuid.2 b/man2/getuid.2
new file mode 100644
index 0000000..1b94158
--- /dev/null
+++ b/man2/getuid.2
@@ -0,0 +1,80 @@
+.\" Copyright 1993 Rickard E. Faith (faith@cs.unc.edu)
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Historical remark, aeb, 2004-06-05
+.TH getuid 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+getuid, geteuid \- get user identity
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <unistd.h>
+.PP
+.B uid_t getuid(void);
+.B uid_t geteuid(void);
+.fi
+.SH DESCRIPTION
+.BR getuid ()
+returns the real user ID of the calling process.
+.PP
+.BR geteuid ()
+returns the effective user ID of the calling process.
+.SH ERRORS
+These functions are always successful
+and never modify
+.\" https://www.austingroupbugs.net/view.php?id=511
+.\" 0000511: getuid and friends should not modify errno
+.IR errno .
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+POSIX.1-2001, 4.3BSD.
+.PP
+In UNIX\ V6 the
+.BR getuid ()
+call returned
+.IR "(euid << 8) + uid" .
+UNIX\ V7 introduced separate calls
+.BR getuid ()
+and
+.BR geteuid ().
+.PP
+The original Linux
+.BR getuid ()
+and
+.BR geteuid ()
+system calls supported only 16-bit user IDs.
+Subsequently, Linux 2.4 added
+.BR getuid32 ()
+and
+.BR geteuid32 (),
+supporting 32-bit IDs.
+The glibc
+.BR getuid ()
+and
+.BR geteuid ()
+wrapper functions transparently deal with the variations across kernel versions.
+.PP
+On Alpha, instead of a pair of
+.BR getuid ()
+and
+.BR geteuid ()
+system calls, a single
+.BR getxuid ()
+system call is provided, which returns a pair of real and effective UIDs.
+The glibc
+.BR getuid ()
+and
+.BR geteuid ()
+wrapper functions transparently deal with this.
+See
+.BR syscall (2)
+for details regarding register mapping.
+.SH SEE ALSO
+.BR getresuid (2),
+.BR setreuid (2),
+.BR setuid (2),
+.BR credentials (7)
diff --git a/man2/getuid32.2 b/man2/getuid32.2
new file mode 100644
index 0000000..165cfe1
--- /dev/null
+++ b/man2/getuid32.2
@@ -0,0 +1 @@
+.so man2/getuid.2
diff --git a/man2/getunwind.2 b/man2/getunwind.2
new file mode 100644
index 0000000..eaf7117
--- /dev/null
+++ b/man2/getunwind.2
@@ -0,0 +1,87 @@
+.\" Copyright (C) 2006 Red Hat, Inc. All Rights Reserved.
+.\" Written by Marcela Maslanova <mmaslano@redhat.com>
+.\" and Copyright 2013, Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.TH getunwind 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+getunwind \- copy the unwind data to caller's buffer
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <linux/unwind.h>
+.BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */"
+.B #include <unistd.h>
+.PP
+.BI "[[deprecated]] long syscall(SYS_getunwind, void " buf [. buf_size ],
+.BI " size_t " buf_size );
+.fi
+.SH DESCRIPTION
+.I Note: this system call is obsolete.
+.PP
+The
+IA-64-specific
+.BR getunwind ()
+system call copies the kernel's call frame
+unwind data into the buffer pointed to by
+.I buf
+and returns the size of the unwind data;
+this data describes the gate page (kernel code that
+is mapped into user space).
+.PP
+The size of the buffer
+.I buf
+is specified in
+.IR buf_size .
+The data is copied only if
+.I buf_size
+is greater than or equal to the size of the unwind data and
+.I buf
+is not NULL;
+otherwise, no data is copied, and the call succeeds,
+returning the size that would be needed to store the unwind data.
+.PP
+The first part of the unwind data contains an unwind table.
+The rest contains the associated unwind information, in no particular order.
+The unwind table contains entries of the following form:
+.PP
+.in +4n
+.EX
+u64 start; (64\-bit address of start of function)
+u64 end; (64\-bit address of end of function)
+u64 info; (BUF\-relative offset to unwind info)
+.EE
+.in
+.PP
+An entry whose
+.I start
+value is zero indicates the end of the table.
+For more information about the format, see the
+.I IA-64 Software Conventions and Runtime Architecture
+manual.
+.SH RETURN VALUE
+On success,
+.BR getunwind ()
+returns the size of the unwind data.
+On error, \-1 is returned and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.BR getunwind ()
+fails with the error
+.B EFAULT
+if the unwind info can't be stored in the space specified by
+.IR buf .
+.SH STANDARDS
+Linux on IA-64.
+.SH HISTORY
+Linux 2.4.
+.PP
+This system call has been deprecated.
+The modern way to obtain the kernel's unwind data is via the
+.BR vdso (7).
+.SH SEE ALSO
+.BR getauxval (3)
diff --git a/man2/getxattr.2 b/man2/getxattr.2
new file mode 100644
index 0000000..21df6ca
--- /dev/null
+++ b/man2/getxattr.2
@@ -0,0 +1,143 @@
+.\" Copyright (C) Andreas Gruenbacher, February 2001
+.\" Copyright (C) Silicon Graphics Inc, September 2001
+.\"
+.\" SPDX-License-Identifier: GPL-2.0-or-later
+.\"
+.TH getxattr 2 2023-07-28 "Linux man-pages 6.05.01"
+.SH NAME
+getxattr, lgetxattr, fgetxattr \- retrieve an extended attribute value
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/xattr.h>
+.PP
+.BI "ssize_t getxattr(const char *" path ", const char *" name ,
+.BI " void " value [. size "], size_t " size );
+.BI "ssize_t lgetxattr(const char *" path ", const char *" name ,
+.BI " void " value [. size "], size_t " size );
+.BI "ssize_t fgetxattr(int " fd ", const char *" name ,
+.BI " void " value [. size "], size_t " size );
+.fi
+.SH DESCRIPTION
+Extended attributes are
+.IR name : value
+pairs associated with inodes (files, directories, symbolic links, etc.).
+They are extensions to the normal attributes which are associated
+with all inodes in the system (i.e., the
+.BR stat (2)
+data).
+A complete overview of extended attribute concepts can be found in
+.BR xattr (7).
+.PP
+.BR getxattr ()
+retrieves the value of the extended attribute identified by
+.I name
+and associated with the given
+.I path
+in the filesystem.
+The attribute value is placed in the buffer pointed to by
+.IR value ;
+.I size
+specifies the size of that buffer.
+The return value of the call is the number of bytes placed in
+.IR value .
+.PP
+.BR lgetxattr ()
+is identical to
+.BR getxattr (),
+except in the case of a symbolic link, where the link itself is
+interrogated, not the file that it refers to.
+.PP
+.BR fgetxattr ()
+is identical to
+.BR getxattr (),
+except that the open file referred to by
+.I fd
+(as returned by
+.BR open (2))
+is interrogated in place of
+.IR path .
+.PP
+An extended attribute
+.I name
+is a null-terminated string.
+The name includes a namespace prefix; there may be several, disjoint
+namespaces associated with an individual inode.
+The value of an extended attribute is a chunk of arbitrary textual or
+binary data that was assigned using
+.BR setxattr (2).
+.PP
+If
+.I size
+is specified as zero, these calls return the current size of the
+named extended attribute (and leave
+.I value
+unchanged).
+This can be used to determine the size of the buffer that
+should be supplied in a subsequent call.
+(But, bear in mind that there is a possibility that the
+attribute value may change between the two calls,
+so that it is still necessary to check the return status
+from the second call.)
+.SH RETURN VALUE
+On success, these calls return a nonnegative value which is
+the size (in bytes) of the extended attribute value.
+On failure, \-1 is returned and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B E2BIG
+The size of the attribute value is larger than the maximum size allowed; the
+attribute cannot be retrieved.
+This can happen on filesystems that support
+very large attribute values such as NFSv4, for example.
+.TP
+.B ENODATA
+The named attribute does not exist, or the process has no access to
+this attribute.
+.\" .RB ( ENOATTR
+.\" is defined to be a synonym for
+.\" .BR ENODATA
+.\" in
+.\" .IR <attr/attributes.h> .)
+.TP
+.B ENOTSUP
+Extended attributes are not supported by the filesystem, or are disabled.
+.TP
+.B ERANGE
+The
+.I size
+of the
+.I value
+buffer is too small to hold the result.
+.PP
+In addition, the errors documented in
+.BR stat (2)
+can also occur.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 2.4,
+glibc 2.3.
+.\" .SH AUTHORS
+.\" Andreas Gruenbacher,
+.\" .RI < a.gruenbacher@computer.org >
+.\" and the SGI XFS development team,
+.\" .RI < linux-xfs@oss.sgi.com >.
+.\" Please send any bug reports or comments to these addresses.
+.SH EXAMPLES
+See
+.BR listxattr (2).
+.SH SEE ALSO
+.BR getfattr (1),
+.BR setfattr (1),
+.BR listxattr (2),
+.BR open (2),
+.BR removexattr (2),
+.BR setxattr (2),
+.BR stat (2),
+.BR symlink (7),
+.BR xattr (7)
diff --git a/man2/gtty.2 b/man2/gtty.2
new file mode 100644
index 0000000..5d25ea6
--- /dev/null
+++ b/man2/gtty.2
@@ -0,0 +1 @@
+.so man2/unimplemented.2
diff --git a/man2/idle.2 b/man2/idle.2
new file mode 100644
index 0000000..197f366
--- /dev/null
+++ b/man2/idle.2
@@ -0,0 +1,44 @@
+.\" Copyright 1993 Rickard E. Faith (faith@cs.unc.edu)
+.\" Portions extracted from linux/mm/swap.c:
+.\" Copyright (C) 1991, 1992 Linus Torvalds
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified 21 Aug 1994 by Michael Chastain <mec@shell.portal.com>:
+.\" Added text about calling restriction (new in Linux 1.1.20 I believe).
+.\" N.B. calling "idle" from user process used to hang process!
+.\" Modified Thu Oct 31 14:41:15 1996 by Eric S. Raymond <esr@thyrsus.com>
+.\" "
+.TH idle 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+idle \- make process 0 idle
+.SH SYNOPSIS
+.nf
+.B #include <unistd.h>
+.PP
+.B [[deprecated]] int idle(void);
+.fi
+.SH DESCRIPTION
+.BR idle ()
+is an internal system call used during bootstrap.
+It marks the process's pages as swappable, lowers its priority,
+and enters the main scheduling loop.
+.BR idle ()
+never returns.
+.PP
+Only process 0 may call
+.BR idle ().
+Any user process, even a process with superuser permission,
+will receive
+.BR EPERM .
+.SH RETURN VALUE
+.BR idle ()
+never returns for process 0, and always returns \-1 for a user process.
+.SH ERRORS
+.TP
+.B EPERM
+Always, for a user process.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Removed in Linux 2.3.13.
diff --git a/man2/inb.2 b/man2/inb.2
new file mode 100644
index 0000000..2c63c75
--- /dev/null
+++ b/man2/inb.2
@@ -0,0 +1 @@
+.so man2/outb.2
diff --git a/man2/inb_p.2 b/man2/inb_p.2
new file mode 100644
index 0000000..2c63c75
--- /dev/null
+++ b/man2/inb_p.2
@@ -0,0 +1 @@
+.so man2/outb.2
diff --git a/man2/init_module.2 b/man2/init_module.2
new file mode 100644
index 0000000..a5fed4d
--- /dev/null
+++ b/man2/init_module.2
@@ -0,0 +1,342 @@
+.\" Copyright (C) 2012 Michael Kerrisk <mtk.manpages@gmail.com>
+.\" A few fragments remain from a version
+.\" Copyright (C) 1996 Free Software Foundation, Inc.
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.TH init_module 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+init_module, finit_module \- load a kernel module
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#include <linux/module.h>" " /* Definition of " MODULE_* " constants */"
+.BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */"
+.B #include <unistd.h>
+.PP
+.BI "int syscall(SYS_init_module, void " module_image [. len "], \
+unsigned long " len ,
+.BI " const char *" param_values );
+.BI "int syscall(SYS_finit_module, int " fd ,
+.BI " const char *" param_values ", int " flags );
+.fi
+.PP
+.IR Note :
+glibc provides no wrappers for these system calls,
+necessitating the use of
+.BR syscall (2).
+.SH DESCRIPTION
+.BR init_module ()
+loads an ELF image into kernel space,
+performs any necessary symbol relocations,
+initializes module parameters to values provided by the caller,
+and then runs the module's
+.I init
+function.
+This system call requires privilege.
+.PP
+The
+.I module_image
+argument points to a buffer containing the binary image
+to be loaded;
+.I len
+specifies the size of that buffer.
+The module image should be a valid ELF image, built for the running kernel.
+.PP
+The
+.I param_values
+argument is a string containing space-delimited specifications of the
+values for module parameters (defined inside the module using
+.BR module_param ()
+and
+.BR module_param_array ()).
+The kernel parses this string and initializes the specified
+parameters.
+Each of the parameter specifications has the form:
+.PP
+.RI " " name [\c
+.BI = value\c
+.RB [ ,\c
+.IR value ...]]
+.PP
+The parameter
+.I name
+is one of those defined within the module using
+.BR module_param ()
+(see the Linux kernel source file
+.IR include/linux/moduleparam.h ).
+The parameter
+.I value
+is optional in the case of
+.I bool
+and
+.I invbool
+parameters.
+Values for array parameters are specified as a comma-separated list.
+.SS finit_module()
+The
+.BR finit_module ()
+.\" commit 34e1169d996ab148490c01b65b4ee371cf8ffba2
+.\" https://lwn.net/Articles/519010/
+system call is like
+.BR init_module (),
+but reads the module to be loaded from the file descriptor
+.IR fd .
+It is useful when the authenticity of a kernel module
+can be determined from its location in the filesystem;
+in cases where that is possible,
+the overhead of using cryptographically signed modules to
+determine the authenticity of a module can be avoided.
+The
+.I param_values
+argument is as for
+.BR init_module ().
+.PP
+The
+.I flags
+argument modifies the operation of
+.BR finit_module ().
+It is a bit mask value created by ORing
+together zero or more of the following flags:
+.\" commit 2f3238aebedb243804f58d62d57244edec4149b2
+.TP
+.B MODULE_INIT_IGNORE_MODVERSIONS
+Ignore symbol version hashes.
+.TP
+.B MODULE_INIT_IGNORE_VERMAGIC
+Ignore kernel version magic.
+.PP
+There are some safety checks built into a module to ensure that
+it matches the kernel against which it is loaded.
+.\" http://www.tldp.org/HOWTO/Module-HOWTO/basekerncompat.html
+.\" is dated, but informative
+These checks are recorded when the module is built and
+verified when the module is loaded.
+First, the module records a "vermagic" string containing
+the kernel version number and prominent features (such as the CPU type).
+Second, if the module was built with the
+.B CONFIG_MODVERSIONS
+configuration option enabled,
+a version hash is recorded for each symbol the module uses.
+This hash is based on the types of the arguments and return value
+for the function named by the symbol.
+In this case, the kernel version number within the
+"vermagic" string is ignored,
+as the symbol version hashes are assumed to be sufficiently reliable.
+.PP
+Using the
+.B MODULE_INIT_IGNORE_VERMAGIC
+flag indicates that the "vermagic" string is to be ignored, and the
+.B MODULE_INIT_IGNORE_MODVERSIONS
+flag indicates that the symbol version hashes are to be ignored.
+If the kernel is built to permit forced loading (i.e., configured with
+.BR CONFIG_MODULE_FORCE_LOAD ),
+then loading continues; otherwise, it fails with the error
+.B ENOEXEC
+as expected for malformed modules.
+.SH RETURN VALUE
+On success, these system calls return 0.
+On error, \-1 is returned and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.BR EBADMSG " (since Linux 3.7)"
+Module signature is misformatted.
+.TP
+.B EBUSY
+Timeout while trying to resolve a symbol reference by this module.
+.TP
+.B EFAULT
+An address argument referred to a location that
+is outside the process's accessible address space.
+.TP
+.BR ENOKEY " (since Linux 3.7)"
+.\" commit 48ba2462ace6072741fd8d0058207d630ce93bf1
+.\" commit 1d0059f3a468825b5fc5405c636a2f6e02707ffa
+.\" commit 106a4ee258d14818467829bf0e12aeae14c16cd7
+Module signature is invalid or
+the kernel does not have a key for this module.
+This error is returned only if the kernel was configured with
+.BR CONFIG_MODULE_SIG_FORCE ;
+if the kernel was not configured with this option,
+then an invalid or unsigned module simply taints the kernel.
+.TP
+.B ENOMEM
+Out of memory.
+.TP
+.B EPERM
+The caller was not privileged
+(did not have the
+.B CAP_SYS_MODULE
+capability),
+or module loading is disabled
+(see
+.I /proc/sys/kernel/modules_disabled
+in
+.BR proc (5)).
+.PP
+The following errors may additionally occur for
+.BR init_module ():
+.TP
+.B EEXIST
+A module with this name is already loaded.
+.TP
+.B EINVAL
+.I param_values
+is invalid, or some part of the ELF image in
+.I module_image
+contains inconsistencies.
+.\" .TP
+.\" .BR EINVAL " (Linux 2.4 and earlier)"
+.\" Some
+.\" .I image
+.\" slot is filled in incorrectly,
+.\" .I image\->name
+.\" does not correspond to the original module name, some
+.\" .I image\->deps
+.\" entry does not correspond to a loaded module,
+.\" or some other similar inconsistency.
+.TP
+.B ENOEXEC
+The binary image supplied in
+.I module_image
+is not an ELF image,
+or is an ELF image that is invalid or for a different architecture.
+.PP
+The following errors may additionally occur for
+.BR finit_module ():
+.TP
+.B EBADF
+The file referred to by
+.I fd
+is not opened for reading.
+.TP
+.B EFBIG
+The file referred to by
+.I fd
+is too large.
+.TP
+.B EINVAL
+.I flags
+is invalid.
+.TP
+.B ENOEXEC
+.I fd
+does not refer to an open file.
+.TP
+.BR ETXTBSY " (since Linux 4.7)"
+.\" commit 39d637af5aa7577f655c58b9e55587566c63a0af
+The file referred to by
+.I fd
+is opened for read-write.
+.PP
+In addition to the above errors, if the module's
+.I init
+function is executed and returns an error, then
+.BR init_module ()
+or
+.BR finit_module ()
+fails and
+.I errno
+is set to the value returned by the
+.I init
+function.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+.TP
+.BR finit_module ()
+Linux 3.8.
+.PP
+The
+.BR init_module ()
+system call is not supported by glibc.
+No declaration is provided in glibc headers, but, through a quirk of history,
+glibc versions before glibc 2.23 did export an ABI for this system call.
+Therefore, in order to employ this system call,
+it is (before glibc 2.23) sufficient to
+manually declare the interface in your code;
+alternatively, you can invoke the system call using
+.BR syscall (2).
+.SS Linux 2.4 and earlier
+In Linux 2.4 and earlier, the
+.BR init_module ()
+system call was rather different:
+.PP
+.B " #include <linux/module.h>"
+.PP
+.BI " int init_module(const char *" name ", struct module *" image );
+.PP
+(User-space applications can detect which version of
+.BR init_module ()
+is available by calling
+.BR query_module ();
+the latter call fails with the error
+.B ENOSYS
+on Linux 2.6 and later.)
+.PP
+The older version of the system call
+loads the relocated module image pointed to by
+.I image
+into kernel space and runs the module's
+.I init
+function.
+The caller is responsible for providing the relocated image (since
+Linux 2.6, the
+.BR init_module ()
+system call does the relocation).
+.PP
+The module image begins with a module structure and is followed by
+code and data as appropriate.
+Since Linux 2.2, the module structure is defined as follows:
+.PP
+.in +4n
+.EX
+struct module {
+ unsigned long size_of_struct;
+ struct module *next;
+ const char *name;
+ unsigned long size;
+ long usecount;
+ unsigned long flags;
+ unsigned int nsyms;
+ unsigned int ndeps;
+ struct module_symbol *syms;
+ struct module_ref *deps;
+ struct module_ref *refs;
+ int (*init)(void);
+ void (*cleanup)(void);
+ const struct exception_table_entry *ex_table_start;
+ const struct exception_table_entry *ex_table_end;
+#ifdef __alpha__
+ unsigned long gp;
+#endif
+};
+.EE
+.in
+.PP
+All of the pointer fields, with the exception of
+.I next
+and
+.IR refs ,
+are expected to point within the module body and be
+initialized as appropriate for kernel space, that is, relocated with
+the rest of the module.
+.SH NOTES
+Information about currently loaded modules can be found in
+.I /proc/modules
+and in the file trees under the per-module subdirectories under
+.IR /sys/module .
+.PP
+See the Linux kernel source file
+.I include/linux/module.h
+for some useful background information.
+.SH SEE ALSO
+.BR create_module (2),
+.BR delete_module (2),
+.BR query_module (2),
+.BR lsmod (8),
+.BR modprobe (8)
diff --git a/man2/inl.2 b/man2/inl.2
new file mode 100644
index 0000000..2c63c75
--- /dev/null
+++ b/man2/inl.2
@@ -0,0 +1 @@
+.so man2/outb.2
diff --git a/man2/inl_p.2 b/man2/inl_p.2
new file mode 100644
index 0000000..2c63c75
--- /dev/null
+++ b/man2/inl_p.2
@@ -0,0 +1 @@
+.so man2/outb.2
diff --git a/man2/inotify_add_watch.2 b/man2/inotify_add_watch.2
new file mode 100644
index 0000000..2604115
--- /dev/null
+++ b/man2/inotify_add_watch.2
@@ -0,0 +1,135 @@
+.\" Copyright (C) 2005 Robert Love
+.\" and Copyright, 2006 Michael Kerrisk
+.\"
+.\" SPDX-License-Identifier: GPL-2.0-or-later
+.\"
+.\" 2005-07-19 Robert Love <rlove@rlove.org> - initial version
+.\" 2006-02-07 mtk, various changes
+.\"
+.TH inotify_add_watch 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+inotify_add_watch \- add a watch to an initialized inotify instance
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/inotify.h>
+.PP
+.BI "int inotify_add_watch(int " fd ", const char *" pathname ", uint32_t " mask );
+.fi
+.SH DESCRIPTION
+.BR inotify_add_watch ()
+adds a new watch, or modifies an existing watch,
+for the file whose location is specified in
+.IR pathname ;
+the caller must have read permission for this file.
+The
+.I fd
+argument is a file descriptor referring to the
+inotify instance whose watch list is to be modified.
+The events to be monitored for
+.I pathname
+are specified in the
+.I mask
+bit-mask argument.
+See
+.BR inotify (7)
+for a description of the bits that can be set in
+.IR mask .
+.PP
+A successful call to
+.BR inotify_add_watch ()
+returns a unique watch descriptor for this inotify instance,
+for the filesystem object (inode) that corresponds to
+.IR pathname .
+If the filesystem object
+was not previously being watched by this inotify instance,
+then the watch descriptor is newly allocated.
+If the filesystem object was already being watched
+(perhaps via a different link to the same object), then the descriptor
+for the existing watch is returned.
+.PP
+The watch descriptor is returned by later
+.BR read (2)s
+from the inotify file descriptor.
+These reads fetch
+.I inotify_event
+structures (see
+.BR inotify (7))
+indicating filesystem events;
+the watch descriptor inside this structure identifies
+the object for which the event occurred.
+.SH RETURN VALUE
+On success,
+.BR inotify_add_watch ()
+returns a watch descriptor (a nonnegative integer).
+On error, \-1 is returned and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EACCES
+Read access to the given file is not permitted.
+.TP
+.B EBADF
+The given file descriptor is not valid.
+.TP
+.B EEXIST
+.I mask
+contains
+.B IN_MASK_CREATE
+and
+.I pathname
+refers to a file already being watched by the same
+.IR fd .
+.TP
+.B EFAULT
+.I pathname
+points outside of the process's accessible address space.
+.TP
+.B EINVAL
+The given event mask contains no valid events; or
+.I mask
+contains both
+.B IN_MASK_ADD
+and
+.BR IN_MASK_CREATE ;
+or
+.I fd
+is not an inotify file descriptor.
+.TP
+.B ENAMETOOLONG
+.I pathname
+is too long.
+.TP
+.B ENOENT
+A directory component in
+.I pathname
+does not exist or is a dangling symbolic link.
+.TP
+.B ENOMEM
+Insufficient kernel memory was available.
+.TP
+.B ENOSPC
+The user limit on the total number of inotify watches was reached or the
+kernel failed to allocate a needed resource.
+.TP
+.B ENOTDIR
+.I mask
+contains
+.B IN_ONLYDIR
+and
+.I pathname
+is not a directory.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 2.6.13.
+.SH EXAMPLES
+See
+.BR inotify (7).
+.SH SEE ALSO
+.BR inotify_init (2),
+.BR inotify_rm_watch (2),
+.BR inotify (7)
diff --git a/man2/inotify_init.2 b/man2/inotify_init.2
new file mode 100644
index 0000000..e6be9c3
--- /dev/null
+++ b/man2/inotify_init.2
@@ -0,0 +1,97 @@
+.\" Copyright (C) 2005 Robert Love
+.\" and Copyright (C) 2008, Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: GPL-2.0-or-later
+.\"
+.\" 2005-07-19 Robert Love <rlove@rlove.org> - initial version
+.\" 2006-02-07 mtk, minor changes
+.\" 2008-10-10 mtk: add description of inotify_init1()
+.\"
+.TH inotify_init 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+inotify_init, inotify_init1 \- initialize an inotify instance
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/inotify.h>
+.PP
+.B "int inotify_init(void);"
+.BI "int inotify_init1(int " flags );
+.fi
+.SH DESCRIPTION
+For an overview of the inotify API, see
+.BR inotify (7).
+.PP
+.BR inotify_init ()
+initializes a new inotify instance and returns a file descriptor associated
+with a new inotify event queue.
+.PP
+If
+.I flags
+is 0, then
+.BR inotify_init1 ()
+is the same as
+.BR inotify_init ().
+The following values can be bitwise ORed in
+.I flags
+to obtain different behavior:
+.TP
+.B IN_NONBLOCK
+Set the
+.B O_NONBLOCK
+file status flag on the open file description (see
+.BR open (2))
+referred to by the new file descriptor.
+Using this flag saves extra calls to
+.BR fcntl (2)
+to achieve the same result.
+.TP
+.B IN_CLOEXEC
+Set the close-on-exec
+.RB ( FD_CLOEXEC )
+flag on the new file descriptor.
+See the description of the
+.B O_CLOEXEC
+flag in
+.BR open (2)
+for reasons why this may be useful.
+.SH RETURN VALUE
+On success, these system calls return a new file descriptor.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EINVAL
+.RB ( inotify_init1 ())
+An invalid value was specified in
+.IR flags .
+.TP
+.B EMFILE
+The user limit on the total number of inotify instances has been reached.
+.TP
+.B EMFILE
+The per-process limit on the number of open file descriptors has been reached.
+.TP
+.B ENFILE
+The system-wide limit on the total number of open files has been reached.
+.TP
+.B ENOMEM
+Insufficient kernel memory is available.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+.TP
+.BR inotify_init ()
+Linux 2.6.13,
+glibc 2.4.
+.TP
+.BR inotify_init1 ()
+Linux 2.6.27,
+glibc 2.9.
+.SH SEE ALSO
+.BR inotify_add_watch (2),
+.BR inotify_rm_watch (2),
+.BR inotify (7)
diff --git a/man2/inotify_init1.2 b/man2/inotify_init1.2
new file mode 100644
index 0000000..62c5b44
--- /dev/null
+++ b/man2/inotify_init1.2
@@ -0,0 +1 @@
+.so man2/inotify_init.2
diff --git a/man2/inotify_rm_watch.2 b/man2/inotify_rm_watch.2
new file mode 100644
index 0000000..8e1b283
--- /dev/null
+++ b/man2/inotify_rm_watch.2
@@ -0,0 +1,60 @@
+.\" Copyright (C) 2005 Robert Love
+.\"
+.\" SPDX-License-Identifier: GPL-2.0-or-later
+.\"
+.\" 2005-07-19 Robert Love <rlove@rlove.org> - initial version
+.\" 2006-02-07 mtk, minor changes
+.\"
+.TH inotify_rm_watch 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+inotify_rm_watch \- remove an existing watch from an inotify instance
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/inotify.h>
+.PP
+.BI "int inotify_rm_watch(int " fd ", int " wd );
+.\" Before glibc 2.10, the second argument was typed as uint32_t.
+.\" https://www.sourceware.org/bugzilla/show_bug.cgi?id=7040
+.fi
+.SH DESCRIPTION
+.BR inotify_rm_watch ()
+removes the watch associated with the watch descriptor
+.I wd
+from the inotify instance associated with the file descriptor
+.IR fd .
+.PP
+Removing a watch causes an
+.B IN_IGNORED
+event to be generated for this watch descriptor.
+(See
+.BR inotify (7).)
+.SH RETURN VALUE
+On success,
+.BR inotify_rm_watch ()
+returns zero.
+On error, \-1 is returned and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EBADF
+.I fd
+is not a valid file descriptor.
+.TP
+.B EINVAL
+The watch descriptor
+.I wd
+is not valid; or
+.I fd
+is not an inotify file descriptor.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 2.6.13.
+.SH SEE ALSO
+.BR inotify_add_watch (2),
+.BR inotify_init (2),
+.BR inotify (7)
diff --git a/man2/insb.2 b/man2/insb.2
new file mode 100644
index 0000000..2c63c75
--- /dev/null
+++ b/man2/insb.2
@@ -0,0 +1 @@
+.so man2/outb.2
diff --git a/man2/insl.2 b/man2/insl.2
new file mode 100644
index 0000000..2c63c75
--- /dev/null
+++ b/man2/insl.2
@@ -0,0 +1 @@
+.so man2/outb.2
diff --git a/man2/insw.2 b/man2/insw.2
new file mode 100644
index 0000000..2c63c75
--- /dev/null
+++ b/man2/insw.2
@@ -0,0 +1 @@
+.so man2/outb.2
diff --git a/man2/intro.2 b/man2/intro.2
new file mode 100644
index 0000000..ef3d8cf
--- /dev/null
+++ b/man2/intro.2
@@ -0,0 +1,115 @@
+.\" Copyright (C) 2007 Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" 2007-10-23 mtk: moved the _syscallN specific material to the
+.\" new _syscall(2) page, and substantially enhanced and rewrote
+.\" the remaining material on this page.
+.\"
+.TH intro 2 2023-02-05 "Linux man-pages 6.05.01"
+.SH NAME
+intro \- introduction to system calls
+.SH DESCRIPTION
+Section 2 of the manual describes the Linux system calls.
+A system call is an entry point into the Linux kernel.
+Usually, system calls are not invoked directly:
+instead, most system calls have corresponding C library
+wrapper functions which perform the steps required
+(e.g., trapping to kernel mode) in order to invoke
+the system call.
+Thus, making a system call looks the same as invoking a normal
+library function.
+.PP
+In many cases, the C library wrapper function does nothing more than:
+.IP \[bu] 3
+copying arguments and the unique system call number to the
+registers where the kernel expects them;
+.IP \[bu]
+trapping to kernel mode,
+at which point the kernel does the real work of the system call;
+.IP \[bu]
+setting
+.I errno
+if the system call returns an error number when the kernel returns the
+CPU to user mode.
+.PP
+However, in a few cases, a wrapper function may do rather more than this,
+for example, performing some preprocessing
+of the arguments before trapping to kernel mode,
+or postprocessing of values returned by the system call.
+Where this is the case, the manual pages in Section 2 generally
+try to note the details of both the (usually GNU) C library API
+and the raw system call.
+Most commonly, the main DESCRIPTION will focus on the C library interface,
+and differences for the system call are covered in the NOTES section.
+.PP
+For a list of the Linux system calls, see
+.BR syscalls (2).
+.SH RETURN VALUE
+On error, most system calls return a negative error number
+(i.e., the negated value of one of the constants described in
+.BR errno (3)).
+The C library wrapper hides this detail from the caller: when a
+system call returns a negative value, the wrapper copies the
+absolute value into the
+.I errno
+variable, and returns \-1 as the return value of the wrapper.
+.PP
+The value returned by a successful system call depends on the call.
+Many system calls return 0 on success, but some can return nonzero
+values from a successful call.
+The details are described in the individual manual pages.
+.PP
+In some cases,
+the programmer must define a feature test macro in order to obtain
+the declaration of a system call from the header file specified
+in the man page SYNOPSIS section.
+(Where required, these feature test macros must be defined before including
+.I any
+header files.)
+In such cases, the required macro is described in the man page.
+For further information on feature test macros, see
+.BR feature_test_macros (7).
+.SH STANDARDS
+Certain terms and abbreviations are used to indicate UNIX variants
+and standards to which calls in this section conform.
+See
+.BR standards (7).
+.SH NOTES
+.SS Calling directly
+In most cases, it is unnecessary to invoke a system call directly,
+but there are times when the Standard C library does not implement
+a nice wrapper function for you.
+In this case, the programmer must manually invoke the system call using
+.BR syscall (2).
+Historically, this was also possible using one of the _syscall macros
+described in
+.BR _syscall (2).
+.SS Authors and copyright conditions
+Look at the header of the manual page source for the author(s) and copyright
+conditions.
+Note that these can be different from page to page!
+.SH SEE ALSO
+.ad l
+.nh
+.BR _syscall (2),
+.BR syscall (2),
+.BR syscalls (2),
+.BR errno (3),
+.BR intro (3),
+.BR capabilities (7),
+.BR credentials (7),
+.BR feature_test_macros (7),
+.BR mq_overview (7),
+.BR path_resolution (7),
+.BR pipe (7),
+.BR pty (7),
+.BR sem_overview (7),
+.BR shm_overview (7),
+.BR signal (7),
+.BR socket (7),
+.BR standards (7),
+.BR symlink (7),
+.BR system_data_types (7),
+.BR sysvipc (7),
+.BR time (7)
diff --git a/man2/inw.2 b/man2/inw.2
new file mode 100644
index 0000000..2c63c75
--- /dev/null
+++ b/man2/inw.2
@@ -0,0 +1 @@
+.so man2/outb.2
diff --git a/man2/inw_p.2 b/man2/inw_p.2
new file mode 100644
index 0000000..2c63c75
--- /dev/null
+++ b/man2/inw_p.2
@@ -0,0 +1 @@
+.so man2/outb.2
diff --git a/man2/io_cancel.2 b/man2/io_cancel.2
new file mode 100644
index 0000000..1b413e2
--- /dev/null
+++ b/man2/io_cancel.2
@@ -0,0 +1,106 @@
+.\" Copyright (C) 2003 Free Software Foundation, Inc.
+.\"
+.\" SPDX-License-Identifier: GPL-1.0-or-later
+.\"
+.TH io_cancel 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+io_cancel \- cancel an outstanding asynchronous I/O operation
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.PP
+Alternatively, Asynchronous I/O library
+.RI ( libaio ", " \-laio );
+see VERSIONS.
+.SH SYNOPSIS
+.nf
+.BR "#include <linux/aio_abi.h>" " /* Definition of needed types */"
+.BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */"
+.B #include <unistd.h>
+.PP
+.BI "int syscall(SYS_io_cancel, aio_context_t " ctx_id ", struct iocb *" iocb ,
+.BI " struct io_event *" result );
+.fi
+.SH DESCRIPTION
+.IR Note :
+this page describes the raw Linux system call interface.
+The wrapper function provided by
+.I libaio
+uses a different type for the
+.I ctx_id
+argument.
+See VERSIONS.
+.PP
+The
+.BR io_cancel ()
+system call
+attempts to cancel an asynchronous I/O operation previously submitted with
+.BR io_submit (2).
+The
+.I iocb
+argument describes the operation to be canceled and the
+.I ctx_id
+argument is the AIO context to which the operation was submitted.
+If the operation is successfully canceled, the event will be copied into
+the memory pointed to by
+.I result
+without being placed into the
+completion queue.
+.SH RETURN VALUE
+On success,
+.BR io_cancel ()
+returns 0.
+For the failure return, see VERSIONS.
+.SH ERRORS
+.TP
+.B EAGAIN
+The \fIiocb\fP specified was not canceled.
+.TP
+.B EFAULT
+One of the data structures points to invalid data.
+.TP
+.B EINVAL
+The AIO context specified by \fIctx_id\fP is invalid.
+.TP
+.B ENOSYS
+.BR io_cancel ()
+is not implemented on this architecture.
+.SH VERSIONS
+You probably want to use the
+.BR io_cancel ()
+wrapper function provided by
+.\" http://git.fedorahosted.org/git/?p=libaio.git
+.IR libaio .
+.PP
+Note that the
+.I libaio
+wrapper function uses a different type
+.RI ( io_context_t )
+.\" But glibc is confused, since <libaio.h> uses 'io_context_t' to declare
+.\" the system call.
+for the
+.I ctx_id
+argument.
+Note also that the
+.I libaio
+wrapper does not follow the usual C library conventions for indicating errors:
+on error it returns a negated error number
+(the negative of one of the values listed in ERRORS).
+If the system call is invoked via
+.BR syscall (2),
+then the return value follows the usual conventions for
+indicating an error: \-1, with
+.I errno
+set to a (positive) value that indicates the error.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 2.5.
+.SH SEE ALSO
+.BR io_destroy (2),
+.BR io_getevents (2),
+.BR io_setup (2),
+.BR io_submit (2),
+.BR aio (7)
+.\" .SH AUTHOR
+.\" Kent Yoder.
diff --git a/man2/io_destroy.2 b/man2/io_destroy.2
new file mode 100644
index 0000000..4f513e5
--- /dev/null
+++ b/man2/io_destroy.2
@@ -0,0 +1,97 @@
+.\" Copyright (C) 2003 Free Software Foundation, Inc.
+.\"
+.\" SPDX-License-Identifier: GPL-1.0-or-later
+.\"
+.TH io_destroy 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+io_destroy \- destroy an asynchronous I/O context
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#include <linux/aio_abi.h>" " /* Definition of " aio_context_t " */"
+.BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */"
+.B #include <unistd.h>
+.PP
+.BI "int syscall(SYS_io_destroy, aio_context_t " ctx_id );
+.fi
+.PP
+.IR Note :
+glibc provides no wrapper for
+.BR io_destroy (),
+necessitating the use of
+.BR syscall (2).
+.SH DESCRIPTION
+.IR Note :
+this page describes the raw Linux system call interface.
+The wrapper function provided by
+.I libaio
+uses a different type for the
+.I ctx_id
+argument.
+See VERSIONS.
+.PP
+The
+.BR io_destroy ()
+system call
+will attempt to cancel all outstanding asynchronous I/O operations against
+.IR ctx_id ,
+will block on the completion of all operations
+that could not be canceled, and will destroy the
+.IR ctx_id .
+.SH RETURN VALUE
+On success,
+.BR io_destroy ()
+returns 0.
+For the failure return, see VERSIONS.
+.SH ERRORS
+.TP
+.B EFAULT
+The context pointed to is invalid.
+.TP
+.B EINVAL
+The AIO context specified by \fIctx_id\fP is invalid.
+.TP
+.B ENOSYS
+.BR io_destroy ()
+is not implemented on this architecture.
+.SH VERSIONS
+You probably want to use the
+.BR io_destroy ()
+wrapper function provided by
+.\" http://git.fedorahosted.org/git/?p=libaio.git
+.IR libaio .
+.PP
+Note that the
+.I libaio
+wrapper function uses a different type
+.RI ( io_context_t )
+.\" But glibc is confused, since <libaio.h> uses 'io_context_t' to declare
+.\" the system call.
+for the
+.I ctx_id
+argument.
+Note also that the
+.I libaio
+wrapper does not follow the usual C library conventions for indicating errors:
+on error it returns a negated error number
+(the negative of one of the values listed in ERRORS).
+If the system call is invoked via
+.BR syscall (2),
+then the return value follows the usual conventions for
+indicating an error: \-1, with
+.I errno
+set to a (positive) value that indicates the error.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 2.5.
+.SH SEE ALSO
+.BR io_cancel (2),
+.BR io_getevents (2),
+.BR io_setup (2),
+.BR io_submit (2),
+.BR aio (7)
+.\" .SH AUTHOR
+.\" Kent Yoder.
diff --git a/man2/io_getevents.2 b/man2/io_getevents.2
new file mode 100644
index 0000000..3cf506c
--- /dev/null
+++ b/man2/io_getevents.2
@@ -0,0 +1,137 @@
+.\" Copyright (C) 2003 Free Software Foundation, Inc.
+.\"
+.\" SPDX-License-Identifier: GPL-1.0-or-later
+.\"
+.TH io_getevents 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+io_getevents \- read asynchronous I/O events from the completion queue
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.PP
+Alternatively, Asynchronous I/O library
+.RI ( libaio ", " \-laio );
+see VERSIONS.
+.SH SYNOPSIS
+.nf
+.BR "#include <linux/aio_abi.h>" " /* Definition of " *io_* " types */"
+.BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */"
+.B #include <unistd.h>
+.PP
+.BI "int syscall(SYS_io_getevents, aio_context_t " ctx_id ,
+.BI " long " min_nr ", long " nr ", struct io_event *" events ,
+.BI " struct timespec *" timeout );
+.fi
+.PP
+.IR Note :
+glibc provides no wrapper for
+.BR io_getevents (),
+necessitating the use of
+.BR syscall (2).
+.SH DESCRIPTION
+.IR Note :
+this page describes the raw Linux system call interface.
+The wrapper function provided by
+.I libaio
+uses a different type for the
+.I ctx_id
+argument.
+See VERSIONS.
+.PP
+The
+.BR io_getevents ()
+system call
+attempts to read at least \fImin_nr\fP events and
+up to \fInr\fP events from the completion queue of the AIO context
+specified by \fIctx_id\fP.
+.PP
+The \fItimeout\fP argument specifies the amount of time to wait for events,
+and is specified as a relative timeout in a
+.BR timespec (3)
+structure.
+.PP
+The specified time will be rounded up to the system clock granularity
+and is guaranteed not to expire early.
+.PP
+Specifying
+.I timeout
+as NULL means block indefinitely until at least
+.I min_nr
+events have been obtained.
+.SH RETURN VALUE
+On success,
+.BR io_getevents ()
+returns the number of events read.
+This may be 0, or a value less than
+.IR min_nr ,
+if the
+.I timeout
+expired.
+It may also be a nonzero value less than
+.IR min_nr ,
+if the call was interrupted by a signal handler.
+.PP
+For the failure return, see VERSIONS.
+.SH ERRORS
+.TP
+.B EFAULT
+Either \fIevents\fP or \fItimeout\fP is an invalid pointer.
+.TP
+.B EINTR
+Interrupted by a signal handler; see
+.BR signal (7).
+.TP
+.B EINVAL
+\fIctx_id\fP is invalid.
+\fImin_nr\fP is out of range or \fInr\fP is
+out of range.
+.TP
+.B ENOSYS
+.BR io_getevents ()
+is not implemented on this architecture.
+.SH VERSIONS
+You probably want to use the
+.BR io_getevents ()
+wrapper function provided by
+.\" http://git.fedorahosted.org/git/?p=libaio.git
+.IR libaio .
+.PP
+Note that the
+.I libaio
+wrapper function uses a different type
+.RI ( io_context_t )
+.\" But glibc is confused, since <libaio.h> uses 'io_context_t' to declare
+.\" the system call.
+for the
+.I ctx_id
+argument.
+Note also that the
+.I libaio
+wrapper does not follow the usual C library conventions for indicating errors:
+on error it returns a negated error number
+(the negative of one of the values listed in ERRORS).
+If the system call is invoked via
+.BR syscall (2),
+then the return value follows the usual conventions for
+indicating an error: \-1, with
+.I errno
+set to a (positive) value that indicates the error.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 2.5.
+.SH BUGS
+An invalid
+.I ctx_id
+may cause a segmentation fault instead of generating the error
+.BR EINVAL .
+.SH SEE ALSO
+.BR io_cancel (2),
+.BR io_destroy (2),
+.BR io_setup (2),
+.BR io_submit (2),
+.BR timespec (3),
+.BR aio (7),
+.BR time (7)
+.\" .SH AUTHOR
+.\" Kent Yoder.
diff --git a/man2/io_setup.2 b/man2/io_setup.2
new file mode 100644
index 0000000..0745456
--- /dev/null
+++ b/man2/io_setup.2
@@ -0,0 +1,114 @@
+.\" Copyright (C) 2003 Free Software Foundation, Inc.
+.\"
+.\" SPDX-License-Identifier: GPL-1.0-or-later
+.\"
+.TH io_setup 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+io_setup \- create an asynchronous I/O context
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.PP
+Alternatively, Asynchronous I/O library
+.RI ( libaio ", " \-laio );
+see VERSIONS.
+.SH SYNOPSIS
+.nf
+.BR "#include <linux/aio_abi.h>" " /* Defines needed types */"
+.PP
+.BI "long io_setup(unsigned int " nr_events ", aio_context_t *" ctx_idp );
+.fi
+.PP
+.IR Note :
+There is no glibc wrapper for this system call; see VERSIONS.
+.SH DESCRIPTION
+.IR Note :
+this page describes the raw Linux system call interface.
+The wrapper function provided by
+.I libaio
+uses a different type for the
+.I ctx_idp
+argument.
+See VERSIONS.
+.PP
+The
+.BR io_setup ()
+system call
+creates an asynchronous I/O context suitable for concurrently processing
+\fInr_events\fP operations.
+The
+.I ctx_idp
+argument must not point to an AIO context that already exists, and must
+be initialized to 0 prior to the call.
+On successful creation of the AIO context, \fI*ctx_idp\fP is filled in
+with the resulting handle.
+.SH RETURN VALUE
+On success,
+.BR io_setup ()
+returns 0.
+For the failure return, see VERSIONS.
+.SH ERRORS
+.TP
+.B EAGAIN
+The specified \fInr_events\fP exceeds the limit of available events,
+as defined in
+.I /proc/sys/fs/aio\-max\-nr
+(see
+.BR proc (5)).
+.TP
+.B EFAULT
+An invalid pointer is passed for \fIctx_idp\fP.
+.TP
+.B EINVAL
+\fIctx_idp\fP is not initialized, or the specified \fInr_events\fP
+exceeds internal limits.
+\fInr_events\fP should be greater than 0.
+.TP
+.B ENOMEM
+Insufficient kernel resources are available.
+.TP
+.B ENOSYS
+.BR io_setup ()
+is not implemented on this architecture.
+.SH VERSIONS
+glibc does not provide a wrapper for this system call.
+You could invoke it using
+.BR syscall (2).
+But instead, you probably want to use the
+.BR io_setup ()
+wrapper function provided by
+.\" http://git.fedorahosted.org/git/?p=libaio.git
+.IR libaio .
+.PP
+Note that the
+.I libaio
+wrapper function uses a different type
+.RI ( "io_context_t\ *" )
+.\" But glibc is confused, since <libaio.h> uses 'io_context_t' to declare
+.\" the system call.
+for the
+.I ctx_idp
+argument.
+Note also that the
+.I libaio
+wrapper does not follow the usual C library conventions for indicating errors:
+on error it returns a negated error number
+(the negative of one of the values listed in ERRORS).
+If the system call is invoked via
+.BR syscall (2),
+then the return value follows the usual conventions for
+indicating an error: \-1, with
+.I errno
+set to a (positive) value that indicates the error.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 2.5.
+.SH SEE ALSO
+.BR io_cancel (2),
+.BR io_destroy (2),
+.BR io_getevents (2),
+.BR io_submit (2),
+.BR aio (7)
+.\" .SH AUTHOR
+.\" Kent Yoder.
diff --git a/man2/io_submit.2 b/man2/io_submit.2
new file mode 100644
index 0000000..51efb6b
--- /dev/null
+++ b/man2/io_submit.2
@@ -0,0 +1,289 @@
+.\" Copyright (C) 2003 Free Software Foundation, Inc.
+.\" and Copyright (C) 2017 Goldwyn Rodrigues <rgoldwyn@suse.de>
+.\"
+.\" SPDX-License-Identifier: GPL-1.0-or-later
+.\"
+.TH io_submit 2 2023-05-03 "Linux man-pages 6.05.01"
+.SH NAME
+io_submit \- submit asynchronous I/O blocks for processing
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.PP
+Alternatively, Asynchronous I/O library
+.RI ( libaio ", " \-laio );
+see VERSIONS.
+.SH SYNOPSIS
+.nf
+.BR "#include <linux/aio_abi.h>" " /* Defines needed types */"
+.PP
+.BI "int io_submit(aio_context_t " ctx_id ", long " nr \
+", struct iocb **" iocbpp );
+.fi
+.PP
+.IR Note :
+There is no glibc wrapper for this system call; see VERSIONS.
+.SH DESCRIPTION
+.IR Note :
+this page describes the raw Linux system call interface.
+The wrapper function provided by
+.I libaio
+uses a different type for the
+.I ctx_id
+argument.
+See VERSIONS.
+.PP
+The
+.BR io_submit ()
+system call
+queues \fInr\fP I/O request blocks for processing in
+the AIO context \fIctx_id\fP.
+The
+.I iocbpp
+argument should be an array of \fInr\fP AIO control blocks,
+which will be submitted to context \fIctx_id\fP.
+.PP
+The
+.I iocb
+(I/O control block) structure defined in
+.I linux/aio_abi.h
+defines the parameters that control the I/O operation.
+.PP
+.in +4n
+.EX
+#include <linux/aio_abi.h>
+\&
+struct iocb {
+ __u64 aio_data;
+ __u32 PADDED(aio_key, aio_rw_flags);
+ __u16 aio_lio_opcode;
+ __s16 aio_reqprio;
+ __u32 aio_fildes;
+ __u64 aio_buf;
+ __u64 aio_nbytes;
+ __s64 aio_offset;
+ __u64 aio_reserved2;
+ __u32 aio_flags;
+ __u32 aio_resfd;
+};
+.EE
+.in
+.PP
+The fields of this structure are as follows:
+.TP
+.I aio_data
+This data is copied into the
+.I data
+field of the
+.I io_event
+structure upon I/O completion (see
+.BR io_getevents (2)).
+.TP
+.I aio_key
+This is an internal field used by the kernel.
+Do not modify this field after an
+.BR io_submit ()
+call.
+.TP
+.I aio_rw_flags
+This defines the R/W flags passed with the structure.
+The valid values are:
+.RS
+.TP
+.BR RWF_APPEND " (since Linux 4.16)"
+.\" commit e1fc742e14e01d84d9693c4aca4ab23da65811fb
+Append data to the end of the file.
+See the description of the flag of the same name in
+.BR pwritev2 (2)
+as well as the description of
+.B O_APPEND
+in
+.BR open (2).
+The
+.I aio_offset
+field is ignored.
+The file offset is not changed.
+.TP
+.BR RWF_DSYNC " (since Linux 4.13)"
+Write operation completes according to the requirements of
+synchronized I/O data integrity.
+See the description of the flag of the same name in
+.BR pwritev2 (2)
+as well as the description of
+.B O_DSYNC
+in
+.BR open (2).
+.TP
+.BR RWF_HIPRI " (since Linux 4.13)"
+High priority request, poll if possible.
+.TP
+.BR RWF_NOWAIT " (since Linux 4.14)"
+Don't wait if the I/O will block for operations such as
+file block allocations, dirty page flush, mutex locks,
+or a congested block device inside the kernel.
+If any of these conditions are met, the control block is returned
+immediately with a return value of
+.B \-EAGAIN
+in the
+.I res
+field of the
+.I io_event
+structure (see
+.BR io_getevents (2)).
+.TP
+.BR RWF_SYNC " (since Linux 4.13)"
+Write operation completes according to the requirements of
+synchronized I/O file integrity.
+See the description of the flag of the same name in
+.BR pwritev2 (2)
+as well as the description of
+.B O_SYNC
+in
+.BR open (2).
+.RE
+.TP
+.I aio_lio_opcode
+This defines the type of I/O to be performed by the
+.I iocb
+structure.
+The
+valid values are given by the enum defined in
+.IR linux/aio_abi.h :
+.IP
+.in +4n
+.EX
+enum {
+ IOCB_CMD_PREAD = 0,
+ IOCB_CMD_PWRITE = 1,
+ IOCB_CMD_FSYNC = 2,
+ IOCB_CMD_FDSYNC = 3,
+ IOCB_CMD_POLL = 5,
+ IOCB_CMD_NOOP = 6,
+ IOCB_CMD_PREADV = 7,
+ IOCB_CMD_PWRITEV = 8,
+};
+.EE
+.in
+.TP
+.I aio_reqprio
+This defines the request's priority.
+.TP
+.I aio_fildes
+The file descriptor on which the I/O operation is to be performed.
+.TP
+.I aio_buf
+This is the buffer used to transfer data for a read or write operation.
+.TP
+.I aio_nbytes
+This is the size of the buffer pointed to by
+.IR aio_buf .
+.TP
+.I aio_offset
+This is the file offset at which the I/O operation is to be performed.
+.TP
+.I aio_flags
+This is the set of flags associated with the
+.I iocb
+structure.
+The valid values are:
+.RS
+.TP
+.B IOCB_FLAG_RESFD
+Asynchronous I/O control must signal the file
+descriptor mentioned in
+.I aio_resfd
+upon completion.
+.TP
+.BR IOCB_FLAG_IOPRIO " (since Linux 4.18)"
+.\" commit d9a08a9e616beeccdbd0e7262b7225ffdfa49e92
+Interpret the
+.I aio_reqprio
+field as an
+.B IOPRIO_VALUE
+as defined by
+.IR linux/ioprio.h .
+.RE
+.TP
+.I aio_resfd
+The file descriptor to signal in the event of asynchronous I/O completion.
+.SH RETURN VALUE
+On success,
+.BR io_submit ()
+returns the number of \fIiocb\fPs submitted (which may be
+less than \fInr\fP, or 0 if \fInr\fP is zero).
+For the failure return, see VERSIONS.
+.SH ERRORS
+.TP
+.B EAGAIN
+Insufficient resources are available to queue any \fIiocb\fPs.
+.TP
+.B EBADF
+The file descriptor specified in the first \fIiocb\fP is invalid.
+.TP
+.B EFAULT
+One of the data structures points to invalid data.
+.TP
+.B EINVAL
+The AIO context specified by \fIctx_id\fP is invalid.
+\fInr\fP is less than 0.
+The \fIiocb\fP at
+.I *iocbpp[0]
+is not properly initialized, the operation specified is invalid for the file
+descriptor in the \fIiocb\fP, or the value in the
+.I aio_reqprio
+field is invalid.
+.TP
+.B ENOSYS
+.BR io_submit ()
+is not implemented on this architecture.
+.TP
+.B EPERM
+The
+.I aio_reqprio
+field is set with the class
+.BR IOPRIO_CLASS_RT ,
+but the submitting context does not have the
+.B CAP_SYS_ADMIN
+capability.
+.SH VERSIONS
+glibc does not provide a wrapper for this system call.
+You could invoke it using
+.BR syscall (2).
+But instead, you probably want to use the
+.BR io_submit ()
+wrapper function provided by
+.\" http://git.fedorahosted.org/git/?p=libaio.git
+.IR libaio .
+.PP
+Note that the
+.I libaio
+wrapper function uses a different type
+.RI ( io_context_t )
+.\" But glibc is confused, since <libaio.h> uses 'io_context_t' to declare
+.\" the system call.
+for the
+.I ctx_id
+argument.
+Note also that the
+.I libaio
+wrapper does not follow the usual C library conventions for indicating errors:
+on error it returns a negated error number
+(the negative of one of the values listed in ERRORS).
+If the system call is invoked via
+.BR syscall (2),
+then the return value follows the usual conventions for
+indicating an error: \-1, with
+.I errno
+set to a (positive) value that indicates the error.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 2.5.
+.SH SEE ALSO
+.BR io_cancel (2),
+.BR io_destroy (2),
+.BR io_getevents (2),
+.BR io_setup (2),
+.BR aio (7)
+.\" .SH AUTHOR
+.\" Kent Yoder.
diff --git a/man2/ioctl.2 b/man2/ioctl.2
new file mode 100644
index 0000000..d6c0701
--- /dev/null
+++ b/man2/ioctl.2
@@ -0,0 +1,185 @@
+.\" Copyright (c) 1980, 1991 Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" SPDX-License-Identifier: BSD-4-Clause-UC
+.\"
+.\" @(#)ioctl.2 6.4 (Berkeley) 3/10/91
+.\"
+.\" Modified 1993-07-23 by Rik Faith <faith@cs.unc.edu>
+.\" Modified 1996-10-22 by Eric S. Raymond <esr@thyrsus.com>
+.\" Modified 1999-06-25 by Rachael Munns <vashti@dream.org.uk>
+.\" Modified 2000-09-21 by Andries Brouwer <aeb@cwi.nl>
+.\"
+.TH ioctl 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+ioctl \- control device
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/ioctl.h>
+.PP
+.BI "int ioctl(int " fd ", unsigned long " request ", ...);"
+.\" POSIX says 'request' is int, but glibc has the above
+.\" See https://bugzilla.kernel.org/show_bug.cgi?id=42705
+.fi
+.SH DESCRIPTION
+The
+.BR ioctl ()
+system call manipulates the underlying device parameters of special files.
+In particular, many operating characteristics of character special files
+(e.g., terminals) may be controlled with
+.BR ioctl ()
+requests.
+The argument
+.I fd
+must be an open file descriptor.
+.PP
+The second argument is a device-dependent request code.
+The third argument is an untyped pointer to memory.
+It's traditionally
+.BI "char *" argp
+(from the days before
+.B "void *"
+was valid C), and will be so named for this discussion.
+.PP
+An
+.BR ioctl ()
+.I request
+has encoded in it whether the argument is an
+.I in
+parameter or
+.I out
+parameter, and the size of the argument
+.I argp
+in bytes.
+Macros and defines used in specifying an
+.BR ioctl ()
+.I request
+are located in the file
+.IR <sys/ioctl.h> .
+See NOTES.
+.SH RETURN VALUE
+Usually, on success zero is returned.
+A few
+.BR ioctl ()
+requests use the return value as an output parameter
+and return a nonnegative value on success.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EBADF
+.I fd
+is not a valid file descriptor.
+.TP
+.B EFAULT
+.I argp
+references an inaccessible memory area.
+.TP
+.B EINVAL
+.I request
+or
+.I argp
+is not valid.
+.TP
+.B ENOTTY
+.I fd
+is not associated with a character special device.
+.TP
+.B ENOTTY
+The specified request does not apply to the kind of object that the
+file descriptor
+.I fd
+references.
+.SH VERSIONS
+Arguments, returns, and semantics of
+.BR ioctl ()
+vary according to the device driver in question (the call is used as a
+catch-all for operations that don't cleanly fit the UNIX stream I/O
+model).
+.SH STANDARDS
+None.
+.SH HISTORY
+Version\~7 AT&T UNIX.
+.SH NOTES
+In order to use this call, one needs an open file descriptor.
+Often the
+.BR open (2)
+call has unwanted side effects that can be avoided under Linux
+by giving it the
+.B O_NONBLOCK
+flag.
+.\"
+.SS ioctl structure
+.\" added two sections - aeb
+Ioctl command values are 32-bit constants.
+In principle these constants are completely arbitrary, but people have
+tried to build some structure into them.
+.PP
+The old Linux situation was that of mostly 16-bit constants, where the
+last byte is a serial number, and the preceding byte(s) give a type
+indicating the driver.
+Sometimes the major number was used: 0x03
+for the
+.B HDIO_*
+ioctls, 0x06 for the
+.B LP*
+ioctls.
+And sometimes
+one or more ASCII letters were used.
+For example,
+.B TCGETS
+has value
+0x00005401, with 0x54 = \[aq]T\[aq] indicating the terminal driver, and
+.B CYGETTIMEOUT
+has value 0x00435906, with 0x43 0x59 = \[aq]C\[aq] \[aq]Y\[aq]
+indicating the cyclades driver.
+.PP
+Later (0.98p5) some more information was built into the number.
+One has 2 direction bits
+(00: none, 01: write, 10: read, 11: read/write)
+followed by 14 size bits (giving the size of the argument),
+followed by an 8-bit type (collecting the ioctls in groups
+for a common purpose or a common driver), and an 8-bit
+serial number.
+.PP
+The macros describing this structure live in
+.I <asm/ioctl.h>
+and are
+.B _IO(type,nr)
+and
+.BR "{_IOR,_IOW,_IOWR}(type,nr,size)" .
+They use
+.I sizeof(size)
+so that size is a
+misnomer here: this third argument is a data type.
+.PP
+Note that the size bits are very unreliable: in lots of cases
+they are wrong, either because of buggy macros using
+.IR sizeof(sizeof(struct)) ,
+or because of legacy values.
+.PP
+Thus, it seems that the new structure only gave disadvantages:
+it does not help in checking, but it causes varying values
+for the various architectures.
+.SH SEE ALSO
+.BR execve (2),
+.BR fcntl (2),
+.BR ioctl_console (2),
+.BR ioctl_fat (2),
+.BR ioctl_ficlone (2),
+.BR ioctl_ficlonerange (2),
+.BR ioctl_fideduperange (2),
+.BR ioctl_fslabel (2),
+.BR ioctl_getfsmap (2),
+.BR ioctl_iflags (2),
+.BR ioctl_ns (2),
+.BR ioctl_tty (2),
+.BR ioctl_userfaultfd (2),
+.BR open (2),
+.\" .BR mt (4),
+.BR sd (4),
+.BR tty (4)
diff --git a/man2/ioctl_console.2 b/man2/ioctl_console.2
new file mode 100644
index 0000000..455be75
--- /dev/null
+++ b/man2/ioctl_console.2
@@ -0,0 +1,903 @@
+'\" t
+.\" Copyright (c) 1995 Jim Van Zandt <jrv@vanzandt.mv.com> and aeb
+.\" Sun Feb 26 11:46:23 MET 1995
+.\"
+.\" SPDX-License-Identifier: GPL-2.0-or-later
+.\"
+.\" Modified, Sun Feb 26 15:04:20 1995, faith@cs.unc.edu
+.\" Modified, Thu Apr 20 22:08:17 1995, jrv@vanzandt.mv.com
+.\" Modified, Mon Sep 18 22:32:47 1995, hpa@storm.net (H. Peter Anvin)
+.\" FIXME The following are not documented:
+.\" KDFONTOP (since Linux 2.1.111)
+.\" KDGKBDIACRUC (since Linux 2.6.24)
+.\" KDSKBDIACR
+.\" KDSKBDIACRUC (since Linux 2.6.24)
+.\" KDKBDREP (since Linux 2.1.113)
+.\" KDMAPDISP (not implemented as at Linux 2.6.27)
+.\" KDUNMAPDISP (not implemented as at Linux 2.6.27)
+.\" VT_LOCKSWITCH (since Linux 1.3.47, needs CAP_SYS_TTY_CONFIG)
+.\" VT_UNLOCKSWITCH (since Linux 1.3.47, needs CAP_SYS_TTY_CONFIG)
+.\" VT_GETHIFONTMASK (since Linux 2.6.18)
+.\"
+.TH ioctl_console 2 2023-01-22 "Linux man-pages 6.05.01"
+.SH NAME
+ioctl_console \- ioctls for console terminal and virtual consoles
+.SH DESCRIPTION
+The following Linux-specific
+.BR ioctl (2)
+requests are supported for console terminals and virtual consoles.
+Each requires a third argument, assumed here to be
+.IR argp .
+.TP
+.B KDGETLED
+Get state of LEDs.
+.I argp
+points to a
+.IR char .
+The lower three bits
+of
+.I *argp
+are set to the state of the LEDs, as follows:
+.TS
+l l l.
+LED_CAP 0x04 caps lock led
+LED_NUM 0x02 num lock led
+LED_SCR 0x01 scroll lock led
+.TE
+.TP
+.B KDSETLED
+Set the LEDs.
+The LEDs are set to correspond to the lower three bits of the
+unsigned long integer in
+.IR argp .
+However, if a higher order bit is set,
+the LEDs revert to normal: displaying the state of the
+keyboard functions of caps lock, num lock, and scroll lock.
+.PP
+Before Linux 1.1.54, the LEDs just reflected the state of the corresponding
+keyboard flags, and KDGETLED/KDSETLED would also change the keyboard
+flags.
+Since Linux 1.1.54 the LEDs can be made to display arbitrary
+information, but by default they display the keyboard flags.
+The following two ioctls are used to access the keyboard flags.
+.TP
+.B KDGKBLED
+Get keyboard flags CapsLock, NumLock, ScrollLock (not lights).
+.I argp
+points to a char which is set to the flag state.
+The low order three bits (mask 0x7) get the current flag state,
+and the low order bits of the next nibble (mask 0x70) get
+the default flag state.
+(Since Linux 1.1.54.)
+.TP
+.B KDSKBLED
+Set keyboard flags CapsLock, NumLock, ScrollLock (not lights).
+.I argp
+is an unsigned long integer that has the desired flag state.
+The low order three bits (mask 0x7) have the flag state,
+and the low order bits of the next nibble (mask 0x70) have
+the default flag state.
+(Since Linux 1.1.54.)
+.TP
+.B KDGKBTYPE
+Get keyboard type.
+This returns the value KB_101, defined as 0x02.
+.TP
+.B KDADDIO
+Add I/O port as valid.
+Equivalent to
+.IR ioperm(arg,1,1) .
+.TP
+.B KDDELIO
+Delete I/O port as valid.
+Equivalent to
+.IR ioperm(arg,1,0) .
+.TP
+.B KDENABIO
+Enable I/O to video board.
+Equivalent to
+.IR "ioperm(0x3b4, 0x3df\-0x3b4+1, 1)" .
+.TP
+.B KDDISABIO
+Disable I/O to video board.
+Equivalent to
+.IR "ioperm(0x3b4, 0x3df\-0x3b4+1, 0)" .
+.TP
+.B KDSETMODE
+Set text/graphics mode.
+.I argp
+is an unsigned integer containing one of:
+.TS
+l l.
+KD_TEXT 0x00
+KD_GRAPHICS 0x01
+.TE
+.TP
+.B KDGETMODE
+Get text/graphics mode.
+.I argp
+points to an
+.I int
+which is set to one
+of the values shown above for
+.BR KDSETMODE .
+.TP
+.B KDMKTONE
+Generate tone of specified length.
+The lower 16 bits of the unsigned long integer in
+.I argp
+specify the period in clock cycles,
+and the upper 16 bits give the duration in msec.
+If the duration is zero, the sound is turned off.
+Control returns immediately.
+For example,
+.I argp
+= (125<<16) + 0x637 would specify
+the beep normally associated with a ctrl-G.
+(Thus since Linux 0.99pl1; broken in Linux 2.1.49-50.)
+.TP
+.B KIOCSOUND
+Start or stop sound generation.
+The lower 16 bits of
+.I argp
+specify the period in clock cycles
+(that is,
+.I argp
+= 1193180/frequency).
+.I argp
+= 0 turns sound off.
+In either case, control returns immediately.
+.TP
+.B GIO_CMAP
+Get the current default color map from kernel.
+.I argp
+points to
+a 48-byte array.
+(Since Linux 1.3.3.)
+.TP
+.B PIO_CMAP
+Change the default text-mode color map.
+.I argp
+points to a
+48-byte array which contains, in order, the Red, Green, and Blue
+values for the 16 available screen colors: 0 is off, and 255 is full
+intensity.
+The default colors are, in order: black, dark red, dark
+green, brown, dark blue, dark purple, dark cyan, light grey, dark
+grey, bright red, bright green, yellow, bright blue, bright purple,
+bright cyan, and white.
+(Since Linux 1.3.3.)
+.TP
+.B GIO_FONT
+Gets 256-character screen font in expanded form.
+.I argp
+points to an 8192-byte array.
+Fails with error code
+.B EINVAL
+if the
+currently loaded font is a 512-character font, or if the console is
+not in text mode.
+.TP
+.B GIO_FONTX
+Gets screen font and associated information.
+.I argp
+points to a
+.I "struct consolefontdesc"
+(see
+.BR PIO_FONTX ).
+On call, the
+.I charcount
+field should be set to the maximum number of
+characters that would fit in the buffer pointed to by
+.IR chardata .
+On return, the
+.I charcount
+and
+.I charheight
+are filled with
+the respective data for the currently loaded font, and the
+.I chardata
+array contains the font data if the initial value of
+.I charcount
+indicated enough space was available; otherwise the
+buffer is untouched and
+.I errno
+is set to
+.BR ENOMEM .
+(Since Linux 1.3.1.)
+.TP
+.B PIO_FONT
+Sets 256-character screen font.
+Load font into the EGA/VGA character
+generator.
+.I argp
+points to an 8192-byte map, with 32 bytes per
+character.
+Only the first
+.I N
+of them are used for an 8x\fIN\fP font
+(0 <
+.I N
+<= 32).
+This call also invalidates the Unicode mapping.
+.TP
+.B PIO_FONTX
+Sets screen font and associated rendering information.
+.I argp
+points to a
+.IP
+.in +4n
+.EX
+struct consolefontdesc {
+ unsigned short charcount; /* characters in font
+ (256 or 512) */
+ unsigned short charheight; /* scan lines per
+ character (1\-32) */
+ char *chardata; /* font data in
+ expanded form */
+};
+.EE
+.in
+.IP
+If necessary, the screen will be appropriately resized, and
+.B SIGWINCH
+sent to the appropriate processes.
+This call also invalidates the Unicode mapping.
+(Since Linux 1.3.1.)
+.TP
+.B PIO_FONTRESET
+Resets the screen font, size, and Unicode mapping to the bootup
+defaults.
+.I argp
+is unused, but should be set to NULL to
+ensure compatibility with future versions of Linux.
+(Since Linux 1.3.28.)
+.TP
+.B GIO_SCRNMAP
+Get screen mapping from kernel.
+.I argp
+points to an area of size
+E_TABSZ, which is loaded with the font positions used to display each
+character.
+This call is likely to return useless information if the
+currently loaded font is more than 256 characters.
+.TP
+.B GIO_UNISCRNMAP
+Get full Unicode screen mapping from kernel.
+.I argp
+points to an
+area of size
+.IR "E_TABSZ*sizeof(unsigned short)" ,
+which is loaded with the
+Unicodes each character represents.
+A special set of Unicodes,
+starting at U+F000, are used to represent "direct to font" mappings.
+(Since Linux 1.3.1.)
+.TP
+.B PIO_SCRNMAP
+Loads the "user definable" (fourth) table in the kernel which maps
+bytes into console screen symbols.
+.I argp
+points to an area of
+size E_TABSZ.
+.TP
+.B PIO_UNISCRNMAP
+Loads the "user definable" (fourth) table in the kernel which maps
+bytes into Unicodes, which are then translated into screen symbols
+according to the currently loaded Unicode-to-font map.
+Special Unicodes starting at U+F000 can be used to map directly to the font
+symbols.
+(Since Linux 1.3.1.)
+.TP
+.B GIO_UNIMAP
+Get Unicode-to-font mapping from kernel.
+.I argp
+points to a
+.IP
+.in +4n
+.EX
+struct unimapdesc {
+ unsigned short entry_ct;
+ struct unipair *entries;
+};
+.EE
+.in
+.IP
+where
+.I entries
+points to an array of
+.IP
+.in +4n
+.EX
+struct unipair {
+ unsigned short unicode;
+ unsigned short fontpos;
+};
+.EE
+.in
+.IP
+(Since Linux 1.1.92.)
+.TP
+.B PIO_UNIMAP
+Put Unicode-to-font mapping in kernel.
+.I argp
+points to a
+.IR "struct unimapdesc" .
+(Since Linux 1.1.92.)
+.TP
+.B PIO_UNIMAPCLR
+Clear table, possibly advise hash algorithm.
+.I argp
+points to a
+.IP
+.in +4n
+.EX
+struct unimapinit {
+ unsigned short advised_hashsize; /* 0 if no opinion */
+ unsigned short advised_hashstep; /* 0 if no opinion */
+ unsigned short advised_hashlevel; /* 0 if no opinion */
+};
+.EE
+.in
+.IP
+(Since Linux 1.1.92.)
+.TP
+.B KDGKBMODE
+Gets current keyboard mode.
+.I argp
+points to a
+.I long
+which is set to one
+of these:
+.TS
+l l.
+K_RAW 0x00 /* Raw (scancode) mode */
+K_XLATE 0x01 /* Translate keycodes using keymap */
+K_MEDIUMRAW 0x02 /* Medium raw (scancode) mode */
+K_UNICODE 0x03 /* Unicode mode */
+K_OFF 0x04 /* Disabled mode; since Linux 2.6.39 */
+.\" K_OFF: commit 9fc3de9c83565fcaa23df74c2fc414bb6e7efb0a
+.TE
+.TP
+.B KDSKBMODE
+Sets current keyboard mode.
+.I argp
+is a
+.I long
+equal to one of the values shown for
+.BR KDGKBMODE .
+.TP
+.B KDGKBMETA
+Gets meta key handling mode.
+.I argp
+points to a
+.I long
+which is
+set to one of these:
+.TS
+l l l.
+K_METABIT 0x03 set high order bit
+K_ESCPREFIX 0x04 escape prefix
+.TE
+.TP
+.B KDSKBMETA
+Sets meta key handling mode.
+.I argp
+is a
+.I long
+equal to one of the values shown above for
+.BR KDGKBMETA .
+.TP
+.B KDGKBENT
+Gets one entry in key translation table (keycode to action code).
+.I argp
+points to a
+.IP
+.in +4n
+.EX
+struct kbentry {
+ unsigned char kb_table;
+ unsigned char kb_index;
+ unsigned short kb_value;
+};
+.EE
+.in
+.IP
+with the first two members filled in:
+.I kb_table
+selects the key table (0 <=
+.I kb_table
+< MAX_NR_KEYMAPS),
+and
+.I kb_index
+is the keycode (0 <=
+.I kb_index
+< NR_KEYS).
+.I kb_value
+is set to the corresponding action code,
+or K_HOLE if there is no such key,
+or K_NOSUCHMAP if
+.I kb_table
+is invalid.
+.TP
+.B KDSKBENT
+Sets one entry in translation table.
+.I argp
+points to a
+.IR "struct kbentry" .
+.TP
+.B KDGKBSENT
+Gets one function key string.
+.I argp
+points to a
+.IP
+.in +4n
+.EX
+struct kbsentry {
+ unsigned char kb_func;
+ unsigned char kb_string[512];
+};
+.EE
+.in
+.IP
+.I kb_string
+is set to the (null-terminated) string corresponding to
+the
+.IR kb_func th
+function key action code.
+.TP
+.B KDSKBSENT
+Sets one function key string entry.
+.I argp
+points to a
+.IR "struct kbsentry" .
+.TP
+.B KDGKBDIACR
+Read kernel accent table.
+.I argp
+points to a
+.IP
+.in +4n
+.EX
+struct kbdiacrs {
+ unsigned int kb_cnt;
+ struct kbdiacr kbdiacr[256];
+};
+.EE
+.in
+.IP
+where
+.I kb_cnt
+is the number of entries in the array, each of which
+is a
+.IP
+.in +4n
+.EX
+struct kbdiacr {
+ unsigned char diacr;
+ unsigned char base;
+ unsigned char result;
+};
+.EE
+.in
+.TP
+.B KDGETKEYCODE
+Read kernel keycode table entry (scan code to keycode).
+.I argp
+points to a
+.IP
+.in +4n
+.EX
+struct kbkeycode {
+ unsigned int scancode;
+ unsigned int keycode;
+};
+.EE
+.in
+.IP
+.I keycode
+is set to correspond to the given
+.IR scancode .
+(89 <=
+.I scancode
+<= 255 only.
+For 1 <=
+.I scancode
+<= 88,
+.IR keycode == scancode .)
+(Since Linux 1.1.63.)
+.TP
+.B KDSETKEYCODE
+Write kernel keycode table entry.
+.I argp
+points to a
+.IR "struct kbkeycode" .
+(Since Linux 1.1.63.)
+.TP
+.B KDSIGACCEPT
+The calling process indicates its willingness to accept the signal
+.I argp
+when it is generated by pressing an appropriate key combination.
+(1 <=
+.I argp
+<= NSIG).
+(See
+.IR spawn_console ()
+in
+.IR linux/drivers/char/keyboard.c .)
+.TP
+.B VT_OPENQRY
+Returns the first available (non-opened) console.
+.I argp
+points to an
+.I int
+which is set to the
+number of the vt (1 <=
+.I *argp
+<= MAX_NR_CONSOLES).
+.TP
+.B VT_GETMODE
+Get mode of active vt.
+.I argp
+points to a
+.IP
+.in +4n
+.EX
+struct vt_mode {
+ char mode; /* vt mode */
+ char waitv; /* if set, hang on writes if not active */
+ short relsig; /* signal to raise on release req */
+ short acqsig; /* signal to raise on acquisition */
+ short frsig; /* unused (set to 0) */
+};
+.EE
+.in
+.IP
+which is set to the mode of the active vt.
+.I mode
+is set to one of these values:
+.TS
+l l.
+VT_AUTO auto vt switching
+VT_PROCESS process controls switching
+VT_ACKACQ acknowledge switch
+.TE
+.TP
+.B VT_SETMODE
+Set mode of active vt.
+.I argp
+points to a
+.IR "struct vt_mode" .
+.TP
+.B VT_GETSTATE
+Get global vt state info.
+.I argp
+points to a
+.IP
+.in +4n
+.EX
+struct vt_stat {
+ unsigned short v_active; /* active vt */
+ unsigned short v_signal; /* signal to send */
+ unsigned short v_state; /* vt bit mask */
+};
+.EE
+.in
+.IP
+For each vt in use, the corresponding bit in the
+.I v_state
+member is set.
+(Linux 1.0 through Linux 1.1.92.)
+.TP
+.B VT_RELDISP
+Release a display.
+.TP
+.B VT_ACTIVATE
+Switch to vt
+.I argp
+(1 <=
+.I argp
+<= MAX_NR_CONSOLES).
+.TP
+.B VT_WAITACTIVE
+Wait until vt
+.I argp
+has been activated.
+.TP
+.B VT_DISALLOCATE
+Deallocate the memory associated with vt
+.IR argp .
+(Since Linux 1.1.54.)
+.TP
+.B VT_RESIZE
+Set the kernel's idea of screensize.
+.I argp
+points to a
+.IP
+.in +4n
+.EX
+struct vt_sizes {
+ unsigned short v_rows; /* # rows */
+ unsigned short v_cols; /* # columns */
+ unsigned short v_scrollsize; /* no longer used */
+};
+.EE
+.in
+.IP
+Note that this does not change the videomode.
+See
+.BR resizecons (8).
+(Since Linux 1.1.54.)
+.TP
+.B VT_RESIZEX
+Set the kernel's idea of various screen parameters.
+.I argp
+points to a
+.IP
+.in +4n
+.EX
+struct vt_consize {
+ unsigned short v_rows; /* number of rows */
+ unsigned short v_cols; /* number of columns */
+ unsigned short v_vlin; /* number of pixel rows
+ on screen */
+ unsigned short v_clin; /* number of pixel rows
+ per character */
+ unsigned short v_vcol; /* number of pixel columns
+ on screen */
+ unsigned short v_ccol; /* number of pixel columns
+ per character */
+};
+.EE
+.in
+.IP
+Any parameter may be set to zero, indicating "no change", but if
+multiple parameters are set, they must be self-consistent.
+Note that this does not change the videomode.
+See
+.BR resizecons (8).
+(Since Linux 1.3.3.)
+.PP
+The action of the following ioctls depends on the first byte in the struct
+pointed to by
+.IR argp ,
+referred to here as the
+.IR subcode .
+These are legal only for the superuser or the owner of the current terminal.
+Symbolic
+.IR subcode s
+are available in
+.I <linux/tiocl.h>
+since
+Linux 2.5.71.
+.TP
+.BR TIOCLINUX ", " subcode = 0
+Dump the screen.
+Disappeared in Linux 1.1.92.
+(With Linux 1.1.92 or later, read from
+.I /dev/vcsN
+or
+.I /dev/vcsaN
+instead.)
+.TP
+.BR TIOCLINUX ", " subcode = 1
+Get task information.
+Disappeared in Linux 1.1.92.
+.TP
+.BR TIOCLINUX ", " subcode = TIOCL_SETSEL
+Set selection.
+.I argp
+points to a
+.IP
+.in +4n
+.EX
+struct {
+ char subcode;
+ short xs, ys, xe, ye;
+ short sel_mode;
+};
+.EE
+.in
+.IP
+.I xs
+and
+.I ys
+are the starting column and row.
+.I xe
+and
+.I ye
+are the ending
+column and row.
+(Upper left corner is row=column=1.)
+.I sel_mode
+is 0 for character-by-character selection,
+1 for word-by-word selection,
+or 2 for line-by-line selection.
+The indicated screen characters are highlighted and saved
+in a kernel buffer.
+.TP
+.BR TIOCLINUX ", " subcode = TIOCL_PASTESEL
+Paste selection.
+The characters in the selection buffer are
+written to
+.IR fd .
+.TP
+.BR TIOCLINUX ", " subcode = TIOCL_UNBLANKSCREEN
+Unblank the screen.
+.TP
+.BR TIOCLINUX ", " subcode = TIOCL_SELLOADLUT
+Sets contents of a 256-bit look up table defining characters in a "word",
+for word-by-word selection.
+(Since Linux 1.1.32.)
+.TP
+.BR TIOCLINUX ", " subcode = TIOCL_GETSHIFTSTATE
+.I argp
+points to a char which is set to the value of the kernel
+variable
+.IR shift_state .
+(Since Linux 1.1.32.)
+.TP
+.BR TIOCLINUX ", " subcode = TIOCL_GETMOUSEREPORTING
+.I argp
+points to a char which is set to the value of the kernel
+variable
+.IR report_mouse .
+(Since Linux 1.1.33.)
+.TP
+.BR TIOCLINUX ", " subcode = 8
+Dump screen width and height, cursor position, and all the
+character-attribute pairs.
+(Linux 1.1.67 through Linux 1.1.91 only.
+With Linux 1.1.92 or later, read from
+.I /dev/vcsa*
+instead.)
+.TP
+.BR TIOCLINUX ", " subcode = 9
+Restore screen width and height, cursor position, and all the
+character-attribute pairs.
+(Linux 1.1.67 through Linux 1.1.91 only.
+With Linux 1.1.92 or later, write to
+.I /dev/vcsa*
+instead.)
+.TP
+.BR TIOCLINUX ", " subcode = TIOCL_SETVESABLANK
+Handles the Power Saving
+feature of the new generation of monitors.
+VESA screen blanking mode is set to
+.IR argp[1] ,
+which governs what
+screen blanking does:
+.RS
+.TP
+.B 0
+Screen blanking is disabled.
+.TP
+.B 1
+The current video adapter
+register settings are saved, then the controller is programmed to turn off
+the vertical synchronization pulses.
+This puts the monitor into "standby" mode.
+If your monitor has an Off_Mode timer, then
+it will eventually power down by itself.
+.TP
+.B 2
+The current settings are saved, then both the vertical and horizontal
+synchronization pulses are turned off.
+This puts the monitor into "off" mode.
+If your monitor has no Off_Mode timer,
+or if you want your monitor to power down immediately when the
+blank_timer times out, then you choose this option.
+.RI ( Caution:
+Powering down frequently will damage the monitor.)
+(Since Linux 1.1.76.)
+.RE
+.TP
+.BR TIOCLINUX ", " subcode = TIOCL_SETKMSGREDIRECT
+Change target of kernel messages ("console"):
+by default, and if this is set to
+.BR 0 ,
+messages are written to the currently active VT.
+The VT to write to is a single byte following
+.BR subcode .
+(Since Linux 2.5.36.)
+.TP
+.BR TIOCLINUX ", " subcode = TIOCL_GETFGCONSOLE
+Returns the number of VT currently in foreground.
+(Since Linux 2.5.36.)
+.TP
+.BR TIOCLINUX ", " subcode = TIOCL_SCROLLCONSOLE
+Scroll the foreground VT by the specified amount of
+.I lines
+down,
+or half the screen if
+.BR 0 .
+.I lines
+is *(((int32_t *)&subcode) + 1).
+(Since Linux 2.5.67.)
+.TP
+.BR TIOCLINUX ", " subcode = TIOCL_BLANKSCREEN
+Blank the foreground VT, ignoring "pokes" (typing):
+can only be unblanked explicitly (by switching VTs, to text mode, etc.).
+(Since Linux 2.5.71.)
+.TP
+.BR TIOCLINUX ", " subcode = TIOCL_BLANKEDSCREEN
+Returns the number of VT currently blanked,
+.B 0
+if none.
+(Since Linux 2.5.71.)
+.TP
+.BR TIOCLINUX ", " subcode = 16
+Never used.
+.TP
+.BR TIOCLINUX ", " subcode = TIOCL_GETKMSGREDIRECT
+Returns target of kernel messages.
+(Since Linux 2.6.17.)
+.SH RETURN VALUE
+On success, 0 is returned (except where indicated).
+On failure, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EBADF
+The file descriptor is invalid.
+.TP
+.B EINVAL
+The file descriptor or
+.I argp
+is invalid.
+.TP
+.B ENOTTY
+The file descriptor is not associated with a character special device,
+or the specified request does not apply to it.
+.TP
+.B EPERM
+Insufficient permission.
+.SH NOTES
+.BR Warning :
+Do not regard this man page as documentation of the Linux console ioctls.
+This is provided for the curious only, as an alternative to reading the
+source.
+Ioctls are undocumented Linux internals, liable to be changed
+without warning.
+(And indeed, this page more or less describes the
+situation as of kernel version 1.1.94;
+there are many minor and not-so-minor
+differences with earlier versions.)
+.PP
+Very often, ioctls are introduced for communication between the
+kernel and one particular well-known program (fdisk, hdparm, setserial,
+tunelp, loadkeys, selection, setfont, etc.), and their behavior will be
+changed when required by this particular program.
+.PP
+Programs using these ioctls will not be portable to other versions
+of UNIX, will not work on older versions of Linux, and will not work
+on future versions of Linux.
+.PP
+Use POSIX functions.
+.SH SEE ALSO
+.BR dumpkeys (1),
+.BR kbd_mode (1),
+.BR loadkeys (1),
+.BR mknod (1),
+.BR setleds (1),
+.BR setmetamode (1),
+.BR execve (2),
+.BR fcntl (2),
+.BR ioctl_tty (2),
+.BR ioperm (2),
+.BR termios (3),
+.BR console_codes (4),
+.BR mt (4),
+.BR sd (4),
+.BR tty (4),
+.BR ttyS (4),
+.BR vcs (4),
+.BR vcsa (4),
+.BR charsets (7),
+.BR mapscrn (8),
+.BR resizecons (8),
+.BR setfont (8)
+.PP
+.IR /usr/include/linux/kd.h ,
+.I /usr/include/linux/vt.h
diff --git a/man2/ioctl_fat.2 b/man2/ioctl_fat.2
new file mode 100644
index 0000000..67b5a2c
--- /dev/null
+++ b/man2/ioctl_fat.2
@@ -0,0 +1,489 @@
+.\" Copyright (C) 2014, Heinrich Schuchardt <xypron.glpk@gmx.de>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.TH ioctl_fat 2 2023-05-03 "Linux man-pages 6.05.01"
+.SH NAME
+ioctl_fat \- manipulating the FAT filesystem
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#include <linux/msdos_fs.h>" " /* Definition of [" V ] FAT_* " and"
+.BR " ATTR_*" " constants */"
+.B #include <sys/ioctl.h>
+.PP
+.BI "int ioctl(int " fd ", FAT_IOCTL_GET_ATTRIBUTES, uint32_t *" attr );
+.BI "int ioctl(int " fd ", FAT_IOCTL_SET_ATTRIBUTES, uint32_t *" attr );
+.BI "int ioctl(int " fd ", FAT_IOCTL_GET_VOLUME_ID, uint32_t *" id );
+.BI "int ioctl(int " fd ", VFAT_IOCTL_READDIR_BOTH,"
+.BI " struct __fat_dirent " entry [2]);
+.BI "int ioctl(int " fd ", VFAT_IOCTL_READDIR_SHORT,"
+.BI " struct __fat_dirent " entry [2]);
+.fi
+.SH DESCRIPTION
+The
+.BR ioctl (2)
+system call can be used to read and write metadata of FAT filesystems that
+are not accessible using other system calls.
+.SS Reading and setting file attributes
+Files and directories in the FAT filesystem possess an attribute bit mask that
+can be read with
+.B FAT_IOCTL_GET_ATTRIBUTES
+and written with
+.BR FAT_IOCTL_SET_ATTRIBUTES .
+.PP
+The
+.I fd
+argument contains a file descriptor for a file or directory.
+It is sufficient to create the file descriptor by calling
+.BR open (2)
+with the
+.B O_RDONLY
+flag.
+.PP
+The
+.I attr
+argument contains a pointer to a bit mask.
+The bits of the bit mask are:
+.TP
+.B ATTR_RO
+This bit specifies that the file or directory is read-only.
+.TP
+.B ATTR_HIDDEN
+This bit specifies that the file or directory is hidden.
+.TP
+.B ATTR_SYS
+This bit specifies that the file is a system file.
+.TP
+.B ATTR_VOLUME
+This bit specifies that the file is a volume label.
+This attribute is read-only.
+.TP
+.B ATTR_DIR
+This bit specifies that this is a directory.
+This attribute is read-only.
+.TP
+.B ATTR_ARCH
+This bit indicates that this file or directory should be archived.
+It is set when a file is created or modified.
+It is reset by an archiving system.
+.PP
+The zero value
+.B ATTR_NONE
+can be used to indicate that no attribute bit is set.
+.SS Reading the volume ID
+FAT filesystems are identified by a volume ID.
+The volume ID can be read with
+.BR FAT_IOCTL_GET_VOLUME_ID .
+.PP
+The
+.I fd
+argument can be a file descriptor for any file or directory of the
+filesystem.
+It is sufficient to create the file descriptor by calling
+.BR open (2)
+with the
+.B O_RDONLY
+flag.
+.PP
+The
+.I id
+argument is a pointer to the field that will be filled with the volume ID.
+Typically the volume ID is displayed to the user as a group of two
+16-bit fields:
+.PP
+.in +4n
+.EX
+printf("Volume ID %04x\-%04x\en", id >> 16, id & 0xFFFF);
+.EE
+.in
+.SS Reading short filenames of a directory
+A file or directory on a FAT filesystem always has a short filename
+consisting of up to 8 capital letters, optionally followed by a period
+and up to 3 capital letters for the file extension.
+If the actual filename does not fit into this scheme, it is stored
+as a long filename of up to 255 UTF-16 characters.
+.PP
+The short filenames in a directory can be read with
+.BR VFAT_IOCTL_READDIR_SHORT .
+.B VFAT_IOCTL_READDIR_BOTH
+reads both the short and the long filenames.
+.PP
+The
+.I fd
+argument must be a file descriptor for a directory.
+It is sufficient to create the file descriptor by calling
+.BR open (2)
+with the
+.B O_RDONLY
+flag.
+The file descriptor can be used only once to iterate over the directory
+entries by calling
+.BR ioctl (2)
+repeatedly.
+.PP
+The
+.I entry
+argument is a two-element array of the following structures:
+.PP
+.in +4n
+.EX
+struct __fat_dirent {
+ long d_ino;
+ __kernel_off_t d_off;
+ unsigned short d_reclen;
+ char d_name[256];
+};
+.EE
+.in
+.PP
+The first entry in the array is for the short filename.
+The second entry is for the long filename.
+.PP
+The
+.I d_ino
+and
+.I d_off
+fields are filled only for long filenames.
+The
+.I d_ino
+field holds the inode number of the directory.
+The
+.I d_off
+field holds the offset of the file entry in the directory.
+As these values are not available for short filenames, the user code should
+simply ignore them.
+.PP
+The field
+.I d_reclen
+contains the length of the filename in the field
+.IR d_name .
+To keep backward compatibility, a length of 0 for the short filename signals
+that the end of the directory has been reached.
+However, the preferred method for detecting the end of the directory
+is to test the
+.BR ioctl (2)
+return value.
+If no long filename exists, field
+.I d_reclen
+is set to 0 and
+.I d_name
+is a character string of length 0 for the long filename.
+.SH RETURN VALUE
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.PP
+For
+.B VFAT_IOCTL_READDIR_BOTH
+and
+.B VFAT_IOCTL_READDIR_SHORT
+a return value of 1 signals that a new directory entry has been read and
+a return value of 0 signals that the end of the directory has been reached.
+.SH ERRORS
+.TP
+.B ENOENT
+This error is returned by
+.B VFAT_IOCTL_READDIR_BOTH
+and
+.B VFAT_IOCTL_READDIR_SHORT
+if the file descriptor
+.I fd
+refers to a removed, but still open directory.
+.TP
+.B ENOTDIR
+This error is returned by
+.B VFAT_IOCTL_READDIR_BOTH
+and
+.B VFAT_IOCTL_READDIR_SHORT
+if the file descriptor
+.I fd
+does not refer to a directory.
+.TP
+.B ENOTTY
+The file descriptor
+.I fd
+does not refer to an object in a FAT filesystem.
+.PP
+For further error values, see
+.BR ioctl (2).
+.SH STANDARDS
+Linux.
+.SH HISTORY
+.TP
+.B VFAT_IOCTL_READDIR_BOTH
+.TQ
+.B VFAT_IOCTL_READDIR_SHORT
+Linux 2.0.
+.TP
+.B FAT_IOCTL_GET_ATTRIBUTES
+.TQ
+.B FAT_IOCTL_SET_ATTRIBUTES
+.\" just before we got Git history
+Linux 2.6.12.
+.TP
+.B FAT_IOCTL_GET_VOLUME_ID
+Linux 3.11.
+.\" commit 6e5b93ee55d401f1619092fb675b57c28c9ed7ec
+.SH EXAMPLES
+.SS Toggling the archive flag
+The following program demonstrates the usage of
+.BR ioctl (2)
+to manipulate file attributes.
+The program reads and displays the archive attribute of a file.
+After inverting the value of the attribute,
+the program reads and displays the attribute again.
+.PP
+The following was recorded when applying the program for the file
+.IR /mnt/user/foo :
+.PP
+.in +4n
+.EX
+# ./toggle_fat_archive_flag /mnt/user/foo
+Archive flag is set
+Toggling archive flag
+Archive flag is not set
+.EE
+.in
+.SS Program source (toggle_fat_archive_flag.c)
+\&
+.\" SRC BEGIN (toggle_fat_archive_flag.c)
+.EX
+#include <fcntl.h>
+#include <linux/msdos_fs.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/ioctl.h>
+#include <unistd.h>
+\&
+/*
+ * Read file attributes of a file on a FAT filesystem.
+ * Output the state of the archive flag.
+ */
+static uint32_t
+readattr(int fd)
+{
+ int ret;
+ uint32_t attr;
+\&
+ ret = ioctl(fd, FAT_IOCTL_GET_ATTRIBUTES, &attr);
+ if (ret == \-1) {
+ perror("ioctl");
+ exit(EXIT_FAILURE);
+ }
+\&
+ if (attr & ATTR_ARCH)
+ printf("Archive flag is set\en");
+ else
+ printf("Archive flag is not set\en");
+\&
+ return attr;
+}
+\&
+int
+main(int argc, char *argv[])
+{
+ int fd;
+ int ret;
+ uint32_t attr;
+\&
+ if (argc != 2) {
+ printf("Usage: %s FILENAME\en", argv[0]);
+ exit(EXIT_FAILURE);
+ }
+\&
+ fd = open(argv[1], O_RDONLY);
+ if (fd == \-1) {
+ perror("open");
+ exit(EXIT_FAILURE);
+ }
+\&
+ /*
+ * Read and display the FAT file attributes.
+ */
+ attr = readattr(fd);
+\&
+ /*
+ * Invert archive attribute.
+ */
+ printf("Toggling archive flag\en");
+ attr \[ha]= ATTR_ARCH;
+\&
+ /*
+ * Write the changed FAT file attributes.
+ */
+ ret = ioctl(fd, FAT_IOCTL_SET_ATTRIBUTES, &attr);
+ if (ret == \-1) {
+ perror("ioctl");
+ exit(EXIT_FAILURE);
+ }
+\&
+ /*
+ * Read and display the FAT file attributes.
+ */
+ readattr(fd);
+\&
+ close(fd);
+\&
+ exit(EXIT_SUCCESS);
+}
+.EE
+.\" SRC END
+.SS Reading the volume ID
+The following program demonstrates the use of
+.BR ioctl (2)
+to display the volume ID of a FAT filesystem.
+.PP
+The following output was recorded when applying the program for
+directory
+.IR /mnt/user :
+.PP
+.in +4n
+.EX
+$ ./display_fat_volume_id /mnt/user
+Volume ID 6443\-6241
+.EE
+.in
+.SS Program source (display_fat_volume_id.c)
+\&
+.\" SRC BEGIN (display_fat_volume_id.c)
+.EX
+#include <fcntl.h>
+#include <linux/msdos_fs.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/ioctl.h>
+#include <unistd.h>
+\&
+int
+main(int argc, char *argv[])
+{
+ int fd;
+ int ret;
+ uint32_t id;
+\&
+ if (argc != 2) {
+ printf("Usage: %s FILENAME\en", argv[0]);
+ exit(EXIT_FAILURE);
+ }
+\&
+ fd = open(argv[1], O_RDONLY);
+ if (fd == \-1) {
+ perror("open");
+ exit(EXIT_FAILURE);
+ }
+\&
+ /*
+ * Read volume ID.
+ */
+ ret = ioctl(fd, FAT_IOCTL_GET_VOLUME_ID, &id);
+ if (ret == \-1) {
+ perror("ioctl");
+ exit(EXIT_FAILURE);
+ }
+\&
+ /*
+ * Format the output as two groups of 16 bits each.
+ */
+ printf("Volume ID %04x\-%04x\en", id >> 16, id & 0xFFFF);
+\&
+ close(fd);
+\&
+ exit(EXIT_SUCCESS);
+}
+.EE
+.\" SRC END
+.SS Listing a directory
+The following program demonstrates the use of
+.BR ioctl (2)
+to list a directory.
+.PP
+The following was recorded when applying the program to the directory
+.IR /mnt/user :
+.PP
+.in +4n
+.EX
+$ \fB./fat_dir /mnt/user\fP
+\&. \-> \[aq]\[aq]
+\&.. \-> \[aq]\[aq]
+ALONGF\[ti]1.TXT \-> \[aq]a long filename.txt\[aq]
+UPPER.TXT \-> \[aq]\[aq]
+LOWER.TXT \-> \[aq]lower.txt\[aq]
+.EE
+.in
+.\"
+.SS Program source
+.in +4n
+.\" SRC BEGIN (ioctl_fat.c)
+.EX
+#include <fcntl.h>
+#include <linux/msdos_fs.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/ioctl.h>
+#include <unistd.h>
+\&
+int
+main(int argc, char *argv[])
+{
+ int fd;
+ int ret;
+ struct __fat_dirent entry[2];
+\&
+ if (argc != 2) {
+ printf("Usage: %s DIRECTORY\en", argv[0]);
+ exit(EXIT_FAILURE);
+ }
+\&
+ /*
+ * Open file descriptor for the directory.
+ */
+ fd = open(argv[1], O_RDONLY | O_DIRECTORY);
+ if (fd == \-1) {
+ perror("open");
+ exit(EXIT_FAILURE);
+ }
+\&
+ for (;;) {
+\&
+ /*
+ * Read next directory entry.
+ */
+ ret = ioctl(fd, VFAT_IOCTL_READDIR_BOTH, entry);
+\&
+ /*
+ * If an error occurs, the return value is \-1.
+ * If the end of the directory list has been reached,
+ * the return value is 0.
+ * For backward compatibility the end of the directory
+ * list is also signaled by d_reclen == 0.
+ */
+ if (ret < 1)
+ break;
+\&
+ /*
+ * Write both the short name and the long name.
+ */
+ printf("%s \-> \[aq]%s\[aq]\en", entry[0].d_name, entry[1].d_name);
+ }
+\&
+ if (ret == \-1) {
+ perror("VFAT_IOCTL_READDIR_BOTH");
+ exit(EXIT_FAILURE);
+ }
+\&
+ /*
+ * Close the file descriptor.
+ */
+ close(fd);
+\&
+ exit(EXIT_SUCCESS);
+}
+.EE
+.\" SRC END
+.in
+.SH SEE ALSO
+.BR ioctl (2)
diff --git a/man2/ioctl_ficlone.2 b/man2/ioctl_ficlone.2
new file mode 100644
index 0000000..19bb348
--- /dev/null
+++ b/man2/ioctl_ficlone.2
@@ -0,0 +1 @@
+.so man2/ioctl_ficlonerange.2
diff --git a/man2/ioctl_ficlonerange.2 b/man2/ioctl_ficlonerange.2
new file mode 100644
index 0000000..68cfc67
--- /dev/null
+++ b/man2/ioctl_ficlonerange.2
@@ -0,0 +1,129 @@
+.\" Copyright (c) 2016, Oracle. All rights reserved.
+.\"
+.\" SPDX-License-Identifier: GPL-2.0-or-later
+.TH ioctl_ficlonerange 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+ioctl_ficlonerange, ioctl_ficlone \-
+share some of the data of one file with another file
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#include <linux/fs.h>" " /* Definition of " FICLONE* " constants */"
+.B #include <sys/ioctl.h>
+.PP
+.BI "int ioctl(int " dest_fd ", FICLONERANGE, struct file_clone_range *" arg );
+.BI "int ioctl(int " dest_fd ", FICLONE, int " src_fd );
+.fi
+.SH DESCRIPTION
+If a filesystem supports files sharing physical storage between multiple
+files ("reflink"), this
+.BR ioctl (2)
+operation can be used to make some of the data in the
+.I src_fd
+file appear in the
+.I dest_fd
+file by sharing the underlying storage, which is faster than making a separate
+physical copy of the data.
+Both files must reside within the same filesystem.
+If a file write should occur to a shared region,
+the filesystem must ensure that the changes remain private to the file being
+written.
+This behavior is commonly referred to as "copy on write".
+.PP
+This ioctl reflinks up to
+.I src_length
+bytes from file descriptor
+.I src_fd
+at offset
+.I src_offset
+into the file
+.I dest_fd
+at offset
+.IR dest_offset ,
+provided that both are files.
+If
+.I src_length
+is zero, the ioctl reflinks to the end of the source file.
+This information is conveyed in a structure of
+the following form:
+.PP
+.in +4n
+.EX
+struct file_clone_range {
+ __s64 src_fd;
+ __u64 src_offset;
+ __u64 src_length;
+ __u64 dest_offset;
+};
+.EE
+.in
+.PP
+Clones are atomic with regards to concurrent writes, so no locks need to be
+taken to obtain a consistent cloned copy.
+.PP
+The
+.B FICLONE
+ioctl clones entire files.
+.SH RETURN VALUE
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+Error codes can be one of, but are not limited to, the following:
+.TP
+.B EBADF
+.I src_fd
+is not open for reading;
+.I dest_fd
+is not open for writing or is open for append-only writes;
+or the filesystem which
+.I src_fd
+resides on does not support reflink.
+.TP
+.B EINVAL
+The filesystem does not support reflinking the ranges of the given files.
+This error can also appear if either file descriptor represents
+a device, FIFO, or socket.
+Disk filesystems generally require the offset and length arguments
+to be aligned to the fundamental block size.
+XFS and Btrfs do not support
+overlapping reflink ranges in the same file.
+.TP
+.B EISDIR
+One of the files is a directory and the filesystem does not support shared
+regions in directories.
+.TP
+.B EOPNOTSUPP
+This can appear if the filesystem does not support reflinking either file
+descriptor, or if either file descriptor refers to special inodes.
+.TP
+.B EPERM
+.I dest_fd
+is immutable.
+.TP
+.B ETXTBSY
+One of the files is a swap file.
+Swap files cannot share storage.
+.TP
+.B EXDEV
+.IR dest_fd " and " src_fd
+are not on the same mounted filesystem.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 4.5.
+.PP
+They were previously known as
+.B BTRFS_IOC_CLONE
+and
+.BR BTRFS_IOC_CLONE_RANGE ,
+and were private to Btrfs.
+.SH NOTES
+Because a copy-on-write operation requires the allocation of new storage, the
+.BR fallocate (2)
+operation may unshare shared blocks to guarantee that subsequent writes will
+not fail because of lack of disk space.
+.SH SEE ALSO
+.BR ioctl (2)
diff --git a/man2/ioctl_fideduperange.2 b/man2/ioctl_fideduperange.2
new file mode 100644
index 0000000..5388c5d
--- /dev/null
+++ b/man2/ioctl_fideduperange.2
@@ -0,0 +1,200 @@
+.\" Copyright (c) 2016, Oracle. All rights reserved.
+.\"
+.\" SPDX-License-Identifier: GPL-2.0-or-later
+.TH ioctl_fideduperange 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+ioctl_fideduperange \- share some of the data of one file with another file
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#include <linux/fs.h>" " /* Definition of " FIDEDUPERANGE " and"
+.BR " FILE_DEDUPE_*" " constants */"
+.B #include <sys/ioctl.h>
+.PP
+.BI "int ioctl(int " src_fd ", FIDEDUPERANGE, struct file_dedupe_range *" arg );
+.fi
+.SH DESCRIPTION
+If a filesystem supports files sharing physical storage between multiple
+files, this
+.BR ioctl (2)
+operation can be used to make some of the data in the
+.I src_fd
+file appear in the
+.I dest_fd
+file by sharing the underlying storage if the file data is identical
+("deduplication").
+Both files must reside within the same filesystem.
+This reduces storage consumption by allowing the filesystem
+to store one shared copy of the data.
+If a file write should occur to a shared
+region, the filesystem must ensure that the changes remain private to the file
+being written.
+This behavior is commonly referred to as "copy on write".
+.PP
+This ioctl performs the "compare and share if identical" operation on up to
+.I src_length
+bytes from file descriptor
+.I src_fd
+at offset
+.IR src_offset .
+This information is conveyed in a structure of the following form:
+.PP
+.in +4n
+.EX
+struct file_dedupe_range {
+ __u64 src_offset;
+ __u64 src_length;
+ __u16 dest_count;
+ __u16 reserved1;
+ __u32 reserved2;
+ struct file_dedupe_range_info info[0];
+};
+.EE
+.in
+.PP
+Deduplication is atomic with regards to concurrent writes, so no locks need to
+be taken to obtain a consistent deduplicated copy.
+.PP
+The fields
+.IR reserved1 " and " reserved2
+must be zero.
+.PP
+Destinations for the deduplication operation are conveyed in the array at the
+end of the structure.
+The number of destinations is given in
+.IR dest_count ,
+and the destination information is conveyed in the following form:
+.PP
+.in +4n
+.EX
+struct file_dedupe_range_info {
+ __s64 dest_fd;
+ __u64 dest_offset;
+ __u64 bytes_deduped;
+ __s32 status;
+ __u32 reserved;
+};
+.EE
+.in
+.PP
+Each deduplication operation targets
+.I src_length
+bytes in file descriptor
+.I dest_fd
+at offset
+.IR dest_offset .
+The field
+.I reserved
+must be zero.
+During the call,
+.I src_fd
+must be open for reading and
+.I dest_fd
+must be open for writing.
+The combined size of the struct
+.I file_dedupe_range
+and the struct
+.I file_dedupe_range_info
+array must not exceed the system page size.
+The maximum size of
+.I src_length
+is filesystem dependent and is typically 16\~MiB.
+This limit will be enforced silently by the filesystem.
+By convention, the storage used by
+.I src_fd
+is mapped into
+.I dest_fd
+and the previous contents in
+.I dest_fd
+are freed.
+.PP
+Upon successful completion of this ioctl, the number of bytes successfully
+deduplicated is returned in
+.I bytes_deduped
+and a status code for the deduplication operation is returned in
+.IR status .
+If even a single byte in the range does not match, the deduplication
+request will be ignored and
+.I status
+set to
+.BR FILE_DEDUPE_RANGE_DIFFERS .
+The
+.I status
+code is set to
+.B FILE_DEDUPE_RANGE_SAME
+for success, a negative error code in case of error, or
+.B FILE_DEDUPE_RANGE_DIFFERS
+if the data did not match.
+.SH RETURN VALUE
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+Possible errors include (but are not limited to) the following:
+.TP
+.B EBADF
+.I src_fd
+is not open for reading;
+.I dest_fd
+is not open for writing or is open for append-only writes; or the filesystem
+which
+.I src_fd
+resides on does not support deduplication.
+.TP
+.B EINVAL
+The filesystem does not support deduplicating the ranges of the given files.
+This error can also appear if either file descriptor represents
+a device, FIFO, or socket.
+Disk filesystems generally require the offset and length arguments
+to be aligned to the fundamental block size.
+Neither Btrfs nor XFS supports
+overlapping deduplication ranges in the same file.
+.TP
+.B EISDIR
+One of the files is a directory and the filesystem does not support shared
+regions in directories.
+.TP
+.B ENOMEM
+The kernel was unable to allocate sufficient memory to perform the
+operation or
+.I dest_count
+is so large that the input argument description spans more than a single
+page of memory.
+.TP
+.B EOPNOTSUPP
+This can appear if the filesystem does not support deduplicating either file
+descriptor, or if either file descriptor refers to special inodes.
+.TP
+.B EPERM
+.I dest_fd
+is immutable.
+.TP
+.B ETXTBSY
+One of the files is a swap file.
+Swap files cannot share storage.
+.TP
+.B EXDEV
+.I dest_fd
+and
+.I src_fd
+are not on the same mounted filesystem.
+.SH VERSIONS
+Some filesystems may limit the amount of data that can be deduplicated in a
+single call.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 4.5.
+.PP
+It was previously known as
+.B BTRFS_IOC_FILE_EXTENT_SAME
+and was private to Btrfs.
+.SH NOTES
+Because a copy-on-write operation requires the allocation of new storage, the
+.BR fallocate (2)
+operation may unshare shared blocks to guarantee that subsequent writes will
+not fail because of lack of disk space.
+.SH SEE ALSO
+.BR ioctl (2)
diff --git a/man2/ioctl_fslabel.2 b/man2/ioctl_fslabel.2
new file mode 100644
index 0000000..885a43c
--- /dev/null
+++ b/man2/ioctl_fslabel.2
@@ -0,0 +1,72 @@
+.\" Copyright (c) 2018, Red Hat, Inc. All rights reserved.
+.\"
+.\" SPDX-License-Identifier: GPL-2.0-or-later
+.TH ioctl_fslabel 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+ioctl_fslabel \- get or set a filesystem label
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#include <linux/fs.h>" " /* Definition of " *FSLABEL* " constants */"
+.B #include <sys/ioctl.h>
+.PP
+.BI "int ioctl(int " fd ", FS_IOC_GETFSLABEL, char " label [FSLABEL_MAX]);
+.BI "int ioctl(int " fd ", FS_IOC_SETFSLABEL, char " label [FSLABEL_MAX]);
+.fi
+.SH DESCRIPTION
+If a filesystem supports online label manipulation, these
+.BR ioctl (2)
+operations can be used to get or set the filesystem label for the filesystem
+on which
+.I fd
+resides.
+The
+.B FS_IOC_SETFSLABEL
+operation requires privilege
+.RB ( CAP_SYS_ADMIN ).
+.SH RETURN VALUE
+On success zero is returned.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+Possible errors include (but are not limited to) the following:
+.TP
+.B EFAULT
+.I label
+references an inaccessible memory area.
+.TP
+.B EINVAL
+The specified label exceeds the maximum label length for the filesystem.
+.TP
+.B ENOTTY
+This can appear if the filesystem does not support online label manipulation.
+.TP
+.B EPERM
+The calling process does not have sufficient permissions to set the label.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 4.18.
+.PP
+They were previously known as
+.B BTRFS_IOC_GET_FSLABEL
+and
+.B BTRFS_IOC_SET_FSLABEL
+and were private to Btrfs.
+.SH NOTES
+The maximum string length for this interface is
+.BR FSLABEL_MAX ,
+including the terminating null byte (\[aq]\\0\[aq]).
+Filesystems have differing maximum label lengths, which may or
+may not include the terminating null.
+The string provided to
+.B FS_IOC_SETFSLABEL
+must always be null-terminated, and the string returned by
+.B FS_IOC_GETFSLABEL
+will always be null-terminated.
+.SH SEE ALSO
+.BR ioctl (2),
+.BR blkid (8)
diff --git a/man2/ioctl_getfsmap.2 b/man2/ioctl_getfsmap.2
new file mode 100644
index 0000000..e80c1d9
--- /dev/null
+++ b/man2/ioctl_getfsmap.2
@@ -0,0 +1,351 @@
+.\" Copyright (c) 2017, Oracle. All rights reserved.
+.\"
+.\" SPDX-License-Identifier: GPL-2.0-or-later
+.TH ioctl_getfsmap 2 2023-05-03 "Linux man-pages 6.05.01"
+.SH NAME
+ioctl_getfsmap \- retrieve the physical layout of the filesystem
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#include <linux/fsmap.h> " "/* Definition of " FS_IOC_GETFSMAP ,
+.BR " FM?_OF_*" ", and " *FMR_OWN_* " constants */"
+.B #include <sys/ioctl.h>
+.PP
+.BI "int ioctl(int " fd ", FS_IOC_GETFSMAP, struct fsmap_head * " arg );
+.fi
+.SH DESCRIPTION
+This
+.BR ioctl (2)
+operation retrieves physical extent mappings for a filesystem.
+This information can be used to discover which files are mapped to a physical
+block, examine free space, or find known bad blocks, among other things.
+.PP
+The sole argument to this operation should be a pointer to a single
+.IR "struct fsmap_head" ":"
+.PP
+.in +4n
+.EX
+struct fsmap {
+ __u32 fmr_device; /* Device ID */
+ __u32 fmr_flags; /* Mapping flags */
+ __u64 fmr_physical; /* Device offset of segment */
+ __u64 fmr_owner; /* Owner ID */
+ __u64 fmr_offset; /* File offset of segment */
+ __u64 fmr_length; /* Length of segment */
+ __u64 fmr_reserved[3]; /* Must be zero */
+};
+\&
+struct fsmap_head {
+ __u32 fmh_iflags; /* Control flags */
+ __u32 fmh_oflags; /* Output flags */
+ __u32 fmh_count; /* # of entries in array incl. input */
+ __u32 fmh_entries; /* # of entries filled in (output) */
+ __u64 fmh_reserved[6]; /* Must be zero */
+\&
+ struct fsmap fmh_keys[2]; /* Low and high keys for
+ the mapping search */
+ struct fsmap fmh_recs[]; /* Returned records */
+};
+.EE
+.in
+.PP
+The two
+.I fmh_keys
+array elements specify the lowest and highest reverse-mapping
+key for which the application would like physical mapping
+information.
+A reverse mapping key consists of the tuple (device, block, owner, offset).
+The owner and offset fields are part of the key because some filesystems
+support sharing physical blocks between multiple files and
+therefore may return multiple mappings for a given physical block.
+.PP
+Filesystem mappings are copied into the
+.I fmh_recs
+array, which immediately follows the header data.
+.\"
+.SS Fields of struct fsmap_head
+The
+.I fmh_iflags
+field is a bit mask passed to the kernel to alter the output.
+No flags are currently defined, so the caller must set this value to zero.
+.PP
+The
+.I fmh_oflags
+field is a bit mask of flags set by the kernel concerning the returned mappings.
+If
+.B FMH_OF_DEV_T
+is set, then the
+.I fmr_device
+field represents a
+.I dev_t
+structure containing the major and minor numbers of the block device.
+.PP
+The
+.I fmh_count
+field contains the number of elements in the array being passed to the
+kernel.
+If this value is 0,
+.I fmh_entries
+will be set to the number of records that would have been returned had
+the array been large enough;
+no mapping information will be returned.
+.PP
+The
+.I fmh_entries
+field contains the number of elements in the
+.I fmh_recs
+array that contain useful information.
+.PP
+The
+.I fmh_reserved
+fields must be set to zero.
+.\"
+.SS Keys
+The two key records in
+.I fsmap_head.fmh_keys
+specify the lowest and highest extent records in the keyspace that the caller
+wants returned.
+A filesystem that can share blocks between files likely requires the tuple
+.RI "(" "device" ", " "physical" ", " "owner" ", " "offset" ", " "flags" ")"
+to uniquely index any filesystem mapping record.
+Classic non-sharing filesystems might be able to identify any record with only
+.RI "(" "device" ", " "physical" ", " "flags" ")."
+For example, if the low key is set to (8:0, 36864, 0, 0, 0), the filesystem will
+only return records for extents starting at or above 36\ KiB on disk.
+If the high key is set to (8:0, 1048576, 0, 0, 0),
+only records below 1\ MiB will be returned.
+The format of
+.I fmr_device
+in the keys must match the format of the same field in the output records,
+as defined below.
+By convention, the field
+.I fsmap_head.fmh_keys[0]
+must contain the low key and
+.I fsmap_head.fmh_keys[1]
+must contain the high key for the request.
+.PP
+For convenience, if
+.I fmr_length
+is set in the low key, it will be added to
+.IR fmr_physical " or " fmr_offset
+as appropriate.
+The caller can take advantage of this subtlety to set up subsequent calls
+by copying
+.I fsmap_head.fmh_recs[fsmap_head.fmh_entries \- 1]
+into the low key.
+The function
+.I fsmap_advance
+(defined in
+.IR linux/fsmap.h )
+provides this functionality.
+.\"
+.SS Fields of struct fsmap
+The
+.I fmr_device
+field uniquely identifies the underlying storage device.
+If the
+.B FMH_OF_DEV_T
+flag is set in the header's
+.I fmh_oflags
+field, this field contains a
+.I dev_t
+from which major and minor numbers can be extracted.
+If the flag is not set, this field contains a value that must be unique
+for each unique storage device.
+.PP
+The
+.I fmr_physical
+field contains the disk address of the extent in bytes.
+.PP
+The
+.I fmr_owner
+field contains the owner of the extent.
+This is an inode number unless
+.B FMR_OF_SPECIAL_OWNER
+is set in the
+.I fmr_flags
+field, in which case the value is determined by the filesystem.
+See the section below about owner values for more details.
+.PP
+The
+.I fmr_offset
+field contains the logical address in the mapping record in bytes.
+This field has no meaning if the
+.BR FMR_OF_SPECIAL_OWNER " or " FMR_OF_EXTENT_MAP
+flags are set in
+.IR fmr_flags "."
+.PP
+The
+.I fmr_length
+field contains the length of the extent in bytes.
+.PP
+The
+.I fmr_flags
+field is a bit mask of extent state flags.
+The bits are:
+.RS 0.4i
+.TP
+.B FMR_OF_PREALLOC
+The extent is allocated but not yet written.
+.TP
+.B FMR_OF_ATTR_FORK
+This extent contains extended attribute data.
+.TP
+.B FMR_OF_EXTENT_MAP
+This extent contains extent map information for the owner.
+.TP
+.B FMR_OF_SHARED
+Parts of this extent may be shared.
+.TP
+.B FMR_OF_SPECIAL_OWNER
+The
+.I fmr_owner
+field contains a special value instead of an inode number.
+.TP
+.B FMR_OF_LAST
+This is the last record in the data set.
+.RE
+.PP
+The
+.I fmr_reserved
+field will be set to zero.
+.\"
+.SS Owner values
+Generally, the value of the
+.I fmr_owner
+field for non-metadata extents should be an inode number.
+However, filesystems are under no obligation to report inode numbers;
+they may instead report
+.B FMR_OWN_UNKNOWN
+if the inode number cannot easily be retrieved, if the caller lacks
+sufficient privilege, if the filesystem does not support stable
+inode numbers, or for any other reason.
+If a filesystem wishes to condition the reporting of inode numbers based
+on process capabilities, it is strongly urged that the
+.B CAP_SYS_ADMIN
+capability be used for this purpose.
+.PP
+The following special owner values are generic to all filesystems:
+.RS 0.4i
+.TP
+.B FMR_OWN_FREE
+Free space.
+.TP
+.B FMR_OWN_UNKNOWN
+This extent is in use but its owner is not known or not easily retrieved.
+.TP
+.B FMR_OWN_METADATA
+This extent is filesystem metadata.
+.RE
+.PP
+XFS can return the following special owner values:
+.RS 0.4i
+.TP
+.B XFS_FMR_OWN_FREE
+Free space.
+.TP
+.B XFS_FMR_OWN_UNKNOWN
+This extent is in use but its owner is not known or not easily retrieved.
+.TP
+.B XFS_FMR_OWN_FS
+Static filesystem metadata which exists at a fixed address.
+These are the AG superblock, the AGF, the AGFL, and the AGI headers.
+.TP
+.B XFS_FMR_OWN_LOG
+The filesystem journal.
+.TP
+.B XFS_FMR_OWN_AG
+Allocation group metadata, such as the free space btrees and the
+reverse mapping btrees.
+.TP
+.B XFS_FMR_OWN_INOBT
+The inode and free inode btrees.
+.TP
+.B XFS_FMR_OWN_INODES
+Inode records.
+.TP
+.B XFS_FMR_OWN_REFC
+Reference count information.
+.TP
+.B XFS_FMR_OWN_COW
+This extent is being used to stage a copy-on-write.
+.TP
+.B XFS_FMR_OWN_DEFECTIVE
+This extent has been marked defective either by the filesystem or the
+underlying device.
+.RE
+.PP
+ext4 can return the following special owner values:
+.RS 0.4i
+.TP
+.B EXT4_FMR_OWN_FREE
+Free space.
+.TP
+.B EXT4_FMR_OWN_UNKNOWN
+This extent is in use but its owner is not known or not easily retrieved.
+.TP
+.B EXT4_FMR_OWN_FS
+Static filesystem metadata which exists at a fixed address.
+This is the superblock and the group descriptors.
+.TP
+.B EXT4_FMR_OWN_LOG
+The filesystem journal.
+.TP
+.B EXT4_FMR_OWN_INODES
+Inode records.
+.TP
+.B EXT4_FMR_OWN_BLKBM
+Block bit map.
+.TP
+.B EXT4_FMR_OWN_INOBM
+Inode bit map.
+.RE
+.SH RETURN VALUE
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+The error placed in
+.I errno
+can be one of, but is not limited to, the following:
+.TP
+.B EBADF
+.I fd
+is not open for reading.
+.TP
+.B EBADMSG
+The filesystem has detected a checksum error in the metadata.
+.TP
+.B EFAULT
+The pointer passed in was not mapped to a valid memory address.
+.TP
+.B EINVAL
+The array is not long enough, the keys do not point to a valid part of
+the filesystem, the low key points to a higher point in the filesystem's
+physical storage address space than the high key, or a nonzero value
+was passed in one of the fields that must be zero.
+.TP
+.B ENOMEM
+Insufficient memory to process the request.
+.TP
+.B EOPNOTSUPP
+The filesystem does not support this command.
+.TP
+.B EUCLEAN
+The filesystem metadata is corrupt and needs repair.
+.SH STANDARDS
+Linux.
+.PP
+Not all filesystems support it.
+.SH HISTORY
+Linux 4.12.
+.SH EXAMPLES
+See
+.I io/fsmap.c
+in the
+.I xfsprogs
+distribution for a sample program.
+.SH SEE ALSO
+.BR ioctl (2)
diff --git a/man2/ioctl_iflags.2 b/man2/ioctl_iflags.2
new file mode 100644
index 0000000..d2c3300
--- /dev/null
+++ b/man2/ioctl_iflags.2
@@ -0,0 +1,202 @@
+.\" Copyright (c) 2017 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\"
+.TH ioctl_iflags 2 2023-05-03 "Linux man-pages 6.05.01"
+.SH NAME
+ioctl_iflags \- ioctl() operations for inode flags
+.SH DESCRIPTION
+Various Linux filesystems support the notion of
+.IR "inode flags" \[em]attributes
+that modify the semantics of files and directories.
+These flags can be retrieved and modified using two
+.BR ioctl (2)
+operations:
+.PP
+.in +4n
+.EX
+int attr;
+fd = open("pathname", ...);
+\&
+ioctl(fd, FS_IOC_GETFLAGS, &attr); /* Place current flags
+ in \[aq]attr\[aq] */
+attr |= FS_NOATIME_FL; /* Tweak returned bit mask */
+ioctl(fd, FS_IOC_SETFLAGS, &attr); /* Update flags for inode
+ referred to by \[aq]fd\[aq] */
+.EE
+.in
+.PP
+The
+.BR lsattr (1)
+and
+.BR chattr (1)
+shell commands provide interfaces to these two operations,
+allowing a user to view and modify the inode flags associated with a file.
+.PP
+The following flags are supported
+(shown along with the corresponding letter used to indicate the flag by
+.BR lsattr (1)
+and
+.BR chattr (1)):
+.TP
+.BR FS_APPEND_FL " \[aq]a\[aq]"
+The file can be opened only with the
+.B O_APPEND
+flag.
+(This restriction applies even to the superuser.)
+Only a privileged process
+.RB ( CAP_LINUX_IMMUTABLE )
+can set or clear this attribute.
+.TP
+.BR FS_COMPR_FL " \[aq]c\[aq]"
+Store the file in a compressed format on disk.
+This flag is
+.I not
+supported by most of the mainstream filesystem implementations;
+one exception is
+.BR btrfs (5).
+.TP
+.BR FS_DIRSYNC_FL " \[aq]D\[aq] (since Linux 2.6.0)"
+Write directory changes synchronously to disk.
+This flag provides semantics equivalent to the
+.BR mount (2)
+.B MS_DIRSYNC
+option, but on a per-directory basis.
+This flag can be applied only to directories.
+.\" .TP
+.\" .BR FS_EXTENT_FL " \[aq]e\[aq]"
+.\" FIXME Some support on ext4? (EXT4_EXTENTS_FL)
+.TP
+.BR FS_IMMUTABLE_FL " \[aq]i\[aq]"
+The file is immutable:
+no changes are permitted to the file contents or metadata
+(permissions, timestamps, ownership, link count, and so on).
+(This restriction applies even to the superuser.)
+Only a privileged process
+.RB ( CAP_LINUX_IMMUTABLE )
+can set or clear this attribute.
+.TP
+.BR FS_JOURNAL_DATA_FL " \[aq]j\[aq]"
+Enable journaling of file data on
+.BR ext3 (5)
+and
+.BR ext4 (5)
+filesystems.
+On a filesystem that is journaling in
+.I ordered
+or
+.I writeback
+mode, a privileged
+.RB ( CAP_SYS_RESOURCE )
+process can set this flag to enable journaling of data updates on
+a per-file basis.
+.TP
+.BR FS_NOATIME_FL " \[aq]A\[aq]"
+Don't update the file last access time when the file is accessed.
+This can provide I/O performance benefits for applications that do not care
+about the accuracy of this timestamp.
+This flag provides functionality similar to the
+.BR mount (2)
+.B MS_NOATIME
+flag, but on a per-file basis.
+.\" .TP
+.\" .BR FS_NOCOMP_FL " \[aq]\[aq]"
+.\" FIXME Support for FS_NOCOMP_FL on Btrfs?
+.TP
+.BR FS_NOCOW_FL " \[aq]C\[aq] (since Linux 2.6.39)"
+The file will not be subject to copy-on-write updates.
+This flag has an effect only on filesystems that support copy-on-write
+semantics, such as Btrfs.
+See
+.BR chattr (1)
+and
+.BR btrfs (5).
+.TP
+.BR FS_NODUMP_FL " \[aq]d\[aq]"
+Don't include this file in backups made using
+.BR dump (8).
+.TP
+.BR FS_NOTAIL_FL " \[aq]t\[aq]"
+This flag is supported only on Reiserfs.
+It disables the Reiserfs tail-packing feature,
+which tries to pack small files (and the final fragment of larger files)
+into the same disk block as the file metadata.
+.TP
+.BR FS_PROJINHERIT_FL " \[aq]P\[aq] (since Linux 4.5)"
+.\" commit 040cb3786d9b25293b8b0b05b90da0f871e1eb9b
+.\" Flag name was added in Linux 4.4
+.\" FIXME Not currently supported because not in FS_FL_USER_MODIFIABLE?
+Inherit the quota project ID.
+Files and subdirectories will inherit the project ID of the directory.
+This flag can be applied only to directories.
+.TP
+.BR FS_SECRM_FL " \[aq]s\[aq]"
+Mark the file for secure deletion.
+This feature is not implemented by any filesystem,
+since the task of securely erasing a file from a recording medium
+is surprisingly difficult.
+.TP
+.BR FS_SYNC_FL " \[aq]S\[aq]"
+Make file updates synchronous.
+For files, this makes all writes synchronous
+(as though all opens of the file were with the
+.B O_SYNC
+flag).
+For directories, this has the same effect as the
+.B FS_DIRSYNC_FL
+flag.
+.TP
+.BR FS_TOPDIR_FL " \[aq]T\[aq]"
+Mark a directory for special treatment under the Orlov block-allocation
+strategy.
+See
+.BR chattr (1)
+for details.
+This flag can be applied only to directories and
+has an effect only for ext2, ext3, and ext4.
+.TP
+.BR FS_UNRM_FL " \[aq]u\[aq]"
+Allow the file to be undeleted if it is deleted.
+This feature is not implemented by any filesystem,
+since it is possible to implement file-recovery mechanisms outside the kernel.
+.PP
+In most cases,
+when any of the above flags is set on a directory,
+the flag is inherited by files and subdirectories
+created inside that directory.
+Exceptions include
+.BR FS_TOPDIR_FL ,
+which is not inheritable, and
+.BR FS_DIRSYNC_FL ,
+which is inherited only by subdirectories.
+.SH STANDARDS
+Linux.
+.SH NOTES
+In order to change the inode flags of a file using the
+.B FS_IOC_SETFLAGS
+operation,
+the effective user ID of the caller must match the owner of the file,
+or the caller must have the
+.B CAP_FOWNER
+capability.
+.PP
+The type of the argument given to the
+.B FS_IOC_GETFLAGS
+and
+.B FS_IOC_SETFLAGS
+operations is
+.IR int\~* ,
+notwithstanding the implication in the kernel source file
+.I include/uapi/linux/fs.h
+that the argument is
+.IR long\~* .
+.SH SEE ALSO
+.BR chattr (1),
+.BR lsattr (1),
+.BR mount (2),
+.BR btrfs (5),
+.BR ext4 (5),
+.BR xfs (5),
+.BR xattr (7),
+.BR mount (8)
diff --git a/man2/ioctl_ns.2 b/man2/ioctl_ns.2
new file mode 100644
index 0000000..a11e54b
--- /dev/null
+++ b/man2/ioctl_ns.2
@@ -0,0 +1,342 @@
+.\" Copyright (c) 2017 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\"
+.TH ioctl_ns 2 2023-05-03 "Linux man-pages 6.05.01"
+.SH NAME
+ioctl_ns \- ioctl() operations for Linux namespaces
+.SH DESCRIPTION
+.\" ============================================================
+.\"
+.SS Discovering namespace relationships
+The following
+.BR ioctl (2)
+operations are provided to allow discovery of namespace relationships (see
+.BR user_namespaces (7)
+and
+.BR pid_namespaces (7)).
+The form of the calls is:
+.PP
+.in +4n
+.EX
+new_fd = ioctl(fd, request);
+.EE
+.in
+.PP
+In each case,
+.I fd
+refers to a
+.IR /proc/ pid /ns/*
+file.
+Both operations return a new file descriptor on success.
+.TP
+.BR NS_GET_USERNS " (since Linux 4.9)"
+.\" commit bcac25a58bfc6bd79191ac5d7afb49bea96da8c9
+.\" commit 6786741dbf99e44fb0c0ed85a37582b8a26f1c3b
+Returns a file descriptor that refers to the owning user namespace
+for the namespace referred to by
+.IR fd .
+.TP
+.BR NS_GET_PARENT " (since Linux 4.9)"
+.\" commit a7306ed8d94af729ecef8b6e37506a1c6fc14788
+Returns a file descriptor that refers to the parent namespace of
+the namespace referred to by
+.IR fd .
+This operation is valid only for hierarchical namespaces
+(i.e., PID and user namespaces).
+For user namespaces,
+.B NS_GET_PARENT
+is synonymous with
+.BR NS_GET_USERNS .
+.PP
+The new file descriptor returned by these operations is opened with the
+.B O_RDONLY
+and
+.B O_CLOEXEC
+(close-on-exec; see
+.BR fcntl (2))
+flags.
+.PP
+By applying
+.BR fstat (2)
+to the returned file descriptor, one obtains a
+.I stat
+structure whose
+.I st_dev
+(resident device) and
+.I st_ino
+(inode number) fields together identify the owning/parent namespace.
+This inode number can be matched with the inode number of another
+.IR /proc/ pid /ns/ { pid , user }
+file to determine whether that is the owning/parent namespace.
+.PP
+Either of these
+.BR ioctl (2)
+operations can fail with the following errors:
+.TP
+.B EPERM
+The requested namespace is outside of the caller's namespace scope.
+This error can occur if, for example, the owning user namespace is an
+ancestor of the caller's current user namespace.
+It can also occur on attempts to obtain the parent of the initial
+user or PID namespace.
+.TP
+.B ENOTTY
+The operation is not supported by this kernel version.
+.PP
+Additionally, the
+.B NS_GET_PARENT
+operation can fail with the following error:
+.TP
+.B EINVAL
+.I fd
+refers to a nonhierarchical namespace.
+.PP
+See the EXAMPLES section for an example of the use of these operations.
+.\" ============================================================
+.\"
+.SS Discovering the namespace type
+The
+.B NS_GET_NSTYPE
+.\" commit e5ff5ce6e20ee22511398bb31fb912466cf82a36
+operation (available since Linux 4.11) can be used to discover
+the type of namespace referred to by the file descriptor
+.IR fd :
+.PP
+.in +4n
+.EX
+nstype = ioctl(fd, NS_GET_NSTYPE);
+.EE
+.in
+.PP
+.I fd
+refers to a
+.IR /proc/ pid /ns/*
+file.
+.PP
+The return value is one of the
+.B CLONE_NEW*
+values that can be specified to
+.BR clone (2)
+or
+.BR unshare (2)
+in order to create a namespace.
+.\" ============================================================
+.\"
+.SS Discovering the owner of a user namespace
+The
+.B NS_GET_OWNER_UID
+.\" commit 015bb305b8ebe8d601a238ab70ebdc394c7a19ba
+operation (available since Linux 4.11) can be used to discover
+the owner user ID of a user namespace (i.e., the effective user ID
+of the process that created the user namespace).
+The form of the call is:
+.PP
+.in +4n
+.EX
+uid_t uid;
+ioctl(fd, NS_GET_OWNER_UID, &uid);
+.EE
+.in
+.PP
+.I fd
+refers to a
+.IR /proc/ pid /ns/user
+file.
+.PP
+The owner user ID is returned in the
+.I uid_t
+pointed to by the third argument.
+.PP
+This operation can fail with the following error:
+.TP
+.B EINVAL
+.I fd
+does not refer to a user namespace.
+.SH ERRORS
+Any of the above
+.BR ioctl ()
+operations can return the following errors:
+.TP
+.B ENOTTY
+.I fd
+does not refer to a
+.IR /proc/ pid /ns/ *
+file.
+.SH STANDARDS
+Linux.
+.SH EXAMPLES
+The example shown below uses the
+.BR ioctl (2)
+operations described above to perform simple
+discovery of namespace relationships.
+The following shell sessions show various examples of the use
+of this program.
+.PP
+Trying to get the parent of the initial user namespace fails,
+since it has no parent:
+.PP
+.in +4n
+.EX
+$ \fB./ns_show /proc/self/ns/user p\fP
+The parent namespace is outside your namespace scope
+.EE
+.in
+.PP
+Create a process running
+.BR sleep (1)
+that resides in new user and UTS namespaces,
+and show that the new UTS namespace is associated with the new user namespace:
+.PP
+.in +4n
+.EX
+$ \fBunshare \-Uu sleep 1000 &\fP
+[1] 23235
+$ \fB./ns_show /proc/23235/ns/uts u\fP
+Device/Inode of owning user namespace is: [0,3] / 4026532448
+$ \fBreadlink /proc/23235/ns/user\fP
+user:[4026532448]
+.EE
+.in
+.PP
+Then show that the parent of the new user namespace in the preceding
+example is the initial user namespace:
+.PP
+.in +4n
+.EX
+$ \fBreadlink /proc/self/ns/user\fP
+user:[4026531837]
+$ \fB./ns_show /proc/23235/ns/user p\fP
+Device/Inode of parent namespace is: [0,3] / 4026531837
+.EE
+.in
+.PP
+Start a shell in a new user namespace, and show that from within
+this shell, the parent user namespace can't be discovered.
+Similarly, the UTS namespace
+(which is associated with the initial user namespace)
+can't be discovered.
+.PP
+.in +4n
+.EX
+$ \fBPS1="sh2$ " unshare \-U bash\fP
+sh2$ \fB./ns_show /proc/self/ns/user p\fP
+The parent namespace is outside your namespace scope
+sh2$ \fB./ns_show /proc/self/ns/uts u\fP
+The owning user namespace is outside your namespace scope
+.EE
+.in
+.SS Program source
+\&
+.\" SRC BEGIN (ns_show.c)
+.EX
+/* ns_show.c
+\&
+ Licensed under the GNU General Public License v2 or later.
+*/
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/nsfs.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <sys/sysmacros.h>
+#include <unistd.h>
+\&
+/* Report the owning user namespace (u) and/or the parent namespace (p)
+   of the namespace file named in argv[1].  With no second argument,
+   only the owning user namespace is reported. */
+int
+main(int argc, char *argv[])
+{
+    int fd, userns_fd, parent_fd;
+    struct stat sb;
+\&
+    if (argc < 2) {
+        fprintf(stderr, "Usage: %s /proc/[pid]/ns/[file] [p|u]\en",
+                argv[0]);
+        fprintf(stderr, "\enDisplay the result of one or both "
+                "of NS_GET_USERNS (u) or NS_GET_PARENT (p)\en"
+                "for the specified /proc/[pid]/ns/[file]. If neither "
+                "\[aq]p\[aq] nor \[aq]u\[aq] is specified,\en"
+                "NS_GET_USERNS is the default.\en");
+        exit(EXIT_FAILURE);
+    }
+\&
+    /* Obtain a file descriptor for the \[aq]ns\[aq] file specified
+       in argv[1]. */
+\&
+    fd = open(argv[1], O_RDONLY);
+    if (fd == \-1) {
+        perror("open");
+        exit(EXIT_FAILURE);
+    }
+\&
+    /* Obtain a file descriptor for the owning user namespace and
+       then obtain and display the inode number of that namespace. */
+\&
+    if (argc < 3 || strchr(argv[2], \[aq]u\[aq])) {
+        userns_fd = ioctl(fd, NS_GET_USERNS);
+\&
+        if (userns_fd == \-1) {
+            /* EPERM gets a tailored message; other errors go
+               through perror(). */
+            if (errno == EPERM)
+                printf("The owning user namespace is outside "
+                       "your namespace scope\en");
+            else
+                perror("ioctl\-NS_GET_USERNS");
+            exit(EXIT_FAILURE);
+        }
+\&
+        if (fstat(userns_fd, &sb) == \-1) {
+            perror("fstat\-userns");
+            exit(EXIT_FAILURE);
+        }
+        printf("Device/Inode of owning user namespace is: "
+               "[%x,%x] / %ju\en",
+               major(sb.st_dev),
+               minor(sb.st_dev),
+               (uintmax_t) sb.st_ino);
+\&
+        close(userns_fd);
+    }
+\&
+    /* Obtain a file descriptor for the parent namespace and
+       then obtain and display the inode number of that namespace. */
+\&
+    if (argc > 2 && strchr(argv[2], \[aq]p\[aq])) {
+        parent_fd = ioctl(fd, NS_GET_PARENT);
+\&
+        if (parent_fd == \-1) {
+            /* EINVAL and EPERM get tailored messages; other errors
+               go through perror(). */
+            if (errno == EINVAL)
+                printf("Can\[aq]t get parent namespace of a "
+                       "nonhierarchical namespace\en");
+            else if (errno == EPERM)
+                printf("The parent namespace is outside "
+                       "your namespace scope\en");
+            else
+                perror("ioctl\-NS_GET_PARENT");
+            exit(EXIT_FAILURE);
+        }
+\&
+        if (fstat(parent_fd, &sb) == \-1) {
+            perror("fstat\-parentns");
+            exit(EXIT_FAILURE);
+        }
+        printf("Device/Inode of parent namespace is: [%x,%x] / %ju\en",
+               major(sb.st_dev),
+               minor(sb.st_dev),
+               (uintmax_t) sb.st_ino);
+\&
+        close(parent_fd);
+    }
+\&
+    exit(EXIT_SUCCESS);
+}
+.EE
+.\" SRC END
+.SH SEE ALSO
+.BR fstat (2),
+.BR ioctl (2),
+.BR proc (5),
+.BR namespaces (7)
diff --git a/man2/ioctl_pipe.2 b/man2/ioctl_pipe.2
new file mode 100644
index 0000000..31e02bb
--- /dev/null
+++ b/man2/ioctl_pipe.2
@@ -0,0 +1,64 @@
+.\" Copyright (c) 2022 by Cyril Hrubis <chrubi@suse.cz>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.TH ioctl_pipe 2 2023-05-03 "Linux man-pages 6.05.01"
+.SH NAME
+ioctl_pipe \- ioctl() operations for the general notification mechanism
+.SH SYNOPSIS
+.nf
+.BR "#include <linux/watch_queue.h>" " /* Definition of " IOC_WATCH_QUEUE_ "* */"
+.B #include <sys/ioctl.h>
+.PP
+.BI "int ioctl(int " pipefd "[1], IOC_WATCH_QUEUE_SET_SIZE, int " size );
+.BI "int ioctl(int " pipefd "[1], IOC_WATCH_QUEUE_SET_FILTER,"
+.BI " struct watch_notification_filter *" filter );
+.fi
+.SH DESCRIPTION
+The following
+.BR ioctl (2)
+operations are provided to set up general notification queue parameters.
+The notification queue is built on top of a
+.BR pipe (2)
+opened with the
+.B O_NOTIFICATION_PIPE
+flag.
+.TP
+.BR IOC_WATCH_QUEUE_SET_SIZE " (since Linux 5.8)"
+.\" commit c73be61cede5882f9605a852414db559c0ebedfd
+Preallocates the pipe buffer memory so that
+it can fit
+.I size
+notification messages.
+Currently,
+.I size
+must be between 1 and 512.
+.TP
+.BR IOC_WATCH_QUEUE_SET_FILTER " (since Linux 5.8)"
+.\" commit c73be61cede5882f9605a852414db559c0ebedfd
+A watch queue filter can limit the events that are received.
+Filters are passed in a
+.I struct watch_notification_filter
+and each filter is described by a
+.I struct watch_notification_type_filter
+structure.
+.IP
+.in +4n
+.EX
+struct watch_notification_filter {
+ __u32 nr_filters;
+ __u32 __reserved;
+ struct watch_notification_type_filter filters[];
+};
+\&
+struct watch_notification_type_filter {
+ __u32 type;
+ __u32 info_filter;
+ __u32 info_mask;
+ __u32 subtype_filter[8];
+};
+.EE
+.in
+.SH SEE ALSO
+.BR pipe (2),
+.BR ioctl (2)
diff --git a/man2/ioctl_tty.2 b/man2/ioctl_tty.2
new file mode 100644
index 0000000..dfbd9a8
--- /dev/null
+++ b/man2/ioctl_tty.2
@@ -0,0 +1,913 @@
+'\" t
+.\" Copyright 2002 Walter Harms <walter.harms@informatik.uni-oldenburg.de>
+.\" and Andries Brouwer <aeb@cwi.nl>.
+.\"
+.\" SPDX-License-Identifier: GPL-1.0-or-later
+.\"
+.TH ioctl_tty 2 2023-05-03 "Linux man-pages 6.05.01"
+.SH NAME
+ioctl_tty \- ioctls for terminals and serial lines
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/ioctl.h>
+.BR "#include <asm/termbits.h>" " /* Definition of " "struct termios" ,
+.BR " struct termios2" ", and"
+.BR " Bnnn" ", " BOTHER ", " CBAUD ", " CLOCAL ,
+.BR " TC*" { FLUSH , ON , OFF "} and other constants */"
+.PP
+.BI "int ioctl(int " fd ", int " cmd ", ...);"
+.fi
+.SH DESCRIPTION
+The
+.BR ioctl (2)
+call for terminals and serial ports accepts many possible command arguments.
+Most require a third argument, of varying type, here called
+.I argp
+or
+.IR arg .
+.PP
+Use of
+.BR ioctl ()
+makes for nonportable programs.
+Use the POSIX interface described in
+.BR termios (3)
+whenever possible.
+.PP
+Please note that
+.B struct termios
+from
+.I <asm/termbits.h>
+is different from and incompatible with
+.B struct termios
+from
+.IR <termios.h> .
+These ioctl calls require
+.B struct termios
+from
+.IR <asm/termbits.h> .
+.SS Get and set terminal attributes
+.TP
+.B TCGETS
+Argument:
+.BI "struct termios\~*" argp
+.IP
+Equivalent to
+.IR "tcgetattr(fd, argp)" .
+.IP
+Get the current serial port settings.
+.TP
+.B TCSETS
+Argument:
+.BI "const struct termios\~*" argp
+.IP
+Equivalent to
+.IR "tcsetattr(fd, TCSANOW, argp)" .
+.IP
+Set the current serial port settings.
+.TP
+.B TCSETSW
+Argument:
+.BI "const struct termios\~*" argp
+.IP
+Equivalent to
+.IR "tcsetattr(fd, TCSADRAIN, argp)" .
+.IP
+Allow the output buffer to drain, and
+set the current serial port settings.
+.TP
+.B TCSETSF
+Argument:
+.BI "const struct termios\~*" argp
+.IP
+Equivalent to
+.IR "tcsetattr(fd, TCSAFLUSH, argp)" .
+.IP
+Allow the output buffer to drain, discard pending input, and
+set the current serial port settings.
+.PP
+The following four ioctls, added in Linux 2.6.20,
+.\" commit 64bb6c5e1ddcd47c951740485026ef08975ee2e6
+.\" commit 592ee3a5e5e2a981ef2829a0380093006d045661
+are just like
+.BR TCGETS ,
+.BR TCSETS ,
+.BR TCSETSW ,
+.BR TCSETSF ,
+except that they take a
+.I "struct termios2\~*"
+instead of a
+.IR "struct termios\~*" .
+If the structure member
+.B c_cflag
+contains the flag
+.BR BOTHER ,
+then the baud rate is stored in the structure members
+.B c_ispeed
+and
+.B c_ospeed
+as integer values.
+These ioctls are not supported on all architectures.
+.RS
+.TS
+lb l.
+TCGETS2 \fBstruct termios2 *\fPargp
+TCSETS2 \fBconst struct termios2 *\fPargp
+TCSETSW2 \fBconst struct termios2 *\fPargp
+TCSETSF2 \fBconst struct termios2 *\fPargp
+.TE
+.RE
+.PP
+The following four ioctls are just like
+.BR TCGETS ,
+.BR TCSETS ,
+.BR TCSETSW ,
+.BR TCSETSF ,
+except that they take a
+.I "struct termio\~*"
+instead of a
+.IR "struct termios\~*" .
+.RS
+.TS
+lb l.
+TCGETA \fBstruct termio *\fPargp
+TCSETA \fBconst struct termio *\fPargp
+TCSETAW \fBconst struct termio *\fPargp
+TCSETAF \fBconst struct termio *\fPargp
+.TE
+.RE
+.SS Locking the termios structure
+The
+.I termios
+structure of a terminal can be locked.
+The lock is itself a
+.I termios
+structure, with nonzero bits or fields indicating a
+locked value.
+.TP
+.B TIOCGLCKTRMIOS
+Argument:
+.BI "struct termios\~*" argp
+.IP
+Gets the locking status of the
+.I termios
+structure of the terminal.
+.TP
+.B TIOCSLCKTRMIOS
+Argument:
+.BI "const struct termios\~*" argp
+.IP
+Sets the locking status of the
+.I termios
+structure of the terminal.
+Only a process with the
+.B CAP_SYS_ADMIN
+capability can do this.
+.SS Get and set window size
+Window sizes are kept in the kernel, but not used by the kernel
+(except in the case of virtual consoles, where the kernel will
+update the window size when the size of the virtual console changes,
+for example, by loading a new font).
+.TP
+.B TIOCGWINSZ
+Argument:
+.BI "struct winsize\~*" argp
+.IP
+Get window size.
+.TP
+.B TIOCSWINSZ
+Argument:
+.BI "const struct winsize\~*" argp
+.IP
+Set window size.
+.PP
+The struct used by these ioctls is defined as
+.PP
+.in +4n
+.EX
+struct winsize {
+ unsigned short ws_row;
+ unsigned short ws_col;
+ unsigned short ws_xpixel; /* unused */
+ unsigned short ws_ypixel; /* unused */
+};
+.EE
+.in
+.PP
+When the window size changes, a
+.B SIGWINCH
+signal is sent to the
+foreground process group.
+.SS Sending a break
+.TP
+.B TCSBRK
+Argument:
+.BI "int " arg
+.IP
+Equivalent to
+.IR "tcsendbreak(fd, arg)" .
+.IP
+If the terminal is using asynchronous serial data transmission, and
+.I arg
+is zero, then send a break (a stream of zero bits) for between
+0.25 and 0.5 seconds.
+If the terminal is not using asynchronous
+serial data transmission, then either a break is sent, or the function
+returns without doing anything.
+When
+.I arg
+is nonzero, nobody knows what will happen.
+.IP
+(SVr4, UnixWare, Solaris, and Linux treat
+.I "tcsendbreak(fd,arg)"
+with nonzero
+.I arg
+like
+.IR "tcdrain(fd)" .
+SunOS treats
+.I arg
+as a multiplier, and sends a stream of bits
+.I arg
+times as long as done for zero
+.IR arg .
+DG/UX and AIX treat
+.I arg
+(when nonzero) as a time interval measured in milliseconds.
+HP-UX ignores
+.IR arg .)
+.TP
+.B TCSBRKP
+Argument:
+.BI "int " arg
+.IP
+So-called "POSIX version" of
+.BR TCSBRK .
+It treats nonzero
+.I arg
+as a time interval measured in deciseconds, and does nothing
+when the driver does not support breaks.
+.TP
+.B TIOCSBRK
+Argument:
+.B void
+.IP
+Turn break on, that is, start sending zero bits.
+.TP
+.B TIOCCBRK
+Argument:
+.B void
+.IP
+Turn break off, that is, stop sending zero bits.
+.SS Software flow control
+.TP
+.B TCXONC
+Argument:
+.BI "int " arg
+.IP
+Equivalent to
+.IR "tcflow(fd, arg)" .
+.IP
+See
+.BR tcflow (3)
+for the argument values
+.BR TCOOFF ,
+.BR TCOON ,
+.BR TCIOFF ,
+.BR TCION .
+.SS Buffer count and flushing
+.TP
+.B FIONREAD
+Argument:
+.BI "int\~*" argp
+.IP
+Get the number of bytes in the input buffer.
+.TP
+.B TIOCINQ
+Argument:
+.BI "int\~*" argp
+.IP
+Same as
+.BR FIONREAD .
+.TP
+.B TIOCOUTQ
+Argument:
+.BI "int\~*" argp
+.IP
+Get the number of bytes in the output buffer.
+.TP
+.B TCFLSH
+Argument:
+.BI "int " arg
+.IP
+Equivalent to
+.IR "tcflush(fd, arg)" .
+.IP
+See
+.BR tcflush (3)
+for the argument values
+.BR TCIFLUSH ,
+.BR TCOFLUSH ,
+.BR TCIOFLUSH .
+.TP
+.B TIOCSERGETLSR
+Argument:
+.BI "int\~*" argp
+.IP
+Get line status register.
+Status register has
+.B TIOCSER_TEMT
+bit set when
+output buffer is empty and also hardware transmitter is physically empty.
+.IP
+Does not have to be supported by all serial tty drivers.
+.IP
+.BR tcdrain (3)
+does not wait and returns immediately when
+.B TIOCSER_TEMT
+bit is set.
+.SS Faking input
+.TP
+.B TIOCSTI
+Argument:
+.BI "const char\~*" argp
+.IP
+Insert the given byte in the input queue.
+.SS Redirecting console output
+.TP
+.B TIOCCONS
+Argument:
+.B void
+.IP
+Redirect output that would have gone to
+.I /dev/console
+or
+.I /dev/tty0
+to the given terminal.
+If that was a pseudoterminal master, send it to the slave.
+Before Linux 2.6.10,
+anybody could do this as long as the output was not redirected yet;
+since Linux 2.6.10, only a process with the
+.B CAP_SYS_ADMIN
+capability may do this.
+If output was redirected already, then
+.B EBUSY
+is returned,
+but redirection can be stopped by using this ioctl with
+.I fd
+pointing at
+.I /dev/console
+or
+.IR /dev/tty0 .
+.SS Controlling terminal
+.TP
+.B TIOCSCTTY
+Argument:
+.BI "int " arg
+.IP
+Make the given terminal the controlling terminal of the calling process.
+The calling process must be a session leader and not have a
+controlling terminal already.
+For this case,
+.I arg
+should be specified as zero.
+.IP
+If this terminal is already the controlling terminal
+of a different session group, then the ioctl fails with
+.BR EPERM ,
+unless the caller has the
+.B CAP_SYS_ADMIN
+capability and
+.I arg
+equals 1, in which case the terminal is stolen, and all processes that had
+it as controlling terminal lose it.
+.TP
+.B TIOCNOTTY
+Argument:
+.B void
+.IP
+If the given terminal was the controlling terminal of the calling process,
+give up this controlling terminal.
+If the process was session leader,
+then send
+.B SIGHUP
+and
+.B SIGCONT
+to the foreground process group
+and all processes in the current session lose their controlling terminal.
+.SS Process group and session ID
+.TP
+.B TIOCGPGRP
+Argument:
+.BI "pid_t\~*" argp
+.IP
+When successful, equivalent to
+.IR "*argp = tcgetpgrp(fd)" .
+.IP
+Get the process group ID of the foreground process group on this terminal.
+.TP
+.B TIOCSPGRP
+Argument:
+.BI "const pid_t\~*" argp
+.IP
+Equivalent to
+.IR "tcsetpgrp(fd, *argp)" .
+.IP
+Set the foreground process group ID of this terminal.
+.TP
+.B TIOCGSID
+Argument:
+.BI "pid_t\~*" argp
+.IP
+When successful, equivalent to
+.IR "*argp = tcgetsid(fd)" .
+.IP
+Get the session ID of the given terminal.
+This fails with the error
+.B ENOTTY
+if the terminal is not a master pseudoterminal
+and not our controlling terminal.
+Strange.
+.SS Exclusive mode
+.TP
+.B TIOCEXCL
+Argument:
+.B void
+.IP
+Put the terminal into exclusive mode.
+No further
+.BR open (2)
+operations on the terminal are permitted.
+(They fail with
+.BR EBUSY ,
+except for a process with the
+.B CAP_SYS_ADMIN
+capability.)
+.TP
+.B TIOCGEXCL
+Argument:
+.BI "int\~*" argp
+.IP
+(since Linux 3.8)
+If the terminal is currently in exclusive mode,
+place a nonzero value in the location pointed to by
+.IR argp ;
+otherwise, place zero in
+.IR *argp .
+.TP
+.B TIOCNXCL
+Argument:
+.B void
+.IP
+Disable exclusive mode.
+.SS Line discipline
+.TP
+.B TIOCGETD
+Argument:
+.BI "int\~*" argp
+.IP
+Get the line discipline of the terminal.
+.TP
+.B TIOCSETD
+Argument:
+.BI "const int\~*" argp
+.IP
+Set the line discipline of the terminal.
+.SS Pseudoterminal ioctls
+.TP
+.B TIOCPKT
+Argument:
+.BI "const int\~*" argp
+.IP
+Enable (when
+.RI * argp
+is nonzero) or disable packet mode.
+Can be applied to the master side of a pseudoterminal only (and will return
+.B ENOTTY
+otherwise).
+In packet mode, each subsequent
+.BR read (2)
+will return a packet that either contains a single nonzero control byte,
+or has a single byte containing zero (\[aq]\e0\[aq]) followed by data
+written on the slave side of the pseudoterminal.
+If the first byte is not
+.B TIOCPKT_DATA
+(0), it is an OR of one
+or more of the following bits:
+.IP
+.ad l
+.TS
+lb l.
+TIOCPKT_FLUSHREAD T{
+The read queue for the terminal is flushed.
+T}
+TIOCPKT_FLUSHWRITE T{
+The write queue for the terminal is flushed.
+T}
+TIOCPKT_STOP T{
+Output to the terminal is stopped.
+T}
+TIOCPKT_START T{
+Output to the terminal is restarted.
+T}
+TIOCPKT_DOSTOP T{
+The start and stop characters are \fB\[ha]S\fP/\fB\[ha]Q\fP.
+T}
+TIOCPKT_NOSTOP T{
+The start and stop characters are not \fB\[ha]S\fP/\fB\[ha]Q\fP.
+T}
+.TE
+.ad
+.IP
+While packet mode is in use, the presence
+of control status information to be read
+from the master side may be detected by a
+.BR select (2)
+for exceptional conditions or a
+.BR poll (2)
+for the
+.B POLLPRI
+event.
+.IP
+This mode is used by
+.BR rlogin (1)
+and
+.BR rlogind (8)
+to implement a remote-echoed,
+locally \fB\[ha]S\fP/\fB\[ha]Q\fP flow-controlled remote login.
+.TP
+.B TIOCGPKT
+Argument:
+.BI "int\~*" argp
+.IP
+(since Linux 3.8)
+Return the current packet mode setting in the integer pointed to by
+.IR argp .
+.TP
+.B TIOCSPTLCK
+Argument:
+.BI "int\~*" argp
+.IP
+Set (if
+.I *argp
+is nonzero) or remove (if
+.I *argp
+is zero) the lock on the pseudoterminal slave device.
+(See also
+.BR unlockpt (3).)
+.TP
+.B TIOCGPTLCK
+Argument:
+.BI "int\~*" argp
+.IP
+(since Linux 3.8)
+Place the current lock state of the pseudoterminal slave device
+in the location pointed to by
+.IR argp .
+.TP
+.B TIOCGPTPEER
+Argument:
+.BI "int " flags
+.IP
+.\" commit 54ebbfb1603415d9953c150535850d30609ef077
+(since Linux 4.13)
+Given a file descriptor in
+.I fd
+that refers to a pseudoterminal master,
+open (with the given
+.BR open (2)-style
+.IR flags )
+and return a new file descriptor that refers to the peer
+pseudoterminal slave device.
+This operation can be performed
+regardless of whether the pathname of the slave device
+is accessible through the calling process's mount namespace.
+.IP
+Security-conscious programs interacting with namespaces may wish to use this
+operation rather than
+.BR open (2)
+with the pathname returned by
+.BR ptsname (3),
+and similar library functions that have insecure APIs.
+(For example, confusion can occur in some cases using
+.BR ptsname (3)
+with a pathname where a devpts filesystem
+has been mounted in a different mount namespace.)
+.PP
+The BSD ioctls
+.BR TIOCSTOP ,
+.BR TIOCSTART ,
+.BR TIOCUCNTL ,
+and
+.B TIOCREMOTE
+have not been implemented under Linux.
+.SS Modem control
+.TP
+.B TIOCMGET
+Argument:
+.BI "int\~*" argp
+.IP
+Get the status of modem bits.
+.TP
+.B TIOCMSET
+Argument:
+.BI "const int\~*" argp
+.IP
+Set the status of modem bits.
+.TP
+.B TIOCMBIC
+Argument:
+.BI "const int\~*" argp
+.IP
+Clear the indicated modem bits.
+.TP
+.B TIOCMBIS
+Argument:
+.BI "const int\~*" argp
+.IP
+Set the indicated modem bits.
+.PP
+The following bits are used by the above ioctls:
+.PP
+.TS
+lb l.
+TIOCM_LE DSR (data set ready/line enable)
+TIOCM_DTR DTR (data terminal ready)
+TIOCM_RTS RTS (request to send)
+TIOCM_ST Secondary TXD (transmit)
+TIOCM_SR Secondary RXD (receive)
+TIOCM_CTS CTS (clear to send)
+TIOCM_CAR DCD (data carrier detect)
+TIOCM_CD see TIOCM_CAR
+TIOCM_RNG RNG (ring)
+TIOCM_RI see TIOCM_RNG
+TIOCM_DSR DSR (data set ready)
+.TE
+.TP
+.B TIOCMIWAIT
+Argument:
+.BI "int " arg
+.IP
+Wait for any of the 4 modem bits (DCD, RI, DSR, CTS) to change.
+The bits of interest are specified as a bit mask in
+.IR arg ,
+by ORing together any of the bit values,
+.BR TIOCM_RNG ,
+.BR TIOCM_DSR ,
+.BR TIOCM_CD ,
+and
+.BR TIOCM_CTS .
+The caller should use
+.B TIOCGICOUNT
+to see which bit has changed.
+.TP
+.B TIOCGICOUNT
+Argument:
+.BI "struct serial_icounter_struct\~*" argp
+.IP
+Get counts of input serial line interrupts (DCD, RI, DSR, CTS).
+The counts are written to the
+.I serial_icounter_struct
+structure pointed to by
+.IR argp .
+.IP
+Note: both 1->0 and 0->1 transitions are counted, except for
+RI, where only 0->1 transitions are counted.
+.SS Marking a line as local
+.TP
+.B TIOCGSOFTCAR
+Argument:
+.BI "int\~*" argp
+.IP
+("Get software carrier flag")
+Get the status of the CLOCAL flag in the c_cflag field of the
+.I termios
+structure.
+.TP
+.B TIOCSSOFTCAR
+Argument:
+.BI "const int\~*" argp
+.IP
+("Set software carrier flag")
+Set the CLOCAL flag in the
+.I termios
+structure when
+.RI * argp
+is nonzero, and clear it otherwise.
+.PP
+If the
+.B CLOCAL
+flag for a line is off, the hardware carrier detect (DCD)
+signal is significant, and an
+.BR open (2)
+of the corresponding terminal will block until DCD is asserted,
+unless the
+.B O_NONBLOCK
+flag is given.
+If
+.B CLOCAL
+is set, the line behaves as if DCD is always asserted.
+The software carrier flag is usually turned on for local devices,
+and is off for lines with modems.
+.SS Linux-specific
+For the
+.B TIOCLINUX
+ioctl, see
+.BR ioctl_console (2).
+.SS Kernel debugging
+.B "#include <linux/tty.h>"
+.TP
+.B TIOCTTYGSTRUCT
+Argument:
+.BI "struct tty_struct\~*" argp
+.IP
+Get the
+.I tty_struct
+corresponding to
+.IR fd .
+This command was removed in Linux 2.5.67.
+.\" commit b3506a09d15dc5aee6d4bb88d759b157016e1864
+.\" Author: Andries E. Brouwer <andries.brouwer@cwi.nl>
+.\" Date: Tue Apr 1 04:42:46 2003 -0800
+.\"
+.\" [PATCH] kill TIOCTTYGSTRUCT
+.\"
+.\" Only used for (dubious) debugging purposes, and exposes
+.\" internal kernel state.
+.\"
+.\" .SS Serial info
+.\" .BR "#include <linux/serial.h>"
+.\" .PP
+.\" .TP
+.\" .BI "TIOCGSERIAL struct serial_struct *" argp
+.\" Get serial info.
+.\" .TP
+.\" .BI "TIOCSSERIAL const struct serial_struct *" argp
+.\" Set serial info.
+.SH RETURN VALUE
+The
+.BR ioctl (2)
+system call returns 0 on success.
+On error, it returns \-1 and sets
+.I errno
+to indicate the error.
+.SH ERRORS
+.TP
+.B EINVAL
+Invalid command parameter.
+.TP
+.B ENOIOCTLCMD
+Unknown command.
+.TP
+.B ENOTTY
+Inappropriate
+.IR fd .
+.TP
+.B EPERM
+Insufficient permission.
+.SH EXAMPLES
+Check the condition of DTR on the serial port.
+.PP
+.\" SRC BEGIN (tiocmget.c)
+.EX
+#include <fcntl.h>
+#include <stdio.h>
+#include <sys/ioctl.h>
+#include <unistd.h>
+\&
+int
+main(void)
+{
+ int fd, serial;
+\&
+ fd = open("/dev/ttyS0", O_RDONLY);
+ ioctl(fd, TIOCMGET, &serial);
+ if (serial & TIOCM_DTR)
+ puts("TIOCM_DTR is set");
+ else
+ puts("TIOCM_DTR is not set");
+ close(fd);
+}
+.EE
+.\" SRC END
+.PP
+Get or set arbitrary baudrate on the serial port.
+.PP
+.\" SRC BEGIN (tcgets.c)
+.EX
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+\&
+#include <asm/termbits.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/ioctl.h>
+#include <unistd.h>
+\&
+int
+main(int argc, char *argv[])
+{
+#if !defined BOTHER
+ fprintf(stderr, "BOTHER is unsupported\en");
+ /* Program may fallback to TCGETS/TCSETS with Bnnn constants */
+ exit(EXIT_FAILURE);
+#else
+ /* Declare tio structure, its type depends on supported ioctl */
+# if defined TCGETS2
+ struct termios2 tio;
+# else
+ struct termios tio;
+# endif
+ int fd, rc;
+\&
+ if (argc != 2 && argc != 3 && argc != 4) {
+ fprintf(stderr, "Usage: %s device [output [input] ]\en", argv[0]);
+ exit(EXIT_FAILURE);
+ }
+\&
+ fd = open(argv[1], O_RDWR | O_NONBLOCK | O_NOCTTY);
+ if (fd < 0) {
+ perror("open");
+ exit(EXIT_FAILURE);
+ }
+\&
+ /* Get the current serial port settings via supported ioctl */
+# if defined TCGETS2
+ rc = ioctl(fd, TCGETS2, &tio);
+# else
+ rc = ioctl(fd, TCGETS, &tio);
+# endif
+ if (rc) {
+ perror("TCGETS");
+ close(fd);
+ exit(EXIT_FAILURE);
+ }
+\&
+ /* Change baud rate when more arguments were provided */
+ if (argc == 3 || argc == 4) {
+ /* Clear the current output baud rate and fill a new value */
+ tio.c_cflag &= \[ti]CBAUD;
+ tio.c_cflag |= BOTHER;
+ tio.c_ospeed = atoi(argv[2]);
+\&
+ /* Clear the current input baud rate and fill a new value */
+ tio.c_cflag &= \[ti](CBAUD << IBSHIFT);
+ tio.c_cflag |= BOTHER << IBSHIFT;
+ /* When 4th argument is not provided reuse output baud rate */
+ tio.c_ispeed = (argc == 4) ? atoi(argv[3]) : atoi(argv[2]);
+\&
+ /* Set new serial port settings via supported ioctl */
+# if defined TCSETS2
+ rc = ioctl(fd, TCSETS2, &tio);
+# else
+ rc = ioctl(fd, TCSETS, &tio);
+# endif
+ if (rc) {
+ perror("TCSETS");
+ close(fd);
+ exit(EXIT_FAILURE);
+ }
+\&
+ /* And get new values which were really configured */
+# if defined TCGETS2
+ rc = ioctl(fd, TCGETS2, &tio);
+# else
+ rc = ioctl(fd, TCGETS, &tio);
+# endif
+ if (rc) {
+ perror("TCGETS");
+ close(fd);
+ exit(EXIT_FAILURE);
+ }
+ }
+\&
+ close(fd);
+\&
+ printf("output baud rate: %u\en", tio.c_ospeed);
+ printf("input baud rate: %u\en", tio.c_ispeed);
+\&
+ exit(EXIT_SUCCESS);
+#endif
+}
+.EE
+.\" SRC END
+.SH SEE ALSO
+.BR ldattach (8),
+.BR ioctl (2),
+.BR ioctl_console (2),
+.BR termios (3),
+.BR pty (7)
+.\"
+.\" FIONBIO const int *
+.\" FIONCLEX void
+.\" FIOCLEX void
+.\" FIOASYNC const int *
+.\" from serial.c:
+.\" TIOCSERCONFIG void
+.\" TIOCSERGWILD int *
+.\" TIOCSERSWILD const int *
+.\" TIOCSERGSTRUCT struct async_struct *
+.\" TIOCSERGETMULTI struct serial_multiport_struct *
+.\" TIOCSERSETMULTI const struct serial_multiport_struct *
+.\" TIOCGSERIAL, TIOCSSERIAL (see above)
diff --git a/man2/ioctl_userfaultfd.2 b/man2/ioctl_userfaultfd.2
new file mode 100644
index 0000000..6ab9c11
--- /dev/null
+++ b/man2/ioctl_userfaultfd.2
@@ -0,0 +1,906 @@
+.\" Copyright (c) 2016, IBM Corporation.
+.\" Written by Mike Rapoport <rppt@linux.vnet.ibm.com>
+.\" and Copyright (C) 2016 Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\"
+.TH ioctl_userfaultfd 2 2023-05-03 "Linux man-pages 6.05.01"
+.SH NAME
+ioctl_userfaultfd \- create a file descriptor for handling page faults in user
+space
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#include <linux/userfaultfd.h>" " /* Definition of " UFFD* " constants */"
+.B #include <sys/ioctl.h>
+.PP
+.BI "int ioctl(int " fd ", int " cmd ", ...);"
+.fi
+.SH DESCRIPTION
+Various
+.BR ioctl (2)
+operations can be performed on a userfaultfd object (created by a call to
+.BR userfaultfd (2))
+using calls of the form:
+.PP
+.in +4n
+.EX
+ioctl(fd, cmd, argp);
+.EE
+.in
+In the above,
+.I fd
+is a file descriptor referring to a userfaultfd object,
+.I cmd
+is one of the commands listed below, and
+.I argp
+is a pointer to a data structure that is specific to
+.IR cmd .
+.PP
+The various
+.BR ioctl (2)
+operations are described below.
+The
+.BR UFFDIO_API ,
+.BR UFFDIO_REGISTER ,
+and
+.B UFFDIO_UNREGISTER
+operations are used to
+.I configure
+userfaultfd behavior.
+These operations allow the caller to choose what features will be enabled and
+what kinds of events will be delivered to the application.
+The remaining operations are
+.I range
+operations.
+These operations enable the calling application to resolve page-fault
+events.
+.\"
+.SS UFFDIO_API
+(Since Linux 4.3.)
+Enable operation of the userfaultfd and perform API handshake.
+.PP
+The
+.I argp
+argument is a pointer to a
+.I uffdio_api
+structure, defined as:
+.PP
+.in +4n
+.EX
+struct uffdio_api {
+ __u64 api; /* Requested API version (input) */
+ __u64 features; /* Requested features (input/output) */
+ __u64 ioctls; /* Available ioctl() operations (output) */
+};
+.EE
+.in
+.PP
+The
+.I api
+field denotes the API version requested by the application.
+.PP
+The kernel verifies that it can support the requested API version,
+and sets the
+.I features
+and
+.I ioctls
+fields to bit masks representing all the available features and the generic
+.BR ioctl (2)
+operations available.
+.PP
+Before Linux 4.11, the
+.I features
+field must be initialized to zero before the call to
+.BR UFFDIO_API ,
+and zero (i.e., no feature bits) is placed in the
+.I features
+field by the kernel upon return from
+.BR ioctl (2).
+.PP
+Starting from Linux 4.11, the
+.I features
+field can be used to ask whether particular features are supported
+and explicitly enable userfaultfd features that are disabled by default.
+The kernel always reports all the available features in the
+.I features
+field.
+.PP
+To enable userfaultfd features the application should set
+a bit corresponding to each feature it wants to enable in the
+.I features
+field.
+If the kernel supports all the requested features it will enable them.
+Otherwise it will zero out the returned
+.I uffdio_api
+structure and return
+.BR EINVAL .
+.\" FIXME add more details about feature negotiation and enablement
+.PP
+The following feature bits may be set:
+.TP
+.BR UFFD_FEATURE_EVENT_FORK " (since Linux 4.11)"
+When this feature is enabled,
+the userfaultfd objects associated with a parent process are duplicated
+into the child process during
+.BR fork (2)
+and a
+.B UFFD_EVENT_FORK
+event is delivered to the userfaultfd monitor
+.TP
+.BR UFFD_FEATURE_EVENT_REMAP " (since Linux 4.11)"
+If this feature is enabled,
+when the faulting process invokes
+.BR mremap (2),
+the userfaultfd monitor will receive an event of type
+.BR UFFD_EVENT_REMAP .
+.TP
+.BR UFFD_FEATURE_EVENT_REMOVE " (since Linux 4.11)"
+If this feature is enabled,
+when the faulting process calls
+.BR madvise (2)
+with the
+.B MADV_DONTNEED
+or
+.B MADV_REMOVE
+advice value to free a virtual memory area
+the userfaultfd monitor will receive an event of type
+.BR UFFD_EVENT_REMOVE .
+.TP
+.BR UFFD_FEATURE_EVENT_UNMAP " (since Linux 4.11)"
+If this feature is enabled,
+when the faulting process unmaps virtual memory either explicitly with
+.BR munmap (2),
+or implicitly during either
+.BR mmap (2)
+or
+.BR mremap (2),
+the userfaultfd monitor will receive an event of type
+.BR UFFD_EVENT_UNMAP .
+.TP
+.BR UFFD_FEATURE_MISSING_HUGETLBFS " (since Linux 4.11)"
+If this feature bit is set,
+the kernel supports registering userfaultfd ranges on hugetlbfs
+virtual memory areas.
+.TP
+.BR UFFD_FEATURE_MISSING_SHMEM " (since Linux 4.11)"
+If this feature bit is set,
+the kernel supports registering userfaultfd ranges on shared memory areas.
+This includes all kernel shared memory APIs:
+System V shared memory,
+.BR tmpfs (5),
+shared mappings of
+.IR /dev/zero ,
+.BR mmap (2)
+with the
+.B MAP_SHARED
+flag set,
+.BR memfd_create (2),
+and so on.
+.TP
+.BR UFFD_FEATURE_SIGBUS " (since Linux 4.14)"
+.\" commit 2d6d6f5a09a96cc1fec7ed992b825e05f64cb50e
+If this feature bit is set, no page-fault events
+.RB ( UFFD_EVENT_PAGEFAULT )
+will be delivered.
+Instead, a
+.B SIGBUS
+signal will be sent to the faulting process.
+Applications using this
+feature will not require the use of a userfaultfd monitor for processing
+memory accesses to the regions registered with userfaultfd.
+.TP
+.BR UFFD_FEATURE_THREAD_ID " (since Linux 4.14)"
+If this feature bit is set,
+.I uffd_msg.pagefault.feat.ptid
+will be set to the faulted thread ID for each page-fault message.
+.TP
+.BR UFFD_FEATURE_MINOR_HUGETLBFS " (since Linux 5.13)"
+If this feature bit is set,
+the kernel supports registering userfaultfd ranges
+in minor mode on hugetlbfs-backed memory areas.
+.TP
+.BR UFFD_FEATURE_MINOR_SHMEM " (since Linux 5.14)"
+If this feature bit is set,
+the kernel supports registering userfaultfd ranges
+in minor mode on shmem-backed memory areas.
+.TP
+.BR UFFD_FEATURE_EXACT_ADDRESS " (since Linux 5.18)"
+If this feature bit is set,
+.I uffd_msg.pagefault.address
+will be set to the exact page-fault address that was reported by the hardware,
+and will not mask the offset within the page.
+Note that old Linux versions might indicate the exact address as well,
+even though the feature bit is not set.
+.PP
+The returned
+.I ioctls
+field can contain the following bits:
+.\" FIXME This user-space API seems not fully polished. Why are there
+.\" not constants defined for each of the bit-mask values listed below?
+.TP
+.B 1 << _UFFDIO_API
+The
+.B UFFDIO_API
+operation is supported.
+.TP
+.B 1 << _UFFDIO_REGISTER
+The
+.B UFFDIO_REGISTER
+operation is supported.
+.TP
+.B 1 << _UFFDIO_UNREGISTER
+The
+.B UFFDIO_UNREGISTER
+operation is supported.
+.PP
+This
+.BR ioctl (2)
+operation returns 0 on success.
+On error, \-1 is returned and
+.I errno
+is set to indicate the error.
+Possible errors include:
+.TP
+.B EFAULT
+.I argp
+refers to an address that is outside the calling process's
+accessible address space.
+.TP
+.B EINVAL
+The userfaultfd has already been enabled by a previous
+.B UFFDIO_API
+operation.
+.TP
+.B EINVAL
+The API version requested in the
+.I api
+field is not supported by this kernel, or the
+.I features
+field passed to the kernel includes feature bits that are not supported
+by the current kernel version.
+.\" FIXME In the above error case, the returned 'uffdio_api' structure is
+.\" zeroed out. Why is this done? This should be explained in the manual page.
+.\"
+.\" Mike Rapoport:
+.\" In my understanding the uffdio_api
+.\" structure is zeroed to allow the caller
+.\" to distinguish the reasons for -EINVAL.
+.\"
+.SS UFFDIO_REGISTER
+(Since Linux 4.3.)
+Register a memory address range with the userfaultfd object.
+The pages in the range must be "compatible".
+Please refer to the list of register modes below
+for the compatible memory backends for each mode.
+.PP
+The
+.I argp
+argument is a pointer to a
+.I uffdio_register
+structure, defined as:
+.PP
+.in +4n
+.EX
+struct uffdio_range {
+ __u64 start; /* Start of range */
+ __u64 len; /* Length of range (bytes) */
+};
+\&
+struct uffdio_register {
+ struct uffdio_range range;
+ __u64 mode; /* Desired mode of operation (input) */
+ __u64 ioctls; /* Available ioctl() operations (output) */
+};
+.EE
+.in
+.PP
+The
+.I range
+field defines a memory range starting at
+.I start
+and continuing for
+.I len
+bytes that should be handled by the userfaultfd.
+.PP
+The
+.I mode
+field defines the mode of operation desired for this memory region.
+The following values may be bitwise ORed to set the userfaultfd mode for
+the specified range:
+.TP
+.B UFFDIO_REGISTER_MODE_MISSING
+Track page faults on missing pages.
+Since Linux 4.3,
+only private anonymous ranges are compatible.
+Since Linux 4.11,
+hugetlbfs and shared memory ranges are also compatible.
+.TP
+.B UFFDIO_REGISTER_MODE_WP
+Track page faults on write-protected pages.
+Since Linux 5.7,
+only private anonymous ranges are compatible.
+.TP
+.B UFFDIO_REGISTER_MODE_MINOR
+Track minor page faults.
+Since Linux 5.13,
+only hugetlbfs ranges are compatible.
+Since Linux 5.14,
+compatibility with shmem ranges was added.
+.PP
+If the operation is successful, the kernel modifies the
+.I ioctls
+bit-mask field to indicate which
+.BR ioctl (2)
+operations are available for the specified range.
+This returned bit mask can contain the following bits:
+.TP
+.B 1 << _UFFDIO_COPY
+The
+.B UFFDIO_COPY
+operation is supported.
+.TP
+.B 1 << _UFFDIO_WAKE
+The
+.B UFFDIO_WAKE
+operation is supported.
+.TP
+.B 1 << _UFFDIO_WRITEPROTECT
+The
+.BR UFFDIO_WRITEPROTECT " operation is supported."
+.TP
+.B 1 << _UFFDIO_ZEROPAGE
+The
+.B UFFDIO_ZEROPAGE
+operation is supported.
+.TP
+.B 1 << _UFFDIO_CONTINUE
+The
+.B UFFDIO_CONTINUE
+operation is supported.
+.PP
+This
+.BR ioctl (2)
+operation returns 0 on success.
+On error, \-1 is returned and
+.I errno
+is set to indicate the error.
+Possible errors include:
+.\" FIXME Is the following error list correct?
+.\"
+.TP
+.B EBUSY
+A mapping in the specified range is registered with another
+userfaultfd object.
+.TP
+.B EFAULT
+.I argp
+refers to an address that is outside the calling process's
+accessible address space.
+.TP
+.B EINVAL
+An invalid or unsupported bit was specified in the
+.I mode
+field; or the
+.I mode
+field was zero.
+.TP
+.B EINVAL
+There is no mapping in the specified address range.
+.TP
+.B EINVAL
+.I range.start
+or
+.I range.len
+is not a multiple of the system page size; or,
+.I range.len
+is zero; or these fields are otherwise invalid.
+.TP
+.B EINVAL
+There was an incompatible mapping in the specified address range.
+.\" Mike Rapoport:
+.\" ENOMEM if the process is exiting and the
+.\" mm_struct has gone by the time userfault grabs it.
+.SS UFFDIO_UNREGISTER
+(Since Linux 4.3.)
+Unregister a memory address range from userfaultfd.
+The pages in the range must be "compatible" (see the description of
+.BR UFFDIO_REGISTER ).
+.PP
+The address range to unregister is specified in the
+.I uffdio_range
+structure pointed to by
+.IR argp .
+.PP
+This
+.BR ioctl (2)
+operation returns 0 on success.
+On error, \-1 is returned and
+.I errno
+is set to indicate the error.
+Possible errors include:
+.TP
+.B EINVAL
+Either the
+.I start
+or the
+.I len
+field of the
+.I uffdio_range
+structure was not a multiple of the system page size; or the
+.I len
+field was zero; or these fields were otherwise invalid.
+.TP
+.B EINVAL
+There was an incompatible mapping in the specified address range.
+.TP
+.B EINVAL
+There was no mapping in the specified address range.
+.\"
+.SS UFFDIO_COPY
+(Since Linux 4.3.)
+Atomically copy a contiguous memory chunk into the userfault registered
+range and optionally wake up the blocked thread.
+The source and destination addresses and the number of bytes to copy are
+specified by the
+.IR src ", " dst ", and " len
+fields of the
+.I uffdio_copy
+structure pointed to by
+.IR argp :
+.PP
+.in +4n
+.EX
+struct uffdio_copy {
+ __u64 dst; /* Destination of copy */
+ __u64 src; /* Source of copy */
+ __u64 len; /* Number of bytes to copy */
+ __u64 mode; /* Flags controlling behavior of copy */
+ __s64 copy; /* Number of bytes copied, or negated error */
+};
+.EE
+.in
+.PP
+The following values may be bitwise ORed in
+.I mode
+to change the behavior of the
+.B UFFDIO_COPY
+operation:
+.TP
+.B UFFDIO_COPY_MODE_DONTWAKE
+Do not wake up the thread that waits for page-fault resolution.
+.TP
+.B UFFDIO_COPY_MODE_WP
+Copy the page with read-only permission.
+This allows the user to trap the next write to the page,
+which will block and generate another write-protect userfault message.
+This is used only when both
+.B UFFDIO_REGISTER_MODE_MISSING
+and
+.B UFFDIO_REGISTER_MODE_WP
+modes are enabled for the registered range.
+.PP
+The
+.I copy
+field is used by the kernel to return the number of bytes
+that was actually copied, or an error (a negated
+.IR errno -style
+value).
+.\" FIXME Above: Why is the 'copy' field used to return error values?
+.\" This should be explained in the manual page.
+If the value returned in
+.I copy
+doesn't match the value that was specified in
+.IR len ,
+the operation fails with the error
+.BR EAGAIN .
+The
+.I copy
+field is output-only;
+it is not read by the
+.B UFFDIO_COPY
+operation.
+.PP
+This
+.BR ioctl (2)
+operation returns 0 on success.
+In this case, the entire area was copied.
+On error, \-1 is returned and
+.I errno
+is set to indicate the error.
+Possible errors include:
+.TP
+.B EAGAIN
+The number of bytes copied (i.e., the value returned in the
+.I copy
+field)
+does not equal the value that was specified in the
+.I len
+field.
+.TP
+.B EINVAL
+Either
+.I dst
+or
+.I len
+was not a multiple of the system page size, or the range specified by
+.I src
+and
+.I len
+or
+.I dst
+and
+.I len
+was invalid.
+.TP
+.B EINVAL
+An invalid bit was specified in the
+.I mode
+field.
+.TP
+.BR ENOENT " (since Linux 4.11)"
+The faulting process has changed
+its virtual memory layout simultaneously with an outstanding
+.B UFFDIO_COPY
+operation.
+.TP
+.BR ENOSPC " (from Linux 4.11 until Linux 4.13)"
+The faulting process has exited at the time of a
+.B UFFDIO_COPY
+operation.
+.TP
+.BR ESRCH " (since Linux 4.13)"
+The faulting process has exited at the time of a
+.B UFFDIO_COPY
+operation.
+.\"
+.SS UFFDIO_ZEROPAGE
+(Since Linux 4.3.)
+Zero out a memory range registered with userfaultfd.
+.PP
+The requested range is specified by the
+.I range
+field of the
+.I uffdio_zeropage
+structure pointed to by
+.IR argp :
+.PP
+.in +4n
+.EX
+struct uffdio_zeropage {
+ struct uffdio_range range;
+ __u64 mode; /* Flags controlling behavior of zeropage */
+ __s64 zeropage; /* Number of bytes zeroed, or negated error */
+};
+.EE
+.in
+.PP
+The following value may be bitwise ORed in
+.I mode
+to change the behavior of the
+.B UFFDIO_ZEROPAGE
+operation:
+.TP
+.B UFFDIO_ZEROPAGE_MODE_DONTWAKE
+Do not wake up the thread that waits for page-fault resolution.
+.PP
+The
+.I zeropage
+field is used by the kernel to return the number of bytes
+that was actually zeroed,
+or an error in the same manner as
+.BR UFFDIO_COPY .
+.\" FIXME Why is the 'zeropage' field used to return error values?
+.\" This should be explained in the manual page.
+If the value returned in the
+.I zeropage
+field doesn't match the value that was specified in
+.IR range.len ,
+the operation fails with the error
+.BR EAGAIN .
+The
+.I zeropage
+field is output-only;
+it is not read by the
+.B UFFDIO_ZEROPAGE
+operation.
+.PP
+This
+.BR ioctl (2)
+operation returns 0 on success.
+In this case, the entire area was zeroed.
+On error, \-1 is returned and
+.I errno
+is set to indicate the error.
+Possible errors include:
+.TP
+.B EAGAIN
+The number of bytes zeroed (i.e., the value returned in the
+.I zeropage
+field)
+does not equal the value that was specified in the
+.I range.len
+field.
+.TP
+.B EINVAL
+Either
+.I range.start
+or
+.I range.len
+was not a multiple of the system page size; or
+.I range.len
+was zero; or the range specified was invalid.
+.TP
+.B EINVAL
+An invalid bit was specified in the
+.I mode
+field.
+.TP
+.BR ESRCH " (since Linux 4.13)"
+The faulting process has exited at the time of a
+.B UFFDIO_ZEROPAGE
+operation.
+.\"
+.SS UFFDIO_WAKE
+(Since Linux 4.3.)
+Wake up the thread waiting for page-fault resolution on
+a specified memory address range.
+.PP
+The
+.B UFFDIO_WAKE
+operation is used in conjunction with
+.B UFFDIO_COPY
+and
+.B UFFDIO_ZEROPAGE
+operations that have the
+.B UFFDIO_COPY_MODE_DONTWAKE
+or
+.B UFFDIO_ZEROPAGE_MODE_DONTWAKE
+bit set in the
+.I mode
+field.
+The userfault monitor can perform several
+.B UFFDIO_COPY
+and
+.B UFFDIO_ZEROPAGE
+operations in a batch and then explicitly wake up the faulting thread using
+.BR UFFDIO_WAKE .
+.PP
+The
+.I argp
+argument is a pointer to a
+.I uffdio_range
+structure (shown above) that specifies the address range.
+.PP
+This
+.BR ioctl (2)
+operation returns 0 on success.
+On error, \-1 is returned and
+.I errno
+is set to indicate the error.
+Possible errors include:
+.TP
+.B EINVAL
+The
+.I start
+or the
+.I len
+field of the
+.I uffdio_range
+structure was not a multiple of the system page size; or
+.I len
+was zero; or the specified range was otherwise invalid.
+.SS UFFDIO_WRITEPROTECT (Since Linux 5.7)
+Write-protect or write-unprotect a userfaultfd-registered memory range
+registered with mode
+.BR UFFDIO_REGISTER_MODE_WP .
+.PP
+The
+.I argp
+argument is a pointer to a
+.I uffdio_writeprotect
+structure as shown below:
+.PP
+.in +4n
+.EX
+struct uffdio_writeprotect {
+ struct uffdio_range range; /* Range to change write permission*/
+ __u64 mode; /* Mode to change write permission */
+};
+.EE
+.in
+.PP
+There are two mode bits that are supported in this structure:
+.TP
+.B UFFDIO_WRITEPROTECT_MODE_WP
+When this mode bit is set,
+the ioctl will be a write-protect operation upon the memory range specified by
+.IR range .
+Otherwise it will be a write-unprotect operation upon the specified range,
+which can be used to resolve a userfaultfd write-protect page fault.
+.TP
+.B UFFDIO_WRITEPROTECT_MODE_DONTWAKE
+When this mode bit is set,
+do not wake up any thread that waits for
+page-fault resolution after the operation.
+This can be specified only if
+.B UFFDIO_WRITEPROTECT_MODE_WP
+is not specified.
+.PP
+This
+.BR ioctl (2)
+operation returns 0 on success.
+On error, \-1 is returned and
+.I errno
+is set to indicate the error.
+Possible errors include:
+.TP
+.B EINVAL
+The
+.I start
+or the
+.I len
+field of the
+.I uffdio_range
+structure was not a multiple of the system page size; or
+.I len
+was zero; or the specified range was otherwise invalid.
+.TP
+.B EAGAIN
+The process was interrupted; retry this call.
+.TP
+.B ENOENT
+The range specified in
+.I range
+is not valid.
+For example, the virtual address does not exist,
+or is not registered with userfaultfd write-protect mode.
+.TP
+.B EFAULT
+Encountered a generic fault during processing.
+.\"
+.SS UFFDIO_CONTINUE
+(Since Linux 5.13.)
+Resolve a minor page fault
+by installing page table entries
+for existing pages in the page cache.
+.PP
+The
+.I argp
+argument is a pointer to a
+.I uffdio_continue
+structure as shown below:
+.PP
+.in +4n
+.EX
+struct uffdio_continue {
+ struct uffdio_range range;
+ /* Range to install PTEs for and continue */
+ __u64 mode; /* Flags controlling the behavior of continue */
+ __s64 mapped; /* Number of bytes mapped, or negated error */
+};
+.EE
+.in
+.PP
+The following value may be bitwise ORed in
+.I mode
+to change the behavior of the
+.B UFFDIO_CONTINUE
+operation:
+.TP
+.B UFFDIO_CONTINUE_MODE_DONTWAKE
+Do not wake up the thread that waits for page-fault resolution.
+.PP
+The
+.I mapped
+field is used by the kernel
+to return the number of bytes that were actually mapped,
+or an error in the same manner as
+.BR UFFDIO_COPY .
+If the value returned in the
+.I mapped
+field doesn't match the value that was specified in
+.IR range.len ,
+the operation fails with the error
+.BR EAGAIN .
+The
+.I mapped
+field is output-only;
+it is not read by the
+.B UFFDIO_CONTINUE
+operation.
+.PP
+This
+.BR ioctl (2)
+operation returns 0 on success.
+In this case,
+the entire area was mapped.
+On error, \-1 is returned and
+.I errno
+is set to indicate the error.
+Possible errors include:
+.TP
+.B EAGAIN
+The number of bytes mapped
+(i.e., the value returned in the
+.I mapped
+field)
+does not equal the value that was specified in the
+.I range.len
+field.
+.TP
+.B EINVAL
+Either
+.I range.start
+or
+.I range.len
+was not a multiple of the system page size; or
+.I range.len
+was zero; or the range specified was invalid.
+.TP
+.B EINVAL
+An invalid bit was specified in the
+.I mode
+field.
+.TP
+.B EEXIST
+One or more pages were already mapped in the given range.
+.TP
+.B ENOENT
+The faulting process has changed its virtual memory layout simultaneously with
+an outstanding
+.B UFFDIO_CONTINUE
+operation.
+.TP
+.B ENOMEM
+Allocating memory needed to set up the page table mappings failed.
+.TP
+.B EFAULT
+No existing page could be found in the page cache for the given range.
+.TP
+.B ESRCH
+The faulting process has exited at the time of a
+.B UFFDIO_CONTINUE
+operation.
+.\"
+.SH RETURN VALUE
+See descriptions of the individual operations, above.
+.SH ERRORS
+See descriptions of the individual operations, above.
+In addition, the following general errors can occur for all of the
+operations described above:
+.TP
+.B EFAULT
+.I argp
+does not point to a valid memory address.
+.TP
+.B EINVAL
+(For all operations except
+.BR UFFDIO_API .)
+The userfaultfd object has not yet been enabled (via the
+.B UFFDIO_API
+operation).
+.SH STANDARDS
+Linux.
+.SH BUGS
+In order to detect available userfault features and
+enable some subset of those features
+the userfaultfd file descriptor must be closed after the first
+.B UFFDIO_API
+operation that queries feature availability and reopened before
+the second
+.B UFFDIO_API
+operation that actually enables the desired features.
+.SH EXAMPLES
+See
+.BR userfaultfd (2).
+.SH SEE ALSO
+.BR ioctl (2),
+.BR mmap (2),
+.BR userfaultfd (2)
+.PP
+.I Documentation/admin\-guide/mm/userfaultfd.rst
+in the Linux kernel source tree
diff --git a/man2/ioperm.2 b/man2/ioperm.2
new file mode 100644
index 0000000..dcbad6c
--- /dev/null
+++ b/man2/ioperm.2
@@ -0,0 +1,105 @@
+.\" Copyright (c) 1993 Michael Haardt
+.\" (michael@moria.de)
+.\" Fri Apr 2 11:32:09 MET DST 1993
+.\"
+.\" SPDX-License-Identifier: GPL-2.0-or-later
+.\"
+.\" Modified Sat Jul 24 15:12:05 1993 by Rik Faith <faith@cs.unc.edu>
+.\" Modified Tue Aug 1 16:27 1995 by Jochen Karrer
+.\" <cip307@cip.physik.uni-wuerzburg.de>
+.\" Modified Tue Oct 22 08:11:14 EDT 1996 by Eric S. Raymond <esr@thyrsus.com>
+.\" Modified Mon Feb 15 17:28:41 CET 1999 by Andries E. Brouwer <aeb@cwi.nl>
+.\" Modified, 27 May 2004, Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Added notes on capability requirements
+.\"
+.TH ioperm 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+ioperm \- set port input/output permissions
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/io.h>
+.PP
+.BI "int ioperm(unsigned long " from ", unsigned long " num ", int " turn_on );
+.fi
+.SH DESCRIPTION
+.BR ioperm ()
+sets the port access permission bits for the calling thread for
+.I num
+bits starting from port address
+.IR from .
+If
+.I turn_on
+is nonzero, then permission for the specified bits is enabled;
+otherwise it is disabled.
+If
+.I turn_on
+is nonzero, the calling thread must be privileged
+.RB ( CAP_SYS_RAWIO ).
+.PP
+Before Linux 2.6.8,
+only the first 0x3ff I/O ports could be specified in this manner.
+For more ports, the
+.BR iopl (2)
+system call had to be used (with a
+.I level
+argument of 3).
+Since Linux 2.6.8, 65,536 I/O ports can be specified.
+.PP
+Permissions are inherited by the child created by
+.BR fork (2)
+(but see NOTES).
+Permissions are preserved across
+.BR execve (2);
+this is useful for giving port access permissions to unprivileged
+programs.
+.PP
+This call is mostly for the i386 architecture.
+On many other architectures it does not exist or will always
+return an error.
+.SH RETURN VALUE
+On success, zero is returned.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EINVAL
+Invalid values for
+.I from
+or
+.IR num .
+.TP
+.B EIO
+(on PowerPC) This call is not supported.
+.TP
+.B ENOMEM
+.\" Could not allocate I/O bitmap.
+Out of memory.
+.TP
+.B EPERM
+The calling thread has insufficient privilege.
+.SH VERSIONS
+glibc has an
+.BR ioperm ()
+prototype both in
+.I <sys/io.h>
+and in
+.IR <sys/perm.h> .
+Avoid the latter; it is available on i386 only.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Before Linux 2.4,
+permissions were not inherited by a child created by
+.BR fork (2).
+.SH NOTES
+The
+.I /proc/ioports
+file shows the I/O ports that are currently allocated on the system.
+.SH SEE ALSO
+.BR iopl (2),
+.BR outb (2),
+.BR capabilities (7)
diff --git a/man2/iopl.2 b/man2/iopl.2
new file mode 100644
index 0000000..239d206
--- /dev/null
+++ b/man2/iopl.2
@@ -0,0 +1,92 @@
+.\" Copyright 1993 Rickard E. Faith (faith@cs.unc.edu)
+.\" Portions extracted from linux/kernel/ioport.c (no copyright notice).
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified Tue Aug 1 16:47 1995 by Jochen Karrer
+.\" <cip307@cip.physik.uni-wuerzburg.de>
+.\" Modified Tue Oct 22 08:11:14 EDT 1996 by Eric S. Raymond <esr@thyrsus.com>
+.\" Modified Fri Nov 27 14:50:36 CET 1998 by Andries Brouwer <aeb@cwi.nl>
+.\" Modified, 27 May 2004, Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Added notes on capability requirements
+.\"
+.TH iopl 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+iopl \- change I/O privilege level
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/io.h>
+.PP
+.BI "[[deprecated]] int iopl(int " level );
+.fi
+.SH DESCRIPTION
+.BR iopl ()
+changes the I/O privilege level of the calling thread,
+as specified by the two least significant bits in
+.IR level .
+.PP
+The I/O privilege level for a normal thread is 0.
+Permissions are inherited from parents to children.
+.PP
+This call is deprecated, is significantly slower than
+.BR ioperm (2),
+and is only provided for older X servers which require
+access to all 65536 I/O ports.
+It is mostly for the i386 architecture.
+On many other architectures it does not exist or will always
+return an error.
+.SH RETURN VALUE
+On success, zero is returned.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EINVAL
+.I level
+is greater than 3.
+.TP
+.B ENOSYS
+This call is unimplemented.
+.TP
+.B EPERM
+The calling thread has insufficient privilege to call
+.BR iopl ();
+the
+.B CAP_SYS_RAWIO
+capability is required to raise the I/O privilege level
+above its current value.
+.SH VERSIONS
+.\" Libc5 treats it as a system call and has a prototype in
+.\" .IR <unistd.h> .
+.\" glibc1 does not have a prototype.
+glibc2 has a prototype both in
+.I <sys/io.h>
+and in
+.IR <sys/perm.h> .
+Avoid the latter; it is available on i386 only.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Prior to Linux 5.5
+.BR iopl ()
+allowed the thread to disable interrupts while running
+at a higher I/O privilege level.
+This will probably crash the system, and is not recommended.
+.PP
+Prior to Linux 3.7,
+on some architectures (such as i386), permissions
+.I were
+inherited by the child produced by
+.BR fork (2)
+and were preserved across
+.BR execve (2).
+This behavior was inadvertently changed in Linux 3.7,
+and won't be reinstated.
+.SH SEE ALSO
+.BR ioperm (2),
+.BR outb (2),
+.BR capabilities (7)
diff --git a/man2/ioprio_get.2 b/man2/ioprio_get.2
new file mode 100644
index 0000000..d6d5b3b
--- /dev/null
+++ b/man2/ioprio_get.2
@@ -0,0 +1 @@
+.so man2/ioprio_set.2
diff --git a/man2/ioprio_set.2 b/man2/ioprio_set.2
new file mode 100644
index 0000000..7770cbc
--- /dev/null
+++ b/man2/ioprio_set.2
@@ -0,0 +1,362 @@
+.\" Copyright (c) International Business Machines orp., 2006
+.\"
+.\" SPDX-License-Identifier: GPL-2.0-or-later
+.\"
+.\" HISTORY:
+.\" 2006-04-27, created by Eduardo M. Fleury <efleury@br.ibm.com>
+.\" with various additions by Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\"
+.TH ioprio_set 2 2023-04-03 "Linux man-pages 6.05.01"
+.SH NAME
+ioprio_get, ioprio_set \- get/set I/O scheduling class and priority
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#include <linux/ioprio.h> " "/* Definition of " IOPRIO_* " constants */"
+.BR "#include <sys/syscall.h> " "/* Definition of " SYS_* " constants */"
+.B #include <unistd.h>
+.PP
+.BI "int syscall(SYS_ioprio_get, int " which ", int " who );
+.BI "int syscall(SYS_ioprio_set, int " which ", int " who ", int " ioprio );
+.fi
+.PP
+.IR Note :
+glibc provides no wrappers for these system calls,
+necessitating the use of
+.BR syscall (2).
+.SH DESCRIPTION
+The
+.BR ioprio_get ()
+and
+.BR ioprio_set ()
+system calls get and set the I/O scheduling class and
+priority of one or more threads.
+.PP
+The
+.I which
+and
+.I who
+arguments identify the thread(s) on which the system
+calls operate.
+The
+.I which
+argument determines how
+.I who
+is interpreted, and has one of the following values:
+.TP
+.B IOPRIO_WHO_PROCESS
+.I who
+is a process ID or thread ID identifying a single process or thread.
+If
+.I who
+is 0, then operate on the calling thread.
+.TP
+.B IOPRIO_WHO_PGRP
+.I who
+is a process group ID identifying all the members of a process group.
+If
+.I who
+is 0, then operate on the process group of which the caller is a member.
+.TP
+.B IOPRIO_WHO_USER
+.I who
+is a user ID identifying all of the processes that
+have a matching real UID.
+.\" FIXME . Need to document the behavior when 'who" is specified as 0
+.\" See http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=652443
+.PP
+If
+.I which
+is specified as
+.B IOPRIO_WHO_PGRP
+or
+.B IOPRIO_WHO_USER
+when calling
+.BR ioprio_get (),
+and more than one process matches
+.IR who ,
+then the returned priority will be the highest one found among
+all of the matching processes.
+One priority is said to be
+higher than another one if it belongs to a higher priority
+class
+.RB ( IOPRIO_CLASS_RT
+is the highest priority class;
+.B IOPRIO_CLASS_IDLE
+is the lowest)
+or if it belongs to the same priority class as the other process but
+has a higher priority level (a lower priority number means a
+higher priority level).
+.PP
+The
+.I ioprio
+argument given to
+.BR ioprio_set ()
+is a bit mask that specifies both the scheduling class and the
+priority to be assigned to the target process(es).
+The following macros are used for assembling and dissecting
+.I ioprio
+values:
+.TP
+.BI IOPRIO_PRIO_VALUE( class ", " data )
+Given a scheduling
+.I class
+and priority
+.RI ( data ),
+this macro combines the two values to produce an
+.I ioprio
+value, which is returned as the result of the macro.
+.TP
+.BI IOPRIO_PRIO_CLASS( mask )
+Given
+.I mask
+(an
+.I ioprio
+value), this macro returns its I/O class component, that is,
+one of the values
+.BR IOPRIO_CLASS_RT ,
+.BR IOPRIO_CLASS_BE ,
+or
+.BR IOPRIO_CLASS_IDLE .
+.TP
+.BI IOPRIO_PRIO_DATA( mask )
+Given
+.I mask
+(an
+.I ioprio
+value), this macro returns its priority
+.RI ( data )
+component.
+.PP
+See the NOTES section for more
+information on scheduling classes and priorities,
+as well as the meaning of specifying
+.I ioprio
+as 0.
+.PP
+I/O priorities are supported for reads and for synchronous
+.RB ( O_DIRECT ,
+.BR O_SYNC )
+writes.
+I/O priorities are not supported for asynchronous
+writes because they are issued outside the context of the program
+dirtying the memory, and thus program-specific priorities do not apply.
+.SH RETURN VALUE
+On success,
+.BR ioprio_get ()
+returns the
+.I ioprio
+value of the process with highest I/O priority of any of the processes
+that match the criteria specified in
+.I which
+and
+.IR who .
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.PP
+On success,
+.BR ioprio_set ()
+returns 0.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EINVAL
+Invalid value for
+.I which
+or
+.IR ioprio .
+Refer to the NOTES section for available scheduler
+classes and priority levels for
+.IR ioprio .
+.TP
+.B EPERM
+The calling process does not have the privilege needed to assign this
+.I ioprio
+to the specified process(es).
+See the NOTES section for more information on required
+privileges for
+.BR ioprio_set ().
+.TP
+.B ESRCH
+No process(es) could be found that matched the specification in
+.I which
+and
+.IR who .
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 2.6.13.
+.SH NOTES
+Two or more processes or threads can share an I/O context.
+This will be the case when
+.BR clone (2)
+was called with the
+.B CLONE_IO
+flag.
+However, by default, the distinct threads of a process will
+.B not
+share the same I/O context.
+This means that if you want to change the I/O
+priority of all threads in a process, you may need to call
+.BR ioprio_set ()
+on each of the threads.
+The thread ID that you would need for this operation
+is the one that is returned by
+.BR gettid (2)
+or
+.BR clone (2).
+.PP
+These system calls have an effect only when used
+in conjunction with an I/O scheduler that supports I/O priorities.
+As of Linux 2.6.17, the only such scheduler is the Completely Fair Queuing
+(CFQ) I/O scheduler.
+.PP
+If no I/O priority has been set for a thread,
+then by default the I/O priority will follow the CPU nice value
+.RB ( setpriority (2)).
+Before Linux 2.6.24,
+once an I/O priority had been set using
+.BR ioprio_set (),
+there was no way to reset the I/O scheduling behavior to the default.
+Since Linux 2.6.24,
+.\" commit 8ec680e4c3ec818efd1652f15199ed1c216ab550
+specifying
+.I ioprio
+as 0 can be used to reset to the default I/O scheduling behavior.
+.SS Selecting an I/O scheduler
+I/O schedulers are selected on a per-device basis via the special
+file
+.IR /sys/block/ device /queue/scheduler .
+.PP
+One can view the current I/O scheduler via the
+.I /sys
+filesystem.
+For example, the following command
+displays a list of all schedulers currently loaded in the kernel:
+.PP
+.in +4n
+.EX
+.RB "$" " cat /sys/block/sda/queue/scheduler"
+noop anticipatory deadline [cfq]
+.EE
+.in
+.PP
+The scheduler surrounded by brackets is the one actually
+in use for the device
+.RI ( sda
+in the example).
+Setting another scheduler is done by writing the name of the
+new scheduler to this file.
+For example, the following command will set the
+scheduler for the
+.I sda
+device to
+.IR cfq :
+.PP
+.in +4n
+.EX
+.RB "$" " su"
+Password:
+.RB "#" " echo cfq > /sys/block/sda/queue/scheduler"
+.EE
+.in
+.\"
+.SS The Completely Fair Queuing (CFQ) I/O scheduler
+Since version 3 (also known as CFQ Time Sliced), CFQ implements
+I/O nice levels similar to those
+of CPU scheduling.
+These nice levels are grouped into three scheduling classes,
+each one containing one or more priority levels:
+.TP
+.BR IOPRIO_CLASS_RT " (1)"
+This is the real-time I/O class.
+This scheduling class is given
+higher priority than any other class:
+processes from this class are
+given first access to the disk every time.
+Thus, this I/O class needs to be used with some
+care: one I/O real-time process can starve the entire system.
+Within the real-time class,
+there are 8 levels of class data (priority) that determine exactly
+how much time this process needs the disk for on each service.
+The highest real-time priority level is 0; the lowest is 7.
+In the future, this might change to be more directly mappable to
+performance, by passing in a desired data rate instead.
+.TP
+.BR IOPRIO_CLASS_BE " (2)"
+This is the best-effort scheduling class,
+which is the default for any process
+that hasn't set a specific I/O priority.
+The class data (priority) determines how much
+I/O bandwidth the process will get.
+Best-effort priority levels are analogous to CPU nice values
+(see
+.BR getpriority (2)).
+The priority level determines a priority relative
+to other processes in the best-effort scheduling class.
+Priority levels range from 0 (highest) to 7 (lowest).
+.TP
+.BR IOPRIO_CLASS_IDLE " (3)"
+This is the idle scheduling class.
+Processes running at this level get I/O
+time only when no one else needs the disk.
+The idle class has no class data.
+Attention is required when assigning this priority class to a process,
+since it may become starved if higher priority processes are
+constantly accessing the disk.
+.PP
+Refer to the kernel source file
+.I Documentation/block/ioprio.txt
+for more information on the CFQ I/O Scheduler and an example program.
+.SS Required permissions to set I/O priorities
+Permission to change a process's priority is granted or denied based
+on two criteria:
+.TP
+.B "Process ownership"
+An unprivileged process may set the I/O priority only for a process
+whose real UID
+matches the real or effective UID of the calling process.
+A process which has the
+.B CAP_SYS_NICE
+capability can change the priority of any process.
+.TP
+.B "What is the desired priority"
+Attempts to set very high priorities
+.RB ( IOPRIO_CLASS_RT )
+require the
+.B CAP_SYS_ADMIN
+capability.
+Up to Linux 2.6.24,
+.B CAP_SYS_ADMIN
+was also required to set a very low priority
+.RB ( IOPRIO_CLASS_IDLE ),
+but since Linux 2.6.25, this is no longer required.
+.PP
+A call to
+.BR ioprio_set ()
+must follow both rules, or the call will fail with the error
+.BR EPERM .
+.SH BUGS
+.\" 6 May 07: Bug report raised:
+.\" https://www.sourceware.org/bugzilla/show_bug.cgi?id=4464
+.\" Ulrich Drepper replied that he wasn't going to add these
+.\" to glibc.
+glibc does not yet provide a suitable header file defining
+the function prototypes and macros described on this page.
+Suitable definitions can be found in
+.IR linux/ioprio.h .
+.SH SEE ALSO
+.BR ionice (1),
+.BR getpriority (2),
+.BR open (2),
+.BR capabilities (7),
+.BR cgroups (7)
+.PP
+.I Documentation/block/ioprio.txt
+in the Linux kernel source tree
diff --git a/man2/ipc.2 b/man2/ipc.2
new file mode 100644
index 0000000..0b8a911
--- /dev/null
+++ b/man2/ipc.2
@@ -0,0 +1,63 @@
+.\" Copyright (c) 1995 Michael Chastain (mec@shell.portal.com), 15 April 1995.
+.\"
+.\" SPDX-License-Identifier: GPL-2.0-or-later
+.\"
+.\" Modified Tue Oct 22 08:11:14 EDT 1996 by Eric S. Raymond <esr@thyrsus.com>
+.TH ipc 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+ipc \- System V IPC system calls
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#include <linux/ipc.h>" " /* Definition of needed constants */"
+.BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */"
+.B #include <unistd.h>
+.PP
+.BI "int syscall(SYS_ipc, unsigned int " call ", int " first ,
+.BI " unsigned long " second ", unsigned long " third \
+", void *" ptr ,
+.BI " long " fifth );
+.fi
+.PP
+.IR Note :
+glibc provides no wrapper for
+.BR ipc (),
+necessitating the use of
+.BR syscall (2).
+.SH DESCRIPTION
+.BR ipc ()
+is a common kernel entry point for the System\ V IPC calls
+for messages, semaphores, and shared memory.
+.I call
+determines which IPC function to invoke;
+the other arguments are passed through to the appropriate call.
+.PP
+User-space programs should call the appropriate functions by their usual names.
+Only standard library implementors and kernel hackers need to know about
+.BR ipc ().
+.SH VERSIONS
+On some architectures\[em]for example x86-64 and ARM\[em]there is no
+.BR ipc ()
+system call; instead,
+.BR msgctl (2),
+.BR semctl (2),
+.BR shmctl (2),
+and so on really are implemented as separate system calls.
+.SH STANDARDS
+Linux.
+.SH SEE ALSO
+.BR msgctl (2),
+.BR msgget (2),
+.BR msgrcv (2),
+.BR msgsnd (2),
+.BR semctl (2),
+.BR semget (2),
+.BR semop (2),
+.BR semtimedop (2),
+.BR shmat (2),
+.BR shmctl (2),
+.BR shmdt (2),
+.BR shmget (2),
+.BR sysvipc (7)
diff --git a/man2/isastream.2 b/man2/isastream.2
new file mode 100644
index 0000000..5d25ea6
--- /dev/null
+++ b/man2/isastream.2
@@ -0,0 +1 @@
+.so man2/unimplemented.2
diff --git a/man2/kcmp.2 b/man2/kcmp.2
new file mode 100644
index 0000000..98a29f1
--- /dev/null
+++ b/man2/kcmp.2
@@ -0,0 +1,420 @@
+.\" Copyright (C) 2012, Cyrill Gorcunov <gorcunov@openvz.org>
+.\" and Copyright (C) 2012, 2016, Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Kernel commit d97b46a64674a267bc41c9e16132ee2a98c3347d
+.\"
+.TH kcmp 2 2023-05-03 "Linux man-pages 6.05.01"
+.SH NAME
+kcmp \- compare two processes to determine if they share a kernel resource
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#include <linux/kcmp.h>" " /* Definition of " KCMP_* " constants */"
+.BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */"
+.B #include <unistd.h>
+.PP
+.BI "int syscall(SYS_kcmp, pid_t " pid1 ", pid_t " pid2 ", int " type ,
+.BI " unsigned long " idx1 ", unsigned long " idx2 );
+.fi
+.PP
+.IR Note :
+glibc provides no wrapper for
+.BR kcmp (),
+necessitating the use of
+.BR syscall (2).
+.SH DESCRIPTION
+The
+.BR kcmp ()
+system call can be used to check whether the two processes identified by
+.I pid1
+and
+.I pid2
+share a kernel resource such as virtual memory, file descriptors,
+and so on.
+.PP
+Permission to employ
+.BR kcmp ()
+is governed by ptrace access mode
+.B PTRACE_MODE_READ_REALCREDS
+checks against both
+.I pid1
+and
+.IR pid2 ;
+see
+.BR ptrace (2).
+.PP
+The
+.I type
+argument specifies which resource is to be compared in the two processes.
+It has one of the following values:
+.TP
+.B KCMP_FILE
+Check whether a file descriptor
+.I idx1
+in the process
+.I pid1
+refers to the same open file description (see
+.BR open (2))
+as file descriptor
+.I idx2
+in the process
+.IR pid2 .
+The existence of two file descriptors that refer to the same
+open file description can occur as a result of
+.BR dup (2)
+(and similar),
+.BR fork (2),
+or passing file descriptors via a domain socket (see
+.BR unix (7)).
+.TP
+.B KCMP_FILES
+Check whether the processes share the same set of open file descriptors.
+The arguments
+.I idx1
+and
+.I idx2
+are ignored.
+See the discussion of the
+.B CLONE_FILES
+flag in
+.BR clone (2).
+.TP
+.B KCMP_FS
+Check whether the processes share the same filesystem information
+(i.e., file mode creation mask, working directory, and filesystem root).
+The arguments
+.I idx1
+and
+.I idx2
+are ignored.
+See the discussion of the
+.B CLONE_FS
+flag in
+.BR clone (2).
+.TP
+.B KCMP_IO
+Check whether the processes share I/O context.
+The arguments
+.I idx1
+and
+.I idx2
+are ignored.
+See the discussion of the
+.B CLONE_IO
+flag in
+.BR clone (2).
+.TP
+.B KCMP_SIGHAND
+Check whether the processes share the same table of signal dispositions.
+The arguments
+.I idx1
+and
+.I idx2
+are ignored.
+See the discussion of the
+.B CLONE_SIGHAND
+flag in
+.BR clone (2).
+.TP
+.B KCMP_SYSVSEM
+Check whether the processes share the same
+list of System\ V semaphore undo operations.
+The arguments
+.I idx1
+and
+.I idx2
+are ignored.
+See the discussion of the
+.B CLONE_SYSVSEM
+flag in
+.BR clone (2).
+.TP
+.B KCMP_VM
+Check whether the processes share the same address space.
+The arguments
+.I idx1
+and
+.I idx2
+are ignored.
+See the discussion of the
+.B CLONE_VM
+flag in
+.BR clone (2).
+.TP
+.BR KCMP_EPOLL_TFD " (since Linux 4.13)"
+.\" commit 0791e3644e5ef21646fe565b9061788d05ec71d4
+Check whether the file descriptor
+.I idx1
+of the process
+.I pid1
+is present in the
+.BR epoll (7)
+instance described by
+.I idx2
+of the process
+.IR pid2 .
+The argument
+.I idx2
+is a pointer to a structure where the target file is described.
+This structure has the form:
+.PP
+.in +4n
+.EX
+struct kcmp_epoll_slot {
+ __u32 efd;
+ __u32 tfd;
+ __u64 toff;
+};
+.EE
+.in
+.PP
+Within this structure,
+.I efd
+is an epoll file descriptor returned from
+.BR epoll_create (2),
+.I tfd
+is a target file descriptor number, and
+.I toff
+is a target file offset counted from zero.
+Several different targets may be registered with
+the same file descriptor number and setting a specific
+offset helps to investigate each of them.
+.PP
+Note that
+.BR kcmp ()
+is not protected against false positives which may occur if
+the processes are currently running.
+One should stop the processes by sending
+.B SIGSTOP
+(see
+.BR signal (7))
+prior to inspection with this system call to obtain meaningful results.
+.SH RETURN VALUE
+The return value of a successful call to
+.BR kcmp ()
+is simply the result of arithmetic comparison
+of kernel pointers (when the kernel compares resources, it uses their
+memory addresses).
+.PP
+The easiest way to explain is to consider an example.
+Suppose that
+.I v1
+and
+.I v2
+are the addresses of appropriate resources, then the return value
+is one of the following:
+.RS
+.TP
+.B 0
+.I v1
+is equal to
+.IR v2 ;
+in other words, the two processes share the resource.
+.TP
+.B 1
+.I v1
+is less than
+.IR v2 .
+.TP
+.B 2
+.I v1
+is greater than
+.IR v2 .
+.TP
+.B 3
+.I v1
+is not equal to
+.IR v2 ,
+but ordering information is unavailable.
+.RE
+.PP
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.PP
+.BR kcmp ()
+was designed to return values suitable for sorting.
+This is particularly handy if one needs to compare
+a large number of file descriptors.
+.SH ERRORS
+.TP
+.B EBADF
+.I type
+is
+.B KCMP_FILE
+and
+.I idx1
+or
+.I idx2
+is not an open file descriptor.
+.TP
+.B EFAULT
+The epoll slot addressed by
+.I idx2
+is outside of the user's address space.
+.TP
+.B EINVAL
+.I type
+is invalid.
+.TP
+.B ENOENT
+The target file is not present in the
+.BR epoll (7)
+instance.
+.TP
+.B EPERM
+Insufficient permission to inspect process resources.
+The
+.B CAP_SYS_PTRACE
+capability is required to inspect processes that you do not own.
+Other ptrace limitations may also apply, such as
+.BR CONFIG_SECURITY_YAMA ,
+which, when
+.I /proc/sys/kernel/yama/ptrace_scope
+is 2, limits
+.BR kcmp ()
+to child processes;
+see
+.BR ptrace (2).
+.TP
+.B ESRCH
+Process
+.I pid1
+or
+.I pid2
+does not exist.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 3.5.
+.PP
+Before Linux 5.12,
+this system call was available only if the kernel was configured with
+.BR CONFIG_CHECKPOINT_RESTORE ,
+since the original purpose of the system call was for the
+checkpoint/restore in user space (CRIU) feature.
+(The alternative to this system call would have been to expose suitable
+process information via the
+.BR proc (5)
+filesystem; this was deemed to be unsuitable for security reasons.)
+Since Linux 5.12,
+this system call is also available if the kernel is configured with
+.BR CONFIG_KCMP .
+.SH NOTES
+See
+.BR clone (2)
+for some background information on the shared resources
+referred to on this page.
+.SH EXAMPLES
+The program below uses
+.BR kcmp ()
+to test whether pairs of file descriptors refer to
+the same open file description.
+The program tests different cases for the file descriptor pairs,
+as described in the program output.
+An example run of the program is as follows:
+.PP
+.in +4n
+.EX
+$ \fB./a.out\fP
+Parent PID is 1144
+Parent opened file on FD 3
+\&
+PID of child of fork() is 1145
+ Compare duplicate FDs from different processes:
+ kcmp(1145, 1144, KCMP_FILE, 3, 3) ==> same
+Child opened file on FD 4
+ Compare FDs from distinct open()s in same process:
+ kcmp(1145, 1145, KCMP_FILE, 3, 4) ==> different
+Child duplicated FD 3 to create FD 5
+ Compare duplicated FDs in same process:
+ kcmp(1145, 1145, KCMP_FILE, 3, 5) ==> same
+.EE
+.in
+.SS Program source
+\&
+.\" SRC BEGIN (kcmp.c)
+.EX
+#define _GNU_SOURCE
+#include <err.h>
+#include <fcntl.h>
+#include <linux/kcmp.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/syscall.h>
+#include <sys/wait.h>
+#include <unistd.h>
+\&
+static int
+kcmp(pid_t pid1, pid_t pid2, int type,
+ unsigned long idx1, unsigned long idx2)
+{
+ return syscall(SYS_kcmp, pid1, pid2, type, idx1, idx2);
+}
+\&
+static void
+test_kcmp(char *msg, pid_t pid1, pid_t pid2, int fd_a, int fd_b)
+{
+ printf("\et%s\en", msg);
+ printf("\et\etkcmp(%jd, %jd, KCMP_FILE, %d, %d) ==> %s\en",
+ (intmax_t) pid1, (intmax_t) pid2, fd_a, fd_b,
+ (kcmp(pid1, pid2, KCMP_FILE, fd_a, fd_b) == 0) ?
+ "same" : "different");
+}
+\&
+int
+main(void)
+{
+ int fd1, fd2, fd3;
+ static const char pathname[] = "/tmp/kcmp.test";
+\&
+ fd1 = open(pathname, O_CREAT | O_RDWR, 0600);
+ if (fd1 == \-1)
+ err(EXIT_FAILURE, "open");
+\&
+ printf("Parent PID is %jd\en", (intmax_t) getpid());
+ printf("Parent opened file on FD %d\en\en", fd1);
+\&
+ switch (fork()) {
+ case \-1:
+ err(EXIT_FAILURE, "fork");
+\&
+ case 0:
+ printf("PID of child of fork() is %jd\en", (intmax_t) getpid());
+\&
+ test_kcmp("Compare duplicate FDs from different processes:",
+ getpid(), getppid(), fd1, fd1);
+\&
+ fd2 = open(pathname, O_CREAT | O_RDWR, 0600);
+ if (fd2 == \-1)
+ err(EXIT_FAILURE, "open");
+ printf("Child opened file on FD %d\en", fd2);
+\&
+ test_kcmp("Compare FDs from distinct open()s in same process:",
+ getpid(), getpid(), fd1, fd2);
+\&
+ fd3 = dup(fd1);
+ if (fd3 == \-1)
+ err(EXIT_FAILURE, "dup");
+ printf("Child duplicated FD %d to create FD %d\en", fd1, fd3);
+\&
+ test_kcmp("Compare duplicated FDs in same process:",
+ getpid(), getpid(), fd1, fd3);
+ break;
+\&
+ default:
+ wait(NULL);
+ }
+\&
+ exit(EXIT_SUCCESS);
+}
+.EE
+.\" SRC END
+.SH SEE ALSO
+.BR clone (2),
+.BR unshare (2)
diff --git a/man2/kexec_file_load.2 b/man2/kexec_file_load.2
new file mode 100644
index 0000000..6c20331
--- /dev/null
+++ b/man2/kexec_file_load.2
@@ -0,0 +1 @@
+.so man2/kexec_load.2
diff --git a/man2/kexec_load.2 b/man2/kexec_load.2
new file mode 100644
index 0000000..604fa1c
--- /dev/null
+++ b/man2/kexec_load.2
@@ -0,0 +1,331 @@
+.\" Copyright (C) 2010 Intel Corporation, Author: Andi Kleen
+.\" and Copyright 2014, Vivek Goyal <vgoyal@redhat.com>
+.\" and Copyright (c) 2015, Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.TH kexec_load 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+kexec_load, kexec_file_load \- load a new kernel for later execution
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#include <linux/kexec.h>" " /* Definition of " KEXEC_* " constants */"
+.BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */"
+.B #include <unistd.h>
+.PP
+.BI "long syscall(SYS_kexec_load, unsigned long " entry ,
+.BI " unsigned long " nr_segments \
+", struct kexec_segment *" segments ,
+.BI " unsigned long " flags );
+.BI "long syscall(SYS_kexec_file_load, int " kernel_fd ", int " initrd_fd ,
+.BI " unsigned long " cmdline_len ", const char *" cmdline ,
+.BI " unsigned long " flags );
+.fi
+.PP
+.IR Note :
+glibc provides no wrappers for these system calls,
+necessitating the use of
+.BR syscall (2).
+.SH DESCRIPTION
+The
+.BR kexec_load ()
+system call loads a new kernel that can be executed later by
+.BR reboot (2).
+.PP
+The
+.I flags
+argument is a bit mask that controls the operation of the call.
+The following values can be specified in
+.IR flags :
+.TP
+.BR KEXEC_ON_CRASH " (since Linux 2.6.13)"
+Execute the new kernel automatically on a system crash.
+This "crash kernel" is loaded into an area of reserved memory that
+is determined at boot time using the
+.I crashkernel
+kernel command-line parameter.
+The location of this reserved memory is exported to user space via the
+.I /proc/iomem
+file, in an entry labeled "Crash kernel".
+A user-space application can parse this file and prepare a list of
+segments (see below) that specify this reserved memory as destination.
+If this flag is specified, the kernel checks that the
+target segments specified in
+.I segments
+fall within the reserved region.
+.TP
+.BR KEXEC_PRESERVE_CONTEXT " (since Linux 2.6.27)"
+Preserve the system hardware and
+software states before executing the new kernel.
+This could be used for system suspend.
+This flag is available only if the kernel was configured with
+.BR CONFIG_KEXEC_JUMP ,
+and is effective only if
+.I nr_segments
+is greater than 0.
+.PP
+The high-order bits (corresponding to the mask 0xffff0000) of
+.I flags
+contain the architecture of the to-be-executed kernel.
+Specify (OR) the constant
+.B KEXEC_ARCH_DEFAULT
+to use the current architecture,
+or one of the following architecture constants
+.BR KEXEC_ARCH_386 ,
+.BR KEXEC_ARCH_68K ,
+.BR KEXEC_ARCH_X86_64 ,
+.BR KEXEC_ARCH_PPC ,
+.BR KEXEC_ARCH_PPC64 ,
+.BR KEXEC_ARCH_IA_64 ,
+.BR KEXEC_ARCH_ARM ,
+.BR KEXEC_ARCH_S390 ,
+.BR KEXEC_ARCH_SH ,
+.BR KEXEC_ARCH_MIPS ,
+and
+.BR KEXEC_ARCH_MIPS_LE .
+The architecture must be executable on the CPU of the system.
+.PP
+The
+.I entry
+argument is the physical entry address in the kernel image.
+The
+.I nr_segments
+argument is the number of segments pointed to by the
+.I segments
+pointer;
+the kernel imposes an (arbitrary) limit of 16 on the number of segments.
+The
+.I segments
+argument is an array of
+.I kexec_segment
+structures which define the kernel layout:
+.PP
+.in +4n
+.EX
+struct kexec_segment {
+ void *buf; /* Buffer in user space */
+ size_t bufsz; /* Buffer length in user space */
+ void *mem; /* Physical address of kernel */
+ size_t memsz; /* Physical address length */
+};
+.EE
+.in
+.PP
+The kernel image defined by
+.I segments
+is copied from the calling process into
+the kernel either in regular
+memory or in reserved memory (if
+.B KEXEC_ON_CRASH
+is set).
+The kernel first performs various sanity checks on the
+information passed in
+.IR segments .
+If these checks pass, the kernel copies the segment data to kernel memory.
+Each segment specified in
+.I segments
+is copied as follows:
+.IP \[bu] 3
+.I buf
+and
+.I bufsz
+identify a memory region in the caller's virtual address space
+that is the source of the copy.
+The value in
+.I bufsz
+may not exceed the value in the
+.I memsz
+field.
+.IP \[bu]
+.I mem
+and
+.I memsz
+specify a physical address range that is the target of the copy.
+The values specified in both fields must be multiples of
+the system page size.
+.IP \[bu]
+.I bufsz
+bytes are copied from the source buffer to the target kernel buffer.
+If
+.I bufsz
+is less than
+.IR memsz ,
+then the excess bytes in the kernel buffer are zeroed out.
+.PP
+In case of a normal kexec (i.e., the
+.B KEXEC_ON_CRASH
+flag is not set), the segment data is loaded in any available memory
+and is moved to the final destination at kexec reboot time (e.g., when the
+.BR kexec (8)
+command is executed with the
+.I \-e
+option).
+.PP
+In case of kexec on panic (i.e., the
+.B KEXEC_ON_CRASH
+flag is set), the segment data is
+loaded to reserved memory at the time of the call, and, after a crash,
+the kexec mechanism simply passes control to that kernel.
+.PP
+The
+.BR kexec_load ()
+system call is available only if the kernel was configured with
+.BR CONFIG_KEXEC .
+.SS kexec_file_load()
+The
+.BR kexec_file_load ()
+system call is similar to
+.BR kexec_load (),
+but it takes a different set of arguments.
+It reads the kernel to be loaded from the file referred to by
+the file descriptor
+.IR kernel_fd ,
+and the initrd (initial RAM disk)
+to be loaded from the file referred to by the file descriptor
+.IR initrd_fd .
+The
+.I cmdline
+argument is a pointer to a buffer containing the command line
+for the new kernel.
+The
+.I cmdline_len
+argument specifies the size of the buffer.
+The last byte in the buffer must be a null byte (\[aq]\e0\[aq]).
+.PP
+The
+.I flags
+argument is a bit mask which modifies the behavior of the call.
+The following values can be specified in
+.IR flags :
+.TP
+.B KEXEC_FILE_UNLOAD
+Unload the currently loaded kernel.
+.TP
+.B KEXEC_FILE_ON_CRASH
+Load the new kernel in the memory region reserved for the crash kernel
+(as for
+.BR KEXEC_ON_CRASH ).
+This kernel is booted if the currently running kernel crashes.
+.TP
+.B KEXEC_FILE_NO_INITRAMFS
+Loading initrd/initramfs is optional.
+Specify this flag if no initramfs is being loaded.
+If this flag is set, the value passed in
+.I initrd_fd
+is ignored.
+.PP
+The
+.BR kexec_file_load ()
+.\" See also http://lwn.net/Articles/603116/
+system call was added to provide support for systems
+where "kexec" loading should be restricted to
+only kernels that are signed.
+This system call is available only if the kernel was configured with
+.BR CONFIG_KEXEC_FILE .
+.SH RETURN VALUE
+On success, these system calls return 0.
+On error, \-1 is returned and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EADDRNOTAVAIL
+.\" See kernel/kexec.c::sanity_check_segment_list in the 3.19 kernel source
+The
+.B KEXEC_ON_CRASH
+flag was specified, but the region specified by the
+.I mem
+and
+.I memsz
+fields of one of the
+.I segments
+entries lies outside the range of memory reserved for the crash kernel.
+.TP
+.B EADDRNOTAVAIL
+The value in a
+.I mem
+or
+.I memsz
+field in one of the
+.I segments
+entries is not a multiple of the system page size.
+.TP
+.B EBADF
+.I kernel_fd
+or
+.I initrd_fd
+is not a valid file descriptor.
+.TP
+.B EBUSY
+Another crash kernel is already being loaded
+or a crash kernel is already in use.
+.TP
+.B EINVAL
+.I flags
+is invalid.
+.TP
+.B EINVAL
+The value of a
+.I bufsz
+field in one of the
+.I segments
+entries exceeds the value in the corresponding
+.I memsz
+field.
+.TP
+.B EINVAL
+.I nr_segments
+exceeds
+.B KEXEC_SEGMENT_MAX
+(16).
+.TP
+.B EINVAL
+Two or more of the kernel target buffers overlap.
+.TP
+.B EINVAL
+The value in
+.I cmdline[cmdline_len\-1]
+is not \[aq]\e0\[aq].
+.TP
+.B EINVAL
+The file referred to by
+.I kernel_fd
+or
+.I initrd_fd
+is empty (length zero).
+.TP
+.B ENOEXEC
+.I kernel_fd
+does not refer to an open file, or the kernel can't load this file.
+Currently, the file must be a bzImage and contain an x86 kernel that
+is loadable above 4\ GiB in memory (see the kernel source file
+.IR Documentation/x86/boot.txt ).
+.TP
+.B ENOMEM
+Could not allocate memory.
+.TP
+.B EPERM
+The caller does not have the
+.B CAP_SYS_BOOT
+capability.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+.TP
+.BR kexec_load ()
+Linux 2.6.13.
+.TP
+.BR kexec_file_load ()
+Linux 3.17.
+.SH SEE ALSO
+.BR reboot (2),
+.BR syscall (2),
+.BR kexec (8)
+.PP
+The kernel source files
+.I Documentation/kdump/kdump.txt
+and
+.I Documentation/admin\-guide/kernel\-parameters.txt
diff --git a/man2/keyctl.2 b/man2/keyctl.2
new file mode 100644
index 0000000..d7bd83d
--- /dev/null
+++ b/man2/keyctl.2
@@ -0,0 +1,2297 @@
+.\" Copyright (C) 2016 Michael Kerrisk <mtk.manpages@gmail.com>
+.\" and Copyright (C) 2016 Eugene Syromyatnikov <evgsyr@gmail.com>
+.\" A very few fragments remain from an earlier version of this page
+.\" written by David Howells (dhowells@redhat.com)
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.TH keyctl 2 2023-05-03 "Linux man-pages 6.05.01"
+.SH NAME
+keyctl \- manipulate the kernel's key management facility
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.PP
+Alternatively, Linux Key Management Utilities
+.RI ( libkeyutils ", " \-lkeyutils );
+see VERSIONS.
+.SH SYNOPSIS
+.nf
+.BR "#include <linux/keyctl.h>" " /* Definition of " KEY* " constants */"
+.BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */"
+.B #include <unistd.h>
+.PP
+.BI "long syscall(SYS_keyctl, int " operation ", unsigned long " arg2 ,
+.BI " unsigned long " arg3 ", unsigned long " arg4 ,
+.BI " unsigned long " arg5 );
+.fi
+.PP
+.IR Note :
+glibc provides no wrapper for
+.BR keyctl (),
+necessitating the use of
+.BR syscall (2).
+.SH DESCRIPTION
+.BR keyctl ()
+allows user-space programs to perform key manipulation.
+.PP
+The operation performed by
+.BR keyctl ()
+is determined by the value of the
+.I operation
+argument.
+Each of these operations is wrapped by the
+.I libkeyutils
+library (provided by the
+.I keyutils
+package) into individual functions (noted below)
+to permit the compiler to check types.
+.PP
+The permitted values for
+.I operation
+are:
+.TP
+.BR KEYCTL_GET_KEYRING_ID " (since Linux 2.6.10)"
+Map a special key ID to a real key ID for this process.
+.IP
+This operation looks up the special key whose ID is provided in
+.I arg2
+(cast to
+.IR key_serial_t ).
+If the special key is found,
+the ID of the corresponding real key is returned as the function result.
+The following values may be specified in
+.IR arg2 :
+.RS
+.TP
+.B KEY_SPEC_THREAD_KEYRING
+This specifies the calling thread's thread-specific keyring.
+See
+.BR thread\-keyring (7).
+.TP
+.B KEY_SPEC_PROCESS_KEYRING
+This specifies the caller's process-specific keyring.
+See
+.BR process\-keyring (7).
+.TP
+.B KEY_SPEC_SESSION_KEYRING
+This specifies the caller's session-specific keyring.
+See
+.BR session\-keyring (7).
+.TP
+.B KEY_SPEC_USER_KEYRING
+This specifies the caller's UID-specific keyring.
+See
+.BR user\-keyring (7).
+.TP
+.B KEY_SPEC_USER_SESSION_KEYRING
+This specifies the caller's UID-session keyring.
+See
+.BR user\-session\-keyring (7).
+.TP
+.BR KEY_SPEC_REQKEY_AUTH_KEY " (since Linux 2.6.16)"
+.\" commit b5f545c880a2a47947ba2118b2509644ab7a2969
+This specifies the authorization key created by
+.BR request_key (2)
+and passed to the process it spawns to generate a key.
+This key is available only in a
+.BR request\-key (8)-style
+program that was passed an authorization key by the kernel and
+ceases to be available once the requested key has been instantiated; see
+.BR request_key (2).
+.TP
+.BR KEY_SPEC_REQUESTOR_KEYRING " (since Linux 2.6.29)"
+.\" commit 8bbf4976b59fc9fc2861e79cab7beb3f6d647640
+This specifies the key ID for the
+.BR request_key (2)
+destination keyring.
+This keyring is available only in a
+.BR request\-key (8)-style
+program that was passed an authorization key by the kernel and
+ceases to be available once the requested key has been instantiated; see
+.BR request_key (2).
+.RE
+.IP
+The behavior if the key specified in
+.I arg2
+does not exist depends on the value of
+.I arg3
+(cast to
+.IR int ).
+If
+.I arg3
+contains a nonzero value, then\[em]if it is appropriate to do so
+(e.g., when looking up the user, user-session, or session key)\[em]a new key
+is created and its real key ID returned as the function result.
+.\" The keyctl_get_keyring_ID.3 page says that a new key
+.\" "will be created *if it is appropriate to do so**. What is the
+.\" determiner for appropriate?
+.\" David Howells: Some special keys such as KEY_SPEC_REQKEY_AUTH_KEY
+.\" wouldn't get created but user/user-session/session keyring would
+.\" be created.
+Otherwise, the operation fails with the error
+.BR ENOKEY .
+.IP
+If a valid key ID is specified in
+.IR arg2 ,
+and the key exists, then this operation simply returns the key ID.
+If the key does not exist, the call fails with error
+.BR ENOKEY .
+.IP
+The caller must have
+.I search
+permission on a keyring in order for it to be found.
+.IP
+The arguments
+.I arg4
+and
+.I arg5
+are ignored.
+.IP
+This operation is exposed by
+.I libkeyutils
+via the function
+.BR keyctl_get_keyring_ID (3).
+.TP
+.BR KEYCTL_JOIN_SESSION_KEYRING " (since Linux 2.6.10)"
+Replace the session keyring this process subscribes to with
+a new session keyring.
+.\" This may be useful in conjunction with some sort of
+.\" session management framework that is employed by the application.
+.IP
+If
+.I arg2
+is NULL,
+an anonymous keyring with the description "_ses" is created
+and the process is subscribed to that keyring as its session keyring,
+displacing the previous session keyring.
+.IP
+Otherwise,
+.I arg2
+(cast to
+.IR "char\ *" )
+is treated as the description (name) of a keyring,
+and the behavior is as follows:
+.RS
+.IP \[bu] 3
+If a keyring with a matching description exists,
+the process will attempt to subscribe to that keyring
+as its session keyring if possible;
+if that is not possible, an error is returned.
+In order to subscribe to the keyring,
+the caller must have
+.I search
+permission on the keyring.
+.IP \[bu]
+If a keyring with a matching description does not exist,
+then a new keyring with the specified description is created,
+and the process is subscribed to that keyring as its session keyring.
+.RE
+.IP
+The arguments
+.IR arg3 ,
+.IR arg4 ,
+and
+.I arg5
+are ignored.
+.IP
+This operation is exposed by
+.I libkeyutils
+via the function
+.BR keyctl_join_session_keyring (3).
+.TP
+.BR KEYCTL_UPDATE " (since Linux 2.6.10)"
+Update a key's data payload.
+.IP
+The
+.I arg2
+argument (cast to
+.IR key_serial_t )
+specifies the ID of the key to be updated.
+The
+.I arg3
+argument (cast to
+.IR "void\ *" )
+points to the new payload and
+.I arg4
+(cast to
+.IR size_t )
+contains the new payload size in bytes.
+.IP
+The caller must have
+.I write
+permission on the key specified and the key type must support updating.
+.IP
+A negatively instantiated key (see the description of
+.BR KEYCTL_REJECT )
+can be positively instantiated with this operation.
+.IP
+The
+.I arg5
+argument is ignored.
+.IP
+This operation is exposed by
+.I libkeyutils
+via the function
+.BR keyctl_update (3).
+.TP
+.BR KEYCTL_REVOKE " (since Linux 2.6.10)"
+Revoke the key with the ID provided in
+.I arg2
+(cast to
+.IR key_serial_t ).
+The key is scheduled for garbage collection;
+it will no longer be findable,
+and will be unavailable for further operations.
+Further attempts to use the key will fail with the error
+.BR EKEYREVOKED .
+.IP
+The caller must have
+.I write
+or
+.I setattr
+permission on the key.
+.\" Keys with the KEY_FLAG_KEEP bit set cause an EPERM
+.\" error for KEYCTL_REVOKE. Does this need to be documented?
+.\" David Howells: No significance for user space.
+.IP
+The arguments
+.IR arg3 ,
+.IR arg4 ,
+and
+.I arg5
+are ignored.
+.IP
+This operation is exposed by
+.I libkeyutils
+via the function
+.BR keyctl_revoke (3).
+.TP
+.BR KEYCTL_CHOWN " (since Linux 2.6.10)"
+Change the ownership (user and group ID) of a key.
+.IP
+The
+.I arg2
+argument (cast to
+.IR key_serial_t )
+contains the key ID.
+The
+.I arg3
+argument (cast to
+.IR uid_t )
+contains the new user ID (or \-1 in case the user ID shouldn't be changed).
+The
+.I arg4
+argument (cast to
+.IR gid_t )
+contains the new group ID (or \-1 in case the group ID shouldn't be changed).
+.IP
+The key must grant the caller
+.I setattr
+permission.
+.IP
+For the UID to be changed, or for the GID to be changed to a group
+the caller is not a member of, the caller must have the
+.B CAP_SYS_ADMIN
+capability (see
+.BR capabilities (7)).
+.IP
+If the UID is to be changed, the new user must have sufficient
+quota to accept the key.
+The quota deduction will be transferred from the old user
+to the new user should the UID be changed.
+.IP
+The
+.I arg5
+argument is ignored.
+.IP
+This operation is exposed by
+.I libkeyutils
+via the function
+.BR keyctl_chown (3).
+.TP
+.BR KEYCTL_SETPERM " (since Linux 2.6.10)"
+Change the permissions of the key with the ID provided in the
+.I arg2
+argument (cast to
+.IR key_serial_t )
+to the permissions provided in the
+.I arg3
+argument (cast to
+.IR key_perm_t ).
+.IP
+If the caller doesn't have the
+.B CAP_SYS_ADMIN
+capability, it can change permissions only for the keys it owns.
+(More precisely: the caller's filesystem UID must match the UID of the key.)
+.IP
+The key must grant
+.I setattr
+permission to the caller
+.I regardless
+of the caller's capabilities.
+.\" FIXME Above, is it really intended that a privileged process can't
+.\" override the lack of the 'setattr' permission?
+.IP
+The permissions in
+.I arg3
+specify masks of available operations
+for each of the following user categories:
+.RS
+.TP
+.IR possessor " (since Linux 2.6.14)"
+.\" commit 664cceb0093b755739e56572b836a99104ee8a75
+This is the permission granted to a process that possesses the key
+(has it attached searchably to one of the process's keyrings);
+see
+.BR keyrings (7).
+.TP
+.I user
+This is the permission granted to a process
+whose filesystem UID matches the UID of the key.
+.TP
+.I group
+This is the permission granted to a process
+whose filesystem GID or any of its supplementary GIDs
+matches the GID of the key.
+.TP
+.I other
+This is the permission granted to other processes
+that do not match the
+.I user
+and
+.I group
+categories.
+.RE
+.IP
+The
+.IR user ,
+.IR group ,
+and
+.I other
+categories are exclusive: if a process matches the
+.I user
+category, it will not receive permissions granted in the
+.I group
+category; if a process matches the
+.I user
+or
+.I group
+category, then it will not receive permissions granted in the
+.I other
+category.
+.IP
+The
+.I possessor
+category grants permissions that are cumulative with the grants from the
+.IR user ,
+.IR group ,
+or
+.I other
+category.
+.IP
+Each permission mask is eight bits in size,
+with only six bits currently used.
+The available permissions are:
+.RS
+.TP
+.I view
+This permission allows reading attributes of a key.
+.IP
+This permission is required for the
+.B KEYCTL_DESCRIBE
+operation.
+.IP
+The permission bits for each category are
+.BR KEY_POS_VIEW ,
+.BR KEY_USR_VIEW ,
+.BR KEY_GRP_VIEW ,
+and
+.BR KEY_OTH_VIEW .
+.TP
+.I read
+This permission allows reading a key's payload.
+.IP
+This permission is required for the
+.B KEYCTL_READ
+operation.
+.IP
+The permission bits for each category are
+.BR KEY_POS_READ ,
+.BR KEY_USR_READ ,
+.BR KEY_GRP_READ ,
+and
+.BR KEY_OTH_READ .
+.TP
+.I write
+This permission allows update or instantiation of a key's payload.
+For a keyring, it allows keys to be linked and unlinked from the keyring.
+.IP
+This permission is required for the
+.BR KEYCTL_UPDATE ,
+.BR KEYCTL_REVOKE ,
+.BR KEYCTL_CLEAR ,
+.BR KEYCTL_LINK ,
+and
+.B KEYCTL_UNLINK
+operations.
+.IP
+The permission bits for each category are
+.BR KEY_POS_WRITE ,
+.BR KEY_USR_WRITE ,
+.BR KEY_GRP_WRITE ,
+and
+.BR KEY_OTH_WRITE .
+.TP
+.I search
+This permission allows keyrings to be searched and keys to be found.
+Searches can recurse only into nested keyrings that have
+.I search
+permission set.
+.IP
+This permission is required for the
+.BR KEYCTL_GET_KEYRING_ID ,
+.BR KEYCTL_JOIN_SESSION_KEYRING ,
+.BR KEYCTL_SEARCH ,
+and
+.B KEYCTL_INVALIDATE
+operations.
+.IP
+The permission bits for each category are
+.BR KEY_POS_SEARCH ,
+.BR KEY_USR_SEARCH ,
+.BR KEY_GRP_SEARCH ,
+and
+.BR KEY_OTH_SEARCH .
+.TP
+.I link
+This permission allows a key or keyring to be linked to.
+.IP
+This permission is required for the
+.B KEYCTL_LINK
+and
+.B KEYCTL_SESSION_TO_PARENT
+operations.
+.IP
+The permission bits for each category are
+.BR KEY_POS_LINK ,
+.BR KEY_USR_LINK ,
+.BR KEY_GRP_LINK ,
+and
+.BR KEY_OTH_LINK .
+.TP
+.IR setattr " (since Linux 2.6.15)"
+This permission allows a key's UID, GID, and permissions mask to be changed.
+.IP
+This permission is required for the
+.BR KEYCTL_REVOKE ,
+.BR KEYCTL_CHOWN ,
+and
+.B KEYCTL_SETPERM
+operations.
+.IP
+The permission bits for each category are
+.BR KEY_POS_SETATTR ,
+.BR KEY_USR_SETATTR ,
+.BR KEY_GRP_SETATTR ,
+and
+.BR KEY_OTH_SETATTR .
+.RE
+.IP
+As a convenience, the following macros are defined as masks for
+all of the permission bits in each of the user categories:
+.BR KEY_POS_ALL ,
+.BR KEY_USR_ALL ,
+.BR KEY_GRP_ALL ,
+and
+.BR KEY_OTH_ALL .
+.IP
+The
+.I arg4
+and
+.I arg5
+arguments are ignored.
+.IP
+This operation is exposed by
+.I libkeyutils
+via the function
+.BR keyctl_setperm (3).
+.TP
+.BR KEYCTL_DESCRIBE " (since Linux 2.6.10)"
+Obtain a string describing the attributes of a specified key.
+.IP
+The ID of the key to be described is specified in
+.I arg2
+(cast to
+.IR key_serial_t ).
+The descriptive string is returned in the buffer pointed to by
+.I arg3
+(cast to
+.IR char\~* );
+.I arg4
+(cast to
+.IR size_t )
+specifies the size of that buffer in bytes.
+.IP
+The key must grant the caller
+.I view
+permission.
+.IP
+The returned string is null-terminated and
+contains the following information about the key:
+.IP
+.in +4n
+.IR type ; uid ; gid ; perm ; description
+.in
+.IP
+In the above,
+.I type
+and
+.I description
+are strings,
+.I uid
+and
+.I gid
+are decimal strings, and
+.I perm
+is a hexadecimal permissions mask.
+The descriptive string is written with the following format:
+.IP
+.in +4n
+.EX
+%s;%d;%d;%08x;%s
+.EE
+.in
+.IP
+.B Note: the intention is that the descriptive string should
+.B be extensible in future kernel versions.
+In particular, the
+.I description
+field will not contain semicolons;
+.\" FIXME But, the kernel does not enforce the requirement
+.\" that the key description contains no semicolons!
+.\" So, user space has no guarantee here??
+.\" Either something more needs to be said here,
+.\" or a kernel fix is required.
+it should be parsed by working backwards from the end of the string
+to find the last semicolon.
+This allows semicolon-delimited fields to be inserted
+in the descriptive string in the future.
+.IP
+Writing to the buffer is attempted only when
+.I arg3
+is non-NULL and the specified buffer size
+is large enough to accept the descriptive string
+(including the terminating null byte).
+.\" Function commentary says it copies up to buflen bytes, but see the
+.\" (buffer && buflen >= ret) condition in keyctl_describe_key() in
+.\" security/keyctl.c
+In order to determine whether the buffer size was too small,
+check to see if the return value of the operation is greater than
+.IR arg4 .
+.IP
+The
+.I arg5
+argument is ignored.
+.IP
+This operation is exposed by
+.I libkeyutils
+via the function
+.BR keyctl_describe (3).
+.TP
+.B KEYCTL_CLEAR
+Clear the contents of (i.e., unlink all keys from) a keyring.
+.IP
+The ID of the key
+(which must be of keyring type)
+.\" or the error ENOTDIR results
+is provided in
+.I arg2
+(cast to
+.IR key_serial_t ).
+.\" According to Documentation/security/keys.txt:
+.\" This function can also be used to clear special kernel keyrings if they
+.\" are appropriately marked if the user has CAP_SYS_ADMIN capability. The
+.\" DNS resolver cache keyring is an example of this.
+.IP
+The caller must have
+.I write
+permission on the keyring.
+.IP
+The arguments
+.IR arg3 ,
+.IR arg4 ,
+and
+.I arg5
+are ignored.
+.IP
+This operation is exposed by
+.I libkeyutils
+via the function
+.BR keyctl_clear (3).
+.TP
+.BR KEYCTL_LINK " (since Linux 2.6.10)"
+Create a link from a keyring to a key.
+.IP
+The key to be linked is specified in
+.I arg2
+(cast to
+.IR key_serial_t );
+the keyring is specified in
+.I arg3
+(cast to
+.IR key_serial_t ).
+.IP
+If a key with the same type and description is already linked in the keyring,
+then that key is displaced from the keyring.
+.IP
+Before creating the link,
+the kernel checks the nesting of the keyrings and returns appropriate errors
+if the link would produce a cycle
+or if the nesting of keyrings would be too deep
+(the limit on the nesting of keyrings is determined by the kernel constant
+.BR KEYRING_SEARCH_MAX_DEPTH ,
+defined with the value 6, and is necessary to prevent overflows
+on the kernel stack when recursively searching keyrings).
+.IP
+The caller must have
+.I link
+permission on the key being added and
+.I write
+permission on the keyring.
+.IP
+The arguments
+.I arg4
+and
+.I arg5
+are ignored.
+.IP
+This operation is exposed by
+.I libkeyutils
+via the function
+.BR keyctl_link (3).
+.TP
+.BR KEYCTL_UNLINK " (since Linux 2.6.10)"
+Unlink a key from a keyring.
+.IP
+The ID of the key to be unlinked is specified in
+.I arg2
+(cast to
+.IR key_serial_t );
+the ID of the keyring from which it is to be unlinked is specified in
+.I arg3
+(cast to
+.IR key_serial_t ).
+.IP
+If the key is not currently linked into the keyring, an error results.
+.IP
+The caller must have
+.I write
+permission on the keyring from which the key is being removed.
+.IP
+If the last link to a key is removed,
+then that key will be scheduled for destruction.
+.IP
+The arguments
+.I arg4
+and
+.I arg5
+are ignored.
+.IP
+This operation is exposed by
+.I libkeyutils
+via the function
+.BR keyctl_unlink (3).
+.TP
+.BR KEYCTL_SEARCH " (since Linux 2.6.10)"
+Search for a key in a keyring tree,
+returning its ID and optionally linking it to a specified keyring.
+.IP
+The tree to be searched is specified by passing
+the ID of the head keyring in
+.I arg2
+(cast to
+.IR key_serial_t ).
+The search is performed breadth-first and recursively.
+.IP
+The
+.I arg3
+and
+.I arg4
+arguments specify the key to be searched for:
+.I arg3
+(cast to
+.IR char\~* )
+contains the key type
+(a null-terminated character string up to 32 bytes in size,
+including the terminating null byte), and
+.I arg4
+(cast to
+.IR char\~* )
+contains the description of the key
+(a null-terminated character string up to 4096 bytes in size,
+including the terminating null byte).
+.IP
+The source keyring must grant
+.I search
+permission to the caller.
+When performing the recursive search, only keyrings that grant the caller
+.I search
+permission will be searched.
+Only keys for which the caller has
+.I search
+permission can be found.
+.IP
+If the key is found, its ID is returned as the function result.
+.IP
+If the key is found and
+.I arg5
+(cast to
+.IR key_serial_t )
+is nonzero, then, subject to the same constraints and rules as
+.BR KEYCTL_LINK ,
+the key is linked into the keyring whose ID is specified in
+.IR arg5 .
+If the destination keyring specified in
+.I arg5
+already contains a link to a key that has the same type and description,
+then that link will be displaced by a link to
+the key found by this operation.
+.IP
+Instead of valid existing keyring IDs, the source
+.RI ( arg2 )
+and destination
+.RI ( arg5 )
+keyrings can be one of the special keyring IDs listed under
+.BR KEYCTL_GET_KEYRING_ID .
+.IP
+This operation is exposed by
+.I libkeyutils
+via the function
+.BR keyctl_search (3).
+.TP
+.BR KEYCTL_READ " (since Linux 2.6.10)"
+Read the payload data of a key.
+.IP
+The ID of the key whose payload is to be read is specified in
+.I arg2
+(cast to
+.IR key_serial_t ).
+This can be the ID of an existing key,
+or any of the special key IDs listed for
+.BR KEYCTL_GET_KEYRING_ID .
+.\" including KEY_SPEC_REQKEY_AUTH_KEY
+.IP
+The payload is placed in the buffer pointed to by
+.I arg3
+(cast to
+.IR "char\ *" );
+the size of that buffer must be specified in
+.I arg4
+(cast to
+.IR size_t ).
+.IP
+The returned data will be processed for presentation
+according to the key type.
+For example, a keyring will return an array of
+.I key_serial_t
+entries representing the IDs of all the keys that are linked to it.
+The
+.I user
+key type will return its data as is.
+If a key type does not implement this function,
+the operation fails with the error
+.BR EOPNOTSUPP .
+.IP
+If
+.I arg3
+is not NULL,
+as much of the payload data as will fit is copied into the buffer.
+On a successful return,
+the return value is always the total size of the payload data.
+To determine whether the buffer was of sufficient size,
+check to see that the return value is less than or equal to
+the value supplied in
+.IR arg4 .
+.IP
+The key must either grant the caller
+.I read
+permission, or grant the caller
+.I search
+permission when searched for from the process keyrings
+(i.e., the key is possessed).
+.IP
+The
+.I arg5
+argument is ignored.
+.IP
+This operation is exposed by
+.I libkeyutils
+via the function
+.BR keyctl_read (3).
+.TP
+.BR KEYCTL_INSTANTIATE " (since Linux 2.6.10)"
+(Positively) instantiate an uninstantiated key with a specified payload.
+.IP
+The ID of the key to be instantiated is provided in
+.I arg2
+(cast to
+.IR key_serial_t ).
+.IP
+The key payload is specified in the buffer pointed to by
+.I arg3
+(cast to
+.IR "void\ *" );
+the size of that buffer is specified in
+.I arg4
+(cast to
+.IR size_t ).
+.IP
+The payload may be a NULL pointer and the buffer size may be 0
+if this is supported by the key type (e.g., it is a keyring).
+.IP
+The operation may fail if the payload data is in the wrong format
+or is otherwise invalid.
+.IP
+If
+.I arg5
+(cast to
+.IR key_serial_t )
+is nonzero, then, subject to the same constraints and rules as
+.BR KEYCTL_LINK ,
+the instantiated key is linked into the keyring whose ID is specified in
+.IR arg5 .
+.IP
+The caller must have the appropriate authorization key,
+and once the uninstantiated key has been instantiated,
+the authorization key is revoked.
+In other words, this operation is available only from a
+.BR request\-key (8)-style
+program.
+See
+.BR request_key (2)
+for an explanation of uninstantiated keys and key instantiation.
+.IP
+This operation is exposed by
+.I libkeyutils
+via the function
+.BR keyctl_instantiate (3).
+.TP
+.BR KEYCTL_NEGATE " (since Linux 2.6.10)"
+Negatively instantiate an uninstantiated key.
+.IP
+This operation is equivalent to the call:
+.IP
+.in +4n
+.EX
+keyctl(KEYCTL_REJECT, arg2, arg3, ENOKEY, arg4);
+.EE
+.in
+.IP
+The
+.I arg5
+argument is ignored.
+.IP
+This operation is exposed by
+.I libkeyutils
+via the function
+.BR keyctl_negate (3).
+.TP
+.BR KEYCTL_SET_REQKEY_KEYRING " (since Linux 2.6.13)"
+Set the default keyring to which implicitly requested keys
+will be linked for this thread, and return the previous setting.
+Implicit key requests are those made by internal kernel components,
+.\" I.e., calls to the kernel's internal request_key() interface,
+.\" which is distinct from the request_key(2) system call (which
+.\" ultimately employs the kernel-internal interface).
+such as can occur when, for example, opening files
+on an AFS or NFS filesystem.
+Setting the default keyring also has an effect when requesting
+a key from user space; see
+.BR request_key (2)
+for details.
+.IP
+The
+.I arg2
+argument (cast to
+.IR int )
+should contain one of the following values,
+to specify the new default keyring:
+.RS
+.TP
+.B KEY_REQKEY_DEFL_NO_CHANGE
+Don't change the default keyring.
+This can be used to discover the current default keyring
+(without changing it).
+.TP
+.B KEY_REQKEY_DEFL_DEFAULT
+This selects the default behaviour,
+which is to use the thread-specific keyring if there is one,
+otherwise the process-specific keyring if there is one,
+otherwise the session keyring if there is one,
+otherwise the UID-specific session keyring,
+otherwise the user-specific keyring.
+.TP
+.B KEY_REQKEY_DEFL_THREAD_KEYRING
+Use the thread-specific keyring
+.RB ( thread\-keyring (7))
+as the new default keyring.
+.TP
+.B KEY_REQKEY_DEFL_PROCESS_KEYRING
+Use the process-specific keyring
+.RB ( process\-keyring (7))
+as the new default keyring.
+.TP
+.B KEY_REQKEY_DEFL_SESSION_KEYRING
+Use the session-specific keyring
+.RB ( session\-keyring (7))
+as the new default keyring.
+.TP
+.B KEY_REQKEY_DEFL_USER_KEYRING
+Use the UID-specific keyring
+.RB ( user\-keyring (7))
+as the new default keyring.
+.TP
+.B KEY_REQKEY_DEFL_USER_SESSION_KEYRING
+Use the UID-specific session keyring
+.RB ( user\-session\-keyring (7))
+as the new default keyring.
+.TP
+.BR KEY_REQKEY_DEFL_REQUESTOR_KEYRING " (since Linux 2.6.29)"
+.\" 8bbf4976b59fc9fc2861e79cab7beb3f6d647640
+Use the requestor keyring.
+.\" FIXME The preceding explanation needs to be expanded.
+.\" Is the following correct:
+.\"
+.\" The requestor keyring is the dest_keyring that
+.\" was supplied to a call to request_key(2)?
+.\"
+.\" David Howells said: to be checked
+.RE
+.IP
+All other values are invalid.
+.\" (including the still-unsupported KEY_REQKEY_DEFL_GROUP_KEYRING)
+.IP
+The arguments
+.IR arg3 ,
+.IR arg4 ,
+and
+.I arg5
+are ignored.
+.IP
+The setting controlled by this operation is inherited by the child of
+.BR fork (2)
+and preserved across
+.BR execve (2).
+.IP
+This operation is exposed by
+.I libkeyutils
+via the function
+.BR keyctl_set_reqkey_keyring (3).
+.TP
+.BR KEYCTL_SET_TIMEOUT " (since Linux 2.6.16)"
+Set a timeout on a key.
+.IP
+The ID of the key is specified in
+.I arg2
+(cast to
+.IR key_serial_t ).
+The timeout value, in seconds from the current time,
+is specified in
+.I arg3
+(cast to
+.IR "unsigned int" ).
+The timeout is measured against the realtime clock.
+.IP
+Specifying the timeout value as 0 clears any existing timeout on the key.
+.IP
+The
+.I /proc/keys
+file displays the remaining time until each key will expire.
+(This is the only method of discovering the timeout on a key.)
+.IP
+The caller must either have the
+.I setattr
+permission on the key
+or hold an instantiation authorization token for the key (see
+.BR request_key (2)).
+.IP
+The key and any links to the key will be
+automatically garbage collected after the timeout expires.
+Subsequent attempts to access the key will then fail with the error
+.BR EKEYEXPIRED .
+.IP
+This operation cannot be used to set timeouts on revoked, expired,
+or negatively instantiated keys.
+.IP
+The arguments
+.I arg4
+and
+.I arg5
+are ignored.
+.IP
+This operation is exposed by
+.I libkeyutils
+via the function
+.BR keyctl_set_timeout (3).
+.TP
+.BR KEYCTL_ASSUME_AUTHORITY " (since Linux 2.6.16)"
+Assume (or divest) the authority for the calling thread
+to instantiate a key.
+.IP
+The
+.I arg2
+argument (cast to
+.IR key_serial_t )
+specifies either a nonzero key ID to assume authority,
+or the value 0 to divest authority.
+.IP
+If
+.I arg2
+is nonzero, then it specifies the ID of an uninstantiated key for which
+authority is to be assumed.
+That key can then be instantiated using one of
+.BR KEYCTL_INSTANTIATE ,
+.BR KEYCTL_INSTANTIATE_IOV ,
+.BR KEYCTL_REJECT ,
+or
+.BR KEYCTL_NEGATE .
+Once the key has been instantiated,
+the thread is automatically divested of authority to instantiate the key.
+.IP
+Authority over a key can be assumed only if the calling thread has present
+in its keyrings the authorization key that is
+associated with the specified key.
+(In other words, the
+.B KEYCTL_ASSUME_AUTHORITY
+operation is available only from a
+.BR request\-key (8)-style
+program; see
+.BR request_key (2)
+for an explanation of how this operation is used.)
+The caller must have
+.I search
+permission on the authorization key.
+.IP
+If the specified key has a matching authorization key,
+then the ID of that key is returned.
+The authorization key can be read
+.RB ( KEYCTL_READ )
+to obtain the callout information passed to
+.BR request_key (2).
+.IP
+If the ID given in
+.I arg2
+is 0, then the currently assumed authority is cleared (divested),
+and the value 0 is returned.
+.IP
+The
+.B KEYCTL_ASSUME_AUTHORITY
+mechanism allows a program such as
+.BR request\-key (8)
+to assume the necessary authority to instantiate a new uninstantiated key
+that was created as a consequence of a call to
+.BR request_key (2).
+For further information, see
+.BR request_key (2)
+and the kernel source file
+.IR Documentation/security/keys\-request\-key.txt .
+.IP
+The arguments
+.IR arg3 ,
+.IR arg4 ,
+and
+.I arg5
+are ignored.
+.IP
+This operation is exposed by
+.I libkeyutils
+via the function
+.BR keyctl_assume_authority (3).
+.TP
+.BR KEYCTL_GET_SECURITY " (since Linux 2.6.26)"
+.\" commit 70a5bb72b55e82fbfbf1e22cae6975fac58a1e2d
+Get the LSM (Linux Security Module) security label of the specified key.
+.IP
+The ID of the key whose security label is to be fetched is specified in
+.I arg2
+(cast to
+.IR key_serial_t ).
+The security label (terminated by a null byte)
+will be placed in the buffer pointed to by the
+.I arg3
+argument (cast to
+.IR "char\ *" );
+the size of the buffer must be provided in
+.I arg4
+(cast to
+.IR size_t ).
+.IP
+If
+.I arg3
+is specified as NULL or the buffer size specified in
+.I arg4
+is too small, the full size of the security label string
+(including the terminating null byte)
+is returned as the function result,
+and nothing is copied to the buffer.
+.IP
+The caller must have
+.I view
+permission on the specified key.
+.IP
+The returned security label string will be rendered in a form appropriate
+to the LSM in force.
+For example, with SELinux, it may look like:
+.IP
+.in +4n
+.EX
+unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023
+.EE
+.in
+.IP
+If no LSM is currently in force,
+then an empty string is placed in the buffer.
+.IP
+The
+.I arg5
+argument is ignored.
+.IP
+This operation is exposed by
+.I libkeyutils
+via the functions
+.BR keyctl_get_security (3)
+and
+.BR keyctl_get_security_alloc (3).
+.TP
+.BR KEYCTL_SESSION_TO_PARENT " (since Linux 2.6.32)"
+.\" commit ee18d64c1f632043a02e6f5ba5e045bb26a5465f
+Replace the session keyring to which the
+.I parent
+of the calling process
+subscribes with the session keyring of the calling process.
+.\" What is the use case for KEYCTL_SESSION_TO_PARENT?
+.\" David Howells: the Process Authentication Groups people requested this,
+.\" but then didn't use it; maybe there are no users.
+.IP
+The keyring will be replaced in the parent process at the point
+where the parent next transitions from kernel space to user space.
+.IP
+The keyring must exist and must grant the caller
+.I link
+permission.
+The parent process must be single-threaded and have
+the same effective ownership as this process
+and must not be set-user-ID or set-group-ID.
+The UID of the parent process's existing session keyring (if it has one),
+as well as the UID of the caller's session keyring
+must match the caller's effective UID.
+.IP
+The fact that it is the parent process that is affected by this operation
+allows a program such as the shell to start a child process that
+uses this operation to change the shell's session keyring.
+(This is what the
+.BR keyctl (1)
+.B new_session
+command does.)
+.IP
+The arguments
+.IR arg2 ,
+.IR arg3 ,
+.IR arg4 ,
+and
+.I arg5
+are ignored.
+.IP
+This operation is exposed by
+.I libkeyutils
+via the function
+.BR keyctl_session_to_parent (3).
+.TP
+.BR KEYCTL_REJECT " (since Linux 2.6.39)"
+.\" commit fdd1b94581782a2ddf9124414e5b7a5f48ce2f9c
+Mark a key as negatively instantiated and set an expiration timer
+on the key.
+This operation provides a superset of the functionality of the earlier
+.B KEYCTL_NEGATE
+operation.
+.IP
+The ID of the key that is to be negatively instantiated is specified in
+.I arg2
+(cast to
+.IR key_serial_t ).
+The
+.I arg3
+(cast to
+.IR "unsigned int" )
+argument specifies the lifetime of the key, in seconds.
+The
+.I arg4
+argument (cast to
+.IR "unsigned int" )
+specifies the error to be returned when a search hits this key;
+typically, this is one of
+.BR EKEYREJECTED ,
+.BR EKEYREVOKED ,
+or
+.BR EKEYEXPIRED .
+.IP
+If
+.I arg5
+(cast to
+.IR key_serial_t )
+is nonzero, then, subject to the same constraints and rules as
+.BR KEYCTL_LINK ,
+the negatively instantiated key is linked into the keyring
+whose ID is specified in
+.IR arg5 .
+.IP
+The caller must have the appropriate authorization key.
+In other words, this operation is available only from a
+.BR request\-key (8)-style
+program.
+See
+.BR request_key (2).
+.IP
+The caller must have the appropriate authorization key,
+and once the uninstantiated key has been instantiated,
+the authorization key is revoked.
+In other words, this operation is available only from a
+.BR request\-key (8)-style
+program.
+See
+.BR request_key (2)
+for an explanation of uninstantiated keys and key instantiation.
+.IP
+This operation is exposed by
+.I libkeyutils
+via the function
+.BR keyctl_reject (3).
+.TP
+.BR KEYCTL_INSTANTIATE_IOV " (since Linux 2.6.39)"
+.\" commit ee009e4a0d4555ed522a631bae9896399674f063
+Instantiate an uninstantiated key with a payload specified
+via a vector of buffers.
+.IP
+This operation is the same as
+.BR KEYCTL_INSTANTIATE ,
+but the payload data is specified as an array of
+.I iovec
+structures (see
+.BR iovec (3type)).
+.IP
+The pointer to the payload vector is specified in
+.I arg3
+(cast as
+.IR "const struct iovec\~*" ).
+The number of items in the vector is specified in
+.I arg4
+(cast as
+.IR "unsigned int" ).
+.IP
+The
+.I arg2
+(key ID)
+and
+.I arg5
+(keyring ID)
+are interpreted as for
+.BR KEYCTL_INSTANTIATE .
+.IP
+This operation is exposed by
+.I libkeyutils
+via the function
+.BR keyctl_instantiate_iov (3).
+.TP
+.BR KEYCTL_INVALIDATE " (since Linux 3.5)"
+.\" commit fd75815f727f157a05f4c96b5294a4617c0557da
+Mark a key as invalid.
+.IP
+The ID of the key to be invalidated is specified in
+.I arg2
+(cast to
+.IR key_serial_t ).
+.IP
+To invalidate a key,
+the caller must have
+.I search
+permission on the key.
+.\" CAP_SYS_ADMIN is permitted to invalidate certain special keys
+.IP
+This operation marks the key as invalid
+and schedules immediate garbage collection.
+The garbage collector removes the invalidated key from all keyrings and
+deletes the key when its reference count reaches zero.
+After this operation,
+the key will be ignored by all searches,
+even if it is not yet deleted.
+.IP
+Keys that are marked invalid become invisible to normal key operations
+immediately, though they are still visible in
+.I /proc/keys
+(marked with an 'i' flag)
+until they are actually removed.
+.IP
+The arguments
+.IR arg3 ,
+.IR arg4 ,
+and
+.I arg5
+are ignored.
+.IP
+This operation is exposed by
+.I libkeyutils
+via the function
+.BR keyctl_invalidate (3).
+.TP
+.BR KEYCTL_GET_PERSISTENT " (since Linux 3.13)"
+.\" commit f36f8c75ae2e7d4da34f4c908cebdb4aa42c977e
+Get the persistent keyring
+.RB ( persistent\-keyring (7))
+for a specified user and link it to a specified keyring.
+.IP
+The user ID is specified in
+.I arg2
+(cast to
+.IR uid_t ).
+If the value \-1 is specified, the caller's real user ID is used.
+The ID of the destination keyring is specified in
+.I arg3
+(cast to
+.IR key_serial_t ).
+.IP
+The caller must have the
+.B CAP_SETUID
+capability in its user namespace in order to fetch the persistent keyring
+for a user ID that does not match either the real or effective user ID
+of the caller.
+.IP
+If the call is successful,
+a link to the persistent keyring is added to the keyring
+whose ID was specified in
+.IR arg3 .
+.IP
+The caller must have
+.I write
+permission on the keyring.
+.IP
+The persistent keyring will be created by the kernel
+if it does not yet exist.
+.IP
+Each time the
+.B KEYCTL_GET_PERSISTENT
+operation is performed, the persistent keyring will
+have its expiration timeout reset to the value in:
+.IP
+.in +4n
+.EX
+/proc/sys/kernel/keys/persistent_keyring_expiry
+.EE
+.in
+.IP
+Should the timeout be reached,
+the persistent keyring will be removed and
+everything it pins can then be garbage collected.
+.IP
+Persistent keyrings were added in Linux 3.13.
+.IP
+The arguments
+.I arg4
+and
+.I arg5
+are ignored.
+.IP
+This operation is exposed by
+.I libkeyutils
+via the function
+.BR keyctl_get_persistent (3).
+.TP
+.BR KEYCTL_DH_COMPUTE " (since Linux 4.7)"
+.\" commit ddbb41148724367394d0880c516bfaeed127b52e
+Compute a Diffie-Hellman shared secret or public key,
+optionally applying a key derivation function (KDF) to the result.
+.IP
+The
+.I arg2
+argument is a pointer to a set of parameters containing
+serial numbers for three
+.I """user"""
+keys used in the Diffie-Hellman calculation,
+packaged in a structure of the following form:
+.IP
+.in +4n
+.EX
+struct keyctl_dh_params {
+ int32_t private; /* The local private key */
+ int32_t prime; /* The prime, known to both parties */
+ int32_t base; /* The base integer: either a shared
+ generator or the remote public key */
+};
+.EE
+.in
+.IP
+Each of the three keys specified in this structure must grant the caller
+.I read
+permission.
+The payloads of these keys are used to calculate the Diffie-Hellman
+result as:
+.IP
+.in +4n
+.EX
+base \[ha] private mod prime
+.EE
+.in
+.IP
+If the base is the shared generator, the result is the local public key.
+If the base is the remote public key, the result is the shared secret.
+.IP
+The
+.I arg3
+argument (cast to
+.IR char\~* )
+points to a buffer where the result of the calculation is placed.
+The size of that buffer is specified in
+.I arg4
+(cast to
+.IR size_t ).
+.IP
+The buffer must be large enough to accommodate the output data,
+otherwise an error is returned.
+If
+.I arg4
+is specified as zero,
+the buffer is not used and
+the operation returns the minimum required buffer size
+(i.e., the length of the prime).
+.IP
+Diffie-Hellman computations can be performed in user space,
+but require a multiple-precision integer (MPI) library.
+Moving the implementation into the kernel gives access to
+the kernel MPI implementation,
+and allows access to secure or acceleration hardware.
+.IP
+Adding support for DH computation to the
+.BR keyctl ()
+system call was considered a good fit due to the DH algorithm's use
+for deriving shared keys;
+it also allows the type of the key to determine
+which DH implementation (software or hardware) is appropriate.
+.\" commit f1c316a3ab9d24df6022682422fe897492f2c0c8
+.IP
+If the
+.I arg5
+argument is
+.BR NULL ,
+then the DH result itself is returned.
+Otherwise (since Linux 4.12), it is a pointer to a structure which specifies
+parameters of the KDF operation to be applied:
+.IP
+.in +4n
+.EX
+struct keyctl_kdf_params {
+ char *hashname; /* Hash algorithm name */
+ char *otherinfo; /* SP800\-56A OtherInfo */
+ __u32 otherinfolen; /* Length of otherinfo data */
+ __u32 __spare[8]; /* Reserved */
+};
+.EE
+.in
+.IP
+The
+.I hashname
+field is a null-terminated string which specifies a hash name
+(available in the kernel's crypto API; the list of the hashes available
+is rather tricky to observe; please refer to the
+.UR https://www.kernel.org\:/doc\:/html\:/latest\:/crypto\:/architecture.html
+"Kernel Crypto API Architecture"
+.UE
+documentation for the information regarding how hash names are constructed and
+your kernel's source and configuration regarding what ciphers
+and templates with type
+.B CRYPTO_ALG_TYPE_SHASH
+are available)
+to be applied to the DH result in the KDF operation.
+.IP
+The
+.I otherinfo
+field contains
+.I OtherInfo
+data as described in SP800-56A section 5.8.1.2 and is algorithm-specific.
+This data is concatenated with the result of the DH operation and is provided as
+an input to the KDF operation.
+Its size is provided in the
+.I otherinfolen
+field and is limited by the
+.B KEYCTL_KDF_MAX_OI_LEN
+constant that is defined in
+.I security/keys/internal.h
+to a value of 64.
+.IP
+The
+.B __spare
+field is currently unused.
+.\" commit 4f9dabfaf8df971f8a3b6aa324f8f817be38d538
+It was ignored until Linux 4.13 (but still should be
+user-addressable since it is copied to the kernel),
+and should contain zeros since Linux 4.13.
+.IP
+The KDF implementation complies with SP800-56A as well
+as with SP800-108 (the counter KDF).
+.IP
+.\" keyutils commit 742c9d7b94051d3b21f9f61a73ed6b5f3544cb82
+.\" keyutils commit d68a981e5db41d059ac782071c35d1e8f3aaf61c
+This operation is exposed by
+.I libkeyutils
+(from
+.I libkeyutils
+1.5.10 onwards) via the functions
+.BR keyctl_dh_compute (3)
+and
+.BR keyctl_dh_compute_alloc (3).
+.TP
+.BR KEYCTL_RESTRICT_KEYRING " (since Linux 4.12)"
+.\" commit 6563c91fd645556c7801748f15bc727c77fcd311
+.\" commit 7228b66aaf723a623e578aa4db7d083bb39546c9
+Apply a key-linking restriction to the keyring with the ID provided in
+.I arg2
+(cast to
+.IR key_serial_t ).
+The caller must have
+.I setattr
+permission on the key.
+If
+.I arg3
+is NULL, any attempt to add a key to the keyring is blocked;
+otherwise it contains a pointer to a string with a key type name and
+.I arg4
+contains a pointer to a string that describes the type-specific restriction.
+As of Linux 4.12, only the type "asymmetric" has restrictions defined:
+.RS
+.TP
+.B builtin_trusted
+Allows only keys that are signed by a key linked to the built-in keyring
+(".builtin_trusted_keys").
+.TP
+.B builtin_and_secondary_trusted
+Allows only keys that are signed by a key linked to the secondary keyring
+(".secondary_trusted_keys") or, by extension, a key in a built-in keyring,
+as the latter is linked to the former.
+.TP
+.BI key_or_keyring: key
+.TQ
+.BI key_or_keyring: key :chain
+If
+.I key
+specifies the ID of a key of type "asymmetric",
+then only keys that are signed by this key are allowed.
+.IP
+If
+.I key
+specifies the ID of a keyring,
+then only keys that are signed by a key linked
+to this keyring are allowed.
+.IP
+If ":chain" is specified, keys that are signed by a key linked to the
+destination keyring (that is, the keyring with the ID specified in the
+.I arg2
+argument) are also allowed.
+.RE
+.IP
+Note that a restriction can be configured only once for the specified keyring;
+once a restriction is set, it can't be overridden.
+.IP
+The argument
+.I arg5
+is ignored.
+.\" FIXME Document KEYCTL_RESTRICT_KEYRING, added in Linux 4.12
+.\" commit 6563c91fd645556c7801748f15bc727c77fcd311
+.\" Author: Mat Martineau <mathew.j.martineau@linux.intel.com>
+.\" See Documentation/security/keys.txt
+.SH RETURN VALUE
+For a successful call, the return value depends on the operation:
+.TP
+.B KEYCTL_GET_KEYRING_ID
+The ID of the requested keyring.
+.TP
+.B KEYCTL_JOIN_SESSION_KEYRING
+The ID of the joined session keyring.
+.TP
+.B KEYCTL_DESCRIBE
+The size of the description (including the terminating null byte),
+irrespective of the provided buffer size.
+.TP
+.B KEYCTL_SEARCH
+The ID of the key that was found.
+.TP
+.B KEYCTL_READ
+The amount of data that is available in the key,
+irrespective of the provided buffer size.
+.TP
+.B KEYCTL_SET_REQKEY_KEYRING
+The ID of the previous default keyring
+to which implicitly requested keys were linked
+(one of
+.BR KEY_REQKEY_DEFL_* ).
+.TP
+.B KEYCTL_ASSUME_AUTHORITY
+Either 0, if the ID given was 0,
+or the ID of the authorization key matching the specified key,
+if a nonzero key ID was provided.
+.TP
+.B KEYCTL_GET_SECURITY
+The size of the LSM security label string
+(including the terminating null byte),
+irrespective of the provided buffer size.
+.TP
+.B KEYCTL_GET_PERSISTENT
+The ID of the persistent keyring.
+.TP
+.B KEYCTL_DH_COMPUTE
+The number of bytes copied to the buffer, or, if
+.I arg4
+is 0, the required buffer size.
+.TP
+All other operations
+Zero.
+.PP
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EACCES
+The requested operation wasn't permitted.
+.TP
+.B EAGAIN
+.I operation
+was
+.B KEYCTL_DH_COMPUTE
+and there was an error during crypto module initialization.
+.TP
+.B EDEADLK
+.I operation
+was
+.B KEYCTL_LINK
+and the requested link would result in a cycle.
+.TP
+.B EDEADLK
+.I operation
+was
+.B KEYCTL_RESTRICT_KEYRING
+and the requested keyring restriction would result in a cycle.
+.TP
+.B EDQUOT
+The key quota for the caller's user would be exceeded by creating a key or
+linking it to the keyring.
+.TP
+.B EEXIST
+.I operation
+was
+.B KEYCTL_RESTRICT_KEYRING
+and the keyring provided in the
+.I arg2
+argument already has a restriction set.
+.TP
+.B EFAULT
+.I operation
+was
+.B KEYCTL_DH_COMPUTE
+and one of the following has failed:
+.RS
+.IP \[bu] 3
+copying of the
+.IR "struct keyctl_dh_params" ,
+provided in the
+.I arg2
+argument, from user space;
+.IP \[bu]
+copying of the
+.IR "struct keyctl_kdf_params" ,
+provided in the non-NULL
+.I arg5
+argument, from user space
+(in case the kernel supports performing a KDF operation on the DH operation result);
+.IP \[bu]
+copying of data pointed to by the
+.I hashname
+field of the
+.I "struct keyctl_kdf_params"
+from user space;
+.IP \[bu]
+copying of data pointed to by the
+.I otherinfo
+field of the
+.I struct keyctl_kdf_params
+from user space if the
+.I otherinfolen
+field was nonzero;
+.IP \[bu]
+copying of the result to user space.
+.RE
+.TP
+.B EINVAL
+.I operation
+was
+.B KEYCTL_SETPERM
+and an invalid permission bit was specified in
+.IR arg3 .
+.TP
+.B EINVAL
+.I operation
+was
+.B KEYCTL_SEARCH
+and the size of the description in
+.I arg4
+(including the terminating null byte) exceeded 4096 bytes.
+.TP
+.B EINVAL
+The size of the string (including the terminating null byte) specified in
+.I arg3
+(the key type)
+or
+.I arg4
+(the key description)
+exceeded the limit (32 bytes and 4096 bytes respectively).
+.TP
+.BR EINVAL " (before Linux 4.12)"
+.I operation
+was
+.BR KEYCTL_DH_COMPUTE ,
+and argument
+.I arg5
+was non-NULL.
+.TP
+.B EINVAL
+.I operation
+was
+.B KEYCTL_DH_COMPUTE
+and the digest size of the hashing algorithm supplied is zero.
+.TP
+.B EINVAL
+.I operation
+was
+.B KEYCTL_DH_COMPUTE
+and the buffer size provided is not enough to hold the result.
+Provide 0 as a buffer size in order to obtain the minimum buffer size.
+.TP
+.B EINVAL
+.I operation
+was
+.B KEYCTL_DH_COMPUTE
+and the hash name provided in the
+.I hashname
+field of the
+.I struct keyctl_kdf_params
+pointed to by
+.I arg5
+argument is too big (the limit is implementation-specific and varies between
+kernel versions, but it is deemed big enough for all valid algorithm names).
+.TP
+.B EINVAL
+.\" commit 4f9dabfaf8df971f8a3b6aa324f8f817be38d538
+.I operation
+was
+.B KEYCTL_DH_COMPUTE
+and the
+.I __spare
+field of the
+.I struct keyctl_kdf_params
+provided in the
+.I arg5
+argument contains nonzero values.
+.TP
+.B EKEYEXPIRED
+An expired key was found or specified.
+.TP
+.B EKEYREJECTED
+A rejected key was found or specified.
+.TP
+.B EKEYREVOKED
+A revoked key was found or specified.
+.TP
+.B ELOOP
+.I operation
+was
+.B KEYCTL_LINK
+and the requested link would cause the maximum nesting depth
+for keyrings to be exceeded.
+.TP
+.B EMSGSIZE
+.I operation
+was
+.B KEYCTL_DH_COMPUTE
+and the buffer length exceeds
+.B KEYCTL_KDF_MAX_OUTPUT_LEN
+(which is 1024 currently)
+or the
+.I otherinfolen
+field of the
+.I struct keyctl_kdf_params
+passed in
+.I arg5
+exceeds
+.B KEYCTL_KDF_MAX_OI_LEN
+(which is 64 currently).
+.TP
+.BR ENFILE " (before Linux 3.13)"
+.I operation
+was
+.B KEYCTL_LINK
+and the keyring is full.
+(Before Linux 3.13,
+.\" commit b2a4df200d570b2c33a57e1ebfa5896e4bc81b69
+the available space for storing keyring links was limited to
+a single page of memory; since Linux 3.13, there is no fixed limit.)
+.TP
+.B ENOENT
+.I operation
+was
+.B KEYCTL_UNLINK
+and the key to be unlinked isn't linked to the keyring.
+.TP
+.B ENOENT
+.I operation
+was
+.B KEYCTL_DH_COMPUTE
+and the hashing algorithm specified in the
+.I hashname
+field of the
+.I struct keyctl_kdf_params
+pointed to by
+.I arg5
+argument hasn't been found.
+.TP
+.B ENOENT
+.I operation
+was
+.B KEYCTL_RESTRICT_KEYRING
+and the type provided in the
+.I arg3
+argument doesn't support setting key linking restrictions.
+.TP
+.B ENOKEY
+No matching key was found or an invalid key was specified.
+.TP
+.B ENOKEY
+The value
+.B KEYCTL_GET_KEYRING_ID
+was specified in
+.IR operation ,
+the key specified in
+.I arg2
+did not exist, and
+.I arg3
+was zero (meaning don't create the key if it didn't exist).
+.TP
+.B ENOMEM
+One of the kernel memory allocation routines failed during the execution of the
+syscall.
+.TP
+.B ENOTDIR
+A key of keyring type was expected but the ID of a key with
+a different type was provided.
+.TP
+.B EOPNOTSUPP
+.I operation
+was
+.B KEYCTL_READ
+and the key type does not support reading
+(e.g., the type is
+.IR """login""" ).
+.TP
+.B EOPNOTSUPP
+.I operation
+was
+.B KEYCTL_UPDATE
+and the key type does not support updating.
+.TP
+.B EOPNOTSUPP
+.I operation
+was
+.BR KEYCTL_RESTRICT_KEYRING ,
+the type provided in the
+.I arg3
+argument was "asymmetric",
+and the key specified in the restriction specification provided in
+.I arg4
+has type other than "asymmetric" or "keyring".
+.TP
+.B EPERM
+.I operation
+was
+.BR KEYCTL_GET_PERSISTENT ,
+.I arg2
+specified a UID other than the calling thread's real or effective UID,
+and the caller did not have the
+.B CAP_SETUID
+capability.
+.TP
+.B EPERM
+.I operation
+was
+.B KEYCTL_SESSION_TO_PARENT
+and either:
+all of the UIDs (GIDs) of the parent process do not match
+the effective UID (GID) of the calling process;
+the UID of the parent's existing session keyring or
+the UID of the caller's session keyring did not match
+the effective UID of the caller;
+the parent process is not single-threaded;
+or the parent process is
+.BR init (1)
+or a kernel thread.
+.TP
+.B ETIMEDOUT
+.I operation
+was
+.B KEYCTL_DH_COMPUTE
+and the initialization of crypto modules has timed out.
+.SH VERSIONS
+A wrapper is provided in the
+.I libkeyutils
+library.
+(The accompanying package provides the
+.I <keyutils.h>
+header file.)
+However, rather than using this system call directly,
+you probably want to use the various library functions
+mentioned in the descriptions of individual operations above.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 2.6.10.
+.SH EXAMPLES
+The program below provides a subset of the functionality of the
+.BR request\-key (8)
+program provided by the
+.I keyutils
+package.
+For informational purposes,
+the program records various information in a log file.
+.PP
+As described in
+.BR request_key (2),
+the
+.BR request\-key (8)
+program is invoked with command-line arguments that
+describe a key that is to be instantiated.
+The example program fetches and logs these arguments.
+The program assumes authority to instantiate the requested key,
+and then instantiates that key.
+.PP
+The following shell session demonstrates the use of this program.
+In the session,
+we compile the program and then use it to temporarily replace the standard
+.BR request\-key (8)
+program.
+(Note that temporarily disabling the standard
+.BR request\-key (8)
+program may not be safe on some systems.)
+While our example program is installed,
+we use the example program shown in
+.BR request_key (2)
+to request a key.
+.PP
+.in +4n
+.EX
+$ \fBcc \-o key_instantiate key_instantiate.c \-lkeyutils\fP
+$ \fBsudo mv /sbin/request\-key /sbin/request\-key.backup\fP
+$ \fBsudo cp key_instantiate /sbin/request\-key\fP
+$ \fB./t_request_key user mykey somepayloaddata\fP
+Key ID is 20d035bf
+$ \fBsudo mv /sbin/request\-key.backup /sbin/request\-key\fP
+.EE
+.in
+.PP
+Looking at the log file created by this program,
+we can see the command-line arguments supplied to our example program:
+.PP
+.in +4n
+.EX
+$ \fBcat /tmp/key_instantiate.log\fP
+Time: Mon Nov 7 13:06:47 2016
+\&
+Command line arguments:
+ argv[0]: /sbin/request\-key
+ operation: create
+ key_to_instantiate: 20d035bf
+ UID: 1000
+ GID: 1000
+ thread_keyring: 0
+ process_keyring: 0
+ session_keyring: 256e6a6
+\&
+Key description: user;1000;1000;3f010000;mykey
+Auth key payload: somepayloaddata
+Destination keyring: 256e6a6
+Auth key description: .request_key_auth;1000;1000;0b010000;20d035bf
+.EE
+.in
+.PP
+The last few lines of the above output show that the example program
+was able to fetch:
+.IP \[bu] 3
+the description of the key to be instantiated,
+which included the name of the key
+.RI ( mykey );
+.IP \[bu]
+the payload of the authorization key, which consisted of the data
+.RI ( somepayloaddata )
+passed to
+.BR request_key (2);
+.IP \[bu]
+the destination keyring that was specified in the call to
+.BR request_key (2);
+and
+.IP \[bu]
+the description of the authorization key,
+where we can see that the name of the authorization key matches
+the ID of the key that is to be instantiated
+.RI ( 20d035bf ).
+.PP
+The example program in
+.BR request_key (2)
+specified the destination keyring as
+.BR KEY_SPEC_SESSION_KEYRING .
+By examining the contents of
+.IR /proc/keys ,
+we can see that this was translated to the ID of the destination keyring
+.RI ( 0256e6a6 )
+shown in the log output above;
+we can also see the newly created key with the name
+.I mykey
+and ID
+.IR 20d035bf .
+.PP
+.in +4n
+.EX
+$ \fBcat /proc/keys | egrep \[aq]mykey|256e6a6\[aq]\fP
+0256e6a6 I\-\-Q\-\-\- 194 perm 3f030000 1000 1000 keyring _ses: 3
+20d035bf I\-\-Q\-\-\- 1 perm 3f010000 1000 1000 user mykey: 16
+.EE
+.in
+.SS Program source
+\&
+.\" SRC BEGIN (key_instantiate.c)
+.EX
+/* key_instantiate.c */
+\&
+#include <errno.h>
+#include <keyutils.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <time.h>
+\&
+#ifndef KEY_SPEC_REQUESTOR_KEYRING
+#define KEY_SPEC_REQUESTOR_KEYRING (\-8)
+#endif
+\&
+int
+main(int argc, char *argv[])
+{
+ int akp_size; /* Size of auth_key_payload */
+ int auth_key;
+ char dbuf[256];
+ char auth_key_payload[256];
+ char *operation;
+ FILE *fp;
+ gid_t gid;
+ uid_t uid;
+ time_t t;
+ key_serial_t key_to_instantiate, dest_keyring;
+ key_serial_t thread_keyring, process_keyring, session_keyring;
+\&
+ if (argc != 8) {
+ fprintf(stderr, "Usage: %s op key uid gid thread_keyring "
+ "process_keyring session_keyring\en", argv[0]);
+ exit(EXIT_FAILURE);
+ }
+\&
+ fp = fopen("/tmp/key_instantiate.log", "w");
+ if (fp == NULL)
+ exit(EXIT_FAILURE);
+\&
+ setbuf(fp, NULL);
+\&
+ t = time(NULL);
+ fprintf(fp, "Time: %s\en", ctime(&t));
+\&
+ /*
+ * The kernel passes a fixed set of arguments to the program
+ * that it execs; fetch them.
+ */
+ operation = argv[1];
+ key_to_instantiate = atoi(argv[2]);
+ uid = atoi(argv[3]);
+ gid = atoi(argv[4]);
+ thread_keyring = atoi(argv[5]);
+ process_keyring = atoi(argv[6]);
+ session_keyring = atoi(argv[7]);
+\&
+ fprintf(fp, "Command line arguments:\en");
+ fprintf(fp, " argv[0]: %s\en", argv[0]);
+ fprintf(fp, " operation: %s\en", operation);
+ fprintf(fp, " key_to_instantiate: %jx\en",
+ (uintmax_t) key_to_instantiate);
+ fprintf(fp, " UID: %jd\en", (intmax_t) uid);
+ fprintf(fp, " GID: %jd\en", (intmax_t) gid);
+ fprintf(fp, " thread_keyring: %jx\en",
+ (uintmax_t) thread_keyring);
+ fprintf(fp, " process_keyring: %jx\en",
+ (uintmax_t) process_keyring);
+ fprintf(fp, " session_keyring: %jx\en",
+ (uintmax_t) session_keyring);
+ fprintf(fp, "\en");
+\&
+ /*
+ * Assume the authority to instantiate the key named in argv[2].
+ */
+ if (keyctl(KEYCTL_ASSUME_AUTHORITY, key_to_instantiate) == \-1) {
+ fprintf(fp, "KEYCTL_ASSUME_AUTHORITY failed: %s\en",
+ strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+\&
+ /*
+ * Fetch the description of the key that is to be instantiated.
+ */
+ if (keyctl(KEYCTL_DESCRIBE, key_to_instantiate,
+ dbuf, sizeof(dbuf)) == \-1) {
+ fprintf(fp, "KEYCTL_DESCRIBE failed: %s\en", strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+\&
+ fprintf(fp, "Key description: %s\en", dbuf);
+\&
+ /*
+ * Fetch the payload of the authorization key, which is
+ * actually the callout data given to request_key().
+ */
+ akp_size = keyctl(KEYCTL_READ, KEY_SPEC_REQKEY_AUTH_KEY,
+ auth_key_payload, sizeof(auth_key_payload));
+ if (akp_size == \-1) {
+ fprintf(fp, "KEYCTL_READ failed: %s\en", strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+\&
+ auth_key_payload[akp_size] = \[aq]\e0\[aq];
+ fprintf(fp, "Auth key payload: %s\en", auth_key_payload);
+\&
+ /*
+ * For interest, get the ID of the authorization key and
+ * display it.
+ */
+ auth_key = keyctl(KEYCTL_GET_KEYRING_ID,
+ KEY_SPEC_REQKEY_AUTH_KEY);
+ if (auth_key == \-1) {
+ fprintf(fp, "KEYCTL_GET_KEYRING_ID failed: %s\en",
+ strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+\&
+ fprintf(fp, "Auth key ID: %jx\en", (uintmax_t) auth_key);
+\&
+ /*
+ * Fetch key ID for the request_key(2) destination keyring.
+ */
+ dest_keyring = keyctl(KEYCTL_GET_KEYRING_ID,
+ KEY_SPEC_REQUESTOR_KEYRING);
+ if (dest_keyring == \-1) {
+ fprintf(fp, "KEYCTL_GET_KEYRING_ID failed: %s\en",
+ strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+\&
+ fprintf(fp, "Destination keyring: %jx\en", (uintmax_t) dest_keyring);
+\&
+ /*
+ * Fetch the description of the authorization key. This
+ * allows us to see the key type, UID, GID, permissions,
+ * and description (name) of the key. Among other things,
+ * we will see that the name of the key is a hexadecimal
+ * string representing the ID of the key to be instantiated.
+ */
+ if (keyctl(KEYCTL_DESCRIBE, KEY_SPEC_REQKEY_AUTH_KEY,
+ dbuf, sizeof(dbuf)) == \-1)
+ {
+ fprintf(fp, "KEYCTL_DESCRIBE failed: %s\en", strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+\&
+ fprintf(fp, "Auth key description: %s\en", dbuf);
+\&
+ /*
+ * Instantiate the key using the callout data that was supplied
+ * in the payload of the authorization key.
+ */
+ if (keyctl(KEYCTL_INSTANTIATE, key_to_instantiate,
+ auth_key_payload, akp_size + 1, dest_keyring) == \-1)
+ {
+ fprintf(fp, "KEYCTL_INSTANTIATE failed: %s\en",
+ strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+\&
+ exit(EXIT_SUCCESS);
+}
+.EE
+.\" SRC END
+.SH SEE ALSO
+.ad l
+.nh
+.BR keyctl (1),
+.BR add_key (2),
+.BR request_key (2),
+.\" .BR find_key_by_type_and_name (3)
+.\" There is a man page, but this function seems not to exist
+.BR keyctl (3),
+.BR keyctl_assume_authority (3),
+.BR keyctl_chown (3),
+.BR keyctl_clear (3),
+.BR keyctl_describe (3),
+.BR keyctl_describe_alloc (3),
+.BR keyctl_dh_compute (3),
+.BR keyctl_dh_compute_alloc (3),
+.BR keyctl_get_keyring_ID (3),
+.BR keyctl_get_persistent (3),
+.BR keyctl_get_security (3),
+.BR keyctl_get_security_alloc (3),
+.BR keyctl_instantiate (3),
+.BR keyctl_instantiate_iov (3),
+.BR keyctl_invalidate (3),
+.BR keyctl_join_session_keyring (3),
+.BR keyctl_link (3),
+.BR keyctl_negate (3),
+.BR keyctl_read (3),
+.BR keyctl_read_alloc (3),
+.BR keyctl_reject (3),
+.BR keyctl_revoke (3),
+.BR keyctl_search (3),
+.BR keyctl_session_to_parent (3),
+.BR keyctl_set_reqkey_keyring (3),
+.BR keyctl_set_timeout (3),
+.BR keyctl_setperm (3),
+.BR keyctl_unlink (3),
+.BR keyctl_update (3),
+.BR recursive_key_scan (3),
+.BR recursive_session_key_scan (3),
+.BR capabilities (7),
+.BR credentials (7),
+.BR keyrings (7),
+.BR keyutils (7),
+.BR persistent\-keyring (7),
+.BR process\-keyring (7),
+.BR session\-keyring (7),
+.BR thread\-keyring (7),
+.BR user\-keyring (7),
+.BR user_namespaces (7),
+.BR user\-session\-keyring (7),
+.BR request\-key (8)
+.PP
+The kernel source files under
+.I Documentation/security/keys/
+(or, before Linux 4.13, in the file
+.IR Documentation/security/keys.txt ).
diff --git a/man2/kill.2 b/man2/kill.2
new file mode 100644
index 0000000..d0a2e6f
--- /dev/null
+++ b/man2/kill.2
@@ -0,0 +1,165 @@
+.\" Copyright (c) 1992 Drew Eckhardt (drew@cs.colorado.edu), March 28, 1992
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified by Michael Haardt <michael@moria.de>
+.\" Modified by Thomas Koenig <ig25@rz.uni-karlsruhe.de>
+.\" Modified 1993-07-23 by Rik Faith <faith@cs.unc.edu>
+.\" Modified 1993-07-25 by Rik Faith <faith@cs.unc.edu>
+.\" Modified 1995-11-01 by Michael Haardt
+.\" <michael@cantor.informatik.rwth-aachen.de>
+.\" Modified 1996-04-14 by Andries Brouwer <aeb@cwi.nl>
+.\" [added some polishing contributed by Mike Battersby <mib@deakin.edu.au>]
+.\" Modified 1996-07-21 by Andries Brouwer <aeb@cwi.nl>
+.\" Modified 1997-01-17 by Andries Brouwer <aeb@cwi.nl>
+.\" Modified 2001-12-18 by Andries Brouwer <aeb@cwi.nl>
+.\" Modified 2002-07-24 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Added note on historical rules enforced when an unprivileged process
+.\" sends a signal.
+.\" Modified 2004-06-16 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Added note on CAP_KILL
+.\" Modified 2004-06-24 by aeb
+.\" Modified, 2004-11-30, after idea from emmanuel.colbus@ensimag.imag.fr
+.\"
+.TH kill 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+kill \- send signal to a process
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <signal.h>
+.PP
+.BI "int kill(pid_t " pid ", int " sig );
+.fi
+.PP
+.RS -4
+Feature Test Macro Requirements for glibc (see
+.BR feature_test_macros (7)):
+.RE
+.PP
+.BR kill ():
+.nf
+ _POSIX_C_SOURCE
+.fi
+.SH DESCRIPTION
+The
+.BR kill ()
+system call
+can be used to send any signal to any process group or process.
+.PP
+If \fIpid\fP is positive, then signal \fIsig\fP is sent to the
+process with the ID specified by \fIpid\fP.
+.PP
+If \fIpid\fP equals 0, then \fIsig\fP is sent to every process in the
+process group of the calling process.
+.PP
+If \fIpid\fP equals \-1, then \fIsig\fP is sent to every process
+for which the calling process has permission to send signals,
+except for process 1 (\fIinit\fP), but see below.
+.PP
+If \fIpid\fP is less than \-1, then \fIsig\fP is sent to every process
+in the process group whose ID is \fI\-pid\fP.
+.PP
+If \fIsig\fP is 0, then no signal is sent,
+but existence and permission checks are still performed;
+this can be used to check for the existence of a process ID or
+process group ID that the caller is permitted to signal.
+.PP
+For a process to have permission to send a signal,
+it must either be privileged (under Linux: have the
+.B CAP_KILL
+capability in the user namespace of the target process),
+or the real or effective user ID of the sending process must equal
+the real or saved set-user-ID of the target process.
+In the case of
+.BR SIGCONT ,
+it suffices when the sending and receiving
+processes belong to the same session.
+(Historically, the rules were different; see NOTES.)
+.SH RETURN VALUE
+On success (at least one signal was sent), zero is returned.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EINVAL
+An invalid signal was specified.
+.TP
+.B EPERM
+The calling process does not have permission to send the signal
+to any of the target processes.
+.TP
+.B ESRCH
+The target process or process group does not exist.
+Note that an existing process might be a zombie,
+a process that has terminated execution, but
+has not yet been
+.BR wait (2)ed
+for.
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+POSIX.1-2001, SVr4, 4.3BSD.
+.SS Linux notes
+Across different kernel versions, Linux has enforced different rules
+for the permissions required for an unprivileged process
+to send a signal to another process.
+.\" In the 0.* kernels things chopped and changed quite
+.\" a bit - MTK, 24 Jul 02
+In Linux 1.0 to 1.2.2, a signal could be sent if the
+effective user ID of the sender matched the effective user ID of the target,
+or the real user ID of the sender matched the real user ID of the target.
+From Linux 1.2.3 until 1.3.77, a signal could be sent if the
+effective user ID of the sender matched either the real or effective
+user ID of the target.
+The current rules, which conform to POSIX.1, were adopted
+in Linux 1.3.78.
+.SH NOTES
+The only signals that can be sent to process ID 1, the
+.I init
+process, are those for which
+.I init
+has explicitly installed signal handlers.
+This is done to ensure the
+system is not brought down accidentally.
+.PP
+POSIX.1 requires that \fIkill(\-1,sig)\fP send \fIsig\fP
+to all processes that the calling process may send signals to,
+except possibly for some implementation-defined system processes.
+Linux allows a process to signal itself, but on Linux the call
+\fIkill(\-1,sig)\fP does not signal the calling process.
+.PP
+POSIX.1 requires that if a process sends a signal to itself,
+and the sending thread does not have the signal blocked,
+and no other thread
+has it unblocked or is waiting for it in
+.BR sigwait (3),
+at least one
+unblocked signal must be delivered to the sending thread before the
+.BR kill ()
+returns.
+.SH BUGS
+In Linux 2.6 up to and including Linux 2.6.7,
+there was a bug that meant that when sending signals to a process group,
+.BR kill ()
+failed with the error
+.B EPERM
+if the caller did not have permission to send the signal to \fIany\fP (rather
+than \fIall\fP) of the members of the process group.
+Notwithstanding this error return, the signal was still delivered
+to all of the processes for which the caller had permission to signal.
+.SH SEE ALSO
+.BR kill (1),
+.BR _exit (2),
+.BR pidfd_send_signal (2),
+.BR signal (2),
+.BR tkill (2),
+.BR exit (3),
+.BR killpg (3),
+.BR sigqueue (3),
+.BR capabilities (7),
+.BR credentials (7),
+.BR signal (7)
diff --git a/man2/landlock_add_rule.2 b/man2/landlock_add_rule.2
new file mode 100644
index 0000000..28d5417
--- /dev/null
+++ b/man2/landlock_add_rule.2
@@ -0,0 +1,131 @@
+.\" Copyright © 2017-2020 Mickaël Salaün <mic@digikod.net>
+.\" Copyright © 2019-2020 ANSSI
+.\" Copyright © 2021 Microsoft Corporation
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.TH landlock_add_rule 2 2023-07-08 "Linux man-pages 6.05.01"
+.SH NAME
+landlock_add_rule \- add a new Landlock rule to a ruleset
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#include <linux/landlock.h>" " /* Definition of " LANDLOCK_* " constants */"
+.BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */"
+.PP
+.BI "int syscall(SYS_landlock_add_rule, int " ruleset_fd ,
+.BI " enum landlock_rule_type " rule_type ,
+.BI " const void *" rule_attr ", uint32_t " flags );
+.fi
+.SH DESCRIPTION
+A Landlock rule describes an action on an object.
+An object is currently a file hierarchy,
+and the related filesystem actions
+are defined with a set of access rights.
+This
+.BR landlock_add_rule ()
+system call enables adding a new Landlock rule to an existing ruleset
+created with
+.BR landlock_create_ruleset (2).
+See
+.BR landlock (7)
+for a global overview.
+.PP
+.I ruleset_fd
+is a Landlock ruleset file descriptor obtained with
+.BR landlock_create_ruleset (2).
+.PP
+.I rule_type
+identifies the structure type pointed to by
+.IR rule_attr .
+Currently, Linux supports the following
+.I rule_type
+value:
+.TP
+.B LANDLOCK_RULE_PATH_BENEATH
+This defines the object type as a file hierarchy.
+In this case,
+.I rule_attr
+points to the following structure:
+.IP
+.in +4n
+.EX
+struct landlock_path_beneath_attr {
+ __u64 allowed_access;
+ __s32 parent_fd;
+} __attribute__((packed));
+.EE
+.in
+.IP
+.I allowed_access
+contains a bitmask of allowed filesystem actions for this file hierarchy
+(see
+.B Filesystem actions
+in
+.BR landlock (7)).
+.IP
+.I parent_fd
+is an opened file descriptor, preferably with the
+.B O_PATH
+flag,
+which identifies the parent directory of the file hierarchy or
+just a file.
+.PP
+.I flags
+must be 0.
+.SH RETURN VALUE
+On success,
+.BR landlock_add_rule ()
+returns 0.
+.SH ERRORS
+.BR landlock_add_rule ()
+can fail for the following reasons:
+.TP
+.B EOPNOTSUPP
+Landlock is supported by the kernel but disabled at boot time.
+.TP
+.B EINVAL
+.I flags
+is not 0, or the rule accesses are inconsistent (i.e.,
+.I rule_attr\->allowed_access
+is not a subset of the ruleset handled accesses).
+.TP
+.B ENOMSG
+Empty accesses (i.e.,
+.I rule_attr\->allowed_access
+is 0).
+.TP
+.B EBADF
+.I ruleset_fd
+is not a file descriptor for the current thread,
+or a member of
+.I rule_attr
+is not a file descriptor as expected.
+.TP
+.B EBADFD
+.I ruleset_fd
+is not a ruleset file descriptor,
+or a member of
+.I rule_attr
+is not the expected file descriptor type.
+.TP
+.B EPERM
+.I ruleset_fd
+has no write access to the underlying ruleset.
+.TP
+.B EFAULT
+.I rule_attr
+was not a valid address.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 5.13.
+.SH EXAMPLES
+See
+.BR landlock (7).
+.SH SEE ALSO
+.BR landlock_create_ruleset (2),
+.BR landlock_restrict_self (2),
+.BR landlock (7)
diff --git a/man2/landlock_create_ruleset.2 b/man2/landlock_create_ruleset.2
new file mode 100644
index 0000000..faadb57
--- /dev/null
+++ b/man2/landlock_create_ruleset.2
@@ -0,0 +1,124 @@
+.\" Copyright © 2017-2020 Mickaël Salaün <mic@digikod.net>
+.\" Copyright © 2019-2020 ANSSI
+.\" Copyright © 2021 Microsoft Corporation
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.TH landlock_create_ruleset 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+landlock_create_ruleset \- create a new Landlock ruleset
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#include <linux/landlock.h>" " /* Definition of " LANDLOCK_* " constants */"
+.BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */"
+.PP
+.B int syscall(SYS_landlock_create_ruleset,
+.BI " const struct landlock_ruleset_attr *" attr ,
+.BI " size_t " size ", uint32_t " flags );
+.fi
+.SH DESCRIPTION
+A Landlock ruleset identifies a set of rules (i.e., actions on objects).
+This
+.BR landlock_create_ruleset ()
+system call enables creating a new file descriptor identifying a ruleset.
+This file descriptor can then be used by
+.BR landlock_add_rule (2)
+and
+.BR landlock_restrict_self (2).
+See
+.BR landlock (7)
+for a global overview.
+.PP
+.I attr
+specifies the properties of the new ruleset.
+It points to the following structure:
+.IP
+.in +4n
+.EX
+struct landlock_ruleset_attr {
+ __u64 handled_access_fs;
+};
+.EE
+.in
+.IP
+.I handled_access_fs
+is a bitmask of actions that are handled by this ruleset and
+should then be forbidden if no rule explicitly allows them
+(see
+.B Filesystem actions
+in
+.BR landlock (7)).
+This enables simply restricting ambient rights
+(e.g., global filesystem access) and is needed for compatibility reasons.
+.PP
+.I size
+must be specified as
+.I sizeof(struct landlock_ruleset_attr)
+for compatibility reasons.
+.PP
+.I flags
+must be 0 if
+.I attr
+is used.
+Otherwise,
+.I flags
+can be set to:
+.TP
+.B LANDLOCK_CREATE_RULESET_VERSION
+If
+.I attr
+is NULL and
+.I size
+is 0, then the returned value is the highest supported Landlock ABI version
+(starting at 1).
+This version can be used for a best-effort security approach,
+which is encouraged when user space is not pinned to a specific kernel
+version.
+All features documented in these man pages are available with
+version 1.
+.SH RETURN VALUE
+On success,
+.BR landlock_create_ruleset ()
+returns a new Landlock ruleset file descriptor,
+or a Landlock ABI version,
+according to
+.IR flags .
+.SH ERRORS
+.BR landlock_create_ruleset ()
+can fail for the following reasons:
+.TP
+.B EOPNOTSUPP
+Landlock is supported by the kernel but disabled at boot time.
+.TP
+.B EINVAL
+Unknown
+.IR flags ,
+or unknown access, or too small
+.IR size .
+.TP
+.B E2BIG
+.I size
+is too big.
+.TP
+.B EFAULT
+.I attr
+was not a valid address.
+.TP
+.B ENOMSG
+Empty accesses (i.e.,
+.I attr\->handled_access_fs
+is 0).
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 5.13.
+.SH EXAMPLES
+See
+.BR landlock (7).
+.SH SEE ALSO
+.BR landlock_add_rule (2),
+.BR landlock_restrict_self (2),
+.BR landlock (7)
diff --git a/man2/landlock_restrict_self.2 b/man2/landlock_restrict_self.2
new file mode 100644
index 0000000..f02c3a1
--- /dev/null
+++ b/man2/landlock_restrict_self.2
@@ -0,0 +1,116 @@
+.\" Copyright © 2017-2020 Mickaël Salaün <mic@digikod.net>
+.\" Copyright © 2019-2020 ANSSI
+.\" Copyright © 2021 Microsoft Corporation
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.TH landlock_restrict_self 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+landlock_restrict_self \- enforce a Landlock ruleset
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#include <linux/landlock.h>" " /* Definition of " LANDLOCK_* " constants */"
+.BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */"
+.PP
+.BI "int syscall(SYS_landlock_restrict_self, int " ruleset_fd ,
+.BI " uint32_t " flags );
+.SH DESCRIPTION
+Once a Landlock ruleset is populated with the desired rules, the
+.BR landlock_restrict_self ()
+system call enables enforcing this ruleset on the calling thread.
+See
+.BR landlock (7)
+for a global overview.
+.PP
+A thread can be restricted with multiple rulesets that are then
+composed together to form the thread's Landlock domain.
+This can be seen as a stack of rulesets but
+it is implemented in a more efficient way.
+A domain can only be updated in such a way that
+the constraints of each past and future composed ruleset
+will restrict the thread and its future children for their entire life.
+It is then possible to gradually enforce tailored access control policies
+with multiple independent rulesets coming from different sources
+(e.g., init system configuration, user session policy,
+built-in application policy).
+However, most applications should only need one call to
+.BR landlock_restrict_self ()
+and they should avoid arbitrary numbers of such calls because of the
+composed rulesets limit.
+Instead, developers are encouraged to build a tailored ruleset thanks to
+multiple calls to
+.BR landlock_add_rule (2).
+.PP
+In order to enforce a ruleset, either the caller must have the
+.B CAP_SYS_ADMIN
+capability in its user namespace, or the thread must already have the
+.I no_new_privs
+bit set.
+As for
+.BR seccomp (2),
+this avoids scenarios where unprivileged processes can affect
+the behavior of privileged children (e.g., because of set-user-ID binaries).
+If that bit was not already set by an ancestor of this thread,
+the thread must make the following call:
+.IP
+.EX
+prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+.EE
+.PP
+.I ruleset_fd
+is a Landlock ruleset file descriptor obtained with
+.BR landlock_create_ruleset (2)
+and fully populated with a set of calls to
+.BR landlock_add_rule (2).
+.PP
+.I flags
+must be 0.
+.SH RETURN VALUE
+On success,
+.BR landlock_restrict_self ()
+returns 0.
+.SH ERRORS
+.BR landlock_restrict_self ()
+can fail for the following reasons:
+.TP
+.B EOPNOTSUPP
+Landlock is supported by the kernel but disabled at boot time.
+.TP
+.B EINVAL
+.I flags
+is not 0.
+.TP
+.B EBADF
+.I ruleset_fd
+is not a file descriptor for the current thread.
+.TP
+.B EBADFD
+.I ruleset_fd
+is not a ruleset file descriptor.
+.TP
+.B EPERM
+.I ruleset_fd
+has no read access to the underlying ruleset,
+or the calling thread is not running with
+.I no_new_privs
+and does not have the
+.B CAP_SYS_ADMIN
+capability in its user namespace.
+.TP
+.B E2BIG
+The maximum number of composed rulesets is reached for the calling thread.
+This limit is currently 64.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 5.13.
+.SH EXAMPLES
+See
+.BR landlock (7).
+.SH SEE ALSO
+.BR landlock_create_ruleset (2),
+.BR landlock_add_rule (2),
+.BR landlock (7)
diff --git a/man2/lchown.2 b/man2/lchown.2
new file mode 100644
index 0000000..f0a5635
--- /dev/null
+++ b/man2/lchown.2
@@ -0,0 +1 @@
+.so man2/chown.2
diff --git a/man2/lchown32.2 b/man2/lchown32.2
new file mode 100644
index 0000000..8ed3964
--- /dev/null
+++ b/man2/lchown32.2
@@ -0,0 +1 @@
+.so man2/lchown.2
diff --git a/man2/lgetxattr.2 b/man2/lgetxattr.2
new file mode 100644
index 0000000..d9e5d90
--- /dev/null
+++ b/man2/lgetxattr.2
@@ -0,0 +1 @@
+.so man2/getxattr.2
diff --git a/man2/link.2 b/man2/link.2
new file mode 100644
index 0000000..1533409
--- /dev/null
+++ b/man2/link.2
@@ -0,0 +1,425 @@
+.\" This manpage is Copyright (C) 1992 Drew Eckhardt;
+.\" and Copyright (C) 1993 Michael Haardt, Ian Jackson.
+.\" and Copyright (C) 2006, 2014 Michael Kerrisk
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified 1993-07-23 by Rik Faith <faith@cs.unc.edu>
+.\" Modified 1994-08-21 by Michael Haardt
+.\" Modified 2004-06-23 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Modified 2005-04-04, as per suggestion by Michael Hardt for rename.2
+.\"
+.TH link 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+link, linkat \- make a new name for a file
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <unistd.h>
+.PP
+.BI "int link(const char *" oldpath ", const char *" newpath );
+.PP
+.BR "#include <fcntl.h> " "/* Definition of " AT_* " constants */"
+.B #include <unistd.h>
+.PP
+.BI "int linkat(int " olddirfd ", const char *" oldpath ,
+.BI " int " newdirfd ", const char *" newpath ", int " flags );
+.fi
+.PP
+.RS -4
+Feature Test Macro Requirements for glibc (see
+.BR feature_test_macros (7)):
+.RE
+.PP
+.BR linkat ():
+.nf
+ Since glibc 2.10:
+ _POSIX_C_SOURCE >= 200809L
+ Before glibc 2.10:
+ _ATFILE_SOURCE
+.fi
+.SH DESCRIPTION
+.BR link ()
+creates a new link (also known as a hard link) to an existing file.
+.PP
+If
+.I newpath
+exists, it will
+.I not
+be overwritten.
+.PP
+This new name may be used exactly as the old one for any operation;
+both names refer to the same file (and so have the same permissions
+and ownership) and it is impossible to tell which name was the
+"original".
+.SS linkat()
+The
+.BR linkat ()
+system call operates in exactly the same way as
+.BR link (),
+except for the differences described here.
+.PP
+If the pathname given in
+.I oldpath
+is relative, then it is interpreted relative to the directory
+referred to by the file descriptor
+.I olddirfd
+(rather than relative to the current working directory of
+the calling process, as is done by
+.BR link ()
+for a relative pathname).
+.PP
+If
+.I oldpath
+is relative and
+.I olddirfd
+is the special value
+.BR AT_FDCWD ,
+then
+.I oldpath
+is interpreted relative to the current working
+directory of the calling process (like
+.BR link ()).
+.PP
+If
+.I oldpath
+is absolute, then
+.I olddirfd
+is ignored.
+.PP
+The interpretation of
+.I newpath
+is as for
+.IR oldpath ,
+except that a relative pathname is interpreted relative
+to the directory referred to by the file descriptor
+.IR newdirfd .
+.PP
+The following values can be bitwise ORed in
+.IR flags :
+.TP
+.BR AT_EMPTY_PATH " (since Linux 2.6.39)"
+.\" commit 11a7b371b64ef39fc5fb1b6f2218eef7c4d035e3
+If
+.I oldpath
+is an empty string, create a link to the file referenced by
+.I olddirfd
+(which may have been obtained using the
+.BR open (2)
+.B O_PATH
+flag).
+In this case,
+.I olddirfd
+can refer to any type of file except a directory.
+This will generally not work if the file has a link count of zero (files
+created with
+.B O_TMPFILE
+and without
+.B O_EXCL
+are an exception).
+The caller must have the
+.B CAP_DAC_READ_SEARCH
+capability in order to use this flag.
+This flag is Linux-specific; define
+.B _GNU_SOURCE
+.\" Before glibc 2.16, defining _ATFILE_SOURCE sufficed
+to obtain its definition.
+.TP
+.BR AT_SYMLINK_FOLLOW " (since Linux 2.6.18)"
+By default,
+.BR linkat ()
+does not dereference
+.I oldpath
+if it is a symbolic link (like
+.BR link ()).
+The flag
+.B AT_SYMLINK_FOLLOW
+can be specified in
+.I flags
+to cause
+.I oldpath
+to be dereferenced if it is a symbolic link.
+If procfs is mounted,
+this can be used as an alternative to
+.BR AT_EMPTY_PATH ,
+like this:
+.IP
+.in +4n
+.EX
+linkat(AT_FDCWD, "/proc/self/fd/<fd>", newdirfd,
+ newname, AT_SYMLINK_FOLLOW);
+.EE
+.in
+.PP
+Before Linux 2.6.18, the
+.I flags
+argument was unused, and had to be specified as 0.
+.PP
+See
+.BR openat (2)
+for an explanation of the need for
+.BR linkat ().
+.SH RETURN VALUE
+On success, zero is returned.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EACCES
+Write access to the directory containing
+.I newpath
+is denied, or search permission is denied for one of the directories
+in the path prefix of
+.I oldpath
+or
+.IR newpath .
+(See also
+.BR path_resolution (7).)
+.TP
+.B EDQUOT
+The user's quota of disk blocks on the filesystem has been exhausted.
+.TP
+.B EEXIST
+.I newpath
+already exists.
+.TP
+.B EFAULT
+.IR oldpath " or " newpath " points outside your accessible address space."
+.TP
+.B EIO
+An I/O error occurred.
+.TP
+.B ELOOP
+Too many symbolic links were encountered in resolving
+.IR oldpath " or " newpath .
+.TP
+.B EMLINK
+The file referred to by
+.I oldpath
+already has the maximum number of links to it.
+For example, on an
+.BR ext4 (5)
+filesystem that does not employ the
+.I dir_index
+feature, the limit on the number of hard links to a file is 65,000; on
+.BR btrfs (5),
+the limit is 65,535 links.
+.TP
+.B ENAMETOOLONG
+.IR oldpath " or " newpath " was too long."
+.TP
+.B ENOENT
+A directory component in
+.IR oldpath " or " newpath
+does not exist or is a dangling symbolic link.
+.TP
+.B ENOMEM
+Insufficient kernel memory was available.
+.TP
+.B ENOSPC
+The device containing the file has no room for the new directory
+entry.
+.TP
+.B ENOTDIR
+A component used as a directory in
+.IR oldpath " or " newpath
+is not, in fact, a directory.
+.TP
+.B EPERM
+.I oldpath
+is a directory.
+.TP
+.B EPERM
+The filesystem containing
+.IR oldpath " and " newpath
+does not support the creation of hard links.
+.TP
+.BR EPERM " (since Linux 3.6)"
+The caller does not have permission to create a hard link to this file
+(see the description of
+.I /proc/sys/fs/protected_hardlinks
+in
+.BR proc (5)).
+.TP
+.B EPERM
+.I oldpath
+is marked immutable or append-only.
+(See
+.BR ioctl_iflags (2).)
+.TP
+.B EROFS
+The file is on a read-only filesystem.
+.TP
+.B EXDEV
+.IR oldpath " and " newpath
+are not on the same mounted filesystem.
+(Linux permits a filesystem to be mounted at multiple points, but
+.BR link ()
+does not work across different mounts,
+even if the same filesystem is mounted on both.)
+.PP
+The following additional errors can occur for
+.BR linkat ():
+.TP
+.B EBADF
+.I oldpath
+.RI ( newpath )
+is relative but
+.I olddirfd
+.RI ( newdirfd )
+is neither
+.B AT_FDCWD
+nor a valid file descriptor.
+.TP
+.B EINVAL
+An invalid flag value was specified in
+.IR flags .
+.TP
+.B ENOENT
+.B AT_EMPTY_PATH
+was specified in
+.IR flags ,
+but the caller did not have the
+.B CAP_DAC_READ_SEARCH
+capability.
+.TP
+.B ENOENT
+An attempt was made to link to the
+.I /proc/self/fd/NN
+file corresponding to a file descriptor created with
+.IP
+.in +4n
+.EX
+open(path, O_TMPFILE | O_EXCL, mode);
+.EE
+.in
+.IP
+See
+.BR open (2).
+.TP
+.B ENOENT
+An attempt was made to link to a
+.I /proc/self/fd/NN
+file corresponding to a file that has been deleted.
+.TP
+.B ENOENT
+.I oldpath
+is a relative pathname and
+.I olddirfd
+refers to a directory that has been deleted,
+or
+.I newpath
+is a relative pathname and
+.I newdirfd
+refers to a directory that has been deleted.
+.TP
+.B ENOTDIR
+.I oldpath
+is relative and
+.I olddirfd
+is a file descriptor referring to a file other than a directory;
+or similar for
+.I newpath
+and
+.IR newdirfd .
+.TP
+.B EPERM
+.B AT_EMPTY_PATH
+was specified in
+.IR flags ,
+.I oldpath
+is an empty string, and
+.I olddirfd
+refers to a directory.
+.SH VERSIONS
+POSIX.1-2001 says that
+.BR link ()
+should dereference
+.I oldpath
+if it is a symbolic link.
+However, since Linux 2.0,
+.\" more precisely: since Linux 1.3.56
+Linux does not do so: if
+.I oldpath
+is a symbolic link, then
+.I newpath
+is created as a (hard) link to the same symbolic link file
+(i.e.,
+.I newpath
+becomes a symbolic link to the same file that
+.I oldpath
+refers to).
+Some other implementations behave in the same manner as Linux.
+.\" For example, the default Solaris compilation environment
+.\" behaves like Linux, and contributors to a March 2005
+.\" thread in the Austin mailing list reported that some
+.\" other (System V) implementations did/do the same -- MTK, Apr 05
+POSIX.1-2008 changes the specification of
+.BR link (),
+making it implementation-dependent whether or not
+.I oldpath
+is dereferenced if it is a symbolic link.
+For precise control over the treatment of symbolic links when
+creating a link, use
+.BR linkat ().
+.SS glibc
+On older kernels where
+.BR linkat ()
+is unavailable, the glibc wrapper function falls back to the use of
+.BR link (),
+unless the
+.B AT_SYMLINK_FOLLOW
+flag is specified.
+When
+.I oldpath
+and
+.I newpath
+are relative pathnames,
+glibc constructs pathnames based on the symbolic links in
+.I /proc/self/fd
+that correspond to the
+.I olddirfd
+and
+.I newdirfd
+arguments.
+.SH STANDARDS
+.TP
+.BR link ()
+POSIX.1-2008.
+.SH HISTORY
+.TP
+.BR link ()
+SVr4, 4.3BSD, POSIX.1-2001 (but see VERSIONS).
+.\" SVr4 documents additional ENOLINK and
+.\" EMULTIHOP error conditions; POSIX.1 does not document ELOOP.
+.\" X/OPEN does not document EFAULT, ENOMEM or EIO.
+.TP
+.BR linkat ()
+POSIX.1-2008.
+Linux 2.6.16,
+glibc 2.4.
+.SH NOTES
+Hard links, as created by
+.BR link (),
+cannot span filesystems.
+Use
+.BR symlink (2)
+if this is required.
+.SH BUGS
+On NFS filesystems, the return code may be wrong in case the NFS server
+performs the link creation and dies before it can say so.
+Use
+.BR stat (2)
+to find out if the link got created.
+.SH SEE ALSO
+.BR ln (1),
+.BR open (2),
+.BR rename (2),
+.BR stat (2),
+.BR symlink (2),
+.BR unlink (2),
+.BR path_resolution (7),
+.BR symlink (7)
diff --git a/man2/linkat.2 b/man2/linkat.2
new file mode 100644
index 0000000..a7d6da5
--- /dev/null
+++ b/man2/linkat.2
@@ -0,0 +1 @@
+.so man2/link.2
diff --git a/man2/listen.2 b/man2/listen.2
new file mode 100644
index 0000000..9512366
--- /dev/null
+++ b/man2/listen.2
@@ -0,0 +1,155 @@
+.\" Copyright (c) 1983, 1991 The Regents of the University of California.
+.\" and Copyright (C) 2007, Michael Kerrisk <mtk.manpages@gmail.com>
+.\" All rights reserved.
+.\"
+.\" SPDX-License-Identifier: BSD-4-Clause-UC
+.\"
+.\" $Id: listen.2,v 1.6 1999/05/18 14:10:32 freitag Exp $
+.\"
+.\" Modified Fri Jul 23 22:07:54 1993 by Rik Faith <faith@cs.unc.edu>
+.\" Modified 950727 by aeb, following a suggestion by Urs Thuermann
+.\" <urs@isnogud.escape.de>
+.\" Modified Tue Oct 22 08:11:14 EDT 1996 by Eric S. Raymond <esr@thyrsus.com>
+.\" Modified 1998 by Andi Kleen
+.\" Modified 11 May 2001 by Sam Varshavchik <mrsam@courier-mta.com>
+.\"
+.\"
+.TH listen 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+listen \- listen for connections on a socket
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/socket.h>
+.PP
+.BI "int listen(int " sockfd ", int " backlog );
+.fi
+.SH DESCRIPTION
+.BR listen ()
+marks the socket referred to by
+.I sockfd
+as a passive socket, that is, as a socket that will
+be used to accept incoming connection requests using
+.BR accept (2).
+.PP
+The
+.I sockfd
+argument is a file descriptor that refers to a socket of type
+.B SOCK_STREAM
+or
+.BR SOCK_SEQPACKET .
+.PP
+The
+.I backlog
+argument defines the maximum length
+to which the queue of pending connections for
+.I sockfd
+may grow.
+If a connection request arrives when the queue is full, the client
+may receive an error with an indication of
+.B ECONNREFUSED
+or, if the underlying protocol supports retransmission, the request may be
+ignored so that a later reattempt at connection succeeds.
+.SH RETURN VALUE
+On success, zero is returned.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EADDRINUSE
+Another socket is already listening on the same port.
+.TP
+.B EADDRINUSE
+(Internet domain sockets)
+The socket referred to by
+.I sockfd
+had not previously been bound to an address and,
+upon attempting to bind it to an ephemeral port,
+it was determined that all port numbers in the ephemeral port range
+are currently in use.
+See the discussion of
+.I /proc/sys/net/ipv4/ip_local_port_range
+in
+.BR ip (7).
+.TP
+.B EBADF
+The argument
+.I sockfd
+is not a valid file descriptor.
+.TP
+.B ENOTSOCK
+The file descriptor
+.I sockfd
+does not refer to a socket.
+.TP
+.B EOPNOTSUPP
+The socket is not of a type that supports the
+.BR listen ()
+operation.
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+POSIX.1-2001, 4.4BSD
+(first appeared in 4.2BSD).
+.SH NOTES
+To accept connections, the following steps are performed:
+.RS 4
+.IP (1) 5
+A socket is created with
+.BR socket (2).
+.IP (2)
+The socket is bound to a local address using
+.BR bind (2),
+so that other sockets may be
+.BR connect (2)ed
+to it.
+.IP (3)
+A willingness to accept incoming connections and a queue limit for incoming
+connections are specified with
+.BR listen ().
+.IP (4)
+Connections are accepted with
+.BR accept (2).
+.RE
+.PP
+The behavior of the
+.I backlog
+argument on TCP sockets changed with Linux 2.2.
+Now it specifies the queue length for
+.I completely
+established sockets waiting to be accepted,
+instead of the number of incomplete connection requests.
+The maximum length of the queue for incomplete sockets
+can be set using
+.IR /proc/sys/net/ipv4/tcp_max_syn_backlog .
+When syncookies are enabled there is no logical maximum
+length and this setting is ignored.
+See
+.BR tcp (7)
+for more information.
+.PP
+If the
+.I backlog
+argument is greater than the value in
+.IR /proc/sys/net/core/somaxconn ,
+then it is silently capped to that value.
+Since Linux 5.4, the default in this file is 4096;
+in earlier kernels, the default value is 128.
+Before Linux 2.4.25, this limit was a hard-coded value,
+.BR SOMAXCONN ,
+with the value 128.
+.\" The following is now rather historic information (MTK, Jun 05)
+.\" Don't rely on this value in portable applications since BSD
+.\" (and some BSD-derived systems) limit the backlog to 5.
+.SH EXAMPLES
+See
+.BR bind (2).
+.SH SEE ALSO
+.BR accept (2),
+.BR bind (2),
+.BR connect (2),
+.BR socket (2),
+.BR socket (7)
diff --git a/man2/listxattr.2 b/man2/listxattr.2
new file mode 100644
index 0000000..58f5ce0
--- /dev/null
+++ b/man2/listxattr.2
@@ -0,0 +1,322 @@
+.\" Copyright (C) Andreas Gruenbacher, February 2001
+.\" Copyright (C) Silicon Graphics Inc, September 2001
+.\" Copyright (C) 2015 Heinrich Schuchardt <xypron.glpk@gmx.de>
+.\"
+.\" SPDX-License-Identifier: GPL-2.0-or-later
+.\"
+.TH listxattr 2 2023-05-03 "Linux man-pages 6.05.01"
+.SH NAME
+listxattr, llistxattr, flistxattr \- list extended attribute names
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/xattr.h>
+.PP
+.BI "ssize_t listxattr(const char *" path ", char *_Nullable " list \
+", size_t " size );
+.BI "ssize_t llistxattr(const char *" path ", char *_Nullable " list \
+", size_t " size );
+.BI "ssize_t flistxattr(int " fd ", char *_Nullable " list ", size_t " size );
+.fi
+.SH DESCRIPTION
+Extended attributes are
+.IR name : value
+pairs associated with inodes (files, directories, symbolic links, etc.).
+They are extensions to the normal attributes which are associated
+with all inodes in the system (i.e., the
+.BR stat (2)
+data).
+A complete overview of extended attributes concepts can be found in
+.BR xattr (7).
+.PP
+.BR listxattr ()
+retrieves the list
+of extended attribute names associated with the given
+.I path
+in the filesystem.
+The retrieved list is placed in
+.IR list ,
+a caller-allocated buffer whose size (in bytes) is specified in the argument
+.IR size .
+The list is the set of (null-terminated) names, one after the other.
+Names of extended attributes to which the calling process does not
+have access may be omitted from the list.
+The length of the attribute name
+.I list
+is returned.
+.PP
+.BR llistxattr ()
+is identical to
+.BR listxattr (),
+except in the case of a symbolic link, where the list of names of
+extended attributes associated with the link itself is retrieved,
+not the file that it refers to.
+.PP
+.BR flistxattr ()
+is identical to
+.BR listxattr (),
+only the open file referred to by
+.I fd
+(as returned by
+.BR open (2))
+is interrogated in place of
+.IR path .
+.PP
+A single extended attribute
+.I name
+is a null-terminated string.
+The name includes a namespace prefix; there may be several disjoint
+namespaces associated with an individual inode.
+.PP
+If
+.I size
+is specified as zero, these calls return the current size of the
+list of extended attribute names (and leave
+.I list
+unchanged).
+This can be used to determine the size of the buffer that
+should be supplied in a subsequent call.
+(But, bear in mind that there is a possibility that the
+set of extended attributes may change between the two calls,
+so that it is still necessary to check the return status
+from the second call.)
+.SS Example
+The
+.I list
+of names is returned as an unordered array of null-terminated character strings
+(attribute names are separated by null bytes (\[aq]\e0\[aq])),
+like this:
+.PP
+.in +4n
+.EX
+user.name1\e0system.name1\e0user.name2\e0
+.EE
+.in
+.PP
+Filesystems that implement POSIX ACLs using
+extended attributes might return a
+.I list
+like this:
+.PP
+.in +4n
+.EX
+system.posix_acl_access\e0system.posix_acl_default\e0
+.EE
+.in
+.SH RETURN VALUE
+On success, a nonnegative number is returned indicating the size of the
+extended attribute name list.
+On failure, \-1 is returned and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B E2BIG
+The size of the list of extended attribute names is larger than the maximum
+size allowed; the list cannot be retrieved.
+This can happen on filesystems that support an unlimited number of
+extended attributes per file such as XFS, for example.
+See BUGS.
+.TP
+.B ENOTSUP
+Extended attributes are not supported by the filesystem, or are disabled.
+.TP
+.B ERANGE
+The
+.I size
+of the
+.I list
+buffer is too small to hold the result.
+.PP
+In addition, the errors documented in
+.BR stat (2)
+can also occur.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 2.4,
+glibc 2.3.
+.\" .SH AUTHORS
+.\" Andreas Gruenbacher,
+.\" .RI < a.gruenbacher@computer.org >
+.\" and the SGI XFS development team,
+.\" .RI < linux-xfs@oss.sgi.com >.
+.\" Please send any bug reports or comments to these addresses.
+.SH BUGS
+.\" The xattr(7) page refers to this text:
+As noted in
+.BR xattr (7),
+the VFS imposes a limit of 64\ kB on the size of the extended
+attribute name list returned by
+.BR listxattr ().
+If the total size of attribute names attached to a file exceeds this limit,
+it is no longer possible to retrieve the list of attribute names.
+.SH EXAMPLES
+The following program demonstrates the usage of
+.BR listxattr ()
+and
+.BR getxattr (2).
+For the file whose pathname is provided as a command-line argument,
+it lists all extended file attributes and their values.
+.PP
+To keep the code simple, the program assumes that attribute keys and
+values are constant during the execution of the program.
+A production program should expect and handle changes during
+execution of the program.
+For example,
+the number of bytes required for attribute keys
+might increase between the two calls to
+.BR listxattr ().
+An application could handle this possibility using
+a loop that retries the call
+(perhaps up to a predetermined maximum number of attempts)
+with a larger buffer each time it fails with the error
+.BR ERANGE .
+Calls to
+.BR getxattr (2)
+could be handled similarly.
+.PP
+The following output was recorded by first creating a file, setting
+some extended file attributes,
+and then listing the attributes with the example program.
+.SS Example output
+.in +4n
+.EX
+$ \fBtouch /tmp/foo\fP
+$ \fBsetfattr \-n user.fred \-v chocolate /tmp/foo\fP
+$ \fBsetfattr \-n user.frieda \-v bar /tmp/foo\fP
+$ \fBsetfattr \-n user.empty /tmp/foo\fP
+$ \fB./listxattr /tmp/foo\fP
+user.fred: chocolate
+user.frieda: bar
+user.empty: <no value>
+.EE
+.in
+.SS Program source (listxattr.c)
+.\" SRC BEGIN (listxattr.c)
+.EX
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/xattr.h>
+\&
+int
+main(int argc, char *argv[])
+{
+ char *buf, *key, *val;
+ ssize_t buflen, keylen, vallen;
+\&
+ if (argc != 2) {
+ fprintf(stderr, "Usage: %s path\en", argv[0]);
+ exit(EXIT_FAILURE);
+ }
+\&
+ /*
+ * Determine the length of the buffer needed (a size of 0 queries it).
+ */
+ buflen = listxattr(argv[1], NULL, 0);
+ if (buflen == \-1) {
+ perror("listxattr");
+ exit(EXIT_FAILURE);
+ }
+ if (buflen == 0) {
+ printf("%s has no attributes.\en", argv[1]);
+ exit(EXIT_SUCCESS);
+ }
+\&
+ /*
+ * Allocate the buffer.
+ */
+ buf = malloc(buflen);
+ if (buf == NULL) {
+ perror("malloc");
+ exit(EXIT_FAILURE);
+ }
+\&
+ /*
+ * Copy the list of attribute keys to the buffer.
+ */
+ buflen = listxattr(argv[1], buf, buflen);
+ if (buflen == \-1) {
+ perror("listxattr");
+ exit(EXIT_FAILURE);
+ }
+\&
+ /*
+ * Loop over the list of null-terminated strings with the
+ * attribute keys. Use the remaining buffer length to determine
+ * the end of the list.
+ */
+ key = buf;
+ while (buflen > 0) {
+\&
+ /*
+ * Output attribute key.
+ */
+ printf("%s: ", key);
+\&
+ /*
+ * Determine length of the value (a size of 0 queries it).
+ */
+ vallen = getxattr(argv[1], key, NULL, 0);
+ if (vallen == \-1)
+ perror("getxattr");
+\&
+ if (vallen > 0) {
+\&
+ /*
+ * Allocate value buffer.
+ * One extra byte is needed to append the terminating null byte.
+ */
+ val = malloc(vallen + 1);
+ if (val == NULL) {
+ perror("malloc");
+ exit(EXIT_FAILURE);
+ }
+\&
+ /*
+ * Copy value to buffer.
+ */
+ vallen = getxattr(argv[1], key, val, vallen);
+ if (vallen == \-1) {
+ perror("getxattr");
+ } else {
+ /*
+ * Output attribute value.
+ */
+ val[vallen] = 0;
+ printf("%s", val);
+ }
+\&
+ free(val);
+ } else if (vallen == 0) {
+ printf("<no value>");
+ }
+\&
+ printf("\en");
+\&
+ /*
+ * Advance to the next attribute key (name plus its null byte).
+ */
+ keylen = strlen(key) + 1;
+ buflen \-= keylen;
+ key += keylen;
+ }
+\&
+ free(buf);
+ exit(EXIT_SUCCESS);
+}
+.EE
+.\" SRC END
+.SH SEE ALSO
+.BR getfattr (1),
+.BR setfattr (1),
+.BR getxattr (2),
+.BR open (2),
+.BR removexattr (2),
+.BR setxattr (2),
+.BR stat (2),
+.BR symlink (7),
+.BR xattr (7)
diff --git a/man2/llistxattr.2 b/man2/llistxattr.2
new file mode 100644
index 0000000..117bd2b
--- /dev/null
+++ b/man2/llistxattr.2
@@ -0,0 +1 @@
+.so man2/listxattr.2
diff --git a/man2/llseek.2 b/man2/llseek.2
new file mode 100644
index 0000000..64de504
--- /dev/null
+++ b/man2/llseek.2
@@ -0,0 +1,92 @@
+.\" Copyright (C) 1995 Andries Brouwer (aeb@cwi.nl)
+.\" Written 10 June 1995 by Andries Brouwer <aeb@cwi.nl>
+.\" and Copyright (C) 2007, 2015, 2020, Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified Thu Oct 31 15:16:23 1996 by Eric S. Raymond <esr@thyrsus.com>
+.\"
+.TH _llseek 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+_llseek \- reposition read/write file offset
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */"
+.B #include <unistd.h>
+.PP
+.BI "int syscall(SYS__llseek, unsigned int " fd ", unsigned long " offset_high ,
+.BI " unsigned long " offset_low ", loff_t *" result ,
+.BI " unsigned int " whence );
+.fi
+.PP
+.IR Note :
+glibc provides no wrapper for
+.BR _llseek (),
+necessitating the use of
+.BR syscall (2).
+.SH DESCRIPTION
+Note: for information about the
+.BR llseek (3)
+library function, see
+.BR lseek64 (3).
+.PP
+The
+.BR _llseek ()
+system call repositions the offset of the open file description associated
+with the file descriptor
+.I fd
+to the value
+.IP
+(offset_high << 32) | offset_low
+.PP
+This new offset is a byte offset
+relative to the beginning of the file, the current file offset,
+or the end of the file, depending on whether
+.I whence
+is
+.BR SEEK_SET ,
+.BR SEEK_CUR ,
+or
+.BR SEEK_END ,
+respectively.
+.PP
+The new file offset is returned in the argument
+.IR result .
+The type
+.I loff_t
+is a 64-bit signed type.
+.PP
+This system call exists on various 32-bit platforms to support
+seeking to large file offsets.
+.SH RETURN VALUE
+Upon successful completion,
+.BR _llseek ()
+returns 0.
+Otherwise, a value of \-1 is returned and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EBADF
+.I fd
+is not an open file descriptor.
+.TP
+.B EFAULT
+Problem with copying results to user space.
+.TP
+.B EINVAL
+.I whence
+is invalid.
+.SH VERSIONS
+You probably want to use the
+.BR lseek (2)
+wrapper function instead.
+.SH STANDARDS
+Linux.
+.SH SEE ALSO
+.BR lseek (2),
+.BR open (2),
+.BR lseek64 (3)
diff --git a/man2/lock.2 b/man2/lock.2
new file mode 100644
index 0000000..5d25ea6
--- /dev/null
+++ b/man2/lock.2
@@ -0,0 +1 @@
+.so man2/unimplemented.2
diff --git a/man2/lookup_dcookie.2 b/man2/lookup_dcookie.2
new file mode 100644
index 0000000..4543e45
--- /dev/null
+++ b/man2/lookup_dcookie.2
@@ -0,0 +1,86 @@
+.\" Copyright (C) 2003 John Levon <levon@movementarian.org>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified 2004-06-17 Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.TH lookup_dcookie 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+lookup_dcookie \- return a directory entry's path
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */"
+.B #include <unistd.h>
+.PP
+.BI "int syscall(SYS_lookup_dcookie, uint64_t " cookie ", char *" buffer ,
+.BI " size_t " len );
+.fi
+.PP
+.IR Note :
+glibc provides no wrapper for
+.BR lookup_dcookie (),
+necessitating the use of
+.BR syscall (2).
+.SH DESCRIPTION
+Look up the full path of the directory entry specified by the value
+.IR cookie .
+The cookie is an opaque identifier uniquely identifying a particular
+directory entry.
+The buffer given is filled in with the full path of the directory entry.
+.PP
+For
+.BR lookup_dcookie ()
+to return successfully,
+the kernel must still hold a cookie reference to the directory entry.
+.SH RETURN VALUE
+On success,
+.BR lookup_dcookie ()
+returns the length of the path string copied into the buffer.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EFAULT
+The buffer was not valid.
+.TP
+.B EINVAL
+The kernel has no registered cookie/directory entry mappings at the
+time of lookup, or the cookie does not refer to a valid directory entry.
+.TP
+.B ENAMETOOLONG
+The name could not fit in the buffer.
+.TP
+.B ENOMEM
+The kernel could not allocate memory for the temporary buffer holding
+the path.
+.TP
+.B EPERM
+The process does not have the capability
+.B CAP_SYS_ADMIN
+required to look up cookie values.
+.TP
+.B ERANGE
+The buffer was not large enough to hold the path of the directory entry.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 2.5.43.
+.PP
+The
+.B ENAMETOOLONG
+error was added in Linux 2.5.70.
+.SH NOTES
+.BR lookup_dcookie ()
+is a special-purpose system call, currently used only by the
+.BR oprofile (1)
+profiler.
+It relies on a kernel driver to register cookies for directory entries.
+.PP
+The path returned may be suffixed by the string " (deleted)" if the directory
+entry has been removed.
+.SH SEE ALSO
+.BR oprofile (1)
diff --git a/man2/lremovexattr.2 b/man2/lremovexattr.2
new file mode 100644
index 0000000..38d01cc
--- /dev/null
+++ b/man2/lremovexattr.2
@@ -0,0 +1 @@
+.so man2/removexattr.2
diff --git a/man2/lseek.2 b/man2/lseek.2
new file mode 100644
index 0000000..7ef7930
--- /dev/null
+++ b/man2/lseek.2
@@ -0,0 +1,252 @@
+.\" Copyright (c) 1980, 1991 Regents of the University of California.
+.\" and Copyright (c) 2011, Michael Kerrisk <mtk.manpages@gmail.com>
+.\" All rights reserved.
+.\"
+.\" SPDX-License-Identifier: BSD-4-Clause-UC
+.\"
+.\" @(#)lseek.2 6.5 (Berkeley) 3/10/91
+.\"
+.\" Modified 1993-07-23 by Rik Faith <faith@cs.unc.edu>
+.\" Modified 1995-06-10 by Andries Brouwer <aeb@cwi.nl>
+.\" Modified 1996-10-31 by Eric S. Raymond <esr@thyrsus.com>
+.\" Modified 1998-01-17 by Michael Haardt
+.\" <michael@cantor.informatik.rwth-aachen.de>
+.\" Modified 2001-09-24 by Michael Haardt <michael@moria.de>
+.\" Modified 2003-08-21 by Andries Brouwer <aeb@cwi.nl>
+.\" 2011-09-18, mtk, Added SEEK_DATA + SEEK_HOLE
+.\"
+.TH lseek 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+lseek \- reposition read/write file offset
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <unistd.h>
+.PP
+.BI "off_t lseek(int " fd ", off_t " offset ", int " whence );
+.fi
+.SH DESCRIPTION
+.BR lseek ()
+repositions the file offset of the open file description
+associated with the file descriptor
+.I fd
+to the argument
+.I offset
+according to the directive
+.I whence
+as follows:
+.TP
+.B SEEK_SET
+The file offset is set to
+.I offset
+bytes.
+.TP
+.B SEEK_CUR
+The file offset is set to its current location plus
+.I offset
+bytes.
+.TP
+.B SEEK_END
+The file offset is set to the size of the file plus
+.I offset
+bytes.
+.PP
+.BR lseek ()
+allows the file offset to be set beyond the end
+of the file (but this does not change the size of the file).
+If data is later written at this point, subsequent reads of the data
+in the gap (a "hole") return null bytes (\[aq]\e0\[aq]) until
+data is actually written into the gap.
+.SS Seeking file data and holes
+Since Linux 3.1, Linux supports the following additional values for
+.IR whence :
+.TP
+.B SEEK_DATA
+Adjust the file offset to the next location
+in the file greater than or equal to
+.I offset
+containing data.
+If
+.I offset
+points to data,
+then the file offset is set to
+.IR offset .
+.TP
+.B SEEK_HOLE
+Adjust the file offset to the next hole in the file
+greater than or equal to
+.IR offset .
+If
+.I offset
+points into the middle of a hole,
+then the file offset is set to
+.IR offset .
+If there is no hole past
+.IR offset ,
+then the file offset is adjusted to the end of the file
+(i.e., there is an implicit hole at the end of any file).
+.PP
+In both of the above cases,
+.BR lseek ()
+fails if
+.I offset
+points past the end of the file.
+.PP
+These operations allow applications to map holes in a sparsely
+allocated file.
+This can be useful for applications such as file backup tools,
+which can save space when creating backups and preserve holes,
+if they have a mechanism for discovering holes.
+.PP
+For the purposes of these operations, a hole is a sequence of zeros that
+(normally) has not been allocated in the underlying file storage.
+However, a filesystem is not obliged to report holes,
+so these operations are not a guaranteed mechanism for
+mapping the storage space actually allocated to a file.
+(Furthermore, a sequence of zeros that actually has been written
+to the underlying storage may not be reported as a hole.)
+In the simplest implementation,
+a filesystem can support the operations by making
+.B SEEK_HOLE
+always return the offset of the end of the file,
+and making
+.B SEEK_DATA
+always return
+.I offset
+(i.e., even if the location referred to by
+.I offset
+is a hole,
+it can be considered to consist of data that is a sequence of zeros).
+.\" https://lkml.org/lkml/2011/4/22/79
+.\" http://lwn.net/Articles/440255/
+.\" http://blogs.oracle.com/bonwick/entry/seek_hole_and_seek_data
+.PP
+The
+.B _GNU_SOURCE
+feature test macro must be defined in order to obtain the definitions of
+.B SEEK_DATA
+and
+.B SEEK_HOLE
+from
+.IR <unistd.h> .
+.PP
+The
+.B SEEK_HOLE
+and
+.B SEEK_DATA
+operations are supported for the following filesystems:
+.IP \[bu] 3
+Btrfs (since Linux 3.1)
+.IP \[bu]
+OCFS2 (since Linux 3.2)
+.\" commit 93862d5e1ab875664c6cc95254fc365028a48bb1
+.IP \[bu]
+XFS (since Linux 3.5)
+.IP \[bu]
+ext4 (since Linux 3.8)
+.IP \[bu]
+.BR tmpfs (5)
+(since Linux 3.8)
+.IP \[bu]
+NFS (since Linux 3.18)
+.\" commit 1c6dcbe5ceff81c2cf8d929646af675cd59fe7c0
+.\" commit 24bab491220faa446d945624086d838af41d616c
+.IP \[bu]
+FUSE (since Linux 4.5)
+.\" commit 0b5da8db145bfd44266ac964a2636a0cf8d7c286
+.IP \[bu]
+GFS2 (since Linux 4.15)
+.\" commit 3a27411cb4bc3ce31db228e3569ad01b462a4310
+.SH RETURN VALUE
+Upon successful completion,
+.BR lseek ()
+returns the resulting offset location as measured in bytes from the
+beginning of the file.
+On error, the value \fI(off_t)\ \-1\fP is returned and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EBADF
+.I fd
+is not an open file descriptor.
+.TP
+.B EINVAL
+.I whence
+is not valid.
+Or: the resulting file offset would be negative,
+or beyond the end of a seekable device.
+.\" Some systems may allow negative offsets for character devices
+.\" and/or for remote filesystems.
+.TP
+.B ENXIO
+.I whence
+is
+.B SEEK_DATA
+or
+.BR SEEK_HOLE ,
+and
+.I offset
+is beyond the end of the file, or
+.I whence
+is
+.B SEEK_DATA
+and
+.I offset
+is within a hole at the end of the file.
+.TP
+.B EOVERFLOW
+.\" HP-UX 11 says EINVAL for this case (but POSIX.1 says EOVERFLOW)
+The resulting file offset cannot be represented in an
+.IR off_t .
+.TP
+.B ESPIPE
+.I fd
+is associated with a pipe, socket, or FIFO.
+.SH VERSIONS
+On Linux, using
+.BR lseek ()
+on a terminal device fails with the error
+.BR ESPIPE .
+.\" Other systems return the number of written characters,
+.\" using SEEK_SET to set the counter. (Of written characters.)
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+POSIX.1-2001, SVr4, 4.3BSD.
+.PP
+.B SEEK_DATA
+and
+.B SEEK_HOLE
+are nonstandard extensions also present in Solaris,
+FreeBSD, and DragonFly BSD;
+they are proposed for inclusion in the next POSIX revision (Issue 8).
+.\" FIXME . Review http://austingroupbugs.net/view.php?id=415 in the future
+.SH NOTES
+See
+.BR open (2)
+for a discussion of the relationship between file descriptors,
+open file descriptions, and files.
+.PP
+If the
+.B O_APPEND
+file status flag is set on the open file description,
+then a
+.BR write (2)
+.I always
+moves the file offset to the end of the file, regardless of the use of
+.BR lseek ().
+.PP
+Some devices are incapable of seeking and POSIX does not specify which
+devices must support
+.BR lseek ().
+.SH SEE ALSO
+.BR dup (2),
+.BR fallocate (2),
+.BR fork (2),
+.BR open (2),
+.BR fseek (3),
+.BR lseek64 (3),
+.BR posix_fallocate (3)
diff --git a/man2/lsetxattr.2 b/man2/lsetxattr.2
new file mode 100644
index 0000000..dc07807
--- /dev/null
+++ b/man2/lsetxattr.2
@@ -0,0 +1 @@
+.so man2/setxattr.2
diff --git a/man2/lstat.2 b/man2/lstat.2
new file mode 100644
index 0000000..b1a86c1
--- /dev/null
+++ b/man2/lstat.2
@@ -0,0 +1 @@
+.so man2/stat.2
diff --git a/man2/lstat64.2 b/man2/lstat64.2
new file mode 100644
index 0000000..89b1c84
--- /dev/null
+++ b/man2/lstat64.2
@@ -0,0 +1 @@
+.so man2/lstat.2
diff --git a/man2/madvise.2 b/man2/madvise.2
new file mode 100644
index 0000000..5782574
--- /dev/null
+++ b/man2/madvise.2
@@ -0,0 +1,898 @@
+.\" Copyright (C) 2001 David Gómez <davidge@jazzfree.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Based on comments from mm/filemap.c. Last modified on 10-06-2001
+.\" Modified, 25 Feb 2002, Michael Kerrisk, <mtk.manpages@gmail.com>
+.\" Added notes on MADV_DONTNEED
+.\" 2010-06-19, mtk, Added documentation of MADV_MERGEABLE and
+.\" MADV_UNMERGEABLE
+.\" 2010-06-15, Andi Kleen, Add documentation of MADV_HWPOISON.
+.\" 2010-06-19, Andi Kleen, Add documentation of MADV_SOFT_OFFLINE.
+.\" 2011-09-18, Doug Goldstein <cardoe@cardoe.com>
+.\" Document MADV_HUGEPAGE and MADV_NOHUGEPAGE
+.\"
+.TH madvise 2 2023-04-03 "Linux man-pages 6.05.01"
+.SH NAME
+madvise \- give advice about use of memory
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/mman.h>
+.PP
+.BI "int madvise(void " addr [. length "], size_t " length ", int " advice );
+.fi
+.PP
+.RS -4
+Feature Test Macro Requirements for glibc (see
+.BR feature_test_macros (7)):
+.RE
+.PP
+.BR madvise ():
+.nf
+ Since glibc 2.19:
+ _DEFAULT_SOURCE
+ Up to and including glibc 2.19:
+ _BSD_SOURCE
+.fi
+.SH DESCRIPTION
+The
+.BR madvise ()
+system call is used to give advice or directions to the kernel
+about the address range beginning at address
+.I addr
+and with size
+.IR length .
+.BR madvise ()
+only operates on whole pages, therefore
+.I addr
+must be page-aligned.
+The value of
+.I length
+is rounded up to a multiple of page size.
+In most cases,
+the goal of such advice is to improve system or application performance.
+.PP
+Initially, the system call supported a set of "conventional"
+.I advice
+values, which are also available on several other implementations.
+(Note, though, that
+.BR madvise ()
+is not specified in POSIX.)
+Subsequently, a number of Linux-specific
+.I advice
+values have been added.
+.\"
+.\" ======================================================================
+.\"
+.SS Conventional advice values
+The
+.I advice
+values listed below
+allow an application to tell the kernel how it expects to use
+some mapped or shared memory areas, so that the kernel can choose
+appropriate read-ahead and caching techniques.
+These
+.I advice
+values do not influence the semantics of the application
+(except in the case of
+.BR MADV_DONTNEED ),
+but may influence its performance.
+All of the
+.I advice
+values listed here have analogs in the POSIX-specified
+.BR posix_madvise (3)
+function, and the values have the same meanings, with the exception of
+.BR MADV_DONTNEED .
+.PP
+The advice is indicated in the
+.I advice
+argument, which is one of the following:
+.TP
+.B MADV_NORMAL
+No special treatment.
+This is the default.
+.TP
+.B MADV_RANDOM
+Expect page references in random order.
+(Hence, read ahead may be less useful than normal.)
+.TP
+.B MADV_SEQUENTIAL
+Expect page references in sequential order.
+(Hence, pages in the given range can be aggressively read ahead,
+and may be freed soon after they are accessed.)
+.TP
+.B MADV_WILLNEED
+Expect access in the near future.
+(Hence, it might be a good idea to read some pages ahead.)
+.TP
+.B MADV_DONTNEED
+Do not expect access in the near future.
+(For the time being, the application is finished with the given range,
+so the kernel can free resources associated with it.)
+.IP
+After a successful
+.B MADV_DONTNEED
+operation,
+the semantics of memory access in the specified region are changed:
+subsequent accesses of pages in the range will succeed, but will result
+in either repopulating the memory contents from the
+up-to-date contents of the underlying mapped file
+(for shared file mappings, shared anonymous mappings,
+and shmem-based techniques such as System V shared memory segments)
+or zero-fill-on-demand pages for anonymous private mappings.
+.IP
+Note that, when applied to shared mappings,
+.B MADV_DONTNEED
+might not lead to immediate freeing of the pages in the range.
+The kernel is free to delay freeing the pages until an appropriate moment.
+The resident set size (RSS) of the calling process will be immediately
+reduced, however.
+.IP
+.B MADV_DONTNEED
+cannot be applied to locked pages, or
+.B VM_PFNMAP
+pages.
+(Pages marked with the kernel-internal
+.B VM_PFNMAP
+.\" http://lwn.net/Articles/162860/
+flag are special memory areas that are not managed
+by the virtual memory subsystem.
+Such pages are typically created by device drivers that
+map the pages into user space.)
+.IP
+Support for Huge TLB pages was added in Linux 5.18.
+Addresses within a mapping backed by Huge TLB pages must be aligned
+to the underlying Huge TLB page size,
+and the range length is rounded up
+to a multiple of the underlying Huge TLB page size.
+.\"
+.\" ======================================================================
+.\"
+.SS Linux-specific advice values
+The following Linux-specific
+.I advice
+values have no counterparts in the POSIX-specified
+.BR posix_madvise (3),
+and may or may not have counterparts in the
+.BR madvise ()
+interface available on other implementations.
+Note that some of these operations change the semantics of memory accesses.
+.TP
+.BR MADV_REMOVE " (since Linux 2.6.16)"
+.\" commit f6b3ec238d12c8cc6cc71490c6e3127988460349
+Free up a given range of pages
+and its associated backing store.
+This is equivalent to punching a hole in the corresponding
+range of the backing store (see
+.BR fallocate (2)).
+Subsequent accesses in the specified address range will see
+data with a value of zero.
+.\" Databases want to use this feature to drop a section of their
+.\" bufferpool (shared memory segments) - without writing back to
+.\" disk/swap space. This feature is also useful for supporting
+.\" hot-plug memory on UML.
+.IP
+The specified address range must be mapped shared and writable.
+This flag cannot be applied to locked pages, or
+.B VM_PFNMAP
+pages.
+.IP
+In the initial implementation, only
+.BR tmpfs (5)
+supported
+.BR MADV_REMOVE ;
+but since Linux 3.5,
+.\" commit 3f31d07571eeea18a7d34db9af21d2285b807a17
+any filesystem which supports the
+.BR fallocate (2)
+.B FALLOC_FL_PUNCH_HOLE
+mode also supports
+.BR MADV_REMOVE .
+Filesystems which do not support
+.B MADV_REMOVE
+fail with the error
+.BR EOPNOTSUPP .
+.IP
+Support for the Huge TLB filesystem was added in Linux 4.3.
+.TP
+.BR MADV_DONTFORK " (since Linux 2.6.16)"
+.\" commit f822566165dd46ff5de9bf895cfa6c51f53bb0c4
+.\" See http://lwn.net/Articles/171941/
+Do not make the pages in this range available to the child after a
+.BR fork (2).
+This is useful to prevent copy-on-write semantics from changing
+the physical location of a page if the parent writes to it after a
+.BR fork (2).
+(Such page relocations cause problems for hardware that
+DMAs into the page.)
+.\" [PATCH] madvise MADV_DONTFORK/MADV_DOFORK
+.\" Currently, copy-on-write may change the physical address of
+.\" a page even if the user requested that the page is pinned in
+.\" memory (either by mlock or by get_user_pages). This happens
+.\" if the process forks meanwhile, and the parent writes to that
+.\" page. As a result, the page is orphaned: in case of
+.\" get_user_pages, the application will never see any data hardware
+.\" DMA's into this page after the COW. In case of mlock'd memory,
+.\" the parent is not getting the realtime/security benefits of mlock.
+.\"
+.\" In particular, this affects the Infiniband modules which do DMA from
+.\" and into user pages all the time.
+.\"
+.\" This patch adds madvise options to control whether memory range is
+.\" inherited across fork. Useful e.g. for when hardware is doing DMA
+.\" from/into these pages. Could also be useful to an application
+.\" wanting to speed up its forks by cutting large areas out of
+.\" consideration.
+.\"
+.\" SEE ALSO: http://lwn.net/Articles/171941/
+.\" "Tweaks to madvise() and posix_fadvise()", 14 Feb 2006
+.TP
+.BR MADV_DOFORK " (since Linux 2.6.16)"
+Undo the effect of
+.BR MADV_DONTFORK ,
+restoring the default behavior, whereby a mapping is inherited across
+.BR fork (2).
+.TP
+.BR MADV_HWPOISON " (since Linux 2.6.32)"
+.\" commit 9893e49d64a4874ea67849ee2cfbf3f3d6817573
+Poison the pages in the range specified by
+.I addr
+and
+.I length
+and handle subsequent references to those pages
+like a hardware memory corruption.
+This operation is available only for privileged
+.RB ( CAP_SYS_ADMIN )
+processes.
+This operation may result in the calling process receiving a
+.B SIGBUS
+and the page being unmapped.
+.IP
+This feature is intended for testing of memory error-handling code;
+it is available only if the kernel was configured with
+.BR CONFIG_MEMORY_FAILURE .
+.TP
+.BR MADV_MERGEABLE " (since Linux 2.6.32)"
+.\" commit f8af4da3b4c14e7267c4ffb952079af3912c51c5
+Enable Kernel Samepage Merging (KSM) for the pages in the range specified by
+.I addr
+and
+.IR length .
+The kernel regularly scans those areas of user memory that have
+been marked as mergeable,
+looking for pages with identical content.
+These are replaced by a single write-protected page (which is automatically
+copied if a process later wants to update the content of the page).
+KSM merges only private anonymous pages (see
+.BR mmap (2)).
+.IP
+The KSM feature is intended for applications that generate many
+instances of the same data (e.g., virtualization systems such as KVM).
+It can consume a lot of processing power; use with care.
+See the Linux kernel source file
+.I Documentation/admin\-guide/mm/ksm.rst
+for more details.
+.IP
+The
+.B MADV_MERGEABLE
+and
+.B MADV_UNMERGEABLE
+operations are available only if the kernel was configured with
+.BR CONFIG_KSM .
+.TP
+.BR MADV_UNMERGEABLE " (since Linux 2.6.32)"
+Undo the effect of an earlier
+.B MADV_MERGEABLE
+operation on the specified address range;
+KSM unmerges whatever pages it had merged in the address range specified by
+.I addr
+and
+.IR length .
+.TP
+.BR MADV_SOFT_OFFLINE " (since Linux 2.6.33)"
+.\" commit afcf938ee0aac4ef95b1a23bac704c6fbeb26de6
+Soft offline the pages in the range specified by
+.I addr
+and
+.IR length .
+The memory of each page in the specified range is preserved
+(i.e., when next accessed, the same content will be visible,
+but in a new physical page frame),
+and the original page is offlined
+(i.e., no longer used, and taken out of normal memory management).
+The effect of the
+.B MADV_SOFT_OFFLINE
+operation is invisible to (i.e., does not change the semantics of)
+the calling process.
+.IP
+This feature is intended for testing of memory error-handling code;
+it is available only if the kernel was configured with
+.BR CONFIG_MEMORY_FAILURE .
+.TP
+.BR MADV_HUGEPAGE " (since Linux 2.6.38)"
+.\" commit 0af4e98b6b095c74588af04872f83d333c958c32
+.\" http://lwn.net/Articles/358904/
+.\" https://lwn.net/Articles/423584/
+Enable Transparent Huge Pages (THP) for pages in the range specified by
+.I addr
+and
+.IR length .
+The kernel will regularly scan the areas marked as huge page candidates
+to replace them with huge pages.
+The kernel will also allocate huge pages directly when the region is
+naturally aligned to the huge page size (see
+.BR posix_memalign (3)).
+.IP
+This feature is primarily aimed at applications that use large mappings of
+data and access large regions of that memory at a time (e.g., virtualization
+systems such as QEMU).
+It can very easily waste memory (e.g., a 2\ MB mapping that only ever accesses
+1 byte will result in 2\ MB of wired memory instead of one 4\ KB page).
+See the Linux kernel source file
+.I Documentation/admin\-guide/mm/transhuge.rst
+for more details.
+.IP
+Most common kernel configurations provide
+.BR MADV_HUGEPAGE -style
+behavior by default, and thus
+.B MADV_HUGEPAGE
+is normally not necessary.
+It is mostly intended for embedded systems, where
+.BR MADV_HUGEPAGE -style
+behavior may not be enabled by default in the kernel.
+On such systems,
+this flag can be used in order to selectively enable THP.
+Whenever
+.B MADV_HUGEPAGE
+is used, it should always be in regions of memory with
+an access pattern that the developer knows in advance won't risk
+increasing the memory footprint of the application when transparent
+hugepages are enabled.
+.IP
+.\" commit 99cb0dbd47a15d395bf3faa78dc122bc5efe3fc0
+Since Linux 5.4,
+automatic scan of eligible areas and replacement by huge pages works with
+private anonymous pages (see
+.BR mmap (2)),
+shmem pages,
+and file-backed pages.
+For all memory types,
+memory may only be replaced by huge pages on hugepage-aligned boundaries.
+For file-mapped memory
+\[em]including tmpfs (see
+.BR tmpfs (5))\[em]
+the mapping must also be naturally hugepage-aligned within the file.
+Additionally,
+for file-backed,
+non-tmpfs memory,
+the file must not be open for write and the mapping must be executable.
+.IP
+The VMA must not be marked
+.BR VM_NOHUGEPAGE ,
+.BR VM_HUGETLB ,
+.BR VM_IO ,
+.BR VM_DONTEXPAND ,
+.BR VM_MIXEDMAP ,
+or
+.BR VM_PFNMAP ,
+nor can it be stack memory or backed by a DAX-enabled device
+(unless the DAX device is hot-plugged as System RAM).
+The process must also not have
+.B PR_SET_THP_DISABLE
+set (see
+.BR prctl (2)).
+.IP
+The
+.BR MADV_HUGEPAGE ,
+.BR MADV_NOHUGEPAGE ,
+and
+.B MADV_COLLAPSE
+operations are available only if the kernel was configured with
+.B CONFIG_TRANSPARENT_HUGEPAGE
+and file/shmem memory is only supported if the kernel was configured with
+.BR CONFIG_READ_ONLY_THP_FOR_FS .
+.TP
+.BR MADV_NOHUGEPAGE " (since Linux 2.6.38)"
+Ensures that memory in the address range specified by
+.I addr
+and
+.I length
+will not be backed by transparent hugepages.
+.TP
+.BR MADV_COLLAPSE " (since Linux 6.1)"
+.\" commit 7d8faaf155454f8798ec56404faca29a82689c77
+.\" commit 34488399fa08faaf664743fa54b271eb6f9e1321
+Perform a best-effort synchronous collapse of
+the native pages mapped by the memory range
+into Transparent Huge Pages (THPs).
+.B MADV_COLLAPSE
+operates on the current state of memory of the calling process and
+makes no persistent changes or guarantees on how pages will be mapped,
+constructed,
+or faulted in the future.
+.IP
+.B MADV_COLLAPSE
+supports private anonymous pages (see
+.BR mmap (2)),
+shmem pages,
+and file-backed pages.
+See
+.B MADV_HUGEPAGE
+for general information on memory requirements for THP.
+If the range provided spans multiple VMAs,
+the semantics of the collapse over each VMA is independent from the others.
+If collapse of a given huge page-aligned/sized region fails,
+the operation may continue to attempt collapsing
+the remainder of the specified memory.
+.B MADV_COLLAPSE
+will automatically clamp the provided range to be hugepage-aligned.
+.IP
+All non-resident pages covered by the range
+will first be swapped/faulted-in,
+before being copied onto a freshly allocated hugepage.
+If the native pages compose the same PTE-mapped hugepage,
+and are suitably aligned,
+allocation of a new hugepage may be elided and
+collapse may happen in-place.
+Unmapped pages will have their data directly initialized to 0
+in the new hugepage.
+However,
+for every eligible hugepage-aligned/sized region to be collapsed,
+at least one page must currently be backed by physical memory.
+.IP
+.B MADV_COLLAPSE
+is independent of any sysfs
+(see
+.BR sysfs (5))
+setting under
+.IR /sys/kernel/mm/transparent_hugepage ,
+both in terms of determining THP eligibility,
+and allocation semantics.
+See Linux kernel source file
+.I Documentation/admin\-guide/mm/transhuge.rst
+for more information.
+.B MADV_COLLAPSE
+also ignores the
+.B huge=
+tmpfs mount option when operating on tmpfs files.
+Allocation for the new hugepage may enter direct reclaim and/or compaction,
+regardless of VMA flags
+(though
+.B VM_NOHUGEPAGE
+is still respected).
+.IP
+When the system has multiple NUMA nodes,
+the hugepage will be allocated from
+the node providing the most native pages.
+.IP
+If all hugepage-sized/aligned regions covered by the provided range were
+either successfully collapsed,
+or were already PMD-mapped THPs,
+this operation will be deemed successful.
+Note that this doesn't guarantee anything about
+other possible mappings of the memory.
+In the event multiple hugepage-aligned/sized areas fail to collapse,
+only the most-recently\[en]failed code will be set in
+.IR errno .
+.TP
+.BR MADV_DONTDUMP " (since Linux 3.4)"
+.\" commit 909af768e88867016f427264ae39d27a57b6a8ed
+.\" commit accb61fe7bb0f5c2a4102239e4981650f9048519
+Exclude from a core dump those pages in the range specified by
+.I addr
+and
+.IR length .
+This is useful in applications that have large areas of memory
+that are known not to be useful in a core dump.
+The effect of
+.B MADV_DONTDUMP
+takes precedence over the bit mask that is set via the
+.IR /proc/ pid /coredump_filter
+file (see
+.BR core (5)).
+.TP
+.BR MADV_DODUMP " (since Linux 3.4)"
+Undo the effect of an earlier
+.BR MADV_DONTDUMP .
+.TP
+.BR MADV_FREE " (since Linux 4.5)"
+The application no longer requires the pages in the range specified by
+.I addr
+and
+.IR length .
+The kernel can thus free these pages,
+but the freeing could be delayed until memory pressure occurs.
+For each of the pages that has been marked to be freed
+but has not yet been freed,
+the free operation will be canceled if the caller writes into the page.
+After a successful
+.B MADV_FREE
+operation, any stale data (i.e., dirty, unwritten pages) will be lost
+when the kernel frees the pages.
+However, subsequent writes to pages in the range will succeed
+and then the kernel cannot free those dirtied pages,
+so that the caller can always see just written data.
+If there is no subsequent write,
+the kernel can free the pages at any time.
+Once pages in the range have been freed, the caller will
+see zero-fill-on-demand pages upon subsequent page references.
+.IP
+The
+.B MADV_FREE
+operation
+can be applied only to private anonymous pages (see
+.BR mmap (2)).
+Before Linux 4.12,
+.\" commit 93e06c7a645343d222c9a838834a51042eebbbf7
+when freeing pages on a swapless system,
+the pages in the given range are freed instantly,
+regardless of memory pressure.
+.TP
+.BR MADV_WIPEONFORK " (since Linux 4.14)"
+.\" commit d2cd9ede6e193dd7d88b6d27399e96229a551b19
+Present the child process with zero-filled memory in this range after a
+.BR fork (2).
+This is useful in forking servers in order to ensure
+that sensitive per-process data
+(for example, PRNG seeds, cryptographic secrets, and so on)
+is not handed to child processes.
+.IP
+The
+.B MADV_WIPEONFORK
+operation can be applied only to private anonymous pages (see
+.BR mmap (2)).
+.IP
+Within the child created by
+.BR fork (2),
+the
+.B MADV_WIPEONFORK
+setting remains in place on the specified address range.
+This setting is cleared during
+.BR execve (2).
+.TP
+.BR MADV_KEEPONFORK " (since Linux 4.14)"
+.\" commit d2cd9ede6e193dd7d88b6d27399e96229a551b19
+Undo the effect of an earlier
+.BR MADV_WIPEONFORK .
+.TP
+.BR MADV_COLD " (since Linux 5.4)"
+.\" commit 9c276cc65a58faf98be8e56962745ec99ab87636
+Deactivate a given range of pages.
+This will make the pages a more probable
+reclaim target should there be memory pressure.
+This is a nondestructive operation.
+The advice might be ignored for some pages in the range when it is not
+applicable.
+.TP
+.BR MADV_PAGEOUT " (since Linux 5.4)"
+.\" commit 1a4e58cce84ee88129d5d49c064bd2852b481357
+Reclaim a given range of pages.
+This is done to free up memory occupied by these pages.
+If a page is anonymous, it will be swapped out.
+If a page is file-backed and dirty, it will be written back to the backing
+storage.
+The advice might be ignored for some pages in the range when it is not
+applicable.
+.TP
+.BR MADV_POPULATE_READ " (since Linux 5.14)"
+Populate (prefault) page tables readable,
+faulting in all pages in the range just as if manually reading from each page;
+however,
+avoid the actual memory access that would have been performed after handling
+the fault.
+.IP
+In contrast to
+.BR MAP_POPULATE ,
+.B MADV_POPULATE_READ
+does not hide errors,
+can be applied to (parts of) existing mappings and will always populate
+(prefault) page tables readable.
+One example use case is prefaulting a file mapping,
+reading all file content from disk;
+however,
+pages won't be dirtied and consequently won't have to be written back to disk
+when evicting the pages from memory.
+.IP
+Depending on the underlying mapping,
+map the shared zeropage,
+preallocate memory or read the underlying file;
+files with holes might or might not preallocate blocks.
+If populating fails,
+a
+.B SIGBUS
+signal is not generated; instead, an error is returned.
+.IP
+If
+.B MADV_POPULATE_READ
+succeeds,
+all page tables have been populated (prefaulted) readable once.
+If
+.B MADV_POPULATE_READ
+fails,
+some page tables might have been populated.
+.IP
+.B MADV_POPULATE_READ
+cannot be applied to mappings without read permissions
+and special mappings,
+for example,
+mappings marked with kernel-internal flags such as
+.B VM_PFNMAP
+or
+.BR VM_IO ,
+or secret memory regions created using
+.BR memfd_secret (2).
+.IP
+Note that with
+.BR MADV_POPULATE_READ ,
+the process can be killed at any moment when the system runs out of memory.
+.TP
+.BR MADV_POPULATE_WRITE " (since Linux 5.14)"
+Populate (prefault) page tables writable,
+faulting in all pages in the range just as if manually writing to
+each page;
+however,
+avoid the actual memory access that would have been performed after handling
+the fault.
+.IP
+In contrast to
+.BR MAP_POPULATE ,
+.B MADV_POPULATE_WRITE
+does not hide errors,
+can be applied to (parts of) existing mappings and will always populate
+(prefault) page tables writable.
+One example use case is preallocating memory,
+breaking any CoW (Copy on Write).
+.IP
+Depending on the underlying mapping,
+preallocate memory or read the underlying file;
+files with holes will preallocate blocks.
+If populating fails,
+a
+.B SIGBUS
+signal is not generated; instead, an error is returned.
+.IP
+If
+.B MADV_POPULATE_WRITE
+succeeds,
+all page tables have been populated (prefaulted) writable once.
+If
+.B MADV_POPULATE_WRITE
+fails,
+some page tables might have been populated.
+.IP
+.B MADV_POPULATE_WRITE
+cannot be applied to mappings without write permissions
+and special mappings,
+for example,
+mappings marked with kernel-internal flags such as
+.B VM_PFNMAP
+or
+.BR VM_IO ,
+or secret memory regions created using
+.BR memfd_secret (2).
+.IP
+Note that with
+.BR MADV_POPULATE_WRITE ,
+the process can be killed at any moment when the system runs out of memory.
+.SH RETURN VALUE
+On success,
+.BR madvise ()
+returns zero.
+On error, it returns \-1 and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EACCES
+.I advice
+is
+.BR MADV_REMOVE ,
+but the specified address range is not a shared writable mapping.
+.TP
+.B EAGAIN
+A kernel resource was temporarily unavailable.
+.TP
+.B EBADF
+The map exists, but the area maps something that isn't a file.
+.TP
+.B EBUSY
+(for
+.BR MADV_COLLAPSE )
+Could not charge hugepage to cgroup: cgroup limit exceeded.
+.TP
+.B EFAULT
+.I advice
+is
+.B MADV_POPULATE_READ
+or
+.BR MADV_POPULATE_WRITE ,
+and populating (prefaulting) page tables failed because a
+.B SIGBUS
+would have been generated on actual memory access and the reason is not a
+HW poisoned page
+(HW poisoned pages can,
+for example,
+be created using the
+.B MADV_HWPOISON
+flag described elsewhere in this page).
+.TP
+.B EINVAL
+.I addr
+is not page-aligned or
+.I length
+is negative.
+.\" .I length
+.\" is zero,
+.TP
+.B EINVAL
+.I advice
+is not valid.
+.TP
+.B EINVAL
+.I advice
+is
+.B MADV_COLD
+or
+.B MADV_PAGEOUT
+and the specified address range includes locked, Huge TLB pages, or
+.B VM_PFNMAP
+pages.
+.TP
+.B EINVAL
+.I advice
+is
+.B MADV_DONTNEED
+or
+.B MADV_REMOVE
+and the specified address range includes locked, Huge TLB pages, or
+.B VM_PFNMAP
+pages.
+.TP
+.B EINVAL
+.I advice
+is
+.B MADV_MERGEABLE
+or
+.BR MADV_UNMERGEABLE ,
+but the kernel was not configured with
+.BR CONFIG_KSM .
+.TP
+.B EINVAL
+.I advice
+is
+.B MADV_FREE
+or
+.B MADV_WIPEONFORK
+but the specified address range includes file, Huge TLB,
+.BR MAP_SHARED ,
+or
+.B VM_PFNMAP
+ranges.
+.TP
+.B EINVAL
+.I advice
+is
+.B MADV_POPULATE_READ
+or
+.BR MADV_POPULATE_WRITE ,
+but the specified address range includes ranges with insufficient permissions
+or special mappings,
+for example,
+mappings marked with kernel-internal flags such as
+.B VM_IO
+or
+.BR VM_PFNMAP ,
+or secret memory regions created using
+.BR memfd_secret (2).
+.TP
+.B EIO
+(for
+.BR MADV_WILLNEED )
+Paging in this area would exceed the process's
+maximum resident set size.
+.TP
+.B ENOMEM
+(for
+.BR MADV_WILLNEED )
+Not enough memory: paging in failed.
+.TP
+.B ENOMEM
+(for
+.BR MADV_COLLAPSE )
+Not enough memory: could not allocate hugepage.
+.TP
+.B ENOMEM
+Addresses in the specified range are not currently
+mapped, or are outside the address space of the process.
+.TP
+.B ENOMEM
+.I advice
+is
+.B MADV_POPULATE_READ
+or
+.BR MADV_POPULATE_WRITE ,
+and populating (prefaulting) page tables failed because there was not enough
+memory.
+.TP
+.B EPERM
+.I advice
+is
+.BR MADV_HWPOISON ,
+but the caller does not have the
+.B CAP_SYS_ADMIN
+capability.
+.TP
+.B EHWPOISON
+.I advice
+is
+.B MADV_POPULATE_READ
+or
+.BR MADV_POPULATE_WRITE ,
+and populating (prefaulting) page tables failed because a HW poisoned page
+(HW poisoned pages can,
+for example,
+be created using the
+.B MADV_HWPOISON
+flag described elsewhere in this page)
+was encountered.
+.SH VERSIONS
+Versions of this system call, implementing a wide variety of
+.I advice
+values, exist on many other implementations.
+Other implementations typically implement at least the flags listed
+above under
+.IR "Conventional advice flags" ,
+albeit with some variation in semantics.
+.PP
+POSIX.1-2001 describes
+.BR posix_madvise (3)
+with constants
+.BR POSIX_MADV_NORMAL ,
+.BR POSIX_MADV_RANDOM ,
+.BR POSIX_MADV_SEQUENTIAL ,
+.BR POSIX_MADV_WILLNEED ,
+and
+.BR POSIX_MADV_DONTNEED ,
+and so on, with behavior close to the similarly named flags listed above.
+.SS Linux
+The Linux implementation requires that the address
+.I addr
+be page-aligned, and allows
+.I length
+to be zero.
+If there are some parts of the specified address range
+that are not mapped, the Linux version of
+.BR madvise ()
+ignores them and applies the call to the rest (but returns
+.B ENOMEM
+from the system call, as it should).
+.PP
+.I madvise(0,\ 0,\ advice)
+will return zero iff
+.I advice
+is supported by the kernel and can be relied on to probe for support.
+.SH STANDARDS
+None.
+.SH HISTORY
+First appeared in 4.4BSD.
+.PP
+Since Linux 3.18,
+.\" commit d3ac21cacc24790eb45d735769f35753f5b56ceb
+support for this system call is optional,
+depending on the setting of the
+.B CONFIG_ADVISE_SYSCALLS
+configuration option.
+.SH SEE ALSO
+.BR getrlimit (2),
+.BR memfd_secret (2),
+.BR mincore (2),
+.BR mmap (2),
+.BR mprotect (2),
+.BR msync (2),
+.BR munmap (2),
+.BR prctl (2),
+.BR process_madvise (2),
+.BR posix_madvise (3),
+.BR core (5)
diff --git a/man2/madvise1.2 b/man2/madvise1.2
new file mode 100644
index 0000000..5d25ea6
--- /dev/null
+++ b/man2/madvise1.2
@@ -0,0 +1 @@
+.so man2/unimplemented.2
diff --git a/man2/mbind.2 b/man2/mbind.2
new file mode 100644
index 0000000..064b8a1
--- /dev/null
+++ b/man2/mbind.2
@@ -0,0 +1,486 @@
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft-var
+.\"
+.\" Copyright 2003,2004 Andi Kleen, SuSE Labs.
+.\" and Copyright 2007 Lee Schermerhorn, Hewlett Packard
+.\"
+.\" 2006-02-03, mtk, substantial wording changes and other improvements
+.\" 2007-08-27, Lee Schermerhorn <Lee.Schermerhorn@hp.com>
+.\" more precise specification of behavior.
+.\"
+.\" FIXME
+.\" Linux 3.8 added MPOL_MF_LAZY, which needs to be documented.
+.\" Does it also apply for move_pages()?
+.\"
+.\" commit b24f53a0bea38b266d219ee651b22dba727c44ae
+.\" Author: Lee Schermerhorn <lee.schermerhorn@hp.com>
+.\" Date: Thu Oct 25 14:16:32 2012 +0200
+.\"
+.TH mbind 2 2023-07-16 "Linux man-pages 6.05.01"
+.SH NAME
+mbind \- set memory policy for a memory range
+.SH LIBRARY
+NUMA (Non-Uniform Memory Access) policy library
+.RI ( libnuma ", " \-lnuma )
+.SH SYNOPSIS
+.nf
+.B "#include <numaif.h>"
+.PP
+.BI "long mbind(void " addr [. len "], unsigned long " len ", int " mode ,
+.BI " const unsigned long " nodemask [(. maxnode " + ULONG_WIDTH - 1)"
+.B " / ULONG_WIDTH],"
+.BI " unsigned long " maxnode ", unsigned int " flags );
+.fi
+.SH DESCRIPTION
+.BR mbind ()
+sets the NUMA memory policy,
+which consists of a policy mode and zero or more nodes,
+for the memory range starting with
+.I addr
+and continuing for
+.I len
+bytes.
+The memory policy defines from which node memory is allocated.
+.PP
+If the memory range specified by the
+.IR addr " and " len
+arguments includes an "anonymous" region of memory\[em]that is
+a region of memory created using the
+.BR mmap (2)
+system call with the
+.B MAP_ANONYMOUS
+flag\[em]or
+a memory-mapped file, mapped using the
+.BR mmap (2)
+system call with the
+.B MAP_PRIVATE
+flag, pages will be allocated only according to the specified
+policy when the application writes (stores) to the page.
+For anonymous regions, an initial read access will use a shared
+page in the kernel containing all zeros.
+For a file mapped with
+.BR MAP_PRIVATE ,
+an initial read access will allocate pages according to the
+memory policy of the thread that causes the page to be allocated.
+This may not be the thread that called
+.BR mbind ().
+.PP
+The specified policy will be ignored for any
+.B MAP_SHARED
+mappings in the specified memory range.
+Rather the pages will be allocated according to the memory policy
+of the thread that caused the page to be allocated.
+Again, this may not be the thread that called
+.BR mbind ().
+.PP
+If the specified memory range includes a shared memory region
+created using the
+.BR shmget (2)
+system call and attached using the
+.BR shmat (2)
+system call,
+pages allocated for the anonymous or shared memory region will
+be allocated according to the policy specified, regardless of which
+process attached to the shared memory segment causes the allocation.
+If, however, the shared memory region was created with the
+.B SHM_HUGETLB
+flag,
+the huge pages will be allocated according to the policy specified
+only if the page allocation is caused by the process that calls
+.BR mbind ()
+for that region.
+.PP
+By default,
+.BR mbind ()
+has an effect only for new allocations; if the pages inside
+the range have been already touched before setting the policy,
+then the policy has no effect.
+This default behavior may be overridden by the
+.B MPOL_MF_MOVE
+and
+.B MPOL_MF_MOVE_ALL
+flags described below.
+.PP
+The
+.I mode
+argument must specify one of
+.BR MPOL_DEFAULT ,
+.BR MPOL_BIND ,
+.BR MPOL_INTERLEAVE ,
+.BR MPOL_PREFERRED ,
+or
+.B MPOL_LOCAL
+(which are described in detail below).
+All policy modes except
+.B MPOL_DEFAULT
+require the caller to specify the node or nodes to which the mode applies,
+via the
+.I nodemask
+argument.
+.PP
+The
+.I mode
+argument may also include an optional
+.IR "mode flag" .
+The supported
+.I "mode flags"
+are:
+.TP
+.BR MPOL_F_STATIC_NODES " (since Linux 2.6.26)"
+A nonempty
+.I nodemask
+specifies physical node IDs.
+Linux does not remap the
+.I nodemask
+when the thread moves to a different cpuset context,
+nor when the set of nodes allowed by the thread's
+current cpuset context changes.
+.TP
+.BR MPOL_F_RELATIVE_NODES " (since Linux 2.6.26)"
+A nonempty
+.I nodemask
+specifies node IDs that are relative to the set of
+node IDs allowed by the thread's current cpuset.
+.PP
+.I nodemask
+points to a bit mask of nodes containing up to
+.I maxnode
+bits.
+The bit mask size is rounded to the next multiple of
+.IR "sizeof(unsigned long)" ,
+but the kernel will use bits only up to
+.IR maxnode .
+A NULL value of
+.I nodemask
+or a
+.I maxnode
+value of zero specifies the empty set of nodes.
+If the value of
+.I maxnode
+is zero,
+the
+.I nodemask
+argument is ignored.
+Where a
+.I nodemask
+is required, it must contain at least one node that is on-line,
+allowed by the thread's current cpuset context
+(unless the
+.B MPOL_F_STATIC_NODES
+mode flag is specified),
+and contains memory.
+.PP
+The
+.I mode
+argument must include one of the following values:
+.TP
+.B MPOL_DEFAULT
+This mode requests that any nondefault policy be removed,
+restoring default behavior.
+When applied to a range of memory via
+.BR mbind (),
+this means to use the thread memory policy,
+which may have been set with
+.BR set_mempolicy (2).
+If the mode of the thread memory policy is also
+.BR MPOL_DEFAULT ,
+the system-wide default policy will be used.
+The system-wide default policy allocates
+pages on the node of the CPU that triggers the allocation.
+For
+.BR MPOL_DEFAULT ,
+the
+.I nodemask
+and
+.I maxnode
+arguments must specify the empty set of nodes.
+.TP
+.B MPOL_BIND
+This mode specifies a strict policy that restricts memory allocation to
+the nodes specified in
+.IR nodemask .
+If
+.I nodemask
+specifies more than one node, page allocations will come from
+the node with sufficient free memory that is closest to
+the node where the allocation takes place.
+Pages will not be allocated from any node not specified in the
+.IR nodemask .
+(Before Linux 2.6.26,
+.\" commit 19770b32609b6bf97a3dece2529089494cbfc549
+page allocations came from
+the node with the lowest numeric node ID first, until that node
+contained no free memory.
+Allocations then came from the node with the next highest
+node ID specified in
+.I nodemask
+and so forth, until none of the specified nodes contained free memory.)
+.TP
+.B MPOL_INTERLEAVE
+This mode specifies that page allocations be interleaved across the
+set of nodes specified in
+.IR nodemask .
+This optimizes for bandwidth instead of latency
+by spreading out pages and memory accesses to those pages across
+multiple nodes.
+To be effective the memory area should be fairly large,
+at least 1\ MB or bigger with a fairly uniform access pattern.
+Accesses to a single page of the area will still be limited to
+the memory bandwidth of a single node.
+.TP
+.B MPOL_PREFERRED
+This mode sets the preferred node for allocation.
+The kernel will try to allocate pages from this
+node first and fall back to other nodes if the
+preferred node is low on free memory.
+If
+.I nodemask
+specifies more than one node ID, the first node in the
+mask will be selected as the preferred node.
+If the
+.I nodemask
+and
+.I maxnode
+arguments specify the empty set, then the memory is allocated on
+the node of the CPU that triggered the allocation.
+.TP
+.BR MPOL_LOCAL " (since Linux 3.8)"
+.\" commit 479e2802d09f1e18a97262c4c6f8f17ae5884bd8
+.\" commit f2a07f40dbc603c15f8b06e6ec7f768af67b424f
+This mode specifies "local allocation"; the memory is allocated on
+the node of the CPU that triggered the allocation (the "local node").
+The
+.I nodemask
+and
+.I maxnode
+arguments must specify the empty set.
+If the "local node" is low on free memory,
+the kernel will try to allocate memory from other nodes.
+The kernel will allocate memory from the "local node"
+whenever memory for this node is available.
+If the "local node" is not allowed by the thread's current cpuset context,
+the kernel will try to allocate memory from other nodes.
+The kernel will allocate memory from the "local node" whenever
+it becomes allowed by the thread's current cpuset context.
+By contrast,
+.B MPOL_DEFAULT
+reverts to the memory policy of the thread (which may be set via
+.BR set_mempolicy (2));
+that policy may be something other than "local allocation".
+.PP
+If
+.B MPOL_MF_STRICT
+is passed in
+.I flags
+and
+.I mode
+is not
+.BR MPOL_DEFAULT ,
+then the call fails with the error
+.B EIO
+if the existing pages in the memory range don't follow the policy.
+.\" According to the kernel code, the following is not true
+.\" --Lee Schermerhorn
+.\" In Linux 2.6.16 or later the kernel will also try to move pages
+.\" to the requested node with this flag.
+.PP
+If
+.B MPOL_MF_MOVE
+is specified in
+.IR flags ,
+then the kernel will attempt to move all the existing pages
+in the memory range so that they follow the policy.
+Pages that are shared with other processes will not be moved.
+If
+.B MPOL_MF_STRICT
+is also specified, then the call fails with the error
+.B EIO
+if some pages could not be moved.
+If the
+.B MPOL_INTERLEAVE
+policy was specified,
+pages already residing on the specified nodes
+will not be moved such that they are interleaved.
+.PP
+If
+.B MPOL_MF_MOVE_ALL
+is passed in
+.IR flags ,
+then the kernel will attempt to move all existing pages in the memory range
+regardless of whether other processes use the pages.
+The calling thread must be privileged
+.RB ( CAP_SYS_NICE )
+to use this flag.
+If
+.B MPOL_MF_STRICT
+is also specified, then the call fails with the error
+.B EIO
+if some pages could not be moved.
+If the
+.B MPOL_INTERLEAVE
+policy was specified,
+pages already residing on the specified nodes
+will not be moved such that they are interleaved.
+.\" ---------------------------------------------------------------
+.SH RETURN VALUE
+On success,
+.BR mbind ()
+returns 0;
+on error, \-1 is returned and
+.I errno
+is set to indicate the error.
+.\" ---------------------------------------------------------------
+.SH ERRORS
+.\" I think I got all of the error returns. --Lee Schermerhorn
+.TP
+.B EFAULT
+Part or all of the memory range specified by
+.I nodemask
+and
+.I maxnode
+points outside your accessible address space.
+Or, there was an unmapped hole in the memory range specified by
+.I addr
+and
+.IR len .
+.TP
+.B EINVAL
+An invalid value was specified for
+.I flags
+or
+.IR mode ;
+or
+.I addr + len
+was less than
+.IR addr ;
+or
+.I addr
+is not a multiple of the system page size.
+Or,
+.I mode
+is
+.B MPOL_DEFAULT
+and
+.I nodemask
+specified a nonempty set;
+or
+.I mode
+is
+.B MPOL_BIND
+or
+.B MPOL_INTERLEAVE
+and
+.I nodemask
+is empty.
+Or,
+.I maxnode
+exceeds a kernel-imposed limit.
+.\" As at 2.6.23, this limit is "a page worth of bits", e.g.,
+.\" 8 * 4096 bits, assuming a 4kB page size.
+Or,
+.I nodemask
+specifies one or more node IDs that are
+greater than the maximum supported node ID.
+Or, none of the node IDs specified by
+.I nodemask
+are on-line and allowed by the thread's current cpuset context,
+or none of the specified nodes contain memory.
+Or, the
+.I mode
+argument specified both
+.B MPOL_F_STATIC_NODES
+and
+.BR MPOL_F_RELATIVE_NODES .
+.TP
+.B EIO
+.B MPOL_MF_STRICT
+was specified and an existing page was already on a node
+that does not follow the policy;
+or
+.B MPOL_MF_MOVE
+or
+.B MPOL_MF_MOVE_ALL
+was specified and the kernel was unable to move all existing
+pages in the range.
+.TP
+.B ENOMEM
+Insufficient kernel memory was available.
+.TP
+.B EPERM
+The
+.I flags
+argument included the
+.B MPOL_MF_MOVE_ALL
+flag and the caller does not have the
+.B CAP_SYS_NICE
+privilege.
+.\" ---------------------------------------------------------------
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 2.6.7.
+.PP
+Support for huge page policy was added with Linux 2.6.16.
+For interleave policy to be effective on huge page mappings the
+policied memory needs to be tens of megabytes or larger.
+.PP
+Before Linux 5.7,
+.\" commit dcf1763546d76c372f3136c8d6b2b6e77f140cf0
+.B MPOL_MF_STRICT
+was ignored on huge page mappings.
+.PP
+.B MPOL_MF_MOVE
+and
+.B MPOL_MF_MOVE_ALL
+are available only on Linux 2.6.16 and later.
+.SH NOTES
+For information on library support, see
+.BR numa (7).
+.PP
+NUMA policy is not supported on a memory-mapped file range
+that was mapped with the
+.B MAP_SHARED
+flag.
+.PP
+The
+.B MPOL_DEFAULT
+mode can have different effects for
+.BR mbind ()
+and
+.BR set_mempolicy (2).
+When
+.B MPOL_DEFAULT
+is specified for
+.BR set_mempolicy (2),
+the thread's memory policy reverts to the system default policy
+or local allocation.
+When
+.B MPOL_DEFAULT
+is specified for a range of memory using
+.BR mbind (),
+any pages subsequently allocated for that range will use
+the thread's memory policy, as set by
+.BR set_mempolicy (2).
+This effectively removes the explicit policy from the
+specified range, "falling back" to a possibly nondefault
+policy.
+To select explicit "local allocation" for a memory range,
+specify a
+.I mode
+of
+.B MPOL_LOCAL
+or
+.B MPOL_PREFERRED
+with an empty set of nodes.
+This method will work for
+.BR set_mempolicy (2),
+as well.
+.SH SEE ALSO
+.BR get_mempolicy (2),
+.BR getcpu (2),
+.BR mmap (2),
+.BR set_mempolicy (2),
+.BR shmat (2),
+.BR shmget (2),
+.BR numa (3),
+.BR cpuset (7),
+.BR numa (7),
+.BR numactl (8)
diff --git a/man2/membarrier.2 b/man2/membarrier.2
new file mode 100644
index 0000000..f118fd0
--- /dev/null
+++ b/man2/membarrier.2
@@ -0,0 +1,460 @@
+'\" t
+.\" Copyright 2015-2017 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.TH membarrier 2 2023-05-03 "Linux man-pages 6.05.01"
+.SH NAME
+membarrier \- issue memory barriers on a set of threads
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.PP
+.BR "#include <linux/membarrier.h>" \
+" /* Definition of " MEMBARRIER_* " constants */"
+.BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */"
+.B #include <unistd.h>
+.PP
+.BI "int syscall(SYS_membarrier, int " cmd ", unsigned int " flags \
+", int " cpu_id );
+.fi
+.PP
+.IR Note :
+glibc provides no wrapper for
+.BR membarrier (),
+necessitating the use of
+.BR syscall (2).
+.SH DESCRIPTION
+The
+.BR membarrier ()
+system call helps reduce the overhead of the memory barrier
+instructions required to order memory accesses on multi-core systems.
+However, this system call is heavier than a memory barrier, so using it
+effectively is
+.I not
+as simple as replacing memory barriers with this
+system call, but requires understanding of the details below.
+.PP
+Use of memory barriers needs to be done taking into account that a
+memory barrier always needs to be either matched with its memory barrier
+counterparts, or that the architecture's memory model doesn't require the
+matching barriers.
+.PP
+There are cases where one side of the matching barriers (which we will
+refer to as "fast side") is executed much more often than the other
+(which we will refer to as "slow side").
+This is a prime target for the use of
+.BR membarrier ().
+The key idea is to replace, for these matching
+barriers, the fast-side memory barriers by simple compiler barriers,
+for example:
+.PP
+.in +4n
+.EX
+asm volatile ("" : : : "memory")
+.EE
+.in
+.PP
+and replace the slow-side memory barriers by calls to
+.BR membarrier ().
+.PP
+This will add overhead to the slow side, and remove overhead from the
+fast side, thus resulting in an overall performance increase as long as
+the slow side is infrequent enough that the overhead of the
+.BR membarrier ()
+calls does not outweigh the performance gain on the fast side.
+.PP
+The
+.I cmd
+argument is one of the following:
+.TP
+.BR MEMBARRIER_CMD_QUERY " (since Linux 4.3)"
+Query the set of supported commands.
+The return value of the call is a bit mask of supported
+commands.
+.BR MEMBARRIER_CMD_QUERY ,
+which has the value 0,
+is not itself included in this bit mask.
+This command is always supported (on kernels where
+.BR membarrier ()
+is provided).
+.TP
+.BR MEMBARRIER_CMD_GLOBAL " (since Linux 4.16)"
+Ensure that all threads from all processes on the system pass through a
+state where all memory accesses to user-space addresses match program
+order between entry to and return from the
+.BR membarrier ()
+system call.
+All threads on the system are targeted by this command.
+.TP
+.BR MEMBARRIER_CMD_GLOBAL_EXPEDITED " (since Linux 4.16)"
+Execute a memory barrier on all running threads of all processes that
+previously registered with
+.BR MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED .
+.IP
+Upon return from the system call, the calling thread has a guarantee that all
+running threads have passed through a state where all memory accesses to
+user-space addresses match program order between entry to and return
+from the system call (non-running threads are de facto in such a state).
+This guarantee is provided only for the threads of processes that
+previously registered with
+.BR MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED .
+.IP
+Given that registration is about the intent to receive the barriers, it
+is valid to invoke
+.B MEMBARRIER_CMD_GLOBAL_EXPEDITED
+from a process that has not employed
+.BR MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED .
+.IP
+The "expedited" commands complete faster than the non-expedited ones;
+they never block, but have the downside of causing extra overhead.
+.TP
+.BR MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED " (since Linux 4.16)"
+Register the process's intent to receive
+.B MEMBARRIER_CMD_GLOBAL_EXPEDITED
+memory barriers.
+.TP
+.BR MEMBARRIER_CMD_PRIVATE_EXPEDITED " (since Linux 4.14)"
+Execute a memory barrier on each running thread belonging to the same
+process as the calling thread.
+.IP
+Upon return from the system call, the calling
+thread has a guarantee that all its running thread siblings have passed
+through a state where all memory accesses to user-space addresses match
+program order between entry to and return from the system call
+(non-running threads are de facto in such a state).
+This guarantee is provided only for threads in
+the same process as the calling thread.
+.IP
+The "expedited" commands complete faster than the non-expedited ones;
+they never block, but have the downside of causing extra overhead.
+.IP
+A process must register its intent to use the private
+expedited command prior to using it.
+.TP
+.BR MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED " (since Linux 4.14)"
+Register the process's intent to use
+.BR MEMBARRIER_CMD_PRIVATE_EXPEDITED .
+.TP
+.BR MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE " (since Linux 4.16)"
+In addition to providing the memory ordering guarantees described in
+.BR MEMBARRIER_CMD_PRIVATE_EXPEDITED ,
+upon return from system call the calling thread has a guarantee that all its
+running thread siblings have executed a core serializing instruction.
+This guarantee is provided only for threads in
+the same process as the calling thread.
+.IP
+The "expedited" commands complete faster than the non-expedited ones;
+they never block, but have the downside of causing extra overhead.
+.IP
+A process must register its intent to use the private expedited sync
+core command prior to using it.
+.TP
+.BR MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE " (since Linux 4.16)"
+Register the process's intent to use
+.BR MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE .
+.TP
+.BR MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ " (since Linux 5.10)"
+Ensure the caller thread, upon return from system call, that all its
+running thread siblings have any currently running rseq critical sections
+restarted if
+.I flags
+parameter is 0; if
+.I flags
+parameter is
+.BR MEMBARRIER_CMD_FLAG_CPU ,
+then this operation is performed only on CPU indicated by
+.IR cpu_id .
+This guarantee is provided only for threads in
+the same process as the calling thread.
+.IP
+RSEQ membarrier is only available in the "private expedited" form.
+.IP
+A process must register its intent to use the private expedited rseq
+command prior to using it.
+.TP
+.BR MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ " (since Linux 5.10)"
+Register the process's intent to use
+.BR MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ .
+.TP
+.BR MEMBARRIER_CMD_SHARED " (since Linux 4.3)"
+This is an alias for
+.B MEMBARRIER_CMD_GLOBAL
+that exists for header backward compatibility.
+.PP
+The
+.I flags
+argument must be specified as 0 unless the command is
+.BR MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ ,
+in which case
+.I flags
+can be either 0 or
+.BR MEMBARRIER_CMD_FLAG_CPU .
+.PP
+The
+.I cpu_id
+argument is ignored unless
+.I flags
+is
+.BR MEMBARRIER_CMD_FLAG_CPU ,
+in which case it must specify the CPU targeted by this membarrier
+command.
+.PP
+All memory accesses performed in program order from each targeted thread
+are guaranteed to be ordered with respect to
+.BR membarrier ().
+.PP
+If we use the semantic
+.I barrier()
+to represent a compiler barrier forcing memory
+accesses to be performed in program order across the barrier, and
+.I smp_mb()
+to represent explicit memory barriers forcing full memory
+ordering across the barrier, we have the following ordering table for
+each pairing of
+.IR barrier() ,
+.BR membarrier (),
+and
+.IR smp_mb() .
+The pair ordering is detailed as (O: ordered, X: not ordered):
+.PP
+.RS
+.TS
+l c c c.
+\& barrier() smp_mb() membarrier()
+barrier() X X O
+smp_mb() X O O
+membarrier() O O O
+.TE
+.RE
+.SH RETURN VALUE
+On success, the
+.B MEMBARRIER_CMD_QUERY
+operation returns a bit mask of supported commands, and the
+.BR MEMBARRIER_CMD_GLOBAL ,
+.BR MEMBARRIER_CMD_GLOBAL_EXPEDITED ,
+.BR MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED ,
+.BR MEMBARRIER_CMD_PRIVATE_EXPEDITED ,
+.BR MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED ,
+.BR MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE ,
+and
+.B MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE
+operations return zero.
+On error, \-1 is returned,
+and
+.I errno
+is set to indicate the error.
+.PP
+For a given command, with
+.I flags
+set to 0, this system call is
+guaranteed to always return the same value until reboot.
+Further calls with the same arguments will lead to the same result.
+Therefore, with
+.I flags
+set to 0, error handling is required only for the first call to
+.BR membarrier ().
+.SH ERRORS
+.TP
+.B EINVAL
+.I cmd
+is invalid, or
+.I flags
+is nonzero, or the
+.B MEMBARRIER_CMD_GLOBAL
+command is disabled because the
+.I nohz_full
+CPU parameter has been set, or the
+.B MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE
+and
+.B MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE
+commands are not implemented by the architecture.
+.TP
+.B ENOSYS
+The
+.BR membarrier ()
+system call is not implemented by this kernel.
+.TP
+.B EPERM
+The current process was not registered prior to using private expedited
+commands.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 4.3.
+.PP
+Before Linux 5.10, the prototype was:
+.PP
+.in +4n
+.EX
+.BI "int membarrier(int " cmd ", int " flags );
+.EE
+.in
+.SH NOTES
+A memory barrier instruction is part of the instruction set of
+architectures with weakly ordered memory models.
+It orders memory
+accesses prior to the barrier and after the barrier with respect to
+matching barriers on other cores.
+For instance, a load fence can order
+loads prior to and following that fence with respect to stores ordered
+by store fences.
+.PP
+Program order is the order in which instructions are ordered in the
+program assembly code.
+.PP
+Examples where
+.BR membarrier ()
+can be useful include implementations
+of Read-Copy-Update libraries and garbage collectors.
+.SH EXAMPLES
+Assuming a multithreaded application where "fast_path()" is executed
+very frequently, and where "slow_path()" is executed infrequently, the
+following code (x86) can be transformed using
+.BR membarrier ():
+.PP
+.in +4n
+.\" SRC BEGIN (membarrier.c)
+.EX
+#include <stdlib.h>
+\&
+static volatile int a, b;
+\&
+static void
+fast_path(int *read_b)
+{
+ a = 1;
+ asm volatile ("mfence" : : : "memory");
+ *read_b = b;
+}
+\&
+static void
+slow_path(int *read_a)
+{
+ b = 1;
+ asm volatile ("mfence" : : : "memory");
+ *read_a = a;
+}
+\&
+int
+main(void)
+{
+ int read_a, read_b;
+\&
+ /*
+ * Real applications would call fast_path() and slow_path()
+ * from different threads. Call those from main() to keep
+ * this example short.
+ */
+\&
+ slow_path(&read_a);
+ fast_path(&read_b);
+\&
+ /*
+ * read_b == 0 implies read_a == 1 and
+ * read_a == 0 implies read_b == 1.
+ */
+\&
+ if (read_b == 0 && read_a == 0)
+ abort();
+\&
+ exit(EXIT_SUCCESS);
+}
+.EE
+.\" SRC END
+.in
+.PP
+The code above transformed to use
+.BR membarrier ()
+becomes:
+.PP
+.in +4n
+.EX
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <linux/membarrier.h>
+\&
+static volatile int a, b;
+\&
+static int
+membarrier(int cmd, unsigned int flags, int cpu_id)
+{
+ return syscall(__NR_membarrier, cmd, flags, cpu_id);
+}
+\&
+static int
+init_membarrier(void)
+{
+ int ret;
+\&
+ /* Check that membarrier() is supported. */
+\&
+ ret = membarrier(MEMBARRIER_CMD_QUERY, 0, 0);
+ if (ret < 0) {
+ perror("membarrier");
+ return \-1;
+ }
+\&
+ if (!(ret & MEMBARRIER_CMD_GLOBAL)) {
+ fprintf(stderr,
+ "membarrier does not support MEMBARRIER_CMD_GLOBAL\en");
+ return \-1;
+ }
+\&
+ return 0;
+}
+\&
+static void
+fast_path(int *read_b)
+{
+ a = 1;
+ asm volatile ("" : : : "memory");
+ *read_b = b;
+}
+\&
+static void
+slow_path(int *read_a)
+{
+ b = 1;
+ membarrier(MEMBARRIER_CMD_GLOBAL, 0, 0);
+ *read_a = a;
+}
+\&
+int
+main(int argc, char *argv[])
+{
+ int read_a, read_b;
+\&
+ if (init_membarrier())
+ exit(EXIT_FAILURE);
+\&
+ /*
+ * Real applications would call fast_path() and slow_path()
+ * from different threads. Call those from main() to keep
+ * this example short.
+ */
+\&
+ slow_path(&read_a);
+ fast_path(&read_b);
+\&
+ /*
+ * read_b == 0 implies read_a == 1 and
+ * read_a == 0 implies read_b == 1.
+ */
+\&
+ if (read_b == 0 && read_a == 0)
+ abort();
+\&
+ exit(EXIT_SUCCESS);
+}
+.EE
+.in
+.\" .SH SEE ALSO
+.\" FIXME See if the following syscalls make it into Linux 4.15 or later
+.\" .BR cpu_opv (2),
+.\" .BR rseq (2)
diff --git a/man2/memfd_create.2 b/man2/memfd_create.2
new file mode 100644
index 0000000..fb18abc
--- /dev/null
+++ b/man2/memfd_create.2
@@ -0,0 +1,545 @@
+.\" Copyright (C) 2014 Michael Kerrisk <mtk.manpages@gmail.com>
+.\" and Copyright (C) 2014 David Herrmann <dh.herrmann@gmail.com>
+.\"
+.\" SPDX-License-Identifier: GPL-2.0-or-later
+.\"
+.TH memfd_create 2 2023-05-03 "Linux man-pages 6.05.01"
+.SH NAME
+memfd_create \- create an anonymous file
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#define _GNU_SOURCE" " /* See feature_test_macros(7) */"
+.B #include <sys/mman.h>
+.PP
+.BI "int memfd_create(const char *" name ", unsigned int " flags ");"
+.fi
+.SH DESCRIPTION
+.BR memfd_create ()
+creates an anonymous file and returns a file descriptor that refers to it.
+The file behaves like a regular file, and so can be modified,
+truncated, memory-mapped, and so on.
+However, unlike a regular file,
+it lives in RAM and has a volatile backing storage.
+Once all references to the file are dropped, it is automatically released.
+Anonymous memory is used for all backing pages of the file.
+Therefore, files created by
+.BR memfd_create ()
+have the same semantics as other anonymous
+.\" David Herrmann:
+.\" memfd uses VM_NORESERVE so each page is accounted on first access.
+.\" This means, the overcommit-limits (see __vm_enough_memory()) and the
+.\" memory-cgroup limits (mem_cgroup_try_charge()) are applied. Note that
+.\" those are accounted on "current" and "current->mm", that is, the
+.\" process doing the first page access.
+memory allocations such as those allocated using
+.BR mmap (2)
+with the
+.B MAP_ANONYMOUS
+flag.
+.PP
+The initial size of the file is set to 0.
+Following the call, the file size should be set using
+.BR ftruncate (2).
+(Alternatively, the file may be populated by calls to
+.BR write (2)
+or similar.)
+.PP
+The name supplied in
+.I name
+is used as a filename and will be displayed
+as the target of the corresponding symbolic link in the directory
+.IR /proc/self/fd/ .
+The displayed name is always prefixed with
+.I memfd:
+and serves only for debugging purposes.
+Names do not affect the behavior of the file descriptor,
+and as such multiple files can have the same name without any side effects.
+.PP
+The following values may be bitwise ORed in
+.I flags
+to change the behavior of
+.BR memfd_create ():
+.TP
+.B MFD_CLOEXEC
+Set the close-on-exec
+.RB ( FD_CLOEXEC )
+flag on the new file descriptor.
+See the description of the
+.B O_CLOEXEC
+flag in
+.BR open (2)
+for reasons why this may be useful.
+.TP
+.B MFD_ALLOW_SEALING
+Allow sealing operations on this file.
+See the discussion of the
+.B F_ADD_SEALS
+and
+.B F_GET_SEALS
+operations in
+.BR fcntl (2),
+and also NOTES, below.
+The initial set of seals is empty.
+If this flag is not set, the initial set of seals will be
+.BR F_SEAL_SEAL ,
+meaning that no other seals can be set on the file.
+.\" FIXME Why is the MFD_ALLOW_SEALING behavior not simply the default?
+.\" Is it worth adding some text explaining this?
+.TP
+.BR MFD_HUGETLB " (since Linux 4.14)"
+.\" commit 749df87bd7bee5a79cef073f5d032ddb2b211de8
+The anonymous file will be created in the hugetlbfs filesystem using
+huge pages.
+See the Linux kernel source file
+.I Documentation/admin\-guide/mm/hugetlbpage.rst
+for more information about hugetlbfs.
+.\" commit 47b9012ecdc747f6936395265e677d41e11a31ff
+Specifying both
+.B MFD_HUGETLB
+and
+.B MFD_ALLOW_SEALING
+in
+.I flags
+is supported since Linux 4.16.
+.TP
+.BR MFD_HUGE_2MB ", " MFD_HUGE_1GB ", " "..."
+Used in conjunction with
+.B MFD_HUGETLB
+to select alternative hugetlb page sizes (respectively, 2\ MB, 1\ GB, ...)
+on systems that support multiple hugetlb page sizes.
+Definitions for known
+huge page sizes are included in the header file
+.IR <linux/memfd.h> .
+.IP
+For details on encoding huge page sizes not included in the header file,
+see the discussion of the similarly named constants in
+.BR mmap (2).
+.PP
+Unused bits in
+.I flags
+must be 0.
+.PP
+As its return value,
+.BR memfd_create ()
+returns a new file descriptor that can be used to refer to the file.
+This file descriptor is opened for both reading and writing
+.RB ( O_RDWR )
+and
+.B O_LARGEFILE
+is set for the file descriptor.
+.PP
+With respect to
+.BR fork (2)
+and
+.BR execve (2),
+the usual semantics apply for the file descriptor created by
+.BR memfd_create ().
+A copy of the file descriptor is inherited by the child produced by
+.BR fork (2)
+and refers to the same file.
+The file descriptor is preserved across
+.BR execve (2),
+unless the close-on-exec flag has been set.
+.SH RETURN VALUE
+On success,
+.BR memfd_create ()
+returns a new file descriptor.
+On error, \-1 is returned and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EFAULT
+The address in
+.I name
+points to invalid memory.
+.TP
+.B EINVAL
+.I flags
+included unknown bits.
+.TP
+.B EINVAL
+.I name
+was too long.
+(The limit is
+.\" NAME_MAX - strlen("memfd:")
+249 bytes, excluding the terminating null byte.)
+.TP
+.B EINVAL
+Both
+.B MFD_HUGETLB
+and
+.B MFD_ALLOW_SEALING
+were specified in
+.IR flags " (before Linux 4.16)."
+.TP
+.B EMFILE
+The per-process limit on the number of open file descriptors has been reached.
+.TP
+.B ENFILE
+The system-wide limit on the total number of open files has been reached.
+.TP
+.B ENOMEM
+There was insufficient memory to create a new anonymous file.
+.TP
+.B EPERM
+The
+.B MFD_HUGETLB
+flag was specified, but the caller was not privileged (did not have the
+.B CAP_IPC_LOCK
+capability)
+and is not a member of the
+.I sysctl_hugetlb_shm_group
+group; see the description of
+.I /proc/sys/vm/sysctl_hugetlb_shm_group
+in
+.BR proc (5).
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 3.17,
+glibc 2.27.
+.SH NOTES
+.\" See also http://lwn.net/Articles/593918/
+.\" and http://lwn.net/Articles/594919/ and http://lwn.net/Articles/591108/
+The
+.BR memfd_create ()
+system call provides a simple alternative to manually mounting a
+.BR tmpfs (5)
+filesystem and creating and opening a file in that filesystem.
+The primary purpose of
+.BR memfd_create ()
+is to create files and associated file descriptors that are
+used with the file-sealing APIs provided by
+.BR fcntl (2).
+.PP
+The
+.BR memfd_create ()
+system call also has uses without file sealing
+(which is why file-sealing is disabled, unless explicitly requested with the
+.B MFD_ALLOW_SEALING
+flag).
+In particular, it can be used as an alternative to creating files in
+.I /tmp
+or as an alternative to using the
+.BR open (2)
+.B O_TMPFILE
+flag in cases where there is no intention to actually link the
+resulting file into the filesystem.
+.SS File sealing
+In the absence of file sealing,
+processes that communicate via shared memory must either trust each other,
+or take measures to deal with the possibility that an untrusted peer
+may manipulate the shared memory region in problematic ways.
+For example, an untrusted peer might modify the contents of the
+shared memory at any time, or shrink the shared memory region.
+The former possibility leaves the local process vulnerable to
+time-of-check-to-time-of-use race conditions
+(typically dealt with by copying data from
+the shared memory region before checking and using it).
+The latter possibility leaves the local process vulnerable to
+.B SIGBUS
+signals when an attempt is made to access a now-nonexistent
+location in the shared memory region.
+(Dealing with this possibility necessitates the use of a handler for the
+.B SIGBUS
+signal.)
+.PP
+Dealing with untrusted peers imposes extra complexity on
+code that employs shared memory.
+Memory sealing enables that extra complexity to be eliminated,
+by allowing a process to operate secure in the knowledge that
+its peer can't modify the shared memory in an undesired fashion.
+.PP
+An example of the usage of the sealing mechanism is as follows:
+.IP (1) 5
+The first process creates a
+.BR tmpfs (5)
+file using
+.BR memfd_create ().
+The call yields a file descriptor used in subsequent steps.
+.IP (2)
+The first process
+sizes the file created in the previous step using
+.BR ftruncate (2),
+maps it using
+.BR mmap (2),
+and populates the shared memory with the desired data.
+.IP (3)
+The first process uses the
+.BR fcntl (2)
+.B F_ADD_SEALS
+operation to place one or more seals on the file,
+in order to restrict further modifications on the file.
+(If placing the seal
+.BR F_SEAL_WRITE ,
+then it will be necessary to first unmap the shared writable mapping
+created in the previous step.
+Otherwise, behavior similar to
+.B F_SEAL_WRITE
+can be achieved by using
+.BR F_SEAL_FUTURE_WRITE ,
+which will prevent future writes via
+.BR mmap (2)
+and
+.BR write (2)
+from succeeding while keeping existing shared writable mappings).
+.IP (4)
+A second process obtains a file descriptor for the
+.BR tmpfs (5)
+file and maps it.
+Among the possible ways in which this could happen are the following:
+.RS
+.IP \[bu] 3
+The process that called
+.BR memfd_create ()
+could transfer the resulting file descriptor to the second process
+via a UNIX domain socket (see
+.BR unix (7)
+and
+.BR cmsg (3)).
+The second process then maps the file using
+.BR mmap (2).
+.IP \[bu]
+The second process is created via
+.BR fork (2)
+and thus automatically inherits the file descriptor and mapping.
+(Note that in this case and the next,
+there is a natural trust relationship between the two processes,
+since they are running under the same user ID.
+Therefore, file sealing would not normally be necessary.)
+.IP \[bu]
+The second process opens the file
+.IR /proc/ pid /fd/ fd,
+where
+.I <pid>
+is the PID of the first process (the one that called
+.BR memfd_create ()),
+and
+.I <fd>
+is the number of the file descriptor returned by the call to
+.BR memfd_create ()
+in that process.
+The second process then maps the file using
+.BR mmap (2).
+.RE
+.IP (5)
+The second process uses the
+.BR fcntl (2)
+.B F_GET_SEALS
+operation to retrieve the bit mask of seals
+that has been applied to the file.
+This bit mask can be inspected in order to determine
+what kinds of restrictions have been placed on file modifications.
+If desired, the second process can apply further seals
+to impose additional restrictions (so long as the
+.B F_SEAL_SEAL
+seal has not yet been applied).
+.SH EXAMPLES
+Below are shown two example programs that demonstrate the use of
+.BR memfd_create ()
+and the file sealing API.
+.PP
+The first program,
+.IR t_memfd_create.c ,
+creates a
+.BR tmpfs (5)
+file using
+.BR memfd_create (),
+sets a size for the file, maps it into memory,
+and optionally places some seals on the file.
+The program accepts up to three command-line arguments,
+of which the first two are required.
+The first argument is the name to associate with the file,
+the second argument is the size to be set for the file,
+and the optional third argument is a string of characters that specify
+seals to be set on the file.
+.PP
+The second program,
+.IR t_get_seals.c ,
+can be used to open an existing file that was created via
+.BR memfd_create ()
+and inspect the set of seals that have been applied to that file.
+.PP
+The following shell session demonstrates the use of these programs.
+First we create a
+.BR tmpfs (5)
+file and set some seals on it:
+.PP
+.in +4n
+.EX
+$ \fB./t_memfd_create my_memfd_file 4096 sw &\fP
+[1] 11775
+PID: 11775; fd: 3; /proc/11775/fd/3
+.EE
+.in
+.PP
+At this point, the
+.I t_memfd_create
+program continues to run in the background.
+From another program, we can obtain a file descriptor for the
+file created by
+.BR memfd_create ()
+by opening the
+.IR /proc/ pid /fd
+file that corresponds to the file descriptor opened by
+.BR memfd_create ().
+Using that pathname, we inspect the content of the
+.IR /proc/ pid /fd
+symbolic link, and use our
+.I t_get_seals
+program to view the seals that have been placed on the file:
+.PP
+.in +4n
+.EX
+$ \fBreadlink /proc/11775/fd/3\fP
+/memfd:my_memfd_file (deleted)
+$ \fB./t_get_seals /proc/11775/fd/3\fP
+Existing seals: WRITE SHRINK
+.EE
+.in
+.SS Program source: t_memfd_create.c
+\&
+.\" SRC BEGIN (t_memfd_create.c)
+.EX
+#define _GNU_SOURCE
+#include <err.h>
+#include <fcntl.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <unistd.h>
+\&
+int
+main(int argc, char *argv[])
+{
+ int fd;
+ char *name, *seals_arg;
+ ssize_t len;
+ unsigned int seals;
+\&
+ if (argc < 3) {
+ fprintf(stderr, "%s name size [seals]\en", argv[0]);
+ fprintf(stderr, "\et\[aq]seals\[aq] can contain any of the "
+ "following characters:\en");
+ fprintf(stderr, "\et\etg \- F_SEAL_GROW\en");
+ fprintf(stderr, "\et\ets \- F_SEAL_SHRINK\en");
+ fprintf(stderr, "\et\etw \- F_SEAL_WRITE\en");
+ fprintf(stderr, "\et\etW \- F_SEAL_FUTURE_WRITE\en");
+ fprintf(stderr, "\et\etS \- F_SEAL_SEAL\en");
+ exit(EXIT_FAILURE);
+ }
+\&
+ name = argv[1];
+ len = atoi(argv[2]);
+ seals_arg = argv[3];
+\&
+ /* Create an anonymous file in tmpfs; allow seals to be
+ placed on the file. */
+\&
+ fd = memfd_create(name, MFD_ALLOW_SEALING);
+ if (fd == \-1)
+ err(EXIT_FAILURE, "memfd_create");
+\&
+ /* Size the file as specified on the command line. */
+\&
+ if (ftruncate(fd, len) == \-1)
+ err(EXIT_FAILURE, "truncate");
+\&
+ printf("PID: %jd; fd: %d; /proc/%jd/fd/%d\en",
+ (intmax_t) getpid(), fd, (intmax_t) getpid(), fd);
+\&
+ /* Code to map the file and populate the mapping with data
+ omitted. */
+\&
+ /* If a \[aq]seals\[aq] command\-line argument was supplied, set some
+ seals on the file. */
+\&
+ if (seals_arg != NULL) {
+ seals = 0;
+\&
+ if (strchr(seals_arg, \[aq]g\[aq]) != NULL)
+ seals |= F_SEAL_GROW;
+ if (strchr(seals_arg, \[aq]s\[aq]) != NULL)
+ seals |= F_SEAL_SHRINK;
+ if (strchr(seals_arg, \[aq]w\[aq]) != NULL)
+ seals |= F_SEAL_WRITE;
+ if (strchr(seals_arg, \[aq]W\[aq]) != NULL)
+ seals |= F_SEAL_FUTURE_WRITE;
+ if (strchr(seals_arg, \[aq]S\[aq]) != NULL)
+ seals |= F_SEAL_SEAL;
+\&
+ if (fcntl(fd, F_ADD_SEALS, seals) == \-1)
+ err(EXIT_FAILURE, "fcntl");
+ }
+\&
+ /* Keep running, so that the file created by memfd_create()
+ continues to exist. */
+\&
+ pause();
+\&
+ exit(EXIT_SUCCESS);
+}
+.EE
+.\" SRC END
+.SS Program source: t_get_seals.c
+\&
+.\" SRC BEGIN (t_get_seals.c)
+.EX
+#define _GNU_SOURCE
+#include <err.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+\&
+int
+main(int argc, char *argv[])
+{
+ int fd;
+ unsigned int seals;
+\&
+ if (argc != 2) {
+ fprintf(stderr, "%s /proc/PID/fd/FD\en", argv[0]);
+ exit(EXIT_FAILURE);
+ }
+\&
+ fd = open(argv[1], O_RDWR);
+ if (fd == \-1)
+ err(EXIT_FAILURE, "open");
+\&
+ seals = fcntl(fd, F_GET_SEALS);
+ if (seals == \-1)
+ err(EXIT_FAILURE, "fcntl");
+\&
+ printf("Existing seals:");
+ if (seals & F_SEAL_SEAL)
+ printf(" SEAL");
+ if (seals & F_SEAL_GROW)
+ printf(" GROW");
+ if (seals & F_SEAL_WRITE)
+ printf(" WRITE");
+ if (seals & F_SEAL_FUTURE_WRITE)
+ printf(" FUTURE_WRITE");
+ if (seals & F_SEAL_SHRINK)
+ printf(" SHRINK");
+ printf("\en");
+\&
+ /* Code to map the file and access the contents of the
+ resulting mapping omitted. */
+\&
+ exit(EXIT_SUCCESS);
+}
+.EE
+.\" SRC END
+.SH SEE ALSO
+.BR fcntl (2),
+.BR ftruncate (2),
+.BR memfd_secret (2),
+.BR mmap (2),
+.BR shmget (2),
+.BR shm_open (3)
diff --git a/man2/memfd_secret.2 b/man2/memfd_secret.2
new file mode 100644
index 0000000..fcc39f6
--- /dev/null
+++ b/man2/memfd_secret.2
@@ -0,0 +1,204 @@
+.\" Copyright (c) 2021, IBM Corporation.
+.\" Written by Mike Rapoport <rppt@linux.ibm.com>
+.\"
+.\" Based on memfd_create(2) man page
+.\" Copyright (C) 2014 Michael Kerrisk <mtk.manpages@gmail.com>
+.\" and Copyright (C) 2014 David Herrmann <dh.herrmann@gmail.com>
+.\"
+.\" SPDX-License-Identifier: GPL-2.0-or-later
+.\"
+.TH memfd_secret 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+memfd_secret \- create an anonymous RAM-based file
+to access secret memory regions
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.PP
+.BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */"
+.B #include <unistd.h>
+.PP
+.BI "int syscall(SYS_memfd_secret, unsigned int " flags );
+.fi
+.PP
+.IR Note :
+glibc provides no wrapper for
+.BR memfd_secret (),
+necessitating the use of
+.BR syscall (2).
+.SH DESCRIPTION
+.BR memfd_secret ()
+creates an anonymous RAM-based file and returns a file descriptor
+that refers to it.
+The file provides a way to create and access memory regions
+with stronger protection than usual RAM-based files and
+anonymous memory mappings.
+Once all open references to the file are closed,
+it is automatically released.
+The initial size of the file is set to 0.
+Following the call, the file size should be set using
+.BR ftruncate (2).
+.PP
+The memory areas backing the file created with
+.BR memfd_secret ()
+are visible only to the processes that have access to the file descriptor.
+The memory region is removed from the kernel page tables
+and only the page tables of the processes holding the file descriptor
+map the corresponding physical memory.
+(Thus, the pages in the region can't be accessed by the kernel itself,
+so that, for example, pointers to the region can't be passed to
+system calls.)
+.PP
+The following values may be bitwise ORed in
+.I flags
+to control the behavior of
+.BR memfd_secret ():
+.TP
+.B FD_CLOEXEC
+Set the close-on-exec flag on the new file descriptor,
+which causes the region to be removed from the process on
+.BR execve (2).
+See the description of the
+.B O_CLOEXEC
+flag in
+.BR open (2).
+.PP
+As its return value,
+.BR memfd_secret ()
+returns a new file descriptor that refers to an anonymous file.
+This file descriptor is opened for both reading and writing
+.RB ( O_RDWR )
+and
+.B O_LARGEFILE
+is set for the file descriptor.
+.PP
+With respect to
+.BR fork (2)
+and
+.BR execve (2),
+the usual semantics apply for the file descriptor created by
+.BR memfd_secret ().
+A copy of the file descriptor is inherited by the child produced by
+.BR fork (2)
+and refers to the same file.
+The file descriptor is preserved across
+.BR execve (2),
+unless the close-on-exec flag has been set.
+.PP
+The memory region is locked into memory in the same way as with
+.BR mlock (2),
+so that it will never be written into swap,
+and hibernation is inhibited for as long as any
+.BR memfd_secret ()
+descriptions exist.
+However the implementation of
+.BR memfd_secret ()
+will not try to populate the whole range during the
+.BR mmap (2)
+call that attaches the region into the process's address space;
+instead, the pages are only actually allocated
+as they are faulted in.
+The amount of memory allowed for memory mappings
+of the file descriptor obeys the same rules as
+.BR mlock (2)
+and cannot exceed
+.BR RLIMIT_MEMLOCK .
+.SH RETURN VALUE
+On success,
+.BR memfd_secret ()
+returns a new file descriptor.
+On error, \-1 is returned and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EINVAL
+.I flags
+included unknown bits.
+.TP
+.B EMFILE
+The per-process limit on the number of open file descriptors has been reached.
+.TP
+.B ENFILE
+The system-wide limit on the total number of open files has been reached.
+.TP
+.B ENOMEM
+There was insufficient memory to create a new anonymous file.
+.TP
+.B ENOSYS
+.BR memfd_secret ()
+is not implemented on this architecture,
+or has not been enabled on the kernel command-line with
+.BR secretmem.enable =1.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 5.14.
+.SH NOTES
+The
+.BR memfd_secret ()
+system call is designed to allow a user-space process
+to create a range of memory that is inaccessible to anybody else -
+kernel included.
+There is no 100% guarantee that the kernel won't be able to access
+memory ranges backed by
+.BR memfd_secret ()
+in any circumstances, but nevertheless,
+it is much harder to exfiltrate data from these regions.
+.PP
+.BR memfd_secret ()
+provides the following protections:
+.IP \[bu] 3
+Enhanced protection
+(in conjunction with all the other in-kernel attack prevention systems)
+against ROP attacks.
+Absence of any in-kernel primitive for accessing memory backed by
+.BR memfd_secret ()
+means that one-gadget ROP attack
+can't work to perform data exfiltration.
+The attacker would need to find enough ROP gadgets
+to reconstruct the missing page table entries,
+which significantly increases difficulty of the attack,
+especially when other protections like the kernel stack size limit
+and address space layout randomization are in place.
+.IP \[bu]
+Prevent cross-process user-space memory exposures.
+Once a region for a
+.BR memfd_secret ()
+memory mapping is allocated,
+the user can't accidentally pass it into the kernel
+to be transmitted somewhere.
+The memory pages in this region cannot be accessed via the direct map
+and they are disallowed in get_user_pages.
+.IP \[bu]
+Harden against exploited kernel flaws.
+In order to access memory areas backed by
+.BR memfd_secret (),
+a kernel-side attack would need to
+either walk the page tables and create new ones,
+or spawn a new privileged user-space process to perform
+secrets exfiltration using
+.BR ptrace (2).
+.PP
+The way
+.BR memfd_secret ()
+allocates and locks the memory may impact overall system performance,
+therefore the system call is disabled by default and only available
+if the system administrator turned it on using
+"secretmem.enable=y" kernel parameter.
+.PP
+To prevent potential data leaks of memory regions backed by
+.BR memfd_secret ()
+from a hibernation image,
+hibernation is prevented when there are active
+.BR memfd_secret ()
+users.
+.SH SEE ALSO
+.BR fcntl (2),
+.BR ftruncate (2),
+.BR mlock (2),
+.BR memfd_create (2),
+.BR mmap (2),
+.BR setrlimit (2)
diff --git a/man2/migrate_pages.2 b/man2/migrate_pages.2
new file mode 100644
index 0000000..177f463
--- /dev/null
+++ b/man2/migrate_pages.2
@@ -0,0 +1,174 @@
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft-2-para
+.\"
+.\" Copyright 2009 Intel Corporation
+.\" Author: Andi Kleen
+.\" Based on the move_pages manpage which was
+.\" This manpage is Copyright (C) 2006 Silicon Graphics, Inc.
+.\" Christoph Lameter
+.\"
+.TH migrate_pages 2 2023-07-15 "Linux man-pages 6.05.01"
+.SH NAME
+migrate_pages \- move all pages in a process to another set of nodes
+.SH LIBRARY
+NUMA (Non-Uniform Memory Access) policy library
+.RI ( libnuma ", " \-lnuma )
+.SH SYNOPSIS
+.nf
+.B #include <numaif.h>
+.PP
+.BI "long migrate_pages(int " pid ", unsigned long " maxnode,
+.BI " const unsigned long *" old_nodes,
+.BI " const unsigned long *" new_nodes );
+.fi
+.SH DESCRIPTION
+.BR migrate_pages ()
+attempts to move all pages of the process
+.I pid
+that are in memory nodes
+.I old_nodes
+to the memory nodes in
+.IR new_nodes .
+Pages not located in any node in
+.I old_nodes
+will not be migrated.
+As far as possible,
+the kernel maintains the relative topology relationship inside
+.I old_nodes
+during the migration to
+.IR new_nodes .
+.PP
+The
+.I old_nodes
+and
+.I new_nodes
+arguments are pointers to bit masks of node numbers, with up to
+.I maxnode
+bits in each mask.
+These masks are maintained as arrays of unsigned
+.I long
+integers (in the last
+.I long
+integer, the bits beyond those specified by
+.I maxnode
+are ignored).
+The
+.I maxnode
+argument is the maximum node number in the bit mask plus one (this is the same
+as in
+.BR mbind (2),
+but different from
+.BR select (2)).
+.PP
+The
+.I pid
+argument is the ID of the process whose pages are to be moved.
+To move pages in another process,
+the caller must be privileged
+.RB ( CAP_SYS_NICE )
+or the real or effective user ID of the calling process must match the
+real or saved-set user ID of the target process.
+If
+.I pid
+is 0, then
+.BR migrate_pages ()
+moves pages of the calling process.
+.PP
+Pages shared with another process will be moved only if the initiating
+process has the
+.B CAP_SYS_NICE
+privilege.
+.SH RETURN VALUE
+On success
+.BR migrate_pages ()
+returns the number of pages that could not be moved
+(i.e., a return of zero means that all pages were successfully moved).
+On error, it returns \-1, and sets
+.I errno
+to indicate the error.
+.SH ERRORS
+.TP
+.B EFAULT
+Part or all of the memory range specified by
+.IR old_nodes / new_nodes
+and
+.I maxnode
+points outside your accessible address space.
+.TP
+.B EINVAL
+The value specified by
+.I maxnode
+exceeds a kernel-imposed limit.
+.\" As at 3.5, this limit is "a page worth of bits", e.g.,
+.\" 8 * 4096 bits, assuming a 4kB page size.
+Or,
+.I old_nodes
+or
+.I new_nodes
+specifies one or more node IDs that are
+greater than the maximum supported node ID.
+Or, none of the node IDs specified by
+.I new_nodes
+are on-line and allowed by the process's current cpuset context,
+or none of the specified nodes contain memory.
+.TP
+.B EPERM
+Insufficient privilege
+.RB ( CAP_SYS_NICE )
+to move pages of the process specified by
+.IR pid ,
+or insufficient privilege
+.RB ( CAP_SYS_NICE )
+to access the specified target nodes.
+.TP
+.B ESRCH
+No process matching
+.I pid
+could be found.
+.\" FIXME Document the other errors that can occur for migrate_pages()
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 2.6.16.
+.SH NOTES
+For information on library support, see
+.BR numa (7).
+.PP
+Use
+.BR get_mempolicy (2)
+with the
+.B MPOL_F_MEMS_ALLOWED
+flag to obtain the set of nodes that are allowed by
+the calling process's cpuset.
+Note that this information is subject to change at any
+time by manual or automatic reconfiguration of the cpuset.
+.PP
+Use of
+.BR migrate_pages ()
+may result in pages whose location
+(node) violates the memory policy established for the
+specified addresses (see
+.BR mbind (2))
+and/or the specified process (see
+.BR set_mempolicy (2)).
+That is, memory policy does not constrain the destination
+nodes used by
+.BR migrate_pages ().
+.PP
+The
+.I <numaif.h>
+header is not included with glibc, but requires installing
+.I libnuma\-devel
+or a similar package.
+.SH SEE ALSO
+.BR get_mempolicy (2),
+.BR mbind (2),
+.BR set_mempolicy (2),
+.BR numa (3),
+.BR numa_maps (5),
+.BR cpuset (7),
+.BR numa (7),
+.BR migratepages (8),
+.BR numastat (8)
+.PP
+.I Documentation/vm/page_migration.rst
+in the Linux kernel source tree
diff --git a/man2/mincore.2 b/man2/mincore.2
new file mode 100644
index 0000000..9ffca56
--- /dev/null
+++ b/man2/mincore.2
@@ -0,0 +1,158 @@
+.\" Copyright (C) 2001 Bert Hubert <ahu@ds9a.nl>
+.\" and Copyright (C) 2007 Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Created Sun Jun 3 17:23:32 2001 by bert hubert <ahu@ds9a.nl>
+.\" Slightly adapted, following comments by Hugh Dickins, aeb, 2001-06-04.
+.\" Modified, 20 May 2003, Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Modified, 30 Apr 2004, Michael Kerrisk <mtk.manpages@gmail.com>
+.\" 2005-04-05 mtk, Fixed error descriptions
+.\" after message from <gordon.jin@intel.com>
+.\" 2007-01-08 mtk, rewrote various parts
+.\"
+.TH mincore 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+mincore \- determine whether pages are resident in memory
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/mman.h>
+.PP
+.BI "int mincore(void " addr [. length "], size_t " length ", unsigned char *" vec );
+.fi
+.PP
+.RS -4
+Feature Test Macro Requirements for glibc (see
+.BR feature_test_macros (7)):
+.RE
+.PP
+.BR mincore ():
+.nf
+ Since glibc 2.19:
+ _DEFAULT_SOURCE
+ glibc 2.19 and earlier:
+ _BSD_SOURCE || _SVID_SOURCE
+.fi
+.SH DESCRIPTION
+.BR mincore ()
+returns a vector that indicates whether pages
+of the calling process's virtual memory are resident in core (RAM),
+and so will not cause a disk access (page fault) if referenced.
+The kernel returns residency information about the pages
+starting at the address
+.IR addr ,
+and continuing for
+.I length
+bytes.
+.PP
+The
+.I addr
+argument must be a multiple of the system page size.
+The
+.I length
+argument need not be a multiple of the page size,
+but since residency information is returned for whole pages,
+.I length
+is effectively rounded up to the next multiple of the page size.
+One may obtain the page size
+.RB ( PAGE_SIZE )
+using
+.IR sysconf(_SC_PAGESIZE) .
+.PP
+The
+.I vec
+argument must point to an array containing at least
+.I "(length+PAGE_SIZE\-1) / PAGE_SIZE"
+bytes.
+On return,
+the least significant bit of each byte will be set if
+the corresponding page is currently resident in memory,
+and be clear otherwise.
+(The settings of the other bits in each byte are undefined;
+these bits are reserved for possible later use.)
+Of course the information returned in
+.I vec
+is only a snapshot: pages that are not
+locked in memory can come and go at any moment, and the contents of
+.I vec
+may already be stale by the time this call returns.
+.SH RETURN VALUE
+On success,
+.BR mincore ()
+returns zero.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EAGAIN
+Kernel is temporarily out of resources.
+.TP
+.B EFAULT
+.IR vec " points to an invalid address."
+.TP
+.B EINVAL
+.I addr
+is not a multiple of the page size.
+.TP
+.B ENOMEM
+.I length
+is greater than
+.RI ( TASK_SIZE " \- " addr ).
+(This could occur if a negative value is specified for
+.IR length ,
+since that value will be interpreted as a large
+unsigned integer.)
+In Linux 2.6.11 and earlier, the error
+.B EINVAL
+was returned for this condition.
+.TP
+.B ENOMEM
+.I addr
+to
+.I addr
++
+.I length
+contained unmapped memory.
+.SH STANDARDS
+None.
+.SH HISTORY
+Linux 2.3.99pre1,
+glibc 2.2.
+.PP
+First appeared in 4.4BSD.
+.PP
+NetBSD, FreeBSD, OpenBSD, Solaris 8,
+AIX 5.1, SunOS 4.1.
+.SH BUGS
+Before Linux 2.6.21,
+.BR mincore ()
+did not return correct information for
+.B MAP_PRIVATE
+mappings, or for nonlinear mappings (established using
+.BR remap_file_pages (2)).
+.\" Linux (up to now, 2.6.5),
+.\" .B mincore
+.\" does not return correct information for MAP_PRIVATE mappings:
+.\" for a MAP_PRIVATE file mapping,
+.\" .B mincore
+.\" returns the residency of the file pages, rather than any
+.\" modified process-private pages that have been copied on write;
+.\" for a MAP_PRIVATE mapping of
+.\" .IR /dev/zero ,
+.\" .B mincore
+.\" always reports pages as nonresident;
+.\" and for a MAP_PRIVATE, MAP_ANONYMOUS mapping,
+.\" .B mincore
+.\" always fails with the error
+.\" .BR ENOMEM .
+.SH SEE ALSO
+.BR fincore (1),
+.BR madvise (2),
+.BR mlock (2),
+.BR mmap (2),
+.BR posix_fadvise (2),
+.BR posix_madvise (3)
diff --git a/man2/mkdir.2 b/man2/mkdir.2
new file mode 100644
index 0000000..c3342bd
--- /dev/null
+++ b/man2/mkdir.2
@@ -0,0 +1,250 @@
+.\" This manpage is Copyright (C) 1992 Drew Eckhardt;
+.\" and Copyright (C) 1993 Michael Haardt
+.\" and Copyright (C) 1993,1994 Ian Jackson
+.\" and Copyright (C) 2006, 2014 Michael Kerrisk
+.\"
+.\" SPDX-License-Identifier: GPL-1.0-or-later
+.\"
+.TH mkdir 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+mkdir, mkdirat \- create a directory
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/stat.h>
+.\" .B #include <unistd.h>
+.PP
+.BI "int mkdir(const char *" pathname ", mode_t " mode );
+.PP
+.BR "#include <fcntl.h> " "/* Definition of AT_* constants */"
+.B #include <sys/stat.h>
+.PP
+.BI "int mkdirat(int " dirfd ", const char *" pathname ", mode_t " mode );
+.fi
+.PP
+.RS -4
+Feature Test Macro Requirements for glibc (see
+.BR feature_test_macros (7)):
+.RE
+.PP
+.BR mkdirat ():
+.nf
+ Since glibc 2.10:
+ _POSIX_C_SOURCE >= 200809L
+ Before glibc 2.10:
+ _ATFILE_SOURCE
+.fi
+.SH DESCRIPTION
+.BR mkdir ()
+attempts to create a directory named
+.IR pathname .
+.PP
+The argument
+.I mode
+specifies the mode for the new directory (see
+.BR inode (7)).
+It is modified by the process's
+.I umask
+in the usual way: in the absence of a default ACL, the mode of the
+created directory is
+.RI ( mode " & \[ti]" umask " & 0777)."
+Whether other
+.I mode
+bits are honored for the created directory depends on the operating system.
+For Linux, see NOTES below.
+.PP
+The newly created directory will be owned by the effective user ID of the
+process.
+If the directory containing the file has the set-group-ID
+bit set, or if the filesystem is mounted with BSD group semantics
+.RI ( "mount \-o bsdgroups"
+or, synonymously
+.IR "mount \-o grpid" ),
+the new directory will inherit the group ownership from its parent;
+otherwise it will be owned by the effective group ID of the process.
+.PP
+If the parent directory has the set-group-ID bit set, then so will the
+newly created directory.
+.\"
+.\"
+.SS mkdirat()
+The
+.BR mkdirat ()
+system call operates in exactly the same way as
+.BR mkdir (),
+except for the differences described here.
+.PP
+If the pathname given in
+.I pathname
+is relative, then it is interpreted relative to the directory
+referred to by the file descriptor
+.I dirfd
+(rather than relative to the current working directory of
+the calling process, as is done by
+.BR mkdir ()
+for a relative pathname).
+.PP
+If
+.I pathname
+is relative and
+.I dirfd
+is the special value
+.BR AT_FDCWD ,
+then
+.I pathname
+is interpreted relative to the current working
+directory of the calling process (like
+.BR mkdir ()).
+.PP
+If
+.I pathname
+is absolute, then
+.I dirfd
+is ignored.
+.PP
+See
+.BR openat (2)
+for an explanation of the need for
+.BR mkdirat ().
+.SH RETURN VALUE
+.BR mkdir ()
+and
+.BR mkdirat ()
+return zero on success.
+On error, \-1 is returned and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EACCES
+The parent directory does not allow write permission to the process,
+or one of the directories in
+.I pathname
+did not allow search permission.
+(See also
+.BR path_resolution (7).)
+.TP
+.B EBADF
+.RB ( mkdirat ())
+.I pathname
+is relative but
+.I dirfd
+is neither
+.B AT_FDCWD
+nor a valid file descriptor.
+.TP
+.B EDQUOT
+The user's quota of disk blocks or inodes on the filesystem has been
+exhausted.
+.TP
+.B EEXIST
+.I pathname
+already exists (not necessarily as a directory).
+This includes the case where
+.I pathname
+is a symbolic link, dangling or not.
+.TP
+.B EFAULT
+.IR pathname " points outside your accessible address space."
+.TP
+.B EINVAL
+The final component ("basename") of the new directory's
+.I pathname
+is invalid
+(e.g., it contains characters not permitted by the underlying filesystem).
+.TP
+.B ELOOP
+Too many symbolic links were encountered in resolving
+.IR pathname .
+.TP
+.B EMLINK
+The number of links to the parent directory would exceed
+.BR LINK_MAX .
+.TP
+.B ENAMETOOLONG
+.IR pathname " was too long."
+.TP
+.B ENOENT
+A directory component in
+.I pathname
+does not exist or is a dangling symbolic link.
+.TP
+.B ENOMEM
+Insufficient kernel memory was available.
+.TP
+.B ENOSPC
+The device containing
+.I pathname
+has no room for the new directory.
+.TP
+.B ENOSPC
+The new directory cannot be created because the user's disk quota is
+exhausted.
+.TP
+.B ENOTDIR
+A component used as a directory in
+.I pathname
+is not, in fact, a directory.
+.TP
+.B ENOTDIR
+.RB ( mkdirat ())
+.I pathname
+is relative and
+.I dirfd
+is a file descriptor referring to a file other than a directory.
+.TP
+.B EPERM
+The filesystem containing
+.I pathname
+does not support the creation of directories.
+.TP
+.B EROFS
+.I pathname
+refers to a file on a read-only filesystem.
+.SH VERSIONS
+Under Linux, apart from the permission bits, the
+.B S_ISVTX
+.I mode
+bit is also honored.
+.SS glibc notes
+On older kernels where
+.BR mkdirat ()
+is unavailable, the glibc wrapper function falls back to the use of
+.BR mkdir ().
+When
+.I pathname
+is a relative pathname,
+glibc constructs a pathname based on the symbolic link in
+.I /proc/self/fd
+that corresponds to the
+.I dirfd
+argument.
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+.TP
+.BR mkdir ()
+SVr4, BSD, POSIX.1-2001.
+.\" SVr4 documents additional EIO, EMULTIHOP
+.TP
+.BR mkdirat ()
+Linux 2.6.16,
+glibc 2.4.
+.SH NOTES
+There are many infelicities in the protocol underlying NFS.
+Some of these affect
+.BR mkdir ().
+.SH SEE ALSO
+.BR mkdir (1),
+.BR chmod (2),
+.BR chown (2),
+.BR mknod (2),
+.BR mount (2),
+.BR rmdir (2),
+.BR stat (2),
+.BR umask (2),
+.BR unlink (2),
+.BR acl (5),
+.BR path_resolution (7)
diff --git a/man2/mkdirat.2 b/man2/mkdirat.2
new file mode 100644
index 0000000..467b98a
--- /dev/null
+++ b/man2/mkdirat.2
@@ -0,0 +1 @@
+.so man2/mkdir.2
diff --git a/man2/mknod.2 b/man2/mknod.2
new file mode 100644
index 0000000..0925aea
--- /dev/null
+++ b/man2/mknod.2
@@ -0,0 +1,302 @@
+.\" This manpage is Copyright (C) 1992 Drew Eckhardt;
+.\" and Copyright (C) 1993 Michael Haardt
+.\" and Copyright (C) 1993,1994 Ian Jackson
+.\" and Copyright (C) 2006, 2014, Michael Kerrisk
+.\"
+.\" SPDX-License-Identifier: GPL-1.0-or-later
+.\"
+.\" Modified 1996-08-18 by urs
+.\" Modified 2003-04-23 by Michael Kerrisk
+.\" Modified 2004-06-23 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.TH mknod 2 2023-03-31 "Linux man-pages 6.05.01"
+.SH NAME
+mknod, mknodat \- create a special or ordinary file
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/stat.h>
+.PP
+.BI "int mknod(const char *" pathname ", mode_t " mode ", dev_t " dev );
+.PP
+.BR "#include <fcntl.h> " "/* Definition of AT_* constants */"
+.B #include <sys/stat.h>
+.PP
+.BI "int mknodat(int " dirfd ", const char *" pathname ", mode_t " mode \
+", dev_t " dev );
+.fi
+.PP
+.RS -4
+Feature Test Macro Requirements for glibc (see
+.BR feature_test_macros (7)):
+.RE
+.PP
+.BR mknod ():
+.nf
+ _XOPEN_SOURCE >= 500
+.\" || _XOPEN_SOURCE && _XOPEN_SOURCE_EXTENDED
+ || /* Since glibc 2.19: */ _DEFAULT_SOURCE
+ || /* glibc <= 2.19: */ _BSD_SOURCE || _SVID_SOURCE
+.fi
+.SH DESCRIPTION
+The system call
+.BR mknod ()
+creates a filesystem node (file, device special file, or
+named pipe) named
+.IR pathname ,
+with attributes specified by
+.I mode
+and
+.IR dev .
+.PP
+The
+.I mode
+argument specifies both the file mode to use and the type of node
+to be created.
+It should be a combination (using bitwise OR) of one of the file types
+listed below and zero or more of the file mode bits listed in
+.BR inode (7).
+.PP
+The file mode is modified by the process's
+.I umask
+in the usual way: in the absence of a default ACL, the permissions of the
+created node are
+.RI ( mode " & \[ti]" umask ).
+.PP
+The file type must be one of
+.BR S_IFREG ,
+.BR S_IFCHR ,
+.BR S_IFBLK ,
+.BR S_IFIFO ,
+or
+.B S_IFSOCK
+.\" (S_IFSOCK since Linux 1.2.4)
+to specify a regular file (which will be created empty), character
+special file, block special file, FIFO (named pipe), or UNIX domain socket,
+respectively.
+(Zero file type is equivalent to type
+.BR S_IFREG .)
+.PP
+If the file type is
+.B S_IFCHR
+or
+.BR S_IFBLK ,
+then
+.I dev
+specifies the major and minor numbers of the newly created device
+special file
+.RB ( makedev (3)
+may be useful to build the value for
+.IR dev );
+otherwise it is ignored.
+.PP
+If
+.I pathname
+already exists, or is a symbolic link, this call fails with an
+.B EEXIST
+error.
+.PP
+The newly created node will be owned by the effective user ID of the
+process.
+If the directory containing the node has the set-group-ID
+bit set, or if the filesystem is mounted with BSD group semantics, the
+new node will inherit the group ownership from its parent directory;
+otherwise it will be owned by the effective group ID of the process.
+.\"
+.\"
+.SS mknodat()
+The
+.BR mknodat ()
+system call operates in exactly the same way as
+.BR mknod (),
+except for the differences described here.
+.PP
+If the pathname given in
+.I pathname
+is relative, then it is interpreted relative to the directory
+referred to by the file descriptor
+.I dirfd
+(rather than relative to the current working directory of
+the calling process, as is done by
+.BR mknod ()
+for a relative pathname).
+.PP
+If
+.I pathname
+is relative and
+.I dirfd
+is the special value
+.BR AT_FDCWD ,
+then
+.I pathname
+is interpreted relative to the current working
+directory of the calling process (like
+.BR mknod ()).
+.PP
+If
+.I pathname
+is absolute, then
+.I dirfd
+is ignored.
+.PP
+See
+.BR openat (2)
+for an explanation of the need for
+.BR mknodat ().
+.SH RETURN VALUE
+.BR mknod ()
+and
+.BR mknodat ()
+return zero on success.
+On error, \-1 is returned and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EACCES
+The parent directory does not allow write permission to the process,
+or one of the directories in the path prefix of
+.I pathname
+did not allow search permission.
+(See also
+.BR path_resolution (7).)
+.TP
+.B EBADF
+.RB ( mknodat ())
+.I pathname
+is relative but
+.I dirfd
+is neither
+.B AT_FDCWD
+nor a valid file descriptor.
+.TP
+.B EDQUOT
+The user's quota of disk blocks or inodes on the filesystem has been
+exhausted.
+.TP
+.B EEXIST
+.I pathname
+already exists.
+This includes the case where
+.I pathname
+is a symbolic link, dangling or not.
+.TP
+.B EFAULT
+.IR pathname " points outside your accessible address space."
+.TP
+.B EINVAL
+.I mode
+requested creation of something other than a regular file, device
+special file, FIFO or socket.
+.TP
+.B ELOOP
+Too many symbolic links were encountered in resolving
+.IR pathname .
+.TP
+.B ENAMETOOLONG
+.IR pathname " was too long."
+.TP
+.B ENOENT
+A directory component in
+.I pathname
+does not exist or is a dangling symbolic link.
+.TP
+.B ENOMEM
+Insufficient kernel memory was available.
+.TP
+.B ENOSPC
+The device containing
+.I pathname
+has no room for the new node.
+.TP
+.B ENOTDIR
+A component used as a directory in
+.I pathname
+is not, in fact, a directory.
+.TP
+.B ENOTDIR
+.RB ( mknodat ())
+.I pathname
+is relative and
+.I dirfd
+is a file descriptor referring to a file other than a directory.
+.TP
+.B EPERM
+.I mode
+requested creation of something other than a regular file,
+FIFO (named pipe), or UNIX domain socket, and the caller
+is not privileged (Linux: does not have the
+.B CAP_MKNOD
+capability);
+.\" For UNIX domain sockets and regular files, EPERM is returned only in
+.\" Linux 2.2 and earlier; in Linux 2.4 and later, unprivileged can
+.\" use mknod() to make these files.
+also returned if the filesystem containing
+.I pathname
+does not support the type of node requested.
+.TP
+.B EROFS
+.I pathname
+refers to a file on a read-only filesystem.
+.SH VERSIONS
+POSIX.1-2001 says: "The only portable use of
+.BR mknod ()
+is to create a FIFO-special file.
+If
+.I mode
+is not
+.B S_IFIFO
+or
+.I dev
+is not 0, the behavior of
+.BR mknod ()
+is unspecified."
+However, nowadays one should never use
+.BR mknod ()
+for this purpose; one should use
+.BR mkfifo (3),
+a function especially defined for this purpose.
+.PP
+Under Linux,
+.BR mknod ()
+cannot be used to create directories.
+One should make directories with
+.BR mkdir (2).
+.\" and one should make UNIX domain sockets with socket(2) and bind(2).
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+.TP
+.BR mknod ()
+SVr4, 4.4BSD, POSIX.1-2001 (but see VERSIONS).
+.\" The Linux version differs from the SVr4 version in that it
+.\" does not require root permission to create pipes, also in that no
+.\" EMULTIHOP, ENOLINK, or EINTR error is documented.
+.TP
+.BR mknodat ()
+Linux 2.6.16,
+glibc 2.4.
+POSIX.1-2008.
+.SH NOTES
+There are many infelicities in the protocol underlying NFS.
+Some of these affect
+.BR mknod ()
+and
+.BR mknodat ().
+.SH SEE ALSO
+.BR mknod (1),
+.BR chmod (2),
+.BR chown (2),
+.BR fcntl (2),
+.BR mkdir (2),
+.BR mount (2),
+.BR socket (2),
+.BR stat (2),
+.BR umask (2),
+.BR unlink (2),
+.BR makedev (3),
+.BR mkfifo (3),
+.BR acl (5),
+.BR path_resolution (7)
diff --git a/man2/mknodat.2 b/man2/mknodat.2
new file mode 100644
index 0000000..3db2282
--- /dev/null
+++ b/man2/mknodat.2
@@ -0,0 +1 @@
+.so man2/mknod.2
diff --git a/man2/mlock.2 b/man2/mlock.2
new file mode 100644
index 0000000..1efe3dd
--- /dev/null
+++ b/man2/mlock.2
@@ -0,0 +1,507 @@
+.\" Copyright (C) Michael Kerrisk, 2004
+.\" using some material drawn from earlier man pages
+.\" written by Thomas Kuhn, Copyright 1996
+.\"
+.\" SPDX-License-Identifier: GPL-2.0-or-later
+.\"
+.TH mlock 2 2023-04-08 "Linux man-pages 6.05.01"
+.SH NAME
+mlock, mlock2, munlock, mlockall, munlockall \- lock and unlock memory
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/mman.h>
+.PP
+.BI "int mlock(const void " addr [. len "], size_t " len );
+.BI "int mlock2(const void " addr [. len "], size_t " len ", \
+unsigned int " flags );
+.BI "int munlock(const void " addr [. len "], size_t " len );
+.PP
+.BI "int mlockall(int " flags );
+.B int munlockall(void);
+.fi
+.SH DESCRIPTION
+.BR mlock (),
+.BR mlock2 (),
+and
+.BR mlockall ()
+lock part or all of the calling process's virtual address
+space into RAM, preventing that memory from being paged to the
+swap area.
+.PP
+.BR munlock ()
+and
+.BR munlockall ()
+perform the converse operation,
+unlocking part or all of the calling process's virtual address space,
+so that pages in the specified virtual address range
+can be swapped out again if required by the kernel memory manager.
+.PP
+Memory locking and unlocking are performed in units of whole pages.
+.SS mlock(), mlock2(), and munlock()
+.BR mlock ()
+locks pages in the address range starting at
+.I addr
+and continuing for
+.I len
+bytes.
+All pages that contain a part of the specified address range are
+guaranteed to be resident in RAM when the call returns successfully;
+the pages are guaranteed to stay in RAM until later unlocked.
+.PP
+.BR mlock2 ()
+.\" commit a8ca5d0ecbdde5cc3d7accacbd69968b0c98764e
+.\" commit de60f5f10c58d4f34b68622442c0e04180367f3f
+.\" commit b0f205c2a3082dd9081f9a94e50658c5fa906ff1
+also locks pages in the specified range starting at
+.I addr
+and continuing for
+.I len
+bytes.
+However, the state of the pages contained in that range after the call
+returns successfully will depend on the value in the
+.I flags
+argument.
+.PP
+The
+.I flags
+argument can be either 0 or the following constant:
+.TP
+.B MLOCK_ONFAULT
+Lock pages that are currently resident and mark the entire range so
+that the remaining nonresident pages are locked when they are populated
+by a page fault.
+.PP
+If
+.I flags
+is 0,
+.BR mlock2 ()
+behaves exactly the same as
+.BR mlock ().
+.PP
+.BR munlock ()
+unlocks pages in the address range starting at
+.I addr
+and continuing for
+.I len
+bytes.
+After this call, all pages that contain a part of the specified
+memory range can be moved to external swap space again by the kernel.
+.SS mlockall() and munlockall()
+.BR mlockall ()
+locks all pages mapped into the address space of the
+calling process.
+This includes the pages of the code, data, and stack
+segment, as well as shared libraries, user space kernel data, shared
+memory, and memory-mapped files.
+All mapped pages are guaranteed
+to be resident in RAM when the call returns successfully;
+the pages are guaranteed to stay in RAM until later unlocked.
+.PP
+The
+.I flags
+argument is constructed as the bitwise OR of one or more of the
+following constants:
+.TP
+.B MCL_CURRENT
+Lock all pages which are currently mapped into the address space of
+the process.
+.TP
+.B MCL_FUTURE
+Lock all pages which will become mapped into the address space of the
+process in the future.
+These could be, for instance, new pages required
+by a growing heap and stack as well as new memory-mapped files or
+shared memory regions.
+.TP
+.BR MCL_ONFAULT " (since Linux 4.4)"
+Used together with
+.BR MCL_CURRENT ,
+.BR MCL_FUTURE ,
+or both.
+Mark all current (with
+.BR MCL_CURRENT )
+or future (with
+.BR MCL_FUTURE )
+mappings to lock pages when they are faulted in.
+When used with
+.BR MCL_CURRENT ,
+all present pages are locked, but
+.BR mlockall ()
+will not fault in non-present pages.
+When used with
+.BR MCL_FUTURE ,
+all future mappings will be marked to lock pages when they are faulted
+in, but they will not be populated by the lock when the mapping is
+created.
+.B MCL_ONFAULT
+must be used with either
+.B MCL_CURRENT
+or
+.B MCL_FUTURE
+or both.
+.PP
+If
+.B MCL_FUTURE
+has been specified, then a later system call (e.g.,
+.BR mmap (2),
+.BR sbrk (2),
+.BR malloc (3)),
+may fail if it would cause the number of locked bytes to exceed
+the permitted maximum (see below).
+In the same circumstances, stack growth may likewise fail:
+the kernel will deny stack expansion and deliver a
+.B SIGSEGV
+signal to the process.
+.PP
+.BR munlockall ()
+unlocks all pages mapped into the address space of the
+calling process.
+.SH RETURN VALUE
+On success, these system calls return 0.
+On error, \-1 is returned,
+.I errno
+is set to indicate the error,
+and no changes are made to any locks in the
+address space of the process.
+.SH ERRORS
+.\"SVr4 documents an additional EAGAIN error code.
+.TP
+.B EAGAIN
+.RB ( mlock (),
+.BR mlock2 (),
+and
+.BR munlock ())
+Some or all of the specified address range could not be locked.
+.TP
+.B EINVAL
+.RB ( mlock (),
+.BR mlock2 (),
+and
+.BR munlock ())
+The result of the addition
+.IR addr + len
+was less than
+.I addr
+(e.g., the addition may have resulted in an overflow).
+.TP
+.B EINVAL
+.RB ( mlock2 ())
+Unknown \fIflags\fP were specified.
+.TP
+.B EINVAL
+.RB ( mlockall ())
+Unknown \fIflags\fP were specified or
+.B MCL_ONFAULT
+was specified without either
+.B MCL_FUTURE
+or
+.BR MCL_CURRENT .
+.TP
+.B EINVAL
+(Not on Linux)
+.I addr
+was not a multiple of the page size.
+.TP
+.B ENOMEM
+.RB ( mlock (),
+.BR mlock2 (),
+and
+.BR munlock ())
+Some of the specified address range does not correspond to mapped
+pages in the address space of the process.
+.TP
+.B ENOMEM
+.RB ( mlock (),
+.BR mlock2 (),
+and
+.BR munlock ())
+Locking or unlocking a region would result in the total number of
+mappings with distinct attributes (e.g., locked versus unlocked)
+exceeding the allowed maximum.
+.\" I.e., the number of VMAs would exceed the 64kB maximum
+(For example, unlocking a range in the middle of a currently locked
+mapping would result in three mappings:
+two locked mappings at each end and an unlocked mapping in the middle.)
+.TP
+.B ENOMEM
+(Linux 2.6.9 and later) the caller had a nonzero
+.B RLIMIT_MEMLOCK
+soft resource limit, but tried to lock more memory than the limit
+permitted.
+This limit is not enforced if the process is privileged
+.RB ( CAP_IPC_LOCK ).
+.TP
+.B ENOMEM
+(Linux 2.4 and earlier) the calling process tried to lock more than
+half of RAM.
+.\" In the case of mlock(), this check is somewhat buggy: it doesn't
+.\" take into account whether the to-be-locked range overlaps with
+.\" already locked pages. Thus, suppose we allocate
+.\" (num_physpages / 4 + 1) of memory, and lock those pages once using
+.\" mlock(), and then lock the *same* page range a second time.
+.\" In the case, the second mlock() call will fail, since the check
+.\" calculates that the process is trying to lock (num_physpages / 2 + 2)
+.\" pages, which of course is not true. (MTK, Nov 04, kernel 2.4.28)
+.TP
+.B EPERM
+The caller is not privileged, but needs privilege
+.RB ( CAP_IPC_LOCK )
+to perform the requested operation.
+.TP
+.B EPERM
+.RB ( munlockall ())
+(Linux 2.6.8 and earlier) The caller was not privileged
+.RB ( CAP_IPC_LOCK ).
+.SH VERSIONS
+.SS Linux
+Under Linux,
+.BR mlock (),
+.BR mlock2 (),
+and
+.BR munlock ()
+automatically round
+.I addr
+down to the nearest page boundary.
+However, the POSIX.1 specification of
+.BR mlock ()
+and
+.BR munlock ()
+allows an implementation to require that
+.I addr
+is page aligned, so portable applications should ensure this.
+.PP
+The
+.I VmLck
+field of the Linux-specific
+.IR /proc/ pid /status
+file shows how many kilobytes of memory the process with ID
+.I PID
+has locked using
+.BR mlock (),
+.BR mlock2 (),
+.BR mlockall (),
+and
+.BR mmap (2)
+.BR MAP_LOCKED .
+.SH STANDARDS
+.TP
+.BR mlock ()
+.TQ
+.BR munlock ()
+.TQ
+.BR mlockall ()
+.TQ
+.BR munlockall ()
+POSIX.1-2008.
+.TP
+.BR mlock2 ()
+Linux.
+.PP
+On POSIX systems on which
+.BR mlock ()
+and
+.BR munlock ()
+are available,
+.B _POSIX_MEMLOCK_RANGE
+is defined in \fI<unistd.h>\fP and the number of bytes in a page
+can be determined from the constant
+.B PAGESIZE
+(if defined) in \fI<limits.h>\fP or by calling
+.IR sysconf(_SC_PAGESIZE) .
+.PP
+On POSIX systems on which
+.BR mlockall ()
+and
+.BR munlockall ()
+are available,
+.B _POSIX_MEMLOCK
+is defined in \fI<unistd.h>\fP to a value greater than 0.
+(See also
+.BR sysconf (3).)
+.\" POSIX.1-2001: It shall be defined to -1 or 0 or 200112L.
+.\" -1: unavailable, 0: ask using sysconf().
+.\" glibc defines it to 1.
+.SH HISTORY
+.TP
+.BR mlock ()
+.TQ
+.BR munlock ()
+.TQ
+.BR mlockall ()
+.TQ
+.BR munlockall ()
+POSIX.1-2001, POSIX.1-2008, SVr4.
+.TP
+.BR mlock2 ()
+Linux 4.4,
+glibc 2.27.
+.SH NOTES
+Memory locking has two main applications: real-time algorithms and
+high-security data processing.
+Real-time applications require
+deterministic timing, and, like scheduling, paging is one major cause
+of unexpected program execution delays.
+Real-time applications will
+usually also switch to a real-time scheduler with
+.BR sched_setscheduler (2).
+Cryptographic security software often handles critical bytes like
+passwords or secret keys as data structures.
+As a result of paging,
+these secrets could be transferred onto a persistent swap store medium,
+where they might be accessible to the enemy long after the security
+software has erased the secrets in RAM and terminated.
+(But be aware that the suspend mode on laptops and some desktop
+computers will save a copy of the system's RAM to disk, regardless
+of memory locks.)
+.PP
+Real-time processes that are using
+.BR mlockall ()
+to prevent delays on page faults should reserve enough
+locked stack pages before entering the time-critical section,
+so that no page fault can be caused by function calls.
+This can be achieved by calling a function that allocates a
+sufficiently large automatic variable (an array) and writes to the
+memory occupied by this array in order to touch these stack pages.
+This way, enough pages will be mapped for the stack and can be
+locked into RAM.
+The dummy writes ensure that not even copy-on-write
+page faults can occur in the critical section.
+.PP
+Memory locks are not inherited by a child created via
+.BR fork (2)
+and are automatically removed (unlocked) during an
+.BR execve (2)
+or when the process terminates.
+The
+.BR mlockall ()
+.B MCL_FUTURE
+and
+.B MCL_FUTURE | MCL_ONFAULT
+settings are not inherited by a child created via
+.BR fork (2)
+and are cleared during an
+.BR execve (2).
+.PP
+Note that
+.BR fork (2)
+will prepare the address space for a copy-on-write operation.
+The consequence is that any write access that follows will cause
+a page fault that in turn may cause high latencies for a real-time process.
+Therefore, it is crucial not to invoke
+.BR fork (2)
+after an
+.BR mlockall ()
+or
+.BR mlock ()
+operation\[em]not even from a thread which runs at a low priority within
+a process which also has a thread running at elevated priority.
+.PP
+The memory lock on an address range is automatically removed
+if the address range is unmapped via
+.BR munmap (2).
+.PP
+Memory locks do not stack, that is, pages which have been locked several times
+by calls to
+.BR mlock (),
+.BR mlock2 (),
+or
+.BR mlockall ()
+will be unlocked by a single call to
+.BR munlock ()
+for the corresponding range or by
+.BR munlockall ().
+Pages which are mapped to several locations or by several processes stay
+locked into RAM as long as they are locked at least at one location or by
+at least one process.
+.PP
+If a call to
+.BR mlockall ()
+which uses the
+.B MCL_FUTURE
+flag is followed by another call that does not specify this flag, the
+changes made by the
+.B MCL_FUTURE
+call will be lost.
+.PP
+The
+.BR mlock2 ()
+.B MLOCK_ONFAULT
+flag and the
+.BR mlockall ()
+.B MCL_ONFAULT
+flag allow efficient memory locking for applications that deal with
+large mappings where only a (small) portion of pages in the mapping are touched.
+In such cases, locking all of the pages in a mapping would incur
+a significant penalty for memory locking.
+.SS Limits and permissions
+In Linux 2.6.8 and earlier,
+a process must be privileged
+.RB ( CAP_IPC_LOCK )
+in order to lock memory and the
+.B RLIMIT_MEMLOCK
+soft resource limit defines a limit on how much memory the process may lock.
+.PP
+Since Linux 2.6.9, no limits are placed on the amount of memory
+that a privileged process can lock and the
+.B RLIMIT_MEMLOCK
+soft resource limit instead defines a limit on how much memory an
+unprivileged process may lock.
+.SH BUGS
+In Linux 4.8 and earlier,
+a bug in the kernel's accounting of locked memory for unprivileged processes
+(i.e., without
+.BR CAP_IPC_LOCK )
+meant that if the region specified by
+.I addr
+and
+.I len
+overlapped an existing lock,
+then the already locked bytes in the overlapping region were counted twice
+when checking against the limit.
+Such double accounting could incorrectly calculate a "total locked memory"
+value for the process that exceeded the
+.B RLIMIT_MEMLOCK
+limit, with the result that
+.BR mlock ()
+and
+.BR mlock2 ()
+would fail on requests that should have succeeded.
+This bug was fixed
+.\" commit 0cf2f6f6dc605e587d2c1120f295934c77e810e8
+in Linux 4.9.
+.PP
+In the Linux 2.4 series of kernels up to and including Linux 2.4.17,
+a bug caused the
+.BR mlockall ()
+.B MCL_FUTURE
+flag to be inherited across a
+.BR fork (2).
+This was rectified in Linux 2.4.18.
+.PP
+Since Linux 2.6.9, if a privileged process calls
+.I mlockall(MCL_FUTURE)
+and later drops privileges (loses the
+.B CAP_IPC_LOCK
+capability by, for example,
+setting its effective UID to a nonzero value),
+then subsequent memory allocations (e.g.,
+.BR mmap (2),
+.BR brk (2))
+will fail if the
+.B RLIMIT_MEMLOCK
+resource limit is encountered.
+.\" See the following LKML thread:
+.\" http://marc.theaimsgroup.com/?l=linux-kernel&m=113801392825023&w=2
+.\" "Rationale for RLIMIT_MEMLOCK"
+.\" 23 Jan 2006
+.SH SEE ALSO
+.BR mincore (2),
+.BR mmap (2),
+.BR setrlimit (2),
+.BR shmctl (2),
+.BR sysconf (3),
+.BR proc (5),
+.BR capabilities (7)
diff --git a/man2/mlock2.2 b/man2/mlock2.2
new file mode 100644
index 0000000..5e5b3c7
--- /dev/null
+++ b/man2/mlock2.2
@@ -0,0 +1 @@
+.so man2/mlock.2
diff --git a/man2/mlockall.2 b/man2/mlockall.2
new file mode 100644
index 0000000..5e5b3c7
--- /dev/null
+++ b/man2/mlockall.2
@@ -0,0 +1 @@
+.so man2/mlock.2
diff --git a/man2/mmap.2 b/man2/mmap.2
new file mode 100644
index 0000000..3d9a887
--- /dev/null
+++ b/man2/mmap.2
@@ -0,0 +1,1035 @@
+'\" t
+.\" Copyright (C) 1996 Andries Brouwer <aeb@cwi.nl>
+.\" and Copyright (C) 2006, 2007 Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified 1997-01-31 by Eric S. Raymond <esr@thyrsus.com>
+.\" Modified 2000-03-25 by Jim Van Zandt <jrv@vanzandt.mv.com>
+.\" Modified 2001-10-04 by John Levon <moz@compsoc.man.ac.uk>
+.\" Modified 2003-02-02 by Andi Kleen <ak@muc.de>
+.\" Modified 2003-05-21 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\" MAP_LOCKED works from Linux 2.5.37
+.\" Modified 2004-06-17 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Modified 2004-09-11 by aeb
+.\" Modified 2004-12-08, from Eric Estievenart <eric.estievenart@free.fr>
+.\" Modified 2004-12-08, mtk, formatting tidy-ups
+.\" Modified 2006-12-04, mtk, various parts rewritten
+.\" 2007-07-10, mtk, Added an example program.
+.\" 2008-11-18, mtk, document MAP_STACK
+.\"
+.TH mmap 2 2023-07-20 "Linux man-pages 6.05.01"
+.SH NAME
+mmap, munmap \- map or unmap files or devices into memory
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/mman.h>
+.PP
+.BI "void *mmap(void " addr [. length "], size_t " length \
+", int " prot ", int " flags ,
+.BI " int " fd ", off_t " offset );
+.BI "int munmap(void " addr [. length "], size_t " length );
+.fi
+.PP
+See NOTES for information on feature test macro requirements.
+.SH DESCRIPTION
+.BR mmap ()
+creates a new mapping in the virtual address space of
+the calling process.
+The starting address for the new mapping is specified in
+.IR addr .
+The
+.I length
+argument specifies the length of the mapping (which must be greater than 0).
+.PP
+If
+.I addr
+is NULL,
+then the kernel chooses the (page-aligned) address
+at which to create the mapping;
+this is the most portable method of creating a new mapping.
+If
+.I addr
+is not NULL,
+then the kernel takes it as a hint about where to place the mapping;
+on Linux, the kernel will pick a nearby page boundary (but always above
+or equal to the value specified by
+.IR /proc/sys/vm/mmap_min_addr )
+and attempt to create the mapping there.
+If another mapping already exists there, the kernel picks a new address that
+may or may not depend on the hint.
+.\" Before Linux 2.6.24, the address was rounded up to the next page
+.\" boundary; since Linux 2.6.24, it is rounded down!
+The address of the new mapping is returned as the result of the call.
+.PP
+The contents of a file mapping (as opposed to an anonymous mapping; see
+.B MAP_ANONYMOUS
+below), are initialized using
+.I length
+bytes starting at offset
+.I offset
+in the file (or other object) referred to by the file descriptor
+.IR fd .
+.I offset
+must be a multiple of the page size as returned by
+.IR sysconf(_SC_PAGE_SIZE) .
+.PP
+After the
+.BR mmap ()
+call has returned, the file descriptor,
+.IR fd ,
+can be closed immediately without invalidating the mapping.
+.PP
+The
+.I prot
+argument describes the desired memory protection of the mapping
+(and must not conflict with the open mode of the file).
+It is either
+.B PROT_NONE
+or the bitwise OR of one or more of the following flags:
+.TP 1.1i
+.B PROT_EXEC
+Pages may be executed.
+.TP
+.B PROT_READ
+Pages may be read.
+.TP
+.B PROT_WRITE
+Pages may be written.
+.TP
+.B PROT_NONE
+Pages may not be accessed.
+.\"
+.SS The flags argument
+The
+.I flags
+argument determines whether updates to the mapping
+are visible to other processes mapping the same region,
+and whether updates are carried through to the underlying file.
+This behavior is determined by including exactly one
+of the following values in
+.IR flags :
+.TP
+.B MAP_SHARED
+Share this mapping.
+Updates to the mapping are visible to other processes mapping the same region,
+and (in the case of file-backed mappings)
+are carried through to the underlying file.
+(To precisely control when updates are carried through
+to the underlying file requires the use of
+.BR msync (2).)
+.TP
+.BR MAP_SHARED_VALIDATE " (since Linux 4.15)"
+This flag provides the same behavior as
+.B MAP_SHARED
+except that
+.B MAP_SHARED
+mappings ignore unknown flags in
+.IR flags .
+By contrast, when creating a mapping using
+.BR MAP_SHARED_VALIDATE ,
+the kernel verifies all passed flags are known and fails the
+mapping with the error
+.B EOPNOTSUPP
+for unknown flags.
+This mapping type is also required to be able to use some mapping flags
+(e.g.,
+.BR MAP_SYNC ).
+.TP
+.B MAP_PRIVATE
+Create a private copy-on-write mapping.
+Updates to the mapping are not visible to other processes
+mapping the same file, and are not carried through to
+the underlying file.
+It is unspecified whether changes made to the file after the
+.BR mmap ()
+call are visible in the mapped region.
+.PP
+Both
+.B MAP_SHARED
+and
+.B MAP_PRIVATE
+are described in POSIX.1-2001 and POSIX.1-2008.
+.B MAP_SHARED_VALIDATE
+is a Linux extension.
+.PP
+In addition, zero or more of the following values can be ORed in
+.IR flags :
+.TP
+.BR MAP_32BIT " (since Linux 2.4.20, 2.6)"
+Put the mapping into the first 2 Gigabytes of the process address space.
+This flag is supported only on x86-64, for 64-bit programs.
+It was added to allow thread stacks to be allocated somewhere
+in the first 2\ GB of memory,
+so as to improve context-switch performance on some early
+64-bit processors.
+.\" See http://lwn.net/Articles/294642 "Tangled up in threads", 19 Aug 08
+Modern x86-64 processors no longer have this performance problem,
+so use of this flag is not required on those systems.
+The
+.B MAP_32BIT
+flag is ignored when
+.B MAP_FIXED
+is set.
+.TP
+.B MAP_ANON
+Synonym for
+.BR MAP_ANONYMOUS ;
+provided for compatibility with other implementations.
+.TP
+.B MAP_ANONYMOUS
+The mapping is not backed by any file;
+its contents are initialized to zero.
+The
+.I fd
+argument is ignored;
+however, some implementations require
+.I fd
+to be \-1 if
+.B MAP_ANONYMOUS
+(or
+.BR MAP_ANON )
+is specified,
+and portable applications should ensure this.
+The
+.I offset
+argument should be zero.
+.\" See the pgoff overflow check in do_mmap().
+.\" See the offset check in sys_mmap in arch/x86/kernel/sys_x86_64.c.
+Support for
+.B MAP_ANONYMOUS
+in conjunction with
+.B MAP_SHARED
+was added in Linux 2.4.
+.TP
+.B MAP_DENYWRITE
+This flag is ignored.
+.\" Introduced in 1.1.36, removed in 1.3.24.
+(Long ago\[em]Linux 2.0 and earlier\[em]it signaled
+that attempts to write to the underlying file should fail with
+.BR ETXTBSY .
+But this was a source of denial-of-service attacks.)
+.TP
+.B MAP_EXECUTABLE
+This flag is ignored.
+.\" Introduced in 1.1.38, removed in 1.3.24. Flag tested in proc_follow_link.
+.\" (Long ago, it signaled that the underlying file is an executable.
+.\" However, that information was not really used anywhere.)
+.\" Linus talked about DOS related to MAP_EXECUTABLE, but he was thinking of
+.\" MAP_DENYWRITE?
+.TP
+.B MAP_FILE
+Compatibility flag.
+Ignored.
+.\" On some systems, this was required as the opposite of
+.\" MAP_ANONYMOUS -- mtk, 1 May 2007
+.TP
+.B MAP_FIXED
+Don't interpret
+.I addr
+as a hint: place the mapping at exactly that address.
+.I addr
+must be suitably aligned: for most architectures a multiple of the page
+size is sufficient; however, some architectures may impose additional
+restrictions.
+If the memory region specified by
+.I addr
+and
+.I length
+overlaps pages of any existing mapping(s), then the overlapped
+part of the existing mapping(s) will be discarded.
+If the specified address cannot be used,
+.BR mmap ()
+will fail.
+.IP
+Software that aspires to be portable should use the
+.B MAP_FIXED
+flag with care,
+keeping in mind that the exact layout of a process's memory mappings
+is allowed to change significantly between Linux versions,
+C library versions, and operating system releases.
+.I Carefully read the discussion of this flag in NOTES!
+.TP
+.BR MAP_FIXED_NOREPLACE " (since Linux 4.17)"
+.\" commit a4ff8e8620d3f4f50ac4b41e8067b7d395056843
+This flag provides behavior that is similar to
+.B MAP_FIXED
+with respect to the
+.I addr
+enforcement, but differs in that
+.B MAP_FIXED_NOREPLACE
+never clobbers a preexisting mapped range.
+If the requested range would collide with an existing mapping,
+then this call fails with the error
+.BR EEXIST .
+This flag can therefore be used as a way to atomically
+(with respect to other threads) attempt to map an address range:
+one thread will succeed; all others will report failure.
+.IP
+Note that older kernels which do not recognize the
+.B MAP_FIXED_NOREPLACE
+flag will typically (upon detecting a collision with a preexisting mapping)
+fall back to a
+.RB \[lq]non- MAP_FIXED \[rq]
+type of behavior:
+they will return an address that is different from the requested address.
+Therefore, backward-compatible software
+should check the returned address against the requested address.
+.TP
+.B MAP_GROWSDOWN
+This flag is used for stacks.
+It indicates to the kernel virtual memory system that the mapping
+should extend downward in memory.
+The return address is one page lower than the memory area that is
+actually created in the process's virtual address space.
+Touching an address in the "guard" page below the mapping will cause
+the mapping to grow by a page.
+This growth can be repeated until the mapping grows to within a
+page of the high end of the next lower mapping,
+at which point touching the "guard" page will result in a
+.B SIGSEGV
+signal.
+.TP
+.BR MAP_HUGETLB " (since Linux 2.6.32)"
+Allocate the mapping using "huge" pages.
+See the Linux kernel source file
+.I Documentation/admin\-guide/mm/hugetlbpage.rst
+for further information, as well as NOTES, below.
+.TP
+.BR MAP_HUGE_2MB ", " MAP_HUGE_1GB " (since Linux 3.8)"
+.\" See https://lwn.net/Articles/533499/
+Used in conjunction with
+.B MAP_HUGETLB
+to select alternative hugetlb page sizes (respectively, 2\ MB and 1\ GB)
+on systems that support multiple hugetlb page sizes.
+.IP
+More generally, the desired huge page size can be configured by encoding
+the base-2 logarithm of the desired page size in the six bits at the offset
+.BR MAP_HUGE_SHIFT .
+(A value of zero in this bit field provides the default huge page size;
+the default huge page size can be discovered via the
+.I Hugepagesize
+field exposed by
+.IR /proc/meminfo .)
+Thus, the above two constants are defined as:
+.IP
+.in +4n
+.EX
+#define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT)
+#define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT)
+.EE
+.in
+.IP
+The range of huge page sizes that are supported by the system
+can be discovered by listing the subdirectories in
+.IR /sys/kernel/mm/hugepages .
+.TP
+.BR MAP_LOCKED " (since Linux 2.5.37)"
+Mark the mapped region to be locked in the same way as
+.BR mlock (2).
+This implementation will try to populate (prefault) the whole range but the
+.BR mmap ()
+call doesn't fail with
+.B ENOMEM
+if this fails.
+Therefore major faults might happen later on.
+So the semantics are not as strong as those of
+.BR mlock (2).
+One should use
+.BR mmap ()
+plus
+.BR mlock (2)
+when major faults are not acceptable after the initialization of the mapping.
+The
+.B MAP_LOCKED
+flag is ignored in older kernels.
+.\" If set, the mapped pages will not be swapped out.
+.TP
+.BR MAP_NONBLOCK " (since Linux 2.5.46)"
+This flag is meaningful only in conjunction with
+.BR MAP_POPULATE .
+Don't perform read-ahead:
+create page-table entries only for pages
+that are already present in RAM.
+Since Linux 2.6.23,
+.\" commit 54cb8821de07f2ffcd28c380ce9b93d5784b40d7
+this flag causes
+.B MAP_POPULATE
+to do nothing.
+One day, the combination of
+.B MAP_POPULATE
+and
+.B MAP_NONBLOCK
+may be reimplemented.
+.TP
+.B MAP_NORESERVE
+Do not reserve swap space for this mapping.
+When swap space is reserved, one has the guarantee
+that it is possible to modify the mapping.
+When swap space is not reserved one might get
+.B SIGSEGV
+upon a write
+if no physical memory is available.
+See also the discussion of the file
+.I /proc/sys/vm/overcommit_memory
+in
+.BR proc (5).
+Before Linux 2.6, this flag had effect only for
+private writable mappings.
+.TP
+.BR MAP_POPULATE " (since Linux 2.5.46)"
+Populate (prefault) page tables for a mapping.
+For a file mapping, this causes read-ahead on the file.
+This will help to reduce blocking on page faults later.
+The
+.BR mmap ()
+call doesn't fail if the mapping cannot be populated (for example, due
+to limitations on the number of mapped huge pages when using
+.BR MAP_HUGETLB ).
+Support for
+.B MAP_POPULATE
+in conjunction with private mappings was added in Linux 2.6.23.
+.TP
+.BR MAP_STACK " (since Linux 2.6.27)"
+Allocate the mapping at an address suitable for a process
+or thread stack.
+.IP
+This flag is currently a no-op on Linux.
+However, by employing this flag, applications can ensure that
+they transparently obtain support if the flag
+is implemented in the future.
+Thus, it is used in the glibc threading implementation to allow for
+the fact that some architectures may (later) require special treatment
+for stack allocations.
+.\" See http://lwn.net/Articles/294642 "Tangled up in threads", 19 Aug 08
+.\" commit cd98a04a59e2f94fa64d5bf1e26498d27427d5e7
+.\" http://thread.gmane.org/gmane.linux.kernel/720412
+.\" "pthread_create() slow for many threads; also time to revisit 64b
+.\" context switch optimization?"
+A further reason to employ this flag is portability:
+.B MAP_STACK
+exists (and has an effect) on some other systems (e.g., some of the BSDs).
+.TP
+.BR MAP_SYNC " (since Linux 4.15)"
+This flag is available only with the
+.B MAP_SHARED_VALIDATE
+mapping type;
+mappings of type
+.B MAP_SHARED
+will silently ignore this flag.
+This flag is supported only for files supporting DAX
+(direct mapping of persistent memory).
+For other files, creating a mapping with this flag results in an
+.B EOPNOTSUPP
+error.
+.IP
+Shared file mappings with this flag provide the guarantee that while
+some memory is mapped writable in the address space of the process,
+it will be visible in the same file at the same offset even after
+the system crashes or is rebooted.
+In conjunction with the use of appropriate CPU instructions,
+this provides users of such mappings with a more efficient way
+of making data modifications persistent.
+.TP
+.BR MAP_UNINITIALIZED " (since Linux 2.6.33)"
+Don't clear anonymous pages.
+This flag is intended to improve performance on embedded devices.
+This flag is honored only if the kernel was configured with the
+.B CONFIG_MMAP_ALLOW_UNINITIALIZED
+option.
+Because of the security implications,
+that option is normally enabled only on embedded devices
+(i.e., devices where one has complete control of the contents of user memory).
+.PP
+Of the above flags, only
+.B MAP_FIXED
+is specified in POSIX.1-2001 and POSIX.1-2008.
+However, most systems also support
+.B MAP_ANONYMOUS
+(or its synonym
+.BR MAP_ANON ).
+.\" FIXME . for later review when Issue 8 is one day released...
+.\" POSIX may add MAP_ANON in the future
+.\" http://austingroupbugs.net/tag_view_page.php?tag_id=8
+.\" http://austingroupbugs.net/view.php?id=850
+.SS munmap()
+The
+.BR munmap ()
+system call deletes the mappings for the specified address range, and
+causes further references to addresses within the range to generate
+invalid memory references.
+The region is also automatically unmapped
+when the process is terminated.
+On the other hand, closing the file
+descriptor does not unmap the region.
+.PP
+The address
+.I addr
+must be a multiple of the page size (but
+.I length
+need not be).
+All pages containing a part
+of the indicated range are unmapped, and subsequent references
+to these pages will generate
+.BR SIGSEGV .
+It is not an error if the
+indicated range does not contain any mapped pages.
+.SH RETURN VALUE
+On success,
+.BR mmap ()
+returns a pointer to the mapped area.
+On error, the value
+.B MAP_FAILED
+(that is,
+.IR "(void\ *)\ \-1" )
+is returned, and
+.I errno
+is set to indicate the error.
+.PP
+On success,
+.BR munmap ()
+returns 0.
+On failure, it returns \-1, and
+.I errno
+is set to indicate the error (probably to
+.BR EINVAL ).
+.SH ERRORS
+.TP
+.B EACCES
+A file descriptor refers to a non-regular file.
+Or a file mapping was requested, but
+.I fd
+is not open for reading.
+Or
+.B MAP_SHARED
+was requested and
+.B PROT_WRITE
+is set, but
+.I fd
+is not open in read/write
+.RB ( O_RDWR )
+mode.
+Or
+.B PROT_WRITE
+is set, but the file is append-only.
+.TP
+.B EAGAIN
+The file has been locked, or too much memory has been locked (see
+.BR setrlimit (2)).
+.TP
+.B EBADF
+.I fd
+is not a valid file descriptor (and
+.B MAP_ANONYMOUS
+was not set).
+.TP
+.B EEXIST
+.B MAP_FIXED_NOREPLACE
+was specified in
+.IR flags ,
+and the range covered by
+.I addr
+and
+.I length
+clashes with an existing mapping.
+.TP
+.B EINVAL
+We don't like
+.IR addr ,
+.IR length ,
+or
+.I offset
+(e.g., they are too large, or not aligned on a page boundary).
+.TP
+.B EINVAL
+(since Linux 2.6.12)
+.I length
+was 0.
+.TP
+.B EINVAL
+.I flags
+contained none of
+.BR MAP_PRIVATE ,
+.BR MAP_SHARED ,
+or
+.BR MAP_SHARED_VALIDATE .
+.TP
+.B ENFILE
+.\" This is for shared anonymous segments
+.\" [2.6.7] shmem_zero_setup()-->shmem_file_setup()-->get_empty_filp()
+The system-wide limit on the total number of open files has been reached.
+.\" .TP
+.\" .B ENOEXEC
+.\" A file could not be mapped for reading.
+.TP
+.B ENODEV
+The underlying filesystem of the specified file does not support
+memory mapping.
+.TP
+.B ENOMEM
+No memory is available.
+.TP
+.B ENOMEM
+The process's maximum number of mappings would have been exceeded.
+This error can also occur for
+.BR munmap (),
+when unmapping a region in the middle of an existing mapping,
+since this results in two smaller mappings on either side of
+the region being unmapped.
+.TP
+.B ENOMEM
+(since Linux 4.7)
+The process's
+.B RLIMIT_DATA
+limit, described in
+.BR getrlimit (2),
+would have been exceeded.
+.TP
+.B ENOMEM
+We don't like
+.IR addr ,
+because it exceeds the virtual address space of the CPU.
+.TP
+.B EOVERFLOW
+On a 32-bit architecture together with the large file extension
+(i.e., using 64-bit
+.IR off_t ):
+the number of pages used for
+.I length
+plus number of pages used for
+.I offset
+would overflow
+.I "unsigned long"
+(32 bits).
+.TP
+.B EPERM
+The
+.I prot
+argument asks for
+.B PROT_EXEC
+but the mapped area belongs to a file on a filesystem that
+was mounted no-exec.
+.\" (Since Linux 2.4.25 / Linux 2.6.0.)
+.TP
+.B EPERM
+The operation was prevented by a file seal; see
+.BR fcntl (2).
+.TP
+.B EPERM
+The
+.B MAP_HUGETLB
+flag was specified, but the caller was not privileged (did not have the
+.B CAP_IPC_LOCK
+capability)
+and is not a member of the
+.I sysctl_hugetlb_shm_group
+group; see the description of
+.I /proc/sys/vm/sysctl_hugetlb_shm_group
+in \fBproc\fP(5).
+.TP
+.B ETXTBSY
+.B MAP_DENYWRITE
+was set but the object specified by
+.I fd
+is open for writing.
+.PP
+Use of a mapped region can result in these signals:
+.TP
+.B SIGSEGV
+Attempted write into a region mapped as read-only.
+.TP
+.B SIGBUS
+Attempted access to a page of the buffer that lies beyond the
+end of the mapped file.
+For an explanation of the treatment of the bytes in the page that
+corresponds to the end of a mapped file that is not a multiple
+of the page size, see NOTES.
+.SH ATTRIBUTES
+For an explanation of the terms used in this section, see
+.BR attributes (7).
+.TS
+allbox;
+lbx lb lb
+l l l.
+Interface Attribute Value
+T{
+.na
+.nh
+.BR mmap (),
+.BR munmap ()
+T} Thread safety MT-Safe
+.TE
+.sp 1
+.SH VERSIONS
+On some hardware architectures (e.g., i386),
+.B PROT_WRITE
+implies
+.BR PROT_READ .
+It is architecture dependent whether
+.B PROT_READ
+implies
+.B PROT_EXEC
+or not.
+Portable programs should always set
+.B PROT_EXEC
+if they intend to execute code in the new mapping.
+.PP
+The portable way to create a mapping is to specify
+.I addr
+as 0 (NULL), and omit
+.B MAP_FIXED
+from
+.IR flags .
+In this case, the system chooses the address for the mapping;
+the address is chosen so as not to conflict with any existing mapping,
+and will not be 0.
+If the
+.B MAP_FIXED
+flag is specified, and
+.I addr
+is 0 (NULL), then the mapped address will be 0 (NULL).
+.PP
+Certain
+.I flags
+constants are defined only if suitable feature test macros are defined
+(possibly by default):
+.B _DEFAULT_SOURCE
+with glibc 2.19 or later;
+or
+.B _BSD_SOURCE
+or
+.B _SVID_SOURCE
+in glibc 2.19 and earlier.
+(Employing
+.B _GNU_SOURCE
+also suffices,
+and requiring that macro specifically would have been more logical,
+since these flags are all Linux-specific.)
+The relevant flags are:
+.BR MAP_32BIT ,
+.B MAP_ANONYMOUS
+(and the synonym
+.BR MAP_ANON ),
+.BR MAP_DENYWRITE ,
+.BR MAP_EXECUTABLE ,
+.BR MAP_FILE ,
+.BR MAP_GROWSDOWN ,
+.BR MAP_HUGETLB ,
+.BR MAP_LOCKED ,
+.BR MAP_NONBLOCK ,
+.BR MAP_NORESERVE ,
+.BR MAP_POPULATE ,
+and
+.BR MAP_STACK .
+.SS C library/kernel differences
+This page describes the interface provided by the glibc
+.BR mmap ()
+wrapper function.
+Originally, this function invoked a system call of the same name.
+Since Linux 2.4, that system call has been superseded by
+.BR mmap2 (2),
+and nowadays
+.\" Since around glibc 2.1/2.2, depending on the platform.
+the glibc
+.BR mmap ()
+wrapper function invokes
+.BR mmap2 (2)
+with a suitably adjusted value for
+.IR offset .
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+POSIX.1-2001, SVr4, 4.4BSD.
+.\" SVr4 documents additional error codes ENXIO and ENODEV.
+.\" SUSv2 documents additional error codes EMFILE and EOVERFLOW.
+.PP
+On POSIX systems on which
+.BR mmap (),
+.BR msync (2),
+and
+.BR munmap ()
+are available,
+.B _POSIX_MAPPED_FILES
+is defined in \fI<unistd.h>\fP to a value greater than 0.
+(See also
+.BR sysconf (3).)
+.\" POSIX.1-2001: It shall be defined to -1 or 0 or 200112L.
+.\" -1: unavailable, 0: ask using sysconf().
+.\" glibc defines it to 1.
+.SH NOTES
+Memory mapped by
+.BR mmap ()
+is preserved across
+.BR fork (2),
+with the same attributes.
+.PP
+A file is mapped in multiples of the page size.
+For a file that is not
+a multiple of the page size,
+the remaining bytes in the partial page at the end of the mapping
+are zeroed when mapped,
+and modifications to that region are not written out to the file.
+The effect of
+changing the size of the underlying file of a mapping on the pages that
+correspond to added or removed regions of the file is unspecified.
+.PP
+An application can determine which pages of a mapping are
+currently resident in the buffer/page cache using
+.BR mincore (2).
+.\"
+.SS Using MAP_FIXED safely
+The only safe use for
+.B MAP_FIXED
+is where the address range specified by
+.I addr
+and
+.I length
+was previously reserved using another mapping;
+otherwise, the use of
+.B MAP_FIXED
+is hazardous because it forcibly removes preexisting mappings,
+making it easy for a multithreaded process to corrupt its own address space.
+.PP
+For example, suppose that thread A looks through
+.IR /proc/ pid /maps
+in order to locate an unused address range that it can map using
+.BR MAP_FIXED ,
+while thread B simultaneously acquires part or all of that same
+address range.
+When thread A subsequently employs
+.BR mmap(MAP_FIXED) ,
+it will effectively clobber the mapping that thread B created.
+In this scenario,
+thread B need not create a mapping directly; simply making a library call
+that, internally, uses
+.BR dlopen (3)
+to load some other shared library, will suffice.
+The
+.BR dlopen (3)
+call will map the library into the process's address space.
+Furthermore, almost any library call may be implemented in a way that
+adds memory mappings to the address space, either with this technique,
+or by simply allocating memory.
+Examples include
+.BR brk (2),
+.BR malloc (3),
+.BR pthread_create (3),
+and the PAM libraries
+.UR http://www.linux-pam.org
+.UE .
+.PP
+Since Linux 4.17, a multithreaded program can use the
+.B MAP_FIXED_NOREPLACE
+flag to avoid the hazard described above
+when attempting to create a mapping at a fixed address
+that has not been reserved by a preexisting mapping.
+.\"
+.SS Timestamps changes for file-backed mappings
+For file-backed mappings, the
+.I st_atime
+field for the mapped file may be updated at any time between the
+.BR mmap ()
+and the corresponding unmapping; the first reference to a mapped
+page will update the field if it has not been already.
+.PP
+The
+.I st_ctime
+and
+.I st_mtime
+fields for a file mapped with
+.B PROT_WRITE
+and
+.B MAP_SHARED
+will be updated after
+a write to the mapped region, and before a subsequent
+.BR msync (2)
+with the
+.B MS_SYNC
+or
+.B MS_ASYNC
+flag, if one occurs.
+.\"
+.SS Huge page (Huge TLB) mappings
+For mappings that employ huge pages, the requirements for the arguments of
+.BR mmap ()
+and
+.BR munmap ()
+differ somewhat from the requirements for mappings
+that use the native system page size.
+.PP
+For
+.BR mmap (),
+.I offset
+must be a multiple of the underlying huge page size.
+The system automatically aligns
+.I length
+to be a multiple of the underlying huge page size.
+.PP
+For
+.BR munmap (),
+.I addr
+and
+.I length
+must both be a multiple of the underlying huge page size.
+.\"
+.SH BUGS
+On Linux, there are no guarantees like those suggested above under
+.BR MAP_NORESERVE .
+By default, any process can be killed
+at any moment when the system runs out of memory.
+.PP
+Before Linux 2.6.7, the
+.B MAP_POPULATE
+flag had effect only if
+.I prot
+is specified as
+.BR PROT_NONE .
+.PP
+SUSv3 specifies that
+.BR mmap ()
+should fail if
+.I length
+is 0.
+However, before Linux 2.6.12,
+.BR mmap ()
+succeeded in this case: no mapping was created and the call returned
+.IR addr .
+Since Linux 2.6.12,
+.BR mmap ()
+fails with the error
+.B EINVAL
+for this case.
+.PP
+POSIX specifies that the system shall always
+zero fill any partial page at the end
+of the object and that the system will never write any modification of the
+object beyond its end.
+On Linux, when you write data to such a partial page after the end
+of the object, the data stays in the page cache even after the file
+is closed and unmapped
+and even though the data is never written to the file itself,
+subsequent mappings may see the modified content.
+In some cases, this could be fixed by calling
+.BR msync (2)
+before the unmap takes place;
+however, this doesn't work on
+.BR tmpfs (5)
+(for example, when using the POSIX shared memory interface documented in
+.BR shm_overview (7)).
+.SH EXAMPLES
+.\" FIXME . Add an example here that uses an anonymous shared region for
+.\" IPC between parent and child.
+The following program prints part of the file specified in
+its first command-line argument to standard output.
+The range of bytes to be printed is specified via offset and length
+values in the second and third command-line arguments.
+The program creates a memory mapping of the required
+pages of the file and then uses
+.BR write (2)
+to output the desired bytes.
+.SS Program source
+.\" SRC BEGIN (mmap.c)
+.EX
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <unistd.h>
+\&
+#define handle_error(msg) \e
+ do { perror(msg); exit(EXIT_FAILURE); } while (0)
+\&
+int
+main(int argc, char *argv[])
+{
+ int fd;
+ char *addr;
+ off_t offset, pa_offset;
+ size_t length;
+ ssize_t s;
+ struct stat sb;
+\&
+ if (argc < 3 || argc > 4) {
+ fprintf(stderr, "%s file offset [length]\en", argv[0]);
+ exit(EXIT_FAILURE);
+ }
+\&
+ fd = open(argv[1], O_RDONLY);
+ if (fd == \-1)
+ handle_error("open");
+\&
+ if (fstat(fd, &sb) == \-1) /* To obtain file size */
+ handle_error("fstat");
+\&
+ offset = atoi(argv[2]);
+ pa_offset = offset & \[ti](sysconf(_SC_PAGE_SIZE) \- 1);
+ /* offset for mmap() must be page aligned */
+\&
+ if (offset >= sb.st_size) {
+ fprintf(stderr, "offset is past end of file\en");
+ exit(EXIT_FAILURE);
+ }
+\&
+ if (argc == 4) {
+ length = atoi(argv[3]);
+ if (offset + length > sb.st_size)
+ length = sb.st_size \- offset;
+ /* Can\[aq]t display bytes past end of file */
+\&
+ } else { /* No length arg ==> display to end of file */
+ length = sb.st_size \- offset;
+ }
+\&
+ addr = mmap(NULL, length + offset \- pa_offset, PROT_READ,
+ MAP_PRIVATE, fd, pa_offset);
+ if (addr == MAP_FAILED)
+ handle_error("mmap");
+\&
+ s = write(STDOUT_FILENO, addr + offset \- pa_offset, length);
+ if (s != length) {
+ if (s == \-1)
+ handle_error("write");
+\&
+ fprintf(stderr, "partial write");
+ exit(EXIT_FAILURE);
+ }
+\&
+ munmap(addr, length + offset \- pa_offset);
+ close(fd);
+\&
+ exit(EXIT_SUCCESS);
+}
+.EE
+.\" SRC END
+.SH SEE ALSO
+.BR ftruncate (2),
+.BR getpagesize (2),
+.BR memfd_create (2),
+.BR mincore (2),
+.BR mlock (2),
+.BR mmap2 (2),
+.BR mprotect (2),
+.BR mremap (2),
+.BR msync (2),
+.BR remap_file_pages (2),
+.BR setrlimit (2),
+.BR shmat (2),
+.BR userfaultfd (2),
+.BR shm_open (3),
+.BR shm_overview (7)
+.PP
+The descriptions of the following files in
+.BR proc (5):
+.IR /proc/ pid /maps ,
+.IR /proc/ pid /map_files ,
+and
+.IR /proc/ pid /smaps .
+.PP
+B.O. Gallmeister, POSIX.4, O'Reilly, pp. 128\[en]129 and 389\[en]391.
+.\"
+.\" Repeat after me: private read-only mappings are 100% equivalent to
+.\" shared read-only mappings. No ifs, buts, or maybes. -- Linus
diff --git a/man2/mmap2.2 b/man2/mmap2.2
new file mode 100644
index 0000000..e1704e3
--- /dev/null
+++ b/man2/mmap2.2
@@ -0,0 +1,85 @@
+.\" Copyright (C) 2002, Michael Kerrisk
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified 31 Jan 2002, Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Added description of mmap2
+.\" Modified, 2004-11-25, mtk -- removed stray #endif in prototype
+.\"
+.TH mmap2 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+mmap2 \- map files or devices into memory
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#include <sys/mman.h>" " /* Definition of " MAP_* " and " PROT_* " constants */"
+.BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */"
+.B #include <unistd.h>
+.PP
+.BI "void *syscall(SYS_mmap2, unsigned long " addr ", unsigned long " length ,
+.BI " unsigned long " prot ", unsigned long " flags ,
+.BI " unsigned long " fd ", unsigned long " pgoffset );
+.fi
+.SH DESCRIPTION
+This is probably not the system call that you are interested in; instead, see
+.BR mmap (2),
+which describes the glibc wrapper function that invokes this system call.
+.PP
+The
+.BR mmap2 ()
+system call provides the same interface as
+.BR mmap (2),
+except that the final argument specifies the offset into the
+file in 4096-byte units (instead of bytes, as is done by
+.BR mmap (2)).
+This enables applications that use a 32-bit
+.I off_t
+to map large files (up to 2\[ha]44 bytes).
+.SH RETURN VALUE
+On success,
+.BR mmap2 ()
+returns a pointer to the mapped area.
+On error, \-1 is returned and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EFAULT
+Problem with getting the data from user space.
+.TP
+.B EINVAL
+(Various platforms where the page size is not 4096 bytes.)
+.I "pgoffset\ *\ 4096"
+is not a multiple of the system page size.
+.PP
+.BR mmap2 ()
+can also return any of the errors described in
+.BR mmap (2).
+.SH VERSIONS
+On architectures where this system call is present,
+the glibc
+.BR mmap ()
+wrapper function invokes this system call rather than the
+.BR mmap (2)
+system call.
+.PP
+This system call does not exist on x86-64.
+.PP
+On ia64, the unit for
+.I pgoffset
+is actually the system page size, rather than 4096 bytes.
+.\" ia64 can have page sizes ranging from 4 kB to 64 kB.
+.\" On cris, it looks like the unit might also be the page size,
+.\" which is 8192 bytes. -- mtk, June 2007
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 2.3.31.
+.SH SEE ALSO
+.BR getpagesize (2),
+.BR mmap (2),
+.BR mremap (2),
+.BR msync (2),
+.BR shm_open (3)
diff --git a/man2/modify_ldt.2 b/man2/modify_ldt.2
new file mode 100644
index 0000000..0364289
--- /dev/null
+++ b/man2/modify_ldt.2
@@ -0,0 +1,196 @@
+.\" Copyright (c) 1995 Michael Chastain (mec@duracef.shout.net), 22 July 1995.
+.\" Copyright (c) 2015 Andrew Lutomirski
+.\"
+.\" SPDX-License-Identifier: GPL-2.0-or-later
+.\"
+.TH modify_ldt 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+modify_ldt \- get or set a per-process LDT entry
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#include <asm/ldt.h>" " /* Definition of " "struct user_desc" " */"
+.BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */"
+.B #include <unistd.h>
+.PP
+.BI "int syscall(SYS_modify_ldt, int " func ", void " ptr [. bytecount ],
+.BI " unsigned long " bytecount );
+.fi
+.PP
+.IR Note :
+glibc provides no wrapper for
+.BR modify_ldt (),
+necessitating the use of
+.BR syscall (2).
+.SH DESCRIPTION
+.BR modify_ldt ()
+reads or writes the local descriptor table (LDT) for a process.
+The LDT
+is an array of segment descriptors that can be referenced by user code.
+Linux allows processes to configure a per-process (actually per-mm) LDT.
+For more information about the LDT, see the Intel Software Developer's
+Manual or the AMD Architecture Programming Manual.
+.PP
+When
+.I func
+is 0,
+.BR modify_ldt ()
+reads the LDT into the memory pointed to by
+.IR ptr .
+The number of bytes read is the smaller of
+.I bytecount
+and the actual size of the LDT, although the kernel may act as though
+the LDT is padded with additional trailing zero bytes.
+On success,
+.BR modify_ldt ()
+will return the number of bytes read.
+.PP
+When
+.I func
+is 1 or 0x11,
+.BR modify_ldt ()
+modifies the LDT entry indicated by
+.IR ptr\->entry_number .
+.I ptr
+points to a
+.I user_desc
+structure
+and
+.I bytecount
+must equal the size of this structure.
+.PP
+The
+.I user_desc
+structure is defined in \fI<asm/ldt.h>\fP as:
+.PP
+.in +4n
+.EX
+struct user_desc {
+ unsigned int entry_number;
+ unsigned int base_addr;
+ unsigned int limit;
+ unsigned int seg_32bit:1;
+ unsigned int contents:2;
+ unsigned int read_exec_only:1;
+ unsigned int limit_in_pages:1;
+ unsigned int seg_not_present:1;
+ unsigned int useable:1;
+};
+.EE
+.in
+.PP
+In Linux 2.4 and earlier, this structure was named
+.IR modify_ldt_ldt_s .
+.PP
+The
+.I contents
+field is the segment type (data, expand-down data, non-conforming code, or
+conforming code).
+The other fields match their descriptions in the CPU manual, although
+.BR modify_ldt ()
+cannot set the hardware-defined "accessed" bit described in the CPU manual.
+.PP
+A
+.I user_desc
+is considered "empty" if
+.I read_exec_only
+and
+.I seg_not_present
+are set to 1 and all of the other fields are 0.
+An LDT entry can be cleared by setting it to an "empty"
+.I user_desc
+or, if
+.I func
+is 1, by setting both
+.I base_addr
+and
+.I limit
+to 0.
+.PP
+A conforming code segment (i.e., one with
+.IR contents==3 )
+will be rejected if
+.I
+func
+is 1 or if
+.I seg_not_present
+is 0.
+.PP
+When
+.I func
+is 2,
+.BR modify_ldt ()
+will read zeros.
+This appears to be a leftover from Linux 2.4.
+.SH RETURN VALUE
+On success,
+.BR modify_ldt ()
+returns either the actual number of bytes read (for reading)
+or 0 (for writing).
+On failure,
+.BR modify_ldt ()
+returns \-1 and sets
+.I errno
+to indicate the error.
+.SH ERRORS
+.TP
+.B EFAULT
+.I ptr
+points outside the address space.
+.TP
+.B EINVAL
+.I ptr
+is 0,
+or
+.I func
+is 1 and
+.I bytecount
+is not equal to the size of the structure
+.IR user_desc ,
+or
+.I func
+is 1 or 0x11 and the new LDT entry has invalid values.
+.TP
+.B ENOSYS
+.I func
+is neither 0, 1, 2, nor 0x11.
+.SH STANDARDS
+Linux.
+.SH NOTES
+.BR modify_ldt ()
+should not be used for thread-local storage, as it slows down context
+switches and only supports a limited number of threads.
+Threading libraries should use
+.BR set_thread_area (2)
+or
+.BR arch_prctl (2)
+instead, except on extremely old kernels that do not support those system
+calls.
+.PP
+The normal use for
+.BR modify_ldt ()
+is to run legacy 16-bit or segmented 32-bit code.
+Not all kernels allow 16-bit segments to be installed, however.
+.PP
+Even on 64-bit kernels,
+.BR modify_ldt ()
+cannot be used to create a long mode (i.e., 64-bit) code segment.
+The undocumented field "lm" in
+.I user_desc
+is not useful, and, despite its name,
+does not result in a long mode segment.
+.SH BUGS
+On 64-bit kernels before Linux 3.19,
+.\" commit e30ab185c490e9a9381385529e0fd32f0a399495
+setting the "lm" bit in
+.I user_desc
+prevents the descriptor from being considered empty.
+Keep in mind that the
+"lm" bit does not exist in the 32-bit headers, but these buggy kernels
+will still notice the bit even when set in a 32-bit process.
+.SH SEE ALSO
+.BR arch_prctl (2),
+.BR set_thread_area (2),
+.BR vm86 (2)
diff --git a/man2/mount.2 b/man2/mount.2
new file mode 100644
index 0000000..916b68c
--- /dev/null
+++ b/man2/mount.2
@@ -0,0 +1,971 @@
+.\" Copyright (C) 1993 Rickard E. Faith <faith@cs.unc.edu>
+.\" and Copyright (C) 1994 Andries E. Brouwer <aeb@cwi.nl>
+.\" and Copyright (C) 2002, 2005, 2016 Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified 1996-11-04 by Eric S. Raymond <esr@thyrsus.com>
+.\" Modified 2001-10-13 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Added note on historical behavior of MS_NOSUID
+.\" Modified 2002-05-16 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Extensive changes and additions
+.\" Modified 2002-05-27 by aeb
+.\" Modified 2002-06-11 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Enhanced descriptions of MS_MOVE, MS_BIND, and MS_REMOUNT
+.\" Modified 2004-06-17 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\" 2005-05-18, mtk, Added MNT_EXPIRE, plus a few other tidy-ups.
+.\" 2008-10-06, mtk: move umount*() material into separate umount.2 page.
+.\" 2008-10-06, mtk: Add discussion of namespaces.
+.\"
+.TH mount 2 2023-04-03 "Linux man-pages 6.05.01"
+.SH NAME
+mount \- mount filesystem
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B "#include <sys/mount.h>"
+.PP
+.BI "int mount(const char *" source ", const char *" target ,
+.BI " const char *" filesystemtype ", unsigned long " mountflags ,
+.BI " const void *_Nullable " data );
+.fi
+.SH DESCRIPTION
+.BR mount ()
+attaches the filesystem specified by
+.I source
+(which is often a pathname referring to a device,
+but can also be the pathname of a directory or file,
+or a dummy string) to the location (a directory or file)
+specified by the pathname in
+.IR target .
+.PP
+Appropriate privilege (Linux: the
+.B CAP_SYS_ADMIN
+capability) is required to mount filesystems.
+.PP
+Values for the
+.I filesystemtype
+argument supported by the kernel are listed in
+.I /proc/filesystems
+(e.g., "btrfs", "ext4", "jfs", "xfs", "vfat", "fuse",
+"tmpfs", "cgroup", "proc", "mqueue", "nfs", "cifs", "iso9660").
+Further types may become available when the appropriate modules
+are loaded.
+.PP
+The
+.I data
+argument is interpreted by the different filesystems.
+Typically it is a string of comma-separated options
+understood by this filesystem.
+See
+.BR mount (8)
+for details of the options available for each filesystem type.
+This argument may be specified as NULL, if there are no options.
+.PP
+A call to
+.BR mount ()
+performs one of a number of general types of operation,
+depending on the bits specified in
+.IR mountflags .
+The choice of which operation to perform is determined by
+testing the bits set in
+.IR mountflags ,
+with the tests being conducted in the order listed here:
+.IP \[bu] 3
+Remount an existing mount:
+.I mountflags
+includes
+.BR MS_REMOUNT .
+.IP \[bu]
+Create a bind mount:
+.I mountflags
+includes
+.BR MS_BIND .
+.IP \[bu]
+Change the propagation type of an existing mount:
+.I mountflags
+includes one of
+.BR MS_SHARED ,
+.BR MS_PRIVATE ,
+.BR MS_SLAVE ,
+or
+.BR MS_UNBINDABLE .
+.IP \[bu]
+Move an existing mount to a new location:
+.I mountflags
+includes
+.BR MS_MOVE .
+.IP \[bu]
+Create a new mount:
+.I mountflags
+includes none of the above flags.
+.PP
+Each of these operations is detailed later in this page.
+Further flags may be specified in
+.I mountflags
+to modify the behavior of
+.BR mount (),
+as described below.
+.\"
+.SS Additional mount flags
+The list below describes the additional flags that can be specified in
+.IR mountflags .
+Note that some operation types ignore some or all of these flags,
+as described later in this page.
+.\"
+.\" FIXME 2.6.25 Added MS_I_VERSION, which needs to be documented.
+.\" commit 7a224228ed79d587ece2304869000aad1b8e97dd
+.\" (This is a per-superblock flag)
+.\"
+.TP
+.BR MS_DIRSYNC " (since Linux 2.5.19)"
+Make directory changes on this filesystem synchronous.
+(This property can be obtained for individual directories
+or subtrees using
+.BR chattr (1).)
+.TP
+.BR MS_LAZYTIME " (since Linux 4.0)"
+.\" commit 0ae45f63d4ef8d8eeec49c7d8b44a1775fff13e8
+.\" commit fe032c422c5ba562ba9c2d316f55e258e03259c6
+.\" commit a26f49926da938f47561f386be56a83dd37a496d
+Reduce on-disk updates of inode timestamps (atime, mtime, ctime)
+by maintaining these changes only in memory.
+The on-disk timestamps are updated only when:
+.RS
+.IP \[bu] 3
+the inode needs to be updated for some change unrelated to file timestamps;
+.IP \[bu]
+the application employs
+.BR fsync (2),
+.BR syncfs (2),
+or
+.BR sync (2);
+.IP \[bu]
+an undeleted inode is evicted from memory; or
+.IP \[bu]
+more than 24 hours have passed since the inode was written to disk.
+.RE
+.IP
+This mount option significantly reduces writes
+needed to update the inode's timestamps, especially mtime and atime.
+However, in the event of a system crash, the atime and mtime fields
+on disk might be out of date by up to 24 hours.
+.IP
+Examples of workloads where this option could be of significant benefit
+include frequent random writes to preallocated files,
+as well as cases where the
+.B MS_STRICTATIME
+mount option is also enabled.
+(The advantage of combining
+.B MS_STRICTATIME
+and
+.B MS_LAZYTIME
+is that
+.BR stat (2)
+will return the correctly updated atime, but the atime updates
+will be flushed to disk only in the cases listed above.)
+.TP
+.B MS_MANDLOCK
+Permit mandatory locking on files in this filesystem.
+(Mandatory locking must still be enabled on a per-file basis,
+as described in
+.BR fcntl (2).)
+Since Linux 4.5,
+.\" commit 95ace75414f312f9a7b93d873f386987b92a5301
+this mount option requires the
+.B CAP_SYS_ADMIN
+capability and a kernel configured with the
+.B CONFIG_MANDATORY_FILE_LOCKING
+option.
+Mandatory locking has been fully deprecated in Linux 5.15, so
+this flag should be considered deprecated.
+.TP
+.B MS_NOATIME
+Do not update access times for (all types of) files on this filesystem.
+.TP
+.B MS_NODEV
+Do not allow access to devices (special files) on this filesystem.
+.TP
+.B MS_NODIRATIME
+Do not update access times for directories on this filesystem.
+This flag provides a subset of the functionality provided by
+.BR MS_NOATIME ;
+that is,
+.B MS_NOATIME
+implies
+.BR MS_NODIRATIME .
+.TP
+.B MS_NOEXEC
+Do not allow programs to be executed from this filesystem.
+.\" (Possibly useful for a filesystem that contains non-Linux executables.
+.\" Often used as a security feature, e.g., to make sure that restricted
+.\" users cannot execute files uploaded using ftp or so.)
+.TP
+.B MS_NOSUID
+Do not honor set-user-ID and set-group-ID bits or file capabilities
+when executing programs from this filesystem.
+In addition, SELinux domain
+transitions require the permission
+.IR nosuid_transition ,
+which in turn also needs
+the policy capability
+.IR nnp_nosuid_transition .
+.\" (This is a security feature to prevent users executing set-user-ID and
+.\" set-group-ID programs from removable disk devices.)
+.TP
+.B MS_RDONLY
+Mount filesystem read-only.
+.TP
+.BR MS_REC " (since Linux 2.4.11)"
+Used in conjunction with
+.B MS_BIND
+to create a recursive bind mount,
+and in conjunction with the propagation type flags to recursively change
+the propagation type of all of the mounts in a subtree.
+See below for further details.
+.TP
+.BR MS_RELATIME " (since Linux 2.6.20)"
+When a file on this filesystem is accessed,
+update the file's last access time (atime) only if the current value
+of atime is less than or equal to the file's last modification time (mtime)
+or last status change time (ctime).
+This option is useful for programs, such as
+.BR mutt (1),
+that need to know when a file has been read since it was last modified.
+Since Linux 2.6.30, the kernel defaults to the behavior provided
+by this flag (unless
+.B MS_NOATIME
+was specified), and the
+.B MS_STRICTATIME
+flag is required to obtain traditional semantics.
+In addition, since Linux 2.6.30,
+the file's last access time is always updated if it
+is more than 1 day old.
+.\" Matthew Garrett notes in the patch that added this behavior
+.\" that this lets utilities such as tmpreaper (which deletes
+.\" files based on last access time) work correctly.
+.TP
+.BR MS_SILENT " (since Linux 2.6.17)"
+Suppress the display of certain
+.RI ( printk ())
+warning messages in the kernel log.
+This flag supersedes the misnamed and obsolete
+.B MS_VERBOSE
+flag (available since Linux 2.4.12), which has the same meaning.
+.TP
+.BR MS_STRICTATIME " (since Linux 2.6.30)"
+Always update the last access time (atime) when files on this
+filesystem are accessed.
+(This was the default behavior before Linux 2.6.30.)
+Specifying this flag overrides the effect of setting the
+.B MS_NOATIME
+and
+.B MS_RELATIME
+flags.
+.TP
+.B MS_SYNCHRONOUS
+Make writes on this filesystem synchronous (as though
+the
+.B O_SYNC
+flag to
+.BR open (2)
+was specified for all file opens to this filesystem).
+.TP
+.BR MS_NOSYMFOLLOW " (since Linux 5.10)"
+.\" dab741e0e02bd3c4f5e2e97be74b39df2523fc6e
+Do not follow symbolic links when resolving paths.
+Symbolic links can still be created,
+and
+.BR readlink (1),
+.BR readlink (2),
+.BR realpath (1),
+and
+.BR realpath (3)
+all still work properly.
+.PP
+From Linux 2.4 onward, some of the above flags are
+settable on a per-mount basis,
+while others apply to the superblock of the mounted filesystem,
+meaning that all mounts of the same filesystem share those flags.
+(Previously, all of the flags were per-superblock.)
+.PP
+The per-mount-point flags are as follows:
+.IP \[bu] 3
+Since Linux 2.4:
+.BR MS_NODEV ", " MS_NOEXEC ", and " MS_NOSUID
+flags are settable on a per-mount-point basis.
+.IP \[bu]
+Additionally, since Linux 2.6.16:
+.B MS_NOATIME
+and
+.BR MS_NODIRATIME .
+.IP \[bu]
+Additionally, since Linux 2.6.20:
+.BR MS_RELATIME .
+.PP
+The following flags are per-superblock:
+.BR MS_DIRSYNC ,
+.BR MS_LAZYTIME ,
+.BR MS_MANDLOCK ,
+.BR MS_SILENT ,
+and
+.BR MS_SYNCHRONOUS .
+.\" And MS_I_VERSION?
+The initial settings of these flags are determined on the first
+mount of the filesystem, and will be shared by all subsequent mounts
+of the same filesystem.
+Subsequently, the settings of the flags can be changed
+via a remount operation (see below).
+Such changes will be visible via all mounts associated
+with the filesystem.
+.PP
+Since Linux 2.6.16,
+.B MS_RDONLY
+can be set or cleared on a per-mount-point basis as well as on
+the underlying filesystem superblock.
+The mounted filesystem will be writable only if neither the filesystem
+nor the mountpoint is flagged as read-only.
+.\"
+.SS Remounting an existing mount
+An existing mount may be remounted by specifying
+.B MS_REMOUNT
+in
+.IR mountflags .
+This allows you to change the
+.I mountflags
+and
+.I data
+of an existing mount without having to unmount and remount the filesystem.
+.I target
+should be the same value specified in the initial
+.BR mount ()
+call.
+.PP
+The
+.I source
+and
+.I filesystemtype
+arguments are ignored.
+.PP
+The
+.I mountflags
+and
+.I data
+arguments should match the values used in the original
+.BR mount ()
+call, except for those parameters that are being deliberately changed.
+.PP
+The following
+.I mountflags
+can be changed:
+.BR MS_LAZYTIME ,
+.\" FIXME
+.\" MS_LAZYTIME seems to be available only on a few filesystems,
+.\" and on ext4, it seems (from experiment) that this flag
+.\" can only be enabled (but not disabled) on a remount.
+.\" The following code in ext4_remount() (kernel 4.17) seems to
+.\" confirm this:
+.\"
+.\" if (*flags & SB_LAZYTIME)
+.\" sb->s_flags |= SB_LAZYTIME;
+.BR MS_MANDLOCK ,
+.BR MS_NOATIME ,
+.BR MS_NODEV ,
+.BR MS_NODIRATIME ,
+.BR MS_NOEXEC ,
+.BR MS_NOSUID ,
+.BR MS_RELATIME ,
+.BR MS_RDONLY ,
+.B MS_STRICTATIME
+(whose effect is to clear the
+.B MS_NOATIME
+and
+.B MS_RELATIME
+flags),
+and
+.BR MS_SYNCHRONOUS .
+Attempts to change the setting of the
+.\" See the definition of MS_RMT_MASK in include/uapi/linux/fs.h,
+.\" which excludes MS_DIRSYNC and MS_SILENT, although SB_DIRSYNC
+.\" and SB_SILENT are split out as per-superblock flags in do_mount()
+.\" (Linux 4.17 source code)
+.B MS_DIRSYNC
+and
+.B MS_SILENT
+flags during a remount are silently ignored.
+Note that changes to per-superblock flags are visible via
+all mounts of the associated filesystem
+(because the per-superblock flags are shared by all mounts).
+.PP
+Since Linux 3.17,
+.\" commit ffbc6f0ead47fa5a1dc9642b0331cb75c20a640e
+if none of
+.BR MS_NOATIME ,
+.BR MS_NODIRATIME ,
+.BR MS_RELATIME ,
+or
+.B MS_STRICTATIME
+is specified in
+.IR mountflags ,
+then the remount operation preserves the existing values of these flags
+(rather than defaulting to
+.BR MS_RELATIME ).
+.PP
+Since Linux 2.6.26, the
+.B MS_REMOUNT
+flag can be used with
+.B MS_BIND
+to modify only the per-mount-point flags.
+.\" See https://lwn.net/Articles/281157/
+This is particularly useful for setting or clearing the "read-only"
+flag on a mount without changing the underlying filesystem.
+Specifying
+.I mountflags
+as:
+.PP
+.in +4n
+.EX
+MS_REMOUNT | MS_BIND | MS_RDONLY
+.EE
+.in
+.PP
+will make access through this mountpoint read-only, without affecting
+other mounts.
+.\"
+.SS Creating a bind mount
+If
+.I mountflags
+includes
+.B MS_BIND
+(available since Linux 2.4),
+.\" since Linux 2.4.0-test9
+then perform a bind mount.
+A bind mount makes a file or a directory subtree visible at
+another point within the single directory hierarchy.
+Bind mounts may cross filesystem boundaries and span
+.BR chroot (2)
+jails.
+.PP
+The
+.I filesystemtype
+and
+.I data
+arguments are ignored.
+.PP
+The remaining bits (other than
+.BR MS_REC ,
+described below) in the
+.I mountflags
+argument are also ignored.
+(The bind mount has the same mount options as
+the underlying mount.)
+However, see the discussion of remounting above,
+for a method of making an existing bind mount read-only.
+.PP
+By default, when a directory is bind mounted,
+only that directory is mounted;
+if there are any submounts under the directory tree,
+they are not bind mounted.
+If the
+.B MS_REC
+flag is also specified, then a recursive bind mount operation is performed:
+all submounts under the
+.I source
+subtree (other than unbindable mounts)
+are also bind mounted at the corresponding location in the
+.I target
+subtree.
+.\"
+.SS Changing the propagation type of an existing mount
+If
+.I mountflags
+includes one of
+.BR MS_SHARED ,
+.BR MS_PRIVATE ,
+.BR MS_SLAVE ,
+or
+.B MS_UNBINDABLE
+(all available since Linux 2.6.15),
+then the propagation type of an existing mount is changed.
+If more than one of these flags is specified, an error results.
+.PP
+The only other flags that can be specified while changing
+the propagation type are
+.B MS_REC
+(described below) and
+.B MS_SILENT
+(which is ignored).
+.PP
+The
+.IR source ,
+.IR filesystemtype ,
+and
+.I data
+arguments are ignored.
+.PP
+The meanings of the propagation type flags are as follows:
+.TP
+.B MS_SHARED
+Make this mount shared.
+Mount and unmount events immediately under this mount will propagate
+to the other mounts that are members of this mount's peer group.
+Propagation here means that the same mount or unmount will automatically
+occur under all of the other mounts in the peer group.
+Conversely, mount and unmount events that take place under
+peer mounts will propagate to this mount.
+.TP
+.B MS_PRIVATE
+Make this mount private.
+Mount and unmount events do not propagate into or out of this mount.
+.TP
+.B MS_SLAVE
+If this is a shared mount that is a member of a peer group
+that contains other members, convert it to a slave mount.
+If this is a shared mount that is a member of a peer group
+that contains no other members, convert it to a private mount.
+Otherwise, the propagation type of the mount is left unchanged.
+.IP
+When a mount is a slave,
+mount and unmount events propagate into this mount from
+the (master) shared peer group of which it was formerly a member.
+Mount and unmount events under this mount do not propagate to any peer.
+.IP
+A mount can be the slave of another peer group
+while at the same time sharing mount and unmount events
+with a peer group of which it is a member.
+.TP
+.B MS_UNBINDABLE
+Make this mount unbindable.
+This is like a private mount,
+and in addition this mount can't be bind mounted.
+When a recursive bind mount
+.RB ( mount ()
+with the
+.B MS_BIND
+and
+.B MS_REC
+flags) is performed on a directory subtree,
+any unbindable mounts within the subtree are automatically pruned
+(i.e., not replicated)
+when replicating that subtree to produce the target subtree.
+.PP
+By default, changing the propagation type affects only the
+.I target
+mount.
+If the
+.B MS_REC
+flag is also specified in
+.IR mountflags ,
+then the propagation type of all mounts under
+.I target
+is also changed.
+.PP
+For further details regarding mount propagation types
+(including the default propagation type assigned to new mounts), see
+.BR mount_namespaces (7).
+.\"
+.SS Moving a mount
+If
+.I mountflags
+contains the flag
+.B MS_MOVE
+(available since Linux 2.4.18),
+then move a subtree:
+.I source
+specifies an existing mount and
+.I target
+specifies the new location to which that mount is to be relocated.
+The move is atomic: at no point is the subtree unmounted.
+.PP
+The remaining bits in the
+.I mountflags
+argument are ignored, as are the
+.I filesystemtype
+and
+.I data
+arguments.
+.\"
+.SS Creating a new mount
+If none of
+.BR MS_REMOUNT ,
+.BR MS_BIND ,
+.BR MS_MOVE ,
+.BR MS_SHARED ,
+.BR MS_PRIVATE ,
+.BR MS_SLAVE ,
+or
+.B MS_UNBINDABLE
+is specified in
+.IR mountflags ,
+then
+.BR mount ()
+performs its default action: creating a new mount.
+.I source
+specifies the source for the new mount, and
+.I target
+specifies the directory at which to create the mount point.
+.PP
+The
+.I filesystemtype
+and
+.I data
+arguments are employed, and further bits may be specified in
+.I mountflags
+to modify the behavior of the call.
+.\"
+.SH RETURN VALUE
+On success, zero is returned.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+The error values given below result from filesystem type independent
+errors.
+Each filesystem type may have its own special errors and its
+own special behavior.
+See the Linux kernel source code for details.
+.TP
+.B EACCES
+A component of a path was not searchable.
+(See also
+.BR path_resolution (7).)
+.TP
+.B EACCES
+Mounting a read-only filesystem was attempted without giving the
+.B MS_RDONLY
+flag.
+.IP
+The filesystem may be read-only for various reasons, including:
+it resides on a read-only optical disk;
+it resides on a device with a physical switch that has been set to
+mark the device read-only;
+the filesystem implementation was compiled with read-only support;
+or errors were detected when initially mounting the filesystem,
+so that it was marked read-only
+and can't be remounted as read-write (until the errors are fixed).
+.IP
+Some filesystems instead return the error
+.B EROFS
+on an attempt to mount a read-only filesystem.
+.TP
+.B EACCES
+The block device
+.I source
+is located on a filesystem mounted with the
+.B MS_NODEV
+option.
+.\" mtk: Probably: write permission is required for MS_BIND, with
+.\" the error EPERM if not present; CAP_DAC_OVERRIDE is required.
+.TP
+.B EBUSY
+An attempt was made to stack a new mount directly on
+top of an existing mount point that was created in this
+mount namespace with the same
+.I source
+and
+.IR target .
+.TP
+.B EBUSY
+.I source
+cannot be remounted read-only,
+because it still holds files open for writing.
+.TP
+.B EFAULT
+One of the pointer arguments points outside the user address space.
+.TP
+.B EINVAL
+.I source
+had an invalid superblock.
+.TP
+.B EINVAL
+A remount operation
+.RB ( MS_REMOUNT )
+was attempted, but
+.I source
+was not already mounted on
+.IR target .
+.TP
+.B EINVAL
+A move operation
+.RB ( MS_MOVE )
+was attempted, but the mount tree under
+.I source
+includes unbindable mounts and
+.I target
+is a mount that has propagation type
+.BR MS_SHARED .
+.TP
+.B EINVAL
+A move operation
+.RB ( MS_MOVE )
+was attempted, but the parent mount of the
+.I source
+mount has propagation type
+.BR MS_SHARED .
+.TP
+.B EINVAL
+A move operation
+.RB ( MS_MOVE )
+was attempted, but
+.I source
+was not a mount, or was \[aq]/\[aq].
+.TP
+.B EINVAL
+A bind operation
+.RB ( MS_BIND )
+was requested where
+.I source
+referred to a mount namespace magic link (i.e., a
+.IR /proc/ pid /ns/mnt
+magic link or a bind mount to such a link)
+and the propagation type of the parent mount of
+.I target
+was
+.BR MS_SHARED ,
+.\" See commit 8823c079ba7136dc1948d6f6dcb5f8022bde438e
+but propagation of the requested bind mount could lead to a circular
+dependency that might prevent the mount namespace from ever being freed.
+.TP
+.B EINVAL
+.I mountflags
+includes more than one of
+.BR MS_SHARED ,
+.BR MS_PRIVATE ,
+.BR MS_SLAVE ,
+or
+.BR MS_UNBINDABLE .
+.TP
+.B EINVAL
+.I mountflags
+includes
+.BR MS_SHARED ,
+.BR MS_PRIVATE ,
+.BR MS_SLAVE ,
+or
+.B MS_UNBINDABLE
+and also includes a flag other than
+.B MS_REC
+or
+.BR MS_SILENT .
+.TP
+.B EINVAL
+An attempt was made to bind mount an unbindable mount.
+.TP
+.B EINVAL
+In an unprivileged mount namespace
+(i.e., a mount namespace owned by a user namespace
+that was created by an unprivileged user),
+a bind mount operation
+.RB ( MS_BIND )
+was attempted without specifying
+.BR MS_REC ,
+which would have revealed the filesystem tree underneath one of
+the submounts of the directory being bound.
+.TP
+.B ELOOP
+Too many links encountered during pathname resolution.
+.TP
+.B ELOOP
+A move operation was attempted, and
+.I target
+is a descendant of
+.IR source .
+.TP
+.B EMFILE
+(In case no block device is required:)
+Table of dummy devices is full.
+.TP
+.B ENAMETOOLONG
+A pathname was longer than
+.BR MAXPATHLEN .
+.TP
+.B ENODEV
+.I filesystemtype
+not configured in the kernel.
+.TP
+.B ENOENT
+A pathname was empty or had a nonexistent component.
+.TP
+.B ENOMEM
+The kernel could not allocate a free page to copy filenames or data into.
+.TP
+.B ENOTBLK
+.I source
+is not a block device (and a device was required).
+.TP
+.B ENOTDIR
+.IR target ,
+or a prefix of
+.IR source ,
+is not a directory.
+.TP
+.B ENXIO
+The major number of the block device
+.I source
+is out of range.
+.TP
+.B EPERM
+The caller does not have the required privileges.
+.TP
+.B EPERM
+An attempt was made to modify
+.RB ( MS_REMOUNT )
+the
+.BR MS_RDONLY ,
+.BR MS_NOSUID ,
+or
+.B MS_NOEXEC
+flag, or one of the "atime" flags
+.RB ( MS_NOATIME ,
+.BR MS_NODIRATIME ,
+.BR MS_RELATIME )
+of an existing mount, but the mount is locked; see
+.BR mount_namespaces (7).
+.TP
+.B EROFS
+Mounting a read-only filesystem was attempted without giving the
+.B MS_RDONLY
+flag.
+See
+.BR EACCES ,
+above.
+.\"
+.SH STANDARDS
+Linux.
+.SH HISTORY
+The definitions of
+.BR MS_DIRSYNC ,
+.BR MS_MOVE ,
+.BR MS_PRIVATE ,
+.BR MS_REC ,
+.BR MS_RELATIME ,
+.BR MS_SHARED ,
+.BR MS_SLAVE ,
+.BR MS_STRICTATIME ,
+and
+.B MS_UNBINDABLE
+were added to glibc headers in glibc 2.12.
+.PP
+Since Linux 2.4 a single filesystem can be mounted at
+multiple mount points, and multiple mounts can be stacked
+on the same mount point.
+.\" Multiple mounts on same mount point: since Linux 2.3.99pre7.
+.PP
+The
+.I mountflags
+argument may have the magic number 0xC0ED (\fBMS_MGC_VAL\fP)
+in the top 16 bits.
+(All of the other flags discussed in DESCRIPTION
+occupy the low order 16 bits of
+.IR mountflags .)
+Specifying
+.B MS_MGC_VAL
+was required before Linux 2.4,
+but since Linux 2.4 is no longer required and is ignored if specified.
+.PP
+The original
+.B MS_SYNC
+flag was renamed
+.B MS_SYNCHRONOUS
+in Linux 1.1.69
+when a different
+.B MS_SYNC
+was added to \fI<mman.h>\fP.
+.PP
+Before Linux 2.4 an attempt to execute a set-user-ID or set-group-ID program
+on a filesystem mounted with
+.B MS_NOSUID
+would fail with
+.BR EPERM .
+Since Linux 2.4 the set-user-ID and set-group-ID bits are
+just silently ignored in this case.
+.\" The change is in patch-2.4.0-prerelease.
+.\"
+.SH NOTES
+.SS Mount namespaces
+Starting with Linux 2.4.19, Linux provides mount namespaces.
+A mount namespace is the set of filesystem mounts that
+are visible to a process.
+Mount namespaces can be (and usually are)
+shared between multiple processes,
+and changes to the namespace (i.e., mounts and unmounts) by one process
+are visible to all other processes sharing the same namespace.
+(The pre-2.4.19 Linux situation can be considered as one in which
+a single namespace was shared by every process on the system.)
+.PP
+A child process created by
+.BR fork (2)
+shares its parent's mount namespace;
+the mount namespace is preserved across an
+.BR execve (2).
+.PP
+A process can obtain a private mount namespace if:
+it was created using the
+.BR clone (2)
+.B CLONE_NEWNS
+flag,
+in which case its new namespace is initialized to be a
+.I copy
+of the namespace of the process that called
+.BR clone (2);
+or it calls
+.BR unshare (2)
+with the
+.B CLONE_NEWNS
+flag,
+which causes the caller's mount namespace to obtain a private copy
+of the namespace that it was previously sharing with other processes,
+so that future mounts and unmounts by the caller are invisible
+to other processes (except child processes that the caller
+subsequently creates) and vice versa.
+.PP
+For further details on mount namespaces, see
+.BR mount_namespaces (7).
+.\"
+.SS Parental relationship between mounts
+Each mount has a parent mount.
+The overall parental relationship of all mounts defines
+the single directory hierarchy seen by the processes within a mount namespace.
+.PP
+The parent of a new mount is defined when the mount is created.
+In the usual case,
+the parent of a new mount is the mount of the filesystem
+containing the directory or file at which the new mount is attached.
+In the case where a new mount is stacked on top of an existing mount,
+the parent of the new mount is the previous mount that was stacked
+at that location.
+.PP
+The parental relationship between mounts can be discovered via the
+.IR /proc/ pid /mountinfo
+file (see below).
+.\"
+.SS \fI/proc/\fPpid\fI/mounts\fP and \fI/proc/\fPpid\fI/mountinfo\fP
+The Linux-specific
+.IR /proc/ pid /mounts
+file exposes the list of mounts in the mount
+namespace of the process with the specified ID.
+The
+.IR /proc/ pid /mountinfo
+file exposes even more information about mounts,
+including the propagation type and mount ID information that makes it
+possible to discover the parental relationship between mounts.
+See
+.BR proc (5)
+and
+.BR mount_namespaces (7)
+for details of this file.
+.SH SEE ALSO
+.BR mountpoint (1),
+.BR chroot (2),
+.BR ioctl_iflags (2),
+.BR mount_setattr (2),
+.BR pivot_root (2),
+.BR umount (2),
+.BR mount_namespaces (7),
+.BR path_resolution (7),
+.BR findmnt (8),
+.BR lsblk (8),
+.BR mount (8),
+.BR umount (8)
diff --git a/man2/mount_setattr.2 b/man2/mount_setattr.2
new file mode 100644
index 0000000..fafaba2
--- /dev/null
+++ b/man2/mount_setattr.2
@@ -0,0 +1,1055 @@
+.\" Copyright (c) 2021 by Christian Brauner <christian.brauner@ubuntu.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.TH mount_setattr 2 2023-05-03 "Linux man-pages 6.05.01"
+.SH NAME
+mount_setattr \- change properties of a mount or mount tree
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#include <linux/fcntl.h>" " /* Definition of " AT_* " constants */"
+.BR "#include <linux/mount.h>" " /* Definition of " MOUNT_ATTR_* " constants */"
+.BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */"
+.B #include <unistd.h>
+.PP
+.BI "int syscall(SYS_mount_setattr, int " dirfd ", const char *" pathname ,
+.BI " unsigned int " flags ", struct mount_attr *" attr \
+", size_t " size );
+.fi
+.PP
+.IR Note :
+glibc provides no wrapper for
+.BR mount_setattr (),
+necessitating the use of
+.BR syscall (2).
+.SH DESCRIPTION
+The
+.BR mount_setattr ()
+system call changes the mount properties of a mount or an entire mount tree.
+If
+.I pathname
+is a relative pathname,
+then it is interpreted relative to
+the directory referred to by the file descriptor
+.IR dirfd .
+If
+.I dirfd
+is the special value
+.BR AT_FDCWD ,
+then
+.I pathname
+is interpreted relative to
+the current working directory of the calling process.
+If
+.I pathname
+is the empty string and
+.B AT_EMPTY_PATH
+is specified in
+.IR flags ,
+then the mount properties of the mount identified by
+.I dirfd
+are changed.
+(See
+.BR openat (2)
+for an explanation of why the
+.I dirfd
+argument is useful.)
+.PP
+The
+.BR mount_setattr ()
+system call uses an extensible structure
+.RI ( "struct mount_attr" )
+to allow for future extensions.
+Any non-flag extensions to
+.BR mount_setattr ()
+will be implemented as new fields appended to this structure,
+with a zero value in a new field resulting in the kernel behaving
+as though that extension field was not present.
+Therefore,
+the caller
+.I must
+zero-fill this structure on initialization.
+See the "Extensibility" subsection under
+.B NOTES
+for more details.
+.PP
+The
+.I size
+argument should usually be specified as
+.IR "sizeof(struct mount_attr)" .
+However, if the caller is using a kernel that supports an extended
+.IR "struct mount_attr" ,
+but the caller does not intend to make use of these features,
+it is possible to pass the size of an earlier
+version of the structure together with the extended structure.
+This allows the kernel to not copy later parts of the structure
+that aren't used anyway.
+With each extension that changes the size of
+.IR "struct mount_attr" ,
+the kernel will expose a definition of the form
+.BI MOUNT_ATTR_SIZE_VER number\c
+\&.
+For example, the macro for the size of the initial version of
+.I struct mount_attr
+is
+.BR MOUNT_ATTR_SIZE_VER0 .
+.PP
+The
+.I flags
+argument can be used to alter the pathname resolution behavior.
+The supported values are:
+.TP
+.B AT_EMPTY_PATH
+If
+.I pathname
+is the empty string,
+change the mount properties on
+.I dirfd
+itself.
+.TP
+.B AT_RECURSIVE
+Change the mount properties of the entire mount tree.
+.TP
+.B AT_SYMLINK_NOFOLLOW
+Don't follow trailing symbolic links.
+.TP
+.B AT_NO_AUTOMOUNT
+Don't trigger automounts.
+.PP
+The
+.I attr
+argument of
+.BR mount_setattr ()
+is a structure of the following form:
+.PP
+.in +4n
+.EX
+struct mount_attr {
+ __u64 attr_set; /* Mount properties to set */
+ __u64 attr_clr; /* Mount properties to clear */
+ __u64 propagation; /* Mount propagation type */
+ __u64 userns_fd; /* User namespace file descriptor */
+};
+.EE
+.in
+.PP
+The
+.I attr_set
+and
+.I attr_clr
+members are used to specify the mount properties that
+are supposed to be set or cleared for a mount or mount tree.
+Flags set in
+.I attr_set
+enable a property on a mount or mount tree,
+and flags set in
+.I attr_clr
+remove a property from a mount or mount tree.
+.PP
+When changing mount properties,
+the kernel will first clear the flags specified
+in the
+.I attr_clr
+field,
+and then set the flags specified in the
+.I attr_set
+field.
+For example, these settings:
+.PP
+.in +4n
+.EX
+struct mount_attr attr = {
+ .attr_clr = MOUNT_ATTR_NOEXEC | MOUNT_ATTR_NODEV,
+ .attr_set = MOUNT_ATTR_RDONLY | MOUNT_ATTR_NOSUID,
+};
+.EE
+.in
+.PP
+are equivalent to the following steps:
+.PP
+.in +4n
+.EX
+unsigned int current_mnt_flags = mnt\->mnt_flags;
+\&
+/*
+ * Clear all flags set in .attr_clr,
+ * clearing MOUNT_ATTR_NOEXEC and MOUNT_ATTR_NODEV.
+ */
+current_mnt_flags &= \(tiattr\->attr_clr;
+\&
+/*
+ * Now set all flags set in .attr_set,
+ * applying MOUNT_ATTR_RDONLY and MOUNT_ATTR_NOSUID.
+ */
+current_mnt_flags |= attr\->attr_set;
+\&
+mnt\->mnt_flags = current_mnt_flags;
+.EE
+.in
+.PP
+As a result of this change, the mount or mount tree (a) is read-only;
+(b) blocks the execution of set-user-ID and set-group-ID programs;
+(c) allows execution of programs; and (d) allows access to devices.
+.PP
+Multiple changes with the same set of flags requested
+in
+.I attr_clr
+and
+.I attr_set
+are guaranteed to be idempotent after the changes have been applied.
+.PP
+The following mount attributes can be specified in the
+.I attr_set
+or
+.I attr_clr
+fields:
+.TP
+.B MOUNT_ATTR_RDONLY
+If set in
+.IR attr_set ,
+makes the mount read-only.
+If set in
+.IR attr_clr ,
+removes the read-only setting if set on the mount.
+.TP
+.B MOUNT_ATTR_NOSUID
+If set in
+.IR attr_set ,
+causes the mount not to honor the set-user-ID and set-group-ID mode bits and
+file capabilities when executing programs.
+If set in
+.IR attr_clr ,
+clears the set-user-ID, set-group-ID,
+and file capability restriction if set on this mount.
+.TP
+.B MOUNT_ATTR_NODEV
+If set in
+.IR attr_set ,
+prevents access to devices on this mount.
+If set in
+.IR attr_clr ,
+removes the restriction that prevented accessing devices on this mount.
+.TP
+.B MOUNT_ATTR_NOEXEC
+If set in
+.IR attr_set ,
+prevents executing programs on this mount.
+If set in
+.IR attr_clr ,
+removes the restriction that prevented executing programs on this mount.
+.TP
+.B MOUNT_ATTR_NOSYMFOLLOW
+If set in
+.IR attr_set ,
+prevents following symbolic links on this mount.
+If set in
+.IR attr_clr ,
+removes the restriction that prevented following symbolic links on this mount.
+.TP
+.B MOUNT_ATTR_NODIRATIME
+If set in
+.IR attr_set ,
+prevents updating access time for directories on this mount.
+If set in
+.IR attr_clr ,
+removes the restriction that prevented updating access time for directories.
+Note that
+.B MOUNT_ATTR_NODIRATIME
+can be combined with other access-time settings
+and is implied by the noatime setting.
+All other access-time settings are mutually exclusive.
+.TP
+.BR MOUNT_ATTR__ATIME " - changing access-time settings"
+The access-time values listed below are an enumeration that
+includes the value zero, expressed in the bits defined by the mask
+.BR MOUNT_ATTR__ATIME .
+Even though these bits are an enumeration
+(in contrast to the other mount flags such as
+.BR MOUNT_ATTR_NOEXEC ),
+they are nonetheless passed in
+.I attr_set
+and
+.I attr_clr
+for consistency with
+.BR fsmount (2),
+which introduced this behavior.
+.IP
+Note that,
+since the access-time values are an enumeration rather than bit values,
+a caller wanting to transition to a different access-time setting
+cannot simply specify the access-time setting in
+.IR attr_set ,
+but must also include
+.B MOUNT_ATTR__ATIME
+in the
+.I attr_clr
+field.
+The kernel will verify that
+.B MOUNT_ATTR__ATIME
+isn't partially set in
+.I attr_clr
+(i.e., all bits in the
+.B MOUNT_ATTR__ATIME
+bit field are either set or clear), and that
+.I attr_set
+doesn't have any access-time bits set if
+.B MOUNT_ATTR__ATIME
+isn't set in
+.IR attr_clr .
+.RS
+.TP
+.B MOUNT_ATTR_RELATIME
+When a file is accessed via this mount,
+update the file's last access time (atime)
+only if the current value of atime is less than or equal to
+the file's last modification time (mtime) or last status change time (ctime).
+.IP
+To enable this access-time setting on a mount or mount tree,
+.B MOUNT_ATTR_RELATIME
+must be set in
+.I attr_set
+and
+.B MOUNT_ATTR__ATIME
+must be set in the
+.I attr_clr
+field.
+.TP
+.B MOUNT_ATTR_NOATIME
+Do not update access times for (all types of) files on this mount.
+.IP
+To enable this access-time setting on a mount or mount tree,
+.B MOUNT_ATTR_NOATIME
+must be set in
+.I attr_set
+and
+.B MOUNT_ATTR__ATIME
+must be set in the
+.I attr_clr
+field.
+.TP
+.B MOUNT_ATTR_STRICTATIME
+Always update the last access time (atime)
+when files are accessed on this mount.
+.IP
+To enable this access-time setting on a mount or mount tree,
+.B MOUNT_ATTR_STRICTATIME
+must be set in
+.I attr_set
+and
+.B MOUNT_ATTR__ATIME
+must be set in the
+.I attr_clr
+field.
+.RE
+.TP
+.B MOUNT_ATTR_IDMAP
+If set in
+.IR attr_set ,
+creates an ID-mapped mount.
+The ID mapping is taken from the user namespace specified in
+.I userns_fd
+and attached to the mount.
+.IP
+Since it is not supported to
+change the ID mapping of a mount after it has been ID mapped,
+it is invalid to specify
+.B MOUNT_ATTR_IDMAP
+in
+.IR attr_clr .
+.IP
+For further details, see the subsection "ID-mapped mounts" under NOTES.
+.PP
+The
+.I propagation
+field is used to specify the propagation type of the mount or mount tree.
+This field either has the value zero,
+meaning leave the propagation type unchanged, or it has one of
+the following values:
+.TP
+.B MS_PRIVATE
+Turn all mounts into private mounts.
+.TP
+.B MS_SHARED
+Turn all mounts into shared mounts.
+.TP
+.B MS_SLAVE
+Turn all mounts into dependent mounts.
+.TP
+.B MS_UNBINDABLE
+Turn all mounts into unbindable mounts.
+.PP
+For further details on the above propagation types, see
+.BR mount_namespaces (7).
+.SH RETURN VALUE
+On success,
+.BR mount_setattr ()
+returns zero.
+On error,
+\-1 is returned and
+.I errno
+is set to indicate the cause of the error.
+.SH ERRORS
+.TP
+.B EBADF
+.I pathname
+is relative but
+.I dirfd
+is neither
+.B AT_FDCWD
+nor a valid file descriptor.
+.TP
+.B EBADF
+.I userns_fd
+is not a valid file descriptor.
+.TP
+.B EBUSY
+The caller tried to change the mount to
+.BR MOUNT_ATTR_RDONLY ,
+but the mount still holds files open for writing.
+.TP
+.B EBUSY
+The caller tried to create an ID-mapped mount raising
+.B MOUNT_ATTR_IDMAP
+and specifying
+.I userns_fd
+but the mount still holds files open for writing.
+.TP
+.B EINVAL
+The pathname specified via the
+.I dirfd
+and
+.I pathname
+arguments to
+.BR mount_setattr ()
+isn't a mount point.
+.TP
+.B EINVAL
+An unsupported value was set in
+.IR flags .
+.TP
+.B EINVAL
+An unsupported value was specified in the
+.I attr_set
+field of
+.IR mount_attr .
+.TP
+.B EINVAL
+An unsupported value was specified in the
+.I attr_clr
+field of
+.IR mount_attr .
+.TP
+.B EINVAL
+An unsupported value was specified in the
+.I propagation
+field of
+.IR mount_attr .
+.TP
+.B EINVAL
+More than one of
+.BR MS_SHARED ,
+.BR MS_SLAVE ,
+.BR MS_PRIVATE ,
+or
+.B MS_UNBINDABLE
+was set in the
+.I propagation
+field of
+.IR mount_attr .
+.TP
+.B EINVAL
+An access-time setting was specified in the
+.I attr_set
+field without
+.B MOUNT_ATTR__ATIME
+being set in the
+.I attr_clr
+field.
+.TP
+.B EINVAL
+.B MOUNT_ATTR_IDMAP
+was specified in
+.IR attr_clr .
+.TP
+.B EINVAL
+A file descriptor value was specified in
+.I userns_fd
+which exceeds
+.BR INT_MAX .
+.TP
+.B EINVAL
+A valid file descriptor value was specified in
+.IR userns_fd ,
+but the file descriptor did not refer to a user namespace.
+.TP
+.B EINVAL
+The underlying filesystem does not support ID-mapped mounts.
+.TP
+.B EINVAL
+The mount that is to be ID mapped is not a detached mount;
+that is, the mount has already been visible in a mount namespace.
+.TP
+.B EINVAL
+A partial access-time setting was specified in
+.I attr_clr
+instead of
+.B MOUNT_ATTR__ATIME
+being set.
+.TP
+.B EINVAL
+The mount is located outside the caller's mount namespace.
+.TP
+.B EINVAL
+The underlying filesystem has been mounted in a mount namespace that is
+owned by a noninitial user namespace.
+.TP
+.B ENOENT
+A pathname was empty or had a nonexistent component.
+.TP
+.B ENOMEM
+When changing mount propagation to
+.BR MS_SHARED ,
+a new peer group ID needs to be allocated for all mounts without a peer group
+ID set.
+This allocation failed because there was not
+enough memory to allocate the relevant internal structures.
+.TP
+.B ENOSPC
+When changing mount propagation to
+.BR MS_SHARED ,
+a new peer group ID needs to be allocated for all mounts without a peer group
+ID set.
+This allocation failed because
+the kernel has run out of IDs.
+.\" Christian Brauner: i.e. someone has somehow managed to
+.\" allocate so many peer groups and managed to keep the kernel running
+.\" (???) that the ida has ran out of ids
+.\" Note that technically further error codes are possible that are
+.\" specific to the ID allocation implementation used.
+.TP
+.B EPERM
+One of the mounts had at least one of
+.BR MOUNT_ATTR_NOATIME ,
+.BR MOUNT_ATTR_NODEV ,
+.BR MOUNT_ATTR_NODIRATIME ,
+.BR MOUNT_ATTR_NOEXEC ,
+.BR MOUNT_ATTR_NOSUID ,
+or
+.B MOUNT_ATTR_RDONLY
+set and the flag is locked.
+Mount attributes become locked on a mount if:
+.RS
+.IP \[bu] 3
+A new mount or mount tree is created causing mount propagation across user
+namespaces
+(i.e., propagation to a mount namespace owned by a different user namespace).
+The kernel will lock the aforementioned flags to prevent these sensitive
+properties from being altered.
+.IP \[bu]
+A new mount and user namespace pair is created.
+This happens for example when specifying
+.B CLONE_NEWUSER | CLONE_NEWNS
+in
+.BR unshare (2),
+.BR clone (2),
+or
+.BR clone3 (2).
+The aforementioned flags become locked in the new mount namespace
+to prevent sensitive mount properties from being altered.
+Since the newly created mount namespace will be owned by the
+newly created user namespace,
+a calling process that is privileged in the new
+user namespace would\[em]in the absence of such locking\[em]be
+able to alter sensitive mount properties (e.g., to remount a mount
+that was marked read-only as read-write in the new mount namespace).
+.RE
+.TP
+.B EPERM
+A valid file descriptor value was specified in
+.IR userns_fd ,
+but the file descriptor refers to the initial user namespace.
+.TP
+.B EPERM
+An attempt was made to add an ID mapping to a mount that is already ID mapped.
+.TP
+.B EPERM
+The caller does not have
+.B CAP_SYS_ADMIN
+in the initial user namespace.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 5.12.
+.\" commit 7d6beb71da3cc033649d641e1e608713b8220290
+.\" commit 2a1867219c7b27f928e2545782b86daaf9ad50bd
+.\" commit 9caccd41541a6f7d6279928d9f971f6642c361af
+.SH NOTES
+.SS ID-mapped mounts
+Creating an ID-mapped mount makes it possible to
+change the ownership of all files located under a mount.
+Thus, ID-mapped mounts make it possible to
+change ownership in a temporary and localized way.
+It is a localized change because the ownership changes are
+visible only via a specific mount.
+All other users and locations where the filesystem is exposed are unaffected.
+It is a temporary change because
+the ownership changes are tied to the lifetime of the mount.
+.PP
+Whenever callers interact with the filesystem through an ID-mapped mount,
+the ID mapping of the mount will be applied to
+user and group IDs associated with filesystem objects.
+This encompasses the user and group IDs associated with inodes
+and also the following
+.BR xattr (7)
+keys:
+.IP \[bu] 3
+.IR security.capability ,
+whenever filesystem capabilities
+are stored or returned in the
+.B VFS_CAP_REVISION_3
+format,
+which stores a root user ID alongside the capabilities
+(see
+.BR capabilities (7)).
+.IP \[bu]
+.I system.posix_acl_access
+and
+.IR system.posix_acl_default ,
+whenever user IDs or group IDs are stored in
+.B ACL_USER
+or
+.B ACL_GROUP
+entries.
+.PP
+The following conditions must be met in order to create an ID-mapped mount:
+.IP \[bu] 3
+The caller must have the
+.B CAP_SYS_ADMIN
+capability in the user namespace the filesystem was mounted in.
+.\" commit bd303368b776eead1c29e6cdda82bde7128b82a7
+.\" Christian Brauner
+.\" Note, currently no filesystems mountable in non-initial user namespaces
+.\" support ID-mapped mounts.
+.IP \[bu]
+The underlying filesystem must support ID-mapped mounts.
+Currently, the following filesystems support ID-mapped mounts:
+.\" fs_flags = FS_ALLOW_IDMAP in kernel sources
+.RS
+.IP \[bu] 3
+.PD 0
+.BR xfs (5)
+(since Linux 5.12)
+.IP \[bu]
+.BR ext4 (5)
+(since Linux 5.12)
+.IP \[bu]
+.B FAT
+(since Linux 5.12)
+.IP \[bu]
+.BR btrfs (5)
+(since Linux 5.15)
+.\" commit 5b9b26f5d0b88b74001dcfe4ab8a8f2f4e744112
+.IP \[bu]
+.B ntfs3
+(since Linux 5.15)
+.\" commit 82cae269cfa953032fbb8980a7d554d60fb00b17
+.IP \[bu]
+.B f2fs
+(since Linux 5.18)
+.\" commit 984fc4e76d63345499f01c0c198a4b44860cf027
+.IP \[bu]
+.B erofs
+(since Linux 5.19)
+.\" commit 6c459b78d4793afbba6d864c466cc5cd2932459d
+.IP \[bu]
+.B overlayfs
+(ID-mapped lower and upper layers supported since Linux 5.19)
+.PD
+.RE
+.IP \[bu]
+The mount must not already be ID-mapped.
+This also implies that the ID mapping of a mount cannot be altered.
+.IP \[bu]
+The mount must not have any writers.
+.\" commit 1bbcd277a53e08d619ffeec56c5c9287f2bf42f
+.IP \[bu]
+The mount must be a detached mount;
+that is,
+it must have been created by calling
+.BR open_tree (2)
+with the
+.B OPEN_TREE_CLONE
+flag and it must not already have been visible in a mount namespace.
+(To put things another way:
+the mount must not have been attached to the filesystem hierarchy
+with a system call such as
+.BR move_mount (2).)
+.PP
+ID mappings can be created for user IDs, group IDs, and project IDs.
+An ID mapping is essentially a mapping of a range of user or group IDs into
+another or the same range of user or group IDs.
+ID mappings are written to map files as three numbers
+separated by white space.
+The first two numbers specify the starting user or group ID
+in each of the two user namespaces.
+The third number specifies the range of the ID mapping.
+For example,
+a mapping for user IDs such as "1000\ 1001\ 1" would indicate that
+user ID 1000 in the caller's user namespace is mapped to
+user ID 1001 in its ancestor user namespace.
+Since the map range is 1,
+only user ID 1000 is mapped.
+.PP
+It is possible to specify up to 340 ID mappings for each ID mapping type.
+If any user IDs or group IDs are not mapped,
+all files owned by that unmapped user or group ID will appear as
+being owned by the overflow user ID or overflow group ID respectively.
+.PP
+Further details on setting up ID mappings can be found in
+.BR user_namespaces (7).
+.PP
+In the common case, the user namespace passed in
+.I userns_fd
+(together with
+.B MOUNT_ATTR_IDMAP
+in
+.IR attr_set )
+to create an ID-mapped mount will be the user namespace of a container.
+In other scenarios it will be a dedicated user namespace associated with
+a user's login session as is the case for portable home directories in
+.BR systemd-homed.service (8).
+It is also perfectly fine to create a dedicated user namespace
+for the sake of ID mapping a mount.
+.PP
+ID-mapped mounts can be useful in the following
+and a variety of other scenarios:
+.IP \[bu] 3
+Sharing files or filesystems
+between multiple users or multiple machines,
+especially in complex scenarios.
+For example,
+ID-mapped mounts are used to implement portable home directories in
+.BR systemd-homed.service (8),
+where they allow users to move their home directory
+to an external storage device
+and use it on multiple computers
+where they are assigned different user IDs and group IDs.
+This effectively makes it possible to
+assign random user IDs and group IDs at login time.
+.IP \[bu]
+Sharing files or filesystems
+from the host with unprivileged containers.
+This allows a user to avoid having to change ownership permanently through
+.BR chown (2).
+.IP \[bu]
+ID mapping a container's root filesystem.
+Users don't need to change ownership permanently through
+.BR chown (2).
+Especially for large root filesystems, using
+.BR chown (2)
+can be prohibitively expensive.
+.IP \[bu]
+Sharing files or filesystems
+between containers with non-overlapping ID mappings.
+.IP \[bu]
+Implementing discretionary access control (DAC) permission checking
+for filesystems lacking a concept of ownership.
+.IP \[bu]
+Efficiently changing ownership on a per-mount basis.
+In contrast to
+.BR chown (2),
+changing ownership of large sets of files is instantaneous with
+ID-mapped mounts.
+This is especially useful when ownership of
+an entire root filesystem of a virtual machine or container
+is to be changed as mentioned above.
+With ID-mapped mounts,
+a single
+.BR mount_setattr ()
+system call will be sufficient to change the ownership of all files.
+.IP \[bu]
+Taking the current ownership into account.
+ID mappings specify precisely
+what a user or group ID is supposed to be mapped to.
+This contrasts with the
+.BR chown (2)
+system call which cannot by itself
+take the current ownership of the files it changes into account.
+It simply changes the ownership to the specified user ID and group ID.
+.IP \[bu]
+Locally and temporarily restricted ownership changes.
+ID-mapped mounts make it possible to change ownership locally,
+restricting the ownership changes to specific mounts,
+and temporarily as the ownership changes only apply as long as the mount exists.
+By contrast,
+changing ownership via the
+.BR chown (2)
+system call changes the ownership globally and permanently.
+.\"
+.SS Extensibility
+In order to allow for future extensibility,
+.BR mount_setattr ()
+requires the user-space application to specify the size of the
+.I mount_attr
+structure that it is passing.
+By providing this information, it is possible for
+.BR mount_setattr ()
+to provide both forwards- and backwards-compatibility, with
+.I size
+acting as an implicit version number.
+(Because new extension fields will always
+be appended, the structure size will always increase.)
+This extensibility design is very similar to other system calls such as
+.BR sched_setattr (2),
+.BR perf_event_open (2),
+.BR clone3 (2)
+and
+.BR openat2 (2).
+.PP
+Let
+.I usize
+be the size of the structure as specified by the user-space application,
+and let
+.I ksize
+be the size of the structure which the kernel supports,
+then there are three cases to consider:
+.IP \[bu] 3
+If
+.I ksize
+equals
+.IR usize ,
+then there is no version mismatch and
+.I attr
+can be used verbatim.
+.IP \[bu]
+If
+.I ksize
+is larger than
+.IR usize ,
+then there are some extension fields that the kernel supports
+which the user-space application is unaware of.
+Because a zero value in any added extension field signifies a no-op,
+the kernel treats all of the extension fields
+not provided by the user-space application
+as having zero values.
+This provides backwards-compatibility.
+.IP \[bu]
+If
+.I ksize
+is smaller than
+.IR usize ,
+then there are some extension fields which the user-space application is aware
+of but which the kernel does not support.
+Because any extension field must have its zero values signify a no-op,
+the kernel can safely ignore the unsupported extension fields
+if they are all zero.
+If any unsupported extension fields are non-zero,
+then \-1 is returned and
+.I errno
+is set to
+.BR E2BIG .
+This provides forwards-compatibility.
+.PP
+Because the definition of
+.I struct mount_attr
+may change in the future
+(with new fields being added when system headers are updated),
+user-space applications should zero-fill
+.I struct mount_attr
+to ensure that recompiling the program with new headers will not result in
+spurious errors at run time.
+The simplest way is to use a designated initializer:
+.PP
+.in +4n
+.EX
+struct mount_attr attr = {
+ .attr_set = MOUNT_ATTR_RDONLY,
+ .attr_clr = MOUNT_ATTR_NODEV
+};
+.EE
+.in
+.PP
+Alternatively, the structure can be zero-filled using
+.BR memset (3)
+or similar functions:
+.PP
+.in +4n
+.EX
+struct mount_attr attr;
+memset(&attr, 0, sizeof(attr));
+attr.attr_set = MOUNT_ATTR_RDONLY;
+attr.attr_clr = MOUNT_ATTR_NODEV;
+.EE
+.in
+.PP
+A user-space application that wishes to determine which extensions the running
+kernel supports can do so by conducting a binary search on
+.I size
+with a structure which has every byte nonzero
+(to find the largest value which doesn't produce an error of
+.BR E2BIG ).
+.SH EXAMPLES
+.\" SRC BEGIN (mount_setattr.c)
+.EX
+/*
+ * This program allows the caller to create a new detached mount
+ * and set various properties on it.
+ */
+#define _GNU_SOURCE
+#include <err.h>
+#include <fcntl.h>
+#include <getopt.h>
+#include <linux/mount.h>
+#include <linux/types.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+\&
+static inline int
+mount_setattr(int dirfd, const char *pathname, unsigned int flags,
+ struct mount_attr *attr, size_t size)
+{
+ return syscall(SYS_mount_setattr, dirfd, pathname, flags,
+ attr, size);
+}
+\&
+static inline int
+open_tree(int dirfd, const char *filename, unsigned int flags)
+{
+ return syscall(SYS_open_tree, dirfd, filename, flags);
+}
+\&
+static inline int
+move_mount(int from_dirfd, const char *from_pathname,
+ int to_dirfd, const char *to_pathname, unsigned int flags)
+{
+ return syscall(SYS_move_mount, from_dirfd, from_pathname,
+ to_dirfd, to_pathname, flags);
+}
+\&
+static const struct option longopts[] = {
+ {"map\-mount", required_argument, NULL, \[aq]a\[aq]},
+ {"recursive", no_argument, NULL, \[aq]b\[aq]},
+ {"read\-only", no_argument, NULL, \[aq]c\[aq]},
+ {"block\-setid", no_argument, NULL, \[aq]d\[aq]},
+ {"block\-devices", no_argument, NULL, \[aq]e\[aq]},
+ {"block\-exec", no_argument, NULL, \[aq]f\[aq]},
+ {"no\-access\-time", no_argument, NULL, \[aq]g\[aq]},
+ { NULL, 0, NULL, 0 },
+};
+\&
+int
+main(int argc, char *argv[])
+{
+ int fd_userns = \-1;
+ int fd_tree;
+ int index = 0;
+ int ret;
+ bool recursive = false;
+ const char *source;
+ const char *target;
+ struct mount_attr *attr = &(struct mount_attr){};
+\&
+ while ((ret = getopt_long_only(argc, argv, "",
+ longopts, &index)) != \-1) {
+ switch (ret) {
+ case \[aq]a\[aq]:
+ fd_userns = open(optarg, O_RDONLY | O_CLOEXEC);
+ if (fd_userns == \-1)
+ err(EXIT_FAILURE, "open(%s)", optarg);
+ break;
+ case \[aq]b\[aq]:
+ recursive = true;
+ break;
+ case \[aq]c\[aq]:
+ attr\->attr_set |= MOUNT_ATTR_RDONLY;
+ break;
+ case \[aq]d\[aq]:
+ attr\->attr_set |= MOUNT_ATTR_NOSUID;
+ break;
+ case \[aq]e\[aq]:
+ attr\->attr_set |= MOUNT_ATTR_NODEV;
+ break;
+ case \[aq]f\[aq]:
+ attr\->attr_set |= MOUNT_ATTR_NOEXEC;
+ break;
+ case \[aq]g\[aq]:
+ attr\->attr_set |= MOUNT_ATTR_NOATIME;
+ attr\->attr_clr |= MOUNT_ATTR__ATIME;
+ break;
+ default:
+ errx(EXIT_FAILURE, "Invalid argument specified");
+ }
+ }
+\&
+ if ((argc \- optind) < 2)
+ errx(EXIT_FAILURE, "Missing source or target mount point");
+\&
+ source = argv[optind];
+ target = argv[optind + 1];
+\&
+ /* In the following, \-1 as the \[aq]dirfd\[aq] argument ensures that
+ open_tree() fails if \[aq]source\[aq] is not an absolute pathname. */
+.\" Christian Brauner
+.\" When writing programs I like to never use relative paths with AT_FDCWD
+.\" because making assumptions about the current working directory
+.\" of the calling process is just too easy to get wrong; especially when
+.\" pivot_root() or chroot() are in play.
+.\" My absolut preference (joke intended) is to open a well-known starting
+.\" point with an absolute path to get a dirfd and then scope all future
+.\" operations beneath that dirfd. This already works with old-style
+.\" openat() and _very_ cautious programming but openat2() and its
+.\" resolve-flag space have made this **chef's kiss**.
+.\" If I can't operate based on a well-known dirfd I use absolute paths
+.\" with a -EBADF dirfd passed to *at() functions.
+\&
+ fd_tree = open_tree(\-1, source,
+ OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC |
+ AT_EMPTY_PATH | (recursive ? AT_RECURSIVE : 0));
+ if (fd_tree == \-1)
+ err(EXIT_FAILURE, "open(%s)", source);
+\&
+ if (fd_userns >= 0) {
+ attr\->attr_set |= MOUNT_ATTR_IDMAP;
+ attr\->userns_fd = fd_userns;
+ }
+\&
+ ret = mount_setattr(fd_tree, "",
+ AT_EMPTY_PATH | (recursive ? AT_RECURSIVE : 0),
+ attr, sizeof(struct mount_attr));
+ if (ret == \-1)
+ err(EXIT_FAILURE, "mount_setattr");
+\&
+ close(fd_userns);
+\&
+ /* In the following, \-1 as the \[aq]to_dirfd\[aq] argument ensures that
+ move_mount() fails if \[aq]target\[aq] is not an absolute pathname. */
+\&
+ ret = move_mount(fd_tree, "", \-1, target,
+ MOVE_MOUNT_F_EMPTY_PATH);
+ if (ret == \-1)
+ err(EXIT_FAILURE, "move_mount() to %s", target);
+\&
+ close(fd_tree);
+\&
+ exit(EXIT_SUCCESS);
+}
+.EE
+.\" SRC END
+.SH SEE ALSO
+.BR newgidmap (1),
+.BR newuidmap (1),
+.BR clone (2),
+.BR mount (2),
+.BR unshare (2),
+.BR proc (5),
+.BR capabilities (7),
+.BR mount_namespaces (7),
+.BR user_namespaces (7),
+.BR xattr (7)
diff --git a/man2/move_pages.2 b/man2/move_pages.2
new file mode 100644
index 0000000..c2be4bb
--- /dev/null
+++ b/man2/move_pages.2
@@ -0,0 +1,253 @@
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft-2-para
+.\"
+.\" This manpage is Copyright (C) 2006 Silicon Graphics, Inc.
+.\" Christoph Lameter
+.\"
+.\" FIXME Should programs normally be using move_pages() directly, or should
+.\" they rather be using interfaces in the numactl package?
+.\" (e.g., compare with recommendation in mbind(2)).
+.\" Does this page need to give advice on this topic?
+.\"
+.TH move_pages 2 2023-07-15 "Linux man-pages 6.05.01"
+.SH NAME
+move_pages \- move individual pages of a process to another node
+.SH LIBRARY
+NUMA (Non-Uniform Memory Access) policy library
+.RI ( libnuma ", " \-lnuma )
+.SH SYNOPSIS
+.nf
+.B #include <numaif.h>
+.PP
+.BI "long move_pages(int " pid ", unsigned long " count ", \
+void *" pages [. count ],
+.BI " const int " nodes [. count "], int " status [. count "], \
+int " flags );
+.fi
+.SH DESCRIPTION
+.BR move_pages ()
+moves the specified
+.I pages
+of the process
+.I pid
+to the memory nodes specified by
+.IR nodes .
+The result of the move is reflected in
+.IR status .
+The
+.I flags
+indicate constraints on the pages to be moved.
+.PP
+.I pid
+is the ID of the process in which pages are to be moved.
+If
+.I pid
+is 0, then
+.BR move_pages ()
+moves pages of the calling process.
+.PP
+To move pages in another process requires the following privileges:
+.IP \[bu] 3
+Up to and including Linux 4.12:
+the caller must be privileged
+.RB ( CAP_SYS_NICE )
+or the real or effective user ID of the calling process must match the
+real or saved-set user ID of the target process.
+.IP \[bu]
+Because the older rules allowed the caller to discover various
+virtual address choices made by the kernel that could lead
+to the defeat of address-space-layout randomization
+for a process owned by the same UID as the caller,
+the rules were changed starting with Linux 4.13.
+Since Linux 4.13,
+.\" commit 197e7e521384a23b9e585178f3f11c9fa08274b9
+permission is governed by a ptrace access mode
+.B PTRACE_MODE_READ_REALCREDS
+check with respect to the target process; see
+.BR ptrace (2).
+.PP
+.I count
+is the number of pages to move.
+It defines the size of the three arrays
+.IR pages ,
+.IR nodes ,
+and
+.IR status .
+.PP
+.I pages
+is an array of pointers to the pages that should be moved.
+These are pointers that should be aligned to page boundaries.
+.\" FIXME Describe the result if pointers in the 'pages' array are
+.\" not aligned to page boundaries
+Addresses are specified as seen by the process specified by
+.IR pid .
+.PP
+.I nodes
+is an array of integers that specify the desired location for each page.
+Each element in the array is a node number.
+.I nodes
+can also be NULL, in which case
+.BR move_pages ()
+does not move any pages but instead will return the node
+where each page currently resides, in the
+.I status
+array.
+Obtaining the status of each page may be necessary to determine
+pages that need to be moved.
+.PP
+.I status
+is an array of integers that return the status of each page.
+The array contains valid values only if
+.BR move_pages ()
+did not return an error.
+Preinitializing the array to a value
+that cannot represent a real NUMA node or a valid error value of the status array
+can help to identify pages that have been migrated.
+.PP
+.I flags
+specify what types of pages to move.
+.B MPOL_MF_MOVE
+means that only pages that are in exclusive use by the process
+are to be moved.
+.B MPOL_MF_MOVE_ALL
+means that pages shared between multiple processes can also be moved.
+The process must be privileged
+.RB ( CAP_SYS_NICE )
+to use
+.BR MPOL_MF_MOVE_ALL .
+.SS Page states in the status array
+The following values can be returned in each element of the
+.I status
+array.
+.TP
+.B 0..MAX_NUMNODES
+Identifies the node on which the page resides.
+.TP
+.B \-EACCES
+The page is mapped by multiple processes and can be moved only if
+.B MPOL_MF_MOVE_ALL
+is specified.
+.TP
+.B \-EBUSY
+The page is currently busy and cannot be moved.
+Try again later.
+This occurs if a page is undergoing I/O or another kernel subsystem
+is holding a reference to the page.
+.TP
+.B \-EFAULT
+This is a zero page or the memory area is not mapped by the process.
+.TP
+.B \-EIO
+Unable to write back a page.
+The page has to be written back
+in order to move it since the page is dirty and the filesystem
+does not provide a migration function that would allow the move
+of dirty pages.
+.TP
+.B \-EINVAL
+A dirty page cannot be moved.
+The filesystem does not
+provide a migration function and has no ability to write back pages.
+.TP
+.B \-ENOENT
+The page is not present.
+.TP
+.B \-ENOMEM
+Unable to allocate memory on target node.
+.SH RETURN VALUE
+On success
+.BR move_pages ()
+returns zero.
+.\" FIXME . Is the following quite true: does the wrapper in numactl
+.\" do the right thing?
+On error, it returns \-1, and sets
+.I errno
+to indicate the error.
+If a positive value is returned, it is the number of
+nonmigrated pages.
+.SH ERRORS
+.TP
+.B Positive value
+The number of nonmigrated pages if they were the result of nonfatal
+reasons (since
+.\" commit a49bd4d7163707de377aee062f17befef6da891b
+Linux 4.17).
+.TP
+.B E2BIG
+Too many pages to move.
+Since Linux 2.6.29,
+.\" commit 3140a2273009c01c27d316f35ab76a37e105fdd8
+the kernel no longer generates this error.
+.TP
+.B EACCES
+.\" FIXME Clarify "current cpuset" in the description of the EACCES error.
+.\" Is that the cpuset of the caller or the target?
+One of the target nodes is not allowed by the current cpuset.
+.TP
+.B EFAULT
+Parameter array could not be accessed.
+.TP
+.B EINVAL
+Flags other than
+.B MPOL_MF_MOVE
+and
+.B MPOL_MF_MOVE_ALL
+were specified or an attempt was made to migrate pages of a kernel thread.
+.TP
+.B ENODEV
+One of the target nodes is not online.
+.TP
+.B EPERM
+The caller specified
+.B MPOL_MF_MOVE_ALL
+without sufficient privileges
+.RB ( CAP_SYS_NICE ).
+Or, the caller attempted to move pages of a process belonging
+to another user but did not have privilege to do so
+.RB ( CAP_SYS_NICE ).
+.TP
+.B ESRCH
+Process does not exist.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 2.6.18.
+.SH NOTES
+For information on library support, see
+.BR numa (7).
+.PP
+Use
+.BR get_mempolicy (2)
+with the
+.B MPOL_F_MEMS_ALLOWED
+flag to obtain the set of nodes that are allowed by
+.\" FIXME Clarify "current cpuset". Is that the cpuset of the caller
+.\" or the target?
+the current cpuset.
+Note that this information is subject to change at any
+time by manual or automatic reconfiguration of the cpuset.
+.PP
+Use of this function may result in pages whose location
+(node) violates the memory policy established for the
+specified addresses (see
+.BR mbind (2))
+and/or the specified process (see
+.BR set_mempolicy (2)).
+That is, memory policy does not constrain the destination
+nodes used by
+.BR move_pages ().
+.PP
+The
+.I <numaif.h>
+header is not included with glibc, but requires installing
+.I libnuma\-devel
+or a similar package.
+.SH SEE ALSO
+.BR get_mempolicy (2),
+.BR mbind (2),
+.BR set_mempolicy (2),
+.BR numa (3),
+.BR numa_maps (5),
+.BR cpuset (7),
+.BR numa (7),
+.BR migratepages (8),
+.BR numastat (8)
diff --git a/man2/mprotect.2 b/man2/mprotect.2
new file mode 100644
index 0000000..22aa42b
--- /dev/null
+++ b/man2/mprotect.2
@@ -0,0 +1,363 @@
+.\" Copyright (C) 2007 Michael Kerrisk <mtk.manpages@gmail.com>
+.\" and Copyright (C) 1995 Michael Shields <shields@tembel.org>.
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified 1996-10-22 by Eric S. Raymond <esr@thyrsus.com>
+.\" Modified 1997-05-31 by Andries Brouwer <aeb@cwi.nl>
+.\" Modified 2003-08-24 by Andries Brouwer <aeb@cwi.nl>
+.\" Modified 2004-08-16 by Andi Kleen <ak@muc.de>
+.\" 2007-06-02, mtk: Fairly substantial rewrites and additions, and
+.\" a much improved example program.
+.\"
+.TH mprotect 2 2023-05-03 "Linux man-pages 6.05.01"
+.SH NAME
+mprotect, pkey_mprotect \- set protection on a region of memory
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/mman.h>
+.PP
+.BI "int mprotect(void " addr [. len "], size_t " len ", int " prot );
+.PP
+.BR "#define _GNU_SOURCE" " /* See feature_test_macros(7) */"
+.B #include <sys/mman.h>
+.PP
+.BI "int pkey_mprotect(void " addr [. len "], size_t " len ", int " prot ", int " pkey ");"
+.fi
+.SH DESCRIPTION
+.BR mprotect ()
+changes the access protections for the calling process's memory pages
+containing any part of the address range in the
+interval [\fIaddr\fP,\ \fIaddr\fP+\fIlen\fP\-1].
+.I addr
+must be aligned to a page boundary.
+.PP
+If the calling process tries to access memory in a manner
+that violates the protections, then the kernel generates a
+.B SIGSEGV
+signal for the process.
+.PP
+.I prot
+is a combination of the following access flags:
+.B PROT_NONE
+or a bitwise OR of the other values in the following list:
+.TP
+.B PROT_NONE
+The memory cannot be accessed at all.
+.TP
+.B PROT_READ
+The memory can be read.
+.TP
+.B PROT_WRITE
+The memory can be modified.
+.TP
+.B PROT_EXEC
+The memory can be executed.
+.TP
+.BR PROT_SEM " (since Linux 2.5.7)"
+The memory can be used for atomic operations.
+This flag was introduced as part of the
+.BR futex (2)
+implementation (in order to guarantee the ability to perform atomic
+operations required by commands such as
+.BR FUTEX_WAIT ),
+but is not currently used on any architecture.
+.TP
+.BR PROT_SAO " (since Linux 2.6.26)"
+.\" commit aba46c5027cb59d98052231b36efcbbde9c77a1d
+.\" commit ef3d3246a0d06be622867d21af25f997aeeb105f
+The memory should have strong access ordering.
+This feature is specific to
+the PowerPC architecture
+(version 2.06 of the architecture specification adds the SAO CPU feature,
+and it is available on POWER 7 or PowerPC A2, for example).
+.PP
+Additionally (since Linux 2.6.0),
+.I prot
+can have one of the following flags set:
+.TP
+.\" mm/mmap.c:
+.\" vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) |
+.\" mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
+.\" And calc_vm_flag_bits converts only GROWSDOWN/DENYWRITE/LOCKED.
+.B PROT_GROWSUP
+Apply the protection mode up to the end of a mapping
+that grows upwards.
+(Such mappings are created for the stack area on
+architectures\[em]for example, HP-PARISC\[em]that
+have an upwardly growing stack.)
+.\" The VMA is one that was marked with VM_GROWSUP by the kernel
+.\" when the stack was created. Note that (unlike VM_GROWSDOWN),
+.\" there is no mmap() flag (analogous to MAP_GROWSDOWN) for
+.\" creating a VMA that is marked VM_GROWSUP.
+.TP
+.B PROT_GROWSDOWN
+Apply the protection mode down to the beginning of a mapping
+that grows downward
+(which should be a stack segment or a segment mapped with the
+.B MAP_GROWSDOWN
+flag set).
+.PP
+Like
+.BR mprotect (),
+.BR pkey_mprotect ()
+changes the protection on the pages specified by
+.I addr
+and
+.IR len .
+The
+.I pkey
+argument specifies the protection key (see
+.BR pkeys (7))
+to assign to the memory.
+The protection key must be allocated with
+.BR pkey_alloc (2)
+before it is passed to
+.BR pkey_mprotect ().
+For an example of the use of this system call, see
+.BR pkeys (7).
+.SH RETURN VALUE
+On success,
+.BR mprotect ()
+and
+.BR pkey_mprotect ()
+return zero.
+On error, these system calls return \-1, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EACCES
+The memory cannot be given the specified access.
+This can happen, for example, if you
+.BR mmap (2)
+a file to which you have read-only access, then ask
+.BR mprotect ()
+to mark it
+.BR PROT_WRITE .
+.TP
+.B EINVAL
+\fIaddr\fP is not a valid pointer,
+or not a multiple of the system page size.
+.TP
+.B EINVAL
+.RB ( pkey_mprotect ())
+\fIpkey\fP has not been allocated with
+.BR pkey_alloc (2).
+.TP
+.B EINVAL
+Both
+.B PROT_GROWSUP
+and
+.B PROT_GROWSDOWN
+were specified in
+.IR prot .
+.TP
+.B EINVAL
+Invalid flags specified in
+.IR prot .
+.TP
+.B EINVAL
+(PowerPC architecture)
+.B PROT_SAO
+was specified in
+.IR prot ,
+but the SAO hardware feature is not available.
+.TP
+.B ENOMEM
+Internal kernel structures could not be allocated.
+.TP
+.B ENOMEM
+Addresses in the range
+.RI [ addr ,
+.IR addr + len \-1]
+are invalid for the address space of the process,
+or specify one or more pages that are not mapped.
+(Before Linux 2.4.19, the error
+.B EFAULT
+was incorrectly produced for these cases.)
+.TP
+.B ENOMEM
+Changing the protection of a memory region would result in the total number of
+mappings with distinct attributes (e.g., read versus read/write protection)
+exceeding the allowed maximum.
+.\" I.e., the number of VMAs would exceed the 64 kB maximum
+(For example, making the protection of a range
+.B PROT_READ
+in the middle of a region currently protected as
+.B PROT_READ|PROT_WRITE
+would result in three mappings:
+two read/write mappings at each end and a read-only mapping in the middle.)
+.SH VERSIONS
+.\" SVr4 defines an additional error
+.\" code EAGAIN. The SVr4 error conditions don't map neatly onto Linux's.
+POSIX says that the behavior of
+.BR mprotect ()
+is unspecified if it is applied to a region of memory that
+was not obtained via
+.BR mmap (2).
+.PP
+On Linux, it is always permissible to call
+.BR mprotect ()
+on any address in a process's address space (except for the
+kernel vsyscall area).
+In particular, it can be used
+to change existing code mappings to be writable.
+.PP
+Whether
+.B PROT_EXEC
+has any effect different from
+.B PROT_READ
+depends on processor architecture, kernel version, and process state.
+If
+.B READ_IMPLIES_EXEC
+is set in the process's personality flags (see
+.BR personality (2)),
+specifying
+.B PROT_READ
+will implicitly add
+.BR PROT_EXEC .
+.PP
+On some hardware architectures (e.g., i386),
+.B PROT_WRITE
+implies
+.BR PROT_READ .
+.PP
+POSIX.1 says that an implementation may permit access
+other than that specified in
+.IR prot ,
+but at a minimum can allow write access only if
+.B PROT_WRITE
+has been set, and must not allow any access if
+.B PROT_NONE
+has been set.
+.PP
+Applications should be careful when mixing use of
+.BR mprotect ()
+and
+.BR pkey_mprotect ().
+On x86, when
+.BR mprotect ()
+is used with
+.I prot
+set to
+.B PROT_EXEC
+a pkey may be allocated and set on the memory implicitly
+by the kernel, but only when the pkey was 0 previously.
+.PP
+On systems that do not support protection keys in hardware,
+.BR pkey_mprotect ()
+may still be used, but
+.I pkey
+must be set to \-1.
+When called this way, the operation of
+.BR pkey_mprotect ()
+is equivalent to
+.BR mprotect ().
+.SH STANDARDS
+.TP
+.BR mprotect ()
+POSIX.1-2008.
+.TP
+.BR pkey_mprotect ()
+Linux.
+.SH HISTORY
+.TP
+.BR mprotect ()
+POSIX.1-2001, SVr4.
+.TP
+.BR pkey_mprotect ()
+Linux 4.9,
+glibc 2.27.
+.SH NOTES
+.SH EXAMPLES
+.\" sigaction.2 refers to this example
+The program below demonstrates the use of
+.BR mprotect ().
+The program allocates four pages of memory, makes the third
+of these pages read-only, and then executes a loop that walks upward
+through the allocated region modifying bytes.
+.PP
+An example of what we might see when running the program is the
+following:
+.PP
+.in +4n
+.EX
+.RB "$" " ./a.out"
+Start of region: 0x804c000
+Got SIGSEGV at address: 0x804e000
+.EE
+.in
+.SS Program source
+\&
+.\" SRC BEGIN (mprotect.c)
+.EX
+#include <malloc.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <unistd.h>
+\&
+#define handle_error(msg) \e
+ do { perror(msg); exit(EXIT_FAILURE); } while (0)
+\&
+static char *buffer;
+\&
+static void
+handler(int sig, siginfo_t *si, void *unused)
+{
+ /* Note: calling printf() from a signal handler is not safe
+ (and should not be done in production programs), since
+ printf() is not async\-signal\-safe; see signal\-safety(7).
+ Nevertheless, we use printf() here as a simple way of
+ showing that the handler was called. */
+\&
+ printf("Got SIGSEGV at address: %p\en", si\->si_addr);
+ exit(EXIT_FAILURE);
+}
+\&
+int
+main(void)
+{
+ int pagesize;
+ struct sigaction sa;
+\&
+ sa.sa_flags = SA_SIGINFO;
+ sigemptyset(&sa.sa_mask);
+ sa.sa_sigaction = handler;
+ if (sigaction(SIGSEGV, &sa, NULL) == \-1)
+ handle_error("sigaction");
+\&
+ pagesize = sysconf(_SC_PAGE_SIZE);
+ if (pagesize == \-1)
+ handle_error("sysconf");
+\&
+ /* Allocate a buffer aligned on a page boundary;
+ initial protection is PROT_READ | PROT_WRITE. */
+\&
+ buffer = memalign(pagesize, 4 * pagesize);
+ if (buffer == NULL)
+ handle_error("memalign");
+\&
+ printf("Start of region: %p\en", buffer);
+\&
+ if (mprotect(buffer + pagesize * 2, pagesize,
+ PROT_READ) == \-1)
+ handle_error("mprotect");
+\&
+ for (char *p = buffer ; ; )
+ *(p++) = \[aq]a\[aq];
+\&
+ printf("Loop completed\en"); /* Should never happen */
+ exit(EXIT_SUCCESS);
+}
+.EE
+.\" SRC END
+.SH SEE ALSO
+.BR mmap (2),
+.BR sysconf (3),
+.BR pkeys (7)
diff --git a/man2/mpx.2 b/man2/mpx.2
new file mode 100644
index 0000000..5d25ea6
--- /dev/null
+++ b/man2/mpx.2
@@ -0,0 +1 @@
+.so man2/unimplemented.2
diff --git a/man2/mq_getsetattr.2 b/man2/mq_getsetattr.2
new file mode 100644
index 0000000..b47e264
--- /dev/null
+++ b/man2/mq_getsetattr.2
@@ -0,0 +1,33 @@
+.\" Copyright (C) 2006 Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.TH mq_getsetattr 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+mq_getsetattr \- get/set message queue attributes
+.SH SYNOPSIS
+.nf
+.BR "#include <mqueue.h>" " /* Definition of " "struct mq_attr" " */"
+.BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */"
+.B #include <unistd.h>
+.PP
+.BI "int syscall(SYS_mq_getsetattr, mqd_t " mqdes ,
+.BI " const struct mq_attr *" newattr ", struct mq_attr *" oldattr );
+.fi
+.SH DESCRIPTION
+Do not use this system call.
+.PP
+This is the low-level system call used to implement
+.BR mq_getattr (3)
+and
+.BR mq_setattr (3).
+For an explanation of how this system call operates,
+see the description of
+.BR mq_setattr (3).
+.SH STANDARDS
+None.
+.SH NOTES
+Never call it unless you are writing a C library!
+.SH SEE ALSO
+.BR mq_getattr (3),
+.BR mq_overview (7)
diff --git a/man2/mq_notify.2 b/man2/mq_notify.2
new file mode 100644
index 0000000..505a45e
--- /dev/null
+++ b/man2/mq_notify.2
@@ -0,0 +1,2 @@
+.so man3/mq_notify.3
+.\" Because mq_notify(3) is layered on a system call of the same name
diff --git a/man2/mq_open.2 b/man2/mq_open.2
new file mode 100644
index 0000000..ce82835
--- /dev/null
+++ b/man2/mq_open.2
@@ -0,0 +1,2 @@
+.so man3/mq_open.3
+.\" Because mq_open(3) is layered on a system call of the same name
diff --git a/man2/mq_timedreceive.2 b/man2/mq_timedreceive.2
new file mode 100644
index 0000000..b4184f8
--- /dev/null
+++ b/man2/mq_timedreceive.2
@@ -0,0 +1,2 @@
+.so man3/mq_timedreceive.3
+.\" Because mq_timedreceive(3) is layered on a system call of the same name
diff --git a/man2/mq_timedsend.2 b/man2/mq_timedsend.2
new file mode 100644
index 0000000..db95863
--- /dev/null
+++ b/man2/mq_timedsend.2
@@ -0,0 +1,2 @@
+.so man3/mq_timedsend.3
+.\" Because mq_timedsend(3) is layered on a system call of the same name
diff --git a/man2/mq_unlink.2 b/man2/mq_unlink.2
new file mode 100644
index 0000000..c5f2768
--- /dev/null
+++ b/man2/mq_unlink.2
@@ -0,0 +1,2 @@
+.so man3/mq_unlink.3
+.\" Because mq_unlink(3) is layered on a system call of the same name
diff --git a/man2/mremap.2 b/man2/mremap.2
new file mode 100644
index 0000000..f2b2b98
--- /dev/null
+++ b/man2/mremap.2
@@ -0,0 +1,352 @@
+.\" Copyright (c) 1996 Tom Bjorkholm <tomb@mydata.se>
+.\"
+.\" SPDX-License-Identifier: GPL-2.0-or-later
+.\"
+.\" 1996-04-11 Tom Bjorkholm <tomb@mydata.se>
+.\" First version written (1.3.86)
+.\" 1996-04-12 Tom Bjorkholm <tomb@mydata.se>
+.\" Update for Linux 1.3.87 and later
+.\" 2005-10-11 mtk: Added NOTES for MREMAP_FIXED; revised EINVAL text.
+.\"
+.TH mremap 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+mremap \- remap a virtual memory address
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#define _GNU_SOURCE" " /* See feature_test_macros(7) */"
+.B #include <sys/mman.h>
+.PP
+.BI "void *mremap(void " old_address [. old_size "], size_t " old_size ,
+.BI " size_t " new_size ", int " flags ", ... /* void *" new_address " */);"
+.fi
+.SH DESCRIPTION
+.BR mremap ()
+expands (or shrinks) an existing memory mapping, potentially
+moving it at the same time (controlled by the \fIflags\fP argument and
+the available virtual address space).
+.PP
+\fIold_address\fP is the old address of the virtual memory block that you
+want to expand (or shrink).
+Note that \fIold_address\fP has to be page
+aligned.
+\fIold_size\fP is the old size of the
+virtual memory block.
+\fInew_size\fP is the requested size of the
+virtual memory block after the resize.
+An optional fifth argument,
+.IR new_address ,
+may be provided; see the description of
+.B MREMAP_FIXED
+below.
+.PP
+If the value of \fIold_size\fP is zero, and \fIold_address\fP refers to
+a shareable mapping (see
+.BR mmap (2)
+.BR MAP_SHARED ),
+then
+.BR mremap ()
+will create a new mapping of the same pages.
+\fInew_size\fP
+will be the size of the new mapping and the location of the new mapping
+may be specified with \fInew_address\fP; see the description of
+.B MREMAP_FIXED
+below.
+If a new mapping is requested via this method, then the
+.B MREMAP_MAYMOVE
+flag must also be specified.
+.PP
+The \fIflags\fP bit-mask argument may be 0, or include the following flags:
+.TP
+.B MREMAP_MAYMOVE
+By default, if there is not sufficient space to expand a mapping
+at its current location, then
+.BR mremap ()
+fails.
+If this flag is specified, then the kernel is permitted to
+relocate the mapping to a new virtual address, if necessary.
+If the mapping is relocated,
+then absolute pointers into the old mapping location
+become invalid (offsets relative to the starting address of
+the mapping should be employed).
+.TP
+.BR MREMAP_FIXED " (since Linux 2.3.31)"
+This flag serves a similar purpose to the
+.B MAP_FIXED
+flag of
+.BR mmap (2).
+If this flag is specified, then
+.BR mremap ()
+accepts a fifth argument,
+.IR "void\ *new_address" ,
+which specifies a page-aligned address to which the mapping must
+be moved.
+Any previous mapping at the address range specified by
+.I new_address
+and
+.I new_size
+is unmapped.
+.IP
+If
+.B MREMAP_FIXED
+is specified, then
+.B MREMAP_MAYMOVE
+must also be specified.
+.TP
+.BR MREMAP_DONTUNMAP " (since Linux 5.7)"
+.\" commit e346b3813067d4b17383f975f197a9aa28a3b077
+This flag, which must be used in conjunction with
+.BR MREMAP_MAYMOVE ,
+remaps a mapping to a new address but does not unmap the mapping at
+.IR old_address .
+.IP
+The
+.B MREMAP_DONTUNMAP
+flag can be used only with private anonymous mappings
+(see the description of
+.B MAP_PRIVATE
+and
+.B MAP_ANONYMOUS
+in
+.BR mmap (2)).
+.IP
+After completion,
+any access to the range specified by
+.I old_address
+and
+.I old_size
+will result in a page fault.
+The page fault will be handled by a
+.BR userfaultfd (2)
+handler
+if the address is in a range previously registered with
+.BR userfaultfd (2).
+Otherwise, the kernel allocates a zero-filled page to handle the fault.
+.IP
+The
+.B MREMAP_DONTUNMAP
+flag may be used to atomically move a mapping while leaving the source
+mapped.
+See NOTES for some possible applications of
+.BR MREMAP_DONTUNMAP .
+.PP
+If the memory segment specified by
+.I old_address
+and
+.I old_size
+is locked (using
+.BR mlock (2)
+or similar), then this lock is maintained when the segment is
+resized and/or relocated.
+As a consequence, the amount of memory locked by the process may change.
+.SH RETURN VALUE
+On success
+.BR mremap ()
+returns a pointer to the new virtual memory area.
+On error, the value
+.B MAP_FAILED
+(that is, \fI(void\ *)\ \-1\fP) is returned,
+and \fIerrno\fP is set to indicate the error.
+.SH ERRORS
+.TP
+.B EAGAIN
+The caller tried to expand a memory segment that is locked,
+but this was not possible without exceeding the
+.B RLIMIT_MEMLOCK
+resource limit.
+.TP
+.B EFAULT
+Some address in the range
+\fIold_address\fP to \fIold_address\fP+\fIold_size\fP is an invalid
+virtual memory address for this process.
+You can also get
+.B EFAULT
+even if there exist mappings that cover the
+whole address space requested, but those mappings are of different types.
+.TP
+.B EINVAL
+An invalid argument was given.
+Possible causes are:
+.RS
+.IP \[bu] 3
+\fIold_address\fP was not
+page aligned;
+.IP \[bu]
+a value other than
+.B MREMAP_MAYMOVE
+or
+.B MREMAP_FIXED
+or
+.B MREMAP_DONTUNMAP
+was specified in
+.IR flags ;
+.IP \[bu]
+.I new_size
+was zero;
+.IP \[bu]
+.I new_size
+or
+.I new_address
+was invalid;
+.IP \[bu]
+the new address range specified by
+.I new_address
+and
+.I new_size
+overlapped the old address range specified by
+.I old_address
+and
+.IR old_size ;
+.IP \[bu]
+.B MREMAP_FIXED
+or
+.B MREMAP_DONTUNMAP
+was specified without also specifying
+.BR MREMAP_MAYMOVE ;
+.IP \[bu]
+.B MREMAP_DONTUNMAP
+was specified, but one or more pages in the range specified by
+.I old_address
+and
+.I old_size
+were not private anonymous;
+.IP \[bu]
+.B MREMAP_DONTUNMAP
+was specified and
+.I old_size
+was not equal to
+.IR new_size ;
+.IP \[bu]
+\fIold_size\fP was zero and \fIold_address\fP did not refer to a
+shareable mapping (but see BUGS);
+.IP \[bu]
+\fIold_size\fP was zero and the
+.B MREMAP_MAYMOVE
+flag was not specified.
+.RE
+.TP
+.B ENOMEM
+Not enough memory was available to complete the operation.
+Possible causes are:
+.RS
+.IP \[bu] 3
+The memory area cannot be expanded at the current virtual address, and the
+.B MREMAP_MAYMOVE
+flag is not set in \fIflags\fP.
+Or, there is not enough (virtual) memory available.
+.IP \[bu]
+.B MREMAP_DONTUNMAP
+was used causing a new mapping to be created that would exceed the
+(virtual) memory available.
+Or, it would exceed the maximum number of allowed mappings.
+.RE
+.SH STANDARDS
+Linux.
+.SH HISTORY
+.\" 4.2BSD had a (never actually implemented)
+.\" .BR mremap (2)
+.\" call with completely different semantics.
+.\" .PP
+Prior to glibc 2.4, glibc did not expose the definition of
+.BR MREMAP_FIXED ,
+and the prototype for
+.BR mremap ()
+did not allow for the
+.I new_address
+argument.
+.SH NOTES
+.BR mremap ()
+changes the
+mapping between virtual addresses and memory pages.
+This can be used to implement a very efficient
+.BR realloc (3).
+.PP
+In Linux, memory is divided into pages.
+A process has (one or)
+several linear virtual memory segments.
+Each virtual memory segment has one
+or more mappings to real memory pages (in the page table).
+Each virtual memory segment has its own
+protection (access rights), which may cause
+a segmentation violation
+.RB ( SIGSEGV )
+if the memory is accessed incorrectly (e.g.,
+writing to a read-only segment).
+Accessing virtual memory outside of the
+segments will also cause a segmentation violation.
+.PP
+If
+.BR mremap ()
+is used to move or expand an area locked with
+.BR mlock (2)
+or equivalent, the
+.BR mremap ()
+call will make a best effort to populate the new area but will not fail
+with
+.B ENOMEM
+if the area cannot be populated.
+.\"
+.SS MREMAP_DONTUNMAP use cases
+Possible applications for
+.B MREMAP_DONTUNMAP
+include:
+.IP \[bu] 3
+Non-cooperative
+.BR userfaultfd (2):
+an application can yank out a virtual address range using
+.B MREMAP_DONTUNMAP
+and then employ a
+.BR userfaultfd (2)
+handler to handle the page faults that subsequently occur
+as other threads in the process touch pages in the yanked range.
+.IP \[bu]
+Garbage collection:
+.B MREMAP_DONTUNMAP
+can be used in conjunction with
+.BR userfaultfd (2)
+to implement garbage collection algorithms (e.g., in a Java virtual machine).
+Such an implementation can be cheaper (and simpler)
+than conventional garbage collection techniques that involve
+marking pages with protection
+.B PROT_NONE
+in conjunction with the use of a
+.B SIGSEGV
+handler to catch accesses to those pages.
+.SH BUGS
+Before Linux 4.14,
+if
+.I old_size
+was zero and the mapping referred to by
+.I old_address
+was a private mapping
+.RB ( mmap "(2) " MAP_PRIVATE ),
+.BR mremap ()
+created a new private mapping unrelated to the original mapping.
+This behavior was unintended
+and probably unexpected in user-space applications
+(since the intention of
+.BR mremap ()
+is to create a new mapping based on the original mapping).
+Since Linux 4.14,
+.\" commit dba58d3b8c5045ad89c1c95d33d01451e3964db7
+.BR mremap ()
+fails with the error
+.B EINVAL
+in this scenario.
+.SH SEE ALSO
+.BR brk (2),
+.BR getpagesize (2),
+.BR getrlimit (2),
+.BR mlock (2),
+.BR mmap (2),
+.BR sbrk (2),
+.BR malloc (3),
+.BR realloc (3)
+.PP
+Your favorite textbook on operating systems
+for more information on paged memory
+(e.g., \fIModern Operating Systems\fP by Andrew S.\& Tanenbaum,
+\fIInside Linux\fP by Randolph Bentson,
+\fIThe Design of the UNIX Operating System\fP by Maurice J.\& Bach)
diff --git a/man2/msgctl.2 b/man2/msgctl.2
new file mode 100644
index 0000000..b905b0f
--- /dev/null
+++ b/man2/msgctl.2
@@ -0,0 +1,424 @@
+'\" t
+.\" Copyright 1993 Giorgio Ciucci (giorgio@crcc.it)
+.\" and Copyright 2004, 2005 Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified Tue Oct 22 08:11:14 EDT 1996 by Eric S. Raymond <esr@thyrsus.com>
+.\" Modified Sun Feb 18 01:59:29 2001 by Andries E. Brouwer <aeb@cwi.nl>
+.\" Modified, 27 May 2004, Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Added notes on CAP_IPC_OWNER requirement
+.\" Modified, 17 Jun 2004, Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Added notes on CAP_SYS_ADMIN requirement for IPC_SET and IPC_RMID
+.\" Modified, 11 Nov 2004, Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Language and formatting clean-ups
+.\" Added msqid_ds and ipc_perm structure definitions
+.\" 2005-08-02, mtk: Added IPC_INFO, MSG_INFO, MSG_STAT descriptions
+.\" 2018-03-20, dbueso: Added MSG_STAT_ANY description.
+.\"
+.TH msgctl 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+msgctl \- System V message control operations
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/msg.h>
+.PP
+.BI "int msgctl(int " msqid ", int " cmd ", struct msqid_ds *" buf );
+.fi
+.SH DESCRIPTION
+.BR msgctl ()
+performs the control operation specified by
+.I cmd
+on the System\ V message queue with identifier
+.IR msqid .
+.PP
+The
+.I msqid_ds
+data structure is defined in \fI<sys/msg.h>\fP as follows:
+.PP
+.in +4n
+.EX
+struct msqid_ds {
+ struct ipc_perm msg_perm; /* Ownership and permissions */
+ time_t msg_stime; /* Time of last msgsnd(2) */
+ time_t msg_rtime; /* Time of last msgrcv(2) */
+ time_t msg_ctime; /* Time of creation or last
+ modification by msgctl() */
+ unsigned long msg_cbytes; /* # of bytes in queue */
+ msgqnum_t msg_qnum; /* # of messages in queue */
+ msglen_t msg_qbytes; /* Maximum # of bytes in queue */
+ pid_t msg_lspid; /* PID of last msgsnd(2) */
+ pid_t msg_lrpid; /* PID of last msgrcv(2) */
+};
+.EE
+.in
+.PP
+The fields of the
+.I msqid_ds
+structure are as follows:
+.TP 11
+.I msg_perm
+This is an
+.I ipc_perm
+structure (see below) that specifies the access permissions on the message
+queue.
+.TP
+.I msg_stime
+Time of the last
+.BR msgsnd (2)
+system call.
+.TP
+.I msg_rtime
+Time of the last
+.BR msgrcv (2)
+system call.
+.TP
+.I msg_ctime
+Time of creation of queue or time of last
+.BR msgctl ()
+.B IPC_SET
+operation.
+.TP
+.I msg_cbytes
+Number of bytes in all messages currently on the message queue.
+This is a nonstandard Linux extension that is not specified in POSIX.
+.TP
+.I msg_qnum
+Number of messages currently on the message queue.
+.TP
+.I msg_qbytes
+Maximum number of bytes of message text allowed on the message
+queue.
+.TP
+.I msg_lspid
+ID of the process that performed the last
+.BR msgsnd (2)
+system call.
+.TP
+.I msg_lrpid
+ID of the process that performed the last
+.BR msgrcv (2)
+system call.
+.PP
+The
+.I ipc_perm
+structure is defined as follows
+(the highlighted fields are settable using
+.BR IPC_SET ):
+.PP
+.in +4n
+.EX
+struct ipc_perm {
+ key_t __key; /* Key supplied to msgget(2) */
+ uid_t \fBuid\fP; /* Effective UID of owner */
+ gid_t \fBgid\fP; /* Effective GID of owner */
+ uid_t cuid; /* Effective UID of creator */
+ gid_t cgid; /* Effective GID of creator */
+ unsigned short \fBmode\fP; /* Permissions */
+ unsigned short __seq; /* Sequence number */
+};
+.EE
+.in
+.PP
+The least significant 9 bits of the
+.I mode
+field of the
+.I ipc_perm
+structure define the access permissions for the message queue.
+The permission bits are as follows:
+.TS
+l l.
+0400 Read by user
+0200 Write by user
+0040 Read by group
+0020 Write by group
+0004 Read by others
+0002 Write by others
+.TE
+.PP
+Bits 0100, 0010, and 0001 (the execute bits) are unused by the system.
+.PP
+Valid values for
+.I cmd
+are:
+.TP
+.B IPC_STAT
+Copy information from the kernel data structure associated with
+.I msqid
+into the
+.I msqid_ds
+structure pointed to by
+.IR buf .
+The caller must have read permission on the message queue.
+.TP
+.B IPC_SET
+Write the values of some members of the
+.I msqid_ds
+structure pointed to by
+.I buf
+to the kernel data structure associated with this message queue,
+updating also its
+.I msg_ctime
+member.
+.IP
+The following members of the structure are updated:
+.IR msg_qbytes ,
+.IR msg_perm.uid ,
+.IR msg_perm.gid ,
+and (the least significant 9 bits of)
+.IR msg_perm.mode .
+.IP
+The effective UID of the calling process must match the owner
+.RI ( msg_perm.uid )
+or creator
+.RI ( msg_perm.cuid )
+of the message queue, or the caller must be privileged.
+Appropriate privilege (Linux: the
+.B CAP_SYS_RESOURCE
+capability) is required to raise the
+.I msg_qbytes
+value beyond the system parameter
+.BR MSGMNB .
+.TP
+.B IPC_RMID
+Immediately remove the message queue,
+awakening all waiting reader and writer processes (with an error
+return and
+.I errno
+set to
+.BR EIDRM ).
+The calling process must have appropriate privileges
+or its effective user ID must be either that of the creator or owner
+of the message queue.
+The third argument to
+.BR msgctl ()
+is ignored in this case.
+.TP
+.BR IPC_INFO " (Linux-specific)"
+Return information about system-wide message queue limits and
+parameters in the structure pointed to by
+.IR buf .
+This structure is of type
+.I msginfo
+(thus, a cast is required),
+defined in
+.I <sys/msg.h>
+if the
+.B _GNU_SOURCE
+feature test macro is defined:
+.IP
+.in +4n
+.EX
+struct msginfo {
+ int msgpool; /* Size in kibibytes of buffer pool
+ used to hold message data;
+ unused within kernel */
+ int msgmap; /* Maximum number of entries in message
+ map; unused within kernel */
+ int msgmax; /* Maximum number of bytes that can be
+ written in a single message */
+ int msgmnb; /* Maximum number of bytes that can be
+ written to queue; used to initialize
+ msg_qbytes during queue creation
+ (msgget(2)) */
+ int msgmni; /* Maximum number of message queues */
+ int msgssz; /* Message segment size;
+ unused within kernel */
+ int msgtql; /* Maximum number of messages on all queues
+ in system; unused within kernel */
+ unsigned short msgseg;
+ /* Maximum number of segments;
+ unused within kernel */
+};
+.EE
+.in
+.IP
+The
+.IR msgmni ,
+.IR msgmax ,
+and
+.I msgmnb
+settings can be changed via
+.I /proc
+files of the same name; see
+.BR proc (5)
+for details.
+.TP
+.BR MSG_INFO " (Linux-specific)"
+Return a
+.I msginfo
+structure containing the same information as for
+.BR IPC_INFO ,
+except that the following fields are returned with information
+about system resources consumed by message queues: the
+.I msgpool
+field returns the number of message queues that currently exist
+on the system; the
+.I msgmap
+field returns the total number of messages in all queues
+on the system; and the
+.I msgtql
+field returns the total number of bytes in all messages
+in all queues on the system.
+.TP
+.BR MSG_STAT " (Linux-specific)"
+Return a
+.I msqid_ds
+structure as for
+.BR IPC_STAT .
+However, the
+.I msqid
+argument is not a queue identifier, but instead an index into
+the kernel's internal array that maintains information about
+all message queues on the system.
+.TP
+.BR MSG_STAT_ANY " (Linux-specific, since Linux 4.17)"
+Return a
+.I msqid_ds
+structure as for
+.BR MSG_STAT .
+However,
+.I msg_perm.mode
+is not checked for read access for
+.IR msqid ,
+meaning that any user can employ this operation (just as any user may read
+.I /proc/sysvipc/msg
+to obtain the same information).
+.SH RETURN VALUE
+On success,
+.BR IPC_STAT ,
+.BR IPC_SET ,
+and
+.B IPC_RMID
+return 0.
+A successful
+.B IPC_INFO
+or
+.B MSG_INFO
+operation returns the index of the highest used entry in the
+kernel's internal array recording information about all
+message queues.
+(This information can be used with repeated
+.B MSG_STAT
+or
+.B MSG_STAT_ANY
+operations to obtain information about all queues on the system.)
+A successful
+.B MSG_STAT
+or
+.B MSG_STAT_ANY
+operation returns the identifier of the queue whose index was given in
+.IR msqid .
+.PP
+On failure, \-1 is returned and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EACCES
+The argument
+.I cmd
+is equal to
+.B IPC_STAT
+or
+.BR MSG_STAT ,
+but the calling process does not have read permission on the message queue
+.IR msqid ,
+and does not have the
+.B CAP_IPC_OWNER
+capability in the user namespace that governs its IPC namespace.
+.TP
+.B EFAULT
+The argument
+.I cmd
+has the value
+.B IPC_SET
+or
+.BR IPC_STAT ,
+but the address pointed to by
+.I buf
+isn't accessible.
+.TP
+.B EIDRM
+The message queue was removed.
+.TP
+.B EINVAL
+Invalid value for
+.I cmd
+or
+.IR msqid .
+Or: for a
+.B MSG_STAT
+operation, the index value specified in
+.I msqid
+referred to an array slot that is currently unused.
+.TP
+.B EPERM
+The argument
+.I cmd
+has the value
+.B IPC_SET
+or
+.BR IPC_RMID ,
+but the effective user ID of the calling process is not the creator
+(as found in
+.IR msg_perm.cuid )
+or the owner
+(as found in
+.IR msg_perm.uid )
+of the message queue,
+and the caller is not privileged (Linux: does not have the
+.B CAP_SYS_ADMIN
+capability).
+.TP
+.B EPERM
+An attempt
+.RB ( IPC_SET )
+was made to increase
+.I msg_qbytes
+beyond the system parameter
+.BR MSGMNB ,
+but the caller is not privileged (Linux: does not have the
+.B CAP_SYS_RESOURCE
+capability).
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+POSIX.1-2001, SVr4.
+.\" SVID does not document the EIDRM error condition.
+.PP
+Various fields in the \fIstruct msqid_ds\fP were
+typed as
+.I short
+under Linux 2.2
+and have become
+.I long
+under Linux 2.4.
+To take advantage of this,
+a recompilation under glibc-2.1.91 or later should suffice.
+(The kernel distinguishes old and new calls by an
+.B IPC_64
+flag in
+.IR cmd .)
+.SH NOTES
+The
+.BR IPC_INFO ,
+.BR MSG_STAT ,
+and
+.B MSG_INFO
+operations are used by the
+.BR ipcs (1)
+program to provide information on allocated resources.
+In the future these may be modified or moved to a
+.I /proc
+filesystem interface.
+.SH SEE ALSO
+.BR msgget (2),
+.BR msgrcv (2),
+.BR msgsnd (2),
+.BR capabilities (7),
+.BR mq_overview (7),
+.BR sysvipc (7)
diff --git a/man2/msgget.2 b/man2/msgget.2
new file mode 100644
index 0000000..0774f49
--- /dev/null
+++ b/man2/msgget.2
@@ -0,0 +1,217 @@
+.\" Copyright 1993 Giorgio Ciucci <giorgio@crcc.it>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Added correction due to Nick Duffek <nsd@bbc.com>, aeb, 960426
+.\" Modified Wed Nov 6 04:00:31 1996 by Eric S. Raymond <esr@thyrsus.com>
+.\" Modified, 8 Jan 2003, Michael Kerrisk, <mtk.manpages@gmail.com>
+.\" Removed EIDRM from errors - that can't happen...
+.\" Modified, 27 May 2004, Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Added notes on capability requirements
+.\" Modified, 11 Nov 2004, Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Language and formatting clean-ups
+.\" Added notes on /proc files
+.\"
+.TH msgget 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+msgget \- get a System V message queue identifier
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/msg.h>
+.PP
+.BI "int msgget(key_t " key ", int " msgflg );
+.fi
+.SH DESCRIPTION
+The
+.BR msgget ()
+system call returns the System\ V message queue identifier associated
+with the value of the
+.I key
+argument.
+It may be used either to obtain the identifier of a previously created
+message queue (when
+.I msgflg
+is zero and
+.I key
+does not have the value
+.BR IPC_PRIVATE ),
+or to create a new message queue.
+.PP
+A new message queue is created if
+.I key
+has the value
+.B IPC_PRIVATE
+or
+.I key
+isn't
+.BR IPC_PRIVATE ,
+no message queue with the given key
+.I key
+exists, and
+.B IPC_CREAT
+is specified in
+.IR msgflg .
+.PP
+If
+.I msgflg
+specifies both
+.B IPC_CREAT
+and
+.B IPC_EXCL
+and a message queue already exists for
+.IR key ,
+then
+.BR msgget ()
+fails with
+.I errno
+set to
+.BR EEXIST .
+(This is analogous to the effect of the combination
+.B O_CREAT | O_EXCL
+for
+.BR open (2).)
+.PP
+Upon creation, the least significant bits of the argument
+.I msgflg
+define the permissions of the message queue.
+These permission bits have the same format and semantics
+as the permissions specified for the
+.I mode
+argument of
+.BR open (2).
+(The execute permissions are not used.)
+.PP
+If a new message queue is created,
+then its associated data structure
+.I msqid_ds
+(see
+.BR msgctl (2))
+is initialized as follows:
+.IP \[bu] 3
+.I msg_perm.cuid
+and
+.I msg_perm.uid
+are set to the effective user ID of the calling process.
+.IP \[bu]
+.I msg_perm.cgid
+and
+.I msg_perm.gid
+are set to the effective group ID of the calling process.
+.IP \[bu]
+The least significant 9 bits of
+.I msg_perm.mode
+are set to the least significant 9 bits of
+.IR msgflg .
+.IP \[bu]
+.IR msg_qnum ,
+.IR msg_lspid ,
+.IR msg_lrpid ,
+.IR msg_stime ,
+and
+.I msg_rtime
+are set to 0.
+.IP \[bu]
+.I msg_ctime
+is set to the current time.
+.IP \[bu]
+.I msg_qbytes
+is set to the system limit
+.BR MSGMNB .
+.PP
+If the message queue already exists, the permissions are
+verified, and a check is made to see if it is marked for
+destruction.
+.SH RETURN VALUE
+On success,
+.BR msgget ()
+returns the message queue identifier (a nonnegative integer).
+On failure, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EACCES
+A message queue exists for
+.IR key ,
+but the calling process does not have permission to access the queue,
+and does not have the
+.B CAP_IPC_OWNER
+capability in the user namespace that governs its IPC namespace.
+.TP
+.B EEXIST
+.B IPC_CREAT
+and
+.B IPC_EXCL
+were specified in
+.IR msgflg ,
+but a message queue already exists for
+.IR key .
+.TP
+.B ENOENT
+No message queue exists for
+.I key
+and
+.I msgflg
+did not specify
+.BR IPC_CREAT .
+.TP
+.B ENOMEM
+A message queue has to be created but the system does not have enough
+memory for the new data structure.
+.TP
+.B ENOSPC
+A message queue has to be created but the system limit for the maximum
+number of message queues
+.RB ( MSGMNI )
+would be exceeded.
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+POSIX.1-2001, SVr4.
+.SS Linux
+Until Linux 2.3.20, Linux would return
+.B EIDRM
+for a
+.BR msgget ()
+on a message queue scheduled for deletion.
+.SH NOTES
+.B IPC_PRIVATE
+isn't a flag field but a
+.I key_t
+type.
+If this special value is used for
+.IR key ,
+the system call ignores everything but the least significant 9 bits of
+.I msgflg
+and creates a new message queue (on success).
+.PP
+The following is a system limit on message queue resources affecting a
+.BR msgget ()
+call:
+.TP
+.B MSGMNI
+System-wide limit on the number of message queues.
+Before Linux 3.19,
+.\" commit 0050ee059f7fc86b1df2527aaa14ed5dc72f9973
+the default value for this limit was calculated using a formula
+based on available system memory.
+Since Linux 3.19, the default value is 32,000.
+On Linux, this limit can be read and modified via
+.IR /proc/sys/kernel/msgmni .
+.SH BUGS
+The name choice
+.B IPC_PRIVATE
+was perhaps unfortunate;
+.B IPC_NEW
+would more clearly show its function.
+.SH SEE ALSO
+.BR msgctl (2),
+.BR msgrcv (2),
+.BR msgsnd (2),
+.BR ftok (3),
+.BR capabilities (7),
+.BR mq_overview (7),
+.BR sysvipc (7)
diff --git a/man2/msgop.2 b/man2/msgop.2
new file mode 100644
index 0000000..381875e
--- /dev/null
+++ b/man2/msgop.2
@@ -0,0 +1,684 @@
+.\" Copyright 1993 Giorgio Ciucci <giorgio@crcc.it>
+.\" and Copyright 2015 Bill Pemberton <wfp5p@worldbroken.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified Tue Oct 22 16:40:11 1996 by Eric S. Raymond <esr@thyrsus.com>
+.\" Modified Mon Jul 10 21:09:59 2000 by aeb
+.\" Modified 1 Jun 2002, Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Language clean-ups.
+.\" Enhanced and corrected information on msg_qbytes, MSGMNB and MSGMAX
+.\" Added note on restart behavior of msgsnd() and msgrcv()
+.\" Formatting clean-ups (argument and field names marked as .I
+.\" instead of .B)
+.\" Modified, 27 May 2004, Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Added notes on capability requirements
+.\" Modified, 11 Nov 2004, Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Language and formatting clean-ups
+.\" Added notes on /proc files
+.\"
+.TH MSGOP 2 2023-05-03 "Linux man-pages 6.05.01"
+.SH NAME
+msgrcv, msgsnd \- System V message queue operations
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/msg.h>
+.PP
+.BI "int msgsnd(int " msqid ", const void " msgp [. msgsz "], size_t " msgsz ,
+.BI " int " msgflg );
+.PP
+.BI "ssize_t msgrcv(int " msqid ", void " msgp [. msgsz "], size_t " msgsz \
+", long " msgtyp ,
+.BI " int " msgflg );
+.fi
+.SH DESCRIPTION
+The
+.BR msgsnd ()
+and
+.BR msgrcv ()
+system calls are used to send messages to,
+and receive messages from, a System\ V message queue.
+The calling process must have write permission on the message queue
+in order to send a message, and read permission to receive a message.
+.PP
+The
+.I msgp
+argument is a pointer to a caller-defined structure
+of the following general form:
+.PP
+.in +4n
+.EX
+struct msgbuf {
+ long mtype; /* message type, must be > 0 */
+ char mtext[1]; /* message data */
+};
+.EE
+.in
+.PP
+The
+.I mtext
+field is an array (or other structure) whose size is specified by
+.IR msgsz ,
+a nonnegative integer value.
+Messages of zero length (i.e., no
+.I mtext
+field) are permitted.
+The
+.I mtype
+field must have a strictly positive integer value.
+This value can be
+used by the receiving process for message selection
+(see the description of
+.BR msgrcv ()
+below).
+.SS msgsnd()
+The
+.BR msgsnd ()
+system call appends a copy of the message pointed to by
+.I msgp
+to the message queue whose identifier is specified
+by
+.IR msqid .
+.PP
+If sufficient space is available in the queue,
+.BR msgsnd ()
+succeeds immediately.
+The queue capacity is governed by the
+.I msg_qbytes
+field in the associated data structure for the message queue.
+During queue creation this field is initialized to
+.B MSGMNB
+bytes, but this limit can be modified using
+.BR msgctl (2).
+A message queue is considered to be full if either of the following
+conditions is true:
+.IP \[bu] 3
+Adding a new message to the queue would cause the total number of bytes
+in the queue to exceed the queue's maximum size (the
+.I msg_qbytes
+field).
+.IP \[bu]
+Adding another message to the queue would cause the total number of messages
+in the queue to exceed the queue's maximum size (the
+.I msg_qbytes
+field).
+This check is necessary to prevent an unlimited number of zero-length
+messages being placed on the queue.
+Although such messages contain no data,
+they nevertheless consume (locked) kernel memory.
+.PP
+If insufficient space is available in the queue, then the default
+behavior of
+.BR msgsnd ()
+is to block until space becomes available.
+If
+.B IPC_NOWAIT
+is specified in
+.IR msgflg ,
+then the call instead fails with the error
+.BR EAGAIN .
+.PP
+A blocked
+.BR msgsnd ()
+call may also fail if:
+.IP \[bu] 3
+the queue is removed,
+in which case the system call fails with
+.I errno
+set to
+.BR EIDRM ;
+or
+.IP \[bu]
+a signal is caught, in which case the system call fails
+with
+.I errno
+set to
+.BR EINTR "; see"
+.BR signal (7).
+.RB ( msgsnd ()
+is never automatically restarted after being interrupted by a
+signal handler, regardless of the setting of the
+.B SA_RESTART
+flag when establishing a signal handler.)
+.PP
+Upon successful completion the message queue data structure is updated
+as follows:
+.IP \[bu] 3
+.I msg_lspid
+is set to the process ID of the calling process.
+.IP \[bu]
+.I msg_qnum
+is incremented by 1.
+.IP \[bu]
+.I msg_stime
+is set to the current time.
+.SS msgrcv()
+The
+.BR msgrcv ()
+system call removes a message from the queue specified by
+.I msqid
+and places it in the buffer
+pointed to by
+.IR msgp .
+.PP
+The argument
+.I msgsz
+specifies the maximum size in bytes for the member
+.I mtext
+of the structure pointed to by the
+.I msgp
+argument.
+If the message text has length greater than
+.IR msgsz ,
+then the behavior depends on whether
+.B MSG_NOERROR
+is specified in
+.IR msgflg .
+If
+.B MSG_NOERROR
+is specified, then
+the message text will be truncated (and the truncated part will be
+lost); if
+.B MSG_NOERROR
+is not specified, then
+the message isn't removed from the queue and
+the system call fails returning \-1 with
+.I errno
+set to
+.BR E2BIG .
+.PP
+Unless
+.B MSG_COPY
+is specified in
+.I msgflg
+(see below),
+the
+.I msgtyp
+argument specifies the type of message requested, as follows:
+.IP \[bu] 3
+If
+.I msgtyp
+is 0,
+then the first message in the queue is read.
+.IP \[bu]
+If
+.I msgtyp
+is greater than 0,
+then the first message in the queue of type
+.I msgtyp
+is read, unless
+.B MSG_EXCEPT
+was specified in
+.IR msgflg ,
+in which case
+the first message in the queue of type not equal to
+.I msgtyp
+will be read.
+.IP \[bu]
+If
+.I msgtyp
+is less than 0,
+then the first message in the queue with the lowest type less than or
+equal to the absolute value of
+.I msgtyp
+will be read.
+.PP
+The
+.I msgflg
+argument is a bit mask constructed by ORing together zero or more
+of the following flags:
+.TP
+.B IPC_NOWAIT
+Return immediately if no message of the requested type is in the queue.
+The system call fails with
+.I errno
+set to
+.BR ENOMSG .
+.TP
+.BR MSG_COPY " (since Linux 3.8)"
+.\" commit 4a674f34ba04a002244edaf891b5da7fc1473ae8
+Nondestructively fetch a copy of the message at the ordinal position
+in the queue specified by
+.I msgtyp
+(messages are considered to be numbered starting at 0).
+.IP
+This flag must be specified in conjunction with
+.BR IPC_NOWAIT ,
+with the result that, if there is no message available at the given position,
+the call fails immediately with the error
+.BR ENOMSG .
+Because they alter the meaning of
+.I msgtyp
+in orthogonal ways,
+.B MSG_COPY
+and
+.B MSG_EXCEPT
+may not both be specified in
+.IR msgflg .
+.IP
+The
+.B MSG_COPY
+flag was added for the implementation of
+the kernel checkpoint-restore facility and
+is available only if the kernel was built with the
+.B CONFIG_CHECKPOINT_RESTORE
+option.
+.TP
+.B MSG_EXCEPT
+Used with
+.I msgtyp
+greater than 0
+to read the first message in the queue with message type that differs
+from
+.IR msgtyp .
+.TP
+.B MSG_NOERROR
+Truncate the message text if it is longer than
+.I msgsz
+bytes.
+.PP
+If no message of the requested type is available and
+.B IPC_NOWAIT
+isn't specified in
+.IR msgflg ,
+the calling process is blocked until one of the following conditions occurs:
+.IP \[bu] 3
+A message of the desired type is placed in the queue.
+.IP \[bu]
+The message queue is removed from the system.
+In this case, the system call fails with
+.I errno
+set to
+.BR EIDRM .
+.IP \[bu]
+The calling process catches a signal.
+In this case, the system call fails with
+.I errno
+set to
+.BR EINTR .
+.RB ( msgrcv ()
+is never automatically restarted after being interrupted by a
+signal handler, regardless of the setting of the
+.B SA_RESTART
+flag when establishing a signal handler.)
+.PP
+Upon successful completion the message queue data structure is updated
+as follows:
+.IP \[bu] 3
+.I msg_lrpid
+is set to the process ID of the calling process.
+.IP \[bu]
+.I msg_qnum
+is decremented by 1.
+.IP \[bu]
+.I msg_rtime
+is set to the current time.
+.SH RETURN VALUE
+On success,
+.BR msgsnd ()
+returns 0
+and
+.BR msgrcv ()
+returns the number of bytes actually copied into the
+.I mtext
+array.
+On failure, both functions return \-1, and set
+.I errno
+to indicate the error.
+.SH ERRORS
+.BR msgsnd ()
+can fail with the following errors:
+.TP
+.B EACCES
+The calling process does not have write permission on the message queue,
+and does not have the
+.B CAP_IPC_OWNER
+capability in the user namespace that governs its IPC namespace.
+.TP
+.B EAGAIN
+The message can't be sent due to the
+.I msg_qbytes
+limit for the queue and
+.B IPC_NOWAIT
+was specified in
+.IR msgflg .
+.TP
+.B EFAULT
+The address pointed to by
+.I msgp
+isn't accessible.
+.TP
+.B EIDRM
+The message queue was removed.
+.TP
+.B EINTR
+Sleeping on a full message queue condition, the process caught a signal.
+.TP
+.B EINVAL
+Invalid
+.I msqid
+value, or nonpositive
+.I mtype
+value, or
+invalid
+.I msgsz
+value (less than 0 or greater than the system value
+.BR MSGMAX ).
+.TP
+.B ENOMEM
+The system does not have enough memory to make a copy of the
+message pointed to by
+.IR msgp .
+.PP
+.BR msgrcv ()
+can fail with the following errors:
+.TP
+.B E2BIG
+The message text length is greater than
+.I msgsz
+and
+.B MSG_NOERROR
+isn't specified in
+.IR msgflg .
+.TP
+.B EACCES
+The calling process does not have read permission on the message queue,
+and does not have the
+.B CAP_IPC_OWNER
+capability in the user namespace that governs its IPC namespace.
+.TP
+.B EFAULT
+The address pointed to by
+.I msgp
+isn't accessible.
+.TP
+.B EIDRM
+While the process was sleeping to receive a message,
+the message queue was removed.
+.TP
+.B EINTR
+While the process was sleeping to receive a message,
+the process caught a signal; see
+.BR signal (7).
+.TP
+.B EINVAL
+.I msqid
+was invalid, or
+.I msgsz
+was less than 0.
+.TP
+.BR EINVAL " (since Linux 3.14)"
+.I msgflg
+specified
+.BR MSG_COPY ,
+but not
+.BR IPC_NOWAIT .
+.TP
+.BR EINVAL " (since Linux 3.14)"
+.I msgflg
+specified both
+.B MSG_COPY
+and
+.BR MSG_EXCEPT .
+.TP
+.B ENOMSG
+.B IPC_NOWAIT
+was specified in
+.I msgflg
+and no message of the requested type existed on the message queue.
+.TP
+.B ENOMSG
+.B IPC_NOWAIT
+and
+.B MSG_COPY
+were specified in
+.I msgflg
+and the queue contains fewer than
+.I msgtyp
+messages.
+.TP
+.BR ENOSYS " (since Linux 3.8)"
+Both
+.B MSG_COPY
+and
+.B IPC_NOWAIT
+were specified in
+.IR msgflg ,
+and this kernel was configured without
+.BR CONFIG_CHECKPOINT_RESTORE .
+.SH STANDARDS
+POSIX.1-2008.
+.PP
+The
+.B MSG_EXCEPT
+and
+.B MSG_COPY
+flags are Linux-specific;
+their definitions can be obtained by defining the
+.B _GNU_SOURCE
+.\" MSG_COPY since glibc 2.18
+feature test macro.
+.SH HISTORY
+POSIX.1-2001, SVr4.
+.PP
+The
+.I msgp
+argument is declared as \fIstruct msgbuf\ *\fP in
+glibc 2.0 and 2.1.
+It is declared as \fIvoid\ *\fP
+in glibc 2.2 and later, as required by SUSv2 and SUSv3.
+.SH NOTES
+The following limits on message queue resources affect the
+.BR msgsnd ()
+call:
+.TP
+.B MSGMAX
+Maximum size of a message text, in bytes (default value: 8192 bytes).
+On Linux, this limit can be read and modified via
+.IR /proc/sys/kernel/msgmax .
+.TP
+.B MSGMNB
+Maximum number of bytes that can be held in a message queue
+(default value: 16384 bytes).
+On Linux, this limit can be read and modified via
+.IR /proc/sys/kernel/msgmnb .
+A privileged process
+(Linux: a process with the
+.B CAP_SYS_RESOURCE
+capability)
+can increase the size of a message queue beyond
+.B MSGMNB
+using the
+.BR msgctl (2)
+.B IPC_SET
+operation.
+.PP
+The implementation has no intrinsic system-wide limits on the
+number of message headers
+.RB ( MSGTQL )
+and the number of bytes in the message pool
+.RB ( MSGPOOL ).
+.SH BUGS
+In Linux 3.13 and earlier,
+if
+.BR msgrcv ()
+was called with the
+.B MSG_COPY
+flag, but without
+.BR IPC_NOWAIT ,
+and the message queue contained fewer than
+.I msgtyp
+messages, then the call would block until the next message is written
+to the queue.
+.\" http://marc.info/?l=linux-kernel&m=139048542803605&w=2
+At that point, the call would return a copy of the message,
+.I regardless
+of whether that message was at the ordinal position
+.IR msgtyp .
+This bug is fixed
+.\" commit 4f87dac386cc43d5525da7a939d4b4e7edbea22c
+in Linux 3.14.
+.PP
+Specifying both
+.B MSG_COPY
+and
+.B MSG_EXCEPT
+in
+.I msgflg
+is a logical error (since these flags impose different interpretations on
+.IR msgtyp ).
+In Linux 3.13 and earlier,
+.\" http://marc.info/?l=linux-kernel&m=139048542803605&w=2
+this error was not diagnosed by
+.BR msgrcv ().
+This bug is fixed
+.\" commit 4f87dac386cc43d5525da7a939d4b4e7edbea22c
+in Linux 3.14.
+.SH EXAMPLES
+The program below demonstrates the use of
+.BR msgsnd ()
+and
+.BR msgrcv ().
+.PP
+The example program is first run with the \fB\-s\fP option to send a
+message and then run again with the \fB\-r\fP option to receive a
+message.
+.PP
+The following shell session shows a sample run of the program:
+.PP
+.in +4n
+.EX
+.RB "$" " ./a.out \-s"
+sent: a message at Wed Mar 4 16:25:45 2015
+.PP
+.RB "$" " ./a.out \-r"
+message received: a message at Wed Mar 4 16:25:45 2015
+.EE
+.in
+.SS Program source
+\&
+.\" SRC BEGIN (msgop.c)
+.EX
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/ipc.h>
+#include <sys/msg.h>
+#include <time.h>
+#include <unistd.h>
+\&
+struct msgbuf {
+ long mtype;
+ char mtext[80];
+};
+\&
+static void
+usage(char *prog_name, char *msg)
+{
+ if (msg != NULL)
+ fputs(msg, stderr);
+\&
+ fprintf(stderr, "Usage: %s [options]\en", prog_name);
+ fprintf(stderr, "Options are:\en");
+ fprintf(stderr, "\-s send message using msgsnd()\en");
+ fprintf(stderr, "\-r read message using msgrcv()\en");
+ fprintf(stderr, "\-t message type (default is 1)\en");
+ fprintf(stderr, "\-k message queue key (default is 1234)\en");
+ exit(EXIT_FAILURE);
+}
+\&
+static void
+send_msg(int qid, int msgtype)
+{
+ time_t t;
+ struct msgbuf msg;
+\&
+ msg.mtype = msgtype;
+\&
+ time(&t);
+ snprintf(msg.mtext, sizeof(msg.mtext), "a message at %s",
+ ctime(&t));
+\&
+ if (msgsnd(qid, &msg, sizeof(msg.mtext),
+ IPC_NOWAIT) == \-1)
+ {
+ perror("msgsnd error");
+ exit(EXIT_FAILURE);
+ }
+ printf("sent: %s\en", msg.mtext);
+}
+\&
+static void
+get_msg(int qid, int msgtype)
+{
+ struct msgbuf msg;
+\&
+ if (msgrcv(qid, &msg, sizeof(msg.mtext), msgtype,
+ MSG_NOERROR | IPC_NOWAIT) == \-1) {
+ if (errno != ENOMSG) {
+ perror("msgrcv");
+ exit(EXIT_FAILURE);
+ }
+ printf("No message available for msgrcv()\en");
+ } else {
+ printf("message received: %s\en", msg.mtext);
+ }
+}
+\&
+int
+main(int argc, char *argv[])
+{
+ int qid, opt;
+ int mode = 0; /* 1 = send, 2 = receive */
+ int msgtype = 1;
+ int msgkey = 1234;
+\&
+ while ((opt = getopt(argc, argv, "srt:k:")) != \-1) {
+ switch (opt) {
+ case \[aq]s\[aq]:
+ mode = 1;
+ break;
+ case \[aq]r\[aq]:
+ mode = 2;
+ break;
+ case \[aq]t\[aq]:
+ msgtype = atoi(optarg);
+ if (msgtype <= 0)
+ usage(argv[0], "\-t option must be greater than 0\en");
+ break;
+ case \[aq]k\[aq]:
+ msgkey = atoi(optarg);
+ break;
+ default:
+ usage(argv[0], "Unrecognized option\en");
+ }
+ }
+\&
+ if (mode == 0)
+ usage(argv[0], "must use either \-s or \-r option\en");
+\&
+ qid = msgget(msgkey, IPC_CREAT | 0666);
+\&
+ if (qid == \-1) {
+ perror("msgget");
+ exit(EXIT_FAILURE);
+ }
+\&
+ if (mode == 2)
+ get_msg(qid, msgtype);
+ else
+ send_msg(qid, msgtype);
+\&
+ exit(EXIT_SUCCESS);
+}
+.EE
+.\" SRC END
+.SH SEE ALSO
+.BR msgctl (2),
+.BR msgget (2),
+.BR capabilities (7),
+.BR mq_overview (7),
+.BR sysvipc (7)
diff --git a/man2/msgrcv.2 b/man2/msgrcv.2
new file mode 100644
index 0000000..b34869e
--- /dev/null
+++ b/man2/msgrcv.2
@@ -0,0 +1 @@
+.so man2/msgop.2
diff --git a/man2/msgsnd.2 b/man2/msgsnd.2
new file mode 100644
index 0000000..b34869e
--- /dev/null
+++ b/man2/msgsnd.2
@@ -0,0 +1 @@
+.so man2/msgop.2
diff --git a/man2/msync.2 b/man2/msync.2
new file mode 100644
index 0000000..baa328d
--- /dev/null
+++ b/man2/msync.2
@@ -0,0 +1,140 @@
+.\" Copyright (C) 1996 Andries Brouwer (aeb@cwi.nl)
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.TH msync 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+msync \- synchronize a file with a memory map
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/mman.h>
+.PP
+.BI "int msync(void " addr [. length "], size_t " length ", int " flags );
+.fi
+.SH DESCRIPTION
+.BR msync ()
+flushes changes made to the in-core copy of a file that was mapped
+into memory using
+.BR mmap (2)
+back to the filesystem.
+Without use of this call,
+there is no guarantee that changes are written back before
+.BR munmap (2)
+is called.
+To be more precise, the part of the file that
+corresponds to the memory area starting at
+.I addr
+and having length
+.I length
+is updated.
+.PP
+The
+.I flags
+argument should specify exactly one of
+.B MS_ASYNC
+and
+.BR MS_SYNC ,
+and may additionally include the
+.B MS_INVALIDATE
+bit.
+These bits have the following meanings:
+.TP
+.B MS_ASYNC
+Specifies that an update be scheduled, but the call returns immediately.
+.TP
+.B MS_SYNC
+Requests an update and waits for it to complete.
+.TP
+.B MS_INVALIDATE
+.\" Since Linux 2.4, this seems to be a no-op (other than the
+.\" EBUSY check for VM_LOCKED).
+Asks to invalidate other mappings of the same file
+(so that they can be updated with the fresh values just written).
+.SH RETURN VALUE
+On success, zero is returned.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EBUSY
+.B MS_INVALIDATE
+was specified in
+.IR flags ,
+and a memory lock exists for the specified address range.
+.TP
+.B EINVAL
+.I addr
+is not a multiple of PAGESIZE; or any bit other than
+.BR MS_ASYNC " | " MS_INVALIDATE " | " MS_SYNC
+is set in
+.IR flags ;
+or both
+.B MS_SYNC
+and
+.B MS_ASYNC
+are set in
+.IR flags .
+.TP
+.B ENOMEM
+The indicated memory (or part of it) was not mapped.
+.SH VERSIONS
+According to POSIX, either
+.B MS_SYNC
+or
+.B MS_ASYNC
+must be specified in
+.IR flags ,
+and indeed failure to include one of these flags will cause
+.BR msync ()
+to fail on some systems.
+However, Linux permits a call to
+.BR msync ()
+that specifies neither of these flags,
+with semantics that are (currently) equivalent to specifying
+.BR MS_ASYNC .
+(Since Linux 2.6.19,
+.\" commit 204ec841fbea3e5138168edbc3a76d46747cc987
+.B MS_ASYNC
+is in fact a no-op, since the kernel properly tracks dirty
+pages and flushes them to storage as necessary.)
+Notwithstanding the Linux behavior,
+portable, future-proof applications should ensure that they specify either
+.B MS_SYNC
+or
+.B MS_ASYNC
+in
+.IR flags .
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+POSIX.1-2001.
+.PP
+This call was introduced in Linux 1.3.21, and then used
+.B EFAULT
+instead of
+.BR ENOMEM .
+In Linux 2.4.19, this was changed to the POSIX value
+.BR ENOMEM .
+.PP
+On POSIX systems on which
+.BR msync ()
+is available, both
+.B _POSIX_MAPPED_FILES
+and
+.B _POSIX_SYNCHRONIZED_IO
+are defined in
+.I <unistd.h>
+to a value greater than 0.
+(See also
+.BR sysconf (3).)
+.\" POSIX.1-2001: It shall be defined to -1 or 0 or 200112L.
+.\" -1: unavailable, 0: ask using sysconf().
+.\" glibc defines them to 1.
+.SH SEE ALSO
+.BR mmap (2)
+.PP
+B.O. Gallmeister, POSIX.4, O'Reilly, pp. 128\[en]129 and 389\[en]391.
diff --git a/man2/munlock.2 b/man2/munlock.2
new file mode 100644
index 0000000..5e5b3c7
--- /dev/null
+++ b/man2/munlock.2
@@ -0,0 +1 @@
+.so man2/mlock.2
diff --git a/man2/munlockall.2 b/man2/munlockall.2
new file mode 100644
index 0000000..5e5b3c7
--- /dev/null
+++ b/man2/munlockall.2
@@ -0,0 +1 @@
+.so man2/mlock.2
diff --git a/man2/munmap.2 b/man2/munmap.2
new file mode 100644
index 0000000..8902d1b
--- /dev/null
+++ b/man2/munmap.2
@@ -0,0 +1 @@
+.so man2/mmap.2
diff --git a/man2/name_to_handle_at.2 b/man2/name_to_handle_at.2
new file mode 100644
index 0000000..090521c
--- /dev/null
+++ b/man2/name_to_handle_at.2
@@ -0,0 +1 @@
+.so man2/open_by_handle_at.2
diff --git a/man2/nanosleep.2 b/man2/nanosleep.2
new file mode 100644
index 0000000..4693dc8
--- /dev/null
+++ b/man2/nanosleep.2
@@ -0,0 +1,220 @@
+.\" Copyright (C) Markus Kuhn, 1996
+.\" and Copyright (C) Linux Foundation, 2008, written by Michael Kerrisk
+.\" <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: GPL-2.0-or-later
+.\"
+.\" 1996-04-10 Markus Kuhn <mskuhn@cip.informatik.uni-erlangen.de>
+.\" First version written
+.\" Modified, 2004-10-24, aeb
+.\" 2008-06-24, mtk
+.\" Minor rewrites of some parts.
+.\" NOTES: describe case where clock_nanosleep() can be preferable.
+.\" NOTES: describe CLOCK_REALTIME versus CLOCK_NANOSLEEP
+.\" Replace crufty discussion of HZ with a pointer to time(7).
+.TH nanosleep 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+nanosleep \- high-resolution sleep
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <time.h>
+.PP
+.BI "int nanosleep(const struct timespec *" req ,
+.BI " struct timespec *_Nullable " rem );
+.fi
+.PP
+.RS -4
+Feature Test Macro Requirements for glibc (see
+.BR feature_test_macros (7)):
+.RE
+.PP
+.BR nanosleep ():
+.nf
+ _POSIX_C_SOURCE >= 199309L
+.fi
+.SH DESCRIPTION
+.BR nanosleep ()
+suspends the execution of the calling thread
+until either at least the time specified in
+.I *req
+has elapsed, or a signal is delivered
+that triggers the invocation of a handler in the calling thread or
+that terminates the process.
+.PP
+If the call is interrupted by a signal handler,
+.BR nanosleep ()
+returns \-1, sets
+.I errno
+to
+.BR EINTR ,
+and writes the remaining time into the structure pointed to by
+.I rem
+unless
+.I rem
+is NULL.
+The value of
+.I *rem
+can then be used to call
+.BR nanosleep ()
+again and complete the specified pause (but see NOTES).
+.PP
+The
+.BR timespec (3)
+structure
+is used to specify intervals of time with nanosecond precision.
+.PP
+The value of the nanoseconds field must be in the range [0, 999999999].
+.PP
+Compared to
+.BR sleep (3)
+and
+.BR usleep (3),
+.BR nanosleep ()
+has the following advantages:
+it provides a higher resolution for specifying the sleep interval;
+POSIX.1 explicitly specifies that it
+does not interact with signals;
+and it makes the task of resuming a sleep that has been
+interrupted by a signal handler easier.
+.SH RETURN VALUE
+On successfully sleeping for the requested interval,
+.BR nanosleep ()
+returns 0.
+If the call is interrupted by a signal handler or encounters an error,
+then it returns \-1, with
+.I errno
+set to indicate the error.
+.SH ERRORS
+.TP
+.B EFAULT
+Problem with copying information from user space.
+.TP
+.B EINTR
+The pause has been interrupted by a signal that was
+delivered to the thread (see
+.BR signal (7)).
+The remaining sleep time has been written
+into
+.I *rem
+so that the thread can easily call
+.BR nanosleep ()
+again and continue with the pause.
+.TP
+.B EINVAL
+The value in the
+.I tv_nsec
+field was not in the range [0, 999999999] or
+.I tv_sec
+was negative.
+.SH VERSIONS
+POSIX.1 specifies that
+.BR nanosleep ()
+should measure time against the
+.B CLOCK_REALTIME
+clock.
+However, Linux measures the time using the
+.B CLOCK_MONOTONIC
+clock.
+.\" See also http://thread.gmane.org/gmane.linux.kernel/696854/
+.\" Subject: nanosleep() uses CLOCK_MONOTONIC, should be CLOCK_REALTIME?
+.\" Date: 2008-06-22 07:35:41 GMT
+This probably does not matter, since the POSIX.1 specification for
+.BR clock_settime (2)
+says that discontinuous changes in
+.B CLOCK_REALTIME
+should not affect
+.BR nanosleep ():
+.RS
+.PP
+Setting the value of the
+.B CLOCK_REALTIME
+clock via
+.BR clock_settime (2)
+shall
+have no effect on threads that are blocked waiting for a relative time
+service based upon this clock, including the
+.BR nanosleep ()
+function; ...
+Consequently, these time services shall expire when the requested relative
+interval elapses, independently of the new or old value of the clock.
+.RE
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+POSIX.1-2001.
+.PP
+In order to support applications requiring much more precise pauses
+(e.g., in order to control some time-critical hardware),
+.BR nanosleep ()
+would handle pauses of up to 2 milliseconds by busy waiting with microsecond
+precision when called from a thread scheduled under a real-time policy
+like
+.B SCHED_FIFO
+or
+.BR SCHED_RR .
+This special extension was removed in Linux 2.5.39,
+and is thus not available in Linux 2.6.0 and later kernels.
+.SH NOTES
+If the interval specified in
+.I req
+is not an exact multiple of the granularity of the underlying clock (see
+.BR time (7)),
+then the interval will be rounded up to the next multiple.
+Furthermore, after the sleep completes, there may still be a delay before
+the CPU becomes free to once again execute the calling thread.
+.PP
+The fact that
+.BR nanosleep ()
+sleeps for a relative interval can be problematic if the call
+is repeatedly restarted after being interrupted by signals,
+since the time between the interruptions and restarts of the call
+will lead to drift in the time when the sleep finally completes.
+This problem can be avoided by using
+.BR clock_nanosleep (2)
+with an absolute time value.
+.SH BUGS
+If a program that catches signals and uses
+.BR nanosleep ()
+receives signals at a very high rate,
+then scheduling delays and rounding errors in the kernel's
+calculation of the sleep interval and the returned
+.I *rem
+value mean that the
+.I *rem
+value may steadily
+.I increase
+on successive restarts of the
+.BR nanosleep ()
+call.
+To avoid such problems, use
+.BR clock_nanosleep (2)
+with the
+.B TIMER_ABSTIME
+flag to sleep to an absolute deadline.
+.PP
+In Linux 2.4, if
+.BR nanosleep ()
+is stopped by a signal (e.g.,
+.BR SIGTSTP ),
+then the call fails with the error
+.B EINTR
+after the thread is resumed by a
+.B SIGCONT
+signal.
+If the system call is subsequently restarted,
+then the time that the thread spent in the stopped state is
+.I not
+counted against the sleep interval.
+This problem is fixed in Linux 2.6.0 and later kernels.
+.SH SEE ALSO
+.BR clock_nanosleep (2),
+.BR restart_syscall (2),
+.BR sched_setscheduler (2),
+.BR timer_create (2),
+.BR sleep (3),
+.BR timespec (3),
+.BR usleep (3),
+.BR time (7)
diff --git a/man2/newfstatat.2 b/man2/newfstatat.2
new file mode 100644
index 0000000..7791269
--- /dev/null
+++ b/man2/newfstatat.2
@@ -0,0 +1 @@
+.so man2/fstatat.2
diff --git a/man2/nfsservctl.2 b/man2/nfsservctl.2
new file mode 100644
index 0000000..5267f81
--- /dev/null
+++ b/man2/nfsservctl.2
@@ -0,0 +1,70 @@
+.\" %%%LICENSE_START(PUBLIC_DOMAIN)
+.\" This text is in the public domain.
+.\" %%%LICENSE_END
+.\"
+.TH nfsservctl 2 2023-05-03 "Linux man-pages 6.05.01"
+.SH NAME
+nfsservctl \- syscall interface to kernel nfs daemon
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <linux/nfsd/syscall.h>
+.PP
+.BI "long nfsservctl(int " cmd ", struct nfsctl_arg *" argp ,
+.BI " union nfsctl_res *" resp );
+.fi
+.SH DESCRIPTION
+.IR Note :
+Since Linux 3.1, this system call no longer exists.
+It has been replaced by a set of files in the
+.I nfsd
+filesystem; see
+.BR nfsd (7).
+.PP
+.in +4n
+.EX
+/*
+ * These are the commands understood by nfsctl().
+ */
+#define NFSCTL_SVC 0 /* This is a server process. */
+#define NFSCTL_ADDCLIENT 1 /* Add an NFS client. */
+#define NFSCTL_DELCLIENT 2 /* Remove an NFS client. */
+#define NFSCTL_EXPORT 3 /* Export a filesystem. */
+#define NFSCTL_UNEXPORT 4 /* Unexport a filesystem. */
+#define NFSCTL_UGIDUPDATE 5 /* Update a client\[aq]s UID/GID map
+ (only in Linux 2.4.x and earlier). */
+#define NFSCTL_GETFH 6 /* Get a file handle (used by mountd(8))
+ (only in Linux 2.4.x and earlier). */
+\&
+struct nfsctl_arg {
+ int ca_version; /* safeguard */
+ union {
+ struct nfsctl_svc u_svc;
+ struct nfsctl_client u_client;
+ struct nfsctl_export u_export;
+ struct nfsctl_uidmap u_umap;
+ struct nfsctl_fhparm u_getfh;
+ unsigned int u_debug;
+ } u;
+};
+\&
+union nfsctl_res {
+ struct knfs_fh cr_getfh;
+ unsigned int cr_debug;
+};
+.EE
+.in
+.SH RETURN VALUE
+On success, zero is returned.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Removed in Linux 3.1.
+Removed in glibc 2.28.
+.SH SEE ALSO
+.BR nfsd (7)
diff --git a/man2/nice.2 b/man2/nice.2
new file mode 100644
index 0000000..d26a1be
--- /dev/null
+++ b/man2/nice.2
@@ -0,0 +1,118 @@
+.\" Copyright (c) 1992 Drew Eckhardt <drew@cs.colorado.edu>, March 28, 1992
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified by Michael Haardt <michael@moria.de>
+.\" Modified 1993-07-24 by Rik Faith <faith@cs.unc.edu>
+.\" Modified 1996-11-04 by Eric S. Raymond <esr@thyrsus.com>
+.\" Modified 2001-06-04 by aeb
+.\" Modified 2004-05-27 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.TH nice 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+nice \- change process priority
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <unistd.h>
+.PP
+.BI "int nice(int " inc );
+.fi
+.PP
+.RS -4
+Feature Test Macro Requirements for glibc (see
+.BR feature_test_macros (7)):
+.RE
+.PP
+.BR nice ():
+.nf
+ _XOPEN_SOURCE
+ || /* Since glibc 2.19: */ _DEFAULT_SOURCE
+ || /* glibc <= 2.19: */ _BSD_SOURCE || _SVID_SOURCE
+.fi
+.SH DESCRIPTION
+.BR nice ()
+adds
+.I inc
+to the nice value for the calling thread.
+(A higher nice value means a lower priority.)
+.PP
+The range of the nice value is +19 (low priority) to \-20 (high priority).
+Attempts to set a nice value outside the range are clamped to the range.
+.PP
+Traditionally, only a privileged process could lower the nice value
+(i.e., set a higher priority).
+However, since Linux 2.6.12, an unprivileged process can decrease
+the nice value of a target process that has a suitable
+.B RLIMIT_NICE
+soft limit; see
+.BR getrlimit (2)
+for details.
+.SH RETURN VALUE
+On success, the new nice value is returned (but see VERSIONS below).
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.PP
+A successful call can legitimately return \-1.
+To detect an error, set
+.I errno
+to 0 before the call, and check whether it is nonzero after
+.BR nice ()
+returns \-1.
+.SH ERRORS
+.TP
+.B EPERM
+The calling process attempted to increase its priority by
+supplying a negative
+.I inc
+but has insufficient privileges.
+Under Linux, the
+.B CAP_SYS_NICE
+capability is required.
+(But see the discussion of the
+.B RLIMIT_NICE
+resource limit in
+.BR setrlimit (2).)
+.SH VERSIONS
+.SS C library/kernel differences
+POSIX.1 specifies that
+.BR nice ()
+should return the new nice value.
+However, the raw Linux system call returns 0 on success.
+Likewise, the
+.BR nice ()
+wrapper function provided in glibc 2.2.3 and earlier returns 0 on success.
+.PP
+Since glibc 2.2.4, the
+.BR nice ()
+wrapper function provided by glibc provides conformance to POSIX.1 by calling
+.BR getpriority (2)
+to obtain the new nice value, which is then returned to the caller.
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+POSIX.1-2001, SVr4, 4.3BSD.
+.\" SVr4 documents an additional
+.\" .B EINVAL
+.\" error code.
+.SH NOTES
+For further details on the nice value, see
+.BR sched (7).
+.PP
+.IR Note :
+the addition of the "autogroup" feature in Linux 2.6.38 means that
+the nice value no longer has its traditional effect in many circumstances.
+For details, see
+.BR sched (7).
+.SH SEE ALSO
+.BR nice (1),
+.BR renice (1),
+.BR fork (2),
+.BR getpriority (2),
+.BR getrlimit (2),
+.BR setpriority (2),
+.BR capabilities (7),
+.BR sched (7)
diff --git a/man2/oldfstat.2 b/man2/oldfstat.2
new file mode 100644
index 0000000..b1a86c1
--- /dev/null
+++ b/man2/oldfstat.2
@@ -0,0 +1 @@
+.so man2/stat.2
diff --git a/man2/oldlstat.2 b/man2/oldlstat.2
new file mode 100644
index 0000000..b1a86c1
--- /dev/null
+++ b/man2/oldlstat.2
@@ -0,0 +1 @@
+.so man2/stat.2
diff --git a/man2/oldolduname.2 b/man2/oldolduname.2
new file mode 100644
index 0000000..450f7b1
--- /dev/null
+++ b/man2/oldolduname.2
@@ -0,0 +1 @@
+.so man2/uname.2
diff --git a/man2/oldstat.2 b/man2/oldstat.2
new file mode 100644
index 0000000..b1a86c1
--- /dev/null
+++ b/man2/oldstat.2
@@ -0,0 +1 @@
+.so man2/stat.2
diff --git a/man2/olduname.2 b/man2/olduname.2
new file mode 100644
index 0000000..450f7b1
--- /dev/null
+++ b/man2/olduname.2
@@ -0,0 +1 @@
+.so man2/uname.2
diff --git a/man2/open.2 b/man2/open.2
new file mode 100644
index 0000000..52286f6
--- /dev/null
+++ b/man2/open.2
@@ -0,0 +1,1934 @@
+.\" This manpage is Copyright (C) 1992 Drew Eckhardt;
+.\" and Copyright (C) 1993 Michael Haardt, Ian Jackson.
+.\" and Copyright (C) 2008 Greg Banks
+.\" and Copyright (C) 2006, 2008, 2013, 2014 Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified 1993-07-21 by Rik Faith <faith@cs.unc.edu>
+.\" Modified 1994-08-21 by Michael Haardt
+.\" Modified 1996-04-13 by Andries Brouwer <aeb@cwi.nl>
+.\" Modified 1996-05-13 by Thomas Koenig
+.\" Modified 1996-12-20 by Michael Haardt
+.\" Modified 1999-02-19 by Andries Brouwer <aeb@cwi.nl>
+.\" Modified 1998-11-28 by Joseph S. Myers <jsm28@hermes.cam.ac.uk>
+.\" Modified 1999-06-03 by Michael Haardt
+.\" Modified 2002-05-07 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Modified 2004-06-23 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\" 2004-12-08, mtk, reordered flags list alphabetically
+.\" 2004-12-08, Martin Pool <mbp@sourcefrog.net> (& mtk), added O_NOATIME
+.\" 2007-09-18, mtk, Added description of O_CLOEXEC + other minor edits
+.\" 2008-01-03, mtk, with input from Trond Myklebust
+.\" <trond.myklebust@fys.uio.no> and Timo Sirainen <tss@iki.fi>
+.\" Rewrite description of O_EXCL.
+.\" 2008-01-11, Greg Banks <gnb@melbourne.sgi.com>: add more detail
+.\" on O_DIRECT.
+.\" 2008-02-26, Michael Haardt: Reorganized text for O_CREAT and mode
+.\"
+.\" FIXME . Apr 08: The next POSIX revision has O_EXEC, O_SEARCH, and
+.\" O_TTYINIT. Eventually these may need to be documented. --mtk
+.\"
+.TH open 2 2023-05-20 "Linux man-pages 6.05.01"
+.SH NAME
+open, openat, creat \- open and possibly create a file
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <fcntl.h>
+.PP
+.BI "int open(const char *" pathname ", int " flags ", ..."
+.BI " \fR/*\fP mode_t " mode " \fR*/\fP );"
+.PP
+.BI "int creat(const char *" pathname ", mode_t " mode );
+.PP
+.BI "int openat(int " dirfd ", const char *" pathname ", int " flags ", ..."
+.BI " \fR/*\fP mode_t " mode " \fR*/\fP );"
+.PP
+/* Documented separately, in \fBopenat2\fP(2): */
+.BI "int openat2(int " dirfd ", const char *" pathname ,
+.BI " const struct open_how *" how ", size_t " size );
+.fi
+.PP
+.RS -4
+Feature Test Macro Requirements for glibc (see
+.BR feature_test_macros (7)):
+.RE
+.PP
+.BR openat ():
+.nf
+ Since glibc 2.10:
+ _POSIX_C_SOURCE >= 200809L
+ Before glibc 2.10:
+ _ATFILE_SOURCE
+.fi
+.SH DESCRIPTION
+The
+.BR open ()
+system call opens the file specified by
+.IR pathname .
+If the specified file does not exist,
+it may optionally (if
+.B O_CREAT
+is specified in
+.IR flags )
+be created by
+.BR open ().
+.PP
+The return value of
+.BR open ()
+is a file descriptor, a small, nonnegative integer that is an index
+to an entry in the process's table of open file descriptors.
+The file descriptor is used
+in subsequent system calls
+.RB ( read "(2), " write "(2), " lseek "(2), " fcntl (2),
+etc.) to refer to the open file.
+The file descriptor returned by a successful call will be
+the lowest-numbered file descriptor not currently open for the process.
+.PP
+By default, the new file descriptor is set to remain open across an
+.BR execve (2)
+(i.e., the
+.B FD_CLOEXEC
+file descriptor flag described in
+.BR fcntl (2)
+is initially disabled); the
+.B O_CLOEXEC
+flag, described below, can be used to change this default.
+The file offset is set to the beginning of the file (see
+.BR lseek (2)).
+.PP
+A call to
+.BR open ()
+creates a new
+.IR "open file description" ,
+an entry in the system-wide table of open files.
+The open file description records the file offset and the file status flags
+(see below).
+A file descriptor is a reference to an open file description;
+this reference is unaffected if
+.I pathname
+is subsequently removed or modified to refer to a different file.
+For further details on open file descriptions, see NOTES.
+.PP
+The argument
+.I flags
+must include one of the following
+.IR "access modes" :
+.BR O_RDONLY ", " O_WRONLY ", or " O_RDWR .
+These request opening the file read-only, write-only, or read/write,
+respectively.
+.PP
+In addition, zero or more file creation flags and file status flags
+can be
+bitwise ORed
+in
+.IR flags .
+The
+.I file creation flags
+are
+.BR O_CLOEXEC ,
+.BR O_CREAT ,
+.BR O_DIRECTORY ,
+.BR O_EXCL ,
+.BR O_NOCTTY ,
+.BR O_NOFOLLOW ,
+.BR O_TMPFILE ,
+and
+.BR O_TRUNC .
+The
+.I file status flags
+are all of the remaining flags listed below.
+.\" SUSv4 divides the flags into:
+.\" * Access mode
+.\" * File creation
+.\" * File status
+.\" * Other (O_CLOEXEC, O_DIRECTORY, O_NOFOLLOW)
+.\" though it's not clear what the difference between "other" and
+.\" "File creation" flags is. I raised an Aardvark to see if this
+.\" can be clarified in SUSv4; 10 Oct 2008.
+.\" http://thread.gmane.org/gmane.comp.standards.posix.austin.general/64/focus=67
+.\" TC1 (balloted in 2013), resolved this, so that those three constants
+.\" are also categorized as file status flags.
+.\"
+The distinction between these two groups of flags is that
+the file creation flags affect the semantics of the open operation itself,
+while the file status flags affect the semantics of subsequent I/O operations.
+The file status flags can be retrieved and (in some cases)
+modified; see
+.BR fcntl (2)
+for details.
+.PP
+The full list of file creation flags and file status flags is as follows:
+.TP
+.B O_APPEND
+The file is opened in append mode.
+Before each
+.BR write (2),
+the file offset is positioned at the end of the file,
+as if with
+.BR lseek (2).
+The modification of the file offset and the write operation
+are performed as a single atomic step.
+.IP
+.B O_APPEND
+may lead to corrupted files on NFS filesystems if more than one process
+appends data to a file at once.
+.\" For more background, see
+.\" http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=453946
+.\" http://nfs.sourceforge.net/
+This is because NFS does not support
+appending to a file, so the client kernel has to simulate it, which
+can't be done without a race condition.
+.TP
+.B O_ASYNC
+Enable signal-driven I/O:
+generate a signal
+.RB ( SIGIO
+by default, but this can be changed via
+.BR fcntl (2))
+when input or output becomes possible on this file descriptor.
+This feature is available only for terminals, pseudoterminals,
+sockets, and (since Linux 2.6) pipes and FIFOs.
+See
+.BR fcntl (2)
+for further details.
+See also BUGS, below.
+.TP
+.BR O_CLOEXEC " (since Linux 2.6.23)"
+.\" NOTE! several other man pages refer to this text
+Enable the close-on-exec flag for the new file descriptor.
+.\" FIXME . for later review when Issue 8 is one day released...
+.\" POSIX proposes to fix many APIs that provide hidden FDs
+.\" http://austingroupbugs.net/tag_view_page.php?tag_id=8
+.\" http://austingroupbugs.net/view.php?id=368
+Specifying this flag permits a program to avoid additional
+.BR fcntl (2)
+.B F_SETFD
+operations to set the
+.B FD_CLOEXEC
+flag.
+.IP
+Note that the use of this flag is essential in some multithreaded programs,
+because using a separate
+.BR fcntl (2)
+.B F_SETFD
+operation to set the
+.B FD_CLOEXEC
+flag does not suffice to avoid race conditions
+where one thread opens a file descriptor and
+attempts to set its close-on-exec flag using
+.BR fcntl (2)
+at the same time as another thread does a
+.BR fork (2)
+plus
+.BR execve (2).
+Depending on the order of execution,
+the race may lead to the file descriptor returned by
+.BR open ()
+being unintentionally leaked to the program executed by the child process
+created by
+.BR fork (2).
+(This kind of race is in principle possible for any system call
+that creates a file descriptor whose close-on-exec flag should be set,
+and various other Linux system calls provide an equivalent of the
+.B O_CLOEXEC
+flag to deal with this problem.)
+.\" This flag fixes only one form of the race condition;
+.\" The race can also occur with, for example, file descriptors
+.\" returned by accept(), pipe(), etc.
+.TP
+.B O_CREAT
+If
+.I pathname
+does not exist, create it as a regular file.
+.IP
+The owner (user ID) of the new file is set to the effective user ID
+of the process.
+.IP
+The group ownership (group ID) of the new file is set either to
+the effective group ID of the process (System V semantics)
+or to the group ID of the parent directory (BSD semantics).
+On Linux, the behavior depends on whether the
+set-group-ID mode bit is set on the parent directory:
+if that bit is set, then BSD semantics apply;
+otherwise, System V semantics apply.
+For some filesystems, the behavior also depends on the
+.I bsdgroups
+and
+.I sysvgroups
+mount options described in
+.BR mount (8).
+.\" As at Linux 2.6.25, bsdgroups is supported by ext2, ext3, ext4, and
+.\" XFS (since Linux 2.6.14).
+.IP
+The
+.I mode
+argument specifies the file mode bits to be applied when a new file is created.
+If neither
+.B O_CREAT
+nor
+.B O_TMPFILE
+is specified in
+.IR flags ,
+then
+.I mode
+is ignored (and can thus be specified as 0, or simply omitted).
+The
+.I mode
+argument
+.B must
+be supplied if
+.B O_CREAT
+or
+.B O_TMPFILE
+is specified in
+.IR flags ;
+if it is not supplied,
+some arbitrary bytes from the stack will be applied as the file mode.
+.IP
+The effective mode is modified by the process's
+.I umask
+in the usual way: in the absence of a default ACL, the mode of the
+created file is
+.IR "(mode\ &\ \[ti]umask)" .
+.IP
+Note that
+.I mode
+applies only to future accesses of the
+newly created file; the
+.BR open ()
+call that creates a read-only file may well return a read/write
+file descriptor.
+.IP
+The following symbolic constants are provided for
+.IR mode :
+.RS
+.TP 9
+.B S_IRWXU
+00700 user (file owner) has read, write, and execute permission
+.TP
+.B S_IRUSR
+00400 user has read permission
+.TP
+.B S_IWUSR
+00200 user has write permission
+.TP
+.B S_IXUSR
+00100 user has execute permission
+.TP
+.B S_IRWXG
+00070 group has read, write, and execute permission
+.TP
+.B S_IRGRP
+00040 group has read permission
+.TP
+.B S_IWGRP
+00020 group has write permission
+.TP
+.B S_IXGRP
+00010 group has execute permission
+.TP
+.B S_IRWXO
+00007 others have read, write, and execute permission
+.TP
+.B S_IROTH
+00004 others have read permission
+.TP
+.B S_IWOTH
+00002 others have write permission
+.TP
+.B S_IXOTH
+00001 others have execute permission
+.RE
+.IP
+According to POSIX, the effect when other bits are set in
+.I mode
+is unspecified.
+On Linux, the following bits are also honored in
+.IR mode :
+.RS
+.TP 9
+.B S_ISUID
+0004000 set-user-ID bit
+.TP
+.B S_ISGID
+0002000 set-group-ID bit (see
+.BR inode (7)).
+.TP
+.B S_ISVTX
+0001000 sticky bit (see
+.BR inode (7)).
+.RE
+.TP
+.BR O_DIRECT " (since Linux 2.4.10)"
+Try to minimize cache effects of the I/O to and from this file.
+In general this will degrade performance, but it is useful in
+special situations, such as when applications do their own caching.
+File I/O is done directly to/from user-space buffers.
+The
+.B O_DIRECT
+flag on its own makes an effort to transfer data synchronously,
+but does not give the guarantees of the
+.B O_SYNC
+flag that data and necessary metadata are transferred.
+To guarantee synchronous I/O,
+.B O_SYNC
+must be used in addition to
+.BR O_DIRECT .
+See NOTES below for further discussion.
+.IP
+A semantically similar (but deprecated) interface for block devices
+is described in
+.BR raw (8).
+.TP
+.B O_DIRECTORY
+If \fIpathname\fP is not a directory, cause the open to fail.
+.\" But see the following and its replies:
+.\" http://marc.theaimsgroup.com/?t=112748702800001&r=1&w=2
+.\" [PATCH] open: O_DIRECTORY and O_CREAT together should fail
+.\" O_DIRECTORY | O_CREAT causes O_DIRECTORY to be ignored.
+This flag was added in Linux 2.1.126, to
+avoid denial-of-service problems if
+.BR opendir (3)
+is called on a
+FIFO or tape device.
+.TP
+.B O_DSYNC
+Write operations on the file will complete according to the requirements of
+synchronized I/O
+.I data
+integrity completion.
+.IP
+By the time
+.BR write (2)
+(and similar)
+return, the output data
+has been transferred to the underlying hardware,
+along with any file metadata that would be required to retrieve that data
+(i.e., as though each
+.BR write (2)
+was followed by a call to
+.BR fdatasync (2)).
+.IR "See NOTES below" .
+.TP
+.B O_EXCL
+Ensure that this call creates the file:
+if this flag is specified in conjunction with
+.BR O_CREAT ,
+and
+.I pathname
+already exists, then
+.BR open ()
+fails with the error
+.BR EEXIST .
+.IP
+When these two flags are specified, symbolic links are not followed:
+.\" POSIX.1-2001 explicitly requires this behavior.
+if
+.I pathname
+is a symbolic link, then
+.BR open ()
+fails regardless of where the symbolic link points.
+.IP
+In general, the behavior of
+.B O_EXCL
+is undefined if it is used without
+.BR O_CREAT .
+There is one exception: on Linux 2.6 and later,
+.B O_EXCL
+can be used without
+.B O_CREAT
+if
+.I pathname
+refers to a block device.
+If the block device is in use by the system (e.g., mounted),
+.BR open ()
+fails with the error
+.BR EBUSY .
+.IP
+On NFS,
+.B O_EXCL
+is supported only when using NFSv3 or later on kernel 2.6 or later.
+In NFS environments where
+.B O_EXCL
+support is not provided, programs that rely on it
+for performing locking tasks will contain a race condition.
+Portable programs that want to perform atomic file locking using a lockfile,
+and need to avoid reliance on NFS support for
+.BR O_EXCL ,
+can create a unique file on
+the same filesystem (e.g., incorporating hostname and PID), and use
+.BR link (2)
+to make a link to the lockfile.
+If
+.BR link (2)
+returns 0, the lock is successful.
+Otherwise, use
+.BR stat (2)
+on the unique file to check if its link count has increased to 2,
+in which case the lock is also successful.
+.TP
+.B O_LARGEFILE
+(LFS)
+Allow files whose sizes cannot be represented in an
+.I off_t
+(but can be represented in an
+.IR off64_t )
+to be opened.
+The
+.B _LARGEFILE64_SOURCE
+macro must be defined
+(before including
+.I any
+header files)
+in order to obtain this definition.
+Setting the
+.B _FILE_OFFSET_BITS
+feature test macro to 64 (rather than using
+.BR O_LARGEFILE )
+is the preferred
+method of accessing large files on 32-bit systems (see
+.BR feature_test_macros (7)).
+.TP
+.BR O_NOATIME " (since Linux 2.6.8)"
+Do not update the file last access time
+.RI ( st_atime
+in the inode)
+when the file is
+.BR read (2).
+.IP
+This flag can be employed only if one of the following conditions is true:
+.RS
+.IP \[bu] 3
+The effective UID of the process
+.\" Strictly speaking: the filesystem UID
+matches the owner UID of the file.
+.IP \[bu]
+The calling process has the
+.B CAP_FOWNER
+capability in its user namespace and
+the owner UID of the file has a mapping in the namespace.
+.RE
+.IP
+This flag is intended for use by indexing or backup programs,
+where its use can significantly reduce the amount of disk activity.
+This flag may not be effective on all filesystems.
+One example is NFS, where the server maintains the access time.
+.\" The O_NOATIME flag also affects the treatment of st_atime
+.\" by mmap() and readdir(2), MTK, Dec 04.
+.TP
+.B O_NOCTTY
+If
+.I pathname
+refers to a terminal device\[em]see
+.BR tty (4)\[em]it
+will not become the process's controlling terminal even if the
+process does not have one.
+.TP
+.B O_NOFOLLOW
+If the trailing component (i.e., basename) of
+.I pathname
+is a symbolic link, then the open fails, with the error
+.BR ELOOP .
+Symbolic links in earlier components of the pathname will still be
+followed.
+(Note that the
+.B ELOOP
+error that can occur in this case is indistinguishable from the case where
+an open fails because there are too many symbolic links found
+while resolving components in the prefix part of the pathname.)
+.IP
+This flag is a FreeBSD extension, which was added in Linux 2.1.126,
+and has subsequently been standardized in POSIX.1-2008.
+.IP
+See also
+.B O_PATH
+below.
+.\" The headers from glibc 2.0.100 and later include a
+.\" definition of this flag; \fIkernels before Linux 2.1.126 will ignore it if
+.\" used\fP.
+.TP
+.BR O_NONBLOCK " or " O_NDELAY
+When possible, the file is opened in nonblocking mode.
+Neither the
+.BR open ()
+nor any subsequent I/O operations on the file descriptor which is
+returned will cause the calling process to wait.
+.IP
+Note that the setting of this flag has no effect on the operation of
+.BR poll (2),
+.BR select (2),
+.BR epoll (7),
+and similar,
+since those interfaces merely inform the caller about whether
+a file descriptor is "ready",
+meaning that an I/O operation performed on
+the file descriptor with the
+.B O_NONBLOCK
+flag
+.I clear
+would not block.
+.IP
+Note that this flag has no effect for regular files and block devices;
+that is, I/O operations will (briefly) block when device activity
+is required, regardless of whether
+.B O_NONBLOCK
+is set.
+Since
+.B O_NONBLOCK
+semantics might eventually be implemented,
+applications should not depend upon blocking behavior
+when specifying this flag for regular files and block devices.
+.IP
+For the handling of FIFOs (named pipes), see also
+.BR fifo (7).
+For a discussion of the effect of
+.B O_NONBLOCK
+in conjunction with mandatory file locks and with file leases, see
+.BR fcntl (2).
+.TP
+.BR O_PATH " (since Linux 2.6.39)"
+.\" commit 1abf0c718f15a56a0a435588d1b104c7a37dc9bd
+.\" commit 326be7b484843988afe57566b627fb7a70beac56
+.\" commit 65cfc6722361570bfe255698d9cd4dccaf47570d
+.\"
+.\" http://thread.gmane.org/gmane.linux.man/2790/focus=3496
+.\" Subject: Re: [PATCH] open(2): document O_PATH
+.\" Newsgroups: gmane.linux.man, gmane.linux.kernel
+.\"
+Obtain a file descriptor that can be used for two purposes:
+to indicate a location in the filesystem tree and
+to perform operations that act purely at the file descriptor level.
+The file itself is not opened, and other file operations (e.g.,
+.BR read (2),
+.BR write (2),
+.BR fchmod (2),
+.BR fchown (2),
+.BR fgetxattr (2),
+.BR ioctl (2),
+.BR mmap (2))
+fail with the error
+.BR EBADF .
+.IP
+The following operations
+.I can
+be performed on the resulting file descriptor:
+.RS
+.IP \[bu] 3
+.BR close (2).
+.IP \[bu]
+.BR fchdir (2),
+if the file descriptor refers to a directory
+(since Linux 3.5).
+.\" commit 332a2e1244bd08b9e3ecd378028513396a004a24
+.IP \[bu]
+.BR fstat (2)
+(since Linux 3.6).
+.IP \[bu]
+.\" fstat(): commit 55815f70147dcfa3ead5738fd56d3574e2e3c1c2
+.BR fstatfs (2)
+(since Linux 3.12).
+.\" fstatfs(): commit 9d05746e7b16d8565dddbe3200faa1e669d23bbf
+.IP \[bu]
+Duplicating the file descriptor
+.RB ( dup (2),
+.BR fcntl (2)
+.BR F_DUPFD ,
+etc.).
+.IP \[bu]
+Getting and setting file descriptor flags
+.RB ( fcntl (2)
+.B F_GETFD
+and
+.BR F_SETFD ).
+.IP \[bu]
+Retrieving open file status flags using the
+.BR fcntl (2)
+.B F_GETFL
+operation: the returned flags will include the bit
+.BR O_PATH .
+.IP \[bu]
+Passing the file descriptor as the
+.I dirfd
+argument of
+.BR openat ()
+and the other "*at()" system calls.
+This includes
+.BR linkat (2)
+with
+.B AT_EMPTY_PATH
+(or via procfs using
+.BR AT_SYMLINK_FOLLOW )
+even if the file is not a directory.
+.IP \[bu]
+Passing the file descriptor to another process via a UNIX domain socket
+(see
+.B SCM_RIGHTS
+in
+.BR unix (7)).
+.RE
+.IP
+When
+.B O_PATH
+is specified in
+.IR flags ,
+flag bits other than
+.BR O_CLOEXEC ,
+.BR O_DIRECTORY ,
+and
+.B O_NOFOLLOW
+are ignored.
+.IP
+Opening a file or directory with the
+.B O_PATH
+flag requires no permissions on the object itself
+(but does require execute permission on the directories in the path prefix).
+Depending on the subsequent operation,
+a check for suitable file permissions may be performed (e.g.,
+.BR fchdir (2)
+requires execute permission on the directory referred to
+by its file descriptor argument).
+By contrast,
+obtaining a reference to a filesystem object by opening it with the
+.B O_RDONLY
+flag requires that the caller have read permission on the object,
+even when the subsequent operation (e.g.,
+.BR fchdir (2),
+.BR fstat (2))
+does not require read permission on the object.
+.IP
+If
+.I pathname
+is a symbolic link and the
+.B O_NOFOLLOW
+flag is also specified,
+then the call returns a file descriptor referring to the symbolic link.
+This file descriptor can be used as the
+.I dirfd
+argument in calls to
+.BR fchownat (2),
+.BR fstatat (2),
+.BR linkat (2),
+and
+.BR readlinkat (2)
+with an empty pathname to have the calls operate on the symbolic link.
+.IP
+If
+.I pathname
+refers to an automount point that has not yet been triggered, so no
+other filesystem is mounted on it, then the call returns a file
+descriptor referring to the automount directory without triggering a mount.
+.BR fstatfs (2)
+can then be used to determine if it is, in fact, an untriggered
+automount point
+.RB ( ".f_type == AUTOFS_SUPER_MAGIC" ).
+.IP
+One use of
+.B O_PATH
+for regular files is to provide the equivalent of POSIX.1's
+.B O_EXEC
+functionality.
+This permits us to open a file for which we have execute
+permission but not read permission, and then execute that file,
+with steps something like the following:
+.IP
+.in +4n
+.EX
+char buf[PATH_MAX];
+fd = open("some_prog", O_PATH);
+snprintf(buf, PATH_MAX, "/proc/self/fd/%d", fd);
+execl(buf, "some_prog", (char *) NULL);
+.EE
+.in
+.IP
+An
+.B O_PATH
+file descriptor can also be passed as the argument of
+.BR fexecve (3).
+.TP
+.B O_SYNC
+Write operations on the file will complete according to the requirements of
+synchronized I/O
+.I file
+integrity completion
+(by contrast with the
+synchronized I/O
+.I data
+integrity completion
+provided by
+.BR O_DSYNC ).
+.IP
+By the time
+.BR write (2)
+(or similar)
+returns, the output data and associated file metadata
+have been transferred to the underlying hardware
+(i.e., as though each
+.BR write (2)
+was followed by a call to
+.BR fsync (2)).
+.IR "See NOTES below" .
+.TP
+.BR O_TMPFILE " (since Linux 3.11)"
+.\" commit 60545d0d4610b02e55f65d141c95b18ccf855b6e
+.\" commit f4e0c30c191f87851c4a53454abb55ee276f4a7e
+.\" commit bb458c644a59dbba3a1fe59b27106c5e68e1c4bd
+Create an unnamed temporary regular file.
+The
+.I pathname
+argument specifies a directory;
+an unnamed inode will be created in that directory's filesystem.
+Anything written to the resulting file will be lost when
+the last file descriptor is closed, unless the file is given a name.
+.IP
+.B O_TMPFILE
+must be specified with one of
+.B O_RDWR
+or
+.B O_WRONLY
+and, optionally,
+.BR O_EXCL .
+If
+.B O_EXCL
+is not specified, then
+.BR linkat (2)
+can be used to link the temporary file into the filesystem, making it
+permanent, using code like the following:
+.IP
+.in +4n
+.EX
+char path[PATH_MAX];
+fd = open("/path/to/dir", O_TMPFILE | O_RDWR,
+ S_IRUSR | S_IWUSR);
+\&
+/* File I/O on \[aq]fd\[aq]... */
+\&
+linkat(fd, "", AT_FDCWD, "/path/for/file", AT_EMPTY_PATH);
+\&
+/* If the caller doesn\[aq]t have the CAP_DAC_READ_SEARCH
+ capability (needed to use AT_EMPTY_PATH with linkat(2)),
+ and there is a proc(5) filesystem mounted, then the
+ linkat(2) call above can be replaced with:
+\&
+snprintf(path, PATH_MAX, "/proc/self/fd/%d", fd);
+linkat(AT_FDCWD, path, AT_FDCWD, "/path/for/file",
+ AT_SYMLINK_FOLLOW);
+*/
+.EE
+.in
+.IP
+In this case,
+the
+.BR open ()
+.I mode
+argument determines the file permission mode, as with
+.BR O_CREAT .
+.IP
+Specifying
+.B O_EXCL
+in conjunction with
+.B O_TMPFILE
+prevents a temporary file from being linked into the filesystem
+in the above manner.
+(Note that the meaning of
+.B O_EXCL
+in this case is different from the meaning of
+.B O_EXCL
+otherwise.)
+.IP
+There are two main use cases for
+.\" Inspired by http://lwn.net/Articles/559147/
+.BR O_TMPFILE :
+.RS
+.IP \[bu] 3
+Improved
+.BR tmpfile (3)
+functionality: race-free creation of temporary files that
+(1) are automatically deleted when closed;
+(2) can never be reached via any pathname;
+(3) are not subject to symlink attacks; and
+(4) do not require the caller to devise unique names.
+.IP \[bu]
+Creating a file that is initially invisible, which is then populated
+with data and adjusted to have appropriate filesystem attributes
+.RB ( fchown (2),
+.BR fchmod (2),
+.BR fsetxattr (2),
+etc.)
+before being atomically linked into the filesystem
+in a fully formed state (using
+.BR linkat (2)
+as described above).
+.RE
+.IP
+.B O_TMPFILE
+requires support by the underlying filesystem;
+only a subset of Linux filesystems provide that support.
+In the initial implementation, support was provided in
+the ext2, ext3, ext4, UDF, Minix, and tmpfs filesystems.
+.\" To check for support, grep for "tmpfile" in kernel sources
+Support for other filesystems has subsequently been added as follows:
+XFS (Linux 3.15);
+.\" commit 99b6436bc29e4f10e4388c27a3e4810191cc4788
+.\" commit ab29743117f9f4c22ac44c13c1647fb24fb2bafe
+Btrfs (Linux 3.16);
+.\" commit ef3b9af50bfa6a1f02cd7b3f5124b712b1ba3e3c
+F2FS (Linux 3.16);
+.\" commit 50732df02eefb39ab414ef655979c2c9b64ad21c
+and ubifs (Linux 4.9).
+.TP
+.B O_TRUNC
+If the file already exists and is a regular file and the access mode allows
+writing (i.e., is
+.B O_RDWR
+or
+.BR O_WRONLY )
+it will be truncated to length 0.
+If the file is a FIFO or terminal device file, the
+.B O_TRUNC
+flag is ignored.
+Otherwise, the effect of
+.B O_TRUNC
+is unspecified.
+.SS creat()
+A call to
+.BR creat ()
+is equivalent to calling
+.BR open ()
+with
+.I flags
+equal to
+.BR O_CREAT|O_WRONLY|O_TRUNC .
+.SS openat()
+The
+.BR openat ()
+system call operates in exactly the same way as
+.BR open (),
+except for the differences described here.
+.PP
+The
+.I dirfd
+argument is used in conjunction with the
+.I pathname
+argument as follows:
+.IP \[bu] 3
+If the pathname given in
+.I pathname
+is absolute, then
+.I dirfd
+is ignored.
+.IP \[bu]
+If the pathname given in
+.I pathname
+is relative and
+.I dirfd
+is the special value
+.BR AT_FDCWD ,
+then
+.I pathname
+is interpreted relative to the current working
+directory of the calling process (like
+.BR open ()).
+.IP \[bu]
+If the pathname given in
+.I pathname
+is relative, then it is interpreted relative to the directory
+referred to by the file descriptor
+.I dirfd
+(rather than relative to the current working directory of
+the calling process, as is done by
+.BR open ()
+for a relative pathname).
+In this case,
+.I dirfd
+must be a directory that was opened for reading
+.RB ( O_RDONLY )
+or using the
+.B O_PATH
+flag.
+.PP
+If the pathname given in
+.I pathname
+is relative, and
+.I dirfd
+is not a valid file descriptor, an error
+.RB ( EBADF )
+results.
+(Specifying an invalid file descriptor number in
+.I dirfd
+can be used as a means to ensure that
+.I pathname
+is absolute.)
+.\"
+.SS openat2(2)
+The
+.BR openat2 (2)
+system call is an extension of
+.BR openat (),
+and provides a superset of the features of
+.BR openat ().
+It is documented separately, in
+.BR openat2 (2).
+.SH RETURN VALUE
+On success,
+.BR open (),
+.BR openat (),
+and
+.BR creat ()
+return the new file descriptor (a nonnegative integer).
+On error, \-1 is returned and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.BR open (),
+.BR openat (),
+and
+.BR creat ()
+can fail with the following errors:
+.TP
+.B EACCES
+The requested access to the file is not allowed, or search permission
+is denied for one of the directories in the path prefix of
+.IR pathname ,
+or the file did not exist yet and write access to the parent directory
+is not allowed.
+(See also
+.BR path_resolution (7).)
+.TP
+.B EACCES
+.\" commit 30aba6656f61ed44cba445a3c0d38b296fa9e8f5
+Where
+.B O_CREAT
+is specified, the
+.I protected_fifos
+or
+.I protected_regular
+sysctl is enabled, the file already exists and is a FIFO or regular file, the
+owner of the file is neither the current user nor the owner of the
+containing directory, and the containing directory is both world- or
+group-writable and sticky.
+For details, see the descriptions of
+.I /proc/sys/fs/protected_fifos
+and
+.I /proc/sys/fs/protected_regular
+in
+.BR proc (5).
+.TP
+.B EBADF
+.RB ( openat ())
+.I pathname
+is relative but
+.I dirfd
+is neither
+.B AT_FDCWD
+nor a valid file descriptor.
+.TP
+.B EBUSY
+.B O_EXCL
+was specified in
+.I flags
+and
+.I pathname
+refers to a block device that is in use by the system (e.g., it is mounted).
+.TP
+.B EDQUOT
+Where
+.B O_CREAT
+is specified, the file does not exist, and the user's quota of disk
+blocks or inodes on the filesystem has been exhausted.
+.TP
+.B EEXIST
+.I pathname
+already exists and
+.BR O_CREAT " and " O_EXCL
+were used.
+.TP
+.B EFAULT
+.I pathname
+points outside your accessible address space.
+.TP
+.B EFBIG
+See
+.BR EOVERFLOW .
+.TP
+.B EINTR
+While blocked waiting to complete an open of a slow device
+(e.g., a FIFO; see
+.BR fifo (7)),
+the call was interrupted by a signal handler; see
+.BR signal (7).
+.TP
+.B EINVAL
+The filesystem does not support the
+.B O_DIRECT
+flag.
+See
+.B NOTES
+for more information.
+.TP
+.B EINVAL
+Invalid value in
+.\" In particular, __O_TMPFILE instead of O_TMPFILE
+.IR flags .
+.TP
+.B EINVAL
+.B O_TMPFILE
+was specified in
+.IR flags ,
+but neither
+.B O_WRONLY
+nor
+.B O_RDWR
+was specified.
+.TP
+.B EINVAL
+.B O_CREAT
+was specified in
+.I flags
+and the final component ("basename") of the new file's
+.I pathname
+is invalid
+(e.g., it contains characters not permitted by the underlying filesystem).
+.TP
+.B EINVAL
+The final component ("basename") of
+.I pathname
+is invalid
+(e.g., it contains characters not permitted by the underlying filesystem).
+.TP
+.B EISDIR
+.I pathname
+refers to a directory and the access requested involved writing
+(that is,
+.B O_WRONLY
+or
+.B O_RDWR
+is set).
+.TP
+.B EISDIR
+.I pathname
+refers to an existing directory,
+.B O_TMPFILE
+and one of
+.B O_WRONLY
+or
+.B O_RDWR
+were specified in
+.IR flags ,
+but this kernel version does not provide the
+.B O_TMPFILE
+functionality.
+.TP
+.B ELOOP
+Too many symbolic links were encountered in resolving
+.IR pathname .
+.TP
+.B ELOOP
+.I pathname
+was a symbolic link, and
+.I flags
+specified
+.B O_NOFOLLOW
+but not
+.BR O_PATH .
+.TP
+.B EMFILE
+The per-process limit on the number of open file descriptors has been reached
+(see the description of
+.B RLIMIT_NOFILE
+in
+.BR getrlimit (2)).
+.TP
+.B ENAMETOOLONG
+.I pathname
+was too long.
+.TP
+.B ENFILE
+The system-wide limit on the total number of open files has been reached.
+.TP
+.B ENODEV
+.I pathname
+refers to a device special file and no corresponding device exists.
+(This is a Linux kernel bug; in this situation
+.B ENXIO
+must be returned.)
+.TP
+.B ENOENT
+.B O_CREAT
+is not set and the named file does not exist.
+.TP
+.B ENOENT
+A directory component in
+.I pathname
+does not exist or is a dangling symbolic link.
+.TP
+.B ENOENT
+.I pathname
+refers to a nonexistent directory,
+.B O_TMPFILE
+and one of
+.B O_WRONLY
+or
+.B O_RDWR
+were specified in
+.IR flags ,
+but this kernel version does not provide the
+.B O_TMPFILE
+functionality.
+.TP
+.B ENOMEM
+The named file is a FIFO,
+but memory for the FIFO buffer can't be allocated because
+the per-user hard limit on memory allocation for pipes has been reached
+and the caller is not privileged; see
+.BR pipe (7).
+.TP
+.B ENOMEM
+Insufficient kernel memory was available.
+.TP
+.B ENOSPC
+.I pathname
+was to be created but the device containing
+.I pathname
+has no room for the new file.
+.TP
+.B ENOTDIR
+A component used as a directory in
+.I pathname
+is not, in fact, a directory, or \fBO_DIRECTORY\fP was specified and
+.I pathname
+was not a directory.
+.TP
+.B ENOTDIR
+.RB ( openat ())
+.I pathname
+is a relative pathname and
+.I dirfd
+is a file descriptor referring to a file other than a directory.
+.TP
+.B ENXIO
+.BR O_NONBLOCK " | " O_WRONLY
+is set, the named file is a FIFO, and
+no process has the FIFO open for reading.
+.TP
+.B ENXIO
+The file is a device special file and no corresponding device exists.
+.TP
+.B ENXIO
+The file is a UNIX domain socket.
+.TP
+.B EOPNOTSUPP
+The filesystem containing
+.I pathname
+does not support
+.BR O_TMPFILE .
+.TP
+.B EOVERFLOW
+.I pathname
+refers to a regular file that is too large to be opened.
+The usual scenario here is that an application compiled
+on a 32-bit platform without
+.I \-D_FILE_OFFSET_BITS=64
+tried to open a file whose size exceeds
+.I (1<<31)\-1
+bytes;
+see also
+.B O_LARGEFILE
+above.
+This is the error specified by POSIX.1;
+before Linux 2.6.24, Linux gave the error
+.B EFBIG
+for this case.
+.\" See http://bugzilla.kernel.org/show_bug.cgi?id=7253
+.\" "Open of a large file on 32-bit fails with EFBIG, should be EOVERFLOW"
+.\" Reported 2006-10-03
+.TP
+.B EPERM
+The
+.B O_NOATIME
+flag was specified, but the effective user ID of the caller
+.\" Strictly speaking, it's the filesystem UID... (MTK)
+did not match the owner of the file and the caller was not privileged.
+.TP
+.B EPERM
+The operation was prevented by a file seal; see
+.BR fcntl (2).
+.TP
+.B EROFS
+.I pathname
+refers to a file on a read-only filesystem and write access was
+requested.
+.TP
+.B ETXTBSY
+.I pathname
+refers to an executable image which is currently being executed and
+write access was requested.
+.TP
+.B ETXTBSY
+.I pathname
+refers to a file that is currently in use as a swap file, and the
+.B O_TRUNC
+flag was specified.
+.TP
+.B ETXTBSY
+.I pathname
+refers to a file that is currently being read by the kernel (e.g., for
+module/firmware loading), and write access was requested.
+.TP
+.B EWOULDBLOCK
+The
+.B O_NONBLOCK
+flag was specified, and an incompatible lease was held on the file
+(see
+.BR fcntl (2)).
+.SH VERSIONS
+The (undefined) effect of
+.B O_RDONLY | O_TRUNC
+varies among implementations.
+On many systems the file is actually truncated.
+.\" Linux 2.0, 2.5: truncate
+.\" Solaris 5.7, 5.8: truncate
+.\" Irix 6.5: truncate
+.\" Tru64 5.1B: truncate
+.\" HP-UX 11.22: truncate
+.\" FreeBSD 4.7: truncate
+.SS Synchronized I/O
+The POSIX.1-2008 "synchronized I/O" option
+specifies different variants of synchronized I/O,
+and specifies the
+.BR open ()
+flags
+.BR O_SYNC ,
+.BR O_DSYNC ,
+and
+.B O_RSYNC
+for controlling the behavior.
+Regardless of whether an implementation supports this option,
+it must at least support the use of
+.B O_SYNC
+for regular files.
+.PP
+Linux implements
+.B O_SYNC
+and
+.BR O_DSYNC ,
+but not
+.BR O_RSYNC .
+Somewhat incorrectly, glibc defines
+.B O_RSYNC
+to have the same value as
+.BR O_SYNC .
+.RB ( O_RSYNC
+is defined in the Linux header file
+.I <asm/fcntl.h>
+on HP PA-RISC, but it is not used.)
+.PP
+.B O_SYNC
+provides synchronized I/O
+.I file
+integrity completion,
+meaning write operations will flush data and all associated metadata
+to the underlying hardware.
+.B O_DSYNC
+provides synchronized I/O
+.I data
+integrity completion,
+meaning write operations will flush data
+to the underlying hardware,
+but will only flush metadata updates that are required
+to allow a subsequent read operation to complete successfully.
+Data integrity completion can reduce the number of disk operations
+that are required for applications that don't need the guarantees
+of file integrity completion.
+.PP
+To understand the difference between the two types of completion,
+consider two pieces of file metadata:
+the file last modification timestamp
+.RI ( st_mtime )
+and the file length.
+All write operations will update the last modification timestamp,
+but only writes that add data to the end of the
+file will change the file length.
+The last modification timestamp is not needed to ensure that
+a read completes successfully, but the file length is.
+Thus,
+.B O_DSYNC
+would only guarantee to flush updates to the file length metadata
+(whereas
+.B O_SYNC
+would also always flush the last modification timestamp metadata).
+.PP
+Before Linux 2.6.33, Linux implemented only the
+.B O_SYNC
+flag for
+.BR open ().
+However, when that flag was specified,
+most filesystems actually provided the equivalent of synchronized I/O
+.I data
+integrity completion (i.e.,
+.B O_SYNC
+was actually implemented as the equivalent of
+.BR O_DSYNC ).
+.PP
+Since Linux 2.6.33, proper
+.B O_SYNC
+support is provided.
+However, to ensure backward binary compatibility,
+.B O_DSYNC
+was defined with the same value as the historical
+.BR O_SYNC ,
+and
+.B O_SYNC
+was defined as a new (two-bit) flag value that includes the
+.B O_DSYNC
+flag value.
+This ensures that applications compiled against
+new headers get at least
+.B O_DSYNC
+semantics before Linux 2.6.33.
+.\"
+.SS C library/kernel differences
+Since glibc 2.26,
+the glibc wrapper function for
+.BR open ()
+employs the
+.BR openat ()
+system call, rather than the kernel's
+.BR open ()
+system call.
+For certain architectures, this is also true before glibc 2.26.
+.\"
+.SH STANDARDS
+.TP
+.BR open ()
+.TQ
+.BR creat ()
+.TQ
+.BR openat ()
+POSIX.1-2008.
+.TP
+.BR openat2 (2)
+Linux.
+.PP
+The
+.BR O_DIRECT ,
+.BR O_NOATIME ,
+.BR O_PATH ,
+and
+.B O_TMPFILE
+flags are Linux-specific.
+One must define
+.B _GNU_SOURCE
+to obtain their definitions.
+.PP
+The
+.BR O_CLOEXEC ,
+.BR O_DIRECTORY ,
+and
+.B O_NOFOLLOW
+flags are not specified in POSIX.1-2001,
+but are specified in POSIX.1-2008.
+Since glibc 2.12, one can obtain their definitions by defining either
+.B _POSIX_C_SOURCE
+with a value greater than or equal to 200809L or
+.B _XOPEN_SOURCE
+with a value greater than or equal to 700.
+In glibc 2.11 and earlier, one obtains the definitions by defining
+.BR _GNU_SOURCE .
+.SH HISTORY
+.TP
+.BR open ()
+.TQ
+.BR creat ()
+SVr4, 4.3BSD, POSIX.1-2001.
+.TP
+.BR openat ()
+POSIX.1-2008.
+Linux 2.6.16,
+glibc 2.4.
+.SH NOTES
+Under Linux, the
+.B O_NONBLOCK
+flag is sometimes used in cases where one wants to open
+but does not necessarily have the intention to read or write.
+For example,
+this may be used to open a device in order to get a file descriptor
+for use with
+.BR ioctl (2).
+.PP
+Note that
+.BR open ()
+can open device special files, but
+.BR creat ()
+cannot create them; use
+.BR mknod (2)
+instead.
+.PP
+If the file is newly created, its
+.IR st_atime ,
+.IR st_ctime ,
+and
+.I st_mtime
+fields
+(respectively, time of last access, time of last status change, and
+time of last modification; see
+.BR stat (2))
+are set
+to the current time, and so are the
+.I st_ctime
+and
+.I st_mtime
+fields of the
+parent directory.
+Otherwise, if the file is modified because of the
+.B O_TRUNC
+flag, its
+.I st_ctime
+and
+.I st_mtime
+fields are set to the current time.
+.PP
+The files in the
+.IR /proc/ pid /fd
+directory show the open file descriptors of the process with the PID
+.IR pid .
+The files in the
+.IR /proc/ pid /fdinfo
+directory show even more information about these file descriptors.
+See
+.BR proc (5)
+for further details of both of these directories.
+.PP
+The Linux header file
+.I <asm/fcntl.h>
+doesn't define
+.BR O_ASYNC ;
+the (BSD-derived)
+.B FASYNC
+synonym is defined instead.
+.\"
+.\"
+.SS Open file descriptions
+The term open file description is the one used by POSIX to refer to the
+entries in the system-wide table of open files.
+In other contexts, this object is
+variously also called an "open file object",
+a "file handle", an "open file table entry",
+or\[em]in kernel-developer parlance\[em]a
+.IR "struct file" .
+.PP
+When a file descriptor is duplicated (using
+.BR dup (2)
+or similar),
+the duplicate refers to the same open file description
+as the original file descriptor,
+and the two file descriptors consequently share
+the file offset and file status flags.
+Such sharing can also occur between processes:
+a child process created via
+.BR fork (2)
+inherits duplicates of its parent's file descriptors,
+and those duplicates refer to the same open file descriptions.
+.PP
+Each
+.BR open ()
+of a file creates a new open file description;
+thus, there may be multiple open file descriptions
+corresponding to a file inode.
+.PP
+On Linux, one can use the
+.BR kcmp (2)
+.B KCMP_FILE
+operation to test whether two file descriptors
+(in the same process or in two different processes)
+refer to the same open file description.
+.\"
+.SS NFS
+There are many infelicities in the protocol underlying NFS, affecting
+amongst others
+.BR O_SYNC " and " O_NDELAY .
+.PP
+On NFS filesystems with UID mapping enabled,
+.BR open ()
+may
+return a file descriptor but, for example,
+.BR read (2)
+requests are denied
+with
+.BR EACCES .
+This is because the client performs
+.BR open ()
+by checking the
+permissions, but UID mapping is performed by the server upon
+read and write requests.
+.\"
+.\"
+.SS FIFOs
+Opening the read or write end of a FIFO blocks until the other
+end is also opened (by another process or thread).
+See
+.BR fifo (7)
+for further details.
+.\"
+.\"
+.SS File access mode
+Unlike the other values that can be specified in
+.IR flags ,
+the
+.I "access mode"
+values
+.BR O_RDONLY ", " O_WRONLY ", and " O_RDWR
+do not specify individual bits.
+Rather, they define the low order two bits of
+.IR flags ,
+and are defined respectively as 0, 1, and 2.
+In other words, the combination
+.B "O_RDONLY | O_WRONLY"
+is a logical error, and certainly does not have the same meaning as
+.BR O_RDWR .
+.PP
+Linux reserves the special, nonstandard access mode 3 (binary 11) in
+.I flags
+to mean:
+check for read and write permission on the file and return a file descriptor
+that can't be used for reading or writing.
+This nonstandard access mode is used by some Linux drivers to return a
+file descriptor that is to be used only for device-specific
+.BR ioctl (2)
+operations.
+.\" See for example util-linux's disk-utils/setfdprm.c
+.\" For some background on access mode 3, see
+.\" http://thread.gmane.org/gmane.linux.kernel/653123
+.\" "[RFC] correct flags to f_mode conversion in __dentry_open"
+.\" LKML, 12 Mar 2008
+.\"
+.\"
+.SS Rationale for openat() and other "directory file descriptor" APIs
+.BR openat ()
+and the other system calls and library functions that take
+a directory file descriptor argument
+(i.e.,
+.BR execveat (2),
+.BR faccessat (2),
+.BR fanotify_mark (2),
+.BR fchmodat (2),
+.BR fchownat (2),
+.BR fspick (2),
+.BR fstatat (2),
+.BR futimesat (2),
+.BR linkat (2),
+.BR mkdirat (2),
+.BR mknodat (2),
+.BR mount_setattr (2),
+.BR move_mount (2),
+.BR name_to_handle_at (2),
+.BR open_tree (2),
+.BR openat2 (2),
+.BR readlinkat (2),
+.BR renameat (2),
+.BR renameat2 (2),
+.BR statx (2),
+.BR symlinkat (2),
+.BR unlinkat (2),
+.BR utimensat (2),
+.BR mkfifoat (3),
+and
+.BR scandirat (3))
+address two problems with the older interfaces that preceded them.
+Here, the explanation is in terms of the
+.BR openat ()
+call, but the rationale is analogous for the other interfaces.
+.PP
+First,
+.BR openat ()
+allows an application to avoid race conditions that could
+occur when using
+.BR open ()
+to open files in directories other than the current working directory.
+These race conditions result from the fact that some component
+of the directory prefix given to
+.BR open ()
+could be changed in parallel with the call to
+.BR open ().
+Suppose, for example, that we wish to create the file
+.I dir1/dir2/xxx.dep
+if the file
+.I dir1/dir2/xxx
+exists.
+The problem is that between the existence check and the file-creation step,
+.I dir1
+or
+.I dir2
+(which might be symbolic links)
+could be modified to point to a different location.
+Such races can be avoided by
+opening a file descriptor for the target directory,
+and then specifying that file descriptor as the
+.I dirfd
+argument of (say)
+.BR fstatat (2)
+and
+.BR openat ().
+The use of the
+.I dirfd
+file descriptor also has other benefits:
+.IP \[bu] 3
+the file descriptor is a stable reference to the directory,
+even if the directory is renamed; and
+.IP \[bu]
+the open file descriptor prevents the underlying filesystem from
+being dismounted,
+just as when a process has a current working directory on a filesystem.
+.PP
+Second,
+.BR openat ()
+allows the implementation of a per-thread "current working
+directory", via file descriptor(s) maintained by the application.
+(This functionality can also be obtained by tricks based
+on the use of
+.IR /proc/self/fd/ dirfd,
+but less efficiently.)
+.PP
+The
+.I dirfd
+argument for these APIs can be obtained by using
+.BR open ()
+or
+.BR openat ()
+to open a directory (with either the
+.B O_RDONLY
+or the
+.B O_PATH
+flag).
+Alternatively, such a file descriptor can be obtained by applying
+.BR dirfd (3)
+to a directory stream created using
+.BR opendir (3).
+.PP
+When these APIs are given a
+.I dirfd
+argument of
+.B AT_FDCWD
+or the specified pathname is absolute,
+then they handle their pathname argument in the same way as
+the corresponding conventional APIs.
+However, in this case, several of the APIs have a
+.I flags
+argument that provides access to functionality that is not available with
+the corresponding conventional APIs.
+.\"
+.\"
+.SS O_DIRECT
+The
+.B O_DIRECT
+flag may impose alignment restrictions on the length and address
+of user-space buffers and the file offset of I/Os.
+In Linux, alignment
+restrictions vary by filesystem and kernel version and might be
+absent entirely.
+The handling of misaligned
+.B O_DIRECT
+I/Os also varies;
+they can either fail with
+.B EINVAL
+or fall back to buffered I/O.
+.PP
+Since Linux 6.1,
+.B O_DIRECT
+support and alignment restrictions for a file can be queried using
+.BR statx (2),
+using the
+.B STATX_DIOALIGN
+flag.
+Support for
+.B STATX_DIOALIGN
+varies by filesystem;
+see
+.BR statx (2).
+.PP
+Some filesystems provide their own interfaces for querying
+.B O_DIRECT
+alignment restrictions,
+for example the
+.B XFS_IOC_DIOINFO
+operation in
+.BR xfsctl (3).
+.B STATX_DIOALIGN
+should be used instead when it is available.
+.PP
+If none of the above is available,
+then direct I/O support and alignment restrictions
+can only be assumed from known characteristics of the filesystem,
+the individual file,
+the underlying storage device(s),
+and the kernel version.
+In Linux 2.4,
+most filesystems based on block devices require that
+the file offset and the length and memory address of all I/O segments
+be multiples of the filesystem block size
+(typically 4096 bytes).
+In Linux 2.6.0,
+this was relaxed to the logical block size of the block device
+(typically 512 bytes).
+A block device's logical block size can be determined using the
+.BR ioctl (2)
+.B BLKSSZGET
+operation or from the shell using the command:
+.PP
+.in +4n
+.EX
+blockdev \-\-getss
+.EE
+.in
+.PP
+.B O_DIRECT
+I/Os should never be run concurrently with the
+.BR fork (2)
+system call,
+if the memory buffer is a private mapping
+(i.e., any mapping created with the
+.BR mmap (2)
+.B MAP_PRIVATE
+flag;
+this includes memory allocated on the heap and statically allocated buffers).
+Any such I/Os, whether submitted via an asynchronous I/O interface or from
+another thread in the process,
+should be completed before
+.BR fork (2)
+is called.
+Failure to do so can result in data corruption and undefined behavior in
+parent and child processes.
+This restriction does not apply when the memory buffer for the
+.B O_DIRECT
+I/Os was created using
+.BR shmat (2)
+or
+.BR mmap (2)
+with the
+.B MAP_SHARED
+flag.
+Nor does this restriction apply when the memory buffer has been advised as
+.B MADV_DONTFORK
+with
+.BR madvise (2),
+ensuring that it will not be available
+to the child after
+.BR fork (2).
+.PP
+The
+.B O_DIRECT
+flag was introduced in SGI IRIX, where it has alignment
+restrictions similar to those of Linux 2.4.
+IRIX also has a
+.BR fcntl (2)
+call to query appropriate alignments and sizes.
+FreeBSD 4.x introduced
+a flag of the same name, but without alignment restrictions.
+.PP
+.B O_DIRECT
+support was added in Linux 2.4.10.
+Older Linux kernels simply ignore this flag.
+Some filesystems may not implement the flag, in which case
+.BR open ()
+fails with the error
+.B EINVAL
+if it is used.
+.PP
+Applications should avoid mixing
+.B O_DIRECT
+and normal I/O to the same file,
+and especially to overlapping byte regions in the same file.
+Even when the filesystem correctly handles the coherency issues in
+this situation, overall I/O throughput is likely to be slower than
+using either mode alone.
+Likewise, applications should avoid mixing
+.BR mmap (2)
+of files with direct I/O to the same files.
+.PP
+The behavior of
+.B O_DIRECT
+with NFS will differ from local filesystems.
+Older kernels, or
+kernels configured in certain ways, may not support this combination.
+The NFS protocol does not support passing the flag to the server, so
+.B O_DIRECT
+I/O will bypass the page cache only on the client; the server may
+still cache the I/O.
+The client asks the server to make the I/O
+synchronous to preserve the synchronous semantics of
+.BR O_DIRECT .
+Some servers will perform poorly under these circumstances, especially
+if the I/O size is small.
+Some servers may also be configured to
+lie to clients about the I/O having reached stable storage; this
+will avoid the performance penalty at some risk to data integrity
+in the event of server power failure.
+The Linux NFS client places no alignment restrictions on
+.B O_DIRECT
+I/O.
+.PP
+In summary,
+.B O_DIRECT
+is a potentially powerful tool that should be used with caution.
+It is recommended that applications treat use of
+.B O_DIRECT
+as a performance option which is disabled by default.
+.SH BUGS
+Currently, it is not possible to enable signal-driven
+I/O by specifying
+.B O_ASYNC
+when calling
+.BR open ();
+use
+.BR fcntl (2)
+to enable this flag.
+.\" FIXME . Check bugzilla report on open(O_ASYNC)
+.\" See http://bugzilla.kernel.org/show_bug.cgi?id=5993
+.PP
+One must check for two different error codes,
+.B EISDIR
+and
+.BR ENOENT ,
+when trying to determine whether the kernel supports
+.B O_TMPFILE
+functionality.
+.PP
+When both
+.B O_CREAT
+and
+.B O_DIRECTORY
+are specified in
+.I flags
+and the file specified by
+.I pathname
+does not exist,
+.BR open ()
+will create a regular file (i.e.,
+.B O_DIRECTORY
+is ignored).
+.SH SEE ALSO
+.BR chmod (2),
+.BR chown (2),
+.BR close (2),
+.BR dup (2),
+.BR fcntl (2),
+.BR link (2),
+.BR lseek (2),
+.BR mknod (2),
+.BR mmap (2),
+.BR mount (2),
+.BR open_by_handle_at (2),
+.BR openat2 (2),
+.BR read (2),
+.BR socket (2),
+.BR stat (2),
+.BR umask (2),
+.BR unlink (2),
+.BR write (2),
+.BR fopen (3),
+.BR acl (5),
+.BR fifo (7),
+.BR inode (7),
+.BR path_resolution (7),
+.BR symlink (7)
diff --git a/man2/open_by_handle_at.2 b/man2/open_by_handle_at.2
new file mode 100644
index 0000000..b5e3d75
--- /dev/null
+++ b/man2/open_by_handle_at.2
@@ -0,0 +1,751 @@
+.\" Copyright (c) 2014 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.TH open_by_handle_at 2 2023-05-03 "Linux man-pages 6.05.01"
+.SH NAME
+name_to_handle_at, open_by_handle_at \- obtain handle
+for a pathname and open file via a handle
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#define _GNU_SOURCE" " /* See feature_test_macros(7) */"
+.B #include <fcntl.h>
+.PP
+.BI "int name_to_handle_at(int " dirfd ", const char *" pathname ,
+.BI " struct file_handle *" handle ,
+.BI " int *" mount_id ", int " flags );
+.BI "int open_by_handle_at(int " mount_fd ", struct file_handle *" handle ,
+.BI " int " flags );
+.fi
+.SH DESCRIPTION
+The
+.BR name_to_handle_at ()
+and
+.BR open_by_handle_at ()
+system calls split the functionality of
+.BR openat (2)
+into two parts:
+.BR name_to_handle_at ()
+returns an opaque handle that corresponds to a specified file;
+.BR open_by_handle_at ()
+opens the file corresponding to a handle returned by a previous call to
+.BR name_to_handle_at ()
+and returns an open file descriptor.
+.\"
+.\"
+.SS name_to_handle_at()
+The
+.BR name_to_handle_at ()
+system call returns a file handle and a mount ID corresponding to
+the file specified by the
+.I dirfd
+and
+.I pathname
+arguments.
+The file handle is returned via the argument
+.IR handle ,
+which is a pointer to a structure of the following form:
+.PP
+.in +4n
+.EX
+struct file_handle {
+ unsigned int handle_bytes; /* Size of f_handle [in, out] */
+ int handle_type; /* Handle type [out] */
+ unsigned char f_handle[0]; /* File identifier (sized by
+ caller) [out] */
+};
+.EE
+.in
+.PP
+It is the caller's responsibility to allocate the structure
+with a size large enough to hold the handle returned in
+.IR f_handle .
+Before the call, the
+.I handle_bytes
+field should be initialized to contain the allocated size for
+.IR f_handle .
+(The constant
+.BR MAX_HANDLE_SZ ,
+defined in
+.IR <fcntl.h> ,
+specifies the maximum expected size for a file handle.
+It is not a
+guaranteed upper limit as future filesystems may require more space.)
+Upon successful return, the
+.I handle_bytes
+field is updated to contain the number of bytes actually written to
+.IR f_handle .
+.PP
+The caller can discover the required size for the
+.I file_handle
+structure by making a call in which
+.I handle\->handle_bytes
+is zero;
+in this case, the call fails with the error
+.B EOVERFLOW
+and
+.I handle\->handle_bytes
+is set to indicate the required size;
+the caller can then use this information to allocate a structure
+of the correct size (see EXAMPLES below).
+Some care is needed here as
+.B EOVERFLOW
+can also indicate that no file handle is available for this particular
+name in a filesystem which does normally support file-handle lookup.
+This case can be detected when the
+.B EOVERFLOW
+error is returned without
+.I handle_bytes
+being increased.
+.PP
+Other than the use of the
+.I handle_bytes
+field, the caller should treat the
+.I file_handle
+structure as an opaque data type: the
+.I handle_type
+and
+.I f_handle
+fields are needed only by a subsequent call to
+.BR open_by_handle_at ().
+.PP
+The
+.I flags
+argument is a bit mask constructed by ORing together zero or more of
+.B AT_EMPTY_PATH
+and
+.BR AT_SYMLINK_FOLLOW ,
+described below.
+.PP
+Together, the
+.I pathname
+and
+.I dirfd
+arguments identify the file for which a handle is to be obtained.
+There are four distinct cases:
+.IP \[bu] 3
+If
+.I pathname
+is a nonempty string containing an absolute pathname,
+then a handle is returned for the file referred to by that pathname.
+In this case,
+.I dirfd
+is ignored.
+.IP \[bu]
+If
+.I pathname
+is a nonempty string containing a relative pathname and
+.I dirfd
+has the special value
+.BR AT_FDCWD ,
+then
+.I pathname
+is interpreted relative to the current working directory of the caller,
+and a handle is returned for the file to which it refers.
+.IP \[bu]
+If
+.I pathname
+is a nonempty string containing a relative pathname and
+.I dirfd
+is a file descriptor referring to a directory, then
+.I pathname
+is interpreted relative to the directory referred to by
+.IR dirfd ,
+and a handle is returned for the file to which it refers.
+(See
+.BR openat (2)
+for an explanation of why "directory file descriptors" are useful.)
+.IP \[bu]
+If
+.I pathname
+is an empty string and
+.I flags
+specifies the value
+.BR AT_EMPTY_PATH ,
+then
+.I dirfd
+can be an open file descriptor referring to any type of file,
+or
+.BR AT_FDCWD ,
+meaning the current working directory,
+and a handle is returned for the file to which it refers.
+.PP
+The
+.I mount_id
+argument returns an identifier for the filesystem
+mount that corresponds to
+.IR pathname .
+This corresponds to the first field in one of the records in
+.IR /proc/self/mountinfo .
+Opening the pathname in the fifth field of that record yields a file
+descriptor for the mount point;
+that file descriptor can be used in a subsequent call to
+.BR open_by_handle_at ().
+.I mount_id
+is returned both for a successful call and for a call that results
+in the error
+.BR EOVERFLOW .
+.PP
+By default,
+.BR name_to_handle_at ()
+does not dereference
+.I pathname
+if it is a symbolic link, and thus returns a handle for the link itself.
+If
+.B AT_SYMLINK_FOLLOW
+is specified in
+.IR flags ,
+.I pathname
+is dereferenced if it is a symbolic link
+(so that the call returns a handle for the file referred to by the link).
+.PP
+.BR name_to_handle_at ()
+does not trigger a mount when the final component of the pathname is an
+automount point.
+When a filesystem supports both file handles and
+automount points, a
+.BR name_to_handle_at ()
+call on an automount point will return with error
+.B EOVERFLOW
+without having increased
+.IR handle_bytes .
+This can happen since Linux 4.13
+.\" commit 20fa19027286983ab2734b5910c4a687436e0c31
+with NFS when accessing a directory
+which is on a separate filesystem on the server.
+In this case, the automount can be triggered by adding a "/" to the end
+of the pathname.
+.SS open_by_handle_at()
+The
+.BR open_by_handle_at ()
+system call opens the file referred to by
+.IR handle ,
+a file handle returned by a previous call to
+.BR name_to_handle_at ().
+.PP
+The
+.I mount_fd
+argument is a file descriptor for any object (file, directory, etc.)
+in the mounted filesystem with respect to which
+.I handle
+should be interpreted.
+The special value
+.B AT_FDCWD
+can be specified, meaning the current working directory of the caller.
+.PP
+The
+.I flags
+argument
+is as for
+.BR open (2).
+If
+.I handle
+refers to a symbolic link, the caller must specify the
+.B O_PATH
+flag, and the symbolic link is not dereferenced; the
+.B O_NOFOLLOW
+flag, if specified, is ignored.
+.PP
+The caller must have the
+.B CAP_DAC_READ_SEARCH
+capability to invoke
+.BR open_by_handle_at ().
+.SH RETURN VALUE
+On success,
+.BR name_to_handle_at ()
+returns 0,
+and
+.BR open_by_handle_at ()
+returns a file descriptor (a nonnegative integer).
+.PP
+In the event of an error, both system calls return \-1 and set
+.I errno
+to indicate the error.
+.SH ERRORS
+.BR name_to_handle_at ()
+and
+.BR open_by_handle_at ()
+can fail for the same errors as
+.BR openat (2).
+In addition, they can fail with the errors noted below.
+.PP
+.BR name_to_handle_at ()
+can fail with the following errors:
+.TP
+.B EFAULT
+.IR pathname ,
+.IR mount_id ,
+or
+.I handle
+points outside your accessible address space.
+.TP
+.B EINVAL
+.I flags
+includes an invalid bit value.
+.TP
+.B EINVAL
+.I handle\->handle_bytes
+is greater than
+.BR MAX_HANDLE_SZ .
+.TP
+.B ENOENT
+.I pathname
+is an empty string, but
+.B AT_EMPTY_PATH
+was not specified in
+.IR flags .
+.TP
+.B ENOTDIR
+The file descriptor supplied in
+.I dirfd
+does not refer to a directory,
+and it is not the case that both
+.I flags
+includes
+.B AT_EMPTY_PATH
+and
+.I pathname
+is an empty string.
+.TP
+.B EOPNOTSUPP
+The filesystem does not support decoding of a pathname to a file handle.
+.TP
+.B EOVERFLOW
+The
+.I handle\->handle_bytes
+value passed into the call was too small.
+When this error occurs,
+.I handle\->handle_bytes
+is updated to indicate the required size for the handle.
+.\"
+.\"
+.PP
+.BR open_by_handle_at ()
+can fail with the following errors:
+.TP
+.B EBADF
+.I mount_fd
+is not an open file descriptor.
+.TP
+.B EBADF
+.I pathname
+is relative but
+.I dirfd
+is neither
+.B AT_FDCWD
+nor a valid file descriptor.
+.TP
+.B EFAULT
+.I handle
+points outside your accessible address space.
+.TP
+.B EINVAL
+.I handle\->handle_bytes
+is greater than
+.B MAX_HANDLE_SZ
+or is equal to zero.
+.TP
+.B ELOOP
+.I handle
+refers to a symbolic link, but
+.B O_PATH
+was not specified in
+.IR flags .
+.TP
+.B EPERM
+The caller does not have the
+.B CAP_DAC_READ_SEARCH
+capability.
+.TP
+.B ESTALE
+The specified
+.I handle
+is not valid.
+This error will occur if, for example, the file has been deleted.
+.SH VERSIONS
+FreeBSD has a broadly similar pair of system calls in the form of
+.BR getfh ()
+and
+.BR openfh ().
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 2.6.39,
+glibc 2.14.
+.SH NOTES
+A file handle can be generated in one process using
+.BR name_to_handle_at ()
+and later used in a different process that calls
+.BR open_by_handle_at ().
+.PP
+Some filesystems don't support the translation of pathnames to
+file handles, for example,
+.IR /proc ,
+.IR /sys ,
+and various network filesystems.
+.PP
+A file handle may become invalid ("stale") if a file is deleted,
+or for other filesystem-specific reasons.
+Invalid handles are indicated by an
+.B ESTALE
+error from
+.BR open_by_handle_at ().
+.PP
+These system calls are designed for use by user-space file servers.
+For example, a user-space NFS server might generate a file handle
+and pass it to an NFS client.
+Later, when the client wants to open the file,
+it could pass the handle back to the server.
+.\" https://lwn.net/Articles/375888/
+.\" "Open by handle" - Jonathan Corbet, 2010-02-23
+This sort of functionality allows a user-space file server to operate in
+a stateless fashion with respect to the files it serves.
+.PP
+If
+.I pathname
+refers to a symbolic link and
+.I flags
+does not specify
+.BR AT_SYMLINK_FOLLOW ,
+then
+.BR name_to_handle_at ()
+returns a handle for the link (rather than the file to which it refers).
+.\" commit bcda76524cd1fa32af748536f27f674a13e56700
+The process receiving the handle can later perform operations
+on the symbolic link by converting the handle to a file descriptor using
+.BR open_by_handle_at ()
+with the
+.B O_PATH
+flag, and then passing the file descriptor as the
+.I dirfd
+argument in system calls such as
+.BR readlinkat (2)
+and
+.BR fchownat (2).
+.SS Obtaining a persistent filesystem ID
+The mount IDs in
+.I /proc/self/mountinfo
+can be reused as filesystems are unmounted and mounted.
+Therefore, the mount ID returned by
+.BR name_to_handle_at ()
+(in
+.IR *mount_id )
+should not be treated as a persistent identifier
+for the corresponding mounted filesystem.
+However, an application can use the information in the
+.I mountinfo
+record that corresponds to the mount ID
+to derive a persistent identifier.
+.PP
+For example, one can use the device name in the mount source field of the
+.I mountinfo
+record to search for the corresponding device UUID via the symbolic links in
+.IR /dev/disk/by\-uuid .
+(A more comfortable way of obtaining the UUID is to use the
+.\" e.g., http://stackoverflow.com/questions/6748429/using-libblkid-to-find-uuid-of-a-partition
+.BR libblkid (3)
+library.)
+That process can then be reversed,
+using the UUID to look up the device name,
+and then obtaining the corresponding mount point,
+in order to produce the
+.I mount_fd
+argument used by
+.BR open_by_handle_at ().
+.SH EXAMPLES
+The two programs below demonstrate the use of
+.BR name_to_handle_at ()
+and
+.BR open_by_handle_at ().
+The first program
+.RI ( t_name_to_handle_at.c )
+uses
+.BR name_to_handle_at ()
+to obtain the file handle and mount ID
+for the file specified in its command-line argument;
+the handle and mount ID are written to standard output.
+.PP
+The second program
+.RI ( t_open_by_handle_at.c )
+reads a mount ID and file handle from standard input.
+The program then employs
+.BR open_by_handle_at ()
+to open the file using that handle.
+If an optional command-line argument is supplied, then the
+.I mount_fd
+argument for
+.BR open_by_handle_at ()
+is obtained by opening the directory named in that argument.
+Otherwise,
+.I mount_fd
+is obtained by scanning
+.I /proc/self/mountinfo
+to find a record whose mount ID matches the mount ID
+read from standard input,
+and the mount directory specified in that record is opened.
+(These programs do not deal with the fact that mount IDs are not persistent.)
+.PP
+The following shell session demonstrates the use of these two programs:
+.PP
+.in +4n
+.EX
+$ \fBecho \[aq]Can you please think about it?\[aq] > cecilia.txt\fP
+$ \fB./t_name_to_handle_at cecilia.txt > fh\fP
+$ \fB./t_open_by_handle_at < fh\fP
+open_by_handle_at: Operation not permitted
+$ \fBsudo ./t_open_by_handle_at < fh\fP # Need CAP_DAC_READ_SEARCH
+Read 31 bytes
+$ \fBrm cecilia.txt\fP
+.EE
+.in
+.PP
+Now we delete and (quickly) re-create the file so that
+it has the same content and (by chance) the same inode.
+Nevertheless,
+.BR open_by_handle_at ()
+.\" Christoph Hellwig: That's why the file handles contain a generation
+.\" counter that gets incremented in this case.
+recognizes that the original file referred to by the file handle
+no longer exists.
+.PP
+.in +4n
+.EX
+$ \fBstat \-\-printf="%i\en" cecilia.txt\fP # Display inode number
+4072121
+$ \fBrm cecilia.txt\fP
+$ \fBecho \[aq]Can you please think about it?\[aq] > cecilia.txt\fP
+$ \fBstat \-\-printf="%i\en" cecilia.txt\fP # Check inode number
+4072121
+$ \fBsudo ./t_open_by_handle_at < fh\fP
+open_by_handle_at: Stale NFS file handle
+.EE
+.in
+.SS Program source: t_name_to_handle_at.c
+\&
+.\" SRC BEGIN (t_name_to_handle_at.c)
+.EX
+#define _GNU_SOURCE
+#include <err.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+\&
+int
+main(int argc, char *argv[])
+{
+ int mount_id, fhsize, flags, dirfd;
+ char *pathname;
+ struct file_handle *fhp;
+\&
+ if (argc != 2) {
+ fprintf(stderr, "Usage: %s pathname\en", argv[0]);
+ exit(EXIT_FAILURE);
+ }
+\&
+ pathname = argv[1];
+\&
+ /* Allocate file_handle structure. */
+\&
+ fhsize = sizeof(*fhp);
+ fhp = malloc(fhsize);
+ if (fhp == NULL)
+ err(EXIT_FAILURE, "malloc");
+\&
+ /* Make an initial call to name_to_handle_at() to discover
+ the size required for file handle. */
+\&
+ dirfd = AT_FDCWD; /* For name_to_handle_at() calls */
+ flags = 0; /* For name_to_handle_at() calls */
+ fhp\->handle_bytes = 0;
+ if (name_to_handle_at(dirfd, pathname, fhp,
+ &mount_id, flags) != \-1
+ || errno != EOVERFLOW)
+ {
+ fprintf(stderr, "Unexpected result from name_to_handle_at()\en");
+ exit(EXIT_FAILURE);
+ }
+\&
+ /* Reallocate file_handle structure with correct size. */
+\&
+ fhsize = sizeof(*fhp) + fhp\->handle_bytes;
+ fhp = realloc(fhp, fhsize); /* Copies fhp\->handle_bytes */
+ if (fhp == NULL)
+ err(EXIT_FAILURE, "realloc");
+\&
+ /* Get file handle from pathname supplied on command line. */
+\&
+ if (name_to_handle_at(dirfd, pathname, fhp, &mount_id, flags) == \-1)
+ err(EXIT_FAILURE, "name_to_handle_at");
+\&
+ /* Write mount ID, file handle size, and file handle to stdout,
+ for later reuse by t_open_by_handle_at.c. */
+\&
+ printf("%d\en", mount_id);
+ printf("%u %d ", fhp\->handle_bytes, fhp\->handle_type);
+ for (size_t j = 0; j < fhp\->handle_bytes; j++)
+ printf(" %02x", fhp\->f_handle[j]);
+ printf("\en");
+\&
+ exit(EXIT_SUCCESS);
+}
+.EE
+.\" SRC END
+.SS Program source: t_open_by_handle_at.c
+\&
+.\" SRC BEGIN (t_open_by_handle_at.c)
+.EX
+#define _GNU_SOURCE
+#include <err.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+\&
+/* Scan /proc/self/mountinfo to find the line whose mount ID matches
+ \[aq]mount_id\[aq]. (An easier way to do this is to install and use the
+ \[aq]libmount\[aq] library provided by the \[aq]util\-linux\[aq] project.)
+ Open the corresponding mount path and return the resulting file
+ descriptor. */
+\&
+static int
+open_mount_path_by_id(int mount_id)
+{
+ int mi_mount_id, found;
+ char mount_path[PATH_MAX];
+ char *linep;
+ FILE *fp;
+ size_t lsize;
+ ssize_t nread;
+\&
+ fp = fopen("/proc/self/mountinfo", "r");
+ if (fp == NULL)
+ err(EXIT_FAILURE, "fopen");
+\&
+ found = 0;
+ linep = NULL;
+ while (!found) {
+ nread = getline(&linep, &lsize, fp);
+ if (nread == \-1)
+ break;
+\&
+ nread = sscanf(linep, "%d %*d %*s %*s %s",
+ &mi_mount_id, mount_path);
+ if (nread != 2) {
+ fprintf(stderr, "Bad sscanf()\en");
+ exit(EXIT_FAILURE);
+ }
+\&
+ if (mi_mount_id == mount_id)
+ found = 1;
+ }
+ free(linep);
+\&
+ fclose(fp);
+\&
+ if (!found) {
+ fprintf(stderr, "Could not find mount point\en");
+ exit(EXIT_FAILURE);
+ }
+\&
+ return open(mount_path, O_RDONLY);
+}
+\&
+int
+main(int argc, char *argv[])
+{
+ int mount_id, fd, mount_fd, handle_bytes;
+ char buf[1000];
+#define LINE_SIZE 100
+ char line1[LINE_SIZE], line2[LINE_SIZE];
+ char *nextp;
+ ssize_t nread;
+ struct file_handle *fhp;
+\&
+ if ((argc > 1 && strcmp(argv[1], "\-\-help") == 0) || argc > 2) {
+ fprintf(stderr, "Usage: %s [mount\-path]\en", argv[0]);
+ exit(EXIT_FAILURE);
+ }
+\&
+ /* Standard input contains mount ID and file handle information:
+\&
+ Line 1: <mount_id>
+ Line 2: <handle_bytes> <handle_type> <bytes of handle in hex>
+ */
+\&
+ if (fgets(line1, sizeof(line1), stdin) == NULL ||
+ fgets(line2, sizeof(line2), stdin) == NULL)
+ {
+ fprintf(stderr, "Missing mount_id / file handle\en");
+ exit(EXIT_FAILURE);
+ }
+\&
+ mount_id = atoi(line1);
+\&
+ handle_bytes = strtoul(line2, &nextp, 0);
+\&
+ /* Given handle_bytes, we can now allocate file_handle structure. */
+\&
+ fhp = malloc(sizeof(*fhp) + handle_bytes);
+ if (fhp == NULL)
+ err(EXIT_FAILURE, "malloc");
+\&
+ fhp\->handle_bytes = handle_bytes;
+\&
+ fhp\->handle_type = strtoul(nextp, &nextp, 0);
+\&
+ for (size_t j = 0; j < fhp\->handle_bytes; j++)
+ fhp\->f_handle[j] = strtoul(nextp, &nextp, 16);
+\&
+ /* Obtain file descriptor for mount point, either by opening
+ the pathname specified on the command line, or by scanning
+ /proc/self/mountinfo to find a mount that matches the \[aq]mount_id\[aq]
+ that we received from stdin. */
+\&
+ if (argc > 1)
+ mount_fd = open(argv[1], O_RDONLY);
+ else
+ mount_fd = open_mount_path_by_id(mount_id);
+\&
+ if (mount_fd == \-1)
+ err(EXIT_FAILURE, "opening mount fd");
+\&
+ /* Open file using handle and mount point. */
+\&
+ fd = open_by_handle_at(mount_fd, fhp, O_RDONLY);
+ if (fd == \-1)
+ err(EXIT_FAILURE, "open_by_handle_at");
+\&
+ /* Try reading a few bytes from the file. */
+\&
+ nread = read(fd, buf, sizeof(buf));
+ if (nread == \-1)
+ err(EXIT_FAILURE, "read");
+\&
+ printf("Read %zd bytes\en", nread);
+\&
+ exit(EXIT_SUCCESS);
+}
+.EE
+.\" SRC END
+.SH SEE ALSO
+.BR open (2),
+.BR libblkid (3),
+.BR blkid (8),
+.BR findfs (8),
+.BR mount (8)
+.PP
+The
+.I libblkid
+and
+.I libmount
+documentation in the latest
+.I util\-linux
+release at
+.UR https://www.kernel.org/pub/linux/utils/util\-linux/
+.UE
diff --git a/man2/openat.2 b/man2/openat.2
new file mode 100644
index 0000000..604e121
--- /dev/null
+++ b/man2/openat.2
@@ -0,0 +1 @@
+.so man2/open.2
diff --git a/man2/openat2.2 b/man2/openat2.2
new file mode 100644
index 0000000..b98bbaf
--- /dev/null
+++ b/man2/openat2.2
@@ -0,0 +1,582 @@
+.\" Copyright (C) 2019 Aleksa Sarai <cyphar@cyphar.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.TH openat2 2 2023-04-23 "Linux man-pages 6.05.01"
+.SH NAME
+openat2 \- open and possibly create a file (extended)
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#include <fcntl.h>" \
+" /* Definition of " O_* " and " S_* " constants */"
+.BR "#include <linux/openat2.h>" " /* Definition of " RESOLVE_* " constants */"
+.BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */"
+.B #include <unistd.h>
+.PP
+.BI "long syscall(SYS_openat2, int " dirfd ", const char *" pathname ,
+.BI " struct open_how *" how ", size_t " size );
+.fi
+.PP
+.IR Note :
+glibc provides no wrapper for
+.BR openat2 (),
+necessitating the use of
+.BR syscall (2).
+.SH DESCRIPTION
+The
+.BR openat2 ()
+system call is an extension of
+.BR openat (2)
+and provides a superset of its functionality.
+.PP
+The
+.BR openat2 ()
+system call opens the file specified by
+.IR pathname .
+If the specified file does not exist, it may optionally (if
+.B O_CREAT
+is specified in
+.IR how.flags )
+be created.
+.PP
+As with
+.BR openat (2),
+if
+.I pathname
+is a relative pathname, then it is interpreted relative to the
+directory referred to by the file descriptor
+.I dirfd
+(or the current working directory of the calling process, if
+.I dirfd
+is the special value
+.BR AT_FDCWD ).
+If
+.I pathname
+is an absolute pathname, then
+.I dirfd
+is ignored (unless
+.I how.resolve
+contains
+.BR RESOLVE_IN_ROOT ,
+in which case
+.I pathname
+is resolved relative to
+.IR dirfd ).
+.PP
+Rather than taking a single
+.I flags
+argument, an extensible structure (\fIhow\fP) is passed to allow for
+future extensions.
+The
+.I size
+argument must be specified as
+.IR "sizeof(struct open_how)" .
+.\"
+.SS The open_how structure
+The
+.I how
+argument specifies how
+.I pathname
+should be opened, and acts as a superset of the
+.I flags
+and
+.I mode
+arguments to
+.BR openat (2).
+This argument is a pointer to an
+.I open_how
+structure,
+described in
+.BR open_how (2type).
+.PP
+Any future extensions to
+.BR openat2 ()
+will be implemented as new fields appended to the
+.I open_how
+structure,
+with a zero value in a new field resulting in the kernel behaving
+as though that extension field was not present.
+Therefore, the caller
+.I must
+zero-fill this structure on
+initialization.
+(See the "Extensibility" section of the
+.B NOTES
+for more detail on why this is necessary.)
+.PP
+The fields of the
+.I open_how
+structure are as follows:
+.TP
+.I flags
+This field specifies
+the file creation and file status flags to use when opening the file.
+All of the
+.B O_*
+flags defined for
+.BR openat (2)
+are valid
+.BR openat2 ()
+flag values.
+.IP
+Whereas
+.BR openat (2)
+ignores unknown bits in its
+.I flags
+argument,
+.BR openat2 ()
+returns an error if unknown or conflicting flags are specified in
+.IR how.flags .
+.TP
+.I mode
+This field specifies the
+mode for the new file, with identical semantics to the
+.I mode
+argument of
+.BR openat (2).
+.IP
+Whereas
+.BR openat (2)
+ignores bits other than those in the range
+.I 07777
+in its
+.I mode
+argument,
+.BR openat2 ()
+returns an error if
+.I how.mode
+contains bits other than
+.IR 07777 .
+Similarly, an error is returned if
+.BR openat2 ()
+is called with a nonzero
+.I how.mode
+and
+.I how.flags
+does not contain
+.B O_CREAT
+or
+.BR O_TMPFILE .
+.TP
+.I resolve
+This is a bit-mask of flags that modify the way in which
+.B all
+components of
+.I pathname
+will be resolved.
+(See
+.BR path_resolution (7)
+for background information.)
+.IP
+The primary use case for these flags is to allow trusted programs to restrict
+how untrusted paths (or paths inside untrusted directories) are resolved.
+The full list of
+.I resolve
+flags is as follows:
+.RS
+.TP
+.B RESOLVE_BENEATH
+.\" commit adb21d2b526f7f196b2f3fdca97d80ba05dd14a0
+Do not permit the path resolution to succeed if any component of the resolution
+is not a descendant of the directory indicated by
+.IR dirfd .
+This causes absolute symbolic links (and absolute values of
+.IR pathname )
+to be rejected.
+.IP
+Currently, this flag also disables magic-link resolution (see below).
+However, this may change in the future.
+Therefore, to ensure that magic links are not resolved,
+the caller should explicitly specify
+.BR RESOLVE_NO_MAGICLINKS .
+.TP
+.B RESOLVE_IN_ROOT
+.\" commit 8db52c7e7ee1bd861b6096fcafc0fe7d0f24a994
+Treat the directory referred to by
+.I dirfd
+as the root directory while resolving
+.IR pathname .
+Absolute symbolic links are interpreted relative to
+.IR dirfd .
+If a prefix component of
+.I pathname
+equates to
+.IR dirfd ,
+then an immediately following
+.I ..\&
+component likewise equates to
+.I dirfd
+(just as
+.I /..\&
+is traditionally equivalent to
+.IR / ).
+If
+.I pathname
+is an absolute path, it is also interpreted relative to
+.IR dirfd .
+.IP
+The effect of this flag is as though the calling process had used
+.BR chroot (2)
+to (temporarily) modify its root directory (to the directory
+referred to by
+.IR dirfd ).
+However, unlike
+.BR chroot (2)
+(which changes the filesystem root permanently for a process),
+.B RESOLVE_IN_ROOT
+allows a program to efficiently restrict path resolution on a per-open basis.
+.IP
+Currently, this flag also disables magic-link resolution.
+However, this may change in the future.
+Therefore, to ensure that magic links are not resolved,
+the caller should explicitly specify
+.BR RESOLVE_NO_MAGICLINKS .
+.TP
+.B RESOLVE_NO_MAGICLINKS
+.\" commit 278121417a72d87fb29dd8c48801f80821e8f75a
+Disallow all magic-link resolution during path resolution.
+.IP
+Magic links are symbolic link-like objects that are most notably found in
+.BR proc (5);
+examples include
+.IR /proc/ pid /exe
+and
+.IR /proc/ pid /fd/* .
+(See
+.BR symlink (7)
+for more details.)
+.IP
+Unknowingly opening magic links can be risky for some applications.
+Examples of such risks include the following:
+.RS
+.IP \[bu] 3
+If the process opening a pathname is a controlling process that
+currently has no controlling terminal (see
+.BR credentials (7)),
+then opening a magic link inside
+.IR /proc/ pid /fd
+that happens to refer to a terminal
+would cause the process to acquire a controlling terminal.
+.IP \[bu]
+.\" From https://lwn.net/Articles/796868/:
+.\" The presence of this flag will prevent a path lookup operation
+.\" from traversing through one of these magic links, thus blocking
+.\" (for example) attempts to escape from a container via a /proc
+.\" entry for an open file descriptor.
+In a containerized environment,
+a magic link inside
+.I /proc
+may refer to an object outside the container,
+and thus may provide a means to escape from the container.
+.RE
+.IP
+Because of such risks,
+an application may prefer to disable magic link resolution using the
+.B RESOLVE_NO_MAGICLINKS
+flag.
+.IP
+If the trailing component (i.e., basename) of
+.I pathname
+is a magic link,
+.I how.resolve
+contains
+.BR RESOLVE_NO_MAGICLINKS ,
+and
+.I how.flags
+contains both
+.B O_PATH
+and
+.BR O_NOFOLLOW ,
+then an
+.B O_PATH
+file descriptor referencing the magic link will be returned.
+.TP
+.B RESOLVE_NO_SYMLINKS
+.\" commit 278121417a72d87fb29dd8c48801f80821e8f75a
+Disallow resolution of symbolic links during path resolution.
+This option implies
+.BR RESOLVE_NO_MAGICLINKS .
+.IP
+If the trailing component (i.e., basename) of
+.I pathname
+is a symbolic link,
+.I how.resolve
+contains
+.BR RESOLVE_NO_SYMLINKS ,
+and
+.I how.flags
+contains both
+.B O_PATH
+and
+.BR O_NOFOLLOW ,
+then an
+.B O_PATH
+file descriptor referencing the symbolic link will be returned.
+.IP
+Note that the effect of the
+.B RESOLVE_NO_SYMLINKS
+flag,
+which affects the treatment of symbolic links in all of the components of
+.IR pathname ,
+differs from the effect of the
+.B O_NOFOLLOW
+file creation flag (in
+.IR how.flags ),
+which affects the handling of symbolic links only in the final component of
+.IR pathname .
+.IP
+Applications that employ the
+.B RESOLVE_NO_SYMLINKS
+flag are encouraged to make its use configurable
+(unless it is used for a specific security purpose),
+as symbolic links are very widely used by end-users.
+Setting this flag indiscriminately\[em]i.e.,
+for purposes not specifically related to security\[em]for all uses of
+.BR openat2 ()
+may result in spurious errors on previously functional systems.
+This may occur if, for example,
+a system pathname that is used by an application is modified
+(e.g., in a new distribution release)
+so that a pathname component (now) contains a symbolic link.
+.TP
+.B RESOLVE_NO_XDEV
+.\" commit 72ba29297e1439efaa54d9125b866ae9d15df339
+Disallow traversal of mount points during path resolution (including all bind
+mounts).
+Consequently,
+.I pathname
+must either be on the same mount as the directory referred to by
+.IR dirfd ,
+or on the same mount as the current working directory if
+.I dirfd
+is specified as
+.BR AT_FDCWD .
+.IP
+Applications that employ the
+.B RESOLVE_NO_XDEV
+flag are encouraged to make its use configurable (unless it is
+used for a specific security purpose),
+as bind mounts are widely used by end-users.
+Setting this flag indiscriminately\[em]i.e.,
+for purposes not specifically related to security\[em]for all uses of
+.BR openat2 ()
+may result in spurious errors on previously functional systems.
+This may occur if, for example,
+a system pathname that is used by an application is modified
+(e.g., in a new distribution release)
+so that a pathname component (now) contains a bind mount.
+.TP
+.B RESOLVE_CACHED
+Make the open operation fail unless all path components are already present
+in the kernel's lookup cache.
+If any kind of revalidation or I/O is needed to satisfy the lookup,
+.BR openat2 ()
+fails with the error
+.BR EAGAIN .
+This is useful in providing a fast-path open that can be performed without
+resorting to thread offload, or other mechanisms that an application might
+use to offload slower operations.
+.RE
+.IP
+If any bits other than those listed above are set in
+.IR how.resolve ,
+an error is returned.
+.SH RETURN VALUE
+On success, a new file descriptor is returned.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+The set of errors returned by
+.BR openat2 ()
+includes all of the errors returned by
+.BR openat (2),
+as well as the following additional errors:
+.TP
+.B E2BIG
+An extension that this kernel does not support was specified in
+.IR how .
+(See the "Extensibility" section of
+.B NOTES
+for more detail on how extensions are handled.)
+.TP
+.B EAGAIN
+.I how.resolve
+contains either
+.B RESOLVE_IN_ROOT
+or
+.BR RESOLVE_BENEATH ,
+and the kernel could not ensure that a ".." component didn't escape (due to a
+race condition or potential attack).
+The caller may choose to retry the
+.BR openat2 ()
+call.
+.TP
+.B EAGAIN
+.B RESOLVE_CACHED
+was set, and the open operation cannot be performed using only cached
+information.
+The caller should retry without
+.B RESOLVE_CACHED
+set in
+.IR how.resolve .
+.TP
+.B EINVAL
+An unknown flag or invalid value was specified in
+.IR how .
+.TP
+.B EINVAL
+.I how.mode
+is nonzero, but
+.I how.flags
+does not contain
+.B O_CREAT
+or
+.BR O_TMPFILE .
+.TP
+.B EINVAL
+.I size
+was smaller than any known version of
+.IR "struct open_how" .
+.TP
+.B ELOOP
+.I how.resolve
+contains
+.BR RESOLVE_NO_SYMLINKS ,
+and one of the path components was a symbolic link (or magic link).
+.TP
+.B ELOOP
+.I how.resolve
+contains
+.BR RESOLVE_NO_MAGICLINKS ,
+and one of the path components was a magic link.
+.TP
+.B EXDEV
+.I how.resolve
+contains either
+.B RESOLVE_IN_ROOT
+or
+.BR RESOLVE_BENEATH ,
+and an escape from the root during path resolution was detected.
+.TP
+.B EXDEV
+.I how.resolve
+contains
+.BR RESOLVE_NO_XDEV ,
+and a path component crosses a mount point.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 5.6.
+.\" commit fddb5d430ad9fa91b49b1d34d0202ffe2fa0e179
+.PP
+The semantics of
+.B RESOLVE_BENEATH
+were modeled after FreeBSD's
+.BR O_BENEATH .
+.SH NOTES
+.SS Extensibility
+In order to allow for future extensibility,
+.BR openat2 ()
+requires the user-space application to specify the size of the
+.I open_how
+structure that it is passing.
+By providing this information, it is possible for
+.BR openat2 ()
+to provide both forwards- and backwards-compatibility, with
+.I size
+acting as an implicit version number.
+(Because new extension fields will always
+be appended, the structure size will always increase.)
+This extensibility design is very similar to other system calls such as
+.BR sched_setattr (2),
+.BR perf_event_open (2),
+and
+.BR clone3 (2).
+.PP
+If we let
+.I usize
+be the size of the structure as specified by the user-space application, and
+.I ksize
+be the size of the structure which the kernel supports, then there are
+three cases to consider:
+.IP \[bu] 3
+If
+.I ksize
+equals
+.IR usize ,
+then there is no version mismatch and
+.I how
+can be used verbatim.
+.IP \[bu]
+If
+.I ksize
+is larger than
+.IR usize ,
+then there are some extension fields that the kernel supports
+which the user-space application
+is unaware of.
+Because a zero value in any added extension field signifies a no-op,
+the kernel
+treats all of the extension fields not provided by the user-space application
+as having zero values.
+This provides backwards-compatibility.
+.IP \[bu]
+If
+.I ksize
+is smaller than
+.IR usize ,
+then there are some extension fields which the user-space application
+is aware of but which the kernel does not support.
+Because any extension field must have its zero values signify a no-op,
+the kernel can
+safely ignore the unsupported extension fields if they are all-zero.
+If any unsupported extension fields are nonzero, then \-1 is returned and
+.I errno
+is set to
+.BR E2BIG .
+This provides forwards-compatibility.
+.PP
+Because the definition of
+.I struct open_how
+may change in the future (with new fields being added when system headers are
+updated), user-space applications should zero-fill
+.I struct open_how
+to ensure that recompiling the program with new headers will not result in
+spurious errors at run time.
+The simplest way is to use a designated
+initializer:
+.PP
+.in +4n
+.EX
+struct open_how how = { .flags = O_RDWR,
+ .resolve = RESOLVE_IN_ROOT };
+.EE
+.in
+.PP
+or explicitly using
+.BR memset (3)
+or similar:
+.PP
+.in +4n
+.EX
+struct open_how how;
+memset(&how, 0, sizeof(how));
+how.flags = O_RDWR;
+how.resolve = RESOLVE_IN_ROOT;
+.EE
+.in
+.PP
+A user-space application that wishes to determine which extensions
+the running kernel supports can do so by conducting a binary search on
+.I size
+with a structure which has every byte nonzero (to find the largest value
+which doesn't produce an error of
+.BR E2BIG ).
+.SH SEE ALSO
+.BR openat (2),
+.BR open_how (2type),
+.BR path_resolution (7),
+.BR symlink (7)
diff --git a/man2/outb.2 b/man2/outb.2
new file mode 100644
index 0000000..4a3f877
--- /dev/null
+++ b/man2/outb.2
@@ -0,0 +1,84 @@
+.\" Copyright (c) 1995 Paul Gortmaker
+.\" (gpg109@rsphy1.anu.edu.au)
+.\" Wed Nov 29 10:58:54 EST 1995
+.\"
+.\" SPDX-License-Identifier: GPL-2.0-or-later
+.\"
+.TH outb 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+outb, outw, outl, outsb, outsw, outsl,
+inb, inw, inl, insb, insw, insl,
+outb_p, outw_p, outl_p, inb_p, inw_p, inl_p \- port I/O
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/io.h>
+.PP
+.BI "unsigned char inb(unsigned short " port );
+.BI "unsigned char inb_p(unsigned short " port );
+.BI "unsigned short inw(unsigned short " port );
+.BI "unsigned short inw_p(unsigned short " port );
+.BI "unsigned int inl(unsigned short " port );
+.BI "unsigned int inl_p(unsigned short " port );
+.PP
+.BI "void outb(unsigned char " value ", unsigned short " port );
+.BI "void outb_p(unsigned char " value ", unsigned short " port );
+.BI "void outw(unsigned short " value ", unsigned short " port );
+.BI "void outw_p(unsigned short " value ", unsigned short " port );
+.BI "void outl(unsigned int " value ", unsigned short " port );
+.BI "void outl_p(unsigned int " value ", unsigned short " port );
+.PP
+.BI "void insb(unsigned short " port ", void " addr [. count ],
+.BI " unsigned long " count );
+.BI "void insw(unsigned short " port ", void " addr [. count ],
+.BI " unsigned long " count );
+.BI "void insl(unsigned short " port ", void " addr [. count ],
+.BI " unsigned long " count );
+.BI "void outsb(unsigned short " port ", const void " addr [. count ],
+.BI " unsigned long " count );
+.BI "void outsw(unsigned short " port ", const void " addr [. count ],
+.BI " unsigned long " count );
+.BI "void outsl(unsigned short " port ", const void " addr [. count ],
+.BI " unsigned long " count );
+.fi
+.SH DESCRIPTION
+This family of functions is used to do low-level port input and output.
+The out* functions do port output, the in* functions do port input;
+the b-suffix functions are byte-width and the w-suffix functions
+word-width; the _p-suffix functions pause until the I/O completes.
+.PP
+They are primarily designed for internal kernel use,
+but can be used from user space.
+.\" , given the following information
+.\" in addition to that given in
+.\" .BR outb (9).
+.PP
+You must compile with \fB\-O\fP or \fB\-O2\fP or similar.
+The functions
+are defined as inline macros, and will not be substituted in without
+optimization enabled, causing unresolved references at link time.
+.PP
+You use
+.BR ioperm (2)
+or alternatively
+.BR iopl (2)
+to tell the kernel to allow the user space application to access the
+I/O ports in question.
+Failure to do this will cause the application
+to receive a segmentation fault.
+.SH VERSIONS
+.BR outb ()
+and friends are hardware-specific.
+The
+.I value
+argument is passed first and the
+.I port
+argument is passed second,
+which is the opposite order from most DOS implementations.
+.SH STANDARDS
+None.
+.SH SEE ALSO
+.BR ioperm (2),
+.BR iopl (2)
diff --git a/man2/outb_p.2 b/man2/outb_p.2
new file mode 100644
index 0000000..2c63c75
--- /dev/null
+++ b/man2/outb_p.2
@@ -0,0 +1 @@
+.so man2/outb.2
diff --git a/man2/outl.2 b/man2/outl.2
new file mode 100644
index 0000000..2c63c75
--- /dev/null
+++ b/man2/outl.2
@@ -0,0 +1 @@
+.so man2/outb.2
diff --git a/man2/outl_p.2 b/man2/outl_p.2
new file mode 100644
index 0000000..2c63c75
--- /dev/null
+++ b/man2/outl_p.2
@@ -0,0 +1 @@
+.so man2/outb.2
diff --git a/man2/outsb.2 b/man2/outsb.2
new file mode 100644
index 0000000..2c63c75
--- /dev/null
+++ b/man2/outsb.2
@@ -0,0 +1 @@
+.so man2/outb.2
diff --git a/man2/outsl.2 b/man2/outsl.2
new file mode 100644
index 0000000..2c63c75
--- /dev/null
+++ b/man2/outsl.2
@@ -0,0 +1 @@
+.so man2/outb.2
diff --git a/man2/outsw.2 b/man2/outsw.2
new file mode 100644
index 0000000..2c63c75
--- /dev/null
+++ b/man2/outsw.2
@@ -0,0 +1 @@
+.so man2/outb.2
diff --git a/man2/outw.2 b/man2/outw.2
new file mode 100644
index 0000000..2c63c75
--- /dev/null
+++ b/man2/outw.2
@@ -0,0 +1 @@
+.so man2/outb.2
diff --git a/man2/outw_p.2 b/man2/outw_p.2
new file mode 100644
index 0000000..2c63c75
--- /dev/null
+++ b/man2/outw_p.2
@@ -0,0 +1 @@
+.so man2/outb.2
diff --git a/man2/pause.2 b/man2/pause.2
new file mode 100644
index 0000000..0e7bbcd
--- /dev/null
+++ b/man2/pause.2
@@ -0,0 +1,50 @@
+.\" Copyright (c) 1992 Drew Eckhardt (drew@cs.colorado.edu), March 28, 1992
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified by Michael Haardt (michael@moria.de)
+.\" Modified Sat Jul 24 14:48:00 1993 by Rik Faith (faith@cs.unc.edu)
+.\" Modified 1995 by Mike Battersby (mib@deakin.edu.au)
+.\" Modified 2000 by aeb, following Michael Kerrisk
+.\"
+.TH pause 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+pause \- wait for signal
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <unistd.h>
+.PP
+.B int pause(void);
+.fi
+.SH DESCRIPTION
+.BR pause ()
+causes the calling process (or thread) to sleep
+until a signal is delivered that either terminates the process or causes
+the invocation of a signal-catching function.
+.SH RETURN VALUE
+.BR pause ()
+returns only when a signal was caught and the
+signal-catching function returned.
+In this case,
+.BR pause ()
+returns \-1, and
+.I errno
+is set to
+.\" .BR ERESTARTNOHAND .
+.BR EINTR .
+.SH ERRORS
+.TP
+.B EINTR
+A signal was caught and the signal-catching function returned.
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+POSIX.1-2001, SVr4, 4.3BSD.
+.SH SEE ALSO
+.BR kill (2),
+.BR select (2),
+.BR signal (2),
+.BR sigsuspend (2)
diff --git a/man2/pciconfig_iobase.2 b/man2/pciconfig_iobase.2
new file mode 100644
index 0000000..5ab2995
--- /dev/null
+++ b/man2/pciconfig_iobase.2
@@ -0,0 +1 @@
+.so man2/pciconfig_read.2
diff --git a/man2/pciconfig_read.2 b/man2/pciconfig_read.2
new file mode 100644
index 0000000..7913ba0
--- /dev/null
+++ b/man2/pciconfig_read.2
@@ -0,0 +1,122 @@
+.\" Contributed by Niki A. Rahimi, LTC Security Development
+.\" narahimi@us.ibm.com
+.\"
+.\" %%%LICENSE_START(FREELY_REDISTRIBUTABLE)
+.\" May be freely distributed and modified.
+.\" %%%LICENSE_END
+.\"
+.TH pciconfig_read 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+pciconfig_read, pciconfig_write, pciconfig_iobase \-
+PCI device information handling
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <pci.h>
+.PP
+.BI "int pciconfig_read(unsigned long " bus ", unsigned long " dfn ,
+.BI " unsigned long " off ", unsigned long " len ,
+.BI " unsigned char *" buf );
+.BI "int pciconfig_write(unsigned long " bus ", unsigned long " dfn ,
+.BI " unsigned long " off ", unsigned long " len ,
+.BI " unsigned char *" buf );
+.BI "int pciconfig_iobase(int " which ", unsigned long " bus ,
+.BI " unsigned long " devfn );
+.fi
+.SH DESCRIPTION
+Most of the interaction with PCI devices is already handled by the
+kernel PCI layer,
+and thus these calls should not normally need to be accessed from user space.
+.TP
+.BR pciconfig_read ()
+Reads to
+.I buf
+from device
+.I dfn
+at offset
+.I off
+value.
+.TP
+.BR pciconfig_write ()
+Writes from
+.I buf
+to device
+.I dfn
+at offset
+.I off
+value.
+.TP
+.BR pciconfig_iobase ()
+You pass it a bus/devfn pair and get a physical address for either the
+memory offset (for things like prep, this is 0xc0000000),
+the IO base for PIO cycles, or the ISA holes if any.
+.SH RETURN VALUE
+.TP
+.BR pciconfig_read ()
+On success, zero is returned.
+On error, \-1 is returned and
+.I errno
+is set to indicate the error.
+.TP
+.BR pciconfig_write ()
+On success, zero is returned.
+On error, \-1 is returned and
+.I errno
+is set to indicate the error.
+.TP
+.BR pciconfig_iobase ()
+Returns information on locations of various I/O
+regions in physical memory according to the
+.I which
+value.
+Values for
+.I which
+are:
+.BR IOBASE_BRIDGE_NUMBER ,
+.BR IOBASE_MEMORY ,
+.BR IOBASE_IO ,
+.BR IOBASE_ISA_IO ,
+.BR IOBASE_ISA_MEM .
+.SH ERRORS
+.TP
+.B EINVAL
+.I len
+value is invalid.
+This does not apply to
+.BR pciconfig_iobase ().
+.TP
+.B EIO
+I/O error.
+.TP
+.B ENODEV
+For
+.BR pciconfig_iobase (),
+"hose" value is NULL.
+For the other calls, could not find a slot.
+.TP
+.B ENOSYS
+The system has not implemented these calls
+.RB ( CONFIG_PCI
+not defined).
+.TP
+.B EOPNOTSUPP
+This return value is valid only for
+.BR pciconfig_iobase ().
+It is returned if the value for
+.I which
+is invalid.
+.TP
+.B EPERM
+User does not have the
+.B CAP_SYS_ADMIN
+capability.
+This does not apply to
+.BR pciconfig_iobase ().
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 2.0.26/2.1.11.
+.SH SEE ALSO
+.BR capabilities (7)
diff --git a/man2/pciconfig_write.2 b/man2/pciconfig_write.2
new file mode 100644
index 0000000..5ab2995
--- /dev/null
+++ b/man2/pciconfig_write.2
@@ -0,0 +1 @@
+.so man2/pciconfig_read.2
diff --git a/man2/perf_event_open.2 b/man2/perf_event_open.2
new file mode 100644
index 0000000..d9e7877
--- /dev/null
+++ b/man2/perf_event_open.2
@@ -0,0 +1,3989 @@
+.\" Copyright (c) 2012, Vincent Weaver
+.\"
+.\" SPDX-License-Identifier: GPL-2.0-or-later
+.\"
+.\" This document is based on the perf_event.h header file, the
+.\" tools/perf/design.txt file, and a lot of bitter experience.
+.\"
+.TH perf_event_open 2 2023-05-03 "Linux man-pages 6.05.01"
+.SH NAME
+perf_event_open \- set up performance monitoring
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#include <linux/perf_event.h>" " /* Definition of " PERF_* " constants */"
+.BR "#include <linux/hw_breakpoint.h>" " /* Definition of " HW_* " constants */"
+.BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */"
+.B #include <unistd.h>
+.PP
+.BI "int syscall(SYS_perf_event_open, struct perf_event_attr *" attr ,
+.BI " pid_t " pid ", int " cpu ", int " group_fd \
+", unsigned long " flags );
+.fi
+.PP
+.IR Note :
+glibc provides no wrapper for
+.BR perf_event_open (),
+necessitating the use of
+.BR syscall (2).
+.SH DESCRIPTION
+Given a list of parameters,
+.BR perf_event_open ()
+returns a file descriptor, for use in subsequent system calls
+.RB ( read "(2), " mmap "(2), " prctl "(2), " fcntl "(2), etc.)."
+.PP
+A call to
+.BR perf_event_open ()
+creates a file descriptor that allows measuring performance
+information.
+Each file descriptor corresponds to one
+event that is measured; these can be grouped together
+to measure multiple events simultaneously.
+.PP
+Events can be enabled and disabled in two ways: via
+.BR ioctl (2)
+and via
+.BR prctl (2).
+When an event is disabled it does not count or generate overflows but does
+continue to exist and maintain its count value.
+.PP
+Events come in two flavors: counting and sampled.
+A
+.I counting
+event is one that is used for counting the aggregate number of events
+that occur.
+In general, counting event results are gathered with a
+.BR read (2)
+call.
+A
+.I sampling
+event periodically writes measurements to a buffer that can then
+be accessed via
+.BR mmap (2).
+.SS Arguments
+The
+.I pid
+and
+.I cpu
+arguments allow specifying which process and CPU to monitor:
+.TP
+.BR "pid == 0" " and " "cpu == \-1"
+This measures the calling process/thread on any CPU.
+.TP
+.BR "pid == 0" " and " "cpu >= 0"
+This measures the calling process/thread only
+when running on the specified CPU.
+.TP
+.BR "pid > 0" " and " "cpu == \-1"
+This measures the specified process/thread on any CPU.
+.TP
+.BR "pid > 0" " and " "cpu >= 0"
+This measures the specified process/thread only
+when running on the specified CPU.
+.TP
+.BR "pid == \-1" " and " "cpu >= 0"
+This measures all processes/threads on the specified CPU.
+This requires
+.B CAP_PERFMON
+(since Linux 5.8) or
+.B CAP_SYS_ADMIN
+capability or a
+.I /proc/sys/kernel/perf_event_paranoid
+value of less than 1.
+.TP
+.BR "pid == \-1" " and " "cpu == \-1"
+This setting is invalid and will return an error.
+.PP
+When
+.I pid
+is greater than zero, permission to perform this system call
+is governed by
+.B CAP_PERFMON
+(since Linux 5.9) and a ptrace access mode
+.B PTRACE_MODE_READ_REALCREDS
+check on older Linux versions; see
+.BR ptrace (2).
+.PP
+The
+.I group_fd
+argument allows event groups to be created.
+An event group has one event which is the group leader.
+The leader is created first, with
+.IR group_fd " = \-1."
+The rest of the group members are created with subsequent
+.BR perf_event_open ()
+calls with
+.I group_fd
+being set to the file descriptor of the group leader.
+(A single event on its own is created with
+.IR group_fd " = \-1"
+and is considered to be a group with only 1 member.)
+An event group is scheduled onto the CPU as a unit:
+it will be put onto the CPU
+only if all of the events in the group can be put onto the CPU.
+This means that the values of the member events can be meaningfully compared
+\[em]added, divided (to get ratios), and so on\[em]
+with each other,
+since they have counted events for the same set of executed instructions.
+.PP
+The
+.I flags
+argument is formed by ORing together zero or more of the following values:
+.TP
+.BR PERF_FLAG_FD_CLOEXEC " (since Linux 3.14)"
+.\" commit a21b0b354d4ac39be691f51c53562e2c24443d9e
+This flag enables the close-on-exec flag for the created
+event file descriptor,
+so that the file descriptor is automatically closed on
+.BR execve (2).
+Setting the close-on-exec flag at creation time, rather than later with
+.BR fcntl (2),
+avoids potential race conditions where the calling thread invokes
+.BR perf_event_open ()
+and
+.BR fcntl (2)
+at the same time as another thread calls
+.BR fork (2)
+then
+.BR execve (2).
+.TP
+.B PERF_FLAG_FD_NO_GROUP
+This flag tells the event to ignore the
+.I group_fd
+parameter except for the purpose of setting up output redirection
+using the
+.B PERF_FLAG_FD_OUTPUT
+flag.
+.TP
+.BR PERF_FLAG_FD_OUTPUT " (broken since Linux 2.6.35)"
+.\" commit ac9721f3f54b27a16c7e1afb2481e7ee95a70318
+This flag re-routes the event's sampled output to instead
+be included in the mmap buffer of the event specified by
+.IR group_fd .
+.TP
+.BR PERF_FLAG_PID_CGROUP " (since Linux 2.6.39)"
+.\" commit e5d1367f17ba6a6fed5fd8b74e4d5720923e0c25
+This flag activates per-container system-wide monitoring.
+A container
+is an abstraction that isolates a set of resources for finer-grained
+control (CPUs, memory, etc.).
+In this mode, the event is measured
+only if the thread running on the monitored CPU belongs to the designated
+container (cgroup).
+The cgroup is identified by passing a file descriptor
+opened on its directory in the cgroupfs filesystem.
+For instance, if the
+cgroup to monitor is called
+.IR test ,
+then a file descriptor opened on
+.I /dev/cgroup/test
+(assuming cgroupfs is mounted on
+.IR /dev/cgroup )
+must be passed as the
+.I pid
+parameter.
+cgroup monitoring is available only
+for system-wide events and may therefore require extra permissions.
+.PP
+The
+.I perf_event_attr
+structure provides detailed configuration information
+for the event being created.
+.PP
+.in +4n
+.EX
+struct perf_event_attr {
+ __u32 type; /* Type of event */
+ __u32 size; /* Size of attribute structure */
+ __u64 config; /* Type\-specific configuration */
+\&
+ union {
+ __u64 sample_period; /* Period of sampling */
+ __u64 sample_freq; /* Frequency of sampling */
+ };
+\&
+ __u64 sample_type; /* Specifies values included in sample */
+ __u64 read_format; /* Specifies values returned in read */
+\&
+ __u64 disabled : 1, /* off by default */
+ inherit : 1, /* children inherit it */
+ pinned : 1, /* must always be on PMU */
+ exclusive : 1, /* only group on PMU */
+ exclude_user : 1, /* don\[aq]t count user */
+ exclude_kernel : 1, /* don\[aq]t count kernel */
+ exclude_hv : 1, /* don\[aq]t count hypervisor */
+ exclude_idle : 1, /* don\[aq]t count when idle */
+ mmap : 1, /* include mmap data */
+ comm : 1, /* include comm data */
+ freq : 1, /* use freq, not period */
+ inherit_stat : 1, /* per task counts */
+ enable_on_exec : 1, /* next exec enables */
+ task : 1, /* trace fork/exit */
+ watermark : 1, /* wakeup_watermark */
+ precise_ip : 2, /* skid constraint */
+ mmap_data : 1, /* non\-exec mmap data */
+ sample_id_all : 1, /* sample_type all events */
+ exclude_host : 1, /* don\[aq]t count in host */
+ exclude_guest : 1, /* don\[aq]t count in guest */
+ exclude_callchain_kernel : 1,
+ /* exclude kernel callchains */
+ exclude_callchain_user : 1,
+ /* exclude user callchains */
+ mmap2 : 1, /* include mmap with inode data */
+ comm_exec : 1, /* flag comm events that are
+ due to exec */
+ use_clockid : 1, /* use clockid for time fields */
+ context_switch : 1, /* context switch data */
+ write_backward : 1, /* Write ring buffer from end
+ to beginning */
+ namespaces : 1, /* include namespaces data */
+ ksymbol : 1, /* include ksymbol events */
+ bpf_event : 1, /* include bpf events */
+ aux_output : 1, /* generate AUX records
+ instead of events */
+ cgroup : 1, /* include cgroup events */
+ text_poke : 1, /* include text poke events */
+ build_id : 1, /* use build id in mmap2 events */
+ inherit_thread : 1, /* children only inherit */
+ /* if cloned with CLONE_THREAD */
+ remove_on_exec : 1, /* event is removed from task
+ on exec */
+ sigtrap : 1, /* send synchronous SIGTRAP
+ on event */
+\&
+ __reserved_1 : 26;
+\&
+ union {
+ __u32 wakeup_events; /* wakeup every n events */
+ __u32 wakeup_watermark; /* bytes before wakeup */
+ };
+\&
+ __u32 bp_type; /* breakpoint type */
+\&
+ union {
+ __u64 bp_addr; /* breakpoint address */
+ __u64 kprobe_func; /* for perf_kprobe */
+ __u64 uprobe_path; /* for perf_uprobe */
+ __u64 config1; /* extension of config */
+ };
+\&
+ union {
+ __u64 bp_len; /* breakpoint length */
+ __u64 kprobe_addr; /* with kprobe_func == NULL */
+ __u64 probe_offset; /* for perf_[k,u]probe */
+ __u64 config2; /* extension of config1 */
+ };
+ __u64 branch_sample_type; /* enum perf_branch_sample_type */
+ __u64 sample_regs_user; /* user regs to dump on samples */
+ __u32 sample_stack_user; /* size of stack to dump on
+ samples */
+ __s32 clockid; /* clock to use for time fields */
+ __u64 sample_regs_intr; /* regs to dump on samples */
+ __u32 aux_watermark; /* aux bytes before wakeup */
+ __u16 sample_max_stack; /* max frames in callchain */
+ __u16 __reserved_2; /* align to u64 */
+ __u32 aux_sample_size; /* max aux sample size */
+ __u32 __reserved_3; /* align to u64 */
+ __u64 sig_data; /* user data for sigtrap */
+\&
+};
+.EE
+.in
+.PP
+The fields of the
+.I perf_event_attr
+structure are described in more detail below:
+.TP
+.I type
+This field specifies the overall event type.
+It has one of the following values:
+.RS
+.TP
+.B PERF_TYPE_HARDWARE
+This indicates one of the "generalized" hardware events provided
+by the kernel.
+See the
+.I config
+field definition for more details.
+.TP
+.B PERF_TYPE_SOFTWARE
+This indicates one of the software-defined events provided by the kernel
+(even if no hardware support is available).
+.TP
+.B PERF_TYPE_TRACEPOINT
+This indicates a tracepoint
+provided by the kernel tracepoint infrastructure.
+.TP
+.B PERF_TYPE_HW_CACHE
+This indicates a hardware cache event.
+This has a special encoding, described in the
+.I config
+field definition.
+.TP
+.B PERF_TYPE_RAW
+This indicates a "raw" implementation-specific event in the
+.IR config " field."
+.TP
+.BR PERF_TYPE_BREAKPOINT " (since Linux 2.6.33)"
+.\" commit 24f1e32c60c45c89a997c73395b69c8af6f0a84e
+This indicates a hardware breakpoint as provided by the CPU.
+Breakpoints can be read/write accesses to an address as well as
+execution of an instruction address.
+.TP
+dynamic PMU
+Since Linux 2.6.38,
+.\" commit 2e80a82a49c4c7eca4e35734380f28298ba5db19
+.BR perf_event_open ()
+can support multiple PMUs.
+To enable this, a value exported by the kernel can be used in the
+.I type
+field to indicate which PMU to use.
+The value to use can be found in the sysfs filesystem:
+there is a subdirectory per PMU instance under
+.IR /sys/bus/event_source/devices .
+In each subdirectory there is a
+.I type
+file whose content is an integer that can be used in the
+.I type
+field.
+For instance,
+.I /sys/bus/event_source/devices/cpu/type
+contains the value for the core CPU PMU, which is usually 4.
+.TP
+.BR kprobe " and " uprobe " (since Linux 4.17)"
+.\" commit 65074d43fc77bcae32776724b7fa2696923c78e4
+.\" commit e12f03d7031a977356e3d7b75a68c2185ff8d155
+.\" commit 33ea4b24277b06dbc55d7f5772a46f029600255e
+These two dynamic PMUs create a kprobe/uprobe and attach it to the
+file descriptor generated by \fBperf_event_open\fP().
+The kprobe/uprobe will be destroyed on the destruction of the file descriptor.
+See fields
+.IR kprobe_func ,
+.IR uprobe_path ,
+.IR kprobe_addr ,
+and
+.I probe_offset
+for more details.
+.RE
+.TP
+.I "size"
+The size of the
+.I perf_event_attr
+structure for forward/backward compatibility.
+Set this using
+.I sizeof(struct perf_event_attr)
+to allow the kernel to see
+the struct size at the time of compilation.
+.IP
+The related define
+.B PERF_ATTR_SIZE_VER0
+is set to 64; this was the size of the first published struct.
+.B PERF_ATTR_SIZE_VER1
+is 72, corresponding to the addition of breakpoints in Linux 2.6.33.
+.\" commit cb5d76999029ae7a517cb07dfa732c1b5a934fc2
+.\" this was added much later when PERF_ATTR_SIZE_VER2 happened
+.\" but the actual attr_size had increased in Linux 2.6.33
+.B PERF_ATTR_SIZE_VER2
+is 80 corresponding to the addition of branch sampling in Linux 3.4.
+.\" commit cb5d76999029ae7a517cb07dfa732c1b5a934fc2
+.B PERF_ATTR_SIZE_VER3
+is 96 corresponding to the addition
+of
+.I sample_regs_user
+and
+.I sample_stack_user
+in Linux 3.7.
+.\" commit 1659d129ed014b715b0b2120e6fd929bdd33ed03
+.B PERF_ATTR_SIZE_VER4
+is 104 corresponding to the addition of
+.I sample_regs_intr
+in Linux 3.19.
+.\" commit 60e2364e60e86e81bc6377f49779779e6120977f
+.B PERF_ATTR_SIZE_VER5
+is 112 corresponding to the addition of
+.I aux_watermark
+in Linux 4.1.
+.\" commit 1a5941312414c71dece6717da9a0fa1303127afa
+.TP
+.I "config"
+This specifies which event you want, in conjunction with
+the
+.I type
+field.
+The
+.I config1
+and
+.I config2
+fields are also taken into account in cases where 64 bits is not
+enough to fully specify the event.
+The encoding of these fields is event dependent.
+.IP
+There are various ways to set the
+.I config
+field that are dependent on the value of the previously
+described
+.I type
+field.
+What follows are various possible settings for
+.I config
+separated out by
+.IR type .
+.IP
+If
+.I type
+is
+.BR PERF_TYPE_HARDWARE ,
+we are measuring one of the generalized hardware CPU events.
+Not all of these are available on all platforms.
+Set
+.I config
+to one of the following:
+.RS 12
+.TP
+.B PERF_COUNT_HW_CPU_CYCLES
+Total cycles.
+Be wary of what happens during CPU frequency scaling.
+.TP
+.B PERF_COUNT_HW_INSTRUCTIONS
+Retired instructions.
+Be careful, these can be affected by various
+issues, most notably hardware interrupt counts.
+.TP
+.B PERF_COUNT_HW_CACHE_REFERENCES
+Cache accesses.
+Usually this indicates Last Level Cache accesses but this may
+vary depending on your CPU.
+This may include prefetches and coherency messages; again this
+depends on the design of your CPU.
+.TP
+.B PERF_COUNT_HW_CACHE_MISSES
+Cache misses.
+Usually this indicates Last Level Cache misses; this is intended to be
+used in conjunction with the
+.B PERF_COUNT_HW_CACHE_REFERENCES
+event to calculate cache miss rates.
+.TP
+.B PERF_COUNT_HW_BRANCH_INSTRUCTIONS
+Retired branch instructions.
+Prior to Linux 2.6.35, this used
+the wrong event on AMD processors.
+.\" commit f287d332ce835f77a4f5077d2c0ef1e3f9ea42d2
+.TP
+.B PERF_COUNT_HW_BRANCH_MISSES
+Mispredicted branch instructions.
+.TP
+.B PERF_COUNT_HW_BUS_CYCLES
+Bus cycles, which can be different from total cycles.
+.TP
+.BR PERF_COUNT_HW_STALLED_CYCLES_FRONTEND " (since Linux 3.0)"
+.\" commit 8f62242246351b5a4bc0c1f00c0c7003edea128a
+Stalled cycles during issue.
+.TP
+.BR PERF_COUNT_HW_STALLED_CYCLES_BACKEND " (since Linux 3.0)"
+.\" commit 8f62242246351b5a4bc0c1f00c0c7003edea128a
+Stalled cycles during retirement.
+.TP
+.BR PERF_COUNT_HW_REF_CPU_CYCLES " (since Linux 3.3)"
+.\" commit c37e17497e01fc0f5d2d6feb5723b210b3ab8890
+Total cycles; not affected by CPU frequency scaling.
+.RE
+.IP
+If
+.I type
+is
+.BR PERF_TYPE_SOFTWARE ,
+we are measuring software events provided by the kernel.
+Set
+.I config
+to one of the following:
+.RS 12
+.TP
+.B PERF_COUNT_SW_CPU_CLOCK
+This reports the CPU clock, a high-resolution per-CPU timer.
+.TP
+.B PERF_COUNT_SW_TASK_CLOCK
+This reports a clock count specific to the task that is running.
+.TP
+.B PERF_COUNT_SW_PAGE_FAULTS
+This reports the number of page faults.
+.TP
+.B PERF_COUNT_SW_CONTEXT_SWITCHES
+This counts context switches.
+Until Linux 2.6.34, these were all reported as user-space
+events; after that, they are reported as happening in the kernel.
+.\" commit e49a5bd38159dfb1928fd25b173bc9de4bbadb21
+.TP
+.B PERF_COUNT_SW_CPU_MIGRATIONS
+This reports the number of times the process
+has migrated to a new CPU.
+.TP
+.B PERF_COUNT_SW_PAGE_FAULTS_MIN
+This counts the number of minor page faults.
+These did not require disk I/O to handle.
+.TP
+.B PERF_COUNT_SW_PAGE_FAULTS_MAJ
+This counts the number of major page faults.
+These required disk I/O to handle.
+.TP
+.BR PERF_COUNT_SW_ALIGNMENT_FAULTS " (since Linux 2.6.33)"
+.\" commit f7d7986060b2890fc26db6ab5203efbd33aa2497
+This counts the number of alignment faults.
+These happen when unaligned memory accesses happen; the kernel
+can handle these but it reduces performance.
+This happens only on some architectures (never on x86).
+.TP
+.BR PERF_COUNT_SW_EMULATION_FAULTS " (since Linux 2.6.33)"
+.\" commit f7d7986060b2890fc26db6ab5203efbd33aa2497
+This counts the number of emulation faults.
+The kernel sometimes traps on unimplemented instructions
+and emulates them for user space.
+This can negatively impact performance.
+.TP
+.BR PERF_COUNT_SW_DUMMY " (since Linux 3.12)"
+.\" commit fa0097ee690693006ab1aea6c01ad3c851b65c77
+This is a placeholder event that counts nothing.
+Informational sample record types such as mmap or comm
+must be associated with an active event.
+This dummy event allows gathering such records without requiring
+a counting event.
+.TP
+.BR PERF_COUNT_SW_BPF_OUTPUT " (since Linux 4.4)"
+.\" commit a43eec304259a6c637f4014a6d4767159b6a3aa3
+This is used to generate raw sample data from BPF.
+BPF programs can write to this event using
+.B bpf_perf_event_output
+helper.
+.TP
+.BR PERF_COUNT_SW_CGROUP_SWITCHES " (since Linux 5.13)"
+.\" commit d0d1dd628527c77db2391ce0293c1ed344b2365f
+This counts context switches to a task in a different cgroup.
+In other words, if the next task is in the same cgroup,
+it won't count the switch.
+.RE
+.PP
+.RS
+If
+.I type
+is
+.BR PERF_TYPE_TRACEPOINT ,
+then we are measuring kernel tracepoints.
+The value to use in
+.I config
+can be obtained from under debugfs
+.I tracing/events/*/*/id
+if ftrace is enabled in the kernel.
+.RE
+.PP
+.RS
+If
+.I type
+is
+.BR PERF_TYPE_HW_CACHE ,
+then we are measuring a hardware CPU cache event.
+To calculate the appropriate
+.I config
+value, use the following equation:
+.RS 4
+.PP
+.in +4n
+.EX
+config = (perf_hw_cache_id) |
+ (perf_hw_cache_op_id << 8) |
+ (perf_hw_cache_op_result_id << 16);
+.EE
+.in
+.PP
+where
+.I perf_hw_cache_id
+is one of:
+.RS 4
+.TP
+.B PERF_COUNT_HW_CACHE_L1D
+for measuring Level 1 Data Cache
+.TP
+.B PERF_COUNT_HW_CACHE_L1I
+for measuring Level 1 Instruction Cache
+.TP
+.B PERF_COUNT_HW_CACHE_LL
+for measuring Last-Level Cache
+.TP
+.B PERF_COUNT_HW_CACHE_DTLB
+for measuring the Data TLB
+.TP
+.B PERF_COUNT_HW_CACHE_ITLB
+for measuring the Instruction TLB
+.TP
+.B PERF_COUNT_HW_CACHE_BPU
+for measuring the branch prediction unit
+.TP
+.BR PERF_COUNT_HW_CACHE_NODE " (since Linux 3.1)"
+.\" commit 89d6c0b5bdbb1927775584dcf532d98b3efe1477
+for measuring local memory accesses
+.RE
+.PP
+and
+.I perf_hw_cache_op_id
+is one of:
+.RS 4
+.TP
+.B PERF_COUNT_HW_CACHE_OP_READ
+for read accesses
+.TP
+.B PERF_COUNT_HW_CACHE_OP_WRITE
+for write accesses
+.TP
+.B PERF_COUNT_HW_CACHE_OP_PREFETCH
+for prefetch accesses
+.RE
+.PP
+and
+.I perf_hw_cache_op_result_id
+is one of:
+.RS 4
+.TP
+.B PERF_COUNT_HW_CACHE_RESULT_ACCESS
+to measure accesses
+.TP
+.B PERF_COUNT_HW_CACHE_RESULT_MISS
+to measure misses
+.RE
+.RE
+.PP
+If
+.I type
+is
+.BR PERF_TYPE_RAW ,
+then a custom "raw"
+.I config
+value is needed.
+Most CPUs support events that are not covered by the "generalized" events.
+These are implementation defined; see your CPU manual (for example
+the Intel Volume 3B documentation or the AMD BIOS and Kernel Developer
+Guide).
+The libpfm4 library can be used to translate from the name in the
+architectural manuals to the raw hex value
+.BR perf_event_open ()
+expects in this field.
+.PP
+If
+.I type
+is
+.BR PERF_TYPE_BREAKPOINT ,
+then leave
+.I config
+set to zero.
+Its parameters are set in other places.
+.PP
+If
+.I type
+is
+.B kprobe
+or
+.BR uprobe ,
+set
+.I retprobe
+(bit 0 of
+.IR config ,
+see
+.IR /sys/bus/event_source/devices/[k,u]probe/format/retprobe )
+for kretprobe/uretprobe.
+See fields
+.IR kprobe_func ,
+.IR uprobe_path ,
+.IR kprobe_addr ,
+and
+.I probe_offset
+for more details.
+.RE
+.TP
+.IR kprobe_func ", " uprobe_path ", " kprobe_addr ", and " probe_offset
+These fields describe the kprobe/uprobe for dynamic PMUs
+.B kprobe
+and
+.BR uprobe .
+For
+.BR kprobe :
+use
+.I kprobe_func
+and
+.IR probe_offset ,
+or use
+.I kprobe_addr
+and leave
+.I kprobe_func
+as NULL.
+For
+.BR uprobe :
+use
+.I uprobe_path
+and
+.IR probe_offset .
+.TP
+.IR sample_period ", " sample_freq
+A "sampling" event is one that generates an overflow notification
+every N events, where N is given by
+.IR sample_period .
+A sampling event has
+.IR sample_period " > 0."
+When an overflow occurs, requested data is recorded
+in the mmap buffer.
+The
+.I sample_type
+field controls what data is recorded on each overflow.
+.IP
+.I sample_freq
+can be used if you wish to use frequency rather than period.
+In this case, you set the
+.I freq
+flag.
+The kernel will adjust the sampling period
+to try and achieve the desired rate.
+The rate of adjustment is a
+timer tick.
+.TP
+.I sample_type
+The various bits in this field specify which values to include
+in the sample.
+They will be recorded in a ring-buffer,
+which is available to user space using
+.BR mmap (2).
+The order in which the values are saved in the
+sample is documented in the MMAP Layout subsection below;
+it is not the
+.I "enum perf_event_sample_format"
+order.
+.RS
+.TP
+.B PERF_SAMPLE_IP
+Records instruction pointer.
+.TP
+.B PERF_SAMPLE_TID
+Records the process and thread IDs.
+.TP
+.B PERF_SAMPLE_TIME
+Records a timestamp.
+.TP
+.B PERF_SAMPLE_ADDR
+Records an address, if applicable.
+.TP
+.B PERF_SAMPLE_READ
+Records counter values for all events in a group, not just the group leader.
+.TP
+.B PERF_SAMPLE_CALLCHAIN
+Records the callchain (stack backtrace).
+.TP
+.B PERF_SAMPLE_ID
+Records a unique ID for the opened event's group leader.
+.TP
+.B PERF_SAMPLE_CPU
+Records CPU number.
+.TP
+.B PERF_SAMPLE_PERIOD
+Records the current sampling period.
+.TP
+.B PERF_SAMPLE_STREAM_ID
+Records a unique ID for the opened event.
+Unlike
+.B PERF_SAMPLE_ID
+the actual ID is returned, not the group leader.
+This ID is the same as the one returned by
+.BR PERF_FORMAT_ID .
+.TP
+.B PERF_SAMPLE_RAW
+Records additional data, if applicable.
+Usually returned by tracepoint events.
+.TP
+.BR PERF_SAMPLE_BRANCH_STACK " (since Linux 3.4)"
+.\" commit bce38cd53e5ddba9cb6d708c4ef3d04a4016ec7e
+This provides a record of recent branches, as provided
+by CPU branch sampling hardware (such as Intel Last Branch Record).
+Not all hardware supports this feature.
+.IP
+See the
+.I branch_sample_type
+field for how to filter which branches are reported.
+.TP
+.BR PERF_SAMPLE_REGS_USER " (since Linux 3.7)"
+.\" commit 4018994f3d8785275ef0e7391b75c3462c029e56
+Records the current user-level CPU register state
+(the values in the process before the kernel was called).
+.TP
+.BR PERF_SAMPLE_STACK_USER " (since Linux 3.7)"
+.\" commit c5ebcedb566ef17bda7b02686e0d658a7bb42ee7
+Records the user level stack, allowing stack unwinding.
+.TP
+.BR PERF_SAMPLE_WEIGHT " (since Linux 3.10)"
+.\" commit c3feedf2aaf9ac8bad6f19f5d21e4ee0b4b87e9c
+Records a hardware provided weight value that expresses how
+costly the sampled event was.
+This allows the hardware to highlight expensive events in
+a profile.
+.TP
+.BR PERF_SAMPLE_DATA_SRC " (since Linux 3.10)"
+.\" commit d6be9ad6c960f43800a6f118932bc8a5a4eadcd1
+Records the data source: where in the memory hierarchy
+the data associated with the sampled instruction came from.
+This is available only if the underlying hardware
+supports this feature.
+.TP
+.BR PERF_SAMPLE_IDENTIFIER " (since Linux 3.12)"
+.\" commit ff3d527cebc1fa3707c617bfe9e74f53fcfb0955
+Places the
+.B SAMPLE_ID
+value in a fixed position in the record,
+either at the beginning (for sample events) or at the end
+(if a non-sample event).
+.IP
+This was necessary because a sample stream may have
+records from various different event sources with different
+.I sample_type
+settings.
+Parsing the event stream properly was not possible because the
+format of the record was needed to find
+.BR SAMPLE_ID ,
+but
+the format could not be found without knowing what
+event the sample belonged to (causing a circular
+dependency).
+.IP
+The
+.B PERF_SAMPLE_IDENTIFIER
+setting makes the event stream always parsable
+by putting
+.B SAMPLE_ID
+in a fixed location, even though
+it means having duplicate
+.B SAMPLE_ID
+values in records.
+.TP
+.BR PERF_SAMPLE_TRANSACTION " (since Linux 3.13)"
+.\" commit fdfbbd07e91f8fe387140776f3fd94605f0c89e5
+Records reasons for transactional memory abort events
+(for example, from Intel TSX transactional memory support).
+.IP
+The
+.I precise_ip
+setting must be greater than 0 and a transactional memory abort
+event must be measured or no values will be recorded.
+Also note that some perf_event measurements, such as sampled
+cycle counting, may cause extraneous aborts (by causing an
+interrupt during a transaction).
+.TP
+.BR PERF_SAMPLE_REGS_INTR " (since Linux 3.19)"
+.\" commit 60e2364e60e86e81bc6377f49779779e6120977f
+Records a subset of the current CPU register state
+as specified by
+.IR sample_regs_intr .
+Unlike
+.B PERF_SAMPLE_REGS_USER
+the register values will return kernel register
+state if the overflow happened while kernel
+code is running.
+If the CPU supports hardware sampling of
+register state (i.e., PEBS on Intel x86) and
+.I precise_ip
+is set higher than zero then the register
+values returned are those captured by
+hardware at the time of the sampled
+instruction's retirement.
+.TP
+.BR PERF_SAMPLE_PHYS_ADDR " (since Linux 4.13)"
+.\" commit fc7ce9c74c3ad232b084d80148654f926d01ece7
+Records physical address of data like in
+.BR PERF_SAMPLE_ADDR .
+.TP
+.BR PERF_SAMPLE_CGROUP " (since Linux 5.7)"
+.\" commit 96aaab686505c449e24d76e76507290dcc30e008
+Records (perf_event) cgroup ID of the process.
+This corresponds to the
+.I id
+field in the
+.B PERF_RECORD_CGROUP
+event.
+.TP
+.BR PERF_SAMPLE_DATA_PAGE_SIZE " (since Linux 5.11)"
+.\" commit 8d97e71811aaafe4abf611dc24822fd6e73df1a1
+Records page size of data like in
+.BR PERF_SAMPLE_ADDR .
+.TP
+.BR PERF_SAMPLE_CODE_PAGE_SIZE " (since Linux 5.11)"
+.\" commit 995f088efebe1eba0282a6ffa12411b37f8990c2
+Records page size of ip like in
+.BR PERF_SAMPLE_IP .
+.TP
+.BR PERF_SAMPLE_WEIGHT_STRUCT " (since Linux 5.12)"
+.\" commit 2a6c6b7d7ad346f0679d0963cb19b3f0ea7ef32c
+Records hardware provided weight values like in
+.BR PERF_SAMPLE_WEIGHT ,
+but it can represent multiple values in a struct.
+This shares the same space as
+.BR PERF_SAMPLE_WEIGHT ,
+so users can apply either of those,
+not both.
+It has the following format and
+the meaning of each field is
+dependent on the hardware implementation.
+.PP
+.in +4n
+.EX
+union perf_sample_weight {
+ u64 full; /* PERF_SAMPLE_WEIGHT */
+ struct { /* PERF_SAMPLE_WEIGHT_STRUCT */
+ u32 var1_dw;
+ u16 var2_w;
+ u16 var3_w;
+ };
+};
+.EE
+.in
+.RE
+.TP
+.I read_format
+This field specifies the format of the data returned by
+.BR read (2)
+on a
+.BR perf_event_open ()
+file descriptor.
+.RS
+.TP
+.B PERF_FORMAT_TOTAL_TIME_ENABLED
+Adds the 64-bit
+.I time_enabled
+field.
+This can be used to calculate estimated totals if
+the PMU is overcommitted and multiplexing is happening.
+.TP
+.B PERF_FORMAT_TOTAL_TIME_RUNNING
+Adds the 64-bit
+.I time_running
+field.
+This can be used to calculate estimated totals if
+the PMU is overcommitted and multiplexing is happening.
+.TP
+.B PERF_FORMAT_ID
+Adds a 64-bit unique value that corresponds to the event group.
+.TP
+.B PERF_FORMAT_GROUP
+Allows all counter values in an event group to be read with one read.
+.TP
+.BR PERF_FORMAT_LOST " (since Linux 6.0)"
+.\" commit 119a784c81270eb88e573174ed2209225d646656
+Adds a 64-bit value that is the number of lost samples for this event.
+This would be only meaningful when
+.I sample_period
+or
+.I sample_freq
+is set.
+.RE
+.TP
+.I disabled
+The
+.I disabled
+bit specifies whether the counter starts out disabled or enabled.
+If disabled, the event can later be enabled by
+.BR ioctl (2),
+.BR prctl (2),
+or
+.IR enable_on_exec .
+.IP
+When creating an event group, typically the group leader is initialized
+with
+.I disabled
+set to 1 and any child events are initialized with
+.I disabled
+set to 0.
+Despite
+.I disabled
+being 0, the child events will not start until the group leader
+is enabled.
+.TP
+.I inherit
+The
+.I inherit
+bit specifies that this counter should count events of child
+tasks as well as the task specified.
+This applies only to new children, not to any existing children at
+the time the counter is created (nor to any new children of
+existing children).
+.IP
+Inherit does not work for some combinations of
+.I read_format
+values, such as
+.BR PERF_FORMAT_GROUP .
+.TP
+.I pinned
+The
+.I pinned
+bit specifies that the counter should always be on the CPU if at all
+possible.
+It applies only to hardware counters and only to group leaders.
+If a pinned counter cannot be put onto the CPU (e.g., because there are
+not enough hardware counters or because of a conflict with some other
+event), then the counter goes into an 'error' state, where reads
+return end-of-file (i.e.,
+.BR read (2)
+returns 0) until the counter is subsequently enabled or disabled.
+.TP
+.I exclusive
+The
+.I exclusive
+bit specifies that when this counter's group is on the CPU,
+it should be the only group using the CPU's counters.
+In the future this may allow monitoring programs to
+support PMU features that need to run alone so that they do not
+disrupt other hardware counters.
+.IP
+Note that many unexpected situations may prevent events with the
+.I exclusive
+bit set from ever running.
+This includes any users running a system-wide
+measurement as well as any kernel use of the performance counters
+(including the commonly enabled NMI Watchdog Timer interface).
+.TP
+.I exclude_user
+If this bit is set, the count excludes events that happen in user space.
+.TP
+.I exclude_kernel
+If this bit is set, the count excludes events that happen in kernel space.
+.TP
+.I exclude_hv
+If this bit is set, the count excludes events that happen in the
+hypervisor.
+This is mainly for PMUs that have built-in support for handling this
+(such as POWER).
+Extra support is needed for handling hypervisor measurements on most
+machines.
+.TP
+.I exclude_idle
+If set, don't count when the CPU is running the idle task.
+While you can currently enable this for any event type, it is ignored
+for all but software events.
+.TP
+.I mmap
+The
+.I mmap
+bit enables generation of
+.B PERF_RECORD_MMAP
+samples for every
+.BR mmap (2)
+call that has
+.B PROT_EXEC
+set.
+This allows tools to notice new executable code being mapped into
+a program (dynamic shared libraries for example)
+so that addresses can be mapped back to the original code.
+.TP
+.I comm
+The
+.I comm
+bit enables tracking of process command name as modified by the
+.BR execve (2)
+and
+.BR prctl (PR_SET_NAME)
+system calls as well as writing to
+.IR /proc/self/comm .
+If the
+.I comm_exec
+flag is also successfully set (possible since Linux 3.16),
+.\" commit 82b897782d10fcc4930c9d4a15b175348fdd2871
+then the misc flag
+.B PERF_RECORD_MISC_COMM_EXEC
+can be used to differentiate the
+.BR execve (2)
+case from the others.
+.TP
+.I freq
+If this bit is set, then
+.I sample_freq
+not
+.I sample_period
+is used when setting up the sampling interval.
+.TP
+.I inherit_stat
+This bit enables saving of event counts on context switch for
+inherited tasks.
+This is meaningful only if the
+.I inherit
+field is set.
+.TP
+.I enable_on_exec
+If this bit is set, a counter is automatically
+enabled after a call to
+.BR execve (2).
+.TP
+.I task
+If this bit is set, then
+fork/exit notifications are included in the ring buffer.
+.TP
+.I watermark
+If set, have an overflow notification happen when we cross the
+.I wakeup_watermark
+boundary.
+Otherwise, overflow notifications happen after
+.I wakeup_events
+samples.
+.TP
+.IR precise_ip " (since Linux 2.6.35)"
+.\" commit ab608344bcbde4f55ec4cd911b686b0ce3eae076
+This controls the amount of skid.
+Skid is how many instructions
+execute between an event of interest happening and the kernel
+being able to stop and record the event.
+Smaller skid is
+better and allows more accurate reporting of which events
+correspond to which instructions, but hardware is often limited
+with how small this can be.
+.IP
+The possible values of this field are the following:
+.RS
+.TP
+.B 0
+.B SAMPLE_IP
+can have arbitrary skid.
+.TP
+.B 1
+.B SAMPLE_IP
+must have constant skid.
+.TP
+.B 2
+.B SAMPLE_IP
+requested to have 0 skid.
+.TP
+.B 3
+.B SAMPLE_IP
+must have 0 skid.
+See also the description of
+.BR PERF_RECORD_MISC_EXACT_IP .
+.RE
+.TP
+.IR mmap_data " (since Linux 2.6.36)"
+.\" commit 3af9e859281bda7eb7c20b51879cf43aa788ac2e
+This is the counterpart of the
+.I mmap
+field.
+This enables generation of
+.B PERF_RECORD_MMAP
+samples for
+.BR mmap (2)
+calls that do not have
+.B PROT_EXEC
+set (for example data and SysV shared memory).
+.TP
+.IR sample_id_all " (since Linux 2.6.38)"
+.\" commit c980d1091810df13f21aabbce545fd98f545bbf7
+If set, then TID, TIME, ID, STREAM_ID, and CPU can
+additionally be included in
+.RB non- PERF_RECORD_SAMPLE s
+if the corresponding
+.I sample_type
+is selected.
+.IP
+If
+.B PERF_SAMPLE_IDENTIFIER
+is specified, then an additional ID value is included
+as the last value to ease parsing the record stream.
+This may lead to the
+.I id
+value appearing twice.
+.IP
+The layout is described by this pseudo-structure:
+.IP
+.in +4n
+.EX
+struct sample_id {
+ { u32 pid, tid; } /* if PERF_SAMPLE_TID set */
+ { u64 time; } /* if PERF_SAMPLE_TIME set */
+ { u64 id; } /* if PERF_SAMPLE_ID set */
+ { u64 stream_id;} /* if PERF_SAMPLE_STREAM_ID set */
+ { u32 cpu, res; } /* if PERF_SAMPLE_CPU set */
+ { u64 id; } /* if PERF_SAMPLE_IDENTIFIER set */
+};
+.EE
+.in
+.TP
+.IR exclude_host " (since Linux 3.2)"
+.\" commit a240f76165e6255384d4bdb8139895fac7988799
+When conducting measurements that include processes running
+VM instances (i.e., have executed a
+.B KVM_RUN
+.BR ioctl (2)),
+only measure events happening inside a guest instance.
+This is only meaningful outside the guests; this setting does
+not change counts gathered inside of a guest.
+Currently, this functionality is x86 only.
+.TP
+.IR exclude_guest " (since Linux 3.2)"
+.\" commit a240f76165e6255384d4bdb8139895fac7988799
+When conducting measurements that include processes running
+VM instances (i.e., have executed a
+.B KVM_RUN
+.BR ioctl (2)),
+do not measure events happening inside guest instances.
+This is only meaningful outside the guests; this setting does
+not change counts gathered inside of a guest.
+Currently, this functionality is x86 only.
+.TP
+.IR exclude_callchain_kernel " (since Linux 3.7)"
+.\" commit d077526485d5c9b12fe85d0b2b3b7041e6bc5f91
+Do not include kernel callchains.
+.TP
+.IR exclude_callchain_user " (since Linux 3.7)"
+.\" commit d077526485d5c9b12fe85d0b2b3b7041e6bc5f91
+Do not include user callchains.
+.TP
+.IR mmap2 " (since Linux 3.16)"
+.\" commit 13d7a2410fa637f450a29ecb515ac318ee40c741
+.\" This is tricky; was committed during 3.12 development
+.\" but right before release was disabled.
+.\" So while you could select mmap2 starting with Linux 3.12
+.\" it did not work until Linux 3.16
+.\" commit a5a5ba72843dd05f991184d6cb9a4471acce1005
+Generate an extended executable mmap record that contains enough
+additional information to uniquely identify shared mappings.
+The
+.I mmap
+flag must also be set for this to work.
+.TP
+.IR comm_exec " (since Linux 3.16)"
+.\" commit 82b897782d10fcc4930c9d4a15b175348fdd2871
+This is purely a feature-detection flag, it does not change
+kernel behavior.
+If this flag can successfully be set, then, when
+.I comm
+is enabled, the
+.B PERF_RECORD_MISC_COMM_EXEC
+flag will be set in the
+.I misc
+field of a comm record header if the rename event being
+reported was caused by a call to
+.BR execve (2).
+This allows tools to distinguish between the various
+types of process renaming.
+.TP
+.IR use_clockid " (since Linux 4.1)"
+.\" commit 34f439278cef7b1177f8ce24f9fc81dfc6221d3b
+This allows selecting which internal Linux clock to use
+when generating timestamps via the
+.I clockid
+field.
+This can make it easier to correlate perf sample times with
+timestamps generated by other tools.
+.TP
+.IR context_switch " (since Linux 4.3)"
+.\" commit 45ac1403f564f411c6a383a2448688ba8dd705a4
+This enables the generation of
+.B PERF_RECORD_SWITCH
+records when a context switch occurs.
+It also enables the generation of
+.B PERF_RECORD_SWITCH_CPU_WIDE
+records when sampling in CPU-wide mode.
+This functionality is in addition to existing tracepoint and
+software events for measuring context switches.
+The advantage of this method is that it will give full
+information even with strict
+.I perf_event_paranoid
+settings.
+.TP
+.IR write_backward " (since Linux 4.6)"
+.\" commit 9ecda41acb971ebd07c8fb35faf24005c0baea12
+This causes the ring buffer to be written from the end to the beginning.
+This is to support reading from an overwritable ring buffer.
+.TP
+.IR namespaces " (since Linux 4.11)"
+.\" commit e422267322cd319e2695a535e47c5b1feeac45eb
+This enables the generation of
+.B PERF_RECORD_NAMESPACES
+records when a task enters a new namespace.
+Each namespace has a combination of device and inode numbers.
+.TP
+.IR ksymbol " (since Linux 5.0)"
+.\" commit 76193a94522f1d4edf2447a536f3f796ce56343b
+This enables the generation of
+.B PERF_RECORD_KSYMBOL
+records when new kernel symbols are registered or unregistered.
+This is useful for analyzing dynamic kernel functions like eBPF.
+.TP
+.IR bpf_event " (since Linux 5.0)"
+.\" commit 6ee52e2a3fe4ea35520720736e6791df1fb67106
+This enables the generation of
+.B PERF_RECORD_BPF_EVENT
+records when an eBPF program is loaded or unloaded.
+.TP
+.IR aux_output " (since Linux 5.4)"
+.\" commit ab43762ef010967e4ccd53627f70a2eecbeafefb
+This allows normal (non-AUX) events to generate data for AUX events
+if the hardware supports it.
+.TP
+.IR cgroup " (since Linux 5.7)"
+.\" commit 96aaab686505c449e24d76e76507290dcc30e008
+This enables the generation of
+.B PERF_RECORD_CGROUP
+records when a new cgroup is created (and activated).
+.TP
+.IR text_poke " (since Linux 5.8)"
+.\" commit e17d43b93e544f5016c0251d2074c15568d5d963
+This enables the generation of
+.B PERF_RECORD_TEXT_POKE
+records when there's a change to the kernel text
+(i.e., self-modifying code).
+.TP
+.IR build_id " (since Linux 5.12)"
+.\" commit 88a16a1309333e43d328621ece3e9fa37027e8eb
+This changes the contents in the
+.B PERF_RECORD_MMAP2
+to have a build-id instead of device and inode numbers.
+.TP
+.IR inherit_thread " (since Linux 5.13)"
+.\" commit 2b26f0aa004995f49f7b6f4100dd0e4c39a9ed5f
+This disables the inheritance of the event to a child process.
+Only new threads in the same process
+(which is cloned with
+.BR CLONE_THREAD )
+will inherit the event.
+.TP
+.IR remove_on_exec " (since Linux 5.13)"
+.\" commit 2e498d0a74e5b88a6689ae1b811f247f91ff188e
+This closes the event when it starts a new process image by
+.BR execve (2).
+.TP
+.IR sigtrap " (since Linux 5.13)"
+.\" commit 97ba62b278674293762c3d91f724f1bb922f04e0
+This enables synchronous signal delivery of
+.B SIGTRAP
+on event overflow.
+.TP
+.IR wakeup_events ", " wakeup_watermark
+This union sets how many samples
+.RI ( wakeup_events )
+or bytes
+.RI ( wakeup_watermark )
+happen before an overflow notification happens.
+Which one is used is selected by the
+.I watermark
+bit flag.
+.IP
+.I wakeup_events
+counts only
+.B PERF_RECORD_SAMPLE
+record types.
+To receive overflow notification for all
+.B PERF_RECORD
+types choose watermark and set
+.I wakeup_watermark
+to 1.
+.IP
+Prior to Linux 3.0, setting
+.\" commit f506b3dc0ec454a16d40cab9ee5d75435b39dc50
+.I wakeup_events
+to 0 resulted in no overflow notifications;
+more recent kernels treat 0 the same as 1.
+.TP
+.IR bp_type " (since Linux 2.6.33)"
+.\" commit 24f1e32c60c45c89a997c73395b69c8af6f0a84e
+This chooses the breakpoint type.
+It is one of:
+.RS
+.TP
+.B HW_BREAKPOINT_EMPTY
+No breakpoint.
+.TP
+.B HW_BREAKPOINT_R
+Count when we read the memory location.
+.TP
+.B HW_BREAKPOINT_W
+Count when we write the memory location.
+.TP
+.B HW_BREAKPOINT_RW
+Count when we read or write the memory location.
+.TP
+.B HW_BREAKPOINT_X
+Count when we execute code at the memory location.
+.PP
+The values can be combined via a bitwise or, but the
+combination of
+.B HW_BREAKPOINT_R
+or
+.B HW_BREAKPOINT_W
+with
+.B HW_BREAKPOINT_X
+is not allowed.
+.RE
+.TP
+.IR bp_addr " (since Linux 2.6.33)"
+.\" commit 24f1e32c60c45c89a997c73395b69c8af6f0a84e
+This is the address of the breakpoint.
+For execution breakpoints, this is the memory address of the instruction
+of interest; for read and write breakpoints, it is the memory address
+of the memory location of interest.
+.TP
+.IR config1 " (since Linux 2.6.39)"
+.\" commit a7e3ed1e470116c9d12c2f778431a481a6be8ab6
+.I config1
+is used for setting events that need an extra register or otherwise
+do not fit in the regular config field.
+Raw OFFCORE_EVENTS on Nehalem/Westmere/SandyBridge use this field
+on Linux 3.3 and later kernels.
+.TP
+.IR bp_len " (since Linux 2.6.33)"
+.\" commit 24f1e32c60c45c89a997c73395b69c8af6f0a84e
+.I bp_len
+is the length of the breakpoint being measured if
+.I type
+is
+.BR PERF_TYPE_BREAKPOINT .
+Options are
+.BR HW_BREAKPOINT_LEN_1 ,
+.BR HW_BREAKPOINT_LEN_2 ,
+.BR HW_BREAKPOINT_LEN_4 ,
+and
+.BR HW_BREAKPOINT_LEN_8 .
+For an execution breakpoint, set this to
+.IR sizeof(long) .
+.TP
+.IR config2 " (since Linux 2.6.39)"
+.\" commit a7e3ed1e470116c9d12c2f778431a481a6be8ab6
+.I config2
+is a further extension of the
+.I config1
+field.
+.TP
+.IR branch_sample_type " (since Linux 3.4)"
+.\" commit bce38cd53e5ddba9cb6d708c4ef3d04a4016ec7e
+If
+.B PERF_SAMPLE_BRANCH_STACK
+is enabled, then this specifies what branches to include
+in the branch record.
+.IP
+The first part of the value is the privilege level, which
+is a combination of one or more of the values listed below.
+If the user does not set privilege level explicitly, the kernel
+will use the event's privilege level.
+Event and branch privilege levels do not have to match.
+.RS
+.TP
+.B PERF_SAMPLE_BRANCH_USER
+Branch target is in user space.
+.TP
+.B PERF_SAMPLE_BRANCH_KERNEL
+Branch target is in kernel space.
+.TP
+.B PERF_SAMPLE_BRANCH_HV
+Branch target is in hypervisor.
+.TP
+.B PERF_SAMPLE_BRANCH_PLM_ALL
+A convenience value that is the three preceding values ORed together.
+.PP
+In addition to the privilege value, at least one of the
+following bits must be set.
+.TP
+.B PERF_SAMPLE_BRANCH_ANY
+Any branch type.
+.TP
+.B PERF_SAMPLE_BRANCH_ANY_CALL
+Any call branch (includes direct calls, indirect calls, and far jumps).
+.TP
+.B PERF_SAMPLE_BRANCH_IND_CALL
+Indirect calls.
+.TP
+.BR PERF_SAMPLE_BRANCH_CALL " (since Linux 4.4)"
+.\" commit c229bf9dc179d2023e185c0f705bdf68484c1e73
+Direct calls.
+.TP
+.B PERF_SAMPLE_BRANCH_ANY_RETURN
+Any return branch.
+.TP
+.BR PERF_SAMPLE_BRANCH_IND_JUMP " (since Linux 4.2)"
+.\" commit c9fdfa14c3792c0160849c484e83aa57afd80ccc
+Indirect jumps.
+.TP
+.BR PERF_SAMPLE_BRANCH_COND " (since Linux 3.16)"
+.\" commit bac52139f0b7ab31330e98fd87fc5a2664951050
+Conditional branches.
+.TP
+.BR PERF_SAMPLE_BRANCH_ABORT_TX " (since Linux 3.11)"
+.\" commit 135c5612c460f89657c4698fe2ea753f6f667963
+Transactional memory aborts.
+.TP
+.BR PERF_SAMPLE_BRANCH_IN_TX " (since Linux 3.11)"
+.\" commit 135c5612c460f89657c4698fe2ea753f6f667963
+Branch in transactional memory transaction.
+.TP
+.BR PERF_SAMPLE_BRANCH_NO_TX " (since Linux 3.11)"
+.\" commit 135c5612c460f89657c4698fe2ea753f6f667963
+Branch not in transactional memory transaction.
+.TP
+.BR PERF_SAMPLE_BRANCH_CALL_STACK " (since Linux 4.1)"
+.\" commit 2c44b1936bb3b135a3fac8b3493394d42e51cf70
+Branch is part of a hardware-generated call stack.
+This requires hardware support, currently only found
+on Intel x86 Haswell or newer.
+.RE
+.TP
+.IR sample_regs_user " (since Linux 3.7)"
+.\" commit 4018994f3d8785275ef0e7391b75c3462c029e56
+This bit mask defines the set of user CPU registers to dump on samples.
+The layout of the register mask is architecture-specific and
+is described in the kernel header file
+.IR arch/ARCH/include/uapi/asm/perf_regs.h .
+.TP
+.IR sample_stack_user " (since Linux 3.7)"
+.\" commit c5ebcedb566ef17bda7b02686e0d658a7bb42ee7
+This defines the size of the user stack to dump if
+.B PERF_SAMPLE_STACK_USER
+is specified.
+.TP
+.IR clockid " (since Linux 4.1)"
+.\" commit 34f439278cef7b1177f8ce24f9fc81dfc6221d3b
+If
+.I use_clockid
+is set, then this field selects which internal Linux timer to
+use for timestamps.
+The available timers are defined in
+.IR linux/time.h ,
+with
+.BR CLOCK_MONOTONIC ,
+.BR CLOCK_MONOTONIC_RAW ,
+.BR CLOCK_REALTIME ,
+.BR CLOCK_BOOTTIME ,
+and
+.B CLOCK_TAI
+currently supported.
+.TP
+.IR aux_watermark " (since Linux 4.1)"
+.\" commit 1a5941312414c71dece6717da9a0fa1303127afa
+This specifies how much data is required to trigger a
+.B PERF_RECORD_AUX
+sample.
+.TP
+.IR sample_max_stack " (since Linux 4.8)"
+.\" commit 97c79a38cd454602645f0470ffb444b3b75ce574
+When
+.I sample_type
+includes
+.BR PERF_SAMPLE_CALLCHAIN ,
+this field specifies how many stack frames to report when
+generating the callchain.
+.TP
+.IR aux_sample_size " (since Linux 5.5)"
+.\" commit a4faf00d994c40e64f656805ac375c65e324eefb
+When
+.B PERF_SAMPLE_AUX
+flag is set,
+specify the desired size of AUX data.
+Note that the data returned may be smaller than the specified size.
+.TP
+.IR sig_data " (since Linux 5.13)"
+.\" commit 97ba62b278674293762c3d91f724f1bb922f04e0
+This data will be copied to user's signal handler
+(through
+.I si_perf
+in the
+.IR siginfo_t )
+to disambiguate which event triggered the signal.
+.SS Reading results
+Once a
+.BR perf_event_open ()
+file descriptor has been opened, the values
+of the events can be read from the file descriptor.
+The values that are there are specified by the
+.I read_format
+field in the
+.I attr
+structure at open time.
+.PP
+If you attempt to read into a buffer that is not big enough to hold the
+data, the error
+.B ENOSPC
+results.
+.PP
+Here is the layout of the data returned by a read:
+.IP \[bu] 3
+If
+.B PERF_FORMAT_GROUP
+was specified to allow reading all events in a group at once:
+.IP
+.in +4n
+.EX
+struct read_format {
+ u64 nr; /* The number of events */
+ u64 time_enabled; /* if PERF_FORMAT_TOTAL_TIME_ENABLED */
+ u64 time_running; /* if PERF_FORMAT_TOTAL_TIME_RUNNING */
+ struct {
+ u64 value; /* The value of the event */
+ u64 id; /* if PERF_FORMAT_ID */
+ u64 lost; /* if PERF_FORMAT_LOST */
+ } values[nr];
+};
+.EE
+.in
+.IP \[bu]
+If
+.B PERF_FORMAT_GROUP
+was
+.I not
+specified:
+.IP
+.in +4n
+.EX
+struct read_format {
+ u64 value; /* The value of the event */
+ u64 time_enabled; /* if PERF_FORMAT_TOTAL_TIME_ENABLED */
+ u64 time_running; /* if PERF_FORMAT_TOTAL_TIME_RUNNING */
+ u64 id; /* if PERF_FORMAT_ID */
+ u64 lost; /* if PERF_FORMAT_LOST */
+};
+.EE
+.in
+.PP
+The values read are as follows:
+.TP
+.I nr
+The number of events in this file descriptor.
+Available only if
+.B PERF_FORMAT_GROUP
+was specified.
+.TP
+.IR time_enabled ", " time_running
+Total time the event was enabled and running.
+Normally these values are the same.
+Multiplexing happens if the number of events is more than the
+number of available PMU counter slots.
+In that case the events run only part of the time and the
+.I time_enabled
+and
+.I time_running
+values can be used to scale an estimated value for the count.
+.TP
+.I value
+An unsigned 64-bit value containing the counter result.
+.TP
+.I id
+A globally unique value for this particular event; only present if
+.B PERF_FORMAT_ID
+was specified in
+.IR read_format .
+.TP
+.I lost
+The number of lost samples of this event;
+only present if
+.B PERF_FORMAT_LOST
+was specified in
+.IR read_format .
+.SS MMAP layout
+When using
+.BR perf_event_open ()
+in sampled mode, asynchronous events
+(like counter overflow or
+.B PROT_EXEC
+mmap tracking)
+are logged into a ring-buffer.
+This ring-buffer is created and accessed through
+.BR mmap (2).
+.PP
+The mmap size should be 1+2\[ha]n pages, where the first page is a
+metadata page
+.RI ( "struct perf_event_mmap_page" )
+that contains various
+bits of information such as where the ring-buffer head is.
+.PP
+Before Linux 2.6.39, there is a bug that means you must allocate an mmap
+ring buffer when sampling even if you do not plan to access it.
+.PP
+The structure of the first metadata mmap page is as follows:
+.PP
+.in +4n
+.EX
+struct perf_event_mmap_page {
+ __u32 version; /* version number of this structure */
+ __u32 compat_version; /* lowest version this is compat with */
+ __u32 lock; /* seqlock for synchronization */
+ __u32 index; /* hardware counter identifier */
+ __s64 offset; /* add to hardware counter value */
+ __u64 time_enabled; /* time event active */
+ __u64 time_running; /* time event on CPU */
+ union {
+ __u64 capabilities;
+ struct {
+ __u64 cap_usr_time / cap_usr_rdpmc / cap_bit0 : 1,
+ cap_bit0_is_deprecated : 1,
+ cap_user_rdpmc : 1,
+ cap_user_time : 1,
+ cap_user_time_zero : 1,
+ };
+ };
+ __u16 pmc_width;
+ __u16 time_shift;
+ __u32 time_mult;
+ __u64 time_offset;
+ __u64 __reserved[120]; /* Pad to 1 k */
+ __u64 data_head; /* head in the data section */
+ __u64 data_tail; /* user\-space written tail */
+ __u64 data_offset; /* where the buffer starts */
+ __u64 data_size; /* data buffer size */
+ __u64 aux_head;
+ __u64 aux_tail;
+ __u64 aux_offset;
+ __u64 aux_size;
+\&
+}
+.EE
+.in
+.PP
+The following list describes the fields in the
+.I perf_event_mmap_page
+structure in more detail:
+.TP
+.I version
+Version number of this structure.
+.TP
+.I compat_version
+The lowest version this is compatible with.
+.TP
+.I lock
+A seqlock for synchronization.
+.TP
+.I index
+A unique hardware counter identifier.
+.TP
+.I offset
+When using rdpmc for reads this offset value
+must be added to the one returned by rdpmc to get
+the current total event count.
+.TP
+.I time_enabled
+Time the event was active.
+.TP
+.I time_running
+Time the event was running.
+.TP
+.IR cap_usr_time " / " cap_usr_rdpmc " / " cap_bit0 " (since Linux 3.4)"
+.\" commit c7206205d00ab375839bd6c7ddb247d600693c09
+There was a bug in the definition of
+.I cap_usr_time
+and
+.I cap_usr_rdpmc
+from Linux 3.4 until Linux 3.11.
+Both bits were defined to point to the same location, so it was
+impossible to know if
+.I cap_usr_time
+or
+.I cap_usr_rdpmc
+were actually set.
+.IP
+Starting with Linux 3.12, these are renamed to
+.\" commit fa7315871046b9a4c48627905691dbde57e51033
+.I cap_bit0
+and you should use the
+.I cap_user_time
+and
+.I cap_user_rdpmc
+fields instead.
+.TP
+.IR cap_bit0_is_deprecated " (since Linux 3.12)"
+.\" commit fa7315871046b9a4c48627905691dbde57e51033
+If set, this bit indicates that the kernel supports
+the properly separated
+.I cap_user_time
+and
+.I cap_user_rdpmc
+bits.
+.IP
+If not set, it indicates an older kernel where
+.I cap_usr_time
+and
+.I cap_usr_rdpmc
+map to the same bit and thus both features should
+be used with caution.
+.TP
+.IR cap_user_rdpmc " (since Linux 3.12)"
+.\" commit fa7315871046b9a4c48627905691dbde57e51033
+If the hardware supports user-space read of performance counters
+without syscall (this is the "rdpmc" instruction on x86), then
+the following code can be used to do a read:
+.IP
+.in +4n
+.EX
+u32 seq, time_mult, time_shift, idx, width;
+u64 count, enabled, running;
+u64 cyc, time_offset;
+\&
+do {
+ seq = pc\->lock;
+ barrier();
+ enabled = pc\->time_enabled;
+ running = pc\->time_running;
+\&
+ if (pc\->cap_usr_time && enabled != running) {
+ cyc = rdtsc();
+ time_offset = pc\->time_offset;
+ time_mult = pc\->time_mult;
+ time_shift = pc\->time_shift;
+ }
+\&
+ idx = pc\->index;
+ count = pc\->offset;
+\&
+ if (pc\->cap_usr_rdpmc && idx) {
+ width = pc\->pmc_width;
+ count += rdpmc(idx \- 1);
+ }
+\&
+ barrier();
+} while (pc\->lock != seq);
+.EE
+.in
+.TP
+.IR cap_user_time " (since Linux 3.12)"
+.\" commit fa7315871046b9a4c48627905691dbde57e51033
+This bit indicates the hardware has a constant, nonstop
+timestamp counter (TSC on x86).
+.TP
+.IR cap_user_time_zero " (since Linux 3.12)"
+.\" commit fa7315871046b9a4c48627905691dbde57e51033
+Indicates the presence of
+.I time_zero
+which allows mapping timestamp values to
+the hardware clock.
+.TP
+.I pmc_width
+If
+.I cap_user_rdpmc
+is set,
+this field provides the bit-width of the value
+read using the rdpmc or equivalent instruction.
+This can be used to sign extend the result like:
+.IP
+.in +4n
+.EX
+pmc <<= 64 \- pmc_width;
+pmc >>= 64 \- pmc_width; // signed shift right
+count += pmc;
+.EE
+.in
+.TP
+.IR time_shift ", " time_mult ", " time_offset
+.IP
+If
+.I cap_user_time
+is set,
+these fields can be used to compute the time
+delta since
+.I time_enabled
+(in nanoseconds) using rdtsc or similar.
+.IP
+.in +4n
+.EX
+u64 quot, rem;
+u64 delta;
+\&
+quot = cyc >> time_shift;
+rem = cyc & (((u64)1 << time_shift) \- 1);
+delta = time_offset + quot * time_mult +
+ ((rem * time_mult) >> time_shift);
+.EE
+.in
+.IP
+Where
+.IR time_offset ,
+.IR time_mult ,
+.IR time_shift ,
+and
+.I cyc
+are read in the
+seqcount loop described above.
+This delta can then be added to
+enabled and possibly running (if idx), improving the scaling:
+.IP
+.in +4n
+.EX
+enabled += delta;
+if (idx)
+ running += delta;
+quot = count / running;
+rem = count % running;
+count = quot * enabled + (rem * enabled) / running;
+.EE
+.in
+.TP
+.IR time_zero " (since Linux 3.12)"
+.\" commit fa7315871046b9a4c48627905691dbde57e51033
+.IP
+If
+.I cap_user_time_zero
+is set, then the hardware clock (the TSC timestamp counter on x86)
+can be calculated from the
+.IR time_zero ,
+.IR time_mult ,
+and
+.I time_shift
+values:
+.IP
+.in +4n
+.EX
+time = timestamp \- time_zero;
+quot = time / time_mult;
+rem = time % time_mult;
+cyc = (quot << time_shift) + (rem << time_shift) / time_mult;
+.EE
+.in
+.IP
+And vice versa:
+.IP
+.in +4n
+.EX
+quot = cyc >> time_shift;
+rem = cyc & (((u64)1 << time_shift) \- 1);
+timestamp = time_zero + quot * time_mult +
+ ((rem * time_mult) >> time_shift);
+.EE
+.in
+.TP
+.I data_head
+This points to the head of the data section.
+The value continuously increases, it does not wrap.
+The value needs to be manually wrapped by the size of the mmap buffer
+before accessing the samples.
+.IP
+On SMP-capable platforms, after reading the
+.I data_head
+value,
+user space should issue an rmb().
+.TP
+.I data_tail
+When the mapping is
+.BR PROT_WRITE ,
+the
+.I data_tail
+value should be written by user space to reflect the last read data.
+In this case, the kernel will not overwrite unread data.
+.TP
+.IR data_offset " (since Linux 4.1)"
+.\" commit e8c6deac69629c0cb97c3d3272f8631ef17f8f0f
+Contains the offset of the location in the mmap buffer
+where perf sample data begins.
+.TP
+.IR data_size " (since Linux 4.1)"
+.\" commit e8c6deac69629c0cb97c3d3272f8631ef17f8f0f
+Contains the size of the perf sample region within
+the mmap buffer.
+.TP
+.IR aux_head ", " aux_tail ", " aux_offset ", " aux_size " (since Linux 4.1)"
+.\" commit 45bfb2e50471abbbfd83d40d28c986078b0d24ff
+The AUX region allows
+.BR mmap (2)-ing
+a separate sample buffer for
+high-bandwidth data streams (separate from the main perf sample buffer).
+An example of a high-bandwidth stream is instruction tracing support,
+as is found in newer Intel processors.
+.IP
+To set up an AUX area, first
+.I aux_offset
+needs to be set with an offset greater than
+.IR data_offset + data_size
+and
+.I aux_size
+needs to be set to the desired buffer size.
+The desired offset and size must be page aligned, and the size
+must be a power of two.
+These values are then passed to mmap in order to map the AUX buffer.
+Pages in the AUX buffer are included as part of the
+.B RLIMIT_MEMLOCK
+resource limit (see
+.BR setrlimit (2)),
+and also as part of the
+.I perf_event_mlock_kb
+allowance.
+.IP
+By default, the AUX buffer will be truncated if it will not fit
+in the available space in the ring buffer.
+If the AUX buffer is mapped as a read only buffer, then it will
+operate in ring buffer mode where old data will be overwritten
+by new.
+In overwrite mode, it might not be possible to infer where the
+new data began, and it is the consumer's job to disable
+measurement while reading to avoid possible data races.
+.IP
+The
+.I aux_head
+and
+.I aux_tail
+ring buffer pointers have the same behavior and ordering
+rules as the previously described
+.I data_head
+and
+.IR data_tail .
+.PP
+The following 2^n ring-buffer pages have the layout described below.
+.PP
+If
+.I perf_event_attr.sample_id_all
+is set, then all event types will
+have the sample_type selected fields related to where/when (identity)
+an event took place (TID, TIME, ID, CPU, STREAM_ID) described in
+.B PERF_RECORD_SAMPLE
+below; they will be stashed just after the
+.I perf_event_header
+and the fields already present for the existing
+fields, that is, at the end of the payload.
+This allows a newer perf.data
+file to be supported by older perf tools, with the new optional
+fields being ignored.
+.PP
+The mmap values start with a header:
+.PP
+.in +4n
+.EX
+struct perf_event_header {
+ __u32 type;
+ __u16 misc;
+ __u16 size;
+};
+.EE
+.in
+.PP
+Below, we describe the
+.I perf_event_header
+fields in more detail.
+For ease of reading,
+the fields with shorter descriptions are presented first.
+.TP
+.I size
+This indicates the size of the record.
+.TP
+.I misc
+The
+.I misc
+field contains additional information about the sample.
+.IP
+The CPU mode can be determined from this value by masking with
+.B PERF_RECORD_MISC_CPUMODE_MASK
+and looking for one of the following (note these are not
+bit masks, only one can be set at a time):
+.RS
+.TP
+.B PERF_RECORD_MISC_CPUMODE_UNKNOWN
+Unknown CPU mode.
+.TP
+.B PERF_RECORD_MISC_KERNEL
+Sample happened in the kernel.
+.TP
+.B PERF_RECORD_MISC_USER
+Sample happened in user code.
+.TP
+.B PERF_RECORD_MISC_HYPERVISOR
+Sample happened in the hypervisor.
+.TP
+.BR PERF_RECORD_MISC_GUEST_KERNEL " (since Linux 2.6.35)"
+.\" commit 39447b386c846bbf1c56f6403c5282837486200f
+Sample happened in the guest kernel.
+.TP
+.BR PERF_RECORD_MISC_GUEST_USER " (since Linux 2.6.35)"
+.\" commit 39447b386c846bbf1c56f6403c5282837486200f
+Sample happened in guest user code.
+.RE
+.PP
+.RS
+Since the following three statuses are generated by
+different record types, they alias to the same bit:
+.TP
+.BR PERF_RECORD_MISC_MMAP_DATA " (since Linux 3.10)"
+.\" commit 2fe85427e3bf65d791700d065132772fc26e4d75
+This is set when the mapping is not executable;
+otherwise the mapping is executable.
+.TP
+.BR PERF_RECORD_MISC_COMM_EXEC " (since Linux 3.16)"
+.\" commit 82b897782d10fcc4930c9d4a15b175348fdd2871
+This is set for a
+.B PERF_RECORD_COMM
+record on kernels more recent than Linux 3.16
+if a process name change was caused by an
+.BR execve (2)
+system call.
+.TP
+.BR PERF_RECORD_MISC_SWITCH_OUT " (since Linux 4.3)"
+.\" commit 45ac1403f564f411c6a383a2448688ba8dd705a4
+When a
+.B PERF_RECORD_SWITCH
+or
+.B PERF_RECORD_SWITCH_CPU_WIDE
+record is generated, this bit indicates that the
+context switch is away from the current process
+(instead of into the current process).
+.RE
+.PP
+.RS
+In addition, the following bits can be set:
+.TP
+.B PERF_RECORD_MISC_EXACT_IP
+This indicates that the content of
+.B PERF_SAMPLE_IP
+points
+to the actual instruction that triggered the event.
+See also
+.IR perf_event_attr.precise_ip .
+.TP
+.BR PERF_RECORD_MISC_SWITCH_OUT_PREEMPT " (since Linux 4.17)"
+.\" commit 101592b4904ecf6b8ed2a4784d41d180319d95a1
+When a
+.B PERF_RECORD_SWITCH
+or
+.B PERF_RECORD_SWITCH_CPU_WIDE
+record is generated,
+this indicates the context switch was a preemption.
+.TP
+.BR PERF_RECORD_MISC_MMAP_BUILD_ID " (since Linux 5.12)"
+.\" commit 88a16a1309333e43d328621ece3e9fa37027e8eb
+This indicates that the content of
+.B PERF_RECORD_MMAP2
+contains build-ID data instead of device major and minor numbers
+as well as the inode number.
+.TP
+.BR PERF_RECORD_MISC_EXT_RESERVED " (since Linux 2.6.35)"
+.\" commit 1676b8a077c352085d52578fb4f29350b58b6e74
+This indicates there is extended data available (currently not used).
+.TP
+.B PERF_RECORD_MISC_PROC_MAP_PARSE_TIMEOUT
+.\" commit 930e6fcd2bcce9bcd9d4aa7e755678d33f3fe6f4
+This bit is not set by the kernel.
+It is reserved for the user-space perf utility to indicate that
+.IR /proc/ pid /maps
+parsing was taking too long and was stopped, and thus the mmap
+records may be truncated.
+.RE
+.TP
+.I type
+The
+.I type
+value is one of the below.
+The values in the corresponding record (that follows the header)
+depend on the
+.I type
+selected as shown.
+.RS
+.TP 4
+.B PERF_RECORD_MMAP
+The MMAP events record the
+.B PROT_EXEC
+mappings so that we can correlate
+user-space IPs to code.
+They have the following structure:
+.IP
+.in +4n
+.EX
+struct {
+ struct perf_event_header header;
+ u32 pid, tid;
+ u64 addr;
+ u64 len;
+ u64 pgoff;
+ char filename[];
+};
+.EE
+.in
+.RS
+.TP
+.I pid
+is the process ID.
+.TP
+.I tid
+is the thread ID.
+.TP
+.I addr
+is the address of the allocated memory.
+.TP
+.I len
+is the length of the allocated memory.
+.TP
+.I pgoff
+is the page offset of the allocated memory.
+.TP
+.I filename
+is a string describing the backing of the allocated memory.
+.RE
+.TP
+.B PERF_RECORD_LOST
+This record indicates when events are lost.
+.IP
+.in +4n
+.EX
+struct {
+ struct perf_event_header header;
+ u64 id;
+ u64 lost;
+ struct sample_id sample_id;
+};
+.EE
+.in
+.RS
+.TP
+.I id
+is the unique event ID for the samples that were lost.
+.TP
+.I lost
+is the number of events that were lost.
+.RE
+.TP
+.B PERF_RECORD_COMM
+This record indicates a change in the process name.
+.IP
+.in +4n
+.EX
+struct {
+ struct perf_event_header header;
+ u32 pid;
+ u32 tid;
+ char comm[];
+ struct sample_id sample_id;
+};
+.EE
+.in
+.RS
+.TP
+.I pid
+is the process ID.
+.TP
+.I tid
+is the thread ID.
+.TP
+.I comm
+is a string containing the new name of the process.
+.RE
+.TP
+.B PERF_RECORD_EXIT
+This record indicates a process exit event.
+.IP
+.in +4n
+.EX
+struct {
+ struct perf_event_header header;
+ u32 pid, ppid;
+ u32 tid, ptid;
+ u64 time;
+ struct sample_id sample_id;
+};
+.EE
+.in
+.TP
+.BR PERF_RECORD_THROTTLE ", " PERF_RECORD_UNTHROTTLE
+This record indicates a throttle/unthrottle event.
+.IP
+.in +4n
+.EX
+struct {
+ struct perf_event_header header;
+ u64 time;
+ u64 id;
+ u64 stream_id;
+ struct sample_id sample_id;
+};
+.EE
+.in
+.TP
+.B PERF_RECORD_FORK
+This record indicates a fork event.
+.IP
+.in +4n
+.EX
+struct {
+ struct perf_event_header header;
+ u32 pid, ppid;
+ u32 tid, ptid;
+ u64 time;
+ struct sample_id sample_id;
+};
+.EE
+.in
+.TP
+.B PERF_RECORD_READ
+This record indicates a read event.
+.IP
+.in +4n
+.EX
+struct {
+ struct perf_event_header header;
+ u32 pid, tid;
+ struct read_format values;
+ struct sample_id sample_id;
+};
+.EE
+.in
+.TP
+.B PERF_RECORD_SAMPLE
+This record indicates a sample.
+.IP
+.in +4n
+.EX
+struct {
+ struct perf_event_header header;
+ u64 sample_id; /* if PERF_SAMPLE_IDENTIFIER */
+ u64 ip; /* if PERF_SAMPLE_IP */
+ u32 pid, tid; /* if PERF_SAMPLE_TID */
+ u64 time; /* if PERF_SAMPLE_TIME */
+ u64 addr; /* if PERF_SAMPLE_ADDR */
+ u64 id; /* if PERF_SAMPLE_ID */
+ u64 stream_id; /* if PERF_SAMPLE_STREAM_ID */
+ u32 cpu, res; /* if PERF_SAMPLE_CPU */
+ u64 period; /* if PERF_SAMPLE_PERIOD */
+ struct read_format v;
+ /* if PERF_SAMPLE_READ */
+ u64 nr; /* if PERF_SAMPLE_CALLCHAIN */
+ u64 ips[nr]; /* if PERF_SAMPLE_CALLCHAIN */
+ u32 size; /* if PERF_SAMPLE_RAW */
+ char data[size]; /* if PERF_SAMPLE_RAW */
+ u64 bnr; /* if PERF_SAMPLE_BRANCH_STACK */
+ struct perf_branch_entry lbr[bnr];
+ /* if PERF_SAMPLE_BRANCH_STACK */
+ u64 abi; /* if PERF_SAMPLE_REGS_USER */
+ u64 regs[weight(mask)];
+ /* if PERF_SAMPLE_REGS_USER */
+ u64 size; /* if PERF_SAMPLE_STACK_USER */
+ char data[size]; /* if PERF_SAMPLE_STACK_USER */
+ u64 dyn_size; /* if PERF_SAMPLE_STACK_USER &&
+ size != 0 */
+ union perf_sample_weight weight;
+ /* if PERF_SAMPLE_WEIGHT */
+ /* || PERF_SAMPLE_WEIGHT_STRUCT */
+ u64 data_src; /* if PERF_SAMPLE_DATA_SRC */
+ u64 transaction; /* if PERF_SAMPLE_TRANSACTION */
+ u64 abi; /* if PERF_SAMPLE_REGS_INTR */
+ u64 regs[weight(mask)];
+ /* if PERF_SAMPLE_REGS_INTR */
+ u64 phys_addr; /* if PERF_SAMPLE_PHYS_ADDR */
+ u64 cgroup; /* if PERF_SAMPLE_CGROUP */
+ u64 data_page_size;
+ /* if PERF_SAMPLE_DATA_PAGE_SIZE */
+ u64 code_page_size;
+ /* if PERF_SAMPLE_CODE_PAGE_SIZE */
+ u64 size; /* if PERF_SAMPLE_AUX */
+ char data[size]; /* if PERF_SAMPLE_AUX */
+};
+.EE
+.in
+.RS 4
+.TP 4
+.I sample_id
+If
+.B PERF_SAMPLE_IDENTIFIER
+is enabled, a 64-bit unique ID is included.
+This is a duplication of the
+.B PERF_SAMPLE_ID
+.I id
+value, but included at the beginning of the sample
+so parsers can easily obtain the value.
+.TP
+.I ip
+If
+.B PERF_SAMPLE_IP
+is enabled, then a 64-bit instruction
+pointer value is included.
+.TP
+.IR pid ", " tid
+If
+.B PERF_SAMPLE_TID
+is enabled, then a 32-bit process ID
+and 32-bit thread ID are included.
+.TP
+.I time
+If
+.B PERF_SAMPLE_TIME
+is enabled, then a 64-bit timestamp
+is included.
+This is obtained via local_clock() which is a hardware timestamp
+if available and the jiffies value if not.
+.TP
+.I addr
+If
+.B PERF_SAMPLE_ADDR
+is enabled, then a 64-bit address is included.
+This is usually the address of a tracepoint,
+breakpoint, or software event; otherwise the value is 0.
+.TP
+.I id
+If
+.B PERF_SAMPLE_ID
+is enabled, a 64-bit unique ID is included.
+If the event is a member of an event group, the group leader ID is returned.
+This ID is the same as the one returned by
+.BR PERF_FORMAT_ID .
+.TP
+.I stream_id
+If
+.B PERF_SAMPLE_STREAM_ID
+is enabled, a 64-bit unique ID is included.
+Unlike
+.B PERF_SAMPLE_ID
+the actual ID is returned, not the group leader.
+This ID is the same as the one returned by
+.BR PERF_FORMAT_ID .
+.TP
+.IR cpu ", " res
+If
+.B PERF_SAMPLE_CPU
+is enabled, this is a 32-bit value indicating
+which CPU was being used, in addition to a reserved (unused)
+32-bit value.
+.TP
+.I period
+If
+.B PERF_SAMPLE_PERIOD
+is enabled, a 64-bit value indicating
+the current sampling period is written.
+.TP
+.I v
+If
+.B PERF_SAMPLE_READ
+is enabled, a structure of type read_format
+is included which has values for all events in the event group.
+The values included depend on the
+.I read_format
+value used at
+.BR perf_event_open ()
+time.
+.TP
+.IR nr ", " ips[nr]
+If
+.B PERF_SAMPLE_CALLCHAIN
+is enabled, then a 64-bit number is included
+which indicates how many 64-bit instruction pointers will
+follow.
+This is the current callchain.
+.TP
+.IR size ", " data[size]
+If
+.B PERF_SAMPLE_RAW
+is enabled, then a 32-bit value indicating size
+is included followed by an array of 8-bit values of length size.
+The values are padded with 0 to have 64-bit alignment.
+.IP
+This RAW record data is opaque with respect to the ABI.
+The ABI doesn't make any promises with respect to the stability
+of its content, it may vary depending
+on event, hardware, and kernel version.
+.TP
+.IR bnr ", " lbr[bnr]
+If
+.B PERF_SAMPLE_BRANCH_STACK
+is enabled, then a 64-bit value indicating
+the number of records is included, followed by
+.I bnr
+.I perf_branch_entry
+structures which each include the fields:
+.RS
+.TP
+.I from
+This indicates the source instruction (may not be a branch).
+.TP
+.I to
+The branch target.
+.TP
+.I mispred
+The branch target was mispredicted.
+.TP
+.I predicted
+The branch target was predicted.
+.TP
+.IR in_tx " (since Linux 3.11)"
+.\" commit 135c5612c460f89657c4698fe2ea753f6f667963
+The branch was in a transactional memory transaction.
+.TP
+.IR abort " (since Linux 3.11)"
+.\" commit 135c5612c460f89657c4698fe2ea753f6f667963
+The branch was in an aborted transactional memory transaction.
+.TP
+.IR cycles " (since Linux 4.3)"
+.\" commit 71ef3c6b9d4665ee7afbbe4c208a98917dcfc32f
+This reports the number of cycles elapsed since the
+previous branch stack update.
+.PP
+The entries are from most to least recent, so the first entry
+has the most recent branch.
+.PP
+Support for
+.IR mispred ,
+.IR predicted ,
+and
+.I cycles
+is optional; if not supported, those
+values will be 0.
+.PP
+The type of branches recorded is specified by the
+.I branch_sample_type
+field.
+.RE
+.TP
+.IR abi ", " regs[weight(mask)]
+If
+.B PERF_SAMPLE_REGS_USER
+is enabled, then the user CPU registers are recorded.
+.IP
+The
+.I abi
+field is one of
+.BR PERF_SAMPLE_REGS_ABI_NONE ,
+.BR PERF_SAMPLE_REGS_ABI_32 ,
+or
+.BR PERF_SAMPLE_REGS_ABI_64 .
+.IP
+The
+.I regs
+field is an array of the CPU registers that were specified by
+the
+.I sample_regs_user
+attr field.
+The number of values is the number of bits set in the
+.I sample_regs_user
+bit mask.
+.TP
+.IR size ", " data[size] ", " dyn_size
+If
+.B PERF_SAMPLE_STACK_USER
+is enabled, then the user stack is recorded.
+This can be used to generate stack backtraces.
+.I size
+is the size requested by the user in
+.I sample_stack_user
+or else the maximum record size.
+.I data
+is the stack data (a raw dump of the memory pointed to by the
+stack pointer at the time of sampling).
+.I dyn_size
+is the amount of data actually dumped (can be less than
+.IR size ).
+Note that
+.I dyn_size
+is omitted if
+.I size
+is 0.
+.TP
+.I weight
+If
+.B PERF_SAMPLE_WEIGHT
+or
+.B PERF_SAMPLE_WEIGHT_STRUCT
+is enabled, then a 64-bit value provided by the hardware
+is recorded that indicates how costly the event was.
+This allows expensive events to stand out more clearly
+in profiles.
+.TP
+.I data_src
+If
+.B PERF_SAMPLE_DATA_SRC
+is enabled, then a 64-bit value is recorded that is made up of
+the following fields:
+.RS
+.TP 4
+.I mem_op
+Type of opcode, a bitwise combination of:
+.IP
+.PD 0
+.RS
+.TP 24
+.B PERF_MEM_OP_NA
+Not available
+.TP
+.B PERF_MEM_OP_LOAD
+Load instruction
+.TP
+.B PERF_MEM_OP_STORE
+Store instruction
+.TP
+.B PERF_MEM_OP_PFETCH
+Prefetch
+.TP
+.B PERF_MEM_OP_EXEC
+Executable code
+.RE
+.PD
+.TP
+.I mem_lvl
+Memory hierarchy level hit or miss, a bitwise combination of
+the following, shifted left by
+.BR PERF_MEM_LVL_SHIFT :
+.IP
+.PD 0
+.RS
+.TP 24
+.B PERF_MEM_LVL_NA
+Not available
+.TP
+.B PERF_MEM_LVL_HIT
+Hit
+.TP
+.B PERF_MEM_LVL_MISS
+Miss
+.TP
+.B PERF_MEM_LVL_L1
+Level 1 cache
+.TP
+.B PERF_MEM_LVL_LFB
+Line fill buffer
+.TP
+.B PERF_MEM_LVL_L2
+Level 2 cache
+.TP
+.B PERF_MEM_LVL_L3
+Level 3 cache
+.TP
+.B PERF_MEM_LVL_LOC_RAM
+Local DRAM
+.TP
+.B PERF_MEM_LVL_REM_RAM1
+Remote DRAM 1 hop
+.TP
+.B PERF_MEM_LVL_REM_RAM2
+Remote DRAM 2 hops
+.TP
+.B PERF_MEM_LVL_REM_CCE1
+Remote cache 1 hop
+.TP
+.B PERF_MEM_LVL_REM_CCE2
+Remote cache 2 hops
+.TP
+.B PERF_MEM_LVL_IO
+I/O memory
+.TP
+.B PERF_MEM_LVL_UNC
+Uncached memory
+.RE
+.PD
+.TP
+.I mem_snoop
+Snoop mode, a bitwise combination of the following, shifted left by
+.BR PERF_MEM_SNOOP_SHIFT :
+.IP
+.PD 0
+.RS
+.TP 24
+.B PERF_MEM_SNOOP_NA
+Not available
+.TP
+.B PERF_MEM_SNOOP_NONE
+No snoop
+.TP
+.B PERF_MEM_SNOOP_HIT
+Snoop hit
+.TP
+.B PERF_MEM_SNOOP_MISS
+Snoop miss
+.TP
+.B PERF_MEM_SNOOP_HITM
+Snoop hit modified
+.RE
+.PD
+.TP
+.I mem_lock
+Lock instruction, a bitwise combination of the following, shifted left by
+.BR PERF_MEM_LOCK_SHIFT :
+.IP
+.PD 0
+.RS
+.TP 24
+.B PERF_MEM_LOCK_NA
+Not available
+.TP
+.B PERF_MEM_LOCK_LOCKED
+Locked transaction
+.RE
+.PD
+.TP
+.I mem_dtlb
+TLB access hit or miss, a bitwise combination of the following, shifted
+left by
+.BR PERF_MEM_TLB_SHIFT :
+.IP
+.PD 0
+.RS
+.TP 24
+.B PERF_MEM_TLB_NA
+Not available
+.TP
+.B PERF_MEM_TLB_HIT
+Hit
+.TP
+.B PERF_MEM_TLB_MISS
+Miss
+.TP
+.B PERF_MEM_TLB_L1
+Level 1 TLB
+.TP
+.B PERF_MEM_TLB_L2
+Level 2 TLB
+.TP
+.B PERF_MEM_TLB_WK
+Hardware walker
+.TP
+.B PERF_MEM_TLB_OS
+OS fault handler
+.RE
+.PD
+.RE
+.TP
+.I transaction
+If the
+.B PERF_SAMPLE_TRANSACTION
+flag is set, then a 64-bit field is recorded describing
+the sources of any transactional memory aborts.
+.IP
+The field is a bitwise combination of the following values:
+.RS
+.TP
+.B PERF_TXN_ELISION
+Abort from an elision type transaction (Intel-CPU-specific).
+.TP
+.B PERF_TXN_TRANSACTION
+Abort from a generic transaction.
+.TP
+.B PERF_TXN_SYNC
+Synchronous abort (related to the reported instruction).
+.TP
+.B PERF_TXN_ASYNC
+Asynchronous abort (not related to the reported instruction).
+.TP
+.B PERF_TXN_RETRY
+Retryable abort (retrying the transaction may have succeeded).
+.TP
+.B PERF_TXN_CONFLICT
+Abort due to memory conflicts with other threads.
+.TP
+.B PERF_TXN_CAPACITY_WRITE
+Abort due to write capacity overflow.
+.TP
+.B PERF_TXN_CAPACITY_READ
+Abort due to read capacity overflow.
+.RE
+.IP
+In addition, a user-specified abort code can be obtained from
+the high 32 bits of the field by shifting right by
+.B PERF_TXN_ABORT_SHIFT
+and masking with the value
+.BR PERF_TXN_ABORT_MASK .
+.TP
+.IR abi ", " regs[weight(mask)]
+If
+.B PERF_SAMPLE_REGS_INTR
+is enabled, then the CPU registers at the time of the interrupt are recorded.
+.IP
+The
+.I abi
+field is one of
+.BR PERF_SAMPLE_REGS_ABI_NONE ,
+.BR PERF_SAMPLE_REGS_ABI_32 ,
+or
+.BR PERF_SAMPLE_REGS_ABI_64 .
+.IP
+The
+.I regs
+field is an array of the CPU registers that were specified by
+the
+.I sample_regs_intr
+attr field.
+The number of values is the number of bits set in the
+.I sample_regs_intr
+bit mask.
+.TP
+.I phys_addr
+If the
+.B PERF_SAMPLE_PHYS_ADDR
+flag is set, then the 64-bit physical address is recorded.
+.TP
+.I cgroup
+If the
+.B PERF_SAMPLE_CGROUP
+flag is set,
+then the 64-bit cgroup ID (for the perf_event subsystem) is recorded.
+To get the pathname of the cgroup, the ID should match to one in a
+.BR PERF_RECORD_CGROUP .
+.TP
+.I data_page_size
+If the
+.B PERF_SAMPLE_DATA_PAGE_SIZE
+flag is set,
+then the 64-bit page size value of the
+.B data
+address is recorded.
+.TP
+.I code_page_size
+If the
+.B PERF_SAMPLE_CODE_PAGE_SIZE
+flag is set,
+then the 64-bit page size value of the
+.B ip
+address is recorded.
+.TP
+.I size
+.TQ
+.IR data [ size ]
+If
+.B PERF_SAMPLE_AUX
+is enabled,
+a snapshot of the aux buffer is recorded.
+.RE
+.TP
+.B PERF_RECORD_MMAP2
+This record includes extended information on
+.BR mmap (2)
+calls returning executable mappings.
+The format is similar to that of the
+.B PERF_RECORD_MMAP
+record, but includes extra values that allow uniquely identifying
+shared mappings.
+Depending on the
+.B PERF_RECORD_MISC_MMAP_BUILD_ID
+bit in the header,
+the extra values have different layout and meanings.
+.IP
+.in +4n
+.EX
+struct {
+ struct perf_event_header header;
+ u32 pid;
+ u32 tid;
+ u64 addr;
+ u64 len;
+ u64 pgoff;
+ union {
+ struct {
+ u32 maj;
+ u32 min;
+ u64 ino;
+ u64 ino_generation;
+ };
+ struct { /* if PERF_RECORD_MISC_MMAP_BUILD_ID */
+ u8 build_id_size;
+ u8 __reserved_1;
+ u16 __reserved_2;
+ u8 build_id[20];
+ };
+ };
+ u32 prot;
+ u32 flags;
+ char filename[];
+ struct sample_id sample_id;
+};
+.EE
+.in
+.RS
+.TP
+.I pid
+is the process ID.
+.TP
+.I tid
+is the thread ID.
+.TP
+.I addr
+is the address of the allocated memory.
+.TP
+.I len
+is the length of the allocated memory.
+.TP
+.I pgoff
+is the page offset of the allocated memory.
+.TP
+.I maj
+is the major ID of the underlying device.
+.TP
+.I min
+is the minor ID of the underlying device.
+.TP
+.I ino
+is the inode number.
+.TP
+.I ino_generation
+is the inode generation.
+.TP
+.I build_id_size
+is the actual size of
+.I build_id
+field (up to 20).
+.TP
+.I build_id
+is raw data identifying a binary.
+.TP
+.I prot
+is the protection information.
+.TP
+.I flags
+is the flags information.
+.TP
+.I filename
+is a string describing the backing of the allocated memory.
+.RE
+.TP
+.BR PERF_RECORD_AUX " (since Linux 4.1)"
+.\" commit 68db7e98c3a6ebe7284b6cf14906ed7c55f3f7f0
+This record reports that new data is available in the separate
+AUX buffer region.
+.IP
+.in +4n
+.EX
+struct {
+ struct perf_event_header header;
+ u64 aux_offset;
+ u64 aux_size;
+ u64 flags;
+ struct sample_id sample_id;
+};
+.EE
+.in
+.RS
+.TP
+.I aux_offset
+offset in the AUX mmap region where the new data begins.
+.TP
+.I aux_size
+size of the data made available.
+.TP
+.I flags
+describes the AUX update.
+.RS
+.TP
+.B PERF_AUX_FLAG_TRUNCATED
+if set, then the data returned was truncated to fit the available
+buffer size.
+.TP
+.B PERF_AUX_FLAG_OVERWRITE
+.\" commit 2023a0d2829e521fe6ad6b9907f3f90bfbf57142
+if set, then the data returned has overwritten previous data.
+.RE
+.RE
+.TP
+.BR PERF_RECORD_ITRACE_START " (since Linux 4.1)"
+.\" ec0d7729bbaed4b9d2d3fada693278e13a3d1368
+This record indicates which process has initiated an instruction
+trace event, allowing tools to properly correlate the instruction
+addresses in the AUX buffer with the proper executable.
+.IP
+.in +4n
+.EX
+struct {
+ struct perf_event_header header;
+ u32 pid;
+ u32 tid;
+};
+.EE
+.in
+.RS
+.TP
+.I pid
+process ID of the thread starting an instruction trace.
+.TP
+.I tid
+thread ID of the thread starting an instruction trace.
+.RE
+.TP
+.BR PERF_RECORD_LOST_SAMPLES " (since Linux 4.2)"
+.\" f38b0dbb491a6987e198aa6b428db8692a6480f8
+When using hardware sampling (such as Intel PEBS) this record
+indicates some number of samples that may have been lost.
+.IP
+.in +4n
+.EX
+struct {
+ struct perf_event_header header;
+ u64 lost;
+ struct sample_id sample_id;
+};
+.EE
+.in
+.RS
+.TP
+.I lost
+the number of potentially lost samples.
+.RE
+.TP
+.BR PERF_RECORD_SWITCH " (since Linux 4.3)"
+.\" commit 45ac1403f564f411c6a383a2448688ba8dd705a4
+This record indicates a context switch has happened.
+The
+.B PERF_RECORD_MISC_SWITCH_OUT
+bit in the
+.I misc
+field indicates whether it was a context switch into
+or away from the current process.
+.IP
+.in +4n
+.EX
+struct {
+ struct perf_event_header header;
+ struct sample_id sample_id;
+};
+.EE
+.in
+.TP
+.BR PERF_RECORD_SWITCH_CPU_WIDE " (since Linux 4.3)"
+.\" commit 45ac1403f564f411c6a383a2448688ba8dd705a4
+As with
+.B PERF_RECORD_SWITCH
+this record indicates a context switch has happened,
+but it only occurs when sampling in CPU-wide mode
+and provides additional information on the process
+being switched to/from.
+The
+.B PERF_RECORD_MISC_SWITCH_OUT
+bit in the
+.I misc
+field indicates whether it was a context switch into
+or away from the current process.
+.IP
+.in +4n
+.EX
+struct {
+ struct perf_event_header header;
+ u32 next_prev_pid;
+ u32 next_prev_tid;
+ struct sample_id sample_id;
+};
+.EE
+.in
+.RS
+.TP
+.I next_prev_pid
+The process ID of the previous (if switching in)
+or next (if switching out) process on the CPU.
+.TP
+.I next_prev_tid
+The thread ID of the previous (if switching in)
+or next (if switching out) thread on the CPU.
+.RE
+.TP
+.BR PERF_RECORD_NAMESPACES " (since Linux 4.11)"
+.\" commit e422267322cd319e2695a535e47c5b1feeac45eb
+This record includes various namespace information of a process.
+.IP
+.in +4n
+.EX
+struct {
+ struct perf_event_header header;
+ u32 pid;
+ u32 tid;
+ u64 nr_namespaces;
+ struct { u64 dev, inode } [nr_namespaces];
+ struct sample_id sample_id;
+};
+.EE
+.in
+.RS
+.TP
+.I pid
+is the process ID
+.TP
+.I tid
+is the thread ID
+.TP
+.I nr_namespaces
+is the number of namespaces in this record
+.RE
+.IP
+Each namespace has
+.I dev
+and
+.I inode
+fields and is recorded in the
+fixed position like below:
+.RS
+.TP
+.BR NET_NS_INDEX = 0
+Network namespace
+.TP
+.BR UTS_NS_INDEX = 1
+UTS namespace
+.TP
+.BR IPC_NS_INDEX = 2
+IPC namespace
+.TP
+.BR PID_NS_INDEX = 3
+PID namespace
+.TP
+.BR USER_NS_INDEX = 4
+User namespace
+.TP
+.BR MNT_NS_INDEX = 5
+Mount namespace
+.TP
+.BR CGROUP_NS_INDEX = 6
+Cgroup namespace
+.RE
+.TP
+.BR PERF_RECORD_KSYMBOL " (since Linux 5.0)"
+.\" commit 76193a94522f1d4edf2447a536f3f796ce56343b
+This record indicates kernel symbol register/unregister events.
+.IP
+.in +4n
+.EX
+struct {
+ struct perf_event_header header;
+ u64 addr;
+ u32 len;
+ u16 ksym_type;
+ u16 flags;
+ char name[];
+ struct sample_id sample_id;
+};
+.EE
+.in
+.RS
+.TP
+.I addr
+is the address of the kernel symbol.
+.TP
+.I len
+is the length of the kernel symbol.
+.TP
+.I ksym_type
+is the type of the kernel symbol.
+Currently the following types are available:
+.RS
+.TP
+.B PERF_RECORD_KSYMBOL_TYPE_BPF
+The kernel symbol is a BPF function.
+.RE
+.TP
+.I flags
+If the
+.B PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER
+is set, then this event is for unregistering the kernel symbol.
+.RE
+.TP
+.BR PERF_RECORD_BPF_EVENT " (since Linux 5.0)"
+.\" commit 6ee52e2a3fe4ea35520720736e6791df1fb67106
+This record indicates that a BPF program is loaded or unloaded.
+.IP
+.in +4n
+.EX
+struct {
+ struct perf_event_header header;
+ u16 type;
+ u16 flags;
+ u32 id;
+ u8 tag[BPF_TAG_SIZE];
+ struct sample_id sample_id;
+};
+.EE
+.in
+.RS
+.TP
+.I type
+is one of the following values:
+.RS
+.TP
+.B PERF_BPF_EVENT_PROG_LOAD
+A BPF program is loaded
+.TP
+.B PERF_BPF_EVENT_PROG_UNLOAD
+A BPF program is unloaded
+.RE
+.TP
+.I id
+is the ID of the BPF program.
+.TP
+.I tag
+is the tag of the BPF program.
+Currently,
+.B BPF_TAG_SIZE
+is defined as 8.
+.RE
+.TP
+.BR PERF_RECORD_CGROUP " (since Linux 5.7)"
+.\" commit 96aaab686505c449e24d76e76507290dcc30e008
+This record indicates a new cgroup is created and activated.
+.IP
+.in +4n
+.EX
+struct {
+ struct perf_event_header header;
+ u64 id;
+ char path[];
+ struct sample_id sample_id;
+};
+.EE
+.in
+.RS
+.TP
+.I id
+is the cgroup identifier.
+This can be also retrieved by
+.BR name_to_handle_at (2)
+on the cgroup path (as a file handle).
+.TP
+.I path
+is the path of the cgroup from the root.
+.RE
+.TP
+.BR PERF_RECORD_TEXT_POKE " (since Linux 5.8)"
+.\" commit e17d43b93e544f5016c0251d2074c15568d5d963
+This record indicates a change in the kernel text.
+This includes addition and removal of the text
+and the corresponding length is zero in this case.
+.IP
+.in +4n
+.EX
+struct {
+ struct perf_event_header header;
+ u64 addr;
+ u16 old_len;
+ u16 new_len;
+ u8 bytes[];
+ struct sample_id sample_id;
+};
+.EE
+.in
+.RS
+.TP
+.I addr
+is the address of the change
+.TP
+.I old_len
+is the old length
+.TP
+.I new_len
+is the new length
+.TP
+.I bytes
+contains old bytes immediately followed by new bytes.
+.RE
+.RE
+.SS Overflow handling
+Events can be set to notify when a threshold is crossed,
+indicating an overflow.
+Overflow conditions can be captured by monitoring the
+event file descriptor with
+.BR poll (2),
+.BR select (2),
+or
+.BR epoll (7).
+Alternatively, the overflow events can be captured via a signal handler,
+by enabling I/O signaling on the file descriptor; see the discussion of the
+.B F_SETOWN
+and
+.B F_SETSIG
+operations in
+.BR fcntl (2).
+.PP
+Overflows are generated only by sampling events
+.RI ( sample_period
+must have a nonzero value).
+.PP
+There are two ways to generate overflow notifications.
+.PP
+The first is to set a
+.I wakeup_events
+or
+.I wakeup_watermark
+value that will trigger if a certain number of samples
+or bytes have been written to the mmap ring buffer.
+In this case,
+.B POLL_IN
+is indicated.
+.PP
+The other way is by use of the
+.B PERF_EVENT_IOC_REFRESH
+ioctl.
+This ioctl adds to a counter that decrements each time the event overflows.
+When nonzero,
+.B POLL_IN
+is indicated, but
+once the counter reaches 0
+.B POLL_HUP
+is indicated and
+the underlying event is disabled.
+.PP
+Refreshing an event group leader refreshes all siblings and
+refreshing with a parameter of 0 currently enables infinite
+refreshes;
+these behaviors are unsupported and should not be relied on.
+.\" See https://lkml.org/lkml/2011/5/24/337
+.PP
+Starting with Linux 3.18,
+.\" commit 179033b3e064d2cd3f5f9945e76b0a0f0fbf4883
+.B POLL_HUP
+is indicated if the event being monitored is attached to a different
+process and that process exits.
+.SS rdpmc instruction
+Starting with Linux 3.4 on x86, you can use the
+.\" commit c7206205d00ab375839bd6c7ddb247d600693c09
+.I rdpmc
+instruction to get low-latency reads without having to enter the kernel.
+Note that using
+.I rdpmc
+is not necessarily faster than other methods for reading event values.
+.PP
+Support for this can be detected with the
+.I cap_usr_rdpmc
+field in the mmap page; documentation on how
+to calculate event values can be found in that section.
+.PP
+Originally, when rdpmc support was enabled, any process (not just ones
+with an active perf event) could use the rdpmc instruction to access
+the counters.
+Starting with Linux 4.0,
+.\" 7911d3f7af14a614617e38245fedf98a724e46a9
+rdpmc support is only allowed if an event is currently enabled
+in a process's context.
+To restore the old behavior, write the value 2 to
+.IR /sys/devices/cpu/rdpmc .
+.SS perf_event ioctl calls
+Various ioctls act on
+.BR perf_event_open ()
+file descriptors:
+.TP
+.B PERF_EVENT_IOC_ENABLE
+This enables the individual event or event group specified by the
+file descriptor argument.
+.IP
+If the
+.B PERF_IOC_FLAG_GROUP
+bit is set in the ioctl argument, then all events in a group are
+enabled, even if the event specified is not the group leader
+(but see BUGS).
+.TP
+.B PERF_EVENT_IOC_DISABLE
+This disables the individual counter or event group specified by the
+file descriptor argument.
+.IP
+Enabling or disabling the leader of a group enables or disables the
+entire group; that is, while the group leader is disabled, none of the
+counters in the group will count.
+Enabling or disabling a member of a group other than the leader
+affects only that counter; disabling a non-leader
+stops that counter from counting but doesn't affect any other counter.
+.IP
+If the
+.B PERF_IOC_FLAG_GROUP
+bit is set in the ioctl argument, then all events in a group are
+disabled, even if the event specified is not the group leader
+(but see BUGS).
+.TP
+.B PERF_EVENT_IOC_REFRESH
+Non-inherited overflow counters can use this
+to enable a counter for a number of overflows specified by the argument,
+after which it is disabled.
+Subsequent calls of this ioctl add the argument value to the current
+count.
+An overflow notification with
+.B POLL_IN
+set will happen on each overflow until the
+count reaches 0; when that happens a notification with
+.B POLL_HUP
+set is sent and the event is disabled.
+Using an argument of 0 is considered undefined behavior.
+.TP
+.B PERF_EVENT_IOC_RESET
+Reset the event count specified by the
+file descriptor argument to zero.
+This resets only the counts; there is no way to reset the
+multiplexing
+.I time_enabled
+or
+.I time_running
+values.
+.IP
+If the
+.B PERF_IOC_FLAG_GROUP
+bit is set in the ioctl argument, then all events in a group are
+reset, even if the event specified is not the group leader
+(but see BUGS).
+.TP
+.B PERF_EVENT_IOC_PERIOD
+This updates the overflow period for the event.
+.IP
+Since Linux 3.7 (on ARM)
+.\" commit 3581fe0ef37ce12ac7a4f74831168352ae848edc
+and Linux 3.14 (all other architectures),
+.\" commit bad7192b842c83e580747ca57104dd51fe08c223
+the new period takes effect immediately.
+On older kernels, the new period did not take effect until
+after the next overflow.
+.IP
+The argument is a pointer to a 64-bit value containing the
+desired new period.
+.IP
+Prior to Linux 2.6.36,
+.\" commit ad0cf3478de8677f720ee06393b3147819568d6a
+this ioctl always failed due to a bug
+in the kernel.
+.TP
+.B PERF_EVENT_IOC_SET_OUTPUT
+This tells the kernel to report event notifications to the specified
+file descriptor rather than the default one.
+The file descriptors must all be on the same CPU.
+.IP
+The argument specifies the desired file descriptor, or \-1 if
+output should be ignored.
+.TP
+.BR PERF_EVENT_IOC_SET_FILTER " (since Linux 2.6.33)"
+.\" commit 6fb2915df7f0747d9044da9dbff5b46dc2e20830
+This adds an ftrace filter to this event.
+.IP
+The argument is a pointer to the desired ftrace filter.
+.TP
+.BR PERF_EVENT_IOC_ID " (since Linux 3.12)"
+.\" commit cf4957f17f2a89984915ea808876d9c82225b862
+This returns the event ID value for the given event file descriptor.
+.IP
+The argument is a pointer to a 64-bit unsigned integer
+to hold the result.
+.TP
+.BR PERF_EVENT_IOC_SET_BPF " (since Linux 4.1)"
+.\" commit 2541517c32be2531e0da59dfd7efc1ce844644f5
+This allows attaching a Berkeley Packet Filter (BPF)
+program to an existing kprobe tracepoint event.
+You need
+.B CAP_PERFMON
+(since Linux 5.8) or
+.B CAP_SYS_ADMIN
+privileges to use this ioctl.
+.IP
+The argument is a BPF program file descriptor that was created by
+a previous
+.BR bpf (2)
+system call.
+.TP
+.BR PERF_EVENT_IOC_PAUSE_OUTPUT " (since Linux 4.7)"
+.\" commit 86e7972f690c1017fd086cdfe53d8524e68c661c
+This allows pausing and resuming the event's ring-buffer.
+A paused ring-buffer does not prevent generation of samples,
+but simply discards them.
+The discarded samples are considered lost, and cause a
+.B PERF_RECORD_LOST
+sample to be generated when possible.
+An overflow signal may still be triggered by the discarded sample
+even though the ring-buffer remains empty.
+.IP
+The argument is an unsigned 32-bit integer.
+A nonzero value pauses the ring-buffer, while a
+zero value resumes the ring-buffer.
+.TP
+.BR PERF_EVENT_IOC_MODIFY_ATTRIBUTES " (since Linux 4.17)"
+.\" commit 32ff77e8cc9e66cc4fb38098f64fd54cc8f54573
+This allows modifying an existing event without the overhead
+of closing and reopening a new event.
+Currently this is supported only for breakpoint events.
+.IP
+The argument is a pointer to a
+.I perf_event_attr
+structure containing the updated event settings.
+.TP
+.BR PERF_EVENT_IOC_QUERY_BPF " (since Linux 4.16)"
+.\" commit f371b304f12e31fe30207c41ca7754564e0ea4dc
+This allows querying which Berkeley Packet Filter (BPF)
+programs are attached to an existing kprobe tracepoint.
+You can only attach one BPF program per event, but you can
+have multiple events attached to a tracepoint.
+Querying this value on one tracepoint event returns the IDs
+of all BPF programs in all events attached to the tracepoint.
+You need
+.B CAP_PERFMON
+(since Linux 5.8) or
+.B CAP_SYS_ADMIN
+privileges to use this ioctl.
+.IP
+The argument is a pointer to a structure
+.in +4n
+.EX
+struct perf_event_query_bpf {
+ __u32 ids_len;
+ __u32 prog_cnt;
+ __u32 ids[0];
+};
+.EE
+.in
+.IP
+The
+.I ids_len
+field indicates the number of ids that can fit in the provided
+.I ids
+array.
+The
+.I prog_cnt
+value is filled in by the kernel with the number of attached
+BPF programs.
+The
+.I ids
+array is filled with the ID of each attached BPF program.
+If there are more programs than will fit in the array, then the
+kernel will return
+.B ENOSPC
+and
+.I ids_len
+will indicate the number of program IDs that were successfully copied.
+.\"
+.SS Using prctl(2)
+A process can enable or disable all currently open event groups
+using the
+.BR prctl (2)
+.B PR_TASK_PERF_EVENTS_ENABLE
+and
+.B PR_TASK_PERF_EVENTS_DISABLE
+operations.
+This applies only to events created locally by the calling process.
+This does not apply to events created by other processes attached
+to the calling process or inherited events from a parent process.
+Only group leaders are enabled and disabled,
+not any other members of the groups.
+.SS perf_event related configuration files
+Files in
+.I /proc/sys/kernel/
+.RS 4
+.TP
+.I /proc/sys/kernel/perf_event_paranoid
+The
+.I perf_event_paranoid
+file can be set to restrict access to the performance counters.
+.IP
+.PD 0
+.RS
+.TP
+.B 2
+allow only user-space measurements (default since Linux 4.6).
+.\" default changed in commit 0161028b7c8aebef64194d3d73e43bc3b53b5c66
+.TP
+.B 1
+allow both kernel and user measurements (default before Linux 4.6).
+.TP
+.B 0
+allow access to CPU-specific data but not raw tracepoint samples.
+.TP
+.B \-1
+no restrictions.
+.RE
+.PD
+.IP
+The existence of the
+.I perf_event_paranoid
+file is the official method for determining if a kernel supports
+.BR perf_event_open ().
+.TP
+.I /proc/sys/kernel/perf_event_max_sample_rate
+This sets the maximum sample rate.
+Setting this too high can allow
+users to sample at a rate that impacts overall machine performance
+and potentially lock up the machine.
+The default value is
+100000 (samples per second).
+.TP
+.I /proc/sys/kernel/perf_event_max_stack
+.\" Introduced in c5dfd78eb79851e278b7973031b9ca363da87a7e
+This file sets the maximum depth of stack frame entries reported
+when generating a call trace.
+.TP
+.I /proc/sys/kernel/perf_event_mlock_kb
+Maximum amount of memory (in kilobytes) that an unprivileged user can
+.BR mlock (2).
+The default is 516 (kB).
+.RE
+.PP
+Files in
+.I /sys/bus/event_source/devices/
+.PP
+.RS 4
+Since Linux 2.6.34, the kernel supports having multiple PMUs
+available for monitoring.
+Information on how to program these PMUs can be found under
+.IR /sys/bus/event_source/devices/ .
+Each subdirectory corresponds to a different PMU.
+.TP
+.IR /sys/bus/event_source/devices/*/type " (since Linux 2.6.38)"
+.\" commit abe43400579d5de0078c2d3a760e6598e183f871
+This contains an integer that can be used in the
+.I type
+field of
+.I perf_event_attr
+to indicate that you wish to use this PMU.
+.TP
+.IR /sys/bus/event_source/devices/cpu/rdpmc " (since Linux 3.4)"
+.\" commit 0c9d42ed4cee2aa1dfc3a260b741baae8615744f
+If this file is 1, then direct user-space access to the
+performance counter registers is allowed via the rdpmc instruction.
+This can be disabled by echoing 0 to the file.
+.IP
+As of Linux 4.0
+.\" a66734297f78707ce39d756b656bfae861d53f62
+.\" 7911d3f7af14a614617e38245fedf98a724e46a9
+the behavior has changed, so that 1 now means only allow access
+to processes with active perf events, with 2 indicating the old
+allow-anyone-access behavior.
+.TP
+.IR /sys/bus/event_source/devices/*/format/ " (since Linux 3.4)"
+.\" commit 641cc938815dfd09f8fa1ec72deb814f0938ac33
+This subdirectory contains information on the architecture-specific
+subfields available for programming the various
+.I config
+fields in the
+.I perf_event_attr
+struct.
+.IP
+The content of each file is the name of the config field, followed
+by a colon, followed by a series of integer bit ranges separated by
+commas.
+For example, the file
+.I event
+may contain the value
+.I config1:1,6\-10,44
+which indicates that event is an attribute that occupies bits 1, 6\[en]10, and 44
+of
+.IR perf_event_attr::config1 .
+.TP
+.IR /sys/bus/event_source/devices/*/events/ " (since Linux 3.4)"
+.\" commit 641cc938815dfd09f8fa1ec72deb814f0938ac33
+This subdirectory contains files with predefined events.
+The contents are strings describing the event settings
+expressed in terms of the fields found in the previously mentioned
+.I ./format/
+directory.
+These are not necessarily complete lists of all events supported by
+a PMU, but usually a subset of events deemed useful or interesting.
+.IP
+The content of each file is a list of attribute names
+separated by commas.
+Each entry has an optional value (either hex or decimal).
+If no value is specified, then it is assumed to be a single-bit
+field with a value of 1.
+An example entry may look like this:
+.IR event=0x2,inv,ldlat=3 .
+.TP
+.I /sys/bus/event_source/devices/*/uevent
+This file is the standard kernel device interface
+for injecting hotplug events.
+.TP
+.IR /sys/bus/event_source/devices/*/cpumask " (since Linux 3.7)"
+.\" commit 314d9f63f385096580e9e2a06eaa0745d92fe4ac
+The
+.I cpumask
+file contains a comma-separated list of integers that
+indicate a representative CPU number for each socket (package)
+on the motherboard.
+This is needed when setting up uncore or northbridge events, as
+those PMUs present socket-wide events.
+.RE
+.SH RETURN VALUE
+On success,
+.BR perf_event_open ()
+returns the new file descriptor.
+On error, \-1 is returned and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+The errors returned by
+.BR perf_event_open ()
+can be inconsistent, and may
+vary across processor architectures and performance monitoring units.
+.TP
+.B E2BIG
+Returned if the
+.I perf_event_attr
+.I size
+value is too small
+(smaller than
+.BR PERF_ATTR_SIZE_VER0 ),
+too big (larger than the page size),
+or larger than the kernel supports and the extra bytes are not zero.
+When
+.B E2BIG
+is returned, the
+.I perf_event_attr
+.I size
+field is overwritten by the kernel to be the size of the structure
+it was expecting.
+.TP
+.B EACCES
+Returned when the requested event requires
+.B CAP_PERFMON
+(since Linux 5.8) or
+.B CAP_SYS_ADMIN
+permissions (or a more permissive perf_event paranoid setting).
+Some common cases where an unprivileged process
+may encounter this error:
+attaching to a process owned by a different user;
+monitoring all processes on a given CPU (i.e., specifying the
+.I pid
+argument as \-1);
+and not setting
+.I exclude_kernel
+when the paranoid setting requires it.
+.TP
+.B EBADF
+Returned if the
+.I group_fd
+file descriptor is not valid, or, if
+.B PERF_FLAG_PID_CGROUP
+is set,
+the cgroup file descriptor in
+.I pid
+is not valid.
+.TP
+.BR EBUSY " (since Linux 4.1)"
+.\" bed5b25ad9c8a2f5d735ef0bc746ec870c01c1b0
+Returned if another event already has exclusive
+access to the PMU.
+.TP
+.B EFAULT
+Returned if the
+.I attr
+pointer points at an invalid memory address.
+.TP
+.B EINTR
+Returned when trying to mix perf and ftrace handling
+for a uprobe.
+.TP
+.B EINVAL
+Returned if the specified event is invalid.
+There are many possible reasons for this.
+A non-exhaustive list:
+.I sample_freq
+is higher than the maximum setting;
+the
+.I cpu
+to monitor does not exist;
+.I read_format
+is out of range;
+.I sample_type
+is out of range;
+the
+.I flags
+value is out of range;
+.I exclusive
+or
+.I pinned
+set and the event is not a group leader;
+the event
+.I config
+values are out of range or set reserved bits;
+the generic event selected is not supported; or
+there is not enough room to add the selected event.
+.TP
+.B EMFILE
+Each opened event uses one file descriptor.
+If a large number of events are opened,
+the per-process limit on the number of open file descriptors will be reached,
+and no more events can be created.
+.TP
+.B ENODEV
+Returned when the event involves a feature not supported
+by the current CPU.
+.TP
+.B ENOENT
+Returned if the
+.I type
+setting is not valid.
+This error is also returned for
+some unsupported generic events.
+.TP
+.B ENOSPC
+Prior to Linux 3.3, if there was not enough room for the event,
+.\" commit aa2bc1ade59003a379ffc485d6da2d92ea3370a6
+.B ENOSPC
+was returned.
+In Linux 3.3, this was changed to
+.BR EINVAL .
+.B ENOSPC
+is still returned if you try to add more breakpoint events
+than supported by the hardware.
+.TP
+.B ENOSYS
+Returned if
+.B PERF_SAMPLE_STACK_USER
+is set in
+.I sample_type
+and it is not supported by hardware.
+.TP
+.B EOPNOTSUPP
+Returned if an event requiring a specific hardware feature is
+requested but there is no hardware support.
+This includes requesting low-skid events if not supported,
+branch tracing if it is not available, sampling if no PMU
+interrupt is available, and branch stacks for software events.
+.TP
+.BR EOVERFLOW " (since Linux 4.8)"
+.\" 97c79a38cd454602645f0470ffb444b3b75ce574
+Returned if
+.B PERF_SAMPLE_CALLCHAIN
+is requested and
+.I sample_max_stack
+is larger than the maximum specified in
+.IR /proc/sys/kernel/perf_event_max_stack .
+.TP
+.B EPERM
+Returned on many (but not all) architectures when an unsupported
+.IR exclude_hv ", " exclude_idle ", " exclude_user ", or " exclude_kernel
+setting is specified.
+.IP
+It can also happen, as with
+.BR EACCES ,
+when the requested event requires
+.B CAP_PERFMON
+(since Linux 5.8) or
+.B CAP_SYS_ADMIN
+permissions (or a more permissive perf_event paranoid setting).
+This includes setting a breakpoint on a kernel address,
+and (since Linux 3.13) setting a kernel function-trace tracepoint.
+.\" commit a4e95fc2cbb31d70a65beffeaf8773f881328c34
+.TP
+.B ESRCH
+Returned if attempting to attach to a process that does not exist.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+.BR perf_event_open ()
+was introduced in Linux 2.6.31 but was called
+.\" commit 0793a61d4df8daeac6492dbf8d2f3e5713caae5e
+.BR perf_counter_open ().
+It was renamed in Linux 2.6.32.
+.\" commit cdd6c482c9ff9c55475ee7392ec8f672eddb7be6
+.SH NOTES
+The official way of knowing if
+.BR perf_event_open ()
+support is enabled is checking
+for the existence of the file
+.IR /proc/sys/kernel/perf_event_paranoid .
+.PP
+.B CAP_PERFMON
+capability (since Linux 5.8) provides a secure approach to
+performance monitoring and observability operations in a system
+according to the principle of least privilege (POSIX IEEE 1003.1e).
+Accessing system performance monitoring and observability operations
+using
+.B CAP_PERFMON
+rather than the much more powerful
+.B CAP_SYS_ADMIN
+excludes chances to misuse credentials and makes operations more secure.
+.B CAP_SYS_ADMIN
+usage for secure system performance monitoring and observability
+is discouraged in favor of the
+.B CAP_PERFMON
+capability.
+.SH BUGS
+The
+.B F_SETOWN_EX
+option to
+.BR fcntl (2)
+is needed to properly get overflow signals in threads.
+This was introduced in Linux 2.6.32.
+.\" commit ba0a6c9f6fceed11c6a99e8326f0477fe383e6b5
+.PP
+Prior to Linux 2.6.33 (at least for x86),
+.\" commit b690081d4d3f6a23541493f1682835c3cd5c54a1
+the kernel did not check
+if events could be scheduled together until read time.
+The same happens on all known kernels if the NMI watchdog is enabled.
+This means to see if a given set of events works you have to
+.BR perf_event_open (),
+start, then read before you know for sure you
+can get valid measurements.
+.PP
+Prior to Linux 2.6.34,
+.\" FIXME . cannot find a kernel commit for this one
+event constraints were not enforced by the kernel.
+In that case, some events would silently return "0" if the kernel
+scheduled them in an improper counter slot.
+.PP
+Prior to Linux 2.6.34, there was a bug when multiplexing where the
+wrong results could be returned.
+.\" commit 45e16a6834b6af098702e5ea6c9a40de42ff77d8
+.PP
+From Linux 2.6.35 to Linux 2.6.39, the kernel can quickly crash if
+"inherit" is enabled and many threads are started.
+.\" commit 38b435b16c36b0d863efcf3f07b34a6fac9873fd
+.PP
+Prior to Linux 2.6.35,
+.\" commit 050735b08ca8a016bbace4445fa025b88fee770b
+.B PERF_FORMAT_GROUP
+did not work with attached processes.
+.PP
+There is a bug in the kernel code between
+Linux 2.6.36 and Linux 3.0 that ignores the
+"watermark" field and acts as if a wakeup_event
+was chosen if the union has a
+nonzero value in it.
+.\" commit 4ec8363dfc1451f8c8f86825731fe712798ada02
+.PP
+From Linux 2.6.31 to Linux 3.4, the
+.B PERF_IOC_FLAG_GROUP
+ioctl argument was broken and would repeatedly operate
+on the event specified rather than iterating across
+all sibling events in a group.
+.\" commit 724b6daa13e100067c30cfc4d1ad06629609dc4e
+.PP
+From Linux 3.4 to Linux 3.11, the mmap
+.\" commit fa7315871046b9a4c48627905691dbde57e51033
+.I cap_usr_rdpmc
+and
+.I cap_usr_time
+bits mapped to the same location.
+Code should migrate to the new
+.I cap_user_rdpmc
+and
+.I cap_user_time
+fields instead.
+.PP
+Always double-check your results!
+Various generalized events have had wrong values.
+For example, retired branches measured
+the wrong thing on AMD machines until Linux 2.6.35.
+.\" commit f287d332ce835f77a4f5077d2c0ef1e3f9ea42d2
+.SH EXAMPLES
+The following is a short example that measures the total
+instruction count of a call to
+.BR printf (3).
+.PP
+.\" SRC BEGIN (perf_event_open.c)
+.EX
+#include <linux/perf_event.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+\&
+static long
+perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
+ int cpu, int group_fd, unsigned long flags)
+{
+ int ret;
+\&
+ ret = syscall(SYS_perf_event_open, hw_event, pid, cpu,
+ group_fd, flags);
+ return ret;
+}
+\&
+int
+main(void)
+{
+ int fd;
+ long long count;
+ struct perf_event_attr pe;
+\&
+ memset(&pe, 0, sizeof(pe));
+ pe.type = PERF_TYPE_HARDWARE;
+ pe.size = sizeof(pe);
+ pe.config = PERF_COUNT_HW_INSTRUCTIONS;
+ pe.disabled = 1;
+ pe.exclude_kernel = 1;
+ pe.exclude_hv = 1;
+\&
+ fd = perf_event_open(&pe, 0, \-1, \-1, 0);
+ if (fd == \-1) {
+ fprintf(stderr, "Error opening leader %llx\en", pe.config);
+ exit(EXIT_FAILURE);
+ }
+\&
+ ioctl(fd, PERF_EVENT_IOC_RESET, 0);
+ ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
+\&
+ printf("Measuring instruction count for this printf\en");
+\&
+ ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
+ read(fd, &count, sizeof(count));
+\&
+ printf("Used %lld instructions\en", count);
+\&
+ close(fd);
+}
+.EE
+.\" SRC END
+.SH SEE ALSO
+.BR perf (1),
+.BR fcntl (2),
+.BR mmap (2),
+.BR open (2),
+.BR prctl (2),
+.BR read (2)
+.PP
+.I Documentation/admin\-guide/perf\-security.rst
+in the kernel source tree
diff --git a/man2/perfmonctl.2 b/man2/perfmonctl.2
new file mode 100644
index 0000000..2155bb4
--- /dev/null
+++ b/man2/perfmonctl.2
@@ -0,0 +1,193 @@
+.\" Copyright (C) 2006 Red Hat, Inc. All Rights Reserved.
+.\" and Copyright (C) 2013 Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Written by Ivana Varekova <varekova@redhat.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.TH perfmonctl 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+perfmonctl \- interface to IA-64 performance monitoring unit
+.SH SYNOPSIS
+.nf
+.B #include <syscall.h>
+.B #include <perfmon.h>
+.PP
+.BI "long perfmonctl(int " fd ", int " cmd ", void " arg [. narg "], int " narg ");"
+.fi
+.PP
+.IR Note :
+There is no glibc wrapper for this system call; see HISTORY.
+.SH DESCRIPTION
+The IA-64-specific
+.BR perfmonctl ()
+system call provides an interface to the
+PMU (performance monitoring unit).
+The PMU consists of PMD (performance monitoring data) registers and
+PMC (performance monitoring control) registers,
+which gather hardware statistics.
+.PP
+.BR perfmonctl ()
+applies the operation
+.I cmd
+to the input arguments specified by
+.IR arg .
+The number of arguments is defined by \fInarg\fR.
+The
+.I fd
+argument specifies the perfmon context to operate on.
+.PP
+Supported values for
+.I cmd
+are:
+.TP
+.B PFM_CREATE_CONTEXT
+.nf
+.BI "perfmonctl(int " fd ", PFM_CREATE_CONTEXT, pfarg_context_t *" ctxt ", 1);"
+.fi
+Set up a context.
+.IP
+The
+.I fd
+parameter is ignored.
+A new perfmon context is created as specified in
+.I ctxt
+and its file descriptor is returned in \fIctxt\->ctx_fd\fR.
+.IP
+The file descriptor can be used in subsequent calls to
+.BR perfmonctl ()
+and can be used to read event notifications (type
+.IR pfm_msg_t )
+using
+.BR read (2).
+The file descriptor is pollable using
+.BR select (2),
+.BR poll (2),
+and
+.BR epoll (7).
+.IP
+The context can be destroyed by calling
+.BR close (2)
+on the file descriptor.
+.TP
+.B PFM_WRITE_PMCS
+.\" pfm_write_pmcs()
+.nf
+.BI "perfmonctl(int " fd ", PFM_WRITE_PMCS, pfarg_reg_t *" pmcs ", n);"
+.fi
+Set PMC registers.
+.TP
+.B PFM_WRITE_PMDS
+.nf
+.BI "perfmonctl(int " fd ", PFM_WRITE_PMDS, pfarg_reg_t *" pmds ", n);"
+.fi
+.\" pfm_write_pmds()
+Set PMD registers.
+.TP
+.B PFM_READ_PMDS
+.\" pfm_read_pmds()
+.nf
+.BI "perfmonctl(int " fd ", PFM_READ_PMDS, pfarg_reg_t *" pmds ", n);"
+.fi
+Read PMD registers.
+.TP
+.B PFM_START
+.\" pfm_start()
+.nf
+.\" .BI "perfmonctl(int " fd ", PFM_START, arg, 1);
+.BI "perfmonctl(int " fd ", PFM_START, NULL, 0);"
+.fi
+Start monitoring.
+.TP
+.B PFM_STOP
+.\" pfm_stop()
+.nf
+.BI "perfmonctl(int " fd ", PFM_STOP, NULL, 0);"
+.fi
+Stop monitoring.
+.TP
+.B PFM_LOAD_CONTEXT
+.\" pfm_context_load()
+.nf
+.BI "perfmonctl(int " fd ", PFM_LOAD_CONTEXT, pfarg_load_t *" largs ", 1);"
+.fi
+Attach the context to a thread.
+.TP
+.B PFM_UNLOAD_CONTEXT
+.\" pfm_context_unload()
+.nf
+.BI "perfmonctl(int " fd ", PFM_UNLOAD_CONTEXT, NULL, 0);"
+.fi
+Detach the context from a thread.
+.TP
+.B PFM_RESTART
+.\" pfm_restart()
+.nf
+.BI "perfmonctl(int " fd ", PFM_RESTART, NULL, 0);"
+.fi
+Restart monitoring after receiving an overflow notification.
+.TP
+.B PFM_GET_FEATURES
+.\" pfm_get_features()
+.nf
+.BI "perfmonctl(int " fd ", PFM_GET_FEATURES, pfarg_features_t *" arg ", 1);"
+.fi
+.TP
+.B PFM_DEBUG
+.\" pfm_debug()
+.nf
+.BI "perfmonctl(int " fd ", PFM_DEBUG, " val ", 0);"
+.fi
+If
+.I val
+is nonzero, enable debugging mode, otherwise disable.
+.TP
+.B PFM_GET_PMC_RESET_VAL
+.\" pfm_get_pmc_reset()
+.nf
+.BI "perfmonctl(int " fd ", PFM_GET_PMC_RESET_VAL, pfarg_reg_t *" req ", n);"
+.fi
+Reset PMC registers to default values.
+.\"
+.\"
+.\" .TP
+.\" .B PFM_CREATE_EVTSETS
+.\"
+.\" create or modify event sets
+.\" .nf
+.\" .BI "perfmonctl(int " fd ", PFM_CREATE_EVTSETS, pfarg_setdesc_t *desc , n);
+.\" .fi
+.\" .TP
+.\" .B PFM_DELETE_EVTSETS
+.\" delete event sets
+.\" .nf
+.\" .BI "perfmonctl(int " fd ", PFM_DELETE_EVTSET, pfarg_setdesc_t *desc , n);
+.\" .fi
+.\" .TP
+.\" .B PFM_GETINFO_EVTSETS
+.\" get information about event sets
+.\" .nf
+.\" .BI "perfmonctl(int " fd ", PFM_GETINFO_EVTSETS, pfarg_setinfo_t *info, n);
+.\" .fi
+.SH RETURN VALUE
+.BR perfmonctl ()
+returns zero when the operation is successful.
+On error, \-1 is returned and
+.I errno
+is set to indicate the error.
+.SH STANDARDS
+Linux on IA-64.
+.SH HISTORY
+Added in Linux 2.4;
+.\" commit ecf5b72d5f66af843f189dfe9ce31598c3e48ad7
+removed in Linux 5.10.
+.PP
+This system call was broken for many years,
+and ultimately removed in Linux 5.10.
+.PP
+glibc does not provide a wrapper for this system call;
+on kernels where it exists, call it using
+.BR syscall (2).
+.SH SEE ALSO
+.BR gprof (1)
+.PP
+The perfmon2 interface specification
diff --git a/man2/personality.2 b/man2/personality.2
new file mode 100644
index 0000000..e76af79
--- /dev/null
+++ b/man2/personality.2
@@ -0,0 +1,296 @@
+.\" Copyright (C) 1995, Thomas K. Dyas <tdyas@eden.rutgers.edu>
+.\" and Copyright (C) 2016, Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Created Sat Aug 21 1995 Thomas K. Dyas <tdyas@eden.rutgers.edu>
+.\"
+.\" typo corrected, aeb, 950825
+.\" added layout change from joey, 960722
+.\" changed prototype, documented 0xffffffff, aeb, 030101
+.\" Modified 2004-11-03 patch from Martin Schulze <joey@infodrom.org>
+.\"
+.TH personality 2 2023-04-29 "Linux man-pages 6.05.01"
+.SH NAME
+personality \- set the process execution domain
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/personality.h>
+.PP
+.BI "int personality(unsigned long " persona );
+.fi
+.SH DESCRIPTION
+Linux supports different execution domains, or personalities, for each
+process.
+Among other things, execution domains tell Linux how to map
+signal numbers into signal actions.
+The execution domain system allows
+Linux to provide limited support for binaries compiled under other
+UNIX-like operating systems.
+.PP
+If
+.I persona
+is not
+0xffffffff, then
+.BR personality ()
+sets the caller's execution domain to the value specified by
+.IR persona .
+Specifying
+.I persona
+as 0xffffffff provides a way of retrieving
+the current persona without changing it.
+.PP
+A list of the available execution domains can be found in
+.IR <sys/personality.h> .
+The execution domain is a 32-bit value in which the top three
+bytes are set aside for flags that cause the kernel to modify the
+behavior of certain system calls so as to emulate historical or
+architectural quirks.
+The least significant byte is a value defining the personality
+the kernel should assume.
+The flag values are as follows:
+.TP
+.BR ADDR_COMPAT_LAYOUT " (since Linux 2.6.9)"
+With this flag set, provide legacy virtual address space layout.
+.TP
+.BR ADDR_NO_RANDOMIZE " (since Linux 2.6.12)"
+With this flag set, disable address-space-layout randomization.
+.TP
+.BR ADDR_LIMIT_32BIT " (since Linux 2.2)"
+Limit the address space to 32 bits.
+.TP
+.BR ADDR_LIMIT_3GB " (since Linux 2.4.0)"
+With this flag set, use 0xc0000000 as the offset at which to search
+a virtual memory chunk on
+.BR mmap (2);
+otherwise use 0xffffe000.
+Applies to 32-bit x86 processes only.
+.TP
+.BR FDPIC_FUNCPTRS " (since Linux 2.6.11)"
+User-space function pointers to signal handlers point
+to descriptors.
+Applies only to ARM (if BINFMT_ELF_FDPIC is enabled) and SuperH.
+.TP
+.BR MMAP_PAGE_ZERO " (since Linux 2.4.0)"
+Map page 0 as read-only
+(to support binaries that depend on this SVr4 behavior).
+.TP
+.BR READ_IMPLIES_EXEC " (since Linux 2.6.8)"
+With this flag set,
+.B PROT_READ
+implies
+.B PROT_EXEC
+for
+.BR mmap (2).
+.TP
+.BR SHORT_INODE " (since Linux 2.4.0)"
+No effect.
+.TP
+.BR STICKY_TIMEOUTS " (since Linux 1.2.0)"
+With this flag set,
+.BR select (2),
+.BR pselect (2),
+and
+.BR ppoll (2)
+do not modify the returned timeout argument when
+interrupted by a signal handler.
+.TP
+.BR UNAME26 " (since Linux 3.1)"
+Have
+.BR uname (2)
+report a 2.6.(40+x) version number rather than a MAJOR.x version number.
+Added as a stopgap measure to support broken applications that
+could not handle the
+kernel version-numbering switch from Linux 2.6.x to Linux 3.x.
+.TP
+.BR WHOLE_SECONDS " (since Linux 1.2.0)"
+No effect.
+.PP
+The available execution domains are:
+.TP
+.BR PER_BSD " (since Linux 1.2.0)"
+BSD. (No effects.)
+.TP
+.BR PER_HPUX " (since Linux 2.4)"
+Support for 32-bit HP/UX.
+This support was never complete, and was dropped so that since Linux 4.0,
+this value has no effect.
+.TP
+.BR PER_IRIX32 " (since Linux 2.2)"
+IRIX 5 32-bit.
+Never fully functional; support dropped in Linux 2.6.27.
+Implies
+.BR STICKY_TIMEOUTS .
+.TP
+.BR PER_IRIX64 " (since Linux 2.2)"
+IRIX 6 64-bit.
+Implies
+.BR STICKY_TIMEOUTS ;
+otherwise no effect.
+.TP
+.BR PER_IRIXN32 " (since Linux 2.2)"
+IRIX 6 new 32-bit.
+Implies
+.BR STICKY_TIMEOUTS ;
+otherwise no effect.
+.TP
+.BR PER_ISCR4 " (since Linux 1.2.0)"
+Implies
+.BR STICKY_TIMEOUTS ;
+otherwise no effect.
+.TP
+.BR PER_LINUX " (since Linux 1.2.0)"
+Linux.
+.TP
+.BR PER_LINUX32 " (since Linux 2.2)"
+.BR uname (2)
+returns the name of the 32-bit architecture in the
+.I machine
+field ("i686" instead of "x86_64", &c.).
+.IP
+Under ia64 (Itanium), processes with this personality don't have the
+O_LARGEFILE
+.BR open (2)
+flag forced.
+.IP
+Under 64-bit ARM, setting this personality is forbidden if
+.BR execve (2)ing
+a 32-bit process would also be forbidden
+(cf. the allow_mismatched_32bit_el0 kernel parameter and
+.IR Documentation/arm64/asymmetric-32bit.rst ).
+.TP
+.BR PER_LINUX32_3GB " (since Linux 2.4)"
+Same as
+.BR PER_LINUX32 ,
+but implies
+.BR ADDR_LIMIT_3GB .
+.TP
+.BR PER_LINUX_32BIT " (since Linux 2.0)"
+Same as
+.BR PER_LINUX ,
+but implies
+.BR ADDR_LIMIT_32BIT .
+.TP
+.BR PER_LINUX_FDPIC " (since Linux 2.6.11)"
+Same as
+.BR PER_LINUX ,
+but implies
+.BR FDPIC_FUNCPTRS .
+.TP
+.BR PER_OSF4 " (since Linux 2.4)"
+OSF/1 v4.
+.\" commit 987f20a9dcce3989e48d87cff3952c095c994445
+No effect since Linux 6.1, which removed a.out binary support.
+Before that, on alpha,
+.\" Following is from a comment in arch/alpha/kernel/osf_sys.c
+it would clear the top 32 bits of iov_len in the user's buffer for
+compatibility with old versions of OSF/1 where iov_len
+was defined as
+.IR int .
+.TP
+.BR PER_OSR5 " (since Linux 2.4)"
+SCO OpenServer 5.
+Implies
+.B STICKY_TIMEOUTS
+and
+.BR WHOLE_SECONDS ;
+otherwise no effect.
+.TP
+.BR PER_RISCOS " (since Linux 2.3.7; macro since Linux 2.3.13)"
+Acorn RISC OS/Arthur (MIPS).
+No effect.
+.\" commit 125ec7b4e90cbae4eed5a7ff1ee479cc331dcf3c
+Up to Linux 4.0, it would set the emulation altroot to
+.I /usr/gnemul/riscos
+(cf.\&
+.BR PER_SUNOS ,
+below).
+Before then, up to Linux 2.6.3, just Arthur emulation.
+.TP
+.BR PER_SCOSVR3 " (since Linux 1.2.0)"
+SCO UNIX System V Release 3.
+Same as
+.BR PER_OSR5 ,
+but also implies
+.BR SHORT_INODE .
+.TP
+.BR PER_SOLARIS " (since Linux 2.4)"
+Solaris.
+Implies
+.BR STICKY_TIMEOUTS ;
+otherwise no effect.
+.TP
+.BR PER_SUNOS " (since Linux 2.4.0)"
+Sun OS.
+Same as
+.BR PER_BSD ,
+but implies
+.BR STICKY_TIMEOUTS .
+Prior to Linux 2.6.26,
+diverted library and dynamic linker searches to
+.IR /usr/gnemul .
+Buggy, largely unmaintained, and almost entirely unused.
+.TP
+.BR PER_SVR3 " (since Linux 1.2.0)"
+AT&T UNIX System V Release 3.
+Implies
+.B STICKY_TIMEOUTS
+and
+.BR SHORT_INODE ;
+otherwise no effect.
+.TP
+.BR PER_SVR4 " (since Linux 1.2.0)"
+AT&T UNIX System V Release 4.
+Implies
+.B STICKY_TIMEOUTS
+and
+.BR MMAP_PAGE_ZERO ;
+otherwise no effect.
+.TP
+.BR PER_UW7 " (since Linux 2.4)"
+UnixWare 7.
+Implies
+.B STICKY_TIMEOUTS
+and
+.BR MMAP_PAGE_ZERO ;
+otherwise no effect.
+.TP
+.BR PER_WYSEV386 " (since Linux 1.2.0)"
+WYSE UNIX System V/386.
+Implies
+.B STICKY_TIMEOUTS
+and
+.BR SHORT_INODE ;
+otherwise no effect.
+.TP
+.BR PER_XENIX " (since Linux 1.2.0)"
+XENIX.
+Implies
+.B STICKY_TIMEOUTS
+and
+.BR SHORT_INODE ;
+otherwise no effect.
+.SH RETURN VALUE
+On success, the previous
+.I persona
+is returned.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EINVAL
+The kernel was unable to change the personality.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 1.1.20,
+.\" (and thus first in a stable kernel release with Linux 1.2.0)
+glibc 2.3.
+.\" personality wrapper first appeared in glibc 1.90,
+.\" <sys/personality.h> was added later in glibc 2.2.91.
+.SH SEE ALSO
+.BR setarch (8)
diff --git a/man2/phys.2 b/man2/phys.2
new file mode 100644
index 0000000..5d25ea6
--- /dev/null
+++ b/man2/phys.2
@@ -0,0 +1 @@
+.so man2/unimplemented.2
diff --git a/man2/pidfd_getfd.2 b/man2/pidfd_getfd.2
new file mode 100644
index 0000000..9e3af2a
--- /dev/null
+++ b/man2/pidfd_getfd.2
@@ -0,0 +1,144 @@
+.\" Copyright (c) 2020 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.TH pidfd_getfd 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+pidfd_getfd \- obtain a duplicate of another process's file descriptor
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */"
+.B #include <unistd.h>
+.PP
+.BI "int syscall(SYS_pidfd_getfd, int " pidfd ", int " targetfd ,
+.BI " unsigned int " flags );
+.fi
+.PP
+.IR Note :
+glibc provides no wrapper for
+.BR pidfd_getfd (),
+necessitating the use of
+.BR syscall (2).
+.SH DESCRIPTION
+The
+.BR pidfd_getfd ()
+system call allocates a new file descriptor in the calling process.
+This new file descriptor is a duplicate of an existing file descriptor,
+.IR targetfd ,
+in the process referred to by the PID file descriptor
+.IR pidfd .
+.PP
+The duplicate file descriptor refers to the same open file description (see
+.BR open (2))
+as the original file descriptor in the process referred to by
+.IR pidfd .
+The two file descriptors thus share file status flags and file offset.
+Furthermore, operations on the underlying file object
+(for example, assigning an address to a socket object using
+.BR bind (2))
+can equally be performed via the duplicate file descriptor.
+.PP
+The close-on-exec flag
+.RB ( FD_CLOEXEC ;
+see
+.BR fcntl (2))
+is set on the file descriptor returned by
+.BR pidfd_getfd ().
+.PP
+The
+.I flags
+argument is reserved for future use.
+Currently, it must be specified as 0.
+.PP
+Permission to duplicate another process's file descriptor
+is governed by a ptrace access mode
+.B PTRACE_MODE_ATTACH_REALCREDS
+check (see
+.BR ptrace (2)).
+.SH RETURN VALUE
+On success,
+.BR pidfd_getfd ()
+returns a file descriptor (a nonnegative integer).
+On error, \-1 is returned and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EBADF
+.I pidfd
+is not a valid PID file descriptor.
+.TP
+.B EBADF
+.I targetfd
+is not an open file descriptor in the process referred to by
+.IR pidfd .
+.TP
+.B EINVAL
+.I flags
+is not 0.
+.TP
+.B EMFILE
+The per-process limit on the number of open file descriptors has been reached
+(see the description of
+.B RLIMIT_NOFILE
+in
+.BR getrlimit (2)).
+.TP
+.B ENFILE
+The system-wide limit on the total number of open files has been reached.
+.TP
+.B EPERM
+The calling process did not have
+.B PTRACE_MODE_ATTACH_REALCREDS
+permissions (see
+.BR ptrace (2))
+over the process referred to by
+.IR pidfd .
+.TP
+.B ESRCH
+The process referred to by
+.I pidfd
+does not exist
+(i.e., it has terminated and been waited on).
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 5.6.
+.\" commit 8649c322f75c96e7ced2fec201e123b2b073bf09
+.SH NOTES
+For a description of PID file descriptors, see
+.BR pidfd_open (2).
+.PP
+The effect of
+.BR pidfd_getfd ()
+is similar to the use of
+.B SCM_RIGHTS
+messages described in
+.BR unix (7),
+but differs in the following respects:
+.IP \[bu] 3
+In order to pass a file descriptor using an
+.B SCM_RIGHTS
+message,
+the two processes must first establish a UNIX domain socket connection.
+.IP \[bu]
+The use of
+.B SCM_RIGHTS
+requires cooperation on the part of the process whose
+file descriptor is being copied.
+By contrast, no such cooperation is necessary when using
+.BR pidfd_getfd ().
+.IP \[bu]
+The ability to use
+.BR pidfd_getfd ()
+is restricted by a
+.B PTRACE_MODE_ATTACH_REALCREDS
+ptrace access mode check.
+.SH SEE ALSO
+.BR clone3 (2),
+.BR dup (2),
+.BR kcmp (2),
+.BR pidfd_open (2)
diff --git a/man2/pidfd_open.2 b/man2/pidfd_open.2
new file mode 100644
index 0000000..8321e82
--- /dev/null
+++ b/man2/pidfd_open.2
@@ -0,0 +1,269 @@
+.\" Copyright (c) 2019 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.TH pidfd_open 2 2023-05-03 "Linux man-pages 6.05.01"
+.SH NAME
+pidfd_open \- obtain a file descriptor that refers to a process
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */"
+.B #include <unistd.h>
+.PP
+.BI "int syscall(SYS_pidfd_open, pid_t " pid ", unsigned int " flags );
+.fi
+.PP
+.IR Note :
+glibc provides no wrapper for
+.BR pidfd_open (),
+necessitating the use of
+.BR syscall (2).
+.SH DESCRIPTION
+The
+.BR pidfd_open ()
+system call creates a file descriptor that refers to
+the process whose PID is specified in
+.IR pid .
+The file descriptor is returned as the function result;
+the close-on-exec flag is set on the file descriptor.
+.PP
+The
+.I flags
+argument either has the value 0, or contains the following flag:
+.TP
+.BR PIDFD_NONBLOCK " (since Linux 5.10)"
+.\" commit 4da9af0014b51c8b015ed8c622440ef28912efe6
+Return a nonblocking file descriptor.
+If the process referred to by the file descriptor has not yet terminated,
+then an attempt to wait on the file descriptor using
+.BR waitid (2)
+will immediately return the error
+.B EAGAIN
+rather than blocking.
+.SH RETURN VALUE
+On success,
+.BR pidfd_open ()
+returns a file descriptor (a nonnegative integer).
+On error, \-1 is returned and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EINVAL
+.I flags
+is not valid.
+.TP
+.B EINVAL
+.I pid
+is not valid.
+.TP
+.B EMFILE
+The per-process limit on the number of open file descriptors has been reached
+(see the description of
+.B RLIMIT_NOFILE
+in
+.BR getrlimit (2)).
+.TP
+.B ENFILE
+The system-wide limit on the total number of open files has been reached.
+.TP
+.B ENODEV
+The anonymous inode filesystem is not available in this kernel.
+.TP
+.B ENOMEM
+Insufficient kernel memory was available.
+.TP
+.B ESRCH
+The process specified by
+.I pid
+does not exist.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 5.3.
+.SH NOTES
+The following code sequence can be used to obtain a file descriptor
+for the child of
+.BR fork (2):
+.PP
+.in +4n
+.EX
+pid = fork();
+if (pid > 0) { /* If parent */
+ pidfd = pidfd_open(pid, 0);
+ ...
+}
+.EE
+.in
+.PP
+Even if the child has already terminated by the time of the
+.BR pidfd_open ()
+call, its PID will not have been recycled and the returned
+file descriptor will refer to the resulting zombie process.
+Note, however, that this is guaranteed only if the following
+conditions hold true:
+.IP \[bu] 3
+the disposition of
+.B SIGCHLD
+has not been explicitly set to
+.B SIG_IGN
+(see
+.BR sigaction (2));
+.IP \[bu]
+the
+.B SA_NOCLDWAIT
+flag was not specified while establishing a handler for
+.B SIGCHLD
+or while setting the disposition of that signal to
+.B SIG_DFL
+(see
+.BR sigaction (2));
+and
+.IP \[bu]
+the zombie process was not reaped elsewhere in the program
+(e.g., either by an asynchronously executed signal handler or by
+.BR wait (2)
+or similar in another thread).
+.PP
+If any of these conditions does not hold,
+then the child process (along with a PID file descriptor that refers to it)
+should instead be created using
+.BR clone (2)
+with the
+.B CLONE_PIDFD
+flag.
+.\"
+.SS Use cases for PID file descriptors
+A PID file descriptor returned by
+.BR pidfd_open ()
+(or by
+.BR clone (2)
+with the
+.B CLONE_PIDFD
+flag) can be used for the following purposes:
+.IP \[bu] 3
+The
+.BR pidfd_send_signal (2)
+system call can be used to send a signal to the process referred to by
+a PID file descriptor.
+.IP \[bu]
+A PID file descriptor can be monitored using
+.BR poll (2),
+.BR select (2),
+and
+.BR epoll (7).
+When the process that it refers to terminates,
+these interfaces indicate the file descriptor as readable.
+Note, however, that in the current implementation,
+nothing can be read from the file descriptor
+.RB ( read (2)
+on the file descriptor fails with the error
+.BR EINVAL ).
+.IP \[bu]
+If the PID file descriptor refers to a child of the calling process,
+then it can be waited on using
+.BR waitid (2).
+.IP \[bu]
+The
+.BR pidfd_getfd (2)
+system call can be used to obtain a duplicate of a file descriptor
+of another process referred to by a PID file descriptor.
+.IP \[bu]
+A PID file descriptor can be used as the argument of
+.BR setns (2)
+in order to move into one or more of the same namespaces as the process
+referred to by the file descriptor.
+.IP \[bu]
+A PID file descriptor can be used as the argument of
+.BR process_madvise (2)
+in order to provide advice on the memory usage patterns of the process
+referred to by the file descriptor.
+.PP
+The
+.BR pidfd_open ()
+system call is the preferred way of obtaining a PID file descriptor
+for an already existing process.
+The alternative is to obtain a file descriptor by opening a
+.IR /proc/ pid
+directory.
+However, the latter technique is possible only if the
+.BR proc (5)
+filesystem is mounted;
+furthermore, the file descriptor obtained in this way is
+.I not
+pollable and can't be waited on with
+.BR waitid (2).
+.SH EXAMPLES
+The program below opens a PID file descriptor for the
+process whose PID is specified as its command-line argument.
+It then uses
+.BR poll (2)
+to monitor the file descriptor for process exit, as indicated by a
+.B POLLIN
+event.
+.\"
+.SS Program source
+\&
+.\" SRC BEGIN (pidfd_open.c)
+.EX
+#define _GNU_SOURCE
+#include <poll.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+\&
+static int
+pidfd_open(pid_t pid, unsigned int flags)
+{
+ return syscall(SYS_pidfd_open, pid, flags);
+}
+\&
+int
+main(int argc, char *argv[])
+{
+ int pidfd, ready;
+ struct pollfd pollfd;
+\&
+ if (argc != 2) {
+ fprintf(stderr, "Usage: %s <pid>\en", argv[0]);
+ exit(EXIT_SUCCESS);
+ }
+\&
+ pidfd = pidfd_open(atoi(argv[1]), 0);
+ if (pidfd == \-1) {
+ perror("pidfd_open");
+ exit(EXIT_FAILURE);
+ }
+\&
+ pollfd.fd = pidfd;
+ pollfd.events = POLLIN;
+\&
+ ready = poll(&pollfd, 1, \-1);
+ if (ready == \-1) {
+ perror("poll");
+ exit(EXIT_FAILURE);
+ }
+\&
+ printf("Events (%#x): POLLIN is %sset\en", pollfd.revents,
+ (pollfd.revents & POLLIN) ? "" : "not ");
+\&
+ close(pidfd);
+ exit(EXIT_SUCCESS);
+}
+.EE
+.\" SRC END
+.SH SEE ALSO
+.BR clone (2),
+.BR kill (2),
+.BR pidfd_getfd (2),
+.BR pidfd_send_signal (2),
+.BR poll (2),
+.BR process_madvise (2),
+.BR select (2),
+.BR setns (2),
+.BR waitid (2),
+.BR epoll (7)
diff --git a/man2/pidfd_send_signal.2 b/man2/pidfd_send_signal.2
new file mode 100644
index 0000000..670ea71
--- /dev/null
+++ b/man2/pidfd_send_signal.2
@@ -0,0 +1,240 @@
+.\" Copyright (c) 2019 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.TH pidfd_send_signal 2 2023-05-03 "Linux man-pages 6.05.01"
+.SH NAME
+pidfd_send_signal \- send a signal to a process specified by a file descriptor
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#include <linux/signal.h>" " /* Definition of " SIG* " constants */"
+.BR "#include <signal.h>" " /* Definition of " SI_* " constants */"
+.BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */"
+.B #include <unistd.h>
+.PP
+.BI "int syscall(SYS_pidfd_send_signal, int " pidfd ", int " sig ,
+.BI " siginfo_t *_Nullable " info ", unsigned int " flags );
+.fi
+.PP
+.IR Note :
+glibc provides no wrapper for
+.BR pidfd_send_signal (),
+necessitating the use of
+.BR syscall (2).
+.SH DESCRIPTION
+The
+.BR pidfd_send_signal ()
+system call sends the signal
+.I sig
+to the target process referred to by
+.IR pidfd ,
+a PID file descriptor that refers to a process.
+.\" See the very detailed commit message for kernel commit
+.\" 3eb39f47934f9d5a3027fe00d906a45fe3a15fad
+.PP
+If the
+.I info
+argument points to a
+.I siginfo_t
+buffer, that buffer should be populated as described in
+.BR rt_sigqueueinfo (2).
+.PP
+If the
+.I info
+argument is a NULL pointer,
+this is equivalent to specifying a pointer to a
+.I siginfo_t
+buffer whose fields match the values that are
+implicitly supplied when a signal is sent using
+.BR kill (2):
+.PP
+.PD 0
+.IP \[bu] 3
+.I si_signo
+is set to the signal number;
+.IP \[bu]
+.I si_errno
+is set to 0;
+.IP \[bu]
+.I si_code
+is set to
+.BR SI_USER ;
+.IP \[bu]
+.I si_pid
+is set to the caller's PID; and
+.IP \[bu]
+.I si_uid
+is set to the caller's real user ID.
+.PD
+.PP
+The calling process must either be in the same PID namespace as the
+process referred to by
+.IR pidfd ,
+or be in an ancestor of that namespace.
+.PP
+The
+.I flags
+argument is reserved for future use;
+currently, this argument must be specified as 0.
+.SH RETURN VALUE
+On success,
+.BR pidfd_send_signal ()
+returns 0.
+On error, \-1 is returned and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EBADF
+.I pidfd
+is not a valid PID file descriptor.
+.TP
+.B EINVAL
+.I sig
+is not a valid signal.
+.TP
+.B EINVAL
+The calling process is not in a PID namespace from which it can
+send a signal to the target process.
+.TP
+.B EINVAL
+.I flags
+is not 0.
+.TP
+.B EPERM
+The calling process does not have permission to send the signal
+to the target process.
+.TP
+.B EPERM
+.I pidfd
+doesn't refer to the calling process, and
+.I info.si_code
+is invalid (see
+.BR rt_sigqueueinfo (2)).
+.TP
+.B ESRCH
+The target process does not exist
+(i.e., it has terminated and been waited on).
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 5.1.
+.SH NOTES
+.SS PID file descriptors
+The
+.I pidfd
+argument is a PID file descriptor,
+a file descriptor that refers to a process.
+Such a file descriptor can be obtained in any of the following ways:
+.IP \[bu] 3
+by opening a
+.IR /proc/ pid
+directory;
+.IP \[bu]
+using
+.BR pidfd_open (2);
+or
+.IP \[bu]
+via the PID file descriptor that is returned by a call to
+.BR clone (2)
+or
+.BR clone3 (2)
+that specifies the
+.B CLONE_PIDFD
+flag.
+.PP
+The
+.BR pidfd_send_signal ()
+system call allows the avoidance of race conditions that occur
+when using traditional interfaces (such as
+.BR kill (2))
+to signal a process.
+The problem is that the traditional interfaces specify the target process
+via a process ID (PID),
+with the result that the sender may accidentally send a signal to
+the wrong process if the originally intended target process
+has terminated and its PID has been recycled for another process.
+By contrast,
+a PID file descriptor is a stable reference to a specific process;
+if that process terminates,
+.BR pidfd_send_signal ()
+fails with the error
+.BR ESRCH .
+.SH EXAMPLES
+.\" SRC BEGIN (pidfd_send_signal.c)
+.EX
+#define _GNU_SOURCE
+#include <fcntl.h>
+#include <limits.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+\&
+static int
+pidfd_send_signal(int pidfd, int sig, siginfo_t *info,
+ unsigned int flags)
+{
+ return syscall(SYS_pidfd_send_signal, pidfd, sig, info, flags);
+}
+\&
+int
+main(int argc, char *argv[])
+{
+ int pidfd, sig;
+ char path[PATH_MAX];
+ siginfo_t info;
+\&
+ if (argc != 3) {
+ fprintf(stderr, "Usage: %s <pid> <signal>\en", argv[0]);
+ exit(EXIT_FAILURE);
+ }
+\&
+ sig = atoi(argv[2]);
+\&
+ /* Obtain a PID file descriptor by opening the /proc/PID directory
+ of the target process. */
+\&
+ snprintf(path, sizeof(path), "/proc/%s", argv[1]);
+\&
+ pidfd = open(path, O_RDONLY);
+ if (pidfd == \-1) {
+ perror("open");
+ exit(EXIT_FAILURE);
+ }
+\&
+ /* Populate a \[aq]siginfo_t\[aq] structure for use with
+ pidfd_send_signal(). */
+\&
+ memset(&info, 0, sizeof(info));
+ info.si_code = SI_QUEUE;
+ info.si_signo = sig;
+ info.si_errno = 0;
+ info.si_uid = getuid();
+ info.si_pid = getpid();
+ info.si_value.sival_int = 1234;
+\&
+ /* Send the signal. */
+\&
+ if (pidfd_send_signal(pidfd, sig, &info, 0) == \-1) {
+ perror("pidfd_send_signal");
+ exit(EXIT_FAILURE);
+ }
+\&
+ exit(EXIT_SUCCESS);
+}
+.EE
+.\" SRC END
+.SH SEE ALSO
+.BR clone (2),
+.BR kill (2),
+.BR pidfd_open (2),
+.BR rt_sigqueueinfo (2),
+.BR sigaction (2),
+.BR pid_namespaces (7),
+.BR signal (7)
diff --git a/man2/pipe.2 b/man2/pipe.2
new file mode 100644
index 0000000..d8142f9
--- /dev/null
+++ b/man2/pipe.2
@@ -0,0 +1,304 @@
+.\" Copyright (C) 2005, 2008, Michael Kerrisk <mtk.manpages@gmail.com>
+.\" (A few fragments remain from an earlier (1992) version by
+.\" Drew Eckhardt <drew@cs.colorado.edu>.)
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified by Michael Haardt <michael@moria.de>
+.\" Modified 1993-07-23 by Rik Faith <faith@cs.unc.edu>
+.\" Modified 1996-10-22 by Eric S. Raymond <esr@thyrsus.com>
+.\" Modified 2004-06-17 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Modified 2005, mtk: added an example program
+.\" Modified 2008-01-09, mtk: rewrote DESCRIPTION; minor additions
+.\" to EXAMPLE text.
+.\" 2008-10-10, mtk: add description of pipe2()
+.\"
+.TH pipe 2 2023-07-30 "Linux man-pages 6.05.01"
+.SH NAME
+pipe, pipe2 \- create pipe
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <unistd.h>
+.PP
+.BI "int pipe(int " pipefd [2]);
+.PP
+.BR "#define _GNU_SOURCE" " /* See feature_test_macros(7) */"
+.BR "#include <fcntl.h>" " /* Definition of " O_* " constants */"
+.B #include <unistd.h>
+.PP
+.BI "int pipe2(int " pipefd "[2], int " flags );
+.PP
+/* On Alpha, IA-64, MIPS, SuperH, and SPARC/SPARC64, pipe() has the
+ following prototype; see VERSIONS */
+.PP
+.B #include <unistd.h>
+.PP
+.B struct fd_pair {
+.B " long fd[2];"
+.B "};"
+.B struct fd_pair pipe(void);
+.fi
+.SH DESCRIPTION
+.BR pipe ()
+creates a pipe, a unidirectional data channel that
+can be used for interprocess communication.
+The array
+.I pipefd
+is used to return two file descriptors referring to the ends of the pipe.
+.I pipefd[0]
+refers to the read end of the pipe.
+.I pipefd[1]
+refers to the write end of the pipe.
+Data written to the write end of the pipe is buffered by the kernel
+until it is read from the read end of the pipe.
+For further details, see
+.BR pipe (7).
+.PP
+If
+.I flags
+is 0, then
+.BR pipe2 ()
+is the same as
+.BR pipe ().
+The following values can be bitwise ORed in
+.I flags
+to obtain different behavior:
+.TP
+.B O_CLOEXEC
+Set the close-on-exec
+.RB ( FD_CLOEXEC )
+flag on the two new file descriptors.
+See the description of the same flag in
+.BR open (2)
+for reasons why this may be useful.
+.TP
+.BR O_DIRECT " (since Linux 3.4)"
+.\" commit 9883035ae7edef3ec62ad215611cb8e17d6a1a5d
+Create a pipe that performs I/O in "packet" mode.
+Each
+.BR write (2)
+to the pipe is dealt with as a separate packet, and
+.BR read (2)s
+from the pipe will read one packet at a time.
+Note the following points:
+.RS
+.IP \[bu] 3
+Writes of greater than
+.B PIPE_BUF
+bytes (see
+.BR pipe (7))
+will be split into multiple packets.
+The constant
+.B PIPE_BUF
+is defined in
+.IR <limits.h> .
+.IP \[bu]
+If a
+.BR read (2)
+specifies a buffer size that is smaller than the next packet,
+then the requested number of bytes are read,
+and the excess bytes in the packet are discarded.
+Specifying a buffer size of
+.B PIPE_BUF
+will be sufficient to read the largest possible packets
+(see the previous point).
+.IP \[bu]
+Zero-length packets are not supported.
+(A
+.BR read (2)
+that specifies a buffer size of zero is a no-op, and returns 0.)
+.RE
+.IP
+Older kernels that do not support this flag will indicate this via an
+.B EINVAL
+error.
+.IP
+Since Linux 4.5,
+.\" commit 0dbf5f20652108106cb822ad7662c786baaa03ff
+.\" FIXME . But, it is not possible to specify O_DIRECT when opening a FIFO
+it is possible to change the
+.B O_DIRECT
+setting of a pipe file descriptor using
+.BR fcntl (2).
+.TP
+.B O_NONBLOCK
+Set the
+.B O_NONBLOCK
+file status flag on the open file descriptions
+referred to by the new file descriptors.
+Using this flag saves extra calls to
+.BR fcntl (2)
+to achieve the same result.
+.TP
+.B O_NOTIFICATION_PIPE
+Since Linux 5.8,
+.\" commit c73be61cede5882f9605a852414db559c0ebedfd
+a general notification mechanism is built on top of the pipe, where
+the kernel splices notification messages into pipes opened by user space.
+The owner of the pipe has to tell the kernel which sources of events to watch,
+and filters can also be applied to select
+which subevents should be placed into the pipe.
+.SH RETURN VALUE
+On success, zero is returned.
+On error, \-1 is returned,
+.I errno
+is set to indicate the error, and
+.I pipefd
+is left unchanged.
+.PP
+On Linux (and other systems),
+.BR pipe ()
+does not modify
+.I pipefd
+on failure.
+A requirement standardizing this behavior was added in POSIX.1-2008 TC2.
+.\" http://austingroupbugs.net/view.php?id=467
+The Linux-specific
+.BR pipe2 ()
+system call
+likewise does not modify
+.I pipefd
+on failure.
+.SH ERRORS
+.TP
+.B EFAULT
+.I pipefd
+is not valid.
+.TP
+.B EINVAL
+.RB ( pipe2 ())
+Invalid value in
+.IR flags .
+.TP
+.B EMFILE
+The per-process limit on the number of open file descriptors has been reached.
+.TP
+.B ENFILE
+The system-wide limit on the total number of open files has been reached.
+.TP
+.B ENFILE
+The user hard limit on memory that can be allocated for pipes
+has been reached and the caller is not privileged; see
+.BR pipe (7).
+.TP
+.B ENOPKG
+.RB ( pipe2 ())
+.B O_NOTIFICATION_PIPE
+was passed in
+.I flags
+and support for notifications
+.RB ( CONFIG_WATCH_QUEUE )
+is not compiled into the kernel.
+.SH VERSIONS
+.\" See http://math-atlas.sourceforge.net/devel/assembly/64.psabi.1.33.ps.Z
+.\" for example, section 3.2.1 "Registers and the Stack Frame".
+The System V ABI on some architectures allows the use of more than one register
+for returning multiple values; several architectures
+(namely, Alpha, IA-64, MIPS, SuperH, and SPARC/SPARC64)
+(ab)use this feature in order to implement the
+.BR pipe ()
+system call in a functional manner:
+the call doesn't take any arguments and returns
+a pair of file descriptors as the return value on success.
+The glibc
+.BR pipe ()
+wrapper function transparently deals with this.
+See
+.BR syscall (2)
+for information regarding the registers used for storing the second file descriptor.
+.SH STANDARDS
+.TP
+.BR pipe ()
+POSIX.1-2008.
+.TP
+.BR pipe2 ()
+Linux.
+.SH HISTORY
+.TP
+.BR pipe ()
+POSIX.1-2001.
+.TP
+.BR pipe2 ()
+Linux 2.6.27,
+glibc 2.9.
+.SH EXAMPLES
+.\" fork.2 refers to this example program.
+The following program creates a pipe, and then
+.BR fork (2)s
+to create a child process;
+the child inherits a duplicate set of file
+descriptors that refer to the same pipe.
+After the
+.BR fork (2),
+each process closes the file descriptors that it doesn't need for the pipe
+(see
+.BR pipe (7)).
+The parent then writes the string contained in the program's
+command-line argument to the pipe,
+and the child reads this string a byte at a time from the pipe
+and echoes it on standard output.
+.SS Program source
+.\" SRC BEGIN (pipe.c)
+.EX
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/wait.h>
+#include <unistd.h>
+\&
+int
+main(int argc, char *argv[])
+{
+ int pipefd[2];
+ char buf;
+ pid_t cpid;
+\&
+ if (argc != 2) {
+ fprintf(stderr, "Usage: %s <string>\en", argv[0]);
+ exit(EXIT_FAILURE);
+ }
+\&
+ if (pipe(pipefd) == \-1) {
+ perror("pipe");
+ exit(EXIT_FAILURE);
+ }
+\&
+ cpid = fork();
+ if (cpid == \-1) {
+ perror("fork");
+ exit(EXIT_FAILURE);
+ }
+\&
+ if (cpid == 0) { /* Child reads from pipe */
+ close(pipefd[1]); /* Close unused write end */
+\&
+ while (read(pipefd[0], &buf, 1) > 0)
+ write(STDOUT_FILENO, &buf, 1);
+\&
+ write(STDOUT_FILENO, "\en", 1);
+ close(pipefd[0]);
+ _exit(EXIT_SUCCESS);
+\&
+ } else { /* Parent writes argv[1] to pipe */
+ close(pipefd[0]); /* Close unused read end */
+ write(pipefd[1], argv[1], strlen(argv[1]));
+ close(pipefd[1]); /* Reader will see EOF */
+ wait(NULL); /* Wait for child */
+ exit(EXIT_SUCCESS);
+ }
+}
+.EE
+.\" SRC END
+.SH SEE ALSO
+.BR fork (2),
+.BR read (2),
+.BR socketpair (2),
+.BR splice (2),
+.BR tee (2),
+.BR vmsplice (2),
+.BR write (2),
+.BR popen (3),
+.BR pipe (7)
diff --git a/man2/pipe2.2 b/man2/pipe2.2
new file mode 100644
index 0000000..980e240
--- /dev/null
+++ b/man2/pipe2.2
@@ -0,0 +1 @@
+.so man2/pipe.2
diff --git a/man2/pivot_root.2 b/man2/pivot_root.2
new file mode 100644
index 0000000..a4077ef
--- /dev/null
+++ b/man2/pivot_root.2
@@ -0,0 +1,409 @@
+.\" Copyright (C) 2019 Michael Kerrisk <mtk.manpages@gmail.com>
+.\" A very few fragments remain from an earlier page written by
+.\" Werner Almesberger in 2000
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.TH pivot_root 2 2023-05-03 "Linux man-pages 6.05.01"
+.SH NAME
+pivot_root \- change the root mount
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */"
+.B #include <unistd.h>
+.PP
+.BI "int syscall(SYS_pivot_root, const char *" new_root \
+", const char *" put_old );
+.fi
+.PP
+.IR Note :
+glibc provides no wrapper for
+.BR pivot_root (),
+necessitating the use of
+.BR syscall (2).
+.SH DESCRIPTION
+.BR pivot_root ()
+changes the root mount in the mount namespace of the calling process.
+More precisely, it moves the root mount to the
+directory \fIput_old\fP and makes \fInew_root\fP the new root mount.
+The calling process must have the
+.B CAP_SYS_ADMIN
+capability in the user namespace that owns the caller's mount namespace.
+.PP
+.BR pivot_root ()
+changes the root directory and the current working directory
+of each process or thread in the same mount namespace to
+.I new_root
+if they point to the old root directory.
+(See also NOTES.)
+On the other hand,
+.BR pivot_root ()
+does not change the caller's current working directory
+(unless it is on the old root directory),
+and thus it should be followed by a
+\fBchdir("/")\fP call.
+.PP
+The following restrictions apply:
+.IP \[bu] 3
+.I new_root
+and
+.I put_old
+must be directories.
+.IP \[bu]
+.I new_root
+and
+.I put_old
+must not be on the same mount as the current root.
+.IP \[bu]
+\fIput_old\fP must be at or underneath \fInew_root\fP;
+that is, adding some nonnegative
+number of "\fI/..\fP" suffixes to the pathname pointed to by
+.I put_old
+must yield the same directory as \fInew_root\fP.
+.IP \[bu]
+.I new_root
+must be a path to a mount point, but can't be
+.IR """/""" .
+A path that is not already a mount point can be converted into one by
+bind mounting the path onto itself.
+.IP \[bu]
+The propagation type of the parent mount of
+.I new_root
+and the parent mount of the current root directory must not be
+.BR MS_SHARED ;
+similarly, if
+.I put_old
+is an existing mount point, its propagation type must not be
+.BR MS_SHARED .
+These restrictions ensure that
+.BR pivot_root ()
+never propagates any changes to another mount namespace.
+.IP \[bu]
+The current root directory must be a mount point.
+.SH RETURN VALUE
+On success, zero is returned.
+On error, \-1 is returned, and
+\fIerrno\fP is set to indicate the error.
+.SH ERRORS
+.BR pivot_root ()
+may fail with any of the same errors as
+.BR stat (2).
+Additionally, it may fail with the following errors:
+.TP
+.B EBUSY
+.\" Reconfirmed that the following error occurs on Linux 5.0 by
+.\" specifying 'new_root' as "/rootfs" and 'put_old' as
+.\" "/rootfs/oldrootfs", and *not* bind mounting "/rootfs" on top of
+.\" itself. Of course, this is an odd situation, since a later check
+.\" in the kernel code will in any case yield EINVAL if 'new_root' is
+.\" not a mount point. However, when the system call was first added,
+.\" 'new_root' was not required to be a mount point. So, this
+.\" error is nowadays probably just the result of crufty accumulation.
+.\" This error can also occur if we bind mount "/" on top of itself
+.\" and try to specify "/" as the 'new' (again, an odd situation). So,
+.\" the EBUSY check in the kernel does still seem necessary to prevent
+.\" that case. Furthermore, the "or put_old" piece is probably
+.\" redundant text (although the check is in the kernel), since,
+.\" in another check, 'put_old' is required to be under 'new_root'.
+.I new_root
+or
+.I put_old
+is on the current root mount.
+(This error covers the pathological case where
+.I new_root
+is
+.IR """/""" .)
+.TP
+.B EINVAL
+.I new_root
+is not a mount point.
+.TP
+.B EINVAL
+\fIput_old\fP is not at or underneath \fInew_root\fP.
+.TP
+.B EINVAL
+The current root directory is not a mount point
+(because of an earlier
+.BR chroot (2)).
+.TP
+.B EINVAL
+The current root is on the rootfs (initial ramfs) mount; see NOTES.
+.TP
+.B EINVAL
+Either the mount point at
+.IR new_root ,
+or the parent mount of that mount point,
+has propagation type
+.BR MS_SHARED .
+.TP
+.B EINVAL
+.I put_old
+is a mount point and has the propagation type
+.BR MS_SHARED .
+.TP
+.B ENOTDIR
+\fInew_root\fP or \fIput_old\fP is not a directory.
+.TP
+.B EPERM
+The calling process does not have the
+.B CAP_SYS_ADMIN
+capability.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 2.3.41.
+.SH NOTES
+A command-line interface for this system call is provided by
+.BR pivot_root (8).
+.PP
+.BR pivot_root ()
+allows the caller to switch to a new root filesystem while at the same time
+placing the old root mount at a location under
+.I new_root
+from where it can subsequently be unmounted.
+(The fact that it moves all processes that have a root directory
+or current working directory on the old root directory to the
+new root frees the old root directory of users,
+allowing the old root mount to be unmounted more easily.)
+.PP
+One use of
+.BR pivot_root ()
+is during system startup, when the
+system mounts a temporary root filesystem (e.g., an
+.BR initrd (4)),
+then mounts the real root filesystem, and eventually turns the latter into
+the root directory of all relevant processes and threads.
+A modern use is to set up a root filesystem during
+the creation of a container.
+.PP
+The fact that
+.BR pivot_root ()
+modifies process root and current working directories in the
+manner noted in DESCRIPTION
+is necessary in order to prevent kernel threads from keeping the old
+root mount busy with their root and current working directories,
+even if they never access
+the filesystem in any way.
+.PP
+The rootfs (initial ramfs) cannot be
+.BR pivot_root ()ed.
+The recommended method of changing the root filesystem in this case is
+to delete everything in rootfs, overmount rootfs with the new root, attach
+.IR stdin / stdout / stderr
+to the new
+.IR /dev/console ,
+and exec the new
+.BR init (1).
+Helper programs for this process exist; see
+.BR switch_root (8).
+.\"
+.SS pivot_root(\[dq].\[dq], \[dq].\[dq])
+.I new_root
+and
+.I put_old
+may be the same directory.
+In particular, the following sequence allows a pivot-root operation
+without needing to create and remove a temporary directory:
+.PP
+.in +4n
+.EX
+chdir(new_root);
+pivot_root(".", ".");
+umount2(".", MNT_DETACH);
+.EE
+.in
+.PP
+This sequence succeeds because the
+.BR pivot_root ()
+call stacks the old root mount point
+on top of the new root mount point at
+.IR / .
+At that point, the calling process's root directory and current
+working directory refer to the new root mount point
+.RI ( new_root ).
+During the subsequent
+.BR umount2 ()
+call, resolution of
+.I """."""
+starts with
+.I new_root
+and then moves up the list of mounts stacked at
+.IR / ,
+with the result that the old root mount point is unmounted.
+.\"
+.SS Historical notes
+For many years, this manual page carried the following text:
+.RS
+.PP
+.BR pivot_root ()
+may or may not change the current root and the current
+working directory of any processes or threads which use the old
+root directory.
+The caller of
+.BR pivot_root ()
+must ensure that processes with root or current working directory
+at the old root operate correctly in either case.
+An easy way to ensure this is to change their
+root and current working directory to \fInew_root\fP before invoking
+.BR pivot_root ().
+.RE
+.PP
+This text, written before the system call implementation was
+even finalized in the kernel, was probably intended to warn users
+at that time that the implementation might change before final release.
+However, the behavior stated in DESCRIPTION
+has remained consistent since this system call
+was first implemented and will not change now.
+.SH EXAMPLES
+.\" FIXME
+.\" Would it be better, because simpler, to use unshare(2)
+.\" rather than clone(2) in the example below?
+The program below demonstrates the use of
+.BR pivot_root ()
+inside a mount namespace that is created using
+.BR clone (2).
+After pivoting to the root directory named in the program's
+first command-line argument, the child created by
+.BR clone (2)
+then executes the program named in the remaining command-line arguments.
+.PP
+We demonstrate the program by creating a directory that will serve as
+the new root filesystem and placing a copy of the (statically linked)
+.BR busybox (1)
+executable in that directory.
+.PP
+.in +4n
+.EX
+$ \fBmkdir /tmp/rootfs\fP
+$ \fBls \-id /tmp/rootfs\fP # Show inode number of new root directory
+319459 /tmp/rootfs
+$ \fBcp $(which busybox) /tmp/rootfs\fP
+$ \fBPS1=\[aq]bbsh$ \[aq] sudo ./pivot_root_demo /tmp/rootfs /busybox sh\fP
+bbsh$ \fBPATH=/\fP
+bbsh$ \fBbusybox ln busybox ln\fP
+bbsh$ \fBln busybox echo\fP
+bbsh$ \fBln busybox ls\fP
+bbsh$ \fBls\fP
+busybox echo ln ls
+bbsh$ \fBls \-id /\fP # Compare with inode number above
+319459 /
+bbsh$ \fBecho \[aq]hello world\[aq]\fP
+hello world
+.EE
+.in
+.SS Program source
+\&
+.PP
+.\" SRC BEGIN (pivot_root.c)
+.EX
+/* pivot_root_demo.c */
+\&
+#define _GNU_SOURCE
+#include <err.h>
+#include <limits.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/wait.h>
+#include <unistd.h>
+\&
+static int
+pivot_root(const char *new_root, const char *put_old)
+{
+ return syscall(SYS_pivot_root, new_root, put_old);
+}
+\&
+#define STACK_SIZE (1024 * 1024)
+\&
+static int /* Startup function for cloned child */
+child(void *arg)
+{
+ char path[PATH_MAX];
+ char **args = arg;
+ char *new_root = args[0];
+ const char *put_old = "/oldrootfs";
+\&
+ /* Ensure that \[aq]new_root\[aq] and its parent mount don\[aq]t have
+ shared propagation (which would cause pivot_root() to
+ return an error), and prevent propagation of mount
+ events to the initial mount namespace. */
+\&
+ if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL) == \-1)
+ err(EXIT_FAILURE, "mount\-MS_PRIVATE");
+\&
+ /* Ensure that \[aq]new_root\[aq] is a mount point. */
+\&
+ if (mount(new_root, new_root, NULL, MS_BIND, NULL) == \-1)
+ err(EXIT_FAILURE, "mount\-MS_BIND");
+\&
+ /* Create directory to which old root will be pivoted. */
+\&
+ snprintf(path, sizeof(path), "%s/%s", new_root, put_old);
+ if (mkdir(path, 0777) == \-1)
+ err(EXIT_FAILURE, "mkdir");
+\&
+ /* And pivot the root filesystem. */
+\&
+ if (pivot_root(new_root, path) == \-1)
+ err(EXIT_FAILURE, "pivot_root");
+\&
+ /* Switch the current working directory to "/". */
+\&
+ if (chdir("/") == \-1)
+ err(EXIT_FAILURE, "chdir");
+\&
+ /* Unmount old root and remove mount point. */
+\&
+ if (umount2(put_old, MNT_DETACH) == \-1)
+ perror("umount2");
+ if (rmdir(put_old) == \-1)
+ perror("rmdir");
+\&
+ /* Execute the command specified in argv[1]... */
+\&
+ execv(args[1], &args[1]);
+ err(EXIT_FAILURE, "execv");
+}
+\&
+int
+main(int argc, char *argv[])
+{
+ char *stack;
+\&
+ /* Create a child process in a new mount namespace. */
+\&
+ stack = mmap(NULL, STACK_SIZE, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, \-1, 0);
+ if (stack == MAP_FAILED)
+ err(EXIT_FAILURE, "mmap");
+\&
+ if (clone(child, stack + STACK_SIZE,
+ CLONE_NEWNS | SIGCHLD, &argv[1]) == \-1)
+ err(EXIT_FAILURE, "clone");
+\&
+ /* Parent falls through to here; wait for child. */
+\&
+ if (wait(NULL) == \-1)
+ err(EXIT_FAILURE, "wait");
+\&
+ exit(EXIT_SUCCESS);
+}
+.EE
+.\" SRC END
+.SH SEE ALSO
+.BR chdir (2),
+.BR chroot (2),
+.BR mount (2),
+.BR stat (2),
+.BR initrd (4),
+.BR mount_namespaces (7),
+.BR pivot_root (8),
+.BR switch_root (8)
diff --git a/man2/pkey_alloc.2 b/man2/pkey_alloc.2
new file mode 100644
index 0000000..53d8f2a
--- /dev/null
+++ b/man2/pkey_alloc.2
@@ -0,0 +1,115 @@
+.\" Copyright (C) 2016 Intel Corporation
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.TH pkey_alloc 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+pkey_alloc, pkey_free \- allocate or free a protection key
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#define _GNU_SOURCE" " /* See feature_test_macros(7) */"
+.B #include <sys/mman.h>
+.PP
+.BI "int pkey_alloc(unsigned int " flags ", unsigned int " access_rights ");"
+.BI "int pkey_free(int " pkey ");"
+.fi
+.SH DESCRIPTION
+.BR pkey_alloc ()
+allocates a protection key (pkey) and allows it to be passed to
+.BR pkey_mprotect (2).
+.PP
+The
+.BR pkey_alloc ()
+.I flags
+argument is reserved for future use and currently must always be specified as 0.
+.PP
+The
+.BR pkey_alloc ()
+.I access_rights
+argument may contain zero or more disable operations:
+.TP
+.B PKEY_DISABLE_ACCESS
+Disable all data access to memory covered by the returned protection key.
+.TP
+.B PKEY_DISABLE_WRITE
+Disable write access to memory covered by the returned protection key.
+.PP
+.BR pkey_free ()
+frees a protection key and makes it available for later
+allocations.
+After a protection key has been freed, it may no longer be used
+in any protection-key-related operations.
+.PP
+An application should not call
+.BR pkey_free ()
+on any protection key which has been assigned to an address
+range by
+.BR pkey_mprotect (2)
+and which is still in use.
+The behavior in this case is undefined and may result in an error.
+.SH RETURN VALUE
+On success,
+.BR pkey_alloc ()
+returns a positive protection key value.
+On success,
+.BR pkey_free ()
+returns zero.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EINVAL
+.IR pkey ,
+.IR flags ,
+or
+.I access_rights
+is invalid.
+.TP
+.B ENOSPC
+.RB ( pkey_alloc ())
+All protection keys available for the current process have
+been allocated.
+The number of keys available is architecture-specific and
+implementation-specific and may be reduced by kernel-internal use
+of certain keys.
+There are currently 15 keys available to user programs on x86.
+.IP
+This error will also be returned if the processor or operating system
+does not support protection keys.
+Applications should always be prepared to handle this error, since
+factors outside of the application's control can reduce the number
+of available pkeys.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 4.9,
+glibc 2.27.
+.SH NOTES
+.BR pkey_alloc ()
+is always safe to call regardless of whether or not the operating system
+supports protection keys.
+It can be used in lieu of any other mechanism for detecting pkey support
+and will simply fail with the error
+.B ENOSPC
+if the operating system has no pkey support.
+.PP
+The kernel guarantees that the contents of the hardware rights
+register (PKRU) will be preserved only for allocated protection
+keys.
+Any time a key is unallocated (either before the first call
+returning that key from
+.BR pkey_alloc ()
+or after it is freed via
+.BR pkey_free ()),
+the kernel may make arbitrary changes to the parts of the
+rights register affecting access to that key.
+.SH EXAMPLES
+See
+.BR pkeys (7).
+.SH SEE ALSO
+.BR pkey_mprotect (2),
+.BR pkeys (7)
diff --git a/man2/pkey_free.2 b/man2/pkey_free.2
new file mode 100644
index 0000000..5b524cb
--- /dev/null
+++ b/man2/pkey_free.2
@@ -0,0 +1 @@
+.so man2/pkey_alloc.2
diff --git a/man2/pkey_mprotect.2 b/man2/pkey_mprotect.2
new file mode 100644
index 0000000..b4f9309
--- /dev/null
+++ b/man2/pkey_mprotect.2
@@ -0,0 +1 @@
+.so man2/mprotect.2
diff --git a/man2/poll.2 b/man2/poll.2
new file mode 100644
index 0000000..2b024d3
--- /dev/null
+++ b/man2/poll.2
@@ -0,0 +1,649 @@
+.\" Copyright (C) 2006, 2019 Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Additions from Richard Gooch <rgooch@atnf.CSIRO.AU> and aeb, 971207
+.\" 2006-03-13, mtk, Added ppoll() + various other rewordings
+.\" 2006-07-01, mtk, Added POLLRDHUP + various other wording and
+.\" formatting changes.
+.\"
+.TH poll 2 2023-07-08 "Linux man-pages 6.05.01"
+.SH NAME
+poll, ppoll \- wait for some event on a file descriptor
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <poll.h>
+.PP
+.BI "int poll(struct pollfd *" fds ", nfds_t " nfds ", int " timeout );
+.PP
+.BR "#define _GNU_SOURCE" " /* See feature_test_macros(7) */"
+.B #include <poll.h>
+.PP
+.BI "int ppoll(struct pollfd *" fds ", nfds_t " nfds ,
+.BI " const struct timespec *_Nullable " tmo_p ,
+.BI " const sigset_t *_Nullable " sigmask );
+.fi
+.SH DESCRIPTION
+.BR poll ()
+performs a similar task to
+.BR select (2):
+it waits for one of a set of file descriptors to become ready
+to perform I/O.
+The Linux-specific
+.BR epoll (7)
+API performs a similar task, but offers features beyond those found in
+.BR poll ().
+.PP
+The set of file descriptors to be monitored is specified in the
+.I fds
+argument, which is an array of structures of the following form:
+.PP
+.in +4n
+.EX
+struct pollfd {
+ int fd; /* file descriptor */
+ short events; /* requested events */
+ short revents; /* returned events */
+};
+.EE
+.in
+.PP
+The caller should specify the number of items in the
+.I fds
+array in
+.IR nfds .
+.PP
+The field
+.I fd
+contains a file descriptor for an open file.
+If this field is negative, then the corresponding
+.I events
+field is ignored and the
+.I revents
+field returns zero.
+(This provides an easy way of ignoring a
+file descriptor for a single
+.BR poll ()
+call: simply set the
+.I fd
+field to its bitwise complement.)
+.PP
+The field
+.I events
+is an input parameter, a bit mask specifying the events the application
+is interested in for the file descriptor
+.IR fd .
+This field may be specified as zero,
+in which case the only events that can be returned in
+.I revents
+are
+.BR POLLHUP ,
+.BR POLLERR ,
+and
+.B POLLNVAL
+(see below).
+.PP
+The field
+.I revents
+is an output parameter, filled by the kernel with the events that
+actually occurred.
+The bits returned in
+.I revents
+can include any of those specified in
+.IR events ,
+or one of the values
+.BR POLLERR ,
+.BR POLLHUP ,
+or
+.BR POLLNVAL .
+(These three bits are meaningless in the
+.I events
+field, and will be set in the
+.I revents
+field whenever the corresponding condition is true.)
+.PP
+If none of the events requested (and no error) has occurred for any
+of the file descriptors, then
+.BR poll ()
+blocks until one of the events occurs.
+.PP
+The
+.I timeout
+argument specifies the number of milliseconds that
+.BR poll ()
+should block waiting for a file descriptor to become ready.
+The call will block until either:
+.IP \[bu] 3
+a file descriptor becomes ready;
+.IP \[bu]
+the call is interrupted by a signal handler; or
+.IP \[bu]
+the timeout expires.
+.PP
+Being "ready" means that the requested operation will not block; thus,
+.BR poll ()ing
+regular files,
+block devices,
+and other files with no reasonable polling semantic
+.I always
+returns instantly as ready to read and write.
+.PP
+Note that the
+.I timeout
+interval will be rounded up to the system clock granularity,
+and kernel scheduling delays mean that the blocking interval
+may overrun by a small amount.
+Specifying a negative value in
+.I timeout
+means an infinite timeout.
+Specifying a
+.I timeout
+of zero causes
+.BR poll ()
+to return immediately, even if no file descriptors are ready.
+.PP
+The bits that may be set/returned in
+.I events
+and
+.I revents
+are defined in \fI<poll.h>\fP:
+.TP
+.B POLLIN
+There is data to read.
+.TP
+.B POLLPRI
+There is some exceptional condition on the file descriptor.
+Possibilities include:
+.RS
+.IP \[bu] 3
+There is out-of-band data on a TCP socket (see
+.BR tcp (7)).
+.IP \[bu]
+A pseudoterminal master in packet mode has seen a state change on the slave
+(see
+.BR ioctl_tty (2)).
+.IP \[bu]
+A
+.I cgroup.events
+file has been modified (see
+.BR cgroups (7)).
+.RE
+.TP
+.B POLLOUT
+Writing is now possible, though a write larger than the available space
+in a socket or pipe will still block (unless
+.B O_NONBLOCK
+is set).
+.TP
+.BR POLLRDHUP " (since Linux 2.6.17)"
+Stream socket peer closed connection,
+or shut down writing half of connection.
+The
+.B _GNU_SOURCE
+feature test macro must be defined
+(before including
+.I any
+header files)
+in order to obtain this definition.
+.TP
+.B POLLERR
+Error condition (only returned in
+.IR revents ;
+ignored in
+.IR events ).
+This bit is also set for a file descriptor referring
+to the write end of a pipe when the read end has been closed.
+.TP
+.B POLLHUP
+Hang up (only returned in
+.IR revents ;
+ignored in
+.IR events ).
+Note that when reading from a channel such as a pipe or a stream socket,
+this event merely indicates that the peer closed its end of the channel.
+Subsequent reads from the channel will return 0 (end of file)
+only after all outstanding data in the channel has been consumed.
+.TP
+.B POLLNVAL
+Invalid request:
+.I fd
+not open (only returned in
+.IR revents ;
+ignored in
+.IR events ).
+.PP
+When compiling with
+.B _XOPEN_SOURCE
+defined, one also has the following,
+which convey no further information beyond the bits listed above:
+.TP
+.B POLLRDNORM
+Equivalent to
+.BR POLLIN .
+.TP
+.B POLLRDBAND
+Priority band data can be read (generally unused on Linux).
+.\" POLLRDBAND is used in the DECnet protocol.
+.TP
+.B POLLWRNORM
+Equivalent to
+.BR POLLOUT .
+.TP
+.B POLLWRBAND
+Priority data may be written.
+.PP
+Linux also knows about, but does not use,
+.BR POLLMSG .
+.SS ppoll()
+The relationship between
+.BR poll ()
+and
+.BR ppoll ()
+is analogous to the relationship between
+.BR select (2)
+and
+.BR pselect (2):
+like
+.BR pselect (2),
+.BR ppoll ()
+allows an application to safely wait until either a file descriptor
+becomes ready or until a signal is caught.
+.PP
+Other than the difference in the precision of the
+.I timeout
+argument, the following
+.BR ppoll ()
+call:
+.PP
+.in +4n
+.EX
+ready = ppoll(&fds, nfds, tmo_p, &sigmask);
+.EE
+.in
+.PP
+is nearly equivalent to
+.I atomically
+executing the following calls:
+.PP
+.in +4n
+.EX
+sigset_t origmask;
+int timeout;
+\&
+timeout = (tmo_p == NULL) ? \-1 :
+ (tmo_p\->tv_sec * 1000 + tmo_p\->tv_nsec / 1000000);
+pthread_sigmask(SIG_SETMASK, &sigmask, &origmask);
+ready = poll(&fds, nfds, timeout);
+pthread_sigmask(SIG_SETMASK, &origmask, NULL);
+.EE
+.in
+.PP
+The above code segment is described as
+.I nearly
+equivalent because whereas a negative
+.I timeout
+value for
+.BR poll ()
+is interpreted as an infinite timeout, a negative value expressed in
+.I *tmo_p
+results in an error from
+.BR ppoll ().
+.PP
+See the description of
+.BR pselect (2)
+for an explanation of why
+.BR ppoll ()
+is necessary.
+.PP
+If the
+.I sigmask
+argument is specified as NULL, then
+no signal mask manipulation is performed
+(and thus
+.BR ppoll ()
+differs from
+.BR poll ()
+only in the precision of the
+.I timeout
+argument).
+.PP
+The
+.I tmo_p
+argument specifies an upper limit on the amount of time that
+.BR ppoll ()
+will block.
+This argument is a pointer to a
+.BR timespec (3)
+structure.
+.PP
+If
+.I tmo_p
+is specified as NULL, then
+.BR ppoll ()
+can block indefinitely.
+.SH RETURN VALUE
+On success,
+.BR poll ()
+returns a nonnegative value which is the number of elements in the
+.I fds
+array whose
+.I revents
+fields have been set to a nonzero value (indicating an event or an error).
+A return value of zero indicates that the system call timed out
+before any file descriptors became ready.
+.PP
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EFAULT
+.I fds
+points outside the process's accessible address space.
+The array given as argument was not contained in the calling program's
+address space.
+.TP
+.B EINTR
+A signal occurred before any requested event; see
+.BR signal (7).
+.TP
+.B EINVAL
+The
+.I nfds
+value exceeds the
+.B RLIMIT_NOFILE
+value.
+.TP
+.B EINVAL
+.RB ( ppoll ())
+The timeout value expressed in
+.I *tmo_p
+is invalid (negative).
+.TP
+.B ENOMEM
+Unable to allocate memory for kernel data structures.
+.SH VERSIONS
+On some other UNIX systems,
+.\" Darwin, according to a report by Jeremy Sequoia, relayed by Josh Triplett
+.BR poll ()
+can fail with the error
+.B EAGAIN
+if the system fails to allocate kernel-internal resources, rather than
+.B ENOMEM
+as Linux does.
+POSIX permits this behavior.
+Portable programs may wish to check for
+.B EAGAIN
+and loop, just as with
+.BR EINTR .
+.PP
+Some implementations define the nonstandard constant
+.B INFTIM
+with the value \-1 for use as a
+.I timeout
+for
+.BR poll ().
+This constant is not provided in glibc.
+.SS C library/kernel differences
+The Linux
+.BR ppoll ()
+system call modifies its
+.I tmo_p
+argument.
+However, the glibc wrapper function hides this behavior
+by using a local variable for the timeout argument that
+is passed to the system call.
+Thus, the glibc
+.BR ppoll ()
+function does not modify its
+.I tmo_p
+argument.
+.PP
+The raw
+.BR ppoll ()
+system call has a fifth argument,
+.IR "size_t sigsetsize" ,
+which specifies the size in bytes of the
+.I sigmask
+argument.
+The glibc
+.BR ppoll ()
+wrapper function specifies this argument as a fixed value
+(equal to
+.IR sizeof(kernel_sigset_t) ).
+See
+.BR sigprocmask (2)
+for a discussion on the differences between the kernel and the libc
+notion of the sigset.
+.SH STANDARDS
+.TP
+.BR poll ()
+POSIX.1-2008.
+.TP
+.BR ppoll ()
+Linux.
+.\" FIXME .
+.\" ppoll() is proposed for inclusion in POSIX:
+.\" https://www.austingroupbugs.net/view.php?id=1263
+.\" NetBSD 3.0 has a pollts() which is like Linux ppoll().
+.SH HISTORY
+.TP
+.BR poll ()
+POSIX.1-2001.
+Linux 2.1.23.
+.IP
+On older kernels that lack this system call,
+the glibc
+.BR poll ()
+wrapper function provides emulation using
+.BR select (2).
+.TP
+.BR ppoll ()
+Linux 2.6.16,
+glibc 2.4.
+.SH NOTES
+The operation of
+.BR poll ()
+and
+.BR ppoll ()
+is not affected by the
+.B O_NONBLOCK
+flag.
+.PP
+For a discussion of what may happen if a file descriptor being monitored by
+.BR poll ()
+is closed in another thread, see
+.BR select (2).
+.SH BUGS
+See the discussion of spurious readiness notifications under the
+BUGS section of
+.BR select (2).
+.SH EXAMPLES
+The program below opens each of the files named in its command-line
+arguments and monitors the resulting file descriptors for readiness to read
+.RB ( POLLIN ).
+The program loops, repeatedly using
+.BR poll ()
+to monitor the file descriptors,
+printing the number of ready file descriptors on return.
+For each ready file descriptor, the program:
+.IP \[bu] 3
+displays the returned
+.I revents
+field in a human-readable form;
+.IP \[bu]
+if the file descriptor is readable, reads some data from it,
+and displays that data on standard output; and
+.IP \[bu]
+if the file descriptor was not readable,
+but some other event occurred (presumably
+.BR POLLHUP ),
+closes the file descriptor.
+.PP
+Suppose we run the program in one terminal, asking it to open a FIFO:
+.PP
+.in +4n
+.EX
+$ \fBmkfifo myfifo\fP
+$ \fB./poll_input myfifo\fP
+.EE
+.in
+.PP
+In a second terminal window, we then open the FIFO for writing,
+write some data to it, and close the FIFO:
+.PP
+.in +4n
+.EX
+$ \fBecho aaaaabbbbbccccc > myfifo\fP
+.EE
+.in
+.PP
+In the terminal where we are running the program, we would then see:
+.PP
+.in +4n
+.EX
+Opened "myfifo" on fd 3
+About to poll()
+Ready: 1
+ fd=3; events: POLLIN POLLHUP
+ read 10 bytes: aaaaabbbbb
+About to poll()
+Ready: 1
+ fd=3; events: POLLIN POLLHUP
+ read 6 bytes: ccccc
+\&
+About to poll()
+Ready: 1
+ fd=3; events: POLLHUP
+ closing fd 3
+All file descriptors closed; bye
+.EE
+.in
+.PP
+In the above output, we see that
+.BR poll ()
+returned three times:
+.IP \[bu] 3
+On the first return, the bits returned in the
+.I revents
+field were
+.BR POLLIN ,
+indicating that the file descriptor is readable, and
+.BR POLLHUP ,
+indicating that the other end of the FIFO has been closed.
+The program then consumed some of the available input.
+.IP \[bu]
+The second return from
+.BR poll ()
+also indicated
+.B POLLIN
+and
+.BR POLLHUP ;
+the program then consumed the last of the available input.
+.IP \[bu]
+On the final return,
+.BR poll ()
+indicated only
+.B POLLHUP
+on the FIFO,
+at which point the file descriptor was closed and the program terminated.
+.\"
+.SS Program source
+\&
+.\" SRC BEGIN (poll_input.c)
+.EX
+/* poll_input.c
+\&
+ Licensed under GNU General Public License v2 or later.
+*/
+#include <fcntl.h>
+#include <poll.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+\&
+#define errExit(msg) do { perror(msg); exit(EXIT_FAILURE); \e
+ } while (0)
+\&
+int
+main(int argc, char *argv[])
+{
+ int ready;
+ char buf[10];
+ nfds_t num_open_fds, nfds;
+ ssize_t s;
+ struct pollfd *pfds;
+\&
+ if (argc < 2) {
+ fprintf(stderr, "Usage: %s file...\en", argv[0]);
+ exit(EXIT_FAILURE);
+ }
+\&
+ num_open_fds = nfds = argc \- 1;
+ pfds = calloc(nfds, sizeof(struct pollfd));
+ if (pfds == NULL)
+ errExit("malloc");
+\&
+ /* Open each file on command line, and add it to \[aq]pfds\[aq] array. */
+\&
+ for (nfds_t j = 0; j < nfds; j++) {
+ pfds[j].fd = open(argv[j + 1], O_RDONLY);
+ if (pfds[j].fd == \-1)
+ errExit("open");
+\&
+ printf("Opened \e"%s\e" on fd %d\en", argv[j + 1], pfds[j].fd);
+\&
+ pfds[j].events = POLLIN;
+ }
+\&
+ /* Keep calling poll() as long as at least one file descriptor is
+ open. */
+\&
+ while (num_open_fds > 0) {
+ printf("About to poll()\en");
+ ready = poll(pfds, nfds, \-1);
+ if (ready == \-1)
+ errExit("poll");
+\&
+ printf("Ready: %d\en", ready);
+\&
+ /* Deal with array returned by poll(). */
+\&
+ for (nfds_t j = 0; j < nfds; j++) {
+ if (pfds[j].revents != 0) {
+ printf(" fd=%d; events: %s%s%s\en", pfds[j].fd,
+ (pfds[j].revents & POLLIN) ? "POLLIN " : "",
+ (pfds[j].revents & POLLHUP) ? "POLLHUP " : "",
+ (pfds[j].revents & POLLERR) ? "POLLERR " : "");
+\&
+ if (pfds[j].revents & POLLIN) {
+ s = read(pfds[j].fd, buf, sizeof(buf));
+ if (s == \-1)
+ errExit("read");
+ printf(" read %zd bytes: %.*s\en",
+ s, (int) s, buf);
+ } else { /* POLLERR | POLLHUP */
+ printf(" closing fd %d\en", pfds[j].fd);
+ if (close(pfds[j].fd) == \-1)
+ errExit("close");
+ num_open_fds\-\-;
+ }
+ }
+ }
+ }
+\&
+ printf("All file descriptors closed; bye\en");
+ exit(EXIT_SUCCESS);
+}
+.EE
+.\" SRC END
+.SH SEE ALSO
+.BR restart_syscall (2),
+.BR select (2),
+.BR select_tut (2),
+.BR timespec (3),
+.BR epoll (7),
+.BR time (7)
diff --git a/man2/posix_fadvise.2 b/man2/posix_fadvise.2
new file mode 100644
index 0000000..38e9745
--- /dev/null
+++ b/man2/posix_fadvise.2
@@ -0,0 +1,227 @@
+.\" Copyright 2003 Abhijit Menon-Sen <ams@wiw.org>
+.\" and Copyright (C) 2010, 2015, 2017 Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" 2005-04-08 mtk, noted kernel version and added BUGS
+.\" 2010-10-09, mtk, document arm_fadvise64_64()
+.\"
+.TH posix_fadvise 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+posix_fadvise \- predeclare an access pattern for file data
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <fcntl.h>
+.PP
+.BI "int posix_fadvise(int " fd ", off_t " offset ", off_t " len \
+", int " advice ");"
+.fi
+.PP
+.ad l
+.RS -4
+Feature Test Macro Requirements for glibc (see
+.BR feature_test_macros (7)):
+.RE
+.PP
+.BR posix_fadvise ():
+.nf
+ _POSIX_C_SOURCE >= 200112L
+.fi
+.SH DESCRIPTION
+Programs can use
+.BR posix_fadvise ()
+to announce an intention to access
+file data in a specific pattern in the future, thus allowing the kernel
+to perform appropriate optimizations.
+.PP
+The \fIadvice\fP applies to a (not necessarily existent) region starting
+at \fIoffset\fP and extending for \fIlen\fP bytes (or until the end of
+the file if \fIlen\fP is 0) within the file referred to by \fIfd\fP.
+The \fIadvice\fP is not binding;
+it merely constitutes an expectation on behalf of
+the application.
+.PP
+Permissible values for \fIadvice\fP include:
+.TP
+.B POSIX_FADV_NORMAL
+Indicates that the application has no advice to give about its access
+pattern for the specified data.
+If no advice is given for an open file,
+this is the default assumption.
+.TP
+.B POSIX_FADV_SEQUENTIAL
+The application expects to access the specified data sequentially (with
+lower offsets read before higher ones).
+.TP
+.B POSIX_FADV_RANDOM
+The specified data will be accessed in random order.
+.TP
+.B POSIX_FADV_NOREUSE
+The specified data will be accessed only once.
+.IP
+Before Linux 2.6.18, \fBPOSIX_FADV_NOREUSE\fP had the
+same semantics as \fBPOSIX_FADV_WILLNEED\fP.
+This was probably a bug; since Linux 2.6.18, this flag is a no-op.
+.TP
+.B POSIX_FADV_WILLNEED
+The specified data will be accessed in the near future.
+.IP
+\fBPOSIX_FADV_WILLNEED\fP initiates a
+nonblocking read of the specified region into the page cache.
+The amount of data read may be decreased by the kernel depending
+on virtual memory load.
+(A few megabytes will usually be fully satisfied,
+and more is rarely useful.)
+.TP
+.B POSIX_FADV_DONTNEED
+The specified data will not be accessed in the near future.
+.IP
+\fBPOSIX_FADV_DONTNEED\fP attempts to free cached pages associated with
+the specified region.
+This is useful, for example, while streaming large
+files.
+A program may periodically request the kernel to free cached data
+that has already been used, so that more useful cached pages are not
+discarded instead.
+.IP
+Requests to discard partial pages are ignored.
+It is preferable to preserve needed data rather than discard unneeded data.
+If the application requires that data be considered for discarding, then
+.I offset
+and
+.I len
+must be page-aligned.
+.IP
+The implementation
+.I may
+attempt to write back dirty pages in the specified region,
+but this is not guaranteed.
+Any unwritten dirty pages will not be freed.
+If the application wishes to ensure that dirty pages will be released,
+it should call
+.BR fsync (2)
+or
+.BR fdatasync (2)
+first.
+.SH RETURN VALUE
+On success, zero is returned.
+On error, an error number is returned.
+.SH ERRORS
+.TP
+.B EBADF
+The \fIfd\fP argument was not a valid file descriptor.
+.TP
+.B EINVAL
+An invalid value was specified for \fIadvice\fP.
+.TP
+.B ESPIPE
+The specified file descriptor refers to a pipe or FIFO.
+.RB ( ESPIPE
+is the error specified by POSIX,
+but before Linux 2.6.16,
+.\" commit 87ba81dba431232548ce29d5d224115d0c2355ac
+Linux returned
+.B EINVAL
+in this case.)
+.SH VERSIONS
+Under Linux, \fBPOSIX_FADV_NORMAL\fP sets the readahead window to the
+default size for the backing device; \fBPOSIX_FADV_SEQUENTIAL\fP doubles
+this size, and \fBPOSIX_FADV_RANDOM\fP disables file readahead entirely.
+These changes affect the entire file, not just the specified region
+(but other open file handles to the same file are unaffected).
+.SS C library/kernel differences
+The name of the wrapper function in the C library is
+.BR posix_fadvise ().
+The underlying system call is called
+.BR fadvise64 ()
+(or, on some architectures,
+.BR fadvise64_64 ());
+the difference between the two is that the former system call
+assumes that the type of the \fIlen\fP argument is \fIsize_t\fP,
+while the latter expects \fIloff_t\fP there.
+.SS Architecture-specific variants
+Some architectures require
+64-bit arguments to be aligned in a suitable pair of registers (see
+.BR syscall (2)
+for further detail).
+On such architectures, the call signature of
+.BR posix_fadvise ()
+shown in the SYNOPSIS would force
+a register to be wasted as padding between the
+.I fd
+and
+.I offset
+arguments.
+Therefore, these architectures define a version of the
+system call that orders the arguments suitably,
+but is otherwise exactly the same as
+.BR posix_fadvise ().
+.PP
+For example, since Linux 2.6.14, ARM has the following system call:
+.PP
+.in +4n
+.EX
+.BI "long arm_fadvise64_64(int " fd ", int " advice ,
+.BI " loff_t " offset ", loff_t " len );
+.EE
+.in
+.PP
+These architecture-specific details are generally
+hidden from applications by the glibc
+.BR posix_fadvise ()
+wrapper function,
+which invokes the appropriate architecture-specific system call.
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+POSIX.1-2001.
+.PP
+Kernel support first appeared in Linux 2.5.60;
+the underlying system call is called
+.BR fadvise64 ().
+.\" of fadvise64_64()
+Library support has been provided since glibc 2.2,
+via the wrapper function
+.BR posix_fadvise ().
+.PP
+Since Linux 3.18,
+.\" commit d3ac21cacc24790eb45d735769f35753f5b56ceb
+support for the underlying system call is optional,
+depending on the setting of the
+.B CONFIG_ADVISE_SYSCALLS
+configuration option.
+.PP
+The type of the
+.I len
+argument was changed from
+.I size_t
+to
+.I off_t
+in POSIX.1-2001 TC1.
+.SH NOTES
+The contents of the kernel buffer cache can be cleared via the
+.I /proc/sys/vm/drop_caches
+interface described in
+.BR proc (5).
+.PP
+One can obtain a snapshot of which pages of a file are resident
+in the buffer cache by opening a file, mapping it with
+.BR mmap (2),
+and then applying
+.BR mincore (2)
+to the mapping.
+.SH BUGS
+Before Linux 2.6.6, if
+.I len
+was specified as 0, then this was interpreted literally as "zero bytes",
+rather than as meaning "all bytes through to the end of the file".
+.SH SEE ALSO
+.BR fincore (1),
+.BR mincore (2),
+.BR readahead (2),
+.BR sync_file_range (2),
+.BR posix_fallocate (3),
+.BR posix_madvise (3)
diff --git a/man2/ppoll.2 b/man2/ppoll.2
new file mode 100644
index 0000000..227cd0e
--- /dev/null
+++ b/man2/ppoll.2
@@ -0,0 +1 @@
+.so man2/poll.2
diff --git a/man2/prctl.2 b/man2/prctl.2
new file mode 100644
index 0000000..a592bba
--- /dev/null
+++ b/man2/prctl.2
@@ -0,0 +1,2544 @@
+.\" Copyright (C) 1998 Andries Brouwer (aeb@cwi.nl)
+.\" and Copyright (C) 2002, 2006, 2008, 2012, 2013, 2015 Michael Kerrisk <mtk.manpages@gmail.com>
+.\" and Copyright Guillem Jover <guillem@hadrons.org>
+.\" and Copyright (C) 2010 Andi Kleen <andi@firstfloor.org>
+.\" and Copyright (C) 2012 Cyrill Gorcunov <gorcunov@openvz.org>
+.\" and Copyright (C) 2014 Dave Hansen / Intel
+.\" and Copyright (c) 2016 Eugene Syromyatnikov <evgsyr@gmail.com>
+.\" and Copyright (c) 2018 Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+.\" and Copyright (c) 2020 Dave Martin <Dave.Martin@arm.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified Thu Nov 11 04:19:42 MET 1999, aeb: added PR_GET_PDEATHSIG
+.\" Modified 27 Jun 02, Michael Kerrisk
+.\" Added PR_SET_DUMPABLE, PR_GET_DUMPABLE,
+.\" PR_SET_KEEPCAPS, PR_GET_KEEPCAPS
+.\" Modified 2006-08-30 Guillem Jover <guillem@hadrons.org>
+.\" Updated Linux versions where the options were introduced.
+.\" Added PR_SET_TIMING, PR_GET_TIMING, PR_SET_NAME, PR_GET_NAME,
+.\" PR_SET_UNALIGN, PR_GET_UNALIGN, PR_SET_FPEMU, PR_GET_FPEMU,
+.\" PR_SET_FPEXC, PR_GET_FPEXC
+.\" 2008-04-29 Serge Hallyn, Document PR_CAPBSET_READ and PR_CAPBSET_DROP
+.\" 2008-06-13 Erik Bosman, <ejbosman@cs.vu.nl>
+.\" Document PR_GET_TSC and PR_SET_TSC.
+.\" 2008-06-15 mtk, Document PR_SET_SECCOMP, PR_GET_SECCOMP
+.\" 2009-10-03 Andi Kleen, document PR_MCE_KILL
+.\" 2012-04 Cyrill Gorcunov, Document PR_SET_MM
+.\" 2012-04-25 Michael Kerrisk, Document PR_TASK_PERF_EVENTS_DISABLE and
+.\" PR_TASK_PERF_EVENTS_ENABLE
+.\" 2012-09-20 Kees Cook, update PR_SET_SECCOMP for mode 2
+.\" 2012-09-20 Kees Cook, document PR_SET_NO_NEW_PRIVS, PR_GET_NO_NEW_PRIVS
+.\" 2012-10-25 Michael Kerrisk, Document PR_SET_TIMERSLACK and
+.\" PR_GET_TIMERSLACK
+.\" 2013-01-10 Kees Cook, document PR_SET_PTRACER
+.\" 2012-02-04 Michael Kerrisk, document PR_{SET,GET}_CHILD_SUBREAPER
+.\" 2014-11-10 Dave Hansen, document PR_MPX_{EN,DIS}ABLE_MANAGEMENT
+.\"
+.\"
+.TH prctl 2 2023-07-28 "Linux man-pages 6.05.01"
+.SH NAME
+prctl \- operations on a process or thread
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/prctl.h>
+.PP
+.BI "int prctl(int " option ", ..."
+.BI " \fR/*\fP unsigned long " arg2 ", unsigned long " arg3 ,
+.BI " unsigned long " arg4 ", unsigned long " arg5 " \fR*/\fP );"
+.fi
+.SH DESCRIPTION
+.BR prctl ()
+manipulates various aspects of the behavior
+of the calling thread or process.
+.PP
+Note that careless use of some
+.BR prctl ()
+operations can confuse the user-space run-time environment,
+so these operations should be used with care.
+.PP
+.BR prctl ()
+is called with a first argument describing what to do
+(with values defined in \fI<linux/prctl.h>\fP), and further
+arguments with a significance depending on the first one.
+The first argument can be:
+.\"
+.\" prctl PR_CAP_AMBIENT
+.TP
+.BR PR_CAP_AMBIENT " (since Linux 4.3)"
+.\" commit 58319057b7847667f0c9585b9de0e8932b0fdb08
+Reads or changes the ambient capability set of the calling thread,
+according to the value of
+.IR arg2 ,
+which must be one of the following:
+.RS
+.\"
+.TP
+.B PR_CAP_AMBIENT_RAISE
+The capability specified in
+.I arg3
+is added to the ambient set.
+The specified capability must already be present in
+both the permitted and the inheritable sets of the process.
+This operation is not permitted if the
+.B SECBIT_NO_CAP_AMBIENT_RAISE
+securebit is set.
+.TP
+.B PR_CAP_AMBIENT_LOWER
+The capability specified in
+.I arg3
+is removed from the ambient set.
+.TP
+.B PR_CAP_AMBIENT_IS_SET
+The
+.BR prctl ()
+call returns 1 if the capability in
+.I arg3
+is in the ambient set and 0 if it is not.
+.TP
+.B PR_CAP_AMBIENT_CLEAR_ALL
+All capabilities will be removed from the ambient set.
+This operation requires setting
+.I arg3
+to zero.
+.RE
+.IP
+In all of the above operations,
+.I arg4
+and
+.I arg5
+must be specified as 0.
+.IP
+Higher-level interfaces layered on top of the above operations are
+provided in the
+.BR libcap (3)
+library in the form of
+.BR cap_get_ambient (3),
+.BR cap_set_ambient (3),
+and
+.BR cap_reset_ambient (3).
+.\" prctl PR_CAPBSET_READ
+.TP
+.BR PR_CAPBSET_READ " (since Linux 2.6.25)"
+Return (as the function result) 1 if the capability specified in
+.I arg2
+is in the calling thread's capability bounding set,
+or 0 if it is not.
+(The capability constants are defined in
+.IR <linux/capability.h> .)
+The capability bounding set dictates
+whether the process can receive the capability through a
+file's permitted capability set on a subsequent call to
+.BR execve (2).
+.IP
+If the capability specified in
+.I arg2
+is not valid, then the call fails with the error
+.BR EINVAL .
+.IP
+A higher-level interface layered on top of this operation is provided in the
+.BR libcap (3)
+library in the form of
+.BR cap_get_bound (3).
+.\" prctl PR_CAPBSET_DROP
+.TP
+.BR PR_CAPBSET_DROP " (since Linux 2.6.25)"
+If the calling thread has the
+.B CAP_SETPCAP
+capability within its user namespace, then drop the capability specified by
+.I arg2
+from the calling thread's capability bounding set.
+Any children of the calling thread will inherit the newly
+reduced bounding set.
+.IP
+The call fails with the error:
+.B EPERM
+if the calling thread does not have the
+.B CAP_SETPCAP
+capability;
+.B EINVAL
+if
+.I arg2
+does not represent a valid capability; or
+.B EINVAL
+if file capabilities are not enabled in the kernel,
+in which case bounding sets are not supported.
+.IP
+A higher-level interface layered on top of this operation is provided in the
+.BR libcap (3)
+library in the form of
+.BR cap_drop_bound (3).
+.\" prctl PR_SET_CHILD_SUBREAPER
+.TP
+.BR PR_SET_CHILD_SUBREAPER " (since Linux 3.4)"
+.\" commit ebec18a6d3aa1e7d84aab16225e87fd25170ec2b
+If
+.I arg2
+is nonzero,
+set the "child subreaper" attribute of the calling process;
+if
+.I arg2
+is zero, unset the attribute.
+.IP
+A subreaper fulfills the role of
+.BR init (1)
+for its descendant processes.
+When a process becomes orphaned
+(i.e., its immediate parent terminates),
+then that process will be reparented to
+the nearest still living ancestor subreaper.
+Subsequently, calls to
+.BR getppid (2)
+in the orphaned process will now return the PID of the subreaper process,
+and when the orphan terminates, it is the subreaper process that
+will receive a
+.B SIGCHLD
+signal and will be able to
+.BR wait (2)
+on the process to discover its termination status.
+.IP
+The setting of the "child subreaper" attribute
+is not inherited by children created by
+.BR fork (2)
+and
+.BR clone (2).
+The setting is preserved across
+.BR execve (2).
+.IP
+Establishing a subreaper process is useful in session management frameworks
+where a hierarchical group of processes is managed by a subreaper process
+that needs to be informed when one of the processes\[em]for example,
+a double-forked daemon\[em]terminates
+(perhaps so that it can restart that process).
+Some
+.BR init (1)
+frameworks (e.g.,
+.BR systemd (1))
+employ a subreaper process for similar reasons.
+.\" prctl PR_GET_CHILD_SUBREAPER
+.TP
+.BR PR_GET_CHILD_SUBREAPER " (since Linux 3.4)"
+Return the "child subreaper" setting of the caller,
+in the location pointed to by
+.IR "(int\~*) arg2" .
+.\" prctl PR_SET_DUMPABLE
+.TP
+.BR PR_SET_DUMPABLE " (since Linux 2.3.20)"
+Set the state of the "dumpable" attribute,
+which determines whether core dumps are produced for the calling process
+upon delivery of a signal whose default behavior is to produce a core dump.
+.IP
+Up to and including Linux 2.6.12,
+.I arg2
+must be either 0
+.RB ( SUID_DUMP_DISABLE ,
+process is not dumpable) or 1
+.RB ( SUID_DUMP_USER ,
+process is dumpable).
+Between Linux 2.6.13 and Linux 2.6.17,
+.\" commit abf75a5033d4da7b8a7e92321d74021d1fcfb502
+the value 2 was also permitted,
+which caused any binary which normally would not be dumped
+to be dumped readable by root only;
+for security reasons, this feature has been removed.
+.\" See http://marc.theaimsgroup.com/?l=linux-kernel&m=115270289030630&w=2
+.\" Subject: Fix prctl privilege escalation (CVE-2006-2451)
+.\" From: Marcel Holtmann <marcel () holtmann ! org>
+.\" Date: 2006-07-12 11:12:00
+(See also the description of
+.I /proc/sys/fs/\:suid_dumpable
+in
+.BR proc (5).)
+.IP
+Normally, the "dumpable" attribute is set to 1.
+However, it is reset to the current value contained in the file
+.I /proc/sys/fs/\:suid_dumpable
+(which by default has the value 0),
+in the following circumstances:
+.\" See kernel/cred.c::commit_creds() (Linux 3.18 sources)
+.RS
+.IP \[bu] 3
+The process's effective user or group ID is changed.
+.IP \[bu]
+The process's filesystem user or group ID is changed (see
+.BR credentials (7)).
+.IP \[bu]
+The process executes
+.RB ( execve (2))
+a set-user-ID or set-group-ID program, resulting in a change
+of either the effective user ID or the effective group ID.
+.IP \[bu]
+The process executes
+.RB ( execve (2))
+a program that has file capabilities (see
+.BR capabilities (7)),
+.\" See kernel/cred.c::commit_creds()
+but only if the permitted capabilities
+gained exceed those already permitted for the process.
+.\" Also certain namespace operations;
+.RE
+.IP
+Processes that are not dumpable cannot be attached via
+.BR ptrace (2)
+.BR PTRACE_ATTACH ;
+see
+.BR ptrace (2)
+for further details.
+.IP
+If a process is not dumpable,
+the ownership of files in the process's
+.IR /proc/ pid
+directory is affected as described in
+.BR proc (5).
+.\" prctl PR_GET_DUMPABLE
+.TP
+.BR PR_GET_DUMPABLE " (since Linux 2.3.20)"
+Return (as the function result) the current state of the calling
+process's dumpable attribute.
+.\" Since Linux 2.6.13, the dumpable flag can have the value 2,
+.\" but in Linux 2.6.13 PR_GET_DUMPABLE simply returns 1 if the dumpable
+.\" flags has a nonzero value. This was fixed in Linux 2.6.14.
+.\" prctl PR_SET_ENDIAN
+.TP
+.BR PR_SET_ENDIAN " (since Linux 2.6.18, PowerPC only)"
+Set the endian-ness of the calling process to the value given
+in \fIarg2\fP, which should be one of the following:
+.\" Respectively 0, 1, 2
+.BR PR_ENDIAN_BIG ,
+.BR PR_ENDIAN_LITTLE ,
+or
+.B PR_ENDIAN_PPC_LITTLE
+(PowerPC pseudo little endian).
+.\" prctl PR_GET_ENDIAN
+.TP
+.BR PR_GET_ENDIAN " (since Linux 2.6.18, PowerPC only)"
+Return the endian-ness of the calling process,
+in the location pointed to by
+.IR "(int\~*) arg2" .
+.\" prctl PR_SET_FP_MODE
+.TP
+.BR PR_SET_FP_MODE " (since Linux 4.0, only on MIPS)"
+.\" commit 9791554b45a2acc28247f66a5fd5bbc212a6b8c8
+On the MIPS architecture,
+user-space code can be built using an ABI which permits linking
+with code that has more restrictive floating-point (FP) requirements.
+For example, user-space code may be built to target the O32 FPXX ABI
+and linked with code built for either one of the more restrictive
+FP32 or FP64 ABIs.
+When more restrictive code is linked in,
+the overall requirement for the process is to use the more
+restrictive floating-point mode.
+.IP
+Because the kernel has no means of knowing in advance
+which mode the process should be executed in,
+and because these restrictions can
+change over the lifetime of the process, the
+.B PR_SET_FP_MODE
+operation is provided to allow control of the floating-point mode
+from user space.
+.IP
+.\" https://dmz-portal.mips.com/wiki/MIPS_O32_ABI_-_FR0_and_FR1_Interlinking
+The
+.I (unsigned int) arg2
+argument is a bit mask describing the floating-point mode used:
+.RS
+.TP
+.B PR_FP_MODE_FR
+When this bit is
+.I unset
+(so called
+.BR FR=0 " or " FR0
+mode), the 32 floating-point registers are 32 bits wide,
+and 64-bit registers are represented as a pair of registers
+(even- and odd- numbered,
+with the even-numbered register containing the lower 32 bits,
+and the odd-numbered register containing the higher 32 bits).
+.IP
+When this bit is
+.I set
+(on supported hardware),
+the 32 floating-point registers are 64 bits wide (so called
+.BR FR=1 " or " FR1
+mode).
+Note that modern MIPS implementations (MIPS R6 and newer) support
+.B FR=1
+mode only.
+.IP
+Applications that use the O32 FP32 ABI can operate only when this bit is
+.I unset
+.RB ( FR=0 ;
+or they can be used with FRE enabled, see below).
+Applications that use the O32 FP64 ABI
+(and the O32 FP64A ABI, which exists to
+provide the ability to operate with existing FP32 code; see below)
+can operate only when this bit is
+.I set
+.RB ( FR=1 ).
+Applications that use the O32 FPXX ABI can operate with either
+.B FR=0
+or
+.BR FR=1 .
+.TP
+.B PR_FP_MODE_FRE
+Enable emulation of 32-bit floating-point mode.
+When this mode is enabled,
+it emulates 32-bit floating-point operations
+by raising a reserved-instruction exception
+on every instruction that uses 32-bit formats and
+the kernel then handles the instruction in software.
+(The problem lies in the discrepancy of handling odd-numbered registers
+which are the high 32 bits of 64-bit registers with even numbers in
+.B FR=0
+mode and the lower 32-bit parts of odd-numbered 64-bit registers in
+.B FR=1
+mode.)
+Enabling this bit is necessary when code with the O32 FP32 ABI should operate
+with code with the compatible O32 FPXX or O32 FP64A ABIs (which require
+.B FR=1
+FPU mode) or when it is executed on newer hardware (MIPS R6 onwards)
+which lacks
+.B FR=0
+mode support when a binary with the FP32 ABI is used.
+.IP
+Note that this mode makes sense only when the FPU is in 64-bit mode
+.RB ( FR=1 ).
+.IP
+Note that the use of emulation inherently has a significant performance hit
+and should be avoided if possible.
+.RE
+.IP
+In the N32/N64 ABI, 64-bit floating-point mode is always used,
+so FPU emulation is not required and the FPU always operates in
+.B FR=1
+mode.
+.IP
+This option is mainly intended for use by the dynamic linker
+.RB ( ld.so (8)).
+.IP
+The arguments
+.IR arg3 ,
+.IR arg4 ,
+and
+.I arg5
+are ignored.
+.\" prctl PR_GET_FP_MODE
+.TP
+.BR PR_GET_FP_MODE " (since Linux 4.0, only on MIPS)"
+Return (as the function result)
+the current floating-point mode (see the description of
+.B PR_SET_FP_MODE
+for details).
+.IP
+On success,
+the call returns a bit mask which represents the current floating-point mode.
+.IP
+The arguments
+.IR arg2 ,
+.IR arg3 ,
+.IR arg4 ,
+and
+.I arg5
+are ignored.
+.\" prctl PR_SET_FPEMU
+.TP
+.BR PR_SET_FPEMU " (since Linux 2.4.18, 2.5.9, only on ia64)"
+Set floating-point emulation control bits to \fIarg2\fP.
+Pass
+.B PR_FPEMU_NOPRINT
+to silently emulate floating-point operation accesses, or
+.B PR_FPEMU_SIGFPE
+to not emulate floating-point operations and send
+.B SIGFPE
+instead.
+.\" prctl PR_GET_FPEMU
+.TP
+.BR PR_GET_FPEMU " (since Linux 2.4.18, 2.5.9, only on ia64)"
+Return floating-point emulation control bits,
+in the location pointed to by
+.IR "(int\~*) arg2" .
+.\" prctl PR_SET_FPEXC
+.TP
+.BR PR_SET_FPEXC " (since Linux 2.4.21, 2.5.32, only on PowerPC)"
+Set floating-point exception mode to \fIarg2\fP.
+Pass \fBPR_FP_EXC_SW_ENABLE\fP to use FPEXC for FP exception enables,
+\fBPR_FP_EXC_DIV\fP for floating-point divide by zero,
+\fBPR_FP_EXC_OVF\fP for floating-point overflow,
+\fBPR_FP_EXC_UND\fP for floating-point underflow,
+\fBPR_FP_EXC_RES\fP for floating-point inexact result,
+\fBPR_FP_EXC_INV\fP for floating-point invalid operation,
+\fBPR_FP_EXC_DISABLED\fP for FP exceptions disabled,
+\fBPR_FP_EXC_NONRECOV\fP for async nonrecoverable exception mode,
+\fBPR_FP_EXC_ASYNC\fP for async recoverable exception mode,
+\fBPR_FP_EXC_PRECISE\fP for precise exception mode.
+.\" prctl PR_GET_FPEXC
+.TP
+.BR PR_GET_FPEXC " (since Linux 2.4.21, 2.5.32, only on PowerPC)"
+Return floating-point exception mode,
+in the location pointed to by
+.IR "(int\~*) arg2" .
+.\" prctl PR_SET_IO_FLUSHER
+.TP
+.BR PR_SET_IO_FLUSHER " (since Linux 5.6)"
+If a user process is involved in the block layer or filesystem I/O path,
+and can allocate memory while processing I/O requests, it must set
+\fIarg2\fP to 1.
+This will put the process in the IO_FLUSHER state,
+which allows it special treatment to make progress when allocating memory.
+If \fIarg2\fP is 0, the process will clear the IO_FLUSHER state, and
+the default behavior will be used.
+.IP
+The calling process must have the
+.B CAP_SYS_RESOURCE
+capability.
+.IP
+.IR arg3 ,
+.IR arg4 ,
+and
+.I arg5
+must be zero.
+.IP
+The IO_FLUSHER state is inherited by a child process created via
+.BR fork (2)
+and is preserved across
+.BR execve (2).
+.IP
+Examples of IO_FLUSHER applications are FUSE daemons, SCSI device
+emulation daemons, and daemons that perform error handling like multipath
+path recovery applications.
+.\" prctl PR_GET_IO_FLUSHER
+.TP
+.BR PR_GET_IO_FLUSHER " (since Linux 5.6)"
+Return (as the function result) the IO_FLUSHER state of the caller.
+A value of 1 indicates that the caller is in the IO_FLUSHER state;
+0 indicates that the caller is not in the IO_FLUSHER state.
+.IP
+The calling process must have the
+.B CAP_SYS_RESOURCE
+capability.
+.IP
+.IR arg2 ,
+.IR arg3 ,
+.IR arg4 ,
+and
+.I arg5
+must be zero.
+.\" prctl PR_SET_KEEPCAPS
+.TP
+.BR PR_SET_KEEPCAPS " (since Linux 2.2.18)"
+Set the state of the calling thread's "keep capabilities" flag.
+The effect of this flag is described in
+.BR capabilities (7).
+.I arg2
+must be either 0 (clear the flag)
+or 1 (set the flag).
+The "keep capabilities" value will be reset to 0 on subsequent calls to
+.BR execve (2).
+.\" prctl PR_GET_KEEPCAPS
+.TP
+.BR PR_GET_KEEPCAPS " (since Linux 2.2.18)"
+Return (as the function result) the current state of the calling thread's
+"keep capabilities" flag.
+See
+.BR capabilities (7)
+for a description of this flag.
+.\" prctl PR_MCE_KILL
+.TP
+.BR PR_MCE_KILL " (since Linux 2.6.32)"
+Set the machine check memory corruption kill policy for the calling thread.
+If
+.I arg2
+is
+.BR PR_MCE_KILL_CLEAR ,
+clear the thread memory corruption kill policy and use the system-wide default.
+(The system-wide default is defined by
+.IR /proc/sys/vm/memory_failure_early_kill ;
+see
+.BR proc (5).)
+If
+.I arg2
+is
+.BR PR_MCE_KILL_SET ,
+use a thread-specific memory corruption kill policy.
+In this case,
+.I arg3
+defines whether the policy is
+.I early kill
+.RB ( PR_MCE_KILL_EARLY ),
+.I late kill
+.RB ( PR_MCE_KILL_LATE ),
+or the system-wide default
+.RB ( PR_MCE_KILL_DEFAULT ).
+Early kill means that the thread receives a
+.B SIGBUS
+signal as soon as hardware memory corruption is detected inside
+its address space.
+In late kill mode, the process is killed only when it accesses a corrupted page.
+See
+.BR sigaction (2)
+for more information on the
+.B SIGBUS
+signal.
+The policy is inherited by children.
+The remaining unused
+.BR prctl ()
+arguments must be zero for future compatibility.
+.\" prctl PR_MCE_KILL_GET
+.TP
+.BR PR_MCE_KILL_GET " (since Linux 2.6.32)"
+Return (as the function result)
+the current per-process machine check kill policy.
+All unused
+.BR prctl ()
+arguments must be zero.
+.\" prctl PR_SET_MM
+.TP
+.BR PR_SET_MM " (since Linux 3.3)"
+.\" commit 028ee4be34a09a6d48bdf30ab991ae933a7bc036
+Modify certain kernel memory map descriptor fields
+of the calling process.
+Usually these fields are set by the kernel and dynamic loader (see
+.BR ld.so (8)
+for more information) and a regular application should not use this feature.
+However, there are cases, such as self-modifying programs,
+where a program might find it useful to change its own memory map.
+.IP
+The calling process must have the
+.B CAP_SYS_RESOURCE
+capability.
+The value in
+.I arg2
+is one of the options below, while
+.I arg3
+provides a new value for the option.
+The
+.I arg4
+and
+.I arg5
+arguments must be zero if unused.
+.IP
+Before Linux 3.10,
+.\" commit 52b3694157e3aa6df871e283115652ec6f2d31e0
+this feature is available only if the kernel is built with the
+.B CONFIG_CHECKPOINT_RESTORE
+option enabled.
+.RS
+.TP
+.B PR_SET_MM_START_CODE
+Set the address above which the program text can run.
+The corresponding memory area must be readable and executable,
+but not writable or shareable (see
+.BR mprotect (2)
+and
+.BR mmap (2)
+for more information).
+.TP
+.B PR_SET_MM_END_CODE
+Set the address below which the program text can run.
+The corresponding memory area must be readable and executable,
+but not writable or shareable.
+.TP
+.B PR_SET_MM_START_DATA
+Set the address above which initialized and
+uninitialized (bss) data are placed.
+The corresponding memory area must be readable and writable,
+but not executable or shareable.
+.TP
+.B PR_SET_MM_END_DATA
+Set the address below which initialized and
+uninitialized (bss) data are placed.
+The corresponding memory area must be readable and writable,
+but not executable or shareable.
+.TP
+.B PR_SET_MM_START_STACK
+Set the start address of the stack.
+The corresponding memory area must be readable and writable.
+.TP
+.B PR_SET_MM_START_BRK
+Set the address above which the program heap can be expanded with a
+.BR brk (2)
+call.
+The address must be greater than the ending address of
+the current program data segment.
+In addition, the combined size of the resulting heap and
+the size of the data segment can't exceed the
+.B RLIMIT_DATA
+resource limit (see
+.BR setrlimit (2)).
+.TP
+.B PR_SET_MM_BRK
+Set the current
+.BR brk (2)
+value.
+The requirements for the address are the same as for the
+.B PR_SET_MM_START_BRK
+option.
+.PP
+The following options are available since Linux 3.5.
+.\" commit fe8c7f5cbf91124987106faa3bdf0c8b955c4cf7
+.TP
+.B PR_SET_MM_ARG_START
+Set the address above which the program command line is placed.
+.TP
+.B PR_SET_MM_ARG_END
+Set the address below which the program command line is placed.
+.TP
+.B PR_SET_MM_ENV_START
+Set the address above which the program environment is placed.
+.TP
+.B PR_SET_MM_ENV_END
+Set the address below which the program environment is placed.
+.IP
+The address passed with
+.BR PR_SET_MM_ARG_START ,
+.BR PR_SET_MM_ARG_END ,
+.BR PR_SET_MM_ENV_START ,
+and
+.B PR_SET_MM_ENV_END
+should belong to a process stack area.
+Thus, the corresponding memory area must be readable, writable, and
+(depending on the kernel configuration) have the
+.B MAP_GROWSDOWN
+attribute set (see
+.BR mmap (2)).
+.TP
+.B PR_SET_MM_AUXV
+Set a new auxiliary vector.
+The
+.I arg3
+argument should provide the address of the vector.
+The
+.I arg4
+is the size of the vector.
+.TP
+.B PR_SET_MM_EXE_FILE
+.\" commit b32dfe377102ce668775f8b6b1461f7ad428f8b6
+Supersede the
+.IR /proc/ pid /exe
+symbolic link with a new one pointing to a new executable file
+identified by the file descriptor provided in the
+.I arg3
+argument.
+The file descriptor should be obtained with a regular
+.BR open (2)
+call.
+.IP
+To change the symbolic link, one needs to unmap all existing
+executable memory areas, including those created by the kernel itself
+(for example the kernel usually creates at least one executable
+memory area for the ELF
+.I .text
+section).
+.IP
+In Linux 4.9 and earlier, the
+.\" commit 3fb4afd9a504c2386b8435028d43283216bf588e
+.B PR_SET_MM_EXE_FILE
+operation can be performed only once in a process's lifetime;
+attempting to perform the operation a second time results in the error
+.BR EPERM .
+This restriction was enforced for security reasons that were subsequently
+deemed specious,
+and the restriction was removed in Linux 4.10 because some
+user-space applications needed to perform this operation more than once.
+.PP
+The following options are available since Linux 3.18.
+.\" commit f606b77f1a9e362451aca8f81d8f36a3a112139e
+.TP
+.B PR_SET_MM_MAP
+Provides one-shot access to all the addresses by passing in a
+.I struct prctl_mm_map
+(as defined in \fI<linux/prctl.h>\fP).
+The
+.I arg4
+argument should provide the size of the struct.
+.IP
+This feature is available only if the kernel is built with the
+.B CONFIG_CHECKPOINT_RESTORE
+option enabled.
+.TP
+.B PR_SET_MM_MAP_SIZE
+Returns the size of the
+.I struct prctl_mm_map
+the kernel expects.
+This allows user space to find a compatible struct.
+The
+.I arg4
+argument should be a pointer to an unsigned int.
+.IP
+This feature is available only if the kernel is built with the
+.B CONFIG_CHECKPOINT_RESTORE
+option enabled.
+.RE
+.\" prctl PR_SET_VMA
+.TP
+.BR PR_SET_VMA " (since Linux 5.17)"
+.\" Commit 9a10064f5625d5572c3626c1516e0bebc6c9fe9b
+Sets an attribute specified in
+.I arg2
+for virtual memory areas starting from the address specified in
+.I arg3
+and spanning the size specified in
+.IR arg4 .
+.I arg5
+specifies the value of the attribute to be set.
+.IP
+Note that assigning an attribute to a virtual memory area
+might prevent it from being merged with adjacent virtual memory areas
+due to the difference in that attribute's value.
+.IP
+Currently,
+.I arg2
+must be one of:
+.RS
+.TP
+.B PR_SET_VMA_ANON_NAME
+Set a name for anonymous virtual memory areas.
+.I arg5
+should be a pointer to a null-terminated string containing the name.
+The name length including null byte cannot exceed 80 bytes.
+If
+.I arg5
+is NULL, the name of the appropriate anonymous virtual memory areas
+will be reset.
+The name can contain only printable ASCII characters (including space),
+except \[aq][\[aq], \[aq]]\[aq], \[aq]\e\[aq], \[aq]$\[aq], and \[aq]\[ga]\[aq].
+.RE
+.\" prctl PR_MPX_ENABLE_MANAGEMENT
+.TP
+.BR PR_MPX_ENABLE_MANAGEMENT ", " PR_MPX_DISABLE_MANAGEMENT " (since Linux 3.19, removed in Linux 5.4; only on x86)"
+.\" commit fe3d197f84319d3bce379a9c0dc17b1f48ad358c
+.\" See also http://lwn.net/Articles/582712/
+.\" See also https://gcc.gnu.org/wiki/Intel%20MPX%20support%20in%20the%20GCC%20compiler
+Enable or disable kernel management of Memory Protection eXtensions (MPX)
+bounds tables.
+The
+.IR arg2 ,
+.IR arg3 ,
+.IR arg4 ,
+and
+.I arg5
+.\" commit e9d1b4f3c60997fe197bf0243cb4a41a44387a88
+arguments must be zero.
+.IP
+MPX is a hardware-assisted mechanism for performing bounds checking on
+pointers.
+It consists of a set of registers storing bounds information
+and a set of special instruction prefixes that tell the CPU on which
+instructions it should do bounds enforcement.
+There is a limited number of these registers and
+when there are more pointers than registers,
+their contents must be "spilled" into a set of tables.
+These tables are called "bounds tables" and the MPX
+.BR prctl ()
+operations control
+whether the kernel manages their allocation and freeing.
+.IP
+When management is enabled, the kernel will take over allocation
+and freeing of the bounds tables.
+It does this by trapping the #BR exceptions that result
+at first use of missing bounds tables and
+instead of delivering the exception to user space,
+it allocates the table and populates the bounds directory
+with the location of the new table.
+For freeing, the kernel checks to see if bounds tables are
+present for memory which is not allocated, and frees them if so.
+.IP
+Before enabling MPX management using
+.BR PR_MPX_ENABLE_MANAGEMENT ,
+the application must first have allocated a user-space buffer for
+the bounds directory and placed the location of that directory in the
+.I bndcfgu
+register.
+.IP
+These calls fail if the CPU or kernel does not support MPX.
+Kernel support for MPX is enabled via the
+.B CONFIG_X86_INTEL_MPX
+configuration option.
+You can check whether the CPU supports MPX by looking for the
+.I mpx
+CPUID bit, like with the following command:
+.IP
+.in +4n
+.EX
+cat /proc/cpuinfo | grep \[aq] mpx \[aq]
+.EE
+.in
+.IP
+A thread may not switch in or out of long (64-bit) mode while MPX is
+enabled.
+.IP
+All threads in a process are affected by these calls.
+.IP
+The child of a
+.BR fork (2)
+inherits the state of MPX management.
+During
+.BR execve (2),
+MPX management is reset to a state as if
+.B PR_MPX_DISABLE_MANAGEMENT
+had been called.
+.IP
+For further information on Intel MPX, see the kernel source file
+.IR Documentation/x86/intel_mpx.txt .
+.IP
+.\" commit f240652b6032b48ad7fa35c5e701cc4c8d697c0b
+.\" See also https://lkml.kernel.org/r/20190705175321.DB42F0AD@viggo.jf.intel.com
+Due to a lack of toolchain support,
+.BR PR_MPX_ENABLE_MANAGEMENT " and " PR_MPX_DISABLE_MANAGEMENT
+are not supported in Linux 5.4 and later.
+.\" prctl PR_SET_NAME
+.TP
+.BR PR_SET_NAME " (since Linux 2.6.9)"
+Set the name of the calling thread,
+using the value in the location pointed to by
+.IR "(char\~*) arg2" .
+The name can be up to 16 bytes long,
+.\" TASK_COMM_LEN in include/linux/sched.h
+including the terminating null byte.
+(If the length of the string, including the terminating null byte,
+exceeds 16 bytes, the string is silently truncated.)
+This is the same attribute that can be set via
+.BR pthread_setname_np (3)
+and retrieved using
+.BR pthread_getname_np (3).
+The attribute is likewise accessible via
+.IR /proc/self/task/ tid /comm
+(see
+.BR proc (5)),
+where
+.I tid
+is the thread ID of the calling thread, as returned by
+.BR gettid (2).
+.\" prctl PR_GET_NAME
+.TP
+.BR PR_GET_NAME " (since Linux 2.6.11)"
+Return the name of the calling thread,
+in the buffer pointed to by
+.IR "(char\~*) arg2" .
+The buffer should allow space for up to 16 bytes;
+the returned string will be null-terminated.
+.\" prctl PR_SET_NO_NEW_PRIVS
+.TP
+.BR PR_SET_NO_NEW_PRIVS " (since Linux 3.5)"
+Set the calling thread's
+.I no_new_privs
+attribute to the value in
+.IR arg2 .
+With
+.I no_new_privs
+set to 1,
+.BR execve (2)
+promises not to grant privileges to do anything
+that could not have been done without the
+.BR execve (2)
+call (for example,
+rendering the set-user-ID and set-group-ID mode bits,
+and file capabilities non-functional).
+Once set, the
+.I no_new_privs
+attribute cannot be unset.
+The setting of this attribute is inherited by children created by
+.BR fork (2)
+and
+.BR clone (2),
+and preserved across
+.BR execve (2).
+.IP
+Since Linux 4.10,
+the value of a thread's
+.I no_new_privs
+attribute can be viewed via the
+.I NoNewPrivs
+field in the
+.IR /proc/ pid /status
+file.
+.IP
+For more information, see the kernel source file
+.I Documentation/userspace\-api/no_new_privs.rst
+.\" commit 40fde647ccb0ae8c11d256d271e24d385eed595b
+(or
+.I Documentation/prctl/no_new_privs.txt
+before Linux 4.13).
+See also
+.BR seccomp (2).
+.\" prctl PR_GET_NO_NEW_PRIVS
+.TP
+.BR PR_GET_NO_NEW_PRIVS " (since Linux 3.5)"
+Return (as the function result) the value of the
+.I no_new_privs
+attribute for the calling thread.
+A value of 0 indicates the regular
+.BR execve (2)
+behavior.
+A value of 1 indicates
+.BR execve (2)
+will operate in the privilege-restricting mode described above.
+.\" prctl PR_PAC_RESET_KEYS
+.\" commit ba830885656414101b2f8ca88786524d4bb5e8c1
+.TP
+.BR PR_PAC_RESET_KEYS " (since Linux 5.0, only on arm64)"
+Securely reset the thread's pointer authentication keys
+to fresh random values generated by the kernel.
+.IP
+The set of keys to be reset is specified by
+.IR arg2 ,
+which must be a logical OR of zero or more of the following:
+.RS
+.TP
+.B PR_PAC_APIAKEY
+instruction authentication key A
+.TP
+.B PR_PAC_APIBKEY
+instruction authentication key B
+.TP
+.B PR_PAC_APDAKEY
+data authentication key A
+.TP
+.B PR_PAC_APDBKEY
+data authentication key B
+.TP
+.B PR_PAC_APGAKEY
+generic authentication \[lq]A\[rq] key.
+.IP
+(Yes folks, there really is no generic B key.)
+.RE
+.IP
+As a special case, if
+.I arg2
+is zero, then all the keys are reset.
+Since new keys could be added in future,
+this is the recommended way to completely wipe the existing keys
+when establishing a clean execution context.
+Note that there is no need to use
+.B PR_PAC_RESET_KEYS
+in preparation for calling
+.BR execve (2),
+since
+.BR execve (2)
+resets all the pointer authentication keys.
+.IP
+The remaining arguments
+.IR arg3 ", " arg4 ", and " arg5
+must all be zero.
+.IP
+If the arguments are invalid,
+and in particular if
+.I arg2
+contains set bits that are unrecognized
+or that correspond to a key not available on this platform,
+then the call fails with error
+.BR EINVAL .
+.IP
+.B Warning:
+Because the compiler or run-time environment
+may be using some or all of the keys,
+a successful
+.B PR_PAC_RESET_KEYS
+may crash the calling process.
+The conditions for using it safely are complex and system-dependent.
+Don't use it unless you know what you are doing.
+.IP
+For more information, see the kernel source file
+.I Documentation/arm64/pointer\-authentication.rst
+.\"commit b693d0b372afb39432e1c49ad7b3454855bc6bed
+(or
+.I Documentation/arm64/pointer\-authentication.txt
+before Linux 5.3).
+.\" prctl PR_SET_PDEATHSIG
+.TP
+.BR PR_SET_PDEATHSIG " (since Linux 2.1.57)"
+Set the parent-death signal
+of the calling process to \fIarg2\fP (either a signal value
+in the range
+.RB [ 1 ,
+.IR NSIG\~\-\~1 ],
+or
+.B 0
+to clear).
+This is the signal that the calling process will get when its
+parent dies.
+.IP
+.IR Warning :
+.\" https://bugzilla.kernel.org/show_bug.cgi?id=43300
+the "parent" in this case is considered to be the
+.I thread
+that created this process.
+In other words, the signal will be sent when that thread terminates
+(via, for example,
+.BR pthread_exit (3)),
+rather than after all of the threads in the parent process terminate.
+.IP
+The parent-death signal is sent upon subsequent termination of the parent
+thread and also upon termination of each subreaper process
+(see the description of
+.B PR_SET_CHILD_SUBREAPER
+above) to which the caller is subsequently reparented.
+If the parent thread and all ancestor subreapers have already terminated
+by the time of the
+.B PR_SET_PDEATHSIG
+operation, then no parent-death signal is sent to the caller.
+.IP
+The parent-death signal is process-directed (see
+.BR signal (7))
+and, if the child installs a handler using the
+.BR sigaction (2)
+.B SA_SIGINFO
+flag, the
+.I si_pid
+field of the
+.I siginfo_t
+argument of the handler contains the PID of the terminating parent process.
+.IP
+The parent-death signal setting is cleared for the child of a
+.BR fork (2).
+It is also
+(since Linux 2.4.36 / 2.6.23)
+.\" commit d2d56c5f51028cb9f3d800882eb6f4cbd3f9099f
+cleared when executing a set-user-ID or set-group-ID binary,
+or a binary that has associated capabilities (see
+.BR capabilities (7));
+otherwise, this value is preserved across
+.BR execve (2).
+The parent-death signal setting is also cleared upon changes to
+any of the following thread credentials:
+.\" FIXME capability changes can also trigger this; see
+.\" kernel/cred.c::commit_creds in the Linux 5.6 source.
+effective user ID, effective group ID, filesystem user ID,
+or filesystem group ID.
+.\" prctl PR_GET_PDEATHSIG
+.TP
+.BR PR_GET_PDEATHSIG " (since Linux 2.3.15)"
+Return the current value of the parent process death signal,
+in the location pointed to by
+.IR "(int\~*) arg2" .
+.\" prctl PR_SET_PTRACER
+.TP
+.BR PR_SET_PTRACER " (since Linux 3.4)"
+.\" commit 2d514487faf188938a4ee4fb3464eeecfbdcf8eb
+.\" commit bf06189e4d14641c0148bea16e9dd24943862215
+This is meaningful only when the Yama LSM is enabled and in mode 1
+("restricted ptrace", visible via
+.IR /proc/sys/kernel/yama/ptrace_scope ).
+When a "ptracer process ID" is passed in \fIarg2\fP,
+the caller is declaring that the ptracer process can
+.BR ptrace (2)
+the calling process as if it were a direct process ancestor.
+Each
+.B PR_SET_PTRACER
+operation replaces the previous "ptracer process ID".
+Employing
+.B PR_SET_PTRACER
+with
+.I arg2
+set to 0 clears the caller's "ptracer process ID".
+If
+.I arg2
+is
+.BR PR_SET_PTRACER_ANY ,
+the ptrace restrictions introduced by Yama are effectively disabled for the
+calling process.
+.IP
+For further information, see the kernel source file
+.I Documentation/admin\-guide/LSM/Yama.rst
+.\" commit 90bb766440f2147486a2acc3e793d7b8348b0c22
+(or
+.I Documentation/security/Yama.txt
+before Linux 4.13).
+.\" prctl PR_SET_SECCOMP
+.TP
+.BR PR_SET_SECCOMP " (since Linux 2.6.23)"
+.\" See http://thread.gmane.org/gmane.linux.kernel/542632
+.\" [PATCH 0 of 2] seccomp updates
+.\" andrea@cpushare.com
+Set the secure computing (seccomp) mode for the calling thread, to limit
+the available system calls.
+The more recent
+.BR seccomp (2)
+system call provides a superset of the functionality of
+.BR PR_SET_SECCOMP ,
+and is the preferred interface for new applications.
+.IP
+The seccomp mode is selected via
+.IR arg2 .
+(The seccomp constants are defined in
+.IR <linux/seccomp.h> .)
+The following values can be specified:
+.RS
+.TP
+.BR SECCOMP_MODE_STRICT " (since Linux 2.6.23)"
+See the description of
+.B SECCOMP_SET_MODE_STRICT
+in
+.BR seccomp (2).
+.IP
+This operation is available only
+if the kernel is configured with
+.B CONFIG_SECCOMP
+enabled.
+.TP
+.BR SECCOMP_MODE_FILTER " (since Linux 3.5)"
+The allowed system calls are defined by a pointer
+to a Berkeley Packet Filter passed in
+.IR arg3 .
+This argument is a pointer to
+.IR "struct sock_fprog" ;
+it can be designed to filter
+arbitrary system calls and system call arguments.
+See the description of
+.B SECCOMP_SET_MODE_FILTER
+in
+.BR seccomp (2).
+.IP
+This operation is available only
+if the kernel is configured with
+.B CONFIG_SECCOMP_FILTER
+enabled.
+.RE
+.IP
+For further details on seccomp filtering, see
+.BR seccomp (2).
+.\" prctl PR_GET_SECCOMP
+.TP
+.BR PR_GET_SECCOMP " (since Linux 2.6.23)"
+Return (as the function result)
+the secure computing mode of the calling thread.
+If the caller is not in secure computing mode, this operation returns 0;
+if the caller is in strict secure computing mode, then the
+.BR prctl ()
+call will cause a
+.B SIGKILL
+signal to be sent to the process.
+If the caller is in filter mode, and this system call is allowed by the
+seccomp filters, it returns 2; otherwise, the process is killed with a
+.B SIGKILL
+signal.
+.IP
+This operation is available only
+if the kernel is configured with
+.B CONFIG_SECCOMP
+enabled.
+.IP
+Since Linux 3.8, the
+.I Seccomp
+field of the
+.IR /proc/ pid /status
+file provides a method of obtaining the same information,
+without the risk that the process is killed; see
+.BR proc (5).
+.\" prctl PR_SET_SECUREBITS
+.TP
+.BR PR_SET_SECUREBITS " (since Linux 2.6.26)"
+Set the "securebits" flags of the calling thread to the value supplied in
+.IR arg2 .
+See
+.BR capabilities (7).
+.\" prctl PR_GET_SECUREBITS
+.TP
+.BR PR_GET_SECUREBITS " (since Linux 2.6.26)"
+Return (as the function result)
+the "securebits" flags of the calling thread.
+See
+.BR capabilities (7).
+.\" prctl PR_GET_SPECULATION_CTRL
+.TP
+.BR PR_GET_SPECULATION_CTRL " (since Linux 4.17)"
+Return (as the function result)
+the state of the speculation misfeature specified in
+.IR arg2 .
+Currently, the only permitted value for this argument is
+.B PR_SPEC_STORE_BYPASS
+(otherwise the call fails with the error
+.BR ENODEV ).
+.IP
+The return value uses bits 0-3 with the following meaning:
+.RS
+.TP
+.B PR_SPEC_PRCTL
+Mitigation can be controlled per thread by
+.BR PR_SET_SPECULATION_CTRL .
+.TP
+.B PR_SPEC_ENABLE
+The speculation feature is enabled, mitigation is disabled.
+.TP
+.B PR_SPEC_DISABLE
+The speculation feature is disabled, mitigation is enabled.
+.TP
+.B PR_SPEC_FORCE_DISABLE
+Same as
+.B PR_SPEC_DISABLE
+but cannot be undone.
+.TP
+.BR PR_SPEC_DISABLE_NOEXEC " (since Linux 5.1)"
+Same as
+.BR PR_SPEC_DISABLE ,
+but the state will be cleared on
+.BR execve (2).
+.RE
+.IP
+If all bits are 0,
+then the CPU is not affected by the speculation misfeature.
+.IP
+If
+.B PR_SPEC_PRCTL
+is set, then per-thread control of the mitigation is available.
+If not set,
+.BR prctl ()
+for the speculation misfeature will fail.
+.IP
+The
+.IR arg3 ,
+.IR arg4 ,
+and
+.I arg5
+arguments must be specified as 0; otherwise the call fails with the error
+.BR EINVAL .
+.\" prctl PR_SET_SPECULATION_CTRL
+.TP
+.BR PR_SET_SPECULATION_CTRL " (since Linux 4.17)"
+.\" commit b617cfc858161140d69cc0b5cc211996b557a1c7
+.\" commit 356e4bfff2c5489e016fdb925adbf12a1e3950ee
+Sets the state of the speculation misfeature specified in
+.IR arg2 .
+The speculation-misfeature settings are per-thread attributes.
+.IP
+Currently,
+.I arg2
+must be one of:
+.RS
+.TP
+.B PR_SPEC_STORE_BYPASS
+Set the state of the speculative store bypass misfeature.
+.\" commit 9137bb27e60e554dab694eafa4cca241fa3a694f
+.TP
+.BR PR_SPEC_INDIRECT_BRANCH " (since Linux 4.20)"
+Set the state of the indirect branch speculation misfeature.
+.RE
+.IP
+If
+.I arg2
+does not have one of the above values,
+then the call fails with the error
+.BR ENODEV .
+.IP
+The
+.I arg3
+argument is used to hand in the control value,
+which is one of the following:
+.RS
+.TP
+.B PR_SPEC_ENABLE
+The speculation feature is enabled, mitigation is disabled.
+.TP
+.B PR_SPEC_DISABLE
+The speculation feature is disabled, mitigation is enabled.
+.TP
+.B PR_SPEC_FORCE_DISABLE
+Same as
+.BR PR_SPEC_DISABLE ,
+but cannot be undone.
+A subsequent
+.BR prctl (\c
+.IR arg2 ,
+.BR PR_SPEC_ENABLE )
+with the same value for
+.I arg2
+will fail with the error
+.BR EPERM .
+.\" commit 71368af9027f18fe5d1c6f372cfdff7e4bde8b48
+.TP
+.BR PR_SPEC_DISABLE_NOEXEC " (since Linux 5.1)"
+Same as
+.BR PR_SPEC_DISABLE ,
+but the state will be cleared on
+.BR execve (2).
+Currently only supported for
+.I arg2
+equal to
+.BR PR_SPEC_STORE_BYPASS .
+.RE
+.IP
+Any unsupported value in
+.I arg3
+will result in the call failing with the error
+.BR ERANGE .
+.IP
+The
+.I arg4
+and
+.I arg5
+arguments must be specified as 0; otherwise the call fails with the error
+.BR EINVAL .
+.IP
+The speculation feature can also be controlled by the
+.B spec_store_bypass_disable
+boot parameter.
+This parameter may enforce a read-only policy which will result in the
+.BR prctl ()
+call failing with the error
+.BR ENXIO .
+For further details, see the kernel source file
+.IR Documentation/admin\-guide/kernel\-parameters.txt .
+.\" prctl PR_SVE_SET_VL
+.\" commit 2d2123bc7c7f843aa9db87720de159a049839862
+.\" linux-5.6/Documentation/arm64/sve.rst
+.TP
+.BR PR_SVE_SET_VL " (since Linux 4.15, only on arm64)"
+Configure the thread's SVE vector length,
+as specified by
+.IR "(int) arg2" .
+Arguments
+.IR arg3 ,
+.IR arg4 ,
+and
+.I arg5
+are ignored.
+.IP
+The bits of
+.I arg2
+corresponding to
+.B PR_SVE_VL_LEN_MASK
+must be set to the desired vector length in bytes.
+This is interpreted as an upper bound:
+the kernel will select the greatest available vector length
+that does not exceed the value specified.
+In particular, specifying
+.B SVE_VL_MAX
+(defined in
+.IR <asm/sigcontext.h> )
+for the
+.B PR_SVE_VL_LEN_MASK
+bits requests the maximum supported vector length.
+.IP
+In addition, the other bits of
+.I arg2
+must be set to one of the following combinations of flags:
+.RS
+.TP
+.B 0
+Perform the change immediately.
+At the next
+.BR execve (2)
+in the thread,
+the vector length will be reset to the value configured in
+.IR /proc/sys/abi/sve_default_vector_length .
+.TP
+.B PR_SVE_VL_INHERIT
+Perform the change immediately.
+Subsequent
+.BR execve (2)
+calls will preserve the new vector length.
+.TP
+.B PR_SVE_SET_VL_ONEXEC
+Defer the change, so that it is performed at the next
+.BR execve (2)
+in the thread.
+Further
+.BR execve (2)
+calls will reset the vector length to the value configured in
+.IR /proc/sys/abi/sve_default_vector_length .
+.TP
+.B "PR_SVE_SET_VL_ONEXEC | PR_SVE_VL_INHERIT"
+Defer the change, so that it is performed at the next
+.BR execve (2)
+in the thread.
+Further
+.BR execve (2)
+calls will preserve the new vector length.
+.RE
+.IP
+In all cases,
+any previously pending deferred change is canceled.
+.IP
+The call fails with error
+.B EINVAL
+if SVE is not supported on the platform, if
+.I arg2
+is unrecognized or invalid, or the value in the bits of
+.I arg2
+corresponding to
+.B PR_SVE_VL_LEN_MASK
+is outside the range
+.BR SVE_VL_MIN .. SVE_VL_MAX
+or is not a multiple of 16.
+.IP
+On success,
+a nonnegative value is returned that describes the
+.I selected
+configuration.
+If
+.B PR_SVE_SET_VL_ONEXEC
+was included in
+.IR arg2 ,
+then the configuration described by the return value
+will take effect at the next
+.BR execve (2).
+Otherwise, the configuration is already in effect when the
+.B PR_SVE_SET_VL
+call returns.
+In either case, the value is encoded in the same way as the return value of
+.BR PR_SVE_GET_VL .
+Note that there is no explicit flag in the return value
+corresponding to
+.BR PR_SVE_SET_VL_ONEXEC .
+.IP
+The configuration (including any pending deferred change)
+is inherited across
+.BR fork (2)
+and
+.BR clone (2).
+.IP
+For more information, see the kernel source file
+.I Documentation/arm64/sve.rst
+.\"commit b693d0b372afb39432e1c49ad7b3454855bc6bed
+(or
+.I Documentation/arm64/sve.txt
+before Linux 5.3).
+.IP
+.B Warning:
+Because the compiler or run-time environment
+may be using SVE, using this call without the
+.B PR_SVE_SET_VL_ONEXEC
+flag may crash the calling process.
+The conditions for using it safely are complex and system-dependent.
+Don't use it unless you really know what you are doing.
+.\" prctl PR_SVE_GET_VL
+.TP
+.BR PR_SVE_GET_VL " (since Linux 4.15, only on arm64)"
+Get the thread's current SVE vector length configuration.
+.IP
+Arguments
+.IR arg2 ", " arg3 ", " arg4 ", and " arg5
+are ignored.
+.IP
+Provided that the kernel and platform support SVE,
+this operation always succeeds,
+returning a nonnegative value that describes the
+.I current
+configuration.
+The bits corresponding to
+.B PR_SVE_VL_LEN_MASK
+contain the currently configured vector length in bytes.
+The bit corresponding to
+.B PR_SVE_VL_INHERIT
+indicates whether the vector length will be inherited
+across
+.BR execve (2).
+.IP
+Note that there is no way to determine whether there is
+a pending vector length change that has not yet taken effect.
+.IP
+For more information, see the kernel source file
+.I Documentation/arm64/sve.rst
+.\"commit b693d0b372afb39432e1c49ad7b3454855bc6bed
+(or
+.I Documentation/arm64/sve.txt
+before Linux 5.3).
+.TP
+.\" prctl PR_SET_SYSCALL_USER_DISPATCH
+.\" commit 1446e1df9eb183fdf81c3f0715402f1d7595d4
+.BR PR_SET_SYSCALL_USER_DISPATCH " (since Linux 5.11, x86 only)"
+Configure the Syscall User Dispatch mechanism
+for the calling thread.
+This mechanism allows an application
+to selectively intercept system calls
+so that they can be handled within the application itself.
+Interception takes the form of a thread-directed
+.B SIGSYS
+signal that is delivered to the thread
+when it makes a system call.
+If intercepted,
+the system call is not executed by the kernel.
+.IP
+To enable this mechanism,
+.I arg2
+should be set to
+.BR PR_SYS_DISPATCH_ON .
+Once enabled, further system calls will be selectively intercepted,
+depending on a control variable provided by user space.
+In this case,
+.I arg3
+and
+.I arg4
+respectively identify the
+.I offset
+and
+.I length
+of a single contiguous memory region in the process address space
+from where system calls are always allowed to be executed,
+regardless of the control variable.
+(Typically, this area would include the area of memory
+containing the C library.)
+.IP
+.I arg5
+points to a char-sized variable
+that is a fast switch to allow/block system call execution
+without the overhead of doing another system call
+to re-configure Syscall User Dispatch.
+This control variable can either be set to
+.B SYSCALL_DISPATCH_FILTER_BLOCK
+to block system calls from executing
+or to
+.B SYSCALL_DISPATCH_FILTER_ALLOW
+to temporarily allow them to be executed.
+This value is checked by the kernel
+on every system call entry,
+and any unexpected value will raise
+an uncatchable
+.B SIGSYS
+at that time,
+killing the application.
+.IP
+When a system call is intercepted,
+the kernel sends a thread-directed
+.B SIGSYS
+signal to the triggering thread.
+Various fields will be set in the
+.I siginfo_t
+structure (see
+.BR sigaction (2))
+associated with the signal:
+.RS
+.IP \[bu] 3
+.I si_signo
+will contain
+.BR SIGSYS .
+.IP \[bu]
+.I si_call_addr
+will show the address of the system call instruction.
+.IP \[bu]
+.I si_syscall
+and
+.I si_arch
+will indicate which system call was attempted.
+.IP \[bu]
+.I si_code
+will contain
+.BR SYS_USER_DISPATCH .
+.IP \[bu]
+.I si_errno
+will be set to 0.
+.RE
+.IP
+The program counter will be as though the system call happened
+(i.e., the program counter will not point to the system call instruction).
+.IP
+When the signal handler returns to the kernel,
+the system call completes immediately
+and returns to the calling thread,
+without actually being executed.
+If necessary
+(i.e., when emulating the system call in user space),
+the signal handler should set the system call return value
+to a sane value,
+by modifying the register context stored in the
+.I ucontext
+argument of the signal handler.
+See
+.BR sigaction (2),
+.BR sigreturn (2),
+and
+.BR getcontext (3)
+for more information.
+.IP
+If
+.I arg2
+is set to
+.BR PR_SYS_DISPATCH_OFF ,
+Syscall User Dispatch is disabled for that thread.
+The remaining arguments must be set to 0.
+.IP
+The setting is not preserved across
+.BR fork (2),
+.BR clone (2),
+or
+.BR execve (2).
+.IP
+For more information,
+see the kernel source file
+.IR Documentation/admin\-guide/syscall\-user\-dispatch.rst .
+.\" prctl PR_SET_TAGGED_ADDR_CTRL
+.\" commit 63f0c60379650d82250f22e4cf4137ef3dc4f43d
+.TP
+.BR PR_SET_TAGGED_ADDR_CTRL " (since Linux 5.4, only on arm64)"
+Controls support for passing tagged user-space addresses to the kernel
+(i.e., addresses where bits 56\[en]63 are not all zero).
+.IP
+The level of support is selected by
+.IR "arg2" ,
+which can be one of the following:
+.RS
+.TP
+.B 0
+Addresses that are passed
+for the purpose of being dereferenced by the kernel
+must be untagged.
+.TP
+.B PR_TAGGED_ADDR_ENABLE
+Addresses that are passed
+for the purpose of being dereferenced by the kernel
+may be tagged, with the exceptions summarized below.
+.RE
+.IP
+The remaining arguments
+.IR arg3 ", " arg4 ", and " arg5
+must all be zero.
+.\" Enforcement added in
+.\" commit 3e91ec89f527b9870fe42dcbdb74fd389d123a95
+.IP
+On success, the mode specified in
+.I arg2
+is set for the calling thread and the return value is 0.
+If the arguments are invalid,
+the mode specified in
+.I arg2
+is unrecognized,
+or if this feature is unsupported by the kernel
+or disabled via
+.IR /proc/sys/abi/tagged_addr_disabled ,
+the call fails with the error
+.BR EINVAL .
+.IP
+In particular, if
+.BR prctl ( PR_SET_TAGGED_ADDR_CTRL ,
+0, 0, 0, 0)
+fails with
+.BR EINVAL ,
+then all addresses passed to the kernel must be untagged.
+.IP
+Irrespective of which mode is set,
+addresses passed to certain interfaces
+must always be untagged:
+.RS
+.IP \[bu] 3
+.BR brk (2),
+.BR mmap (2),
+.BR shmat (2),
+.BR shmdt (2),
+and the
+.I new_address
+argument of
+.BR mremap (2).
+.IP
+(Prior to Linux 5.6 these accepted tagged addresses,
+but the behavior may not be what you expect.
+Don't rely on it.)
+.IP \[bu]
+\[oq]polymorphic\[cq] interfaces
+that accept pointers to arbitrary types cast to a
+.I void *
+or other generic type, specifically
+.BR prctl (),
+.BR ioctl (2),
+and in general
+.BR setsockopt (2)
+(only certain specific
+.BR setsockopt (2)
+options allow tagged addresses).
+.RE
+.IP
+This list of exclusions may shrink
+when moving from one kernel version to a later kernel version.
+While the kernel may make some guarantees
+for backwards compatibility reasons,
+for the purposes of new software
+the effect of passing tagged addresses to these interfaces
+is unspecified.
+.IP
+The mode set by this call is inherited across
+.BR fork (2)
+and
+.BR clone (2).
+The mode is reset by
+.BR execve (2)
+to 0
+(i.e., tagged addresses not permitted in the user/kernel ABI).
+.IP
+For more information, see the kernel source file
+.IR Documentation/arm64/tagged\-address\-abi.rst .
+.IP
+.B Warning:
+This call is primarily intended for use by the run-time environment.
+A successful
+.B PR_SET_TAGGED_ADDR_CTRL
+call elsewhere may crash the calling process.
+The conditions for using it safely are complex and system-dependent.
+Don't use it unless you know what you are doing.
+.\" prctl PR_GET_TAGGED_ADDR_CTRL
+.\" commit 63f0c60379650d82250f22e4cf4137ef3dc4f43d
+.TP
+.BR PR_GET_TAGGED_ADDR_CTRL " (since Linux 5.4, only on arm64)"
+Returns the current tagged address mode
+for the calling thread.
+.IP
+Arguments
+.IR arg2 ", " arg3 ", " arg4 ", and " arg5
+must all be zero.
+.IP
+If the arguments are invalid
+or this feature is disabled or unsupported by the kernel,
+the call fails with
+.BR EINVAL .
+In particular, if
+.BR prctl ( PR_GET_TAGGED_ADDR_CTRL ,
+0, 0, 0, 0)
+fails with
+.BR EINVAL ,
+then this feature is definitely either unsupported,
+or disabled via
+.IR /proc/sys/abi/tagged_addr_disabled .
+In this case,
+all addresses passed to the kernel must be untagged.
+.IP
+Otherwise, the call returns a nonnegative value
+describing the current tagged address mode,
+encoded in the same way as the
+.I arg2
+argument of
+.BR PR_SET_TAGGED_ADDR_CTRL .
+.IP
+For more information, see the kernel source file
+.IR Documentation/arm64/tagged\-address\-abi.rst .
+.\"
+.\" prctl PR_TASK_PERF_EVENTS_DISABLE
+.TP
+.BR PR_TASK_PERF_EVENTS_DISABLE " (since Linux 2.6.31)"
+Disable all performance counters attached to the calling process,
+regardless of whether the counters were created by
+this process or another process.
+Performance counters created by the calling process for other
+processes are unaffected.
+For more information on performance counters, see the Linux kernel source file
+.IR tools/perf/design.txt .
+.IP
+Originally called
+.BR PR_TASK_PERF_COUNTERS_DISABLE ;
+.\" commit 1d1c7ddbfab358445a542715551301b7fc363e28
+renamed (retaining the same numerical value)
+in Linux 2.6.32.
+.\"
+.\" prctl PR_TASK_PERF_EVENTS_ENABLE
+.TP
+.BR PR_TASK_PERF_EVENTS_ENABLE " (since Linux 2.6.31)"
+The converse of
+.BR PR_TASK_PERF_EVENTS_DISABLE ;
+enable performance counters attached to the calling process.
+.IP
+Originally called
+.BR PR_TASK_PERF_COUNTERS_ENABLE ;
+.\" commit 1d1c7ddbfab358445a542715551301b7fc363e28
+renamed
+.\" commit cdd6c482c9ff9c55475ee7392ec8f672eddb7be6
+in Linux 2.6.32.
+.\"
+.\" prctl PR_SET_THP_DISABLE
+.TP
+.BR PR_SET_THP_DISABLE " (since Linux 3.15)"
+.\" commit a0715cc22601e8830ace98366c0c2bd8da52af52
+Set the state of the "THP disable" flag for the calling thread.
+If
+.I arg2
+has a nonzero value, the flag is set, otherwise it is cleared.
+Setting this flag provides a method
+for disabling transparent huge pages
+for jobs where the code cannot be modified, and using a malloc hook with
+.BR madvise (2)
+is not an option (i.e., statically allocated data).
+The setting of the "THP disable" flag is inherited by a child created via
+.BR fork (2)
+and is preserved across
+.BR execve (2).
+.\" prctl PR_GET_THP_DISABLE
+.TP
+.BR PR_GET_THP_DISABLE " (since Linux 3.15)"
+Return (as the function result) the current setting of the "THP disable"
+flag for the calling thread:
+either 1, if the flag is set, or 0, if it is not.
+.\" prctl PR_GET_TID_ADDRESS
+.TP
+.BR PR_GET_TID_ADDRESS " (since Linux 3.5)"
+.\" commit 300f786b2683f8bb1ec0afb6e1851183a479c86d
+Return the
+.I clear_child_tid
+address set by
+.BR set_tid_address (2)
+and the
+.BR clone (2)
+.B CLONE_CHILD_CLEARTID
+flag, in the location pointed to by
+.IR "(int\~**)\~arg2" .
+This feature is available only if the kernel is built with the
+.B CONFIG_CHECKPOINT_RESTORE
+option enabled.
+Note that since the
+.BR prctl ()
+system call does not have a compat implementation for
+the AMD64 x32 and MIPS n32 ABIs,
+and the kernel writes out a pointer using the kernel's pointer size,
+this operation expects a user-space buffer of 8 (not 4) bytes on these ABIs.
+.\" prctl PR_SET_TIMERSLACK
+.TP
+.BR PR_SET_TIMERSLACK " (since Linux 2.6.28)"
+.\" See https://lwn.net/Articles/369549/
+.\" commit 6976675d94042fbd446231d1bd8b7de71a980ada
+Each thread has two associated timer slack values:
+a "default" value, and a "current" value.
+This operation sets the "current" timer slack value for the calling thread.
+.I arg2
+is an unsigned long value; the maximum "current" value is ULONG_MAX and
+the minimum "current" value is 1.
+If the nanosecond value supplied in
+.I arg2
+is greater than zero, then the "current" value is set to this value.
+If
+.I arg2
+is equal to zero,
+the "current" timer slack is reset to the
+thread's "default" timer slack value.
+.IP
+The "current" timer slack is used by the kernel to group timer expirations
+for the calling thread that are close to one another;
+as a consequence, timer expirations for the thread may be
+up to the specified number of nanoseconds late (but will never expire early).
+Grouping timer expirations can help reduce system power consumption
+by minimizing CPU wake-ups.
+.IP
+The timer expirations affected by timer slack are those set by
+.BR select (2),
+.BR pselect (2),
+.BR poll (2),
+.BR ppoll (2),
+.BR epoll_wait (2),
+.BR epoll_pwait (2),
+.BR clock_nanosleep (2),
+.BR nanosleep (2),
+and
+.BR futex (2)
+(and thus the library functions implemented via futexes, including
+.\" List obtained by grepping for futex usage in glibc source
+.BR pthread_cond_timedwait (3),
+.BR pthread_mutex_timedlock (3),
+.BR pthread_rwlock_timedrdlock (3),
+.BR pthread_rwlock_timedwrlock (3),
+and
+.BR sem_timedwait (3)).
+.IP
+Timer slack is not applied to threads that are scheduled under
+a real-time scheduling policy (see
+.BR sched_setscheduler (2)).
+.IP
+When a new thread is created,
+the two timer slack values are made the same as the "current" value
+of the creating thread.
+Thereafter, a thread can adjust its "current" timer slack value via
+.BR PR_SET_TIMERSLACK .
+The "default" value can't be changed.
+The timer slack values of
+.I init
+(PID 1), the ancestor of all processes,
+are 50,000 nanoseconds (50 microseconds).
+The timer slack value is inherited by a child created via
+.BR fork (2),
+and is preserved across
+.BR execve (2).
+.IP
+Since Linux 4.6, the "current" timer slack value of any process
+can be examined and changed via the file
+.IR /proc/ pid /timerslack_ns .
+See
+.BR proc (5).
+.\" prctl PR_GET_TIMERSLACK
+.TP
+.BR PR_GET_TIMERSLACK " (since Linux 2.6.28)"
+Return (as the function result)
+the "current" timer slack value of the calling thread.
+.\" prctl PR_SET_TIMING
+.TP
+.BR PR_SET_TIMING " (since Linux 2.6.0)"
+.\" Precisely: Linux 2.6.0-test4
+Set whether to use (normal, traditional) statistical process timing or
+accurate timestamp-based process timing, by passing
+.B PR_TIMING_STATISTICAL
+.\" 0
+or
+.B PR_TIMING_TIMESTAMP
+.\" 1
+to \fIarg2\fP.
+.B PR_TIMING_TIMESTAMP
+is not currently implemented
+(attempting to set this mode will yield the error
+.BR EINVAL ).
+.\" PR_TIMING_TIMESTAMP doesn't do anything in Linux 2.6.26-rc8,
+.\" and looking at the patch history, it appears
+.\" that it never did anything.
+.\" prctl PR_GET_TIMING
+.TP
+.BR PR_GET_TIMING " (since Linux 2.6.0)"
+.\" Precisely: Linux 2.6.0-test4
+Return (as the function result) which process timing method is currently
+in use.
+.\" prctl PR_SET_TSC
+.TP
+.BR PR_SET_TSC " (since Linux 2.6.26, x86 only)"
+Set the state of the flag determining whether the timestamp counter
+can be read by the process.
+Pass
+.B PR_TSC_ENABLE
+to
+.I arg2
+to allow it to be read, or
+.B PR_TSC_SIGSEGV
+to generate a
+.B SIGSEGV
+when the process tries to read the timestamp counter.
+.\" prctl PR_GET_TSC
+.TP
+.BR PR_GET_TSC " (since Linux 2.6.26, x86 only)"
+Return the state of the flag determining whether the timestamp counter
+can be read,
+in the location pointed to by
+.IR "(int\~*) arg2" .
+.\" prctl PR_SET_UNALIGN
+.TP
+.B PR_SET_UNALIGN
+(Only on: ia64, since Linux 2.3.48; parisc, since Linux 2.6.15;
+PowerPC, since Linux 2.6.18; Alpha, since Linux 2.6.22;
+.\" sh: 94ea5e449ae834af058ef005d16a8ad44fcf13d6
+.\" tile: 2f9ac29eec71a696cb0dcc5fb82c0f8d4dac28c9
+sh, since Linux 2.6.34; tile, since Linux 3.12)
+Set unaligned access control bits to \fIarg2\fP.
+Pass
+\fBPR_UNALIGN_NOPRINT\fP to silently fix up unaligned user accesses,
+or \fBPR_UNALIGN_SIGBUS\fP to generate
+.B SIGBUS
+on unaligned user access.
+Alpha also supports an additional flag with the value
+of 4 and no corresponding named constant,
+which instructs the kernel not to fix up
+unaligned accesses (it is analogous to providing the
+.B UAC_NOFIX
+flag in
+.B SSI_NVPAIRS
+operation of the
+.BR setsysinfo ()
+system call on Tru64).
+.\" prctl PR_GET_UNALIGN
+.TP
+.B PR_GET_UNALIGN
+(See
+.B PR_SET_UNALIGN
+for information on versions and architectures.)
+Return unaligned access control bits, in the location pointed to by
+.IR "(unsigned int\~*) arg2" .
+.\" prctl PR_GET_AUXV
+.TP
+.BR PR_GET_AUXV " (since Linux 6.4)"
+Get the auxiliary vector (auxv) into the buffer pointed to by
+.IR "(void\~*) arg2" ,
+whose length is given by \fIarg3\fP.
+If the buffer is not long enough for the full auxiliary vector,
+the copy will be truncated.
+Return (as the function result)
+the full length of the auxiliary vector.
+\fIarg4\fP and \fIarg5\fP must be 0.
+.SH RETURN VALUE
+On success,
+.BR PR_CAP_AMBIENT + PR_CAP_AMBIENT_IS_SET ,
+.BR PR_CAPBSET_READ ,
+.BR PR_GET_DUMPABLE ,
+.BR PR_GET_FP_MODE ,
+.BR PR_GET_IO_FLUSHER ,
+.BR PR_GET_KEEPCAPS ,
+.BR PR_MCE_KILL_GET ,
+.BR PR_GET_NO_NEW_PRIVS ,
+.BR PR_GET_SECUREBITS ,
+.BR PR_GET_SPECULATION_CTRL ,
+.BR PR_SVE_GET_VL ,
+.BR PR_SVE_SET_VL ,
+.BR PR_GET_TAGGED_ADDR_CTRL ,
+.BR PR_GET_THP_DISABLE ,
+.BR PR_GET_TIMING ,
+.BR PR_GET_TIMERSLACK ,
+.BR PR_GET_AUXV ,
+and (if it returns)
+.B PR_GET_SECCOMP
+return the nonnegative values described above.
+All other
+.I option
+values return 0 on success.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EACCES
+.I option
+is
+.B PR_SET_SECCOMP
+and
+.I arg2
+is
+.BR SECCOMP_MODE_FILTER ,
+but the process does not have the
+.B CAP_SYS_ADMIN
+capability or has not set the
+.I no_new_privs
+attribute (see the discussion of
+.B PR_SET_NO_NEW_PRIVS
+above).
+.TP
+.B EACCES
+.I option
+is
+.BR PR_SET_MM ,
+.I arg2
+is
+.BR PR_SET_MM_EXE_FILE ,
+and the file is not executable.
+.TP
+.B EBADF
+.I option
+is
+.BR PR_SET_MM ,
+.I arg2
+is
+.BR PR_SET_MM_EXE_FILE ,
+and the file descriptor passed in
+.I arg3
+is not valid.
+.TP
+.B EBUSY
+.I option
+is
+.BR PR_SET_MM ,
+.I arg2
+is
+.BR PR_SET_MM_EXE_FILE ,
+and this is the second attempt to change the
+.IR /proc/ pid /exe
+symbolic link, which is prohibited.
+.TP
+.B EFAULT
+.I arg2
+is an invalid address.
+.TP
+.B EFAULT
+.I option
+is
+.BR PR_SET_SECCOMP ,
+.I arg2
+is
+.BR SECCOMP_MODE_FILTER ,
+the system was built with
+.BR CONFIG_SECCOMP_FILTER ,
+and
+.I arg3
+is an invalid address.
+.TP
+.B EFAULT
+.I option
+is
+.B PR_SET_SYSCALL_USER_DISPATCH
+and
+.I arg5
+has an invalid address.
+.TP
+.B EINVAL
+The value of
+.I option
+is not recognized,
+or not supported on this system.
+.TP
+.B EINVAL
+.I option
+is
+.B PR_MCE_KILL
+or
+.B PR_MCE_KILL_GET
+or
+.BR PR_SET_MM ,
+and unused
+.BR prctl ()
+arguments were not specified as zero.
+.TP
+.B EINVAL
+.I arg2
+is not a valid value for this
+.IR option .
+.TP
+.B EINVAL
+.I option
+is
+.B PR_SET_SECCOMP
+or
+.BR PR_GET_SECCOMP ,
+and the kernel was not configured with
+.BR CONFIG_SECCOMP .
+.TP
+.B EINVAL
+.I option
+is
+.BR PR_SET_SECCOMP ,
+.I arg2
+is
+.BR SECCOMP_MODE_FILTER ,
+and the kernel was not configured with
+.BR CONFIG_SECCOMP_FILTER .
+.TP
+.B EINVAL
+.I option
+is
+.BR PR_SET_MM ,
+and one of the following is true:
+.RS
+.IP \[bu] 3
+.I arg4
+or
+.I arg5
+is nonzero;
+.IP \[bu]
+.I arg3
+is greater than
+.B TASK_SIZE
+(the limit on the size of the user address space for this architecture);
+.IP \[bu]
+.I arg2
+is
+.BR PR_SET_MM_START_CODE ,
+.BR PR_SET_MM_END_CODE ,
+.BR PR_SET_MM_START_DATA ,
+.BR PR_SET_MM_END_DATA ,
+or
+.BR PR_SET_MM_START_STACK ,
+and the permissions of the corresponding memory area are not as required;
+.IP \[bu]
+.I arg2
+is
+.B PR_SET_MM_START_BRK
+or
+.BR PR_SET_MM_BRK ,
+and
+.I arg3
+is less than or equal to the end of the data segment
+or specifies a value that would cause the
+.B RLIMIT_DATA
+resource limit to be exceeded.
+.RE
+.TP
+.B EINVAL
+.I option
+is
+.B PR_SET_PTRACER
+and
+.I arg2
+is not 0,
+.BR PR_SET_PTRACER_ANY ,
+or the PID of an existing process.
+.TP
+.B EINVAL
+.I option
+is
+.B PR_SET_PDEATHSIG
+and
+.I arg2
+is not a valid signal number.
+.TP
+.B EINVAL
+.I option
+is
+.B PR_SET_DUMPABLE
+and
+.I arg2
+is neither
+.B SUID_DUMP_DISABLE
+nor
+.BR SUID_DUMP_USER .
+.TP
+.B EINVAL
+.I option
+is
+.B PR_SET_TIMING
+and
+.I arg2
+is not
+.BR PR_TIMING_STATISTICAL .
+.TP
+.B EINVAL
+.I option
+is
+.B PR_SET_NO_NEW_PRIVS
+and
+.I arg2
+is not equal to 1
+or
+.IR arg3 ,
+.IR arg4 ,
+or
+.I arg5
+is nonzero.
+.TP
+.B EINVAL
+.I option
+is
+.B PR_GET_NO_NEW_PRIVS
+and
+.IR arg2 ,
+.IR arg3 ,
+.IR arg4 ,
+or
+.I arg5
+is nonzero.
+.TP
+.B EINVAL
+.I option
+is
+.B PR_SET_THP_DISABLE
+and
+.IR arg3 ,
+.IR arg4 ,
+or
+.I arg5
+is nonzero.
+.TP
+.B EINVAL
+.I option
+is
+.B PR_GET_THP_DISABLE
+and
+.IR arg2 ,
+.IR arg3 ,
+.IR arg4 ,
+or
+.I arg5
+is nonzero.
+.TP
+.B EINVAL
+.I option
+is
+.B PR_CAP_AMBIENT
+and an unused argument
+.RI ( arg4 ,
+.IR arg5 ,
+or,
+in the case of
+.BR PR_CAP_AMBIENT_CLEAR_ALL ,
+.IR arg3 )
+is nonzero; or
+.I arg2
+has an invalid value;
+or
+.I arg2
+is
+.BR PR_CAP_AMBIENT_LOWER ,
+.BR PR_CAP_AMBIENT_RAISE ,
+or
+.B PR_CAP_AMBIENT_IS_SET
+and
+.I arg3
+does not specify a valid capability.
+.TP
+.B EINVAL
+.I option
+was
+.B PR_GET_SPECULATION_CTRL
+or
+.B PR_SET_SPECULATION_CTRL
+and unused arguments to
+.BR prctl ()
+are not 0.
+.TP
+.B EINVAL
+.I option
+is
+.B PR_PAC_RESET_KEYS
+and the arguments are invalid or unsupported.
+See the description of
+.B PR_PAC_RESET_KEYS
+above for details.
+.TP
+.B EINVAL
+.I option
+is
+.B PR_SVE_SET_VL
+and the arguments are invalid or unsupported,
+or SVE is not available on this platform.
+See the description of
+.B PR_SVE_SET_VL
+above for details.
+.TP
+.B EINVAL
+.I option
+is
+.B PR_SVE_GET_VL
+and SVE is not available on this platform.
+.TP
+.B EINVAL
+.I option
+is
+.B PR_SET_SYSCALL_USER_DISPATCH
+and one of the following is true:
+.RS
+.IP \[bu] 3
+.I arg2
+is
+.B PR_SYS_DISPATCH_OFF
+and the remaining arguments are not 0;
+.IP \[bu]
+.I arg2
+is
+.B PR_SYS_DISPATCH_ON
+and the memory range specified is outside the
+address space of the process;
+.IP \[bu]
+.I arg2
+is invalid.
+.RE
+.TP
+.B EINVAL
+.I option
+is
+.B PR_SET_TAGGED_ADDR_CTRL
+and the arguments are invalid or unsupported.
+See the description of
+.B PR_SET_TAGGED_ADDR_CTRL
+above for details.
+.TP
+.B EINVAL
+.I option
+is
+.B PR_GET_TAGGED_ADDR_CTRL
+and the arguments are invalid or unsupported.
+See the description of
+.B PR_GET_TAGGED_ADDR_CTRL
+above for details.
+.TP
+.B ENODEV
+.I option
+was
+.B PR_SET_SPECULATION_CTRL
+and the kernel or CPU does not support the requested speculation misfeature.
+.TP
+.B ENXIO
+.I option
+was
+.B PR_MPX_ENABLE_MANAGEMENT
+or
+.B PR_MPX_DISABLE_MANAGEMENT
+and the kernel or the CPU does not support MPX management.
+Check that the kernel and processor have MPX support.
+.TP
+.B ENXIO
+.I option
+was
+.B PR_SET_SPECULATION_CTRL
+and control of the selected speculation misfeature is not possible.
+See
+.B PR_GET_SPECULATION_CTRL
+for the bit fields to determine which option is available.
+.TP
+.B EOPNOTSUPP
+.I option
+is
+.B PR_SET_FP_MODE
+and
+.I arg2
+has an invalid or unsupported value.
+.TP
+.B EPERM
+.I option
+is
+.BR PR_SET_SECUREBITS ,
+and the caller does not have the
+.B CAP_SETPCAP
+capability,
+or tried to unset a "locked" flag,
+or tried to set a flag whose corresponding locked flag was set
+(see
+.BR capabilities (7)).
+.TP
+.B EPERM
+.I option
+is
+.B PR_SET_SPECULATION_CTRL
+but the speculation misfeature was disabled with
+.B PR_SPEC_FORCE_DISABLE
+and the caller tried to enable it again.
+.TP
+.B EPERM
+.I option
+is
+.BR PR_SET_KEEPCAPS ,
+and the caller's
+.B SECBIT_KEEP_CAPS_LOCKED
+flag is set
+(see
+.BR capabilities (7)).
+.TP
+.B EPERM
+.I option
+is
+.BR PR_CAPBSET_DROP ,
+and the caller does not have the
+.B CAP_SETPCAP
+capability.
+.TP
+.B EPERM
+.I option
+is
+.BR PR_SET_MM ,
+and the caller does not have the
+.B CAP_SYS_RESOURCE
+capability.
+.TP
+.B EPERM
+.I option
+is
+.B PR_CAP_AMBIENT
+and
+.I arg2
+is
+.BR PR_CAP_AMBIENT_RAISE ,
+but either the capability specified in
+.I arg3
+is not present in the process's permitted and inheritable capability sets,
+or the
+.B SECBIT_NO_CAP_AMBIENT_RAISE
+securebit has been set.
+.TP
+.B ERANGE
+.I option
+was
+.B PR_SET_SPECULATION_CTRL
+and
+.I arg3
+is not
+.BR PR_SPEC_ENABLE ,
+.BR PR_SPEC_DISABLE ,
+.BR PR_SPEC_FORCE_DISABLE ,
+nor
+.BR PR_SPEC_DISABLE_NOEXEC .
+.SH VERSIONS
+IRIX has a
+.BR prctl ()
+system call (also introduced in Linux 2.1.44
+as irix_prctl on the MIPS architecture),
+with prototype
+.PP
+.in +4n
+.EX
+.BI "ptrdiff_t prctl(int " option ", int " arg2 ", int " arg3 );
+.EE
+.in
+.PP
+and options to get the maximum number of processes per user,
+get the maximum number of processors the calling process can use,
+find out whether a specified process is currently blocked,
+get or set the maximum stack size, and so on.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 2.1.57,
+glibc 2.0.6.
+.SH SEE ALSO
+.BR signal (2),
+.BR core (5)
diff --git a/man2/pread.2 b/man2/pread.2
new file mode 100644
index 0000000..9764e53
--- /dev/null
+++ b/man2/pread.2
@@ -0,0 +1,146 @@
+.\" Copyright (C) 1999 Joseph Samuel Myers.
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.TH pread 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+pread, pwrite \- read from or write to a file descriptor at a given offset
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <unistd.h>
+.PP
+.BI "ssize_t pread(int " fd ", void " buf [. count "], size_t " count ,
+.BI " off_t " offset );
+.BI "ssize_t pwrite(int " fd ", const void " buf [. count "], size_t " count ,
+.BI " off_t " offset );
+.fi
+.PP
+.RS -4
+Feature Test Macro Requirements for glibc (see
+.BR feature_test_macros (7)):
+.RE
+.PP
+.BR pread (),
+.BR pwrite ():
+.nf
+ _XOPEN_SOURCE >= 500
+ || /* Since glibc 2.12: */ _POSIX_C_SOURCE >= 200809L
+.fi
+.SH DESCRIPTION
+.BR pread ()
+reads up to
+.I count
+bytes from file descriptor
+.I fd
+at offset
+.I offset
+(from the start of the file) into the buffer starting at
+.IR buf .
+The file offset is not changed.
+.PP
+.BR pwrite ()
+writes up to
+.I count
+bytes from the buffer starting at
+.I buf
+to the file descriptor
+.I fd
+at offset
+.IR offset .
+The file offset is not changed.
+.PP
+The file referenced by
+.I fd
+must be capable of seeking.
+.SH RETURN VALUE
+On success,
+.BR pread ()
+returns the number of bytes read
+(a return of zero indicates end of file)
+and
+.BR pwrite ()
+returns the number of bytes written.
+.PP
+Note that it is not an error for a successful call to transfer fewer bytes
+than requested (see
+.BR read (2)
+and
+.BR write (2)).
+.PP
+On error, \-1 is returned and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.BR pread ()
+can fail and set
+.I errno
+to any error specified for
+.BR read (2)
+or
+.BR lseek (2).
+.BR pwrite ()
+can fail and set
+.I errno
+to any error specified for
+.BR write (2)
+or
+.BR lseek (2).
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+POSIX.1-2001.
+.PP
+Added in Linux 2.1.60;
+the entries in the i386 system call table were added in Linux 2.1.69.
+C library support (including emulation using
+.BR lseek (2)
+on older kernels without the system calls) was added in glibc 2.1.
+.SS C library/kernel differences
+On Linux, the underlying system calls were renamed
+in Linux 2.6:
+.BR pread ()
+became
+.BR pread64 (),
+and
+.BR pwrite ()
+became
+.BR pwrite64 ().
+The system call numbers remained the same.
+The glibc
+.BR pread ()
+and
+.BR pwrite ()
+wrapper functions transparently deal with the change.
+.PP
+On some 32-bit architectures,
+the calling signature for these system calls differs,
+for the reasons described in
+.BR syscall (2).
+.SH NOTES
+The
+.BR pread ()
+and
+.BR pwrite ()
+system calls are especially useful in multithreaded applications.
+They allow multiple threads to perform I/O on the same file descriptor
+without being affected by changes to the file offset by other threads.
+.SH BUGS
+POSIX requires that opening a file with the
+.B O_APPEND
+flag should have no effect on the location at which
+.BR pwrite ()
+writes data.
+However, on Linux, if a file is opened with
+.\" FIXME . https://bugzilla.kernel.org/show_bug.cgi?id=43178
+.BR O_APPEND ,
+.BR pwrite ()
+appends data to the end of the file, regardless of the value of
+.IR offset .
+.SH SEE ALSO
+.BR lseek (2),
+.BR read (2),
+.BR readv (2),
+.BR write (2)
diff --git a/man2/pread64.2 b/man2/pread64.2
new file mode 100644
index 0000000..87eacb2
--- /dev/null
+++ b/man2/pread64.2
@@ -0,0 +1 @@
+.so man2/pread.2
diff --git a/man2/preadv.2 b/man2/preadv.2
new file mode 100644
index 0000000..54e3384
--- /dev/null
+++ b/man2/preadv.2
@@ -0,0 +1 @@
+.so man2/readv.2
diff --git a/man2/preadv2.2 b/man2/preadv2.2
new file mode 100644
index 0000000..54e3384
--- /dev/null
+++ b/man2/preadv2.2
@@ -0,0 +1 @@
+.so man2/readv.2
diff --git a/man2/prlimit.2 b/man2/prlimit.2
new file mode 100644
index 0000000..df6d736
--- /dev/null
+++ b/man2/prlimit.2
@@ -0,0 +1 @@
+.so man2/getrlimit.2
diff --git a/man2/prlimit64.2 b/man2/prlimit64.2
new file mode 100644
index 0000000..df6d736
--- /dev/null
+++ b/man2/prlimit64.2
@@ -0,0 +1 @@
+.so man2/getrlimit.2
diff --git a/man2/process_madvise.2 b/man2/process_madvise.2
new file mode 100644
index 0000000..b95f4e3
--- /dev/null
+++ b/man2/process_madvise.2
@@ -0,0 +1,209 @@
+.\" Copyright (C) 2021 Suren Baghdasaryan <surenb@google.com>
+.\" and Copyright (C) 2021 Minchan Kim <minchan@kernel.org>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Commit ecb8ac8b1f146915aa6b96449b66dd48984caacc
+.\"
+.TH process_madvise 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+process_madvise \- give advice about use of memory to a process
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#include <sys/mman.h>" " /* Definition of " MADV_* " constants */"
+.BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */"
+.BR "#include <sys/uio.h>" " /* Definition of " "struct iovec" " type */"
+.B #include <unistd.h>
+.PP
+.BI "ssize_t syscall(SYS_process_madvise, int " pidfd ,
+.BI " const struct iovec *" iovec ", size_t " vlen \
+", int " advice ,
+.BI " unsigned int " flags ");"
+.fi
+.PP
+.IR Note :
+glibc provides no wrapper for
+.BR process_madvise (),
+necessitating the use of
+.BR syscall (2).
+.\" FIXME: See <https://sourceware.org/bugzilla/show_bug.cgi?id=27380>
+.SH DESCRIPTION
+The
+.BR process_madvise ()
+system call is used to give advice or directions to the kernel about the
+address ranges of another process or of the calling process.
+It provides the advice for the address ranges described by
+.I iovec
+and
+.IR vlen .
+The goal of such advice is to improve system or application performance.
+.PP
+The
+.I pidfd
+argument is a PID file descriptor (see
+.BR pidfd_open (2))
+that specifies the process to which the advice is to be applied.
+.PP
+The pointer
+.I iovec
+points to an array of
+.I iovec
+structures, described in
+.BR iovec (3type).
+.PP
+.I vlen
+specifies the number of elements in the array of
+.I iovec
+structures.
+This value must be less than or equal to
+.B IOV_MAX
+(defined in
+.I <limits.h>
+or accessible via the call
+.IR sysconf(_SC_IOV_MAX) ).
+.PP
+The
+.I advice
+argument is one of the following values:
+.TP
+.B MADV_COLD
+See
+.BR madvise (2).
+.TP
+.B MADV_COLLAPSE
+See
+.BR madvise (2).
+.TP
+.B MADV_PAGEOUT
+See
+.BR madvise (2).
+.TP
+.B MADV_WILLNEED
+See
+.BR madvise (2).
+.PP
+The
+.I flags
+argument is reserved for future use; currently, this argument must be
+specified as 0.
+.PP
+The
+.I vlen
+and
+.I iovec
+arguments are checked before applying any advice.
+If
+.I vlen
+is too big, or
+.I iovec
+is invalid,
+then an error will be returned immediately and no advice will be applied.
+.PP
+The advice might be applied to only a part of
+.I iovec
+if one of its elements points to an invalid memory region in the
+remote process.
+No further elements will be processed beyond that point.
+(See the discussion regarding partial advice in RETURN VALUE.)
+.PP
+.\" commit 96cfe2c0fd23ea7c2368d14f769d287e7ae1082e
+Starting in Linux 5.12,
+permission to apply advice to another process is governed by
+a ptrace access mode
+.B PTRACE_MODE_READ_FSCREDS
+check (see
+.BR ptrace (2));
+in addition,
+because of the performance implications of applying the advice,
+the caller must have the
+.B CAP_SYS_NICE
+capability
+(see
+.BR capabilities (7)).
+.SH RETURN VALUE
+On success,
+.BR process_madvise ()
+returns the number of bytes advised.
+This return value may be less than the total number of requested bytes,
+if an error occurred after some
+.I iovec
+elements were already processed.
+The caller should check the return value to determine whether a partial
+advice occurred.
+.PP
+On error, \-1 is returned and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EBADF
+.I pidfd
+is not a valid PID file descriptor.
+.TP
+.B EFAULT
+The memory described by
+.I iovec
+is outside the accessible address space of the process referred to by
+.IR pidfd .
+.TP
+.B EINVAL
+.I flags
+is not 0.
+.TP
+.B EINVAL
+The sum of the
+.I iov_len
+values of
+.I iovec
+overflows a
+.I ssize_t
+value.
+.TP
+.B EINVAL
+.I vlen
+is too large.
+.TP
+.B ENOMEM
+Could not allocate memory for internal copies of the
+.I iovec
+structures.
+.TP
+.B EPERM
+The caller does not have permission to access the address space of the process
+.IR pidfd .
+.TP
+.B ESRCH
+The target process does not exist (i.e., it has terminated and been waited on).
+.PP
+See
+.BR madvise (2)
+for
+.IR advice -specific
+errors.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 5.10.
+.\" commit ecb8ac8b1f146915aa6b96449b66dd48984caacc
+.PP
+Support for this system call is optional,
+depending on the setting of the
+.B CONFIG_ADVISE_SYSCALLS
+configuration option.
+.PP
+When this system call first appeared in Linux 5.10,
+permission to apply advice to another process was entirely governed by
+a ptrace access mode
+.B PTRACE_MODE_ATTACH_FSCREDS
+check (see
+.BR ptrace (2)).
+This requirement was relaxed in Linux 5.12 so that the caller didn't require
+full control over the target process.
+.SH SEE ALSO
+.BR madvise (2),
+.BR pidfd_open (2),
+.BR process_vm_readv (2),
+.BR process_vm_writev (2)
diff --git a/man2/process_vm_readv.2 b/man2/process_vm_readv.2
new file mode 100644
index 0000000..86da35c
--- /dev/null
+++ b/man2/process_vm_readv.2
@@ -0,0 +1,314 @@
+.\" Copyright (C) 2011 Christopher Yeoh <cyeoh@au1.ibm.com>
+.\" and Copyright (C) 2012 Mike Frysinger <vapier@gentoo.org>
+.\" and Copyright (C) 2012 Michael Kerrisk <mtk.man-pages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Commit fcf634098c00dd9cd247447368495f0b79be12d1
+.\"
+.TH process_vm_readv 2 2023-05-03 "Linux man-pages 6.05.01"
+.SH NAME
+process_vm_readv, process_vm_writev \-
+transfer data between process address spaces
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/uio.h>
+.PP
+.BI "ssize_t process_vm_readv(pid_t " pid ,
+.BI " const struct iovec *" local_iov ,
+.BI " unsigned long " liovcnt ,
+.BI " const struct iovec *" remote_iov ,
+.BI " unsigned long " riovcnt ,
+.BI " unsigned long " flags ");"
+.BI "ssize_t process_vm_writev(pid_t " pid ,
+.BI " const struct iovec *" local_iov ,
+.BI " unsigned long " liovcnt ,
+.BI " const struct iovec *" remote_iov ,
+.BI " unsigned long " riovcnt ,
+.BI " unsigned long " flags ");"
+.fi
+.PP
+.RS -4
+Feature Test Macro Requirements for glibc (see
+.BR feature_test_macros (7)):
+.RE
+.PP
+.BR process_vm_readv (),
+.BR process_vm_writev ():
+.nf
+ _GNU_SOURCE
+.fi
+.SH DESCRIPTION
+These system calls transfer data between the address space
+of the calling process ("the local process") and the process identified by
+.I pid
+("the remote process").
+The data moves directly between the address spaces of the two processes,
+without passing through kernel space.
+.PP
+The
+.BR process_vm_readv ()
+system call transfers data from the remote process to the local process.
+The data to be transferred is identified by
+.I remote_iov
+and
+.IR riovcnt :
+.I remote_iov
+is a pointer to an array describing address ranges in the process
+.IR pid ,
+and
+.I riovcnt
+specifies the number of elements in
+.IR remote_iov .
+The data is transferred to the locations specified by
+.I local_iov
+and
+.IR liovcnt :
+.I local_iov
+is a pointer to an array describing address ranges in the calling process,
+and
+.I liovcnt
+specifies the number of elements in
+.IR local_iov .
+.PP
+The
+.BR process_vm_writev ()
+system call is the converse of
+.BR process_vm_readv ()\[em]it
+transfers data from the local process to the remote process.
+Other than the direction of the transfer, the arguments
+.IR liovcnt ,
+.IR local_iov ,
+.IR riovcnt ,
+and
+.I remote_iov
+have the same meaning as for
+.BR process_vm_readv ().
+.PP
+The
+.I local_iov
+and
+.I remote_iov
+arguments point to an array of
+.I iovec
+structures, described in
+.BR iovec (3type).
+.PP
+Buffers are processed in array order.
+This means that
+.BR process_vm_readv ()
+completely fills
+.I local_iov[0]
+before proceeding to
+.IR local_iov[1] ,
+and so on.
+Likewise,
+.I remote_iov[0]
+is completely read before proceeding to
+.IR remote_iov[1] ,
+and so on.
+.PP
+Similarly,
+.BR process_vm_writev ()
+writes out the entire contents of
+.I local_iov[0]
+before proceeding to
+.IR local_iov[1] ,
+and it completely fills
+.I remote_iov[0]
+before proceeding to
+.IR remote_iov[1] .
+.PP
+The lengths of
+.I remote_iov[i].iov_len
+and
+.I local_iov[i].iov_len
+do not have to be the same.
+Thus, it is possible to split a single local buffer
+into multiple remote buffers, or vice versa.
+.PP
+The
+.I flags
+argument is currently unused and must be set to 0.
+.PP
+The values specified in the
+.I liovcnt
+and
+.I riovcnt
+arguments must be less than or equal to
+.B IOV_MAX
+(defined in
+.I <limits.h>
+or accessible via the call
+.IR sysconf(_SC_IOV_MAX) ).
+.\" In time, glibc might provide a wrapper that works around this limit,
+.\" as is done for readv()/writev()
+.PP
+The count arguments and
+.I local_iov
+are checked before doing any transfers.
+If the counts are too big, or
+.I local_iov
+is invalid,
+or the addresses refer to regions that are inaccessible to the local process,
+none of the vectors will be processed
+and an error will be returned immediately.
+.PP
+Note, however, that these system calls do not check the memory regions
+in the remote process until just before doing the read/write.
+Consequently, a partial read/write (see RETURN VALUE)
+may result if one of the
+.I remote_iov
+elements points to an invalid memory region in the remote process.
+No further reads/writes will be attempted beyond that point.
+Keep this in mind when attempting to read data of unknown length
+(such as C strings that are null-terminated) from a remote process,
+by avoiding spanning memory pages (typically 4\ KiB) in a single remote
+.I iovec
+element.
+(Instead, split the remote read into two
+.I remote_iov
+elements and have them merge back into a single write
+.I local_iov
+entry.
+The first read entry goes up to the page boundary,
+while the second starts on the next page boundary.)
+.PP
+Permission to read from or write to another process
+is governed by a ptrace access mode
+.B PTRACE_MODE_ATTACH_REALCREDS
+check; see
+.BR ptrace (2).
+.SH RETURN VALUE
+On success,
+.BR process_vm_readv ()
+returns the number of bytes read and
+.BR process_vm_writev ()
+returns the number of bytes written.
+This return value may be less than the total number of requested bytes,
+if a partial read/write occurred.
+(Partial transfers apply at the granularity of
+.I iovec
+elements.
+These system calls won't perform a partial transfer that splits a single
+.I iovec
+element.)
+The caller should check the return value to determine whether
+a partial read/write occurred.
+.PP
+On error, \-1 is returned and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EFAULT
+The memory described by
+.I local_iov
+is outside the caller's accessible address space.
+.TP
+.B EFAULT
+The memory described by
+.I remote_iov
+is outside the accessible address space of the process
+.IR pid .
+.TP
+.B EINVAL
+The sum of the
+.I iov_len
+values of either
+.I local_iov
+or
+.I remote_iov
+overflows a
+.I ssize_t
+value.
+.TP
+.B EINVAL
+.I flags
+is not 0.
+.TP
+.B EINVAL
+.I liovcnt
+or
+.I riovcnt
+is too large.
+.TP
+.B ENOMEM
+Could not allocate memory for internal copies of the
+.I iovec
+structures.
+.TP
+.B EPERM
+The caller does not have permission to access the address space of the process
+.IR pid .
+.TP
+.B ESRCH
+No process with ID
+.I pid
+exists.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 3.2,
+glibc 2.15.
+.SH NOTES
+The data transfers performed by
+.BR process_vm_readv ()
+and
+.BR process_vm_writev ()
+are not guaranteed to be atomic in any way.
+.PP
+These system calls were designed to permit fast message passing
+by allowing messages to be exchanged with a single copy operation
+(rather than the double copy that would be required
+when using, for example, shared memory or pipes).
+.\" Original user is MPI, http://www.mcs.anl.gov/research/projects/mpi/
+.\" See also some benchmarks at http://lwn.net/Articles/405284/
+.\" and http://marc.info/?l=linux-mm&m=130105930902915&w=2
+.SH EXAMPLES
+The following code sample demonstrates the use of
+.BR process_vm_readv ().
+It reads 20 bytes at the address 0x10000 from the process with PID 10
+and writes the first 10 bytes into
+.I buf1
+and the second 10 bytes into
+.IR buf2 .
+.PP
+.\" SRC BEGIN (process_vm_readv.c)
+.EX
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/uio.h>
+\&
+int
+main(void)
+{
+ char buf1[10];
+ char buf2[10];
+ pid_t pid = 10; /* PID of remote process */
+ ssize_t nread;
+ struct iovec local[2];
+ struct iovec remote[1];
+\&
+ local[0].iov_base = buf1;
+ local[0].iov_len = 10;
+ local[1].iov_base = buf2;
+ local[1].iov_len = 10;
+ remote[0].iov_base = (void *) 0x10000;
+ remote[0].iov_len = 20;
+\&
+ nread = process_vm_readv(pid, local, 2, remote, 1, 0);
+ if (nread != 20)
+ exit(EXIT_FAILURE);
+\&
+ exit(EXIT_SUCCESS);
+}
+.EE
+.\" SRC END
+.SH SEE ALSO
+.BR readv (2),
+.BR writev (2)
diff --git a/man2/process_vm_writev.2 b/man2/process_vm_writev.2
new file mode 100644
index 0000000..7b198a9
--- /dev/null
+++ b/man2/process_vm_writev.2
@@ -0,0 +1 @@
+.so man2/process_vm_readv.2
diff --git a/man2/prof.2 b/man2/prof.2
new file mode 100644
index 0000000..5d25ea6
--- /dev/null
+++ b/man2/prof.2
@@ -0,0 +1 @@
+.so man2/unimplemented.2
diff --git a/man2/pselect.2 b/man2/pselect.2
new file mode 100644
index 0000000..e177843
--- /dev/null
+++ b/man2/pselect.2
@@ -0,0 +1 @@
+.so man2/select.2
diff --git a/man2/pselect6.2 b/man2/pselect6.2
new file mode 100644
index 0000000..e177843
--- /dev/null
+++ b/man2/pselect6.2
@@ -0,0 +1 @@
+.so man2/select.2
diff --git a/man2/ptrace.2 b/man2/ptrace.2
new file mode 100644
index 0000000..4149a32
--- /dev/null
+++ b/man2/ptrace.2
@@ -0,0 +1,2974 @@
+.\" Copyright (c) 1993 Michael Haardt <michael@moria.de>
+.\" Fri Apr 2 11:32:09 MET DST 1993
+.\"
+.\" and changes Copyright (C) 1999 Mike Coleman (mkc@acm.org)
+.\" -- major revision to fully document ptrace semantics per recent Linux
+.\" kernel (2.2.10) and glibc (2.1.2)
+.\" Sun Nov 7 03:18:35 CST 1999
+.\"
+.\" and Copyright (c) 2011, Denys Vlasenko <vda.linux@googlemail.com>
+.\" and Copyright (c) 2015, 2016, Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: GPL-2.0-or-later
+.\"
+.\" Modified Fri Jul 23 23:47:18 1993 by Rik Faith <faith@cs.unc.edu>
+.\" Modified Fri Jan 31 16:46:30 1997 by Eric S. Raymond <esr@thyrsus.com>
+.\" Modified Thu Oct 7 17:28:49 1999 by Andries Brouwer <aeb@cwi.nl>
+.\" Modified, 27 May 2004, Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Added notes on capability requirements
+.\"
+.\" 2006-03-24, Chuck Ebbert <76306.1226@compuserve.com>
+.\" Added PTRACE_SETOPTIONS, PTRACE_GETEVENTMSG, PTRACE_GETSIGINFO,
+.\" PTRACE_SETSIGINFO, PTRACE_SYSEMU, PTRACE_SYSEMU_SINGLESTEP
+.\" (Thanks to Blaisorblade, Daniel Jacobowitz and others who helped.)
+.\" 2011-09, major update by Denys Vlasenko <vda.linux@googlemail.com>
+.\" 2015-01, Kees Cook <keescook@chromium.org>
+.\" Added PTRACE_O_TRACESECCOMP, PTRACE_EVENT_SECCOMP
+.\"
+.\" FIXME The following are undocumented:
+.\"
+.\" PTRACE_GETWMMXREGS
+.\" PTRACE_SETWMMXREGS
+.\" ARM
+.\" Linux 2.6.12
+.\"
+.\" PTRACE_SET_SYSCALL
+.\" ARM and ARM64
+.\" Linux 2.6.16
+.\" commit 3f471126ee53feb5e9b210ea2f525ed3bb9b7a7f
+.\" Author: Nicolas Pitre <nico@cam.org>
+.\" Date: Sat Jan 14 19:30:04 2006 +0000
+.\"
+.\" PTRACE_GETCRUNCHREGS
+.\" PTRACE_SETCRUNCHREGS
+.\" ARM
+.\" Linux 2.6.18
+.\" commit 3bec6ded282b331552587267d67a06ed7fd95ddd
+.\" Author: Lennert Buytenhek <buytenh@wantstofly.org>
+.\" Date: Tue Jun 27 22:56:18 2006 +0100
+.\"
+.\" PTRACE_GETVFPREGS
+.\" PTRACE_SETVFPREGS
+.\" ARM and ARM64
+.\" Linux 2.6.30
+.\" commit 3d1228ead618b88e8606015cbabc49019981805d
+.\" Author: Catalin Marinas <catalin.marinas@arm.com>
+.\" Date: Wed Feb 11 13:12:56 2009 +0100
+.\"
+.\" PTRACE_GETHBPREGS
+.\" PTRACE_SETHBPREGS
+.\" ARM and ARM64
+.\" Linux 2.6.37
+.\" commit 864232fa1a2f8dfe003438ef0851a56722740f3e
+.\" Author: Will Deacon <will.deacon@arm.com>
+.\" Date: Fri Sep 3 10:42:55 2010 +0100
+.\"
+.\" PTRACE_SINGLEBLOCK
+.\" Since at least Linux 2.4.0 on various architectures
+.\" Since Linux 2.6.25 on x86 (and others?)
+.\" commit 5b88abbf770a0e1975c668743100f42934f385e8
+.\" Author: Roland McGrath <roland@redhat.com>
+.\" Date: Wed Jan 30 13:30:53 2008 +0100
+.\" ptrace: generic PTRACE_SINGLEBLOCK
+.\"
+.\" PTRACE_GETFPXREGS
+.\" PTRACE_SETFPXREGS
+.\" Since at least Linux 2.4.0 on various architectures
+.\"
+.\" PTRACE_GETFDPIC
+.\" PTRACE_GETFDPIC_EXEC
+.\" PTRACE_GETFDPIC_INTERP
+.\" blackfin, c6x, frv, sh
+.\" First appearance in Linux 2.6.11 on frv
+.\"
+.\" and others that can be found in the arch/*/include/uapi/asm/ptrace files
+.\"
+.TH ptrace 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+ptrace \- process trace
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/ptrace.h>
+.PP
+.BI "long ptrace(enum __ptrace_request " request ", pid_t " pid ,
+.BI " void *" addr ", void *" data );
+.fi
+.SH DESCRIPTION
+The
+.BR ptrace ()
+system call provides a means by which one process (the "tracer")
+may observe and control the execution of another process (the "tracee"),
+and examine and change the tracee's memory and registers.
+It is primarily used to implement breakpoint debugging and system
+call tracing.
+.PP
+A tracee first needs to be attached to the tracer.
+Attachment and subsequent commands are per thread:
+in a multithreaded process,
+every thread can be individually attached to a
+(potentially different) tracer,
+or left not attached and thus not debugged.
+Therefore, "tracee" always means "(one) thread",
+never "a (possibly multithreaded) process".
+Ptrace commands are always sent to
+a specific tracee using a call of the form
+.PP
+.in +4n
+.EX
+ptrace(PTRACE_foo, pid, ...)
+.EE
+.in
+.PP
+where
+.I pid
+is the thread ID of the corresponding Linux thread.
+.PP
+(Note that in this page, a "multithreaded process"
+means a thread group consisting of threads created using the
+.BR clone (2)
+.B CLONE_THREAD
+flag.)
+.PP
+A process can initiate a trace by calling
+.BR fork (2)
+and having the resulting child do a
+.BR PTRACE_TRACEME ,
+followed (typically) by an
+.BR execve (2).
+Alternatively, one process may commence tracing another process using
+.B PTRACE_ATTACH
+or
+.BR PTRACE_SEIZE .
+.PP
+While being traced, the tracee will stop each time a signal is delivered,
+even if the signal is being ignored.
+(An exception is
+.BR SIGKILL ,
+which has its usual effect.)
+The tracer will be notified at its next call to
+.BR waitpid (2)
+(or one of the related "wait" system calls); that call will return a
+.I status
+value containing information that indicates
+the cause of the stop in the tracee.
+While the tracee is stopped,
+the tracer can use various ptrace requests to inspect and modify the tracee.
+The tracer then causes the tracee to continue,
+optionally ignoring the delivered signal
+(or even delivering a different signal instead).
+.PP
+If the
+.B PTRACE_O_TRACEEXEC
+option is not in effect, all successful calls to
+.BR execve (2)
+by the traced process will cause it to be sent a
+.B SIGTRAP
+signal,
+giving the parent a chance to gain control before the new program
+begins execution.
+.PP
+When the tracer is finished tracing, it can cause the tracee to continue
+executing in a normal, untraced mode via
+.BR PTRACE_DETACH .
+.PP
+The value of
+.I request
+determines the action to be performed:
+.TP
+.B PTRACE_TRACEME
+Indicate that this process is to be traced by its parent.
+A process probably shouldn't make this request if its parent
+isn't expecting to trace it.
+.RI ( pid ,
+.IR addr ,
+and
+.I data
+are ignored.)
+.IP
+The
+.B PTRACE_TRACEME
+request is used only by the tracee;
+the remaining requests are used only by the tracer.
+In the following requests,
+.I pid
+specifies the thread ID of the tracee to be acted on.
+For requests other than
+.BR PTRACE_ATTACH ,
+.BR PTRACE_SEIZE ,
+.BR PTRACE_INTERRUPT ,
+and
+.BR PTRACE_KILL ,
+the tracee must be stopped.
+.TP
+.BR PTRACE_PEEKTEXT ", " PTRACE_PEEKDATA
+Read a word at the address
+.I addr
+in the tracee's memory, returning the word as the result of the
+.BR ptrace ()
+call.
+Linux does not have separate text and data address spaces,
+so these two requests are currently equivalent.
+.RI ( data
+is ignored; but see NOTES.)
+.TP
+.B PTRACE_PEEKUSER
+.\" PTRACE_PEEKUSR in kernel source, but glibc uses PTRACE_PEEKUSER,
+.\" and that is the name that seems common on other systems.
+Read a word at offset
+.I addr
+in the tracee's USER area,
+which holds the registers and other information about the process
+(see
+.IR <sys/user.h> ).
+The word is returned as the result of the
+.BR ptrace ()
+call.
+Typically, the offset must be word-aligned, though this might vary by
+architecture.
+See NOTES.
+.RI ( data
+is ignored; but see NOTES.)
+.TP
+.BR PTRACE_POKETEXT ", " PTRACE_POKEDATA
+Copy the word
+.I data
+to the address
+.I addr
+in the tracee's memory.
+As for
+.B PTRACE_PEEKTEXT
+and
+.BR PTRACE_PEEKDATA ,
+these two requests are currently equivalent.
+.TP
+.B PTRACE_POKEUSER
+.\" PTRACE_POKEUSR in kernel source, but glibc uses PTRACE_POKEUSER,
+.\" and that is the name that seems common on other systems.
+Copy the word
+.I data
+to offset
+.I addr
+in the tracee's USER area.
+As for
+.BR PTRACE_PEEKUSER ,
+the offset must typically be word-aligned.
+In order to maintain the integrity of the kernel,
+some modifications to the USER area are disallowed.
+.\" FIXME In the preceding sentence, which modifications are disallowed,
+.\" and when they are disallowed, how does user space discover that fact?
+.TP
+.BR PTRACE_GETREGS ", " PTRACE_GETFPREGS
+Copy the tracee's general-purpose or floating-point registers,
+respectively, to the address
+.I data
+in the tracer.
+See
+.I <sys/user.h>
+for information on the format of this data.
+.RI ( addr
+is ignored.)
+Note that SPARC systems have the meaning of
+.I data
+and
+.I addr
+reversed; that is,
+.I data
+is ignored and the registers are copied to the address
+.IR addr .
+.B PTRACE_GETREGS
+and
+.B PTRACE_GETFPREGS
+are not present on all architectures.
+.TP
+.BR PTRACE_GETREGSET " (since Linux 2.6.34)"
+Read the tracee's registers.
+.I addr
+specifies, in an architecture-dependent way, the type of registers to be read.
+.B NT_PRSTATUS
+(with numerical value 1)
+usually results in reading of general-purpose registers.
+If the CPU has, for example,
+floating-point and/or vector registers, they can be retrieved by setting
+.I addr
+to the corresponding
+.B NT_foo
+constant.
+.I data
+points to a
+.BR "struct iovec" ,
+which describes the destination buffer's location and length.
+On return, the kernel modifies
+.I iov_len
+to indicate the actual number of bytes returned.
+.TP
+.BR PTRACE_SETREGS ", " PTRACE_SETFPREGS
+Modify the tracee's general-purpose or floating-point registers,
+respectively, from the address
+.I data
+in the tracer.
+As for
+.BR PTRACE_POKEUSER ,
+some general-purpose register modifications may be disallowed.
+.\" FIXME . In the preceding sentence, which modifications are disallowed,
+.\" and when they are disallowed, how does user space discover that fact?
+.RI ( addr
+is ignored.)
+Note that SPARC systems have the meaning of
+.I data
+and
+.I addr
+reversed; that is,
+.I data
+is ignored and the registers are copied from the address
+.IR addr .
+.B PTRACE_SETREGS
+and
+.B PTRACE_SETFPREGS
+are not present on all architectures.
+.TP
+.BR PTRACE_SETREGSET " (since Linux 2.6.34)"
+Modify the tracee's registers.
+The meaning of
+.I addr
+and
+.I data
+is analogous to
+.BR PTRACE_GETREGSET .
+.TP
+.BR PTRACE_GETSIGINFO " (since Linux 2.3.99-pre6)"
+Retrieve information about the signal that caused the stop.
+Copy a
+.I siginfo_t
+structure (see
+.BR sigaction (2))
+from the tracee to the address
+.I data
+in the tracer.
+.RI ( addr
+is ignored.)
+.TP
+.BR PTRACE_SETSIGINFO " (since Linux 2.3.99-pre6)"
+Set signal information:
+copy a
+.I siginfo_t
+structure from the address
+.I data
+in the tracer to the tracee.
+This will affect only signals that would normally be delivered to
+the tracee and were caught by the tracer.
+It may be difficult to tell
+these normal signals from synthetic signals generated by
+.BR ptrace ()
+itself.
+.RI ( addr
+is ignored.)
+.TP
+.BR PTRACE_PEEKSIGINFO " (since Linux 3.10)"
+.\" commit 84c751bd4aebbaae995fe32279d3dba48327bad4
+Retrieve
+.I siginfo_t
+structures without removing signals from a queue.
+.I addr
+points to a
+.I ptrace_peeksiginfo_args
+structure that specifies the ordinal position from which
+copying of signals should start,
+and the number of signals to copy.
+.I siginfo_t
+structures are copied into the buffer pointed to by
+.IR data .
+The return value contains the number of copied signals (zero indicates
+that there is no signal corresponding to the specified ordinal position).
+Within the returned
+.I siginfo_t
+structures,
+the
+.I si_code
+field includes information
+.RB ( __SI_CHLD ,
+.BR __SI_FAULT ,
+etc.) that is not otherwise exposed to user space.
+.IP
+.in +4n
+.EX
+struct ptrace_peeksiginfo_args {
+ u64 off; /* Ordinal position in queue at which
+ to start copying signals */
+ u32 flags; /* PTRACE_PEEKSIGINFO_SHARED or 0 */
+ s32 nr; /* Number of signals to copy */
+};
+.EE
+.in
+.IP
+Currently, there is only one flag,
+.BR PTRACE_PEEKSIGINFO_SHARED ,
+for dumping signals from the process-wide signal queue.
+If this flag is not set,
+signals are read from the per-thread queue of the specified thread.
+.in
+.TP
+.BR PTRACE_GETSIGMASK " (since Linux 3.11)"
+.\" commit 29000caecbe87b6b66f144f72111f0d02fbbf0c1
+Place a copy of the mask of blocked signals (see
+.BR sigprocmask (2))
+in the buffer pointed to by
+.IR data ,
+which should be a pointer to a buffer of type
+.IR sigset_t .
+The
+.I addr
+argument contains the size of the buffer pointed to by
+.I data
+(i.e.,
+.IR sizeof(sigset_t) ).
+.TP
+.BR PTRACE_SETSIGMASK " (since Linux 3.11)"
+Change the mask of blocked signals (see
+.BR sigprocmask (2))
+to the value specified in the buffer pointed to by
+.IR data ,
+which should be a pointer to a buffer of type
+.IR sigset_t .
+The
+.I addr
+argument contains the size of the buffer pointed to by
+.I data
+(i.e.,
+.IR sizeof(sigset_t) ).
+.TP
+.BR PTRACE_SETOPTIONS " (since Linux 2.4.6; see BUGS for caveats)"
+Set ptrace options from
+.IR data .
+.RI ( addr
+is ignored.)
+.I data
+is interpreted as a bit mask of options,
+which are specified by the following flags:
+.RS
+.TP
+.BR PTRACE_O_EXITKILL " (since Linux 3.8)"
+.\" commit 992fb6e170639b0849bace8e49bf31bd37c4123
+Send a
+.B SIGKILL
+signal to the tracee if the tracer exits.
+This option is useful for ptrace jailers that
+want to ensure that tracees can never escape the tracer's control.
+.TP
+.BR PTRACE_O_TRACECLONE " (since Linux 2.5.46)"
+Stop the tracee at the next
+.BR clone (2)
+and automatically start tracing the newly cloned process,
+which will start with a
+.BR SIGSTOP ,
+or
+.B PTRACE_EVENT_STOP
+if
+.B PTRACE_SEIZE
+was used.
+A
+.BR waitpid (2)
+by the tracer will return a
+.I status
+value such that
+.IP
+.nf
+ status>>8 == (SIGTRAP | (PTRACE_EVENT_CLONE<<8))
+.fi
+.IP
+The PID of the new process can be retrieved with
+.BR PTRACE_GETEVENTMSG .
+.IP
+This option may not catch
+.BR clone (2)
+calls in all cases.
+If the tracee calls
+.BR clone (2)
+with the
+.B CLONE_VFORK
+flag,
+.B PTRACE_EVENT_VFORK
+will be delivered instead
+if
+.B PTRACE_O_TRACEVFORK
+is set; otherwise if the tracee calls
+.BR clone (2)
+with the exit signal set to
+.BR SIGCHLD ,
+.B PTRACE_EVENT_FORK
+will be delivered if
+.B PTRACE_O_TRACEFORK
+is set.
+.TP
+.BR PTRACE_O_TRACEEXEC " (since Linux 2.5.46)"
+Stop the tracee at the next
+.BR execve (2).
+A
+.BR waitpid (2)
+by the tracer will return a
+.I status
+value such that
+.IP
+.nf
+ status>>8 == (SIGTRAP | (PTRACE_EVENT_EXEC<<8))
+.fi
+.IP
+If the execing thread is not a thread group leader,
+the thread ID is reset to the thread group leader's ID before this stop.
+Since Linux 3.0, the former thread ID can be retrieved with
+.BR PTRACE_GETEVENTMSG .
+.TP
+.BR PTRACE_O_TRACEEXIT " (since Linux 2.5.60)"
+Stop the tracee at exit.
+A
+.BR waitpid (2)
+by the tracer will return a
+.I status
+value such that
+.IP
+.nf
+ status>>8 == (SIGTRAP | (PTRACE_EVENT_EXIT<<8))
+.fi
+.IP
+The tracee's exit status can be retrieved with
+.BR PTRACE_GETEVENTMSG .
+.IP
+The tracee is stopped early during process exit,
+when registers are still available,
+allowing the tracer to see where the exit occurred,
+whereas the normal exit notification is done after the process
+is finished exiting.
+Even though context is available,
+the tracer cannot prevent the exit from happening at this point.
+.TP
+.BR PTRACE_O_TRACEFORK " (since Linux 2.5.46)"
+Stop the tracee at the next
+.BR fork (2)
+and automatically start tracing the newly forked process,
+which will start with a
+.BR SIGSTOP ,
+or
+.B PTRACE_EVENT_STOP
+if
+.B PTRACE_SEIZE
+was used.
+A
+.BR waitpid (2)
+by the tracer will return a
+.I status
+value such that
+.IP
+.nf
+ status>>8 == (SIGTRAP | (PTRACE_EVENT_FORK<<8))
+.fi
+.IP
+The PID of the new process can be retrieved with
+.BR PTRACE_GETEVENTMSG .
+.TP
+.BR PTRACE_O_TRACESYSGOOD " (since Linux 2.4.6)"
+When delivering system call traps, set bit 7 in the signal number
+(i.e., deliver
+.IR "SIGTRAP|0x80" ).
+This makes it easy for the tracer to distinguish
+normal traps from those caused by a system call.
+.TP
+.BR PTRACE_O_TRACEVFORK " (since Linux 2.5.46)"
+Stop the tracee at the next
+.BR vfork (2)
+and automatically start tracing the newly vforked process,
+which will start with a
+.BR SIGSTOP ,
+or
+.B PTRACE_EVENT_STOP
+if
+.B PTRACE_SEIZE
+was used.
+A
+.BR waitpid (2)
+by the tracer will return a
+.I status
+value such that
+.IP
+.nf
+ status>>8 == (SIGTRAP | (PTRACE_EVENT_VFORK<<8))
+.fi
+.IP
+The PID of the new process can be retrieved with
+.BR PTRACE_GETEVENTMSG .
+.TP
+.BR PTRACE_O_TRACEVFORKDONE " (since Linux 2.5.60)"
+Stop the tracee at the completion of the next
+.BR vfork (2).
+A
+.BR waitpid (2)
+by the tracer will return a
+.I status
+value such that
+.IP
+.nf
+ status>>8 == (SIGTRAP | (PTRACE_EVENT_VFORK_DONE<<8))
+.fi
+.IP
+The PID of the new process can (since Linux 2.6.18) be retrieved with
+.BR PTRACE_GETEVENTMSG .
+.TP
+.BR PTRACE_O_TRACESECCOMP " (since Linux 3.5)"
+Stop the tracee when a
+.BR seccomp (2)
+.B SECCOMP_RET_TRACE
+rule is triggered.
+A
+.BR waitpid (2)
+by the tracer will return a
+.I status
+value such that
+.IP
+.nf
+ status>>8 == (SIGTRAP | (PTRACE_EVENT_SECCOMP<<8))
+.fi
+.IP
+While this triggers a
+.B PTRACE_EVENT
+stop, it is similar to a syscall-enter-stop.
+For details, see the note on
+.B PTRACE_EVENT_SECCOMP
+below.
+The seccomp event message data (from the
+.B SECCOMP_RET_DATA
+portion of the seccomp filter rule) can be retrieved with
+.BR PTRACE_GETEVENTMSG .
+.TP
+.BR PTRACE_O_SUSPEND_SECCOMP " (since Linux 4.3)"
+.\" commit 13c4a90119d28cfcb6b5bdd820c233b86c2b0237
+Suspend the tracee's seccomp protections.
+This applies regardless of mode, and
+can be used when the tracee has not yet installed seccomp filters.
+That is, a valid use case is to suspend a tracee's seccomp protections
+before they are installed by the tracee,
+let the tracee install the filters,
+and then clear this flag when the filters should be resumed.
+Setting this option requires that the tracer have the
+.B CAP_SYS_ADMIN
+capability,
+not have any seccomp protections installed, and not have
+.B PTRACE_O_SUSPEND_SECCOMP
+set on itself.
+.RE
+.TP
+.BR PTRACE_GETEVENTMSG " (since Linux 2.5.46)"
+Retrieve a message (as an
+.IR "unsigned long" )
+about the ptrace event
+that just happened, placing it at the address
+.I data
+in the tracer.
+For
+.BR PTRACE_EVENT_EXIT ,
+this is the tracee's exit status.
+For
+.BR PTRACE_EVENT_FORK ,
+.BR PTRACE_EVENT_VFORK ,
+.BR PTRACE_EVENT_VFORK_DONE ,
+and
+.BR PTRACE_EVENT_CLONE ,
+this is the PID of the new process.
+For
+.BR PTRACE_EVENT_SECCOMP ,
+this is the
+.BR seccomp (2)
+filter's
+.B SECCOMP_RET_DATA
+associated with the triggered rule.
+.RI ( addr
+is ignored.)
+.TP
+.B PTRACE_CONT
+Restart the stopped tracee process.
+If
+.I data
+is nonzero,
+it is interpreted as the number of a signal to be delivered to the tracee;
+otherwise, no signal is delivered.
+Thus, for example, the tracer can control
+whether a signal sent to the tracee is delivered or not.
+.RI ( addr
+is ignored.)
+.TP
+.BR PTRACE_SYSCALL ", " PTRACE_SINGLESTEP
+Restart the stopped tracee as for
+.BR PTRACE_CONT ,
+but arrange for the tracee to be stopped at
+the next entry to or exit from a system call,
+or after execution of a single instruction, respectively.
+(The tracee will also, as usual, be stopped upon receipt of a signal.)
+From the tracer's perspective, the tracee will appear to have been
+stopped by receipt of a
+.BR SIGTRAP .
+So, for
+.BR PTRACE_SYSCALL ,
+for example, the idea is to inspect
+the arguments to the system call at the first stop,
+then do another
+.B PTRACE_SYSCALL
+and inspect the return value of the system call at the second stop.
+The
+.I data
+argument is treated as for
+.BR PTRACE_CONT .
+.RI ( addr
+is ignored.)
+.TP
+.BR PTRACE_SET_SYSCALL " (since Linux 2.6.16)"
+.\" commit 3f471126ee53feb5e9b210ea2f525ed3bb9b7a7f
+When in syscall-enter-stop,
+change the number of the system call that is about to
+be executed to the number specified in the
+.I data
+argument.
+The
+.I addr
+argument is ignored.
+This request is currently
+.\" As of 4.19-rc2
+supported only on arm (and arm64, though only for backwards compatibility),
+.\" commit 27aa55c5e5123fa8b8ad0156559d34d7edff58ca
+but most other architectures have other means of accomplishing this
+(usually by changing the register that the userland code passed the
+system call number in).
+.\" see change_syscall in tools/testing/selftests/seccomp/seccomp_bpf.c
+.\" and also strace's linux/*/set_scno.c files.
+.TP
+.BR PTRACE_SYSEMU ", " PTRACE_SYSEMU_SINGLESTEP " (since Linux 2.6.14)"
+For
+.BR PTRACE_SYSEMU ,
+continue and stop on entry to the next system call,
+which will not be executed.
+See the documentation on syscall-stops below.
+For
+.BR PTRACE_SYSEMU_SINGLESTEP ,
+do the same but also singlestep if not a system call.
+This call is used by programs like
+User Mode Linux that want to emulate all the tracee's system calls.
+The
+.I data
+argument is treated as for
+.BR PTRACE_CONT .
+The
+.I addr
+argument is ignored.
+These requests are currently
+.\" As at 3.7
+supported only on x86.
+.TP
+.BR PTRACE_LISTEN " (since Linux 3.4)"
+Restart the stopped tracee, but prevent it from executing.
+The resulting state of the tracee is similar to a process which
+has been stopped by a
+.B SIGSTOP
+(or other stopping signal).
+See the "group-stop" subsection for additional information.
+.B PTRACE_LISTEN
+works only on tracees attached by
+.BR PTRACE_SEIZE .
+.TP
+.B PTRACE_KILL
+Send the tracee a
+.B SIGKILL
+to terminate it.
+.RI ( addr
+and
+.I data
+are ignored.)
+.IP
+.I This operation is deprecated; do not use it!
+Instead, send a
+.B SIGKILL
+directly using
+.BR kill (2)
+or
+.BR tgkill (2).
+The problem with
+.B PTRACE_KILL
+is that it requires the tracee to be in signal-delivery-stop,
+otherwise it may not work
+(i.e., may complete successfully but won't kill the tracee).
+By contrast, sending a
+.B SIGKILL
+directly has no such limitation.
+.\" [Note from Denys Vlasenko:
+.\" deprecation suggested by Oleg Nesterov. He prefers to deprecate it
+.\" instead of describing (and needing to support) PTRACE_KILL's quirks.]
+.TP
+.BR PTRACE_INTERRUPT " (since Linux 3.4)"
+Stop a tracee.
+If the tracee is running or sleeping in kernel space and
+.B PTRACE_SYSCALL
+is in effect,
+the system call is interrupted and syscall-exit-stop is reported.
+(The interrupted system call is restarted when the tracee is restarted.)
+If the tracee was already stopped by a signal and
+.B PTRACE_LISTEN
+was sent to it,
+the tracee stops with
+.B PTRACE_EVENT_STOP
+and
+.I WSTOPSIG(status)
+returns the stop signal.
+If any other ptrace-stop is generated at the same time (for example,
+if a signal is sent to the tracee), this ptrace-stop happens.
+If none of the above applies (for example, if the tracee is running in user
+space), it stops with
+.B PTRACE_EVENT_STOP
+with
+.I WSTOPSIG(status)
+==
+.BR SIGTRAP .
+.B PTRACE_INTERRUPT
+only works on tracees attached by
+.BR PTRACE_SEIZE .
+.TP
+.B PTRACE_ATTACH
+Attach to the process specified in
+.IR pid ,
+making it a tracee of the calling process.
+.\" No longer true (removed by Denys Vlasenko, 2011, who remarks:
+.\" "I think it isn't true in non-ancient 2.4 and in Linux 2.6/3.x.
+.\" Basically, it's not true for any Linux in practical use.
+.\" ; the behavior of the tracee is as if it had done a
+.\" .BR PTRACE_TRACEME .
+.\" The calling process actually becomes the parent of the tracee
+.\" process for most purposes (e.g., it will receive
+.\" notification of tracee events and appears in
+.\" .BR ps (1)
+.\" output as the tracee's parent), but a
+.\" .BR getppid (2)
+.\" by the tracee will still return the PID of the original parent.
+The tracee is sent a
+.BR SIGSTOP ,
+but will not necessarily have stopped
+by the completion of this call; use
+.BR waitpid (2)
+to wait for the tracee to stop.
+See the "Attaching and detaching" subsection for additional information.
+.RI ( addr
+and
+.I data
+are ignored.)
+.IP
+Permission to perform a
+.B PTRACE_ATTACH
+is governed by a ptrace access mode
+.B PTRACE_MODE_ATTACH_REALCREDS
+check; see below.
+.TP
+.BR PTRACE_SEIZE " (since Linux 3.4)"
+.\"
+.\" Noted by Dmitry Levin:
+.\"
+.\" PTRACE_SEIZE was introduced by commit v3.1-rc1~308^2~28, but
+.\" it had to be used along with a temporary flag PTRACE_SEIZE_DEVEL,
+.\" which was removed later by commit v3.4-rc1~109^2~20.
+.\"
+.\" That is, [before] v3.4 we had a test mode of PTRACE_SEIZE API,
+.\" which was not compatible with the current PTRACE_SEIZE API introduced
+.\" in Linux 3.4.
+.\"
+Attach to the process specified in
+.IR pid ,
+making it a tracee of the calling process.
+Unlike
+.BR PTRACE_ATTACH ,
+.B PTRACE_SEIZE
+does not stop the process.
+Group-stops are reported as
+.B PTRACE_EVENT_STOP
+and
+.I WSTOPSIG(status)
+returns the stop signal.
+Automatically attached children stop with
+.B PTRACE_EVENT_STOP
+and
+.I WSTOPSIG(status)
+returns
+.B SIGTRAP
+instead of having a
+.B SIGSTOP
+signal delivered to them.
+.BR execve (2)
+does not deliver an extra
+.BR SIGTRAP .
+Only a
+.BR PTRACE_SEIZE d
+process can accept
+.B PTRACE_INTERRUPT
+and
+.B PTRACE_LISTEN
+commands.
+The "seized" behavior just described is inherited by
+children that are automatically attached using
+.BR PTRACE_O_TRACEFORK ,
+.BR PTRACE_O_TRACEVFORK ,
+and
+.BR PTRACE_O_TRACECLONE .
+.I addr
+must be zero.
+.I data
+contains a bit mask of ptrace options to activate immediately.
+.IP
+Permission to perform a
+.B PTRACE_SEIZE
+is governed by a ptrace access mode
+.B PTRACE_MODE_ATTACH_REALCREDS
+check; see below.
+.\"
+.TP
+.BR PTRACE_SECCOMP_GET_FILTER " (since Linux 4.4)"
+.\" commit f8e529ed941ba2bbcbf310b575d968159ce7e895
+This operation allows the tracer to dump the tracee's
+classic BPF filters.
+.IP
+.I addr
+is an integer specifying the index of the filter to be dumped.
+The most recently installed filter has the index 0.
+If
+.I addr
+is greater than or equal to the number of installed filters,
+the operation fails with the error
+.BR ENOENT .
+.IP
+.I data
+is either a pointer to a
+.I struct sock_filter
+array that is large enough to store the BPF program,
+or NULL if the program is not to be stored.
+.IP
+Upon success,
+the return value is the number of instructions in the BPF program.
+If
+.I data
+was NULL, then this return value can be used to correctly size the
+.I struct sock_filter
+array passed in a subsequent call.
+.IP
+This operation fails with the error
+.B EACCES
+if the caller does not have the
+.B CAP_SYS_ADMIN
+capability or if the caller is in strict or filter seccomp mode.
+If the filter referred to by
+.I addr
+is not a classic BPF filter, the operation fails with the error
+.BR EMEDIUMTYPE .
+.IP
+This operation is available if the kernel was configured with both the
+.B CONFIG_SECCOMP_FILTER
+and the
+.B CONFIG_CHECKPOINT_RESTORE
+options.
+.TP
+.B PTRACE_DETACH
+Restart the stopped tracee as for
+.BR PTRACE_CONT ,
+but first detach from it.
+Under Linux, a tracee can be detached in this way regardless
+of which method was used to initiate tracing.
+.RI ( addr
+is ignored.)
+.\"
+.TP
+.BR PTRACE_GET_THREAD_AREA " (since Linux 2.6.0)"
+This operation performs a similar task to
+.BR get_thread_area (2).
+It reads the TLS entry in the GDT whose index is given in
+.IR addr ,
+placing a copy of the entry into the
+.I struct user_desc
+pointed to by
+.IR data .
+(By contrast with
+.BR get_thread_area (2),
+the
+.I entry_number
+of the
+.I struct user_desc
+is ignored.)
+.TP
+.BR PTRACE_SET_THREAD_AREA " (since Linux 2.6.0)"
+This operation performs a similar task to
+.BR set_thread_area (2).
+It sets the TLS entry in the GDT whose index is given in
+.IR addr ,
+assigning it the data supplied in the
+.I struct user_desc
+pointed to by
+.IR data .
+(By contrast with
+.BR set_thread_area (2),
+the
+.I entry_number
+of the
+.I struct user_desc
+is ignored; in other words,
+this ptrace operation can't be used to allocate a free TLS entry.)
+.TP
+.BR PTRACE_GET_SYSCALL_INFO " (since Linux 5.3)"
+.\" commit 201766a20e30f982ccfe36bebfad9602c3ff574a
+Retrieve information about the system call that caused the stop.
+The information is placed into the buffer pointed to by the
+.I data
+argument, which should be a pointer to a buffer of type
+.IR "struct ptrace_syscall_info" .
+The
+.I addr
+argument contains the size of the buffer pointed to
+by the
+.I data
+argument (i.e.,
+.IR "sizeof(struct ptrace_syscall_info)" ).
+The return value contains the number of bytes available
+to be written by the kernel.
+If the size of the data to be written by the kernel exceeds the size
+specified by the
+.I addr
+argument, the output data is truncated.
+.IP
+The
+.I ptrace_syscall_info
+structure contains the following fields:
+.IP
+.in +4n
+.EX
+struct ptrace_syscall_info {
+ __u8 op; /* Type of system call stop */
+ __u32 arch; /* AUDIT_ARCH_* value; see seccomp(2) */
+ __u64 instruction_pointer; /* CPU instruction pointer */
+ __u64 stack_pointer; /* CPU stack pointer */
+ union {
+ struct { /* op == PTRACE_SYSCALL_INFO_ENTRY */
+ __u64 nr; /* System call number */
+ __u64 args[6]; /* System call arguments */
+ } entry;
+ struct { /* op == PTRACE_SYSCALL_INFO_EXIT */
+ __s64 rval; /* System call return value */
+ __u8 is_error; /* System call error flag;
+ Boolean: does rval contain
+ an error value (\-ERRCODE) or
+ a nonerror return value? */
+ } exit;
+ struct { /* op == PTRACE_SYSCALL_INFO_SECCOMP */
+ __u64 nr; /* System call number */
+ __u64 args[6]; /* System call arguments */
+ __u32 ret_data; /* SECCOMP_RET_DATA portion
+ of SECCOMP_RET_TRACE
+ return value */
+ } seccomp;
+ };
+};
+.EE
+.in
+.IP
+The
+.IR op ,
+.IR arch ,
+.IR instruction_pointer ,
+and
+.I stack_pointer
+fields are defined for all kinds of ptrace system call stops.
+The rest of the structure is a union; one should read only those fields
+that are meaningful for the kind of system call stop specified by the
+.I op
+field.
+.IP
+The
+.I op
+field has one of the following values (defined in
+.IR <linux/ptrace.h> )
+indicating what type of stop occurred and
+which part of the union is filled:
+.RS
+.TP
+.B PTRACE_SYSCALL_INFO_ENTRY
+The
+.I entry
+component of the union contains information relating to a
+system call entry stop.
+.TP
+.B PTRACE_SYSCALL_INFO_EXIT
+The
+.I exit
+component of the union contains information relating to a
+system call exit stop.
+.TP
+.B PTRACE_SYSCALL_INFO_SECCOMP
+The
+.I seccomp
+component of the union contains information relating to a
+.B PTRACE_EVENT_SECCOMP
+stop.
+.TP
+.B PTRACE_SYSCALL_INFO_NONE
+No component of the union contains relevant information.
+.RE
+.IP
+In case of system call entry or exit stops,
+the data returned by
+.B PTRACE_GET_SYSCALL_INFO
+is limited to type
+.B PTRACE_SYSCALL_INFO_NONE
+unless the
+.B PTRACE_O_TRACESYSGOOD
+option is set before the corresponding system call stop has occurred.
+.\"
+.SS Death under ptrace
+When a (possibly multithreaded) process receives a killing signal
+(one whose disposition is set to
+.B SIG_DFL
+and whose default action is to kill the process),
+all threads exit.
+Tracees report their death to their tracer(s).
+Notification of this event is delivered via
+.BR waitpid (2).
+.PP
+Note that the killing signal will first cause signal-delivery-stop
+(on one tracee only),
+and only after it is injected by the tracer
+(or after it was dispatched to a thread which isn't traced),
+will death from the signal happen on
+.I all
+tracees within a multithreaded process.
+(The term "signal-delivery-stop" is explained below.)
+.PP
+.B SIGKILL
+does not generate signal-delivery-stop and
+therefore the tracer can't suppress it.
+.B SIGKILL
+kills even within system calls
+(syscall-exit-stop is not generated prior to death by
+.BR SIGKILL ).
+The net effect is that
+.B SIGKILL
+always kills the process (all its threads),
+even if some threads of the process are ptraced.
+.PP
+When the tracee calls
+.BR _exit (2),
+it reports its death to its tracer.
+Other threads are not affected.
+.PP
+When any thread executes
+.BR exit_group (2),
+every tracee in its thread group reports its death to its tracer.
+.PP
+If the
+.B PTRACE_O_TRACEEXIT
+option is on,
+.B PTRACE_EVENT_EXIT
+will happen before actual death.
+This applies to exits via
+.BR _exit (2),
+.BR exit_group (2),
+and signal deaths (except
+.BR SIGKILL ,
+depending on the kernel version; see BUGS below),
+and when threads are torn down on
+.BR execve (2)
+in a multithreaded process.
+.PP
+The tracer cannot assume that the ptrace-stopped tracee exists.
+There are many scenarios when the tracee may die while stopped (such as
+.BR SIGKILL ).
+Therefore, the tracer must be prepared to handle an
+.B ESRCH
+error on any ptrace operation.
+Unfortunately, the same error is returned if the tracee
+exists but is not ptrace-stopped
+(for commands which require a stopped tracee),
+or if it is not traced by the process which issued the ptrace call.
+The tracer needs to keep track of the stopped/running state of the tracee,
+and interpret
+.B ESRCH
+as "tracee died unexpectedly" only if it knows that the tracee has
+been observed to enter ptrace-stop.
+Note that there is no guarantee that
+.I waitpid(WNOHANG)
+will reliably report the tracee's death status if a
+ptrace operation returned
+.BR ESRCH .
+.I waitpid(WNOHANG)
+may return 0 instead.
+In other words, the tracee may be "not yet fully dead",
+but already refusing ptrace requests.
+.PP
+The tracer can't assume that the tracee
+.I always
+ends its life by reporting
+.I WIFEXITED(status)
+or
+.IR WIFSIGNALED(status) ;
+there are cases where this does not occur.
+For example, if a thread other than the thread group leader does an
+.BR execve (2),
+it disappears;
+its PID will never be seen again,
+and any subsequent ptrace stops will be reported under
+the thread group leader's PID.
+.SS Stopped states
+A tracee can be in two states: running or stopped.
+For the purposes of ptrace, a tracee which is blocked in a system call
+(such as
+.BR read (2),
+.BR pause (2),
+etc.)
+is nevertheless considered to be running, even if the tracee is blocked
+for a long time.
+The state of the tracee after
+.B PTRACE_LISTEN
+is somewhat of a gray area: it is not in any ptrace-stop (ptrace commands
+won't work on it, and it will deliver
+.BR waitpid (2)
+notifications),
+but it also may be considered "stopped" because
+it is not executing instructions (is not scheduled), and if it was
+in group-stop before
+.BR PTRACE_LISTEN ,
+it will not respond to signals until
+.B SIGCONT
+is received.
+.PP
+There are many kinds of states when the tracee is stopped, and in ptrace
+discussions they are often conflated.
+Therefore, it is important to use precise terms.
+.PP
+In this manual page, any stopped state in which the tracee is ready
+to accept ptrace commands from the tracer is called
+.IR ptrace-stop .
+Ptrace-stops can
+be further subdivided into
+.IR signal-delivery-stop ,
+.IR group-stop ,
+.IR syscall-stop ,
+.IR "PTRACE_EVENT stops" ,
+and so on.
+These stopped states are described in detail below.
+.PP
+When the running tracee enters ptrace-stop, it notifies its tracer using
+.BR waitpid (2)
+(or one of the other "wait" system calls).
+Most of this manual page assumes that the tracer waits with:
+.PP
+.in +4n
+.EX
+pid = waitpid(pid_or_minus_1, &status, __WALL);
+.EE
+.in
+.PP
+Ptrace-stopped tracees are reported as returns with
+.I pid
+greater than 0 and
+.I WIFSTOPPED(status)
+true.
+.\" Denys Vlasenko:
+.\" Do we require __WALL usage, or will just using 0 be ok? (With 0,
+.\" I am not 100% sure there aren't ugly corner cases.) Are the
+.\" rules different if user wants to use waitid? Will waitid require
+.\" WEXITED?
+.\"
+.PP
+The
+.B __WALL
+flag does not include the
+.B WSTOPPED
+and
+.B WEXITED
+flags, but implies their functionality.
+.PP
+Setting the
+.B WCONTINUED
+flag when calling
+.BR waitpid (2)
+is not recommended: the "continued" state is per-process and
+consuming it can confuse the real parent of the tracee.
+.PP
+Use of the
+.B WNOHANG
+flag may cause
+.BR waitpid (2)
+to return 0 ("no wait results available yet")
+even if the tracer knows there should be a notification.
+Example:
+.PP
+.in +4n
+.EX
+errno = 0;
+ptrace(PTRACE_CONT, pid, 0L, 0L);
+if (errno == ESRCH) {
+ /* tracee is dead */
+ r = waitpid(pid, &status, __WALL | WNOHANG);
+ /* r can still be 0 here! */
+}
+.EE
+.in
+.\" FIXME .
+.\" waitid usage? WNOWAIT?
+.\" describe how wait notifications queue (or not queue)
+.PP
+The following kinds of ptrace-stops exist: signal-delivery-stops,
+group-stops,
+.B PTRACE_EVENT
+stops, syscall-stops.
+They all are reported by
+.BR waitpid (2)
+with
+.I WIFSTOPPED(status)
+true.
+They may be differentiated by examining the value
+.IR status>>8 ,
+and if there is ambiguity in that value, by querying
+.BR PTRACE_GETSIGINFO .
+(Note: the
+.I WSTOPSIG(status)
+macro can't be used to perform this examination,
+because it returns the value
+.IR "(status>>8)\ &\ 0xff" .)
+.SS Signal-delivery-stop
+When a (possibly multithreaded) process receives any signal except
+.BR SIGKILL ,
+the kernel selects an arbitrary thread which handles the signal.
+(If the signal is generated with
+.BR tgkill (2),
+the target thread can be explicitly selected by the caller.)
+If the selected thread is traced, it enters signal-delivery-stop.
+At this point, the signal is not yet delivered to the process,
+and can be suppressed by the tracer.
+If the tracer doesn't suppress the signal,
+it passes the signal to the tracee in the next ptrace restart request.
+This second step of signal delivery is called
+.I "signal injection"
+in this manual page.
+Note that if the signal is blocked,
+signal-delivery-stop doesn't happen until the signal is unblocked,
+with the usual exception that
+.B SIGSTOP
+can't be blocked.
+.PP
+Signal-delivery-stop is observed by the tracer as
+.BR waitpid (2)
+returning with
+.I WIFSTOPPED(status)
+true, with the signal returned by
+.IR WSTOPSIG(status) .
+If the signal is
+.BR SIGTRAP ,
+this may be a different kind of ptrace-stop;
+see the "Syscall-stops" and "execve" sections below for details.
+If
+.I WSTOPSIG(status)
+returns a stopping signal, this may be a group-stop; see below.
+.SS Signal injection and suppression
+After signal-delivery-stop is observed by the tracer,
+the tracer should restart the tracee with the call
+.PP
+.in +4n
+.EX
+ptrace(PTRACE_restart, pid, 0, sig)
+.EE
+.in
+.PP
+where
+.B PTRACE_restart
+is one of the restarting ptrace requests.
+If
+.I sig
+is 0, then a signal is not delivered.
+Otherwise, the signal
+.I sig
+is delivered.
+This operation is called
+.I "signal injection"
+in this manual page, to distinguish it from signal-delivery-stop.
+.PP
+The
+.I sig
+value may be different from the
+.I WSTOPSIG(status)
+value: the tracer can cause a different signal to be injected.
+.PP
+Note that a suppressed signal still causes system calls to return
+prematurely.
+In this case, system calls will be restarted: the tracer will
+observe the tracee to reexecute the interrupted system call (or
+.BR restart_syscall (2)
+system call for a few system calls which use a different mechanism
+for restarting) if the tracer uses
+.BR PTRACE_SYSCALL .
+Even system calls (such as
+.BR poll (2))
+which are not restartable after a signal are restarted after
+the signal is suppressed;
+however, kernel bugs exist which cause some system calls to fail with
+.B EINTR
+even though no observable signal is injected to the tracee.
+.PP
+Restarting ptrace commands issued in ptrace-stops other than
+signal-delivery-stop are not guaranteed to inject a signal, even if
+.I sig
+is nonzero.
+No error is reported; a nonzero
+.I sig
+may simply be ignored.
+Ptrace users should not try to "create a new signal" this way: use
+.BR tgkill (2)
+instead.
+.PP
+The fact that signal injection requests may be ignored
+when restarting the tracee after
+ptrace stops that are not signal-delivery-stops
+is a cause of confusion among ptrace users.
+One typical scenario is that the tracer observes group-stop,
+mistakes it for signal-delivery-stop, restarts the tracee with
+.PP
+.in +4n
+.EX
+ptrace(PTRACE_restart, pid, 0, stopsig)
+.EE
+.in
+.PP
+with the intention of injecting
+.IR stopsig ,
+but
+.I stopsig
+gets ignored and the tracee continues to run.
+.PP
+The
+.B SIGCONT
+signal has a side effect of waking up (all threads of)
+a group-stopped process.
+This side effect happens before signal-delivery-stop.
+The tracer can't suppress this side effect (it can
+only suppress signal injection, which only causes the
+.B SIGCONT
+handler to not be executed in the tracee, if such a handler is installed).
+In fact, waking up from group-stop may be followed by
+signal-delivery-stop for signal(s)
+.I other than
+.BR SIGCONT ,
+if they were pending when
+.B SIGCONT
+was delivered.
+In other words,
+.B SIGCONT
+may not be the first signal observed by the tracee after it was sent.
+.PP
+Stopping signals cause (all threads of) a process to enter group-stop.
+This side effect happens after signal injection, and therefore can be
+suppressed by the tracer.
+.PP
+In Linux 2.4 and earlier, the
+.B SIGSTOP
+signal can't be injected.
+.\" In the Linux 2.4 sources, in arch/i386/kernel/signal.c::do_signal(),
+.\" there is:
+.\"
+.\" /* The debugger continued. Ignore SIGSTOP. */
+.\" if (signr == SIGSTOP)
+.\" continue;
+.PP
+.B PTRACE_GETSIGINFO
+can be used to retrieve a
+.I siginfo_t
+structure which corresponds to the delivered signal.
+.B PTRACE_SETSIGINFO
+may be used to modify it.
+If
+.B PTRACE_SETSIGINFO
+has been used to alter
+.IR siginfo_t ,
+the
+.I si_signo
+field and the
+.I sig
+parameter in the restarting command must match,
+otherwise the result is undefined.
+.SS Group-stop
+When a (possibly multithreaded) process receives a stopping signal,
+all threads stop.
+If some threads are traced, they enter a group-stop.
+Note that the stopping signal will first cause signal-delivery-stop
+(on one tracee only), and only after it is injected by the tracer
+(or after it was dispatched to a thread which isn't traced),
+will group-stop be initiated on
+.I all
+tracees within the multithreaded process.
+As usual, every tracee reports its group-stop separately
+to the corresponding tracer.
+.PP
+Group-stop is observed by the tracer as
+.BR waitpid (2)
+returning with
+.I WIFSTOPPED(status)
+true, with the stopping signal available via
+.IR WSTOPSIG(status) .
+The same result is returned by some other classes of ptrace-stops,
+therefore the recommended practice is to perform the call
+.PP
+.in +4n
+.EX
+ptrace(PTRACE_GETSIGINFO, pid, 0, &siginfo)
+.EE
+.in
+.PP
+The call can be avoided if the signal is not
+.BR SIGSTOP ,
+.BR SIGTSTP ,
+.BR SIGTTIN ,
+or
+.BR SIGTTOU ;
+only these four signals are stopping signals.
+If the tracer sees something else, it can't be a group-stop.
+Otherwise, the tracer needs to call
+.BR PTRACE_GETSIGINFO .
+If
+.B PTRACE_GETSIGINFO
+fails with
+.BR EINVAL ,
+then it is definitely a group-stop.
+(Other failure codes are possible, such as
+.B ESRCH
+("no such process") if a
+.B SIGKILL
+killed the tracee.)
+.PP
+If the tracee was attached using
+.BR PTRACE_SEIZE ,
+group-stop is indicated by
+.BR PTRACE_EVENT_STOP :
+.IR "status>>16 == PTRACE_EVENT_STOP" .
+This allows detection of group-stops
+without requiring an extra
+.B PTRACE_GETSIGINFO
+call.
+.PP
+As of Linux 2.6.38,
+after the tracer sees the tracee ptrace-stop and until it
+restarts or kills it, the tracee will not run,
+and will not send notifications (except
+.B SIGKILL
+death) to the tracer, even if the tracer enters into another
+.BR waitpid (2)
+call.
+.PP
+The kernel behavior described in the previous paragraph
+causes a problem with transparent handling of stopping signals.
+If the tracer restarts the tracee after group-stop,
+the stopping signal
+is effectively ignored\[em]the tracee doesn't remain stopped, it runs.
+If the tracer doesn't restart the tracee before entering into the next
+.BR waitpid (2),
+future
+.B SIGCONT
+signals will not be reported to the tracer;
+this would cause the
+.B SIGCONT
+signals to have no effect on the tracee.
+.PP
+Since Linux 3.4, there is a method to overcome this problem: instead of
+.BR PTRACE_CONT ,
+a
+.B PTRACE_LISTEN
+command can be used to restart a tracee in a way where it does not execute,
+but waits for a new event which it can report via
+.BR waitpid (2)
+(such as when
+it is restarted by a
+.BR SIGCONT ).
+.SS PTRACE_EVENT stops
+If the tracer sets
+.B PTRACE_O_TRACE_*
+options, the tracee will enter ptrace-stops called
+.B PTRACE_EVENT
+stops.
+.PP
+.B PTRACE_EVENT
+stops are observed by the tracer as
+.BR waitpid (2)
+returning with
+.IR WIFSTOPPED(status) ,
+and
+.I WSTOPSIG(status)
+returns
+.B SIGTRAP
+(or for
+.BR PTRACE_EVENT_STOP ,
+returns the stopping signal if tracee is in a group-stop).
+An additional bit is set in the higher byte of the status word:
+the value
+.I status>>8
+will be
+.PP
+.in +4n
+.EX
+((PTRACE_EVENT_foo<<8) | SIGTRAP).
+.EE
+.in
+.PP
+The following events exist:
+.TP
+.B PTRACE_EVENT_VFORK
+Stop before return from
+.BR vfork (2)
+or
+.BR clone (2)
+with the
+.B CLONE_VFORK
+flag.
+When the tracee is continued after this stop, it will wait for the child to
+exit/exec before continuing its execution
+(in other words, the usual behavior on
+.BR vfork (2)).
+.TP
+.B PTRACE_EVENT_FORK
+Stop before return from
+.BR fork (2)
+or
+.BR clone (2)
+with the exit signal set to
+.BR SIGCHLD .
+.TP
+.B PTRACE_EVENT_CLONE
+Stop before return from
+.BR clone (2).
+.TP
+.B PTRACE_EVENT_VFORK_DONE
+Stop before return from
+.BR vfork (2)
+or
+.BR clone (2)
+with the
+.B CLONE_VFORK
+flag,
+but after the child unblocked this tracee by exiting or execing.
+.PP
+For all four stops described above,
+the stop occurs in the parent (i.e., the tracee),
+not in the newly created thread.
+.B PTRACE_GETEVENTMSG
+can be used to retrieve the new thread's ID.
+.TP
+.B PTRACE_EVENT_EXEC
+Stop before return from
+.BR execve (2).
+Since Linux 3.0,
+.B PTRACE_GETEVENTMSG
+returns the former thread ID.
+.TP
+.B PTRACE_EVENT_EXIT
+Stop before exit (including death from
+.BR exit_group (2)),
+signal death, or exit caused by
+.BR execve (2)
+in a multithreaded process.
+.B PTRACE_GETEVENTMSG
+returns the exit status.
+Registers can be examined
+(unlike when "real" exit happens).
+The tracee is still alive; it needs to be
+.BR PTRACE_CONT ed
+or
+.BR PTRACE_DETACH ed
+to finish exiting.
+.TP
+.B PTRACE_EVENT_STOP
+Stop induced by
+.B PTRACE_INTERRUPT
+command, or group-stop, or initial ptrace-stop when a new child is attached
+(only if attached using
+.BR PTRACE_SEIZE ).
+.TP
+.B PTRACE_EVENT_SECCOMP
+Stop triggered by a
+.BR seccomp (2)
+rule on tracee syscall entry when
+.B PTRACE_O_TRACESECCOMP
+has been set by the tracer.
+The seccomp event message data (from the
+.B SECCOMP_RET_DATA
+portion of the seccomp filter rule) can be retrieved with
+.BR PTRACE_GETEVENTMSG .
+The semantics of this stop are described in
+detail in a separate section below.
+.PP
+.B PTRACE_GETSIGINFO
+on
+.B PTRACE_EVENT
+stops returns
+.B SIGTRAP
+in
+.IR si_signo ,
+with
+.I si_code
+set to
+.IR "(event<<8)\ |\ SIGTRAP" .
+.SS Syscall-stops
+If the tracee was restarted by
+.B PTRACE_SYSCALL
+or
+.BR PTRACE_SYSEMU ,
+the tracee enters
+syscall-enter-stop just prior to entering any system call (which
+will not be executed if the restart was using
+.BR PTRACE_SYSEMU ,
+regardless of any change made to registers at this point or how the
+tracee is restarted after this stop).
+No matter which method caused the syscall-enter-stop,
+if the tracer restarts the tracee with
+.BR PTRACE_SYSCALL ,
+the tracee enters syscall-exit-stop when the system call is finished,
+or if it is interrupted by a signal.
+(That is, signal-delivery-stop never happens between syscall-enter-stop
+and syscall-exit-stop; it happens
+.I after
+syscall-exit-stop.)
+If the tracee is continued using any other method (including
+.BR PTRACE_SYSEMU ),
+no syscall-exit-stop occurs.
+Note that all mentions of
+.B PTRACE_SYSEMU
+apply equally to
+.BR PTRACE_SYSEMU_SINGLESTEP .
+.PP
+However, even if the tracee was continued using
+.BR PTRACE_SYSCALL ,
+it is not guaranteed that the next stop will be a syscall-exit-stop.
+Other possibilities are that the tracee may stop in a
+.B PTRACE_EVENT
+stop (including seccomp stops), exit (if it entered
+.BR _exit (2)
+or
+.BR exit_group (2)),
+be killed by
+.BR SIGKILL ,
+or die silently (if it is a thread group leader, the
+.BR execve (2)
+happened in another thread,
+and that thread is not traced by the same tracer;
+this situation is discussed later).
+.PP
+Syscall-enter-stop and syscall-exit-stop are observed by the tracer as
+.BR waitpid (2)
+returning with
+.I WIFSTOPPED(status)
+true, and
+.I WSTOPSIG(status)
+giving
+.BR SIGTRAP .
+If the
+.B PTRACE_O_TRACESYSGOOD
+option was set by the tracer, then
+.I WSTOPSIG(status)
+will give the value
+.IR "(SIGTRAP\ |\ 0x80)" .
+.PP
+Syscall-stops can be distinguished from signal-delivery-stop with
+.B SIGTRAP
+by querying
+.B PTRACE_GETSIGINFO
+for the following cases:
+.TP
+.IR si_code " <= 0"
+.B SIGTRAP
+was delivered as a result of a user-space action,
+for example, a system call
+.RB ( tgkill (2),
+.BR kill (2),
+.BR sigqueue (3),
+etc.),
+expiration of a POSIX timer,
+change of state on a POSIX message queue,
+or completion of an asynchronous I/O request.
+.TP
+.IR si_code " == SI_KERNEL (0x80)"
+.B SIGTRAP
+was sent by the kernel.
+.TP
+.IR si_code " == SIGTRAP or " si_code " == (SIGTRAP|0x80)"
+This is a syscall-stop.
+.PP
+However, syscall-stops happen very often (twice per system call),
+and performing
+.B PTRACE_GETSIGINFO
+for every syscall-stop may be somewhat expensive.
+.PP
+Some architectures allow the cases to be distinguished
+by examining registers.
+For example, on x86,
+.I rax
+==
+.RB \- ENOSYS
+in syscall-enter-stop.
+Since
+.B SIGTRAP
+(like any other signal) always happens
+.I after
+syscall-exit-stop,
+and at this point
+.I rax
+almost never contains
+.RB \- ENOSYS ,
+the
+.B SIGTRAP
+looks like "syscall-stop which is not syscall-enter-stop";
+in other words, it looks like a
+"stray syscall-exit-stop" and can be detected this way.
+But such detection is fragile and is best avoided.
+.PP
+Using the
+.B PTRACE_O_TRACESYSGOOD
+option is the recommended method to distinguish syscall-stops
+from other kinds of ptrace-stops,
+since it is reliable and does not incur a performance penalty.
+.PP
+Syscall-enter-stop and syscall-exit-stop are
+indistinguishable from each other by the tracer.
+The tracer needs to keep track of the sequence of
+ptrace-stops in order to not misinterpret syscall-enter-stop as
+syscall-exit-stop or vice versa.
+In general, a syscall-enter-stop is
+always followed by syscall-exit-stop,
+.B PTRACE_EVENT
+stop, or the tracee's death;
+no other kinds of ptrace-stop can occur in between.
+However, note that seccomp stops (see below) can cause syscall-exit-stops
+without preceding syscall-enter-stops.
+If seccomp is in use, care needs
+to be taken not to misinterpret such stops as syscall-enter-stops.
+.PP
+If after syscall-enter-stop,
+the tracer uses a restarting command other than
+.BR PTRACE_SYSCALL ,
+syscall-exit-stop is not generated.
+.PP
+.B PTRACE_GETSIGINFO
+on syscall-stops returns
+.B SIGTRAP
+in
+.IR si_signo ,
+with
+.I si_code
+set to
+.B SIGTRAP
+or
+.IR (SIGTRAP|0x80) .
+.\"
+.SS PTRACE_EVENT_SECCOMP stops (Linux 3.5 to Linux 4.7)
+The behavior of
+.B PTRACE_EVENT_SECCOMP
+stops and their interaction with other kinds
+of ptrace stops has changed between kernel versions.
+This documents the behavior
+from their introduction until Linux 4.7 (inclusive).
+The behavior in later kernel versions is documented in the next section.
+.PP
+A
+.B PTRACE_EVENT_SECCOMP
+stop occurs whenever a
+.B SECCOMP_RET_TRACE
+rule is triggered.
+This is independent of which method was used to restart the system call.
+Notably, seccomp still runs even if the tracee was restarted using
+.B PTRACE_SYSEMU
+and this system call is unconditionally skipped.
+.PP
+Restarts from this stop will behave as if the stop had occurred right
+before the system call in question.
+In particular, both
+.B PTRACE_SYSCALL
+and
+.B PTRACE_SYSEMU
+will normally cause a subsequent syscall-entry-stop.
+However, if after the
+.B PTRACE_EVENT_SECCOMP
+the system call number is negative,
+both the syscall-entry-stop and the system call itself will be skipped.
+This means that if the system call number is negative after a
+.B PTRACE_EVENT_SECCOMP
+and the tracee is restarted using
+.BR PTRACE_SYSCALL ,
+the next observed stop will be a syscall-exit-stop,
+rather than the syscall-entry-stop that might have been expected.
+.\"
+.SS PTRACE_EVENT_SECCOMP stops (since Linux 4.8)
+Starting with Linux 4.8,
+.\" commit 93e35efb8de45393cf61ed07f7b407629bf698ea
+the
+.B PTRACE_EVENT_SECCOMP
+stop was reordered to occur between syscall-entry-stop and
+syscall-exit-stop.
+Note that seccomp no longer runs (and no
+.B PTRACE_EVENT_SECCOMP
+will be reported) if the system call is skipped due to
+.BR PTRACE_SYSEMU .
+.PP
+Functionally, a
+.B PTRACE_EVENT_SECCOMP
+stop functions comparably
+to a syscall-entry-stop (i.e., continuations using
+.B PTRACE_SYSCALL
+will cause syscall-exit-stops,
+the system call number may be changed and any other modified registers
+are visible to the to-be-executed system call as well).
+Note that there may have been,
+but need not have been, a preceding syscall-entry-stop.
+.PP
+After a
+.B PTRACE_EVENT_SECCOMP
+stop, seccomp will be rerun, with a
+.B SECCOMP_RET_TRACE
+rule now functioning the same as a
+.BR SECCOMP_RET_ALLOW .
+Specifically, this means that if registers are not modified during the
+.B PTRACE_EVENT_SECCOMP
+stop, the system call will then be allowed.
+.\"
+.SS PTRACE_SINGLESTEP stops
+[Details of these kinds of stops are yet to be documented.]
+.\"
+.\" FIXME .
+.\" document stops occurring with PTRACE_SINGLESTEP
+.\"
+.SS Informational and restarting ptrace commands
+Most ptrace commands (all except
+.BR PTRACE_ATTACH ,
+.BR PTRACE_SEIZE ,
+.BR PTRACE_TRACEME ,
+.BR PTRACE_INTERRUPT ,
+and
+.BR PTRACE_KILL )
+require the tracee to be in a ptrace-stop, otherwise they fail with
+.BR ESRCH .
+.PP
+When the tracee is in ptrace-stop,
+the tracer can read and write data to
+the tracee using informational commands.
+These commands leave the tracee in ptrace-stopped state:
+.PP
+.in +4n
+.EX
+ptrace(PTRACE_PEEKTEXT/PEEKDATA/PEEKUSER, pid, addr, 0);
+ptrace(PTRACE_POKETEXT/POKEDATA/POKEUSER, pid, addr, long_val);
+ptrace(PTRACE_GETREGS/GETFPREGS, pid, 0, &struct);
+ptrace(PTRACE_SETREGS/SETFPREGS, pid, 0, &struct);
+ptrace(PTRACE_GETREGSET, pid, NT_foo, &iov);
+ptrace(PTRACE_SETREGSET, pid, NT_foo, &iov);
+ptrace(PTRACE_GETSIGINFO, pid, 0, &siginfo);
+ptrace(PTRACE_SETSIGINFO, pid, 0, &siginfo);
+ptrace(PTRACE_GETEVENTMSG, pid, 0, &long_var);
+ptrace(PTRACE_SETOPTIONS, pid, 0, PTRACE_O_flags);
+.EE
+.in
+.PP
+Note that some errors are not reported.
+For example, setting signal information
+.RI ( siginfo )
+may have no effect in some ptrace-stops, yet the call may succeed
+(return 0 and not set
+.IR errno );
+querying
+.B PTRACE_GETEVENTMSG
+may succeed and return some random value if current ptrace-stop
+is not documented as returning a meaningful event message.
+.PP
+The call
+.PP
+.in +4n
+.EX
+ptrace(PTRACE_SETOPTIONS, pid, 0, PTRACE_O_flags);
+.EE
+.in
+.PP
+affects one tracee.
+The tracee's current flags are replaced.
+Flags are inherited by new tracees created and "auto-attached" via active
+.BR PTRACE_O_TRACEFORK ,
+.BR PTRACE_O_TRACEVFORK ,
+or
+.B PTRACE_O_TRACECLONE
+options.
+.PP
+Another group of commands makes the ptrace-stopped tracee run.
+They have the form:
+.PP
+.in +4n
+.EX
+ptrace(cmd, pid, 0, sig);
+.EE
+.in
+.PP
+where
+.I cmd
+is
+.BR PTRACE_CONT ,
+.BR PTRACE_LISTEN ,
+.BR PTRACE_DETACH ,
+.BR PTRACE_SYSCALL ,
+.BR PTRACE_SINGLESTEP ,
+.BR PTRACE_SYSEMU ,
+or
+.BR PTRACE_SYSEMU_SINGLESTEP .
+If the tracee is in signal-delivery-stop,
+.I sig
+is the signal to be injected (if it is nonzero).
+Otherwise,
+.I sig
+may be ignored.
+(When restarting a tracee from a ptrace-stop other than signal-delivery-stop,
+recommended practice is to always pass 0 in
+.IR sig .)
+.SS Attaching and detaching
+A thread can be attached to the tracer using the call
+.PP
+.in +4n
+.EX
+ptrace(PTRACE_ATTACH, pid, 0, 0);
+.EE
+.in
+.PP
+or
+.PP
+.in +4n
+.EX
+ptrace(PTRACE_SEIZE, pid, 0, PTRACE_O_flags);
+.EE
+.in
+.PP
+.B PTRACE_ATTACH
+sends
+.B SIGSTOP
+to this thread.
+If the tracer wants this
+.B SIGSTOP
+to have no effect, it needs to suppress it.
+Note that if other signals are concurrently sent to
+this thread during attach,
+the tracer may see the tracee enter signal-delivery-stop
+with other signal(s) first!
+The usual practice is to reinject these signals until
+.B SIGSTOP
+is seen, then suppress
+.B SIGSTOP
+injection.
+The design bug here is that a ptrace attach and a concurrently delivered
+.B SIGSTOP
+may race and the concurrent
+.B SIGSTOP
+may be lost.
+.\"
+.\" FIXME Describe how to attach to a thread which is already group-stopped.
+.PP
+Since attaching sends
+.B SIGSTOP
+and the tracer usually suppresses it, this may cause a stray
+.B EINTR
+return from the currently executing system call in the tracee,
+as described in the "Signal injection and suppression" section.
+.PP
+Since Linux 3.4,
+.B PTRACE_SEIZE
+can be used instead of
+.BR PTRACE_ATTACH .
+.B PTRACE_SEIZE
+does not stop the attached process.
+If you need to stop
+it after attach (or at any other time) without sending it any signals,
+use
+.B PTRACE_INTERRUPT
+command.
+.PP
+The request
+.PP
+.in +4n
+.EX
+ptrace(PTRACE_TRACEME, 0, 0, 0);
+.EE
+.in
+.PP
+turns the calling thread into a tracee.
+The thread continues to run (doesn't enter ptrace-stop).
+A common practice is to follow the
+.B PTRACE_TRACEME
+with
+.PP
+.in +4n
+.EX
+raise(SIGSTOP);
+.EE
+.in
+.PP
+and allow the parent (which is our tracer now) to observe our
+signal-delivery-stop.
+.PP
+If the
+.BR PTRACE_O_TRACEFORK ,
+.BR PTRACE_O_TRACEVFORK ,
+or
+.B PTRACE_O_TRACECLONE
+options are in effect, then children created by, respectively,
+.BR vfork (2)
+or
+.BR clone (2)
+with the
+.B CLONE_VFORK
+flag,
+.BR fork (2)
+or
+.BR clone (2)
+with the exit signal set to
+.BR SIGCHLD ,
+and other kinds of
+.BR clone (2),
+are automatically attached to the same tracer which traced their parent.
+.B SIGSTOP
+is delivered to the children, causing them to enter
+signal-delivery-stop after they exit the system call which created them.
+.PP
+Detaching of the tracee is performed by:
+.PP
+.in +4n
+.EX
+ptrace(PTRACE_DETACH, pid, 0, sig);
+.EE
+.in
+.PP
+.B PTRACE_DETACH
+is a restarting operation;
+therefore it requires the tracee to be in ptrace-stop.
+If the tracee is in signal-delivery-stop, a signal can be injected.
+Otherwise, the
+.I sig
+parameter may be silently ignored.
+.PP
+If the tracee is running when the tracer wants to detach it,
+the usual solution is to send
+.B SIGSTOP
+(using
+.BR tgkill (2),
+to make sure it goes to the correct thread),
+wait for the tracee to stop in signal-delivery-stop for
+.B SIGSTOP
+and then detach it (suppressing
+.B SIGSTOP
+injection).
+A design bug is that this can race with concurrent
+.BR SIGSTOP s.
+Another complication is that the tracee may enter other ptrace-stops
+and needs to be restarted and waited for again, until
+.B SIGSTOP
+is seen.
+Yet another complication is to be sure that
+the tracee is not already ptrace-stopped,
+because no signal delivery happens while it is\[em]not even
+.BR SIGSTOP .
+.\" FIXME Describe how to detach from a group-stopped tracee so that it
+.\" doesn't run, but continues to wait for SIGCONT.
+.PP
+If the tracer dies, all tracees are automatically detached and restarted,
+unless they were in group-stop.
+Handling of restart from group-stop is currently buggy,
+but the "as planned" behavior is to leave tracee stopped and waiting for
+.BR SIGCONT .
+If the tracee is restarted from signal-delivery-stop,
+the pending signal is injected.
+.SS execve(2) under ptrace
+.\" clone(2) CLONE_THREAD says:
+.\" If any of the threads in a thread group performs an execve(2),
+.\" then all threads other than the thread group leader are terminated,
+.\" and the new program is executed in the thread group leader.
+.\"
+When one thread in a multithreaded process calls
+.BR execve (2),
+the kernel destroys all other threads in the process,
+.\" In Linux 3.1 sources, see fs/exec.c::de_thread()
+and resets the thread ID of the execing thread to the
+thread group ID (process ID).
+(Or, to put things another way, when a multithreaded process does an
+.BR execve (2),
+at completion of the call, it appears as though the
+.BR execve (2)
+occurred in the thread group leader, regardless of which thread did the
+.BR execve (2).)
+This resetting of the thread ID looks very confusing to tracers:
+.IP \[bu] 3
+All other threads stop in
+.B PTRACE_EVENT_EXIT
+stop, if the
+.B PTRACE_O_TRACEEXIT
+option was turned on.
+Then all other threads except the thread group leader report
+death as if they exited via
+.BR _exit (2)
+with exit code 0.
+.IP \[bu]
+The execing tracee changes its thread ID while it is in the
+.BR execve (2).
+(Remember, under ptrace, the "pid" returned from
+.BR waitpid (2),
+or fed into ptrace calls, is the tracee's thread ID.)
+That is, the tracee's thread ID is reset to be the same as its process ID,
+which is the same as the thread group leader's thread ID.
+.IP \[bu]
+Then a
+.B PTRACE_EVENT_EXEC
+stop happens, if the
+.B PTRACE_O_TRACEEXEC
+option was turned on.
+.IP \[bu]
+If the thread group leader has reported its
+.B PTRACE_EVENT_EXIT
+stop by this time,
+it appears to the tracer that
+the dead thread leader "reappears from nowhere".
+(Note: the thread group leader does not report death via
+.I WIFEXITED(status)
+as long as there is at least one other live thread.
+This eliminates the possibility that the tracer will see
+it dying and then reappearing.)
+If the thread group leader was still alive,
+for the tracer this may look as if thread group leader
+returns from a different system call than it entered,
+or even "returned from a system call even though
+it was not in any system call".
+If the thread group leader was not traced
+(or was traced by a different tracer), then during
+.BR execve (2)
+it will appear as if it has become a tracee of
+the tracer of the execing tracee.
+.PP
+All of the above effects are the artifacts of
+the thread ID change in the tracee.
+.PP
+The
+.B PTRACE_O_TRACEEXEC
+option is the recommended tool for dealing with this situation.
+First, it enables
+.B PTRACE_EVENT_EXEC
+stop,
+which occurs before
+.BR execve (2)
+returns.
+In this stop, the tracer can use
+.B PTRACE_GETEVENTMSG
+to retrieve the tracee's former thread ID.
+(This feature was introduced in Linux 3.0.)
+Second, the
+.B PTRACE_O_TRACEEXEC
+option disables legacy
+.B SIGTRAP
+generation on
+.BR execve (2).
+.PP
+When the tracer receives
+.B PTRACE_EVENT_EXEC
+stop notification,
+it is guaranteed that, except for this tracee and the thread group leader,
+no other threads from the process are alive.
+.PP
+On receiving the
+.B PTRACE_EVENT_EXEC
+stop notification,
+the tracer should clean up all its internal
+data structures describing the threads of this process,
+and retain only one data structure\[em]one which
+describes the single still running tracee, with
+.PP
+.in +4n
+.EX
+thread ID == thread group ID == process ID.
+.EE
+.in
+.PP
+Example: two threads call
+.BR execve (2)
+at the same time:
+.PP
+.nf
+*** we get syscall-enter-stop in thread 1: **
+PID1 execve("/bin/foo", "foo" <unfinished ...>
+*** we issue PTRACE_SYSCALL for thread 1 **
+*** we get syscall-enter-stop in thread 2: **
+PID2 execve("/bin/bar", "bar" <unfinished ...>
+*** we issue PTRACE_SYSCALL for thread 2 **
+*** we get PTRACE_EVENT_EXEC for PID0, we issue PTRACE_SYSCALL **
+*** we get syscall-exit-stop for PID0: **
+PID0 <... execve resumed> ) = 0
+.fi
+.PP
+If the
+.B PTRACE_O_TRACEEXEC
+option is
+.I not
+in effect for the execing tracee,
+and if the tracee was
+.BR PTRACE_ATTACH ed
+rather than
+.BR PTRACE_SEIZE d,
+the kernel delivers an extra
+.B SIGTRAP
+to the tracee after
+.BR execve (2)
+returns.
+This is an ordinary signal (similar to one which can be
+generated by
+.IR "kill \-TRAP" ),
+not a special kind of ptrace-stop.
+Employing
+.B PTRACE_GETSIGINFO
+for this signal returns
+.I si_code
+set to 0
+.RI ( SI_USER ).
+This signal may be blocked by signal mask,
+and thus may be delivered (much) later.
+.PP
+Usually, the tracer (for example,
+.BR strace (1))
+would not want to show this extra post-execve
+.B SIGTRAP
+signal to the user, and would suppress its delivery to the tracee (if
+.B SIGTRAP
+is set to
+.BR SIG_DFL ,
+it is a killing signal).
+However, determining
+.I which
+.B SIGTRAP
+to suppress is not easy.
+Setting the
+.B PTRACE_O_TRACEEXEC
+option or using
+.B PTRACE_SEIZE
+and thus suppressing this extra
+.B SIGTRAP
+is the recommended approach.
+.SS Real parent
+The ptrace API (ab)uses the standard UNIX parent/child signaling over
+.BR waitpid (2).
+This used to cause the real parent of the process to stop receiving
+several kinds of
+.BR waitpid (2)
+notifications when the child process is traced by some other process.
+.PP
+Many of these bugs have been fixed, but as of Linux 2.6.38 several still
+exist; see BUGS below.
+.PP
+As of Linux 2.6.38, the following is believed to work correctly:
+.IP \[bu] 3
+exit/death by signal is reported first to the tracer, then,
+when the tracer consumes the
+.BR waitpid (2)
+result, to the real parent (to the real parent only when the
+whole multithreaded process exits).
+If the tracer and the real parent are the same process,
+the report is sent only once.
+.SH RETURN VALUE
+On success, the
+.B PTRACE_PEEK*
+requests return the requested data (but see NOTES),
+the
+.B PTRACE_SECCOMP_GET_FILTER
+request returns the number of instructions in the BPF program,
+the
+.B PTRACE_GET_SYSCALL_INFO
+request returns the number of bytes available to be written by the kernel,
+and other requests return zero.
+.PP
+On error, all requests return \-1, and
+.I errno
+is set to indicate the error.
+Since the value returned by a successful
+.B PTRACE_PEEK*
+request may be \-1, the caller must clear
+.I errno
+before the call, and then check it afterward
+to determine whether or not an error occurred.
+.SH ERRORS
+.TP
+.B EBUSY
+(i386 only) There was an error with allocating or freeing a debug register.
+.TP
+.B EFAULT
+There was an attempt to read from or write to an invalid area in
+the tracer's or the tracee's memory,
+probably because the area wasn't mapped or accessible.
+Unfortunately, under Linux, different variations of this fault
+will return
+.B EIO
+or
+.B EFAULT
+more or less arbitrarily.
+.TP
+.B EINVAL
+An attempt was made to set an invalid option.
+.TP
+.B EIO
+.I request
+is invalid, or an attempt was made to read from or
+write to an invalid area in the tracer's or the tracee's memory,
+or there was a word-alignment violation,
+or an invalid signal was specified during a restart request.
+.TP
+.B EPERM
+The specified process cannot be traced.
+This could be because the
+tracer has insufficient privileges (the required capability is
+.BR CAP_SYS_PTRACE );
+unprivileged processes cannot trace processes that they
+cannot send signals to or those running
+set-user-ID/set-group-ID programs, for obvious reasons.
+Alternatively, the process may already be being traced,
+or (before Linux 2.6.26) be
+.BR init (1)
+(PID 1).
+.TP
+.B ESRCH
+The specified process does not exist, or is not currently being traced
+by the caller, or is not stopped
+(for requests that require a stopped tracee).
+.SH STANDARDS
+None.
+.SH HISTORY
+SVr4, 4.3BSD.
+.PP
+Before Linux 2.6.26,
+.\" See commit 00cd5c37afd5f431ac186dd131705048c0a11fdb
+.BR init (1),
+the process with PID 1, could not be traced.
+.SH NOTES
+Although arguments to
+.BR ptrace ()
+are interpreted according to the prototype given,
+glibc currently declares
+.BR ptrace ()
+as a variadic function with only the
+.I request
+argument fixed.
+It is recommended to always supply four arguments,
+even if the requested operation does not use them,
+setting unused/ignored arguments to
+.I 0L
+or
+.IR "(void\ *)\ 0" .
+.PP
+A tracee's parent continues to be the tracer even if that tracer calls
+.BR execve (2).
+.PP
+The layout of the contents of memory and the USER area are
+quite operating-system- and architecture-specific.
+The offset supplied, and the data returned,
+might not entirely match with the definition of
+.IR "struct user" .
+.\" See http://lkml.org/lkml/2008/5/8/375
+.PP
+The size of a "word" is determined by the operating-system variant
+(e.g., for 32-bit Linux it is 32 bits).
+.PP
+This page documents the way the
+.BR ptrace ()
+call works currently in Linux.
+Its behavior differs significantly on other flavors of UNIX.
+In any case, use of
+.BR ptrace ()
+is highly specific to the operating system and architecture.
+.\"
+.\"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+.\"
+.SS Ptrace access mode checking
+Various parts of the kernel-user-space API (not just
+.BR ptrace ()
+operations) require so-called "ptrace access mode" checks,
+whose outcome determines whether an operation is permitted
+(or, in a few cases, causes a "read" operation to return sanitized data).
+These checks are performed in cases where one process can
+inspect sensitive information about,
+or in some cases modify the state of, another process.
+The checks are based on factors such as the credentials and capabilities
+of the two processes,
+whether or not the "target" process is dumpable,
+and the results of checks performed by any enabled Linux Security Module
+(LSM)\[em]for example, SELinux, Yama, or Smack\[em]and by the commoncap LSM
+(which is always invoked).
+.PP
+Prior to Linux 2.6.27, all access checks were of a single type.
+Since Linux 2.6.27,
+.\" commit 006ebb40d3d65338bd74abb03b945f8d60e362bd
+two access mode levels are distinguished:
+.TP
+.B PTRACE_MODE_READ
+For "read" operations or other operations that are less dangerous,
+such as:
+.BR get_robust_list (2);
+.BR kcmp (2);
+reading
+.IR /proc/ pid /auxv ,
+.IR /proc/ pid /environ ,
+or
+.IR /proc/ pid /stat ;
+or
+.BR readlink (2)
+of a
+.IR /proc/ pid /ns/*
+file.
+.TP
+.B PTRACE_MODE_ATTACH
+For "write" operations, or other operations that are more dangerous,
+such as: ptrace attaching
+.RB ( PTRACE_ATTACH )
+to another process
+or calling
+.BR process_vm_writev (2).
+.RB ( PTRACE_MODE_ATTACH
+was effectively the default before Linux 2.6.27.)
+.\"
+.\" Regarding the above description of the distinction between
+.\" PTRACE_MODE_READ and PTRACE_MODE_ATTACH, Stephen Smalley notes:
+.\"
+.\" That was the intent when the distinction was introduced, but it doesn't
+.\" appear to have been properly maintained, e.g. there is now a common
+.\" helper lock_trace() that is used for
+.\" /proc/pid/{stack,syscall,personality} but checks PTRACE_MODE_ATTACH, and
+.\" PTRACE_MODE_ATTACH is also used in timerslack_ns_write/show(). Likely
+.\" should review and make them consistent. There was also some debate
+.\" about proper handling of /proc/pid/fd. Arguably that one might belong
+.\" back in the _ATTACH camp.
+.\"
+.PP
+Since Linux 4.5,
+.\" commit caaee6234d05a58c5b4d05e7bf766131b810a657
+the above access mode checks are combined (ORed) with
+one of the following modifiers:
+.TP
+.B PTRACE_MODE_FSCREDS
+Use the caller's filesystem UID and GID (see
+.BR credentials (7))
+or effective capabilities for LSM checks.
+.TP
+.B PTRACE_MODE_REALCREDS
+Use the caller's real UID and GID or permitted capabilities for LSM checks.
+This was effectively the default before Linux 4.5.
+.PP
+Because combining one of the credential modifiers with one of
+the aforementioned access modes is typical,
+some macros are defined in the kernel sources for the combinations:
+.TP
+.B PTRACE_MODE_READ_FSCREDS
+Defined as
+.BR "PTRACE_MODE_READ | PTRACE_MODE_FSCREDS" .
+.TP
+.B PTRACE_MODE_READ_REALCREDS
+Defined as
+.BR "PTRACE_MODE_READ | PTRACE_MODE_REALCREDS" .
+.TP
+.B PTRACE_MODE_ATTACH_FSCREDS
+Defined as
+.BR "PTRACE_MODE_ATTACH | PTRACE_MODE_FSCREDS" .
+.TP
+.B PTRACE_MODE_ATTACH_REALCREDS
+Defined as
+.BR "PTRACE_MODE_ATTACH | PTRACE_MODE_REALCREDS" .
+.PP
+One further modifier can be ORed with the access mode:
+.TP
+.BR PTRACE_MODE_NOAUDIT " (since Linux 3.3)"
+.\" commit 69f594a38967f4540ce7a29b3fd214e68a8330bd
+.\" Just for /proc/pid/stat
+Don't audit this access mode check.
+This modifier is employed for ptrace access mode checks
+(such as checks when reading
+.IR /proc/ pid /stat )
+that merely cause the output to be filtered or sanitized,
+rather than causing an error to be returned to the caller.
+In these cases, accessing the file is not a security violation and
+there is no reason to generate a security audit record.
+This modifier suppresses the generation of
+such an audit record for the particular access check.
+.PP
+Note that all of the
+.B PTRACE_MODE_*
+constants described in this subsection are kernel-internal,
+and not visible to user space.
+The constant names are mentioned here in order to label the various kinds of
+ptrace access mode checks that are performed for various system calls
+and accesses to various pseudofiles (e.g., under
+.IR /proc ).
+These names are used in other manual pages to provide a simple
+shorthand for labeling the different kernel checks.
+.PP
+The algorithm employed for ptrace access mode checking determines whether
+the calling process is allowed to perform the corresponding action
+on the target process.
+(In the case of opening
+.IR /proc/ pid
+files, the "calling process" is the one opening the file,
+and the process with the corresponding PID is the "target process".)
+The algorithm is as follows:
+.IP (1) 5
+If the calling thread and the target thread are in the same
+thread group, access is always allowed.
+.IP (2)
+If the access mode specifies
+.BR PTRACE_MODE_FSCREDS ,
+then, for the check in the next step,
+employ the caller's filesystem UID and GID.
+(As noted in
+.BR credentials (7),
+the filesystem UID and GID almost always have the same values
+as the corresponding effective IDs.)
+.IP
+Otherwise, the access mode specifies
+.BR PTRACE_MODE_REALCREDS ,
+so use the caller's real UID and GID for the checks in the next step.
+(Most APIs that check the caller's UID and GID use the effective IDs.
+For historical reasons, the
+.B PTRACE_MODE_REALCREDS
+check uses the real IDs instead.)
+.IP (3)
+Deny access if
+.I neither
+of the following is true:
+.RS
+.IP \[bu] 3
+The real, effective, and saved-set user IDs of the target
+match the caller's user ID,
+.I and
+the real, effective, and saved-set group IDs of the target
+match the caller's group ID.
+.IP \[bu]
+The caller has the
+.B CAP_SYS_PTRACE
+capability in the user namespace of the target.
+.RE
+.IP (4)
+Deny access if the target process "dumpable" attribute has a value other than 1
+.RB ( SUID_DUMP_USER ;
+see the discussion of
+.B PR_SET_DUMPABLE
+in
+.BR prctl (2)),
+and the caller does not have the
+.B CAP_SYS_PTRACE
+capability in the user namespace of the target process.
+.IP (5)
+The kernel LSM
+.IR security_ptrace_access_check ()
+interface is invoked to see if ptrace access is permitted.
+The results depend on the LSM(s).
+The implementation of this interface in the commoncap LSM performs
+the following steps:
+.\" (in cap_ptrace_access_check()):
+.RS
+.IP (5.1) 7
+If the access mode includes
+.BR PTRACE_MODE_FSCREDS ,
+then use the caller's
+.I effective
+capability set
+in the following check;
+otherwise (the access mode specifies
+.BR PTRACE_MODE_REALCREDS ,
+so) use the caller's
+.I permitted
+capability set.
+.IP (5.2)
+Deny access if
+.I neither
+of the following is true:
+.RS
+.IP \[bu] 3
+The caller and the target process are in the same user namespace,
+and the caller's capabilities are a superset of the target process's
+.I permitted
+capabilities.
+.IP \[bu]
+The caller has the
+.B CAP_SYS_PTRACE
+capability in the target process's user namespace.
+.RE
+.IP
+Note that the commoncap LSM does not distinguish between
+.B PTRACE_MODE_READ
+and
+.BR PTRACE_MODE_ATTACH .
+.RE
+.IP (6)
+If access has not been denied by any of the preceding steps,
+then access is allowed.
+.\"
+.\"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+.\"
+.SS /proc/sys/kernel/yama/ptrace_scope
+On systems with the Yama Linux Security Module (LSM) installed
+(i.e., the kernel was configured with
+.BR CONFIG_SECURITY_YAMA ),
+the
+.I /proc/sys/kernel/yama/ptrace_scope
+file (available since Linux 3.4)
+.\" commit 2d514487faf188938a4ee4fb3464eeecfbdcf8eb
+can be used to restrict the ability to trace a process with
+.BR ptrace ()
+(and thus also the ability to use tools such as
+.BR strace (1)
+and
+.BR gdb (1)).
+The goal of such restrictions is to prevent attack escalation whereby
+a compromised process can ptrace-attach to other sensitive processes
+(e.g., a GPG agent or an SSH session) owned by the user in order
+to gain additional credentials that may exist in memory
+and thus expand the scope of the attack.
+.PP
+More precisely, the Yama LSM limits two types of operations:
+.IP \[bu] 3
+Any operation that performs a ptrace access mode
+.B PTRACE_MODE_ATTACH
+check\[em]for example,
+.BR ptrace ()
+.BR PTRACE_ATTACH .
+(See the "Ptrace access mode checking" discussion above.)
+.IP \[bu]
+.BR ptrace ()
+.BR PTRACE_TRACEME .
+.PP
+A process that has the
+.B CAP_SYS_PTRACE
+capability can update the
+.I /proc/sys/kernel/yama/ptrace_scope
+file with one of the following values:
+.TP
+0 ("classic ptrace permissions")
+No additional restrictions on operations that perform
+.B PTRACE_MODE_ATTACH
+checks (beyond those imposed by the commoncap and other LSMs).
+.IP
+The use of
+.B PTRACE_TRACEME
+is unchanged.
+.TP
+1 ("restricted ptrace") [default value]
+When performing an operation that requires a
+.B PTRACE_MODE_ATTACH
+check, the calling process must either have the
+.B CAP_SYS_PTRACE
+capability in the user namespace of the target process or
+it must have a predefined relationship with the target process.
+By default,
+the predefined relationship is that the target process
+must be a descendant of the caller.
+.IP
+A target process can employ the
+.BR prctl (2)
+.B PR_SET_PTRACER
+operation to declare an additional PID that is allowed to perform
+.B PTRACE_MODE_ATTACH
+operations on the target.
+See the kernel source file
+.I Documentation/admin\-guide/LSM/Yama.rst
+.\" commit 90bb766440f2147486a2acc3e793d7b8348b0c22
+(or
+.I Documentation/security/Yama.txt
+before Linux 4.13)
+for further details.
+.IP
+The use of
+.B PTRACE_TRACEME
+is unchanged.
+.TP
+2 ("admin-only attach")
+Only processes with the
+.B CAP_SYS_PTRACE
+capability in the user namespace of the target process may perform
+.B PTRACE_MODE_ATTACH
+operations or trace children that employ
+.BR PTRACE_TRACEME .
+.TP
+3 ("no attach")
+No process may perform
+.B PTRACE_MODE_ATTACH
+operations or trace children that employ
+.BR PTRACE_TRACEME .
+.IP
+Once this value has been written to the file, it cannot be changed.
+.PP
+With respect to values 1 and 2,
+note that creating a new user namespace effectively removes the
+protection offered by Yama.
+This is because a process in the parent user namespace whose effective
+UID matches the UID of the creator of a child namespace
+has all capabilities (including
+.BR CAP_SYS_PTRACE )
+when performing operations within the child user namespace
+(and further-removed descendants of that namespace).
+Consequently, when a process tries to use user namespaces to sandbox itself,
+it inadvertently weakens the protections offered by the Yama LSM.
+.\"
+.\"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+.\"
+.SS C library/kernel differences
+At the system call level, the
+.BR PTRACE_PEEKTEXT ,
+.BR PTRACE_PEEKDATA ,
+and
+.B PTRACE_PEEKUSER
+requests have a different API: they store the result
+at the address specified by the
+.I data
+parameter, and the return value is the error flag.
+The glibc wrapper function provides the API given in DESCRIPTION above,
+with the result being returned via the function return value.
+.SH BUGS
+On hosts with Linux 2.6 kernel headers,
+.B PTRACE_SETOPTIONS
+is declared with a different value than the one for Linux 2.4.
+This leads to applications compiled with Linux 2.6 kernel
+headers failing when run on Linux 2.4.
+This can be worked around by redefining
+.B PTRACE_SETOPTIONS
+to
+.BR PTRACE_OLDSETOPTIONS ,
+if that is defined.
+.PP
+Group-stop notifications are sent to the tracer, but not to the real parent.
+Last confirmed on 2.6.38.6.
+.PP
+If a thread group leader is traced and exits by calling
+.BR _exit (2),
+.\" Note from Denys Vlasenko:
+.\" Here "exits" means any kind of death - _exit, exit_group,
+.\" signal death. Signal death and exit_group cases are trivial,
+.\" though: since signal death and exit_group kill all other threads
+.\" too, "until all other threads exit" thing happens rather soon
+.\" in these cases. Therefore, only _exit presents observably
+.\" puzzling behavior to ptrace users: thread leader _exit's,
+.\" but WIFEXITED isn't reported! We are trying to explain here
+.\" why it is so.
+a
+.B PTRACE_EVENT_EXIT
+stop will happen for it (if requested), but the subsequent
+.B WIFEXITED
+notification will not be delivered until all other threads exit.
+As explained above, if one of the other threads calls
+.BR execve (2),
+the death of the thread group leader will
+.I never
+be reported.
+If the execed thread is not traced by this tracer,
+the tracer will never know that
+.BR execve (2)
+happened.
+One possible workaround is to
+.B PTRACE_DETACH
+the thread group leader instead of restarting it in this case.
+Last confirmed on 2.6.38.6.
+.\" FIXME . need to test/verify this scenario
+.PP
+A
+.B SIGKILL
+signal may still cause a
+.B PTRACE_EVENT_EXIT
+stop before actual signal death.
+This may be changed in the future;
+.B SIGKILL
+is meant to always immediately kill tasks even under ptrace.
+Last confirmed on Linux 3.13.
+.PP
+Some system calls return with
+.B EINTR
+if a signal was sent to a tracee, but delivery was suppressed by the tracer.
+(This is a very typical operation: it is usually
+done by debuggers on every attach, in order to not introduce
+a bogus
+.BR SIGSTOP ).
+As of Linux 3.2.9, the following system calls are affected
+(this list is likely incomplete):
+.BR epoll_wait (2),
+and
+.BR read (2)
+from an
+.BR inotify (7)
+file descriptor.
+The usual symptom of this bug is that when you attach to
+a quiescent process with the command
+.PP
+.in +4n
+.EX
+strace \-p <process\-ID>
+.EE
+.in
+.PP
+then, instead of the usual
+and expected one-line output such as
+.PP
+.in +4n
+.EX
+restart_syscall(<... resuming interrupted call ...>_
+.EE
+.in
+.PP
+or
+.PP
+.in +4n
+.EX
+select(6, [5], NULL, [5], NULL_
+.EE
+.in
+.PP
+('_' denotes the cursor position), you observe more than one line.
+For example:
+.PP
+.in +4n
+.EX
+ clock_gettime(CLOCK_MONOTONIC, {15370, 690928118}) = 0
+ epoll_wait(4,_
+.EE
+.in
+.PP
+What is not visible here is that the process was blocked in
+.BR epoll_wait (2)
+before
+.BR strace (1)
+attached to it.
+Attaching caused
+.BR epoll_wait (2)
+to return to user space with the error
+.BR EINTR .
+In this particular case, the program reacted to
+.B EINTR
+by checking the current time, and then executing
+.BR epoll_wait (2)
+again.
+(Programs which do not expect such "stray"
+.B EINTR
+errors may behave in an unintended way upon an
+.BR strace (1)
+attach.)
+.PP
+Contrary to the normal rules, the glibc wrapper for
+.BR ptrace ()
+can set
+.I errno
+to zero.
+.SH SEE ALSO
+.BR gdb (1),
+.BR ltrace (1),
+.BR strace (1),
+.BR clone (2),
+.BR execve (2),
+.BR fork (2),
+.BR gettid (2),
+.BR prctl (2),
+.BR seccomp (2),
+.BR sigaction (2),
+.BR tgkill (2),
+.BR vfork (2),
+.BR waitpid (2),
+.BR exec (3),
+.BR capabilities (7),
+.BR signal (7)
diff --git a/man2/putmsg.2 b/man2/putmsg.2
new file mode 100644
index 0000000..5d25ea6
--- /dev/null
+++ b/man2/putmsg.2
@@ -0,0 +1 @@
+.so man2/unimplemented.2
diff --git a/man2/putpmsg.2 b/man2/putpmsg.2
new file mode 100644
index 0000000..5d25ea6
--- /dev/null
+++ b/man2/putpmsg.2
@@ -0,0 +1 @@
+.so man2/unimplemented.2
diff --git a/man2/pwrite.2 b/man2/pwrite.2
new file mode 100644
index 0000000..87eacb2
--- /dev/null
+++ b/man2/pwrite.2
@@ -0,0 +1 @@
+.so man2/pread.2
diff --git a/man2/pwrite64.2 b/man2/pwrite64.2
new file mode 100644
index 0000000..9290e0a
--- /dev/null
+++ b/man2/pwrite64.2
@@ -0,0 +1 @@
+.so man2/pwrite.2
diff --git a/man2/pwritev.2 b/man2/pwritev.2
new file mode 100644
index 0000000..54e3384
--- /dev/null
+++ b/man2/pwritev.2
@@ -0,0 +1 @@
+.so man2/readv.2
diff --git a/man2/pwritev2.2 b/man2/pwritev2.2
new file mode 100644
index 0000000..54e3384
--- /dev/null
+++ b/man2/pwritev2.2
@@ -0,0 +1 @@
+.so man2/readv.2
diff --git a/man2/query_module.2 b/man2/query_module.2
new file mode 100644
index 0000000..519650a
--- /dev/null
+++ b/man2/query_module.2
@@ -0,0 +1,194 @@
+.\" Copyright (C) 1996 Free Software Foundation, Inc.
+.\"
+.\" SPDX-License-Identifier: GPL-1.0-or-later
+.\"
+.\" 2006-02-09, some reformatting by Luc Van Oostenryck; some
+.\" reformatting and rewordings by mtk
+.\"
+.TH query_module 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+query_module \- query the kernel for various bits pertaining to modules
+.SH SYNOPSIS
+.nf
+.B #include <linux/module.h>
+.PP
+.BI "[[deprecated]] int query_module(const char *" name ", int " which ,
+.BI " void " buf [. bufsize "], \
+size_t " bufsize ,
+.BI " size_t *" ret );
+.fi
+.SH DESCRIPTION
+.IR Note :
+This system call is present only before Linux 2.6.
+.PP
+.BR query_module ()
+requests information from the kernel about loadable modules.
+The returned information is placed in the buffer pointed to by
+.IR buf .
+The caller must specify the size of
+.I buf
+in
+.IR bufsize .
+The precise nature and format of the returned information
+depend on the operation specified by
+.IR which .
+Some operations require
+.I name
+to identify a currently loaded module, some allow
+.I name
+to be NULL, indicating the kernel proper.
+.PP
+The following values can be specified for
+.IR which :
+.TP
+.B 0
+Returns success, if the kernel supports
+.BR query_module ().
+Used to probe for availability of the system call.
+.TP
+.B QM_MODULES
+Returns the names of all loaded modules.
+The returned buffer consists of a sequence of null-terminated strings;
+.I ret
+is set to the number of
+modules.
+.\" ret is set on ENOSPC
+.TP
+.B QM_DEPS
+Returns the names of all modules used by the indicated module.
+The returned buffer consists of a sequence of null-terminated strings;
+.I ret
+is set to the number of modules.
+.\" ret is set on ENOSPC
+.TP
+.B QM_REFS
+Returns the names of all modules using the indicated module.
+This is the inverse of
+.BR QM_DEPS .
+The returned buffer consists of a sequence of null-terminated strings;
+.I ret
+is set to the number of modules.
+.\" ret is set on ENOSPC
+.TP
+.B QM_SYMBOLS
+Returns the symbols and values exported by the kernel or the indicated
+module.
+The returned buffer is an array of structures of the following form
+.\" ret is set on ENOSPC
+.IP
+.in +4n
+.EX
+struct module_symbol {
+ unsigned long value;
+ unsigned long name;
+};
+.EE
+.in
+.IP
+followed by null-terminated strings.
+The value of
+.I name
+is the character offset of the string relative to the start of
+.IR buf ;
+.I ret
+is set to the number of symbols.
+.TP
+.B QM_INFO
+Returns miscellaneous information about the indicated module.
+The output buffer format is:
+.IP
+.in +4n
+.EX
+struct module_info {
+ unsigned long address;
+ unsigned long size;
+ unsigned long flags;
+};
+.EE
+.in
+.IP
+where
+.I address
+is the kernel address at which the module resides,
+.I size
+is the size of the module in bytes, and
+.I flags
+is a mask of
+.BR MOD_RUNNING ,
+.BR MOD_AUTOCLEAN ,
+and so on, that indicates the current status of the module
+(see the Linux kernel source file
+.IR include/linux/module.h ).
+.I ret
+is set to the size of the
+.I module_info
+structure.
+.SH RETURN VALUE
+On success, zero is returned.
+On error, \-1 is returned and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EFAULT
+At least one of
+.IR name ,
+.IR buf ,
+or
+.I ret
+was outside the program's accessible address space.
+.TP
+.B EINVAL
+Invalid
+.IR which ;
+or
+.I name
+is NULL (indicating "the kernel"),
+but this is not permitted with the specified value of
+.IR which .
+.\" Not permitted with QM_DEPS, QM_REFS, or QM_INFO.
+.TP
+.B ENOENT
+No module by that
+.I name
+exists.
+.TP
+.B ENOSPC
+The buffer size provided was too small.
+.I ret
+is set to the minimum size needed.
+.TP
+.B ENOSYS
+.BR query_module ()
+is not supported in this version of the kernel
+(e.g., Linux 2.6 or later).
+.SH STANDARDS
+Linux.
+.SH VERSIONS
+Removed in Linux 2.6.
+.\" Removed in Linux 2.5.48
+.PP
+Some of the information that was formerly available via
+.BR query_module ()
+can be obtained from
+.IR /proc/modules ,
+.IR /proc/kallsyms ,
+and the files under the directory
+.IR /sys/module .
+.PP
+The
+.BR query_module ()
+system call is not supported by glibc.
+No declaration is provided in glibc headers, but,
+through a quirk of history, glibc does export an ABI for this system call.
+Therefore, in order to employ this system call,
+it is sufficient to manually declare the interface in your code;
+alternatively, you can invoke the system call using
+.BR syscall (2).
+.SH SEE ALSO
+.BR create_module (2),
+.BR delete_module (2),
+.BR get_kernel_syms (2),
+.BR init_module (2),
+.BR lsmod (8),
+.BR modinfo (8)
diff --git a/man2/quotactl.2 b/man2/quotactl.2
new file mode 100644
index 0000000..716f934
--- /dev/null
+++ b/man2/quotactl.2
@@ -0,0 +1,806 @@
+.\" Copyright (c) 2010, Jan Kara
+.\" A few pieces copyright (c) 1996 Andries Brouwer (aeb@cwi.nl)
+.\" and copyright 2010 (c) Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.TH quotactl 2 2023-05-03 "Linux man-pages 6.05.01"
+.SH NAME
+quotactl \- manipulate disk quotas
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/quota.h>
+.BR "#include <xfs/xqm.h>" " /* Definition of " Q_X* " and " XFS_QUOTA_* \
+" constants"
+.RB " (or " <linux/dqblk_xfs.h> "; see NOTES) */"
+.PP
+.BI "int quotactl(int " cmd ", const char *_Nullable " special ", int " id ,
+.BI " caddr_t " addr );
+.fi
+.SH DESCRIPTION
+The quota system can be used to set per-user, per-group, and per-project limits
+on the amount of disk space used on a filesystem.
+For each user and/or group,
+a soft limit and a hard limit can be set for each filesystem.
+The hard limit can't be exceeded.
+The soft limit can be exceeded, but warnings will ensue.
+Moreover, the user can't exceed the soft limit for more than the grace period
+duration (one week by default) at a time;
+after this, the soft limit counts as a hard limit.
+.PP
+The
+.BR quotactl ()
+call manipulates disk quotas.
+The
+.I cmd
+argument indicates a command to be applied to the user or
+group ID specified in
+.IR id .
+To initialize the
+.I cmd
+argument, use the
+.I QCMD(subcmd, type)
+macro.
+The
+.I type
+value is either
+.BR USRQUOTA ,
+for user quotas,
+.BR GRPQUOTA ,
+for group quotas, or (since Linux 4.1)
+.\" 847aac644e92e5624f2c153bab409bf713d5ff9a
+.BR PRJQUOTA ,
+for project quotas.
+The
+.I subcmd
+value is described below.
+.PP
+The
+.I special
+argument is a pointer to a null-terminated string containing the pathname
+of the (mounted) block special device for the filesystem being manipulated.
+.PP
+The
+.I addr
+argument is the address of an optional, command-specific, data structure
+that is copied in or out of the system.
+The interpretation of
+.I addr
+is given with each operation below.
+.PP
+The
+.I subcmd
+value is one of the following operations:
+.TP
+.B Q_QUOTAON
+Turn on quotas for a filesystem.
+The
+.I id
+argument is the identification number of the quota format to be used.
+Currently, there are three supported quota formats:
+.RS
+.TP 13
+.B QFMT_VFS_OLD
+The original quota format.
+.TP
+.B QFMT_VFS_V0
+The standard VFS v0 quota format, which can handle 32-bit UIDs and GIDs
+and quota limits up to 2\[ha]42 bytes and 2\[ha]32 inodes.
+.TP
+.B QFMT_VFS_V1
+A quota format that can handle 32-bit UIDs and GIDs
+and quota limits of 2\[ha]63 \- 1 bytes and 2\[ha]63 \- 1 inodes.
+.RE
+.IP
+The
+.I addr
+argument points to the pathname of a file containing the quotas for
+the filesystem.
+The quota file must exist; it is normally created with the
+.BR quotacheck (8)
+program.
+.IP
+Quota information can also be stored in hidden system inodes
+for ext4, XFS, and other filesystems if the filesystem is configured so.
+In this case, there are no visible quota files and there is no need to
+use
+.BR quotacheck (8).
+Quota information is always kept consistent by the filesystem and the
+.B Q_QUOTAON
+operation serves only to enable enforcement of quota limits.
+The presence of hidden
+system inodes with quota information is indicated by the
+.B DQF_SYS_FILE
+flag in the
+.I dqi_flags
+field returned by the
+.B Q_GETINFO
+operation.
+.IP
+This operation requires privilege
+.RB ( CAP_SYS_ADMIN ).
+.TP
+.B Q_QUOTAOFF
+Turn off quotas for a filesystem.
+The
+.I addr
+and
+.I id
+arguments are ignored.
+This operation requires privilege
+.RB ( CAP_SYS_ADMIN ).
+.TP
+.B Q_GETQUOTA
+Get disk quota limits and current usage for user or group
+.IR id .
+The
+.I addr
+argument is a pointer to a
+.I dqblk
+structure defined in
+.I <sys/quota.h>
+as follows:
+.IP
+.in +4n
+.EX
+/* uint64_t is an unsigned 64\-bit integer;
+ uint32_t is an unsigned 32\-bit integer */
+\&
+struct dqblk { /* Definition since Linux 2.4.22 */
+ uint64_t dqb_bhardlimit; /* Absolute limit on disk
+ quota blocks alloc */
+ uint64_t dqb_bsoftlimit; /* Preferred limit on
+ disk quota blocks */
+ uint64_t dqb_curspace; /* Current occupied space
+ (in bytes) */
+ uint64_t dqb_ihardlimit; /* Maximum number of
+ allocated inodes */
+ uint64_t dqb_isoftlimit; /* Preferred inode limit */
+ uint64_t dqb_curinodes; /* Current number of
+ allocated inodes */
+ uint64_t dqb_btime; /* Time limit for excessive
+ disk use */
+ uint64_t dqb_itime; /* Time limit for excessive
+ files */
+ uint32_t dqb_valid; /* Bit mask of QIF_*
+ constants */
+};
+\&
+/* Flags in dqb_valid that indicate which fields in
+ dqblk structure are valid. */
+\&
+#define QIF_BLIMITS 1
+#define QIF_SPACE 2
+#define QIF_ILIMITS 4
+#define QIF_INODES 8
+#define QIF_BTIME 16
+#define QIF_ITIME 32
+#define QIF_LIMITS (QIF_BLIMITS | QIF_ILIMITS)
+#define QIF_USAGE (QIF_SPACE | QIF_INODES)
+#define QIF_TIMES (QIF_BTIME | QIF_ITIME)
+#define QIF_ALL (QIF_LIMITS | QIF_USAGE | QIF_TIMES)
+.EE
+.in
+.IP
+The
+.I dqb_valid
+field is a bit mask that is set to indicate the entries in the
+.I dqblk
+structure that are valid.
+Currently, the kernel fills in all entries of the
+.I dqblk
+structure and marks them as valid in the
+.I dqb_valid
+field.
+Unprivileged users may retrieve only their own quotas;
+a privileged user
+.RB ( CAP_SYS_ADMIN )
+can retrieve the quotas of any user.
+.TP
+.BR Q_GETNEXTQUOTA " (since Linux 4.6)"
+.\" commit 926132c0257a5a8d149a6a395cc3405e55420566
+This operation is the same as
+.BR Q_GETQUOTA ,
+but it returns quota information for the next ID greater than or equal to
+.I id
+that has a quota set.
+.IP
+The
+.I addr
+argument is a pointer to a
+.I nextdqblk
+structure whose fields are as for the
+.IR dqblk ,
+except for the addition of a
+.I dqb_id
+field that is used to return the ID for which
+quota information is being returned:
+.IP
+.in +4n
+.EX
+struct nextdqblk {
+ uint64_t dqb_bhardlimit;
+ uint64_t dqb_bsoftlimit;
+ uint64_t dqb_curspace;
+ uint64_t dqb_ihardlimit;
+ uint64_t dqb_isoftlimit;
+ uint64_t dqb_curinodes;
+ uint64_t dqb_btime;
+ uint64_t dqb_itime;
+ uint32_t dqb_valid;
+ uint32_t dqb_id;
+};
+.EE
+.in
+.TP
+.B Q_SETQUOTA
+Set quota information for user or group
+.IR id ,
+using the information supplied in the
+.I dqblk
+structure pointed to by
+.IR addr .
+The
+.I dqb_valid
+field of the
+.I dqblk
+structure indicates which entries in the structure have been set by the caller.
+This operation supersedes the
+.B Q_SETQLIM
+and
+.B Q_SETUSE
+operations in the previous quota interfaces.
+This operation requires privilege
+.RB ( CAP_SYS_ADMIN ).
+.TP
+.BR Q_GETINFO " (since Linux 2.4.22)"
+Get information (like grace times) about quotafile.
+The
+.I addr
+argument should be a pointer to a
+.I dqinfo
+structure.
+This structure is defined in
+.I <sys/quota.h>
+as follows:
+.IP
+.in +4n
+.EX
+/* uint64_t is an unsigned 64\-bit integer;
+ uint32_t is an unsigned 32\-bit integer */
+\&
+struct dqinfo { /* Defined since Linux 2.4.22 */
+ uint64_t dqi_bgrace; /* Time before block soft limit
+ becomes hard limit */
+ uint64_t dqi_igrace; /* Time before inode soft limit
+ becomes hard limit */
+ uint32_t dqi_flags; /* Flags for quotafile
+ (DQF_*) */
+ uint32_t dqi_valid;
+};
+\&
+/* Bits for dqi_flags */
+\&
+/* Quota format QFMT_VFS_OLD */
+\&
+#define DQF_ROOT_SQUASH (1 << 0) /* Root squash enabled */
+ /* Before Linux v4.0, this had been defined
+ privately as V1_DQF_RSQUASH */
+\&
+/* Quota format QFMT_VFS_V0 / QFMT_VFS_V1 */
+\&
+#define DQF_SYS_FILE (1 << 16) /* Quota stored in
+ a system file */
+\&
+/* Flags in dqi_valid that indicate which fields in
+ dqinfo structure are valid. */
+\&
+#define IIF_BGRACE 1
+#define IIF_IGRACE 2
+#define IIF_FLAGS 4
+#define IIF_ALL (IIF_BGRACE | IIF_IGRACE | IIF_FLAGS)
+.EE
+.in
+.IP
+The
+.I dqi_valid
+field in the
+.I dqinfo
+structure indicates the entries in the structure that are valid.
+Currently, the kernel fills in all entries of the
+.I dqinfo
+structure and marks them all as valid in the
+.I dqi_valid
+field.
+The
+.I id
+argument is ignored.
+.TP
+.BR Q_SETINFO " (since Linux 2.4.22)"
+Set information about quotafile.
+The
+.I addr
+argument should be a pointer to a
+.I dqinfo
+structure.
+The
+.I dqi_valid
+field of the
+.I dqinfo
+structure indicates the entries in the structure
+that have been set by the caller.
+This operation supersedes the
+.B Q_SETGRACE
+and
+.B Q_SETFLAGS
+operations in the previous quota interfaces.
+The
+.I id
+argument is ignored.
+This operation requires privilege
+.RB ( CAP_SYS_ADMIN ).
+.TP
+.BR Q_GETFMT " (since Linux 2.4.22)"
+Get quota format used on the specified filesystem.
+The
+.I addr
+argument should be a pointer to a 4-byte buffer
+where the format number will be stored.
+.TP
+.B Q_SYNC
+Update the on-disk copy of quota usages for a filesystem.
+If
+.I special
+is NULL, then all filesystems with active quotas are sync'ed.
+The
+.I addr
+and
+.I id
+arguments are ignored.
+.TP
+.BR Q_GETSTATS " (supported up to Linux 2.4.21)"
+Get statistics and other generic information about the quota subsystem.
+The
+.I addr
+argument should be a pointer to a
+.I dqstats
+structure in which data should be stored.
+This structure is defined in
+.IR <sys/quota.h> .
+The
+.I special
+and
+.I id
+arguments are ignored.
+.IP
+This operation is obsolete and was removed in Linux 2.4.22.
+Files in
+.I /proc/sys/fs/quota/
+carry the information instead.
+.PP
+For XFS filesystems making use of the XFS Quota Manager (XQM),
+the above operations are bypassed and the following operations are used:
+.TP
+.B Q_XQUOTAON
+Turn on quotas for an XFS filesystem.
+XFS provides the ability to turn on/off quota limit enforcement
+with quota accounting.
+Therefore, XFS expects
+.I addr
+to be a pointer to an
+.I "unsigned int"
+that contains a bitwise combination of the following flags (defined in
+.IR <xfs/xqm.h> ):
+.IP
+.in +4n
+.EX
+XFS_QUOTA_UDQ_ACCT /* User quota accounting */
+XFS_QUOTA_UDQ_ENFD /* User quota limits enforcement */
+XFS_QUOTA_GDQ_ACCT /* Group quota accounting */
+XFS_QUOTA_GDQ_ENFD /* Group quota limits enforcement */
+XFS_QUOTA_PDQ_ACCT /* Project quota accounting */
+XFS_QUOTA_PDQ_ENFD /* Project quota limits enforcement */
+.EE
+.in
+.IP
+This operation requires privilege
+.RB ( CAP_SYS_ADMIN ).
+The
+.I id
+argument is ignored.
+.TP
+.B Q_XQUOTAOFF
+Turn off quotas for an XFS filesystem.
+As with
+.BR Q_XQUOTAON ,
+XFS filesystems expect a pointer to an
+.I "unsigned int"
+that specifies whether quota accounting and/or limit enforcement need
+to be turned off (using the same flags as for
+.B Q_XQUOTAON
+operation).
+This operation requires privilege
+.RB ( CAP_SYS_ADMIN ).
+The
+.I id
+argument is ignored.
+.TP
+.B Q_XGETQUOTA
+Get disk quota limits and current usage for user
+.IR id .
+The
+.I addr
+argument is a pointer to an
+.I fs_disk_quota
+structure, which is defined in
+.I <xfs/xqm.h>
+as follows:
+.IP
+.in +4n
+.EX
+/* All the blk units are in BBs (Basic Blocks) of
+ 512 bytes. */
+\&
+#define FS_DQUOT_VERSION 1 /* fs_disk_quota.d_version */
+\&
+#define XFS_USER_QUOTA (1<<0) /* User quota type */
+#define XFS_PROJ_QUOTA (1<<1) /* Project quota type */
+#define XFS_GROUP_QUOTA (1<<2) /* Group quota type */
+\&
+struct fs_disk_quota {
+ int8_t d_version; /* Version of this structure */
+ int8_t d_flags; /* XFS_{USER,PROJ,GROUP}_QUOTA */
+ uint16_t d_fieldmask; /* Field specifier */
+ uint32_t d_id; /* User, project, or group ID */
+ uint64_t d_blk_hardlimit; /* Absolute limit on
+ disk blocks */
+ uint64_t d_blk_softlimit; /* Preferred limit on
+ disk blocks */
+ uint64_t d_ino_hardlimit; /* Maximum # allocated
+ inodes */
+ uint64_t d_ino_softlimit; /* Preferred inode limit */
+ uint64_t d_bcount; /* # disk blocks owned by
+ the user */
+ uint64_t d_icount; /* # inodes owned by the user */
+ int32_t d_itimer; /* Zero if within inode limits */
+ /* If not, we refuse service */
+ int32_t d_btimer; /* Similar to above; for
+ disk blocks */
+ uint16_t d_iwarns; /* # warnings issued with
+ respect to # of inodes */
+ uint16_t d_bwarns; /* # warnings issued with
+ respect to disk blocks */
+ int32_t d_padding2; /* Padding \- for future use */
+ uint64_t d_rtb_hardlimit; /* Absolute limit on realtime
+ (RT) disk blocks */
+ uint64_t d_rtb_softlimit; /* Preferred limit on RT
+ disk blocks */
+ uint64_t d_rtbcount; /* # realtime blocks owned */
+ int32_t d_rtbtimer; /* Similar to above; for RT
+ disk blocks */
+ uint16_t d_rtbwarns; /* # warnings issued with
+ respect to RT disk blocks */
+ int16_t d_padding3; /* Padding \- for future use */
+ char d_padding4[8]; /* Yet more padding */
+};
+.EE
+.in
+.IP
+Unprivileged users may retrieve only their own quotas;
+a privileged user
+.RB ( CAP_SYS_ADMIN )
+may retrieve the quotas of any user.
+.TP
+.BR Q_XGETNEXTQUOTA " (since Linux 4.6)"
+.\" commit 8b37524962b9c54423374717786198f5c0820a28
+This operation is the same as
+.BR Q_XGETQUOTA ,
+but it returns (in the
+.I fs_disk_quota
+structure pointed to by
+.IR addr )
+quota information for the next ID greater than or equal to
+.I id
+that has a quota set.
+Note that since
+.I fs_disk_quota
+already has a
+.I d_id
+field, no separate structure type is needed (in contrast with
+.B Q_GETQUOTA
+and
+.B Q_GETNEXTQUOTA
+operations).
+.TP
+.B Q_XSETQLIM
+Set disk quota limits for user
+.IR id .
+The
+.I addr
+argument is a pointer to an
+.I fs_disk_quota
+structure.
+This operation requires privilege
+.RB ( CAP_SYS_ADMIN ).
+.TP
+.B Q_XGETQSTAT
+Returns XFS filesystem-specific quota information in the
+.I fs_quota_stat
+structure pointed to by
+.IR addr .
+This is useful for finding out how much space is used to store quota
+information, and also to get the quota on/off status of a given local XFS
+filesystem.
+The
+.I fs_quota_stat
+structure itself is defined as follows:
+.IP
+.in +4n
+.EX
+#define FS_QSTAT_VERSION 1 /* fs_quota_stat.qs_version */
+\&
+struct fs_qfilestat {
+ uint64_t qfs_ino; /* Inode number */
+ uint64_t qfs_nblks; /* Number of BBs
+ 512\-byte\-blocks */
+ uint32_t qfs_nextents; /* Number of extents */
+};
+\&
+struct fs_quota_stat {
+ int8_t qs_version; /* Version number for
+ future changes */
+ uint16_t qs_flags; /* XFS_QUOTA_{U,P,G}DQ_{ACCT,ENFD} */
+ int8_t qs_pad; /* Unused */
+ struct fs_qfilestat qs_uquota; /* User quota storage
+ information */
+ struct fs_qfilestat qs_gquota; /* Group quota storage
+ information */
+ uint32_t qs_incoredqs; /* Number of dquots in core */
+ int32_t qs_btimelimit; /* Limit for blocks timer */
+ int32_t qs_itimelimit; /* Limit for inodes timer */
+ int32_t qs_rtbtimelimit;/* Limit for RT
+ blocks timer */
+ uint16_t qs_bwarnlimit; /* Limit for # of warnings */
+ uint16_t qs_iwarnlimit; /* Limit for # of warnings */
+};
+.EE
+.in
+.IP
+The
+.I id
+argument is ignored.
+.TP
+.B Q_XGETQSTATV
+Returns XFS filesystem-specific quota information in the
+.I fs_quota_statv
+pointed to by
+.IR addr .
+This version of the operation uses a structure with proper versioning support,
+along with appropriate layout (all fields are naturally aligned) and
+padding to avoid special compat handling;
+it also provides the ability to get statistics regarding
+the project quota file.
+The
+.I fs_quota_statv
+structure itself is defined as follows:
+.IP
+.in +4n
+.EX
+#define FS_QSTATV_VERSION1 1 /* fs_quota_statv.qs_version */
+\&
+struct fs_qfilestatv {
+ uint64_t qfs_ino; /* Inode number */
+ uint64_t qfs_nblks; /* Number of BBs
+ 512\-byte\-blocks */
+ uint32_t qfs_nextents; /* Number of extents */
+ uint32_t qfs_pad; /* Pad for 8\-byte alignment */
+};
+\&
+struct fs_quota_statv {
+ int8_t qs_version; /* Version for future
+ changes */
+ uint8_t qs_pad1; /* Pad for 16\-bit alignment */
+ uint16_t qs_flags; /* XFS_QUOTA_.* flags */
+ uint32_t qs_incoredqs; /* Number of dquots incore */
+ struct fs_qfilestatv qs_uquota; /* User quota
+ information */
+ struct fs_qfilestatv qs_gquota; /* Group quota
+ information */
+ struct fs_qfilestatv qs_pquota; /* Project quota
+ information */
+ int32_t qs_btimelimit; /* Limit for blocks timer */
+ int32_t qs_itimelimit; /* Limit for inodes timer */
+ int32_t qs_rtbtimelimit; /* Limit for RT blocks
+ timer */
+ uint16_t qs_bwarnlimit; /* Limit for # of warnings */
+ uint16_t qs_iwarnlimit; /* Limit for # of warnings */
+ uint64_t qs_pad2[8]; /* For future proofing */
+};
+.EE
+.in
+.IP
+The
+.I qs_version
+field of the structure should be filled with the version of the structure
+supported by the callee (for now, only
+.I FS_QSTATV_VERSION1
+is supported).
+The kernel will fill the structure in accordance with
+the version provided.
+The
+.I id
+argument is ignored.
+.TP
+.BR Q_XQUOTARM " (buggy until Linux 3.16)"
+.\" 9da93f9b7cdf8ab28da6b364cdc1fafc8670b4dc
+Free the disk space taken by disk quotas.
+The
+.I addr
+argument should be a pointer to an
+.I "unsigned int"
+value containing flags (the same as in
+.I d_flags
+field of
+.I fs_disk_quota
+structure)
+which identify what types of quota
+should be removed.
+(Note that the quota type passed in the
+.I cmd
+argument is ignored, but should remain valid in order to pass preliminary
+quotactl syscall handler checks.)
+.IP
+Quotas must have already been turned off.
+The
+.I id
+argument is ignored.
+.TP
+.BR Q_XQUOTASYNC " (since Linux 2.6.15; no-op since Linux 3.4)"
+.\" Added in commit ee34807a65aa0c5911dc27682863afca780a003e
+This operation was an XFS quota equivalent to
+.BR Q_SYNC ,
+but it is a no-op since Linux 3.4,
+.\" 4b217ed9e30f94b6e8e5e262020ef0ceab6113af
+as
+.BR sync (1)
+writes quota information to disk now
+(in addition to the other filesystem metadata that it writes out).
+The
+.IR special ", " id " and " addr
+arguments are ignored.
+.SH RETURN VALUE
+On success,
+.BR quotactl ()
+returns 0; on error \-1
+is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EACCES
+.I cmd
+is
+.BR Q_QUOTAON ,
+and the quota file pointed to by
+.I addr
+exists, but is not a regular file or
+is not on the filesystem pointed to by
+.IR special .
+.TP
+.B EBUSY
+.I cmd
+is
+.BR Q_QUOTAON ,
+but another
+.B Q_QUOTAON
+had already been performed.
+.TP
+.B EFAULT
+.I addr
+or
+.I special
+is invalid.
+.TP
+.B EINVAL
+.I cmd
+or
+.I type
+is invalid.
+.TP
+.B EINVAL
+.I cmd
+is
+.BR Q_QUOTAON ,
+but the specified quota file is corrupted.
+.TP
+.BR EINVAL " (since Linux 5.5)"
+.\" 3dd4d40b420846dd35869ccc8f8627feef2cff32
+.I cmd
+is
+.BR Q_XQUOTARM ,
+but
+.I addr
+does not point to valid quota types.
+.TP
+.B ENOENT
+The file specified by
+.I special
+or
+.I addr
+does not exist.
+.TP
+.B ENOSYS
+The kernel has not been compiled with the
+.B CONFIG_QUOTA
+option.
+.TP
+.B ENOTBLK
+.I special
+is not a block device.
+.TP
+.B EPERM
+The caller lacked the required privilege
+.RB ( CAP_SYS_ADMIN )
+for the specified operation.
+.TP
+.B ERANGE
+.I cmd
+is
+.BR Q_SETQUOTA ,
+but the specified limits are out of the range allowed by the quota format.
+.TP
+.B ESRCH
+No disk quota is found for the indicated user.
+Quotas have not been turned on for this filesystem.
+.TP
+.B ESRCH
+.I cmd
+is
+.BR Q_QUOTAON ,
+but the specified quota format was not found.
+.TP
+.B ESRCH
+.I cmd
+is
+.B Q_GETNEXTQUOTA
+or
+.BR Q_XGETNEXTQUOTA ,
+but there is no ID greater than or equal to
+.I id
+that has an active quota.
+.SH NOTES
+Instead of
+.I <xfs/xqm.h>
+one can use
+.IR <linux/dqblk_xfs.h> ,
+taking into account that there are several naming discrepancies:
+.IP \[bu] 3
+Quota enabling flags (of format
+.BR XFS_QUOTA_[UGP]DQ_{ACCT,ENFD} )
+are defined without a leading "X", as
+.BR FS_QUOTA_[UGP]DQ_{ACCT,ENFD} .
+.IP \[bu]
+The same is true for
+.B XFS_{USER,GROUP,PROJ}_QUOTA
+quota type flags, which are defined as
+.BR FS_{USER,GROUP,PROJ}_QUOTA .
+.IP \[bu]
+The
+.I dqblk_xfs.h
+header file defines its own
+.BR XQM_USRQUOTA ,
+.BR XQM_GRPQUOTA ,
+and
+.B XQM_PRJQUOTA
+constants for the available quota types, but their values are the same as for
+constants without the
+.B XQM_
+prefix.
+.SH SEE ALSO
+.BR quota (1),
+.BR getrlimit (2),
+.BR quotacheck (8),
+.BR quotaon (8)
diff --git a/man2/read.2 b/man2/read.2
new file mode 100644
index 0000000..955efa4
--- /dev/null
+++ b/man2/read.2
@@ -0,0 +1,245 @@
+.\" This manpage is Copyright (C) 1992 Drew Eckhardt;
+.\" and Copyright (C) 1993 Michael Haardt, Ian Jackson.
+.\" and Copyright (C) 2009-2015 Michael Kerrisk, <mtk.manpages.gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified Sat Jul 24 00:06:00 1993 by Rik Faith <faith@cs.unc.edu>
+.\" Modified Wed Jan 17 16:02:32 1996 by Michael Haardt
+.\" <michael@cantor.informatik.rwth-aachen.de>
+.\" Modified Thu Apr 11 19:26:35 1996 by Andries Brouwer <aeb@cwi.nl>
+.\" Modified Sun Jul 21 18:59:33 1996 by Andries Brouwer <aeb@cwi.nl>
+.\" Modified Fri Jan 31 16:47:33 1997 by Eric S. Raymond <esr@thyrsus.com>
+.\" Modified Sat Jul 12 20:45:39 1997 by Michael Haardt
+.\" <michael@cantor.informatik.rwth-aachen.de>
+.\"
+.TH read 2 2023-04-03 "Linux man-pages 6.05.01"
+.SH NAME
+read \- read from a file descriptor
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <unistd.h>
+.PP
+.BI "ssize_t read(int " fd ", void " buf [. count "], size_t " count );
+.fi
+.SH DESCRIPTION
+.BR read ()
+attempts to read up to
+.I count
+bytes from file descriptor
+.I fd
+into the buffer starting at
+.IR buf .
+.PP
+On files that support seeking,
+the read operation commences at the file offset,
+and the file offset is incremented by the number of bytes read.
+If the file offset is at or past the end of file,
+no bytes are read, and
+.BR read ()
+returns zero.
+.PP
+If
+.I count
+is zero,
+.BR read ()
+.I may
+detect the errors described below.
+In the absence of any errors,
+or if
+.BR read ()
+does not check for errors, a
+.BR read ()
+with a
+.I count
+of 0 returns zero and has no other effects.
+.PP
+According to POSIX.1, if
+.I count
+is greater than
+.BR SSIZE_MAX ,
+the result is implementation-defined;
+see NOTES for the upper limit on Linux.
+.SH RETURN VALUE
+On success, the number of bytes read is returned (zero indicates end of
+file), and the file position is advanced by this number.
+It is not an error if this number is smaller than the number of bytes
+requested; this may happen for example because fewer bytes are actually
+available right now (maybe because we were close to end-of-file, or
+because we are reading from a pipe, or from a terminal), or because
+.BR read ()
+was interrupted by a signal.
+See also NOTES.
+.PP
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+In this case, it is left unspecified whether
+the file position (if any) changes.
+.SH ERRORS
+.TP
+.B EAGAIN
+The file descriptor
+.I fd
+refers to a file other than a socket and has been marked nonblocking
+.RB ( O_NONBLOCK ),
+and the read would block.
+See
+.BR open (2)
+for further details on the
+.B O_NONBLOCK
+flag.
+.TP
+.BR EAGAIN " or " EWOULDBLOCK
+.\" Actually EAGAIN on Linux
+The file descriptor
+.I fd
+refers to a socket and has been marked nonblocking
+.RB ( O_NONBLOCK ),
+and the read would block.
+POSIX.1-2001 allows either error to be returned for this case,
+and does not require these constants to have the same value,
+so a portable application should check for both possibilities.
+.TP
+.B EBADF
+.I fd
+is not a valid file descriptor or is not open for reading.
+.TP
+.B EFAULT
+.I buf
+is outside your accessible address space.
+.TP
+.B EINTR
+The call was interrupted by a signal before any data was read; see
+.BR signal (7).
+.TP
+.B EINVAL
+.I fd
+is attached to an object which is unsuitable for reading;
+or the file was opened with the
+.B O_DIRECT
+flag, and either the address specified in
+.IR buf ,
+the value specified in
+.IR count ,
+or the file offset is not suitably aligned.
+.TP
+.B EINVAL
+.I fd
+was created via a call to
+.BR timerfd_create (2)
+and the wrong size buffer was given to
+.BR read ();
+see
+.BR timerfd_create (2)
+for further information.
+.TP
+.B EIO
+I/O error.
+This will happen for example when the process is in a
+background process group, tries to read from its controlling terminal,
+and either it is ignoring or blocking
+.B SIGTTIN
+or its process group
+is orphaned.
+It may also occur when there is a low-level I/O error
+while reading from a disk or tape.
+A further possible cause of
+.B EIO
+on networked filesystems is when an advisory lock had been taken
+out on the file descriptor and this lock has been lost.
+See the
+.I "Lost locks"
+section of
+.BR fcntl (2)
+for further details.
+.TP
+.B EISDIR
+.I fd
+refers to a directory.
+.PP
+Other errors may occur, depending on the object connected to
+.IR fd .
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+SVr4, 4.3BSD, POSIX.1-2001.
+.SH NOTES
+On Linux,
+.BR read ()
+(and similar system calls) will transfer at most
+0x7ffff000 (2,147,479,552) bytes,
+returning the number of bytes actually transferred.
+.\" commit e28cc71572da38a5a12c1cfe4d7032017adccf69
+(This is true on both 32-bit and 64-bit systems.)
+.PP
+On NFS filesystems, reading small amounts of data will update the
+timestamp only the first time; subsequent calls may not do so.
+This is caused
+by client side attribute caching, because most if not all NFS clients
+leave
+.I st_atime
+(last file access time)
+updates to the server, and client side reads satisfied from the
+client's cache will not cause
+.I st_atime
+updates on the server as there are no
+server-side reads.
+UNIX semantics can be obtained by disabling client-side attribute caching,
+but in most situations this will substantially
+increase server load and decrease performance.
+.SH BUGS
+According to POSIX.1-2008/SUSv4 Section XSI 2.9.7
+("Thread Interactions with Regular File Operations"):
+.PP
+.RS 4
+All of the following functions shall be atomic with respect to
+each other in the effects specified in POSIX.1-2008 when they
+operate on regular files or symbolic links: ...
+.RE
+.PP
+Among the APIs subsequently listed are
+.BR read ()
+and
+.BR readv (2).
+And among the effects that should be atomic across threads (and processes)
+are updates of the file offset.
+However, before Linux 3.14,
+this was not the case: if two processes that share
+an open file description (see
+.BR open (2))
+perform a
+.BR read ()
+(or
+.BR readv (2))
+at the same time, then the I/O operations were not atomic
+with respect to updating the file offset,
+with the result that the reads in the two processes
+might (incorrectly) overlap in the blocks of data that they obtained.
+This problem was fixed in Linux 3.14.
+.\" http://thread.gmane.org/gmane.linux.kernel/1649458
+.\" From: Michael Kerrisk (man-pages <mtk.manpages <at> gmail.com>
+.\" Subject: Update of file offset on write() etc. is non-atomic with I/O
+.\" Date: 2014-02-17 15:41:37 GMT
+.\" Newsgroups: gmane.linux.kernel, gmane.linux.file-systems
+.\" commit 9c225f2655e36a470c4f58dbbc99244c5fc7f2d4
+.\" Author: Linus Torvalds <torvalds@linux-foundation.org>
+.\" Date: Mon Mar 3 09:36:58 2014 -0800
+.\"
+.\" vfs: atomic f_pos accesses as per POSIX
+.SH SEE ALSO
+.BR close (2),
+.BR fcntl (2),
+.BR ioctl (2),
+.BR lseek (2),
+.BR open (2),
+.BR pread (2),
+.BR readdir (2),
+.BR readlink (2),
+.BR readv (2),
+.BR select (2),
+.BR write (2),
+.BR fread (3)
diff --git a/man2/readahead.2 b/man2/readahead.2
new file mode 100644
index 0000000..b97f085
--- /dev/null
+++ b/man2/readahead.2
@@ -0,0 +1,99 @@
+.\" This manpage is Copyright (C) 2004, Michael Kerrisk
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" 2004-05-40 Created by Michael Kerrisk <mtk.manpages@gmail.com>
+.\" 2004-10-05 aeb, minor correction
+.\"
+.TH readahead 2 2023-07-15 "Linux man-pages 6.05.01"
+.SH NAME
+readahead \- initiate file readahead into page cache
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#define _GNU_SOURCE" " /* See feature_test_macros(7) */"
+.B #define _FILE_OFFSET_BITS 64
+.B #include <fcntl.h>
+.PP
+.BI "ssize_t readahead(int " fd ", off_t " offset ", size_t " count );
+.fi
+.SH DESCRIPTION
+.BR readahead ()
+initiates readahead on a file so that subsequent reads from that file will
+be satisfied from the cache, and not block on disk I/O
+(assuming the readahead was initiated early enough and that other activity
+on the system did not in the meantime flush pages from the cache).
+.PP
+The
+.I fd
+argument is a file descriptor identifying the file which is
+to be read.
+The
+.I offset
+argument specifies the starting point from which data is to be read
+and
+.I count
+specifies the number of bytes to be read.
+I/O is performed in whole pages, so that
+.I offset
+is effectively rounded down to a page boundary
+and bytes are read up to the next page boundary greater than or
+equal to
+.IR "(offset+count)" .
+.BR readahead ()
+does not read beyond the end of the file.
+The file offset of the open file description referred to by the file descriptor
+.I fd
+is left unchanged.
+.SH RETURN VALUE
+On success,
+.BR readahead ()
+returns 0; on failure, \-1 is returned, with
+.I errno
+set to indicate the error.
+.SH ERRORS
+.TP
+.B EBADF
+.I fd
+is not a valid file descriptor or is not open for reading.
+.TP
+.B EINVAL
+.I fd
+does not refer to a file type to which
+.BR readahead ()
+can be applied.
+.SH VERSIONS
+On some 32-bit architectures,
+the calling signature for this system call differs,
+for the reasons described in
+.BR syscall (2).
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 2.4.13,
+glibc 2.3.
+.SH NOTES
+.B _FILE_OFFSET_BITS
+should be defined to be 64 in code that uses a pointer to
+.BR readahead ,
+if the code is intended to be portable
+to traditional 32-bit x86 and ARM platforms where
+.BR off_t 's
+width defaults to 32 bits.
+.SH BUGS
+.BR readahead ()
+attempts to schedule the reads in the background and return immediately.
+However, it may block while it reads the filesystem metadata needed
+to locate the requested blocks.
+This occurs frequently with ext[234] on large files
+using indirect blocks instead of extents,
+giving the appearance that the call blocks until the requested data has
+been read.
+.SH SEE ALSO
+.BR lseek (2),
+.BR madvise (2),
+.BR mmap (2),
+.BR posix_fadvise (2),
+.BR read (2)
diff --git a/man2/readdir.2 b/man2/readdir.2
new file mode 100644
index 0000000..6b06ff4
--- /dev/null
+++ b/man2/readdir.2
@@ -0,0 +1,116 @@
+.\" Copyright (C) 1995 Andries Brouwer (aeb@cwi.nl)
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Written 11 June 1995 by Andries Brouwer <aeb@cwi.nl>
+.\" Modified 22 July 1995 by Michael Chastain <mec@duracef.shout.net>:
+.\" In 1.3.X, returns only one entry each time; return value is different.
+.\" Modified 2004-12-01, mtk, fixed headers listed in SYNOPSIS
+.\"
+.TH readdir 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+readdir \- read directory entry
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */"
+.B #include <unistd.h>
+.PP
+.BI "int syscall(SYS_readdir, unsigned int " fd ,
+.BI " struct old_linux_dirent *" dirp ", unsigned int " count );
+.fi
+.PP
+.IR Note :
+There is no definition of
+.BR "struct old_linux_dirent" ;
+see NOTES.
+.SH DESCRIPTION
+This is not the function you are interested in.
+Look at
+.BR readdir (3)
+for the POSIX conforming C library interface.
+This page documents the bare kernel system call interface,
+which is superseded by
+.BR getdents (2).
+.PP
+.BR readdir ()
+reads one
+.I old_linux_dirent
+structure from the directory
+referred to by the file descriptor
+.I fd
+into the buffer pointed to by
+.IR dirp .
+The argument
+.I count
+is ignored; at most one
+.I old_linux_dirent
+structure is read.
+.PP
+The
+.I old_linux_dirent
+structure is declared (privately in Linux kernel file
+.BR fs/readdir.c )
+as follows:
+.PP
+.in +4n
+.EX
+struct old_linux_dirent {
+ unsigned long d_ino; /* inode number */
+ unsigned long d_offset; /* offset to this \fIold_linux_dirent\fP */
+ unsigned short d_namlen; /* length of this \fId_name\fP */
+ char d_name[1]; /* filename (null\-terminated) */
+};
+.EE
+.in
+.PP
+.I d_ino
+is an inode number.
+.I d_offset
+is the distance from the start of the directory to this
+.IR old_linux_dirent .
+.I d_namlen
+is the size of
+.IR d_name ,
+not counting the terminating null byte (\[aq]\e0\[aq]).
+.I d_name
+is a null-terminated filename.
+.SH RETURN VALUE
+On success, 1 is returned.
+On end of directory, 0 is returned.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EBADF
+Invalid file descriptor
+.IR fd .
+.TP
+.B EFAULT
+Argument points outside the calling process's address space.
+.TP
+.B EINVAL
+Result buffer is too small.
+.TP
+.B ENOENT
+No such directory.
+.TP
+.B ENOTDIR
+File descriptor does not refer to a directory.
+.SH VERSIONS
+You will need to define the
+.I old_linux_dirent
+structure yourself.
+However, probably you should use
+.BR readdir (3)
+instead.
+.PP
+This system call does not exist on x86-64.
+.SH STANDARDS
+Linux.
+.SH SEE ALSO
+.BR getdents (2),
+.BR readdir (3)
diff --git a/man2/readlink.2 b/man2/readlink.2
new file mode 100644
index 0000000..fe2369d
--- /dev/null
+++ b/man2/readlink.2
@@ -0,0 +1,331 @@
+.\" Copyright (c) 1983, 1991 The Regents of the University of California.
+.\" And Copyright (C) 2011 Guillem Jover <guillem@hadrons.org>
+.\" And Copyright (C) 2006, 2014 Michael Kerrisk
+.\" All rights reserved.
+.\"
+.\" SPDX-License-Identifier: BSD-4-Clause-UC
+.\"
+.\" @(#)readlink.2 6.8 (Berkeley) 3/10/91
+.\"
+.\" Modified Sat Jul 24 00:10:21 1993 by Rik Faith (faith@cs.unc.edu)
+.\" Modified Tue Jul 9 23:55:17 1996 by aeb
+.\" Modified Fri Jan 24 00:26:00 1997 by aeb
+.\" 2011-09-20, Guillem Jover <guillem@hadrons.org>:
+.\" Added text on dynamically allocating buffer + example program
+.\"
+.TH readlink 2 2023-05-03 "Linux man-pages 6.05.01"
+.SH NAME
+readlink, readlinkat \- read value of a symbolic link
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <unistd.h>
+.PP
+.BI "ssize_t readlink(const char *restrict " pathname ", char *restrict " buf ,
+.BI " size_t " bufsiz );
+.PP
+.BR "#include <fcntl.h> " "/* Definition of " AT_* " constants */"
+.B #include <unistd.h>
+.PP
+.BI "ssize_t readlinkat(int " dirfd ", const char *restrict " pathname ,
+.BI " char *restrict " buf ", size_t " bufsiz );
+.PP
+.fi
+.RS -4
+Feature Test Macro Requirements for glibc (see
+.BR feature_test_macros (7)):
+.RE
+.PP
+.BR readlink ():
+.nf
+ _XOPEN_SOURCE >= 500 || _POSIX_C_SOURCE >= 200112L
+.\" || _XOPEN_SOURCE && _XOPEN_SOURCE_EXTENDED
+ || /* glibc <= 2.19: */ _BSD_SOURCE
+.fi
+.PP
+.BR readlinkat ():
+.nf
+ Since glibc 2.10:
+ _POSIX_C_SOURCE >= 200809L
+ Before glibc 2.10:
+ _ATFILE_SOURCE
+.fi
+.SH DESCRIPTION
+.BR readlink ()
+places the contents of the symbolic link
+.I pathname
+in the buffer
+.IR buf ,
+which has size
+.IR bufsiz .
+.BR readlink ()
+does not append a terminating null byte to
+.IR buf .
+It will (silently) truncate the contents (to a length of
+.I bufsiz
+characters), in case the buffer is too small to hold all of the contents.
+.SS readlinkat()
+The
+.BR readlinkat ()
+system call operates in exactly the same way as
+.BR readlink (),
+except for the differences described here.
+.PP
+If the pathname given in
+.I pathname
+is relative, then it is interpreted relative to the directory
+referred to by the file descriptor
+.I dirfd
+(rather than relative to the current working directory of
+the calling process, as is done by
+.BR readlink ()
+for a relative pathname).
+.PP
+If
+.I pathname
+is relative and
+.I dirfd
+is the special value
+.BR AT_FDCWD ,
+then
+.I pathname
+is interpreted relative to the current working
+directory of the calling process (like
+.BR readlink ()).
+.PP
+If
+.I pathname
+is absolute, then
+.I dirfd
+is ignored.
+.PP
+Since Linux 2.6.39,
+.\" commit 65cfc6722361570bfe255698d9cd4dccaf47570d
+.I pathname
+can be an empty string,
+in which case the call operates on the symbolic link referred to by
+.I dirfd
+(which should have been obtained using
+.BR open (2)
+with the
+.B O_PATH
+and
+.B O_NOFOLLOW
+flags).
+.PP
+See
+.BR openat (2)
+for an explanation of the need for
+.BR readlinkat ().
+.SH RETURN VALUE
+On success, these calls return the number of bytes placed in
+.IR buf .
+(If the returned value equals
+.IR bufsiz ,
+then truncation may have occurred.)
+On error, \-1 is returned and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EACCES
+Search permission is denied for a component of the path prefix.
+(See also
+.BR path_resolution (7).)
+.TP
+.B EBADF
+.RB ( readlinkat ())
+.I pathname
+is relative but
+.I dirfd
+is neither
+.B AT_FDCWD
+nor a valid file descriptor.
+.TP
+.B EFAULT
+.I buf
+extends outside the process's allocated address space.
+.TP
+.B EINVAL
+.I bufsiz
+is not positive.
+.\" At the glibc level, bufsiz is unsigned, so this error can only occur
+.\" if bufsiz==0. However, in the kernel syscall, bufsiz is signed,
+.\" and this error can also occur if bufsiz < 0.
+.\" See: http://thread.gmane.org/gmane.linux.man/380
+.\" Subject: [patch 0/3] [RFC] kernel/glibc mismatch of "readlink" syscall?
+.TP
+.B EINVAL
+The named file (i.e., the final filename component of
+.IR pathname )
+is not a symbolic link.
+.TP
+.B EIO
+An I/O error occurred while reading from the filesystem.
+.TP
+.B ELOOP
+Too many symbolic links were encountered in translating the pathname.
+.TP
+.B ENAMETOOLONG
+A pathname, or a component of a pathname, was too long.
+.TP
+.B ENOENT
+The named file does not exist.
+.TP
+.B ENOMEM
+Insufficient kernel memory was available.
+.TP
+.B ENOTDIR
+A component of the path prefix is not a directory.
+.TP
+.B ENOTDIR
+.RB ( readlinkat ())
+.I pathname
+is relative and
+.I dirfd
+is a file descriptor referring to a file other than a directory.
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+.TP
+.BR readlink ()
+4.4BSD
+(first appeared in 4.2BSD),
+POSIX.1-2001, POSIX.1-2008.
+.TP
+.BR readlinkat ()
+POSIX.1-2008.
+Linux 2.6.16,
+glibc 2.4.
+.PP
+Up to and including glibc 2.4, the return type of
+.BR readlink ()
+was declared as
+.IR int .
+Nowadays, the return type is declared as
+.IR ssize_t ,
+as (newly) required in POSIX.1-2001.
+.SS glibc
+On older kernels where
+.BR readlinkat ()
+is unavailable, the glibc wrapper function falls back to the use of
+.BR readlink ().
+When
+.I pathname
+is a relative pathname,
+glibc constructs a pathname based on the symbolic link in
+.I /proc/self/fd
+that corresponds to the
+.I dirfd
+argument.
+.SH NOTES
+Using a statically sized buffer might not provide enough room for the
+symbolic link contents.
+The required size for the buffer can be obtained from the
+.I stat.st_size
+value returned by a call to
+.BR lstat (2)
+on the link.
+However, the number of bytes written by
+.BR readlink ()
+and
+.BR readlinkat ()
+should be checked to make sure that the size of the
+symbolic link did not increase between the calls.
+Dynamically allocating the buffer for
+.BR readlink ()
+and
+.BR readlinkat ()
+also addresses a common portability problem when using
+.B PATH_MAX
+for the buffer size,
+as this constant is not guaranteed to be defined per POSIX
+if the system does not have such a limit.
+.SH EXAMPLES
+The following program allocates the buffer needed by
+.BR readlink ()
+dynamically from the information provided by
+.BR lstat (2),
+falling back to a buffer of size
+.B PATH_MAX
+in cases where
+.BR lstat (2)
+reports a size of zero.
+.PP
+.\" SRC BEGIN (readlink.c)
+.EX
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <unistd.h>
+\&
+int
+main(int argc, char *argv[])
+{
+ char *buf;
+ ssize_t nbytes, bufsiz;
+ struct stat sb;
+\&
+ if (argc != 2) {
+ fprintf(stderr, "Usage: %s <pathname>\en", argv[0]);
+ exit(EXIT_FAILURE);
+ }
+\&
+ if (lstat(argv[1], &sb) == \-1) {
+ perror("lstat");
+ exit(EXIT_FAILURE);
+ }
+\&
+ /* Add one to the link size, so that we can determine whether
+ the buffer returned by readlink() was truncated. */
+\&
+ bufsiz = sb.st_size + 1;
+\&
+ /* Some magic symlinks under (for example) /proc and /sys
+ report \[aq]st_size\[aq] as zero. In that case, take PATH_MAX as
+ a "good enough" estimate. */
+\&
+ if (sb.st_size == 0)
+ bufsiz = PATH_MAX;
+\&
+ buf = malloc(bufsiz);
+ if (buf == NULL) {
+ perror("malloc");
+ exit(EXIT_FAILURE);
+ }
+\&
+ nbytes = readlink(argv[1], buf, bufsiz);
+ if (nbytes == \-1) {
+ perror("readlink");
+ exit(EXIT_FAILURE);
+ }
+\&
+ /* Print only \[aq]nbytes\[aq] of \[aq]buf\[aq], as it doesn't contain a terminating
+ null byte (\[aq]\e0\[aq]). */
+ printf("\[aq]%s\[aq] points to \[aq]%.*s\[aq]\en", argv[1], (int) nbytes, buf);
+\&
+ /* If the return value was equal to the buffer size, then the
+ link target was larger than expected (perhaps because the
+ target was changed between the call to lstat() and the call to
+ readlink()). Warn the user that the returned target may have
+ been truncated. */
+\&
+ if (nbytes == bufsiz)
+ printf("(Returned buffer may have been truncated)\en");
+\&
+ free(buf);
+ exit(EXIT_SUCCESS);
+}
+.EE
+.\" SRC END
+.SH SEE ALSO
+.BR readlink (1),
+.BR lstat (2),
+.BR stat (2),
+.BR symlink (2),
+.BR realpath (3),
+.BR path_resolution (7),
+.BR symlink (7)
diff --git a/man2/readlinkat.2 b/man2/readlinkat.2
new file mode 100644
index 0000000..b29d1b5
--- /dev/null
+++ b/man2/readlinkat.2
@@ -0,0 +1 @@
+.so man2/readlink.2
diff --git a/man2/readv.2 b/man2/readv.2
new file mode 100644
index 0000000..db6abbc
--- /dev/null
+++ b/man2/readv.2
@@ -0,0 +1,427 @@
+.\" Copyright (C) 2007, 2010 Michael Kerrisk <mtk.manpages@gmail.com>
+.\" and Copyright (c) 1993 by Thomas Koenig (ig25@rz.uni-karlsruhe.de)
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified Sat Jul 24 18:34:44 1993 by Rik Faith (faith@cs.unc.edu)
+.\" Merged readv.[23], 2002-10-17, aeb
+.\" 2007-04-30 mtk, A fairly major rewrite to fix errors and
+.\" add more details.
+.\" 2010-11-16, mtk, Added documentation of preadv() and pwritev()
+.\"
+.TH readv 2 2023-05-03 "Linux man-pages 6.05.01"
+.SH NAME
+readv, writev, preadv, pwritev, preadv2, pwritev2 \-
+read or write data into multiple buffers
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/uio.h>
+.PP
+.BI "ssize_t readv(int " fd ", const struct iovec *" iov ", int " iovcnt );
+.BI "ssize_t writev(int " fd ", const struct iovec *" iov ", int " iovcnt );
+.PP
+.BI "ssize_t preadv(int " fd ", const struct iovec *" iov ", int " iovcnt ,
+.BI " off_t " offset );
+.BI "ssize_t pwritev(int " fd ", const struct iovec *" iov ", int " iovcnt ,
+.BI " off_t " offset );
+.PP
+.BI "ssize_t preadv2(int " fd ", const struct iovec *" iov ", int " iovcnt ,
+.BI " off_t " offset ", int " flags );
+.BI "ssize_t pwritev2(int " fd ", const struct iovec *" iov ", int " iovcnt ,
+.BI " off_t " offset ", int " flags );
+.fi
+.PP
+.RS -4
+Feature Test Macro Requirements for glibc (see
+.BR feature_test_macros (7)):
+.RE
+.PP
+.BR preadv (),
+.BR pwritev ():
+.nf
+ Since glibc 2.19:
+ _DEFAULT_SOURCE
+ glibc 2.19 and earlier:
+ _BSD_SOURCE
+.fi
+.SH DESCRIPTION
+The
+.BR readv ()
+system call reads
+.I iovcnt
+buffers from the file associated with the file descriptor
+.I fd
+into the buffers described by
+.I iov
+("scatter input").
+.PP
+The
+.BR writev ()
+system call writes
+.I iovcnt
+buffers of data described by
+.I iov
+to the file associated with the file descriptor
+.I fd
+("gather output").
+.PP
+The pointer
+.I iov
+points to an array of
+.I iovec
+structures,
+described in
+.BR iovec (3type).
+.PP
+The
+.BR readv ()
+system call works just like
+.BR read (2)
+except that multiple buffers are filled.
+.PP
+The
+.BR writev ()
+system call works just like
+.BR write (2)
+except that multiple buffers are written out.
+.PP
+Buffers are processed in array order.
+This means that
+.BR readv ()
+completely fills
+.I iov[0]
+before proceeding to
+.IR iov[1] ,
+and so on.
+(If there is insufficient data, then not all buffers pointed to by
+.I iov
+may be filled.)
+Similarly,
+.BR writev ()
+writes out the entire contents of
+.I iov[0]
+before proceeding to
+.IR iov[1] ,
+and so on.
+.PP
+The data transfers performed by
+.BR readv ()
+and
+.BR writev ()
+are atomic: the data written by
+.\" Regarding atomicity, see https://bugzilla.kernel.org/show_bug.cgi?id=10596
+.BR writev ()
+is written as a single block that is not intermingled with output
+from writes in other processes;
+analogously,
+.BR readv ()
+is guaranteed to read a contiguous block of data from the file,
+regardless of read operations performed in other threads or processes
+that have file descriptors referring to the same open file description
+(see
+.BR open (2)).
+.SS preadv() and pwritev()
+The
+.BR preadv ()
+system call combines the functionality of
+.BR readv ()
+and
+.BR pread (2).
+It performs the same task as
+.BR readv (),
+but adds a fourth argument,
+.IR offset ,
+which specifies the file offset at which the input operation
+is to be performed.
+.PP
+The
+.BR pwritev ()
+system call combines the functionality of
+.BR writev ()
+and
+.BR pwrite (2).
+It performs the same task as
+.BR writev (),
+but adds a fourth argument,
+.IR offset ,
+which specifies the file offset at which the output operation
+is to be performed.
+.PP
+The file offset is not changed by these system calls.
+The file referred to by
+.I fd
+must be capable of seeking.
+.SS preadv2() and pwritev2()
+These system calls are similar to
+.BR preadv ()
+and
+.BR pwritev ()
+calls, but add a fifth argument,
+.IR flags ,
+which modifies the behavior on a per-call basis.
+.PP
+Unlike
+.BR preadv ()
+and
+.BR pwritev (),
+if the
+.I offset
+argument is \-1, then the current file offset is used and updated.
+.PP
+The
+.I flags
+argument contains a bitwise OR of zero or more of the following flags:
+.TP
+.BR RWF_DSYNC " (since Linux 4.7)"
+.\" commit e864f39569f4092c2b2bc72c773b6e486c7e3bd9
+Provide a per-write equivalent of the
+.B O_DSYNC
+.BR open (2)
+flag.
+This flag is meaningful only for
+.BR pwritev2 (),
+and its effect applies only to the data range written by the system call.
+.TP
+.BR RWF_HIPRI " (since Linux 4.6)"
+High priority read/write.
+Allows block-based filesystems to use polling of the device,
+which provides lower latency, but may use additional resources.
+(Currently, this feature is usable only on a file descriptor opened using the
+.B O_DIRECT
+flag.)
+.TP
+.BR RWF_SYNC " (since Linux 4.7)"
+.\" commit e864f39569f4092c2b2bc72c773b6e486c7e3bd9
+Provide a per-write equivalent of the
+.B O_SYNC
+.BR open (2)
+flag.
+This flag is meaningful only for
+.BR pwritev2 (),
+and its effect applies only to the data range written by the system call.
+.TP
+.BR RWF_NOWAIT " (since Linux 4.14)"
+.\" commit 3239d834847627b6634a4139cf1dc58f6f137a46
+.\" commit 91f9943e1c7b6638f27312d03fe71fcc67b23571
+Do not wait for data which is not immediately available.
+If this flag is specified, the
+.BR preadv2 ()
+system call will return instantly if it would have to read data from
+the backing storage or wait for a lock.
+If some data was successfully read, it will return the number of bytes read.
+If no bytes were read, it will return \-1 and set
+.I errno
+to
+.B EAGAIN
+(but see
+.BR BUGS ).
+Currently, this flag is meaningful only for
+.BR preadv2 ().
+.TP
+.BR RWF_APPEND " (since Linux 4.16)"
+.\" commit e1fc742e14e01d84d9693c4aca4ab23da65811fb
+Provide a per-write equivalent of the
+.B O_APPEND
+.BR open (2)
+flag.
+This flag is meaningful only for
+.BR pwritev2 (),
+and its effect applies only to the data range written by the system call.
+The
+.I offset
+argument does not affect the write operation;
+the data is always appended to the end of the file.
+However, if the
+.I offset
+argument is \-1, the current file offset is updated.
+.SH RETURN VALUE
+On success,
+.BR readv (),
+.BR preadv (),
+and
+.BR preadv2 ()
+return the number of bytes read;
+.BR writev (),
+.BR pwritev (),
+and
+.BR pwritev2 ()
+return the number of bytes written.
+.PP
+Note that it is not an error for a successful call to transfer fewer bytes
+than requested (see
+.BR read (2)
+and
+.BR write (2)).
+.PP
+On error, \-1 is returned, and \fIerrno\fP is set to indicate the error.
+.SH ERRORS
+The errors are as given for
+.BR read (2)
+and
+.BR write (2).
+Furthermore,
+.BR preadv (),
+.BR preadv2 (),
+.BR pwritev (),
+and
+.BR pwritev2 ()
+can also fail for the same reasons as
+.BR lseek (2).
+Additionally, the following errors are defined:
+.TP
+.B EINVAL
+The sum of the
+.I iov_len
+values overflows an
+.I ssize_t
+value.
+.TP
+.B EINVAL
+The vector count,
+.IR iovcnt ,
+is less than zero or greater than the permitted maximum.
+.TP
+.B EOPNOTSUPP
+An unknown flag is specified in \fIflags\fP.
+.SH VERSIONS
+.SS C library/kernel differences
+The raw
+.BR preadv ()
+and
+.BR pwritev ()
+system calls have call signatures that differ slightly from that of the
+corresponding GNU C library wrapper functions shown in the SYNOPSIS.
+The final argument,
+.IR offset ,
+is unpacked by the wrapper functions into two arguments in the system calls:
+.PP
+.BI " unsigned long " pos_l ", unsigned long " pos_h
+.PP
+These arguments contain, respectively, the low order and high order 32 bits of
+.IR offset .
+.SH STANDARDS
+.TP
+.BR readv ()
+.TQ
+.BR writev ()
+POSIX.1-2008.
+.TP
+.BR preadv ()
+.TQ
+.BR pwritev ()
+BSD.
+.TP
+.BR preadv2 ()
+.TQ
+.BR pwritev2 ()
+Linux.
+.SH HISTORY
+.TP
+.BR readv ()
+.TQ
+.BR writev ()
+POSIX.1-2001,
+4.4BSD (first appeared in 4.2BSD).
+.\" Linux libc5 used \fIsize_t\fP as the type of the \fIiovcnt\fP argument,
+.\" and \fIint\fP as the return type.
+.\" The readv/writev system calls were buggy before Linux 1.3.40.
+.\" (Says release.libc.)
+.PP
+.BR preadv (),
+.BR pwritev ():
+Linux 2.6.30,
+glibc 2.10.
+.PP
+.BR preadv2 (),
+.BR pwritev2 ():
+Linux 4.6,
+glibc 2.26.
+.SS Historical C library/kernel differences
+To deal with the fact that
+.B IOV_MAX
+was so low on early versions of Linux,
+the glibc wrapper functions for
+.BR readv ()
+and
+.BR writev ()
+did some extra work if they detected that the underlying kernel
+system call failed because this limit was exceeded.
+In the case of
+.BR readv (),
+the wrapper function allocated a temporary buffer large enough
+for all of the items specified by
+.IR iov ,
+passed that buffer in a call to
+.BR read (2),
+copied data from the buffer to the locations specified by the
+.I iov_base
+fields of the elements of
+.IR iov ,
+and then freed the buffer.
+The wrapper function for
+.BR writev ()
+performed the analogous task using a temporary buffer and a call to
+.BR write (2).
+.PP
+The need for this extra effort in the glibc wrapper functions
+went away with Linux 2.2 and later.
+However, glibc continued to provide this behavior until glibc 2.10.
+Starting with glibc 2.9,
+the wrapper functions provide this behavior only if the library detects
+that the system is running a Linux kernel older than Linux 2.6.18
+(an arbitrarily selected kernel version).
+And since glibc 2.20
+(which requires a minimum of Linux 2.6.32),
+the glibc wrapper functions always just directly invoke the system calls.
+.SH NOTES
+POSIX.1 allows an implementation to place a limit on
+the number of items that can be passed in
+.IR iov .
+An implementation can advertise its limit by defining
+.B IOV_MAX
+in
+.I <limits.h>
+or at run time via the return value from
+.IR sysconf(_SC_IOV_MAX) .
+On modern Linux systems, the limit is 1024.
+Back in Linux 2.0 days, this limit was 16.
+.\"
+.\"
+.SH BUGS
+Linux 5.9 and Linux 5.10 have a bug where
+.BR preadv2 ()
+with the
+.B RWF_NOWAIT
+flag may return 0 even when not at end of file.
+.\" See
+.\" <https://lore.kernel.org/linux-fsdevel/fea8b16d-5a69-40f9-b123-e84dcd6e8f2e@www.fastmail.com/T/#u>
+.\" The bug was introduced in
+.\" efa8480a831 fs: RWF_NOWAIT should imply IOCB_NOIO
+.\" and fixed in
+.\" 06c0444290 mm/filemap.c: generic_file_buffered_read() now uses find_get_pages_contig
+.SH EXAMPLES
+The following code sample demonstrates the use of
+.BR writev ():
+.PP
+.in +4n
+.EX
+char *str0 = "hello ";
+char *str1 = "world\en";
+ssize_t nwritten;
+struct iovec iov[2];
+\&
+iov[0].iov_base = str0;
+iov[0].iov_len = strlen(str0);
+iov[1].iov_base = str1;
+iov[1].iov_len = strlen(str1);
+\&
+nwritten = writev(STDOUT_FILENO, iov, 2);
+.EE
+.in
+.SH SEE ALSO
+.BR pread (2),
+.BR read (2),
+.BR write (2)
diff --git a/man2/reboot.2 b/man2/reboot.2
new file mode 100644
index 0000000..681087f
--- /dev/null
+++ b/man2/reboot.2
@@ -0,0 +1,236 @@
+.\" Copyright (c) 1998 Andries Brouwer (aeb@cwi.nl), 24 September 1998
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified, 27 May 2004, Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Added notes on capability requirements
+.\"
+.TH reboot 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+reboot \- reboot or enable/disable Ctrl-Alt-Del
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.RB "/* Since Linux 2.1.30 there are symbolic names " LINUX_REBOOT_*
+ for the constants and a fourth argument to the call: */
+.PP
+.BR "#include <linux/reboot.h> " \
+"/* Definition of " LINUX_REBOOT_* " constants */"
+.BR "#include <sys/syscall.h> " "/* Definition of " SYS_* " constants */"
+.B #include <unistd.h>
+.PP
+.BI "int syscall(SYS_reboot, int " magic ", int " magic2 ", int " cmd ", void *" arg );
+.PP
+/* Under glibc and most alternative libc's (including uclibc, dietlibc,
+ musl and a few others), some of the constants involved have gotten
+.RB " symbolic names " RB_* ", and the library call is a 1-argument"
+ wrapper around the system call: */
+.PP
+.BR "#include <sys/reboot.h> " "/* Definition of " RB_* " constants */"
+.B #include <unistd.h>
+.PP
+.BI "int reboot(int " cmd );
+.fi
+.SH DESCRIPTION
+The
+.BR reboot ()
+call reboots the system, or enables/disables the reboot keystroke
+(abbreviated CAD, since the default is Ctrl-Alt-Delete;
+it can be changed using
+.BR loadkeys (1)).
+.PP
+This system call fails (with the error
+.BR EINVAL )
+unless
+.I magic
+equals
+.B LINUX_REBOOT_MAGIC1
+(that is, 0xfee1dead) and
+.I magic2
+equals
+.B LINUX_REBOOT_MAGIC2
+(that is, 0x28121969).
+However, since Linux 2.1.17 also
+.B LINUX_REBOOT_MAGIC2A
+(that is, 0x05121996)
+and since Linux 2.1.97 also
+.B LINUX_REBOOT_MAGIC2B
+(that is, 0x16041998)
+and since Linux 2.5.71 also
+.B LINUX_REBOOT_MAGIC2C
+(that is, 0x20112000)
+are permitted as values for
+.IR magic2 .
+(The hexadecimal values of these constants are meaningful.)
+.PP
+The
+.I cmd
+argument can have the following values:
+.TP
+.B LINUX_REBOOT_CMD_CAD_OFF
+.RB ( RB_DISABLE_CAD ,
+0).
+CAD is disabled.
+This means that the CAD keystroke will cause a
+.B SIGINT
+signal to be
+sent to init (process 1), whereupon this process may decide upon a
+proper action (maybe: kill all processes, sync, reboot).
+.TP
+.B LINUX_REBOOT_CMD_CAD_ON
+.RB ( RB_ENABLE_CAD ,
+0x89abcdef).
+CAD is enabled.
+This means that the CAD keystroke will immediately cause
+the action associated with
+.BR LINUX_REBOOT_CMD_RESTART .
+.TP
+.B LINUX_REBOOT_CMD_HALT
+.RB ( RB_HALT_SYSTEM ,
+0xcdef0123; since Linux 1.1.76).
+The message "System halted." is printed, and the system is halted.
+Control is given to the ROM monitor, if there is one.
+If not preceded by a
+.BR sync (2),
+data will be lost.
+.TP
+.B LINUX_REBOOT_CMD_KEXEC
+.RB ( RB_KEXEC ,
+0x45584543; since Linux 2.6.13).
+Execute a kernel that has been loaded earlier with
+.BR kexec_load (2).
+This option is available only if the kernel was configured with
+.BR CONFIG_KEXEC .
+.TP
+.B LINUX_REBOOT_CMD_POWER_OFF
+.RB ( RB_POWER_OFF ,
+0x4321fedc; since Linux 2.1.30).
+The message "Power down." is printed, the system is stopped,
+and all power is removed from the system, if possible.
+If not preceded by a
+.BR sync (2),
+data will be lost.
+.TP
+.B LINUX_REBOOT_CMD_RESTART
+.RB ( RB_AUTOBOOT ,
+0x1234567).
+The message "Restarting system." is printed, and a default
+restart is performed immediately.
+If not preceded by a
+.BR sync (2),
+data will be lost.
+.TP
+.B LINUX_REBOOT_CMD_RESTART2
+(0xa1b2c3d4; since Linux 2.1.30).
+The message "Restarting system with command \[aq]%s\[aq]" is printed,
+and a restart (using the command string given in
+.IR arg )
+is performed immediately.
+If not preceded by a
+.BR sync (2),
+data will be lost.
+.TP
+.B LINUX_REBOOT_CMD_SW_SUSPEND
+.RB ( RB_SW_SUSPEND ,
+0xd000fce1; since Linux 2.5.18).
+The system is suspended (hibernated) to disk.
+This option is available only if the kernel was configured with
+.BR CONFIG_HIBERNATION .
+.PP
+Only the superuser may call
+.BR reboot ().
+.PP
+The precise effect of the above actions depends on the architecture.
+For the i386 architecture, the additional argument does not do
+anything at present (2.1.122), but the type of reboot can be
+determined by kernel command-line arguments ("reboot=...") to be
+either warm or cold, and either hard or through the BIOS.
+.\"
+.SS Behavior inside PID namespaces
+.\" commit cf3f89214ef6a33fad60856bc5ffd7bb2fc4709b
+.\" see also commit 923c7538236564c46ee80c253a416705321f13e3
+Since Linux 3.4,
+if
+.BR reboot ()
+is called
+from a PID namespace other than the initial PID namespace
+with one of the
+.I cmd
+values listed below,
+it performs a "reboot" of that namespace:
+the "init" process of the PID namespace is immediately terminated,
+with the effects described in
+.BR pid_namespaces (7).
+.PP
+The values that can be supplied in
+.I cmd
+when calling
+.BR reboot ()
+in this case are as follows:
+.TP
+.BR LINUX_REBOOT_CMD_RESTART ", " LINUX_REBOOT_CMD_RESTART2
+The "init" process is terminated,
+and
+.BR wait (2)
+in the parent process reports that the child was killed with a
+.B SIGHUP
+signal.
+.TP
+.BR LINUX_REBOOT_CMD_POWER_OFF ", " LINUX_REBOOT_CMD_HALT
+The "init" process is terminated,
+and
+.BR wait (2)
+in the parent process reports that the child was killed with a
+.B SIGINT
+signal.
+.PP
+For the other
+.I cmd
+values,
+.BR reboot ()
+returns \-1 and
+.I errno
+is set to
+.BR EINVAL .
+.SH RETURN VALUE
+For the values of
+.I cmd
+that stop or restart the system,
+a successful call to
+.BR reboot ()
+does not return.
+For the other
+.I cmd
+values, zero is returned on success.
+In all cases, \-1 is returned on failure, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EFAULT
+Problem with getting user-space data under
+.BR LINUX_REBOOT_CMD_RESTART2 .
+.TP
+.B EINVAL
+Bad magic numbers or \fIcmd\fP.
+.TP
+.B EPERM
+The calling process has insufficient privilege to call
+.BR reboot ();
+the caller must have the
+.B CAP_SYS_BOOT
+capability inside its user namespace.
+.SH STANDARDS
+Linux.
+.SH SEE ALSO
+.BR systemctl (1),
+.BR systemd (1),
+.BR kexec_load (2),
+.BR sync (2),
+.BR bootparam (7),
+.BR capabilities (7),
+.BR ctrlaltdel (8),
+.BR halt (8),
+.BR shutdown (8)
diff --git a/man2/recv.2 b/man2/recv.2
new file mode 100644
index 0000000..395236d
--- /dev/null
+++ b/man2/recv.2
@@ -0,0 +1,563 @@
+.\" Copyright (c) 1983, 1990, 1991 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" SPDX-License-Identifier: BSD-4-Clause-UC
+.\"
+.\" $Id: recv.2,v 1.3 1999/05/13 11:33:38 freitag Exp $
+.\"
+.\" Modified Sat Jul 24 00:22:20 1993 by Rik Faith <faith@cs.unc.edu>
+.\" Modified Tue Oct 22 17:45:19 1996 by Eric S. Raymond <esr@thyrsus.com>
+.\" Modified 1998,1999 by Andi Kleen
+.\" 2001-06-19 corrected SO_EE_OFFENDER, bug report by James Hawtin
+.\"
+.TH recv 2 2023-07-18 "Linux man-pages 6.05.01"
+.SH NAME
+recv, recvfrom, recvmsg \- receive a message from a socket
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/socket.h>
+.PP
+.BI "ssize_t recv(int " sockfd ", void " buf [. len "], size_t " len ,
+.BI " int " flags );
+.BI "ssize_t recvfrom(int " sockfd ", void " buf "[restrict ." len "], size_t " len ,
+.BI " int " flags ,
+.BI " struct sockaddr *_Nullable restrict " src_addr ,
+.BI " socklen_t *_Nullable restrict " addrlen );
+.BI "ssize_t recvmsg(int " sockfd ", struct msghdr *" msg ", int " flags );
+.fi
+.SH DESCRIPTION
+The
+.BR recv (),
+.BR recvfrom (),
+and
+.BR recvmsg ()
+calls are used to receive messages from a socket.
+They may be used
+to receive data on both connectionless and connection-oriented sockets.
+This page first describes common features of all three system calls,
+and then describes the differences between the calls.
+.PP
+The only difference between
+.BR recv ()
+and
+.BR read (2)
+is the presence of
+.IR flags .
+With a zero
+.I flags
+argument,
+.BR recv ()
+is generally equivalent to
+.BR read (2)
+(but see NOTES).
+Also, the following call
+.PP
+.in +4n
+.EX
+recv(sockfd, buf, len, flags);
+.EE
+.in
+.PP
+is equivalent to
+.PP
+.in +4n
+.EX
+recvfrom(sockfd, buf, len, flags, NULL, NULL);
+.EE
+.in
+.PP
+All three calls return the length of the message on successful
+completion.
+If a message is too long to fit in the supplied buffer, excess
+bytes may be discarded depending on the type of socket the message is
+received from.
+.PP
+If no messages are available at the socket, the receive calls wait for a
+message to arrive, unless the socket is nonblocking (see
+.BR fcntl (2)),
+in which case the value \-1 is returned and
+.I errno
+is set to
+.BR EAGAIN " or " EWOULDBLOCK .
+The receive calls normally return any data available, up to the requested
+amount, rather than waiting for receipt of the full amount requested.
+.PP
+An application can use
+.BR select (2),
+.BR poll (2),
+or
+.BR epoll (7)
+to determine when more data arrives on a socket.
+.SS The flags argument
+The
+.I flags
+argument is formed by ORing one or more of the following values:
+.TP
+.BR MSG_CMSG_CLOEXEC " (" recvmsg "() only; since Linux 2.6.23)"
+Set the close-on-exec flag for the file descriptor received
+via a UNIX domain file descriptor using the
+.B SCM_RIGHTS
+operation (described in
+.BR unix (7)).
+This flag is useful for the same reasons as the
+.B O_CLOEXEC
+flag of
+.BR open (2).
+.TP
+.BR MSG_DONTWAIT " (since Linux 2.2)"
+Enables nonblocking operation; if the operation would block,
+the call fails with the error
+.BR EAGAIN " or " EWOULDBLOCK .
+This provides similar behavior to setting the
+.B O_NONBLOCK
+flag (via the
+.BR fcntl (2)
+.B F_SETFL
+operation), but differs in that
+.B MSG_DONTWAIT
+is a per-call option, whereas
+.B O_NONBLOCK
+is a setting on the open file description (see
+.BR open (2)),
+which will affect all threads in the calling process
+as well as other processes that hold file descriptors
+referring to the same open file description.
+.TP
+.BR MSG_ERRQUEUE " (since Linux 2.2)"
+This flag
+specifies that queued errors should be received from the socket error queue.
+The error is passed in
+an ancillary message with a type dependent on the protocol (for IPv4
+.BR IP_RECVERR ).
+The user should supply a buffer of sufficient size.
+See
+.BR cmsg (3)
+and
+.BR ip (7)
+for more information.
+The payload of the original packet that caused the error
+is passed as normal data via
+.IR msg_iov .
+The original destination address of the datagram that caused the error
+is supplied via
+.IR msg_name .
+.IP
+The error is supplied in a
+.I sock_extended_err
+structure:
+.IP
+.in +4n
+.EX
+#define SO_EE_ORIGIN_NONE 0
+#define SO_EE_ORIGIN_LOCAL 1
+#define SO_EE_ORIGIN_ICMP 2
+#define SO_EE_ORIGIN_ICMP6 3
+\&
+struct sock_extended_err
+{
+ uint32_t ee_errno; /* Error number */
+ uint8_t ee_origin; /* Where the error originated */
+ uint8_t ee_type; /* Type */
+ uint8_t ee_code; /* Code */
+ uint8_t ee_pad; /* Padding */
+ uint32_t ee_info; /* Additional information */
+ uint32_t ee_data; /* Other data */
+ /* More data may follow */
+};
+\&
+struct sockaddr *SO_EE_OFFENDER(struct sock_extended_err *);
+.EE
+.in
+.IP
+.I ee_errno
+contains the
+.I errno
+number of the queued error.
+.I ee_origin
+is the origin code of where the error originated.
+The other fields are protocol-specific.
+The macro
+.B SO_EE_OFFENDER
+returns a pointer to the address of the network object
+where the error originated from given a pointer to the ancillary message.
+If this address is not known, the
+.I sa_family
+member of the
+.I sockaddr
+contains
+.B AF_UNSPEC
+and the other fields of the
+.I sockaddr
+are undefined.
+The payload of the packet that caused the error is passed as normal data.
+.IP
+For local errors, no address is passed (this
+can be checked with the
+.I cmsg_len
+member of the
+.IR cmsghdr ).
+For error receives,
+the
+.B MSG_ERRQUEUE
+flag is set in the
+.IR msghdr .
+After an error has been passed, the pending socket error
+is regenerated based on the next queued error and will be passed
+on the next socket operation.
+.TP
+.B MSG_OOB
+This flag requests receipt of out-of-band data that would not be received
+in the normal data stream.
+Some protocols place expedited data
+at the head of the normal data queue, and thus this flag cannot
+be used with such protocols.
+.TP
+.B MSG_PEEK
+This flag causes the receive operation to
+return data from the beginning of the
+receive queue without removing that data from the queue.
+Thus, a
+subsequent receive call will return the same data.
+.TP
+.BR MSG_TRUNC " (since Linux 2.2)"
+For raw
+.RB ( AF_PACKET ),
+Internet datagram (since Linux 2.4.27/2.6.8),
+netlink (since Linux 2.6.22),
+and UNIX datagram as well as sequenced-packet
+.\" commit 9f6f9af7694ede6314bed281eec74d588ba9474f
+(since Linux 3.4) sockets:
+return the real length of the packet or datagram,
+even when it was longer than the passed buffer.
+.IP
+For use with Internet stream sockets, see
+.BR tcp (7).
+.TP
+.BR MSG_WAITALL " (since Linux 2.2)"
+This flag requests that the operation block until the full request is
+satisfied.
+However, the call may still return less data than requested if
+a signal is caught, an error or disconnect occurs, or the next data to be
+received is of a different type than that returned.
+This flag has no effect for datagram sockets.
+.\"
+.SS recvfrom()
+.BR recvfrom ()
+places the received message into the buffer
+.IR buf .
+The caller must specify the size of the buffer in
+.IR len .
+.PP
+If
+.I src_addr
+is not NULL,
+and the underlying protocol provides the source address of the message,
+that source address is placed in the buffer pointed to by
+.IR src_addr .
+.\" (Note: for datagram sockets in both the UNIX and Internet domains,
+.\" .I src_addr
+.\" is filled in.
+.\" .I src_addr
+.\" is also filled in for stream sockets in the UNIX domain, but is not
+.\" filled in for stream sockets in the Internet domain.)
+.\" [The above notes on AF_UNIX and AF_INET sockets apply as at
+.\" Kernel 2.4.18. (MTK, 22 Jul 02)]
+In this case,
+.I addrlen
+is a value-result argument.
+Before the call,
+it should be initialized to the size of the buffer associated with
+.IR src_addr .
+Upon return,
+.I addrlen
+is updated to contain the actual size of the source address.
+The returned address is truncated if the buffer provided is too small;
+in this case,
+.I addrlen
+will return a value greater than was supplied to the call.
+.PP
+If the caller is not interested in the source address,
+.I src_addr
+and
+.I addrlen
+should be specified as NULL.
+.\"
+.SS recv()
+The
+.BR recv ()
+call is normally used only on a
+.I connected
+socket (see
+.BR connect (2)).
+It is equivalent to the call:
+.PP
+.in +4n
+.EX
+recvfrom(sockfd, buf, len, flags, NULL, NULL);
+.EE
+.in
+.\"
+.SS recvmsg()
+The
+.BR recvmsg ()
+call uses a
+.I msghdr
+structure to minimize the number of directly supplied arguments.
+This structure is defined as follows in
+.IR <sys/socket.h> :
+.PP
+.in +4n
+.EX
+struct msghdr {
+ void *msg_name; /* Optional address */
+ socklen_t msg_namelen; /* Size of address */
+ struct iovec *msg_iov; /* Scatter/gather array */
+ size_t msg_iovlen; /* # elements in msg_iov */
+ void *msg_control; /* Ancillary data, see below */
+ size_t msg_controllen; /* Ancillary data buffer len */
+ int msg_flags; /* Flags on received message */
+};
+.EE
+.in
+.PP
+The
+.I msg_name
+field points to a caller-allocated buffer that is used to
+return the source address if the socket is unconnected.
+The caller should set
+.I msg_namelen
+to the size of this buffer before this call;
+upon return from a successful call,
+.I msg_namelen
+will contain the length of the returned address.
+If the application does not need to know the source address,
+.I msg_name
+can be specified as NULL.
+.PP
+The fields
+.I msg_iov
+and
+.I msg_iovlen
+describe scatter-gather locations, as discussed in
+.BR readv (2).
+.PP
+The field
+.IR msg_control ,
+which has length
+.IR msg_controllen ,
+points to a buffer for other protocol control-related messages or
+miscellaneous ancillary data.
+When
+.BR recvmsg ()
+is called,
+.I msg_controllen
+should contain the length of the available buffer in
+.IR msg_control ;
+upon return from a successful call it will contain the length
+of the control message sequence.
+.PP
+The messages are of the form:
+.PP
+.in +4n
+.EX
+struct cmsghdr {
+ size_t cmsg_len; /* Data byte count, including header
+ (type is socklen_t in POSIX) */
+ int cmsg_level; /* Originating protocol */
+ int cmsg_type; /* Protocol\-specific type */
+/* followed by
+ unsigned char cmsg_data[]; */
+};
+.EE
+.in
+.PP
+Ancillary data should be accessed only by the macros defined in
+.BR cmsg (3).
+.PP
+As an example, Linux uses this ancillary data mechanism to pass extended
+errors, IP options, or file descriptors over UNIX domain sockets.
+For further information on the use of ancillary data in various
+socket domains, see
+.BR unix (7)
+and
+.BR ip (7).
+.PP
+The
+.I msg_flags
+field in the
+.I msghdr
+is set on return of
+.BR recvmsg ().
+It can contain several flags:
+.TP
+.B MSG_EOR
+indicates end-of-record; the data returned completed a record (generally
+used with sockets of type
+.BR SOCK_SEQPACKET ).
+.TP
+.B MSG_TRUNC
+indicates that the trailing portion of a datagram was discarded because the
+datagram was larger than the buffer supplied.
+.TP
+.B MSG_CTRUNC
+indicates that some control data was discarded due to lack of space in the
+buffer for ancillary data.
+.TP
+.B MSG_OOB
+is returned to indicate that expedited or out-of-band data was received.
+.TP
+.B MSG_ERRQUEUE
+indicates that no data was received, but rather an extended error from the
+socket error queue.
+.TP
+.BR MSG_CMSG_CLOEXEC " (since Linux 2.6.23)"
+.\" commit 4a19542e5f694cd408a32c3d9dc593ba9366e2d7
+indicates that
+.B MSG_CMSG_CLOEXEC
+was specified in the
+.I flags
+argument of
+.BR recvmsg ().
+.SH RETURN VALUE
+These calls return the number of bytes received, or \-1
+if an error occurred.
+In the event of an error,
+.I errno
+is set to indicate the error.
+.PP
+When a stream socket peer has performed an orderly shutdown,
+the return value will be 0 (the traditional "end-of-file" return).
+.PP
+Datagram sockets in various domains (e.g., the UNIX and Internet domains)
+permit zero-length datagrams.
+When such a datagram is received, the return value is 0.
+.PP
+The value 0 may also be returned if the requested number of bytes
+to receive from a stream socket was 0.
+.SH ERRORS
+These are some standard errors generated by the socket layer.
+Additional errors
+may be generated and returned from the underlying protocol modules;
+see their manual pages.
+.TP
+.BR EAGAIN " or " EWOULDBLOCK
+.\" Actually EAGAIN on Linux
+The socket is marked nonblocking and the receive operation
+would block, or a receive timeout had been set and the timeout expired
+before data was received.
+POSIX.1 allows either error to be returned for this case,
+and does not require these constants to have the same value,
+so a portable application should check for both possibilities.
+.TP
+.B EBADF
+The argument
+.I sockfd
+is an invalid file descriptor.
+.TP
+.B ECONNREFUSED
+A remote host refused to allow the network connection (typically
+because it is not running the requested service).
+.TP
+.B EFAULT
+The receive buffer pointer(s) point outside the process's
+address space.
+.TP
+.B EINTR
+The receive was interrupted by delivery of a signal before
+any data was available; see
+.BR signal (7).
+.TP
+.B EINVAL
+Invalid argument passed.
+.\" e.g., msg_namelen < 0 for recvmsg() or addrlen < 0 for recvfrom()
+.TP
+.B ENOMEM
+Could not allocate memory for
+.BR recvmsg ().
+.TP
+.B ENOTCONN
+The socket is associated with a connection-oriented protocol
+and has not been connected (see
+.BR connect (2)
+and
+.BR accept (2)).
+.TP
+.B ENOTSOCK
+The file descriptor
+.I sockfd
+does not refer to a socket.
+.SH VERSIONS
+According to POSIX.1,
+.\" POSIX.1-2001, POSIX.1-2008
+the
+.I msg_controllen
+field of the
+.I msghdr
+structure should be typed as
+.IR socklen_t ,
+and the
+.I msg_iovlen
+field should be typed as
+.IR int ,
+but glibc currently types both as
+.IR size_t .
+.\" glibc bug for msg_controllen raised 12 Mar 2006
+.\" http://sourceware.org/bugzilla/show_bug.cgi?id=2448
+.\" The problem is an underlying kernel issue: the size of the
+.\" __kernel_size_t type used to type these fields varies
+.\" across architectures, but socklen_t is always 32 bits,
+.\" as (at least with GCC) is int.
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+POSIX.1-2001,
+4.4BSD (first appeared in 4.2BSD).
+.PP
+POSIX.1 describes only the
+.BR MSG_OOB ,
+.BR MSG_PEEK ,
+and
+.B MSG_WAITALL
+flags.
+.SH NOTES
+If a zero-length datagram is pending,
+.BR read (2)
+and
+.BR recv ()
+with a
+.I flags
+argument of zero provide different behavior.
+In this circumstance,
+.BR read (2)
+has no effect (the datagram remains pending), while
+.BR recv ()
+consumes the pending datagram.
+.PP
+See
+.BR recvmmsg (2)
+for information about a Linux-specific system call
+that can be used to receive multiple datagrams in a single call.
+.SH EXAMPLES
+An example of the use of
+.BR recvfrom ()
+is shown in
+.BR getaddrinfo (3).
+.SH SEE ALSO
+.BR fcntl (2),
+.BR getsockopt (2),
+.BR read (2),
+.BR recvmmsg (2),
+.BR select (2),
+.BR shutdown (2),
+.BR socket (2),
+.BR cmsg (3),
+.BR sockatmark (3),
+.BR ip (7),
+.BR ipv6 (7),
+.BR socket (7),
+.BR tcp (7),
+.BR udp (7),
+.BR unix (7)
diff --git a/man2/recvfrom.2 b/man2/recvfrom.2
new file mode 100644
index 0000000..13228c3
--- /dev/null
+++ b/man2/recvfrom.2
@@ -0,0 +1 @@
+.so man2/recv.2
diff --git a/man2/recvmmsg.2 b/man2/recvmmsg.2
new file mode 100644
index 0000000..d5b0f5a
--- /dev/null
+++ b/man2/recvmmsg.2
@@ -0,0 +1,276 @@
+.\" Copyright (C) 2011 by Andi Kleen <andi@firstfloor.org>
+.\" and Copyright (c) 2011 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Syscall added in following commit
+.\" commit a2e2725541fad72416326798c2d7fa4dafb7d337
+.\" Author: Arnaldo Carvalho de Melo <acme@redhat.com>
+.\" Date: Mon Oct 12 23:40:10 2009 -0700
+.\"
+.TH recvmmsg 2 2023-05-03 "Linux man-pages 6.05.01"
+.SH NAME
+recvmmsg \- receive multiple messages on a socket
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#define _GNU_SOURCE" " /* See feature_test_macros(7) */"
+.B #include <sys/socket.h>
+.PP
+.BI "int recvmmsg(int " sockfd ", struct mmsghdr *" msgvec \
+", unsigned int " vlen ","
+.BI " int " flags ", struct timespec *" timeout ");"
+.fi
+.SH DESCRIPTION
+The
+.BR recvmmsg ()
+system call is an extension of
+.BR recvmsg (2)
+that allows the caller to receive multiple messages from a socket
+using a single system call.
+(This has performance benefits for some applications.)
+A further extension over
+.BR recvmsg (2)
+is support for a timeout on the receive operation.
+.PP
+The
+.I sockfd
+argument is the file descriptor of the socket to receive data from.
+.PP
+The
+.I msgvec
+argument is a pointer to an array of
+.I mmsghdr
+structures.
+The size of this array is specified in
+.IR vlen .
+.PP
+The
+.I mmsghdr
+structure is defined in
+.I <sys/socket.h>
+as:
+.PP
+.in +4n
+.EX
+struct mmsghdr {
+ struct msghdr msg_hdr; /* Message header */
+ unsigned int msg_len; /* Number of received bytes for header */
+};
+.EE
+.in
+.PP
+The
+.I msg_hdr
+field is a
+.I msghdr
+structure, as described in
+.BR recvmsg (2).
+The
+.I msg_len
+field is the number of bytes returned for the message in the entry.
+This field has the same value as the return value of a single
+.BR recvmsg (2)
+on the header.
+.PP
+The
+.I flags
+argument contains flags ORed together.
+The flags are the same as documented for
+.BR recvmsg (2),
+with the following addition:
+.TP
+.BR MSG_WAITFORONE " (since Linux 2.6.34)"
+Turns on
+.B MSG_DONTWAIT
+after the first message has been received.
+.PP
+The
+.I timeout
+argument points to a
+.I struct timespec
+(see
+.BR clock_gettime (2))
+defining a timeout (seconds plus nanoseconds) for the receive operation
+.RI ( "but see BUGS!" ).
+(This interval will be rounded up to the system clock granularity,
+and kernel scheduling delays mean that the blocking interval
+may overrun by a small amount.)
+If
+.I timeout
+is NULL, then the operation blocks indefinitely.
+.PP
+A blocking
+.BR recvmmsg ()
+call blocks until
+.I vlen
+messages have been received
+or until the timeout expires.
+A nonblocking call reads as many messages as are available
+(up to the limit specified by
+.IR vlen )
+and returns immediately.
+.PP
+On return from
+.BR recvmmsg (),
+successive elements of
+.I msgvec
+are updated to contain information about each received message:
+.I msg_len
+contains the size of the received message;
+the subfields of
+.I msg_hdr
+are updated as described in
+.BR recvmsg (2).
+The return value of the call indicates the number of elements of
+.I msgvec
+that have been updated.
+.SH RETURN VALUE
+On success,
+.BR recvmmsg ()
+returns the number of messages received in
+.IR msgvec ;
+on error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+Errors are as for
+.BR recvmsg (2).
+In addition, the following error can occur:
+.TP
+.B EINVAL
+.I timeout
+is invalid.
+.PP
+See also BUGS.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 2.6.33,
+glibc 2.12.
+.SH BUGS
+The
+.I timeout
+argument does not work as intended.
+.\" FIXME . https://bugzilla.kernel.org/show_bug.cgi?id=75371
+.\" http://thread.gmane.org/gmane.linux.man/5677
+The timeout is checked only after the receipt of each datagram,
+so that if up to
+.I vlen\-1
+datagrams are received before the timeout expires,
+but then no further datagrams are received, the call will block forever.
+.PP
+If an error occurs after at least one message has been received,
+the call succeeds, and returns the number of messages received.
+The error code is expected to be returned on a subsequent call to
+.BR recvmmsg ().
+In the current implementation, however, the error code can be overwritten
+in the meantime by an unrelated network event on a socket,
+for example an incoming ICMP packet.
+.SH EXAMPLES
+The following program uses
+.BR recvmmsg ()
+to receive multiple messages on a socket and stores
+them in multiple buffers.
+The call returns if all buffers are filled or if the
+timeout specified has expired.
+.PP
+The following snippet periodically generates UDP datagrams
+containing a random number:
+.PP
+.in +4n
+.EX
+.RB "$" " while true; do echo $RANDOM > /dev/udp/127.0.0.1/1234;"
+.B " sleep 0.25; done"
+.EE
+.in
+.PP
+These datagrams are read by the example application, which
+can give the following output:
+.PP
+.in +4n
+.EX
+.RB "$" " ./a.out"
+5 messages received
+1 11782
+2 11345
+3 304
+4 13514
+5 28421
+.EE
+.in
+.SS Program source
+\&
+.\" SRC BEGIN (recvmmsg.c)
+.EX
+#define _GNU_SOURCE
+#include <arpa/inet.h>
+#include <netinet/in.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <time.h>
+\&
+int
+main(void) /* Receive a batch of UDP datagrams and print them */
+{
+#define VLEN 10 /* Max messages per recvmmsg() call */
+#define BUFSIZE 200 /* Receive space per message, in bytes */
+#define TIMEOUT 1 /* recvmmsg() timeout, in seconds */
+ int sockfd, retval;
+ char bufs[VLEN][BUFSIZE+1]; /* +1 leaves room for a null byte */
+ struct iovec iovecs[VLEN];
+ struct mmsghdr msgs[VLEN];
+ struct timespec timeout;
+ struct sockaddr_in addr;
+\&
+ sockfd = socket(AF_INET, SOCK_DGRAM, 0);
+ if (sockfd == \-1) {
+ perror("socket()");
+ exit(EXIT_FAILURE);
+ }
+\&
+ addr.sin_family = AF_INET;
+ addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+ addr.sin_port = htons(1234); /* Same port the sender writes to */
+ if (bind(sockfd, (struct sockaddr *) &addr, sizeof(addr)) == \-1) {
+ perror("bind()");
+ exit(EXIT_FAILURE);
+ }
+\&
+ memset(msgs, 0, sizeof(msgs)); /* Zero every header field first */
+ for (size_t i = 0; i < VLEN; i++) {
+ iovecs[i].iov_base = bufs[i];
+ iovecs[i].iov_len = BUFSIZE; /* Last byte of bufs[i] stays free */
+ msgs[i].msg_hdr.msg_iov = &iovecs[i];
+ msgs[i].msg_hdr.msg_iovlen = 1; /* One buffer per message */
+ }
+\&
+ timeout.tv_sec = TIMEOUT;
+ timeout.tv_nsec = 0;
+\&
+ retval = recvmmsg(sockfd, msgs, VLEN, 0, &timeout);
+ if (retval == \-1) {
+ perror("recvmmsg()");
+ exit(EXIT_FAILURE);
+ }
+\&
+ printf("%d messages received\en", retval);
+ for (size_t i = 0; i < retval; i++) { /* retval = messages received */
+ bufs[i][msgs[i].msg_len] = 0; /* Terminate data before printing */
+ printf("%zu %s", i+1, bufs[i]);
+ }
+ exit(EXIT_SUCCESS);
+}
+.EE
+.\" SRC END
+.SH SEE ALSO
+.BR clock_gettime (2),
+.BR recvmsg (2),
+.BR sendmmsg (2),
+.BR sendmsg (2),
+.BR socket (2),
+.BR socket (7)
diff --git a/man2/recvmsg.2 b/man2/recvmsg.2
new file mode 100644
index 0000000..13228c3
--- /dev/null
+++ b/man2/recvmsg.2
@@ -0,0 +1 @@
+.so man2/recv.2
diff --git a/man2/remap_file_pages.2 b/man2/remap_file_pages.2
new file mode 100644
index 0000000..ab4ee51
--- /dev/null
+++ b/man2/remap_file_pages.2
@@ -0,0 +1,170 @@
+.\" Copyright (C) 2003, Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" 2003-12-10 Initial creation, Michael Kerrisk <mtk.manpages@gmail.com>
+.\" 2004-10-28 aeb, corrected prototype, prot must be 0
+.\"
+.TH remap_file_pages 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+remap_file_pages \- create a nonlinear file mapping
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#define _GNU_SOURCE" " /* See feature_test_macros(7) */"
+.B #include <sys/mman.h>
+.PP
+.BI "[[deprecated]] int remap_file_pages(void " addr [. size "], size_t " size ,
+.BI " int " prot ", size_t " pgoff ", \
+int " flags );
+.fi
+.SH DESCRIPTION
+.BR Note :
+.\" commit 33041a0d76d3c3e0aff28ac95a2ffdedf1282dbc
+.\" http://lwn.net/Articles/597632/
+this system call was marked as deprecated starting with Linux 3.16.
+In Linux 4.0, the implementation was replaced
+.\" commit c8d78c1823f46519473949d33f0d1d33fe21ea16
+by a slower in-kernel emulation.
+Those few applications that use this system call should
+consider migrating to alternatives.
+This change was made because the kernel code for this system call was complex,
+and it is believed to be little used or perhaps even completely unused.
+While it had some use cases in database applications on 32-bit systems,
+those use cases don't exist on 64-bit systems.
+.PP
+The
+.BR remap_file_pages ()
+system call is used to create a nonlinear mapping, that is, a mapping
+in which the pages of the file are mapped into a nonsequential order
+in memory.
+The advantage of using
+.BR remap_file_pages ()
+over using repeated calls to
+.BR mmap (2)
+is that the former approach does not require the kernel to create
+additional VMA (Virtual Memory Area) data structures.
+.PP
+To create a nonlinear mapping we perform the following steps:
+.TP 3
+1.
+Use
+.BR mmap (2)
+to create a mapping (which is initially linear).
+This mapping must be created with the
+.B MAP_SHARED
+flag.
+.TP
+2.
+Use one or more calls to
+.BR remap_file_pages ()
+to rearrange the correspondence between the pages of the mapping
+and the pages of the file.
+It is possible to map the same page of a file
+into multiple locations within the mapped region.
+.PP
+The
+.I pgoff
+and
+.I size
+arguments specify the region of the file that is to be relocated
+within the mapping:
+.I pgoff
+is a file offset in units of the system page size;
+.I size
+is the length of the region in bytes.
+.PP
+The
+.I addr
+argument serves two purposes.
+First, it identifies the mapping whose pages we want to rearrange.
+Thus,
+.I addr
+must be an address that falls within
+a region previously mapped by a call to
+.BR mmap (2).
+Second,
+.I addr
+specifies the address at which the file pages
+identified by
+.I pgoff
+and
+.I size
+will be placed.
+.PP
+The values specified in
+.I addr
+and
+.I size
+should be multiples of the system page size.
+If they are not, then the kernel rounds
+.I both
+values
+.I down
+to the nearest multiple of the page size.
+.\" This rounding is weird, and not consistent with the treatment of
+.\" the analogous arguments for munmap()/mprotect() and for mlock().
+.\" MTK, 14 Sep 2005
+.PP
+The
+.I prot
+argument must be specified as 0.
+.PP
+The
+.I flags
+argument has the same meaning as for
+.BR mmap (2),
+but all flags other than
+.B MAP_NONBLOCK
+are ignored.
+.SH RETURN VALUE
+On success,
+.BR remap_file_pages ()
+returns 0.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EINVAL
+.I addr
+does not refer to a valid mapping
+created with the
+.B MAP_SHARED
+flag.
+.TP
+.B EINVAL
+.IR addr ,
+.IR size ,
+.IR prot ,
+or
+.I pgoff
+is invalid.
+.\" And possibly others from vma->vm_ops->populate()
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 2.5.46,
+glibc 2.3.3.
+.SH NOTES
+Since Linux 2.6.23,
+.\" commit 3ee6dafc677a68e461a7ddafc94a580ebab80735
+.BR remap_file_pages ()
+creates nonlinear mappings only
+on in-memory filesystems such as
+.BR tmpfs (5),
+hugetlbfs or ramfs.
+On filesystems with a backing store,
+.BR remap_file_pages ()
+is not much more efficient than using
+.BR mmap (2)
+to adjust which parts of the file are mapped to which addresses.
+.SH SEE ALSO
+.BR getpagesize (2),
+.BR mmap (2),
+.BR mmap2 (2),
+.BR mprotect (2),
+.BR mremap (2),
+.BR msync (2)
diff --git a/man2/removexattr.2 b/man2/removexattr.2
new file mode 100644
index 0000000..1a9f53f
--- /dev/null
+++ b/man2/removexattr.2
@@ -0,0 +1,100 @@
+.\" Copyright (C) Andreas Gruenbacher, February 2001
+.\" Copyright (C) Silicon Graphics Inc, September 2001
+.\"
+.\" SPDX-License-Identifier: GPL-2.0-or-later
+.\"
+.TH removexattr 2 2023-04-08 "Linux man-pages 6.05.01"
+.SH NAME
+removexattr, lremovexattr, fremovexattr \- remove an extended attribute
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/xattr.h>
+.PP
+.BI "int removexattr(const char *" path ", const char *" name );
+.BI "int lremovexattr(const char *" path ", const char *" name );
+.BI "int fremovexattr(int " fd ", const char *" name );
+.fi
+.SH DESCRIPTION
+Extended attributes are
+.IR name : value
+pairs associated with inodes (files, directories, symbolic links, etc.).
+They are extensions to the normal attributes which are associated
+with all inodes in the system (i.e., the
+.BR stat (2)
+data).
+A complete overview of extended attribute concepts can be found in
+.BR xattr (7).
+.PP
+.BR removexattr ()
+removes the extended attribute identified by
+.I name
+and associated with the given
+.I path
+in the filesystem.
+.PP
+.BR lremovexattr ()
+is identical to
+.BR removexattr (),
+except in the case of a symbolic link, where the extended attribute is
+removed from the link itself, not the file that it refers to.
+.PP
+.BR fremovexattr ()
+is identical to
+.BR removexattr (),
+except that the extended attribute is removed from the open file referred to by
+.I fd
+(as returned by
+.BR open (2))
+in place of
+.IR path .
+.PP
+An extended attribute name is a null-terminated string.
+The
+.I name
+includes a namespace prefix; there may be several, disjoint
+namespaces associated with an individual inode.
+.SH RETURN VALUE
+On success, zero is returned.
+On failure, \-1 is returned and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B ENODATA
+The named attribute does not exist.
+.\" .RB ( ENOATTR
+.\" is defined to be a synonym for
+.\" .BR ENODATA
+.\" in
+.\" .IR <attr/attributes.h> .)
+.TP
+.B ENOTSUP
+Extended attributes are not supported by the filesystem, or are disabled.
+.PP
+In addition, the errors documented in
+.BR stat (2)
+can also occur.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 2.4,
+glibc 2.3.
+.\" .SH AUTHORS
+.\" Andreas Gruenbacher,
+.\" .RI < a.gruenbacher@computer.org >
+.\" and the SGI XFS development team,
+.\" .RI < linux-xfs@oss.sgi.com >.
+.\" Please send any bug reports or comments to these addresses.
+.SH SEE ALSO
+.BR getfattr (1),
+.BR setfattr (1),
+.BR getxattr (2),
+.BR listxattr (2),
+.BR open (2),
+.BR setxattr (2),
+.BR stat (2),
+.BR symlink (7),
+.BR xattr (7)
diff --git a/man2/rename.2 b/man2/rename.2
new file mode 100644
index 0000000..9963af6
--- /dev/null
+++ b/man2/rename.2
@@ -0,0 +1,549 @@
+.\" This manpage is Copyright (C) 1992 Drew Eckhardt;
+.\" and Copyright (C) 1993 Michael Haardt;
+.\" and Copyright (C) 1993,1995 Ian Jackson
+.\" and Copyright (C) 2006, 2014 Michael Kerrisk
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified Sat Jul 24 00:35:52 1993 by Rik Faith <faith@cs.unc.edu>
+.\" Modified Thu Jun 4 12:21:13 1998 by Andries Brouwer <aeb@cwi.nl>
+.\" Modified Thu Mar 3 09:49:35 2005 by Michael Haardt <michael@moria.de>
+.\" 2007-03-25, mtk, added various text to DESCRIPTION.
+.\"
+.TH rename 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+rename, renameat, renameat2 \- change the name or location of a file
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <stdio.h>
+.PP
+.BI "int rename(const char *" oldpath ", const char *" newpath );
+.PP
+.BR "#include <fcntl.h> " "/* Definition of " AT_* " constants */"
+.B #include <stdio.h>
+.PP
+.BI "int renameat(int " olddirfd ", const char *" oldpath ,
+.BI " int " newdirfd ", const char *" newpath );
+.BI "int renameat2(int " olddirfd ", const char *" oldpath ,
+.BI " int " newdirfd ", const char *" newpath \
+", unsigned int " flags );
+.fi
+.PP
+.RS -4
+Feature Test Macro Requirements for glibc (see
+.BR feature_test_macros (7)):
+.RE
+.PP
+.nf
+.BR renameat ():
+ Since glibc 2.10:
+ _POSIX_C_SOURCE >= 200809L
+ Before glibc 2.10:
+ _ATFILE_SOURCE
+.PP
+.BR renameat2 ():
+ _GNU_SOURCE
+.fi
+.SH DESCRIPTION
+.BR rename ()
+renames a file, moving it between directories if required.
+Any other hard links to the file (as created using
+.BR link (2))
+are unaffected.
+Open file descriptors for
+.I oldpath
+are also unaffected.
+.PP
+Various restrictions determine whether or not the rename operation succeeds:
+see ERRORS below.
+.PP
+If
+.I newpath
+already exists, it will be atomically replaced, so that there is
+no point at which another process attempting to access
+.I newpath
+will find it missing.
+However, there will probably be a window in which both
+.I oldpath
+and
+.I newpath
+refer to the file being renamed.
+.PP
+If
+.I oldpath
+and
+.I newpath
+are existing hard links referring to the same file, then
+.BR rename ()
+does nothing, and returns a success status.
+.PP
+If
+.I newpath
+exists but the operation fails for some reason,
+.BR rename ()
+guarantees to leave an instance of
+.I newpath
+in place.
+.PP
+.I oldpath
+can specify a directory.
+In this case,
+.I newpath
+must either not exist, or it must specify an empty directory.
+.PP
+If
+.I oldpath
+refers to a symbolic link, the link is renamed; if
+.I newpath
+refers to a symbolic link, the link will be overwritten.
+.SS renameat()
+The
+.BR renameat ()
+system call operates in exactly the same way as
+.BR rename (),
+except for the differences described here.
+.PP
+If the pathname given in
+.I oldpath
+is relative, then it is interpreted relative to the directory
+referred to by the file descriptor
+.I olddirfd
+(rather than relative to the current working directory of
+the calling process, as is done by
+.BR rename ()
+for a relative pathname).
+.PP
+If
+.I oldpath
+is relative and
+.I olddirfd
+is the special value
+.BR AT_FDCWD ,
+then
+.I oldpath
+is interpreted relative to the current working
+directory of the calling process (like
+.BR rename ()).
+.PP
+If
+.I oldpath
+is absolute, then
+.I olddirfd
+is ignored.
+.PP
+The interpretation of
+.I newpath
+is as for
+.IR oldpath ,
+except that a relative pathname is interpreted relative
+to the directory referred to by the file descriptor
+.IR newdirfd .
+.PP
+See
+.BR openat (2)
+for an explanation of the need for
+.BR renameat ().
+.SS renameat2()
+.BR renameat2 ()
+has an additional
+.I flags
+argument.
+A
+.BR renameat2 ()
+call with a zero
+.I flags
+argument is equivalent to
+.BR renameat ().
+.PP
+The
+.I flags
+argument is a bit mask consisting of zero or more of the following flags:
+.TP
+.B RENAME_EXCHANGE
+Atomically exchange
+.I oldpath
+and
+.IR newpath .
+Both pathnames must exist
+but may be of different types (e.g., one could be a non-empty directory
+and the other a symbolic link).
+.TP
+.B RENAME_NOREPLACE
+Don't overwrite
+.I newpath
+of the rename.
+Return an error if
+.I newpath
+already exists.
+.IP
+.B RENAME_NOREPLACE
+can't be employed together with
+.BR RENAME_EXCHANGE .
+.IP
+.B RENAME_NOREPLACE
+requires support from the underlying filesystem.
+Support for various filesystems was added as follows:
+.RS
+.IP \[bu] 3
+ext4 (Linux 3.15);
+.\" ext4: commit 0a7c3937a1f23f8cb5fc77ae01661e9968a51d0c
+.IP \[bu]
+btrfs, tmpfs, and cifs (Linux 3.17);
+.IP \[bu]
+xfs (Linux 4.0);
+.\" btrfs: commit 80ace85c915d0f41016f82917218997b72431258
+.\" tmpfs: commit 3b69ff51d087d265aa4af3a532fc4f20bf33e718
+.\" cifs: commit 7c33d5972ce382bcc506d16235f1e9b7d22cbef8
+.\"
+.\" gfs2 in Linux 4.2?
+.IP \[bu]
+Support for many other filesystems was added in Linux 4.9, including
+ext2, minix, reiserfs, jfs, vfat, and bpf.
+.\" Also affs, bfs, exofs, hfs, hfsplus, jffs2, logfs, msdos,
+.\" nilfs2, omfs, sysvfs, ubifs, udf, ufs
+.\" hugetlbfs, ramfs
+.\" local filesystems: commit f03b8ad8d38634d13e802165cc15917481b47835
+.\" libfs: commit e0e0be8a835520e2f7c89f214dfda570922a1b90
+.RE
+.TP
+.BR RENAME_WHITEOUT " (since Linux 3.18)"
+.\" commit 0d7a855526dd672e114aff2ac22b60fc6f155b08
+.\" commit 787fb6bc9682ec7c05fb5d9561b57100fbc1cc41
+This operation makes sense only for overlay/union
+filesystem implementations.
+.IP
+Specifying
+.B RENAME_WHITEOUT
+creates a "whiteout" object at the source of
+the rename at the same time as performing the rename.
+The whole operation is atomic,
+so that if the rename succeeds then the whiteout will also have been created.
+.IP
+A "whiteout" is an object that has special meaning in union/overlay
+filesystem constructs.
+In these constructs,
+multiple layers exist and only the top one is ever modified.
+A whiteout on an upper layer will effectively hide a
+matching file in the lower layer,
+making it appear as if the file didn't exist.
+.IP
+When a file that exists on the lower layer is renamed,
+the file is first copied up (if not already on the upper layer)
+and then renamed on the upper, read-write layer.
+At the same time, the source file needs to be "whiteouted"
+(so that the version of the source file in the lower layer
+is rendered invisible).
+The whole operation needs to be done atomically.
+.IP
+When not part of a union/overlay,
+the whiteout appears as a character device with a {0,0} device number.
+.\" https://www.freebsd.org/cgi/man.cgi?query=mount_unionfs&manpath=FreeBSD+11.0-RELEASE
+(Note that other union/overlay implementations may employ different methods
+for storing whiteout entries; specifically, BSD union mount employs
+a separate inode type,
+.BR DT_WHT ,
+which, while supported by some filesystems available in Linux,
+such as CODA and XFS, is ignored by the kernel's whiteout support code,
+as of Linux 4.19, at least.)
+.IP
+.B RENAME_WHITEOUT
+requires the same privileges as creating a device node (i.e., the
+.B CAP_MKNOD
+capability).
+.IP
+.B RENAME_WHITEOUT
+can't be employed together with
+.BR RENAME_EXCHANGE .
+.IP
+.B RENAME_WHITEOUT
+requires support from the underlying filesystem.
+Among the filesystems that support it are
+tmpfs (since Linux 3.18),
+.\" tmpfs: commit 46fdb794e3f52ef18b859ebc92f0a9d7db21c5df
+ext4 (since Linux 3.18),
+.\" ext4: commit cd808deced431b66b5fa4e5c193cb7ec0059eaff
+XFS (since Linux 4.1),
+.\" XFS: commit 7dcf5c3e4527cfa2807567b00387cf2ed5e07f00
+f2fs (since Linux 4.2),
+.\" f2fs: commit 7e01e7ad746bc8198a8b46163ddc73a1c7d22339
+btrfs (since Linux 4.7),
+.\" btrfs: commit cdd1fedf8261cd7a73c0596298902ff4f0f04492
+and ubifs (since Linux 4.9).
+.\" ubifs: commit 9e0a1fff8db56eaaebb74b4a3ef65f86811c4798
+.SH RETURN VALUE
+On success, zero is returned.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EACCES
+Write permission is denied for the directory containing
+.I oldpath
+or
+.IR newpath ,
+or, search permission is denied for one of the directories
+in the path prefix of
+.I oldpath
+or
+.IR newpath ,
+or
+.I oldpath
+is a directory and does not allow write permission (needed to update
+the
+.I ..
+entry).
+(See also
+.BR path_resolution (7).)
+.TP
+.B EBUSY
+The rename fails because
+.IR oldpath " or " newpath
+is a directory that is in use by some process (perhaps as
+current working directory, or as root directory, or because
+it was open for reading) or is in use by the system
+(for example as a mount point), while the system considers
+this an error.
+(Note that there is no requirement to return
+.B EBUSY
+in such
+cases\[em]there is nothing wrong with doing the rename anyway\[em]but
+it is allowed to return
+.B EBUSY
+if the system cannot otherwise
+handle such situations.)
+.TP
+.B EDQUOT
+The user's quota of disk blocks on the filesystem has been exhausted.
+.TP
+.B EFAULT
+.IR oldpath " or " newpath " points outside your accessible address space."
+.TP
+.B EINVAL
+The new pathname contained a path prefix of the old, or, more generally,
+an attempt was made to make a directory a subdirectory of itself.
+.TP
+.B EISDIR
+.I newpath
+is an existing directory, but
+.I oldpath
+is not a directory.
+.TP
+.B ELOOP
+Too many symbolic links were encountered in resolving
+.IR oldpath " or " newpath .
+.TP
+.B EMLINK
+.I oldpath
+already has the maximum number of links to it, or
+it was a directory and the directory containing
+.I newpath
+has the maximum number of links.
+.TP
+.B ENAMETOOLONG
+.IR oldpath " or " newpath " was too long."
+.TP
+.B ENOENT
+The link named by
+.I oldpath
+does not exist;
+or, a directory component in
+.I newpath
+does not exist;
+or,
+.I oldpath
+or
+.I newpath
+is an empty string.
+.TP
+.B ENOMEM
+Insufficient kernel memory was available.
+.TP
+.B ENOSPC
+The device containing the file has no room for the new directory
+entry.
+.TP
+.B ENOTDIR
+A component used as a directory in
+.IR oldpath " or " newpath
+is not, in fact, a directory.
+Or,
+.I oldpath
+is a directory, and
+.I newpath
+exists but is not a directory.
+.TP
+.BR ENOTEMPTY " or " EEXIST
+.I newpath
+is a nonempty directory, that is, contains entries other than "." and "..".
+.TP
+.BR EPERM " or " EACCES
+The directory containing
+.I oldpath
+has the sticky bit
+.RB ( S_ISVTX )
+set and the process's effective user ID is neither
+the user ID of the file to be deleted nor that of the directory
+containing it, and the process is not privileged
+(Linux: does not have the
+.B CAP_FOWNER
+capability);
+or
+.I newpath
+is an existing file and the directory containing it has the sticky bit set
+and the process's effective user ID is neither the user ID of the file
+to be replaced nor that of the directory containing it,
+and the process is not privileged
+(Linux: does not have the
+.B CAP_FOWNER
+capability);
+or the filesystem containing
+.I oldpath
+does not support renaming of the type requested.
+.TP
+.B EROFS
+The file is on a read-only filesystem.
+.TP
+.B EXDEV
+.IR oldpath " and " newpath
+are not on the same mounted filesystem.
+(Linux permits a filesystem to be mounted at multiple points, but
+.BR rename ()
+does not work across different mount points,
+even if the same filesystem is mounted on both.)
+.PP
+The following additional errors can occur for
+.BR renameat ()
+and
+.BR renameat2 ():
+.TP
+.B EBADF
+.I oldpath
+.RI ( newpath )
+is relative but
+.I olddirfd
+.RI ( newdirfd )
+is not a valid file descriptor.
+.TP
+.B ENOTDIR
+.I oldpath
+is relative and
+.I olddirfd
+is a file descriptor referring to a file other than a directory;
+or similar for
+.I newpath
+and
+.IR newdirfd .
+.PP
+The following additional errors can occur for
+.BR renameat2 ():
+.TP
+.B EEXIST
+.I flags
+contains
+.B RENAME_NOREPLACE
+and
+.I newpath
+already exists.
+.TP
+.B EINVAL
+An invalid flag was specified in
+.IR flags .
+.TP
+.B EINVAL
+Both
+.B RENAME_NOREPLACE
+and
+.B RENAME_EXCHANGE
+were specified in
+.IR flags .
+.TP
+.B EINVAL
+Both
+.B RENAME_WHITEOUT
+and
+.B RENAME_EXCHANGE
+were specified in
+.IR flags .
+.TP
+.B EINVAL
+The filesystem does not support one of the flags in
+.IR flags .
+.TP
+.B ENOENT
+.I flags
+contains
+.B RENAME_EXCHANGE
+and
+.I newpath
+does not exist.
+.TP
+.B EPERM
+.B RENAME_WHITEOUT
+was specified in
+.IR flags ,
+but the caller does not have the
+.B CAP_MKNOD
+capability.
+.SH STANDARDS
+.TP
+.BR rename ()
+C11, POSIX.1-2008.
+.TP
+.BR renameat ()
+POSIX.1-2008.
+.TP
+.BR renameat2 ()
+Linux.
+.SH HISTORY
+.TP
+.BR rename ()
+4.3BSD, C89, POSIX.1-2001.
+.TP
+.BR renameat ()
+Linux 2.6.16,
+glibc 2.4.
+.TP
+.BR renameat2 ()
+Linux 3.15,
+glibc 2.28.
+.SS glibc notes
+On older kernels where
+.BR renameat ()
+is unavailable, the glibc wrapper function falls back to the use of
+.BR rename ().
+When
+.I oldpath
+and
+.I newpath
+are relative pathnames,
+glibc constructs pathnames based on the symbolic links in
+.I /proc/self/fd
+that correspond to the
+.I olddirfd
+and
+.I newdirfd
+arguments.
+.SH BUGS
+On NFS filesystems, you cannot assume that if the operation
+failed, the file was not renamed.
+If the server does the rename operation
+and then crashes, the retransmitted RPC, which will be processed when the
+server is up again, causes a failure.
+The application is expected to
+deal with this.
+See
+.BR link (2)
+for a similar problem.
+.SH SEE ALSO
+.BR mv (1),
+.BR rename (1),
+.BR chmod (2),
+.BR link (2),
+.BR symlink (2),
+.BR unlink (2),
+.BR path_resolution (7),
+.BR symlink (7)
diff --git a/man2/renameat.2 b/man2/renameat.2
new file mode 100644
index 0000000..9b74442
--- /dev/null
+++ b/man2/renameat.2
@@ -0,0 +1 @@
+.so man2/rename.2
diff --git a/man2/renameat2.2 b/man2/renameat2.2
new file mode 100644
index 0000000..9b74442
--- /dev/null
+++ b/man2/renameat2.2
@@ -0,0 +1 @@
+.so man2/rename.2
diff --git a/man2/request_key.2 b/man2/request_key.2
new file mode 100644
index 0000000..80187d1
--- /dev/null
+++ b/man2/request_key.2
@@ -0,0 +1,562 @@
+.\" Copyright (C) 2006 Red Hat, Inc. All Rights Reserved.
+.\" Written by David Howells (dhowells@redhat.com)
+.\" and Copyright (C) 2016 Michael Kerrisk <mtk.man-pages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: GPL-2.0-or-later
+.\"
+.TH request_key 2 2023-05-03 "Linux man-pages 6.05.01"
+.SH NAME
+request_key \- request a key from the kernel's key management facility
+.SH LIBRARY
+Linux Key Management Utilities
+.RI ( libkeyutils ", " \-lkeyutils )
+.SH SYNOPSIS
+.nf
+.B #include <keyutils.h>
+.PP
+.BI "key_serial_t request_key(const char *" type ", const char *" description ,
+.BI " const char *_Nullable " callout_info ,
+.BI " key_serial_t " dest_keyring );
+.fi
+.SH DESCRIPTION
+.BR request_key ()
+attempts to find a key of the given
+.I type
+with a description (name) that matches the specified
+.IR description .
+If such a key could not be found, then the key is optionally created.
+If the key is found or created,
+.BR request_key ()
+attaches it to the keyring whose ID is specified in
+.I dest_keyring
+and returns the key's serial number.
+.PP
+.BR request_key ()
+first recursively searches for a matching key in all of the keyrings
+attached to the calling process.
+The keyrings are searched in the order: thread-specific keyring,
+process-specific keyring, and then session keyring.
+.PP
+If
+.BR request_key ()
+is called from a program invoked by
+.BR request_key ()
+on behalf of some other process to generate a key, then the keyrings of that
+other process will be searched next,
+using that other process's user ID, group ID,
+supplementary group IDs, and security context to determine access.
+.\" David Howells: we can then have an arbitrarily long sequence
+.\" of "recursive" request-key upcalls. There is no limit, other
+.\" than number of PIDs, etc.
+.PP
+The search of the keyring tree is breadth-first:
+the keys in each keyring searched are checked for a match before any child
+keyrings are recursed into.
+Only keys for which the caller has
+.I search
+permission may be found, and only keyrings for which the caller has
+.I search
+permission may be searched.
+.PP
+If the key is not found and
+.I callout_info
+is NULL, then the call fails with the error
+.BR ENOKEY .
+.PP
+If the key is not found and
+.I callout_info
+is not NULL, then the kernel attempts to invoke a user-space
+program to instantiate the key.
+The details are given below.
+.PP
+The
+.I dest_keyring
+serial number may be that of a valid keyring for which the caller has
+.I write
+permission, or it may be one of the following special keyring IDs:
+.TP
+.B KEY_SPEC_THREAD_KEYRING
+This specifies the caller's thread-specific keyring (see
+.BR thread\-keyring (7)).
+.TP
+.B KEY_SPEC_PROCESS_KEYRING
+This specifies the caller's process-specific keyring (see
+.BR process\-keyring (7)).
+.TP
+.B KEY_SPEC_SESSION_KEYRING
+This specifies the caller's session-specific keyring (see
+.BR session\-keyring (7)).
+.TP
+.B KEY_SPEC_USER_KEYRING
+This specifies the caller's UID-specific keyring (see
+.BR user\-keyring (7)).
+.TP
+.B KEY_SPEC_USER_SESSION_KEYRING
+This specifies the caller's UID-session keyring (see
+.BR user\-session\-keyring (7)).
+.PP
+When the
+.I dest_keyring
+is specified as 0
+and no key construction has been performed,
+then no additional linking is done.
+.PP
+Otherwise, if
+.I dest_keyring
+is 0 and a new key is constructed, the new key will be linked
+to the "default" keyring.
+More precisely, when the kernel tries to determine to which keyring the
+newly constructed key should be linked,
+it tries the following keyrings,
+beginning with the keyring set via the
+.BR keyctl (2)
+.B KEYCTL_SET_REQKEY_KEYRING
+operation and continuing in the order shown below
+until it finds the first keyring that exists:
+.IP \[bu] 3
+.\" 8bbf4976b59fc9fc2861e79cab7beb3f6d647640
+The requestor keyring
+.RB ( KEY_REQKEY_DEFL_REQUESTOR_KEYRING ,
+since Linux 2.6.29).
+.\" FIXME
+.\" Actually, is the preceding point correct?
+.\" If I understand correctly, we'll only get here if
+.\" 'dest_keyring' is zero, in which case KEY_REQKEY_DEFL_REQUESTOR_KEYRING
+.\" won't refer to a keyring. Have I misunderstood?
+.IP \[bu]
+The thread-specific keyring
+.RB ( KEY_REQKEY_DEFL_THREAD_KEYRING ;
+see
+.BR thread\-keyring (7)).
+.IP \[bu]
+The process-specific keyring
+.RB ( KEY_REQKEY_DEFL_PROCESS_KEYRING ;
+see
+.BR process\-keyring (7)).
+.IP \[bu]
+The session-specific keyring
+.RB ( KEY_REQKEY_DEFL_SESSION_KEYRING ;
+see
+.BR session\-keyring (7)).
+.IP \[bu]
+The session keyring for the process's user ID
+.RB ( KEY_REQKEY_DEFL_USER_SESSION_KEYRING ;
+see
+.BR user\-session\-keyring (7)).
+This keyring is expected to always exist.
+.IP \[bu]
+The UID-specific keyring
+.RB ( KEY_REQKEY_DEFL_USER_KEYRING ;
+see
+.BR user\-keyring (7)).
+This keyring is also expected to always exist.
+.\" mtk: Are there circumstances where the user sessions and UID-specific
+.\" keyrings do not exist?
+.\"
+.\" David Howells:
+.\" The uid keyrings don't exist until someone tries to access them -
+.\" at which point they're both created. When you log in, pam_keyinit
+.\" creates a link to your user keyring in the session keyring it just
+.\" created, thereby creating the user and user-session keyrings.
+.\"
+.\" and David elaborated that "access" means:
+.\"
+.\" It means lookup_user_key() was passed KEY_LOOKUP_CREATE. So:
+.\"
+.\" add_key() - destination keyring
+.\" request_key() - destination keyring
+.\" KEYCTL_GET_KEYRING_ID - if create arg is true
+.\" KEYCTL_CLEAR
+.\" KEYCTL_LINK - both args
+.\" KEYCTL_SEARCH - destination keyring
+.\" KEYCTL_CHOWN
+.\" KEYCTL_SETPERM
+.\" KEYCTL_SET_TIMEOUT
+.\" KEYCTL_INSTANTIATE - destination keyring
+.\" KEYCTL_INSTANTIATE_IOV - destination keyring
+.\" KEYCTL_NEGATE - destination keyring
+.\" KEYCTL_REJECT - destination keyring
+.\" KEYCTL_GET_PERSISTENT - destination keyring
+.\"
+.\" will all create a keyring under some circumstances. Whereas the rest,
+.\" such as KEYCTL_GET_SECURITY, KEYCTL_READ and KEYCTL_REVOKE, won't.
+.PP
+If the
+.BR keyctl (2)
+.B KEYCTL_SET_REQKEY_KEYRING
+operation specifies
+.B KEY_REQKEY_DEFL_DEFAULT
+(or no
+.B KEYCTL_SET_REQKEY_KEYRING
+operation is performed),
+then the kernel looks for a keyring
+starting from the beginning of the list.
+.\"
+.SS Requesting user-space instantiation of a key
+If the kernel cannot find a key matching
+.I type
+and
+.IR description ,
+and
+.I callout_info
+is not NULL, then the kernel attempts to invoke a user-space
+program to instantiate a key with the given
+.I type
+and
+.IR description .
+In this case, the following steps are performed:
+.IP (1) 5
+The kernel creates an uninstantiated key, U, with the requested
+.I type
+and
+.IR description .
+.IP (2)
+The kernel creates an authorization key, V,
+.\" struct request_key_auth, defined in security/keys/internal.h
+that refers to the key U and records the facts that the caller of
+.BR request_key ()
+is:
+.RS
+.IP (2.1) 7
+the context in which the key U should be instantiated and secured, and
+.IP (2.2)
+the context from which associated key requests may be satisfied.
+.RE
+.IP
+The authorization key is constructed as follows:
+.RS
+.IP \[bu] 3
+The key type is
+.IR """.request_key_auth""" .
+.IP \[bu]
+The key's UID and GID are the same as the corresponding filesystem IDs
+of the requesting process.
+.IP \[bu]
+The key grants
+.IR view ,
+.IR read ,
+and
+.I search
+permissions to the key possessor as well as
+.I view
+permission for the key user.
+.IP \[bu]
+The description (name) of the key is the hexadecimal
+string representing the ID of the key that is to be instantiated
+in the requesting program.
+.IP \[bu]
+The payload of the key is taken from the data specified in
+.IR callout_info .
+.IP \[bu]
+Internally, the kernel also records the PID of the process that called
+.BR request_key ().
+.RE
+.IP (3)
+The kernel creates a process that executes a user-space service such as
+.BR request\-key (8)
+with a new session keyring that contains a link to the authorization key, V.
+.\" The request\-key(8) program can be invoked in circumstances *other* than
+.\" when triggered by request_key(2). For example, upcalls from places such
+.\" as the DNS resolver.
+.IP
+This program is supplied with the following command-line arguments:
+.RS
+.IP [0] 5
+The string
+.IR """/sbin/request\-key""" .
+.IP [1]
+The string
+.I """create"""
+(indicating that a key is to be created).
+.IP [2]
+The ID of the key that is to be instantiated.
+.IP [3]
+The filesystem UID of the caller of
+.BR request_key ().
+.IP [4]
+The filesystem GID of the caller of
+.BR request_key ().
+.IP [5]
+The ID of the thread keyring of the caller of
+.BR request_key ().
+This may be zero if that keyring hasn't been created.
+.IP [6]
+The ID of the process keyring of the caller of
+.BR request_key ().
+This may be zero if that keyring hasn't been created.
+.IP [7]
+The ID of the session keyring of the caller of
+.BR request_key ().
+.RE
+.IP
+.IR Note :
+each of the command-line arguments that is a key ID is encoded in
+.I decimal
+(unlike the key IDs shown in
+.IR /proc/keys ,
+which are shown as hexadecimal values).
+.IP (4)
+The program spawned in the previous step:
+.RS
+.IP \[bu] 3
+Assumes the authority to instantiate the key U using the
+.BR keyctl (2)
+.B KEYCTL_ASSUME_AUTHORITY
+operation (typically via the
+.BR keyctl_assume_authority (3)
+function).
+.IP \[bu]
+Obtains the callout data from the payload of the authorization key V
+(using the
+.BR keyctl (2)
+.B KEYCTL_READ
+operation (or, more commonly, the
+.BR keyctl_read (3)
+function) with a key ID value of
+.BR KEY_SPEC_REQKEY_AUTH_KEY ).
+.IP \[bu]
+Instantiates the key
+(or execs another program that performs that task),
+specifying the payload and destination keyring.
+(The destination keyring that the requestor specified when calling
+.BR request_key ()
+can be accessed using the special key ID
+.BR KEY_SPEC_REQUESTOR_KEYRING .)
+.\" Should an instantiating program be using KEY_SPEC_REQUESTOR_KEYRING?
+.\" I couldn't find a use in the keyutils git repo.
+.\" According to David Howells:
+.\" * This feature is provided, but not used at the moment.
+.\" * A key added to that ring is then owned by the requester
+Instantiation is performed using the
+.BR keyctl (2)
+.B KEYCTL_INSTANTIATE
+operation (or, more commonly, the
+.BR keyctl_instantiate (3)
+function).
+At this point, the
+.BR request_key ()
+call completes, and the requesting program can continue execution.
+.RE
+.PP
+If these steps are unsuccessful, then an
+.B ENOKEY
+error will be returned to the caller of
+.BR request_key ()
+and a temporary, negatively instantiated key will be installed
+in the keyring specified by
+.IR dest_keyring .
+This will expire after a few seconds, but will cause subsequent calls to
+.BR request_key ()
+to fail until it does.
+The purpose of this negatively instantiated key is to prevent
+(possibly different) processes making repeated requests
+(that require expensive
+.BR request\-key (8)
+upcalls) for a key that can't (at the moment) be positively instantiated.
+.PP
+Once the key has been instantiated, the authorization key
+.RB ( KEY_SPEC_REQKEY_AUTH_KEY )
+is revoked, and the destination keyring
+.RB ( KEY_SPEC_REQUESTOR_KEYRING )
+is no longer accessible from the
+.BR request\-key (8)
+program.
+.PP
+If a key is created, then\[em]regardless of whether it is a valid key or
+a negatively instantiated key\[em]it will displace any other key with
+the same type and description from the keyring specified in
+.IR dest_keyring .
+.SH RETURN VALUE
+On success,
+.BR request_key ()
+returns the serial number of the key it found or caused to be created.
+On error, \-1 is returned and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EACCES
+The keyring wasn't available for modification by the user.
+.TP
+.B EDQUOT
+The key quota for this user would be exceeded by creating this key or linking
+it to the keyring.
+.TP
+.B EFAULT
+One of
+.IR type ,
+.IR description ,
+or
+.I callout_info
+points outside the process's accessible address space.
+.TP
+.B EINTR
+The request was interrupted by a signal; see
+.BR signal (7).
+.TP
+.B EINVAL
+The size of the string (including the terminating null byte) specified in
+.I type
+or
+.I description
+exceeded the limit (32 bytes and 4096 bytes respectively).
+.TP
+.B EINVAL
+The size of the string (including the terminating null byte) specified in
+.I callout_info
+exceeded the system page size.
+.TP
+.B EKEYEXPIRED
+An expired key was found, but no replacement could be obtained.
+.TP
+.B EKEYREJECTED
+The attempt to generate a new key was rejected.
+.TP
+.B EKEYREVOKED
+A revoked key was found, but no replacement could be obtained.
+.TP
+.B ENOKEY
+No matching key was found.
+.TP
+.B ENOMEM
+Insufficient memory to create a key.
+.TP
+.B EPERM
+The
+.I type
+argument started with a period (\[aq].\[aq]).
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 2.6.10.
+.PP
+The ability to instantiate keys upon request was added
+.\" commit 3e30148c3d524a9c1c63ca28261bc24c457eb07a
+in Linux 2.6.13.
+.SH EXAMPLES
+The program below demonstrates the use of
+.BR request_key ().
+The
+.IR type ,
+.IR description ,
+and
+.I callout_info
+arguments for the system call are taken from the values
+supplied in the command-line arguments.
+The call specifies the session keyring as the target keyring.
+.PP
+In order to demonstrate this program,
+we first create a suitable entry in the file
+.IR /etc/request\-key.conf .
+.PP
+.in +4n
+.EX
+$ sudo sh
+# \fBecho \[aq]create user mtk:* * /bin/keyctl instantiate %k %c %S\[aq] \e\fP
+ \fB> /etc/request\-key.conf\fP
+# \fBexit\fP
+.EE
+.in
+.PP
+This entry specifies that when a new "user" key with the prefix
+"mtk:" must be instantiated, that task should be performed via the
+.BR keyctl (1)
+command's
+.B instantiate
+operation.
+The arguments supplied to the
+.B instantiate
+operation are:
+the ID of the uninstantiated key
+.RI ( %k );
+the callout data supplied to the
+.BR request_key ()
+call
+.RI ( %c );
+and the session keyring
+.RI ( %S )
+of the requestor (i.e., the caller of
+.BR request_key ()).
+See
+.BR request\-key.conf (5)
+for details of these
+.I %
+specifiers.
+.PP
+Then we run the program and check the contents of
+.I /proc/keys
+to verify that the requested key has been instantiated:
+.PP
+.in +4n
+.EX
+$ \fB./t_request_key user mtk:key1 "Payload data"\fP
+$ \fBgrep \[aq]2dddaf50\[aq] /proc/keys\fP
+2dddaf50 I\-\-Q\-\-\- 1 perm 3f010000 1000 1000 user mtk:key1: 12
+.EE
+.in
+.PP
+For another example of the use of this program, see
+.BR keyctl (2).
+.SS Program source
+\&
+.\" SRC BEGIN (t_request_key.c)
+.EX
+/* t_request_key.c */
+\&
+#include <keyutils.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+\&
+int
+main(int argc, char *argv[])
+{
+ key_serial_t key;
+\&
+ if (argc != 4) {
+ fprintf(stderr, "Usage: %s type description callout\-data\en",
+ argv[0]);
+ exit(EXIT_FAILURE);
+ }
+\&
+ key = request_key(argv[1], argv[2], argv[3],
+ KEY_SPEC_SESSION_KEYRING);
+ if (key == \-1) {
+ perror("request_key");
+ exit(EXIT_FAILURE);
+ }
+\&
+ printf("Key ID is %jx\en", (uintmax_t) key);
+\&
+ exit(EXIT_SUCCESS);
+}
+.EE
+.\" SRC END
+.SH SEE ALSO
+.ad l
+.nh
+.BR keyctl (1),
+.BR add_key (2),
+.BR keyctl (2),
+.BR keyctl (3),
+.BR capabilities (7),
+.BR keyrings (7),
+.BR keyutils (7),
+.BR persistent\-keyring (7),
+.BR process\-keyring (7),
+.BR session\-keyring (7),
+.BR thread\-keyring (7),
+.BR user\-keyring (7),
+.BR user\-session\-keyring (7),
+.BR request\-key (8)
+.PP
+The kernel source files
+.I Documentation/security/keys/core.rst
+and
+.I Documentation/security/keys/request\-key.rst
+(or, before Linux 4.13, in the files
+.\" commit b68101a1e8f0263dbc7b8375d2a7c57c6216fb76
+.I Documentation/security/keys.txt
+and
+.\" commit 3db38ed76890565772fcca3279cc8d454ea6176b
+.IR Documentation/security/keys\-request\-key.txt ).
diff --git a/man2/restart_syscall.2 b/man2/restart_syscall.2
new file mode 100644
index 0000000..4b0e101
--- /dev/null
+++ b/man2/restart_syscall.2
@@ -0,0 +1,123 @@
+.\" Copyright (c) 2013 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" http://thread.gmane.org/gmane.linux.kernel/76552/focus=76803
+.\" From: Linus Torvalds <torvalds <at> transmeta.com>
+.\" Subject: Re: [PATCH] compatibility syscall layer (lets try again)
+.\" Newsgroups: gmane.linux.kernel
+.\" Date: 2002-12-05 02:51:12 GMT
+.\"
+.\" See also Section 11.3.3 of Understanding the Linux Kernel, 3rd edition
+.\"
+.TH restart_syscall 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+restart_syscall \- restart a system call after interruption by a stop signal
+.SH SYNOPSIS
+.nf
+.B long restart_syscall(void);
+.fi
+.PP
+.IR Note :
+There is no glibc wrapper for this system call; see NOTES.
+.SH DESCRIPTION
+The
+.BR restart_syscall ()
+system call is used to restart certain system calls
+after a process that was stopped by a signal (e.g.,
+.B SIGSTOP
+or
+.BR SIGTSTP )
+is later resumed after receiving a
+.B SIGCONT
+signal.
+This system call is designed only for internal use by the kernel.
+.PP
+.BR restart_syscall ()
+is used for restarting only those system calls that,
+when restarted, should adjust their time-related parameters\[em]namely
+.BR poll (2)
+(since Linux 2.6.24),
+.BR nanosleep (2)
+(since Linux 2.6),
+.BR clock_nanosleep (2)
+(since Linux 2.6),
+and
+.BR futex (2),
+when employed with the
+.B FUTEX_WAIT
+(since Linux 2.6.22)
+and
+.B FUTEX_WAIT_BITSET
+(since Linux 2.6.31)
+operations.
+.\" These system calls correspond to the special internal errno value
+.\" ERESTART_RESTARTBLOCK. Each of the system calls has a "restart"
+.\" helper function that is invoked by restart_syscall().
+.\" Notable (as at Linux 3.17) is that poll() has such a "restart"
+.\" function, but ppoll(), select(), and pselect() do not.
+.\" This means that the latter system calls do not take account of the
+.\" time spent in the stopped state when restarting.
+.BR restart_syscall ()
+restarts the interrupted system call with a
+time argument that is suitably adjusted to account for the
+time that has already elapsed (including the time where the process
+was stopped by a signal).
+Without the
+.BR restart_syscall ()
+mechanism, restarting these system calls would not correctly deduct the
+already elapsed time when the process continued execution.
+.SH RETURN VALUE
+The return value of
+.BR restart_syscall ()
+is the return value of whatever system call is being restarted.
+.SH ERRORS
+.I errno
+is set as per the errors for whatever system call is being restarted by
+.BR restart_syscall ().
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 2.6.
+.SH NOTES
+There is no glibc wrapper for this system call,
+because it is intended for use only by the kernel and
+should never be called by applications.
+.PP
+The kernel uses
+.BR restart_syscall ()
+to ensure that when a system call is restarted
+after a process has been stopped by a signal and then resumed by
+.BR SIGCONT ,
+then the time that the process spent in the stopped state is counted
+against the timeout interval specified in the original system call.
+In the case of system calls that take a timeout argument and
+automatically restart after a stop signal plus
+.BR SIGCONT ,
+but which do not have the
+.BR restart_syscall ()
+mechanism built in, then, after the process resumes execution,
+the time that the process spent in the stopped state is
+.I not
+counted against the timeout value.
+Notable examples of system calls that suffer this problem are
+.BR ppoll (2),
+.BR select (2),
+and
+.BR pselect (2).
+.PP
+From user space, the operation of
+.BR restart_syscall ()
+is largely invisible:
+to the process that made the system call that is restarted,
+it appears as though that system call executed and
+returned in the usual fashion.
+.SH SEE ALSO
+.BR sigaction (2),
+.BR sigreturn (2),
+.BR signal (7)
+.\" FIXME . ppoll(2), select(2), and pselect(2)
+.\" should probably get the restart_syscall() treatment:
+.\" If a select() call is suspended by stop-sig+SIGCONT, the time
+.\" spent suspended is *not* deducted when the select() is restarted.
+.\" FIXME . check whether recvmmsg() handles stop-sig+SIGCONT properly.
diff --git a/man2/rmdir.2 b/man2/rmdir.2
new file mode 100644
index 0000000..5bd7370
--- /dev/null
+++ b/man2/rmdir.2
@@ -0,0 +1,128 @@
+.\" This manpage is Copyright (C) 1992 Drew Eckhardt;
+.\" and Copyright (C) 1993 Michael Haardt, Ian Jackson.
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified 1993-07-24 by Rik Faith <faith@cs.unc.edu>
+.\" Modified 1997-01-31 by Eric S. Raymond <esr@thyrsus.com>
+.\" Modified 2004-06-23 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.TH rmdir 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+rmdir \- delete a directory
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <unistd.h>
+.PP
+.BI "int rmdir(const char *" pathname );
+.fi
+.SH DESCRIPTION
+.BR rmdir ()
+deletes a directory, which must be empty.
+.SH RETURN VALUE
+On success, zero is returned.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EACCES
+Write access to the directory containing
+.I pathname
+was not allowed, or one of the directories in the path prefix of
+.I pathname
+did not allow search permission.
+(See also
+.BR path_resolution (7).)
+.TP
+.B EBUSY
+.I pathname
+is currently in use by the system or some process that prevents its
+removal.
+On Linux, this means
+.I pathname
+is currently used as a mount point
+or is the root directory of the calling process.
+.TP
+.B EFAULT
+.IR pathname " points outside your accessible address space."
+.TP
+.B EINVAL
+.I pathname
+has
+.I .
+as last component.
+.TP
+.B ELOOP
+Too many symbolic links were encountered in resolving
+.IR pathname .
+.TP
+.B ENAMETOOLONG
+.IR pathname " was too long."
+.TP
+.B ENOENT
+A directory component in
+.I pathname
+does not exist or is a dangling symbolic link.
+.TP
+.B ENOMEM
+Insufficient kernel memory was available.
+.TP
+.B ENOTDIR
+.IR pathname ,
+or a component used as a directory in
+.IR pathname ,
+is not, in fact, a directory.
+.TP
+.B ENOTEMPTY
+.I pathname
+contains entries other than
+.IR . " and " .. " ;"
+or,
+.I pathname
+has
+.I ..
+as its final component.
+POSIX.1 also allows
+.\" POSIX.1-2001, POSIX.1-2008
+.B EEXIST
+for this condition.
+.TP
+.B EPERM
+The directory containing
+.I pathname
+has the sticky bit
+.RB ( S_ISVTX )
+set and the process's effective user ID is neither the user ID
+of the file to be deleted nor that of the directory containing it,
+and the process is not privileged (Linux: does not have the
+.B CAP_FOWNER
+capability).
+.TP
+.B EPERM
+The filesystem containing
+.I pathname
+does not support the removal of directories.
+.TP
+.B EROFS
+.I pathname
+refers to a directory on a read-only filesystem.
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+POSIX.1-2001, SVr4, 4.3BSD.
+.SH BUGS
+Infelicities in the protocol underlying NFS can cause the unexpected
+disappearance of directories which are still being used.
+.SH SEE ALSO
+.BR rm (1),
+.BR rmdir (1),
+.BR chdir (2),
+.BR chmod (2),
+.BR mkdir (2),
+.BR rename (2),
+.BR unlink (2),
+.BR unlinkat (2)
diff --git a/man2/rt_sigaction.2 b/man2/rt_sigaction.2
new file mode 100644
index 0000000..d642d26
--- /dev/null
+++ b/man2/rt_sigaction.2
@@ -0,0 +1 @@
+.so man2/sigaction.2
diff --git a/man2/rt_sigpending.2 b/man2/rt_sigpending.2
new file mode 100644
index 0000000..304adff
--- /dev/null
+++ b/man2/rt_sigpending.2
@@ -0,0 +1 @@
+.so man2/sigpending.2
diff --git a/man2/rt_sigprocmask.2 b/man2/rt_sigprocmask.2
new file mode 100644
index 0000000..5eab7ac
--- /dev/null
+++ b/man2/rt_sigprocmask.2
@@ -0,0 +1 @@
+.so man2/sigprocmask.2
diff --git a/man2/rt_sigqueueinfo.2 b/man2/rt_sigqueueinfo.2
new file mode 100644
index 0000000..b8b0157
--- /dev/null
+++ b/man2/rt_sigqueueinfo.2
@@ -0,0 +1,195 @@
+.\" Copyright (c) 2002, 2011 Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.TH rt_sigqueueinfo 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+rt_sigqueueinfo, rt_tgsigqueueinfo \- queue a signal and data
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#include <linux/signal.h>" " /* Definition of " SI_* " constants */"
+.BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */"
+.B #include <unistd.h>
+.PP
+.BI "int syscall(SYS_rt_sigqueueinfo, pid_t " tgid ,
+.BI " int " sig ", siginfo_t *" info );
+.BI "int syscall(SYS_rt_tgsigqueueinfo, pid_t " tgid ", pid_t " tid ,
+.BI " int " sig ", siginfo_t *" info );
+.fi
+.PP
+.IR Note :
+There are no glibc wrappers for these system calls; see NOTES.
+.SH DESCRIPTION
+The
+.BR rt_sigqueueinfo ()
+and
+.BR rt_tgsigqueueinfo ()
+system calls are the low-level interfaces used to send a signal plus data
+to a process or thread.
+The receiver of the signal can obtain the accompanying data
+by establishing a signal handler with the
+.BR sigaction (2)
+.B SA_SIGINFO
+flag.
+.PP
+These system calls are not intended for direct application use;
+they are provided to allow the implementation of
+.BR sigqueue (3)
+and
+.BR pthread_sigqueue (3).
+.PP
+The
+.BR rt_sigqueueinfo ()
+system call sends the signal
+.I sig
+to the thread group with the ID
+.IR tgid .
+(The term "thread group" is synonymous with "process", and
+.I tgid
+corresponds to the traditional UNIX process ID.)
+The signal will be delivered to an arbitrary member of the thread group
+(i.e., one of the threads that is not currently blocking the signal).
+.PP
+The
+.I info
+argument specifies the data to accompany the signal.
+This argument is a pointer to a structure of type
+.IR siginfo_t ,
+described in
+.BR sigaction (2)
+(and defined by including
+.IR <signal.h> ).
+The caller should set the following fields in this structure:
+.TP
+.I si_code
+This should be one of the
+.B SI_*
+codes in the Linux kernel source file
+.IR include/asm\-generic/siginfo.h .
+If the signal is being sent to any process other than the caller itself,
+the following restrictions apply:
+.RS
+.IP \[bu] 3
+The code can't be a value greater than or equal to zero.
+In particular, it can't be
+.BR SI_USER ,
+which is used by the kernel to indicate a signal sent by
+.BR kill (2),
+and nor can it be
+.BR SI_KERNEL ,
+which is used to indicate a signal generated by the kernel.
+.IP \[bu]
+The code can't (since Linux 2.6.39) be
+.BR SI_TKILL ,
+which is used by the kernel to indicate a signal sent using
+.\" tkill(2) or
+.BR tgkill (2).
+.RE
+.TP
+.I si_pid
+This should be set to a process ID,
+typically the process ID of the sender.
+.TP
+.I si_uid
+This should be set to a user ID,
+typically the real user ID of the sender.
+.TP
+.I si_value
+This field contains the user data to accompany the signal.
+For more information, see the description of the last
+.RI ( "union sigval" )
+argument of
+.BR sigqueue (3).
+.PP
+Internally, the kernel sets the
+.I si_signo
+field to the value specified in
+.IR sig ,
+so that the receiver of the signal can also obtain
+the signal number via that field.
+.PP
+The
+.BR rt_tgsigqueueinfo ()
+system call is like
+.BR rt_sigqueueinfo (),
+but sends the signal and data to the single thread
+specified by the combination of
+.IR tgid ,
+a thread group ID,
+and
+.IR tid ,
+a thread in that thread group.
+.SH RETURN VALUE
+On success, these system calls return 0.
+On error, they return \-1 and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EAGAIN
+The limit of signals which may be queued has been reached.
+(See
+.BR signal (7)
+for further information.)
+.TP
+.B EINVAL
+.IR sig ,
+.IR tgid ,
+or
+.I tid
+was invalid.
+.TP
+.B EPERM
+The caller does not have permission to send the signal to the target.
+For the required permissions, see
+.BR kill (2).
+.TP
+.B EPERM
+.I tgid
+specifies a process other than the caller and
+.I info\->si_code
+is invalid.
+.TP
+.B ESRCH
+.BR rt_sigqueueinfo ():
+No thread group matching
+.I tgid
+was found.
+.IP
+.BR rt_tgsigqueueinfo ():
+No thread matching
+.I tgid
+and
+.I tid
+was found.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+.TP
+.BR rt_sigqueueinfo ()
+Linux 2.2.
+.TP
+.BR rt_tgsigqueueinfo ()
+Linux 2.6.31.
+.SH NOTES
+Since these system calls are not intended for application use,
+there are no glibc wrapper functions; use
+.BR syscall (2)
+in the unlikely case that you want to call them directly.
+.PP
+As with
+.BR kill (2),
+the null signal (0) can be used to check if the specified process
+or thread exists.
+.SH SEE ALSO
+.BR kill (2),
+.BR pidfd_send_signal (2),
+.BR sigaction (2),
+.BR sigprocmask (2),
+.BR tgkill (2),
+.BR pthread_sigqueue (3),
+.BR sigqueue (3),
+.BR signal (7)
diff --git a/man2/rt_sigreturn.2 b/man2/rt_sigreturn.2
new file mode 100644
index 0000000..830b7b9
--- /dev/null
+++ b/man2/rt_sigreturn.2
@@ -0,0 +1 @@
+.so man2/sigreturn.2
diff --git a/man2/rt_sigsuspend.2 b/man2/rt_sigsuspend.2
new file mode 100644
index 0000000..96d99c4
--- /dev/null
+++ b/man2/rt_sigsuspend.2
@@ -0,0 +1 @@
+.so man2/sigsuspend.2
diff --git a/man2/rt_sigtimedwait.2 b/man2/rt_sigtimedwait.2
new file mode 100644
index 0000000..ca098e5
--- /dev/null
+++ b/man2/rt_sigtimedwait.2
@@ -0,0 +1 @@
+.so man2/sigtimedwait.2
diff --git a/man2/rt_tgsigqueueinfo.2 b/man2/rt_tgsigqueueinfo.2
new file mode 100644
index 0000000..7b6cf68
--- /dev/null
+++ b/man2/rt_tgsigqueueinfo.2
@@ -0,0 +1 @@
+.so man2/rt_sigqueueinfo.2
diff --git a/man2/s390_guarded_storage.2 b/man2/s390_guarded_storage.2
new file mode 100644
index 0000000..63d5c83
--- /dev/null
+++ b/man2/s390_guarded_storage.2
@@ -0,0 +1,162 @@
+.\" Copyright (C) 2018 Eugene Syromyatnikov <evgsyr@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.TH s390_guarded_storage 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+s390_guarded_storage \- operations with z/Architecture guarded storage facility
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#include <asm/guarded_storage.h> " "/* Definition of " GS_* " constants */"
+.BR "#include <sys/syscall.h> " \
+"/* Definition of " SYS_* " constants */"
+.B #include <unistd.h>
+.PP
+.BI "int syscall(SYS_s390_guarded_storage, int " command ,
+.BI " struct gs_cb *" gs_cb );
+.fi
+.PP
+.IR Note :
+glibc provides no wrapper for
+.BR s390_guarded_storage (),
+necessitating the use of
+.BR syscall (2).
+.SH DESCRIPTION
+The
+.BR s390_guarded_storage ()
+system call enables the use of the Guarded Storage Facility
+(a z/Architecture-specific feature) for user-space processes.
+.PP
+.\" The description is based on
+.\" http://www-05.ibm.com/de/linux-on-z-ws-us/agenda/pdfs/8_-_Linux_Whats_New_-_Stefan_Raspl.pdf
+.\" and "z/Architecture Principles of Operation" obtained from
+.\" http://publibfi.boulder.ibm.com/epubs/pdf/dz9zr011.pdf
+The guarded storage facility is a hardware feature that allows marking up to
+64 memory regions (as of z14) as guarded;
+reading a pointer with the newly introduced "Load Guarded" (LGG)
+or "Load Logical and Shift Guarded" (LLGFSG) instructions will cause
+a range check on the loaded value and invoke a (previously set up)
+user-space handler if one of the guarded regions is affected.
+.PP
+The
+.\" The command description is copied from v4.12-rc1~139^2~56^2 commit message
+.I command
+argument indicates which function to perform.
+The following commands are supported:
+.TP
+.B GS_ENABLE
+Enable the guarded storage facility for the calling task.
+The initial content of the guarded storage control block will be all zeros.
+After enablement, user-space code can use the "Load Guarded Storage
+Controls" (LGSC) instruction (or the
+.BR load_gs_cb ()
+function wrapper provided in the
+.I asm/guarded_storage.h
+header) to load an arbitrary control block.
+While a task is enabled, the kernel will save and restore the current content
+of the guarded storage registers on context switch.
+.TP
+.B GS_DISABLE
+Disables the use of the guarded storage facility for the calling task.
+The kernel will cease to save and restore the content of the guarded storage
+registers; the task-specific content of these registers is lost.
+.TP
+.B GS_SET_BC_CB
+Set a broadcast guarded storage control block to the one provided in the
+.I gs_cb
+argument.
+This is called per thread and associates a specific guarded storage control
+block with the calling task.
+This control block will be used in the broadcast command
+.BR GS_BROADCAST .
+.TP
+.B GS_CLEAR_BC_CB
+Clears the broadcast guarded storage control block.
+The guarded storage control block will no longer have the association
+established by the
+.B GS_SET_BC_CB
+command.
+.TP
+.B GS_BROADCAST
+Sends a broadcast to all thread siblings of the calling task.
+Every sibling that has established a broadcast guarded storage control block
+will load this control block and will be enabled for guarded storage.
+The broadcast guarded storage control block is consumed; a second broadcast
+without a refresh of the stored control block with
+.B GS_SET_BC_CB
+will not have any effect.
+.PP
+The
+.I gs_cb
+argument specifies the address of a guarded storage control block structure
+and is currently used only by the
+.B GS_SET_BC_CB
+command; all other aforementioned commands ignore this argument.
+.SH RETURN VALUE
+On success, the return value of
+.BR s390_guarded_storage ()
+is 0.
+.PP
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EFAULT
+.I command
+was
+.B GS_SET_BC_CB
+and the copying of the guarded storage control block structure pointed to by the
+.I gs_cb
+argument has failed.
+.TP
+.B EINVAL
+The value provided in the
+.I command
+argument was not valid.
+.TP
+.B ENOMEM
+.I command
+was one of
+.BR GS_ENABLE " or " GS_SET_BC_CB ,
+and the allocation of a new guarded storage control block has failed.
+.TP
+.B EOPNOTSUPP
+The guarded storage facility is not supported by the hardware.
+.SH STANDARDS
+Linux on s390.
+.SH HISTORY
+.\" 916cda1aa1b412d7cf2991c3af7479544942d121, v4.12-rc1~139^2~56^2
+Linux 4.12.
+System z14.
+.SH NOTES
+The description of the guarded storage facility along with related
+instructions and Guarded Storage Control Block and
+Guarded Storage Event Parameter List structure layouts
+is available in "z/Architecture Principles of Operation"
+beginning from the twelfth edition.
+.PP
+The
+.I gs_cb
+structure has a field
+.I gsepla
+(Guarded Storage Event Parameter List Address), which is a user-space pointer
+to a Guarded Storage Event Parameter List structure
+(that contains the address
+of the aforementioned event handler in the
+.I gseha
+field), and its layout is available as a
+.B gs_epl
+structure type definition in the
+.I asm/guarded_storage.h
+header.
+.\" .PP
+.\" For the example of using the guarded storage facility, see
+.\" .UR https://developer.ibm.com/javasdk/2017/09/25/concurrent-scavenge-using-guarded-storage-facility-works/
+.\" the article with the description of its usage in the Java Garbage Collection
+.\" .UE
+.SH SEE ALSO
+.BR syscall (2)
diff --git a/man2/s390_pci_mmio_read.2 b/man2/s390_pci_mmio_read.2
new file mode 100644
index 0000000..dedc390
--- /dev/null
+++ b/man2/s390_pci_mmio_read.2
@@ -0,0 +1 @@
+.so man2/s390_pci_mmio_write.2
diff --git a/man2/s390_pci_mmio_write.2 b/man2/s390_pci_mmio_write.2
new file mode 100644
index 0000000..07788e9
--- /dev/null
+++ b/man2/s390_pci_mmio_write.2
@@ -0,0 +1,94 @@
+.\" Copyright (c) IBM Corp. 2015
+.\" Author: Alexey Ishchuk <aishchuk@linux.vnet.ibm.com>
+.\"
+.\" SPDX-License-Identifier: GPL-2.0-or-later
+.\"
+.TH s390_pci_mmio_write 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+s390_pci_mmio_write, s390_pci_mmio_read \- transfer data to/from PCI
+MMIO memory page
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */"
+.B #include <unistd.h>
+.PP
+.BI "int syscall(SYS_s390_pci_mmio_write, unsigned long " mmio_addr ,
+.BI " const void " user_buffer [. length "], \
+size_t " length );
+.BI "int syscall(SYS_s390_pci_mmio_read, unsigned long " mmio_addr ,
+.BI " void " user_buffer [. length "], size_t " length );
+.fi
+.PP
+.IR Note :
+glibc provides no wrappers for these system calls,
+necessitating the use of
+.BR syscall (2).
+.SH DESCRIPTION
+The
+.BR s390_pci_mmio_write ()
+system call writes
+.I length
+bytes of data from the user-space buffer
+.I user_buffer
+to the PCI MMIO memory location specified by
+.IR mmio_addr .
+The
+.BR s390_pci_mmio_read ()
+system call reads
+.I length
+bytes of
+data from the PCI MMIO memory location specified by
+.I mmio_addr
+to the user-space buffer
+.IR user_buffer .
+.PP
+These system calls must be used instead of the simple assignment
+or data-transfer operations that are used to access the PCI MMIO
+memory areas mapped to user space on the Linux System z platform.
+The address specified by
+.I mmio_addr
+must belong to a PCI MMIO memory page mapping in the caller's address space,
+and the data being written or read must not cross a page boundary.
+The
+.I length
+value cannot be greater than the system page size.
+.SH RETURN VALUE
+On success,
+.BR s390_pci_mmio_write ()
+and
+.BR s390_pci_mmio_read ()
+return 0.
+On failure, \-1 is returned and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EFAULT
+The address in
+.I mmio_addr
+is invalid.
+.TP
+.B EFAULT
+.I user_buffer
+does not point to a valid location in the caller's address space.
+.TP
+.B EINVAL
+Invalid
+.I length
+argument.
+.TP
+.B ENODEV
+PCI support is not enabled.
+.TP
+.B ENOMEM
+Insufficient memory.
+.SH STANDARDS
+Linux on s390.
+.SH HISTORY
+Linux 3.19.
+System z EC12.
+.SH SEE ALSO
+.BR syscall (2)
diff --git a/man2/s390_runtime_instr.2 b/man2/s390_runtime_instr.2
new file mode 100644
index 0000000..fb1be13
--- /dev/null
+++ b/man2/s390_runtime_instr.2
@@ -0,0 +1,104 @@
+.\" Copyright (c) IBM Corp. 2012
+.\" Author: Jan Glauber <jang@linux.vnet.ibm.com>
+.\"
+.\" SPDX-License-Identifier: GPL-2.0-or-later
+.\"
+.TH s390_runtime_instr 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+s390_runtime_instr \- enable/disable s390 CPU run-time instrumentation
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#include <asm/runtime_instr.h>" " /* Definition of " S390_* " constants */"
+.BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */"
+.B #include <unistd.h>
+.PP
+.BI "int syscall(SYS_s390_runtime_instr, int " command ", int " signum );
+.fi
+.PP
+.IR Note :
+glibc provides no wrapper for
+.BR s390_runtime_instr (),
+necessitating the use of
+.BR syscall (2).
+.SH DESCRIPTION
+The
+.BR s390_runtime_instr ()
+system call starts or stops CPU run-time instrumentation for the
+calling thread.
+.PP
+The
+.I command
+argument controls whether run-time instrumentation is started
+.RB ( S390_RUNTIME_INSTR_START ,
+1) or stopped
+.RB ( S390_RUNTIME_INSTR_STOP ,
+2) for the calling thread.
+.PP
+The
+.I signum
+argument specifies the number of a real-time signal.
+This argument was used to specify a signal number that should be delivered
+to the thread if the run-time instrumentation buffer was full or if
+the run-time-instrumentation-halted interrupt had occurred.
+This feature was never used,
+and in Linux 4.4 support for this feature was removed;
+.\" commit b38feccd663b55ab07116208b68e1ffc7c3c7e78
+thus, in current kernels, this argument is ignored.
+.SH RETURN VALUE
+On success,
+.BR s390_runtime_instr ()
+returns 0 and enables the thread for
+run-time instrumentation by assigning the thread a default run-time
+instrumentation control block.
+The caller can then read and modify the control block and start the run-time
+instrumentation.
+On error, \-1 is returned and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EINVAL
+The value specified in
+.I command
+is not a valid command.
+.TP
+.B EINVAL
+The value specified in
+.I signum
+is not a real-time signal number.
+From Linux 4.4 onwards, the
+.I signum
+argument has no effect,
+so that an invalid signal number will not result in an error.
+.TP
+.B ENOMEM
+Allocating memory for the run-time instrumentation control block failed.
+.TP
+.B EOPNOTSUPP
+The run-time instrumentation facility is not available.
+.SH STANDARDS
+Linux on s390.
+.SH HISTORY
+Linux 3.7.
+System z EC12.
+.SH NOTES
+The
+.I asm/runtime_instr.h
+header file is available
+.\" commit df2f815a7df7edb5335a3bdeee6a8f9f6f9c35c4
+since Linux 4.16.
+.PP
+Starting with Linux 4.4,
+support for signalling was removed, as was the check whether
+.I signum
+is a valid real-time signal.
+For backwards compatibility with older kernels, it is recommended to pass
+a valid real-time signal number in
+.I signum
+and install a handler for that signal.
+.SH SEE ALSO
+.BR syscall (2),
+.BR signal (7)
diff --git a/man2/s390_sthyi.2 b/man2/s390_sthyi.2
new file mode 100644
index 0000000..9c6af82
--- /dev/null
+++ b/man2/s390_sthyi.2
@@ -0,0 +1,133 @@
+.\" Copyright IBM Corp. 2017
+.\" Author: QingFeng Hao <haoqf@linux.vnet.ibm.com>
+.\"
+.\" SPDX-License-Identifier: GPL-2.0-or-later
+.\"
+.TH s390_sthyi 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+s390_sthyi \- emulate STHYI instruction
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#include <asm/sthyi.h>" " /* Definition of " STHYI_* " constants */"
+.BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */"
+.B #include <unistd.h>
+.PP
+.BI "int syscall(SYS_s390_sthyi, unsigned long " function_code ,
+.BI " void *" resp_buffer ", uint64_t *" return_code ,
+.BI " unsigned long " flags );
+.fi
+.PP
+.IR Note :
+glibc provides no wrapper for
+.BR s390_sthyi (),
+necessitating the use of
+.BR syscall (2).
+.SH DESCRIPTION
+The
+.BR s390_sthyi ()
+system call emulates the STHYI (Store Hypervisor Information) instruction.
+It provides hardware resource information for the machine and its
+virtualization levels.
+This includes CPU type and capacity, as well as the machine model and
+other metrics.
+.PP
+The
+.I function_code
+argument indicates which function to perform.
+The following code(s) are supported:
+.TP
+.B STHYI_FC_CP_IFL_CAP
+Return CP (Central Processor) and IFL (Integrated Facility for Linux)
+capacity information.
+.PP
+The
+.I resp_buffer
+argument specifies the address of a response buffer.
+When the
+.I function_code
+is
+.BR STHYI_FC_CP_IFL_CAP ,
+the buffer must be one page (4K) in size.
+If the system call returns 0,
+the response buffer will be filled with CPU capacity information.
+Otherwise, the response buffer's content is unchanged.
+.PP
+The
+.I return_code
+argument stores the return code of the STHYI instruction,
+using one of the following values:
+.TP
+0
+Success.
+.TP
+4
+Unsupported function code.
+.PP
+For further details about
+.IR return_code ,
+.IR function_code ,
+and
+.IR resp_buffer ,
+see the reference given in NOTES.
+.PP
+The
+.I flags
+argument is provided to allow for future extensions and currently
+must be set to 0.
+.SH RETURN VALUE
+On success (that is: emulation succeeded), the return value of
+.BR s390_sthyi ()
+matches the condition code of the STHYI instruction, which is a value
+in the range [0..3].
+A return value of 0 indicates that CPU capacity information is stored in
+.IR *resp_buffer .
+A return value of 3 indicates "unsupported function code" and the content of
+.I *resp_buffer
+is unchanged.
+The return values 1 and 2 are reserved.
+.PP
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EFAULT
+The value specified in
+.I resp_buffer
+or
+.I return_code
+is not a valid address.
+.TP
+.B EINVAL
+The value specified in
+.I flags
+is nonzero.
+.TP
+.B ENOMEM
+Allocating memory for handling the CPU capacity information failed.
+.TP
+.B EOPNOTSUPP
+The value specified in
+.I function_code
+is not valid.
+.SH STANDARDS
+Linux on s390.
+.SH HISTORY
+Linux 4.15.
+.SH NOTES
+For details of the STHYI instruction, see
+.UR https://www.ibm.com\:/support\:/knowledgecenter\:/SSB27U_6.3.0\:/com.ibm.zvm.v630.hcpb4\:/hcpb4sth.htm
+the documentation page
+.UE .
+.PP
+When the system call interface is used, the response buffer doesn't
+have to fulfill alignment requirements described in the STHYI
+instruction definition.
+.PP
+The kernel caches the response (for up to one second, as of Linux 4.16).
+Subsequent system call invocations may return the cached response.
+.SH SEE ALSO
+.BR syscall (2)
diff --git a/man2/sbrk.2 b/man2/sbrk.2
new file mode 100644
index 0000000..a3711a5
--- /dev/null
+++ b/man2/sbrk.2
@@ -0,0 +1 @@
+.so man2/brk.2
diff --git a/man2/sched_get_priority_max.2 b/man2/sched_get_priority_max.2
new file mode 100644
index 0000000..491e134
--- /dev/null
+++ b/man2/sched_get_priority_max.2
@@ -0,0 +1,112 @@
+.\" Copyright (C) Tom Bjorkholm & Markus Kuhn, 1996
+.\"
+.\" SPDX-License-Identifier: GPL-2.0-or-later
+.\"
+.\" 1996-04-01 Tom Bjorkholm <tomb@mydata.se>
+.\" First version written
+.\" 1996-04-10 Markus Kuhn <mskuhn@cip.informatik.uni-erlangen.de>
+.\" revision
+.\"
+.TH sched_get_priority_max 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+sched_get_priority_max, sched_get_priority_min \- get static priority range
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sched.h>
+.PP
+.BI "int sched_get_priority_max(int " policy );
+.BI "int sched_get_priority_min(int " policy );
+.fi
+.SH DESCRIPTION
+.BR sched_get_priority_max ()
+returns the maximum priority value that can be used with the
+scheduling algorithm identified by
+.IR policy .
+.BR sched_get_priority_min ()
+returns the minimum priority value that can be used with the
+scheduling algorithm identified by
+.IR policy .
+Supported
+.I policy
+values are
+.BR SCHED_FIFO ,
+.BR SCHED_RR ,
+.BR SCHED_OTHER ,
+.BR SCHED_BATCH ,
+.BR SCHED_IDLE ,
+and
+.BR SCHED_DEADLINE .
+Further details about these policies can be found in
+.BR sched (7).
+.PP
+Processes with numerically higher priority values are scheduled before
+processes with numerically lower priority values.
+Thus, the value
+returned by
+.BR sched_get_priority_max ()
+will be greater than the
+value returned by
+.BR sched_get_priority_min ().
+.PP
+Linux allows the static priority range 1 to 99 for the
+.B SCHED_FIFO
+and
+.B SCHED_RR
+policies, and the priority 0 for the remaining policies.
+Scheduling priority ranges for the various policies
+are not alterable.
+.PP
+The range of scheduling priorities may vary on other POSIX systems,
+thus it is a good idea for portable applications to use a virtual
+priority range and map it to the interval given by
+.BR sched_get_priority_max ()
+and
+.BR sched_get_priority_min ().
+POSIX.1 requires
+.\" POSIX.1-2001, POSIX.1-2008 (XBD 2.8.4)
+a spread of at least 32 between the maximum and the minimum values for
+.B SCHED_FIFO
+and
+.BR SCHED_RR .
+.PP
+POSIX systems on which
+.BR sched_get_priority_max ()
+and
+.BR sched_get_priority_min ()
+are available define
+.B _POSIX_PRIORITY_SCHEDULING
+in
+.IR <unistd.h> .
+.SH RETURN VALUE
+On success,
+.BR sched_get_priority_max ()
+and
+.BR sched_get_priority_min ()
+return the maximum/minimum priority value for the named scheduling
+policy.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EINVAL
+The argument
+.I policy
+does not identify a defined scheduling policy.
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+POSIX.1-2001.
+.SH SEE ALSO
+.ad l
+.nh
+.BR sched_getaffinity (2),
+.BR sched_getparam (2),
+.BR sched_getscheduler (2),
+.BR sched_setaffinity (2),
+.BR sched_setparam (2),
+.BR sched_setscheduler (2),
+.BR sched (7)
diff --git a/man2/sched_get_priority_min.2 b/man2/sched_get_priority_min.2
new file mode 100644
index 0000000..17b99f0
--- /dev/null
+++ b/man2/sched_get_priority_min.2
@@ -0,0 +1 @@
+.so man2/sched_get_priority_max.2
diff --git a/man2/sched_getaffinity.2 b/man2/sched_getaffinity.2
new file mode 100644
index 0000000..f376c11
--- /dev/null
+++ b/man2/sched_getaffinity.2
@@ -0,0 +1 @@
+.so man2/sched_setaffinity.2
diff --git a/man2/sched_getattr.2 b/man2/sched_getattr.2
new file mode 100644
index 0000000..cb2c346
--- /dev/null
+++ b/man2/sched_getattr.2
@@ -0,0 +1 @@
+.so man2/sched_setattr.2
diff --git a/man2/sched_getparam.2 b/man2/sched_getparam.2
new file mode 100644
index 0000000..d39facd
--- /dev/null
+++ b/man2/sched_getparam.2
@@ -0,0 +1 @@
+.so man2/sched_setparam.2
diff --git a/man2/sched_getscheduler.2 b/man2/sched_getscheduler.2
new file mode 100644
index 0000000..13aa827
--- /dev/null
+++ b/man2/sched_getscheduler.2
@@ -0,0 +1 @@
+.so man2/sched_setscheduler.2
diff --git a/man2/sched_rr_get_interval.2 b/man2/sched_rr_get_interval.2
new file mode 100644
index 0000000..cbd6247
--- /dev/null
+++ b/man2/sched_rr_get_interval.2
@@ -0,0 +1,110 @@
+.\" Copyright (C) Tom Bjorkholm & Markus Kuhn, 1996
+.\"
+.\" SPDX-License-Identifier: GPL-2.0-or-later
+.\"
+.\" 1996-04-01 Tom Bjorkholm <tomb@mydata.se>
+.\" First version written
+.\" 1996-04-10 Markus Kuhn <mskuhn@cip.informatik.uni-erlangen.de>
+.\" revision
+.\"
+.TH sched_rr_get_interval 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+sched_rr_get_interval \- get the SCHED_RR interval for the named process
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sched.h>
+.PP
+.BI "int sched_rr_get_interval(pid_t " pid ", struct timespec *" tp );
+.fi
+.SH DESCRIPTION
+.BR sched_rr_get_interval ()
+writes into the
+.BR timespec (3)
+structure pointed to by
+.I tp
+the round-robin time quantum for the process identified by
+.IR pid .
+The specified process should be running under the
+.B SCHED_RR
+scheduling policy.
+.PP
+If
+.I pid
+is zero, the time quantum for the calling process is written into
+.IR *tp .
+.\" FIXME . On Linux, sched_rr_get_interval()
+.\" returns the timeslice for SCHED_OTHER processes -- this timeslice
+.\" is influenced by the nice value.
+.\" For SCHED_FIFO processes, this always returns 0.
+.\"
+.\" The round-robin time quantum value is not alterable under Linux
+.\" 1.3.81.
+.\"
+.SH RETURN VALUE
+On success,
+.BR sched_rr_get_interval ()
+returns 0.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EFAULT
+Problem with copying information to user space.
+.TP
+.B EINVAL
+Invalid pid.
+.TP
+.B ENOSYS
+The system call is not yet implemented (only on rather old kernels).
+.TP
+.B ESRCH
+Could not find a process with the ID
+.IR pid .
+.SH VERSIONS
+.SS Linux
+Linux 3.9 added
+.\" commit ce0dbbbb30aee6a835511d5be446462388ba9eee
+a new mechanism for adjusting (and viewing) the
+.B SCHED_RR
+quantum: the
+.I /proc/sys/kernel/sched_rr_timeslice_ms
+file exposes the quantum as a millisecond value, whose default is 100.
+Writing 0 to this file resets the quantum to the default value.
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+POSIX.1-2001.
+.SS Linux
+POSIX does not specify any mechanism for controlling the size of the
+round-robin time quantum.
+Older Linux kernels provide a (nonportable) method of doing this.
+The quantum can be controlled by adjusting the process's nice value (see
+.BR setpriority (2)).
+Assigning a negative (i.e., high) nice value results in a longer quantum;
+assigning a positive (i.e., low) nice value results in a shorter quantum.
+The default quantum is 0.1 seconds;
+the degree to which changing the nice value affects the
+quantum has varied somewhat across kernel versions.
+This method of adjusting the quantum was removed
+.\" commit a4ec24b48ddef1e93f7578be53270f0b95ad666c
+starting with Linux 2.6.24.
+.SH NOTES
+POSIX systems on which
+.BR sched_rr_get_interval ()
+is available define
+.B _POSIX_PRIORITY_SCHEDULING
+in
+.IR <unistd.h> .
+.\" .SH BUGS
+.\" As of Linux 1.3.81
+.\" .BR sched_rr_get_interval ()
+.\" returns with error
+.\" ENOSYS, because SCHED_RR has not yet been fully implemented and tested
+.\" properly.
+.SH SEE ALSO
+.BR timespec (3),
+.BR sched (7)
diff --git a/man2/sched_setaffinity.2 b/man2/sched_setaffinity.2
new file mode 100644
index 0000000..9389e09
--- /dev/null
+++ b/man2/sched_setaffinity.2
@@ -0,0 +1,427 @@
+.\" Copyright (C) 2002 Robert Love
+.\" and Copyright (C) 2006, 2015 Michael Kerrisk
+.\"
+.\" SPDX-License-Identifier: GPL-2.0-or-later
+.\"
+.\" 2002-11-19 Robert Love <rml@tech9.net> - initial version
+.\" 2004-04-20 mtk - fixed description of return value
+.\" 2004-04-22 aeb - added glibc prototype history
+.\" 2005-05-03 mtk - noted that sched_setaffinity may cause thread
+.\" migration and that CPU affinity is a per-thread attribute.
+.\" 2006-02-03 mtk -- Major rewrite
+.\" 2008-11-12, mtk, removed CPU_*() macro descriptions to a
+.\" separate CPU_SET(3) page.
+.\"
+.TH sched_setaffinity 2 2023-05-03 "Linux man-pages 6.05.01"
+.SH NAME
+sched_setaffinity, sched_getaffinity \- \
+set and get a thread's CPU affinity mask
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#define _GNU_SOURCE" " /* See feature_test_macros(7) */"
+.B #include <sched.h>
+.PP
+.BI "int sched_setaffinity(pid_t " pid ", size_t " cpusetsize ,
+.BI " const cpu_set_t *" mask );
+.BI "int sched_getaffinity(pid_t " pid ", size_t " cpusetsize ,
+.BI " cpu_set_t *" mask );
+.fi
+.SH DESCRIPTION
+A thread's CPU affinity mask determines the set of CPUs on which
+it is eligible to run.
+On a multiprocessor system, setting the CPU affinity mask
+can be used to obtain performance benefits.
+For example,
+by dedicating one CPU to a particular thread
+(i.e., setting the affinity mask of that thread to specify a single CPU,
+and setting the affinity mask of all other threads to exclude that CPU),
+it is possible to ensure maximum execution speed for that thread.
+Restricting a thread to run on a single CPU also avoids
+the performance cost caused by the cache invalidation that occurs
+when a thread ceases to execute on one CPU and then
+recommences execution on a different CPU.
+.PP
+A CPU affinity mask is represented by the
+.I cpu_set_t
+structure, a "CPU set", pointed to by
+.IR mask .
+A set of macros for manipulating CPU sets is described in
+.BR CPU_SET (3).
+.PP
+.BR sched_setaffinity ()
+sets the CPU affinity mask of the thread whose ID is
+.I pid
+to the value specified by
+.IR mask .
+If
+.I pid
+is zero, then the calling thread is used.
+The argument
+.I cpusetsize
+is the length (in bytes) of the data pointed to by
+.IR mask .
+Normally this argument would be specified as
+.IR "sizeof(cpu_set_t)" .
+.PP
+If the thread specified by
+.I pid
+is not currently running on one of the CPUs specified in
+.IR mask ,
+then that thread is migrated to one of the CPUs specified in
+.IR mask .
+.PP
+.BR sched_getaffinity ()
+writes the affinity mask of the thread whose ID is
+.I pid
+into the
+.I cpu_set_t
+structure pointed to by
+.IR mask .
+The
+.I cpusetsize
+argument specifies the size (in bytes) of
+.IR mask .
+If
+.I pid
+is zero, then the mask of the calling thread is returned.
+.SH RETURN VALUE
+On success,
+.BR sched_setaffinity ()
+and
+.BR sched_getaffinity ()
+return 0 (but see "C library/kernel differences" below,
+which notes that the underlying
+.BR sched_getaffinity ()
+differs in its return value).
+On failure, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EFAULT
+A supplied memory address was invalid.
+.TP
+.B EINVAL
+The affinity bit mask
+.I mask
+contains no processors that are currently physically on the system
+and permitted to the thread according to any restrictions that
+may be imposed by
+.I cpuset
+cgroups or the "cpuset" mechanism described in
+.BR cpuset (7).
+.TP
+.B EINVAL
+.RB ( sched_getaffinity ()
+and, before Linux 2.6.9,
+.BR sched_setaffinity ())
+.I cpusetsize
+is smaller than the size of the affinity mask used by the kernel.
+.TP
+.B EPERM
+.RB ( sched_setaffinity ())
+The calling thread does not have appropriate privileges.
+The caller needs an effective user ID equal to the real user ID
+or effective user ID of the thread identified by
+.IR pid ,
+or it must possess the
+.B CAP_SYS_NICE
+capability in the user namespace of the thread
+.IR pid .
+.TP
+.B ESRCH
+The thread whose ID is \fIpid\fP could not be found.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 2.5.8,
+glibc 2.3.
+.PP
+Initially, the glibc interfaces included a
+.I cpusetsize
+argument, typed as
+.IR "unsigned int" .
+In glibc 2.3.3, the
+.I cpusetsize
+argument was removed, but was then restored in glibc 2.3.4, with type
+.IR size_t .
+.SH NOTES
+After a call to
+.BR sched_setaffinity (),
+the set of CPUs on which the thread will actually run is
+the intersection of the set specified in the
+.I mask
+argument and the set of CPUs actually present on the system.
+The system may further restrict the set of CPUs on which the thread
+runs if the "cpuset" mechanism described in
+.BR cpuset (7)
+is being used.
+These restrictions on the actual set of CPUs on which the thread
+will run are silently imposed by the kernel.
+.PP
+There are various ways of determining the number of CPUs
+available on the system, including: inspecting the contents of
+.IR /proc/cpuinfo ;
+using
+.BR sysconf (3)
+to obtain the values of the
+.B _SC_NPROCESSORS_CONF
+and
+.B _SC_NPROCESSORS_ONLN
+parameters; and inspecting the list of CPU directories under
+.IR /sys/devices/system/cpu/ .
+.PP
+.BR sched (7)
+has a description of the Linux scheduling scheme.
+.PP
+The affinity mask is a per-thread attribute that can be
+adjusted independently for each of the threads in a thread group.
+The value returned from a call to
+.BR gettid (2)
+can be passed in the argument
+.IR pid .
+Specifying
+.I pid
+as 0 will set the attribute for the calling thread,
+and passing the value returned from a call to
+.BR getpid (2)
+will set the attribute for the main thread of the thread group.
+(If you are using the POSIX threads API, then use
+.BR pthread_setaffinity_np (3)
+instead of
+.BR sched_setaffinity ().)
+.PP
+The
+.I isolcpus
+boot option can be used to isolate one or more CPUs at boot time,
+so that no processes are scheduled onto those CPUs.
+Following the use of this boot option,
+the only way to schedule processes onto the isolated CPUs is via
+.BR sched_setaffinity ()
+or the
+.BR cpuset (7)
+mechanism.
+For further information, see the kernel source file
+.IR Documentation/admin\-guide/kernel\-parameters.txt .
+As noted in that file,
+.I isolcpus
+is the preferred mechanism of isolating CPUs
+(versus the alternative of manually setting the CPU affinity
+of all processes on the system).
+.PP
+A child created via
+.BR fork (2)
+inherits its parent's CPU affinity mask.
+The affinity mask is preserved across an
+.BR execve (2).
+.SS C library/kernel differences
+This manual page describes the glibc interface for the CPU affinity calls.
+The actual system call interface is slightly different, with the
+.I mask
+being typed as
+.IR "unsigned long\ *" ,
+reflecting the fact that the underlying implementation of CPU
+sets is a simple bit mask.
+.PP
+On success, the raw
+.BR sched_getaffinity ()
+system call returns the number of bytes copied into the
+.I mask
+buffer;
+this will be the minimum of
+.I cpusetsize
+and the size (in bytes) of the
+.I cpumask_t
+data type that is used internally by the kernel to
+represent the CPU set bit mask.
+.SS Handling systems with large CPU affinity masks
+The underlying system calls (which represent CPU masks as bit masks of type
+.IR "unsigned long\ *" )
+impose no restriction on the size of the CPU mask.
+However, the
+.I cpu_set_t
+data type used by glibc has a fixed size of 128 bytes,
+meaning that the maximum CPU number that can be represented is 1023.
+.\" FIXME . See https://sourceware.org/bugzilla/show_bug.cgi?id=15630
+.\" and https://sourceware.org/ml/libc-alpha/2013-07/msg00288.html
+If the kernel CPU affinity mask is larger than 1024 bits,
+then calls of the form:
+.PP
+.in +4n
+.EX
+sched_getaffinity(pid, sizeof(cpu_set_t), &mask);
+.EE
+.in
+.PP
+fail with the error
+.BR EINVAL ,
+the error produced by the underlying system call for the case where the
+.I mask
+size specified in
+.I cpusetsize
+is smaller than the size of the affinity mask used by the kernel.
+(Depending on the system CPU topology, the kernel affinity mask can
+be substantially larger than the number of active CPUs in the system.)
+.PP
+When working on systems with large kernel CPU affinity masks,
+one must dynamically allocate the
+.I mask
+argument (see
+.BR CPU_ALLOC (3)).
+Currently, the only way to do this is by probing for the size
+of the required mask using
+.BR sched_getaffinity ()
+calls with increasing mask sizes (until the call does not fail with the error
+.BR EINVAL ).
+.PP
+Be aware that
+.BR CPU_ALLOC (3)
+may allocate a slightly larger CPU set than requested
+(because CPU sets are implemented as bit masks allocated in units of
+.IR sizeof(long) ).
+Consequently,
+.BR sched_getaffinity ()
+can set bits beyond the requested allocation size, because the kernel
+sees a few additional bits.
+Therefore, the caller should iterate over the bits in the returned set,
+counting those which are set, and stop upon reaching the value returned by
+.BR CPU_COUNT (3)
+(rather than iterating over the number of bits
+requested to be allocated).
+.SH EXAMPLES
+The program below creates a child process.
+The parent and child then each assign themselves to a specified CPU
+and execute identical loops that consume some CPU time.
+Before terminating, the parent waits for the child to complete.
+The program takes three command-line arguments:
+the CPU number for the parent,
+the CPU number for the child,
+and the number of loop iterations that both processes should perform.
+.PP
+As the sample runs below demonstrate, the amount of real and CPU time
+consumed when running the program will depend on intra-core caching effects
+and whether the processes are using the same CPU.
+.PP
+We first employ
+.BR lscpu (1)
+to determine that this (x86)
+system has two cores, each with two CPUs:
+.PP
+.in +4n
+.EX
+$ \fBlscpu | egrep \-i \[aq]core.*:|socket\[aq]\fP
+Thread(s) per core: 2
+Core(s) per socket: 2
+Socket(s): 1
+.EE
+.in
+.PP
+We then time the operation of the example program for three cases:
+both processes running on the same CPU;
+both processes running on different CPUs on the same core;
+and both processes running on different CPUs on different cores.
+.PP
+.in +4n
+.EX
+$ \fBtime \-p ./a.out 0 0 100000000\fP
+real 14.75
+user 3.02
+sys 11.73
+$ \fBtime \-p ./a.out 0 1 100000000\fP
+real 11.52
+user 3.98
+sys 19.06
+$ \fBtime \-p ./a.out 0 3 100000000\fP
+real 7.89
+user 3.29
+sys 12.07
+.EE
+.in
+.SS Program source
+\&
+.\" SRC BEGIN (sched_setaffinity.c)
+.EX
+#define _GNU_SOURCE
+#include <err.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/wait.h>
+#include <unistd.h>
+\&
+int
+main(int argc, char *argv[])
+{
+ int parentCPU, childCPU;
+ cpu_set_t set;
+ unsigned int nloops;
+\&
+ if (argc != 4) {
+ fprintf(stderr, "Usage: %s parent\-cpu child\-cpu num\-loops\en",
+ argv[0]);
+ exit(EXIT_FAILURE);
+ }
+\&
+ parentCPU = atoi(argv[1]);
+ childCPU = atoi(argv[2]);
+ nloops = atoi(argv[3]);
+\&
+ CPU_ZERO(&set);
+\&
+ switch (fork()) {
+ case \-1: /* Error */
+ err(EXIT_FAILURE, "fork");
+\&
+ case 0: /* Child */
+ CPU_SET(childCPU, &set);
+\&
+ if (sched_setaffinity(getpid(), sizeof(set), &set) == \-1)
+ err(EXIT_FAILURE, "sched_setaffinity");
+\&
+ for (unsigned int j = 0; j < nloops; j++)
+ getppid();
+\&
+ exit(EXIT_SUCCESS);
+\&
+ default: /* Parent */
+ CPU_SET(parentCPU, &set);
+\&
+ if (sched_setaffinity(getpid(), sizeof(set), &set) == \-1)
+ err(EXIT_FAILURE, "sched_setaffinity");
+\&
+ for (unsigned int j = 0; j < nloops; j++)
+ getppid();
+\&
+ wait(NULL); /* Wait for child to terminate */
+ exit(EXIT_SUCCESS);
+ }
+}
+.EE
+.\" SRC END
+.SH SEE ALSO
+.ad l
+.nh
+.BR lscpu (1),
+.BR nproc (1),
+.BR taskset (1),
+.BR clone (2),
+.BR getcpu (2),
+.BR getpriority (2),
+.BR gettid (2),
+.BR nice (2),
+.BR sched_get_priority_max (2),
+.BR sched_get_priority_min (2),
+.BR sched_getscheduler (2),
+.BR sched_setscheduler (2),
+.BR setpriority (2),
+.BR CPU_SET (3),
+.BR get_nprocs (3),
+.BR pthread_setaffinity_np (3),
+.BR sched_getcpu (3),
+.BR capabilities (7),
+.BR cpuset (7),
+.BR sched (7),
+.BR numactl (8)
diff --git a/man2/sched_setattr.2 b/man2/sched_setattr.2
new file mode 100644
index 0000000..b4975c6
--- /dev/null
+++ b/man2/sched_setattr.2
@@ -0,0 +1,447 @@
+.\" Copyright (C) 2014 Michael Kerrisk <mtk.manpages@gmail.com>
+.\" and Copyright (C) 2014 Peter Zijlstra <peterz@infradead.org>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.TH sched_setattr 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+sched_setattr, sched_getattr \-
+set and get scheduling policy and attributes
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#include <sched.h>" " /* Definition of " SCHED_* " constants */"
+.BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */"
+.B #include <unistd.h>
+.PP
+.BI "int syscall(SYS_sched_setattr, pid_t " pid ", struct sched_attr *" attr ,
+.BI " unsigned int " flags );
+.BI "int syscall(SYS_sched_getattr, pid_t " pid ", struct sched_attr *" attr ,
+.BI " unsigned int " size ", unsigned int " flags );
+.fi
+.\" FIXME . Add feature test macro requirements
+.PP
+.IR Note :
+glibc provides no wrappers for these system calls,
+necessitating the use of
+.BR syscall (2).
+.SH DESCRIPTION
+.SS sched_setattr()
+The
+.BR sched_setattr ()
+system call sets the scheduling policy and
+associated attributes for the thread whose ID is specified in
+.IR pid .
+If
+.I pid
+equals zero,
+the scheduling policy and attributes of the calling thread will be set.
+.PP
+Currently, Linux supports the following "normal"
+(i.e., non-real-time) scheduling policies as values that may be specified in
+.IR attr.sched_policy :
+.TP 14
+.B SCHED_OTHER
+the standard round-robin time-sharing policy;
+.\" In the 2.6 kernel sources, SCHED_OTHER is actually called
+.\" SCHED_NORMAL.
+.TP
+.B SCHED_BATCH
+for "batch" style execution of processes; and
+.TP
+.B SCHED_IDLE
+for running
+.I very
+low priority background jobs.
+.PP
+Various "real-time" policies are also supported,
+for special time-critical applications that need precise control over
+the way in which runnable threads are selected for execution.
+For the rules governing when a process may use these policies, see
+.BR sched (7).
+The real-time policies that may be specified in
+.I attr.sched_policy
+are:
+.TP 14
+.B SCHED_FIFO
+a first-in, first-out policy; and
+.TP
+.B SCHED_RR
+a round-robin policy.
+.PP
+Linux also provides the following policy:
+.TP 14
+.B SCHED_DEADLINE
+a deadline scheduling policy; see
+.BR sched (7)
+for details.
+.PP
+The
+.I attr
+argument is a pointer to a structure that defines
+the new scheduling policy and attributes for the specified thread.
+This structure has the following form:
+.PP
+.in +4n
+.EX
+struct sched_attr {
+ u32 size; /* Size of this structure */
+ u32 sched_policy; /* Policy (SCHED_*) */
+ u64 sched_flags; /* Flags */
+ s32 sched_nice; /* Nice value (SCHED_OTHER,
+ SCHED_BATCH) */
+ u32 sched_priority; /* Static priority (SCHED_FIFO,
+ SCHED_RR) */
+ /* Remaining fields are for SCHED_DEADLINE */
+ u64 sched_runtime;
+ u64 sched_deadline;
+ u64 sched_period;
+};
+.EE
+.in
+.PP
+The fields of the
+.I sched_attr
+structure are as follows:
+.TP
+.I size
+This field should be set to the size of the structure in bytes, as in
+.IR "sizeof(struct sched_attr)" .
+If the provided structure is smaller than the kernel structure,
+any additional fields are assumed to be '0'.
+If the provided structure is larger than the kernel structure,
+the kernel verifies that all additional fields are 0;
+if they are not,
+.BR sched_setattr ()
+fails with the error
+.B E2BIG
+and updates
+.I size
+to contain the size of the kernel structure.
+.IP
+The above behavior when the size of the user-space
+.I sched_attr
+structure does not match the size of the kernel structure
+allows for future extensibility of the interface.
+Malformed applications that pass oversize structures
+won't break in the future if the size of the kernel
+.I sched_attr
+structure is increased.
+In the future,
+it could also allow applications that know about a larger user-space
+.I sched_attr
+structure to determine whether they are running on an older kernel
+that does not support the larger structure.
+.TP
+.I sched_policy
+This field specifies the scheduling policy, as one of the
+.B SCHED_*
+values listed above.
+.TP
+.I sched_flags
+This field contains zero or more of the following flags
+that are ORed together to control scheduling behavior:
+.RS
+.TP
+.B SCHED_FLAG_RESET_ON_FORK
+Children created by
+.BR fork (2)
+do not inherit privileged scheduling policies.
+See
+.BR sched (7)
+for details.
+.TP
+.BR SCHED_FLAG_RECLAIM " (since Linux 4.13)"
+.\" 2d4283e9d583a3ee8cfb1cbb9c1270614df4c29d
+This flag allows a
+.B SCHED_DEADLINE
+thread to reclaim bandwidth unused by other real-time threads.
+.\" Bandwidth reclaim is done via the GRUB algorithm; see
+.\" Documentation/scheduler/sched-deadline.txt
+.TP
+.BR SCHED_FLAG_DL_OVERRUN " (since Linux 4.16)"
+.\" commit 34be39305a77b8b1ec9f279163c7cdb6cc719b91
+This flag allows an application to get informed about run-time overruns in
+.B SCHED_DEADLINE
+threads.
+Such overruns may be caused by (for example) coarse execution time accounting
+or incorrect parameter assignment.
+Notification takes the form of a
+.B SIGXCPU
+signal which is generated on each overrun.
+.IP
+This
+.B SIGXCPU
+signal is
+.I process-directed
+(see
+.BR signal (7))
+rather than thread-directed.
+This is probably a bug.
+On the one hand,
+.BR sched_setattr ()
+is being used to set a per-thread attribute.
+On the other hand, if the process-directed signal is delivered to
+a thread inside the process other than the one that had a run-time overrun,
+the application has no way of knowing which thread overran.
+.RE
+.TP
+.I sched_nice
+This field specifies the nice value to be set when specifying
+.I sched_policy
+as
+.B SCHED_OTHER
+or
+.BR SCHED_BATCH .
+The nice value is a number in the range \-20 (high priority)
+to +19 (low priority); see
+.BR sched (7).
+.TP
+.I sched_priority
+This field specifies the static priority to be set when specifying
+.I sched_policy
+as
+.B SCHED_FIFO
+or
+.BR SCHED_RR .
+The allowed range of priorities for these policies can be determined using
+.BR sched_get_priority_min (2)
+and
+.BR sched_get_priority_max (2).
+For other policies, this field must be specified as 0.
+.TP
+.I sched_runtime
+This field specifies the "Runtime" parameter for deadline scheduling.
+The value is expressed in nanoseconds.
+This field, and the next two fields,
+are used only for
+.B SCHED_DEADLINE
+scheduling; for further details, see
+.BR sched (7).
+.TP
+.I sched_deadline
+This field specifies the "Deadline" parameter for deadline scheduling.
+The value is expressed in nanoseconds.
+.TP
+.I sched_period
+This field specifies the "Period" parameter for deadline scheduling.
+The value is expressed in nanoseconds.
+.PP
+The
+.I flags
+argument is provided to allow for future extensions to the interface;
+in the current implementation it must be specified as 0.
+.\"
+.\"
+.SS sched_getattr()
+The
+.BR sched_getattr ()
+system call fetches the scheduling policy and the
+associated attributes for the thread whose ID is specified in
+.IR pid .
+If
+.I pid
+equals zero,
+the scheduling policy and attributes of the calling thread
+will be retrieved.
+.PP
+The
+.I size
+argument should be set to the size of the
+.I sched_attr
+structure as known to user space.
+The value must be at least as large as the size of the initially published
+.I sched_attr
+structure, or the call fails with the error
+.BR EINVAL .
+.PP
+The retrieved scheduling attributes are placed in the fields of the
+.I sched_attr
+structure pointed to by
+.IR attr .
+The kernel sets
+.I attr.size
+to the size of its
+.I sched_attr
+structure.
+.PP
+If the caller-provided
+.I attr
+buffer is larger than the kernel's
+.I sched_attr
+structure,
+the additional bytes in the user-space structure are not touched.
+If the caller-provided structure is smaller than the kernel
+.I sched_attr
+structure, the kernel will silently not return any values which would be stored
+outside the provided space.
+As with
+.BR sched_setattr (),
+these semantics allow for future extensibility of the interface.
+.PP
+The
+.I flags
+argument is provided to allow for future extensions to the interface;
+in the current implementation it must be specified as 0.
+.SH RETURN VALUE
+On success,
+.BR sched_setattr ()
+and
+.BR sched_getattr ()
+return 0.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.BR sched_getattr ()
+and
+.BR sched_setattr ()
+can both fail for the following reasons:
+.TP
+.B EINVAL
+.I attr
+is NULL; or
+.I pid
+is negative; or
+.I flags
+is not zero.
+.TP
+.B ESRCH
+The thread whose ID is
+.I pid
+could not be found.
+.PP
+In addition,
+.BR sched_getattr ()
+can fail for the following reasons:
+.TP
+.B E2BIG
+The buffer specified by
+.I size
+and
+.I attr
+is too small.
+.TP
+.B EINVAL
+.I size
+is invalid; that is, it is smaller than the initial version of the
+.I sched_attr
+structure (48 bytes) or larger than the system page size.
+.PP
+In addition,
+.BR sched_setattr ()
+can fail for the following reasons:
+.TP
+.B E2BIG
+The buffer specified by
+.I size
+and
+.I attr
+is larger than the kernel structure,
+and one or more of the excess bytes is nonzero.
+.TP
+.B EBUSY
+.B SCHED_DEADLINE
+admission control failure, see
+.BR sched (7).
+.TP
+.B EINVAL
+.I attr.sched_policy
+is not one of the recognized policies;
+.I attr.sched_flags
+contains a flag that the kernel does not recognize
+(before Linux 4.13, any flag other than
+.BR SCHED_FLAG_RESET_ON_FORK );
+or
+.I attr.sched_priority
+is invalid; or
+.I attr.sched_policy
+is
+.B SCHED_DEADLINE
+and the deadline scheduling parameters in
+.I attr
+are invalid.
+.TP
+.B EPERM
+The caller does not have appropriate privileges.
+.TP
+.B EPERM
+The CPU affinity mask of the thread specified by
+.I pid
+does not include all CPUs in the system
+(see
+.BR sched_setaffinity (2)).
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 3.14.
+.\" FIXME . Add glibc version
+.SH NOTES
+glibc does not provide wrappers for these system calls; call them using
+.BR syscall (2).
+.PP
+.BR sched_setattr ()
+provides a superset of the functionality of
+.BR sched_setscheduler (2),
+.BR sched_setparam (2),
+.BR nice (2),
+and (other than the ability to set the priority of all processes
+belonging to a specified user or all processes in a specified group)
+.BR setpriority (2).
+Analogously,
+.BR sched_getattr ()
+provides a superset of the functionality of
+.BR sched_getscheduler (2),
+.BR sched_getparam (2),
+and (partially)
+.BR getpriority (2).
+.SH BUGS
+In Linux versions up to
+.\" FIXME . patch sent to Peter Zijlstra
+3.15,
+.BR sched_setattr ()
+failed with the error
+.B EFAULT
+instead of
+.B E2BIG
+for the case described in ERRORS.
+.PP
+Up to Linux 5.3,
+.BR sched_getattr ()
+failed with the error
+.B EFBIG
+if the in-kernel
+.I sched_attr
+structure was larger than the
+.I size
+passed by user space.
+.\" In Linux versions up to 3.15,
+.\" FIXME . patch from Peter Zijlstra pending
+.\" .BR sched_setattr ()
+.\" allowed a negative
+.\" .I attr.sched_policy
+.\" value.
+.SH SEE ALSO
+.ad l
+.nh
+.BR chrt (1),
+.BR nice (2),
+.BR sched_get_priority_max (2),
+.BR sched_get_priority_min (2),
+.BR sched_getaffinity (2),
+.BR sched_getparam (2),
+.BR sched_getscheduler (2),
+.BR sched_rr_get_interval (2),
+.BR sched_setaffinity (2),
+.BR sched_setparam (2),
+.BR sched_setscheduler (2),
+.BR sched_yield (2),
+.BR setpriority (2),
+.BR pthread_getschedparam (3),
+.BR pthread_setschedparam (3),
+.BR pthread_setschedprio (3),
+.BR capabilities (7),
+.BR cpuset (7),
+.BR sched (7)
+.ad
diff --git a/man2/sched_setparam.2 b/man2/sched_setparam.2
new file mode 100644
index 0000000..f054f5d
--- /dev/null
+++ b/man2/sched_setparam.2
@@ -0,0 +1,121 @@
+.\" Copyright (C) Tom Bjorkholm & Markus Kuhn, 1996
+.\"
+.\" SPDX-License-Identifier: GPL-2.0-or-later
+.\"
+.\" 1996-04-01 Tom Bjorkholm <tomb@mydata.se>
+.\" First version written
+.\" 1996-04-10 Markus Kuhn <mskuhn@cip.informatik.uni-erlangen.de>
+.\" revision
+.\" Modified 2004-05-27 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.TH sched_setparam 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+sched_setparam, sched_getparam \- set and get scheduling parameters
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sched.h>
+.PP
+.BI "int sched_setparam(pid_t " pid ", const struct sched_param *" param );
+.BI "int sched_getparam(pid_t " pid ", struct sched_param *" param );
+.PP
+\fBstruct sched_param {
+ ...
+ int \fIsched_priority\fB;
+ ...
+};
+.fi
+.SH DESCRIPTION
+.BR sched_setparam ()
+sets the scheduling parameters associated with the scheduling policy
+for the thread whose thread ID is specified in \fIpid\fP.
+If \fIpid\fP is zero, then
+the parameters of the calling thread are set.
+The interpretation of
+the argument \fIparam\fP depends on the scheduling
+policy of the thread identified by
+.IR pid .
+See
+.BR sched (7)
+for a description of the scheduling policies supported under Linux.
+.PP
+.BR sched_getparam ()
+retrieves the scheduling parameters for the
+thread identified by \fIpid\fP.
+If \fIpid\fP is zero, then the parameters
+of the calling thread are retrieved.
+.PP
+.BR sched_setparam ()
+checks the validity of \fIparam\fP for the scheduling policy of the
+thread.
+The value \fIparam\->sched_priority\fP must lie within the
+range given by
+.BR sched_get_priority_min (2)
+and
+.BR sched_get_priority_max (2).
+.PP
+For a discussion of the privileges and resource limits related to
+scheduling priority and policy, see
+.BR sched (7).
+.PP
+POSIX systems on which
+.BR sched_setparam ()
+and
+.BR sched_getparam ()
+are available define
+.B _POSIX_PRIORITY_SCHEDULING
+in \fI<unistd.h>\fP.
+.SH RETURN VALUE
+On success,
+.BR sched_setparam ()
+and
+.BR sched_getparam ()
+return 0.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EINVAL
+Invalid arguments:
+.I param
+is NULL or
+.I pid
+is negative.
+.TP
+.B EINVAL
+.RB ( sched_setparam ())
+The argument \fIparam\fP does not make sense for the current
+scheduling policy.
+.TP
+.B EPERM
+.RB ( sched_setparam ())
+The caller does not have appropriate privileges
+(Linux: does not have the
+.B CAP_SYS_NICE
+capability).
+.TP
+.B ESRCH
+The thread whose ID is \fIpid\fP could not be found.
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+POSIX.1-2001.
+.SH SEE ALSO
+.ad l
+.nh
+.BR getpriority (2),
+.BR gettid (2),
+.BR nice (2),
+.BR sched_get_priority_max (2),
+.BR sched_get_priority_min (2),
+.BR sched_getaffinity (2),
+.BR sched_getscheduler (2),
+.BR sched_setaffinity (2),
+.BR sched_setattr (2),
+.BR sched_setscheduler (2),
+.BR setpriority (2),
+.BR capabilities (7),
+.BR sched (7)
diff --git a/man2/sched_setscheduler.2 b/man2/sched_setscheduler.2
new file mode 100644
index 0000000..20ad5c2
--- /dev/null
+++ b/man2/sched_setscheduler.2
@@ -0,0 +1,232 @@
+.\" Copyright (C) 2014 Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\"
+.TH sched_setscheduler 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+sched_setscheduler, sched_getscheduler \-
+set and get scheduling policy/parameters
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sched.h>
+.PP
+.BI "int sched_setscheduler(pid_t " pid ", int " policy ,
+.BI " const struct sched_param *" param );
+.BI "int sched_getscheduler(pid_t " pid );
+.fi
+.SH DESCRIPTION
+The
+.BR sched_setscheduler ()
+system call
+sets both the scheduling policy and parameters for the
+thread whose ID is specified in \fIpid\fP.
+If \fIpid\fP equals zero, the
+scheduling policy and parameters of the calling thread will be set.
+.PP
+The scheduling parameters are specified in the
+.I param
+argument, which is a pointer to a structure of the following form:
+.PP
+.in +4n
+.EX
+struct sched_param {
+ ...
+ int sched_priority;
+ ...
+};
+.EE
+.in
+.PP
+In the current implementation, the structure contains only one field,
+.IR sched_priority .
+The interpretation of
+.I param
+depends on the selected policy.
+.PP
+Currently, Linux supports the following "normal"
+(i.e., non-real-time) scheduling policies as values that may be specified in
+.IR policy :
+.TP 14
+.B SCHED_OTHER
+the standard round-robin time-sharing policy;
+.\" In the 2.6 kernel sources, SCHED_OTHER is actually called
+.\" SCHED_NORMAL.
+.TP
+.B SCHED_BATCH
+for "batch" style execution of processes; and
+.TP
+.B SCHED_IDLE
+for running
+.I very
+low priority background jobs.
+.PP
+For each of the above policies,
+.I param\->sched_priority
+must be 0.
+.PP
+Various "real-time" policies are also supported,
+for special time-critical applications that need precise control over
+the way in which runnable threads are selected for execution.
+For the rules governing when a process may use these policies, see
+.BR sched (7).
+The real-time policies that may be specified in
+.I policy
+are:
+.TP 14
+.B SCHED_FIFO
+a first-in, first-out policy; and
+.TP
+.B SCHED_RR
+a round-robin policy.
+.PP
+For each of the above policies,
+.I param\->sched_priority
+specifies a scheduling priority for the thread.
+This is a number in the range returned by calling
+.BR sched_get_priority_min (2)
+and
+.BR sched_get_priority_max (2)
+with the specified
+.IR policy .
+On Linux, these system calls return, respectively, 1 and 99.
+.PP
+Since Linux 2.6.32, the
+.B SCHED_RESET_ON_FORK
+flag can be ORed in
+.I policy
+when calling
+.BR sched_setscheduler ().
+As a result of including this flag, children created by
+.BR fork (2)
+do not inherit privileged scheduling policies.
+See
+.BR sched (7)
+for details.
+.PP
+.BR sched_getscheduler ()
+returns the current scheduling policy of the thread
+identified by \fIpid\fP.
+If \fIpid\fP equals zero, the policy of the
+calling thread will be retrieved.
+.SH RETURN VALUE
+On success,
+.BR sched_setscheduler ()
+returns zero.
+On success,
+.BR sched_getscheduler ()
+returns the policy for the thread (a nonnegative integer).
+On error, both calls return \-1, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EINVAL
+Invalid arguments:
+.I pid
+is negative or
+.I param
+is NULL.
+.TP
+.B EINVAL
+.RB ( sched_setscheduler ())
+.I policy
+is not one of the recognized policies.
+.TP
+.B EINVAL
+.RB ( sched_setscheduler ())
+.I param
+does not make sense for the specified
+.IR policy .
+.TP
+.B EPERM
+The calling thread does not have appropriate privileges.
+.TP
+.B ESRCH
+The thread whose ID is \fIpid\fP could not be found.
+.SH VERSIONS
+POSIX.1 does not detail the permissions that an unprivileged
+thread requires in order to call
+.BR sched_setscheduler (),
+and details vary across systems.
+For example, the Solaris 7 manual page says that
+the real or effective user ID of the caller must
+match the real user ID or the saved set-user-ID of the target.
+.PP
+The scheduling policy and parameters are in fact per-thread
+attributes on Linux.
+The value returned from a call to
+.BR gettid (2)
+can be passed in the argument
+.IR pid .
+Specifying
+.I pid
+as 0 will operate on the attributes of the calling thread,
+and passing the value returned from a call to
+.BR getpid (2)
+will operate on the attributes of the main thread of the thread group.
+(If you are using the POSIX threads API, then use
+.BR pthread_setschedparam (3),
+.BR pthread_getschedparam (3),
+and
+.BR pthread_setschedprio (3),
+instead of the
+.BR sched_* (2)
+system calls.)
+.SH STANDARDS
+POSIX.1-2008 (but see BUGS below).
+.PP
+.B SCHED_BATCH
+and
+.B SCHED_IDLE
+are Linux-specific.
+.SH HISTORY
+POSIX.1-2001.
+.SH NOTES
+Further details of the semantics of all of the above "normal"
+and "real-time" scheduling policies can be found in the
+.BR sched (7)
+manual page.
+That page also describes an additional policy,
+.BR SCHED_DEADLINE ,
+which is settable only via
+.BR sched_setattr (2).
+.PP
+POSIX systems on which
+.BR sched_setscheduler ()
+and
+.BR sched_getscheduler ()
+are available define
+.B _POSIX_PRIORITY_SCHEDULING
+in \fI<unistd.h>\fP.
+.SH BUGS
+POSIX.1 says that on success,
+.BR sched_setscheduler ()
+should return the previous scheduling policy.
+Linux
+.BR sched_setscheduler ()
+does not conform to this requirement,
+since it always returns 0 on success.
+.SH SEE ALSO
+.ad l
+.nh
+.BR chrt (1),
+.BR nice (2),
+.BR sched_get_priority_max (2),
+.BR sched_get_priority_min (2),
+.BR sched_getaffinity (2),
+.BR sched_getattr (2),
+.BR sched_getparam (2),
+.BR sched_rr_get_interval (2),
+.BR sched_setaffinity (2),
+.BR sched_setattr (2),
+.BR sched_setparam (2),
+.BR sched_yield (2),
+.BR setpriority (2),
+.BR capabilities (7),
+.BR cpuset (7),
+.BR sched (7)
+.ad
diff --git a/man2/sched_yield.2 b/man2/sched_yield.2
new file mode 100644
index 0000000..154fd4f
--- /dev/null
+++ b/man2/sched_yield.2
@@ -0,0 +1,76 @@
+.\" Copyright (C) Tom Bjorkholm & Markus Kuhn, 1996
+.\"
+.\" SPDX-License-Identifier: GPL-2.0-or-later
+.\"
+.\" 1996-04-01 Tom Bjorkholm <tomb@mydata.se>
+.\" First version written
+.\" 1996-04-10 Markus Kuhn <mskuhn@cip.informatik.uni-erlangen.de>
+.\" revision
+.\"
+.TH sched_yield 2 2023-05-03 "Linux man-pages 6.05.01"
+.SH NAME
+sched_yield \- yield the processor
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sched.h>
+.PP
+.B int sched_yield(void);
+.fi
+.SH DESCRIPTION
+.BR sched_yield ()
+causes the calling thread to relinquish the CPU.
+The thread is moved to the end of the queue for its static
+priority and a new thread gets to run.
+.SH RETURN VALUE
+On success,
+.BR sched_yield ()
+returns 0.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+In the Linux implementation,
+.BR sched_yield ()
+always succeeds.
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+POSIX.1-2001 (but optional).
+POSIX.1-2008.
+.PP
+Before POSIX.1-2008,
+systems on which
+.BR sched_yield ()
+is available defined
+.B _POSIX_PRIORITY_SCHEDULING
+in
+.IR <unistd.h> .
+.SH CAVEATS
+.BR sched_yield ()
+is intended for use with real-time scheduling policies (i.e.,
+.B SCHED_FIFO
+or
+.BR SCHED_RR ).
+Use of
+.BR sched_yield ()
+with nondeterministic scheduling policies such as
+.B SCHED_OTHER
+is unspecified and very likely means your application design is broken.
+.PP
+If the calling thread is the only thread in the highest
+priority list at that time,
+it will continue to run after a call to
+.BR sched_yield ().
+.PP
+Avoid calling
+.BR sched_yield ()
+unnecessarily or inappropriately
+(e.g., when resources needed by other
+schedulable threads are still held by the caller),
+since doing so will result in unnecessary context switches,
+which will degrade system performance.
+.SH SEE ALSO
+.BR sched (7)
diff --git a/man2/seccomp.2 b/man2/seccomp.2
new file mode 100644
index 0000000..6b32eec
--- /dev/null
+++ b/man2/seccomp.2
@@ -0,0 +1,1245 @@
+.\" Copyright (C) 2014 Kees Cook <keescook@chromium.org>
+.\" and Copyright (C) 2012 Will Drewry <wad@chromium.org>
+.\" and Copyright (C) 2008, 2014,2017 Michael Kerrisk <mtk.manpages@gmail.com>
+.\" and Copyright (C) 2017 Tyler Hicks <tyhicks@canonical.com>
+.\" and Copyright (C) 2020 Tycho Andersen <tycho@tycho.ws>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.TH seccomp 2 2023-05-03 "Linux man-pages 6.05.01"
+.SH NAME
+seccomp \- operate on Secure Computing state of the process
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#include <linux/seccomp.h>" " /* Definition of " SECCOMP_* " constants */"
+.BR "#include <linux/filter.h>" " /* Definition of " "struct sock_fprog" " */"
+.BR "#include <linux/audit.h>" " /* Definition of " AUDIT_* " constants */"
+.BR "#include <linux/signal.h>" " /* Definition of " SIG* " constants */"
+.BR "#include <sys/ptrace.h>" " /* Definition of " PTRACE_* " constants */"
+.\" Kees Cook noted: Anything that uses SECCOMP_RET_TRACE returns will
+.\" need <sys/ptrace.h>
+.BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */"
+.B #include <unistd.h>
+.PP
+.BI "int syscall(SYS_seccomp, unsigned int " operation ", unsigned int " flags ,
+.BI " void *" args );
+.fi
+.PP
+.IR Note :
+glibc provides no wrapper for
+.BR seccomp (),
+necessitating the use of
+.BR syscall (2).
+.SH DESCRIPTION
+The
+.BR seccomp ()
+system call operates on the Secure Computing (seccomp) state of the
+calling process.
+.PP
+Currently, Linux supports the following
+.I operation
+values:
+.TP
+.B SECCOMP_SET_MODE_STRICT
+The only system calls that the calling thread is permitted to make are
+.BR read (2),
+.BR write (2),
+.BR _exit (2)
+(but not
+.BR exit_group (2)),
+and
+.BR sigreturn (2).
+Other system calls result in the termination of the calling thread,
+or termination of the entire process with the
+.B SIGKILL
+signal when there is only one thread.
+Strict secure computing mode is useful for number-crunching
+applications that may need to execute untrusted byte code, perhaps
+obtained by reading from a pipe or socket.
+.IP
+Note that although the calling thread can no longer call
+.BR sigprocmask (2),
+it can use
+.BR sigreturn (2)
+to block all signals apart from
+.B SIGKILL
+and
+.BR SIGSTOP .
+This means that
+.BR alarm (2)
+(for example) is not sufficient for restricting the process's execution time.
+Instead, to reliably terminate the process,
+.B SIGKILL
+must be used.
+This can be done by using
+.BR timer_create (2)
+with
+.B SIGEV_SIGNAL
+and
+.I sigev_signo
+set to
+.BR SIGKILL ,
+or by using
+.BR setrlimit (2)
+to set the hard limit for
+.BR RLIMIT_CPU .
+.IP
+This operation is available only if the kernel is configured with
+.B CONFIG_SECCOMP
+enabled.
+.IP
+The value of
+.I flags
+must be 0, and
+.I args
+must be NULL.
+.IP
+This operation is functionally identical to the call:
+.IP
+.in +4n
+.EX
+prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT);
+.EE
+.in
+.TP
+.B SECCOMP_SET_MODE_FILTER
+The system calls allowed are defined by a pointer to a Berkeley Packet
+Filter (BPF) passed via
+.IR args .
+This argument is a pointer to a
+.IR "struct\~sock_fprog" ;
+it can be designed to filter arbitrary system calls and system call
+arguments.
+If the filter is invalid,
+.BR seccomp ()
+fails, returning
+.B EINVAL
+in
+.IR errno .
+.IP
+If
+.BR fork (2)
+or
+.BR clone (2)
+is allowed by the filter, any child processes will be constrained to
+the same system call filters as the parent.
+If
+.BR execve (2)
+is allowed,
+the existing filters will be preserved across a call to
+.BR execve (2).
+.IP
+In order to use the
+.B SECCOMP_SET_MODE_FILTER
+operation, either the calling thread must have the
+.B CAP_SYS_ADMIN
+capability in its user namespace, or the thread must already have the
+.I no_new_privs
+bit set.
+If that bit was not already set by an ancestor of this thread,
+the thread must make the following call:
+.IP
+.in +4n
+.EX
+prctl(PR_SET_NO_NEW_PRIVS, 1);
+.EE
+.in
+.IP
+Otherwise, the
+.B SECCOMP_SET_MODE_FILTER
+operation fails and returns
+.B EACCES
+in
+.IR errno .
+This requirement ensures that an unprivileged process cannot apply
+a malicious filter and then invoke a set-user-ID or
+other privileged program using
+.BR execve (2),
+thus potentially compromising that program.
+(Such a malicious filter might, for example, cause an attempt to use
+.BR setuid (2)
+to set the caller's user IDs to nonzero values to instead
+return 0 without actually making the system call.
+Thus, the program might be tricked into retaining superuser privileges
+in circumstances where it is possible to influence it to do
+dangerous things because it did not actually drop privileges.)
+.IP
+If
+.BR prctl (2)
+or
+.BR seccomp ()
+is allowed by the attached filter, further filters may be added.
+This will increase evaluation time, but allows for further reduction of
+the attack surface during execution of a thread.
+.IP
+The
+.B SECCOMP_SET_MODE_FILTER
+operation is available only if the kernel is configured with
+.B CONFIG_SECCOMP_FILTER
+enabled.
+.IP
+When
+.I flags
+is 0, this operation is functionally identical to the call:
+.IP
+.in +4n
+.EX
+prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, args);
+.EE
+.in
+.IP
+The recognized
+.I flags
+are:
+.RS
+.TP
+.BR SECCOMP_FILTER_FLAG_LOG " (since Linux 4.14)"
+.\" commit e66a39977985b1e69e17c4042cb290768eca9b02
+All filter return actions except
+.B SECCOMP_RET_ALLOW
+should be logged.
+An administrator may override this filter flag by preventing specific
+actions from being logged via the
+.I /proc/sys/kernel/seccomp/actions_logged
+file.
+.TP
+.BR SECCOMP_FILTER_FLAG_NEW_LISTENER " (since Linux 5.0)"
+.\" commit 6a21cc50f0c7f87dae5259f6cfefe024412313f6
+After successfully installing the filter program,
+return a new user-space notification file descriptor.
+(The close-on-exec flag is set for the file descriptor.)
+When the filter returns
+.B SECCOMP_RET_USER_NOTIF
+a notification will be sent to this file descriptor.
+.IP
+At most one seccomp filter using the
+.B SECCOMP_FILTER_FLAG_NEW_LISTENER
+flag can be installed for a thread.
+.IP
+See
+.BR seccomp_unotify (2)
+for further details.
+.TP
+.BR SECCOMP_FILTER_FLAG_SPEC_ALLOW " (since Linux 4.17)"
+.\" commit 00a02d0c502a06d15e07b857f8ff921e3e402675
+Disable Speculative Store Bypass mitigation.
+.TP
+.B SECCOMP_FILTER_FLAG_TSYNC
+When adding a new filter, synchronize all other threads of the calling
+process to the same seccomp filter tree.
+A "filter tree" is the ordered list of filters attached to a thread.
+(Attaching identical filters in separate
+.BR seccomp ()
+calls results in different filters from this perspective.)
+.IP
+If any thread cannot synchronize to the same filter tree,
+the call will not attach the new seccomp filter,
+and will fail, returning the first thread ID found that cannot synchronize.
+Synchronization will fail if another thread in the same process is in
+.B SECCOMP_MODE_STRICT
+or if it has attached new seccomp filters to itself,
+diverging from the calling thread's filter tree.
+.RE
+.TP
+.BR SECCOMP_GET_ACTION_AVAIL " (since Linux 4.14)"
+.\" commit d612b1fd8010d0d67b5287fe146b8b55bcbb8655
+Test to see if an action is supported by the kernel.
+This operation is helpful to confirm that the kernel knows
+of a more recently added filter return action
+since the kernel treats all unknown actions as
+.BR SECCOMP_RET_KILL_PROCESS .
+.IP
+The value of
+.I flags
+must be 0, and
+.I args
+must be a pointer to an unsigned 32-bit filter return action.
+.TP
+.BR SECCOMP_GET_NOTIF_SIZES " (since Linux 5.0)"
+.\" commit 6a21cc50f0c7f87dae5259f6cfefe024412313f6
+Get the sizes of the seccomp user-space notification structures.
+Since these structures may evolve and grow over time,
+this command can be used to determine how
+much memory to allocate for sending and receiving notifications.
+.IP
+The value of
+.I flags
+must be 0, and
+.I args
+must be a pointer to a
+.IR "struct seccomp_notif_sizes" ,
+which has the following form:
+.IP
+.EX
+struct seccomp_notif_sizes {
+ __u16 seccomp_notif; /* Size of notification structure */
+ __u16 seccomp_notif_resp; /* Size of response structure */
+ __u16 seccomp_data; /* Size of \[aq]struct seccomp_data\[aq] */
+};
+.EE
+.IP
+See
+.BR seccomp_unotify (2)
+for further details.
+.\"
+.SS Filters
+When adding filters via
+.BR SECCOMP_SET_MODE_FILTER ,
+.I args
+points to a filter program:
+.PP
+.in +4n
+.EX
+struct sock_fprog {
+ unsigned short len; /* Number of BPF instructions */
+ struct sock_filter *filter; /* Pointer to array of
+ BPF instructions */
+};
+.EE
+.in
+.PP
+Each program must contain one or more BPF instructions:
+.PP
+.in +4n
+.EX
+struct sock_filter { /* Filter block */
+ __u16 code; /* Actual filter code */
+ __u8 jt; /* Jump true */
+ __u8 jf; /* Jump false */
+ __u32 k; /* Generic multiuse field */
+};
+.EE
+.in
+.PP
+When executing the instructions, the BPF program operates on the
+system call information made available (i.e., use the
+.B BPF_ABS
+addressing mode) as a (read-only)
+.\" Quoting Kees Cook:
+.\" If BPF even allows changing the data, it's not copied back to
+.\" the syscall when it runs. Anything wanting to do things like
+.\" that would need to use ptrace to catch the call and directly
+.\" modify the registers before continuing with the call.
+buffer of the following form:
+.PP
+.in +4n
+.EX
+struct seccomp_data {
+ int nr; /* System call number */
+ __u32 arch; /* AUDIT_ARCH_* value
+ (see <linux/audit.h>) */
+ __u64 instruction_pointer; /* CPU instruction pointer */
+ __u64 args[6]; /* Up to 6 system call arguments */
+};
+.EE
+.in
+.PP
+Because numbering of system calls varies between architectures and
+some architectures (e.g., x86-64) allow user-space code to use
+the calling conventions of multiple architectures
+(and the convention being used may vary over the life of a process that uses
+.BR execve (2)
+to execute binaries that employ the different conventions),
+it is usually necessary to verify the value of the
+.I arch
+field.
+.PP
+It is strongly recommended to use an allow-list approach whenever
+possible because such an approach is more robust and simple.
+A deny-list will have to be updated whenever a potentially
+dangerous system call is added (or a dangerous flag or option if those
+are deny-listed), and it is often possible to alter the
+representation of a value without altering its meaning, leading to
+a deny-list bypass.
+See also
+.I Caveats
+below.
+.PP
+The
+.I arch
+field is not unique for all calling conventions.
+The x86-64 ABI and the x32 ABI both use
+.B AUDIT_ARCH_X86_64
+as
+.IR arch ,
+and they run on the same processors.
+Instead, the mask
+.B __X32_SYSCALL_BIT
+is used on the system call number to tell the two ABIs apart.
+.\" As noted by Dave Drysdale in a note at the end of
+.\" https://lwn.net/Articles/604515/
+.\" One additional detail to point out for the x32 ABI case:
+.\" the syscall number gets a high bit set (__X32_SYSCALL_BIT),
+.\" to mark it as an x32 call.
+.\"
+.\" If x32 support is included in the kernel, then __SYSCALL_MASK
+.\" will have a value that is not all-ones, and this will trigger
+.\" an extra instruction in system_call to mask off the extra bit,
+.\" so that the syscall table indexing still works.
+.PP
+This means that a policy must either deny all syscalls with
+.B __X32_SYSCALL_BIT
+or it must recognize syscalls with and without
+.B __X32_SYSCALL_BIT
+set.
+A list of system calls to be denied based on
+.I nr
+that does not also contain
+.I nr
+values with
+.B __X32_SYSCALL_BIT
+set can be bypassed by a malicious program that sets
+.BR __X32_SYSCALL_BIT .
+.PP
+Additionally, kernels prior to Linux 5.4 incorrectly permitted
+.I nr
+in the ranges 512-547 as well as the corresponding non-x32 syscalls ORed
+with
+.BR __X32_SYSCALL_BIT .
+For example,
+.I nr
+== 521 and
+.I nr
+== (101 |
+.BR __X32_SYSCALL_BIT )
+would result in invocations of
+.BR ptrace (2)
+with potentially confused x32-vs-x86_64 semantics in the kernel.
+Policies intended to work on kernels before Linux 5.4 must ensure that they
+deny or otherwise correctly handle these system calls.
+On Linux 5.4 and newer,
+.\" commit 6365b842aae4490ebfafadfc6bb27a6d3cc54757
+such system calls will fail with the error
+.BR ENOSYS ,
+without doing anything.
+.PP
+The
+.I instruction_pointer
+field provides the address of the machine-language instruction that
+performed the system call.
+This might be useful in conjunction with the use of
+.IR /proc/ pid /maps
+to perform checks based on which region (mapping) of the program
+made the system call.
+(Probably, it is wise to lock down the
+.BR mmap (2)
+and
+.BR mprotect (2)
+system calls to prevent the program from subverting such checks.)
+.PP
+When checking values from
+.IR args ,
+keep in mind that arguments are often
+silently truncated before being processed, but after the seccomp check.
+For example, this happens if the i386 ABI is used on an
+x86-64 kernel: although the kernel will normally not look beyond
+the 32 lowest bits of the arguments, the values of the full
+64-bit registers will be present in the seccomp data.
+A less surprising example is that if the x86-64 ABI is used to perform
+a system call that takes an argument of type
+.IR int ,
+the more-significant half of the argument register is ignored by
+the system call, but visible in the seccomp data.
+.PP
+A seccomp filter returns a 32-bit value consisting of two parts:
+the most significant 16 bits
+(corresponding to the mask defined by the constant
+.BR SECCOMP_RET_ACTION_FULL )
+contain one of the "action" values listed below;
+the least significant 16 bits (defined by the constant
+.BR SECCOMP_RET_DATA )
+are "data" to be associated with this return value.
+.PP
+If multiple filters exist, they are \fIall\fP executed,
+in reverse order of their addition to the filter tree\[em]that is,
+the most recently installed filter is executed first.
+(Note that all filters will be called
+even if one of the earlier filters returns
+.BR SECCOMP_RET_KILL .
+This is done to simplify the kernel code and to provide a
+tiny speed-up in the execution of sets of filters by
+avoiding a check for this uncommon case.)
+.\" From an Aug 2015 conversation with Kees Cook where I asked why *all*
+.\" filters are applied even if one of the early filters returns
+.\" SECCOMP_RET_KILL:
+.\"
+.\" It's just because it would be an optimization that would only speed up
+.\" the RET_KILL case, but it's the uncommon one and the one that doesn't
+.\" benefit meaningfully from such a change (you need to kill the process
+.\" really quickly?). We would speed up killing a program at the (albeit
+.\" tiny) expense to all other filtered programs. Best to keep the filter
+.\" execution logic clear, simple, and as fast as possible for all
+.\" filters.
+The return value for the evaluation of a given system call is the first-seen
+action value of highest precedence (along with its accompanying data)
+returned by execution of all of the filters.
+.PP
+In decreasing order of precedence,
+the action values that may be returned by a seccomp filter are:
+.TP
+.BR SECCOMP_RET_KILL_PROCESS " (since Linux 4.14)"
+.\" commit 4d3b0b05aae9ee9ce0970dc4cc0fb3fad5e85945
+.\" commit 0466bdb99e8744bc9befa8d62a317f0fd7fd7421
+This value results in immediate termination of the process,
+with a core dump.
+The system call is not executed.
+By contrast with
+.B SECCOMP_RET_KILL_THREAD
+below, all threads in the thread group are terminated.
+(For a discussion of thread groups, see the description of the
+.B CLONE_THREAD
+flag in
+.BR clone (2).)
+.IP
+The process terminates
+.I "as though"
+killed by a
+.B SIGSYS
+signal.
+Even if a signal handler has been registered for
+.BR SIGSYS ,
+the handler will be ignored in this case and the process always terminates.
+To a parent process that is waiting on this process (using
+.BR waitpid (2)
+or similar), the returned
+.I wstatus
+will indicate that its child was terminated as though by a
+.B SIGSYS
+signal.
+.TP
+.BR SECCOMP_RET_KILL_THREAD " (or " SECCOMP_RET_KILL )
+This value results in immediate termination of the thread
+that made the system call.
+The system call is not executed.
+Other threads in the same thread group will continue to execute.
+.IP
+The thread terminates
+.I "as though"
+killed by a
+.B SIGSYS
+signal.
+See
+.B SECCOMP_RET_KILL_PROCESS
+above.
+.IP
+.\" See these commits:
+.\" seccomp: dump core when using SECCOMP_RET_KILL
+.\" (b25e67161c295c98acda92123b2dd1e7d8642901)
+.\" seccomp: Only dump core when single-threaded
+.\" (d7276e321ff8a53106a59c85ca46d03e34288893)
+Before Linux 4.11,
+any process terminated in this way would not trigger a coredump
+(even though
+.B SIGSYS
+is documented in
+.BR signal (7)
+as having a default action of termination with a core dump).
+Since Linux 4.11,
+a single-threaded process will dump core if terminated in this way.
+.IP
+With the addition of
+.B SECCOMP_RET_KILL_PROCESS
+in Linux 4.14,
+.B SECCOMP_RET_KILL_THREAD
+was added as a synonym for
+.BR SECCOMP_RET_KILL ,
+in order to more clearly distinguish the two actions.
+.IP
+.BR Note :
+the use of
+.B SECCOMP_RET_KILL_THREAD
+to kill a single thread in a multithreaded process is likely to leave the
+process in a permanently inconsistent and possibly corrupt state.
+.TP
+.B SECCOMP_RET_TRAP
+This value results in the kernel sending a thread-directed
+.B SIGSYS
+signal to the triggering thread.
+(The system call is not executed.)
+Various fields will be set in the
+.I siginfo_t
+structure (see
+.BR sigaction (2))
+associated with the signal:
+.RS
+.IP \[bu] 3
+.I si_signo
+will contain
+.BR SIGSYS .
+.IP \[bu]
+.I si_call_addr
+will show the address of the system call instruction.
+.IP \[bu]
+.I si_syscall
+and
+.I si_arch
+will indicate which system call was attempted.
+.IP \[bu]
+.I si_code
+will contain
+.BR SYS_SECCOMP .
+.IP \[bu]
+.I si_errno
+will contain the
+.B SECCOMP_RET_DATA
+portion of the filter return value.
+.RE
+.IP
+The program counter will be as though the system call happened
+(i.e., the program counter will not point to the system call instruction).
+The return value register will contain an architecture\-dependent value;
+if resuming execution, set it to something appropriate for the system call.
+(The architecture dependency is because replacing it with
+.B ENOSYS
+could overwrite some useful information.)
+.TP
+.B SECCOMP_RET_ERRNO
+This value results in the
+.B SECCOMP_RET_DATA
+portion of the filter's return value being passed to user space as the
+.I errno
+value without executing the system call.
+.TP
+.BR SECCOMP_RET_USER_NOTIF " (since Linux 5.0)"
+.\" commit 6a21cc50f0c7f87dae5259f6cfefe024412313f6
+Forward the system call to an attached user-space supervisor
+process to allow that process to decide what to do with the system call.
+If there is no attached supervisor (either
+because the filter was not installed with the
+.B SECCOMP_FILTER_FLAG_NEW_LISTENER
+flag or because the file descriptor was closed), the filter returns
+.B ENOSYS
+(similar to what happens when a filter returns
+.B SECCOMP_RET_TRACE
+and there is no tracer).
+See
+.BR seccomp_unotify (2)
+for further details.
+.IP
+Note that the supervisor process will not be notified
+if another filter returns an action value with a precedence greater than
+.BR SECCOMP_RET_USER_NOTIF .
+.TP
+.B SECCOMP_RET_TRACE
+When returned, this value will cause the kernel to attempt to notify a
+.BR ptrace (2)-based
+tracer prior to executing the system call.
+If there is no tracer present,
+the system call is not executed and returns a failure status with
+.I errno
+set to
+.BR ENOSYS .
+.IP
+A tracer will be notified if it requests
+.B PTRACE_O_TRACESECCOMP
+using
+.IR ptrace(PTRACE_SETOPTIONS) .
+The tracer will be notified of a
+.B PTRACE_EVENT_SECCOMP
+and the
+.B SECCOMP_RET_DATA
+portion of the filter's return value will be available to the tracer via
+.BR PTRACE_GETEVENTMSG .
+.IP
+The tracer can skip the system call by changing the system call number
+to \-1.
+Alternatively, the tracer can change the system call
+requested by changing the system call to a valid system call number.
+If the tracer asks to skip the system call, then the system call will
+appear to return the value that the tracer puts in the return value register.
+.IP
+.\" This was changed in ce6526e8afa4.
+.\" A related hole, using PTRACE_SYSCALL instead of SECCOMP_RET_TRACE, was
+.\" changed in arch-specific commits, e.g. 93e35efb8de4 for X86 and
+.\" 0f3912fd934c for ARM.
+Before Linux 4.8, the seccomp check will not be run again after the tracer is
+notified.
+(This means that, on older kernels, seccomp-based sandboxes
+.B "must not"
+allow use of
+.BR ptrace (2)\[em]even
+of other
+sandboxed processes\[em]without extreme care;
+ptracers can use this mechanism to escape from the seccomp sandbox.)
+.IP
+Note that a tracer process will not be notified
+if another filter returns an action value with a precedence greater than
+.BR SECCOMP_RET_TRACE .
+.TP
+.BR SECCOMP_RET_LOG " (since Linux 4.14)"
+.\" commit 59f5cf44a38284eb9e76270c786fb6cc62ef8ac4
+This value results in the system call being executed after
+the filter return action is logged.
+An administrator may override the logging of this action via
+the
+.I /proc/sys/kernel/seccomp/actions_logged
+file.
+.TP
+.B SECCOMP_RET_ALLOW
+This value results in the system call being executed.
+.PP
+If an action value other than one of the above is specified,
+then the filter action is treated as either
+.B SECCOMP_RET_KILL_PROCESS
+(since Linux 4.14)
+.\" commit 4d3b0b05aae9ee9ce0970dc4cc0fb3fad5e85945
+or
+.B SECCOMP_RET_KILL_THREAD
+(in Linux 4.13 and earlier).
+.\"
+.SS /proc interfaces
+The files in the directory
+.I /proc/sys/kernel/seccomp
+provide additional seccomp information and configuration:
+.TP
+.IR actions_avail " (since Linux 4.14)"
+.\" commit 8e5f1ad116df6b0de65eac458d5e7c318d1c05af
+A read-only ordered list of seccomp filter return actions in string form.
+The ordering, from left-to-right, is in decreasing order of precedence.
+The list represents the set of seccomp filter return actions
+supported by the kernel.
+.TP
+.IR actions_logged " (since Linux 4.14)"
+.\" commit 0ddec0fc8900201c0897b87b762b7c420436662f
+A read-write ordered list of seccomp filter return actions that
+are allowed to be logged.
+Writes to the file do not need to be in ordered form but reads from
+the file will be ordered in the same way as the
+.I actions_avail
+file.
+.IP
+It is important to note that the value of
+.I actions_logged
+does not prevent certain filter return actions from being logged when
+the audit subsystem is configured to audit a task.
+If the action is not found in the
+.I actions_logged
+file, the final decision on whether to audit the action for that task is
+ultimately left up to the audit subsystem to decide for all filter return
+actions other than
+.BR SECCOMP_RET_ALLOW .
+.IP
+The "allow" string is not accepted in the
+.I actions_logged
+file as it is not possible to log
+.B SECCOMP_RET_ALLOW
+actions.
+Attempting to write "allow" to the file will fail with the error
+.BR EINVAL .
+.\"
+.SS Audit logging of seccomp actions
+.\" commit 59f5cf44a38284eb9e76270c786fb6cc62ef8ac4
+Since Linux 4.14, the kernel provides the facility to log the
+actions returned by seccomp filters in the audit log.
+The kernel makes the decision to log an action based on
+the action type, whether or not the action is present in the
+.I actions_logged
+file, and whether kernel auditing is enabled
+(e.g., via the kernel boot option
+.IR audit=1 ).
+.\" or auditing could be enabled via the netlink API (AUDIT_SET)
+The rules are as follows:
+.IP \[bu] 3
+If the action is
+.BR SECCOMP_RET_ALLOW ,
+the action is not logged.
+.IP \[bu]
+Otherwise, if the action is either
+.B SECCOMP_RET_KILL_PROCESS
+or
+.BR SECCOMP_RET_KILL_THREAD ,
+and that action appears in the
+.I actions_logged
+file, the action is logged.
+.IP \[bu]
+Otherwise, if the filter has requested logging (the
+.B SECCOMP_FILTER_FLAG_LOG
+flag)
+and the action appears in the
+.I actions_logged
+file, the action is logged.
+.IP \[bu]
+Otherwise, if kernel auditing is enabled and the process is being audited
+.RB ( autrace (8)),
+the action is logged.
+.IP \[bu]
+Otherwise, the action is not logged.
+.SH RETURN VALUE
+On success,
+.BR seccomp ()
+returns 0.
+On error, if
+.B SECCOMP_FILTER_FLAG_TSYNC
+was used,
+the return value is the ID of the thread
+that caused the synchronization failure.
+(This ID is a kernel thread ID of the type returned by
+.BR clone (2)
+and
+.BR gettid (2).)
+On other errors, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.BR seccomp ()
+can fail for the following reasons:
+.TP
+.B EACCES
+The caller did not have the
+.B CAP_SYS_ADMIN
+capability in its user namespace, or had not set
+.I no_new_privs
+before using
+.BR SECCOMP_SET_MODE_FILTER .
+.TP
+.B EBUSY
+While installing a new filter, the
+.B SECCOMP_FILTER_FLAG_NEW_LISTENER
+flag was specified,
+but a previous filter had already been installed with that flag.
+.TP
+.B EFAULT
+.I args
+was not a valid address.
+.TP
+.B EINVAL
+.I operation
+is unknown or is not supported by this kernel version or configuration.
+.TP
+.B EINVAL
+The specified
+.I flags
+are invalid for the given
+.IR operation .
+.TP
+.B EINVAL
+The filter program
+included
+.BR BPF_ABS ,
+but the specified offset was not aligned to a 32-bit boundary or exceeded
+.IR "sizeof(struct\~seccomp_data)" .
+.TP
+.B EINVAL
+.\" See kernel/seccomp.c::seccomp_may_assign_mode() in Linux 3.18 sources
+A secure computing mode has already been set, and
+.I operation
+differs from the existing setting.
+.TP
+.B EINVAL
+.I operation
+specified
+.BR SECCOMP_SET_MODE_FILTER ,
+but the filter program pointed to by
+.I args
+was not valid or the length of the filter program was zero or exceeded
+.B BPF_MAXINSNS
+(4096) instructions.
+.TP
+.B ENOMEM
+Out of memory.
+.TP
+.B ENOMEM
+.\" ENOMEM in kernel/seccomp.c::seccomp_attach_filter() in Linux 3.18 sources
+The total length of all filter programs attached
+to the calling thread would exceed
+.B MAX_INSNS_PER_PATH
+(32768) instructions.
+Note that for the purposes of calculating this limit,
+each already existing filter program incurs an
+overhead penalty of 4 instructions.
+.TP
+.B EOPNOTSUPP
+.I operation
+specified
+.BR SECCOMP_GET_ACTION_AVAIL ,
+but the kernel does not support the filter return action specified by
+.IR args .
+.TP
+.B ESRCH
+Another thread caused a failure during thread sync, but its ID could not
+be determined.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 3.17.
+.\" FIXME . Add glibc version
+.SH NOTES
+Rather than hand-coding seccomp filters as shown in the example below,
+you may prefer to employ the
+.I libseccomp
+library, which provides a front-end for generating seccomp filters.
+.PP
+The
+.I Seccomp
+field of the
+.IR /proc/ pid /status
+file provides a method of viewing the seccomp mode of a process; see
+.BR proc (5).
+.PP
+.BR seccomp ()
+provides a superset of the functionality provided by the
+.BR prctl (2)
+.B PR_SET_SECCOMP
+operation (which does not support
+.IR flags ).
+.PP
+Since Linux 4.4, the
+.BR ptrace (2)
+.B PTRACE_SECCOMP_GET_FILTER
+operation can be used to dump a process's seccomp filters.
+.\"
+.SS Architecture support for seccomp BPF
+Architecture support for seccomp BPF filtering
+.\" Check by grepping for HAVE_ARCH_SECCOMP_FILTER in Kconfig files in
+.\" kernel source. Last checked in Linux 4.16-rc source.
+is available on the following architectures:
+.IP \[bu] 3
+x86-64, i386, x32 (since Linux 3.5)
+.PD 0
+.IP \[bu]
+ARM (since Linux 3.8)
+.IP \[bu]
+s390 (since Linux 3.8)
+.IP \[bu]
+MIPS (since Linux 3.16)
+.IP \[bu]
+ARM-64 (since Linux 3.19)
+.IP \[bu]
+PowerPC (since Linux 4.3)
+.IP \[bu]
+Tile (since Linux 4.3)
+.IP \[bu]
+PA-RISC (since Linux 4.6)
+.\" User mode Linux since Linux 4.6
+.PD
+.\"
+.SS Caveats
+There are various subtleties to consider when applying seccomp filters
+to a program, including the following:
+.IP \[bu] 3
+Some traditional system calls have user-space implementations in the
+.BR vdso (7)
+on many architectures.
+Notable examples include
+.BR clock_gettime (2),
+.BR gettimeofday (2),
+and
+.BR time (2).
+On such architectures,
+seccomp filtering for these system calls will have no effect.
+(However, there are cases where the
+.BR vdso (7)
+implementations may fall back to invoking the true system call,
+in which case seccomp filters would see the system call.)
+.IP \[bu]
+Seccomp filtering is based on system call numbers.
+However, applications typically do not directly invoke system calls,
+but instead call wrapper functions in the C library which
+in turn invoke the system calls.
+Consequently, one must be aware of the following:
+.RS
+.IP \[bu] 3
+The glibc wrappers for some traditional system calls may actually
+employ system calls with different names in the kernel.
+For example, the
+.BR exit (2)
+wrapper function actually employs the
+.BR exit_group (2)
+system call, and the
+.BR fork (2)
+wrapper function actually calls
+.BR clone (2).
+.IP \[bu]
+The behavior of wrapper functions may vary across architectures,
+according to the range of system calls provided on those architectures.
+In other words, the same wrapper function may invoke
+different system calls on different architectures.
+.IP \[bu]
+Finally, the behavior of wrapper functions can change across glibc versions.
+For example, in older versions, the glibc wrapper function for
+.BR open (2)
+invoked the system call of the same name,
+but starting in glibc 2.26, the implementation switched to calling
+.BR openat (2)
+on all architectures.
+.RE
+.PP
+The consequence of the above points is that it may be necessary
+to filter for a system call other than might be expected.
+Various manual pages in Section 2 provide helpful details
+about the differences between wrapper functions and
+the underlying system calls in subsections entitled
+.IR "C library/kernel differences" .
+.PP
+Furthermore, note that the application of seccomp filters
+even risks causing bugs in an application,
+when the filters cause unexpected failures for legitimate operations
+that the application might need to perform.
+Such bugs may not easily be discovered when testing the seccomp
+filters if the bugs occur in rarely used application code paths.
+.\"
+.SS Seccomp-specific BPF details
+Note the following BPF details specific to seccomp filters:
+.IP \[bu] 3
+The
+.B BPF_H
+and
+.B BPF_B
+size modifiers are not supported: all operations must load and store
+(4-byte) words
+.RB ( BPF_W ).
+.IP \[bu]
+To access the contents of the
+.I seccomp_data
+buffer, use the
+.B BPF_ABS
+addressing mode modifier.
+.IP \[bu]
+The
+.B BPF_LEN
+addressing mode modifier yields an immediate mode operand
+whose value is the size of the
+.I seccomp_data
+buffer.
+.SH EXAMPLES
+The program below accepts four or more arguments.
+The first three arguments are a system call number,
+a numeric architecture identifier, and an error number.
+The program uses these values to construct a BPF filter
+that is used at run time to perform the following checks:
+.IP \[bu] 3
+If the program is not running on the specified architecture,
+the BPF filter causes system calls to fail with the error
+.BR ENOSYS .
+.IP \[bu]
+If the program attempts to execute the system call with the specified number,
+the BPF filter causes the system call to fail, with
+.I errno
+being set to the specified error number.
+.PP
+The remaining command-line arguments specify
+the pathname and additional arguments of a program
+that the example program should attempt to execute using
+.BR execv (3)
+(a library function that employs the
+.BR execve (2)
+system call).
+Some example runs of the program are shown below.
+.PP
+First, we display the architecture that we are running on (x86-64)
+and then construct a shell function that looks up system call
+numbers on this architecture:
+.PP
+.in +4n
+.EX
+$ \fBuname \-m\fP
+x86_64
+$ \fBsyscall_nr() {
+ cat /usr/src/linux/arch/x86/syscalls/syscall_64.tbl | \e
+ awk \[aq]$2 != "x32" && $3 == "\[aq]$1\[aq]" { print $1 }\[aq]
+}\fP
+.EE
+.in
+.PP
+When the BPF filter rejects a system call (the second case above),
+it causes the system call to fail with the error number
+specified on the command line.
+In the experiments shown here, we'll use error number 99:
+.PP
+.in +4n
+.EX
+$ \fBerrno 99\fP
+EADDRNOTAVAIL 99 Cannot assign requested address
+.EE
+.in
+.PP
+In the following example, we attempt to run the command
+.BR whoami (1),
+but the BPF filter rejects the
+.BR execve (2)
+system call, so that the command is not even executed:
+.PP
+.in +4n
+.EX
+$ \fBsyscall_nr execve\fP
+59
+$ \fB./a.out\fP
+Usage: ./a.out <syscall_nr> <arch> <errno> <prog> [<args>]
+Hint for <arch>: AUDIT_ARCH_I386: 0x40000003
+ AUDIT_ARCH_X86_64: 0xC000003E
+$ \fB./a.out 59 0xC000003E 99 /bin/whoami\fP
+execv: Cannot assign requested address
+.EE
+.in
+.PP
+In the next example, the BPF filter rejects the
+.BR write (2)
+system call, so that, although it is successfully started, the
+.BR whoami (1)
+command is not able to write output:
+.PP
+.in +4n
+.EX
+$ \fBsyscall_nr write\fP
+1
+$ \fB./a.out 1 0xC000003E 99 /bin/whoami\fP
+.EE
+.in
+.PP
+In the final example,
+the BPF filter rejects a system call that is not used by the
+.BR whoami (1)
+command, so it is able to successfully execute and produce output:
+.PP
+.in +4n
+.EX
+$ \fBsyscall_nr preadv\fP
+295
+$ \fB./a.out 295 0xC000003E 99 /bin/whoami\fP
+cecilia
+.EE
+.in
+.SS Program source
+.\" SRC BEGIN (seccomp.c)
+.EX
+#include <linux/audit.h>
+#include <linux/filter.h>
+#include <linux/seccomp.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/prctl.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+\&
+#define X32_SYSCALL_BIT 0x40000000
+#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
+\&
+static int
+install_filter(int syscall_nr, unsigned int t_arch, int f_errno)
+{
+ unsigned int upper_nr_limit = 0xffffffff;
+\&
+ /* Assume that AUDIT_ARCH_X86_64 means the normal x86\-64 ABI
+ (in the x32 ABI, all system calls have bit 30 set in the
+ \[aq]nr\[aq] field, meaning the numbers are >= X32_SYSCALL_BIT). */
+ if (t_arch == AUDIT_ARCH_X86_64)
+ upper_nr_limit = X32_SYSCALL_BIT \- 1;
+\&
+ struct sock_filter filter[] = {
+ /* [0] Load architecture from \[aq]seccomp_data\[aq] buffer into
+ accumulator. */
+ BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
+ (offsetof(struct seccomp_data, arch))),
+\&
+ /* [1] Jump forward 5 instructions if architecture does not
+ match \[aq]t_arch\[aq]. */
+ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, t_arch, 0, 5),
+\&
+ /* [2] Load system call number from \[aq]seccomp_data\[aq] buffer into
+ accumulator. */
+ BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
+ (offsetof(struct seccomp_data, nr))),
+\&
+ /* [3] Check ABI \- only needed for x86\-64 in deny\-list use
+ cases. Use BPF_JGT instead of checking against the bit
+ mask to avoid having to reload the syscall number. */
+ BPF_JUMP(BPF_JMP | BPF_JGT | BPF_K, upper_nr_limit, 3, 0),
+\&
+ /* [4] Jump forward 1 instruction if system call number
+ does not match \[aq]syscall_nr\[aq]. */
+ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, syscall_nr, 0, 1),
+\&
+ /* [5] Matching architecture and system call: don\[aq]t execute
+ the system call, and return \[aq]f_errno\[aq] in \[aq]errno\[aq]. */
+ BPF_STMT(BPF_RET | BPF_K,
+ SECCOMP_RET_ERRNO | (f_errno & SECCOMP_RET_DATA)),
+\&
+ /* [6] Destination of system call number mismatch: allow other
+ system calls. */
+ BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
+\&
+ /* [7] Destination of architecture mismatch: kill process. */
+ BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS),
+ };
+\&
+ struct sock_fprog prog = {
+ .len = ARRAY_SIZE(filter),
+ .filter = filter,
+ };
+\&
+ if (syscall(SYS_seccomp, SECCOMP_SET_MODE_FILTER, 0, &prog)) {
+ perror("seccomp");
+ return 1;
+ }
+\&
+ return 0;
+}
+\&
+int
+main(int argc, char *argv[])
+{
+ if (argc < 5) {
+ fprintf(stderr, "Usage: "
+ "%s <syscall_nr> <arch> <errno> <prog> [<args>]\en"
+ "Hint for <arch>: AUDIT_ARCH_I386: 0x%X\en"
+ " AUDIT_ARCH_X86_64: 0x%X\en"
+ "\en", argv[0], AUDIT_ARCH_I386, AUDIT_ARCH_X86_64);
+ exit(EXIT_FAILURE);
+ }
+\&
+ if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
+ perror("prctl");
+ exit(EXIT_FAILURE);
+ }
+\&
+ if (install_filter(strtol(argv[1], NULL, 0),
+ strtoul(argv[2], NULL, 0),
+ strtol(argv[3], NULL, 0)))
+ exit(EXIT_FAILURE);
+\&
+ execv(argv[4], &argv[4]);
+ perror("execv");
+ exit(EXIT_FAILURE);
+}
+.EE
+.\" SRC END
+.SH SEE ALSO
+.BR bpfc (1),
+.BR strace (1),
+.BR bpf (2),
+.BR prctl (2),
+.BR ptrace (2),
+.BR seccomp_unotify (2),
+.BR sigaction (2),
+.BR proc (5),
+.BR signal (7),
+.BR socket (7)
+.PP
+Various pages from the
+.I libseccomp
+library, including:
+.BR scmp_sys_resolver (1),
+.BR seccomp_export_bpf (3),
+.BR seccomp_init (3),
+.BR seccomp_load (3),
+and
+.BR seccomp_rule_add (3).
+.PP
+The kernel source files
+.I Documentation/networking/filter.txt
+and
+.I Documentation/userspace\-api/seccomp_filter.rst
+.\" commit c061f33f35be0ccc80f4b8e0aea5dfd2ed7e01a3
+(or
+.I Documentation/prctl/seccomp_filter.txt
+before Linux 4.13).
+.PP
+McCanne, S.\& and Jacobson, V.\& (1992)
+.IR "The BSD Packet Filter: A New Architecture for User-level Packet Capture" ,
+Proceedings of the USENIX Winter 1993 Conference
+.UR http://www.tcpdump.org/papers/bpf\-usenix93.pdf
+.UE
diff --git a/man2/seccomp_unotify.2 b/man2/seccomp_unotify.2
new file mode 100644
index 0000000..156fbce
--- /dev/null
+++ b/man2/seccomp_unotify.2
@@ -0,0 +1,2011 @@
+.\" Copyright (C) 2020 Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.TH seccomp_unotify 2 2023-05-03 "Linux man-pages 6.05.01"
+.SH NAME
+seccomp_unotify \- Seccomp user-space notification mechanism
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <linux/seccomp.h>
+.B #include <linux/filter.h>
+.B #include <linux/audit.h>
+.PP
+.BI "int seccomp(unsigned int " operation ", unsigned int " flags \
+", void *" args );
+.PP
+.B #include <sys/ioctl.h>
+.PP
+.BI "int ioctl(int " fd ", SECCOMP_IOCTL_NOTIF_RECV,"
+.BI " struct seccomp_notif *" req );
+.BI "int ioctl(int " fd ", SECCOMP_IOCTL_NOTIF_SEND,"
+.BI " struct seccomp_notif_resp *" resp );
+.BI "int ioctl(int " fd ", SECCOMP_IOCTL_NOTIF_ID_VALID, __u64 *" id );
+.BI "int ioctl(int " fd ", SECCOMP_IOCTL_NOTIF_ADDFD,"
+.BI " struct seccomp_notif_addfd *" addfd );
+.fi
+.SH DESCRIPTION
+This page describes the user-space notification mechanism provided by the
+Secure Computing (seccomp) facility.
+As well as the use of the
+.B SECCOMP_FILTER_FLAG_NEW_LISTENER
+flag, the
+.B SECCOMP_RET_USER_NOTIF
+action value, and the
+.B SECCOMP_GET_NOTIF_SIZES
+operation described in
+.BR seccomp (2),
+this mechanism involves the use of a number of related
+.BR ioctl (2)
+operations (described below).
+.\"
+.SS Overview
+In conventional usage of a seccomp filter,
+the decision about how to treat a system call is made by the filter itself.
+By contrast, the user-space notification mechanism allows
+the seccomp filter to delegate
+the handling of the system call to another user-space process.
+Note that this mechanism is explicitly
+.B not
+intended as a method for implementing security policy; see NOTES.
+.PP
+In the discussion that follows,
+the thread(s) on which the seccomp filter is installed is (are)
+referred to as the
+.IR target ,
+and the process that is notified by the user-space notification
+mechanism is referred to as the
+.IR supervisor .
+.PP
+A suitably privileged supervisor can use the user-space notification
+mechanism to perform actions on behalf of the target.
+The advantage of the user-space notification mechanism is that
+the supervisor will
+usually be able to retrieve information about the target and the
+performed system call that the seccomp filter itself cannot.
+(A seccomp filter is limited in the information it can obtain and
+the actions that it can perform because it
+is running on a virtual machine inside the kernel.)
+.PP
+An overview of the steps performed by the target and the supervisor
+is as follows:
+.\"-------------------------------------
+.IP (1) 5
+The target establishes a seccomp filter in the usual manner,
+but with two differences:
+.RS
+.IP \[bu] 3
+The
+.BR seccomp (2)
+.I flags
+argument includes the flag
+.BR SECCOMP_FILTER_FLAG_NEW_LISTENER .
+Consequently, the return value of the (successful)
+.BR seccomp (2)
+call is a new "listening"
+file descriptor that can be used to receive notifications.
+Only one "listening" seccomp filter can be installed for a thread.
+.\" FIXME
+.\" Is the last sentence above correct?
+.\"
+.\" Kees Cook (25 Oct 2020) notes:
+.\"
+.\" I like this limitation, but I expect that it'll need to change in the
+.\" future. Even with LSMs, we see the need for arbitrary stacking, and the
+.\" idea of there being only 1 supervisor will eventually break down. Right
+.\" now there is only 1 because only container managers are using this
+.\" feature. But if some daemon starts using it to isolate some thread,
+.\" suddenly it might break if a container manager is trying to listen to it
+.\" too, etc. I expect it won't be needed soon, but I do think it'll change.
+.\"
+.IP \[bu]
+In cases where it is appropriate, the seccomp filter returns the action value
+.BR SECCOMP_RET_USER_NOTIF .
+This return value will trigger a notification event.
+.RE
+.\"-------------------------------------
+.IP (2)
+In order that the supervisor can obtain notifications
+using the listening file descriptor,
+(a duplicate of) that file descriptor must be passed from
+the target to the supervisor.
+One way in which this could be done is by passing the file descriptor
+over a UNIX domain socket connection between the target and the supervisor
+(using the
+.B SCM_RIGHTS
+ancillary message type described in
+.BR unix (7)).
+Another way to do this is through the use of
+.BR pidfd_getfd (2).
+.\" Jann Horn:
+.\" Instead of using unix domain sockets to send the fd to the
+.\" parent, I think you could also use clone3() with
+.\" flags==CLONE_FILES|SIGCHLD, dup2() the seccomp fd to an fd
+.\" that was reserved in the parent, call unshare(CLONE_FILES)
+.\" in the child after setting up the seccomp fd, and wake
+.\" up the parent with something like pthread_cond_signal()?
+.\" I'm not sure whether that'd look better or worse in the
+.\" end though, so maybe just ignore this comment.
+.\"-------------------------------------
+.IP (3)
+The supervisor will receive notification events
+on the listening file descriptor.
+These events are returned as structures of type
+.IR seccomp_notif .
+Because this structure and its size may evolve over kernel versions,
+the supervisor must first determine the size of this structure
+using the
+.BR seccomp (2)
+.B SECCOMP_GET_NOTIF_SIZES
+operation, which returns a structure of type
+.IR seccomp_notif_sizes .
+The supervisor allocates a buffer of size
+.I seccomp_notif_sizes.seccomp_notif
+bytes to receive notification events.
+In addition, the supervisor allocates another buffer of size
+.I seccomp_notif_sizes.seccomp_notif_resp
+bytes for the response (a
+.I struct seccomp_notif_resp
+structure)
+that it will provide to the kernel (and thus the target).
+.\"-------------------------------------
+.IP (4)
+The target then performs its workload,
+which includes system calls that will be controlled by the seccomp filter.
+Whenever one of these system calls causes the filter to return the
+.B SECCOMP_RET_USER_NOTIF
+action value, the kernel does
+.I not
+(yet) execute the system call;
+instead, execution of the target is temporarily blocked inside
+the kernel (in a sleep state that is interruptible by signals)
+and a notification event is generated on the listening file descriptor.
+.\"-------------------------------------
+.IP (5)
+The supervisor can now repeatedly monitor the
+listening file descriptor for
+.BR SECCOMP_RET_USER_NOTIF -triggered
+events.
+To do this, the supervisor uses the
+.B SECCOMP_IOCTL_NOTIF_RECV
+.BR ioctl (2)
+operation to read information about a notification event;
+this operation blocks until an event is available.
+The operation returns a
+.I seccomp_notif
+structure containing information about the system call
+that is being attempted by the target.
+(As described in NOTES,
+the file descriptor can also be monitored with
+.BR select (2),
+.BR poll (2),
+or
+.BR epoll (7).)
+.\" FIXME
+.\" Christian Brauner:
+.\"
+.\" Do we support O_NONBLOCK with SECCOMP_IOCTL_NOTIF_RECV and if
+.\" not should we?
+.\"
+.\" Michael Kerrisk:
+.\"
+.\" A quick test suggests that O_NONBLOCK has no effect on the blocking
+.\" behavior of SECCOMP_IOCTL_NOTIF_RECV.
+.
+.\"-------------------------------------
+.IP (6)
+The
+.I seccomp_notif
+structure returned by the
+.B SECCOMP_IOCTL_NOTIF_RECV
+operation includes the same information (a
+.I seccomp_data
+structure) that was passed to the seccomp filter.
+This information allows the supervisor to discover the system call number and
+the arguments for the target's system call.
+In addition, the notification event contains the ID of the thread
+that triggered the notification and a unique cookie value that
+is used in subsequent
+.B SECCOMP_IOCTL_NOTIF_ID_VALID
+and
+.B SECCOMP_IOCTL_NOTIF_SEND
+operations.
+.IP
+The information in the notification can be used to discover the
+values of pointer arguments for the target's system call.
+(This is something that can't be done from within a seccomp filter.)
+One way in which the supervisor can do this is to open the corresponding
+.IR /proc/ tid /mem
+file (see
+.BR proc (5))
+and read bytes from the location that corresponds to one of
+the pointer arguments whose value is supplied in the notification event.
+.\" Tycho Andersen mentioned that there are alternatives to /proc/PID/mem,
+.\" such as ptrace() and /proc/PID/map_files
+(The supervisor must be careful to avoid
+a race condition that can occur when doing this;
+see the description of the
+.B SECCOMP_IOCTL_NOTIF_ID_VALID
+.BR ioctl (2)
+operation below.)
+In addition,
+the supervisor can access other system information that is visible
+in user space but which is not accessible from a seccomp filter.
+.\"-------------------------------------
+.IP (7)
+Having obtained information as per the previous step,
+the supervisor may then choose to perform an action in response
+to the target's system call
+(which, as noted above, is not executed when the seccomp filter returns the
+.B SECCOMP_RET_USER_NOTIF
+action value).
+.IP
+One example use case here relates to containers.
+The target may be located inside a container where
+it does not have sufficient capabilities to mount a filesystem
+in the container's mount namespace.
+However, the supervisor may be a more privileged process that
+does have sufficient capabilities to perform the mount operation.
+.\"-------------------------------------
+.IP (8)
+The supervisor then sends a response to the notification.
+The information in this response is used by the kernel to construct
+a return value for the target's system call and provide
+a value that will be assigned to the
+.I errno
+variable of the target.
+.IP
+The response is sent using the
+.B SECCOMP_IOCTL_NOTIF_SEND
+.BR ioctl (2)
+operation, which is used to transmit a
+.I seccomp_notif_resp
+structure to the kernel.
+This structure must include the cookie value that the supervisor
+obtained in the
+.I seccomp_notif
+structure returned by the
+.B SECCOMP_IOCTL_NOTIF_RECV
+operation;
+the cookie allows the kernel to associate the response with the target.
+.\"-------------------------------------
+.IP (9)
+Once the notification has been sent,
+the system call in the target thread unblocks,
+returning the information that was provided by the supervisor
+in the notification response.
+.\"-------------------------------------
+.PP
+As a variation on the last two steps,
+the supervisor can send a response that tells the kernel that it
+should execute the target thread's system call; see the discussion of
+.BR SECCOMP_USER_NOTIF_FLAG_CONTINUE ,
+below.
+.\"
+.SH IOCTL OPERATIONS
+The following
+.BR ioctl (2)
+operations are supported by the seccomp user-space
+notification file descriptor.
+For each of these operations, the first (file descriptor) argument of
+.BR ioctl (2)
+is the listening file descriptor returned by a call to
+.BR seccomp (2)
+with the
+.B SECCOMP_FILTER_FLAG_NEW_LISTENER
+flag.
+.\"
+.SS SECCOMP_IOCTL_NOTIF_RECV
+The
+.B SECCOMP_IOCTL_NOTIF_RECV
+operation (available since Linux 5.0) is used to obtain a user-space
+notification event.
+If no such event is currently pending,
+the operation blocks until an event occurs.
+The third
+.BR ioctl (2)
+argument is a pointer to a structure of the following form
+which contains information about the event.
+This structure must be zeroed out before the call.
+.PP
+.in +4n
+.EX
+struct seccomp_notif {
+ __u64 id; /* Cookie */
+ __u32 pid; /* TID of target thread */
+ __u32 flags; /* Currently unused (0) */
+ struct seccomp_data data; /* See seccomp(2) */
+};
+.EE
+.in
+.PP
+The fields in this structure are as follows:
+.TP
+.I id
+This is a cookie for the notification.
+Each such cookie is guaranteed to be unique for the corresponding
+seccomp filter.
+.RS
+.IP \[bu] 3
+The cookie can be used with the
+.B SECCOMP_IOCTL_NOTIF_ID_VALID
+.BR ioctl (2)
+operation described below.
+.IP \[bu]
+When returning a notification response to the kernel,
+the supervisor must include the cookie value in the
+.I seccomp_notif_resp
+structure that is specified as the argument of the
+.B SECCOMP_IOCTL_NOTIF_SEND
+operation.
+.RE
+.TP
+.I pid
+This is the thread ID of the target thread that triggered
+the notification event.
+.TP
+.I flags
+This is a bit mask of flags providing further information on the event.
+In the current implementation, this field is always zero.
+.TP
+.I data
+This is a
+.I seccomp_data
+structure containing information about the system call that
+triggered the notification.
+This is the same structure that is passed to the seccomp filter.
+See
+.BR seccomp (2)
+for details of this structure.
+.PP
+On success, this operation returns 0; on failure, \-1 is returned, and
+.I errno
+is set to indicate the cause of the error.
+This operation can fail with the following errors:
+.TP
+.BR EINVAL " (since Linux 5.5)"
+.\" commit 2882d53c9c6f3b8311d225062522f03772cf0179
+The
+.I seccomp_notif
+structure that was passed to the call contained nonzero fields.
+.TP
+.B ENOENT
+The target thread was killed by a signal as the notification information
+was being generated,
+or the target's (blocked) system call was interrupted by a signal handler.
+.\" FIXME
+.\" From my experiments,
+.\" it appears that if a SECCOMP_IOCTL_NOTIF_RECV is done after
+.\" the target thread terminates, then the ioctl() simply
+.\" blocks (rather than returning an error to indicate that the
+.\" target no longer exists).
+.\"
+.\" I found that surprising, and it required some contortions in
+.\" the example program. It was not possible to code my SIGCHLD
+.\" handler (which reaps the zombie when the worker/target
+.\" terminates) to simply set a flag checked in the main
+.\" handleNotifications() loop, since this created an
+.\" unavoidable race where the child might terminate just after
+.\" I had checked the flag, but before I blocked (forever!) in the
+.\" SECCOMP_IOCTL_NOTIF_RECV operation. Instead, I had to code
+.\" the signal handler to simply call _exit(2) in order to
+.\" terminate the parent process (the supervisor).
+.\"
+.\" Is this expected behavior? It seems to me rather
+.\" desirable that SECCOMP_IOCTL_NOTIF_RECV should give an error
+.\" if the target has terminated.
+.\"
+.\" Jann posted a patch to rectify this, but there was no response
+.\" (Lore link: https://bit.ly/3jvUBxk) to his question about fixing
+.\" this issue. (I've tried building with the patch, but encountered
+.\" an issue with the target process entering D state after a signal.)
+.\"
+.\" For now, this behavior is documented in BUGS.
+.\"
+.\" Kees Cook commented: Let's change [this] ASAP!
+.\"
+.SS SECCOMP_IOCTL_NOTIF_ID_VALID
+The
+.B SECCOMP_IOCTL_NOTIF_ID_VALID
+operation (available since Linux 5.0) is used to check that a notification ID
+returned by an earlier
+.B SECCOMP_IOCTL_NOTIF_RECV
+operation is still valid
+(i.e., that the target still exists and its system call
+is still blocked waiting for a response).
+.PP
+The third
+.BR ioctl (2)
+argument is a pointer to the cookie
+.RI ( id )
+returned by the
+.B SECCOMP_IOCTL_NOTIF_RECV
+operation.
+.PP
+This operation is necessary to avoid race conditions that can occur when
+the thread identified by the
+.I pid
+returned by the
+.B SECCOMP_IOCTL_NOTIF_RECV
+operation terminates, and that thread ID is reused by another thread or process.
+An example of this kind of race is the following:
+.IP (1) 5
+A notification is generated on the listening file descriptor.
+The returned
+.I seccomp_notif
+contains the TID of the target thread (in the
+.I pid
+field of the structure).
+.IP (2)
+The target terminates.
+.IP (3)
+Another thread or process is created on the system that by chance reuses the
+TID that was freed when the target terminated.
+.IP (4)
+The supervisor
+.BR open (2)s
+the
+.IR /proc/ tid /mem
+file for the TID obtained in step 1, with the intention of (say)
+inspecting the memory location(s) that contain the argument(s) of
+the system call that triggered the notification in step 1.
+.PP
+In the above scenario, the risk is that the supervisor may try
+to access the memory of a process other than the target.
+This race can be avoided by following the call to
+.BR open (2)
+with a
+.B SECCOMP_IOCTL_NOTIF_ID_VALID
+operation to verify that the process that generated the notification
+is still alive.
+(Note that if the target terminates after the latter step,
+a subsequent
+.BR read (2)
+from the file descriptor may return 0, indicating end of file.)
+.\" Jann Horn:
+.\" the PID can be reused, but the /proc/$pid directory is
+.\" internally not associated with the numeric PID, but,
+.\" conceptually speaking, with a specific incarnation of the
+.\" PID, or something like that. (Actually, it is associated
+.\" with the "struct pid", which is not reused, instead of the
+.\" numeric PID.
+.PP
+See NOTES for a discussion of other cases where
+.B SECCOMP_IOCTL_NOTIF_ID_VALID
+checks must be performed.
+.PP
+On success (i.e., the notification ID is still valid),
+this operation returns 0.
+On failure (i.e., the notification ID is no longer valid),
+\-1 is returned, and
+.I errno
+is set to
+.BR ENOENT .
+.\"
+.SS SECCOMP_IOCTL_NOTIF_SEND
+The
+.B SECCOMP_IOCTL_NOTIF_SEND
+operation (available since Linux 5.0)
+is used to send a notification response back to the kernel.
+The third
+.BR ioctl (2)
+argument is a pointer to a structure of the following form:
+.PP
+.in +4n
+.EX
+struct seccomp_notif_resp {
+ __u64 id; /* Cookie value */
+ __s64 val; /* Success return value */
+ __s32 error; /* 0 (success) or negative error number */
+ __u32 flags; /* See below */
+};
+.EE
+.in
+.PP
+The fields of this structure are as follows:
+.TP
+.I id
+This is the cookie value that was obtained using the
+.B SECCOMP_IOCTL_NOTIF_RECV
+operation.
+This cookie value allows the kernel to correctly associate this response
+with the system call that triggered the user-space notification.
+.TP
+.I val
+This is the value that will be used for a spoofed
+success return for the target's system call; see below.
+.TP
+.I error
+This is the value that will be used as the error number
+.RI ( errno )
+for a spoofed error return for the target's system call; see below.
+.TP
+.I flags
+This is a bit mask that includes zero or more of the following flags:
+.RS
+.TP
+.BR SECCOMP_USER_NOTIF_FLAG_CONTINUE " (since Linux 5.5)"
+Tell the kernel to execute the target's system call.
+.\" commit fb3c5386b382d4097476ce9647260fc89b34afdb
+.RE
+.PP
+Two kinds of response are possible:
+.IP \[bu] 3
+A response to the kernel telling it to execute the
+target's system call.
+In this case, the
+.I flags
+field includes
+.B SECCOMP_USER_NOTIF_FLAG_CONTINUE
+and the
+.I error
+and
+.I val
+fields must be zero.
+.IP
+This kind of response can be useful in cases where the supervisor needs
+to do deeper analysis of the target's system call than is possible
+from a seccomp filter (e.g., examining the values of pointer arguments),
+and, having decided that the system call does not require emulation
+by the supervisor, the supervisor wants the system call to
+be executed normally in the target.
+.IP
+The
+.B SECCOMP_USER_NOTIF_FLAG_CONTINUE
+flag should be used with caution; see NOTES.
+.IP \[bu]
+A spoofed return value for the target's system call.
+In this case, the kernel does not execute the target's system call,
+instead causing the system call to return a spoofed value as specified by
+fields of the
+.I seccomp_notif_resp
+structure.
+The supervisor should set the fields of this structure as follows:
+.RS
+.IP + 3
+.I flags
+does not contain
+.BR SECCOMP_USER_NOTIF_FLAG_CONTINUE .
+.IP +
+.I error
+is set either to 0 for a spoofed "success" return or to a negative
+error number for a spoofed "failure" return.
+In the former case, the kernel causes the target's system call
+to return the value specified in the
+.I val
+field.
+In the latter case, the kernel causes the target's system call
+to return \-1, and
+.I errno
+is assigned the negated
+.I error
+value.
+.IP +
+.I val
+is set to a value that will be used as the return value for a spoofed
+"success" return for the target's system call.
+The value in this field is ignored if the
+.I error
+field contains a nonzero value.
+.\" FIXME
+.\" Kees Cook suggested:
+.\"
+.\" Strictly speaking, this is architecture specific, but
+.\" all architectures do it this way. Should seccomp enforce
+.\" val == 0 when err != 0 ?
+.\"
+.\" Christian Brauner
+.\"
+.\" Feels like it should, at least for the SEND ioctl where we already
+.\" verify that val and err are both 0 when CONTINUE is specified (as you
+.\" pointed out correctly above).
+.RE
+.PP
+On success, this operation returns 0; on failure, \-1 is returned, and
+.I errno
+is set to indicate the cause of the error.
+This operation can fail with the following errors:
+.TP
+.B EINPROGRESS
+A response to this notification has already been sent.
+.TP
+.B EINVAL
+An invalid value was specified in the
+.I flags
+field.
+.TP
+.B EINVAL
+The
+.I flags
+field contained
+.BR SECCOMP_USER_NOTIF_FLAG_CONTINUE ,
+and the
+.I error
+or
+.I val
+field was not zero.
+.TP
+.B ENOENT
+The blocked system call in the target
+has been interrupted by a signal handler
+or the target has terminated.
+.\" Jann Horn notes:
+.\" you could also get this [ENOENT] if a response has already
+.\" been sent, instead of EINPROGRESS - the only difference is
+.\" whether the target thread has picked up the response yet
+.\"
+.SS SECCOMP_IOCTL_NOTIF_ADDFD
+The
+.B SECCOMP_IOCTL_NOTIF_ADDFD
+operation (available since Linux 5.9)
+allows the supervisor to install a file descriptor
+into the target's file descriptor table.
+Much like the use of
+.B SCM_RIGHTS
+messages described in
+.BR unix (7),
+this operation is semantically equivalent to duplicating
+a file descriptor from the supervisor's file descriptor table
+into the target's file descriptor table.
+.PP
+The
+.B SECCOMP_IOCTL_NOTIF_ADDFD
+operation permits the supervisor to emulate a target system call (such as
+.BR socket (2)
+or
+.BR openat (2))
+that generates a file descriptor.
+The supervisor can perform the system call that generates
+the file descriptor (and associated open file description)
+and then use this operation to allocate
+a file descriptor that refers to the same open file description in the target.
+(For an explanation of open file descriptions, see
+.BR open (2).)
+.PP
+Once this operation has been performed,
+the supervisor can close its copy of the file descriptor.
+.PP
+In the target,
+the received file descriptor is subject to the same
+Linux Security Module (LSM) checks as are applied to a file descriptor
+that is received in an
+.B SCM_RIGHTS
+ancillary message.
+If the file descriptor refers to a socket,
+it inherits the cgroup version 1 network controller settings
+.RI ( classid
+and
+.IR netprioidx )
+of the target.
+.PP
+The third
+.BR ioctl (2)
+argument is a pointer to a structure of the following form:
+.PP
+.in +4n
+.EX
+struct seccomp_notif_addfd {
+ __u64 id; /* Cookie value */
+ __u32 flags; /* Flags */
+ __u32 srcfd; /* Local file descriptor number */
+ __u32 newfd; /* 0 or desired file descriptor
+ number in target */
+ __u32 newfd_flags; /* Flags to set on target file
+ descriptor */
+};
+.EE
+.in
+.PP
+The fields in this structure are as follows:
+.TP
+.I id
+This field should be set to the notification ID
+(cookie value) that was obtained via
+.BR SECCOMP_IOCTL_NOTIF_RECV .
+.TP
+.I flags
+This field is a bit mask of flags that modify the behavior of the operation.
+Currently, the following flags are supported:
+.RS
+.TP
+.B SECCOMP_ADDFD_FLAG_SETFD
+When allocating the file descriptor in the target,
+use the file descriptor number specified in the
+.I newfd
+field.
+.TP
+.BR SECCOMP_ADDFD_FLAG_SEND " (since Linux 5.14)"
+.\" commit 0ae71c7720e3ae3aabd2e8a072d27f7bd173d25c
+Perform the equivalent of
+.B SECCOMP_IOCTL_NOTIF_ADDFD
+plus
+.B SECCOMP_IOCTL_NOTIF_SEND
+as an atomic operation.
+On successful invocation, the target process's
+.I errno
+will be 0
+and the return value will be the file descriptor number
+that was allocated in the target.
+If allocating the file descriptor in the target fails,
+the target's system call continues to be blocked
+until a successful response is sent.
+.RE
+.TP
+.I srcfd
+This field should be set to the number of the file descriptor
+in the supervisor that is to be duplicated.
+.TP
+.I newfd
+This field determines which file descriptor number is allocated in the target.
+If the
+.B SECCOMP_ADDFD_FLAG_SETFD
+flag is set,
+then this field specifies which file descriptor number should be allocated.
+If this file descriptor number is already open in the target,
+it is atomically closed and reused.
+If the descriptor duplication fails due to an LSM check, or if
+.I srcfd
+is not a valid file descriptor,
+the file descriptor
+.I newfd
+will not be closed in the target process.
+.IP
+If the
+.B SECCOMP_ADDFD_FLAG_SETFD
+flag is not set, then this field must be 0,
+and the kernel allocates the lowest unused file descriptor number
+in the target.
+.TP
+.I newfd_flags
+This field is a bit mask specifying flags that should be set on
+the file descriptor that is received in the target process.
+Currently, only the following flag is implemented:
+.RS
+.TP
+.B O_CLOEXEC
+Set the close-on-exec flag on the received file descriptor.
+.RE
+.PP
+On success, this
+.BR ioctl (2)
+call returns the number of the file descriptor that was allocated
+in the target.
+Assuming that the emulated system call is one that returns
+a file descriptor as its function result (e.g.,
+.BR socket (2)),
+this value can be used as the return value
+.RI ( resp.val )
+that is supplied in the response that is subsequently sent with the
+.B SECCOMP_IOCTL_NOTIF_SEND
+operation.
+.PP
+On error, \-1 is returned and
+.I errno
+is set to indicate the cause of the error.
+.PP
+This operation can fail with the following errors:
+.TP
+.B EBADF
+Allocating the file descriptor in the target would cause the target's
+.B RLIMIT_NOFILE
+limit to be exceeded (see
+.BR getrlimit (2)).
+.TP
+.B EBUSY
+If the flag
+.B SECCOMP_ADDFD_FLAG_SEND
+is used, this means the operation can't proceed until other
+.B SECCOMP_IOCTL_NOTIF_ADDFD
+requests are processed.
+.TP
+.B EINPROGRESS
+The user-space notification specified in the
+.I id
+field exists but has not yet been fetched (by a
+.BR SECCOMP_IOCTL_NOTIF_RECV )
+or has already been responded to (by a
+.BR SECCOMP_IOCTL_NOTIF_SEND ).
+.TP
+.B EINVAL
+An invalid flag was specified in the
+.I flags
+or
+.I newfd_flags
+field, or the
+.I newfd
+field is nonzero and the
+.B SECCOMP_ADDFD_FLAG_SETFD
+flag was not specified in the
+.I flags
+field.
+.TP
+.B EMFILE
+The file descriptor number specified in
+.I newfd
+exceeds the limit specified in
+.IR /proc/sys/fs/nr_open .
+.TP
+.B ENOENT
+The blocked system call in the target
+has been interrupted by a signal handler
+or the target has terminated.
+.PP
+Here is some sample code (with error handling omitted) that uses the
+.B SECCOMP_IOCTL_NOTIF_ADDFD
+operation (here, to emulate a call to
+.BR openat (2)):
+.PP
+.EX
+.in +4n
+int fd, targetFd;
+\&
+fd = openat(req\->data.args[0], path, req\->data.args[2],
+ req\->data.args[3]);
+\&
+struct seccomp_notif_addfd addfd;
+addfd.id = req\->id; /* Cookie from SECCOMP_IOCTL_NOTIF_RECV */
+addfd.srcfd = fd;
+addfd.newfd = 0;
+addfd.flags = 0;
+addfd.newfd_flags = O_CLOEXEC;
+\&
+targetFd = ioctl(notifyFd, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd);
+\&
+close(fd); /* No longer needed in supervisor */
+\&
+struct seccomp_notif_resp *resp;
+ /* Code to allocate 'resp' omitted */
+resp\->id = req\->id;
+resp\->error = 0; /* "Success" */
+resp\->val = targetFd;
+resp\->flags = 0;
+ioctl(notifyFd, SECCOMP_IOCTL_NOTIF_SEND, resp);
+.in
+.EE
+.SH NOTES
+One example use case for the user-space notification
+mechanism is to allow a container manager
+(a process which is typically running with more privilege than
+the processes inside the container)
+to mount block devices or create device nodes for the container.
+The mount use case provides an example of where the
+.B SECCOMP_USER_NOTIF_FLAG_CONTINUE
+flag is useful.
+Upon receiving a notification for the
+.BR mount (2)
+system call, the container manager (the "supervisor") can distinguish
+a request to mount a block filesystem
+(which would not be possible for a "target" process inside the container)
+and mount that filesystem.
+If, on the other hand, the container manager detects that the operation
+could be performed by the process inside the container
+(e.g., a mount of a
+.BR tmpfs (5)
+filesystem), it can notify the kernel that the target process's
+.BR mount (2)
+system call can continue.
+.\"
+.SS select()/poll()/epoll semantics
+The file descriptor returned when
+.BR seccomp (2)
+is employed with the
+.B SECCOMP_FILTER_FLAG_NEW_LISTENER
+flag can be monitored using
+.BR poll (2),
+.BR epoll (7),
+and
+.BR select (2).
+These interfaces indicate that the file descriptor is ready as follows:
+.IP \[bu] 3
+When a notification is pending,
+these interfaces indicate that the file descriptor is readable.
+Following such an indication, a subsequent
+.B SECCOMP_IOCTL_NOTIF_RECV
+.BR ioctl (2)
+will not block, returning either information about a notification
+or else failing with the error
+.B ENOENT
+if the target has been killed by a signal or its system call
+has been interrupted by a signal handler.
+.IP \[bu]
+After the notification has been received (i.e., by the
+.B SECCOMP_IOCTL_NOTIF_RECV
+.BR ioctl (2)
+operation), these interfaces indicate that the file descriptor is writable,
+meaning that a notification response can be sent using the
+.B SECCOMP_IOCTL_NOTIF_SEND
+.BR ioctl (2)
+operation.
+.IP \[bu]
+After the last thread using the filter has terminated and been reaped using
+.BR waitpid (2)
+(or similar),
+the file descriptor indicates an end-of-file condition (readable in
+.BR select (2);
+.BR POLLHUP / EPOLLHUP
+in
+.BR poll (2)/
+.BR epoll_wait (2)).
+.SS Design goals; use of SECCOMP_USER_NOTIF_FLAG_CONTINUE
+The intent of the user-space notification feature is
+to allow system calls to be performed on behalf of the target.
+The target's system call should either be handled by the supervisor or
+allowed to continue normally in the kernel (where standard security
+policies will be applied).
+.PP
+.BR "Note well" :
+this mechanism must not be used to make security policy decisions
+about the system call,
+which would be inherently race-prone for reasons described next.
+.PP
+The
+.B SECCOMP_USER_NOTIF_FLAG_CONTINUE
+flag must be used with caution.
+If set by the supervisor, the target's system call will continue.
+However, there is a time-of-check, time-of-use race here,
+since an attacker could exploit the interval of time where the target is
+blocked waiting on the "continue" response to do things such as
+rewriting the system call arguments.
+.PP
+Note furthermore that a user-space notifier can be bypassed if
+the existing filters allow the use of
+.BR seccomp (2)
+or
+.BR prctl (2)
+to install a filter that returns an action value with a higher precedence than
+.B SECCOMP_RET_USER_NOTIF
+(see
+.BR seccomp (2)).
+.PP
+It should thus be absolutely clear that the
+seccomp user-space notification mechanism
+.B can not
+be used to implement a security policy!
+It should only ever be used in scenarios where a more privileged process
+supervises the system calls of a lesser privileged target to
+get around kernel-enforced security restrictions when
+the supervisor deems this safe.
+In other words,
+in order to continue a system call, the supervisor should be sure that
+another security mechanism or the kernel itself will sufficiently block
+the system call if its arguments are rewritten to something unsafe.
+.\"
+.SS Caveats regarding the use of \fI/proc/\fPtid\fI/mem\fP
+The discussion above noted the need to use the
+.B SECCOMP_IOCTL_NOTIF_ID_VALID
+.BR ioctl (2)
+when opening the
+.IR /proc/ tid /mem
+file of the target
+to avoid the possibility of accessing the memory of the wrong process
+in the event that the target terminates and its ID
+is recycled by another (unrelated) thread.
+However, the use of this
+.BR ioctl (2)
+operation is also necessary in other situations,
+as explained in the following paragraphs.
+.PP
+Consider the following scenario, where the supervisor
+tries to read the pathname argument of a target's blocked
+.BR mount (2)
+system call:
+.IP (1) 5
+From one of its functions
+.RI ( func() ),
+the target calls
+.BR mount (2),
+which triggers a user-space notification and causes the target to block.
+.IP (2)
+The supervisor receives the notification, opens
+.IR /proc/ tid /mem ,
+and (successfully) performs the
+.B SECCOMP_IOCTL_NOTIF_ID_VALID
+check.
+.IP (3)
+The target receives a signal, which causes the
+.BR mount (2)
+to abort.
+.IP (4)
+The signal handler executes in the target, and returns.
+.IP (5)
+Upon return from the handler, the execution of
+.I func()
+resumes, and it returns (and perhaps other functions are called,
+overwriting the memory that had been used for the stack frame of
+.IR func() ).
+.IP (6)
+Using the address provided in the notification information,
+the supervisor reads from the target's memory location that used to
+contain the pathname.
+.IP (7)
+The supervisor now calls
+.BR mount (2)
+with some arbitrary bytes obtained in the previous step.
+.PP
+The conclusion from the above scenario is this:
+since the target's blocked system call may be interrupted by a signal handler,
+the supervisor must be written to expect that the
+target may abandon its system call at
+.B any
+time;
+in such an event, any information that the supervisor obtained from
+the target's memory must be considered invalid.
+.PP
+To prevent such scenarios,
+every read from the target's memory must be separated from use of
+the bytes so obtained by a
+.B SECCOMP_IOCTL_NOTIF_ID_VALID
+check.
+In the above example, the check would be placed between the two final steps.
+An example of such a check is shown in EXAMPLES.
+.PP
+Following on from the above, it should be clear that
+a write by the supervisor into the target's memory can
+.B never
+be considered safe.
+.\"
+.SS Caveats regarding blocking system calls
+Suppose that the target performs a blocking system call (e.g.,
+.BR accept (2))
+that the supervisor should handle.
+The supervisor might then in turn execute the same blocking system call.
+.PP
+In this scenario,
+it is important to note that if the target's system call is now
+interrupted by a signal, the supervisor is
+.I not
+informed of this.
+If the supervisor does not take suitable steps to
+actively discover that the target's system call has been canceled,
+various difficulties can occur.
+Taking the example of
+.BR accept (2),
+the supervisor might remain blocked in its
+.BR accept (2)
+holding a port number that the target
+(which, after the interruption by the signal handler,
+perhaps closed its listening socket) might expect to be able to reuse in a
+.BR bind (2)
+call.
+.PP
+Therefore, when the supervisor wishes to emulate a blocking system call,
+it must do so in such a way that it gets informed if the target's
+system call is interrupted by a signal handler.
+For example, if the supervisor itself executes the same
+blocking system call, then it could employ a separate thread
+that uses the
+.B SECCOMP_IOCTL_NOTIF_ID_VALID
+operation to check if the target is still blocked in its system call.
+Alternatively, in the
+.BR accept (2)
+example, the supervisor might use
+.BR poll (2)
+to monitor both the notification file descriptor
+(so as to discover when the target's
+.BR accept (2)
+call has been interrupted) and the listening file descriptor
+(so as to know when a connection is available).
+.PP
+If the target's system call is interrupted,
+the supervisor must take care to release resources (e.g., file descriptors)
+that it acquired on behalf of the target.
+.\"
+.SS Interaction with SA_RESTART signal handlers
+Consider the following scenario:
+.IP (1) 5
+The target process has used
+.BR sigaction (2)
+to install a signal handler with the
+.B SA_RESTART
+flag.
+.IP (2)
+The target has made a system call that triggered a seccomp
+user-space notification and the target is currently blocked
+until the supervisor sends a notification response.
+.IP (3)
+A signal is delivered to the target and the signal handler is executed.
+.IP (4)
+When (if) the supervisor attempts to send a notification response, the
+.B SECCOMP_IOCTL_NOTIF_SEND
+.BR ioctl (2)
+operation will fail with the
+.B ENOENT
+error.
+.PP
+In this scenario, the kernel will restart the target's system call.
+Consequently, the supervisor will receive another user-space notification.
+Thus, depending on how many times the blocked system call
+is interrupted by a signal handler,
+the supervisor may receive multiple notifications for
+the same instance of a system call in the target.
+.PP
+One oddity is that system call restarting as described in this scenario
+will occur even for the blocking system calls listed in
+.BR signal (7)
+that would
+.B never
+normally be restarted by the
+.B SA_RESTART
+flag.
+.\" FIXME
+.\" About the above, Kees Cook commented:
+.\"
+.\" Does this need fixing? I imagine the correct behavior for this case
+.\" would be a response to _SEND of EINPROGRESS and the target would see
+.\" EINTR normally?
+.\"
+.\" I mean, it's not like seccomp doesn't already expose weirdness with
+.\" syscall restarts. Not even arm64 compat agrees[3] with arm32 in this
+.\" regard. :(
+.
+.\" FIXME
+.\" Michael Kerrisk:
+.\" I wonder about the effect of this oddity for system calls that
+.\" are normally nonrestartable because they have timeouts. My
+.\" understanding is that the kernel doesn't restart those system
+.\" calls because it's impossible for the kernel to restart the call
+.\" with the right timeout value. I wonder what happens when those
+.\" system calls are restarted in the scenario we're discussing.
+.PP
+Furthermore, if the supervisor response is a file descriptor
+added with
+.BR SECCOMP_IOCTL_NOTIF_ADDFD ,
+then the flag
+.B SECCOMP_ADDFD_FLAG_SEND
+can be used to atomically add the file descriptor and return that value,
+making sure no file descriptors are inadvertently leaked into the target.
+.SH BUGS
+If a
+.B SECCOMP_IOCTL_NOTIF_RECV
+.BR ioctl (2)
+operation
+.\" or a poll/epoll/select
+is performed after the target terminates, then the
+.BR ioctl (2)
+call simply blocks (rather than returning an error to indicate that the
+target no longer exists).
+.\" FIXME
+.\" Comment from Kees Cook:
+.\"
+.\" I want this fixed. It caused me no end of pain when building the
+.\" selftests, and ended up spawning my implementing a global test timeout
+.\" in kselftest. :P Before the usage counter refactor, there was no sane
+.\" way to deal with this, but now I think we're close.
+.\"
+.SH EXAMPLES
+The (somewhat contrived) program shown below demonstrates the use of
+the interfaces described in this page.
+The program creates a child process that serves as the "target" process.
+The child process installs a seccomp filter that returns the
+.B SECCOMP_RET_USER_NOTIF
+action value if a call is made to
+.BR mkdir (2).
+The child process then calls
+.BR mkdir (2)
+once for each of the supplied command-line arguments,
+and reports the result returned by the call.
+After processing all arguments, the child process terminates.
+.PP
+The parent process acts as the supervisor, listening for the notifications
+that are generated when the target process calls
+.BR mkdir (2).
+When such a notification occurs,
+the supervisor examines the memory of the target process (using
+.IR /proc/ pid /mem )
+to discover the pathname argument that was supplied to the
+.BR mkdir (2)
+call, and performs one of the following actions:
+.IP \[bu] 3
+If the pathname begins with the prefix "/tmp/",
+then the supervisor attempts to create the specified directory,
+and then spoofs a return for the target process based on the return
+value of the supervisor's
+.BR mkdir (2)
+call.
+In the event that that call succeeds,
+the spoofed success return value is the length of the pathname.
+.IP \[bu]
+If the pathname begins with "./" (i.e., it is a relative pathname),
+the supervisor sends a
+.B SECCOMP_USER_NOTIF_FLAG_CONTINUE
+response to the kernel to say that the kernel should execute
+the target process's
+.BR mkdir (2)
+call.
+.IP \[bu]
+If the pathname begins with some other prefix,
+the supervisor spoofs an error return for the target process,
+so that the target process's
+.BR mkdir (2)
+call appears to fail with the error
+.B EOPNOTSUPP
+("Operation not supported").
+Additionally, if the specified pathname is exactly "/bye",
+then the supervisor terminates.
+.PP
+This program can be used to demonstrate various aspects of the
+behavior of the seccomp user-space notification mechanism.
+To aid such demonstrations,
+the program logs various messages to show the operation
+of the target process (lines prefixed "T:") and the supervisor
+(indented lines prefixed "S:").
+.PP
+In the following example, the target attempts to create the directory
+.IR /tmp/x .
+Upon receiving the notification, the supervisor creates the directory on the
+target's behalf,
+and spoofs a success return to be received by the target process's
+.BR mkdir (2)
+call.
+.PP
+.in +4n
+.EX
+$ \fB./seccomp_unotify /tmp/x\fP
+T: PID = 23168
+\&
+T: about to mkdir("/tmp/x")
+ S: got notification (ID 0x17445c4a0f4e0e3c) for PID 23168
+ S: executing: mkdir("/tmp/x", 0700)
+ S: success! spoofed return = 6
+ S: sending response (flags = 0; val = 6; error = 0)
+T: SUCCESS: mkdir(2) returned 6
+\&
+T: terminating
+ S: target has terminated; bye
+.EE
+.in
+.PP
+In the above output, note that the spoofed return value seen by the target
+process is 6 (the length of the pathname
+.IR /tmp/x ),
+whereas a normal
+.BR mkdir (2)
+call returns 0 on success.
+.PP
+In the next example, the target attempts to create a directory using the
+relative pathname
+.IR ./sub .
+Since this pathname starts with "./",
+the supervisor sends a
+.B SECCOMP_USER_NOTIF_FLAG_CONTINUE
+response to the kernel,
+and the kernel then (successfully) executes the target process's
+.BR mkdir (2)
+call.
+.PP
+.in +4n
+.EX
+$ \fB./seccomp_unotify ./sub\fP
+T: PID = 23204
+\&
+T: about to mkdir("./sub")
+ S: got notification (ID 0xddb16abe25b4c12) for PID 23204
+ S: target can execute system call
+ S: sending response (flags = 0x1; val = 0; error = 0)
+T: SUCCESS: mkdir(2) returned 0
+\&
+T: terminating
+ S: target has terminated; bye
+.EE
+.in
+.PP
+If the target process attempts to create a directory with
+a pathname that doesn't begin with "./" and doesn't begin with the prefix
+"/tmp/", then the supervisor spoofs an error return
+.RB ( EOPNOTSUPP ,
+"Operation not supported")
+for the target's
+.BR mkdir (2)
+call (which is not executed):
+.PP
+.in +4n
+.EX
+$ \fB./seccomp_unotify /xxx\fP
+T: PID = 23178
+\&
+T: about to mkdir("/xxx")
+ S: got notification (ID 0xe7dc095d1c524e80) for PID 23178
+ S: spoofing error response (Operation not supported)
+ S: sending response (flags = 0; val = 0; error = \-95)
+T: ERROR: mkdir(2): Operation not supported
+\&
+T: terminating
+ S: target has terminated; bye
+.EE
+.in
+.PP
+In the next example,
+the target process attempts to create a directory with the pathname
+.IR /tmp/nosuchdir/b .
+Upon receiving the notification,
+the supervisor attempts to create that directory, but the
+.BR mkdir (2)
+call fails because the directory
+.I /tmp/nosuchdir
+does not exist.
+Consequently, the supervisor spoofs an error return that passes the error
+that it received back to the target process's
+.BR mkdir (2)
+call.
+.PP
+.in +4n
+.EX
+$ \fB./seccomp_unotify /tmp/nosuchdir/b\fP
+T: PID = 23199
+\&
+T: about to mkdir("/tmp/nosuchdir/b")
+ S: got notification (ID 0x8744454293506046) for PID 23199
+ S: executing: mkdir("/tmp/nosuchdir/b", 0700)
+ S: failure! (errno = 2; No such file or directory)
+ S: sending response (flags = 0; val = 0; error = \-2)
+T: ERROR: mkdir(2): No such file or directory
+\&
+T: terminating
+ S: target has terminated; bye
+.EE
+.in
+.PP
+If the supervisor receives a notification and sees that the
+argument of the target's
+.BR mkdir (2)
+is the string "/bye", then (as well as spoofing an
+.B EOPNOTSUPP
+error), the supervisor terminates.
+If the target process subsequently executes another
+.BR mkdir (2)
+that triggers its seccomp filter to return the
+.B SECCOMP_RET_USER_NOTIF
+action value, then the kernel causes the target process's system call to
+fail with the error
+.B ENOSYS
+("Function not implemented").
+This is demonstrated by the following example:
+.PP
+.in +4n
+.EX
+$ \fB./seccomp_unotify /bye /tmp/y\fP
+T: PID = 23185
+\&
+T: about to mkdir("/bye")
+ S: got notification (ID 0xa81236b1d2f7b0f4) for PID 23185
+ S: spoofing error response (Operation not supported)
+ S: sending response (flags = 0; val = 0; error = \-95)
+ S: terminating **********
+T: ERROR: mkdir(2): Operation not supported
+\&
+T: about to mkdir("/tmp/y")
+T: ERROR: mkdir(2): Function not implemented
+\&
+T: terminating
+.EE
+.in
+.\"
+.SS Program source
+.\" SRC BEGIN (seccomp_unotify.c)
+.EX
+#define _GNU_SOURCE
+#include <err.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <linux/audit.h>
+#include <linux/filter.h>
+#include <linux/seccomp.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/prctl.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/un.h>
+#include <unistd.h>
+\&
+#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
+\&
+/* Send the file descriptor \[aq]fd\[aq] over the connected UNIX domain socket
+ \[aq]sockfd\[aq]. Returns 0 on success, or \-1 on error. */
+\&
+static int
+sendfd(int sockfd, int fd)
+{
+ int data;
+ struct iovec iov;
+ struct msghdr msgh;
+ struct cmsghdr *cmsgp;
+\&
+ /* Allocate a char array of suitable size to hold the ancillary data.
+ However, since this buffer is in reality a \[aq]struct cmsghdr\[aq], use a
+ union to ensure that it is suitably aligned. */
+ union {
+ char buf[CMSG_SPACE(sizeof(int))];
+ /* Space large enough to hold an \[aq]int\[aq] */
+ struct cmsghdr align;
+ } controlMsg;
+\&
+ /* The \[aq]msg_name\[aq] field can be used to specify the address of the
+ destination socket when sending a datagram. However, we do not
+ need to use this field because \[aq]sockfd\[aq] is a connected socket. */
+\&
+ msgh.msg_name = NULL;
+ msgh.msg_namelen = 0;
+\&
+ /* On Linux, we must transmit at least one byte of real data in
+ order to send ancillary data. We transmit an arbitrary integer
+ whose value is ignored by recvfd(). */
+\&
+ msgh.msg_iov = &iov;
+ msgh.msg_iovlen = 1;
+ iov.iov_base = &data;
+ iov.iov_len = sizeof(int);
+ data = 12345;
+\&
+ /* Set \[aq]msghdr\[aq] fields that describe ancillary data */
+\&
+ msgh.msg_control = controlMsg.buf;
+ msgh.msg_controllen = sizeof(controlMsg.buf);
+\&
+ /* Set up ancillary data describing file descriptor to send */
+\&
+ cmsgp = CMSG_FIRSTHDR(&msgh);
+ cmsgp\->cmsg_level = SOL_SOCKET;
+ cmsgp\->cmsg_type = SCM_RIGHTS;
+ cmsgp\->cmsg_len = CMSG_LEN(sizeof(int));
+ memcpy(CMSG_DATA(cmsgp), &fd, sizeof(int));
+\&
+ /* Send real plus ancillary data */
+\&
+ if (sendmsg(sockfd, &msgh, 0) == \-1)
+ return \-1;
+\&
+ return 0;
+}
+\&
+/* Receive a file descriptor on a connected UNIX domain socket. Returns
+ the received file descriptor on success, or \-1 on error. */
+\&
+static int
+recvfd(int sockfd)
+{
+ int data, fd;
+ ssize_t nr;
+ struct iovec iov;
+ struct msghdr msgh;
+\&
+ /* Allocate a char buffer for the ancillary data. See the comments
+ in sendfd() */
+ union {
+ char buf[CMSG_SPACE(sizeof(int))];
+ struct cmsghdr align;
+ } controlMsg;
+ struct cmsghdr *cmsgp;
+\&
+ /* The \[aq]msg_name\[aq] field can be used to obtain the address of the
+ sending socket. However, we do not need this information. */
+\&
+ msgh.msg_name = NULL;
+ msgh.msg_namelen = 0;
+\&
+ /* Specify buffer for receiving real data */
+\&
+ msgh.msg_iov = &iov;
+ msgh.msg_iovlen = 1;
+ iov.iov_base = &data; /* Real data is an \[aq]int\[aq] */
+ iov.iov_len = sizeof(int);
+\&
+ /* Set \[aq]msghdr\[aq] fields that describe ancillary data */
+\&
+ msgh.msg_control = controlMsg.buf;
+ msgh.msg_controllen = sizeof(controlMsg.buf);
+\&
+ /* Receive real plus ancillary data; real data is ignored */
+\&
+ nr = recvmsg(sockfd, &msgh, 0);
+ if (nr == \-1)
+ return \-1;
+\&
+ cmsgp = CMSG_FIRSTHDR(&msgh);
+\&
+ /* Check the validity of the \[aq]cmsghdr\[aq] */
+\&
+ if (cmsgp == NULL
+ || cmsgp\->cmsg_len != CMSG_LEN(sizeof(int))
+ || cmsgp\->cmsg_level != SOL_SOCKET
+ || cmsgp\->cmsg_type != SCM_RIGHTS)
+ {
+ errno = EINVAL;
+ return \-1;
+ }
+\&
+ /* Return the received file descriptor to our caller */
+\&
+ memcpy(&fd, CMSG_DATA(cmsgp), sizeof(int));
+ return fd;
+}
+\&
+static void
+sigchldHandler(int sig)
+{
+ char msg[] = "\etS: target has terminated; bye\en";
+\&
+ write(STDOUT_FILENO, msg, sizeof(msg) \- 1);
+ _exit(EXIT_SUCCESS);
+}
+\&
+static int
+seccomp(unsigned int operation, unsigned int flags, void *args)
+{
+ return syscall(SYS_seccomp, operation, flags, args);
+}
+\&
+/* The following is the x86\-64\-specific BPF boilerplate code for checking
+ that the BPF program is running on the right architecture + ABI. At
+ completion of these instructions, the accumulator contains the system
+ call number. */
+\&
+/* For the x32 ABI, all system call numbers have bit 30 set */
+\&
+#define X32_SYSCALL_BIT 0x40000000
+\&
+#define X86_64_CHECK_ARCH_AND_LOAD_SYSCALL_NR \e
+ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, \e
+ (offsetof(struct seccomp_data, arch))), \e
+ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, AUDIT_ARCH_X86_64, 0, 2), \e
+ BPF_STMT(BPF_LD | BPF_W | BPF_ABS, \e
+ (offsetof(struct seccomp_data, nr))), \e
+ BPF_JUMP(BPF_JMP | BPF_JGE | BPF_K, X32_SYSCALL_BIT, 0, 1), \e
+ BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS)
+\&
+/* installNotifyFilter() installs a seccomp filter that generates
+ user\-space notifications (SECCOMP_RET_USER_NOTIF) when the process
+ calls mkdir(2); the filter allows all other system calls.
+\&
+ The function return value is a file descriptor from which the
+ user\-space notifications can be fetched. */
+\&
+static int
+installNotifyFilter(void)
+{
+ int notifyFd;
+\&
+ struct sock_filter filter[] = {
+ X86_64_CHECK_ARCH_AND_LOAD_SYSCALL_NR,
+\&
+ /* mkdir() triggers notification to user\-space supervisor */
+\&
+ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, SYS_mkdir, 0, 1),
+ BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_USER_NOTIF),
+\&
+ /* Every other system call is allowed */
+\&
+ BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
+ };
+\&
+ struct sock_fprog prog = {
+ .len = ARRAY_SIZE(filter),
+ .filter = filter,
+ };
+\&
+ /* Install the filter with the SECCOMP_FILTER_FLAG_NEW_LISTENER flag;
+ as a result, seccomp() returns a notification file descriptor. */
+\&
+ notifyFd = seccomp(SECCOMP_SET_MODE_FILTER,
+ SECCOMP_FILTER_FLAG_NEW_LISTENER, &prog);
+ if (notifyFd == \-1)
+ err(EXIT_FAILURE, "seccomp\-install\-notify\-filter");
+\&
+ return notifyFd;
+}
+\&
+/* Close a pair of sockets created by socketpair() */
+\&
+static void
+closeSocketPair(int sockPair[2])
+{
+ if (close(sockPair[0]) == \-1)
+ err(EXIT_FAILURE, "closeSocketPair\-close\-0");
+ if (close(sockPair[1]) == \-1)
+ err(EXIT_FAILURE, "closeSocketPair\-close\-1");
+}
+\&
+/* Implementation of the target process; create a child process that:
+\&
+ (1) installs a seccomp filter with the
+ SECCOMP_FILTER_FLAG_NEW_LISTENER flag;
+ (2) writes the seccomp notification file descriptor returned from
+ the previous step onto the UNIX domain socket, \[aq]sockPair[0]\[aq];
+ (3) calls mkdir(2) for each element of \[aq]argv\[aq].
+\&
+ The function return value in the parent is the PID of the child
+ process; the child does not return from this function. */
+\&
+static pid_t
+targetProcess(int sockPair[2], char *argv[])
+{
+ int notifyFd, s;
+ pid_t targetPid;
+\&
+ targetPid = fork();
+\&
+ if (targetPid == \-1)
+ err(EXIT_FAILURE, "fork");
+\&
+ if (targetPid > 0) /* In parent, return PID of child */
+ return targetPid;
+\&
+ /* Child falls through to here */
+\&
+ printf("T: PID = %ld\en", (long) getpid());
+\&
+ /* Install seccomp filter(s) */
+\&
+ if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
+ err(EXIT_FAILURE, "prctl");
+\&
+ notifyFd = installNotifyFilter();
+\&
+ /* Pass the notification file descriptor to the tracing process over
+ a UNIX domain socket */
+\&
+ if (sendfd(sockPair[0], notifyFd) == \-1)
+ err(EXIT_FAILURE, "sendfd");
+\&
+ /* Notification and socket FDs are no longer needed in target */
+\&
+ if (close(notifyFd) == \-1)
+ err(EXIT_FAILURE, "close\-target\-notify\-fd");
+\&
+ closeSocketPair(sockPair);
+\&
+ /* Perform a mkdir() call for each of the command\-line arguments */
+\&
+ for (char **ap = argv; *ap != NULL; ap++) {
+ printf("\enT: about to mkdir(\e"%s\e")\en", *ap);
+\&
+ s = mkdir(*ap, 0700);
+ if (s == \-1)
+ perror("T: ERROR: mkdir(2)");
+ else
+ printf("T: SUCCESS: mkdir(2) returned %d\en", s);
+ }
+\&
+ printf("\enT: terminating\en");
+ exit(EXIT_SUCCESS);
+}
+\&
+/* Check that the notification ID provided by a SECCOMP_IOCTL_NOTIF_RECV
+ operation is still valid. It will no longer be valid if the target
+ process has terminated or is no longer blocked in the system call that
+ generated the notification (because it was interrupted by a signal).
+\&
+ This operation can be used when doing such things as accessing
+ /proc/PID files in the target process in order to avoid TOCTOU race
+ conditions where the PID that is returned by SECCOMP_IOCTL_NOTIF_RECV
+ terminates and is reused by another process. */
+\&
+static bool
+cookieIsValid(int notifyFd, uint64_t id)
+{
+ return ioctl(notifyFd, SECCOMP_IOCTL_NOTIF_ID_VALID, &id) == 0;
+}
+\&
+/* Access the memory of the target process in order to fetch the
+ pathname referred to by the system call argument \[aq]argNum\[aq] in
+ \[aq]req\->data.args[]\[aq]. The pathname is returned in \[aq]path\[aq],
+ a buffer of \[aq]len\[aq] bytes allocated by the caller.
+\&
+ Returns true if the pathname is successfully fetched, and false
+ otherwise. For possible causes of failure, see the comments below. */
+\&
+static bool
+getTargetPathname(struct seccomp_notif *req, int notifyFd,
+ int argNum, char *path, size_t len)
+{
+ int procMemFd;
+ char procMemPath[PATH_MAX];
+ ssize_t nread;
+\&
+ snprintf(procMemPath, sizeof(procMemPath), "/proc/%d/mem", req\->pid);
+\&
+ procMemFd = open(procMemPath, O_RDONLY | O_CLOEXEC);
+ if (procMemFd == \-1)
+ return false;
+\&
+ /* Check that the process whose info we are accessing is still alive
+ and blocked in the system call that caused the notification.
+ If the SECCOMP_IOCTL_NOTIF_ID_VALID operation (performed in
+ cookieIsValid()) succeeded, we know that the /proc/PID/mem file
+ descriptor that we opened corresponded to the process for which we
+ received a notification. If that process subsequently terminates,
+ then read() on that file descriptor will return 0 (EOF). */
+\&
+ if (!cookieIsValid(notifyFd, req\->id)) {
+ close(procMemFd);
+ return false;
+ }
+\&
+ /* Read bytes at the location containing the pathname argument */
+\&
+ nread = pread(procMemFd, path, len, req\->data.args[argNum]);
+\&
+ close(procMemFd);
+\&
+ if (nread <= 0)
+ return false;
+\&
+ /* Once again check that the notification ID is still valid. The
+ case we are particularly concerned about here is that just
+ before we fetched the pathname, the target\[aq]s blocked system
+ call was interrupted by a signal handler, and after the handler
+ returned, the target carried on execution (past the interrupted
+ system call). In that case, we have no guarantees about what we
+ are reading, since the target\[aq]s memory may have been arbitrarily
+ changed by subsequent operations. */
+\&
+ if (!cookieIsValid(notifyFd, req\->id)) {
+ perror("\etS: notification ID check failed!!!");
+ return false;
+ }
+\&
+ /* Even if the target\[aq]s system call was not interrupted by a signal,
+ we have no guarantees about what was in the memory of the target
+ process. (The memory may have been modified by another thread, or
+ even by an external attacking process.) We therefore treat the
+ buffer returned by pread() as untrusted input. The buffer should
+ contain a terminating null byte; if not, then we will trigger an
+ error for the target process. */
+\&
+ if (strnlen(path, nread) < nread)
+ return true;
+\&
+ return false;
+}
+\&
+/* Allocate buffers for the seccomp user\-space notification request and
+ response structures. It is the caller\[aq]s responsibility to free the
+ buffers returned via \[aq]req\[aq] and \[aq]resp\[aq]. */
+\&
+static void
+allocSeccompNotifBuffers(struct seccomp_notif **req,
+ struct seccomp_notif_resp **resp,
+ struct seccomp_notif_sizes *sizes)
+{
+ size_t resp_size;
+\&
+ /* Discover the sizes of the structures that are used to receive
+ notifications and send notification responses, and allocate
+ buffers of those sizes. */
+\&
+ if (seccomp(SECCOMP_GET_NOTIF_SIZES, 0, sizes) == \-1)
+ err(EXIT_FAILURE, "seccomp\-SECCOMP_GET_NOTIF_SIZES");
+\&
+ *req = malloc(sizes\->seccomp_notif);
+ if (*req == NULL)
+ err(EXIT_FAILURE, "malloc\-seccomp_notif");
+\&
+ /* When allocating the response buffer, we must allow for the fact
+ that the user\-space binary may have been built with user\-space
+ headers where \[aq]struct seccomp_notif_resp\[aq] is bigger than the
+ response buffer expected by the (older) kernel. Therefore, we
+ allocate a buffer that is the maximum of the two sizes. This
+ ensures that if the supervisor places bytes into the response
+ structure that are past the response size that the kernel expects,
+ then the supervisor is not touching an invalid memory location. */
+\&
+ resp_size = sizes\->seccomp_notif_resp;
+ if (sizeof(struct seccomp_notif_resp) > resp_size)
+ resp_size = sizeof(struct seccomp_notif_resp);
+\&
+ *resp = malloc(resp_size);
+ if (*resp == NULL)
+ err(EXIT_FAILURE, "malloc\-seccomp_notif_resp");
+\&
+}
+\&
+/* Handle notifications that arrive via the SECCOMP_RET_USER_NOTIF file
+ descriptor, \[aq]notifyFd\[aq]. */
+\&
+static void
+handleNotifications(int notifyFd)
+{
+ bool pathOK;
+ char path[PATH_MAX];
+ struct seccomp_notif *req;
+ struct seccomp_notif_resp *resp;
+ struct seccomp_notif_sizes sizes;
+\&
+ allocSeccompNotifBuffers(&req, &resp, &sizes);
+\&
+ /* Loop handling notifications */
+\&
+ for (;;) {
+\&
+ /* Wait for next notification, returning info in \[aq]*req\[aq] */
+\&
+ memset(req, 0, sizes.seccomp_notif);
+ if (ioctl(notifyFd, SECCOMP_IOCTL_NOTIF_RECV, req) == \-1) {
+ if (errno == EINTR)
+ continue;
+ err(EXIT_FAILURE, "\etS: ioctl\-SECCOMP_IOCTL_NOTIF_RECV");
+ }
+\&
+ printf("\etS: got notification (ID %#llx) for PID %d\en",
+ req\->id, req\->pid);
+\&
+ /* The only system call that can generate a notification event
+ is mkdir(2). Nevertheless, we check that the notified system
+ call is indeed mkdir() as kind of future\-proofing of this
+ code in case the seccomp filter is later modified to
+ generate notifications for other system calls. */
+\&
+ if (req\->data.nr != SYS_mkdir) {
+ printf("\etS: notification contained unexpected "
+ "system call number; bye!!!\en");
+ exit(EXIT_FAILURE);
+ }
+\&
+ pathOK = getTargetPathname(req, notifyFd, 0, path, sizeof(path));
+\&
+ /* Prepopulate some fields of the response */
+\&
+ resp\->id = req\->id; /* Response includes notification ID */
+ resp\->flags = 0;
+ resp\->val = 0;
+\&
+ /* If getTargetPathname() failed, trigger an EINVAL error
+ response (sending this response may yield an error if the
+ failure occurred because the notification ID was no longer
+ valid); if the directory is in /tmp, then create it on behalf
+ of the target; if the pathname starts with \[aq].\[aq], tell the
+ kernel to let the target process execute the mkdir();
+ otherwise, give an error for a directory pathname in any other
+ location. */
+\&
+ if (!pathOK) {
+ resp\->error = \-EINVAL;
+ printf("\etS: spoofing error for invalid pathname (%s)\en",
+ strerror(\-resp\->error));
+ } else if (strncmp(path, "/tmp/", strlen("/tmp/")) == 0) {
+ printf("\etS: executing: mkdir(\e"%s\e", %#llo)\en",
+ path, req\->data.args[1]);
+\&
+ if (mkdir(path, req\->data.args[1]) == 0) {
+ resp\->error = 0; /* "Success" */
+ resp\->val = strlen(path); /* Used as return value of
+ mkdir() in target */
+ printf("\etS: success! spoofed return = %lld\en",
+ resp\->val);
+ } else {
+\&
+ /* If mkdir() failed in the supervisor, pass the error
+ back to the target */
+\&
+ resp\->error = \-errno;
+ printf("\etS: failure! (errno = %d; %s)\en", errno,
+ strerror(errno));
+ }
+ } else if (strncmp(path, "./", strlen("./")) == 0) {
+ resp\->error = resp\->val = 0;
+ resp\->flags = SECCOMP_USER_NOTIF_FLAG_CONTINUE;
+ printf("\etS: target can execute system call\en");
+ } else {
+ resp\->error = \-EOPNOTSUPP;
+ printf("\etS: spoofing error response (%s)\en",
+ strerror(\-resp\->error));
+ }
+\&
+ /* Send a response to the notification */
+\&
+ printf("\etS: sending response "
+ "(flags = %#x; val = %lld; error = %d)\en",
+ resp\->flags, resp\->val, resp\->error);
+\&
+ if (ioctl(notifyFd, SECCOMP_IOCTL_NOTIF_SEND, resp) == \-1) {
+ if (errno == ENOENT)
+ printf("\etS: response failed with ENOENT; "
+ "perhaps target process\[aq]s syscall was "
+ "interrupted by a signal?\en");
+ else
+ perror("ioctl\-SECCOMP_IOCTL_NOTIF_SEND");
+ }
+\&
+ /* If the pathname is just "/bye", then the supervisor breaks out
+ of the loop and terminates. This allows us to see what happens
+ if the target process makes further calls to mkdir(2). */
+\&
+ if (strcmp(path, "/bye") == 0)
+ break;
+ }
+\&
+ free(req);
+ free(resp);
+ printf("\etS: terminating **********\en");
+ exit(EXIT_FAILURE);
+}
+\&
+/* Implementation of the supervisor process:
+\&
+ (1) obtains the notification file descriptor from \[aq]sockPair[1]\[aq]
+ (2) handles notifications that arrive on that file descriptor. */
+\&
+static void
+supervisor(int sockPair[2])
+{
+ int notifyFd;
+\&
+ notifyFd = recvfd(sockPair[1]);
+\&
+ if (notifyFd == \-1)
+ err(EXIT_FAILURE, "recvfd");
+\&
+ closeSocketPair(sockPair); /* We no longer need the socket pair */
+\&
+ handleNotifications(notifyFd);
+}
+\&
+int
+main(int argc, char *argv[])
+{
+ int sockPair[2];
+ struct sigaction sa;
+\&
+ setbuf(stdout, NULL);
+\&
+ if (argc < 2) {
+ fprintf(stderr, "At least one pathname argument is required\en");
+ exit(EXIT_FAILURE);
+ }
+\&
+ /* Create a UNIX domain socket that is used to pass the seccomp
+ notification file descriptor from the target process to the
+ supervisor process. */
+\&
+ if (socketpair(AF_UNIX, SOCK_STREAM, 0, sockPair) == \-1)
+ err(EXIT_FAILURE, "socketpair");
+\&
+ /* Create a child process\-\-the "target"\-\-that installs seccomp
+ filtering. The target process writes the seccomp notification
+ file descriptor onto \[aq]sockPair[0]\[aq] and then calls mkdir(2) for
+ each directory in the command\-line arguments. */
+\&
+ (void) targetProcess(sockPair, &argv[optind]);
+\&
+ /* Catch SIGCHLD when the target terminates, so that the
+ supervisor can also terminate. */
+\&
+ sa.sa_handler = sigchldHandler;
+ sa.sa_flags = 0;
+ sigemptyset(&sa.sa_mask);
+ if (sigaction(SIGCHLD, &sa, NULL) == \-1)
+ err(EXIT_FAILURE, "sigaction");
+\&
+ supervisor(sockPair);
+\&
+ exit(EXIT_SUCCESS);
+}
+.EE
+.\" SRC END
+.SH SEE ALSO
+.BR ioctl (2),
+.BR pidfd_getfd (2),
+.BR pidfd_open (2),
+.BR seccomp (2)
+.PP
+A further example program can be found in the kernel source file
+.IR samples/seccomp/user-trap.c .
diff --git a/man2/security.2 b/man2/security.2
new file mode 100644
index 0000000..5d25ea6
--- /dev/null
+++ b/man2/security.2
@@ -0,0 +1 @@
+.so man2/unimplemented.2
diff --git a/man2/select.2 b/man2/select.2
new file mode 100644
index 0000000..41cb7e6
--- /dev/null
+++ b/man2/select.2
@@ -0,0 +1,765 @@
+.\" This manpage is copyright (C) 1992 Drew Eckhardt,
+.\" copyright (C) 1995 Michael Shields,
+.\" copyright (C) 2001 Paul Sheer,
+.\" copyright (C) 2006, 2019 Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified 1993-07-24 by Rik Faith <faith@cs.unc.edu>
+.\" Modified 1995-05-18 by Jim Van Zandt <jrv@vanzandt.mv.com>
+.\" Sun Feb 11 14:07:00 MET 1996 Martin Schulze <joey@linux.de>
+.\" * layout slightly modified
+.\"
+.\" Modified Mon Oct 21 23:05:29 EDT 1996 by Eric S. Raymond <esr@thyrsus.com>
+.\" Modified Thu Feb 24 01:41:09 CET 2000 by aeb
+.\" Modified Thu Feb 9 22:32:09 CET 2001 by bert hubert <ahu@ds9a.nl>, aeb
+.\" Modified Mon Nov 11 14:35:00 PST 2002 by Ben Woodard <ben@zork.net>
+.\" 2005-03-11, mtk, modified pselect() text (it is now a system
+.\" call in Linux 2.6.16).
+.\"
+.TH select 2 2023-05-03 "Linux man-pages 6.05.01"
+.SH NAME
+select, pselect, FD_CLR, FD_ISSET, FD_SET, FD_ZERO, fd_set \-
+synchronous I/O multiplexing
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/select.h>
+.PP
+.BR typedef " /* ... */ " fd_set;
+.PP
+.BI "int select(int " nfds ", fd_set *_Nullable restrict " readfds ,
+.BI " fd_set *_Nullable restrict " writefds ,
+.BI " fd_set *_Nullable restrict " exceptfds ,
+.BI " struct timeval *_Nullable restrict " timeout );
+.PP
+.BI "void FD_CLR(int " fd ", fd_set *" set );
+.BI "int FD_ISSET(int " fd ", fd_set *" set );
+.BI "void FD_SET(int " fd ", fd_set *" set );
+.BI "void FD_ZERO(fd_set *" set );
+.PP
+.BI "int pselect(int " nfds ", fd_set *_Nullable restrict " readfds ,
+.BI " fd_set *_Nullable restrict " writefds ,
+.BI " fd_set *_Nullable restrict " exceptfds ,
+.BI " const struct timespec *_Nullable restrict " timeout ,
+.BI " const sigset_t *_Nullable restrict " sigmask );
+.fi
+.PP
+.RS -4
+Feature Test Macro Requirements for glibc (see
+.BR feature_test_macros (7)):
+.RE
+.PP
+.BR pselect ():
+.nf
+ _POSIX_C_SOURCE >= 200112L
+.fi
+.SH DESCRIPTION
+.BR "WARNING" :
+.BR select ()
+can monitor only file descriptor numbers that are less than
+.B FD_SETSIZE
+(1024)\[em]an unreasonably low limit for many modern applications\[em]and
+this limitation will not change.
+All modern applications should instead use
+.BR poll (2)
+or
+.BR epoll (7),
+which do not suffer this limitation.
+.PP
+.BR select ()
+allows a program to monitor multiple file descriptors,
+waiting until one or more of the file descriptors become "ready"
+for some class of I/O operation (e.g., input possible).
+A file descriptor is considered ready if it is possible to
+perform a corresponding I/O operation (e.g.,
+.BR read (2),
+or a sufficiently small
+.BR write (2))
+without blocking.
+.\"
+.SS fd_set
+A structure type that can represent a set of file descriptors.
+According to POSIX,
+the maximum number of file descriptors in an
+.I fd_set
+structure is the value of the macro
+.BR FD_SETSIZE .
+.\"
+.SS File descriptor sets
+The principal arguments of
+.BR select ()
+are three "sets" of file descriptors (declared with the type
+.IR fd_set ),
+which allow the caller to wait for three classes of events
+on the specified set of file descriptors.
+Each of the
+.I fd_set
+arguments may be specified as NULL if no file descriptors are
+to be watched for the corresponding class of events.
+.PP
+.BR "Note well" :
+Upon return, each of the file descriptor sets is modified in place
+to indicate which file descriptors are currently "ready".
+Thus, if using
+.BR select ()
+within a loop, the sets \fImust be reinitialized\fP before each call.
+.PP
+The contents of a file descriptor set can be manipulated
+using the following macros:
+.TP
+.BR FD_ZERO ()
+This macro clears (removes all file descriptors from)
+.IR set .
+It should be employed as the first step in initializing a file descriptor set.
+.TP
+.BR FD_SET ()
+This macro adds the file descriptor
+.I fd
+to
+.IR set .
+Adding a file descriptor that is already present in the set is a no-op,
+and does not produce an error.
+.TP
+.BR FD_CLR ()
+This macro removes the file descriptor
+.I fd
+from
+.IR set .
+Removing a file descriptor that is not present in the set is a no-op,
+and does not produce an error.
+.TP
+.BR FD_ISSET ()
+.BR select ()
+modifies the contents of the sets according to the rules
+described below.
+After calling
+.BR select (),
+the
+.BR FD_ISSET ()
+macro
+can be used to test if a file descriptor is still present in a set.
+.BR FD_ISSET ()
+returns nonzero if the file descriptor
+.I fd
+is present in
+.IR set ,
+and zero if it is not.
+.\"
+.SS Arguments
+The arguments of
+.BR select ()
+are as follows:
+.TP
+.I readfds
+The file descriptors in this set are watched to see if they are
+ready for reading.
+A file descriptor is ready for reading if a read operation will not
+block; in particular, a file descriptor is also ready on end-of-file.
+.IP
+After
+.BR select ()
+has returned, \fIreadfds\fP will be
+cleared of all file descriptors except for those that are ready for reading.
+.TP
+.I writefds
+The file descriptors in this set are watched to see if they are
+ready for writing.
+A file descriptor is ready for writing if a write operation will not block.
+However, even if a file descriptor is indicated as writable,
+a large write may still block.
+.IP
+After
+.BR select ()
+has returned, \fIwritefds\fP will be
+cleared of all file descriptors except for those that are ready for writing.
+.TP
+.I exceptfds
+The file descriptors in this set are watched for "exceptional conditions".
+For examples of some exceptional conditions, see the discussion of
+.B POLLPRI
+in
+.BR poll (2).
+.IP
+After
+.BR select ()
+has returned,
+\fIexceptfds\fP will be cleared of all file descriptors except for those
+for which an exceptional condition has occurred.
+.TP
+.I nfds
+This argument should be set to the highest-numbered file descriptor in any
+of the three sets, plus 1.
+The indicated file descriptors in each set are checked, up to this limit
+(but see BUGS).
+.TP
+.I timeout
+The
+.I timeout
+argument is a
+.I timeval
+structure (shown below) that specifies the interval that
+.BR select ()
+should block waiting for a file descriptor to become ready.
+The call will block until either:
+.RS
+.IP \[bu] 3
+a file descriptor becomes ready;
+.IP \[bu]
+the call is interrupted by a signal handler; or
+.IP \[bu]
+the timeout expires.
+.RE
+.IP
+Note that the
+.I timeout
+interval will be rounded up to the system clock granularity,
+and kernel scheduling delays mean that the blocking interval
+may overrun by a small amount.
+.IP
+If both fields of the
+.I timeval
+structure are zero, then
+.BR select ()
+returns immediately.
+(This is useful for polling.)
+.IP
+If
+.I timeout
+is specified as NULL,
+.BR select ()
+blocks indefinitely waiting for a file descriptor to become ready.
+.\"
+.SS pselect()
+The
+.BR pselect ()
+system call allows an application to safely wait until either
+a file descriptor becomes ready or until a signal is caught.
+.PP
+The operation of
+.BR select ()
+and
+.BR pselect ()
+is identical, other than these three differences:
+.IP \[bu] 3
+.BR select ()
+uses a timeout that is a
+.I struct timeval
+(with seconds and microseconds), while
+.BR pselect ()
+uses a
+.I struct timespec
+(with seconds and nanoseconds).
+.IP \[bu]
+.BR select ()
+may update the
+.I timeout
+argument to indicate how much time was left.
+.BR pselect ()
+does not change this argument.
+.IP \[bu]
+.BR select ()
+has no
+.I sigmask
+argument, and behaves as
+.BR pselect ()
+called with NULL
+.IR sigmask .
+.PP
+.I sigmask
+is a pointer to a signal mask (see
+.BR sigprocmask (2));
+if it is not NULL, then
+.BR pselect ()
+first replaces the current signal mask by the one pointed to by
+.IR sigmask ,
+then does the "select" function, and then restores the original
+signal mask.
+(If
+.I sigmask
+is NULL,
+the signal mask is not modified during the
+.BR pselect ()
+call.)
+.PP
+Other than the difference in the precision of the
+.I timeout
+argument, the following
+.BR pselect ()
+call:
+.PP
+.in +4n
+.EX
+ready = pselect(nfds, &readfds, &writefds, &exceptfds,
+ timeout, &sigmask);
+.EE
+.in
+.PP
+is equivalent to
+.I atomically
+executing the following calls:
+.PP
+.in +4n
+.EX
+sigset_t origmask;
+\&
+pthread_sigmask(SIG_SETMASK, &sigmask, &origmask);
+ready = select(nfds, &readfds, &writefds, &exceptfds, timeout);
+pthread_sigmask(SIG_SETMASK, &origmask, NULL);
+.EE
+.in
+.PP
+The reason that
+.BR pselect ()
+is needed is that if one wants to wait for either a signal
+or for a file descriptor to become ready, then
+an atomic test is needed to prevent race conditions.
+(Suppose the signal handler sets a global flag and
+returns.
+Then a test of this global flag followed by a call of
+.BR select ()
+could hang indefinitely if the signal arrived just after the test
+but just before the call.
+By contrast,
+.BR pselect ()
+allows one to first block signals, handle the signals that have come in,
+then call
+.BR pselect ()
+with the desired
+.IR sigmask ,
+avoiding the race.)
+.SS The timeout
+The
+.I timeout
+argument for
+.BR select ()
+is a structure of the following type:
+.PP
+.in +4n
+.EX
+struct timeval {
+ time_t tv_sec; /* seconds */
+ suseconds_t tv_usec; /* microseconds */
+};
+.EE
+.in
+.PP
+The corresponding argument for
+.BR pselect ()
+is a
+.BR timespec (3)
+structure.
+.PP
+On Linux,
+.BR select ()
+modifies
+.I timeout
+to reflect the amount of time not slept; most other implementations
+do not do this.
+(POSIX.1 permits either behavior.)
+This causes problems both when Linux code which reads
+.I timeout
+is ported to other operating systems, and when code is ported to Linux
+that reuses a \fIstruct timeval\fP for multiple
+.BR select ()s
+in a loop without reinitializing it.
+Consider
+.I timeout
+to be undefined after
+.BR select ()
+returns.
+.\" .PP - it is rumored that:
+.\" On BSD, when a timeout occurs, the file descriptor bits are not changed.
+.\" - it is certainly true that:
+.\" Linux follows SUSv2 and sets the bit masks to zero upon a timeout.
+.SH RETURN VALUE
+On success,
+.BR select ()
+and
+.BR pselect ()
+return the number of file descriptors contained in the three returned
+descriptor sets (that is, the total number of bits that are set in
+.IR readfds ", " writefds ,
+and
+.IR exceptfds ).
+The return value may be zero if the timeout expired before any
+file descriptors became ready.
+.PP
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error;
+the file descriptor sets are unmodified,
+and
+.I timeout
+becomes undefined.
+.SH ERRORS
+.TP
+.B EBADF
+An invalid file descriptor was given in one of the sets.
+(Perhaps a file descriptor that was already closed,
+or one on which an error has occurred.)
+However, see BUGS.
+.TP
+.B EINTR
+A signal was caught; see
+.BR signal (7).
+.TP
+.B EINVAL
+.I nfds
+is negative or exceeds the
+.B RLIMIT_NOFILE
+resource limit (see
+.BR getrlimit (2)).
+.TP
+.B EINVAL
+The value contained within
+.I timeout
+is invalid.
+.TP
+.B ENOMEM
+Unable to allocate memory for internal tables.
+.SH VERSIONS
+On some other UNIX systems,
+.\" Darwin, according to a report by Jeremy Sequoia, relayed by Josh Triplett
+.BR select ()
+can fail with the error
+.B EAGAIN
+if the system fails to allocate kernel-internal resources, rather than
+.B ENOMEM
+as Linux does.
+POSIX specifies this error for
+.BR poll (2),
+but not for
+.BR select ().
+Portable programs may wish to check for
+.B EAGAIN
+and loop, just as with
+.BR EINTR .
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+.TP
+.BR select ()
+POSIX.1-2001, 4.4BSD (first appeared in 4.2BSD).
+.IP
+Generally portable to/from
+non-BSD systems supporting clones of the BSD socket layer (including
+System\ V variants).
+However, note that the System\ V variant typically
+sets the timeout variable before returning, but the BSD variant does not.
+.TP
+.BR pselect ()
+Linux 2.6.16.
+POSIX.1g, POSIX.1-2001.
+.IP
+Prior to this,
+it was emulated in glibc (but see BUGS).
+.TP
+.B fd_set
+POSIX.1-2001.
+.SH NOTES
+The following header also provides the
+.I fd_set
+type:
+.IR <sys/time.h> .
+.PP
+An
+.I fd_set
+is a fixed size buffer.
+Executing
+.BR FD_CLR ()
+or
+.BR FD_SET ()
+with a value of
+.I fd
+that is negative or is equal to or larger than
+.B FD_SETSIZE
+will result
+in undefined behavior.
+Moreover, POSIX requires
+.I fd
+to be a valid file descriptor.
+.PP
+The operation of
+.BR select ()
+and
+.BR pselect ()
+is not affected by the
+.B O_NONBLOCK
+flag.
+.\"
+.SS The self-pipe trick
+On systems that lack
+.BR pselect (),
+reliable (and more portable) signal trapping can be achieved
+using the self-pipe trick.
+In this technique,
+a signal handler writes a byte to a pipe whose other end
+is monitored by
+.BR select ()
+in the main program.
+(To avoid possibly blocking when writing to a pipe that may be full
+or reading from a pipe that may be empty,
+nonblocking I/O is used when reading from and writing to the pipe.)
+.\"
+.SS Emulating usleep(3)
+Before the advent of
+.BR usleep (3),
+some code employed a call to
+.BR select ()
+with all three sets empty,
+.I nfds
+zero, and a non-NULL
+.I timeout
+as a fairly portable way to sleep with subsecond precision.
+.\"
+.SS Correspondence between select() and poll() notifications
+Within the Linux kernel source,
+.\" fs/select.c
+we find the following definitions which show the correspondence
+between the readable, writable, and exceptional condition notifications of
+.BR select ()
+and the event notifications provided by
+.BR poll (2)
+and
+.BR epoll (7):
+.PP
+.in +4n
+.EX
+#define POLLIN_SET (EPOLLRDNORM | EPOLLRDBAND | EPOLLIN |
+ EPOLLHUP | EPOLLERR)
+ /* Ready for reading */
+#define POLLOUT_SET (EPOLLWRBAND | EPOLLWRNORM | EPOLLOUT |
+ EPOLLERR)
+ /* Ready for writing */
+#define POLLEX_SET (EPOLLPRI)
+ /* Exceptional condition */
+.EE
+.in
+.\"
+.SS Multithreaded applications
+If a file descriptor being monitored by
+.BR select ()
+is closed in another thread, the result is unspecified.
+On some UNIX systems,
+.BR select ()
+unblocks and returns, with an indication that the file descriptor is ready
+(a subsequent I/O operation will likely fail with an error,
+unless another process reopens the file descriptor between the time
+.BR select ()
+returned and the I/O operation is performed).
+On Linux (and some other systems),
+closing the file descriptor in another thread has no effect on
+.BR select ().
+In summary, any application that relies on a particular behavior
+in this scenario must be considered buggy.
+.\"
+.SS C library/kernel differences
+The Linux kernel allows file descriptor sets of arbitrary size,
+determining the length of the sets to be checked from the value of
+.IR nfds .
+However, in the glibc implementation, the
+.I fd_set
+type is fixed in size.
+See also BUGS.
+.PP
+The
+.BR pselect ()
+interface described in this page is implemented by glibc.
+The underlying Linux system call is named
+.BR pselect6 ().
+This system call has somewhat different behavior from the glibc
+wrapper function.
+.PP
+The Linux
+.BR pselect6 ()
+system call modifies its
+.I timeout
+argument.
+However, the glibc wrapper function hides this behavior
+by using a local variable for the timeout argument that
+is passed to the system call.
+Thus, the glibc
+.BR pselect ()
+function does not modify its
+.I timeout
+argument;
+this is the behavior required by POSIX.1-2001.
+.PP
+The final argument of the
+.BR pselect6 ()
+system call is not a
+.I "sigset_t\ *"
+pointer, but is instead a structure of the form:
+.PP
+.in +4n
+.EX
+struct {
+ const kernel_sigset_t *ss; /* Pointer to signal set */
+ size_t ss_len; /* Size (in bytes) of object
+ pointed to by \[aq]ss\[aq] */
+};
+.EE
+.in
+.PP
+This allows the system call to obtain both
+a pointer to the signal set and its size,
+while allowing for the fact that most architectures
+support a maximum of 6 arguments to a system call.
+See
+.BR sigprocmask (2)
+for a discussion of the difference between the kernel and libc
+notion of the signal set.
+.\"
+.SS Historical glibc details
+glibc 2.0 provided an incorrect version of
+.BR pselect ()
+that did not take a
+.I sigmask
+argument.
+.PP
+From glibc 2.1 to glibc 2.2.1,
+one must define
+.B _GNU_SOURCE
+in order to obtain the declaration of
+.BR pselect ()
+from
+.IR <sys/select.h> .
+.SH BUGS
+POSIX allows an implementation to define an upper limit,
+advertised via the constant
+.BR FD_SETSIZE ,
+on the range of file descriptors that can be specified
+in a file descriptor set.
+The Linux kernel imposes no fixed limit, but the glibc implementation makes
+.I fd_set
+a fixed-size type, with
+.B FD_SETSIZE
+defined as 1024, and the
+.BR FD_* ()
+macros operating according to that limit.
+To monitor file descriptors greater than 1023, use
+.BR poll (2)
+or
+.BR epoll (7)
+instead.
+.PP
+The implementation of the
+.I fd_set
+arguments as value-result arguments is a design error that is avoided in
+.BR poll (2)
+and
+.BR epoll (7).
+.PP
+According to POSIX,
+.BR select ()
+should check all specified file descriptors in the three file descriptor sets,
+up to the limit
+.IR nfds\-1 .
+However, the current implementation ignores any file descriptor in
+these sets that is greater than the maximum file descriptor number
+that the process currently has open.
+According to POSIX, any such file descriptor that is specified in one
+of the sets should result in the error
+.BR EBADF .
+.PP
+Starting with glibc 2.1, glibc provided an emulation of
+.BR pselect ()
+that was implemented using
+.BR sigprocmask (2)
+and
+.BR select ().
+This implementation remained vulnerable to the very race condition that
+.BR pselect ()
+was designed to prevent.
+Modern versions of glibc use the (race-free)
+.BR pselect ()
+system call on kernels where it is provided.
+.PP
+On Linux,
+.BR select ()
+may report a socket file descriptor as "ready for reading", while
+nevertheless a subsequent read blocks.
+This could for example
+happen when data has arrived but upon examination has the wrong
+checksum and is discarded.
+There may be other circumstances
+in which a file descriptor is spuriously reported as ready.
+.\" Stevens discusses a case where accept can block after select
+.\" returns successfully because of an intervening RST from the client.
+Thus it may be safer to use
+.B O_NONBLOCK
+on sockets that should not block.
+.\" Maybe the kernel should have returned EIO in such a situation?
+.PP
+On Linux,
+.BR select ()
+also modifies
+.I timeout
+if the call is interrupted by a signal handler (i.e., the
+.B EINTR
+error return).
+This is not permitted by POSIX.1.
+The Linux
+.BR pselect ()
+system call has the same behavior,
+but the glibc wrapper hides this behavior by internally copying the
+.I timeout
+to a local variable and passing that variable to the system call.
+.SH EXAMPLES
+.\" SRC BEGIN (select.c)
+.EX
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/select.h>
+\&
+int
+main(void)
+{
+    int             retval;
+    fd_set          rfds;
+    struct timeval  tv;
+\&
+    /* Watch stdin (fd 0) to see when it has input. */
+    FD_ZERO(&rfds);
+    FD_SET(0, &rfds);
+\&
+    /* Wait up to five seconds. */
+    tv.tv_sec = 5;
+    tv.tv_usec = 0;
+\&
+    /* nfds is the highest\-numbered descriptor being watched (0) plus 1. */
+    retval = select(1, &rfds, NULL, NULL, &tv);
+    /* Don\[aq]t rely on the value of tv now! */
+\&
+    if (retval == \-1)
+        perror("select()");
+    else if (retval)
+        printf("Data is available now.\en");
+        /* FD_ISSET(0, &rfds) will be true. */
+    else
+        printf("No data within five seconds.\en");
+\&
+    /* A failed select() is reported through the exit status, too. */
+    exit(retval == \-1 ? EXIT_FAILURE : EXIT_SUCCESS);
+}
+.EE
+.\" SRC END
+.SH SEE ALSO
+.BR accept (2),
+.BR connect (2),
+.BR poll (2),
+.BR read (2),
+.BR recv (2),
+.BR restart_syscall (2),
+.BR send (2),
+.BR sigprocmask (2),
+.BR write (2),
+.BR timespec (3),
+.BR epoll (7),
+.BR time (7)
+.PP
+For a tutorial with discussion and examples, see
+.BR select_tut (2).
diff --git a/man2/select_tut.2 b/man2/select_tut.2
new file mode 100644
index 0000000..e860de3
--- /dev/null
+++ b/man2/select_tut.2
@@ -0,0 +1,638 @@
+.\" This manpage is copyright (C) 2001 Paul Sheer.
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" very minor changes, aeb
+.\"
+.\" Modified 5 June 2002, Michael Kerrisk <mtk.manpages@gmail.com>
+.\" 2006-05-13, mtk, removed much material that is redundant with select.2
+.\" various other changes
+.\" 2008-01-26, mtk, substantial changes and rewrites
+.\"
+.TH SELECT_TUT 2 2023-05-03 "Linux man-pages 6.05.01"
+.SH NAME
+select, pselect \- synchronous I/O multiplexing
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+See
+.BR select (2).
+.SH DESCRIPTION
+The
+.BR select ()
+and
+.BR pselect ()
+system calls are used to efficiently monitor multiple file descriptors,
+to see if any of them is, or becomes, "ready";
+that is, to see whether I/O becomes possible,
+or an "exceptional condition" has occurred on any of the file descriptors.
+.PP
+This page provides background and tutorial information
+on the use of these system calls.
+For details of the arguments and semantics of
+.BR select ()
+and
+.BR pselect (),
+see
+.BR select (2).
+.\"
+.SS Combining signal and data events
+.BR pselect ()
+is useful if you are waiting for a signal as well as
+for file descriptor(s) to become ready for I/O.
+Programs that receive signals
+normally use the signal handler only to raise a global flag.
+The global flag will indicate that the event must be processed
+in the main loop of the program.
+A signal will cause the
+.BR select ()
+(or
+.BR pselect ())
+call to return with \fIerrno\fP set to \fBEINTR\fP.
+This behavior is essential so that signals can be processed
+in the main loop of the program, otherwise
+.BR select ()
+would block indefinitely.
+.PP
+Now, somewhere
+in the main loop will be a conditional to check the global flag.
+So we must ask:
+what if a signal arrives after the conditional, but before the
+.BR select ()
+call?
+The answer is that
+.BR select ()
+would block indefinitely, even though an event is actually pending.
+This race condition is solved by the
+.BR pselect ()
+call.
+This call can be used to set the signal mask to a set of signals
+that are to be received only within the
+.BR pselect ()
+call.
+For instance, let us say that the event in question
+was the exit of a child process.
+Before the start of the main loop, we
+would block \fBSIGCHLD\fP using
+.BR sigprocmask (2).
+Our
+.BR pselect ()
+call would enable
+.B SIGCHLD
+by using an empty signal mask.
+Our program would look like:
+.PP
+.EX
+static volatile sig_atomic_t got_SIGCHLD = 0;
+\&
+static void
+child_sig_handler(int sig)
+{
+    got_SIGCHLD = 1;    /* Just raise the flag; the main loop reacts to it */
+}
+\&
+int
+main(int argc, char *argv[])
+{
+    int               ready;
+    fd_set            readfds, writefds, exceptfds;
+    sigset_t          blocked, unblocked;
+    struct sigaction  act;
+\&
+    sigemptyset(&blocked);
+    sigaddset(&blocked, SIGCHLD);
+    if (sigprocmask(SIG_BLOCK, &blocked, NULL) == \-1) {
+        perror("sigprocmask");
+        exit(EXIT_FAILURE);
+    }
+\&
+    sigemptyset(&act.sa_mask);
+    act.sa_handler = child_sig_handler;
+    act.sa_flags = 0;
+    if (sigaction(SIGCHLD, &act, NULL) == \-1) {
+        perror("sigaction");
+        exit(EXIT_FAILURE);
+    }
+\&
+    sigemptyset(&unblocked);    /* Mask in effect during pselect() */
+\&
+    while (1) {                 /* main loop */
+        /* (Code omitted.)  readfds, writefds, and exceptfds must be
+           initialized here, before each pselect() call. */
+\&
+        ready = pselect(nfds, &readfds, &writefds, &exceptfds,
+                        NULL, &unblocked);
+        if (ready == \-1 && errno != EINTR) {
+            /* Handle error */
+        }
+\&
+        if (got_SIGCHLD) {
+            got_SIGCHLD = 0;
+\&
+            /* (Code omitted.)  Deal with the signalled event here;
+               e.g., wait() for all terminated children. */
+        }
+\&
+        /* main body of program */
+    }
+}
+.EE
+.SS Practical
+So what is the point of
+.BR select ()?
+Can't I just read and write to my file descriptors whenever I want?
+The point of
+.BR select ()
+is that it watches
+multiple descriptors at the same time and properly puts the process to
+sleep if there is no activity.
+UNIX programmers often find
+themselves in a position where they have to handle I/O from more than one
+file descriptor where the data flow may be intermittent.
+If you were to merely create a sequence of
+.BR read (2)
+and
+.BR write (2)
+calls, you would
+find that one of your calls may block waiting for data from/to a file
+descriptor, while another file descriptor is unused though ready for I/O.
+.BR select ()
+efficiently copes with this situation.
+.SS Select law
+Many people who try to use
+.BR select ()
+come across behavior that is
+difficult to understand and produces nonportable or borderline results.
+For instance, the example program below is carefully written not to
+block at any point, even though it does not set its file descriptors to
+nonblocking mode.
+It is easy to introduce
+subtle errors that will remove the advantage of using
+.BR select (),
+so here is a list of essentials to watch for when using
+.BR select ().
+.TP 4
+1.
+You should always try to use
+.BR select ()
+without a timeout.
+Your program
+should have nothing to do if there is no data available.
+Code that
+depends on timeouts is not usually portable and is difficult to debug.
+.TP
+2.
+The value \fInfds\fP must be properly calculated for efficiency, as
+explained in \fBselect\fP(2).
+.TP
+3.
+No file descriptor must be added to any set if you do not intend
+to check its result after the
+.BR select ()
+call, and respond appropriately.
+See next rule.
+.TP
+4.
+After
+.BR select ()
+returns, all file descriptors in all sets
+should be checked to see if they are ready.
+.TP
+5.
+The functions
+.BR read (2),
+.BR recv (2),
+.BR write (2),
+and
+.BR send (2)
+do \fInot\fP necessarily read/write the full amount of data
+that you have requested.
+If they do read/write the full amount, it's
+because you have a low traffic load and a fast stream.
+This is not always going to be the case.
+You should cope with the case of your
+functions managing to send or receive only a single byte.
+.TP
+6.
+Never read/write only in single bytes at a time unless you are really
+sure that you have a small amount of data to process.
+It is extremely
+inefficient not to read/write as much data as you can buffer each time.
+The buffers in the example below are 1024 bytes although they could
+easily be made larger.
+.TP
+7.
+Calls to
+.BR read (2),
+.BR recv (2),
+.BR write (2),
+.BR send (2),
+and
+.BR select ()
+can fail with the error
+\fBEINTR\fP,
+and calls to
+.BR read (2),
+.BR recv (2),
+.BR write (2),
+and
+.BR send (2)
+can fail with
+.I errno
+set to \fBEAGAIN\fP (\fBEWOULDBLOCK\fP).
+These results must be properly managed (not done properly in the example below).
+If your program is not going to receive any signals, then
+it is unlikely you will get \fBEINTR\fP.
+If your program does not set nonblocking I/O,
+you will not get \fBEAGAIN\fP.
+.\" Nonetheless, you should still cope with these errors for completeness.
+.TP
+8.
+Never call
+.BR read (2),
+.BR recv (2),
+.BR write (2),
+or
+.BR send (2)
+with a buffer length of zero.
+.TP
+9.
+If the functions
+.BR read (2),
+.BR recv (2),
+.BR write (2),
+and
+.BR send (2)
+fail with errors other than those listed in \fB7.\fP,
+or one of the input functions returns 0, indicating end of file,
+then you should \fInot\fP pass that file descriptor to
+.BR select ()
+again.
+In the example below,
+I close the file descriptor immediately, and then set it to \-1
+to prevent it being included in a set.
+.TP
+10.
+The timeout value must be initialized with each new call to
+.BR select (),
+since some operating systems modify the structure.
+.BR pselect ()
+however does not modify its timeout structure.
+.TP
+11.
+Since
+.BR select ()
+modifies its file descriptor sets,
+if the call is being used in a loop,
+then the sets must be reinitialized before each call.
+.\" "I have heard" does not fill me with confidence, and doesn't
+.\" belong in a man page, so I've commented this point out.
+.\" .TP
+.\" 11.
+.\" I have heard that the Windows socket layer does not cope with OOB data
+.\" properly.
+.\" It also does not cope with
+.\" .BR select ()
+.\" calls when no file descriptors are set at all.
+.\" Having no file descriptors set is a useful
+.\" way to sleep the process with subsecond precision by using the timeout.
+.\" (See further on.)
+.SH RETURN VALUE
+See
+.BR select (2).
+.SH NOTES
+Generally speaking,
+all operating systems that support sockets also support
+.BR select ().
+.BR select ()
+can be used to solve
+many problems in a portable and efficient way that naive programmers try
+to solve in a more complicated manner using
+threads, forking, IPCs, signals, memory sharing, and so on.
+.PP
+The
+.BR poll (2)
+system call has the same functionality as
+.BR select (),
+and is somewhat more efficient when monitoring sparse
+file descriptor sets.
+It is nowadays widely available, but historically was less portable than
+.BR select ().
+.PP
+The Linux-specific
+.BR epoll (7)
+API provides an interface that is more efficient than
+.BR select (2)
+and
+.BR poll (2)
+when monitoring large numbers of file descriptors.
+.SH EXAMPLES
+Here is an example that better demonstrates the true utility of
+.BR select ().
+The listing below is a TCP forwarding program that forwards
+from one TCP port to another.
+.PP
+.\" SRC BEGIN (select.c)
+.EX
+#include <arpa/inet.h>
+#include <errno.h>
+#include <netinet/in.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/select.h>
+#include <sys/socket.h>
+#include <unistd.h>
+\&
+static int forward_port;
+\&
+#undef max
+#define max(x, y) ((x) > (y) ? (x) : (y))
+\&
+static int
+listen_socket(int listen_port)
+{
+    int                 lfd;
+    int                 yes = 1;
+    const char          *failed = NULL;  /* Name of the call that failed */
+    struct sockaddr_in  addr;
+\&
+    /* Return a TCP socket listening on \[aq]listen_port\[aq], or \-1 on error. */
+\&
+    lfd = socket(AF_INET, SOCK_STREAM, 0);
+    if (lfd == \-1) {
+        perror("socket");
+        return \-1;
+    }
+\&
+    memset(&addr, 0, sizeof(addr));
+    addr.sin_family = AF_INET;
+    addr.sin_port = htons(listen_port);
+    if (setsockopt(lfd, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) == \-1)
+        failed = "setsockopt";
+    else if (bind(lfd, (struct sockaddr *) &addr, sizeof(addr)) == \-1)
+        failed = "bind";
+    else if (listen(lfd, 10) == \-1)
+        failed = "listen";
+\&
+    if (failed != NULL) {
+        perror(failed);
+        close(lfd);
+        return \-1;
+    }
+\&
+    printf("accepting connections on port %d\en", listen_port);
+    return lfd;
+}
+\&
+static int
+connect_socket(int connect_port, char *address)
+{
+    int                 sock;
+    struct sockaddr_in  peer;
+\&
+    sock = socket(AF_INET, SOCK_STREAM, 0);
+    if (sock == \-1) {
+        perror("socket");
+        return \-1;
+    }
+\&
+    /* Build the IPv4 address of the host to forward to. */
+    memset(&peer, 0, sizeof(peer));
+    peer.sin_family = AF_INET;
+    peer.sin_port = htons(connect_port);
+    if (inet_aton(address, &peer.sin_addr) == 0) {
+        fprintf(stderr, "inet_aton(): bad IP address format\en");
+        close(sock);
+        return \-1;
+    }
+\&
+    if (connect(sock, (struct sockaddr *) &peer, sizeof(peer)) == \-1) {
+        perror("connect()");
+        shutdown(sock, SHUT_RDWR);
+        close(sock);
+        return \-1;
+    }
+    return sock;
+}
+\&
+/* Shut down and close the connection whose file descriptor is held
+   in the variable \[aq]fd\[aq], then set that variable to \-1.  A variable
+   that is already negative is left untouched. */
+\&
+#define SHUT_FD(fd)  do {                               \e
+                         if ((fd) >= 0) {               \e
+                             shutdown((fd), SHUT_RDWR); \e
+                             close(fd);                 \e
+                             (fd) = \-1;                 \e
+                         }                              \e
+                     } while (0)
+\&
+/* Shorthands for the two connection descriptors used in main(). */
+#define SHUT_FD1  SHUT_FD(fd1)
+#define SHUT_FD2  SHUT_FD(fd2)
+\&
+#define BUF_SIZE 1024
+\&
+int
+main(int argc, char *argv[])
+{
+ int h;
+ int ready, nfds;
+ int fd1 = \-1, fd2 = \-1;
+ int buf1_avail = 0, buf1_written = 0;
+ int buf2_avail = 0, buf2_written = 0;
+ char buf1[BUF_SIZE], buf2[BUF_SIZE];
+ fd_set readfds, writefds, exceptfds;
+ ssize_t nbytes;
+\&
+ if (argc != 4) {
+ fprintf(stderr, "Usage\en\etfwd <listen\-port> "
+ "<forward\-to\-port> <forward\-to\-ip\-address>\en");
+ exit(EXIT_FAILURE);
+ }
+\&
+ signal(SIGPIPE, SIG_IGN);
+\&
+ forward_port = atoi(argv[2]);
+\&
+ h = listen_socket(atoi(argv[1]));
+ if (h == \-1)
+ exit(EXIT_FAILURE);
+\&
+ for (;;) {
+ nfds = 0;
+\&
+ FD_ZERO(&readfds);
+ FD_ZERO(&writefds);
+ FD_ZERO(&exceptfds);
+ FD_SET(h, &readfds);
+ nfds = max(nfds, h);
+\&
+ if (fd1 > 0 && buf1_avail < BUF_SIZE)
+ FD_SET(fd1, &readfds);
+ /* Note: nfds is updated below, when fd1 is added to
+ exceptfds. */
+ if (fd2 > 0 && buf2_avail < BUF_SIZE)
+ FD_SET(fd2, &readfds);
+\&
+ if (fd1 > 0 && buf2_avail \- buf2_written > 0)
+ FD_SET(fd1, &writefds);
+ if (fd2 > 0 && buf1_avail \- buf1_written > 0)
+ FD_SET(fd2, &writefds);
+\&
+ if (fd1 > 0) {
+ FD_SET(fd1, &exceptfds);
+ nfds = max(nfds, fd1);
+ }
+ if (fd2 > 0) {
+ FD_SET(fd2, &exceptfds);
+ nfds = max(nfds, fd2);
+ }
+\&
+ ready = select(nfds + 1, &readfds, &writefds, &exceptfds, NULL);
+\&
+ if (ready == \-1 && errno == EINTR)
+ continue;
+\&
+ if (ready == \-1) {
+ perror("select()");
+ exit(EXIT_FAILURE);
+ }
+\&
+ if (FD_ISSET(h, &readfds)) {
+ socklen_t addrlen;
+ struct sockaddr_in client_addr;
+ int fd;
+\&
+ addrlen = sizeof(client_addr);
+ memset(&client_addr, 0, addrlen);
+ fd = accept(h, (struct sockaddr *) &client_addr, &addrlen);
+ if (fd == \-1) {
+ perror("accept()");
+ } else {
+ SHUT_FD1;
+ SHUT_FD2;
+ buf1_avail = buf1_written = 0;
+ buf2_avail = buf2_written = 0;
+ fd1 = fd;
+ fd2 = connect_socket(forward_port, argv[3]);
+ if (fd2 == \-1)
+ SHUT_FD1;
+ else
+ printf("connect from %s\en",
+ inet_ntoa(client_addr.sin_addr));
+\&
+ /* Skip any events on the old, closed file
+ descriptors. */
+\&
+ continue;
+ }
+ }
+\&
+ /* NB: read OOB data before normal reads. */
+\&
+ if (fd1 > 0 && FD_ISSET(fd1, &exceptfds)) {
+ char c;
+\&
+ nbytes = recv(fd1, &c, 1, MSG_OOB);
+ if (nbytes < 1)
+ SHUT_FD1;
+ else
+ send(fd2, &c, 1, MSG_OOB);
+ }
+ if (fd2 > 0 && FD_ISSET(fd2, &exceptfds)) {
+ char c;
+\&
+ nbytes = recv(fd2, &c, 1, MSG_OOB);
+ if (nbytes < 1)
+ SHUT_FD2;
+ else
+ send(fd1, &c, 1, MSG_OOB);
+ }
+ if (fd1 > 0 && FD_ISSET(fd1, &readfds)) {
+ nbytes = read(fd1, buf1 + buf1_avail,
+ BUF_SIZE \- buf1_avail);
+ if (nbytes < 1)
+ SHUT_FD1;
+ else
+ buf1_avail += nbytes;
+ }
+ if (fd2 > 0 && FD_ISSET(fd2, &readfds)) {
+ nbytes = read(fd2, buf2 + buf2_avail,
+ BUF_SIZE \- buf2_avail);
+ if (nbytes < 1)
+ SHUT_FD2;
+ else
+ buf2_avail += nbytes;
+ }
+ if (fd1 > 0 && FD_ISSET(fd1, &writefds) && buf2_avail > 0) {
+ nbytes = write(fd1, buf2 + buf2_written,
+ buf2_avail \- buf2_written);
+ if (nbytes < 1)
+ SHUT_FD1;
+ else
+ buf2_written += nbytes;
+ }
+ if (fd2 > 0 && FD_ISSET(fd2, &writefds) && buf1_avail > 0) {
+ nbytes = write(fd2, buf1 + buf1_written,
+ buf1_avail \- buf1_written);
+ if (nbytes < 1)
+ SHUT_FD2;
+ else
+ buf1_written += nbytes;
+ }
+\&
+ /* Check if write data has caught read data. */
+\&
+ if (buf1_written == buf1_avail)
+ buf1_written = buf1_avail = 0;
+ if (buf2_written == buf2_avail)
+ buf2_written = buf2_avail = 0;
+\&
+ /* One side has closed the connection, keep
+ writing to the other side until empty. */
+\&
+ if (fd1 < 0 && buf1_avail \- buf1_written == 0)
+ SHUT_FD2;
+ if (fd2 < 0 && buf2_avail \- buf2_written == 0)
+ SHUT_FD1;
+ }
+ exit(EXIT_SUCCESS);
+}
+.EE
+.\" SRC END
+.PP
+The above program properly forwards most kinds of TCP connections
+including OOB signal data transmitted by \fBtelnet\fP servers.
+It handles the tricky problem of having data flow in both directions
+simultaneously.
+You might think it more efficient to use a
+.BR fork (2)
+call and devote a process to each stream.
+This becomes more tricky than you might suspect.
+Another idea is to set nonblocking I/O using
+.BR fcntl (2).
+This also has its problems because you end up using
+inefficient timeouts.
+.PP
+The program does not handle more than one connection at a time,
+although it could easily be extended to do this with a linked list
+of buffers\[em]one for each connection.
+At the moment, new
+connections cause the current connection to be dropped.
+.SH SEE ALSO
+.BR accept (2),
+.BR connect (2),
+.BR poll (2),
+.BR read (2),
+.BR recv (2),
+.BR select (2),
+.BR send (2),
+.BR sigprocmask (2),
+.BR write (2),
+.BR epoll (7)
+.\" .SH AUTHORS
+.\" This man page was written by Paul Sheer.
diff --git a/man2/semctl.2 b/man2/semctl.2
new file mode 100644
index 0000000..6069169
--- /dev/null
+++ b/man2/semctl.2
@@ -0,0 +1,623 @@
+'\" t
+.\" Copyright 1993 Giorgio Ciucci (giorgio@crcc.it)
+.\" and Copyright 2004, 2005 Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified Tue Oct 22 17:53:56 1996 by Eric S. Raymond <esr@thyrsus.com>
+.\" Modified Fri Jun 19 10:59:15 1998 by Andries Brouwer <aeb@cwi.nl>
+.\" Modified Sun Feb 18 01:59:29 2001 by Andries Brouwer <aeb@cwi.nl>
+.\" Modified 20 Dec 2001, Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Modified 21 Dec 2001, aeb
+.\" Modified 27 May 2004, Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Added notes on CAP_IPC_OWNER requirement
+.\" Modified 17 Jun 2004, Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Added notes on CAP_SYS_ADMIN requirement for IPC_SET and IPC_RMID
+.\" Modified, 11 Nov 2004, Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Language and formatting clean-ups
+.\" Rewrote semun text
+.\" Added semid_ds and ipc_perm structure definitions
+.\" 2005-08-02, mtk: Added IPC_INFO, SEM_INFO, SEM_STAT descriptions.
+.\" 2018-03-20, dbueso: Added SEM_STAT_ANY description.
+.\"
+.TH semctl 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+semctl \- System V semaphore control operations
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/sem.h>
+.PP
+.BI "int semctl(int " semid ", int " semnum ", int " cmd ", ...);"
+.fi
+.SH DESCRIPTION
+.BR semctl ()
+performs the control operation specified by
+.I cmd
+on the System\ V semaphore set identified by
+.IR semid ,
+or on the
+.IR semnum \-th
+semaphore of that set.
+(The semaphores in a set are numbered starting at 0.)
+.PP
+This function has three or four arguments, depending on
+.IR cmd .
+When there are four, the fourth has the type
+.IR "union semun" .
+The \fIcalling program\fP must define this union as follows:
+.PP
+.in +4n
+.EX
+union semun {
+ int val; /* Value for SETVAL */
+ struct semid_ds *buf; /* Buffer for IPC_STAT, IPC_SET */
+ unsigned short *array; /* Array for GETALL, SETALL */
+ struct seminfo *__buf; /* Buffer for IPC_INFO
+ (Linux\-specific) */
+};
+.EE
+.in
+.PP
+The
+.I semid_ds
+data structure is defined in \fI<sys/sem.h>\fP as follows:
+.PP
+.in +4n
+.EX
+struct semid_ds {
+ struct ipc_perm sem_perm; /* Ownership and permissions */
+ time_t sem_otime; /* Last semop time */
+ time_t sem_ctime; /* Creation time/time of last
+ modification via semctl() */
+ unsigned long sem_nsems; /* No. of semaphores in set */
+};
+.EE
+.in
+.PP
+The fields of the
+.I semid_ds
+structure are as follows:
+.TP 11
+.I sem_perm
+This is an
+.I ipc_perm
+structure (see below) that specifies the access permissions on the semaphore
+set.
+.TP
+.I sem_otime
+Time of last
+.BR semop (2)
+system call.
+.TP
+.I sem_ctime
+Time of creation of semaphore set or time of last
+.BR semctl ()
+.BR IPC_SET ,
+.BR SETVAL ,
+or
+.B SETALL
+operation.
+.TP
+.I sem_nsems
+Number of semaphores in the set.
+Each semaphore of the set is referenced by a nonnegative integer
+ranging from
+.B 0
+to
+.IR sem_nsems\-1 .
+.PP
+The
+.I ipc_perm
+structure is defined as follows
+(the highlighted fields are settable using
+.BR IPC_SET ):
+.PP
+.in +4n
+.EX
+struct ipc_perm {
+ key_t __key; /* Key supplied to semget(2) */
+ uid_t \fBuid\fP; /* Effective UID of owner */
+ gid_t \fBgid\fP; /* Effective GID of owner */
+ uid_t cuid; /* Effective UID of creator */
+ gid_t cgid; /* Effective GID of creator */
+ unsigned short \fBmode\fP; /* Permissions */
+ unsigned short __seq; /* Sequence number */
+};
+.EE
+.in
+.PP
+The least significant 9 bits of the
+.I mode
+field of the
+.I ipc_perm
+structure define the access permissions for the semaphore set.
+The permission bits are as follows:
+.TS
+l l.
+0400 Read by user
+0200 Write by user
+0040 Read by group
+0020 Write by group
+0004 Read by others
+0002 Write by others
+.TE
+.PP
+In effect, "write" means "alter" for a semaphore set.
+Bits 0100, 0010, and 0001 (the execute bits) are unused by the system.
+.PP
+Valid values for
+.I cmd
+are:
+.TP
+.B IPC_STAT
+Copy information from the kernel data structure associated with
+.I semid
+into the
+.I semid_ds
+structure pointed to by
+.IR arg.buf .
+The argument
+.I semnum
+is ignored.
+The calling process must have read permission on the semaphore set.
+.TP
+.B IPC_SET
+Write the values of some members of the
+.I semid_ds
+structure pointed to by
+.I arg.buf
+to the kernel data structure associated with this semaphore set,
+updating also its
+.I sem_ctime
+member.
+.IP
+The following members of the structure are updated:
+.IR sem_perm.uid ,
+.IR sem_perm.gid ,
+and (the least significant 9 bits of)
+.IR sem_perm.mode .
+.IP
+The effective UID of the calling process must match the owner
+.RI ( sem_perm.uid )
+or creator
+.RI ( sem_perm.cuid )
+of the semaphore set, or the caller must be privileged.
+The argument
+.I semnum
+is ignored.
+.TP
+.B IPC_RMID
+Immediately remove the semaphore set,
+awakening all processes blocked in
+.BR semop (2)
+calls on the set (with an error return and
+.I errno
+set to
+.BR EIDRM ).
+The effective user ID of the calling process must
+match the creator or owner of the semaphore set,
+or the caller must be privileged.
+The argument
+.I semnum
+is ignored.
+.TP
+.BR IPC_INFO " (Linux\-specific)"
+Return information about system-wide semaphore limits and
+parameters in the structure pointed to by
+.IR arg.__buf .
+This structure is of type
+.IR seminfo ,
+defined in
+.I <sys/sem.h>
+if the
+.B _GNU_SOURCE
+feature test macro is defined:
+.IP
+.in +4n
+.EX
+struct seminfo {
+ int semmap; /* Number of entries in semaphore
+ map; unused within kernel */
+ int semmni; /* Maximum number of semaphore sets */
+ int semmns; /* Maximum number of semaphores in all
+ semaphore sets */
+ int semmnu; /* System\-wide maximum number of undo
+ structures; unused within kernel */
+ int semmsl; /* Maximum number of semaphores in a
+ set */
+ int semopm; /* Maximum number of operations for
+ semop(2) */
+ int semume; /* Maximum number of undo entries per
+ process; unused within kernel */
+ int semusz; /* Size of struct sem_undo */
+ int semvmx; /* Maximum semaphore value */
+ int semaem; /* Max. value that can be recorded for
+ semaphore adjustment (SEM_UNDO) */
+};
+.EE
+.in
+.IP
+The
+.IR semmsl ,
+.IR semmns ,
+.IR semopm ,
+and
+.I semmni
+settings can be changed via
+.IR /proc/sys/kernel/sem ;
+see
+.BR proc (5)
+for details.
+.TP
+.BR SEM_INFO " (Linux-specific)"
+Return a
+.I seminfo
+structure containing the same information as for
+.BR IPC_INFO ,
+except that the following fields are returned with information
+about system resources consumed by semaphores: the
+.I semusz
+field returns the number of semaphore sets that currently exist
+on the system; and the
+.I semaem
+field returns the total number of semaphores in all semaphore sets
+on the system.
+.TP
+.BR SEM_STAT " (Linux-specific)"
+Return a
+.I semid_ds
+structure as for
+.BR IPC_STAT .
+However, the
+.I semid
+argument is not a semaphore identifier, but instead an index into
+the kernel's internal array that maintains information about
+all semaphore sets on the system.
+.TP
+.BR SEM_STAT_ANY " (Linux-specific, since Linux 4.17)"
+Return a
+.I semid_ds
+structure as for
+.BR SEM_STAT .
+However,
+.I sem_perm.mode
+is not checked for read access for
+.I semid
+meaning that any user can employ this operation (just as any user may read
+.I /proc/sysvipc/sem
+to obtain the same information).
+.TP
+.B GETALL
+Return
+.B semval
+(i.e., the current value)
+for all semaphores of the set into
+.IR arg.array .
+The argument
+.I semnum
+is ignored.
+The calling process must have read permission on the semaphore set.
+.TP
+.B GETNCNT
+Return the
+.B semncnt
+value for the
+.IR semnum \-th
+semaphore of the set
+(i.e., the number of processes waiting for the semaphore's value to increase).
+The calling process must have read permission on the semaphore set.
+.TP
+.B GETPID
+Return the
+.B sempid
+value for the
+.IR semnum \-th
+semaphore of the set.
+This is the PID of the process that last performed an operation on
+that semaphore (but see NOTES).
+The calling process must have read permission on the semaphore set.
+.TP
+.B GETVAL
+Return
+.B semval
+(i.e., the semaphore value) for the
+.IR semnum \-th
+semaphore of the set.
+The calling process must have read permission on the semaphore set.
+.TP
+.B GETZCNT
+Return the
+.B semzcnt
+value for the
+.IR semnum \-th
+semaphore of the set
+(i.e., the number of processes waiting for the semaphore value to become 0).
+The calling process must have read permission on the semaphore set.
+.TP
+.B SETALL
+Set the
+.B semval
+values for all semaphores of the set using
+.IR arg.array ,
+updating also the
+.I sem_ctime
+member of the
+.I semid_ds
+structure associated with the set.
+Undo entries (see
+.BR semop (2))
+are cleared for altered semaphores in all processes.
+If the changes to semaphore values would permit blocked
+.BR semop (2)
+calls in other processes to proceed, then those processes are woken up.
+The argument
+.I semnum
+is ignored.
+The calling process must have alter (write) permission on
+the semaphore set.
+.TP
+.B SETVAL
+Set the semaphore value
+.RB ( semval )
+to
+.I arg.val
+for the
+.IR semnum \-th
+semaphore of the set, updating also the
+.I sem_ctime
+member of the
+.I semid_ds
+structure associated with the set.
+Undo entries are cleared for altered semaphores in all processes.
+If the changes to semaphore values would permit blocked
+.BR semop (2)
+calls in other processes to proceed, then those processes are woken up.
+The calling process must have alter permission on the semaphore set.
+.SH RETURN VALUE
+On success,
+.BR semctl ()
+returns a nonnegative value depending on
+.I cmd
+as follows:
+.TP
+.B GETNCNT
+the value of
+.BR semncnt .
+.TP
+.B GETPID
+the value of
+.BR sempid .
+.TP
+.B GETVAL
+the value of
+.BR semval .
+.TP
+.B GETZCNT
+the value of
+.BR semzcnt .
+.TP
+.B IPC_INFO
+the index of the highest used entry in the
+kernel's internal array recording information about all
+semaphore sets.
+(This information can be used with repeated
+.B SEM_STAT
+or
+.B SEM_STAT_ANY
+operations to obtain information about all semaphore sets on the system.)
+.TP
+.B SEM_INFO
+as for
+.BR IPC_INFO .
+.TP
+.B SEM_STAT
+the identifier of the semaphore set whose index was given in
+.IR semid .
+.TP
+.B SEM_STAT_ANY
+as for
+.BR SEM_STAT .
+.PP
+All other
+.I cmd
+values return 0 on success.
+.PP
+On failure,
+.BR semctl ()
+returns \-1 and sets
+.I errno
+to indicate the error.
+.SH ERRORS
+.TP
+.B EACCES
+The argument
+.I cmd
+has one of the values
+.BR GETALL ,
+.BR GETPID ,
+.BR GETVAL ,
+.BR GETNCNT ,
+.BR GETZCNT ,
+.BR IPC_STAT ,
+.BR SEM_STAT ,
+.BR SEM_STAT_ANY ,
+.BR SETALL ,
+or
+.B SETVAL
+and the calling process does not have the required
+permissions on the semaphore set and does not have the
+.B CAP_IPC_OWNER
+capability in the user namespace that governs its IPC namespace.
+.TP
+.B EFAULT
+The address pointed to by
+.I arg.buf
+or
+.I arg.array
+isn't accessible.
+.TP
+.B EIDRM
+The semaphore set was removed.
+.TP
+.B EINVAL
+Invalid value for
+.I cmd
+or
+.IR semid .
+Or: for a
+.B SEM_STAT
+operation, the index value specified in
+.I semid
+referred to an array slot that is currently unused.
+.TP
+.B EPERM
+The argument
+.I cmd
+has the value
+.B IPC_SET
+or
+.B IPC_RMID
+but the effective user ID of the calling process is not the creator
+(as found in
+.IR sem_perm.cuid )
+or the owner
+(as found in
+.IR sem_perm.uid )
+of the semaphore set,
+and the process does not have the
+.B CAP_SYS_ADMIN
+capability.
+.TP
+.B ERANGE
+The argument
+.I cmd
+has the value
+.B SETALL
+or
+.B SETVAL
+and the value to which
+.B semval
+is to be set (for some semaphore of the set) is less than 0
+or greater than the implementation limit
+.BR SEMVMX .
+.SH VERSIONS
+POSIX.1 specifies the
+.\" POSIX.1-2001, POSIX.1-2008
+.I sem_nsems
+field of the
+.I semid_ds
+structure as having the type
+.IR "unsigned\ short" ,
+and the field is so defined on most other systems.
+It was also so defined on Linux 2.2 and earlier,
+but, since Linux 2.4, the field has the type
+.IR "unsigned\ long" .
+.\"
+.SS The sempid value
+POSIX.1 defines
+.I sempid
+as the "process ID of [the] last operation" on a semaphore,
+and explicitly notes that this value is set by a successful
+.BR semop (2)
+call, with the implication that no other interface affects the
+.I sempid
+value.
+.PP
+While some implementations conform to the behavior specified in POSIX.1,
+others do not.
+(The fault here probably lies with POSIX.1 inasmuch as it likely failed
+to capture the full range of existing implementation behaviors.)
+Various other implementations
+.\" At least OpenSolaris (and, one supposes, older Solaris) and Darwin
+also update
+.I sempid
+for the other operations that update the value of a semaphore: the
+.B SETVAL
+and
+.B SETALL
+operations, as well as the semaphore adjustments performed
+on process termination as a consequence of the use of the
+.B SEM_UNDO
+flag (see
+.BR semop (2)).
+.PP
+Linux also updates
+.I sempid
+for
+.B SETVAL
+operations and semaphore adjustments.
+However, somewhat inconsistently, up to and including Linux 4.5,
+the kernel did not update
+.I sempid
+for
+.B SETALL
+operations.
+This was rectified
+.\" commit a5f4db877177d2a3d7ae62a7bac3a5a27e083d7f
+in Linux 4.6.
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+POSIX.1-2001, SVr4.
+.\" SVr4 documents more error conditions EINVAL and EOVERFLOW.
+.PP
+Various fields in a \fIstruct semid_ds\fP were typed as
+.I short
+under Linux 2.2
+and have become
+.I long
+under Linux 2.4.
+To take advantage of this,
+a recompilation under glibc-2.1.91 or later should suffice.
+(The kernel distinguishes old and new calls by an
+.B IPC_64
+flag in
+.IR cmd .)
+.PP
+In some earlier versions of glibc, the
+.I semun
+union was defined in \fI<sys/sem.h>\fP, but POSIX.1 requires
+.\" POSIX.1-2001, POSIX.1-2008
+that the caller define this union.
+On versions of glibc where this union is \fInot\fP defined,
+the macro
+.B _SEM_SEMUN_UNDEFINED
+is defined in \fI<sys/sem.h>\fP.
+.SH NOTES
+The
+.BR IPC_INFO ,
+.BR SEM_STAT ,
+and
+.B SEM_INFO
+operations are used by the
+.BR ipcs (1)
+program to provide information on allocated resources.
+In the future these may be modified or moved to a
+.I /proc
+filesystem interface.
+.PP
+The following system limit on semaphore sets affects a
+.BR semctl ()
+call:
+.TP
+.B SEMVMX
+Maximum value for
+.BR semval :
+implementation dependent (32767).
+.PP
+For greater portability, it is best to always call
+.BR semctl ()
+with four arguments.
+.SH EXAMPLES
+See
+.BR shmop (2).
+.SH SEE ALSO
+.BR ipc (2),
+.BR semget (2),
+.BR semop (2),
+.BR capabilities (7),
+.BR sem_overview (7),
+.BR sysvipc (7)
diff --git a/man2/semget.2 b/man2/semget.2
new file mode 100644
index 0000000..bd2b693
--- /dev/null
+++ b/man2/semget.2
@@ -0,0 +1,434 @@
+.\" Copyright 1993 Giorgio Ciucci (giorgio@crcc.it)
+.\" and Copyright (C) 2020 Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified Tue Oct 22 17:54:56 1996 by Eric S. Raymond <esr@thyrsus.com>
+.\" Modified 1 Jan 2002, Martin Schulze <joey@infodrom.org>
+.\" Modified 4 Jan 2002, Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Modified, 27 May 2004, Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Added notes on capability requirements
+.\" Modified, 11 Nov 2004, Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Language and formatting clean-ups
+.\" Added notes on /proc files
+.\" Rewrote BUGS note about semget()'s failure to initialize
+.\" semaphore values
+.\"
+.TH semget 2 2023-05-03 "Linux man-pages 6.05.01"
+.SH NAME
+semget \- get a System V semaphore set identifier
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/sem.h>
+.fi
+.PP
+.BI "int semget(key_t " key ,
+.BI "int " nsems ,
+.BI "int " semflg );
+.SH DESCRIPTION
+The
+.BR semget ()
+system call returns the System\ V semaphore set identifier
+associated with the argument
+.IR key .
+It may be used either to obtain the identifier of a previously created
+semaphore set (when
+.I semflg
+is zero and
+.I key
+does not have the value
+.BR IPC_PRIVATE ),
+or to create a new set.
+.PP
+A new set of
+.I nsems
+semaphores is created if
+.I key
+has the value
+.B IPC_PRIVATE
+or if no existing semaphore set is associated with
+.I key
+and
+.B IPC_CREAT
+is specified in
+.IR semflg .
+.PP
+If
+.I semflg
+specifies both
+.B IPC_CREAT
+and
+.B IPC_EXCL
+and a semaphore set already exists for
+.IR key ,
+then
+.BR semget ()
+fails with
+.I errno
+set to
+.BR EEXIST .
+(This is analogous to the effect of the combination
+.B O_CREAT | O_EXCL
+for
+.BR open (2).)
+.PP
+Upon creation, the least significant 9 bits of the argument
+.I semflg
+define the permissions (for owner, group, and others)
+for the semaphore set.
+These bits have the same format, and the same
+meaning, as the
+.I mode
+argument of
+.BR open (2)
+(though the execute permissions are
+not meaningful for semaphores, and write permissions mean permission
+to alter semaphore values).
+.PP
+When creating a new semaphore set,
+.BR semget ()
+initializes the set's associated data structure,
+.I semid_ds
+(see
+.BR semctl (2)),
+as follows:
+.IP \[bu] 3
+.I sem_perm.cuid
+and
+.I sem_perm.uid
+are set to the effective user ID of the calling process.
+.IP \[bu]
+.I sem_perm.cgid
+and
+.I sem_perm.gid
+are set to the effective group ID of the calling process.
+.IP \[bu]
+The least significant 9 bits of
+.I sem_perm.mode
+are set to the least significant 9 bits of
+.IR semflg .
+.IP \[bu]
+.I sem_nsems
+is set to the value of
+.IR nsems .
+.IP \[bu]
+.I sem_otime
+is set to 0.
+.IP \[bu]
+.I sem_ctime
+is set to the current time.
+.PP
+The argument
+.I nsems
+can be 0
+(a don't care)
+when a semaphore set is not being created.
+Otherwise,
+.I nsems
+must be greater than 0
+and less than or equal to the maximum number of semaphores per semaphore set
+.RB ( SEMMSL ).
+.PP
+If the semaphore set already exists, the permissions are
+verified.
+.\" and a check is made to see if it is marked for destruction.
+.SH RETURN VALUE
+On success,
+.BR semget ()
+returns the semaphore set identifier (a nonnegative integer).
+On failure, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EACCES
+A semaphore set exists for
+.IR key ,
+but the calling process does not have permission to access the set,
+and does not have the
+.B CAP_IPC_OWNER
+capability in the user namespace that governs its IPC namespace.
+.TP
+.B EEXIST
+.B IPC_CREAT
+and
+.B IPC_EXCL
+were specified in
+.IR semflg ,
+but a semaphore set already exists for
+.IR key .
+.\" .TP
+.\" .B EIDRM
+.\" The semaphore set is marked to be deleted.
+.TP
+.B EINVAL
+.I nsems
+is less than 0 or greater than the limit on the number
+of semaphores per semaphore set
+.RB ( SEMMSL ).
+.TP
+.B EINVAL
+A semaphore set corresponding to
+.I key
+already exists, but
+.I nsems
+is larger than the number of semaphores in that set.
+.TP
+.B ENOENT
+No semaphore set exists for
+.I key
+and
+.I semflg
+did not specify
+.BR IPC_CREAT .
+.TP
+.B ENOMEM
+A semaphore set has to be created but the system does not have
+enough memory for the new data structure.
+.TP
+.B ENOSPC
+A semaphore set has to be created but the system limit for the maximum
+number of semaphore sets
+.RB ( SEMMNI ),
+or the system wide maximum number of semaphores
+.RB ( SEMMNS ),
+would be exceeded.
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+SVr4, POSIX.1-2001.
+.\" SVr4 documents additional error conditions EFBIG, E2BIG, EAGAIN,
+.\" ERANGE, EFAULT.
+.SH NOTES
+.B IPC_PRIVATE
+isn't a flag field but a
+.I key_t
+type.
+If this special value is used for
+.IR key ,
+the system call ignores all but the least significant 9 bits of
+.I semflg
+and creates a new semaphore set (on success).
+.\"
+.SS Semaphore initialization
+The values of the semaphores in a newly created set are indeterminate.
+(POSIX.1-2001 and POSIX.1-2008 are explicit on this point,
+although POSIX.1-2008 notes that a future version of the standard
+may require an implementation to initialize the semaphores to 0.)
+Although Linux, like many other implementations,
+initializes the semaphore values to 0,
+a portable application cannot rely on this:
+it should explicitly initialize the semaphores to the desired values.
+.\" In truth, every one of the many implementations that I've tested sets
+.\" the values to zero, but I suppose there is/was some obscure
+.\" implementation out there that does not.
+.PP
+Initialization can be done using
+.BR semctl (2)
+.B SETVAL
+or
+.B SETALL
+operation.
+Where multiple peers do not know who will be the first to
+initialize the set, checking for a nonzero
+.I sem_otime
+in the associated data structure retrieved by a
+.BR semctl (2)
+.B IPC_STAT
+operation can be used to avoid races.
+.\"
+.SS Semaphore limits
+The following limits on semaphore set resources affect the
+.BR semget ()
+call:
+.TP
+.B SEMMNI
+System-wide limit on the number of semaphore sets.
+Before Linux 3.19,
+the default value for this limit was 128.
+Since Linux 3.19,
+.\" commit e843e7d2c88b7db107a86bd2c7145dc715c058f4
+the default value is 32,000.
+On Linux, this limit can be read and modified via the fourth field of
+.IR /proc/sys/kernel/sem .
+.\" This /proc file is not available in Linux 2.2 and earlier -- MTK
+.TP
+.B SEMMSL
+Maximum number of semaphores per semaphore ID.
+Before Linux 3.19,
+the default value for this limit was 250.
+Since Linux 3.19,
+.\" commit e843e7d2c88b7db107a86bd2c7145dc715c058f4
+the default value is 32,000.
+On Linux, this limit can be read and modified via the first field of
+.IR /proc/sys/kernel/sem .
+.TP
+.B SEMMNS
+System-wide limit on the number of semaphores: policy dependent
+(on Linux, this limit can be read and modified via the second field of
+.IR /proc/sys/kernel/sem ).
+Note that the number of semaphores system-wide
+is also limited by the product of
+.B SEMMSL
+and
+.BR SEMMNI .
+.SH BUGS
+The name choice
+.B IPC_PRIVATE
+was perhaps unfortunate;
+.B IPC_NEW
+would more clearly show its function.
+.SH EXAMPLES
+The program shown below uses
+.BR semget ()
+to create a new semaphore set or retrieve the ID of an existing set.
+It generates the
+.I key
+for
+.BR semget ()
+using
+.BR ftok (3).
+The first two command-line arguments are used as the
+.I pathname
+and
+.I proj_id
+arguments for
+.BR ftok (3).
+The third command-line argument is an integer that specifies the
+.I nsems
+argument for
+.BR semget ().
+Command-line options can be used to specify the
+.B IPC_CREAT
+.RI ( \-c )
+and
+.B IPC_EXCL
+.RI ( \-x )
+flags for the call to
+.BR semget ().
+The usage of this program is demonstrated below.
+.PP
+We first create two files that will be used to generate keys using
+.BR ftok (3),
+create two semaphore sets using those files, and then list the sets using
+.BR ipcs (1):
+.PP
+.in +4n
+.EX
+$ \fBtouch mykey mykey2\fP
+$ \fB./t_semget \-c mykey p 1\fP
+ID = 9
+$ \fB./t_semget \-c mykey2 p 2\fP
+ID = 10
+$ \fBipcs \-s\fP
+\&
+\-\-\-\-\-\- Semaphore Arrays \-\-\-\-\-\-\-\-
+key semid owner perms nsems
+0x7004136d 9 mtk 600 1
+0x70041368 10 mtk 600 2
+.EE
+.in
+.PP
+Next, we demonstrate that when
+.BR semget ()
+is given the same
+.I key
+(as generated by the same arguments to
+.BR ftok (3)),
+it returns the ID of the already existing semaphore set:
+.PP
+.in +4n
+.EX
+$ \fB./t_semget \-c mykey p 1\fP
+ID = 9
+.EE
+.in
+.PP
+Finally, we demonstrate the kind of collision that can occur when
+.BR ftok (3)
+is given different
+.I pathname
+arguments that have the same inode number:
+.PP
+.in +4n
+.EX
+$ \fBln mykey link\fP
+$ \fBls \-i1 link mykey\fP
+2233197 link
+2233197 mykey
+$ \fB./t_semget link p 1\fP # Generates same key as \[aq]mykey\[aq]
+ID = 9
+.EE
+.in
+.SS Program source
+\&
+.\" SRC BEGIN (t_semget.c)
+.EX
+/* t_semget.c
+\&
+ Licensed under GNU General Public License v2 or later.
+*/
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/ipc.h>
+#include <sys/sem.h>
+#include <unistd.h>
+\&
+static void
+usage(const char *pname)
+{
+ fprintf(stderr, "Usage: %s [\-cx] pathname proj\-id num\-sems\en",
+ pname);
+ fprintf(stderr, " \-c Use IPC_CREAT flag\en");
+ fprintf(stderr, " \-x Use IPC_EXCL flag\en");
+ exit(EXIT_FAILURE);
+}
+\&
+int
+main(int argc, char *argv[])
+{
+ int semid, nsems, flags, opt;
+ key_t key;
+\&
+ flags = 0;
+ while ((opt = getopt(argc, argv, "cx")) != \-1) {
+ switch (opt) {
+ case \[aq]c\[aq]: flags |= IPC_CREAT; break;
+ case \[aq]x\[aq]: flags |= IPC_EXCL; break;
+ default: usage(argv[0]);
+ }
+ }
+\&
+ if (argc != optind + 3)
+ usage(argv[0]);
+\&
+ key = ftok(argv[optind], argv[optind + 1][0]);
+ if (key == \-1) {
+ perror("ftok");
+ exit(EXIT_FAILURE);
+ }
+\&
+ nsems = atoi(argv[optind + 2]);
+\&
+ semid = semget(key, nsems, flags | 0600);
+ if (semid == \-1) {
+ perror("semget");
+ exit(EXIT_FAILURE);
+ }
+\&
+ printf("ID = %d\en", semid);
+\&
+ exit(EXIT_SUCCESS);
+}
+.EE
+.\" SRC END
+.SH SEE ALSO
+.BR semctl (2),
+.BR semop (2),
+.BR ftok (3),
+.BR capabilities (7),
+.BR sem_overview (7),
+.BR sysvipc (7)
diff --git a/man2/semop.2 b/man2/semop.2
new file mode 100644
index 0000000..ece7a0e
--- /dev/null
+++ b/man2/semop.2
@@ -0,0 +1,523 @@
+.\" Copyright 1993 Giorgio Ciucci (giorgio@crcc.it)
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified 1996-10-22, Eric S. Raymond <esr@thyrsus.com>
+.\" Modified 2002-01-08, Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Modified 2003-04-28, Ernie Petrides <petrides@redhat.com>
+.\" Modified 2004-05-27, Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Modified, 11 Nov 2004, Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Language and formatting clean-ups
+.\" Added notes on /proc files
+.\" 2005-04-08, mtk, Noted kernel version numbers for semtimedop()
+.\" 2007-07-09, mtk, Added an EXAMPLE code segment.
+.\"
+.TH semop 2 2023-05-03 "Linux man-pages 6.05.01"
+.SH NAME
+semop, semtimedop \- System V semaphore operations
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/sem.h>
+.PP
+.BI "int semop(int " semid ", struct sembuf *" sops ", size_t " nsops );
+.BI "int semtimedop(int " semid ", struct sembuf *" sops ", size_t " nsops ,
+.BI " const struct timespec *_Nullable " timeout );
+.fi
+.PP
+.RS -4
+Feature Test Macro Requirements for glibc (see
+.BR feature_test_macros (7)):
+.RE
+.PP
+.BR semtimedop ():
+.nf
+ _GNU_SOURCE
+.fi
+.SH DESCRIPTION
+Each semaphore in a System\ V semaphore set
+has the following associated values:
+.PP
+.in +4n
+.EX
+unsigned short semval; /* semaphore value */
+unsigned short semzcnt; /* # waiting for zero */
+unsigned short semncnt; /* # waiting for increase */
+pid_t sempid; /* PID of process that last
+ modified the semaphore value */
+.EE
+.in
+.PP
+.BR semop ()
+performs operations on selected semaphores in the set indicated by
+.IR semid .
+Each of the
+.I nsops
+elements in the array pointed to by
+.I sops
+is a structure that
+specifies an operation to be performed on a single semaphore.
+The elements of this structure are of type
+.IR "struct sembuf" ,
+containing the following members:
+.PP
+.in +4n
+.EX
+unsigned short sem_num; /* semaphore number */
+short sem_op; /* semaphore operation */
+short sem_flg; /* operation flags */
+.EE
+.in
+.PP
+Flags recognized in
+.I sem_flg
+are
+.B IPC_NOWAIT
+and
+.BR SEM_UNDO .
+If an operation specifies
+.BR SEM_UNDO ,
+it will be automatically undone when the process terminates.
+.PP
+The set of operations contained in
+.I sops
+is performed in
+.IR "array order" ,
+and
+.IR atomically ,
+that is, the operations are performed either as a complete unit,
+or not at all.
+The behavior of the system call if not all operations can be
+performed immediately depends on the presence of the
+.B IPC_NOWAIT
+flag in the individual
+.I sem_flg
+fields, as noted below.
+.PP
+Each operation is performed on the
+.IR sem_num \-th
+semaphore of the semaphore set, where the first semaphore of the set
+is numbered 0.
+There are three types of operation, distinguished by the value of
+.IR sem_op .
+.PP
+If
+.I sem_op
+is a positive integer, the operation adds this value to
+the semaphore value
+.RI ( semval ).
+Furthermore, if
+.B SEM_UNDO
+is specified for this operation, the system subtracts the value
+.I sem_op
+from the semaphore adjustment
+.RI ( semadj )
+value for this semaphore.
+This operation can always proceed\[em]it never forces a thread to wait.
+The calling process must have alter permission on the semaphore set.
+.PP
+If
+.I sem_op
+is zero, the process must have read permission on the semaphore
+set.
+This is a "wait-for-zero" operation: if
+.I semval
+is zero, the operation can immediately proceed.
+Otherwise, if
+.B IPC_NOWAIT
+is specified in
+.IR sem_flg ,
+.BR semop ()
+fails with
+.I errno
+set to
+.B EAGAIN
+(and none of the operations in
+.I sops
+is performed).
+Otherwise,
+.I semzcnt
+(the count of threads waiting until this semaphore's value becomes zero)
+is incremented by one and the thread sleeps until
+one of the following occurs:
+.IP \[bu] 3
+.I semval
+becomes 0, at which time the value of
+.I semzcnt
+is decremented.
+.IP \[bu]
+The semaphore set
+is removed:
+.BR semop ()
+fails, with
+.I errno
+set to
+.BR EIDRM .
+.IP \[bu]
+The calling thread catches a signal:
+the value of
+.I semzcnt
+is decremented and
+.BR semop ()
+fails, with
+.I errno
+set to
+.BR EINTR .
+.PP
+If
+.I sem_op
+is less than zero, the process must have alter permission on the
+semaphore set.
+If
+.I semval
+is greater than or equal to the absolute value of
+.IR sem_op ,
+the operation can proceed immediately:
+the absolute value of
+.I sem_op
+is subtracted from
+.IR semval ,
+and, if
+.B SEM_UNDO
+is specified for this operation, the system adds the absolute value of
+.I sem_op
+to the semaphore adjustment
+.RI ( semadj )
+value for this semaphore.
+If the absolute value of
+.I sem_op
+is greater than
+.IR semval ,
+and
+.B IPC_NOWAIT
+is specified in
+.IR sem_flg ,
+.BR semop ()
+fails, with
+.I errno
+set to
+.B EAGAIN
+(and none of the operations in
+.I sops
+is performed).
+Otherwise,
+.I semncnt
+(the count of threads waiting for this semaphore's value to increase)
+is incremented by one and the thread sleeps until
+one of the following occurs:
+.IP \[bu] 3
+.I semval
+becomes greater than or equal to the absolute value of
+.IR sem_op :
+the operation now proceeds, as described above.
+.IP \[bu]
+The semaphore set is removed from the system:
+.BR semop ()
+fails, with
+.I errno
+set to
+.BR EIDRM .
+.IP \[bu]
+The calling thread catches a signal:
+the value of
+.I semncnt
+is decremented and
+.BR semop ()
+fails, with
+.I errno
+set to
+.BR EINTR .
+.PP
+On successful completion, the
+.I sempid
+value for each semaphore specified in the array pointed to by
+.I sops
+is set to the caller's process ID.
+In addition, the
+.I sem_otime
+.\" and
+.\" .I sem_ctime
+is set to the current time.
+.SS semtimedop()
+.BR semtimedop ()
+behaves identically to
+.BR semop ()
+except that in those cases where the calling thread would sleep,
+the duration of that sleep is limited by the amount of elapsed
+time specified by the
+.I timespec
+structure whose address is passed in the
+.I timeout
+argument.
+(This sleep interval will be rounded up to the system clock granularity,
+and kernel scheduling delays mean that the interval
+may overrun by a small amount.)
+If the specified time limit has been reached,
+.BR semtimedop ()
+fails with
+.I errno
+set to
+.B EAGAIN
+(and none of the operations in
+.I sops
+is performed).
+If the
+.I timeout
+argument is NULL,
+then
+.BR semtimedop ()
+behaves exactly like
+.BR semop ().
+.PP
+Note that if
+.BR semtimedop ()
+is interrupted by a signal, causing the call to fail with the error
+.BR EINTR ,
+the contents of
+.I timeout
+are left unchanged.
+.SH RETURN VALUE
+On success,
+.BR semop ()
+and
+.BR semtimedop ()
+return 0.
+On failure, they return \-1, and set
+.I errno
+to indicate the error.
+.SH ERRORS
+.TP
+.B E2BIG
+The argument
+.I nsops
+is greater than
+.BR SEMOPM ,
+the maximum number of operations allowed per system
+call.
+.TP
+.B EACCES
+The calling process does not have the permissions required
+to perform the specified semaphore operations,
+and does not have the
+.B CAP_IPC_OWNER
+capability in the user namespace that governs its IPC namespace.
+.TP
+.B EAGAIN
+An operation could not proceed immediately and either
+.B IPC_NOWAIT
+was specified in
+.I sem_flg
+or the time limit specified in
+.I timeout
+expired.
+.TP
+.B EFAULT
+An address specified in either the
+.I sops
+or the
+.I timeout
+argument isn't accessible.
+.TP
+.B EFBIG
+For some operation the value of
+.I sem_num
+is less than 0 or greater than or equal to the number
+of semaphores in the set.
+.TP
+.B EIDRM
+The semaphore set was removed.
+.TP
+.B EINTR
+While blocked in this system call, the thread caught a signal; see
+.BR signal (7).
+.TP
+.B EINVAL
+The semaphore set doesn't exist, or
+.I semid
+is less than zero, or
+.I nsops
+has a nonpositive value.
+.TP
+.B ENOMEM
+The
+.I sem_flg
+of some operation specified
+.B SEM_UNDO
+and the system does not have enough memory to allocate the undo
+structure.
+.TP
+.B ERANGE
+For some operation
+.I sem_op+semval
+is greater than
+.BR SEMVMX ,
+the implementation dependent maximum value for
+.IR semval .
+.SH STANDARDS
+POSIX.1-2008.
+.SH VERSIONS
+Linux 2.5.52 (backported into Linux 2.4.22),
+glibc 2.3.3.
+POSIX.1-2001, SVr4.
+.\" SVr4 documents additional error conditions EINVAL, EFBIG, ENOSPC.
+.SH NOTES
+The
+.I sem_undo
+structures of a process aren't inherited by the child produced by
+.BR fork (2),
+but they are inherited across an
+.BR execve (2)
+system call.
+.PP
+.BR semop ()
+is never automatically restarted after being interrupted by a signal handler,
+regardless of the setting of the
+.B SA_RESTART
+flag when establishing a signal handler.
+.PP
+A semaphore adjustment
+.RI ( semadj )
+value is a per-process, per-semaphore integer that is the negated sum
+of all operations performed on a semaphore specifying the
+.B SEM_UNDO
+flag.
+Each process has a list of
+.I semadj
+values\[em]one value for each semaphore on which it has operated using
+.BR SEM_UNDO .
+When a process terminates, each of its per-semaphore
+.I semadj
+values is added to the corresponding semaphore,
+thus undoing the effect of that process's operations on the semaphore
+(but see BUGS below).
+When a semaphore's value is directly set using the
+.B SETVAL
+or
+.B SETALL
+request to
+.BR semctl (2),
+the corresponding
+.I semadj
+values in all processes are cleared.
+The
+.BR clone (2)
+.B CLONE_SYSVSEM
+flag allows more than one process to share a
+.I semadj
+list; see
+.BR clone (2)
+for details.
+.PP
+The \fIsemval\fP, \fIsempid\fP, \fIsemzcnt\fP, and \fIsemncnt\fP values
+for a semaphore can all be retrieved using appropriate
+.BR semctl (2)
+calls.
+.SS Semaphore limits
+The following limits on semaphore set resources affect the
+.BR semop ()
+call:
+.TP
+.B SEMOPM
+Maximum number of operations allowed for one
+.BR semop ()
+call.
+Before Linux 3.19,
+.\" commit e843e7d2c88b7db107a86bd2c7145dc715c058f4
+the default value for this limit was 32.
+Since Linux 3.19, the default value is 500.
+On Linux, this limit can be read and modified via the third field of
+.IR /proc/sys/kernel/sem .
+.\" This /proc file is not available in Linux 2.2 and earlier -- MTK
+.IR Note :
+this limit should not be raised above 1000,
+.\" See comment in Linux 3.19 source file include/uapi/linux/sem.h
+because of the risk that
+.BR semop ()
+fails due to kernel memory fragmentation when allocating memory to copy the
+.I sops
+array.
+.TP
+.B SEMVMX
+Maximum allowable value for
+.IR semval :
+implementation dependent (32767).
+.PP
+The implementation has no intrinsic limits for
+the adjust on exit maximum value
+.RB ( SEMAEM ),
+the system wide maximum number of undo structures
+.RB ( SEMMNU )
+and the per-process maximum number of undo entries system parameters.
+.SH BUGS
+When a process terminates, its set of associated
+.I semadj
+structures is used to undo the effect of all of the
+semaphore operations it performed with the
+.B SEM_UNDO
+flag.
+This raises a difficulty: if one (or more) of these semaphore adjustments
+would result in an attempt to decrease a semaphore's value below zero,
+what should an implementation do?
+One possible approach would be to block until all the semaphore
+adjustments could be performed.
+This is however undesirable since it could force process termination to
+block for arbitrarily long periods.
+Another possibility is that such semaphore adjustments could be ignored
+altogether (somewhat analogously to failing when
+.B IPC_NOWAIT
+is specified for a semaphore operation).
+Linux adopts a third approach: decreasing the semaphore value
+as far as possible (i.e., to zero) and allowing process
+termination to proceed immediately.
+.PP
+In Linux 2.6.x, x <= 10, there is a bug that in some circumstances
+prevents a thread that is waiting for a semaphore value to become
+zero from being woken up when the value does actually become zero.
+This bug is fixed in Linux 2.6.11.
+.\" The bug report:
+.\" http://marc.theaimsgroup.com/?l=linux-kernel&m=110260821123863&w=2
+.\" the fix:
+.\" http://marc.theaimsgroup.com/?l=linux-kernel&m=110261701025794&w=2
+.SH EXAMPLES
+The following code segment uses
+.BR semop ()
+to atomically wait for the value of semaphore 0 to become zero,
+and then increment the semaphore value by one.
+.PP
+.in +4n
+.EX
+struct sembuf sops[2];
+int semid;
+\&
+/* Code to set \fIsemid\fP omitted */
+\&
+sops[0].sem_num = 0; /* Operate on semaphore 0 */
+sops[0].sem_op = 0; /* Wait for value to equal 0 */
+sops[0].sem_flg = 0;
+\&
+sops[1].sem_num = 0; /* Operate on semaphore 0 */
+sops[1].sem_op = 1; /* Increment value by one */
+sops[1].sem_flg = 0;
+\&
+if (semop(semid, sops, 2) == \-1) {
+ perror("semop");
+ exit(EXIT_FAILURE);
+}
+.EE
+.in
+.PP
+A further example of the use of
+.BR semop ()
+can be found in
+.BR shmop (2).
+.SH SEE ALSO
+.BR clone (2),
+.BR semctl (2),
+.BR semget (2),
+.BR sigaction (2),
+.BR capabilities (7),
+.BR sem_overview (7),
+.BR sysvipc (7),
+.BR time (7)
diff --git a/man2/semtimedop.2 b/man2/semtimedop.2
new file mode 100644
index 0000000..8a40618
--- /dev/null
+++ b/man2/semtimedop.2
@@ -0,0 +1 @@
+.so man2/semop.2
diff --git a/man2/send.2 b/man2/send.2
new file mode 100644
index 0000000..16c58b5
--- /dev/null
+++ b/man2/send.2
@@ -0,0 +1,506 @@
+.\" Copyright (c) 1983, 1991 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" SPDX-License-Identifier: BSD-4-Clause-UC
+.\"
+.\" Modified 1993-07-24 by Rik Faith <faith@cs.unc.edu>
+.\" Modified 1996-10-22 by Eric S. Raymond <esr@thyrsus.com>
+.\" Modified Oct 1998 by Andi Kleen
+.\" Modified Oct 2003 by aeb
+.\" Modified 2004-07-01 by mtk
+.\"
+.TH send 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+send, sendto, sendmsg \- send a message on a socket
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/socket.h>
+.PP
+.BI "ssize_t send(int " sockfd ", const void " buf [. len "], size_t " len \
+", int " flags );
+.BI "ssize_t sendto(int " sockfd ", const void " buf [. len "], size_t " len \
+", int " flags ,
+.BI " const struct sockaddr *" dest_addr ", socklen_t " addrlen );
+.BI "ssize_t sendmsg(int " sockfd ", const struct msghdr *" msg \
+", int " flags );
+.fi
+.SH DESCRIPTION
+The system calls
+.BR send (),
+.BR sendto (),
+and
+.BR sendmsg ()
+are used to transmit a message to another socket.
+.PP
+The
+.BR send ()
+call may be used only when the socket is in a
+.I connected
+state (so that the intended recipient is known).
+The only difference between
+.BR send ()
+and
+.BR write (2)
+is the presence of
+.IR flags .
+With a zero
+.I flags
+argument,
+.BR send ()
+is equivalent to
+.BR write (2).
+Also, the following call
+.PP
+.in +4n
+.EX
+send(sockfd, buf, len, flags);
+.EE
+.in
+.PP
+is equivalent to
+.PP
+.in +4n
+.EX
+sendto(sockfd, buf, len, flags, NULL, 0);
+.EE
+.in
+.PP
+The argument
+.I sockfd
+is the file descriptor of the sending socket.
+.PP
+If
+.BR sendto ()
+is used on a connection-mode
+.RB ( SOCK_STREAM ,
+.BR SOCK_SEQPACKET )
+socket, the arguments
+.I dest_addr
+and
+.I addrlen
+are ignored (and the error
+.B EISCONN
+may be returned when they are
+not NULL and 0), and the error
+.B ENOTCONN
+is returned when the socket was not actually connected.
+Otherwise, the address of the target is given by
+.I dest_addr
+with
+.I addrlen
+specifying its size.
+For
+.BR sendmsg (),
+the address of the target is given by
+.IR msg.msg_name ,
+with
+.I msg.msg_namelen
+specifying its size.
+.PP
+For
+.BR send ()
+and
+.BR sendto (),
+the message is found in
+.I buf
+and has length
+.IR len .
+For
+.BR sendmsg (),
+the message is pointed to by the elements of the array
+.IR msg.msg_iov .
+The
+.BR sendmsg ()
+call also allows sending ancillary data (also known as control information).
+.PP
+If the message is too long to pass atomically through the
+underlying protocol, the error
+.B EMSGSIZE
+is returned, and the message is not transmitted.
+.PP
+No indication of failure to deliver is implicit in a
+.BR send ().
+Locally detected errors are indicated by a return value of \-1.
+.PP
+When the message does not fit into the send buffer of the socket,
+.BR send ()
+normally blocks, unless the socket has been placed in nonblocking I/O
+mode.
+In nonblocking mode it would fail with the error
+.B EAGAIN
+or
+.B EWOULDBLOCK
+in this case.
+The
+.BR select (2)
+call may be used to determine when it is possible to send more data.
+.SS The flags argument
+The
+.I flags
+argument is the bitwise OR
+of zero or more of the following flags.
+.\" FIXME . ? document MSG_PROXY (which went away in Linux 2.3.15)
+.TP
+.BR MSG_CONFIRM " (since Linux 2.3.15)"
+Tell the link layer that forward progress happened: you got a successful
+reply from the other side.
+If the link layer doesn't get this
+it will regularly reprobe the neighbor (e.g., via a unicast ARP).
+Valid only on
+.B SOCK_DGRAM
+and
+.B SOCK_RAW
+sockets and currently implemented only for IPv4 and IPv6.
+See
+.BR arp (7)
+for details.
+.TP
+.B MSG_DONTROUTE
+Don't use a gateway to send out the packet, send to hosts only on
+directly connected networks.
+This is usually used only
+by diagnostic or routing programs.
+This is defined only for protocol
+families that route; packet sockets don't.
+.TP
+.BR MSG_DONTWAIT " (since Linux 2.2)"
+Enables nonblocking operation; if the operation would block,
+.B EAGAIN
+or
+.B EWOULDBLOCK
+is returned.
+This provides similar behavior to setting the
+.B O_NONBLOCK
+flag (via the
+.BR fcntl (2)
+.B F_SETFL
+operation), but differs in that
+.B MSG_DONTWAIT
+is a per-call option, whereas
+.B O_NONBLOCK
+is a setting on the open file description (see
+.BR open (2)),
+which will affect all threads in the calling process
+as well as other processes that hold file descriptors
+referring to the same open file description.
+.TP
+.BR MSG_EOR " (since Linux 2.2)"
+Terminates a record (when this notion is supported, as for sockets of type
+.BR SOCK_SEQPACKET ).
+.TP
+.BR MSG_MORE " (since Linux 2.4.4)"
+The caller has more data to send.
+This flag is used with TCP sockets to obtain the same effect
+as the
+.B TCP_CORK
+socket option (see
+.BR tcp (7)),
+with the difference that this flag can be set on a per-call basis.
+.IP
+Since Linux 2.6, this flag is also supported for UDP sockets, and informs
+the kernel to package all of the data sent in calls with this flag set
+into a single datagram which is transmitted only when a call is performed
+that does not specify this flag.
+(See also the
+.B UDP_CORK
+socket option described in
+.BR udp (7).)
+.TP
+.BR MSG_NOSIGNAL " (since Linux 2.2)"
+Don't generate a
+.B SIGPIPE
+signal if the peer on a stream-oriented socket has closed the connection.
+The
+.B EPIPE
+error is still returned.
+This provides similar behavior to using
+.BR sigaction (2)
+to ignore
+.BR SIGPIPE ,
+but, whereas
+.B MSG_NOSIGNAL
+is a per-call feature,
+ignoring
+.B SIGPIPE
+sets a process attribute that affects all threads in the process.
+.TP
+.B MSG_OOB
+Sends
+.I out-of-band
+data on sockets that support this notion (e.g., of type
+.BR SOCK_STREAM );
+the underlying protocol must also support
+.I out-of-band
+data.
+.TP
+.BR MSG_FASTOPEN " (since Linux 3.7)"
+Attempts TCP Fast Open (RFC7413) and sends data in the SYN like a
+combination of
+.BR connect (2)
+and
+.BR write (2),
+by performing an implicit
+.BR connect (2)
+operation.
+It blocks until the data is buffered and the handshake has completed.
+For a non-blocking socket,
+it returns the number of bytes buffered and sent in the SYN packet.
+If the cookie is not available locally,
+it returns
+.BR EINPROGRESS ,
+and sends a SYN with a Fast Open cookie request automatically.
+The caller needs to write the data again when the socket is connected.
+On errors,
+it sets the same
+.I errno
+as
+.BR connect (2)
+if the handshake fails.
+This flag requires enabling TCP Fast Open client support on sysctl
+.IR net.ipv4.tcp_fastopen .
+.IP
+Refer to
+.B TCP_FASTOPEN_CONNECT
+socket option in
+.BR tcp (7)
+for an alternative approach.
+.SS sendmsg()
+The definition of the
+.I msghdr
+structure employed by
+.BR sendmsg ()
+is as follows:
+.PP
+.in +4n
+.EX
+struct msghdr {
+ void *msg_name; /* Optional address */
+ socklen_t msg_namelen; /* Size of address */
+ struct iovec *msg_iov; /* Scatter/gather array */
+ size_t msg_iovlen; /* # elements in msg_iov */
+ void *msg_control; /* Ancillary data, see below */
+ size_t msg_controllen; /* Ancillary data buffer len */
+ int msg_flags; /* Flags (unused) */
+};
+.EE
+.in
+.PP
+The
+.I msg_name
+field is used on an unconnected socket to specify the target
+address for a datagram.
+It points to a buffer containing the address; the
+.I msg_namelen
+field should be set to the size of the address.
+For a connected socket, these fields should be specified as NULL and 0,
+respectively.
+.PP
+The
+.I msg_iov
+and
+.I msg_iovlen
+fields specify scatter-gather locations, as for
+.BR writev (2).
+.PP
+You may send control information (ancillary data) using the
+.I msg_control
+and
+.I msg_controllen
+members.
+The maximum control buffer length the kernel can process is limited
+per socket by the value in
+.IR /proc/sys/net/core/optmem_max ;
+see
+.BR socket (7).
+For further information on the use of ancillary data in various
+socket domains, see
+.BR unix (7)
+and
+.BR ip (7).
+.PP
+The
+.I msg_flags
+field is ignored.
+.\" Still to be documented:
+.\" Send file descriptors and user credentials using the
+.\" msg_control* fields.
+.SH RETURN VALUE
+On success, these calls return the number of bytes sent.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+These are some standard errors generated by the socket layer.
+Additional errors
+may be generated and returned from the underlying protocol modules;
+see their respective manual pages.
+.TP
+.B EACCES
+(For UNIX domain sockets, which are identified by pathname)
+Write permission is denied on the destination socket file,
+or search permission is denied for one of the directories
+in the path prefix.
+(See
+.BR path_resolution (7).)
+.IP
+(For UDP sockets) An attempt was made to send to a
+network/broadcast address as though it was a unicast address.
+.TP
+.BR EAGAIN " or " EWOULDBLOCK
+.\" Actually EAGAIN on Linux
+The socket is marked nonblocking and the requested operation
+would block.
+POSIX.1-2001 allows either error to be returned for this case,
+and does not require these constants to have the same value,
+so a portable application should check for both possibilities.
+.TP
+.B EAGAIN
+(Internet domain datagram sockets)
+The socket referred to by
+.I sockfd
+had not previously been bound to an address and,
+upon attempting to bind it to an ephemeral port,
+it was determined that all port numbers in the ephemeral port range
+are currently in use.
+See the discussion of
+.I /proc/sys/net/ipv4/ip_local_port_range
+in
+.BR ip (7).
+.TP
+.B EALREADY
+Another Fast Open is in progress.
+.TP
+.B EBADF
+.I sockfd
+is not a valid open file descriptor.
+.TP
+.B ECONNRESET
+Connection reset by peer.
+.TP
+.B EDESTADDRREQ
+The socket is not connection-mode, and no peer address is set.
+.TP
+.B EFAULT
+An invalid user space address was specified for an argument.
+.TP
+.B EINTR
+A signal occurred before any data was transmitted; see
+.BR signal (7).
+.TP
+.B EINVAL
+Invalid argument passed.
+.TP
+.B EISCONN
+The connection-mode socket was connected already but a
+recipient was specified.
+(Now either this error is returned, or the recipient specification
+is ignored.)
+.TP
+.B EMSGSIZE
+The socket type
+.\" (e.g., SOCK_DGRAM )
+requires that the message be sent atomically, and the size
+of the message to be sent made this impossible.
+.TP
+.B ENOBUFS
+The output queue for a network interface was full.
+This generally indicates that the interface has stopped sending,
+but may be caused by transient congestion.
+(Normally, this does not occur in Linux.
+Packets are just silently dropped
+when a device queue overflows.)
+.TP
+.B ENOMEM
+No memory available.
+.TP
+.B ENOTCONN
+The socket is not connected, and no target has been given.
+.TP
+.B ENOTSOCK
+The file descriptor
+.I sockfd
+does not refer to a socket.
+.TP
+.B EOPNOTSUPP
+Some bit in the
+.I flags
+argument is inappropriate for the socket type.
+.TP
+.B EPIPE
+The local end has been shut down on a connection oriented socket.
+In this case, the process
+will also receive a
+.B SIGPIPE
+unless
+.B MSG_NOSIGNAL
+is set.
+.SH VERSIONS
+According to POSIX.1-2001, the
+.I msg_controllen
+field of the
+.I msghdr
+structure should be typed as
+.IR socklen_t ,
+and the
+.I msg_iovlen
+field should be typed as
+.IR int ,
+but glibc currently types both as
+.IR size_t .
+.\" glibc bug for msg_controllen raised 12 Mar 2006
+.\" http://sourceware.org/bugzilla/show_bug.cgi?id=2448
+.\" The problem is an underlying kernel issue: the size of the
+.\" __kernel_size_t type used to type these fields varies
+.\" across architectures, but socklen_t is always 32 bits,
+.\" as (at least with GCC) is int.
+.SH STANDARDS
+POSIX.1-2008.
+.PP
+.B MSG_CONFIRM
+is a Linux extension.
+.SH HISTORY
+4.4BSD, SVr4, POSIX.1-2001
+(first appeared in 4.2BSD).
+.PP
+POSIX.1-2001 describes only the
+.B MSG_OOB
+and
+.B MSG_EOR
+flags.
+POSIX.1-2008 adds a specification of
+.BR MSG_NOSIGNAL .
+.SH NOTES
+See
+.BR sendmmsg (2)
+for information about a Linux-specific system call
+that can be used to transmit multiple datagrams in a single call.
+.SH BUGS
+Linux may return
+.B EPIPE
+instead of
+.BR ENOTCONN .
+.SH EXAMPLES
+An example of the use of
+.BR sendto ()
+is shown in
+.BR getaddrinfo (3).
+.SH SEE ALSO
+.BR fcntl (2),
+.BR getsockopt (2),
+.BR recv (2),
+.BR select (2),
+.BR sendfile (2),
+.BR sendmmsg (2),
+.BR shutdown (2),
+.BR socket (2),
+.BR write (2),
+.BR cmsg (3),
+.BR ip (7),
+.BR ipv6 (7),
+.BR socket (7),
+.BR tcp (7),
+.BR udp (7),
+.BR unix (7)
diff --git a/man2/sendfile.2 b/man2/sendfile.2
new file mode 100644
index 0000000..d9c3451
--- /dev/null
+++ b/man2/sendfile.2
@@ -0,0 +1,236 @@
+.\" SPDX-License-Identifier: Linux-man-pages-1-para
+.\"
+.\" This man page is Copyright (C) 1998 Pawel Krawczyk.
+.\"
+.\" $Id: sendfile.2,v 1.5 1999/05/18 11:54:11 freitag Exp $
+.\" 2000-11-19 bert hubert <ahu@ds9a.nl>: in_fd cannot be socket
+.\"
+.\" 2004-12-17, mtk
+.\" updated description of in_fd and out_fd for 2.6
+.\" Various wording and formatting changes
+.\"
+.\" 2005-03-31 Martin Pool <mbp@sourcefrog.net> mmap() improvements
+.\"
+.TH sendfile 2 2023-07-15 "Linux man-pages 6.05.01"
+.SH NAME
+sendfile \- transfer data between file descriptors
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/sendfile.h>
+.PP
+.BI "ssize_t sendfile(int" " out_fd" ", int" " in_fd" ", \
+off_t *_Nullable " offset ,
+.BI " size_t" " count" );
+.\" The below is too ugly. Comments about glibc versions belong
+.\" in the notes, not in the header.
+.\"
+.\" .B #include <features.h>
+.\" .B #if (__GLIBC__==2 && __GLIBC_MINOR__>=1) || __GLIBC__>2
+.\" .B #include <sys/sendfile.h>
+.\" #else
+.\" .B #include <sys/types.h>
+.\" .B /* No system prototype before glibc 2.1. */
+.\" .BI "ssize_t sendfile(int" " out_fd" ", int" " in_fd" ", off_t *" \
+.\" offset ", size_t" " count" )
+.\" .B #endif
+.\"
+.fi
+.SH DESCRIPTION
+.BR sendfile ()
+copies data between one file descriptor and another.
+Because this copying is done within the kernel,
+.BR sendfile ()
+is more efficient than the combination of
+.BR read (2)
+and
+.BR write (2),
+which would require transferring data to and from user space.
+.PP
+.I in_fd
+should be a file descriptor opened for reading and
+.I out_fd
+should be a descriptor opened for writing.
+.PP
+If
+.I offset
+is not NULL, then it points
+to a variable holding the file offset from which
+.BR sendfile ()
+will start reading data from
+.IR in_fd .
+When
+.BR sendfile ()
+returns, this variable
+will be set to the offset of the byte following the last byte that was read.
+If
+.I offset
+is not NULL, then
+.BR sendfile ()
+does not modify the file offset of
+.IR in_fd ;
+otherwise the file offset is adjusted to reflect
+the number of bytes read from
+.IR in_fd .
+.PP
+If
+.I offset
+is NULL, then data will be read from
+.I in_fd
+starting at the file offset,
+and the file offset will be updated by the call.
+.PP
+.I count
+is the number of bytes to copy between the file descriptors.
+.PP
+The
+.I in_fd
+argument must correspond to a file which supports
+.BR mmap (2)-like
+operations
+(i.e., it cannot be a socket).
+.PP
+Before Linux 2.6.33,
+.I out_fd
+must refer to a socket.
+Since Linux 2.6.33 it can be any file.
+If it is a regular file, then
+.BR sendfile ()
+changes the file offset appropriately.
+.SH RETURN VALUE
+If the transfer was successful, the number of bytes written to
+.I out_fd
+is returned.
+Note that a successful call to
+.BR sendfile ()
+may write fewer bytes than requested;
+the caller should be prepared to retry the call if there were unsent bytes.
+See also NOTES.
+.PP
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EAGAIN
+Nonblocking I/O has been selected using
+.B O_NONBLOCK
+and the write would block.
+.TP
+.B EBADF
+The input file was not opened for reading or the output file
+was not opened for writing.
+.TP
+.B EFAULT
+Bad address.
+.TP
+.B EINVAL
+Descriptor is not valid or locked, or an
+.BR mmap (2)-like
+operation is not available for
+.IR in_fd ,
+or
+.I count
+is negative.
+.TP
+.B EINVAL
+.I out_fd
+has the
+.B O_APPEND
+flag set.
+This is not currently supported by
+.BR sendfile ().
+.TP
+.B EIO
+Unspecified error while reading from
+.IR in_fd .
+.TP
+.B ENOMEM
+Insufficient memory to read from
+.IR in_fd .
+.TP
+.B EOVERFLOW
+.I count
+is too large, the operation would result in exceeding the maximum size of either
+the input file or the output file.
+.TP
+.B ESPIPE
+.I offset
+is not NULL but the input file is not seekable.
+.SH VERSIONS
+Other UNIX systems implement
+.BR sendfile ()
+with different semantics and prototypes.
+It should not be used in portable programs.
+.SH STANDARDS
+None.
+.SH HISTORY
+Linux 2.2,
+glibc 2.1.
+.PP
+In Linux 2.4 and earlier,
+.I out_fd
+could also refer to a regular file;
+this possibility went away in the Linux 2.6.x kernel series,
+but was restored in Linux 2.6.33.
+.PP
+The original Linux
+.BR sendfile ()
+system call was not designed to handle large file offsets.
+Consequently, Linux 2.4 added
+.BR sendfile64 (),
+with a wider type for the
+.I offset
+argument.
+The glibc
+.BR sendfile ()
+wrapper function transparently deals with the kernel differences.
+.SH NOTES
+.BR sendfile ()
+will transfer at most 0x7ffff000 (2,147,479,552) bytes,
+returning the number of bytes actually transferred.
+.\" commit e28cc71572da38a5a12c1cfe4d7032017adccf69
+(This is true on both 32-bit and 64-bit systems.)
+.PP
+If you plan to use
+.BR sendfile ()
+for sending files to a TCP socket, but need
+to send some header data in front of the file contents, you will find
+it useful to employ the
+.B TCP_CORK
+option, described in
+.BR tcp (7),
+to minimize the number of packets and to tune performance.
+.PP
+Applications may wish to fall back to
+.BR read (2)
+and
+.BR write (2)
+in the case where
+.BR sendfile ()
+fails with
+.B EINVAL
+or
+.BR ENOSYS .
+.PP
+If
+.I out_fd
+refers to a socket or pipe with zero-copy support, callers must ensure the
+transferred portions of the file referred to by
+.I in_fd
+remain unmodified until the reader on the other end of
+.I out_fd
+has consumed the transferred data.
+.PP
+The Linux-specific
+.BR splice (2)
+call supports transferring data between arbitrary file descriptors
+provided one (or both) of them is a pipe.
+.SH SEE ALSO
+.BR copy_file_range (2),
+.BR mmap (2),
+.BR open (2),
+.BR socket (2),
+.BR splice (2)
diff --git a/man2/sendfile64.2 b/man2/sendfile64.2
new file mode 100644
index 0000000..888077b
--- /dev/null
+++ b/man2/sendfile64.2
@@ -0,0 +1 @@
+.so man2/sendfile.2
diff --git a/man2/sendmmsg.2 b/man2/sendmmsg.2
new file mode 100644
index 0000000..283c4a5
--- /dev/null
+++ b/man2/sendmmsg.2
@@ -0,0 +1,232 @@
+.\" Copyright (c) 2012 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\" with some material from a draft by
+.\" Stephan Mueller <stephan.mueller@atsec.com>
+.\" in turn based on Andi Kleen's recvmmsg.2 page.
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.TH sendmmsg 2 2023-05-03 "Linux man-pages 6.05.01"
+.SH NAME
+sendmmsg \- send multiple messages on a socket
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#define _GNU_SOURCE" " /* See feature_test_macros(7) */"
+.B #include <sys/socket.h>
+.PP
+.BI "int sendmmsg(int " sockfd ", struct mmsghdr *" msgvec \
+", unsigned int " vlen ","
+.BI " int " flags ");"
+.fi
+.SH DESCRIPTION
+The
+.BR sendmmsg ()
+system call is an extension of
+.BR sendmsg (2)
+that allows the caller to transmit multiple messages on a socket
+using a single system call.
+(This has performance benefits for some applications.)
+.\" See commit 228e548e602061b08ee8e8966f567c12aa079682
+.PP
+The
+.I sockfd
+argument is the file descriptor of the socket
+on which data is to be transmitted.
+.PP
+The
+.I msgvec
+argument is a pointer to an array of
+.I mmsghdr
+structures.
+The size of this array is specified in
+.IR vlen .
+.PP
+The
+.I mmsghdr
+structure is defined in
+.I <sys/socket.h>
+as:
+.PP
+.in +4n
+.EX
+struct mmsghdr {
+ struct msghdr msg_hdr; /* Message header */
+ unsigned int msg_len; /* Number of bytes transmitted */
+};
+.EE
+.in
+.PP
+The
+.I msg_hdr
+field is a
+.I msghdr
+structure, as described in
+.BR sendmsg (2).
+The
+.I msg_len
+field is used to return the number of bytes sent from the message in
+.I msg_hdr
+(i.e., the same as the return value from a single
+.BR sendmsg (2)
+call).
+.PP
+The
+.I flags
+argument contains flags ORed together.
+The flags are the same as for
+.BR sendmsg (2).
+.PP
+A blocking
+.BR sendmmsg ()
+call blocks until
+.I vlen
+messages have been sent.
+A nonblocking call sends as many messages as possible
+(up to the limit specified by
+.IR vlen )
+and returns immediately.
+.PP
+On return from
+.BR sendmmsg (),
+the
+.I msg_len
+fields of successive elements of
+.I msgvec
+are updated to contain the number of bytes transmitted from the corresponding
+.IR msg_hdr .
+The return value of the call indicates the number of elements of
+.I msgvec
+that have been updated.
+.SH RETURN VALUE
+On success,
+.BR sendmmsg ()
+returns the number of messages sent from
+.IR msgvec ;
+if this is less than
+.IR vlen ,
+the caller can retry with a further
+.BR sendmmsg ()
+call to send the remaining messages.
+.PP
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+Errors are as for
+.BR sendmsg (2).
+An error is returned only if no datagrams could be sent.
+See also BUGS.
+.\" commit 728ffb86f10873aaf4abd26dde691ee40ae731fe
+.\" ... only return an error if no datagrams could be sent.
+.\" If less than the requested number of messages were sent, the application
+.\" must retry starting at the first failed one and if the problem is
+.\" persistent the error will be returned.
+.\"
+.\" This matches the behavior of other syscalls like read/write - it
+.\" is not an error if less than the requested number of elements are sent.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 3.0,
+glibc 2.14.
+.SH NOTES
+The value specified in
+.I vlen
+is capped to
+.B UIO_MAXIOV
+(1024).
+.\" commit 98382f419f32d2c12d021943b87dea555677144b
+.\" net: Cap number of elements for sendmmsg
+.\"
+.\" To limit the amount of time we can spend in sendmmsg, cap the
+.\" number of elements to UIO_MAXIOV (currently 1024).
+.\"
+.\" For error handling an application using sendmmsg needs to retry at
+.\" the first unsent message, so capping is simpler and requires less
+.\" application logic than returning EINVAL.
+.SH BUGS
+If an error occurs after at least one message has been sent,
+the call succeeds, and returns the number of messages sent.
+The error code is lost.
+The caller can retry the transmission,
+starting at the first failed message, but there is no guarantee that,
+if an error is returned, it will be the same as the one that was lost
+on the previous call.
+.SH EXAMPLES
+The example below uses
+.BR sendmmsg ()
+to send
+.I onetwo
+and
+.I three
+in two distinct UDP datagrams using one system call.
+The contents of the first datagram originate from a pair of buffers.
+.PP
+.\" SRC BEGIN (sendmmsg.c)
+.EX
+#define _GNU_SOURCE
+#include <arpa/inet.h>
+#include <netinet/in.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+\&
+int
+main(void)
+{
+ int retval;
+ int sockfd;
+ struct iovec msg1[2], msg2;
+ struct mmsghdr msg[2];
+ struct sockaddr_in addr;
+\&
+ sockfd = socket(AF_INET, SOCK_DGRAM, 0);
+ if (sockfd == \-1) {
+ perror("socket()");
+ exit(EXIT_FAILURE);
+ }
+\&
+ addr.sin_family = AF_INET;
+ addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+ addr.sin_port = htons(1234);
+ if (connect(sockfd, (struct sockaddr *) &addr, sizeof(addr)) == \-1) {
+ perror("connect()");
+ exit(EXIT_FAILURE);
+ }
+\&
+ memset(msg1, 0, sizeof(msg1));
+ msg1[0].iov_base = "one";
+ msg1[0].iov_len = 3;
+ msg1[1].iov_base = "two";
+ msg1[1].iov_len = 3;
+\&
+ memset(&msg2, 0, sizeof(msg2));
+ msg2.iov_base = "three";
+ msg2.iov_len = 5;
+\&
+ memset(msg, 0, sizeof(msg));
+ msg[0].msg_hdr.msg_iov = msg1;
+ msg[0].msg_hdr.msg_iovlen = 2;
+\&
+ msg[1].msg_hdr.msg_iov = &msg2;
+ msg[1].msg_hdr.msg_iovlen = 1;
+\&
+ retval = sendmmsg(sockfd, msg, 2, 0);
+ if (retval == \-1)
+ perror("sendmmsg()");
+ else
+ printf("%d messages sent\en", retval);
+\&
+ exit(0);
+}
+.EE
+.\" SRC END
+.SH SEE ALSO
+.BR recvmmsg (2),
+.BR sendmsg (2),
+.BR socket (2),
+.BR socket (7)
diff --git a/man2/sendmsg.2 b/man2/sendmsg.2
new file mode 100644
index 0000000..9a61b33
--- /dev/null
+++ b/man2/sendmsg.2
@@ -0,0 +1 @@
+.so man2/send.2
diff --git a/man2/sendto.2 b/man2/sendto.2
new file mode 100644
index 0000000..9a61b33
--- /dev/null
+++ b/man2/sendto.2
@@ -0,0 +1 @@
+.so man2/send.2
diff --git a/man2/set_mempolicy.2 b/man2/set_mempolicy.2
new file mode 100644
index 0000000..a7f561d
--- /dev/null
+++ b/man2/set_mempolicy.2
@@ -0,0 +1,325 @@
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft-var
+.\"
+.\" Copyright 2003,2004 Andi Kleen, SuSE Labs.
+.\" and Copyright 2007 Lee Schermerhorn, Hewlett Packard
+.\"
+.\" 2006-02-03, mtk, substantial wording changes and other improvements
+.\" 2007-08-27, Lee Schermerhorn <Lee.Schermerhorn@hp.com>
+.\" more precise specification of behavior.
+.\"
+.TH set_mempolicy 2 2023-07-16 "Linux man-pages 6.05.01"
+.SH NAME
+set_mempolicy \- set default NUMA memory policy for a thread and its children
+.SH LIBRARY
+NUMA (Non-Uniform Memory Access) policy library
+.RI ( libnuma ", " \-lnuma )
+.SH SYNOPSIS
+.nf
+.B "#include <numaif.h>"
+.PP
+.BI "long set_mempolicy(int " mode ", const unsigned long *" nodemask ,
+.BI " unsigned long " maxnode );
+.fi
+.SH DESCRIPTION
+.BR set_mempolicy ()
+sets the NUMA memory policy of the calling thread,
+which consists of a policy mode and zero or more nodes,
+to the values specified by the
+.IR mode ,
+.IR nodemask ,
+and
+.I maxnode
+arguments.
+.PP
+A NUMA machine has different
+memory controllers with different distances to specific CPUs.
+The memory policy defines from which node memory is allocated for
+the thread.
+.PP
+This system call defines the default policy for the thread.
+The thread policy governs allocation of pages in the process's
+address space outside of memory ranges
+controlled by a more specific policy set by
+.BR mbind (2).
+The thread default policy also controls allocation of any pages for
+memory-mapped files mapped using the
+.BR mmap (2)
+call with the
+.B MAP_PRIVATE
+flag and that are only read (loaded) from by the thread
+and of memory-mapped files mapped using the
+.BR mmap (2)
+call with the
+.B MAP_SHARED
+flag, regardless of the access type.
+The policy is applied only when a new page is allocated
+for the thread.
+For anonymous memory this is when the page is first
+touched by the thread.
+.PP
+The
+.I mode
+argument must specify one of
+.BR MPOL_DEFAULT ,
+.BR MPOL_BIND ,
+.BR MPOL_INTERLEAVE ,
+.BR MPOL_PREFERRED ,
+or
+.B MPOL_LOCAL
+(which are described in detail below).
+All modes except
+.B MPOL_DEFAULT
+require the caller to specify the node or nodes to which the mode applies,
+via the
+.I nodemask
+argument.
+.PP
+The
+.I mode
+argument may also include an optional
+.IR "mode flag" .
+The supported
+.I "mode flags"
+are:
+.TP
+.BR MPOL_F_NUMA_BALANCING " (since Linux 5.12)"
+.\" commit bda420b985054a3badafef23807c4b4fa38a3dff
+When
+.I mode
+is
+.BR MPOL_BIND ,
+enable the kernel NUMA balancing for the task if it is supported by the kernel.
+If the flag isn't supported by the kernel, or is used with
+.I mode
+other than
+.BR MPOL_BIND ,
+\-1 is returned and
+.I errno
+is set to
+.BR EINVAL .
+.TP
+.BR MPOL_F_RELATIVE_NODES " (since Linux 2.6.26)"
+A nonempty
+.I nodemask
+specifies node IDs that are relative to the
+set of node IDs allowed by the process's current cpuset.
+.TP
+.BR MPOL_F_STATIC_NODES " (since Linux 2.6.26)"
+A nonempty
+.I nodemask
+specifies physical node IDs.
+Linux will not remap the
+.I nodemask
+when the process moves to a different cpuset context,
+nor when the set of nodes allowed by the process's
+current cpuset context changes.
+.PP
+.I nodemask
+points to a bit mask of node IDs that contains up to
+.I maxnode
+bits.
+The bit mask size is rounded to the next multiple of
+.IR "sizeof(unsigned long)" ,
+but the kernel will use bits only up to
+.IR maxnode .
+A NULL value of
+.I nodemask
+or a
+.I maxnode
+value of zero specifies the empty set of nodes.
+If the value of
+.I maxnode
+is zero,
+the
+.I nodemask
+argument is ignored.
+.PP
+Where a
+.I nodemask
+is required, it must contain at least one node that is on-line,
+allowed by the process's current cpuset context,
+(unless the
+.B MPOL_F_STATIC_NODES
+mode flag is specified),
+and contains memory.
+If the
+.B MPOL_F_STATIC_NODES
+flag is set in
+.I mode
+and a required
+.I nodemask
+contains no nodes that are allowed by the process's current cpuset context,
+the memory policy reverts to
+.IR "local allocation" .
+This effectively overrides the specified policy until the process's
+cpuset context includes one or more of the nodes specified by
+.IR nodemask .
+.PP
+The
+.I mode
+argument must include one of the following values:
+.TP
+.B MPOL_DEFAULT
+This mode specifies that any nondefault thread memory policy be removed,
+so that the memory policy "falls back" to the system default policy.
+The system default policy is "local allocation"\[em]that is,
+allocate memory on the node of the CPU that triggered the allocation.
+.I nodemask
+must be specified as NULL.
+If the "local node" contains no free memory, the system will
+attempt to allocate memory from a "near by" node.
+.TP
+.B MPOL_BIND
+This mode defines a strict policy that restricts memory allocation to the
+nodes specified in
+.IR nodemask .
+If
+.I nodemask
+specifies more than one node, page allocations will come from
+the node with the lowest numeric node ID first, until that node
+contains no free memory.
+Allocations will then come from the node with the next highest
+node ID specified in
+.I nodemask
+and so forth, until none of the specified nodes contain free memory.
+Pages will not be allocated from any node not specified in the
+.IR nodemask .
+.TP
+.B MPOL_INTERLEAVE
+This mode interleaves page allocations across the nodes specified in
+.I nodemask
+in numeric node ID order.
+This optimizes for bandwidth instead of latency
+by spreading out pages and memory accesses to those pages across
+multiple nodes.
+However, accesses to a single page will still be limited to
+the memory bandwidth of a single node.
+.\" NOTE: the following sentence doesn't make sense in the context
+.\" of set_mempolicy() -- no memory area specified.
+.\" To be effective the memory area should be fairly large,
+.\" at least 1 MB or bigger.
+.TP
+.B MPOL_PREFERRED
+This mode sets the preferred node for allocation.
+The kernel will try to allocate pages from this node first
+and fall back to "near by" nodes if the preferred node is low on free
+memory.
+If
+.I nodemask
+specifies more than one node ID, the first node in the
+mask will be selected as the preferred node.
+If the
+.I nodemask
+and
+.I maxnode
+arguments specify the empty set, then the policy
+specifies "local allocation"
+(like the system default policy discussed above).
+.TP
+.BR MPOL_LOCAL " (since Linux 3.8)"
+.\" commit 479e2802d09f1e18a97262c4c6f8f17ae5884bd8
+.\" commit f2a07f40dbc603c15f8b06e6ec7f768af67b424f
+This mode specifies "local allocation"; the memory is allocated on
+the node of the CPU that triggered the allocation (the "local node").
+The
+.I nodemask
+and
+.I maxnode
+arguments must specify the empty set.
+If the "local node" is low on free memory,
+the kernel will try to allocate memory from other nodes.
+The kernel will allocate memory from the "local node"
+whenever memory for this node is available.
+If the "local node" is not allowed by the process's current cpuset context,
+the kernel will try to allocate memory from other nodes.
+The kernel will allocate memory from the "local node" whenever
+it becomes allowed by the process's current cpuset context.
+.PP
+The thread memory policy is preserved across an
+.BR execve (2),
+and is inherited by child threads created using
+.BR fork (2)
+or
+.BR clone (2).
+.SH RETURN VALUE
+On success,
+.BR set_mempolicy ()
+returns 0;
+on error, \-1 is returned and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EFAULT
+Part or all of the memory range specified by
+.I nodemask
+and
+.I maxnode
+points outside your accessible address space.
+.TP
+.B EINVAL
+.I mode
+is invalid.
+Or,
+.I mode
+is
+.B MPOL_DEFAULT
+and
+.I nodemask
+is nonempty,
+or
+.I mode
+is
+.B MPOL_BIND
+or
+.B MPOL_INTERLEAVE
+and
+.I nodemask
+is empty.
+Or,
+.I maxnode
+specifies more than a page worth of bits.
+Or,
+.I nodemask
+specifies one or more node IDs that are
+greater than the maximum supported node ID.
+Or, none of the node IDs specified by
+.I nodemask
+are on-line and allowed by the process's current cpuset context,
+or none of the specified nodes contain memory.
+Or, the
+.I mode
+argument specified both
+.B MPOL_F_STATIC_NODES
+and
+.BR MPOL_F_RELATIVE_NODES .
+Or, the
+.B MPOL_F_NUMA_BALANCING
+flag isn't supported by the kernel, or is used with
+.I mode
+other than
+.BR MPOL_BIND .
+.TP
+.B ENOMEM
+Insufficient kernel memory was available.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 2.6.7.
+.SH NOTES
+Memory policy is not remembered if the page is swapped out.
+When such a page is paged back in, it will use the policy of
+the thread or memory range that is in effect at the time the
+page is allocated.
+.PP
+For information on library support, see
+.BR numa (7).
+.SH SEE ALSO
+.BR get_mempolicy (2),
+.BR getcpu (2),
+.BR mbind (2),
+.BR mmap (2),
+.BR numa (3),
+.BR cpuset (7),
+.BR numa (7),
+.BR numactl (8)
diff --git a/man2/set_robust_list.2 b/man2/set_robust_list.2
new file mode 100644
index 0000000..a38aa23
--- /dev/null
+++ b/man2/set_robust_list.2
@@ -0,0 +1 @@
+.so man2/get_robust_list.2
diff --git a/man2/set_thread_area.2 b/man2/set_thread_area.2
new file mode 100644
index 0000000..b982112
--- /dev/null
+++ b/man2/set_thread_area.2
@@ -0,0 +1,229 @@
+.\" Copyright (C) 2003 Free Software Foundation, Inc.
+.\" Copyright (C) 2015 Andrew Lutomirski
+.\" Author: Kent Yoder
+.\"
+.\" SPDX-License-Identifier: GPL-1.0-or-later
+.\"
+.TH set_thread_area 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+get_thread_area, set_thread_area \- manipulate thread-local storage information
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */"
+.B #include <unistd.h>
+.PP
+.B #if defined __i386__ || defined __x86_64__
+.BR "# include <asm/ldt.h>" " /* Definition of " "struct user_desc" " */"
+.PP
+.BI "int syscall(SYS_get_thread_area, struct user_desc *" u_info );
+.BI "int syscall(SYS_set_thread_area, struct user_desc *" u_info );
+.PP
+.B #elif defined __m68k__
+.PP
+.B "int syscall(SYS_get_thread_area);"
+.BI "int syscall(SYS_set_thread_area, unsigned long " tp );
+.PP
+.B #elif defined __mips__
+.PP
+.BI "int syscall(SYS_set_thread_area, unsigned long " addr );
+.PP
+.B #endif
+.fi
+.PP
+.IR Note :
+glibc provides no wrappers for these system calls,
+necessitating the use of
+.BR syscall (2).
+.SH DESCRIPTION
+These calls provide architecture-specific support for a thread-local storage
+implementation.
+At the moment,
+.BR set_thread_area ()
+is available on m68k, MIPS, and x86 (both 32-bit and 64-bit variants);
+.BR get_thread_area ()
+is available on m68k and x86.
+.PP
+On m68k and MIPS,
+.BR set_thread_area ()
+allows storing an arbitrary pointer (provided in the
+.I tp
+argument on m68k and in the
+.I addr
+argument on MIPS)
+in the kernel data structure associated with the calling thread;
+this pointer can later be retrieved using
+.BR get_thread_area ()
+(see also NOTES
+for information regarding obtaining the thread pointer on MIPS).
+.PP
+On x86, Linux dedicates three global descriptor table (GDT) entries for
+thread-local storage.
+For more information about the GDT, see the
+Intel Software Developer's Manual or the AMD Architecture Programming Manual.
+.PP
+Both of these system calls take an argument that is a pointer
+to a structure of the following type:
+.PP
+.in +4n
+.EX
+struct user_desc {
+ unsigned int entry_number;
+ unsigned int base_addr;
+ unsigned int limit;
+ unsigned int seg_32bit:1;
+ unsigned int contents:2;
+ unsigned int read_exec_only:1;
+ unsigned int limit_in_pages:1;
+ unsigned int seg_not_present:1;
+ unsigned int useable:1;
+#ifdef __x86_64__
+ unsigned int lm:1;
+#endif
+};
+.EE
+.in
+.PP
+.BR get_thread_area ()
+reads the GDT entry indicated by
+.I u_info\->entry_number
+and fills in the rest of the fields in
+.IR u_info .
+.PP
+.BR set_thread_area ()
+sets a TLS entry in the GDT.
+.PP
+The TLS array entry set by
+.BR set_thread_area ()
+corresponds to the value of
+.I u_info\->entry_number
+passed in by the user.
+If this value is in bounds,
+.BR set_thread_area ()
+writes the TLS descriptor pointed to by
+.I u_info
+into the thread's TLS array.
+.PP
+When
+.BR set_thread_area ()
+is passed an
+.I entry_number
+of \-1, it searches for a free TLS entry.
+If
+.BR set_thread_area ()
+finds a free TLS entry, the value of
+.I u_info\->entry_number
+is set upon return to show which entry was changed.
+.PP
+A
+.I user_desc
+is considered "empty" if
+.I read_exec_only
+and
+.I seg_not_present
+are set to 1 and all of the other fields are 0.
+If an "empty" descriptor is passed to
+.BR set_thread_area (),
+the corresponding TLS entry will be cleared.
+See BUGS for additional details.
+.PP
+Since Linux 3.19,
+.BR set_thread_area ()
+cannot be used to write non-present segments, 16-bit segments, or code
+segments, although clearing a segment is still acceptable.
+.SH RETURN VALUE
+On x86, these system calls
+return 0 on success, and \-1 on failure, with
+.I errno
+set to indicate the error.
+.PP
+On MIPS and m68k,
+.BR set_thread_area ()
+always returns 0.
+On m68k,
+.BR get_thread_area ()
+returns the thread area pointer value
+(previously set via
+.BR set_thread_area ()).
+.SH ERRORS
+.TP
+.B EFAULT
+\fIu_info\fP is an invalid pointer.
+.TP
+.B EINVAL
+\fIu_info\->entry_number\fP is out of bounds.
+.TP
+.B ENOSYS
+.BR get_thread_area ()
+or
+.BR set_thread_area ()
+was invoked as a 64-bit system call.
+.TP
+.B ESRCH
+.RB ( set_thread_area ())
+A free TLS entry could not be located.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+.TP
+.BR set_thread_area ()
+Linux 2.5.29.
+.TP
+.BR get_thread_area ()
+Linux 2.5.32.
+.SH NOTES
+These system calls are generally intended for use only by threading libraries.
+.PP
+.BR arch_prctl (2)
+can interfere with
+.BR set_thread_area ()
+on x86.
+See
+.BR arch_prctl (2)
+for more details.
+This is not normally a problem, as
+.BR arch_prctl (2)
+is normally used only by 64-bit programs.
+.PP
+On MIPS, the current value of the thread area pointer can be obtained
+using the instruction:
+.PP
+.in +4n
+.EX
+rdhwr dest, $29
+.EE
+.in
+.PP
+This instruction traps and is handled by the kernel.
+.SH BUGS
+On 64-bit kernels before Linux 3.19,
+.\" commit e30ab185c490e9a9381385529e0fd32f0a399495
+one of the padding bits in
+.IR user_desc ,
+if set, would prevent the descriptor from being considered empty (see
+.BR modify_ldt (2)).
+As a result, the only reliable way to clear a TLS entry is to use
+.BR memset (3)
+to zero the entire
+.I user_desc
+structure, including padding bits, and then to set the
+.I read_exec_only
+and
+.I seg_not_present
+bits.
+Since Linux 3.19, a
+.I user_desc
+consisting entirely of zeros except for
+.I entry_number
+will also be interpreted as a request to clear a TLS entry, but this
+behaved differently on older kernels.
+.PP
+Prior to Linux 3.19, the DS and ES segment registers must not reference
+TLS entries.
+.SH SEE ALSO
+.BR arch_prctl (2),
+.BR modify_ldt (2),
+.BR ptrace (2)
+.RB ( PTRACE_GET_THREAD_AREA " and " PTRACE_SET_THREAD_AREA )
diff --git a/man2/set_tid_address.2 b/man2/set_tid_address.2
new file mode 100644
index 0000000..180b909
--- /dev/null
+++ b/man2/set_tid_address.2
@@ -0,0 +1,97 @@
+.\" Copyright (C) 2004 Andries Brouwer (aeb@cwi.nl)
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.TH set_tid_address 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+set_tid_address \- set pointer to thread ID
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */"
+.B #include <unistd.h>
+.PP
+.BI "pid_t syscall(SYS_set_tid_address, int *" tidptr );
+.fi
+.PP
+.IR Note :
+glibc provides no wrapper for
+.BR set_tid_address (),
+necessitating the use of
+.BR syscall (2).
+.SH DESCRIPTION
+For each thread, the kernel maintains two attributes (addresses) called
+.I set_child_tid
+and
+.IR clear_child_tid .
+These two attributes contain the value NULL by default.
+.TP
+.I set_child_tid
+If a thread is started using
+.BR clone (2)
+with the
+.B CLONE_CHILD_SETTID
+flag,
+.I set_child_tid
+is set to the value passed in the
+.I ctid
+argument of that system call.
+.IP
+When
+.I set_child_tid
+is set, the very first thing the new thread does
+is to write its thread ID at this address.
+.TP
+.I clear_child_tid
+If a thread is started using
+.BR clone (2)
+with the
+.B CLONE_CHILD_CLEARTID
+flag,
+.I clear_child_tid
+is set to the value passed in the
+.I ctid
+argument of that system call.
+.PP
+The system call
+.BR set_tid_address ()
+sets the
+.I clear_child_tid
+value for the calling thread to
+.IR tidptr .
+.PP
+When a thread whose
+.I clear_child_tid
+is not NULL terminates,
+if the thread is sharing memory with other threads,
+then 0 is written at the address specified in
+.I clear_child_tid
+and the kernel performs the following operation:
+.PP
+.in +4n
+.EX
+futex(clear_child_tid, FUTEX_WAKE, 1, NULL, NULL, 0);
+.EE
+.in
+.PP
+The effect of this operation is to wake a single thread that
+is performing a futex wait on the memory location.
+Errors from the futex wake operation are ignored.
+.SH RETURN VALUE
+.BR set_tid_address ()
+always returns the caller's thread ID.
+.SH ERRORS
+.BR set_tid_address ()
+always succeeds.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 2.5.48.
+.PP
+Details as given here are valid since Linux 2.5.49.
+.SH SEE ALSO
+.BR clone (2),
+.BR futex (2),
+.BR gettid (2)
diff --git a/man2/setdomainname.2 b/man2/setdomainname.2
new file mode 100644
index 0000000..1c1594c
--- /dev/null
+++ b/man2/setdomainname.2
@@ -0,0 +1 @@
+.so man2/getdomainname.2
diff --git a/man2/setegid.2 b/man2/setegid.2
new file mode 100644
index 0000000..85032b5
--- /dev/null
+++ b/man2/setegid.2
@@ -0,0 +1 @@
+.so man2/seteuid.2
diff --git a/man2/seteuid.2 b/man2/seteuid.2
new file mode 100644
index 0000000..0c47f6a
--- /dev/null
+++ b/man2/seteuid.2
@@ -0,0 +1,134 @@
+.\" Copyright (C) 2001 Andries Brouwer (aeb@cwi.nl)
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" [should really be seteuid.3]
+.\" Modified, 27 May 2004, Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Added notes on capability requirements
+.\"
+.TH seteuid 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+seteuid, setegid \- set effective user or group ID
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <unistd.h>
+.PP
+.BI "int seteuid(uid_t " euid );
+.BI "int setegid(gid_t " egid );
+.fi
+.PP
+.RS -4
+Feature Test Macro Requirements for glibc (see
+.BR feature_test_macros (7)):
+.RE
+.PP
+.BR seteuid (),
+.BR setegid ():
+.nf
+ _POSIX_C_SOURCE >= 200112L
+ || /* glibc <= 2.19: */ _BSD_SOURCE
+.fi
+.SH DESCRIPTION
+.BR seteuid ()
+sets the effective user ID of the calling process.
+Unprivileged processes may only set the effective user ID to the
+real user ID, the effective user ID or the saved set-user-ID.
+.PP
+Precisely the same holds for
+.BR setegid ()
+with "group" instead of "user".
+.\" When
+.\" .I euid
+.\" equals \-1, nothing is changed.
+.\" (This is an artifact of the implementation in glibc of seteuid()
+.\" using setresuid(2).)
+.SH RETURN VALUE
+On success, zero is returned.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.PP
+.IR Note :
+there are cases where
+.BR seteuid ()
+can fail even when the caller is UID 0;
+it is a grave security error to omit checking for a failure return from
+.BR seteuid ().
+.SH ERRORS
+.TP
+.B EINVAL
+The target user or group ID is not valid in this user namespace.
+.TP
+.B EPERM
+In the case of
+.BR seteuid ():
+the calling process is not privileged (does not have the
+.B CAP_SETUID
+capability in its user namespace) and
+.I euid
+does not match the current real user ID, current effective user ID,
+or current saved set-user-ID.
+.IP
+In the case of
+.BR setegid ():
+the calling process is not privileged (does not have the
+.B CAP_SETGID
+capability in its user namespace) and
+.I egid
+does not match the current real group ID, current effective group ID,
+or current saved set-group-ID.
+.SH VERSIONS
+Setting the effective user (group) ID to the
+saved set-user-ID (saved set-group-ID) is
+possible since Linux 1.1.37 (1.1.38).
+On an arbitrary system one should check
+.BR _POSIX_SAVED_IDS .
+.PP
+Under glibc 2.0,
+.BI seteuid( euid )
+is equivalent to
+.BI setreuid(\-1, " euid" )
+and hence may change the saved set-user-ID.
+Under glibc 2.1 and later, it is equivalent to
+.BI setresuid(\-1, " euid" ", \-1)"
+and hence does not change the saved set-user-ID.
+Analogous remarks hold for
+.BR setegid (),
+with the difference that the change in implementation from
+.BI setregid(\-1, " egid" )
+to
+.BI setresgid(\-1, " egid" ", \-1)"
+occurred in glibc 2.2 or 2.3 (depending on the hardware architecture).
+.PP
+According to POSIX.1,
+.BR seteuid ()
+.RB ( setegid ())
+need not permit
+.I euid
+.RI ( egid )
+to be the same value as the current effective user (group) ID,
+and some implementations do not permit this.
+.SS C library/kernel differences
+On Linux,
+.BR seteuid ()
+and
+.BR setegid ()
+are implemented as library functions that call, respectively,
+.BR setresuid (2)
+and
+.BR setresgid (2).
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+POSIX.1-2001, 4.3BSD.
+.SH SEE ALSO
+.BR geteuid (2),
+.BR setresuid (2),
+.BR setreuid (2),
+.BR setuid (2),
+.BR capabilities (7),
+.BR credentials (7),
+.BR user_namespaces (7)
diff --git a/man2/setfsgid.2 b/man2/setfsgid.2
new file mode 100644
index 0000000..43b5507
--- /dev/null
+++ b/man2/setfsgid.2
@@ -0,0 +1,109 @@
+.\" Copyright (C) 1995, Thomas K. Dyas <tdyas@eden.rutgers.edu>
+.\" and Copyright (C) 2019, Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Created 1995-08-06 Thomas K. Dyas <tdyas@eden.rutgers.edu>
+.\" Modified 2000-07-01 aeb
+.\" Modified 2002-07-23 aeb
+.\" Modified, 27 May 2004, Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Added notes on capability requirements
+.\"
+.TH setfsgid 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+setfsgid \- set group identity used for filesystem checks
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/fsuid.h>
+.PP
+.BI "[[deprecated]] int setfsgid(gid_t " fsgid );
+.fi
+.SH DESCRIPTION
+On Linux, a process has both a filesystem group ID and an effective group ID.
+The (Linux-specific) filesystem group ID is used
+for permissions checking when accessing filesystem objects,
+while the effective group ID is used for some other kinds
+of permissions checks (see
+.BR credentials (7)).
+.PP
+Normally, the value of the process's filesystem group ID
+is the same as the value of its effective group ID.
+This is so, because whenever a process's effective group ID is changed,
+the kernel also changes the filesystem group ID to be the same as
+the new value of the effective group ID.
+A process can cause the value of its filesystem group ID to diverge
+from its effective group ID by using
+.BR setfsgid ()
+to change its filesystem group ID to the value given in
+.IR fsgid .
+.PP
+.BR setfsgid ()
+will succeed only if the caller is the superuser or if
+.I fsgid
+matches either the caller's real group ID, effective group ID,
+saved set-group-ID, or current filesystem group ID.
+.SH RETURN VALUE
+On both success and failure,
+this call returns the previous filesystem group ID of the caller.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 1.2.
+.\" Linux 1.1.44
+.\" and in libc since libc 4.7.6.
+.SS C library/kernel differences
+In glibc 2.15 and earlier,
+when the wrapper for this system call determines that the argument can't be
+passed to the kernel without integer truncation (because the kernel
+is old and does not support 32-bit group IDs),
+it will return \-1 and set \fIerrno\fP to
+.B EINVAL
+without attempting
+the system call.
+.SH NOTES
+The filesystem group ID concept and the
+.BR setfsgid ()
+system call were invented for historical reasons that are
+no longer applicable on modern Linux kernels.
+See
+.BR setfsuid (2)
+for a discussion of why the use of both
+.BR setfsuid (2)
+and
+.BR setfsgid ()
+is nowadays unneeded.
+.PP
+The original Linux
+.BR setfsgid ()
+system call supported only 16-bit group IDs.
+Subsequently, Linux 2.4 added
+.BR setfsgid32 ()
+supporting 32-bit IDs.
+The glibc
+.BR setfsgid ()
+wrapper function transparently deals with the variation across kernel versions.
+.SH BUGS
+No error indications of any kind are returned to the caller,
+and the fact that both successful and unsuccessful calls return
+the same value makes it impossible to directly determine
+whether the call succeeded or failed.
+Instead, the caller must resort to looking at the return value
+from a further call such as
+.I setfsgid(\-1)
+(which will always fail), in order to determine if a preceding call to
+.BR setfsgid ()
+changed the filesystem group ID.
+At the very
+least,
+.B EPERM
+should be returned when the call fails (because the caller lacks the
+.B CAP_SETGID
+capability).
+.SH SEE ALSO
+.BR kill (2),
+.BR setfsuid (2),
+.BR capabilities (7),
+.BR credentials (7)
diff --git a/man2/setfsgid32.2 b/man2/setfsgid32.2
new file mode 100644
index 0000000..fdb8bdc
--- /dev/null
+++ b/man2/setfsgid32.2
@@ -0,0 +1 @@
+.so man2/setfsgid.2
diff --git a/man2/setfsuid.2 b/man2/setfsuid.2
new file mode 100644
index 0000000..56964b0
--- /dev/null
+++ b/man2/setfsuid.2
@@ -0,0 +1,127 @@
+.\" Copyright (C) 1995, Thomas K. Dyas <tdyas@eden.rutgers.edu>
+.\" and Copyright (C) 2013, 2019, Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Created 1995-08-06 Thomas K. Dyas <tdyas@eden.rutgers.edu>
+.\" Modified 2000-07-01 aeb
+.\" Modified 2002-07-23 aeb
+.\" Modified, 27 May 2004, Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Added notes on capability requirements
+.\"
+.TH setfsuid 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+setfsuid \- set user identity used for filesystem checks
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/fsuid.h>
+.PP
+.BI "[[deprecated]] int setfsuid(uid_t " fsuid );
+.fi
+.SH DESCRIPTION
+On Linux, a process has both a filesystem user ID and an effective user ID.
+The (Linux-specific) filesystem user ID is used
+for permissions checking when accessing filesystem objects,
+while the effective user ID is used for various other kinds
+of permissions checks (see
+.BR credentials (7)).
+.PP
+Normally, the value of the process's filesystem user ID
+is the same as the value of its effective user ID.
+This is so, because whenever a process's effective user ID is changed,
+the kernel also changes the filesystem user ID to be the same as
+the new value of the effective user ID.
+A process can cause the value of its filesystem user ID to diverge
+from its effective user ID by using
+.BR setfsuid ()
+to change its filesystem user ID to the value given in
+.IR fsuid .
+.PP
+Explicit calls to
+.BR setfsuid ()
+and
+.BR setfsgid (2)
+are (were) usually used only by programs such as the Linux NFS server that
+need to change what user and group ID is used for file access without a
+corresponding change in the real and effective user and group IDs.
+A change in the normal user IDs for a program such as the NFS server
+is (was) a security hole that can expose it to unwanted signals.
+(However, this issue is historical; see below.)
+.PP
+.BR setfsuid ()
+will succeed only if the caller is the superuser or if
+.I fsuid
+matches either the caller's real user ID, effective user ID,
+saved set-user-ID, or current filesystem user ID.
+.SH RETURN VALUE
+On both success and failure,
+this call returns the previous filesystem user ID of the caller.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 1.2.
+.\" Linux 1.1.44
+.\" and in libc since libc 4.7.6.
+.PP
+At the time when this system call was introduced, one process
+could send a signal to another process with the same effective user ID.
+This meant that if a privileged process changed its effective user ID
+for the purpose of file permission checking,
+then it could become vulnerable to receiving signals
+sent by another (unprivileged) process with the same user ID.
+The filesystem user ID attribute was thus added to allow a process to
+change its user ID for the purposes of file permission checking without
+at the same time becoming vulnerable to receiving unwanted signals.
+Since Linux 2.0, signal permission handling is different (see
+.BR kill (2)),
+with the result that a process can change its effective user ID
+without being vulnerable to receiving signals from unwanted processes.
+Thus,
+.BR setfsuid ()
+is nowadays unneeded and should be avoided in new applications
+(likewise for
+.BR setfsgid (2)).
+.PP
+The original Linux
+.BR setfsuid ()
+system call supported only 16-bit user IDs.
+Subsequently, Linux 2.4 added
+.BR setfsuid32 ()
+supporting 32-bit IDs.
+The glibc
+.BR setfsuid ()
+wrapper function transparently deals with the variation across kernel versions.
+.SS C library/kernel differences
+In glibc 2.15 and earlier,
+when the wrapper for this system call determines that the argument can't be
+passed to the kernel without integer truncation (because the kernel
+is old and does not support 32-bit user IDs),
+it will return \-1 and set \fIerrno\fP to
+.B EINVAL
+without attempting
+the system call.
+.SH BUGS
+No error indications of any kind are returned to the caller,
+and the fact that both successful and unsuccessful calls return
+the same value makes it impossible to directly determine
+whether the call succeeded or failed.
+Instead, the caller must resort to looking at the return value
+from a further call such as
+.I setfsuid(\-1)
+(which will always fail), in order to determine if a preceding call to
+.BR setfsuid ()
+changed the filesystem user ID.
+At the very
+least,
+.B EPERM
+should be returned when the call fails (because the caller lacks the
+.B CAP_SETUID
+capability).
+.SH SEE ALSO
+.BR kill (2),
+.BR setfsgid (2),
+.BR capabilities (7),
+.BR credentials (7)
diff --git a/man2/setfsuid32.2 b/man2/setfsuid32.2
new file mode 100644
index 0000000..1ea58fd
--- /dev/null
+++ b/man2/setfsuid32.2
@@ -0,0 +1 @@
+.so man2/setfsuid.2
diff --git a/man2/setgid.2 b/man2/setgid.2
new file mode 100644
index 0000000..f618887
--- /dev/null
+++ b/man2/setgid.2
@@ -0,0 +1,92 @@
+.\" Copyright (C), 1994, Graeme W. Wilford. (Wilf.)
+.\" and Copyright (C) 2010, 2015, Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Fri Jul 29th 12:56:44 BST 1994 Wilf. <G.Wilford@ee.surrey.ac.uk>
+.\" Modified 1997-01-31 by Eric S. Raymond <esr@thyrsus.com>
+.\" Modified 2002-03-09 by aeb
+.\"
+.TH setgid 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+setgid \- set group identity
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <unistd.h>
+.PP
+.BI "int setgid(gid_t " gid );
+.fi
+.SH DESCRIPTION
+.BR setgid ()
+sets the effective group ID of the calling process.
+If the calling process is privileged (more precisely: has the
+.B CAP_SETGID
+capability in its user namespace),
+the real GID and saved set-group-ID are also set.
+.PP
+Under Linux,
+.BR setgid ()
+is implemented like the POSIX version with the
+.B _POSIX_SAVED_IDS
+feature.
+This allows a set-group-ID program that is not set-user-ID-root
+to drop all of its group
+privileges, do some un-privileged work, and then reengage the original
+effective group ID in a secure manner.
+.SH RETURN VALUE
+On success, zero is returned.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EINVAL
+The group ID specified in
+.I gid
+is not valid in this user namespace.
+.TP
+.B EPERM
+The calling process is not privileged (does not have the
+\fBCAP_SETGID\fP capability in its user namespace), and
+.I gid
+does not match the real group ID or saved set-group-ID of
+the calling process.
+.SH VERSIONS
+.SS C library/kernel differences
+At the kernel level, user IDs and group IDs are a per-thread attribute.
+However, POSIX requires that all threads in a process
+share the same credentials.
+The NPTL threading implementation handles the POSIX requirements by
+providing wrapper functions for
+the various system calls that change process UIDs and GIDs.
+These wrapper functions (including the one for
+.BR setgid ())
+employ a signal-based technique to ensure
+that when one thread changes credentials,
+all of the other threads in the process also change their credentials.
+For details, see
+.BR nptl (7).
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+POSIX.1-2001, SVr4.
+.PP
+The original Linux
+.BR setgid ()
+system call supported only 16-bit group IDs.
+Subsequently, Linux 2.4 added
+.BR setgid32 ()
+supporting 32-bit IDs.
+The glibc
+.BR setgid ()
+wrapper function transparently deals with the variation across kernel versions.
+.SH SEE ALSO
+.BR getgid (2),
+.BR setegid (2),
+.BR setregid (2),
+.BR capabilities (7),
+.BR credentials (7),
+.BR user_namespaces (7)
diff --git a/man2/setgid32.2 b/man2/setgid32.2
new file mode 100644
index 0000000..bc8ef19
--- /dev/null
+++ b/man2/setgid32.2
@@ -0,0 +1 @@
+.so man2/setgid.2
diff --git a/man2/setgroups.2 b/man2/setgroups.2
new file mode 100644
index 0000000..0ae4cc0
--- /dev/null
+++ b/man2/setgroups.2
@@ -0,0 +1 @@
+.so man2/getgroups.2
diff --git a/man2/setgroups32.2 b/man2/setgroups32.2
new file mode 100644
index 0000000..478fb63
--- /dev/null
+++ b/man2/setgroups32.2
@@ -0,0 +1 @@
+.so man2/setgroups.2
diff --git a/man2/sethostname.2 b/man2/sethostname.2
new file mode 100644
index 0000000..e1fa2a6
--- /dev/null
+++ b/man2/sethostname.2
@@ -0,0 +1 @@
+.so man2/gethostname.2
diff --git a/man2/setitimer.2 b/man2/setitimer.2
new file mode 100644
index 0000000..9518567
--- /dev/null
+++ b/man2/setitimer.2
@@ -0,0 +1 @@
+.so man2/getitimer.2
diff --git a/man2/setns.2 b/man2/setns.2
new file mode 100644
index 0000000..13565de
--- /dev/null
+++ b/man2/setns.2
@@ -0,0 +1,419 @@
+.\" Copyright (C) 2011, Eric Biederman <ebiederm@xmission.com>
+.\" and Copyright (C) 2011, 2012, Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: GPL-2.0-only
+.\"
+.TH setns 2 2023-05-03 "Linux man-pages 6.05.01"
+.SH NAME
+setns \- reassociate thread with a namespace
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#define _GNU_SOURCE" " /* See feature_test_macros(7) */"
+.B #include <sched.h>
+.PP
+.BI "int setns(int " fd ", int " nstype );
+.fi
+.SH DESCRIPTION
+The
+.BR setns ()
+system call allows the calling thread to move into different namespaces.
+The
+.I fd
+argument is one of the following:
+.IP \[bu] 3
+a file descriptor referring to one of the magic links in a
+.IR /proc/ pid /ns/
+directory (or a bind mount to such a link);
+.IP \[bu]
+a PID file descriptor (see
+.BR pidfd_open (2)).
+.PP
+The
+.I nstype
+argument is interpreted differently in each case.
+.\"
+.SS fd refers to a \fI/proc/\fPpid\fI/ns/\fP link
+If
+.I fd
+refers to a
+.IR /proc/ pid /ns/
+link, then
+.BR setns ()
+reassociates the calling thread with the namespace associated with that link,
+subject to any constraints imposed by the
+.I nstype
+argument.
+In this usage, each call to
+.BR setns ()
+changes just one of the caller's namespace memberships.
+.PP
+The
+.I nstype
+argument specifies which type of namespace
+the calling thread may be reassociated with.
+This argument can have
+.I one
+of the following values:
+.TP
+.B 0
+Allow any type of namespace to be joined.
+.TP
+.BR CLONE_NEWCGROUP " (since Linux 4.6)"
+.I fd
+must refer to a cgroup namespace.
+.TP
+.BR CLONE_NEWIPC " (since Linux 3.0)"
+.I fd
+must refer to an IPC namespace.
+.TP
+.BR CLONE_NEWNET " (since Linux 3.0)"
+.I fd
+must refer to a network namespace.
+.TP
+.BR CLONE_NEWNS " (since Linux 3.8)"
+.I fd
+must refer to a mount namespace.
+.TP
+.BR CLONE_NEWPID " (since Linux 3.8)"
+.I fd
+must refer to a descendant PID namespace.
+.TP
+.BR CLONE_NEWTIME " (since Linux 5.8)"
+.\" commit 76c12881a38aaa83e1eb4ce2fada36c3a732bad4
+.I fd
+must refer to a time namespace.
+.TP
+.BR CLONE_NEWUSER " (since Linux 3.8)"
+.I fd
+must refer to a user namespace.
+.TP
+.BR CLONE_NEWUTS " (since Linux 3.0)"
+.I fd
+must refer to a UTS namespace.
+.PP
+Specifying
+.I nstype
+as 0 suffices if the caller knows (or does not care)
+what type of namespace is referred to by
+.IR fd .
+Specifying a nonzero value for
+.I nstype
+is useful if the caller does not know what type of namespace is referred to by
+.I fd
+and wants to ensure that the namespace is of a particular type.
+(The caller might not know the type of the namespace referred to by
+.I fd
+if the file descriptor was opened by another process and, for example,
+passed to the caller via a UNIX domain socket.)
+.\"
+.SS fd is a PID file descriptor
+Since Linux 5.8,
+.I fd
+may refer to a PID file descriptor obtained from
+.BR pidfd_open (2)
+or
+.BR clone (2).
+In this usage,
+.BR setns ()
+atomically moves the calling thread into one or more of the same namespaces
+as the thread referred to by
+.IR fd .
+.PP
+The
+.I nstype
+argument is a bit mask specified by ORing together
+.I "one or more"
+of the
+.B CLONE_NEW*
+namespace constants listed above.
+The caller is moved into each of the target thread's namespaces
+that is specified in
+.IR nstype ;
+the caller's memberships in the remaining namespaces are left unchanged.
+.PP
+For example, the following code would move the caller into the
+same user, network, and UTS namespaces as PID 1234,
+but would leave the caller's other namespace memberships unchanged:
+.PP
+.in +4n
+.EX
+int fd = pidfd_open(1234, 0);
+setns(fd, CLONE_NEWUSER | CLONE_NEWNET | CLONE_NEWUTS);
+.EE
+.in
+.\"
+.SS Details for specific namespace types
+Note the following details and restrictions when reassociating with
+specific namespace types:
+.TP
+User namespaces
+A process reassociating itself with a user namespace must have the
+.B CAP_SYS_ADMIN
+.\" See kernel/user_namespace.c:userns_install() [3.8 source]
+capability in the target user namespace.
+(This necessarily implies that it is only possible to join
+a descendant user namespace.)
+Upon successfully joining a user namespace,
+a process is granted all capabilities in that namespace,
+regardless of its user and group IDs.
+.IP
+A multithreaded process may not change user namespace with
+.BR setns ().
+.IP
+It is not permitted to use
+.BR setns ()
+to reenter the caller's current user namespace.
+This prevents a caller that has dropped capabilities from regaining
+those capabilities via a call to
+.BR setns ().
+.IP
+For security reasons,
+.\" commit e66eded8309ebf679d3d3c1f5820d1f2ca332c71
+.\" https://lwn.net/Articles/543273/
+a process can't join a new user namespace if it is sharing
+filesystem-related attributes
+(the attributes whose sharing is controlled by the
+.BR clone (2)
+.B CLONE_FS
+flag) with another process.
+.IP
+For further details on user namespaces, see
+.BR user_namespaces (7).
+.TP
+Mount namespaces
+Changing the mount namespace requires that the caller possess both
+.B CAP_SYS_CHROOT
+and
+.B CAP_SYS_ADMIN
+capabilities in its own user namespace and
+.B CAP_SYS_ADMIN
+in the user namespace that owns the target mount namespace.
+.IP
+A process can't join a new mount namespace if it is sharing
+filesystem-related attributes
+(the attributes whose sharing is controlled by the
+.BR clone (2)
+.B CLONE_FS
+flag) with another process.
+.\" Above check is in fs/namespace.c:mntns_install() [3.8 source]
+.IP
+See
+.BR user_namespaces (7)
+for details on the interaction of user namespaces and mount namespaces.
+.TP
+PID namespaces
+In order to reassociate itself with a new PID namespace,
+the caller must have the
+.B CAP_SYS_ADMIN
+capability both in its own user namespace and in the user namespace
+that owns the target PID namespace.
+.IP
+Reassociating with a PID namespace behaves somewhat differently
+from reassociating with other namespace types.
+Reassociating the calling thread with a PID namespace changes only
+the PID namespace that subsequently created child processes of
+the caller will be placed in;
+it does not change the PID namespace of the caller itself.
+.IP
+Reassociating with a PID namespace is allowed only if the target
+PID namespace is a descendant (child, grandchild, etc.)
+of, or is the same as, the current PID namespace of the caller.
+.IP
+For further details on PID namespaces, see
+.BR pid_namespaces (7).
+.TP
+Cgroup namespaces
+In order to reassociate itself with a new cgroup namespace,
+the caller must have the
+.B CAP_SYS_ADMIN
+capability both in its own user namespace and in the user namespace
+that owns the target cgroup namespace.
+.IP
+Using
+.BR setns ()
+to change the caller's cgroup namespace does not change
+the caller's cgroup memberships.
+.TP
+Network, IPC, time, and UTS namespaces
+In order to reassociate itself with a new network, IPC, time, or UTS namespace,
+the caller must have the
+.B CAP_SYS_ADMIN
+capability both in its own user namespace and in the user namespace
+that owns the target namespace.
+.SH RETURN VALUE
+On success,
+.BR setns ()
+returns 0.
+On failure, \-1 is returned and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EBADF
+.I fd
+is not a valid file descriptor.
+.TP
+.B EINVAL
+.I fd
+refers to a namespace whose type does not match that specified in
+.IR nstype .
+.TP
+.B EINVAL
+There is a problem with reassociating
+the thread with the specified namespace.
+.TP
+.\" See kernel/pid_namespace.c::pidns_install() [kernel 3.18 sources]
+.B EINVAL
+The caller tried to join an ancestor (parent, grandparent, and so on)
+PID namespace.
+.TP
+.B EINVAL
+The caller attempted to join the user namespace
+in which it is already a member.
+.TP
+.B EINVAL
+.\" commit e66eded8309ebf679d3d3c1f5820d1f2ca332c71
+The caller shares filesystem
+.RB ( CLONE_FS )
+state (in particular, the root directory)
+with other processes and tried to join a new user namespace.
+.TP
+.B EINVAL
+.\" See kernel/user_namespace.c::userns_install() [kernel 3.15 sources]
+The caller is multithreaded and tried to join a new user namespace.
+.TP
+.B EINVAL
+.I fd
+is a PID file descriptor and
+.I nstype
+is invalid (e.g., it is 0).
+.TP
+.B ENOMEM
+Cannot allocate sufficient memory to change the specified namespace.
+.TP
+.B EPERM
+The calling thread did not have the required capability
+for this operation.
+.TP
+.B ESRCH
+.I fd
+is a PID file descriptor but the process it refers to no longer exists
+(i.e., it has terminated and been waited on).
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 3.0,
+glibc 2.14.
+.SH NOTES
+For further information on the
+.IR /proc/ pid /ns/
+magic links, see
+.BR namespaces (7).
+.PP
+Not all of the attributes that can be shared when
+a new thread is created using
+.BR clone (2)
+can be changed using
+.BR setns ().
+.SH EXAMPLES
+The program below takes two or more arguments.
+The first argument specifies the pathname of a namespace file in an existing
+.IR /proc/ pid /ns/
+directory.
+The remaining arguments specify a command and its arguments.
+The program opens the namespace file, joins that namespace using
+.BR setns (),
+and executes the specified command inside that namespace.
+.PP
+The following shell session demonstrates the use of this program
+(compiled as a binary named
+.IR ns_exec )
+in conjunction with the
+.B CLONE_NEWUTS
+example program in the
+.BR clone (2)
+man page (compiled as a binary named
+.IR newuts ).
+.PP
+We begin by executing the example program in
+.BR clone (2)
+in the background.
+That program creates a child in a separate UTS namespace.
+The child changes the hostname in its namespace,
+and then both processes display the hostnames in their UTS namespaces,
+so that we can see that they are different.
+.PP
+.in +4n
+.EX
+$ \fBsu\fP # Need privilege for namespace operations
+Password:
+# \fB./newuts bizarro &\fP
+[1] 3549
+clone() returned 3550
+uts.nodename in child: bizarro
+uts.nodename in parent: antero
+# \fBuname \-n\fP # Verify hostname in the shell
+antero
+.EE
+.in
+.PP
+We then run the program shown below,
+using it to execute a shell.
+Inside that shell, we verify that the hostname is the one
+set by the child created by the first program:
+.PP
+.in +4n
+.EX
+# \fB./ns_exec /proc/3550/ns/uts /bin/bash\fP
+# \fBuname \-n\fP # Executed in shell started by ns_exec
+bizarro
+.EE
+.in
+.SS Program source
+.\" SRC BEGIN (setns.c)
+.EX
+#define _GNU_SOURCE
+#include <err.h>
+#include <fcntl.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+\&
+int
+main(int argc, char *argv[])
+{
+ int fd;
+\&
+ if (argc < 3) {
+ fprintf(stderr, "%s /proc/PID/ns/FILE cmd args...\en", argv[0]);
+ exit(EXIT_FAILURE);
+ }
+\&
+ /* Get file descriptor for namespace; the file descriptor is opened
+ with O_CLOEXEC so as to ensure that it is not inherited by the
+ program that is later executed. */
+\&
+ fd = open(argv[1], O_RDONLY | O_CLOEXEC);
+ if (fd == \-1)
+ err(EXIT_FAILURE, "open");
+\&
+ if (setns(fd, 0) == \-1) /* Join that namespace */
+ err(EXIT_FAILURE, "setns");
+\&
+ execvp(argv[2], &argv[2]); /* Execute a command in namespace */
+ err(EXIT_FAILURE, "execvp");
+}
+.EE
+.\" SRC END
+.SH SEE ALSO
+.BR nsenter (1),
+.BR clone (2),
+.BR fork (2),
+.BR unshare (2),
+.BR vfork (2),
+.BR namespaces (7),
+.BR unix (7)
diff --git a/man2/setpgid.2 b/man2/setpgid.2
new file mode 100644
index 0000000..2d8bc96
--- /dev/null
+++ b/man2/setpgid.2
@@ -0,0 +1,329 @@
+.\" Copyright (c) 1983, 1991 Regents of the University of California.
+.\" and Copyright (C) 2007, Michael Kerrisk <mtk.manpages@gmail.com>
+.\" All rights reserved.
+.\"
+.\" SPDX-License-Identifier: BSD-4-Clause-UC
+.\"
+.\" @(#)getpgrp.2 6.4 (Berkeley) 3/10/91
+.\"
+.\" Modified 1993-07-24 by Rik Faith <faith@cs.unc.edu>
+.\" Modified 1995-04-15 by Michael Chastain <mec@shell.portal.com>:
+.\" Added 'getpgid'.
+.\" Modified 1996-07-21 by Andries Brouwer <aeb@cwi.nl>
+.\" Modified 1996-11-06 by Eric S. Raymond <esr@thyrsus.com>
+.\" Modified 1999-09-02 by Michael Haardt <michael@moria.de>
+.\" Modified 2002-01-18 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Modified 2003-01-20 by Andries Brouwer <aeb@cwi.nl>
+.\" 2007-07-25, mtk, fairly substantial rewrites and rearrangements
+.\" of text.
+.\"
+.TH setpgid 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+setpgid, getpgid, setpgrp, getpgrp \- set/get process group
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <unistd.h>
+.PP
+.BI "int setpgid(pid_t " pid ", pid_t " pgid );
+.BI "pid_t getpgid(pid_t " pid );
+.PP
+.BR "pid_t getpgrp(void);" " /* POSIX.1 version */"
+.BI "[[deprecated]] pid_t getpgrp(pid_t " pid ");\fR /* BSD version */"
+.PP
+.BR "int setpgrp(void);" " /* System V version */"
+.BI "[[deprecated]] int setpgrp(pid_t " pid ", pid_t " pgid ");\fR /* BSD version */"
+.fi
+.PP
+.RS -4
+Feature Test Macro Requirements for glibc (see
+.BR feature_test_macros (7)):
+.RE
+.PP
+.BR getpgid ():
+.nf
+ _XOPEN_SOURCE >= 500
+.\" || _XOPEN_SOURCE && _XOPEN_SOURCE_EXTENDED
+ || /* Since glibc 2.12: */ _POSIX_C_SOURCE >= 200809L
+.fi
+.PP
+.BR setpgrp "() (POSIX.1):"
+.nf
+ _XOPEN_SOURCE >= 500
+.\" || _XOPEN_SOURCE && _XOPEN_SOURCE_EXTENDED
+ || /* Since glibc 2.19: */ _DEFAULT_SOURCE
+ || /* glibc <= 2.19: */ _SVID_SOURCE
+.fi
+.PP
+.BR setpgrp "() (BSD),"
+.BR getpgrp "() (BSD):"
+.nf
+ [These are available only before glibc 2.19]
+ _BSD_SOURCE &&
+ ! (_POSIX_SOURCE || _POSIX_C_SOURCE || _XOPEN_SOURCE
+ || _GNU_SOURCE || _SVID_SOURCE)
+.fi
+.SH DESCRIPTION
+All of these interfaces are available on Linux,
+and are used for getting and setting the
+process group ID (PGID) of a process.
+The preferred, POSIX.1-specified ways of doing this are:
+.BR getpgrp (void),
+for retrieving the calling process's PGID; and
+.BR setpgid (),
+for setting a process's PGID.
+.PP
+.BR setpgid ()
+sets the PGID of the process specified by
+.I pid
+to
+.IR pgid .
+If
+.I pid
+is zero, then the process ID of the calling process is used.
+If
+.I pgid
+is zero, then the PGID of the process specified by
+.I pid
+is made the same as its process ID.
+If
+.BR setpgid ()
+is used to move a process from one process
+group to another (as is done by some shells when creating pipelines),
+both process groups must be part of the same session (see
+.BR setsid (2)
+and
+.BR credentials (7)).
+In this case,
+the \fIpgid\fP specifies an existing process group to be joined and the
+session ID of that group must match the session ID of the joining process.
+.PP
+The POSIX.1 version of
+.BR getpgrp (),
+which takes no arguments,
+returns the PGID of the calling process.
+.PP
+.BR getpgid ()
+returns the PGID of the process specified by
+.IR pid .
+If
+.I pid
+is zero, the process ID of the calling process is used.
+(Retrieving the PGID of a process other than the caller is rarely
+necessary, and the POSIX.1
+.BR getpgrp ()
+is preferred for retrieving the PGID of the caller.)
+.PP
+The System\ V-style
+.BR setpgrp (),
+which takes no arguments, is equivalent to
+.IR "setpgid(0,\ 0)" .
+.PP
+The BSD-specific
+.BR setpgrp ()
+call, which takes arguments
+.I pid
+and
+.IR pgid ,
+is a wrapper function that calls
+.PP
+.in +4n
+.EX
+setpgid(pid, pgid)
+.EE
+.in
+.PP
+.\" The true BSD setpgrp() system call differs in allowing the PGID
+.\" to be set to arbitrary values, rather than being restricted to
+.\" PGIDs in the same session.
+Since glibc 2.19, the BSD-specific
+.BR setpgrp ()
+function is no longer exposed by
+.IR <unistd.h> ;
+calls should be replaced with the
+.BR setpgid ()
+call shown above.
+.PP
+The BSD-specific
+.BR getpgrp ()
+call, which takes a single
+.I pid
+argument, is a wrapper function that calls
+.PP
+.in +4n
+.EX
+getpgid(pid)
+.EE
+.in
+.PP
+Since glibc 2.19, the BSD-specific
+.BR getpgrp ()
+function is no longer exposed by
+.IR <unistd.h> ;
+calls should be replaced with calls to the POSIX.1
+.BR getpgrp ()
+which takes no arguments (if the intent is to obtain the caller's PGID),
+or with the
+.BR getpgid ()
+call shown above.
+.SH RETURN VALUE
+On success,
+.BR setpgid ()
+and
+.BR setpgrp ()
+return zero.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.PP
+The POSIX.1
+.BR getpgrp ()
+always returns the PGID of the caller.
+.PP
+.BR getpgid ()
+and the BSD-specific
+.BR getpgrp ()
+return a process group ID on success.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EACCES
+An attempt was made to change the process group ID
+of one of the children of the calling process and the child had
+already performed an
+.BR execve (2)
+.RB ( setpgid (),
+.BR setpgrp ()).
+.TP
+.B EINVAL
+.I pgid
+is less than 0
+.RB ( setpgid (),
+.BR setpgrp ()).
+.TP
+.B EPERM
+An attempt was made to move a process into a process group in a
+different session, or to change the process
+group ID of one of the children of the calling process and the
+child was in a different session, or to change the process group ID of
+a session leader
+.RB ( setpgid (),
+.BR setpgrp ()).
+.TP
+.B EPERM
+The target process group does not exist
+.RB ( setpgid (),
+.BR setpgrp ()).
+.TP
+.B ESRCH
+For
+.BR getpgid ():
+.I pid
+does not match any process.
+For
+.BR setpgid ():
+.I pid
+is not the calling process and not a child of the calling process.
+.SH STANDARDS
+.TP
+.BR getpgid ()
+.TQ
+.BR setpgid ()
+.TQ
+.BR getpgrp "() (no args)"
+.TQ
+.BR setpgrp "() (no args)"
+POSIX.1-2008 (but see HISTORY).
+.TP
+.BR setpgrp "() (2 args)"
+.TQ
+.BR getpgrp "() (1 arg)"
+None.
+.SH HISTORY
+.TP
+.BR getpgid ()
+.TQ
+.BR setpgid ()
+.TQ
+.BR getpgrp "() (no args)"
+POSIX.1-2001.
+.TP
+.BR setpgrp "() (no args)"
+POSIX.1-2001.
+POSIX.1-2008 marks it as obsolete.
+.TP
+.BR setpgrp "() (2 args)"
+.TQ
+.BR getpgrp "() (1 arg)"
+4.2BSD.
+.SH NOTES
+A child created via
+.BR fork (2)
+inherits its parent's process group ID.
+The PGID is preserved across an
+.BR execve (2).
+.PP
+Each process group is a member of a session and each process is a
+member of the session of which its process group is a member.
+(See
+.BR credentials (7).)
+.PP
+A session can have a controlling terminal.
+At any time, one (and only one) of the process groups
+in the session can be the foreground process group
+for the terminal;
+the remaining process groups are in the background.
+If a signal is generated from the terminal (e.g., typing the
+interrupt key to generate
+.BR SIGINT ),
+that signal is sent to the foreground process group.
+(See
+.BR termios (3)
+for a description of the characters that generate signals.)
+Only the foreground process group may
+.BR read (2)
+from the terminal;
+if a background process group tries to
+.BR read (2)
+from the terminal, then the group is sent a
+.B SIGTTIN
+signal, which suspends it.
+The
+.BR tcgetpgrp (3)
+and
+.BR tcsetpgrp (3)
+functions are used to get/set the foreground
+process group of the controlling terminal.
+.PP
+The
+.BR setpgid ()
+and
+.BR getpgrp ()
+calls are used by programs such as
+.BR bash (1)
+to create process groups in order to implement shell job control.
+.PP
+If the termination of a process causes a process group to become orphaned,
+and if any member of the newly orphaned process group is stopped, then a
+.B SIGHUP
+signal followed by a
+.B SIGCONT
+signal will be sent to each process
+in the newly orphaned process group.
+.\" exit.3 refers to the following text:
+An orphaned process group is one in which the parent of
+every member of the process group is either itself also a member
+of the process group or is a member of a process group
+in a different session (see also
+.BR credentials (7)).
+.SH SEE ALSO
+.BR getuid (2),
+.BR setsid (2),
+.BR tcgetpgrp (3),
+.BR tcsetpgrp (3),
+.BR termios (3),
+.BR credentials (7)
diff --git a/man2/setpgrp.2 b/man2/setpgrp.2
new file mode 100644
index 0000000..d6b107a
--- /dev/null
+++ b/man2/setpgrp.2
@@ -0,0 +1 @@
+.so man2/setpgid.2
diff --git a/man2/setpriority.2 b/man2/setpriority.2
new file mode 100644
index 0000000..b1dcfd9
--- /dev/null
+++ b/man2/setpriority.2
@@ -0,0 +1 @@
+.so man2/getpriority.2
diff --git a/man2/setregid.2 b/man2/setregid.2
new file mode 100644
index 0000000..ec3ff64
--- /dev/null
+++ b/man2/setregid.2
@@ -0,0 +1 @@
+.so man2/setreuid.2
diff --git a/man2/setregid32.2 b/man2/setregid32.2
new file mode 100644
index 0000000..035df17
--- /dev/null
+++ b/man2/setregid32.2
@@ -0,0 +1 @@
+.so man2/setregid.2
diff --git a/man2/setresgid.2 b/man2/setresgid.2
new file mode 100644
index 0000000..d6866a1
--- /dev/null
+++ b/man2/setresgid.2
@@ -0,0 +1 @@
+.so man2/setresuid.2
diff --git a/man2/setresgid32.2 b/man2/setresgid32.2
new file mode 100644
index 0000000..dec1b95
--- /dev/null
+++ b/man2/setresgid32.2
@@ -0,0 +1 @@
+.so man2/setresgid.2
diff --git a/man2/setresuid.2 b/man2/setresuid.2
new file mode 100644
index 0000000..97f0af9
--- /dev/null
+++ b/man2/setresuid.2
@@ -0,0 +1,147 @@
+.\" Copyright (C) 1997 Andries Brouwer (aeb@cwi.nl)
+.\" and Copyright (C) 2005, 2010, 2014, 2015, Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified, 2003-05-26, Michael Kerrisk, <mtk.manpages@gmail.com>
+.TH setresuid 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+setresuid, setresgid \- set real, effective, and saved user or group ID
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#define _GNU_SOURCE" " /* See feature_test_macros(7) */"
+.B #include <unistd.h>
+.PP
+.BI "int setresuid(uid_t " ruid ", uid_t " euid ", uid_t " suid );
+.BI "int setresgid(gid_t " rgid ", gid_t " egid ", gid_t " sgid );
+.fi
+.SH DESCRIPTION
+.BR setresuid ()
+sets the real user ID, the effective user ID, and the
+saved set-user-ID of the calling process.
+.PP
+An unprivileged process may change its real UID,
+effective UID, and saved set-user-ID, each to one of:
+the current real UID, the current effective UID, or the
+current saved set-user-ID.
+.PP
+A privileged process (on Linux, one having the \fBCAP_SETUID\fP capability)
+may set its real UID, effective UID, and
+saved set-user-ID to arbitrary values.
+.PP
+If one of the arguments equals \-1, the corresponding value is not changed.
+.PP
+Regardless of what changes are made to the real UID, effective UID,
+and saved set-user-ID, the filesystem UID is always set to the same
+value as the (possibly new) effective UID.
+.PP
+Completely analogously,
+.BR setresgid ()
+sets the real GID, effective GID, and saved set-group-ID
+of the calling process (and always modifies the filesystem GID
+to be the same as the effective GID),
+with the same restrictions for unprivileged processes.
+.SH RETURN VALUE
+On success, zero is returned.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.PP
+.IR Note :
+there are cases where
+.BR setresuid ()
+can fail even when the caller is UID 0;
+it is a grave security error to omit checking for a failure return from
+.BR setresuid ().
+.SH ERRORS
+.TP
+.B EAGAIN
+The call would change the caller's real UID (i.e.,
+.I ruid
+does not match the caller's real UID),
+but there was a temporary failure allocating the
+necessary kernel data structures.
+.TP
+.B EAGAIN
+.I ruid
+does not match the caller's real UID and this call would
+bring the number of processes belonging to the real user ID
+.I ruid
+over the caller's
+.B RLIMIT_NPROC
+resource limit.
+Since Linux 3.1, this error case no longer occurs
+(but robust applications should check for this error);
+see the description of
+.B EAGAIN
+in
+.BR execve (2).
+.TP
+.B EINVAL
+One or more of the target user or group IDs
+is not valid in this user namespace.
+.TP
+.B EPERM
+The calling process is not privileged (did not have the necessary
+capability in its user namespace)
+and tried to change the IDs to values that are not permitted.
+For
+.BR setresuid (),
+the necessary capability is
+.BR CAP_SETUID ;
+for
+.BR setresgid (),
+it is
+.BR CAP_SETGID .
+.SH VERSIONS
+.SS C library/kernel differences
+At the kernel level, user IDs and group IDs are a per-thread attribute.
+However, POSIX requires that all threads in a process
+share the same credentials.
+The NPTL threading implementation handles the POSIX requirements by
+providing wrapper functions for
+the various system calls that change process UIDs and GIDs.
+These wrapper functions (including those for
+.BR setresuid ()
+and
+.BR setresgid ())
+employ a signal-based technique to ensure
+that when one thread changes credentials,
+all of the other threads in the process also change their credentials.
+For details, see
+.BR nptl (7).
+.SH STANDARDS
+None.
+.SH HISTORY
+Linux 2.1.44,
+glibc 2.3.2.
+HP-UX, FreeBSD.
+.PP
+The original Linux
+.BR setresuid ()
+and
+.BR setresgid ()
+system calls supported only 16-bit user and group IDs.
+Subsequently, Linux 2.4 added
+.BR setresuid32 ()
+and
+.BR setresgid32 (),
+supporting 32-bit IDs.
+The glibc
+.BR setresuid ()
+and
+.BR setresgid ()
+wrapper functions transparently deal with the variations across kernel versions.
+.SH SEE ALSO
+.BR getresuid (2),
+.BR getuid (2),
+.BR setfsgid (2),
+.BR setfsuid (2),
+.BR setreuid (2),
+.BR setuid (2),
+.BR capabilities (7),
+.BR credentials (7),
+.BR user_namespaces (7)
diff --git a/man2/setresuid32.2 b/man2/setresuid32.2
new file mode 100644
index 0000000..d6866a1
--- /dev/null
+++ b/man2/setresuid32.2
@@ -0,0 +1 @@
+.so man2/setresuid.2
diff --git a/man2/setreuid.2 b/man2/setreuid.2
new file mode 100644
index 0000000..121deb4
--- /dev/null
+++ b/man2/setreuid.2
@@ -0,0 +1,193 @@
+.\" Copyright (c) 1983, 1991 The Regents of the University of California.
+.\" and Copyright (C) 2009, 2010, 2014, 2015, Michael Kerrisk <mtk.manpages@gmail.com>
+.\" All rights reserved.
+.\"
+.\" SPDX-License-Identifier: BSD-4-Clause-UC
+.\"
+.\" @(#)setregid.2 6.4 (Berkeley) 3/10/91
+.\"
+.\" Modified Sat Jul 24 09:08:49 1993 by Rik Faith <faith@cs.unc.edu>
+.\" Portions extracted from linux/kernel/sys.c:
+.\" Copyright (C) 1991, 1992 Linus Torvalds
+.\" May be distributed under the GNU General Public License
+.\" Changes: 1994-07-29 by Wilf <G.Wilford@ee.surrey.ac.uk>
+.\" 1994-08-02 by Wilf due to change in kernel.
+.\" 2004-07-04 by aeb
+.\" 2004-05-27 by Michael Kerrisk
+.\"
+.TH setreuid 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+setreuid, setregid \- set real and/or effective user or group ID
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <unistd.h>
+.PP
+.BI "int setreuid(uid_t " ruid ", uid_t " euid );
+.BI "int setregid(gid_t " rgid ", gid_t " egid );
+.fi
+.PP
+.RS -4
+Feature Test Macro Requirements for glibc (see
+.BR feature_test_macros (7)):
+.RE
+.PP
+.BR setreuid (),
+.BR setregid ():
+.nf
+ _XOPEN_SOURCE >= 500
+.\" || _XOPEN_SOURCE && _XOPEN_SOURCE_EXTENDED
+ || /* Since glibc 2.19: */ _DEFAULT_SOURCE
+ || /* glibc <= 2.19: */ _BSD_SOURCE
+.fi
+.SH DESCRIPTION
+.BR setreuid ()
+sets real and effective user IDs of the calling process.
+.PP
+Supplying a value of \-1 for either the real or effective user ID forces
+the system to leave that ID unchanged.
+.PP
+Unprivileged processes may only set the effective user ID to the real user ID,
+the effective user ID, or the saved set-user-ID.
+.PP
+Unprivileged processes may only set the real user ID to
+the real user ID or the effective user ID.
+.PP
+If the real user ID is set (i.e.,
+.I ruid
+is not \-1) or the effective user ID is set to a value
+not equal to the previous real user ID,
+the saved set-user-ID will be set to the new effective user ID.
+.PP
+Completely analogously,
+.BR setregid ()
+sets real and effective group IDs of the calling process,
+and all of the above holds with "group" instead of "user".
+.SH RETURN VALUE
+On success, zero is returned.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.PP
+.IR Note :
+there are cases where
+.BR setreuid ()
+can fail even when the caller is UID 0;
+it is a grave security error to omit checking for a failure return from
+.BR setreuid ().
+.SH ERRORS
+.TP
+.B EAGAIN
+The call would change the caller's real UID (i.e.,
+.I ruid
+does not match the caller's real UID),
+but there was a temporary failure allocating the
+necessary kernel data structures.
+.TP
+.B EAGAIN
+.I ruid
+does not match the caller's real UID and this call would
+bring the number of processes belonging to the real user ID
+.I ruid
+over the caller's
+.B RLIMIT_NPROC
+resource limit.
+Since Linux 3.1, this error case no longer occurs
+(but robust applications should check for this error);
+see the description of
+.B EAGAIN
+in
+.BR execve (2).
+.TP
+.B EINVAL
+One or more of the target user or group IDs
+is not valid in this user namespace.
+.TP
+.B EPERM
+The calling process is not privileged
+(on Linux, does not have the necessary capability in its user namespace:
+.B CAP_SETUID
+in the case of
+.BR setreuid (),
+or
+.B CAP_SETGID
+in the case of
+.BR setregid ())
+and a change other than (i)
+swapping the effective user (group) ID with the real user (group) ID,
+or (ii) setting one to the value of the other or (iii) setting the
+effective user (group) ID to the value of the
+saved set-user-ID (saved set-group-ID) was specified.
+.SH VERSIONS
+POSIX.1 does not specify all of the UID changes that Linux permits
+for an unprivileged process.
+For
+.BR setreuid (),
+the effective user ID can be made the same as the
+real user ID or the saved set-user-ID,
+and it is unspecified whether unprivileged processes may set the
+real user ID to the real user ID, the effective user ID, or the
+saved set-user-ID.
+For
+.BR setregid (),
+the real group ID can be changed to the value of the saved set-group-ID,
+and the effective group ID can be changed to the value of
+the real group ID or the saved set-group-ID.
+The precise details of what ID changes are permitted vary
+across implementations.
+.PP
+POSIX.1 makes no specification about the effect of these calls
+on the saved set-user-ID and saved set-group-ID.
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+POSIX.1-2001, 4.3BSD (first appeared in 4.2BSD).
+.PP
+Setting the effective user (group) ID to the
+saved set-user-ID (saved set-group-ID) is
+possible since Linux 1.1.37 (1.1.38).
+.PP
+The original Linux
+.BR setreuid ()
+and
+.BR setregid ()
+system calls supported only 16-bit user and group IDs.
+Subsequently, Linux 2.4 added
+.BR setreuid32 ()
+and
+.BR setregid32 (),
+supporting 32-bit IDs.
+The glibc
+.BR setreuid ()
+and
+.BR setregid ()
+wrapper functions transparently deal with the variations across kernel versions.
+.\"
+.SS C library/kernel differences
+At the kernel level, user IDs and group IDs are a per-thread attribute.
+However, POSIX requires that all threads in a process
+share the same credentials.
+The NPTL threading implementation handles the POSIX requirements by
+providing wrapper functions for
+the various system calls that change process UIDs and GIDs.
+These wrapper functions (including those for
+.BR setreuid ()
+and
+.BR setregid ())
+employ a signal-based technique to ensure
+that when one thread changes credentials,
+all of the other threads in the process also change their credentials.
+For details, see
+.BR nptl (7).
+.SH SEE ALSO
+.BR getgid (2),
+.BR getuid (2),
+.BR seteuid (2),
+.BR setgid (2),
+.BR setresuid (2),
+.BR setuid (2),
+.BR capabilities (7),
+.BR credentials (7),
+.BR user_namespaces (7)
diff --git a/man2/setreuid32.2 b/man2/setreuid32.2
new file mode 100644
index 0000000..ec3ff64
--- /dev/null
+++ b/man2/setreuid32.2
@@ -0,0 +1 @@
+.so man2/setreuid.2
diff --git a/man2/setrlimit.2 b/man2/setrlimit.2
new file mode 100644
index 0000000..df6d736
--- /dev/null
+++ b/man2/setrlimit.2
@@ -0,0 +1 @@
+.so man2/getrlimit.2
diff --git a/man2/setsid.2 b/man2/setsid.2
new file mode 100644
index 0000000..ae04028
--- /dev/null
+++ b/man2/setsid.2
@@ -0,0 +1,100 @@
+.\" Copyright Michael Haardt (michael@cantor.informatik.rwth-aachen.de)
+.\" Sat Aug 27 20:43:50 MET DST 1994
+.\" and Copyright (C) 2014, Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: GPL-2.0-or-later
+.\"
+.\" Modified Sun Sep 11 19:19:05 1994 <faith@cs.unc.edu>
+.\" Modified Mon Mar 25 10:19:00 1996 <aeb@cwi.nl> (merged a few
+.\" tiny changes from a man page by Charles Livingston).
+.\" Modified Sun Jul 21 14:45:46 1996 <aeb@cwi.nl>
+.\"
+.TH setsid 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+setsid \- creates a session and sets the process group ID
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <unistd.h>
+.PP
+.B pid_t setsid(void);
+.fi
+.SH DESCRIPTION
+.BR setsid ()
+creates a new session if the calling process is not a
+process group leader.
+The calling process is the leader of the new session
+(i.e., its session ID is made the same as its process ID).
+The calling process also becomes
+the process group leader of a new process group in the session
+(i.e., its process group ID is made the same as its process ID).
+.PP
+The calling process will be the only process in
+the new process group and in the new session.
+.PP
+Initially, the new session has no controlling terminal.
+For details of how a session acquires a controlling terminal, see
+.BR credentials (7).
+.SH RETURN VALUE
+On success, the (new) session ID of the calling process is returned.
+On error,
+.I "(pid_t)\ \-1"
+is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EPERM
+The process group ID of any process equals the PID of the calling process.
+Thus, in particular,
+.BR setsid ()
+fails if the calling process is already a process group leader.
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+POSIX.1-2001, SVr4.
+.SH NOTES
+A child created via
+.BR fork (2)
+inherits its parent's session ID.
+The session ID is preserved across an
+.BR execve (2).
+.PP
+A process group leader is a process whose process group ID equals its PID.
+Disallowing a process group leader from calling
+.BR setsid ()
+prevents the possibility that a process group leader places itself
+in a new session while other processes in the process group remain
+in the original session;
+such a scenario would break the strict
+two-level hierarchy of sessions and process groups.
+In order to be sure that
+.BR setsid ()
+will succeed, call
+.BR fork (2)
+and have the parent
+.BR _exit (2),
+while the child (which by definition can't be a process group leader) calls
+.BR setsid ().
+.PP
+If a session has a controlling terminal, and the
+.B CLOCAL
+flag for that terminal is not set,
+and a terminal hangup occurs, then the session leader is sent a
+.B SIGHUP
+signal.
+.PP
+If a process that is a session leader terminates, then a
+.B SIGHUP
+signal is sent to each process in the foreground
+process group of the controlling terminal.
+.SH SEE ALSO
+.BR setsid (1),
+.BR getsid (2),
+.BR setpgid (2),
+.BR setpgrp (2),
+.BR tcgetsid (3),
+.BR credentials (7),
+.BR sched (7)
diff --git a/man2/setsockopt.2 b/man2/setsockopt.2
new file mode 100644
index 0000000..d98c776
--- /dev/null
+++ b/man2/setsockopt.2
@@ -0,0 +1 @@
+.so man2/getsockopt.2
diff --git a/man2/settimeofday.2 b/man2/settimeofday.2
new file mode 100644
index 0000000..2b6eff4
--- /dev/null
+++ b/man2/settimeofday.2
@@ -0,0 +1 @@
+.so man2/gettimeofday.2
diff --git a/man2/setuid.2 b/man2/setuid.2
new file mode 100644
index 0000000..80284d6
--- /dev/null
+++ b/man2/setuid.2
@@ -0,0 +1,156 @@
+.\" Copyright (C), 1994, Graeme W. Wilford (Wilf).
+.\" and Copyright (C) 2010, 2014, 2015, Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Fri Jul 29th 12:56:44 BST 1994 Wilf. <G.Wilford@ee.surrey.ac.uk>
+.\" Changes inspired by patch from Richard Kettlewell
+.\" <richard@greenend.org.uk>, aeb 970616.
+.\" Modified, 27 May 2004, Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Added notes on capability requirements
+.TH setuid 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+setuid \- set user identity
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <unistd.h>
+.PP
+.BI "int setuid(uid_t " uid );
+.fi
+.SH DESCRIPTION
+.BR setuid ()
+sets the effective user ID of the calling process.
+If the calling process is privileged
+(more precisely: if the process has the
+.B CAP_SETUID
+capability in its user namespace),
+the real UID and saved set-user-ID are also set.
+.PP
+Under Linux,
+.BR setuid ()
+is implemented like the POSIX version with the
+.B _POSIX_SAVED_IDS
+feature.
+This allows a set-user-ID (other than root) program to drop all of its user
+privileges, do some unprivileged work, and then reengage the original
+effective user ID in a secure manner.
+.PP
+If the user is root or the program is set-user-ID-root, special care must be
+taken:
+.BR setuid ()
+checks the effective user ID of the caller and if it is
+the superuser, all process-related user IDs are set to
+.IR uid .
+After this has occurred, it is impossible for the program to regain root
+privileges.
+.PP
+Thus, a set-user-ID-root program wishing to temporarily drop root
+privileges, assume the identity of an unprivileged user, and then regain
+root privileges afterward cannot use
+.BR setuid ().
+You can accomplish this with
+.BR seteuid (2).
+.SH RETURN VALUE
+On success, zero is returned.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.PP
+.IR Note :
+there are cases where
+.BR setuid ()
+can fail even when the caller is UID 0;
+it is a grave security error to omit checking for a failure return from
+.BR setuid ().
+.SH ERRORS
+.TP
+.B EAGAIN
+The call would change the caller's real UID (i.e.,
+.I uid
+does not match the caller's real UID),
+but there was a temporary failure allocating the
+necessary kernel data structures.
+.TP
+.B EAGAIN
+.I uid
+does not match the real user ID of the caller and this call would
+bring the number of processes belonging to the real user ID
+.I uid
+over the caller's
+.B RLIMIT_NPROC
+resource limit.
+Since Linux 3.1, this error case no longer occurs
+(but robust applications should check for this error);
+see the description of
+.B EAGAIN
+in
+.BR execve (2).
+.TP
+.B EINVAL
+The user ID specified in
+.I uid
+is not valid in this user namespace.
+.TP
+.B EPERM
+The user is not privileged (Linux: does not have the
+.B CAP_SETUID
+capability in its user namespace) and
+.I uid
+does not match the real UID or saved set-user-ID of the calling process.
+.SH VERSIONS
+.SS C library/kernel differences
+At the kernel level, user IDs and group IDs are a per-thread attribute.
+However, POSIX requires that all threads in a process
+share the same credentials.
+The NPTL threading implementation handles the POSIX requirements by
+providing wrapper functions for
+the various system calls that change process UIDs and GIDs.
+These wrapper functions (including the one for
+.BR setuid ())
+employ a signal-based technique to ensure
+that when one thread changes credentials,
+all of the other threads in the process also change their credentials.
+For details, see
+.BR nptl (7).
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+POSIX.1-2001, SVr4.
+.PP
+Not quite compatible with the 4.4BSD call, which
+sets all of the real, saved, and effective user IDs.
+.\" SVr4 documents an additional EINVAL error condition.
+.PP
+The original Linux
+.BR setuid ()
+system call supported only 16-bit user IDs.
+Subsequently, Linux 2.4 added
+.BR setuid32 ()
+supporting 32-bit IDs.
+The glibc
+.BR setuid ()
+wrapper function transparently deals with the variation across kernel versions.
+.SH NOTES
+Linux has the concept of the filesystem user ID, normally equal to the
+effective user ID.
+The
+.BR setuid ()
+call also sets the filesystem user ID of the calling process.
+See
+.BR setfsuid (2).
+.PP
+If
+.I uid
+is different from the old effective UID, the process will
+be forbidden from leaving core dumps.
+.SH SEE ALSO
+.BR getuid (2),
+.BR seteuid (2),
+.BR setfsuid (2),
+.BR setreuid (2),
+.BR capabilities (7),
+.BR credentials (7),
+.BR user_namespaces (7)
diff --git a/man2/setuid32.2 b/man2/setuid32.2
new file mode 100644
index 0000000..24656c2
--- /dev/null
+++ b/man2/setuid32.2
@@ -0,0 +1 @@
+.so man2/setuid.2
diff --git a/man2/setup.2 b/man2/setup.2
new file mode 100644
index 0000000..61a6002
--- /dev/null
+++ b/man2/setup.2
@@ -0,0 +1,55 @@
+.\" Copyright (c) 1992 Drew Eckhardt (drew@cs.colorado.edu), March 28, 1992
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified by Michael Haardt <michael@moria.de>
+.\" Modified Sun Jul 25 10:14:13 1993 by Rik Faith <faith@cs.unc.edu>
+.\" Modified 15 April 1995 by Michael Chastain <mec@shell.portal.com>
+.\" Update calling parameters to Linux 1.2.4 values.
+.\" Modified 10 June 1995 by Andries Brouwer <aeb@cwi.nl>
+.\" Modified 3 May 1996 by Martin Schulze <joey@infodrom.north.de>
+.\" Modified Wed Nov 6 04:05:28 1996 by Eric S. Raymond <esr@thyrsus.com>
+.\" Modified Sat Jan 29 01:08:23 2000 by aeb
+.\"
+.TH setup 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+setup \- set up devices and filesystems, mount root filesystem
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <unistd.h>
+.PP
+.B [[deprecated]] int setup(void);
+.fi
+.SH DESCRIPTION
+.BR setup ()
+is called once from within
+.IR linux/init/main.c .
+It calls initialization functions for devices and filesystems
+configured into the kernel and then mounts the root filesystem.
+.PP
+No user process may call
+.BR setup ().
+Any user process, even a process with superuser permission,
+will receive
+.BR EPERM .
+.SH RETURN VALUE
+.BR setup ()
+always returns \-1 for a user process.
+.SH ERRORS
+.TP
+.B EPERM
+Always, for a user process.
+.SH STANDARDS
+Linux.
+.SH VERSIONS
+Removed in Linux 2.1.121.
+.PP
+The calling sequence varied: at some times
+.BR setup ()
+has had a single argument
+.I "void\ *BIOS"
+and at other times a single argument
+.IR "int magic" .
diff --git a/man2/setxattr.2 b/man2/setxattr.2
new file mode 100644
index 0000000..cc2a6b0
--- /dev/null
+++ b/man2/setxattr.2
@@ -0,0 +1,159 @@
+.\" Copyright (C) Andreas Gruenbacher, February 2001
+.\" Copyright (C) Silicon Graphics Inc, September 2001
+.\"
+.\" SPDX-License-Identifier: GPL-2.0-or-later
+.\"
+.TH setxattr 2 2023-07-28 "Linux man-pages 6.05.01"
+.SH NAME
+setxattr, lsetxattr, fsetxattr \- set an extended attribute value
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/xattr.h>
+.PP
+.BI "int setxattr(const char *" path ", const char *" name ,
+.BI " const void " value [. size "], size_t " size ", int " flags );
+.BI "int lsetxattr(const char *" path ", const char *" name ,
+.BI " const void " value [. size "], size_t " size ", int " flags );
+.BI "int fsetxattr(int " fd ", const char *" name ,
+.BI " const void " value [. size "], size_t " size ", int " flags );
+.fi
+.SH DESCRIPTION
+Extended attributes are
+.IR name : value
+pairs associated with inodes (files, directories, symbolic links, etc.).
+They are extensions to the normal attributes which are associated
+with all inodes in the system (i.e., the
+.BR stat (2)
+data).
+A complete overview of extended attributes concepts can be found in
+.BR xattr (7).
+.PP
+.BR setxattr ()
+sets the
+.I value
+of the extended attribute identified by
+.I name
+and associated with the given
+.I path
+in the filesystem.
+The
+.I size
+argument specifies the size (in bytes) of
+.IR value ;
+a zero-length value is permitted.
+.PP
+.BR lsetxattr ()
+is identical to
+.BR setxattr (),
+except in the case of a symbolic link, where the extended attribute is
+set on the link itself, not the file that it refers to.
+.PP
+.BR fsetxattr ()
+is identical to
+.BR setxattr (),
+only the extended attribute is set on the open file referred to by
+.I fd
+(as returned by
+.BR open (2))
+in place of
+.IR path .
+.PP
+An extended attribute name is a null-terminated string.
+The
+.I name
+includes a namespace prefix; there may be several, disjoint
+namespaces associated with an individual inode.
+The
+.I value
+of an extended attribute is a chunk of arbitrary textual or
+binary data of specified length.
+.PP
+By default
+(i.e.,
+.I flags
+is zero),
+the extended attribute will be created if it does not exist,
+or the value will be replaced if the attribute already exists.
+To modify these semantics, one of the following values can be specified in
+.IR flags :
+.TP
+.B XATTR_CREATE
+Perform a pure create, which fails if the named attribute exists already.
+.TP
+.B XATTR_REPLACE
+Perform a pure replace operation,
+which fails if the named attribute does not already exist.
+.SH RETURN VALUE
+On success, zero is returned.
+On failure, \-1 is returned and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EDQUOT
+Disk quota limits mean that
+there is insufficient space remaining to store the extended attribute.
+.TP
+.B EEXIST
+.B XATTR_CREATE
+was specified, and the attribute exists already.
+.TP
+.B ENODATA
+.B XATTR_REPLACE
+was specified, and the attribute does not exist.
+.\" .RB ( ENOATTR
+.\" is defined to be a synonym for
+.\" .BR ENODATA
+.\" in
+.\" .IR <attr/attributes.h> .)
+.TP
+.B ENOSPC
+There is insufficient space remaining to store the extended attribute.
+.TP
+.B ENOTSUP
+The namespace prefix of
+.I name
+is not valid.
+.TP
+.B ENOTSUP
+Extended attributes are not supported by the filesystem, or are disabled.
+.TP
+.B EPERM
+The file is marked immutable or append-only.
+(See
+.BR ioctl_iflags (2).)
+.PP
+In addition, the errors documented in
+.BR stat (2)
+can also occur.
+.TP
+.B ERANGE
+The size of
+.I name
+or
+.I value
+exceeds a filesystem-specific limit.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 2.4,
+glibc 2.3.
+.\" .SH AUTHORS
+.\" Andreas Gruenbacher,
+.\" .RI < a.gruenbacher@computer.org >
+.\" and the SGI XFS development team,
+.\" .RI < linux-xfs@oss.sgi.com >.
+.\" Please send any bug reports or comments to these addresses.
+.SH SEE ALSO
+.BR getfattr (1),
+.BR setfattr (1),
+.BR getxattr (2),
+.BR listxattr (2),
+.BR open (2),
+.BR removexattr (2),
+.BR stat (2),
+.BR symlink (7),
+.BR xattr (7)
diff --git a/man2/sgetmask.2 b/man2/sgetmask.2
new file mode 100644
index 0000000..d0c99a2
--- /dev/null
+++ b/man2/sgetmask.2
@@ -0,0 +1,70 @@
+.\" Copyright (c) 2007 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.TH sgetmask 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+sgetmask, ssetmask \- manipulation of signal mask (obsolete)
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */"
+.B #include <unistd.h>
+.PP
+.B [[deprecated]] long syscall(SYS_sgetmask, void);
+.BI "[[deprecated]] long syscall(SYS_ssetmask, long " newmask );
+.fi
+.SH DESCRIPTION
+These system calls are obsolete.
+.IR "Do not use them" ;
+use
+.BR sigprocmask (2)
+instead.
+.PP
+.BR sgetmask ()
+returns the signal mask of the calling process.
+.PP
+.BR ssetmask ()
+sets the signal mask of the calling process to the value given in
+.IR newmask .
+The previous signal mask is returned.
+.PP
+The signal masks dealt with by these two system calls
+are plain bit masks (unlike the
+.I sigset_t
+used by
+.BR sigprocmask (2));
+use
+.BR sigmask (3)
+to create and inspect these masks.
+.SH RETURN VALUE
+.BR sgetmask ()
+always successfully returns the signal mask.
+.BR ssetmask ()
+always succeeds, and returns the previous signal mask.
+.SH ERRORS
+These system calls always succeed.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Since Linux 3.16,
+.\" f6187769dae48234f3877df3c4d99294cc2254fa
+support for these system calls is optional,
+depending on whether the kernel was built with the
+.B CONFIG_SGETMASK_SYSCALL
+option.
+.SH NOTES
+These system calls are unaware of signal numbers greater than 31
+(i.e., real-time signals).
+.PP
+These system calls do not exist on x86-64.
+.PP
+It is not possible to block
+.B SIGSTOP
+or
+.BR SIGKILL .
+.SH SEE ALSO
+.BR sigprocmask (2),
+.BR signal (7)
diff --git a/man2/shmat.2 b/man2/shmat.2
new file mode 100644
index 0000000..3f3e5a4
--- /dev/null
+++ b/man2/shmat.2
@@ -0,0 +1 @@
+.so man2/shmop.2
diff --git a/man2/shmctl.2 b/man2/shmctl.2
new file mode 100644
index 0000000..4bfb2c5
--- /dev/null
+++ b/man2/shmctl.2
@@ -0,0 +1,490 @@
+'\" t
+.\" Copyright (c) 1993 Luigi P. Bai (lpb@softint.com) July 28, 1993
+.\" and Copyright 1993 Giorgio Ciucci <giorgio@crcc.it>
+.\" and Copyright 2004, 2005 Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified 1993-07-28, Rik Faith <faith@cs.unc.edu>
+.\" Modified 1993-11-28, Giorgio Ciucci <giorgio@crcc.it>
+.\" Modified 1997-01-31, Eric S. Raymond <esr@thyrsus.com>
+.\" Modified 2001-02-18, Andries Brouwer <aeb@cwi.nl>
+.\" Modified 2002-01-05, 2004-05-27, 2004-06-17,
+.\" Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Modified 2004-10-11, aeb
+.\" Modified, Nov 2004, Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Language and formatting clean-ups
+.\" Updated shmid_ds structure definitions
+.\" Added information on SHM_DEST and SHM_LOCKED flags
+.\" Noted that CAP_IPC_LOCK is not required for SHM_UNLOCK
+.\" since Linux 2.6.9
+.\" Modified, 2004-11-25, mtk, notes on 2.6.9 RLIMIT_MEMLOCK changes
+.\" 2005-04-25, mtk -- noted aberrant Linux behavior w.r.t. new
+.\" attaches to a segment that has already been marked for deletion.
+.\" 2005-08-02, mtk: Added IPC_INFO, SHM_INFO, SHM_STAT descriptions.
+.\" 2018-03-20, dbueso: Added SHM_STAT_ANY description.
+.\"
+.TH shmctl 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+shmctl \- System V shared memory control
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/shm.h>
+.PP
+.BI "int shmctl(int " shmid ", int " cmd ", struct shmid_ds *" buf );
+.fi
+.SH DESCRIPTION
+.BR shmctl ()
+performs the control operation specified by
+.I cmd
+on the System\ V shared memory segment whose identifier is given in
+.IR shmid .
+.PP
+The
+.I buf
+argument is a pointer to a \fIshmid_ds\fP structure,
+defined in \fI<sys/shm.h>\fP as follows:
+.PP
+.in +4n
+.EX
+struct shmid_ds {
+ struct ipc_perm shm_perm; /* Ownership and permissions */
+ size_t shm_segsz; /* Size of segment (bytes) */
+ time_t shm_atime; /* Last attach time */
+ time_t shm_dtime; /* Last detach time */
+ time_t shm_ctime; /* Creation time/time of last
+ modification via shmctl() */
+ pid_t shm_cpid; /* PID of creator */
+ pid_t shm_lpid; /* PID of last shmat(2)/shmdt(2) */
+ shmatt_t shm_nattch; /* No. of current attaches */
+ ...
+};
+.EE
+.in
+.PP
+The fields of the
+.I shmid_ds
+structure are as follows:
+.TP 12
+.I shm_perm
+This is an
+.I ipc_perm
+structure (see below) that specifies the access permissions
+on the shared memory segment.
+.TP
+.I shm_segsz
+Size in bytes of the shared memory segment.
+.TP
+.I shm_atime
+Time of the last
+.BR shmat (2)
+system call that attached this segment.
+.TP
+.I shm_dtime
+Time of the last
+.BR shmdt (2)
+system call that detached this segment.
+.TP
+.I shm_ctime
+Time of creation of segment or time of the last
+.BR shmctl ()
+.B IPC_SET
+operation.
+.TP
+.I shm_cpid
+ID of the process that created the shared memory segment.
+.TP
+.I shm_lpid
+ID of the last process that executed a
+.BR shmat (2)
+or
+.BR shmdt (2)
+system call on this segment.
+.TP
+.I shm_nattch
+Number of processes that have this segment attached.
+.PP
+The
+.I ipc_perm
+structure is defined as follows
+(the highlighted fields are settable using
+.BR IPC_SET ):
+.PP
+.in +4n
+.EX
+struct ipc_perm {
+ key_t __key; /* Key supplied to shmget(2) */
+ uid_t \fBuid\fP; /* Effective UID of owner */
+ gid_t \fBgid\fP; /* Effective GID of owner */
+ uid_t cuid; /* Effective UID of creator */
+ gid_t cgid; /* Effective GID of creator */
+ unsigned short \fBmode\fP; /* \fBPermissions\fP + SHM_DEST and
+ SHM_LOCKED flags */
+ unsigned short __seq; /* Sequence number */
+};
+.EE
+.in
+.PP
+The least significant 9 bits of the
+.I mode
+field of the
+.I ipc_perm
+structure define the access permissions for the shared memory segment.
+The permission bits are as follows:
+.TS
+l l.
+0400 Read by user
+0200 Write by user
+0040 Read by group
+0020 Write by group
+0004 Read by others
+0002 Write by others
+.TE
+.PP
+Bits 0100, 0010, and 0001 (the execute bits) are unused by the system.
+(It is not necessary to have execute permission on a segment
+in order to perform a
+.BR shmat (2)
+call with the
+.B SHM_EXEC
+flag.)
+.PP
+Valid values for
+.I cmd
+are:
+.TP
+.B IPC_STAT
+Copy information from the kernel data structure associated with
+.I shmid
+into the
+.I shmid_ds
+structure pointed to by \fIbuf\fP.
+The caller must have read permission on the
+shared memory segment.
+.TP
+.B IPC_SET
+Write the values of some members of the
+.I shmid_ds
+structure pointed to by
+.I buf
+to the kernel data structure associated with this shared memory segment,
+updating also its
+.I shm_ctime
+member.
+.IP
+The following fields are updated:
+\fIshm_perm.uid\fP, \fIshm_perm.gid\fP,
+and (the least significant 9 bits of) \fIshm_perm.mode\fP.
+.IP
+The effective UID of the calling process must match the owner
+.RI ( shm_perm.uid )
+or creator
+.RI ( shm_perm.cuid )
+of the shared memory segment, or the caller must be privileged.
+.TP
+.B IPC_RMID
+Mark the segment to be destroyed.
+The segment will actually be destroyed
+only after the last process detaches it (i.e., when the
+.I shm_nattch
+member of the associated structure
+.I shmid_ds
+is zero).
+The caller must be the owner or creator of the segment, or be privileged.
+The
+.I buf
+argument is ignored.
+.IP
+If a segment has been marked for destruction, then the (nonstandard)
+.B SHM_DEST
+flag of the
+.I shm_perm.mode
+field in the associated data structure retrieved by
+.B IPC_STAT
+will be set.
+.IP
+The caller \fImust\fP ensure that a segment is eventually destroyed;
+otherwise its pages that were faulted in will remain in memory or swap.
+.IP
+See also the description of
+.I /proc/sys/kernel/shm_rmid_forced
+in
+.BR proc (5).
+.TP
+.BR IPC_INFO " (Linux-specific)"
+Return information about system-wide shared memory limits and
+parameters in the structure pointed to by
+.IR buf .
+This structure is of type
+.I shminfo
+(thus, a cast is required),
+defined in
+.I <sys/shm.h>
+if the
+.B _GNU_SOURCE
+feature test macro is defined:
+.IP
+.in +4n
+.EX
+struct shminfo {
+ unsigned long shmmax; /* Maximum segment size */
+ unsigned long shmmin; /* Minimum segment size;
+ always 1 */
+ unsigned long shmmni; /* Maximum number of segments */
+ unsigned long shmseg; /* Maximum number of segments
+ that a process can attach;
+ unused within kernel */
+ unsigned long shmall; /* Maximum number of pages of
+ shared memory, system\-wide */
+};
+.EE
+.in
+.IP
+The
+.IR shmmni ,
+.IR shmmax ,
+and
+.I shmall
+settings can be changed via
+.I /proc
+files of the same name; see
+.BR proc (5)
+for details.
+.TP
+.BR SHM_INFO " (Linux-specific)"
+Return a
+.I shm_info
+structure whose fields contain information
+about system resources consumed by shared memory.
+This structure is defined in
+.I <sys/shm.h>
+if the
+.B _GNU_SOURCE
+feature test macro is defined:
+.IP
+.in +4n
+.EX
+struct shm_info {
+ int used_ids; /* # of currently existing
+ segments */
+ unsigned long shm_tot; /* Total number of shared
+ memory pages */
+ unsigned long shm_rss; /* # of resident shared
+ memory pages */
+ unsigned long shm_swp; /* # of swapped shared
+ memory pages */
+ unsigned long swap_attempts;
+ /* Unused since Linux 2.4 */
+ unsigned long swap_successes;
+ /* Unused since Linux 2.4 */
+};
+.EE
+.in
+.TP
+.BR SHM_STAT " (Linux-specific)"
+Return a
+.I shmid_ds
+structure as for
+.BR IPC_STAT .
+However, the
+.I shmid
+argument is not a segment identifier, but instead an index into
+the kernel's internal array that maintains information about
+all shared memory segments on the system.
+.TP
+.BR SHM_STAT_ANY " (Linux-specific, since Linux 4.17)"
+Return a
+.I shmid_ds
+structure as for
+.BR SHM_STAT .
+However,
+.I shm_perm.mode
+is not checked for read access for
+.IR shmid ,
+meaning that any user can employ this operation (just as any user may read
+.I /proc/sysvipc/shm
+to obtain the same information).
+.PP
+The caller can prevent or allow swapping of a shared
+memory segment with the following \fIcmd\fP values:
+.TP
+.BR SHM_LOCK " (Linux-specific)"
+Prevent swapping of the shared memory segment.
+The caller must fault in
+any pages that are required to be present after locking is enabled.
+If a segment has been locked, then the (nonstandard)
+.B SHM_LOCKED
+flag of the
+.I shm_perm.mode
+field in the associated data structure retrieved by
+.B IPC_STAT
+will be set.
+.TP
+.BR SHM_UNLOCK " (Linux-specific)"
+Unlock the segment, allowing it to be swapped out.
+.PP
+Before Linux 2.6.10, only a privileged process
+could employ
+.B SHM_LOCK
+and
+.BR SHM_UNLOCK .
+Since Linux 2.6.10, an unprivileged process can employ these operations
+if its effective UID matches the owner or creator UID of the segment, and
+(for
+.BR SHM_LOCK )
+the amount of memory to be locked falls within the
+.B RLIMIT_MEMLOCK
+resource limit (see
+.BR setrlimit (2)).
+.\" There was some weirdness in Linux 2.6.9: SHM_LOCK and SHM_UNLOCK could
+.\" be applied to a segment, regardless of ownership of the segment.
+.\" This was a botch-up in the move to RLIMIT_MEMLOCK, and was fixed
+.\" in Linux 2.6.10. MTK, May 2005
+.SH RETURN VALUE
+A successful
+.B IPC_INFO
+or
+.B SHM_INFO
+operation returns the index of the highest used entry in the
+kernel's internal array recording information about all
+shared memory segments.
+(This information can be used with repeated
+.B SHM_STAT
+or
+.B SHM_STAT_ANY
+operations to obtain information about all shared memory segments
+on the system.)
+A successful
+.B SHM_STAT
+operation returns the identifier of the shared memory segment
+whose index was given in
+.IR shmid .
+Other operations return 0 on success.
+.PP
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EACCES
+\fBIPC_STAT\fP or \fBSHM_STAT\fP is requested and
+\fIshm_perm.mode\fP does not allow read access for
+.IR shmid ,
+and the calling process does not have the
+.B CAP_IPC_OWNER
+capability in the user namespace that governs its IPC namespace.
+.TP
+.B EFAULT
+The argument
+.I cmd
+has value
+.B IPC_SET
+or
+.B IPC_STAT
+but the address pointed to by
+.I buf
+isn't accessible.
+.TP
+.B EIDRM
+\fIshmid\fP points to a removed identifier.
+.TP
+.B EINVAL
+\fIshmid\fP is not a valid identifier, or \fIcmd\fP
+is not a valid command.
+Or: for a
+.B SHM_STAT
+or
+.B SHM_STAT_ANY
+operation, the index value specified in
+.I shmid
+referred to an array slot that is currently unused.
+.TP
+.B ENOMEM
+(Since Linux 2.6.9),
+.B SHM_LOCK
+was specified and the size of the to-be-locked segment would mean
+that the total bytes in locked shared memory segments would exceed
+the limit for the real user ID of the calling process.
+This limit is defined by the
+.B RLIMIT_MEMLOCK
+soft resource limit (see
+.BR setrlimit (2)).
+.TP
+.B EOVERFLOW
+\fBIPC_STAT\fP is attempted, and the GID or UID value
+is too large to be stored in the structure pointed to by
+.IR buf .
+.TP
+.B EPERM
+\fBIPC_SET\fP or \fBIPC_RMID\fP is attempted, and the
+effective user ID of the calling process is not that of the creator
+(found in
+.IR shm_perm.cuid ),
+or the owner
+(found in
+.IR shm_perm.uid ),
+and the process was not privileged (Linux: did not have the
+.B CAP_SYS_ADMIN
+capability).
+.IP
+Or (before Linux 2.6.9),
+.B SHM_LOCK
+or
+.B SHM_UNLOCK
+was specified, but the process was not privileged
+(Linux: did not have the
+.B CAP_IPC_LOCK
+capability).
+(Since Linux 2.6.9, this error can also occur if the
+.B RLIMIT_MEMLOCK
+is 0 and the caller is not privileged.)
+.SH VERSIONS
+Linux permits a process to attach
+.RB ( shmat (2))
+a shared memory segment that has already been marked for deletion
+using
+.IR shmctl(IPC_RMID) .
+This feature is not available on other UNIX implementations;
+portable applications should avoid relying on it.
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+POSIX.1-2001, SVr4.
+.\" SVr4 documents additional error conditions EINVAL,
+.\" ENOENT, ENOSPC, ENOMEM, EEXIST. Neither SVr4 nor SVID documents
+.\" an EIDRM error condition.
+.PP
+Various fields in a \fIstruct shmid_ds\fP were typed as
+.I short
+under Linux 2.2
+and have become
+.I long
+under Linux 2.4.
+To take advantage of this,
+a recompilation under glibc-2.1.91 or later should suffice.
+(The kernel distinguishes old and new calls by an
+.B IPC_64
+flag in
+.IR cmd .)
+.SH NOTES
+The
+.BR IPC_INFO ,
+.BR SHM_STAT ,
+and
+.B SHM_INFO
+operations are used by the
+.BR ipcs (1)
+program to provide information on allocated resources.
+In the future, these may be modified or moved to a
+.I /proc
+filesystem interface.
+.SH SEE ALSO
+.BR mlock (2),
+.BR setrlimit (2),
+.BR shmget (2),
+.BR shmop (2),
+.BR capabilities (7),
+.BR sysvipc (7)
diff --git a/man2/shmdt.2 b/man2/shmdt.2
new file mode 100644
index 0000000..3f3e5a4
--- /dev/null
+++ b/man2/shmdt.2
@@ -0,0 +1 @@
+.so man2/shmop.2
diff --git a/man2/shmget.2 b/man2/shmget.2
new file mode 100644
index 0000000..074e83e
--- /dev/null
+++ b/man2/shmget.2
@@ -0,0 +1,410 @@
+.\" Copyright (c) 1993 Luigi P. Bai (lpb@softint.com) July 28, 1993
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified Wed Jul 28 10:57:35 1993, Rik Faith <faith@cs.unc.edu>
+.\" Modified Sun Nov 28 16:43:30 1993, Rik Faith <faith@cs.unc.edu>
+.\" with material from Giorgio Ciucci <giorgio@crcc.it>
+.\" Portions Copyright 1993 Giorgio Ciucci <giorgio@crcc.it>
+.\" Modified Tue Oct 22 22:03:17 1996 by Eric S. Raymond <esr@thyrsus.com>
+.\" Modified, 8 Jan 2003, Michael Kerrisk, <mtk.manpages@gmail.com>
+.\" Removed EIDRM from errors - that can't happen...
+.\" Modified, 27 May 2004, Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Added notes on capability requirements
+.\" Modified, 11 Nov 2004, Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Language and formatting clean-ups
+.\" Added notes on /proc files
+.\"
+.TH shmget 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+shmget \- allocates a System V shared memory segment
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/shm.h>
+.PP
+.BI "int shmget(key_t " key ", size_t " size ", int " shmflg );
+.fi
+.SH DESCRIPTION
+.BR shmget ()
+returns the identifier of the System\ V shared memory segment
+associated with the value of the argument
+.IR key .
+It may be used either to obtain the identifier of a previously created
+shared memory segment (when
+.I shmflg
+is zero and
+.I key
+does not have the value
+.BR IPC_PRIVATE ),
+or to create a new segment.
+.PP
+A new shared memory segment, with size equal to the value of
+.I size
+rounded up to a multiple of
+.BR PAGE_SIZE ,
+is created if
+.I key
+has the value
+.B IPC_PRIVATE
+or
+.I key
+isn't
+.BR IPC_PRIVATE ,
+no shared memory segment corresponding to
+.I key
+exists, and
+.B IPC_CREAT
+is specified in
+.IR shmflg .
+.PP
+If
+.I shmflg
+specifies both
+.B IPC_CREAT
+and
+.B IPC_EXCL
+and a shared memory segment already exists for
+.IR key ,
+then
+.BR shmget ()
+fails with
+.I errno
+set to
+.BR EEXIST .
+(This is analogous to the effect of the combination
+.B O_CREAT | O_EXCL
+for
+.BR open (2).)
+.PP
+The value
+.I shmflg
+is composed of:
+.TP
+.B IPC_CREAT
+Create a new segment.
+If this flag is not used, then
+.BR shmget ()
+will find the segment associated with \fIkey\fP and check to see if
+the user has permission to access the segment.
+.TP
+.B IPC_EXCL
+This flag is used with
+.B IPC_CREAT
+to ensure that this call creates the segment.
+If the segment already exists, the call fails.
+.TP
+.BR SHM_HUGETLB " (since Linux 2.6)"
+Allocate the segment using "huge" pages.
+See the Linux kernel source file
+.I Documentation/admin\-guide/mm/hugetlbpage.rst
+for further information.
+.TP
+.BR SHM_HUGE_2MB ", " SHM_HUGE_1GB " (since Linux 3.8)"
+.\" See https://lwn.net/Articles/533499/
+Used in conjunction with
+.B SHM_HUGETLB
+to select alternative hugetlb page sizes (respectively, 2\ MB and 1\ GB)
+on systems that support multiple hugetlb page sizes.
+.IP
+More generally, the desired huge page size can be configured by encoding
+the base-2 logarithm of the desired page size in the six bits at the offset
+.BR SHM_HUGE_SHIFT .
+Thus, the above two constants are defined as:
+.IP
+.in +4n
+.EX
+#define SHM_HUGE_2MB (21 << SHM_HUGE_SHIFT)
+#define SHM_HUGE_1GB (30 << SHM_HUGE_SHIFT)
+.EE
+.in
+.IP
+For some additional details,
+see the discussion of the similarly named constants in
+.BR mmap (2).
+.TP
+.BR SHM_NORESERVE " (since Linux 2.6.15)"
+This flag serves the same purpose as the
+.BR mmap (2)
+.B MAP_NORESERVE
+flag.
+Do not reserve swap space for this segment.
+When swap space is reserved, one has the guarantee
+that it is possible to modify the segment.
+When swap space is not reserved one might get
+.B SIGSEGV
+upon a write
+if no physical memory is available.
+See also the discussion of the file
+.I /proc/sys/vm/overcommit_memory
+in
+.BR proc (5).
+.\" As at 2.6.17-rc2, this flag has no effect if SHM_HUGETLB was also
+.\" specified.
+.PP
+In addition to the above flags, the least significant 9 bits of
+.I shmflg
+specify the permissions granted to the owner, group, and others.
+These bits have the same format, and the same
+meaning, as the
+.I mode
+argument of
+.BR open (2).
+Presently, execute permissions are not used by the system.
+.PP
+When a new shared memory segment is created,
+its contents are initialized to zero values, and
+its associated data structure,
+.I shmid_ds
+(see
+.BR shmctl (2)),
+is initialized as follows:
+.IP \[bu] 3
+.I shm_perm.cuid
+and
+.I shm_perm.uid
+are set to the effective user ID of the calling process.
+.IP \[bu]
+.I shm_perm.cgid
+and
+.I shm_perm.gid
+are set to the effective group ID of the calling process.
+.IP \[bu]
+The least significant 9 bits of
+.I shm_perm.mode
+are set to the least significant 9 bits of
+.IR shmflg .
+.IP \[bu]
+.I shm_segsz
+is set to the value of
+.IR size .
+.IP \[bu]
+.IR shm_lpid ,
+.IR shm_nattch ,
+.IR shm_atime ,
+and
+.I shm_dtime
+are set to 0.
+.IP \[bu]
+.I shm_ctime
+is set to the current time.
+.PP
+If the shared memory segment already exists, the permissions are
+verified, and a check is made to see if it is marked for destruction.
+.SH RETURN VALUE
+On success, a valid shared memory identifier is returned.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EACCES
+The user does not have permission to access the
+shared memory segment, and does not have the
+.B CAP_IPC_OWNER
+capability in the user namespace that governs its IPC namespace.
+.TP
+.B EEXIST
+.B IPC_CREAT
+and
+.B IPC_EXCL
+were specified in
+.IR shmflg ,
+but a shared memory segment already exists for
+.IR key .
+.TP
+.B EINVAL
+A new segment was to be created and
+.I size
+is less than
+.B SHMMIN
+or greater than
+.BR SHMMAX .
+.TP
+.B EINVAL
+A segment for the given
+.I key
+exists, but \fIsize\fP is greater than the size
+of that segment.
+.TP
+.B ENFILE
+.\" [2.6.7] shmem_zero_setup()-->shmem_file_setup()-->get_empty_filp()
+The system-wide limit on the total number of open files has been reached.
+.TP
+.B ENOENT
+No segment exists for the given \fIkey\fP, and
+.B IPC_CREAT
+was not specified.
+.TP
+.B ENOMEM
+No memory could be allocated for segment overhead.
+.TP
+.B ENOSPC
+All possible shared memory IDs have been taken
+.RB ( SHMMNI ),
+or allocating a segment of the requested
+.I size
+would cause the system to exceed the system-wide limit on shared memory
+.RB ( SHMALL ).
+.TP
+.B EPERM
+The
+.B SHM_HUGETLB
+flag was specified, but the caller was not privileged (did not have the
+.B CAP_IPC_LOCK
+capability)
+and is not a member of the
+.I sysctl_hugetlb_shm_group
+group; see the description of
+.I /proc/sys/vm/sysctl_hugetlb_shm_group
+in
+.BR proc (5).
+.SH STANDARDS
+POSIX.1-2008.
+.PP
+.B SHM_HUGETLB
+and
+.B SHM_NORESERVE
+are Linux extensions.
+.SH HISTORY
+POSIX.1-2001, SVr4.
+.\" SVr4 documents an additional error condition EEXIST.
+.SH NOTES
+.B IPC_PRIVATE
+isn't a flag field but a
+.I key_t
+type.
+If this special value is used for
+.IR key ,
+the system call ignores all but the least significant 9 bits of
+.I shmflg
+and creates a new shared memory segment.
+.\"
+.SS Shared memory limits
+The following limits on shared memory segment resources affect the
+.BR shmget ()
+call:
+.TP
+.B SHMALL
+System-wide limit on the total amount of shared memory,
+measured in units of the system page size.
+.IP
+On Linux, this limit can be read and modified via
+.IR /proc/sys/kernel/shmall .
+Since Linux 3.16,
+.\" commit 060028bac94bf60a65415d1d55a359c3a17d5c31
+the default value for this limit is:
+.IP
+.in +4n
+.EX
+ULONG_MAX - 2\[ha]24
+.EE
+.in
+.IP
+The effect of this value
+(which is suitable for both 32-bit and 64-bit systems)
+is to impose no limitation on allocations.
+This value, rather than
+.BR ULONG_MAX ,
+was chosen as the default to prevent some cases where historical
+applications simply raised the existing limit without first checking
+its current value.
+Such applications would cause the value to overflow if the limit was set at
+.BR ULONG_MAX .
+.IP
+From Linux 2.4 up to Linux 3.15,
+the default value for this limit was:
+.IP
+.in +4n
+.EX
+SHMMAX / PAGE_SIZE * (SHMMNI / 16)
+.EE
+.in
+.IP
+If
+.B SHMMAX
+and
+.B SHMMNI
+were not modified, then multiplying the result of this formula
+by the page size (to get a value in bytes) yielded a value of 8\ GB
+as the limit on the total memory used by all shared memory segments.
+.TP
+.B SHMMAX
+Maximum size in bytes for a shared memory segment.
+.IP
+On Linux, this limit can be read and modified via
+.IR /proc/sys/kernel/shmmax .
+Since Linux 3.16,
+.\" commit 060028bac94bf60a65415d1d55a359c3a17d5c31
+the default value for this limit is:
+.IP
+.in +4n
+.EX
+ULONG_MAX - 2\[ha]24
+.EE
+.in
+.IP
+The effect of this value
+(which is suitable for both 32-bit and 64-bit systems)
+is to impose no limitation on allocations.
+See the description of
+.B SHMALL
+for a discussion of why this default value (rather than
+.BR ULONG_MAX )
+is used.
+.IP
+From Linux 2.2 up to Linux 3.15, the default value of
+this limit was 0x2000000 (32\ MiB).
+.IP
+Because it is not possible to map just part of a shared memory segment,
+the amount of virtual memory places another limit on the maximum size of a
+usable segment:
+for example, on i386 the largest segments that can be mapped have a
+size of around 2.8\ GB, and on x86-64 the limit is around 127 TB.
+.TP
+.B SHMMIN
+Minimum size in bytes for a shared memory segment: implementation
+dependent (currently 1 byte, though
+.B PAGE_SIZE
+is the effective minimum size).
+.TP
+.B SHMMNI
+System-wide limit on the number of shared memory segments.
+In Linux 2.2, the default value for this limit was 128;
+since Linux 2.4, the default value is 4096.
+.IP
+On Linux, this limit can be read and modified via
+.IR /proc/sys/kernel/shmmni .
+.\" Kernels between Linux 2.4.x and Linux 2.6.8 had an off-by-one error
+.\" that meant that we could create one more segment than SHMMNI -- MTK
+.\" This /proc file is not available in Linux 2.2 and earlier -- MTK
+.PP
+The implementation has no specific limits for the per-process maximum
+number of shared memory segments
+.RB ( SHMSEG ).
+.SS Linux notes
+Until Linux 2.3.30, Linux would return
+.B EIDRM
+for a
+.BR shmget ()
+on a shared memory segment scheduled for deletion.
+.SH BUGS
+The name choice
+.B IPC_PRIVATE
+was perhaps unfortunate;
+.B IPC_NEW
+would more clearly show its function.
+.SH EXAMPLES
+See
+.BR shmop (2).
+.SH SEE ALSO
+.BR memfd_create (2),
+.BR shmat (2),
+.BR shmctl (2),
+.BR shmdt (2),
+.BR ftok (3),
+.BR capabilities (7),
+.BR shm_overview (7),
+.BR sysvipc (7)
diff --git a/man2/shmop.2 b/man2/shmop.2
new file mode 100644
index 0000000..09168ec
--- /dev/null
+++ b/man2/shmop.2
@@ -0,0 +1,507 @@
+.\" Copyright 1993 Giorgio Ciucci (giorgio@crcc.it)
+.\" and Copyright 2020 Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified Sun Nov 28 17:06:19 1993, Rik Faith (faith@cs.unc.edu)
+.\" with material from Luigi P. Bai (lpb@softint.com)
+.\" Portions Copyright 1993 Luigi P. Bai
+.\" Modified Tue Oct 22 22:04:23 1996 by Eric S. Raymond <esr@thyrsus.com>
+.\" Modified, 5 Jan 2002, Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Modified, 19 Sep 2002, Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Added SHM_REMAP flag description
+.\" Modified, 27 May 2004, Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Added notes on capability requirements
+.\" Modified, 11 Nov 2004, Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Language and formatting clean-ups
+.\" Changed wording and placement of sentence regarding attachment
+.\" of segments marked for destruction
+.\"
+.TH SHMOP 2 2023-05-03 "Linux man-pages 6.05.01"
+.SH NAME
+shmat, shmdt \- System V shared memory operations
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/shm.h>
+.PP
+.BI "void *shmat(int " shmid ", const void *_Nullable " shmaddr ", \
+int " shmflg );
+.BI "int shmdt(const void *" shmaddr );
+.fi
+.SH DESCRIPTION
+.SS shmat()
+.BR shmat ()
+attaches the System\ V shared memory segment identified by
+.I shmid
+to the address space of the calling process.
+The attaching address is specified by
+.I shmaddr
+with one of the following criteria:
+.IP \[bu] 3
+If
+.I shmaddr
+is NULL,
+the system chooses a suitable (unused) page-aligned address to attach
+the segment.
+.IP \[bu]
+If
+.I shmaddr
+isn't NULL
+and
+.B SHM_RND
+is specified in
+.IR shmflg ,
+the attach occurs at the address equal to
+.I shmaddr
+rounded down to the nearest multiple of
+.BR SHMLBA .
+.IP \[bu]
+Otherwise,
+.I shmaddr
+must be a page-aligned address at which the attach occurs.
+.PP
+In addition to
+.BR SHM_RND ,
+the following flags may be specified in the
+.I shmflg
+bit-mask argument:
+.TP
+.BR SHM_EXEC " (Linux-specific; since Linux 2.6.9)"
+Allow the contents of the segment to be executed.
+The caller must have execute permission on the segment.
+.TP
+.B SHM_RDONLY
+Attach the segment for read-only access.
+The process must have read permission for the segment.
+If this flag is not specified,
+the segment is attached for read and write access,
+and the process must have read and write permission for the segment.
+There is no notion of a write-only shared memory segment.
+.TP
+.BR SHM_REMAP " (Linux-specific)"
+This flag specifies
+that the mapping of the segment should replace
+any existing mapping in the range starting at
+.I shmaddr
+and continuing for the size of the segment.
+(Normally, an
+.B EINVAL
+error would result if a mapping already exists in this address range.)
+In this case,
+.I shmaddr
+must not be NULL.
+.PP
+The
+.BR brk (2)
+value of the calling process is not altered by the attach.
+The segment will automatically be detached at process exit.
+The same segment may be attached as a read-only and as a read-write
+one, and more than once, in the process's address space.
+.PP
+A successful
+.BR shmat ()
+call updates the members of the
+.I shmid_ds
+structure (see
+.BR shmctl (2))
+associated with the shared memory segment as follows:
+.IP \[bu] 3
+.I shm_atime
+is set to the current time.
+.IP \[bu]
+.I shm_lpid
+is set to the process-ID of the calling process.
+.IP \[bu]
+.I shm_nattch
+is incremented by one.
+.\"
+.SS shmdt()
+.BR shmdt ()
+detaches the shared memory segment located at the address specified by
+.I shmaddr
+from the address space of the calling process.
+The to-be-detached segment must be currently
+attached with
+.I shmaddr
+equal to the value returned by the attaching
+.BR shmat ()
+call.
+.PP
+On a successful
+.BR shmdt ()
+call, the system updates the members of the
+.I shmid_ds
+structure associated with the shared memory segment as follows:
+.IP \[bu] 3
+.I shm_dtime
+is set to the current time.
+.IP \[bu]
+.I shm_lpid
+is set to the process-ID of the calling process.
+.IP \[bu]
+.I shm_nattch
+is decremented by one.
+If it becomes 0 and the segment is marked for deletion,
+the segment is deleted.
+.SH RETURN VALUE
+On success,
+.BR shmat ()
+returns the address of the attached shared memory segment; on error,
+.I (void\ *)\ \-1
+is returned, and
+.I errno
+is set to indicate the error.
+.PP
+On success,
+.BR shmdt ()
+returns 0; on error \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.BR shmat ()
+can fail with one of the following errors:
+.TP
+.B EACCES
+The calling process does not have the required permissions for
+the requested attach type, and does not have the
+.B CAP_IPC_OWNER
+capability in the user namespace that governs its IPC namespace.
+.TP
+.B EIDRM
+\fIshmid\fP points to a removed identifier.
+.TP
+.B EINVAL
+Invalid
+.I shmid
+value, unaligned (i.e., not page-aligned and \fBSHM_RND\fP was not
+specified) or invalid
+.I shmaddr
+value, or can't attach segment at
+.IR shmaddr ,
+or
+.B SHM_REMAP
+was specified and
+.I shmaddr
+was NULL.
+.TP
+.B ENOMEM
+Could not allocate memory for the descriptor or for the page tables.
+.PP
+.BR shmdt ()
+can fail with one of the following errors:
+.TP
+.B EINVAL
+There is no shared memory segment attached at
+.IR shmaddr ;
+or,
+.\" The following since Linux 2.6.17-rc1:
+.I shmaddr
+is not aligned on a page boundary.
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+POSIX.1-2001, SVr4.
+.\" SVr4 documents an additional error condition EMFILE.
+.PP
+In SVID 3 (or perhaps earlier),
+the type of the \fIshmaddr\fP argument was changed from
+.I "char\ *"
+into
+.IR "const void\ *" ,
+and the returned type of
+.BR shmat ()
+from
+.I "char\ *"
+into
+.IR "void\ *" .
+.SH NOTES
+After a
+.BR fork (2),
+the child inherits the attached shared memory segments.
+.PP
+After an
+.BR execve (2),
+all attached shared memory segments are detached from the process.
+.PP
+Upon
+.BR _exit (2),
+all attached shared memory segments are detached from the process.
+.PP
+Using
+.BR shmat ()
+with
+.I shmaddr
+equal to NULL
+is the preferred, portable way of attaching a shared memory segment.
+Be aware that the shared memory segment attached in this way
+may be attached at different addresses in different processes.
+Therefore, any pointers maintained within the shared memory must be
+made relative (typically to the starting address of the segment),
+rather than absolute.
+.PP
+On Linux, it is possible to attach a shared memory segment even if it
+is already marked to be deleted.
+However, POSIX.1 does not specify this behavior and
+many other implementations do not support it.
+.PP
+The following system parameter affects
+.BR shmat ():
+.TP
+.B SHMLBA
+Segment low boundary address multiple.
+When explicitly specifying an attach address in a call to
+.BR shmat (),
+the caller should ensure that the address is a multiple of this value.
+This is necessary on some architectures,
+in order either to ensure good CPU cache performance or to ensure that
+different attaches of the same segment have consistent views
+within the CPU cache.
+.B SHMLBA
+is normally some multiple of the system page size.
+(On many Linux architectures,
+.B SHMLBA
+is the same as the system page size.)
+.PP
+The implementation places no intrinsic per-process limit on the
+number of shared memory segments
+.RB ( SHMSEG ).
+.SH EXAMPLES
+The two programs shown below exchange a string using a shared memory segment.
+Further details about the programs are given below.
+First, we show a shell session demonstrating their use.
+.PP
+In one terminal window, we run the "reader" program,
+which creates a System V shared memory segment and a System V semaphore set.
+The program prints out the IDs of the created objects,
+and then waits for the semaphore to change value.
+.PP
+.in +4n
+.EX
+$ \fB./svshm_string_read\fP
+shmid = 1114194; semid = 15
+.EE
+.in
+.PP
+In another terminal window, we run the "writer" program.
+The "writer" program takes three command-line arguments:
+the IDs of the shared memory segment and semaphore set created
+by the "reader", and a string.
+It attaches the existing shared memory segment,
+copies the string to the shared memory, and modifies the semaphore value.
+.PP
+.in +4n
+.EX
+$ \fB./svshm_string_write 1114194 15 \[aq]Hello, world\[aq]\fP
+.EE
+.in
+.PP
+Returning to the terminal where the "reader" is running,
+we see that the program has ceased waiting on the semaphore
+and has printed the string that was copied into the
+shared memory segment by the writer:
+.PP
+.in +4n
+.EX
+Hello, world
+.EE
+.in
+.\"
+.SS Program source: svshm_string.h
+The following header file is included by the "reader" and "writer" programs:
+.PP
+.in +4n
+.\" SRC BEGIN (svshm_string.h)
+.EX
+/* svshm_string.h
+\&
+ Licensed under GNU General Public License v2 or later.
+*/
+#include <sys/types.h>
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#include <sys/sem.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+\&
+#define errExit(msg) do { perror(msg); exit(EXIT_FAILURE); \e
+ } while (0)
+\&
+union semun { /* Used in calls to semctl() */
+ int val;
+ struct semid_ds * buf;
+ unsigned short * array;
+#if defined(__linux__)
+ struct seminfo * __buf;
+#endif
+};
+\&
+#define MEM_SIZE 4096
+.EE
+.\" SRC END
+.in
+.\"
+.SS Program source: svshm_string_read.c
+The "reader" program creates a shared memory segment and a semaphore set
+containing one semaphore.
+It then attaches the shared memory object into its address space
+and initializes the semaphore value to 1.
+Finally, the program waits for the semaphore value to become 0,
+and afterwards prints the string that has been copied into the
+shared memory segment by the "writer".
+.PP
+.in +4n
+.\" SRC BEGIN (svshm_string_read.c)
+.EX
+/* svshm_string_read.c
+\&
+ Licensed under GNU General Public License v2 or later.
+*/
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/ipc.h>
+#include <sys/sem.h>
+#include <sys/shm.h>
+\&
+#include "svshm_string.h"
+\&
+int
+main(void)
+{
+ int semid, shmid;
+ char *addr;
+ union semun arg, dummy;
+ struct sembuf sop;
+\&
+ /* Create shared memory and semaphore set containing one
+ semaphore. */
+\&
+ shmid = shmget(IPC_PRIVATE, MEM_SIZE, IPC_CREAT | 0600);
+ if (shmid == \-1)
+ errExit("shmget");
+\&
+ semid = semget(IPC_PRIVATE, 1, IPC_CREAT | 0600);
+ if (semid == \-1)
+ errExit("semget");
+\&
+ /* Attach shared memory into our address space. */
+\&
+ addr = shmat(shmid, NULL, SHM_RDONLY);
+ if (addr == (void *) \-1)
+ errExit("shmat");
+\&
+ /* Initialize semaphore 0 in set with value 1. */
+\&
+ arg.val = 1;
+ if (semctl(semid, 0, SETVAL, arg) == \-1)
+ errExit("semctl");
+\&
+ printf("shmid = %d; semid = %d\en", shmid, semid);
+\&
+ /* Wait for semaphore value to become 0. */
+\&
+ sop.sem_num = 0;
+ sop.sem_op = 0;
+ sop.sem_flg = 0;
+\&
+ if (semop(semid, &sop, 1) == \-1)
+ errExit("semop");
+\&
+ /* Print the string from shared memory. */
+\&
+ printf("%s\en", addr);
+\&
+ /* Remove shared memory and semaphore set. */
+\&
+ if (shmctl(shmid, IPC_RMID, NULL) == \-1)
+ errExit("shmctl");
+ if (semctl(semid, 0, IPC_RMID, dummy) == \-1)
+ errExit("semctl");
+\&
+ exit(EXIT_SUCCESS);
+}
+.EE
+.\" SRC END
+.in
+.\"
+.SS Program source: svshm_string_write.c
+The writer program takes three command-line arguments:
+the IDs of the shared memory segment and semaphore set
+that have already been created by the "reader", and a string.
+It attaches the shared memory segment into its address space,
+copies the string into the segment, and then decrements the semaphore value
+to 0 to inform the "reader" that it can now examine the shared memory contents.
+.PP
+.in +4n
+.\" SRC BEGIN (svshm_string_write.c)
+.EX
+/* svshm_string_write.c
+\&
+ Licensed under GNU General Public License v2 or later.
+*/
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/sem.h>
+#include <sys/shm.h>
+\&
+#include "svshm_string.h"
+\&
+int
+main(int argc, char *argv[])
+{
+ int semid, shmid;
+ char *addr;
+ size_t len;
+ struct sembuf sop;
+\&
+ if (argc != 4) {
+ fprintf(stderr, "Usage: %s shmid semid string\en", argv[0]);
+ exit(EXIT_FAILURE);
+ }
+\&
+ len = strlen(argv[3]) + 1; /* +1 to include trailing \[aq]\e0\[aq] */
+ if (len > MEM_SIZE) {
+ fprintf(stderr, "String is too big!\en");
+ exit(EXIT_FAILURE);
+ }
+\&
+ /* Get object IDs from command\-line. */
+\&
+ shmid = atoi(argv[1]);
+ semid = atoi(argv[2]);
+\&
+ /* Attach shared memory into our address space and copy string
+ (including trailing null byte) into memory. */
+\&
+ addr = shmat(shmid, NULL, 0);
+ if (addr == (void *) \-1)
+ errExit("shmat");
+\&
+ memcpy(addr, argv[3], len);
+\&
+ /* Decrement semaphore to 0. */
+\&
+ sop.sem_num = 0;
+ sop.sem_op = \-1;
+ sop.sem_flg = 0;
+\&
+ if (semop(semid, &sop, 1) == \-1)
+ errExit("semop");
+\&
+ exit(EXIT_SUCCESS);
+}
+.EE
+.\" SRC END
+.in
+.SH SEE ALSO
+.BR brk (2),
+.BR mmap (2),
+.BR shmctl (2),
+.BR shmget (2),
+.BR capabilities (7),
+.BR shm_overview (7),
+.BR sysvipc (7)
diff --git a/man2/shutdown.2 b/man2/shutdown.2
new file mode 100644
index 0000000..d9cbcc1
--- /dev/null
+++ b/man2/shutdown.2
@@ -0,0 +1,98 @@
+.\" Copyright (c) 1983, 1991 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" SPDX-License-Identifier: BSD-4-Clause-UC
+.\"
+.\" $Id: shutdown.2,v 1.1.1.1 1999/03/21 22:52:23 freitag Exp $
+.\"
+.\" Modified Sat Jul 24 09:57:55 1993 by Rik Faith <faith@cs.unc.edu>
+.\" Modified Tue Oct 22 22:04:51 1996 by Eric S. Raymond <esr@thyrsus.com>
+.\" Modified 1998 by Andi Kleen
+.\"
+.TH shutdown 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+shutdown \- shut down part of a full-duplex connection
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/socket.h>
+.PP
+.BI "int shutdown(int " sockfd ", int " how );
+.fi
+.SH DESCRIPTION
+The
+.BR shutdown ()
+call causes all or part of a full-duplex connection on the socket
+associated with
+.I sockfd
+to be shut down.
+If
+.I how
+is
+.BR SHUT_RD ,
+further receptions will be disallowed.
+If
+.I how
+is
+.BR SHUT_WR ,
+further transmissions will be disallowed.
+If
+.I how
+is
+.BR SHUT_RDWR ,
+further receptions and transmissions will be disallowed.
+.SH RETURN VALUE
+On success, zero is returned.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EBADF
+.I sockfd
+is not a valid file descriptor.
+.TP
+.B EINVAL
+An invalid value was specified in
+.I how
+(but see BUGS).
+.TP
+.B ENOTCONN
+The specified socket is not connected.
+.TP
+.B ENOTSOCK
+The file descriptor
+.I sockfd
+does not refer to a socket.
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+POSIX.1-2001, 4.4BSD
+(first appeared in 4.2BSD).
+.SH NOTES
+The constants
+.BR SHUT_RD ,
+.BR SHUT_WR ,
+.B SHUT_RDWR
+have the value 0, 1, 2,
+respectively, and are defined in
+.I <sys/socket.h>
+since glibc-2.1.91.
+.SH BUGS
+Checks for the validity of
+.I how
+are done in domain-specific code,
+and before Linux 3.7 not all domains performed these checks.
+.\" https://bugzilla.kernel.org/show_bug.cgi?id=47111
+Most notably, UNIX domain sockets simply ignored invalid values.
+This problem was fixed for UNIX domain sockets
+.\" commit fc61b928dc4d72176cf4bd4d30bf1d22e599aefc
+.\" and for DECnet sockets in commit 46b66d7077b89fb4917ceef19b3f7dd86055c94a
+in Linux 3.7.
+.SH SEE ALSO
+.BR close (2),
+.BR connect (2),
+.BR socket (2),
+.BR socket (7)
diff --git a/man2/sigaction.2 b/man2/sigaction.2
new file mode 100644
index 0000000..8edde42
--- /dev/null
+++ b/man2/sigaction.2
@@ -0,0 +1,1203 @@
+.\" Copyright (c) 1994,1995 Mike Battersby <mib@deakin.edu.au>
+.\" and Copyright 2004, 2005 Michael Kerrisk <mtk.manpages@gmail.com>
+.\" based on work by faith@cs.unc.edu
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified, aeb, 960424
+.\" Modified Fri Jan 31 17:31:20 1997 by Eric S. Raymond <esr@thyrsus.com>
+.\" Modified Thu Nov 26 02:12:45 1998 by aeb - add SIGCHLD stuff.
+.\" Modified Sat May 8 17:40:19 1999 by Matthew Wilcox
+.\" add POSIX.1b signals
+.\" Modified Sat Dec 29 01:44:52 2001 by Evan Jones <ejones@uwaterloo.ca>
+.\" SA_ONSTACK
+.\" Modified 2004-11-11 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Added mention of SIGCONT under SA_NOCLDSTOP
+.\" Added SA_NOCLDWAIT
+.\" Modified 2004-11-17 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Updated discussion for POSIX.1-2001 and SIGCHLD and sa_flags.
+.\" Formatting fixes
+.\" 2004-12-09, mtk, added SI_TKILL + other minor changes
+.\" 2005-09-15, mtk, split sigpending(), sigprocmask(), sigsuspend()
+.\" out of this page into separate pages.
+.\" 2010-06-11 Andi Kleen, add hwpoison signal extensions
+.\" 2010-06-11 mtk, improvements to discussion of various siginfo_t fields.
+.\" 2015-01-17, Kees Cook <keescook@chromium.org>
+.\" Added notes on ptrace SIGTRAP and SYS_SECCOMP.
+.\"
+.TH sigaction 2 2023-05-03 "Linux man-pages 6.05.01"
+.SH NAME
+sigaction, rt_sigaction \- examine and change a signal action
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <signal.h>
+.PP
+.BI "int sigaction(int " signum ,
+.BI " const struct sigaction *_Nullable restrict " act ,
+.BI " struct sigaction *_Nullable restrict " oldact );
+.fi
+.PP
+.RS -4
+Feature Test Macro Requirements for glibc (see
+.BR feature_test_macros (7)):
+.RE
+.PP
+.BR sigaction ():
+.nf
+ _POSIX_C_SOURCE
+.fi
+.PP
+.IR siginfo_t :
+.nf
+ _POSIX_C_SOURCE >= 199309L
+.fi
+.SH DESCRIPTION
+The
+.BR sigaction ()
+system call is used to change the action taken by a process on
+receipt of a specific signal.
+(See
+.BR signal (7)
+for an overview of signals.)
+.PP
+.I signum
+specifies the signal and can be any valid signal except
+.B SIGKILL
+and
+.BR SIGSTOP .
+.PP
+If
+.I act
+is non-NULL, the new action for signal
+.I signum
+is installed from
+.IR act .
+If
+.I oldact
+is non-NULL, the previous action is saved in
+.IR oldact .
+.PP
+The
+.I sigaction
+structure is defined as something like:
+.PP
+.in +4n
+.EX
+struct sigaction {
+ void (*sa_handler)(int);
+ void (*sa_sigaction)(int, siginfo_t *, void *);
+ sigset_t sa_mask;
+ int sa_flags;
+ void (*sa_restorer)(void);
+};
+.EE
+.in
+.PP
+On some architectures a union is involved: do not assign to both
+.I sa_handler
+and
+.IR sa_sigaction .
+.PP
+The
+.I sa_restorer
+field is not intended for application use.
+(POSIX does not specify a
+.I sa_restorer
+field.)
+Some further details of the purpose of this field can be found in
+.BR sigreturn (2).
+.PP
+.I sa_handler
+specifies the action to be associated with
+.I signum
+and can be one of the following:
+.IP \[bu] 3
+.B SIG_DFL
+for the default action.
+.IP \[bu]
+.B SIG_IGN
+to ignore this signal.
+.IP \[bu]
+A pointer to a signal handling function.
+This function receives the signal number as its only argument.
+.PP
+If
+.B SA_SIGINFO
+is specified in
+.IR sa_flags ,
+then
+.I sa_sigaction
+(instead of
+.IR sa_handler )
+specifies the signal-handling function for
+.IR signum .
+This function receives three arguments, as described below.
+.PP
+.I sa_mask
+specifies a mask of signals which should be blocked
+(i.e., added to the signal mask of the thread in which
+the signal handler is invoked)
+during execution of the signal handler.
+In addition, the signal which triggered the handler
+will be blocked, unless the
+.B SA_NODEFER
+flag is used.
+.PP
+.I sa_flags
+specifies a set of flags which modify the behavior of the signal.
+It is formed by the bitwise OR of zero or more of the following:
+.TP
+.B SA_NOCLDSTOP
+If
+.I signum
+is
+.BR SIGCHLD ,
+do not receive notification when child processes stop (i.e., when they
+receive one of
+.BR SIGSTOP ", " SIGTSTP ", " SIGTTIN ,
+or
+.BR SIGTTOU )
+or resume (i.e., they receive
+.BR SIGCONT )
+(see
+.BR wait (2)).
+This flag is meaningful only when establishing a handler for
+.BR SIGCHLD .
+.TP
+.BR SA_NOCLDWAIT " (since Linux 2.6)"
+.\" To be precise: Linux 2.5.60 -- MTK
+If
+.I signum
+is
+.BR SIGCHLD ,
+do not transform children into zombies when they terminate.
+See also
+.BR waitpid (2).
+This flag is meaningful only when establishing a handler for
+.BR SIGCHLD ,
+or when setting that signal's disposition to
+.BR SIG_DFL .
+.IP
+If the
+.B SA_NOCLDWAIT
+flag is set when establishing a handler for
+.BR SIGCHLD ,
+POSIX.1 leaves it unspecified whether a
+.B SIGCHLD
+signal is generated when a child process terminates.
+On Linux, a
+.B SIGCHLD
+signal is generated in this case;
+on some other implementations, it is not.
+.TP
+.B SA_NODEFER
+Do not add the signal to the thread's signal mask while the
+handler is executing, unless the signal is specified in
+.IR act.sa_mask .
+Consequently, a further instance of the signal may be delivered
+to the thread while it is executing the handler.
+This flag is meaningful only when establishing a signal handler.
+.IP
+.B SA_NOMASK
+is an obsolete, nonstandard synonym for this flag.
+.TP
+.B SA_ONSTACK
+Call the signal handler on an alternate signal stack provided by
+.BR sigaltstack (2).
+If an alternate stack is not available, the default stack will be used.
+This flag is meaningful only when establishing a signal handler.
+.TP
+.B SA_RESETHAND
+Restore the signal action to the default upon entry to the signal handler.
+This flag is meaningful only when establishing a signal handler.
+.IP
+.B SA_ONESHOT
+is an obsolete, nonstandard synonym for this flag.
+.TP
+.B SA_RESTART
+Provide behavior compatible with BSD signal semantics by making certain
+system calls restartable across signals.
+This flag is meaningful only when establishing a signal handler.
+See
+.BR signal (7)
+for a discussion of system call restarting.
+.TP
+.B SA_RESTORER
+.IR "Not intended for application use" .
+This flag is used by C libraries to indicate that the
+.I sa_restorer
+field contains the address of a "signal trampoline".
+See
+.BR sigreturn (2)
+for more details.
+.TP
+.BR SA_SIGINFO " (since Linux 2.2)"
+The signal handler takes three arguments, not one.
+In this case,
+.I sa_sigaction
+should be set instead of
+.IR sa_handler .
+This flag is meaningful only when establishing a signal handler.
+.\" (The
+.\" .I sa_sigaction
+.\" field was added in Linux 2.1.86.)
+.\"
+.TP
+.BR SA_UNSUPPORTED " (since Linux 5.11)"
+Used to dynamically probe for flag bit support.
+.IP
+If an attempt to register a handler succeeds with this flag set in
+.I act\->sa_flags
+alongside other flags that are potentially unsupported by the kernel,
+and an immediately subsequent
+.BR sigaction ()
+call specifying the same signal number and with a non-NULL
+.I oldact
+argument yields
+.B SA_UNSUPPORTED
+.I clear
+in
+.IR oldact\->sa_flags ,
+then
+.I oldact\->sa_flags
+may be used as a bitmask
+describing which of the potentially unsupported flags are,
+in fact, supported.
+See the section "Dynamically probing for flag bit support"
+below for more details.
+.TP
+.BR SA_EXPOSE_TAGBITS " (since Linux 5.11)"
+Normally, when delivering a signal,
+an architecture-specific set of tag bits is cleared from the
+.I si_addr
+field of
+.IR siginfo_t .
+If this flag is set,
+an architecture-specific subset of the tag bits will be preserved in
+.IR si_addr .
+.IP
+Programs that need to be compatible with Linux versions older than 5.11
+must use
+.B SA_UNSUPPORTED
+to probe for support.
+.SS The siginfo_t argument to a SA_SIGINFO handler
+When the
+.B SA_SIGINFO
+flag is specified in
+.IR act.sa_flags ,
+the signal handler address is passed via the
+.I act.sa_sigaction
+field.
+This handler takes three arguments, as follows:
+.PP
+.in +4n
+.EX
+void
+handler(int sig, siginfo_t *info, void *ucontext)
+{
+ ...
+}
+.EE
+.in
+.PP
+These three arguments are as follows:
+.TP
+.I sig
+The number of the signal that caused invocation of the handler.
+.TP
+.I info
+A pointer to a
+.IR siginfo_t ,
+which is a structure containing further information about the signal,
+as described below.
+.TP
+.I ucontext
+This is a pointer to a
+.I ucontext_t
+structure, cast to \fIvoid\ *\fP.
+The structure pointed to by this field contains
+signal context information that was saved
+on the user-space stack by the kernel; for details, see
+.BR sigreturn (2).
+Further information about the
+.I ucontext_t
+structure can be found in
+.BR getcontext (3)
+and
+.BR signal (7).
+Commonly, the handler function doesn't make any use of the third argument.
+.PP
+The
+.I siginfo_t
+data type is a structure with the following fields:
+.PP
+.in +4n
+.EX
+siginfo_t {
+ int si_signo; /* Signal number */
+ int si_errno; /* An errno value */
+ int si_code; /* Signal code */
+ int si_trapno; /* Trap number that caused
+ hardware\-generated signal
+ (unused on most architectures) */
+.\" FIXME
+.\" The siginfo_t 'si_trapno' field seems to be used
+.\" only on SPARC and Alpha; this page could use
+.\" a little more detail on its purpose there.
+ pid_t si_pid; /* Sending process ID */
+ uid_t si_uid; /* Real user ID of sending process */
+ int si_status; /* Exit value or signal */
+ clock_t si_utime; /* User time consumed */
+ clock_t si_stime; /* System time consumed */
+ union sigval si_value; /* Signal value */
+ int si_int; /* POSIX.1b signal */
+ void *si_ptr; /* POSIX.1b signal */
+ int si_overrun; /* Timer overrun count;
+ POSIX.1b timers */
+ int si_timerid; /* Timer ID; POSIX.1b timers */
+.\" In the kernel: si_tid
+ void *si_addr; /* Memory location which caused fault */
+ long si_band; /* Band event (was \fIint\fP in
+ glibc 2.3.2 and earlier) */
+ int si_fd; /* File descriptor */
+ short si_addr_lsb; /* Least significant bit of address
+ (since Linux 2.6.32) */
+ void *si_lower; /* Lower bound when address violation
+ occurred (since Linux 3.19) */
+ void *si_upper; /* Upper bound when address violation
+ occurred (since Linux 3.19) */
+ int si_pkey; /* Protection key on PTE that caused
+ fault (since Linux 4.6) */
+ void *si_call_addr; /* Address of system call instruction
+ (since Linux 3.5) */
+ int si_syscall; /* Number of attempted system call
+ (since Linux 3.5) */
+ unsigned int si_arch; /* Architecture of attempted system call
+ (since Linux 3.5) */
+}
+.EE
+.in
+.PP
+.IR si_signo ", " si_errno " and " si_code
+are defined for all signals.
+.RI ( si_errno
+is generally unused on Linux.)
+The rest of the struct may be a union, so that one should
+read only the fields that are meaningful for the given signal:
+.IP \[bu] 3
+Signals sent with
+.BR kill (2)
+and
+.BR sigqueue (3)
+fill in
+.IR si_pid " and " si_uid .
+In addition, signals sent with
+.BR sigqueue (3)
+fill in
+.IR si_int " and " si_ptr
+with the values specified by the sender of the signal;
+see
+.BR sigqueue (3)
+for more details.
+.IP \[bu]
+Signals sent by POSIX.1b timers (since Linux 2.6) fill in
+.I si_overrun
+and
+.IR si_timerid .
+The
+.I si_timerid
+field is an internal ID used by the kernel to identify
+the timer; it is not the same as the timer ID returned by
+.BR timer_create (2).
+The
+.I si_overrun
+field is the timer overrun count;
+this is the same information as is obtained by a call to
+.BR timer_getoverrun (2).
+These fields are nonstandard Linux extensions.
+.IP \[bu]
+Signals sent for message queue notification (see the description of
+.B SIGEV_SIGNAL
+in
+.BR mq_notify (3))
+fill in
+.IR si_int / si_ptr ,
+with the
+.I sigev_value
+supplied to
+.BR mq_notify (3);
+.IR si_pid ,
+with the process ID of the message sender; and
+.IR si_uid ,
+with the real user ID of the message sender.
+.IP \[bu]
+.B SIGCHLD
+fills in
+.IR si_pid ", " si_uid ", " si_status ", " si_utime ", and " si_stime ,
+providing information about the child.
+The
+.I si_pid
+field is the process ID of the child;
+.I si_uid
+is the child's real user ID.
+The
+.I si_status
+field contains the exit status of the child (if
+.I si_code
+is
+.BR CLD_EXITED ),
+or the signal number that caused the process to change state.
+The
+.I si_utime
+and
+.I si_stime
+contain the user and system CPU time used by the child process;
+these fields do not include the times used by waited-for children (unlike
+.BR getrusage (2)
+and
+.BR times (2)).
+Up to Linux 2.6, and since Linux 2.6.27, these fields report
+CPU time in units of
+.IR sysconf(_SC_CLK_TCK) .
+In Linux 2.6 kernels before Linux 2.6.27,
+a bug meant that these fields reported time in units
+of the (configurable) system jiffy (see
+.BR time (7)).
+.\" FIXME .
+.\" When si_utime and si_stime were originally implemented, the
+.\" measurement unit was HZ, which was the same as clock ticks
+.\" (sysconf(_SC_CLK_TCK)). In Linux 2.6, HZ became configurable, and
+.\" was *still* used as the unit to return the info in these fields,
+.\" with the result that the field values depended on the
+.\" configured HZ. Of course, they should have been measured in
+.\" USER_HZ instead, so that sysconf(_SC_CLK_TCK) could be used to
+.\" convert to seconds. I have a queued patch to fix this:
+.\" http://thread.gmane.org/gmane.linux.kernel/698061/ .
+.\" This patch made it into Linux 2.6.27.
+.\" But note that these fields still don't return the times of
+.\" waited-for children (as is done by getrusage() and times()
+.\" and wait4()). Solaris 8 does include child times.
+.IP \[bu]
+.BR SIGILL ,
+.BR SIGFPE ,
+.BR SIGSEGV ,
+.BR SIGBUS ,
+and
+.B SIGTRAP
+fill in
+.I si_addr
+with the address of the fault.
+On some architectures,
+these signals also fill in the
+.I si_trapno
+field.
+.IP
+Some suberrors of
+.BR SIGBUS ,
+in particular
+.B BUS_MCEERR_AO
+and
+.BR BUS_MCEERR_AR ,
+also fill in
+.IR si_addr_lsb .
+This field indicates the least significant bit of the reported address
+and therefore the extent of the corruption.
+For example, if a full page was corrupted,
+.I si_addr_lsb
+contains
+.IR log2(sysconf(_SC_PAGESIZE)) .
+When
+.B SIGTRAP
+is delivered in response to a
+.BR ptrace (2)
+event (PTRACE_EVENT_foo),
+.I si_addr
+is not populated, but
+.I si_pid
+and
+.I si_uid
+are populated with the respective process ID and user ID responsible for
+delivering the trap.
+In the case of
+.BR seccomp (2),
+the tracee will be shown as delivering the event.
+.B BUS_MCEERR_*
+and
+.I si_addr_lsb
+are Linux-specific extensions.
+.IP
+The
+.B SEGV_BNDERR
+suberror of
+.B SIGSEGV
+populates
+.I si_lower
+and
+.IR si_upper .
+.IP
+The
+.B SEGV_PKUERR
+suberror of
+.B SIGSEGV
+populates
+.IR si_pkey .
+.IP \[bu]
+.BR SIGIO / SIGPOLL
+(the two names are synonyms on Linux)
+fills in
+.I si_band
+and
+.IR si_fd .
+The
+.I si_band
+event is a bit mask containing the same values as are filled in the
+.I revents
+field by
+.BR poll (2).
+The
+.I si_fd
+field indicates the file descriptor for which the I/O event occurred;
+for further details, see the description of
+.B F_SETSIG
+in
+.BR fcntl (2).
+.IP \[bu]
+.BR SIGSYS ,
+generated (since Linux 3.5)
+.\" commit a0727e8ce513fe6890416da960181ceb10fbfae6
+when a seccomp filter returns
+.BR SECCOMP_RET_TRAP ,
+fills in
+.IR si_call_addr ,
+.IR si_syscall ,
+.IR si_arch ,
+.IR si_errno ,
+and other fields as described in
+.BR seccomp (2).
+.\"
+.SS
+The si_code field
+The
+.I si_code
+field inside the
+.I siginfo_t
+argument that is passed to a
+.B SA_SIGINFO
+signal handler is a value (not a bit mask)
+indicating why this signal was sent.
+For a
+.BR ptrace (2)
+event,
+.I si_code
+will contain
+.B SIGTRAP
+and have the ptrace event in the high byte:
+.PP
+.in +4n
+.EX
+(SIGTRAP | PTRACE_EVENT_foo << 8).
+.EE
+.in
+.PP
+For a
+.RB non- ptrace (2)
+event, the values that can appear in
+.I si_code
+are described in the remainder of this section.
+Since glibc 2.20,
+the definitions of most of these symbols are obtained from
+.I <signal.h>
+by defining feature test macros (before including
+.I any
+header file) as follows:
+.IP \[bu] 3
+.B _XOPEN_SOURCE
+with the value 500 or greater;
+.IP \[bu]
+.B _XOPEN_SOURCE
+and
+.BR _XOPEN_SOURCE_EXTENDED ;
+or
+.IP \[bu]
+.B _POSIX_C_SOURCE
+with the value 200809L or greater.
+.PP
+For the
+.B TRAP_*
+constants, the symbol definitions are provided only in the first two cases.
+Before glibc 2.20, no feature test macros were required to obtain these symbols.
+.PP
+For a regular signal, the following list shows the values which can be
+placed in
+.I si_code
+for any signal, along with the reason that the signal was generated.
+.RS 4
+.TP
+.B SI_USER
+.BR kill (2).
+.TP
+.B SI_KERNEL
+Sent by the kernel.
+.TP
+.B SI_QUEUE
+.BR sigqueue (3).
+.TP
+.B SI_TIMER
+POSIX timer expired.
+.TP
+.BR SI_MESGQ " (since Linux 2.6.6)"
+POSIX message queue state changed; see
+.BR mq_notify (3).
+.TP
+.B SI_ASYNCIO
+AIO completed.
+.TP
+.B SI_SIGIO
+Queued
+.B SIGIO
+(only up to Linux 2.2; from Linux 2.4 onward
+.BR SIGIO / SIGPOLL
+fills in
+.I si_code
+as described below).
+.TP
+.BR SI_TKILL " (since Linux 2.4.19)"
+.BR tkill (2)
+or
+.BR tgkill (2).
+.\" SI_DETHREAD is defined in Linux 2.6.9 sources, but isn't implemented
+.\" It appears to have been an idea that was tried during 2.5.6
+.\" through to Linux 2.5.24 and then was backed out.
+.RE
+.PP
+The following values can be placed in
+.I si_code
+for a
+.B SIGILL
+signal:
+.RS 4
+.TP
+.B ILL_ILLOPC
+Illegal opcode.
+.TP
+.B ILL_ILLOPN
+Illegal operand.
+.TP
+.B ILL_ILLADR
+Illegal addressing mode.
+.TP
+.B ILL_ILLTRP
+Illegal trap.
+.TP
+.B ILL_PRVOPC
+Privileged opcode.
+.TP
+.B ILL_PRVREG
+Privileged register.
+.TP
+.B ILL_COPROC
+Coprocessor error.
+.TP
+.B ILL_BADSTK
+Internal stack error.
+.RE
+.PP
+The following values can be placed in
+.I si_code
+for a
+.B SIGFPE
+signal:
+.RS 4
+.TP
+.B FPE_INTDIV
+Integer divide by zero.
+.TP
+.B FPE_INTOVF
+Integer overflow.
+.TP
+.B FPE_FLTDIV
+Floating-point divide by zero.
+.TP
+.B FPE_FLTOVF
+Floating-point overflow.
+.TP
+.B FPE_FLTUND
+Floating-point underflow.
+.TP
+.B FPE_FLTRES
+Floating-point inexact result.
+.TP
+.B FPE_FLTINV
+Floating-point invalid operation.
+.TP
+.B FPE_FLTSUB
+Subscript out of range.
+.RE
+.PP
+The following values can be placed in
+.I si_code
+for a
+.B SIGSEGV
+signal:
+.RS 4
+.TP
+.B SEGV_MAPERR
+Address not mapped to object.
+.TP
+.B SEGV_ACCERR
+Invalid permissions for mapped object.
+.TP
+.BR SEGV_BNDERR " (since Linux 3.19)"
+.\" commit ee1b58d36aa1b5a79eaba11f5c3633c88231da83
+Failed address bound checks.
+.TP
+.BR SEGV_PKUERR " (since Linux 4.6)"
+.\" commit cd0ea35ff5511cde299a61c21a95889b4a71464e
+Access was denied by memory protection keys.
+See
+.BR pkeys (7).
+The protection key which applied to this access is available via
+.IR si_pkey .
+.RE
+.PP
+The following values can be placed in
+.I si_code
+for a
+.B SIGBUS
+signal:
+.RS 4
+.TP
+.B BUS_ADRALN
+Invalid address alignment.
+.TP
+.B BUS_ADRERR
+Nonexistent physical address.
+.TP
+.B BUS_OBJERR
+Object-specific hardware error.
+.TP
+.BR BUS_MCEERR_AR " (since Linux 2.6.32)"
+Hardware memory error consumed on a machine check; action required.
+.TP
+.BR BUS_MCEERR_AO " (since Linux 2.6.32)"
+Hardware memory error detected in process but not consumed; action optional.
+.RE
+.PP
+The following values can be placed in
+.I si_code
+for a
+.B SIGTRAP
+signal:
+.RS 4
+.TP
+.B TRAP_BRKPT
+Process breakpoint.
+.TP
+.B TRAP_TRACE
+Process trace trap.
+.TP
+.BR TRAP_BRANCH " (since Linux 2.4, IA64 only)"
+Process taken branch trap.
+.TP
+.BR TRAP_HWBKPT " (since Linux 2.4, IA64 only)"
+Hardware breakpoint/watchpoint.
+.RE
+.PP
+The following values can be placed in
+.I si_code
+for a
+.B SIGCHLD
+signal:
+.RS 4
+.TP
+.B CLD_EXITED
+Child has exited.
+.TP
+.B CLD_KILLED
+Child was killed.
+.TP
+.B CLD_DUMPED
+Child terminated abnormally.
+.TP
+.B CLD_TRAPPED
+Traced child has trapped.
+.TP
+.B CLD_STOPPED
+Child has stopped.
+.TP
+.BR CLD_CONTINUED " (since Linux 2.6.9)"
+Stopped child has continued.
+.RE
+.PP
+The following values can be placed in
+.I si_code
+for a
+.BR SIGIO / SIGPOLL
+signal:
+.RS 4
+.TP
+.B POLL_IN
+Data input available.
+.TP
+.B POLL_OUT
+Output buffers available.
+.TP
+.B POLL_MSG
+Input message available.
+.TP
+.B POLL_ERR
+I/O error.
+.TP
+.B POLL_PRI
+High priority input available.
+.TP
+.B POLL_HUP
+Device disconnected.
+.RE
+.PP
+The following value can be placed in
+.I si_code
+for a
+.B SIGSYS
+signal:
+.RS 4
+.TP
+.BR SYS_SECCOMP " (since Linux 3.5)"
+Triggered by a
+.BR seccomp (2)
+filter rule.
+.RE
+.SS Dynamically probing for flag bit support
+The
+.BR sigaction ()
+call on Linux accepts unknown bits set in
+.I act\->sa_flags
+without error.
+The behavior of the kernel starting with Linux 5.11 is that a second
+.BR sigaction ()
+will clear unknown bits from
+.IR oldact\->sa_flags .
+However, historically, a second
+.BR sigaction ()
+call would typically leave those bits set in
+.IR oldact\->sa_flags .
+.PP
+This means that support for new flags cannot be detected
+simply by testing for a flag in
+.IR sa_flags ,
+and a program must test that
+.B SA_UNSUPPORTED
+has been cleared before relying on the contents of
+.IR sa_flags .
+.PP
+Since the behavior of the signal handler cannot be guaranteed
+unless the check passes,
+it is wise to either block the affected signal
+while registering the handler and performing the check in this case,
+or where this is not possible,
+for example if the signal is synchronous, to issue the second
+.BR sigaction ()
+in the signal handler itself.
+.PP
+In kernels that do not support a specific flag,
+the kernel's behavior is as if the flag was not set,
+even if the flag was set in
+.IR act\->sa_flags .
+.PP
+The flags
+.BR SA_NOCLDSTOP ,
+.BR SA_NOCLDWAIT ,
+.BR SA_SIGINFO ,
+.BR SA_ONSTACK ,
+.BR SA_RESTART ,
+.BR SA_NODEFER ,
+.BR SA_RESETHAND ,
+and, if defined by the architecture,
+.B SA_RESTORER
+may not be reliably probed for using this mechanism,
+because they were introduced before Linux 5.11.
+However, in general, programs may assume that these flags are supported,
+since they have all been supported since Linux 2.6,
+which was released in the year 2003.
+.PP
+See EXAMPLES below for a demonstration of the use of
+.BR SA_UNSUPPORTED .
+.SH RETURN VALUE
+.BR sigaction ()
+returns 0 on success; on error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EFAULT
+.IR act " or " oldact
+points to memory which is not a valid part of the process address space.
+.TP
+.B EINVAL
+An invalid signal was specified.
+This will also be generated if an attempt
+is made to change the action for
+.BR SIGKILL " or " SIGSTOP ,
+which cannot be caught or ignored.
+.SH VERSIONS
+.SS C library/kernel differences
+The glibc wrapper function for
+.BR sigaction ()
+gives an error
+.RB ( EINVAL )
+on attempts to change the disposition of the two real-time signals
+used internally by the NPTL threading implementation.
+See
+.BR nptl (7)
+for details.
+.PP
+On architectures where the signal trampoline resides in the C library,
+the glibc wrapper function for
+.BR sigaction ()
+places the address of the trampoline code in the
+.I act.sa_restorer
+field and sets the
+.B SA_RESTORER
+flag in the
+.I act.sa_flags
+field.
+See
+.BR sigreturn (2).
+.PP
+The original Linux system call was named
+.BR sigaction ().
+However, with the addition of real-time signals in Linux 2.2,
+the fixed-size, 32-bit
+.I sigset_t
+type supported by that system call was no longer fit for purpose.
+Consequently, a new system call,
+.BR rt_sigaction (),
+was added to support an enlarged
+.I sigset_t
+type.
+The new system call takes a fourth argument,
+.IR "size_t sigsetsize" ,
+which specifies the size in bytes of the signal sets in
+.I act.sa_mask
+and
+.IR oldact.sa_mask .
+This argument is currently required to have the value
+.I sizeof(sigset_t)
+(or the error
+.B EINVAL
+results).
+The glibc
+.BR sigaction ()
+wrapper function hides these details from us, transparently calling
+.BR rt_sigaction ()
+when the kernel provides it.
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+POSIX.1-2001, SVr4.
+.\" SVr4 does not document the EINTR condition.
+.PP
+POSIX.1-1990 disallowed setting the action for
+.B SIGCHLD
+to
+.BR SIG_IGN .
+POSIX.1-2001 and later allow this possibility, so that ignoring
+.B SIGCHLD
+can be used to prevent the creation of zombies (see
+.BR wait (2)).
+Nevertheless, the historical BSD and System\ V behaviors for ignoring
+.B SIGCHLD
+differ, so that the only completely portable method of ensuring that
+terminated children do not become zombies is to catch the
+.B SIGCHLD
+signal and perform a
+.BR wait (2)
+or similar.
+.PP
+POSIX.1-1990 specified only
+.BR SA_NOCLDSTOP .
+POSIX.1-2001 added
+.BR SA_NOCLDWAIT ,
+.BR SA_NODEFER ,
+.BR SA_ONSTACK ,
+.BR SA_RESETHAND ,
+.BR SA_RESTART ,
+and
+.BR SA_SIGINFO .
+Use of these latter values in
+.I sa_flags
+may be less portable in applications intended for older
+UNIX implementations.
+.PP
+The
+.B SA_RESETHAND
+flag is compatible with the SVr4 flag of the same name.
+.PP
+The
+.B SA_NODEFER
+flag is compatible with the SVr4 flag of the same name under kernels
+1.3.9 and later.
+On older kernels the Linux implementation
+allowed the receipt of any signal, not just the one we are installing
+(effectively overriding any
+.I sa_mask
+settings).
+.SH NOTES
+A child created via
+.BR fork (2)
+inherits a copy of its parent's signal dispositions.
+During an
+.BR execve (2),
+the dispositions of handled signals are reset to the default;
+the dispositions of ignored signals are left unchanged.
+.PP
+According to POSIX, the behavior of a process is undefined after it
+ignores a
+.BR SIGFPE ,
+.BR SIGILL ,
+or
+.B SIGSEGV
+signal that was not generated by
+.BR kill (2)
+or
+.BR raise (3).
+Integer division by zero has undefined result.
+On some architectures it will generate a
+.B SIGFPE
+signal.
+(Also dividing the most negative integer by \-1 may generate
+.BR SIGFPE .)
+Ignoring this signal might lead to an endless loop.
+.PP
+.BR sigaction ()
+can be called with a NULL second argument to query the current signal
+handler.
+It can also be used to check whether a given signal is valid for
+the current machine by calling it with NULL second and third arguments.
+.PP
+It is not possible to block
+.BR SIGKILL " or " SIGSTOP
+(by specifying them in
+.IR sa_mask ).
+Attempts to do so are silently ignored.
+.PP
+See
+.BR sigsetops (3)
+for details on manipulating signal sets.
+.PP
+See
+.BR signal\-safety (7)
+for a list of the async-signal-safe functions that can be
+safely called from inside a signal handler.
+.\"
+.SS Undocumented
+Before the introduction of
+.BR SA_SIGINFO ,
+it was also possible to get some additional information about the signal.
+This was done by providing an
+.I sa_handler
+signal handler with a second argument of type
+.IR "struct sigcontext" ,
+which is the same structure as the one that is passed in the
+.I uc_mcontext
+field of the
+.I ucontext
+structure that is passed (via a pointer) in the third argument of the
+.I sa_sigaction
+handler.
+See the relevant Linux kernel sources for details.
+This use is obsolete now.
+.SH BUGS
+When delivering a signal with a
+.B SA_SIGINFO
+handler,
+the kernel does not always provide meaningful values
+for all of the fields of the
+.I siginfo_t
+that are relevant for that signal.
+.PP
+Up to and including Linux 2.6.13, specifying
+.B SA_NODEFER
+in
+.I sa_flags
+prevents not only the delivered signal from being masked during
+execution of the handler, but also the signals specified in
+.IR sa_mask .
+This bug was fixed in Linux 2.6.14.
+.\" commit 69be8f189653cd81aae5a74e26615b12871bb72e
+.SH EXAMPLES
+See
+.BR mprotect (2).
+.SS Probing for flag support
+The following example program exits with status
+.B EXIT_SUCCESS
+if
+.B SA_EXPOSE_TAGBITS
+is determined to be supported, and
+.B EXIT_FAILURE
+otherwise.
+.PP
+.\" SRC BEGIN (sigaction.c)
+.EX
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+\&
+void
+handler(int signo, siginfo_t *info, void *context)
+{
+ struct sigaction oldact;
+\&
+ if (sigaction(SIGSEGV, NULL, &oldact) == \-1
+ || (oldact.sa_flags & SA_UNSUPPORTED)
+ || !(oldact.sa_flags & SA_EXPOSE_TAGBITS))
+ {
+ _exit(EXIT_FAILURE);
+ }
+ _exit(EXIT_SUCCESS);
+}
+\&
+int
+main(void)
+{
+ struct sigaction act = { 0 };
+\&
+ act.sa_flags = SA_SIGINFO | SA_UNSUPPORTED | SA_EXPOSE_TAGBITS;
+ act.sa_sigaction = &handler;
+ if (sigaction(SIGSEGV, &act, NULL) == \-1) {
+ perror("sigaction");
+ exit(EXIT_FAILURE);
+ }
+\&
+ raise(SIGSEGV);
+}
+.EE
+.\" SRC END
+.SH SEE ALSO
+.BR kill (1),
+.BR kill (2),
+.BR pause (2),
+.BR pidfd_send_signal (2),
+.BR restart_syscall (2),
+.BR seccomp (2),
+.BR sigaltstack (2),
+.BR signal (2),
+.BR signalfd (2),
+.BR sigpending (2),
+.BR sigprocmask (2),
+.BR sigreturn (2),
+.BR sigsuspend (2),
+.BR wait (2),
+.BR killpg (3),
+.BR raise (3),
+.BR siginterrupt (3),
+.BR sigqueue (3),
+.BR sigsetops (3),
+.BR sigvec (3),
+.BR core (5),
+.BR signal (7)
diff --git a/man2/sigaltstack.2 b/man2/sigaltstack.2
new file mode 100644
index 0000000..0ebfebd
--- /dev/null
+++ b/man2/sigaltstack.2
@@ -0,0 +1,363 @@
+'\" t
+.\" Copyright (c) 2001, 2017 Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" aeb, various minor fixes
+.TH sigaltstack 2 2023-07-20 "Linux man-pages 6.05.01"
+.SH NAME
+sigaltstack \- set and/or get signal stack context
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <signal.h>
+.PP
+.BI "int sigaltstack(const stack_t *_Nullable restrict " ss ,
+.BI " stack_t *_Nullable restrict " old_ss );
+.fi
+.PP
+.RS -4
+Feature Test Macro Requirements for glibc (see
+.BR feature_test_macros (7)):
+.RE
+.PP
+.BR sigaltstack ():
+.nf
+ _XOPEN_SOURCE >= 500
+.\" || _XOPEN_SOURCE && _XOPEN_SOURCE_EXTENDED
+ || /* Since glibc 2.12: */ _POSIX_C_SOURCE >= 200809L
+ || /* glibc <= 2.19: */ _BSD_SOURCE
+.fi
+.SH DESCRIPTION
+.BR sigaltstack ()
+allows a thread to define a new alternate
+signal stack and/or retrieve the state of an existing
+alternate signal stack.
+An alternate signal stack is used during the
+execution of a signal handler if the establishment of that handler (see
+.BR sigaction (2))
+requested it.
+.PP
+The normal sequence of events for using an alternate signal stack
+is the following:
+.TP 3
+1.
+Allocate an area of memory to be used for the alternate
+signal stack.
+.TP
+2.
+Use
+.BR sigaltstack ()
+to inform the system of the existence and
+location of the alternate signal stack.
+.TP
+3.
+When establishing a signal handler using
+.BR sigaction (2),
+inform the system that the signal handler should be executed
+on the alternate signal stack by
+specifying the \fBSA_ONSTACK\fP flag.
+.PP
+The \fIss\fP argument is used to specify a new
+alternate signal stack, while the \fIold_ss\fP argument
+is used to retrieve information about the currently
+established signal stack.
+If we are interested in performing just one
+of these tasks, then the other argument can be specified as NULL.
+.PP
+The
+.I stack_t
+type used to type the arguments of this function is defined as follows:
+.PP
+.in +4n
+.EX
+typedef struct {
+ void *ss_sp; /* Base address of stack */
+ int ss_flags; /* Flags */
+ size_t ss_size; /* Number of bytes in stack */
+} stack_t;
+.EE
+.in
+.PP
+To establish a new alternate signal stack,
+the fields of this structure are set as follows:
+.TP
+.I ss.ss_flags
+This field contains either 0, or the following flag:
+.RS
+.TP
+.BR SS_AUTODISARM " (since Linux 4.7)"
+.\" commit 2a74213838104a41588d86fd5e8d344972891ace
+.\" See tools/testing/selftests/sigaltstack/sas.c in kernel sources
+Clear the alternate signal stack settings on entry to the signal handler.
+When the signal handler returns,
+the previous alternate signal stack settings are restored.
+.IP
+This flag was added in order to make it safe
+to switch away from the signal handler with
+.BR swapcontext (3).
+Without this flag, a subsequently handled signal will corrupt
+the state of the switched-away signal handler.
+On kernels where this flag is not supported,
+.BR sigaltstack ()
+fails with the error
+.B EINVAL
+when this flag is supplied.
+.RE
+.TP
+.I ss.ss_sp
+This field specifies the starting address of the stack.
+When a signal handler is invoked on the alternate stack,
+the kernel automatically aligns the address given in \fIss.ss_sp\fP
+to a suitable address boundary for the underlying hardware architecture.
+.TP
+.I ss.ss_size
+This field specifies the size of the stack.
+The constant \fBSIGSTKSZ\fP is defined to be large enough
+to cover the usual size requirements for an alternate signal stack,
+and the constant \fBMINSIGSTKSZ\fP defines the minimum
+size required to execute a signal handler.
+.PP
+To disable an existing stack, specify \fIss.ss_flags\fP
+as \fBSS_DISABLE\fP.
+In this case, the kernel ignores any other flags in
+.I ss.ss_flags
+and the remaining fields
+in \fIss\fP.
+.PP
+If \fIold_ss\fP is not NULL, then it is used to return information about
+the alternate signal stack which was in effect prior to the
+call to
+.BR sigaltstack ().
+The \fIold_ss.ss_sp\fP and \fIold_ss.ss_size\fP fields return the starting
+address and size of that stack.
+The \fIold_ss.ss_flags\fP may return one of the following values:
+.TP
+.B SS_ONSTACK
+The thread is currently executing on the alternate signal stack.
+(Note that it is not possible
+to change the alternate signal stack if the thread is
+currently executing on it.)
+.TP
+.B SS_DISABLE
+The alternate signal stack is currently disabled.
+.IP
+Alternatively, this value is returned if the thread is currently
+executing on an alternate signal stack that was established using the
+.B SS_AUTODISARM
+flag.
+In this case, it is safe to switch away from the signal handler with
+.BR swapcontext (3).
+It is also possible to set up a different alternative signal stack
+using a further call to
+.BR sigaltstack ().
+.\" FIXME Was it intended that one can set up a different alternative
+.\" signal stack in this scenario? (In passing, if one does this, the
+.\" sigaltstack(NULL, &old_ss) now returns old_ss.ss_flags==SS_AUTODISARM
+.\" rather than old_ss.ss_flags==SS_DISABLE. The API design here seems
+.\" confusing...
+.TP
+.B SS_AUTODISARM
+The alternate signal stack has been marked to be autodisarmed
+as described above.
+.PP
+By specifying
+.I ss
+as NULL, and
+.I old_ss
+as a non-NULL value, one can obtain the current settings for
+the alternate signal stack without changing them.
+.SH RETURN VALUE
+.BR sigaltstack ()
+returns 0 on success, or \-1 on failure with
+\fIerrno\fP set to indicate the error.
+.SH ERRORS
+.TP
+.B EFAULT
+Either \fIss\fP or \fIold_ss\fP is not NULL and points to an area
+outside of the process's address space.
+.TP
+.B EINVAL
+\fIss\fP is not NULL and the \fIss_flags\fP field contains
+an invalid flag.
+.TP
+.B ENOMEM
+The specified size of the new alternate signal stack
+.I ss.ss_size
+was less than
+.BR MINSIGSTKSZ .
+.TP
+.B EPERM
+An attempt was made to change the alternate signal stack while
+it was active (i.e., the thread was already executing
+on the current alternate signal stack).
+.SH ATTRIBUTES
+For an explanation of the terms used in this section, see
+.BR attributes (7).
+.TS
+allbox;
+lbx lb lb
+l l l.
+Interface Attribute Value
+T{
+.na
+.nh
+.BR sigaltstack ()
+T} Thread safety MT-Safe
+.TE
+.sp 1
+.SH STANDARDS
+POSIX.1-2008.
+.PP
+.B SS_AUTODISARM
+is a Linux extension.
+.SH HISTORY
+POSIX.1-2001, SUSv2, SVr4.
+.SH NOTES
+The most common usage of an alternate signal stack is to handle the
+.B SIGSEGV
+signal that is generated if the space available for the
+standard stack is exhausted: in this case, a signal handler for
+.B SIGSEGV
+cannot be invoked on the standard stack; if we wish to handle it,
+we must use an alternate signal stack.
+.PP
+Establishing an alternate signal stack is useful if a thread
+expects that it may exhaust its standard stack.
+This may occur, for example, because the stack grows so large
+that it encounters the upwardly growing heap, or it reaches a
+limit established by a call to \fB\%setrlimit(RLIMIT_STACK, &rlim)\fP.
+If the standard stack is exhausted, the kernel sends
+the thread a \fBSIGSEGV\fP signal.
+In these circumstances the only way to catch this signal is
+on an alternate signal stack.
+.PP
+On most hardware architectures supported by Linux, stacks grow
+downward.
+.BR sigaltstack ()
+automatically takes account
+of the direction of stack growth.
+.PP
+Functions called from a signal handler executing on an alternate
+signal stack will also use the alternate signal stack.
+(This also applies to any handlers invoked for other signals while
+the thread is executing on the alternate signal stack.)
+Unlike the standard stack, the system does not
+automatically extend the alternate signal stack.
+Exceeding the allocated size of the alternate signal stack will
+lead to unpredictable results.
+.PP
+A successful call to
+.BR execve (2)
+removes any existing alternate
+signal stack.
+A child process created via
+.BR fork (2)
+inherits a copy of its parent's alternate signal stack settings.
+The same is also true for a child process created using
+.BR clone (2),
+unless the clone flags include
+.B CLONE_VM
+and do not include
+.BR CLONE_VFORK ,
+in which case any alternate signal stack that was established in the parent
+is disabled in the child process.
+.PP
+.BR sigaltstack ()
+supersedes the older
+.BR sigstack ()
+call.
+For backward compatibility, glibc also provides
+.BR sigstack ().
+All new applications should be written using
+.BR sigaltstack ().
+.SS History
+4.2BSD had a
+.BR sigstack ()
+system call.
+It used a slightly
+different struct, and had the major disadvantage that the caller
+had to know the direction of stack growth.
+.SH BUGS
+In Linux 2.2 and earlier, the only flag that could be specified
+in
+.I ss.ss_flags
+was
+.BR SS_DISABLE .
+In the lead up to the release of the Linux 2.4 kernel,
+.\" Linux 2.3.40
+.\" After quite a bit of web and mail archive searching,
+.\" I could not find the patch on any mailing list, and I
+.\" could find no place where the rationale for this change
+.\" explained -- mtk
+a change was made so that
+.BR sigaltstack ()
+accepts
+.I ss.ss_flags==SS_ONSTACK
+with the same meaning as
+.I ss.ss_flags==0
+(i.e., the inclusion of
+.B SS_ONSTACK
+in
+.I ss.ss_flags
+is a no-op).
+On other implementations, and according to POSIX.1,
+.B SS_ONSTACK
+appears only as a reported flag in
+.IR old_ss.ss_flags .
+On Linux, there is no need ever to specify
+.B SS_ONSTACK
+in
+.IR ss.ss_flags ,
+and indeed doing so should be avoided on portability grounds:
+various other systems
+.\" See the source code of Illumos and FreeBSD, for example.
+give an error if
+.B SS_ONSTACK
+is specified in
+.IR ss.ss_flags .
+.SH EXAMPLES
+The following code segment demonstrates the use of
+.BR sigaltstack ()
+(and
+.BR sigaction (2))
+to install an alternate signal stack that is employed by a handler
+for the
+.B SIGSEGV
+signal:
+.PP
+.in +4n
+.EX
+stack_t ss;
+\&
+ss.ss_sp = malloc(SIGSTKSZ);
+if (ss.ss_sp == NULL) {
+ perror("malloc");
+ exit(EXIT_FAILURE);
+}
+\&
+ss.ss_size = SIGSTKSZ;
+ss.ss_flags = 0;
+if (sigaltstack(&ss, NULL) == \-1) {
+ perror("sigaltstack");
+ exit(EXIT_FAILURE);
+}
+\&
+sa.sa_flags = SA_ONSTACK;
+sa.sa_handler = handler; /* Address of a signal handler */
+sigemptyset(&sa.sa_mask);
+if (sigaction(SIGSEGV, &sa, NULL) == \-1) {
+ perror("sigaction");
+ exit(EXIT_FAILURE);
+}
+.EE
+.in
+.SH SEE ALSO
+.BR execve (2),
+.BR setrlimit (2),
+.BR sigaction (2),
+.BR siglongjmp (3),
+.BR sigsetjmp (3),
+.BR signal (7)
diff --git a/man2/signal.2 b/man2/signal.2
new file mode 100644
index 0000000..619babf
--- /dev/null
+++ b/man2/signal.2
@@ -0,0 +1,280 @@
+.\" Copyright (c) 2000 Andries Brouwer <aeb@cwi.nl>
+.\" and Copyright (c) 2007 Michael Kerrisk <mtk.manpages@gmail.com>
+.\" and Copyright (c) 2008, Linux Foundation, written by Michael Kerrisk
+.\" <mtk.manpages@gmail.com>
+.\" based on work by Rik Faith <faith@cs.unc.edu>
+.\" and Mike Battersby <mike@starbug.apana.org.au>.
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified 2004-11-19, mtk:
+.\" added pointer to sigaction.2 for details of ignoring SIGCHLD
+.\" 2007-06-03, mtk: strengthened portability warning, and rewrote
+.\" various sections.
+.\" 2008-07-11, mtk: rewrote and expanded portability discussion.
+.\"
+.TH signal 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+signal \- ANSI C signal handling
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <signal.h>
+.PP
+.B typedef void (*sighandler_t)(int);
+.PP
+.BI "sighandler_t signal(int " signum ", sighandler_t " handler );
+.fi
+.SH DESCRIPTION
+.BR WARNING :
+the behavior of
+.BR signal ()
+varies across UNIX versions,
+and has also varied historically across different versions of Linux.
+\fBAvoid its use\fP: use
+.BR sigaction (2)
+instead.
+See \fIPortability\fP below.
+.PP
+.BR signal ()
+sets the disposition of the signal
+.I signum
+to
+.IR handler ,
+which is either
+.BR SIG_IGN ,
+.BR SIG_DFL ,
+or the address of a programmer-defined function (a "signal handler").
+.PP
+If the signal
+.I signum
+is delivered to the process, then one of the following happens:
+.TP 3
+*
+If the disposition is set to
+.BR SIG_IGN ,
+then the signal is ignored.
+.TP
+*
+If the disposition is set to
+.BR SIG_DFL ,
+then the default action associated with the signal (see
+.BR signal (7))
+occurs.
+.TP
+*
+If the disposition is set to a function,
+then first either the disposition is reset to
+.BR SIG_DFL ,
+or the signal is blocked (see \fIPortability\fP below), and then
+.I handler
+is called with argument
+.IR signum .
+If invocation of the handler caused the signal to be blocked,
+then the signal is unblocked upon return from the handler.
+.PP
+The signals
+.B SIGKILL
+and
+.B SIGSTOP
+cannot be caught or ignored.
+.SH RETURN VALUE
+.BR signal ()
+returns the previous value of the signal handler.
+On failure, it returns
+.BR SIG_ERR ,
+and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EINVAL
+.I signum
+is invalid.
+.SH VERSIONS
+The use of
+.I sighandler_t
+is a GNU extension, exposed if
+.B _GNU_SOURCE
+is defined;
+.\" libc4 and libc5 define
+.\" .IR SignalHandler ;
+glibc also defines (the BSD-derived)
+.I sig_t
+if
+.B _BSD_SOURCE
+(glibc 2.19 and earlier)
+or
+.B _DEFAULT_SOURCE
+(glibc 2.19 and later)
+is defined.
+Without use of such a type, the declaration of
+.BR signal ()
+is somewhat harder to read:
+.PP
+.in +4n
+.EX
+.BI "void ( *" signal "(int " signum ", void (*" handler ")(int)) ) (int);"
+.EE
+.in
+.SS Portability
+The only portable use of
+.BR signal ()
+is to set a signal's disposition to
+.B SIG_DFL
+or
+.BR SIG_IGN .
+The semantics when using
+.BR signal ()
+to establish a signal handler vary across systems
+(and POSIX.1 explicitly permits this variation);
+.B do not use it for this purpose.
+.PP
+POSIX.1 solved the portability mess by specifying
+.BR sigaction (2),
+which provides explicit control of the semantics when a
+signal handler is invoked; use that interface instead of
+.BR signal ().
+.SH STANDARDS
+C11, POSIX.1-2008.
+.SH HISTORY
+C89, POSIX.1-2001.
+.PP
+In the original UNIX systems, when a handler that was established using
+.BR signal ()
+was invoked by the delivery of a signal,
+the disposition of the signal would be reset to
+.BR SIG_DFL ,
+and the system did not block delivery of further instances of the signal.
+This is equivalent to calling
+.BR sigaction (2)
+with the following flags:
+.PP
+.in +4n
+.EX
+sa.sa_flags = SA_RESETHAND | SA_NODEFER;
+.EE
+.in
+.PP
+System\ V also provides these semantics for
+.BR signal ().
+This was bad because the signal might be delivered again
+before the handler had a chance to reestablish itself.
+Furthermore, rapid deliveries of the same signal could
+result in recursive invocations of the handler.
+.PP
+BSD improved on this situation, but unfortunately also
+changed the semantics of the existing
+.BR signal ()
+interface while doing so.
+On BSD, when a signal handler is invoked,
+the signal disposition is not reset,
+and further instances of the signal are blocked from
+being delivered while the handler is executing.
+Furthermore, certain blocking system calls are automatically
+restarted if interrupted by a signal handler (see
+.BR signal (7)).
+The BSD semantics are equivalent to calling
+.BR sigaction (2)
+with the following flags:
+.PP
+.in +4n
+.EX
+sa.sa_flags = SA_RESTART;
+.EE
+.in
+.PP
+The situation on Linux is as follows:
+.IP \[bu] 3
+The kernel's
+.BR signal ()
+system call provides System\ V semantics.
+.IP \[bu]
+By default, in glibc 2 and later, the
+.BR signal ()
+wrapper function does not invoke the kernel system call.
+Instead, it calls
+.BR sigaction (2)
+using flags that supply BSD semantics.
+This default behavior is provided as long as a suitable
+feature test macro is defined:
+.B _BSD_SOURCE
+on glibc 2.19 and earlier or
+.B _DEFAULT_SOURCE
+in glibc 2.19 and later.
+(By default, these macros are defined; see
+.BR feature_test_macros (7)
+for details.)
+If such a feature test macro is not defined, then
+.BR signal ()
+provides System\ V semantics.
+.\"
+.\" System V semantics are also provided if one uses the separate
+.\" .BR sysv_signal (3)
+.\" function.
+.\" .IP *
+.\" The
+.\" .BR signal ()
+.\" function in Linux libc4 and libc5 provide System\ V semantics.
+.\" If one on a libc5 system includes
+.\" .I <bsd/signal.h>
+.\" instead of
+.\" .IR <signal.h> ,
+.\" then
+.\" .BR signal ()
+.\" provides BSD semantics.
+.SH NOTES
+The effects of
+.BR signal ()
+in a multithreaded process are unspecified.
+.PP
+According to POSIX, the behavior of a process is undefined after it
+ignores a
+.BR SIGFPE ,
+.BR SIGILL ,
+or
+.B SIGSEGV
+signal that was not generated by
+.BR kill (2)
+or
+.BR raise (3).
+Integer division by zero has undefined result.
+On some architectures it will generate a
+.B SIGFPE
+signal.
+(Also dividing the most negative integer by \-1 may generate
+.BR SIGFPE .)
+Ignoring this signal might lead to an endless loop.
+.PP
+See
+.BR sigaction (2)
+for details on what happens when the disposition of
+.B SIGCHLD
+is set to
+.BR SIG_IGN .
+.PP
+See
+.BR signal\-safety (7)
+for a list of the async-signal-safe functions that can be
+safely called from inside a signal handler.
+.SH SEE ALSO
+.BR kill (1),
+.BR alarm (2),
+.BR kill (2),
+.BR pause (2),
+.BR sigaction (2),
+.BR signalfd (2),
+.BR sigpending (2),
+.BR sigprocmask (2),
+.BR sigsuspend (2),
+.BR bsd_signal (3),
+.BR killpg (3),
+.BR raise (3),
+.BR siginterrupt (3),
+.BR sigqueue (3),
+.BR sigsetops (3),
+.BR sigvec (3),
+.BR sysv_signal (3),
+.BR signal (7)
diff --git a/man2/signalfd.2 b/man2/signalfd.2
new file mode 100644
index 0000000..9af22b0
--- /dev/null
+++ b/man2/signalfd.2
@@ -0,0 +1,521 @@
+.\" Copyright (C) 2008 Michael Kerrisk <mtk.manpages@gmail.com>
+.\" starting from a version by Davide Libenzi <davidel@xmailserver.org>
+.\"
+.\" SPDX-License-Identifier: GPL-2.0-or-later
+.\"
+.TH signalfd 2 2023-05-03 "Linux man-pages 6.05.01"
+.SH NAME
+signalfd \- create a file descriptor for accepting signals
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/signalfd.h>
+.PP
+.BI "int signalfd(int " fd ", const sigset_t *" mask ", int " flags );
+.fi
+.SH DESCRIPTION
+.BR signalfd ()
+creates a file descriptor that can be used to accept signals
+targeted at the caller.
+This provides an alternative to the use of a signal handler or
+.BR sigwaitinfo (2),
+and has the advantage that the file descriptor may be monitored by
+.BR select (2),
+.BR poll (2),
+and
+.BR epoll (7).
+.PP
+The
+.I mask
+argument specifies the set of signals that the caller
+wishes to accept via the file descriptor.
+This argument is a signal set whose contents can be initialized
+using the macros described in
+.BR sigsetops (3).
+Normally, the set of signals to be received via the
+file descriptor should be blocked using
+.BR sigprocmask (2),
+to prevent the signals being handled according to their default
+dispositions.
+It is not possible to receive
+.B SIGKILL
+or
+.B SIGSTOP
+signals via a signalfd file descriptor;
+these signals are silently ignored if specified in
+.IR mask .
+.PP
+If the
+.I fd
+argument is \-1,
+then the call creates a new file descriptor and associates the
+signal set specified in
+.I mask
+with that file descriptor.
+If
+.I fd
+is not \-1,
+then it must specify a valid existing signalfd file descriptor, and
+.I mask
+is used to replace the signal set associated with that file descriptor.
+.PP
+Starting with Linux 2.6.27, the following values may be bitwise ORed in
+.I flags
+to change the behavior of
+.BR signalfd ():
+.TP 14
+.B SFD_NONBLOCK
+Set the
+.B O_NONBLOCK
+file status flag on the open file description (see
+.BR open (2))
+referred to by the new file descriptor.
+Using this flag saves extra calls to
+.BR fcntl (2)
+to achieve the same result.
+.TP
+.B SFD_CLOEXEC
+Set the close-on-exec
+.RB ( FD_CLOEXEC )
+flag on the new file descriptor.
+See the description of the
+.B O_CLOEXEC
+flag in
+.BR open (2)
+for reasons why this may be useful.
+.PP
+Up to Linux 2.6.26, the
+.I flags
+argument is unused, and must be specified as zero.
+.PP
+.BR signalfd ()
+returns a file descriptor that supports the following operations:
+.TP
+.BR read (2)
+If one or more of the signals specified in
+.I mask
+is pending for the process, then the buffer supplied to
+.BR read (2)
+is used to return one or more
+.I signalfd_siginfo
+structures (see below) that describe the signals.
+The
+.BR read (2)
+returns information for as many signals as are pending and will
+fit in the supplied buffer.
+The buffer must be at least
+.I "sizeof(struct signalfd_siginfo)"
+bytes.
+The return value of the
+.BR read (2)
+is the total number of bytes read.
+.IP
+As a consequence of the
+.BR read (2),
+the signals are consumed,
+so that they are no longer pending for the process
+(i.e., will not be caught by signal handlers,
+and cannot be accepted using
+.BR sigwaitinfo (2)).
+.IP
+If none of the signals in
+.I mask
+is pending for the process, then the
+.BR read (2)
+either blocks until one of the signals in
+.I mask
+is generated for the process,
+or fails with the error
+.B EAGAIN
+if the file descriptor has been made nonblocking.
+.TP
+.BR poll "(2), " select "(2) (and similar)"
+The file descriptor is readable
+(the
+.BR select (2)
+.I readfds
+argument; the
+.BR poll (2)
+.B POLLIN
+flag)
+if one or more of the signals in
+.I mask
+is pending for the process.
+.IP
+The signalfd file descriptor also supports the other file-descriptor
+multiplexing APIs:
+.BR pselect (2),
+.BR ppoll (2),
+and
+.BR epoll (7).
+.TP
+.BR close (2)
+When the file descriptor is no longer required it should be closed.
+When all file descriptors associated with the same signalfd object
+have been closed, the resources for the object are freed by the kernel.
+.SS The signalfd_siginfo structure
+The format of the
+.I signalfd_siginfo
+structure(s) returned by
+.BR read (2)s
+from a signalfd file descriptor is as follows:
+.PP
+.in +4n
+.EX
+struct signalfd_siginfo {
+ uint32_t ssi_signo; /* Signal number */
+ int32_t ssi_errno; /* Error number (unused) */
+ int32_t ssi_code; /* Signal code */
+ uint32_t ssi_pid; /* PID of sender */
+ uint32_t ssi_uid; /* Real UID of sender */
+ int32_t ssi_fd; /* File descriptor (SIGIO) */
+ uint32_t ssi_tid; /* Kernel timer ID (POSIX timers) */
+ uint32_t ssi_band; /* Band event (SIGIO) */
+ uint32_t ssi_overrun; /* POSIX timer overrun count */
+ uint32_t ssi_trapno; /* Trap number that caused signal */
+.\" ssi_trapno is unused on most arches
+ int32_t ssi_status; /* Exit status or signal (SIGCHLD) */
+ int32_t ssi_int; /* Integer sent by sigqueue(3) */
+ uint64_t ssi_ptr; /* Pointer sent by sigqueue(3) */
+ uint64_t ssi_utime; /* User CPU time consumed (SIGCHLD) */
+ uint64_t ssi_stime; /* System CPU time consumed
+ (SIGCHLD) */
+ uint64_t ssi_addr; /* Address that generated signal
+ (for hardware\-generated signals) */
+ uint16_t ssi_addr_lsb; /* Least significant bit of address
+ (SIGBUS; since Linux 2.6.37) */
+.\" ssi_addr_lsb: commit b8aeec34175fc8fe8b0d40efea4846dfc1ba663e
+ uint8_t pad[\fIX\fP]; /* Pad size to 128 bytes (allow for
+ additional fields in the future) */
+};
+.EE
+.in
+.PP
+Each of the fields in this structure
+is analogous to the similarly named field in the
+.I siginfo_t
+structure.
+The
+.I siginfo_t
+structure is described in
+.BR sigaction (2).
+Not all fields in the returned
+.I signalfd_siginfo
+structure will be valid for a specific signal;
+the set of valid fields can be determined from the value returned in the
+.I ssi_code
+field.
+This field is the analog of the
+.I siginfo_t
+.I si_code
+field; see
+.BR sigaction (2)
+for details.
+.SS fork(2) semantics
+After a
+.BR fork (2),
+the child inherits a copy of the signalfd file descriptor.
+A
+.BR read (2)
+from the file descriptor in the child will return information
+about signals queued to the child.
+.SS Semantics of file descriptor passing
+As with other file descriptors,
+signalfd file descriptors can be passed to another process
+via a UNIX domain socket (see
+.BR unix (7)).
+In the receiving process, a
+.BR read (2)
+from the received file descriptor will return information
+about signals queued to that process.
+.SS execve(2) semantics
+Just like any other file descriptor,
+a signalfd file descriptor remains open across an
+.BR execve (2),
+unless it has been marked for close-on-exec (see
+.BR fcntl (2)).
+Any signals that were available for reading before the
+.BR execve (2)
+remain available to the newly loaded program.
+(This is analogous to traditional signal semantics,
+where a blocked signal that is pending remains pending across an
+.BR execve (2).)
+.SS Thread semantics
+The semantics of signalfd file descriptors in a multithreaded program
+mirror the standard semantics for signals.
+In other words,
+when a thread reads from a signalfd file descriptor,
+it will read the signals that are directed to the thread
+itself and the signals that are directed to the process
+(i.e., the entire thread group).
+(A thread will not be able to read signals that are directed
+to other threads in the process.)
+.\"
+.SS epoll(7) semantics
+If a process adds (via
+.BR epoll_ctl (2))
+a signalfd file descriptor to an
+.BR epoll (7)
+instance, then
+.BR epoll_wait (2)
+returns events only for signals sent to that process.
+In particular, if the process then uses
+.BR fork (2)
+to create a child process, then the child will be able to
+.BR read (2)
+signals that are sent to it using the signalfd file descriptor, but
+.BR epoll_wait (2)
+will
+.B not
+indicate that the signalfd file descriptor is ready.
+In this scenario, a possible workaround is that after the
+.BR fork (2),
+the child process can close the signalfd file descriptor that it inherited
+from the parent process and then create another signalfd file descriptor
+and add it to the epoll instance.
+Alternatively, the parent and the child could delay creating their
+(separate) signalfd file descriptors and adding them to the
+epoll instance until after the call to
+.BR fork (2).
+.SH RETURN VALUE
+On success,
+.BR signalfd ()
+returns a signalfd file descriptor;
+this is either a new file descriptor (if
+.I fd
+was \-1), or
+.I fd
+if
+.I fd
+was a valid signalfd file descriptor.
+On error, \-1 is returned and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EBADF
+The
+.I fd
+file descriptor is not a valid file descriptor.
+.TP
+.B EINVAL
+.I fd
+is not a valid signalfd file descriptor.
+.\" or, the
+.\" .I sizemask
+.\" argument is not equal to
+.\" .IR sizeof(sigset_t) ;
+.TP
+.B EINVAL
+.I flags
+is invalid;
+or, in Linux 2.6.26 or earlier,
+.I flags
+is nonzero.
+.TP
+.B EMFILE
+The per-process limit on the number of open file descriptors has been reached.
+.TP
+.B ENFILE
+The system-wide limit on the total number of open files has been
+reached.
+.TP
+.B ENODEV
+Could not mount (internal) anonymous inode device.
+.TP
+.B ENOMEM
+There was insufficient memory to create a new signalfd file descriptor.
+.SH VERSIONS
+.SS C library/kernel differences
+The underlying Linux system call requires an additional argument,
+.IR "size_t sizemask" ,
+which specifies the size of the
+.I mask
+argument.
+The glibc
+.BR signalfd ()
+wrapper function does not include this argument,
+since it provides the required value for the underlying system call.
+.PP
+There are two underlying Linux system calls:
+.BR signalfd ()
+and the more recent
+.BR signalfd4 ().
+The former system call does not implement a
+.I flags
+argument.
+The latter system call implements the
+.I flags
+values described above.
+Starting with glibc 2.9, the
+.BR signalfd ()
+wrapper function will use
+.BR signalfd4 ()
+where it is available.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+.TP
+.BR signalfd ()
+Linux 2.6.22,
+glibc 2.8.
+.\" signalfd() is in glibc 2.7, but reportedly does not build
+.TP
+.BR signalfd4 ()
+Linux 2.6.27.
+.SH NOTES
+A process can create multiple signalfd file descriptors.
+This makes it possible to accept different signals
+on different file descriptors.
+(This may be useful if monitoring the file descriptors using
+.BR select (2),
+.BR poll (2),
+or
+.BR epoll (7):
+the arrival of different signals will make different file descriptors ready.)
+If a signal appears in the
+.I mask
+of more than one of the file descriptors, then occurrences
+of that signal can be read (once) from any one of the file descriptors.
+.PP
+Attempts to include
+.B SIGKILL
+and
+.B SIGSTOP
+in
+.I mask
+are silently ignored.
+.PP
+The signal mask employed by a signalfd file descriptor can be viewed
+via the entry for the corresponding file descriptor in the process's
+.IR /proc/ pid /fdinfo
+directory.
+See
+.BR proc (5)
+for further details.
+.\"
+.SS Limitations
+The signalfd mechanism can't be used to receive signals that
+are synchronously generated, such as the
+.B SIGSEGV
+signal that results from accessing an invalid memory address
+or the
+.B SIGFPE
+signal that results from an arithmetic error.
+Such signals can be caught only via a signal handler.
+.PP
+As described above,
+in normal usage one blocks the signals that will be accepted via
+.BR signalfd ().
+If spawning a child process to execute a helper program
+(that does not need the signalfd file descriptor),
+then, after the call to
+.BR fork (2),
+you will normally want to unblock those signals before calling
+.BR execve (2),
+so that the helper program can see any signals that it expects to see.
+Be aware, however,
+that this won't be possible in the case of a helper program spawned
+behind the scenes by any library function that the program may call.
+In such cases, one must fall back to using a traditional signal
+handler that writes to a file descriptor monitored by
+.BR select (2),
+.BR poll (2),
+or
+.BR epoll (7).
+.SH BUGS
+Before Linux 2.6.25, the
+.I ssi_ptr
+and
+.I ssi_int
+fields are not filled in with the data accompanying a signal sent by
+.BR sigqueue (3).
+.\" The fix also was put into Linux 2.6.24.5
+.SH EXAMPLES
+The program below accepts the signals
+.B SIGINT
+and
+.B SIGQUIT
+via a signalfd file descriptor.
+The program terminates after accepting a
+.B SIGQUIT
+signal.
+The following shell session demonstrates the use of the program:
+.PP
+.in +4n
+.EX
+.RB "$" " ./signalfd_demo"
+.BR "\[ha]C" " # Control\-C generates SIGINT"
+Got SIGINT
+.B \[ha]C
+Got SIGINT
+\fB\[ha]\e\fP # Control\-\e generates SIGQUIT
+Got SIGQUIT
+$
+.EE
+.in
+.SS Program source
+\&
+.\" SRC BEGIN (signalfd.c)
+.EX
+#include <err.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/signalfd.h>
+#include <unistd.h>
+\&
+int
+main(void)
+{
+ int sfd;
+ ssize_t s;
+ sigset_t mask;
+ struct signalfd_siginfo fdsi;
+\&
+ sigemptyset(&mask);
+ sigaddset(&mask, SIGINT);
+ sigaddset(&mask, SIGQUIT);
+\&
+ /* Block signals so that they aren\[aq]t handled
+ according to their default dispositions. */
+\&
+ if (sigprocmask(SIG_BLOCK, &mask, NULL) == \-1)
+ err(EXIT_FAILURE, "sigprocmask");
+\&
+ sfd = signalfd(\-1, &mask, 0);
+ if (sfd == \-1)
+ err(EXIT_FAILURE, "signalfd");
+\&
+ for (;;) {
+ s = read(sfd, &fdsi, sizeof(fdsi));
+ if (s != sizeof(fdsi))
+ err(EXIT_FAILURE, "read");
+\&
+ if (fdsi.ssi_signo == SIGINT) {
+ printf("Got SIGINT\en");
+ } else if (fdsi.ssi_signo == SIGQUIT) {
+ printf("Got SIGQUIT\en");
+ exit(EXIT_SUCCESS);
+ } else {
+ printf("Read unexpected signal\en");
+ }
+ }
+}
+.EE
+.\" SRC END
+.SH SEE ALSO
+.BR eventfd (2),
+.BR poll (2),
+.BR read (2),
+.BR select (2),
+.BR sigaction (2),
+.BR sigprocmask (2),
+.BR sigwaitinfo (2),
+.BR timerfd_create (2),
+.BR sigsetops (3),
+.BR sigwait (3),
+.BR epoll (7),
+.BR signal (7)
diff --git a/man2/signalfd4.2 b/man2/signalfd4.2
new file mode 100644
index 0000000..8dbea5c
--- /dev/null
+++ b/man2/signalfd4.2
@@ -0,0 +1 @@
+.so man2/signalfd.2
diff --git a/man2/sigpending.2 b/man2/sigpending.2
new file mode 100644
index 0000000..e1b3158
--- /dev/null
+++ b/man2/sigpending.2
@@ -0,0 +1,110 @@
+.\" Copyright (c) 2005 Michael Kerrisk
+.\" based on earlier work by faith@cs.unc.edu and
+.\" Mike Battersby <mib@deakin.edu.au>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" 2005-09-15, mtk, Created new page by splitting off from sigaction.2
+.\"
+.TH sigpending 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+sigpending, rt_sigpending \- examine pending signals
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <signal.h>
+.PP
+.BI "int sigpending(sigset_t *" set );
+.fi
+.PP
+.RS -4
+Feature Test Macro Requirements for glibc (see
+.BR feature_test_macros (7)):
+.RE
+.PP
+.BR sigpending ():
+.nf
+ _POSIX_C_SOURCE
+.fi
+.SH DESCRIPTION
+.BR sigpending ()
+returns the set of signals that are pending for delivery to the calling
+thread (i.e., the signals which have been raised while blocked).
+The mask of pending signals is returned in
+.IR set .
+.SH RETURN VALUE
+.BR sigpending ()
+returns 0 on success.
+On failure, \-1 is returned and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EFAULT
+.I set
+points to memory which is not a valid part of the process address space.
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+POSIX.1-2001.
+.SS C library/kernel differences
+The original Linux system call was named
+.BR sigpending ().
+However, with the addition of real-time signals in Linux 2.2,
+the fixed-size, 32-bit
+.I sigset_t
+argument supported by that system call was no longer fit for purpose.
+Consequently, a new system call,
+.BR rt_sigpending (),
+was added to support an enlarged
+.I sigset_t
+type.
+The new system call takes a second argument,
+.IR "size_t sigsetsize" ,
+which specifies the size in bytes of the signal set in
+.IR set .
+.\" This argument is currently required to be less than or equal to
+.\" .IR sizeof(sigset_t)
+.\" (or the error
+.\" .B EINVAL
+.\" results).
+The glibc
+.BR sigpending ()
+wrapper function hides these details from us, transparently calling
+.BR rt_sigpending ()
+when the kernel provides it.
+.SH NOTES
+See
+.BR sigsetops (3)
+for details on manipulating signal sets.
+.PP
+If a signal is both blocked and has a disposition of "ignored", it is
+.I not
+added to the mask of pending signals when generated.
+.PP
+The set of signals that is pending for a thread
+is the union of the set of signals that is pending specifically for that thread
+and the set of signals that is pending for the process as a whole; see
+.BR signal (7).
+.PP
+A child created via
+.BR fork (2)
+initially has an empty pending signal set;
+the pending signal set is preserved across an
+.BR execve (2).
+.SH BUGS
+Up to and including glibc 2.2.1,
+there is a bug in the wrapper function for
+.BR sigpending ()
+which means that information about pending real-time signals
+is not correctly returned.
+.SH SEE ALSO
+.BR kill (2),
+.BR sigaction (2),
+.BR signal (2),
+.BR sigprocmask (2),
+.BR sigsuspend (2),
+.BR sigsetops (3),
+.BR signal (7)
diff --git a/man2/sigprocmask.2 b/man2/sigprocmask.2
new file mode 100644
index 0000000..a89c1ed
--- /dev/null
+++ b/man2/sigprocmask.2
@@ -0,0 +1,224 @@
+.\" Copyright (c) 2005 Michael Kerrisk
+.\" based on earlier work by faith@cs.unc.edu and
+.\" Mike Battersby <mib@deakin.edu.au>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" 2005-09-15, mtk, Created new page by splitting off from sigaction.2
+.\"
+.TH sigprocmask 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+sigprocmask, rt_sigprocmask \- examine and change blocked signals
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.B #include <signal.h>
+.PP
+.nf
+/* Prototype for the glibc wrapper function */
+.BI "int sigprocmask(int " how ", const sigset_t *_Nullable restrict " set ,
+.BI " sigset_t *_Nullable restrict " oldset );
+.PP
+.BR "#include <signal.h>" " /* Definition of " SIG_* " constants */"
+.BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */"
+.B #include <unistd.h>
+.PP
+/* Prototype for the underlying system call */
+.BI "int syscall(SYS_rt_sigprocmask, int " how ,
+.BI " const kernel_sigset_t *_Nullable " set ,
+.BI " kernel_sigset_t *_Nullable " oldset ,
+.BI " size_t " sigsetsize );
+.PP
+/* Prototype for the legacy system call */
+.BI "[[deprecated]] int syscall(SYS_sigprocmask, int " how ,
+.BI " const old_kernel_sigset_t *_Nullable " set ,
+.BI " old_kernel_sigset_t *_Nullable " oldset );
+.fi
+.PP
+.RS -4
+Feature Test Macro Requirements for glibc (see
+.BR feature_test_macros (7)):
+.RE
+.PP
+.BR sigprocmask ():
+.nf
+ _POSIX_C_SOURCE
+.fi
+.SH DESCRIPTION
+.BR sigprocmask ()
+is used to fetch and/or change the signal mask of the calling thread.
+The signal mask is the set of signals whose delivery is currently
+blocked for the caller
+(see also
+.BR signal (7)
+for more details).
+.PP
+The behavior of the call is dependent on the value of
+.IR how ,
+as follows.
+.TP
+.B SIG_BLOCK
+The set of blocked signals is the union of the current set and the
+.I set
+argument.
+.TP
+.B SIG_UNBLOCK
+The signals in
+.I set
+are removed from the current set of blocked signals.
+It is permissible to attempt to unblock a signal which is not blocked.
+.TP
+.B SIG_SETMASK
+The set of blocked signals is set to the argument
+.IR set .
+.PP
+If
+.I oldset
+is non-NULL, the previous value of the signal mask is stored in
+.IR oldset .
+.PP
+If
+.I set
+is NULL, then the signal mask is unchanged (i.e.,
+.I how
+is ignored),
+but the current value of the signal mask is nevertheless returned in
+.I oldset
+(if it is not NULL).
+.PP
+A set of functions for modifying and inspecting variables of type
+.I sigset_t
+("signal sets") is described in
+.BR sigsetops (3).
+.PP
+The use of
+.BR sigprocmask ()
+is unspecified in a multithreaded process; see
+.BR pthread_sigmask (3).
+.SH RETURN VALUE
+.BR sigprocmask ()
+returns 0 on success.
+On failure, \-1 is returned and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EFAULT
+The
+.I set
+or
+.I oldset
+argument points outside the process's allocated address space.
+.TP
+.B EINVAL
+Either the value specified in
+.I how
+was invalid or the kernel does not support the size passed in
+.IR sigsetsize .
+.SH VERSIONS
+.SS C library/kernel differences
+The kernel's definition of
+.I sigset_t
+differs in size from that used
+by the C library.
+In this manual page, the former is referred to as
+.I kernel_sigset_t
+(it is nevertheless named
+.I sigset_t
+in the kernel sources).
+.PP
+The glibc wrapper function for
+.BR sigprocmask ()
+silently ignores attempts to block the two real-time signals that
+are used internally by the NPTL threading implementation.
+See
+.BR nptl (7)
+for details.
+.PP
+The original Linux system call was named
+.BR sigprocmask ().
+However, with the addition of real-time signals in Linux 2.2,
+the fixed-size, 32-bit
+.I sigset_t
+(referred to as
+.I old_kernel_sigset_t
+in this manual page)
+type supported by that system call was no longer fit for purpose.
+Consequently, a new system call,
+.BR rt_sigprocmask (),
+was added to support an enlarged
+.I sigset_t
+type
+(referred to as
+.I kernel_sigset_t
+in this manual page).
+The new system call takes a fourth argument,
+.IR "size_t sigsetsize" ,
+which specifies the size in bytes of the signal sets in
+.I set
+and
+.IR oldset .
+This argument is currently required to have a fixed architecture-specific value
+(equal to
+.IR sizeof(kernel_sigset_t) ).
+.\" sizeof(kernel_sigset_t) == _NSIG / 8,
+.\" which equals to 8 on most architectures, but e.g. on MIPS it's 16.
+.PP
+The glibc
+.BR sigprocmask ()
+wrapper function hides these details from us, transparently calling
+.BR rt_sigprocmask ()
+when the kernel provides it.
+.\"
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+POSIX.1-2001.
+.SH NOTES
+It is not possible to block
+.BR SIGKILL " or " SIGSTOP .
+Attempts to do so are silently ignored.
+.PP
+Each of the threads in a process has its own signal mask.
+.PP
+A child created via
+.BR fork (2)
+inherits a copy of its parent's signal mask;
+the signal mask is preserved across
+.BR execve (2).
+.PP
+If
+.BR SIGBUS ,
+.BR SIGFPE ,
+.BR SIGILL ,
+or
+.B SIGSEGV
+are generated
+while they are blocked, the result is undefined,
+unless the signal was generated by
+.BR kill (2),
+.BR sigqueue (3),
+or
+.BR raise (3).
+.PP
+See
+.BR sigsetops (3)
+for details on manipulating signal sets.
+.PP
+Note that it is permissible (although not very useful) to specify both
+.I set
+and
+.I oldset
+as NULL.
+.SH SEE ALSO
+.BR kill (2),
+.BR pause (2),
+.BR sigaction (2),
+.BR signal (2),
+.BR sigpending (2),
+.BR sigsuspend (2),
+.BR pthread_sigmask (3),
+.BR sigqueue (3),
+.BR sigsetops (3),
+.BR signal (7)
diff --git a/man2/sigreturn.2 b/man2/sigreturn.2
new file mode 100644
index 0000000..03ce952
--- /dev/null
+++ b/man2/sigreturn.2
@@ -0,0 +1,151 @@
+.\" Copyright (C) 2008, 2014, Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Created Sat Aug 21 1995 Thomas K. Dyas <tdyas@eden.rutgers.edu>
+.\" Modified Tue Oct 22 22:09:03 1996 by Eric S. Raymond <esr@thyrsus.com>
+.\" 2008-06-26, mtk, added some more detail on the work done by sigreturn()
+.\" 2014-12-05, mtk, rewrote all of the rest of the original page
+.\"
+.TH sigreturn 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+sigreturn, rt_sigreturn \- return from signal handler and cleanup stack frame
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B int sigreturn(...);
+.fi
+.SH DESCRIPTION
+If the Linux kernel determines that an unblocked
+signal is pending for a process, then,
+at the next transition back to user mode in that process
+(e.g., upon return from a system call or
+when the process is rescheduled onto the CPU),
+it creates a new frame on the user-space stack where it
+saves various pieces of process context
+(processor status word, registers, signal mask, and signal stack settings).
+.\" See arch/x86/kernel/signal.c::__setup_frame() [in Linux 3.17 source code]
+.PP
+The kernel also arranges that, during the transition back to user mode,
+the signal handler is called, and that, upon return from the handler,
+control passes to a piece of user-space code commonly called
+the "signal trampoline".
+The signal trampoline code in turn calls
+.BR sigreturn ().
+.PP
+This
+.BR sigreturn ()
+call undoes everything that was
+done\[em]changing the process's signal mask, switching signal stacks (see
+.BR sigaltstack "(2))\[em]in"
+order to invoke the signal handler.
+Using the information that was earlier saved on the user-space stack,
+.BR sigreturn ()
+restores the process's signal mask, switches stacks,
+and restores the process's context
+(processor flags and registers,
+including the stack pointer and instruction pointer),
+so that the process resumes execution
+at the point where it was interrupted by the signal.
+.SH RETURN VALUE
+.BR sigreturn ()
+never returns.
+.SH VERSIONS
+Many UNIX-type systems have a
+.BR sigreturn ()
+system call or near equivalent.
+However, this call is not specified in POSIX,
+and details of its behavior vary across systems.
+.SH STANDARDS
+None.
+.SH NOTES
+.BR sigreturn ()
+exists only to allow the implementation of signal handlers.
+It should
+.B never
+be called directly.
+(Indeed, a simple
+.BR sigreturn ()
+.\" See sysdeps/unix/sysv/linux/sigreturn.c and
+.\" signal/sigreturn.c in the glibc source
+wrapper in the GNU C library simply returns \-1, with
+.I errno
+set to
+.BR ENOSYS .)
+Details of the arguments (if any) passed to
+.BR sigreturn ()
+vary depending on the architecture.
+(On some architectures, such as x86-64,
+.BR sigreturn ()
+takes no arguments, since all of the information that it requires
+is available in the stack frame that was previously created by the
+kernel on the user-space stack.)
+.PP
+Once upon a time, UNIX systems placed the signal trampoline code
+onto the user stack.
+Nowadays, pages of the user stack are protected so as to
+disallow code execution.
+Thus, on contemporary Linux systems, depending on the architecture,
+the signal trampoline code lives either in the
+.BR vdso (7)
+or in the C library.
+In the latter case,
+.\" See, for example, sysdeps/unix/sysv/linux/i386/sigaction.c and
+.\" sysdeps/unix/sysv/linux/x86_64/sigaction.c in the glibc (2.20) source.
+the C library's
+.BR sigaction (2)
+wrapper function informs the kernel of the location of the trampoline code
+by placing its address in the
+.I sa_restorer
+field of the
+.I sigaction
+structure,
+and sets the
+.B SA_RESTORER
+flag in the
+.I sa_flags
+field.
+.PP
+The saved process context information is placed in a
+.I ucontext_t
+structure (see
+.IR <sys/ucontext.h> ).
+That structure is visible within the signal handler
+as the third argument of a handler established via
+.BR sigaction (2)
+with the
+.B SA_SIGINFO
+flag.
+.PP
+On some other UNIX systems,
+the operation of the signal trampoline differs a little.
+In particular, on some systems, upon transitioning back to user mode,
+the kernel passes control to the trampoline (rather than the signal handler),
+and the trampoline code calls the signal handler (and then calls
+.BR sigreturn ()
+once the handler returns).
+.\"
+.SS C library/kernel differences
+The original Linux system call was named
+.BR sigreturn ().
+However, with the addition of real-time signals in Linux 2.2,
+a new system call,
+.BR rt_sigreturn (),
+was added to support an enlarged
+.I sigset_t
+type.
+The GNU C library
+hides these details from us, transparently employing
+.BR rt_sigreturn ()
+when the kernel provides it.
+.\"
+.SH SEE ALSO
+.BR kill (2),
+.BR restart_syscall (2),
+.BR sigaltstack (2),
+.BR signal (2),
+.BR getcontext (3),
+.BR signal (7),
+.BR vdso (7)
diff --git a/man2/sigsuspend.2 b/man2/sigsuspend.2
new file mode 100644
index 0000000..f89a6ca
--- /dev/null
+++ b/man2/sigsuspend.2
@@ -0,0 +1,131 @@
+.\" Copyright (c) 2005 Michael Kerrisk
+.\" based on earlier work by faith@cs.unc.edu and
+.\" Mike Battersby <mib@deakin.edu.au>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" 2005-09-15, mtk, Created new page by splitting off from sigaction.2
+.\"
+.TH sigsuspend 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+sigsuspend, rt_sigsuspend \- wait for a signal
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <signal.h>
+.PP
+.BI "int sigsuspend(const sigset_t *" mask );
+.fi
+.PP
+.RS -4
+Feature Test Macro Requirements for glibc (see
+.BR feature_test_macros (7)):
+.RE
+.PP
+.BR sigsuspend ():
+.nf
+ _POSIX_C_SOURCE
+.fi
+.SH DESCRIPTION
+.BR sigsuspend ()
+temporarily replaces the signal mask of the calling thread with the
+mask given by
+.I mask
+and then suspends the thread until delivery of a signal whose
+action is to invoke a signal handler or to terminate a process.
+.PP
+If the signal terminates the process, then
+.BR sigsuspend ()
+does not return.
+If the signal is caught, then
+.BR sigsuspend ()
+returns after the signal handler returns,
+and the signal mask is restored to the state before the call to
+.BR sigsuspend ().
+.PP
+It is not possible to block
+.B SIGKILL
+or
+.BR SIGSTOP ;
+specifying these signals in
+.I mask
+has no effect on the thread's signal mask.
+.SH RETURN VALUE
+.BR sigsuspend ()
+always returns \-1, with
+.I errno
+set to indicate the error (normally,
+.BR EINTR ).
+.SH ERRORS
+.TP
+.B EFAULT
+.I mask
+points to memory which is not a valid part of the process address space.
+.TP
+.B EINTR
+The call was interrupted by a signal; see
+.BR signal (7).
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+POSIX.1-2001.
+.SS C library/kernel differences
+The original Linux system call was named
+.BR sigsuspend ().
+However, with the addition of real-time signals in Linux 2.2,
+the fixed-size, 32-bit
+.I sigset_t
+type supported by that system call was no longer fit for purpose.
+Consequently, a new system call,
+.BR rt_sigsuspend (),
+was added to support an enlarged
+.I sigset_t
+type.
+The new system call takes a second argument,
+.IR "size_t sigsetsize" ,
+which specifies the size in bytes of the signal set in
+.IR mask .
+This argument is currently required to have the value
+.I sizeof(sigset_t)
+(or the error
+.B EINVAL
+results).
+The glibc
+.BR sigsuspend ()
+wrapper function hides these details from us, transparently calling
+.BR rt_sigsuspend ()
+when the kernel provides it.
+.\"
+.SH NOTES
+Normally,
+.BR sigsuspend ()
+is used in conjunction with
+.BR sigprocmask (2)
+in order to prevent delivery of a signal during the execution of a
+critical code section.
+The caller first blocks the signals with
+.BR sigprocmask (2).
+When the critical code has completed, the caller then waits for the
+signals by calling
+.BR sigsuspend ()
+with the signal mask that was returned by
+.BR sigprocmask (2)
+(in the
+.I oldset
+argument).
+.PP
+See
+.BR sigsetops (3)
+for details on manipulating signal sets.
+.SH SEE ALSO
+.BR kill (2),
+.BR pause (2),
+.BR sigaction (2),
+.BR signal (2),
+.BR sigprocmask (2),
+.BR sigwaitinfo (2),
+.BR sigsetops (3),
+.BR sigwait (3),
+.BR signal (7)
diff --git a/man2/sigtimedwait.2 b/man2/sigtimedwait.2
new file mode 100644
index 0000000..1b13df1
--- /dev/null
+++ b/man2/sigtimedwait.2
@@ -0,0 +1 @@
+.so man2/sigwaitinfo.2
diff --git a/man2/sigwaitinfo.2 b/man2/sigwaitinfo.2
new file mode 100644
index 0000000..a5703fc
--- /dev/null
+++ b/man2/sigwaitinfo.2
@@ -0,0 +1,231 @@
+.\" Copyright (c) 2002 Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.TH sigwaitinfo 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+sigwaitinfo, sigtimedwait, rt_sigtimedwait \- synchronously wait
+for queued signals
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <signal.h>
+.PP
+.BI "int sigwaitinfo(const sigset_t *restrict " set ,
+.BI " siginfo_t *_Nullable restrict " info );
+.BI "int sigtimedwait(const sigset_t *restrict " set ,
+.BI " siginfo_t *_Nullable restrict " info ,
+.BI " const struct timespec *restrict " timeout );
+.fi
+.PP
+.RS -4
+Feature Test Macro Requirements for glibc (see
+.BR feature_test_macros (7)):
+.RE
+.PP
+.BR sigwaitinfo (),
+.BR sigtimedwait ():
+.nf
+ _POSIX_C_SOURCE >= 199309L
+.fi
+.SH DESCRIPTION
+.BR sigwaitinfo ()
+suspends execution of the calling thread until one of the signals in
+.I set
+is pending.
+(If one of the signals in
+.I set
+is already pending for the calling thread,
+.BR sigwaitinfo ()
+will return immediately.)
+.PP
+.BR sigwaitinfo ()
+removes the signal from the set of pending
+signals and returns the signal number as its function result.
+If the
+.I info
+argument is not NULL,
+then the buffer that it points to is used to return a structure of type
+.I siginfo_t
+(see
+.BR sigaction (2))
+containing information about the signal.
+.PP
+If multiple signals in
+.I set
+are pending for the caller, the signal that is retrieved by
+.BR sigwaitinfo ()
+is determined according to the usual ordering rules; see
+.BR signal (7)
+for further details.
+.PP
+.BR sigtimedwait ()
+operates in exactly the same way as
+.BR sigwaitinfo ()
+except that it has an additional argument,
+.IR timeout ,
+which specifies the interval for which
+the thread is suspended waiting for a signal.
+(This interval will be rounded up to the system clock granularity,
+and kernel scheduling delays mean that the interval
+may overrun by a small amount.)
+This argument is a
+.BR timespec (3)
+structure.
+.PP
+If both fields of this structure are specified as 0, a poll is performed:
+.BR sigtimedwait ()
+returns immediately, either with information about a signal that
+was pending for the caller, or with an error
+if none of the signals in
+.I set
+was pending.
+.SH RETURN VALUE
+On success, both
+.BR sigwaitinfo ()
+and
+.BR sigtimedwait ()
+return a signal number (i.e., a value greater than zero).
+On failure both calls return \-1, with
+.I errno
+set to indicate the error.
+.SH ERRORS
+.TP
+.B EAGAIN
+No signal in
+.I set
+became pending within the
+.I timeout
+period specified to
+.BR sigtimedwait ().
+.TP
+.B EINTR
+The wait was interrupted by a signal handler; see
+.BR signal (7).
+(This handler was for a signal other than one of those in
+.IR set .)
+.TP
+.B EINVAL
+.I timeout
+was invalid.
+.SH VERSIONS
+.SS C library/kernel differences
+On Linux,
+.BR sigwaitinfo ()
+is a library function implemented on top of
+.BR sigtimedwait ().
+.PP
+The glibc wrapper functions for
+.BR sigwaitinfo ()
+and
+.BR sigtimedwait ()
+silently ignore attempts to wait for the two real-time signals that
+are used internally by the NPTL threading implementation.
+See
+.BR nptl (7)
+for details.
+.PP
+The original Linux system call was named
+.BR sigtimedwait ().
+However, with the addition of real-time signals in Linux 2.2,
+the fixed-size, 32-bit
+.I sigset_t
+type supported by that system call was no longer fit for purpose.
+Consequently, a new system call,
+.BR rt_sigtimedwait (),
+was added to support an enlarged
+.I sigset_t
+type.
+The new system call takes a fourth argument,
+.IR "size_t sigsetsize" ,
+which specifies the size in bytes of the signal set in
+.IR set .
+This argument is currently required to have the value
+.I sizeof(sigset_t)
+(or the error
+.B EINVAL
+results).
+The glibc
+.BR sigtimedwait ()
+wrapper function hides these details from us, transparently calling
+.BR rt_sigtimedwait ()
+when the kernel provides it.
+.\"
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+POSIX.1-2001.
+.SH NOTES
+In normal usage, the calling program blocks the signals in
+.I set
+via a prior call to
+.BR sigprocmask (2)
+(so that the default disposition for these signals does not occur if they
+become pending between successive calls to
+.BR sigwaitinfo ()
+or
+.BR sigtimedwait ())
+and does not establish handlers for these signals.
+In a multithreaded program,
+the signal should be blocked in all threads, in order to prevent
+the signal being treated according to its default disposition in
+a thread other than the one calling
+.BR sigwaitinfo ()
+or
+.BR sigtimedwait ().
+.PP
+The set of signals that is pending for a given thread is the
+union of the set of signals that is pending specifically for that thread
+and the set of signals that is pending for the process as a whole (see
+.BR signal (7)).
+.PP
+Attempts to wait for
+.B SIGKILL
+and
+.B SIGSTOP
+are silently ignored.
+.PP
+If multiple threads of a process are blocked
+waiting for the same signal(s) in
+.BR sigwaitinfo ()
+or
+.BR sigtimedwait (),
+then exactly one of the threads will actually receive the
+signal if it becomes pending for the process as a whole;
+which of the threads receives the signal is indeterminate.
+.PP
+.BR sigwaitinfo ()
+or
+.BR sigtimedwait ()
+can't be used to receive signals that
+are synchronously generated, such as the
+.B SIGSEGV
+signal that results from accessing an invalid memory address
+or the
+.B SIGFPE
+signal that results from an arithmetic error.
+Such signals can be caught only via a signal handler.
+.PP
+POSIX leaves the meaning of a NULL value for the
+.I timeout
+argument of
+.BR sigtimedwait ()
+unspecified, permitting the possibility that this has the same meaning
+as a call to
+.BR sigwaitinfo (),
+and indeed this is what is done on Linux.
+.SH SEE ALSO
+.BR kill (2),
+.BR sigaction (2),
+.BR signal (2),
+.BR signalfd (2),
+.BR sigpending (2),
+.BR sigprocmask (2),
+.BR sigqueue (3),
+.BR sigsetops (3),
+.BR sigwait (3),
+.BR timespec (3),
+.BR signal (7),
+.BR time (7)
diff --git a/man2/socket.2 b/man2/socket.2
new file mode 100644
index 0000000..2a35f2b
--- /dev/null
+++ b/man2/socket.2
@@ -0,0 +1,493 @@
+'\" t
+.\" Copyright (c) 1983, 1991 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" SPDX-License-Identifier: BSD-4-Clause-UC
+.\"
+.\" $Id: socket.2,v 1.4 1999/05/13 11:33:42 freitag Exp $
+.\"
+.\" Modified 1993-07-24 by Rik Faith <faith@cs.unc.edu>
+.\" Modified 1996-10-22 by Eric S. Raymond <esr@thyrsus.com>
+.\" Modified 1998, 1999 by Andi Kleen <ak@muc.de>
+.\" Modified 2002-07-17 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Modified 2004-06-17 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.TH socket 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+socket \- create an endpoint for communication
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/socket.h>
+.PP
+.BI "int socket(int " domain ", int " type ", int " protocol );
+.fi
+.SH DESCRIPTION
+.BR socket ()
+creates an endpoint for communication and returns a file descriptor
+that refers to that endpoint.
+The file descriptor returned by a successful call will be
+the lowest-numbered file descriptor not currently open for the process.
+.PP
+The
+.I domain
+argument specifies a communication domain; this selects the protocol
+family which will be used for communication.
+These families are defined in
+.IR <sys/socket.h> .
+The formats currently understood by the Linux kernel include:
+.TS
+tab(:);
+l1 lw40 l.
+Name:Purpose:Man page
+T{
+.B AF_UNIX
+T}:T{
+Local communication
+T}:T{
+.BR unix (7)
+T}
+T{
+.B AF_LOCAL
+T}:T{
+Synonym for
+.B AF_UNIX
+T}:T{
+T}
+T{
+.B AF_INET
+T}:IPv4 Internet protocols:T{
+.BR ip (7)
+T}
+T{
+.B AF_AX25
+T}:T{
+Amateur radio AX.25 protocol
+T}:T{
+.\" Part of ax25-tools
+.BR ax25 (4)
+T}
+T{
+.B AF_IPX
+T}:IPX \- Novell protocols:
+T{
+.B AF_APPLETALK
+T}:AppleTalk:T{
+.BR ddp (7)
+T}
+T{
+.B AF_X25
+T}:ITU-T X.25 / ISO-8208 protocol:T{
+.BR x25 (7)
+T}
+T{
+.B AF_INET6
+T}:IPv6 Internet protocols:T{
+.BR ipv6 (7)
+T}
+T{
+.B AF_DECnet
+T}:T{
+DECnet protocol sockets
+T}
+T{
+.B AF_KEY
+T}:T{
+Key management protocol, originally developed for usage with IPsec
+T}
+T{
+.B AF_NETLINK
+T}:T{
+Kernel user interface device
+T}:T{
+.BR netlink (7)
+T}
+T{
+.B AF_PACKET
+T}:T{
+Low-level packet interface
+T}:T{
+.BR packet (7)
+T}
+T{
+.B AF_RDS
+T}:T{
+.\" commit: 639b321b4d8f4e412bfbb2a4a19bfebc1e68ace4
+Reliable Datagram Sockets (RDS) protocol
+T}:T{
+.\" rds-tools: https://github.com/oracle/rds-tools/blob/master/rds.7
+.\" rds-tools: https://github.com/oracle/rds-tools/blob/master/rds-rdma.7
+.BR rds (7)
+.br
+.BR rds\-rdma (7)
+T}
+T{
+.B AF_PPPOX
+T}:T{
+Generic PPP transport layer, for setting up L2 tunnels
+(L2TP and PPPoE)
+T}
+T{
+.B AF_LLC
+T}:T{
+.\" linux-history commit: 34beb106cde7da233d4df35dd3d6cf4fee937caa
+Logical link control (IEEE 802.2 LLC) protocol
+T}
+T{
+.B AF_IB
+T}:T{
+.\" commits: 8d36eb01da5d371f..ce117ffac2e93334
+InfiniBand native addressing
+T}
+T{
+.B AF_MPLS
+T}:T{
+.\" commits: 0189197f441602acdca3f97750d392a895b778fd
+Multiprotocol Label Switching
+T}
+T{
+.B AF_CAN
+T}:T{
+.\" commits: 8dbde28d9711475a..5423dd67bd0108a1
+Controller Area Network automotive bus protocol
+T}
+T{
+.B AF_TIPC
+T}:T{
+.\" commits: b97bf3fd8f6a16966d4f18983b2c40993ff937d4
+TIPC, "cluster domain sockets" protocol
+T}
+T{
+.B AF_BLUETOOTH
+T}:T{
+.\" commits: 8d36eb01da5d371f..ce117ffac2e93334
+Bluetooth low-level socket protocol
+T}
+T{
+.B AF_ALG
+T}:T{
+.\" commit: 03c8efc1ffeb6b82a22c1af8dd908af349563314
+Interface to kernel crypto API
+T}
+T{
+.B AF_VSOCK
+T}:T{
+.\" commit: d021c344051af91f42c5ba9fdedc176740cbd238
+VSOCK (originally "VMWare VSockets") protocol
+for hypervisor-guest communication
+T}:T{
+.BR vsock (7)
+T}
+T{
+.B AF_KCM
+T}:T{
+.\" commit: 03c8efc1ffeb6b82a22c1af8dd908af349563314
+KCM (kernel connection multiplexer) interface
+T}
+T{
+.B AF_XDP
+T}:T{
+.\" commit: c0c77d8fb787cfe0c3fca689c2a30d1dad4eaba7
+XDP (express data path) interface
+T}
+.TE
+.PP
+Further details of the above address families,
+as well as information on several other address families, can be found in
+.BR address_families (7).
+.PP
+The socket has the indicated
+.IR type ,
+which specifies the communication semantics.
+Currently defined types
+are:
+.TP 16
+.B SOCK_STREAM
+Provides sequenced, reliable, two-way, connection-based byte streams.
+An out-of-band data transmission mechanism may be supported.
+.TP
+.B SOCK_DGRAM
+Supports datagrams (connectionless, unreliable messages of a fixed
+maximum length).
+.TP
+.B SOCK_SEQPACKET
+Provides a sequenced, reliable, two-way connection-based data
+transmission path for datagrams of fixed maximum length; a consumer is
+required to read an entire packet with each input system call.
+.TP
+.B SOCK_RAW
+Provides raw network protocol access.
+.TP
+.B SOCK_RDM
+Provides a reliable datagram layer that does not guarantee ordering.
+.TP
+.B SOCK_PACKET
+Obsolete and should not be used in new programs;
+see
+.BR packet (7).
+.PP
+Some socket types may not be implemented by all protocol families.
+.PP
+Since Linux 2.6.27, the
+.I type
+argument serves a second purpose:
+in addition to specifying a socket type,
+it may include the bitwise OR of any of the following values,
+to modify the behavior of
+.BR socket ():
+.TP 16
+.B SOCK_NONBLOCK
+Set the
+.B O_NONBLOCK
+file status flag on the open file description (see
+.BR open (2))
+referred to by the new file descriptor.
+Using this flag saves extra calls to
+.BR fcntl (2)
+to achieve the same result.
+.TP
+.B SOCK_CLOEXEC
+Set the close-on-exec
+.RB ( FD_CLOEXEC )
+flag on the new file descriptor.
+See the description of the
+.B O_CLOEXEC
+flag in
+.BR open (2)
+for reasons why this may be useful.
+.PP
+The
+.I protocol
+specifies a particular protocol to be used with the socket.
+Normally only a single protocol exists to support a particular
+socket type within a given protocol family, in which case
+.I protocol
+can be specified as 0.
+However, it is possible that many protocols may exist, in
+which case a particular protocol must be specified in this manner.
+The protocol number to use is specific to the \*(lqcommunication domain\*(rq
+in which communication is to take place; see
+.BR protocols (5).
+See
+.BR getprotoent (3)
+on how to map protocol name strings to protocol numbers.
+.PP
+Sockets of type
+.B SOCK_STREAM
+are full-duplex byte streams.
+They do not preserve
+record boundaries.
+A stream socket must be in
+a
+.I connected
+state before any data may be sent or received on it.
+A connection to
+another socket is created with a
+.BR connect (2)
+call.
+Once connected, data may be transferred using
+.BR read (2)
+and
+.BR write (2)
+calls or some variant of the
+.BR send (2)
+and
+.BR recv (2)
+calls.
+When a session has been completed a
+.BR close (2)
+may be performed.
+Out-of-band data may also be transmitted as described in
+.BR send (2)
+and received as described in
+.BR recv (2).
+.PP
+The communications protocols which implement a
+.B SOCK_STREAM
+ensure that data is not lost or duplicated.
+If a piece of data for which
+the peer protocol has buffer space cannot be successfully transmitted
+within a reasonable length of time, then the connection is considered
+to be dead.
+When
+.B SO_KEEPALIVE
+is enabled on the socket the protocol checks in a protocol-specific
+manner if the other end is still alive.
+A
+.B SIGPIPE
+signal is raised if a process sends or receives
+on a broken stream; this causes naive processes,
+which do not handle the signal, to exit.
+.B SOCK_SEQPACKET
+sockets employ the same system calls as
+.B SOCK_STREAM
+sockets.
+The only difference is that
+.BR read (2)
+calls will return only the amount of data requested,
+and any data remaining in the arriving packet will be discarded.
+Also all message boundaries in incoming datagrams are preserved.
+.PP
+.B SOCK_DGRAM
+and
+.B SOCK_RAW
+sockets allow sending of datagrams to correspondents named in
+.BR sendto (2)
+calls.
+Datagrams are generally received with
+.BR recvfrom (2),
+which returns the next datagram along with the address of its sender.
+.PP
+.B SOCK_PACKET
+is an obsolete socket type to receive raw packets directly from the
+device driver.
+Use
+.BR packet (7)
+instead.
+.PP
+An
+.BR fcntl (2)
+.B F_SETOWN
+operation can be used to specify a process or process group to receive a
+.B SIGURG
+signal when out-of-band data arrives or a
+.B SIGPIPE
+signal when a
+.B SOCK_STREAM
+connection breaks unexpectedly.
+This operation may also be used to set the process or process group
+that receives the I/O and asynchronous notification of I/O events via
+.BR SIGIO .
+Using
+.B F_SETOWN
+is equivalent to an
+.BR ioctl (2)
+call with the
+.B FIOSETOWN
+or
+.B SIOCSPGRP
+argument.
+.PP
+When the network signals an error condition to the protocol module (e.g.,
+using an ICMP message for IP) the pending error flag is set for the socket.
+The next operation on this socket will return the error code of the pending
+error.
+For some protocols it is possible to enable a per-socket error queue
+to retrieve detailed information about the error; see
+.B IP_RECVERR
+in
+.BR ip (7).
+.PP
+The operation of sockets is controlled by socket level
+.IR options .
+These options are defined in
+.IR <sys/socket.h> .
+The functions
+.BR setsockopt (2)
+and
+.BR getsockopt (2)
+are used to set and get options.
+.SH RETURN VALUE
+On success, a file descriptor for the new socket is returned.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EACCES
+Permission to create a socket of the specified type and/or protocol
+is denied.
+.TP
+.B EAFNOSUPPORT
+The implementation does not support the specified address family.
+.TP
+.B EINVAL
+Unknown protocol, or protocol family not available.
+.TP
+.B EINVAL
+.\" Since Linux 2.6.27
+Invalid flags in
+.IR type .
+.TP
+.B EMFILE
+The per-process limit on the number of open file descriptors has been reached.
+.TP
+.B ENFILE
+The system-wide limit on the total number of open files has been reached.
+.TP
+.BR ENOBUFS " or " ENOMEM
+Insufficient memory is available.
+The socket cannot be
+created until sufficient resources are freed.
+.TP
+.B EPROTONOSUPPORT
+The protocol type or the specified protocol is not
+supported within this domain.
+.PP
+Other errors may be generated by the underlying protocol modules.
+.SH STANDARDS
+POSIX.1-2008.
+.PP
+.B SOCK_NONBLOCK
+and
+.B SOCK_CLOEXEC
+are Linux-specific.
+.SH HISTORY
+POSIX.1-2001, 4.4BSD.
+.PP
+.BR socket ()
+appeared in 4.2BSD.
+It is generally portable to/from
+non-BSD systems supporting clones of the BSD socket layer (including
+System\ V variants).
+.PP
+The manifest constants used under 4.x BSD for protocol families
+are
+.BR PF_UNIX ,
+.BR PF_INET ,
+and so on, while
+.BR AF_UNIX ,
+.BR AF_INET ,
+and so on are used for address
+families.
+However, already the BSD man page promises: "The protocol
+family generally is the same as the address family", and subsequent
+standards use AF_* everywhere.
+.SH EXAMPLES
+An example of the use of
+.BR socket ()
+is shown in
+.BR getaddrinfo (3).
+.SH SEE ALSO
+.BR accept (2),
+.BR bind (2),
+.BR close (2),
+.BR connect (2),
+.BR fcntl (2),
+.BR getpeername (2),
+.BR getsockname (2),
+.BR getsockopt (2),
+.BR ioctl (2),
+.BR listen (2),
+.BR read (2),
+.BR recv (2),
+.BR select (2),
+.BR send (2),
+.BR shutdown (2),
+.BR socketpair (2),
+.BR write (2),
+.BR getprotoent (3),
+.BR address_families (7),
+.BR ip (7),
+.BR socket (7),
+.BR tcp (7),
+.BR udp (7),
+.BR unix (7)
+.PP
+\[lq]An Introductory 4.3BSD Interprocess Communication Tutorial\[rq]
+and
+\[lq]BSD Interprocess Communication Tutorial\[rq],
+reprinted in
+.I UNIX Programmer's Supplementary Documents Volume 1.
diff --git a/man2/socketcall.2 b/man2/socketcall.2
new file mode 100644
index 0000000..24f7f6b
--- /dev/null
+++ b/man2/socketcall.2
@@ -0,0 +1,185 @@
+'\" t
+.\" Copyright (c) 1995 Michael Chastain (mec@shell.portal.com), 15 April 1995.
+.\"
+.\" SPDX-License-Identifier: GPL-2.0-or-later
+.\"
+.\" Modified Tue Oct 22 22:11:53 1996 by Eric S. Raymond <esr@thyrsus.com>
+.TH socketcall 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+socketcall \- socket system calls
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#include <linux/net.h>" " /* Definition of " SYS_* " constants */"
+.BR "#include <sys/syscall.h>" " /* Definition of " SYS_socketcall " */"
+.B #include <unistd.h>
+.PP
+.BI "int syscall(SYS_socketcall, int " call ", unsigned long *" args );
+.fi
+.PP
+.IR Note :
+glibc provides no wrapper for
+.BR socketcall (),
+necessitating the use of
+.BR syscall (2).
+.SH DESCRIPTION
+.BR socketcall ()
+is a common kernel entry point for the socket system calls.
+.I call
+determines which socket function to invoke.
+.I args
+points to a block containing the actual arguments,
+which are passed through to the appropriate call.
+.PP
+User programs should call the appropriate functions by their usual names.
+Only standard library implementors and kernel hackers need to know about
+.BR socketcall ().
+.PP
+.TS
+tab(:);
+l l.
+\fIcall\fR:Man page
+T{
+.B SYS_SOCKET
+T}:T{
+.BR socket (2)
+T}
+T{
+.B SYS_BIND
+T}:T{
+.BR bind (2)
+T}
+T{
+.B SYS_CONNECT
+T}:T{
+.BR connect (2)
+T}
+T{
+.B SYS_LISTEN
+T}:T{
+.BR listen (2)
+T}
+T{
+.B SYS_ACCEPT
+T}:T{
+.BR accept (2)
+T}
+T{
+.B SYS_GETSOCKNAME
+T}:T{
+.BR getsockname (2)
+T}
+T{
+.B SYS_GETPEERNAME
+T}:T{
+.BR getpeername (2)
+T}
+T{
+.B SYS_SOCKETPAIR
+T}:T{
+.BR socketpair (2)
+T}
+T{
+.B SYS_SEND
+T}:T{
+.BR send (2)
+T}
+T{
+.B SYS_RECV
+T}:T{
+.BR recv (2)
+T}
+T{
+.B SYS_SENDTO
+T}:T{
+.BR sendto (2)
+T}
+T{
+.B SYS_RECVFROM
+T}:T{
+.BR recvfrom (2)
+T}
+T{
+.B SYS_SHUTDOWN
+T}:T{
+.BR shutdown (2)
+T}
+T{
+.B SYS_SETSOCKOPT
+T}:T{
+.BR setsockopt (2)
+T}
+T{
+.B SYS_GETSOCKOPT
+T}:T{
+.BR getsockopt (2)
+T}
+T{
+.B SYS_SENDMSG
+T}:T{
+.BR sendmsg (2)
+T}
+T{
+.B SYS_RECVMSG
+T}:T{
+.BR recvmsg (2)
+T}
+T{
+.B SYS_ACCEPT4
+T}:T{
+.BR accept4 (2)
+T}
+T{
+.B SYS_RECVMMSG
+T}:T{
+.BR recvmmsg (2)
+T}
+T{
+.B SYS_SENDMMSG
+T}:T{
+.BR sendmmsg (2)
+T}
+.TE
+.SH VERSIONS
+On some architectures\[em]for example, x86-64 and ARM\[em]there is no
+.BR socketcall ()
+system call; instead
+.BR socket (2),
+.BR accept (2),
+.BR bind (2),
+and so on really are implemented as separate system calls.
+.SH STANDARDS
+Linux.
+.PP
+On x86-32,
+.BR socketcall ()
+was historically the only entry point for the sockets API.
+However, starting in Linux 4.3,
+.\" commit 9dea5dc921b5f4045a18c63eb92e84dc274d17eb
+direct system calls are provided on x86-32 for the sockets API.
+This facilitates the creation of
+.BR seccomp (2)
+filters that filter sockets system calls
+(for new user-space binaries that are compiled
+to use the new entry points)
+and also provides a (very) small performance improvement.
+.SH SEE ALSO
+.BR accept (2),
+.BR bind (2),
+.BR connect (2),
+.BR getpeername (2),
+.BR getsockname (2),
+.BR getsockopt (2),
+.BR listen (2),
+.BR recv (2),
+.BR recvfrom (2),
+.BR recvmsg (2),
+.BR send (2),
+.BR sendmsg (2),
+.BR sendto (2),
+.BR setsockopt (2),
+.BR shutdown (2),
+.BR socket (2),
+.BR socketpair (2)
diff --git a/man2/socketpair.2 b/man2/socketpair.2
new file mode 100644
index 0000000..741596e
--- /dev/null
+++ b/man2/socketpair.2
@@ -0,0 +1,116 @@
+.\" Copyright (c) 1983, 1991 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" SPDX-License-Identifier: BSD-4-Clause-UC
+.\"
+.\" @(#)socketpair.2 6.4 (Berkeley) 3/10/91
+.\"
+.\" Modified 1993-07-24 by Rik Faith <faith@cs.unc.edu>
+.\" Modified 1996-10-22 by Eric S. Raymond <esr@thyrsus.com>
+.\" Modified 2002-07-22 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Modified 2004-06-17 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\" 2008-10-11, mtk: Add description of SOCK_NONBLOCK and SOCK_CLOEXEC
+.\"
+.TH socketpair 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+socketpair \- create a pair of connected sockets
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/socket.h>
+.PP
+.BI "int socketpair(int " domain ", int " type ", int " protocol \
+", int " sv [2]);
+.fi
+.SH DESCRIPTION
+The
+.BR socketpair ()
+call creates an unnamed pair of connected sockets in the specified
+.IR domain ,
+of the specified
+.IR type ,
+and using the optionally specified
+.IR protocol .
+For further details of these arguments, see
+.BR socket (2).
+.PP
+The file descriptors used in referencing the new sockets are returned in
+.I sv[0]
+and
+.IR sv[1] .
+The two sockets are indistinguishable.
+.SH RETURN VALUE
+On success, zero is returned.
+On error, \-1 is returned,
+.I errno
+is set to indicate the error, and
+.I sv
+is left unchanged.
+.PP
+On Linux (and other systems),
+.BR socketpair ()
+does not modify
+.I sv
+on failure.
+A requirement standardizing this behavior was added in POSIX.1-2008 TC2.
+.\" http://austingroupbugs.net/view.php?id=483
+.SH ERRORS
+.TP
+.B EAFNOSUPPORT
+The specified address family is not supported on this machine.
+.TP
+.B EFAULT
+The address
+.I sv
+does not specify a valid part of the process address space.
+.TP
+.B EMFILE
+The per-process limit on the number of open file descriptors has been reached.
+.TP
+.B ENFILE
+The system-wide limit on the total number of open files has been reached.
+.TP
+.B EOPNOTSUPP
+The specified protocol does not support creation of socket pairs.
+.TP
+.B EPROTONOSUPPORT
+The specified protocol is not supported on this machine.
+.SH VERSIONS
+On Linux, the only supported domains for this call are
+.B AF_UNIX
+(or synonymously,
+.BR AF_LOCAL )
+and
+.B AF_TIPC
+.\" commit: 70b03759e9ecfae400605fa34f3d7154cccbbba3
+(since Linux 4.12).
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+POSIX.1-2001, 4.4BSD.
+.PP
+.BR socketpair ()
+first appeared in 4.2BSD.
+It is generally portable to/from
+non-BSD systems supporting clones of the BSD socket layer (including
+System\ V variants).
+.PP
+Since Linux 2.6.27,
+.BR socketpair ()
+supports the
+.B SOCK_NONBLOCK
+and
+.B SOCK_CLOEXEC
+flags in the
+.I type
+argument, as described in
+.BR socket (2).
+.SH SEE ALSO
+.BR pipe (2),
+.BR read (2),
+.BR socket (2),
+.BR write (2),
+.BR socket (7),
+.BR unix (7)
diff --git a/man2/splice.2 b/man2/splice.2
new file mode 100644
index 0000000..88d4160
--- /dev/null
+++ b/man2/splice.2
@@ -0,0 +1,266 @@
+.\" This manpage is Copyright (C) 2006 Jens Axboe
+.\" and Copyright (C) 2006 Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.TH splice 2 2023-07-15 "Linux man-pages 6.05.01"
+.SH NAME
+splice \- splice data to/from a pipe
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#define _GNU_SOURCE" " /* See feature_test_macros(7) */"
+.B "#define _FILE_OFFSET_BITS 64"
+.B #include <fcntl.h>
+.PP
+.BI "ssize_t splice(int " fd_in ", off_t *_Nullable " off_in ,
+.BI " int " fd_out ", off_t *_Nullable " off_out ,
+.BI " size_t " len ", unsigned int " flags );
+.\" Return type was long before glibc 2.7
+.fi
+.SH DESCRIPTION
+.BR splice ()
+moves data between two file descriptors
+without copying between kernel address space and user address space.
+It transfers up to
+.I len
+bytes of data from the file descriptor
+.I fd_in
+to the file descriptor
+.IR fd_out ,
+where one of the file descriptors must refer to a pipe.
+.PP
+The following semantics apply for
+.I fd_in
+and
+.IR off_in :
+.IP \[bu] 3
+If
+.I fd_in
+refers to a pipe, then
+.I off_in
+must be NULL.
+.IP \[bu]
+If
+.I fd_in
+does not refer to a pipe and
+.I off_in
+is NULL, then bytes are read from
+.I fd_in
+starting from the file offset,
+and the file offset is adjusted appropriately.
+.IP \[bu]
+If
+.I fd_in
+does not refer to a pipe and
+.I off_in
+is not NULL, then
+.I off_in
+must point to a buffer which specifies the starting
+offset from which bytes will be read from
+.IR fd_in ;
+in this case, the file offset of
+.I fd_in
+is not changed.
+.PP
+Analogous statements apply for
+.I fd_out
+and
+.IR off_out .
+.PP
+The
+.I flags
+argument is a bit mask that is composed by ORing together
+zero or more of the following values:
+.TP
+.B SPLICE_F_MOVE
+Attempt to move pages instead of copying.
+This is only a hint to the kernel:
+pages may still be copied if the kernel cannot move the
+pages from the pipe, or if
+the pipe buffers don't refer to full pages.
+The initial implementation of this flag was buggy:
+therefore starting in Linux 2.6.21 it is a no-op
+(but is still permitted in a
+.BR splice ()
+call);
+in the future, a correct implementation may be restored.
+.TP
+.B SPLICE_F_NONBLOCK
+Do not block on I/O.
+This makes the splice pipe operations nonblocking, but
+.BR splice ()
+may nevertheless block because the file descriptors that
+are spliced to/from may block (unless they have the
+.B O_NONBLOCK
+flag set).
+.TP
+.B SPLICE_F_MORE
+More data will be coming in a subsequent splice.
+This is a helpful hint when
+the
+.I fd_out
+refers to a socket (see also the description of
+.B MSG_MORE
+in
+.BR send (2),
+and the description of
+.B TCP_CORK
+in
+.BR tcp (7)).
+.TP
+.B SPLICE_F_GIFT
+Unused for
+.BR splice ();
+see
+.BR vmsplice (2).
+.SH RETURN VALUE
+Upon successful completion,
+.BR splice ()
+returns the number of bytes
+spliced to or from the pipe.
+.PP
+A return value of 0 means end of input.
+If
+.I fd_in
+refers to a pipe, then this means that there was no data to transfer,
+and it would not make sense to block because there are no writers
+connected to the write end of the pipe.
+.PP
+On error,
+.BR splice ()
+returns \-1 and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EAGAIN
+.B SPLICE_F_NONBLOCK
+was specified in
+.I flags
+or one of the file descriptors had been marked as nonblocking
+.RB ( O_NONBLOCK ),
+and the operation would block.
+.TP
+.B EBADF
+One or both file descriptors are not valid,
+or do not have proper read-write mode.
+.TP
+.B EINVAL
+The target filesystem doesn't support splicing.
+.TP
+.B EINVAL
+The target file is opened in append mode.
+.\" The append-mode error is given since Linux 2.6.27; in earlier kernels,
+.\" splice() in append mode was broken
+.TP
+.B EINVAL
+Neither of the file descriptors refers to a pipe.
+.TP
+.B EINVAL
+An offset was given for a nonseekable device (e.g., a pipe).
+.TP
+.B EINVAL
+.I fd_in
+and
+.I fd_out
+refer to the same pipe.
+.TP
+.B ENOMEM
+Out of memory.
+.TP
+.B ESPIPE
+Either
+.I off_in
+or
+.I off_out
+was not NULL, but the corresponding file descriptor refers to a pipe.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 2.6.17,
+glibc 2.5.
+.PP
+In Linux 2.6.30 and earlier,
+exactly one of
+.I fd_in
+and
+.I fd_out
+was required to be a pipe.
+Since Linux 2.6.31,
+.\" commit 7c77f0b3f9208c339a4b40737bb2cb0f0319bb8d
+both arguments may refer to pipes.
+.SH NOTES
+The three system calls
+.BR splice (),
+.BR vmsplice (2),
+and
+.BR tee (2)
+provide user-space programs with full control over an arbitrary
+kernel buffer, implemented within the kernel using the same type
+of buffer that is used for a pipe.
+In overview, these system calls perform the following tasks:
+.TP
+.BR splice ()
+moves data from the buffer to an arbitrary file descriptor, or vice versa,
+or from one buffer to another.
+.TP
+.BR tee (2)
+"copies" the data from one buffer to another.
+.TP
+.BR vmsplice (2)
+"copies" data from user space into the buffer.
+.PP
+Though we talk of copying, actual copies are generally avoided.
+The kernel does this by implementing a pipe buffer as a set
+of reference-counted pointers to pages of kernel memory.
+The kernel creates "copies" of pages in a buffer by creating new
+pointers (for the output buffer) referring to the pages,
+and increasing the reference counts for the pages:
+only pointers are copied, not the pages of the buffer.
+.\"
+.\" Linus: Now, imagine using the above in a media server, for example.
+.\" Let's say that a year or two has passed, so that the video drivers
+.\" have been updated to be able to do the splice thing, and what can
+.\" you do? You can:
+.\"
+.\" - splice from the (mpeg or whatever - let's just assume that the video
+.\" input is either digital or does the encoding on its own - like they
+.\" pretty much all do) video input into a pipe (remember: no copies - the
+.\" video input will just DMA directly into memory, and splice will just
+.\" set up the pages in the pipe buffer)
+.\" - tee that pipe to split it up
+.\" - splice one end to a file (ie "save the compressed stream to disk")
+.\" - splice the other end to a real-time video decoder window for your
+.\" real-time viewing pleasure.
+.\"
+.\" Linus: Now, the advantage of splice()/tee() is that you can
+.\" do zero-copy movement of data, and unlike sendfile() you can
+.\" do it on _arbitrary_ data (and, as shown by "tee()", it's more
+.\" than just sending the data to somebody else: you can duplicate
+.\" the data and choose to forward it to two or more different
+.\" users - for things like logging etc.).
+.\"
+.PP
+.B _FILE_OFFSET_BITS
+should be defined to be 64 in code that uses non-null
+.I off_in
+or
+.I off_out
+or that takes the address of
+.BR splice ,
+if the code is intended to be portable
+to traditional 32-bit x86 and ARM platforms where
+.BR off_t 's
+width defaults to 32 bits.
+.SH EXAMPLES
+See
+.BR tee (2).
+.SH SEE ALSO
+.BR copy_file_range (2),
+.BR sendfile (2),
+.BR tee (2),
+.BR vmsplice (2),
+.BR pipe (7)
diff --git a/man2/spu_create.2 b/man2/spu_create.2
new file mode 100644
index 0000000..36d1bbd
--- /dev/null
+++ b/man2/spu_create.2
@@ -0,0 +1,276 @@
+.\" Copyright (c) International Business Machines Corp., 2006
+.\"
+.\" SPDX-License-Identifier: GPL-2.0-or-later
+.\"
+.\" HISTORY:
+.\" 2005-09-28, created by Arnd Bergmann <arndb@de.ibm.com>
+.\" 2006-06-16, revised by Eduardo M. Fleury <efleury@br.ibm.com>
+.\" 2007-07-10, some polishing by mtk
+.\" 2007-09-28, updates for newer kernels by Jeremy Kerr <jk@ozlabs.org>
+.\"
+.TH spu_create 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+spu_create \- create a new spu context
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#include <sys/spu.h>" " /* Definition of " SPU_* " constants */"
+.BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */"
+.B #include <unistd.h>
+.PP
+.BI "int syscall(SYS_spu_create, const char *" pathname \
+", unsigned int " flags ,
+.BI " mode_t " mode ", int " neighbor_fd );
+.fi
+.PP
+.IR Note :
+glibc provides no wrapper for
+.BR spu_create (),
+necessitating the use of
+.BR syscall (2).
+.SH DESCRIPTION
+The
+.BR spu_create ()
+system call is used on PowerPC machines that implement the
+Cell Broadband Engine Architecture in order to access Synergistic
+Processor Units (SPUs).
+It creates a new logical context for an SPU in
+.I pathname
+and returns a file descriptor associated with it.
+.I pathname
+must refer to a nonexistent directory in the mount point of
+the SPU filesystem
+.RB ( spufs ).
+If
+.BR spu_create ()
+is successful, a directory is created at
+.I pathname
+and it is populated with the files described in
+.BR spufs (7).
+.PP
+When a context is created,
+the returned file descriptor can only be passed to
+.BR spu_run (2),
+used as the
+.I dirfd
+argument to the
+.B *at
+family of system calls (e.g.,
+.BR openat (2)),
+or closed;
+other operations are not defined.
+A logical SPU
+context is destroyed (along with all files created within the context's
+.I pathname
+directory) once the last reference to the context has gone;
+this usually occurs when the file descriptor returned by
+.BR spu_create ()
+is closed.
+.PP
+The
+.I mode
+argument (minus any bits set in the process's
+.BR umask (2))
+specifies the permissions used for creating the new directory in
+.BR spufs .
+See
+.BR stat (2)
+for a full list of the possible
+.I mode
+values.
+.PP
+The
+.I neighbor_fd
+is used only when the
+.B SPU_CREATE_AFFINITY_SPU
+flag is specified; see below.
+.PP
+The
+.I flags
+argument can be zero or any bitwise OR-ed
+combination of the following constants:
+.TP
+.B SPU_CREATE_EVENTS_ENABLED
+Rather than using signals for reporting DMA errors, use the
+.I event
+argument to
+.BR spu_run (2).
+.TP
+.B SPU_CREATE_GANG
+Create an SPU gang instead of a context.
+(A gang is a group of SPU contexts that are
+functionally related to each other and which share common scheduling
+parameters\[em]priority and policy.
+In the future, gang scheduling may be implemented causing
+the group to be switched in and out as a single unit.)
+.IP
+A new directory will be created at the location specified by the
+.I pathname
+argument.
+This gang may be used to hold other SPU contexts, by providing
+a pathname that is within the gang directory to further calls to
+.BR spu_create ().
+.TP
+.B SPU_CREATE_NOSCHED
+Create a context that is not affected by the SPU scheduler.
+Once the context is run,
+it will not be scheduled out until it is destroyed by
+the creating process.
+.IP
+Because the context cannot be removed from the SPU, some functionality
+is disabled for
+.B SPU_CREATE_NOSCHED
+contexts.
+Only a subset of the files will be
+available in this context directory in
+.BR spufs .
+Additionally,
+.B SPU_CREATE_NOSCHED
+contexts cannot dump a core file when crashing.
+.IP
+Creating
+.B SPU_CREATE_NOSCHED
+contexts requires the
+.B CAP_SYS_NICE
+capability.
+.TP
+.B SPU_CREATE_ISOLATE
+Create an isolated SPU context.
+Isolated contexts are protected from some
+PPE (PowerPC Processing Element)
+operations,
+such as access to the SPU local store and the NPC register.
+.IP
+Creating
+.B SPU_CREATE_ISOLATE
+contexts also requires the
+.B SPU_CREATE_NOSCHED
+flag.
+.TP
+.BR SPU_CREATE_AFFINITY_SPU " (since Linux 2.6.23)"
+.\" commit 8e68e2f248332a9c3fd4f08258f488c209bd3e0c
+Create a context with affinity to another SPU context.
+This affinity information is used within the SPU scheduling algorithm.
+Using this flag requires that a file descriptor referring to
+the other SPU context be passed in the
+.I neighbor_fd
+argument.
+.TP
+.BR SPU_CREATE_AFFINITY_MEM " (since Linux 2.6.23)"
+.\" commit 8e68e2f248332a9c3fd4f08258f488c209bd3e0c
+Create a context with affinity to system memory.
+This affinity information
+is used within the SPU scheduling algorithm.
+.SH RETURN VALUE
+On success,
+.BR spu_create ()
+returns a new file descriptor.
+On failure, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EACCES
+The current user does not have write access to the
+.BR spufs (7)
+mount point.
+.TP
+.B EEXIST
+An SPU context already exists at the given pathname.
+.TP
+.B EFAULT
+.I pathname
+is not a valid string pointer in the
+calling process's address space.
+.TP
+.B EINVAL
+.I pathname
+is not a directory in the
+.BR spufs (7)
+mount point, or invalid flags have been provided.
+.TP
+.B ELOOP
+Too many symbolic links were found while resolving
+.IR pathname .
+.TP
+.B EMFILE
+The per-process limit on the number of open file descriptors has been reached.
+.TP
+.B ENAMETOOLONG
+.I pathname
+is too long.
+.TP
+.B ENFILE
+The system-wide limit on the total number of open files has been reached.
+.TP
+.B ENODEV
+An isolated context was requested, but the hardware does not support
+SPU isolation.
+.TP
+.B ENOENT
+Part of
+.I pathname
+could not be resolved.
+.TP
+.B ENOMEM
+The kernel could not allocate all resources required.
+.TP
+.B ENOSPC
+There are not enough SPU resources available to create
+a new context or the user-specific limit for the number
+of SPU contexts has been reached.
+.TP
+.B ENOSYS
+The functionality is not provided by the current system, because
+either the hardware does not provide SPUs or the spufs module is not
+loaded.
+.TP
+.B ENOTDIR
+A part of
+.I pathname
+is not a directory.
+.TP
+.B EPERM
+The
+.B SPU_CREATE_NOSCHED
+flag has been given, but the user does not have the
+.B CAP_SYS_NICE
+capability.
+.SH FILES
+.I pathname
+must point to a location beneath the mount point of
+.BR spufs .
+By convention, it gets mounted in
+.IR /spu .
+.SH STANDARDS
+Linux on PowerPC.
+.SH HISTORY
+Linux 2.6.16.
+.PP
+Prior to the addition of the
+.B SPU_CREATE_AFFINITY_SPU
+flag in Linux 2.6.23, the
+.BR spu_create ()
+system call took only three arguments (i.e., there was no
+.I neighbor_fd
+argument).
+.SH NOTES
+.BR spu_create ()
+is meant to be used from libraries that implement a more abstract
+interface to SPUs, not to be used from regular applications.
+See
+.UR http://www.bsc.es\:/projects\:/deepcomputing\:/linuxoncell/
+.UE
+for the recommended libraries.
+.SH EXAMPLES
+See
+.BR spu_run (2)
+for an example of the use of
+.BR spu_create ().
+.SH SEE ALSO
+.BR close (2),
+.BR spu_run (2),
+.BR capabilities (7),
+.BR spufs (7)
diff --git a/man2/spu_run.2 b/man2/spu_run.2
new file mode 100644
index 0000000..0a9d229
--- /dev/null
+++ b/man2/spu_run.2
@@ -0,0 +1,260 @@
+.\" Copyright (c) International Business Machines Corp., 2006
+.\"
+.\" SPDX-License-Identifier: GPL-2.0-or-later
+.\"
+.\" HISTORY:
+.\" 2005-09-28, created by Arnd Bergmann <arndb@de.ibm.com>
+.\" 2006-06-16, revised by Eduardo M. Fleury <efleury@br.ibm.com>
+.\" 2007-07-10, some polishing by mtk
+.\" 2007-09-28, updates for newer kernels, added example
+.\" by Jeremy Kerr <jk@ozlabs.org>
+.\"
+.TH spu_run 2 2023-05-03 "Linux man-pages 6.05.01"
+.SH NAME
+spu_run \- execute an SPU context
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#include <sys/spu.h>" " /* Definition of " SPU_* " constants */"
+.BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */"
+.B #include <unistd.h>
+.PP
+.BI "int syscall(SYS_spu_run, int " fd ", uint32_t *" npc \
+", uint32_t *" event );
+.fi
+.PP
+.IR Note :
+glibc provides no wrapper for
+.BR spu_run (),
+necessitating the use of
+.BR syscall (2).
+.SH DESCRIPTION
+The
+.BR spu_run ()
+system call is used on PowerPC machines that implement the
+Cell Broadband Engine Architecture in order to access Synergistic
+Processor Units (SPUs).
+The
+.I fd
+argument is a file descriptor returned by
+.BR spu_create (2)
+that refers to a specific SPU context.
+When the context gets scheduled to a physical SPU,
+it starts execution at the instruction pointer passed in
+.IR npc .
+.PP
+Execution of SPU code happens synchronously, meaning that
+.BR spu_run ()
+blocks while the SPU is still running.
+If there is a need
+to execute SPU code in parallel with other code on either the
+main CPU or other SPUs, a new thread of execution must be created
+first (e.g., using
+.BR pthread_create (3)).
+.PP
+When
+.BR spu_run ()
+returns, the current value of the SPU program counter is written to
+.IR npc ,
+so successive calls to
+.BR spu_run ()
+can use the same
+.I npc
+pointer.
+.PP
+The
+.I event
+argument provides a buffer for an extended status code.
+If the SPU
+context was created with the
+.B SPU_CREATE_EVENTS_ENABLED
+flag, then this buffer is populated by the Linux kernel before
+.BR spu_run ()
+returns.
+.PP
+The status code may be one (or more) of the following constants:
+.TP
+.B SPE_EVENT_DMA_ALIGNMENT
+A DMA alignment error occurred.
+.TP
+.B SPE_EVENT_INVALID_DMA
+An invalid MFC DMA command was attempted.
+.\" SPE_EVENT_SPE_DATA_SEGMENT is defined, but does not seem to be generated
+.\" at any point (in Linux 5.9 sources).
+.TP
+.B SPE_EVENT_SPE_DATA_STORAGE
+A DMA storage error occurred.
+.TP
+.B SPE_EVENT_SPE_ERROR
+An illegal instruction was executed.
+.PP
+NULL
+is a valid value for the
+.I event
+argument.
+In this case, the events will not be reported to the calling process.
+.SH RETURN VALUE
+On success,
+.BR spu_run ()
+returns the value of the
+.I spu_status
+register.
+On failure, it returns \-1 and sets
+.I errno
+to indicate the error.
+.PP
+The
+.I spu_status
+register value is a bit mask of status codes and
+optionally a 14-bit code returned from the
+.B stop-and-signal
+instruction on the SPU.
+The bit masks for the status codes
+are:
+.TP
+.B 0x02
+SPU was stopped by a
+.B stop-and-signal
+instruction.
+.TP
+.B 0x04
+SPU was stopped by a
+.B halt
+instruction.
+.TP
+.B 0x08
+SPU is waiting for a channel.
+.TP
+.B 0x10
+SPU is in single-step mode.
+.TP
+.B 0x20
+SPU has tried to execute an invalid instruction.
+.TP
+.B 0x40
+SPU has tried to access an invalid channel.
+.TP
+.B 0x3fff0000
+The bits masked with this value contain the code returned from a
+.B stop-and-signal
+instruction.
+These bits are valid only if the 0x02 bit is set.
+.PP
+If
+.BR spu_run ()
+has not returned an error, one or more of the lower eight bits
+are always set.
+.SH ERRORS
+.TP
+.B EBADF
+.I fd
+is not a valid file descriptor.
+.TP
+.B EFAULT
+.I npc
+is not a valid pointer, or
+.I event
+is non-NULL and an invalid pointer.
+.TP
+.B EINTR
+A signal occurred while
+.BR spu_run ()
+was in progress; see
+.BR signal (7).
+The
+.I npc
+value has been updated to the new program counter value if
+necessary.
+.TP
+.B EINVAL
+.I fd
+is not a valid file descriptor returned from
+.BR spu_create (2).
+.TP
+.B ENOMEM
+There was not enough memory available to handle a page fault
+resulting from a Memory Flow Controller (MFC) direct memory access.
+.TP
+.B ENOSYS
+The functionality is not provided by the current system, because
+either the hardware does not provide SPUs or the spufs module is not
+loaded.
+.SH STANDARDS
+Linux on PowerPC.
+.SH HISTORY
+Linux 2.6.16.
+.SH NOTES
+.BR spu_run ()
+is meant to be used from libraries that implement a more abstract
+interface to SPUs, not to be used from regular applications.
+See
+.UR http://www.bsc.es\:/projects\:/deepcomputing\:/linuxoncell/
+.UE
+for the recommended libraries.
+.SH EXAMPLES
+The following is an example of running a simple, one-instruction SPU
+program with the
+.BR spu_run ()
+system call.
+.PP
+.\" SRC BEGIN (spu_run.c)
+.EX
+#include <err.h>
+#include <fcntl.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <unistd.h>
+\&
+int main(void)
+{
+ int context, fd, spu_status;
+ uint32_t instruction, npc;
+\&
+ context = syscall(SYS_spu_create, "/spu/example\-context", 0, 0755);
+ if (context == \-1)
+ err(EXIT_FAILURE, "spu_create");
+\&
+ /*
+ * Write a \[aq]stop 0x1234\[aq] instruction to the SPU\[aq]s
+ * local store memory.
+ */
+ instruction = 0x00001234;
+\&
+ fd = open("/spu/example\-context/mem", O_RDWR);
+ if (fd == \-1)
+ err(EXIT_FAILURE, "open");
+ write(fd, &instruction, sizeof(instruction));
+\&
+ /*
+ * Set npc to the starting instruction address of the
+ * SPU program. Since we wrote the instruction at the
+ * start of the mem file, the entry point will be 0x0.
+ */
+ npc = 0;
+\&
+ spu_status = syscall(SYS_spu_run, context, &npc, NULL);
+ if (spu_status == \-1)
+ err(EXIT_FAILURE, "spu_run");
+\&
+ /*
+ * We should see a status code of 0x12340002:
+ * 0x00000002 (spu was stopped due to stop\-and\-signal)
+ * | 0x12340000 (the stop\-and\-signal code)
+ */
+ printf("SPU Status: %#08x\en", spu_status);
+\&
+ exit(EXIT_SUCCESS);
+}
+.EE
+.\" SRC END
+.\" .SH AUTHORS
+.\" Arnd Bergmann <arndb@de.ibm.com>, Jeremy Kerr <jk@ozlabs.org>
+.SH SEE ALSO
+.BR close (2),
+.BR spu_create (2),
+.BR capabilities (7),
+.BR spufs (7)
diff --git a/man2/ssetmask.2 b/man2/ssetmask.2
new file mode 100644
index 0000000..a7f99d2
--- /dev/null
+++ b/man2/ssetmask.2
@@ -0,0 +1 @@
+.so man2/sgetmask.2
diff --git a/man2/stat.2 b/man2/stat.2
new file mode 100644
index 0000000..f41daab
--- /dev/null
+++ b/man2/stat.2
@@ -0,0 +1,539 @@
+.\" Copyright (c) 1992 Drew Eckhardt (drew@cs.colorado.edu), March 28, 1992
+.\" Parts Copyright (c) 1995 Nicolai Langfeldt (janl@ifi.uio.no), 1/1/95
+.\" and Copyright (c) 2006, 2007, 2014 Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified by Michael Haardt <michael@moria.de>
+.\" Modified 1993-07-24 by Rik Faith <faith@cs.unc.edu>
+.\" Modified 1995-05-18 by Todd Larason <jtl@molehill.org>
+.\" Modified 1997-01-31 by Eric S. Raymond <esr@thyrsus.com>
+.\" Modified 1995-01-09 by Richard Kettlewell <richard@greenend.org.uk>
+.\" Modified 1998-05-13 by Michael Haardt <michael@cantor.informatik.rwth-aachen.de>
+.\" Modified 1999-07-06 by aeb & Albert Cahalan
+.\" Modified 2000-01-07 by aeb
+.\" Modified 2004-06-23 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\" 2007-06-08 mtk: Added example program
+.\" 2007-07-05 mtk: Added details on underlying system call interfaces
+.\"
+.TH stat 2 2023-05-03 "Linux man-pages 6.05.01"
+.SH NAME
+stat, fstat, lstat, fstatat \- get file status
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/stat.h>
+.PP
+.BI "int stat(const char *restrict " pathname ,
+.BI " struct stat *restrict " statbuf );
+.BI "int fstat(int " fd ", struct stat *" statbuf );
+.BI "int lstat(const char *restrict " pathname ,
+.BI " struct stat *restrict " statbuf );
+.PP
+.BR "#include <fcntl.h> " "/* Definition of " AT_* " constants */"
+.B #include <sys/stat.h>
+.PP
+.BI "int fstatat(int " dirfd ", const char *restrict " pathname ,
+.BI " struct stat *restrict " statbuf ", int " flags );
+.fi
+.PP
+.RS -4
+Feature Test Macro Requirements for glibc (see
+.BR feature_test_macros (7)):
+.RE
+.PP
+.BR lstat ():
+.nf
+ /* Since glibc 2.20 */ _DEFAULT_SOURCE
+ || _XOPEN_SOURCE >= 500
+.\" _XOPEN_SOURCE && _XOPEN_SOURCE_EXTENDED
+ || /* Since glibc 2.10: */ _POSIX_C_SOURCE >= 200112L
+ || /* glibc 2.19 and earlier */ _BSD_SOURCE
+.fi
+.PP
+.BR fstatat ():
+.nf
+ Since glibc 2.10:
+ _POSIX_C_SOURCE >= 200809L
+ Before glibc 2.10:
+ _ATFILE_SOURCE
+.fi
+.SH DESCRIPTION
+These functions return information about a file, in the buffer pointed to by
+.IR statbuf .
+No permissions are required on the file itself, but\[em]in the case of
+.BR stat (),
+.BR fstatat (),
+and
+.BR lstat ()\[em]execute
+(search) permission is required on all of the directories in
+.I pathname
+that lead to the file.
+.PP
+.BR stat ()
+and
+.BR fstatat ()
+retrieve information about the file pointed to by
+.IR pathname ;
+the differences for
+.BR fstatat ()
+are described below.
+.PP
+.BR lstat ()
+is identical to
+.BR stat (),
+except that if
+.I pathname
+is a symbolic link, then it returns information about the link itself,
+not the file that the link refers to.
+.PP
+.BR fstat ()
+is identical to
+.BR stat (),
+except that the file about which information is to be retrieved
+is specified by the file descriptor
+.IR fd .
+.\"
+.SS The stat structure
+All of these system calls return a
+.I stat
+structure (see
+.BR stat (3type)).
+.PP
+.\" Background: inode attributes are modified with i_mutex held, but
+.\" read by stat() without taking the mutex.
+.IR Note :
+for performance and simplicity reasons, different fields in the
+.I stat
+structure may contain state information from different moments
+during the execution of the system call.
+For example, if
+.I st_mode
+or
+.I st_uid
+is changed by another process by calling
+.BR chmod (2)
+or
+.BR chown (2),
+.BR stat ()
+might return the old
+.I st_mode
+together with the new
+.IR st_uid ,
+or the old
+.I st_uid
+together with the new
+.IR st_mode .
+.SS fstatat()
+The
+.BR fstatat ()
+system call is a more general interface for accessing file information
+which can still provide exactly the behavior of each of
+.BR stat (),
+.BR lstat (),
+and
+.BR fstat ().
+.PP
+If the pathname given in
+.I pathname
+is relative, then it is interpreted relative to the directory
+referred to by the file descriptor
+.I dirfd
+(rather than relative to the current working directory of
+the calling process, as is done by
+.BR stat ()
+and
+.BR lstat ()
+for a relative pathname).
+.PP
+If
+.I pathname
+is relative and
+.I dirfd
+is the special value
+.BR AT_FDCWD ,
+then
+.I pathname
+is interpreted relative to the current working
+directory of the calling process (like
+.BR stat ()
+and
+.BR lstat ()).
+.PP
+If
+.I pathname
+is absolute, then
+.I dirfd
+is ignored.
+.PP
+.I flags
+can either be 0, or include one or more of the following flags ORed:
+.TP
+.BR AT_EMPTY_PATH " (since Linux 2.6.39)"
+.\" commit 65cfc6722361570bfe255698d9cd4dccaf47570d
+If
+.I pathname
+is an empty string, operate on the file referred to by
+.I dirfd
+(which may have been obtained using the
+.BR open (2)
+.B O_PATH
+flag).
+In this case,
+.I dirfd
+can refer to any type of file, not just a directory, and
+the behavior of
+.BR fstatat ()
+is similar to that of
+.BR fstat ().
+If
+.I dirfd
+is
+.BR AT_FDCWD ,
+the call operates on the current working directory.
+This flag is Linux-specific; define
+.B _GNU_SOURCE
+.\" Before glibc 2.16, defining _ATFILE_SOURCE sufficed
+to obtain its definition.
+.TP
+.BR AT_NO_AUTOMOUNT " (since Linux 2.6.38)"
+Don't automount the terminal ("basename") component of
+.IR pathname .
+Since Linux 3.1 this flag is ignored.
+Since Linux 4.11 this flag is implied.
+.TP
+.B AT_SYMLINK_NOFOLLOW
+If
+.I pathname
+is a symbolic link, do not dereference it:
+instead return information about the link itself, like
+.BR lstat ().
+(By default,
+.BR fstatat ()
+dereferences symbolic links, like
+.BR stat ().)
+.PP
+See
+.BR openat (2)
+for an explanation of the need for
+.BR fstatat ().
+.SH RETURN VALUE
+On success, zero is returned.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EACCES
+Search permission is denied for one of the directories
+in the path prefix of
+.IR pathname .
+(See also
+.BR path_resolution (7).)
+.TP
+.B EBADF
+.I fd
+is not a valid open file descriptor.
+.TP
+.B EBADF
+.RB ( fstatat ())
+.I pathname
+is relative but
+.I dirfd
+is neither
+.B AT_FDCWD
+nor a valid file descriptor.
+.TP
+.B EFAULT
+Bad address.
+.TP
+.B EINVAL
+.RB ( fstatat ())
+Invalid flag specified in
+.IR flags .
+.TP
+.B ELOOP
+Too many symbolic links encountered while traversing the path.
+.TP
+.B ENAMETOOLONG
+.I pathname
+is too long.
+.TP
+.B ENOENT
+A component of
+.I pathname
+does not exist or is a dangling symbolic link.
+.TP
+.B ENOENT
+.I pathname
+is an empty string and
+.B AT_EMPTY_PATH
+was not specified in
+.IR flags .
+.TP
+.B ENOMEM
+Out of memory (i.e., kernel memory).
+.TP
+.B ENOTDIR
+A component of the path prefix of
+.I pathname
+is not a directory.
+.TP
+.B ENOTDIR
+.RB ( fstatat ())
+.I pathname
+is relative and
+.I dirfd
+is a file descriptor referring to a file other than a directory.
+.TP
+.B EOVERFLOW
+.I pathname
+or
+.I fd
+refers to a file whose size, inode number,
+or number of blocks cannot be represented in, respectively, the types
+.IR off_t ,
+.IR ino_t ,
+or
+.IR blkcnt_t .
+This error can occur when, for example,
+an application compiled on a 32-bit platform without
+.I \-D_FILE_OFFSET_BITS=64
+calls
+.BR stat ()
+on a file whose size exceeds
+.I (1<<31)\-1
+bytes.
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+.TP
+.BR stat ()
+.TQ
+.BR fstat ()
+.TQ
+.BR lstat ()
+SVr4, 4.3BSD, POSIX.1-2001.
+.\" SVr4 documents additional
+.\" .BR fstat ()
+.\" error conditions EINTR, ENOLINK, and EOVERFLOW. SVr4
+.\" documents additional
+.\" .BR stat ()
+.\" and
+.\" .BR lstat ()
+.\" error conditions EINTR, EMULTIHOP, ENOLINK, and EOVERFLOW.
+.TP
+.BR fstatat ()
+POSIX.1-2008.
+Linux 2.6.16,
+glibc 2.4.
+.PP
+According to POSIX.1-2001,
+.BR lstat ()
+on a symbolic link need return valid information only in the
+.I st_size
+field and the file type of the
+.I st_mode
+field of the
+.I stat
+structure.
+POSIX.1-2008 tightens the specification, requiring
+.BR lstat ()
+to return valid information in all fields except the mode bits in
+.IR st_mode .
+.PP
+Use of the
+.I st_blocks
+and
+.I st_blksize
+fields may be less portable.
+(They were introduced in BSD.
+The interpretation differs between systems,
+and possibly on a single system when NFS mounts are involved.)
+.SS C library/kernel differences
+Over time, increases in the size of the
+.I stat
+structure have led to three successive versions of
+.BR stat ():
+.IR sys_stat ()
+(slot
+.IR __NR_oldstat ),
+.IR sys_newstat ()
+(slot
+.IR __NR_stat ),
+and
+.IR sys_stat64 ()
+(slot
+.IR __NR_stat64 )
+on 32-bit platforms such as i386.
+The first two versions were already present in Linux 1.0
+(albeit with different names);
+.\" See include/asm-i386/stat.h in the Linux 2.4 source code for the
+.\" various versions of the structure definitions
+the last was added in Linux 2.4.
+Similar remarks apply for
+.BR fstat ()
+and
+.BR lstat ().
+.PP
+The kernel-internal versions of the
+.I stat
+structure dealt with by the different versions are, respectively:
+.TP
+.I __old_kernel_stat
+The original structure, with rather narrow fields, and no padding.
+.TP
+.I stat
+Larger
+.I st_ino
+field and padding added to various parts of the structure to
+allow for future expansion.
+.TP
+.I stat64
+Even larger
+.I st_ino
+field,
+larger
+.I st_uid
+and
+.I st_gid
+fields to accommodate the Linux-2.4 expansion of UIDs and GIDs to 32 bits,
+and various other enlarged fields and further padding in the structure.
+(Various padding bytes were eventually consumed in Linux 2.6,
+with the advent of 32-bit device IDs and nanosecond components
+for the timestamp fields.)
+.PP
+The glibc
+.BR stat ()
+wrapper function hides these details from applications,
+invoking the most recent version of the system call provided by the kernel,
+and repacking the returned information if required for old binaries.
+.\"
+.\" A note from Andries Brouwer, July 2007
+.\"
+.\" > Is the story not rather more complicated for some calls like
+.\" > stat(2)?
+.\"
+.\" Yes and no, mostly no. See /usr/include/sys/stat.h .
+.\"
+.\" The idea is here not so much that syscalls change, but that
+.\" the definitions of struct stat and of the types dev_t and mode_t change.
+.\" This means that libc (even if it does not call the kernel
+.\" but only calls some internal function) must know what the
+.\" format of dev_t or of struct stat is.
+.\" The communication between the application and libc goes via
+.\" the include file <sys/stat.h> that defines a _STAT_VER and
+.\" _MKNOD_VER describing the layout of the data that user space
+.\" uses. Each (almost each) occurrence of stat() is replaced by
+.\" an occurrence of xstat() where the first parameter of xstat()
+.\" is this version number _STAT_VER.
+.\"
+.\" Now, also the definitions used by the kernel change.
+.\" But glibc copes with this in the standard way, and the
+.\" struct stat as returned by the kernel is repacked into
+.\" the struct stat as expected by the application.
+.\" Thus, _STAT_VER and this setup cater for the application-libc
+.\" interface, rather than the libc-kernel interface.
+.\"
+.\" (Note that the details depend on gcc being used as c compiler.)
+.PP
+On modern 64-bit systems, life is simpler: there is a single
+.BR stat ()
+system call and the kernel deals with a
+.I stat
+structure that contains fields of a sufficient size.
+.PP
+The underlying system call employed by the glibc
+.BR fstatat ()
+wrapper function is actually called
+.BR fstatat64 ()
+or, on some architectures,
+.\" strace(1) shows the name "newfstatat" on x86-64
+.BR newfstatat ().
+.SH EXAMPLES
+The following program calls
+.BR lstat ()
+and displays selected fields in the returned
+.I stat
+structure.
+.PP
+.\" SRC BEGIN (stat.c)
+.EX
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <sys/sysmacros.h>
+#include <time.h>
+\&
+int
+main(int argc, char *argv[])
+{
+ struct stat sb;
+\&
+ if (argc != 2) {
+ fprintf(stderr, "Usage: %s <pathname>\en", argv[0]);
+ exit(EXIT_FAILURE);
+ }
+\&
+ if (lstat(argv[1], &sb) == \-1) {
+ perror("lstat");
+ exit(EXIT_FAILURE);
+ }
+\&
+ printf("ID of containing device: [%x,%x]\en",
+ major(sb.st_dev),
+ minor(sb.st_dev));
+\&
+ printf("File type: ");
+\&
+ switch (sb.st_mode & S_IFMT) {
+ case S_IFBLK: printf("block device\en"); break;
+ case S_IFCHR: printf("character device\en"); break;
+ case S_IFDIR: printf("directory\en"); break;
+ case S_IFIFO: printf("FIFO/pipe\en"); break;
+ case S_IFLNK: printf("symlink\en"); break;
+ case S_IFREG: printf("regular file\en"); break;
+ case S_IFSOCK: printf("socket\en"); break;
+ default: printf("unknown?\en"); break;
+ }
+\&
+ printf("I\-node number: %ju\en", (uintmax_t) sb.st_ino);
+\&
+ printf("Mode: %jo (octal)\en",
+ (uintmax_t) sb.st_mode);
+\&
+ printf("Link count: %ju\en", (uintmax_t) sb.st_nlink);
+ printf("Ownership: UID=%ju GID=%ju\en",
+ (uintmax_t) sb.st_uid, (uintmax_t) sb.st_gid);
+\&
+ printf("Preferred I/O block size: %jd bytes\en",
+ (intmax_t) sb.st_blksize);
+ printf("File size: %jd bytes\en",
+ (intmax_t) sb.st_size);
+ printf("Blocks allocated: %jd\en",
+ (intmax_t) sb.st_blocks);
+\&
+ printf("Last status change: %s", ctime(&sb.st_ctime));
+ printf("Last file access: %s", ctime(&sb.st_atime));
+ printf("Last file modification: %s", ctime(&sb.st_mtime));
+\&
+ exit(EXIT_SUCCESS);
+}
+.EE
+.\" SRC END
+.SH SEE ALSO
+.BR ls (1),
+.BR stat (1),
+.BR access (2),
+.BR chmod (2),
+.BR chown (2),
+.BR readlink (2),
+.BR statx (2),
+.BR utime (2),
+.BR stat (3type),
+.BR capabilities (7),
+.BR inode (7),
+.BR symlink (7)
diff --git a/man2/stat64.2 b/man2/stat64.2
new file mode 100644
index 0000000..b1a86c1
--- /dev/null
+++ b/man2/stat64.2
@@ -0,0 +1 @@
+.so man2/stat.2
diff --git a/man2/statfs.2 b/man2/statfs.2
new file mode 100644
index 0000000..26dad7c
--- /dev/null
+++ b/man2/statfs.2
@@ -0,0 +1,389 @@
+.\" Copyright (C) 2003 Andries Brouwer (aeb@cwi.nl)
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified 2003-08-17 by Walter Harms
+.\" Modified 2004-06-23 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.TH statfs 2 2023-07-18 "Linux man-pages 6.05.01"
+.SH NAME
+statfs, fstatfs \- get filesystem statistics
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#include <sys/vfs.h> " "/* or <sys/statfs.h> */"
+.PP
+.BI "int statfs(const char *" path ", struct statfs *" buf );
+.BI "int fstatfs(int " fd ", struct statfs *" buf );
+.fi
+.PP
+Unless you need the
+.I f_type
+field, you should use the standard
+.BR statvfs (3)
+interface instead.
+.SH DESCRIPTION
+The
+.BR statfs ()
+system call returns information about a mounted filesystem.
+.I path
+is the pathname of any file within the mounted filesystem.
+.I buf
+is a pointer to a
+.I statfs
+structure defined approximately as follows:
+.PP
+.in +4n
+.EX
+struct statfs {
+ __fsword_t f_type; /* Type of filesystem (see below) */
+ __fsword_t f_bsize; /* Optimal transfer block size */
+ fsblkcnt_t f_blocks; /* Total data blocks in filesystem */
+ fsblkcnt_t f_bfree; /* Free blocks in filesystem */
+ fsblkcnt_t f_bavail; /* Free blocks available to
+ unprivileged user */
+ fsfilcnt_t f_files; /* Total inodes in filesystem */
+ fsfilcnt_t f_ffree; /* Free inodes in filesystem */
+ fsid_t f_fsid; /* Filesystem ID */
+ __fsword_t f_namelen; /* Maximum length of filenames */
+ __fsword_t f_frsize; /* Fragment size (since Linux 2.6) */
+ __fsword_t f_flags; /* Mount flags of filesystem
+ (since Linux 2.6.36) */
+ __fsword_t f_spare[xxx];
+ /* Padding bytes reserved for future use */
+};
+.EE
+.in
+.PP
+The following filesystem types may appear in
+.IR f_type :
+.PP
+.in +4n
+.EX
+ADFS_SUPER_MAGIC 0xadf5
+AFFS_SUPER_MAGIC 0xadff
+AFS_SUPER_MAGIC 0x5346414f
+ANON_INODE_FS_MAGIC 0x09041934 /* Anonymous inode FS (for
+ pseudofiles that have no name;
+ e.g., epoll, signalfd, bpf) */
+AUTOFS_SUPER_MAGIC 0x0187
+BDEVFS_MAGIC 0x62646576
+BEFS_SUPER_MAGIC 0x42465331
+BFS_MAGIC 0x1badface
+BINFMTFS_MAGIC 0x42494e4d
+BPF_FS_MAGIC 0xcafe4a11
+BTRFS_SUPER_MAGIC 0x9123683e
+BTRFS_TEST_MAGIC 0x73727279
+CGROUP_SUPER_MAGIC 0x27e0eb /* Cgroup pseudo FS */
+CGROUP2_SUPER_MAGIC 0x63677270 /* Cgroup v2 pseudo FS */
+CIFS_MAGIC_NUMBER 0xff534d42
+CODA_SUPER_MAGIC 0x73757245
+COH_SUPER_MAGIC 0x012ff7b7
+CRAMFS_MAGIC 0x28cd3d45
+DEBUGFS_MAGIC 0x64626720
+DEVFS_SUPER_MAGIC 0x1373 /* Linux 2.6.17 and earlier */
+DEVPTS_SUPER_MAGIC 0x1cd1
+ECRYPTFS_SUPER_MAGIC 0xf15f
+EFIVARFS_MAGIC 0xde5e81e4
+EFS_SUPER_MAGIC 0x00414a53
+EXT_SUPER_MAGIC 0x137d /* Linux 2.0 and earlier */
+EXT2_OLD_SUPER_MAGIC 0xef51
+EXT2_SUPER_MAGIC 0xef53
+EXT3_SUPER_MAGIC 0xef53
+EXT4_SUPER_MAGIC 0xef53
+F2FS_SUPER_MAGIC 0xf2f52010
+FUSE_SUPER_MAGIC 0x65735546
+FUTEXFS_SUPER_MAGIC 0xbad1dea /* Unused */
+HFS_SUPER_MAGIC 0x4244
+HOSTFS_SUPER_MAGIC 0x00c0ffee
+HPFS_SUPER_MAGIC 0xf995e849
+HUGETLBFS_MAGIC 0x958458f6
+ISOFS_SUPER_MAGIC 0x9660
+JFFS2_SUPER_MAGIC 0x72b6
+JFS_SUPER_MAGIC 0x3153464a
+MINIX_SUPER_MAGIC 0x137f /* original minix FS */
+MINIX_SUPER_MAGIC2 0x138f /* 30 char minix FS */
+MINIX2_SUPER_MAGIC 0x2468 /* minix V2 FS */
+MINIX2_SUPER_MAGIC2 0x2478 /* minix V2 FS, 30 char names */
+MINIX3_SUPER_MAGIC 0x4d5a /* minix V3 FS, 60 char names */
+MQUEUE_MAGIC 0x19800202 /* POSIX message queue FS */
+MSDOS_SUPER_MAGIC 0x4d44
+MTD_INODE_FS_MAGIC 0x11307854
+NCP_SUPER_MAGIC 0x564c
+NFS_SUPER_MAGIC 0x6969
+NILFS_SUPER_MAGIC 0x3434
+NSFS_MAGIC 0x6e736673
+NTFS_SB_MAGIC 0x5346544e
+OCFS2_SUPER_MAGIC 0x7461636f
+OPENPROM_SUPER_MAGIC 0x9fa1
+OVERLAYFS_SUPER_MAGIC 0x794c7630
+PIPEFS_MAGIC 0x50495045
+PROC_SUPER_MAGIC 0x9fa0 /* /proc FS */
+PSTOREFS_MAGIC 0x6165676c
+QNX4_SUPER_MAGIC 0x002f
+QNX6_SUPER_MAGIC 0x68191122
+RAMFS_MAGIC 0x858458f6
+REISERFS_SUPER_MAGIC 0x52654973
+ROMFS_MAGIC 0x7275
+SECURITYFS_MAGIC 0x73636673
+SELINUX_MAGIC 0xf97cff8c
+SMACK_MAGIC 0x43415d53
+SMB_SUPER_MAGIC 0x517b
+SMB2_MAGIC_NUMBER 0xfe534d42
+SOCKFS_MAGIC 0x534f434b
+SQUASHFS_MAGIC 0x73717368
+SYSFS_MAGIC 0x62656572
+SYSV2_SUPER_MAGIC 0x012ff7b6
+SYSV4_SUPER_MAGIC 0x012ff7b5
+TMPFS_MAGIC 0x01021994
+TRACEFS_MAGIC 0x74726163
+UDF_SUPER_MAGIC 0x15013346
+UFS_MAGIC 0x00011954
+USBDEVICE_SUPER_MAGIC 0x9fa2
+V9FS_MAGIC 0x01021997
+VXFS_SUPER_MAGIC 0xa501fcf5
+XENFS_SUPER_MAGIC 0xabba1974
+XENIX_SUPER_MAGIC 0x012ff7b4
+XFS_SUPER_MAGIC 0x58465342
+_XIAFS_SUPER_MAGIC 0x012fd16d /* Linux 2.0 and earlier */
+.EE
+.in
+.PP
+Most of these MAGIC constants are defined in
+.IR /usr/include/linux/magic.h ,
+and some are hardcoded in kernel sources.
+.PP
+The
+.I f_flags
+field is a bit mask indicating mount options for the filesystem.
+It contains zero or more of the following bits:
+.\" XXX Keep this list in sync with statvfs(3)
+.TP
+.B ST_MANDLOCK
+Mandatory locking is permitted on the filesystem (see
+.BR fcntl (2)).
+.TP
+.B ST_NOATIME
+Do not update access times; see
+.BR mount (2).
+.TP
+.B ST_NODEV
+Disallow access to device special files on this filesystem.
+.TP
+.B ST_NODIRATIME
+Do not update directory access times; see
+.BR mount (2).
+.TP
+.B ST_NOEXEC
+Execution of programs is disallowed on this filesystem.
+.TP
+.B ST_NOSUID
+The set-user-ID and set-group-ID bits are ignored by
+.BR exec (3)
+for executable files on this filesystem.
+.TP
+.B ST_RDONLY
+This filesystem is mounted read-only.
+.TP
+.B ST_RELATIME
+Update atime relative to mtime/ctime; see
+.BR mount (2).
+.TP
+.B ST_SYNCHRONOUS
+Writes are synched to the filesystem immediately (see the description of
+.B O_SYNC
+in
+.BR open (2)).
+.TP
+.BR ST_NOSYMFOLLOW " (since Linux 5.10)"
+.\" dab741e0e02bd3c4f5e2e97be74b39df2523fc6e
+Symbolic links are not followed when resolving paths; see
+.BR mount (2).
+.PP
+Nobody knows what
+.I f_fsid
+is supposed to contain (but see below).
+.PP
+Fields that are undefined for a particular filesystem are set to 0.
+.PP
+.BR fstatfs ()
+returns the same information about an open file referenced by descriptor
+.IR fd .
+.SH RETURN VALUE
+On success, zero is returned.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EACCES
+.RB ( statfs ())
+Search permission is denied for a component of the path prefix of
+.IR path .
+(See also
+.BR path_resolution (7).)
+.TP
+.B EBADF
+.RB ( fstatfs ())
+.I fd
+is not a valid open file descriptor.
+.TP
+.B EFAULT
+.I buf
+or
+.I path
+points to an invalid address.
+.TP
+.B EINTR
+The call was interrupted by a signal; see
+.BR signal (7).
+.TP
+.B EIO
+An I/O error occurred while reading from the filesystem.
+.TP
+.B ELOOP
+.RB ( statfs ())
+Too many symbolic links were encountered in translating
+.IR path .
+.TP
+.B ENAMETOOLONG
+.RB ( statfs ())
+.I path
+is too long.
+.TP
+.B ENOENT
+.RB ( statfs ())
+The file referred to by
+.I path
+does not exist.
+.TP
+.B ENOMEM
+Insufficient kernel memory was available.
+.TP
+.B ENOSYS
+The filesystem does not support this call.
+.TP
+.B ENOTDIR
+.RB ( statfs ())
+A component of the path prefix of
+.I path
+is not a directory.
+.TP
+.B EOVERFLOW
+Some values were too large to be represented in the returned struct.
+.SH VERSIONS
+.SS The f_fsid field
+Solaris, Irix, and POSIX have a system call
+.BR statvfs (2)
+that returns a
+.I "struct statvfs"
+(defined in
+.IR <sys/statvfs.h> )
+containing an
+.I "unsigned long"
+.IR f_fsid .
+Linux, SunOS, HP-UX, 4.4BSD have a system call
+.BR statfs ()
+that returns a
+.I "struct statfs"
+(defined in
+.IR <sys/vfs.h> )
+containing a
+.I fsid_t
+.IR f_fsid ,
+where
+.I fsid_t
+is defined as
+.IR "struct { int val[2]; }" .
+The same holds for FreeBSD, except that it uses the include file
+.IR <sys/mount.h> .
+.PP
+The general idea is that
+.I f_fsid
+contains some random stuff such that the pair
+.RI ( f_fsid , ino )
+uniquely determines a file.
+Some operating systems use (a variation on) the device number,
+or the device number combined with the filesystem type.
+Several operating systems restrict giving out the
+.I f_fsid
+field to the superuser only (and zero it for unprivileged users),
+because this field is used in the filehandle of the filesystem
+when NFS-exported, and giving it out is a security concern.
+.PP
+Under some operating systems, the
+.I fsid
+can be used as the second argument to the
+.BR sysfs (2)
+system call.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+The Linux
+.BR statfs ()
+was inspired by the 4.4BSD one
+(but they do not use the same structure).
+.PP
+The original Linux
+.BR statfs ()
+and
+.BR fstatfs ()
+system calls were not designed with extremely large file sizes in mind.
+Subsequently, Linux 2.6
+added new
+.BR statfs64 ()
+and
+.BR fstatfs64 ()
+system calls that employ a new structure,
+.IR statfs64 .
+The new structure contains the same fields as the original
+.I statfs
+structure, but the sizes of various fields are increased,
+to accommodate large file sizes.
+The glibc
+.BR statfs ()
+and
+.BR fstatfs ()
+wrapper functions transparently deal with the kernel differences.
+.PP
+LSB has deprecated the library calls
+.BR statfs ()
+and
+.BR fstatfs ()
+and tells us to use
+.BR statvfs (3)
+and
+.BR fstatvfs (3)
+instead.
+.SH NOTES
+The
+.I __fsword_t
+type used for various fields in the
+.I statfs
+structure definition is a glibc internal type,
+not intended for public use.
+This leaves the programmer in a bit of a conundrum when trying to copy
+or compare these fields to local variables in a program.
+Using
+.I "unsigned\ int"
+for such variables suffices on most systems.
+.PP
+Some systems have only \fI<sys/vfs.h>\fP, other systems also have
+\fI<sys/statfs.h>\fP, where the former includes the latter.
+So it seems
+including the former is the best choice.
+.SH BUGS
+From Linux 2.6.38 up to and including Linux 3.1,
+.\" broken in commit ff0c7d15f9787b7e8c601533c015295cc68329f8
+.\" fixed in commit d70ef97baf048412c395bb5d65791d8fe133a52b
+.BR fstatfs ()
+failed with the error
+.B ENOSYS
+for file descriptors created by
+.BR pipe (2).
+.SH SEE ALSO
+.BR stat (2),
+.BR statvfs (3),
+.BR path_resolution (7)
diff --git a/man2/statfs64.2 b/man2/statfs64.2
new file mode 100644
index 0000000..923d3c0
--- /dev/null
+++ b/man2/statfs64.2
@@ -0,0 +1 @@
+.so man2/statfs.2
diff --git a/man2/statx.2 b/man2/statx.2
new file mode 100644
index 0000000..d7c36b8
--- /dev/null
+++ b/man2/statx.2
@@ -0,0 +1,614 @@
+'\" t
+.\" Copyright (c) 2017 David Howells <dhowells@redhat.com>
+.\"
+.\" Derived from the stat.2 manual page:
+.\" Copyright (c) 1992 Drew Eckhardt (drew@cs.colorado.edu), March 28, 1992
+.\" Parts Copyright (c) 1995 Nicolai Langfeldt (janl@ifi.uio.no), 1/1/95
+.\" and Copyright (c) 2006, 2007, 2014 Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.TH statx 2 2023-06-01 "Linux man-pages 6.05.01"
+.SH NAME
+statx \- get file status (extended)
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#define _GNU_SOURCE " "/* See feature_test_macros(7) */"
+.BR "#include <fcntl.h> " "/* Definition of " AT_* " constants */"
+.B #include <sys/stat.h>
+.PP
+.BI "int statx(int " dirfd ", const char *restrict " pathname ", int " flags ,
+.BI " unsigned int " mask ", struct statx *restrict " statxbuf );
+.fi
+.SH DESCRIPTION
+This function returns information about a file, storing it in the buffer
+pointed to by
+.IR statxbuf .
+The returned buffer is a structure of the following type:
+.PP
+.in +4n
+.EX
+struct statx {
+ __u32 stx_mask; /* Mask of bits indicating
+ filled fields */
+ __u32 stx_blksize; /* Block size for filesystem I/O */
+ __u64 stx_attributes; /* Extra file attribute indicators */
+ __u32 stx_nlink; /* Number of hard links */
+ __u32 stx_uid; /* User ID of owner */
+ __u32 stx_gid; /* Group ID of owner */
+ __u16 stx_mode; /* File type and mode */
+ __u64 stx_ino; /* Inode number */
+ __u64 stx_size; /* Total size in bytes */
+ __u64 stx_blocks; /* Number of 512B blocks allocated */
+ __u64 stx_attributes_mask;
+ /* Mask to show what\[aq]s supported
+ in stx_attributes */
+\&
+ /* The following fields are file timestamps */
+ struct statx_timestamp stx_atime; /* Last access */
+ struct statx_timestamp stx_btime; /* Creation */
+ struct statx_timestamp stx_ctime; /* Last status change */
+ struct statx_timestamp stx_mtime; /* Last modification */
+\&
+ /* If this file represents a device, then the next two
+ fields contain the ID of the device */
+ __u32 stx_rdev_major; /* Major ID */
+ __u32 stx_rdev_minor; /* Minor ID */
+\&
+ /* The next two fields contain the ID of the device
+ containing the filesystem where the file resides */
+ __u32 stx_dev_major; /* Major ID */
+ __u32 stx_dev_minor; /* Minor ID */
+\&
+ __u64 stx_mnt_id; /* Mount ID */
+\&
+ /* Direct I/O alignment restrictions */
+ __u32 stx_dio_mem_align;
+ __u32 stx_dio_offset_align;
+};
+.EE
+.in
+.PP
+The file timestamps are structures of the following type:
+.PP
+.in +4n
+.EX
+struct statx_timestamp {
+ __s64 tv_sec; /* Seconds since the Epoch (UNIX time) */
+ __u32 tv_nsec; /* Nanoseconds since tv_sec */
+};
+.EE
+.in
+.PP
+(Note that reserved space and padding are omitted.)
+.SS
+Invoking \fBstatx\fR():
+To access a file's status, no permissions are required on the file itself,
+but in the case of
+.BR statx ()
+with a pathname,
+execute (search) permission is required on all of the directories in
+.I pathname
+that lead to the file.
+.PP
+.BR statx ()
+uses
+.IR pathname ,
+.IR dirfd ,
+and
+.I flags
+to identify the target file in one of the following ways:
+.TP
+An absolute pathname
+If
+.I pathname
+begins with a slash,
+then it is an absolute pathname that identifies the target file.
+In this case,
+.I dirfd
+is ignored.
+.TP
+A relative pathname
+If
+.I pathname
+is a string that begins with a character other than a slash and
+.I dirfd
+is
+.BR AT_FDCWD ,
+then
+.I pathname
+is a relative pathname that is interpreted relative to the process's
+current working directory.
+.TP
+A directory-relative pathname
+If
+.I pathname
+is a string that begins with a character other than a slash and
+.I dirfd
+is a file descriptor that refers to a directory, then
+.I pathname
+is a relative pathname that is interpreted relative to the directory
+referred to by
+.IR dirfd .
+(See
+.BR openat (2)
+for an explanation of why this is useful.)
+.TP
+By file descriptor
+If
+.I pathname
+is an empty string and the
+.B AT_EMPTY_PATH
+flag is specified in
+.I flags
+(see below),
+then the target file is the one referred to by the file descriptor
+.IR dirfd .
+.PP
+.I flags
+can be used to influence a pathname-based lookup.
+A value for
+.I flags
+is constructed by ORing together zero or more of the following constants:
+.TP
+.B AT_EMPTY_PATH
+.\" commit 65cfc6722361570bfe255698d9cd4dccaf47570d
+If
+.I pathname
+is an empty string, operate on the file referred to by
+.I dirfd
+(which may have been obtained using the
+.BR open (2)
+.B O_PATH
+flag).
+In this case,
+.I dirfd
+can refer to any type of file, not just a directory.
+.IP
+If
+.I dirfd
+is
+.BR AT_FDCWD ,
+the call operates on the current working directory.
+.TP
+.B AT_NO_AUTOMOUNT
+Don't automount the terminal ("basename") component of
+.I pathname
+if it is a directory that is an automount point.
+This allows the caller to gather attributes of an automount point
+(rather than the location it would mount).
+This flag has no effect if the mount point has already been mounted over.
+.IP
+The
+.B AT_NO_AUTOMOUNT
+flag can be used in tools that scan directories
+to prevent mass-automounting of a directory of automount points.
+.IP
+All of
+.BR stat (2),
+.BR lstat (2),
+and
+.BR fstatat (2)
+act as though
+.B AT_NO_AUTOMOUNT
+was set.
+.TP
+.B AT_SYMLINK_NOFOLLOW
+If
+.I pathname
+is a symbolic link, do not dereference it:
+instead return information about the link itself, like
+.BR lstat (2).
+.PP
+.I flags
+can also be used to control what sort of synchronization the kernel will do
+when querying a file on a remote filesystem.
+This is done by ORing in one of the following values:
+.TP
+.B AT_STATX_SYNC_AS_STAT
+Do whatever
+.BR stat (2)
+does.
+This is the default and is very much filesystem-specific.
+.TP
+.B AT_STATX_FORCE_SYNC
+Force the attributes to be synchronized with the server.
+This may require that
+a network filesystem perform a data writeback to get the timestamps correct.
+.TP
+.B AT_STATX_DONT_SYNC
+Don't synchronize anything, but rather just take whatever
+the system has cached if possible.
+This may mean that the information returned is approximate, but,
+on a network filesystem, it may not involve a round trip to the server - even
+if no lease is held.
+.PP
+The
+.I mask
+argument to
+.BR statx ()
+is used to tell the kernel which fields the caller is interested in.
+.I mask
+is an ORed combination of the following constants:
+.PP
+.in +4n
+.TS
+lB l.
+STATX_TYPE Want stx_mode & S_IFMT
+STATX_MODE Want stx_mode & \[ti]S_IFMT
+STATX_NLINK Want stx_nlink
+STATX_UID Want stx_uid
+STATX_GID Want stx_gid
+STATX_ATIME Want stx_atime
+STATX_MTIME Want stx_mtime
+STATX_CTIME Want stx_ctime
+STATX_INO Want stx_ino
+STATX_SIZE Want stx_size
+STATX_BLOCKS Want stx_blocks
+STATX_BASIC_STATS [All of the above]
+STATX_BTIME Want stx_btime
+STATX_ALL The same as STATX_BASIC_STATS | STATX_BTIME.
+ It is deprecated and should not be used.
+STATX_MNT_ID Want stx_mnt_id (since Linux 5.8)
+STATX_DIOALIGN Want stx_dio_mem_align and stx_dio_offset_align
+ (since Linux 6.1; support varies by filesystem)
+.TE
+.in
+.PP
+Note that, in general, the kernel does
+.I not
+reject values in
+.I mask
+other than the above.
+(For an exception, see
+.B EINVAL
+in ERRORS.)
+Instead, it simply informs the caller which values are supported
+by this kernel and filesystem via the
+.I statx.stx_mask
+field.
+Therefore,
+.I "do not"
+simply set
+.I mask
+to
+.B UINT_MAX
+(all bits set),
+as one or more bits may, in the future, be used to specify an
+extension to the buffer.
+.SS
+The returned information
+The status information for the target file is returned in the
+.I statx
+structure pointed to by
+.IR statxbuf .
+Included in this is
+.I stx_mask
+which indicates what other information has been returned.
+.I stx_mask
+has the same format as the
+.I mask
+argument and bits are set in it to indicate
+which fields have been filled in.
+.PP
+It should be noted that the kernel may return fields that weren't
+requested and may fail to return fields that were requested,
+depending on what the backing filesystem supports.
+(Fields that are given values despite being unrequested can just be ignored.)
+In either case,
+.I stx_mask
+will not be equal to
+.IR mask .
+.PP
+If a filesystem does not support a field or if it has
+an unrepresentable value (for instance, a file with an exotic type),
+then the mask bit corresponding to that field will be cleared in
+.I stx_mask
+even if the user asked for it and a dummy value will be filled in for
+compatibility purposes if one is available (e.g., a dummy UID and GID may be
+specified to mount under some circumstances).
+.PP
+A filesystem may also fill in fields that the caller didn't ask for if it has
+values for them available and the information is available at no extra cost.
+If this happens, the corresponding bits will be set in
+.IR stx_mask .
+.PP
+.\" Background: inode attributes are modified with i_mutex held, but
+.\" read by stat() without taking the mutex.
+.IR Note :
+for performance and simplicity reasons, different fields in the
+.I statx
+structure may contain state information from different moments
+during the execution of the system call.
+For example, if
+.I stx_mode
+or
+.I stx_uid
+is changed by another process by calling
+.BR chmod (2)
+or
+.BR chown (2),
+.BR statx ()
+might return the old
+.I stx_mode
+together with the new
+.IR stx_uid ,
+or the old
+.I stx_uid
+together with the new
+.IR stx_mode .
+.PP
+Apart from
+.I stx_mask
+(which is described above), the fields in the
+.I statx
+structure are:
+.TP
+.I stx_blksize
+The "preferred" block size for efficient filesystem I/O.
+(Writing to a file in
+smaller chunks may cause an inefficient read-modify-rewrite.)
+.TP
+.I stx_attributes
+Further status information about the file (see below for more information).
+.TP
+.I stx_nlink
+The number of hard links on a file.
+.TP
+.I stx_uid
+This field contains the user ID of the owner of the file.
+.TP
+.I stx_gid
+This field contains the ID of the group owner of the file.
+.TP
+.I stx_mode
+The file type and mode.
+See
+.BR inode (7)
+for details.
+.TP
+.I stx_ino
+The inode number of the file.
+.TP
+.I stx_size
+The size of the file (if it is a regular file or a symbolic link) in bytes.
+The size of a symbolic link is the length of the pathname it contains,
+without a terminating null byte.
+.TP
+.I stx_blocks
+The number of blocks allocated to the file on the medium, in 512-byte units.
+(This may be smaller than
+.IR stx_size /512
+when the file has holes.)
+.TP
+.I stx_attributes_mask
+A mask indicating which bits in
+.I stx_attributes
+are supported by the VFS and the filesystem.
+.TP
+.I stx_atime
+The file's last access timestamp.
+.TP
+.I stx_btime
+The file's creation timestamp.
+.TP
+.I stx_ctime
+The file's last status change timestamp.
+.TP
+.I stx_mtime
+The file's last modification timestamp.
+.TP
+.IR stx_dev_major " and " stx_dev_minor
+The device on which this file (inode) resides.
+.TP
+.IR stx_rdev_major " and " stx_rdev_minor
+The device that this file (inode) represents if the file is of block or
+character device type.
+.TP
+.I stx_mnt_id
+.\" commit fa2fcf4f1df1559a0a4ee0f46915b496cc2ebf60
+The mount ID of the mount containing the file.
+This is the same number reported by
+.BR name_to_handle_at (2)
+and corresponds to the number in the first field in one of the records in
+.IR /proc/self/mountinfo .
+.TP
+.I stx_dio_mem_align
+The alignment (in bytes) required for user memory buffers for direct I/O
+.RB ( O_DIRECT )
+on this file,
+or 0 if direct I/O is not supported on this file.
+.IP
+.B STATX_DIOALIGN
+.RI ( stx_dio_mem_align
+and
+.IR stx_dio_offset_align )
+is supported on block devices since Linux 6.1.
+The support on regular files varies by filesystem;
+it is supported by ext4, f2fs, and xfs since Linux 6.1.
+.TP
+.I stx_dio_offset_align
+The alignment (in bytes) required for file offsets and I/O segment lengths
+for direct I/O
+.RB ( O_DIRECT )
+on this file,
+or 0 if direct I/O is not supported on this file.
+This will only be nonzero if
+.I stx_dio_mem_align
+is nonzero, and vice versa.
+.PP
+For further information on the above fields, see
+.BR inode (7).
+.\"
+.SS File attributes
+The
+.I stx_attributes
+field contains a set of ORed flags that indicate additional attributes
+of the file.
+Note that any attribute that is not indicated as supported by
+.I stx_attributes_mask
+has no usable value here.
+The bits in
+.I stx_attributes_mask
+correspond bit-by-bit to
+.IR stx_attributes .
+.PP
+The flags are as follows:
+.TP
+.B STATX_ATTR_COMPRESSED
+The file is compressed by the filesystem and may take extra resources
+to access.
+.TP
+.B STATX_ATTR_IMMUTABLE
+The file cannot be modified: it cannot be deleted or renamed,
+no hard links can be created to this file and no data can be written to it.
+See
+.BR chattr (1).
+.TP
+.B STATX_ATTR_APPEND
+The file can only be opened in append mode for writing.
+Random access writing
+is not permitted.
+See
+.BR chattr (1).
+.TP
+.B STATX_ATTR_NODUMP
+File is not a candidate for backup when a backup program such as
+.BR dump (8)
+is run.
+See
+.BR chattr (1).
+.TP
+.B STATX_ATTR_ENCRYPTED
+The file is encrypted by the filesystem, and a key is required to decrypt it.
+.TP
+.BR STATX_ATTR_VERITY " (since Linux 5.5)"
+.\" commit 3ad2522c64cff1f5aebb987b00683268f0cc7c29
+The file has fs-verity enabled.
+It cannot be written to, and all reads from it will be verified
+against a cryptographic hash that covers the
+entire file (e.g., via a Merkle tree).
+.TP
+.BR STATX_ATTR_DAX " (since Linux 5.8)"
+The file is in the DAX (CPU direct access) state.
+DAX state attempts to
+minimize software cache effects for both I/O and memory mappings of this file.
+It requires a file system which has been configured to support DAX.
+.IP
+DAX generally assumes all accesses are via CPU load / store instructions
+which can minimize overhead for small accesses,
+but may adversely affect CPU utilization for large transfers.
+.IP
+File I/O is done directly to/from user-space buffers and memory mapped I/O may
+be performed with direct memory mappings that bypass the kernel page cache.
+.IP
+While the DAX property tends to result in data being transferred synchronously,
+it does not give the same guarantees as the
+.B O_SYNC
+flag (see
+.BR open (2)),
+where data and the necessary metadata are transferred together.
+.IP
+A DAX file may support being mapped with the
+.B MAP_SYNC
+flag, which enables a
+program to use CPU cache flush instructions to persist CPU store operations
+without an explicit
+.BR fsync (2).
+See
+.BR mmap (2)
+for more information.
+.TP
+.BR STATX_ATTR_MOUNT_ROOT " (since Linux 5.8)"
+.\" commit 80340fe3605c0e78cfe496c3b3878be828cfdbfe
+The file is the root of a mount.
+.SH RETURN VALUE
+On success, zero is returned.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EACCES
+Search permission is denied for one of the directories
+in the path prefix of
+.IR pathname .
+(See also
+.BR path_resolution (7).)
+.TP
+.B EBADF
+.I pathname
+is relative but
+.I dirfd
+is neither
+.B AT_FDCWD
+nor a valid file descriptor.
+.TP
+.B EFAULT
+.I pathname
+or
+.I statxbuf
+is NULL or points to a location outside the process's
+accessible address space.
+.TP
+.B EINVAL
+Invalid flag specified in
+.IR flags .
+.TP
+.B EINVAL
+Reserved flag specified in
+.IR mask .
+(Currently, there is one such flag, designated by the constant
+.BR STATX__RESERVED ,
+with the value 0x80000000U.)
+.TP
+.B ELOOP
+Too many symbolic links encountered while traversing the pathname.
+.TP
+.B ENAMETOOLONG
+.I pathname
+is too long.
+.TP
+.B ENOENT
+A component of
+.I pathname
+does not exist, or
+.I pathname
+is an empty string and
+.B AT_EMPTY_PATH
+was not specified in
+.IR flags .
+.TP
+.B ENOMEM
+Out of memory (i.e., kernel memory).
+.TP
+.B ENOTDIR
+A component of the path prefix of
+.I pathname
+is not a directory or
+.I pathname
+is relative and
+.I dirfd
+is a file descriptor referring to a file other than a directory.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 4.11,
+glibc 2.28.
+.SH SEE ALSO
+.BR ls (1),
+.BR stat (1),
+.BR access (2),
+.BR chmod (2),
+.BR chown (2),
+.BR name_to_handle_at (2),
+.BR readlink (2),
+.BR stat (2),
+.BR utime (2),
+.BR proc (5),
+.BR capabilities (7),
+.BR inode (7),
+.BR symlink (7)
diff --git a/man2/stime.2 b/man2/stime.2
new file mode 100644
index 0000000..e4d1a38
--- /dev/null
+++ b/man2/stime.2
@@ -0,0 +1,73 @@
+.\" Copyright (c) 1992 Drew Eckhardt (drew@cs.colorado.edu), March 28, 1992
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified by Michael Haardt <michael@moria.de>
+.\" Modified 1993-07-24 by Rik Faith <faith@cs.unc.edu>
+.\" Modified 2001-03-16 by Andries Brouwer <aeb@cwi.nl>
+.\" Modified 2004-05-27 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.TH stime 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+stime \- set time
+.SH SYNOPSIS
+.nf
+.B #include <time.h>
+.PP
+.BI "[[deprecated]] int stime(const time_t *" t );
+.fi
+.PP
+.RS -4
+Feature Test Macro Requirements for glibc (see
+.BR feature_test_macros (7)):
+.RE
+.PP
+.BR stime ():
+.nf
+ Since glibc 2.19:
+ _DEFAULT_SOURCE
+ glibc 2.19 and earlier:
+ _SVID_SOURCE
+.fi
+.SH DESCRIPTION
+.BR NOTE :
+This function is deprecated;
+use
+.BR clock_settime (2)
+instead.
+.PP
+.BR stime ()
+sets the system's idea of the time and date.
+The time, pointed
+to by \fIt\fP, is measured in seconds since the
+Epoch, 1970-01-01 00:00:00 +0000 (UTC).
+.BR stime ()
+may be executed only by the superuser.
+.SH RETURN VALUE
+On success, zero is returned.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EFAULT
+Error in getting information from user space.
+.TP
+.B EPERM
+The calling process has insufficient privilege.
+Under Linux, the
+.B CAP_SYS_TIME
+privilege is required.
+.SH STANDARDS
+None.
+.SH HISTORY
+SVr4.
+.PP
+Starting with glibc 2.31,
+this function is no longer available to newly linked applications
+and is no longer declared in
+.IR <time.h> .
+.SH SEE ALSO
+.BR date (1),
+.BR settimeofday (2),
+.BR capabilities (7)
diff --git a/man2/stty.2 b/man2/stty.2
new file mode 100644
index 0000000..5d25ea6
--- /dev/null
+++ b/man2/stty.2
@@ -0,0 +1 @@
+.so man2/unimplemented.2
diff --git a/man2/subpage_prot.2 b/man2/subpage_prot.2
new file mode 100644
index 0000000..4309a7d
--- /dev/null
+++ b/man2/subpage_prot.2
@@ -0,0 +1,118 @@
+.\" Copyright (c) 2010 Michael Kerrisk <mtk.manpages@gmail.com>
+.\" based on a proposal from Stephan Mueller <smueller@atsec.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Various pieces of text taken from the kernel source and the commentary
+.\" in Linux commit fa28237cfcc5827553044cbd6ee52e33692b0faa
+.\" both written by Paul Mackerras <paulus@samba.org>
+.\"
+.TH subpage_prot 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+subpage_prot \- define a subpage protection for an address range
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */"
+.B #include <unistd.h>
+.PP
+.BI "int syscall(SYS_subpage_prot, unsigned long " addr ", unsigned long " len ,
+.BI " uint32_t *" map );
+.fi
+.PP
+.IR Note :
+glibc provides no wrapper for
+.BR subpage_prot (),
+necessitating the use of
+.BR syscall (2).
+.SH DESCRIPTION
+The PowerPC-specific
+.BR subpage_prot ()
+system call provides the facility to control the access
+permissions on individual 4\ kB subpages on systems configured with
+a page size of 64\ kB.
+.PP
+The protection map is applied to the memory pages in the region starting at
+.I addr
+and continuing for
+.I len
+bytes.
+Both of these arguments must be aligned to a 64-kB boundary.
+.PP
+The protection map is specified in the buffer pointed to by
+.IR map .
+The map has 2 bits per 4\ kB subpage;
+thus each 32-bit word specifies the protections of 16 4\ kB subpages
+inside a 64\ kB page
+(so, the number of 32-bit words pointed to by
+.I map
+should equate to the number of 64-kB pages specified by
+.IR len ).
+Each 2-bit field in the protection map is either 0 to allow any access,
+1 to prevent writes, or 2 or 3 to prevent all accesses.
+.SH RETURN VALUE
+On success,
+.BR subpage_prot ()
+returns 0.
+Otherwise, one of the error codes specified below is returned.
+.SH ERRORS
+.TP
+.B EFAULT
+The buffer referred to by
+.I map
+is not accessible.
+.TP
+.B EINVAL
+The
+.I addr
+or
+.I len
+arguments are incorrect.
+Both of these arguments must be aligned to a multiple of the system page size,
+and they must not refer to a region outside of the
+address space of the process or to a region that consists of huge pages.
+.TP
+.B ENOMEM
+Out of memory.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 2.6.25 (PowerPC).
+.PP
+The system call is provided only if the kernel is configured with
+.BR CONFIG_PPC_64K_PAGES .
+.SH NOTES
+Normal page protections (at the 64-kB page level) also apply;
+the subpage protection mechanism is an additional constraint,
+so putting 0 in a 2-bit field won't allow writes to a page that is otherwise
+write-protected.
+.SS Rationale
+This system call is provided to assist writing emulators that
+operate using 64-kB pages on PowerPC systems.
+When emulating systems such as x86, which uses a smaller page size,
+the emulator can no longer use the memory-management unit (MMU)
+and normal system calls for controlling page protections.
+(The emulator could emulate the MMU by checking and possibly remapping
+the address for each memory access in software, but that is slow.)
+The idea is that the emulator supplies an array of protection masks
+to apply to a specified range of virtual addresses.
+These masks are applied at the level where hardware page-table entries (PTEs)
+are inserted into the hardware page table based on the Linux PTEs,
+so the Linux PTEs are not affected.
+Implicit in this is that the regions of the address space that are
+protected are switched to use 4-kB hardware pages rather than 64-kB
+hardware pages (on machines with hardware 64-kB page support).
+.\" In the initial implementation, it was the case that:
+.\" In fact the whole process is switched to use 4 kB hardware pages when the
+.\" subpage_prot system call is used, but this could be improved in future
+.\" to switch only the affected segments.
+.\" But Paul Mackerras says (Oct 2010): I'm pretty sure we now only switch
+.\" the affected segment, not the whole process.
+.SH SEE ALSO
+.BR mprotect (2),
+.BR syscall (2)
+.PP
+.I Documentation/admin\-guide/mm/hugetlbpage.rst
+in the Linux kernel source tree
diff --git a/man2/swapoff.2 b/man2/swapoff.2
new file mode 100644
index 0000000..2bd424c
--- /dev/null
+++ b/man2/swapoff.2
@@ -0,0 +1 @@
+.so man2/swapon.2
diff --git a/man2/swapon.2 b/man2/swapon.2
new file mode 100644
index 0000000..400f609
--- /dev/null
+++ b/man2/swapon.2
@@ -0,0 +1,197 @@
+.\" Copyright (c) 1992 Drew Eckhardt (drew@cs.colorado.edu), March 28, 1992
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified by Michael Haardt <michael@moria.de>
+.\" Modified 1993-07-24 by Rik Faith <faith@cs.unc.edu>
+.\" Modified 1995-07-22 by Michael Chastain <mec@duracef.shout.net>
+.\" Modified 1995-07-23 by aeb
+.\" Modified 1996-10-22 by Eric S. Raymond <esr@thyrsus.com>
+.\" Modified 1998-09-08 by aeb
+.\" Modified 2004-06-17 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Modified 2004-10-10 by aeb
+.\" 2004-12-14 mtk, Anand Kumria: added new errors
+.\" 2007-06-22 Ivana Varekova <varekova@redhat.com>, mtk
+.\" Update text describing limit on number of swap files.
+.\" 2021-01-17 Alex Baranowski <alex@euro-linux.com>
+.\" Update information about available swap files decreased by
+.\" CONFIG_DEVICE_PRIVATE option.
+.\"
+.\" FIXME Linux 3.11 added SWAP_FLAG_DISCARD_ONCE and SWAP_FLAG_DISCARD_PAGES
+.\" commit dcf6b7ddd7df8965727746f89c59229b23180e5a
+.\" Author: Rafael Aquini <aquini@redhat.com>
+.\" Date: Wed Jul 3 15:02:46 2013 -0700
+.\"
+.TH swapon 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+swapon, swapoff \- start/stop swapping to file/device
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/swap.h>
+.PP
+.BI "int swapon(const char *" path ", int " swapflags );
+.BI "int swapoff(const char *" path );
+.fi
+.SH DESCRIPTION
+.BR swapon ()
+sets the swap area to the file or block device specified by
+.IR path .
+.BR swapoff ()
+stops swapping to the file or block device specified by
+.IR path .
+.PP
+If the
+.B SWAP_FLAG_PREFER
+flag is specified in the
+.BR swapon ()
+.I swapflags
+argument, the new swap area will have a higher priority than default.
+The priority is encoded within
+.I swapflags
+as:
+.PP
+.in +4n
+.EX
+.I "(prio << SWAP_FLAG_PRIO_SHIFT) & SWAP_FLAG_PRIO_MASK"
+.EE
+.in
+.PP
+If the
+.B SWAP_FLAG_DISCARD
+flag is specified in the
+.BR swapon ()
+.I swapflags
+argument, freed swap pages will be discarded before they are reused,
+if the swap device supports the discard or trim operation.
+(This may improve performance on some Solid State Devices,
+but often it does not.)
+See also NOTES.
+.PP
+These functions may be used only by a privileged process (one having the
+.B CAP_SYS_ADMIN
+capability).
+.SS Priority
+Each swap area has a priority, either high or low.
+The default priority is low.
+Within the low-priority areas,
+newer areas are even lower priority than older areas.
+.PP
+All priorities set with
+.I swapflags
+are high-priority, higher than default.
+They may have any nonnegative value chosen by the caller.
+Higher numbers mean higher priority.
+.PP
+Swap pages are allocated from areas in priority order,
+highest priority first.
+For areas with different priorities,
+a higher-priority area is exhausted before using a lower-priority area.
+If two or more areas have the same priority,
+and it is the highest priority available,
+pages are allocated on a round-robin basis between them.
+.PP
+As of Linux 1.3.6, the kernel usually follows these rules,
+but there are exceptions.
+.SH RETURN VALUE
+On success, zero is returned.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EBUSY
+(for
+.BR swapon ())
+The specified
+.I path
+is already being used as a swap area.
+.TP
+.B EINVAL
+The file
+.I path
+exists, but refers neither to a regular file nor to a block device.
+.TP
+.B EINVAL
+.RB ( swapon ())
+The indicated path does not contain a valid swap signature or
+resides on an in-memory filesystem such as
+.BR tmpfs (5).
+.TP
+.BR EINVAL " (since Linux 3.4)"
+.RB ( swapon ())
+An invalid flag value was specified in
+.IR swapflags .
+.TP
+.B EINVAL
+.RB ( swapoff ())
+.I path
+is not currently a swap area.
+.TP
+.B ENFILE
+The system-wide limit on the total number of open files has been reached.
+.TP
+.B ENOENT
+The file
+.I path
+does not exist.
+.TP
+.B ENOMEM
+The system has insufficient memory to start swapping.
+.TP
+.B EPERM
+The caller does not have the
+.B CAP_SYS_ADMIN
+capability.
+Alternatively, the maximum number of swap files is already in use;
+see NOTES below.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+The
+.I swapflags
+argument was introduced in Linux 1.3.2.
+.SH NOTES
+The partition or path must be prepared with
+.BR mkswap (8).
+.PP
+There is an upper limit on the number of swap files that may be used,
+defined by the kernel constant
+.BR MAX_SWAPFILES .
+Before Linux 2.4.10,
+.B MAX_SWAPFILES
+had the value 8;
+since Linux 2.4.10, it has the value 32.
+Since Linux 2.6.18, the limit is decreased by 2 (thus: 30)
+if the kernel is built with the
+.B CONFIG_MIGRATION
+option
+(which reserves two swap table entries for the page migration features of
+.BR mbind (2)
+and
+.BR migrate_pages (2)).
+Since Linux 2.6.32, the limit is further decreased by 1
+if the kernel is built with the
+.B CONFIG_MEMORY_FAILURE
+option.
+Since Linux 5.14, the limit is further decreased by 4
+if the kernel is built with the
+.B CONFIG_DEVICE_PRIVATE
+option.
+.PP
+Discard of swap pages was introduced in Linux 2.6.29,
+then made conditional
+on the
+.B SWAP_FLAG_DISCARD
+flag in Linux 2.6.36,
+.\" To be precise: 2.6.35.5
+which still discards the
+entire swap area when
+.BR swapon ()
+is called, even if that flag bit is not set.
+.SH SEE ALSO
+.BR mkswap (8),
+.BR swapoff (8),
+.BR swapon (8)
diff --git a/man2/symlink.2 b/man2/symlink.2
new file mode 100644
index 0000000..dd87f2d
--- /dev/null
+++ b/man2/symlink.2
@@ -0,0 +1,265 @@
+.\" This manpage is Copyright (C) 1992 Drew Eckhardt;
+.\" and Copyright (C) 1993 Michael Haardt, Ian Jackson.
+.\" and Copyright (C) 2006, 2014 Michael Kerrisk
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified 1993-07-24 by Rik Faith
+.\" Modified 1996-04-26 by Nick Duffek <nsd@bbc.com>
+.\" Modified 1996-11-06 by Eric S. Raymond <esr@thyrsus.com>
+.\" Modified 1997-01-31 by Eric S. Raymond <esr@thyrsus.com>
+.\" Modified 2004-06-23 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.TH symlink 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+symlink, symlinkat \- make a new name for a file
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <unistd.h>
+.PP
+.BI "int symlink(const char *" target ", const char *" linkpath );
+.PP
+.BR "#include <fcntl.h> " "/* Definition of " AT_* " constants */"
+.B #include <unistd.h>
+.PP
+.BI "int symlinkat(const char *" target ", int " newdirfd \
+", const char *" linkpath );
+.PP
+.fi
+.RS -4
+Feature Test Macro Requirements for glibc (see
+.BR feature_test_macros (7)):
+.RE
+.PP
+.BR symlink ():
+.nf
+ _XOPEN_SOURCE >= 500 || _POSIX_C_SOURCE >= 200112L
+.\" || _XOPEN_SOURCE && _XOPEN_SOURCE_EXTENDED
+ || /* glibc <= 2.19: */ _BSD_SOURCE
+.fi
+.PP
+.BR symlinkat ():
+.nf
+ Since glibc 2.10:
+ _POSIX_C_SOURCE >= 200809L
+ Before glibc 2.10:
+ _ATFILE_SOURCE
+.fi
+.SH DESCRIPTION
+.BR symlink ()
+creates a symbolic link named
+.I linkpath
+which contains the string
+.IR target .
+.PP
+Symbolic links are interpreted at run time as if the contents of the
+link had been substituted into the path being followed to find a file or
+directory.
+.PP
+Symbolic links may contain
+.I ..
+path components, which (if used at the start of the link) refer to the
+parent directories of that in which the link resides.
+.PP
+A symbolic link (also known as a soft link) may point to an existing
+file or to a nonexistent one; the latter case is known as a dangling
+link.
+.PP
+The permissions of a symbolic link are irrelevant; the ownership is
+ignored when following the link
+(except when the
+.I protected_symlinks
+feature is enabled, as explained in
+.BR proc (5)),
+but is checked when removal or
+renaming of the link is requested and the link is in a directory with
+the sticky bit
+.RB ( S_ISVTX )
+set.
+.PP
+If
+.I linkpath
+exists, it will
+.I not
+be overwritten.
+.SS symlinkat()
+The
+.BR symlinkat ()
+system call operates in exactly the same way as
+.BR symlink (),
+except for the differences described here.
+.PP
+If the pathname given in
+.I linkpath
+is relative, then it is interpreted relative to the directory
+referred to by the file descriptor
+.I newdirfd
+(rather than relative to the current working directory of
+the calling process, as is done by
+.BR symlink ()
+for a relative pathname).
+.PP
+If
+.I linkpath
+is relative and
+.I newdirfd
+is the special value
+.BR AT_FDCWD ,
+then
+.I linkpath
+is interpreted relative to the current working
+directory of the calling process (like
+.BR symlink ()).
+.PP
+If
+.I linkpath
+is absolute, then
+.I newdirfd
+is ignored.
+.PP
+See
+.BR openat (2)
+for an explanation of the need for
+.BR symlinkat ().
+.SH RETURN VALUE
+On success, zero is returned.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EACCES
+Write access to the directory containing
+.I linkpath
+is denied, or one of the directories in the path prefix of
+.I linkpath
+did not allow search permission.
+(See also
+.BR path_resolution (7).)
+.TP
+.B EBADF
+.RB ( symlinkat ())
+.I linkpath
+is relative but
+.I newdirfd
+is neither
+.B AT_FDCWD
+nor a valid file descriptor.
+.TP
+.B EDQUOT
+The user's quota of resources on the filesystem has been exhausted.
+The resources could be inodes or disk blocks, depending on the filesystem
+implementation.
+.TP
+.B EEXIST
+.I linkpath
+already exists.
+.TP
+.B EFAULT
+.IR target " or " linkpath " points outside your accessible address space."
+.TP
+.B EIO
+An I/O error occurred.
+.TP
+.B ELOOP
+Too many symbolic links were encountered in resolving
+.IR linkpath .
+.TP
+.B ENAMETOOLONG
+.IR target " or " linkpath " was too long."
+.TP
+.B ENOENT
+A directory component in
+.I linkpath
+does not exist or is a dangling symbolic link, or
+.I target
+or
+.I linkpath
+is an empty string.
+.TP
+.B ENOENT
+.RB ( symlinkat ())
+.I linkpath
+is a relative pathname and
+.I newdirfd
+refers to a directory that has been deleted.
+.TP
+.B ENOMEM
+Insufficient kernel memory was available.
+.TP
+.B ENOSPC
+The device containing the file has no room for the new directory
+entry.
+.TP
+.B ENOTDIR
+A component used as a directory in
+.I linkpath
+is not, in fact, a directory.
+.TP
+.B ENOTDIR
+.RB ( symlinkat ())
+.I linkpath
+is relative and
+.I newdirfd
+is a file descriptor referring to a file other than a directory.
+.TP
+.B EPERM
+The filesystem containing
+.I linkpath
+does not support the creation of symbolic links.
+.TP
+.B EROFS
+.I linkpath
+is on a read-only filesystem.
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+.TP
+.BR symlink ()
+SVr4, 4.3BSD, POSIX.1-2001.
+.\" SVr4 documents additional error codes EDQUOT and ENOSYS.
+.\" See
+.\" .BR open (2)
+.\" re multiple files with the same name, and NFS.
+.TP
+.BR symlinkat ()
+POSIX.1-2008.
+Linux 2.6.16,
+glibc 2.4.
+.SS glibc notes
+On older kernels where
+.BR symlinkat ()
+is unavailable, the glibc wrapper function falls back to the use of
+.BR symlink ().
+When
+.I linkpath
+is a relative pathname,
+glibc constructs a pathname based on the symbolic link in
+.I /proc/self/fd
+that corresponds to the
+.I newdirfd
+argument.
+.SH NOTES
+No checking of
+.I target
+is done.
+.PP
+Deleting the name referred to by a symbolic link will actually delete the
+file (unless it also has other hard links).
+If this behavior is not desired, use
+.BR link (2).
+.SH SEE ALSO
+.BR ln (1),
+.BR namei (1),
+.BR lchown (2),
+.BR link (2),
+.BR lstat (2),
+.BR open (2),
+.BR readlink (2),
+.BR rename (2),
+.BR unlink (2),
+.BR path_resolution (7),
+.BR symlink (7)
diff --git a/man2/symlinkat.2 b/man2/symlinkat.2
new file mode 100644
index 0000000..78568cd
--- /dev/null
+++ b/man2/symlinkat.2
@@ -0,0 +1 @@
+.so man2/symlink.2
diff --git a/man2/sync.2 b/man2/sync.2
new file mode 100644
index 0000000..21f01d7
--- /dev/null
+++ b/man2/sync.2
@@ -0,0 +1,146 @@
+.\" Copyright (c) 1992 Drew Eckhardt (drew@cs.colorado.edu), March 28, 1992
+.\" and Copyright (c) 2011 Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified by Michael Haardt <michael@moria.de>
+.\" Modified Sat Jul 24 12:02:47 1993 by Rik Faith <faith@cs.unc.edu>
+.\" Modified 15 Apr 1995 by Michael Chastain <mec@shell.portal.com>:
+.\" Added reference to `bdflush(2)'.
+.\" Modified 960414 by Andries Brouwer <aeb@cwi.nl>:
+.\" Added the fact that since 1.3.20 sync actually waits.
+.\" Modified Tue Oct 22 22:27:07 1996 by Eric S. Raymond <esr@thyrsus.com>
+.\" Modified 2001-10-10 by aeb, following Michael Kerrisk.
+.\" 2011-09-07, mtk, Added syncfs() documentation,
+.\"
+.TH sync 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+sync, syncfs \- commit filesystem caches to disk
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <unistd.h>
+.PP
+.B void sync(void);
+.PP
+.BI "int syncfs(int " fd );
+.fi
+.PP
+.RS -4
+Feature Test Macro Requirements for glibc (see
+.BR feature_test_macros (7)):
+.RE
+.PP
+.BR sync ():
+.nf
+ _XOPEN_SOURCE >= 500
+.\" || _XOPEN_SOURCE && _XOPEN_SOURCE_EXTENDED
+ || /* Since glibc 2.19: */ _DEFAULT_SOURCE
+ || /* glibc <= 2.19: */ _BSD_SOURCE
+.fi
+.PP
+.BR syncfs ():
+.nf
+ _GNU_SOURCE
+.fi
+.SH DESCRIPTION
+.BR sync ()
+causes all pending modifications to filesystem metadata and cached file
+data to be written to the underlying filesystems.
+.PP
+.BR syncfs ()
+is like
+.BR sync (),
+but synchronizes just the filesystem containing the file
+referred to by the open file descriptor
+.IR fd .
+.SH RETURN VALUE
+.BR syncfs ()
+returns 0 on success;
+on error, it returns \-1 and sets
+.I errno
+to indicate the error.
+.SH ERRORS
+.BR sync ()
+is always successful.
+.PP
+.BR syncfs ()
+can fail for at least the following reasons:
+.TP
+.B EBADF
+.I fd
+is not a valid file descriptor.
+.TP
+.B EIO
+An error occurred during synchronization.
+This error may relate to data written to any file on the filesystem, or to
+metadata related to the filesystem itself.
+.TP
+.B ENOSPC
+Disk space was exhausted while synchronizing.
+.TP
+.BR ENOSPC ", " EDQUOT
+Data was written to a file on NFS or another filesystem which does not
+allocate space at the time of a
+.BR write (2)
+system call, and some previous write failed due to insufficient
+storage space.
+.SH VERSIONS
+According to the standard specification (e.g., POSIX.1-2001),
+.BR sync ()
+schedules the writes, but may return before the actual
+writing is done.
+However, Linux waits for I/O completions,
+and thus
+.BR sync ()
+or
+.BR syncfs ()
+provide the same guarantees as
+.BR fsync (2)
+called on every file in
+the system or filesystem respectively.
+.SH STANDARDS
+.TP
+.BR sync ()
+POSIX.1-2008.
+.TP
+.BR syncfs ()
+Linux.
+.SH HISTORY
+.TP
+.BR sync ()
+POSIX.1-2001, SVr4, 4.3BSD.
+.TP
+.BR syncfs ()
+Linux 2.6.39,
+glibc 2.14.
+.PP
+Since glibc 2.2.2, the Linux prototype for
+.BR sync ()
+is as listed above,
+following the various standards.
+In glibc 2.2.1 and earlier,
+it was "int sync(void)", and
+.BR sync ()
+always returned 0.
+.PP
+In mainline kernel versions prior to Linux 5.8,
+.BR syncfs ()
+will fail only when passed a bad file descriptor
+.RB ( EBADF ).
+Since Linux 5.8,
+.\" commit 735e4ae5ba28c886d249ad04d3c8cc097dad6336
+.BR syncfs ()
+will also report an error if one or more inodes failed
+to be written back since the last
+.BR syncfs ()
+call.
+.SH BUGS
+Before Linux 1.3.20, Linux did not wait for I/O to complete
+before returning.
+.SH SEE ALSO
+.BR sync (1),
+.BR fdatasync (2),
+.BR fsync (2)
diff --git a/man2/sync_file_range.2 b/man2/sync_file_range.2
new file mode 100644
index 0000000..f324f75
--- /dev/null
+++ b/man2/sync_file_range.2
@@ -0,0 +1,213 @@
+.\" Copyright (c) 2006 Andrew Morton <akpm@osdl.org>
+.\" and Copyright 2006 Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" 2006-07-05 Initial creation, Michael Kerrisk based on
+.\" Andrew Morton's comments in fs/sync.c
+.\" 2010-10-09, mtk, Document sync_file_range2()
+.\"
+.TH sync_file_range 2 2023-07-15 "Linux man-pages 6.05.01"
+.SH NAME
+sync_file_range \- sync a file segment with disk
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#define _GNU_SOURCE" " /* See feature_test_macros(7) */"
+.B #define _FILE_OFFSET_BITS 64
+.B #include <fcntl.h>
+.PP
+.BI "int sync_file_range(int " fd ", off_t " offset ", off_t " nbytes ,
+.BI " unsigned int " flags );
+.fi
+.SH DESCRIPTION
+.BR sync_file_range ()
+permits fine control when synchronizing the open file referred to by the
+file descriptor
+.I fd
+with disk.
+.PP
+.I offset
+is the starting byte of the file range to be synchronized.
+.I nbytes
+specifies the length of the range to be synchronized, in bytes; if
+.I nbytes
+is zero, then all bytes from
+.I offset
+through to the end of file are synchronized.
+Synchronization is in units of the system page size:
+.I offset
+is rounded down to a page boundary;
+.I (offset+nbytes\-1)
+is rounded up to a page boundary.
+.PP
+The
+.I flags
+bit-mask argument can include any of the following values:
+.TP
+.B SYNC_FILE_RANGE_WAIT_BEFORE
+Wait upon write-out of all pages in the specified range
+that have already been submitted to the device driver for write-out
+before performing any write.
+.TP
+.B SYNC_FILE_RANGE_WRITE
+Initiate write-out of all dirty pages in the specified
+range which are not presently submitted for write-out.
+Note that even this may block if you attempt to
+write more than the request queue size.
+.TP
+.B SYNC_FILE_RANGE_WAIT_AFTER
+Wait upon write-out of all pages in the range
+after performing any write.
+.PP
+Specifying
+.I flags
+as 0 is permitted, as a no-op.
+.SS Warning
+This system call is extremely dangerous and should not be used in portable
+programs.
+None of these operations writes out the file's metadata.
+Therefore, unless the application is strictly performing overwrites of
+already-instantiated disk blocks, there are no guarantees that the data will
+be available after a crash.
+There is no user interface to know if a write is purely an overwrite.
+On filesystems using copy-on-write semantics (e.g.,
+.IR btrfs )
+an overwrite of existing allocated blocks is impossible.
+When writing into preallocated space,
+many filesystems also require calls into the block
+allocator, which this system call does not sync out to disk.
+This system call does not flush disk write caches and thus does not provide
+any data integrity on systems with volatile disk write caches.
+.SS Some details
+.B SYNC_FILE_RANGE_WAIT_BEFORE
+and
+.B SYNC_FILE_RANGE_WAIT_AFTER
+will detect any
+I/O errors or
+.B ENOSPC
+conditions and will return these to the caller.
+.PP
+Useful combinations of the
+.I flags
+bits are:
+.TP
+.B SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE
+Ensures that all pages
+in the specified range which were dirty when
+.BR sync_file_range ()
+was called are placed
+under write-out.
+This is a start-write-for-data-integrity operation.
+.TP
+.B SYNC_FILE_RANGE_WRITE
+Start write-out of all dirty pages in the specified range which
+are not presently under write-out.
+This is an asynchronous flush-to-disk
+operation.
+This is not suitable for data integrity operations.
+.TP
+.BR SYNC_FILE_RANGE_WAIT_BEFORE " (or " SYNC_FILE_RANGE_WAIT_AFTER )
+Wait for
+completion of write-out of all pages in the specified range.
+This can be used after an earlier
+.B SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE
+operation to wait for completion of that operation, and obtain its result.
+.TP
+.B SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE | \
+SYNC_FILE_RANGE_WAIT_AFTER
+This is a write-for-data-integrity operation
+that will ensure that all pages in the specified range which were dirty when
+.BR sync_file_range ()
+was called are committed to disk.
+.SH RETURN VALUE
+On success,
+.BR sync_file_range ()
+returns 0; on failure \-1 is returned and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EBADF
+.I fd
+is not a valid file descriptor.
+.TP
+.B EINVAL
+.I flags
+specifies an invalid bit; or
+.I offset
+or
+.I nbytes
+is invalid.
+.TP
+.B EIO
+I/O error.
+.TP
+.B ENOMEM
+Out of memory.
+.TP
+.B ENOSPC
+Out of disk space.
+.TP
+.B ESPIPE
+.I fd
+refers to something other than a regular file, a block device, or
+a directory.
+.SH VERSIONS
+.SS sync_file_range2()
+Some architectures (e.g., PowerPC, ARM)
+need 64-bit arguments to be aligned in a suitable pair of registers.
+.\" See kernel commit edd5cd4a9424f22b0fa08bef5e299d41befd5622
+On such architectures, the call signature of
+.BR sync_file_range ()
+shown in the SYNOPSIS would force
+a register to be wasted as padding between the
+.I fd
+and
+.I offset
+arguments.
+(See
+.BR syscall (2)
+for details.)
+Therefore, these architectures define a different
+system call that orders the arguments suitably:
+.PP
+.in +4n
+.EX
+.BI "int sync_file_range2(int " fd ", unsigned int " flags ,
+.BI " off_t " offset ", off_t " nbytes );
+.EE
+.in
+.PP
+The behavior of this system call is otherwise exactly the same as
+.BR sync_file_range ().
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 2.6.17.
+.SS sync_file_range2()
+A system call with this signature first appeared on the ARM architecture
+in Linux 2.6.20, with the name
+.BR arm_sync_file_range ().
+It was renamed in Linux 2.6.22,
+when the analogous system call was added for PowerPC.
+On architectures where glibc support is provided,
+glibc transparently wraps
+.BR sync_file_range2 ()
+under the name
+.BR sync_file_range ().
+.SH NOTES
+.B _FILE_OFFSET_BITS
+should be defined to be 64 in code that takes the address of
+.BR sync_file_range ,
+if the code is intended to be portable
+to traditional 32-bit x86 and ARM platforms where
+.IR off_t 's
+width defaults to 32 bits.
+.SH SEE ALSO
+.BR fdatasync (2),
+.BR fsync (2),
+.BR msync (2),
+.BR sync (2)
diff --git a/man2/sync_file_range2.2 b/man2/sync_file_range2.2
new file mode 100644
index 0000000..ad7a1e6
--- /dev/null
+++ b/man2/sync_file_range2.2
@@ -0,0 +1 @@
+.so man2/sync_file_range.2
diff --git a/man2/syncfs.2 b/man2/syncfs.2
new file mode 100644
index 0000000..5555798
--- /dev/null
+++ b/man2/syncfs.2
@@ -0,0 +1 @@
+.so man2/sync.2
diff --git a/man2/syscall.2 b/man2/syscall.2
new file mode 100644
index 0000000..43f054a
--- /dev/null
+++ b/man2/syscall.2
@@ -0,0 +1,367 @@
+'\" t
+.\" Copyright (c) 1980, 1991, 1993
+.\" The Regents of the University of California. All rights reserved.
+.\"
+.\" SPDX-License-Identifier: BSD-4-Clause-UC
+.\"
+.\" @(#)syscall.2 8.1 (Berkeley) 6/16/93
+.\"
+.\"
+.\" 2002-03-20 Christoph Hellwig <hch@infradead.org>
+.\" - adopted for Linux
+.\" 2015-01-17, Kees Cook <keescook@chromium.org>
+.\" Added mips and arm64.
+.\"
+.TH syscall 2 2023-05-03 "Linux man-pages 6.05.01"
+.SH NAME
+syscall \- indirect system call
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */"
+.B #include <unistd.h>
+.PP
+.BI "long syscall(long " number ", ...);"
+.fi
+.PP
+.RS -4
+Feature Test Macro Requirements for glibc (see
+.BR feature_test_macros (7)):
+.RE
+.PP
+.BR syscall ():
+.nf
+ Since glibc 2.19:
+ _DEFAULT_SOURCE
+ Before glibc 2.19:
+ _BSD_SOURCE || _SVID_SOURCE
+.fi
+.SH DESCRIPTION
+.BR syscall ()
+is a small library function that invokes
+the system call whose assembly language
+interface has the specified
+.I number
+with the specified arguments.
+Employing
+.BR syscall ()
+is useful, for example,
+when invoking a system call that has no wrapper function in the C library.
+.PP
+.BR syscall ()
+saves CPU registers before making the system call,
+restores the registers upon return from the system call,
+and stores any error returned by the system call in
+.BR errno (3).
+.PP
+Symbolic constants for system call numbers can be found in the header file
+.IR <sys/syscall.h> .
+.SH RETURN VALUE
+The return value is defined by the system call being invoked.
+In general, a 0 return value indicates success.
+A \-1 return value indicates an error,
+and an error number is stored in
+.IR errno .
+.SH ERRORS
+.TP
+.B ENOSYS
+The requested system call number is not implemented.
+.PP
+Other errors are specific to the invoked system call.
+.SH NOTES
+.BR syscall ()
+first appeared in
+4BSD.
+.SS Architecture-specific requirements
+Each architecture ABI has its own requirements on how
+system call arguments are passed to the kernel.
+For system calls that have a glibc wrapper (e.g., most system calls),
+glibc handles the details of copying arguments to the right registers
+in a manner suitable for the architecture.
+However, when using
+.BR syscall ()
+to make a system call,
+the caller might need to handle architecture-dependent details;
+this requirement is most commonly encountered on certain 32-bit architectures.
+.PP
+For example, on the ARM architecture Embedded ABI (EABI), a
+64-bit value (e.g.,
+.IR "long long" )
+must be aligned to an even register pair.
+Thus, using
+.BR syscall ()
+instead of the wrapper provided by glibc,
+the
+.BR readahead (2)
+system call would be invoked as follows on the ARM architecture with the EABI
+in little endian mode:
+.PP
+.in +4n
+.EX
+syscall(SYS_readahead, fd, 0,
+ (unsigned int) (offset & 0xFFFFFFFF),
+ (unsigned int) (offset >> 32),
+ count);
+.EE
+.in
+.PP
+Since the offset argument is 64 bits, and the first argument
+.RI ( fd )
+is passed in
+.IR r0 ,
+the caller must manually split and align the 64-bit value
+so that it is passed in the
+.IR r2 / r3
+register pair.
+That means inserting a dummy value into
+.I r1
+(the second argument of 0).
+Care also must be taken so that the split follows endian conventions
+(according to the C ABI for the platform).
+.PP
+Similar issues can occur on MIPS with the O32 ABI,
+on PowerPC and parisc with the 32-bit ABI, and on Xtensa.
+.\" Mike Frysinger: this issue ends up forcing MIPS
+.\" O32 to take 7 arguments to syscall()
+.PP
+.\" See arch/parisc/kernel/sys_parisc.c.
+Note that while the parisc C ABI also uses aligned register pairs,
+it uses a shim layer to hide the issue from user space.
+.PP
+The affected system calls are
+.BR fadvise64_64 (2),
+.BR ftruncate64 (2),
+.BR posix_fadvise (2),
+.BR pread64 (2),
+.BR pwrite64 (2),
+.BR readahead (2),
+.BR sync_file_range (2),
+and
+.BR truncate64 (2).
+.PP
+.\" You need to look up the syscalls directly in the kernel source to see if
+.\" they should be in this list. For example, look at fs/read_write.c and
+.\" the function signatures that do:
+.\" ..., unsigned long, pos_l, unsigned long, pos_h, ...
+.\" If they use off_t, then they most likely do not belong in this list.
+This does not affect syscalls that manually split and assemble 64-bit values
+such as
+.BR _llseek (2),
+.BR preadv (2),
+.BR preadv2 (2),
+.BR pwritev (2),
+and
+.BR pwritev2 (2).
+Welcome to the wonderful world of historical baggage.
+.SS Architecture calling conventions
+Every architecture has its own way of invoking and passing arguments to the
+kernel.
+The details for various architectures are listed in the two tables below.
+.PP
+The first table lists the instruction used to transition to kernel mode
+(which might not be the fastest or best way to transition to the kernel,
+so you might have to refer to
+.BR vdso (7)),
+the register used to indicate the system call number,
+the register(s) used to return the system call result,
+and the register used to signal an error.
+.if t \{\
+.ft CW
+\}
+.TS
+l2 l2 l2 l2 l1 l2 l.
+Arch/ABI Instruction System Ret Ret Error Notes
+ call # val val2
+_
+alpha callsys v0 v0 a4 a3 1, 6
+arc trap0 r8 r0 - -
+arm/OABI swi NR - r0 - - 2
+arm/EABI swi 0x0 r7 r0 r1 -
+arm64 svc #0 w8 x0 x1 -
+blackfin excpt 0x0 P0 R0 - -
+i386 int $0x80 eax eax edx -
+ia64 break 0x100000 r15 r8 r9 r10 1, 6
+loongarch syscall 0 a7 a0 - -
+m68k trap #0 d0 d0 - -
+microblaze brki r14,8 r12 r3 - -
+mips syscall v0 v0 v1 a3 1, 6
+nios2 trap r2 r2 - r7
+parisc ble 0x100(%sr2, %r0) r20 r28 - -
+powerpc sc r0 r3 - r0 1
+powerpc64 sc r0 r3 - cr0.SO 1
+riscv ecall a7 a0 a1 -
+s390 svc 0 r1 r2 r3 - 3
+s390x svc 0 r1 r2 r3 - 3
+superh trapa #31 r3 r0 r1 - 4, 6
+sparc/32 t 0x10 g1 o0 o1 psr/csr 1, 6
+sparc/64 t 0x6d g1 o0 o1 psr/csr 1, 6
+tile swint1 R10 R00 - R01 1
+x86-64 syscall rax rax rdx - 5
+x32 syscall rax rax rdx - 5
+xtensa syscall a2 a2 - -
+.TE
+.PP
+Notes:
+.IP \[bu] 3
+On a few architectures,
+a register is used as a boolean
+(0 indicating no error, and \-1 indicating an error) to signal that the
+system call failed.
+The actual error value is still contained in the return register.
+On sparc, the carry bit
+.RI ( csr )
+in the processor status register
+.RI ( psr )
+is used instead of a full register.
+On powerpc64, the summary overflow bit
+.RI ( SO )
+in field 0 of the condition register
+.RI ( cr0 )
+is used.
+.IP \[bu]
+.I NR
+is the system call number.
+.IP \[bu]
+For s390 and s390x,
+.I NR
+(the system call number) may be passed directly with
+.I "svc\ NR"
+if it is less than 256.
+.IP \[bu]
+On SuperH additional trap numbers are supported for historic reasons, but
+.B trapa #31
+is the recommended "unified" ABI.
+.IP \[bu]
+The x32 ABI shares its syscall table with the x86-64 ABI, but there are some
+nuances:
+.RS
+.IP \[bu] 3
+In order to indicate that a system call is called under the x32 ABI,
+an additional bit,
+.BR __X32_SYSCALL_BIT ,
+is bitwise ORed with the system call number.
+The ABI used by a process affects some process behaviors,
+including signal handling or system call restarting.
+.IP \[bu]
+Since x32 has different sizes for
+.I long
+and pointer types, layouts of some (but not all;
+.I struct timeval
+or
+.I struct rlimit
+are 64-bit, for example) structures are different.
+In order to handle this,
+additional system calls are added to the system call table,
+starting from number 512
+(without the
+.BR __X32_SYSCALL_BIT ).
+For example,
+.B __NR_readv
+is defined as 19 for the x86-64 ABI and as
+.IR __X32_SYSCALL_BIT " | " \fB515\fP
+for the x32 ABI.
+Most of these additional system calls are actually identical
+to the system calls used for providing i386 compat.
+There are some notable exceptions, however, such as
+.BR preadv2 (2),
+which uses
+.I struct iovec
+entities with 4-byte pointers and sizes ("compat_iovec" in kernel terms),
+but passes an 8-byte
+.I pos
+argument in a single register and not two, as is done in every other ABI.
+.RE
+.IP \[bu]
+Some architectures
+(namely, Alpha, IA-64, MIPS, SuperH, sparc/32, and sparc/64)
+use an additional register ("Ret val2" in the above table)
+to pass back a second return value from the
+.BR pipe (2)
+system call;
+Alpha uses this technique in the architecture-specific
+.BR getxpid (2),
+.BR getxuid (2),
+and
+.BR getxgid (2)
+system calls as well.
+Other architectures do not use the second return value register
+in the system call interface, even if it is defined in the System V ABI.
+.if t \{\
+.in
+.ft P
+\}
+.PP
+The second table shows the registers used to pass the system call arguments.
+.if t \{\
+.ft CW
+\}
+.TS
+l l2 l2 l2 l2 l2 l2 l2 l.
+Arch/ABI arg1 arg2 arg3 arg4 arg5 arg6 arg7 Notes
+_
+alpha a0 a1 a2 a3 a4 a5 -
+arc r0 r1 r2 r3 r4 r5 -
+arm/OABI r0 r1 r2 r3 r4 r5 r6
+arm/EABI r0 r1 r2 r3 r4 r5 r6
+arm64 x0 x1 x2 x3 x4 x5 -
+blackfin R0 R1 R2 R3 R4 R5 -
+i386 ebx ecx edx esi edi ebp -
+ia64 out0 out1 out2 out3 out4 out5 -
+loongarch a0 a1 a2 a3 a4 a5 a6
+m68k d1 d2 d3 d4 d5 a0 -
+microblaze r5 r6 r7 r8 r9 r10 -
+mips/o32 a0 a1 a2 a3 - - - 1
+mips/n32,64 a0 a1 a2 a3 a4 a5 -
+nios2 r4 r5 r6 r7 r8 r9 -
+parisc r26 r25 r24 r23 r22 r21 -
+powerpc r3 r4 r5 r6 r7 r8 r9
+powerpc64 r3 r4 r5 r6 r7 r8 -
+riscv a0 a1 a2 a3 a4 a5 -
+s390 r2 r3 r4 r5 r6 r7 -
+s390x r2 r3 r4 r5 r6 r7 -
+superh r4 r5 r6 r7 r0 r1 r2
+sparc/32 o0 o1 o2 o3 o4 o5 -
+sparc/64 o0 o1 o2 o3 o4 o5 -
+tile R00 R01 R02 R03 R04 R05 -
+x86-64 rdi rsi rdx r10 r8 r9 -
+x32 rdi rsi rdx r10 r8 r9 -
+xtensa a6 a3 a4 a5 a8 a9 -
+.TE
+.PP
+Notes:
+.IP \[bu] 3
+The mips/o32 system call convention passes
+arguments 5 through 8 on the user stack.
+.if t \{\
+.in
+.ft P
+\}
+.PP
+Note that these tables don't cover the entire calling convention\[em]some
+architectures may indiscriminately clobber other registers not listed here.
+.SH EXAMPLES
+.\" SRC BEGIN (syscall.c)
+.EX
+#define _GNU_SOURCE
+#include <signal.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+\&
+int
+main(void)
+{
+ pid_t tid;
+\&
+ tid = syscall(SYS_gettid);
+ syscall(SYS_tgkill, getpid(), tid, SIGHUP);
+}
+.EE
+.\" SRC END
+.SH SEE ALSO
+.BR _syscall (2),
+.BR intro (2),
+.BR syscalls (2),
+.BR errno (3),
+.BR vdso (7)
diff --git a/man2/syscalls.2 b/man2/syscalls.2
new file mode 100644
index 0000000..1011c14
--- /dev/null
+++ b/man2/syscalls.2
@@ -0,0 +1,1168 @@
+'\" t
+.\" Copyright (C) 2007 Michael Kerrisk <mtk.manpages@gmail.com>
+.\" with some input from Stepan Kasal <kasal@ucw.cz>
+.\"
+.\" Some content retained from an earlier version of this page:
+.\" Copyright (C) 1998 Andries Brouwer (aeb@cwi.nl)
+.\" Modifications for 2.2 and 2.4 Copyright (C) 2002 Ian Redfern
+.\" <redferni@logica.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.TH syscalls 2 2023-07-30 "Linux man-pages 6.05.01"
+.SH NAME
+syscalls \- Linux system calls
+.SH SYNOPSIS
+.nf
+Linux system calls.
+.fi
+.SH DESCRIPTION
+The system call is the fundamental interface between an application
+and the Linux kernel.
+.SS System calls and library wrapper functions
+System calls are generally not invoked directly,
+but rather via wrapper functions in glibc (or perhaps some other library).
+For details of direct invocation of a system call, see
+.BR intro (2).
+Often, but not always, the name of the wrapper function is the same
+as the name of the system call that it invokes.
+For example, glibc contains a function
+.BR chdir ()
+which invokes the underlying "chdir" system call.
+.PP
+Often the glibc wrapper function is quite thin, doing little work
+other than copying arguments to the right registers
+before invoking the system call,
+and then setting
+.I errno
+appropriately after the system call has returned.
+(These are the same steps that are performed by
+.BR syscall (2),
+which can be used to invoke system calls
+for which no wrapper function is provided.)
+Note: system calls indicate a failure by returning a negative error
+number to the caller on architectures without a separate error register/flag,
+as noted in
+.BR syscall (2);
+when this happens,
+the wrapper function negates the returned error number
+(to make it positive), copies it to
+.IR errno ,
+and returns \-1 to the caller of the wrapper.
+.PP
+Sometimes, however, the wrapper function does some extra work
+before invoking the system call.
+For example, nowadays there are (for reasons described below) two
+related system calls,
+.BR truncate (2)
+and
+.BR truncate64 (2),
+and the glibc
+.BR truncate ()
+wrapper function checks which of those system calls
+are provided by the kernel and determines which should be employed.
+.SS System call list
+Below is a list of the Linux system calls.
+In the list, the
+.I Kernel
+column indicates the kernel version
+for those system calls that were new in Linux 2.2,
+or have appeared since that kernel version.
+Note the following points:
+.IP \[bu] 3
+Where no kernel version is indicated,
+the system call appeared in Linux 1.0 or earlier.
+.IP \[bu]
+Where a system call is marked "1.2"
+this means the system call probably appeared in a Linux 1.1.x kernel version,
+and first appeared in a stable kernel with 1.2.
+(Development of the 1.2 kernel was initiated from a branch of kernel
+1.0.6 via the 1.1.x unstable kernel series.)
+.IP \[bu]
+Where a system call is marked "2.0"
+this means the system call probably appeared in a Linux 1.3.x kernel version,
+and first appeared in a stable kernel with Linux 2.0.
+(Development of the Linux 2.0 kernel was initiated from a branch of
+Linux 1.2.x, somewhere around Linux 1.2.10,
+via the Linux 1.3.x unstable kernel series.)
+.\" Was Linux 2.0 started from a branch of Linux 1.2.10?
+.\" At least from the timestamps of the tarballs
+.\" of Linux 1.2.10 and Linux 1.3.0, that's how it looks, but in
+.\" fact the diff doesn't seem very clear, the
+.\" Linux 1.3.0 .tar.bz is much bigger (2.0 MB) than the
+.\" Linux 1.2.10 .tar.bz2 (1.8 MB), and AEB points out the
+.\" timestamps of some files in Linux 1.3.0 seem to be older
+.\" than those in Linux 1.2.10. All of this suggests
+.\" that there might not have been a clean branch point.
+.IP \[bu]
+Where a system call is marked "2.2"
+this means the system call probably appeared in a Linux 2.1.x kernel version,
+and first appeared in a stable kernel with Linux 2.2.0.
+(Development of the Linux 2.2 kernel was initiated from a branch of kernel
+Linux 2.0.21 via the Linux 2.1.x unstable kernel series.)
+.IP \[bu]
+Where a system call is marked "2.4"
+this means the system call probably appeared in a Linux 2.3.x kernel version,
+and first appeared in a stable kernel with Linux 2.4.0.
+(Development of the Linux 2.4 kernel was initiated from a branch of
+Linux 2.2.8 via the Linux 2.3.x unstable kernel series.)
+.IP \[bu]
+Where a system call is marked "2.6"
+this means the system call probably appeared in a Linux 2.5.x kernel version,
+and first appeared in a stable kernel with Linux 2.6.0.
+(Development of Linux 2.6 was initiated from a branch
+of Linux 2.4.15 via the Linux 2.5.x unstable kernel series.)
+.IP \[bu]
+Starting with Linux 2.6.0, the development model changed,
+and new system calls may appear in each Linux 2.6.x release.
+In this case, the exact version number where the system call appeared
+is shown.
+This convention continues with the Linux 3.x kernel series,
+which followed on from Linux 2.6.39; and the Linux 4.x kernel series,
+which followed on from Linux 3.19; and the Linux 5.x kernel series,
+which followed on from Linux 4.20.
+.IP \[bu]
+In some cases, a system call was added to a stable kernel
+series after it branched from the previous stable kernel
+series, and then backported into the earlier stable kernel series.
+For example, some system calls that appeared in Linux 2.6.x were also backported
+into a Linux 2.4.x release after Linux 2.4.15.
+When this is so, the version where the system call appeared
+in both of the major kernel series is listed.
+.PP
+The list of system calls that are available as at Linux 5.14
+(or in a few cases only on older kernels) is as follows:
+.\"
+.\" Looking at scripts/checksyscalls.sh in the kernel source is
+.\" instructive about x86 specifics.
+.\"
+.TS
+l2 le l
+---
+l l l.
+\fBSystem call\fP \fBKernel\fP \fBNotes\fP
+
+\fB_llseek\fP(2) 1.2
+\fB_newselect\fP(2) 2.0
+\fB_sysctl\fP(2) 2.0 Removed in 5.5
+\fBaccept\fP(2) 2.0 T{
+See notes on \fBsocketcall\fP(2)
+T}
+\fBaccept4\fP(2) 2.6.28
+\fBaccess\fP(2) 1.0
+\fBacct\fP(2) 1.0
+\fBadd_key\fP(2) 2.6.10
+\fBadjtimex\fP(2) 1.0
+\fBalarm\fP(2) 1.0
+\fBalloc_hugepages\fP(2) 2.5.36 Removed in 2.5.44
+.\" 4adeefe161a74369e44cc8e663f240ece0470dc3
+\fBarc_gettls\fP(2) 3.9 ARC only
+\fBarc_settls\fP(2) 3.9 ARC only
+.\" 91e040a79df73d371f70792f30380d4e44805250
+\fBarc_usr_cmpxchg\fP(2) 4.9 ARC only
+.\" x86: 79170fda313ed5be2394f87aa2a00d597f8ed4a1
+\fBarch_prctl\fP(2) 2.6 T{
+x86_64, x86 since 4.12
+T}
+.\" 9674cdc74d63f346870943ef966a034f8c71ee57
+\fBatomic_barrier\fP(2) 2.6.34 m68k only
+\fBatomic_cmpxchg_32\fP(2) 2.6.34 m68k only
+\fBbdflush\fP(2) 1.2 T{
+Deprecated (does nothing)
+since 2.6
+T}
+\fBbind\fP(2) 2.0 T{
+See notes on \fBsocketcall\fP(2)
+T}
+\fBbpf\fP(2) 3.18
+\fBbrk\fP(2) 1.0
+\fBbreakpoint\fP(2) 2.2 T{
+ARM OABI only, defined with
+\fB__ARM_NR\fP prefix
+T}
+\fBcacheflush\fP(2) 1.2 Not on x86
+\fBcapget\fP(2) 2.2
+\fBcapset\fP(2) 2.2
+\fBchdir\fP(2) 1.0
+\fBchmod\fP(2) 1.0
+\fBchown\fP(2) 2.2 T{
+See \fBchown\fP(2) for
+version details
+T}
+\fBchown32\fP(2) 2.4
+\fBchroot\fP(2) 1.0
+\fBclock_adjtime\fP(2) 2.6.39
+\fBclock_getres\fP(2) 2.6
+\fBclock_gettime\fP(2) 2.6
+\fBclock_nanosleep\fP(2) 2.6
+\fBclock_settime\fP(2) 2.6
+\fBclone2\fP(2) 2.4 IA-64 only
+\fBclone\fP(2) 1.0
+\fBclone3\fP(2) 5.3
+\fBclose\fP(2) 1.0
+\fBclose_range\fP(2) 5.9
+.\" .\" dcef1f634657dabe7905af3ccda12cf7f0b6fcc1
+.\" .\" cc20d42986d5807cbe4f5c7c8e3dab2e59ea0db3
+.\" .\" db695c0509d6ec9046ee5e4c520a19fa17d9fce2
+.\" \fBcmpxchg\fP(2) 2.6.12 T{
+.\" ARM, syscall constant never was
+.\" exposed to user space, in-kernel
+.\" definition had \fB__ARM_NR\fP prefix,
+.\" removed in 4.4
+.\" T}
+.\" 867e359b97c970a60626d5d76bbe2a8fadbf38fb
+.\" bb9d812643d8a121df7d614a2b9c60193a92deb0
+\fBconnect\fP(2) 2.0 T{
+See notes on \fBsocketcall\fP(2)
+T}
+\fBcopy_file_range\fP(2) 4.5
+\fBcreat\fP(2) 1.0
+\fBcreate_module\fP(2) 1.0 Removed in 2.6
+\fBdelete_module\fP(2) 1.0
+.\" 1394f03221790a988afc3e4b3cb79f2e477246a9
+.\" 4ba66a9760722ccbb691b8f7116cad2f791cca7b
+\fBdup\fP(2) 1.0
+\fBdup2\fP(2) 1.0
+\fBdup3\fP(2) 2.6.27
+\fBepoll_create\fP(2) 2.6
+\fBepoll_create1\fP(2) 2.6.27
+\fBepoll_ctl\fP(2) 2.6
+\fBepoll_pwait\fP(2) 2.6.19
+\fBepoll_pwait2\fP(2) 5.11
+\fBepoll_wait\fP(2) 2.6
+\fBeventfd\fP(2) 2.6.22
+\fBeventfd2\fP(2) 2.6.27
+\fBexecv\fP(2) 2.0 T{
+SPARC/SPARC64 only, for
+compatibility with SunOS
+T}
+\fBexecve\fP(2) 1.0
+\fBexecveat\fP(2) 3.19
+\fBexit\fP(2) 1.0
+\fBexit_group\fP(2) 2.6
+\fBfaccessat\fP(2) 2.6.16
+\fBfaccessat2\fP(2) 5.8
+\fBfadvise64\fP(2) 2.6
+.\" Implements \fBposix_fadvise\fP(2)
+\fBfadvise64_64\fP(2) 2.6
+\fBfallocate\fP(2) 2.6.23
+\fBfanotify_init\fP(2) 2.6.37
+\fBfanotify_mark\fP(2) 2.6.37
+.\" The fanotify calls were added in Linux 2.6.36,
+.\" but disabled while the API was finalized.
+\fBfchdir\fP(2) 1.0
+\fBfchmod\fP(2) 1.0
+\fBfchmodat\fP(2) 2.6.16
+\fBfchown\fP(2) 1.0
+\fBfchown32\fP(2) 2.4
+\fBfchownat\fP(2) 2.6.16
+\fBfcntl\fP(2) 1.0
+\fBfcntl64\fP(2) 2.4
+\fBfdatasync\fP(2) 2.0
+\fBfgetxattr\fP(2) 2.6; 2.4.18
+\fBfinit_module\fP(2) 3.8
+\fBflistxattr\fP(2) 2.6; 2.4.18
+\fBflock\fP(2) 2.0
+\fBfork\fP(2) 1.0
+\fBfree_hugepages\fP(2) 2.5.36 Removed in 2.5.44
+\fBfremovexattr\fP(2) 2.6; 2.4.18
+\fBfsconfig\fP(2) 5.2
+\fBfsetxattr\fP(2) 2.6; 2.4.18
+\fBfsmount\fP(2) 5.2
+\fBfsopen\fP(2) 5.2
+\fBfspick\fP(2) 5.2
+\fBfstat\fP(2) 1.0
+\fBfstat64\fP(2) 2.4
+\fBfstatat64\fP(2) 2.6.16
+\fBfstatfs\fP(2) 1.0
+\fBfstatfs64\fP(2) 2.6
+\fBfsync\fP(2) 1.0
+\fBftruncate\fP(2) 1.0
+\fBftruncate64\fP(2) 2.4
+\fBfutex\fP(2) 2.6
+\fBfutimesat\fP(2) 2.6.16
+\fBget_kernel_syms\fP(2) 1.0 Removed in 2.6
+\fBget_mempolicy\fP(2) 2.6.6
+\fBget_robust_list\fP(2) 2.6.17
+\fBget_thread_area\fP(2) 2.6
+.\" 8fcd6c45f5a65621ec809b7866a3623e9a01d4ed
+\fBget_tls\fP(2) 4.15 T{
+ARM OABI only, has
+\fB__ARM_NR\fP prefix
+T}
+\fBgetcpu\fP(2) 2.6.19
+\fBgetcwd\fP(2) 2.2
+\fBgetdents\fP(2) 2.0
+\fBgetdents64\fP(2) 2.4
+.\" parisc: 863722e856e64dae0e252b6bb546737c6c5626ce
+\fBgetdomainname\fP(2) 2.2 T{
+SPARC, SPARC64; available
+as \fBosf_getdomainname\fP(2)
+on Alpha since Linux 2.0
+T}
+.\" ec98c6b9b47df6df1c1fa6cf3d427414f8c2cf16
+\fBgetdtablesize\fP(2) 2.0 T{
+SPARC (removed in 2.6.26),
+available on Alpha as
+\fBosf_getdtablesize\fP(2)
+T}
+\fBgetegid\fP(2) 1.0
+\fBgetegid32\fP(2) 2.4
+\fBgeteuid\fP(2) 1.0
+\fBgeteuid32\fP(2) 2.4
+\fBgetgid\fP(2) 1.0
+\fBgetgid32\fP(2) 2.4
+\fBgetgroups\fP(2) 1.0
+\fBgetgroups32\fP(2) 2.4
+.\" SPARC removal: ec98c6b9b47df6df1c1fa6cf3d427414f8c2cf16
+\fBgethostname\fP(2) 2.0 T{
+Alpha, was available on
+SPARC up to Linux 2.6.26
+T}
+\fBgetitimer\fP(2) 1.0
+\fBgetpeername\fP(2) 2.0 T{
+See notes on \fBsocketcall\fP(2)
+T}
+\fBgetpagesize\fP(2) 2.0 Not on x86
+\fBgetpgid\fP(2) 1.0
+\fBgetpgrp\fP(2) 1.0
+\fBgetpid\fP(2) 1.0
+\fBgetppid\fP(2) 1.0
+\fBgetpriority\fP(2) 1.0
+\fBgetrandom\fP(2) 3.17
+\fBgetresgid\fP(2) 2.2
+\fBgetresgid32\fP(2) 2.4
+\fBgetresuid\fP(2) 2.2
+\fBgetresuid32\fP(2) 2.4
+\fBgetrlimit\fP(2) 1.0
+\fBgetrusage\fP(2) 1.0
+\fBgetsid\fP(2) 2.0
+\fBgetsockname\fP(2) 2.0 T{
+See notes on \fBsocketcall\fP(2)
+T}
+\fBgetsockopt\fP(2) 2.0 T{
+See notes on \fBsocketcall\fP(2)
+T}
+\fBgettid\fP(2) 2.4.11
+\fBgettimeofday\fP(2) 1.0
+\fBgetuid\fP(2) 1.0
+\fBgetuid32\fP(2) 2.4
+\fBgetunwind\fP(2) 2.4.8 T{
+IA-64 only; deprecated
+T}
+\fBgetxattr\fP(2) 2.6; 2.4.18
+\fBgetxgid\fP(2) 2.0 T{
+Alpha only; see NOTES
+T}
+\fBgetxpid\fP(2) 2.0 T{
+Alpha only; see NOTES
+T}
+\fBgetxuid\fP(2) 2.0 T{
+Alpha only; see NOTES
+T}
+\fBinit_module\fP(2) 1.0
+\fBinotify_add_watch\fP(2) 2.6.13
+\fBinotify_init\fP(2) 2.6.13
+\fBinotify_init1\fP(2) 2.6.27
+\fBinotify_rm_watch\fP(2) 2.6.13
+\fBio_cancel\fP(2) 2.6
+\fBio_destroy\fP(2) 2.6
+\fBio_getevents\fP(2) 2.6
+\fBio_pgetevents\fP(2) 4.18
+\fBio_setup\fP(2) 2.6
+\fBio_submit\fP(2) 2.6
+\fBio_uring_enter\fP(2) 5.1
+\fBio_uring_register\fP(2) 5.1
+\fBio_uring_setup\fP(2) 5.1
+\fBioctl\fP(2) 1.0
+\fBioperm\fP(2) 1.0
+\fBiopl\fP(2) 1.0
+\fBioprio_get\fP(2) 2.6.13
+\fBioprio_set\fP(2) 2.6.13
+\fBipc\fP(2) 1.0
+.\" Implements System V IPC calls
+\fBkcmp\fP(2) 3.5
+\fBkern_features\fP(2) 3.7 SPARC64 only
+.\" FIXME . document kern_features():
+.\" commit 517ffce4e1a03aea979fe3a18a3dd1761a24fafb
+\fBkexec_file_load\fP(2) 3.17
+\fBkexec_load\fP(2) 2.6.13
+.\" The entry in the syscall table was reserved starting in 2.6.7
+.\" Was named sys_kexec_load() from 2.6.7 to 2.6.16
+\fBkeyctl\fP(2) 2.6.10
+\fBkill\fP(2) 1.0
+\fBlandlock_add_rule\fP(2) 5.13
+\fBlandlock_create_ruleset\fP(2) 5.13
+\fBlandlock_restrict_self\fP(2) 5.13
+\fBlchown\fP(2) 1.0 T{
+See \fBchown\fP(2) for
+version details
+T}
+\fBlchown32\fP(2) 2.4
+\fBlgetxattr\fP(2) 2.6; 2.4.18
+\fBlink\fP(2) 1.0
+\fBlinkat\fP(2) 2.6.16
+\fBlisten\fP(2) 2.0 T{
+See notes on \fBsocketcall\fP(2)
+T}
+\fBlistxattr\fP(2) 2.6; 2.4.18
+\fBllistxattr\fP(2) 2.6; 2.4.18
+\fBlookup_dcookie\fP(2) 2.6
+\fBlremovexattr\fP(2) 2.6; 2.4.18
+\fBlseek\fP(2) 1.0
+\fBlsetxattr\fP(2) 2.6; 2.4.18
+\fBlstat\fP(2) 1.0
+\fBlstat64\fP(2) 2.4
+\fBmadvise\fP(2) 2.4
+\fBmbind\fP(2) 2.6.6
+\fBmemory_ordering\fP(2) 2.2 SPARC64 only
+.\" 26025bbfbba33a9425be1b89eccb4664ea4c17b6
+.\" bb6fb6dfcc17cddac11ac295861f7608194447a7
+\fBmembarrier\fP(2) 3.17
+\fBmemfd_create\fP(2) 3.17
+\fBmemfd_secret\fP(2) 5.14
+\fBmigrate_pages\fP(2) 2.6.16
+\fBmincore\fP(2) 2.4
+\fBmkdir\fP(2) 1.0
+\fBmkdirat\fP(2) 2.6.16
+\fBmknod\fP(2) 1.0
+\fBmknodat\fP(2) 2.6.16
+\fBmlock\fP(2) 2.0
+\fBmlock2\fP(2) 4.4
+\fBmlockall\fP(2) 2.0
+\fBmmap\fP(2) 1.0
+\fBmmap2\fP(2) 2.4
+\fBmodify_ldt\fP(2) 1.0
+\fBmount\fP(2) 1.0
+\fBmove_mount\fP(2) 5.2
+\fBmove_pages\fP(2) 2.6.18
+\fBmprotect\fP(2) 1.0
+\fBmq_getsetattr\fP(2) 2.6.6
+.\" Implements \fBmq_getattr\fP(3) and \fBmq_setattr\fP(3)
+\fBmq_notify\fP(2) 2.6.6
+\fBmq_open\fP(2) 2.6.6
+\fBmq_timedreceive\fP(2) 2.6.6
+\fBmq_timedsend\fP(2) 2.6.6
+\fBmq_unlink\fP(2) 2.6.6
+\fBmremap\fP(2) 2.0
+\fBmsgctl\fP(2) 2.0 T{
+See notes on \fBipc\fP(2)
+T}
+\fBmsgget\fP(2) 2.0 T{
+See notes on \fBipc\fP(2)
+T}
+\fBmsgrcv\fP(2) 2.0 T{
+See notes on \fBipc\fP(2)
+T}
+\fBmsgsnd\fP(2) 2.0 T{
+See notes on \fBipc\fP(2)
+T}
+\fBmsync\fP(2) 2.0
+.\" \fBmultiplexer\fP(2) ?? __NR_multiplexer reserved on
+.\" PowerPC, but unimplemented?
+\fBmunlock\fP(2) 2.0
+\fBmunlockall\fP(2) 2.0
+\fBmunmap\fP(2) 1.0
+\fBname_to_handle_at\fP(2) 2.6.39
+\fBnanosleep\fP(2) 2.0
+.\" 5590ff0d5528b60153c0b4e7b771472b5a95e297
+\fBnewfstatat\fP(2) 2.6.16 See \fBstat\fP(2)
+\fBnfsservctl\fP(2) 2.2 Removed in 3.1
+\fBnice\fP(2) 1.0
+\fBold_adjtimex\fP(2) 2.0 T{
+Alpha only; see NOTES
+T}
+\fBold_getrlimit\fP(2) 2.4 T{
+Old variant of \fBgetrlimit\fP(2)
+that used a different value
+for \fBRLIM_INFINITY\fP
+T}
+\fBoldfstat\fP(2) 1.0
+\fBoldlstat\fP(2) 1.0
+\fBoldolduname\fP(2) 1.0
+\fBoldstat\fP(2) 1.0
+\fBoldumount\fP(2) 2.1.116 T{
+Name of the old \fBumount\fP(2)
+syscall on Alpha
+T}
+\fBolduname\fP(2) 1.0
+\fBopen\fP(2) 1.0
+\fBopen_by_handle_at\fP(2) 2.6.39
+\fBopen_tree\fP(2) 5.2
+\fBopenat\fP(2) 2.6.16
+\fBopenat2\fP(2) 5.6
+.\" 9d02a4283e9ce4e9ca11ff00615bdacdb0515a1a
+\fBor1k_atomic\fP(2) 3.1 T{
+OpenRISC 1000 only
+T}
+\fBpause\fP(2) 1.0
+\fBpciconfig_iobase\fP(2) 2.2.15; 2.4 Not on x86
+.\" Alpha, PowerPC, ARM; not x86
+\fBpciconfig_read\fP(2) 2.0.26; 2.2 Not on x86
+.\" , PowerPC, ARM; not x86
+\fBpciconfig_write\fP(2) 2.0.26; 2.2 Not on x86
+.\" , PowerPC, ARM; not x86
+\fBperf_event_open\fP(2) 2.6.31 T{
+Was perf_counter_open() in
+2.6.31; renamed in 2.6.32
+T}
+\fBpersonality\fP(2) 1.2
+\fBperfctr\fP(2) 2.2 T{
+SPARC only; removed in 2.6.34
+T}
+.\" commit c7d5a0050773e98d1094eaa9f2a1a793fafac300 removed perfctr()
+\fBperfmonctl\fP(2) 2.4 IA-64 only; removed in 5.10
+\fBpidfd_getfd\fP(2) 5.6
+\fBpidfd_send_signal\fP(2) 5.1
+\fBpidfd_open\fP(2) 5.3
+\fBpipe\fP(2) 1.0
+\fBpipe2\fP(2) 2.6.27
+\fBpivot_root\fP(2) 2.4
+\fBpkey_alloc\fP(2) 4.8
+\fBpkey_free\fP(2) 4.8
+\fBpkey_mprotect\fP(2) 4.8
+\fBpoll\fP(2) 2.0.36; 2.2
+\fBppoll\fP(2) 2.6.16
+\fBprctl\fP(2) 2.2
+\fBpread64\fP(2) T{
+Added as "pread" in 2.2;
+renamed "pread64" in 2.6
+T}
+\fBpreadv\fP(2) 2.6.30
+\fBpreadv2\fP(2) 4.6
+\fBprlimit64\fP(2) 2.6.36
+\fBprocess_madvise\fP(2) 5.10
+\fBprocess_vm_readv\fP(2) 3.2
+\fBprocess_vm_writev\fP(2) 3.2
+\fBpselect6\fP(2) 2.6.16
+.\" Implements \fBpselect\fP(2)
+\fBptrace\fP(2) 1.0
+\fBpwrite64\fP(2) T{
+Added as "pwrite" in 2.2;
+renamed "pwrite64" in 2.6
+T}
+\fBpwritev\fP(2) 2.6.30
+\fBpwritev2\fP(2) 4.6
+\fBquery_module\fP(2) 2.2 Removed in 2.6
+\fBquotactl\fP(2) 1.0
+\fBquotactl_fd\fP(2) 5.14
+\fBread\fP(2) 1.0
+\fBreadahead\fP(2) 2.4.13
+\fBreaddir\fP(2) 1.0
+.\" Superseded by \fBgetdents\fP(2)
+\fBreadlink\fP(2) 1.0
+\fBreadlinkat\fP(2) 2.6.16
+\fBreadv\fP(2) 2.0
+\fBreboot\fP(2) 1.0
+\fBrecv\fP(2) 2.0 T{
+See notes on \fBsocketcall\fP(2)
+T}
+\fBrecvfrom\fP(2) 2.0 T{
+See notes on \fBsocketcall\fP(2)
+T}
+\fBrecvmsg\fP(2) 2.0 T{
+See notes on \fBsocketcall\fP(2)
+T}
+\fBrecvmmsg\fP(2) 2.6.33
+\fBremap_file_pages\fP(2) 2.6 T{
+Deprecated since 3.16
+T}
+\fBremovexattr\fP(2) 2.6; 2.4.18
+\fBrename\fP(2) 1.0
+\fBrenameat\fP(2) 2.6.16
+\fBrenameat2\fP(2) 3.15
+\fBrequest_key\fP(2) 2.6.10
+\fBrestart_syscall\fP(2) 2.6
+.\" 921ebd8f2c081b3cf6c3b29ef4103eef3ff26054
+\fBriscv_flush_icache\fP(2) 4.15 RISC-V only
+\fBrmdir\fP(2) 1.0
+\fBrseq\fP(2) 4.18
+\fBrt_sigaction\fP(2) 2.2
+\fBrt_sigpending\fP(2) 2.2
+\fBrt_sigprocmask\fP(2) 2.2
+\fBrt_sigqueueinfo\fP(2) 2.2
+\fBrt_sigreturn\fP(2) 2.2
+\fBrt_sigsuspend\fP(2) 2.2
+\fBrt_sigtimedwait\fP(2) 2.2
+\fBrt_tgsigqueueinfo\fP(2) 2.6.31
+\fBrtas\fP(2) 2.6.2 T{
+PowerPC/PowerPC64 only
+T}
+\fBs390_runtime_instr\fP(2) 3.7 s390 only
+\fBs390_pci_mmio_read\fP(2) 3.19 s390 only
+\fBs390_pci_mmio_write\fP(2) 3.19 s390 only
+\fBs390_sthyi\fP(2) 4.15 s390 only
+\fBs390_guarded_storage\fP(2) 4.12 s390 only
+\fBsched_get_affinity\fP(2) 2.6 T{
+Name of
+.BR \%sched_getaffinity (2)
+on SPARC and SPARC64
+T}
+\fBsched_get_priority_max\fP(2) 2.0
+\fBsched_get_priority_min\fP(2) 2.0
+\fBsched_getaffinity\fP(2) 2.6
+\fBsched_getattr\fP(2) 3.14
+\fBsched_getparam\fP(2) 2.0
+\fBsched_getscheduler\fP(2) 2.0
+\fBsched_rr_get_interval\fP(2) 2.0
+\fBsched_set_affinity\fP(2) 2.6 T{
+Name of
+.BR \%sched_setaffinity (2)
+on SPARC and SPARC64
+T}
+\fBsched_setaffinity\fP(2) 2.6
+\fBsched_setattr\fP(2) 3.14
+\fBsched_setparam\fP(2) 2.0
+\fBsched_setscheduler\fP(2) 2.0
+\fBsched_yield\fP(2) 2.0
+\fBseccomp\fP(2) 3.17
+\fBselect\fP(2) 1.0
+\fBsemctl\fP(2) 2.0 T{
+See notes on \fBipc\fP(2)
+T}
+\fBsemget\fP(2) 2.0 T{
+See notes on \fBipc\fP(2)
+T}
+\fBsemop\fP(2) 2.0 T{
+See notes on \fBipc\fP(2)
+T}
+\fBsemtimedop\fP(2) 2.6; 2.4.22
+\fBsend\fP(2) 2.0 T{
+See notes on \fBsocketcall\fP(2)
+T}
+\fBsendfile\fP(2) 2.2
+\fBsendfile64\fP(2) 2.6; 2.4.19
+\fBsendmmsg\fP(2) 3.0
+\fBsendmsg\fP(2) 2.0 T{
+See notes on \fBsocketcall\fP(2)
+T}
+\fBsendto\fP(2) 2.0 T{
+See notes on \fBsocketcall\fP(2)
+T}
+\fBset_mempolicy\fP(2) 2.6.6
+\fBset_robust_list\fP(2) 2.6.17
+\fBset_thread_area\fP(2) 2.6
+\fBset_tid_address\fP(2) 2.6
+\fBset_tls\fP(2) 2.6.11 T{
+ARM OABI/EABI only (constant
+has \fB__ARM_NR\fP prefix)
+T}
+.\" \fBsetaltroot\fP(2) 2.6.10 T{
+.\" Removed in 2.6.11, exposed one
+.\" of implementation details of
+.\" \fBpersonality\fP(2) (creating an
+.\" alternative root, precursor of
+.\" mount namespaces) to user space.
+.\" T}
+.\" See http://lkml.org/lkml/2005/8/1/83
+.\" "[PATCH] remove sys_set_zone_reclaim()"
+\fBsetdomainname\fP(2) 1.0
+\fBsetfsgid\fP(2) 1.2
+\fBsetfsgid32\fP(2) 2.4
+\fBsetfsuid\fP(2) 1.2
+\fBsetfsuid32\fP(2) 2.4
+\fBsetgid\fP(2) 1.0
+\fBsetgid32\fP(2) 2.4
+\fBsetgroups\fP(2) 1.0
+\fBsetgroups32\fP(2) 2.4
+.\" arch/alpha/include/asm/core_lca.h
+\fBsethae\fP(2) 2.0 T{
+Alpha only; see NOTES
+T}
+\fBsethostname\fP(2) 1.0
+\fBsetitimer\fP(2) 1.0
+\fBsetns\fP(2) 3.0
+\fBsetpgid\fP(2) 1.0
+\fBsetpgrp\fP(2) 2.0 T{
+Alternative name for
+\fBsetpgid\fP(2) on Alpha
+T}
+\fBsetpriority\fP(2) 1.0
+\fBsetregid\fP(2) 1.0
+\fBsetregid32\fP(2) 2.4
+\fBsetresgid\fP(2) 2.2
+\fBsetresgid32\fP(2) 2.4
+\fBsetresuid\fP(2) 2.2
+\fBsetresuid32\fP(2) 2.4
+\fBsetreuid\fP(2) 1.0
+\fBsetreuid32\fP(2) 2.4
+\fBsetrlimit\fP(2) 1.0
+\fBsetsid\fP(2) 1.0
+\fBsetsockopt\fP(2) 2.0 T{
+See notes on \fBsocketcall\fP(2)
+T}
+\fBsettimeofday\fP(2) 1.0
+\fBsetuid\fP(2) 1.0
+\fBsetuid32\fP(2) 2.4
+\fBsetup\fP(2) 1.0 Removed in 2.2
+\fBsetxattr\fP(2) 2.6; 2.4.18
+\fBsgetmask\fP(2) 1.0
+\fBshmat\fP(2) 2.0 T{
+See notes on \fBipc\fP(2)
+T}
+\fBshmctl\fP(2) 2.0 T{
+See notes on \fBipc\fP(2)
+T}
+\fBshmdt\fP(2) 2.0 T{
+See notes on \fBipc\fP(2)
+T}
+\fBshmget\fP(2) 2.0 T{
+See notes on \fBipc\fP(2)
+T}
+\fBshutdown\fP(2) 2.0 T{
+See notes on \fBsocketcall\fP(2)
+T}
+\fBsigaction\fP(2) 1.0
+\fBsigaltstack\fP(2) 2.2
+\fBsignal\fP(2) 1.0
+\fBsignalfd\fP(2) 2.6.22
+\fBsignalfd4\fP(2) 2.6.27
+\fBsigpending\fP(2) 1.0
+\fBsigprocmask\fP(2) 1.0
+\fBsigreturn\fP(2) 1.0
+\fBsigsuspend\fP(2) 1.0
+\fBsocket\fP(2) 2.0 T{
+See notes on \fBsocketcall\fP(2)
+T}
+\fBsocketcall\fP(2) 1.0
+.\" Implements BSD socket calls
+\fBsocketpair\fP(2) 2.0 T{
+See notes on \fBsocketcall\fP(2)
+T}
+.\" 5a0015d62668e64c8b6e02e360fbbea121bfd5e6
+\fBspill\fP(2) 2.6.13 Xtensa only
+\fBsplice\fP(2) 2.6.17
+\fBspu_create\fP(2) 2.6.16 T{
+PowerPC/PowerPC64 only
+T}
+\fBspu_run\fP(2) 2.6.16 T{
+PowerPC/PowerPC64 only
+T}
+\fBssetmask\fP(2) 1.0
+\fBstat\fP(2) 1.0
+\fBstat64\fP(2) 2.4
+\fBstatfs\fP(2) 1.0
+\fBstatfs64\fP(2) 2.6
+\fBstatx\fP(2) 4.11
+\fBstime\fP(2) 1.0
+\fBsubpage_prot\fP(2) 2.6.25 T{
+PowerPC/PowerPC64 only
+T}
+\fBswapcontext\fP(2) 2.6.3 T{
+PowerPC/PowerPC64 only
+T}
+.\" 529d235a0e190ded1d21ccc80a73e625ebcad09b
+\fBswitch_endian\fP(2) 4.1 PowerPC64 only
+\fBswapoff\fP(2) 1.0
+\fBswapon\fP(2) 1.0
+\fBsymlink\fP(2) 1.0
+\fBsymlinkat\fP(2) 2.6.16
+\fBsync\fP(2) 1.0
+\fBsync_file_range\fP(2) 2.6.17
+\fBsync_file_range2\fP(2) 2.6.22
+.\" PowerPC, ARM, tile
+.\" First appeared on ARM, as arm_sync_file_range(), but later renamed
+.\" \fBsys_debug_setcontext\fP(2) ??? PowerPC if CONFIG_PPC32
+\fBsyncfs\fP(2) 2.6.39
+\fBsys_debug_setcontext\fP(2) 2.6.11 PowerPC only
+\fBsyscall\fP(2) 1.0 T{
+Still available on ARM OABI
+and MIPS O32 ABI
+T}
+\fBsysfs\fP(2) 1.2
+\fBsysinfo\fP(2) 1.0
+\fBsyslog\fP(2) 1.0
+.\" glibc interface is \fBklogctl\fP(3)
+\fBsysmips\fP(2) 2.6.0 MIPS only
+\fBtee\fP(2) 2.6.17
+\fBtgkill\fP(2) 2.6
+\fBtime\fP(2) 1.0
+\fBtimer_create\fP(2) 2.6
+\fBtimer_delete\fP(2) 2.6
+\fBtimer_getoverrun\fP(2) 2.6
+\fBtimer_gettime\fP(2) 2.6
+\fBtimer_settime\fP(2) 2.6
+.\" .\" b215e283992899650c4271e7385c79e26fb9a88e
+.\" .\" 4d672e7ac79b5ec5cdc90e450823441e20464691
+.\" \fBtimerfd\fP(2) 2.6.22 T{
+.\" Old timerfd interface,
+.\" removed in 2.6.25
+.\" T}
+\fBtimerfd_create\fP(2) 2.6.25
+\fBtimerfd_gettime\fP(2) 2.6.25
+\fBtimerfd_settime\fP(2) 2.6.25
+\fBtimes\fP(2) 1.0
+\fBtkill\fP(2) 2.6; 2.4.22
+\fBtruncate\fP(2) 1.0
+\fBtruncate64\fP(2) 2.4
+\fBugetrlimit\fP(2) 2.4
+\fBumask\fP(2) 1.0
+\fBumount\fP(2) 1.0
+.\" sys_oldumount() -- __NR_umount
+\fBumount2\fP(2) 2.2
+.\" sys_umount() -- __NR_umount2
+\fBuname\fP(2) 1.0
+\fBunlink\fP(2) 1.0
+\fBunlinkat\fP(2) 2.6.16
+\fBunshare\fP(2) 2.6.16
+\fBuselib\fP(2) 1.0
+\fBustat\fP(2) 1.0
+\fBuserfaultfd\fP(2) 4.3
+\fBusr26\fP(2) 2.4.8.1 ARM OABI only
+\fBusr32\fP(2) 2.4.8.1 ARM OABI only
+\fButime\fP(2) 1.0
+\fButimensat\fP(2) 2.6.22
+\fButimes\fP(2) 2.2
+\fButrap_install\fP(2) 2.2 SPARC64 only
+.\" FIXME . document utrap_install()
+.\" There's a man page for Solaris 5.11
+\fBvfork\fP(2) 2.2
+\fBvhangup\fP(2) 1.0
+\fBvm86old\fP(2) 1.0 T{
+Was "vm86"; renamed in
+2.0.28/2.2
+T}
+\fBvm86\fP(2) 2.0.28; 2.2
+\fBvmsplice\fP(2) 2.6.17
+\fBwait4\fP(2) 1.0
+\fBwaitid\fP(2) 2.6.10
+\fBwaitpid\fP(2) 1.0
+\fBwrite\fP(2) 1.0
+\fBwritev\fP(2) 2.0
+.\" 5a0015d62668e64c8b6e02e360fbbea121bfd5e6
+\fBxtensa\fP(2) 2.6.13 Xtensa only
+.TE
+.PP
+On many platforms, including x86-32, socket calls are all multiplexed
+(via glibc wrapper functions) through
+.BR socketcall (2)
+and similarly System\ V IPC calls are multiplexed through
+.BR ipc (2).
+.PP
+Although slots are reserved for them in the system call table,
+the following system calls are not implemented in the standard kernel:
+.BR afs_syscall (2), \" __NR_afs_syscall is 137 on Linux 2.6.22/i386
+.BR break (2), \" __NR_break is 17 on Linux 2.6.22/i386
+.BR ftime (2), \" __NR_ftime is 35 on Linux 2.6.22/i386
+.BR getpmsg (2), \" __NR_getpmsg is 188 on Linux 2.6.22/i386
+.BR gtty (2), \" __NR_gtty is 32 on Linux 2.6.22/i386
+.BR idle (2), \" __NR_idle is 112 on Linux 2.6.22/i386
+.BR lock (2), \" __NR_lock is 53 on Linux 2.6.22/i386
+.BR madvise1 (2), \" __NR_madvise1 is 219 on Linux 2.6.22/i386
+.BR mpx (2), \" __NR_mpx is 56 on Linux 2.6.22/i386
+.BR phys (2), \" Slot has been reused
+.BR prof (2), \" __NR_prof is 44 on Linux 2.6.22/i386
+.BR profil (2), \" __NR_profil is 98 on Linux 2.6.22/i386
+.BR putpmsg (2), \" __NR_putpmsg is 189 on Linux 2.6.22/i386
+.\" __NR_security is 223 on Linux 2.4/i386; absent on 2.6/i386, present
+.\" on a couple of 2.6 architectures
+.BR security (2), \" __NR_security is 223 on Linux 2.4/i386
+.\" The security call is for future use.
+.BR stty (2), \" __NR_stty is 31 on Linux 2.6.22/i386
+.BR tuxcall (2), \" __NR_tuxcall is 184 on x86_64, also on PPC and alpha
+.BR ulimit (2), \" __NR_ulimit is 58 on Linux 2.6.22/i386
+and
+.BR vserver (2) \" __NR_vserver is 273 on Linux 2.6.22/i386
+(see also
+.BR unimplemented (2)).
+However,
+.BR ftime (3),
+.BR profil (3),
+and
+.BR ulimit (3)
+exist as library routines.
+The slot for
+.BR phys (2)
+is in use since Linux 2.1.116 for
+.BR umount (2);
+.BR phys (2)
+will never be implemented.
+The
+.BR getpmsg (2)
+and
+.BR putpmsg (2)
+calls are for kernels patched to support STREAMS,
+and may never be in the standard kernel.
+.PP
+There was briefly
+.BR set_zone_reclaim (2),
+added in Linux 2.6.13, and removed in Linux 2.6.16;
+this system call was never available to user space.
+.\"
+.SS System calls on removed ports
+Some system calls only ever existed on Linux architectures that have
+since been removed from the kernel:
+.TP
+AVR32 (port removed in Linux 4.12)
+.RS
+.PD 0
+.IP \[bu] 3
+.BR pread (2)
+.IP \[bu]
+.BR pwrite (2)
+.PD
+.RE
+.TP
+Blackfin (port removed in Linux 4.17)
+.RS
+.PD 0
+.IP \[bu] 3
+.BR bfin_spinlock (2)
+(added in Linux 2.6.22)
+.IP \[bu]
+.BR dma_memcpy (2)
+(added in Linux 2.6.22)
+.IP \[bu]
+.BR pread (2)
+(added in Linux 2.6.22)
+.IP \[bu]
+.BR pwrite (2)
+(added in Linux 2.6.22)
+.IP \[bu]
+.BR sram_alloc (2)
+(added in Linux 2.6.22)
+.IP \[bu]
+.BR sram_free (2)
+(added in Linux 2.6.22)
+.PD
+.RE
+.TP
+Metag (port removed in Linux 4.17)
+.RS
+.PD 0
+.IP \[bu] 3
+.BR metag_get_tls (2)
+(added in Linux 3.9)
+.IP \[bu]
+.BR metag_set_fpu_flags (2)
+(added in Linux 3.9)
+.IP \[bu]
+.BR metag_set_tls (2)
+(added in Linux 3.9)
+.IP \[bu]
+.BR metag_setglobalbit (2)
+(added in Linux 3.9)
+.PD
+.RE
+.TP
+Tile (port removed in Linux 4.17)
+.RS
+.PD 0
+.IP \[bu] 3
+.BR cmpxchg_badaddr (2)
+(added in Linux 2.6.36)
+.PD
+.RE
+.SH NOTES
+Roughly speaking, the code belonging to the system call
+with number __NR_xxx defined in
+.I /usr/include/asm/unistd.h
+can be found in the Linux kernel source in the routine
+.IR sys_xxx ().
+There are many exceptions, however, mostly because
+older system calls were superseded by newer ones,
+and this has been treated somewhat unsystematically.
+On platforms with
+proprietary operating-system emulation,
+such as sparc, sparc64, and alpha,
+there are many additional system calls; mips64 also contains a full
+set of 32-bit system calls.
+.PP
+Over time, changes to the interfaces of some system calls have been
+necessary.
+One reason for such changes was the need to increase the size of
+structures or scalar values passed to the system call.
+Because of these changes, certain architectures
+(notably, longstanding 32-bit architectures such as i386)
+now have various groups of related system calls (e.g.,
+.BR truncate (2)
+and
+.BR truncate64 (2))
+which perform similar tasks, but which vary in
+details such as the size of their arguments.
+(As noted earlier, applications are generally unaware of this:
+the glibc wrapper functions do some work to ensure that the right
+system call is invoked, and that ABI compatibility is
+preserved for old binaries.)
+Examples of system calls that exist in multiple versions are
+the following:
+.IP \[bu] 3
+By now there are three different versions of
+.BR stat (2):
+.IR sys_stat ()
+(slot
+.IR __NR_oldstat ),
+.IR sys_newstat ()
+(slot
+.IR __NR_stat ),
+and
+.IR sys_stat64 ()
+(slot
+.IR __NR_stat64 ),
+with the last being the most current.
+.\" e.g., on 2.6.22/i386: __NR_oldstat 18, __NR_stat 106, __NR_stat64 195
+.\" The stat system calls deal with three different data structures,
+.\" defined in include/asm-i386/stat.h: __old_kernel_stat, stat, stat64
+A similar story applies for
+.BR lstat (2)
+and
+.BR fstat (2).
+.IP \[bu]
+Similarly, the defines
+.IR __NR_oldolduname ,
+.IR __NR_olduname ,
+and
+.I __NR_uname
+refer to the routines
+.IR sys_olduname (),
+.IR sys_uname (),
+and
+.IR sys_newuname ().
+.IP \[bu]
+In Linux 2.0, a new version of
+.BR vm86 (2)
+appeared, with the old and the new kernel routines being named
+.IR sys_vm86old ()
+and
+.IR sys_vm86 ().
+.IP \[bu]
+In Linux 2.4, a new version of
+.BR getrlimit (2)
+appeared, with the old and the new kernel routines being named
+.IR sys_old_getrlimit ()
+(slot
+.IR __NR_getrlimit )
+and
+.IR sys_getrlimit ()
+(slot
+.IR __NR_ugetrlimit ).
+.IP \[bu]
+Linux 2.4 increased the size of user and group IDs from 16 to 32 bits.
+.\" 64-bit off_t changes: ftruncate64, *stat64,
+.\" fcntl64 (because of the flock structure), getdents64, *statfs64
+To support this change, a range of system calls were added
+(e.g.,
+.BR chown32 (2),
+.BR getuid32 (2),
+.BR getgroups32 (2),
+.BR setresuid32 (2)),
+superseding earlier calls of the same name without the
+"32" suffix.
+.IP \[bu]
+Linux 2.4 added support for applications on 32-bit architectures
+to access large files (i.e., files for which the sizes and
+file offsets can't be represented in 32 bits).
+To support this change, replacements were required for system calls
+that deal with file offsets and sizes.
+Thus the following system calls were added:
+.BR fcntl64 (2),
+.BR getdents64 (2),
+.BR stat64 (2),
+.BR statfs64 (2),
+.BR truncate64 (2),
+and their analogs that work with file descriptors or
+symbolic links.
+These system calls supersede the older system calls
+which, except in the case of the "stat" calls,
+have the same name without the "64" suffix.
+.IP
+On newer platforms that only have 64-bit file access and 32-bit UIDs/GIDs
+(e.g., alpha, ia64, s390x, x86-64), there is just a single version of
+the UID/GID and file access system calls.
+On platforms (typically, 32-bit platforms) where the *64 and *32 calls exist,
+the other versions are obsolete.
+.IP \[bu]
+The
+.I rt_sig*
+calls were added in Linux 2.2 to support the addition
+of real-time signals (see
+.BR signal (7)).
+These system calls supersede the older system calls of the same
+name without the "rt_" prefix.
+.IP \[bu]
+The
+.BR select (2)
+and
+.BR mmap (2)
+system calls use five or more arguments,
+which caused problems in the way
+argument passing on the i386 used to be set up.
+Thus, while other architectures have
+.IR sys_select ()
+and
+.IR sys_mmap ()
+corresponding to
+.I __NR_select
+and
+.IR __NR_mmap ,
+on i386 one finds
+.IR old_select ()
+and
+.IR old_mmap ()
+(routines that use a pointer to an
+argument block) instead.
+These days passing five arguments
+is not a problem any more, and there is a
+.I __NR__newselect
+.\" (used by libc 6)
+that corresponds directly to
+.IR sys_select ()
+and similarly
+.IR __NR_mmap2 .
+s390x is the only 64-bit architecture that has
+.IR old_mmap ().
+.\" .PP
+.\" Two system call numbers,
+.\" .IR __NR__llseek
+.\" and
+.\" .IR __NR__sysctl
+.\" have an additional underscore absent in
+.\" .IR sys_llseek ()
+.\" and
+.\" .IR sys_sysctl ().
+.\"
+.\" In Linux 2.1.81,
+.\" .BR lchown (2)
+.\" and
+.\" .BR chown (2)
+.\" were swapped; that is,
+.\" .BR lchown (2)
+.\" was added with the semantics that were then current for
+.\" .BR chown (2),
+.\" and the semantics of the latter call were changed to what
+.\" they are today.
+.\"
+.\"
+.SS "Architecture-specific details: Alpha"
+.TP
+.BR getxgid (2)
+returns a pair of GID and effective GID via registers
+\fBr0\fP and \fBr20\fP; it is provided
+instead of
+\fBgetgid\fP(2) and \fBgetegid\fP(2).
+.TP
+.BR getxpid (2)
+returns a pair of PID and parent PID via registers
+\fBr0\fP and \fBr20\fP; it is provided instead of
+\fBgetpid\fP(2) and \fBgetppid\fP(2).
+.TP
+.BR old_adjtimex (2)
+is a variant of \fBadjtimex\fP(2) that uses \fIstruct timeval32\fP,
+for compatibility with OSF/1.
+.TP
+.BR getxuid (2)
+returns a pair of UID and effective UID via registers
+\fBr0\fP and \fBr20\fP; it is provided instead of
+\fBgetuid\fP(2) and \fBgeteuid\fP(2).
+.TP
+.BR sethae (2)
+is used for configuring the Host Address Extension register on
+low-cost Alphas in order to access address space beyond the first 27 bits.
+.SH SEE ALSO
+.BR ausyscall (1),
+.BR intro (2),
+.BR syscall (2),
+.BR unimplemented (2),
+.BR errno (3),
+.BR libc (7),
+.BR vdso (7)
diff --git a/man2/sysctl.2 b/man2/sysctl.2
new file mode 100644
index 0000000..fbe967f
--- /dev/null
+++ b/man2/sysctl.2
@@ -0,0 +1,158 @@
+.\" Copyright (C) 1996 Andries Brouwer (aeb@cwi.nl)
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Written 11 April 1996 by Andries Brouwer <aeb@cwi.nl>
+.\" 960412: Added comments from Stephen Tweedie
+.\" Modified Tue Oct 22 22:28:41 1996 by Eric S. Raymond <esr@thyrsus.com>
+.\" Modified Mon Jan 5 20:31:04 1998 by aeb.
+.\"
+.TH sysctl 2 2023-05-03 "Linux man-pages 6.05.01"
+.SH NAME
+sysctl \- read/write system parameters
+.SH SYNOPSIS
+.nf
+.B #include <unistd.h>
+.B #include <linux/sysctl.h>
+.PP
+.BI "[[deprecated]] int _sysctl(struct __sysctl_args *" args );
+.fi
+.SH DESCRIPTION
+.B This system call no longer exists on current kernels!
+See NOTES.
+.PP
+The
+.BR _sysctl ()
+call reads and/or writes kernel parameters.
+For example, the hostname,
+or the maximum number of open files.
+The argument has the form
+.PP
+.in +4n
+.EX
+struct __sysctl_args {
+ int *name; /* integer vector describing variable */
+ int nlen; /* length of this vector */
+ void *oldval; /* 0 or address where to store old value */
+ size_t *oldlenp; /* available room for old value,
+ overwritten by actual size of old value */
+ void *newval; /* 0 or address of new value */
+ size_t newlen; /* size of new value */
+};
+.EE
+.in
+.PP
+This call does a search in a tree structure, possibly resembling
+a directory tree under
+.IR /proc/sys ,
+and if the requested item is found calls some appropriate routine
+to read or modify the value.
+.SH RETURN VALUE
+Upon successful completion,
+.BR _sysctl ()
+returns 0.
+Otherwise, a value of \-1 is returned and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.BR EACCES ", " EPERM
+No search permission for one of the encountered "directories",
+or no read permission where
+.I oldval
+was nonzero, or no write permission where
+.I newval
+was nonzero.
+.TP
+.B EFAULT
+The invocation asked for the previous value by setting
+.I oldval
+non-NULL, but allowed zero room in
+.IR oldlenp .
+.TP
+.B ENOTDIR
+.I name
+was not found.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 1.3.57.
+Removed in Linux 5.5, glibc 2.32.
+.PP
+It originated in
+4.4BSD.
+Only Linux has the
+.I /proc/sys
+mirror, and the object naming schemes differ between Linux and 4.4BSD,
+but the declaration of the
+.BR sysctl ()
+function is the same in both.
+.SH NOTES
+Use of this system call was long discouraged:
+since Linux 2.6.24,
+uses of this system call result in warnings in the kernel log,
+and in Linux 5.5, the system call was finally removed.
+Use the
+.I /proc/sys
+interface instead.
+.PP
+Note that on older kernels where this system call still exists,
+it is available only if the kernel was configured with the
+.B CONFIG_SYSCTL_SYSCALL
+option.
+Furthermore, glibc does not provide a wrapper for this system call,
+necessitating the use of
+.BR syscall (2).
+.SH BUGS
+The object names vary between kernel versions,
+making this system call worthless for applications.
+.PP
+Not all available objects are properly documented.
+.PP
+It is not yet possible to change operating system by writing to
+.IR /proc/sys/kernel/ostype .
+.SH EXAMPLES
+.\" SRC BEGIN (sysctl.c)
+.EX
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+\&
+#include <linux/sysctl.h>
+\&
+#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
+\&
+int _sysctl(struct __sysctl_args *args);
+\&
+#define OSNAMESZ 100
+\&
+int
+main(void)
+{
+ int name[] = { CTL_KERN, KERN_OSTYPE };
+ char osname[OSNAMESZ];
+ size_t osnamelth;
+ struct __sysctl_args args;
+\&
+ memset(&args, 0, sizeof(args));
+ args.name = name;
+ args.nlen = ARRAY_SIZE(name);
+ args.oldval = osname;
+ args.oldlenp = &osnamelth;
+\&
+ osnamelth = sizeof(osname);
+\&
+ if (syscall(SYS__sysctl, &args) == \-1) {
+ perror("_sysctl");
+ exit(EXIT_FAILURE);
+ }
+ printf("This machine is running %*s\en", (int) osnamelth, osname);
+ exit(EXIT_SUCCESS);
+}
+.EE
+.\" SRC END
+.SH SEE ALSO
+.BR proc (5)
diff --git a/man2/sysfs.2 b/man2/sysfs.2
new file mode 100644
index 0000000..d650a9c
--- /dev/null
+++ b/man2/sysfs.2
@@ -0,0 +1,97 @@
+.\" Copyright (C) 1995, Thomas K. Dyas <tdyas@eden.rutgers.edu>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Created Wed Aug 9 1995 Thomas K. Dyas <tdyas@eden.rutgers.edu>
+.\"
+.TH sysfs 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+sysfs \- get filesystem type information
+.SH SYNOPSIS
+.nf
+.BI "[[deprecated]] int sysfs(int " option ", const char *" fsname );
+.BI "[[deprecated]] int sysfs(int " option ", unsigned int " fs_index ", char *" buf );
+.BI "[[deprecated]] int sysfs(int " option );
+.fi
+.SH DESCRIPTION
+.BR "Note" :
+if you are looking for information about the
+.B sysfs
+filesystem that is normally mounted at
+.IR /sys ,
+see
+.BR sysfs (5).
+.PP
+The (obsolete)
+.BR sysfs ()
+system call returns information about the filesystem types
+currently present in the kernel.
+The specific form of the
+.BR sysfs ()
+call and the information returned depend on the
+.I option
+in effect:
+.TP 3
+.B 1
+Translate the filesystem identifier string
+.I fsname
+into a filesystem type index.
+.TP
+.B 2
+Translate the filesystem type index
+.I fs_index
+into a null-terminated filesystem identifier string.
+This string will
+be written to the buffer pointed to by
+.IR buf .
+Make sure that
+.I buf
+has enough space to accept the string.
+.TP
+.B 3
+Return the total number of filesystem types currently present in the
+kernel.
+.PP
+The numbering of the filesystem type indexes begins with zero.
+.SH RETURN VALUE
+On success,
+.BR sysfs ()
+returns the filesystem index for option
+.BR 1 ,
+zero for option
+.BR 2 ,
+and the number of currently configured filesystems for option
+.BR 3 .
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EFAULT
+.RI "Either " fsname " or " buf
+is outside your accessible address space.
+.TP
+.B EINVAL
+.I fsname
+is not a valid filesystem type identifier;
+.I fs_index
+is out-of-bounds;
+.I option
+is invalid.
+.SH STANDARDS
+None.
+.SH HISTORY
+SVr4.
+.PP
+This System-V derived system call is obsolete; don't use it.
+On systems with
+.IR /proc ,
+the same information can be obtained via
+.IR /proc ;
+use that interface instead.
+.SH BUGS
+There is no libc or glibc support.
+There is no way to guess how large \fIbuf\fP should be.
+.SH SEE ALSO
+.BR proc (5),
+.BR sysfs (5)
diff --git a/man2/sysinfo.2 b/man2/sysinfo.2
new file mode 100644
index 0000000..fc44136
--- /dev/null
+++ b/man2/sysinfo.2
@@ -0,0 +1,106 @@
+.\" Copyright (C) 2016, Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Based on an earlier version of the page where a few pieces were
+.\" copyright (C) 1993 by Dan Miner (dminer@nyx.cs.du.edu) and subsequently
+.\" others (see old changelog below).
+.\" The structure definitions are taken more or less straight from the kernel
+.\" source files.
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\"
+.\" Modified Sat Jul 24 12:35:12 1993 by Rik Faith <faith@cs.unc.edu>
+.\" Modified Tue Oct 22 22:29:51 1996 by Eric S. Raymond <esr@thyrsus.com>
+.\" Modified Mon Aug 25 16:06:11 1997 by Nicolás Lichtmaier <nick@debian.org>
+.\"
+.TH sysinfo 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+sysinfo \- return system information
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/sysinfo.h>
+.PP
+.BI "int sysinfo(struct sysinfo *" info );
+.fi
+.SH DESCRIPTION
+.BR sysinfo ()
+returns certain statistics on memory and swap usage,
+as well as the load average.
+.PP
+Until Linux 2.3.16,
+.BR sysinfo ()
+returned information in the following structure:
+.PP
+.in +4n
+.EX
+struct sysinfo {
+ long uptime; /* Seconds since boot */
+ unsigned long loads[3]; /* 1, 5, and 15 minute load averages */
+ unsigned long totalram; /* Total usable main memory size */
+ unsigned long freeram; /* Available memory size */
+ unsigned long sharedram; /* Amount of shared memory */
+ unsigned long bufferram; /* Memory used by buffers */
+ unsigned long totalswap; /* Total swap space size */
+ unsigned long freeswap; /* Swap space still available */
+ unsigned short procs; /* Number of current processes */
+ char _f[22]; /* Pads structure to 64 bytes */
+};
+.EE
+.in
+.PP
+In the above structure, the sizes of the memory and swap fields
+are given in bytes.
+.PP
+Since Linux 2.3.23 (i386) and Linux 2.3.48
+(all architectures) the structure is:
+.PP
+.in +4n
+.EX
+struct sysinfo {
+ long uptime; /* Seconds since boot */
+ unsigned long loads[3]; /* 1, 5, and 15 minute load averages */
+ unsigned long totalram; /* Total usable main memory size */
+ unsigned long freeram; /* Available memory size */
+ unsigned long sharedram; /* Amount of shared memory */
+ unsigned long bufferram; /* Memory used by buffers */
+ unsigned long totalswap; /* Total swap space size */
+ unsigned long freeswap; /* Swap space still available */
+ unsigned short procs; /* Number of current processes */
+ unsigned long totalhigh; /* Total high memory size */
+ unsigned long freehigh; /* Available high memory size */
+ unsigned int mem_unit; /* Memory unit size in bytes */
+ char _f[20\-2*sizeof(long)\-sizeof(int)];
+ /* Padding to 64 bytes */
+};
+.EE
+.in
+.PP
+In the above structure,
+sizes of the memory and swap fields are given as multiples of
+.I mem_unit
+bytes.
+.SH RETURN VALUE
+On success,
+.BR sysinfo ()
+returns zero.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EFAULT
+.I info
+is not a valid address.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 0.98.pl6.
+.SH NOTES
+All of the information provided by this system call is also available via
+.I /proc/meminfo
+and
+.IR /proc/loadavg .
+.SH SEE ALSO
+.BR proc (5)
diff --git a/man2/syslog.2 b/man2/syslog.2
new file mode 100644
index 0000000..4e90778
--- /dev/null
+++ b/man2/syslog.2
@@ -0,0 +1,378 @@
+'\" t
+.\" Copyright (C) 1995 Andries Brouwer (aeb@cwi.nl)
+.\" and Copyright (C) 2012, 2014 Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Written 11 June 1995 by Andries Brouwer <aeb@cwi.nl>
+.\" 2008-02-15, Jeremy Kerr <jk@ozlabs.org>
+.\" Add info on command type 10; add details on types 6, 7, 8, & 9.
+.\" 2008-02-15, Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Update LOG_BUF_LEN details; update RETURN VALUE section.
+.\"
+.TH syslog 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+syslog, klogctl \- read and/or clear kernel message ring buffer;
+set console_loglevel
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#include <sys/klog.h>" " /* Definition of " SYSLOG_* " constants */"
+.BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */"
+.B #include <unistd.h>
+.PP
+.BI "int syscall(SYS_syslog, int " type ", char *" bufp ", int " len );
+.PP
+/* The glibc interface */
+.B #include <sys/klog.h>
+.PP
+.BI "int klogctl(int " type ", char *" bufp ", int " len );
+.fi
+.SH DESCRIPTION
+.IR Note :
+Probably, you are looking for the C library function
+.BR syslog (),
+which talks to
+.BR syslogd (8);
+see
+.BR syslog (3)
+for details.
+.PP
+This page describes the kernel
+.BR syslog ()
+system call, which is used to control the kernel
+.IR printk ()
+buffer; the glibc wrapper function for the system call is called
+.BR klogctl ().
+.SS The kernel log buffer
+The kernel has a cyclic buffer of length
+.B LOG_BUF_LEN
+in which messages given as arguments to the kernel function
+.BR printk ()
+are stored (regardless of their log level).
+In early kernels,
+.B LOG_BUF_LEN
+had the value 4096;
+from Linux 1.3.54, it was 8192;
+from Linux 2.1.113, it was 16384;
+since Linux 2.4.23/2.6, the value is a kernel configuration option
+.RB ( CONFIG_LOG_BUF_SHIFT ,
+default value dependent on the architecture).
+.\" Under "General setup" ==> "Kernel log buffer size"
+.\" For Linux 2.6, precisely the option seems to have appeared in Linux 2.5.55.
+Since Linux 2.6.6, the size can be queried with command type 10 (see below).
+.SS Commands
+The \fItype\fP argument determines the action taken by this function.
+The list below specifies the values for
+.IR type .
+The symbolic names are defined in the kernel source,
+but are not exported to user space;
+you will either need to use the numbers, or define the names yourself.
+.TP
+.BR SYSLOG_ACTION_CLOSE " (0)"
+Close the log.
+Currently a NOP.
+.TP
+.BR SYSLOG_ACTION_OPEN " (1)"
+Open the log.
+Currently a NOP.
+.TP
+.BR SYSLOG_ACTION_READ " (2)"
+Read from the log.
+The call
+waits until the kernel log buffer is nonempty, and then reads
+at most \fIlen\fP bytes into the buffer pointed to by
+.IR bufp .
+The call returns the number of bytes read.
+Bytes read from the log disappear from the log buffer:
+the information can be read only once.
+This is the function executed by the kernel when a user program reads
+.IR /proc/kmsg .
+.TP
+.BR SYSLOG_ACTION_READ_ALL " (3)"
+Read all messages remaining in the ring buffer,
+placing them in the buffer pointed to by
+.IR bufp .
+The call reads the last \fIlen\fP
+bytes from the log buffer (nondestructively),
+but will not read more than was written into the buffer since the
+last "clear ring buffer" command (see command 5 below).
+The call returns the number of bytes read.
+.TP
+.BR SYSLOG_ACTION_READ_CLEAR " (4)"
+Read and clear all messages remaining in the ring buffer.
+The call does precisely the same as for a
+.I type
+of 3, but also executes the "clear ring buffer" command.
+.TP
+.BR SYSLOG_ACTION_CLEAR " (5)"
+The call executes just the "clear ring buffer" command.
+The
+.I bufp
+and
+.I len
+arguments are ignored.
+.IP
+This command does not really clear the ring buffer.
+Rather, it sets a kernel bookkeeping variable that
+determines the results returned by commands 3
+.RB ( SYSLOG_ACTION_READ_ALL )
+and 4
+.RB ( SYSLOG_ACTION_READ_CLEAR ).
+This command has no effect on commands 2
+.RB ( SYSLOG_ACTION_READ )
+and 9
+.RB ( SYSLOG_ACTION_SIZE_UNREAD ).
+.TP
+.BR SYSLOG_ACTION_CONSOLE_OFF " (6)"
+The command saves the current value of
+.I console_loglevel
+and then sets
+.I console_loglevel
+to
+.IR minimum_console_loglevel ,
+so that no messages are printed to the console.
+Before Linux 2.6.32,
+.\" commit 1aaad49e856ce41adc07d8ae0c8ef35fc4483245
+the command simply sets
+.I console_loglevel
+to
+.IR minimum_console_loglevel .
+See the discussion of
+.IR /proc/sys/kernel/printk ,
+below.
+.IP
+The
+.I bufp
+and
+.I len
+arguments are ignored.
+.TP
+.BR SYSLOG_ACTION_CONSOLE_ON " (7)"
+If a previous
+.B SYSLOG_ACTION_CONSOLE_OFF
+command has been performed,
+this command restores
+.I console_loglevel
+to the value that was saved by that command.
+Before Linux 2.6.32,
+.\" commit 1aaad49e856ce41adc07d8ae0c8ef35fc4483245
+this command simply sets
+.I console_loglevel
+to
+.IR default_console_loglevel .
+See the discussion of
+.IR /proc/sys/kernel/printk ,
+below.
+.IP
+The
+.I bufp
+and
+.I len
+arguments are ignored.
+.TP
+.BR SYSLOG_ACTION_CONSOLE_LEVEL " (8)"
+The call sets
+.I console_loglevel
+to the value given in
+.IR len ,
+which must be an integer between 1 and 8 (inclusive).
+The kernel silently enforces a minimum value of
+.I minimum_console_loglevel
+for
+.IR len .
+See the
+.I log level
+section for details.
+The
+.I bufp
+argument is ignored.
+.TP
+.BR SYSLOG_ACTION_SIZE_UNREAD " (9) (since Linux 2.4.10)"
+The call
+returns the number of bytes currently available to be read
+from the kernel log buffer via command 2
+.RB ( SYSLOG_ACTION_READ ).
+The
+.I bufp
+and
+.I len
+arguments are ignored.
+.TP
+.BR SYSLOG_ACTION_SIZE_BUFFER " (10) (since Linux 2.6.6)"
+This command returns the total size of the kernel log buffer.
+The
+.I bufp
+and
+.I len
+arguments are ignored.
+.PP
+All commands except 3 and 10 require privilege.
+In Linux kernels before Linux 2.6.37,
+command types 3 and 10 are allowed to unprivileged processes;
+since Linux 2.6.37,
+these commands are allowed to unprivileged processes only if
+.I /proc/sys/kernel/dmesg_restrict
+has the value 0.
+Before Linux 2.6.37, "privileged" means that the caller has the
+.B CAP_SYS_ADMIN
+capability.
+Since Linux 2.6.37,
+"privileged" means that the caller has either the
+.B CAP_SYS_ADMIN
+capability (now deprecated for this purpose) or the (new)
+.B CAP_SYSLOG
+capability.
+.\"
+.\"
+.SS /proc/sys/kernel/printk
+.I /proc/sys/kernel/printk
+is a writable file containing four integer values that influence kernel
+.I printk()
+behavior when printing or logging error messages.
+The four values are:
+.TP
+.I console_loglevel
+Only messages with a log level lower than this value will
+be printed to the console.
+The default value for this field is
+.B DEFAULT_CONSOLE_LOGLEVEL
+(7), but it is set to
+4 if the kernel command line contains the word "quiet",\" since Linux 2.4
+10 if the kernel command line contains the word "debug",
+and to 15 in case
+of a kernel fault (the 10 and 15 are just silly, and equivalent to 8).
+The value of
+.I console_loglevel
+can be set (to a value in the range 1\[en]8) by a
+.BR syslog ()
+call with a
+.I type
+of 8.
+.TP
+.I default_message_loglevel
+This value will be used as the log level for
+.I printk()
+messages that do not have an explicit level.
+Up to and including Linux 2.6.38,
+the hard-coded default value for this field was 4
+.RB ( KERN_WARNING );
+since Linux 2.6.39,
+.\" commit 5af5bcb8d37f99ba415a1adc6da71051b84f93a5
+the default value is defined by the kernel configuration option
+.BR CONFIG_DEFAULT_MESSAGE_LOGLEVEL ,
+which defaults to 4.
+.TP
+.I minimum_console_loglevel
+The value in this field is the minimum value to which
+.I console_loglevel
+can be set.
+.TP
+.I default_console_loglevel
+This is the default value for
+.IR console_loglevel .
+.\"
+.\"
+.SS The log level
+Every
+.IR printk ()
+message has its own log level.
+If the log level is not explicitly specified as part of the message,
+it defaults to
+.IR default_message_loglevel .
+The conventional meaning of the log level is as follows:
+.TS
+lB lB lB
+lB c l.
+Kernel constant Level value Meaning
+KERN_EMERG 0 System is unusable
+KERN_ALERT 1 T{
+Action must be taken immediately
+T}
+KERN_CRIT 2 Critical conditions
+KERN_ERR 3 Error conditions
+KERN_WARNING 4 Warning conditions
+KERN_NOTICE 5 T{
+Normal but significant condition
+T}
+KERN_INFO 6 Informational
+KERN_DEBUG 7 Debug-level messages
+.TE
+.sp 1
+The kernel
+.I printk()
+routine will print a message on the
+console only if it has a log level less than the value of
+.IR console_loglevel .
+.SH RETURN VALUE
+For \fItype\fP equal to 2, 3, or 4, a successful call to
+.BR syslog ()
+returns the number
+of bytes read.
+For \fItype\fP 9,
+.BR syslog ()
+returns the number of bytes currently
+available to be read on the kernel log buffer.
+For \fItype\fP 10,
+.BR syslog ()
+returns the total size of the kernel log buffer.
+For other values of \fItype\fP, 0 is returned on success.
+.PP
+In case of error, \-1 is returned,
+and \fIerrno\fP is set to indicate the error.
+.SH ERRORS
+.TP
+.B EINVAL
+Bad arguments (e.g.,
+bad
+.IR type ;
+or for
+.I type
+2, 3, or 4,
+.I bufp
+is NULL,
+or
+.I len
+is less than zero; or for
+.I type
+8, the
+.I level
+is outside the range 1 to 8).
+.TP
+.B ENOSYS
+This
+.BR syslog ()
+system call is not available, because the kernel was compiled with the
+.B CONFIG_PRINTK
+kernel-configuration option disabled.
+.TP
+.B EPERM
+An attempt was made to change
+.I console_loglevel
+or clear the kernel
+message ring buffer by a process without sufficient privilege
+(more precisely: without the
+.B CAP_SYS_ADMIN
+or
+.B CAP_SYSLOG
+capability).
+.TP
+.B ERESTARTSYS
+System call was interrupted by a signal; nothing was read.
+(This can be seen only during a trace.)
+.SH STANDARDS
+Linux.
+.SH HISTORY
+From the very start, people noted that it is unfortunate that
+a system call and a library routine of the same name are entirely
+different animals.
+.\" In libc4 and libc5 the number of this call was defined by
+.\" .BR SYS_klog .
+.\" In glibc 2.0 the syscall is baptized
+.\" .BR klogctl ().
+.SH SEE ALSO
+.BR dmesg (1),
+.BR syslog (3),
+.BR capabilities (7)
diff --git a/man2/tee.2 b/man2/tee.2
new file mode 100644
index 0000000..7a3a6b1
--- /dev/null
+++ b/man2/tee.2
@@ -0,0 +1,199 @@
+.\" This manpage is Copyright (C) 2006 Jens Axboe
+.\" and Copyright (C) 2006 Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.TH tee 2 2023-05-03 "Linux man-pages 6.05.01"
+.SH NAME
+tee \- duplicating pipe content
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#define _GNU_SOURCE" " /* See feature_test_macros(7) */"
+.B #include <fcntl.h>
+.PP
+.BI "ssize_t tee(int " fd_in ", int " fd_out ", size_t " len \
+", unsigned int " flags );
+.fi
+.\" Return type was long before glibc 2.7
+.SH DESCRIPTION
+.\" Example programs http://brick.kernel.dk/snaps
+.\"
+.\"
+.\" add a "tee(in, out1, out2)" system call that duplicates the pages
+.\" (again, incrementing their reference count, not copying the data) from
+.\" one pipe to two other pipes.
+.BR tee ()
+duplicates up to
+.I len
+bytes of data from the pipe referred to by the file descriptor
+.I fd_in
+to the pipe referred to by the file descriptor
+.IR fd_out .
+It does not consume the data that is duplicated from
+.IR fd_in ;
+therefore, that data can be copied by a subsequent
+.BR splice (2).
+.PP
+.I flags
+is a bit mask that is composed by ORing together
+zero or more of the following values:
+.TP 1.9i
+.B SPLICE_F_MOVE
+Currently has no effect for
+.BR tee ();
+see
+.BR splice (2).
+.TP
+.B SPLICE_F_NONBLOCK
+Do not block on I/O; see
+.BR splice (2)
+for further details.
+.TP
+.B SPLICE_F_MORE
+Currently has no effect for
+.BR tee (),
+but may be implemented in the future; see
+.BR splice (2).
+.TP
+.B SPLICE_F_GIFT
+Unused for
+.BR tee ();
+see
+.BR vmsplice (2).
+.SH RETURN VALUE
+Upon successful completion,
+.BR tee ()
+returns the number of bytes that were duplicated between the input
+and output.
+A return value of 0 means that there was no data to transfer,
+and it would not make sense to block, because there are no
+writers connected to the write end of the pipe referred to by
+.IR fd_in .
+.PP
+On error,
+.BR tee ()
+returns \-1 and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EAGAIN
+.B SPLICE_F_NONBLOCK
+was specified in
+.I flags
+or one of the file descriptors had been marked as nonblocking
+.RB ( O_NONBLOCK ),
+and the operation would block.
+.TP
+.B EINVAL
+.I fd_in
+or
+.I fd_out
+does not refer to a pipe; or
+.I fd_in
+and
+.I fd_out
+refer to the same pipe.
+.TP
+.B ENOMEM
+Out of memory.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 2.6.17,
+glibc 2.5.
+.SH NOTES
+Conceptually,
+.BR tee ()
+copies the data between the two pipes.
+In reality no real data copying takes place though:
+under the covers,
+.BR tee ()
+assigns data to the output by merely grabbing
+a reference to the input.
+.SH EXAMPLES
+The example below implements a basic
+.BR tee (1)
+program using the
+.BR tee ()
+system call.
+Here is an example of its use:
+.PP
+.in +4n
+.EX
+$ \fBdate | ./a.out out.log | cat\fP
+Tue Oct 28 10:06:00 CET 2014
+$ \fBcat out.log\fP
+Tue Oct 28 10:06:00 CET 2014
+.EE
+.in
+.SS Program source
+\&
+.\" SRC BEGIN (tee.c)
+.EX
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+\&
+int
+main(int argc, char *argv[])
+{
+ int fd;
+ ssize_t len, slen;
+\&
+ if (argc != 2) {
+ fprintf(stderr, "Usage: %s <file>\en", argv[0]);
+ exit(EXIT_FAILURE);
+ }
+\&
+ fd = open(argv[1], O_WRONLY | O_CREAT | O_TRUNC, 0644);
+ if (fd == \-1) {
+ perror("open");
+ exit(EXIT_FAILURE);
+ }
+\&
+ for (;;) {
+ /*
+ * tee stdin to stdout.
+ */
+ len = tee(STDIN_FILENO, STDOUT_FILENO,
+ INT_MAX, SPLICE_F_NONBLOCK);
+ if (len < 0) {
+ if (errno == EAGAIN)
+ continue;
+ perror("tee");
+ exit(EXIT_FAILURE);
+ }
+ if (len == 0)
+ break;
+\&
+ /*
+ * Consume stdin by splicing it to a file.
+ */
+ while (len > 0) {
+ slen = splice(STDIN_FILENO, NULL, fd, NULL,
+ len, SPLICE_F_MOVE);
+ if (slen < 0) {
+ perror("splice");
+ exit(EXIT_FAILURE);
+ }
+ len \-= slen;
+ }
+ }
+\&
+ close(fd);
+ exit(EXIT_SUCCESS);
+}
+.EE
+.\" SRC END
+.SH SEE ALSO
+.BR splice (2),
+.BR vmsplice (2),
+.BR pipe (7)
diff --git a/man2/tgkill.2 b/man2/tgkill.2
new file mode 100644
index 0000000..82fc2d6
--- /dev/null
+++ b/man2/tgkill.2
@@ -0,0 +1 @@
+.so man2/tkill.2
diff --git a/man2/time.2 b/man2/time.2
new file mode 100644
index 0000000..f121e9c
--- /dev/null
+++ b/man2/time.2
@@ -0,0 +1,117 @@
+.\" Copyright (c) 1992 Drew Eckhardt (drew@cs.colorado.edu), March 28, 1992
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified by Michael Haardt <michael@moria.de>
+.\" Modified Sat Jul 24 14:13:40 1993 by Rik Faith <faith@cs.unc.edu>
+.\" Additions by Joseph S. Myers <jsm28@cam.ac.uk>, 970909
+.\"
+.TH time 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+time \- get time in seconds
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <time.h>
+.PP
+.BI "time_t time(time_t *_Nullable " tloc );
+.fi
+.SH DESCRIPTION
+.BR time ()
+returns the time as the number of seconds since the
+Epoch, 1970-01-01 00:00:00 +0000 (UTC).
+.PP
+If
+.I tloc
+is non-NULL,
+the return value is also stored in the memory pointed to by
+.IR tloc .
+.SH RETURN VALUE
+On success, the value of time in seconds since the Epoch is returned.
+On error, \fI((time_t)\ \-1)\fP is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EFAULT
+.I tloc
+points outside your accessible address space (but see BUGS).
+.IP
+On systems where the C library
+.BR time ()
+wrapper function invokes an implementation provided by the
+.BR vdso (7)
+(so that there is no trap into the kernel),
+an invalid address may instead trigger a
+.B SIGSEGV
+signal.
+.SH VERSIONS
+POSIX.1 defines
+.I seconds since the Epoch
+using a formula that approximates the number of seconds between a
+specified time and the Epoch.
+This formula takes account of the facts that
+all years that are evenly divisible by 4 are leap years,
+but years that are evenly divisible by 100 are not leap years
+unless they are also evenly divisible by 400,
+in which case they are leap years.
+This value is not the same as the actual number of seconds between the time
+and the Epoch, because of leap seconds and because system clocks are not
+required to be synchronized to a standard reference.
+The intention is that the interpretation of seconds since the Epoch values be
+consistent; see POSIX.1-2008 Rationale A.4.15 for further rationale.
+.PP
+On Linux, a call to
+.BR time ()
+with
+.I tloc
+specified as NULL cannot fail with the error
+.BR EOVERFLOW ,
+even on ABIs where
+.I time_t
+is a signed 32-bit integer and the clock reaches or exceeds 2**31 seconds
+(2038-01-19 03:14:08 UTC, ignoring leap seconds).
+(POSIX.1 permits, but does not require, the
+.B EOVERFLOW
+error in the case where the seconds since the Epoch will not fit in
+.IR time_t .)
+Instead, the behavior on Linux is undefined when the system time is out of the
+.I time_t
+range.
+Applications intended to run after 2038 should use ABIs with
+.I time_t
+wider than 32 bits.
+.SS C library/kernel differences
+On some architectures, an implementation of
+.BR time ()
+is provided in the
+.BR vdso (7).
+.SH STANDARDS
+C11, POSIX.1-2008.
+.SH HISTORY
+SVr4, 4.3BSD, C89, POSIX.1-2001.
+.\" Under 4.3BSD, this call is obsoleted by
+.\" .BR gettimeofday (2).
+.SH BUGS
+Error returns from this system call are indistinguishable from
+successful reports that the time is a few seconds
+.I before
+the Epoch, so the C library wrapper function never sets
+.I errno
+as a result of this call.
+.PP
+The
+.I tloc
+argument is obsolescent and should always be NULL in new code.
+When
+.I tloc
+is NULL, the call cannot fail.
+.SH SEE ALSO
+.BR date (1),
+.BR gettimeofday (2),
+.BR ctime (3),
+.BR ftime (3),
+.BR time (7),
+.BR vdso (7)
diff --git a/man2/timer_create.2 b/man2/timer_create.2
new file mode 100644
index 0000000..3265b27
--- /dev/null
+++ b/man2/timer_create.2
@@ -0,0 +1,487 @@
+.\" Copyright (c) 2009 Linux Foundation, written by Michael Kerrisk
+.\" <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.TH timer_create 2 2023-05-03 "Linux man-pages 6.05.01"
+.SH NAME
+timer_create \- create a POSIX per-process timer
+.SH LIBRARY
+Real-time library
+.RI ( librt ", " \-lrt )
+.SH SYNOPSIS
+.nf
+.BR "#include <signal.h>" " /* Definition of " SIGEV_* " constants */"
+.B #include <time.h>
+.PP
+.BI "int timer_create(clockid_t " clockid ,
+.BI " struct sigevent *_Nullable restrict " sevp ,
+.BI " timer_t *restrict " timerid );
+.fi
+.PP
+.RS -4
+Feature Test Macro Requirements for glibc (see
+.BR feature_test_macros (7)):
+.RE
+.PP
+.BR timer_create ():
+.nf
+ _POSIX_C_SOURCE >= 199309L
+.fi
+.SH DESCRIPTION
+.BR timer_create ()
+creates a new per-process interval timer.
+The ID of the new timer is returned in the buffer pointed to by
+.IR timerid ,
+which must be a non-null pointer.
+This ID is unique within the process, until the timer is deleted.
+The new timer is initially disarmed.
+.PP
+The
+.I clockid
+argument specifies the clock that the new timer uses to measure time.
+It can be specified as one of the following values:
+.TP
+.B CLOCK_REALTIME
+A settable system-wide real-time clock.
+.TP
+.B CLOCK_MONOTONIC
+A nonsettable monotonically increasing clock that measures time
+from some unspecified point in the past that does not change
+after system startup.
+.\" Note: the CLOCK_MONOTONIC_RAW clock added for clock_gettime()
+.\" in Linux 2.6.28 is not supported for POSIX timers -- mtk, Feb 2009
+.TP
+.BR CLOCK_PROCESS_CPUTIME_ID " (since Linux 2.6.12)"
+A clock that measures (user and system) CPU time consumed by
+(all of the threads in) the calling process.
+.TP
+.BR CLOCK_THREAD_CPUTIME_ID " (since Linux 2.6.12)"
+A clock that measures (user and system) CPU time consumed by
+the calling thread.
+.\" The CLOCK_MONOTONIC_RAW that was added in Linux 2.6.28 can't be used
+.\" to create a timer -- mtk, Feb 2009
+.TP
+.BR CLOCK_BOOTTIME " (since Linux 2.6.39)"
+.\" commit 70a08cca1227dc31c784ec930099a4417a06e7d0
+Like
+.BR CLOCK_MONOTONIC ,
+this is a monotonically increasing clock.
+However, whereas the
+.B CLOCK_MONOTONIC
+clock does not measure the time while a system is suspended, the
+.B CLOCK_BOOTTIME
+clock does include the time during which the system is suspended.
+This is useful for applications that need to be suspend-aware.
+.B CLOCK_REALTIME
+is not suitable for such applications, since that clock is affected
+by discontinuous changes to the system clock.
+.TP
+.BR CLOCK_REALTIME_ALARM " (since Linux 3.0)"
+.\" commit 9a7adcf5c6dea63d2e47e6f6d2f7a6c9f48b9337
+This clock is like
+.BR CLOCK_REALTIME ,
+but will wake the system if it is suspended.
+The caller must have the
+.B CAP_WAKE_ALARM
+capability in order to set a timer against this clock.
+.TP
+.BR CLOCK_BOOTTIME_ALARM " (since Linux 3.0)"
+.\" commit 9a7adcf5c6dea63d2e47e6f6d2f7a6c9f48b9337
+This clock is like
+.BR CLOCK_BOOTTIME ,
+but will wake the system if it is suspended.
+The caller must have the
+.B CAP_WAKE_ALARM
+capability in order to set a timer against this clock.
+.TP
+.BR CLOCK_TAI " (since Linux 3.10)"
+A system-wide clock derived from wall-clock time but ignoring leap seconds.
+.PP
+See
+.BR clock_getres (2)
+for some further details on the above clocks.
+.PP
+As well as the above values,
+.I clockid
+can be specified as the
+.I clockid
+returned by a call to
+.BR clock_getcpuclockid (3)
+or
+.BR pthread_getcpuclockid (3).
+.PP
+The
+.I sevp
+argument points to a
+.I sigevent
+structure that specifies how the caller
+should be notified when the timer expires.
+For the definition and general details of this structure, see
+.BR sigevent (7).
+.PP
+The
+.I sevp\->sigev_notify
+field can have the following values:
+.TP
+.B SIGEV_NONE
+Don't asynchronously notify when the timer expires.
+Progress of the timer can be monitored using
+.BR timer_gettime (2).
+.TP
+.B SIGEV_SIGNAL
+Upon timer expiration, generate the signal
+.I sigev_signo
+for the process.
+See
+.BR sigevent (7)
+for general details.
+The
+.I si_code
+field of the
+.I siginfo_t
+structure will be set to
+.BR SI_TIMER .
+At any point in time,
+at most one signal is queued to the process for a given timer; see
+.BR timer_getoverrun (2)
+for more details.
+.TP
+.B SIGEV_THREAD
+Upon timer expiration, invoke
+.I sigev_notify_function
+as if it were the start function of a new thread.
+See
+.BR sigevent (7)
+for details.
+.TP
+.BR SIGEV_THREAD_ID " (Linux-specific)"
+As for
+.BR SIGEV_SIGNAL ,
+but the signal is targeted at the thread whose ID is given in
+.IR sigev_notify_thread_id ,
+which must be a thread in the same process as the caller.
+The
+.I sigev_notify_thread_id
+field specifies a kernel thread ID, that is, the value returned by
+.BR clone (2)
+or
+.BR gettid (2).
+This flag is intended only for use by threading libraries.
+.PP
+Specifying
+.I sevp
+as NULL is equivalent to specifying a pointer to a
+.I sigevent
+structure in which
+.I sigev_notify
+is
+.BR SIGEV_SIGNAL ,
+.I sigev_signo
+is
+.BR SIGALRM ,
+and
+.I sigev_value.sival_int
+is the timer ID.
+.SH RETURN VALUE
+On success,
+.BR timer_create ()
+returns 0, and the ID of the new timer is placed in
+.IR *timerid .
+On failure, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EAGAIN
+Temporary error during kernel allocation of timer structures.
+.TP
+.B EINVAL
+Clock ID,
+.IR sigev_notify ,
+.IR sigev_signo ,
+or
+.I sigev_notify_thread_id
+is invalid.
+.TP
+.B ENOMEM
+.\" glibc layer: malloc()
+Could not allocate memory.
+.TP
+.B ENOTSUP
+The kernel does not support creating a timer against this
+.IR clockid .
+.TP
+.B EPERM
+.I clockid
+was
+.B CLOCK_REALTIME_ALARM
+or
+.B CLOCK_BOOTTIME_ALARM
+but the caller did not have the
+.B CAP_WAKE_ALARM
+capability.
+.SH VERSIONS
+.SS C library/kernel differences
+Part of the implementation of the POSIX timers API is provided by glibc.
+.\" See nptl/sysdeps/unix/sysv/linux/timer_create.c
+In particular:
+.IP \[bu] 3
+Much of the functionality for
+.B SIGEV_THREAD
+is implemented within glibc, rather than the kernel.
+(This is necessarily so,
+since the thread involved in handling the notification is one
+that must be managed by the C library POSIX threads implementation.)
+Although the notification delivered to the process is via a thread,
+internally the NPTL implementation uses a
+.I sigev_notify
+value of
+.B SIGEV_THREAD_ID
+along with a real-time signal that is reserved by the implementation (see
+.BR nptl (7)).
+.IP \[bu]
+The implementation of the default case where
+.I sevp
+is NULL is handled inside glibc,
+which invokes the underlying system call with a suitably populated
+.I sigevent
+structure.
+.IP \[bu]
+The timer IDs presented at user level are maintained by glibc,
+which maps these IDs to the timer IDs employed by the kernel.
+.\" See the glibc source file kernel-posix-timers.h for the structure
+.\" that glibc uses to map user-space timer IDs to kernel timer IDs
+.\" The kernel-level timer ID is exposed via siginfo.si_tid.
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+Linux 2.6.
+POSIX.1-2001.
+.PP
+Prior to Linux 2.6,
+glibc provided an incomplete user-space implementation
+.RB ( CLOCK_REALTIME
+timers only) using POSIX threads,
+and before glibc 2.17,
+.\" glibc commit 93a78ac437ba44f493333d7e2a4b0249839ce460
+the implementation falls back to this technique on systems
+running kernels older than Linux 2.6.
+.SH NOTES
+A program may create multiple interval timers using
+.BR timer_create ().
+.PP
+Timers are not inherited by the child of a
+.BR fork (2),
+and are disarmed and deleted during an
+.BR execve (2).
+.PP
+The kernel preallocates a "queued real-time signal"
+for each timer created using
+.BR timer_create ().
+Consequently, the number of timers is limited by the
+.B RLIMIT_SIGPENDING
+resource limit (see
+.BR setrlimit (2)).
+.PP
+The timers created by
+.BR timer_create ()
+are commonly known as "POSIX (interval) timers".
+The POSIX timers API consists of the following interfaces:
+.TP
+.BR timer_create ()
+Create a timer.
+.TP
+.BR timer_settime (2)
+Arm (start) or disarm (stop) a timer.
+.TP
+.BR timer_gettime (2)
+Fetch the time remaining until the next expiration of a timer,
+along with the interval setting of the timer.
+.TP
+.BR timer_getoverrun (2)
+Return the overrun count for the last timer expiration.
+.TP
+.BR timer_delete (2)
+Disarm and delete a timer.
+.PP
+Since Linux 3.10, the
+.IR /proc/ pid /timers
+file can be used to list the POSIX timers for the process with PID
+.IR pid .
+See
+.BR proc (5)
+for further information.
+.PP
+Since Linux 4.10,
+.\" baa73d9e478ff32d62f3f9422822b59dd9a95a21
+support for POSIX timers is a configurable option that is enabled by default.
+Kernel support can be disabled via the
+.B CONFIG_POSIX_TIMERS
+option.
+.SH EXAMPLES
+The program below takes two arguments: a sleep period in seconds,
+and a timer frequency in nanoseconds.
+The program establishes a handler for the signal it uses for the timer,
+blocks that signal,
+creates and arms a timer that expires with the given frequency,
+sleeps for the specified number of seconds,
+and then unblocks the timer signal.
+Assuming that the timer expired at least once while the program slept,
+the signal handler will be invoked,
+and the handler displays some information about the timer notification.
+The program terminates after one invocation of the signal handler.
+.PP
+In the following example run, the program sleeps for 1 second,
+after creating a timer that has a frequency of 100 nanoseconds.
+By the time the signal is unblocked and delivered,
+there have been around ten million overruns.
+.PP
+.in +4n
+.EX
+$ \fB./a.out 1 100\fP
+Establishing handler for signal 34
+Blocking signal 34
+timer ID is 0x804c008
+Sleeping for 1 seconds
+Unblocking signal 34
+Caught signal 34
+ sival_ptr = 0xbfb174f4; *sival_ptr = 0x804c008
+ overrun count = 10004886
+.EE
+.in
+.SS Program source
+\&
+.\" SRC BEGIN (timer_create.c)
+.EX
+#include <signal.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <unistd.h>
+\&
+#define CLOCKID CLOCK_REALTIME
+#define SIG SIGRTMIN
+\&
+#define errExit(msg) do { perror(msg); exit(EXIT_FAILURE); \e
+ } while (0)
+\&
+static void
+print_siginfo(siginfo_t *si)
+{
+ int or;
+ timer_t *tidp;
+\&
+ tidp = si\->si_value.sival_ptr;
+\&
+ printf(" sival_ptr = %p; ", si\->si_value.sival_ptr);
+ printf(" *sival_ptr = %#jx\en", (uintmax_t) *tidp);
+\&
+ or = timer_getoverrun(*tidp);
+ if (or == \-1)
+ errExit("timer_getoverrun");
+ else
+ printf(" overrun count = %d\en", or);
+}
+\&
+static void
+handler(int sig, siginfo_t *si, void *uc)
+{
+ /* Note: calling printf() from a signal handler is not safe
+ (and should not be done in production programs), since
+ printf() is not async\-signal\-safe; see signal\-safety(7).
+ Nevertheless, we use printf() here as a simple way of
+ showing that the handler was called. */
+\&
+ printf("Caught signal %d\en", sig);
+ print_siginfo(si);
+ signal(sig, SIG_IGN);
+}
+\&
+int
+main(int argc, char *argv[])
+{
+ timer_t timerid;
+ sigset_t mask;
+ long long freq_nanosecs;
+ struct sigevent sev;
+ struct sigaction sa;
+ struct itimerspec its;
+\&
+ if (argc != 3) {
+ fprintf(stderr, "Usage: %s <sleep\-secs> <freq\-nanosecs>\en",
+ argv[0]);
+ exit(EXIT_FAILURE);
+ }
+\&
+ /* Establish handler for timer signal. */
+\&
+ printf("Establishing handler for signal %d\en", SIG);
+ sa.sa_flags = SA_SIGINFO;
+ sa.sa_sigaction = handler;
+ sigemptyset(&sa.sa_mask);
+ if (sigaction(SIG, &sa, NULL) == \-1)
+ errExit("sigaction");
+\&
+ /* Block timer signal temporarily. */
+\&
+ printf("Blocking signal %d\en", SIG);
+ sigemptyset(&mask);
+ sigaddset(&mask, SIG);
+ if (sigprocmask(SIG_SETMASK, &mask, NULL) == \-1)
+ errExit("sigprocmask");
+\&
+ /* Create the timer. */
+\&
+ sev.sigev_notify = SIGEV_SIGNAL;
+ sev.sigev_signo = SIG;
+ sev.sigev_value.sival_ptr = &timerid;
+ if (timer_create(CLOCKID, &sev, &timerid) == \-1)
+ errExit("timer_create");
+\&
+ printf("timer ID is %#jx\en", (uintmax_t) timerid);
+\&
+ /* Start the timer. */
+\&
+ freq_nanosecs = atoll(argv[2]);
+ its.it_value.tv_sec = freq_nanosecs / 1000000000;
+ its.it_value.tv_nsec = freq_nanosecs % 1000000000;
+ its.it_interval.tv_sec = its.it_value.tv_sec;
+ its.it_interval.tv_nsec = its.it_value.tv_nsec;
+\&
+ if (timer_settime(timerid, 0, &its, NULL) == \-1)
+ errExit("timer_settime");
+\&
+ /* Sleep for a while; meanwhile, the timer may expire
+ multiple times. */
+\&
+ printf("Sleeping for %d seconds\en", atoi(argv[1]));
+ sleep(atoi(argv[1]));
+\&
+ /* Unblock the timer signal, so that timer notification
+ can be delivered. */
+\&
+ printf("Unblocking signal %d\en", SIG);
+ if (sigprocmask(SIG_UNBLOCK, &mask, NULL) == \-1)
+ errExit("sigprocmask");
+\&
+ exit(EXIT_SUCCESS);
+}
+.EE
+.\" SRC END
+.SH SEE ALSO
+.ad l
+.nh
+.BR clock_gettime (2),
+.BR setitimer (2),
+.BR timer_delete (2),
+.BR timer_getoverrun (2),
+.BR timer_settime (2),
+.BR timerfd_create (2),
+.BR clock_getcpuclockid (3),
+.BR pthread_getcpuclockid (3),
+.BR pthreads (7),
+.BR sigevent (7),
+.BR signal (7),
+.BR time (7)
diff --git a/man2/timer_delete.2 b/man2/timer_delete.2
new file mode 100644
index 0000000..ee1468e
--- /dev/null
+++ b/man2/timer_delete.2
@@ -0,0 +1,58 @@
+.\" Copyright (c) 2009 Linux Foundation, written by Michael Kerrisk
+.\" <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.TH timer_delete 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+timer_delete \- delete a POSIX per-process timer
+.SH LIBRARY
+Real-time library
+.RI ( librt ", " \-lrt )
+.SH SYNOPSIS
+.nf
+.B #include <time.h>
+.PP
+.BI "int timer_delete(timer_t " timerid );
+.fi
+.PP
+.RS -4
+Feature Test Macro Requirements for glibc (see
+.BR feature_test_macros (7)):
+.RE
+.PP
+.BR timer_delete ():
+.nf
+ _POSIX_C_SOURCE >= 199309L
+.fi
+.SH DESCRIPTION
+.BR timer_delete ()
+deletes the timer whose ID is given in
+.IR timerid .
+If the timer was armed at the time of this call,
+it is disarmed before being deleted.
+The treatment of any pending signal generated by the deleted timer
+is unspecified.
+.SH RETURN VALUE
+On success,
+.BR timer_delete ()
+returns 0.
+On failure, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EINVAL
+.I timerid
+is not a valid timer ID.
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+Linux 2.6.
+POSIX.1-2001.
+.SH SEE ALSO
+.BR clock_gettime (2),
+.BR timer_create (2),
+.BR timer_getoverrun (2),
+.BR timer_settime (2),
+.BR time (7)
diff --git a/man2/timer_getoverrun.2 b/man2/timer_getoverrun.2
new file mode 100644
index 0000000..7521957
--- /dev/null
+++ b/man2/timer_getoverrun.2
@@ -0,0 +1,134 @@
+.\" Copyright (c) 2009 Linux Foundation, written by Michael Kerrisk
+.\" <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.TH timer_getoverrun 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+timer_getoverrun \- get overrun count for a POSIX per-process timer
+.SH LIBRARY
+Real-time library
+.RI ( librt ", " \-lrt )
+.SH SYNOPSIS
+.nf
+.B #include <time.h>
+.PP
+.BI "int timer_getoverrun(timer_t " timerid );
+.fi
+.PP
+.RS -4
+Feature Test Macro Requirements for glibc (see
+.BR feature_test_macros (7)):
+.RE
+.PP
+.BR timer_getoverrun ():
+.nf
+ _POSIX_C_SOURCE >= 199309L
+.fi
+.SH DESCRIPTION
+.BR timer_getoverrun ()
+returns the "overrun count" for the timer referred to by
+.IR timerid .
+An application can use the overrun count to accurately calculate the number
+of timer expirations that would have occurred over a given time interval.
+Timer overruns can occur both when receiving expiration notifications
+via signals
+.RB ( SIGEV_SIGNAL ),
+and via threads
+.RB ( SIGEV_THREAD ).
+.PP
+When expiration notifications are delivered via a signal,
+overruns can occur as follows.
+Regardless of whether or not a real-time signal is used for
+timer notifications,
+the system queues at most one signal per timer.
+(This is the behavior specified by POSIX.1.
+The alternative, queuing one signal for each timer expiration,
+could easily result in overflowing the allowed limits for
+queued signals on the system.)
+Because of system scheduling delays,
+or because the signal may be temporarily blocked,
+there can be a delay between the time when the notification
+signal is generated and the time when it
+is delivered (e.g., caught by a signal handler) or accepted (e.g., using
+.BR sigwaitinfo (2)).
+In this interval, further timer expirations may occur.
+The timer overrun count is the number of additional
+timer expirations that occurred between the time when the signal
+was generated and when it was delivered or accepted.
+.PP
+Timer overruns can also occur when expiration notifications
+are delivered via invocation of a thread,
+since there may be an arbitrary delay between an expiration of the timer
+and the invocation of the notification thread,
+and in that delay interval, additional timer expirations may occur.
+.SH RETURN VALUE
+On success,
+.BR timer_getoverrun ()
+returns the overrun count of the specified timer;
+this count may be 0 if no overruns have occurred.
+On failure, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EINVAL
+.I timerid
+is not a valid timer ID.
+.SH VERSIONS
+When timer notifications are delivered via signals
+.RB ( SIGEV_SIGNAL ),
+on Linux it is also possible to obtain the overrun count via the
+.I si_overrun
+field of the
+.I siginfo_t
+structure (see
+.BR sigaction (2)).
+This allows an application to avoid the overhead of making
+a system call to obtain the overrun count,
+but is a nonportable extension to POSIX.1.
+.PP
+POSIX.1 discusses timer overruns only in the context of
+timer notifications using signals.
+.\" FIXME . Austin bug filed, 11 Feb 09
+.\" https://www.austingroupbugs.net/view.php?id=95
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+Linux 2.6.
+POSIX.1-2001.
+.SH BUGS
+POSIX.1 specifies that if the timer overrun count
+is equal to or greater than an implementation-defined maximum,
+.BR DELAYTIMER_MAX ,
+then
+.BR timer_getoverrun ()
+should return
+.BR DELAYTIMER_MAX .
+However, before Linux 4.19,
+.\" http://bugzilla.kernel.org/show_bug.cgi?id=12665
+if the timer overrun value exceeds the maximum representable integer,
+the counter cycles, starting once more from low values.
+Since Linux 4.19,
+.\" commit 78c9c4dfbf8c04883941445a195276bb4bb92c76
+.BR timer_getoverrun ()
+returns
+.B DELAYTIMER_MAX
+(defined as
+.B INT_MAX
+in
+.IR <limits.h> )
+in this case (and the overrun value is reset to 0).
+.SH EXAMPLES
+See
+.BR timer_create (2).
+.SH SEE ALSO
+.BR clock_gettime (2),
+.BR sigaction (2),
+.BR signalfd (2),
+.BR sigwaitinfo (2),
+.BR timer_create (2),
+.BR timer_delete (2),
+.BR timer_settime (2),
+.BR signal (7),
+.BR time (7)
diff --git a/man2/timer_gettime.2 b/man2/timer_gettime.2
new file mode 100644
index 0000000..42015ca
--- /dev/null
+++ b/man2/timer_gettime.2
@@ -0,0 +1 @@
+.so man2/timer_settime.2
diff --git a/man2/timer_settime.2 b/man2/timer_settime.2
new file mode 100644
index 0000000..030bab5
--- /dev/null
+++ b/man2/timer_settime.2
@@ -0,0 +1,187 @@
+.\" Copyright (c) 2009 Linux Foundation, written by Michael Kerrisk
+.\" <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.TH timer_settime 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+timer_settime, timer_gettime \- arm/disarm and fetch
+state of POSIX per-process timer
+.SH LIBRARY
+Real-time library
+.RI ( librt ", " \-lrt )
+.SH SYNOPSIS
+.nf
+.B #include <time.h>
+.PP
+.BI "int timer_gettime(timer_t " timerid ", struct itimerspec *" curr_value );
+.BI "int timer_settime(timer_t " timerid ", int " flags ,
+.BI " const struct itimerspec *restrict " new_value ,
+.BI " struct itimerspec *_Nullable restrict " old_value );
+.fi
+.PP
+.RS -4
+Feature Test Macro Requirements for glibc (see
+.BR feature_test_macros (7)):
+.RE
+.PP
+.BR timer_settime (),
+.BR timer_gettime ():
+.nf
+ _POSIX_C_SOURCE >= 199309L
+.fi
+.SH DESCRIPTION
+.BR timer_settime ()
+arms or disarms the timer identified by
+.IR timerid .
+The
+.I new_value
+argument is a pointer to an
+.I itimerspec
+structure that specifies the new initial value and
+the new interval for the timer.
+The
+.I itimerspec
+structure is described in
+.BR itimerspec (3type).
+.PP
+Each of the substructures of the
+.I itimerspec
+structure is a
+.BR timespec (3)
+structure that allows a time value to be specified
+in seconds and nanoseconds.
+These time values are measured according to the clock
+that was specified when the timer was created by
+.BR timer_create (2).
+.PP
+If
+.I new_value\->it_value
+specifies a nonzero value (i.e., either subfield is nonzero), then
+.BR timer_settime ()
+arms (starts) the timer,
+setting it to initially expire at the given time.
+(If the timer was already armed,
+then the previous settings are overwritten.)
+If
+.I new_value\->it_value
+specifies a zero value
+(i.e., both subfields are zero),
+then the timer is disarmed.
+.PP
+The
+.I new_value\->it_interval
+field specifies the period of the timer, in seconds and nanoseconds.
+If this field is nonzero, then each time that an armed timer expires,
+the timer is reloaded from the value specified in
+.IR new_value\->it_interval .
+If
+.I new_value\->it_interval
+specifies a zero value,
+then the timer expires just once, at the time specified by
+.IR it_value .
+.PP
+By default, the initial expiration time specified in
+.I new_value\->it_value
+is interpreted relative to the current time on the timer's
+clock at the time of the call.
+This can be modified by specifying
+.B TIMER_ABSTIME
+in
+.IR flags ,
+in which case
+.I new_value\->it_value
+is interpreted as an absolute value as measured on the timer's clock;
+that is, the timer will expire when the clock value reaches the
+value specified by
+.IR new_value\->it_value .
+If the specified absolute time has already passed,
+then the timer expires immediately,
+and the overrun count (see
+.BR timer_getoverrun (2))
+will be set correctly.
+.\" By experiment: the overrun count is set correctly, for CLOCK_REALTIME.
+.PP
+If the value of the
+.B CLOCK_REALTIME
+clock is adjusted while an absolute timer based on that clock is armed,
+then the expiration of the timer will be appropriately adjusted.
+Adjustments to the
+.B CLOCK_REALTIME
+clock have no effect on relative timers based on that clock.
+.\" Similar remarks might apply with respect to process and thread CPU time
+.\" clocks, but these clocks are not currently (2.6.28) settable on Linux.
+.PP
+If
+.I old_value
+is not NULL, then it points to a buffer
+that is used to return the previous interval of the timer (in
+.IR old_value\->it_interval )
+and the amount of time until the timer
+would previously have next expired (in
+.IR old_value\->it_value ).
+.PP
+.BR timer_gettime ()
+returns the time until next expiration, and the interval,
+for the timer specified by
+.IR timerid ,
+in the buffer pointed to by
+.IR curr_value .
+The time remaining until the next timer expiration is returned in
+.IR curr_value\->it_value ;
+this is always a relative value, regardless of whether the
+.B TIMER_ABSTIME
+flag was used when arming the timer.
+If the value returned in
+.I curr_value\->it_value
+is zero, then the timer is currently disarmed.
+The timer interval is returned in
+.IR curr_value\->it_interval .
+If the value returned in
+.I curr_value\->it_interval
+is zero, then this is a "one-shot" timer.
+.SH RETURN VALUE
+On success,
+.BR timer_settime ()
+and
+.BR timer_gettime ()
+return 0.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+These functions may fail with the following errors:
+.TP
+.B EFAULT
+.IR new_value ,
+.IR old_value ,
+or
+.I curr_value
+is not a valid pointer.
+.TP
+.B EINVAL
+.I timerid
+is invalid.
+.\" FIXME . eventually: invalid value in flags
+.PP
+.BR timer_settime ()
+may fail with the following errors:
+.TP
+.B EINVAL
+.I new_value\->it_value
+is negative; or
+.I new_value\->it_value.tv_nsec
+is negative or greater than 999,999,999.
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+Linux 2.6.
+POSIX.1-2001.
+.SH EXAMPLES
+See
+.BR timer_create (2).
+.SH SEE ALSO
+.BR timer_create (2),
+.BR timer_getoverrun (2),
+.BR timespec (3),
+.BR time (7)
diff --git a/man2/timerfd_create.2 b/man2/timerfd_create.2
new file mode 100644
index 0000000..6ceea56
--- /dev/null
+++ b/man2/timerfd_create.2
@@ -0,0 +1,700 @@
+.\" Copyright (C) 2008 Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: GPL-2.0-or-later
+.\"
+.TH timerfd_create 2 2023-05-03 "Linux man-pages 6.05.01"
+.SH NAME
+timerfd_create, timerfd_settime, timerfd_gettime \-
+timers that notify via file descriptors
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/timerfd.h>
+.PP
+.BI "int timerfd_create(int " clockid ", int " flags );
+.PP
+.BI "int timerfd_settime(int " fd ", int " flags ,
+.BI " const struct itimerspec *" new_value ,
+.BI " struct itimerspec *_Nullable " old_value );
+.BI "int timerfd_gettime(int " fd ", struct itimerspec *" curr_value );
+.fi
+.SH DESCRIPTION
+These system calls create and operate on a timer
+that delivers timer expiration notifications via a file descriptor.
+They provide an alternative to the use of
+.BR setitimer (2)
+or
+.BR timer_create (2),
+with the advantage that the file descriptor may be monitored by
+.BR select (2),
+.BR poll (2),
+and
+.BR epoll (7).
+.PP
+The use of these three system calls is analogous to the use of
+.BR timer_create (2),
+.BR timer_settime (2),
+and
+.BR timer_gettime (2).
+(There is no analog of
+.BR timer_getoverrun (2),
+since that functionality is provided by
+.BR read (2),
+as described below.)
+.\"
+.SS timerfd_create()
+.BR timerfd_create ()
+creates a new timer object,
+and returns a file descriptor that refers to that timer.
+The
+.I clockid
+argument specifies the clock that is used to mark the progress
+of the timer, and must be one of the following:
+.TP
+.B CLOCK_REALTIME
+A settable system-wide real-time clock.
+.TP
+.B CLOCK_MONOTONIC
+A nonsettable monotonically increasing clock that measures time
+from some unspecified point in the past that does not change
+after system startup.
+.TP
+.BR CLOCK_BOOTTIME " (since Linux 3.15)"
+.\" commit 4a2378a943f09907fb1ae35c15de917f60289c14
+Like
+.BR CLOCK_MONOTONIC ,
+this is a monotonically increasing clock.
+However, whereas the
+.B CLOCK_MONOTONIC
+clock does not measure the time while a system is suspended, the
+.B CLOCK_BOOTTIME
+clock does include the time during which the system is suspended.
+This is useful for applications that need to be suspend-aware.
+.B CLOCK_REALTIME
+is not suitable for such applications, since that clock is affected
+by discontinuous changes to the system clock.
+.TP
+.BR CLOCK_REALTIME_ALARM " (since Linux 3.11)"
+.\" commit 11ffa9d6065f344a9bd769a2452f26f2f671e5f8
+This clock is like
+.BR CLOCK_REALTIME ,
+but will wake the system if it is suspended.
+The caller must have the
+.B CAP_WAKE_ALARM
+capability in order to set a timer against this clock.
+.TP
+.BR CLOCK_BOOTTIME_ALARM " (since Linux 3.11)"
+.\" commit 11ffa9d6065f344a9bd769a2452f26f2f671e5f8
+This clock is like
+.BR CLOCK_BOOTTIME ,
+but will wake the system if it is suspended.
+The caller must have the
+.B CAP_WAKE_ALARM
+capability in order to set a timer against this clock.
+.PP
+See
+.BR clock_getres (2)
+for some further details on the above clocks.
+.PP
+The current value of each of these clocks can be retrieved using
+.BR clock_gettime (2).
+.PP
+Starting with Linux 2.6.27, the following values may be bitwise ORed in
+.I flags
+to change the behavior of
+.BR timerfd_create ():
+.TP 14
+.B TFD_NONBLOCK
+Set the
+.B O_NONBLOCK
+file status flag on the open file description (see
+.BR open (2))
+referred to by the new file descriptor.
+Using this flag saves extra calls to
+.BR fcntl (2)
+to achieve the same result.
+.TP
+.B TFD_CLOEXEC
+Set the close-on-exec
+.RB ( FD_CLOEXEC )
+flag on the new file descriptor.
+See the description of the
+.B O_CLOEXEC
+flag in
+.BR open (2)
+for reasons why this may be useful.
+.PP
+In Linux versions up to and including 2.6.26,
+.I flags
+must be specified as zero.
+.SS timerfd_settime()
+.BR timerfd_settime ()
+arms (starts) or disarms (stops)
+the timer referred to by the file descriptor
+.IR fd .
+.PP
+The
+.I new_value
+argument specifies the initial expiration and interval for the timer.
+The
+.I itimerspec
+structure used for this argument is described in
+.BR itimerspec (3type).
+.PP
+.I new_value.it_value
+specifies the initial expiration of the timer,
+in seconds and nanoseconds.
+Setting either field of
+.I new_value.it_value
+to a nonzero value arms the timer.
+Setting both fields of
+.I new_value.it_value
+to zero disarms the timer.
+.PP
+Setting one or both fields of
+.I new_value.it_interval
+to nonzero values specifies the period, in seconds and nanoseconds,
+for repeated timer expirations after the initial expiration.
+If both fields of
+.I new_value.it_interval
+are zero, the timer expires just once, at the time specified by
+.IR new_value.it_value .
+.PP
+By default,
+the initial expiration time specified in
+.I new_value
+is interpreted relative to the current time
+on the timer's clock at the time of the call (i.e.,
+.I new_value.it_value
+specifies a time relative to the current value of the clock specified by
+.IR clockid ).
+An absolute timeout can be selected via the
+.I flags
+argument.
+.PP
+The
+.I flags
+argument is a bit mask that can include the following values:
+.TP
+.B TFD_TIMER_ABSTIME
+Interpret
+.I new_value.it_value
+as an absolute value on the timer's clock.
+The timer will expire when the value of the timer's
+clock reaches the value specified in
+.IR new_value.it_value .
+.TP
+.B TFD_TIMER_CANCEL_ON_SET
+If this flag is specified along with
+.B TFD_TIMER_ABSTIME
+and the clock for this timer is
+.B CLOCK_REALTIME
+or
+.BR CLOCK_REALTIME_ALARM ,
+then mark this timer as cancelable if the real-time clock
+undergoes a discontinuous change
+.RB ( settimeofday (2),
+.BR clock_settime (2),
+or similar).
+When such changes occur, a current or future
+.BR read (2)
+from the file descriptor will fail with the error
+.BR ECANCELED .
+.PP
+If the
+.I old_value
+argument is not NULL, then the
+.I itimerspec
+structure that it points to is used to return the setting of the timer
+that was current at the time of the call;
+see the description of
+.BR timerfd_gettime ()
+following.
+.\"
+.SS timerfd_gettime()
+.BR timerfd_gettime ()
+returns, in
+.IR curr_value ,
+an
+.I itimerspec
+structure that contains the current setting of the timer
+referred to by the file descriptor
+.IR fd .
+.PP
+The
+.I it_value
+field returns the amount of time
+until the timer will next expire.
+If both fields of this structure are zero,
+then the timer is currently disarmed.
+This field always contains a relative value, regardless of whether the
+.B TFD_TIMER_ABSTIME
+flag was specified when setting the timer.
+.PP
+The
+.I it_interval
+field returns the interval of the timer.
+If both fields of this structure are zero,
+then the timer is set to expire just once, at the time specified by
+.IR curr_value.it_value .
+.SS Operating on a timer file descriptor
+The file descriptor returned by
+.BR timerfd_create ()
+supports the following additional operations:
+.TP
+.BR read (2)
+If the timer has already expired one or more times since
+its settings were last modified using
+.BR timerfd_settime (),
+or since the last successful
+.BR read (2),
+then the buffer given to
+.BR read (2)
+returns an unsigned 8-byte integer
+.RI ( uint64_t )
+containing the number of expirations that have occurred.
+(The returned value is in host byte order\[em]that is,
+the native byte order for integers on the host machine.)
+.IP
+If no timer expirations have occurred at the time of the
+.BR read (2),
+then the call either blocks until the next timer expiration,
+or fails with the error
+.B EAGAIN
+if the file descriptor has been made nonblocking
+(via the use of the
+.BR fcntl (2)
+.B F_SETFL
+operation to set the
+.B O_NONBLOCK
+flag).
+.IP
+A
+.BR read (2)
+fails with the error
+.B EINVAL
+if the size of the supplied buffer is less than 8 bytes.
+.IP
+If the associated clock is either
+.B CLOCK_REALTIME
+or
+.BR CLOCK_REALTIME_ALARM ,
+the timer is absolute
+.RB ( TFD_TIMER_ABSTIME ),
+and the flag
+.B TFD_TIMER_CANCEL_ON_SET
+was specified when calling
+.BR timerfd_settime (),
+then
+.BR read (2)
+fails with the error
+.B ECANCELED
+if the real-time clock undergoes a discontinuous change.
+(This allows the reading application to discover
+such discontinuous changes to the clock.)
+.IP
+If the associated clock is either
+.B CLOCK_REALTIME
+or
+.BR CLOCK_REALTIME_ALARM ,
+the timer is absolute
+.RB ( TFD_TIMER_ABSTIME ),
+and the flag
+.B TFD_TIMER_CANCEL_ON_SET
+was
+.I not
+specified when calling
+.BR timerfd_settime (),
+then a discontinuous negative change to the clock (e.g.,
+.BR clock_settime (2))
+may cause
+.BR read (2)
+to unblock, but return a value of 0 (i.e., no bytes read),
+if the clock change occurs after the time expired,
+but before the
+.BR read (2)
+on the file descriptor.
+.TP
+.BR poll "(2), " select "(2) (and similar)"
+The file descriptor is readable
+(the
+.BR select (2)
+.I readfds
+argument; the
+.BR poll (2)
+.B POLLIN
+flag)
+if one or more timer expirations have occurred.
+.IP
+The file descriptor also supports the other file-descriptor
+multiplexing APIs:
+.BR pselect (2),
+.BR ppoll (2),
+and
+.BR epoll (7).
+.TP
+.BR ioctl (2)
+The following timerfd-specific command is supported:
+.RS
+.TP
+.BR TFD_IOC_SET_TICKS " (since Linux 3.17)"
+.\" commit 5442e9fbd7c23172a1c9bc736629cd123a9923f0
+Adjust the number of timer expirations that have occurred.
+The argument is a pointer to a nonzero 8-byte integer
+.RI ( uint64_t *)
+containing the new number of expirations.
+Once the number is set, any waiter on the timer is woken up.
+The only purpose of this command is to restore the expirations
+for the purpose of checkpoint/restore.
+This operation is available only if the kernel was configured with the
+.B CONFIG_CHECKPOINT_RESTORE
+option.
+.RE
+.TP
+.BR close (2)
+When the file descriptor is no longer required it should be closed.
+When all file descriptors associated with the same timer object
+have been closed,
+the timer is disarmed and its resources are freed by the kernel.
+.\"
+.SS fork(2) semantics
+After a
+.BR fork (2),
+the child inherits a copy of the file descriptor created by
+.BR timerfd_create ().
+The file descriptor refers to the same underlying
+timer object as the corresponding file descriptor in the parent,
+and
+.BR read (2)s
+in the child will return information about
+expirations of the timer.
+.\"
+.SS execve(2) semantics
+A file descriptor created by
+.BR timerfd_create ()
+is preserved across
+.BR execve (2),
+and continues to generate timer expirations if the timer was armed.
+.SH RETURN VALUE
+On success,
+.BR timerfd_create ()
+returns a new file descriptor.
+On error, \-1 is returned and
+.I errno
+is set to indicate the error.
+.PP
+.BR timerfd_settime ()
+and
+.BR timerfd_gettime ()
+return 0 on success;
+on error they return \-1, and set
+.I errno
+to indicate the error.
+.SH ERRORS
+.BR timerfd_create ()
+can fail with the following errors:
+.TP
+.B EINVAL
+The
+.I clockid
+is not valid.
+.TP
+.B EINVAL
+.I flags
+is invalid;
+or, in Linux 2.6.26 or earlier,
+.I flags
+is nonzero.
+.TP
+.B EMFILE
+The per-process limit on the number of open file descriptors has been reached.
+.TP
+.B ENFILE
+The system-wide limit on the total number of open files has been
+reached.
+.TP
+.B ENODEV
+Could not mount (internal) anonymous inode device.
+.TP
+.B ENOMEM
+There was insufficient kernel memory to create the timer.
+.TP
+.B EPERM
+.I clockid
+was
+.B CLOCK_REALTIME_ALARM
+or
+.B CLOCK_BOOTTIME_ALARM
+but the caller did not have the
+.B CAP_WAKE_ALARM
+capability.
+.PP
+.BR timerfd_settime ()
+and
+.BR timerfd_gettime ()
+can fail with the following errors:
+.TP
+.B EBADF
+.I fd
+is not a valid file descriptor.
+.TP
+.B EFAULT
+.IR new_value ,
+.IR old_value ,
+or
+.I curr_value
+is not a valid pointer.
+.TP
+.B EINVAL
+.I fd
+is not a valid timerfd file descriptor.
+.PP
+.BR timerfd_settime ()
+can also fail with the following errors:
+.TP
+.B ECANCELED
+See NOTES.
+.TP
+.B EINVAL
+.I new_value
+is not properly initialized (one of the
+.I tv_nsec
+fields falls outside the range zero to 999,999,999).
+.TP
+.B EINVAL
+.\" This case only checked since Linux 2.6.29, and Linux 2.6.2[78].some-stable-version.
+.\" In older kernel versions, no check was made for invalid flags.
+.I flags
+is invalid.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 2.6.25,
+glibc 2.8.
+.SH NOTES
+Suppose the following scenario for a
+.B CLOCK_REALTIME
+or
+.B CLOCK_REALTIME_ALARM
+timer that was created with
+.BR timerfd_create ():
+.IP (1) 5
+The timer has been started
+.RB ( timerfd_settime ())
+with the
+.B TFD_TIMER_ABSTIME
+and
+.B TFD_TIMER_CANCEL_ON_SET
+flags;
+.IP (2)
+A discontinuous change (e.g.,
+.BR settimeofday (2))
+is subsequently made to the
+.B CLOCK_REALTIME
+clock; and
+.IP (3)
+the caller once more calls
+.BR timerfd_settime ()
+to rearm the timer (without first doing a
+.BR read (2)
+on the file descriptor).
+.PP
+In this case the following occurs:
+.IP \[bu] 3
+The
+.BR timerfd_settime ()
+returns \-1 with
+.I errno
+set to
+.BR ECANCELED .
+(This enables the caller to know that the previous timer was affected
+by a discontinuous change to the clock.)
+.IP \[bu]
+The timer
+.I "is successfully rearmed"
+with the settings provided in the second
+.BR timerfd_settime ()
+call.
+(This was probably an implementation accident, but won't be fixed now,
+in case there are applications that depend on this behavior.)
+.SH BUGS
+Currently,
+.\" 2.6.29
+.BR timerfd_create ()
+supports fewer types of clock IDs than
+.BR timer_create (2).
+.SH EXAMPLES
+The following program creates a timer and then monitors its progress.
+The program accepts up to three command-line arguments.
+The first argument specifies the number of seconds for
+the initial expiration of the timer.
+The second argument specifies the interval for the timer, in seconds.
+The third argument specifies the number of times the program should
+allow the timer to expire before terminating.
+The second and third command-line arguments are optional.
+.PP
+The following shell session demonstrates the use of the program:
+.PP
+.in +4n
+.EX
+.RB "$" " a.out 3 1 100"
+0.000: timer started
+3.000: read: 1; total=1
+4.000: read: 1; total=2
+.BR "\[ha]Z " " # type control\-Z to suspend the program"
+[1]+ Stopped a.out 3 1 100
+.RB "$ " "fg" " # Resume execution after a few seconds"
+a.out 3 1 100
+9.660: read: 5; total=7
+10.000: read: 1; total=8
+11.000: read: 1; total=9
+.BR "\[ha]C " " # type control\-C to terminate the program"
+.EE
+.in
+.SS Program source
+\&
+.\" SRC BEGIN (timerfd_create.c)
+.EX
+.\" The commented out code here is what we currently need until
+.\" the required stuff is in glibc
+.\"
+.\"
+.\"/* Link with \-lrt */
+.\"#define _GNU_SOURCE
+.\"#include <sys/syscall.h>
+.\"#include <unistd.h>
+.\"#include <time.h>
+.\"#if defined(__i386__)
+.\"#define __NR_timerfd_create 322
+.\"#define __NR_timerfd_settime 325
+.\"#define __NR_timerfd_gettime 326
+.\"#endif
+.\"
+.\"static int
+.\"timerfd_create(int clockid, int flags)
+.\"{
+.\" return syscall(__NR_timerfd_create, clockid, flags);
+.\"}
+.\"
+.\"static int
+.\"timerfd_settime(int fd, int flags, struct itimerspec *new_value,
+.\" struct itimerspec *curr_value)
+.\"{
+.\" return syscall(__NR_timerfd_settime, fd, flags, new_value,
+.\" curr_value);
+.\"}
+.\"
+.\"static int
+.\"timerfd_gettime(int fd, struct itimerspec *curr_value)
+.\"{
+.\" return syscall(__NR_timerfd_gettime, fd, curr_value);
+.\"}
+.\"
+.\"#define TFD_TIMER_ABSTIME (1 << 0)
+.\"
+.\"////////////////////////////////////////////////////////////
+#include <err.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/timerfd.h>
+#include <time.h>
+#include <unistd.h>
+\&
+static void
+print_elapsed_time(void)
+{
+ int secs, nsecs;
+ static int first_call = 1;
+ struct timespec curr;
+ static struct timespec start;
+\&
+ if (first_call) {
+ first_call = 0;
+ if (clock_gettime(CLOCK_MONOTONIC, &start) == \-1)
+ err(EXIT_FAILURE, "clock_gettime");
+ }
+\&
+ if (clock_gettime(CLOCK_MONOTONIC, &curr) == \-1)
+ err(EXIT_FAILURE, "clock_gettime");
+\&
+ secs = curr.tv_sec \- start.tv_sec;
+ nsecs = curr.tv_nsec \- start.tv_nsec;
+ if (nsecs < 0) {
+ secs\-\-;
+ nsecs += 1000000000;
+ }
+ printf("%d.%03d: ", secs, (nsecs + 500000) / 1000000);
+}
+\&
+int
+main(int argc, char *argv[])
+{
+ int fd;
+ ssize_t s;
+ uint64_t exp, tot_exp, max_exp;
+ struct timespec now;
+ struct itimerspec new_value;
+\&
+ if (argc != 2 && argc != 4) {
+ fprintf(stderr, "%s init\-secs [interval\-secs max\-exp]\en",
+ argv[0]);
+ exit(EXIT_FAILURE);
+ }
+\&
+ if (clock_gettime(CLOCK_REALTIME, &now) == \-1)
+ err(EXIT_FAILURE, "clock_gettime");
+\&
+ /* Create a CLOCK_REALTIME absolute timer with initial
+ expiration and interval as specified in command line. */
+\&
+ new_value.it_value.tv_sec = now.tv_sec + atoi(argv[1]);
+ new_value.it_value.tv_nsec = now.tv_nsec;
+ if (argc == 2) {
+ new_value.it_interval.tv_sec = 0;
+ max_exp = 1;
+ } else {
+ new_value.it_interval.tv_sec = atoi(argv[2]);
+ max_exp = atoi(argv[3]);
+ }
+ new_value.it_interval.tv_nsec = 0;
+\&
+ fd = timerfd_create(CLOCK_REALTIME, 0);
+ if (fd == \-1)
+ err(EXIT_FAILURE, "timerfd_create");
+\&
+ if (timerfd_settime(fd, TFD_TIMER_ABSTIME, &new_value, NULL) == \-1)
+ err(EXIT_FAILURE, "timerfd_settime");
+\&
+ print_elapsed_time();
+ printf("timer started\en");
+\&
+ for (tot_exp = 0; tot_exp < max_exp;) {
+ s = read(fd, &exp, sizeof(uint64_t));
+ if (s != sizeof(uint64_t))
+ err(EXIT_FAILURE, "read");
+\&
+ tot_exp += exp;
+ print_elapsed_time();
+ printf("read: %" PRIu64 "; total=%" PRIu64 "\en", exp, tot_exp);
+ }
+\&
+ exit(EXIT_SUCCESS);
+}
+.EE
+.\" SRC END
+.SH SEE ALSO
+.BR eventfd (2),
+.BR poll (2),
+.BR read (2),
+.BR select (2),
+.BR setitimer (2),
+.BR signalfd (2),
+.BR timer_create (2),
+.BR timer_gettime (2),
+.BR timer_settime (2),
+.BR timespec (3),
+.BR epoll (7),
+.BR time (7)
diff --git a/man2/timerfd_gettime.2 b/man2/timerfd_gettime.2
new file mode 100644
index 0000000..6d12940
--- /dev/null
+++ b/man2/timerfd_gettime.2
@@ -0,0 +1 @@
+.so man2/timerfd_create.2
diff --git a/man2/timerfd_settime.2 b/man2/timerfd_settime.2
new file mode 100644
index 0000000..6d12940
--- /dev/null
+++ b/man2/timerfd_settime.2
@@ -0,0 +1 @@
+.so man2/timerfd_create.2
diff --git a/man2/times.2 b/man2/times.2
new file mode 100644
index 0000000..1d85010
--- /dev/null
+++ b/man2/times.2
@@ -0,0 +1,222 @@
+.\" Copyright (c) 1992 Drew Eckhardt (drew@cs.colorado.edu), March 28, 1992
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified by Michael Haardt (michael@moria.de)
+.\" Modified Sat Jul 24 14:29:17 1993 by Rik Faith (faith@cs.unc.edu)
+.\" Modified 961203 and 001211 and 010326 by aeb@cwi.nl
+.\" Modified 001213 by Michael Haardt (michael@moria.de)
+.\" Modified 13 Jun 02, Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Added note on nonstandard behavior when SIGCHLD is ignored.
+.\" Modified 2004-11-16, mtk, Noted that the nonconformance when
+.\" SIGCHLD is being ignored is fixed in Linux 2.6.9; other minor changes
+.\" Modified 2004-12-08, mtk, in Linux 2.6 times() return value changed
+.\" 2005-04-13, mtk
+.\" Added notes on nonstandard behavior: Linux allows 'buf' to
+.\" be NULL, but POSIX.1 doesn't specify this and it's nonportable.
+.\"
+.TH times 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+times \- get process times
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/times.h>
+.PP
+.BI "clock_t times(struct tms *" buf );
+.fi
+.SH DESCRIPTION
+.BR times ()
+stores the current process times in the
+.I "struct tms"
+that
+.I buf
+points to.
+The
+.I struct tms
+is as defined in
+.IR <sys/times.h> :
+.PP
+.in +4n
+.EX
+struct tms {
+ clock_t tms_utime; /* user time */
+ clock_t tms_stime; /* system time */
+ clock_t tms_cutime; /* user time of children */
+ clock_t tms_cstime; /* system time of children */
+};
+.EE
+.in
+.PP
+The
+.I tms_utime
+field contains the CPU time spent executing instructions
+of the calling process.
+The
+.I tms_stime
+field contains the CPU time spent executing inside the kernel
+while performing tasks on behalf of the calling process.
+.PP
+The
+.I tms_cutime
+field contains the sum of the
+.I tms_utime
+and
+.I tms_cutime
+values for all waited-for terminated children.
+The
+.I tms_cstime
+field contains the sum of the
+.I tms_stime
+and
+.I tms_cstime
+values for all waited-for terminated children.
+.PP
+Times for terminated children (and their descendants)
+are added in at the moment
+.BR wait (2)
+or
+.BR waitpid (2)
+returns their process ID.
+In particular,
+times of grandchildren
+that the children did not wait for are never seen.
+.PP
+All times reported are in clock ticks.
+.SH RETURN VALUE
+.BR times ()
+returns the number of clock ticks that have elapsed since
+an arbitrary point in the past.
+The return value may overflow the possible range of type
+.IR clock_t .
+On error,
+\fI(clock_t)\ \-1\fP is returned,
+and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EFAULT
+.I buf
+points outside the process's address space.
+.SH VERSIONS
+On Linux,
+the
+.I buf
+argument can be specified as NULL,
+with the result that
+.BR times ()
+just returns a function result.
+However,
+POSIX does not specify this behavior,
+and most
+other UNIX implementations require a non-NULL value for
+.IR buf .
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+POSIX.1-2001,
+SVr4,
+4.3BSD.
+.PP
+In POSIX.1-1996 the symbol \fBCLK_TCK\fP (defined in
+.IR <time.h> )
+is mentioned as obsolescent.
+It is obsolete now.
+.PP
+Before Linux 2.6.9,
+if the disposition of
+.B SIGCHLD
+is set to
+.BR SIG_IGN ,
+then the times of terminated children
+are automatically included in the
+.I tms_cstime
+and
+.I tms_cutime
+fields,
+although POSIX.1-2001 says that this should happen
+only if the calling process
+.BR wait (2)s
+on its children.
+This nonconformance is rectified in Linux 2.6.9 and later.
+.\" See the description of times() in XSH, which says:
+.\" The times of a terminated child process are included... when wait()
+.\" or waitpid() returns the process ID of this terminated child.
+.PP
+On Linux,
+the \[lq]arbitrary point in the past\[rq]
+from which the return value of
+.BR times ()
+is measured has varied across kernel versions.
+On Linux 2.4 and earlier,
+this point is the moment the system was booted.
+Since Linux 2.6,
+this point is \fI(2\[ha]32/HZ) \- 300\fP
+seconds before system boot time.
+This variability across kernel versions (and across UNIX implementations),
+combined with the fact that the returned value may overflow the range of
+.IR clock_t ,
+means that a portable application would be wise to avoid using this value.
+To measure changes in elapsed time,
+use
+.BR clock_gettime (2)
+instead.
+.\" .PP
+.\" On older systems the number of clock ticks per second is given
+.\" by the variable HZ.
+.PP
+SVr1-3 returns
+.I long
+and the struct members are of type
+.I time_t
+although they store clock ticks,
+not seconds since the Epoch.
+V7 used
+.I long
+for the struct members,
+because it had no type
+.I time_t
+yet.
+.SH NOTES
+The number of clock ticks per second can be obtained using:
+.PP
+.in +4n
+.EX
+sysconf(_SC_CLK_TCK);
+.EE
+.in
+.PP
+Note that
+.BR clock (3)
+also returns a value of type
+.IR clock_t ,
+but this value is measured in units of
+.BR CLOCKS_PER_SEC ,
+not the clock ticks used by
+.BR times ().
+.SH BUGS
+A limitation of the Linux system call conventions on some architectures
+(notably i386) means that on Linux 2.6 there is a small time window
+(41 seconds) soon after boot when
+.BR times ()
+can return \-1,
+falsely indicating that an error occurred.
+The same problem can occur when the return value wraps past
+the maximum value that can be stored in
+.BR clock_t .
+.\" The problem is that a syscall return of -4095 to -1
+.\" is interpreted by glibc as an error, and the wrapper converts
+.\" the return value to -1.
+.\" http://marc.info/?l=linux-kernel&m=119447727031225&w=2
+.\" "compat_sys_times() bogus until jiffies >= 0"
+.\" November 2007
+.SH SEE ALSO
+.BR time (1),
+.BR getrusage (2),
+.BR wait (2),
+.BR clock (3),
+.BR sysconf (3),
+.BR time (7)
diff --git a/man2/tkill.2 b/man2/tkill.2
new file mode 100644
index 0000000..8780e8a
--- /dev/null
+++ b/man2/tkill.2
@@ -0,0 +1,130 @@
+.\" Copyright (C) 2008 Michael Kerrisk <mtk.manpages@gmail.com>
+.\" and Copyright 2003 Abhijit Menon-Sen <ams@wiw.org>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" 2004-05-31, added tgkill, ahu, aeb
+.\" 2008-01-15 mtk -- rewrote DESCRIPTION
+.\"
+.TH tkill 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+tkill, tgkill \- send a signal to a thread
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#include <signal.h>" " /* Definition of " SIG* " constants */"
+.BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */"
+.B #include <unistd.h>
+.PP
+.BI "[[deprecated]] int syscall(SYS_tkill, pid_t " tid ", int " sig );
+.PP
+.B #include <signal.h>
+.PP
+.BI "int tgkill(pid_t " tgid ", pid_t " tid ", int " sig );
+.fi
+.PP
+.IR Note :
+glibc provides no wrapper for
+.BR tkill (),
+necessitating the use of
+.BR syscall (2).
+.SH DESCRIPTION
+.BR tgkill ()
+sends the signal
+.I sig
+to the thread with the thread ID
+.I tid
+in the thread group
+.IR tgid .
+(By contrast,
+.BR kill (2)
+can be used to send a signal only to a process (i.e., thread group)
+as a whole, and the signal will be delivered to an arbitrary
+thread within that process.)
+.PP
+.BR tkill ()
+is an obsolete predecessor to
+.BR tgkill ().
+It allows only the target thread ID to be specified,
+which may result in the wrong thread being signaled if a thread
+terminates and its thread ID is recycled.
+Avoid using this system call.
+.\" FIXME Maybe say something about the following:
+.\" http://sourceware.org/bugzilla/show_bug.cgi?id=12889
+.\"
+.\" Quoting Rich Felker <bugdal@aerifal.cx>:
+.\"
+.\" There is a race condition in pthread_kill: it is possible that,
+.\" between the time pthread_kill reads the pid/tid from the target
+.\" thread descriptor and the time it makes the tgkill syscall,
+.\" the target thread terminates and the same tid gets assigned
+.\" to a new thread in the same process.
+.\"
+.\" (The tgkill syscall was designed to eliminate a similar race
+.\" condition in tkill, but it only succeeded in eliminating races
+.\" where the tid gets reused in a different process, and does not
+.\" help if the same tid gets assigned to a new thread in the
+.\" same process.)
+.\"
+.\" The only solution I can see is to introduce a mutex that ensures
+.\" that a thread cannot exit while pthread_kill is being called on it.
+.\"
+.\" Note that in most real-world situations, like almost all race
+.\" conditions, this one will be extremely rare. To make it
+.\" measurable, one could exhaust all but 1-2 available pid values,
+.\" possibly by lowering the max pid parameter in /proc, forcing
+.\" the same tid to be reused rapidly.
+.PP
+These are the raw system call interfaces, meant for internal
+thread library use.
+.SH RETURN VALUE
+On success, zero is returned.
+On error, \-1 is returned, and \fIerrno\fP
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EAGAIN
+The
+.B RLIMIT_SIGPENDING
+resource limit was reached and
+.I sig
+is a real-time signal.
+.TP
+.B EAGAIN
+Insufficient kernel memory was available and
+.I sig
+is a real-time signal.
+.TP
+.B EINVAL
+An invalid thread ID, thread group ID, or signal was specified.
+.TP
+.B EPERM
+Permission denied.
+For the required permissions, see
+.BR kill (2).
+.TP
+.B ESRCH
+No process with the specified thread ID (and thread group ID) exists.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+.TP
+.BR tkill ()
+Linux 2.4.19 / 2.5.4.
+.TP
+.BR tgkill ()
+Linux 2.5.75,
+glibc 2.30.
+.SH NOTES
+See the description of
+.B CLONE_THREAD
+in
+.BR clone (2)
+for an explanation of thread groups.
+.SH SEE ALSO
+.BR clone (2),
+.BR gettid (2),
+.BR kill (2),
+.BR rt_sigqueueinfo (2)
diff --git a/man2/truncate.2 b/man2/truncate.2
new file mode 100644
index 0000000..02a12e5
--- /dev/null
+++ b/man2/truncate.2
@@ -0,0 +1,251 @@
+.\" Copyright (c) 1983, 1991 The Regents of the University of California.
+.\" All rights reserved.
+.\"
+.\" SPDX-License-Identifier: BSD-4-Clause-UC
+.\"
+.\" @(#)truncate.2 6.9 (Berkeley) 3/10/91
+.\"
+.\" Modified 1993-07-24 by Rik Faith <faith@cs.unc.edu>
+.\" Modified 1996-10-22 by Eric S. Raymond <esr@thyrsus.com>
+.\" Modified 1998-12-21 by Andries Brouwer <aeb@cwi.nl>
+.\" Modified 2002-01-07 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Modified 2002-04-06 by Andries Brouwer <aeb@cwi.nl>
+.\" Modified 2004-06-23 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.TH truncate 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+truncate, ftruncate \- truncate a file to a specified length
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <unistd.h>
+.PP
+.BI "int truncate(const char *" path ", off_t " length );
+.BI "int ftruncate(int " fd ", off_t " length );
+.fi
+.PP
+.RS -4
+Feature Test Macro Requirements for glibc (see
+.BR feature_test_macros (7)):
+.RE
+.PP
+.BR truncate ():
+.nf
+ _XOPEN_SOURCE >= 500
+.\" || _XOPEN_SOURCE && _XOPEN_SOURCE_EXTENDED
+ || /* Since glibc 2.12: */ _POSIX_C_SOURCE >= 200809L
+ || /* glibc <= 2.19: */ _BSD_SOURCE
+.fi
+.PP
+.BR ftruncate ():
+.nf
+ _XOPEN_SOURCE >= 500
+.\" || _XOPEN_SOURCE && _XOPEN_SOURCE_EXTENDED
+ || /* Since glibc 2.3.5: */ _POSIX_C_SOURCE >= 200112L
+ || /* glibc <= 2.19: */ _BSD_SOURCE
+.fi
+.SH DESCRIPTION
+The
+.BR truncate ()
+and
+.BR ftruncate ()
+functions cause the regular file named by
+.I path
+or referenced by
+.I fd
+to be truncated to a size of precisely
+.I length
+bytes.
+.PP
+If the file previously was larger than this size, the extra data is lost.
+If the file previously was shorter, it is extended, and
+the extended part reads as null bytes (\[aq]\e0\[aq]).
+.PP
+The file offset is not changed.
+.PP
+If the size changed, then the st_ctime and st_mtime fields
+(respectively, time of last status change and
+time of last modification; see
+.BR inode (7))
+for the file are updated,
+and the set-user-ID and set-group-ID mode bits may be cleared.
+.PP
+With
+.BR ftruncate (),
+the file must be open for writing; with
+.BR truncate (),
+the file must be writable.
+.SH RETURN VALUE
+On success, zero is returned.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+For
+.BR truncate ():
+.TP
+.B EACCES
+Search permission is denied for a component of the path prefix,
+or the named file is not writable by the user.
+(See also
+.BR path_resolution (7).)
+.TP
+.B EFAULT
+The argument
+.I path
+points outside the process's allocated address space.
+.TP
+.B EFBIG
+The argument
+.I length
+is larger than the maximum file size. (XSI)
+.TP
+.B EINTR
+While blocked waiting to complete,
+the call was interrupted by a signal handler; see
+.BR fcntl (2)
+and
+.BR signal (7).
+.TP
+.B EINVAL
+The argument
+.I length
+is negative or larger than the maximum file size.
+.TP
+.B EIO
+An I/O error occurred updating the inode.
+.TP
+.B EISDIR
+The named file is a directory.
+.TP
+.B ELOOP
+Too many symbolic links were encountered in translating the pathname.
+.TP
+.B ENAMETOOLONG
+A component of a pathname exceeded 255 characters,
+or an entire pathname exceeded 1023 characters.
+.TP
+.B ENOENT
+The named file does not exist.
+.TP
+.B ENOTDIR
+A component of the path prefix is not a directory.
+.TP
+.B EPERM
+.\" This happens for at least MSDOS and VFAT filesystems
+.\" on kernel 2.6.13
+The underlying filesystem does not support extending
+a file beyond its current size.
+.TP
+.B EPERM
+The operation was prevented by a file seal; see
+.BR fcntl (2).
+.TP
+.B EROFS
+The named file resides on a read-only filesystem.
+.TP
+.B ETXTBSY
+The file is an executable file that is being executed.
+.PP
+For
+.BR ftruncate ()
+the same errors apply, but instead of things that can be wrong with
+.IR path ,
+we now have things that can be wrong with the file descriptor,
+.IR fd :
+.TP
+.B EBADF
+.I fd
+is not a valid file descriptor.
+.TP
+.BR EBADF " or " EINVAL
+.I fd
+is not open for writing.
+.TP
+.B EINVAL
+.I fd
+does not reference a regular file or a POSIX shared memory object.
+.TP
+.BR EINVAL " or " EBADF
+The file descriptor
+.I fd
+is not open for writing.
+POSIX permits, and portable applications should handle,
+either error for this case.
+(Linux produces
+.BR EINVAL .)
+.SH VERSIONS
+The details in DESCRIPTION are for XSI-compliant systems.
+For non-XSI-compliant systems, the POSIX standard allows
+two behaviors for
+.BR ftruncate ()
+when
+.I length
+exceeds the file length
+(note that
+.BR truncate ()
+is not specified at all in such an environment):
+either returning an error, or extending the file.
+Like most UNIX implementations, Linux follows the XSI requirement
+when dealing with native filesystems.
+However, some nonnative filesystems do not permit
+.BR truncate ()
+and
+.BR ftruncate ()
+to be used to extend a file beyond its current length:
+a notable example on Linux is VFAT.
+.\" At the very least: OSF/1, Solaris 7, and FreeBSD conform, mtk, Jan 2002
+.PP
+On some 32-bit architectures,
+the calling signature for these system calls differs,
+for the reasons described in
+.BR syscall (2).
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+POSIX.1-2001,
+4.4BSD, SVr4 (first appeared in 4.2BSD).
+.\" POSIX.1-1996 has
+.\" .BR ftruncate ().
+.\" POSIX.1-2001 also has
+.\" .BR truncate (),
+.\" as an XSI extension.
+.\" .LP
+.\" SVr4 documents additional
+.\" .BR truncate ()
+.\" error conditions EMFILE, EMULTIHP, ENFILE, ENOLINK. SVr4 documents for
+.\" .BR ftruncate ()
+.\" an additional EAGAIN error condition.
+.PP
+The original Linux
+.BR truncate ()
+and
+.BR ftruncate ()
+system calls were not designed to handle large file offsets.
+Consequently, Linux 2.4 added
+.BR truncate64 ()
+and
+.BR ftruncate64 ()
+system calls that handle large files.
+However, these details can be ignored by applications using glibc, whose
+wrapper functions transparently employ the more recent system calls
+where they are available.
+.SH NOTES
+.BR ftruncate ()
+can also be used to set the size of a POSIX shared memory object; see
+.BR shm_open (3).
+.SH BUGS
+A header file bug in glibc 2.12 meant that the minimum value of
+.\" http://sourceware.org/bugzilla/show_bug.cgi?id=12037
+.B _POSIX_C_SOURCE
+required to expose the declaration of
+.BR ftruncate ()
+was 200809L instead of 200112L.
+This has been fixed in later glibc versions.
+.SH SEE ALSO
+.BR truncate (1),
+.BR open (2),
+.BR stat (2),
+.BR path_resolution (7)
diff --git a/man2/truncate64.2 b/man2/truncate64.2
new file mode 100644
index 0000000..2ed34f1
--- /dev/null
+++ b/man2/truncate64.2
@@ -0,0 +1 @@
+.so man2/truncate.2
diff --git a/man2/tuxcall.2 b/man2/tuxcall.2
new file mode 100644
index 0000000..5d25ea6
--- /dev/null
+++ b/man2/tuxcall.2
@@ -0,0 +1 @@
+.so man2/unimplemented.2
diff --git a/man2/ugetrlimit.2 b/man2/ugetrlimit.2
new file mode 100644
index 0000000..df6d736
--- /dev/null
+++ b/man2/ugetrlimit.2
@@ -0,0 +1 @@
+.so man2/getrlimit.2
diff --git a/man2/umask.2 b/man2/umask.2
new file mode 100644
index 0000000..c920b55
--- /dev/null
+++ b/man2/umask.2
@@ -0,0 +1,149 @@
+.\" Copyright (c) 2006, 2008, Michael Kerrisk <mtk.manpages@gmail.com>
+.\" (A few fragments remain from an earlier (1992) version written in
+.\" 1992 by Drew Eckhardt <drew@cs.colorado.edu>.)
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified by Michael Haardt <michael@moria.de>
+.\" Modified Sat Jul 24 12:51:53 1993 by Rik Faith <faith@cs.unc.edu>
+.\" Modified Tue Oct 22 22:39:04 1996 by Eric S. Raymond <esr@thyrsus.com>
+.\" Modified Thu May 1 06:05:54 UTC 1997 by Nicolás Lichtmaier
+.\" <nick@debian.com> with Lars Wirzenius <liw@iki.fi> suggestion
+.\" 2006-05-13, mtk, substantial rewrite of description of 'mask'
+.\" 2008-01-09, mtk, a few rewrites and additions.
+.TH umask 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+umask \- set file mode creation mask
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/stat.h>
+.PP
+.BI "mode_t umask(mode_t " mask );
+.fi
+.SH DESCRIPTION
+.BR umask ()
+sets the calling process's file mode creation mask (umask) to
+.I mask
+& 0777 (i.e., only the file permission bits of
+.I mask
+are used), and returns the previous value of the mask.
+.PP
+The umask is used by
+.BR open (2),
+.BR mkdir (2),
+and other system calls that create files
+.\" e.g., mkfifo(), creat(), mknod(), sem_open(), mq_open(), shm_open()
+.\" but NOT the System V IPC *get() calls
+to modify the permissions placed on newly created files or directories.
+Specifically, permissions in the umask are turned off from
+the
+.I mode
+argument to
+.BR open (2)
+and
+.BR mkdir (2).
+.PP
+Alternatively, if the parent directory has a default ACL (see
+.BR acl (5)),
+the umask is ignored, the default ACL is inherited,
+the permission bits are set based on the inherited ACL,
+and permission bits absent in the
+.I mode
+argument are turned off.
+For example, the following default ACL is equivalent to a umask of 022:
+.PP
+.in +4n
+.EX
+u::rwx,g::r-x,o::r-x
+.EE
+.in
+.PP
+Combining the effect of this default ACL with a
+.I mode
+argument of 0666 (rw-rw-rw-), the resulting file permissions would be 0644
+(rw-r--r--).
+.PP
+The constants that should be used to specify
+.I mask
+are described in
+.BR inode (7).
+.PP
+The typical default value for the process umask is
+.BR S_IWGRP " | " S_IWOTH
+(octal 022).
+In the usual case where the
+.I mode
+argument to
+.BR open (2)
+is specified as:
+.PP
+.in +4n
+.EX
+.BR S_IRUSR " | " S_IWUSR " | " S_IRGRP " | " S_IWGRP " | " S_IROTH " | " S_IWOTH
+.EE
+.in
+.PP
+(octal 0666) when creating a new file, the permissions on the
+resulting file will be:
+.PP
+.in +4n
+.EX
+.BR S_IRUSR " | " S_IWUSR " | " S_IRGRP " | " S_IROTH
+.EE
+.in
+.PP
+(because 0666 & \[ti]022 = 0644; i.e., rw\-r\-\-r\-\-).
+.SH RETURN VALUE
+This system call always succeeds and the previous value of the mask
+is returned.
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+POSIX.1-2001, SVr4, 4.3BSD.
+.SH NOTES
+A child process created via
+.BR fork (2)
+inherits its parent's umask.
+The umask is left unchanged by
+.BR execve (2).
+.PP
+It is impossible to use
+.BR umask ()
+to fetch a process's umask without at the same time changing it.
+A second call to
+.BR umask ()
+would then be needed to restore the umask.
+The nonatomicity of these two steps provides the potential
+for races in multithreaded programs.
+.PP
+Since Linux 4.7, the umask of any process can be viewed via the
+.I Umask
+field of
+.IR /proc/ pid /status .
+Inspecting this field in
+.I /proc/self/status
+allows a process to retrieve its umask without at the same time changing it.
+.PP
+The umask setting also affects the permissions assigned to POSIX IPC objects
+.RB ( mq_open (3),
+.BR sem_open (3),
+.BR shm_open (3)),
+FIFOs
+.RB ( mkfifo (3)),
+and UNIX domain sockets
+.RB ( unix (7))
+created by the process.
+The umask does not affect the permissions assigned
+to System\ V IPC objects created by the process (using
+.BR msgget (2),
+.BR semget (2),
+.BR shmget (2)).
+.SH SEE ALSO
+.BR chmod (2),
+.BR mkdir (2),
+.BR open (2),
+.BR stat (2),
+.BR acl (5)
diff --git a/man2/umount.2 b/man2/umount.2
new file mode 100644
index 0000000..cba0abc
--- /dev/null
+++ b/man2/umount.2
@@ -0,0 +1,214 @@
+.\" Copyright (C) 1993 Rickard E. Faith <faith@cs.unc.edu>
+.\" and Copyright (C) 1994 Andries E. Brouwer <aeb@cwi.nl>
+.\" and Copyright (C) 2002, 2005 Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" 2008-10-06, mtk: Created this as a new page by splitting
+.\" umount/umount2 material out of mount.2
+.\"
+.TH umount 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+umount, umount2 \- unmount filesystem
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B "#include <sys/mount.h>"
+.PP
+.BI "int umount(const char *" target );
+.BI "int umount2(const char *" target ", int " flags );
+.fi
+.SH DESCRIPTION
+.BR umount ()
+and
+.BR umount2 ()
+remove the attachment of the (topmost) filesystem mounted on
+.IR target .
+.\" Note: the kernel naming differs from the glibc naming
+.\" umount2 is the glibc name for what the kernel now calls umount
+.\" and umount is the glibc name for oldumount
+.PP
+Appropriate privilege (Linux: the
+.B CAP_SYS_ADMIN
+capability) is required to unmount filesystems.
+.PP
+Linux 2.1.116 added the
+.BR umount2 ()
+system call, which, like
+.BR umount (),
+unmounts a target, but allows additional
+.I flags
+controlling the behavior of the operation:
+.TP
+.BR MNT_FORCE " (since Linux 2.1.116)"
+Ask the filesystem to abort pending requests before attempting the
+unmount.
+This may allow the unmount to complete without waiting
+for an inaccessible server, but could cause data loss.
+If, after aborting requests,
+some processes still have active references to the filesystem,
+the unmount will still fail.
+As at Linux 4.12,
+.B MNT_FORCE
+is supported only on the following filesystems:
+9p (since Linux 2.6.16),
+ceph (since Linux 2.6.34),
+cifs (since Linux 2.6.12),
+fuse (since Linux 2.6.16),
+lustre (since Linux 3.11),
+and NFS (since Linux 2.1.116).
+.TP
+.BR MNT_DETACH " (since Linux 2.4.11)"
+Perform a lazy unmount: make the mount unavailable for new
+accesses, immediately disconnect the filesystem and all filesystems
+mounted below it from each other and from the mount table, and
+actually perform the unmount when the mount ceases to be busy.
+.TP
+.BR MNT_EXPIRE " (since Linux 2.6.8)"
+Mark the mount as expired.
+If a mount is not currently in use, then an initial call to
+.BR umount2 ()
+with this flag fails with the error
+.BR EAGAIN ,
+but marks the mount as expired.
+The mount remains expired as long as it isn't accessed
+by any process.
+A second
+.BR umount2 ()
+call specifying
+.B MNT_EXPIRE
+unmounts an expired mount.
+This flag cannot be specified with either
+.B MNT_FORCE
+or
+.BR MNT_DETACH .
+.TP
+.BR UMOUNT_NOFOLLOW " (since Linux 2.6.34)"
+.\" Later added to Linux 2.6.33-stable
+Don't dereference
+.I target
+if it is a symbolic link.
+This flag allows security problems to be avoided in set-user-ID-\fIroot\fP
+programs that allow unprivileged users to unmount filesystems.
+.SH RETURN VALUE
+On success, zero is returned.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+The error values given below result from filesystem type independent
+errors.
+Each filesystem type may have its own special errors and its
+own special behavior.
+See the Linux kernel source code for details.
+.TP
+.B EAGAIN
+A call to
+.BR umount2 ()
+specifying
+.B MNT_EXPIRE
+successfully marked an unbusy filesystem as expired.
+.TP
+.B EBUSY
+.I target
+could not be unmounted because it is busy.
+.TP
+.B EFAULT
+.I target
+points outside the user address space.
+.TP
+.B EINVAL
+.I target
+is not a mount point.
+.TP
+.B EINVAL
+.I target
+is locked; see
+.BR mount_namespaces (7).
+.TP
+.B EINVAL
+.BR umount2 ()
+was called with
+.B MNT_EXPIRE
+and either
+.B MNT_DETACH
+or
+.BR MNT_FORCE .
+.TP
+.BR EINVAL " (since Linux 2.6.34)"
+.BR umount2 ()
+was called with an invalid flag value in
+.IR flags .
+.TP
+.B ENAMETOOLONG
+A pathname was longer than
+.BR MAXPATHLEN .
+.TP
+.B ENOENT
+A pathname was empty or had a nonexistent component.
+.TP
+.B ENOMEM
+The kernel could not allocate a free page to copy filenames or data into.
+.TP
+.B EPERM
+The caller does not have the required privileges.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+.B MNT_DETACH
+and
+.B MNT_EXPIRE
+.\" http://sourceware.org/bugzilla/show_bug.cgi?id=10092
+are available since glibc 2.11.
+.PP
+The original
+.BR umount ()
+function was called as \fIumount(device)\fP and would return
+.B ENOTBLK
+when called with something other than a block device.
+In Linux 0.98p4, a call \fIumount(dir)\fP was added, in order to
+support anonymous devices.
+In Linux 2.3.99-pre7, the call \fIumount(device)\fP was removed,
+leaving only \fIumount(dir)\fP (since now devices can be mounted
+in more than one place, so specifying the device does not suffice).
+.SH NOTES
+.SS umount() and shared mounts
+Shared mounts cause any mount activity on a mount, including
+.BR umount ()
+operations, to be forwarded to every shared mount in the
+peer group and every slave mount of that peer group.
+This means that
+.BR umount ()
+of any peer in a set of shared mounts will cause all of its
+peers to be unmounted and all of their slaves to be unmounted as well.
+.PP
+This propagation of unmount activity can be particularly surprising
+on systems where every mount is shared by default.
+On such systems,
+recursively bind mounting the root directory of the filesystem
+onto a subdirectory and then later unmounting that subdirectory with
+.B MNT_DETACH
+will cause every mount in the mount namespace to be lazily unmounted.
+.PP
+To ensure
+.BR umount ()
+does not propagate in this fashion,
+the mount may be remounted using a
+.BR mount (2)
+call with a
+.I mount_flags
+argument that includes both
+.B MS_REC
+and
+.B MS_PRIVATE
+prior to
+.BR umount ()
+being called.
+.SH SEE ALSO
+.BR mount (2),
+.BR mount_namespaces (7),
+.BR path_resolution (7),
+.BR mount (8),
+.BR umount (8)
diff --git a/man2/umount2.2 b/man2/umount2.2
new file mode 100644
index 0000000..84ea419
--- /dev/null
+++ b/man2/umount2.2
@@ -0,0 +1 @@
+.so man2/umount.2
diff --git a/man2/uname.2 b/man2/uname.2
new file mode 100644
index 0000000..e84f3e7
--- /dev/null
+++ b/man2/uname.2
@@ -0,0 +1,134 @@
+.\" Copyright (C) 2001 Andries Brouwer <aeb@cwi.nl>.
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" 2007-07-05 mtk: Added details on underlying system call interfaces
+.\"
+.TH uname 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+uname \- get name and information about current kernel
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/utsname.h>
+.PP
+.BI "int uname(struct utsname *" buf );
+.fi
+.SH DESCRIPTION
+.BR uname ()
+returns system information in the structure pointed to by
+.IR buf .
+The
+.I utsname
+struct is defined in
+.IR <sys/utsname.h> :
+.PP
+.in +4n
+.EX
+struct utsname {
+ char sysname[]; /* Operating system name (e.g., "Linux") */
+ char nodename[]; /* Name within communications network
+ to which the node is attached, if any */
+ char release[]; /* Operating system release
+ (e.g., "2.6.28") */
+ char version[]; /* Operating system version */
+ char machine[]; /* Hardware type identifier */
+#ifdef _GNU_SOURCE
+ char domainname[]; /* NIS or YP domain name */
+#endif
+};
+.EE
+.in
+.PP
+The length of the arrays in a
+.I struct utsname
+is unspecified (see NOTES);
+the fields are terminated by a null byte (\[aq]\e0\[aq]).
+.SH RETURN VALUE
+On success, zero is returned.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EFAULT
+.I buf
+is not valid.
+.SH VERSIONS
+The
+.I domainname
+member (the NIS or YP domain name) is a GNU extension.
+.PP
+The length of the fields in the struct varies.
+Some operating systems
+or libraries use a hardcoded 9 or 33 or 65 or 257.
+Other systems use
+.B SYS_NMLN
+or
+.B _SYS_NMLN
+or
+.B UTSLEN
+or
+.BR _UTSNAME_LENGTH .
+Clearly, it is a bad
+idea to use any of these constants; just use sizeof(...).
+SVr4 uses 257, "to support Internet hostnames"
+\[em] this is the largest value likely to be encountered in the wild.
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+POSIX.1-2001, SVr4, 4.4BSD.
+.SS C library/kernel differences
+Over time, increases in the size of the
+.I utsname
+structure have led to three successive versions of
+.BR uname ():
+.IR sys_olduname ()
+(slot
+.IR __NR_oldolduname ),
+.IR sys_uname ()
+(slot
+.IR __NR_olduname ),
+and
+.IR sys_newuname ()
+(slot
+.IR __NR_uname ).
+The first one
+.\" That was back before Linux 1.0
+used length 9 for all fields;
+the second
+.\" That was also back before Linux 1.0
+used 65;
+the third also uses 65 but adds the
+.I domainname
+field.
+The glibc
+.BR uname ()
+wrapper function hides these details from applications,
+invoking the most recent version of the system call provided by the kernel.
+.SH NOTES
+The kernel has the name, release, version, and supported machine type built in.
+Conversely, the
+.I nodename
+field is configured by the administrator to match the network
+(this is what BSD historically calls the "hostname",
+and is set via
+.BR sethostname (2)).
+Similarly, the
+.I domainname
+field is set via
+.BR setdomainname (2).
+.PP
+Part of the utsname information is also accessible via
+.IR /proc/sys/kernel/ { ostype ,
+.IR hostname ,
+.IR osrelease ,
+.IR version ,
+.IR domainname }.
+.SH SEE ALSO
+.BR uname (1),
+.BR getdomainname (2),
+.BR gethostname (2),
+.BR uts_namespaces (7)
diff --git a/man2/unimplemented.2 b/man2/unimplemented.2
new file mode 100644
index 0000000..535d3e9
--- /dev/null
+++ b/man2/unimplemented.2
@@ -0,0 +1,48 @@
+.\" Copyright 1995 Michael Chastain (mec@shell.portal.com), 15 April 1995.
+.\"
+.\" SPDX-License-Identifier: GPL-2.0-or-later
+.\"
+.\" Updated, aeb, 980612
+.\"
+.TH UNIMPLEMENTED 2 2022-10-09 "Linux man-pages 6.05.01"
+.SH NAME
+afs_syscall, break, fattach, fdetach, ftime, getmsg, getpmsg, gtty, isastream,
+lock, madvise1, mpx, prof, profil, putmsg, putpmsg, security,
+stty, tuxcall, ulimit, vserver \- unimplemented system calls
+.SH SYNOPSIS
+.nf
+Unimplemented system calls.
+.fi
+.SH DESCRIPTION
+These system calls are not implemented in the Linux kernel.
+.SH RETURN VALUE
+These system calls always return \-1 and set
+.I errno
+to
+.BR ENOSYS .
+.SH NOTES
+Note that
+.BR ftime (3),
+.BR profil (3),
+and
+.BR ulimit (3)
+are implemented as library functions.
+.PP
+Some system calls, like
+.BR alloc_hugepages (2),
+.BR free_hugepages (2),
+.BR ioperm (2),
+.BR iopl (2),
+and
+.BR vm86 (2)
+exist only on certain architectures.
+.PP
+Some system calls, like
+.BR ipc (2),
+.BR create_module (2),
+.BR init_module (2),
+and
+.BR delete_module (2)
+exist only when the Linux kernel was built with support for them.
+.SH SEE ALSO
+.BR syscalls (2)
diff --git a/man2/unlink.2 b/man2/unlink.2
new file mode 100644
index 0000000..85cb670
--- /dev/null
+++ b/man2/unlink.2
@@ -0,0 +1,298 @@
+.\" This manpage is Copyright (C) 1992 Drew Eckhardt;
+.\" and Copyright (C) 1993 Ian Jackson
+.\" and Copyright (C) 2006, 2014 Michael Kerrisk.
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified 1993-07-24 by Rik Faith <faith@cs.unc.edu>
+.\" Modified 1996-09-08 by Arnt Gulbrandsen <agulbra@troll.no>
+.\" Modified 1997-01-31 by Eric S. Raymond <esr@thyrsus.com>
+.\" Modified 2001-05-17 by aeb
+.\" Modified 2004-06-23 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.TH unlink 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+unlink, unlinkat \- delete a name and possibly the file it refers to
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <unistd.h>
+.PP
+.BI "int unlink(const char *" pathname );
+.PP
+.BR "#include <fcntl.h> " "/* Definition of " AT_* " constants */"
+.B #include <unistd.h>
+.PP
+.BI "int unlinkat(int " dirfd ", const char *" pathname ", int " flags );
+.fi
+.PP
+.RS -4
+Feature Test Macro Requirements for glibc (see
+.BR feature_test_macros (7)):
+.RE
+.PP
+.BR unlinkat ():
+.nf
+ Since glibc 2.10:
+ _POSIX_C_SOURCE >= 200809L
+ Before glibc 2.10:
+ _ATFILE_SOURCE
+.fi
+.SH DESCRIPTION
+.BR unlink ()
+deletes a name from the filesystem.
+If that name was the
+last link to a file and no processes have the file open, the file is
+deleted and the space it was using is made available for reuse.
+.PP
+If the name was the last link to a file but any processes still have
+the file open, the file will remain in existence until the last file
+descriptor referring to it is closed.
+.PP
+If the name referred to a symbolic link, the link is removed.
+.PP
+If the name referred to a socket, FIFO, or device, the name for it is
+removed but processes which have the object open may continue to use
+it.
+.SS unlinkat()
+The
+.BR unlinkat ()
+system call operates in exactly the same way as either
+.BR unlink ()
+or
+.BR rmdir (2)
+(depending on whether or not
+.I flags
+includes the
+.B AT_REMOVEDIR
+flag)
+except for the differences described here.
+.PP
+If the pathname given in
+.I pathname
+is relative, then it is interpreted relative to the directory
+referred to by the file descriptor
+.I dirfd
+(rather than relative to the current working directory of
+the calling process, as is done by
+.BR unlink ()
+and
+.BR rmdir (2)
+for a relative pathname).
+.PP
+If the pathname given in
+.I pathname
+is relative and
+.I dirfd
+is the special value
+.BR AT_FDCWD ,
+then
+.I pathname
+is interpreted relative to the current working
+directory of the calling process (like
+.BR unlink ()
+and
+.BR rmdir (2)).
+.PP
+If the pathname given in
+.I pathname
+is absolute, then
+.I dirfd
+is ignored.
+.PP
+.I flags
+is a bit mask that can either be specified as 0, or by ORing
+together flag values that control the operation of
+.BR unlinkat ().
+Currently, only one such flag is defined:
+.TP
+.B AT_REMOVEDIR
+By default,
+.BR unlinkat ()
+performs the equivalent of
+.BR unlink ()
+on
+.IR pathname .
+If the
+.B AT_REMOVEDIR
+flag is specified, then it
+performs the equivalent of
+.BR rmdir (2)
+on
+.IR pathname .
+.PP
+See
+.BR openat (2)
+for an explanation of the need for
+.BR unlinkat ().
+.SH RETURN VALUE
+On success, zero is returned.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EACCES
+Write access to the directory containing
+.I pathname
+is not allowed for the process's effective UID, or one of the
+directories in
+.I pathname
+did not allow search permission.
+(See also
+.BR path_resolution (7).)
+.TP
+.B EBUSY
+The file
+.I pathname
+cannot be unlinked because it is being used by the system
+or another process;
+for example, it is a mount point
+or the NFS client software created it to represent an
+active but otherwise nameless inode ("NFS silly renamed").
+.TP
+.B EFAULT
+.I pathname
+points outside your accessible address space.
+.TP
+.B EIO
+An I/O error occurred.
+.TP
+.B EISDIR
+.I pathname
+refers to a directory.
+(This is the non-POSIX value returned since Linux 2.1.132.)
+.TP
+.B ELOOP
+Too many symbolic links were encountered in translating
+.IR pathname .
+.TP
+.B ENAMETOOLONG
+.IR pathname " was too long."
+.TP
+.B ENOENT
+A component in
+.I pathname
+does not exist or is a dangling symbolic link, or
+.I pathname
+is empty.
+.TP
+.B ENOMEM
+Insufficient kernel memory was available.
+.TP
+.B ENOTDIR
+A component used as a directory in
+.I pathname
+is not, in fact, a directory.
+.TP
+.B EPERM
+The system does not allow unlinking of directories,
+or unlinking of directories requires privileges that the
+calling process doesn't have.
+(This is the POSIX prescribed error return;
+as noted above, Linux returns
+.B EISDIR
+for this case.)
+.TP
+.BR EPERM " (Linux only)"
+The filesystem does not allow unlinking of files.
+.TP
+.BR EPERM " or " EACCES
+The directory containing
+.I pathname
+has the sticky bit
+.RB ( S_ISVTX )
+set and the process's effective UID is neither the UID of the file to
+be deleted nor that of the directory containing it, and
+the process is not privileged (Linux: does not have the
+.B CAP_FOWNER
+capability).
+.TP
+.B EPERM
+The file to be unlinked is marked immutable or append-only.
+(See
+.BR ioctl_iflags (2).)
+.TP
+.B EROFS
+.I pathname
+refers to a file on a read-only filesystem.
+.PP
+The same errors that occur for
+.BR unlink ()
+and
+.BR rmdir (2)
+can also occur for
+.BR unlinkat ().
+The following additional errors can occur for
+.BR unlinkat ():
+.TP
+.B EBADF
+.I pathname
+is relative but
+.I dirfd
+is neither
+.B AT_FDCWD
+nor a valid file descriptor.
+.TP
+.B EINVAL
+An invalid flag value was specified in
+.IR flags .
+.TP
+.B EISDIR
+.I pathname
+refers to a directory, and
+.B AT_REMOVEDIR
+was not specified in
+.IR flags .
+.TP
+.B ENOTDIR
+.I pathname
+is relative and
+.I dirfd
+is a file descriptor referring to a file other than a directory.
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+.TP
+.BR unlink ()
+SVr4, 4.3BSD, POSIX.1-2001.
+.\" SVr4 documents additional error
+.\" conditions EINTR, EMULTIHOP, ETXTBSY, ENOLINK.
+.TP
+.BR unlinkat ()
+POSIX.1-2008.
+Linux 2.6.16,
+glibc 2.4.
+.SS glibc
+On older kernels where
+.BR unlinkat ()
+is unavailable, the glibc wrapper function falls back to the use of
+.BR unlink ()
+or
+.BR rmdir (2).
+When
+.I pathname
+is a relative pathname,
+glibc constructs a pathname based on the symbolic link in
+.I /proc/self/fd
+that corresponds to the
+.I dirfd
+argument.
+.SH BUGS
+Infelicities in the protocol underlying NFS can cause the unexpected
+disappearance of files which are still being used.
+.SH SEE ALSO
+.BR rm (1),
+.BR unlink (1),
+.BR chmod (2),
+.BR link (2),
+.BR mknod (2),
+.BR open (2),
+.BR rename (2),
+.BR rmdir (2),
+.BR mkfifo (3),
+.BR remove (3),
+.BR path_resolution (7),
+.BR symlink (7)
diff --git a/man2/unlinkat.2 b/man2/unlinkat.2
new file mode 100644
index 0000000..4921f73
--- /dev/null
+++ b/man2/unlinkat.2
@@ -0,0 +1 @@
+.so man2/unlink.2
diff --git a/man2/unshare.2 b/man2/unshare.2
new file mode 100644
index 0000000..b12afb5
--- /dev/null
+++ b/man2/unshare.2
@@ -0,0 +1,572 @@
+.\" Copyright (C) 2006, Janak Desai <janak@us.ibm.com>
+.\" and Copyright (C) 2006, 2012 Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: GPL-1.0-or-later
+.\"
+.\" Patch Justification:
+.\" unshare system call is needed to implement, using PAM,
+.\" per-security_context and/or per-user namespace to provide
+.\" polyinstantiated directories. Using unshare and bind mounts, a
+.\" PAM module can create private namespace with appropriate
+.\" directories(based on user's security context) bind mounted on
+.\" public directories such as /tmp, thus providing an instance of
+.\" /tmp that is based on user's security context. Without the
+.\" unshare system call, namespace separation can only be achieved
+.\" by clone, which would require porting and maintaining all commands
+.\" such as login, and su, that establish a user session.
+.\"
+.TH unshare 2 2023-05-26 "Linux man-pages 6.05.01"
+.SH NAME
+unshare \- disassociate parts of the process execution context
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #define _GNU_SOURCE
+.B #include <sched.h>
+.PP
+.BI "int unshare(int " flags );
+.fi
+.SH DESCRIPTION
+.BR unshare ()
+allows a process (or thread) to disassociate parts of its execution
+context that are currently being shared with other processes (or threads).
+Part of the execution context, such as the mount namespace, is shared
+implicitly when a new process is created using
+.BR fork (2)
+or
+.BR vfork (2),
+while other parts, such as virtual memory, may be
+shared by explicit request when creating a process or thread using
+.BR clone (2).
+.PP
+The main use of
+.BR unshare ()
+is to allow a process to control its
+shared execution context without creating a new process.
+.PP
+The
+.I flags
+argument is a bit mask that specifies which parts of
+the execution context should be unshared.
+This argument is specified by ORing together zero or more
+of the following constants:
+.TP
+.B CLONE_FILES
+Reverse the effect of the
+.BR clone (2)
+.B CLONE_FILES
+flag.
+Unshare the file descriptor table, so that the calling process
+no longer shares its file descriptors with any other process.
+.TP
+.B CLONE_FS
+Reverse the effect of the
+.BR clone (2)
+.B CLONE_FS
+flag.
+Unshare filesystem attributes, so that the calling process
+no longer shares its root directory
+.RB ( chroot (2)),
+current directory
+.RB ( chdir (2)),
+or umask
+.RB ( umask (2))
+attributes with any other process.
+.TP
+.BR CLONE_NEWCGROUP " (since Linux 4.6)"
+This flag has the same effect as the
+.BR clone (2)
+.B CLONE_NEWCGROUP
+flag.
+Unshare the cgroup namespace.
+Use of
+.B CLONE_NEWCGROUP
+requires the
+.B CAP_SYS_ADMIN
+capability.
+.TP
+.BR CLONE_NEWIPC " (since Linux 2.6.19)"
+This flag has the same effect as the
+.BR clone (2)
+.B CLONE_NEWIPC
+flag.
+Unshare the IPC namespace,
+so that the calling process has a private copy of the
+IPC namespace which is not shared with any other process.
+Specifying this flag automatically implies
+.B CLONE_SYSVSEM
+as well.
+Use of
+.B CLONE_NEWIPC
+requires the
+.B CAP_SYS_ADMIN
+capability.
+.TP
+.BR CLONE_NEWNET " (since Linux 2.6.24)"
+This flag has the same effect as the
+.BR clone (2)
+.B CLONE_NEWNET
+flag.
+Unshare the network namespace,
+so that the calling process is moved into a
+new network namespace which is not shared
+with any previously existing process.
+Use of
+.B CLONE_NEWNET
+requires the
+.B CAP_SYS_ADMIN
+capability.
+.TP
+.B CLONE_NEWNS
+.\" These flag names are inconsistent:
+.\" CLONE_NEWNS does the same thing in clone(), but CLONE_VM,
+.\" CLONE_FS, and CLONE_FILES reverse the action of the clone()
+.\" flags of the same name.
+This flag has the same effect as the
+.BR clone (2)
+.B CLONE_NEWNS
+flag.
+Unshare the mount namespace,
+so that the calling process has a private copy of
+its namespace which is not shared with any other process.
+Specifying this flag automatically implies
+.B CLONE_FS
+as well.
+Use of
+.B CLONE_NEWNS
+requires the
+.B CAP_SYS_ADMIN
+capability.
+For further information, see
+.BR mount_namespaces (7).
+.TP
+.BR CLONE_NEWPID " (since Linux 3.8)"
+This flag has the same effect as the
+.BR clone (2)
+.B CLONE_NEWPID
+flag.
+Unshare the PID namespace,
+so that the calling process has a new PID namespace for its children
+which is not shared with any previously existing process.
+The calling process is
+.I not
+moved into the new namespace.
+The first child created by the calling process will have
+the process ID 1 and will assume the role of
+.BR init (1)
+in the new namespace.
+.B CLONE_NEWPID
+automatically implies
+.B CLONE_THREAD
+as well.
+Use of
+.B CLONE_NEWPID
+requires the
+.B CAP_SYS_ADMIN
+capability.
+For further information, see
+.BR pid_namespaces (7).
+.TP
+.BR CLONE_NEWTIME " (since Linux 5.6)"
+Unshare the time namespace,
+so that the calling process has a new time namespace for its children
+which is not shared with any previously existing process.
+The calling process is
+.I not
+moved into the new namespace.
+Use of
+.B CLONE_NEWTIME
+requires the
+.B CAP_SYS_ADMIN
+capability.
+For further information, see
+.BR time_namespaces (7).
+.TP
+.BR CLONE_NEWUSER " (since Linux 3.8)"
+This flag has the same effect as the
+.BR clone (2)
+.B CLONE_NEWUSER
+flag.
+Unshare the user namespace,
+so that the calling process is moved into a new user namespace
+which is not shared with any previously existing process.
+As with the child process created by
+.BR clone (2)
+with the
+.B CLONE_NEWUSER
+flag, the caller obtains a full set of capabilities in the new namespace.
+.IP
+.B CLONE_NEWUSER
+requires that the calling process is not threaded; specifying
+.B CLONE_NEWUSER
+automatically implies
+.BR CLONE_THREAD .
+Since Linux 3.9,
+.\" commit e66eded8309ebf679d3d3c1f5820d1f2ca332c71
+.\" https://lwn.net/Articles/543273/
+.B CLONE_NEWUSER
+also automatically implies
+.BR CLONE_FS .
+.B CLONE_NEWUSER
+requires that the user ID and group ID
+of the calling process are mapped to user IDs and group IDs in the
+user namespace of the calling process at the time of the call.
+.IP
+For further information on user namespaces, see
+.BR user_namespaces (7).
+.TP
+.BR CLONE_NEWUTS " (since Linux 2.6.19)"
+This flag has the same effect as the
+.BR clone (2)
+.B CLONE_NEWUTS
+flag.
+Unshare the UTS namespace,
+so that the calling process has a private copy of the
+UTS namespace which is not shared with any other process.
+Use of
+.B CLONE_NEWUTS
+requires the
+.B CAP_SYS_ADMIN
+capability.
+.TP
+.BR CLONE_SYSVSEM " (since Linux 2.6.26)"
+.\" commit 9edff4ab1f8d82675277a04e359d0ed8bf14a7b7
+This flag reverses the effect of the
+.BR clone (2)
+.B CLONE_SYSVSEM
+flag.
+Unshare System\ V semaphore adjustment
+.RI ( semadj )
+values,
+so that the calling process has a new empty
+.I semadj
+list that is not shared with any other process.
+If this is the last process that has a reference to the process's current
+.I semadj
+list, then the adjustments in that list are applied
+to the corresponding semaphores, as described in
+.BR semop (2).
+.\" CLONE_NEWNS If CLONE_SIGHAND is set and signals are also being shared
+.\" (i.e., current->signal->count > 1), force CLONE_THREAD.
+.PP
+In addition,
+.BR CLONE_THREAD ,
+.BR CLONE_SIGHAND ,
+and
+.B CLONE_VM
+can be specified in
+.I flags
+if the caller is single threaded (i.e., it is not sharing
+its address space with another process or thread).
+In this case, these flags have no effect.
+(Note also that specifying
+.B CLONE_THREAD
+automatically implies
+.BR CLONE_VM ,
+and specifying
+.B CLONE_VM
+automatically implies
+.BR CLONE_SIGHAND .)
+.\" As at 3.9, the following forced implications also apply,
+.\" although the relevant flags are not yet implemented.
+.\" If CLONE_THREAD is set force CLONE_VM.
+.\" If CLONE_VM is set, force CLONE_SIGHAND.
+.\"
+If the process is multithreaded, then
+the use of these flags results in an error.
+.\" See kernel/fork.c::check_unshare_flags()
+.PP
+If
+.I flags
+is specified as zero, then
+.BR unshare ()
+is a no-op;
+no changes are made to the calling process's execution context.
+.SH RETURN VALUE
+On success, zero is returned.
+On failure, \-1 is returned and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EINVAL
+An invalid bit was specified in
+.IR flags .
+.TP
+.B EINVAL
+.BR CLONE_THREAD ,
+.BR CLONE_SIGHAND ,
+or
+.B CLONE_VM
+was specified in
+.IR flags ,
+and the caller is multithreaded.
+.TP
+.B EINVAL
+.B CLONE_NEWIPC
+was specified in
+.IR flags ,
+but the kernel was not configured with the
+.B CONFIG_SYSVIPC
+and
+.B CONFIG_IPC_NS
+options.
+.TP
+.B EINVAL
+.B CLONE_NEWNET
+was specified in
+.IR flags ,
+but the kernel was not configured with the
+.B CONFIG_NET_NS
+option.
+.TP
+.B EINVAL
+.B CLONE_NEWPID
+was specified in
+.IR flags ,
+but the kernel was not configured with the
+.B CONFIG_PID_NS
+option.
+.TP
+.B EINVAL
+.B CLONE_NEWUSER
+was specified in
+.IR flags ,
+but the kernel was not configured with the
+.B CONFIG_USER_NS
+option.
+.TP
+.B EINVAL
+.B CLONE_NEWUTS
+was specified in
+.IR flags ,
+but the kernel was not configured with the
+.B CONFIG_UTS_NS
+option.
+.TP
+.B EINVAL
+.B CLONE_NEWPID
+was specified in
+.IR flags ,
+but the process has previously called
+.BR unshare ()
+with the
+.B CLONE_NEWPID
+flag.
+.TP
+.B ENOMEM
+Cannot allocate sufficient memory to copy parts of caller's
+context that need to be unshared.
+.TP
+.BR ENOSPC " (since Linux 3.7)"
+.\" commit f2302505775fd13ba93f034206f1e2a587017929
+.B CLONE_NEWPID
+was specified in \fIflags\fP,
+but the limit on the nesting depth of PID namespaces
+would have been exceeded; see
+.BR pid_namespaces (7).
+.TP
+.BR ENOSPC " (since Linux 4.9; beforehand " EUSERS )
+.B CLONE_NEWUSER
+was specified in
+.IR flags ,
+and the call would cause the limit on the number of
+nested user namespaces to be exceeded.
+See
+.BR user_namespaces (7).
+.IP
+From Linux 3.11 to Linux 4.8, the error diagnosed in this case was
+.BR EUSERS .
+.TP
+.BR ENOSPC " (since Linux 4.9)"
+One of the values in
+.I flags
+specified the creation of a new user namespace,
+but doing so would have caused the limit defined by the corresponding file in
+.I /proc/sys/user
+to be exceeded.
+For further details, see
+.BR namespaces (7).
+.TP
+.B EPERM
+The calling process did not have the required privileges for this operation.
+.TP
+.B EPERM
+.B CLONE_NEWUSER
+was specified in
+.IR flags ,
+but either the effective user ID or the effective group ID of the caller
+does not have a mapping in the parent namespace (see
+.BR user_namespaces (7)).
+.TP
+.BR EPERM " (since Linux 3.9)"
+.\" commit 3151527ee007b73a0ebd296010f1c0454a919c7d
+.B CLONE_NEWUSER
+was specified in
+.I flags
+and the caller is in a chroot environment
+.\" FIXME What is the rationale for this restriction?
+(i.e., the caller's root directory does not match the root directory
+of the mount namespace in which it resides).
+.TP
+.BR EUSERS " (from Linux 3.11 to Linux 4.8)"
+.B CLONE_NEWUSER
+was specified in
+.IR flags ,
+and the limit on the number of nested user namespaces would be exceeded.
+See the discussion of the
+.B ENOSPC
+error above.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 2.6.16.
+.SH NOTES
+Not all of the process attributes that can be shared when
+a new process is created using
+.BR clone (2)
+can be unshared using
+.BR unshare ().
+In particular, as at kernel 3.8,
+.\" FIXME all of the following needs to be reviewed for the current kernel
+.BR unshare ()
+does not implement flags that reverse the effects of
+.BR CLONE_SIGHAND ,
+.\" However, we can do unshare(CLONE_SIGHAND) if CLONE_SIGHAND
+.\" was not specified when doing clone(); i.e., unsharing
+.\" signal handlers is permitted if we are not actually
+.\" sharing signal handlers. mtk
+.BR CLONE_THREAD ,
+or
+.BR CLONE_VM .
+.\" However, we can do unshare(CLONE_VM) if CLONE_VM
+.\" was not specified when doing clone(); i.e., unsharing
+.\" virtual memory is permitted if we are not actually
+.\" sharing virtual memory. mtk
+Such functionality may be added in the future, if required.
+.\"
+.\"9) Future Work
+.\"--------------
+.\"The current implementation of unshare does not allow unsharing of
+.\"signals and signal handlers. Signals are complex to begin with and
+.\"to unshare signals and/or signal handlers of a currently running
+.\"process is even more complex. If in the future there is a specific
+.\"need to allow unsharing of signals and/or signal handlers, it can
+.\"be incrementally added to unshare without affecting legacy
+.\"applications using unshare.
+.\"
+.PP
+Creating all kinds of namespace, except user namespaces, requires the
+.B CAP_SYS_ADMIN
+capability.
+However, since creating a user namespace automatically confers a full set of
+capabilities,
+creating both a user namespace and any other type of namespace in the same
+.BR unshare ()
+call does not require the
+.B CAP_SYS_ADMIN
+capability in the original namespace.
+.SH EXAMPLES
+The program below provides a simple implementation of the
+.BR unshare (1)
+command, which unshares one or more namespaces and executes the
+command supplied in its command-line arguments.
+Here's an example of the use of this program,
+running a shell in a new mount namespace,
+and verifying that the original shell and the
+new shell are in separate mount namespaces:
+.PP
+.in +4n
+.EX
+$ \fBreadlink /proc/$$/ns/mnt\fP
+mnt:[4026531840]
+$ \fBsudo ./unshare \-m /bin/bash\fP
+# \fBreadlink /proc/$$/ns/mnt\fP
+mnt:[4026532325]
+.EE
+.in
+.PP
+The differing output of the two
+.BR readlink (1)
+commands shows that the two shells are in different mount namespaces.
+.SS Program source
+\&
+.\" SRC BEGIN (unshare.c)
+.EX
+/* unshare.c
+\&
+ A simple implementation of the unshare(1) command: unshare
+ namespaces and execute a command.
+*/
+#define _GNU_SOURCE
+#include <err.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+\&
+static void
+usage(char *pname)
+{
+ fprintf(stderr, "Usage: %s [options] program [arg...]\en", pname);
+ fprintf(stderr, "Options can be:\en");
+ fprintf(stderr, " \-C unshare cgroup namespace\en");
+ fprintf(stderr, " \-i unshare IPC namespace\en");
+ fprintf(stderr, " \-m unshare mount namespace\en");
+ fprintf(stderr, " \-n unshare network namespace\en");
+ fprintf(stderr, " \-p unshare PID namespace\en");
+ fprintf(stderr, " \-t unshare time namespace\en");
+ fprintf(stderr, " \-u unshare UTS namespace\en");
+ fprintf(stderr, " \-U unshare user namespace\en");
+ exit(EXIT_FAILURE);
+}
+\&
+int
+main(int argc, char *argv[])
+{
+ int flags, opt;
+\&
+ flags = 0;
+\&
+ while ((opt = getopt(argc, argv, "CimnptuU")) != \-1) {
+ switch (opt) {
+ case \[aq]C\[aq]: flags |= CLONE_NEWCGROUP; break;
+ case \[aq]i\[aq]: flags |= CLONE_NEWIPC; break;
+ case \[aq]m\[aq]: flags |= CLONE_NEWNS; break;
+ case \[aq]n\[aq]: flags |= CLONE_NEWNET; break;
+ case \[aq]p\[aq]: flags |= CLONE_NEWPID; break;
+ case \[aq]t\[aq]: flags |= CLONE_NEWTIME; break;
+ case \[aq]u\[aq]: flags |= CLONE_NEWUTS; break;
+ case \[aq]U\[aq]: flags |= CLONE_NEWUSER; break;
+ default: usage(argv[0]);
+ }
+ }
+\&
+ if (optind >= argc)
+ usage(argv[0]);
+\&
+ if (unshare(flags) == \-1)
+ err(EXIT_FAILURE, "unshare");
+\&
+ execvp(argv[optind], &argv[optind]);
+ err(EXIT_FAILURE, "execvp");
+}
+.EE
+.\" SRC END
+.SH SEE ALSO
+.BR unshare (1),
+.BR clone (2),
+.BR fork (2),
+.BR kcmp (2),
+.BR setns (2),
+.BR vfork (2),
+.BR namespaces (7)
+.PP
+.I Documentation/userspace\-api/unshare.rst
+in the Linux kernel source tree
+.\" commit f504d47be5e8fa7ecf2bf660b18b42e6960c0eb2
+(or
+.I Documentation/unshare.txt
+before Linux 4.12)
diff --git a/man2/uselib.2 b/man2/uselib.2
new file mode 100644
index 0000000..1d6a072
--- /dev/null
+++ b/man2/uselib.2
@@ -0,0 +1,106 @@
+.\" Copyright (c) 1992 Drew Eckhardt (drew@cs.colorado.edu), March 28, 1992
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified by Michael Haardt <michael@moria.de>
+.\" Modified 1993-07-24 by Rik Faith <faith@cs.unc.edu>
+.\" Modified 1996-10-22 by Eric S. Raymond <esr@thyrsus.com>
+.\" Modified 2004-06-23 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Modified 2005-01-09 by aeb
+.\"
+.TH uselib 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+uselib \- load shared library
+.SH SYNOPSIS
+.nf
+.B #include <unistd.h>
+.PP
+.BI "[[deprecated]] int uselib(const char *" library );
+.fi
+.SH DESCRIPTION
+The system call
+.BR uselib ()
+serves to load
+a shared library to be used by the calling process.
+It is given a pathname.
+The address at which to load it is found
+in the library itself.
+The library can have any recognized
+binary format.
+.SH RETURN VALUE
+On success, zero is returned.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+In addition to all of the error codes returned by
+.BR open (2)
+and
+.BR mmap (2),
+the following may also be returned:
+.TP
+.B EACCES
+The library specified by
+.I library
+does not have read or execute permission, or the caller does not have
+search permission for one of the directories in the path prefix.
+(See also
+.BR path_resolution (7).)
+.TP
+.B ENFILE
+The system-wide limit on the total number of open files has been reached.
+.TP
+.B ENOEXEC
+The file specified by
+.I library
+is not an executable of a known type;
+for example, it does not have the correct magic numbers.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+This obsolete system call is not supported by glibc.
+No declaration is provided in glibc headers, but, through a quirk of history,
+glibc before glibc 2.23 did export an ABI for this system call.
+Therefore, in order to employ this system call,
+it was sufficient to manually declare the interface in your code;
+alternatively, you could invoke the system call using
+.BR syscall (2).
+.PP
+In ancient libc versions (before glibc 2.0),
+.BR uselib ()
+was used to load
+the shared libraries with names found in an array of names
+in the binary.
+.\" .PP
+.\" .\" libc 4.3.1f - changelog 1993-03-02
+.\" Since libc 4.3.2, startup code tries to prefix these names
+.\" with "/usr/lib", "/lib" and "" before giving up.
+.\" .\" libc 4.3.4 - changelog 1993-04-21
+.\" In libc 4.3.4 and later these names are looked for in the directories
+.\" found in
+.\" .BR LD_LIBRARY_PATH ,
+.\" and if not found there,
+.\" prefixes "/usr/lib", "/lib" and "/" are tried.
+.\" .PP
+.\" From libc 4.4.4 on only the library "/lib/ld.so" is loaded,
+.\" so that this dynamic library can load the remaining libraries needed
+.\" (again using this call).
+.\" This is also the state of affairs in libc5.
+.\" .PP
+.\" glibc2 does not use this call.
+.PP
+Since Linux 3.15,
+.\" commit 69369a7003735d0d8ef22097e27a55a8bad9557a
+this system call is available only when the kernel is configured with the
+.B CONFIG_USELIB
+option.
+.SH SEE ALSO
+.BR ar (1),
+.BR gcc (1),
+.BR ld (1),
+.BR ldd (1),
+.BR mmap (2),
+.BR open (2),
+.BR dlopen (3),
+.BR capabilities (7),
+.BR ld.so (8)
diff --git a/man2/userfaultfd.2 b/man2/userfaultfd.2
new file mode 100644
index 0000000..82903c6
--- /dev/null
+++ b/man2/userfaultfd.2
@@ -0,0 +1,943 @@
+.\" Copyright (c) 2016, IBM Corporation.
+.\" Written by Mike Rapoport <rppt@linux.vnet.ibm.com>
+.\" and Copyright (C) 2017 Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.TH userfaultfd 2 2023-05-03 "Linux man-pages 6.05.01"
+.SH NAME
+userfaultfd \- create a file descriptor for handling page faults in user space
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#include <fcntl.h>" " /* Definition of " O_* " constants */"
+.BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */"
+.BR "#include <linux/userfaultfd.h>" " /* Definition of " UFFD_* " constants */"
+.B #include <unistd.h>
+.PP
+.BI "int syscall(SYS_userfaultfd, int " flags );
+.fi
+.PP
+.IR Note :
+glibc provides no wrapper for
+.BR userfaultfd (),
+necessitating the use of
+.BR syscall (2).
+.SH DESCRIPTION
+.BR userfaultfd ()
+creates a new userfaultfd object that can be used for delegation of page-fault
+handling to a user-space application,
+and returns a file descriptor that refers to the new object.
+The new userfaultfd object is configured using
+.BR ioctl (2).
+.PP
+Once the userfaultfd object is configured, the application can use
+.BR read (2)
+to receive userfaultfd notifications.
+The reads from userfaultfd may be blocking or non-blocking,
+depending on the value of
+.I flags
+used for the creation of the userfaultfd or subsequent calls to
+.BR fcntl (2).
+.PP
+The following values may be bitwise ORed in
+.I flags
+to change the behavior of
+.BR userfaultfd ():
+.TP
+.B O_CLOEXEC
+Enable the close-on-exec flag for the new userfaultfd file descriptor.
+See the description of the
+.B O_CLOEXEC
+flag in
+.BR open (2).
+.TP
+.B O_NONBLOCK
+Enables non-blocking operation for the userfaultfd object.
+See the description of the
+.B O_NONBLOCK
+flag in
+.BR open (2).
+.TP
+.B UFFD_USER_MODE_ONLY
+This is a userfaultfd-specific flag that was introduced in Linux 5.11.
+When set, the userfaultfd object will only be able to handle
+page faults originated from the user space on the registered regions.
+When a kernel-originated fault is triggered
+on the registered range with this userfaultfd, a
+.B SIGBUS
+signal will be delivered.
+.PP
+When the last file descriptor referring to a userfaultfd object is closed,
+all memory ranges that were registered with the object are unregistered
+and unread events are flushed.
+.\"
+.PP
+Userfaultfd supports three modes of registration:
+.TP
+.BR UFFDIO_REGISTER_MODE_MISSING " (since Linux 4.10)"
+When registered with
+.B UFFDIO_REGISTER_MODE_MISSING
+mode, user-space will receive a page-fault notification
+when a missing page is accessed.
+The faulted thread will be stopped from execution until the page fault is
+resolved from user-space by either an
+.B UFFDIO_COPY
+or an
+.B UFFDIO_ZEROPAGE
+ioctl.
+.TP
+.BR UFFDIO_REGISTER_MODE_MINOR " (since Linux 5.13)"
+When registered with
+.B UFFDIO_REGISTER_MODE_MINOR
+mode, user-space will receive a page-fault notification
+when a minor page fault occurs.
+That is,
+when a backing page is in the page cache,
+but page table entries don't yet exist.
+The faulted thread will be stopped from execution
+until the page fault is resolved from user-space by an
+.B UFFDIO_CONTINUE
+ioctl.
+.TP
+.BR UFFDIO_REGISTER_MODE_WP " (since Linux 5.7)"
+When registered with
+.B UFFDIO_REGISTER_MODE_WP
+mode, user-space will receive a page-fault notification
+when a write-protected page is written.
+The faulted thread will be stopped from execution
+until user-space write-unprotects the page using an
+.B UFFDIO_WRITEPROTECT
+ioctl.
+.PP
+Multiple modes can be enabled at the same time for the same memory range.
+.PP
+Since Linux 4.14, a userfaultfd page-fault notification can selectively embed
+faulting thread ID information into the notification.
+One needs to enable this feature explicitly using the
+.B UFFD_FEATURE_THREAD_ID
+feature bit when initializing the userfaultfd context.
+By default, thread ID reporting is disabled.
+.SS Usage
+The userfaultfd mechanism is designed to allow a thread in a multithreaded
+program to perform user-space paging for the other threads in the process.
+When a page fault occurs for one of the regions registered
+to the userfaultfd object,
+the faulting thread is put to sleep and
+an event is generated that can be read via the userfaultfd file descriptor.
+The fault-handling thread reads events from this file descriptor and services
+them using the operations described in
+.BR ioctl_userfaultfd (2).
+When servicing the page fault events,
+the fault-handling thread can trigger a wake-up for the sleeping thread.
+.PP
+It is possible for the faulting threads and the fault-handling threads
+to run in the context of different processes.
+In this case, these threads may belong to different programs,
+and the program that executes the faulting threads
+will not necessarily cooperate with the program that handles the page faults.
+In such non-cooperative mode,
+the process that monitors userfaultfd and handles page faults
+needs to be aware of the changes in the virtual memory layout
+of the faulting process to avoid memory corruption.
+.PP
+Since Linux 4.11,
+userfaultfd can also notify the fault-handling threads about changes
+in the virtual memory layout of the faulting process.
+In addition, if the faulting process invokes
+.BR fork (2),
+the userfaultfd objects associated with the parent may be duplicated
+into the child process and the userfaultfd monitor will be notified
+(via the
+.B UFFD_EVENT_FORK
+event described below)
+about the file descriptor associated with the userfault objects
+created for the child process,
+which allows the userfaultfd monitor to perform user-space paging
+for the child process.
+Unlike page faults which have to be synchronous and require an
+explicit or implicit wakeup,
+all other events are delivered asynchronously and
+the non-cooperative process resumes execution as
+soon as the userfaultfd manager executes
+.BR read (2).
+The userfaultfd manager should carefully synchronize calls to
+.B UFFDIO_COPY
+with the processing of events.
+.PP
+The current asynchronous model of the event delivery is optimal for
+single-threaded non-cooperative userfaultfd manager implementations.
+.\" Regarding the preceding sentence, Mike Rapoport says:
+.\" The major point here is that current events delivery model could be
+.\" problematic for multi-threaded monitor. I even suspect that it would be
+.\" impossible to ensure synchronization between page faults and non-page
+.\" fault events in multi-threaded monitor.
+.\" .PP
+.\" FIXME elaborate about non-cooperating mode, describe its limitations
+.\" for kernels before Linux 4.11, features added in Linux 4.11
+.\" and limitations remaining in Linux 4.11
+.\" Maybe it's worth adding a dedicated sub-section...
+.\"
+.PP
+Since Linux 5.7, userfaultfd is able to do
+synchronous page dirty tracking using the new write-protect register mode.
+One should check against the feature bit
+.B UFFD_FEATURE_PAGEFAULT_FLAG_WP
+before using this feature.
+Similar to the original userfaultfd missing mode, the write-protect mode will
+generate a userfaultfd notification when the protected page is written.
+The user needs to resolve the page fault by unprotecting the faulted page and
+kicking the faulted thread to continue.
+For more information,
+please refer to the "Userfaultfd write-protect mode" section.
+.\"
+.SS Userfaultfd operation
+After the userfaultfd object is created with
+.BR userfaultfd (),
+the application must enable it using the
+.B UFFDIO_API
+.BR ioctl (2)
+operation.
+This operation allows a handshake between the kernel and user space
+to determine the API version and supported features.
+This operation must be performed before any of the other
+.BR ioctl (2)
+operations described below (or those operations fail with the
+.B EINVAL
+error).
+.PP
+After a successful
+.B UFFDIO_API
+operation,
+the application then registers memory address ranges using the
+.B UFFDIO_REGISTER
+.BR ioctl (2)
+operation.
+After successful completion of a
+.B UFFDIO_REGISTER
+operation,
+a page fault occurring in the requested memory range, and satisfying
+the mode defined at the registration time, will be forwarded by the kernel to
+the user-space application.
+The application can then use the
+.BR UFFDIO_COPY ,
+.BR UFFDIO_ZEROPAGE ,
+or
+.B UFFDIO_CONTINUE
+.BR ioctl (2)
+operations to resolve the page fault.
+.PP
+Since Linux 4.14, if the application sets the
+.B UFFD_FEATURE_SIGBUS
+feature bit using the
+.B UFFDIO_API
+.BR ioctl (2),
+no page-fault notification will be forwarded to user space.
+Instead a
+.B SIGBUS
+signal is delivered to the faulting process.
+With this feature,
+userfaultfd can be used for robustness purposes to simply catch
+any access to areas within the registered address range that do not
+have pages allocated, without having to listen to userfaultfd events.
+No userfaultfd monitor will be required for dealing with such memory
+accesses.
+For example, this feature can be useful for applications that
+want to prevent the kernel from automatically allocating pages and filling
+holes in sparse files when the hole is accessed through a memory mapping.
+.PP
+The
+.B UFFD_FEATURE_SIGBUS
+feature is implicitly inherited through
+.BR fork (2)
+if used in combination with
+.BR UFFD_FEATURE_EVENT_FORK .
+.PP
+Details of the various
+.BR ioctl (2)
+operations can be found in
+.BR ioctl_userfaultfd (2).
+.PP
+Since Linux 4.11, events other than page-fault may be enabled during the
+.B UFFDIO_API
+operation.
+.PP
+Before Linux 4.11,
+userfaultfd could be used only with anonymous private memory mappings.
+Since Linux 4.11,
+userfaultfd can also be used with hugetlbfs and shared memory mappings.
+.\"
+.SS Userfaultfd write-protect mode (since Linux 5.7)
+Since Linux 5.7, userfaultfd supports write-protect mode for anonymous memory.
+The user needs to first check availability of this feature using
+.B UFFDIO_API
+ioctl against the feature bit
+.B UFFD_FEATURE_PAGEFAULT_FLAG_WP
+before using this feature.
+.PP
+Since Linux 5.19,
+write-protect mode is also supported on
+shmem and hugetlbfs memory types.
+It can be detected with the feature bit
+.BR UFFD_FEATURE_WP_HUGETLBFS_SHMEM .
+.PP
+To register with userfaultfd write-protect mode, the user needs to initiate the
+.B UFFDIO_REGISTER
+ioctl with mode
+.B UFFDIO_REGISTER_MODE_WP
+set.
+Note that it is legal to monitor the same memory range with multiple modes.
+For example, the user can do
+.B UFFDIO_REGISTER
+with the mode set to
+.BR "UFFDIO_REGISTER_MODE_MISSING | UFFDIO_REGISTER_MODE_WP" .
+When there is only
+.B UFFDIO_REGISTER_MODE_WP
+registered, user-space will
+.I not
+receive any notification when a missing page is written.
+Instead, user-space will receive a write-protect page-fault notification
+only when an existing but write-protected page is written.
+.PP
+After the
+.B UFFDIO_REGISTER
+ioctl has completed with
+.B UFFDIO_REGISTER_MODE_WP
+mode set,
+the user can write-protect any existing memory within the range using the ioctl
+.B UFFDIO_WRITEPROTECT
+where
+.I uffdio_writeprotect.mode
+should be set to
+.BR UFFDIO_WRITEPROTECT_MODE_WP .
+.PP
+When a write-protect event happens,
+user-space will receive a page-fault notification whose
+.I uffd_msg.pagefault.flags
+will have the
+.B UFFD_PAGEFAULT_FLAG_WP
+flag set.
+Note: since only writes can trigger this kind of fault,
+write-protect notifications will always have the
+.B UFFD_PAGEFAULT_FLAG_WRITE
+bit set along with the
+.B UFFD_PAGEFAULT_FLAG_WP
+bit.
+.PP
+To resolve a write-protection page fault, the user should initiate another
+.B UFFDIO_WRITEPROTECT
+ioctl, whose
+.I uffdio_writeprotect.mode
+should have the flag
+.B UFFDIO_WRITEPROTECT_MODE_WP
+cleared upon the faulted page or range.
+.\"
+.SS Userfaultfd minor fault mode (since Linux 5.13)
+Since Linux 5.13,
+userfaultfd supports minor fault mode.
+In this mode,
+fault messages are produced not for major faults
+(where the page was missing),
+but rather for minor faults,
+where a page exists in the page cache,
+but the page table entries are not yet present.
+The user needs to first check availability of this feature using the
+.B UFFDIO_API
+ioctl with the appropriate feature bits set before using this feature:
+.B UFFD_FEATURE_MINOR_HUGETLBFS
+since Linux 5.13,
+or
+.B UFFD_FEATURE_MINOR_SHMEM
+since Linux 5.14.
+.PP
+To register with userfaultfd minor fault mode,
+the user needs to initiate the
+.B UFFDIO_REGISTER
+ioctl with mode
+.B UFFDIO_REGISTER_MODE_MINOR
+set.
+.PP
+When a minor fault occurs,
+user-space will receive a page-fault notification
+whose
+.I uffd_msg.pagefault.flags
+will have the
+.B UFFD_PAGEFAULT_FLAG_MINOR
+flag set.
+.PP
+To resolve a minor page fault,
+the handler should decide whether or not
+the existing page contents need to be modified first.
+If so,
+this should be done in-place via a second,
+non-userfaultfd-registered mapping
+to the same backing page
+(e.g., by mapping the shmem or hugetlbfs file twice).
+Once the page is considered "up to date",
+the fault can be resolved by initiating an
+.B UFFDIO_CONTINUE
+ioctl,
+which installs the page table entries and
+(by default)
+wakes up the faulting thread(s).
+.PP
+Minor fault mode supports only hugetlbfs-backed (since Linux 5.13)
+and shmem-backed (since Linux 5.14) memory.
+.\"
+.SS Reading from the userfaultfd structure
+Each
+.BR read (2)
+from the userfaultfd file descriptor returns one or more
+.I uffd_msg
+structures, each of which describes a page-fault event
+or an event required for the non-cooperative userfaultfd usage:
+.PP
+.in +4n
+.EX
+struct uffd_msg {
+ __u8 event; /* Type of event */
+ ...
+ union {
+ struct {
+ __u64 flags; /* Flags describing fault */
+ __u64 address; /* Faulting address */
+ union {
+ __u32 ptid; /* Thread ID of the fault */
+ } feat;
+ } pagefault;
+\&
+ struct { /* Since Linux 4.11 */
+ __u32 ufd; /* Userfault file descriptor
+ of the child process */
+ } fork;
+\&
+ struct { /* Since Linux 4.11 */
+ __u64 from; /* Old address of remapped area */
+ __u64 to; /* New address of remapped area */
+ __u64 len; /* Original mapping length */
+ } remap;
+\&
+ struct { /* Since Linux 4.11 */
+ __u64 start; /* Start address of removed area */
+ __u64 end; /* End address of removed area */
+ } remove;
+ ...
+ } arg;
+\&
+ /* Padding fields omitted */
+} __packed;
+.EE
+.in
+.PP
+If multiple events are available and the supplied buffer is large enough,
+.BR read (2)
+returns as many events as will fit in the supplied buffer.
+If the buffer supplied to
+.BR read (2)
+is smaller than the size of the
+.I uffd_msg
+structure, the
+.BR read (2)
+fails with the error
+.BR EINVAL .
+.PP
+The fields set in the
+.I uffd_msg
+structure are as follows:
+.TP
+.I event
+The type of event.
+Depending on the event type,
+different fields of the
+.I arg
+union represent details required for the event processing.
+The non-page-fault events are generated only when the appropriate feature
+is enabled during the API handshake with the
+.B UFFDIO_API
+.BR ioctl (2).
+.IP
+The following values can appear in the
+.I event
+field:
+.RS
+.TP
+.BR UFFD_EVENT_PAGEFAULT " (since Linux 4.3)"
+A page-fault event.
+The page-fault details are available in the
+.I pagefault
+field.
+.TP
+.BR UFFD_EVENT_FORK " (since Linux 4.11)"
+Generated when the faulting process invokes
+.BR fork (2)
+(or
+.BR clone (2)
+without the
+.B CLONE_VM
+flag).
+The event details are available in the
+.I fork
+field.
+.\" FIXME describe duplication of userfault file descriptor during fork
+.TP
+.BR UFFD_EVENT_REMAP " (since Linux 4.11)"
+Generated when the faulting process invokes
+.BR mremap (2).
+The event details are available in the
+.I remap
+field.
+.TP
+.BR UFFD_EVENT_REMOVE " (since Linux 4.11)"
+Generated when the faulting process invokes
+.BR madvise (2)
+with
+.B MADV_DONTNEED
+or
+.B MADV_REMOVE
+advice.
+The event details are available in the
+.I remove
+field.
+.TP
+.BR UFFD_EVENT_UNMAP " (since Linux 4.11)"
+Generated when the faulting process unmaps a memory range,
+either explicitly using
+.BR munmap (2)
+or implicitly during
+.BR mmap (2)
+or
+.BR mremap (2).
+The event details are available in the
+.I remove
+field.
+.RE
+.TP
+.I pagefault.address
+The address that triggered the page fault.
+.TP
+.I pagefault.flags
+A bit mask of flags that describe the event.
+For
+.BR UFFD_EVENT_PAGEFAULT ,
+the following flags may appear:
+.RS
+.TP
+.B UFFD_PAGEFAULT_FLAG_WP
+If this flag is set, then the fault was a write-protect fault.
+.TP
+.B UFFD_PAGEFAULT_FLAG_MINOR
+If this flag is set, then the fault was a minor fault.
+.TP
+.B UFFD_PAGEFAULT_FLAG_WRITE
+If this flag is set, then the fault was a write fault.
+.PP
+If neither
+.B UFFD_PAGEFAULT_FLAG_WP
+nor
+.B UFFD_PAGEFAULT_FLAG_MINOR
+is set, then the fault was a missing fault.
+.RE
+.TP
+.I pagefault.feat.ptid
+The thread ID that triggered the page fault.
+.TP
+.I fork.ufd
+The file descriptor associated with the userfault object
+created for the child created by
+.BR fork (2).
+.TP
+.I remap.from
+The original address of the memory range that was remapped using
+.BR mremap (2).
+.TP
+.I remap.to
+The new address of the memory range that was remapped using
+.BR mremap (2).
+.TP
+.I remap.len
+The original length of the memory range that was remapped using
+.BR mremap (2).
+.TP
+.I remove.start
+The start address of the memory range that was freed using
+.BR madvise (2)
+or unmapped.
+.TP
+.I remove.end
+The end address of the memory range that was freed using
+.BR madvise (2)
+or unmapped.
+.PP
+A
+.BR read (2)
+on a userfaultfd file descriptor can fail with the following errors:
+.TP
+.B EINVAL
+The userfaultfd object has not yet been enabled using the
+.B UFFDIO_API
+.BR ioctl (2)
+operation.
+.PP
+If the
+.B O_NONBLOCK
+flag is enabled in the associated open file description,
+the userfaultfd file descriptor can be monitored with
+.BR poll (2),
+.BR select (2),
+and
+.BR epoll (7).
+When events are available, the file descriptor is indicated as readable.
+If the
+.B O_NONBLOCK
+flag is not enabled, then
+.BR poll (2)
+(always) indicates the file as having a
+.B POLLERR
+condition, and
+.BR select (2)
+indicates the file descriptor as both readable and writable.
+.\" FIXME What is the reason for this seemingly odd behavior with respect
+.\" to the O_NONBLOCK flag? (see userfaultfd_poll() in fs/userfaultfd.c).
+.\" Something needs to be said about this.
+.SH RETURN VALUE
+On success,
+.BR userfaultfd ()
+returns a new file descriptor that refers to the userfaultfd object.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EINVAL
+An unsupported value was specified in
+.IR flags .
+.TP
+.B EMFILE
+The per-process limit on the number of open file descriptors has been
+reached.
+.TP
+.B ENFILE
+The system-wide limit on the total number of open files has been
+reached.
+.TP
+.B ENOMEM
+Insufficient kernel memory was available.
+.TP
+.BR EPERM " (since Linux 5.2)"
+.\" cefdca0a86be517bc390fc4541e3674b8e7803b0
+The caller is not privileged (does not have the
+.B CAP_SYS_PTRACE
+capability in the initial user namespace), and
+.I /proc/sys/vm/unprivileged_userfaultfd
+has the value 0.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 4.3.
+.PP
+Support for hugetlbfs and shared memory areas and
+non-page-fault events was added in Linux 4.11.
+.SH NOTES
+The userfaultfd mechanism can be used as an alternative to
+traditional user-space paging techniques based on the use of the
+.B SIGSEGV
+signal and
+.BR mmap (2).
+It can also be used to implement lazy restore
+for checkpoint/restore mechanisms,
+as well as post-copy migration to allow (nearly) uninterrupted execution
+when transferring virtual machines and Linux containers
+from one host to another.
+.SH BUGS
+If the
+.B UFFD_FEATURE_EVENT_FORK
+feature is enabled and a system call from the
+.BR fork (2)
+family is interrupted by a signal or fails, a stale userfaultfd descriptor
+might be created.
+In this case, a spurious
+.B UFFD_EVENT_FORK
+will be delivered to the userfaultfd monitor.
+.SH EXAMPLES
+The program below demonstrates the use of the userfaultfd mechanism.
+The program creates two threads, one of which acts as the
+page-fault handler for the process, for the pages in a demand-zero paged
+region created using
+.BR mmap (2).
+.PP
+The program takes one command-line argument,
+which is the number of pages that will be created in a mapping
+whose page faults will be handled via userfaultfd.
+After creating a userfaultfd object,
+the program then creates an anonymous private mapping of the specified size
+and registers the address range of that mapping using the
+.B UFFDIO_REGISTER
+.BR ioctl (2)
+operation.
+The program then creates a second thread that will perform the
+task of handling page faults.
+.PP
+The main thread then walks through the pages of the mapping fetching
+bytes from successive pages.
+Because the pages have not yet been accessed,
+the first access of a byte in each page will trigger a page-fault event
+on the userfaultfd file descriptor.
+.PP
+Each of the page-fault events is handled by the second thread,
+which sits in a loop processing input from the userfaultfd file descriptor.
+In each loop iteration, the second thread first calls
+.BR poll (2)
+to check the state of the file descriptor,
+and then reads an event from the file descriptor.
+All such events should be
+.B UFFD_EVENT_PAGEFAULT
+events,
+which the thread handles by copying a page of data into
+the faulting region using the
+.B UFFDIO_COPY
+.BR ioctl (2)
+operation.
+.PP
+The following is an example of what we see when running the program:
+.PP
+.in +4n
+.EX
+$ \fB./userfaultfd_demo 3\fP
+Address returned by mmap() = 0x7fd30106c000
+\&
+fault_handler_thread():
+ poll() returns: nready = 1; POLLIN = 1; POLLERR = 0
+ UFFD_EVENT_PAGEFAULT event: flags = 0; address = 7fd30106c00f
+ (uffdio_copy.copy returned 4096)
+Read address 0x7fd30106c00f in main(): A
+Read address 0x7fd30106c40f in main(): A
+Read address 0x7fd30106c80f in main(): A
+Read address 0x7fd30106cc0f in main(): A
+\&
+fault_handler_thread():
+ poll() returns: nready = 1; POLLIN = 1; POLLERR = 0
+ UFFD_EVENT_PAGEFAULT event: flags = 0; address = 7fd30106d00f
+ (uffdio_copy.copy returned 4096)
+Read address 0x7fd30106d00f in main(): B
+Read address 0x7fd30106d40f in main(): B
+Read address 0x7fd30106d80f in main(): B
+Read address 0x7fd30106dc0f in main(): B
+\&
+fault_handler_thread():
+ poll() returns: nready = 1; POLLIN = 1; POLLERR = 0
+ UFFD_EVENT_PAGEFAULT event: flags = 0; address = 7fd30106e00f
+ (uffdio_copy.copy returned 4096)
+Read address 0x7fd30106e00f in main(): C
+Read address 0x7fd30106e40f in main(): C
+Read address 0x7fd30106e80f in main(): C
+Read address 0x7fd30106ec0f in main(): C
+.EE
+.in
+.SS Program source
+\&
+.\" SRC BEGIN (userfaultfd.c)
+.EX
+/* userfaultfd_demo.c
+\&
+ Licensed under the GNU General Public License version 2 or later.
+*/
+#define _GNU_SOURCE
+#include <err.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <linux/userfaultfd.h>
+#include <poll.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+\&
+static int page_size;
+\&
+static void *
+fault_handler_thread(void *arg)
+{
+ int nready; /* Return value of poll() */
+ long uffd; /* userfaultfd file descriptor, passed in via arg */
+ ssize_t nread;
+ struct pollfd pollfd;
+ struct uffdio_copy uffdio_copy;
+\&
+ static int fault_cnt = 0; /* Number of faults handled so far */
+ static char *page = NULL; /* Source page for UFFDIO_COPY */
+ static struct uffd_msg msg; /* Data read from userfaultfd */
+\&
+ uffd = (long) arg;
+\&
+ /* Create a page that will be copied into the faulting region. */
+\&
+ if (page == NULL) {
+ page = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, \-1, 0);
+ if (page == MAP_FAILED)
+ err(EXIT_FAILURE, "mmap");
+ }
+\&
+ /* Loop, handling incoming events on the userfaultfd
+ file descriptor. */
+\&
+ for (;;) {
+\&
+ /* See what poll() tells us about the userfaultfd. */
+\&
+ pollfd.fd = uffd;
+ pollfd.events = POLLIN;
+ nready = poll(&pollfd, 1, \-1);
+ if (nready == \-1)
+ err(EXIT_FAILURE, "poll");
+\&
+ printf("\enfault_handler_thread():\en");
+ printf(" poll() returns: nready = %d; "
+ "POLLIN = %d; POLLERR = %d\en", nready,
+ (pollfd.revents & POLLIN) != 0,
+ (pollfd.revents & POLLERR) != 0);
+\&
+ /* Read an event from the userfaultfd. */
+\&
+ nread = read(uffd, &msg, sizeof(msg));
+ if (nread == 0) {
+ printf("EOF on userfaultfd!\en");
+ exit(EXIT_FAILURE);
+ }
+\&
+ if (nread == \-1)
+ err(EXIT_FAILURE, "read");
+\&
+ /* We expect only one kind of event; verify that assumption. */
+\&
+ if (msg.event != UFFD_EVENT_PAGEFAULT) {
+ fprintf(stderr, "Unexpected event on userfaultfd\en");
+ exit(EXIT_FAILURE);
+ }
+\&
+ /* Display info about the page\-fault event. */
+\&
+ printf(" UFFD_EVENT_PAGEFAULT event: ");
+ printf("flags = %"PRIx64"; ", msg.arg.pagefault.flags);
+ printf("address = %"PRIx64"\en", msg.arg.pagefault.address);
+\&
+ /* Copy the page pointed to by \[aq]page\[aq] into the faulting
+ region. Vary the contents that are copied in, so that it
+ is more obvious that each fault is handled separately. */
+\&
+ memset(page, \[aq]A\[aq] + fault_cnt % 20, page_size);
+ fault_cnt++;
+\&
+ uffdio_copy.src = (unsigned long) page;
+\&
+ /* We need to handle page faults in units of pages(!).
+ So, round faulting address down to page boundary. */
+\&
+ uffdio_copy.dst = (unsigned long) msg.arg.pagefault.address &
+ \[ti](page_size \- 1);
+ uffdio_copy.len = page_size;
+ uffdio_copy.mode = 0;
+ uffdio_copy.copy = 0;
+ if (ioctl(uffd, UFFDIO_COPY, &uffdio_copy) == \-1)
+ err(EXIT_FAILURE, "ioctl\-UFFDIO_COPY");
+\&
+ printf(" (uffdio_copy.copy returned %"PRId64")\en",
+ uffdio_copy.copy);
+ }
+}
+\&
+int
+main(int argc, char *argv[])
+{
+ int s; /* Return value of pthread_create() */
+ char c; /* Byte read from the mapping */
+ char *addr; /* Start of region handled by userfaultfd */
+ long uffd; /* userfaultfd file descriptor */
+ size_t len, l; /* Length of region handled by userfaultfd */
+ pthread_t thr; /* ID of thread that handles page faults */
+ struct uffdio_api uffdio_api;
+ struct uffdio_register uffdio_register;
+\&
+ if (argc != 2) {
+ fprintf(stderr, "Usage: %s num\-pages\en", argv[0]);
+ exit(EXIT_FAILURE);
+ }
+\&
+ page_size = sysconf(_SC_PAGE_SIZE);
+ len = strtoull(argv[1], NULL, 0) * page_size;
+\&
+ /* Create and enable userfaultfd object. */
+\&
+ uffd = syscall(SYS_userfaultfd, O_CLOEXEC | O_NONBLOCK);
+ if (uffd == \-1)
+ err(EXIT_FAILURE, "userfaultfd");
+\&
+ uffdio_api.api = UFFD_API;
+ uffdio_api.features = 0;
+ if (ioctl(uffd, UFFDIO_API, &uffdio_api) == \-1)
+ err(EXIT_FAILURE, "ioctl\-UFFDIO_API");
+\&
+ /* Create a private anonymous mapping. The memory will be
+ demand\-zero paged\-\-that is, not yet allocated. When we
+ actually touch the memory, it will be allocated via
+ the userfaultfd. */
+\&
+ addr = mmap(NULL, len, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, \-1, 0);
+ if (addr == MAP_FAILED)
+ err(EXIT_FAILURE, "mmap");
+\&
+ printf("Address returned by mmap() = %p\en", addr);
+\&
+ /* Register the memory range of the mapping we just created for
+ handling by the userfaultfd object. In mode, we request to track
+ missing pages (i.e., pages that have not yet been faulted in). */
+\&
+ uffdio_register.range.start = (unsigned long) addr;
+ uffdio_register.range.len = len;
+ uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
+ if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == \-1)
+ err(EXIT_FAILURE, "ioctl\-UFFDIO_REGISTER");
+\&
+ /* Create a thread that will process the userfaultfd events. */
+\&
+ s = pthread_create(&thr, NULL, fault_handler_thread, (void *) uffd);
+ if (s != 0) {
+ errc(EXIT_FAILURE, s, "pthread_create");
+ }
+\&
+ /* Main thread now touches memory in the mapping, touching
+ locations 1024 bytes apart. This will trigger userfaultfd
+ events for all pages in the region. */
+\&
+ l = 0xf; /* Ensure that faulting address is not on a page
+ boundary, in order to test that we correctly
+ handle that case in fault_handler_thread(). */
+ while (l < len) {
+ c = addr[l];
+ printf("Read address %p in %s(): ", addr + l, __func__);
+ printf("%c\en", c);
+ l += 1024;
+ usleep(100000); /* Slow things down a little */
+ }
+\&
+ exit(EXIT_SUCCESS);
+}
+.EE
+.\" SRC END
+.SH SEE ALSO
+.BR fcntl (2),
+.BR ioctl (2),
+.BR ioctl_userfaultfd (2),
+.BR madvise (2),
+.BR mmap (2)
+.PP
+.I Documentation/admin\-guide/mm/userfaultfd.rst
+in the Linux kernel source tree
diff --git a/man2/ustat.2 b/man2/ustat.2
new file mode 100644
index 0000000..a894b13
--- /dev/null
+++ b/man2/ustat.2
@@ -0,0 +1,104 @@
+.\" Copyright (C) 1995, Thomas K. Dyas <tdyas@eden.rutgers.edu>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Created 1995-08-09 Thomas K. Dyas <tdyas@eden.rutgers.edu>
+.\" Modified 1997-01-31 by Eric S. Raymond <esr@thyrsus.com>
+.\" Modified 2001-03-22 by aeb
+.\" Modified 2003-08-04 by aeb
+.\"
+.TH ustat 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+ustat \- get filesystem statistics
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/types.h>
+.BR "#include <unistd.h>" " /* libc[45] */"
+.BR "#include <ustat.h>" " /* glibc2 */"
+.PP
+.BI "[[deprecated]] int ustat(dev_t " dev ", struct ustat *" ubuf );
+.fi
+.SH DESCRIPTION
+.BR ustat ()
+returns information about a mounted filesystem.
+.I dev
+is a device number identifying a device containing
+a mounted filesystem.
+.I ubuf
+is a pointer to a
+.I ustat
+structure that contains the following
+members:
+.PP
+.in +4n
+.EX
+daddr_t f_tfree; /* Total free blocks */
+ino_t f_tinode; /* Number of free inodes */
+char f_fname[6]; /* Filsys name */
+char f_fpack[6]; /* Filsys pack name */
+.EE
+.in
+.PP
+The last two fields,
+.I f_fname
+and
+.IR f_fpack ,
+are not implemented and will
+always be filled with null bytes (\[aq]\e0\[aq]).
+.SH RETURN VALUE
+On success, zero is returned and the
+.I ustat
+structure pointed to by
+.I ubuf
+will be filled in.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EFAULT
+.I ubuf
+points outside of your accessible address space.
+.TP
+.B EINVAL
+.I dev
+does not refer to a device containing a mounted filesystem.
+.TP
+.B ENOSYS
+The mounted filesystem referenced by
+.I dev
+does not support this operation,
+or the running kernel is older than Linux 1.3.16.
+.SH STANDARDS
+None.
+.SH HISTORY
+SVr4.
+Removed in glibc 2.28.
+.\" SVr4 documents additional error conditions ENOLINK, ECOMM, and EINTR
+.\" but has no ENOSYS condition.
+.PP
+.BR ustat ()
+is deprecated and has been provided only for compatibility.
+All new programs should use
+.BR statfs (2)
+instead.
+.SS HP-UX notes
+The HP-UX version of the
+.I ustat
+structure has an additional field,
+.IR f_blksize ,
+that is unknown elsewhere.
+HP-UX warns:
+For some filesystems, the number of free inodes does not change.
+Such filesystems will return \-1 in the field
+.IR f_tinode .
+.\" Some software tries to use this in order to test whether the
+.\" underlying filesystem is NFS.
+For some filesystems, inodes are dynamically allocated.
+Such filesystems will return the current number of free inodes.
+.SH SEE ALSO
+.BR stat (2),
+.BR statfs (2)
diff --git a/man2/utime.2 b/man2/utime.2
new file mode 100644
index 0000000..86760ab
--- /dev/null
+++ b/man2/utime.2
@@ -0,0 +1,179 @@
+.\" Copyright (c) 1992 Drew Eckhardt (drew@cs.colorado.edu), March 28, 1992
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified by Michael Haardt <michael@moria.de>
+.\" Modified 1993-07-24 by Rik Faith <faith@cs.unc.edu>
+.\" Modified 1995-06-10 by Andries Brouwer <aeb@cwi.nl>
+.\" Modified 2004-06-23 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Modified 2004-10-10 by Andries Brouwer <aeb@cwi.nl>
+.\"
+.TH utime 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+utime, utimes \- change file last access and modification times
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <utime.h>
+.PP
+.BI "int utime(const char *" filename ,
+.BI " const struct utimbuf *_Nullable " times );
+.PP
+.B #include <sys/time.h>
+.PP
+.BI "int utimes(const char *" filename ,
+.BI " const struct timeval " times "[_Nullable 2]);"
+.fi
+.SH DESCRIPTION
+.B Note:
+modern applications may prefer to use the interfaces described in
+.BR utimensat (2).
+.PP
+The
+.BR utime ()
+system call
+changes the access and modification times of the inode specified by
+.I filename
+to the
+.IR actime " and " modtime
+fields of
+.I times
+respectively.
+The status change time (ctime) will be set to the current time, even if the
+other timestamps don't actually change.
+.PP
+If
+.I times
+is NULL, then the access and modification times of the file are set
+to the current time.
+.PP
+Changing timestamps is permitted when: either
+the process has appropriate privileges,
+or the effective user ID equals the user ID
+of the file, or
+.I times
+is NULL and the process has write permission for the file.
+.PP
+The
+.I utimbuf
+structure is:
+.PP
+.in +4n
+.EX
+struct utimbuf {
+ time_t actime; /* access time */
+ time_t modtime; /* modification time */
+};
+.EE
+.in
+.PP
+The
+.BR utime ()
+system call
+allows specification of timestamps with a resolution of 1 second.
+.PP
+The
+.BR utimes ()
+system call
+is similar, but the
+.I times
+argument refers to an array rather than a structure.
+The elements of this array are
+.I timeval
+structures, which allow a precision of 1 microsecond for specifying timestamps.
+The
+.I timeval
+structure is:
+.PP
+.in +4n
+.EX
+struct timeval {
+ long tv_sec; /* seconds */
+ long tv_usec; /* microseconds */
+};
+.EE
+.in
+.PP
+.I times[0]
+specifies the new access time, and
+.I times[1]
+specifies the new modification time.
+If
+.I times
+is NULL, then analogously to
+.BR utime (),
+the access and modification times of the file are
+set to the current time.
+.SH RETURN VALUE
+On success, zero is returned.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EACCES
+Search permission is denied for one of the directories in
+the path prefix of
+.I filename
+(see also
+.BR path_resolution (7)).
+.TP
+.B EACCES
+.I times
+is NULL,
+the caller's effective user ID does not match the owner of the file,
+the caller does not have write access to the file,
+and the caller is not privileged
+(Linux: does not have either the
+.B CAP_DAC_OVERRIDE
+or the
+.B CAP_FOWNER
+capability).
+.TP
+.B ENOENT
+.I filename
+does not exist.
+.TP
+.B EPERM
+.I times
+is not NULL,
+the caller's effective UID does not match the owner of the file,
+and the caller is not privileged
+(Linux: does not have the
+.B CAP_FOWNER
+capability).
+.TP
+.B EROFS
+.I filename
+resides on a read-only filesystem.
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+.TP
+.BR utime ()
+SVr4, POSIX.1-2001.
+POSIX.1-2008 marks it as obsolete.
+.TP
+.BR utimes ()
+4.3BSD, POSIX.1-2001.
+.SH NOTES
+Linux does not allow changing the timestamps on an immutable file,
+or setting the timestamps to something other than the current time
+on an append-only file.
+.\"
+.\" In libc4 and libc5,
+.\" .BR utimes ()
+.\" is just a wrapper for
+.\" .BR utime ()
+.\" and hence does not allow a subsecond resolution.
+.SH SEE ALSO
+.BR chattr (1),
+.BR touch (1),
+.BR futimesat (2),
+.BR stat (2),
+.BR utimensat (2),
+.BR futimens (3),
+.BR futimes (3),
+.BR inode (7)
diff --git a/man2/utimensat.2 b/man2/utimensat.2
new file mode 100644
index 0000000..77456fc
--- /dev/null
+++ b/man2/utimensat.2
@@ -0,0 +1,613 @@
+'\" t
+.\" Copyright (C) 2008, Linux Foundation, written by Michael Kerrisk
+.\" <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.TH utimensat 2 2023-07-20 "Linux man-pages 6.05.01"
+.SH NAME
+utimensat, futimens \- change file timestamps with nanosecond precision
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#include <fcntl.h>" " /* Definition of " AT_* " constants */"
+.B #include <sys/stat.h>
+.PP
+.BI "int utimensat(int " dirfd ", const char *" pathname ,
+.BI " const struct timespec " times "[_Nullable 2], int " flags );
+.BI "int futimens(int " fd ", const struct timespec " times "[_Nullable 2]);"
+.fi
+.PP
+.RS -4
+Feature Test Macro Requirements for glibc (see
+.BR feature_test_macros (7)):
+.RE
+.PP
+.BR utimensat ():
+.nf
+ Since glibc 2.10:
+ _POSIX_C_SOURCE >= 200809L
+ Before glibc 2.10:
+ _ATFILE_SOURCE
+.fi
+.PP
+.BR futimens ():
+.nf
+ Since glibc 2.10:
+ _POSIX_C_SOURCE >= 200809L
+ Before glibc 2.10:
+ _GNU_SOURCE
+.fi
+.SH DESCRIPTION
+.BR utimensat ()
+and
+.BR futimens ()
+update the timestamps of a file with nanosecond precision.
+This contrasts with the historical
+.BR utime (2)
+and
+.BR utimes (2),
+which permit only second and microsecond precision, respectively,
+when setting file timestamps.
+.PP
+With
+.BR utimensat ()
+the file is specified via the pathname given in
+.IR pathname .
+With
+.BR futimens ()
+the file whose timestamps are to be updated is specified via
+an open file descriptor,
+.IR fd .
+.PP
+For both calls, the new file timestamps are specified in the array
+.IR times :
+.I times[0]
+specifies the new "last access time" (\fIatime\fP);
+.I times[1]
+specifies the new "last modification time" (\fImtime\fP).
+Each of the elements of
+.I times
+specifies a time as the number of seconds and nanoseconds
+since the Epoch, 1970-01-01 00:00:00 +0000 (UTC).
+This information is conveyed in a
+.BR timespec (3)
+structure.
+.PP
+Updated file timestamps are set to the greatest value
+supported by the filesystem that is not greater than the specified time.
+.PP
+If the
+.I tv_nsec
+field of one of the
+.I timespec
+structures has the special value
+.BR UTIME_NOW ,
+then the corresponding file timestamp is set to the current time.
+If the
+.I tv_nsec
+field of one of the
+.I timespec
+structures has the special value
+.BR UTIME_OMIT ,
+then the corresponding file timestamp is left unchanged.
+In both of these cases, the value of the corresponding
+.I tv_sec
+.\" 2.6.22 was broken: it is not ignored
+field is ignored.
+.PP
+If
+.I times
+is NULL, then both timestamps are set to the current time.
+.\"
+.PP
+The status change time (ctime) will be set to the current time, even if the
+other timestamps don't actually change.
+.SS Permissions requirements
+To set both file timestamps to the current time (i.e.,
+.I times
+is NULL, or both
+.I tv_nsec
+fields specify
+.BR UTIME_NOW ),
+either:
+.IP \[bu] 3
+the caller must have write access to the file;
+.\" 2.6.22 was broken here -- for futimens() the check is
+.\" based on whether or not the file descriptor is writable,
+.\" not on whether the caller's effective UID has write
+.\" permission for the file referred to by the descriptor.
+.IP \[bu]
+the caller's effective user ID must match the owner of the file; or
+.IP \[bu]
+the caller must have appropriate privileges.
+.PP
+To make any change other than setting both timestamps to the
+current time (i.e.,
+.I times
+is not NULL, and neither
+.I tv_nsec
+field is
+.B UTIME_NOW
+.\" 2.6.22 was broken here:
+.\" both must be something other than *either* UTIME_OMIT *or* UTIME_NOW.
+and neither
+.I tv_nsec
+field is
+.BR UTIME_OMIT ),
+either condition 2 or 3 above must apply.
+.PP
+If both
+.I tv_nsec
+fields are specified as
+.BR UTIME_OMIT ,
+then no file ownership or permission checks are performed,
+and the file timestamps are not modified,
+but other error conditions may still be detected.
+.\"
+.\"
+.SS utimensat() specifics
+If
+.I pathname
+is relative, then by default it is interpreted relative to the
+directory referred to by the open file descriptor,
+.I dirfd
+(rather than relative to the current working directory of
+the calling process, as is done by
+.BR utimes (2)
+for a relative pathname).
+See
+.BR openat (2)
+for an explanation of why this can be useful.
+.PP
+If
+.I pathname
+is relative and
+.I dirfd
+is the special value
+.BR AT_FDCWD ,
+then
+.I pathname
+is interpreted relative to the current working
+directory of the calling process (like
+.BR utimes (2)).
+.PP
+If
+.I pathname
+is absolute, then
+.I dirfd
+is ignored.
+.PP
+The
+.I flags
+field is a bit mask that may be 0, or include the following constant,
+defined in
+.IR <fcntl.h> :
+.TP
+.B AT_SYMLINK_NOFOLLOW
+If
+.I pathname
+specifies a symbolic link, then update the timestamps of the link,
+rather than the file to which it refers.
+.SH RETURN VALUE
+On success,
+.BR utimensat ()
+and
+.BR futimens ()
+return 0.
+On error, \-1 is returned and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EACCES
+.I times
+is NULL,
+or both
+.I tv_nsec
+values are
+.BR UTIME_NOW ,
+and the effective user ID of the caller does not match
+the owner of the file,
+the caller does not have write access to the file,
+and the caller is not privileged
+(Linux: does not have either the
+.B CAP_FOWNER
+or the
+.B CAP_DAC_OVERRIDE
+capability).
+.\" But Linux 2.6.22 was broken here.
+.\" Traditionally, utime()/utimes() gives the error EACCES for the case
+.\" where the timestamp pointer argument is NULL (i.e., set both timestamps
+.\" to the current time), and the file is owned by a user other than the
+.\" effective UID of the caller, and the file is not writable by the
+.\" effective UID of the program. utimensat() also gives this error in the
+.\" same case. However, in the same circumstances, when utimensat() is
+.\" given a 'times' array in which both tv_nsec fields are UTIME_NOW, which
+.\" provides equivalent functionality to specifying 'times' as NULL, the
+.\" call succeeds. It should fail with the error EACCES in this case.
+.\"
+.\" POSIX.1-2008 has the following:
+.\" .TP
+.\" .B EACCES
+.\" .RB ( utimensat ())
+.\" .I fd
+.\" was not opened with
+.\" .B O_SEARCH
+.\" and the permissions of the directory to which
+.\" .I fd
+.\" refers do not allow searches.
+.\" EXT2_IMMUTABLE_FL and similar flags for other filesystems.
+.TP
+.B EBADF
+.RB ( futimens ())
+.I fd
+is not a valid file descriptor.
+.TP
+.B EBADF
+.RB ( utimensat ())
+.I pathname
+is relative but
+.I dirfd
+is neither
+.B AT_FDCWD
+nor a valid file descriptor.
+.TP
+.B EFAULT
+.I times
+pointed to an invalid address; or,
+.I dirfd
+was
+.BR AT_FDCWD ,
+and
+.I pathname
+is NULL or an invalid address.
+.TP
+.B EINVAL
+Invalid value in
+.IR flags .
+.TP
+.B EINVAL
+Invalid value in one of the
+.I tv_nsec
+fields (value outside range [0, 999,999,999], and not
+.B UTIME_NOW
+or
+.BR UTIME_OMIT );
+or an invalid value in one of the
+.I tv_sec
+fields.
+.TP
+.B EINVAL
+.\" SUSv4 does not specify this error.
+.I pathname
+is NULL,
+.I dirfd
+is not
+.BR AT_FDCWD ,
+and
+.I flags
+contains
+.BR AT_SYMLINK_NOFOLLOW .
+.TP
+.B ELOOP
+.RB ( utimensat ())
+Too many symbolic links were encountered in resolving
+.IR pathname .
+.TP
+.B ENAMETOOLONG
+.RB ( utimensat ())
+.I pathname
+is too long.
+.TP
+.B ENOENT
+.RB ( utimensat ())
+A component of
+.I pathname
+does not refer to an existing directory or file,
+or
+.I pathname
+is an empty string.
+.TP
+.B ENOTDIR
+.RB ( utimensat ())
+.I pathname
+is a relative pathname, but
+.I dirfd
+is neither
+.B AT_FDCWD
+nor a file descriptor referring to a directory;
+or, one of the prefix components of
+.I pathname
+is not a directory.
+.TP
+.B EPERM
+The caller attempted to change one or both timestamps to a value
+other than the current time,
+or to change one of the timestamps to the current time while
+leaving the other timestamp unchanged,
+(i.e.,
+.I times
+is not NULL, neither
+.I tv_nsec
+field is
+.BR UTIME_NOW ,
+and neither
+.I tv_nsec
+field is
+.BR UTIME_OMIT )
+and either:
+.RS
+.IP \[bu] 3
+the caller's effective user ID does not match the owner of the file,
+and the caller is not privileged
+(Linux: does not have the
+.B CAP_FOWNER
+capability); or,
+.IP \[bu]
+.\" Linux 2.6.22 was broken here:
+.\" it was not consistent with the old utimes() implementation,
+.\" since the case when both tv_nsec fields are UTIME_NOW, was not
+.\" treated like the (times == NULL) case.
+the file is marked append-only or immutable (see
+.BR chattr (1)).
+.\" EXT2_IMMUTABLE_FL EXT_APPEND_FL and similar flags for
+.\" other filesystems.
+.\"
+.\" Why the inconsistency (which is described under NOTES) between
+.\" EACCES and EPERM, where only EPERM tests for append-only.
+.\" (This was also so for the older utimes() implementation.)
+.RE
+.TP
+.B EROFS
+The file is on a read-only filesystem.
+.TP
+.B ESRCH
+.RB ( utimensat ())
+Search permission is denied for one of the prefix components of
+.IR pathname .
+.SH ATTRIBUTES
+For an explanation of the terms used in this section, see
+.BR attributes (7).
+.TS
+allbox;
+lbx lb lb
+l l l.
+Interface Attribute Value
+T{
+.na
+.nh
+.BR utimensat (),
+.BR futimens ()
+T} Thread safety MT-Safe
+.TE
+.sp 1
+.SH VERSIONS
+.SS C library/kernel ABI differences
+On Linux,
+.BR futimens ()
+is a library function implemented on top of the
+.BR utimensat ()
+system call.
+To support this, the Linux
+.BR utimensat ()
+system call implements a nonstandard feature: if
+.I pathname
+is NULL, then the call modifies the timestamps of
+the file referred to by the file descriptor
+.I dirfd
+(which may refer to any type of file).
+Using this feature, the call
+.I "futimens(fd,\ times)"
+is implemented as:
+.PP
+.in +4n
+.EX
+utimensat(fd, NULL, times, 0);
+.EE
+.in
+.PP
+Note, however, that the glibc wrapper for
+.BR utimensat ()
+disallows passing NULL as the value for
+.IR pathname :
+the wrapper function returns the error
+.B EINVAL
+in this case.
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+.TP
+.BR utimensat ()
+Linux 2.6.22,
+glibc 2.6.
+POSIX.1-2008.
+.TP
+.BR futimens ()
+glibc 2.6.
+POSIX.1-2008.
+.SH NOTES
+.BR utimensat ()
+obsoletes
+.BR futimesat (2).
+.PP
+On Linux, timestamps cannot be changed for a file marked immutable,
+and the only change permitted for files marked append-only is to
+set the timestamps to the current time.
+(This is consistent with the historical behavior of
+.BR utime (2)
+and
+.BR utimes (2)
+on Linux.)
+.PP
+If both
+.I tv_nsec
+fields are specified as
+.BR UTIME_OMIT ,
+then the Linux implementation of
+.BR utimensat ()
+succeeds even if the file referred to by
+.I dirfd
+and
+.I pathname
+does not exist.
+.SH BUGS
+Several bugs afflict
+.BR utimensat ()
+and
+.BR futimens ()
+before Linux 2.6.26.
+These bugs are either nonconformances with the POSIX.1 draft specification
+or inconsistencies with historical Linux behavior.
+.IP \[bu] 3
+POSIX.1 specifies that if one of the
+.I tv_nsec
+fields has the value
+.B UTIME_NOW
+or
+.BR UTIME_OMIT ,
+then the value of the corresponding
+.I tv_sec
+field should be ignored.
+Instead, the value of the
+.I tv_sec
+field is required to be 0 (or the error
+.B EINVAL
+results).
+.IP \[bu]
+Various bugs mean that for the purposes of permission checking,
+the case where both
+.I tv_nsec
+fields are set to
+.B UTIME_NOW
+isn't always treated the same as specifying
+.I times
+as NULL,
+and the case where one
+.I tv_nsec
+value is
+.B UTIME_NOW
+and the other is
+.B UTIME_OMIT
+isn't treated the same as specifying
+.I times
+as a pointer to an array of structures containing arbitrary time values.
+As a result, in some cases:
+a) file timestamps can be updated by a process that shouldn't have
+permission to perform updates;
+b) file timestamps can't be updated by a process that should have
+permission to perform updates; and
+c) the wrong
+.I errno
+value is returned in case of an error.
+.\" Below, the long description of the errors from the previous bullet
+.\" point (abridged because it's too much detail for a man page).
+.\" .IP *
+.\" If one of the
+.\" .I tv_nsec
+.\" fields is
+.\" .BR UTIME_OMIT
+.\" and the other is
+.\" .BR UTIME_NOW ,
+.\" then the error
+.\" .B EPERM
+.\" should occur if the process's effective user ID does not match
+.\" the file owner and the process is not privileged.
+.\" Instead, the call successfully changes one of the timestamps.
+.\" .IP *
+.\" If file is not writable by the effective user ID of the process and
+.\" the process's effective user ID does not match the file owner and
+.\" the process is not privileged,
+.\" and
+.\" .I times
+.\" is NULL, then the error
+.\" .B EACCES
+.\" results.
+.\" This error should also occur if
+.\" .I times
+.\" points to an array of structures in which both
+.\" .I tv_nsec
+.\" fields are
+.\" .BR UTIME_NOW .
+.\" Instead the call succeeds.
+.\" .IP *
+.\" If a file is marked as append-only (see
+.\" .BR chattr (1)),
+.\" then Linux traditionally
+.\" (i.e.,
+.\" .BR utime (2),
+.\" .BR utimes (2)),
+.\" permits a NULL
+.\" .I times
+.\" argument to be used in order to update both timestamps to the current time.
+.\" For consistency,
+.\" .BR utimensat ()
+.\" and
+.\" .BR futimens ()
+.\" should also produce the same result when given a
+.\" .I times
+.\" argument that points to an array of structures in which both
+.\" .I tv_nsec
+.\" fields are
+.\" .BR UTIME_NOW .
+.\" Instead, the call fails with the error
+.\" .BR EPERM .
+.\" .IP *
+.\" If a file is marked as immutable (see
+.\" .BR chattr (1)),
+.\" then Linux traditionally
+.\" (i.e.,
+.\" .BR utime (2),
+.\" .BR utimes (2)),
+.\" gives an
+.\" .B EACCES
+.\" error if
+.\" .I times
+.\" is NULL.
+.\" For consistency,
+.\" .BR utimensat ()
+.\" and
+.\" .BR futimens ()
+.\" should also produce the same result when given a
+.\" .I times
+.\" that points to an array of structures in which both
+.\" .I tv_nsec
+.\" fields are
+.\" .BR UTIME_NOW .
+.\" Instead, the call fails with the error
+.\" .BR EPERM .
+.IP \[bu]
+POSIX.1 says that a process that has \fIwrite access to the file\fP
+can make a call with
+.I times
+as NULL, or with
+.I times
+pointing to an array of structures in which both
+.I tv_nsec
+fields are
+.BR UTIME_NOW ,
+in order to update both timestamps to the current time.
+However,
+.BR futimens ()
+instead checks whether the
+.IR "access mode of the file descriptor allows writing" .
+.\" This means that a process with a file descriptor that allows
+.\" writing could change the timestamps of a file for which it
+.\" does not have write permission;
+.\" conversely, a process with a read-only file descriptor won't
+.\" be able to update the timestamps of a file,
+.\" even if it has write permission on the file.
+.SH SEE ALSO
+.BR chattr (1),
+.BR touch (1),
+.BR futimesat (2),
+.BR openat (2),
+.BR stat (2),
+.BR utimes (2),
+.BR futimes (3),
+.BR timespec (3),
+.BR inode (7),
+.BR path_resolution (7),
+.BR symlink (7)
diff --git a/man2/utimes.2 b/man2/utimes.2
new file mode 100644
index 0000000..04372d4
--- /dev/null
+++ b/man2/utimes.2
@@ -0,0 +1 @@
+.so man2/utime.2
diff --git a/man2/vfork.2 b/man2/vfork.2
new file mode 100644
index 0000000..85c04d3
--- /dev/null
+++ b/man2/vfork.2
@@ -0,0 +1,316 @@
+.\" Copyright (c) 1999 Andries Brouwer (aeb@cwi.nl), 1 Nov 1999
+.\" and Copyright 2006, 2012, 2017 Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" 1999-11-10: Merged text taken from the page contributed by
+.\" Reed H. Petty (rhp@draper.net)
+.\"
+.TH vfork 2 2023-07-28 "Linux man-pages 6.05.01"
+.SH NAME
+vfork \- create a child process and block parent
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <unistd.h>
+.PP
+.B pid_t vfork(void);
+.fi
+.PP
+.RS -4
+Feature Test Macro Requirements for glibc (see
+.BR feature_test_macros (7)):
+.RE
+.PP
+.BR vfork ():
+.nf
+ Since glibc 2.12:
+ (_XOPEN_SOURCE >= 500) && ! (_POSIX_C_SOURCE >= 200809L)
+ || /* Since glibc 2.19: */ _DEFAULT_SOURCE
+ || /* glibc <= 2.19: */ _BSD_SOURCE
+ Before glibc 2.12:
+ _BSD_SOURCE || _XOPEN_SOURCE >= 500
+.\" || _XOPEN_SOURCE && _XOPEN_SOURCE_EXTENDED
+.fi
+.SH DESCRIPTION
+.SS Standard description
+(From POSIX.1)
+The
+.BR vfork ()
+function has the same effect as
+.BR fork (2),
+except that the behavior is undefined if the process created by
+.BR vfork ()
+either modifies any data other than a variable of type
+.I pid_t
+used to store the return value from
+.BR vfork (),
+or returns from the function in which
+.BR vfork ()
+was called, or calls any other function before successfully calling
+.BR _exit (2)
+or one of the
+.BR exec (3)
+family of functions.
+.SS Linux description
+.BR vfork (),
+just like
+.BR fork (2),
+creates a child process of the calling process.
+For details, return value, and errors, see
+.BR fork (2).
+.PP
+.BR vfork ()
+is a special case of
+.BR clone (2).
+It is used to create new processes without copying the page tables of
+the parent process.
+It may be useful in performance-sensitive applications
+where a child is created which then immediately issues an
+.BR execve (2).
+.PP
+.BR vfork ()
+differs from
+.BR fork (2)
+in that the calling thread is suspended until the child terminates
+(either normally,
+by calling
+.BR _exit (2),
+or abnormally, after delivery of a fatal signal),
+or it makes a call to
+.BR execve (2).
+Until that point, the child shares all memory with its parent,
+including the stack.
+The child must not return from the current function or call
+.BR exit (3)
+(which would have the effect of calling exit handlers
+established by the parent process and flushing the parent's
+.BR stdio (3)
+buffers), but may call
+.BR _exit (2).
+.PP
+As with
+.BR fork (2),
+the child process created by
+.BR vfork ()
+inherits copies of various of the caller's process attributes
+(e.g., file descriptors, signal dispositions, and current working directory);
+the
+.BR vfork ()
+call differs only in the treatment of the virtual address space,
+as described above.
+.PP
+Signals sent to the parent
+arrive after the child releases the parent's memory
+(i.e., after the child terminates
+or calls
+.BR execve (2)).
+.SS Historic description
+Under Linux,
+.BR fork (2)
+is implemented using copy-on-write pages, so the only penalty incurred by
+.BR fork (2)
+is the time and memory required to duplicate the parent's page tables,
+and to create a unique task structure for the child.
+However, in the bad old days a
+.BR fork (2)
+would require making a complete copy of the caller's data space,
+often needlessly, since usually immediately afterward an
+.BR exec (3)
+is done.
+Thus, for greater efficiency, BSD introduced the
+.BR vfork ()
+system call, which did not fully copy the address space of
+the parent process, but borrowed the parent's memory and thread
+of control until a call to
+.BR execve (2)
+or an exit occurred.
+The parent process was suspended while the
+child was using its resources.
+The use of
+.BR vfork ()
+was tricky: for example, not modifying data
+in the parent process depended on knowing which variables were
+held in a register.
+.SH VERSIONS
+The requirements put on
+.BR vfork ()
+by the standards are weaker than those put on
+.BR fork (2),
+so an implementation where the two are synonymous is compliant.
+In particular, the programmer cannot rely on the parent
+remaining blocked until the child either terminates or calls
+.BR execve (2),
+and cannot rely on any specific behavior with respect to shared memory.
+.\" In AIXv3.1 vfork is equivalent to fork.
+.PP
+Some consider the semantics of
+.BR vfork ()
+to be an architectural blemish, and the 4.2BSD man page stated:
+\[lq]This system call will be eliminated
+when proper system sharing mechanisms are implemented.
+Users should not depend on the memory sharing semantics of
+.I vfork
+as it will, in that case, be made synonymous to
+.IR fork .\[rq]
+However, even though modern memory management hardware
+has decreased the performance difference between
+.BR fork (2)
+and
+.BR vfork (),
+there are various reasons why Linux and other systems have retained
+.BR vfork ():
+.IP \[bu] 3
+Some performance-critical applications require the small performance
+advantage conferred by
+.BR vfork ().
+.IP \[bu]
+.BR vfork ()
+can be implemented on systems that lack a memory-management unit (MMU), but
+.BR fork (2)
+can't be implemented on such systems.
+(POSIX.1-2008 removed
+.BR vfork ()
+from the standard; the POSIX rationale for the
+.BR posix_spawn (3)
+function notes that that function,
+which provides functionality equivalent to
+.BR fork (2)+\c
+.BR exec (3),
+is designed to be implementable on systems that lack an MMU.)
+.\" http://stackoverflow.com/questions/4259629/what-is-the-difference-between-fork-and-vfork
+.\" http://developers.sun.com/solaris/articles/subprocess/subprocess.html
+.\" http://mailman.uclinux.org/pipermail/uclinux-dev/2009-April/000684.html
+.\"
+.IP \[bu]
+On systems where memory is constrained,
+.BR vfork ()
+avoids the need to temporarily commit memory (see the description of
+.I /proc/sys/vm/overcommit_memory
+in
+.BR proc (5))
+in order to execute a new program.
+(This can be especially beneficial where a large parent process wishes
+to execute a small helper program in a child process.)
+By contrast, using
+.BR fork (2)
+in this scenario requires either committing an amount of memory equal
+to the size of the parent process (if strict overcommitting is in force)
+or overcommitting memory with the risk that a process is terminated
+by the out-of-memory (OOM) killer.
+.SS Linux notes
+Fork handlers established using
+.BR pthread_atfork (3)
+are not called when a multithreaded program employing
+the NPTL threading library calls
+.BR vfork ().
+Fork handlers are called in this case in a program using the
+LinuxThreads threading library.
+(See
+.BR pthreads (7)
+for a description of Linux threading libraries.)
+.PP
+A call to
+.BR vfork ()
+is equivalent to calling
+.BR clone (2)
+with
+.I flags
+specified as:
+.PP
+.in +4n
+.EX
+ CLONE_VM | CLONE_VFORK | SIGCHLD
+.EE
+.in
+.SH STANDARDS
+None.
+.SH HISTORY
+4.3BSD; POSIX.1-2001 (but marked OBSOLETE).
+POSIX.1-2008 removes the specification of
+.BR vfork ().
+.PP
+The
+.BR vfork ()
+system call appeared in 3.0BSD.
+.\" In the release notes for 4.2BSD Sam Leffler wrote: `vfork: Is still
+.\" present, but definitely on its way out'.
+In 4.4BSD it was made synonymous to
+.BR fork (2)
+but NetBSD introduced it again;
+see
+.UR http://www.netbsd.org\:/Documentation\:/kernel\:/vfork.html
+.UE .
+In Linux, it was equivalent to
+.BR fork (2)
+until Linux 2.2.0-pre6 or so.
+Since Linux 2.2.0-pre9 (on i386, somewhat later on
+other architectures) it is an independent system call.
+Support was added in glibc 2.0.112.
+.\"
+.SH CAVEATS
+The child process should take care not to modify the memory in unintended ways,
+since such changes will be seen by the parent process once
+the child terminates or executes another program.
+In this regard, signal handlers can be especially problematic:
+if a signal handler that is invoked in the child of
+.BR vfork ()
+changes memory, those changes may result in an inconsistent process state
+from the perspective of the parent process
+(e.g., memory changes would be visible in the parent,
+but changes to the state of open file descriptors would not be visible).
+.PP
+When
+.BR vfork ()
+is called in a multithreaded process,
+only the calling thread is suspended until the child terminates
+or executes a new program.
+This means that the child is sharing an address space with other running code.
+This can be dangerous if another thread in the parent process
+changes credentials (using
+.BR setuid (2)
+or similar),
+since there are now two processes with different privilege levels
+running in the same address space.
+As an example of the dangers,
+suppose that a multithreaded program running as root creates a child using
+.BR vfork ().
+After the
+.BR vfork (),
+a thread in the parent process drops the process to an unprivileged user
+in order to run some untrusted code
+(e.g., via a plug-in opened with
+.BR dlopen (3)).
+In this case, attacks are possible where the parent process uses
+.BR mmap (2)
+to map in code that will be executed by the privileged child process.
+.\"
+.SH BUGS
+Details of the signal handling are obscure and differ between systems.
+The BSD man page states:
+"To avoid a possible deadlock situation, processes that are children
+in the middle of a
+.BR vfork ()
+are never sent
+.B SIGTTOU
+or
+.B SIGTTIN
+signals; rather, output or
+.IR ioctl s
+are allowed and input attempts result in an end-of-file indication."
+.\"
+.\" As far as I can tell, the following is not true in Linux 2.6.19:
+.\" Currently (Linux 2.3.25),
+.\" .BR strace (1)
+.\" cannot follow
+.\" .BR vfork ()
+.\" and requires a kernel patch.
+.SH SEE ALSO
+.BR clone (2),
+.BR execve (2),
+.BR _exit (2),
+.BR fork (2),
+.BR unshare (2),
+.BR wait (2)
diff --git a/man2/vhangup.2 b/man2/vhangup.2
new file mode 100644
index 0000000..af9853b
--- /dev/null
+++ b/man2/vhangup.2
@@ -0,0 +1,58 @@
+.\" Copyright 1993 Rickard E. Faith (faith@cs.unc.edu)
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified, 27 May 2004, Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Added notes on capability requirements
+.\"
+.TH vhangup 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+vhangup \- virtually hang up the current terminal
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <unistd.h>
+.PP
+.B int vhangup(void);
+.fi
+.PP
+.RS -4
+Feature Test Macro Requirements for glibc (see
+.BR feature_test_macros (7)):
+.RE
+.PP
+.BR vhangup ():
+.nf
+ Since glibc 2.21:
+.\" commit 266865c0e7b79d4196e2cc393693463f03c90bd8
+ _DEFAULT_SOURCE
+ In glibc 2.19 and 2.20:
+ _DEFAULT_SOURCE || (_XOPEN_SOURCE && _XOPEN_SOURCE < 500)
+ Up to and including glibc 2.19:
+ _BSD_SOURCE || (_XOPEN_SOURCE && _XOPEN_SOURCE < 500)
+.fi
+.SH DESCRIPTION
+.BR vhangup ()
+simulates a hangup on the current terminal.
+This call arranges for other
+users to have a \*(lqclean\*(rq terminal at login time.
+.SH RETURN VALUE
+On success, zero is returned.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EPERM
+The calling process has insufficient privilege to call
+.BR vhangup ();
+the
+.B CAP_SYS_TTY_CONFIG
+capability is required.
+.SH STANDARDS
+Linux.
+.SH SEE ALSO
+.BR init (1),
+.BR capabilities (7)
diff --git a/man2/vm86.2 b/man2/vm86.2
new file mode 100644
index 0000000..97595e8
--- /dev/null
+++ b/man2/vm86.2
@@ -0,0 +1,58 @@
+.\" Copyright 1993 Rickard E. Faith (faith@cs.unc.edu)
+.\" Copyright 1997 Andries E. Brouwer (aeb@cwi.nl)
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.TH vm86 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+vm86old, vm86 \- enter virtual 8086 mode
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/vm86.h>
+.PP
+.BI "int vm86old(struct vm86_struct *" info );
+.BI "int vm86(unsigned long " fn ", struct vm86plus_struct *" v86 );
+.fi
+.SH DESCRIPTION
+The system call
+.BR vm86 ()
+was introduced in Linux 0.97p2.
+In Linux 2.1.15 and 2.0.28, it was renamed to
+.BR vm86old (),
+and a new
+.BR vm86 ()
+was introduced.
+The definition of
+.I struct vm86_struct
+was changed
+in Linux 1.1.8 and 1.1.9.
+.PP
+These calls cause the process to enter VM86 mode (virtual-8086 in Intel
+literature), and are used by
+.BR dosemu .
+.PP
+VM86 mode is an emulation of real mode within a protected mode task.
+.SH RETURN VALUE
+On success, zero is returned.
+On error, \-1 is returned, and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EFAULT
+This return value is specific to i386 and indicates a problem with getting
+user-space data.
+.TP
+.B ENOSYS
+This return value indicates the call is not implemented on the present
+architecture.
+.TP
+.B EPERM
+Saved kernel stack exists.
+(This is a kernel sanity check; the saved
+stack should exist only within vm86 mode itself.)
+.SH STANDARDS
+Linux on 32-bit Intel processors.
diff --git a/man2/vm86old.2 b/man2/vm86old.2
new file mode 100644
index 0000000..bf2581d
--- /dev/null
+++ b/man2/vm86old.2
@@ -0,0 +1 @@
+.so man2/vm86.2
diff --git a/man2/vmsplice.2 b/man2/vmsplice.2
new file mode 100644
index 0000000..4520b8a
--- /dev/null
+++ b/man2/vmsplice.2
@@ -0,0 +1,162 @@
+.\" This manpage is Copyright (C) 2006 Jens Axboe
+.\" and Copyright (C) 2006 Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.TH vmsplice 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+vmsplice \- splice user pages to/from a pipe
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.BR "#define _GNU_SOURCE" " /* See feature_test_macros(7) */"
+.B #include <fcntl.h>
+.PP
+.BI "ssize_t vmsplice(int " fd ", const struct iovec *" iov ,
+.BI " size_t " nr_segs ", unsigned int " flags );
+.fi
+.\" Return type was long before glibc 2.7
+.SH DESCRIPTION
+.\" Linus: vmsplice() system call to basically do a "write to
+.\" the buffer", but using the reference counting and VM traversal
+.\" to actually fill the buffer. This means that the user needs to
+.\" be careful not to reuse the user-space buffer it spliced into
+.\" the kernel-space one (contrast this to "write()", which copies
+.\" the actual data, and you can thus reuse the buffer immediately
+.\" after a successful write), but that is often easy to do.
+If
+.I fd
+is opened for writing, the
+.BR vmsplice ()
+system call maps
+.I nr_segs
+ranges of user memory described by
+.I iov
+into a pipe.
+If
+.I fd
+is opened for reading,
+.\" Since Linux 2.6.23
+.\" commit 6a14b90bb6bc7cd83e2a444bf457a2ea645cbfe7
+the
+.BR vmsplice ()
+system call fills
+.I nr_segs
+ranges of user memory described by
+.I iov
+from a pipe.
+The file descriptor
+.I fd
+must refer to a pipe.
+.PP
+The pointer
+.I iov
+points to an array of
+.I iovec
+structures as described in
+.BR iovec (3type).
+.PP
+The
+.I flags
+argument is a bit mask that is composed by ORing together
+zero or more of the following values:
+.TP
+.B SPLICE_F_MOVE
+Unused for
+.BR vmsplice ();
+see
+.BR splice (2).
+.TP
+.B SPLICE_F_NONBLOCK
+.\" Not used for vmsplice
+.\" May be in the future -- therefore EAGAIN
+Do not block on I/O; see
+.BR splice (2)
+for further details.
+.TP
+.B SPLICE_F_MORE
+Currently has no effect for
+.BR vmsplice (),
+but may be implemented in the future; see
+.BR splice (2).
+.TP
+.B SPLICE_F_GIFT
+The user pages are a gift to the kernel.
+The application must never modify this memory again,
+.\" FIXME . Explain the following line in a little more detail:
+otherwise the page cache and on-disk data may differ.
+Gifting pages to the kernel means that a subsequent
+.BR splice (2)
+.B SPLICE_F_MOVE
+can successfully move the pages;
+if this flag is not specified, then a subsequent
+.BR splice (2)
+.B SPLICE_F_MOVE
+must copy the pages.
+Data must also be properly page aligned, both in memory and length.
+.\" FIXME
+.\" It looks like the page-alignment requirement went away with
+.\" commit bd1a68b59c8e3bce45fb76632c64e1e063c3962d
+.\"
+.\" .... if we expect to later SPLICE_F_MOVE to the cache.
+.SH RETURN VALUE
+Upon successful completion,
+.BR vmsplice ()
+returns the number of bytes transferred to or from the pipe.
+On error,
+.BR vmsplice ()
+returns \-1 and
+.I errno
+is set to indicate the error.
+.SH ERRORS
+.TP
+.B EAGAIN
+.B SPLICE_F_NONBLOCK
+was specified in
+.IR flags ,
+and the operation would block.
+.TP
+.B EBADF
+.I fd
+is either not valid, or doesn't refer to a pipe.
+.TP
+.B EINVAL
+.I nr_segs
+is greater than
+.BR IOV_MAX ;
+or memory is not aligned if
+.B SPLICE_F_GIFT
+is set.
+.TP
+.B ENOMEM
+Out of memory.
+.SH STANDARDS
+Linux.
+.SH HISTORY
+Linux 2.6.17,
+glibc 2.5.
+.SH NOTES
+.BR vmsplice ()
+follows the other vectorized read/write type functions when it comes to
+limitations on the number of segments being passed in.
+This limit is
+.B IOV_MAX
+as defined in
+.IR <limits.h> .
+Currently,
+.\" UIO_MAXIOV in kernel source
+this limit is 1024.
+.PP
+.\" commit 6a14b90bb6bc7cd83e2a444bf457a2ea645cbfe7
+.BR vmsplice ()
+really supports true splicing only from user memory to a pipe.
+In the opposite direction, it actually just copies the data to user space.
+But this makes the interface nice and symmetric and enables people to build on
+.BR vmsplice ()
+with room for future improvement in performance.
+.SH SEE ALSO
+.BR splice (2),
+.BR tee (2),
+.BR pipe (7)
diff --git a/man2/vserver.2 b/man2/vserver.2
new file mode 100644
index 0000000..5d25ea6
--- /dev/null
+++ b/man2/vserver.2
@@ -0,0 +1 @@
+.so man2/unimplemented.2
diff --git a/man2/wait.2 b/man2/wait.2
new file mode 100644
index 0000000..cbd851e
--- /dev/null
+++ b/man2/wait.2
@@ -0,0 +1,720 @@
+.\" Copyright (c) 1993 by Thomas Koenig <ig25@rz.uni-karlsruhe.de>
+.\" and Copyright (c) 2004 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified Sat Jul 24 13:30:06 1993 by Rik Faith <faith@cs.unc.edu>
+.\" Modified Sun Aug 21 17:42:42 1994 by Rik Faith <faith@cs.unc.edu>
+.\" (Thanks to Koen Holtman <koen@win.tue.nl>)
+.\" Modified Wed May 17 15:54:12 1995 by Rik Faith <faith@cs.unc.edu>
+.\" To remove *'s from status in macros (Thanks to Michael Shields).
+.\" Modified as suggested by Nick Duffek <nsd@bbc.com>, aeb, 960426
+.\" Modified Mon Jun 23 14:09:52 1997 by aeb - add EINTR.
+.\" Modified Thu Nov 26 02:12:45 1998 by aeb - add SIGCHLD stuff.
+.\" Modified Mon Jul 24 21:37:38 2000 by David A. Wheeler
+.\" <dwheeler@dwheeler.com> - noted thread issues.
+.\" Modified 26 Jun 01 by Michael Kerrisk
+.\" Added __WCLONE, __WALL, and __WNOTHREAD descriptions
+.\" Modified 2001-09-25, aeb
+.\" Modified 26 Jun 01 by Michael Kerrisk, <mtk.manpages@gmail.com>
+.\" Updated notes on setting disposition of SIGCHLD to SIG_IGN
+.\" 2004-11-11, mtk
+.\" Added waitid(2); added WCONTINUED and WIFCONTINUED()
+.\" Added text on SA_NOCLDSTOP
+.\" Updated discussion of SA_NOCLDWAIT to reflect 2.6 behavior
+.\" Much other text rewritten
+.\" 2005-05-10, mtk, __W* flags can't be used with waitid()
+.\" 2008-07-04, mtk, removed erroneous text about SA_NOCLDSTOP
+.\"
+.TH wait 2 2023-05-03 "Linux man-pages 6.05.01"
+.SH NAME
+wait, waitpid, waitid \- wait for process to change state
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/wait.h>
+.PP
+.BI "pid_t wait(int *_Nullable " "wstatus" );
+.BI "pid_t waitpid(pid_t " pid ", int *_Nullable " wstatus ", int " options );
+.PP
+.BI "int waitid(idtype_t " idtype ", id_t " id \
+", siginfo_t *" infop ", int " options );
+ /* This is the glibc and POSIX interface; see
+ NOTES for information on the raw system call. */
+.fi
+.PP
+.RS -4
+Feature Test Macro Requirements for glibc (see
+.BR feature_test_macros (7)):
+.RE
+.PP
+.BR waitid ():
+.nf
+ Since glibc 2.26:
+ _XOPEN_SOURCE >= 500 || _POSIX_C_SOURCE >= 200809L
+.\" (_XOPEN_SOURCE && _XOPEN_SOURCE_EXTENDED)
+ glibc 2.25 and earlier:
+ _XOPEN_SOURCE
+ || /* Since glibc 2.12: */ _POSIX_C_SOURCE >= 200809L
+ || /* glibc <= 2.19: */ _BSD_SOURCE
+.fi
+.SH DESCRIPTION
+All of these system calls are used to wait for state changes
+in a child of the calling process, and obtain information
+about the child whose state has changed.
+A state change is considered to be: the child terminated;
+the child was stopped by a signal; or the child was resumed by a signal.
+In the case of a terminated child, performing a wait allows
+the system to release the resources associated with the child;
+if a wait is not performed, then the terminated child remains in
+a "zombie" state (see NOTES below).
+.PP
+If a child has already changed state, then these calls return immediately.
+Otherwise, they block until either a child changes state or
+a signal handler interrupts the call (assuming that system calls
+are not automatically restarted using the
+.B SA_RESTART
+flag of
+.BR sigaction (2)).
+In the remainder of this page, a child whose state has changed
+and which has not yet been waited upon by one of these system
+calls is termed
+.IR waitable .
+.SS wait() and waitpid()
+The
+.BR wait ()
+system call suspends execution of the calling thread until one of its
+children terminates.
+The call
+.I wait(&wstatus)
+is equivalent to:
+.PP
+.in +4n
+.EX
+waitpid(\-1, &wstatus, 0);
+.EE
+.in
+.PP
+The
+.BR waitpid ()
+system call suspends execution of the calling thread until a
+child specified by the
+.I pid
+argument has changed state.
+By default,
+.BR waitpid ()
+waits only for terminated children, but this behavior is modifiable
+via the
+.I options
+argument, as described below.
+.PP
+The value of
+.I pid
+can be:
+.TP
+.RB "< " \-1
+meaning wait for any child process whose process group ID is
+equal to the absolute value of
+.IR pid .
+.TP
+.B \-1
+meaning wait for any child process.
+.TP
+.B 0
+meaning wait for any child process whose process group ID is
+equal to that of the calling process at the time of the call to
+.BR waitpid ().
+.TP
+.RB "> " 0
+meaning wait for the child whose process ID is equal to the
+value of
+.IR pid .
+.PP
+The value of
+.I options
+is an OR of zero or more of the following constants:
+.TP
+.B WNOHANG
+return immediately if no child has exited.
+.TP
+.B WUNTRACED
+also return if a child has stopped
+(but not traced via
+.BR ptrace (2)).
+Status for
+.I traced
+children which have stopped is provided
+even if this option is not specified.
+.TP
+.BR WCONTINUED " (since Linux 2.6.10)"
+also return if a stopped child has been resumed by delivery of
+.BR SIGCONT .
+.PP
+(For Linux-only options, see below.)
+.PP
+If
+.I wstatus
+is not NULL,
+.BR wait ()
+and
+.BR waitpid ()
+store status information in the \fIint\fP to which it points.
+This integer can be inspected with the following macros (which
+take the integer itself as an argument, not a pointer to it,
+as is done in
+.BR wait ()
+and
+.BR waitpid ()!):
+.TP
+.BI WIFEXITED( wstatus )
+returns true if the child terminated normally, that is,
+by calling
+.BR exit (3)
+or
+.BR _exit (2),
+or by returning from main().
+.TP
+.BI WEXITSTATUS( wstatus )
+returns the exit status of the child.
+This consists of the least significant 8 bits of the
+.I status
+argument that the child specified in a call to
+.BR exit (3)
+or
+.BR _exit (2)
+or as the argument for a return statement in main().
+This macro should be employed only if
+.B WIFEXITED
+returned true.
+.TP
+.BI WIFSIGNALED( wstatus )
+returns true if the child process was terminated by a signal.
+.TP
+.BI WTERMSIG( wstatus )
+returns the number of the signal that caused the child process to
+terminate.
+This macro should be employed only if
+.B WIFSIGNALED
+returned true.
+.TP
+.BI WCOREDUMP( wstatus )
+returns true if the child produced a core dump (see
+.BR core (5)).
+This macro should be employed only if
+.B WIFSIGNALED
+returned true.
+.IP
+This macro is not specified in POSIX.1-2001 and is not available on
+some UNIX implementations (e.g., AIX, SunOS).
+Therefore, enclose its use inside
+.IR "#ifdef WCOREDUMP ... #endif" .
+.TP
+.BI WIFSTOPPED( wstatus )
+returns true if the child process was stopped by delivery of a signal;
+this is possible only if the call was done using
+.B WUNTRACED
+or when the child is being traced (see
+.BR ptrace (2)).
+.TP
+.BI WSTOPSIG( wstatus )
+returns the number of the signal which caused the child to stop.
+This macro should be employed only if
+.B WIFSTOPPED
+returned true.
+.TP
+.BI WIFCONTINUED( wstatus )
+(since Linux 2.6.10)
+returns true if the child process was resumed by delivery of
+.BR SIGCONT .
+.SS waitid()
+The
+.BR waitid ()
+system call (available since Linux 2.6.9) provides more precise
+control over which child state changes to wait for.
+.PP
+The
+.I idtype
+and
+.I id
+arguments select the child(ren) to wait for, as follows:
+.TP
+.IR idtype " == " \fBP_PID\fP
+Wait for the child whose process ID matches
+.IR id .
+.TP
+.IR idtype " == " \fBP_PIDFD\fP " (since Linux 5.4)"
+.\" commit 3695eae5fee0605f316fbaad0b9e3de791d7dfaf
+Wait for the child referred to by the PID file descriptor specified in
+.IR id .
+(See
+.BR pidfd_open (2)
+for further information on PID file descriptors.)
+.TP
+.IR idtype " == " \fBP_PGID\fP
+Wait for any child whose process group ID matches
+.IR id .
+Since Linux 5.4,
+.\" commit 821cc7b0b205c0df64cce59aacc330af251fa8f7
+if
+.I id
+is zero, then wait for any child whose process group ID is
+equal to that of the caller at the time of the call.
+.TP
+.IR idtype " == " \fBP_ALL\fP
+Wait for any child;
+.I id
+is ignored.
+.PP
+The child state changes to wait for are specified by ORing
+one or more of the following flags in
+.IR options :
+.TP
+.B WEXITED
+Wait for children that have terminated.
+.TP
+.B WSTOPPED
+Wait for children that have been stopped by delivery of a signal.
+.TP
+.B WCONTINUED
+Wait for (previously stopped) children that have been
+resumed by delivery of
+.BR SIGCONT .
+.PP
+The following flags may additionally be ORed in
+.IR options :
+.TP
+.B WNOHANG
+As for
+.BR waitpid ().
+.TP
+.B WNOWAIT
+Leave the child in a waitable state; a later wait call
+can be used to again retrieve the child status information.
+.PP
+Upon successful return,
+.BR waitid ()
+fills in the following fields of the
+.I siginfo_t
+structure pointed to by
+.IR infop :
+.TP
+\fIsi_pid\fP
+The process ID of the child.
+.TP
+\fIsi_uid\fP
+The real user ID of the child.
+(This field is not set on most other implementations.)
+.TP
+\fIsi_signo\fP
+Always set to
+.BR SIGCHLD .
+.TP
+\fIsi_status\fP
+Either the exit status of the child, as given to
+.BR _exit (2)
+(or
+.BR exit (3)),
+or the signal that caused the child to terminate, stop, or continue.
+The
+.I si_code
+field can be used to determine how to interpret this field.
+.TP
+\fIsi_code\fP
+Set to one of:
+.B CLD_EXITED
+(child called
+.BR _exit (2));
+.B CLD_KILLED
+(child killed by signal);
+.B CLD_DUMPED
+(child killed by signal, and dumped core);
+.B CLD_STOPPED
+(child stopped by signal);
+.B CLD_TRAPPED
+(traced child has trapped); or
+.B CLD_CONTINUED
+(child continued by
+.BR SIGCONT ).
+.PP
+If
+.B WNOHANG
+was specified in
+.I options
+and there were no children in a waitable state, then
+.BR waitid ()
+returns 0 immediately and
+the state of the
+.I siginfo_t
+structure pointed to by
+.I infop
+depends on the implementation.
+To (portably) distinguish this case from that where a child was in a
+waitable state, zero out the
+.I si_pid
+field before the call and check for a nonzero value in this field
+after the call returns.
+.PP
+POSIX.1-2008 Technical Corrigendum 1 (2013) adds the requirement that when
+.B WNOHANG
+is specified in
+.I options
+and there were no children in a waitable state, then
+.BR waitid ()
+should zero out the
+.I si_pid
+and
+.I si_signo
+fields of the structure.
+On Linux and other implementations that adhere to this requirement,
+it is not necessary to zero out the
+.I si_pid
+field before calling
+.BR waitid ().
+However,
+not all implementations follow the POSIX.1 specification on this point.
+.\" POSIX.1-2001 leaves this possibility unspecified; most
+.\" implementations (including Linux) zero out the structure
+.\" in this case, but at least one implementation (AIX 5.1)
+.\" does not -- MTK Nov 04
+.SH RETURN VALUE
+.BR wait ():
+on success, returns the process ID of the terminated child;
+on failure, \-1 is returned.
+.PP
+.BR waitpid ():
+on success, returns the process ID of the child whose state has changed;
+if
+.B WNOHANG
+was specified and one or more child(ren) specified by
+.I pid
+exist, but have not yet changed state, then 0 is returned.
+On failure, \-1 is returned.
+.PP
+.BR waitid ():
+returns 0 on success or
+if
+.B WNOHANG
+was specified and no child(ren) specified by
+.I id
+has yet changed state;
+on failure, \-1 is returned.
+.\" FIXME As reported by Vegard Nossum, if infop is NULL, then waitid()
+.\" returns the PID of the child. Either this is a bug, or it is intended
+.\" behavior that needs to be documented. See my Jan 2009 LKML mail
+.\" "waitid() return value strangeness when infop is NULL".
+.PP
+On failure, each of these calls sets
+.I errno
+to indicate the error.
+.SH ERRORS
+.TP
+.B EAGAIN
+The PID file descriptor specified in
+.I id
+is nonblocking and the process that it refers to has not terminated.
+.TP
+.B ECHILD
+(for
+.BR wait ())
+The calling process does not have any unwaited-for children.
+.TP
+.B ECHILD
+(for
+.BR waitpid ()
+or
+.BR waitid ())
+The process specified by
+.I pid
+.RB ( waitpid ())
+or
+.I idtype
+and
+.I id
+.RB ( waitid ())
+does not exist or is not a child of the calling process.
+(This can happen for one's own child if the action for
+.B SIGCHLD
+is set to
+.BR SIG_IGN .
+See also the \fILinux Notes\fP section about threads.)
+.TP
+.B EINTR
+.B WNOHANG
+was not set and an unblocked signal or a
+.B SIGCHLD
+was caught; see
+.BR signal (7).
+.TP
+.B EINVAL
+The
+.I options
+argument was invalid.
+.TP
+.B ESRCH
+(for
+.BR wait ()
+or
+.BR waitpid ())
+.I pid
+is equal to
+.BR INT_MIN .
+.SH VERSIONS
+.SS C library/kernel differences
+.BR wait ()
+is actually a library function that (in glibc) is implemented as a call to
+.BR wait4 (2).
+.PP
+On some architectures, there is no
+.BR waitpid ()
+system call;
+.\" e.g., i386 has the system call, but not x86-64
+instead, this interface is implemented via a C library
+wrapper function that calls
+.BR wait4 (2).
+.PP
+The raw
+.BR waitid ()
+system call takes a fifth argument, of type
+.IR "struct rusage\ *" .
+If this argument is non-NULL,
+then it is used to return resource usage information about the child,
+in the same manner as
+.BR wait4 (2).
+See
+.BR getrusage (2)
+for details.
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+SVr4, 4.3BSD, POSIX.1-2001.
+.SH NOTES
+A child that terminates, but has not been waited for, becomes a "zombie".
+The kernel maintains a minimal set of information about the zombie
+process (PID, termination status, resource usage information)
+in order to allow the parent to later perform a wait to obtain
+information about the child.
+As long as a zombie is not removed from the system via a wait,
+it will consume a slot in the kernel process table, and if
+this table fills, it will not be possible to create further processes.
+If a parent process terminates, then its "zombie" children (if any)
+are adopted by
+.BR init (1)
+(or by the nearest "subreaper" process as defined through the use of the
+.BR prctl (2)
+.B PR_SET_CHILD_SUBREAPER
+operation);
+.BR init (1)
+automatically performs a wait to remove the zombies.
+.PP
+POSIX.1-2001 specifies that if the disposition of
+.B SIGCHLD
+is set to
+.B SIG_IGN
+or the
+.B SA_NOCLDWAIT
+flag is set for
+.B SIGCHLD
+(see
+.BR sigaction (2)),
+then children that terminate do not become zombies and a call to
+.BR wait ()
+or
+.BR waitpid ()
+will block until all children have terminated, and then fail with
+.I errno
+set to
+.BR ECHILD .
+(The original POSIX standard left the behavior of setting
+.B SIGCHLD
+to
+.B SIG_IGN
+unspecified.
+Note that even though the default disposition of
+.B SIGCHLD
+is "ignore", explicitly setting the disposition to
+.B SIG_IGN
+results in different treatment of zombie process children.)
+.PP
+Linux 2.6 conforms to the POSIX requirements.
+However, Linux 2.4 (and earlier) does not:
+if a
+.BR wait ()
+or
+.BR waitpid ()
+call is made while
+.B SIGCHLD
+is being ignored, the call behaves just as though
+.B SIGCHLD
+were not being ignored, that is, the call blocks until the next child
+terminates and then returns the process ID and status of that child.
+.SS Linux notes
+In the Linux kernel, a kernel-scheduled thread is not a distinct
+construct from a process.
+Instead, a thread is simply a process
+that is created using the Linux-unique
+.BR clone (2)
+system call; other routines such as the portable
+.BR pthread_create (3)
+call are implemented using
+.BR clone (2).
+Before Linux 2.4, a thread was just a special case of a process,
+and as a consequence one thread could not wait on the children
+of another thread, even when the latter belongs to the same thread group.
+However, POSIX prescribes such functionality, and since Linux 2.4
+a thread can, and by default will, wait on children of other threads
+in the same thread group.
+.PP
+The following Linux-specific
+.I options
+are for use with children created using
+.BR clone (2);
+they can also, since Linux 4.7,
+.\" commit 91c4e8ea8f05916df0c8a6f383508ac7c9e10dba
+be used with
+.BR waitid ():
+.TP
+.B __WCLONE
+.\" since 0.99pl10
+Wait for "clone" children only.
+If omitted, then wait for "non-clone" children only.
+(A "clone" child is one which delivers no signal, or a signal other than
+.B SIGCHLD
+to its parent upon termination.)
+This option is ignored if
+.B __WALL
+is also specified.
+.TP
+.BR __WALL " (since Linux 2.4)"
+.\" since patch-2.3.48
+Wait for all children, regardless of
+type ("clone" or "non-clone").
+.TP
+.BR __WNOTHREAD " (since Linux 2.4)"
+.\" since patch-2.4.0-test8
+Do not wait for children of other threads in
+the same thread group.
+This was the default before Linux 2.4.
+.PP
+Since Linux 4.7,
+.\" commit bf959931ddb88c4e4366e96dd22e68fa0db9527c
+.\" prevents cases where an unreapable zombie is created if
+.\" /sbin/init doesn't use __WALL.
+the
+.B __WALL
+flag is automatically implied if the child is being ptraced.
+.SH BUGS
+According to POSIX.1-2008, an application calling
+.BR waitid ()
+must ensure that
+.I infop
+points to a
+.I siginfo_t
+structure (i.e., that it is a non-null pointer).
+On Linux, if
+.I infop
+is NULL,
+.BR waitid ()
+succeeds, and returns the process ID of the waited-for child.
+Applications should avoid relying on this inconsistent,
+nonstandard, and unnecessary feature.
+.SH EXAMPLES
+.\" fork.2 refers to this example program.
+The following program demonstrates the use of
+.BR fork (2)
+and
+.BR waitpid ().
+The program creates a child process.
+If no command-line argument is supplied to the program,
+then the child suspends its execution using
+.BR pause (2),
+to allow the user to send signals to the child.
+Otherwise, if a command-line argument is supplied,
+then the child exits immediately,
+using the integer supplied on the command line as the exit status.
+The parent process executes a loop that monitors the child using
+.BR waitpid (),
+and uses the W*() macros described above to analyze the wait status value.
+.PP
+The following shell session demonstrates the use of the program:
+.PP
+.in +4n
+.EX
+.RB "$" " ./a.out &"
+Child PID is 32360
+[1] 32359
+.RB "$" " kill \-STOP 32360"
+stopped by signal 19
+.RB "$" " kill \-CONT 32360"
+continued
+.RB "$" " kill \-TERM 32360"
+killed by signal 15
+[1]+ Done ./a.out
+$
+.EE
+.in
+.SS Program source
+\&
+.\" SRC BEGIN (wait.c)
+.EX
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/wait.h>
+#include <unistd.h>
+\&
+int
+main(int argc, char *argv[])
+{
+ int wstatus;
+ pid_t cpid, w;
+\&
+ cpid = fork();
+ if (cpid == \-1) {
+ perror("fork");
+ exit(EXIT_FAILURE);
+ }
+\&
+ if (cpid == 0) { /* Code executed by child */
+ printf("Child PID is %jd\en", (intmax_t) getpid());
+ if (argc == 1)
+ pause(); /* Wait for signals */
+ _exit(atoi(argv[1]));
+\&
+ } else { /* Code executed by parent */
+ do {
+ w = waitpid(cpid, &wstatus, WUNTRACED | WCONTINUED);
+ if (w == \-1) {
+ perror("waitpid");
+ exit(EXIT_FAILURE);
+ }
+\&
+ if (WIFEXITED(wstatus)) {
+ printf("exited, status=%d\en", WEXITSTATUS(wstatus));
+ } else if (WIFSIGNALED(wstatus)) {
+ printf("killed by signal %d\en", WTERMSIG(wstatus));
+ } else if (WIFSTOPPED(wstatus)) {
+ printf("stopped by signal %d\en", WSTOPSIG(wstatus));
+ } else if (WIFCONTINUED(wstatus)) {
+ printf("continued\en");
+ }
+ } while (!WIFEXITED(wstatus) && !WIFSIGNALED(wstatus));
+ exit(EXIT_SUCCESS);
+ }
+}
+.EE
+.\" SRC END
+.SH SEE ALSO
+.BR _exit (2),
+.BR clone (2),
+.BR fork (2),
+.BR kill (2),
+.BR ptrace (2),
+.BR sigaction (2),
+.BR signal (2),
+.BR wait4 (2),
+.BR pthread_create (3),
+.BR core (5),
+.BR credentials (7),
+.BR signal (7)
diff --git a/man2/wait3.2 b/man2/wait3.2
new file mode 100644
index 0000000..097794b
--- /dev/null
+++ b/man2/wait3.2
@@ -0,0 +1 @@
+.so man2/wait4.2
diff --git a/man2/wait4.2 b/man2/wait4.2
new file mode 100644
index 0000000..7136d70
--- /dev/null
+++ b/man2/wait4.2
@@ -0,0 +1,169 @@
+.\" Copyright (c) 1993 by Thomas Koenig (ig25@rz.uni-karlsruhe.de)
+.\" and Copyright (c) 2004 by Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified Sat Jul 24 13:32:44 1993 by Rik Faith (faith@cs.unc.edu)
+.\" Modified Mon Jun 23 14:09:52 1997 by aeb - add EINTR.
+.\" Modified Tue Jul 7 12:26:42 1998 by aeb - changed return value wait3
+.\" Modified 2004-11-11, Michael Kerrisk <mtk.manpages@gmail.com>
+.\" Rewrote much of this page, and removed much duplicated text,
+.\" replacing with pointers to wait.2
+.\"
+.TH wait4 2 2023-03-30 "Linux man-pages 6.05.01"
+.SH NAME
+wait3, wait4 \- wait for process to change state, BSD style
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <sys/wait.h>
+.PP
+.BI "pid_t wait3(int *_Nullable " "wstatus" ", int " options ,
+.BI " struct rusage *_Nullable " rusage );
+.BI "pid_t wait4(pid_t " pid ", int *_Nullable " wstatus ", int " options ,
+.BI " struct rusage *_Nullable " rusage );
+.fi
+.PP
+.RS -4
+Feature Test Macro Requirements for glibc (see
+.BR feature_test_macros (7)):
+.RE
+.PP
+.BR wait3 ():
+.nf
+ Since glibc 2.26:
+ _DEFAULT_SOURCE
+ || (_XOPEN_SOURCE >= 500 &&
+ ! (_POSIX_C_SOURCE >= 200112L
+ || _XOPEN_SOURCE >= 600))
+ From glibc 2.19 to glibc 2.25:
+ _DEFAULT_SOURCE || _XOPEN_SOURCE >= 500
+ glibc 2.19 and earlier:
+ _BSD_SOURCE || _XOPEN_SOURCE >= 500
+.\" || _XOPEN_SOURCE && _XOPEN_SOURCE_EXTENDED
+.fi
+.PP
+.BR wait4 ():
+.nf
+ Since glibc 2.19:
+ _DEFAULT_SOURCE
+ glibc 2.19 and earlier:
+ _BSD_SOURCE
+.fi
+.SH DESCRIPTION
+These functions are nonstandard; in new programs, the use of
+.BR waitpid (2)
+or
+.BR waitid (2)
+is preferable.
+.PP
+The
+.BR wait3 ()
+and
+.BR wait4 ()
+system calls are similar to
+.BR waitpid (2),
+but additionally return resource usage information about the
+child in the structure pointed to by
+.IR rusage .
+.PP
+Other than the use of the
+.I rusage
+argument, the following
+.BR wait3 ()
+call:
+.PP
+.in +4n
+.EX
+wait3(wstatus, options, rusage);
+.EE
+.in
+.PP
+is equivalent to:
+.PP
+.in +4n
+.EX
+waitpid(\-1, wstatus, options);
+.EE
+.in
+.PP
+Similarly, the following
+.BR wait4 ()
+call:
+.PP
+.in +4n
+.EX
+wait4(pid, wstatus, options, rusage);
+.EE
+.in
+.PP
+is equivalent to:
+.PP
+.in +4n
+.EX
+waitpid(pid, wstatus, options);
+.EE
+.in
+.PP
+In other words,
+.BR wait3 ()
+waits for any child, while
+.BR wait4 ()
+can be used to select a specific child, or children, on which to wait.
+See
+.BR wait (2)
+for further details.
+.PP
+If
+.I rusage
+is not NULL, the
+.I struct rusage
+to which it points will be filled with accounting information
+about the child.
+See
+.BR getrusage (2)
+for details.
+.SH RETURN VALUE
+As for
+.BR waitpid (2).
+.SH ERRORS
+As for
+.BR waitpid (2).
+.SH STANDARDS
+None.
+.SH HISTORY
+4.3BSD.
+.PP
+SUSv1 included a specification of
+.BR wait3 ();
+SUSv2 included
+.BR wait3 (),
+but marked it LEGACY;
+SUSv3 removed it.
+.PP
+Including
+.I <sys/time.h>
+is not required these days, but increases portability.
+(Indeed,
+.I <sys/resource.h>
+defines the
+.I rusage
+structure with fields of type
+.I struct timeval
+defined in
+.IR <sys/time.h> .)
+.SS C library/kernel differences
+On Linux,
+.BR wait3 ()
+is a library function implemented on top of the
+.BR wait4 ()
+system call.
+.SH SEE ALSO
+.BR fork (2),
+.BR getrusage (2),
+.BR sigaction (2),
+.BR signal (2),
+.BR wait (2),
+.BR signal (7)
diff --git a/man2/waitid.2 b/man2/waitid.2
new file mode 100644
index 0000000..0605b35
--- /dev/null
+++ b/man2/waitid.2
@@ -0,0 +1 @@
+.so man2/wait.2
diff --git a/man2/waitpid.2 b/man2/waitpid.2
new file mode 100644
index 0000000..0605b35
--- /dev/null
+++ b/man2/waitpid.2
@@ -0,0 +1 @@
+.so man2/wait.2
diff --git a/man2/write.2 b/man2/write.2
new file mode 100644
index 0000000..d1dc273
--- /dev/null
+++ b/man2/write.2
@@ -0,0 +1,329 @@
+.\" This manpage is Copyright (C) 1992 Drew Eckhardt;
+.\" and Copyright (C) 1993 Michael Haardt, Ian Jackson.
+.\" and Copyright (C) 2007 Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.\" Modified Sat Jul 24 13:35:59 1993 by Rik Faith <faith@cs.unc.edu>
+.\" Modified Sun Nov 28 17:19:01 1993 by Rik Faith <faith@cs.unc.edu>
+.\" Modified Sat Jan 13 12:58:08 1996 by Michael Haardt
+.\" <michael@cantor.informatik.rwth-aachen.de>
+.\" Modified Sun Jul 21 18:59:33 1996 by Andries Brouwer <aeb@cwi.nl>
+.\" 2001-12-13 added remark by Zack Weinberg
+.\" 2007-06-18 mtk:
+.\" Added details about seekable files and file offset.
+.\" Noted that write() may write less than 'count' bytes, and
+.\" gave some examples of why this might occur.
+.\" Noted what happens if write() is interrupted by a signal.
+.\"
+.TH write 2 2023-04-03 "Linux man-pages 6.05.01"
+.SH NAME
+write \- write to a file descriptor
+.SH LIBRARY
+Standard C library
+.RI ( libc ", " \-lc )
+.SH SYNOPSIS
+.nf
+.B #include <unistd.h>
+.PP
+.BI "ssize_t write(int " fd ", const void " buf [. count "], size_t " count );
+.fi
+.SH DESCRIPTION
+.BR write ()
+writes up to
+.I count
+bytes from the buffer starting at
+.I buf
+to the file referred to by the file descriptor
+.IR fd .
+.PP
+The number of bytes written may be less than
+.I count
+if, for example,
+there is insufficient space on the underlying physical medium, or the
+.B RLIMIT_FSIZE
+resource limit is encountered (see
+.BR setrlimit (2)),
+or the call was interrupted by a signal
+handler after having written less than
+.I count
+bytes.
+(See also
+.BR pipe (7).)
+.PP
+For a seekable file (i.e., one to which
+.BR lseek (2)
+may be applied, for example, a regular file)
+writing takes place at the file offset,
+and the file offset is incremented by
+the number of bytes actually written.
+If the file was
+.BR open (2)ed
+with
+.BR O_APPEND ,
+the file offset is first set to the end of the file before writing.
+The adjustment of the file offset and the write operation
+are performed as an atomic step.
+.PP
+POSIX requires that a
+.BR read (2)
+that can be proved to occur after a
+.BR write ()
+has returned will return the new data.
+Note that not all filesystems are POSIX conforming.
+.PP
+According to POSIX.1, if
+.I count
+is greater than
+.BR SSIZE_MAX ,
+the result is implementation-defined;
+see NOTES for the upper limit on Linux.
+.SH RETURN VALUE
+On success, the number of bytes written is returned.
+On error, \-1 is returned, and \fIerrno\fP is set
+to indicate the error.
+.PP
+Note that a successful
+.BR write ()
+may transfer fewer than
+.I count
+bytes.
+Such partial writes can occur for various reasons;
+for example, because there was insufficient space on the disk device
+to write all of the requested bytes, or because a blocked
+.BR write ()
+to a socket, pipe, or similar was interrupted by a signal handler
+after it had transferred some, but before it had transferred all
+of the requested bytes.
+In the event of a partial write, the caller can make another
+.BR write ()
+call to transfer the remaining bytes.
+The subsequent call will either transfer further bytes or
+may result in an error (e.g., if the disk is now full).
+.PP
+If \fIcount\fP is zero and
+.I fd
+refers to a regular file, then
+.BR write ()
+may return a failure status if one of the errors below is detected.
+If no errors are detected, or error detection is not performed,
+0 is returned without causing any other effect.
+If
+\fIcount\fP is zero and
+.I fd
+refers to a file other than a regular file,
+the results are not specified.
+.SH ERRORS
+.TP
+.B EAGAIN
+The file descriptor
+.I fd
+refers to a file other than a socket and has been marked nonblocking
+.RB ( O_NONBLOCK ),
+and the write would block.
+See
+.BR open (2)
+for further details on the
+.B O_NONBLOCK
+flag.
+.TP
+.BR EAGAIN " or " EWOULDBLOCK
+.\" Actually EAGAIN on Linux
+The file descriptor
+.I fd
+refers to a socket and has been marked nonblocking
+.RB ( O_NONBLOCK ),
+and the write would block.
+POSIX.1-2001 allows either error to be returned for this case,
+and does not require these constants to have the same value,
+so a portable application should check for both possibilities.
+.TP
+.B EBADF
+.I fd
+is not a valid file descriptor or is not open for writing.
+.TP
+.B EDESTADDRREQ
+.I fd
+refers to a datagram socket for which a peer address has not been set using
+.BR connect (2).
+.TP
+.B EDQUOT
+The user's quota of disk blocks on the filesystem containing the file
+referred to by
+.I fd
+has been exhausted.
+.TP
+.B EFAULT
+.I buf
+is outside your accessible address space.
+.TP
+.B EFBIG
+An attempt was made to write a file that exceeds the implementation-defined
+maximum file size or the process's file size limit,
+or to write at a position past the maximum allowed offset.
+.TP
+.B EINTR
+The call was interrupted by a signal before any data was written; see
+.BR signal (7).
+.TP
+.B EINVAL
+.I fd
+is attached to an object which is unsuitable for writing;
+or the file was opened with the
+.B O_DIRECT
+flag, and either the address specified in
+.IR buf ,
+the value specified in
+.IR count ,
+or the file offset is not suitably aligned.
+.TP
+.B EIO
+A low-level I/O error occurred while modifying the inode.
+This error may relate to the write-back of data written by an earlier
+.BR write (),
+which may have been issued to a different file descriptor on
+the same file.
+Since Linux 4.13, errors from write-back come
+with a promise that they
+.I may
+be reported by subsequent
+.BR write ()
+requests, and
+.I will
+be reported by a subsequent
+.BR fsync (2)
+(whether or not they were also reported by
+.BR write ()).
+.\" commit 088737f44bbf6378745f5b57b035e57ee3dc4750
+An alternate cause of
+.B EIO
+on networked filesystems is when an advisory lock had been taken out
+on the file descriptor and this lock has been lost.
+See the
+.I "Lost locks"
+section of
+.BR fcntl (2)
+for further details.
+.TP
+.B ENOSPC
+The device containing the file referred to by
+.I fd
+has no room for the data.
+.TP
+.B EPERM
+The operation was prevented by a file seal; see
+.BR fcntl (2).
+.TP
+.B EPIPE
+.I fd
+is connected to a pipe or socket whose reading end is closed.
+When this happens the writing process will also receive a
+.B SIGPIPE
+signal.
+(Thus, the write return value is seen only if the program
+catches, blocks or ignores this signal.)
+.PP
+Other errors may occur, depending on the object connected to
+.IR fd .
+.SH STANDARDS
+POSIX.1-2008.
+.SH HISTORY
+SVr4, 4.3BSD, POSIX.1-2001.
+.\" SVr4 documents additional error
+.\" conditions EDEADLK, ENOLCK, ENOLNK, ENOSR, ENXIO, or ERANGE.
+.PP
+Under SVr4 a write may be interrupted and return
+.B EINTR
+at any point,
+not just before any data is written.
+.SH NOTES
+A successful return from
+.BR write ()
+does not make any guarantee that data has been committed to disk.
+On some filesystems, including NFS, it does not even guarantee
+that space has successfully been reserved for the data.
+In this case,
+some errors might be delayed until a future
+.BR write (),
+.BR fsync (2),
+or even
+.BR close (2).
+The only way to be sure is to call
+.BR fsync (2)
+after you are done writing all your data.
+.PP
+If a
+.BR write ()
+is interrupted by a signal handler before any bytes are written,
+then the call fails with the error
+.BR EINTR ;
+if it is interrupted after at least one byte has been written,
+the call succeeds, and returns the number of bytes written.
+.PP
+On Linux,
+.BR write ()
+(and similar system calls) will transfer at most
+0x7ffff000 (2,147,479,552) bytes,
+returning the number of bytes actually transferred.
+.\" commit e28cc71572da38a5a12c1cfe4d7032017adccf69
+(This is true on both 32-bit and 64-bit systems.)
+.PP
+An error return value while performing
+.BR write ()
+using direct I/O does not mean the
+entire write has failed.
+Partial data may be written
+and the data at the file offset on which the
+.BR write ()
+was attempted should be considered inconsistent.
+.SH BUGS
+According to POSIX.1-2008/SUSv4 Section XSI 2.9.7
+("Thread Interactions with Regular File Operations"):
+.PP
+.RS 4
+All of the following functions shall be atomic with respect to
+each other in the effects specified in POSIX.1-2008 when they
+operate on regular files or symbolic links: ...
+.RE
+.PP
+Among the APIs subsequently listed are
+.BR write ()
+and
+.BR writev (2).
+And among the effects that should be atomic across threads (and processes)
+are updates of the file offset.
+However, before Linux 3.14,
+this was not the case: if two processes that share
+an open file description (see
+.BR open (2))
+perform a
+.BR write ()
+(or
+.BR writev (2))
+at the same time, then the I/O operations were not atomic
+with respect to updating the file offset,
+with the result that the blocks of data output by the two processes
+might (incorrectly) overlap.
+This problem was fixed in Linux 3.14.
+.\" http://thread.gmane.org/gmane.linux.kernel/1649458
+.\" From: Michael Kerrisk (man-pages) <mtk.manpages <at> gmail.com>
+.\" Subject: Update of file offset on write() etc. is non-atomic with I/O
+.\" Date: 2014-02-17 15:41:37 GMT
+.\" Newsgroups: gmane.linux.kernel, gmane.linux.file-systems
+.\" commit 9c225f2655e36a470c4f58dbbc99244c5fc7f2d4
+.\" Author: Linus Torvalds <torvalds@linux-foundation.org>
+.\" Date: Mon Mar 3 09:36:58 2014 -0800
+.\"
+.\" vfs: atomic f_pos accesses as per POSIX
+.SH SEE ALSO
+.BR close (2),
+.BR fcntl (2),
+.BR fsync (2),
+.BR ioctl (2),
+.BR lseek (2),
+.BR open (2),
+.BR pwrite (2),
+.BR read (2),
+.BR select (2),
+.BR writev (2),
+.BR fwrite (3)
diff --git a/man2/writev.2 b/man2/writev.2
new file mode 100644
index 0000000..54e3384
--- /dev/null
+++ b/man2/writev.2
@@ -0,0 +1 @@
+.so man2/readv.2