From 399644e47874bff147afb19c89228901ac39340e Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Mon, 15 Apr 2024 21:40:15 +0200 Subject: Adding upstream version 6.05.01. Signed-off-by: Daniel Baumann --- man2/perf_event_open.2 | 3989 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 3989 insertions(+) create mode 100644 man2/perf_event_open.2 (limited to 'man2/perf_event_open.2') diff --git a/man2/perf_event_open.2 b/man2/perf_event_open.2 new file mode 100644 index 0000000..d9e7877 --- /dev/null +++ b/man2/perf_event_open.2 @@ -0,0 +1,3989 @@ +.\" Copyright (c) 2012, Vincent Weaver +.\" +.\" SPDX-License-Identifier: GPL-2.0-or-later +.\" +.\" This document is based on the perf_event.h header file, the +.\" tools/perf/design.txt file, and a lot of bitter experience. +.\" +.TH perf_event_open 2 2023-05-03 "Linux man-pages 6.05.01" +.SH NAME +perf_event_open \- set up performance monitoring +.SH LIBRARY +Standard C library +.RI ( libc ", " \-lc ) +.SH SYNOPSIS +.nf +.BR "#include <linux/perf_event.h>" " /* Definition of " PERF_* " constants */" +.BR "#include <linux/hw_breakpoint.h>" " /* Definition of " HW_* " constants */" +.BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */" +.B #include <unistd.h> +.PP +.BI "int syscall(SYS_perf_event_open, struct perf_event_attr *" attr , +.BI " pid_t " pid ", int " cpu ", int " group_fd \ +", unsigned long " flags ); +.fi +.PP +.IR Note : +glibc provides no wrapper for +.BR perf_event_open (), +necessitating the use of +.BR syscall (2). +.SH DESCRIPTION +Given a list of parameters, +.BR perf_event_open () +returns a file descriptor, for use in subsequent system calls +.RB ( read "(2), " mmap "(2), " prctl "(2), " fcntl "(2), etc.)." +.PP +A call to +.BR perf_event_open () +creates a file descriptor that allows measuring performance +information. +Each file descriptor corresponds to one +event that is measured; these can be grouped together +to measure multiple events simultaneously. 
+.PP +Events can be enabled and disabled in two ways: via +.BR ioctl (2) +and via +.BR prctl (2). +When an event is disabled it does not count or generate overflows but does +continue to exist and maintain its count value. +.PP +Events come in two flavors: counting and sampled. +A +.I counting +event is one that is used for counting the aggregate number of events +that occur. +In general, counting event results are gathered with a +.BR read (2) +call. +A +.I sampling +event periodically writes measurements to a buffer that can then +be accessed via +.BR mmap (2). +.SS Arguments +The +.I pid +and +.I cpu +arguments allow specifying which process and CPU to monitor: +.TP +.BR "pid == 0" " and " "cpu == \-1" +This measures the calling process/thread on any CPU. +.TP +.BR "pid == 0" " and " "cpu >= 0" +This measures the calling process/thread only +when running on the specified CPU. +.TP +.BR "pid > 0" " and " "cpu == \-1" +This measures the specified process/thread on any CPU. +.TP +.BR "pid > 0" " and " "cpu >= 0" +This measures the specified process/thread only +when running on the specified CPU. +.TP +.BR "pid == \-1" " and " "cpu >= 0" +This measures all processes/threads on the specified CPU. +This requires +.B CAP_PERFMON +(since Linux 5.8) or +.B CAP_SYS_ADMIN +capability or a +.I /proc/sys/kernel/perf_event_paranoid +value of less than 1. +.TP +.BR "pid == \-1" " and " "cpu == \-1" +This setting is invalid and will return an error. +.PP +When +.I pid +is greater than zero, permission to perform this system call +is governed by +.B CAP_PERFMON +(since Linux 5.9) and a ptrace access mode +.B PTRACE_MODE_READ_REALCREDS +check on older Linux versions; see +.BR ptrace (2). +.PP +The +.I group_fd +argument allows event groups to be created. +An event group has one event which is the group leader. +The leader is created first, with +.IR group_fd " = \-1." 
+The rest of the group members are created with subsequent +.BR perf_event_open () +calls with +.I group_fd +being set to the file descriptor of the group leader. +(A single event on its own is created with +.IR group_fd " = \-1" +and is considered to be a group with only 1 member.) +An event group is scheduled onto the CPU as a unit: +it will be put onto the CPU +only if all of the events in the group can be put onto the CPU. +This means that the values of the member events can be meaningfully compared +\[em]added, divided (to get ratios), and so on\[em] +with each other, +since they have counted events for the same set of executed instructions. +.PP +The +.I flags +argument is formed by ORing together zero or more of the following values: +.TP +.BR PERF_FLAG_FD_CLOEXEC " (since Linux 3.14)" +.\" commit a21b0b354d4ac39be691f51c53562e2c24443d9e +This flag enables the close-on-exec flag for the created +event file descriptor, +so that the file descriptor is automatically closed on +.BR execve (2). +Setting the close-on-exec flags at creation time, rather than later with +.BR fcntl (2), +avoids potential race conditions where the calling thread invokes +.BR perf_event_open () +and +.BR fcntl (2) +at the same time as another thread calls +.BR fork (2) +then +.BR execve (2). +.TP +.B PERF_FLAG_FD_NO_GROUP +This flag tells the event to ignore the +.I group_fd +parameter except for the purpose of setting up output redirection +using the +.B PERF_FLAG_FD_OUTPUT +flag. +.TP +.BR PERF_FLAG_FD_OUTPUT " (broken since Linux 2.6.35)" +.\" commit ac9721f3f54b27a16c7e1afb2481e7ee95a70318 +This flag re-routes the event's sampled output to instead +be included in the mmap buffer of the event specified by +.IR group_fd . +.TP +.BR PERF_FLAG_PID_CGROUP " (since Linux 2.6.39)" +.\" commit e5d1367f17ba6a6fed5fd8b74e4d5720923e0c25 +This flag activates per-container system-wide monitoring. 
+A container +is an abstraction that isolates a set of resources for finer-grained +control (CPUs, memory, etc.). +In this mode, the event is measured +only if the thread running on the monitored CPU belongs to the designated +container (cgroup). +The cgroup is identified by passing a file descriptor +opened on its directory in the cgroupfs filesystem. +For instance, if the +cgroup to monitor is called +.IR test , +then a file descriptor opened on +.I /dev/cgroup/test +(assuming cgroupfs is mounted on +.IR /dev/cgroup ) +must be passed as the +.I pid +parameter. +cgroup monitoring is available only +for system-wide events and may therefore require extra permissions. +.PP +The +.I perf_event_attr +structure provides detailed configuration information +for the event being created. +.PP +.in +4n +.EX +struct perf_event_attr { + __u32 type; /* Type of event */ + __u32 size; /* Size of attribute structure */ + __u64 config; /* Type\-specific configuration */ +\& + union { + __u64 sample_period; /* Period of sampling */ + __u64 sample_freq; /* Frequency of sampling */ + }; +\& + __u64 sample_type; /* Specifies values included in sample */ + __u64 read_format; /* Specifies values returned in read */ +\& + __u64 disabled : 1, /* off by default */ + inherit : 1, /* children inherit it */ + pinned : 1, /* must always be on PMU */ + exclusive : 1, /* only group on PMU */ + exclude_user : 1, /* don\[aq]t count user */ + exclude_kernel : 1, /* don\[aq]t count kernel */ + exclude_hv : 1, /* don\[aq]t count hypervisor */ + exclude_idle : 1, /* don\[aq]t count when idle */ + mmap : 1, /* include mmap data */ + comm : 1, /* include comm data */ + freq : 1, /* use freq, not period */ + inherit_stat : 1, /* per task counts */ + enable_on_exec : 1, /* next exec enables */ + task : 1, /* trace fork/exit */ + watermark : 1, /* wakeup_watermark */ + precise_ip : 2, /* skid constraint */ + mmap_data : 1, /* non\-exec mmap data */ + sample_id_all : 1, /* sample_type all events */ + 
exclude_host : 1, /* don\[aq]t count in host */ + exclude_guest : 1, /* don\[aq]t count in guest */ + exclude_callchain_kernel : 1, + /* exclude kernel callchains */ + exclude_callchain_user : 1, + /* exclude user callchains */ + mmap2 : 1, /* include mmap with inode data */ + comm_exec : 1, /* flag comm events that are + due to exec */ + use_clockid : 1, /* use clockid for time fields */ + context_switch : 1, /* context switch data */ + write_backward : 1, /* Write ring buffer from end + to beginning */ + namespaces : 1, /* include namespaces data */ + ksymbol : 1, /* include ksymbol events */ + bpf_event : 1, /* include bpf events */ + aux_output : 1, /* generate AUX records + instead of events */ + cgroup : 1, /* include cgroup events */ + text_poke : 1, /* include text poke events */ + build_id : 1, /* use build id in mmap2 events */ + inherit_thread : 1, /* children only inherit */ + /* if cloned with CLONE_THREAD */ + remove_on_exec : 1, /* event is removed from task + on exec */ + sigtrap : 1, /* send synchronous SIGTRAP + on event */ +\& + __reserved_1 : 26; +\& + union { + __u32 wakeup_events; /* wakeup every n events */ + __u32 wakeup_watermark; /* bytes before wakeup */ + }; +\& + __u32 bp_type; /* breakpoint type */ +\& + union { + __u64 bp_addr; /* breakpoint address */ + __u64 kprobe_func; /* for perf_kprobe */ + __u64 uprobe_path; /* for perf_uprobe */ + __u64 config1; /* extension of config */ + }; +\& + union { + __u64 bp_len; /* breakpoint length */ + __u64 kprobe_addr; /* with kprobe_func == NULL */ + __u64 probe_offset; /* for perf_[k,u]probe */ + __u64 config2; /* extension of config1 */ + }; + __u64 branch_sample_type; /* enum perf_branch_sample_type */ + __u64 sample_regs_user; /* user regs to dump on samples */ + __u32 sample_stack_user; /* size of stack to dump on + samples */ + __s32 clockid; /* clock to use for time fields */ + __u64 sample_regs_intr; /* regs to dump on samples */ + __u32 aux_watermark; /* aux bytes before wakeup */ + 
__u16 sample_max_stack; /* max frames in callchain */ + __u16 __reserved_2; /* align to u64 */ + __u32 aux_sample_size; /* max aux sample size */ + __u32 __reserved_3; /* align to u64 */ + __u64 sig_data; /* user data for sigtrap */ +\& +}; +.EE +.in +.PP +The fields of the +.I perf_event_attr +structure are described in more detail below: +.TP +.I type +This field specifies the overall event type. +It has one of the following values: +.RS +.TP +.B PERF_TYPE_HARDWARE +This indicates one of the "generalized" hardware events provided +by the kernel. +See the +.I config +field definition for more details. +.TP +.B PERF_TYPE_SOFTWARE +This indicates one of the software-defined events provided by the kernel +(even if no hardware support is available). +.TP +.B PERF_TYPE_TRACEPOINT +This indicates a tracepoint +provided by the kernel tracepoint infrastructure. +.TP +.B PERF_TYPE_HW_CACHE +This indicates a hardware cache event. +This has a special encoding, described in the +.I config +field definition. +.TP +.B PERF_TYPE_RAW +This indicates a "raw" implementation-specific event in the +.IR config " field." +.TP +.BR PERF_TYPE_BREAKPOINT " (since Linux 2.6.33)" +.\" commit 24f1e32c60c45c89a997c73395b69c8af6f0a84e +This indicates a hardware breakpoint as provided by the CPU. +Breakpoints can be read/write accesses to an address as well as +execution of an instruction address. +.TP +dynamic PMU +Since Linux 2.6.38, +.\" commit 2e80a82a49c4c7eca4e35734380f28298ba5db19 +.BR perf_event_open () +can support multiple PMUs. +To enable this, a value exported by the kernel can be used in the +.I type +field to indicate which PMU to use. +The value to use can be found in the sysfs filesystem: +there is a subdirectory per PMU instance under +.IR /sys/bus/event_source/devices . +In each subdirectory there is a +.I type +file whose content is an integer that can be used in the +.I type +field. 
+For instance, +.I /sys/bus/event_source/devices/cpu/type +contains the value for the core CPU PMU, which is usually 4. +.TP +.BR kprobe " and " uprobe " (since Linux 4.17)" +.\" commit 65074d43fc77bcae32776724b7fa2696923c78e4 +.\" commit e12f03d7031a977356e3d7b75a68c2185ff8d155 +.\" commit 33ea4b24277b06dbc55d7f5772a46f029600255e +These two dynamic PMUs create a kprobe/uprobe and attach it to the +file descriptor generated by perf_event_open. +The kprobe/uprobe will be destroyed on the destruction of the file descriptor. +See fields +.IR kprobe_func , +.IR uprobe_path , +.IR kprobe_addr , +and +.I probe_offset +for more details. +.RE +.TP +.I "size" +The size of the +.I perf_event_attr +structure for forward/backward compatibility. +Set this using +.I sizeof(struct perf_event_attr) +to allow the kernel to see +the struct size at the time of compilation. +.IP +The related define +.B PERF_ATTR_SIZE_VER0 +is set to 64; this was the size of the first published struct. +.B PERF_ATTR_SIZE_VER1 +is 72, corresponding to the addition of breakpoints in Linux 2.6.33. +.\" commit cb5d76999029ae7a517cb07dfa732c1b5a934fc2 +.\" this was added much later when PERF_ATTR_SIZE_VER2 happened +.\" but the actual attr_size had increased in Linux 2.6.33 +.B PERF_ATTR_SIZE_VER2 +is 80 corresponding to the addition of branch sampling in Linux 3.4. +.\" commit cb5d76999029ae7a517cb07dfa732c1b5a934fc2 +.B PERF_ATTR_SIZE_VER3 +is 96 corresponding to the addition +of +.I sample_regs_user +and +.I sample_stack_user +in Linux 3.7. +.\" commit 1659d129ed014b715b0b2120e6fd929bdd33ed03 +.B PERF_ATTR_SIZE_VER4 +is 104 corresponding to the addition of +.I sample_regs_intr +in Linux 3.19. +.\" commit 60e2364e60e86e81bc6377f49779779e6120977f +.B PERF_ATTR_SIZE_VER5 +is 112 corresponding to the addition of +.I aux_watermark +in Linux 4.1. +.\" commit 1a5941312414c71dece6717da9a0fa1303127afa +.TP +.I "config" +This specifies which event you want, in conjunction with +the +.I type +field. 
+The +.I config1 +and +.I config2 +fields are also taken into account in cases where 64 bits is not +enough to fully specify the event. +The encoding of these fields is event dependent. +.IP +There are various ways to set the +.I config +field that are dependent on the value of the previously +described +.I type +field. +What follows are various possible settings for +.I config +separated out by +.IR type . +.IP +If +.I type +is +.BR PERF_TYPE_HARDWARE , +we are measuring one of the generalized hardware CPU events. +Not all of these are available on all platforms. +Set +.I config +to one of the following: +.RS 12 +.TP +.B PERF_COUNT_HW_CPU_CYCLES +Total cycles. +Be wary of what happens during CPU frequency scaling. +.TP +.B PERF_COUNT_HW_INSTRUCTIONS +Retired instructions. +Be careful, these can be affected by various +issues, most notably hardware interrupt counts. +.TP +.B PERF_COUNT_HW_CACHE_REFERENCES +Cache accesses. +Usually this indicates Last Level Cache accesses but this may +vary depending on your CPU. +This may include prefetches and coherency messages; again this +depends on the design of your CPU. +.TP +.B PERF_COUNT_HW_CACHE_MISSES +Cache misses. +Usually this indicates Last Level Cache misses; this is intended to be +used in conjunction with the +.B PERF_COUNT_HW_CACHE_REFERENCES +event to calculate cache miss rates. +.TP +.B PERF_COUNT_HW_BRANCH_INSTRUCTIONS +Retired branch instructions. +Prior to Linux 2.6.35, this used +the wrong event on AMD processors. +.\" commit f287d332ce835f77a4f5077d2c0ef1e3f9ea42d2 +.TP +.B PERF_COUNT_HW_BRANCH_MISSES +Mispredicted branch instructions. +.TP +.B PERF_COUNT_HW_BUS_CYCLES +Bus cycles, which can be different from total cycles. +.TP +.BR PERF_COUNT_HW_STALLED_CYCLES_FRONTEND " (since Linux 3.0)" +.\" commit 8f62242246351b5a4bc0c1f00c0c7003edea128a +Stalled cycles during issue. 
+.TP +.BR PERF_COUNT_HW_STALLED_CYCLES_BACKEND " (since Linux 3.0)" +.\" commit 8f62242246351b5a4bc0c1f00c0c7003edea128a +Stalled cycles during retirement. +.TP +.BR PERF_COUNT_HW_REF_CPU_CYCLES " (since Linux 3.3)" +.\" commit c37e17497e01fc0f5d2d6feb5723b210b3ab8890 +Total cycles; not affected by CPU frequency scaling. +.RE +.IP +If +.I type +is +.BR PERF_TYPE_SOFTWARE , +we are measuring software events provided by the kernel. +Set +.I config +to one of the following: +.RS 12 +.TP +.B PERF_COUNT_SW_CPU_CLOCK +This reports the CPU clock, a high-resolution per-CPU timer. +.TP +.B PERF_COUNT_SW_TASK_CLOCK +This reports a clock count specific to the task that is running. +.TP +.B PERF_COUNT_SW_PAGE_FAULTS +This reports the number of page faults. +.TP +.B PERF_COUNT_SW_CONTEXT_SWITCHES +This counts context switches. +Until Linux 2.6.34, these were all reported as user-space +events, after that they are reported as happening in the kernel. +.\" commit e49a5bd38159dfb1928fd25b173bc9de4bbadb21 +.TP +.B PERF_COUNT_SW_CPU_MIGRATIONS +This reports the number of times the process +has migrated to a new CPU. +.TP +.B PERF_COUNT_SW_PAGE_FAULTS_MIN +This counts the number of minor page faults. +These did not require disk I/O to handle. +.TP +.B PERF_COUNT_SW_PAGE_FAULTS_MAJ +This counts the number of major page faults. +These required disk I/O to handle. +.TP +.BR PERF_COUNT_SW_ALIGNMENT_FAULTS " (since Linux 2.6.33)" +.\" commit f7d7986060b2890fc26db6ab5203efbd33aa2497 +This counts the number of alignment faults. +These happen when unaligned memory accesses happen; the kernel +can handle these but it reduces performance. +This happens only on some architectures (never on x86). +.TP +.BR PERF_COUNT_SW_EMULATION_FAULTS " (since Linux 2.6.33)" +.\" commit f7d7986060b2890fc26db6ab5203efbd33aa2497 +This counts the number of emulation faults. +The kernel sometimes traps on unimplemented instructions +and emulates them for user space. +This can negatively impact performance. 
+.TP +.BR PERF_COUNT_SW_DUMMY " (since Linux 3.12)" +.\" commit fa0097ee690693006ab1aea6c01ad3c851b65c77 +This is a placeholder event that counts nothing. +Informational sample record types such as mmap or comm +must be associated with an active event. +This dummy event allows gathering such records without requiring +a counting event. +.TP +.BR PERF_COUNT_SW_BPF_OUTPUT " (since Linux 4.4)" +.\" commit a43eec304259a6c637f4014a6d4767159b6a3aa3 +This is used to generate raw sample data from BPF. +BPF programs can write to this event using +.B bpf_perf_event_output +helper. +.TP +.BR PERF_COUNT_SW_CGROUP_SWITCHES " (since Linux 5.13)" +.\" commit d0d1dd628527c77db2391ce0293c1ed344b2365f +This counts context switches to a task in a different cgroup. +In other words, if the next task is in the same cgroup, +it won't count the switch. +.RE +.PP +.RS +If +.I type +is +.BR PERF_TYPE_TRACEPOINT , +then we are measuring kernel tracepoints. +The value to use in +.I config +can be obtained from under debugfs +.I tracing/events/*/*/id +if ftrace is enabled in the kernel. +.RE +.PP +.RS +If +.I type +is +.BR PERF_TYPE_HW_CACHE , +then we are measuring a hardware CPU cache event. 
+To calculate the appropriate +.I config +value, use the following equation: +.RS 4 +.PP +.in +4n +.EX +config = (perf_hw_cache_id) | + (perf_hw_cache_op_id << 8) | + (perf_hw_cache_op_result_id << 16); +.EE +.in +.PP +where +.I perf_hw_cache_id +is one of: +.RS 4 +.TP +.B PERF_COUNT_HW_CACHE_L1D +for measuring Level 1 Data Cache +.TP +.B PERF_COUNT_HW_CACHE_L1I +for measuring Level 1 Instruction Cache +.TP +.B PERF_COUNT_HW_CACHE_LL +for measuring Last-Level Cache +.TP +.B PERF_COUNT_HW_CACHE_DTLB +for measuring the Data TLB +.TP +.B PERF_COUNT_HW_CACHE_ITLB +for measuring the Instruction TLB +.TP +.B PERF_COUNT_HW_CACHE_BPU +for measuring the branch prediction unit +.TP +.BR PERF_COUNT_HW_CACHE_NODE " (since Linux 3.1)" +.\" commit 89d6c0b5bdbb1927775584dcf532d98b3efe1477 +for measuring local memory accesses +.RE +.PP +and +.I perf_hw_cache_op_id +is one of: +.RS 4 +.TP +.B PERF_COUNT_HW_CACHE_OP_READ +for read accesses +.TP +.B PERF_COUNT_HW_CACHE_OP_WRITE +for write accesses +.TP +.B PERF_COUNT_HW_CACHE_OP_PREFETCH +for prefetch accesses +.RE +.PP +and +.I perf_hw_cache_op_result_id +is one of: +.RS 4 +.TP +.B PERF_COUNT_HW_CACHE_RESULT_ACCESS +to measure accesses +.TP +.B PERF_COUNT_HW_CACHE_RESULT_MISS +to measure misses +.RE +.RE +.PP +If +.I type +is +.BR PERF_TYPE_RAW , +then a custom "raw" +.I config +value is needed. +Most CPUs support events that are not covered by the "generalized" events. +These are implementation defined; see your CPU manual (for example +the Intel Volume 3B documentation or the AMD BIOS and Kernel Developer +Guide). +The libpfm4 library can be used to translate from the name in the +architectural manuals to the raw hex value +.BR perf_event_open () +expects in this field. +.PP +If +.I type +is +.BR PERF_TYPE_BREAKPOINT , +then leave +.I config +set to zero. +Its parameters are set in other places. 
+.PP +If +.I type +is +.B kprobe +or +.BR uprobe , +set +.I retprobe +(bit 0 of +.IR config , +see +.IR /sys/bus/event_source/devices/[k,u]probe/format/retprobe ) +for kretprobe/uretprobe. +See fields +.IR kprobe_func , +.IR uprobe_path , +.IR kprobe_addr , +and +.I probe_offset +for more details. +.RE +.TP +.IR kprobe_func ", " uprobe_path ", " kprobe_addr ", and " probe_offset +These fields describe the kprobe/uprobe for dynamic PMUs +.B kprobe +and +.BR uprobe . +For +.BR kprobe : +use +.I kprobe_func +and +.IR probe_offset , +or use +.I kprobe_addr +and leave +.I kprobe_func +as NULL. +For +.BR uprobe : +use +.I uprobe_path +and +.IR probe_offset . +.TP +.IR sample_period ", " sample_freq +A "sampling" event is one that generates an overflow notification +every N events, where N is given by +.IR sample_period . +A sampling event has +.IR sample_period " > 0." +When an overflow occurs, requested data is recorded +in the mmap buffer. +The +.I sample_type +field controls what data is recorded on each overflow. +.IP +.I sample_freq +can be used if you wish to use frequency rather than period. +In this case, you set the +.I freq +flag. +The kernel will adjust the sampling period +to try and achieve the desired rate. +The rate of adjustment is a +timer tick. +.TP +.I sample_type +The various bits in this field specify which values to include +in the sample. +They will be recorded in a ring-buffer, +which is available to user space using +.BR mmap (2). +The order in which the values are saved in the +sample is documented in the MMAP Layout subsection below; +it is not the +.I "enum perf_event_sample_format" +order. +.RS +.TP +.B PERF_SAMPLE_IP +Records instruction pointer. +.TP +.B PERF_SAMPLE_TID +Records the process and thread IDs. +.TP +.B PERF_SAMPLE_TIME +Records a timestamp. +.TP +.B PERF_SAMPLE_ADDR +Records an address, if applicable. +.TP +.B PERF_SAMPLE_READ +Records counter values for all events in a group, not just the group leader. 
+.TP +.B PERF_SAMPLE_CALLCHAIN +Records the callchain (stack backtrace). +.TP +.B PERF_SAMPLE_ID +Records a unique ID for the opened event's group leader. +.TP +.B PERF_SAMPLE_CPU +Records CPU number. +.TP +.B PERF_SAMPLE_PERIOD +Records the current sampling period. +.TP +.B PERF_SAMPLE_STREAM_ID +Records a unique ID for the opened event. +Unlike +.B PERF_SAMPLE_ID +the actual ID is returned, not the group leader. +This ID is the same as the one returned by +.BR PERF_FORMAT_ID . +.TP +.B PERF_SAMPLE_RAW +Records additional data, if applicable. +Usually returned by tracepoint events. +.TP +.BR PERF_SAMPLE_BRANCH_STACK " (since Linux 3.4)" +.\" commit bce38cd53e5ddba9cb6d708c4ef3d04a4016ec7e +This provides a record of recent branches, as provided +by CPU branch sampling hardware (such as Intel Last Branch Record). +Not all hardware supports this feature. +.IP +See the +.I branch_sample_type +field for how to filter which branches are reported. +.TP +.BR PERF_SAMPLE_REGS_USER " (since Linux 3.7)" +.\" commit 4018994f3d8785275ef0e7391b75c3462c029e56 +Records the current user-level CPU register state +(the values in the process before the kernel was called). +.TP +.BR PERF_SAMPLE_STACK_USER " (since Linux 3.7)" +.\" commit c5ebcedb566ef17bda7b02686e0d658a7bb42ee7 +Records the user level stack, allowing stack unwinding. +.TP +.BR PERF_SAMPLE_WEIGHT " (since Linux 3.10)" +.\" commit c3feedf2aaf9ac8bad6f19f5d21e4ee0b4b87e9c +Records a hardware provided weight value that expresses how +costly the sampled event was. +This allows the hardware to highlight expensive events in +a profile. +.TP +.BR PERF_SAMPLE_DATA_SRC " (since Linux 3.10)" +.\" commit d6be9ad6c960f43800a6f118932bc8a5a4eadcd1 +Records the data source: where in the memory hierarchy +the data associated with the sampled instruction came from. +This is available only if the underlying hardware +supports this feature. 
+.TP +.BR PERF_SAMPLE_IDENTIFIER " (since Linux 3.12)" +.\" commit ff3d527cebc1fa3707c617bfe9e74f53fcfb0955 +Places the +.B SAMPLE_ID +value in a fixed position in the record, +either at the beginning (for sample events) or at the end +(if a non-sample event). +.IP +This was necessary because a sample stream may have +records from various different event sources with different +.I sample_type +settings. +Parsing the event stream properly was not possible because the +format of the record was needed to find +.BR SAMPLE_ID , +but +the format could not be found without knowing what +event the sample belonged to (causing a circular +dependency). +.IP +The +.B PERF_SAMPLE_IDENTIFIER +setting makes the event stream always parsable +by putting +.B SAMPLE_ID +in a fixed location, even though +it means having duplicate +.B SAMPLE_ID +values in records. +.TP +.BR PERF_SAMPLE_TRANSACTION " (since Linux 3.13)" +.\" commit fdfbbd07e91f8fe387140776f3fd94605f0c89e5 +Records reasons for transactional memory abort events +(for example, from Intel TSX transactional memory support). +.IP +The +.I precise_ip +setting must be greater than 0 and a transactional memory abort +event must be measured or no values will be recorded. +Also note that some perf_event measurements, such as sampled +cycle counting, may cause extraneous aborts (by causing an +interrupt during a transaction). +.TP +.BR PERF_SAMPLE_REGS_INTR " (since Linux 3.19)" +.\" commit 60e2364e60e86e81bc6377f49779779e6120977f +Records a subset of the current CPU register state +as specified by +.IR sample_regs_intr . +Unlike +.B PERF_SAMPLE_REGS_USER +the register values will return kernel register +state if the overflow happened while kernel +code is running. +If the CPU supports hardware sampling of +register state (i.e., PEBS on Intel x86) and +.I precise_ip +is set higher than zero then the register +values returned are those captured by +hardware at the time of the sampled +instruction's retirement. 
+.TP +.BR PERF_SAMPLE_PHYS_ADDR " (since Linux 4.13)" +.\" commit fc7ce9c74c3ad232b084d80148654f926d01ece7 +Records physical address of data like in +.BR PERF_SAMPLE_ADDR . +.TP +.BR PERF_SAMPLE_CGROUP " (since Linux 5.7)" +.\" commit 96aaab686505c449e24d76e76507290dcc30e008 +Records (perf_event) cgroup ID of the process. +This corresponds to the +.I id +field in the +.B PERF_RECORD_CGROUP +event. +.TP +.BR PERF_SAMPLE_DATA_PAGE_SIZE " (since Linux 5.11)" +.\" commit 8d97e71811aaafe4abf611dc24822fd6e73df1a1 +Records page size of data like in +.BR PERF_SAMPLE_ADDR . +.TP +.BR PERF_SAMPLE_CODE_PAGE_SIZE " (since Linux 5.11)" +.\" commit 995f088efebe1eba0282a6ffa12411b37f8990c2 +Records page size of ip like in +.BR PERF_SAMPLE_IP . +.TP +.BR PERF_SAMPLE_WEIGHT_STRUCT " (since Linux 5.12)" +.\" commit 2a6c6b7d7ad346f0679d0963cb19b3f0ea7ef32c +Records hardware provided weight values like in +.BR PERF_SAMPLE_WEIGHT , +but it can represent multiple values in a struct. +This shares the same space as +.BR PERF_SAMPLE_WEIGHT , +so users can apply either of those, +not both. +It has the following format and +the meaning of each field is +dependent on the hardware implementation. +.PP +.in +4n +.EX +union perf_sample_weight { + u64 full; /* PERF_SAMPLE_WEIGHT */ + struct { /* PERF_SAMPLE_WEIGHT_STRUCT */ + u32 var1_dw; + u16 var2_w; + u16 var3_w; + }; +}; +.EE +.in +.RE +.TP +.I read_format +This field specifies the format of the data returned by +.BR read (2) +on a +.BR perf_event_open () +file descriptor. +.RS +.TP +.B PERF_FORMAT_TOTAL_TIME_ENABLED +Adds the 64-bit +.I time_enabled +field. +This can be used to calculate estimated totals if +the PMU is overcommitted and multiplexing is happening. +.TP +.B PERF_FORMAT_TOTAL_TIME_RUNNING +Adds the 64-bit +.I time_running +field. +This can be used to calculate estimated totals if +the PMU is overcommitted and multiplexing is happening. +.TP +.B PERF_FORMAT_ID +Adds a 64-bit unique value that corresponds to the event group. 
+.TP +.B PERF_FORMAT_GROUP +Allows all counter values in an event group to be read with one read. +.TP +.BR PERF_FORMAT_LOST " (since Linux 6.0)" +.\" commit 119a784c81270eb88e573174ed2209225d646656 +Adds a 64-bit value that is the number of lost samples for this event. +This is meaningful only when +.I sample_period +or +.I sample_freq +is set. +.RE +.TP +.I disabled +The +.I disabled +bit specifies whether the counter starts out disabled or enabled. +If disabled, the event can later be enabled by +.BR ioctl (2), +.BR prctl (2), +or +.IR enable_on_exec . +.IP +When creating an event group, typically the group leader is initialized +with +.I disabled +set to 1 and any child events are initialized with +.I disabled +set to 0. +Despite +.I disabled +being 0, the child events will not start until the group leader +is enabled. +.TP +.I inherit +The +.I inherit +bit specifies that this counter should count events of child +tasks as well as the task specified. +This applies only to new children, not to any existing children at +the time the counter is created (nor to any new children of +existing children). +.IP +Inherit does not work for some combinations of +.I read_format +values, such as +.BR PERF_FORMAT_GROUP . +.TP +.I pinned +The +.I pinned +bit specifies that the counter should always be on the CPU if at all +possible. +It applies only to hardware counters and only to group leaders. +If a pinned counter cannot be put onto the CPU (e.g., because there are +not enough hardware counters or because of a conflict with some other +event), then the counter goes into an 'error' state, where reads +return end-of-file (i.e., +.BR read (2) +returns 0) until the counter is subsequently enabled or disabled. +.TP +.I exclusive +The +.I exclusive +bit specifies that when this counter's group is on the CPU, +it should be the only group using the CPU's counters. 
+In the future this may allow monitoring programs to +support PMU features that need to run alone so that they do not +disrupt other hardware counters. +.IP +Note that many unexpected situations may prevent events with the +.I exclusive +bit set from ever running. +This includes any users running a system-wide +measurement as well as any kernel use of the performance counters +(including the commonly enabled NMI Watchdog Timer interface). +.TP +.I exclude_user +If this bit is set, the count excludes events that happen in user space. +.TP +.I exclude_kernel +If this bit is set, the count excludes events that happen in kernel space. +.TP +.I exclude_hv +If this bit is set, the count excludes events that happen in the +hypervisor. +This is mainly for PMUs that have built-in support for handling this +(such as POWER). +Extra support is needed for handling hypervisor measurements on most +machines. +.TP +.I exclude_idle +If set, don't count when the CPU is running the idle task. +While you can currently enable this for any event type, it is ignored +for all but software events. +.TP +.I mmap +The +.I mmap +bit enables generation of +.B PERF_RECORD_MMAP +samples for every +.BR mmap (2) +call that has +.B PROT_EXEC +set. +This allows tools to notice new executable code being mapped into +a program (dynamic shared libraries for example) +so that addresses can be mapped back to the original code. +.TP +.I comm +The +.I comm +bit enables tracking of process command name as modified by the +.BR execve (2) +and +.BR prctl (PR_SET_NAME) +system calls as well as writing to +.IR /proc/self/comm . +If the +.I comm_exec +flag is also successfully set (possible since Linux 3.16), +.\" commit 82b897782d10fcc4930c9d4a15b175348fdd2871 +then the misc flag +.B PERF_RECORD_MISC_COMM_EXEC +can be used to differentiate the +.BR execve (2) +case from the others. +.TP +.I freq +If this bit is set, then +.I sample_freq +not +.I sample_period +is used when setting up the sampling interval. 
+.TP +.I inherit_stat +This bit enables saving of event counts on context switch for +inherited tasks. +This is meaningful only if the +.I inherit +field is set. +.TP +.I enable_on_exec +If this bit is set, a counter is automatically +enabled after a call to +.BR execve (2). +.TP +.I task +If this bit is set, then +fork/exit notifications are included in the ring buffer. +.TP +.I watermark +If set, have an overflow notification happen when we cross the +.I wakeup_watermark +boundary. +Otherwise, overflow notifications happen after +.I wakeup_events +samples. +.TP +.IR precise_ip " (since Linux 2.6.35)" +.\" commit ab608344bcbde4f55ec4cd911b686b0ce3eae076 +This controls the amount of skid. +Skid is how many instructions +execute between an event of interest happening and the kernel +being able to stop and record the event. +Smaller skid is +better and allows more accurate reporting of which events +correspond to which instructions, but hardware is often limited +with how small this can be. +.IP +The possible values of this field are the following: +.RS +.TP +.B 0 +.B SAMPLE_IP +can have arbitrary skid. +.TP +.B 1 +.B SAMPLE_IP +must have constant skid. +.TP +.B 2 +.B SAMPLE_IP +requested to have 0 skid. +.TP +.B 3 +.B SAMPLE_IP +must have 0 skid. +See also the description of +.BR PERF_RECORD_MISC_EXACT_IP . +.RE +.TP +.IR mmap_data " (since Linux 2.6.36)" +.\" commit 3af9e859281bda7eb7c20b51879cf43aa788ac2e +This is the counterpart of the +.I mmap +field. +This enables generation of +.B PERF_RECORD_MMAP +samples for +.BR mmap (2) +calls that do not have +.B PROT_EXEC +set (for example data and SysV shared memory). +.TP +.IR sample_id_all " (since Linux 2.6.38)" +.\" commit c980d1091810df13f21aabbce545fd98f545bbf7 +If set, then TID, TIME, ID, STREAM_ID, and CPU can +additionally be included in +.RB non- PERF_RECORD_SAMPLE s +if the corresponding +.I sample_type +is selected. 
+.IP +If +.B PERF_SAMPLE_IDENTIFIER +is specified, then an additional ID value is included +as the last value to ease parsing the record stream. +This may lead to the +.I id +value appearing twice. +.IP +The layout is described by this pseudo-structure: +.IP +.in +4n +.EX +struct sample_id { + { u32 pid, tid; } /* if PERF_SAMPLE_TID set */ + { u64 time; } /* if PERF_SAMPLE_TIME set */ + { u64 id; } /* if PERF_SAMPLE_ID set */ + { u64 stream_id;} /* if PERF_SAMPLE_STREAM_ID set */ + { u32 cpu, res; } /* if PERF_SAMPLE_CPU set */ + { u64 id; } /* if PERF_SAMPLE_IDENTIFIER set */ +}; +.EE +.in +.TP +.IR exclude_host " (since Linux 3.2)" +.\" commit a240f76165e6255384d4bdb8139895fac7988799 +When conducting measurements that include processes running +VM instances (i.e., have executed a +.B KVM_RUN +.BR ioctl (2)), +only measure events happening inside a guest instance. +This is only meaningful outside the guests; this setting does +not change counts gathered inside of a guest. +Currently, this functionality is x86 only. +.TP +.IR exclude_guest " (since Linux 3.2)" +.\" commit a240f76165e6255384d4bdb8139895fac7988799 +When conducting measurements that include processes running +VM instances (i.e., have executed a +.B KVM_RUN +.BR ioctl (2)), +do not measure events happening inside guest instances. +This is only meaningful outside the guests; this setting does +not change counts gathered inside of a guest. +Currently, this functionality is x86 only. +.TP +.IR exclude_callchain_kernel " (since Linux 3.7)" +.\" commit d077526485d5c9b12fe85d0b2b3b7041e6bc5f91 +Do not include kernel callchains. +.TP +.IR exclude_callchain_user " (since Linux 3.7)" +.\" commit d077526485d5c9b12fe85d0b2b3b7041e6bc5f91 +Do not include user callchains. +.TP +.IR mmap2 " (since Linux 3.16)" +.\" commit 13d7a2410fa637f450a29ecb515ac318ee40c741 +.\" This is tricky; was committed during 3.12 development +.\" but right before release was disabled. 
+.\" So while you could select mmap2 starting with Linux 3.12 +.\" it did not work until Linux 3.16 +.\" commit a5a5ba72843dd05f991184d6cb9a4471acce1005 +Generate an extended executable mmap record that contains enough +additional information to uniquely identify shared mappings. +The +.I mmap +flag must also be set for this to work. +.TP +.IR comm_exec " (since Linux 3.16)" +.\" commit 82b897782d10fcc4930c9d4a15b175348fdd2871 +This is purely a feature-detection flag, it does not change +kernel behavior. +If this flag can successfully be set, then, when +.I comm +is enabled, the +.B PERF_RECORD_MISC_COMM_EXEC +flag will be set in the +.I misc +field of a comm record header if the rename event being +reported was caused by a call to +.BR execve (2). +This allows tools to distinguish between the various +types of process renaming. +.TP +.IR use_clockid " (since Linux 4.1)" +.\" commit 34f439278cef7b1177f8ce24f9fc81dfc6221d3b +This allows selecting which internal Linux clock to use +when generating timestamps via the +.I clockid +field. +This can make it easier to correlate perf sample times with +timestamps generated by other tools. +.TP +.IR context_switch " (since Linux 4.3)" +.\" commit 45ac1403f564f411c6a383a2448688ba8dd705a4 +This enables the generation of +.B PERF_RECORD_SWITCH +records when a context switch occurs. +It also enables the generation of +.B PERF_RECORD_SWITCH_CPU_WIDE +records when sampling in CPU-wide mode. +This functionality is in addition to existing tracepoint and +software events for measuring context switches. +The advantage of this method is that it will give full +information even with strict +.I perf_event_paranoid +settings. +.TP +.IR write_backward " (since Linux 4.6)" +.\" commit 9ecda41acb971ebd07c8fb35faf24005c0baea12 +This causes the ring buffer to be written from the end to the beginning. +This is to support reading from overwritable ring buffer. 
+.TP +.IR namespaces " (since Linux 4.11)" +.\" commit e422267322cd319e2695a535e47c5b1feeac45eb +This enables the generation of +.B PERF_RECORD_NAMESPACES +records when a task enters a new namespace. +Each namespace has a combination of device and inode numbers. +.TP +.IR ksymbol " (since Linux 5.0)" +.\" commit 76193a94522f1d4edf2447a536f3f796ce56343b +This enables the generation of +.B PERF_RECORD_KSYMBOL +records when new kernel symbols are registered or unregistered. +This is useful for analyzing dynamic kernel functions like eBPF. +.TP +.IR bpf_event " (since Linux 5.0)" +.\" commit 6ee52e2a3fe4ea35520720736e6791df1fb67106 +This enables the generation of +.B PERF_RECORD_BPF_EVENT +records when an eBPF program is loaded or unloaded. +.TP +.IR aux_output " (since Linux 5.4)" +.\" commit ab43762ef010967e4ccd53627f70a2eecbeafefb +This allows normal (non-AUX) events to generate data for AUX events +if the hardware supports it. +.TP +.IR cgroup " (since Linux 5.7)" +.\" commit 96aaab686505c449e24d76e76507290dcc30e008 +This enables the generation of +.B PERF_RECORD_CGROUP +records when a new cgroup is created (and activated). +.TP +.IR text_poke " (since Linux 5.8)" +.\" commit e17d43b93e544f5016c0251d2074c15568d5d963 +This enables the generation of +.B PERF_RECORD_TEXT_POKE +records when there's a change to the kernel text +(i.e., self-modifying code). +.TP +.IR build_id " (since Linux 5.12)" +.\" commit 88a16a1309333e43d328621ece3e9fa37027e8eb +This changes the contents in the +.B PERF_RECORD_MMAP2 +to have a build-id instead of device and inode numbers. +.TP +.IR inherit_thread " (since Linux 5.13)" +.\" commit 2b26f0aa004995f49f7b6f4100dd0e4c39a9ed5f +This disables the inheritance of the event to a child process. +Only new threads in the same process +(which is cloned with +.BR CLONE_THREAD ) +will inherit the event. 
+.TP +.IR remove_on_exec " (since Linux 5.13)" +.\" commit 2e498d0a74e5b88a6689ae1b811f247f91ff188e +This closes the event when it starts a new process image by +.BR execve (2). +.TP +.IR sigtrap " (since Linux 5.13)" +.\" commit 97ba62b278674293762c3d91f724f1bb922f04e0 +This enables synchronous signal delivery of +.B SIGTRAP +on event overflow. +.TP +.IR wakeup_events ", " wakeup_watermark +This union sets how many samples +.RI ( wakeup_events ) +or bytes +.RI ( wakeup_watermark ) +happen before an overflow notification happens. +Which one is used is selected by the +.I watermark +bit flag. +.IP +.I wakeup_events +counts only +.B PERF_RECORD_SAMPLE +record types. +To receive overflow notification for all +.B PERF_RECORD +types choose watermark and set +.I wakeup_watermark +to 1. +.IP +Prior to Linux 3.0, setting +.\" commit f506b3dc0ec454a16d40cab9ee5d75435b39dc50 +.I wakeup_events +to 0 resulted in no overflow notifications; +more recent kernels treat 0 the same as 1. +.TP +.IR bp_type " (since Linux 2.6.33)" +.\" commit 24f1e32c60c45c89a997c73395b69c8af6f0a84e +This chooses the breakpoint type. +It is one of: +.RS +.TP +.B HW_BREAKPOINT_EMPTY +No breakpoint. +.TP +.B HW_BREAKPOINT_R +Count when we read the memory location. +.TP +.B HW_BREAKPOINT_W +Count when we write the memory location. +.TP +.B HW_BREAKPOINT_RW +Count when we read or write the memory location. +.TP +.B HW_BREAKPOINT_X +Count when we execute code at the memory location. +.PP +The values can be combined via a bitwise or, but the +combination of +.B HW_BREAKPOINT_R +or +.B HW_BREAKPOINT_W +with +.B HW_BREAKPOINT_X +is not allowed. +.RE +.TP +.IR bp_addr " (since Linux 2.6.33)" +.\" commit 24f1e32c60c45c89a997c73395b69c8af6f0a84e +This is the address of the breakpoint. +For execution breakpoints, this is the memory address of the instruction +of interest; for read and write breakpoints, it is the memory address +of the memory location of interest. 
+.TP +.IR config1 " (since Linux 2.6.39)" +.\" commit a7e3ed1e470116c9d12c2f778431a481a6be8ab6 +.I config1 +is used for setting events that need an extra register or otherwise +do not fit in the regular config field. +Raw OFFCORE_EVENTS on Nehalem/Westmere/SandyBridge use this field +on Linux 3.3 and later kernels. +.TP +.IR bp_len " (since Linux 2.6.33)" +.\" commit 24f1e32c60c45c89a997c73395b69c8af6f0a84e +.I bp_len +is the length of the breakpoint being measured if +.I type +is +.BR PERF_TYPE_BREAKPOINT . +Options are +.BR HW_BREAKPOINT_LEN_1 , +.BR HW_BREAKPOINT_LEN_2 , +.BR HW_BREAKPOINT_LEN_4 , +and +.BR HW_BREAKPOINT_LEN_8 . +For an execution breakpoint, set this to +.IR sizeof(long) . +.TP +.IR config2 " (since Linux 2.6.39)" +.\" commit a7e3ed1e470116c9d12c2f778431a481a6be8ab6 +.I config2 +is a further extension of the +.I config1 +field. +.TP +.IR branch_sample_type " (since Linux 3.4)" +.\" commit bce38cd53e5ddba9cb6d708c4ef3d04a4016ec7e +If +.B PERF_SAMPLE_BRANCH_STACK +is enabled, then this specifies what branches to include +in the branch record. +.IP +The first part of the value is the privilege level, which +is a combination of one of the values listed below. +If the user does not set privilege level explicitly, the kernel +will use the event's privilege level. +Event and branch privilege levels do not have to match. +.RS +.TP +.B PERF_SAMPLE_BRANCH_USER +Branch target is in user space. +.TP +.B PERF_SAMPLE_BRANCH_KERNEL +Branch target is in kernel space. +.TP +.B PERF_SAMPLE_BRANCH_HV +Branch target is in hypervisor. +.TP +.B PERF_SAMPLE_BRANCH_PLM_ALL +A convenience value that is the three preceding values ORed together. +.PP +In addition to the privilege value, at least one or more of the +following bits must be set. +.TP +.B PERF_SAMPLE_BRANCH_ANY +Any branch type. +.TP +.B PERF_SAMPLE_BRANCH_ANY_CALL +Any call branch (includes direct calls, indirect calls, and far jumps). +.TP +.B PERF_SAMPLE_BRANCH_IND_CALL +Indirect calls. 
+.TP +.BR PERF_SAMPLE_BRANCH_CALL " (since Linux 4.4)" +.\" commit c229bf9dc179d2023e185c0f705bdf68484c1e73 +Direct calls. +.TP +.B PERF_SAMPLE_BRANCH_ANY_RETURN +Any return branch. +.TP +.BR PERF_SAMPLE_BRANCH_IND_JUMP " (since Linux 4.2)" +.\" commit c9fdfa14c3792c0160849c484e83aa57afd80ccc +Indirect jumps. +.TP +.BR PERF_SAMPLE_BRANCH_COND " (since Linux 3.16)" +.\" commit bac52139f0b7ab31330e98fd87fc5a2664951050 +Conditional branches. +.TP +.BR PERF_SAMPLE_BRANCH_ABORT_TX " (since Linux 3.11)" +.\" commit 135c5612c460f89657c4698fe2ea753f6f667963 +Transactional memory aborts. +.TP +.BR PERF_SAMPLE_BRANCH_IN_TX " (since Linux 3.11)" +.\" commit 135c5612c460f89657c4698fe2ea753f6f667963 +Branch in transactional memory transaction. +.TP +.BR PERF_SAMPLE_BRANCH_NO_TX " (since Linux 3.11)" +.\" commit 135c5612c460f89657c4698fe2ea753f6f667963 +Branch not in transactional memory transaction. +.TP +.BR PERF_SAMPLE_BRANCH_CALL_STACK " (since Linux 4.1)" +.\" commit 2c44b1936bb3b135a3fac8b3493394d42e51cf70 +Branch is part of a hardware-generated call stack. +This requires hardware support, currently only found +on Intel x86 Haswell or newer. +.RE +.TP +.IR sample_regs_user " (since Linux 3.7)" +.\" commit 4018994f3d8785275ef0e7391b75c3462c029e56 +This bit mask defines the set of user CPU registers to dump on samples. +The layout of the register mask is architecture-specific and +is described in the kernel header file +.IR arch/ARCH/include/uapi/asm/perf_regs.h . +.TP +.IR sample_stack_user " (since Linux 3.7)" +.\" commit c5ebcedb566ef17bda7b02686e0d658a7bb42ee7 +This defines the size of the user stack to dump if +.B PERF_SAMPLE_STACK_USER +is specified. +.TP +.IR clockid " (since Linux 4.1)" +.\" commit 34f439278cef7b1177f8ce24f9fc81dfc6221d3b +If +.I use_clockid +is set, then this field selects which internal Linux timer to +use for timestamps. 
+The available timers are defined in +.IR linux/time.h , +with +.BR CLOCK_MONOTONIC , +.BR CLOCK_MONOTONIC_RAW , +.BR CLOCK_REALTIME , +.BR CLOCK_BOOTTIME , +and +.B CLOCK_TAI +currently supported. +.TP +.IR aux_watermark " (since Linux 4.1)" +.\" commit 1a5941312414c71dece6717da9a0fa1303127afa +This specifies how much data is required to trigger a +.B PERF_RECORD_AUX +sample. +.TP +.IR sample_max_stack " (since Linux 4.8)" +.\" commit 97c79a38cd454602645f0470ffb444b3b75ce574 +When +.I sample_type +includes +.BR PERF_SAMPLE_CALLCHAIN , +this field specifies how many stack frames to report when +generating the callchain. +.TP +.IR aux_sample_size " (since Linux 5.5)" +.\" commit a4faf00d994c40e64f656805ac375c65e324eefb +When +.B PERF_SAMPLE_AUX +flag is set, +specify the desired size of AUX data. +Note that it can get smaller data than the specified size. +.TP +.IR sig_data " (since Linux 5.13)" +.\" commit 97ba62b278674293762c3d91f724f1bb922f04e0 +This data will be copied to user's signal handler +(through +.I si_perf +in the +.IR siginfo_t ) +to disambiguate which event triggered the signal. +.SS Reading results +Once a +.BR perf_event_open () +file descriptor has been opened, the values +of the events can be read from the file descriptor. +The values that are there are specified by the +.I read_format +field in the +.I attr +structure at open time. +.PP +If you attempt to read into a buffer that is not big enough to hold the +data, the error +.B ENOSPC +results. 
+.PP +Here is the layout of the data returned by a read: +.IP \[bu] 3 +If +.B PERF_FORMAT_GROUP +was specified to allow reading all events in a group at once: +.IP +.in +4n +.EX +struct read_format { + u64 nr; /* The number of events */ + u64 time_enabled; /* if PERF_FORMAT_TOTAL_TIME_ENABLED */ + u64 time_running; /* if PERF_FORMAT_TOTAL_TIME_RUNNING */ + struct { + u64 value; /* The value of the event */ + u64 id; /* if PERF_FORMAT_ID */ + u64 lost; /* if PERF_FORMAT_LOST */ + } values[nr]; +}; +.EE +.in +.IP \[bu] +If +.B PERF_FORMAT_GROUP +was +.I not +specified: +.IP +.in +4n +.EX +struct read_format { + u64 value; /* The value of the event */ + u64 time_enabled; /* if PERF_FORMAT_TOTAL_TIME_ENABLED */ + u64 time_running; /* if PERF_FORMAT_TOTAL_TIME_RUNNING */ + u64 id; /* if PERF_FORMAT_ID */ + u64 lost; /* if PERF_FORMAT_LOST */ +}; +.EE +.in +.PP +The values read are as follows: +.TP +.I nr +The number of events in this file descriptor. +Available only if +.B PERF_FORMAT_GROUP +was specified. +.TP +.IR time_enabled ", " time_running +Total time the event was enabled and running. +Normally these values are the same. +Multiplexing happens if the number of events is more than the +number of available PMU counter slots. +In that case the events run only part of the time and the +.I time_enabled +and +.I time_running +values can be used to scale an estimated value for the count. +.TP +.I value +An unsigned 64-bit value containing the counter result. +.TP +.I id +A globally unique value for this particular event; only present if +.B PERF_FORMAT_ID +was specified in +.IR read_format . +.TP +.I lost +The number of lost samples of this event; +only present if +.B PERF_FORMAT_LOST +was specified in +.IR read_format . +.SS MMAP layout +When using +.BR perf_event_open () +in sampled mode, asynchronous events +(like counter overflow or +.B PROT_EXEC +mmap tracking) +are logged into a ring-buffer. +This ring-buffer is created and accessed through +.BR mmap (2). 
+.PP +The mmap size should be 1+2\[ha]n pages, where the first page is a +metadata page +.RI ( "struct perf_event_mmap_page" ) +that contains various +bits of information such as where the ring-buffer head is. +.PP +Before Linux 2.6.39, there is a bug that means you must allocate an mmap +ring buffer when sampling even if you do not plan to access it. +.PP +The structure of the first metadata mmap page is as follows: +.PP +.in +4n +.EX +struct perf_event_mmap_page { + __u32 version; /* version number of this structure */ + __u32 compat_version; /* lowest version this is compat with */ + __u32 lock; /* seqlock for synchronization */ + __u32 index; /* hardware counter identifier */ + __s64 offset; /* add to hardware counter value */ + __u64 time_enabled; /* time event active */ + __u64 time_running; /* time event on CPU */ + union { + __u64 capabilities; + struct { + __u64 cap_usr_time / cap_usr_rdpmc / cap_bit0 : 1, + cap_bit0_is_deprecated : 1, + cap_user_rdpmc : 1, + cap_user_time : 1, + cap_user_time_zero : 1, + }; + }; + __u16 pmc_width; + __u16 time_shift; + __u32 time_mult; + __u64 time_offset; + __u64 __reserved[120]; /* Pad to 1 k */ + __u64 data_head; /* head in the data section */ + __u64 data_tail; /* user\-space written tail */ + __u64 data_offset; /* where the buffer starts */ + __u64 data_size; /* data buffer size */ + __u64 aux_head; + __u64 aux_tail; + __u64 aux_offset; + __u64 aux_size; +\& +} +.EE +.in +.PP +The following list describes the fields in the +.I perf_event_mmap_page +structure in more detail: +.TP +.I version +Version number of this structure. +.TP +.I compat_version +The lowest version this is compatible with. +.TP +.I lock +A seqlock for synchronization. +.TP +.I index +A unique hardware counter identifier. +.TP +.I offset +When using rdpmc for reads this offset value +must be added to the one returned by rdpmc to get +the current total event count. +.TP +.I time_enabled +Time the event was active. 
+.TP +.I time_running +Time the event was running. +.TP +.IR cap_usr_time " / " cap_usr_rdpmc " / " cap_bit0 " (since Linux 3.4)" +.\" commit c7206205d00ab375839bd6c7ddb247d600693c09 +There was a bug in the definition of +.I cap_usr_time +and +.I cap_usr_rdpmc +from Linux 3.4 until Linux 3.11. +Both bits were defined to point to the same location, so it was +impossible to know if +.I cap_usr_time +or +.I cap_usr_rdpmc +were actually set. +.IP +Starting with Linux 3.12, these are renamed to +.\" commit fa7315871046b9a4c48627905691dbde57e51033 +.I cap_bit0 +and you should use the +.I cap_user_time +and +.I cap_user_rdpmc +fields instead. +.TP +.IR cap_bit0_is_deprecated " (since Linux 3.12)" +.\" commit fa7315871046b9a4c48627905691dbde57e51033 +If set, this bit indicates that the kernel supports +the properly separated +.I cap_user_time +and +.I cap_user_rdpmc +bits. +.IP +If not-set, it indicates an older kernel where +.I cap_usr_time +and +.I cap_usr_rdpmc +map to the same bit and thus both features should +be used with caution. 
+.TP +.IR cap_user_rdpmc " (since Linux 3.12)" +.\" commit fa7315871046b9a4c48627905691dbde57e51033 +If the hardware supports user-space read of performance counters +without syscall (this is the "rdpmc" instruction on x86), then +the following code can be used to do a read: +.IP +.in +4n +.EX +u32 seq, time_mult, time_shift, idx, width; +u64 count, enabled, running; +u64 cyc, time_offset; +\& +do { + seq = pc\->lock; + barrier(); + enabled = pc\->time_enabled; + running = pc\->time_running; +\& + if (pc\->cap_usr_time && enabled != running) { + cyc = rdtsc(); + time_offset = pc\->time_offset; + time_mult = pc\->time_mult; + time_shift = pc\->time_shift; + } +\& + idx = pc\->index; + count = pc\->offset; +\& + if (pc\->cap_usr_rdpmc && idx) { + width = pc\->pmc_width; + count += rdpmc(idx \- 1); + } +\& + barrier(); +} while (pc\->lock != seq); +.EE +.in +.TP +.IR cap_user_time " (since Linux 3.12)" +.\" commit fa7315871046b9a4c48627905691dbde57e51033 +This bit indicates the hardware has a constant, nonstop +timestamp counter (TSC on x86). +.TP +.IR cap_user_time_zero " (since Linux 3.12)" +.\" commit fa7315871046b9a4c48627905691dbde57e51033 +Indicates the presence of +.I time_zero +which allows mapping timestamp values to +the hardware clock. +.TP +.I pmc_width +If +.IR cap_usr_rdpmc , +this field provides the bit-width of the value +read using the rdpmc or equivalent instruction. +This can be used to sign extend the result like: +.IP +.in +4n +.EX +pmc <<= 64 \- pmc_width; +pmc >>= 64 \- pmc_width; // signed shift right +count += pmc; +.EE +.in +.TP +.IR time_shift ", " time_mult ", " time_offset +.IP +If +.IR cap_usr_time , +these fields can be used to compute the time +delta since +.I time_enabled +(in nanoseconds) using rdtsc or similar. 
+.IP +.in +4n +.EX +u64 quot, rem; +u64 delta; +\& +quot = cyc >> time_shift; +rem = cyc & (((u64)1 << time_shift) \- 1); +delta = time_offset + quot * time_mult + + ((rem * time_mult) >> time_shift); +.EE +.in +.IP +Where +.IR time_offset , +.IR time_mult , +.IR time_shift , +and +.I cyc +are read in the +seqcount loop described above. +This delta can then be added to +enabled and possibly running (if idx), improving the scaling: +.IP +.in +4n +.EX +enabled += delta; +if (idx) + running += delta; +quot = count / running; +rem = count % running; +count = quot * enabled + (rem * enabled) / running; +.EE +.in +.TP +.IR time_zero " (since Linux 3.12)" +.\" commit fa7315871046b9a4c48627905691dbde57e51033 +.IP +If +.I cap_user_time_zero +is set, then the hardware clock (the TSC timestamp counter on x86) +can be calculated from the +.IR time_zero , +.IR time_mult , +and +.I time_shift +values: +.IP +.in +4n +.EX +time = timestamp \- time_zero; +quot = time / time_mult; +rem = time % time_mult; +cyc = (quot << time_shift) + (rem << time_shift) / time_mult; +.EE +.in +.IP +And vice versa: +.IP +.in +4n +.EX +quot = cyc >> time_shift; +rem = cyc & (((u64)1 << time_shift) \- 1); +timestamp = time_zero + quot * time_mult + + ((rem * time_mult) >> time_shift); +.EE +.in +.TP +.I data_head +This points to the head of the data section. +The value continuously increases, it does not wrap. +The value needs to be manually wrapped by the size of the mmap buffer +before accessing the samples. +.IP +On SMP-capable platforms, after reading the +.I data_head +value, +user space should issue an rmb(). +.TP +.I data_tail +When the mapping is +.BR PROT_WRITE , +the +.I data_tail +value should be written by user space to reflect the last read data. +In this case, the kernel will not overwrite unread data. +.TP +.IR data_offset " (since Linux 4.1)" +.\" commit e8c6deac69629c0cb97c3d3272f8631ef17f8f0f +Contains the offset of the location in the mmap buffer +where perf sample data begins. 
+.TP +.IR data_size " (since Linux 4.1)" +.\" commit e8c6deac69629c0cb97c3d3272f8631ef17f8f0f +Contains the size of the perf sample region within +the mmap buffer. +.TP +.IR aux_head ", " aux_tail ", " aux_offset ", " aux_size " (since Linux 4.1)" +.\" commit 45bfb2e50471abbbfd83d40d28c986078b0d24ff +The AUX region allows +.BR mmap (2)-ing +a separate sample buffer for +high-bandwidth data streams (separate from the main perf sample buffer). +An example of a high-bandwidth stream is instruction tracing support, +as is found in newer Intel processors. +.IP +To set up an AUX area, first +.I aux_offset +needs to be set with an offset greater than +.IR data_offset + data_size +and +.I aux_size +needs to be set to the desired buffer size. +The desired offset and size must be page aligned, and the size +must be a power of two. +These values are then passed to mmap in order to map the AUX buffer. +Pages in the AUX buffer are included as part of the +.B RLIMIT_MEMLOCK +resource limit (see +.BR setrlimit (2)), +and also as part of the +.I perf_event_mlock_kb +allowance. +.IP +By default, the AUX buffer will be truncated if it will not fit +in the available space in the ring buffer. +If the AUX buffer is mapped as a read only buffer, then it will +operate in ring buffer mode where old data will be overwritten +by new. +In overwrite mode, it might not be possible to infer where the +new data began, and it is the consumer's job to disable +measurement while reading to avoid possible data races. +.IP +The +.I aux_head +and +.I aux_tail +ring buffer pointers have the same behavior and ordering +rules as the previously described +.I data_head +and +.IR data_tail . +.PP +The following 2^n ring-buffer pages have the layout described below. 
+.PP +If +.I perf_event_attr.sample_id_all +is set, then all event types will +have the sample_type selected fields related to where/when (identity) +an event took place (TID, TIME, ID, CPU, STREAM_ID) described in +.B PERF_RECORD_SAMPLE +below, it will be stashed just after the +.I perf_event_header +and the fields already present for the existing +fields, that is, at the end of the payload. +This allows a newer perf.data +file to be supported by older perf tools, with the new optional +fields being ignored. +.PP +The mmap values start with a header: +.PP +.in +4n +.EX +struct perf_event_header { + __u32 type; + __u16 misc; + __u16 size; +}; +.EE +.in +.PP +Below, we describe the +.I perf_event_header +fields in more detail. +For ease of reading, +the fields with shorter descriptions are presented first. +.TP +.I size +This indicates the size of the record. +.TP +.I misc +The +.I misc +field contains additional information about the sample. +.IP +The CPU mode can be determined from this value by masking with +.B PERF_RECORD_MISC_CPUMODE_MASK +and looking for one of the following (note these are not +bit masks, only one can be set at a time): +.RS +.TP +.B PERF_RECORD_MISC_CPUMODE_UNKNOWN +Unknown CPU mode. +.TP +.B PERF_RECORD_MISC_KERNEL +Sample happened in the kernel. +.TP +.B PERF_RECORD_MISC_USER +Sample happened in user code. +.TP +.B PERF_RECORD_MISC_HYPERVISOR +Sample happened in the hypervisor. +.TP +.BR PERF_RECORD_MISC_GUEST_KERNEL " (since Linux 2.6.35)" +.\" commit 39447b386c846bbf1c56f6403c5282837486200f +Sample happened in the guest kernel. +.TP +.BR PERF_RECORD_MISC_GUEST_USER " (since Linux 2.6.35)" +.\" commit 39447b386c846bbf1c56f6403c5282837486200f +Sample happened in guest user code. 
+.RE +.PP +.RS +Since the following three statuses are generated by +different record types, they alias to the same bit: +.TP +.BR PERF_RECORD_MISC_MMAP_DATA " (since Linux 3.10)" +.\" commit 2fe85427e3bf65d791700d065132772fc26e4d75 +This is set when the mapping is not executable; +otherwise the mapping is executable. +.TP +.BR PERF_RECORD_MISC_COMM_EXEC " (since Linux 3.16)" +.\" commit 82b897782d10fcc4930c9d4a15b175348fdd2871 +This is set for a +.B PERF_RECORD_COMM +record on kernels more recent than Linux 3.16 +if a process name change was caused by an +.BR execve (2) +system call. +.TP +.BR PERF_RECORD_MISC_SWITCH_OUT " (since Linux 4.3)" +.\" commit 45ac1403f564f411c6a383a2448688ba8dd705a4 +When a +.B PERF_RECORD_SWITCH +or +.B PERF_RECORD_SWITCH_CPU_WIDE +record is generated, this bit indicates that the +context switch is away from the current process +(instead of into the current process). +.RE +.PP +.RS +In addition, the following bits can be set: +.TP +.B PERF_RECORD_MISC_EXACT_IP +This indicates that the content of +.B PERF_SAMPLE_IP +points +to the actual instruction that triggered the event. +See also +.IR perf_event_attr.precise_ip . +.TP +.BR PERF_RECORD_MISC_SWITCH_OUT_PREEMPT " (since Linux 4.17)" +.\" commit 101592b4904ecf6b8ed2a4784d41d180319d95a1 +When a +.B PERF_RECORD_SWITCH +or +.B PERF_RECORD_SWITCH_CPU_WIDE +record is generated, +this indicates the context switch was a preemption. +.TP +.BR PERF_RECORD_MISC_MMAP_BUILD_ID " (since Linux 5.12)" +.\" commit 88a16a1309333e43d328621ece3e9fa37027e8eb +This indicates that the content of +.B PERF_RECORD_MMAP2 +contains build-ID data instead of device major and minor numbers +as well as the inode number. +.TP +.BR PERF_RECORD_MISC_EXT_RESERVED " (since Linux 2.6.35)" +.\" commit 1676b8a077c352085d52578fb4f29350b58b6e74 +This indicates there is extended data available (currently not used). 
+.TP +.B PERF_RECORD_MISC_PROC_MAP_PARSE_TIMEOUT +.\" commit 930e6fcd2bcce9bcd9d4aa7e755678d33f3fe6f4 +This bit is not set by the kernel. +It is reserved for the user-space perf utility to indicate that +.IR /proc/ pid /maps +parsing was taking too long and was stopped, and thus the mmap +records may be truncated. +.RE +.TP +.I type +The +.I type +value is one of the below. +The values in the corresponding record (that follows the header) +depend on the +.I type +selected as shown. +.RS +.TP 4 +.B PERF_RECORD_MMAP +The MMAP events record the +.B PROT_EXEC +mappings so that we can correlate +user-space IPs to code. +They have the following structure: +.IP +.in +4n +.EX +struct { + struct perf_event_header header; + u32 pid, tid; + u64 addr; + u64 len; + u64 pgoff; + char filename[]; +}; +.EE +.in +.RS +.TP +.I pid +is the process ID. +.TP +.I tid +is the thread ID. +.TP +.I addr +is the address of the allocated memory. +.TP +.I len +is the length of the allocated memory. +.TP +.I pgoff +is the page offset of the allocated memory. +.TP +.I filename +is a string describing the backing of the allocated memory. +.RE +.TP +.B PERF_RECORD_LOST +This record indicates when events are lost. +.IP +.in +4n +.EX +struct { + struct perf_event_header header; + u64 id; + u64 lost; + struct sample_id sample_id; +}; +.EE +.in +.RS +.TP +.I id +is the unique event ID for the samples that were lost. +.TP +.I lost +is the number of events that were lost. +.RE +.TP +.B PERF_RECORD_COMM +This record indicates a change in the process name. +.IP +.in +4n +.EX +struct { + struct perf_event_header header; + u32 pid; + u32 tid; + char comm[]; + struct sample_id sample_id; +}; +.EE +.in +.RS +.TP +.I pid +is the process ID. +.TP +.I tid +is the thread ID. +.TP +.I comm +is a string containing the new name of the process. +.RE +.TP +.B PERF_RECORD_EXIT +This record indicates a process exit event. 
+.IP +.in +4n +.EX +struct { + struct perf_event_header header; + u32 pid, ppid; + u32 tid, ptid; + u64 time; + struct sample_id sample_id; +}; +.EE +.in +.TP +.BR PERF_RECORD_THROTTLE ", " PERF_RECORD_UNTHROTTLE +This record indicates a throttle/unthrottle event. +.IP +.in +4n +.EX +struct { + struct perf_event_header header; + u64 time; + u64 id; + u64 stream_id; + struct sample_id sample_id; +}; +.EE +.in +.TP +.B PERF_RECORD_FORK +This record indicates a fork event. +.IP +.in +4n +.EX +struct { + struct perf_event_header header; + u32 pid, ppid; + u32 tid, ptid; + u64 time; + struct sample_id sample_id; +}; +.EE +.in +.TP +.B PERF_RECORD_READ +This record indicates a read event. +.IP +.in +4n +.EX +struct { + struct perf_event_header header; + u32 pid, tid; + struct read_format values; + struct sample_id sample_id; +}; +.EE +.in +.TP +.B PERF_RECORD_SAMPLE +This record indicates a sample. +.IP +.in +4n +.EX +struct { + struct perf_event_header header; + u64 sample_id; /* if PERF_SAMPLE_IDENTIFIER */ + u64 ip; /* if PERF_SAMPLE_IP */ + u32 pid, tid; /* if PERF_SAMPLE_TID */ + u64 time; /* if PERF_SAMPLE_TIME */ + u64 addr; /* if PERF_SAMPLE_ADDR */ + u64 id; /* if PERF_SAMPLE_ID */ + u64 stream_id; /* if PERF_SAMPLE_STREAM_ID */ + u32 cpu, res; /* if PERF_SAMPLE_CPU */ + u64 period; /* if PERF_SAMPLE_PERIOD */ + struct read_format v; + /* if PERF_SAMPLE_READ */ + u64 nr; /* if PERF_SAMPLE_CALLCHAIN */ + u64 ips[nr]; /* if PERF_SAMPLE_CALLCHAIN */ + u32 size; /* if PERF_SAMPLE_RAW */ + char data[size]; /* if PERF_SAMPLE_RAW */ + u64 bnr; /* if PERF_SAMPLE_BRANCH_STACK */ + struct perf_branch_entry lbr[bnr]; + /* if PERF_SAMPLE_BRANCH_STACK */ + u64 abi; /* if PERF_SAMPLE_REGS_USER */ + u64 regs[weight(mask)]; + /* if PERF_SAMPLE_REGS_USER */ + u64 size; /* if PERF_SAMPLE_STACK_USER */ + char data[size]; /* if PERF_SAMPLE_STACK_USER */ + u64 dyn_size; /* if PERF_SAMPLE_STACK_USER && + size != 0 */ + union perf_sample_weight weight; + /* if PERF_SAMPLE_WEIGHT */ + 
/* || PERF_SAMPLE_WEIGHT_STRUCT */ + u64 data_src; /* if PERF_SAMPLE_DATA_SRC */ + u64 transaction; /* if PERF_SAMPLE_TRANSACTION */ + u64 abi; /* if PERF_SAMPLE_REGS_INTR */ + u64 regs[weight(mask)]; + /* if PERF_SAMPLE_REGS_INTR */ + u64 phys_addr; /* if PERF_SAMPLE_PHYS_ADDR */ + u64 cgroup; /* if PERF_SAMPLE_CGROUP */ + u64 data_page_size; + /* if PERF_SAMPLE_DATA_PAGE_SIZE */ + u64 code_page_size; + /* if PERF_SAMPLE_CODE_PAGE_SIZE */ + u64 size; /* if PERF_SAMPLE_AUX */ + char data[size]; /* if PERF_SAMPLE_AUX */ +}; +.EE +.in +.RS 4 +.TP 4 +.I sample_id +If +.B PERF_SAMPLE_IDENTIFIER +is enabled, a 64-bit unique ID is included. +This is a duplication of the +.B PERF_SAMPLE_ID +.I id +value, but included at the beginning of the sample +so parsers can easily obtain the value. +.TP +.I ip +If +.B PERF_SAMPLE_IP +is enabled, then a 64-bit instruction +pointer value is included. +.TP +.IR pid ", " tid +If +.B PERF_SAMPLE_TID +is enabled, then a 32-bit process ID +and 32-bit thread ID are included. +.TP +.I time +If +.B PERF_SAMPLE_TIME +is enabled, then a 64-bit timestamp +is included. +This is obtained via local_clock() which is a hardware timestamp +if available and the jiffies value if not. +.TP +.I addr +If +.B PERF_SAMPLE_ADDR +is enabled, then a 64-bit address is included. +This is usually the address of a tracepoint, +breakpoint, or software event; otherwise the value is 0. +.TP +.I id +If +.B PERF_SAMPLE_ID +is enabled, a 64-bit unique ID is included. +If the event is a member of an event group, the group leader ID is returned. +This ID is the same as the one returned by +.BR PERF_FORMAT_ID . +.TP +.I stream_id +If +.B PERF_SAMPLE_STREAM_ID +is enabled, a 64-bit unique ID is included. +Unlike +.B PERF_SAMPLE_ID +the actual ID is returned, not the group leader. +This ID is the same as the one returned by +.BR PERF_FORMAT_ID . 
+.TP +.IR cpu ", " res +If +.B PERF_SAMPLE_CPU +is enabled, this is a 32-bit value indicating +which CPU was being used, in addition to a reserved (unused) +32-bit value. +.TP +.I period +If +.B PERF_SAMPLE_PERIOD +is enabled, a 64-bit value indicating +the current sampling period is written. +.TP +.I v +If +.B PERF_SAMPLE_READ +is enabled, a structure of type read_format +is included which has values for all events in the event group. +The values included depend on the +.I read_format +value used at +.BR perf_event_open () +time. +.TP +.IR nr ", " ips[nr] +If +.B PERF_SAMPLE_CALLCHAIN +is enabled, then a 64-bit number is included +which indicates how many 64-bit instruction pointers +follow. +This is the current callchain. +.TP +.IR size ", " data[size] +If +.B PERF_SAMPLE_RAW +is enabled, then a 32-bit value indicating size +is included followed by an array of 8-bit values of length size. +The values are padded with 0 to have 64-bit alignment. +.IP +This RAW record data is opaque with respect to the ABI. +The ABI doesn't make any promises with respect to the stability +of its content; it may vary depending +on event, hardware, and kernel version. +.TP +.IR bnr ", " lbr[bnr] +If +.B PERF_SAMPLE_BRANCH_STACK +is enabled, then a 64-bit value indicating +the number of records is included, followed by +.I bnr +.I perf_branch_entry +structures which each include the fields: +.RS +.TP +.I from +This indicates the source instruction (may not be a branch). +.TP +.I to +The branch target. +.TP +.I mispred +The branch target was mispredicted. +.TP +.I predicted +The branch target was predicted. +.TP +.IR in_tx " (since Linux 3.11)" +.\" commit 135c5612c460f89657c4698fe2ea753f6f667963 +The branch was in a transactional memory transaction. +.TP +.IR abort " (since Linux 3.11)" +.\" commit 135c5612c460f89657c4698fe2ea753f6f667963 +The branch was in an aborted transactional memory transaction.
+.TP +.IR cycles " (since Linux 4.3)" +.\" commit 71ef3c6b9d4665ee7afbbe4c208a98917dcfc32f +This reports the number of cycles elapsed since the +previous branch stack update. +.PP +The entries are from most to least recent, so the first entry +has the most recent branch. +.PP +Support for +.IR mispred , +.IR predicted , +and +.I cycles +is optional; if not supported, those +values will be 0. +.PP +The type of branches recorded is specified by the +.I branch_sample_type +field. +.RE +.TP +.IR abi ", " regs[weight(mask)] +If +.B PERF_SAMPLE_REGS_USER +is enabled, then the user CPU registers are recorded. +.IP +The +.I abi +field is one of +.BR PERF_SAMPLE_REGS_ABI_NONE , +.BR PERF_SAMPLE_REGS_ABI_32 , +or +.BR PERF_SAMPLE_REGS_ABI_64 . +.IP +The +.I regs +field is an array of the CPU registers that were specified by +the +.I sample_regs_user +attr field. +The number of values is the number of bits set in the +.I sample_regs_user +bit mask. +.TP +.IR size ", " data[size] ", " dyn_size +If +.B PERF_SAMPLE_STACK_USER +is enabled, then the user stack is recorded. +This can be used to generate stack backtraces. +.I size +is the size requested by the user in +.I sample_stack_user +or else the maximum record size. +.I data +is the stack data (a raw dump of the memory pointed to by the +stack pointer at the time of sampling). +.I dyn_size +is the amount of data actually dumped (can be less than +.IR size ). +Note that +.I dyn_size +is omitted if +.I size +is 0. +.TP +.I weight +If +.B PERF_SAMPLE_WEIGHT +or +.B PERF_SAMPLE_WEIGHT_STRUCT +is enabled, then a 64-bit value provided by the hardware +is recorded that indicates how costly the event was. +This allows expensive events to stand out more clearly +in profiles. 
+.TP +.I data_src +If +.B PERF_SAMPLE_DATA_SRC +is enabled, then a 64-bit value is recorded that is made up of +the following fields: +.RS +.TP 4 +.I mem_op +Type of opcode, a bitwise combination of: +.IP +.PD 0 +.RS +.TP 24 +.B PERF_MEM_OP_NA +Not available +.TP +.B PERF_MEM_OP_LOAD +Load instruction +.TP +.B PERF_MEM_OP_STORE +Store instruction +.TP +.B PERF_MEM_OP_PFETCH +Prefetch +.TP +.B PERF_MEM_OP_EXEC +Executable code +.RE +.PD +.TP +.I mem_lvl +Memory hierarchy level hit or miss, a bitwise combination of +the following, shifted left by +.BR PERF_MEM_LVL_SHIFT : +.IP +.PD 0 +.RS +.TP 24 +.B PERF_MEM_LVL_NA +Not available +.TP +.B PERF_MEM_LVL_HIT +Hit +.TP +.B PERF_MEM_LVL_MISS +Miss +.TP +.B PERF_MEM_LVL_L1 +Level 1 cache +.TP +.B PERF_MEM_LVL_LFB +Line fill buffer +.TP +.B PERF_MEM_LVL_L2 +Level 2 cache +.TP +.B PERF_MEM_LVL_L3 +Level 3 cache +.TP +.B PERF_MEM_LVL_LOC_RAM +Local DRAM +.TP +.B PERF_MEM_LVL_REM_RAM1 +Remote DRAM 1 hop +.TP +.B PERF_MEM_LVL_REM_RAM2 +Remote DRAM 2 hops +.TP +.B PERF_MEM_LVL_REM_CCE1 +Remote cache 1 hop +.TP +.B PERF_MEM_LVL_REM_CCE2 +Remote cache 2 hops +.TP +.B PERF_MEM_LVL_IO +I/O memory +.TP +.B PERF_MEM_LVL_UNC +Uncached memory +.RE +.PD +.TP +.I mem_snoop +Snoop mode, a bitwise combination of the following, shifted left by +.BR PERF_MEM_SNOOP_SHIFT : +.IP +.PD 0 +.RS +.TP 24 +.B PERF_MEM_SNOOP_NA +Not available +.TP +.B PERF_MEM_SNOOP_NONE +No snoop +.TP +.B PERF_MEM_SNOOP_HIT +Snoop hit +.TP +.B PERF_MEM_SNOOP_MISS +Snoop miss +.TP +.B PERF_MEM_SNOOP_HITM +Snoop hit modified +.RE +.PD +.TP +.I mem_lock +Lock instruction, a bitwise combination of the following, shifted left by +.BR PERF_MEM_LOCK_SHIFT : +.IP +.PD 0 +.RS +.TP 24 +.B PERF_MEM_LOCK_NA +Not available +.TP +.B PERF_MEM_LOCK_LOCKED +Locked transaction +.RE +.PD +.TP +.I mem_dtlb +TLB access hit or miss, a bitwise combination of the following, shifted +left by +.BR PERF_MEM_TLB_SHIFT : +.IP +.PD 0 +.RS +.TP 24 +.B PERF_MEM_TLB_NA +Not available +.TP +.B 
PERF_MEM_TLB_HIT +Hit +.TP +.B PERF_MEM_TLB_MISS +Miss +.TP +.B PERF_MEM_TLB_L1 +Level 1 TLB +.TP +.B PERF_MEM_TLB_L2 +Level 2 TLB +.TP +.B PERF_MEM_TLB_WK +Hardware walker +.TP +.B PERF_MEM_TLB_OS +OS fault handler +.RE +.PD +.RE +.TP +.I transaction +If the +.B PERF_SAMPLE_TRANSACTION +flag is set, then a 64-bit field is recorded describing +the sources of any transactional memory aborts. +.IP +The field is a bitwise combination of the following values: +.RS +.TP +.B PERF_TXN_ELISION +Abort from an elision type transaction (Intel-CPU-specific). +.TP +.B PERF_TXN_TRANSACTION +Abort from a generic transaction. +.TP +.B PERF_TXN_SYNC +Synchronous abort (related to the reported instruction). +.TP +.B PERF_TXN_ASYNC +Asynchronous abort (not related to the reported instruction). +.TP +.B PERF_TXN_RETRY +Retryable abort (retrying the transaction may have succeeded). +.TP +.B PERF_TXN_CONFLICT +Abort due to memory conflicts with other threads. +.TP +.B PERF_TXN_CAPACITY_WRITE +Abort due to write capacity overflow. +.TP +.B PERF_TXN_CAPACITY_READ +Abort due to read capacity overflow. +.RE +.IP +In addition, a user-specified abort code can be obtained from +the high 32 bits of the field by shifting right by +.B PERF_TXN_ABORT_SHIFT +and masking with the value +.BR PERF_TXN_ABORT_MASK . +.TP +.IR abi ", " regs[weight(mask)] +If +.B PERF_SAMPLE_REGS_INTR +is enabled, then the CPU registers at the time of the interrupt are recorded. +.IP +The +.I abi +field is one of +.BR PERF_SAMPLE_REGS_ABI_NONE , +.BR PERF_SAMPLE_REGS_ABI_32 , +or +.BR PERF_SAMPLE_REGS_ABI_64 . +.IP +The +.I regs +field is an array of the CPU registers that were specified by +the +.I sample_regs_intr +attr field. +The number of values is the number of bits set in the +.I sample_regs_intr +bit mask. +.TP +.I phys_addr +If the +.B PERF_SAMPLE_PHYS_ADDR +flag is set, then the 64-bit physical address is recorded.
+.TP +.I cgroup +If the +.B PERF_SAMPLE_CGROUP +flag is set, +then the 64-bit cgroup ID (for the perf_event subsystem) is recorded. +To get the pathname of the cgroup, the ID should match to one in a +.BR PERF_RECORD_CGROUP . +.TP +.I data_page_size +If the +.B PERF_SAMPLE_DATA_PAGE_SIZE +flag is set, +then the 64-bit page size value of the +.B data +address is recorded. +.TP +.I code_page_size +If the +.B PERF_SAMPLE_CODE_PAGE_SIZE +flag is set, +then the 64-bit page size value of the +.B ip +address is recorded. +.TP +.I size +.TQ +.IR data [ size ] +If +.B PERF_SAMPLE_AUX +is enabled, +a snapshot of the aux buffer is recorded. +.RE +.TP +.B PERF_RECORD_MMAP2 +This record includes extended information on +.BR mmap (2) +calls returning executable mappings. +The format is similar to that of the +.B PERF_RECORD_MMAP +record, but includes extra values that allow uniquely identifying +shared mappings. +Depending on the +.B PERF_RECORD_MISC_MMAP_BUILD_ID +bit in the header, +the extra values have different layout and meanings. +.IP +.in +4n +.EX +struct { + struct perf_event_header header; + u32 pid; + u32 tid; + u64 addr; + u64 len; + u64 pgoff; + union { + struct { + u32 maj; + u32 min; + u64 ino; + u64 ino_generation; + }; + struct { /* if PERF_RECORD_MISC_MMAP_BUILD_ID */ + u8 build_id_size; + u8 __reserved_1; + u16 __reserved_2; + u8 build_id[20]; + }; + }; + u32 prot; + u32 flags; + char filename[]; + struct sample_id sample_id; +}; +.EE +.in +.RS +.TP +.I pid +is the process ID. +.TP +.I tid +is the thread ID. +.TP +.I addr +is the address of the allocated memory. +.TP +.I len +is the length of the allocated memory. +.TP +.I pgoff +is the page offset of the allocated memory. +.TP +.I maj +is the major ID of the underlying device. +.TP +.I min +is the minor ID of the underlying device. +.TP +.I ino +is the inode number. +.TP +.I ino_generation +is the inode generation. +.TP +.I build_id_size +is the actual size of +.I build_id +field (up to 20). 
+.TP +.I build_id +is raw data that identifies a binary. +.TP +.I prot +is the protection information. +.TP +.I flags +is the flags information. +.TP +.I filename +is a string describing the backing of the allocated memory. +.RE +.TP +.BR PERF_RECORD_AUX " (since Linux 4.1)" +.\" commit 68db7e98c3a6ebe7284b6cf14906ed7c55f3f7f0 +This record reports that new data is available in the separate +AUX buffer region. +.IP +.in +4n +.EX +struct { + struct perf_event_header header; + u64 aux_offset; + u64 aux_size; + u64 flags; + struct sample_id sample_id; +}; +.EE +.in +.RS +.TP +.I aux_offset +offset in the AUX mmap region where the new data begins. +.TP +.I aux_size +size of the data made available. +.TP +.I flags +describes the AUX update. +.RS +.TP +.B PERF_AUX_FLAG_TRUNCATED +if set, then the data returned was truncated to fit the available +buffer size. +.TP +.B PERF_AUX_FLAG_OVERWRITE +.\" commit 2023a0d2829e521fe6ad6b9907f3f90bfbf57142 +if set, then the data returned has overwritten previous data. +.RE +.RE +.TP +.BR PERF_RECORD_ITRACE_START " (since Linux 4.1)" +.\" ec0d7729bbaed4b9d2d3fada693278e13a3d1368 +This record indicates which process has initiated an instruction +trace event, allowing tools to properly correlate the instruction +addresses in the AUX buffer with the proper executable. +.IP +.in +4n +.EX +struct { + struct perf_event_header header; + u32 pid; + u32 tid; +}; +.EE +.in +.RS +.TP +.I pid +process ID of the thread starting an instruction trace. +.TP +.I tid +thread ID of the thread starting an instruction trace. +.RE +.TP +.BR PERF_RECORD_LOST_SAMPLES " (since Linux 4.2)" +.\" f38b0dbb491a6987e198aa6b428db8692a6480f8 +When using hardware sampling (such as Intel PEBS) this record +indicates some number of samples that may have been lost. +.IP +.in +4n +.EX +struct { + struct perf_event_header header; + u64 lost; + struct sample_id sample_id; +}; +.EE +.in +.RS +.TP +.I lost +the number of potentially lost samples.
+.RE +.TP +.BR PERF_RECORD_SWITCH " (since Linux 4.3)" +.\" commit 45ac1403f564f411c6a383a2448688ba8dd705a4 +This record indicates a context switch has happened. +The +.B PERF_RECORD_MISC_SWITCH_OUT +bit in the +.I misc +field indicates whether it was a context switch into +or away from the current process. +.IP +.in +4n +.EX +struct { + struct perf_event_header header; + struct sample_id sample_id; +}; +.EE +.in +.TP +.BR PERF_RECORD_SWITCH_CPU_WIDE " (since Linux 4.3)" +.\" commit 45ac1403f564f411c6a383a2448688ba8dd705a4 +As with +.B PERF_RECORD_SWITCH +this record indicates a context switch has happened, +but it only occurs when sampling in CPU-wide mode +and provides additional information on the process +being switched to/from. +The +.B PERF_RECORD_MISC_SWITCH_OUT +bit in the +.I misc +field indicates whether it was a context switch into +or away from the current process. +.IP +.in +4n +.EX +struct { + struct perf_event_header header; + u32 next_prev_pid; + u32 next_prev_tid; + struct sample_id sample_id; +}; +.EE +.in +.RS +.TP +.I next_prev_pid +The process ID of the previous (if switching in) +or next (if switching out) process on the CPU. +.TP +.I next_prev_tid +The thread ID of the previous (if switching in) +or next (if switching out) thread on the CPU. +.RE +.TP +.BR PERF_RECORD_NAMESPACES " (since Linux 4.11)" +.\" commit e422267322cd319e2695a535e47c5b1feeac45eb +This record includes various namespace information of a process. 
+.IP +.in +4n +.EX +struct { + struct perf_event_header header; + u32 pid; + u32 tid; + u64 nr_namespaces; + struct { u64 dev, inode; } [nr_namespaces]; + struct sample_id sample_id; +}; +.EE +.in +.RS +.TP +.I pid +is the process ID. +.TP +.I tid +is the thread ID. +.TP +.I nr_namespaces +is the number of namespaces in this record. +.RE +.IP +Each namespace has +.I dev +and +.I inode +fields and is recorded at a +fixed position, as listed below: +.RS +.TP +.BR NET_NS_INDEX = 0 +Network namespace +.TP +.BR UTS_NS_INDEX = 1 +UTS namespace +.TP +.BR IPC_NS_INDEX = 2 +IPC namespace +.TP +.BR PID_NS_INDEX = 3 +PID namespace +.TP +.BR USER_NS_INDEX = 4 +User namespace +.TP +.BR MNT_NS_INDEX = 5 +Mount namespace +.TP +.BR CGROUP_NS_INDEX = 6 +Cgroup namespace +.RE +.TP +.BR PERF_RECORD_KSYMBOL " (since Linux 5.0)" +.\" commit 76193a94522f1d4edf2447a536f3f796ce56343b +This record indicates kernel symbol register/unregister events. +.IP +.in +4n +.EX +struct { + struct perf_event_header header; + u64 addr; + u32 len; + u16 ksym_type; + u16 flags; + char name[]; + struct sample_id sample_id; +}; +.EE +.in +.RS +.TP +.I addr +is the address of the kernel symbol. +.TP +.I len +is the length of the kernel symbol. +.TP +.I ksym_type +is the type of the kernel symbol. +Currently the following types are available: +.RS +.TP +.B PERF_RECORD_KSYMBOL_TYPE_BPF +The kernel symbol is a BPF function. +.RE +.TP +.I flags +If the +.B PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER +is set, then this event is for unregistering the kernel symbol. +.RE +.TP +.BR PERF_RECORD_BPF_EVENT " (since Linux 5.0)" +.\" commit 6ee52e2a3fe4ea35520720736e6791df1fb67106 +This record indicates that a BPF program is loaded or unloaded.
+.IP +.in +4n +.EX +struct { + struct perf_event_header header; + u16 type; + u16 flags; + u32 id; + u8 tag[BPF_TAG_SIZE]; + struct sample_id sample_id; +}; +.EE +.in +.RS +.TP +.I type +is one of the following values: +.RS +.TP +.B PERF_BPF_EVENT_PROG_LOAD +A BPF program is loaded +.TP +.B PERF_BPF_EVENT_PROG_UNLOAD +A BPF program is unloaded +.RE +.TP +.I id +is the ID of the BPF program. +.TP +.I tag +is the tag of the BPF program. +Currently, +.B BPF_TAG_SIZE +is defined as 8. +.RE +.TP +.BR PERF_RECORD_CGROUP " (since Linux 5.7)" +.\" commit 96aaab686505c449e24d76e76507290dcc30e008 +This record indicates a new cgroup is created and activated. +.IP +.in +4n +.EX +struct { + struct perf_event_header header; + u64 id; + char path[]; + struct sample_id sample_id; +}; +.EE +.in +.RS +.TP +.I id +is the cgroup identifier. +This can be also retrieved by +.BR name_to_handle_at (2) +on the cgroup path (as a file handle). +.TP +.I path +is the path of the cgroup from the root. +.RE +.TP +.BR PERF_RECORD_TEXT_POKE " (since Linux 5.8)" +.\" commit e17d43b93e544f5016c0251d2074c15568d5d963 +This record indicates a change in the kernel text. +This includes addition and removal of the text +and the corresponding length is zero in this case. +.IP +.in +4n +.EX +struct { + struct perf_event_header header; + u64 addr; + u16 old_len; + u16 new_len; + u8 bytes[]; + struct sample_id sample_id; +}; +.EE +.in +.RS +.TP +.I addr +is the address of the change +.TP +.I old_len +is the old length +.TP +.I new_len +is the new length +.TP +.I bytes +contains old bytes immediately followed by new bytes. +.RE +.RE +.SS Overflow handling +Events can be set to notify when a threshold is crossed, +indicating an overflow. +Overflow conditions can be captured by monitoring the +event file descriptor with +.BR poll (2), +.BR select (2), +or +.BR epoll (7). 
+Alternatively, the overflow events can be captured via a signal handler, +by enabling I/O signaling on the file descriptor; see the discussion of the +.B F_SETOWN +and +.B F_SETSIG +operations in +.BR fcntl (2). +.PP +Overflows are generated only by sampling events +.RI ( sample_period +must have a nonzero value). +.PP +There are two ways to generate overflow notifications. +.PP +The first is to set a +.I wakeup_events +or +.I wakeup_watermark +value that will trigger if a certain number of samples +or bytes have been written to the mmap ring buffer. +In this case, +.B POLL_IN +is indicated. +.PP +The other way is by use of the +.B PERF_EVENT_IOC_REFRESH +ioctl. +This ioctl adds to a counter that decrements each time the event overflows. +When nonzero, +.B POLL_IN +is indicated, but +once the counter reaches 0 +.B POLL_HUP +is indicated and +the underlying event is disabled. +.PP +Refreshing an event group leader refreshes all siblings and +refreshing with a parameter of 0 currently enables infinite +refreshes; +these behaviors are unsupported and should not be relied on. +.\" See https://lkml.org/lkml/2011/5/24/337 +.PP +Starting with Linux 3.18, +.\" commit 179033b3e064d2cd3f5f9945e76b0a0f0fbf4883 +.B POLL_HUP +is indicated if the event being monitored is attached to a different +process and that process exits. +.SS rdpmc instruction +Starting with Linux 3.4 on x86, you can use the +.\" commit c7206205d00ab375839bd6c7ddb247d600693c09 +.I rdpmc +instruction to get low-latency reads without having to enter the kernel. +Note that using +.I rdpmc +is not necessarily faster than other methods for reading event values. +.PP +Support for this can be detected with the +.I cap_usr_rdpmc +field in the mmap page; documentation on how +to calculate event values can be found in that section. +.PP +Originally, when rdpmc support was enabled, any process (not just ones +with an active perf event) could use the rdpmc instruction to access +the counters.
+Starting with Linux 4.0, +.\" 7911d3f7af14a614617e38245fedf98a724e46a9 +rdpmc support is only allowed if an event is currently enabled +in a process's context. +To restore the old behavior, write the value 2 to +.IR /sys/devices/cpu/rdpmc . +.SS perf_event ioctl calls +Various ioctls act on +.BR perf_event_open () +file descriptors: +.TP +.B PERF_EVENT_IOC_ENABLE +This enables the individual event or event group specified by the +file descriptor argument. +.IP +If the +.B PERF_IOC_FLAG_GROUP +bit is set in the ioctl argument, then all events in a group are +enabled, even if the event specified is not the group leader +(but see BUGS). +.TP +.B PERF_EVENT_IOC_DISABLE +This disables the individual counter or event group specified by the +file descriptor argument. +.IP +Enabling or disabling the leader of a group enables or disables the +entire group; that is, while the group leader is disabled, none of the +counters in the group will count. +Enabling or disabling a member of a group other than the leader +affects only that counter; disabling a non-leader +stops that counter from counting but doesn't affect any other counter. +.IP +If the +.B PERF_IOC_FLAG_GROUP +bit is set in the ioctl argument, then all events in a group are +disabled, even if the event specified is not the group leader +(but see BUGS). +.TP +.B PERF_EVENT_IOC_REFRESH +Non-inherited overflow counters can use this +to enable a counter for a number of overflows specified by the argument, +after which it is disabled. +Subsequent calls of this ioctl add the argument value to the current +count. +An overflow notification with +.B POLL_IN +set will happen on each overflow until the +count reaches 0; when that happens a notification with +.B POLL_HUP +set is sent and the event is disabled. +Using an argument of 0 is considered undefined behavior. +.TP +.B PERF_EVENT_IOC_RESET +Reset the event count specified by the +file descriptor argument to zero. 
+This resets only the counts; there is no way to reset the +multiplexing +.I time_enabled +or +.I time_running +values. +.IP +If the +.B PERF_IOC_FLAG_GROUP +bit is set in the ioctl argument, then all events in a group are +reset, even if the event specified is not the group leader +(but see BUGS). +.TP +.B PERF_EVENT_IOC_PERIOD +This updates the overflow period for the event. +.IP +Since Linux 3.7 (on ARM) +.\" commit 3581fe0ef37ce12ac7a4f74831168352ae848edc +and Linux 3.14 (all other architectures), +.\" commit bad7192b842c83e580747ca57104dd51fe08c223 +the new period takes effect immediately. +On older kernels, the new period did not take effect until +after the next overflow. +.IP +The argument is a pointer to a 64-bit value containing the +desired new period. +.IP +Prior to Linux 2.6.36, +.\" commit ad0cf3478de8677f720ee06393b3147819568d6a +this ioctl always failed due to a bug +in the kernel. +.TP +.B PERF_EVENT_IOC_SET_OUTPUT +This tells the kernel to report event notifications to the specified +file descriptor rather than the default one. +The file descriptors must all be on the same CPU. +.IP +The argument specifies the desired file descriptor, or \-1 if +output should be ignored. +.TP +.BR PERF_EVENT_IOC_SET_FILTER " (since Linux 2.6.33)" +.\" commit 6fb2915df7f0747d9044da9dbff5b46dc2e20830 +This adds an ftrace filter to this event. +.IP +The argument is a pointer to the desired ftrace filter. +.TP +.BR PERF_EVENT_IOC_ID " (since Linux 3.12)" +.\" commit cf4957f17f2a89984915ea808876d9c82225b862 +This returns the event ID value for the given event file descriptor. +.IP +The argument is a pointer to a 64-bit unsigned integer +to hold the result. +.TP +.BR PERF_EVENT_IOC_SET_BPF " (since Linux 4.1)" +.\" commit 2541517c32be2531e0da59dfd7efc1ce844644f5 +This allows attaching a Berkeley Packet Filter (BPF) +program to an existing kprobe tracepoint event. +You need +.B CAP_PERFMON +(since Linux 5.8) or +.B CAP_SYS_ADMIN +privileges to use this ioctl. 
+.IP +The argument is a BPF program file descriptor that was created by +a previous +.BR bpf (2) +system call. +.TP +.BR PERF_EVENT_IOC_PAUSE_OUTPUT " (since Linux 4.7)" +.\" commit 86e7972f690c1017fd086cdfe53d8524e68c661c +This allows pausing and resuming the event's ring-buffer. +A paused ring-buffer does not prevent generation of samples, +but simply discards them. +The discarded samples are considered lost, and cause a +.B PERF_RECORD_LOST +sample to be generated when possible. +An overflow signal may still be triggered by the discarded sample +even though the ring-buffer remains empty. +.IP +The argument is an unsigned 32-bit integer. +A nonzero value pauses the ring-buffer, while a +zero value resumes the ring-buffer. +.TP +.BR PERF_EVENT_IOC_MODIFY_ATTRIBUTES " (since Linux 4.17)" +.\" commit 32ff77e8cc9e66cc4fb38098f64fd54cc8f54573 +This allows modifying an existing event without the overhead +of closing and reopening a new event. +Currently this is supported only for breakpoint events. +.IP +The argument is a pointer to a +.I perf_event_attr +structure containing the updated event settings. +.TP +.BR PERF_EVENT_IOC_QUERY_BPF " (since Linux 4.16)" +.\" commit f371b304f12e31fe30207c41ca7754564e0ea4dc +This allows querying which Berkeley Packet Filter (BPF) +programs are attached to an existing kprobe tracepoint. +You can only attach one BPF program per event, but you can +have multiple events attached to a tracepoint. +Querying this value on one tracepoint event returns the IDs +of all BPF programs in all events attached to the tracepoint. +You need +.B CAP_PERFMON +(since Linux 5.8) or +.B CAP_SYS_ADMIN +privileges to use this ioctl. +.IP +The argument is a pointer to a structure +.in +4n +.EX +struct perf_event_query_bpf { + __u32 ids_len; + __u32 prog_cnt; + __u32 ids[0]; +}; +.EE +.in +.IP +The +.I ids_len +field indicates the number of ids that can fit in the provided +.I ids +array.
+The +.I prog_cnt +value is filled in by the kernel with the number of attached +BPF programs. +The +.I ids +array is filled with the ID of each attached BPF program. +If there are more programs than will fit in the array, then the +kernel will return +.B ENOSPC +and +.I ids_len +will indicate the number of program IDs that were successfully copied. +.\" +.SS Using prctl(2) +A process can enable or disable all currently open event groups +using the +.BR prctl (2) +.B PR_TASK_PERF_EVENTS_ENABLE +and +.B PR_TASK_PERF_EVENTS_DISABLE +operations. +This applies only to events created locally by the calling process. +This does not apply to events created by other processes attached +to the calling process or inherited events from a parent process. +Only group leaders are enabled and disabled, +not any other members of the groups. +.SS perf_event related configuration files +Files in +.I /proc/sys/kernel/ +.RS 4 +.TP +.I /proc/sys/kernel/perf_event_paranoid +The +.I perf_event_paranoid +file can be set to restrict access to the performance counters. +.IP +.PD 0 +.RS +.TP +.B 2 +allow only user-space measurements (default since Linux 4.6). +.\" default changed in commit 0161028b7c8aebef64194d3d73e43bc3b53b5c66 +.TP +.B 1 +allow both kernel and user measurements (default before Linux 4.6). +.TP +.B 0 +allow access to CPU-specific data but not raw tracepoint samples. +.TP +.B \-1 +no restrictions. +.RE +.PD +.IP +The existence of the +.I perf_event_paranoid +file is the official method for determining if a kernel supports +.BR perf_event_open (). +.TP +.I /proc/sys/kernel/perf_event_max_sample_rate +This sets the maximum sample rate. +Setting this too high can allow +users to sample at a rate that impacts overall machine performance +and potentially lock up the machine. +The default value is +100000 (samples per second). 
+.TP +.I /proc/sys/kernel/perf_event_max_stack +.\" Introduced in c5dfd78eb79851e278b7973031b9ca363da87a7e +This file sets the maximum depth of stack frame entries reported +when generating a call trace. +.TP +.I /proc/sys/kernel/perf_event_mlock_kb +Maximum number of pages an unprivileged user can +.BR mlock (2). +The default is 516 (kB). +.RE +.PP +Files in +.I /sys/bus/event_source/devices/ +.PP +.RS 4 +Since Linux 2.6.34, the kernel supports having multiple PMUs +available for monitoring. +Information on how to program these PMUs can be found under +.IR /sys/bus/event_source/devices/ . +Each subdirectory corresponds to a different PMU. +.TP +.IR /sys/bus/event_source/devices/*/type " (since Linux 2.6.38)" +.\" commit abe43400579d5de0078c2d3a760e6598e183f871 +This contains an integer that can be used in the +.I type +field of +.I perf_event_attr +to indicate that you wish to use this PMU. +.TP +.IR /sys/bus/event_source/devices/cpu/rdpmc " (since Linux 3.4)" +.\" commit 0c9d42ed4cee2aa1dfc3a260b741baae8615744f +If this file is 1, then direct user-space access to the +performance counter registers is allowed via the rdpmc instruction. +This can be disabled by echoing 0 to the file. +.IP +As of Linux 4.0 +.\" a66734297f78707ce39d756b656bfae861d53f62 +.\" 7911d3f7af14a614617e38245fedf98a724e46a9 +the behavior has changed, so that 1 now means only allow access +to processes with active perf events, with 2 indicating the old +allow-anyone-access behavior. +.TP +.IR /sys/bus/event_source/devices/*/format/ " (since Linux 3.4)" +.\" commit 641cc938815dfd09f8fa1ec72deb814f0938ac33 +This subdirectory contains information on the architecture-specific +subfields available for programming the various +.I config +fields in the +.I perf_event_attr +struct. +.IP +The content of each file is the name of the config field, followed +by a colon, followed by a series of integer bit ranges separated by +commas. 
+For example, the file +.I event +may contain the value +.I config1:1,6\-10,44 +which indicates that event is an attribute that occupies bits 1,6\[en]10, and 44 +of +.IR perf_event_attr::config1 . +.TP +.IR /sys/bus/event_source/devices/*/events/ " (since Linux 3.4)" +.\" commit 641cc938815dfd09f8fa1ec72deb814f0938ac33 +This subdirectory contains files with predefined events. +The contents are strings describing the event settings +expressed in terms of the fields found in the previously mentioned +.I ./format/ +directory. +These are not necessarily complete lists of all events supported by +a PMU, but usually a subset of events deemed useful or interesting. +.IP +The content of each file is a list of attribute names +separated by commas. +Each entry has an optional value (either hex or decimal). +If no value is specified, then it is assumed to be a single-bit +field with a value of 1. +An example entry may look like this: +.IR event=0x2,inv,ldlat=3 . +.TP +.I /sys/bus/event_source/devices/*/uevent +This file is the standard kernel device interface +for injecting hotplug events. +.TP +.IR /sys/bus/event_source/devices/*/cpumask " (since Linux 3.7)" +.\" commit 314d9f63f385096580e9e2a06eaa0745d92fe4ac +The +.I cpumask +file contains a comma-separated list of integers that +indicate a representative CPU number for each socket (package) +on the motherboard. +This is needed when setting up uncore or northbridge events, as +those PMUs present socket-wide events. +.RE +.SH RETURN VALUE +On success, +.BR perf_event_open () +returns the new file descriptor. +On error, \-1 is returned and +.I errno +is set to indicate the error. +.SH ERRORS +The errors returned by +.BR perf_event_open () +can be inconsistent, and may +vary across processor architectures and performance monitoring units. 
+.TP +.B E2BIG +Returned if the +.I perf_event_attr +.I size +value is too small +(smaller than +.BR PERF_ATTR_SIZE_VER0 ), +too big (larger than the page size), +or larger than the kernel supports and the extra bytes are not zero. +When +.B E2BIG +is returned, the +.I perf_event_attr +.I size +field is overwritten by the kernel to be the size of the structure +it was expecting. +.TP +.B EACCES +Returned when the requested event requires +.B CAP_PERFMON +(since Linux 5.8) or +.B CAP_SYS_ADMIN +permissions (or a more permissive perf_event paranoid setting). +Some common cases where an unprivileged process +may encounter this error: +attaching to a process owned by a different user; +monitoring all processes on a given CPU (i.e., specifying the +.I pid +argument as \-1); +and not setting +.I exclude_kernel +when the paranoid setting requires it. +.TP +.B EBADF +Returned if the +.I group_fd +file descriptor is not valid, or, if +.B PERF_FLAG_PID_CGROUP +is set, +the cgroup file descriptor in +.I pid +is not valid. +.TP +.BR EBUSY " (since Linux 4.1)" +.\" bed5b25ad9c8a2f5d735ef0bc746ec870c01c1b0 +Returned if another event already has exclusive +access to the PMU. +.TP +.B EFAULT +Returned if the +.I attr +pointer points at an invalid memory address. +.TP +.B EINTR +Returned when trying to mix perf and ftrace handling +for a uprobe. +.TP +.B EINVAL +Returned if the specified event is invalid. +There are many possible reasons for this. +A not-exhaustive list: +.I sample_freq +is higher than the maximum setting; +the +.I cpu +to monitor does not exist; +.I read_format +is out of range; +.I sample_type +is out of range; +the +.I flags +value is out of range; +.I exclusive +or +.I pinned +set and the event is not a group leader; +the event +.I config +values are out of range or set reserved bits; +the generic event selected is not supported; or +there is not enough room to add the selected event. +.TP +.B EMFILE +Each opened event uses one file descriptor. 
+If a large number of events are opened, +the per-process limit on the number of open file descriptors will be reached, +and no more events can be created. +.TP +.B ENODEV +Returned when the event involves a feature not supported +by the current CPU. +.TP +.B ENOENT +Returned if the +.I type +setting is not valid. +This error is also returned for +some unsupported generic events. +.TP +.B ENOSPC +Prior to Linux 3.3, if there was not enough room for the event, +.\" commit aa2bc1ade59003a379ffc485d6da2d92ea3370a6 +.B ENOSPC +was returned. +In Linux 3.3, this was changed to +.BR EINVAL . +.B ENOSPC +is still returned if you try to add more breakpoint events +than supported by the hardware. +.TP +.B ENOSYS +Returned if +.B PERF_SAMPLE_STACK_USER +is set in +.I sample_type +and it is not supported by hardware. +.TP +.B EOPNOTSUPP +Returned if an event requiring a specific hardware feature is +requested but there is no hardware support. +This includes requesting low-skid events if not supported, +branch tracing if it is not available, sampling if no PMU +interrupt is available, and branch stacks for software events. +.TP +.BR EOVERFLOW " (since Linux 4.8)" +.\" 97c79a38cd454602645f0470ffb444b3b75ce574 +Returned if +.B PERF_SAMPLE_CALLCHAIN +is requested and +.I sample_max_stack +is larger than the maximum specified in +.IR /proc/sys/kernel/perf_event_max_stack . +.TP +.B EPERM +Returned on many (but not all) architectures when an unsupported +.IR exclude_hv ", " exclude_idle ", " exclude_user ", or " exclude_kernel +setting is specified. +.IP +It can also happen, as with +.BR EACCES , +when the requested event requires +.B CAP_PERFMON +(since Linux 5.8) or +.B CAP_SYS_ADMIN +permissions (or a more permissive perf_event paranoid setting). +This includes setting a breakpoint on a kernel address, +and (since Linux 3.13) setting a kernel function-trace tracepoint. 
+.\" commit a4e95fc2cbb31d70a65beffeaf8773f881328c34 +.TP +.B ESRCH +Returned if attempting to attach to a process that does not exist. +.SH STANDARDS +Linux. +.SH HISTORY +.BR perf_event_open () +was introduced in Linux 2.6.31 but was called +.\" commit 0793a61d4df8daeac6492dbf8d2f3e5713caae5e +.BR perf_counter_open (). +It was renamed in Linux 2.6.32. +.\" commit cdd6c482c9ff9c55475ee7392ec8f672eddb7be6 +.SH NOTES +The official way of knowing if +.BR perf_event_open () +support is enabled is checking +for the existence of the file +.IR /proc/sys/kernel/perf_event_paranoid . +.PP +.B CAP_PERFMON +capability (since Linux 5.8) provides a secure approach to +performance monitoring and observability operations in a system +according to the principle of least privilege (POSIX IEEE 1003.1e). +Accessing system performance monitoring and observability operations +using +.B CAP_PERFMON +rather than the much more powerful +.B CAP_SYS_ADMIN +excludes chances to misuse credentials and makes operations more secure. +.B CAP_SYS_ADMIN +usage for secure system performance monitoring and observability +is discouraged in favor of the +.B CAP_PERFMON +capability. +.SH BUGS +The +.B F_SETOWN_EX +option to +.BR fcntl (2) +is needed to properly get overflow signals in threads. +This was introduced in Linux 2.6.32. +.\" commit ba0a6c9f6fceed11c6a99e8326f0477fe383e6b5 +.PP +Prior to Linux 2.6.33 (at least for x86), +.\" commit b690081d4d3f6a23541493f1682835c3cd5c54a1 +the kernel did not check +if events could be scheduled together until read time. +The same happens on all known kernels if the NMI watchdog is enabled. +This means to see if a given set of events works you have to +.BR perf_event_open (), +start, then read before you know for sure you +can get valid measurements. +.PP +Prior to Linux 2.6.34, +.\" FIXME . cannot find a kernel commit for this one +event constraints were not enforced by the kernel.
+In that case, some events would silently return "0" if the kernel +scheduled them in an improper counter slot. +.PP +Prior to Linux 2.6.34, there was a bug when multiplexing where the +wrong results could be returned. +.\" commit 45e16a6834b6af098702e5ea6c9a40de42ff77d8 +.PP +Kernels from Linux 2.6.35 to Linux 2.6.39 can quickly crash the kernel if +"inherit" is enabled and many threads are started. +.\" commit 38b435b16c36b0d863efcf3f07b34a6fac9873fd +.PP +Prior to Linux 2.6.35, +.\" commit 050735b08ca8a016bbace4445fa025b88fee770b +.B PERF_FORMAT_GROUP +did not work with attached processes. +.PP +There is a bug in the kernel code between +Linux 2.6.36 and Linux 3.0 that ignores the +"watermark" field and acts as if a wakeup_event +was chosen if the union has a +nonzero value in it. +.\" commit 4ec8363dfc1451f8c8f86825731fe712798ada02 +.PP +From Linux 2.6.31 to Linux 3.4, the +.B PERF_IOC_FLAG_GROUP +ioctl argument was broken and would repeatedly operate +on the event specified rather than iterating across +all sibling events in a group. +.\" commit 724b6daa13e100067c30cfc4d1ad06629609dc4e +.PP +From Linux 3.4 to Linux 3.11, the mmap +.\" commit fa7315871046b9a4c48627905691dbde57e51033 +.I cap_usr_rdpmc +and +.I cap_usr_time +bits mapped to the same location. +Code should migrate to the new +.I cap_user_rdpmc +and +.I cap_user_time +fields instead. +.PP +Always double-check your results! +Various generalized events have had wrong values. +For example, retired branches measured +the wrong thing on AMD machines until Linux 2.6.35. +.\" commit f287d332ce835f77a4f5077d2c0ef1e3f9ea42d2 +.SH EXAMPLES +The following is a short example that measures the total +instruction count of a call to +.BR printf (3). 
+.PP +.\" SRC BEGIN (perf_event_open.c) +.EX +#include <linux/perf_event.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/ioctl.h> +#include <sys/syscall.h> +#include <unistd.h> +\& +static long +perf_event_open(struct perf_event_attr *hw_event, pid_t pid, + int cpu, int group_fd, unsigned long flags) +{ + int ret; +\& + ret = syscall(SYS_perf_event_open, hw_event, pid, cpu, + group_fd, flags); + return ret; +} +\& +int +main(void) +{ + int fd; + long long count; + struct perf_event_attr pe; +\& + memset(&pe, 0, sizeof(pe)); + pe.type = PERF_TYPE_HARDWARE; + pe.size = sizeof(pe); + pe.config = PERF_COUNT_HW_INSTRUCTIONS; + pe.disabled = 1; + pe.exclude_kernel = 1; + pe.exclude_hv = 1; +\& + fd = perf_event_open(&pe, 0, \-1, \-1, 0); + if (fd == \-1) { + fprintf(stderr, "Error opening leader %llx\en", pe.config); + exit(EXIT_FAILURE); + } +\& + ioctl(fd, PERF_EVENT_IOC_RESET, 0); + ioctl(fd, PERF_EVENT_IOC_ENABLE, 0); +\& + printf("Measuring instruction count for this printf\en"); +\& + ioctl(fd, PERF_EVENT_IOC_DISABLE, 0); + read(fd, &count, sizeof(count)); +\& + printf("Used %lld instructions\en", count); +\& + close(fd); +} +.EE +.\" SRC END +.SH SEE ALSO +.BR perf (1), +.BR fcntl (2), +.BR mmap (2), +.BR open (2), +.BR prctl (2), +.BR read (2) +.PP +.I Documentation/admin\-guide/perf\-security.rst +in the kernel source tree -- cgit v1.2.3