Diffstat (limited to 'kernel/trace')
88 files changed, 89235 insertions, 0 deletions
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig new file mode 100644 index 0000000000..61c541c365 --- /dev/null +++ b/kernel/trace/Kconfig @@ -0,0 +1,1174 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# Architectures that offer an FUNCTION_TRACER implementation should +# select HAVE_FUNCTION_TRACER: +# + +config USER_STACKTRACE_SUPPORT + bool + +config NOP_TRACER + bool + +config HAVE_RETHOOK + bool + +config RETHOOK + bool + depends on HAVE_RETHOOK + help + Enable generic return hooking feature. This is an internal + API, which will be used by other function-entry hooking + features like fprobe and kprobes. + +config HAVE_FUNCTION_TRACER + bool + help + See Documentation/trace/ftrace-design.rst + +config HAVE_FUNCTION_GRAPH_TRACER + bool + help + See Documentation/trace/ftrace-design.rst + +config HAVE_FUNCTION_GRAPH_RETVAL + bool + +config HAVE_DYNAMIC_FTRACE + bool + help + See Documentation/trace/ftrace-design.rst + +config HAVE_DYNAMIC_FTRACE_WITH_REGS + bool + +config HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + bool + +config HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS + bool + +config HAVE_DYNAMIC_FTRACE_WITH_ARGS + bool + help + If this is set, then arguments and stack can be found from + the ftrace_regs passed into the function callback regs parameter + by default, even without setting the REGS flag in the ftrace_ops. + This allows for use of ftrace_regs_get_argument() and + ftrace_regs_get_stack_pointer(). + +config HAVE_DYNAMIC_FTRACE_NO_PATCHABLE + bool + help + If the architecture generates __patchable_function_entries sections + but does not want them included in the ftrace locations. + +config HAVE_FTRACE_MCOUNT_RECORD + bool + help + See Documentation/trace/ftrace-design.rst + +config HAVE_SYSCALL_TRACEPOINTS + bool + help + See Documentation/trace/ftrace-design.rst + +config HAVE_FENTRY + bool + help + Arch supports the gcc options -pg with -mfentry + +config HAVE_NOP_MCOUNT + bool + help + Arch supports the gcc options -pg with -mrecord-mcount and -nop-mcount + +config HAVE_OBJTOOL_MCOUNT + bool + help + Arch supports objtool --mcount + +config HAVE_OBJTOOL_NOP_MCOUNT + bool + help + Arch supports the objtool options --mcount with --mnop. + An architecture can select this if it wants to enable nop'ing + of ftrace locations. + +config HAVE_C_RECORDMCOUNT + bool + help + C version of recordmcount available? + +config HAVE_BUILDTIME_MCOUNT_SORT + bool + help + An architecture selects this if it sorts the mcount_loc section + at build time. + +config BUILDTIME_MCOUNT_SORT + bool + default y + depends on HAVE_BUILDTIME_MCOUNT_SORT && DYNAMIC_FTRACE + help + Sort the mcount_loc section at build time. + +config TRACER_MAX_TRACE + bool + +config TRACE_CLOCK + bool + +config RING_BUFFER + bool + select TRACE_CLOCK + select IRQ_WORK + +config EVENT_TRACING + select CONTEXT_SWITCH_TRACER + select GLOB + bool + +config CONTEXT_SWITCH_TRACER + bool + +config RING_BUFFER_ALLOW_SWAP + bool + help + Allow the use of ring_buffer_swap_cpu. + Adds a very slight overhead to tracing when enabled. + +config PREEMPTIRQ_TRACEPOINTS + bool + depends on TRACE_PREEMPT_TOGGLE || TRACE_IRQFLAGS + select TRACING + default y + help + Create preempt/irq toggle tracepoints if needed, so that other parts + of the kernel can use them to generate or add hooks to them. + +# All tracer options should select GENERIC_TRACER. For those options that are +# enabled by all tracers (context switch and event tracer) they select TRACING. +# This allows those options to appear when no other tracer is selected. 
But the +# options do not appear when something else selects it. We need the two options +# GENERIC_TRACER and TRACING to avoid circular dependencies to accomplish the +# hiding of the automatic options. + +config TRACING + bool + select RING_BUFFER + select STACKTRACE if STACKTRACE_SUPPORT + select TRACEPOINTS + select NOP_TRACER + select BINARY_PRINTF + select EVENT_TRACING + select TRACE_CLOCK + select TASKS_RCU if PREEMPTION + +config GENERIC_TRACER + bool + select TRACING + +# +# Minimum requirements an architecture has to meet for us to +# be able to offer generic tracing facilities: +# +config TRACING_SUPPORT + bool + depends on TRACE_IRQFLAGS_SUPPORT + depends on STACKTRACE_SUPPORT + default y + +menuconfig FTRACE + bool "Tracers" + depends on TRACING_SUPPORT + default y if DEBUG_KERNEL + help + Enable the kernel tracing infrastructure. + +if FTRACE + +config BOOTTIME_TRACING + bool "Boot-time Tracing support" + depends on TRACING + select BOOT_CONFIG + help + Enable developer to setup ftrace subsystem via supplemental + kernel cmdline at boot time for debugging (tracing) driver + initialization and boot process. + +config FUNCTION_TRACER + bool "Kernel Function Tracer" + depends on HAVE_FUNCTION_TRACER + select KALLSYMS + select GENERIC_TRACER + select CONTEXT_SWITCH_TRACER + select GLOB + select TASKS_RCU if PREEMPTION + select TASKS_RUDE_RCU + help + Enable the kernel to trace every kernel function. This is done + by using a compiler feature to insert a small, 5-byte No-Operation + instruction at the beginning of every kernel function, which NOP + sequence is then dynamically patched into a tracer call when + tracing is enabled by the administrator. If it's runtime disabled + (the bootup default), then the overhead of the instructions is very + small and not measurable even in micro-benchmarks (at least on + x86, but may have impact on other architectures). + +config FUNCTION_GRAPH_TRACER + bool "Kernel Function Graph Tracer" + depends on HAVE_FUNCTION_GRAPH_TRACER + depends on FUNCTION_TRACER + depends on !X86_32 || !CC_OPTIMIZE_FOR_SIZE + default y + help + Enable the kernel to trace a function at both its return + and its entry. + Its first purpose is to trace the duration of functions and + draw a call graph for each thread with some information like + the return value. This is done by setting the current return + address on the current task structure into a stack of calls. + +config FUNCTION_GRAPH_RETVAL + bool "Kernel Function Graph Return Value" + depends on HAVE_FUNCTION_GRAPH_RETVAL + depends on FUNCTION_GRAPH_TRACER + default n + help + Support recording and printing the function return value when + using function graph tracer. It can be helpful to locate functions + that return errors. This feature is off by default, and you can + enable it via the trace option funcgraph-retval. + See Documentation/trace/ftrace.rst + +config DYNAMIC_FTRACE + bool "enable/disable function tracing dynamically" + depends on FUNCTION_TRACER + depends on HAVE_DYNAMIC_FTRACE + default y + help + This option will modify all the calls to function tracing + dynamically (will patch them out of the binary image and + replace them with a No-Op instruction) on boot up. During + compile time, a table is made of all the locations that ftrace + can function trace, and this table is linked into the kernel + image. When this is enabled, functions can be individually + enabled, and the functions not enabled will not affect + performance of the system. 
+ + See the files in /sys/kernel/tracing: + available_filter_functions + set_ftrace_filter + set_ftrace_notrace + + This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but + otherwise has native performance as long as no tracing is active. + +config DYNAMIC_FTRACE_WITH_REGS + def_bool y + depends on DYNAMIC_FTRACE + depends on HAVE_DYNAMIC_FTRACE_WITH_REGS + +config DYNAMIC_FTRACE_WITH_DIRECT_CALLS + def_bool y + depends on DYNAMIC_FTRACE_WITH_REGS || DYNAMIC_FTRACE_WITH_ARGS + depends on HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + +config DYNAMIC_FTRACE_WITH_CALL_OPS + def_bool y + depends on HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS + +config DYNAMIC_FTRACE_WITH_ARGS + def_bool y + depends on DYNAMIC_FTRACE + depends on HAVE_DYNAMIC_FTRACE_WITH_ARGS + +config FPROBE + bool "Kernel Function Probe (fprobe)" + depends on FUNCTION_TRACER + depends on DYNAMIC_FTRACE_WITH_REGS + depends on HAVE_RETHOOK + select RETHOOK + default n + help + This option enables kernel function probe (fprobe) based on ftrace. + The fprobe is similar to kprobes, but probes only for kernel function + entries and exits. This also can probe multiple functions by one + fprobe. + + If unsure, say N. + +config FUNCTION_PROFILER + bool "Kernel function profiler" + depends on FUNCTION_TRACER + default n + help + This option enables the kernel function profiler. A file is created + in debugfs called function_profile_enabled which defaults to zero. + When a 1 is echoed into this file profiling begins, and when a + zero is entered, profiling stops. A "functions" file is created in + the trace_stat directory; this file shows the list of functions that + have been hit and their counters. + + If in doubt, say N. + +config STACK_TRACER + bool "Trace max stack" + depends on HAVE_FUNCTION_TRACER + select FUNCTION_TRACER + select STACKTRACE + select KALLSYMS + help + This special tracer records the maximum stack footprint of the + kernel and displays it in /sys/kernel/tracing/stack_trace. + + This tracer works by hooking into every function call that the + kernel executes, and keeping a maximum stack depth value and + stack-trace saved. If this is configured with DYNAMIC_FTRACE + then it will not have any overhead while the stack tracer + is disabled. + + To enable the stack tracer on bootup, pass in 'stacktrace' + on the kernel command line. + + The stack tracer can also be enabled or disabled via the + sysctl kernel.stack_tracer_enabled + + Say N if unsure. + +config TRACE_PREEMPT_TOGGLE + bool + help + Enables hooks which will be called when preemption is first disabled, + and last enabled. + +config IRQSOFF_TRACER + bool "Interrupts-off Latency Tracer" + default n + depends on TRACE_IRQFLAGS_SUPPORT + select TRACE_IRQFLAGS + select GENERIC_TRACER + select TRACER_MAX_TRACE + select RING_BUFFER_ALLOW_SWAP + select TRACER_SNAPSHOT + select TRACER_SNAPSHOT_PER_CPU_SWAP + help + This option measures the time spent in irqs-off critical + sections, with microsecond accuracy. + + The default measurement method is a maximum search, which is + disabled by default and can be runtime (re-)started + via: + + echo 0 > /sys/kernel/tracing/tracing_max_latency + + (Note that kernel size and overhead increase with this option + enabled. This option and the preempt-off timing option can be + used together or separately.) 
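The maximum-latency workflow described in the IRQSOFF_TRACER help above is driven entirely through tracefs. A minimal sketch, assuming the kernel was built with this option and tracefs is mounted at /sys/kernel/tracing:

    # select the irqs-off latency tracer and reset the saved maximum
    cd /sys/kernel/tracing
    echo irqsoff > current_tracer
    echo 0 > tracing_max_latency
    # run the workload of interest, then read the worst irqs-off section seen
    cat tracing_max_latency
    cat trace

The same procedure applies to the preempt-off tracer below by writing "preemptoff" (or "preemptirqsoff", when both options are enabled) to current_tracer.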
+ +config PREEMPT_TRACER + bool "Preemption-off Latency Tracer" + default n + depends on PREEMPTION + select GENERIC_TRACER + select TRACER_MAX_TRACE + select RING_BUFFER_ALLOW_SWAP + select TRACER_SNAPSHOT + select TRACER_SNAPSHOT_PER_CPU_SWAP + select TRACE_PREEMPT_TOGGLE + help + This option measures the time spent in preemption-off critical + sections, with microsecond accuracy. + + The default measurement method is a maximum search, which is + disabled by default and can be runtime (re-)started + via: + + echo 0 > /sys/kernel/tracing/tracing_max_latency + + (Note that kernel size and overhead increase with this option + enabled. This option and the irqs-off timing option can be + used together or separately.) + +config SCHED_TRACER + bool "Scheduling Latency Tracer" + select GENERIC_TRACER + select CONTEXT_SWITCH_TRACER + select TRACER_MAX_TRACE + select TRACER_SNAPSHOT + help + This tracer tracks the latency of the highest priority task + to be scheduled in, starting from the point it has woken up. + +config HWLAT_TRACER + bool "Tracer to detect hardware latencies (like SMIs)" + select GENERIC_TRACER + select TRACER_MAX_TRACE + help + This tracer, when enabled will create one or more kernel threads, + depending on what the cpumask file is set to, which each thread + spinning in a loop looking for interruptions caused by + something other than the kernel. For example, if a + System Management Interrupt (SMI) takes a noticeable amount of + time, this tracer will detect it. This is useful for testing + if a system is reliable for Real Time tasks. + + Some files are created in the tracing directory when this + is enabled: + + hwlat_detector/width - time in usecs for how long to spin for + hwlat_detector/window - time in usecs between the start of each + iteration + + A kernel thread is created that will spin with interrupts disabled + for "width" microseconds in every "window" cycle. It will not spin + for "window - width" microseconds, where the system can + continue to operate. + + The output will appear in the trace and trace_pipe files. + + When the tracer is not running, it has no affect on the system, + but when it is running, it can cause the system to be + periodically non responsive. Do not run this tracer on a + production system. + + To enable this tracer, echo in "hwlat" into the current_tracer + file. Every time a latency is greater than tracing_thresh, it will + be recorded into the ring buffer. + +config OSNOISE_TRACER + bool "OS Noise tracer" + select GENERIC_TRACER + select TRACER_MAX_TRACE + help + In the context of high-performance computing (HPC), the Operating + System Noise (osnoise) refers to the interference experienced by an + application due to activities inside the operating system. In the + context of Linux, NMIs, IRQs, SoftIRQs, and any other system thread + can cause noise to the system. Moreover, hardware-related jobs can + also cause noise, for example, via SMIs. + + The osnoise tracer leverages the hwlat_detector by running a similar + loop with preemption, SoftIRQs and IRQs enabled, thus allowing all + the sources of osnoise during its execution. The osnoise tracer takes + note of the entry and exit point of any source of interferences, + increasing a per-cpu interference counter. It saves an interference + counter for each source of interference. The interference counter for + NMI, IRQs, SoftIRQs, and threads is increased anytime the tool + observes these interferences' entry events. 
When a noise happens + without any interference from the operating system level, the + hardware noise counter increases, pointing to a hardware-related + noise. In this way, osnoise can account for any source of + interference. At the end of the period, the osnoise tracer prints + the sum of all noise, the max single noise, the percentage of CPU + available for the thread, and the counters for the noise sources. + + In addition to the tracer, a set of tracepoints were added to + facilitate the identification of the osnoise source. + + The output will appear in the trace and trace_pipe files. + + To enable this tracer, echo in "osnoise" into the current_tracer + file. + +config TIMERLAT_TRACER + bool "Timerlat tracer" + select OSNOISE_TRACER + select GENERIC_TRACER + help + The timerlat tracer aims to help the preemptive kernel developers + to find sources of wakeup latencies of real-time threads. + + The tracer creates a per-cpu kernel thread with real-time priority. + The tracer thread sets a periodic timer to wakeup itself, and goes + to sleep waiting for the timer to fire. At the wakeup, the thread + then computes a wakeup latency value as the difference between + the current time and the absolute time that the timer was set + to expire. + + The tracer prints two lines at every activation. The first is the + timer latency observed at the hardirq context before the + activation of the thread. The second is the timer latency observed + by the thread, which is the same level that cyclictest reports. The + ACTIVATION ID field serves to relate the irq execution to its + respective thread execution. + + The tracer is build on top of osnoise tracer, and the osnoise: + events can be used to trace the source of interference from NMI, + IRQs and other threads. It also enables the capture of the + stacktrace at the IRQ context, which helps to identify the code + path that can cause thread delay. + +config MMIOTRACE + bool "Memory mapped IO tracing" + depends on HAVE_MMIOTRACE_SUPPORT && PCI + select GENERIC_TRACER + help + Mmiotrace traces Memory Mapped I/O access and is meant for + debugging and reverse engineering. It is called from the ioremap + implementation and works via page faults. Tracing is disabled by + default and can be enabled at run-time. + + See Documentation/trace/mmiotrace.rst. + If you are not helping to develop drivers, say N. + +config ENABLE_DEFAULT_TRACERS + bool "Trace process context switches and events" + depends on !GENERIC_TRACER + select TRACING + help + This tracer hooks to various trace points in the kernel, + allowing the user to pick and choose which trace point they + want to trace. It also includes the sched_switch tracer plugin. + +config FTRACE_SYSCALLS + bool "Trace syscalls" + depends on HAVE_SYSCALL_TRACEPOINTS + select GENERIC_TRACER + select KALLSYMS + help + Basic tracer to catch the syscall entry and exit events. + +config TRACER_SNAPSHOT + bool "Create a snapshot trace buffer" + select TRACER_MAX_TRACE + help + Allow tracing users to take snapshot of the current buffer using the + ftrace interface, e.g.: + + echo 1 > /sys/kernel/tracing/snapshot + cat snapshot + +config TRACER_SNAPSHOT_PER_CPU_SWAP + bool "Allow snapshot to swap per CPU" + depends on TRACER_SNAPSHOT + select RING_BUFFER_ALLOW_SWAP + help + Allow doing a snapshot of a single CPU buffer instead of a + full swap (all buffers). 
If this is set, then the following is + allowed: + + echo 1 > /sys/kernel/tracing/per_cpu/cpu2/snapshot + + After which, only the tracing buffer for CPU 2 was swapped with + the main tracing buffer, and the other CPU buffers remain the same. + + When this is enabled, this adds a little more overhead to the + trace recording, as it needs to add some checks to synchronize + recording with swaps. But this does not affect the performance + of the overall system. This is enabled by default when the preempt + or irq latency tracers are enabled, as those need to swap as well + and already adds the overhead (plus a lot more). + +config TRACE_BRANCH_PROFILING + bool + select GENERIC_TRACER + +choice + prompt "Branch Profiling" + default BRANCH_PROFILE_NONE + help + The branch profiling is a software profiler. It will add hooks + into the C conditionals to test which path a branch takes. + + The likely/unlikely profiler only looks at the conditions that + are annotated with a likely or unlikely macro. + + The "all branch" profiler will profile every if-statement in the + kernel. This profiler will also enable the likely/unlikely + profiler. + + Either of the above profilers adds a bit of overhead to the system. + If unsure, choose "No branch profiling". + +config BRANCH_PROFILE_NONE + bool "No branch profiling" + help + No branch profiling. Branch profiling adds a bit of overhead. + Only enable it if you want to analyse the branching behavior. + Otherwise keep it disabled. + +config PROFILE_ANNOTATED_BRANCHES + bool "Trace likely/unlikely profiler" + select TRACE_BRANCH_PROFILING + help + This tracer profiles all likely and unlikely macros + in the kernel. It will display the results in: + + /sys/kernel/tracing/trace_stat/branch_annotated + + Note: this will add a significant overhead; only turn this + on if you need to profile the system's use of these macros. + +config PROFILE_ALL_BRANCHES + bool "Profile all if conditionals" if !FORTIFY_SOURCE + select TRACE_BRANCH_PROFILING + help + This tracer profiles all branch conditions. Every if () + taken in the kernel is recorded whether it hit or miss. + The results will be displayed in: + + /sys/kernel/tracing/trace_stat/branch_all + + This option also enables the likely/unlikely profiler. + + This configuration, when enabled, will impose a great overhead + on the system. This should only be enabled when the system + is to be analyzed in much detail. +endchoice + +config TRACING_BRANCHES + bool + help + Selected by tracers that will trace the likely and unlikely + conditions. This prevents the tracers themselves from being + profiled. Profiling the tracing infrastructure can only happen + when the likelys and unlikelys are not being traced. + +config BRANCH_TRACER + bool "Trace likely/unlikely instances" + depends on TRACE_BRANCH_PROFILING + select TRACING_BRANCHES + help + This traces the events of likely and unlikely condition + calls in the kernel. The difference between this and the + "Trace likely/unlikely profiler" is that this is not a + histogram of the callers, but actually places the calling + events into a running trace buffer to see when and where the + events happened, as well as their results. + + Say N if unsure. + +config BLK_DEV_IO_TRACE + bool "Support for tracing block IO actions" + depends on SYSFS + depends on BLOCK + select RELAY + select DEBUG_FS + select TRACEPOINTS + select GENERIC_TRACER + select STACKTRACE + help + Say Y here if you want to be able to trace the block layer actions + on a given queue. 
Tracing allows you to see any traffic happening + on a block device queue. For more information (and the userspace + support tools needed), fetch the blktrace tools from: + + git://git.kernel.dk/blktrace.git + + Tracing also is possible using the ftrace interface, e.g.: + + echo 1 > /sys/block/sda/sda1/trace/enable + echo blk > /sys/kernel/tracing/current_tracer + cat /sys/kernel/tracing/trace_pipe + + If unsure, say N. + +config FPROBE_EVENTS + depends on FPROBE + depends on HAVE_REGS_AND_STACK_ACCESS_API + bool "Enable fprobe-based dynamic events" + select TRACING + select PROBE_EVENTS + select DYNAMIC_EVENTS + default y + help + This allows user to add tracing events on the function entry and + exit via ftrace interface. The syntax is same as the kprobe events + and the kprobe events on function entry and exit will be + transparently converted to this fprobe events. + +config PROBE_EVENTS_BTF_ARGS + depends on HAVE_FUNCTION_ARG_ACCESS_API + depends on FPROBE_EVENTS || KPROBE_EVENTS + depends on DEBUG_INFO_BTF && BPF_SYSCALL + bool "Support BTF function arguments for probe events" + default y + help + The user can specify the arguments of the probe event using the names + of the arguments of the probed function, when the probe location is a + kernel function entry or a tracepoint. + This is available only if BTF (BPF Type Format) support is enabled. + +config KPROBE_EVENTS + depends on KPROBES + depends on HAVE_REGS_AND_STACK_ACCESS_API + bool "Enable kprobes-based dynamic events" + select TRACING + select PROBE_EVENTS + select DYNAMIC_EVENTS + default y + help + This allows the user to add tracing events (similar to tracepoints) + on the fly via the ftrace interface. See + Documentation/trace/kprobetrace.rst for more details. + + Those events can be inserted wherever kprobes can probe, and record + various register and memory values. + + This option is also required by perf-probe subcommand of perf tools. + If you want to use perf tools, this option is strongly recommended. + +config KPROBE_EVENTS_ON_NOTRACE + bool "Do NOT protect notrace function from kprobe events" + depends on KPROBE_EVENTS + depends on DYNAMIC_FTRACE + default n + help + This is only for the developers who want to debug ftrace itself + using kprobe events. + + If kprobes can use ftrace instead of breakpoint, ftrace related + functions are protected from kprobe-events to prevent an infinite + recursion or any unexpected execution path which leads to a kernel + crash. + + This option disables such protection and allows you to put kprobe + events on ftrace functions for debugging ftrace by itself. + Note that this might let you shoot yourself in the foot. + + If unsure, say N. + +config UPROBE_EVENTS + bool "Enable uprobes-based dynamic events" + depends on ARCH_SUPPORTS_UPROBES + depends on MMU + depends on PERF_EVENTS + select UPROBES + select PROBE_EVENTS + select DYNAMIC_EVENTS + select TRACING + default y + help + This allows the user to add tracing events on top of userspace + dynamic events (similar to tracepoints) on the fly via the trace + events interface. Those events can be inserted wherever uprobes + can probe, and record various registers. + This option is required if you plan to use perf-probe subcommand + of perf tools on user space applications. + +config BPF_EVENTS + depends on BPF_SYSCALL + depends on (KPROBE_EVENTS || UPROBE_EVENTS) && PERF_EVENTS + bool + default y + help + This allows the user to attach BPF programs to kprobe, uprobe, and + tracepoint events. 
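KPROBE_EVENTS and BPF_EVENTS above both build on the dynamic probe events documented in Documentation/trace/kprobetrace.rst. A minimal sketch of that tracefs interface, assuming tracefs is mounted at /sys/kernel/tracing and that the probed function (used here purely as an example) exists in the running kernel:

    # define a probe at the entry of a kernel function and enable it
    echo 'p:myprobe do_sys_openat2' >> /sys/kernel/tracing/kprobe_events
    echo 1 > /sys/kernel/tracing/events/kprobes/myprobe/enable
    cat /sys/kernel/tracing/trace_pipe
    # tear the probe down again
    echo 0 > /sys/kernel/tracing/events/kprobes/myprobe/enable
    echo '-:myprobe' >> /sys/kernel/tracing/kprobe_events

perf-probe and the BPF front ends layer a friendlier syntax over this same mechanism.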
+ +config DYNAMIC_EVENTS + def_bool n + +config PROBE_EVENTS + def_bool n + +config BPF_KPROBE_OVERRIDE + bool "Enable BPF programs to override a kprobed function" + depends on BPF_EVENTS + depends on FUNCTION_ERROR_INJECTION + default n + help + Allows BPF to override the execution of a probed function and + set a different return value. This is used for error injection. + +config FTRACE_MCOUNT_RECORD + def_bool y + depends on DYNAMIC_FTRACE + depends on HAVE_FTRACE_MCOUNT_RECORD + +config FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY + bool + depends on FTRACE_MCOUNT_RECORD + +config FTRACE_MCOUNT_USE_CC + def_bool y + depends on $(cc-option,-mrecord-mcount) + depends on !FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY + depends on FTRACE_MCOUNT_RECORD + +config FTRACE_MCOUNT_USE_OBJTOOL + def_bool y + depends on HAVE_OBJTOOL_MCOUNT + depends on !FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY + depends on !FTRACE_MCOUNT_USE_CC + depends on FTRACE_MCOUNT_RECORD + select OBJTOOL + +config FTRACE_MCOUNT_USE_RECORDMCOUNT + def_bool y + depends on !FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY + depends on !FTRACE_MCOUNT_USE_CC + depends on !FTRACE_MCOUNT_USE_OBJTOOL + depends on FTRACE_MCOUNT_RECORD + +config TRACING_MAP + bool + depends on ARCH_HAVE_NMI_SAFE_CMPXCHG + help + tracing_map is a special-purpose lock-free map for tracing, + separated out as a stand-alone facility in order to allow it + to be shared between multiple tracers. It isn't meant to be + generally used outside of that context, and is normally + selected by tracers that use it. + +config SYNTH_EVENTS + bool "Synthetic trace events" + select TRACING + select DYNAMIC_EVENTS + default n + help + Synthetic events are user-defined trace events that can be + used to combine data from other trace events or in fact any + data source. Synthetic events can be generated indirectly + via the trace() action of histogram triggers or directly + by way of an in-kernel API. + + See Documentation/trace/events.rst or + Documentation/trace/histogram.rst for details and examples. + + If in doubt, say N. + +config USER_EVENTS + bool "User trace events" + select TRACING + select DYNAMIC_EVENTS + help + User trace events are user-defined trace events that + can be used like an existing kernel trace event. User trace + events are generated by writing to a tracefs file. User + processes can determine if their tracing events should be + generated by registering a value and bit with the kernel + that reflects when it is enabled or not. + + See Documentation/trace/user_events.rst. + If in doubt, say N. + +config HIST_TRIGGERS + bool "Histogram triggers" + depends on ARCH_HAVE_NMI_SAFE_CMPXCHG + select TRACING_MAP + select TRACING + select DYNAMIC_EVENTS + select SYNTH_EVENTS + default n + help + Hist triggers allow one or more arbitrary trace event fields + to be aggregated into hash tables and dumped to stdout by + reading a debugfs/tracefs file. They're useful for + gathering quick and dirty (though precise) summaries of + event activity as an initial guide for further investigation + using more advanced tools. + + Inter-event tracing of quantities such as latencies is also + supported using hist triggers under this option. + + See Documentation/trace/histogram.rst. + If in doubt, say N. + +config TRACE_EVENT_INJECT + bool "Trace event injection" + depends on TRACING + help + Allow user-space to inject a specific trace event into the ring + buffer. This is mainly used for testing purpose. + + If unsure, say N. 
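Several of the entries above (SYNTH_EVENTS, HIST_TRIGGERS, TRACE_EVENT_INJECT) are exercised through per-event tracefs files. As a quick orientation, here is a hist trigger sketch in the style of Documentation/trace/histogram.rst, assuming the kmem:kmalloc event is available on the running kernel:

    # aggregate kmalloc sizes by call site and read the resulting histogram
    echo 'hist:keys=call_site:values=bytes_req,bytes_alloc' > \
        /sys/kernel/tracing/events/kmem/kmalloc/trigger
    cat /sys/kernel/tracing/events/kmem/kmalloc/hist
    # remove the trigger again
    echo '!hist:keys=call_site:values=bytes_req,bytes_alloc' > \
        /sys/kernel/tracing/events/kmem/kmalloc/trigger

See the referenced documentation for inter-event quantities such as latencies.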
+ +config TRACEPOINT_BENCHMARK + bool "Add tracepoint that benchmarks tracepoints" + help + This option creates the tracepoint "benchmark:benchmark_event". + When the tracepoint is enabled, it kicks off a kernel thread that + goes into an infinite loop (calling cond_resched() to let other tasks + run), and calls the tracepoint. Each iteration will record the time + it took to write to the tracepoint and the next iteration that + data will be passed to the tracepoint itself. That is, the tracepoint + will report the time it took to do the previous tracepoint. + The string written to the tracepoint is a static string of 128 bytes + to keep the time the same. The initial string is simply a write of + "START". The second string records the cold cache time of the first + write which is not added to the rest of the calculations. + + As it is a tight loop, it benchmarks as hot cache. That's fine because + we care most about hot paths that are probably in cache already. + + An example of the output: + + START + first=3672 [COLD CACHED] + last=632 first=3672 max=632 min=632 avg=316 std=446 std^2=199712 + last=278 first=3672 max=632 min=278 avg=303 std=316 std^2=100337 + last=277 first=3672 max=632 min=277 avg=296 std=258 std^2=67064 + last=273 first=3672 max=632 min=273 avg=292 std=224 std^2=50411 + last=273 first=3672 max=632 min=273 avg=288 std=200 std^2=40389 + last=281 first=3672 max=632 min=273 avg=287 std=183 std^2=33666 + + +config RING_BUFFER_BENCHMARK + tristate "Ring buffer benchmark stress tester" + depends on RING_BUFFER + help + This option creates a test to stress the ring buffer and benchmark it. + It creates its own ring buffer such that it will not interfere with + any other users of the ring buffer (such as ftrace). It then creates + a producer and consumer that will run for 10 seconds and sleep for + 10 seconds. Each interval it will print out the number of events + it recorded and give a rough estimate of how long each iteration took. + + It does not disable interrupts or raise its priority, so it may be + affected by processes that are running. + + If unsure, say N. + +config TRACE_EVAL_MAP_FILE + bool "Show eval mappings for trace events" + depends on TRACING + help + The "print fmt" of the trace events will show the enum/sizeof names + instead of their values. This can cause problems for user space tools + that use this string to parse the raw data as user space does not know + how to convert the string to its value. + + To fix this, there's a special macro in the kernel that can be used + to convert an enum/sizeof into its value. If this macro is used, then + the print fmt strings will be converted to their values. + + If something does not get converted properly, this option can be + used to show what enums/sizeof the kernel tried to convert. + + This option is for debugging the conversions. A file is created + in the tracing directory called "eval_map" that will show the + names matched with their values and what trace event system they + belong too. + + Normally, the mapping of the strings to values will be freed after + boot up or module load. With this option, they will not be freed, as + they are needed for the "eval_map" file. Enabling this option will + increase the memory footprint of the running kernel. + + If unsure, say N. + +config FTRACE_RECORD_RECURSION + bool "Record functions that recurse in function tracing" + depends on FUNCTION_TRACER + help + All callbacks that attach to the function tracing have some sort + of protection against recursion. 
Even though the protection exists, + it adds overhead. This option will create a file in the tracefs + file system called "recursed_functions" that will list the functions + that triggered a recursion. + + This will add more overhead to cases that have recursion. + + If unsure, say N + +config FTRACE_RECORD_RECURSION_SIZE + int "Max number of recursed functions to record" + default 128 + depends on FTRACE_RECORD_RECURSION + help + This defines the limit of number of functions that can be + listed in the "recursed_functions" file, that lists all + the functions that caused a recursion to happen. + This file can be reset, but the limit can not change in + size at runtime. + +config RING_BUFFER_RECORD_RECURSION + bool "Record functions that recurse in the ring buffer" + depends on FTRACE_RECORD_RECURSION + # default y, because it is coupled with FTRACE_RECORD_RECURSION + default y + help + The ring buffer has its own internal recursion. Although when + recursion happens it won't cause harm because of the protection, + but it does cause unwanted overhead. Enabling this option will + place where recursion was detected into the ftrace "recursed_functions" + file. + + This will add more overhead to cases that have recursion. + +config GCOV_PROFILE_FTRACE + bool "Enable GCOV profiling on ftrace subsystem" + depends on GCOV_KERNEL + help + Enable GCOV profiling on ftrace subsystem for checking + which functions/lines are tested. + + If unsure, say N. + + Note that on a kernel compiled with this config, ftrace will + run significantly slower. + +config FTRACE_SELFTEST + bool + +config FTRACE_STARTUP_TEST + bool "Perform a startup test on ftrace" + depends on GENERIC_TRACER + select FTRACE_SELFTEST + help + This option performs a series of startup tests on ftrace. On bootup + a series of tests are made to verify that the tracer is + functioning properly. It will do tests on all the configured + tracers of ftrace. + +config EVENT_TRACE_STARTUP_TEST + bool "Run selftest on trace events" + depends on FTRACE_STARTUP_TEST + default y + help + This option performs a test on all trace events in the system. + It basically just enables each event and runs some code that + will trigger events (not necessarily the event it enables) + This may take some time run as there are a lot of events. + +config EVENT_TRACE_TEST_SYSCALLS + bool "Run selftest on syscall events" + depends on EVENT_TRACE_STARTUP_TEST + help + This option will also enable testing every syscall event. + It only enables the event and disables it and runs various loads + with the event enabled. This adds a bit more time for kernel boot + up since it runs this on every system call defined. + + TBD - enable a way to actually call the syscalls as we test their + events + +config FTRACE_SORT_STARTUP_TEST + bool "Verify compile time sorting of ftrace functions" + depends on DYNAMIC_FTRACE + depends on BUILDTIME_MCOUNT_SORT + help + Sorting of the mcount_loc sections that is used to find the + where the ftrace knows where to patch functions for tracing + and other callbacks is done at compile time. But if the sort + is not done correctly, it will cause non-deterministic failures. + When this is set, the sorted sections will be verified that they + are in deed sorted and will warn if they are not. + + If unsure, say N + +config RING_BUFFER_STARTUP_TEST + bool "Ring buffer startup self test" + depends on RING_BUFFER + help + Run a simple self test on the ring buffer on boot up. 
Late in the + kernel boot sequence, the test will start that kicks off + a thread per cpu. Each thread will write various size events + into the ring buffer. Another thread is created to send IPIs + to each of the threads, where the IPI handler will also write + to the ring buffer, to test/stress the nesting ability. + If any anomalies are discovered, a warning will be displayed + and all ring buffers will be disabled. + + The test runs for 10 seconds. This will slow your boot time + by at least 10 more seconds. + + At the end of the test, statistics and more checks are done. + It will output the stats of each per cpu buffer: What + was written, the sizes, what was read, what was lost, and + other similar details. + + If unsure, say N + +config RING_BUFFER_VALIDATE_TIME_DELTAS + bool "Verify ring buffer time stamp deltas" + depends on RING_BUFFER + help + This will audit the time stamps on the ring buffer sub + buffer to make sure that all the time deltas for the + events on a sub buffer matches the current time stamp. + This audit is performed for every event that is not + interrupted, or interrupting another event. A check + is also made when traversing sub buffers to make sure + that all the deltas on the previous sub buffer do not + add up to be greater than the current time stamp. + + NOTE: This adds significant overhead to recording of events, + and should only be used to test the logic of the ring buffer. + Do not use it on production systems. + + Only say Y if you understand what this does, and you + still want it enabled. Otherwise say N + +config MMIOTRACE_TEST + tristate "Test module for mmiotrace" + depends on MMIOTRACE && m + help + This is a dumb module for testing mmiotrace. It is very dangerous + as it will write garbage to IO memory starting at a given address. + However, it should be safe to use on e.g. unused portion of VRAM. + + Say N, unless you absolutely know what you are doing. + +config PREEMPTIRQ_DELAY_TEST + tristate "Test module to create a preempt / IRQ disable delay thread to test latency tracers" + depends on m + help + Select this option to build a test module that can help test latency + tracers by executing a preempt or irq disable section with a user + configurable delay. The module busy waits for the duration of the + critical section. + + For example, the following invocation generates a burst of three + irq-disabled critical sections for 500us: + modprobe preemptirq_delay_test test_mode=irq delay=500 burst_size=3 + + What's more, if you want to attach the test on the cpu which the latency + tracer is running on, specify cpu_affinity=cpu_num at the end of the + command. + + If unsure, say N + +config SYNTH_EVENT_GEN_TEST + tristate "Test module for in-kernel synthetic event generation" + depends on SYNTH_EVENTS + help + This option creates a test module to check the base + functionality of in-kernel synthetic event definition and + generation. + + To test, insert the module, and then check the trace buffer + for the generated sample events. + + If unsure, say N. + +config KPROBE_EVENT_GEN_TEST + tristate "Test module for in-kernel kprobe event generation" + depends on KPROBE_EVENTS + help + This option creates a test module to check the base + functionality of in-kernel kprobe event definition. + + To test, insert the module, and then check the trace buffer + for the generated kprobe events. + + If unsure, say N. 
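The *_GEN_TEST entries above are loadable modules whose only job is to emit sample events, so checking them needs nothing more than the trace file. A sketch, assuming both options were built as modules (module names taken from the Makefile below):

    # load the synthetic-event generation test and look for its sample events
    modprobe synth_event_gen_test
    head -n 40 /sys/kernel/tracing/trace
    modprobe -r synth_event_gen_test
    # the kprobe variant works the same way
    modprobe kprobe_event_gen_test
    head -n 40 /sys/kernel/tracing/trace
    modprobe -r kprobe_event_gen_test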
+ +config HIST_TRIGGERS_DEBUG + bool "Hist trigger debug support" + depends on HIST_TRIGGERS + help + Add "hist_debug" file for each event, which when read will + dump out a bunch of internal details about the hist triggers + defined on that event. + + The hist_debug file serves a couple of purposes: + + - Helps developers verify that nothing is broken. + + - Provides educational information to support the details + of the hist trigger internals as described by + Documentation/trace/histogram-design.rst. + + The hist_debug output only covers the data structures + related to the histogram definitions themselves and doesn't + display the internals of map buckets or variable values of + running histograms. + + If unsure, say N. + +source "kernel/trace/rv/Kconfig" + +endif # FTRACE diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile new file mode 100644 index 0000000000..057cd975d0 --- /dev/null +++ b/kernel/trace/Makefile @@ -0,0 +1,113 @@ +# SPDX-License-Identifier: GPL-2.0 + +# Do not instrument the tracer itself: + +ccflags-remove-$(CONFIG_FUNCTION_TRACER) += $(CC_FLAGS_FTRACE) + +ifdef CONFIG_FUNCTION_TRACER + +# Avoid recursion due to instrumentation. +KCSAN_SANITIZE := n + +ifdef CONFIG_FTRACE_SELFTEST +# selftest needs instrumentation +CFLAGS_trace_selftest_dynamic.o = $(CC_FLAGS_FTRACE) +obj-y += trace_selftest_dynamic.o +endif +endif + +ifdef CONFIG_FTRACE_STARTUP_TEST +CFLAGS_trace_kprobe_selftest.o = $(CC_FLAGS_FTRACE) +obj-$(CONFIG_KPROBE_EVENTS) += trace_kprobe_selftest.o +endif + +# If unlikely tracing is enabled, do not trace these files +ifdef CONFIG_TRACING_BRANCHES +KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING +endif + +# for GCOV coverage profiling +ifdef CONFIG_GCOV_PROFILE_FTRACE +GCOV_PROFILE := y +endif + +# Functions in this file could be invoked from early interrupt +# code and produce random code coverage. 
+KCOV_INSTRUMENT_trace_preemptirq.o := n + +CFLAGS_bpf_trace.o := -I$(src) + +CFLAGS_trace_benchmark.o := -I$(src) +CFLAGS_trace_events_filter.o := -I$(src) + +obj-$(CONFIG_TRACE_CLOCK) += trace_clock.o + +obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o +obj-$(CONFIG_RING_BUFFER) += ring_buffer.o +obj-$(CONFIG_RING_BUFFER_BENCHMARK) += ring_buffer_benchmark.o + +obj-$(CONFIG_TRACING) += trace.o +obj-$(CONFIG_TRACING) += trace_output.o +obj-$(CONFIG_TRACING) += trace_seq.o +obj-$(CONFIG_TRACING) += trace_stat.o +obj-$(CONFIG_TRACING) += trace_printk.o +obj-$(CONFIG_TRACING) += pid_list.o +obj-$(CONFIG_TRACING_MAP) += tracing_map.o +obj-$(CONFIG_PREEMPTIRQ_DELAY_TEST) += preemptirq_delay_test.o +obj-$(CONFIG_SYNTH_EVENT_GEN_TEST) += synth_event_gen_test.o +obj-$(CONFIG_KPROBE_EVENT_GEN_TEST) += kprobe_event_gen_test.o +obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o +obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o +obj-$(CONFIG_PREEMPTIRQ_TRACEPOINTS) += trace_preemptirq.o +obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o +obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o +obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o +obj-$(CONFIG_HWLAT_TRACER) += trace_hwlat.o +obj-$(CONFIG_OSNOISE_TRACER) += trace_osnoise.o +obj-$(CONFIG_NOP_TRACER) += trace_nop.o +obj-$(CONFIG_STACK_TRACER) += trace_stack.o +obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o +obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o +obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o +obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o +obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += fgraph.o +ifeq ($(CONFIG_BLOCK),y) +obj-$(CONFIG_EVENT_TRACING) += blktrace.o +endif +obj-$(CONFIG_EVENT_TRACING) += trace_events.o +obj-$(CONFIG_EVENT_TRACING) += trace_export.o +obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o +ifeq ($(CONFIG_PERF_EVENTS),y) +obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o +endif +obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o +obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o +obj-$(CONFIG_PROBE_EVENTS) += trace_eprobe.o +obj-$(CONFIG_TRACE_EVENT_INJECT) += trace_events_inject.o +obj-$(CONFIG_SYNTH_EVENTS) += trace_events_synth.o +obj-$(CONFIG_HIST_TRIGGERS) += trace_events_hist.o +obj-$(CONFIG_USER_EVENTS) += trace_events_user.o +obj-$(CONFIG_BPF_EVENTS) += bpf_trace.o +obj-$(CONFIG_KPROBE_EVENTS) += trace_kprobe.o +obj-$(CONFIG_TRACEPOINTS) += error_report-traces.o +obj-$(CONFIG_TRACEPOINTS) += power-traces.o +ifeq ($(CONFIG_PM),y) +obj-$(CONFIG_TRACEPOINTS) += rpm-traces.o +endif +ifeq ($(CONFIG_TRACING),y) +obj-$(CONFIG_KGDB_KDB) += trace_kdb.o +endif +obj-$(CONFIG_DYNAMIC_EVENTS) += trace_dynevent.o +obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o +obj-$(CONFIG_PROBE_EVENTS_BTF_ARGS) += trace_btf.o +obj-$(CONFIG_UPROBE_EVENTS) += trace_uprobe.o +obj-$(CONFIG_BOOTTIME_TRACING) += trace_boot.o +obj-$(CONFIG_FTRACE_RECORD_RECURSION) += trace_recursion_record.o +obj-$(CONFIG_FPROBE) += fprobe.o +obj-$(CONFIG_RETHOOK) += rethook.o +obj-$(CONFIG_FPROBE_EVENTS) += trace_fprobe.o + +obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o +obj-$(CONFIG_RV) += rv/ + +libftrace-y := ftrace.o diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c new file mode 100644 index 0000000000..d5d94510af --- /dev/null +++ b/kernel/trace/blktrace.c @@ -0,0 +1,1918 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2006 Jens Axboe <axboe@kernel.dk> + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/kernel.h> +#include <linux/blkdev.h> +#include <linux/blktrace_api.h> 
+#include <linux/percpu.h> +#include <linux/init.h> +#include <linux/mutex.h> +#include <linux/slab.h> +#include <linux/debugfs.h> +#include <linux/export.h> +#include <linux/time.h> +#include <linux/uaccess.h> +#include <linux/list.h> +#include <linux/blk-cgroup.h> + +#include "../../block/blk.h" + +#include <trace/events/block.h> + +#include "trace_output.h" + +#ifdef CONFIG_BLK_DEV_IO_TRACE + +static unsigned int blktrace_seq __read_mostly = 1; + +static struct trace_array *blk_tr; +static bool blk_tracer_enabled __read_mostly; + +static LIST_HEAD(running_trace_list); +static __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(running_trace_lock); + +/* Select an alternative, minimalistic output than the original one */ +#define TRACE_BLK_OPT_CLASSIC 0x1 +#define TRACE_BLK_OPT_CGROUP 0x2 +#define TRACE_BLK_OPT_CGNAME 0x4 + +static struct tracer_opt blk_tracer_opts[] = { + /* Default disable the minimalistic output */ + { TRACER_OPT(blk_classic, TRACE_BLK_OPT_CLASSIC) }, +#ifdef CONFIG_BLK_CGROUP + { TRACER_OPT(blk_cgroup, TRACE_BLK_OPT_CGROUP) }, + { TRACER_OPT(blk_cgname, TRACE_BLK_OPT_CGNAME) }, +#endif + { } +}; + +static struct tracer_flags blk_tracer_flags = { + .val = 0, + .opts = blk_tracer_opts, +}; + +/* Global reference count of probes */ +static DEFINE_MUTEX(blk_probe_mutex); +static int blk_probes_ref; + +static void blk_register_tracepoints(void); +static void blk_unregister_tracepoints(void); + +/* + * Send out a notify message. + */ +static void trace_note(struct blk_trace *bt, pid_t pid, int action, + const void *data, size_t len, u64 cgid) +{ + struct blk_io_trace *t; + struct ring_buffer_event *event = NULL; + struct trace_buffer *buffer = NULL; + unsigned int trace_ctx = 0; + int cpu = smp_processor_id(); + bool blk_tracer = blk_tracer_enabled; + ssize_t cgid_len = cgid ? sizeof(cgid) : 0; + + if (blk_tracer) { + buffer = blk_tr->array_buffer.buffer; + trace_ctx = tracing_gen_ctx_flags(0); + event = trace_buffer_lock_reserve(buffer, TRACE_BLK, + sizeof(*t) + len + cgid_len, + trace_ctx); + if (!event) + return; + t = ring_buffer_event_data(event); + goto record_it; + } + + if (!bt->rchan) + return; + + t = relay_reserve(bt->rchan, sizeof(*t) + len + cgid_len); + if (t) { + t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; + t->time = ktime_to_ns(ktime_get()); +record_it: + t->device = bt->dev; + t->action = action | (cgid ? 
__BLK_TN_CGROUP : 0); + t->pid = pid; + t->cpu = cpu; + t->pdu_len = len + cgid_len; + if (cgid_len) + memcpy((void *)t + sizeof(*t), &cgid, cgid_len); + memcpy((void *) t + sizeof(*t) + cgid_len, data, len); + + if (blk_tracer) + trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx); + } +} + +/* + * Send out a notify for this process, if we haven't done so since a trace + * started + */ +static void trace_note_tsk(struct task_struct *tsk) +{ + unsigned long flags; + struct blk_trace *bt; + + tsk->btrace_seq = blktrace_seq; + raw_spin_lock_irqsave(&running_trace_lock, flags); + list_for_each_entry(bt, &running_trace_list, running_list) { + trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, + sizeof(tsk->comm), 0); + } + raw_spin_unlock_irqrestore(&running_trace_lock, flags); +} + +static void trace_note_time(struct blk_trace *bt) +{ + struct timespec64 now; + unsigned long flags; + u32 words[2]; + + /* need to check user space to see if this breaks in y2038 or y2106 */ + ktime_get_real_ts64(&now); + words[0] = (u32)now.tv_sec; + words[1] = now.tv_nsec; + + local_irq_save(flags); + trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words), 0); + local_irq_restore(flags); +} + +void __blk_trace_note_message(struct blk_trace *bt, + struct cgroup_subsys_state *css, const char *fmt, ...) +{ + int n; + va_list args; + unsigned long flags; + char *buf; + u64 cgid = 0; + + if (unlikely(bt->trace_state != Blktrace_running && + !blk_tracer_enabled)) + return; + + /* + * If the BLK_TC_NOTIFY action mask isn't set, don't send any note + * message to the trace. + */ + if (!(bt->act_mask & BLK_TC_NOTIFY)) + return; + + local_irq_save(flags); + buf = this_cpu_ptr(bt->msg_data); + va_start(args, fmt); + n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args); + va_end(args); + +#ifdef CONFIG_BLK_CGROUP + if (css && (blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) + cgid = cgroup_id(css->cgroup); + else + cgid = 1; +#endif + trace_note(bt, current->pid, BLK_TN_MESSAGE, buf, n, cgid); + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(__blk_trace_note_message); + +static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector, + pid_t pid) +{ + if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0) + return 1; + if (sector && (sector < bt->start_lba || sector > bt->end_lba)) + return 1; + if (bt->pid && pid != bt->pid) + return 1; + + return 0; +} + +/* + * Data direction bit lookup + */ +static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), + BLK_TC_ACT(BLK_TC_WRITE) }; + +#define BLK_TC_RAHEAD BLK_TC_AHEAD +#define BLK_TC_PREFLUSH BLK_TC_FLUSH + +/* The ilog2() calls fall out because they're constant */ +#define MASK_TC_BIT(rw, __name) ((__force u32)(rw & REQ_ ## __name) << \ + (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - __REQ_ ## __name)) + +/* + * The worker for the various blk_add_trace*() types. Fills out a + * blk_io_trace structure and places it in a per-cpu subbuffer. + */ +static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, + const blk_opf_t opf, u32 what, int error, + int pdu_len, void *pdu_data, u64 cgid) +{ + struct task_struct *tsk = current; + struct ring_buffer_event *event = NULL; + struct trace_buffer *buffer = NULL; + struct blk_io_trace *t; + unsigned long flags = 0; + unsigned long *sequence; + unsigned int trace_ctx = 0; + pid_t pid; + int cpu; + bool blk_tracer = blk_tracer_enabled; + ssize_t cgid_len = cgid ? 
sizeof(cgid) : 0; + const enum req_op op = opf & REQ_OP_MASK; + + if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer)) + return; + + what |= ddir_act[op_is_write(op) ? WRITE : READ]; + what |= MASK_TC_BIT(opf, SYNC); + what |= MASK_TC_BIT(opf, RAHEAD); + what |= MASK_TC_BIT(opf, META); + what |= MASK_TC_BIT(opf, PREFLUSH); + what |= MASK_TC_BIT(opf, FUA); + if (op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE) + what |= BLK_TC_ACT(BLK_TC_DISCARD); + if (op == REQ_OP_FLUSH) + what |= BLK_TC_ACT(BLK_TC_FLUSH); + if (cgid) + what |= __BLK_TA_CGROUP; + + pid = tsk->pid; + if (act_log_check(bt, what, sector, pid)) + return; + cpu = raw_smp_processor_id(); + + if (blk_tracer) { + tracing_record_cmdline(current); + + buffer = blk_tr->array_buffer.buffer; + trace_ctx = tracing_gen_ctx_flags(0); + event = trace_buffer_lock_reserve(buffer, TRACE_BLK, + sizeof(*t) + pdu_len + cgid_len, + trace_ctx); + if (!event) + return; + t = ring_buffer_event_data(event); + goto record_it; + } + + if (unlikely(tsk->btrace_seq != blktrace_seq)) + trace_note_tsk(tsk); + + /* + * A word about the locking here - we disable interrupts to reserve + * some space in the relay per-cpu buffer, to prevent an irq + * from coming in and stepping on our toes. + */ + local_irq_save(flags); + t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len + cgid_len); + if (t) { + sequence = per_cpu_ptr(bt->sequence, cpu); + + t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; + t->sequence = ++(*sequence); + t->time = ktime_to_ns(ktime_get()); +record_it: + /* + * These two are not needed in ftrace as they are in the + * generic trace_entry, filled by tracing_generic_entry_update, + * but for the trace_event->bin() synthesizer benefit we do it + * here too. + */ + t->cpu = cpu; + t->pid = pid; + + t->sector = sector; + t->bytes = bytes; + t->action = what; + t->device = bt->dev; + t->error = error; + t->pdu_len = pdu_len + cgid_len; + + if (cgid_len) + memcpy((void *)t + sizeof(*t), &cgid, cgid_len); + if (pdu_len) + memcpy((void *)t + sizeof(*t) + cgid_len, pdu_data, pdu_len); + + if (blk_tracer) { + trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx); + return; + } + } + + local_irq_restore(flags); +} + +static void blk_trace_free(struct request_queue *q, struct blk_trace *bt) +{ + relay_close(bt->rchan); + + /* + * If 'bt->dir' is not set, then both 'dropped' and 'msg' are created + * under 'q->debugfs_dir', thus lookup and remove them. 
+ */ + if (!bt->dir) { + debugfs_lookup_and_remove("dropped", q->debugfs_dir); + debugfs_lookup_and_remove("msg", q->debugfs_dir); + } else { + debugfs_remove(bt->dir); + } + free_percpu(bt->sequence); + free_percpu(bt->msg_data); + kfree(bt); +} + +static void get_probe_ref(void) +{ + mutex_lock(&blk_probe_mutex); + if (++blk_probes_ref == 1) + blk_register_tracepoints(); + mutex_unlock(&blk_probe_mutex); +} + +static void put_probe_ref(void) +{ + mutex_lock(&blk_probe_mutex); + if (!--blk_probes_ref) + blk_unregister_tracepoints(); + mutex_unlock(&blk_probe_mutex); +} + +static int blk_trace_start(struct blk_trace *bt) +{ + if (bt->trace_state != Blktrace_setup && + bt->trace_state != Blktrace_stopped) + return -EINVAL; + + blktrace_seq++; + smp_mb(); + bt->trace_state = Blktrace_running; + raw_spin_lock_irq(&running_trace_lock); + list_add(&bt->running_list, &running_trace_list); + raw_spin_unlock_irq(&running_trace_lock); + trace_note_time(bt); + + return 0; +} + +static int blk_trace_stop(struct blk_trace *bt) +{ + if (bt->trace_state != Blktrace_running) + return -EINVAL; + + bt->trace_state = Blktrace_stopped; + raw_spin_lock_irq(&running_trace_lock); + list_del_init(&bt->running_list); + raw_spin_unlock_irq(&running_trace_lock); + relay_flush(bt->rchan); + + return 0; +} + +static void blk_trace_cleanup(struct request_queue *q, struct blk_trace *bt) +{ + blk_trace_stop(bt); + synchronize_rcu(); + blk_trace_free(q, bt); + put_probe_ref(); +} + +static int __blk_trace_remove(struct request_queue *q) +{ + struct blk_trace *bt; + + bt = rcu_replace_pointer(q->blk_trace, NULL, + lockdep_is_held(&q->debugfs_mutex)); + if (!bt) + return -EINVAL; + + blk_trace_cleanup(q, bt); + + return 0; +} + +int blk_trace_remove(struct request_queue *q) +{ + int ret; + + mutex_lock(&q->debugfs_mutex); + ret = __blk_trace_remove(q); + mutex_unlock(&q->debugfs_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(blk_trace_remove); + +static ssize_t blk_dropped_read(struct file *filp, char __user *buffer, + size_t count, loff_t *ppos) +{ + struct blk_trace *bt = filp->private_data; + char buf[16]; + + snprintf(buf, sizeof(buf), "%u\n", atomic_read(&bt->dropped)); + + return simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf)); +} + +static const struct file_operations blk_dropped_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .read = blk_dropped_read, + .llseek = default_llseek, +}; + +static ssize_t blk_msg_write(struct file *filp, const char __user *buffer, + size_t count, loff_t *ppos) +{ + char *msg; + struct blk_trace *bt; + + if (count >= BLK_TN_MAX_MSG) + return -EINVAL; + + msg = memdup_user_nul(buffer, count); + if (IS_ERR(msg)) + return PTR_ERR(msg); + + bt = filp->private_data; + __blk_trace_note_message(bt, NULL, "%s", msg); + kfree(msg); + + return count; +} + +static const struct file_operations blk_msg_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .write = blk_msg_write, + .llseek = noop_llseek, +}; + +/* + * Keep track of how many times we encountered a full subbuffer, to aid + * the user space app in telling how many lost events there were. 
+ */ +static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf, + void *prev_subbuf, size_t prev_padding) +{ + struct blk_trace *bt; + + if (!relay_buf_full(buf)) + return 1; + + bt = buf->chan->private_data; + atomic_inc(&bt->dropped); + return 0; +} + +static int blk_remove_buf_file_callback(struct dentry *dentry) +{ + debugfs_remove(dentry); + + return 0; +} + +static struct dentry *blk_create_buf_file_callback(const char *filename, + struct dentry *parent, + umode_t mode, + struct rchan_buf *buf, + int *is_global) +{ + return debugfs_create_file(filename, mode, parent, buf, + &relay_file_operations); +} + +static const struct rchan_callbacks blk_relay_callbacks = { + .subbuf_start = blk_subbuf_start_callback, + .create_buf_file = blk_create_buf_file_callback, + .remove_buf_file = blk_remove_buf_file_callback, +}; + +static void blk_trace_setup_lba(struct blk_trace *bt, + struct block_device *bdev) +{ + if (bdev) { + bt->start_lba = bdev->bd_start_sect; + bt->end_lba = bdev->bd_start_sect + bdev_nr_sectors(bdev); + } else { + bt->start_lba = 0; + bt->end_lba = -1ULL; + } +} + +/* + * Setup everything required to start tracing + */ +static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, + struct block_device *bdev, + struct blk_user_trace_setup *buts) +{ + struct blk_trace *bt = NULL; + struct dentry *dir = NULL; + int ret; + + lockdep_assert_held(&q->debugfs_mutex); + + if (!buts->buf_size || !buts->buf_nr) + return -EINVAL; + + strncpy(buts->name, name, BLKTRACE_BDEV_SIZE); + buts->name[BLKTRACE_BDEV_SIZE - 1] = '\0'; + + /* + * some device names have larger paths - convert the slashes + * to underscores for this to work as expected + */ + strreplace(buts->name, '/', '_'); + + /* + * bdev can be NULL, as with scsi-generic, this is a helpful as + * we can be. + */ + if (rcu_dereference_protected(q->blk_trace, + lockdep_is_held(&q->debugfs_mutex))) { + pr_warn("Concurrent blktraces are not allowed on %s\n", + buts->name); + return -EBUSY; + } + + bt = kzalloc(sizeof(*bt), GFP_KERNEL); + if (!bt) + return -ENOMEM; + + ret = -ENOMEM; + bt->sequence = alloc_percpu(unsigned long); + if (!bt->sequence) + goto err; + + bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG, __alignof__(char)); + if (!bt->msg_data) + goto err; + + /* + * When tracing the whole disk reuse the existing debugfs directory + * created by the block layer on init. For partitions block devices, + * and scsi-generic block devices we create a temporary new debugfs + * directory that will be removed once the trace ends. + */ + if (bdev && !bdev_is_partition(bdev)) + dir = q->debugfs_dir; + else + bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root); + + /* + * As blktrace relies on debugfs for its interface the debugfs directory + * is required, contrary to the usual mantra of not checking for debugfs + * files or directories. 
+ */ + if (IS_ERR_OR_NULL(dir)) { + pr_warn("debugfs_dir not present for %s so skipping\n", + buts->name); + ret = -ENOENT; + goto err; + } + + bt->dev = dev; + atomic_set(&bt->dropped, 0); + INIT_LIST_HEAD(&bt->running_list); + + ret = -EIO; + debugfs_create_file("dropped", 0444, dir, bt, &blk_dropped_fops); + debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops); + + bt->rchan = relay_open("trace", dir, buts->buf_size, + buts->buf_nr, &blk_relay_callbacks, bt); + if (!bt->rchan) + goto err; + + bt->act_mask = buts->act_mask; + if (!bt->act_mask) + bt->act_mask = (u16) -1; + + blk_trace_setup_lba(bt, bdev); + + /* overwrite with user settings */ + if (buts->start_lba) + bt->start_lba = buts->start_lba; + if (buts->end_lba) + bt->end_lba = buts->end_lba; + + bt->pid = buts->pid; + bt->trace_state = Blktrace_setup; + + rcu_assign_pointer(q->blk_trace, bt); + get_probe_ref(); + + ret = 0; +err: + if (ret) + blk_trace_free(q, bt); + return ret; +} + +static int __blk_trace_setup(struct request_queue *q, char *name, dev_t dev, + struct block_device *bdev, char __user *arg) +{ + struct blk_user_trace_setup buts; + int ret; + + ret = copy_from_user(&buts, arg, sizeof(buts)); + if (ret) + return -EFAULT; + + ret = do_blk_trace_setup(q, name, dev, bdev, &buts); + if (ret) + return ret; + + if (copy_to_user(arg, &buts, sizeof(buts))) { + __blk_trace_remove(q); + return -EFAULT; + } + return 0; +} + +int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, + struct block_device *bdev, + char __user *arg) +{ + int ret; + + mutex_lock(&q->debugfs_mutex); + ret = __blk_trace_setup(q, name, dev, bdev, arg); + mutex_unlock(&q->debugfs_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(blk_trace_setup); + +#if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64) +static int compat_blk_trace_setup(struct request_queue *q, char *name, + dev_t dev, struct block_device *bdev, + char __user *arg) +{ + struct blk_user_trace_setup buts; + struct compat_blk_user_trace_setup cbuts; + int ret; + + if (copy_from_user(&cbuts, arg, sizeof(cbuts))) + return -EFAULT; + + buts = (struct blk_user_trace_setup) { + .act_mask = cbuts.act_mask, + .buf_size = cbuts.buf_size, + .buf_nr = cbuts.buf_nr, + .start_lba = cbuts.start_lba, + .end_lba = cbuts.end_lba, + .pid = cbuts.pid, + }; + + ret = do_blk_trace_setup(q, name, dev, bdev, &buts); + if (ret) + return ret; + + if (copy_to_user(arg, &buts.name, ARRAY_SIZE(buts.name))) { + __blk_trace_remove(q); + return -EFAULT; + } + + return 0; +} +#endif + +static int __blk_trace_startstop(struct request_queue *q, int start) +{ + struct blk_trace *bt; + + bt = rcu_dereference_protected(q->blk_trace, + lockdep_is_held(&q->debugfs_mutex)); + if (bt == NULL) + return -EINVAL; + + if (start) + return blk_trace_start(bt); + else + return blk_trace_stop(bt); +} + +int blk_trace_startstop(struct request_queue *q, int start) +{ + int ret; + + mutex_lock(&q->debugfs_mutex); + ret = __blk_trace_startstop(q, start); + mutex_unlock(&q->debugfs_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(blk_trace_startstop); + +/* + * When reading or writing the blktrace sysfs files, the references to the + * opened sysfs or device files should prevent the underlying block device + * from being removed. So no further delete protection is really needed. 
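+ *
+ * A hedged sketch of the sysfs interface this refers to (the device name is
+ * illustrative; the attribute group is defined near the end of this file):
+ *
+ *   echo 1 > /sys/block/sda/trace/enable
+ *   cat      /sys/block/sda/trace/act_mask
+ *   echo 0 > /sys/block/sda/trace/enable
+ *
+ * Each open attribute file holds a reference on the block device, so the
+ * show/store handlers below need no extra protection against removal.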
+ */ + +/** + * blk_trace_ioctl - handle the ioctls associated with tracing + * @bdev: the block device + * @cmd: the ioctl cmd + * @arg: the argument data, if any + * + **/ +int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) +{ + struct request_queue *q = bdev_get_queue(bdev); + int ret, start = 0; + char b[BDEVNAME_SIZE]; + + mutex_lock(&q->debugfs_mutex); + + switch (cmd) { + case BLKTRACESETUP: + snprintf(b, sizeof(b), "%pg", bdev); + ret = __blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); + break; +#if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64) + case BLKTRACESETUP32: + snprintf(b, sizeof(b), "%pg", bdev); + ret = compat_blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); + break; +#endif + case BLKTRACESTART: + start = 1; + fallthrough; + case BLKTRACESTOP: + ret = __blk_trace_startstop(q, start); + break; + case BLKTRACETEARDOWN: + ret = __blk_trace_remove(q); + break; + default: + ret = -ENOTTY; + break; + } + + mutex_unlock(&q->debugfs_mutex); + return ret; +} + +/** + * blk_trace_shutdown - stop and cleanup trace structures + * @q: the request queue associated with the device + * + **/ +void blk_trace_shutdown(struct request_queue *q) +{ + if (rcu_dereference_protected(q->blk_trace, + lockdep_is_held(&q->debugfs_mutex))) + __blk_trace_remove(q); +} + +#ifdef CONFIG_BLK_CGROUP +static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) +{ + struct cgroup_subsys_state *blkcg_css; + struct blk_trace *bt; + + /* We don't use the 'bt' value here except as an optimization... */ + bt = rcu_dereference_protected(q->blk_trace, 1); + if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) + return 0; + + blkcg_css = bio_blkcg_css(bio); + if (!blkcg_css) + return 0; + return cgroup_id(blkcg_css->cgroup); +} +#else +static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) +{ + return 0; +} +#endif + +static u64 +blk_trace_request_get_cgid(struct request *rq) +{ + if (!rq->bio) + return 0; + /* Use the first bio */ + return blk_trace_bio_get_cgid(rq->q, rq->bio); +} + +/* + * blktrace probes + */ + +/** + * blk_add_trace_rq - Add a trace for a request oriented action + * @rq: the source request + * @error: return status to log + * @nr_bytes: number of completed bytes + * @what: the action + * @cgid: the cgroup info + * + * Description: + * Records an action against a request. Will log the bio offset + size. 
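+ *
+ * For illustration only (all field values are made up), such a record
+ * rendered by the classic output helpers later in this file looks roughly
+ * like
+ *
+ *   8,0   3   123.001234567  4711  C   W 2048 + 8 [0]
+ *
+ * i.e. major,minor, cpu, timestamp, pid, action, rwbs, then sector plus
+ * length in 512-byte sectors.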
+ * + **/ +static void blk_add_trace_rq(struct request *rq, blk_status_t error, + unsigned int nr_bytes, u32 what, u64 cgid) +{ + struct blk_trace *bt; + + rcu_read_lock(); + bt = rcu_dereference(rq->q->blk_trace); + if (likely(!bt)) { + rcu_read_unlock(); + return; + } + + if (blk_rq_is_passthrough(rq)) + what |= BLK_TC_ACT(BLK_TC_PC); + else + what |= BLK_TC_ACT(BLK_TC_FS); + + __blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, rq->cmd_flags, + what, blk_status_to_errno(error), 0, NULL, cgid); + rcu_read_unlock(); +} + +static void blk_add_trace_rq_insert(void *ignore, struct request *rq) +{ + blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_INSERT, + blk_trace_request_get_cgid(rq)); +} + +static void blk_add_trace_rq_issue(void *ignore, struct request *rq) +{ + blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ISSUE, + blk_trace_request_get_cgid(rq)); +} + +static void blk_add_trace_rq_merge(void *ignore, struct request *rq) +{ + blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_BACKMERGE, + blk_trace_request_get_cgid(rq)); +} + +static void blk_add_trace_rq_requeue(void *ignore, struct request *rq) +{ + blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_REQUEUE, + blk_trace_request_get_cgid(rq)); +} + +static void blk_add_trace_rq_complete(void *ignore, struct request *rq, + blk_status_t error, unsigned int nr_bytes) +{ + blk_add_trace_rq(rq, error, nr_bytes, BLK_TA_COMPLETE, + blk_trace_request_get_cgid(rq)); +} + +/** + * blk_add_trace_bio - Add a trace for a bio oriented action + * @q: queue the io is for + * @bio: the source bio + * @what: the action + * @error: error, if any + * + * Description: + * Records an action against a bio. Will log the bio offset + size. + * + **/ +static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, + u32 what, int error) +{ + struct blk_trace *bt; + + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); + if (likely(!bt)) { + rcu_read_unlock(); + return; + } + + __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, + bio->bi_opf, what, error, 0, NULL, + blk_trace_bio_get_cgid(q, bio)); + rcu_read_unlock(); +} + +static void blk_add_trace_bio_bounce(void *ignore, struct bio *bio) +{ + blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_BOUNCE, 0); +} + +static void blk_add_trace_bio_complete(void *ignore, + struct request_queue *q, struct bio *bio) +{ + blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, + blk_status_to_errno(bio->bi_status)); +} + +static void blk_add_trace_bio_backmerge(void *ignore, struct bio *bio) +{ + blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_BACKMERGE, + 0); +} + +static void blk_add_trace_bio_frontmerge(void *ignore, struct bio *bio) +{ + blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_FRONTMERGE, + 0); +} + +static void blk_add_trace_bio_queue(void *ignore, struct bio *bio) +{ + blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_QUEUE, 0); +} + +static void blk_add_trace_getrq(void *ignore, struct bio *bio) +{ + blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_GETRQ, 0); +} + +static void blk_add_trace_plug(void *ignore, struct request_queue *q) +{ + struct blk_trace *bt; + + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); + if (bt) + __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, 0); + rcu_read_unlock(); +} + +static void blk_add_trace_unplug(void *ignore, struct request_queue *q, + unsigned int depth, bool explicit) +{ + struct blk_trace *bt; + + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); + if (bt) { + __be64 rpdu = 
cpu_to_be64(depth); + u32 what; + + if (explicit) + what = BLK_TA_UNPLUG_IO; + else + what = BLK_TA_UNPLUG_TIMER; + + __blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, 0); + } + rcu_read_unlock(); +} + +static void blk_add_trace_split(void *ignore, struct bio *bio, unsigned int pdu) +{ + struct request_queue *q = bio->bi_bdev->bd_disk->queue; + struct blk_trace *bt; + + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); + if (bt) { + __be64 rpdu = cpu_to_be64(pdu); + + __blk_add_trace(bt, bio->bi_iter.bi_sector, + bio->bi_iter.bi_size, bio->bi_opf, BLK_TA_SPLIT, + blk_status_to_errno(bio->bi_status), + sizeof(rpdu), &rpdu, + blk_trace_bio_get_cgid(q, bio)); + } + rcu_read_unlock(); +} + +/** + * blk_add_trace_bio_remap - Add a trace for a bio-remap operation + * @ignore: trace callback data parameter (not used) + * @bio: the source bio + * @dev: source device + * @from: source sector + * + * Called after a bio is remapped to a different device and/or sector. + **/ +static void blk_add_trace_bio_remap(void *ignore, struct bio *bio, dev_t dev, + sector_t from) +{ + struct request_queue *q = bio->bi_bdev->bd_disk->queue; + struct blk_trace *bt; + struct blk_io_trace_remap r; + + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); + if (likely(!bt)) { + rcu_read_unlock(); + return; + } + + r.device_from = cpu_to_be32(dev); + r.device_to = cpu_to_be32(bio_dev(bio)); + r.sector_from = cpu_to_be64(from); + + __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, + bio->bi_opf, BLK_TA_REMAP, + blk_status_to_errno(bio->bi_status), + sizeof(r), &r, blk_trace_bio_get_cgid(q, bio)); + rcu_read_unlock(); +} + +/** + * blk_add_trace_rq_remap - Add a trace for a request-remap operation + * @ignore: trace callback data parameter (not used) + * @rq: the source request + * @dev: target device + * @from: source sector + * + * Description: + * Device mapper remaps request to other devices. + * Add a trace for that action. + * + **/ +static void blk_add_trace_rq_remap(void *ignore, struct request *rq, dev_t dev, + sector_t from) +{ + struct blk_trace *bt; + struct blk_io_trace_remap r; + + rcu_read_lock(); + bt = rcu_dereference(rq->q->blk_trace); + if (likely(!bt)) { + rcu_read_unlock(); + return; + } + + r.device_from = cpu_to_be32(dev); + r.device_to = cpu_to_be32(disk_devt(rq->q->disk)); + r.sector_from = cpu_to_be64(from); + + __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), + rq->cmd_flags, BLK_TA_REMAP, 0, + sizeof(r), &r, blk_trace_request_get_cgid(rq)); + rcu_read_unlock(); +} + +/** + * blk_add_driver_data - Add binary message with driver-specific data + * @rq: io request + * @data: driver-specific data + * @len: length of driver-specific data + * + * Description: + * Some drivers might want to write driver-specific data per request. 
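+ *
+ * A minimal caller sketch (the structure and its fields are hypothetical):
+ *
+ *   struct my_drv_info { __le32 opcode; __le32 status; } info = { ... };
+ *   blk_add_driver_data(rq, &info, sizeof(info));
+ *
+ * The buffer is copied verbatim into the trace as the payload of a
+ * BLK_TA_DRV_DATA event.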
+ * + **/ +void blk_add_driver_data(struct request *rq, void *data, size_t len) +{ + struct blk_trace *bt; + + rcu_read_lock(); + bt = rcu_dereference(rq->q->blk_trace); + if (likely(!bt)) { + rcu_read_unlock(); + return; + } + + __blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, + BLK_TA_DRV_DATA, 0, len, data, + blk_trace_request_get_cgid(rq)); + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(blk_add_driver_data); + +static void blk_register_tracepoints(void) +{ + int ret; + + ret = register_trace_block_rq_insert(blk_add_trace_rq_insert, NULL); + WARN_ON(ret); + ret = register_trace_block_rq_issue(blk_add_trace_rq_issue, NULL); + WARN_ON(ret); + ret = register_trace_block_rq_merge(blk_add_trace_rq_merge, NULL); + WARN_ON(ret); + ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL); + WARN_ON(ret); + ret = register_trace_block_rq_complete(blk_add_trace_rq_complete, NULL); + WARN_ON(ret); + ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL); + WARN_ON(ret); + ret = register_trace_block_bio_complete(blk_add_trace_bio_complete, NULL); + WARN_ON(ret); + ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL); + WARN_ON(ret); + ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL); + WARN_ON(ret); + ret = register_trace_block_bio_queue(blk_add_trace_bio_queue, NULL); + WARN_ON(ret); + ret = register_trace_block_getrq(blk_add_trace_getrq, NULL); + WARN_ON(ret); + ret = register_trace_block_plug(blk_add_trace_plug, NULL); + WARN_ON(ret); + ret = register_trace_block_unplug(blk_add_trace_unplug, NULL); + WARN_ON(ret); + ret = register_trace_block_split(blk_add_trace_split, NULL); + WARN_ON(ret); + ret = register_trace_block_bio_remap(blk_add_trace_bio_remap, NULL); + WARN_ON(ret); + ret = register_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); + WARN_ON(ret); +} + +static void blk_unregister_tracepoints(void) +{ + unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); + unregister_trace_block_bio_remap(blk_add_trace_bio_remap, NULL); + unregister_trace_block_split(blk_add_trace_split, NULL); + unregister_trace_block_unplug(blk_add_trace_unplug, NULL); + unregister_trace_block_plug(blk_add_trace_plug, NULL); + unregister_trace_block_getrq(blk_add_trace_getrq, NULL); + unregister_trace_block_bio_queue(blk_add_trace_bio_queue, NULL); + unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL); + unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL); + unregister_trace_block_bio_complete(blk_add_trace_bio_complete, NULL); + unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL); + unregister_trace_block_rq_complete(blk_add_trace_rq_complete, NULL); + unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL); + unregister_trace_block_rq_merge(blk_add_trace_rq_merge, NULL); + unregister_trace_block_rq_issue(blk_add_trace_rq_issue, NULL); + unregister_trace_block_rq_insert(blk_add_trace_rq_insert, NULL); + + tracepoint_synchronize_unregister(); +} + +/* + * struct blk_io_tracer formatting routines + */ + +static void fill_rwbs(char *rwbs, const struct blk_io_trace *t) +{ + int i = 0; + int tc = t->action >> BLK_TC_SHIFT; + + if ((t->action & ~__BLK_TN_CGROUP) == BLK_TN_MESSAGE) { + rwbs[i++] = 'N'; + goto out; + } + + if (tc & BLK_TC_FLUSH) + rwbs[i++] = 'F'; + + if (tc & BLK_TC_DISCARD) + rwbs[i++] = 'D'; + else if (tc & BLK_TC_WRITE) + rwbs[i++] = 'W'; + else if (t->bytes) + rwbs[i++] = 'R'; + else + rwbs[i++] = 'N'; + + if (tc & BLK_TC_FUA) + 
rwbs[i++] = 'F'; + if (tc & BLK_TC_AHEAD) + rwbs[i++] = 'A'; + if (tc & BLK_TC_SYNC) + rwbs[i++] = 'S'; + if (tc & BLK_TC_META) + rwbs[i++] = 'M'; +out: + rwbs[i] = '\0'; +} + +static inline +const struct blk_io_trace *te_blk_io_trace(const struct trace_entry *ent) +{ + return (const struct blk_io_trace *)ent; +} + +static inline const void *pdu_start(const struct trace_entry *ent, bool has_cg) +{ + return (void *)(te_blk_io_trace(ent) + 1) + (has_cg ? sizeof(u64) : 0); +} + +static inline u64 t_cgid(const struct trace_entry *ent) +{ + return *(u64 *)(te_blk_io_trace(ent) + 1); +} + +static inline int pdu_real_len(const struct trace_entry *ent, bool has_cg) +{ + return te_blk_io_trace(ent)->pdu_len - (has_cg ? sizeof(u64) : 0); +} + +static inline u32 t_action(const struct trace_entry *ent) +{ + return te_blk_io_trace(ent)->action; +} + +static inline u32 t_bytes(const struct trace_entry *ent) +{ + return te_blk_io_trace(ent)->bytes; +} + +static inline u32 t_sec(const struct trace_entry *ent) +{ + return te_blk_io_trace(ent)->bytes >> 9; +} + +static inline unsigned long long t_sector(const struct trace_entry *ent) +{ + return te_blk_io_trace(ent)->sector; +} + +static inline __u16 t_error(const struct trace_entry *ent) +{ + return te_blk_io_trace(ent)->error; +} + +static __u64 get_pdu_int(const struct trace_entry *ent, bool has_cg) +{ + const __be64 *val = pdu_start(ent, has_cg); + return be64_to_cpu(*val); +} + +typedef void (blk_log_action_t) (struct trace_iterator *iter, const char *act, + bool has_cg); + +static void blk_log_action_classic(struct trace_iterator *iter, const char *act, + bool has_cg) +{ + char rwbs[RWBS_LEN]; + unsigned long long ts = iter->ts; + unsigned long nsec_rem = do_div(ts, NSEC_PER_SEC); + unsigned secs = (unsigned long)ts; + const struct blk_io_trace *t = te_blk_io_trace(iter->ent); + + fill_rwbs(rwbs, t); + + trace_seq_printf(&iter->seq, + "%3d,%-3d %2d %5d.%09lu %5u %2s %3s ", + MAJOR(t->device), MINOR(t->device), iter->cpu, + secs, nsec_rem, iter->ent->pid, act, rwbs); +} + +static void blk_log_action(struct trace_iterator *iter, const char *act, + bool has_cg) +{ + char rwbs[RWBS_LEN]; + const struct blk_io_trace *t = te_blk_io_trace(iter->ent); + + fill_rwbs(rwbs, t); + if (has_cg) { + u64 id = t_cgid(iter->ent); + + if (blk_tracer_flags.val & TRACE_BLK_OPT_CGNAME) { + char blkcg_name_buf[NAME_MAX + 1] = "<...>"; + + cgroup_path_from_kernfs_id(id, blkcg_name_buf, + sizeof(blkcg_name_buf)); + trace_seq_printf(&iter->seq, "%3d,%-3d %s %2s %3s ", + MAJOR(t->device), MINOR(t->device), + blkcg_name_buf, act, rwbs); + } else { + /* + * The cgid portion used to be "INO,GEN". Userland + * builds a FILEID_INO32_GEN fid out of them and + * opens the cgroup using open_by_handle_at(2). + * While 32bit ino setups are still the same, 64bit + * ones now use the 64bit ino as the whole ID and + * no longer use generation. + * + * Regardless of the content, always output + * "LOW32,HIGH32" so that FILEID_INO32_GEN fid can + * be mapped back to @id on both 64 and 32bit ino + * setups. See __kernfs_fh_to_dentry(). 
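+ *
+ * Worked example (the id value is made up): for id == 0x0000000100000042
+ * the format below emits "42,1", i.e. the low 32 bits first and the high
+ * 32 bits second, both in hex, which userland can reassemble into the
+ * original 64-bit id.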
+ */ + trace_seq_printf(&iter->seq, + "%3d,%-3d %llx,%-llx %2s %3s ", + MAJOR(t->device), MINOR(t->device), + id & U32_MAX, id >> 32, act, rwbs); + } + } else + trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ", + MAJOR(t->device), MINOR(t->device), act, rwbs); +} + +static void blk_log_dump_pdu(struct trace_seq *s, + const struct trace_entry *ent, bool has_cg) +{ + const unsigned char *pdu_buf; + int pdu_len; + int i, end; + + pdu_buf = pdu_start(ent, has_cg); + pdu_len = pdu_real_len(ent, has_cg); + + if (!pdu_len) + return; + + /* find the last zero that needs to be printed */ + for (end = pdu_len - 1; end >= 0; end--) + if (pdu_buf[end]) + break; + end++; + + trace_seq_putc(s, '('); + + for (i = 0; i < pdu_len; i++) { + + trace_seq_printf(s, "%s%02x", + i == 0 ? "" : " ", pdu_buf[i]); + + /* + * stop when the rest is just zeros and indicate so + * with a ".." appended + */ + if (i == end && end != pdu_len - 1) { + trace_seq_puts(s, " ..) "); + return; + } + } + + trace_seq_puts(s, ") "); +} + +static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) +{ + char cmd[TASK_COMM_LEN]; + + trace_find_cmdline(ent->pid, cmd); + + if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) { + trace_seq_printf(s, "%u ", t_bytes(ent)); + blk_log_dump_pdu(s, ent, has_cg); + trace_seq_printf(s, "[%s]\n", cmd); + } else { + if (t_sec(ent)) + trace_seq_printf(s, "%llu + %u [%s]\n", + t_sector(ent), t_sec(ent), cmd); + else + trace_seq_printf(s, "[%s]\n", cmd); + } +} + +static void blk_log_with_error(struct trace_seq *s, + const struct trace_entry *ent, bool has_cg) +{ + if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) { + blk_log_dump_pdu(s, ent, has_cg); + trace_seq_printf(s, "[%d]\n", t_error(ent)); + } else { + if (t_sec(ent)) + trace_seq_printf(s, "%llu + %u [%d]\n", + t_sector(ent), + t_sec(ent), t_error(ent)); + else + trace_seq_printf(s, "%llu [%d]\n", + t_sector(ent), t_error(ent)); + } +} + +static void blk_log_remap(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) +{ + const struct blk_io_trace_remap *__r = pdu_start(ent, has_cg); + + trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n", + t_sector(ent), t_sec(ent), + MAJOR(be32_to_cpu(__r->device_from)), + MINOR(be32_to_cpu(__r->device_from)), + be64_to_cpu(__r->sector_from)); +} + +static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) +{ + char cmd[TASK_COMM_LEN]; + + trace_find_cmdline(ent->pid, cmd); + + trace_seq_printf(s, "[%s]\n", cmd); +} + +static void blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) +{ + char cmd[TASK_COMM_LEN]; + + trace_find_cmdline(ent->pid, cmd); + + trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent, has_cg)); +} + +static void blk_log_split(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) +{ + char cmd[TASK_COMM_LEN]; + + trace_find_cmdline(ent->pid, cmd); + + trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent), + get_pdu_int(ent, has_cg), cmd); +} + +static void blk_log_msg(struct trace_seq *s, const struct trace_entry *ent, + bool has_cg) +{ + + trace_seq_putmem(s, pdu_start(ent, has_cg), + pdu_real_len(ent, has_cg)); + trace_seq_putc(s, '\n'); +} + +/* + * struct tracer operations + */ + +static void blk_tracer_print_header(struct seq_file *m) +{ + if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC)) + return; + seq_puts(m, "# DEV CPU TIMESTAMP PID ACT FLG\n" + "# | | | | | |\n"); +} + +static void blk_tracer_start(struct trace_array *tr) +{ + blk_tracer_enabled = true; +} + +static int 
blk_tracer_init(struct trace_array *tr) +{ + blk_tr = tr; + blk_tracer_start(tr); + return 0; +} + +static void blk_tracer_stop(struct trace_array *tr) +{ + blk_tracer_enabled = false; +} + +static void blk_tracer_reset(struct trace_array *tr) +{ + blk_tracer_stop(tr); +} + +static const struct { + const char *act[2]; + void (*print)(struct trace_seq *s, const struct trace_entry *ent, + bool has_cg); +} what2act[] = { + [__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic }, + [__BLK_TA_BACKMERGE] = {{ "M", "backmerge" }, blk_log_generic }, + [__BLK_TA_FRONTMERGE] = {{ "F", "frontmerge" }, blk_log_generic }, + [__BLK_TA_GETRQ] = {{ "G", "getrq" }, blk_log_generic }, + [__BLK_TA_SLEEPRQ] = {{ "S", "sleeprq" }, blk_log_generic }, + [__BLK_TA_REQUEUE] = {{ "R", "requeue" }, blk_log_with_error }, + [__BLK_TA_ISSUE] = {{ "D", "issue" }, blk_log_generic }, + [__BLK_TA_COMPLETE] = {{ "C", "complete" }, blk_log_with_error }, + [__BLK_TA_PLUG] = {{ "P", "plug" }, blk_log_plug }, + [__BLK_TA_UNPLUG_IO] = {{ "U", "unplug_io" }, blk_log_unplug }, + [__BLK_TA_UNPLUG_TIMER] = {{ "UT", "unplug_timer" }, blk_log_unplug }, + [__BLK_TA_INSERT] = {{ "I", "insert" }, blk_log_generic }, + [__BLK_TA_SPLIT] = {{ "X", "split" }, blk_log_split }, + [__BLK_TA_BOUNCE] = {{ "B", "bounce" }, blk_log_generic }, + [__BLK_TA_REMAP] = {{ "A", "remap" }, blk_log_remap }, +}; + +static enum print_line_t print_one_line(struct trace_iterator *iter, + bool classic) +{ + struct trace_array *tr = iter->tr; + struct trace_seq *s = &iter->seq; + const struct blk_io_trace *t; + u16 what; + bool long_act; + blk_log_action_t *log_action; + bool has_cg; + + t = te_blk_io_trace(iter->ent); + what = (t->action & ((1 << BLK_TC_SHIFT) - 1)) & ~__BLK_TA_CGROUP; + long_act = !!(tr->trace_flags & TRACE_ITER_VERBOSE); + log_action = classic ? &blk_log_action_classic : &blk_log_action; + has_cg = t->action & __BLK_TA_CGROUP; + + if ((t->action & ~__BLK_TN_CGROUP) == BLK_TN_MESSAGE) { + log_action(iter, long_act ? 
"message" : "m", has_cg); + blk_log_msg(s, iter->ent, has_cg); + return trace_handle_return(s); + } + + if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act))) + trace_seq_printf(s, "Unknown action %x\n", what); + else { + log_action(iter, what2act[what].act[long_act], has_cg); + what2act[what].print(s, iter->ent, has_cg); + } + + return trace_handle_return(s); +} + +static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, + int flags, struct trace_event *event) +{ + return print_one_line(iter, false); +} + +static void blk_trace_synthesize_old_trace(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + struct blk_io_trace *t = (struct blk_io_trace *)iter->ent; + const int offset = offsetof(struct blk_io_trace, sector); + struct blk_io_trace old = { + .magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION, + .time = iter->ts, + }; + + trace_seq_putmem(s, &old, offset); + trace_seq_putmem(s, &t->sector, + sizeof(old) - offset + t->pdu_len); +} + +static enum print_line_t +blk_trace_event_print_binary(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + blk_trace_synthesize_old_trace(iter); + + return trace_handle_return(&iter->seq); +} + +static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter) +{ + if ((iter->ent->type != TRACE_BLK) || + !(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC)) + return TRACE_TYPE_UNHANDLED; + + return print_one_line(iter, true); +} + +static int +blk_tracer_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) +{ + /* don't output context-info for blk_classic output */ + if (bit == TRACE_BLK_OPT_CLASSIC) { + if (set) + tr->trace_flags &= ~TRACE_ITER_CONTEXT_INFO; + else + tr->trace_flags |= TRACE_ITER_CONTEXT_INFO; + } + return 0; +} + +static struct tracer blk_tracer __read_mostly = { + .name = "blk", + .init = blk_tracer_init, + .reset = blk_tracer_reset, + .start = blk_tracer_start, + .stop = blk_tracer_stop, + .print_header = blk_tracer_print_header, + .print_line = blk_tracer_print_line, + .flags = &blk_tracer_flags, + .set_flag = blk_tracer_set_flag, +}; + +static struct trace_event_functions trace_blk_event_funcs = { + .trace = blk_trace_event_print, + .binary = blk_trace_event_print_binary, +}; + +static struct trace_event trace_blk_event = { + .type = TRACE_BLK, + .funcs = &trace_blk_event_funcs, +}; + +static int __init init_blk_tracer(void) +{ + if (!register_trace_event(&trace_blk_event)) { + pr_warn("Warning: could not register block events\n"); + return 1; + } + + if (register_tracer(&blk_tracer) != 0) { + pr_warn("Warning: could not register the block tracer\n"); + unregister_trace_event(&trace_blk_event); + return 1; + } + + return 0; +} + +device_initcall(init_blk_tracer); + +static int blk_trace_remove_queue(struct request_queue *q) +{ + struct blk_trace *bt; + + bt = rcu_replace_pointer(q->blk_trace, NULL, + lockdep_is_held(&q->debugfs_mutex)); + if (bt == NULL) + return -EINVAL; + + blk_trace_stop(bt); + + put_probe_ref(); + synchronize_rcu(); + blk_trace_free(q, bt); + return 0; +} + +/* + * Setup everything required to start tracing + */ +static int blk_trace_setup_queue(struct request_queue *q, + struct block_device *bdev) +{ + struct blk_trace *bt = NULL; + int ret = -ENOMEM; + + bt = kzalloc(sizeof(*bt), GFP_KERNEL); + if (!bt) + return -ENOMEM; + + bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG, __alignof__(char)); + if (!bt->msg_data) + goto free_bt; + + bt->dev = bdev->bd_dev; + bt->act_mask = (u16)-1; + + blk_trace_setup_lba(bt, bdev); + + 
rcu_assign_pointer(q->blk_trace, bt); + get_probe_ref(); + return 0; + +free_bt: + blk_trace_free(q, bt); + return ret; +} + +/* + * sysfs interface to enable and configure tracing + */ + +static ssize_t sysfs_blk_trace_attr_show(struct device *dev, + struct device_attribute *attr, + char *buf); +static ssize_t sysfs_blk_trace_attr_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count); +#define BLK_TRACE_DEVICE_ATTR(_name) \ + DEVICE_ATTR(_name, S_IRUGO | S_IWUSR, \ + sysfs_blk_trace_attr_show, \ + sysfs_blk_trace_attr_store) + +static BLK_TRACE_DEVICE_ATTR(enable); +static BLK_TRACE_DEVICE_ATTR(act_mask); +static BLK_TRACE_DEVICE_ATTR(pid); +static BLK_TRACE_DEVICE_ATTR(start_lba); +static BLK_TRACE_DEVICE_ATTR(end_lba); + +static struct attribute *blk_trace_attrs[] = { + &dev_attr_enable.attr, + &dev_attr_act_mask.attr, + &dev_attr_pid.attr, + &dev_attr_start_lba.attr, + &dev_attr_end_lba.attr, + NULL +}; + +struct attribute_group blk_trace_attr_group = { + .name = "trace", + .attrs = blk_trace_attrs, +}; + +static const struct { + int mask; + const char *str; +} mask_maps[] = { + { BLK_TC_READ, "read" }, + { BLK_TC_WRITE, "write" }, + { BLK_TC_FLUSH, "flush" }, + { BLK_TC_SYNC, "sync" }, + { BLK_TC_QUEUE, "queue" }, + { BLK_TC_REQUEUE, "requeue" }, + { BLK_TC_ISSUE, "issue" }, + { BLK_TC_COMPLETE, "complete" }, + { BLK_TC_FS, "fs" }, + { BLK_TC_PC, "pc" }, + { BLK_TC_NOTIFY, "notify" }, + { BLK_TC_AHEAD, "ahead" }, + { BLK_TC_META, "meta" }, + { BLK_TC_DISCARD, "discard" }, + { BLK_TC_DRV_DATA, "drv_data" }, + { BLK_TC_FUA, "fua" }, +}; + +static int blk_trace_str2mask(const char *str) +{ + int i; + int mask = 0; + char *buf, *s, *token; + + buf = kstrdup(str, GFP_KERNEL); + if (buf == NULL) + return -ENOMEM; + s = strstrip(buf); + + while (1) { + token = strsep(&s, ","); + if (token == NULL) + break; + + if (*token == '\0') + continue; + + for (i = 0; i < ARRAY_SIZE(mask_maps); i++) { + if (strcasecmp(token, mask_maps[i].str) == 0) { + mask |= mask_maps[i].mask; + break; + } + } + if (i == ARRAY_SIZE(mask_maps)) { + mask = -EINVAL; + break; + } + } + kfree(buf); + + return mask; +} + +static ssize_t blk_trace_mask2str(char *buf, int mask) +{ + int i; + char *p = buf; + + for (i = 0; i < ARRAY_SIZE(mask_maps); i++) { + if (mask & mask_maps[i].mask) { + p += sprintf(p, "%s%s", + (p == buf) ? 
"" : ",", mask_maps[i].str); + } + } + *p++ = '\n'; + + return p - buf; +} + +static ssize_t sysfs_blk_trace_attr_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct block_device *bdev = dev_to_bdev(dev); + struct request_queue *q = bdev_get_queue(bdev); + struct blk_trace *bt; + ssize_t ret = -ENXIO; + + mutex_lock(&q->debugfs_mutex); + + bt = rcu_dereference_protected(q->blk_trace, + lockdep_is_held(&q->debugfs_mutex)); + if (attr == &dev_attr_enable) { + ret = sprintf(buf, "%u\n", !!bt); + goto out_unlock_bdev; + } + + if (bt == NULL) + ret = sprintf(buf, "disabled\n"); + else if (attr == &dev_attr_act_mask) + ret = blk_trace_mask2str(buf, bt->act_mask); + else if (attr == &dev_attr_pid) + ret = sprintf(buf, "%u\n", bt->pid); + else if (attr == &dev_attr_start_lba) + ret = sprintf(buf, "%llu\n", bt->start_lba); + else if (attr == &dev_attr_end_lba) + ret = sprintf(buf, "%llu\n", bt->end_lba); + +out_unlock_bdev: + mutex_unlock(&q->debugfs_mutex); + return ret; +} + +static ssize_t sysfs_blk_trace_attr_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct block_device *bdev = dev_to_bdev(dev); + struct request_queue *q = bdev_get_queue(bdev); + struct blk_trace *bt; + u64 value; + ssize_t ret = -EINVAL; + + if (count == 0) + goto out; + + if (attr == &dev_attr_act_mask) { + if (kstrtoull(buf, 0, &value)) { + /* Assume it is a list of trace category names */ + ret = blk_trace_str2mask(buf); + if (ret < 0) + goto out; + value = ret; + } + } else { + if (kstrtoull(buf, 0, &value)) + goto out; + } + + mutex_lock(&q->debugfs_mutex); + + bt = rcu_dereference_protected(q->blk_trace, + lockdep_is_held(&q->debugfs_mutex)); + if (attr == &dev_attr_enable) { + if (!!value == !!bt) { + ret = 0; + goto out_unlock_bdev; + } + if (value) + ret = blk_trace_setup_queue(q, bdev); + else + ret = blk_trace_remove_queue(q); + goto out_unlock_bdev; + } + + ret = 0; + if (bt == NULL) { + ret = blk_trace_setup_queue(q, bdev); + bt = rcu_dereference_protected(q->blk_trace, + lockdep_is_held(&q->debugfs_mutex)); + } + + if (ret == 0) { + if (attr == &dev_attr_act_mask) + bt->act_mask = value; + else if (attr == &dev_attr_pid) + bt->pid = value; + else if (attr == &dev_attr_start_lba) + bt->start_lba = value; + else if (attr == &dev_attr_end_lba) + bt->end_lba = value; + } + +out_unlock_bdev: + mutex_unlock(&q->debugfs_mutex); +out: + return ret ? ret : count; +} +#endif /* CONFIG_BLK_DEV_IO_TRACE */ + +#ifdef CONFIG_EVENT_TRACING + +/** + * blk_fill_rwbs - Fill the buffer rwbs by mapping op to character string. + * @rwbs: buffer to be filled + * @opf: request operation type (REQ_OP_XXX) and flags for the tracepoint + * + * Description: + * Maps each request operation and flag to a single character and fills the + * buffer provided by the caller with resulting string. 
+ * + **/ +void blk_fill_rwbs(char *rwbs, blk_opf_t opf) +{ + int i = 0; + + if (opf & REQ_PREFLUSH) + rwbs[i++] = 'F'; + + switch (opf & REQ_OP_MASK) { + case REQ_OP_WRITE: + rwbs[i++] = 'W'; + break; + case REQ_OP_DISCARD: + rwbs[i++] = 'D'; + break; + case REQ_OP_SECURE_ERASE: + rwbs[i++] = 'D'; + rwbs[i++] = 'E'; + break; + case REQ_OP_FLUSH: + rwbs[i++] = 'F'; + break; + case REQ_OP_READ: + rwbs[i++] = 'R'; + break; + default: + rwbs[i++] = 'N'; + } + + if (opf & REQ_FUA) + rwbs[i++] = 'F'; + if (opf & REQ_RAHEAD) + rwbs[i++] = 'A'; + if (opf & REQ_SYNC) + rwbs[i++] = 'S'; + if (opf & REQ_META) + rwbs[i++] = 'M'; + + rwbs[i] = '\0'; +} +EXPORT_SYMBOL_GPL(blk_fill_rwbs); + +#endif /* CONFIG_EVENT_TRACING */ + diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c new file mode 100644 index 0000000000..1d76f3b014 --- /dev/null +++ b/kernel/trace/bpf_trace.c @@ -0,0 +1,3327 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com + * Copyright (c) 2016 Facebook + */ +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/bpf.h> +#include <linux/bpf_verifier.h> +#include <linux/bpf_perf_event.h> +#include <linux/btf.h> +#include <linux/filter.h> +#include <linux/uaccess.h> +#include <linux/ctype.h> +#include <linux/kprobes.h> +#include <linux/spinlock.h> +#include <linux/syscalls.h> +#include <linux/error-injection.h> +#include <linux/btf_ids.h> +#include <linux/bpf_lsm.h> +#include <linux/fprobe.h> +#include <linux/bsearch.h> +#include <linux/sort.h> +#include <linux/key.h> +#include <linux/verification.h> +#include <linux/namei.h> + +#include <net/bpf_sk_storage.h> + +#include <uapi/linux/bpf.h> +#include <uapi/linux/btf.h> + +#include <asm/tlb.h> + +#include "trace_probe.h" +#include "trace.h" + +#define CREATE_TRACE_POINTS +#include "bpf_trace.h" + +#define bpf_event_rcu_dereference(p) \ + rcu_dereference_protected(p, lockdep_is_held(&bpf_event_mutex)) + +#define MAX_UPROBE_MULTI_CNT (1U << 20) +#define MAX_KPROBE_MULTI_CNT (1U << 20) + +#ifdef CONFIG_MODULES +struct bpf_trace_module { + struct module *module; + struct list_head list; +}; + +static LIST_HEAD(bpf_trace_modules); +static DEFINE_MUTEX(bpf_module_mutex); + +static struct bpf_raw_event_map *bpf_get_raw_tracepoint_module(const char *name) +{ + struct bpf_raw_event_map *btp, *ret = NULL; + struct bpf_trace_module *btm; + unsigned int i; + + mutex_lock(&bpf_module_mutex); + list_for_each_entry(btm, &bpf_trace_modules, list) { + for (i = 0; i < btm->module->num_bpf_raw_events; ++i) { + btp = &btm->module->bpf_raw_events[i]; + if (!strcmp(btp->tp->name, name)) { + if (try_module_get(btm->module)) + ret = btp; + goto out; + } + } + } +out: + mutex_unlock(&bpf_module_mutex); + return ret; +} +#else +static struct bpf_raw_event_map *bpf_get_raw_tracepoint_module(const char *name) +{ + return NULL; +} +#endif /* CONFIG_MODULES */ + +u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); +u64 bpf_get_stack(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); + +static int bpf_btf_printf_prepare(struct btf_ptr *ptr, u32 btf_ptr_size, + u64 flags, const struct btf **btf, + s32 *btf_id); +static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx); +static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx); + +static u64 bpf_uprobe_multi_cookie(struct bpf_run_ctx *ctx); +static u64 bpf_uprobe_multi_entry_ip(struct bpf_run_ctx *ctx); + +/** + * trace_call_bpf - invoke BPF program + * @call: tracepoint event + * @ctx: opaque context pointer + * + * 
kprobe handlers execute BPF programs via this helper. + * Can be used from static tracepoints in the future. + * + * Return: BPF programs always return an integer which is interpreted by + * kprobe handler as: + * 0 - return from kprobe (event is filtered out) + * 1 - store kprobe event into ring buffer + * Other values are reserved and currently alias to 1 + */ +unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) +{ + unsigned int ret; + + cant_sleep(); + + if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) { + /* + * since some bpf program is already running on this cpu, + * don't call into another bpf program (same or different) + * and don't send kprobe event into ring-buffer, + * so return zero here + */ + ret = 0; + goto out; + } + + /* + * Instead of moving rcu_read_lock/rcu_dereference/rcu_read_unlock + * to all call sites, we did a bpf_prog_array_valid() there to check + * whether call->prog_array is empty or not, which is + * a heuristic to speed up execution. + * + * If bpf_prog_array_valid() fetched prog_array was + * non-NULL, we go into trace_call_bpf() and do the actual + * proper rcu_dereference() under RCU lock. + * If it turns out that prog_array is NULL then, we bail out. + * For the opposite, if the bpf_prog_array_valid() fetched pointer + * was NULL, you'll skip the prog_array with the risk of missing + * out of events when it was updated in between this and the + * rcu_dereference() which is accepted risk. + */ + rcu_read_lock(); + ret = bpf_prog_run_array(rcu_dereference(call->prog_array), + ctx, bpf_prog_run); + rcu_read_unlock(); + + out: + __this_cpu_dec(bpf_prog_active); + + return ret; +} + +#ifdef CONFIG_BPF_KPROBE_OVERRIDE +BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc) +{ + regs_set_return_value(regs, rc); + override_function_with_return(regs); + return 0; +} + +static const struct bpf_func_proto bpf_override_return_proto = { + .func = bpf_override_return, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; +#endif + +static __always_inline int +bpf_probe_read_user_common(void *dst, u32 size, const void __user *unsafe_ptr) +{ + int ret; + + ret = copy_from_user_nofault(dst, unsafe_ptr, size); + if (unlikely(ret < 0)) + memset(dst, 0, size); + return ret; +} + +BPF_CALL_3(bpf_probe_read_user, void *, dst, u32, size, + const void __user *, unsafe_ptr) +{ + return bpf_probe_read_user_common(dst, size, unsafe_ptr); +} + +const struct bpf_func_proto bpf_probe_read_user_proto = { + .func = bpf_probe_read_user, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, + .arg3_type = ARG_ANYTHING, +}; + +static __always_inline int +bpf_probe_read_user_str_common(void *dst, u32 size, + const void __user *unsafe_ptr) +{ + int ret; + + /* + * NB: We rely on strncpy_from_user() not copying junk past the NUL + * terminator into `dst`. + * + * strncpy_from_user() does long-sized strides in the fast path. If the + * strncpy does not mask out the bytes after the NUL in `unsafe_ptr`, + * then there could be junk after the NUL in `dst`. If user takes `dst` + * and keys a hash map with it, then semantically identical strings can + * occupy multiple entries in the map. 
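+ *
+ * Hypothetical illustration: with size == 8 and the user string "abc", an
+ * unmasked fast path could leave dst as "abc\0WXYZ" on one call and
+ * "abc\0PQRS" on another; a hash map keyed on all 8 bytes would then hold
+ * two entries for what is logically the same string.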
+ */ + ret = strncpy_from_user_nofault(dst, unsafe_ptr, size); + if (unlikely(ret < 0)) + memset(dst, 0, size); + return ret; +} + +BPF_CALL_3(bpf_probe_read_user_str, void *, dst, u32, size, + const void __user *, unsafe_ptr) +{ + return bpf_probe_read_user_str_common(dst, size, unsafe_ptr); +} + +const struct bpf_func_proto bpf_probe_read_user_str_proto = { + .func = bpf_probe_read_user_str, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, + .arg3_type = ARG_ANYTHING, +}; + +BPF_CALL_3(bpf_probe_read_kernel, void *, dst, u32, size, + const void *, unsafe_ptr) +{ + return bpf_probe_read_kernel_common(dst, size, unsafe_ptr); +} + +const struct bpf_func_proto bpf_probe_read_kernel_proto = { + .func = bpf_probe_read_kernel, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, + .arg3_type = ARG_ANYTHING, +}; + +static __always_inline int +bpf_probe_read_kernel_str_common(void *dst, u32 size, const void *unsafe_ptr) +{ + int ret; + + /* + * The strncpy_from_kernel_nofault() call will likely not fill the + * entire buffer, but that's okay in this circumstance as we're probing + * arbitrary memory anyway similar to bpf_probe_read_*() and might + * as well probe the stack. Thus, memory is explicitly cleared + * only in error case, so that improper users ignoring return + * code altogether don't copy garbage; otherwise length of string + * is returned that can be used for bpf_perf_event_output() et al. + */ + ret = strncpy_from_kernel_nofault(dst, unsafe_ptr, size); + if (unlikely(ret < 0)) + memset(dst, 0, size); + return ret; +} + +BPF_CALL_3(bpf_probe_read_kernel_str, void *, dst, u32, size, + const void *, unsafe_ptr) +{ + return bpf_probe_read_kernel_str_common(dst, size, unsafe_ptr); +} + +const struct bpf_func_proto bpf_probe_read_kernel_str_proto = { + .func = bpf_probe_read_kernel_str, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, + .arg3_type = ARG_ANYTHING, +}; + +#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE +BPF_CALL_3(bpf_probe_read_compat, void *, dst, u32, size, + const void *, unsafe_ptr) +{ + if ((unsigned long)unsafe_ptr < TASK_SIZE) { + return bpf_probe_read_user_common(dst, size, + (__force void __user *)unsafe_ptr); + } + return bpf_probe_read_kernel_common(dst, size, unsafe_ptr); +} + +static const struct bpf_func_proto bpf_probe_read_compat_proto = { + .func = bpf_probe_read_compat, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, + .arg3_type = ARG_ANYTHING, +}; + +BPF_CALL_3(bpf_probe_read_compat_str, void *, dst, u32, size, + const void *, unsafe_ptr) +{ + if ((unsigned long)unsafe_ptr < TASK_SIZE) { + return bpf_probe_read_user_str_common(dst, size, + (__force void __user *)unsafe_ptr); + } + return bpf_probe_read_kernel_str_common(dst, size, unsafe_ptr); +} + +static const struct bpf_func_proto bpf_probe_read_compat_str_proto = { + .func = bpf_probe_read_compat_str, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, + .arg3_type = ARG_ANYTHING, +}; +#endif /* CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE */ + +BPF_CALL_3(bpf_probe_write_user, void __user *, unsafe_ptr, const void *, src, + u32, size) +{ + /* + * Ensure we're in user context which is safe for the helper to + * run. 
This helper has no business in a kthread. + * + * access_ok() should prevent writing to non-user memory, but in + * some situations (nommu, temporary switch, etc) access_ok() does + * not provide enough validation, hence the check on KERNEL_DS. + * + * nmi_uaccess_okay() ensures the probe is not run in an interim + * state, when the task or mm are switched. This is specifically + * required to prevent the use of temporary mm. + */ + + if (unlikely(in_interrupt() || + current->flags & (PF_KTHREAD | PF_EXITING))) + return -EPERM; + if (unlikely(!nmi_uaccess_okay())) + return -EPERM; + + return copy_to_user_nofault(unsafe_ptr, src, size); +} + +static const struct bpf_func_proto bpf_probe_write_user_proto = { + .func = bpf_probe_write_user, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE, +}; + +static const struct bpf_func_proto *bpf_get_probe_write_proto(void) +{ + if (!capable(CAP_SYS_ADMIN)) + return NULL; + + pr_warn_ratelimited("%s[%d] is installing a program with bpf_probe_write_user helper that may corrupt user memory!", + current->comm, task_pid_nr(current)); + + return &bpf_probe_write_user_proto; +} + +#define MAX_TRACE_PRINTK_VARARGS 3 +#define BPF_TRACE_PRINTK_SIZE 1024 + +BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, + u64, arg2, u64, arg3) +{ + u64 args[MAX_TRACE_PRINTK_VARARGS] = { arg1, arg2, arg3 }; + struct bpf_bprintf_data data = { + .get_bin_args = true, + .get_buf = true, + }; + int ret; + + ret = bpf_bprintf_prepare(fmt, fmt_size, args, + MAX_TRACE_PRINTK_VARARGS, &data); + if (ret < 0) + return ret; + + ret = bstr_printf(data.buf, MAX_BPRINTF_BUF, fmt, data.bin_args); + + trace_bpf_trace_printk(data.buf); + + bpf_bprintf_cleanup(&data); + + return ret; +} + +static const struct bpf_func_proto bpf_trace_printk_proto = { + .func = bpf_trace_printk, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg2_type = ARG_CONST_SIZE, +}; + +static void __set_printk_clr_event(void) +{ + /* + * This program might be calling bpf_trace_printk, + * so enable the associated bpf_trace/bpf_trace_printk event. + * Repeat this each time as it is possible a user has + * disabled bpf_trace_printk events. By loading a program + * calling bpf_trace_printk() however the user has expressed + * the intent to see such events. 
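+ *
+ * Observable effect (a sketch; the tracefs mount point may differ): once a
+ * program using bpf_trace_printk() is loaded, its output is delivered via
+ * the bpf_trace:bpf_trace_printk event and can be read from
+ *
+ *   /sys/kernel/tracing/trace_pipe
+ *
+ * even if that event had been disabled beforehand.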
+ */ + if (trace_set_clr_event("bpf_trace", "bpf_trace_printk", 1)) + pr_warn_ratelimited("could not enable bpf_trace_printk events"); +} + +const struct bpf_func_proto *bpf_get_trace_printk_proto(void) +{ + __set_printk_clr_event(); + return &bpf_trace_printk_proto; +} + +BPF_CALL_4(bpf_trace_vprintk, char *, fmt, u32, fmt_size, const void *, args, + u32, data_len) +{ + struct bpf_bprintf_data data = { + .get_bin_args = true, + .get_buf = true, + }; + int ret, num_args; + + if (data_len & 7 || data_len > MAX_BPRINTF_VARARGS * 8 || + (data_len && !args)) + return -EINVAL; + num_args = data_len / 8; + + ret = bpf_bprintf_prepare(fmt, fmt_size, args, num_args, &data); + if (ret < 0) + return ret; + + ret = bstr_printf(data.buf, MAX_BPRINTF_BUF, fmt, data.bin_args); + + trace_bpf_trace_printk(data.buf); + + bpf_bprintf_cleanup(&data); + + return ret; +} + +static const struct bpf_func_proto bpf_trace_vprintk_proto = { + .func = bpf_trace_vprintk, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg2_type = ARG_CONST_SIZE, + .arg3_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, + .arg4_type = ARG_CONST_SIZE_OR_ZERO, +}; + +const struct bpf_func_proto *bpf_get_trace_vprintk_proto(void) +{ + __set_printk_clr_event(); + return &bpf_trace_vprintk_proto; +} + +BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size, + const void *, args, u32, data_len) +{ + struct bpf_bprintf_data data = { + .get_bin_args = true, + }; + int err, num_args; + + if (data_len & 7 || data_len > MAX_BPRINTF_VARARGS * 8 || + (data_len && !args)) + return -EINVAL; + num_args = data_len / 8; + + err = bpf_bprintf_prepare(fmt, fmt_size, args, num_args, &data); + if (err < 0) + return err; + + seq_bprintf(m, fmt, data.bin_args); + + bpf_bprintf_cleanup(&data); + + return seq_has_overflowed(m) ? -EOVERFLOW : 0; +} + +BTF_ID_LIST_SINGLE(btf_seq_file_ids, struct, seq_file) + +static const struct bpf_func_proto bpf_seq_printf_proto = { + .func = bpf_seq_printf, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_seq_file_ids[0], + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, +}; + +BPF_CALL_3(bpf_seq_write, struct seq_file *, m, const void *, data, u32, len) +{ + return seq_write(m, data, len) ? 
-EOVERFLOW : 0; +} + +static const struct bpf_func_proto bpf_seq_write_proto = { + .func = bpf_seq_write, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_seq_file_ids[0], + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, +}; + +BPF_CALL_4(bpf_seq_printf_btf, struct seq_file *, m, struct btf_ptr *, ptr, + u32, btf_ptr_size, u64, flags) +{ + const struct btf *btf; + s32 btf_id; + int ret; + + ret = bpf_btf_printf_prepare(ptr, btf_ptr_size, flags, &btf, &btf_id); + if (ret) + return ret; + + return btf_type_seq_show_flags(btf, btf_id, ptr->ptr, m, flags); +} + +static const struct bpf_func_proto bpf_seq_printf_btf_proto = { + .func = bpf_seq_printf_btf, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_seq_file_ids[0], + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, +}; + +static __always_inline int +get_map_perf_counter(struct bpf_map *map, u64 flags, + u64 *value, u64 *enabled, u64 *running) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + unsigned int cpu = smp_processor_id(); + u64 index = flags & BPF_F_INDEX_MASK; + struct bpf_event_entry *ee; + + if (unlikely(flags & ~(BPF_F_INDEX_MASK))) + return -EINVAL; + if (index == BPF_F_CURRENT_CPU) + index = cpu; + if (unlikely(index >= array->map.max_entries)) + return -E2BIG; + + ee = READ_ONCE(array->ptrs[index]); + if (!ee) + return -ENOENT; + + return perf_event_read_local(ee->event, value, enabled, running); +} + +BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags) +{ + u64 value = 0; + int err; + + err = get_map_perf_counter(map, flags, &value, NULL, NULL); + /* + * this api is ugly since we miss [-22..-2] range of valid + * counter values, but that's uapi + */ + if (err) + return err; + return value; +} + +static const struct bpf_func_proto bpf_perf_event_read_proto = { + .func = bpf_perf_event_read, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_perf_event_read_value, struct bpf_map *, map, u64, flags, + struct bpf_perf_event_value *, buf, u32, size) +{ + int err = -EINVAL; + + if (unlikely(size != sizeof(struct bpf_perf_event_value))) + goto clear; + err = get_map_perf_counter(map, flags, &buf->counter, &buf->enabled, + &buf->running); + if (unlikely(err)) + goto clear; + return 0; +clear: + memset(buf, 0, size); + return err; +} + +static const struct bpf_func_proto bpf_perf_event_read_value_proto = { + .func = bpf_perf_event_read_value, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg4_type = ARG_CONST_SIZE, +}; + +static __always_inline u64 +__bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, + u64 flags, struct perf_sample_data *sd) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + unsigned int cpu = smp_processor_id(); + u64 index = flags & BPF_F_INDEX_MASK; + struct bpf_event_entry *ee; + struct perf_event *event; + + if (index == BPF_F_CURRENT_CPU) + index = cpu; + if (unlikely(index >= array->map.max_entries)) + return -E2BIG; + + ee = READ_ONCE(array->ptrs[index]); + if (!ee) + return -ENOENT; + + event = ee->event; + if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE || + event->attr.config != PERF_COUNT_SW_BPF_OUTPUT)) + return -EINVAL; + + if 
(unlikely(event->oncpu != cpu)) + return -EOPNOTSUPP; + + return perf_event_output(event, sd, regs); +} + +/* + * Support executing tracepoints in normal, irq, and nmi context that each call + * bpf_perf_event_output + */ +struct bpf_trace_sample_data { + struct perf_sample_data sds[3]; +}; + +static DEFINE_PER_CPU(struct bpf_trace_sample_data, bpf_trace_sds); +static DEFINE_PER_CPU(int, bpf_trace_nest_level); +BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, + u64, flags, void *, data, u64, size) +{ + struct bpf_trace_sample_data *sds; + struct perf_raw_record raw = { + .frag = { + .size = size, + .data = data, + }, + }; + struct perf_sample_data *sd; + int nest_level, err; + + preempt_disable(); + sds = this_cpu_ptr(&bpf_trace_sds); + nest_level = this_cpu_inc_return(bpf_trace_nest_level); + + if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(sds->sds))) { + err = -EBUSY; + goto out; + } + + sd = &sds->sds[nest_level - 1]; + + if (unlikely(flags & ~(BPF_F_INDEX_MASK))) { + err = -EINVAL; + goto out; + } + + perf_sample_data_init(sd, 0, 0); + perf_sample_save_raw_data(sd, &raw); + + err = __bpf_perf_event_output(regs, map, flags, sd); +out: + this_cpu_dec(bpf_trace_nest_level); + preempt_enable(); + return err; +} + +static const struct bpf_func_proto bpf_perf_event_output_proto = { + .func = bpf_perf_event_output, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, +}; + +static DEFINE_PER_CPU(int, bpf_event_output_nest_level); +struct bpf_nested_pt_regs { + struct pt_regs regs[3]; +}; +static DEFINE_PER_CPU(struct bpf_nested_pt_regs, bpf_pt_regs); +static DEFINE_PER_CPU(struct bpf_trace_sample_data, bpf_misc_sds); + +u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, + void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy) +{ + struct perf_raw_frag frag = { + .copy = ctx_copy, + .size = ctx_size, + .data = ctx, + }; + struct perf_raw_record raw = { + .frag = { + { + .next = ctx_size ? 
&frag : NULL, + }, + .size = meta_size, + .data = meta, + }, + }; + struct perf_sample_data *sd; + struct pt_regs *regs; + int nest_level; + u64 ret; + + preempt_disable(); + nest_level = this_cpu_inc_return(bpf_event_output_nest_level); + + if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(bpf_misc_sds.sds))) { + ret = -EBUSY; + goto out; + } + sd = this_cpu_ptr(&bpf_misc_sds.sds[nest_level - 1]); + regs = this_cpu_ptr(&bpf_pt_regs.regs[nest_level - 1]); + + perf_fetch_caller_regs(regs); + perf_sample_data_init(sd, 0, 0); + perf_sample_save_raw_data(sd, &raw); + + ret = __bpf_perf_event_output(regs, map, flags, sd); +out: + this_cpu_dec(bpf_event_output_nest_level); + preempt_enable(); + return ret; +} + +BPF_CALL_0(bpf_get_current_task) +{ + return (long) current; +} + +const struct bpf_func_proto bpf_get_current_task_proto = { + .func = bpf_get_current_task, + .gpl_only = true, + .ret_type = RET_INTEGER, +}; + +BPF_CALL_0(bpf_get_current_task_btf) +{ + return (unsigned long) current; +} + +const struct bpf_func_proto bpf_get_current_task_btf_proto = { + .func = bpf_get_current_task_btf, + .gpl_only = true, + .ret_type = RET_PTR_TO_BTF_ID_TRUSTED, + .ret_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], +}; + +BPF_CALL_1(bpf_task_pt_regs, struct task_struct *, task) +{ + return (unsigned long) task_pt_regs(task); +} + +BTF_ID_LIST(bpf_task_pt_regs_ids) +BTF_ID(struct, pt_regs) + +const struct bpf_func_proto bpf_task_pt_regs_proto = { + .func = bpf_task_pt_regs, + .gpl_only = true, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], + .ret_type = RET_PTR_TO_BTF_ID, + .ret_btf_id = &bpf_task_pt_regs_ids[0], +}; + +BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + struct cgroup *cgrp; + + if (unlikely(idx >= array->map.max_entries)) + return -E2BIG; + + cgrp = READ_ONCE(array->ptrs[idx]); + if (unlikely(!cgrp)) + return -EAGAIN; + + return task_under_cgroup_hierarchy(current, cgrp); +} + +static const struct bpf_func_proto bpf_current_task_under_cgroup_proto = { + .func = bpf_current_task_under_cgroup, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_ANYTHING, +}; + +struct send_signal_irq_work { + struct irq_work irq_work; + struct task_struct *task; + u32 sig; + enum pid_type type; +}; + +static DEFINE_PER_CPU(struct send_signal_irq_work, send_signal_work); + +static void do_bpf_send_signal(struct irq_work *entry) +{ + struct send_signal_irq_work *work; + + work = container_of(entry, struct send_signal_irq_work, irq_work); + group_send_sig_info(work->sig, SEND_SIG_PRIV, work->task, work->type); + put_task_struct(work->task); +} + +static int bpf_send_signal_common(u32 sig, enum pid_type type) +{ + struct send_signal_irq_work *work = NULL; + + /* Similar to bpf_probe_write_user, task needs to be + * in a sound condition and kernel memory access be + * permitted in order to send signal to the current + * task. + */ + if (unlikely(current->flags & (PF_KTHREAD | PF_EXITING))) + return -EPERM; + if (unlikely(!nmi_uaccess_okay())) + return -EPERM; + /* Task should not be pid=1 to avoid kernel panic. */ + if (unlikely(is_global_init(current))) + return -EPERM; + + if (irqs_disabled()) { + /* Do an early check on signal validity. Otherwise, + * the error is lost in deferred irq_work. 
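+ *
+ * Illustrative consequence (signal numbers are examples): bpf_send_signal()
+ * with SIGUSR1 from hard-irq context only queues the irq_work and returns
+ * 0, whereas an out-of-range number such as _NSIG + 1 must be rejected
+ * with -EINVAL right here, because the deferred group_send_sig_info() has
+ * no way to report the failure back to the BPF program.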
+ */ + if (unlikely(!valid_signal(sig))) + return -EINVAL; + + work = this_cpu_ptr(&send_signal_work); + if (irq_work_is_busy(&work->irq_work)) + return -EBUSY; + + /* Add the current task, which is the target of sending signal, + * to the irq_work. The current task may change when queued + * irq works get executed. + */ + work->task = get_task_struct(current); + work->sig = sig; + work->type = type; + irq_work_queue(&work->irq_work); + return 0; + } + + return group_send_sig_info(sig, SEND_SIG_PRIV, current, type); +} + +BPF_CALL_1(bpf_send_signal, u32, sig) +{ + return bpf_send_signal_common(sig, PIDTYPE_TGID); +} + +static const struct bpf_func_proto bpf_send_signal_proto = { + .func = bpf_send_signal, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, +}; + +BPF_CALL_1(bpf_send_signal_thread, u32, sig) +{ + return bpf_send_signal_common(sig, PIDTYPE_PID); +} + +static const struct bpf_func_proto bpf_send_signal_thread_proto = { + .func = bpf_send_signal_thread, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, +}; + +BPF_CALL_3(bpf_d_path, struct path *, path, char *, buf, u32, sz) +{ + struct path copy; + long len; + char *p; + + if (!sz) + return 0; + + /* + * The path pointer is verified as trusted and safe to use, + * but let's double check it's valid anyway to workaround + * potentially broken verifier. + */ + len = copy_from_kernel_nofault(©, path, sizeof(*path)); + if (len < 0) + return len; + + p = d_path(©, buf, sz); + if (IS_ERR(p)) { + len = PTR_ERR(p); + } else { + len = buf + sz - p; + memmove(buf, p, len); + } + + return len; +} + +BTF_SET_START(btf_allowlist_d_path) +#ifdef CONFIG_SECURITY +BTF_ID(func, security_file_permission) +BTF_ID(func, security_inode_getattr) +BTF_ID(func, security_file_open) +#endif +#ifdef CONFIG_SECURITY_PATH +BTF_ID(func, security_path_truncate) +#endif +BTF_ID(func, vfs_truncate) +BTF_ID(func, vfs_fallocate) +BTF_ID(func, dentry_open) +BTF_ID(func, vfs_getattr) +BTF_ID(func, filp_close) +BTF_SET_END(btf_allowlist_d_path) + +static bool bpf_d_path_allowed(const struct bpf_prog *prog) +{ + if (prog->type == BPF_PROG_TYPE_TRACING && + prog->expected_attach_type == BPF_TRACE_ITER) + return true; + + if (prog->type == BPF_PROG_TYPE_LSM) + return bpf_lsm_is_sleepable_hook(prog->aux->attach_btf_id); + + return btf_id_set_contains(&btf_allowlist_d_path, + prog->aux->attach_btf_id); +} + +BTF_ID_LIST_SINGLE(bpf_d_path_btf_ids, struct, path) + +static const struct bpf_func_proto bpf_d_path_proto = { + .func = bpf_d_path, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &bpf_d_path_btf_ids[0], + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .allowed = bpf_d_path_allowed, +}; + +#define BTF_F_ALL (BTF_F_COMPACT | BTF_F_NONAME | \ + BTF_F_PTR_RAW | BTF_F_ZERO) + +static int bpf_btf_printf_prepare(struct btf_ptr *ptr, u32 btf_ptr_size, + u64 flags, const struct btf **btf, + s32 *btf_id) +{ + const struct btf_type *t; + + if (unlikely(flags & ~(BTF_F_ALL))) + return -EINVAL; + + if (btf_ptr_size != sizeof(struct btf_ptr)) + return -EINVAL; + + *btf = bpf_get_btf_vmlinux(); + + if (IS_ERR_OR_NULL(*btf)) + return IS_ERR(*btf) ? 
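bpf_d_path() above is gated by bpf_d_path_allowed(); an illustrative sketch of a caller attached to one of the allowlisted functions (security_file_open), assuming vmlinux.h plus libbpf's bpf_helpers.h/bpf_tracing.h as in the earlier sketch:

SEC("fentry/security_file_open")
int BPF_PROG(probe_file_open, struct file *file)
{
	char path[256];
	long n;

	/* Permitted because security_file_open is in btf_allowlist_d_path. */
	n = bpf_d_path(&file->f_path, path, sizeof(path));
	if (n > 0)
		bpf_printk("open: %s", path);
	return 0;
}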
PTR_ERR(*btf) : -EINVAL; + + if (ptr->type_id > 0) + *btf_id = ptr->type_id; + else + return -EINVAL; + + if (*btf_id > 0) + t = btf_type_by_id(*btf, *btf_id); + if (*btf_id <= 0 || !t) + return -ENOENT; + + return 0; +} + +BPF_CALL_5(bpf_snprintf_btf, char *, str, u32, str_size, struct btf_ptr *, ptr, + u32, btf_ptr_size, u64, flags) +{ + const struct btf *btf; + s32 btf_id; + int ret; + + ret = bpf_btf_printf_prepare(ptr, btf_ptr_size, flags, &btf, &btf_id); + if (ret) + return ret; + + return btf_type_snprintf_show(btf, btf_id, ptr->ptr, str, str_size, + flags); +} + +const struct bpf_func_proto bpf_snprintf_btf_proto = { + .func = bpf_snprintf_btf, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_CONST_SIZE, + .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg4_type = ARG_CONST_SIZE, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_1(bpf_get_func_ip_tracing, void *, ctx) +{ + /* This helper call is inlined by verifier. */ + return ((u64 *)ctx)[-2]; +} + +static const struct bpf_func_proto bpf_get_func_ip_proto_tracing = { + .func = bpf_get_func_ip_tracing, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +#ifdef CONFIG_X86_KERNEL_IBT +static unsigned long get_entry_ip(unsigned long fentry_ip) +{ + u32 instr; + + /* Being extra safe in here in case entry ip is on the page-edge. */ + if (get_kernel_nofault(instr, (u32 *) fentry_ip - 1)) + return fentry_ip; + if (is_endbr(instr)) + fentry_ip -= ENDBR_INSN_SIZE; + return fentry_ip; +} +#else +#define get_entry_ip(fentry_ip) fentry_ip +#endif + +BPF_CALL_1(bpf_get_func_ip_kprobe, struct pt_regs *, regs) +{ + struct bpf_trace_run_ctx *run_ctx __maybe_unused; + struct kprobe *kp; + +#ifdef CONFIG_UPROBES + run_ctx = container_of(current->bpf_ctx, struct bpf_trace_run_ctx, run_ctx); + if (run_ctx->is_uprobe) + return ((struct uprobe_dispatch_data *)current->utask->vaddr)->bp_addr; +#endif + + kp = kprobe_running(); + + if (!kp || !(kp->flags & KPROBE_FLAG_ON_FUNC_ENTRY)) + return 0; + + return get_entry_ip((uintptr_t)kp->addr); +} + +static const struct bpf_func_proto bpf_get_func_ip_proto_kprobe = { + .func = bpf_get_func_ip_kprobe, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +BPF_CALL_1(bpf_get_func_ip_kprobe_multi, struct pt_regs *, regs) +{ + return bpf_kprobe_multi_entry_ip(current->bpf_ctx); +} + +static const struct bpf_func_proto bpf_get_func_ip_proto_kprobe_multi = { + .func = bpf_get_func_ip_kprobe_multi, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +BPF_CALL_1(bpf_get_attach_cookie_kprobe_multi, struct pt_regs *, regs) +{ + return bpf_kprobe_multi_cookie(current->bpf_ctx); +} + +static const struct bpf_func_proto bpf_get_attach_cookie_proto_kmulti = { + .func = bpf_get_attach_cookie_kprobe_multi, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +BPF_CALL_1(bpf_get_func_ip_uprobe_multi, struct pt_regs *, regs) +{ + return bpf_uprobe_multi_entry_ip(current->bpf_ctx); +} + +static const struct bpf_func_proto bpf_get_func_ip_proto_uprobe_multi = { + .func = bpf_get_func_ip_uprobe_multi, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +BPF_CALL_1(bpf_get_attach_cookie_uprobe_multi, struct pt_regs *, regs) +{ + return bpf_uprobe_multi_cookie(current->bpf_ctx); +} + +static const struct bpf_func_proto bpf_get_attach_cookie_proto_umulti = { + .func = bpf_get_attach_cookie_uprobe_multi, + .gpl_only = false, + 
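bpf_btf_printf_prepare() above validates the btf_ptr and flag bits that callers of bpf_snprintf_btf() supply; a sketch of such a caller, with the task iterator attach point, buffer size and task_struct target chosen only for illustration (assumes vmlinux.h and libbpf's bpf_core_type_id_kernel() from bpf_core_read.h):

static char out[4096];

SEC("iter/task")
int dump_task_btf(struct bpf_iter__task *ctx)
{
	struct task_struct *task = ctx->task;
	struct btf_ptr ptr = {};

	if (!task)
		return 0;

	ptr.ptr = task;
	/* Resolved against vmlinux BTF at load time. */
	ptr.type_id = bpf_core_type_id_kernel(struct task_struct);

	/* The flags argument may also carry the BTF_F_* bits checked above. */
	bpf_snprintf_btf(out, sizeof(out), &ptr, sizeof(ptr), 0);
	return 0;
}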
.ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +BPF_CALL_1(bpf_get_attach_cookie_trace, void *, ctx) +{ + struct bpf_trace_run_ctx *run_ctx; + + run_ctx = container_of(current->bpf_ctx, struct bpf_trace_run_ctx, run_ctx); + return run_ctx->bpf_cookie; +} + +static const struct bpf_func_proto bpf_get_attach_cookie_proto_trace = { + .func = bpf_get_attach_cookie_trace, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +BPF_CALL_1(bpf_get_attach_cookie_pe, struct bpf_perf_event_data_kern *, ctx) +{ + return ctx->event->bpf_cookie; +} + +static const struct bpf_func_proto bpf_get_attach_cookie_proto_pe = { + .func = bpf_get_attach_cookie_pe, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +BPF_CALL_1(bpf_get_attach_cookie_tracing, void *, ctx) +{ + struct bpf_trace_run_ctx *run_ctx; + + run_ctx = container_of(current->bpf_ctx, struct bpf_trace_run_ctx, run_ctx); + return run_ctx->bpf_cookie; +} + +static const struct bpf_func_proto bpf_get_attach_cookie_proto_tracing = { + .func = bpf_get_attach_cookie_tracing, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +BPF_CALL_3(bpf_get_branch_snapshot, void *, buf, u32, size, u64, flags) +{ +#ifndef CONFIG_X86 + return -ENOENT; +#else + static const u32 br_entry_size = sizeof(struct perf_branch_entry); + u32 entry_cnt = size / br_entry_size; + + entry_cnt = static_call(perf_snapshot_branch_stack)(buf, entry_cnt); + + if (unlikely(flags)) + return -EINVAL; + + if (!entry_cnt) + return -ENOENT; + + return entry_cnt * br_entry_size; +#endif +} + +static const struct bpf_func_proto bpf_get_branch_snapshot_proto = { + .func = bpf_get_branch_snapshot, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, +}; + +BPF_CALL_3(get_func_arg, void *, ctx, u32, n, u64 *, value) +{ + /* This helper call is inlined by verifier. */ + u64 nr_args = ((u64 *)ctx)[-1]; + + if ((u64) n >= nr_args) + return -EINVAL; + *value = ((u64 *)ctx)[n]; + return 0; +} + +static const struct bpf_func_proto bpf_get_func_arg_proto = { + .func = get_func_arg, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_LONG, +}; + +BPF_CALL_2(get_func_ret, void *, ctx, u64 *, value) +{ + /* This helper call is inlined by verifier. */ + u64 nr_args = ((u64 *)ctx)[-1]; + + *value = ((u64 *)ctx)[nr_args]; + return 0; +} + +static const struct bpf_func_proto bpf_get_func_ret_proto = { + .func = get_func_ret, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_LONG, +}; + +BPF_CALL_1(get_func_arg_cnt, void *, ctx) +{ + /* This helper call is inlined by verifier. */ + return ((u64 *)ctx)[-1]; +} + +static const struct bpf_func_proto bpf_get_func_arg_cnt_proto = { + .func = get_func_arg_cnt, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +#ifdef CONFIG_KEYS +__diag_push(); +__diag_ignore_all("-Wmissing-prototypes", + "kfuncs which will be used in BPF programs"); + +/** + * bpf_lookup_user_key - lookup a key by its serial + * @serial: key handle serial number + * @flags: lookup-specific flags + * + * Search a key with a given *serial* and the provided *flags*. + * If found, increment the reference count of the key by one, and + * return it in the bpf_key structure. 
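bpf_get_func_arg(), bpf_get_func_ret() and bpf_get_func_arg_cnt() above are only handed out to trampoline-based programs (see tracing_prog_func_proto() further down); a sketch of an fexit program using them, with vfs_read as an arbitrary target:

SEC("fexit/vfs_read")
int BPF_PROG(on_vfs_read_exit, struct file *file, char *buf, size_t count,
	     loff_t *pos, ssize_t ret)
{
	__u64 nargs, arg0, retval;

	nargs = bpf_get_func_arg_cnt(ctx);	/* 4 for vfs_read */
	bpf_get_func_arg(ctx, 0, &arg0);	/* same pointer as 'file' */
	bpf_get_func_ret(ctx, &retval);		/* same value as 'ret' */

	bpf_printk("vfs_read: %llu args, ret %lld", nargs, (long long)retval);
	return 0;
}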
+ * + * The bpf_key structure must be passed to bpf_key_put() when done + * with it, so that the key reference count is decremented and the + * bpf_key structure is freed. + * + * Permission checks are deferred to the time the key is used by + * one of the available key-specific kfuncs. + * + * Set *flags* with KEY_LOOKUP_CREATE, to attempt creating a requested + * special keyring (e.g. session keyring), if it doesn't yet exist. + * Set *flags* with KEY_LOOKUP_PARTIAL, to lookup a key without waiting + * for the key construction, and to retrieve uninstantiated keys (keys + * without data attached to them). + * + * Return: a bpf_key pointer with a valid key pointer if the key is found, a + * NULL pointer otherwise. + */ +__bpf_kfunc struct bpf_key *bpf_lookup_user_key(u32 serial, u64 flags) +{ + key_ref_t key_ref; + struct bpf_key *bkey; + + if (flags & ~KEY_LOOKUP_ALL) + return NULL; + + /* + * Permission check is deferred until the key is used, as the + * intent of the caller is unknown here. + */ + key_ref = lookup_user_key(serial, flags, KEY_DEFER_PERM_CHECK); + if (IS_ERR(key_ref)) + return NULL; + + bkey = kmalloc(sizeof(*bkey), GFP_KERNEL); + if (!bkey) { + key_put(key_ref_to_ptr(key_ref)); + return NULL; + } + + bkey->key = key_ref_to_ptr(key_ref); + bkey->has_ref = true; + + return bkey; +} + +/** + * bpf_lookup_system_key - lookup a key by a system-defined ID + * @id: key ID + * + * Obtain a bpf_key structure with a key pointer set to the passed key ID. + * The key pointer is marked as invalid, to prevent bpf_key_put() from + * attempting to decrement the key reference count on that pointer. The key + * pointer set in such a way is currently understood only by + * verify_pkcs7_signature(). + * + * Set *id* to one of the values defined in include/linux/verification.h: + * 0 for the primary keyring (immutable keyring of system keys); + * VERIFY_USE_SECONDARY_KEYRING for both the primary and secondary keyring + * (where keys can be added only if they are vouched for by existing keys + * in those keyrings); VERIFY_USE_PLATFORM_KEYRING for the platform + * keyring (primarily used by the integrity subsystem to verify a kexec'ed + * kernel image and, possibly, the initramfs signature). + * + * Return: a bpf_key pointer with an invalid key pointer set from the + * pre-determined ID on success, a NULL pointer otherwise + */ +__bpf_kfunc struct bpf_key *bpf_lookup_system_key(u64 id) +{ + struct bpf_key *bkey; + + if (system_keyring_id_check(id) < 0) + return NULL; + + bkey = kmalloc(sizeof(*bkey), GFP_ATOMIC); + if (!bkey) + return NULL; + + bkey->key = (struct key *)(unsigned long)id; + bkey->has_ref = false; + + return bkey; +} + +/** + * bpf_key_put - decrement key reference count if key is valid and free bpf_key + * @bkey: bpf_key structure + * + * Decrement the reference count of the key inside *bkey*, if the pointer + * is valid, and free *bkey*. + */ +__bpf_kfunc void bpf_key_put(struct bpf_key *bkey) +{ + if (bkey->has_ref) + key_put(bkey->key); + + kfree(bkey); +} + +#ifdef CONFIG_SYSTEM_DATA_VERIFICATION +/** + * bpf_verify_pkcs7_signature - verify a PKCS#7 signature + * @data_ptr: data to verify + * @sig_ptr: signature of the data + * @trusted_keyring: keyring with keys trusted for signature verification + * + * Verify the PKCS#7 signature *sig_ptr* against the supplied *data_ptr* + * with keys in a keyring referenced by *trusted_keyring*. + * + * Return: 0 on success, a negative value on error.
+ */ +__bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr_kern *data_ptr, + struct bpf_dynptr_kern *sig_ptr, + struct bpf_key *trusted_keyring) +{ + int ret; + + if (trusted_keyring->has_ref) { + /* + * Do the permission check deferred in bpf_lookup_user_key(). + * See bpf_lookup_user_key() for more details. + * + * A call to key_task_permission() here would be redundant, as + * it is already done by keyring_search() called by + * find_asymmetric_key(). + */ + ret = key_validate(trusted_keyring->key); + if (ret < 0) + return ret; + } + + return verify_pkcs7_signature(data_ptr->data, + __bpf_dynptr_size(data_ptr), + sig_ptr->data, + __bpf_dynptr_size(sig_ptr), + trusted_keyring->key, + VERIFYING_UNSPECIFIED_SIGNATURE, NULL, + NULL); +} +#endif /* CONFIG_SYSTEM_DATA_VERIFICATION */ + +__diag_pop(); + +BTF_SET8_START(key_sig_kfunc_set) +BTF_ID_FLAGS(func, bpf_lookup_user_key, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_lookup_system_key, KF_ACQUIRE | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_key_put, KF_RELEASE) +#ifdef CONFIG_SYSTEM_DATA_VERIFICATION +BTF_ID_FLAGS(func, bpf_verify_pkcs7_signature, KF_SLEEPABLE) +#endif +BTF_SET8_END(key_sig_kfunc_set) + +static const struct btf_kfunc_id_set bpf_key_sig_kfunc_set = { + .owner = THIS_MODULE, + .set = &key_sig_kfunc_set, +}; + +static int __init bpf_key_sig_kfuncs_init(void) +{ + return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, + &bpf_key_sig_kfunc_set); +} + +late_initcall(bpf_key_sig_kfuncs_init); +#endif /* CONFIG_KEYS */ + +static const struct bpf_func_proto * +bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_map_lookup_elem: + return &bpf_map_lookup_elem_proto; + case BPF_FUNC_map_update_elem: + return &bpf_map_update_elem_proto; + case BPF_FUNC_map_delete_elem: + return &bpf_map_delete_elem_proto; + case BPF_FUNC_map_push_elem: + return &bpf_map_push_elem_proto; + case BPF_FUNC_map_pop_elem: + return &bpf_map_pop_elem_proto; + case BPF_FUNC_map_peek_elem: + return &bpf_map_peek_elem_proto; + case BPF_FUNC_map_lookup_percpu_elem: + return &bpf_map_lookup_percpu_elem_proto; + case BPF_FUNC_ktime_get_ns: + return &bpf_ktime_get_ns_proto; + case BPF_FUNC_ktime_get_boot_ns: + return &bpf_ktime_get_boot_ns_proto; + case BPF_FUNC_tail_call: + return &bpf_tail_call_proto; + case BPF_FUNC_get_current_pid_tgid: + return &bpf_get_current_pid_tgid_proto; + case BPF_FUNC_get_current_task: + return &bpf_get_current_task_proto; + case BPF_FUNC_get_current_task_btf: + return &bpf_get_current_task_btf_proto; + case BPF_FUNC_task_pt_regs: + return &bpf_task_pt_regs_proto; + case BPF_FUNC_get_current_uid_gid: + return &bpf_get_current_uid_gid_proto; + case BPF_FUNC_get_current_comm: + return &bpf_get_current_comm_proto; + case BPF_FUNC_trace_printk: + return bpf_get_trace_printk_proto(); + case BPF_FUNC_get_smp_processor_id: + return &bpf_get_smp_processor_id_proto; + case BPF_FUNC_get_numa_node_id: + return &bpf_get_numa_node_id_proto; + case BPF_FUNC_perf_event_read: + return &bpf_perf_event_read_proto; + case BPF_FUNC_current_task_under_cgroup: + return &bpf_current_task_under_cgroup_proto; + case BPF_FUNC_get_prandom_u32: + return &bpf_get_prandom_u32_proto; + case BPF_FUNC_probe_write_user: + return security_locked_down(LOCKDOWN_BPF_WRITE_USER) < 0 ? + NULL : bpf_get_probe_write_proto(); + case BPF_FUNC_probe_read_user: + return &bpf_probe_read_user_proto; + case BPF_FUNC_probe_read_kernel: + return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ? 
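Taken together, the kfuncs above follow an acquire/release pattern from the program side; an illustrative sketch, not from the kernel tree, assuming a sleepable LSM program and the usual __ksym extern declarations (the hook and the keyring id are arbitrary choices):

struct bpf_key;

extern struct bpf_key *bpf_lookup_system_key(__u64 id) __ksym;
extern void bpf_key_put(struct bpf_key *bkey) __ksym;

SEC("lsm.s/bpf")
int BPF_PROG(on_bpf_syscall, int cmd, union bpf_attr *attr, unsigned int size)
{
	struct bpf_key *kr;

	/* 0 selects the primary system keyring, per the comment above. */
	kr = bpf_lookup_system_key(0);
	if (!kr)
		return 0;

	/* ... hand 'kr' (plus two dynptrs) to bpf_verify_pkcs7_signature() ... */

	bpf_key_put(kr);
	return 0;
}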
+ NULL : &bpf_probe_read_kernel_proto; + case BPF_FUNC_probe_read_user_str: + return &bpf_probe_read_user_str_proto; + case BPF_FUNC_probe_read_kernel_str: + return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ? + NULL : &bpf_probe_read_kernel_str_proto; +#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE + case BPF_FUNC_probe_read: + return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ? + NULL : &bpf_probe_read_compat_proto; + case BPF_FUNC_probe_read_str: + return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ? + NULL : &bpf_probe_read_compat_str_proto; +#endif +#ifdef CONFIG_CGROUPS + case BPF_FUNC_cgrp_storage_get: + return &bpf_cgrp_storage_get_proto; + case BPF_FUNC_cgrp_storage_delete: + return &bpf_cgrp_storage_delete_proto; +#endif + case BPF_FUNC_send_signal: + return &bpf_send_signal_proto; + case BPF_FUNC_send_signal_thread: + return &bpf_send_signal_thread_proto; + case BPF_FUNC_perf_event_read_value: + return &bpf_perf_event_read_value_proto; + case BPF_FUNC_get_ns_current_pid_tgid: + return &bpf_get_ns_current_pid_tgid_proto; + case BPF_FUNC_ringbuf_output: + return &bpf_ringbuf_output_proto; + case BPF_FUNC_ringbuf_reserve: + return &bpf_ringbuf_reserve_proto; + case BPF_FUNC_ringbuf_submit: + return &bpf_ringbuf_submit_proto; + case BPF_FUNC_ringbuf_discard: + return &bpf_ringbuf_discard_proto; + case BPF_FUNC_ringbuf_query: + return &bpf_ringbuf_query_proto; + case BPF_FUNC_jiffies64: + return &bpf_jiffies64_proto; + case BPF_FUNC_get_task_stack: + return &bpf_get_task_stack_proto; + case BPF_FUNC_copy_from_user: + return &bpf_copy_from_user_proto; + case BPF_FUNC_copy_from_user_task: + return &bpf_copy_from_user_task_proto; + case BPF_FUNC_snprintf_btf: + return &bpf_snprintf_btf_proto; + case BPF_FUNC_per_cpu_ptr: + return &bpf_per_cpu_ptr_proto; + case BPF_FUNC_this_cpu_ptr: + return &bpf_this_cpu_ptr_proto; + case BPF_FUNC_task_storage_get: + if (bpf_prog_check_recur(prog)) + return &bpf_task_storage_get_recur_proto; + return &bpf_task_storage_get_proto; + case BPF_FUNC_task_storage_delete: + if (bpf_prog_check_recur(prog)) + return &bpf_task_storage_delete_recur_proto; + return &bpf_task_storage_delete_proto; + case BPF_FUNC_for_each_map_elem: + return &bpf_for_each_map_elem_proto; + case BPF_FUNC_snprintf: + return &bpf_snprintf_proto; + case BPF_FUNC_get_func_ip: + return &bpf_get_func_ip_proto_tracing; + case BPF_FUNC_get_branch_snapshot: + return &bpf_get_branch_snapshot_proto; + case BPF_FUNC_find_vma: + return &bpf_find_vma_proto; + case BPF_FUNC_trace_vprintk: + return bpf_get_trace_vprintk_proto(); + default: + return bpf_base_func_proto(func_id); + } +} + +static const struct bpf_func_proto * +kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_perf_event_output: + return &bpf_perf_event_output_proto; + case BPF_FUNC_get_stackid: + return &bpf_get_stackid_proto; + case BPF_FUNC_get_stack: + return &bpf_get_stack_proto; +#ifdef CONFIG_BPF_KPROBE_OVERRIDE + case BPF_FUNC_override_return: + return &bpf_override_return_proto; +#endif + case BPF_FUNC_get_func_ip: + if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI) + return &bpf_get_func_ip_proto_kprobe_multi; + if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI) + return &bpf_get_func_ip_proto_uprobe_multi; + return &bpf_get_func_ip_proto_kprobe; + case BPF_FUNC_get_attach_cookie: + if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI) + return &bpf_get_attach_cookie_proto_kmulti; + if 
(prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI) + return &bpf_get_attach_cookie_proto_umulti; + return &bpf_get_attach_cookie_proto_trace; + default: + return bpf_tracing_func_proto(func_id, prog); + } +} + +/* bpf+kprobe programs can access fields of 'struct pt_regs' */ +static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (off < 0 || off >= sizeof(struct pt_regs)) + return false; + if (type != BPF_READ) + return false; + if (off % size != 0) + return false; + /* + * Assertion for 32 bit to make sure last 8 byte access + * (BPF_DW) to the last 4 byte member is disallowed. + */ + if (off + size > sizeof(struct pt_regs)) + return false; + + return true; +} + +const struct bpf_verifier_ops kprobe_verifier_ops = { + .get_func_proto = kprobe_prog_func_proto, + .is_valid_access = kprobe_prog_is_valid_access, +}; + +const struct bpf_prog_ops kprobe_prog_ops = { +}; + +BPF_CALL_5(bpf_perf_event_output_tp, void *, tp_buff, struct bpf_map *, map, + u64, flags, void *, data, u64, size) +{ + struct pt_regs *regs = *(struct pt_regs **)tp_buff; + + /* + * r1 points to perf tracepoint buffer where first 8 bytes are hidden + * from bpf program and contain a pointer to 'struct pt_regs'. Fetch it + * from there and call the same bpf_perf_event_output() helper inline. + */ + return ____bpf_perf_event_output(regs, map, flags, data, size); +} + +static const struct bpf_func_proto bpf_perf_event_output_proto_tp = { + .func = bpf_perf_event_output_tp, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, +}; + +BPF_CALL_3(bpf_get_stackid_tp, void *, tp_buff, struct bpf_map *, map, + u64, flags) +{ + struct pt_regs *regs = *(struct pt_regs **)tp_buff; + + /* + * Same comment as in bpf_perf_event_output_tp(), only that this time + * the other helper's function body cannot be inlined due to being + * external, thus we need to call raw helper function. 
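bpf_get_stackid_tp() above backs tracepoint programs; a sketch of the usual pairing with a BPF_MAP_TYPE_STACK_TRACE map (the map sizing, 127-frame depth and sched_switch target are arbitrary choices for the example):

struct {
	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
	__uint(max_entries, 1024);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, 127 * sizeof(__u64));
} stacks SEC(".maps");

SEC("tracepoint/sched/sched_switch")
int on_sched_switch(void *ctx)
{
	/* Kernel stack by default; BPF_F_USER_STACK would request user stacks. */
	long id = bpf_get_stackid(ctx, &stacks, 0);

	if (id >= 0)
		bpf_printk("stackid %ld", id);
	return 0;
}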
+ */ + return bpf_get_stackid((unsigned long) regs, (unsigned long) map, + flags, 0, 0); +} + +static const struct bpf_func_proto bpf_get_stackid_proto_tp = { + .func = bpf_get_stackid_tp, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_get_stack_tp, void *, tp_buff, void *, buf, u32, size, + u64, flags) +{ + struct pt_regs *regs = *(struct pt_regs **)tp_buff; + + return bpf_get_stack((unsigned long) regs, (unsigned long) buf, + (unsigned long) size, flags, 0); +} + +static const struct bpf_func_proto bpf_get_stack_proto_tp = { + .func = bpf_get_stack_tp, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, +}; + +static const struct bpf_func_proto * +tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_perf_event_output: + return &bpf_perf_event_output_proto_tp; + case BPF_FUNC_get_stackid: + return &bpf_get_stackid_proto_tp; + case BPF_FUNC_get_stack: + return &bpf_get_stack_proto_tp; + case BPF_FUNC_get_attach_cookie: + return &bpf_get_attach_cookie_proto_trace; + default: + return bpf_tracing_func_proto(func_id, prog); + } +} + +static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE) + return false; + if (type != BPF_READ) + return false; + if (off % size != 0) + return false; + + BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(__u64)); + return true; +} + +const struct bpf_verifier_ops tracepoint_verifier_ops = { + .get_func_proto = tp_prog_func_proto, + .is_valid_access = tp_prog_is_valid_access, +}; + +const struct bpf_prog_ops tracepoint_prog_ops = { +}; + +BPF_CALL_3(bpf_perf_prog_read_value, struct bpf_perf_event_data_kern *, ctx, + struct bpf_perf_event_value *, buf, u32, size) +{ + int err = -EINVAL; + + if (unlikely(size != sizeof(struct bpf_perf_event_value))) + goto clear; + err = perf_event_read_local(ctx->event, &buf->counter, &buf->enabled, + &buf->running); + if (unlikely(err)) + goto clear; + return 0; +clear: + memset(buf, 0, size); + return err; +} + +static const struct bpf_func_proto bpf_perf_prog_read_value_proto = { + .func = bpf_perf_prog_read_value, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE, +}; + +BPF_CALL_4(bpf_read_branch_records, struct bpf_perf_event_data_kern *, ctx, + void *, buf, u32, size, u64, flags) +{ + static const u32 br_entry_size = sizeof(struct perf_branch_entry); + struct perf_branch_stack *br_stack = ctx->data->br_stack; + u32 to_copy; + + if (unlikely(flags & ~BPF_F_GET_BRANCH_RECORDS_SIZE)) + return -EINVAL; + + if (unlikely(!(ctx->data->sample_flags & PERF_SAMPLE_BRANCH_STACK))) + return -ENOENT; + + if (unlikely(!br_stack)) + return -ENOENT; + + if (flags & BPF_F_GET_BRANCH_RECORDS_SIZE) + return br_stack->nr * br_entry_size; + + if (!buf || (size % br_entry_size != 0)) + return -EINVAL; + + to_copy = min_t(u32, br_stack->nr * br_entry_size, size); + memcpy(buf, br_stack->entries, to_copy); + + return to_copy; +} + +static const struct bpf_func_proto bpf_read_branch_records_proto = { + .func = bpf_read_branch_records, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = 
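bpf_read_branch_records() above has a size-query mode selected by BPF_F_GET_BRANCH_RECORDS_SIZE; a sketch of a perf_event program using both modes, assuming the event was opened with PERF_SAMPLE_BRANCH_STACK (the 32-entry cap is arbitrary):

#define MAX_BRANCHES 32

struct perf_branch_entry branches[MAX_BRANCHES] = {};

SEC("perf_event")
int on_branch_sample(struct bpf_perf_event_data *ctx)
{
	long total, copied;

	/* First ask how many bytes of branch records this sample carries. */
	total = bpf_read_branch_records(ctx, NULL, 0,
					BPF_F_GET_BRANCH_RECORDS_SIZE);
	if (total <= 0)
		return 0;

	/* Then copy up to MAX_BRANCHES entries. */
	copied = bpf_read_branch_records(ctx, branches, sizeof(branches), 0);
	bpf_printk("branch bytes %ld copied %ld", total, copied);
	return 0;
}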
ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM_OR_NULL, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, +}; + +static const struct bpf_func_proto * +pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_perf_event_output: + return &bpf_perf_event_output_proto_tp; + case BPF_FUNC_get_stackid: + return &bpf_get_stackid_proto_pe; + case BPF_FUNC_get_stack: + return &bpf_get_stack_proto_pe; + case BPF_FUNC_perf_prog_read_value: + return &bpf_perf_prog_read_value_proto; + case BPF_FUNC_read_branch_records: + return &bpf_read_branch_records_proto; + case BPF_FUNC_get_attach_cookie: + return &bpf_get_attach_cookie_proto_pe; + default: + return bpf_tracing_func_proto(func_id, prog); + } +} + +/* + * bpf_raw_tp_regs are separate from bpf_pt_regs used from skb/xdp + * to avoid potential recursive reuse issue when/if tracepoints are added + * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack. + * + * Since raw tracepoints run despite bpf_prog_active, support concurrent usage + * in normal, irq, and nmi context. + */ +struct bpf_raw_tp_regs { + struct pt_regs regs[3]; +}; +static DEFINE_PER_CPU(struct bpf_raw_tp_regs, bpf_raw_tp_regs); +static DEFINE_PER_CPU(int, bpf_raw_tp_nest_level); +static struct pt_regs *get_bpf_raw_tp_regs(void) +{ + struct bpf_raw_tp_regs *tp_regs = this_cpu_ptr(&bpf_raw_tp_regs); + int nest_level = this_cpu_inc_return(bpf_raw_tp_nest_level); + + if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(tp_regs->regs))) { + this_cpu_dec(bpf_raw_tp_nest_level); + return ERR_PTR(-EBUSY); + } + + return &tp_regs->regs[nest_level - 1]; +} + +static void put_bpf_raw_tp_regs(void) +{ + this_cpu_dec(bpf_raw_tp_nest_level); +} + +BPF_CALL_5(bpf_perf_event_output_raw_tp, struct bpf_raw_tracepoint_args *, args, + struct bpf_map *, map, u64, flags, void *, data, u64, size) +{ + struct pt_regs *regs = get_bpf_raw_tp_regs(); + int ret; + + if (IS_ERR(regs)) + return PTR_ERR(regs); + + perf_fetch_caller_regs(regs); + ret = ____bpf_perf_event_output(regs, map, flags, data, size); + + put_bpf_raw_tp_regs(); + return ret; +} + +static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = { + .func = bpf_perf_event_output_raw_tp, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, +}; + +extern const struct bpf_func_proto bpf_skb_output_proto; +extern const struct bpf_func_proto bpf_xdp_output_proto; +extern const struct bpf_func_proto bpf_xdp_get_buff_len_trace_proto; + +BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args, + struct bpf_map *, map, u64, flags) +{ + struct pt_regs *regs = get_bpf_raw_tp_regs(); + int ret; + + if (IS_ERR(regs)) + return PTR_ERR(regs); + + perf_fetch_caller_regs(regs); + /* similar to bpf_perf_event_output_tp, but pt_regs fetched differently */ + ret = bpf_get_stackid((unsigned long) regs, (unsigned long) map, + flags, 0, 0); + put_bpf_raw_tp_regs(); + return ret; +} + +static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = { + .func = bpf_get_stackid_raw_tp, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_get_stack_raw_tp, struct bpf_raw_tracepoint_args *, args, + void *, buf, u32, size, u64, flags) +{ + struct pt_regs *regs = get_bpf_raw_tp_regs(); + int ret; + + 
if (IS_ERR(regs)) + return PTR_ERR(regs); + + perf_fetch_caller_regs(regs); + ret = bpf_get_stack((unsigned long) regs, (unsigned long) buf, + (unsigned long) size, flags, 0); + put_bpf_raw_tp_regs(); + return ret; +} + +static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = { + .func = bpf_get_stack_raw_tp, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, +}; + +static const struct bpf_func_proto * +raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_perf_event_output: + return &bpf_perf_event_output_proto_raw_tp; + case BPF_FUNC_get_stackid: + return &bpf_get_stackid_proto_raw_tp; + case BPF_FUNC_get_stack: + return &bpf_get_stack_proto_raw_tp; + default: + return bpf_tracing_func_proto(func_id, prog); + } +} + +const struct bpf_func_proto * +tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + const struct bpf_func_proto *fn; + + switch (func_id) { +#ifdef CONFIG_NET + case BPF_FUNC_skb_output: + return &bpf_skb_output_proto; + case BPF_FUNC_xdp_output: + return &bpf_xdp_output_proto; + case BPF_FUNC_skc_to_tcp6_sock: + return &bpf_skc_to_tcp6_sock_proto; + case BPF_FUNC_skc_to_tcp_sock: + return &bpf_skc_to_tcp_sock_proto; + case BPF_FUNC_skc_to_tcp_timewait_sock: + return &bpf_skc_to_tcp_timewait_sock_proto; + case BPF_FUNC_skc_to_tcp_request_sock: + return &bpf_skc_to_tcp_request_sock_proto; + case BPF_FUNC_skc_to_udp6_sock: + return &bpf_skc_to_udp6_sock_proto; + case BPF_FUNC_skc_to_unix_sock: + return &bpf_skc_to_unix_sock_proto; + case BPF_FUNC_skc_to_mptcp_sock: + return &bpf_skc_to_mptcp_sock_proto; + case BPF_FUNC_sk_storage_get: + return &bpf_sk_storage_get_tracing_proto; + case BPF_FUNC_sk_storage_delete: + return &bpf_sk_storage_delete_tracing_proto; + case BPF_FUNC_sock_from_file: + return &bpf_sock_from_file_proto; + case BPF_FUNC_get_socket_cookie: + return &bpf_get_socket_ptr_cookie_proto; + case BPF_FUNC_xdp_get_buff_len: + return &bpf_xdp_get_buff_len_trace_proto; +#endif + case BPF_FUNC_seq_printf: + return prog->expected_attach_type == BPF_TRACE_ITER ? + &bpf_seq_printf_proto : + NULL; + case BPF_FUNC_seq_write: + return prog->expected_attach_type == BPF_TRACE_ITER ? + &bpf_seq_write_proto : + NULL; + case BPF_FUNC_seq_printf_btf: + return prog->expected_attach_type == BPF_TRACE_ITER ? + &bpf_seq_printf_btf_proto : + NULL; + case BPF_FUNC_d_path: + return &bpf_d_path_proto; + case BPF_FUNC_get_func_arg: + return bpf_prog_has_trampoline(prog) ? &bpf_get_func_arg_proto : NULL; + case BPF_FUNC_get_func_ret: + return bpf_prog_has_trampoline(prog) ? &bpf_get_func_ret_proto : NULL; + case BPF_FUNC_get_func_arg_cnt: + return bpf_prog_has_trampoline(prog) ? &bpf_get_func_arg_cnt_proto : NULL; + case BPF_FUNC_get_attach_cookie: + return bpf_prog_has_trampoline(prog) ? 
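The seq_printf/seq_write helpers above are handed out only when expected_attach_type is BPF_TRACE_ITER; a sketch of a task iterator using them through libbpf's BPF_SEQ_PRINTF convenience macro:

SEC("iter/task")
int dump_tasks(struct bpf_iter__task *ctx)
{
	struct seq_file *seq = ctx->meta->seq;
	struct task_struct *task = ctx->task;

	if (!task)
		return 0;

	BPF_SEQ_PRINTF(seq, "%8d %16s\n", task->pid, task->comm);
	return 0;
}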
&bpf_get_attach_cookie_proto_tracing : NULL; + default: + fn = raw_tp_prog_func_proto(func_id, prog); + if (!fn && prog->expected_attach_type == BPF_TRACE_ITER) + fn = bpf_iter_get_func_proto(func_id, prog); + return fn; + } +} + +static bool raw_tp_prog_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + return bpf_tracing_ctx_access(off, size, type); +} + +static bool tracing_prog_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + return bpf_tracing_btf_ctx_access(off, size, type, prog, info); +} + +int __weak bpf_prog_test_run_tracing(struct bpf_prog *prog, + const union bpf_attr *kattr, + union bpf_attr __user *uattr) +{ + return -ENOTSUPP; +} + +const struct bpf_verifier_ops raw_tracepoint_verifier_ops = { + .get_func_proto = raw_tp_prog_func_proto, + .is_valid_access = raw_tp_prog_is_valid_access, +}; + +const struct bpf_prog_ops raw_tracepoint_prog_ops = { +#ifdef CONFIG_NET + .test_run = bpf_prog_test_run_raw_tp, +#endif +}; + +const struct bpf_verifier_ops tracing_verifier_ops = { + .get_func_proto = tracing_prog_func_proto, + .is_valid_access = tracing_prog_is_valid_access, +}; + +const struct bpf_prog_ops tracing_prog_ops = { + .test_run = bpf_prog_test_run_tracing, +}; + +static bool raw_tp_writable_prog_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (off == 0) { + if (size != sizeof(u64) || type != BPF_READ) + return false; + info->reg_type = PTR_TO_TP_BUFFER; + } + return raw_tp_prog_is_valid_access(off, size, type, prog, info); +} + +const struct bpf_verifier_ops raw_tracepoint_writable_verifier_ops = { + .get_func_proto = raw_tp_prog_func_proto, + .is_valid_access = raw_tp_writable_prog_is_valid_access, +}; + +const struct bpf_prog_ops raw_tracepoint_writable_prog_ops = { +}; + +static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + const int size_u64 = sizeof(u64); + + if (off < 0 || off >= sizeof(struct bpf_perf_event_data)) + return false; + if (type != BPF_READ) + return false; + if (off % size != 0) { + if (sizeof(unsigned long) != 4) + return false; + if (size != 8) + return false; + if (off % size != 4) + return false; + } + + switch (off) { + case bpf_ctx_range(struct bpf_perf_event_data, sample_period): + bpf_ctx_record_field_size(info, size_u64); + if (!bpf_ctx_narrow_access_ok(off, size, size_u64)) + return false; + break; + case bpf_ctx_range(struct bpf_perf_event_data, addr): + bpf_ctx_record_field_size(info, size_u64); + if (!bpf_ctx_narrow_access_ok(off, size, size_u64)) + return false; + break; + default: + if (size != sizeof(long)) + return false; + } + + return true; +} + +static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + + switch (si->off) { + case offsetof(struct bpf_perf_event_data, sample_period): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, + data), si->dst_reg, si->src_reg, + offsetof(struct bpf_perf_event_data_kern, data)); + *insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg, + bpf_target_off(struct perf_sample_data, period, 8, + target_size)); + break; + case offsetof(struct bpf_perf_event_data, addr): + 
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, + data), si->dst_reg, si->src_reg, + offsetof(struct bpf_perf_event_data_kern, data)); + *insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg, + bpf_target_off(struct perf_sample_data, addr, 8, + target_size)); + break; + default: + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, + regs), si->dst_reg, si->src_reg, + offsetof(struct bpf_perf_event_data_kern, regs)); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(long), si->dst_reg, si->dst_reg, + si->off); + break; + } + + return insn - insn_buf; +} + +const struct bpf_verifier_ops perf_event_verifier_ops = { + .get_func_proto = pe_prog_func_proto, + .is_valid_access = pe_prog_is_valid_access, + .convert_ctx_access = pe_prog_convert_ctx_access, +}; + +const struct bpf_prog_ops perf_event_prog_ops = { +}; + +static DEFINE_MUTEX(bpf_event_mutex); + +#define BPF_TRACE_MAX_PROGS 64 + +int perf_event_attach_bpf_prog(struct perf_event *event, + struct bpf_prog *prog, + u64 bpf_cookie) +{ + struct bpf_prog_array *old_array; + struct bpf_prog_array *new_array; + int ret = -EEXIST; + + /* + * Kprobe override only works if they are on the function entry, + * and only if they are on the opt-in list. + */ + if (prog->kprobe_override && + (!trace_kprobe_on_func_entry(event->tp_event) || + !trace_kprobe_error_injectable(event->tp_event))) + return -EINVAL; + + mutex_lock(&bpf_event_mutex); + + if (event->prog) + goto unlock; + + old_array = bpf_event_rcu_dereference(event->tp_event->prog_array); + if (old_array && + bpf_prog_array_length(old_array) >= BPF_TRACE_MAX_PROGS) { + ret = -E2BIG; + goto unlock; + } + + ret = bpf_prog_array_copy(old_array, NULL, prog, bpf_cookie, &new_array); + if (ret < 0) + goto unlock; + + /* set the new array to event->tp_event and set event->prog */ + event->prog = prog; + event->bpf_cookie = bpf_cookie; + rcu_assign_pointer(event->tp_event->prog_array, new_array); + bpf_prog_array_free_sleepable(old_array); + +unlock: + mutex_unlock(&bpf_event_mutex); + return ret; +} + +void perf_event_detach_bpf_prog(struct perf_event *event) +{ + struct bpf_prog_array *old_array; + struct bpf_prog_array *new_array; + int ret; + + mutex_lock(&bpf_event_mutex); + + if (!event->prog) + goto unlock; + + old_array = bpf_event_rcu_dereference(event->tp_event->prog_array); + ret = bpf_prog_array_copy(old_array, event->prog, NULL, 0, &new_array); + if (ret == -ENOENT) + goto unlock; + if (ret < 0) { + bpf_prog_array_delete_safe(old_array, event->prog); + } else { + rcu_assign_pointer(event->tp_event->prog_array, new_array); + bpf_prog_array_free_sleepable(old_array); + } + + bpf_prog_put(event->prog); + event->prog = NULL; + +unlock: + mutex_unlock(&bpf_event_mutex); +} + +int perf_event_query_prog_array(struct perf_event *event, void __user *info) +{ + struct perf_event_query_bpf __user *uquery = info; + struct perf_event_query_bpf query = {}; + struct bpf_prog_array *progs; + u32 *ids, prog_cnt, ids_len; + int ret; + + if (!perfmon_capable()) + return -EPERM; + if (event->attr.type != PERF_TYPE_TRACEPOINT) + return -EINVAL; + if (copy_from_user(&query, uquery, sizeof(query))) + return -EFAULT; + + ids_len = query.ids_len; + if (ids_len > BPF_TRACE_MAX_PROGS) + return -E2BIG; + ids = kcalloc(ids_len, sizeof(u32), GFP_USER | __GFP_NOWARN); + if (!ids) + return -ENOMEM; + /* + * The above kcalloc returns ZERO_SIZE_PTR when ids_len = 0, which + * is required when user only wants to check for uquery->prog_cnt. 
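perf_event_attach_bpf_prog() above is reached from the perf ioctl path; an illustrative userspace sketch of that legacy attach flow (error handling trimmed; the tracepoint id would come from tracefs, e.g. events/.../id, and prog_fd from a prior BPF_PROG_LOAD):

#include <linux/perf_event.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>

static int attach_prog_to_tracepoint(int tp_id, int prog_fd)
{
	struct perf_event_attr attr = {
		.type = PERF_TYPE_TRACEPOINT,
		.size = sizeof(attr),
		.config = tp_id,
		.sample_period = 1,
	};
	int pfd = syscall(__NR_perf_event_open, &attr, -1 /* pid */, 0 /* cpu */,
			  -1 /* group */, 0);

	if (pfd < 0)
		return -1;
	/* PERF_EVENT_IOC_SET_BPF is what ends up in perf_event_attach_bpf_prog(). */
	if (ioctl(pfd, PERF_EVENT_IOC_SET_BPF, prog_fd) ||
	    ioctl(pfd, PERF_EVENT_IOC_ENABLE, 0)) {
		close(pfd);
		return -1;
	}
	return pfd;
}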
+ * There is no need to check for it since the case is handled + * gracefully in bpf_prog_array_copy_info. + */ + + mutex_lock(&bpf_event_mutex); + progs = bpf_event_rcu_dereference(event->tp_event->prog_array); + ret = bpf_prog_array_copy_info(progs, ids, ids_len, &prog_cnt); + mutex_unlock(&bpf_event_mutex); + + if (copy_to_user(&uquery->prog_cnt, &prog_cnt, sizeof(prog_cnt)) || + copy_to_user(uquery->ids, ids, ids_len * sizeof(u32))) + ret = -EFAULT; + + kfree(ids); + return ret; +} + +extern struct bpf_raw_event_map __start__bpf_raw_tp[]; +extern struct bpf_raw_event_map __stop__bpf_raw_tp[]; + +struct bpf_raw_event_map *bpf_get_raw_tracepoint(const char *name) +{ + struct bpf_raw_event_map *btp = __start__bpf_raw_tp; + + for (; btp < __stop__bpf_raw_tp; btp++) { + if (!strcmp(btp->tp->name, name)) + return btp; + } + + return bpf_get_raw_tracepoint_module(name); +} + +void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp) +{ + struct module *mod; + + preempt_disable(); + mod = __module_address((unsigned long)btp); + module_put(mod); + preempt_enable(); +} + +static __always_inline +void __bpf_trace_run(struct bpf_prog *prog, u64 *args) +{ + cant_sleep(); + if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { + bpf_prog_inc_misses_counter(prog); + goto out; + } + rcu_read_lock(); + (void) bpf_prog_run(prog, args); + rcu_read_unlock(); +out: + this_cpu_dec(*(prog->active)); +} + +#define UNPACK(...) __VA_ARGS__ +#define REPEAT_1(FN, DL, X, ...) FN(X) +#define REPEAT_2(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_1(FN, DL, __VA_ARGS__) +#define REPEAT_3(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_2(FN, DL, __VA_ARGS__) +#define REPEAT_4(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_3(FN, DL, __VA_ARGS__) +#define REPEAT_5(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_4(FN, DL, __VA_ARGS__) +#define REPEAT_6(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_5(FN, DL, __VA_ARGS__) +#define REPEAT_7(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_6(FN, DL, __VA_ARGS__) +#define REPEAT_8(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_7(FN, DL, __VA_ARGS__) +#define REPEAT_9(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_8(FN, DL, __VA_ARGS__) +#define REPEAT_10(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_9(FN, DL, __VA_ARGS__) +#define REPEAT_11(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_10(FN, DL, __VA_ARGS__) +#define REPEAT_12(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_11(FN, DL, __VA_ARGS__) +#define REPEAT(X, FN, DL, ...) 
REPEAT_##X(FN, DL, __VA_ARGS__) + +#define SARG(X) u64 arg##X +#define COPY(X) args[X] = arg##X + +#define __DL_COM (,) +#define __DL_SEM (;) + +#define __SEQ_0_11 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 + +#define BPF_TRACE_DEFN_x(x) \ + void bpf_trace_run##x(struct bpf_prog *prog, \ + REPEAT(x, SARG, __DL_COM, __SEQ_0_11)) \ + { \ + u64 args[x]; \ + REPEAT(x, COPY, __DL_SEM, __SEQ_0_11); \ + __bpf_trace_run(prog, args); \ + } \ + EXPORT_SYMBOL_GPL(bpf_trace_run##x) +BPF_TRACE_DEFN_x(1); +BPF_TRACE_DEFN_x(2); +BPF_TRACE_DEFN_x(3); +BPF_TRACE_DEFN_x(4); +BPF_TRACE_DEFN_x(5); +BPF_TRACE_DEFN_x(6); +BPF_TRACE_DEFN_x(7); +BPF_TRACE_DEFN_x(8); +BPF_TRACE_DEFN_x(9); +BPF_TRACE_DEFN_x(10); +BPF_TRACE_DEFN_x(11); +BPF_TRACE_DEFN_x(12); + +static int __bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog) +{ + struct tracepoint *tp = btp->tp; + + /* + * check that program doesn't access arguments beyond what's + * available in this tracepoint + */ + if (prog->aux->max_ctx_offset > btp->num_args * sizeof(u64)) + return -EINVAL; + + if (prog->aux->max_tp_access > btp->writable_size) + return -EINVAL; + + return tracepoint_probe_register_may_exist(tp, (void *)btp->bpf_func, + prog); +} + +int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog) +{ + return __bpf_probe_register(btp, prog); +} + +int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog) +{ + return tracepoint_probe_unregister(btp->tp, (void *)btp->bpf_func, prog); +} + +int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, + u32 *fd_type, const char **buf, + u64 *probe_offset, u64 *probe_addr) +{ + bool is_tracepoint, is_syscall_tp; + struct bpf_prog *prog; + int flags, err = 0; + + prog = event->prog; + if (!prog) + return -ENOENT; + + /* not supporting BPF_PROG_TYPE_PERF_EVENT yet */ + if (prog->type == BPF_PROG_TYPE_PERF_EVENT) + return -EOPNOTSUPP; + + *prog_id = prog->aux->id; + flags = event->tp_event->flags; + is_tracepoint = flags & TRACE_EVENT_FL_TRACEPOINT; + is_syscall_tp = is_syscall_trace_event(event->tp_event); + + if (is_tracepoint || is_syscall_tp) { + *buf = is_tracepoint ? 
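For readability, here is a hand-expanded view of what one of the BPF_TRACE_DEFN_x() instantiations above generates; the REPEAT()/SARG()/COPY() macros produce the argument list and the copy statements mechanically:

void bpf_trace_run2(struct bpf_prog *prog, u64 arg0, u64 arg1)
{
	u64 args[2];

	args[0] = arg0;
	args[1] = arg1;
	__bpf_trace_run(prog, args);
}
EXPORT_SYMBOL_GPL(bpf_trace_run2);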
event->tp_event->tp->name + : event->tp_event->name; + /* We allow NULL pointer for tracepoint */ + if (fd_type) + *fd_type = BPF_FD_TYPE_TRACEPOINT; + if (probe_offset) + *probe_offset = 0x0; + if (probe_addr) + *probe_addr = 0x0; + } else { + /* kprobe/uprobe */ + err = -EOPNOTSUPP; +#ifdef CONFIG_KPROBE_EVENTS + if (flags & TRACE_EVENT_FL_KPROBE) + err = bpf_get_kprobe_info(event, fd_type, buf, + probe_offset, probe_addr, + event->attr.type == PERF_TYPE_TRACEPOINT); +#endif +#ifdef CONFIG_UPROBE_EVENTS + if (flags & TRACE_EVENT_FL_UPROBE) + err = bpf_get_uprobe_info(event, fd_type, buf, + probe_offset, probe_addr, + event->attr.type == PERF_TYPE_TRACEPOINT); +#endif + } + + return err; +} + +static int __init send_signal_irq_work_init(void) +{ + int cpu; + struct send_signal_irq_work *work; + + for_each_possible_cpu(cpu) { + work = per_cpu_ptr(&send_signal_work, cpu); + init_irq_work(&work->irq_work, do_bpf_send_signal); + } + return 0; +} + +subsys_initcall(send_signal_irq_work_init); + +#ifdef CONFIG_MODULES +static int bpf_event_notify(struct notifier_block *nb, unsigned long op, + void *module) +{ + struct bpf_trace_module *btm, *tmp; + struct module *mod = module; + int ret = 0; + + if (mod->num_bpf_raw_events == 0 || + (op != MODULE_STATE_COMING && op != MODULE_STATE_GOING)) + goto out; + + mutex_lock(&bpf_module_mutex); + + switch (op) { + case MODULE_STATE_COMING: + btm = kzalloc(sizeof(*btm), GFP_KERNEL); + if (btm) { + btm->module = module; + list_add(&btm->list, &bpf_trace_modules); + } else { + ret = -ENOMEM; + } + break; + case MODULE_STATE_GOING: + list_for_each_entry_safe(btm, tmp, &bpf_trace_modules, list) { + if (btm->module == module) { + list_del(&btm->list); + kfree(btm); + break; + } + } + break; + } + + mutex_unlock(&bpf_module_mutex); + +out: + return notifier_from_errno(ret); +} + +static struct notifier_block bpf_module_nb = { + .notifier_call = bpf_event_notify, +}; + +static int __init bpf_event_init(void) +{ + register_module_notifier(&bpf_module_nb); + return 0; +} + +fs_initcall(bpf_event_init); +#endif /* CONFIG_MODULES */ + +#ifdef CONFIG_FPROBE +struct bpf_kprobe_multi_link { + struct bpf_link link; + struct fprobe fp; + unsigned long *addrs; + u64 *cookies; + u32 cnt; + u32 mods_cnt; + struct module **mods; + u32 flags; +}; + +struct bpf_kprobe_multi_run_ctx { + struct bpf_run_ctx run_ctx; + struct bpf_kprobe_multi_link *link; + unsigned long entry_ip; +}; + +struct user_syms { + const char **syms; + char *buf; +}; + +static int copy_user_syms(struct user_syms *us, unsigned long __user *usyms, u32 cnt) +{ + unsigned long __user usymbol; + const char **syms = NULL; + char *buf = NULL, *p; + int err = -ENOMEM; + unsigned int i; + + syms = kvmalloc_array(cnt, sizeof(*syms), GFP_KERNEL); + if (!syms) + goto error; + + buf = kvmalloc_array(cnt, KSYM_NAME_LEN, GFP_KERNEL); + if (!buf) + goto error; + + for (p = buf, i = 0; i < cnt; i++) { + if (__get_user(usymbol, usyms + i)) { + err = -EFAULT; + goto error; + } + err = strncpy_from_user(p, (const char __user *) usymbol, KSYM_NAME_LEN); + if (err == KSYM_NAME_LEN) + err = -E2BIG; + if (err < 0) + goto error; + syms[i] = p; + p += err + 1; + } + + us->syms = syms; + us->buf = buf; + return 0; + +error: + if (err) { + kvfree(syms); + kvfree(buf); + } + return err; +} + +static void kprobe_multi_put_modules(struct module **mods, u32 cnt) +{ + u32 i; + + for (i = 0; i < cnt; i++) + module_put(mods[i]); +} + +static void free_user_syms(struct user_syms *us) +{ + kvfree(us->syms); + kvfree(us->buf); +} + +static 
void bpf_kprobe_multi_link_release(struct bpf_link *link) +{ + struct bpf_kprobe_multi_link *kmulti_link; + + kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link); + unregister_fprobe(&kmulti_link->fp); + kprobe_multi_put_modules(kmulti_link->mods, kmulti_link->mods_cnt); +} + +static void bpf_kprobe_multi_link_dealloc(struct bpf_link *link) +{ + struct bpf_kprobe_multi_link *kmulti_link; + + kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link); + kvfree(kmulti_link->addrs); + kvfree(kmulti_link->cookies); + kfree(kmulti_link->mods); + kfree(kmulti_link); +} + +static int bpf_kprobe_multi_link_fill_link_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + u64 __user *uaddrs = u64_to_user_ptr(info->kprobe_multi.addrs); + struct bpf_kprobe_multi_link *kmulti_link; + u32 ucount = info->kprobe_multi.count; + int err = 0, i; + + if (!uaddrs ^ !ucount) + return -EINVAL; + + kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link); + info->kprobe_multi.count = kmulti_link->cnt; + info->kprobe_multi.flags = kmulti_link->flags; + + if (!uaddrs) + return 0; + if (ucount < kmulti_link->cnt) + err = -ENOSPC; + else + ucount = kmulti_link->cnt; + + if (kallsyms_show_value(current_cred())) { + if (copy_to_user(uaddrs, kmulti_link->addrs, ucount * sizeof(u64))) + return -EFAULT; + } else { + for (i = 0; i < ucount; i++) { + if (put_user(0, uaddrs + i)) + return -EFAULT; + } + } + return err; +} + +static const struct bpf_link_ops bpf_kprobe_multi_link_lops = { + .release = bpf_kprobe_multi_link_release, + .dealloc = bpf_kprobe_multi_link_dealloc, + .fill_link_info = bpf_kprobe_multi_link_fill_link_info, +}; + +static void bpf_kprobe_multi_cookie_swap(void *a, void *b, int size, const void *priv) +{ + const struct bpf_kprobe_multi_link *link = priv; + unsigned long *addr_a = a, *addr_b = b; + u64 *cookie_a, *cookie_b; + + cookie_a = link->cookies + (addr_a - link->addrs); + cookie_b = link->cookies + (addr_b - link->addrs); + + /* swap addr_a/addr_b and cookie_a/cookie_b values */ + swap(*addr_a, *addr_b); + swap(*cookie_a, *cookie_b); +} + +static int bpf_kprobe_multi_addrs_cmp(const void *a, const void *b) +{ + const unsigned long *addr_a = a, *addr_b = b; + + if (*addr_a == *addr_b) + return 0; + return *addr_a < *addr_b ? 
-1 : 1; +} + +static int bpf_kprobe_multi_cookie_cmp(const void *a, const void *b, const void *priv) +{ + return bpf_kprobe_multi_addrs_cmp(a, b); +} + +static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx) +{ + struct bpf_kprobe_multi_run_ctx *run_ctx; + struct bpf_kprobe_multi_link *link; + u64 *cookie, entry_ip; + unsigned long *addr; + + if (WARN_ON_ONCE(!ctx)) + return 0; + run_ctx = container_of(current->bpf_ctx, struct bpf_kprobe_multi_run_ctx, run_ctx); + link = run_ctx->link; + if (!link->cookies) + return 0; + entry_ip = run_ctx->entry_ip; + addr = bsearch(&entry_ip, link->addrs, link->cnt, sizeof(entry_ip), + bpf_kprobe_multi_addrs_cmp); + if (!addr) + return 0; + cookie = link->cookies + (addr - link->addrs); + return *cookie; +} + +static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx) +{ + struct bpf_kprobe_multi_run_ctx *run_ctx; + + run_ctx = container_of(current->bpf_ctx, struct bpf_kprobe_multi_run_ctx, run_ctx); + return run_ctx->entry_ip; +} + +static int +kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link, + unsigned long entry_ip, struct pt_regs *regs) +{ + struct bpf_kprobe_multi_run_ctx run_ctx = { + .link = link, + .entry_ip = entry_ip, + }; + struct bpf_run_ctx *old_run_ctx; + int err; + + if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) { + err = 0; + goto out; + } + + migrate_disable(); + rcu_read_lock(); + old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); + err = bpf_prog_run(link->link.prog, regs); + bpf_reset_run_ctx(old_run_ctx); + rcu_read_unlock(); + migrate_enable(); + + out: + __this_cpu_dec(bpf_prog_active); + return err; +} + +static int +kprobe_multi_link_handler(struct fprobe *fp, unsigned long fentry_ip, + unsigned long ret_ip, struct pt_regs *regs, + void *data) +{ + struct bpf_kprobe_multi_link *link; + + link = container_of(fp, struct bpf_kprobe_multi_link, fp); + kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs); + return 0; +} + +static void +kprobe_multi_link_exit_handler(struct fprobe *fp, unsigned long fentry_ip, + unsigned long ret_ip, struct pt_regs *regs, + void *data) +{ + struct bpf_kprobe_multi_link *link; + + link = container_of(fp, struct bpf_kprobe_multi_link, fp); + kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs); +} + +static int symbols_cmp_r(const void *a, const void *b, const void *priv) +{ + const char **str_a = (const char **) a; + const char **str_b = (const char **) b; + + return strcmp(*str_a, *str_b); +} + +struct multi_symbols_sort { + const char **funcs; + u64 *cookies; +}; + +static void symbols_swap_r(void *a, void *b, int size, const void *priv) +{ + const struct multi_symbols_sort *data = priv; + const char **name_a = a, **name_b = b; + + swap(*name_a, *name_b); + + /* If defined, swap also related cookies. 
*/ + if (data->cookies) { + u64 *cookie_a, *cookie_b; + + cookie_a = data->cookies + (name_a - data->funcs); + cookie_b = data->cookies + (name_b - data->funcs); + swap(*cookie_a, *cookie_b); + } +} + +struct modules_array { + struct module **mods; + int mods_cnt; + int mods_cap; +}; + +static int add_module(struct modules_array *arr, struct module *mod) +{ + struct module **mods; + + if (arr->mods_cnt == arr->mods_cap) { + arr->mods_cap = max(16, arr->mods_cap * 3 / 2); + mods = krealloc_array(arr->mods, arr->mods_cap, sizeof(*mods), GFP_KERNEL); + if (!mods) + return -ENOMEM; + arr->mods = mods; + } + + arr->mods[arr->mods_cnt] = mod; + arr->mods_cnt++; + return 0; +} + +static bool has_module(struct modules_array *arr, struct module *mod) +{ + int i; + + for (i = arr->mods_cnt - 1; i >= 0; i--) { + if (arr->mods[i] == mod) + return true; + } + return false; +} + +static int get_modules_for_addrs(struct module ***mods, unsigned long *addrs, u32 addrs_cnt) +{ + struct modules_array arr = {}; + u32 i, err = 0; + + for (i = 0; i < addrs_cnt; i++) { + struct module *mod; + + preempt_disable(); + mod = __module_address(addrs[i]); + /* Either no module or we it's already stored */ + if (!mod || has_module(&arr, mod)) { + preempt_enable(); + continue; + } + if (!try_module_get(mod)) + err = -EINVAL; + preempt_enable(); + if (err) + break; + err = add_module(&arr, mod); + if (err) { + module_put(mod); + break; + } + } + + /* We return either err < 0 in case of error, ... */ + if (err) { + kprobe_multi_put_modules(arr.mods, arr.mods_cnt); + kfree(arr.mods); + return err; + } + + /* or number of modules found if everything is ok. */ + *mods = arr.mods; + return arr.mods_cnt; +} + +static int addrs_check_error_injection_list(unsigned long *addrs, u32 cnt) +{ + u32 i; + + for (i = 0; i < cnt; i++) { + if (!within_error_injection_list(addrs[i])) + return -EINVAL; + } + return 0; +} + +int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + struct bpf_kprobe_multi_link *link = NULL; + struct bpf_link_primer link_primer; + void __user *ucookies; + unsigned long *addrs; + u32 flags, cnt, size; + void __user *uaddrs; + u64 *cookies = NULL; + void __user *usyms; + int err; + + /* no support for 32bit archs yet */ + if (sizeof(u64) != sizeof(void *)) + return -EOPNOTSUPP; + + if (prog->expected_attach_type != BPF_TRACE_KPROBE_MULTI) + return -EINVAL; + + flags = attr->link_create.kprobe_multi.flags; + if (flags & ~BPF_F_KPROBE_MULTI_RETURN) + return -EINVAL; + + uaddrs = u64_to_user_ptr(attr->link_create.kprobe_multi.addrs); + usyms = u64_to_user_ptr(attr->link_create.kprobe_multi.syms); + if (!!uaddrs == !!usyms) + return -EINVAL; + + cnt = attr->link_create.kprobe_multi.cnt; + if (!cnt) + return -EINVAL; + if (cnt > MAX_KPROBE_MULTI_CNT) + return -E2BIG; + + size = cnt * sizeof(*addrs); + addrs = kvmalloc_array(cnt, sizeof(*addrs), GFP_KERNEL); + if (!addrs) + return -ENOMEM; + + ucookies = u64_to_user_ptr(attr->link_create.kprobe_multi.cookies); + if (ucookies) { + cookies = kvmalloc_array(cnt, sizeof(*addrs), GFP_KERNEL); + if (!cookies) { + err = -ENOMEM; + goto error; + } + if (copy_from_user(cookies, ucookies, size)) { + err = -EFAULT; + goto error; + } + } + + if (uaddrs) { + if (copy_from_user(addrs, uaddrs, size)) { + err = -EFAULT; + goto error; + } + } else { + struct multi_symbols_sort data = { + .cookies = cookies, + }; + struct user_syms us; + + err = copy_user_syms(&us, usyms, cnt); + if (err) + goto error; + + if (cookies) + data.funcs = us.syms; + + 
sort_r(us.syms, cnt, sizeof(*us.syms), symbols_cmp_r, + symbols_swap_r, &data); + + err = ftrace_lookup_symbols(us.syms, cnt, addrs); + free_user_syms(&us); + if (err) + goto error; + } + + if (prog->kprobe_override && addrs_check_error_injection_list(addrs, cnt)) { + err = -EINVAL; + goto error; + } + + link = kzalloc(sizeof(*link), GFP_KERNEL); + if (!link) { + err = -ENOMEM; + goto error; + } + + bpf_link_init(&link->link, BPF_LINK_TYPE_KPROBE_MULTI, + &bpf_kprobe_multi_link_lops, prog); + + err = bpf_link_prime(&link->link, &link_primer); + if (err) + goto error; + + if (flags & BPF_F_KPROBE_MULTI_RETURN) + link->fp.exit_handler = kprobe_multi_link_exit_handler; + else + link->fp.entry_handler = kprobe_multi_link_handler; + + link->addrs = addrs; + link->cookies = cookies; + link->cnt = cnt; + link->flags = flags; + + if (cookies) { + /* + * Sorting addresses will trigger sorting cookies as well + * (check bpf_kprobe_multi_cookie_swap). This way we can + * find cookie based on the address in bpf_get_attach_cookie + * helper. + */ + sort_r(addrs, cnt, sizeof(*addrs), + bpf_kprobe_multi_cookie_cmp, + bpf_kprobe_multi_cookie_swap, + link); + } + + err = get_modules_for_addrs(&link->mods, addrs, cnt); + if (err < 0) { + bpf_link_cleanup(&link_primer); + return err; + } + link->mods_cnt = err; + + err = register_fprobe_ips(&link->fp, addrs, cnt); + if (err) { + kprobe_multi_put_modules(link->mods, link->mods_cnt); + bpf_link_cleanup(&link_primer); + return err; + } + + return bpf_link_settle(&link_primer); + +error: + kfree(link); + kvfree(addrs); + kvfree(cookies); + return err; +} +#else /* !CONFIG_FPROBE */ +int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + return -EOPNOTSUPP; +} +static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx) +{ + return 0; +} +static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx) +{ + return 0; +} +#endif + +#ifdef CONFIG_UPROBES +struct bpf_uprobe_multi_link; + +struct bpf_uprobe { + struct bpf_uprobe_multi_link *link; + loff_t offset; + u64 cookie; + struct uprobe_consumer consumer; +}; + +struct bpf_uprobe_multi_link { + struct path path; + struct bpf_link link; + u32 cnt; + struct bpf_uprobe *uprobes; + struct task_struct *task; +}; + +struct bpf_uprobe_multi_run_ctx { + struct bpf_run_ctx run_ctx; + unsigned long entry_ip; + struct bpf_uprobe *uprobe; +}; + +static void bpf_uprobe_unregister(struct path *path, struct bpf_uprobe *uprobes, + u32 cnt) +{ + u32 i; + + for (i = 0; i < cnt; i++) { + uprobe_unregister(d_real_inode(path->dentry), uprobes[i].offset, + &uprobes[i].consumer); + } +} + +static void bpf_uprobe_multi_link_release(struct bpf_link *link) +{ + struct bpf_uprobe_multi_link *umulti_link; + + umulti_link = container_of(link, struct bpf_uprobe_multi_link, link); + bpf_uprobe_unregister(&umulti_link->path, umulti_link->uprobes, umulti_link->cnt); +} + +static void bpf_uprobe_multi_link_dealloc(struct bpf_link *link) +{ + struct bpf_uprobe_multi_link *umulti_link; + + umulti_link = container_of(link, struct bpf_uprobe_multi_link, link); + if (umulti_link->task) + put_task_struct(umulti_link->task); + path_put(&umulti_link->path); + kvfree(umulti_link->uprobes); + kfree(umulti_link); +} + +static const struct bpf_link_ops bpf_uprobe_multi_link_lops = { + .release = bpf_uprobe_multi_link_release, + .dealloc = bpf_uprobe_multi_link_dealloc, +}; + +static int uprobe_prog_run(struct bpf_uprobe *uprobe, + unsigned long entry_ip, + struct pt_regs *regs) +{ + struct bpf_uprobe_multi_link *link = 
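On the program side, the sorted addrs/cookies arrays set up above surface through bpf_get_func_ip() and bpf_get_attach_cookie(); an illustrative kprobe.multi sketch, where libbpf resolves the glob into the syms array passed to bpf_kprobe_multi_link_attach() (the pattern and names are arbitrary):

SEC("kprobe.multi/vfs_*")
int on_vfs_entry(struct pt_regs *ctx)
{
	bpf_printk("ip %lx cookie %llu",
		   bpf_get_func_ip(ctx), bpf_get_attach_cookie(ctx));
	return 0;
}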
uprobe->link; + struct bpf_uprobe_multi_run_ctx run_ctx = { + .entry_ip = entry_ip, + .uprobe = uprobe, + }; + struct bpf_prog *prog = link->link.prog; + bool sleepable = prog->aux->sleepable; + struct bpf_run_ctx *old_run_ctx; + int err = 0; + + if (link->task && current != link->task) + return 0; + + if (sleepable) + rcu_read_lock_trace(); + else + rcu_read_lock(); + + migrate_disable(); + + old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); + err = bpf_prog_run(link->link.prog, regs); + bpf_reset_run_ctx(old_run_ctx); + + migrate_enable(); + + if (sleepable) + rcu_read_unlock_trace(); + else + rcu_read_unlock(); + return err; +} + +static bool +uprobe_multi_link_filter(struct uprobe_consumer *con, enum uprobe_filter_ctx ctx, + struct mm_struct *mm) +{ + struct bpf_uprobe *uprobe; + + uprobe = container_of(con, struct bpf_uprobe, consumer); + return uprobe->link->task->mm == mm; +} + +static int +uprobe_multi_link_handler(struct uprobe_consumer *con, struct pt_regs *regs) +{ + struct bpf_uprobe *uprobe; + + uprobe = container_of(con, struct bpf_uprobe, consumer); + return uprobe_prog_run(uprobe, instruction_pointer(regs), regs); +} + +static int +uprobe_multi_link_ret_handler(struct uprobe_consumer *con, unsigned long func, struct pt_regs *regs) +{ + struct bpf_uprobe *uprobe; + + uprobe = container_of(con, struct bpf_uprobe, consumer); + return uprobe_prog_run(uprobe, func, regs); +} + +static u64 bpf_uprobe_multi_entry_ip(struct bpf_run_ctx *ctx) +{ + struct bpf_uprobe_multi_run_ctx *run_ctx; + + run_ctx = container_of(current->bpf_ctx, struct bpf_uprobe_multi_run_ctx, run_ctx); + return run_ctx->entry_ip; +} + +static u64 bpf_uprobe_multi_cookie(struct bpf_run_ctx *ctx) +{ + struct bpf_uprobe_multi_run_ctx *run_ctx; + + run_ctx = container_of(current->bpf_ctx, struct bpf_uprobe_multi_run_ctx, run_ctx); + return run_ctx->uprobe->cookie; +} + +int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + struct bpf_uprobe_multi_link *link = NULL; + unsigned long __user *uref_ctr_offsets; + unsigned long *ref_ctr_offsets = NULL; + struct bpf_link_primer link_primer; + struct bpf_uprobe *uprobes = NULL; + struct task_struct *task = NULL; + unsigned long __user *uoffsets; + u64 __user *ucookies; + void __user *upath; + u32 flags, cnt, i; + struct path path; + char *name; + pid_t pid; + int err; + + /* no support for 32bit archs yet */ + if (sizeof(u64) != sizeof(void *)) + return -EOPNOTSUPP; + + if (prog->expected_attach_type != BPF_TRACE_UPROBE_MULTI) + return -EINVAL; + + flags = attr->link_create.uprobe_multi.flags; + if (flags & ~BPF_F_UPROBE_MULTI_RETURN) + return -EINVAL; + + /* + * path, offsets and cnt are mandatory, + * ref_ctr_offsets and cookies are optional + */ + upath = u64_to_user_ptr(attr->link_create.uprobe_multi.path); + uoffsets = u64_to_user_ptr(attr->link_create.uprobe_multi.offsets); + cnt = attr->link_create.uprobe_multi.cnt; + + if (!upath || !uoffsets || !cnt) + return -EINVAL; + if (cnt > MAX_UPROBE_MULTI_CNT) + return -E2BIG; + + uref_ctr_offsets = u64_to_user_ptr(attr->link_create.uprobe_multi.ref_ctr_offsets); + ucookies = u64_to_user_ptr(attr->link_create.uprobe_multi.cookies); + + name = strndup_user(upath, PATH_MAX); + if (IS_ERR(name)) { + err = PTR_ERR(name); + return err; + } + + err = kern_path(name, LOOKUP_FOLLOW, &path); + kfree(name); + if (err) + return err; + + if (!d_is_reg(path.dentry)) { + err = -EBADF; + goto error_path_put; + } + + pid = attr->link_create.uprobe_multi.pid; + if (pid) { + rcu_read_lock(); + task = 
get_pid_task(find_vpid(pid), PIDTYPE_PID); + rcu_read_unlock(); + if (!task) { + err = -ESRCH; + goto error_path_put; + } + } + + err = -ENOMEM; + + link = kzalloc(sizeof(*link), GFP_KERNEL); + uprobes = kvcalloc(cnt, sizeof(*uprobes), GFP_KERNEL); + + if (!uprobes || !link) + goto error_free; + + if (uref_ctr_offsets) { + ref_ctr_offsets = kvcalloc(cnt, sizeof(*ref_ctr_offsets), GFP_KERNEL); + if (!ref_ctr_offsets) + goto error_free; + } + + for (i = 0; i < cnt; i++) { + if (ucookies && __get_user(uprobes[i].cookie, ucookies + i)) { + err = -EFAULT; + goto error_free; + } + if (uref_ctr_offsets && __get_user(ref_ctr_offsets[i], uref_ctr_offsets + i)) { + err = -EFAULT; + goto error_free; + } + if (__get_user(uprobes[i].offset, uoffsets + i)) { + err = -EFAULT; + goto error_free; + } + + uprobes[i].link = link; + + if (flags & BPF_F_UPROBE_MULTI_RETURN) + uprobes[i].consumer.ret_handler = uprobe_multi_link_ret_handler; + else + uprobes[i].consumer.handler = uprobe_multi_link_handler; + + if (pid) + uprobes[i].consumer.filter = uprobe_multi_link_filter; + } + + link->cnt = cnt; + link->uprobes = uprobes; + link->path = path; + link->task = task; + + bpf_link_init(&link->link, BPF_LINK_TYPE_UPROBE_MULTI, + &bpf_uprobe_multi_link_lops, prog); + + for (i = 0; i < cnt; i++) { + err = uprobe_register_refctr(d_real_inode(link->path.dentry), + uprobes[i].offset, + ref_ctr_offsets ? ref_ctr_offsets[i] : 0, + &uprobes[i].consumer); + if (err) { + bpf_uprobe_unregister(&path, uprobes, i); + goto error_free; + } + } + + err = bpf_link_prime(&link->link, &link_primer); + if (err) + goto error_free; + + kvfree(ref_ctr_offsets); + return bpf_link_settle(&link_primer); + +error_free: + kvfree(ref_ctr_offsets); + kvfree(uprobes); + kfree(link); + if (task) + put_task_struct(task); +error_path_put: + path_put(&path); + return err; +} +#else /* !CONFIG_UPROBES */ +int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + return -EOPNOTSUPP; +} +static u64 bpf_uprobe_multi_cookie(struct bpf_run_ctx *ctx) +{ + return 0; +} +static u64 bpf_uprobe_multi_entry_ip(struct bpf_run_ctx *ctx) +{ + return 0; +} +#endif /* CONFIG_UPROBES */ diff --git a/kernel/trace/bpf_trace.h b/kernel/trace/bpf_trace.h new file mode 100644 index 0000000000..9acbc11ac7 --- /dev/null +++ b/kernel/trace/bpf_trace.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM bpf_trace + +#if !defined(_TRACE_BPF_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) + +#define _TRACE_BPF_TRACE_H + +#include <linux/tracepoint.h> + +TRACE_EVENT(bpf_trace_printk, + + TP_PROTO(const char *bpf_string), + + TP_ARGS(bpf_string), + + TP_STRUCT__entry( + __string(bpf_string, bpf_string) + ), + + TP_fast_assign( + __assign_str(bpf_string, bpf_string); + ), + + TP_printk("%s", __get_str(bpf_string)) +); + +#endif /* _TRACE_BPF_TRACE_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE bpf_trace + +#include <trace/define_trace.h> diff --git a/kernel/trace/error_report-traces.c b/kernel/trace/error_report-traces.c new file mode 100644 index 0000000000..f89792c25b --- /dev/null +++ b/kernel/trace/error_report-traces.c @@ -0,0 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Error reporting trace points. + * + * Copyright (C) 2021, Google LLC. 
+ */ + +#define CREATE_TRACE_POINTS +#include <trace/events/error_report.h> + +EXPORT_TRACEPOINT_SYMBOL_GPL(error_report_end); diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c new file mode 100644 index 0000000000..c83c005e65 --- /dev/null +++ b/kernel/trace/fgraph.c @@ -0,0 +1,689 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Infrastructure to took into function calls and returns. + * Copyright (c) 2008-2009 Frederic Weisbecker <fweisbec@gmail.com> + * Mostly borrowed from function tracer which + * is Copyright (c) Steven Rostedt <srostedt@redhat.com> + * + * Highly modified by Steven Rostedt (VMware). + */ +#include <linux/jump_label.h> +#include <linux/suspend.h> +#include <linux/ftrace.h> +#include <linux/slab.h> + +#include <trace/events/sched.h> + +#include "ftrace_internal.h" +#include "trace.h" + +#ifdef CONFIG_DYNAMIC_FTRACE +#define ASSIGN_OPS_HASH(opsname, val) \ + .func_hash = val, \ + .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock), +#else +#define ASSIGN_OPS_HASH(opsname, val) +#endif + +DEFINE_STATIC_KEY_FALSE(kill_ftrace_graph); +int ftrace_graph_active; + +/* Both enabled by default (can be cleared by function_graph tracer flags */ +static bool fgraph_sleep_time = true; + +#ifdef CONFIG_DYNAMIC_FTRACE +/* + * archs can override this function if they must do something + * to enable hook for graph tracer. + */ +int __weak ftrace_enable_ftrace_graph_caller(void) +{ + return 0; +} + +/* + * archs can override this function if they must do something + * to disable hook for graph tracer. + */ +int __weak ftrace_disable_ftrace_graph_caller(void) +{ + return 0; +} +#endif + +/** + * ftrace_graph_stop - set to permanently disable function graph tracing + * + * In case of an error int function graph tracing, this is called + * to try to keep function graph tracing from causing any more harm. + * Usually this is pretty severe and this is called to try to at least + * get a warning out to the user. + */ +void ftrace_graph_stop(void) +{ + static_branch_enable(&kill_ftrace_graph); +} + +/* Add a function return address to the trace stack on thread info.*/ +static int +ftrace_push_return_trace(unsigned long ret, unsigned long func, + unsigned long frame_pointer, unsigned long *retp) +{ + unsigned long long calltime; + int index; + + if (unlikely(ftrace_graph_is_dead())) + return -EBUSY; + + if (!current->ret_stack) + return -EBUSY; + + /* + * We must make sure the ret_stack is tested before we read + * anything else. + */ + smp_rmb(); + + /* The return trace stack is full */ + if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) { + atomic_inc(¤t->trace_overrun); + return -EBUSY; + } + + calltime = trace_clock_local(); + + index = ++current->curr_ret_stack; + barrier(); + current->ret_stack[index].ret = ret; + current->ret_stack[index].func = func; + current->ret_stack[index].calltime = calltime; +#ifdef HAVE_FUNCTION_GRAPH_FP_TEST + current->ret_stack[index].fp = frame_pointer; +#endif +#ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR + current->ret_stack[index].retp = retp; +#endif + return 0; +} + +/* + * Not all archs define MCOUNT_INSN_SIZE which is used to look for direct + * functions. But those archs currently don't support direct functions + * anyway, and ftrace_find_rec_direct() is just a stub for them. + * Define MCOUNT_INSN_SIZE to keep those archs compiling. 
+ */ +#ifndef MCOUNT_INSN_SIZE +/* Make sure this only works without direct calls */ +# ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +# error MCOUNT_INSN_SIZE not defined with direct calls enabled +# endif +# define MCOUNT_INSN_SIZE 0 +#endif + +int function_graph_enter(unsigned long ret, unsigned long func, + unsigned long frame_pointer, unsigned long *retp) +{ + struct ftrace_graph_ent trace; + +#ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS + /* + * Skip graph tracing if the return location is served by direct trampoline, + * since call sequence and return addresses are unpredictable anyway. + * Ex: BPF trampoline may call original function and may skip frame + * depending on type of BPF programs attached. + */ + if (ftrace_direct_func_count && + ftrace_find_rec_direct(ret - MCOUNT_INSN_SIZE)) + return -EBUSY; +#endif + trace.func = func; + trace.depth = ++current->curr_ret_depth; + + if (ftrace_push_return_trace(ret, func, frame_pointer, retp)) + goto out; + + /* Only trace if the calling function expects to */ + if (!ftrace_graph_entry(&trace)) + goto out_ret; + + return 0; + out_ret: + current->curr_ret_stack--; + out: + current->curr_ret_depth--; + return -EBUSY; +} + +/* Retrieve a function return address to the trace stack on thread info.*/ +static void +ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret, + unsigned long frame_pointer) +{ + int index; + + index = current->curr_ret_stack; + + if (unlikely(index < 0 || index >= FTRACE_RETFUNC_DEPTH)) { + ftrace_graph_stop(); + WARN_ON(1); + /* Might as well panic, otherwise we have no where to go */ + *ret = (unsigned long)panic; + return; + } + +#ifdef HAVE_FUNCTION_GRAPH_FP_TEST + /* + * The arch may choose to record the frame pointer used + * and check it here to make sure that it is what we expect it + * to be. If gcc does not set the place holder of the return + * address in the frame pointer, and does a copy instead, then + * the function graph trace will fail. This test detects this + * case. + * + * Currently, x86_32 with optimize for size (-Os) makes the latest + * gcc do the above. + * + * Note, -mfentry does not use frame pointers, and this test + * is not needed if CC_USING_FENTRY is set. + */ + if (unlikely(current->ret_stack[index].fp != frame_pointer)) { + ftrace_graph_stop(); + WARN(1, "Bad frame pointer: expected %lx, received %lx\n" + " from func %ps return to %lx\n", + current->ret_stack[index].fp, + frame_pointer, + (void *)current->ret_stack[index].func, + current->ret_stack[index].ret); + *ret = (unsigned long)panic; + return; + } +#endif + + *ret = current->ret_stack[index].ret; + trace->func = current->ret_stack[index].func; + trace->calltime = current->ret_stack[index].calltime; + trace->overrun = atomic_read(¤t->trace_overrun); + trace->depth = current->curr_ret_depth--; + /* + * We still want to trace interrupts coming in if + * max_depth is set to 1. Make sure the decrement is + * seen before ftrace_graph_return. + */ + barrier(); +} + +/* + * Hibernation protection. + * The state of the current task is too much unstable during + * suspend/restore to disk. We want to protect against that. 
+ */ +static int +ftrace_suspend_notifier_call(struct notifier_block *bl, unsigned long state, + void *unused) +{ + switch (state) { + case PM_HIBERNATION_PREPARE: + pause_graph_tracing(); + break; + + case PM_POST_HIBERNATION: + unpause_graph_tracing(); + break; + } + return NOTIFY_DONE; +} + +static struct notifier_block ftrace_suspend_notifier = { + .notifier_call = ftrace_suspend_notifier_call, +}; + +/* fgraph_ret_regs is not defined without CONFIG_FUNCTION_GRAPH_RETVAL */ +struct fgraph_ret_regs; + +/* + * Send the trace to the ring-buffer. + * @return the original return address. + */ +static unsigned long __ftrace_return_to_handler(struct fgraph_ret_regs *ret_regs, + unsigned long frame_pointer) +{ + struct ftrace_graph_ret trace; + unsigned long ret; + + ftrace_pop_return_trace(&trace, &ret, frame_pointer); +#ifdef CONFIG_FUNCTION_GRAPH_RETVAL + trace.retval = fgraph_ret_regs_return_value(ret_regs); +#endif + trace.rettime = trace_clock_local(); + ftrace_graph_return(&trace); + /* + * The ftrace_graph_return() may still access the current + * ret_stack structure, we need to make sure the update of + * curr_ret_stack is after that. + */ + barrier(); + current->curr_ret_stack--; + + if (unlikely(!ret)) { + ftrace_graph_stop(); + WARN_ON(1); + /* Might as well panic. What else to do? */ + ret = (unsigned long)panic; + } + + return ret; +} + +/* + * After all architecures have selected HAVE_FUNCTION_GRAPH_RETVAL, we can + * leave only ftrace_return_to_handler(ret_regs). + */ +#ifdef CONFIG_HAVE_FUNCTION_GRAPH_RETVAL +unsigned long ftrace_return_to_handler(struct fgraph_ret_regs *ret_regs) +{ + return __ftrace_return_to_handler(ret_regs, + fgraph_ret_regs_frame_pointer(ret_regs)); +} +#else +unsigned long ftrace_return_to_handler(unsigned long frame_pointer) +{ + return __ftrace_return_to_handler(NULL, frame_pointer); +} +#endif + +/** + * ftrace_graph_get_ret_stack - return the entry of the shadow stack + * @task: The task to read the shadow stack from + * @idx: Index down the shadow stack + * + * Return the ret_struct on the shadow stack of the @task at the + * call graph at @idx starting with zero. If @idx is zero, it + * will return the last saved ret_stack entry. If it is greater than + * zero, it will return the corresponding ret_stack for the depth + * of saved return addresses. + */ +struct ftrace_ret_stack * +ftrace_graph_get_ret_stack(struct task_struct *task, int idx) +{ + idx = task->curr_ret_stack - idx; + + if (idx >= 0 && idx <= task->curr_ret_stack) + return &task->ret_stack[idx]; + + return NULL; +} + +/** + * ftrace_graph_ret_addr - convert a potentially modified stack return address + * to its original value + * + * This function can be called by stack unwinding code to convert a found stack + * return address ('ret') to its original value, in case the function graph + * tracer has modified it to be 'return_to_handler'. If the address hasn't + * been modified, the unchanged value of 'ret' is returned. + * + * 'idx' is a state variable which should be initialized by the caller to zero + * before the first call. + * + * 'retp' is a pointer to the return address on the stack. It's ignored if + * the arch doesn't have HAVE_FUNCTION_GRAPH_RET_ADDR_PTR defined. 
+ */ +#ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR +unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, + unsigned long ret, unsigned long *retp) +{ + int index = task->curr_ret_stack; + int i; + + if (ret != (unsigned long)dereference_kernel_function_descriptor(return_to_handler)) + return ret; + + if (index < 0) + return ret; + + for (i = 0; i <= index; i++) + if (task->ret_stack[i].retp == retp) + return task->ret_stack[i].ret; + + return ret; +} +#else /* !HAVE_FUNCTION_GRAPH_RET_ADDR_PTR */ +unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, + unsigned long ret, unsigned long *retp) +{ + int task_idx; + + if (ret != (unsigned long)dereference_kernel_function_descriptor(return_to_handler)) + return ret; + + task_idx = task->curr_ret_stack; + + if (!task->ret_stack || task_idx < *idx) + return ret; + + task_idx -= *idx; + (*idx)++; + + return task->ret_stack[task_idx].ret; +} +#endif /* HAVE_FUNCTION_GRAPH_RET_ADDR_PTR */ + +static struct ftrace_ops graph_ops = { + .func = ftrace_graph_func, + .flags = FTRACE_OPS_FL_INITIALIZED | + FTRACE_OPS_FL_PID | + FTRACE_OPS_GRAPH_STUB, +#ifdef FTRACE_GRAPH_TRAMP_ADDR + .trampoline = FTRACE_GRAPH_TRAMP_ADDR, + /* trampoline_size is only needed for dynamically allocated tramps */ +#endif + ASSIGN_OPS_HASH(graph_ops, &global_ops.local_hash) +}; + +void ftrace_graph_sleep_time_control(bool enable) +{ + fgraph_sleep_time = enable; +} + +int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) +{ + return 0; +} + +/* + * Simply points to ftrace_stub, but with the proper protocol. + * Defined by the linker script in linux/vmlinux.lds.h + */ +extern void ftrace_stub_graph(struct ftrace_graph_ret *); + +/* The callbacks that hook a function */ +trace_func_graph_ret_t ftrace_graph_return = ftrace_stub_graph; +trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub; +static trace_func_graph_ent_t __ftrace_graph_entry = ftrace_graph_entry_stub; + +/* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. */ +static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) +{ + int i; + int ret = 0; + int start = 0, end = FTRACE_RETSTACK_ALLOC_SIZE; + struct task_struct *g, *t; + + for (i = 0; i < FTRACE_RETSTACK_ALLOC_SIZE; i++) { + ret_stack_list[i] = + kmalloc_array(FTRACE_RETFUNC_DEPTH, + sizeof(struct ftrace_ret_stack), + GFP_KERNEL); + if (!ret_stack_list[i]) { + start = 0; + end = i; + ret = -ENOMEM; + goto free; + } + } + + rcu_read_lock(); + for_each_process_thread(g, t) { + if (start == end) { + ret = -EAGAIN; + goto unlock; + } + + if (t->ret_stack == NULL) { + atomic_set(&t->trace_overrun, 0); + t->curr_ret_stack = -1; + t->curr_ret_depth = -1; + /* Make sure the tasks see the -1 first: */ + smp_wmb(); + t->ret_stack = ret_stack_list[start++]; + } + } + +unlock: + rcu_read_unlock(); +free: + for (i = start; i < end; i++) + kfree(ret_stack_list[i]); + return ret; +} + +static void +ftrace_graph_probe_sched_switch(void *ignore, bool preempt, + struct task_struct *prev, + struct task_struct *next, + unsigned int prev_state) +{ + unsigned long long timestamp; + int index; + + /* + * Does the user want to count the time a function was asleep. + * If so, do not update the time stamps. 
+ */ + if (fgraph_sleep_time) + return; + + timestamp = trace_clock_local(); + + prev->ftrace_timestamp = timestamp; + + /* only process tasks that we timestamped */ + if (!next->ftrace_timestamp) + return; + + /* + * Update all the counters in next to make up for the + * time next was sleeping. + */ + timestamp -= next->ftrace_timestamp; + + for (index = next->curr_ret_stack; index >= 0; index--) + next->ret_stack[index].calltime += timestamp; +} + +static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace) +{ + if (!ftrace_ops_test(&global_ops, trace->func, NULL)) + return 0; + return __ftrace_graph_entry(trace); +} + +/* + * The function graph tracer should only trace the functions defined + * by set_ftrace_filter and set_ftrace_notrace. If another function + * tracer ops is registered, the graph tracer requires testing the + * function against the global ops, and not just trace any function + * that any ftrace_ops registered. + */ +void update_function_graph_func(void) +{ + struct ftrace_ops *op; + bool do_test = false; + + /* + * The graph and global ops share the same set of functions + * to test. If any other ops is on the list, then + * the graph tracing needs to test if its the function + * it should call. + */ + do_for_each_ftrace_op(op, ftrace_ops_list) { + if (op != &global_ops && op != &graph_ops && + op != &ftrace_list_end) { + do_test = true; + /* in double loop, break out with goto */ + goto out; + } + } while_for_each_ftrace_op(op); + out: + if (do_test) + ftrace_graph_entry = ftrace_graph_entry_test; + else + ftrace_graph_entry = __ftrace_graph_entry; +} + +static DEFINE_PER_CPU(struct ftrace_ret_stack *, idle_ret_stack); + +static void +graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack) +{ + atomic_set(&t->trace_overrun, 0); + t->ftrace_timestamp = 0; + /* make curr_ret_stack visible before we add the ret_stack */ + smp_wmb(); + t->ret_stack = ret_stack; +} + +/* + * Allocate a return stack for the idle task. May be the first + * time through, or it may be done by CPU hotplug online. + */ +void ftrace_graph_init_idle_task(struct task_struct *t, int cpu) +{ + t->curr_ret_stack = -1; + t->curr_ret_depth = -1; + /* + * The idle task has no parent, it either has its own + * stack or no stack at all. 
+ */ + if (t->ret_stack) + WARN_ON(t->ret_stack != per_cpu(idle_ret_stack, cpu)); + + if (ftrace_graph_active) { + struct ftrace_ret_stack *ret_stack; + + ret_stack = per_cpu(idle_ret_stack, cpu); + if (!ret_stack) { + ret_stack = + kmalloc_array(FTRACE_RETFUNC_DEPTH, + sizeof(struct ftrace_ret_stack), + GFP_KERNEL); + if (!ret_stack) + return; + per_cpu(idle_ret_stack, cpu) = ret_stack; + } + graph_init_task(t, ret_stack); + } +} + +/* Allocate a return stack for newly created task */ +void ftrace_graph_init_task(struct task_struct *t) +{ + /* Make sure we do not use the parent ret_stack */ + t->ret_stack = NULL; + t->curr_ret_stack = -1; + t->curr_ret_depth = -1; + + if (ftrace_graph_active) { + struct ftrace_ret_stack *ret_stack; + + ret_stack = kmalloc_array(FTRACE_RETFUNC_DEPTH, + sizeof(struct ftrace_ret_stack), + GFP_KERNEL); + if (!ret_stack) + return; + graph_init_task(t, ret_stack); + } +} + +void ftrace_graph_exit_task(struct task_struct *t) +{ + struct ftrace_ret_stack *ret_stack = t->ret_stack; + + t->ret_stack = NULL; + /* NULL must become visible to IRQs before we free it: */ + barrier(); + + kfree(ret_stack); +} + +/* Allocate a return stack for each task */ +static int start_graph_tracing(void) +{ + struct ftrace_ret_stack **ret_stack_list; + int ret, cpu; + + ret_stack_list = kmalloc_array(FTRACE_RETSTACK_ALLOC_SIZE, + sizeof(struct ftrace_ret_stack *), + GFP_KERNEL); + + if (!ret_stack_list) + return -ENOMEM; + + /* The cpu_boot init_task->ret_stack will never be freed */ + for_each_online_cpu(cpu) { + if (!idle_task(cpu)->ret_stack) + ftrace_graph_init_idle_task(idle_task(cpu), cpu); + } + + do { + ret = alloc_retstack_tasklist(ret_stack_list); + } while (ret == -EAGAIN); + + if (!ret) { + ret = register_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); + if (ret) + pr_info("ftrace_graph: Couldn't activate tracepoint" + " probe to kernel_sched_switch\n"); + } + + kfree(ret_stack_list); + return ret; +} + +int register_ftrace_graph(struct fgraph_ops *gops) +{ + int ret = 0; + + mutex_lock(&ftrace_lock); + + /* we currently allow only one tracer registered at a time */ + if (ftrace_graph_active) { + ret = -EBUSY; + goto out; + } + + register_pm_notifier(&ftrace_suspend_notifier); + + ftrace_graph_active++; + ret = start_graph_tracing(); + if (ret) { + ftrace_graph_active--; + goto out; + } + + ftrace_graph_return = gops->retfunc; + + /* + * Update the indirect function to the entryfunc, and the + * function that gets called to the entry_test first. Then + * call the update fgraph entry function to determine if + * the entryfunc should be called directly or not. 
+ */ + __ftrace_graph_entry = gops->entryfunc; + ftrace_graph_entry = ftrace_graph_entry_test; + update_function_graph_func(); + + ret = ftrace_startup(&graph_ops, FTRACE_START_FUNC_RET); +out: + mutex_unlock(&ftrace_lock); + return ret; +} + +void unregister_ftrace_graph(struct fgraph_ops *gops) +{ + mutex_lock(&ftrace_lock); + + if (unlikely(!ftrace_graph_active)) + goto out; + + ftrace_graph_active--; + ftrace_graph_return = ftrace_stub_graph; + ftrace_graph_entry = ftrace_graph_entry_stub; + __ftrace_graph_entry = ftrace_graph_entry_stub; + ftrace_shutdown(&graph_ops, FTRACE_STOP_FUNC_RET); + unregister_pm_notifier(&ftrace_suspend_notifier); + unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); + + out: + mutex_unlock(&ftrace_lock); +} diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c new file mode 100644 index 0000000000..881f90f0cb --- /dev/null +++ b/kernel/trace/fprobe.c @@ -0,0 +1,396 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fprobe - Simple ftrace probe wrapper for function entry. + */ +#define pr_fmt(fmt) "fprobe: " fmt + +#include <linux/err.h> +#include <linux/fprobe.h> +#include <linux/kallsyms.h> +#include <linux/kprobes.h> +#include <linux/rethook.h> +#include <linux/slab.h> +#include <linux/sort.h> + +#include "trace.h" + +struct fprobe_rethook_node { + struct rethook_node node; + unsigned long entry_ip; + unsigned long entry_parent_ip; + char data[]; +}; + +static inline void __fprobe_handler(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *ops, struct ftrace_regs *fregs) +{ + struct fprobe_rethook_node *fpr; + struct rethook_node *rh = NULL; + struct fprobe *fp; + void *entry_data = NULL; + int ret = 0; + + fp = container_of(ops, struct fprobe, ops); + + if (fp->exit_handler) { + rh = rethook_try_get(fp->rethook); + if (!rh) { + fp->nmissed++; + return; + } + fpr = container_of(rh, struct fprobe_rethook_node, node); + fpr->entry_ip = ip; + fpr->entry_parent_ip = parent_ip; + if (fp->entry_data_size) + entry_data = fpr->data; + } + + if (fp->entry_handler) + ret = fp->entry_handler(fp, ip, parent_ip, ftrace_get_regs(fregs), entry_data); + + /* If entry_handler returns !0, nmissed is not counted. */ + if (rh) { + if (ret) + rethook_recycle(rh); + else + rethook_hook(rh, ftrace_get_regs(fregs), true); + } +} + +static void fprobe_handler(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *ops, struct ftrace_regs *fregs) +{ + struct fprobe *fp; + int bit; + + fp = container_of(ops, struct fprobe, ops); + if (fprobe_disabled(fp)) + return; + + /* recursion detection has to go before any traceable function and + * all functions before this point should be marked as notrace + */ + bit = ftrace_test_recursion_trylock(ip, parent_ip); + if (bit < 0) { + fp->nmissed++; + return; + } + __fprobe_handler(ip, parent_ip, ops, fregs); + ftrace_test_recursion_unlock(bit); + +} +NOKPROBE_SYMBOL(fprobe_handler); + +static void fprobe_kprobe_handler(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *ops, struct ftrace_regs *fregs) +{ + struct fprobe *fp; + int bit; + + fp = container_of(ops, struct fprobe, ops); + if (fprobe_disabled(fp)) + return; + + /* recursion detection has to go before any traceable function and + * all functions called before this point should be marked as notrace + */ + bit = ftrace_test_recursion_trylock(ip, parent_ip); + if (bit < 0) { + fp->nmissed++; + return; + } + + /* + * This user handler is shared with other kprobes and is not expected to be + * called recursively. 
So if any other kprobe handler is running, this will + * exit as kprobe does. See the section 'Share the callbacks with kprobes' + * in Documentation/trace/fprobe.rst for more information. + */ + if (unlikely(kprobe_running())) { + fp->nmissed++; + goto recursion_unlock; + } + + kprobe_busy_begin(); + __fprobe_handler(ip, parent_ip, ops, fregs); + kprobe_busy_end(); + +recursion_unlock: + ftrace_test_recursion_unlock(bit); +} + +static void fprobe_exit_handler(struct rethook_node *rh, void *data, + unsigned long ret_ip, struct pt_regs *regs) +{ + struct fprobe *fp = (struct fprobe *)data; + struct fprobe_rethook_node *fpr; + int bit; + + if (!fp || fprobe_disabled(fp)) + return; + + fpr = container_of(rh, struct fprobe_rethook_node, node); + + /* + * we need to assure no calls to traceable functions in-between the + * end of fprobe_handler and the beginning of fprobe_exit_handler. + */ + bit = ftrace_test_recursion_trylock(fpr->entry_ip, fpr->entry_parent_ip); + if (bit < 0) { + fp->nmissed++; + return; + } + + fp->exit_handler(fp, fpr->entry_ip, ret_ip, regs, + fp->entry_data_size ? (void *)fpr->data : NULL); + ftrace_test_recursion_unlock(bit); +} +NOKPROBE_SYMBOL(fprobe_exit_handler); + +static int symbols_cmp(const void *a, const void *b) +{ + const char **str_a = (const char **) a; + const char **str_b = (const char **) b; + + return strcmp(*str_a, *str_b); +} + +/* Convert ftrace location address from symbols */ +static unsigned long *get_ftrace_locations(const char **syms, int num) +{ + unsigned long *addrs; + + /* Convert symbols to symbol address */ + addrs = kcalloc(num, sizeof(*addrs), GFP_KERNEL); + if (!addrs) + return ERR_PTR(-ENOMEM); + + /* ftrace_lookup_symbols expects sorted symbols */ + sort(syms, num, sizeof(*syms), symbols_cmp, NULL); + + if (!ftrace_lookup_symbols(syms, num, addrs)) + return addrs; + + kfree(addrs); + return ERR_PTR(-ENOENT); +} + +static void fprobe_init(struct fprobe *fp) +{ + fp->nmissed = 0; + if (fprobe_shared_with_kprobes(fp)) + fp->ops.func = fprobe_kprobe_handler; + else + fp->ops.func = fprobe_handler; + fp->ops.flags |= FTRACE_OPS_FL_SAVE_REGS; +} + +static int fprobe_init_rethook(struct fprobe *fp, int num) +{ + int i, size; + + if (num <= 0) + return -EINVAL; + + if (!fp->exit_handler) { + fp->rethook = NULL; + return 0; + } + + /* Initialize rethook if needed */ + if (fp->nr_maxactive) + size = fp->nr_maxactive; + else + size = num * num_possible_cpus() * 2; + if (size <= 0) + return -EINVAL; + + fp->rethook = rethook_alloc((void *)fp, fprobe_exit_handler); + if (!fp->rethook) + return -ENOMEM; + for (i = 0; i < size; i++) { + struct fprobe_rethook_node *node; + + node = kzalloc(sizeof(*node) + fp->entry_data_size, GFP_KERNEL); + if (!node) { + rethook_free(fp->rethook); + fp->rethook = NULL; + return -ENOMEM; + } + rethook_add_node(fp->rethook, &node->node); + } + return 0; +} + +static void fprobe_fail_cleanup(struct fprobe *fp) +{ + if (fp->rethook) { + /* Don't need to cleanup rethook->handler because this is not used. */ + rethook_free(fp->rethook); + fp->rethook = NULL; + } + ftrace_free_filter(&fp->ops); +} + +/** + * register_fprobe() - Register fprobe to ftrace by pattern. + * @fp: A fprobe data structure to be registered. + * @filter: A wildcard pattern of probed symbols. + * @notfilter: A wildcard pattern of NOT probed symbols. + * + * Register @fp to ftrace for enabling the probe on the symbols matched to @filter. + * If @notfilter is not NULL, the symbols matched the @notfilter are not probed. 
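+ *
+ * A minimal usage sketch (my_entry_handler and the patterns are
+ * hypothetical, error handling omitted):
+ *
+ *	static struct fprobe fp = {
+ *		.entry_handler	= my_entry_handler,
+ *	};
+ *
+ *	ret = register_fprobe(&fp, "func_prefix*", "func_prefix_skip*");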
+ * + * Return 0 if @fp is registered successfully, -errno if not. + */ +int register_fprobe(struct fprobe *fp, const char *filter, const char *notfilter) +{ + struct ftrace_hash *hash; + unsigned char *str; + int ret, len; + + if (!fp || !filter) + return -EINVAL; + + fprobe_init(fp); + + len = strlen(filter); + str = kstrdup(filter, GFP_KERNEL); + ret = ftrace_set_filter(&fp->ops, str, len, 0); + kfree(str); + if (ret) + return ret; + + if (notfilter) { + len = strlen(notfilter); + str = kstrdup(notfilter, GFP_KERNEL); + ret = ftrace_set_notrace(&fp->ops, str, len, 0); + kfree(str); + if (ret) + goto out; + } + + /* TODO: + * correctly calculate the total number of filtered symbols + * from both filter and notfilter. + */ + hash = rcu_access_pointer(fp->ops.local_hash.filter_hash); + if (WARN_ON_ONCE(!hash)) + goto out; + + ret = fprobe_init_rethook(fp, (int)hash->count); + if (!ret) + ret = register_ftrace_function(&fp->ops); + +out: + if (ret) + fprobe_fail_cleanup(fp); + return ret; +} +EXPORT_SYMBOL_GPL(register_fprobe); + +/** + * register_fprobe_ips() - Register fprobe to ftrace by address. + * @fp: A fprobe data structure to be registered. + * @addrs: An array of target ftrace location addresses. + * @num: The number of entries of @addrs. + * + * Register @fp to ftrace for enabling the probe on the address given by @addrs. + * The @addrs must be the addresses of ftrace location address, which may be + * the symbol address + arch-dependent offset. + * If you unsure what this mean, please use other registration functions. + * + * Return 0 if @fp is registered successfully, -errno if not. + */ +int register_fprobe_ips(struct fprobe *fp, unsigned long *addrs, int num) +{ + int ret; + + if (!fp || !addrs || num <= 0) + return -EINVAL; + + fprobe_init(fp); + + ret = ftrace_set_filter_ips(&fp->ops, addrs, num, 0, 0); + if (ret) + return ret; + + ret = fprobe_init_rethook(fp, num); + if (!ret) + ret = register_ftrace_function(&fp->ops); + + if (ret) + fprobe_fail_cleanup(fp); + return ret; +} +EXPORT_SYMBOL_GPL(register_fprobe_ips); + +/** + * register_fprobe_syms() - Register fprobe to ftrace by symbols. + * @fp: A fprobe data structure to be registered. + * @syms: An array of target symbols. + * @num: The number of entries of @syms. + * + * Register @fp to the symbols given by @syms array. This will be useful if + * you are sure the symbols exist in the kernel. + * + * Return 0 if @fp is registered successfully, -errno if not. + */ +int register_fprobe_syms(struct fprobe *fp, const char **syms, int num) +{ + unsigned long *addrs; + int ret; + + if (!fp || !syms || num <= 0) + return -EINVAL; + + addrs = get_ftrace_locations(syms, num); + if (IS_ERR(addrs)) + return PTR_ERR(addrs); + + ret = register_fprobe_ips(fp, addrs, num); + + kfree(addrs); + + return ret; +} +EXPORT_SYMBOL_GPL(register_fprobe_syms); + +bool fprobe_is_registered(struct fprobe *fp) +{ + if (!fp || (fp->ops.saved_func != fprobe_handler && + fp->ops.saved_func != fprobe_kprobe_handler)) + return false; + return true; +} + +/** + * unregister_fprobe() - Unregister fprobe from ftrace + * @fp: A fprobe data structure to be unregistered. + * + * Unregister fprobe (and remove ftrace hooks from the function entries). + * + * Return 0 if @fp is unregistered successfully, -errno if not. 
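+ *
+ * Note that this also frees the rethook (if one was allocated for the
+ * exit handler) and the ftrace filter set up at registration time.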
+ */ +int unregister_fprobe(struct fprobe *fp) +{ + int ret; + + if (!fprobe_is_registered(fp)) + return -EINVAL; + + if (fp->rethook) + rethook_stop(fp->rethook); + + ret = unregister_ftrace_function(&fp->ops); + if (ret < 0) + return ret; + + if (fp->rethook) + rethook_free(fp->rethook); + + ftrace_free_filter(&fp->ops); + + return ret; +} +EXPORT_SYMBOL_GPL(unregister_fprobe); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c new file mode 100644 index 0000000000..b01ae7d360 --- /dev/null +++ b/kernel/trace/ftrace.c @@ -0,0 +1,8270 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Infrastructure for profiling code inserted by 'gcc -pg'. + * + * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com> + * Copyright (C) 2004-2008 Ingo Molnar <mingo@redhat.com> + * + * Originally ported from the -rt patch by: + * Copyright (C) 2007 Arnaldo Carvalho de Melo <acme@redhat.com> + * + * Based on code in the latency_tracer, that is: + * + * Copyright (C) 2004-2006 Ingo Molnar + * Copyright (C) 2004 Nadia Yvette Chambers + */ + +#include <linux/stop_machine.h> +#include <linux/clocksource.h> +#include <linux/sched/task.h> +#include <linux/kallsyms.h> +#include <linux/security.h> +#include <linux/seq_file.h> +#include <linux/tracefs.h> +#include <linux/hardirq.h> +#include <linux/kthread.h> +#include <linux/uaccess.h> +#include <linux/bsearch.h> +#include <linux/module.h> +#include <linux/ftrace.h> +#include <linux/sysctl.h> +#include <linux/slab.h> +#include <linux/ctype.h> +#include <linux/sort.h> +#include <linux/list.h> +#include <linux/hash.h> +#include <linux/rcupdate.h> +#include <linux/kprobes.h> + +#include <trace/events/sched.h> + +#include <asm/sections.h> +#include <asm/setup.h> + +#include "ftrace_internal.h" +#include "trace_output.h" +#include "trace_stat.h" + +/* Flags that do not get reset */ +#define FTRACE_NOCLEAR_FLAGS (FTRACE_FL_DISABLED | FTRACE_FL_TOUCHED | \ + FTRACE_FL_MODIFIED) + +#define FTRACE_INVALID_FUNCTION "__ftrace_invalid_address__" + +#define FTRACE_WARN_ON(cond) \ + ({ \ + int ___r = cond; \ + if (WARN_ON(___r)) \ + ftrace_kill(); \ + ___r; \ + }) + +#define FTRACE_WARN_ON_ONCE(cond) \ + ({ \ + int ___r = cond; \ + if (WARN_ON_ONCE(___r)) \ + ftrace_kill(); \ + ___r; \ + }) + +/* hash bits for specific function selection */ +#define FTRACE_HASH_DEFAULT_BITS 10 +#define FTRACE_HASH_MAX_BITS 12 + +#ifdef CONFIG_DYNAMIC_FTRACE +#define INIT_OPS_HASH(opsname) \ + .func_hash = &opsname.local_hash, \ + .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock), +#else +#define INIT_OPS_HASH(opsname) +#endif + +enum { + FTRACE_MODIFY_ENABLE_FL = (1 << 0), + FTRACE_MODIFY_MAY_SLEEP_FL = (1 << 1), +}; + +struct ftrace_ops ftrace_list_end __read_mostly = { + .func = ftrace_stub, + .flags = FTRACE_OPS_FL_STUB, + INIT_OPS_HASH(ftrace_list_end) +}; + +/* ftrace_enabled is a method to turn ftrace on or off */ +int ftrace_enabled __read_mostly; +static int __maybe_unused last_ftrace_enabled; + +/* Current function tracing op */ +struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end; +/* What to set function_trace_op to */ +static struct ftrace_ops *set_function_trace_op; + +static bool ftrace_pids_enabled(struct ftrace_ops *ops) +{ + struct trace_array *tr; + + if (!(ops->flags & FTRACE_OPS_FL_PID) || !ops->private) + return false; + + tr = ops->private; + + return tr->function_pids != NULL || tr->function_no_pids != NULL; +} + +static void ftrace_update_trampoline(struct ftrace_ops *ops); + +/* + * ftrace_disabled is set when an 
anomaly is discovered. + * ftrace_disabled is much stronger than ftrace_enabled. + */ +static int ftrace_disabled __read_mostly; + +DEFINE_MUTEX(ftrace_lock); + +struct ftrace_ops __rcu *ftrace_ops_list __read_mostly = &ftrace_list_end; +ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; +struct ftrace_ops global_ops; + +/* Defined by vmlinux.lds.h see the comment above arch_ftrace_ops_list_func for details */ +void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct ftrace_regs *fregs); + +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS +/* + * Stub used to invoke the list ops without requiring a separate trampoline. + */ +const struct ftrace_ops ftrace_list_ops = { + .func = ftrace_ops_list_func, + .flags = FTRACE_OPS_FL_STUB, +}; + +static void ftrace_ops_nop_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, + struct ftrace_regs *fregs) +{ + /* do nothing */ +} + +/* + * Stub used when a call site is disabled. May be called transiently by threads + * which have made it into ftrace_caller but haven't yet recovered the ops at + * the point the call site is disabled. + */ +const struct ftrace_ops ftrace_nop_ops = { + .func = ftrace_ops_nop_func, + .flags = FTRACE_OPS_FL_STUB, +}; +#endif + +static inline void ftrace_ops_init(struct ftrace_ops *ops) +{ +#ifdef CONFIG_DYNAMIC_FTRACE + if (!(ops->flags & FTRACE_OPS_FL_INITIALIZED)) { + mutex_init(&ops->local_hash.regex_lock); + ops->func_hash = &ops->local_hash; + ops->flags |= FTRACE_OPS_FL_INITIALIZED; + } +#endif +} + +static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct ftrace_regs *fregs) +{ + struct trace_array *tr = op->private; + int pid; + + if (tr) { + pid = this_cpu_read(tr->array_buffer.data->ftrace_ignore_pid); + if (pid == FTRACE_PID_IGNORE) + return; + if (pid != FTRACE_PID_TRACE && + pid != current->pid) + return; + } + + op->saved_func(ip, parent_ip, op, fregs); +} + +static void ftrace_sync_ipi(void *data) +{ + /* Probably not needed, but do it anyway */ + smp_rmb(); +} + +static ftrace_func_t ftrace_ops_get_list_func(struct ftrace_ops *ops) +{ + /* + * If this is a dynamic or RCU ops, or we force list func, + * then it needs to call the list anyway. + */ + if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_RCU) || + FTRACE_FORCE_LIST_FUNC) + return ftrace_ops_list_func; + + return ftrace_ops_get_func(ops); +} + +static void update_ftrace_function(void) +{ + ftrace_func_t func; + + /* + * Prepare the ftrace_ops that the arch callback will use. + * If there's only one ftrace_ops registered, the ftrace_ops_list + * will point to the ops we want. + */ + set_function_trace_op = rcu_dereference_protected(ftrace_ops_list, + lockdep_is_held(&ftrace_lock)); + + /* If there's no ftrace_ops registered, just call the stub function */ + if (set_function_trace_op == &ftrace_list_end) { + func = ftrace_stub; + + /* + * If we are at the end of the list and this ops is + * recursion safe and not dynamic and the arch supports passing ops, + * then have the mcount trampoline call the function directly. 
+ */ + } else if (rcu_dereference_protected(ftrace_ops_list->next, + lockdep_is_held(&ftrace_lock)) == &ftrace_list_end) { + func = ftrace_ops_get_list_func(ftrace_ops_list); + + } else { + /* Just use the default ftrace_ops */ + set_function_trace_op = &ftrace_list_end; + func = ftrace_ops_list_func; + } + + update_function_graph_func(); + + /* If there's no change, then do nothing more here */ + if (ftrace_trace_function == func) + return; + + /* + * If we are using the list function, it doesn't care + * about the function_trace_ops. + */ + if (func == ftrace_ops_list_func) { + ftrace_trace_function = func; + /* + * Don't even bother setting function_trace_ops, + * it would be racy to do so anyway. + */ + return; + } + +#ifndef CONFIG_DYNAMIC_FTRACE + /* + * For static tracing, we need to be a bit more careful. + * The function change takes affect immediately. Thus, + * we need to coordinate the setting of the function_trace_ops + * with the setting of the ftrace_trace_function. + * + * Set the function to the list ops, which will call the + * function we want, albeit indirectly, but it handles the + * ftrace_ops and doesn't depend on function_trace_op. + */ + ftrace_trace_function = ftrace_ops_list_func; + /* + * Make sure all CPUs see this. Yes this is slow, but static + * tracing is slow and nasty to have enabled. + */ + synchronize_rcu_tasks_rude(); + /* Now all cpus are using the list ops. */ + function_trace_op = set_function_trace_op; + /* Make sure the function_trace_op is visible on all CPUs */ + smp_wmb(); + /* Nasty way to force a rmb on all cpus */ + smp_call_function(ftrace_sync_ipi, NULL, 1); + /* OK, we are all set to update the ftrace_trace_function now! */ +#endif /* !CONFIG_DYNAMIC_FTRACE */ + + ftrace_trace_function = func; +} + +static void add_ftrace_ops(struct ftrace_ops __rcu **list, + struct ftrace_ops *ops) +{ + rcu_assign_pointer(ops->next, *list); + + /* + * We are entering ops into the list but another + * CPU might be walking that list. We need to make sure + * the ops->next pointer is valid before another CPU sees + * the ops pointer included into the list. + */ + rcu_assign_pointer(*list, ops); +} + +static int remove_ftrace_ops(struct ftrace_ops __rcu **list, + struct ftrace_ops *ops) +{ + struct ftrace_ops **p; + + /* + * If we are removing the last function, then simply point + * to the ftrace_stub. + */ + if (rcu_dereference_protected(*list, + lockdep_is_held(&ftrace_lock)) == ops && + rcu_dereference_protected(ops->next, + lockdep_is_held(&ftrace_lock)) == &ftrace_list_end) { + *list = &ftrace_list_end; + return 0; + } + + for (p = list; *p != &ftrace_list_end; p = &(*p)->next) + if (*p == ops) + break; + + if (*p != ops) + return -1; + + *p = (*p)->next; + return 0; +} + +static void ftrace_update_trampoline(struct ftrace_ops *ops); + +int __register_ftrace_function(struct ftrace_ops *ops) +{ + if (ops->flags & FTRACE_OPS_FL_DELETED) + return -EINVAL; + + if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED)) + return -EBUSY; + +#ifndef CONFIG_DYNAMIC_FTRACE_WITH_REGS + /* + * If the ftrace_ops specifies SAVE_REGS, then it only can be used + * if the arch supports it, or SAVE_REGS_IF_SUPPORTED is also set. + * Setting SAVE_REGS_IF_SUPPORTED makes SAVE_REGS irrelevant. 
+ */ + if (ops->flags & FTRACE_OPS_FL_SAVE_REGS && + !(ops->flags & FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED)) + return -EINVAL; + + if (ops->flags & FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED) + ops->flags |= FTRACE_OPS_FL_SAVE_REGS; +#endif + if (!ftrace_enabled && (ops->flags & FTRACE_OPS_FL_PERMANENT)) + return -EBUSY; + + if (!is_kernel_core_data((unsigned long)ops)) + ops->flags |= FTRACE_OPS_FL_DYNAMIC; + + add_ftrace_ops(&ftrace_ops_list, ops); + + /* Always save the function, and reset at unregistering */ + ops->saved_func = ops->func; + + if (ftrace_pids_enabled(ops)) + ops->func = ftrace_pid_func; + + ftrace_update_trampoline(ops); + + if (ftrace_enabled) + update_ftrace_function(); + + return 0; +} + +int __unregister_ftrace_function(struct ftrace_ops *ops) +{ + int ret; + + if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED))) + return -EBUSY; + + ret = remove_ftrace_ops(&ftrace_ops_list, ops); + + if (ret < 0) + return ret; + + if (ftrace_enabled) + update_ftrace_function(); + + ops->func = ops->saved_func; + + return 0; +} + +static void ftrace_update_pid_func(void) +{ + struct ftrace_ops *op; + + /* Only do something if we are tracing something */ + if (ftrace_trace_function == ftrace_stub) + return; + + do_for_each_ftrace_op(op, ftrace_ops_list) { + if (op->flags & FTRACE_OPS_FL_PID) { + op->func = ftrace_pids_enabled(op) ? + ftrace_pid_func : op->saved_func; + ftrace_update_trampoline(op); + } + } while_for_each_ftrace_op(op); + + update_ftrace_function(); +} + +#ifdef CONFIG_FUNCTION_PROFILER +struct ftrace_profile { + struct hlist_node node; + unsigned long ip; + unsigned long counter; +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + unsigned long long time; + unsigned long long time_squared; +#endif +}; + +struct ftrace_profile_page { + struct ftrace_profile_page *next; + unsigned long index; + struct ftrace_profile records[]; +}; + +struct ftrace_profile_stat { + atomic_t disabled; + struct hlist_head *hash; + struct ftrace_profile_page *pages; + struct ftrace_profile_page *start; + struct tracer_stat stat; +}; + +#define PROFILE_RECORDS_SIZE \ + (PAGE_SIZE - offsetof(struct ftrace_profile_page, records)) + +#define PROFILES_PER_PAGE \ + (PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile)) + +static int ftrace_profile_enabled __read_mostly; + +/* ftrace_profile_lock - synchronize the enable and disable of the profiler */ +static DEFINE_MUTEX(ftrace_profile_lock); + +static DEFINE_PER_CPU(struct ftrace_profile_stat, ftrace_profile_stats); + +#define FTRACE_PROFILE_HASH_BITS 10 +#define FTRACE_PROFILE_HASH_SIZE (1 << FTRACE_PROFILE_HASH_BITS) + +static void * +function_stat_next(void *v, int idx) +{ + struct ftrace_profile *rec = v; + struct ftrace_profile_page *pg; + + pg = (struct ftrace_profile_page *)((unsigned long)rec & PAGE_MASK); + + again: + if (idx != 0) + rec++; + + if ((void *)rec >= (void *)&pg->records[pg->index]) { + pg = pg->next; + if (!pg) + return NULL; + rec = &pg->records[0]; + if (!rec->counter) + goto again; + } + + return rec; +} + +static void *function_stat_start(struct tracer_stat *trace) +{ + struct ftrace_profile_stat *stat = + container_of(trace, struct ftrace_profile_stat, stat); + + if (!stat || !stat->start) + return NULL; + + return function_stat_next(&stat->start->records[0], 0); +} + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +/* function graph compares on total time */ +static int function_stat_cmp(const void *p1, const void *p2) +{ + const struct ftrace_profile *a = p1; + const struct ftrace_profile *b = p2; + + if (a->time < b->time) + return -1; + if 
(a->time > b->time) + return 1; + else + return 0; +} +#else +/* not function graph compares against hits */ +static int function_stat_cmp(const void *p1, const void *p2) +{ + const struct ftrace_profile *a = p1; + const struct ftrace_profile *b = p2; + + if (a->counter < b->counter) + return -1; + if (a->counter > b->counter) + return 1; + else + return 0; +} +#endif + +static int function_stat_headers(struct seq_file *m) +{ +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + seq_puts(m, " Function " + "Hit Time Avg s^2\n" + " -------- " + "--- ---- --- ---\n"); +#else + seq_puts(m, " Function Hit\n" + " -------- ---\n"); +#endif + return 0; +} + +static int function_stat_show(struct seq_file *m, void *v) +{ + struct ftrace_profile *rec = v; + char str[KSYM_SYMBOL_LEN]; + int ret = 0; +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + static struct trace_seq s; + unsigned long long avg; + unsigned long long stddev; +#endif + mutex_lock(&ftrace_profile_lock); + + /* we raced with function_profile_reset() */ + if (unlikely(rec->counter == 0)) { + ret = -EBUSY; + goto out; + } + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + avg = div64_ul(rec->time, rec->counter); + if (tracing_thresh && (avg < tracing_thresh)) + goto out; +#endif + + kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); + seq_printf(m, " %-30.30s %10lu", str, rec->counter); + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + seq_puts(m, " "); + + /* Sample standard deviation (s^2) */ + if (rec->counter <= 1) + stddev = 0; + else { + /* + * Apply Welford's method: + * s^2 = 1 / (n * (n-1)) * (n * \Sum (x_i)^2 - (\Sum x_i)^2) + */ + stddev = rec->counter * rec->time_squared - + rec->time * rec->time; + + /* + * Divide only 1000 for ns^2 -> us^2 conversion. + * trace_print_graph_duration will divide 1000 again. + */ + stddev = div64_ul(stddev, + rec->counter * (rec->counter - 1) * 1000); + } + + trace_seq_init(&s); + trace_print_graph_duration(rec->time, &s); + trace_seq_puts(&s, " "); + trace_print_graph_duration(avg, &s); + trace_seq_puts(&s, " "); + trace_print_graph_duration(stddev, &s); + trace_print_seq(m, &s); +#endif + seq_putc(m, '\n'); +out: + mutex_unlock(&ftrace_profile_lock); + + return ret; +} + +static void ftrace_profile_reset(struct ftrace_profile_stat *stat) +{ + struct ftrace_profile_page *pg; + + pg = stat->pages = stat->start; + + while (pg) { + memset(pg->records, 0, PROFILE_RECORDS_SIZE); + pg->index = 0; + pg = pg->next; + } + + memset(stat->hash, 0, + FTRACE_PROFILE_HASH_SIZE * sizeof(struct hlist_head)); +} + +static int ftrace_profile_pages_init(struct ftrace_profile_stat *stat) +{ + struct ftrace_profile_page *pg; + int functions; + int pages; + int i; + + /* If we already allocated, do nothing */ + if (stat->pages) + return 0; + + stat->pages = (void *)get_zeroed_page(GFP_KERNEL); + if (!stat->pages) + return -ENOMEM; + +#ifdef CONFIG_DYNAMIC_FTRACE + functions = ftrace_update_tot_cnt; +#else + /* + * We do not know the number of functions that exist because + * dynamic tracing is what counts them. With past experience + * we have around 20K functions. That should be more than enough. + * It is highly unlikely we will execute every function in + * the kernel. 
+ */ + functions = 20000; +#endif + + pg = stat->start = stat->pages; + + pages = DIV_ROUND_UP(functions, PROFILES_PER_PAGE); + + for (i = 1; i < pages; i++) { + pg->next = (void *)get_zeroed_page(GFP_KERNEL); + if (!pg->next) + goto out_free; + pg = pg->next; + } + + return 0; + + out_free: + pg = stat->start; + while (pg) { + unsigned long tmp = (unsigned long)pg; + + pg = pg->next; + free_page(tmp); + } + + stat->pages = NULL; + stat->start = NULL; + + return -ENOMEM; +} + +static int ftrace_profile_init_cpu(int cpu) +{ + struct ftrace_profile_stat *stat; + int size; + + stat = &per_cpu(ftrace_profile_stats, cpu); + + if (stat->hash) { + /* If the profile is already created, simply reset it */ + ftrace_profile_reset(stat); + return 0; + } + + /* + * We are profiling all functions, but usually only a few thousand + * functions are hit. We'll make a hash of 1024 items. + */ + size = FTRACE_PROFILE_HASH_SIZE; + + stat->hash = kcalloc(size, sizeof(struct hlist_head), GFP_KERNEL); + + if (!stat->hash) + return -ENOMEM; + + /* Preallocate the function profiling pages */ + if (ftrace_profile_pages_init(stat) < 0) { + kfree(stat->hash); + stat->hash = NULL; + return -ENOMEM; + } + + return 0; +} + +static int ftrace_profile_init(void) +{ + int cpu; + int ret = 0; + + for_each_possible_cpu(cpu) { + ret = ftrace_profile_init_cpu(cpu); + if (ret) + break; + } + + return ret; +} + +/* interrupts must be disabled */ +static struct ftrace_profile * +ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip) +{ + struct ftrace_profile *rec; + struct hlist_head *hhd; + unsigned long key; + + key = hash_long(ip, FTRACE_PROFILE_HASH_BITS); + hhd = &stat->hash[key]; + + if (hlist_empty(hhd)) + return NULL; + + hlist_for_each_entry_rcu_notrace(rec, hhd, node) { + if (rec->ip == ip) + return rec; + } + + return NULL; +} + +static void ftrace_add_profile(struct ftrace_profile_stat *stat, + struct ftrace_profile *rec) +{ + unsigned long key; + + key = hash_long(rec->ip, FTRACE_PROFILE_HASH_BITS); + hlist_add_head_rcu(&rec->node, &stat->hash[key]); +} + +/* + * The memory is already allocated, this simply finds a new record to use. 
+ */ +static struct ftrace_profile * +ftrace_profile_alloc(struct ftrace_profile_stat *stat, unsigned long ip) +{ + struct ftrace_profile *rec = NULL; + + /* prevent recursion (from NMIs) */ + if (atomic_inc_return(&stat->disabled) != 1) + goto out; + + /* + * Try to find the function again since an NMI + * could have added it + */ + rec = ftrace_find_profiled_func(stat, ip); + if (rec) + goto out; + + if (stat->pages->index == PROFILES_PER_PAGE) { + if (!stat->pages->next) + goto out; + stat->pages = stat->pages->next; + } + + rec = &stat->pages->records[stat->pages->index++]; + rec->ip = ip; + ftrace_add_profile(stat, rec); + + out: + atomic_dec(&stat->disabled); + + return rec; +} + +static void +function_profile_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *ops, struct ftrace_regs *fregs) +{ + struct ftrace_profile_stat *stat; + struct ftrace_profile *rec; + unsigned long flags; + + if (!ftrace_profile_enabled) + return; + + local_irq_save(flags); + + stat = this_cpu_ptr(&ftrace_profile_stats); + if (!stat->hash || !ftrace_profile_enabled) + goto out; + + rec = ftrace_find_profiled_func(stat, ip); + if (!rec) { + rec = ftrace_profile_alloc(stat, ip); + if (!rec) + goto out; + } + + rec->counter++; + out: + local_irq_restore(flags); +} + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static bool fgraph_graph_time = true; + +void ftrace_graph_graph_time_control(bool enable) +{ + fgraph_graph_time = enable; +} + +static int profile_graph_entry(struct ftrace_graph_ent *trace) +{ + struct ftrace_ret_stack *ret_stack; + + function_profile_call(trace->func, 0, NULL, NULL); + + /* If function graph is shutting down, ret_stack can be NULL */ + if (!current->ret_stack) + return 0; + + ret_stack = ftrace_graph_get_ret_stack(current, 0); + if (ret_stack) + ret_stack->subtime = 0; + + return 1; +} + +static void profile_graph_return(struct ftrace_graph_ret *trace) +{ + struct ftrace_ret_stack *ret_stack; + struct ftrace_profile_stat *stat; + unsigned long long calltime; + struct ftrace_profile *rec; + unsigned long flags; + + local_irq_save(flags); + stat = this_cpu_ptr(&ftrace_profile_stats); + if (!stat->hash || !ftrace_profile_enabled) + goto out; + + /* If the calltime was zero'd ignore it */ + if (!trace->calltime) + goto out; + + calltime = trace->rettime - trace->calltime; + + if (!fgraph_graph_time) { + + /* Append this call time to the parent time to subtract */ + ret_stack = ftrace_graph_get_ret_stack(current, 1); + if (ret_stack) + ret_stack->subtime += calltime; + + ret_stack = ftrace_graph_get_ret_stack(current, 0); + if (ret_stack && ret_stack->subtime < calltime) + calltime -= ret_stack->subtime; + else + calltime = 0; + } + + rec = ftrace_find_profiled_func(stat, trace->func); + if (rec) { + rec->time += calltime; + rec->time_squared += calltime * calltime; + } + + out: + local_irq_restore(flags); +} + +static struct fgraph_ops fprofiler_ops = { + .entryfunc = &profile_graph_entry, + .retfunc = &profile_graph_return, +}; + +static int register_ftrace_profiler(void) +{ + return register_ftrace_graph(&fprofiler_ops); +} + +static void unregister_ftrace_profiler(void) +{ + unregister_ftrace_graph(&fprofiler_ops); +} +#else +static struct ftrace_ops ftrace_profile_ops __read_mostly = { + .func = function_profile_call, + .flags = FTRACE_OPS_FL_INITIALIZED, + INIT_OPS_HASH(ftrace_profile_ops) +}; + +static int register_ftrace_profiler(void) +{ + return register_ftrace_function(&ftrace_profile_ops); +} + +static void unregister_ftrace_profiler(void) +{ + 
unregister_ftrace_function(&ftrace_profile_ops); +} +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + +static ssize_t +ftrace_profile_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + unsigned long val; + int ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + val = !!val; + + mutex_lock(&ftrace_profile_lock); + if (ftrace_profile_enabled ^ val) { + if (val) { + ret = ftrace_profile_init(); + if (ret < 0) { + cnt = ret; + goto out; + } + + ret = register_ftrace_profiler(); + if (ret < 0) { + cnt = ret; + goto out; + } + ftrace_profile_enabled = 1; + } else { + ftrace_profile_enabled = 0; + /* + * unregister_ftrace_profiler calls stop_machine + * so this acts like an synchronize_rcu. + */ + unregister_ftrace_profiler(); + } + } + out: + mutex_unlock(&ftrace_profile_lock); + + *ppos += cnt; + + return cnt; +} + +static ssize_t +ftrace_profile_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; /* big enough to hold a number */ + int r; + + r = sprintf(buf, "%u\n", ftrace_profile_enabled); + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static const struct file_operations ftrace_profile_fops = { + .open = tracing_open_generic, + .read = ftrace_profile_read, + .write = ftrace_profile_write, + .llseek = default_llseek, +}; + +/* used to initialize the real stat files */ +static struct tracer_stat function_stats __initdata = { + .name = "functions", + .stat_start = function_stat_start, + .stat_next = function_stat_next, + .stat_cmp = function_stat_cmp, + .stat_headers = function_stat_headers, + .stat_show = function_stat_show +}; + +static __init void ftrace_profile_tracefs(struct dentry *d_tracer) +{ + struct ftrace_profile_stat *stat; + char *name; + int ret; + int cpu; + + for_each_possible_cpu(cpu) { + stat = &per_cpu(ftrace_profile_stats, cpu); + + name = kasprintf(GFP_KERNEL, "function%d", cpu); + if (!name) { + /* + * The files created are permanent, if something happens + * we still do not free memory. + */ + WARN(1, + "Could not allocate stat file for cpu %d\n", + cpu); + return; + } + stat->stat = function_stats; + stat->stat.name = name; + ret = register_stat_tracer(&stat->stat); + if (ret) { + WARN(1, + "Could not register function stat for cpu %d\n", + cpu); + kfree(name); + return; + } + } + + trace_create_file("function_profile_enabled", + TRACE_MODE_WRITE, d_tracer, NULL, + &ftrace_profile_fops); +} + +#else /* CONFIG_FUNCTION_PROFILER */ +static __init void ftrace_profile_tracefs(struct dentry *d_tracer) +{ +} +#endif /* CONFIG_FUNCTION_PROFILER */ + +#ifdef CONFIG_DYNAMIC_FTRACE + +static struct ftrace_ops *removed_ops; + +/* + * Set when doing a global update, like enabling all recs or disabling them. + * It is not set when just updating a single ftrace_ops. + */ +static bool update_all_ops; + +#ifndef CONFIG_FTRACE_MCOUNT_RECORD +# error Dynamic ftrace depends on MCOUNT_RECORD +#endif + +struct ftrace_func_probe { + struct ftrace_probe_ops *probe_ops; + struct ftrace_ops ops; + struct trace_array *tr; + struct list_head list; + void *data; + int ref; +}; + +/* + * We make these constant because no one should touch them, + * but they are used as the default "empty hash", to avoid allocating + * it all the time. These are in a read only section such that if + * anyone does try to modify it, it will cause an exception. 
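+ * EMPTY_HASH is used as the default hash for an ops with no filter or
+ * notrace entries (see global_ops below); free_ftrace_hash() and
+ * free_ftrace_hash_rcu() check for it explicitly so it is never freed.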
+ */ +static const struct hlist_head empty_buckets[1]; +static const struct ftrace_hash empty_hash = { + .buckets = (struct hlist_head *)empty_buckets, +}; +#define EMPTY_HASH ((struct ftrace_hash *)&empty_hash) + +struct ftrace_ops global_ops = { + .func = ftrace_stub, + .local_hash.notrace_hash = EMPTY_HASH, + .local_hash.filter_hash = EMPTY_HASH, + INIT_OPS_HASH(global_ops) + .flags = FTRACE_OPS_FL_INITIALIZED | + FTRACE_OPS_FL_PID, +}; + +/* + * Used by the stack unwinder to know about dynamic ftrace trampolines. + */ +struct ftrace_ops *ftrace_ops_trampoline(unsigned long addr) +{ + struct ftrace_ops *op = NULL; + + /* + * Some of the ops may be dynamically allocated, + * they are freed after a synchronize_rcu(). + */ + preempt_disable_notrace(); + + do_for_each_ftrace_op(op, ftrace_ops_list) { + /* + * This is to check for dynamically allocated trampolines. + * Trampolines that are in kernel text will have + * core_kernel_text() return true. + */ + if (op->trampoline && op->trampoline_size) + if (addr >= op->trampoline && + addr < op->trampoline + op->trampoline_size) { + preempt_enable_notrace(); + return op; + } + } while_for_each_ftrace_op(op); + preempt_enable_notrace(); + + return NULL; +} + +/* + * This is used by __kernel_text_address() to return true if the + * address is on a dynamically allocated trampoline that would + * not return true for either core_kernel_text() or + * is_module_text_address(). + */ +bool is_ftrace_trampoline(unsigned long addr) +{ + return ftrace_ops_trampoline(addr) != NULL; +} + +struct ftrace_page { + struct ftrace_page *next; + struct dyn_ftrace *records; + int index; + int order; +}; + +#define ENTRY_SIZE sizeof(struct dyn_ftrace) +#define ENTRIES_PER_PAGE (PAGE_SIZE / ENTRY_SIZE) + +static struct ftrace_page *ftrace_pages_start; +static struct ftrace_page *ftrace_pages; + +static __always_inline unsigned long +ftrace_hash_key(struct ftrace_hash *hash, unsigned long ip) +{ + if (hash->size_bits > 0) + return hash_long(ip, hash->size_bits); + + return 0; +} + +/* Only use this function if ftrace_hash_empty() has already been tested */ +static __always_inline struct ftrace_func_entry * +__ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) +{ + unsigned long key; + struct ftrace_func_entry *entry; + struct hlist_head *hhd; + + key = ftrace_hash_key(hash, ip); + hhd = &hash->buckets[key]; + + hlist_for_each_entry_rcu_notrace(entry, hhd, hlist) { + if (entry->ip == ip) + return entry; + } + return NULL; +} + +/** + * ftrace_lookup_ip - Test to see if an ip exists in an ftrace_hash + * @hash: The hash to look at + * @ip: The instruction pointer to test + * + * Search a given @hash to see if a given instruction pointer (@ip) + * exists in it. + * + * Returns the entry that holds the @ip if found. NULL otherwise. 
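+ * The buckets are walked with the RCU hlist helpers, so the caller must keep
+ * the hash from being freed underneath it (ops hashes are freed with
+ * call_rcu(); see ftrace_ops_test() below).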
+ */ +struct ftrace_func_entry * +ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) +{ + if (ftrace_hash_empty(hash)) + return NULL; + + return __ftrace_lookup_ip(hash, ip); +} + +static void __add_hash_entry(struct ftrace_hash *hash, + struct ftrace_func_entry *entry) +{ + struct hlist_head *hhd; + unsigned long key; + + key = ftrace_hash_key(hash, entry->ip); + hhd = &hash->buckets[key]; + hlist_add_head(&entry->hlist, hhd); + hash->count++; +} + +static struct ftrace_func_entry * +add_hash_entry(struct ftrace_hash *hash, unsigned long ip) +{ + struct ftrace_func_entry *entry; + + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return NULL; + + entry->ip = ip; + __add_hash_entry(hash, entry); + + return entry; +} + +static void +free_hash_entry(struct ftrace_hash *hash, + struct ftrace_func_entry *entry) +{ + hlist_del(&entry->hlist); + kfree(entry); + hash->count--; +} + +static void +remove_hash_entry(struct ftrace_hash *hash, + struct ftrace_func_entry *entry) +{ + hlist_del_rcu(&entry->hlist); + hash->count--; +} + +static void ftrace_hash_clear(struct ftrace_hash *hash) +{ + struct hlist_head *hhd; + struct hlist_node *tn; + struct ftrace_func_entry *entry; + int size = 1 << hash->size_bits; + int i; + + if (!hash->count) + return; + + for (i = 0; i < size; i++) { + hhd = &hash->buckets[i]; + hlist_for_each_entry_safe(entry, tn, hhd, hlist) + free_hash_entry(hash, entry); + } + FTRACE_WARN_ON(hash->count); +} + +static void free_ftrace_mod(struct ftrace_mod_load *ftrace_mod) +{ + list_del(&ftrace_mod->list); + kfree(ftrace_mod->module); + kfree(ftrace_mod->func); + kfree(ftrace_mod); +} + +static void clear_ftrace_mod_list(struct list_head *head) +{ + struct ftrace_mod_load *p, *n; + + /* stack tracer isn't supported yet */ + if (!head) + return; + + mutex_lock(&ftrace_lock); + list_for_each_entry_safe(p, n, head, list) + free_ftrace_mod(p); + mutex_unlock(&ftrace_lock); +} + +static void free_ftrace_hash(struct ftrace_hash *hash) +{ + if (!hash || hash == EMPTY_HASH) + return; + ftrace_hash_clear(hash); + kfree(hash->buckets); + kfree(hash); +} + +static void __free_ftrace_hash_rcu(struct rcu_head *rcu) +{ + struct ftrace_hash *hash; + + hash = container_of(rcu, struct ftrace_hash, rcu); + free_ftrace_hash(hash); +} + +static void free_ftrace_hash_rcu(struct ftrace_hash *hash) +{ + if (!hash || hash == EMPTY_HASH) + return; + call_rcu(&hash->rcu, __free_ftrace_hash_rcu); +} + +/** + * ftrace_free_filter - remove all filters for an ftrace_ops + * @ops - the ops to remove the filters from + */ +void ftrace_free_filter(struct ftrace_ops *ops) +{ + ftrace_ops_init(ops); + free_ftrace_hash(ops->func_hash->filter_hash); + free_ftrace_hash(ops->func_hash->notrace_hash); +} +EXPORT_SYMBOL_GPL(ftrace_free_filter); + +static struct ftrace_hash *alloc_ftrace_hash(int size_bits) +{ + struct ftrace_hash *hash; + int size; + + hash = kzalloc(sizeof(*hash), GFP_KERNEL); + if (!hash) + return NULL; + + size = 1 << size_bits; + hash->buckets = kcalloc(size, sizeof(*hash->buckets), GFP_KERNEL); + + if (!hash->buckets) { + kfree(hash); + return NULL; + } + + hash->size_bits = size_bits; + + return hash; +} + + +static int ftrace_add_mod(struct trace_array *tr, + const char *func, const char *module, + int enable) +{ + struct ftrace_mod_load *ftrace_mod; + struct list_head *mod_head = enable ? 
&tr->mod_trace : &tr->mod_notrace; + + ftrace_mod = kzalloc(sizeof(*ftrace_mod), GFP_KERNEL); + if (!ftrace_mod) + return -ENOMEM; + + INIT_LIST_HEAD(&ftrace_mod->list); + ftrace_mod->func = kstrdup(func, GFP_KERNEL); + ftrace_mod->module = kstrdup(module, GFP_KERNEL); + ftrace_mod->enable = enable; + + if (!ftrace_mod->func || !ftrace_mod->module) + goto out_free; + + list_add(&ftrace_mod->list, mod_head); + + return 0; + + out_free: + free_ftrace_mod(ftrace_mod); + + return -ENOMEM; +} + +static struct ftrace_hash * +alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) +{ + struct ftrace_func_entry *entry; + struct ftrace_hash *new_hash; + int size; + int i; + + new_hash = alloc_ftrace_hash(size_bits); + if (!new_hash) + return NULL; + + if (hash) + new_hash->flags = hash->flags; + + /* Empty hash? */ + if (ftrace_hash_empty(hash)) + return new_hash; + + size = 1 << hash->size_bits; + for (i = 0; i < size; i++) { + hlist_for_each_entry(entry, &hash->buckets[i], hlist) { + if (add_hash_entry(new_hash, entry->ip) == NULL) + goto free_hash; + } + } + + FTRACE_WARN_ON(new_hash->count != hash->count); + + return new_hash; + + free_hash: + free_ftrace_hash(new_hash); + return NULL; +} + +static void +ftrace_hash_rec_disable_modify(struct ftrace_ops *ops, int filter_hash); +static void +ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, int filter_hash); + +static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops, + struct ftrace_hash *new_hash); + +static struct ftrace_hash *dup_hash(struct ftrace_hash *src, int size) +{ + struct ftrace_func_entry *entry; + struct ftrace_hash *new_hash; + struct hlist_head *hhd; + struct hlist_node *tn; + int bits = 0; + int i; + + /* + * Use around half the size (max bit of it), but + * a minimum of 2 is fine (as size of 0 or 1 both give 1 for bits). + */ + bits = fls(size / 2); + + /* Don't allocate too much */ + if (bits > FTRACE_HASH_MAX_BITS) + bits = FTRACE_HASH_MAX_BITS; + + new_hash = alloc_ftrace_hash(bits); + if (!new_hash) + return NULL; + + new_hash->flags = src->flags; + + size = 1 << src->size_bits; + for (i = 0; i < size; i++) { + hhd = &src->buckets[i]; + hlist_for_each_entry_safe(entry, tn, hhd, hlist) { + remove_hash_entry(src, entry); + __add_hash_entry(new_hash, entry); + } + } + return new_hash; +} + +static struct ftrace_hash * +__ftrace_hash_move(struct ftrace_hash *src) +{ + int size = src->count; + + /* + * If the new source is empty, just return the empty_hash. + */ + if (ftrace_hash_empty(src)) + return EMPTY_HASH; + + return dup_hash(src, size); +} + +static int +ftrace_hash_move(struct ftrace_ops *ops, int enable, + struct ftrace_hash **dst, struct ftrace_hash *src) +{ + struct ftrace_hash *new_hash; + int ret; + + /* Reject setting notrace hash on IPMODIFY ftrace_ops */ + if (ops->flags & FTRACE_OPS_FL_IPMODIFY && !enable) + return -EINVAL; + + new_hash = __ftrace_hash_move(src); + if (!new_hash) + return -ENOMEM; + + /* Make sure this can be applied if it is IPMODIFY ftrace_ops */ + if (enable) { + /* IPMODIFY should be updated only when filter_hash updating */ + ret = ftrace_hash_ipmodify_update(ops, new_hash); + if (ret < 0) { + free_ftrace_hash(new_hash); + return ret; + } + } + + /* + * Remove the current set, update the hash and add + * them back. 
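+	 * ftrace_hash_rec_disable_modify() drops the record accounting for the
+	 * old hash, rcu_assign_pointer() publishes the new hash, and
+	 * ftrace_hash_rec_enable_modify() re-adds the accounting against it.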
+ */ + ftrace_hash_rec_disable_modify(ops, enable); + + rcu_assign_pointer(*dst, new_hash); + + ftrace_hash_rec_enable_modify(ops, enable); + + return 0; +} + +static bool hash_contains_ip(unsigned long ip, + struct ftrace_ops_hash *hash) +{ + /* + * The function record is a match if it exists in the filter + * hash and not in the notrace hash. Note, an empty hash is + * considered a match for the filter hash, but an empty + * notrace hash is considered not in the notrace hash. + */ + return (ftrace_hash_empty(hash->filter_hash) || + __ftrace_lookup_ip(hash->filter_hash, ip)) && + (ftrace_hash_empty(hash->notrace_hash) || + !__ftrace_lookup_ip(hash->notrace_hash, ip)); +} + +/* + * Test the hashes for this ops to see if we want to call + * the ops->func or not. + * + * It's a match if the ip is in the ops->filter_hash or + * the filter_hash does not exist or is empty, + * AND + * the ip is not in the ops->notrace_hash. + * + * This needs to be called with preemption disabled as + * the hashes are freed with call_rcu(). + */ +int +ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) +{ + struct ftrace_ops_hash hash; + int ret; + +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS + /* + * There's a small race when adding ops that the ftrace handler + * that wants regs, may be called without them. We can not + * allow that handler to be called if regs is NULL. + */ + if (regs == NULL && (ops->flags & FTRACE_OPS_FL_SAVE_REGS)) + return 0; +#endif + + rcu_assign_pointer(hash.filter_hash, ops->func_hash->filter_hash); + rcu_assign_pointer(hash.notrace_hash, ops->func_hash->notrace_hash); + + if (hash_contains_ip(ip, &hash)) + ret = 1; + else + ret = 0; + + return ret; +} + +/* + * This is a double for. Do not use 'break' to break out of the loop, + * you must use a goto. + */ +#define do_for_each_ftrace_rec(pg, rec) \ + for (pg = ftrace_pages_start; pg; pg = pg->next) { \ + int _____i; \ + for (_____i = 0; _____i < pg->index; _____i++) { \ + rec = &pg->records[_____i]; + +#define while_for_each_ftrace_rec() \ + } \ + } + + +static int ftrace_cmp_recs(const void *a, const void *b) +{ + const struct dyn_ftrace *key = a; + const struct dyn_ftrace *rec = b; + + if (key->flags < rec->ip) + return -1; + if (key->ip >= rec->ip + MCOUNT_INSN_SIZE) + return 1; + return 0; +} + +static struct dyn_ftrace *lookup_rec(unsigned long start, unsigned long end) +{ + struct ftrace_page *pg; + struct dyn_ftrace *rec = NULL; + struct dyn_ftrace key; + + key.ip = start; + key.flags = end; /* overload flags, as it is unsigned long */ + + for (pg = ftrace_pages_start; pg; pg = pg->next) { + if (pg->index == 0 || + end < pg->records[0].ip || + start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE)) + continue; + rec = bsearch(&key, pg->records, pg->index, + sizeof(struct dyn_ftrace), + ftrace_cmp_recs); + if (rec) + break; + } + return rec; +} + +/** + * ftrace_location_range - return the first address of a traced location + * if it touches the given ip range + * @start: start of range to search. + * @end: end of range to search (inclusive). @end points to the last byte + * to check. + * + * Returns rec->ip if the related ftrace location is a least partly within + * the given address range. That is, the first address of the instruction + * that is either a NOP or call to the function tracer. It checks the ftrace + * internal tables to determine if the address belongs or not. 
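+ * ftrace_text_reserved() below is a thin boolean wrapper around this lookup.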
+ */ +unsigned long ftrace_location_range(unsigned long start, unsigned long end) +{ + struct dyn_ftrace *rec; + + rec = lookup_rec(start, end); + if (rec) + return rec->ip; + + return 0; +} + +/** + * ftrace_location - return the ftrace location + * @ip: the instruction pointer to check + * + * If @ip matches the ftrace location, return @ip. + * If @ip matches sym+0, return sym's ftrace location. + * Otherwise, return 0. + */ +unsigned long ftrace_location(unsigned long ip) +{ + struct dyn_ftrace *rec; + unsigned long offset; + unsigned long size; + + rec = lookup_rec(ip, ip); + if (!rec) { + if (!kallsyms_lookup_size_offset(ip, &size, &offset)) + goto out; + + /* map sym+0 to __fentry__ */ + if (!offset) + rec = lookup_rec(ip, ip + size - 1); + } + + if (rec) + return rec->ip; + +out: + return 0; +} + +/** + * ftrace_text_reserved - return true if range contains an ftrace location + * @start: start of range to search + * @end: end of range to search (inclusive). @end points to the last byte to check. + * + * Returns 1 if @start and @end contains a ftrace location. + * That is, the instruction that is either a NOP or call to + * the function tracer. It checks the ftrace internal tables to + * determine if the address belongs or not. + */ +int ftrace_text_reserved(const void *start, const void *end) +{ + unsigned long ret; + + ret = ftrace_location_range((unsigned long)start, + (unsigned long)end); + + return (int)!!ret; +} + +/* Test if ops registered to this rec needs regs */ +static bool test_rec_ops_needs_regs(struct dyn_ftrace *rec) +{ + struct ftrace_ops *ops; + bool keep_regs = false; + + for (ops = ftrace_ops_list; + ops != &ftrace_list_end; ops = ops->next) { + /* pass rec in as regs to have non-NULL val */ + if (ftrace_ops_test(ops, rec->ip, rec)) { + if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) { + keep_regs = true; + break; + } + } + } + + return keep_regs; +} + +static struct ftrace_ops * +ftrace_find_tramp_ops_any(struct dyn_ftrace *rec); +static struct ftrace_ops * +ftrace_find_tramp_ops_any_other(struct dyn_ftrace *rec, struct ftrace_ops *op_exclude); +static struct ftrace_ops * +ftrace_find_tramp_ops_next(struct dyn_ftrace *rec, struct ftrace_ops *ops); + +static bool skip_record(struct dyn_ftrace *rec) +{ + /* + * At boot up, weak functions are set to disable. Function tracing + * can be enabled before they are, and they still need to be disabled now. + * If the record is disabled, still continue if it is marked as already + * enabled (this is needed to keep the accounting working). + */ + return rec->flags & FTRACE_FL_DISABLED && + !(rec->flags & FTRACE_FL_ENABLED); +} + +static bool __ftrace_hash_rec_update(struct ftrace_ops *ops, + int filter_hash, + bool inc) +{ + struct ftrace_hash *hash; + struct ftrace_hash *other_hash; + struct ftrace_page *pg; + struct dyn_ftrace *rec; + bool update = false; + int count = 0; + int all = false; + + /* Only update if the ops has been registered */ + if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) + return false; + + /* + * In the filter_hash case: + * If the count is zero, we update all records. + * Otherwise we just update the items in the hash. + * + * In the notrace_hash case: + * We enable the update in the hash. + * As disabling notrace means enabling the tracing, + * and enabling notrace means disabling, the inc variable + * gets inversed. 
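+	 * (So for the notrace hash an 'inc' request actually decrements the
+	 * matching records' counts, and vice versa.)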
+ */ + if (filter_hash) { + hash = ops->func_hash->filter_hash; + other_hash = ops->func_hash->notrace_hash; + if (ftrace_hash_empty(hash)) + all = true; + } else { + inc = !inc; + hash = ops->func_hash->notrace_hash; + other_hash = ops->func_hash->filter_hash; + /* + * If the notrace hash has no items, + * then there's nothing to do. + */ + if (ftrace_hash_empty(hash)) + return false; + } + + do_for_each_ftrace_rec(pg, rec) { + int in_other_hash = 0; + int in_hash = 0; + int match = 0; + + if (skip_record(rec)) + continue; + + if (all) { + /* + * Only the filter_hash affects all records. + * Update if the record is not in the notrace hash. + */ + if (!other_hash || !ftrace_lookup_ip(other_hash, rec->ip)) + match = 1; + } else { + in_hash = !!ftrace_lookup_ip(hash, rec->ip); + in_other_hash = !!ftrace_lookup_ip(other_hash, rec->ip); + + /* + * If filter_hash is set, we want to match all functions + * that are in the hash but not in the other hash. + * + * If filter_hash is not set, then we are decrementing. + * That means we match anything that is in the hash + * and also in the other_hash. That is, we need to turn + * off functions in the other hash because they are disabled + * by this hash. + */ + if (filter_hash && in_hash && !in_other_hash) + match = 1; + else if (!filter_hash && in_hash && + (in_other_hash || ftrace_hash_empty(other_hash))) + match = 1; + } + if (!match) + continue; + + if (inc) { + rec->flags++; + if (FTRACE_WARN_ON(ftrace_rec_count(rec) == FTRACE_REF_MAX)) + return false; + + if (ops->flags & FTRACE_OPS_FL_DIRECT) + rec->flags |= FTRACE_FL_DIRECT; + + /* + * If there's only a single callback registered to a + * function, and the ops has a trampoline registered + * for it, then we can call it directly. + */ + if (ftrace_rec_count(rec) == 1 && ops->trampoline) + rec->flags |= FTRACE_FL_TRAMP; + else + /* + * If we are adding another function callback + * to this function, and the previous had a + * custom trampoline in use, then we need to go + * back to the default trampoline. + */ + rec->flags &= ~FTRACE_FL_TRAMP; + + /* + * If any ops wants regs saved for this function + * then all ops will get saved regs. + */ + if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) + rec->flags |= FTRACE_FL_REGS; + } else { + if (FTRACE_WARN_ON(ftrace_rec_count(rec) == 0)) + return false; + rec->flags--; + + /* + * Only the internal direct_ops should have the + * DIRECT flag set. Thus, if it is removing a + * function, then that function should no longer + * be direct. + */ + if (ops->flags & FTRACE_OPS_FL_DIRECT) + rec->flags &= ~FTRACE_FL_DIRECT; + + /* + * If the rec had REGS enabled and the ops that is + * being removed had REGS set, then see if there is + * still any ops for this record that wants regs. + * If not, we can stop recording them. + */ + if (ftrace_rec_count(rec) > 0 && + rec->flags & FTRACE_FL_REGS && + ops->flags & FTRACE_OPS_FL_SAVE_REGS) { + if (!test_rec_ops_needs_regs(rec)) + rec->flags &= ~FTRACE_FL_REGS; + } + + /* + * The TRAMP needs to be set only if rec count + * is decremented to one, and the ops that is + * left has a trampoline. As TRAMP can only be + * enabled if there is only a single ops attached + * to it. + */ + if (ftrace_rec_count(rec) == 1 && + ftrace_find_tramp_ops_any_other(rec, ops)) + rec->flags |= FTRACE_FL_TRAMP; + else + rec->flags &= ~FTRACE_FL_TRAMP; + + /* + * flags will be cleared in ftrace_check_record() + * if rec count is zero. 
+ */ + } + + /* + * If the rec has a single associated ops, and ops->func can be + * called directly, allow the call site to call via the ops. + */ + if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS) && + ftrace_rec_count(rec) == 1 && + ftrace_ops_get_func(ops) == ops->func) + rec->flags |= FTRACE_FL_CALL_OPS; + else + rec->flags &= ~FTRACE_FL_CALL_OPS; + + count++; + + /* Must match FTRACE_UPDATE_CALLS in ftrace_modify_all_code() */ + update |= ftrace_test_record(rec, true) != FTRACE_UPDATE_IGNORE; + + /* Shortcut, if we handled all records, we are done. */ + if (!all && count == hash->count) + return update; + } while_for_each_ftrace_rec(); + + return update; +} + +static bool ftrace_hash_rec_disable(struct ftrace_ops *ops, + int filter_hash) +{ + return __ftrace_hash_rec_update(ops, filter_hash, 0); +} + +static bool ftrace_hash_rec_enable(struct ftrace_ops *ops, + int filter_hash) +{ + return __ftrace_hash_rec_update(ops, filter_hash, 1); +} + +static void ftrace_hash_rec_update_modify(struct ftrace_ops *ops, + int filter_hash, int inc) +{ + struct ftrace_ops *op; + + __ftrace_hash_rec_update(ops, filter_hash, inc); + + if (ops->func_hash != &global_ops.local_hash) + return; + + /* + * If the ops shares the global_ops hash, then we need to update + * all ops that are enabled and use this hash. + */ + do_for_each_ftrace_op(op, ftrace_ops_list) { + /* Already done */ + if (op == ops) + continue; + if (op->func_hash == &global_ops.local_hash) + __ftrace_hash_rec_update(op, filter_hash, inc); + } while_for_each_ftrace_op(op); +} + +static void ftrace_hash_rec_disable_modify(struct ftrace_ops *ops, + int filter_hash) +{ + ftrace_hash_rec_update_modify(ops, filter_hash, 0); +} + +static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, + int filter_hash) +{ + ftrace_hash_rec_update_modify(ops, filter_hash, 1); +} + +/* + * Try to update IPMODIFY flag on each ftrace_rec. Return 0 if it is OK + * or no-needed to update, -EBUSY if it detects a conflict of the flag + * on a ftrace_rec, and -EINVAL if the new_hash tries to trace all recs. + * Note that old_hash and new_hash has below meanings + * - If the hash is NULL, it hits all recs (if IPMODIFY is set, this is rejected) + * - If the hash is EMPTY_HASH, it hits nothing + * - Anything else hits the recs which match the hash entries. + * + * DIRECT ops does not have IPMODIFY flag, but we still need to check it + * against functions with FTRACE_FL_IPMODIFY. If there is any overlap, call + * ops_func(SHARE_IPMODIFY_SELF) to make sure current ops can share with + * IPMODIFY. If ops_func(SHARE_IPMODIFY_SELF) returns non-zero, propagate + * the return value to the caller and eventually to the owner of the DIRECT + * ops. + */ +static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops, + struct ftrace_hash *old_hash, + struct ftrace_hash *new_hash) +{ + struct ftrace_page *pg; + struct dyn_ftrace *rec, *end = NULL; + int in_old, in_new; + bool is_ipmodify, is_direct; + + /* Only update if the ops has been registered */ + if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) + return 0; + + is_ipmodify = ops->flags & FTRACE_OPS_FL_IPMODIFY; + is_direct = ops->flags & FTRACE_OPS_FL_DIRECT; + + /* neither IPMODIFY nor DIRECT, skip */ + if (!is_ipmodify && !is_direct) + return 0; + + if (WARN_ON_ONCE(is_ipmodify && is_direct)) + return 0; + + /* + * Since the IPMODIFY and DIRECT are very address sensitive + * actions, we do not allow ftrace_ops to set all functions to new + * hash. 
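+	 * A NULL hash means "all functions" (see the comment above this
+	 * function), which is why both old_hash and new_hash are required
+	 * to be non-NULL here.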
+ */ + if (!new_hash || !old_hash) + return -EINVAL; + + /* Update rec->flags */ + do_for_each_ftrace_rec(pg, rec) { + + if (rec->flags & FTRACE_FL_DISABLED) + continue; + + /* We need to update only differences of filter_hash */ + in_old = !!ftrace_lookup_ip(old_hash, rec->ip); + in_new = !!ftrace_lookup_ip(new_hash, rec->ip); + if (in_old == in_new) + continue; + + if (in_new) { + if (rec->flags & FTRACE_FL_IPMODIFY) { + int ret; + + /* Cannot have two ipmodify on same rec */ + if (is_ipmodify) + goto rollback; + + FTRACE_WARN_ON(rec->flags & FTRACE_FL_DIRECT); + + /* + * Another ops with IPMODIFY is already + * attached. We are now attaching a direct + * ops. Run SHARE_IPMODIFY_SELF, to check + * whether sharing is supported. + */ + if (!ops->ops_func) + return -EBUSY; + ret = ops->ops_func(ops, FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_SELF); + if (ret) + return ret; + } else if (is_ipmodify) { + rec->flags |= FTRACE_FL_IPMODIFY; + } + } else if (is_ipmodify) { + rec->flags &= ~FTRACE_FL_IPMODIFY; + } + } while_for_each_ftrace_rec(); + + return 0; + +rollback: + end = rec; + + /* Roll back what we did above */ + do_for_each_ftrace_rec(pg, rec) { + + if (rec->flags & FTRACE_FL_DISABLED) + continue; + + if (rec == end) + goto err_out; + + in_old = !!ftrace_lookup_ip(old_hash, rec->ip); + in_new = !!ftrace_lookup_ip(new_hash, rec->ip); + if (in_old == in_new) + continue; + + if (in_new) + rec->flags &= ~FTRACE_FL_IPMODIFY; + else + rec->flags |= FTRACE_FL_IPMODIFY; + } while_for_each_ftrace_rec(); + +err_out: + return -EBUSY; +} + +static int ftrace_hash_ipmodify_enable(struct ftrace_ops *ops) +{ + struct ftrace_hash *hash = ops->func_hash->filter_hash; + + if (ftrace_hash_empty(hash)) + hash = NULL; + + return __ftrace_hash_update_ipmodify(ops, EMPTY_HASH, hash); +} + +/* Disabling always succeeds */ +static void ftrace_hash_ipmodify_disable(struct ftrace_ops *ops) +{ + struct ftrace_hash *hash = ops->func_hash->filter_hash; + + if (ftrace_hash_empty(hash)) + hash = NULL; + + __ftrace_hash_update_ipmodify(ops, hash, EMPTY_HASH); +} + +static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops, + struct ftrace_hash *new_hash) +{ + struct ftrace_hash *old_hash = ops->func_hash->filter_hash; + + if (ftrace_hash_empty(old_hash)) + old_hash = NULL; + + if (ftrace_hash_empty(new_hash)) + new_hash = NULL; + + return __ftrace_hash_update_ipmodify(ops, old_hash, new_hash); +} + +static void print_ip_ins(const char *fmt, const unsigned char *p) +{ + char ins[MCOUNT_INSN_SIZE]; + + if (copy_from_kernel_nofault(ins, p, MCOUNT_INSN_SIZE)) { + printk(KERN_CONT "%s[FAULT] %px\n", fmt, p); + return; + } + + printk(KERN_CONT "%s", fmt); + pr_cont("%*phC", MCOUNT_INSN_SIZE, ins); +} + +enum ftrace_bug_type ftrace_bug_type; +const void *ftrace_expected; + +static void print_bug_type(void) +{ + switch (ftrace_bug_type) { + case FTRACE_BUG_UNKNOWN: + break; + case FTRACE_BUG_INIT: + pr_info("Initializing ftrace call sites\n"); + break; + case FTRACE_BUG_NOP: + pr_info("Setting ftrace call site to NOP\n"); + break; + case FTRACE_BUG_CALL: + pr_info("Setting ftrace call site to call ftrace function\n"); + break; + case FTRACE_BUG_UPDATE: + pr_info("Updating ftrace call site to call a different ftrace function\n"); + break; + } +} + +/** + * ftrace_bug - report and shutdown function tracer + * @failed: The failed type (EFAULT, EINVAL, EPERM) + * @rec: The record that failed + * + * The arch code that enables or disables the function tracing + * can call ftrace_bug() when it has detected a problem in + * modifying 
the code. @failed should be one of either: + * EFAULT - if the problem happens on reading the @ip address + * EINVAL - if what is read at @ip is not what was expected + * EPERM - if the problem happens on writing to the @ip address + */ +void ftrace_bug(int failed, struct dyn_ftrace *rec) +{ + unsigned long ip = rec ? rec->ip : 0; + + pr_info("------------[ ftrace bug ]------------\n"); + + switch (failed) { + case -EFAULT: + pr_info("ftrace faulted on modifying "); + print_ip_sym(KERN_INFO, ip); + break; + case -EINVAL: + pr_info("ftrace failed to modify "); + print_ip_sym(KERN_INFO, ip); + print_ip_ins(" actual: ", (unsigned char *)ip); + pr_cont("\n"); + if (ftrace_expected) { + print_ip_ins(" expected: ", ftrace_expected); + pr_cont("\n"); + } + break; + case -EPERM: + pr_info("ftrace faulted on writing "); + print_ip_sym(KERN_INFO, ip); + break; + default: + pr_info("ftrace faulted on unknown error "); + print_ip_sym(KERN_INFO, ip); + } + print_bug_type(); + if (rec) { + struct ftrace_ops *ops = NULL; + + pr_info("ftrace record flags: %lx\n", rec->flags); + pr_cont(" (%ld)%s%s", ftrace_rec_count(rec), + rec->flags & FTRACE_FL_REGS ? " R" : " ", + rec->flags & FTRACE_FL_CALL_OPS ? " O" : " "); + if (rec->flags & FTRACE_FL_TRAMP_EN) { + ops = ftrace_find_tramp_ops_any(rec); + if (ops) { + do { + pr_cont("\ttramp: %pS (%pS)", + (void *)ops->trampoline, + (void *)ops->func); + ops = ftrace_find_tramp_ops_next(rec, ops); + } while (ops); + } else + pr_cont("\ttramp: ERROR!"); + + } + ip = ftrace_get_addr_curr(rec); + pr_cont("\n expected tramp: %lx\n", ip); + } + + FTRACE_WARN_ON_ONCE(1); +} + +static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update) +{ + unsigned long flag = 0UL; + + ftrace_bug_type = FTRACE_BUG_UNKNOWN; + + if (skip_record(rec)) + return FTRACE_UPDATE_IGNORE; + + /* + * If we are updating calls: + * + * If the record has a ref count, then we need to enable it + * because someone is using it. + * + * Otherwise we make sure its disabled. + * + * If we are disabling calls, then disable all records that + * are enabled. + */ + if (enable && ftrace_rec_count(rec)) + flag = FTRACE_FL_ENABLED; + + /* + * If enabling and the REGS flag does not match the REGS_EN, or + * the TRAMP flag doesn't match the TRAMP_EN, then do not ignore + * this record. Set flags to fail the compare against ENABLED. + * Same for direct calls. + */ + if (flag) { + if (!(rec->flags & FTRACE_FL_REGS) != + !(rec->flags & FTRACE_FL_REGS_EN)) + flag |= FTRACE_FL_REGS; + + if (!(rec->flags & FTRACE_FL_TRAMP) != + !(rec->flags & FTRACE_FL_TRAMP_EN)) + flag |= FTRACE_FL_TRAMP; + + /* + * Direct calls are special, as count matters. + * We must test the record for direct, if the + * DIRECT and DIRECT_EN do not match, but only + * if the count is 1. That's because, if the + * count is something other than one, we do not + * want the direct enabled (it will be done via the + * direct helper). But if DIRECT_EN is set, and + * the count is not one, we need to clear it. + * + */ + if (ftrace_rec_count(rec) == 1) { + if (!(rec->flags & FTRACE_FL_DIRECT) != + !(rec->flags & FTRACE_FL_DIRECT_EN)) + flag |= FTRACE_FL_DIRECT; + } else if (rec->flags & FTRACE_FL_DIRECT_EN) { + flag |= FTRACE_FL_DIRECT; + } + + /* + * Ops calls are special, as count matters. + * As with direct calls, they must only be enabled when count + * is one, otherwise they'll be handled via the list ops. 
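+		 * FTRACE_FL_CALL_OPS itself is only ever set when
+		 * CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS is enabled (see
+		 * __ftrace_hash_rec_update() above).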
+ */ + if (ftrace_rec_count(rec) == 1) { + if (!(rec->flags & FTRACE_FL_CALL_OPS) != + !(rec->flags & FTRACE_FL_CALL_OPS_EN)) + flag |= FTRACE_FL_CALL_OPS; + } else if (rec->flags & FTRACE_FL_CALL_OPS_EN) { + flag |= FTRACE_FL_CALL_OPS; + } + } + + /* If the state of this record hasn't changed, then do nothing */ + if ((rec->flags & FTRACE_FL_ENABLED) == flag) + return FTRACE_UPDATE_IGNORE; + + if (flag) { + /* Save off if rec is being enabled (for return value) */ + flag ^= rec->flags & FTRACE_FL_ENABLED; + + if (update) { + rec->flags |= FTRACE_FL_ENABLED | FTRACE_FL_TOUCHED; + if (flag & FTRACE_FL_REGS) { + if (rec->flags & FTRACE_FL_REGS) + rec->flags |= FTRACE_FL_REGS_EN; + else + rec->flags &= ~FTRACE_FL_REGS_EN; + } + if (flag & FTRACE_FL_TRAMP) { + if (rec->flags & FTRACE_FL_TRAMP) + rec->flags |= FTRACE_FL_TRAMP_EN; + else + rec->flags &= ~FTRACE_FL_TRAMP_EN; + } + + /* Keep track of anything that modifies the function */ + if (rec->flags & (FTRACE_FL_DIRECT | FTRACE_FL_IPMODIFY)) + rec->flags |= FTRACE_FL_MODIFIED; + + if (flag & FTRACE_FL_DIRECT) { + /* + * If there's only one user (direct_ops helper) + * then we can call the direct function + * directly (no ftrace trampoline). + */ + if (ftrace_rec_count(rec) == 1) { + if (rec->flags & FTRACE_FL_DIRECT) + rec->flags |= FTRACE_FL_DIRECT_EN; + else + rec->flags &= ~FTRACE_FL_DIRECT_EN; + } else { + /* + * Can only call directly if there's + * only one callback to the function. + */ + rec->flags &= ~FTRACE_FL_DIRECT_EN; + } + } + + if (flag & FTRACE_FL_CALL_OPS) { + if (ftrace_rec_count(rec) == 1) { + if (rec->flags & FTRACE_FL_CALL_OPS) + rec->flags |= FTRACE_FL_CALL_OPS_EN; + else + rec->flags &= ~FTRACE_FL_CALL_OPS_EN; + } else { + /* + * Can only call directly if there's + * only one set of associated ops. + */ + rec->flags &= ~FTRACE_FL_CALL_OPS_EN; + } + } + } + + /* + * If this record is being updated from a nop, then + * return UPDATE_MAKE_CALL. + * Otherwise, + * return UPDATE_MODIFY_CALL to tell the caller to convert + * from the save regs, to a non-save regs function or + * vice versa, or from a trampoline call. + */ + if (flag & FTRACE_FL_ENABLED) { + ftrace_bug_type = FTRACE_BUG_CALL; + return FTRACE_UPDATE_MAKE_CALL; + } + + ftrace_bug_type = FTRACE_BUG_UPDATE; + return FTRACE_UPDATE_MODIFY_CALL; + } + + if (update) { + /* If there's no more users, clear all flags */ + if (!ftrace_rec_count(rec)) + rec->flags &= FTRACE_NOCLEAR_FLAGS; + else + /* + * Just disable the record, but keep the ops TRAMP + * and REGS states. The _EN flags must be disabled though. + */ + rec->flags &= ~(FTRACE_FL_ENABLED | FTRACE_FL_TRAMP_EN | + FTRACE_FL_REGS_EN | FTRACE_FL_DIRECT_EN | + FTRACE_FL_CALL_OPS_EN); + } + + ftrace_bug_type = FTRACE_BUG_NOP; + return FTRACE_UPDATE_MAKE_NOP; +} + +/** + * ftrace_update_record - set a record that now is tracing or not + * @rec: the record to update + * @enable: set to true if the record is tracing, false to force disable + * + * The records that represent all functions that can be traced need + * to be updated when tracing has been enabled. + */ +int ftrace_update_record(struct dyn_ftrace *rec, bool enable) +{ + return ftrace_check_record(rec, enable, true); +} + +/** + * ftrace_test_record - check if the record has been enabled or not + * @rec: the record to test + * @enable: set to true to check if enabled, false if it is disabled + * + * The arch code may need to test if a record is already set to + * tracing to determine how to modify the function code that it + * represents. 
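+ * Unlike ftrace_update_record(), this does not modify any flags; it only
+ * reports what ftrace_check_record() would decide to do.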
+ */ +int ftrace_test_record(struct dyn_ftrace *rec, bool enable) +{ + return ftrace_check_record(rec, enable, false); +} + +static struct ftrace_ops * +ftrace_find_tramp_ops_any(struct dyn_ftrace *rec) +{ + struct ftrace_ops *op; + unsigned long ip = rec->ip; + + do_for_each_ftrace_op(op, ftrace_ops_list) { + + if (!op->trampoline) + continue; + + if (hash_contains_ip(ip, op->func_hash)) + return op; + } while_for_each_ftrace_op(op); + + return NULL; +} + +static struct ftrace_ops * +ftrace_find_tramp_ops_any_other(struct dyn_ftrace *rec, struct ftrace_ops *op_exclude) +{ + struct ftrace_ops *op; + unsigned long ip = rec->ip; + + do_for_each_ftrace_op(op, ftrace_ops_list) { + + if (op == op_exclude || !op->trampoline) + continue; + + if (hash_contains_ip(ip, op->func_hash)) + return op; + } while_for_each_ftrace_op(op); + + return NULL; +} + +static struct ftrace_ops * +ftrace_find_tramp_ops_next(struct dyn_ftrace *rec, + struct ftrace_ops *op) +{ + unsigned long ip = rec->ip; + + while_for_each_ftrace_op(op) { + + if (!op->trampoline) + continue; + + if (hash_contains_ip(ip, op->func_hash)) + return op; + } + + return NULL; +} + +static struct ftrace_ops * +ftrace_find_tramp_ops_curr(struct dyn_ftrace *rec) +{ + struct ftrace_ops *op; + unsigned long ip = rec->ip; + + /* + * Need to check removed ops first. + * If they are being removed, and this rec has a tramp, + * and this rec is in the ops list, then it would be the + * one with the tramp. + */ + if (removed_ops) { + if (hash_contains_ip(ip, &removed_ops->old_hash)) + return removed_ops; + } + + /* + * Need to find the current trampoline for a rec. + * Now, a trampoline is only attached to a rec if there + * was a single 'ops' attached to it. But this can be called + * when we are adding another op to the rec or removing the + * current one. Thus, if the op is being added, we can + * ignore it because it hasn't attached itself to the rec + * yet. + * + * If an ops is being modified (hooking to different functions) + * then we don't care about the new functions that are being + * added, just the old ones (that are probably being removed). + * + * If we are adding an ops to a function that already is using + * a trampoline, it needs to be removed (trampolines are only + * for single ops connected), then an ops that is not being + * modified also needs to be checked. + */ + do_for_each_ftrace_op(op, ftrace_ops_list) { + + if (!op->trampoline) + continue; + + /* + * If the ops is being added, it hasn't gotten to + * the point to be removed from this tree yet. + */ + if (op->flags & FTRACE_OPS_FL_ADDING) + continue; + + + /* + * If the ops is being modified and is in the old + * hash, then it is probably being removed from this + * function. + */ + if ((op->flags & FTRACE_OPS_FL_MODIFYING) && + hash_contains_ip(ip, &op->old_hash)) + return op; + /* + * If the ops is not being added or modified, and it's + * in its normal filter hash, then this must be the one + * we want! 
+ */ + if (!(op->flags & FTRACE_OPS_FL_MODIFYING) && + hash_contains_ip(ip, op->func_hash)) + return op; + + } while_for_each_ftrace_op(op); + + return NULL; +} + +static struct ftrace_ops * +ftrace_find_tramp_ops_new(struct dyn_ftrace *rec) +{ + struct ftrace_ops *op; + unsigned long ip = rec->ip; + + do_for_each_ftrace_op(op, ftrace_ops_list) { + /* pass rec in as regs to have non-NULL val */ + if (hash_contains_ip(ip, op->func_hash)) + return op; + } while_for_each_ftrace_op(op); + + return NULL; +} + +struct ftrace_ops * +ftrace_find_unique_ops(struct dyn_ftrace *rec) +{ + struct ftrace_ops *op, *found = NULL; + unsigned long ip = rec->ip; + + do_for_each_ftrace_op(op, ftrace_ops_list) { + + if (hash_contains_ip(ip, op->func_hash)) { + if (found) + return NULL; + found = op; + } + + } while_for_each_ftrace_op(op); + + return found; +} + +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +/* Protected by rcu_tasks for reading, and direct_mutex for writing */ +static struct ftrace_hash __rcu *direct_functions = EMPTY_HASH; +static DEFINE_MUTEX(direct_mutex); +int ftrace_direct_func_count; + +/* + * Search the direct_functions hash to see if the given instruction pointer + * has a direct caller attached to it. + */ +unsigned long ftrace_find_rec_direct(unsigned long ip) +{ + struct ftrace_func_entry *entry; + + entry = __ftrace_lookup_ip(direct_functions, ip); + if (!entry) + return 0; + + return entry->direct; +} + +static void call_direct_funcs(unsigned long ip, unsigned long pip, + struct ftrace_ops *ops, struct ftrace_regs *fregs) +{ + unsigned long addr = READ_ONCE(ops->direct_call); + + if (!addr) + return; + + arch_ftrace_set_direct_caller(fregs, addr); +} +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ + +/** + * ftrace_get_addr_new - Get the call address to set to + * @rec: The ftrace record descriptor + * + * If the record has the FTRACE_FL_REGS set, that means that it + * wants to convert to a callback that saves all regs. If FTRACE_FL_REGS + * is not set, then it wants to convert to the normal callback. + * + * Returns the address of the trampoline to set to + */ +unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec) +{ + struct ftrace_ops *ops; + unsigned long addr; + + if ((rec->flags & FTRACE_FL_DIRECT) && + (ftrace_rec_count(rec) == 1)) { + addr = ftrace_find_rec_direct(rec->ip); + if (addr) + return addr; + WARN_ON_ONCE(1); + } + + /* Trampolines take precedence over regs */ + if (rec->flags & FTRACE_FL_TRAMP) { + ops = ftrace_find_tramp_ops_new(rec); + if (FTRACE_WARN_ON(!ops || !ops->trampoline)) { + pr_warn("Bad trampoline accounting at: %p (%pS) (%lx)\n", + (void *)rec->ip, (void *)rec->ip, rec->flags); + /* Ftrace is shutting down, return anything */ + return (unsigned long)FTRACE_ADDR; + } + return ops->trampoline; + } + + if (rec->flags & FTRACE_FL_REGS) + return (unsigned long)FTRACE_REGS_ADDR; + else + return (unsigned long)FTRACE_ADDR; +} + +/** + * ftrace_get_addr_curr - Get the call address that is already there + * @rec: The ftrace record descriptor + * + * The FTRACE_FL_REGS_EN is set when the record already points to + * a function that saves all the regs. Basically the '_EN' version + * represents the current state of the function. 
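+ * ftrace_get_addr_new() above reports the address the call site should be
+ * changed to; this function reports the address that is currently patched in.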
+ * + * Returns the address of the trampoline that is currently being called + */ +unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec) +{ + struct ftrace_ops *ops; + unsigned long addr; + + /* Direct calls take precedence over trampolines */ + if (rec->flags & FTRACE_FL_DIRECT_EN) { + addr = ftrace_find_rec_direct(rec->ip); + if (addr) + return addr; + WARN_ON_ONCE(1); + } + + /* Trampolines take precedence over regs */ + if (rec->flags & FTRACE_FL_TRAMP_EN) { + ops = ftrace_find_tramp_ops_curr(rec); + if (FTRACE_WARN_ON(!ops)) { + pr_warn("Bad trampoline accounting at: %p (%pS)\n", + (void *)rec->ip, (void *)rec->ip); + /* Ftrace is shutting down, return anything */ + return (unsigned long)FTRACE_ADDR; + } + return ops->trampoline; + } + + if (rec->flags & FTRACE_FL_REGS_EN) + return (unsigned long)FTRACE_REGS_ADDR; + else + return (unsigned long)FTRACE_ADDR; +} + +static int +__ftrace_replace_code(struct dyn_ftrace *rec, bool enable) +{ + unsigned long ftrace_old_addr; + unsigned long ftrace_addr; + int ret; + + ftrace_addr = ftrace_get_addr_new(rec); + + /* This needs to be done before we call ftrace_update_record */ + ftrace_old_addr = ftrace_get_addr_curr(rec); + + ret = ftrace_update_record(rec, enable); + + ftrace_bug_type = FTRACE_BUG_UNKNOWN; + + switch (ret) { + case FTRACE_UPDATE_IGNORE: + return 0; + + case FTRACE_UPDATE_MAKE_CALL: + ftrace_bug_type = FTRACE_BUG_CALL; + return ftrace_make_call(rec, ftrace_addr); + + case FTRACE_UPDATE_MAKE_NOP: + ftrace_bug_type = FTRACE_BUG_NOP; + return ftrace_make_nop(NULL, rec, ftrace_old_addr); + + case FTRACE_UPDATE_MODIFY_CALL: + ftrace_bug_type = FTRACE_BUG_UPDATE; + return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr); + } + + return -1; /* unknown ftrace bug */ +} + +void __weak ftrace_replace_code(int mod_flags) +{ + struct dyn_ftrace *rec; + struct ftrace_page *pg; + bool enable = mod_flags & FTRACE_MODIFY_ENABLE_FL; + int schedulable = mod_flags & FTRACE_MODIFY_MAY_SLEEP_FL; + int failed; + + if (unlikely(ftrace_disabled)) + return; + + do_for_each_ftrace_rec(pg, rec) { + + if (skip_record(rec)) + continue; + + failed = __ftrace_replace_code(rec, enable); + if (failed) { + ftrace_bug(failed, rec); + /* Stop processing */ + return; + } + if (schedulable) + cond_resched(); + } while_for_each_ftrace_rec(); +} + +struct ftrace_rec_iter { + struct ftrace_page *pg; + int index; +}; + +/** + * ftrace_rec_iter_start - start up iterating over traced functions + * + * Returns an iterator handle that is used to iterate over all + * the records that represent address locations where functions + * are traced. + * + * May return NULL if no records are available. + */ +struct ftrace_rec_iter *ftrace_rec_iter_start(void) +{ + /* + * We only use a single iterator. + * Protected by the ftrace_lock mutex. + */ + static struct ftrace_rec_iter ftrace_rec_iter; + struct ftrace_rec_iter *iter = &ftrace_rec_iter; + + iter->pg = ftrace_pages_start; + iter->index = 0; + + /* Could have empty pages */ + while (iter->pg && !iter->pg->index) + iter->pg = iter->pg->next; + + if (!iter->pg) + return NULL; + + return iter; +} + +/** + * ftrace_rec_iter_next - get the next record to process. + * @iter: The handle to the iterator. + * + * Returns the next iterator after the given iterator @iter. 
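+ * Returns NULL once the last record has been handed out. A typical walk over
+ * all records looks like this (a sketch; iter and rec are the caller's
+ * variables):
+ *
+ *	for (iter = ftrace_rec_iter_start(); iter; iter = ftrace_rec_iter_next(iter))
+ *		rec = ftrace_rec_iter_record(iter);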
+ */ +struct ftrace_rec_iter *ftrace_rec_iter_next(struct ftrace_rec_iter *iter) +{ + iter->index++; + + if (iter->index >= iter->pg->index) { + iter->pg = iter->pg->next; + iter->index = 0; + + /* Could have empty pages */ + while (iter->pg && !iter->pg->index) + iter->pg = iter->pg->next; + } + + if (!iter->pg) + return NULL; + + return iter; +} + +/** + * ftrace_rec_iter_record - get the record at the iterator location + * @iter: The current iterator location + * + * Returns the record that the current @iter is at. + */ +struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter) +{ + return &iter->pg->records[iter->index]; +} + +static int +ftrace_nop_initialize(struct module *mod, struct dyn_ftrace *rec) +{ + int ret; + + if (unlikely(ftrace_disabled)) + return 0; + + ret = ftrace_init_nop(mod, rec); + if (ret) { + ftrace_bug_type = FTRACE_BUG_INIT; + ftrace_bug(ret, rec); + return 0; + } + return 1; +} + +/* + * archs can override this function if they must do something + * before the modifying code is performed. + */ +void __weak ftrace_arch_code_modify_prepare(void) +{ +} + +/* + * archs can override this function if they must do something + * after the modifying code is performed. + */ +void __weak ftrace_arch_code_modify_post_process(void) +{ +} + +static int update_ftrace_func(ftrace_func_t func) +{ + static ftrace_func_t save_func; + + /* Avoid updating if it hasn't changed */ + if (func == save_func) + return 0; + + save_func = func; + + return ftrace_update_ftrace_func(func); +} + +void ftrace_modify_all_code(int command) +{ + int update = command & FTRACE_UPDATE_TRACE_FUNC; + int mod_flags = 0; + int err = 0; + + if (command & FTRACE_MAY_SLEEP) + mod_flags = FTRACE_MODIFY_MAY_SLEEP_FL; + + /* + * If the ftrace_caller calls a ftrace_ops func directly, + * we need to make sure that it only traces functions it + * expects to trace. When doing the switch of functions, + * we need to update to the ftrace_ops_list_func first + * before the transition between old and new calls are set, + * as the ftrace_ops_list_func will check the ops hashes + * to make sure the ops are having the right functions + * traced. + */ + if (update) { + err = update_ftrace_func(ftrace_ops_list_func); + if (FTRACE_WARN_ON(err)) + return; + } + + if (command & FTRACE_UPDATE_CALLS) + ftrace_replace_code(mod_flags | FTRACE_MODIFY_ENABLE_FL); + else if (command & FTRACE_DISABLE_CALLS) + ftrace_replace_code(mod_flags); + + if (update && ftrace_trace_function != ftrace_ops_list_func) { + function_trace_op = set_function_trace_op; + smp_wmb(); + /* If irqs are disabled, we are in stop machine */ + if (!irqs_disabled()) + smp_call_function(ftrace_sync_ipi, NULL, 1); + err = update_ftrace_func(ftrace_trace_function); + if (FTRACE_WARN_ON(err)) + return; + } + + if (command & FTRACE_START_FUNC_RET) + err = ftrace_enable_ftrace_graph_caller(); + else if (command & FTRACE_STOP_FUNC_RET) + err = ftrace_disable_ftrace_graph_caller(); + FTRACE_WARN_ON(err); +} + +static int __ftrace_modify_code(void *data) +{ + int *command = data; + + ftrace_modify_all_code(*command); + + return 0; +} + +/** + * ftrace_run_stop_machine - go back to the stop machine method + * @command: The command to tell ftrace what to do + * + * If an arch needs to fall back to the stop machine method, the + * it can call this function. 
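+ * arch_ftrace_update_code() below uses this as its default (__weak)
+ * implementation.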
+ */ +void ftrace_run_stop_machine(int command) +{ + stop_machine(__ftrace_modify_code, &command, NULL); +} + +/** + * arch_ftrace_update_code - modify the code to trace or not trace + * @command: The command that needs to be done + * + * Archs can override this function if it does not need to + * run stop_machine() to modify code. + */ +void __weak arch_ftrace_update_code(int command) +{ + ftrace_run_stop_machine(command); +} + +static void ftrace_run_update_code(int command) +{ + ftrace_arch_code_modify_prepare(); + + /* + * By default we use stop_machine() to modify the code. + * But archs can do what ever they want as long as it + * is safe. The stop_machine() is the safest, but also + * produces the most overhead. + */ + arch_ftrace_update_code(command); + + ftrace_arch_code_modify_post_process(); +} + +static void ftrace_run_modify_code(struct ftrace_ops *ops, int command, + struct ftrace_ops_hash *old_hash) +{ + ops->flags |= FTRACE_OPS_FL_MODIFYING; + ops->old_hash.filter_hash = old_hash->filter_hash; + ops->old_hash.notrace_hash = old_hash->notrace_hash; + ftrace_run_update_code(command); + ops->old_hash.filter_hash = NULL; + ops->old_hash.notrace_hash = NULL; + ops->flags &= ~FTRACE_OPS_FL_MODIFYING; +} + +static ftrace_func_t saved_ftrace_func; +static int ftrace_start_up; + +void __weak arch_ftrace_trampoline_free(struct ftrace_ops *ops) +{ +} + +/* List of trace_ops that have allocated trampolines */ +static LIST_HEAD(ftrace_ops_trampoline_list); + +static void ftrace_add_trampoline_to_kallsyms(struct ftrace_ops *ops) +{ + lockdep_assert_held(&ftrace_lock); + list_add_rcu(&ops->list, &ftrace_ops_trampoline_list); +} + +static void ftrace_remove_trampoline_from_kallsyms(struct ftrace_ops *ops) +{ + lockdep_assert_held(&ftrace_lock); + list_del_rcu(&ops->list); + synchronize_rcu(); +} + +/* + * "__builtin__ftrace" is used as a module name in /proc/kallsyms for symbols + * for pages allocated for ftrace purposes, even though "__builtin__ftrace" is + * not a module. + */ +#define FTRACE_TRAMPOLINE_MOD "__builtin__ftrace" +#define FTRACE_TRAMPOLINE_SYM "ftrace_trampoline" + +static void ftrace_trampoline_free(struct ftrace_ops *ops) +{ + if (ops && (ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP) && + ops->trampoline) { + /* + * Record the text poke event before the ksymbol unregister + * event. + */ + perf_event_text_poke((void *)ops->trampoline, + (void *)ops->trampoline, + ops->trampoline_size, NULL, 0); + perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, + ops->trampoline, ops->trampoline_size, + true, FTRACE_TRAMPOLINE_SYM); + /* Remove from kallsyms after the perf events */ + ftrace_remove_trampoline_from_kallsyms(ops); + } + + arch_ftrace_trampoline_free(ops); +} + +static void ftrace_startup_enable(int command) +{ + if (saved_ftrace_func != ftrace_trace_function) { + saved_ftrace_func = ftrace_trace_function; + command |= FTRACE_UPDATE_TRACE_FUNC; + } + + if (!command || !ftrace_enabled) + return; + + ftrace_run_update_code(command); +} + +static void ftrace_startup_all(int command) +{ + update_all_ops = true; + ftrace_startup_enable(command); + update_all_ops = false; +} + +int ftrace_startup(struct ftrace_ops *ops, int command) +{ + int ret; + + if (unlikely(ftrace_disabled)) + return -ENODEV; + + ret = __register_ftrace_function(ops); + if (ret) + return ret; + + ftrace_start_up++; + + /* + * Note that ftrace probes uses this to start up + * and modify functions it will probe. But we still + * set the ADDING flag for modification, as probes + * do not have trampolines. 
If they add them in the + * future, then the probes will need to distinguish + * between adding and updating probes. + */ + ops->flags |= FTRACE_OPS_FL_ENABLED | FTRACE_OPS_FL_ADDING; + + ret = ftrace_hash_ipmodify_enable(ops); + if (ret < 0) { + /* Rollback registration process */ + __unregister_ftrace_function(ops); + ftrace_start_up--; + ops->flags &= ~FTRACE_OPS_FL_ENABLED; + if (ops->flags & FTRACE_OPS_FL_DYNAMIC) + ftrace_trampoline_free(ops); + return ret; + } + + if (ftrace_hash_rec_enable(ops, 1)) + command |= FTRACE_UPDATE_CALLS; + + ftrace_startup_enable(command); + + /* + * If ftrace is in an undefined state, we just remove ops from list + * to prevent the NULL pointer, instead of totally rolling it back and + * free trampoline, because those actions could cause further damage. + */ + if (unlikely(ftrace_disabled)) { + __unregister_ftrace_function(ops); + return -ENODEV; + } + + ops->flags &= ~FTRACE_OPS_FL_ADDING; + + return 0; +} + +int ftrace_shutdown(struct ftrace_ops *ops, int command) +{ + int ret; + + if (unlikely(ftrace_disabled)) + return -ENODEV; + + ret = __unregister_ftrace_function(ops); + if (ret) + return ret; + + ftrace_start_up--; + /* + * Just warn in case of unbalance, no need to kill ftrace, it's not + * critical but the ftrace_call callers may be never nopped again after + * further ftrace uses. + */ + WARN_ON_ONCE(ftrace_start_up < 0); + + /* Disabling ipmodify never fails */ + ftrace_hash_ipmodify_disable(ops); + + if (ftrace_hash_rec_disable(ops, 1)) + command |= FTRACE_UPDATE_CALLS; + + ops->flags &= ~FTRACE_OPS_FL_ENABLED; + + if (saved_ftrace_func != ftrace_trace_function) { + saved_ftrace_func = ftrace_trace_function; + command |= FTRACE_UPDATE_TRACE_FUNC; + } + + if (!command || !ftrace_enabled) + goto out; + + /* + * If the ops uses a trampoline, then it needs to be + * tested first on update. + */ + ops->flags |= FTRACE_OPS_FL_REMOVING; + removed_ops = ops; + + /* The trampoline logic checks the old hashes */ + ops->old_hash.filter_hash = ops->func_hash->filter_hash; + ops->old_hash.notrace_hash = ops->func_hash->notrace_hash; + + ftrace_run_update_code(command); + + /* + * If there's no more ops registered with ftrace, run a + * sanity check to make sure all rec flags are cleared. + */ + if (rcu_dereference_protected(ftrace_ops_list, + lockdep_is_held(&ftrace_lock)) == &ftrace_list_end) { + struct ftrace_page *pg; + struct dyn_ftrace *rec; + + do_for_each_ftrace_rec(pg, rec) { + if (FTRACE_WARN_ON_ONCE(rec->flags & ~FTRACE_NOCLEAR_FLAGS)) + pr_warn(" %pS flags:%lx\n", + (void *)rec->ip, rec->flags); + } while_for_each_ftrace_rec(); + } + + ops->old_hash.filter_hash = NULL; + ops->old_hash.notrace_hash = NULL; + + removed_ops = NULL; + ops->flags &= ~FTRACE_OPS_FL_REMOVING; + +out: + /* + * Dynamic ops may be freed, we must make sure that all + * callers are done before leaving this function. + */ + if (ops->flags & FTRACE_OPS_FL_DYNAMIC) { + /* + * We need to do a hard force of sched synchronization. + * This is because we use preempt_disable() to do RCU, but + * the function tracers can be called where RCU is not watching + * (like before user_exit()). We can not rely on the RCU + * infrastructure to do the synchronization, thus we must do it + * ourselves. + */ + synchronize_rcu_tasks_rude(); + + /* + * When the kernel is preemptive, tasks can be preempted + * while on a ftrace trampoline. Just scheduling a task on + * a CPU is not good enough to flush them. 
Calling + * synchronize_rcu_tasks() will wait for those tasks to + * execute and either schedule voluntarily or enter user space. + */ + if (IS_ENABLED(CONFIG_PREEMPTION)) + synchronize_rcu_tasks(); + + ftrace_trampoline_free(ops); + } + + return 0; +} + +static u64 ftrace_update_time; +unsigned long ftrace_update_tot_cnt; +unsigned long ftrace_number_of_pages; +unsigned long ftrace_number_of_groups; + +static inline int ops_traces_mod(struct ftrace_ops *ops) +{ + /* + * Filter_hash being empty will default to trace module. + * But notrace hash requires a test of individual module functions. + */ + return ftrace_hash_empty(ops->func_hash->filter_hash) && + ftrace_hash_empty(ops->func_hash->notrace_hash); +} + +static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs) +{ + bool init_nop = ftrace_need_init_nop(); + struct ftrace_page *pg; + struct dyn_ftrace *p; + u64 start, stop; + unsigned long update_cnt = 0; + unsigned long rec_flags = 0; + int i; + + start = ftrace_now(raw_smp_processor_id()); + + /* + * When a module is loaded, this function is called to convert + * the calls to mcount in its text to nops, and also to create + * an entry in the ftrace data. Now, if ftrace is activated + * after this call, but before the module sets its text to + * read-only, the modification of enabling ftrace can fail if + * the read-only is done while ftrace is converting the calls. + * To prevent this, the module's records are set as disabled + * and will be enabled after the call to set the module's text + * to read-only. + */ + if (mod) + rec_flags |= FTRACE_FL_DISABLED; + + for (pg = new_pgs; pg; pg = pg->next) { + + for (i = 0; i < pg->index; i++) { + + /* If something went wrong, bail without enabling anything */ + if (unlikely(ftrace_disabled)) + return -1; + + p = &pg->records[i]; + p->flags = rec_flags; + + /* + * Do the initial record conversion from mcount jump + * to the NOP instructions. 
+ */ + if (init_nop && !ftrace_nop_initialize(mod, p)) + break; + + update_cnt++; + } + } + + stop = ftrace_now(raw_smp_processor_id()); + ftrace_update_time = stop - start; + ftrace_update_tot_cnt += update_cnt; + + return 0; +} + +static int ftrace_allocate_records(struct ftrace_page *pg, int count) +{ + int order; + int pages; + int cnt; + + if (WARN_ON(!count)) + return -EINVAL; + + /* We want to fill as much as possible, with no empty pages */ + pages = DIV_ROUND_UP(count, ENTRIES_PER_PAGE); + order = fls(pages) - 1; + + again: + pg->records = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, order); + + if (!pg->records) { + /* if we can't allocate this size, try something smaller */ + if (!order) + return -ENOMEM; + order--; + goto again; + } + + ftrace_number_of_pages += 1 << order; + ftrace_number_of_groups++; + + cnt = (PAGE_SIZE << order) / ENTRY_SIZE; + pg->order = order; + + if (cnt > count) + cnt = count; + + return cnt; +} + +static void ftrace_free_pages(struct ftrace_page *pages) +{ + struct ftrace_page *pg = pages; + + while (pg) { + if (pg->records) { + free_pages((unsigned long)pg->records, pg->order); + ftrace_number_of_pages -= 1 << pg->order; + } + pages = pg->next; + kfree(pg); + pg = pages; + ftrace_number_of_groups--; + } +} + +static struct ftrace_page * +ftrace_allocate_pages(unsigned long num_to_init) +{ + struct ftrace_page *start_pg; + struct ftrace_page *pg; + int cnt; + + if (!num_to_init) + return NULL; + + start_pg = pg = kzalloc(sizeof(*pg), GFP_KERNEL); + if (!pg) + return NULL; + + /* + * Try to allocate as much as possible in one continues + * location that fills in all of the space. We want to + * waste as little space as possible. + */ + for (;;) { + cnt = ftrace_allocate_records(pg, num_to_init); + if (cnt < 0) + goto free_pages; + + num_to_init -= cnt; + if (!num_to_init) + break; + + pg->next = kzalloc(sizeof(*pg), GFP_KERNEL); + if (!pg->next) + goto free_pages; + + pg = pg->next; + } + + return start_pg; + + free_pages: + ftrace_free_pages(start_pg); + pr_info("ftrace: FAILED to allocate memory for functions\n"); + return NULL; +} + +#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ + +struct ftrace_iterator { + loff_t pos; + loff_t func_pos; + loff_t mod_pos; + struct ftrace_page *pg; + struct dyn_ftrace *func; + struct ftrace_func_probe *probe; + struct ftrace_func_entry *probe_entry; + struct trace_parser parser; + struct ftrace_hash *hash; + struct ftrace_ops *ops; + struct trace_array *tr; + struct list_head *mod_list; + int pidx; + int idx; + unsigned flags; +}; + +static void * +t_probe_next(struct seq_file *m, loff_t *pos) +{ + struct ftrace_iterator *iter = m->private; + struct trace_array *tr = iter->ops->private; + struct list_head *func_probes; + struct ftrace_hash *hash; + struct list_head *next; + struct hlist_node *hnd = NULL; + struct hlist_head *hhd; + int size; + + (*pos)++; + iter->pos = *pos; + + if (!tr) + return NULL; + + func_probes = &tr->func_probes; + if (list_empty(func_probes)) + return NULL; + + if (!iter->probe) { + next = func_probes->next; + iter->probe = list_entry(next, struct ftrace_func_probe, list); + } + + if (iter->probe_entry) + hnd = &iter->probe_entry->hlist; + + hash = iter->probe->ops.func_hash->filter_hash; + + /* + * A probe being registered may temporarily have an empty hash + * and it's at the end of the func_probes list. 
+ */ + if (!hash || hash == EMPTY_HASH) + return NULL; + + size = 1 << hash->size_bits; + + retry: + if (iter->pidx >= size) { + if (iter->probe->list.next == func_probes) + return NULL; + next = iter->probe->list.next; + iter->probe = list_entry(next, struct ftrace_func_probe, list); + hash = iter->probe->ops.func_hash->filter_hash; + size = 1 << hash->size_bits; + iter->pidx = 0; + } + + hhd = &hash->buckets[iter->pidx]; + + if (hlist_empty(hhd)) { + iter->pidx++; + hnd = NULL; + goto retry; + } + + if (!hnd) + hnd = hhd->first; + else { + hnd = hnd->next; + if (!hnd) { + iter->pidx++; + goto retry; + } + } + + if (WARN_ON_ONCE(!hnd)) + return NULL; + + iter->probe_entry = hlist_entry(hnd, struct ftrace_func_entry, hlist); + + return iter; +} + +static void *t_probe_start(struct seq_file *m, loff_t *pos) +{ + struct ftrace_iterator *iter = m->private; + void *p = NULL; + loff_t l; + + if (!(iter->flags & FTRACE_ITER_DO_PROBES)) + return NULL; + + if (iter->mod_pos > *pos) + return NULL; + + iter->probe = NULL; + iter->probe_entry = NULL; + iter->pidx = 0; + for (l = 0; l <= (*pos - iter->mod_pos); ) { + p = t_probe_next(m, &l); + if (!p) + break; + } + if (!p) + return NULL; + + /* Only set this if we have an item */ + iter->flags |= FTRACE_ITER_PROBE; + + return iter; +} + +static int +t_probe_show(struct seq_file *m, struct ftrace_iterator *iter) +{ + struct ftrace_func_entry *probe_entry; + struct ftrace_probe_ops *probe_ops; + struct ftrace_func_probe *probe; + + probe = iter->probe; + probe_entry = iter->probe_entry; + + if (WARN_ON_ONCE(!probe || !probe_entry)) + return -EIO; + + probe_ops = probe->probe_ops; + + if (probe_ops->print) + return probe_ops->print(m, probe_entry->ip, probe_ops, probe->data); + + seq_printf(m, "%ps:%ps\n", (void *)probe_entry->ip, + (void *)probe_ops->func); + + return 0; +} + +static void * +t_mod_next(struct seq_file *m, loff_t *pos) +{ + struct ftrace_iterator *iter = m->private; + struct trace_array *tr = iter->tr; + + (*pos)++; + iter->pos = *pos; + + iter->mod_list = iter->mod_list->next; + + if (iter->mod_list == &tr->mod_trace || + iter->mod_list == &tr->mod_notrace) { + iter->flags &= ~FTRACE_ITER_MOD; + return NULL; + } + + iter->mod_pos = *pos; + + return iter; +} + +static void *t_mod_start(struct seq_file *m, loff_t *pos) +{ + struct ftrace_iterator *iter = m->private; + void *p = NULL; + loff_t l; + + if (iter->func_pos > *pos) + return NULL; + + iter->mod_pos = iter->func_pos; + + /* probes are only available if tr is set */ + if (!iter->tr) + return NULL; + + for (l = 0; l <= (*pos - iter->func_pos); ) { + p = t_mod_next(m, &l); + if (!p) + break; + } + if (!p) { + iter->flags &= ~FTRACE_ITER_MOD; + return t_probe_start(m, pos); + } + + /* Only set this if we have an item */ + iter->flags |= FTRACE_ITER_MOD; + + return iter; +} + +static int +t_mod_show(struct seq_file *m, struct ftrace_iterator *iter) +{ + struct ftrace_mod_load *ftrace_mod; + struct trace_array *tr = iter->tr; + + if (WARN_ON_ONCE(!iter->mod_list) || + iter->mod_list == &tr->mod_trace || + iter->mod_list == &tr->mod_notrace) + return -EIO; + + ftrace_mod = list_entry(iter->mod_list, struct ftrace_mod_load, list); + + if (ftrace_mod->func) + seq_printf(m, "%s", ftrace_mod->func); + else + seq_putc(m, '*'); + + seq_printf(m, ":mod:%s\n", ftrace_mod->module); + + return 0; +} + +static void * +t_func_next(struct seq_file *m, loff_t *pos) +{ + struct ftrace_iterator *iter = m->private; + struct dyn_ftrace *rec = NULL; + + (*pos)++; + + retry: + if (iter->idx >= 
iter->pg->index) { + if (iter->pg->next) { + iter->pg = iter->pg->next; + iter->idx = 0; + goto retry; + } + } else { + rec = &iter->pg->records[iter->idx++]; + if (((iter->flags & (FTRACE_ITER_FILTER | FTRACE_ITER_NOTRACE)) && + !ftrace_lookup_ip(iter->hash, rec->ip)) || + + ((iter->flags & FTRACE_ITER_ENABLED) && + !(rec->flags & FTRACE_FL_ENABLED)) || + + ((iter->flags & FTRACE_ITER_TOUCHED) && + !(rec->flags & FTRACE_FL_TOUCHED))) { + + rec = NULL; + goto retry; + } + } + + if (!rec) + return NULL; + + iter->pos = iter->func_pos = *pos; + iter->func = rec; + + return iter; +} + +static void * +t_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct ftrace_iterator *iter = m->private; + loff_t l = *pos; /* t_probe_start() must use original pos */ + void *ret; + + if (unlikely(ftrace_disabled)) + return NULL; + + if (iter->flags & FTRACE_ITER_PROBE) + return t_probe_next(m, pos); + + if (iter->flags & FTRACE_ITER_MOD) + return t_mod_next(m, pos); + + if (iter->flags & FTRACE_ITER_PRINTALL) { + /* next must increment pos, and t_probe_start does not */ + (*pos)++; + return t_mod_start(m, &l); + } + + ret = t_func_next(m, pos); + + if (!ret) + return t_mod_start(m, &l); + + return ret; +} + +static void reset_iter_read(struct ftrace_iterator *iter) +{ + iter->pos = 0; + iter->func_pos = 0; + iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_PROBE | FTRACE_ITER_MOD); +} + +static void *t_start(struct seq_file *m, loff_t *pos) +{ + struct ftrace_iterator *iter = m->private; + void *p = NULL; + loff_t l; + + mutex_lock(&ftrace_lock); + + if (unlikely(ftrace_disabled)) + return NULL; + + /* + * If an lseek was done, then reset and start from beginning. + */ + if (*pos < iter->pos) + reset_iter_read(iter); + + /* + * For set_ftrace_filter reading, if we have the filter + * off, we can short cut and just print out that all + * functions are enabled. + */ + if ((iter->flags & (FTRACE_ITER_FILTER | FTRACE_ITER_NOTRACE)) && + ftrace_hash_empty(iter->hash)) { + iter->func_pos = 1; /* Account for the message */ + if (*pos > 0) + return t_mod_start(m, pos); + iter->flags |= FTRACE_ITER_PRINTALL; + /* reset in case of seek/pread */ + iter->flags &= ~FTRACE_ITER_PROBE; + return iter; + } + + if (iter->flags & FTRACE_ITER_MOD) + return t_mod_start(m, pos); + + /* + * Unfortunately, we need to restart at ftrace_pages_start + * every time we let go of the ftrace_mutex. This is because + * those pointers can change without the lock. + */ + iter->pg = ftrace_pages_start; + iter->idx = 0; + for (l = 0; l <= *pos; ) { + p = t_func_next(m, &l); + if (!p) + break; + } + + if (!p) + return t_mod_start(m, pos); + + return iter; +} + +static void t_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&ftrace_lock); +} + +void * __weak +arch_ftrace_trampoline_func(struct ftrace_ops *ops, struct dyn_ftrace *rec) +{ + return NULL; +} + +static void add_trampoline_func(struct seq_file *m, struct ftrace_ops *ops, + struct dyn_ftrace *rec) +{ + void *ptr; + + ptr = arch_ftrace_trampoline_func(ops, rec); + if (ptr) + seq_printf(m, " ->%pS", ptr); +} + +#ifdef FTRACE_MCOUNT_MAX_OFFSET +/* + * Weak functions can still have an mcount/fentry that is saved in + * the __mcount_loc section. These can be detected by having a + * symbol offset of greater than FTRACE_MCOUNT_MAX_OFFSET, as the + * symbol found by kallsyms is not the function that the mcount/fentry + * is part of. The offset is much greater in these cases. 
+ * + * Test the record to make sure that the ip points to a valid kallsyms + * and if not, mark it disabled. + */ +static int test_for_valid_rec(struct dyn_ftrace *rec) +{ + char str[KSYM_SYMBOL_LEN]; + unsigned long offset; + const char *ret; + + ret = kallsyms_lookup(rec->ip, NULL, &offset, NULL, str); + + /* Weak functions can cause invalid addresses */ + if (!ret || offset > FTRACE_MCOUNT_MAX_OFFSET) { + rec->flags |= FTRACE_FL_DISABLED; + return 0; + } + return 1; +} + +static struct workqueue_struct *ftrace_check_wq __initdata; +static struct work_struct ftrace_check_work __initdata; + +/* + * Scan all the mcount/fentry entries to make sure they are valid. + */ +static __init void ftrace_check_work_func(struct work_struct *work) +{ + struct ftrace_page *pg; + struct dyn_ftrace *rec; + + mutex_lock(&ftrace_lock); + do_for_each_ftrace_rec(pg, rec) { + test_for_valid_rec(rec); + } while_for_each_ftrace_rec(); + mutex_unlock(&ftrace_lock); +} + +static int __init ftrace_check_for_weak_functions(void) +{ + INIT_WORK(&ftrace_check_work, ftrace_check_work_func); + + ftrace_check_wq = alloc_workqueue("ftrace_check_wq", WQ_UNBOUND, 0); + + queue_work(ftrace_check_wq, &ftrace_check_work); + return 0; +} + +static int __init ftrace_check_sync(void) +{ + /* Make sure the ftrace_check updates are finished */ + if (ftrace_check_wq) + destroy_workqueue(ftrace_check_wq); + return 0; +} + +late_initcall_sync(ftrace_check_sync); +subsys_initcall(ftrace_check_for_weak_functions); + +static int print_rec(struct seq_file *m, unsigned long ip) +{ + unsigned long offset; + char str[KSYM_SYMBOL_LEN]; + char *modname; + const char *ret; + + ret = kallsyms_lookup(ip, NULL, &offset, &modname, str); + /* Weak functions can cause invalid addresses */ + if (!ret || offset > FTRACE_MCOUNT_MAX_OFFSET) { + snprintf(str, KSYM_SYMBOL_LEN, "%s_%ld", + FTRACE_INVALID_FUNCTION, offset); + ret = NULL; + } + + seq_puts(m, str); + if (modname) + seq_printf(m, " [%s]", modname); + return ret == NULL ? -1 : 0; +} +#else +static inline int test_for_valid_rec(struct dyn_ftrace *rec) +{ + return 1; +} + +static inline int print_rec(struct seq_file *m, unsigned long ip) +{ + seq_printf(m, "%ps", (void *)ip); + return 0; +} +#endif + +static int t_show(struct seq_file *m, void *v) +{ + struct ftrace_iterator *iter = m->private; + struct dyn_ftrace *rec; + + if (iter->flags & FTRACE_ITER_PROBE) + return t_probe_show(m, iter); + + if (iter->flags & FTRACE_ITER_MOD) + return t_mod_show(m, iter); + + if (iter->flags & FTRACE_ITER_PRINTALL) { + if (iter->flags & FTRACE_ITER_NOTRACE) + seq_puts(m, "#### no functions disabled ####\n"); + else + seq_puts(m, "#### all functions enabled ####\n"); + return 0; + } + + rec = iter->func; + + if (!rec) + return 0; + + if (iter->flags & FTRACE_ITER_ADDRS) + seq_printf(m, "%lx ", rec->ip); + + if (print_rec(m, rec->ip)) { + /* This should only happen when a rec is disabled */ + WARN_ON_ONCE(!(rec->flags & FTRACE_FL_DISABLED)); + seq_putc(m, '\n'); + return 0; + } + + if (iter->flags & (FTRACE_ITER_ENABLED | FTRACE_ITER_TOUCHED)) { + struct ftrace_ops *ops; + + seq_printf(m, " (%ld)%s%s%s%s%s", + ftrace_rec_count(rec), + rec->flags & FTRACE_FL_REGS ? " R" : " ", + rec->flags & FTRACE_FL_IPMODIFY ? " I" : " ", + rec->flags & FTRACE_FL_DIRECT ? " D" : " ", + rec->flags & FTRACE_FL_CALL_OPS ? " O" : " ", + rec->flags & FTRACE_FL_MODIFIED ? 
" M " : " "); + if (rec->flags & FTRACE_FL_TRAMP_EN) { + ops = ftrace_find_tramp_ops_any(rec); + if (ops) { + do { + seq_printf(m, "\ttramp: %pS (%pS)", + (void *)ops->trampoline, + (void *)ops->func); + add_trampoline_func(m, ops, rec); + ops = ftrace_find_tramp_ops_next(rec, ops); + } while (ops); + } else + seq_puts(m, "\ttramp: ERROR!"); + } else { + add_trampoline_func(m, NULL, rec); + } + if (rec->flags & FTRACE_FL_CALL_OPS_EN) { + ops = ftrace_find_unique_ops(rec); + if (ops) { + seq_printf(m, "\tops: %pS (%pS)", + ops, ops->func); + } else { + seq_puts(m, "\tops: ERROR!"); + } + } + if (rec->flags & FTRACE_FL_DIRECT) { + unsigned long direct; + + direct = ftrace_find_rec_direct(rec->ip); + if (direct) + seq_printf(m, "\n\tdirect-->%pS", (void *)direct); + } + } + + seq_putc(m, '\n'); + + return 0; +} + +static const struct seq_operations show_ftrace_seq_ops = { + .start = t_start, + .next = t_next, + .stop = t_stop, + .show = t_show, +}; + +static int +ftrace_avail_open(struct inode *inode, struct file *file) +{ + struct ftrace_iterator *iter; + int ret; + + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + + if (unlikely(ftrace_disabled)) + return -ENODEV; + + iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); + if (!iter) + return -ENOMEM; + + iter->pg = ftrace_pages_start; + iter->ops = &global_ops; + + return 0; +} + +static int +ftrace_enabled_open(struct inode *inode, struct file *file) +{ + struct ftrace_iterator *iter; + + /* + * This shows us what functions are currently being + * traced and by what. Not sure if we want lockdown + * to hide such critical information for an admin. + * Although, perhaps it can show information we don't + * want people to see, but if something is tracing + * something, we probably want to know about it. + */ + + iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); + if (!iter) + return -ENOMEM; + + iter->pg = ftrace_pages_start; + iter->flags = FTRACE_ITER_ENABLED; + iter->ops = &global_ops; + + return 0; +} + +static int +ftrace_touched_open(struct inode *inode, struct file *file) +{ + struct ftrace_iterator *iter; + + /* + * This shows us what functions have ever been enabled + * (traced, direct, patched, etc). Not sure if we want lockdown + * to hide such critical information for an admin. + * Although, perhaps it can show information we don't + * want people to see, but if something had traced + * something, we probably want to know about it. + */ + + iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); + if (!iter) + return -ENOMEM; + + iter->pg = ftrace_pages_start; + iter->flags = FTRACE_ITER_TOUCHED; + iter->ops = &global_ops; + + return 0; +} + +static int +ftrace_avail_addrs_open(struct inode *inode, struct file *file) +{ + struct ftrace_iterator *iter; + int ret; + + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + + if (unlikely(ftrace_disabled)) + return -ENODEV; + + iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); + if (!iter) + return -ENOMEM; + + iter->pg = ftrace_pages_start; + iter->flags = FTRACE_ITER_ADDRS; + iter->ops = &global_ops; + + return 0; +} + +/** + * ftrace_regex_open - initialize function tracer filter files + * @ops: The ftrace_ops that hold the hash filters + * @flag: The type of filter to process + * @inode: The inode, usually passed in to your open routine + * @file: The file, usually passed in to your open routine + * + * ftrace_regex_open() initializes the filter files for the + * @ops. 
Depending on @flag it may process the filter hash or + * the notrace hash of @ops. With this called from the open + * routine, you can use ftrace_filter_write() for the write + * routine if @flag has FTRACE_ITER_FILTER set, or + * ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set. + * tracing_lseek() should be used as the lseek routine, and + * release must call ftrace_regex_release(). + */ +int +ftrace_regex_open(struct ftrace_ops *ops, int flag, + struct inode *inode, struct file *file) +{ + struct ftrace_iterator *iter; + struct ftrace_hash *hash; + struct list_head *mod_head; + struct trace_array *tr = ops->private; + int ret = -ENOMEM; + + ftrace_ops_init(ops); + + if (unlikely(ftrace_disabled)) + return -ENODEV; + + if (tracing_check_open_get_tr(tr)) + return -ENODEV; + + iter = kzalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) + goto out; + + if (trace_parser_get_init(&iter->parser, FTRACE_BUFF_MAX)) + goto out; + + iter->ops = ops; + iter->flags = flag; + iter->tr = tr; + + mutex_lock(&ops->func_hash->regex_lock); + + if (flag & FTRACE_ITER_NOTRACE) { + hash = ops->func_hash->notrace_hash; + mod_head = tr ? &tr->mod_notrace : NULL; + } else { + hash = ops->func_hash->filter_hash; + mod_head = tr ? &tr->mod_trace : NULL; + } + + iter->mod_list = mod_head; + + if (file->f_mode & FMODE_WRITE) { + const int size_bits = FTRACE_HASH_DEFAULT_BITS; + + if (file->f_flags & O_TRUNC) { + iter->hash = alloc_ftrace_hash(size_bits); + clear_ftrace_mod_list(mod_head); + } else { + iter->hash = alloc_and_copy_ftrace_hash(size_bits, hash); + } + + if (!iter->hash) { + trace_parser_put(&iter->parser); + goto out_unlock; + } + } else + iter->hash = hash; + + ret = 0; + + if (file->f_mode & FMODE_READ) { + iter->pg = ftrace_pages_start; + + ret = seq_open(file, &show_ftrace_seq_ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = iter; + } else { + /* Failed */ + free_ftrace_hash(iter->hash); + trace_parser_put(&iter->parser); + } + } else + file->private_data = iter; + + out_unlock: + mutex_unlock(&ops->func_hash->regex_lock); + + out: + if (ret) { + kfree(iter); + if (tr) + trace_array_put(tr); + } + + return ret; +} + +static int +ftrace_filter_open(struct inode *inode, struct file *file) +{ + struct ftrace_ops *ops = inode->i_private; + + /* Checks for tracefs lockdown */ + return ftrace_regex_open(ops, + FTRACE_ITER_FILTER | FTRACE_ITER_DO_PROBES, + inode, file); +} + +static int +ftrace_notrace_open(struct inode *inode, struct file *file) +{ + struct ftrace_ops *ops = inode->i_private; + + /* Checks for tracefs lockdown */ + return ftrace_regex_open(ops, FTRACE_ITER_NOTRACE, + inode, file); +} + +/* Type for quick search ftrace basic regexes (globs) from filter_parse_regex */ +struct ftrace_glob { + char *search; + unsigned len; + int type; +}; + +/* + * If symbols in an architecture don't correspond exactly to the user-visible + * name of what they represent, it is possible to define this function to + * perform the necessary adjustments. 
+*/ +char * __weak arch_ftrace_match_adjust(char *str, const char *search) +{ + return str; +} + +static int ftrace_match(char *str, struct ftrace_glob *g) +{ + int matched = 0; + int slen; + + str = arch_ftrace_match_adjust(str, g->search); + + switch (g->type) { + case MATCH_FULL: + if (strcmp(str, g->search) == 0) + matched = 1; + break; + case MATCH_FRONT_ONLY: + if (strncmp(str, g->search, g->len) == 0) + matched = 1; + break; + case MATCH_MIDDLE_ONLY: + if (strstr(str, g->search)) + matched = 1; + break; + case MATCH_END_ONLY: + slen = strlen(str); + if (slen >= g->len && + memcmp(str + slen - g->len, g->search, g->len) == 0) + matched = 1; + break; + case MATCH_GLOB: + if (glob_match(g->search, str)) + matched = 1; + break; + } + + return matched; +} + +static int +enter_record(struct ftrace_hash *hash, struct dyn_ftrace *rec, int clear_filter) +{ + struct ftrace_func_entry *entry; + int ret = 0; + + entry = ftrace_lookup_ip(hash, rec->ip); + if (clear_filter) { + /* Do nothing if it doesn't exist */ + if (!entry) + return 0; + + free_hash_entry(hash, entry); + } else { + /* Do nothing if it exists */ + if (entry) + return 0; + if (add_hash_entry(hash, rec->ip) == NULL) + ret = -ENOMEM; + } + return ret; +} + +static int +add_rec_by_index(struct ftrace_hash *hash, struct ftrace_glob *func_g, + int clear_filter) +{ + long index = simple_strtoul(func_g->search, NULL, 0); + struct ftrace_page *pg; + struct dyn_ftrace *rec; + + /* The index starts at 1 */ + if (--index < 0) + return 0; + + do_for_each_ftrace_rec(pg, rec) { + if (pg->index <= index) { + index -= pg->index; + /* this is a double loop, break goes to the next page */ + break; + } + rec = &pg->records[index]; + enter_record(hash, rec, clear_filter); + return 1; + } while_for_each_ftrace_rec(); + return 0; +} + +#ifdef FTRACE_MCOUNT_MAX_OFFSET +static int lookup_ip(unsigned long ip, char **modname, char *str) +{ + unsigned long offset; + + kallsyms_lookup(ip, NULL, &offset, modname, str); + if (offset > FTRACE_MCOUNT_MAX_OFFSET) + return -1; + return 0; +} +#else +static int lookup_ip(unsigned long ip, char **modname, char *str) +{ + kallsyms_lookup(ip, NULL, NULL, modname, str); + return 0; +} +#endif + +static int +ftrace_match_record(struct dyn_ftrace *rec, struct ftrace_glob *func_g, + struct ftrace_glob *mod_g, int exclude_mod) +{ + char str[KSYM_SYMBOL_LEN]; + char *modname; + + if (lookup_ip(rec->ip, &modname, str)) { + /* This should only happen when a rec is disabled */ + WARN_ON_ONCE(system_state == SYSTEM_RUNNING && + !(rec->flags & FTRACE_FL_DISABLED)); + return 0; + } + + if (mod_g) { + int mod_matches = (modname) ? ftrace_match(modname, mod_g) : 0; + + /* blank module name to match all modules */ + if (!mod_g->len) { + /* blank module globbing: modname xor exclude_mod */ + if (!exclude_mod != !modname) + goto func_match; + return 0; + } + + /* + * exclude_mod is set to trace everything but the given + * module. If it is set and the module matches, then + * return 0. If it is not set, and the module doesn't match + * also return 0. Otherwise, check the function to see if + * that matches. 
+ */ + if (!mod_matches == !exclude_mod) + return 0; +func_match: + /* blank search means to match all funcs in the mod */ + if (!func_g->len) + return 1; + } + + return ftrace_match(str, func_g); +} + +static int +match_records(struct ftrace_hash *hash, char *func, int len, char *mod) +{ + struct ftrace_page *pg; + struct dyn_ftrace *rec; + struct ftrace_glob func_g = { .type = MATCH_FULL }; + struct ftrace_glob mod_g = { .type = MATCH_FULL }; + struct ftrace_glob *mod_match = (mod) ? &mod_g : NULL; + int exclude_mod = 0; + int found = 0; + int ret; + int clear_filter = 0; + + if (func) { + func_g.type = filter_parse_regex(func, len, &func_g.search, + &clear_filter); + func_g.len = strlen(func_g.search); + } + + if (mod) { + mod_g.type = filter_parse_regex(mod, strlen(mod), + &mod_g.search, &exclude_mod); + mod_g.len = strlen(mod_g.search); + } + + mutex_lock(&ftrace_lock); + + if (unlikely(ftrace_disabled)) + goto out_unlock; + + if (func_g.type == MATCH_INDEX) { + found = add_rec_by_index(hash, &func_g, clear_filter); + goto out_unlock; + } + + do_for_each_ftrace_rec(pg, rec) { + + if (rec->flags & FTRACE_FL_DISABLED) + continue; + + if (ftrace_match_record(rec, &func_g, mod_match, exclude_mod)) { + ret = enter_record(hash, rec, clear_filter); + if (ret < 0) { + found = ret; + goto out_unlock; + } + found = 1; + } + cond_resched(); + } while_for_each_ftrace_rec(); + out_unlock: + mutex_unlock(&ftrace_lock); + + return found; +} + +static int +ftrace_match_records(struct ftrace_hash *hash, char *buff, int len) +{ + return match_records(hash, buff, len, NULL); +} + +static void ftrace_ops_update_code(struct ftrace_ops *ops, + struct ftrace_ops_hash *old_hash) +{ + struct ftrace_ops *op; + + if (!ftrace_enabled) + return; + + if (ops->flags & FTRACE_OPS_FL_ENABLED) { + ftrace_run_modify_code(ops, FTRACE_UPDATE_CALLS, old_hash); + return; + } + + /* + * If this is the shared global_ops filter, then we need to + * check if there is another ops that shares it, is enabled. + * If so, we still need to run the modify code. + */ + if (ops->func_hash != &global_ops.local_hash) + return; + + do_for_each_ftrace_op(op, ftrace_ops_list) { + if (op->func_hash == &global_ops.local_hash && + op->flags & FTRACE_OPS_FL_ENABLED) { + ftrace_run_modify_code(op, FTRACE_UPDATE_CALLS, old_hash); + /* Only need to do this once */ + return; + } + } while_for_each_ftrace_op(op); +} + +static int ftrace_hash_move_and_update_ops(struct ftrace_ops *ops, + struct ftrace_hash **orig_hash, + struct ftrace_hash *hash, + int enable) +{ + struct ftrace_ops_hash old_hash_ops; + struct ftrace_hash *old_hash; + int ret; + + old_hash = *orig_hash; + old_hash_ops.filter_hash = ops->func_hash->filter_hash; + old_hash_ops.notrace_hash = ops->func_hash->notrace_hash; + ret = ftrace_hash_move(ops, enable, orig_hash, hash); + if (!ret) { + ftrace_ops_update_code(ops, &old_hash_ops); + free_ftrace_hash_rcu(old_hash); + } + return ret; +} + +static bool module_exists(const char *module) +{ + /* All modules have the symbol __this_module */ + static const char this_mod[] = "__this_module"; + char modname[MAX_PARAM_PREFIX_LEN + sizeof(this_mod) + 2]; + unsigned long val; + int n; + + n = snprintf(modname, sizeof(modname), "%s:%s", module, this_mod); + + if (n > sizeof(modname) - 1) + return false; + + val = module_kallsyms_lookup_name(modname); + return val != 0; +} + +static int cache_mod(struct trace_array *tr, + const char *func, char *module, int enable) +{ + struct ftrace_mod_load *ftrace_mod, *n; + struct list_head *head = enable ? 
&tr->mod_trace : &tr->mod_notrace; + int ret; + + mutex_lock(&ftrace_lock); + + /* We do not cache inverse filters */ + if (func[0] == '!') { + func++; + ret = -EINVAL; + + /* Look to remove this hash */ + list_for_each_entry_safe(ftrace_mod, n, head, list) { + if (strcmp(ftrace_mod->module, module) != 0) + continue; + + /* no func matches all */ + if (strcmp(func, "*") == 0 || + (ftrace_mod->func && + strcmp(ftrace_mod->func, func) == 0)) { + ret = 0; + free_ftrace_mod(ftrace_mod); + continue; + } + } + goto out; + } + + ret = -EINVAL; + /* We only care about modules that have not been loaded yet */ + if (module_exists(module)) + goto out; + + /* Save this string off, and execute it when the module is loaded */ + ret = ftrace_add_mod(tr, func, module, enable); + out: + mutex_unlock(&ftrace_lock); + + return ret; +} + +static int +ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, + int reset, int enable); + +#ifdef CONFIG_MODULES +static void process_mod_list(struct list_head *head, struct ftrace_ops *ops, + char *mod, bool enable) +{ + struct ftrace_mod_load *ftrace_mod, *n; + struct ftrace_hash **orig_hash, *new_hash; + LIST_HEAD(process_mods); + char *func; + + mutex_lock(&ops->func_hash->regex_lock); + + if (enable) + orig_hash = &ops->func_hash->filter_hash; + else + orig_hash = &ops->func_hash->notrace_hash; + + new_hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, + *orig_hash); + if (!new_hash) + goto out; /* warn? */ + + mutex_lock(&ftrace_lock); + + list_for_each_entry_safe(ftrace_mod, n, head, list) { + + if (strcmp(ftrace_mod->module, mod) != 0) + continue; + + if (ftrace_mod->func) + func = kstrdup(ftrace_mod->func, GFP_KERNEL); + else + func = kstrdup("*", GFP_KERNEL); + + if (!func) /* warn? */ + continue; + + list_move(&ftrace_mod->list, &process_mods); + + /* Use the newly allocated func, as it may be "*" */ + kfree(ftrace_mod->func); + ftrace_mod->func = func; + } + + mutex_unlock(&ftrace_lock); + + list_for_each_entry_safe(ftrace_mod, n, &process_mods, list) { + + func = ftrace_mod->func; + + /* Grabs ftrace_lock, which is why we have this extra step */ + match_records(new_hash, func, strlen(func), mod); + free_ftrace_mod(ftrace_mod); + } + + if (enable && list_empty(head)) + new_hash->flags &= ~FTRACE_HASH_FL_MOD; + + mutex_lock(&ftrace_lock); + + ftrace_hash_move_and_update_ops(ops, orig_hash, + new_hash, enable); + mutex_unlock(&ftrace_lock); + + out: + mutex_unlock(&ops->func_hash->regex_lock); + + free_ftrace_hash(new_hash); +} + +static void process_cached_mods(const char *mod_name) +{ + struct trace_array *tr; + char *mod; + + mod = kstrdup(mod_name, GFP_KERNEL); + if (!mod) + return; + + mutex_lock(&trace_types_lock); + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (!list_empty(&tr->mod_trace)) + process_mod_list(&tr->mod_trace, tr->ops, mod, true); + if (!list_empty(&tr->mod_notrace)) + process_mod_list(&tr->mod_notrace, tr->ops, mod, false); + } + mutex_unlock(&trace_types_lock); + + kfree(mod); +} +#endif + +/* + * We register the module command as a template to show others how + * to register the a command as well. 
+ */ + +static int +ftrace_mod_callback(struct trace_array *tr, struct ftrace_hash *hash, + char *func_orig, char *cmd, char *module, int enable) +{ + char *func; + int ret; + + /* match_records() modifies func, and we need the original */ + func = kstrdup(func_orig, GFP_KERNEL); + if (!func) + return -ENOMEM; + + /* + * cmd == 'mod' because we only registered this func + * for the 'mod' ftrace_func_command. + * But if you register one func with multiple commands, + * you can tell which command was used by the cmd + * parameter. + */ + ret = match_records(hash, func, strlen(func), module); + kfree(func); + + if (!ret) + return cache_mod(tr, func_orig, module, enable); + if (ret < 0) + return ret; + return 0; +} + +static struct ftrace_func_command ftrace_mod_cmd = { + .name = "mod", + .func = ftrace_mod_callback, +}; + +static int __init ftrace_mod_cmd_init(void) +{ + return register_ftrace_command(&ftrace_mod_cmd); +} +core_initcall(ftrace_mod_cmd_init); + +static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct ftrace_regs *fregs) +{ + struct ftrace_probe_ops *probe_ops; + struct ftrace_func_probe *probe; + + probe = container_of(op, struct ftrace_func_probe, ops); + probe_ops = probe->probe_ops; + + /* + * Disable preemption for these calls to prevent a RCU grace + * period. This syncs the hash iteration and freeing of items + * on the hash. rcu_read_lock is too dangerous here. + */ + preempt_disable_notrace(); + probe_ops->func(ip, parent_ip, probe->tr, probe_ops, probe->data); + preempt_enable_notrace(); +} + +struct ftrace_func_map { + struct ftrace_func_entry entry; + void *data; +}; + +struct ftrace_func_mapper { + struct ftrace_hash hash; +}; + +/** + * allocate_ftrace_func_mapper - allocate a new ftrace_func_mapper + * + * Returns a ftrace_func_mapper descriptor that can be used to map ips to data. + */ +struct ftrace_func_mapper *allocate_ftrace_func_mapper(void) +{ + struct ftrace_hash *hash; + + /* + * The mapper is simply a ftrace_hash, but since the entries + * in the hash are not ftrace_func_entry type, we define it + * as a separate structure. + */ + hash = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS); + return (struct ftrace_func_mapper *)hash; +} + +/** + * ftrace_func_mapper_find_ip - Find some data mapped to an ip + * @mapper: The mapper that has the ip maps + * @ip: the instruction pointer to find the data for + * + * Returns the data mapped to @ip if found otherwise NULL. The return + * is actually the address of the mapper data pointer. The address is + * returned for use cases where the data is no bigger than a long, and + * the user can use the data pointer as its data instead of having to + * allocate more memory for the reference. + */ +void **ftrace_func_mapper_find_ip(struct ftrace_func_mapper *mapper, + unsigned long ip) +{ + struct ftrace_func_entry *entry; + struct ftrace_func_map *map; + + entry = ftrace_lookup_ip(&mapper->hash, ip); + if (!entry) + return NULL; + + map = (struct ftrace_func_map *)entry; + return &map->data; +} + +/** + * ftrace_func_mapper_add_ip - Map some data to an ip + * @mapper: The mapper that has the ip maps + * @ip: The instruction pointer address to map @data to + * @data: The data to map to @ip + * + * Returns 0 on success otherwise an error. 
+ */ +int ftrace_func_mapper_add_ip(struct ftrace_func_mapper *mapper, + unsigned long ip, void *data) +{ + struct ftrace_func_entry *entry; + struct ftrace_func_map *map; + + entry = ftrace_lookup_ip(&mapper->hash, ip); + if (entry) + return -EBUSY; + + map = kmalloc(sizeof(*map), GFP_KERNEL); + if (!map) + return -ENOMEM; + + map->entry.ip = ip; + map->data = data; + + __add_hash_entry(&mapper->hash, &map->entry); + + return 0; +} + +/** + * ftrace_func_mapper_remove_ip - Remove an ip from the mapping + * @mapper: The mapper that has the ip maps + * @ip: The instruction pointer address to remove the data from + * + * Returns the data if it is found, otherwise NULL. + * Note, if the data pointer is used as the data itself, (see + * ftrace_func_mapper_find_ip(), then the return value may be meaningless, + * if the data pointer was set to zero. + */ +void *ftrace_func_mapper_remove_ip(struct ftrace_func_mapper *mapper, + unsigned long ip) +{ + struct ftrace_func_entry *entry; + struct ftrace_func_map *map; + void *data; + + entry = ftrace_lookup_ip(&mapper->hash, ip); + if (!entry) + return NULL; + + map = (struct ftrace_func_map *)entry; + data = map->data; + + remove_hash_entry(&mapper->hash, entry); + kfree(entry); + + return data; +} + +/** + * free_ftrace_func_mapper - free a mapping of ips and data + * @mapper: The mapper that has the ip maps + * @free_func: A function to be called on each data item. + * + * This is used to free the function mapper. The @free_func is optional + * and can be used if the data needs to be freed as well. + */ +void free_ftrace_func_mapper(struct ftrace_func_mapper *mapper, + ftrace_mapper_func free_func) +{ + struct ftrace_func_entry *entry; + struct ftrace_func_map *map; + struct hlist_head *hhd; + int size, i; + + if (!mapper) + return; + + if (free_func && mapper->hash.count) { + size = 1 << mapper->hash.size_bits; + for (i = 0; i < size; i++) { + hhd = &mapper->hash.buckets[i]; + hlist_for_each_entry(entry, hhd, hlist) { + map = (struct ftrace_func_map *)entry; + free_func(map); + } + } + } + free_ftrace_hash(&mapper->hash); +} + +static void release_probe(struct ftrace_func_probe *probe) +{ + struct ftrace_probe_ops *probe_ops; + + mutex_lock(&ftrace_lock); + + WARN_ON(probe->ref <= 0); + + /* Subtract the ref that was used to protect this instance */ + probe->ref--; + + if (!probe->ref) { + probe_ops = probe->probe_ops; + /* + * Sending zero as ip tells probe_ops to free + * the probe->data itself + */ + if (probe_ops->free) + probe_ops->free(probe_ops, probe->tr, 0, probe->data); + list_del(&probe->list); + kfree(probe); + } + mutex_unlock(&ftrace_lock); +} + +static void acquire_probe_locked(struct ftrace_func_probe *probe) +{ + /* + * Add one ref to keep it from being freed when releasing the + * ftrace_lock mutex. + */ + probe->ref++; +} + +int +register_ftrace_function_probe(char *glob, struct trace_array *tr, + struct ftrace_probe_ops *probe_ops, + void *data) +{ + struct ftrace_func_probe *probe = NULL, *iter; + struct ftrace_func_entry *entry; + struct ftrace_hash **orig_hash; + struct ftrace_hash *old_hash; + struct ftrace_hash *hash; + int count = 0; + int size; + int ret; + int i; + + if (WARN_ON(!tr)) + return -EINVAL; + + /* We do not support '!' 
for function probes */ + if (WARN_ON(glob[0] == '!')) + return -EINVAL; + + + mutex_lock(&ftrace_lock); + /* Check if the probe_ops is already registered */ + list_for_each_entry(iter, &tr->func_probes, list) { + if (iter->probe_ops == probe_ops) { + probe = iter; + break; + } + } + if (!probe) { + probe = kzalloc(sizeof(*probe), GFP_KERNEL); + if (!probe) { + mutex_unlock(&ftrace_lock); + return -ENOMEM; + } + probe->probe_ops = probe_ops; + probe->ops.func = function_trace_probe_call; + probe->tr = tr; + ftrace_ops_init(&probe->ops); + list_add(&probe->list, &tr->func_probes); + } + + acquire_probe_locked(probe); + + mutex_unlock(&ftrace_lock); + + /* + * Note, there's a small window here that the func_hash->filter_hash + * may be NULL or empty. Need to be careful when reading the loop. + */ + mutex_lock(&probe->ops.func_hash->regex_lock); + + orig_hash = &probe->ops.func_hash->filter_hash; + old_hash = *orig_hash; + hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash); + + if (!hash) { + ret = -ENOMEM; + goto out; + } + + ret = ftrace_match_records(hash, glob, strlen(glob)); + + /* Nothing found? */ + if (!ret) + ret = -EINVAL; + + if (ret < 0) + goto out; + + size = 1 << hash->size_bits; + for (i = 0; i < size; i++) { + hlist_for_each_entry(entry, &hash->buckets[i], hlist) { + if (ftrace_lookup_ip(old_hash, entry->ip)) + continue; + /* + * The caller might want to do something special + * for each function we find. We call the callback + * to give the caller an opportunity to do so. + */ + if (probe_ops->init) { + ret = probe_ops->init(probe_ops, tr, + entry->ip, data, + &probe->data); + if (ret < 0) { + if (probe_ops->free && count) + probe_ops->free(probe_ops, tr, + 0, probe->data); + probe->data = NULL; + goto out; + } + } + count++; + } + } + + mutex_lock(&ftrace_lock); + + if (!count) { + /* Nothing was added? */ + ret = -EINVAL; + goto out_unlock; + } + + ret = ftrace_hash_move_and_update_ops(&probe->ops, orig_hash, + hash, 1); + if (ret < 0) + goto err_unlock; + + /* One ref for each new function traced */ + probe->ref += count; + + if (!(probe->ops.flags & FTRACE_OPS_FL_ENABLED)) + ret = ftrace_startup(&probe->ops, 0); + + out_unlock: + mutex_unlock(&ftrace_lock); + + if (!ret) + ret = count; + out: + mutex_unlock(&probe->ops.func_hash->regex_lock); + free_ftrace_hash(hash); + + release_probe(probe); + + return ret; + + err_unlock: + if (!probe_ops->free || !count) + goto out_unlock; + + /* Failed to do the move, need to call the free functions */ + for (i = 0; i < size; i++) { + hlist_for_each_entry(entry, &hash->buckets[i], hlist) { + if (ftrace_lookup_ip(old_hash, entry->ip)) + continue; + probe_ops->free(probe_ops, tr, entry->ip, probe->data); + } + } + goto out_unlock; +} + +int +unregister_ftrace_function_probe_func(char *glob, struct trace_array *tr, + struct ftrace_probe_ops *probe_ops) +{ + struct ftrace_func_probe *probe = NULL, *iter; + struct ftrace_ops_hash old_hash_ops; + struct ftrace_func_entry *entry; + struct ftrace_glob func_g; + struct ftrace_hash **orig_hash; + struct ftrace_hash *old_hash; + struct ftrace_hash *hash = NULL; + struct hlist_node *tmp; + struct hlist_head hhd; + char str[KSYM_SYMBOL_LEN]; + int count = 0; + int i, ret = -ENODEV; + int size; + + if (!glob || !strlen(glob) || !strcmp(glob, "*")) + func_g.search = NULL; + else { + int not; + + func_g.type = filter_parse_regex(glob, strlen(glob), + &func_g.search, ¬); + func_g.len = strlen(func_g.search); + + /* we do not support '!' 
for function probes */ + if (WARN_ON(not)) + return -EINVAL; + } + + mutex_lock(&ftrace_lock); + /* Check if the probe_ops is already registered */ + list_for_each_entry(iter, &tr->func_probes, list) { + if (iter->probe_ops == probe_ops) { + probe = iter; + break; + } + } + if (!probe) + goto err_unlock_ftrace; + + ret = -EINVAL; + if (!(probe->ops.flags & FTRACE_OPS_FL_INITIALIZED)) + goto err_unlock_ftrace; + + acquire_probe_locked(probe); + + mutex_unlock(&ftrace_lock); + + mutex_lock(&probe->ops.func_hash->regex_lock); + + orig_hash = &probe->ops.func_hash->filter_hash; + old_hash = *orig_hash; + + if (ftrace_hash_empty(old_hash)) + goto out_unlock; + + old_hash_ops.filter_hash = old_hash; + /* Probes only have filters */ + old_hash_ops.notrace_hash = NULL; + + ret = -ENOMEM; + hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash); + if (!hash) + goto out_unlock; + + INIT_HLIST_HEAD(&hhd); + + size = 1 << hash->size_bits; + for (i = 0; i < size; i++) { + hlist_for_each_entry_safe(entry, tmp, &hash->buckets[i], hlist) { + + if (func_g.search) { + kallsyms_lookup(entry->ip, NULL, NULL, + NULL, str); + if (!ftrace_match(str, &func_g)) + continue; + } + count++; + remove_hash_entry(hash, entry); + hlist_add_head(&entry->hlist, &hhd); + } + } + + /* Nothing found? */ + if (!count) { + ret = -EINVAL; + goto out_unlock; + } + + mutex_lock(&ftrace_lock); + + WARN_ON(probe->ref < count); + + probe->ref -= count; + + if (ftrace_hash_empty(hash)) + ftrace_shutdown(&probe->ops, 0); + + ret = ftrace_hash_move_and_update_ops(&probe->ops, orig_hash, + hash, 1); + + /* still need to update the function call sites */ + if (ftrace_enabled && !ftrace_hash_empty(hash)) + ftrace_run_modify_code(&probe->ops, FTRACE_UPDATE_CALLS, + &old_hash_ops); + synchronize_rcu(); + + hlist_for_each_entry_safe(entry, tmp, &hhd, hlist) { + hlist_del(&entry->hlist); + if (probe_ops->free) + probe_ops->free(probe_ops, tr, entry->ip, probe->data); + kfree(entry); + } + mutex_unlock(&ftrace_lock); + + out_unlock: + mutex_unlock(&probe->ops.func_hash->regex_lock); + free_ftrace_hash(hash); + + release_probe(probe); + + return ret; + + err_unlock_ftrace: + mutex_unlock(&ftrace_lock); + return ret; +} + +void clear_ftrace_function_probes(struct trace_array *tr) +{ + struct ftrace_func_probe *probe, *n; + + list_for_each_entry_safe(probe, n, &tr->func_probes, list) + unregister_ftrace_function_probe_func(NULL, tr, probe->probe_ops); +} + +static LIST_HEAD(ftrace_commands); +static DEFINE_MUTEX(ftrace_cmd_mutex); + +/* + * Currently we only register ftrace commands from __init, so mark this + * __init too. + */ +__init int register_ftrace_command(struct ftrace_func_command *cmd) +{ + struct ftrace_func_command *p; + int ret = 0; + + mutex_lock(&ftrace_cmd_mutex); + list_for_each_entry(p, &ftrace_commands, list) { + if (strcmp(cmd->name, p->name) == 0) { + ret = -EBUSY; + goto out_unlock; + } + } + list_add(&cmd->list, &ftrace_commands); + out_unlock: + mutex_unlock(&ftrace_cmd_mutex); + + return ret; +} + +/* + * Currently we only unregister ftrace commands from __init, so mark + * this __init too. 
+ */ +__init int unregister_ftrace_command(struct ftrace_func_command *cmd) +{ + struct ftrace_func_command *p, *n; + int ret = -ENODEV; + + mutex_lock(&ftrace_cmd_mutex); + list_for_each_entry_safe(p, n, &ftrace_commands, list) { + if (strcmp(cmd->name, p->name) == 0) { + ret = 0; + list_del_init(&p->list); + goto out_unlock; + } + } + out_unlock: + mutex_unlock(&ftrace_cmd_mutex); + + return ret; +} + +static int ftrace_process_regex(struct ftrace_iterator *iter, + char *buff, int len, int enable) +{ + struct ftrace_hash *hash = iter->hash; + struct trace_array *tr = iter->ops->private; + char *func, *command, *next = buff; + struct ftrace_func_command *p; + int ret = -EINVAL; + + func = strsep(&next, ":"); + + if (!next) { + ret = ftrace_match_records(hash, func, len); + if (!ret) + ret = -EINVAL; + if (ret < 0) + return ret; + return 0; + } + + /* command found */ + + command = strsep(&next, ":"); + + mutex_lock(&ftrace_cmd_mutex); + list_for_each_entry(p, &ftrace_commands, list) { + if (strcmp(p->name, command) == 0) { + ret = p->func(tr, hash, func, command, next, enable); + goto out_unlock; + } + } + out_unlock: + mutex_unlock(&ftrace_cmd_mutex); + + return ret; +} + +static ssize_t +ftrace_regex_write(struct file *file, const char __user *ubuf, + size_t cnt, loff_t *ppos, int enable) +{ + struct ftrace_iterator *iter; + struct trace_parser *parser; + ssize_t ret, read; + + if (!cnt) + return 0; + + if (file->f_mode & FMODE_READ) { + struct seq_file *m = file->private_data; + iter = m->private; + } else + iter = file->private_data; + + if (unlikely(ftrace_disabled)) + return -ENODEV; + + /* iter->hash is a local copy, so we don't need regex_lock */ + + parser = &iter->parser; + read = trace_get_user(parser, ubuf, cnt, ppos); + + if (read >= 0 && trace_parser_loaded(parser) && + !trace_parser_cont(parser)) { + ret = ftrace_process_regex(iter, parser->buffer, + parser->idx, enable); + trace_parser_clear(parser); + if (ret < 0) + goto out; + } + + ret = read; + out: + return ret; +} + +ssize_t +ftrace_filter_write(struct file *file, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return ftrace_regex_write(file, ubuf, cnt, ppos, 1); +} + +ssize_t +ftrace_notrace_write(struct file *file, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return ftrace_regex_write(file, ubuf, cnt, ppos, 0); +} + +static int +__ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove) +{ + struct ftrace_func_entry *entry; + + ip = ftrace_location(ip); + if (!ip) + return -EINVAL; + + if (remove) { + entry = ftrace_lookup_ip(hash, ip); + if (!entry) + return -ENOENT; + free_hash_entry(hash, entry); + return 0; + } + + entry = add_hash_entry(hash, ip); + return entry ? 0 : -ENOMEM; +} + +static int +ftrace_match_addr(struct ftrace_hash *hash, unsigned long *ips, + unsigned int cnt, int remove) +{ + unsigned int i; + int err; + + for (i = 0; i < cnt; i++) { + err = __ftrace_match_addr(hash, ips[i], remove); + if (err) { + /* + * This expects the @hash is a temporary hash and if this + * fails the caller must free the @hash. 
+ */ + return err; + } + } + return 0; +} + +static int +ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, + unsigned long *ips, unsigned int cnt, + int remove, int reset, int enable) +{ + struct ftrace_hash **orig_hash; + struct ftrace_hash *hash; + int ret; + + if (unlikely(ftrace_disabled)) + return -ENODEV; + + mutex_lock(&ops->func_hash->regex_lock); + + if (enable) + orig_hash = &ops->func_hash->filter_hash; + else + orig_hash = &ops->func_hash->notrace_hash; + + if (reset) + hash = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS); + else + hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); + + if (!hash) { + ret = -ENOMEM; + goto out_regex_unlock; + } + + if (buf && !ftrace_match_records(hash, buf, len)) { + ret = -EINVAL; + goto out_regex_unlock; + } + if (ips) { + ret = ftrace_match_addr(hash, ips, cnt, remove); + if (ret < 0) + goto out_regex_unlock; + } + + mutex_lock(&ftrace_lock); + ret = ftrace_hash_move_and_update_ops(ops, orig_hash, hash, enable); + mutex_unlock(&ftrace_lock); + + out_regex_unlock: + mutex_unlock(&ops->func_hash->regex_lock); + + free_ftrace_hash(hash); + return ret; +} + +static int +ftrace_set_addr(struct ftrace_ops *ops, unsigned long *ips, unsigned int cnt, + int remove, int reset, int enable) +{ + return ftrace_set_hash(ops, NULL, 0, ips, cnt, remove, reset, enable); +} + +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + +struct ftrace_direct_func { + struct list_head next; + unsigned long addr; + int count; +}; + +static LIST_HEAD(ftrace_direct_funcs); + +static int register_ftrace_function_nolock(struct ftrace_ops *ops); + +#define MULTI_FLAGS (FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_ARGS) + +static int check_direct_multi(struct ftrace_ops *ops) +{ + if (!(ops->flags & FTRACE_OPS_FL_INITIALIZED)) + return -EINVAL; + if ((ops->flags & MULTI_FLAGS) != MULTI_FLAGS) + return -EINVAL; + return 0; +} + +static void remove_direct_functions_hash(struct ftrace_hash *hash, unsigned long addr) +{ + struct ftrace_func_entry *entry, *del; + int size, i; + + size = 1 << hash->size_bits; + for (i = 0; i < size; i++) { + hlist_for_each_entry(entry, &hash->buckets[i], hlist) { + del = __ftrace_lookup_ip(direct_functions, entry->ip); + if (del && del->direct == addr) { + remove_hash_entry(direct_functions, del); + kfree(del); + } + } + } +} + +/** + * register_ftrace_direct - Call a custom trampoline directly + * for multiple functions registered in @ops + * @ops: The address of the struct ftrace_ops object + * @addr: The address of the trampoline to call at @ops functions + * + * This is used to connect direct calls to @addr from the nop locations + * of the functions registered in @ops (with the ftrace_set_filter_ip + * function). + * + * The location that it calls (@addr) must be able to handle a direct call, + * and save the parameters of the function being traced, and restore them + * (or inject new ones if needed), before returning. + * + * Returns: + * 0 on success + * -EINVAL - The @ops object was already registered with this call or + * when there are no functions in @ops object. + * -EBUSY - Another direct function is already attached (there can be only one) + * -ENODEV - @ip does not point to a ftrace nop location (or not supported) + * -ENOMEM - There was an allocation failure.
+ */ +int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) +{ + struct ftrace_hash *hash, *new_hash = NULL, *free_hash = NULL; + struct ftrace_func_entry *entry, *new; + int err = -EBUSY, size, i; + + if (ops->func || ops->trampoline) + return -EINVAL; + if (!(ops->flags & FTRACE_OPS_FL_INITIALIZED)) + return -EINVAL; + if (ops->flags & FTRACE_OPS_FL_ENABLED) + return -EINVAL; + + hash = ops->func_hash->filter_hash; + if (ftrace_hash_empty(hash)) + return -EINVAL; + + mutex_lock(&direct_mutex); + + /* Make sure requested entries are not already registered. */ + size = 1 << hash->size_bits; + for (i = 0; i < size; i++) { + hlist_for_each_entry(entry, &hash->buckets[i], hlist) { + if (ftrace_find_rec_direct(entry->ip)) + goto out_unlock; + } + } + + err = -ENOMEM; + + /* Make a copy hash to place the new and the old entries in */ + size = hash->count + direct_functions->count; + if (size > 32) + size = 32; + new_hash = alloc_ftrace_hash(fls(size)); + if (!new_hash) + goto out_unlock; + + /* Now copy over the existing direct entries */ + size = 1 << direct_functions->size_bits; + for (i = 0; i < size; i++) { + hlist_for_each_entry(entry, &direct_functions->buckets[i], hlist) { + new = add_hash_entry(new_hash, entry->ip); + if (!new) + goto out_unlock; + new->direct = entry->direct; + } + } + + /* ... and add the new entries */ + size = 1 << hash->size_bits; + for (i = 0; i < size; i++) { + hlist_for_each_entry(entry, &hash->buckets[i], hlist) { + new = add_hash_entry(new_hash, entry->ip); + if (!new) + goto out_unlock; + /* Update both the copy and the hash entry */ + new->direct = addr; + entry->direct = addr; + } + } + + free_hash = direct_functions; + rcu_assign_pointer(direct_functions, new_hash); + new_hash = NULL; + + ops->func = call_direct_funcs; + ops->flags = MULTI_FLAGS; + ops->trampoline = FTRACE_REGS_ADDR; + ops->direct_call = addr; + + err = register_ftrace_function_nolock(ops); + + out_unlock: + mutex_unlock(&direct_mutex); + + if (free_hash && free_hash != EMPTY_HASH) { + synchronize_rcu_tasks(); + free_ftrace_hash(free_hash); + } + + if (new_hash) + free_ftrace_hash(new_hash); + + return err; +} +EXPORT_SYMBOL_GPL(register_ftrace_direct); + +/** + * unregister_ftrace_direct - Remove calls to custom trampoline + * previously registered by register_ftrace_direct for the @ops object. + * @ops: The address of the struct ftrace_ops object + * + * This is used to remove direct calls to @addr from the nop locations + * of the functions registered in @ops (with the ftrace_set_filter_ip + * function). + * + * Returns: + * 0 on success + * -EINVAL - The @ops object was not properly registered.
+ */ +int unregister_ftrace_direct(struct ftrace_ops *ops, unsigned long addr, + bool free_filters) +{ + struct ftrace_hash *hash = ops->func_hash->filter_hash; + int err; + + if (check_direct_multi(ops)) + return -EINVAL; + if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) + return -EINVAL; + + mutex_lock(&direct_mutex); + err = unregister_ftrace_function(ops); + remove_direct_functions_hash(hash, addr); + mutex_unlock(&direct_mutex); + + /* cleanup for possible another register call */ + ops->func = NULL; + ops->trampoline = 0; + + if (free_filters) + ftrace_free_filter(ops); + return err; +} +EXPORT_SYMBOL_GPL(unregister_ftrace_direct); + +static int +__modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) +{ + struct ftrace_hash *hash; + struct ftrace_func_entry *entry, *iter; + static struct ftrace_ops tmp_ops = { + .func = ftrace_stub, + .flags = FTRACE_OPS_FL_STUB, + }; + int i, size; + int err; + + lockdep_assert_held_once(&direct_mutex); + + /* Enable the tmp_ops to have the same functions as the direct ops */ + ftrace_ops_init(&tmp_ops); + tmp_ops.func_hash = ops->func_hash; + tmp_ops.direct_call = addr; + + err = register_ftrace_function_nolock(&tmp_ops); + if (err) + return err; + + /* + * Now the ftrace_ops_list_func() is called to do the direct callers. + * We can safely change the direct functions attached to each entry. + */ + mutex_lock(&ftrace_lock); + + hash = ops->func_hash->filter_hash; + size = 1 << hash->size_bits; + for (i = 0; i < size; i++) { + hlist_for_each_entry(iter, &hash->buckets[i], hlist) { + entry = __ftrace_lookup_ip(direct_functions, iter->ip); + if (!entry) + continue; + entry->direct = addr; + } + } + /* Prevent store tearing if a trampoline concurrently accesses the value */ + WRITE_ONCE(ops->direct_call, addr); + + mutex_unlock(&ftrace_lock); + + /* Removing the tmp_ops will add the updated direct callers to the functions */ + unregister_ftrace_function(&tmp_ops); + + return err; +} + +/** + * modify_ftrace_direct_nolock - Modify an existing direct 'multi' call + * to call something else + * @ops: The address of the struct ftrace_ops object + * @addr: The address of the new trampoline to call at @ops functions + * + * This is used to unregister the currently registered direct caller and + * register a new one, @addr, on the functions registered in the @ops object. + * + * Note there's a window between the ftrace_shutdown and ftrace_startup calls + * where no callbacks will be called. + * + * Caller should already have direct_mutex locked, so we don't lock + * direct_mutex here. + * + * Returns: zero on success. Non zero on error, which includes: + * -EINVAL - The @ops object was not properly registered. + */ +int modify_ftrace_direct_nolock(struct ftrace_ops *ops, unsigned long addr) +{ + if (check_direct_multi(ops)) + return -EINVAL; + if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) + return -EINVAL; + + return __modify_ftrace_direct(ops, addr); +} +EXPORT_SYMBOL_GPL(modify_ftrace_direct_nolock); + +/** + * modify_ftrace_direct - Modify an existing direct 'multi' call + * to call something else + * @ops: The address of the struct ftrace_ops object + * @addr: The address of the new trampoline to call at @ops functions + * + * This is used to unregister the currently registered direct caller and + * register a new one, @addr, on the functions registered in the @ops object. + * + * Note there's a window between the ftrace_shutdown and ftrace_startup calls + * where no callbacks will be called. + * + * Returns: zero on success.
Non zero on error, which includes: + * -EINVAL - The @ops object was not properly registered. + */ +int modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) +{ + int err; + + if (check_direct_multi(ops)) + return -EINVAL; + if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) + return -EINVAL; + + mutex_lock(&direct_mutex); + err = __modify_ftrace_direct(ops, addr); + mutex_unlock(&direct_mutex); + return err; +} +EXPORT_SYMBOL_GPL(modify_ftrace_direct); +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ + +/** + * ftrace_set_filter_ip - set a function to filter on in ftrace by address + * @ops - the ops to set the filter with + * @ip - the address to add to or remove from the filter. + * @remove - non zero to remove the ip from the filter + * @reset - non zero to reset all filters before applying this filter. + * + * Filters denote which functions should be enabled when tracing is enabled + * If @ip is NULL, it fails to update filter. + * + * This can allocate memory which must be freed before @ops can be freed, + * either by removing each filtered addr or by using + * ftrace_free_filter(@ops). + */ +int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip, + int remove, int reset) +{ + ftrace_ops_init(ops); + return ftrace_set_addr(ops, &ip, 1, remove, reset, 1); +} +EXPORT_SYMBOL_GPL(ftrace_set_filter_ip); + +/** + * ftrace_set_filter_ips - set functions to filter on in ftrace by addresses + * @ops - the ops to set the filter with + * @ips - the array of addresses to add to or remove from the filter. + * @cnt - the number of addresses in @ips + * @remove - non zero to remove ips from the filter + * @reset - non zero to reset all filters before applying this filter. + * + * Filters denote which functions should be enabled when tracing is enabled + * If @ips array or any ip specified within is NULL , it fails to update filter. + * + * This can allocate memory which must be freed before @ops can be freed, + * either by removing each filtered addr or by using + * ftrace_free_filter(@ops). +*/ +int ftrace_set_filter_ips(struct ftrace_ops *ops, unsigned long *ips, + unsigned int cnt, int remove, int reset) +{ + ftrace_ops_init(ops); + return ftrace_set_addr(ops, ips, cnt, remove, reset, 1); +} +EXPORT_SYMBOL_GPL(ftrace_set_filter_ips); + +/** + * ftrace_ops_set_global_filter - setup ops to use global filters + * @ops - the ops which will use the global filters + * + * ftrace users who need global function trace filtering should call this. + * It can set the global filter only if ops were not initialized before. + */ +void ftrace_ops_set_global_filter(struct ftrace_ops *ops) +{ + if (ops->flags & FTRACE_OPS_FL_INITIALIZED) + return; + + ftrace_ops_init(ops); + ops->func_hash = &global_ops.local_hash; +} +EXPORT_SYMBOL_GPL(ftrace_ops_set_global_filter); + +static int +ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, + int reset, int enable) +{ + return ftrace_set_hash(ops, buf, len, NULL, 0, 0, reset, enable); +} + +/** + * ftrace_set_filter - set a function to filter on in ftrace + * @ops - the ops to set the filter with + * @buf - the string that holds the function filter text. + * @len - the length of the string. + * @reset - non zero to reset all filters before applying this filter. + * + * Filters denote which functions should be enabled when tracing is enabled. + * If @buf is NULL and reset is set, all functions will be enabled for tracing. 
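+ *
+ * A minimal sketch of typical usage (my_ops is an illustrative, already
+ * initialized ftrace_ops, not something defined here):
+ *
+ *	ftrace_set_filter(&my_ops, "schedule*", strlen("schedule*"), 1);
+ *	register_ftrace_function(&my_ops);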
+ * + * This can allocate memory which must be freed before @ops can be freed, + * either by removing each filtered addr or by using + * ftrace_free_filter(@ops). + */ +int ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf, + int len, int reset) +{ + ftrace_ops_init(ops); + return ftrace_set_regex(ops, buf, len, reset, 1); +} +EXPORT_SYMBOL_GPL(ftrace_set_filter); + +/** + * ftrace_set_notrace - set a function to not trace in ftrace + * @ops - the ops to set the notrace filter with + * @buf - the string that holds the function notrace text. + * @len - the length of the string. + * @reset - non zero to reset all filters before applying this filter. + * + * Notrace Filters denote which functions should not be enabled when tracing + * is enabled. If @buf is NULL and reset is set, all functions will be enabled + * for tracing. + * + * This can allocate memory which must be freed before @ops can be freed, + * either by removing each filtered addr or by using + * ftrace_free_filter(@ops). + */ +int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, + int len, int reset) +{ + ftrace_ops_init(ops); + return ftrace_set_regex(ops, buf, len, reset, 0); +} +EXPORT_SYMBOL_GPL(ftrace_set_notrace); +/** + * ftrace_set_global_filter - set a function to filter on with global tracers + * @buf - the string that holds the function filter text. + * @len - the length of the string. + * @reset - non zero to reset all filters before applying this filter. + * + * Filters denote which functions should be enabled when tracing is enabled. + * If @buf is NULL and reset is set, all functions will be enabled for tracing. + */ +void ftrace_set_global_filter(unsigned char *buf, int len, int reset) +{ + ftrace_set_regex(&global_ops, buf, len, reset, 1); +} +EXPORT_SYMBOL_GPL(ftrace_set_global_filter); + +/** + * ftrace_set_global_notrace - set a function to not trace with global tracers + * @buf - the string that holds the function notrace text. + * @len - the length of the string. + * @reset - non zero to reset all filters before applying this filter. + * + * Notrace Filters denote which functions should not be enabled when tracing + * is enabled. If @buf is NULL and reset is set, all functions will be enabled + * for tracing. + */ +void ftrace_set_global_notrace(unsigned char *buf, int len, int reset) +{ + ftrace_set_regex(&global_ops, buf, len, reset, 0); +} +EXPORT_SYMBOL_GPL(ftrace_set_global_notrace); + +/* + * command line interface to allow users to set filters on boot up. 
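+ *
+ * For example, booting with (function patterns purely illustrative):
+ *
+ *	ftrace_filter=kmem_cache_*,kfree ftrace_notrace=*rcu*
+ *
+ * pre-loads the global filter and notrace lists before any tracer is
+ * enabled.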
+ */ +#define FTRACE_FILTER_SIZE COMMAND_LINE_SIZE +static char ftrace_notrace_buf[FTRACE_FILTER_SIZE] __initdata; +static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata; + +/* Used by function selftest to not test if filter is set */ +bool ftrace_filter_param __initdata; + +static int __init set_ftrace_notrace(char *str) +{ + ftrace_filter_param = true; + strscpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE); + return 1; +} +__setup("ftrace_notrace=", set_ftrace_notrace); + +static int __init set_ftrace_filter(char *str) +{ + ftrace_filter_param = true; + strscpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE); + return 1; +} +__setup("ftrace_filter=", set_ftrace_filter); + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; +static char ftrace_graph_notrace_buf[FTRACE_FILTER_SIZE] __initdata; +static int ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer); + +static int __init set_graph_function(char *str) +{ + strscpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); + return 1; +} +__setup("ftrace_graph_filter=", set_graph_function); + +static int __init set_graph_notrace_function(char *str) +{ + strscpy(ftrace_graph_notrace_buf, str, FTRACE_FILTER_SIZE); + return 1; +} +__setup("ftrace_graph_notrace=", set_graph_notrace_function); + +static int __init set_graph_max_depth_function(char *str) +{ + if (!str) + return 0; + fgraph_max_depth = simple_strtoul(str, NULL, 0); + return 1; +} +__setup("ftrace_graph_max_depth=", set_graph_max_depth_function); + +static void __init set_ftrace_early_graph(char *buf, int enable) +{ + int ret; + char *func; + struct ftrace_hash *hash; + + hash = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS); + if (MEM_FAIL(!hash, "Failed to allocate hash\n")) + return; + + while (buf) { + func = strsep(&buf, ","); + /* we allow only one expression at a time */ + ret = ftrace_graph_set_hash(hash, func); + if (ret) + printk(KERN_DEBUG "ftrace: function %s not " + "traceable\n", func); + } + + if (enable) + ftrace_graph_hash = hash; + else + ftrace_graph_notrace_hash = hash; +} +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + +void __init +ftrace_set_early_filter(struct ftrace_ops *ops, char *buf, int enable) +{ + char *func; + + ftrace_ops_init(ops); + + while (buf) { + func = strsep(&buf, ","); + ftrace_set_regex(ops, func, strlen(func), 0, enable); + } +} + +static void __init set_ftrace_early_filters(void) +{ + if (ftrace_filter_buf[0]) + ftrace_set_early_filter(&global_ops, ftrace_filter_buf, 1); + if (ftrace_notrace_buf[0]) + ftrace_set_early_filter(&global_ops, ftrace_notrace_buf, 0); +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + if (ftrace_graph_buf[0]) + set_ftrace_early_graph(ftrace_graph_buf, 1); + if (ftrace_graph_notrace_buf[0]) + set_ftrace_early_graph(ftrace_graph_notrace_buf, 0); +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ +} + +int ftrace_regex_release(struct inode *inode, struct file *file) +{ + struct seq_file *m = (struct seq_file *)file->private_data; + struct ftrace_iterator *iter; + struct ftrace_hash **orig_hash; + struct trace_parser *parser; + int filter_hash; + + if (file->f_mode & FMODE_READ) { + iter = m->private; + seq_release(inode, file); + } else + iter = file->private_data; + + parser = &iter->parser; + if (trace_parser_loaded(parser)) { + int enable = !(iter->flags & FTRACE_ITER_NOTRACE); + + ftrace_process_regex(iter, parser->buffer, + parser->idx, enable); + } + + trace_parser_put(parser); + + mutex_lock(&iter->ops->func_hash->regex_lock); + + if (file->f_mode & FMODE_WRITE) { + filter_hash = 
!!(iter->flags & FTRACE_ITER_FILTER); + + if (filter_hash) { + orig_hash = &iter->ops->func_hash->filter_hash; + if (iter->tr) { + if (list_empty(&iter->tr->mod_trace)) + iter->hash->flags &= ~FTRACE_HASH_FL_MOD; + else + iter->hash->flags |= FTRACE_HASH_FL_MOD; + } + } else + orig_hash = &iter->ops->func_hash->notrace_hash; + + mutex_lock(&ftrace_lock); + ftrace_hash_move_and_update_ops(iter->ops, orig_hash, + iter->hash, filter_hash); + mutex_unlock(&ftrace_lock); + } else { + /* For read only, the hash is the ops hash */ + iter->hash = NULL; + } + + mutex_unlock(&iter->ops->func_hash->regex_lock); + free_ftrace_hash(iter->hash); + if (iter->tr) + trace_array_put(iter->tr); + kfree(iter); + + return 0; +} + +static const struct file_operations ftrace_avail_fops = { + .open = ftrace_avail_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +static const struct file_operations ftrace_enabled_fops = { + .open = ftrace_enabled_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +static const struct file_operations ftrace_touched_fops = { + .open = ftrace_touched_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +static const struct file_operations ftrace_avail_addrs_fops = { + .open = ftrace_avail_addrs_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +static const struct file_operations ftrace_filter_fops = { + .open = ftrace_filter_open, + .read = seq_read, + .write = ftrace_filter_write, + .llseek = tracing_lseek, + .release = ftrace_regex_release, +}; + +static const struct file_operations ftrace_notrace_fops = { + .open = ftrace_notrace_open, + .read = seq_read, + .write = ftrace_notrace_write, + .llseek = tracing_lseek, + .release = ftrace_regex_release, +}; + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + +static DEFINE_MUTEX(graph_lock); + +struct ftrace_hash __rcu *ftrace_graph_hash = EMPTY_HASH; +struct ftrace_hash __rcu *ftrace_graph_notrace_hash = EMPTY_HASH; + +enum graph_filter_type { + GRAPH_FILTER_NOTRACE = 0, + GRAPH_FILTER_FUNCTION, +}; + +#define FTRACE_GRAPH_EMPTY ((void *)1) + +struct ftrace_graph_data { + struct ftrace_hash *hash; + struct ftrace_func_entry *entry; + int idx; /* for hash table iteration */ + enum graph_filter_type type; + struct ftrace_hash *new_hash; + const struct seq_operations *seq_ops; + struct trace_parser parser; +}; + +static void * +__g_next(struct seq_file *m, loff_t *pos) +{ + struct ftrace_graph_data *fgd = m->private; + struct ftrace_func_entry *entry = fgd->entry; + struct hlist_head *head; + int i, idx = fgd->idx; + + if (*pos >= fgd->hash->count) + return NULL; + + if (entry) { + hlist_for_each_entry_continue(entry, hlist) { + fgd->entry = entry; + return entry; + } + + idx++; + } + + for (i = idx; i < 1 << fgd->hash->size_bits; i++) { + head = &fgd->hash->buckets[i]; + hlist_for_each_entry(entry, head, hlist) { + fgd->entry = entry; + fgd->idx = i; + return entry; + } + } + return NULL; +} + +static void * +g_next(struct seq_file *m, void *v, loff_t *pos) +{ + (*pos)++; + return __g_next(m, pos); +} + +static void *g_start(struct seq_file *m, loff_t *pos) +{ + struct ftrace_graph_data *fgd = m->private; + + mutex_lock(&graph_lock); + + if (fgd->type == GRAPH_FILTER_FUNCTION) + fgd->hash = rcu_dereference_protected(ftrace_graph_hash, + lockdep_is_held(&graph_lock)); + else + fgd->hash = rcu_dereference_protected(ftrace_graph_notrace_hash, + lockdep_is_held(&graph_lock)); + + /* Nothing, tell 
g_show to print all functions are enabled */ + if (ftrace_hash_empty(fgd->hash) && !*pos) + return FTRACE_GRAPH_EMPTY; + + fgd->idx = 0; + fgd->entry = NULL; + return __g_next(m, pos); +} + +static void g_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&graph_lock); +} + +static int g_show(struct seq_file *m, void *v) +{ + struct ftrace_func_entry *entry = v; + + if (!entry) + return 0; + + if (entry == FTRACE_GRAPH_EMPTY) { + struct ftrace_graph_data *fgd = m->private; + + if (fgd->type == GRAPH_FILTER_FUNCTION) + seq_puts(m, "#### all functions enabled ####\n"); + else + seq_puts(m, "#### no functions disabled ####\n"); + return 0; + } + + seq_printf(m, "%ps\n", (void *)entry->ip); + + return 0; +} + +static const struct seq_operations ftrace_graph_seq_ops = { + .start = g_start, + .next = g_next, + .stop = g_stop, + .show = g_show, +}; + +static int +__ftrace_graph_open(struct inode *inode, struct file *file, + struct ftrace_graph_data *fgd) +{ + int ret; + struct ftrace_hash *new_hash = NULL; + + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + + if (file->f_mode & FMODE_WRITE) { + const int size_bits = FTRACE_HASH_DEFAULT_BITS; + + if (trace_parser_get_init(&fgd->parser, FTRACE_BUFF_MAX)) + return -ENOMEM; + + if (file->f_flags & O_TRUNC) + new_hash = alloc_ftrace_hash(size_bits); + else + new_hash = alloc_and_copy_ftrace_hash(size_bits, + fgd->hash); + if (!new_hash) { + ret = -ENOMEM; + goto out; + } + } + + if (file->f_mode & FMODE_READ) { + ret = seq_open(file, &ftrace_graph_seq_ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = fgd; + } else { + /* Failed */ + free_ftrace_hash(new_hash); + new_hash = NULL; + } + } else + file->private_data = fgd; + +out: + if (ret < 0 && file->f_mode & FMODE_WRITE) + trace_parser_put(&fgd->parser); + + fgd->new_hash = new_hash; + + /* + * All uses of fgd->hash must be taken with the graph_lock + * held. The graph_lock is going to be released, so force + * fgd->hash to be reinitialized when it is taken again. 
+ */ + fgd->hash = NULL; + + return ret; +} + +static int +ftrace_graph_open(struct inode *inode, struct file *file) +{ + struct ftrace_graph_data *fgd; + int ret; + + if (unlikely(ftrace_disabled)) + return -ENODEV; + + fgd = kmalloc(sizeof(*fgd), GFP_KERNEL); + if (fgd == NULL) + return -ENOMEM; + + mutex_lock(&graph_lock); + + fgd->hash = rcu_dereference_protected(ftrace_graph_hash, + lockdep_is_held(&graph_lock)); + fgd->type = GRAPH_FILTER_FUNCTION; + fgd->seq_ops = &ftrace_graph_seq_ops; + + ret = __ftrace_graph_open(inode, file, fgd); + if (ret < 0) + kfree(fgd); + + mutex_unlock(&graph_lock); + return ret; +} + +static int +ftrace_graph_notrace_open(struct inode *inode, struct file *file) +{ + struct ftrace_graph_data *fgd; + int ret; + + if (unlikely(ftrace_disabled)) + return -ENODEV; + + fgd = kmalloc(sizeof(*fgd), GFP_KERNEL); + if (fgd == NULL) + return -ENOMEM; + + mutex_lock(&graph_lock); + + fgd->hash = rcu_dereference_protected(ftrace_graph_notrace_hash, + lockdep_is_held(&graph_lock)); + fgd->type = GRAPH_FILTER_NOTRACE; + fgd->seq_ops = &ftrace_graph_seq_ops; + + ret = __ftrace_graph_open(inode, file, fgd); + if (ret < 0) + kfree(fgd); + + mutex_unlock(&graph_lock); + return ret; +} + +static int +ftrace_graph_release(struct inode *inode, struct file *file) +{ + struct ftrace_graph_data *fgd; + struct ftrace_hash *old_hash, *new_hash; + struct trace_parser *parser; + int ret = 0; + + if (file->f_mode & FMODE_READ) { + struct seq_file *m = file->private_data; + + fgd = m->private; + seq_release(inode, file); + } else { + fgd = file->private_data; + } + + + if (file->f_mode & FMODE_WRITE) { + + parser = &fgd->parser; + + if (trace_parser_loaded((parser))) { + ret = ftrace_graph_set_hash(fgd->new_hash, + parser->buffer); + } + + trace_parser_put(parser); + + new_hash = __ftrace_hash_move(fgd->new_hash); + if (!new_hash) { + ret = -ENOMEM; + goto out; + } + + mutex_lock(&graph_lock); + + if (fgd->type == GRAPH_FILTER_FUNCTION) { + old_hash = rcu_dereference_protected(ftrace_graph_hash, + lockdep_is_held(&graph_lock)); + rcu_assign_pointer(ftrace_graph_hash, new_hash); + } else { + old_hash = rcu_dereference_protected(ftrace_graph_notrace_hash, + lockdep_is_held(&graph_lock)); + rcu_assign_pointer(ftrace_graph_notrace_hash, new_hash); + } + + mutex_unlock(&graph_lock); + + /* + * We need to do a hard force of sched synchronization. + * This is because we use preempt_disable() to do RCU, but + * the function tracers can be called where RCU is not watching + * (like before user_exit()). We can not rely on the RCU + * infrastructure to do the synchronization, thus we must do it + * ourselves. 
+ */ + if (old_hash != EMPTY_HASH) + synchronize_rcu_tasks_rude(); + + free_ftrace_hash(old_hash); + } + + out: + free_ftrace_hash(fgd->new_hash); + kfree(fgd); + + return ret; +} + +static int +ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer) +{ + struct ftrace_glob func_g; + struct dyn_ftrace *rec; + struct ftrace_page *pg; + struct ftrace_func_entry *entry; + int fail = 1; + int not; + + /* decode regex */ + func_g.type = filter_parse_regex(buffer, strlen(buffer), + &func_g.search, ¬); + + func_g.len = strlen(func_g.search); + + mutex_lock(&ftrace_lock); + + if (unlikely(ftrace_disabled)) { + mutex_unlock(&ftrace_lock); + return -ENODEV; + } + + do_for_each_ftrace_rec(pg, rec) { + + if (rec->flags & FTRACE_FL_DISABLED) + continue; + + if (ftrace_match_record(rec, &func_g, NULL, 0)) { + entry = ftrace_lookup_ip(hash, rec->ip); + + if (!not) { + fail = 0; + + if (entry) + continue; + if (add_hash_entry(hash, rec->ip) == NULL) + goto out; + } else { + if (entry) { + free_hash_entry(hash, entry); + fail = 0; + } + } + } + } while_for_each_ftrace_rec(); +out: + mutex_unlock(&ftrace_lock); + + if (fail) + return -EINVAL; + + return 0; +} + +static ssize_t +ftrace_graph_write(struct file *file, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + ssize_t read, ret = 0; + struct ftrace_graph_data *fgd = file->private_data; + struct trace_parser *parser; + + if (!cnt) + return 0; + + /* Read mode uses seq functions */ + if (file->f_mode & FMODE_READ) { + struct seq_file *m = file->private_data; + fgd = m->private; + } + + parser = &fgd->parser; + + read = trace_get_user(parser, ubuf, cnt, ppos); + + if (read >= 0 && trace_parser_loaded(parser) && + !trace_parser_cont(parser)) { + + ret = ftrace_graph_set_hash(fgd->new_hash, + parser->buffer); + trace_parser_clear(parser); + } + + if (!ret) + ret = read; + + return ret; +} + +static const struct file_operations ftrace_graph_fops = { + .open = ftrace_graph_open, + .read = seq_read, + .write = ftrace_graph_write, + .llseek = tracing_lseek, + .release = ftrace_graph_release, +}; + +static const struct file_operations ftrace_graph_notrace_fops = { + .open = ftrace_graph_notrace_open, + .read = seq_read, + .write = ftrace_graph_write, + .llseek = tracing_lseek, + .release = ftrace_graph_release, +}; +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + +void ftrace_create_filter_files(struct ftrace_ops *ops, + struct dentry *parent) +{ + + trace_create_file("set_ftrace_filter", TRACE_MODE_WRITE, parent, + ops, &ftrace_filter_fops); + + trace_create_file("set_ftrace_notrace", TRACE_MODE_WRITE, parent, + ops, &ftrace_notrace_fops); +} + +/* + * The name "destroy_filter_files" is really a misnomer. Although + * in the future, it may actually delete the files, but this is + * really intended to make sure the ops passed in are disabled + * and that when this function returns, the caller is free to + * free the ops. + * + * The "destroy" name is only to match the "create" name that this + * should be paired with. 
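+ *
+ * A typical pairing, as a sketch (my_ops and parent stand in for a
+ * caller's ops and tracefs directory):
+ *
+ *	ftrace_create_filter_files(&my_ops, parent);
+ *	... use the ops, possibly enabling and disabling it ...
+ *	ftrace_destroy_filter_files(&my_ops);
+ *	... now the ops may be freed ...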
+ */ +void ftrace_destroy_filter_files(struct ftrace_ops *ops) +{ + mutex_lock(&ftrace_lock); + if (ops->flags & FTRACE_OPS_FL_ENABLED) + ftrace_shutdown(ops, 0); + ops->flags |= FTRACE_OPS_FL_DELETED; + ftrace_free_filter(ops); + mutex_unlock(&ftrace_lock); +} + +static __init int ftrace_init_dyn_tracefs(struct dentry *d_tracer) +{ + + trace_create_file("available_filter_functions", TRACE_MODE_READ, + d_tracer, NULL, &ftrace_avail_fops); + + trace_create_file("available_filter_functions_addrs", TRACE_MODE_READ, + d_tracer, NULL, &ftrace_avail_addrs_fops); + + trace_create_file("enabled_functions", TRACE_MODE_READ, + d_tracer, NULL, &ftrace_enabled_fops); + + trace_create_file("touched_functions", TRACE_MODE_READ, + d_tracer, NULL, &ftrace_touched_fops); + + ftrace_create_filter_files(&global_ops, d_tracer); + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + trace_create_file("set_graph_function", TRACE_MODE_WRITE, d_tracer, + NULL, + &ftrace_graph_fops); + trace_create_file("set_graph_notrace", TRACE_MODE_WRITE, d_tracer, + NULL, + &ftrace_graph_notrace_fops); +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + + return 0; +} + +static int ftrace_cmp_ips(const void *a, const void *b) +{ + const unsigned long *ipa = a; + const unsigned long *ipb = b; + + if (*ipa > *ipb) + return 1; + if (*ipa < *ipb) + return -1; + return 0; +} + +#ifdef CONFIG_FTRACE_SORT_STARTUP_TEST +static void test_is_sorted(unsigned long *start, unsigned long count) +{ + int i; + + for (i = 1; i < count; i++) { + if (WARN(start[i - 1] > start[i], + "[%d] %pS at %lx is not sorted with %pS at %lx\n", i, + (void *)start[i - 1], start[i - 1], + (void *)start[i], start[i])) + break; + } + if (i == count) + pr_info("ftrace section at %px sorted properly\n", start); +} +#else +static void test_is_sorted(unsigned long *start, unsigned long count) +{ +} +#endif + +static int ftrace_process_locs(struct module *mod, + unsigned long *start, + unsigned long *end) +{ + struct ftrace_page *pg_unuse = NULL; + struct ftrace_page *start_pg; + struct ftrace_page *pg; + struct dyn_ftrace *rec; + unsigned long skipped = 0; + unsigned long count; + unsigned long *p; + unsigned long addr; + unsigned long flags = 0; /* Shut up gcc */ + int ret = -ENOMEM; + + count = end - start; + + if (!count) + return 0; + + /* + * Sorting mcount in vmlinux at build time depend on + * CONFIG_BUILDTIME_MCOUNT_SORT, while mcount loc in + * modules can not be sorted at build time. + */ + if (!IS_ENABLED(CONFIG_BUILDTIME_MCOUNT_SORT) || mod) { + sort(start, count, sizeof(*start), + ftrace_cmp_ips, NULL); + } else { + test_is_sorted(start, count); + } + + start_pg = ftrace_allocate_pages(count); + if (!start_pg) + return -ENOMEM; + + mutex_lock(&ftrace_lock); + + /* + * Core and each module needs their own pages, as + * modules will free them when they are removed. + * Force a new page to be allocated for modules. + */ + if (!mod) { + WARN_ON(ftrace_pages || ftrace_pages_start); + /* First initialization */ + ftrace_pages = ftrace_pages_start = start_pg; + } else { + if (!ftrace_pages) + goto out; + + if (WARN_ON(ftrace_pages->next)) { + /* Hmm, we have free pages? */ + while (ftrace_pages->next) + ftrace_pages = ftrace_pages->next; + } + + ftrace_pages->next = start_pg; + } + + p = start; + pg = start_pg; + while (p < end) { + unsigned long end_offset; + addr = ftrace_call_adjust(*p++); + /* + * Some architecture linkers will pad between + * the different mcount_loc sections of different + * object files to satisfy alignments. + * Skip any NULL pointers. 
+ */ + if (!addr) { + skipped++; + continue; + } + + end_offset = (pg->index+1) * sizeof(pg->records[0]); + if (end_offset > PAGE_SIZE << pg->order) { + /* We should have allocated enough */ + if (WARN_ON(!pg->next)) + break; + pg = pg->next; + } + + rec = &pg->records[pg->index++]; + rec->ip = addr; + } + + if (pg->next) { + pg_unuse = pg->next; + pg->next = NULL; + } + + /* Assign the last page to ftrace_pages */ + ftrace_pages = pg; + + /* + * We only need to disable interrupts on start up + * because we are modifying code that an interrupt + * may execute, and the modification is not atomic. + * But for modules, nothing runs the code we modify + * until we are finished with it, and there's no + * reason to cause large interrupt latencies while we do it. + */ + if (!mod) + local_irq_save(flags); + ftrace_update_code(mod, start_pg); + if (!mod) + local_irq_restore(flags); + ret = 0; + out: + mutex_unlock(&ftrace_lock); + + /* We should have used all pages unless we skipped some */ + if (pg_unuse) { + WARN_ON(!skipped); + ftrace_free_pages(pg_unuse); + } + return ret; +} + +struct ftrace_mod_func { + struct list_head list; + char *name; + unsigned long ip; + unsigned int size; +}; + +struct ftrace_mod_map { + struct rcu_head rcu; + struct list_head list; + struct module *mod; + unsigned long start_addr; + unsigned long end_addr; + struct list_head funcs; + unsigned int num_funcs; +}; + +static int ftrace_get_trampoline_kallsym(unsigned int symnum, + unsigned long *value, char *type, + char *name, char *module_name, + int *exported) +{ + struct ftrace_ops *op; + + list_for_each_entry_rcu(op, &ftrace_ops_trampoline_list, list) { + if (!op->trampoline || symnum--) + continue; + *value = op->trampoline; + *type = 't'; + strscpy(name, FTRACE_TRAMPOLINE_SYM, KSYM_NAME_LEN); + strscpy(module_name, FTRACE_TRAMPOLINE_MOD, MODULE_NAME_LEN); + *exported = 0; + return 0; + } + + return -ERANGE; +} + +#if defined(CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS) || defined(CONFIG_MODULES) +/* + * Check if the current ops references the given ip. + * + * If the ops traces all functions, then it was already accounted for. + * If the ops does not trace the current record function, skip it. + * If the ops ignores the function via notrace filter, skip it. 
+ */ +static bool +ops_references_ip(struct ftrace_ops *ops, unsigned long ip) +{ + /* If ops isn't enabled, ignore it */ + if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) + return false; + + /* If ops traces all then it includes this function */ + if (ops_traces_mod(ops)) + return true; + + /* The function must be in the filter */ + if (!ftrace_hash_empty(ops->func_hash->filter_hash) && + !__ftrace_lookup_ip(ops->func_hash->filter_hash, ip)) + return false; + + /* If in notrace hash, we ignore it too */ + if (ftrace_lookup_ip(ops->func_hash->notrace_hash, ip)) + return false; + + return true; +} +#endif + +#ifdef CONFIG_MODULES + +#define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next) + +static LIST_HEAD(ftrace_mod_maps); + +static int referenced_filters(struct dyn_ftrace *rec) +{ + struct ftrace_ops *ops; + int cnt = 0; + + for (ops = ftrace_ops_list; ops != &ftrace_list_end; ops = ops->next) { + if (ops_references_ip(ops, rec->ip)) { + if (WARN_ON_ONCE(ops->flags & FTRACE_OPS_FL_DIRECT)) + continue; + if (WARN_ON_ONCE(ops->flags & FTRACE_OPS_FL_IPMODIFY)) + continue; + cnt++; + if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) + rec->flags |= FTRACE_FL_REGS; + if (cnt == 1 && ops->trampoline) + rec->flags |= FTRACE_FL_TRAMP; + else + rec->flags &= ~FTRACE_FL_TRAMP; + } + } + + return cnt; +} + +static void +clear_mod_from_hash(struct ftrace_page *pg, struct ftrace_hash *hash) +{ + struct ftrace_func_entry *entry; + struct dyn_ftrace *rec; + int i; + + if (ftrace_hash_empty(hash)) + return; + + for (i = 0; i < pg->index; i++) { + rec = &pg->records[i]; + entry = __ftrace_lookup_ip(hash, rec->ip); + /* + * Do not allow this rec to match again. + * Yeah, it may waste some memory, but will be removed + * if/when the hash is modified again. + */ + if (entry) + entry->ip = 0; + } +} + +/* Clear any records from hashes */ +static void clear_mod_from_hashes(struct ftrace_page *pg) +{ + struct trace_array *tr; + + mutex_lock(&trace_types_lock); + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (!tr->ops || !tr->ops->func_hash) + continue; + mutex_lock(&tr->ops->func_hash->regex_lock); + clear_mod_from_hash(pg, tr->ops->func_hash->filter_hash); + clear_mod_from_hash(pg, tr->ops->func_hash->notrace_hash); + mutex_unlock(&tr->ops->func_hash->regex_lock); + } + mutex_unlock(&trace_types_lock); +} + +static void ftrace_free_mod_map(struct rcu_head *rcu) +{ + struct ftrace_mod_map *mod_map = container_of(rcu, struct ftrace_mod_map, rcu); + struct ftrace_mod_func *mod_func; + struct ftrace_mod_func *n; + + /* All the contents of mod_map are now not visible to readers */ + list_for_each_entry_safe(mod_func, n, &mod_map->funcs, list) { + kfree(mod_func->name); + list_del(&mod_func->list); + kfree(mod_func); + } + + kfree(mod_map); +} + +void ftrace_release_mod(struct module *mod) +{ + struct ftrace_mod_map *mod_map; + struct ftrace_mod_map *n; + struct dyn_ftrace *rec; + struct ftrace_page **last_pg; + struct ftrace_page *tmp_page = NULL; + struct ftrace_page *pg; + + mutex_lock(&ftrace_lock); + + if (ftrace_disabled) + goto out_unlock; + + list_for_each_entry_safe(mod_map, n, &ftrace_mod_maps, list) { + if (mod_map->mod == mod) { + list_del_rcu(&mod_map->list); + call_rcu(&mod_map->rcu, ftrace_free_mod_map); + break; + } + } + + /* + * Each module has its own ftrace_pages, remove + * them from the list. 
+ */ + last_pg = &ftrace_pages_start; + for (pg = ftrace_pages_start; pg; pg = *last_pg) { + rec = &pg->records[0]; + if (within_module(rec->ip, mod)) { + /* + * As core pages are first, the first + * page should never be a module page. + */ + if (WARN_ON(pg == ftrace_pages_start)) + goto out_unlock; + + /* Check if we are deleting the last page */ + if (pg == ftrace_pages) + ftrace_pages = next_to_ftrace_page(last_pg); + + ftrace_update_tot_cnt -= pg->index; + *last_pg = pg->next; + + pg->next = tmp_page; + tmp_page = pg; + } else + last_pg = &pg->next; + } + out_unlock: + mutex_unlock(&ftrace_lock); + + for (pg = tmp_page; pg; pg = tmp_page) { + + /* Needs to be called outside of ftrace_lock */ + clear_mod_from_hashes(pg); + + if (pg->records) { + free_pages((unsigned long)pg->records, pg->order); + ftrace_number_of_pages -= 1 << pg->order; + } + tmp_page = pg->next; + kfree(pg); + ftrace_number_of_groups--; + } +} + +void ftrace_module_enable(struct module *mod) +{ + struct dyn_ftrace *rec; + struct ftrace_page *pg; + + mutex_lock(&ftrace_lock); + + if (ftrace_disabled) + goto out_unlock; + + /* + * If the tracing is enabled, go ahead and enable the record. + * + * The reason not to enable the record immediately is the + * inherent check of ftrace_make_nop/ftrace_make_call for + * correct previous instructions. Making first the NOP + * conversion puts the module to the correct state, thus + * passing the ftrace_make_call check. + * + * We also delay this to after the module code already set the + * text to read-only, as we now need to set it back to read-write + * so that we can modify the text. + */ + if (ftrace_start_up) + ftrace_arch_code_modify_prepare(); + + do_for_each_ftrace_rec(pg, rec) { + int cnt; + /* + * do_for_each_ftrace_rec() is a double loop. + * module text shares the pg. If a record is + * not part of this module, then skip this pg, + * which the "break" will do. + */ + if (!within_module(rec->ip, mod)) + break; + + /* Weak functions should still be ignored */ + if (!test_for_valid_rec(rec)) { + /* Clear all other flags. Should not be enabled anyway */ + rec->flags = FTRACE_FL_DISABLED; + continue; + } + + cnt = 0; + + /* + * When adding a module, we need to check if tracers are + * currently enabled and if they are, and can trace this record, + * we need to enable the module functions as well as update the + * reference counts for those function records. 
+ */ + if (ftrace_start_up) + cnt += referenced_filters(rec); + + rec->flags &= ~FTRACE_FL_DISABLED; + rec->flags += cnt; + + if (ftrace_start_up && cnt) { + int failed = __ftrace_replace_code(rec, 1); + if (failed) { + ftrace_bug(failed, rec); + goto out_loop; + } + } + + } while_for_each_ftrace_rec(); + + out_loop: + if (ftrace_start_up) + ftrace_arch_code_modify_post_process(); + + out_unlock: + mutex_unlock(&ftrace_lock); + + process_cached_mods(mod->name); +} + +void ftrace_module_init(struct module *mod) +{ + int ret; + + if (ftrace_disabled || !mod->num_ftrace_callsites) + return; + + ret = ftrace_process_locs(mod, mod->ftrace_callsites, + mod->ftrace_callsites + mod->num_ftrace_callsites); + if (ret) + pr_warn("ftrace: failed to allocate entries for module '%s' functions\n", + mod->name); +} + +static void save_ftrace_mod_rec(struct ftrace_mod_map *mod_map, + struct dyn_ftrace *rec) +{ + struct ftrace_mod_func *mod_func; + unsigned long symsize; + unsigned long offset; + char str[KSYM_SYMBOL_LEN]; + char *modname; + const char *ret; + + ret = kallsyms_lookup(rec->ip, &symsize, &offset, &modname, str); + if (!ret) + return; + + mod_func = kmalloc(sizeof(*mod_func), GFP_KERNEL); + if (!mod_func) + return; + + mod_func->name = kstrdup(str, GFP_KERNEL); + if (!mod_func->name) { + kfree(mod_func); + return; + } + + mod_func->ip = rec->ip - offset; + mod_func->size = symsize; + + mod_map->num_funcs++; + + list_add_rcu(&mod_func->list, &mod_map->funcs); +} + +static struct ftrace_mod_map * +allocate_ftrace_mod_map(struct module *mod, + unsigned long start, unsigned long end) +{ + struct ftrace_mod_map *mod_map; + + mod_map = kmalloc(sizeof(*mod_map), GFP_KERNEL); + if (!mod_map) + return NULL; + + mod_map->mod = mod; + mod_map->start_addr = start; + mod_map->end_addr = end; + mod_map->num_funcs = 0; + + INIT_LIST_HEAD_RCU(&mod_map->funcs); + + list_add_rcu(&mod_map->list, &ftrace_mod_maps); + + return mod_map; +} + +static const char * +ftrace_func_address_lookup(struct ftrace_mod_map *mod_map, + unsigned long addr, unsigned long *size, + unsigned long *off, char *sym) +{ + struct ftrace_mod_func *found_func = NULL; + struct ftrace_mod_func *mod_func; + + list_for_each_entry_rcu(mod_func, &mod_map->funcs, list) { + if (addr >= mod_func->ip && + addr < mod_func->ip + mod_func->size) { + found_func = mod_func; + break; + } + } + + if (found_func) { + if (size) + *size = found_func->size; + if (off) + *off = addr - found_func->ip; + if (sym) + strscpy(sym, found_func->name, KSYM_NAME_LEN); + + return found_func->name; + } + + return NULL; +} + +const char * +ftrace_mod_address_lookup(unsigned long addr, unsigned long *size, + unsigned long *off, char **modname, char *sym) +{ + struct ftrace_mod_map *mod_map; + const char *ret = NULL; + + /* mod_map is freed via call_rcu() */ + preempt_disable(); + list_for_each_entry_rcu(mod_map, &ftrace_mod_maps, list) { + ret = ftrace_func_address_lookup(mod_map, addr, size, off, sym); + if (ret) { + if (modname) + *modname = mod_map->mod->name; + break; + } + } + preempt_enable(); + + return ret; +} + +int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value, + char *type, char *name, + char *module_name, int *exported) +{ + struct ftrace_mod_map *mod_map; + struct ftrace_mod_func *mod_func; + int ret; + + preempt_disable(); + list_for_each_entry_rcu(mod_map, &ftrace_mod_maps, list) { + + if (symnum >= mod_map->num_funcs) { + symnum -= mod_map->num_funcs; + continue; + } + + list_for_each_entry_rcu(mod_func, &mod_map->funcs, list) { + if 
(symnum > 1) { + symnum--; + continue; + } + + *value = mod_func->ip; + *type = 'T'; + strscpy(name, mod_func->name, KSYM_NAME_LEN); + strscpy(module_name, mod_map->mod->name, MODULE_NAME_LEN); + *exported = 1; + preempt_enable(); + return 0; + } + WARN_ON(1); + break; + } + ret = ftrace_get_trampoline_kallsym(symnum, value, type, name, + module_name, exported); + preempt_enable(); + return ret; +} + +#else +static void save_ftrace_mod_rec(struct ftrace_mod_map *mod_map, + struct dyn_ftrace *rec) { } +static inline struct ftrace_mod_map * +allocate_ftrace_mod_map(struct module *mod, + unsigned long start, unsigned long end) +{ + return NULL; +} +int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value, + char *type, char *name, char *module_name, + int *exported) +{ + int ret; + + preempt_disable(); + ret = ftrace_get_trampoline_kallsym(symnum, value, type, name, + module_name, exported); + preempt_enable(); + return ret; +} +#endif /* CONFIG_MODULES */ + +struct ftrace_init_func { + struct list_head list; + unsigned long ip; +}; + +/* Clear any init ips from hashes */ +static void +clear_func_from_hash(struct ftrace_init_func *func, struct ftrace_hash *hash) +{ + struct ftrace_func_entry *entry; + + entry = ftrace_lookup_ip(hash, func->ip); + /* + * Do not allow this rec to match again. + * Yeah, it may waste some memory, but will be removed + * if/when the hash is modified again. + */ + if (entry) + entry->ip = 0; +} + +static void +clear_func_from_hashes(struct ftrace_init_func *func) +{ + struct trace_array *tr; + + mutex_lock(&trace_types_lock); + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (!tr->ops || !tr->ops->func_hash) + continue; + mutex_lock(&tr->ops->func_hash->regex_lock); + clear_func_from_hash(func, tr->ops->func_hash->filter_hash); + clear_func_from_hash(func, tr->ops->func_hash->notrace_hash); + mutex_unlock(&tr->ops->func_hash->regex_lock); + } + mutex_unlock(&trace_types_lock); +} + +static void add_to_clear_hash_list(struct list_head *clear_list, + struct dyn_ftrace *rec) +{ + struct ftrace_init_func *func; + + func = kmalloc(sizeof(*func), GFP_KERNEL); + if (!func) { + MEM_FAIL(1, "alloc failure, ftrace filter could be stale\n"); + return; + } + + func->ip = rec->ip; + list_add(&func->list, clear_list); +} + +void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr) +{ + unsigned long start = (unsigned long)(start_ptr); + unsigned long end = (unsigned long)(end_ptr); + struct ftrace_page **last_pg = &ftrace_pages_start; + struct ftrace_page *pg; + struct dyn_ftrace *rec; + struct dyn_ftrace key; + struct ftrace_mod_map *mod_map = NULL; + struct ftrace_init_func *func, *func_next; + LIST_HEAD(clear_hash); + + key.ip = start; + key.flags = end; /* overload flags, as it is unsigned long */ + + mutex_lock(&ftrace_lock); + + /* + * If we are freeing module init memory, then check if + * any tracer is active. If so, we need to save a mapping of + * the module functions being freed with the address. 
+ */ + if (mod && ftrace_ops_list != &ftrace_list_end) + mod_map = allocate_ftrace_mod_map(mod, start, end); + + for (pg = ftrace_pages_start; pg; last_pg = &pg->next, pg = *last_pg) { + if (end < pg->records[0].ip || + start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE)) + continue; + again: + rec = bsearch(&key, pg->records, pg->index, + sizeof(struct dyn_ftrace), + ftrace_cmp_recs); + if (!rec) + continue; + + /* rec will be cleared from hashes after ftrace_lock unlock */ + add_to_clear_hash_list(&clear_hash, rec); + + if (mod_map) + save_ftrace_mod_rec(mod_map, rec); + + pg->index--; + ftrace_update_tot_cnt--; + if (!pg->index) { + *last_pg = pg->next; + if (pg->records) { + free_pages((unsigned long)pg->records, pg->order); + ftrace_number_of_pages -= 1 << pg->order; + } + ftrace_number_of_groups--; + kfree(pg); + pg = container_of(last_pg, struct ftrace_page, next); + if (!(*last_pg)) + ftrace_pages = pg; + continue; + } + memmove(rec, rec + 1, + (pg->index - (rec - pg->records)) * sizeof(*rec)); + /* More than one function may be in this block */ + goto again; + } + mutex_unlock(&ftrace_lock); + + list_for_each_entry_safe(func, func_next, &clear_hash, list) { + clear_func_from_hashes(func); + kfree(func); + } +} + +void __init ftrace_free_init_mem(void) +{ + void *start = (void *)(&__init_begin); + void *end = (void *)(&__init_end); + + ftrace_boot_snapshot(); + + ftrace_free_mem(NULL, start, end); +} + +int __init __weak ftrace_dyn_arch_init(void) +{ + return 0; +} + +void __init ftrace_init(void) +{ + extern unsigned long __start_mcount_loc[]; + extern unsigned long __stop_mcount_loc[]; + unsigned long count, flags; + int ret; + + local_irq_save(flags); + ret = ftrace_dyn_arch_init(); + local_irq_restore(flags); + if (ret) + goto failed; + + count = __stop_mcount_loc - __start_mcount_loc; + if (!count) { + pr_info("ftrace: No functions to be traced?\n"); + goto failed; + } + + pr_info("ftrace: allocating %ld entries in %ld pages\n", + count, DIV_ROUND_UP(count, ENTRIES_PER_PAGE)); + + ret = ftrace_process_locs(NULL, + __start_mcount_loc, + __stop_mcount_loc); + if (ret) { + pr_warn("ftrace: failed to allocate entries for functions\n"); + goto failed; + } + + pr_info("ftrace: allocated %ld pages with %ld groups\n", + ftrace_number_of_pages, ftrace_number_of_groups); + + last_ftrace_enabled = ftrace_enabled = 1; + + set_ftrace_early_filters(); + + return; + failed: + ftrace_disabled = 1; +} + +/* Do nothing if arch does not support this */ +void __weak arch_ftrace_update_trampoline(struct ftrace_ops *ops) +{ +} + +static void ftrace_update_trampoline(struct ftrace_ops *ops) +{ + unsigned long trampoline = ops->trampoline; + + arch_ftrace_update_trampoline(ops); + if (ops->trampoline && ops->trampoline != trampoline && + (ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) { + /* Add to kallsyms before the perf events */ + ftrace_add_trampoline_to_kallsyms(ops); + perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, + ops->trampoline, ops->trampoline_size, false, + FTRACE_TRAMPOLINE_SYM); + /* + * Record the perf text poke event after the ksymbol register + * event. 
+ */ + perf_event_text_poke((void *)ops->trampoline, NULL, 0, + (void *)ops->trampoline, + ops->trampoline_size); + } +} + +void ftrace_init_trace_array(struct trace_array *tr) +{ + INIT_LIST_HEAD(&tr->func_probes); + INIT_LIST_HEAD(&tr->mod_trace); + INIT_LIST_HEAD(&tr->mod_notrace); +} +#else + +struct ftrace_ops global_ops = { + .func = ftrace_stub, + .flags = FTRACE_OPS_FL_INITIALIZED | + FTRACE_OPS_FL_PID, +}; + +static int __init ftrace_nodyn_init(void) +{ + ftrace_enabled = 1; + return 0; +} +core_initcall(ftrace_nodyn_init); + +static inline int ftrace_init_dyn_tracefs(struct dentry *d_tracer) { return 0; } +static inline void ftrace_startup_all(int command) { } + +static void ftrace_update_trampoline(struct ftrace_ops *ops) +{ +} + +#endif /* CONFIG_DYNAMIC_FTRACE */ + +__init void ftrace_init_global_array_ops(struct trace_array *tr) +{ + tr->ops = &global_ops; + tr->ops->private = tr; + ftrace_init_trace_array(tr); +} + +void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func) +{ + /* If we filter on pids, update to use the pid function */ + if (tr->flags & TRACE_ARRAY_FL_GLOBAL) { + if (WARN_ON(tr->ops->func != ftrace_stub)) + printk("ftrace ops had %pS for function\n", + tr->ops->func); + } + tr->ops->func = func; + tr->ops->private = tr; +} + +void ftrace_reset_array_ops(struct trace_array *tr) +{ + tr->ops->func = ftrace_stub; +} + +static nokprobe_inline void +__ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *ignored, struct ftrace_regs *fregs) +{ + struct pt_regs *regs = ftrace_get_regs(fregs); + struct ftrace_ops *op; + int bit; + + /* + * The ftrace_test_and_set_recursion() will disable preemption, + * which is required since some of the ops may be dynamically + * allocated, they must be freed after a synchronize_rcu(). + */ + bit = trace_test_and_set_recursion(ip, parent_ip, TRACE_LIST_START); + if (bit < 0) + return; + + do_for_each_ftrace_op(op, ftrace_ops_list) { + /* Stub functions don't need to be called nor tested */ + if (op->flags & FTRACE_OPS_FL_STUB) + continue; + /* + * Check the following for each ops before calling their func: + * if RCU flag is set, then rcu_is_watching() must be true + * Otherwise test if the ip matches the ops filter + * + * If any of the above fails then the op->func() is not executed. + */ + if ((!(op->flags & FTRACE_OPS_FL_RCU) || rcu_is_watching()) && + ftrace_ops_test(op, ip, regs)) { + if (FTRACE_WARN_ON(!op->func)) { + pr_warn("op=%p %pS\n", op, op); + goto out; + } + op->func(ip, parent_ip, op, fregs); + } + } while_for_each_ftrace_op(op); +out: + trace_clear_recursion(bit); +} + +/* + * Some archs only support passing ip and parent_ip. Even though + * the list function ignores the op parameter, we do not want any + * C side effects, where a function is called without the caller + * sending a third parameter. + * Archs are to support both the regs and ftrace_ops at the same time. + * If they support ftrace_ops, it is assumed they support regs. + * If call backs want to use regs, they must either check for regs + * being NULL, or CONFIG_DYNAMIC_FTRACE_WITH_REGS. + * Note, CONFIG_DYNAMIC_FTRACE_WITH_REGS expects a full regs to be saved. + * An architecture can pass partial regs with ftrace_ops and still + * set the ARCH_SUPPORTS_FTRACE_OPS. + * + * In vmlinux.lds.h, ftrace_ops_list_func() is defined to be + * arch_ftrace_ops_list_func. 
+ */ +#if ARCH_SUPPORTS_FTRACE_OPS +void arch_ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct ftrace_regs *fregs) +{ + __ftrace_ops_list_func(ip, parent_ip, NULL, fregs); +} +#else +void arch_ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip) +{ + __ftrace_ops_list_func(ip, parent_ip, NULL, NULL); +} +#endif +NOKPROBE_SYMBOL(arch_ftrace_ops_list_func); + +/* + * If there's only one function registered but it does not support + * recursion, needs RCU protection, then this function will be called + * by the mcount trampoline. + */ +static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct ftrace_regs *fregs) +{ + int bit; + + bit = trace_test_and_set_recursion(ip, parent_ip, TRACE_LIST_START); + if (bit < 0) + return; + + if (!(op->flags & FTRACE_OPS_FL_RCU) || rcu_is_watching()) + op->func(ip, parent_ip, op, fregs); + + trace_clear_recursion(bit); +} +NOKPROBE_SYMBOL(ftrace_ops_assist_func); + +/** + * ftrace_ops_get_func - get the function a trampoline should call + * @ops: the ops to get the function for + * + * Normally the mcount trampoline will call the ops->func, but there + * are times that it should not. For example, if the ops does not + * have its own recursion protection, then it should call the + * ftrace_ops_assist_func() instead. + * + * Returns the function that the trampoline should call for @ops. + */ +ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops) +{ + /* + * If the function does not handle recursion or needs to be RCU safe, + * then we need to call the assist handler. + */ + if (ops->flags & (FTRACE_OPS_FL_RECURSION | + FTRACE_OPS_FL_RCU)) + return ftrace_ops_assist_func; + + return ops->func; +} + +static void +ftrace_filter_pid_sched_switch_probe(void *data, bool preempt, + struct task_struct *prev, + struct task_struct *next, + unsigned int prev_state) +{ + struct trace_array *tr = data; + struct trace_pid_list *pid_list; + struct trace_pid_list *no_pid_list; + + pid_list = rcu_dereference_sched(tr->function_pids); + no_pid_list = rcu_dereference_sched(tr->function_no_pids); + + if (trace_ignore_this_task(pid_list, no_pid_list, next)) + this_cpu_write(tr->array_buffer.data->ftrace_ignore_pid, + FTRACE_PID_IGNORE); + else + this_cpu_write(tr->array_buffer.data->ftrace_ignore_pid, + next->pid); +} + +static void +ftrace_pid_follow_sched_process_fork(void *data, + struct task_struct *self, + struct task_struct *task) +{ + struct trace_pid_list *pid_list; + struct trace_array *tr = data; + + pid_list = rcu_dereference_sched(tr->function_pids); + trace_filter_add_remove_task(pid_list, self, task); + + pid_list = rcu_dereference_sched(tr->function_no_pids); + trace_filter_add_remove_task(pid_list, self, task); +} + +static void +ftrace_pid_follow_sched_process_exit(void *data, struct task_struct *task) +{ + struct trace_pid_list *pid_list; + struct trace_array *tr = data; + + pid_list = rcu_dereference_sched(tr->function_pids); + trace_filter_add_remove_task(pid_list, NULL, task); + + pid_list = rcu_dereference_sched(tr->function_no_pids); + trace_filter_add_remove_task(pid_list, NULL, task); +} + +void ftrace_pid_follow_fork(struct trace_array *tr, bool enable) +{ + if (enable) { + register_trace_sched_process_fork(ftrace_pid_follow_sched_process_fork, + tr); + register_trace_sched_process_free(ftrace_pid_follow_sched_process_exit, + tr); + } else { + unregister_trace_sched_process_fork(ftrace_pid_follow_sched_process_fork, + tr); + 
unregister_trace_sched_process_free(ftrace_pid_follow_sched_process_exit, + tr); + } +} + +static void clear_ftrace_pids(struct trace_array *tr, int type) +{ + struct trace_pid_list *pid_list; + struct trace_pid_list *no_pid_list; + int cpu; + + pid_list = rcu_dereference_protected(tr->function_pids, + lockdep_is_held(&ftrace_lock)); + no_pid_list = rcu_dereference_protected(tr->function_no_pids, + lockdep_is_held(&ftrace_lock)); + + /* Make sure there's something to do */ + if (!pid_type_enabled(type, pid_list, no_pid_list)) + return; + + /* See if the pids still need to be checked after this */ + if (!still_need_pid_events(type, pid_list, no_pid_list)) { + unregister_trace_sched_switch(ftrace_filter_pid_sched_switch_probe, tr); + for_each_possible_cpu(cpu) + per_cpu_ptr(tr->array_buffer.data, cpu)->ftrace_ignore_pid = FTRACE_PID_TRACE; + } + + if (type & TRACE_PIDS) + rcu_assign_pointer(tr->function_pids, NULL); + + if (type & TRACE_NO_PIDS) + rcu_assign_pointer(tr->function_no_pids, NULL); + + /* Wait till all users are no longer using pid filtering */ + synchronize_rcu(); + + if ((type & TRACE_PIDS) && pid_list) + trace_pid_list_free(pid_list); + + if ((type & TRACE_NO_PIDS) && no_pid_list) + trace_pid_list_free(no_pid_list); +} + +void ftrace_clear_pids(struct trace_array *tr) +{ + mutex_lock(&ftrace_lock); + + clear_ftrace_pids(tr, TRACE_PIDS | TRACE_NO_PIDS); + + mutex_unlock(&ftrace_lock); +} + +static void ftrace_pid_reset(struct trace_array *tr, int type) +{ + mutex_lock(&ftrace_lock); + clear_ftrace_pids(tr, type); + + ftrace_update_pid_func(); + ftrace_startup_all(0); + + mutex_unlock(&ftrace_lock); +} + +/* Greater than any max PID */ +#define FTRACE_NO_PIDS (void *)(PID_MAX_LIMIT + 1) + +static void *fpid_start(struct seq_file *m, loff_t *pos) + __acquires(RCU) +{ + struct trace_pid_list *pid_list; + struct trace_array *tr = m->private; + + mutex_lock(&ftrace_lock); + rcu_read_lock_sched(); + + pid_list = rcu_dereference_sched(tr->function_pids); + + if (!pid_list) + return !(*pos) ? FTRACE_NO_PIDS : NULL; + + return trace_pid_start(pid_list, pos); +} + +static void *fpid_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct trace_array *tr = m->private; + struct trace_pid_list *pid_list = rcu_dereference_sched(tr->function_pids); + + if (v == FTRACE_NO_PIDS) { + (*pos)++; + return NULL; + } + return trace_pid_next(pid_list, v, pos); +} + +static void fpid_stop(struct seq_file *m, void *p) + __releases(RCU) +{ + rcu_read_unlock_sched(); + mutex_unlock(&ftrace_lock); +} + +static int fpid_show(struct seq_file *m, void *v) +{ + if (v == FTRACE_NO_PIDS) { + seq_puts(m, "no pid\n"); + return 0; + } + + return trace_pid_show(m, v); +} + +static const struct seq_operations ftrace_pid_sops = { + .start = fpid_start, + .next = fpid_next, + .stop = fpid_stop, + .show = fpid_show, +}; + +static void *fnpid_start(struct seq_file *m, loff_t *pos) + __acquires(RCU) +{ + struct trace_pid_list *pid_list; + struct trace_array *tr = m->private; + + mutex_lock(&ftrace_lock); + rcu_read_lock_sched(); + + pid_list = rcu_dereference_sched(tr->function_no_pids); + + if (!pid_list) + return !(*pos) ? 
FTRACE_NO_PIDS : NULL; + + return trace_pid_start(pid_list, pos); +} + +static void *fnpid_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct trace_array *tr = m->private; + struct trace_pid_list *pid_list = rcu_dereference_sched(tr->function_no_pids); + + if (v == FTRACE_NO_PIDS) { + (*pos)++; + return NULL; + } + return trace_pid_next(pid_list, v, pos); +} + +static const struct seq_operations ftrace_no_pid_sops = { + .start = fnpid_start, + .next = fnpid_next, + .stop = fpid_stop, + .show = fpid_show, +}; + +static int pid_open(struct inode *inode, struct file *file, int type) +{ + const struct seq_operations *seq_ops; + struct trace_array *tr = inode->i_private; + struct seq_file *m; + int ret = 0; + + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; + + if ((file->f_mode & FMODE_WRITE) && + (file->f_flags & O_TRUNC)) + ftrace_pid_reset(tr, type); + + switch (type) { + case TRACE_PIDS: + seq_ops = &ftrace_pid_sops; + break; + case TRACE_NO_PIDS: + seq_ops = &ftrace_no_pid_sops; + break; + default: + trace_array_put(tr); + WARN_ON_ONCE(1); + return -EINVAL; + } + + ret = seq_open(file, seq_ops); + if (ret < 0) { + trace_array_put(tr); + } else { + m = file->private_data; + /* copy tr over to seq ops */ + m->private = tr; + } + + return ret; +} + +static int +ftrace_pid_open(struct inode *inode, struct file *file) +{ + return pid_open(inode, file, TRACE_PIDS); +} + +static int +ftrace_no_pid_open(struct inode *inode, struct file *file) +{ + return pid_open(inode, file, TRACE_NO_PIDS); +} + +static void ignore_task_cpu(void *data) +{ + struct trace_array *tr = data; + struct trace_pid_list *pid_list; + struct trace_pid_list *no_pid_list; + + /* + * This function is called by on_each_cpu() while the + * event_mutex is held. + */ + pid_list = rcu_dereference_protected(tr->function_pids, + mutex_is_locked(&ftrace_lock)); + no_pid_list = rcu_dereference_protected(tr->function_no_pids, + mutex_is_locked(&ftrace_lock)); + + if (trace_ignore_this_task(pid_list, no_pid_list, current)) + this_cpu_write(tr->array_buffer.data->ftrace_ignore_pid, + FTRACE_PID_IGNORE); + else + this_cpu_write(tr->array_buffer.data->ftrace_ignore_pid, + current->pid); +} + +static ssize_t +pid_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos, int type) +{ + struct seq_file *m = filp->private_data; + struct trace_array *tr = m->private; + struct trace_pid_list *filtered_pids; + struct trace_pid_list *other_pids; + struct trace_pid_list *pid_list; + ssize_t ret; + + if (!cnt) + return 0; + + mutex_lock(&ftrace_lock); + + switch (type) { + case TRACE_PIDS: + filtered_pids = rcu_dereference_protected(tr->function_pids, + lockdep_is_held(&ftrace_lock)); + other_pids = rcu_dereference_protected(tr->function_no_pids, + lockdep_is_held(&ftrace_lock)); + break; + case TRACE_NO_PIDS: + filtered_pids = rcu_dereference_protected(tr->function_no_pids, + lockdep_is_held(&ftrace_lock)); + other_pids = rcu_dereference_protected(tr->function_pids, + lockdep_is_held(&ftrace_lock)); + break; + default: + ret = -EINVAL; + WARN_ON_ONCE(1); + goto out; + } + + ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt); + if (ret < 0) + goto out; + + switch (type) { + case TRACE_PIDS: + rcu_assign_pointer(tr->function_pids, pid_list); + break; + case TRACE_NO_PIDS: + rcu_assign_pointer(tr->function_no_pids, pid_list); + break; + } + + + if (filtered_pids) { + synchronize_rcu(); + trace_pid_list_free(filtered_pids); + } else if (pid_list && !other_pids) { + /* Register a probe to set whether to 
ignore the tracing of a task */ + register_trace_sched_switch(ftrace_filter_pid_sched_switch_probe, tr); + } + + /* + * Ignoring of pids is done at task switch. But we have to + * check for those tasks that are currently running. + * Always do this in case a pid was appended or removed. + */ + on_each_cpu(ignore_task_cpu, tr, 1); + + ftrace_update_pid_func(); + ftrace_startup_all(0); + out: + mutex_unlock(&ftrace_lock); + + if (ret > 0) + *ppos += ret; + + return ret; +} + +static ssize_t +ftrace_pid_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return pid_write(filp, ubuf, cnt, ppos, TRACE_PIDS); +} + +static ssize_t +ftrace_no_pid_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return pid_write(filp, ubuf, cnt, ppos, TRACE_NO_PIDS); +} + +static int +ftrace_pid_release(struct inode *inode, struct file *file) +{ + struct trace_array *tr = inode->i_private; + + trace_array_put(tr); + + return seq_release(inode, file); +} + +static const struct file_operations ftrace_pid_fops = { + .open = ftrace_pid_open, + .write = ftrace_pid_write, + .read = seq_read, + .llseek = tracing_lseek, + .release = ftrace_pid_release, +}; + +static const struct file_operations ftrace_no_pid_fops = { + .open = ftrace_no_pid_open, + .write = ftrace_no_pid_write, + .read = seq_read, + .llseek = tracing_lseek, + .release = ftrace_pid_release, +}; + +void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer) +{ + trace_create_file("set_ftrace_pid", TRACE_MODE_WRITE, d_tracer, + tr, &ftrace_pid_fops); + trace_create_file("set_ftrace_notrace_pid", TRACE_MODE_WRITE, + d_tracer, tr, &ftrace_no_pid_fops); +} + +void __init ftrace_init_tracefs_toplevel(struct trace_array *tr, + struct dentry *d_tracer) +{ + /* Only the top level directory has the dyn_tracefs and profile */ + WARN_ON(!(tr->flags & TRACE_ARRAY_FL_GLOBAL)); + + ftrace_init_dyn_tracefs(d_tracer); + ftrace_profile_tracefs(d_tracer); +} + +/** + * ftrace_kill - kill ftrace + * + * This function should be used by panic code. It stops ftrace + * but in a not so nice way. If you need to simply kill ftrace + * from a non-atomic section, use ftrace_kill. + */ +void ftrace_kill(void) +{ + ftrace_disabled = 1; + ftrace_enabled = 0; + ftrace_trace_function = ftrace_stub; +} + +/** + * ftrace_is_dead - Test if ftrace is dead or not. + * + * Returns 1 if ftrace is "dead", zero otherwise. + */ +int ftrace_is_dead(void) +{ + return ftrace_disabled; +} + +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +/* + * When registering ftrace_ops with IPMODIFY, it is necessary to make sure + * it doesn't conflict with any direct ftrace_ops. If there is existing + * direct ftrace_ops on a kernel function being patched, call + * FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_PEER on it to enable sharing. + * + * @ops: ftrace_ops being registered. + * + * Returns: + * 0 on success; + * Negative on failure. 
+ */ +static int prepare_direct_functions_for_ipmodify(struct ftrace_ops *ops) +{ + struct ftrace_func_entry *entry; + struct ftrace_hash *hash; + struct ftrace_ops *op; + int size, i, ret; + + lockdep_assert_held_once(&direct_mutex); + + if (!(ops->flags & FTRACE_OPS_FL_IPMODIFY)) + return 0; + + hash = ops->func_hash->filter_hash; + size = 1 << hash->size_bits; + for (i = 0; i < size; i++) { + hlist_for_each_entry(entry, &hash->buckets[i], hlist) { + unsigned long ip = entry->ip; + bool found_op = false; + + mutex_lock(&ftrace_lock); + do_for_each_ftrace_op(op, ftrace_ops_list) { + if (!(op->flags & FTRACE_OPS_FL_DIRECT)) + continue; + if (ops_references_ip(op, ip)) { + found_op = true; + break; + } + } while_for_each_ftrace_op(op); + mutex_unlock(&ftrace_lock); + + if (found_op) { + if (!op->ops_func) + return -EBUSY; + + ret = op->ops_func(op, FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_PEER); + if (ret) + return ret; + } + } + } + + return 0; +} + +/* + * Similar to prepare_direct_functions_for_ipmodify, clean up after ops + * with IPMODIFY is unregistered. The cleanup is optional for most DIRECT + * ops. + */ +static void cleanup_direct_functions_after_ipmodify(struct ftrace_ops *ops) +{ + struct ftrace_func_entry *entry; + struct ftrace_hash *hash; + struct ftrace_ops *op; + int size, i; + + if (!(ops->flags & FTRACE_OPS_FL_IPMODIFY)) + return; + + mutex_lock(&direct_mutex); + + hash = ops->func_hash->filter_hash; + size = 1 << hash->size_bits; + for (i = 0; i < size; i++) { + hlist_for_each_entry(entry, &hash->buckets[i], hlist) { + unsigned long ip = entry->ip; + bool found_op = false; + + mutex_lock(&ftrace_lock); + do_for_each_ftrace_op(op, ftrace_ops_list) { + if (!(op->flags & FTRACE_OPS_FL_DIRECT)) + continue; + if (ops_references_ip(op, ip)) { + found_op = true; + break; + } + } while_for_each_ftrace_op(op); + mutex_unlock(&ftrace_lock); + + /* The cleanup is optional, ignore any errors */ + if (found_op && op->ops_func) + op->ops_func(op, FTRACE_OPS_CMD_DISABLE_SHARE_IPMODIFY_PEER); + } + } + mutex_unlock(&direct_mutex); +} + +#define lock_direct_mutex() mutex_lock(&direct_mutex) +#define unlock_direct_mutex() mutex_unlock(&direct_mutex) + +#else /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ + +static int prepare_direct_functions_for_ipmodify(struct ftrace_ops *ops) +{ + return 0; +} + +static void cleanup_direct_functions_after_ipmodify(struct ftrace_ops *ops) +{ +} + +#define lock_direct_mutex() do { } while (0) +#define unlock_direct_mutex() do { } while (0) + +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ + +/* + * Similar to register_ftrace_function, except we don't lock direct_mutex. + */ +static int register_ftrace_function_nolock(struct ftrace_ops *ops) +{ + int ret; + + ftrace_ops_init(ops); + + mutex_lock(&ftrace_lock); + + ret = ftrace_startup(ops, 0); + + mutex_unlock(&ftrace_lock); + + return ret; +} + +/** + * register_ftrace_function - register a function for profiling + * @ops: ops structure that holds the function for profiling. + * + * Register a function to be called by all functions in the + * kernel. + * + * Note: @ops->func and all the functions it calls must be labeled + * with "notrace", otherwise it will go into a + * recursive loop. 
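+ *
+ * A minimal usage sketch (illustration only, not part of this file);
+ * my_callback/my_ops are hypothetical names and the callback body is
+ * elided. Restricting the ops to specific functions first (for example
+ * with ftrace_set_filter()) is optional but typical:
+ *
+ *	static void notrace my_callback(unsigned long ip,
+ *					unsigned long parent_ip,
+ *					struct ftrace_ops *op,
+ *					struct ftrace_regs *fregs)
+ *	{
+ *	}
+ *
+ *	static struct ftrace_ops my_ops = {
+ *		.func	= my_callback,
+ *	};
+ *
+ *	register_ftrace_function(&my_ops);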
+ */ +int register_ftrace_function(struct ftrace_ops *ops) +{ + int ret; + + lock_direct_mutex(); + ret = prepare_direct_functions_for_ipmodify(ops); + if (ret < 0) + goto out_unlock; + + ret = register_ftrace_function_nolock(ops); + +out_unlock: + unlock_direct_mutex(); + return ret; +} +EXPORT_SYMBOL_GPL(register_ftrace_function); + +/** + * unregister_ftrace_function - unregister a function for profiling. + * @ops: ops structure that holds the function to unregister + * + * Unregister a function that was added to be called by ftrace profiling. + */ +int unregister_ftrace_function(struct ftrace_ops *ops) +{ + int ret; + + mutex_lock(&ftrace_lock); + ret = ftrace_shutdown(ops, 0); + mutex_unlock(&ftrace_lock); + + cleanup_direct_functions_after_ipmodify(ops); + return ret; +} +EXPORT_SYMBOL_GPL(unregister_ftrace_function); + +static int symbols_cmp(const void *a, const void *b) +{ + const char **str_a = (const char **) a; + const char **str_b = (const char **) b; + + return strcmp(*str_a, *str_b); +} + +struct kallsyms_data { + unsigned long *addrs; + const char **syms; + size_t cnt; + size_t found; +}; + +/* This function gets called for all kernel and module symbols + * and returns 1 in case we resolved all the requested symbols, + * 0 otherwise. + */ +static int kallsyms_callback(void *data, const char *name, unsigned long addr) +{ + struct kallsyms_data *args = data; + const char **sym; + int idx; + + sym = bsearch(&name, args->syms, args->cnt, sizeof(*args->syms), symbols_cmp); + if (!sym) + return 0; + + idx = sym - args->syms; + if (args->addrs[idx]) + return 0; + + if (!ftrace_location(addr)) + return 0; + + args->addrs[idx] = addr; + args->found++; + return args->found == args->cnt ? 1 : 0; +} + +/** + * ftrace_lookup_symbols - Lookup addresses for array of symbols + * + * @sorted_syms: array of symbols pointers symbols to resolve, + * must be alphabetically sorted + * @cnt: number of symbols/addresses in @syms/@addrs arrays + * @addrs: array for storing resulting addresses + * + * This function looks up addresses for array of symbols provided in + * @syms array (must be alphabetically sorted) and stores them in + * @addrs array, which needs to be big enough to store at least @cnt + * addresses. + * + * This function returns 0 if all provided symbols are found, + * -ESRCH otherwise. + */ +int ftrace_lookup_symbols(const char **sorted_syms, size_t cnt, unsigned long *addrs) +{ + struct kallsyms_data args; + int found_all; + + memset(addrs, 0, sizeof(*addrs) * cnt); + args.addrs = addrs; + args.syms = sorted_syms; + args.cnt = cnt; + args.found = 0; + + found_all = kallsyms_on_each_symbol(kallsyms_callback, &args); + if (found_all) + return 0; + found_all = module_kallsyms_on_each_symbol(NULL, kallsyms_callback, &args); + return found_all ? 
0 : -ESRCH; +} + +#ifdef CONFIG_SYSCTL + +#ifdef CONFIG_DYNAMIC_FTRACE +static void ftrace_startup_sysctl(void) +{ + int command; + + if (unlikely(ftrace_disabled)) + return; + + /* Force update next time */ + saved_ftrace_func = NULL; + /* ftrace_start_up is true if we want ftrace running */ + if (ftrace_start_up) { + command = FTRACE_UPDATE_CALLS; + if (ftrace_graph_active) + command |= FTRACE_START_FUNC_RET; + ftrace_startup_enable(command); + } +} + +static void ftrace_shutdown_sysctl(void) +{ + int command; + + if (unlikely(ftrace_disabled)) + return; + + /* ftrace_start_up is true if ftrace is running */ + if (ftrace_start_up) { + command = FTRACE_DISABLE_CALLS; + if (ftrace_graph_active) + command |= FTRACE_STOP_FUNC_RET; + ftrace_run_update_code(command); + } +} +#else +# define ftrace_startup_sysctl() do { } while (0) +# define ftrace_shutdown_sysctl() do { } while (0) +#endif /* CONFIG_DYNAMIC_FTRACE */ + +static bool is_permanent_ops_registered(void) +{ + struct ftrace_ops *op; + + do_for_each_ftrace_op(op, ftrace_ops_list) { + if (op->flags & FTRACE_OPS_FL_PERMANENT) + return true; + } while_for_each_ftrace_op(op); + + return false; +} + +static int +ftrace_enable_sysctl(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int ret = -ENODEV; + + mutex_lock(&ftrace_lock); + + if (unlikely(ftrace_disabled)) + goto out; + + ret = proc_dointvec(table, write, buffer, lenp, ppos); + + if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled)) + goto out; + + if (ftrace_enabled) { + + /* we are starting ftrace again */ + if (rcu_dereference_protected(ftrace_ops_list, + lockdep_is_held(&ftrace_lock)) != &ftrace_list_end) + update_ftrace_function(); + + ftrace_startup_sysctl(); + + } else { + if (is_permanent_ops_registered()) { + ftrace_enabled = true; + ret = -EBUSY; + goto out; + } + + /* stopping ftrace calls (just send to ftrace_stub) */ + ftrace_trace_function = ftrace_stub; + + ftrace_shutdown_sysctl(); + } + + last_ftrace_enabled = !!ftrace_enabled; + out: + mutex_unlock(&ftrace_lock); + return ret; +} + +static struct ctl_table ftrace_sysctls[] = { + { + .procname = "ftrace_enabled", + .data = &ftrace_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = ftrace_enable_sysctl, + }, + {} +}; + +static int __init ftrace_sysctl_init(void) +{ + register_sysctl_init("kernel", ftrace_sysctls); + return 0; +} +late_initcall(ftrace_sysctl_init); +#endif diff --git a/kernel/trace/ftrace_internal.h b/kernel/trace/ftrace_internal.h new file mode 100644 index 0000000000..5012c04f92 --- /dev/null +++ b/kernel/trace/ftrace_internal.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_KERNEL_FTRACE_INTERNAL_H +#define _LINUX_KERNEL_FTRACE_INTERNAL_H + +int __register_ftrace_function(struct ftrace_ops *ops); +int __unregister_ftrace_function(struct ftrace_ops *ops); + +#ifdef CONFIG_FUNCTION_TRACER + +extern struct mutex ftrace_lock; +extern struct ftrace_ops global_ops; + +#ifdef CONFIG_DYNAMIC_FTRACE + +int ftrace_startup(struct ftrace_ops *ops, int command); +int ftrace_shutdown(struct ftrace_ops *ops, int command); +int ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs); + +#else /* !CONFIG_DYNAMIC_FTRACE */ + +/* Keep as macros so we do not need to define the commands */ +# define ftrace_startup(ops, command) \ + ({ \ + int ___ret = __register_ftrace_function(ops); \ + if (!___ret) \ + (ops)->flags |= FTRACE_OPS_FL_ENABLED; \ + ___ret; \ + }) +# define ftrace_shutdown(ops, command) \ + ({ \ + int 
___ret = __unregister_ftrace_function(ops); \ + if (!___ret) \ + (ops)->flags &= ~FTRACE_OPS_FL_ENABLED; \ + ___ret; \ + }) +static inline int +ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) +{ + return 1; +} +#endif /* CONFIG_DYNAMIC_FTRACE */ + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +extern int ftrace_graph_active; +void update_function_graph_func(void); +#else /* !CONFIG_FUNCTION_GRAPH_TRACER */ +# define ftrace_graph_active 0 +static inline void update_function_graph_func(void) { } +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + +#else /* !CONFIG_FUNCTION_TRACER */ +#endif /* CONFIG_FUNCTION_TRACER */ + +#endif diff --git a/kernel/trace/kprobe_event_gen_test.c b/kernel/trace/kprobe_event_gen_test.c new file mode 100644 index 0000000000..5a4b722b50 --- /dev/null +++ b/kernel/trace/kprobe_event_gen_test.c @@ -0,0 +1,276 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Test module for in-kernel kprobe event creation and generation. + * + * Copyright (C) 2019 Tom Zanussi <zanussi@kernel.org> + */ + +#include <linux/module.h> +#include <linux/trace_events.h> + +/* + * This module is a simple test of basic functionality for in-kernel + * kprobe/kretprobe event creation. The first test uses + * kprobe_event_gen_cmd_start(), kprobe_event_add_fields() and + * kprobe_event_gen_cmd_end() to create a kprobe event, which is then + * enabled in order to generate trace output. The second creates a + * kretprobe event using kretprobe_event_gen_cmd_start() and + * kretprobe_event_gen_cmd_end(), and is also then enabled. + * + * To test, select CONFIG_KPROBE_EVENT_GEN_TEST and build the module. + * Then: + * + * # insmod kernel/trace/kprobe_event_gen_test.ko + * # cat /sys/kernel/tracing/trace + * + * You should see many instances of the "gen_kprobe_test" and + * "gen_kretprobe_test" events in the trace buffer. + * + * To remove the events, remove the module: + * + * # rmmod kprobe_event_gen_test + * + */ + +static struct trace_event_file *gen_kprobe_test; +static struct trace_event_file *gen_kretprobe_test; + +#define KPROBE_GEN_TEST_FUNC "do_sys_open" + +/* X86 */ +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_32) +#define KPROBE_GEN_TEST_ARG0 "dfd=%ax" +#define KPROBE_GEN_TEST_ARG1 "filename=%dx" +#define KPROBE_GEN_TEST_ARG2 "flags=%cx" +#define KPROBE_GEN_TEST_ARG3 "mode=+4($stack)" + +/* ARM64 */ +#elif defined(CONFIG_ARM64) +#define KPROBE_GEN_TEST_ARG0 "dfd=%x0" +#define KPROBE_GEN_TEST_ARG1 "filename=%x1" +#define KPROBE_GEN_TEST_ARG2 "flags=%x2" +#define KPROBE_GEN_TEST_ARG3 "mode=%x3" + +/* ARM */ +#elif defined(CONFIG_ARM) +#define KPROBE_GEN_TEST_ARG0 "dfd=%r0" +#define KPROBE_GEN_TEST_ARG1 "filename=%r1" +#define KPROBE_GEN_TEST_ARG2 "flags=%r2" +#define KPROBE_GEN_TEST_ARG3 "mode=%r3" + +/* RISCV */ +#elif defined(CONFIG_RISCV) +#define KPROBE_GEN_TEST_ARG0 "dfd=%a0" +#define KPROBE_GEN_TEST_ARG1 "filename=%a1" +#define KPROBE_GEN_TEST_ARG2 "flags=%a2" +#define KPROBE_GEN_TEST_ARG3 "mode=%a3" + +/* others */ +#else +#define KPROBE_GEN_TEST_ARG0 NULL +#define KPROBE_GEN_TEST_ARG1 NULL +#define KPROBE_GEN_TEST_ARG2 NULL +#define KPROBE_GEN_TEST_ARG3 NULL +#endif + +static bool trace_event_file_is_valid(struct trace_event_file *input) +{ + return input && !IS_ERR(input); +} + +/* + * Test to make sure we can create a kprobe event, then add more + * fields. 
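+ *
+ * For orientation only (an approximation, not a captured command
+ * string): on x86_64 the command assembled below corresponds roughly
+ * to the text form
+ *
+ *	p:kprobes/gen_kprobe_test do_sys_open dfd=%ax filename=%dx flags=%cx mode=+4($stack)
+ *
+ * i.e. what would otherwise be written to the kprobe_events file by hand.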
+ */ +static int __init test_gen_kprobe_cmd(void) +{ + struct dynevent_cmd cmd; + char *buf; + int ret; + + /* Create a buffer to hold the generated command */ + buf = kzalloc(MAX_DYNEVENT_CMD_LEN, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + /* Before generating the command, initialize the cmd object */ + kprobe_event_cmd_init(&cmd, buf, MAX_DYNEVENT_CMD_LEN); + + /* + * Define the gen_kprobe_test event with the first 2 kprobe + * fields. + */ + ret = kprobe_event_gen_cmd_start(&cmd, "gen_kprobe_test", + KPROBE_GEN_TEST_FUNC, + KPROBE_GEN_TEST_ARG0, KPROBE_GEN_TEST_ARG1); + if (ret) + goto out; + + /* Use kprobe_event_add_fields to add the rest of the fields */ + + ret = kprobe_event_add_fields(&cmd, KPROBE_GEN_TEST_ARG2, KPROBE_GEN_TEST_ARG3); + if (ret) + goto out; + + /* + * This actually creates the event. + */ + ret = kprobe_event_gen_cmd_end(&cmd); + if (ret) + goto out; + + /* + * Now get the gen_kprobe_test event file. We need to prevent + * the instance and event from disappearing from underneath + * us, which trace_get_event_file() does (though in this case + * we're using the top-level instance which never goes away). + */ + gen_kprobe_test = trace_get_event_file(NULL, "kprobes", + "gen_kprobe_test"); + if (IS_ERR(gen_kprobe_test)) { + ret = PTR_ERR(gen_kprobe_test); + goto delete; + } + + /* Enable the event or you won't see anything */ + ret = trace_array_set_clr_event(gen_kprobe_test->tr, + "kprobes", "gen_kprobe_test", true); + if (ret) { + trace_put_event_file(gen_kprobe_test); + goto delete; + } + out: + kfree(buf); + return ret; + delete: + if (trace_event_file_is_valid(gen_kprobe_test)) + gen_kprobe_test = NULL; + /* We got an error after creating the event, delete it */ + kprobe_event_delete("gen_kprobe_test"); + goto out; +} + +/* + * Test to make sure we can create a kretprobe event. + */ +static int __init test_gen_kretprobe_cmd(void) +{ + struct dynevent_cmd cmd; + char *buf; + int ret; + + /* Create a buffer to hold the generated command */ + buf = kzalloc(MAX_DYNEVENT_CMD_LEN, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + /* Before generating the command, initialize the cmd object */ + kprobe_event_cmd_init(&cmd, buf, MAX_DYNEVENT_CMD_LEN); + + /* + * Define the kretprobe event. + */ + ret = kretprobe_event_gen_cmd_start(&cmd, "gen_kretprobe_test", + KPROBE_GEN_TEST_FUNC, + "$retval"); + if (ret) + goto out; + + /* + * This actually creates the event. + */ + ret = kretprobe_event_gen_cmd_end(&cmd); + if (ret) + goto out; + + /* + * Now get the gen_kretprobe_test event file. We need to + * prevent the instance and event from disappearing from + * underneath us, which trace_get_event_file() does (though in + * this case we're using the top-level instance which never + * goes away). 
+ */ + gen_kretprobe_test = trace_get_event_file(NULL, "kprobes", + "gen_kretprobe_test"); + if (IS_ERR(gen_kretprobe_test)) { + ret = PTR_ERR(gen_kretprobe_test); + goto delete; + } + + /* Enable the event or you won't see anything */ + ret = trace_array_set_clr_event(gen_kretprobe_test->tr, + "kprobes", "gen_kretprobe_test", true); + if (ret) { + trace_put_event_file(gen_kretprobe_test); + goto delete; + } + out: + kfree(buf); + return ret; + delete: + if (trace_event_file_is_valid(gen_kretprobe_test)) + gen_kretprobe_test = NULL; + /* We got an error after creating the event, delete it */ + kprobe_event_delete("gen_kretprobe_test"); + goto out; +} + +static int __init kprobe_event_gen_test_init(void) +{ + int ret; + + ret = test_gen_kprobe_cmd(); + if (ret) + return ret; + + ret = test_gen_kretprobe_cmd(); + if (ret) { + if (trace_event_file_is_valid(gen_kretprobe_test)) { + WARN_ON(trace_array_set_clr_event(gen_kretprobe_test->tr, + "kprobes", + "gen_kretprobe_test", false)); + trace_put_event_file(gen_kretprobe_test); + } + WARN_ON(kprobe_event_delete("gen_kretprobe_test")); + } + + return ret; +} + +static void __exit kprobe_event_gen_test_exit(void) +{ + if (trace_event_file_is_valid(gen_kprobe_test)) { + /* Disable the event or you can't remove it */ + WARN_ON(trace_array_set_clr_event(gen_kprobe_test->tr, + "kprobes", + "gen_kprobe_test", false)); + + /* Now give the file and instance back */ + trace_put_event_file(gen_kprobe_test); + } + + + /* Now unregister and free the event */ + WARN_ON(kprobe_event_delete("gen_kprobe_test")); + + if (trace_event_file_is_valid(gen_kretprobe_test)) { + /* Disable the event or you can't remove it */ + WARN_ON(trace_array_set_clr_event(gen_kretprobe_test->tr, + "kprobes", + "gen_kretprobe_test", false)); + + /* Now give the file and instance back */ + trace_put_event_file(gen_kretprobe_test); + } + + + /* Now unregister and free the event */ + WARN_ON(kprobe_event_delete("gen_kretprobe_test")); +} + +module_init(kprobe_event_gen_test_init) +module_exit(kprobe_event_gen_test_exit) + +MODULE_AUTHOR("Tom Zanussi"); +MODULE_DESCRIPTION("kprobe event generation test"); +MODULE_LICENSE("GPL v2"); diff --git a/kernel/trace/pid_list.c b/kernel/trace/pid_list.c new file mode 100644 index 0000000000..95106d02b3 --- /dev/null +++ b/kernel/trace/pid_list.c @@ -0,0 +1,495 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2021 VMware Inc, Steven Rostedt <rostedt@goodmis.org> + */ +#include <linux/spinlock.h> +#include <linux/irq_work.h> +#include <linux/slab.h> +#include "trace.h" + +/* See pid_list.h for details */ + +static inline union lower_chunk *get_lower_chunk(struct trace_pid_list *pid_list) +{ + union lower_chunk *chunk; + + lockdep_assert_held(&pid_list->lock); + + if (!pid_list->lower_list) + return NULL; + + chunk = pid_list->lower_list; + pid_list->lower_list = chunk->next; + pid_list->free_lower_chunks--; + WARN_ON_ONCE(pid_list->free_lower_chunks < 0); + chunk->next = NULL; + /* + * If a refill needs to happen, it can not happen here + * as the scheduler run queue locks are held. 
+ */ + if (pid_list->free_lower_chunks <= CHUNK_REALLOC) + irq_work_queue(&pid_list->refill_irqwork); + + return chunk; +} + +static inline union upper_chunk *get_upper_chunk(struct trace_pid_list *pid_list) +{ + union upper_chunk *chunk; + + lockdep_assert_held(&pid_list->lock); + + if (!pid_list->upper_list) + return NULL; + + chunk = pid_list->upper_list; + pid_list->upper_list = chunk->next; + pid_list->free_upper_chunks--; + WARN_ON_ONCE(pid_list->free_upper_chunks < 0); + chunk->next = NULL; + /* + * If a refill needs to happen, it can not happen here + * as the scheduler run queue locks are held. + */ + if (pid_list->free_upper_chunks <= CHUNK_REALLOC) + irq_work_queue(&pid_list->refill_irqwork); + + return chunk; +} + +static inline void put_lower_chunk(struct trace_pid_list *pid_list, + union lower_chunk *chunk) +{ + lockdep_assert_held(&pid_list->lock); + + chunk->next = pid_list->lower_list; + pid_list->lower_list = chunk; + pid_list->free_lower_chunks++; +} + +static inline void put_upper_chunk(struct trace_pid_list *pid_list, + union upper_chunk *chunk) +{ + lockdep_assert_held(&pid_list->lock); + + chunk->next = pid_list->upper_list; + pid_list->upper_list = chunk; + pid_list->free_upper_chunks++; +} + +static inline bool upper_empty(union upper_chunk *chunk) +{ + /* + * If chunk->data has no lower chunks, it will be the same + * as a zeroed bitmask. Use find_first_bit() to test it + * and if it doesn't find any bits set, then the array + * is empty. + */ + int bit = find_first_bit((unsigned long *)chunk->data, + sizeof(chunk->data) * 8); + return bit >= sizeof(chunk->data) * 8; +} + +static inline int pid_split(unsigned int pid, unsigned int *upper1, + unsigned int *upper2, unsigned int *lower) +{ + /* MAX_PID should cover all pids */ + BUILD_BUG_ON(MAX_PID < PID_MAX_LIMIT); + + /* In case a bad pid is passed in, then fail */ + if (unlikely(pid >= MAX_PID)) + return -1; + + *upper1 = (pid >> UPPER1_SHIFT) & UPPER_MASK; + *upper2 = (pid >> UPPER2_SHIFT) & UPPER_MASK; + *lower = pid & LOWER_MASK; + + return 0; +} + +static inline unsigned int pid_join(unsigned int upper1, + unsigned int upper2, unsigned int lower) +{ + return ((upper1 & UPPER_MASK) << UPPER1_SHIFT) | + ((upper2 & UPPER_MASK) << UPPER2_SHIFT) | + (lower & LOWER_MASK); +} + +/** + * trace_pid_list_is_set - test if the pid is set in the list + * @pid_list: The pid list to test + * @pid: The pid to see if set in the list. + * + * Tests if @pid is set in the @pid_list. This is usually called + * from the scheduler when a task is scheduled. Its pid is checked + * if it should be traced or not. + * + * Return true if the pid is in the list, false otherwise. + */ +bool trace_pid_list_is_set(struct trace_pid_list *pid_list, unsigned int pid) +{ + union upper_chunk *upper_chunk; + union lower_chunk *lower_chunk; + unsigned long flags; + unsigned int upper1; + unsigned int upper2; + unsigned int lower; + bool ret = false; + + if (!pid_list) + return false; + + if (pid_split(pid, &upper1, &upper2, &lower) < 0) + return false; + + raw_spin_lock_irqsave(&pid_list->lock, flags); + upper_chunk = pid_list->upper[upper1]; + if (upper_chunk) { + lower_chunk = upper_chunk->data[upper2]; + if (lower_chunk) + ret = test_bit(lower, lower_chunk->data); + } + raw_spin_unlock_irqrestore(&pid_list->lock, flags); + + return ret; +} + +/** + * trace_pid_list_set - add a pid to the list + * @pid_list: The pid list to add the @pid to. + * @pid: The pid to add. + * + * Adds @pid to @pid_list. 
This is usually done explicitly by a user + * adding a task to be traced, or indirectly by the fork function + * when children should be traced and a task's pid is in the list. + * + * Return 0 on success, negative otherwise. + */ +int trace_pid_list_set(struct trace_pid_list *pid_list, unsigned int pid) +{ + union upper_chunk *upper_chunk; + union lower_chunk *lower_chunk; + unsigned long flags; + unsigned int upper1; + unsigned int upper2; + unsigned int lower; + int ret; + + if (!pid_list) + return -ENODEV; + + if (pid_split(pid, &upper1, &upper2, &lower) < 0) + return -EINVAL; + + raw_spin_lock_irqsave(&pid_list->lock, flags); + upper_chunk = pid_list->upper[upper1]; + if (!upper_chunk) { + upper_chunk = get_upper_chunk(pid_list); + if (!upper_chunk) { + ret = -ENOMEM; + goto out; + } + pid_list->upper[upper1] = upper_chunk; + } + lower_chunk = upper_chunk->data[upper2]; + if (!lower_chunk) { + lower_chunk = get_lower_chunk(pid_list); + if (!lower_chunk) { + ret = -ENOMEM; + goto out; + } + upper_chunk->data[upper2] = lower_chunk; + } + set_bit(lower, lower_chunk->data); + ret = 0; + out: + raw_spin_unlock_irqrestore(&pid_list->lock, flags); + return ret; +} + +/** + * trace_pid_list_clear - remove a pid from the list + * @pid_list: The pid list to remove the @pid from. + * @pid: The pid to remove. + * + * Removes @pid from @pid_list. This is usually done explicitly by a user + * removing tasks from tracing, or indirectly by the exit function + * when a task that is set to be traced exits. + * + * Return 0 on success, negative otherwise. + */ +int trace_pid_list_clear(struct trace_pid_list *pid_list, unsigned int pid) +{ + union upper_chunk *upper_chunk; + union lower_chunk *lower_chunk; + unsigned long flags; + unsigned int upper1; + unsigned int upper2; + unsigned int lower; + + if (!pid_list) + return -ENODEV; + + if (pid_split(pid, &upper1, &upper2, &lower) < 0) + return -EINVAL; + + raw_spin_lock_irqsave(&pid_list->lock, flags); + upper_chunk = pid_list->upper[upper1]; + if (!upper_chunk) + goto out; + + lower_chunk = upper_chunk->data[upper2]; + if (!lower_chunk) + goto out; + + clear_bit(lower, lower_chunk->data); + + /* if there's no more bits set, add it to the free list */ + if (find_first_bit(lower_chunk->data, LOWER_MAX) >= LOWER_MAX) { + put_lower_chunk(pid_list, lower_chunk); + upper_chunk->data[upper2] = NULL; + if (upper_empty(upper_chunk)) { + put_upper_chunk(pid_list, upper_chunk); + pid_list->upper[upper1] = NULL; + } + } + out: + raw_spin_unlock_irqrestore(&pid_list->lock, flags); + return 0; +} + +/** + * trace_pid_list_next - return the next pid in the list + * @pid_list: The pid list to examine. + * @pid: The pid to start from + * @next: The pointer to place the pid that is set starting from @pid. + * + * Looks for the next consecutive pid that is in @pid_list starting + * at the pid specified by @pid. If one is set (including @pid), then + * that pid is placed into @next. + * + * Return 0 when a pid is found, -1 if there are no more pids included. 
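+ *
+ * A minimal iteration sketch (illustration only, not part of this file):
+ *
+ *	unsigned int pid;
+ *
+ *	if (!trace_pid_list_first(pid_list, &pid)) {
+ *		do {
+ *			pr_info("pid %u is set\n", pid);
+ *		} while (!trace_pid_list_next(pid_list, pid + 1, &pid));
+ *	}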
+ */ +int trace_pid_list_next(struct trace_pid_list *pid_list, unsigned int pid, + unsigned int *next) +{ + union upper_chunk *upper_chunk; + union lower_chunk *lower_chunk; + unsigned long flags; + unsigned int upper1; + unsigned int upper2; + unsigned int lower; + + if (!pid_list) + return -ENODEV; + + if (pid_split(pid, &upper1, &upper2, &lower) < 0) + return -EINVAL; + + raw_spin_lock_irqsave(&pid_list->lock, flags); + for (; upper1 <= UPPER_MASK; upper1++, upper2 = 0) { + upper_chunk = pid_list->upper[upper1]; + + if (!upper_chunk) + continue; + + for (; upper2 <= UPPER_MASK; upper2++, lower = 0) { + lower_chunk = upper_chunk->data[upper2]; + if (!lower_chunk) + continue; + + lower = find_next_bit(lower_chunk->data, LOWER_MAX, + lower); + if (lower < LOWER_MAX) + goto found; + } + } + + found: + raw_spin_unlock_irqrestore(&pid_list->lock, flags); + if (upper1 > UPPER_MASK) + return -1; + + *next = pid_join(upper1, upper2, lower); + return 0; +} + +/** + * trace_pid_list_first - return the first pid in the list + * @pid_list: The pid list to examine. + * @pid: The pointer to place the pid first found pid that is set. + * + * Looks for the first pid that is set in @pid_list, and places it + * into @pid if found. + * + * Return 0 when a pid is found, -1 if there are no pids set. + */ +int trace_pid_list_first(struct trace_pid_list *pid_list, unsigned int *pid) +{ + return trace_pid_list_next(pid_list, 0, pid); +} + +static void pid_list_refill_irq(struct irq_work *iwork) +{ + struct trace_pid_list *pid_list = container_of(iwork, struct trace_pid_list, + refill_irqwork); + union upper_chunk *upper = NULL; + union lower_chunk *lower = NULL; + union upper_chunk **upper_next = &upper; + union lower_chunk **lower_next = &lower; + int upper_count; + int lower_count; + int ucnt = 0; + int lcnt = 0; + + again: + raw_spin_lock(&pid_list->lock); + upper_count = CHUNK_ALLOC - pid_list->free_upper_chunks; + lower_count = CHUNK_ALLOC - pid_list->free_lower_chunks; + raw_spin_unlock(&pid_list->lock); + + if (upper_count <= 0 && lower_count <= 0) + return; + + while (upper_count-- > 0) { + union upper_chunk *chunk; + + chunk = kzalloc(sizeof(*chunk), GFP_KERNEL); + if (!chunk) + break; + *upper_next = chunk; + upper_next = &chunk->next; + ucnt++; + } + + while (lower_count-- > 0) { + union lower_chunk *chunk; + + chunk = kzalloc(sizeof(*chunk), GFP_KERNEL); + if (!chunk) + break; + *lower_next = chunk; + lower_next = &chunk->next; + lcnt++; + } + + raw_spin_lock(&pid_list->lock); + if (upper) { + *upper_next = pid_list->upper_list; + pid_list->upper_list = upper; + pid_list->free_upper_chunks += ucnt; + } + if (lower) { + *lower_next = pid_list->lower_list; + pid_list->lower_list = lower; + pid_list->free_lower_chunks += lcnt; + } + raw_spin_unlock(&pid_list->lock); + + /* + * On success of allocating all the chunks, both counters + * will be less than zero. If they are not, then an allocation + * failed, and we should not try again. + */ + if (upper_count >= 0 || lower_count >= 0) + return; + /* + * When the locks were released, free chunks could have + * been used and allocation needs to be done again. Might as + * well allocate it now. + */ + goto again; +} + +/** + * trace_pid_list_alloc - create a new pid_list + * + * Allocates a new pid_list to store pids into. + * + * Returns the pid_list on success, NULL otherwise. 
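+ *
+ * Typical lifecycle, as a rough sketch (illustration only, not part of
+ * this file; "some_pid" is a stand-in for whatever pid is being traced):
+ *
+ *	struct trace_pid_list *pid_list = trace_pid_list_alloc();
+ *
+ *	if (!pid_list)
+ *		return -ENOMEM;
+ *	trace_pid_list_set(pid_list, some_pid);
+ *	if (trace_pid_list_is_set(pid_list, some_pid))
+ *		...
+ *	trace_pid_list_free(pid_list);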
+ */ +struct trace_pid_list *trace_pid_list_alloc(void) +{ + struct trace_pid_list *pid_list; + int i; + + /* According to linux/thread.h, pids can be no bigger that 30 bits */ + WARN_ON_ONCE(pid_max > (1 << 30)); + + pid_list = kzalloc(sizeof(*pid_list), GFP_KERNEL); + if (!pid_list) + return NULL; + + init_irq_work(&pid_list->refill_irqwork, pid_list_refill_irq); + + raw_spin_lock_init(&pid_list->lock); + + for (i = 0; i < CHUNK_ALLOC; i++) { + union upper_chunk *chunk; + + chunk = kzalloc(sizeof(*chunk), GFP_KERNEL); + if (!chunk) + break; + chunk->next = pid_list->upper_list; + pid_list->upper_list = chunk; + pid_list->free_upper_chunks++; + } + + for (i = 0; i < CHUNK_ALLOC; i++) { + union lower_chunk *chunk; + + chunk = kzalloc(sizeof(*chunk), GFP_KERNEL); + if (!chunk) + break; + chunk->next = pid_list->lower_list; + pid_list->lower_list = chunk; + pid_list->free_lower_chunks++; + } + + return pid_list; +} + +/** + * trace_pid_list_free - Frees an allocated pid_list. + * + * Frees the memory for a pid_list that was allocated. + */ +void trace_pid_list_free(struct trace_pid_list *pid_list) +{ + union upper_chunk *upper; + union lower_chunk *lower; + int i, j; + + if (!pid_list) + return; + + irq_work_sync(&pid_list->refill_irqwork); + + while (pid_list->lower_list) { + union lower_chunk *chunk; + + chunk = pid_list->lower_list; + pid_list->lower_list = pid_list->lower_list->next; + kfree(chunk); + } + + while (pid_list->upper_list) { + union upper_chunk *chunk; + + chunk = pid_list->upper_list; + pid_list->upper_list = pid_list->upper_list->next; + kfree(chunk); + } + + for (i = 0; i < UPPER1_SIZE; i++) { + upper = pid_list->upper[i]; + if (upper) { + for (j = 0; j < UPPER2_SIZE; j++) { + lower = upper->data[j]; + kfree(lower); + } + kfree(upper); + } + } + kfree(pid_list); +} diff --git a/kernel/trace/pid_list.h b/kernel/trace/pid_list.h new file mode 100644 index 0000000000..62e73f1ac8 --- /dev/null +++ b/kernel/trace/pid_list.h @@ -0,0 +1,88 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* Do not include this file directly. */ + +#ifndef _TRACE_INTERNAL_PID_LIST_H +#define _TRACE_INTERNAL_PID_LIST_H + +/* + * In order to keep track of what pids to trace, a tree is created much + * like page tables are used. This creates a sparse bit map, where + * the tree is filled in when needed. A PID is at most 30 bits (see + * linux/thread.h), and is broken up into 3 sections based on the bit map + * of the bits. The 8 MSB is the "upper1" section. The next 8 MSB is the + * "upper2" section and the 14 LSB is the "lower" section. + * + * A trace_pid_list structure holds the "upper1" section, in an + * array of 256 pointers (1 or 2K in size) to "upper_chunk" unions, where + * each has an array of 256 pointers (1 or 2K in size) to the "lower_chunk" + * structures, where each has an array of size 2K bytes representing a bitmask + * of the 14 LSB of the PID (256 * 8 = 2048) + * + * When a trace_pid_list is allocated, it includes the 256 pointer array + * of the upper1 unions. Then a "cache" of upper and lower is allocated + * where these will be assigned as needed. + * + * When a bit is set in the pid_list bitmask, the pid to use has + * the 8 MSB masked, and this is used to index the array in the + * pid_list to find the next upper union. If the element is NULL, + * then one is retrieved from the upper_list cache. If none is + * available, then -ENOMEM is returned. + * + * The next 8 MSB is used to index into the "upper2" section. 
If this + * element is NULL, then it is retrieved from the lower_list cache. + * Again, if one is not available -ENOMEM is returned. + * + * Finally the 14 LSB of the PID is used to set the bit in the 16384 + * bitmask (made up of 2K bytes). + * + * When the second upper section or the lower section has their last + * bit cleared, they are added back to the free list to be reused + * when needed. + */ + +#define UPPER_BITS 8 +#define UPPER_MAX (1 << UPPER_BITS) +#define UPPER1_SIZE (1 << UPPER_BITS) +#define UPPER2_SIZE (1 << UPPER_BITS) + +#define LOWER_BITS 14 +#define LOWER_MAX (1 << LOWER_BITS) +#define LOWER_SIZE (LOWER_MAX / BITS_PER_LONG) + +#define UPPER1_SHIFT (LOWER_BITS + UPPER_BITS) +#define UPPER2_SHIFT LOWER_BITS +#define LOWER_MASK (LOWER_MAX - 1) + +#define UPPER_MASK (UPPER_MAX - 1) + +/* According to linux/thread.h pids can not be bigger than or equal to 1 << 30 */ +#define MAX_PID (1 << 30) + +/* Just keep 6 chunks of both upper and lower in the cache on alloc */ +#define CHUNK_ALLOC 6 + +/* Have 2 chunks free, trigger a refill of the cache */ +#define CHUNK_REALLOC 2 + +union lower_chunk { + union lower_chunk *next; + unsigned long data[LOWER_SIZE]; // 2K in size +}; + +union upper_chunk { + union upper_chunk *next; + union lower_chunk *data[UPPER2_SIZE]; // 1 or 2K in size +}; + +struct trace_pid_list { + raw_spinlock_t lock; + struct irq_work refill_irqwork; + union upper_chunk *upper[UPPER1_SIZE]; // 1 or 2K in size + union upper_chunk *upper_list; + union lower_chunk *lower_list; + int free_upper_chunks; + int free_lower_chunks; +}; + +#endif /* _TRACE_INTERNAL_PID_LIST_H */ diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c new file mode 100644 index 0000000000..21bb161c23 --- /dev/null +++ b/kernel/trace/power-traces.c @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Power trace points + * + * Copyright (C) 2009 Arjan van de Ven <arjan@linux.intel.com> + */ + +#include <linux/string.h> +#include <linux/types.h> +#include <linux/workqueue.h> +#include <linux/sched.h> +#include <linux/module.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/power.h> + +EXPORT_TRACEPOINT_SYMBOL_GPL(suspend_resume); +EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); +EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_frequency); +EXPORT_TRACEPOINT_SYMBOL_GPL(powernv_throttle); + diff --git a/kernel/trace/preemptirq_delay_test.c b/kernel/trace/preemptirq_delay_test.c new file mode 100644 index 0000000000..8c4ffd0761 --- /dev/null +++ b/kernel/trace/preemptirq_delay_test.c @@ -0,0 +1,218 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Preempt / IRQ disable delay thread to test latency tracers + * + * Copyright (C) 2018 Joel Fernandes (Google) <joel@joelfernandes.org> + */ + +#include <linux/trace_clock.h> +#include <linux/delay.h> +#include <linux/interrupt.h> +#include <linux/irq.h> +#include <linux/kernel.h> +#include <linux/kobject.h> +#include <linux/kthread.h> +#include <linux/module.h> +#include <linux/printk.h> +#include <linux/string.h> +#include <linux/sysfs.h> +#include <linux/completion.h> + +static ulong delay = 100; +static char test_mode[12] = "irq"; +static uint burst_size = 1; +static int cpu_affinity = -1; + +module_param_named(delay, delay, ulong, 0444); +module_param_string(test_mode, test_mode, 12, 0444); +module_param_named(burst_size, burst_size, uint, 0444); +module_param_named(cpu_affinity, cpu_affinity, int, 0444); +MODULE_PARM_DESC(delay, "Period in microseconds (100 us default)"); +MODULE_PARM_DESC(test_mode, "Mode of the test such as preempt, 
irq, or alternate (default irq)"); +MODULE_PARM_DESC(burst_size, "The size of a burst (default 1)"); +MODULE_PARM_DESC(cpu_affinity, "Cpu num test is running on"); + +static struct completion done; + +#define MIN(x, y) ((x) < (y) ? (x) : (y)) + +static void busy_wait(ulong time) +{ + u64 start, end; + + start = trace_clock_local(); + + do { + end = trace_clock_local(); + if (kthread_should_stop()) + break; + } while ((end - start) < (time * 1000)); +} + +static __always_inline void irqoff_test(void) +{ + unsigned long flags; + + local_irq_save(flags); + busy_wait(delay); + local_irq_restore(flags); +} + +static __always_inline void preemptoff_test(void) +{ + preempt_disable(); + busy_wait(delay); + preempt_enable(); +} + +static void execute_preemptirqtest(int idx) +{ + if (!strcmp(test_mode, "irq")) + irqoff_test(); + else if (!strcmp(test_mode, "preempt")) + preemptoff_test(); + else if (!strcmp(test_mode, "alternate")) { + if (idx % 2 == 0) + irqoff_test(); + else + preemptoff_test(); + } +} + +#define DECLARE_TESTFN(POSTFIX) \ + static void preemptirqtest_##POSTFIX(int idx) \ + { \ + execute_preemptirqtest(idx); \ + } \ + +/* + * We create 10 different functions, so that we can get 10 different + * backtraces. + */ +DECLARE_TESTFN(0) +DECLARE_TESTFN(1) +DECLARE_TESTFN(2) +DECLARE_TESTFN(3) +DECLARE_TESTFN(4) +DECLARE_TESTFN(5) +DECLARE_TESTFN(6) +DECLARE_TESTFN(7) +DECLARE_TESTFN(8) +DECLARE_TESTFN(9) + +static void (*testfuncs[])(int) = { + preemptirqtest_0, + preemptirqtest_1, + preemptirqtest_2, + preemptirqtest_3, + preemptirqtest_4, + preemptirqtest_5, + preemptirqtest_6, + preemptirqtest_7, + preemptirqtest_8, + preemptirqtest_9, +}; + +#define NR_TEST_FUNCS ARRAY_SIZE(testfuncs) + +static int preemptirq_delay_run(void *data) +{ + int i; + int s = MIN(burst_size, NR_TEST_FUNCS); + struct cpumask cpu_mask; + + if (cpu_affinity > -1) { + cpumask_clear(&cpu_mask); + cpumask_set_cpu(cpu_affinity, &cpu_mask); + if (set_cpus_allowed_ptr(current, &cpu_mask)) + pr_err("cpu_affinity:%d, failed\n", cpu_affinity); + } + + for (i = 0; i < s; i++) + (testfuncs[i])(i); + + complete(&done); + + set_current_state(TASK_INTERRUPTIBLE); + while (!kthread_should_stop()) { + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } + + __set_current_state(TASK_RUNNING); + + return 0; +} + +static int preemptirq_run_test(void) +{ + struct task_struct *task; + char task_name[50]; + + init_completion(&done); + + snprintf(task_name, sizeof(task_name), "%s_test", test_mode); + task = kthread_run(preemptirq_delay_run, NULL, task_name); + if (IS_ERR(task)) + return PTR_ERR(task); + if (task) { + wait_for_completion(&done); + kthread_stop(task); + } + return 0; +} + + +static ssize_t trigger_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + ssize_t ret; + + ret = preemptirq_run_test(); + if (ret) + return ret; + return count; +} + +static struct kobj_attribute trigger_attribute = + __ATTR(trigger, 0200, NULL, trigger_store); + +static struct attribute *attrs[] = { + &trigger_attribute.attr, + NULL, +}; + +static struct attribute_group attr_group = { + .attrs = attrs, +}; + +static struct kobject *preemptirq_delay_kobj; + +static int __init preemptirq_delay_init(void) +{ + int retval; + + retval = preemptirq_run_test(); + if (retval != 0) + return retval; + + preemptirq_delay_kobj = kobject_create_and_add("preemptirq_delay_test", + kernel_kobj); + if (!preemptirq_delay_kobj) + return -ENOMEM; + + retval = sysfs_create_group(preemptirq_delay_kobj, &attr_group); + if 
(retval) + kobject_put(preemptirq_delay_kobj); + + return retval; +} + +static void __exit preemptirq_delay_exit(void) +{ + kobject_put(preemptirq_delay_kobj); +} + +module_init(preemptirq_delay_init) +module_exit(preemptirq_delay_exit) +MODULE_LICENSE("GPL v2"); diff --git a/kernel/trace/rethook.c b/kernel/trace/rethook.c new file mode 100644 index 0000000000..3cebcbaf35 --- /dev/null +++ b/kernel/trace/rethook.c @@ -0,0 +1,347 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define pr_fmt(fmt) "rethook: " fmt + +#include <linux/bug.h> +#include <linux/kallsyms.h> +#include <linux/kprobes.h> +#include <linux/preempt.h> +#include <linux/rethook.h> +#include <linux/slab.h> +#include <linux/sort.h> + +/* Return hook list (shadow stack by list) */ + +/* + * This function is called from delayed_put_task_struct() when a task is + * dead and cleaned up to recycle any kretprobe instances associated with + * this task. These left over instances represent probed functions that + * have been called but will never return. + */ +void rethook_flush_task(struct task_struct *tk) +{ + struct rethook_node *rhn; + struct llist_node *node; + + node = __llist_del_all(&tk->rethooks); + while (node) { + rhn = container_of(node, struct rethook_node, llist); + node = node->next; + preempt_disable(); + rethook_recycle(rhn); + preempt_enable(); + } +} + +static void rethook_free_rcu(struct rcu_head *head) +{ + struct rethook *rh = container_of(head, struct rethook, rcu); + struct rethook_node *rhn; + struct freelist_node *node; + int count = 1; + + node = rh->pool.head; + while (node) { + rhn = container_of(node, struct rethook_node, freelist); + node = node->next; + kfree(rhn); + count++; + } + + /* The rh->ref is the number of pooled node + 1 */ + if (refcount_sub_and_test(count, &rh->ref)) + kfree(rh); +} + +/** + * rethook_stop() - Stop using a rethook. + * @rh: the struct rethook to stop. + * + * Stop using a rethook to prepare for freeing it. If you want to wait for + * all running rethook handler before calling rethook_free(), you need to + * call this first and wait RCU, and call rethook_free(). + */ +void rethook_stop(struct rethook *rh) +{ + rcu_assign_pointer(rh->handler, NULL); +} + +/** + * rethook_free() - Free struct rethook. + * @rh: the struct rethook to be freed. + * + * Free the rethook. Before calling this function, user must ensure the + * @rh::data is cleaned if needed (or, the handler can access it after + * calling this function.) This function will set the @rh to be freed + * after all rethook_node are freed (not soon). And the caller must + * not touch @rh after calling this. + */ +void rethook_free(struct rethook *rh) +{ + rethook_stop(rh); + + call_rcu(&rh->rcu, rethook_free_rcu); +} + +static inline rethook_handler_t rethook_get_handler(struct rethook *rh) +{ + return (rethook_handler_t)rcu_dereference_check(rh->handler, + rcu_read_lock_any_held()); +} + +/** + * rethook_alloc() - Allocate struct rethook. + * @data: a data to pass the @handler when hooking the return. + * @handler: the return hook callback function. + * + * Allocate and initialize a new rethook with @data and @handler. + * Return NULL if memory allocation fails or @handler is NULL. + * Note that @handler == NULL means this rethook is going to be freed. 
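+ *
+ * A rough setup sketch (illustration only, not part of this file); the
+ * handler, data pointer and node array below are hypothetical:
+ *
+ *	static void my_handler(struct rethook_node *node, void *data,
+ *			       unsigned long ret_addr, struct pt_regs *regs)
+ *	{
+ *	}
+ *
+ *	rh = rethook_alloc(my_data, my_handler);
+ *	for (i = 0; i < nr_nodes; i++)
+ *		rethook_add_node(rh, &my_nodes[i]);
+ *
+ * with nodes later taken from the pool via rethook_try_get() and armed
+ * at function entry with rethook_hook().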
+ */ +struct rethook *rethook_alloc(void *data, rethook_handler_t handler) +{ + struct rethook *rh = kzalloc(sizeof(struct rethook), GFP_KERNEL); + + if (!rh || !handler) { + kfree(rh); + return NULL; + } + + rh->data = data; + rcu_assign_pointer(rh->handler, handler); + rh->pool.head = NULL; + refcount_set(&rh->ref, 1); + + return rh; +} + +/** + * rethook_add_node() - Add a new node to the rethook. + * @rh: the struct rethook. + * @node: the struct rethook_node to be added. + * + * Add @node to @rh. User must allocate @node (as a part of user's + * data structure.) The @node fields are initialized in this function. + */ +void rethook_add_node(struct rethook *rh, struct rethook_node *node) +{ + node->rethook = rh; + freelist_add(&node->freelist, &rh->pool); + refcount_inc(&rh->ref); +} + +static void free_rethook_node_rcu(struct rcu_head *head) +{ + struct rethook_node *node = container_of(head, struct rethook_node, rcu); + + if (refcount_dec_and_test(&node->rethook->ref)) + kfree(node->rethook); + kfree(node); +} + +/** + * rethook_recycle() - return the node to rethook. + * @node: The struct rethook_node to be returned. + * + * Return back the @node to @node::rethook. If the @node::rethook is already + * marked as freed, this will free the @node. + */ +void rethook_recycle(struct rethook_node *node) +{ + rethook_handler_t handler; + + handler = rethook_get_handler(node->rethook); + if (likely(handler)) + freelist_add(&node->freelist, &node->rethook->pool); + else + call_rcu(&node->rcu, free_rethook_node_rcu); +} +NOKPROBE_SYMBOL(rethook_recycle); + +/** + * rethook_try_get() - get an unused rethook node. + * @rh: The struct rethook which pools the nodes. + * + * Get an unused rethook node from @rh. If the node pool is empty, this + * will return NULL. Caller must disable preemption. + */ +struct rethook_node *rethook_try_get(struct rethook *rh) +{ + rethook_handler_t handler = rethook_get_handler(rh); + struct freelist_node *fn; + + /* Check whether @rh is going to be freed. */ + if (unlikely(!handler)) + return NULL; + + /* + * This expects the caller will set up a rethook on a function entry. + * When the function returns, the rethook will eventually be reclaimed + * or released in the rethook_recycle() with call_rcu(). + * This means the caller must be run in the RCU-availabe context. + */ + if (unlikely(!rcu_is_watching())) + return NULL; + + fn = freelist_try_get(&rh->pool); + if (!fn) + return NULL; + + return container_of(fn, struct rethook_node, freelist); +} +NOKPROBE_SYMBOL(rethook_try_get); + +/** + * rethook_hook() - Hook the current function return. + * @node: The struct rethook node to hook the function return. + * @regs: The struct pt_regs for the function entry. + * @mcount: True if this is called from mcount(ftrace) context. + * + * Hook the current running function return. This must be called when the + * function entry (or at least @regs must be the registers of the function + * entry.) @mcount is used for identifying the context. If this is called + * from ftrace (mcount) callback, @mcount must be set true. If this is called + * from the real function entry (e.g. kprobes) @mcount must be set false. + * This is because the way to hook the function return depends on the context. + */ +void rethook_hook(struct rethook_node *node, struct pt_regs *regs, bool mcount) +{ + arch_rethook_prepare(node, regs, mcount); + __llist_add(&node->llist, ¤t->rethooks); +} +NOKPROBE_SYMBOL(rethook_hook); + +/* This assumes the 'tsk' is the current task or is not running. 
*/ +static unsigned long __rethook_find_ret_addr(struct task_struct *tsk, + struct llist_node **cur) +{ + struct rethook_node *rh = NULL; + struct llist_node *node = *cur; + + if (!node) + node = tsk->rethooks.first; + else + node = node->next; + + while (node) { + rh = container_of(node, struct rethook_node, llist); + if (rh->ret_addr != (unsigned long)arch_rethook_trampoline) { + *cur = node; + return rh->ret_addr; + } + node = node->next; + } + return 0; +} +NOKPROBE_SYMBOL(__rethook_find_ret_addr); + +/** + * rethook_find_ret_addr -- Find correct return address modified by rethook + * @tsk: Target task + * @frame: A frame pointer + * @cur: a storage of the loop cursor llist_node pointer for next call + * + * Find the correct return address modified by a rethook on @tsk in unsigned + * long type. + * The @tsk must be 'current' or a task which is not running. @frame is a hint + * to get the currect return address - which is compared with the + * rethook::frame field. The @cur is a loop cursor for searching the + * kretprobe return addresses on the @tsk. The '*@cur' should be NULL at the + * first call, but '@cur' itself must NOT NULL. + * + * Returns found address value or zero if not found. + */ +unsigned long rethook_find_ret_addr(struct task_struct *tsk, unsigned long frame, + struct llist_node **cur) +{ + struct rethook_node *rhn = NULL; + unsigned long ret; + + if (WARN_ON_ONCE(!cur)) + return 0; + + if (WARN_ON_ONCE(tsk != current && task_is_running(tsk))) + return 0; + + do { + ret = __rethook_find_ret_addr(tsk, cur); + if (!ret) + break; + rhn = container_of(*cur, struct rethook_node, llist); + } while (rhn->frame != frame); + + return ret; +} +NOKPROBE_SYMBOL(rethook_find_ret_addr); + +void __weak arch_rethook_fixup_return(struct pt_regs *regs, + unsigned long correct_ret_addr) +{ + /* + * Do nothing by default. If the architecture which uses a + * frame pointer to record real return address on the stack, + * it should fill this function to fixup the return address + * so that stacktrace works from the rethook handler. + */ +} + +/* This function will be called from each arch-defined trampoline. */ +unsigned long rethook_trampoline_handler(struct pt_regs *regs, + unsigned long frame) +{ + struct llist_node *first, *node = NULL; + unsigned long correct_ret_addr; + rethook_handler_t handler; + struct rethook_node *rhn; + + correct_ret_addr = __rethook_find_ret_addr(current, &node); + if (!correct_ret_addr) { + pr_err("rethook: Return address not found! Maybe there is a bug in the kernel\n"); + BUG_ON(1); + } + + instruction_pointer_set(regs, correct_ret_addr); + + /* + * These loops must be protected from rethook_free_rcu() because those + * are accessing 'rhn->rethook'. + */ + preempt_disable_notrace(); + + /* + * Run the handler on the shadow stack. Do not unlink the list here because + * stackdump inside the handlers needs to decode it. + */ + first = current->rethooks.first; + while (first) { + rhn = container_of(first, struct rethook_node, llist); + if (WARN_ON_ONCE(rhn->frame != frame)) + break; + handler = rethook_get_handler(rhn->rethook); + if (handler) + handler(rhn, rhn->rethook->data, + correct_ret_addr, regs); + + if (first == node) + break; + first = first->next; + } + + /* Fixup registers for returning to correct address. 
*/ + arch_rethook_fixup_return(regs, correct_ret_addr); + + /* Unlink used shadow stack */ + first = current->rethooks.first; + current->rethooks.first = node->next; + node->next = NULL; + + while (first) { + rhn = container_of(first, struct rethook_node, llist); + first = first->next; + rethook_recycle(rhn); + } + preempt_enable_notrace(); + + return correct_ret_addr; +} +NOKPROBE_SYMBOL(rethook_trampoline_handler); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c new file mode 100644 index 0000000000..f232cf56fa --- /dev/null +++ b/kernel/trace/ring_buffer.c @@ -0,0 +1,6179 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Generic ring buffer + * + * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com> + */ +#include <linux/trace_recursion.h> +#include <linux/trace_events.h> +#include <linux/ring_buffer.h> +#include <linux/trace_clock.h> +#include <linux/sched/clock.h> +#include <linux/trace_seq.h> +#include <linux/spinlock.h> +#include <linux/irq_work.h> +#include <linux/security.h> +#include <linux/uaccess.h> +#include <linux/hardirq.h> +#include <linux/kthread.h> /* for self test */ +#include <linux/module.h> +#include <linux/percpu.h> +#include <linux/mutex.h> +#include <linux/delay.h> +#include <linux/slab.h> +#include <linux/init.h> +#include <linux/hash.h> +#include <linux/list.h> +#include <linux/cpu.h> +#include <linux/oom.h> + +#include <asm/local.h> + +/* + * The "absolute" timestamp in the buffer is only 59 bits. + * If a clock has the 5 MSBs set, it needs to be saved and + * reinserted. + */ +#define TS_MSB (0xf8ULL << 56) +#define ABS_TS_MASK (~TS_MSB) + +static void update_pages_handler(struct work_struct *work); + +/* + * The ring buffer header is special. We must manually up keep it. + */ +int ring_buffer_print_entry_header(struct trace_seq *s) +{ + trace_seq_puts(s, "# compressed entry header\n"); + trace_seq_puts(s, "\ttype_len : 5 bits\n"); + trace_seq_puts(s, "\ttime_delta : 27 bits\n"); + trace_seq_puts(s, "\tarray : 32 bits\n"); + trace_seq_putc(s, '\n'); + trace_seq_printf(s, "\tpadding : type == %d\n", + RINGBUF_TYPE_PADDING); + trace_seq_printf(s, "\ttime_extend : type == %d\n", + RINGBUF_TYPE_TIME_EXTEND); + trace_seq_printf(s, "\ttime_stamp : type == %d\n", + RINGBUF_TYPE_TIME_STAMP); + trace_seq_printf(s, "\tdata max type_len == %d\n", + RINGBUF_TYPE_DATA_TYPE_LEN_MAX); + + return !trace_seq_has_overflowed(s); +} + +/* + * The ring buffer is made up of a list of pages. A separate list of pages is + * allocated for each CPU. A writer may only write to a buffer that is + * associated with the CPU it is currently executing on. A reader may read + * from any per cpu buffer. + * + * The reader is special. For each per cpu buffer, the reader has its own + * reader page. When a reader has read the entire reader page, this reader + * page is swapped with another page in the ring buffer. + * + * Now, as long as the writer is off the reader page, the reader can do what + * ever it wants with that page. The writer will never write to that page + * again (as long as it is out of the ring buffer). + * + * Here's some silly ASCII art. 
+ * + * +------+ + * |reader| RING BUFFER + * |page | + * +------+ +---+ +---+ +---+ + * | |-->| |-->| | + * +---+ +---+ +---+ + * ^ | + * | | + * +---------------+ + * + * + * +------+ + * |reader| RING BUFFER + * |page |------------------v + * +------+ +---+ +---+ +---+ + * | |-->| |-->| | + * +---+ +---+ +---+ + * ^ | + * | | + * +---------------+ + * + * + * +------+ + * |reader| RING BUFFER + * |page |------------------v + * +------+ +---+ +---+ +---+ + * ^ | |-->| |-->| | + * | +---+ +---+ +---+ + * | | + * | | + * +------------------------------+ + * + * + * +------+ + * |buffer| RING BUFFER + * |page |------------------v + * +------+ +---+ +---+ +---+ + * ^ | | | |-->| | + * | New +---+ +---+ +---+ + * | Reader------^ | + * | page | + * +------------------------------+ + * + * + * After we make this swap, the reader can hand this page off to the splice + * code and be done with it. It can even allocate a new page if it needs to + * and swap that into the ring buffer. + * + * We will be using cmpxchg soon to make all this lockless. + * + */ + +/* Used for individual buffers (after the counter) */ +#define RB_BUFFER_OFF (1 << 20) + +#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data) + +#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) +#define RB_ALIGNMENT 4U +#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) +#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */ + +#ifndef CONFIG_HAVE_64BIT_ALIGNED_ACCESS +# define RB_FORCE_8BYTE_ALIGNMENT 0 +# define RB_ARCH_ALIGNMENT RB_ALIGNMENT +#else +# define RB_FORCE_8BYTE_ALIGNMENT 1 +# define RB_ARCH_ALIGNMENT 8U +#endif + +#define RB_ALIGN_DATA __aligned(RB_ARCH_ALIGNMENT) + +/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */ +#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX + +enum { + RB_LEN_TIME_EXTEND = 8, + RB_LEN_TIME_STAMP = 8, +}; + +#define skip_time_extend(event) \ + ((struct ring_buffer_event *)((char *)event + RB_LEN_TIME_EXTEND)) + +#define extended_time(event) \ + (event->type_len >= RINGBUF_TYPE_TIME_EXTEND) + +static inline bool rb_null_event(struct ring_buffer_event *event) +{ + return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta; +} + +static void rb_event_set_padding(struct ring_buffer_event *event) +{ + /* padding has a NULL time_delta */ + event->type_len = RINGBUF_TYPE_PADDING; + event->time_delta = 0; +} + +static unsigned +rb_event_data_length(struct ring_buffer_event *event) +{ + unsigned length; + + if (event->type_len) + length = event->type_len * RB_ALIGNMENT; + else + length = event->array[0]; + return length + RB_EVNT_HDR_SIZE; +} + +/* + * Return the length of the given event. Will return + * the length of the time extend if the event is a + * time extend. + */ +static inline unsigned +rb_event_length(struct ring_buffer_event *event) +{ + switch (event->type_len) { + case RINGBUF_TYPE_PADDING: + if (rb_null_event(event)) + /* undefined */ + return -1; + return event->array[0] + RB_EVNT_HDR_SIZE; + + case RINGBUF_TYPE_TIME_EXTEND: + return RB_LEN_TIME_EXTEND; + + case RINGBUF_TYPE_TIME_STAMP: + return RB_LEN_TIME_STAMP; + + case RINGBUF_TYPE_DATA: + return rb_event_data_length(event); + default: + WARN_ON_ONCE(1); + } + /* not hit */ + return 0; +} + +/* + * Return total length of time extend and data, + * or just the event length for all other events. 
+ */ +static inline unsigned +rb_event_ts_length(struct ring_buffer_event *event) +{ + unsigned len = 0; + + if (extended_time(event)) { + /* time extends include the data event after it */ + len = RB_LEN_TIME_EXTEND; + event = skip_time_extend(event); + } + return len + rb_event_length(event); +} + +/** + * ring_buffer_event_length - return the length of the event + * @event: the event to get the length of + * + * Returns the size of the data load of a data event. + * If the event is something other than a data event, it + * returns the size of the event itself. With the exception + * of a TIME EXTEND, where it still returns the size of the + * data load of the data event after it. + */ +unsigned ring_buffer_event_length(struct ring_buffer_event *event) +{ + unsigned length; + + if (extended_time(event)) + event = skip_time_extend(event); + + length = rb_event_length(event); + if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX) + return length; + length -= RB_EVNT_HDR_SIZE; + if (length > RB_MAX_SMALL_DATA + sizeof(event->array[0])) + length -= sizeof(event->array[0]); + return length; +} +EXPORT_SYMBOL_GPL(ring_buffer_event_length); + +/* inline for ring buffer fast paths */ +static __always_inline void * +rb_event_data(struct ring_buffer_event *event) +{ + if (extended_time(event)) + event = skip_time_extend(event); + WARN_ON_ONCE(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX); + /* If length is in len field, then array[0] has the data */ + if (event->type_len) + return (void *)&event->array[0]; + /* Otherwise length is in array[0] and array[1] has the data */ + return (void *)&event->array[1]; +} + +/** + * ring_buffer_event_data - return the data of the event + * @event: the event to get the data from + */ +void *ring_buffer_event_data(struct ring_buffer_event *event) +{ + return rb_event_data(event); +} +EXPORT_SYMBOL_GPL(ring_buffer_event_data); + +#define for_each_buffer_cpu(buffer, cpu) \ + for_each_cpu(cpu, buffer->cpumask) + +#define for_each_online_buffer_cpu(buffer, cpu) \ + for_each_cpu_and(cpu, buffer->cpumask, cpu_online_mask) + +#define TS_SHIFT 27 +#define TS_MASK ((1ULL << TS_SHIFT) - 1) +#define TS_DELTA_TEST (~TS_MASK) + +static u64 rb_event_time_stamp(struct ring_buffer_event *event) +{ + u64 ts; + + ts = event->array[0]; + ts <<= TS_SHIFT; + ts += event->time_delta; + + return ts; +} + +/* Flag when events were overwritten */ +#define RB_MISSED_EVENTS (1 << 31) +/* Missed count stored at end */ +#define RB_MISSED_STORED (1 << 30) + +struct buffer_data_page { + u64 time_stamp; /* page time stamp */ + local_t commit; /* write committed index */ + unsigned char data[] RB_ALIGN_DATA; /* data of buffer page */ +}; + +/* + * Note, the buffer_page list must be first. The buffer pages + * are allocated in cache lines, which means that each buffer + * page will be at the beginning of a cache line, and thus + * the least significant bits will be zero. We use this to + * add flags in the list struct pointers, to make the ring buffer + * lockless. + */ +struct buffer_page { + struct list_head list; /* list of buffer pages */ + local_t write; /* index for next write */ + unsigned read; /* index for next read */ + local_t entries; /* entries on this page */ + unsigned long real_end; /* real end of data */ + struct buffer_data_page *page; /* Actual data page */ +}; + +/* + * The buffer page counters, write and entries, must be reset + * atomically when crossing page boundaries. To synchronize this + * update, two counters are inserted into the number. 
One is + * the actual counter for the write position or count on the page. + * + * The other is a counter of updaters. Before an update happens + * the update partition of the counter is incremented. This will + * allow the updater to update the counter atomically. + * + * The counter is 20 bits, and the state data is 12. + */ +#define RB_WRITE_MASK 0xfffff +#define RB_WRITE_INTCNT (1 << 20) + +static void rb_init_page(struct buffer_data_page *bpage) +{ + local_set(&bpage->commit, 0); +} + +static __always_inline unsigned int rb_page_commit(struct buffer_page *bpage) +{ + return local_read(&bpage->page->commit); +} + +static void free_buffer_page(struct buffer_page *bpage) +{ + free_page((unsigned long)bpage->page); + kfree(bpage); +} + +/* + * We need to fit the time_stamp delta into 27 bits. + */ +static inline bool test_time_stamp(u64 delta) +{ + return !!(delta & TS_DELTA_TEST); +} + +#define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE) + +/* Max payload is BUF_PAGE_SIZE - header (8bytes) */ +#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2)) + +int ring_buffer_print_page_header(struct trace_seq *s) +{ + struct buffer_data_page field; + + trace_seq_printf(s, "\tfield: u64 timestamp;\t" + "offset:0;\tsize:%u;\tsigned:%u;\n", + (unsigned int)sizeof(field.time_stamp), + (unsigned int)is_signed_type(u64)); + + trace_seq_printf(s, "\tfield: local_t commit;\t" + "offset:%u;\tsize:%u;\tsigned:%u;\n", + (unsigned int)offsetof(typeof(field), commit), + (unsigned int)sizeof(field.commit), + (unsigned int)is_signed_type(long)); + + trace_seq_printf(s, "\tfield: int overwrite;\t" + "offset:%u;\tsize:%u;\tsigned:%u;\n", + (unsigned int)offsetof(typeof(field), commit), + 1, + (unsigned int)is_signed_type(long)); + + trace_seq_printf(s, "\tfield: char data;\t" + "offset:%u;\tsize:%u;\tsigned:%u;\n", + (unsigned int)offsetof(typeof(field), data), + (unsigned int)BUF_PAGE_SIZE, + (unsigned int)is_signed_type(char)); + + return !trace_seq_has_overflowed(s); +} + +struct rb_irq_work { + struct irq_work work; + wait_queue_head_t waiters; + wait_queue_head_t full_waiters; + long wait_index; + bool waiters_pending; + bool full_waiters_pending; + bool wakeup_full; +}; + +/* + * Structure to hold event state and handle nested events. + */ +struct rb_event_info { + u64 ts; + u64 delta; + u64 before; + u64 after; + unsigned long length; + struct buffer_page *tail_page; + int add_timestamp; +}; + +/* + * Used for the add_timestamp + * NONE + * EXTEND - wants a time extend + * ABSOLUTE - the buffer requests all events to have absolute time stamps + * FORCE - force a full time stamp. + */ +enum { + RB_ADD_STAMP_NONE = 0, + RB_ADD_STAMP_EXTEND = BIT(1), + RB_ADD_STAMP_ABSOLUTE = BIT(2), + RB_ADD_STAMP_FORCE = BIT(3) +}; +/* + * Used for which event context the event is in. + * TRANSITION = 0 + * NMI = 1 + * IRQ = 2 + * SOFTIRQ = 3 + * NORMAL = 4 + * + * See trace_recursive_lock() comment below for more details. + */ +enum { + RB_CTX_TRANSITION, + RB_CTX_NMI, + RB_CTX_IRQ, + RB_CTX_SOFTIRQ, + RB_CTX_NORMAL, + RB_CTX_MAX +}; + +#if BITS_PER_LONG == 32 +#define RB_TIME_32 +#endif + +/* To test on 64 bit machines */ +//#define RB_TIME_32 + +#ifdef RB_TIME_32 + +struct rb_time_struct { + local_t cnt; + local_t top; + local_t bottom; + local_t msb; +}; +#else +#include <asm/local64.h> +struct rb_time_struct { + local64_t time; +}; +#endif +typedef struct rb_time_struct rb_time_t; + +#define MAX_NEST 5 + +/* + * head_page == tail_page && head == tail then buffer is empty. 
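+ *
+ * A rough sketch of that emptiness test (illustrative only; the real
+ * checks also have to account for the reader page and padding events):
+ *
+ *    cpu_buffer->head_page == cpu_buffer->tail_page &&
+ *    cpu_buffer->head_page->read == rb_page_commit(cpu_buffer->tail_page)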
+ */ +struct ring_buffer_per_cpu { + int cpu; + atomic_t record_disabled; + atomic_t resize_disabled; + struct trace_buffer *buffer; + raw_spinlock_t reader_lock; /* serialize readers */ + arch_spinlock_t lock; + struct lock_class_key lock_key; + struct buffer_data_page *free_page; + unsigned long nr_pages; + unsigned int current_context; + struct list_head *pages; + struct buffer_page *head_page; /* read from head */ + struct buffer_page *tail_page; /* write to tail */ + struct buffer_page *commit_page; /* committed pages */ + struct buffer_page *reader_page; + unsigned long lost_events; + unsigned long last_overrun; + unsigned long nest; + local_t entries_bytes; + local_t entries; + local_t overrun; + local_t commit_overrun; + local_t dropped_events; + local_t committing; + local_t commits; + local_t pages_touched; + local_t pages_lost; + local_t pages_read; + long last_pages_touch; + size_t shortest_full; + unsigned long read; + unsigned long read_bytes; + rb_time_t write_stamp; + rb_time_t before_stamp; + u64 event_stamp[MAX_NEST]; + u64 read_stamp; + /* pages removed since last reset */ + unsigned long pages_removed; + /* ring buffer pages to update, > 0 to add, < 0 to remove */ + long nr_pages_to_update; + struct list_head new_pages; /* new pages to add */ + struct work_struct update_pages_work; + struct completion update_done; + + struct rb_irq_work irq_work; +}; + +struct trace_buffer { + unsigned flags; + int cpus; + atomic_t record_disabled; + atomic_t resizing; + cpumask_var_t cpumask; + + struct lock_class_key *reader_lock_key; + + struct mutex mutex; + + struct ring_buffer_per_cpu **buffers; + + struct hlist_node node; + u64 (*clock)(void); + + struct rb_irq_work irq_work; + bool time_stamp_abs; +}; + +struct ring_buffer_iter { + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long head; + unsigned long next_event; + struct buffer_page *head_page; + struct buffer_page *cache_reader_page; + unsigned long cache_read; + unsigned long cache_pages_removed; + u64 read_stamp; + u64 page_stamp; + struct ring_buffer_event *event; + int missed_events; +}; + +#ifdef RB_TIME_32 + +/* + * On 32 bit machines, local64_t is very expensive. As the ring + * buffer doesn't need all the features of a true 64 bit atomic, + * on 32 bit, it uses these functions (64 still uses local64_t). + * + * For the ring buffer, 64 bit required operations for the time is + * the following: + * + * - Reads may fail if it interrupted a modification of the time stamp. + * It will succeed if it did not interrupt another write even if + * the read itself is interrupted by a write. + * It returns whether it was successful or not. + * + * - Writes always succeed and will overwrite other writes and writes + * that were done by events interrupting the current write. + * + * - A write followed by a read of the same time stamp will always succeed, + * but may not contain the same value. + * + * - A cmpxchg will fail if it interrupted another write or cmpxchg. + * Other than that, it acts like a normal cmpxchg. + * + * The 60 bit time stamp is broken up by 30 bits in a top and bottom half + * (bottom being the least significant 30 bits of the 60 bit time stamp). + * + * The two most significant bits of each half holds a 2 bit counter (0-3). + * Each update will increment this counter by one. + * When reading the top and bottom, if the two counter bits match then the + * top and bottom together make a valid 60 bit number. 
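+ *
+ * As an illustration (see rb_time_split() below), a value v is stored
+ * roughly as
+ *
+ *    bottom = v & ((1 << 30) - 1);
+ *    top    = (v >> 30) & ((1 << 30) - 1);
+ *    msb    = v >> 60;
+ *
+ * with the same 2 bit counter or'd into bits 30-31 of each word, so a
+ * reader can tell whether the three words belong to the same update.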
+ */ +#define RB_TIME_SHIFT 30 +#define RB_TIME_VAL_MASK ((1 << RB_TIME_SHIFT) - 1) +#define RB_TIME_MSB_SHIFT 60 + +static inline int rb_time_cnt(unsigned long val) +{ + return (val >> RB_TIME_SHIFT) & 3; +} + +static inline u64 rb_time_val(unsigned long top, unsigned long bottom) +{ + u64 val; + + val = top & RB_TIME_VAL_MASK; + val <<= RB_TIME_SHIFT; + val |= bottom & RB_TIME_VAL_MASK; + + return val; +} + +static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *cnt) +{ + unsigned long top, bottom, msb; + unsigned long c; + + /* + * If the read is interrupted by a write, then the cnt will + * be different. Loop until both top and bottom have been read + * without interruption. + */ + do { + c = local_read(&t->cnt); + top = local_read(&t->top); + bottom = local_read(&t->bottom); + msb = local_read(&t->msb); + } while (c != local_read(&t->cnt)); + + *cnt = rb_time_cnt(top); + + /* If top, msb or bottom counts don't match, this interrupted a write */ + if (*cnt != rb_time_cnt(msb) || *cnt != rb_time_cnt(bottom)) + return false; + + /* The shift to msb will lose its cnt bits */ + *ret = rb_time_val(top, bottom) | ((u64)msb << RB_TIME_MSB_SHIFT); + return true; +} + +static bool rb_time_read(rb_time_t *t, u64 *ret) +{ + unsigned long cnt; + + return __rb_time_read(t, ret, &cnt); +} + +static inline unsigned long rb_time_val_cnt(unsigned long val, unsigned long cnt) +{ + return (val & RB_TIME_VAL_MASK) | ((cnt & 3) << RB_TIME_SHIFT); +} + +static inline void rb_time_split(u64 val, unsigned long *top, unsigned long *bottom, + unsigned long *msb) +{ + *top = (unsigned long)((val >> RB_TIME_SHIFT) & RB_TIME_VAL_MASK); + *bottom = (unsigned long)(val & RB_TIME_VAL_MASK); + *msb = (unsigned long)(val >> RB_TIME_MSB_SHIFT); +} + +static inline void rb_time_val_set(local_t *t, unsigned long val, unsigned long cnt) +{ + val = rb_time_val_cnt(val, cnt); + local_set(t, val); +} + +static void rb_time_set(rb_time_t *t, u64 val) +{ + unsigned long cnt, top, bottom, msb; + + rb_time_split(val, &top, &bottom, &msb); + + /* Writes always succeed with a valid number even if it gets interrupted. */ + do { + cnt = local_inc_return(&t->cnt); + rb_time_val_set(&t->top, top, cnt); + rb_time_val_set(&t->bottom, bottom, cnt); + rb_time_val_set(&t->msb, val >> RB_TIME_MSB_SHIFT, cnt); + } while (cnt != local_read(&t->cnt)); +} + +static inline bool +rb_time_read_cmpxchg(local_t *l, unsigned long expect, unsigned long set) +{ + return local_try_cmpxchg(l, &expect, set); +} + +#else /* 64 bits */ + +/* local64_t always succeeds */ + +static inline bool rb_time_read(rb_time_t *t, u64 *ret) +{ + *ret = local64_read(&t->time); + return true; +} +static void rb_time_set(rb_time_t *t, u64 val) +{ + local64_set(&t->time, val); +} +#endif + +/* + * Enable this to make sure that the event passed to + * ring_buffer_event_time_stamp() is not committed and also + * is on the buffer that it passed in. 
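+ *
+ * The check is compiled out unless the define below is uncommented;
+ * when enabled, verify_event() walks forward from the commit page and
+ * warns if the event does not sit between the commit and write indexes
+ * of one of the not-yet-committed pages.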
+ */ +//#define RB_VERIFY_EVENT +#ifdef RB_VERIFY_EVENT +static struct list_head *rb_list_head(struct list_head *list); +static void verify_event(struct ring_buffer_per_cpu *cpu_buffer, + void *event) +{ + struct buffer_page *page = cpu_buffer->commit_page; + struct buffer_page *tail_page = READ_ONCE(cpu_buffer->tail_page); + struct list_head *next; + long commit, write; + unsigned long addr = (unsigned long)event; + bool done = false; + int stop = 0; + + /* Make sure the event exists and is not committed yet */ + do { + if (page == tail_page || WARN_ON_ONCE(stop++ > 100)) + done = true; + commit = local_read(&page->page->commit); + write = local_read(&page->write); + if (addr >= (unsigned long)&page->page->data[commit] && + addr < (unsigned long)&page->page->data[write]) + return; + + next = rb_list_head(page->list.next); + page = list_entry(next, struct buffer_page, list); + } while (!done); + WARN_ON_ONCE(1); +} +#else +static inline void verify_event(struct ring_buffer_per_cpu *cpu_buffer, + void *event) +{ +} +#endif + +/* + * The absolute time stamp drops the 5 MSBs and some clocks may + * require them. The rb_fix_abs_ts() will take a previous full + * time stamp, and add the 5 MSB of that time stamp on to the + * saved absolute time stamp. Then they are compared in case of + * the unlikely event that the latest time stamp incremented + * the 5 MSB. + */ +static inline u64 rb_fix_abs_ts(u64 abs, u64 save_ts) +{ + if (save_ts & TS_MSB) { + abs |= save_ts & TS_MSB; + /* Check for overflow */ + if (unlikely(abs < save_ts)) + abs += 1ULL << 59; + } + return abs; +} + +static inline u64 rb_time_stamp(struct trace_buffer *buffer); + +/** + * ring_buffer_event_time_stamp - return the event's current time stamp + * @buffer: The buffer that the event is on + * @event: the event to get the time stamp of + * + * Note, this must be called after @event is reserved, and before it is + * committed to the ring buffer. And must be called from the same + * context where the event was reserved (normal, softirq, irq, etc). + * + * Returns the time stamp associated with the current event. + * If the event has an extended time stamp, then that is used as + * the time stamp to return. + * In the highly unlikely case that the event was nested more than + * the max nesting, then the write_stamp of the buffer is returned, + * otherwise current time is returned, but that really neither of + * the last two cases should ever happen. 
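+ *
+ * A minimal usage sketch (hypothetical caller; assumes the usual
+ * reserve/commit API of this file's version):
+ *
+ *    struct ring_buffer_event *event;
+ *
+ *    event = ring_buffer_lock_reserve(buffer, len);
+ *    if (event) {
+ *        u64 ts = ring_buffer_event_time_stamp(buffer, event);
+ *        ...
+ *        ring_buffer_unlock_commit(buffer);
+ *    }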
+ */ +u64 ring_buffer_event_time_stamp(struct trace_buffer *buffer, + struct ring_buffer_event *event) +{ + struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[smp_processor_id()]; + unsigned int nest; + u64 ts; + + /* If the event includes an absolute time, then just use that */ + if (event->type_len == RINGBUF_TYPE_TIME_STAMP) { + ts = rb_event_time_stamp(event); + return rb_fix_abs_ts(ts, cpu_buffer->tail_page->page->time_stamp); + } + + nest = local_read(&cpu_buffer->committing); + verify_event(cpu_buffer, event); + if (WARN_ON_ONCE(!nest)) + goto fail; + + /* Read the current saved nesting level time stamp */ + if (likely(--nest < MAX_NEST)) + return cpu_buffer->event_stamp[nest]; + + /* Shouldn't happen, warn if it does */ + WARN_ONCE(1, "nest (%d) greater than max", nest); + + fail: + /* Can only fail on 32 bit */ + if (!rb_time_read(&cpu_buffer->write_stamp, &ts)) + /* Screw it, just read the current time */ + ts = rb_time_stamp(cpu_buffer->buffer); + + return ts; +} + +/** + * ring_buffer_nr_pages - get the number of buffer pages in the ring buffer + * @buffer: The ring_buffer to get the number of pages from + * @cpu: The cpu of the ring_buffer to get the number of pages from + * + * Returns the number of pages used by a per_cpu buffer of the ring buffer. + */ +size_t ring_buffer_nr_pages(struct trace_buffer *buffer, int cpu) +{ + return buffer->buffers[cpu]->nr_pages; +} + +/** + * ring_buffer_nr_dirty_pages - get the number of used pages in the ring buffer + * @buffer: The ring_buffer to get the number of pages from + * @cpu: The cpu of the ring_buffer to get the number of pages from + * + * Returns the number of pages that have content in the ring buffer. + */ +size_t ring_buffer_nr_dirty_pages(struct trace_buffer *buffer, int cpu) +{ + size_t read; + size_t lost; + size_t cnt; + + read = local_read(&buffer->buffers[cpu]->pages_read); + lost = local_read(&buffer->buffers[cpu]->pages_lost); + cnt = local_read(&buffer->buffers[cpu]->pages_touched); + + if (WARN_ON_ONCE(cnt < lost)) + return 0; + + cnt -= lost; + + /* The reader can read an empty page, but not more than that */ + if (cnt < read) { + WARN_ON_ONCE(read > cnt + 1); + return 0; + } + + return cnt - read; +} + +static __always_inline bool full_hit(struct trace_buffer *buffer, int cpu, int full) +{ + struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; + size_t nr_pages; + size_t dirty; + + nr_pages = cpu_buffer->nr_pages; + if (!nr_pages || !full) + return true; + + /* + * Add one as dirty will never equal nr_pages, as the sub-buffer + * that the writer is on is not counted as dirty. + * This is needed if "buffer_percent" is set to 100. + */ + dirty = ring_buffer_nr_dirty_pages(buffer, cpu) + 1; + + return (dirty * 100) >= (full * nr_pages); +} + +/* + * rb_wake_up_waiters - wake up tasks waiting for ring buffer input + * + * Schedules a delayed work to wake up any task that is blocked on the + * ring buffer waiters queue. 
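+ *
+ * Waiters on the full_waiters queue are, in practice, only woken once
+ * full_hit() above is satisfied; e.g. with an 8 page per-cpu buffer and
+ * full = 50, roughly half of the pages (3 dirty pages plus the page
+ * currently being written) must contain data before the wake up.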
+ */ +static void rb_wake_up_waiters(struct irq_work *work) +{ + struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work); + + wake_up_all(&rbwork->waiters); + if (rbwork->full_waiters_pending || rbwork->wakeup_full) { + rbwork->wakeup_full = false; + rbwork->full_waiters_pending = false; + wake_up_all(&rbwork->full_waiters); + } +} + +/** + * ring_buffer_wake_waiters - wake up any waiters on this ring buffer + * @buffer: The ring buffer to wake waiters on + * @cpu: The CPU buffer to wake waiters on + * + * In the case of a file that represents a ring buffer is closing, + * it is prudent to wake up any waiters that are on this. + */ +void ring_buffer_wake_waiters(struct trace_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + struct rb_irq_work *rbwork; + + if (!buffer) + return; + + if (cpu == RING_BUFFER_ALL_CPUS) { + + /* Wake up individual ones too. One level recursion */ + for_each_buffer_cpu(buffer, cpu) + ring_buffer_wake_waiters(buffer, cpu); + + rbwork = &buffer->irq_work; + } else { + if (WARN_ON_ONCE(!buffer->buffers)) + return; + if (WARN_ON_ONCE(cpu >= nr_cpu_ids)) + return; + + cpu_buffer = buffer->buffers[cpu]; + /* The CPU buffer may not have been initialized yet */ + if (!cpu_buffer) + return; + rbwork = &cpu_buffer->irq_work; + } + + rbwork->wait_index++; + /* make sure the waiters see the new index */ + smp_wmb(); + + /* This can be called in any context */ + irq_work_queue(&rbwork->work); +} + +/** + * ring_buffer_wait - wait for input to the ring buffer + * @buffer: buffer to wait on + * @cpu: the cpu buffer to wait on + * @full: wait until the percentage of pages are available, if @cpu != RING_BUFFER_ALL_CPUS + * + * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon + * as data is added to any of the @buffer's cpu buffers. Otherwise + * it will wait for data to be added to a specific cpu buffer. + */ +int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full) +{ + struct ring_buffer_per_cpu *cpu_buffer; + DEFINE_WAIT(wait); + struct rb_irq_work *work; + long wait_index; + int ret = 0; + + /* + * Depending on what the caller is waiting for, either any + * data in any cpu buffer, or a specific buffer, put the + * caller on the appropriate wait queue. + */ + if (cpu == RING_BUFFER_ALL_CPUS) { + work = &buffer->irq_work; + /* Full only makes sense on per cpu reads */ + full = 0; + } else { + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return -ENODEV; + cpu_buffer = buffer->buffers[cpu]; + work = &cpu_buffer->irq_work; + } + + wait_index = READ_ONCE(work->wait_index); + + while (true) { + if (full) + prepare_to_wait(&work->full_waiters, &wait, TASK_INTERRUPTIBLE); + else + prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE); + + /* + * The events can happen in critical sections where + * checking a work queue can cause deadlocks. + * After adding a task to the queue, this flag is set + * only to notify events to try to wake up the queue + * using irq_work. + * + * We don't clear it even if the buffer is no longer + * empty. The flag only causes the next event to run + * irq_work to do the work queue wake up. The worse + * that can happen if we race with !trace_empty() is that + * an event will cause an irq_work to try to wake up + * an empty queue. + * + * There's no reason to protect this flag either, as + * the work queue and irq_work logic will do the necessary + * synchronization for the wake ups. The only thing + * that is necessary is that the wake up happens after + * a task has been queued. 
It's OK for spurious wake ups. + */ + if (full) + work->full_waiters_pending = true; + else + work->waiters_pending = true; + + if (signal_pending(current)) { + ret = -EINTR; + break; + } + + if (cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) + break; + + if (cpu != RING_BUFFER_ALL_CPUS && + !ring_buffer_empty_cpu(buffer, cpu)) { + unsigned long flags; + bool pagebusy; + bool done; + + if (!full) + break; + + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page; + done = !pagebusy && full_hit(buffer, cpu, full); + + if (!cpu_buffer->shortest_full || + cpu_buffer->shortest_full > full) + cpu_buffer->shortest_full = full; + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + if (done) + break; + } + + schedule(); + + /* Make sure to see the new wait index */ + smp_rmb(); + if (wait_index != work->wait_index) + break; + } + + if (full) + finish_wait(&work->full_waiters, &wait); + else + finish_wait(&work->waiters, &wait); + + return ret; +} + +/** + * ring_buffer_poll_wait - poll on buffer input + * @buffer: buffer to wait on + * @cpu: the cpu buffer to wait on + * @filp: the file descriptor + * @poll_table: The poll descriptor + * @full: wait until the percentage of pages are available, if @cpu != RING_BUFFER_ALL_CPUS + * + * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon + * as data is added to any of the @buffer's cpu buffers. Otherwise + * it will wait for data to be added to a specific cpu buffer. + * + * Returns EPOLLIN | EPOLLRDNORM if data exists in the buffers, + * zero otherwise. + */ +__poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu, + struct file *filp, poll_table *poll_table, int full) +{ + struct ring_buffer_per_cpu *cpu_buffer; + struct rb_irq_work *work; + + if (cpu == RING_BUFFER_ALL_CPUS) { + work = &buffer->irq_work; + full = 0; + } else { + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return -EINVAL; + + cpu_buffer = buffer->buffers[cpu]; + work = &cpu_buffer->irq_work; + } + + if (full) { + poll_wait(filp, &work->full_waiters, poll_table); + work->full_waiters_pending = true; + if (!cpu_buffer->shortest_full || + cpu_buffer->shortest_full > full) + cpu_buffer->shortest_full = full; + } else { + poll_wait(filp, &work->waiters, poll_table); + work->waiters_pending = true; + } + + /* + * There's a tight race between setting the waiters_pending and + * checking if the ring buffer is empty. Once the waiters_pending bit + * is set, the next event will wake the task up, but we can get stuck + * if there's only a single event in. + * + * FIXME: Ideally, we need a memory barrier on the writer side as well, + * but adding a memory barrier to all events will cause too much of a + * performance hit in the fast path. We only need a memory barrier when + * the buffer goes from empty to having content. But as this race is + * extremely small, and it's not a problem if another event comes in, we + * will fix it later. + */ + smp_mb(); + + if (full) + return full_hit(buffer, cpu, full) ? 
EPOLLIN | EPOLLRDNORM : 0; + + if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) || + (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu))) + return EPOLLIN | EPOLLRDNORM; + return 0; +} + +/* buffer may be either ring_buffer or ring_buffer_per_cpu */ +#define RB_WARN_ON(b, cond) \ + ({ \ + int _____ret = unlikely(cond); \ + if (_____ret) { \ + if (__same_type(*(b), struct ring_buffer_per_cpu)) { \ + struct ring_buffer_per_cpu *__b = \ + (void *)b; \ + atomic_inc(&__b->buffer->record_disabled); \ + } else \ + atomic_inc(&b->record_disabled); \ + WARN_ON(1); \ + } \ + _____ret; \ + }) + +/* Up this if you want to test the TIME_EXTENTS and normalization */ +#define DEBUG_SHIFT 0 + +static inline u64 rb_time_stamp(struct trace_buffer *buffer) +{ + u64 ts; + + /* Skip retpolines :-( */ + if (IS_ENABLED(CONFIG_RETPOLINE) && likely(buffer->clock == trace_clock_local)) + ts = trace_clock_local(); + else + ts = buffer->clock(); + + /* shift to debug/test normalization and TIME_EXTENTS */ + return ts << DEBUG_SHIFT; +} + +u64 ring_buffer_time_stamp(struct trace_buffer *buffer) +{ + u64 time; + + preempt_disable_notrace(); + time = rb_time_stamp(buffer); + preempt_enable_notrace(); + + return time; +} +EXPORT_SYMBOL_GPL(ring_buffer_time_stamp); + +void ring_buffer_normalize_time_stamp(struct trace_buffer *buffer, + int cpu, u64 *ts) +{ + /* Just stupid testing the normalize function and deltas */ + *ts >>= DEBUG_SHIFT; +} +EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp); + +/* + * Making the ring buffer lockless makes things tricky. + * Although writes only happen on the CPU that they are on, + * and they only need to worry about interrupts. Reads can + * happen on any CPU. + * + * The reader page is always off the ring buffer, but when the + * reader finishes with a page, it needs to swap its page with + * a new one from the buffer. The reader needs to take from + * the head (writes go to the tail). But if a writer is in overwrite + * mode and wraps, it must push the head page forward. + * + * Here lies the problem. + * + * The reader must be careful to replace only the head page, and + * not another one. As described at the top of the file in the + * ASCII art, the reader sets its old page to point to the next + * page after head. It then sets the page after head to point to + * the old reader page. But if the writer moves the head page + * during this operation, the reader could end up with the tail. + * + * We use cmpxchg to help prevent this race. We also do something + * special with the page before head. We set the LSB to 1. + * + * When the writer must push the page forward, it will clear the + * bit that points to the head page, move the head, and then set + * the bit that points to the new head page. + * + * We also don't want an interrupt coming in and moving the head + * page on another writer. Thus we use the second LSB to catch + * that too. 
Thus: + * + * head->list->prev->next bit 1 bit 0 + * ------- ------- + * Normal page 0 0 + * Points to head page 0 1 + * New head page 1 0 + * + * Note we can not trust the prev pointer of the head page, because: + * + * +----+ +-----+ +-----+ + * | |------>| T |---X--->| N | + * | |<------| | | | + * +----+ +-----+ +-----+ + * ^ ^ | + * | +-----+ | | + * +----------| R |----------+ | + * | |<-----------+ + * +-----+ + * + * Key: ---X--> HEAD flag set in pointer + * T Tail page + * R Reader page + * N Next page + * + * (see __rb_reserve_next() to see where this happens) + * + * What the above shows is that the reader just swapped out + * the reader page with a page in the buffer, but before it + * could make the new header point back to the new page added + * it was preempted by a writer. The writer moved forward onto + * the new page added by the reader and is about to move forward + * again. + * + * You can see, it is legitimate for the previous pointer of + * the head (or any page) not to point back to itself. But only + * temporarily. + */ + +#define RB_PAGE_NORMAL 0UL +#define RB_PAGE_HEAD 1UL +#define RB_PAGE_UPDATE 2UL + + +#define RB_FLAG_MASK 3UL + +/* PAGE_MOVED is not part of the mask */ +#define RB_PAGE_MOVED 4UL + +/* + * rb_list_head - remove any bit + */ +static struct list_head *rb_list_head(struct list_head *list) +{ + unsigned long val = (unsigned long)list; + + return (struct list_head *)(val & ~RB_FLAG_MASK); +} + +/* + * rb_is_head_page - test if the given page is the head page + * + * Because the reader may move the head_page pointer, we can + * not trust what the head page is (it may be pointing to + * the reader page). But if the next page is a header page, + * its flags will be non zero. + */ +static inline int +rb_is_head_page(struct buffer_page *page, struct list_head *list) +{ + unsigned long val; + + val = (unsigned long)list->next; + + if ((val & ~RB_FLAG_MASK) != (unsigned long)&page->list) + return RB_PAGE_MOVED; + + return val & RB_FLAG_MASK; +} + +/* + * rb_is_reader_page + * + * The unique thing about the reader page, is that, if the + * writer is ever on it, the previous pointer never points + * back to the reader page. + */ +static bool rb_is_reader_page(struct buffer_page *page) +{ + struct list_head *list = page->list.prev; + + return rb_list_head(list->next) != &page->list; +} + +/* + * rb_set_list_to_head - set a list_head to be pointing to head. + */ +static void rb_set_list_to_head(struct list_head *list) +{ + unsigned long *ptr; + + ptr = (unsigned long *)&list->next; + *ptr |= RB_PAGE_HEAD; + *ptr &= ~RB_PAGE_UPDATE; +} + +/* + * rb_head_page_activate - sets up head page + */ +static void rb_head_page_activate(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct buffer_page *head; + + head = cpu_buffer->head_page; + if (!head) + return; + + /* + * Set the previous list pointer to have the HEAD flag. + */ + rb_set_list_to_head(head->list.prev); +} + +static void rb_list_head_clear(struct list_head *list) +{ + unsigned long *ptr = (unsigned long *)&list->next; + + *ptr &= ~RB_FLAG_MASK; +} + +/* + * rb_head_page_deactivate - clears head page ptr (for free list) + */ +static void +rb_head_page_deactivate(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct list_head *hd; + + /* Go through the whole list and clear any pointers found. 
*/ + rb_list_head_clear(cpu_buffer->pages); + + list_for_each(hd, cpu_buffer->pages) + rb_list_head_clear(hd); +} + +static int rb_head_page_set(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *head, + struct buffer_page *prev, + int old_flag, int new_flag) +{ + struct list_head *list; + unsigned long val = (unsigned long)&head->list; + unsigned long ret; + + list = &prev->list; + + val &= ~RB_FLAG_MASK; + + ret = cmpxchg((unsigned long *)&list->next, + val | old_flag, val | new_flag); + + /* check if the reader took the page */ + if ((ret & ~RB_FLAG_MASK) != val) + return RB_PAGE_MOVED; + + return ret & RB_FLAG_MASK; +} + +static int rb_head_page_set_update(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *head, + struct buffer_page *prev, + int old_flag) +{ + return rb_head_page_set(cpu_buffer, head, prev, + old_flag, RB_PAGE_UPDATE); +} + +static int rb_head_page_set_head(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *head, + struct buffer_page *prev, + int old_flag) +{ + return rb_head_page_set(cpu_buffer, head, prev, + old_flag, RB_PAGE_HEAD); +} + +static int rb_head_page_set_normal(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *head, + struct buffer_page *prev, + int old_flag) +{ + return rb_head_page_set(cpu_buffer, head, prev, + old_flag, RB_PAGE_NORMAL); +} + +static inline void rb_inc_page(struct buffer_page **bpage) +{ + struct list_head *p = rb_list_head((*bpage)->list.next); + + *bpage = list_entry(p, struct buffer_page, list); +} + +static struct buffer_page * +rb_set_head_page(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct buffer_page *head; + struct buffer_page *page; + struct list_head *list; + int i; + + if (RB_WARN_ON(cpu_buffer, !cpu_buffer->head_page)) + return NULL; + + /* sanity check */ + list = cpu_buffer->pages; + if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev->next) != list)) + return NULL; + + page = head = cpu_buffer->head_page; + /* + * It is possible that the writer moves the header behind + * where we started, and we miss in one loop. + * A second loop should grab the header, but we'll do + * three loops just because I'm paranoid. + */ + for (i = 0; i < 3; i++) { + do { + if (rb_is_head_page(page, page->list.prev)) { + cpu_buffer->head_page = page; + return page; + } + rb_inc_page(&page); + } while (page != head); + } + + RB_WARN_ON(cpu_buffer, 1); + + return NULL; +} + +static bool rb_head_page_replace(struct buffer_page *old, + struct buffer_page *new) +{ + unsigned long *ptr = (unsigned long *)&old->list.prev->next; + unsigned long val; + + val = *ptr & ~RB_FLAG_MASK; + val |= RB_PAGE_HEAD; + + return try_cmpxchg(ptr, &val, (unsigned long)&new->list); +} + +/* + * rb_tail_page_update - move the tail page forward + */ +static void rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *tail_page, + struct buffer_page *next_page) +{ + unsigned long old_entries; + unsigned long old_write; + + /* + * The tail page now needs to be moved forward. + * + * We need to reset the tail page, but without messing + * with possible erasing of data brought in by interrupts + * that have moved the tail page and are currently on it. + * + * We add a counter to the write field to denote this. + */ + old_write = local_add_return(RB_WRITE_INTCNT, &next_page->write); + old_entries = local_add_return(RB_WRITE_INTCNT, &next_page->entries); + + local_inc(&cpu_buffer->pages_touched); + /* + * Just make sure we have seen our old_write and synchronize + * with any interrupts that come in. 
+ */ + barrier(); + + /* + * If the tail page is still the same as what we think + * it is, then it is up to us to update the tail + * pointer. + */ + if (tail_page == READ_ONCE(cpu_buffer->tail_page)) { + /* Zero the write counter */ + unsigned long val = old_write & ~RB_WRITE_MASK; + unsigned long eval = old_entries & ~RB_WRITE_MASK; + + /* + * This will only succeed if an interrupt did + * not come in and change it. In which case, we + * do not want to modify it. + * + * We add (void) to let the compiler know that we do not care + * about the return value of these functions. We use the + * cmpxchg to only update if an interrupt did not already + * do it for us. If the cmpxchg fails, we don't care. + */ + (void)local_cmpxchg(&next_page->write, old_write, val); + (void)local_cmpxchg(&next_page->entries, old_entries, eval); + + /* + * No need to worry about races with clearing out the commit. + * it only can increment when a commit takes place. But that + * only happens in the outer most nested commit. + */ + local_set(&next_page->page->commit, 0); + + /* Again, either we update tail_page or an interrupt does */ + (void)cmpxchg(&cpu_buffer->tail_page, tail_page, next_page); + } +} + +static void rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *bpage) +{ + unsigned long val = (unsigned long)bpage; + + RB_WARN_ON(cpu_buffer, val & RB_FLAG_MASK); +} + +/** + * rb_check_pages - integrity check of buffer pages + * @cpu_buffer: CPU buffer with pages to test + * + * As a safety measure we check to make sure the data pages have not + * been corrupted. + */ +static void rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct list_head *head = rb_list_head(cpu_buffer->pages); + struct list_head *tmp; + + if (RB_WARN_ON(cpu_buffer, + rb_list_head(rb_list_head(head->next)->prev) != head)) + return; + + if (RB_WARN_ON(cpu_buffer, + rb_list_head(rb_list_head(head->prev)->next) != head)) + return; + + for (tmp = rb_list_head(head->next); tmp != head; tmp = rb_list_head(tmp->next)) { + if (RB_WARN_ON(cpu_buffer, + rb_list_head(rb_list_head(tmp->next)->prev) != tmp)) + return; + + if (RB_WARN_ON(cpu_buffer, + rb_list_head(rb_list_head(tmp->prev)->next) != tmp)) + return; + } +} + +static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, + long nr_pages, struct list_head *pages) +{ + struct buffer_page *bpage, *tmp; + bool user_thread = current->mm != NULL; + gfp_t mflags; + long i; + + /* + * Check if the available memory is there first. + * Note, si_mem_available() only gives us a rough estimate of available + * memory. It may not be accurate. But we don't care, we just want + * to prevent doing any allocation when it is obvious that it is + * not going to succeed. + */ + i = si_mem_available(); + if (i < nr_pages) + return -ENOMEM; + + /* + * __GFP_RETRY_MAYFAIL flag makes sure that the allocation fails + * gracefully without invoking oom-killer and the system is not + * destabilized. + */ + mflags = GFP_KERNEL | __GFP_RETRY_MAYFAIL; + + /* + * If a user thread allocates too much, and si_mem_available() + * reports there's enough memory, even though there is not. + * Make sure the OOM killer kills this thread. This can happen + * even with RETRY_MAYFAIL because another task may be doing + * an allocation after this task has taken all memory. + * This is the task the OOM killer needs to take out during this + * loop, even if it was triggered by an allocation somewhere else. 
+ */ + if (user_thread) + set_current_oom_origin(); + for (i = 0; i < nr_pages; i++) { + struct page *page; + + bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), + mflags, cpu_to_node(cpu_buffer->cpu)); + if (!bpage) + goto free_pages; + + rb_check_bpage(cpu_buffer, bpage); + + list_add(&bpage->list, pages); + + page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), mflags, 0); + if (!page) + goto free_pages; + bpage->page = page_address(page); + rb_init_page(bpage->page); + + if (user_thread && fatal_signal_pending(current)) + goto free_pages; + } + if (user_thread) + clear_current_oom_origin(); + + return 0; + +free_pages: + list_for_each_entry_safe(bpage, tmp, pages, list) { + list_del_init(&bpage->list); + free_buffer_page(bpage); + } + if (user_thread) + clear_current_oom_origin(); + + return -ENOMEM; +} + +static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, + unsigned long nr_pages) +{ + LIST_HEAD(pages); + + WARN_ON(!nr_pages); + + if (__rb_allocate_pages(cpu_buffer, nr_pages, &pages)) + return -ENOMEM; + + /* + * The ring buffer page list is a circular list that does not + * start and end with a list head. All page list items point to + * other pages. + */ + cpu_buffer->pages = pages.next; + list_del(&pages); + + cpu_buffer->nr_pages = nr_pages; + + rb_check_pages(cpu_buffer); + + return 0; +} + +static struct ring_buffer_per_cpu * +rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + struct buffer_page *bpage; + struct page *page; + int ret; + + cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()), + GFP_KERNEL, cpu_to_node(cpu)); + if (!cpu_buffer) + return NULL; + + cpu_buffer->cpu = cpu; + cpu_buffer->buffer = buffer; + raw_spin_lock_init(&cpu_buffer->reader_lock); + lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); + cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; + INIT_WORK(&cpu_buffer->update_pages_work, update_pages_handler); + init_completion(&cpu_buffer->update_done); + init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters); + init_waitqueue_head(&cpu_buffer->irq_work.waiters); + init_waitqueue_head(&cpu_buffer->irq_work.full_waiters); + + bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), + GFP_KERNEL, cpu_to_node(cpu)); + if (!bpage) + goto fail_free_buffer; + + rb_check_bpage(cpu_buffer, bpage); + + cpu_buffer->reader_page = bpage; + page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, 0); + if (!page) + goto fail_free_reader; + bpage->page = page_address(page); + rb_init_page(bpage->page); + + INIT_LIST_HEAD(&cpu_buffer->reader_page->list); + INIT_LIST_HEAD(&cpu_buffer->new_pages); + + ret = rb_allocate_pages(cpu_buffer, nr_pages); + if (ret < 0) + goto fail_free_reader; + + cpu_buffer->head_page + = list_entry(cpu_buffer->pages, struct buffer_page, list); + cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page; + + rb_head_page_activate(cpu_buffer); + + return cpu_buffer; + + fail_free_reader: + free_buffer_page(cpu_buffer->reader_page); + + fail_free_buffer: + kfree(cpu_buffer); + return NULL; +} + +static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct list_head *head = cpu_buffer->pages; + struct buffer_page *bpage, *tmp; + + irq_work_sync(&cpu_buffer->irq_work.work); + + free_buffer_page(cpu_buffer->reader_page); + + if (head) { + rb_head_page_deactivate(cpu_buffer); + + list_for_each_entry_safe(bpage, tmp, head, list) { + 
list_del_init(&bpage->list); + free_buffer_page(bpage); + } + bpage = list_entry(head, struct buffer_page, list); + free_buffer_page(bpage); + } + + free_page((unsigned long)cpu_buffer->free_page); + + kfree(cpu_buffer); +} + +/** + * __ring_buffer_alloc - allocate a new ring_buffer + * @size: the size in bytes per cpu that is needed. + * @flags: attributes to set for the ring buffer. + * @key: ring buffer reader_lock_key. + * + * Currently the only flag that is available is the RB_FL_OVERWRITE + * flag. This flag means that the buffer will overwrite old data + * when the buffer wraps. If this flag is not set, the buffer will + * drop data when the tail hits the head. + */ +struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, + struct lock_class_key *key) +{ + struct trace_buffer *buffer; + long nr_pages; + int bsize; + int cpu; + int ret; + + /* keep it in its own cache line */ + buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()), + GFP_KERNEL); + if (!buffer) + return NULL; + + if (!zalloc_cpumask_var(&buffer->cpumask, GFP_KERNEL)) + goto fail_free_buffer; + + nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); + buffer->flags = flags; + buffer->clock = trace_clock_local; + buffer->reader_lock_key = key; + + init_irq_work(&buffer->irq_work.work, rb_wake_up_waiters); + init_waitqueue_head(&buffer->irq_work.waiters); + + /* need at least two pages */ + if (nr_pages < 2) + nr_pages = 2; + + buffer->cpus = nr_cpu_ids; + + bsize = sizeof(void *) * nr_cpu_ids; + buffer->buffers = kzalloc(ALIGN(bsize, cache_line_size()), + GFP_KERNEL); + if (!buffer->buffers) + goto fail_free_cpumask; + + cpu = raw_smp_processor_id(); + cpumask_set_cpu(cpu, buffer->cpumask); + buffer->buffers[cpu] = rb_allocate_cpu_buffer(buffer, nr_pages, cpu); + if (!buffer->buffers[cpu]) + goto fail_free_buffers; + + ret = cpuhp_state_add_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node); + if (ret < 0) + goto fail_free_buffers; + + mutex_init(&buffer->mutex); + + return buffer; + + fail_free_buffers: + for_each_buffer_cpu(buffer, cpu) { + if (buffer->buffers[cpu]) + rb_free_cpu_buffer(buffer->buffers[cpu]); + } + kfree(buffer->buffers); + + fail_free_cpumask: + free_cpumask_var(buffer->cpumask); + + fail_free_buffer: + kfree(buffer); + return NULL; +} +EXPORT_SYMBOL_GPL(__ring_buffer_alloc); + +/** + * ring_buffer_free - free a ring buffer. + * @buffer: the buffer to free. 
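+ *
+ * A minimal allocation/free sketch (illustrative only):
+ *
+ *    struct trace_buffer *buf;
+ *
+ *    buf = ring_buffer_alloc(4 * PAGE_SIZE, RB_FL_OVERWRITE);
+ *    if (buf)
+ *        ring_buffer_free(buf);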
+ */ +void +ring_buffer_free(struct trace_buffer *buffer) +{ + int cpu; + + cpuhp_state_remove_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node); + + irq_work_sync(&buffer->irq_work.work); + + for_each_buffer_cpu(buffer, cpu) + rb_free_cpu_buffer(buffer->buffers[cpu]); + + kfree(buffer->buffers); + free_cpumask_var(buffer->cpumask); + + kfree(buffer); +} +EXPORT_SYMBOL_GPL(ring_buffer_free); + +void ring_buffer_set_clock(struct trace_buffer *buffer, + u64 (*clock)(void)) +{ + buffer->clock = clock; +} + +void ring_buffer_set_time_stamp_abs(struct trace_buffer *buffer, bool abs) +{ + buffer->time_stamp_abs = abs; +} + +bool ring_buffer_time_stamp_abs(struct trace_buffer *buffer) +{ + return buffer->time_stamp_abs; +} + +static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); + +static inline unsigned long rb_page_entries(struct buffer_page *bpage) +{ + return local_read(&bpage->entries) & RB_WRITE_MASK; +} + +static inline unsigned long rb_page_write(struct buffer_page *bpage) +{ + return local_read(&bpage->write) & RB_WRITE_MASK; +} + +static bool +rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages) +{ + struct list_head *tail_page, *to_remove, *next_page; + struct buffer_page *to_remove_page, *tmp_iter_page; + struct buffer_page *last_page, *first_page; + unsigned long nr_removed; + unsigned long head_bit; + int page_entries; + + head_bit = 0; + + raw_spin_lock_irq(&cpu_buffer->reader_lock); + atomic_inc(&cpu_buffer->record_disabled); + /* + * We don't race with the readers since we have acquired the reader + * lock. We also don't race with writers after disabling recording. + * This makes it easy to figure out the first and the last page to be + * removed from the list. We unlink all the pages in between including + * the first and last pages. This is done in a busy loop so that we + * lose the least number of traces. + * The pages are freed after we restart recording and unlock readers. + */ + tail_page = &cpu_buffer->tail_page->list; + + /* + * tail page might be on reader page, we remove the next page + * from the ring buffer + */ + if (cpu_buffer->tail_page == cpu_buffer->reader_page) + tail_page = rb_list_head(tail_page->next); + to_remove = tail_page; + + /* start of pages to remove */ + first_page = list_entry(rb_list_head(to_remove->next), + struct buffer_page, list); + + for (nr_removed = 0; nr_removed < nr_pages; nr_removed++) { + to_remove = rb_list_head(to_remove)->next; + head_bit |= (unsigned long)to_remove & RB_PAGE_HEAD; + } + /* Read iterators need to reset themselves when some pages removed */ + cpu_buffer->pages_removed += nr_removed; + + next_page = rb_list_head(to_remove)->next; + + /* + * Now we remove all pages between tail_page and next_page. 
+ * Make sure that we have head_bit value preserved for the + * next page + */ + tail_page->next = (struct list_head *)((unsigned long)next_page | + head_bit); + next_page = rb_list_head(next_page); + next_page->prev = tail_page; + + /* make sure pages points to a valid page in the ring buffer */ + cpu_buffer->pages = next_page; + + /* update head page */ + if (head_bit) + cpu_buffer->head_page = list_entry(next_page, + struct buffer_page, list); + + /* pages are removed, resume tracing and then free the pages */ + atomic_dec(&cpu_buffer->record_disabled); + raw_spin_unlock_irq(&cpu_buffer->reader_lock); + + RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)); + + /* last buffer page to remove */ + last_page = list_entry(rb_list_head(to_remove), struct buffer_page, + list); + tmp_iter_page = first_page; + + do { + cond_resched(); + + to_remove_page = tmp_iter_page; + rb_inc_page(&tmp_iter_page); + + /* update the counters */ + page_entries = rb_page_entries(to_remove_page); + if (page_entries) { + /* + * If something was added to this page, it was full + * since it is not the tail page. So we deduct the + * bytes consumed in ring buffer from here. + * Increment overrun to account for the lost events. + */ + local_add(page_entries, &cpu_buffer->overrun); + local_sub(rb_page_commit(to_remove_page), &cpu_buffer->entries_bytes); + local_inc(&cpu_buffer->pages_lost); + } + + /* + * We have already removed references to this list item, just + * free up the buffer_page and its page + */ + free_buffer_page(to_remove_page); + nr_removed--; + + } while (to_remove_page != last_page); + + RB_WARN_ON(cpu_buffer, nr_removed); + + return nr_removed == 0; +} + +static bool +rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct list_head *pages = &cpu_buffer->new_pages; + unsigned long flags; + bool success; + int retries; + + /* Can be called at early boot up, where interrupts must not been enabled */ + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + /* + * We are holding the reader lock, so the reader page won't be swapped + * in the ring buffer. Now we are racing with the writer trying to + * move head page and the tail page. + * We are going to adapt the reader page update process where: + * 1. We first splice the start and end of list of new pages between + * the head page and its previous page. + * 2. We cmpxchg the prev_page->next to point from head page to the + * start of new pages list. + * 3. Finally, we update the head->prev to the end of new list. + * + * We will try this process 10 times, to make sure that we don't keep + * spinning. 
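+ *
+ * Note that the cmpxchg in step 2 only succeeds while prev_page->next
+ * still carries the HEAD flag for the old head page; if a writer moved
+ * the head page in the meantime, the exchange fails and we retry with
+ * the new head page.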
+ */ + retries = 10; + success = false; + while (retries--) { + struct list_head *head_page, *prev_page, *r; + struct list_head *last_page, *first_page; + struct list_head *head_page_with_bit; + struct buffer_page *hpage = rb_set_head_page(cpu_buffer); + + if (!hpage) + break; + head_page = &hpage->list; + prev_page = head_page->prev; + + first_page = pages->next; + last_page = pages->prev; + + head_page_with_bit = (struct list_head *) + ((unsigned long)head_page | RB_PAGE_HEAD); + + last_page->next = head_page_with_bit; + first_page->prev = prev_page; + + r = cmpxchg(&prev_page->next, head_page_with_bit, first_page); + + if (r == head_page_with_bit) { + /* + * yay, we replaced the page pointer to our new list, + * now, we just have to update to head page's prev + * pointer to point to end of list + */ + head_page->prev = last_page; + success = true; + break; + } + } + + if (success) + INIT_LIST_HEAD(pages); + /* + * If we weren't successful in adding in new pages, warn and stop + * tracing + */ + RB_WARN_ON(cpu_buffer, !success); + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + /* free pages if they weren't inserted */ + if (!success) { + struct buffer_page *bpage, *tmp; + list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages, + list) { + list_del_init(&bpage->list); + free_buffer_page(bpage); + } + } + return success; +} + +static void rb_update_pages(struct ring_buffer_per_cpu *cpu_buffer) +{ + bool success; + + if (cpu_buffer->nr_pages_to_update > 0) + success = rb_insert_pages(cpu_buffer); + else + success = rb_remove_pages(cpu_buffer, + -cpu_buffer->nr_pages_to_update); + + if (success) + cpu_buffer->nr_pages += cpu_buffer->nr_pages_to_update; +} + +static void update_pages_handler(struct work_struct *work) +{ + struct ring_buffer_per_cpu *cpu_buffer = container_of(work, + struct ring_buffer_per_cpu, update_pages_work); + rb_update_pages(cpu_buffer); + complete(&cpu_buffer->update_done); +} + +/** + * ring_buffer_resize - resize the ring buffer + * @buffer: the buffer to resize. + * @size: the new size. + * @cpu_id: the cpu buffer to resize + * + * Minimum size is 2 * BUF_PAGE_SIZE. + * + * Returns 0 on success and < 0 on failure. + */ +int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, + int cpu_id) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long nr_pages; + int cpu, err; + + /* + * Always succeed at resizing a non-existent buffer: + */ + if (!buffer) + return 0; + + /* Make sure the requested buffer exists */ + if (cpu_id != RING_BUFFER_ALL_CPUS && + !cpumask_test_cpu(cpu_id, buffer->cpumask)) + return 0; + + nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); + + /* we need a minimum of two pages */ + if (nr_pages < 2) + nr_pages = 2; + + /* prevent another thread from changing buffer sizes */ + mutex_lock(&buffer->mutex); + atomic_inc(&buffer->resizing); + + if (cpu_id == RING_BUFFER_ALL_CPUS) { + /* + * Don't succeed if resizing is disabled, as a reader might be + * manipulating the ring buffer and is expecting a sane state while + * this is true. 
+ */ + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + if (atomic_read(&cpu_buffer->resize_disabled)) { + err = -EBUSY; + goto out_err_unlock; + } + } + + /* calculate the pages to update */ + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + + cpu_buffer->nr_pages_to_update = nr_pages - + cpu_buffer->nr_pages; + /* + * nothing more to do for removing pages or no update + */ + if (cpu_buffer->nr_pages_to_update <= 0) + continue; + /* + * to add pages, make sure all new pages can be + * allocated without receiving ENOMEM + */ + INIT_LIST_HEAD(&cpu_buffer->new_pages); + if (__rb_allocate_pages(cpu_buffer, cpu_buffer->nr_pages_to_update, + &cpu_buffer->new_pages)) { + /* not enough memory for new pages */ + err = -ENOMEM; + goto out_err; + } + + cond_resched(); + } + + cpus_read_lock(); + /* + * Fire off all the required work handlers + * We can't schedule on offline CPUs, but it's not necessary + * since we can change their buffer sizes without any race. + */ + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + if (!cpu_buffer->nr_pages_to_update) + continue; + + /* Can't run something on an offline CPU. */ + if (!cpu_online(cpu)) { + rb_update_pages(cpu_buffer); + cpu_buffer->nr_pages_to_update = 0; + } else { + /* Run directly if possible. */ + migrate_disable(); + if (cpu != smp_processor_id()) { + migrate_enable(); + schedule_work_on(cpu, + &cpu_buffer->update_pages_work); + } else { + update_pages_handler(&cpu_buffer->update_pages_work); + migrate_enable(); + } + } + } + + /* wait for all the updates to complete */ + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + if (!cpu_buffer->nr_pages_to_update) + continue; + + if (cpu_online(cpu)) + wait_for_completion(&cpu_buffer->update_done); + cpu_buffer->nr_pages_to_update = 0; + } + + cpus_read_unlock(); + } else { + cpu_buffer = buffer->buffers[cpu_id]; + + if (nr_pages == cpu_buffer->nr_pages) + goto out; + + /* + * Don't succeed if resizing is disabled, as a reader might be + * manipulating the ring buffer and is expecting a sane state while + * this is true. + */ + if (atomic_read(&cpu_buffer->resize_disabled)) { + err = -EBUSY; + goto out_err_unlock; + } + + cpu_buffer->nr_pages_to_update = nr_pages - + cpu_buffer->nr_pages; + + INIT_LIST_HEAD(&cpu_buffer->new_pages); + if (cpu_buffer->nr_pages_to_update > 0 && + __rb_allocate_pages(cpu_buffer, cpu_buffer->nr_pages_to_update, + &cpu_buffer->new_pages)) { + err = -ENOMEM; + goto out_err; + } + + cpus_read_lock(); + + /* Can't run something on an offline CPU. */ + if (!cpu_online(cpu_id)) + rb_update_pages(cpu_buffer); + else { + /* Run directly if possible. */ + migrate_disable(); + if (cpu_id == smp_processor_id()) { + rb_update_pages(cpu_buffer); + migrate_enable(); + } else { + migrate_enable(); + schedule_work_on(cpu_id, + &cpu_buffer->update_pages_work); + wait_for_completion(&cpu_buffer->update_done); + } + } + + cpu_buffer->nr_pages_to_update = 0; + cpus_read_unlock(); + } + + out: + /* + * The ring buffer resize can happen with the ring buffer + * enabled, so that the update disturbs the tracing as little + * as possible. But if the buffer is disabled, we do not need + * to worry about that, and we can take the time to verify + * that the buffer is not corrupt. + */ + if (atomic_read(&buffer->record_disabled)) { + atomic_inc(&buffer->record_disabled); + /* + * Even though the buffer was disabled, we must make sure + * that it is truly disabled before calling rb_check_pages. 
+ * There could have been a race between checking + * record_disable and incrementing it. + */ + synchronize_rcu(); + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + rb_check_pages(cpu_buffer); + } + atomic_dec(&buffer->record_disabled); + } + + atomic_dec(&buffer->resizing); + mutex_unlock(&buffer->mutex); + return 0; + + out_err: + for_each_buffer_cpu(buffer, cpu) { + struct buffer_page *bpage, *tmp; + + cpu_buffer = buffer->buffers[cpu]; + cpu_buffer->nr_pages_to_update = 0; + + if (list_empty(&cpu_buffer->new_pages)) + continue; + + list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages, + list) { + list_del_init(&bpage->list); + free_buffer_page(bpage); + } + } + out_err_unlock: + atomic_dec(&buffer->resizing); + mutex_unlock(&buffer->mutex); + return err; +} +EXPORT_SYMBOL_GPL(ring_buffer_resize); + +void ring_buffer_change_overwrite(struct trace_buffer *buffer, int val) +{ + mutex_lock(&buffer->mutex); + if (val) + buffer->flags |= RB_FL_OVERWRITE; + else + buffer->flags &= ~RB_FL_OVERWRITE; + mutex_unlock(&buffer->mutex); +} +EXPORT_SYMBOL_GPL(ring_buffer_change_overwrite); + +static __always_inline void *__rb_page_index(struct buffer_page *bpage, unsigned index) +{ + return bpage->page->data + index; +} + +static __always_inline struct ring_buffer_event * +rb_reader_event(struct ring_buffer_per_cpu *cpu_buffer) +{ + return __rb_page_index(cpu_buffer->reader_page, + cpu_buffer->reader_page->read); +} + +static struct ring_buffer_event * +rb_iter_head_event(struct ring_buffer_iter *iter) +{ + struct ring_buffer_event *event; + struct buffer_page *iter_head_page = iter->head_page; + unsigned long commit; + unsigned length; + + if (iter->head != iter->next_event) + return iter->event; + + /* + * When the writer goes across pages, it issues a cmpxchg which + * is a mb(), which will synchronize with the rmb here. + * (see rb_tail_page_update() and __rb_reserve_next()) + */ + commit = rb_page_commit(iter_head_page); + smp_rmb(); + + /* An event needs to be at least 8 bytes in size */ + if (iter->head > commit - 8) + goto reset; + + event = __rb_page_index(iter_head_page, iter->head); + length = rb_event_length(event); + + /* + * READ_ONCE() doesn't work on functions and we don't want the + * compiler doing any crazy optimizations with length. + */ + barrier(); + + if ((iter->head + length) > commit || length > BUF_PAGE_SIZE) + /* Writer corrupted the read? */ + goto reset; + + memcpy(iter->event, event, length); + /* + * If the page stamp is still the same after this rmb() then the + * event was safely copied without the writer entering the page. 
+ */ + smp_rmb(); + + /* Make sure the page didn't change since we read this */ + if (iter->page_stamp != iter_head_page->page->time_stamp || + commit > rb_page_commit(iter_head_page)) + goto reset; + + iter->next_event = iter->head + length; + return iter->event; + reset: + /* Reset to the beginning */ + iter->page_stamp = iter->read_stamp = iter->head_page->page->time_stamp; + iter->head = 0; + iter->next_event = 0; + iter->missed_events = 1; + return NULL; +} + +/* Size is determined by what has been committed */ +static __always_inline unsigned rb_page_size(struct buffer_page *bpage) +{ + return rb_page_commit(bpage); +} + +static __always_inline unsigned +rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer) +{ + return rb_page_commit(cpu_buffer->commit_page); +} + +static __always_inline unsigned +rb_event_index(struct ring_buffer_event *event) +{ + unsigned long addr = (unsigned long)event; + + return (addr & ~PAGE_MASK) - BUF_PAGE_HDR_SIZE; +} + +static void rb_inc_iter(struct ring_buffer_iter *iter) +{ + struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; + + /* + * The iterator could be on the reader page (it starts there). + * But the head could have moved, since the reader was + * found. Check for this case and assign the iterator + * to the head page instead of next. + */ + if (iter->head_page == cpu_buffer->reader_page) + iter->head_page = rb_set_head_page(cpu_buffer); + else + rb_inc_page(&iter->head_page); + + iter->page_stamp = iter->read_stamp = iter->head_page->page->time_stamp; + iter->head = 0; + iter->next_event = 0; +} + +/* + * rb_handle_head_page - writer hit the head page + * + * Returns: +1 to retry page + * 0 to continue + * -1 on error + */ +static int +rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *tail_page, + struct buffer_page *next_page) +{ + struct buffer_page *new_head; + int entries; + int type; + int ret; + + entries = rb_page_entries(next_page); + + /* + * The hard part is here. We need to move the head + * forward, and protect against both readers on + * other CPUs and writers coming in via interrupts. + */ + type = rb_head_page_set_update(cpu_buffer, next_page, tail_page, + RB_PAGE_HEAD); + + /* + * type can be one of four: + * NORMAL - an interrupt already moved it for us + * HEAD - we are the first to get here. + * UPDATE - we are the interrupt interrupting + * a current move. + * MOVED - a reader on another CPU moved the next + * pointer to its reader page. Give up + * and try again. + */ + + switch (type) { + case RB_PAGE_HEAD: + /* + * We changed the head to UPDATE, thus + * it is our responsibility to update + * the counters. + */ + local_add(entries, &cpu_buffer->overrun); + local_sub(rb_page_commit(next_page), &cpu_buffer->entries_bytes); + local_inc(&cpu_buffer->pages_lost); + + /* + * The entries will be zeroed out when we move the + * tail page. + */ + + /* still more to do */ + break; + + case RB_PAGE_UPDATE: + /* + * This is an interrupt that interrupt the + * previous update. Still more to do. + */ + break; + case RB_PAGE_NORMAL: + /* + * An interrupt came in before the update + * and processed this for us. + * Nothing left to do. + */ + return 1; + case RB_PAGE_MOVED: + /* + * The reader is on another CPU and just did + * a swap with our next_page. + * Try again. + */ + return 1; + default: + RB_WARN_ON(cpu_buffer, 1); /* WTF??? */ + return -1; + } + + /* + * Now that we are here, the old head pointer is + * set to UPDATE. 
This will keep the reader from + * swapping the head page with the reader page. + * The reader (on another CPU) will spin till + * we are finished. + * + * We just need to protect against interrupts + * doing the job. We will set the next pointer + * to HEAD. After that, we set the old pointer + * to NORMAL, but only if it was HEAD before. + * otherwise we are an interrupt, and only + * want the outer most commit to reset it. + */ + new_head = next_page; + rb_inc_page(&new_head); + + ret = rb_head_page_set_head(cpu_buffer, new_head, next_page, + RB_PAGE_NORMAL); + + /* + * Valid returns are: + * HEAD - an interrupt came in and already set it. + * NORMAL - One of two things: + * 1) We really set it. + * 2) A bunch of interrupts came in and moved + * the page forward again. + */ + switch (ret) { + case RB_PAGE_HEAD: + case RB_PAGE_NORMAL: + /* OK */ + break; + default: + RB_WARN_ON(cpu_buffer, 1); + return -1; + } + + /* + * It is possible that an interrupt came in, + * set the head up, then more interrupts came in + * and moved it again. When we get back here, + * the page would have been set to NORMAL but we + * just set it back to HEAD. + * + * How do you detect this? Well, if that happened + * the tail page would have moved. + */ + if (ret == RB_PAGE_NORMAL) { + struct buffer_page *buffer_tail_page; + + buffer_tail_page = READ_ONCE(cpu_buffer->tail_page); + /* + * If the tail had moved passed next, then we need + * to reset the pointer. + */ + if (buffer_tail_page != tail_page && + buffer_tail_page != next_page) + rb_head_page_set_normal(cpu_buffer, new_head, + next_page, + RB_PAGE_HEAD); + } + + /* + * If this was the outer most commit (the one that + * changed the original pointer from HEAD to UPDATE), + * then it is up to us to reset it to NORMAL. + */ + if (type == RB_PAGE_HEAD) { + ret = rb_head_page_set_normal(cpu_buffer, next_page, + tail_page, + RB_PAGE_UPDATE); + if (RB_WARN_ON(cpu_buffer, + ret != RB_PAGE_UPDATE)) + return -1; + } + + return 0; +} + +static inline void +rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, + unsigned long tail, struct rb_event_info *info) +{ + struct buffer_page *tail_page = info->tail_page; + struct ring_buffer_event *event; + unsigned long length = info->length; + + /* + * Only the event that crossed the page boundary + * must fill the old tail_page with padding. + */ + if (tail >= BUF_PAGE_SIZE) { + /* + * If the page was filled, then we still need + * to update the real_end. Reset it to zero + * and the reader will ignore it. + */ + if (tail == BUF_PAGE_SIZE) + tail_page->real_end = 0; + + local_sub(length, &tail_page->write); + return; + } + + event = __rb_page_index(tail_page, tail); + + /* + * Save the original length to the meta data. + * This will be used by the reader to add lost event + * counter. + */ + tail_page->real_end = tail; + + /* + * If this event is bigger than the minimum size, then + * we need to be careful that we don't subtract the + * write counter enough to allow another writer to slip + * in on this page. + * We put in a discarded commit instead, to make sure + * that this space is not used again, and this space will + * not be accounted into 'entries_bytes'. + * + * If we are less than the minimum size, we don't need to + * worry about it. 
+ */ + if (tail > (BUF_PAGE_SIZE - RB_EVNT_MIN_SIZE)) { + /* No room for any events */ + + /* Mark the rest of the page with padding */ + rb_event_set_padding(event); + + /* Make sure the padding is visible before the write update */ + smp_wmb(); + + /* Set the write back to the previous setting */ + local_sub(length, &tail_page->write); + return; + } + + /* Put in a discarded event */ + event->array[0] = (BUF_PAGE_SIZE - tail) - RB_EVNT_HDR_SIZE; + event->type_len = RINGBUF_TYPE_PADDING; + /* time delta must be non zero */ + event->time_delta = 1; + + /* account for padding bytes */ + local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes); + + /* Make sure the padding is visible before the tail_page->write update */ + smp_wmb(); + + /* Set write to end of buffer */ + length = (tail + length) - BUF_PAGE_SIZE; + local_sub(length, &tail_page->write); +} + +static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer); + +/* + * This is the slow path, force gcc not to inline it. + */ +static noinline struct ring_buffer_event * +rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, + unsigned long tail, struct rb_event_info *info) +{ + struct buffer_page *tail_page = info->tail_page; + struct buffer_page *commit_page = cpu_buffer->commit_page; + struct trace_buffer *buffer = cpu_buffer->buffer; + struct buffer_page *next_page; + int ret; + + next_page = tail_page; + + rb_inc_page(&next_page); + + /* + * If for some reason, we had an interrupt storm that made + * it all the way around the buffer, bail, and warn + * about it. + */ + if (unlikely(next_page == commit_page)) { + local_inc(&cpu_buffer->commit_overrun); + goto out_reset; + } + + /* + * This is where the fun begins! + * + * We are fighting against races between a reader that + * could be on another CPU trying to swap its reader + * page with the buffer head. + * + * We are also fighting against interrupts coming in and + * moving the head or tail on us as well. + * + * If the next page is the head page then we have filled + * the buffer, unless the commit page is still on the + * reader page. + */ + if (rb_is_head_page(next_page, &tail_page->list)) { + + /* + * If the commit is not on the reader page, then + * move the header page. + */ + if (!rb_is_reader_page(cpu_buffer->commit_page)) { + /* + * If we are not in overwrite mode, + * this is easy, just stop here. + */ + if (!(buffer->flags & RB_FL_OVERWRITE)) { + local_inc(&cpu_buffer->dropped_events); + goto out_reset; + } + + ret = rb_handle_head_page(cpu_buffer, + tail_page, + next_page); + if (ret < 0) + goto out_reset; + if (ret) + goto out_again; + } else { + /* + * We need to be careful here too. The + * commit page could still be on the reader + * page. We could have a small buffer, and + * have filled up the buffer with events + * from interrupts and such, and wrapped. + * + * Note, if the tail page is also on the + * reader_page, we let it move out. + */ + if (unlikely((cpu_buffer->commit_page != + cpu_buffer->tail_page) && + (cpu_buffer->commit_page == + cpu_buffer->reader_page))) { + local_inc(&cpu_buffer->commit_overrun); + goto out_reset; + } + } + } + + rb_tail_page_update(cpu_buffer, tail_page, next_page); + + out_again: + + rb_reset_tail(cpu_buffer, tail, info); + + /* Commit what we have for now. 
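To make the tail-reset arithmetic above concrete, here is a worked example with illustrative numbers (the page and header sizes are assumptions for the example, not values defined in this file): take BUF_PAGE_SIZE = 4080 (a 4096-byte page minus a 16-byte page header) and RB_EVNT_HDR_SIZE = 4. A writer whose tail is at 4000 reserves length 96, so write reaches 4096 and overruns the page. rb_reset_tail() then stores a discarded padding event at offset 4000 with array[0] = (4080 - 4000) - 4 = 76 and time_delta = 1, covering bytes 4000..4079, and pulls write back by (4000 + 96) - 4080 = 16, leaving it exactly at the end of the page. Had the tail instead landed within RB_EVNT_MIN_SIZE of the page end, there would be no room even for a padding header, so the remainder is only marked via rb_event_set_padding() and write is pulled back to the original tail.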
*/ + rb_end_commit(cpu_buffer); + /* rb_end_commit() decs committing */ + local_inc(&cpu_buffer->committing); + + /* fail and let the caller try again */ + return ERR_PTR(-EAGAIN); + + out_reset: + /* reset write */ + rb_reset_tail(cpu_buffer, tail, info); + + return NULL; +} + +/* Slow path */ +static struct ring_buffer_event * +rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs) +{ + if (abs) + event->type_len = RINGBUF_TYPE_TIME_STAMP; + else + event->type_len = RINGBUF_TYPE_TIME_EXTEND; + + /* Not the first event on the page, or not delta? */ + if (abs || rb_event_index(event)) { + event->time_delta = delta & TS_MASK; + event->array[0] = delta >> TS_SHIFT; + } else { + /* nope, just zero it */ + event->time_delta = 0; + event->array[0] = 0; + } + + return skip_time_extend(event); +} + +#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK +static inline bool sched_clock_stable(void) +{ + return true; +} +#endif + +static void +rb_check_timestamp(struct ring_buffer_per_cpu *cpu_buffer, + struct rb_event_info *info) +{ + u64 write_stamp; + + WARN_ONCE(1, "Delta way too big! %llu ts=%llu before=%llu after=%llu write stamp=%llu\n%s", + (unsigned long long)info->delta, + (unsigned long long)info->ts, + (unsigned long long)info->before, + (unsigned long long)info->after, + (unsigned long long)(rb_time_read(&cpu_buffer->write_stamp, &write_stamp) ? write_stamp : 0), + sched_clock_stable() ? "" : + "If you just came from a suspend/resume,\n" + "please switch to the trace global clock:\n" + " echo global > /sys/kernel/tracing/trace_clock\n" + "or add trace_clock=global to the kernel command line\n"); +} + +static void rb_add_timestamp(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event **event, + struct rb_event_info *info, + u64 *delta, + unsigned int *length) +{ + bool abs = info->add_timestamp & + (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE); + + if (unlikely(info->delta > (1ULL << 59))) { + /* + * Some timers can use more than 59 bits, and when a timestamp + * is added to the buffer, it will lose those bits. + */ + if (abs && (info->ts & TS_MSB)) { + info->delta &= ABS_TS_MASK; + + /* did the clock go backwards */ + } else if (info->before == info->after && info->before > info->ts) { + /* not interrupted */ + static int once; + + /* + * This is possible with a recalibrating of the TSC. + * Do not produce a call stack, but just report it. + */ + if (!once) { + once++; + pr_warn("Ring buffer clock went backwards: %llu -> %llu\n", + info->before, info->ts); + } + } else + rb_check_timestamp(cpu_buffer, info); + if (!abs) + info->delta = 0; + } + *event = rb_add_time_stamp(*event, info->delta, abs); + *length -= RB_LEN_TIME_EXTEND; + *delta = 0; +} + +/** + * rb_update_event - update event type and data + * @cpu_buffer: The per cpu buffer of the @event + * @event: the event to update + * @info: The info to update the @event with (contains length and delta) + * + * Update the type and data fields of the @event. The length + * is the actual size that is written to the ring buffer, + * and with this, we can determine what to place into the + * data field. 
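The time-extend encoding used by rb_add_time_stamp() above splits a large delta between the 27-bit time_delta field and the first array word. The following stand-alone sketch reproduces that split in userspace; the 27-bit boundary comes from the comments in this file, while the demo struct and its field types are illustrative assumptions, not the kernel's definitions.

#include <stdint.h>
#include <stdio.h>

#define DEMO_TS_SHIFT 27                        /* low bits kept in time_delta */
#define DEMO_TS_MASK  ((1ULL << DEMO_TS_SHIFT) - 1)

/* Illustrative stand-in for a time-extend event, not the kernel struct. */
struct demo_extend {
        uint32_t time_delta;                    /* low 27 bits of the delta */
        uint32_t array0;                        /* remaining high bits */
};

static struct demo_extend pack(uint64_t delta)
{
        struct demo_extend e = {
                .time_delta = (uint32_t)(delta & DEMO_TS_MASK),
                .array0     = (uint32_t)(delta >> DEMO_TS_SHIFT),
        };
        return e;
}

static uint64_t unpack(struct demo_extend e)
{
        return ((uint64_t)e.array0 << DEMO_TS_SHIFT) | e.time_delta;
}

int main(void)
{
        uint64_t delta = 123456789012ULL;       /* too large for 27 bits */
        struct demo_extend e = pack(delta);

        /* Prints the original delta, showing the round trip is lossless. */
        printf("time_delta=%u array[0]=%u -> %llu\n",
               (unsigned)e.time_delta, (unsigned)e.array0,
               (unsigned long long)unpack(e));
        return 0;
}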
+ */ +static void +rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event, + struct rb_event_info *info) +{ + unsigned length = info->length; + u64 delta = info->delta; + unsigned int nest = local_read(&cpu_buffer->committing) - 1; + + if (!WARN_ON_ONCE(nest >= MAX_NEST)) + cpu_buffer->event_stamp[nest] = info->ts; + + /* + * If we need to add a timestamp, then we + * add it to the start of the reserved space. + */ + if (unlikely(info->add_timestamp)) + rb_add_timestamp(cpu_buffer, &event, info, &delta, &length); + + event->time_delta = delta; + length -= RB_EVNT_HDR_SIZE; + if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) { + event->type_len = 0; + event->array[0] = length; + } else + event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); +} + +static unsigned rb_calculate_event_length(unsigned length) +{ + struct ring_buffer_event event; /* Used only for sizeof array */ + + /* zero length can cause confusions */ + if (!length) + length++; + + if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) + length += sizeof(event.array[0]); + + length += RB_EVNT_HDR_SIZE; + length = ALIGN(length, RB_ARCH_ALIGNMENT); + + /* + * In case the time delta is larger than the 27 bits for it + * in the header, we need to add a timestamp. If another + * event comes in when trying to discard this one to increase + * the length, then the timestamp will be added in the allocated + * space of this event. If length is bigger than the size needed + * for the TIME_EXTEND, then padding has to be used. The events + * length must be either RB_LEN_TIME_EXTEND, or greater than or equal + * to RB_LEN_TIME_EXTEND + 8, as 8 is the minimum size for padding. + * As length is a multiple of 4, we only need to worry if it + * is 12 (RB_LEN_TIME_EXTEND + 4). + */ + if (length == RB_LEN_TIME_EXTEND + RB_ALIGNMENT) + length += RB_ALIGNMENT; + + return length; +} + +static inline bool +rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) +{ + unsigned long new_index, old_index; + struct buffer_page *bpage; + unsigned long addr; + + new_index = rb_event_index(event); + old_index = new_index + rb_event_ts_length(event); + addr = (unsigned long)event; + addr &= PAGE_MASK; + + bpage = READ_ONCE(cpu_buffer->tail_page); + + /* + * Make sure the tail_page is still the same and + * the next write location is the end of this event + */ + if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { + unsigned long write_mask = + local_read(&bpage->write) & ~RB_WRITE_MASK; + unsigned long event_length = rb_event_length(event); + + /* + * For the before_stamp to be different than the write_stamp + * to make sure that the next event adds an absolute + * value and does not rely on the saved write stamp, which + * is now going to be bogus. + * + * By setting the before_stamp to zero, the next event + * is not going to use the write_stamp and will instead + * create an absolute timestamp. This means there's no + * reason to update the wirte_stamp! + */ + rb_time_set(&cpu_buffer->before_stamp, 0); + + /* + * If an event were to come in now, it would see that the + * write_stamp and the before_stamp are different, and assume + * that this event just added itself before updating + * the write stamp. The interrupting event will fix the + * write stamp for us, and use an absolute timestamp. + */ + + /* + * This is on the tail page. It is possible that + * a write could come in and move the tail page + * and write to the next page. 
That is fine + * because we just shorten what is on this page. + */ + old_index += write_mask; + new_index += write_mask; + + /* caution: old_index gets updated on cmpxchg failure */ + if (local_try_cmpxchg(&bpage->write, &old_index, new_index)) { + /* update counters */ + local_sub(event_length, &cpu_buffer->entries_bytes); + return true; + } + } + + /* could not discard */ + return false; +} + +static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer) +{ + local_inc(&cpu_buffer->committing); + local_inc(&cpu_buffer->commits); +} + +static __always_inline void +rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) +{ + unsigned long max_count; + + /* + * We only race with interrupts and NMIs on this CPU. + * If we own the commit event, then we can commit + * all others that interrupted us, since the interruptions + * are in stack format (they finish before they come + * back to us). This allows us to do a simple loop to + * assign the commit to the tail. + */ + again: + max_count = cpu_buffer->nr_pages * 100; + + while (cpu_buffer->commit_page != READ_ONCE(cpu_buffer->tail_page)) { + if (RB_WARN_ON(cpu_buffer, !(--max_count))) + return; + if (RB_WARN_ON(cpu_buffer, + rb_is_reader_page(cpu_buffer->tail_page))) + return; + /* + * No need for a memory barrier here, as the update + * of the tail_page did it for this page. + */ + local_set(&cpu_buffer->commit_page->page->commit, + rb_page_write(cpu_buffer->commit_page)); + rb_inc_page(&cpu_buffer->commit_page); + /* add barrier to keep gcc from optimizing too much */ + barrier(); + } + while (rb_commit_index(cpu_buffer) != + rb_page_write(cpu_buffer->commit_page)) { + + /* Make sure the readers see the content of what is committed. */ + smp_wmb(); + local_set(&cpu_buffer->commit_page->page->commit, + rb_page_write(cpu_buffer->commit_page)); + RB_WARN_ON(cpu_buffer, + local_read(&cpu_buffer->commit_page->page->commit) & + ~RB_WRITE_MASK); + barrier(); + } + + /* again, keep gcc from optimizing */ + barrier(); + + /* + * If an interrupt came in just after the first while loop + * and pushed the tail page forward, we will be left with + * a dangling commit that will never go forward. + */ + if (unlikely(cpu_buffer->commit_page != READ_ONCE(cpu_buffer->tail_page))) + goto again; +} + +static __always_inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) +{ + unsigned long commits; + + if (RB_WARN_ON(cpu_buffer, + !local_read(&cpu_buffer->committing))) + return; + + again: + commits = local_read(&cpu_buffer->commits); + /* synchronize with interrupts */ + barrier(); + if (local_read(&cpu_buffer->committing) == 1) + rb_set_commit_to_write(cpu_buffer); + + local_dec(&cpu_buffer->committing); + + /* synchronize with interrupts */ + barrier(); + + /* + * Need to account for interrupts coming in between the + * updating of the commit page and the clearing of the + * committing counter. 
+ */ + if (unlikely(local_read(&cpu_buffer->commits) != commits) && + !local_read(&cpu_buffer->committing)) { + local_inc(&cpu_buffer->committing); + goto again; + } +} + +static inline void rb_event_discard(struct ring_buffer_event *event) +{ + if (extended_time(event)) + event = skip_time_extend(event); + + /* array[0] holds the actual length for the discarded event */ + event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE; + event->type_len = RINGBUF_TYPE_PADDING; + /* time delta must be non zero */ + if (!event->time_delta) + event->time_delta = 1; +} + +static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer) +{ + local_inc(&cpu_buffer->entries); + rb_end_commit(cpu_buffer); +} + +static __always_inline void +rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) +{ + if (buffer->irq_work.waiters_pending) { + buffer->irq_work.waiters_pending = false; + /* irq_work_queue() supplies it's own memory barriers */ + irq_work_queue(&buffer->irq_work.work); + } + + if (cpu_buffer->irq_work.waiters_pending) { + cpu_buffer->irq_work.waiters_pending = false; + /* irq_work_queue() supplies it's own memory barriers */ + irq_work_queue(&cpu_buffer->irq_work.work); + } + + if (cpu_buffer->last_pages_touch == local_read(&cpu_buffer->pages_touched)) + return; + + if (cpu_buffer->reader_page == cpu_buffer->commit_page) + return; + + if (!cpu_buffer->irq_work.full_waiters_pending) + return; + + cpu_buffer->last_pages_touch = local_read(&cpu_buffer->pages_touched); + + if (!full_hit(buffer, cpu_buffer->cpu, cpu_buffer->shortest_full)) + return; + + cpu_buffer->irq_work.wakeup_full = true; + cpu_buffer->irq_work.full_waiters_pending = false; + /* irq_work_queue() supplies it's own memory barriers */ + irq_work_queue(&cpu_buffer->irq_work.work); +} + +#ifdef CONFIG_RING_BUFFER_RECORD_RECURSION +# define do_ring_buffer_record_recursion() \ + do_ftrace_record_recursion(_THIS_IP_, _RET_IP_) +#else +# define do_ring_buffer_record_recursion() do { } while (0) +#endif + +/* + * The lock and unlock are done within a preempt disable section. + * The current_context per_cpu variable can only be modified + * by the current task between lock and unlock. But it can + * be modified more than once via an interrupt. To pass this + * information from the lock to the unlock without having to + * access the 'in_interrupt()' functions again (which do show + * a bit of overhead in something as critical as function tracing, + * we use a bitmask trick. + * + * bit 1 = NMI context + * bit 2 = IRQ context + * bit 3 = SoftIRQ context + * bit 4 = normal context. + * + * This works because this is the order of contexts that can + * preempt other contexts. A SoftIRQ never preempts an IRQ + * context. + * + * When the context is determined, the corresponding bit is + * checked and set (if it was set, then a recursion of that context + * happened). + * + * On unlock, we need to clear this bit. To do so, just subtract + * 1 from the current_context and AND it to itself. + * + * (binary) + * 101 - 1 = 100 + * 101 & 100 = 100 (clearing bit zero) + * + * 1010 - 1 = 1001 + * 1010 & 1001 = 1000 (clearing bit 1) + * + * The least significant bit can be cleared this way, and it + * just so happens that it is the same bit corresponding to + * the current context. + * + * Now the TRANSITION bit breaks the above slightly. The TRANSITION bit + * is set when a recursion is detected at the current context, and if + * the TRANSITION bit is already set, it will fail the recursion. 
+ * This is needed because there's a lag between the changing of + * interrupt context and updating the preempt count. In this case, + * a false positive will be found. To handle this, one extra recursion + * is allowed, and this is done by the TRANSITION bit. If the TRANSITION + * bit is already set, then it is considered a recursion and the function + * ends. Otherwise, the TRANSITION bit is set, and that bit is returned. + * + * On the trace_recursive_unlock(), the TRANSITION bit will be the first + * to be cleared. Even if it wasn't the context that set it. That is, + * if an interrupt comes in while NORMAL bit is set and the ring buffer + * is called before preempt_count() is updated, since the check will + * be on the NORMAL bit, the TRANSITION bit will then be set. If an + * NMI then comes in, it will set the NMI bit, but when the NMI code + * does the trace_recursive_unlock() it will clear the TRANSITION bit + * and leave the NMI bit set. But this is fine, because the interrupt + * code that set the TRANSITION bit will then clear the NMI bit when it + * calls trace_recursive_unlock(). If another NMI comes in, it will + * set the TRANSITION bit and continue. + * + * Note: The TRANSITION bit only handles a single transition between context. + */ + +static __always_inline bool +trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer) +{ + unsigned int val = cpu_buffer->current_context; + int bit = interrupt_context_level(); + + bit = RB_CTX_NORMAL - bit; + + if (unlikely(val & (1 << (bit + cpu_buffer->nest)))) { + /* + * It is possible that this was called by transitioning + * between interrupt context, and preempt_count() has not + * been updated yet. In this case, use the TRANSITION bit. + */ + bit = RB_CTX_TRANSITION; + if (val & (1 << (bit + cpu_buffer->nest))) { + do_ring_buffer_record_recursion(); + return true; + } + } + + val |= (1 << (bit + cpu_buffer->nest)); + cpu_buffer->current_context = val; + + return false; +} + +static __always_inline void +trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer) +{ + cpu_buffer->current_context &= + cpu_buffer->current_context - (1 << cpu_buffer->nest); +} + +/* The recursive locking above uses 5 bits */ +#define NESTED_BITS 5 + +/** + * ring_buffer_nest_start - Allow to trace while nested + * @buffer: The ring buffer to modify + * + * The ring buffer has a safety mechanism to prevent recursion. + * But there may be a case where a trace needs to be done while + * tracing something else. In this case, calling this function + * will allow this function to nest within a currently active + * ring_buffer_lock_reserve(). + * + * Call this function before calling another ring_buffer_lock_reserve() and + * call ring_buffer_nest_end() after the nested ring_buffer_unlock_commit(). + */ +void ring_buffer_nest_start(struct trace_buffer *buffer) +{ + struct ring_buffer_per_cpu *cpu_buffer; + int cpu; + + /* Enabled by ring_buffer_nest_end() */ + preempt_disable_notrace(); + cpu = raw_smp_processor_id(); + cpu_buffer = buffer->buffers[cpu]; + /* This is the shift value for the above recursive locking */ + cpu_buffer->nest += NESTED_BITS; +} + +/** + * ring_buffer_nest_end - Allow to trace while nested + * @buffer: The ring buffer to modify + * + * Must be called after ring_buffer_nest_start() and after the + * ring_buffer_unlock_commit(). 
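The unlock trick described in the comment above, clearing the most recently set context bit by subtracting one and ANDing, can be checked in isolation. This stand-alone sketch mirrors the binary examples in the comment; the bit positions match the list given there (NMI lowest, normal highest), while the enum names and the scenario are illustrative only, and the real code additionally applies the per-CPU-buffer 'nest' shift seen in trace_recursive_lock().

#include <stdio.h>

/* Context bits in the order listed in the comment above. */
enum { DEMO_TRANSITION, DEMO_NMI, DEMO_IRQ, DEMO_SOFTIRQ, DEMO_NORMAL };

int main(void)
{
        unsigned int val = 0;

        /* "lock": normal context starts tracing, then an IRQ nests inside. */
        val |= 1u << DEMO_NORMAL;               /* 1 0000 */
        val |= 1u << DEMO_IRQ;                  /* 1 0100 */

        /*
         * "unlock": val & (val - 1) clears the lowest set bit, which is
         * the innermost (most recently entered) context.
         */
        val &= val - 1;                         /* IRQ bit cleared    -> 1 0000 */
        printf("after irq unlock:    %#x\n", val);

        val &= val - 1;                         /* normal bit cleared -> 0 */
        printf("after normal unlock: %#x\n", val);
        return 0;
}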
+ */ +void ring_buffer_nest_end(struct trace_buffer *buffer) +{ + struct ring_buffer_per_cpu *cpu_buffer; + int cpu; + + /* disabled by ring_buffer_nest_start() */ + cpu = raw_smp_processor_id(); + cpu_buffer = buffer->buffers[cpu]; + /* This is the shift value for the above recursive locking */ + cpu_buffer->nest -= NESTED_BITS; + preempt_enable_notrace(); +} + +/** + * ring_buffer_unlock_commit - commit a reserved + * @buffer: The buffer to commit to + * + * This commits the data to the ring buffer, and releases any locks held. + * + * Must be paired with ring_buffer_lock_reserve. + */ +int ring_buffer_unlock_commit(struct trace_buffer *buffer) +{ + struct ring_buffer_per_cpu *cpu_buffer; + int cpu = raw_smp_processor_id(); + + cpu_buffer = buffer->buffers[cpu]; + + rb_commit(cpu_buffer); + + rb_wakeups(buffer, cpu_buffer); + + trace_recursive_unlock(cpu_buffer); + + preempt_enable_notrace(); + + return 0; +} +EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); + +/* Special value to validate all deltas on a page. */ +#define CHECK_FULL_PAGE 1L + +#ifdef CONFIG_RING_BUFFER_VALIDATE_TIME_DELTAS +static void dump_buffer_page(struct buffer_data_page *bpage, + struct rb_event_info *info, + unsigned long tail) +{ + struct ring_buffer_event *event; + u64 ts, delta; + int e; + + ts = bpage->time_stamp; + pr_warn(" [%lld] PAGE TIME STAMP\n", ts); + + for (e = 0; e < tail; e += rb_event_length(event)) { + + event = (struct ring_buffer_event *)(bpage->data + e); + + switch (event->type_len) { + + case RINGBUF_TYPE_TIME_EXTEND: + delta = rb_event_time_stamp(event); + ts += delta; + pr_warn(" [%lld] delta:%lld TIME EXTEND\n", ts, delta); + break; + + case RINGBUF_TYPE_TIME_STAMP: + delta = rb_event_time_stamp(event); + ts = rb_fix_abs_ts(delta, ts); + pr_warn(" [%lld] absolute:%lld TIME STAMP\n", ts, delta); + break; + + case RINGBUF_TYPE_PADDING: + ts += event->time_delta; + pr_warn(" [%lld] delta:%d PADDING\n", ts, event->time_delta); + break; + + case RINGBUF_TYPE_DATA: + ts += event->time_delta; + pr_warn(" [%lld] delta:%d\n", ts, event->time_delta); + break; + + default: + break; + } + } +} + +static DEFINE_PER_CPU(atomic_t, checking); +static atomic_t ts_dump; + +/* + * Check if the current event time stamp matches the deltas on + * the buffer page. + */ +static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer, + struct rb_event_info *info, + unsigned long tail) +{ + struct ring_buffer_event *event; + struct buffer_data_page *bpage; + u64 ts, delta; + bool full = false; + int e; + + bpage = info->tail_page->page; + + if (tail == CHECK_FULL_PAGE) { + full = true; + tail = local_read(&bpage->commit); + } else if (info->add_timestamp & + (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE)) { + /* Ignore events with absolute time stamps */ + return; + } + + /* + * Do not check the first event (skip possible extends too). + * Also do not check if previous events have not been committed. 
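As a usage illustration of the nesting API documented above, here is a minimal sketch (kernel context assumed; the helper, buffer and payload are hypothetical). It performs a reserve/commit from code that is already running inside an active ring_buffer_lock_reserve(), which the recursion protection would otherwise reject.

#include <linux/ring_buffer.h>
#include <linux/string.h>

static void example_nested_record(struct trace_buffer *buffer,
                                  const void *data, unsigned long len)
{
        struct ring_buffer_event *event;

        ring_buffer_nest_start(buffer);
        event = ring_buffer_lock_reserve(buffer, len);
        if (event) {
                memcpy(ring_buffer_event_data(event), data, len);
                ring_buffer_unlock_commit(buffer);
        }
        ring_buffer_nest_end(buffer);
}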
+ */ + if (tail <= 8 || tail > local_read(&bpage->commit)) + return; + + /* + * If this interrupted another event, + */ + if (atomic_inc_return(this_cpu_ptr(&checking)) != 1) + goto out; + + ts = bpage->time_stamp; + + for (e = 0; e < tail; e += rb_event_length(event)) { + + event = (struct ring_buffer_event *)(bpage->data + e); + + switch (event->type_len) { + + case RINGBUF_TYPE_TIME_EXTEND: + delta = rb_event_time_stamp(event); + ts += delta; + break; + + case RINGBUF_TYPE_TIME_STAMP: + delta = rb_event_time_stamp(event); + ts = rb_fix_abs_ts(delta, ts); + break; + + case RINGBUF_TYPE_PADDING: + if (event->time_delta == 1) + break; + fallthrough; + case RINGBUF_TYPE_DATA: + ts += event->time_delta; + break; + + default: + RB_WARN_ON(cpu_buffer, 1); + } + } + if ((full && ts > info->ts) || + (!full && ts + info->delta != info->ts)) { + /* If another report is happening, ignore this one */ + if (atomic_inc_return(&ts_dump) != 1) { + atomic_dec(&ts_dump); + goto out; + } + atomic_inc(&cpu_buffer->record_disabled); + /* There's some cases in boot up that this can happen */ + WARN_ON_ONCE(system_state != SYSTEM_BOOTING); + pr_warn("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld before:%lld after:%lld%s\n", + cpu_buffer->cpu, + ts + info->delta, info->ts, info->delta, + info->before, info->after, + full ? " (full)" : ""); + dump_buffer_page(bpage, info, tail); + atomic_dec(&ts_dump); + /* Do not re-enable checking */ + return; + } +out: + atomic_dec(this_cpu_ptr(&checking)); +} +#else +static inline void check_buffer(struct ring_buffer_per_cpu *cpu_buffer, + struct rb_event_info *info, + unsigned long tail) +{ +} +#endif /* CONFIG_RING_BUFFER_VALIDATE_TIME_DELTAS */ + +static struct ring_buffer_event * +__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, + struct rb_event_info *info) +{ + struct ring_buffer_event *event; + struct buffer_page *tail_page; + unsigned long tail, write, w; + bool a_ok; + bool b_ok; + + /* Don't let the compiler play games with cpu_buffer->tail_page */ + tail_page = info->tail_page = READ_ONCE(cpu_buffer->tail_page); + + /*A*/ w = local_read(&tail_page->write) & RB_WRITE_MASK; + barrier(); + b_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before); + a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after); + barrier(); + info->ts = rb_time_stamp(cpu_buffer->buffer); + + if ((info->add_timestamp & RB_ADD_STAMP_ABSOLUTE)) { + info->delta = info->ts; + } else { + /* + * If interrupting an event time update, we may need an + * absolute timestamp. + * Don't bother if this is the start of a new page (w == 0). 
+ */ + if (!w) { + /* Use the sub-buffer timestamp */ + info->delta = 0; + } else if (unlikely(!a_ok || !b_ok || info->before != info->after)) { + info->add_timestamp |= RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND; + info->length += RB_LEN_TIME_EXTEND; + } else { + info->delta = info->ts - info->after; + if (unlikely(test_time_stamp(info->delta))) { + info->add_timestamp |= RB_ADD_STAMP_EXTEND; + info->length += RB_LEN_TIME_EXTEND; + } + } + } + + /*B*/ rb_time_set(&cpu_buffer->before_stamp, info->ts); + + /*C*/ write = local_add_return(info->length, &tail_page->write); + + /* set write to only the index of the write */ + write &= RB_WRITE_MASK; + + tail = write - info->length; + + /* See if we shot pass the end of this buffer page */ + if (unlikely(write > BUF_PAGE_SIZE)) { + check_buffer(cpu_buffer, info, CHECK_FULL_PAGE); + return rb_move_tail(cpu_buffer, tail, info); + } + + if (likely(tail == w)) { + /* Nothing interrupted us between A and C */ + /*D*/ rb_time_set(&cpu_buffer->write_stamp, info->ts); + /* + * If something came in between C and D, the write stamp + * may now not be in sync. But that's fine as the before_stamp + * will be different and then next event will just be forced + * to use an absolute timestamp. + */ + if (likely(!(info->add_timestamp & + (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE)))) + /* This did not interrupt any time update */ + info->delta = info->ts - info->after; + else + /* Just use full timestamp for interrupting event */ + info->delta = info->ts; + check_buffer(cpu_buffer, info, tail); + } else { + u64 ts; + /* SLOW PATH - Interrupted between A and C */ + + /* Save the old before_stamp */ + a_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before); + RB_WARN_ON(cpu_buffer, !a_ok); + + /* + * Read a new timestamp and update the before_stamp to make + * the next event after this one force using an absolute + * timestamp. This is in case an interrupt were to come in + * between E and F. + */ + ts = rb_time_stamp(cpu_buffer->buffer); + rb_time_set(&cpu_buffer->before_stamp, ts); + + barrier(); + /*E*/ a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after); + /* Was interrupted before here, write_stamp must be valid */ + RB_WARN_ON(cpu_buffer, !a_ok); + barrier(); + /*F*/ if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) && + info->after == info->before && info->after < ts) { + /* + * Nothing came after this event between C and F, it is + * safe to use info->after for the delta as it + * matched info->before and is still valid. + */ + info->delta = ts - info->after; + } else { + /* + * Interrupted between C and F: + * Lost the previous events time stamp. Just set the + * delta to zero, and this will be the same time as + * the event this event interrupted. And the events that + * came after this will still be correct (as they would + * have built their delta on the previous event. + */ + info->delta = 0; + } + info->ts = ts; + info->add_timestamp &= ~RB_ADD_STAMP_FORCE; + } + + /* + * If this is the first commit on the page, then it has the same + * timestamp as the page itself. + */ + if (unlikely(!tail && !(info->add_timestamp & + (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE)))) + info->delta = 0; + + /* We reserved something on the buffer */ + + event = __rb_page_index(tail_page, tail); + rb_update_event(cpu_buffer, event, info); + + local_inc(&tail_page->entries); + + /* + * If this is the first commit on the page, then update + * its timestamp. 
+ */ + if (unlikely(!tail)) + tail_page->page->time_stamp = info->ts; + + /* account for these added bytes */ + local_add(info->length, &cpu_buffer->entries_bytes); + + return event; +} + +static __always_inline struct ring_buffer_event * +rb_reserve_next_event(struct trace_buffer *buffer, + struct ring_buffer_per_cpu *cpu_buffer, + unsigned long length) +{ + struct ring_buffer_event *event; + struct rb_event_info info; + int nr_loops = 0; + int add_ts_default; + + /* ring buffer does cmpxchg, make sure it is safe in NMI context */ + if (!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) && + (unlikely(in_nmi()))) { + return NULL; + } + + rb_start_commit(cpu_buffer); + /* The commit page can not change after this */ + +#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP + /* + * Due to the ability to swap a cpu buffer from a buffer + * it is possible it was swapped before we committed. + * (committing stops a swap). We check for it here and + * if it happened, we have to fail the write. + */ + barrier(); + if (unlikely(READ_ONCE(cpu_buffer->buffer) != buffer)) { + local_dec(&cpu_buffer->committing); + local_dec(&cpu_buffer->commits); + return NULL; + } +#endif + + info.length = rb_calculate_event_length(length); + + if (ring_buffer_time_stamp_abs(cpu_buffer->buffer)) { + add_ts_default = RB_ADD_STAMP_ABSOLUTE; + info.length += RB_LEN_TIME_EXTEND; + if (info.length > BUF_MAX_DATA_SIZE) + goto out_fail; + } else { + add_ts_default = RB_ADD_STAMP_NONE; + } + + again: + info.add_timestamp = add_ts_default; + info.delta = 0; + + /* + * We allow for interrupts to reenter here and do a trace. + * If one does, it will cause this original code to loop + * back here. Even with heavy interrupts happening, this + * should only happen a few times in a row. If this happens + * 1000 times in a row, there must be either an interrupt + * storm or we have something buggy. + * Bail! + */ + if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) + goto out_fail; + + event = __rb_reserve_next(cpu_buffer, &info); + + if (unlikely(PTR_ERR(event) == -EAGAIN)) { + if (info.add_timestamp & (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND)) + info.length -= RB_LEN_TIME_EXTEND; + goto again; + } + + if (likely(event)) + return event; + out_fail: + rb_end_commit(cpu_buffer); + return NULL; +} + +/** + * ring_buffer_lock_reserve - reserve a part of the buffer + * @buffer: the ring buffer to reserve from + * @length: the length of the data to reserve (excluding event header) + * + * Returns a reserved event on the ring buffer to copy directly to. + * The user of this interface will need to get the body to write into + * and can use the ring_buffer_event_data() interface. + * + * The length is the length of the data needed, not the event length + * which also includes the event header. + * + * Must be paired with ring_buffer_unlock_commit, unless NULL is returned. + * If NULL is returned, then nothing has been allocated or locked. 
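A minimal usage sketch of the reserve/commit pair documented above (kernel context assumed; the helper name, payload and error code are placeholders): reserve space, copy the payload into the event body obtained from ring_buffer_event_data(), then commit. If the reserve fails, nothing was allocated or locked, so there is nothing to undo.

#include <linux/ring_buffer.h>
#include <linux/string.h>
#include <linux/errno.h>

/* Hypothetical helper writing one opaque record into 'buffer'. */
static int example_record(struct trace_buffer *buffer,
                          const void *payload, unsigned long len)
{
        struct ring_buffer_event *event;

        event = ring_buffer_lock_reserve(buffer, len);
        if (!event)
                return -EBUSY;  /* disabled, too large, or recursion detected */

        memcpy(ring_buffer_event_data(event), payload, len);
        return ring_buffer_unlock_commit(buffer);
}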
+ */ +struct ring_buffer_event * +ring_buffer_lock_reserve(struct trace_buffer *buffer, unsigned long length) +{ + struct ring_buffer_per_cpu *cpu_buffer; + struct ring_buffer_event *event; + int cpu; + + /* If we are tracing schedule, we don't want to recurse */ + preempt_disable_notrace(); + + if (unlikely(atomic_read(&buffer->record_disabled))) + goto out; + + cpu = raw_smp_processor_id(); + + if (unlikely(!cpumask_test_cpu(cpu, buffer->cpumask))) + goto out; + + cpu_buffer = buffer->buffers[cpu]; + + if (unlikely(atomic_read(&cpu_buffer->record_disabled))) + goto out; + + if (unlikely(length > BUF_MAX_DATA_SIZE)) + goto out; + + if (unlikely(trace_recursive_lock(cpu_buffer))) + goto out; + + event = rb_reserve_next_event(buffer, cpu_buffer, length); + if (!event) + goto out_unlock; + + return event; + + out_unlock: + trace_recursive_unlock(cpu_buffer); + out: + preempt_enable_notrace(); + return NULL; +} +EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve); + +/* + * Decrement the entries to the page that an event is on. + * The event does not even need to exist, only the pointer + * to the page it is on. This may only be called before the commit + * takes place. + */ +static inline void +rb_decrement_entry(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) +{ + unsigned long addr = (unsigned long)event; + struct buffer_page *bpage = cpu_buffer->commit_page; + struct buffer_page *start; + + addr &= PAGE_MASK; + + /* Do the likely case first */ + if (likely(bpage->page == (void *)addr)) { + local_dec(&bpage->entries); + return; + } + + /* + * Because the commit page may be on the reader page we + * start with the next page and check the end loop there. + */ + rb_inc_page(&bpage); + start = bpage; + do { + if (bpage->page == (void *)addr) { + local_dec(&bpage->entries); + return; + } + rb_inc_page(&bpage); + } while (bpage != start); + + /* commit not part of this buffer?? */ + RB_WARN_ON(cpu_buffer, 1); +} + +/** + * ring_buffer_discard_commit - discard an event that has not been committed + * @buffer: the ring buffer + * @event: non committed event to discard + * + * Sometimes an event that is in the ring buffer needs to be ignored. + * This function lets the user discard an event in the ring buffer + * and then that event will not be read later. + * + * This function only works if it is called before the item has been + * committed. It will try to free the event from the ring buffer + * if another event has not been added behind it. + * + * If another event has been added behind it, it will set the event + * up as discarded, and perform the commit. + * + * If this function is called, do not call ring_buffer_unlock_commit on + * the event. + */ +void ring_buffer_discard_commit(struct trace_buffer *buffer, + struct ring_buffer_event *event) +{ + struct ring_buffer_per_cpu *cpu_buffer; + int cpu; + + /* The event is discarded regardless */ + rb_event_discard(event); + + cpu = smp_processor_id(); + cpu_buffer = buffer->buffers[cpu]; + + /* + * This must only be called if the event has not been + * committed yet. Thus we can assume that preemption + * is still disabled. 
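To illustrate the discard path documented above, a hedged sketch (kernel context assumed; the helper and the keep/drop decision are hypothetical): an event is reserved and filled, but if it turns out to be unwanted it is discarded instead of committed, and ring_buffer_unlock_commit() is then not called for it.

#include <linux/ring_buffer.h>
#include <linux/string.h>
#include <linux/types.h>

static void example_maybe_record(struct trace_buffer *buffer, u64 val, bool keep)
{
        struct ring_buffer_event *event;

        event = ring_buffer_lock_reserve(buffer, sizeof(val));
        if (!event)
                return;

        memcpy(ring_buffer_event_data(event), &val, sizeof(val));

        if (keep)
                ring_buffer_unlock_commit(buffer);
        else
                /* Frees the space if possible, otherwise marks it as padding. */
                ring_buffer_discard_commit(buffer, event);
}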
+ */ + RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing)); + + rb_decrement_entry(cpu_buffer, event); + if (rb_try_to_discard(cpu_buffer, event)) + goto out; + + out: + rb_end_commit(cpu_buffer); + + trace_recursive_unlock(cpu_buffer); + + preempt_enable_notrace(); + +} +EXPORT_SYMBOL_GPL(ring_buffer_discard_commit); + +/** + * ring_buffer_write - write data to the buffer without reserving + * @buffer: The ring buffer to write to. + * @length: The length of the data being written (excluding the event header) + * @data: The data to write to the buffer. + * + * This is like ring_buffer_lock_reserve and ring_buffer_unlock_commit as + * one function. If you already have the data to write to the buffer, it + * may be easier to simply call this function. + * + * Note, like ring_buffer_lock_reserve, the length is the length of the data + * and not the length of the event which would hold the header. + */ +int ring_buffer_write(struct trace_buffer *buffer, + unsigned long length, + void *data) +{ + struct ring_buffer_per_cpu *cpu_buffer; + struct ring_buffer_event *event; + void *body; + int ret = -EBUSY; + int cpu; + + preempt_disable_notrace(); + + if (atomic_read(&buffer->record_disabled)) + goto out; + + cpu = raw_smp_processor_id(); + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + goto out; + + cpu_buffer = buffer->buffers[cpu]; + + if (atomic_read(&cpu_buffer->record_disabled)) + goto out; + + if (length > BUF_MAX_DATA_SIZE) + goto out; + + if (unlikely(trace_recursive_lock(cpu_buffer))) + goto out; + + event = rb_reserve_next_event(buffer, cpu_buffer, length); + if (!event) + goto out_unlock; + + body = rb_event_data(event); + + memcpy(body, data, length); + + rb_commit(cpu_buffer); + + rb_wakeups(buffer, cpu_buffer); + + ret = 0; + + out_unlock: + trace_recursive_unlock(cpu_buffer); + + out: + preempt_enable_notrace(); + + return ret; +} +EXPORT_SYMBOL_GPL(ring_buffer_write); + +static bool rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct buffer_page *reader = cpu_buffer->reader_page; + struct buffer_page *head = rb_set_head_page(cpu_buffer); + struct buffer_page *commit = cpu_buffer->commit_page; + + /* In case of error, head will be NULL */ + if (unlikely(!head)) + return true; + + /* Reader should exhaust content in reader page */ + if (reader->read != rb_page_commit(reader)) + return false; + + /* + * If writers are committing on the reader page, knowing all + * committed content has been read, the ring buffer is empty. + */ + if (commit == reader) + return true; + + /* + * If writers are committing on a page other than reader page + * and head page, there should always be content to read. + */ + if (commit != head) + return false; + + /* + * Writers are committing on the head page, we just need + * to care about there're committed data, and the reader will + * swap reader page with head page when it is to read data. + */ + return rb_page_commit(commit) == 0; +} + +/** + * ring_buffer_record_disable - stop all writes into the buffer + * @buffer: The ring buffer to stop writes to. + * + * This prevents all writes to the buffer. Any attempt to write + * to the buffer after this will fail and return NULL. + * + * The caller should call synchronize_rcu() after this. 
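ring_buffer_write() above folds the reserve, copy and commit into one call. A short sketch of its use (kernel context assumed; the helper name is hypothetical):

#include <linux/ring_buffer.h>
#include <linux/types.h>

/* Returns 0 on success, -EBUSY if the record could not be written. */
static int example_write_sample(struct trace_buffer *buffer, u32 sample)
{
        return ring_buffer_write(buffer, sizeof(sample), &sample);
}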
+ */ +void ring_buffer_record_disable(struct trace_buffer *buffer) +{ + atomic_inc(&buffer->record_disabled); +} +EXPORT_SYMBOL_GPL(ring_buffer_record_disable); + +/** + * ring_buffer_record_enable - enable writes to the buffer + * @buffer: The ring buffer to enable writes + * + * Note, multiple disables will need the same number of enables + * to truly enable the writing (much like preempt_disable). + */ +void ring_buffer_record_enable(struct trace_buffer *buffer) +{ + atomic_dec(&buffer->record_disabled); +} +EXPORT_SYMBOL_GPL(ring_buffer_record_enable); + +/** + * ring_buffer_record_off - stop all writes into the buffer + * @buffer: The ring buffer to stop writes to. + * + * This prevents all writes to the buffer. Any attempt to write + * to the buffer after this will fail and return NULL. + * + * This is different than ring_buffer_record_disable() as + * it works like an on/off switch, where as the disable() version + * must be paired with a enable(). + */ +void ring_buffer_record_off(struct trace_buffer *buffer) +{ + unsigned int rd; + unsigned int new_rd; + + rd = atomic_read(&buffer->record_disabled); + do { + new_rd = rd | RB_BUFFER_OFF; + } while (!atomic_try_cmpxchg(&buffer->record_disabled, &rd, new_rd)); +} +EXPORT_SYMBOL_GPL(ring_buffer_record_off); + +/** + * ring_buffer_record_on - restart writes into the buffer + * @buffer: The ring buffer to start writes to. + * + * This enables all writes to the buffer that was disabled by + * ring_buffer_record_off(). + * + * This is different than ring_buffer_record_enable() as + * it works like an on/off switch, where as the enable() version + * must be paired with a disable(). + */ +void ring_buffer_record_on(struct trace_buffer *buffer) +{ + unsigned int rd; + unsigned int new_rd; + + rd = atomic_read(&buffer->record_disabled); + do { + new_rd = rd & ~RB_BUFFER_OFF; + } while (!atomic_try_cmpxchg(&buffer->record_disabled, &rd, new_rd)); +} +EXPORT_SYMBOL_GPL(ring_buffer_record_on); + +/** + * ring_buffer_record_is_on - return true if the ring buffer can write + * @buffer: The ring buffer to see if write is enabled + * + * Returns true if the ring buffer is in a state that it accepts writes. + */ +bool ring_buffer_record_is_on(struct trace_buffer *buffer) +{ + return !atomic_read(&buffer->record_disabled); +} + +/** + * ring_buffer_record_is_set_on - return true if the ring buffer is set writable + * @buffer: The ring buffer to see if write is set enabled + * + * Returns true if the ring buffer is set writable by ring_buffer_record_on(). + * Note that this does NOT mean it is in a writable state. + * + * It may return true when the ring buffer has been disabled by + * ring_buffer_record_disable(), as that is a temporary disabling of + * the ring buffer. + */ +bool ring_buffer_record_is_set_on(struct trace_buffer *buffer) +{ + return !(atomic_read(&buffer->record_disabled) & RB_BUFFER_OFF); +} + +/** + * ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer + * @buffer: The ring buffer to stop writes to. + * @cpu: The CPU buffer to stop + * + * This prevents all writes to the buffer. Any attempt to write + * to the buffer after this will fail and return NULL. + * + * The caller should call synchronize_rcu() after this. 
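The difference between the counted disable/enable pair and the RB_BUFFER_OFF switch above can be summarized with a short sketch (kernel context assumed; the helpers are hypothetical, and the synchronize_rcu() call follows the advice in the kernel-doc for ring_buffer_record_disable()).

#include <linux/ring_buffer.h>
#include <linux/rcupdate.h>

static void example_pause_resume(struct trace_buffer *buffer)
{
        /* Counted: every disable must be matched by an enable. */
        ring_buffer_record_disable(buffer);
        synchronize_rcu();              /* let in-flight writers finish */
        /* ... inspect or manipulate the buffer here ... */
        ring_buffer_record_enable(buffer);
}

static void example_switch(struct trace_buffer *buffer)
{
        /* Switch-like: off/on set and clear RB_BUFFER_OFF, no counting. */
        ring_buffer_record_off(buffer);
        if (!ring_buffer_record_is_on(buffer))
                ring_buffer_record_on(buffer);
}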
+ */ +void ring_buffer_record_disable_cpu(struct trace_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return; + + cpu_buffer = buffer->buffers[cpu]; + atomic_inc(&cpu_buffer->record_disabled); +} +EXPORT_SYMBOL_GPL(ring_buffer_record_disable_cpu); + +/** + * ring_buffer_record_enable_cpu - enable writes to the buffer + * @buffer: The ring buffer to enable writes + * @cpu: The CPU to enable. + * + * Note, multiple disables will need the same number of enables + * to truly enable the writing (much like preempt_disable). + */ +void ring_buffer_record_enable_cpu(struct trace_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return; + + cpu_buffer = buffer->buffers[cpu]; + atomic_dec(&cpu_buffer->record_disabled); +} +EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu); + +/* + * The total entries in the ring buffer is the running counter + * of entries entered into the ring buffer, minus the sum of + * the entries read from the ring buffer and the number of + * entries that were overwritten. + */ +static inline unsigned long +rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer) +{ + return local_read(&cpu_buffer->entries) - + (local_read(&cpu_buffer->overrun) + cpu_buffer->read); +} + +/** + * ring_buffer_oldest_event_ts - get the oldest event timestamp from the buffer + * @buffer: The ring buffer + * @cpu: The per CPU buffer to read from. + */ +u64 ring_buffer_oldest_event_ts(struct trace_buffer *buffer, int cpu) +{ + unsigned long flags; + struct ring_buffer_per_cpu *cpu_buffer; + struct buffer_page *bpage; + u64 ret = 0; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return 0; + + cpu_buffer = buffer->buffers[cpu]; + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + /* + * if the tail is on reader_page, oldest time stamp is on the reader + * page + */ + if (cpu_buffer->tail_page == cpu_buffer->reader_page) + bpage = cpu_buffer->reader_page; + else + bpage = rb_set_head_page(cpu_buffer); + if (bpage) + ret = bpage->page->time_stamp; + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + return ret; +} +EXPORT_SYMBOL_GPL(ring_buffer_oldest_event_ts); + +/** + * ring_buffer_bytes_cpu - get the number of bytes unconsumed in a cpu buffer + * @buffer: The ring buffer + * @cpu: The per CPU buffer to read from. + */ +unsigned long ring_buffer_bytes_cpu(struct trace_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long ret; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return 0; + + cpu_buffer = buffer->buffers[cpu]; + ret = local_read(&cpu_buffer->entries_bytes) - cpu_buffer->read_bytes; + + return ret; +} +EXPORT_SYMBOL_GPL(ring_buffer_bytes_cpu); + +/** + * ring_buffer_entries_cpu - get the number of entries in a cpu buffer + * @buffer: The ring buffer + * @cpu: The per CPU buffer to get the entries from. + */ +unsigned long ring_buffer_entries_cpu(struct trace_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return 0; + + cpu_buffer = buffer->buffers[cpu]; + + return rb_num_of_entries(cpu_buffer); +} +EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu); + +/** + * ring_buffer_overrun_cpu - get the number of overruns caused by the ring + * buffer wrapping around (only if RB_FL_OVERWRITE is on). 
+ * @buffer: The ring buffer + * @cpu: The per CPU buffer to get the number of overruns from + */ +unsigned long ring_buffer_overrun_cpu(struct trace_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long ret; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return 0; + + cpu_buffer = buffer->buffers[cpu]; + ret = local_read(&cpu_buffer->overrun); + + return ret; +} +EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu); + +/** + * ring_buffer_commit_overrun_cpu - get the number of overruns caused by + * commits failing due to the buffer wrapping around while there are uncommitted + * events, such as during an interrupt storm. + * @buffer: The ring buffer + * @cpu: The per CPU buffer to get the number of overruns from + */ +unsigned long +ring_buffer_commit_overrun_cpu(struct trace_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long ret; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return 0; + + cpu_buffer = buffer->buffers[cpu]; + ret = local_read(&cpu_buffer->commit_overrun); + + return ret; +} +EXPORT_SYMBOL_GPL(ring_buffer_commit_overrun_cpu); + +/** + * ring_buffer_dropped_events_cpu - get the number of dropped events caused by + * the ring buffer filling up (only if RB_FL_OVERWRITE is off). + * @buffer: The ring buffer + * @cpu: The per CPU buffer to get the number of overruns from + */ +unsigned long +ring_buffer_dropped_events_cpu(struct trace_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long ret; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return 0; + + cpu_buffer = buffer->buffers[cpu]; + ret = local_read(&cpu_buffer->dropped_events); + + return ret; +} +EXPORT_SYMBOL_GPL(ring_buffer_dropped_events_cpu); + +/** + * ring_buffer_read_events_cpu - get the number of events successfully read + * @buffer: The ring buffer + * @cpu: The per CPU buffer to get the number of events read + */ +unsigned long +ring_buffer_read_events_cpu(struct trace_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return 0; + + cpu_buffer = buffer->buffers[cpu]; + return cpu_buffer->read; +} +EXPORT_SYMBOL_GPL(ring_buffer_read_events_cpu); + +/** + * ring_buffer_entries - get the number of entries in a buffer + * @buffer: The ring buffer + * + * Returns the total number of entries in the ring buffer + * (all CPU entries) + */ +unsigned long ring_buffer_entries(struct trace_buffer *buffer) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long entries = 0; + int cpu; + + /* if you care about this being correct, lock the buffer */ + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + entries += rb_num_of_entries(cpu_buffer); + } + + return entries; +} +EXPORT_SYMBOL_GPL(ring_buffer_entries); + +/** + * ring_buffer_overruns - get the number of overruns in buffer + * @buffer: The ring buffer + * + * Returns the total number of overruns in the ring buffer + * (all CPU entries) + */ +unsigned long ring_buffer_overruns(struct trace_buffer *buffer) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long overruns = 0; + int cpu; + + /* if you care about this being correct, lock the buffer */ + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + overruns += local_read(&cpu_buffer->overrun); + } + + return overruns; +} +EXPORT_SYMBOL_GPL(ring_buffer_overruns); + +static void rb_iter_reset(struct ring_buffer_iter *iter) +{ + struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; + + /* 
Iterator usage is expected to have record disabled */ + iter->head_page = cpu_buffer->reader_page; + iter->head = cpu_buffer->reader_page->read; + iter->next_event = iter->head; + + iter->cache_reader_page = iter->head_page; + iter->cache_read = cpu_buffer->read; + iter->cache_pages_removed = cpu_buffer->pages_removed; + + if (iter->head) { + iter->read_stamp = cpu_buffer->read_stamp; + iter->page_stamp = cpu_buffer->reader_page->page->time_stamp; + } else { + iter->read_stamp = iter->head_page->page->time_stamp; + iter->page_stamp = iter->read_stamp; + } +} + +/** + * ring_buffer_iter_reset - reset an iterator + * @iter: The iterator to reset + * + * Resets the iterator, so that it will start from the beginning + * again. + */ +void ring_buffer_iter_reset(struct ring_buffer_iter *iter) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long flags; + + if (!iter) + return; + + cpu_buffer = iter->cpu_buffer; + + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + rb_iter_reset(iter); + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); +} +EXPORT_SYMBOL_GPL(ring_buffer_iter_reset); + +/** + * ring_buffer_iter_empty - check if an iterator has no more to read + * @iter: The iterator to check + */ +int ring_buffer_iter_empty(struct ring_buffer_iter *iter) +{ + struct ring_buffer_per_cpu *cpu_buffer; + struct buffer_page *reader; + struct buffer_page *head_page; + struct buffer_page *commit_page; + struct buffer_page *curr_commit_page; + unsigned commit; + u64 curr_commit_ts; + u64 commit_ts; + + cpu_buffer = iter->cpu_buffer; + reader = cpu_buffer->reader_page; + head_page = cpu_buffer->head_page; + commit_page = cpu_buffer->commit_page; + commit_ts = commit_page->page->time_stamp; + + /* + * When the writer goes across pages, it issues a cmpxchg which + * is a mb(), which will synchronize with the rmb here. 
+ * (see rb_tail_page_update()) + */ + smp_rmb(); + commit = rb_page_commit(commit_page); + /* We want to make sure that the commit page doesn't change */ + smp_rmb(); + + /* Make sure commit page didn't change */ + curr_commit_page = READ_ONCE(cpu_buffer->commit_page); + curr_commit_ts = READ_ONCE(curr_commit_page->page->time_stamp); + + /* If the commit page changed, then there's more data */ + if (curr_commit_page != commit_page || + curr_commit_ts != commit_ts) + return 0; + + /* Still racy, as it may return a false positive, but that's OK */ + return ((iter->head_page == commit_page && iter->head >= commit) || + (iter->head_page == reader && commit_page == head_page && + head_page->read == commit && + iter->head == rb_page_commit(cpu_buffer->reader_page))); +} +EXPORT_SYMBOL_GPL(ring_buffer_iter_empty); + +static void +rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) +{ + u64 delta; + + switch (event->type_len) { + case RINGBUF_TYPE_PADDING: + return; + + case RINGBUF_TYPE_TIME_EXTEND: + delta = rb_event_time_stamp(event); + cpu_buffer->read_stamp += delta; + return; + + case RINGBUF_TYPE_TIME_STAMP: + delta = rb_event_time_stamp(event); + delta = rb_fix_abs_ts(delta, cpu_buffer->read_stamp); + cpu_buffer->read_stamp = delta; + return; + + case RINGBUF_TYPE_DATA: + cpu_buffer->read_stamp += event->time_delta; + return; + + default: + RB_WARN_ON(cpu_buffer, 1); + } +} + +static void +rb_update_iter_read_stamp(struct ring_buffer_iter *iter, + struct ring_buffer_event *event) +{ + u64 delta; + + switch (event->type_len) { + case RINGBUF_TYPE_PADDING: + return; + + case RINGBUF_TYPE_TIME_EXTEND: + delta = rb_event_time_stamp(event); + iter->read_stamp += delta; + return; + + case RINGBUF_TYPE_TIME_STAMP: + delta = rb_event_time_stamp(event); + delta = rb_fix_abs_ts(delta, iter->read_stamp); + iter->read_stamp = delta; + return; + + case RINGBUF_TYPE_DATA: + iter->read_stamp += event->time_delta; + return; + + default: + RB_WARN_ON(iter->cpu_buffer, 1); + } +} + +static struct buffer_page * +rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct buffer_page *reader = NULL; + unsigned long overwrite; + unsigned long flags; + int nr_loops = 0; + bool ret; + + local_irq_save(flags); + arch_spin_lock(&cpu_buffer->lock); + + again: + /* + * This should normally only loop twice. But because the + * start of the reader inserts an empty page, it causes + * a case where we will loop three times. There should be no + * reason to loop four times (that I know of). + */ + if (RB_WARN_ON(cpu_buffer, ++nr_loops > 3)) { + reader = NULL; + goto out; + } + + reader = cpu_buffer->reader_page; + + /* If there's more to read, return this page */ + if (cpu_buffer->reader_page->read < rb_page_size(reader)) + goto out; + + /* Never should we have an index greater than the size */ + if (RB_WARN_ON(cpu_buffer, + cpu_buffer->reader_page->read > rb_page_size(reader))) + goto out; + + /* check if we caught up to the tail */ + reader = NULL; + if (cpu_buffer->commit_page == cpu_buffer->reader_page) + goto out; + + /* Don't bother swapping if the ring buffer is empty */ + if (rb_num_of_entries(cpu_buffer) == 0) + goto out; + + /* + * Reset the reader page to size zero. + */ + local_set(&cpu_buffer->reader_page->write, 0); + local_set(&cpu_buffer->reader_page->entries, 0); + local_set(&cpu_buffer->reader_page->page->commit, 0); + cpu_buffer->reader_page->real_end = 0; + + spin: + /* + * Splice the empty reader page into the list around the head. 
+ */ + reader = rb_set_head_page(cpu_buffer); + if (!reader) + goto out; + cpu_buffer->reader_page->list.next = rb_list_head(reader->list.next); + cpu_buffer->reader_page->list.prev = reader->list.prev; + + /* + * cpu_buffer->pages just needs to point to the buffer, it + * has no specific buffer page to point to. Lets move it out + * of our way so we don't accidentally swap it. + */ + cpu_buffer->pages = reader->list.prev; + + /* The reader page will be pointing to the new head */ + rb_set_list_to_head(&cpu_buffer->reader_page->list); + + /* + * We want to make sure we read the overruns after we set up our + * pointers to the next object. The writer side does a + * cmpxchg to cross pages which acts as the mb on the writer + * side. Note, the reader will constantly fail the swap + * while the writer is updating the pointers, so this + * guarantees that the overwrite recorded here is the one we + * want to compare with the last_overrun. + */ + smp_mb(); + overwrite = local_read(&(cpu_buffer->overrun)); + + /* + * Here's the tricky part. + * + * We need to move the pointer past the header page. + * But we can only do that if a writer is not currently + * moving it. The page before the header page has the + * flag bit '1' set if it is pointing to the page we want. + * but if the writer is in the process of moving it + * than it will be '2' or already moved '0'. + */ + + ret = rb_head_page_replace(reader, cpu_buffer->reader_page); + + /* + * If we did not convert it, then we must try again. + */ + if (!ret) + goto spin; + + /* + * Yay! We succeeded in replacing the page. + * + * Now make the new head point back to the reader page. + */ + rb_list_head(reader->list.next)->prev = &cpu_buffer->reader_page->list; + rb_inc_page(&cpu_buffer->head_page); + + local_inc(&cpu_buffer->pages_read); + + /* Finally update the reader page to the new head */ + cpu_buffer->reader_page = reader; + cpu_buffer->reader_page->read = 0; + + if (overwrite != cpu_buffer->last_overrun) { + cpu_buffer->lost_events = overwrite - cpu_buffer->last_overrun; + cpu_buffer->last_overrun = overwrite; + } + + goto again; + + out: + /* Update the read_stamp on the first event */ + if (reader && reader->read == 0) + cpu_buffer->read_stamp = reader->page->time_stamp; + + arch_spin_unlock(&cpu_buffer->lock); + local_irq_restore(flags); + + /* + * The writer has preempt disable, wait for it. But not forever + * Although, 1 second is pretty much "forever" + */ +#define USECS_WAIT 1000000 + for (nr_loops = 0; nr_loops < USECS_WAIT; nr_loops++) { + /* If the write is past the end of page, a writer is still updating it */ + if (likely(!reader || rb_page_write(reader) <= BUF_PAGE_SIZE)) + break; + + udelay(1); + + /* Get the latest version of the reader write value */ + smp_rmb(); + } + + /* The writer is not moving forward? Something is wrong */ + if (RB_WARN_ON(cpu_buffer, nr_loops == USECS_WAIT)) + reader = NULL; + + /* + * Make sure we see any padding after the write update + * (see rb_reset_tail()). + * + * In addition, a writer may be writing on the reader page + * if the page has not been fully filled, so the read barrier + * is also needed to make sure we see the content of what is + * committed by the writer (see rb_set_commit_to_write()). 
+ */ + smp_rmb(); + + + return reader; +} + +static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct ring_buffer_event *event; + struct buffer_page *reader; + unsigned length; + + reader = rb_get_reader_page(cpu_buffer); + + /* This function should not be called when buffer is empty */ + if (RB_WARN_ON(cpu_buffer, !reader)) + return; + + event = rb_reader_event(cpu_buffer); + + if (event->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX) + cpu_buffer->read++; + + rb_update_read_stamp(cpu_buffer, event); + + length = rb_event_length(event); + cpu_buffer->reader_page->read += length; + cpu_buffer->read_bytes += length; +} + +static void rb_advance_iter(struct ring_buffer_iter *iter) +{ + struct ring_buffer_per_cpu *cpu_buffer; + + cpu_buffer = iter->cpu_buffer; + + /* If head == next_event then we need to jump to the next event */ + if (iter->head == iter->next_event) { + /* If the event gets overwritten again, there's nothing to do */ + if (rb_iter_head_event(iter) == NULL) + return; + } + + iter->head = iter->next_event; + + /* + * Check if we are at the end of the buffer. + */ + if (iter->next_event >= rb_page_size(iter->head_page)) { + /* discarded commits can make the page empty */ + if (iter->head_page == cpu_buffer->commit_page) + return; + rb_inc_iter(iter); + return; + } + + rb_update_iter_read_stamp(iter, iter->event); +} + +static int rb_lost_events(struct ring_buffer_per_cpu *cpu_buffer) +{ + return cpu_buffer->lost_events; +} + +static struct ring_buffer_event * +rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts, + unsigned long *lost_events) +{ + struct ring_buffer_event *event; + struct buffer_page *reader; + int nr_loops = 0; + + if (ts) + *ts = 0; + again: + /* + * We repeat when a time extend is encountered. + * Since the time extend is always attached to a data event, + * we should never loop more than once. + * (We never hit the following condition more than twice). + */ + if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2)) + return NULL; + + reader = rb_get_reader_page(cpu_buffer); + if (!reader) + return NULL; + + event = rb_reader_event(cpu_buffer); + + switch (event->type_len) { + case RINGBUF_TYPE_PADDING: + if (rb_null_event(event)) + RB_WARN_ON(cpu_buffer, 1); + /* + * Because the writer could be discarding every + * event it creates (which would probably be bad) + * if we were to go back to "again" then we may never + * catch up, and will trigger the warn on, or lock + * the box. Return the padding, and we will release + * the current locks, and try again. 
+ */ + return event; + + case RINGBUF_TYPE_TIME_EXTEND: + /* Internal data, OK to advance */ + rb_advance_reader(cpu_buffer); + goto again; + + case RINGBUF_TYPE_TIME_STAMP: + if (ts) { + *ts = rb_event_time_stamp(event); + *ts = rb_fix_abs_ts(*ts, reader->page->time_stamp); + ring_buffer_normalize_time_stamp(cpu_buffer->buffer, + cpu_buffer->cpu, ts); + } + /* Internal data, OK to advance */ + rb_advance_reader(cpu_buffer); + goto again; + + case RINGBUF_TYPE_DATA: + if (ts && !(*ts)) { + *ts = cpu_buffer->read_stamp + event->time_delta; + ring_buffer_normalize_time_stamp(cpu_buffer->buffer, + cpu_buffer->cpu, ts); + } + if (lost_events) + *lost_events = rb_lost_events(cpu_buffer); + return event; + + default: + RB_WARN_ON(cpu_buffer, 1); + } + + return NULL; +} +EXPORT_SYMBOL_GPL(ring_buffer_peek); + +static struct ring_buffer_event * +rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) +{ + struct trace_buffer *buffer; + struct ring_buffer_per_cpu *cpu_buffer; + struct ring_buffer_event *event; + int nr_loops = 0; + + if (ts) + *ts = 0; + + cpu_buffer = iter->cpu_buffer; + buffer = cpu_buffer->buffer; + + /* + * Check if someone performed a consuming read to the buffer + * or removed some pages from the buffer. In these cases, + * iterator was invalidated and we need to reset it. + */ + if (unlikely(iter->cache_read != cpu_buffer->read || + iter->cache_reader_page != cpu_buffer->reader_page || + iter->cache_pages_removed != cpu_buffer->pages_removed)) + rb_iter_reset(iter); + + again: + if (ring_buffer_iter_empty(iter)) + return NULL; + + /* + * As the writer can mess with what the iterator is trying + * to read, just give up if we fail to get an event after + * three tries. The iterator is not as reliable when reading + * the ring buffer with an active write as the consumer is. + * Do not warn if the three failures is reached. + */ + if (++nr_loops > 3) + return NULL; + + if (rb_per_cpu_empty(cpu_buffer)) + return NULL; + + if (iter->head >= rb_page_size(iter->head_page)) { + rb_inc_iter(iter); + goto again; + } + + event = rb_iter_head_event(iter); + if (!event) + goto again; + + switch (event->type_len) { + case RINGBUF_TYPE_PADDING: + if (rb_null_event(event)) { + rb_inc_iter(iter); + goto again; + } + rb_advance_iter(iter); + return event; + + case RINGBUF_TYPE_TIME_EXTEND: + /* Internal data, OK to advance */ + rb_advance_iter(iter); + goto again; + + case RINGBUF_TYPE_TIME_STAMP: + if (ts) { + *ts = rb_event_time_stamp(event); + *ts = rb_fix_abs_ts(*ts, iter->head_page->page->time_stamp); + ring_buffer_normalize_time_stamp(cpu_buffer->buffer, + cpu_buffer->cpu, ts); + } + /* Internal data, OK to advance */ + rb_advance_iter(iter); + goto again; + + case RINGBUF_TYPE_DATA: + if (ts && !(*ts)) { + *ts = iter->read_stamp + event->time_delta; + ring_buffer_normalize_time_stamp(buffer, + cpu_buffer->cpu, ts); + } + return event; + + default: + RB_WARN_ON(cpu_buffer, 1); + } + + return NULL; +} +EXPORT_SYMBOL_GPL(ring_buffer_iter_peek); + +static inline bool rb_reader_lock(struct ring_buffer_per_cpu *cpu_buffer) +{ + if (likely(!in_nmi())) { + raw_spin_lock(&cpu_buffer->reader_lock); + return true; + } + + /* + * If an NMI die dumps out the content of the ring buffer + * trylock must be used to prevent a deadlock if the NMI + * preempted a task that holds the ring buffer locks. If + * we get the lock then all is fine, if not, then continue + * to do the read, but this can corrupt the ring buffer, + * so it must be permanently disabled from future writes. 
+ * Reading from NMI is a one-shot deal.
+ */
+ if (raw_spin_trylock(&cpu_buffer->reader_lock))
+ return true;
+
+ /* Continue without locking, but disable the ring buffer */
+ atomic_inc(&cpu_buffer->record_disabled);
+ return false;
+}
+
+static inline void
+rb_reader_unlock(struct ring_buffer_per_cpu *cpu_buffer, bool locked)
+{
+ if (likely(locked))
+ raw_spin_unlock(&cpu_buffer->reader_lock);
+}
+
+/**
+ * ring_buffer_peek - peek at the next event to be read
+ * @buffer: The ring buffer to read
+ * @cpu: The cpu to peek at
+ * @ts: The timestamp counter of this event.
+ * @lost_events: a variable to store if events were lost (may be NULL)
+ *
+ * This will return the event that will be read next, but does
+ * not consume the data.
+ */
+struct ring_buffer_event *
+ring_buffer_peek(struct trace_buffer *buffer, int cpu, u64 *ts,
+ unsigned long *lost_events)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
+ struct ring_buffer_event *event;
+ unsigned long flags;
+ bool dolock;
+
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return NULL;
+
+ again:
+ local_irq_save(flags);
+ dolock = rb_reader_lock(cpu_buffer);
+ event = rb_buffer_peek(cpu_buffer, ts, lost_events);
+ if (event && event->type_len == RINGBUF_TYPE_PADDING)
+ rb_advance_reader(cpu_buffer);
+ rb_reader_unlock(cpu_buffer, dolock);
+ local_irq_restore(flags);
+
+ if (event && event->type_len == RINGBUF_TYPE_PADDING)
+ goto again;
+
+ return event;
+}
+
+/** ring_buffer_iter_dropped - report if there are dropped events
+ * @iter: The ring buffer iterator
+ *
+ * Returns true if there were dropped events since the last peek.
+ */
+bool ring_buffer_iter_dropped(struct ring_buffer_iter *iter)
+{
+ bool ret = iter->missed_events != 0;
+
+ iter->missed_events = 0;
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_iter_dropped);
+
+/**
+ * ring_buffer_iter_peek - peek at the next event to be read
+ * @iter: The ring buffer iterator
+ * @ts: The timestamp counter of this event.
+ *
+ * This will return the event that will be read next, but does
+ * not increment the iterator.
+ */
+struct ring_buffer_event *
+ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
+ struct ring_buffer_event *event;
+ unsigned long flags;
+
+ again:
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ event = rb_iter_peek(iter, ts);
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+ if (event && event->type_len == RINGBUF_TYPE_PADDING)
+ goto again;
+
+ return event;
+}
+
+/**
+ * ring_buffer_consume - return an event and consume it
+ * @buffer: The ring buffer to get the next event from
+ * @cpu: the cpu to read the buffer from
+ * @ts: a variable to store the timestamp (may be NULL)
+ * @lost_events: a variable to store if events were lost (may be NULL)
+ *
+ * Returns the next event in the ring buffer, and that event is consumed.
+ * Meaning, that sequential reads will keep returning a different event,
+ * and eventually empty the ring buffer if the producer is slower.
+ */
+struct ring_buffer_event *
+ring_buffer_consume(struct trace_buffer *buffer, int cpu, u64 *ts,
+ unsigned long *lost_events)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event = NULL;
+ unsigned long flags;
+ bool dolock;
+
+ again:
+ /* might be called in atomic */
+ preempt_disable();
+
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ goto out;
+
+ cpu_buffer = buffer->buffers[cpu];
+ local_irq_save(flags);
+ dolock = rb_reader_lock(cpu_buffer);
+
+ event = rb_buffer_peek(cpu_buffer, ts, lost_events);
+ if (event) {
+ cpu_buffer->lost_events = 0;
+ rb_advance_reader(cpu_buffer);
+ }
+
+ rb_reader_unlock(cpu_buffer, dolock);
+ local_irq_restore(flags);
+
+ out:
+ preempt_enable();
+
+ if (event && event->type_len == RINGBUF_TYPE_PADDING)
+ goto again;
+
+ return event;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_consume);
+
+/**
+ * ring_buffer_read_prepare - Prepare for a non consuming read of the buffer
+ * @buffer: The ring buffer to read from
+ * @cpu: The cpu buffer to iterate over
+ * @flags: gfp flags to use for memory allocation
+ *
+ * This performs the initial preparations necessary to iterate
+ * through the buffer. Memory is allocated, buffer recording
+ * is disabled, and the iterator pointer is returned to the caller.
+ *
+ * Disabling buffer recording prevents the reading from being
+ * corrupted. This is not a consuming read, so a producer is not
+ * expected.
+ *
+ * After a sequence of ring_buffer_read_prepare calls, the user is
+ * expected to make at least one call to ring_buffer_read_prepare_sync.
+ * Afterwards, ring_buffer_read_start is invoked to get things going
+ * for real.
+ *
+ * This overall must be paired with ring_buffer_read_finish.
+ */
+struct ring_buffer_iter *
+ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_iter *iter;
+
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return NULL;
+
+ iter = kzalloc(sizeof(*iter), flags);
+ if (!iter)
+ return NULL;
+
+ /* Holds the entire event: data and meta data */
+ iter->event = kmalloc(BUF_PAGE_SIZE, flags);
+ if (!iter->event) {
+ kfree(iter);
+ return NULL;
+ }
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ iter->cpu_buffer = cpu_buffer;
+
+ atomic_inc(&cpu_buffer->resize_disabled);
+
+ return iter;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_read_prepare);
+
+/**
+ * ring_buffer_read_prepare_sync - Synchronize a set of prepare calls
+ *
+ * All previously invoked ring_buffer_read_prepare calls to prepare
+ * iterators will be synchronized. Afterwards, ring_buffer_read_start
+ * calls on those iterators are allowed.
+ */
+void
+ring_buffer_read_prepare_sync(void)
+{
+ synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync);
+
+/**
+ * ring_buffer_read_start - start a non consuming read of the buffer
+ * @iter: The iterator returned by ring_buffer_read_prepare
+ *
+ * This finalizes the startup of an iteration through the buffer.
+ * The iterator comes from a call to ring_buffer_read_prepare and
+ * an intervening ring_buffer_read_prepare_sync must have been
+ * performed.
+ *
+ * Must be paired with ring_buffer_read_finish.
+ */
+void
+ring_buffer_read_start(struct ring_buffer_iter *iter)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long flags;
+
+ if (!iter)
+ return;
+
+ cpu_buffer = iter->cpu_buffer;
+
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ arch_spin_lock(&cpu_buffer->lock);
+ rb_iter_reset(iter);
+ arch_spin_unlock(&cpu_buffer->lock);
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+}
+EXPORT_SYMBOL_GPL(ring_buffer_read_start);
+
+/**
+ * ring_buffer_read_finish - finish reading the iterator of the buffer
+ * @iter: The iterator retrieved by ring_buffer_read_start
+ *
+ * This re-enables the recording to the buffer, and frees the
+ * iterator.
+ */
+void
+ring_buffer_read_finish(struct ring_buffer_iter *iter)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
+ unsigned long flags;
+
+ /*
+ * Ring buffer is disabled from recording, here's a good place
+ * to check the integrity of the ring buffer.
+ * Must prevent readers from trying to read, as the check
+ * clears the HEAD page and readers require it.
+ */
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ rb_check_pages(cpu_buffer);
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+ atomic_dec(&cpu_buffer->resize_disabled);
+ kfree(iter->event);
+ kfree(iter);
+}
+EXPORT_SYMBOL_GPL(ring_buffer_read_finish);
+
+/**
+ * ring_buffer_iter_advance - advance the iterator to the next location
+ * @iter: The ring buffer iterator
+ *
+ * Move the location of the iterator such that the next read will
+ * be the next location of the iterator.
+ */
+void ring_buffer_iter_advance(struct ring_buffer_iter *iter)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+
+ rb_advance_iter(iter);
+
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+}
+EXPORT_SYMBOL_GPL(ring_buffer_iter_advance);
+
+/**
+ * ring_buffer_size - return the size of the ring buffer (in bytes)
+ * @buffer: The ring buffer.
+ * @cpu: The CPU to get ring buffer size from.
+ */
+unsigned long ring_buffer_size(struct trace_buffer *buffer, int cpu)
+{
+ /*
+ * Earlier, this method returned
+ * BUF_PAGE_SIZE * buffer->nr_pages
+ * Since the nr_pages field is now removed, we have converted this to
+ * return the per cpu buffer value.
+ */ + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return 0; + + return BUF_PAGE_SIZE * buffer->buffers[cpu]->nr_pages; +} +EXPORT_SYMBOL_GPL(ring_buffer_size); + +static void rb_clear_buffer_page(struct buffer_page *page) +{ + local_set(&page->write, 0); + local_set(&page->entries, 0); + rb_init_page(page->page); + page->read = 0; +} + +static void +rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct buffer_page *page; + + rb_head_page_deactivate(cpu_buffer); + + cpu_buffer->head_page + = list_entry(cpu_buffer->pages, struct buffer_page, list); + rb_clear_buffer_page(cpu_buffer->head_page); + list_for_each_entry(page, cpu_buffer->pages, list) { + rb_clear_buffer_page(page); + } + + cpu_buffer->tail_page = cpu_buffer->head_page; + cpu_buffer->commit_page = cpu_buffer->head_page; + + INIT_LIST_HEAD(&cpu_buffer->reader_page->list); + INIT_LIST_HEAD(&cpu_buffer->new_pages); + rb_clear_buffer_page(cpu_buffer->reader_page); + + local_set(&cpu_buffer->entries_bytes, 0); + local_set(&cpu_buffer->overrun, 0); + local_set(&cpu_buffer->commit_overrun, 0); + local_set(&cpu_buffer->dropped_events, 0); + local_set(&cpu_buffer->entries, 0); + local_set(&cpu_buffer->committing, 0); + local_set(&cpu_buffer->commits, 0); + local_set(&cpu_buffer->pages_touched, 0); + local_set(&cpu_buffer->pages_lost, 0); + local_set(&cpu_buffer->pages_read, 0); + cpu_buffer->last_pages_touch = 0; + cpu_buffer->shortest_full = 0; + cpu_buffer->read = 0; + cpu_buffer->read_bytes = 0; + + rb_time_set(&cpu_buffer->write_stamp, 0); + rb_time_set(&cpu_buffer->before_stamp, 0); + + memset(cpu_buffer->event_stamp, 0, sizeof(cpu_buffer->event_stamp)); + + cpu_buffer->lost_events = 0; + cpu_buffer->last_overrun = 0; + + rb_head_page_activate(cpu_buffer); + cpu_buffer->pages_removed = 0; +} + +/* Must have disabled the cpu buffer then done a synchronize_rcu */ +static void reset_disabled_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + + if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) + goto out; + + arch_spin_lock(&cpu_buffer->lock); + + rb_reset_cpu(cpu_buffer); + + arch_spin_unlock(&cpu_buffer->lock); + + out: + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); +} + +/** + * ring_buffer_reset_cpu - reset a ring buffer per CPU buffer + * @buffer: The ring buffer to reset a per cpu buffer of + * @cpu: The CPU buffer to be reset + */ +void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return; + + /* prevent another thread from changing buffer sizes */ + mutex_lock(&buffer->mutex); + + atomic_inc(&cpu_buffer->resize_disabled); + atomic_inc(&cpu_buffer->record_disabled); + + /* Make sure all commits have finished */ + synchronize_rcu(); + + reset_disabled_cpu_buffer(cpu_buffer); + + atomic_dec(&cpu_buffer->record_disabled); + atomic_dec(&cpu_buffer->resize_disabled); + + mutex_unlock(&buffer->mutex); +} +EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); + +/* Flag to ensure proper resetting of atomic variables */ +#define RESET_BIT (1 << 30) + +/** + * ring_buffer_reset_online_cpus - reset a ring buffer per CPU buffer + * @buffer: The ring buffer to reset a per cpu buffer of + */ +void ring_buffer_reset_online_cpus(struct trace_buffer *buffer) +{ + struct ring_buffer_per_cpu *cpu_buffer; + int cpu; + + /* prevent another thread from changing buffer sizes */ + 
mutex_lock(&buffer->mutex); + + for_each_online_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + + atomic_add(RESET_BIT, &cpu_buffer->resize_disabled); + atomic_inc(&cpu_buffer->record_disabled); + } + + /* Make sure all commits have finished */ + synchronize_rcu(); + + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + + /* + * If a CPU came online during the synchronize_rcu(), then + * ignore it. + */ + if (!(atomic_read(&cpu_buffer->resize_disabled) & RESET_BIT)) + continue; + + reset_disabled_cpu_buffer(cpu_buffer); + + atomic_dec(&cpu_buffer->record_disabled); + atomic_sub(RESET_BIT, &cpu_buffer->resize_disabled); + } + + mutex_unlock(&buffer->mutex); +} + +/** + * ring_buffer_reset - reset a ring buffer + * @buffer: The ring buffer to reset all cpu buffers + */ +void ring_buffer_reset(struct trace_buffer *buffer) +{ + struct ring_buffer_per_cpu *cpu_buffer; + int cpu; + + /* prevent another thread from changing buffer sizes */ + mutex_lock(&buffer->mutex); + + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + + atomic_inc(&cpu_buffer->resize_disabled); + atomic_inc(&cpu_buffer->record_disabled); + } + + /* Make sure all commits have finished */ + synchronize_rcu(); + + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + + reset_disabled_cpu_buffer(cpu_buffer); + + atomic_dec(&cpu_buffer->record_disabled); + atomic_dec(&cpu_buffer->resize_disabled); + } + + mutex_unlock(&buffer->mutex); +} +EXPORT_SYMBOL_GPL(ring_buffer_reset); + +/** + * ring_buffer_empty - is the ring buffer empty? + * @buffer: The ring buffer to test + */ +bool ring_buffer_empty(struct trace_buffer *buffer) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long flags; + bool dolock; + bool ret; + int cpu; + + /* yes this is racy, but if you don't like the race, lock the buffer */ + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + local_irq_save(flags); + dolock = rb_reader_lock(cpu_buffer); + ret = rb_per_cpu_empty(cpu_buffer); + rb_reader_unlock(cpu_buffer, dolock); + local_irq_restore(flags); + + if (!ret) + return false; + } + + return true; +} +EXPORT_SYMBOL_GPL(ring_buffer_empty); + +/** + * ring_buffer_empty_cpu - is a cpu buffer of a ring buffer empty? + * @buffer: The ring buffer + * @cpu: The CPU buffer to test + */ +bool ring_buffer_empty_cpu(struct trace_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long flags; + bool dolock; + bool ret; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return true; + + cpu_buffer = buffer->buffers[cpu]; + local_irq_save(flags); + dolock = rb_reader_lock(cpu_buffer); + ret = rb_per_cpu_empty(cpu_buffer); + rb_reader_unlock(cpu_buffer, dolock); + local_irq_restore(flags); + + return ret; +} +EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu); + +#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP +/** + * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers + * @buffer_a: One buffer to swap with + * @buffer_b: The other buffer to swap with + * @cpu: the CPU of the buffers to swap + * + * This function is useful for tracers that want to take a "snapshot" + * of a CPU buffer and has another back up buffer lying around. + * it is expected that the tracer handles the cpu buffer not being + * used at the moment. 
+ */ +int ring_buffer_swap_cpu(struct trace_buffer *buffer_a, + struct trace_buffer *buffer_b, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer_a; + struct ring_buffer_per_cpu *cpu_buffer_b; + int ret = -EINVAL; + + if (!cpumask_test_cpu(cpu, buffer_a->cpumask) || + !cpumask_test_cpu(cpu, buffer_b->cpumask)) + goto out; + + cpu_buffer_a = buffer_a->buffers[cpu]; + cpu_buffer_b = buffer_b->buffers[cpu]; + + /* At least make sure the two buffers are somewhat the same */ + if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages) + goto out; + + ret = -EAGAIN; + + if (atomic_read(&buffer_a->record_disabled)) + goto out; + + if (atomic_read(&buffer_b->record_disabled)) + goto out; + + if (atomic_read(&cpu_buffer_a->record_disabled)) + goto out; + + if (atomic_read(&cpu_buffer_b->record_disabled)) + goto out; + + /* + * We can't do a synchronize_rcu here because this + * function can be called in atomic context. + * Normally this will be called from the same CPU as cpu. + * If not it's up to the caller to protect this. + */ + atomic_inc(&cpu_buffer_a->record_disabled); + atomic_inc(&cpu_buffer_b->record_disabled); + + ret = -EBUSY; + if (local_read(&cpu_buffer_a->committing)) + goto out_dec; + if (local_read(&cpu_buffer_b->committing)) + goto out_dec; + + /* + * When resize is in progress, we cannot swap it because + * it will mess the state of the cpu buffer. + */ + if (atomic_read(&buffer_a->resizing)) + goto out_dec; + if (atomic_read(&buffer_b->resizing)) + goto out_dec; + + buffer_a->buffers[cpu] = cpu_buffer_b; + buffer_b->buffers[cpu] = cpu_buffer_a; + + cpu_buffer_b->buffer = buffer_a; + cpu_buffer_a->buffer = buffer_b; + + ret = 0; + +out_dec: + atomic_dec(&cpu_buffer_a->record_disabled); + atomic_dec(&cpu_buffer_b->record_disabled); +out: + return ret; +} +EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); +#endif /* CONFIG_RING_BUFFER_ALLOW_SWAP */ + +/** + * ring_buffer_alloc_read_page - allocate a page to read from buffer + * @buffer: the buffer to allocate for. + * @cpu: the cpu buffer to allocate. + * + * This function is used in conjunction with ring_buffer_read_page. + * When reading a full page from the ring buffer, these functions + * can be used to speed up the process. The calling function should + * allocate a few pages first with this function. Then when it + * needs to get pages from the ring buffer, it passes the result + * of this function into ring_buffer_read_page, which will swap + * the page that was allocated, with the read page of the buffer. 
+ * + * Returns: + * The page allocated, or ERR_PTR + */ +void *ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + struct buffer_data_page *bpage = NULL; + unsigned long flags; + struct page *page; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return ERR_PTR(-ENODEV); + + cpu_buffer = buffer->buffers[cpu]; + local_irq_save(flags); + arch_spin_lock(&cpu_buffer->lock); + + if (cpu_buffer->free_page) { + bpage = cpu_buffer->free_page; + cpu_buffer->free_page = NULL; + } + + arch_spin_unlock(&cpu_buffer->lock); + local_irq_restore(flags); + + if (bpage) + goto out; + + page = alloc_pages_node(cpu_to_node(cpu), + GFP_KERNEL | __GFP_NORETRY, 0); + if (!page) + return ERR_PTR(-ENOMEM); + + bpage = page_address(page); + + out: + rb_init_page(bpage); + + return bpage; +} +EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page); + +/** + * ring_buffer_free_read_page - free an allocated read page + * @buffer: the buffer the page was allocate for + * @cpu: the cpu buffer the page came from + * @data: the page to free + * + * Free a page allocated from ring_buffer_alloc_read_page. + */ +void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, void *data) +{ + struct ring_buffer_per_cpu *cpu_buffer; + struct buffer_data_page *bpage = data; + struct page *page = virt_to_page(bpage); + unsigned long flags; + + if (!buffer || !buffer->buffers || !buffer->buffers[cpu]) + return; + + cpu_buffer = buffer->buffers[cpu]; + + /* If the page is still in use someplace else, we can't reuse it */ + if (page_ref_count(page) > 1) + goto out; + + local_irq_save(flags); + arch_spin_lock(&cpu_buffer->lock); + + if (!cpu_buffer->free_page) { + cpu_buffer->free_page = bpage; + bpage = NULL; + } + + arch_spin_unlock(&cpu_buffer->lock); + local_irq_restore(flags); + + out: + free_page((unsigned long)bpage); +} +EXPORT_SYMBOL_GPL(ring_buffer_free_read_page); + +/** + * ring_buffer_read_page - extract a page from the ring buffer + * @buffer: buffer to extract from + * @data_page: the page to use allocated from ring_buffer_alloc_read_page + * @len: amount to extract + * @cpu: the cpu of the buffer to extract + * @full: should the extraction only happen when the page is full. + * + * This function will pull out a page from the ring buffer and consume it. + * @data_page must be the address of the variable that was returned + * from ring_buffer_alloc_read_page. This is because the page might be used + * to swap with a page in the ring buffer. + * + * for example: + * rpage = ring_buffer_alloc_read_page(buffer, cpu); + * if (IS_ERR(rpage)) + * return PTR_ERR(rpage); + * ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0); + * if (ret >= 0) + * process_page(rpage, ret); + * + * When @full is set, the function will not return true unless + * the writer is off the reader page. + * + * Note: it is up to the calling functions to handle sleeps and wakeups. + * The ring buffer can be used anywhere in the kernel and can not + * blindly call wake_up. The layer that uses the ring buffer must be + * responsible for that. + * + * Returns: + * >=0 if data has been transferred, returns the offset of consumed data. + * <0 if no data has been transferred. 
+ */ +int ring_buffer_read_page(struct trace_buffer *buffer, + void **data_page, size_t len, int cpu, int full) +{ + struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; + struct ring_buffer_event *event; + struct buffer_data_page *bpage; + struct buffer_page *reader; + unsigned long missed_events; + unsigned long flags; + unsigned int commit; + unsigned int read; + u64 save_timestamp; + int ret = -1; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + goto out; + + /* + * If len is not big enough to hold the page header, then + * we can not copy anything. + */ + if (len <= BUF_PAGE_HDR_SIZE) + goto out; + + len -= BUF_PAGE_HDR_SIZE; + + if (!data_page) + goto out; + + bpage = *data_page; + if (!bpage) + goto out; + + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + + reader = rb_get_reader_page(cpu_buffer); + if (!reader) + goto out_unlock; + + event = rb_reader_event(cpu_buffer); + + read = reader->read; + commit = rb_page_commit(reader); + + /* Check if any events were dropped */ + missed_events = cpu_buffer->lost_events; + + /* + * If this page has been partially read or + * if len is not big enough to read the rest of the page or + * a writer is still on the page, then + * we must copy the data from the page to the buffer. + * Otherwise, we can simply swap the page with the one passed in. + */ + if (read || (len < (commit - read)) || + cpu_buffer->reader_page == cpu_buffer->commit_page) { + struct buffer_data_page *rpage = cpu_buffer->reader_page->page; + unsigned int rpos = read; + unsigned int pos = 0; + unsigned int size; + + /* + * If a full page is expected, this can still be returned + * if there's been a previous partial read and the + * rest of the page can be read and the commit page is off + * the reader page. + */ + if (full && + (!read || (len < (commit - read)) || + cpu_buffer->reader_page == cpu_buffer->commit_page)) + goto out_unlock; + + if (len > (commit - read)) + len = (commit - read); + + /* Always keep the time extend and data together */ + size = rb_event_ts_length(event); + + if (len < size) + goto out_unlock; + + /* save the current timestamp, since the user will need it */ + save_timestamp = cpu_buffer->read_stamp; + + /* Need to copy one event at a time */ + do { + /* We need the size of one event, because + * rb_advance_reader only advances by one event, + * whereas rb_event_ts_length may include the size of + * one or two events. + * We have already ensured there's enough space if this + * is a time extend. */ + size = rb_event_length(event); + memcpy(bpage->data + pos, rpage->data + rpos, size); + + len -= size; + + rb_advance_reader(cpu_buffer); + rpos = reader->read; + pos += size; + + if (rpos >= commit) + break; + + event = rb_reader_event(cpu_buffer); + /* Always keep the time extend and data together */ + size = rb_event_ts_length(event); + } while (len >= size); + + /* update bpage */ + local_set(&bpage->commit, pos); + bpage->time_stamp = save_timestamp; + + /* we copied everything to the beginning */ + read = 0; + } else { + /* update the entry counter */ + cpu_buffer->read += rb_page_entries(reader); + cpu_buffer->read_bytes += rb_page_commit(reader); + + /* swap the pages */ + rb_init_page(bpage); + bpage = reader->page; + reader->page = *data_page; + local_set(&reader->write, 0); + local_set(&reader->entries, 0); + reader->read = 0; + *data_page = bpage; + + /* + * Use the real_end for the data size, + * This gives us a chance to store the lost events + * on the page. 
+ */ + if (reader->real_end) + local_set(&bpage->commit, reader->real_end); + } + ret = read; + + cpu_buffer->lost_events = 0; + + commit = local_read(&bpage->commit); + /* + * Set a flag in the commit field if we lost events + */ + if (missed_events) { + /* If there is room at the end of the page to save the + * missed events, then record it there. + */ + if (BUF_PAGE_SIZE - commit >= sizeof(missed_events)) { + memcpy(&bpage->data[commit], &missed_events, + sizeof(missed_events)); + local_add(RB_MISSED_STORED, &bpage->commit); + commit += sizeof(missed_events); + } + local_add(RB_MISSED_EVENTS, &bpage->commit); + } + + /* + * This page may be off to user land. Zero it out here. + */ + if (commit < BUF_PAGE_SIZE) + memset(&bpage->data[commit], 0, BUF_PAGE_SIZE - commit); + + out_unlock: + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + out: + return ret; +} +EXPORT_SYMBOL_GPL(ring_buffer_read_page); + +/* + * We only allocate new buffers, never free them if the CPU goes down. + * If we were to free the buffer, then the user would lose any trace that was in + * the buffer. + */ +int trace_rb_cpu_prepare(unsigned int cpu, struct hlist_node *node) +{ + struct trace_buffer *buffer; + long nr_pages_same; + int cpu_i; + unsigned long nr_pages; + + buffer = container_of(node, struct trace_buffer, node); + if (cpumask_test_cpu(cpu, buffer->cpumask)) + return 0; + + nr_pages = 0; + nr_pages_same = 1; + /* check if all cpu sizes are same */ + for_each_buffer_cpu(buffer, cpu_i) { + /* fill in the size from first enabled cpu */ + if (nr_pages == 0) + nr_pages = buffer->buffers[cpu_i]->nr_pages; + if (nr_pages != buffer->buffers[cpu_i]->nr_pages) { + nr_pages_same = 0; + break; + } + } + /* allocate minimum pages, user can later expand it */ + if (!nr_pages_same) + nr_pages = 2; + buffer->buffers[cpu] = + rb_allocate_cpu_buffer(buffer, nr_pages, cpu); + if (!buffer->buffers[cpu]) { + WARN(1, "failed to allocate ring buffer on CPU %u\n", + cpu); + return -ENOMEM; + } + smp_wmb(); + cpumask_set_cpu(cpu, buffer->cpumask); + return 0; +} + +#ifdef CONFIG_RING_BUFFER_STARTUP_TEST +/* + * This is a basic integrity check of the ring buffer. + * Late in the boot cycle this test will run when configured in. + * It will kick off a thread per CPU that will go into a loop + * writing to the per cpu ring buffer various sizes of data. + * Some of the data will be large items, some small. + * + * Another thread is created that goes into a spin, sending out + * IPIs to the other CPUs to also write into the ring buffer. + * this is to test the nesting ability of the buffer. + * + * Basic stats are recorded and reported. If something in the + * ring buffer should happen that's not expected, a big warning + * is displayed and all ring buffers are disabled. 
+ */
+static struct task_struct *rb_threads[NR_CPUS] __initdata;
+
+struct rb_test_data {
+ struct trace_buffer *buffer;
+ unsigned long events;
+ unsigned long bytes_written;
+ unsigned long bytes_alloc;
+ unsigned long bytes_dropped;
+ unsigned long events_nested;
+ unsigned long bytes_written_nested;
+ unsigned long bytes_alloc_nested;
+ unsigned long bytes_dropped_nested;
+ int min_size_nested;
+ int max_size_nested;
+ int max_size;
+ int min_size;
+ int cpu;
+ int cnt;
+};
+
+static struct rb_test_data rb_data[NR_CPUS] __initdata;
+
+/* 1 meg per cpu */
+#define RB_TEST_BUFFER_SIZE 1048576
+
+static char rb_string[] __initdata =
+ "abcdefghijklmnopqrstuvwxyz1234567890!@#$%^&*()?+\\"
+ "?+|:';\",.<>/?abcdefghijklmnopqrstuvwxyz1234567890"
+ "!@#$%^&*()?+\\?+|:';\",.<>/?abcdefghijklmnopqrstuv";
+
+static bool rb_test_started __initdata;
+
+struct rb_item {
+ int size;
+ char str[];
+};
+
+static __init int rb_write_something(struct rb_test_data *data, bool nested)
+{
+ struct ring_buffer_event *event;
+ struct rb_item *item;
+ bool started;
+ int event_len;
+ int size;
+ int len;
+ int cnt;
+
+ /* Have nested writes different from what is written */
+ cnt = data->cnt + (nested ? 27 : 0);
+
+ /* Multiply cnt by ~e, to make some unique increment */
+ size = (cnt * 68 / 25) % (sizeof(rb_string) - 1);
+
+ len = size + sizeof(struct rb_item);
+
+ started = rb_test_started;
+ /* read rb_test_started before checking buffer enabled */
+ smp_rmb();
+
+ event = ring_buffer_lock_reserve(data->buffer, len);
+ if (!event) {
+ /* Ignore dropped events before test starts. */
+ if (started) {
+ if (nested)
+ data->bytes_dropped += len;
+ else
+ data->bytes_dropped_nested += len;
+ }
+ return len;
+ }
+
+ event_len = ring_buffer_event_length(event);
+
+ if (RB_WARN_ON(data->buffer, event_len < len))
+ goto out;
+
+ item = ring_buffer_event_data(event);
+ item->size = size;
+ memcpy(item->str, rb_string, size);
+
+ if (nested) {
+ data->bytes_alloc_nested += event_len;
+ data->bytes_written_nested += len;
+ data->events_nested++;
+ if (!data->min_size_nested || len < data->min_size_nested)
+ data->min_size_nested = len;
+ if (len > data->max_size_nested)
+ data->max_size_nested = len;
+ } else {
+ data->bytes_alloc += event_len;
+ data->bytes_written += len;
+ data->events++;
+ if (!data->min_size || len < data->min_size)
+ data->min_size = len;
+ if (len > data->max_size)
+ data->max_size = len;
+ }
+
+ out:
+ ring_buffer_unlock_commit(data->buffer);
+
+ return 0;
+}
+
+static __init int rb_test(void *arg)
+{
+ struct rb_test_data *data = arg;
+
+ while (!kthread_should_stop()) {
+ rb_write_something(data, false);
+ data->cnt++;
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ /* Now sleep between a min of 100-300us and a max of 1ms */
+ usleep_range(((data->cnt % 3) + 1) * 100, 1000);
+ }
+
+ return 0;
+}
+
+static __init void rb_ipi(void *ignore)
+{
+ struct rb_test_data *data;
+ int cpu = smp_processor_id();
+
+ data = &rb_data[cpu];
+ rb_write_something(data, true);
+}
+
+static __init int rb_hammer_test(void *arg)
+{
+ while (!kthread_should_stop()) {
+
+ /* Send an IPI to all cpus to write data!
*/ + smp_call_function(rb_ipi, NULL, 1); + /* No sleep, but for non preempt, let others run */ + schedule(); + } + + return 0; +} + +static __init int test_ringbuffer(void) +{ + struct task_struct *rb_hammer; + struct trace_buffer *buffer; + int cpu; + int ret = 0; + + if (security_locked_down(LOCKDOWN_TRACEFS)) { + pr_warn("Lockdown is enabled, skipping ring buffer tests\n"); + return 0; + } + + pr_info("Running ring buffer tests...\n"); + + buffer = ring_buffer_alloc(RB_TEST_BUFFER_SIZE, RB_FL_OVERWRITE); + if (WARN_ON(!buffer)) + return 0; + + /* Disable buffer so that threads can't write to it yet */ + ring_buffer_record_off(buffer); + + for_each_online_cpu(cpu) { + rb_data[cpu].buffer = buffer; + rb_data[cpu].cpu = cpu; + rb_data[cpu].cnt = cpu; + rb_threads[cpu] = kthread_run_on_cpu(rb_test, &rb_data[cpu], + cpu, "rbtester/%u"); + if (WARN_ON(IS_ERR(rb_threads[cpu]))) { + pr_cont("FAILED\n"); + ret = PTR_ERR(rb_threads[cpu]); + goto out_free; + } + } + + /* Now create the rb hammer! */ + rb_hammer = kthread_run(rb_hammer_test, NULL, "rbhammer"); + if (WARN_ON(IS_ERR(rb_hammer))) { + pr_cont("FAILED\n"); + ret = PTR_ERR(rb_hammer); + goto out_free; + } + + ring_buffer_record_on(buffer); + /* + * Show buffer is enabled before setting rb_test_started. + * Yes there's a small race window where events could be + * dropped and the thread wont catch it. But when a ring + * buffer gets enabled, there will always be some kind of + * delay before other CPUs see it. Thus, we don't care about + * those dropped events. We care about events dropped after + * the threads see that the buffer is active. + */ + smp_wmb(); + rb_test_started = true; + + set_current_state(TASK_INTERRUPTIBLE); + /* Just run for 10 seconds */; + schedule_timeout(10 * HZ); + + kthread_stop(rb_hammer); + + out_free: + for_each_online_cpu(cpu) { + if (!rb_threads[cpu]) + break; + kthread_stop(rb_threads[cpu]); + } + if (ret) { + ring_buffer_free(buffer); + return ret; + } + + /* Report! 
*/ + pr_info("finished\n"); + for_each_online_cpu(cpu) { + struct ring_buffer_event *event; + struct rb_test_data *data = &rb_data[cpu]; + struct rb_item *item; + unsigned long total_events; + unsigned long total_dropped; + unsigned long total_written; + unsigned long total_alloc; + unsigned long total_read = 0; + unsigned long total_size = 0; + unsigned long total_len = 0; + unsigned long total_lost = 0; + unsigned long lost; + int big_event_size; + int small_event_size; + + ret = -1; + + total_events = data->events + data->events_nested; + total_written = data->bytes_written + data->bytes_written_nested; + total_alloc = data->bytes_alloc + data->bytes_alloc_nested; + total_dropped = data->bytes_dropped + data->bytes_dropped_nested; + + big_event_size = data->max_size + data->max_size_nested; + small_event_size = data->min_size + data->min_size_nested; + + pr_info("CPU %d:\n", cpu); + pr_info(" events: %ld\n", total_events); + pr_info(" dropped bytes: %ld\n", total_dropped); + pr_info(" alloced bytes: %ld\n", total_alloc); + pr_info(" written bytes: %ld\n", total_written); + pr_info(" biggest event: %d\n", big_event_size); + pr_info(" smallest event: %d\n", small_event_size); + + if (RB_WARN_ON(buffer, total_dropped)) + break; + + ret = 0; + + while ((event = ring_buffer_consume(buffer, cpu, NULL, &lost))) { + total_lost += lost; + item = ring_buffer_event_data(event); + total_len += ring_buffer_event_length(event); + total_size += item->size + sizeof(struct rb_item); + if (memcmp(&item->str[0], rb_string, item->size) != 0) { + pr_info("FAILED!\n"); + pr_info("buffer had: %.*s\n", item->size, item->str); + pr_info("expected: %.*s\n", item->size, rb_string); + RB_WARN_ON(buffer, 1); + ret = -1; + break; + } + total_read++; + } + if (ret) + break; + + ret = -1; + + pr_info(" read events: %ld\n", total_read); + pr_info(" lost events: %ld\n", total_lost); + pr_info(" total events: %ld\n", total_lost + total_read); + pr_info(" recorded len bytes: %ld\n", total_len); + pr_info(" recorded size bytes: %ld\n", total_size); + if (total_lost) { + pr_info(" With dropped events, record len and size may not match\n" + " alloced and written from above\n"); + } else { + if (RB_WARN_ON(buffer, total_len != total_alloc || + total_size != total_written)) + break; + } + if (RB_WARN_ON(buffer, total_lost + total_read != total_events)) + break; + + ret = 0; + } + if (!ret) + pr_info("Ring buffer PASSED!\n"); + + ring_buffer_free(buffer); + return 0; +} + +late_initcall(test_ringbuffer); +#endif /* CONFIG_RING_BUFFER_STARTUP_TEST */ diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c new file mode 100644 index 0000000000..aef34673d7 --- /dev/null +++ b/kernel/trace/ring_buffer_benchmark.c @@ -0,0 +1,497 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ring buffer tester and benchmark + * + * Copyright (C) 2009 Steven Rostedt <srostedt@redhat.com> + */ +#include <linux/ring_buffer.h> +#include <linux/completion.h> +#include <linux/kthread.h> +#include <uapi/linux/sched/types.h> +#include <linux/module.h> +#include <linux/ktime.h> +#include <asm/local.h> + +struct rb_page { + u64 ts; + local_t commit; + char data[4080]; +}; + +/* run time and sleep time in seconds */ +#define RUN_TIME 10ULL +#define SLEEP_TIME 10 + +/* number of events for writer to wake up the reader */ +static int wakeup_interval = 100; + +static int reader_finish; +static DECLARE_COMPLETION(read_start); +static DECLARE_COMPLETION(read_done); + +static struct trace_buffer *buffer; +static struct task_struct 
*producer; +static struct task_struct *consumer; +static unsigned long read; + +static unsigned int disable_reader; +module_param(disable_reader, uint, 0644); +MODULE_PARM_DESC(disable_reader, "only run producer"); + +static unsigned int write_iteration = 50; +module_param(write_iteration, uint, 0644); +MODULE_PARM_DESC(write_iteration, "# of writes between timestamp readings"); + +static int producer_nice = MAX_NICE; +static int consumer_nice = MAX_NICE; + +static int producer_fifo; +static int consumer_fifo; + +module_param(producer_nice, int, 0644); +MODULE_PARM_DESC(producer_nice, "nice prio for producer"); + +module_param(consumer_nice, int, 0644); +MODULE_PARM_DESC(consumer_nice, "nice prio for consumer"); + +module_param(producer_fifo, int, 0644); +MODULE_PARM_DESC(producer_fifo, "use fifo for producer: 0 - disabled, 1 - low prio, 2 - fifo"); + +module_param(consumer_fifo, int, 0644); +MODULE_PARM_DESC(consumer_fifo, "use fifo for consumer: 0 - disabled, 1 - low prio, 2 - fifo"); + +static int read_events; + +static int test_error; + +#define TEST_ERROR() \ + do { \ + if (!test_error) { \ + test_error = 1; \ + WARN_ON(1); \ + } \ + } while (0) + +enum event_status { + EVENT_FOUND, + EVENT_DROPPED, +}; + +static bool break_test(void) +{ + return test_error || kthread_should_stop(); +} + +static enum event_status read_event(int cpu) +{ + struct ring_buffer_event *event; + int *entry; + u64 ts; + + event = ring_buffer_consume(buffer, cpu, &ts, NULL); + if (!event) + return EVENT_DROPPED; + + entry = ring_buffer_event_data(event); + if (*entry != cpu) { + TEST_ERROR(); + return EVENT_DROPPED; + } + + read++; + return EVENT_FOUND; +} + +static enum event_status read_page(int cpu) +{ + struct ring_buffer_event *event; + struct rb_page *rpage; + unsigned long commit; + void *bpage; + int *entry; + int ret; + int inc; + int i; + + bpage = ring_buffer_alloc_read_page(buffer, cpu); + if (IS_ERR(bpage)) + return EVENT_DROPPED; + + ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1); + if (ret >= 0) { + rpage = bpage; + /* The commit may have missed event flags set, clear them */ + commit = local_read(&rpage->commit) & 0xfffff; + for (i = 0; i < commit && !test_error ; i += inc) { + + if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) { + TEST_ERROR(); + break; + } + + inc = -1; + event = (void *)&rpage->data[i]; + switch (event->type_len) { + case RINGBUF_TYPE_PADDING: + /* failed writes may be discarded events */ + if (!event->time_delta) + TEST_ERROR(); + inc = event->array[0] + 4; + break; + case RINGBUF_TYPE_TIME_EXTEND: + inc = 8; + break; + case 0: + entry = ring_buffer_event_data(event); + if (*entry != cpu) { + TEST_ERROR(); + break; + } + read++; + if (!event->array[0]) { + TEST_ERROR(); + break; + } + inc = event->array[0] + 4; + break; + default: + entry = ring_buffer_event_data(event); + if (*entry != cpu) { + TEST_ERROR(); + break; + } + read++; + inc = ((event->type_len + 1) * 4); + } + if (test_error) + break; + + if (inc <= 0) { + TEST_ERROR(); + break; + } + } + } + ring_buffer_free_read_page(buffer, cpu, bpage); + + if (ret < 0) + return EVENT_DROPPED; + return EVENT_FOUND; +} + +static void ring_buffer_consumer(void) +{ + /* toggle between reading pages and events */ + read_events ^= 1; + + read = 0; + /* + * Continue running until the producer specifically asks to stop + * and is ready for the completion. 
+ */ + while (!READ_ONCE(reader_finish)) { + int found = 1; + + while (found && !test_error) { + int cpu; + + found = 0; + for_each_online_cpu(cpu) { + enum event_status stat; + + if (read_events) + stat = read_event(cpu); + else + stat = read_page(cpu); + + if (test_error) + break; + + if (stat == EVENT_FOUND) + found = 1; + + } + } + + /* Wait till the producer wakes us up when there is more data + * available or when the producer wants us to finish reading. + */ + set_current_state(TASK_INTERRUPTIBLE); + if (reader_finish) + break; + + schedule(); + } + __set_current_state(TASK_RUNNING); + reader_finish = 0; + complete(&read_done); +} + +static void ring_buffer_producer(void) +{ + ktime_t start_time, end_time, timeout; + unsigned long long time; + unsigned long long entries; + unsigned long long overruns; + unsigned long missed = 0; + unsigned long hit = 0; + unsigned long avg; + int cnt = 0; + + /* + * Hammer the buffer for 10 secs (this may + * make the system stall) + */ + trace_printk("Starting ring buffer hammer\n"); + start_time = ktime_get(); + timeout = ktime_add_ns(start_time, RUN_TIME * NSEC_PER_SEC); + do { + struct ring_buffer_event *event; + int *entry; + int i; + + for (i = 0; i < write_iteration; i++) { + event = ring_buffer_lock_reserve(buffer, 10); + if (!event) { + missed++; + } else { + hit++; + entry = ring_buffer_event_data(event); + *entry = smp_processor_id(); + ring_buffer_unlock_commit(buffer); + } + } + end_time = ktime_get(); + + cnt++; + if (consumer && !(cnt % wakeup_interval)) + wake_up_process(consumer); + +#ifndef CONFIG_PREEMPTION + /* + * If we are a non preempt kernel, the 10 seconds run will + * stop everything while it runs. Instead, we will call + * cond_resched and also add any time that was lost by a + * reschedule. + * + * Do a cond resched at the same frequency we would wake up + * the reader. + */ + if (cnt % wakeup_interval) + cond_resched(); +#endif + } while (ktime_before(end_time, timeout) && !break_test()); + trace_printk("End ring buffer hammer\n"); + + if (consumer) { + /* Init both completions here to avoid races */ + init_completion(&read_start); + init_completion(&read_done); + /* the completions must be visible before the finish var */ + smp_wmb(); + reader_finish = 1; + wake_up_process(consumer); + wait_for_completion(&read_done); + } + + time = ktime_us_delta(end_time, start_time); + + entries = ring_buffer_entries(buffer); + overruns = ring_buffer_overruns(buffer); + + if (test_error) + trace_printk("ERROR!\n"); + + if (!disable_reader) { + if (consumer_fifo) + trace_printk("Running Consumer at SCHED_FIFO %s\n", + consumer_fifo == 1 ? "low" : "high"); + else + trace_printk("Running Consumer at nice: %d\n", + consumer_nice); + } + if (producer_fifo) + trace_printk("Running Producer at SCHED_FIFO %s\n", + producer_fifo == 1 ? "low" : "high"); + else + trace_printk("Running Producer at nice: %d\n", + producer_nice); + + /* Let the user know that the test is running at low priority */ + if (!producer_fifo && !consumer_fifo && + producer_nice == MAX_NICE && consumer_nice == MAX_NICE) + trace_printk("WARNING!!! This test is running at lowest priority.\n"); + + trace_printk("Time: %lld (usecs)\n", time); + trace_printk("Overruns: %lld\n", overruns); + if (disable_reader) + trace_printk("Read: (reader disabled)\n"); + else + trace_printk("Read: %ld (by %s)\n", read, + read_events ? 
"events" : "pages"); + trace_printk("Entries: %lld\n", entries); + trace_printk("Total: %lld\n", entries + overruns + read); + trace_printk("Missed: %ld\n", missed); + trace_printk("Hit: %ld\n", hit); + + /* Convert time from usecs to millisecs */ + do_div(time, USEC_PER_MSEC); + if (time) + hit /= (long)time; + else + trace_printk("TIME IS ZERO??\n"); + + trace_printk("Entries per millisec: %ld\n", hit); + + if (hit) { + /* Calculate the average time in nanosecs */ + avg = NSEC_PER_MSEC / hit; + trace_printk("%ld ns per entry\n", avg); + } + + if (missed) { + if (time) + missed /= (long)time; + + trace_printk("Total iterations per millisec: %ld\n", + hit + missed); + + /* it is possible that hit + missed will overflow and be zero */ + if (!(hit + missed)) { + trace_printk("hit + missed overflowed and totalled zero!\n"); + hit--; /* make it non zero */ + } + + /* Calculate the average time in nanosecs */ + avg = NSEC_PER_MSEC / (hit + missed); + trace_printk("%ld ns per entry\n", avg); + } +} + +static void wait_to_die(void) +{ + set_current_state(TASK_INTERRUPTIBLE); + while (!kthread_should_stop()) { + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); +} + +static int ring_buffer_consumer_thread(void *arg) +{ + while (!break_test()) { + complete(&read_start); + + ring_buffer_consumer(); + + set_current_state(TASK_INTERRUPTIBLE); + if (break_test()) + break; + schedule(); + } + __set_current_state(TASK_RUNNING); + + if (!kthread_should_stop()) + wait_to_die(); + + return 0; +} + +static int ring_buffer_producer_thread(void *arg) +{ + while (!break_test()) { + ring_buffer_reset(buffer); + + if (consumer) { + wake_up_process(consumer); + wait_for_completion(&read_start); + } + + ring_buffer_producer(); + if (break_test()) + goto out_kill; + + trace_printk("Sleeping for 10 secs\n"); + set_current_state(TASK_INTERRUPTIBLE); + if (break_test()) + goto out_kill; + schedule_timeout(HZ * SLEEP_TIME); + } + +out_kill: + __set_current_state(TASK_RUNNING); + if (!kthread_should_stop()) + wait_to_die(); + + return 0; +} + +static int __init ring_buffer_benchmark_init(void) +{ + int ret; + + /* make a one meg buffer in overwite mode */ + buffer = ring_buffer_alloc(1000000, RB_FL_OVERWRITE); + if (!buffer) + return -ENOMEM; + + if (!disable_reader) { + consumer = kthread_create(ring_buffer_consumer_thread, + NULL, "rb_consumer"); + ret = PTR_ERR(consumer); + if (IS_ERR(consumer)) + goto out_fail; + } + + producer = kthread_run(ring_buffer_producer_thread, + NULL, "rb_producer"); + ret = PTR_ERR(producer); + + if (IS_ERR(producer)) + goto out_kill; + + /* + * Run them as low-prio background tasks by default: + */ + if (!disable_reader) { + if (consumer_fifo >= 2) + sched_set_fifo(consumer); + else if (consumer_fifo == 1) + sched_set_fifo_low(consumer); + else + set_user_nice(consumer, consumer_nice); + } + + if (producer_fifo >= 2) + sched_set_fifo(producer); + else if (producer_fifo == 1) + sched_set_fifo_low(producer); + else + set_user_nice(producer, producer_nice); + + return 0; + + out_kill: + if (consumer) + kthread_stop(consumer); + + out_fail: + ring_buffer_free(buffer); + return ret; +} + +static void __exit ring_buffer_benchmark_exit(void) +{ + kthread_stop(producer); + if (consumer) + kthread_stop(consumer); + ring_buffer_free(buffer); +} + +module_init(ring_buffer_benchmark_init); +module_exit(ring_buffer_benchmark_exit); + +MODULE_AUTHOR("Steven Rostedt"); +MODULE_DESCRIPTION("ring_buffer_benchmark"); +MODULE_LICENSE("GPL"); diff --git 
a/kernel/trace/rpm-traces.c b/kernel/trace/rpm-traces.c
new file mode 100644
index 0000000000..25dec0b002
--- /dev/null
+++ b/kernel/trace/rpm-traces.c
@@ -0,0 +1,21 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Power trace points
+ *
+ * Copyright (C) 2009 Ming Lei <ming.lei@canonical.com>
+ */
+
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/usb.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/rpm.h>
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_return_int);
+EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_idle);
+EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_suspend);
+EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_resume);
diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig
new file mode 100644
index 0000000000..831779607e
--- /dev/null
+++ b/kernel/trace/rv/Kconfig
@@ -0,0 +1,78 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+config DA_MON_EVENTS
+ bool
+
+config DA_MON_EVENTS_IMPLICIT
+ select DA_MON_EVENTS
+ bool
+
+config DA_MON_EVENTS_ID
+ select DA_MON_EVENTS
+ bool
+
+menuconfig RV
+ bool "Runtime Verification"
+ depends on TRACING
+ help
+ Enable the kernel runtime verification infrastructure. RV is a
+ lightweight (yet rigorous) method that complements classical
+ exhaustive verification techniques (such as model checking and
+ theorem proving). RV works by analyzing the trace of the system's
+ actual execution, comparing it against a formal specification of
+ the system behavior.
+
+ For further information, see:
+ Documentation/trace/rv/runtime-verification.rst
+
+config RV_MON_WIP
+ depends on RV
+ depends on PREEMPT_TRACER
+ select DA_MON_EVENTS_IMPLICIT
+ bool "wip monitor"
+ help
+ Enable the wip (wakeup in preemptive) sample monitor, which
+ illustrates the usage of per-cpu monitors, and one limitation of
+ the preempt_disable/enable events.
+
+ For further information, see:
+ Documentation/trace/rv/monitor_wip.rst
+
+config RV_MON_WWNR
+ depends on RV
+ select DA_MON_EVENTS_ID
+ bool "wwnr monitor"
+ help
+ Enable the wwnr (wakeup while not running) sample monitor, which
+ illustrates the usage of a per-task monitor.
+ The model is broken on purpose: it serves to test reactors.
+
+ For further information, see:
+ Documentation/trace/rv/monitor_wwnr.rst
+
+config RV_REACTORS
+ bool "Runtime verification reactors"
+ default y
+ depends on RV
+ help
+ Enables the online runtime verification reactors. A runtime
+ monitor can cause a reaction to the detection of an exception
+ on the model's execution. By default, the monitors have
+ tracing reactions, printing the monitor output via tracepoints,
+ but other reactions can be added (on-demand) via this interface.
+
+config RV_REACT_PRINTK
+ bool "Printk reactor"
+ depends on RV_REACTORS
+ default y
+ help
+ Enables the printk reactor. The printk reactor emits a printk()
+ message if an exception is found.
+
+config RV_REACT_PANIC
+ bool "Panic reactor"
+ depends on RV_REACTORS
+ default y
+ help
+ Enables the panic reactor. The panic reactor emits a printk()
+ message if an exception is found and panic()s the system.
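The wip and wwnr entries above establish the Kconfig pattern that an RV monitor follows: depend on RV (plus whatever tracer the monitor needs), select DA_MON_EVENTS_IMPLICIT for a per-cpu monitor or DA_MON_EVENTS_ID for a per-task monitor, and point the help text at the monitor's documentation. As a rough sketch only (the "foo" monitor name and its documentation path are hypothetical and not part of this patch), a new per-task monitor entry would look roughly like this:

config RV_MON_FOO
	depends on RV
	select DA_MON_EVENTS_ID
	bool "foo monitor"
	help
	  Enable the (hypothetical) foo per-task sample monitor. A
	  per-task monitor selects DA_MON_EVENTS_ID so its events are
	  tagged with a task identifier; a per-cpu monitor would select
	  DA_MON_EVENTS_IMPLICIT instead, as the wip entry above does.

	  For further information, see:
	  Documentation/trace/rv/monitor_foo.rst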
diff --git a/kernel/trace/rv/Makefile b/kernel/trace/rv/Makefile new file mode 100644 index 0000000000..963d14875b --- /dev/null +++ b/kernel/trace/rv/Makefile @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-$(CONFIG_RV) += rv.o +obj-$(CONFIG_RV_MON_WIP) += monitors/wip/wip.o +obj-$(CONFIG_RV_MON_WWNR) += monitors/wwnr/wwnr.o +obj-$(CONFIG_RV_REACTORS) += rv_reactors.o +obj-$(CONFIG_RV_REACT_PRINTK) += reactor_printk.o +obj-$(CONFIG_RV_REACT_PANIC) += reactor_panic.o diff --git a/kernel/trace/rv/monitors/wip/wip.c b/kernel/trace/rv/monitors/wip/wip.c new file mode 100644 index 0000000000..b2b49a27e8 --- /dev/null +++ b/kernel/trace/rv/monitors/wip/wip.c @@ -0,0 +1,88 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ftrace.h> +#include <linux/tracepoint.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> +#include <rv/instrumentation.h> +#include <rv/da_monitor.h> + +#define MODULE_NAME "wip" + +#include <trace/events/rv.h> +#include <trace/events/sched.h> +#include <trace/events/preemptirq.h> + +#include "wip.h" + +static struct rv_monitor rv_wip; +DECLARE_DA_MON_PER_CPU(wip, unsigned char); + +static void handle_preempt_disable(void *data, unsigned long ip, unsigned long parent_ip) +{ + da_handle_event_wip(preempt_disable_wip); +} + +static void handle_preempt_enable(void *data, unsigned long ip, unsigned long parent_ip) +{ + da_handle_start_event_wip(preempt_enable_wip); +} + +static void handle_sched_waking(void *data, struct task_struct *task) +{ + da_handle_event_wip(sched_waking_wip); +} + +static int enable_wip(void) +{ + int retval; + + retval = da_monitor_init_wip(); + if (retval) + return retval; + + rv_attach_trace_probe("wip", preempt_enable, handle_preempt_enable); + rv_attach_trace_probe("wip", sched_waking, handle_sched_waking); + rv_attach_trace_probe("wip", preempt_disable, handle_preempt_disable); + + return 0; +} + +static void disable_wip(void) +{ + rv_wip.enabled = 0; + + rv_detach_trace_probe("wip", preempt_disable, handle_preempt_disable); + rv_detach_trace_probe("wip", preempt_enable, handle_preempt_enable); + rv_detach_trace_probe("wip", sched_waking, handle_sched_waking); + + da_monitor_destroy_wip(); +} + +static struct rv_monitor rv_wip = { + .name = "wip", + .description = "wakeup in preemptive per-cpu testing monitor.", + .enable = enable_wip, + .disable = disable_wip, + .reset = da_monitor_reset_all_wip, + .enabled = 0, +}; + +static int __init register_wip(void) +{ + rv_register_monitor(&rv_wip); + return 0; +} + +static void __exit unregister_wip(void) +{ + rv_unregister_monitor(&rv_wip); +} + +module_init(register_wip); +module_exit(unregister_wip); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Daniel Bristot de Oliveira <bristot@kernel.org>"); +MODULE_DESCRIPTION("wip: wakeup in preemptive - per-cpu sample monitor."); diff --git a/kernel/trace/rv/monitors/wip/wip.h b/kernel/trace/rv/monitors/wip/wip.h new file mode 100644 index 0000000000..2e373f2c65 --- /dev/null +++ b/kernel/trace/rv/monitors/wip/wip.h @@ -0,0 +1,46 @@ +/* + * Automatically generated C representation of wip automaton + * For further information about this format, see kernel documentation: + * Documentation/trace/rv/deterministic_automata.rst + */ + +enum states_wip { + preemptive_wip = 0, + non_preemptive_wip, + state_max_wip +}; + +#define INVALID_STATE state_max_wip + +enum events_wip { + preempt_disable_wip = 0, + preempt_enable_wip, + sched_waking_wip, + event_max_wip +}; + +struct automaton_wip { + char 
*state_names[state_max_wip]; + char *event_names[event_max_wip]; + unsigned char function[state_max_wip][event_max_wip]; + unsigned char initial_state; + bool final_states[state_max_wip]; +}; + +static const struct automaton_wip automaton_wip = { + .state_names = { + "preemptive", + "non_preemptive" + }, + .event_names = { + "preempt_disable", + "preempt_enable", + "sched_waking" + }, + .function = { + { non_preemptive_wip, INVALID_STATE, INVALID_STATE }, + { INVALID_STATE, preemptive_wip, non_preemptive_wip }, + }, + .initial_state = preemptive_wip, + .final_states = { 1, 0 }, +}; diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.c b/kernel/trace/rv/monitors/wwnr/wwnr.c new file mode 100644 index 0000000000..0e43dd2db6 --- /dev/null +++ b/kernel/trace/rv/monitors/wwnr/wwnr.c @@ -0,0 +1,87 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ftrace.h> +#include <linux/tracepoint.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> +#include <rv/instrumentation.h> +#include <rv/da_monitor.h> + +#define MODULE_NAME "wwnr" + +#include <trace/events/rv.h> +#include <trace/events/sched.h> + +#include "wwnr.h" + +static struct rv_monitor rv_wwnr; +DECLARE_DA_MON_PER_TASK(wwnr, unsigned char); + +static void handle_switch(void *data, bool preempt, struct task_struct *p, + struct task_struct *n, unsigned int prev_state) +{ + /* start monitoring only after the first suspension */ + if (prev_state == TASK_INTERRUPTIBLE) + da_handle_start_event_wwnr(p, switch_out_wwnr); + else + da_handle_event_wwnr(p, switch_out_wwnr); + + da_handle_event_wwnr(n, switch_in_wwnr); +} + +static void handle_wakeup(void *data, struct task_struct *p) +{ + da_handle_event_wwnr(p, wakeup_wwnr); +} + +static int enable_wwnr(void) +{ + int retval; + + retval = da_monitor_init_wwnr(); + if (retval) + return retval; + + rv_attach_trace_probe("wwnr", sched_switch, handle_switch); + rv_attach_trace_probe("wwnr", sched_wakeup, handle_wakeup); + + return 0; +} + +static void disable_wwnr(void) +{ + rv_wwnr.enabled = 0; + + rv_detach_trace_probe("wwnr", sched_switch, handle_switch); + rv_detach_trace_probe("wwnr", sched_wakeup, handle_wakeup); + + da_monitor_destroy_wwnr(); +} + +static struct rv_monitor rv_wwnr = { + .name = "wwnr", + .description = "wakeup while not running per-task testing model.", + .enable = enable_wwnr, + .disable = disable_wwnr, + .reset = da_monitor_reset_all_wwnr, + .enabled = 0, +}; + +static int __init register_wwnr(void) +{ + rv_register_monitor(&rv_wwnr); + return 0; +} + +static void __exit unregister_wwnr(void) +{ + rv_unregister_monitor(&rv_wwnr); +} + +module_init(register_wwnr); +module_exit(unregister_wwnr); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Daniel Bristot de Oliveira <bristot@kernel.org>"); +MODULE_DESCRIPTION("wwnr: wakeup while not running monitor"); diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.h b/kernel/trace/rv/monitors/wwnr/wwnr.h new file mode 100644 index 0000000000..d0d9c4b812 --- /dev/null +++ b/kernel/trace/rv/monitors/wwnr/wwnr.h @@ -0,0 +1,46 @@ +/* + * Automatically generated C representation of wwnr automaton + * For further information about this format, see kernel documentation: + * Documentation/trace/rv/deterministic_automata.rst + */ + +enum states_wwnr { + not_running_wwnr = 0, + running_wwnr, + state_max_wwnr +}; + +#define INVALID_STATE state_max_wwnr + +enum events_wwnr { + switch_in_wwnr = 0, + switch_out_wwnr, + wakeup_wwnr, + event_max_wwnr +}; + +struct automaton_wwnr { + char 
*state_names[state_max_wwnr]; + char *event_names[event_max_wwnr]; + unsigned char function[state_max_wwnr][event_max_wwnr]; + unsigned char initial_state; + bool final_states[state_max_wwnr]; +}; + +static const struct automaton_wwnr automaton_wwnr = { + .state_names = { + "not_running", + "running" + }, + .event_names = { + "switch_in", + "switch_out", + "wakeup" + }, + .function = { + { running_wwnr, INVALID_STATE, not_running_wwnr }, + { INVALID_STATE, not_running_wwnr, INVALID_STATE }, + }, + .initial_state = not_running_wwnr, + .final_states = { 1, 0 }, +}; diff --git a/kernel/trace/rv/reactor_panic.c b/kernel/trace/rv/reactor_panic.c new file mode 100644 index 0000000000..0186ff4cbd --- /dev/null +++ b/kernel/trace/rv/reactor_panic.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2019-2022 Red Hat, Inc. Daniel Bristot de Oliveira <bristot@kernel.org> + * + * Panic RV reactor: + * Prints the exception msg to the kernel message log and panic(). + */ + +#include <linux/ftrace.h> +#include <linux/tracepoint.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> + +static void rv_panic_reaction(char *msg) +{ + panic(msg); +} + +static struct rv_reactor rv_panic = { + .name = "panic", + .description = "panic the system if an exception is found.", + .react = rv_panic_reaction +}; + +static int __init register_react_panic(void) +{ + rv_register_reactor(&rv_panic); + return 0; +} + +static void __exit unregister_react_panic(void) +{ + rv_unregister_reactor(&rv_panic); +} + +module_init(register_react_panic); +module_exit(unregister_react_panic); + +MODULE_AUTHOR("Daniel Bristot de Oliveira"); +MODULE_DESCRIPTION("panic rv reactor: panic if an exception is found."); diff --git a/kernel/trace/rv/reactor_printk.c b/kernel/trace/rv/reactor_printk.c new file mode 100644 index 0000000000..178759dbf8 --- /dev/null +++ b/kernel/trace/rv/reactor_printk.c @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2019-2022 Red Hat, Inc. Daniel Bristot de Oliveira <bristot@kernel.org> + * + * Printk RV reactor: + * Prints the exception msg to the kernel message log. + */ +#include <linux/ftrace.h> +#include <linux/tracepoint.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> + +static void rv_printk_reaction(char *msg) +{ + printk_deferred(msg); +} + +static struct rv_reactor rv_printk = { + .name = "printk", + .description = "prints the exception msg to the kernel message log.", + .react = rv_printk_reaction +}; + +static int __init register_react_printk(void) +{ + rv_register_reactor(&rv_printk); + return 0; +} + +static void __exit unregister_react_printk(void) +{ + rv_unregister_reactor(&rv_printk); +} + +module_init(register_react_printk); +module_exit(unregister_react_printk); + +MODULE_AUTHOR("Daniel Bristot de Oliveira"); +MODULE_DESCRIPTION("printk rv reactor: printk if an exception is hit."); diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c new file mode 100644 index 0000000000..2f68e93fff --- /dev/null +++ b/kernel/trace/rv/rv.c @@ -0,0 +1,797 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2019-2022 Red Hat, Inc. Daniel Bristot de Oliveira <bristot@kernel.org> + * + * This is the online Runtime Verification (RV) interface. 
+ * + * RV is a lightweight (yet rigorous) method that complements classical + * exhaustive verification techniques (such as model checking and + * theorem proving) with a more practical approach to complex systems. + * + * RV works by analyzing the trace of the system's actual execution, + * comparing it against a formal specification of the system behavior. + * RV can give precise information on the runtime behavior of the + * monitored system while enabling the reaction for unexpected + * events, avoiding, for example, the propagation of a failure on + * safety-critical systems. + * + * The development of this interface roots in the development of the + * paper: + * + * De Oliveira, Daniel Bristot; Cucinotta, Tommaso; De Oliveira, Romulo + * Silva. Efficient formal verification for the Linux kernel. In: + * International Conference on Software Engineering and Formal Methods. + * Springer, Cham, 2019. p. 315-332. + * + * And: + * + * De Oliveira, Daniel Bristot, et al. Automata-based formal analysis + * and verification of the real-time Linux kernel. PhD Thesis, 2020. + * + * == Runtime monitor interface == + * + * A monitor is the central part of the runtime verification of a system. + * + * The monitor stands in between the formal specification of the desired + * (or undesired) behavior, and the trace of the actual system. + * + * In Linux terms, the runtime verification monitors are encapsulated + * inside the "RV monitor" abstraction. A RV monitor includes a reference + * model of the system, a set of instances of the monitor (per-cpu monitor, + * per-task monitor, and so on), and the helper functions that glue the + * monitor to the system via trace. Generally, a monitor includes some form + * of trace output as a reaction for event parsing and exceptions, + * as depicted bellow: + * + * Linux +----- RV Monitor ----------------------------------+ Formal + * Realm | | Realm + * +-------------------+ +----------------+ +-----------------+ + * | Linux kernel | | Monitor | | Reference | + * | Tracing | -> | Instance(s) | <- | Model | + * | (instrumentation) | | (verification) | | (specification) | + * +-------------------+ +----------------+ +-----------------+ + * | | | + * | V | + * | +----------+ | + * | | Reaction | | + * | +--+--+--+-+ | + * | | | | | + * | | | +-> trace output ? | + * +------------------------|--|----------------------+ + * | +----> panic ? + * +-------> <user-specified> + * + * This file implements the interface for loading RV monitors, and + * to control the verification session. + * + * == Registering monitors == + * + * The struct rv_monitor defines a set of callback functions to control + * a verification session. For instance, when a given monitor is enabled, + * the "enable" callback function is called to hook the instrumentation + * functions to the kernel trace events. The "disable" function is called + * when disabling the verification session. + * + * A RV monitor is registered via: + * int rv_register_monitor(struct rv_monitor *monitor); + * And unregistered via: + * int rv_unregister_monitor(struct rv_monitor *monitor); + * + * == User interface == + * + * The user interface resembles kernel tracing interface. It presents + * these files: + * + * "available_monitors" + * - List the available monitors, one per line. + * + * For example: + * # cat available_monitors + * wip + * wwnr + * + * "enabled_monitors" + * - Lists the enabled monitors, one per line; + * - Writing to it enables a given monitor; + * - Writing a monitor name with a '!' 
prefix disables it; + * - Truncating the file disables all enabled monitors. + * + * For example: + * # cat enabled_monitors + * # echo wip > enabled_monitors + * # echo wwnr >> enabled_monitors + * # cat enabled_monitors + * wip + * wwnr + * # echo '!wip' >> enabled_monitors + * # cat enabled_monitors + * wwnr + * # echo > enabled_monitors + * # cat enabled_monitors + * # + * + * Note that more than one monitor can be enabled concurrently. + * + * "monitoring_on" + * - It is an on/off general switcher for monitoring. Note + * that it does not disable enabled monitors or detach events, + * but stops the per-entity monitors from monitoring the events + * received from the instrumentation. It resembles the "tracing_on" + * switcher. + * + * "monitors/" + * Each monitor will have its own directory inside "monitors/". There + * the monitor specific files will be presented. + * The "monitors/" directory resembles the "events" directory on + * tracefs. + * + * For example: + * # cd monitors/wip/ + * # ls + * desc enable + * # cat desc + * auto-generated wakeup in preemptive monitor. + * # cat enable + * 0 + * + * For further information, see: + * Documentation/trace/rv/runtime-verification.rst + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/slab.h> + +#ifdef CONFIG_DA_MON_EVENTS +#define CREATE_TRACE_POINTS +#include <trace/events/rv.h> +#endif + +#include "rv.h" + +DEFINE_MUTEX(rv_interface_lock); + +static struct rv_interface rv_root; + +struct dentry *get_monitors_root(void) +{ + return rv_root.monitors_dir; +} + +/* + * Interface for the monitor register. + */ +static LIST_HEAD(rv_monitors_list); + +static int task_monitor_count; +static bool task_monitor_slots[RV_PER_TASK_MONITORS]; + +int rv_get_task_monitor_slot(void) +{ + int i; + + lockdep_assert_held(&rv_interface_lock); + + if (task_monitor_count == RV_PER_TASK_MONITORS) + return -EBUSY; + + task_monitor_count++; + + for (i = 0; i < RV_PER_TASK_MONITORS; i++) { + if (task_monitor_slots[i] == false) { + task_monitor_slots[i] = true; + return i; + } + } + + WARN_ONCE(1, "RV task_monitor_count and slots are out of sync\n"); + + return -EINVAL; +} + +void rv_put_task_monitor_slot(int slot) +{ + lockdep_assert_held(&rv_interface_lock); + + if (slot < 0 || slot >= RV_PER_TASK_MONITORS) { + WARN_ONCE(1, "RV releasing an invalid slot!: %d\n", slot); + return; + } + + WARN_ONCE(!task_monitor_slots[slot], "RV releasing unused task_monitor_slots: %d\n", + slot); + + task_monitor_count--; + task_monitor_slots[slot] = false; +} + +/* + * This section collects the monitor/ files and folders. + */ +static ssize_t monitor_enable_read_data(struct file *filp, char __user *user_buf, size_t count, + loff_t *ppos) +{ + struct rv_monitor_def *mdef = filp->private_data; + const char *buff; + + buff = mdef->monitor->enabled ? "1\n" : "0\n"; + + return simple_read_from_buffer(user_buf, count, ppos, buff, strlen(buff)+1); +} + +/* + * __rv_disable_monitor - disabled an enabled monitor + */ +static int __rv_disable_monitor(struct rv_monitor_def *mdef, bool sync) +{ + lockdep_assert_held(&rv_interface_lock); + + if (mdef->monitor->enabled) { + mdef->monitor->enabled = 0; + mdef->monitor->disable(); + + /* + * Wait for the execution of all events to finish. + * Otherwise, the data used by the monitor could + * be inconsistent. i.e., if the monitor is re-enabled. 
+ */ + if (sync) + tracepoint_synchronize_unregister(); + return 1; + } + return 0; +} + +/** + * rv_disable_monitor - disable a given runtime monitor + * + * Returns 0 on success. + */ +int rv_disable_monitor(struct rv_monitor_def *mdef) +{ + __rv_disable_monitor(mdef, true); + return 0; +} + +/** + * rv_enable_monitor - enable a given runtime monitor + * + * Returns 0 on success, error otherwise. + */ +int rv_enable_monitor(struct rv_monitor_def *mdef) +{ + int retval; + + lockdep_assert_held(&rv_interface_lock); + + if (mdef->monitor->enabled) + return 0; + + retval = mdef->monitor->enable(); + + if (!retval) + mdef->monitor->enabled = 1; + + return retval; +} + +/* + * interface for enabling/disabling a monitor. + */ +static ssize_t monitor_enable_write_data(struct file *filp, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct rv_monitor_def *mdef = filp->private_data; + int retval; + bool val; + + retval = kstrtobool_from_user(user_buf, count, &val); + if (retval) + return retval; + + mutex_lock(&rv_interface_lock); + + if (val) + retval = rv_enable_monitor(mdef); + else + retval = rv_disable_monitor(mdef); + + mutex_unlock(&rv_interface_lock); + + return retval ? : count; +} + +static const struct file_operations interface_enable_fops = { + .open = simple_open, + .llseek = no_llseek, + .write = monitor_enable_write_data, + .read = monitor_enable_read_data, +}; + +/* + * Interface to read monitors description. + */ +static ssize_t monitor_desc_read_data(struct file *filp, char __user *user_buf, size_t count, + loff_t *ppos) +{ + struct rv_monitor_def *mdef = filp->private_data; + char buff[256]; + + memset(buff, 0, sizeof(buff)); + + snprintf(buff, sizeof(buff), "%s\n", mdef->monitor->description); + + return simple_read_from_buffer(user_buf, count, ppos, buff, strlen(buff) + 1); +} + +static const struct file_operations interface_desc_fops = { + .open = simple_open, + .llseek = no_llseek, + .read = monitor_desc_read_data, +}; + +/* + * During the registration of a monitor, this function creates + * the monitor dir, where the specific options of the monitor + * are exposed. + */ +static int create_monitor_dir(struct rv_monitor_def *mdef) +{ + struct dentry *root = get_monitors_root(); + const char *name = mdef->monitor->name; + struct dentry *tmp; + int retval; + + mdef->root_d = rv_create_dir(name, root); + if (!mdef->root_d) + return -ENOMEM; + + tmp = rv_create_file("enable", RV_MODE_WRITE, mdef->root_d, mdef, &interface_enable_fops); + if (!tmp) { + retval = -ENOMEM; + goto out_remove_root; + } + + tmp = rv_create_file("desc", RV_MODE_READ, mdef->root_d, mdef, &interface_desc_fops); + if (!tmp) { + retval = -ENOMEM; + goto out_remove_root; + } + + retval = reactor_populate_monitor(mdef); + if (retval) + goto out_remove_root; + + return 0; + +out_remove_root: + rv_remove(mdef->root_d); + return retval; +} + +/* + * Available/Enable monitor shared seq functions. + */ +static int monitors_show(struct seq_file *m, void *p) +{ + struct rv_monitor_def *mon_def = p; + + seq_printf(m, "%s\n", mon_def->monitor->name); + return 0; +} + +/* + * Used by the seq file operations at the end of a read + * operation. + */ +static void monitors_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&rv_interface_lock); +} + +/* + * Available monitor seq functions. 
+ */ +static void *available_monitors_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&rv_interface_lock); + return seq_list_start(&rv_monitors_list, *pos); +} + +static void *available_monitors_next(struct seq_file *m, void *p, loff_t *pos) +{ + return seq_list_next(p, &rv_monitors_list, pos); +} + +/* + * Enable monitor seq functions. + */ +static void *enabled_monitors_next(struct seq_file *m, void *p, loff_t *pos) +{ + struct rv_monitor_def *m_def = p; + + (*pos)++; + + list_for_each_entry_continue(m_def, &rv_monitors_list, list) { + if (m_def->monitor->enabled) + return m_def; + } + + return NULL; +} + +static void *enabled_monitors_start(struct seq_file *m, loff_t *pos) +{ + struct rv_monitor_def *m_def; + loff_t l; + + mutex_lock(&rv_interface_lock); + + if (list_empty(&rv_monitors_list)) + return NULL; + + m_def = list_entry(&rv_monitors_list, struct rv_monitor_def, list); + + for (l = 0; l <= *pos; ) { + m_def = enabled_monitors_next(m, m_def, &l); + if (!m_def) + break; + } + + return m_def; +} + +/* + * available/enabled monitors seq definition. + */ +static const struct seq_operations available_monitors_seq_ops = { + .start = available_monitors_start, + .next = available_monitors_next, + .stop = monitors_stop, + .show = monitors_show +}; + +static const struct seq_operations enabled_monitors_seq_ops = { + .start = enabled_monitors_start, + .next = enabled_monitors_next, + .stop = monitors_stop, + .show = monitors_show +}; + +/* + * available_monitors interface. + */ +static int available_monitors_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &available_monitors_seq_ops); +}; + +static const struct file_operations available_monitors_ops = { + .open = available_monitors_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release +}; + +/* + * enabled_monitors interface. + */ +static void disable_all_monitors(void) +{ + struct rv_monitor_def *mdef; + int enabled = 0; + + mutex_lock(&rv_interface_lock); + + list_for_each_entry(mdef, &rv_monitors_list, list) + enabled += __rv_disable_monitor(mdef, false); + + if (enabled) { + /* + * Wait for the execution of all events to finish. + * Otherwise, the data used by the monitor could + * be inconsistent. i.e., if the monitor is re-enabled. + */ + tracepoint_synchronize_unregister(); + } + + mutex_unlock(&rv_interface_lock); +} + +static int enabled_monitors_open(struct inode *inode, struct file *file) +{ + if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) + disable_all_monitors(); + + return seq_open(file, &enabled_monitors_seq_ops); +}; + +static ssize_t enabled_monitors_write(struct file *filp, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + char buff[MAX_RV_MONITOR_NAME_SIZE + 2]; + struct rv_monitor_def *mdef; + int retval = -EINVAL; + bool enable = true; + char *ptr; + int len; + + if (count < 1 || count > MAX_RV_MONITOR_NAME_SIZE + 1) + return -EINVAL; + + memset(buff, 0, sizeof(buff)); + + retval = simple_write_to_buffer(buff, sizeof(buff) - 1, ppos, user_buf, count); + if (retval < 0) + return -EFAULT; + + ptr = strim(buff); + + if (ptr[0] == '!') { + enable = false; + ptr++; + } + + len = strlen(ptr); + if (!len) + return count; + + mutex_lock(&rv_interface_lock); + + retval = -EINVAL; + + list_for_each_entry(mdef, &rv_monitors_list, list) { + if (strcmp(ptr, mdef->monitor->name) != 0) + continue; + + /* + * Monitor found! 
+ */ + if (enable) + retval = rv_enable_monitor(mdef); + else + retval = rv_disable_monitor(mdef); + + if (!retval) + retval = count; + + break; + } + + mutex_unlock(&rv_interface_lock); + return retval; +} + +static const struct file_operations enabled_monitors_ops = { + .open = enabled_monitors_open, + .read = seq_read, + .write = enabled_monitors_write, + .llseek = seq_lseek, + .release = seq_release, +}; + +/* + * Monitoring on global switcher! + */ +static bool __read_mostly monitoring_on; + +/** + * rv_monitoring_on - checks if monitoring is on + * + * Returns 1 if on, 0 otherwise. + */ +bool rv_monitoring_on(void) +{ + /* Ensures that concurrent monitors read consistent monitoring_on */ + smp_rmb(); + return READ_ONCE(monitoring_on); +} + +/* + * monitoring_on general switcher. + */ +static ssize_t monitoring_on_read_data(struct file *filp, char __user *user_buf, + size_t count, loff_t *ppos) +{ + const char *buff; + + buff = rv_monitoring_on() ? "1\n" : "0\n"; + + return simple_read_from_buffer(user_buf, count, ppos, buff, strlen(buff) + 1); +} + +static void turn_monitoring_off(void) +{ + WRITE_ONCE(monitoring_on, false); + /* Ensures that concurrent monitors read consistent monitoring_on */ + smp_wmb(); +} + +static void reset_all_monitors(void) +{ + struct rv_monitor_def *mdef; + + list_for_each_entry(mdef, &rv_monitors_list, list) { + if (mdef->monitor->enabled) + mdef->monitor->reset(); + } +} + +static void turn_monitoring_on(void) +{ + WRITE_ONCE(monitoring_on, true); + /* Ensures that concurrent monitors read consistent monitoring_on */ + smp_wmb(); +} + +static void turn_monitoring_on_with_reset(void) +{ + lockdep_assert_held(&rv_interface_lock); + + if (rv_monitoring_on()) + return; + + /* + * Monitors might be out of sync with the system if events were not + * processed because of !rv_monitoring_on(). + * + * Reset all monitors, forcing a re-sync. + */ + reset_all_monitors(); + turn_monitoring_on(); +} + +static ssize_t monitoring_on_write_data(struct file *filp, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + int retval; + bool val; + + retval = kstrtobool_from_user(user_buf, count, &val); + if (retval) + return retval; + + mutex_lock(&rv_interface_lock); + + if (val) + turn_monitoring_on_with_reset(); + else + turn_monitoring_off(); + + /* + * Wait for the execution of all events to finish + * before returning to user-space. + */ + tracepoint_synchronize_unregister(); + + mutex_unlock(&rv_interface_lock); + + return count; +} + +static const struct file_operations monitoring_on_fops = { + .open = simple_open, + .llseek = no_llseek, + .write = monitoring_on_write_data, + .read = monitoring_on_read_data, +}; + +static void destroy_monitor_dir(struct rv_monitor_def *mdef) +{ + reactor_cleanup_monitor(mdef); + rv_remove(mdef->root_d); +} + +/** + * rv_register_monitor - register a rv monitor. + * @monitor: The rv_monitor to be registered. + * + * Returns 0 if successful, error otherwise. 
+ */ +int rv_register_monitor(struct rv_monitor *monitor) +{ + struct rv_monitor_def *r; + int retval = 0; + + if (strlen(monitor->name) >= MAX_RV_MONITOR_NAME_SIZE) { + pr_info("Monitor %s has a name longer than %d\n", monitor->name, + MAX_RV_MONITOR_NAME_SIZE); + return -1; + } + + mutex_lock(&rv_interface_lock); + + list_for_each_entry(r, &rv_monitors_list, list) { + if (strcmp(monitor->name, r->monitor->name) == 0) { + pr_info("Monitor %s is already registered\n", monitor->name); + retval = -1; + goto out_unlock; + } + } + + r = kzalloc(sizeof(struct rv_monitor_def), GFP_KERNEL); + if (!r) { + retval = -ENOMEM; + goto out_unlock; + } + + r->monitor = monitor; + + retval = create_monitor_dir(r); + if (retval) { + kfree(r); + goto out_unlock; + } + + list_add_tail(&r->list, &rv_monitors_list); + +out_unlock: + mutex_unlock(&rv_interface_lock); + return retval; +} + +/** + * rv_unregister_monitor - unregister a rv monitor. + * @monitor: The rv_monitor to be unregistered. + * + * Returns 0 if successful, error otherwise. + */ +int rv_unregister_monitor(struct rv_monitor *monitor) +{ + struct rv_monitor_def *ptr, *next; + + mutex_lock(&rv_interface_lock); + + list_for_each_entry_safe(ptr, next, &rv_monitors_list, list) { + if (strcmp(monitor->name, ptr->monitor->name) == 0) { + rv_disable_monitor(ptr); + list_del(&ptr->list); + destroy_monitor_dir(ptr); + } + } + + mutex_unlock(&rv_interface_lock); + return 0; +} + +int __init rv_init_interface(void) +{ + struct dentry *tmp; + int retval; + + rv_root.root_dir = rv_create_dir("rv", NULL); + if (!rv_root.root_dir) + goto out_err; + + rv_root.monitors_dir = rv_create_dir("monitors", rv_root.root_dir); + if (!rv_root.monitors_dir) + goto out_err; + + tmp = rv_create_file("available_monitors", RV_MODE_READ, rv_root.root_dir, NULL, + &available_monitors_ops); + if (!tmp) + goto out_err; + + tmp = rv_create_file("enabled_monitors", RV_MODE_WRITE, rv_root.root_dir, NULL, + &enabled_monitors_ops); + if (!tmp) + goto out_err; + + tmp = rv_create_file("monitoring_on", RV_MODE_WRITE, rv_root.root_dir, NULL, + &monitoring_on_fops); + if (!tmp) + goto out_err; + retval = init_rv_reactors(rv_root.root_dir); + if (retval) + goto out_err; + + turn_monitoring_on(); + + return 0; + +out_err: + rv_remove(rv_root.root_dir); + printk(KERN_ERR "RV: Error while creating the RV interface\n"); + return 1; +} diff --git a/kernel/trace/rv/rv.h b/kernel/trace/rv/rv.h new file mode 100644 index 0000000000..db6cb0913d --- /dev/null +++ b/kernel/trace/rv/rv.h @@ -0,0 +1,68 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/mutex.h> + +struct rv_interface { + struct dentry *root_dir; + struct dentry *monitors_dir; +}; + +#include "../trace.h" +#include <linux/tracefs.h> +#include <linux/rv.h> + +#define RV_MODE_WRITE TRACE_MODE_WRITE +#define RV_MODE_READ TRACE_MODE_READ + +#define rv_create_dir tracefs_create_dir +#define rv_create_file tracefs_create_file +#define rv_remove tracefs_remove + +#define MAX_RV_MONITOR_NAME_SIZE 32 +#define MAX_RV_REACTOR_NAME_SIZE 32 + +extern struct mutex rv_interface_lock; + +#ifdef CONFIG_RV_REACTORS +struct rv_reactor_def { + struct list_head list; + struct rv_reactor *reactor; + /* protected by the monitor interface lock */ + int counter; +}; +#endif + +struct rv_monitor_def { + struct list_head list; + struct rv_monitor *monitor; + struct dentry *root_d; +#ifdef CONFIG_RV_REACTORS + struct rv_reactor_def *rdef; + bool reacting; +#endif + bool task_monitor; +}; + +struct dentry *get_monitors_root(void); +int 
rv_disable_monitor(struct rv_monitor_def *mdef); +int rv_enable_monitor(struct rv_monitor_def *mdef); + +#ifdef CONFIG_RV_REACTORS +int reactor_populate_monitor(struct rv_monitor_def *mdef); +void reactor_cleanup_monitor(struct rv_monitor_def *mdef); +int init_rv_reactors(struct dentry *root_dir); +#else +static inline int reactor_populate_monitor(struct rv_monitor_def *mdef) +{ + return 0; +} + +static inline void reactor_cleanup_monitor(struct rv_monitor_def *mdef) +{ + return; +} + +static inline int init_rv_reactors(struct dentry *root_dir) +{ + return 0; +} +#endif diff --git a/kernel/trace/rv/rv_reactors.c b/kernel/trace/rv/rv_reactors.c new file mode 100644 index 0000000000..6aae106695 --- /dev/null +++ b/kernel/trace/rv/rv_reactors.c @@ -0,0 +1,510 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2019-2022 Red Hat, Inc. Daniel Bristot de Oliveira <bristot@kernel.org> + * + * Runtime reactor interface. + * + * A runtime monitor can cause a reaction to the detection of an + * exception on the model's execution. By default, the monitors have + * tracing reactions, printing the monitor output via tracepoints. + * But other reactions can be added (on-demand) via this interface. + * + * == Registering reactors == + * + * The struct rv_reactor defines a callback function to be executed + * in case of a model exception happens. The callback function + * receives a message to be (optionally) printed before executing + * the reaction. + * + * A RV reactor is registered via: + * int rv_register_reactor(struct rv_reactor *reactor) + * And unregistered via: + * int rv_unregister_reactor(struct rv_reactor *reactor) + * + * These functions are exported to modules, enabling reactors to be + * dynamically loaded. + * + * == User interface == + * + * The user interface resembles the kernel tracing interface and + * presents these files: + * + * "available_reactors" + * - List the available reactors, one per line. + * + * For example: + * # cat available_reactors + * nop + * panic + * printk + * + * "reacting_on" + * - It is an on/off general switch for reactors, disabling + * all reactions. + * + * "monitors/MONITOR/reactors" + * - List available reactors, with the select reaction for the given + * MONITOR inside []. The default one is the nop (no operation) + * reactor. + * - Writing the name of an reactor enables it to the given + * MONITOR. + * + * For example: + * # cat monitors/wip/reactors + * [nop] + * panic + * printk + * # echo panic > monitors/wip/reactors + * # cat monitors/wip/reactors + * nop + * [panic] + * printk + */ + +#include <linux/slab.h> + +#include "rv.h" + +/* + * Interface for the reactor register. + */ +static LIST_HEAD(rv_reactors_list); + +static struct rv_reactor_def *get_reactor_rdef_by_name(char *name) +{ + struct rv_reactor_def *r; + + list_for_each_entry(r, &rv_reactors_list, list) { + if (strcmp(name, r->reactor->name) == 0) + return r; + } + return NULL; +} + +/* + * Available reactors seq functions. 
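As the header comment notes, reactors can be added on demand from modules via rv_register_reactor()/rv_unregister_reactor(). A hypothetical out-of-tree reactor, sketched only from the struct rv_reactor usage visible in reactor_printk.c and reactor_panic.c (a name, a description and a react(char *msg) callback), could look like the following; the rate-limited warning is just an illustrative reaction, not something in the tree:

// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/printk.h>
#include <linux/rv.h>

/* Example reaction: emit the exception message, but rate-limited. */
static void rv_ratelimit_reaction(char *msg)
{
	pr_warn_ratelimited("%s", msg);
}

static struct rv_reactor rv_ratelimit = {
	.name = "warn_ratelimited",
	.description = "example reactor: rate-limited warning.",
	.react = rv_ratelimit_reaction
};

static int __init register_react_ratelimit(void)
{
	return rv_register_reactor(&rv_ratelimit);
}

static void __exit unregister_react_ratelimit(void)
{
	rv_unregister_reactor(&rv_ratelimit);
}

module_init(register_react_ratelimit);
module_exit(unregister_react_ratelimit);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("example rate-limited rv reactor (illustrative only)");

Once loaded, such a reactor would appear in available_reactors and could be selected per monitor through monitors/MONITOR/reactors, exactly like the printk and panic reactors.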
+ */ +static int reactors_show(struct seq_file *m, void *p) +{ + struct rv_reactor_def *rea_def = p; + + seq_printf(m, "%s\n", rea_def->reactor->name); + return 0; +} + +static void reactors_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&rv_interface_lock); +} + +static void *reactors_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&rv_interface_lock); + return seq_list_start(&rv_reactors_list, *pos); +} + +static void *reactors_next(struct seq_file *m, void *p, loff_t *pos) +{ + return seq_list_next(p, &rv_reactors_list, pos); +} + +/* + * available_reactors seq definition. + */ +static const struct seq_operations available_reactors_seq_ops = { + .start = reactors_start, + .next = reactors_next, + .stop = reactors_stop, + .show = reactors_show +}; + +/* + * available_reactors interface. + */ +static int available_reactors_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &available_reactors_seq_ops); +}; + +static const struct file_operations available_reactors_ops = { + .open = available_reactors_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release +}; + +/* + * Monitor's reactor file. + */ +static int monitor_reactor_show(struct seq_file *m, void *p) +{ + struct rv_monitor_def *mdef = m->private; + struct rv_reactor_def *rdef = p; + + if (mdef->rdef == rdef) + seq_printf(m, "[%s]\n", rdef->reactor->name); + else + seq_printf(m, "%s\n", rdef->reactor->name); + return 0; +} + +/* + * available_reactors seq definition. + */ +static const struct seq_operations monitor_reactors_seq_ops = { + .start = reactors_start, + .next = reactors_next, + .stop = reactors_stop, + .show = monitor_reactor_show +}; + +static void monitor_swap_reactors(struct rv_monitor_def *mdef, struct rv_reactor_def *rdef, + bool reacting) +{ + bool monitor_enabled; + + /* nothing to do */ + if (mdef->rdef == rdef) + return; + + monitor_enabled = mdef->monitor->enabled; + if (monitor_enabled) + rv_disable_monitor(mdef); + + /* swap reactor's usage */ + mdef->rdef->counter--; + rdef->counter++; + + mdef->rdef = rdef; + mdef->reacting = reacting; + mdef->monitor->react = rdef->reactor->react; + + if (monitor_enabled) + rv_enable_monitor(mdef); +} + +static ssize_t +monitor_reactors_write(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + char buff[MAX_RV_REACTOR_NAME_SIZE + 2]; + struct rv_monitor_def *mdef; + struct rv_reactor_def *rdef; + struct seq_file *seq_f; + int retval = -EINVAL; + bool enable; + char *ptr; + int len; + + if (count < 1 || count > MAX_RV_REACTOR_NAME_SIZE + 1) + return -EINVAL; + + memset(buff, 0, sizeof(buff)); + + retval = simple_write_to_buffer(buff, sizeof(buff) - 1, ppos, user_buf, count); + if (retval < 0) + return -EFAULT; + + ptr = strim(buff); + + len = strlen(ptr); + if (!len) + return count; + + /* + * See monitor_reactors_open() + */ + seq_f = file->private_data; + mdef = seq_f->private; + + mutex_lock(&rv_interface_lock); + + retval = -EINVAL; + + list_for_each_entry(rdef, &rv_reactors_list, list) { + if (strcmp(ptr, rdef->reactor->name) != 0) + continue; + + if (rdef == get_reactor_rdef_by_name("nop")) + enable = false; + else + enable = true; + + monitor_swap_reactors(mdef, rdef, enable); + + retval = count; + break; + } + + mutex_unlock(&rv_interface_lock); + + return retval; +} + +/* + * available_reactors interface. 
+ */ +static int monitor_reactors_open(struct inode *inode, struct file *file) +{ + struct rv_monitor_def *mdef = inode->i_private; + struct seq_file *seq_f; + int ret; + + ret = seq_open(file, &monitor_reactors_seq_ops); + if (ret < 0) + return ret; + + /* + * seq_open stores the seq_file on the file->private data. + */ + seq_f = file->private_data; + + /* + * Copy the create file "private" data to the seq_file private data. + */ + seq_f->private = mdef; + + return 0; +}; + +static const struct file_operations monitor_reactors_ops = { + .open = monitor_reactors_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, + .write = monitor_reactors_write +}; + +static int __rv_register_reactor(struct rv_reactor *reactor) +{ + struct rv_reactor_def *r; + + list_for_each_entry(r, &rv_reactors_list, list) { + if (strcmp(reactor->name, r->reactor->name) == 0) { + pr_info("Reactor %s is already registered\n", reactor->name); + return -EINVAL; + } + } + + r = kzalloc(sizeof(struct rv_reactor_def), GFP_KERNEL); + if (!r) + return -ENOMEM; + + r->reactor = reactor; + r->counter = 0; + + list_add_tail(&r->list, &rv_reactors_list); + + return 0; +} + +/** + * rv_register_reactor - register a rv reactor. + * @reactor: The rv_reactor to be registered. + * + * Returns 0 if successful, error otherwise. + */ +int rv_register_reactor(struct rv_reactor *reactor) +{ + int retval = 0; + + if (strlen(reactor->name) >= MAX_RV_REACTOR_NAME_SIZE) { + pr_info("Reactor %s has a name longer than %d\n", + reactor->name, MAX_RV_MONITOR_NAME_SIZE); + return -EINVAL; + } + + mutex_lock(&rv_interface_lock); + retval = __rv_register_reactor(reactor); + mutex_unlock(&rv_interface_lock); + return retval; +} + +/** + * rv_unregister_reactor - unregister a rv reactor. + * @reactor: The rv_reactor to be unregistered. + * + * Returns 0 if successful, error otherwise. + */ +int rv_unregister_reactor(struct rv_reactor *reactor) +{ + struct rv_reactor_def *ptr, *next; + int ret = 0; + + mutex_lock(&rv_interface_lock); + + list_for_each_entry_safe(ptr, next, &rv_reactors_list, list) { + if (strcmp(reactor->name, ptr->reactor->name) == 0) { + + if (!ptr->counter) { + list_del(&ptr->list); + } else { + printk(KERN_WARNING + "rv: the rv_reactor %s is in use by %d monitor(s)\n", + ptr->reactor->name, ptr->counter); + printk(KERN_WARNING "rv: the rv_reactor %s cannot be removed\n", + ptr->reactor->name); + ret = -EBUSY; + break; + } + } + } + + mutex_unlock(&rv_interface_lock); + return ret; +} + +/* + * reacting_on interface. + */ +static bool __read_mostly reacting_on; + +/** + * rv_reacting_on - checks if reacting is on + * + * Returns 1 if on, 0 otherwise. + */ +bool rv_reacting_on(void) +{ + /* Ensures that concurrent monitors read consistent reacting_on */ + smp_rmb(); + return READ_ONCE(reacting_on); +} + +static ssize_t reacting_on_read_data(struct file *filp, + char __user *user_buf, + size_t count, loff_t *ppos) +{ + char *buff; + + buff = rv_reacting_on() ? 
"1\n" : "0\n"; + + return simple_read_from_buffer(user_buf, count, ppos, buff, strlen(buff)+1); +} + +static void turn_reacting_off(void) +{ + WRITE_ONCE(reacting_on, false); + /* Ensures that concurrent monitors read consistent reacting_on */ + smp_wmb(); +} + +static void turn_reacting_on(void) +{ + WRITE_ONCE(reacting_on, true); + /* Ensures that concurrent monitors read consistent reacting_on */ + smp_wmb(); +} + +static ssize_t reacting_on_write_data(struct file *filp, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + int retval; + bool val; + + retval = kstrtobool_from_user(user_buf, count, &val); + if (retval) + return retval; + + mutex_lock(&rv_interface_lock); + + if (val) + turn_reacting_on(); + else + turn_reacting_off(); + + /* + * Wait for the execution of all events to finish + * before returning to user-space. + */ + tracepoint_synchronize_unregister(); + + mutex_unlock(&rv_interface_lock); + + return count; +} + +static const struct file_operations reacting_on_fops = { + .open = simple_open, + .llseek = no_llseek, + .write = reacting_on_write_data, + .read = reacting_on_read_data, +}; + +/** + * reactor_populate_monitor - creates per monitor reactors file + * @mdef: monitor's definition. + * + * Returns 0 if successful, error otherwise. + */ +int reactor_populate_monitor(struct rv_monitor_def *mdef) +{ + struct dentry *tmp; + + tmp = rv_create_file("reactors", RV_MODE_WRITE, mdef->root_d, mdef, &monitor_reactors_ops); + if (!tmp) + return -ENOMEM; + + /* + * Configure as the rv_nop reactor. + */ + mdef->rdef = get_reactor_rdef_by_name("nop"); + mdef->rdef->counter++; + mdef->reacting = false; + + return 0; +} + +/** + * reactor_cleanup_monitor - cleanup a monitor reference + * @mdef: monitor's definition. + */ +void reactor_cleanup_monitor(struct rv_monitor_def *mdef) +{ + lockdep_assert_held(&rv_interface_lock); + mdef->rdef->counter--; + WARN_ON_ONCE(mdef->rdef->counter < 0); +} + +/* + * Nop reactor register + */ +static void rv_nop_reaction(char *msg) +{ +} + +static struct rv_reactor rv_nop = { + .name = "nop", + .description = "no-operation reactor: do nothing.", + .react = rv_nop_reaction +}; + +int init_rv_reactors(struct dentry *root_dir) +{ + struct dentry *available, *reacting; + int retval; + + available = rv_create_file("available_reactors", RV_MODE_READ, root_dir, NULL, + &available_reactors_ops); + if (!available) + goto out_err; + + reacting = rv_create_file("reacting_on", RV_MODE_WRITE, root_dir, NULL, &reacting_on_fops); + if (!reacting) + goto rm_available; + + retval = __rv_register_reactor(&rv_nop); + if (retval) + goto rm_reacting; + + turn_reacting_on(); + + return 0; + +rm_reacting: + rv_remove(reacting); +rm_available: + rv_remove(available); +out_err: + return -ENOMEM; +} diff --git a/kernel/trace/synth_event_gen_test.c b/kernel/trace/synth_event_gen_test.c new file mode 100644 index 0000000000..354c2117be --- /dev/null +++ b/kernel/trace/synth_event_gen_test.c @@ -0,0 +1,536 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Test module for in-kernel synthetic event creation and generation. + * + * Copyright (C) 2019 Tom Zanussi <zanussi@kernel.org> + */ + +#include <linux/module.h> +#include <linux/trace_events.h> + +/* + * This module is a simple test of basic functionality for in-kernel + * synthetic event creation and generation, the first and second tests + * using synth_event_gen_cmd_start() and synth_event_add_field(), the + * third uses synth_event_create() to do it all at once with a static + * field array. 
+ * + * Following that are a few examples using the created events to test + * various ways of tracing a synthetic event. + * + * To test, select CONFIG_SYNTH_EVENT_GEN_TEST and build the module. + * Then: + * + * # insmod kernel/trace/synth_event_gen_test.ko + * # cat /sys/kernel/tracing/trace + * + * You should see several events in the trace buffer - + * "create_synth_test", "empty_synth_test", and several instances of + * "gen_synth_test". + * + * To remove the events, remove the module: + * + * # rmmod synth_event_gen_test + * + */ + +static struct trace_event_file *create_synth_test; +static struct trace_event_file *empty_synth_test; +static struct trace_event_file *gen_synth_test; + +/* + * Test to make sure we can create a synthetic event, then add more + * fields. + */ +static int __init test_gen_synth_cmd(void) +{ + struct dynevent_cmd cmd; + u64 vals[7]; + char *buf; + int ret; + + /* Create a buffer to hold the generated command */ + buf = kzalloc(MAX_DYNEVENT_CMD_LEN, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + /* Before generating the command, initialize the cmd object */ + synth_event_cmd_init(&cmd, buf, MAX_DYNEVENT_CMD_LEN); + + /* + * Create the empty gen_synth_test synthetic event with the + * first 4 fields. + */ + ret = synth_event_gen_cmd_start(&cmd, "gen_synth_test", THIS_MODULE, + "pid_t", "next_pid_field", + "char[16]", "next_comm_field", + "u64", "ts_ns", + "u64", "ts_ms"); + if (ret) + goto free; + + /* Use synth_event_add_field to add the rest of the fields */ + + ret = synth_event_add_field(&cmd, "unsigned int", "cpu"); + if (ret) + goto free; + + ret = synth_event_add_field(&cmd, "char[64]", "my_string_field"); + if (ret) + goto free; + + ret = synth_event_add_field(&cmd, "int", "my_int_field"); + if (ret) + goto free; + + ret = synth_event_gen_cmd_end(&cmd); + if (ret) + goto free; + + /* + * Now get the gen_synth_test event file. We need to prevent + * the instance and event from disappearing from underneath + * us, which trace_get_event_file() does (though in this case + * we're using the top-level instance which never goes away). + */ + gen_synth_test = trace_get_event_file(NULL, "synthetic", + "gen_synth_test"); + if (IS_ERR(gen_synth_test)) { + ret = PTR_ERR(gen_synth_test); + goto delete; + } + + /* Enable the event or you won't see anything */ + ret = trace_array_set_clr_event(gen_synth_test->tr, + "synthetic", "gen_synth_test", true); + if (ret) { + trace_put_event_file(gen_synth_test); + goto delete; + } + + /* Create some bogus values just for testing */ + + vals[0] = 777; /* next_pid_field */ + vals[1] = (u64)(long)"hula hoops"; /* next_comm_field */ + vals[2] = 1000000; /* ts_ns */ + vals[3] = 1000; /* ts_ms */ + vals[4] = raw_smp_processor_id(); /* cpu */ + vals[5] = (u64)(long)"thneed"; /* my_string_field */ + vals[6] = 598; /* my_int_field */ + + /* Now generate a gen_synth_test event */ + ret = synth_event_trace_array(gen_synth_test, vals, ARRAY_SIZE(vals)); + free: + kfree(buf); + return ret; + delete: + /* We got an error after creating the event, delete it */ + synth_event_delete("gen_synth_test"); + goto free; +} + +/* + * Test to make sure we can create an initially empty synthetic event, + * then add all the fields. 
+ */ +static int __init test_empty_synth_event(void) +{ + struct dynevent_cmd cmd; + u64 vals[7]; + char *buf; + int ret; + + /* Create a buffer to hold the generated command */ + buf = kzalloc(MAX_DYNEVENT_CMD_LEN, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + /* Before generating the command, initialize the cmd object */ + synth_event_cmd_init(&cmd, buf, MAX_DYNEVENT_CMD_LEN); + + /* + * Create the empty_synth_test synthetic event with no fields. + */ + ret = synth_event_gen_cmd_start(&cmd, "empty_synth_test", THIS_MODULE); + if (ret) + goto free; + + /* Use synth_event_add_field to add all of the fields */ + + ret = synth_event_add_field(&cmd, "pid_t", "next_pid_field"); + if (ret) + goto free; + + ret = synth_event_add_field(&cmd, "char[16]", "next_comm_field"); + if (ret) + goto free; + + ret = synth_event_add_field(&cmd, "u64", "ts_ns"); + if (ret) + goto free; + + ret = synth_event_add_field(&cmd, "u64", "ts_ms"); + if (ret) + goto free; + + ret = synth_event_add_field(&cmd, "unsigned int", "cpu"); + if (ret) + goto free; + + ret = synth_event_add_field(&cmd, "char[64]", "my_string_field"); + if (ret) + goto free; + + ret = synth_event_add_field(&cmd, "int", "my_int_field"); + if (ret) + goto free; + + /* All fields have been added, close and register the synth event */ + + ret = synth_event_gen_cmd_end(&cmd); + if (ret) + goto free; + + /* + * Now get the empty_synth_test event file. We need to + * prevent the instance and event from disappearing from + * underneath us, which trace_get_event_file() does (though in + * this case we're using the top-level instance which never + * goes away). + */ + empty_synth_test = trace_get_event_file(NULL, "synthetic", + "empty_synth_test"); + if (IS_ERR(empty_synth_test)) { + ret = PTR_ERR(empty_synth_test); + goto delete; + } + + /* Enable the event or you won't see anything */ + ret = trace_array_set_clr_event(empty_synth_test->tr, + "synthetic", "empty_synth_test", true); + if (ret) { + trace_put_event_file(empty_synth_test); + goto delete; + } + + /* Create some bogus values just for testing */ + + vals[0] = 777; /* next_pid_field */ + vals[1] = (u64)(long)"tiddlywinks"; /* next_comm_field */ + vals[2] = 1000000; /* ts_ns */ + vals[3] = 1000; /* ts_ms */ + vals[4] = raw_smp_processor_id(); /* cpu */ + vals[5] = (u64)(long)"thneed_2.0"; /* my_string_field */ + vals[6] = 399; /* my_int_field */ + + /* Now trace an empty_synth_test event */ + ret = synth_event_trace_array(empty_synth_test, vals, ARRAY_SIZE(vals)); + free: + kfree(buf); + return ret; + delete: + /* We got an error after creating the event, delete it */ + synth_event_delete("empty_synth_test"); + goto free; +} + +static struct synth_field_desc create_synth_test_fields[] = { + { .type = "pid_t", .name = "next_pid_field" }, + { .type = "char[16]", .name = "next_comm_field" }, + { .type = "u64", .name = "ts_ns" }, + { .type = "char[]", .name = "dynstring_field_1" }, + { .type = "u64", .name = "ts_ms" }, + { .type = "unsigned int", .name = "cpu" }, + { .type = "char[64]", .name = "my_string_field" }, + { .type = "char[]", .name = "dynstring_field_2" }, + { .type = "int", .name = "my_int_field" }, +}; + +/* + * Test synthetic event creation all at once from array of field + * descriptors. 
+ */ +static int __init test_create_synth_event(void) +{ + u64 vals[9]; + int ret; + + /* Create the create_synth_test event with the fields above */ + ret = synth_event_create("create_synth_test", + create_synth_test_fields, + ARRAY_SIZE(create_synth_test_fields), + THIS_MODULE); + if (ret) + goto out; + + /* + * Now get the create_synth_test event file. We need to + * prevent the instance and event from disappearing from + * underneath us, which trace_get_event_file() does (though in + * this case we're using the top-level instance which never + * goes away). + */ + create_synth_test = trace_get_event_file(NULL, "synthetic", + "create_synth_test"); + if (IS_ERR(create_synth_test)) { + ret = PTR_ERR(create_synth_test); + goto delete; + } + + /* Enable the event or you won't see anything */ + ret = trace_array_set_clr_event(create_synth_test->tr, + "synthetic", "create_synth_test", true); + if (ret) { + trace_put_event_file(create_synth_test); + goto delete; + } + + /* Create some bogus values just for testing */ + + vals[0] = 777; /* next_pid_field */ + vals[1] = (u64)(long)"tiddlywinks"; /* next_comm_field */ + vals[2] = 1000000; /* ts_ns */ + vals[3] = (u64)(long)"xrayspecs"; /* dynstring_field_1 */ + vals[4] = 1000; /* ts_ms */ + vals[5] = raw_smp_processor_id(); /* cpu */ + vals[6] = (u64)(long)"thneed"; /* my_string_field */ + vals[7] = (u64)(long)"kerplunk"; /* dynstring_field_2 */ + vals[8] = 398; /* my_int_field */ + + /* Now generate a create_synth_test event */ + ret = synth_event_trace_array(create_synth_test, vals, ARRAY_SIZE(vals)); + out: + return ret; + delete: + /* We got an error after creating the event, delete it */ + synth_event_delete("create_synth_test"); + + goto out; +} + +/* + * Test tracing a synthetic event by reserving trace buffer space, + * then filling in fields one after another. + */ +static int __init test_add_next_synth_val(void) +{ + struct synth_event_trace_state trace_state; + int ret; + + /* Start by reserving space in the trace buffer */ + ret = synth_event_trace_start(gen_synth_test, &trace_state); + if (ret) + return ret; + + /* Write some bogus values into the trace buffer, one after another */ + + /* next_pid_field */ + ret = synth_event_add_next_val(777, &trace_state); + if (ret) + goto out; + + /* next_comm_field */ + ret = synth_event_add_next_val((u64)(long)"slinky", &trace_state); + if (ret) + goto out; + + /* ts_ns */ + ret = synth_event_add_next_val(1000000, &trace_state); + if (ret) + goto out; + + /* ts_ms */ + ret = synth_event_add_next_val(1000, &trace_state); + if (ret) + goto out; + + /* cpu */ + ret = synth_event_add_next_val(raw_smp_processor_id(), &trace_state); + if (ret) + goto out; + + /* my_string_field */ + ret = synth_event_add_next_val((u64)(long)"thneed_2.01", &trace_state); + if (ret) + goto out; + + /* my_int_field */ + ret = synth_event_add_next_val(395, &trace_state); + out: + /* Finally, commit the event */ + ret = synth_event_trace_end(&trace_state); + + return ret; +} + +/* + * Test tracing a synthetic event by reserving trace buffer space, + * then filling in fields using field names, which can be done in any + * order. 
+ */ +static int __init test_add_synth_val(void) +{ + struct synth_event_trace_state trace_state; + int ret; + + /* Start by reserving space in the trace buffer */ + ret = synth_event_trace_start(gen_synth_test, &trace_state); + if (ret) + return ret; + + /* Write some bogus values into the trace buffer, using field names */ + + ret = synth_event_add_val("ts_ns", 1000000, &trace_state); + if (ret) + goto out; + + ret = synth_event_add_val("ts_ms", 1000, &trace_state); + if (ret) + goto out; + + ret = synth_event_add_val("cpu", raw_smp_processor_id(), &trace_state); + if (ret) + goto out; + + ret = synth_event_add_val("next_pid_field", 777, &trace_state); + if (ret) + goto out; + + ret = synth_event_add_val("next_comm_field", (u64)(long)"silly putty", + &trace_state); + if (ret) + goto out; + + ret = synth_event_add_val("my_string_field", (u64)(long)"thneed_9", + &trace_state); + if (ret) + goto out; + + ret = synth_event_add_val("my_int_field", 3999, &trace_state); + out: + /* Finally, commit the event */ + ret = synth_event_trace_end(&trace_state); + + return ret; +} + +/* + * Test tracing a synthetic event all at once from array of values. + */ +static int __init test_trace_synth_event(void) +{ + int ret; + + /* Trace some bogus values just for testing */ + ret = synth_event_trace(create_synth_test, 9, /* number of values */ + (u64)444, /* next_pid_field */ + (u64)(long)"clackers", /* next_comm_field */ + (u64)1000000, /* ts_ns */ + (u64)(long)"viewmaster",/* dynstring_field_1 */ + (u64)1000, /* ts_ms */ + (u64)raw_smp_processor_id(), /* cpu */ + (u64)(long)"Thneed", /* my_string_field */ + (u64)(long)"yoyos", /* dynstring_field_2 */ + (u64)999); /* my_int_field */ + return ret; +} + +static int __init synth_event_gen_test_init(void) +{ + int ret; + + ret = test_gen_synth_cmd(); + if (ret) + return ret; + + ret = test_empty_synth_event(); + if (ret) { + WARN_ON(trace_array_set_clr_event(gen_synth_test->tr, + "synthetic", + "gen_synth_test", false)); + trace_put_event_file(gen_synth_test); + WARN_ON(synth_event_delete("gen_synth_test")); + goto out; + } + + ret = test_create_synth_event(); + if (ret) { + WARN_ON(trace_array_set_clr_event(gen_synth_test->tr, + "synthetic", + "gen_synth_test", false)); + trace_put_event_file(gen_synth_test); + WARN_ON(synth_event_delete("gen_synth_test")); + + WARN_ON(trace_array_set_clr_event(empty_synth_test->tr, + "synthetic", + "empty_synth_test", false)); + trace_put_event_file(empty_synth_test); + WARN_ON(synth_event_delete("empty_synth_test")); + goto out; + } + + ret = test_add_next_synth_val(); + WARN_ON(ret); + + ret = test_add_synth_val(); + WARN_ON(ret); + + ret = test_trace_synth_event(); + WARN_ON(ret); + + /* Disable when done */ + trace_array_set_clr_event(gen_synth_test->tr, + "synthetic", + "gen_synth_test", false); + trace_array_set_clr_event(empty_synth_test->tr, + "synthetic", + "empty_synth_test", false); + trace_array_set_clr_event(create_synth_test->tr, + "synthetic", + "create_synth_test", false); + out: + return ret; +} + +static void __exit synth_event_gen_test_exit(void) +{ + /* Disable the event or you can't remove it */ + WARN_ON(trace_array_set_clr_event(gen_synth_test->tr, + "synthetic", + "gen_synth_test", false)); + + /* Now give the file and instance back */ + trace_put_event_file(gen_synth_test); + + /* Now unregister and free the synthetic event */ + WARN_ON(synth_event_delete("gen_synth_test")); + + /* Disable the event or you can't remove it */ + WARN_ON(trace_array_set_clr_event(empty_synth_test->tr, + "synthetic", + 
"empty_synth_test", false)); + + /* Now give the file and instance back */ + trace_put_event_file(empty_synth_test); + + /* Now unregister and free the synthetic event */ + WARN_ON(synth_event_delete("empty_synth_test")); + + /* Disable the event or you can't remove it */ + WARN_ON(trace_array_set_clr_event(create_synth_test->tr, + "synthetic", + "create_synth_test", false)); + + /* Now give the file and instance back */ + trace_put_event_file(create_synth_test); + + /* Now unregister and free the synthetic event */ + WARN_ON(synth_event_delete("create_synth_test")); +} + +module_init(synth_event_gen_test_init) +module_exit(synth_event_gen_test_exit) + +MODULE_AUTHOR("Tom Zanussi"); +MODULE_DESCRIPTION("synthetic event generation test"); +MODULE_LICENSE("GPL v2"); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c new file mode 100644 index 0000000000..fc00356a5a --- /dev/null +++ b/kernel/trace/trace.c @@ -0,0 +1,10645 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ring buffer based function tracer + * + * Copyright (C) 2007-2012 Steven Rostedt <srostedt@redhat.com> + * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com> + * + * Originally taken from the RT patch by: + * Arnaldo Carvalho de Melo <acme@redhat.com> + * + * Based on code from the latency_tracer, that is: + * Copyright (C) 2004-2006 Ingo Molnar + * Copyright (C) 2004 Nadia Yvette Chambers + */ +#include <linux/ring_buffer.h> +#include <generated/utsrelease.h> +#include <linux/stacktrace.h> +#include <linux/writeback.h> +#include <linux/kallsyms.h> +#include <linux/security.h> +#include <linux/seq_file.h> +#include <linux/irqflags.h> +#include <linux/debugfs.h> +#include <linux/tracefs.h> +#include <linux/pagemap.h> +#include <linux/hardirq.h> +#include <linux/linkage.h> +#include <linux/uaccess.h> +#include <linux/vmalloc.h> +#include <linux/ftrace.h> +#include <linux/module.h> +#include <linux/percpu.h> +#include <linux/splice.h> +#include <linux/kdebug.h> +#include <linux/string.h> +#include <linux/mount.h> +#include <linux/rwsem.h> +#include <linux/slab.h> +#include <linux/ctype.h> +#include <linux/init.h> +#include <linux/panic_notifier.h> +#include <linux/poll.h> +#include <linux/nmi.h> +#include <linux/fs.h> +#include <linux/trace.h> +#include <linux/sched/clock.h> +#include <linux/sched/rt.h> +#include <linux/fsnotify.h> +#include <linux/irq_work.h> +#include <linux/workqueue.h> + +#include <asm/setup.h> /* COMMAND_LINE_SIZE */ + +#include "trace.h" +#include "trace_output.h" + +/* + * On boot up, the ring buffer is set to the minimum size, so that + * we do not waste memory on systems that are not using tracing. + */ +bool ring_buffer_expanded; + +#ifdef CONFIG_FTRACE_STARTUP_TEST +/* + * We need to change this state when a selftest is running. + * A selftest will lurk into the ring-buffer to count the + * entries inserted during the selftest although some concurrent + * insertions into the ring-buffer such as trace_printk could occurred + * at the same time, giving false positive or negative results. + */ +static bool __read_mostly tracing_selftest_running; + +/* + * If boot-time tracing including tracers/events via kernel cmdline + * is running, we do not want to run SELFTEST. 
+ */ +bool __read_mostly tracing_selftest_disabled; + +void __init disable_tracing_selftest(const char *reason) +{ + if (!tracing_selftest_disabled) { + tracing_selftest_disabled = true; + pr_info("Ftrace startup test is disabled due to %s\n", reason); + } +} +#else +#define tracing_selftest_running 0 +#define tracing_selftest_disabled 0 +#endif + +/* Pipe tracepoints to printk */ +static struct trace_iterator *tracepoint_print_iter; +int tracepoint_printk; +static bool tracepoint_printk_stop_on_boot __initdata; +static DEFINE_STATIC_KEY_FALSE(tracepoint_printk_key); + +/* For tracers that don't implement custom flags */ +static struct tracer_opt dummy_tracer_opt[] = { + { } +}; + +static int +dummy_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) +{ + return 0; +} + +/* + * To prevent the comm cache from being overwritten when no + * tracing is active, only save the comm when a trace event + * occurred. + */ +static DEFINE_PER_CPU(bool, trace_taskinfo_save); + +/* + * Kill all tracing for good (never come back). + * It is initialized to 1 but will turn to zero if the initialization + * of the tracer is successful. But that is the only place that sets + * this back to zero. + */ +static int tracing_disabled = 1; + +cpumask_var_t __read_mostly tracing_buffer_mask; + +/* + * ftrace_dump_on_oops - variable to dump ftrace buffer on oops + * + * If there is an oops (or kernel panic) and the ftrace_dump_on_oops + * is set, then ftrace_dump is called. This will output the contents + * of the ftrace buffers to the console. This is very useful for + * capturing traces that lead to crashes and outputing it to a + * serial console. + * + * It is default off, but you can enable it with either specifying + * "ftrace_dump_on_oops" in the kernel command line, or setting + * /proc/sys/kernel/ftrace_dump_on_oops + * Set 1 if you want to dump buffers of all CPUs + * Set 2 if you want to dump the buffer of the CPU that triggered oops + */ + +enum ftrace_dump_mode ftrace_dump_on_oops; + +/* When set, tracing will stop when a WARN*() is hit */ +int __disable_trace_on_warning; + +#ifdef CONFIG_TRACE_EVAL_MAP_FILE +/* Map of enums to their values, for "eval_map" file */ +struct trace_eval_map_head { + struct module *mod; + unsigned long length; +}; + +union trace_eval_map_item; + +struct trace_eval_map_tail { + /* + * "end" is first and points to NULL as it must be different + * than "mod" or "eval_string" + */ + union trace_eval_map_item *next; + const char *end; /* points to NULL */ +}; + +static DEFINE_MUTEX(trace_eval_mutex); + +/* + * The trace_eval_maps are saved in an array with two extra elements, + * one at the beginning, and one at the end. The beginning item contains + * the count of the saved maps (head.length), and the module they + * belong to if not built in (head.mod). The ending item contains a + * pointer to the next array of saved eval_map items. 
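+ *
+ * For instance, an array holding N saved maps is laid out as:
+ *
+ *   [ head: mod, length = N ][ map 0 ][ map 1 ] ... [ map N-1 ][ tail: next ]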
+ */ +union trace_eval_map_item { + struct trace_eval_map map; + struct trace_eval_map_head head; + struct trace_eval_map_tail tail; +}; + +static union trace_eval_map_item *trace_eval_maps; +#endif /* CONFIG_TRACE_EVAL_MAP_FILE */ + +int tracing_set_tracer(struct trace_array *tr, const char *buf); +static void ftrace_trace_userstack(struct trace_array *tr, + struct trace_buffer *buffer, + unsigned int trace_ctx); + +#define MAX_TRACER_SIZE 100 +static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata; +static char *default_bootup_tracer; + +static bool allocate_snapshot; +static bool snapshot_at_boot; + +static char boot_instance_info[COMMAND_LINE_SIZE] __initdata; +static int boot_instance_index; + +static char boot_snapshot_info[COMMAND_LINE_SIZE] __initdata; +static int boot_snapshot_index; + +static int __init set_cmdline_ftrace(char *str) +{ + strscpy(bootup_tracer_buf, str, MAX_TRACER_SIZE); + default_bootup_tracer = bootup_tracer_buf; + /* We are using ftrace early, expand it */ + ring_buffer_expanded = true; + return 1; +} +__setup("ftrace=", set_cmdline_ftrace); + +static int __init set_ftrace_dump_on_oops(char *str) +{ + if (*str++ != '=' || !*str || !strcmp("1", str)) { + ftrace_dump_on_oops = DUMP_ALL; + return 1; + } + + if (!strcmp("orig_cpu", str) || !strcmp("2", str)) { + ftrace_dump_on_oops = DUMP_ORIG; + return 1; + } + + return 0; +} +__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); + +static int __init stop_trace_on_warning(char *str) +{ + if ((strcmp(str, "=0") != 0 && strcmp(str, "=off") != 0)) + __disable_trace_on_warning = 1; + return 1; +} +__setup("traceoff_on_warning", stop_trace_on_warning); + +static int __init boot_alloc_snapshot(char *str) +{ + char *slot = boot_snapshot_info + boot_snapshot_index; + int left = sizeof(boot_snapshot_info) - boot_snapshot_index; + int ret; + + if (str[0] == '=') { + str++; + if (strlen(str) >= left) + return -1; + + ret = snprintf(slot, left, "%s\t", str); + boot_snapshot_index += ret; + } else { + allocate_snapshot = true; + /* We also need the main ring buffer expanded */ + ring_buffer_expanded = true; + } + return 1; +} +__setup("alloc_snapshot", boot_alloc_snapshot); + + +static int __init boot_snapshot(char *str) +{ + snapshot_at_boot = true; + boot_alloc_snapshot(str); + return 1; +} +__setup("ftrace_boot_snapshot", boot_snapshot); + + +static int __init boot_instance(char *str) +{ + char *slot = boot_instance_info + boot_instance_index; + int left = sizeof(boot_instance_info) - boot_instance_index; + int ret; + + if (strlen(str) >= left) + return -1; + + ret = snprintf(slot, left, "%s\t", str); + boot_instance_index += ret; + + return 1; +} +__setup("trace_instance=", boot_instance); + + +static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata; + +static int __init set_trace_boot_options(char *str) +{ + strscpy(trace_boot_options_buf, str, MAX_TRACER_SIZE); + return 1; +} +__setup("trace_options=", set_trace_boot_options); + +static char trace_boot_clock_buf[MAX_TRACER_SIZE] __initdata; +static char *trace_boot_clock __initdata; + +static int __init set_trace_boot_clock(char *str) +{ + strscpy(trace_boot_clock_buf, str, MAX_TRACER_SIZE); + trace_boot_clock = trace_boot_clock_buf; + return 1; +} +__setup("trace_clock=", set_trace_boot_clock); + +static int __init set_tracepoint_printk(char *str) +{ + /* Ignore the "tp_printk_stop_on_boot" param */ + if (*str == '_') + return 0; + + if ((strcmp(str, "=0") != 0 && strcmp(str, "=off") != 0)) + tracepoint_printk = 1; + return 1; +} +__setup("tp_printk", 
set_tracepoint_printk); + +static int __init set_tracepoint_printk_stop(char *str) +{ + tracepoint_printk_stop_on_boot = true; + return 1; +} +__setup("tp_printk_stop_on_boot", set_tracepoint_printk_stop); + +unsigned long long ns2usecs(u64 nsec) +{ + nsec += 500; + do_div(nsec, 1000); + return nsec; +} + +static void +trace_process_export(struct trace_export *export, + struct ring_buffer_event *event, int flag) +{ + struct trace_entry *entry; + unsigned int size = 0; + + if (export->flags & flag) { + entry = ring_buffer_event_data(event); + size = ring_buffer_event_length(event); + export->write(export, entry, size); + } +} + +static DEFINE_MUTEX(ftrace_export_lock); + +static struct trace_export __rcu *ftrace_exports_list __read_mostly; + +static DEFINE_STATIC_KEY_FALSE(trace_function_exports_enabled); +static DEFINE_STATIC_KEY_FALSE(trace_event_exports_enabled); +static DEFINE_STATIC_KEY_FALSE(trace_marker_exports_enabled); + +static inline void ftrace_exports_enable(struct trace_export *export) +{ + if (export->flags & TRACE_EXPORT_FUNCTION) + static_branch_inc(&trace_function_exports_enabled); + + if (export->flags & TRACE_EXPORT_EVENT) + static_branch_inc(&trace_event_exports_enabled); + + if (export->flags & TRACE_EXPORT_MARKER) + static_branch_inc(&trace_marker_exports_enabled); +} + +static inline void ftrace_exports_disable(struct trace_export *export) +{ + if (export->flags & TRACE_EXPORT_FUNCTION) + static_branch_dec(&trace_function_exports_enabled); + + if (export->flags & TRACE_EXPORT_EVENT) + static_branch_dec(&trace_event_exports_enabled); + + if (export->flags & TRACE_EXPORT_MARKER) + static_branch_dec(&trace_marker_exports_enabled); +} + +static void ftrace_exports(struct ring_buffer_event *event, int flag) +{ + struct trace_export *export; + + preempt_disable_notrace(); + + export = rcu_dereference_raw_check(ftrace_exports_list); + while (export) { + trace_process_export(export, event, flag); + export = rcu_dereference_raw_check(export->next); + } + + preempt_enable_notrace(); +} + +static inline void +add_trace_export(struct trace_export **list, struct trace_export *export) +{ + rcu_assign_pointer(export->next, *list); + /* + * We are entering export into the list but another + * CPU might be walking that list. We need to make sure + * the export->next pointer is valid before another CPU sees + * the export pointer included into the list. 
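+	 * The rcu_assign_pointer() below provides the store-release
+	 * ordering that guarantees this.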
+ */ + rcu_assign_pointer(*list, export); +} + +static inline int +rm_trace_export(struct trace_export **list, struct trace_export *export) +{ + struct trace_export **p; + + for (p = list; *p != NULL; p = &(*p)->next) + if (*p == export) + break; + + if (*p != export) + return -1; + + rcu_assign_pointer(*p, (*p)->next); + + return 0; +} + +static inline void +add_ftrace_export(struct trace_export **list, struct trace_export *export) +{ + ftrace_exports_enable(export); + + add_trace_export(list, export); +} + +static inline int +rm_ftrace_export(struct trace_export **list, struct trace_export *export) +{ + int ret; + + ret = rm_trace_export(list, export); + ftrace_exports_disable(export); + + return ret; +} + +int register_ftrace_export(struct trace_export *export) +{ + if (WARN_ON_ONCE(!export->write)) + return -1; + + mutex_lock(&ftrace_export_lock); + + add_ftrace_export(&ftrace_exports_list, export); + + mutex_unlock(&ftrace_export_lock); + + return 0; +} +EXPORT_SYMBOL_GPL(register_ftrace_export); + +int unregister_ftrace_export(struct trace_export *export) +{ + int ret; + + mutex_lock(&ftrace_export_lock); + + ret = rm_ftrace_export(&ftrace_exports_list, export); + + mutex_unlock(&ftrace_export_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(unregister_ftrace_export); + +/* trace_flags holds trace_options default values */ +#define TRACE_DEFAULT_FLAGS \ + (FUNCTION_DEFAULT_FLAGS | \ + TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | \ + TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | \ + TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | \ + TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | \ + TRACE_ITER_HASH_PTR) + +/* trace_options that are only supported by global_trace */ +#define TOP_LEVEL_TRACE_FLAGS (TRACE_ITER_PRINTK | \ + TRACE_ITER_PRINTK_MSGONLY | TRACE_ITER_RECORD_CMD) + +/* trace_flags that are default zero for instances */ +#define ZEROED_TRACE_FLAGS \ + (TRACE_ITER_EVENT_FORK | TRACE_ITER_FUNC_FORK) + +/* + * The global_trace is the descriptor that holds the top-level tracing + * buffers for the live tracing. + */ +static struct trace_array global_trace = { + .trace_flags = TRACE_DEFAULT_FLAGS, +}; + +LIST_HEAD(ftrace_trace_arrays); + +int trace_array_get(struct trace_array *this_tr) +{ + struct trace_array *tr; + int ret = -ENODEV; + + mutex_lock(&trace_types_lock); + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (tr == this_tr) { + tr->ref++; + ret = 0; + break; + } + } + mutex_unlock(&trace_types_lock); + + return ret; +} + +static void __trace_array_put(struct trace_array *this_tr) +{ + WARN_ON(!this_tr->ref); + this_tr->ref--; +} + +/** + * trace_array_put - Decrement the reference counter for this trace array. + * @this_tr : pointer to the trace array + * + * NOTE: Use this when we no longer need the trace array returned by + * trace_array_get_by_name(). This ensures the trace array can be later + * destroyed. 
+ * + */ +void trace_array_put(struct trace_array *this_tr) +{ + if (!this_tr) + return; + + mutex_lock(&trace_types_lock); + __trace_array_put(this_tr); + mutex_unlock(&trace_types_lock); +} +EXPORT_SYMBOL_GPL(trace_array_put); + +int tracing_check_open_get_tr(struct trace_array *tr) +{ + int ret; + + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + + if (tracing_disabled) + return -ENODEV; + + if (tr && trace_array_get(tr) < 0) + return -ENODEV; + + return 0; +} + +int call_filter_check_discard(struct trace_event_call *call, void *rec, + struct trace_buffer *buffer, + struct ring_buffer_event *event) +{ + if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) && + !filter_match_preds(call->filter, rec)) { + __trace_event_discard_commit(buffer, event); + return 1; + } + + return 0; +} + +/** + * trace_find_filtered_pid - check if a pid exists in a filtered_pid list + * @filtered_pids: The list of pids to check + * @search_pid: The PID to find in @filtered_pids + * + * Returns true if @search_pid is found in @filtered_pids, and false otherwise. + */ +bool +trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid) +{ + return trace_pid_list_is_set(filtered_pids, search_pid); +} + +/** + * trace_ignore_this_task - should a task be ignored for tracing + * @filtered_pids: The list of pids to check + * @filtered_no_pids: The list of pids not to be traced + * @task: The task that should be ignored if not filtered + * + * Checks if @task should be traced or not from @filtered_pids. + * Returns true if @task should *NOT* be traced. + * Returns false if @task should be traced. + */ +bool +trace_ignore_this_task(struct trace_pid_list *filtered_pids, + struct trace_pid_list *filtered_no_pids, + struct task_struct *task) +{ + /* + * If filtered_no_pids is not empty, and the task's pid is listed + * in filtered_no_pids, then return true. + * Otherwise, if filtered_pids is empty, that means we can + * trace all tasks. If it has content, then only trace pids + * within filtered_pids. + */ + + return (filtered_pids && + !trace_find_filtered_pid(filtered_pids, task->pid)) || + (filtered_no_pids && + trace_find_filtered_pid(filtered_no_pids, task->pid)); +} + +/** + * trace_filter_add_remove_task - Add or remove a task from a pid_list + * @pid_list: The list to modify + * @self: The current task for fork or NULL for exit + * @task: The task to add or remove + * + * If adding a task, if @self is defined, the task is only added if @self + * is also included in @pid_list. This happens on fork and tasks should + * only be added when the parent is listed. If @self is NULL, then the + * @task pid will be removed from the list, which would happen on exit + * of a task. + */ +void trace_filter_add_remove_task(struct trace_pid_list *pid_list, + struct task_struct *self, + struct task_struct *task) +{ + if (!pid_list) + return; + + /* For forks, we only add if the forking task is listed */ + if (self) { + if (!trace_find_filtered_pid(pid_list, self->pid)) + return; + } + + /* "self" is set for forks, and NULL for exits */ + if (self) + trace_pid_list_set(pid_list, task->pid); + else + trace_pid_list_clear(pid_list, task->pid); +} + +/** + * trace_pid_next - Used for seq_file to get to the next pid of a pid_list + * @pid_list: The pid list to show + * @v: The last pid that was shown (+1 the actual pid to let zero be displayed) + * @pos: The position of the file + * + * This is used by the seq_file "next" operation to iterate the pids + * listed in a trace_pid_list structure. 
+ * + * Returns the pid+1 as we want to display pid of zero, but NULL would + * stop the iteration. + */ +void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos) +{ + long pid = (unsigned long)v; + unsigned int next; + + (*pos)++; + + /* pid already is +1 of the actual previous bit */ + if (trace_pid_list_next(pid_list, pid, &next) < 0) + return NULL; + + pid = next; + + /* Return pid + 1 to allow zero to be represented */ + return (void *)(pid + 1); +} + +/** + * trace_pid_start - Used for seq_file to start reading pid lists + * @pid_list: The pid list to show + * @pos: The position of the file + * + * This is used by seq_file "start" operation to start the iteration + * of listing pids. + * + * Returns the pid+1 as we want to display pid of zero, but NULL would + * stop the iteration. + */ +void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos) +{ + unsigned long pid; + unsigned int first; + loff_t l = 0; + + if (trace_pid_list_first(pid_list, &first) < 0) + return NULL; + + pid = first; + + /* Return pid + 1 so that zero can be the exit value */ + for (pid++; pid && l < *pos; + pid = (unsigned long)trace_pid_next(pid_list, (void *)pid, &l)) + ; + return (void *)pid; +} + +/** + * trace_pid_show - show the current pid in seq_file processing + * @m: The seq_file structure to write into + * @v: A void pointer of the pid (+1) value to display + * + * Can be directly used by seq_file operations to display the current + * pid value. + */ +int trace_pid_show(struct seq_file *m, void *v) +{ + unsigned long pid = (unsigned long)v - 1; + + seq_printf(m, "%lu\n", pid); + return 0; +} + +/* 128 should be much more than enough */ +#define PID_BUF_SIZE 127 + +int trace_pid_write(struct trace_pid_list *filtered_pids, + struct trace_pid_list **new_pid_list, + const char __user *ubuf, size_t cnt) +{ + struct trace_pid_list *pid_list; + struct trace_parser parser; + unsigned long val; + int nr_pids = 0; + ssize_t read = 0; + ssize_t ret; + loff_t pos; + pid_t pid; + + if (trace_parser_get_init(&parser, PID_BUF_SIZE + 1)) + return -ENOMEM; + + /* + * Always recreate a new array. The write is an all or nothing + * operation. Always create a new array when adding new pids by + * the user. If the operation fails, then the current list is + * not modified. 
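+	 * For example, if any pid in the written string fails to parse,
+	 * the new list is freed and the previously installed list stays
+	 * in effect.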
+ */ + pid_list = trace_pid_list_alloc(); + if (!pid_list) { + trace_parser_put(&parser); + return -ENOMEM; + } + + if (filtered_pids) { + /* copy the current bits to the new max */ + ret = trace_pid_list_first(filtered_pids, &pid); + while (!ret) { + trace_pid_list_set(pid_list, pid); + ret = trace_pid_list_next(filtered_pids, pid + 1, &pid); + nr_pids++; + } + } + + ret = 0; + while (cnt > 0) { + + pos = 0; + + ret = trace_get_user(&parser, ubuf, cnt, &pos); + if (ret < 0) + break; + + read += ret; + ubuf += ret; + cnt -= ret; + + if (!trace_parser_loaded(&parser)) + break; + + ret = -EINVAL; + if (kstrtoul(parser.buffer, 0, &val)) + break; + + pid = (pid_t)val; + + if (trace_pid_list_set(pid_list, pid) < 0) { + ret = -1; + break; + } + nr_pids++; + + trace_parser_clear(&parser); + ret = 0; + } + trace_parser_put(&parser); + + if (ret < 0) { + trace_pid_list_free(pid_list); + return ret; + } + + if (!nr_pids) { + /* Cleared the list of pids */ + trace_pid_list_free(pid_list); + pid_list = NULL; + } + + *new_pid_list = pid_list; + + return read; +} + +static u64 buffer_ftrace_now(struct array_buffer *buf, int cpu) +{ + u64 ts; + + /* Early boot up does not have a buffer yet */ + if (!buf->buffer) + return trace_clock_local(); + + ts = ring_buffer_time_stamp(buf->buffer); + ring_buffer_normalize_time_stamp(buf->buffer, cpu, &ts); + + return ts; +} + +u64 ftrace_now(int cpu) +{ + return buffer_ftrace_now(&global_trace.array_buffer, cpu); +} + +/** + * tracing_is_enabled - Show if global_trace has been enabled + * + * Shows if the global trace has been enabled or not. It uses the + * mirror flag "buffer_disabled" to be used in fast paths such as for + * the irqsoff tracer. But it may be inaccurate due to races. If you + * need to know the accurate state, use tracing_is_on() which is a little + * slower, but accurate. + */ +int tracing_is_enabled(void) +{ + /* + * For quick access (irqsoff uses this in fast path), just + * return the mirror variable of the state of the ring buffer. + * It's a little racy, but we don't really care. + */ + smp_rmb(); + return !global_trace.buffer_disabled; +} + +/* + * trace_buf_size is the size in bytes that is allocated + * for a buffer. Note, the number of bytes is always rounded + * to page size. + * + * This number is purposely set to a low number of 16384. + * If the dump on oops happens, it will be much appreciated + * to not have to wait for all that output. Anyway this can be + * boot time and run time configurable. + */ +#define TRACE_BUF_SIZE_DEFAULT 1441792UL /* 16384 * 88 (sizeof(entry)) */ + +static unsigned long trace_buf_size = TRACE_BUF_SIZE_DEFAULT; + +/* trace_types holds a link list of available tracers. */ +static struct tracer *trace_types __read_mostly; + +/* + * trace_types_lock is used to protect the trace_types list. + */ +DEFINE_MUTEX(trace_types_lock); + +/* + * serialize the access of the ring buffer + * + * ring buffer serializes readers, but it is low level protection. + * The validity of the events (which returns by ring_buffer_peek() ..etc) + * are not protected by ring buffer. + * + * The content of events may become garbage if we allow other process consumes + * these events concurrently: + * A) the page of the consumed events may become a normal page + * (not reader page) in ring buffer, and this page will be rewritten + * by events producer. + * B) The page of the consumed events may become a page for splice_read, + * and this page will be returned to system. 
+ * + * These primitives allow multi process access to different cpu ring buffer + * concurrently. + * + * These primitives don't distinguish read-only and read-consume access. + * Multi read-only access are also serialized. + */ + +#ifdef CONFIG_SMP +static DECLARE_RWSEM(all_cpu_access_lock); +static DEFINE_PER_CPU(struct mutex, cpu_access_lock); + +static inline void trace_access_lock(int cpu) +{ + if (cpu == RING_BUFFER_ALL_CPUS) { + /* gain it for accessing the whole ring buffer. */ + down_write(&all_cpu_access_lock); + } else { + /* gain it for accessing a cpu ring buffer. */ + + /* Firstly block other trace_access_lock(RING_BUFFER_ALL_CPUS). */ + down_read(&all_cpu_access_lock); + + /* Secondly block other access to this @cpu ring buffer. */ + mutex_lock(&per_cpu(cpu_access_lock, cpu)); + } +} + +static inline void trace_access_unlock(int cpu) +{ + if (cpu == RING_BUFFER_ALL_CPUS) { + up_write(&all_cpu_access_lock); + } else { + mutex_unlock(&per_cpu(cpu_access_lock, cpu)); + up_read(&all_cpu_access_lock); + } +} + +static inline void trace_access_lock_init(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + mutex_init(&per_cpu(cpu_access_lock, cpu)); +} + +#else + +static DEFINE_MUTEX(access_lock); + +static inline void trace_access_lock(int cpu) +{ + (void)cpu; + mutex_lock(&access_lock); +} + +static inline void trace_access_unlock(int cpu) +{ + (void)cpu; + mutex_unlock(&access_lock); +} + +static inline void trace_access_lock_init(void) +{ +} + +#endif + +#ifdef CONFIG_STACKTRACE +static void __ftrace_trace_stack(struct trace_buffer *buffer, + unsigned int trace_ctx, + int skip, struct pt_regs *regs); +static inline void ftrace_trace_stack(struct trace_array *tr, + struct trace_buffer *buffer, + unsigned int trace_ctx, + int skip, struct pt_regs *regs); + +#else +static inline void __ftrace_trace_stack(struct trace_buffer *buffer, + unsigned int trace_ctx, + int skip, struct pt_regs *regs) +{ +} +static inline void ftrace_trace_stack(struct trace_array *tr, + struct trace_buffer *buffer, + unsigned long trace_ctx, + int skip, struct pt_regs *regs) +{ +} + +#endif + +static __always_inline void +trace_event_setup(struct ring_buffer_event *event, + int type, unsigned int trace_ctx) +{ + struct trace_entry *ent = ring_buffer_event_data(event); + + tracing_generic_entry_update(ent, type, trace_ctx); +} + +static __always_inline struct ring_buffer_event * +__trace_buffer_lock_reserve(struct trace_buffer *buffer, + int type, + unsigned long len, + unsigned int trace_ctx) +{ + struct ring_buffer_event *event; + + event = ring_buffer_lock_reserve(buffer, len); + if (event != NULL) + trace_event_setup(event, type, trace_ctx); + + return event; +} + +void tracer_tracing_on(struct trace_array *tr) +{ + if (tr->array_buffer.buffer) + ring_buffer_record_on(tr->array_buffer.buffer); + /* + * This flag is looked at when buffers haven't been allocated + * yet, or by some tracers (like irqsoff), that just want to + * know if the ring buffer has been disabled, but it can handle + * races of where it gets disabled but we still do a record. + * As the check is in the fast path of the tracers, it is more + * important to be fast than accurate. + */ + tr->buffer_disabled = 0; + /* Make the flag seen by readers */ + smp_wmb(); +} + +/** + * tracing_on - enable tracing buffers + * + * This function enables tracing buffers that may have been + * disabled with tracing_off. 
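+ *
+ * A minimal usage sketch (do_interesting_work() is a hypothetical
+ * helper), bracketing a region so that only it gets recorded:
+ *
+ *	tracing_on();
+ *	do_interesting_work();
+ *	tracing_off();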
+ */ +void tracing_on(void) +{ + tracer_tracing_on(&global_trace); +} +EXPORT_SYMBOL_GPL(tracing_on); + + +static __always_inline void +__buffer_unlock_commit(struct trace_buffer *buffer, struct ring_buffer_event *event) +{ + __this_cpu_write(trace_taskinfo_save, true); + + /* If this is the temp buffer, we need to commit fully */ + if (this_cpu_read(trace_buffered_event) == event) { + /* Length is in event->array[0] */ + ring_buffer_write(buffer, event->array[0], &event->array[1]); + /* Release the temp buffer */ + this_cpu_dec(trace_buffered_event_cnt); + /* ring_buffer_unlock_commit() enables preemption */ + preempt_enable_notrace(); + } else + ring_buffer_unlock_commit(buffer); +} + +int __trace_array_puts(struct trace_array *tr, unsigned long ip, + const char *str, int size) +{ + struct ring_buffer_event *event; + struct trace_buffer *buffer; + struct print_entry *entry; + unsigned int trace_ctx; + int alloc; + + if (!(tr->trace_flags & TRACE_ITER_PRINTK)) + return 0; + + if (unlikely(tracing_selftest_running && tr == &global_trace)) + return 0; + + if (unlikely(tracing_disabled)) + return 0; + + alloc = sizeof(*entry) + size + 2; /* possible \n added */ + + trace_ctx = tracing_gen_ctx(); + buffer = tr->array_buffer.buffer; + ring_buffer_nest_start(buffer); + event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc, + trace_ctx); + if (!event) { + size = 0; + goto out; + } + + entry = ring_buffer_event_data(event); + entry->ip = ip; + + memcpy(&entry->buf, str, size); + + /* Add a newline if necessary */ + if (entry->buf[size - 1] != '\n') { + entry->buf[size] = '\n'; + entry->buf[size + 1] = '\0'; + } else + entry->buf[size] = '\0'; + + __buffer_unlock_commit(buffer, event); + ftrace_trace_stack(tr, buffer, trace_ctx, 4, NULL); + out: + ring_buffer_nest_end(buffer); + return size; +} +EXPORT_SYMBOL_GPL(__trace_array_puts); + +/** + * __trace_puts - write a constant string into the trace buffer. + * @ip: The address of the caller + * @str: The constant string to write + * @size: The size of the string. 
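+ *
+ * Typically reached via the trace_puts() macro rather than called
+ * directly; an illustrative use:
+ *
+ *	trace_puts("hit the slow path\n");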
+ */ +int __trace_puts(unsigned long ip, const char *str, int size) +{ + return __trace_array_puts(&global_trace, ip, str, size); +} +EXPORT_SYMBOL_GPL(__trace_puts); + +/** + * __trace_bputs - write the pointer to a constant string into trace buffer + * @ip: The address of the caller + * @str: The constant string to write to the buffer to + */ +int __trace_bputs(unsigned long ip, const char *str) +{ + struct ring_buffer_event *event; + struct trace_buffer *buffer; + struct bputs_entry *entry; + unsigned int trace_ctx; + int size = sizeof(struct bputs_entry); + int ret = 0; + + if (!(global_trace.trace_flags & TRACE_ITER_PRINTK)) + return 0; + + if (unlikely(tracing_selftest_running || tracing_disabled)) + return 0; + + trace_ctx = tracing_gen_ctx(); + buffer = global_trace.array_buffer.buffer; + + ring_buffer_nest_start(buffer); + event = __trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size, + trace_ctx); + if (!event) + goto out; + + entry = ring_buffer_event_data(event); + entry->ip = ip; + entry->str = str; + + __buffer_unlock_commit(buffer, event); + ftrace_trace_stack(&global_trace, buffer, trace_ctx, 4, NULL); + + ret = 1; + out: + ring_buffer_nest_end(buffer); + return ret; +} +EXPORT_SYMBOL_GPL(__trace_bputs); + +#ifdef CONFIG_TRACER_SNAPSHOT +static void tracing_snapshot_instance_cond(struct trace_array *tr, + void *cond_data) +{ + struct tracer *tracer = tr->current_trace; + unsigned long flags; + + if (in_nmi()) { + trace_array_puts(tr, "*** SNAPSHOT CALLED FROM NMI CONTEXT ***\n"); + trace_array_puts(tr, "*** snapshot is being ignored ***\n"); + return; + } + + if (!tr->allocated_snapshot) { + trace_array_puts(tr, "*** SNAPSHOT NOT ALLOCATED ***\n"); + trace_array_puts(tr, "*** stopping trace here! ***\n"); + tracer_tracing_off(tr); + return; + } + + /* Note, snapshot can not be used when the tracer uses it */ + if (tracer->use_max_tr) { + trace_array_puts(tr, "*** LATENCY TRACER ACTIVE ***\n"); + trace_array_puts(tr, "*** Can not use snapshot (sorry) ***\n"); + return; + } + + local_irq_save(flags); + update_max_tr(tr, current, smp_processor_id(), cond_data); + local_irq_restore(flags); +} + +void tracing_snapshot_instance(struct trace_array *tr) +{ + tracing_snapshot_instance_cond(tr, NULL); +} + +/** + * tracing_snapshot - take a snapshot of the current buffer. + * + * This causes a swap between the snapshot buffer and the current live + * tracing buffer. You can use this to take snapshots of the live + * trace when some condition is triggered, but continue to trace. + * + * Note, make sure to allocate the snapshot with either + * a tracing_snapshot_alloc(), or by doing it manually + * with: echo 1 > /sys/kernel/tracing/snapshot + * + * If the snapshot buffer is not allocated, it will stop tracing. + * Basically making a permanent snapshot. + */ +void tracing_snapshot(void) +{ + struct trace_array *tr = &global_trace; + + tracing_snapshot_instance(tr); +} +EXPORT_SYMBOL_GPL(tracing_snapshot); + +/** + * tracing_snapshot_cond - conditionally take a snapshot of the current buffer. 
+ * @tr: The tracing instance to snapshot + * @cond_data: The data to be tested conditionally, and possibly saved + * + * This is the same as tracing_snapshot() except that the snapshot is + * conditional - the snapshot will only happen if the + * cond_snapshot.update() implementation receiving the cond_data + * returns true, which means that the trace array's cond_snapshot + * update() operation used the cond_data to determine whether the + * snapshot should be taken, and if it was, presumably saved it along + * with the snapshot. + */ +void tracing_snapshot_cond(struct trace_array *tr, void *cond_data) +{ + tracing_snapshot_instance_cond(tr, cond_data); +} +EXPORT_SYMBOL_GPL(tracing_snapshot_cond); + +/** + * tracing_cond_snapshot_data - get the user data associated with a snapshot + * @tr: The tracing instance + * + * When the user enables a conditional snapshot using + * tracing_snapshot_cond_enable(), the user-defined cond_data is saved + * with the snapshot. This accessor is used to retrieve it. + * + * Should not be called from cond_snapshot.update(), since it takes + * the tr->max_lock lock, which the code calling + * cond_snapshot.update() has already done. + * + * Returns the cond_data associated with the trace array's snapshot. + */ +void *tracing_cond_snapshot_data(struct trace_array *tr) +{ + void *cond_data = NULL; + + local_irq_disable(); + arch_spin_lock(&tr->max_lock); + + if (tr->cond_snapshot) + cond_data = tr->cond_snapshot->cond_data; + + arch_spin_unlock(&tr->max_lock); + local_irq_enable(); + + return cond_data; +} +EXPORT_SYMBOL_GPL(tracing_cond_snapshot_data); + +static int resize_buffer_duplicate_size(struct array_buffer *trace_buf, + struct array_buffer *size_buf, int cpu_id); +static void set_buffer_entries(struct array_buffer *buf, unsigned long val); + +int tracing_alloc_snapshot_instance(struct trace_array *tr) +{ + int ret; + + if (!tr->allocated_snapshot) { + + /* allocate spare buffer */ + ret = resize_buffer_duplicate_size(&tr->max_buffer, + &tr->array_buffer, RING_BUFFER_ALL_CPUS); + if (ret < 0) + return ret; + + tr->allocated_snapshot = true; + } + + return 0; +} + +static void free_snapshot(struct trace_array *tr) +{ + /* + * We don't free the ring buffer. instead, resize it because + * The max_tr ring buffer has some state (e.g. ring->clock) and + * we want preserve it. + */ + ring_buffer_resize(tr->max_buffer.buffer, 1, RING_BUFFER_ALL_CPUS); + set_buffer_entries(&tr->max_buffer, 1); + tracing_reset_online_cpus(&tr->max_buffer); + tr->allocated_snapshot = false; +} + +/** + * tracing_alloc_snapshot - allocate snapshot buffer. + * + * This only allocates the snapshot buffer if it isn't already + * allocated - it doesn't also take a snapshot. + * + * This is meant to be used in cases where the snapshot buffer needs + * to be set up for events that can't sleep but need to be able to + * trigger a snapshot. + */ +int tracing_alloc_snapshot(void) +{ + struct trace_array *tr = &global_trace; + int ret; + + ret = tracing_alloc_snapshot_instance(tr); + WARN_ON(ret < 0); + + return ret; +} +EXPORT_SYMBOL_GPL(tracing_alloc_snapshot); + +/** + * tracing_snapshot_alloc - allocate and take a snapshot of the current buffer. + * + * This is similar to tracing_snapshot(), but it will allocate the + * snapshot buffer if it isn't already allocated. Use this only + * where it is safe to sleep, as the allocation may sleep. + * + * This causes a swap between the snapshot buffer and the current live + * tracing buffer. 
You can use this to take snapshots of the live + * trace when some condition is triggered, but continue to trace. + */ +void tracing_snapshot_alloc(void) +{ + int ret; + + ret = tracing_alloc_snapshot(); + if (ret < 0) + return; + + tracing_snapshot(); +} +EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); + +/** + * tracing_snapshot_cond_enable - enable conditional snapshot for an instance + * @tr: The tracing instance + * @cond_data: User data to associate with the snapshot + * @update: Implementation of the cond_snapshot update function + * + * Check whether the conditional snapshot for the given instance has + * already been enabled, or if the current tracer is already using a + * snapshot; if so, return -EBUSY, else create a cond_snapshot and + * save the cond_data and update function inside. + * + * Returns 0 if successful, error otherwise. + */ +int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, + cond_update_fn_t update) +{ + struct cond_snapshot *cond_snapshot; + int ret = 0; + + cond_snapshot = kzalloc(sizeof(*cond_snapshot), GFP_KERNEL); + if (!cond_snapshot) + return -ENOMEM; + + cond_snapshot->cond_data = cond_data; + cond_snapshot->update = update; + + mutex_lock(&trace_types_lock); + + ret = tracing_alloc_snapshot_instance(tr); + if (ret) + goto fail_unlock; + + if (tr->current_trace->use_max_tr) { + ret = -EBUSY; + goto fail_unlock; + } + + /* + * The cond_snapshot can only change to NULL without the + * trace_types_lock. We don't care if we race with it going + * to NULL, but we want to make sure that it's not set to + * something other than NULL when we get here, which we can + * do safely with only holding the trace_types_lock and not + * having to take the max_lock. + */ + if (tr->cond_snapshot) { + ret = -EBUSY; + goto fail_unlock; + } + + local_irq_disable(); + arch_spin_lock(&tr->max_lock); + tr->cond_snapshot = cond_snapshot; + arch_spin_unlock(&tr->max_lock); + local_irq_enable(); + + mutex_unlock(&trace_types_lock); + + return ret; + + fail_unlock: + mutex_unlock(&trace_types_lock); + kfree(cond_snapshot); + return ret; +} +EXPORT_SYMBOL_GPL(tracing_snapshot_cond_enable); + +/** + * tracing_snapshot_cond_disable - disable conditional snapshot for an instance + * @tr: The tracing instance + * + * Check whether the conditional snapshot for the given instance is + * enabled; if so, free the cond_snapshot associated with it, + * otherwise return -EINVAL. + * + * Returns 0 if successful, error otherwise. 
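+ *
+ * Illustrative pairing with tracing_snapshot_cond_enable() (my_data and
+ * my_update below are hypothetical):
+ *
+ *	tracing_snapshot_cond_enable(tr, my_data, my_update);
+ *	...
+ *	tracing_snapshot_cond_disable(tr);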
+ */ +int tracing_snapshot_cond_disable(struct trace_array *tr) +{ + int ret = 0; + + local_irq_disable(); + arch_spin_lock(&tr->max_lock); + + if (!tr->cond_snapshot) + ret = -EINVAL; + else { + kfree(tr->cond_snapshot); + tr->cond_snapshot = NULL; + } + + arch_spin_unlock(&tr->max_lock); + local_irq_enable(); + + return ret; +} +EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable); +#else +void tracing_snapshot(void) +{ + WARN_ONCE(1, "Snapshot feature not enabled, but internal snapshot used"); +} +EXPORT_SYMBOL_GPL(tracing_snapshot); +void tracing_snapshot_cond(struct trace_array *tr, void *cond_data) +{ + WARN_ONCE(1, "Snapshot feature not enabled, but internal conditional snapshot used"); +} +EXPORT_SYMBOL_GPL(tracing_snapshot_cond); +int tracing_alloc_snapshot(void) +{ + WARN_ONCE(1, "Snapshot feature not enabled, but snapshot allocation used"); + return -ENODEV; +} +EXPORT_SYMBOL_GPL(tracing_alloc_snapshot); +void tracing_snapshot_alloc(void) +{ + /* Give warning */ + tracing_snapshot(); +} +EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); +void *tracing_cond_snapshot_data(struct trace_array *tr) +{ + return NULL; +} +EXPORT_SYMBOL_GPL(tracing_cond_snapshot_data); +int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, cond_update_fn_t update) +{ + return -ENODEV; +} +EXPORT_SYMBOL_GPL(tracing_snapshot_cond_enable); +int tracing_snapshot_cond_disable(struct trace_array *tr) +{ + return false; +} +EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable); +#define free_snapshot(tr) do { } while (0) +#endif /* CONFIG_TRACER_SNAPSHOT */ + +void tracer_tracing_off(struct trace_array *tr) +{ + if (tr->array_buffer.buffer) + ring_buffer_record_off(tr->array_buffer.buffer); + /* + * This flag is looked at when buffers haven't been allocated + * yet, or by some tracers (like irqsoff), that just want to + * know if the ring buffer has been disabled, but it can handle + * races of where it gets disabled but we still do a record. + * As the check is in the fast path of the tracers, it is more + * important to be fast than accurate. + */ + tr->buffer_disabled = 1; + /* Make the flag seen by readers */ + smp_wmb(); +} + +/** + * tracing_off - turn off tracing buffers + * + * This function stops the tracing buffers from recording data. + * It does not disable any overhead the tracers themselves may + * be causing. This function simply causes all recording to + * the ring buffers to fail. + */ +void tracing_off(void) +{ + tracer_tracing_off(&global_trace); +} +EXPORT_SYMBOL_GPL(tracing_off); + +void disable_trace_on_warning(void) +{ + if (__disable_trace_on_warning) { + trace_array_printk_buf(global_trace.array_buffer.buffer, _THIS_IP_, + "Disabling tracing due to warning\n"); + tracing_off(); + } +} + +/** + * tracer_tracing_is_on - show real state of ring buffer enabled + * @tr : the trace array to know if ring buffer is enabled + * + * Shows real state of the ring buffer if it is enabled or not. + */ +bool tracer_tracing_is_on(struct trace_array *tr) +{ + if (tr->array_buffer.buffer) + return ring_buffer_record_is_on(tr->array_buffer.buffer); + return !tr->buffer_disabled; +} + +/** + * tracing_is_on - show state of ring buffers enabled + */ +int tracing_is_on(void) +{ + return tracer_tracing_is_on(&global_trace); +} +EXPORT_SYMBOL_GPL(tracing_is_on); + +static int __init set_buf_size(char *str) +{ + unsigned long buf_size; + + if (!str) + return 0; + buf_size = memparse(str, &str); + /* + * nr_entries can not be zero and the startup + * tests require some buffer space. 
Therefore + * ensure we have at least 4096 bytes of buffer. + */ + trace_buf_size = max(4096UL, buf_size); + return 1; +} +__setup("trace_buf_size=", set_buf_size); + +static int __init set_tracing_thresh(char *str) +{ + unsigned long threshold; + int ret; + + if (!str) + return 0; + ret = kstrtoul(str, 0, &threshold); + if (ret < 0) + return 0; + tracing_thresh = threshold * 1000; + return 1; +} +__setup("tracing_thresh=", set_tracing_thresh); + +unsigned long nsecs_to_usecs(unsigned long nsecs) +{ + return nsecs / 1000; +} + +/* + * TRACE_FLAGS is defined as a tuple matching bit masks with strings. + * It uses C(a, b) where 'a' is the eval (enum) name and 'b' is the string that + * matches it. By defining "C(a, b) b", TRACE_FLAGS becomes a list + * of strings in the order that the evals (enum) were defined. + */ +#undef C +#define C(a, b) b + +/* These must match the bit positions in trace_iterator_flags */ +static const char *trace_options[] = { + TRACE_FLAGS + NULL +}; + +static struct { + u64 (*func)(void); + const char *name; + int in_ns; /* is this clock in nanoseconds? */ +} trace_clocks[] = { + { trace_clock_local, "local", 1 }, + { trace_clock_global, "global", 1 }, + { trace_clock_counter, "counter", 0 }, + { trace_clock_jiffies, "uptime", 0 }, + { trace_clock, "perf", 1 }, + { ktime_get_mono_fast_ns, "mono", 1 }, + { ktime_get_raw_fast_ns, "mono_raw", 1 }, + { ktime_get_boot_fast_ns, "boot", 1 }, + { ktime_get_tai_fast_ns, "tai", 1 }, + ARCH_TRACE_CLOCKS +}; + +bool trace_clock_in_ns(struct trace_array *tr) +{ + if (trace_clocks[tr->clock_id].in_ns) + return true; + + return false; +} + +/* + * trace_parser_get_init - gets the buffer for trace parser + */ +int trace_parser_get_init(struct trace_parser *parser, int size) +{ + memset(parser, 0, sizeof(*parser)); + + parser->buffer = kmalloc(size, GFP_KERNEL); + if (!parser->buffer) + return 1; + + parser->size = size; + return 0; +} + +/* + * trace_parser_put - frees the buffer for trace parser + */ +void trace_parser_put(struct trace_parser *parser) +{ + kfree(parser->buffer); + parser->buffer = NULL; +} + +/* + * trace_get_user - reads the user input string separated by space + * (matched by isspace(ch)) + * + * For each string found the 'struct trace_parser' is updated, + * and the function returns. + * + * Returns number of bytes read. + * + * See kernel/trace/trace.h for 'struct trace_parser' details. + */ +int trace_get_user(struct trace_parser *parser, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char ch; + size_t read = 0; + ssize_t ret; + + if (!*ppos) + trace_parser_clear(parser); + + ret = get_user(ch, ubuf++); + if (ret) + goto out; + + read++; + cnt--; + + /* + * The parser is not finished with the last write, + * continue reading the user input without skipping spaces. + */ + if (!parser->cont) { + /* skip white space */ + while (cnt && isspace(ch)) { + ret = get_user(ch, ubuf++); + if (ret) + goto out; + read++; + cnt--; + } + + parser->idx = 0; + + /* only spaces were written */ + if (isspace(ch) || !ch) { + *ppos += read; + ret = read; + goto out; + } + } + + /* read the non-space input */ + while (cnt && !isspace(ch) && ch) { + if (parser->idx < parser->size - 1) + parser->buffer[parser->idx++] = ch; + else { + ret = -EINVAL; + goto out; + } + ret = get_user(ch, ubuf++); + if (ret) + goto out; + read++; + cnt--; + } + + /* We either got finished input or we have to wait for another call. 
*/ + if (isspace(ch) || !ch) { + parser->buffer[parser->idx] = 0; + parser->cont = false; + } else if (parser->idx < parser->size - 1) { + parser->cont = true; + parser->buffer[parser->idx++] = ch; + /* Make sure the parsed string always terminates with '\0'. */ + parser->buffer[parser->idx] = 0; + } else { + ret = -EINVAL; + goto out; + } + + *ppos += read; + ret = read; + +out: + return ret; +} + +/* TODO add a seq_buf_to_buffer() */ +static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) +{ + int len; + + if (trace_seq_used(s) <= s->seq.readpos) + return -EBUSY; + + len = trace_seq_used(s) - s->seq.readpos; + if (cnt > len) + cnt = len; + memcpy(buf, s->buffer + s->seq.readpos, cnt); + + s->seq.readpos += cnt; + return cnt; +} + +unsigned long __read_mostly tracing_thresh; + +#ifdef CONFIG_TRACER_MAX_TRACE +static const struct file_operations tracing_max_lat_fops; + +#ifdef LATENCY_FS_NOTIFY + +static struct workqueue_struct *fsnotify_wq; + +static void latency_fsnotify_workfn(struct work_struct *work) +{ + struct trace_array *tr = container_of(work, struct trace_array, + fsnotify_work); + fsnotify_inode(tr->d_max_latency->d_inode, FS_MODIFY); +} + +static void latency_fsnotify_workfn_irq(struct irq_work *iwork) +{ + struct trace_array *tr = container_of(iwork, struct trace_array, + fsnotify_irqwork); + queue_work(fsnotify_wq, &tr->fsnotify_work); +} + +static void trace_create_maxlat_file(struct trace_array *tr, + struct dentry *d_tracer) +{ + INIT_WORK(&tr->fsnotify_work, latency_fsnotify_workfn); + init_irq_work(&tr->fsnotify_irqwork, latency_fsnotify_workfn_irq); + tr->d_max_latency = trace_create_file("tracing_max_latency", + TRACE_MODE_WRITE, + d_tracer, tr, + &tracing_max_lat_fops); +} + +__init static int latency_fsnotify_init(void) +{ + fsnotify_wq = alloc_workqueue("tr_max_lat_wq", + WQ_UNBOUND | WQ_HIGHPRI, 0); + if (!fsnotify_wq) { + pr_err("Unable to allocate tr_max_lat_wq\n"); + return -ENOMEM; + } + return 0; +} + +late_initcall_sync(latency_fsnotify_init); + +void latency_fsnotify(struct trace_array *tr) +{ + if (!fsnotify_wq) + return; + /* + * We cannot call queue_work(&tr->fsnotify_work) from here because it's + * possible that we are called from __schedule() or do_idle(), which + * could cause a deadlock. + */ + irq_work_queue(&tr->fsnotify_irqwork); +} + +#else /* !LATENCY_FS_NOTIFY */ + +#define trace_create_maxlat_file(tr, d_tracer) \ + trace_create_file("tracing_max_latency", TRACE_MODE_WRITE, \ + d_tracer, tr, &tracing_max_lat_fops) + +#endif + +/* + * Copy the new maximum trace into the separate maximum-trace + * structure. (this way the maximum trace is permanently saved, + * for later retrieval via /sys/kernel/tracing/tracing_max_latency) + */ +static void +__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) +{ + struct array_buffer *trace_buf = &tr->array_buffer; + struct array_buffer *max_buf = &tr->max_buffer; + struct trace_array_cpu *data = per_cpu_ptr(trace_buf->data, cpu); + struct trace_array_cpu *max_data = per_cpu_ptr(max_buf->data, cpu); + + max_buf->cpu = cpu; + max_buf->time_start = data->preempt_timestamp; + + max_data->saved_latency = tr->max_latency; + max_data->critical_start = data->critical_start; + max_data->critical_end = data->critical_end; + + strncpy(max_data->comm, tsk->comm, TASK_COMM_LEN); + max_data->pid = tsk->pid; + /* + * If tsk == current, then use current_uid(), as that does not use + * RCU. The irq tracer can be called out of RCU scope. 
+ */ + if (tsk == current) + max_data->uid = current_uid(); + else + max_data->uid = task_uid(tsk); + + max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO; + max_data->policy = tsk->policy; + max_data->rt_priority = tsk->rt_priority; + + /* record this tasks comm */ + tracing_record_cmdline(tsk); + latency_fsnotify(tr); +} + +/** + * update_max_tr - snapshot all trace buffers from global_trace to max_tr + * @tr: tracer + * @tsk: the task with the latency + * @cpu: The cpu that initiated the trace. + * @cond_data: User data associated with a conditional snapshot + * + * Flip the buffers between the @tr and the max_tr and record information + * about which task was the cause of this latency. + */ +void +update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu, + void *cond_data) +{ + if (tr->stop_count) + return; + + WARN_ON_ONCE(!irqs_disabled()); + + if (!tr->allocated_snapshot) { + /* Only the nop tracer should hit this when disabling */ + WARN_ON_ONCE(tr->current_trace != &nop_trace); + return; + } + + arch_spin_lock(&tr->max_lock); + + /* Inherit the recordable setting from array_buffer */ + if (ring_buffer_record_is_set_on(tr->array_buffer.buffer)) + ring_buffer_record_on(tr->max_buffer.buffer); + else + ring_buffer_record_off(tr->max_buffer.buffer); + +#ifdef CONFIG_TRACER_SNAPSHOT + if (tr->cond_snapshot && !tr->cond_snapshot->update(tr, cond_data)) { + arch_spin_unlock(&tr->max_lock); + return; + } +#endif + swap(tr->array_buffer.buffer, tr->max_buffer.buffer); + + __update_max_tr(tr, tsk, cpu); + + arch_spin_unlock(&tr->max_lock); + + /* Any waiters on the old snapshot buffer need to wake up */ + ring_buffer_wake_waiters(tr->array_buffer.buffer, RING_BUFFER_ALL_CPUS); +} + +/** + * update_max_tr_single - only copy one trace over, and reset the rest + * @tr: tracer + * @tsk: task with the latency + * @cpu: the cpu of the buffer to copy. + * + * Flip the trace of a single CPU buffer between the @tr and the max_tr. + */ +void +update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) +{ + int ret; + + if (tr->stop_count) + return; + + WARN_ON_ONCE(!irqs_disabled()); + if (!tr->allocated_snapshot) { + /* Only the nop tracer should hit this when disabling */ + WARN_ON_ONCE(tr->current_trace != &nop_trace); + return; + } + + arch_spin_lock(&tr->max_lock); + + ret = ring_buffer_swap_cpu(tr->max_buffer.buffer, tr->array_buffer.buffer, cpu); + + if (ret == -EBUSY) { + /* + * We failed to swap the buffer due to a commit taking + * place on this CPU. We fail to record, but we reset + * the max trace buffer (no one writes directly to it) + * and flag that it failed. + * Another reason is resize is in progress. + */ + trace_array_printk_buf(tr->max_buffer.buffer, _THIS_IP_, + "Failed to swap buffers due to commit or resize in progress\n"); + } + + WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY); + + __update_max_tr(tr, tsk, cpu); + arch_spin_unlock(&tr->max_lock); +} + +#endif /* CONFIG_TRACER_MAX_TRACE */ + +static int wait_on_pipe(struct trace_iterator *iter, int full) +{ + int ret; + + /* Iterators are static, they should be filled or empty */ + if (trace_buffer_iter(iter, iter->cpu_file)) + return 0; + + ret = ring_buffer_wait(iter->array_buffer->buffer, iter->cpu_file, full); + +#ifdef CONFIG_TRACER_MAX_TRACE + /* + * Make sure this is still the snapshot buffer, as if a snapshot were + * to happen, this would now be the main buffer. 
+ */ + if (iter->snapshot) + iter->array_buffer = &iter->tr->max_buffer; +#endif + return ret; +} + +#ifdef CONFIG_FTRACE_STARTUP_TEST +static bool selftests_can_run; + +struct trace_selftests { + struct list_head list; + struct tracer *type; +}; + +static LIST_HEAD(postponed_selftests); + +static int save_selftest(struct tracer *type) +{ + struct trace_selftests *selftest; + + selftest = kmalloc(sizeof(*selftest), GFP_KERNEL); + if (!selftest) + return -ENOMEM; + + selftest->type = type; + list_add(&selftest->list, &postponed_selftests); + return 0; +} + +static int run_tracer_selftest(struct tracer *type) +{ + struct trace_array *tr = &global_trace; + struct tracer *saved_tracer = tr->current_trace; + int ret; + + if (!type->selftest || tracing_selftest_disabled) + return 0; + + /* + * If a tracer registers early in boot up (before scheduling is + * initialized and such), then do not run its selftests yet. + * Instead, run it a little later in the boot process. + */ + if (!selftests_can_run) + return save_selftest(type); + + if (!tracing_is_on()) { + pr_warn("Selftest for tracer %s skipped due to tracing disabled\n", + type->name); + return 0; + } + + /* + * Run a selftest on this tracer. + * Here we reset the trace buffer, and set the current + * tracer to be this tracer. The tracer can then run some + * internal tracing to verify that everything is in order. + * If we fail, we do not register this tracer. + */ + tracing_reset_online_cpus(&tr->array_buffer); + + tr->current_trace = type; + +#ifdef CONFIG_TRACER_MAX_TRACE + if (type->use_max_tr) { + /* If we expanded the buffers, make sure the max is expanded too */ + if (ring_buffer_expanded) + ring_buffer_resize(tr->max_buffer.buffer, trace_buf_size, + RING_BUFFER_ALL_CPUS); + tr->allocated_snapshot = true; + } +#endif + + /* the test is responsible for initializing and enabling */ + pr_info("Testing tracer %s: ", type->name); + ret = type->selftest(type, tr); + /* the test is responsible for resetting too */ + tr->current_trace = saved_tracer; + if (ret) { + printk(KERN_CONT "FAILED!\n"); + /* Add the warning after printing 'FAILED' */ + WARN_ON(1); + return -1; + } + /* Only reset on passing, to avoid touching corrupted buffers */ + tracing_reset_online_cpus(&tr->array_buffer); + +#ifdef CONFIG_TRACER_MAX_TRACE + if (type->use_max_tr) { + tr->allocated_snapshot = false; + + /* Shrink the max buffer again */ + if (ring_buffer_expanded) + ring_buffer_resize(tr->max_buffer.buffer, 1, + RING_BUFFER_ALL_CPUS); + } +#endif + + printk(KERN_CONT "PASSED\n"); + return 0; +} + +static int do_run_tracer_selftest(struct tracer *type) +{ + int ret; + + /* + * Tests can take a long time, especially if they are run one after the + * other, as does happen during bootup when all the tracers are + * registered. This could cause the soft lockup watchdog to trigger. + */ + cond_resched(); + + tracing_selftest_running = true; + ret = run_tracer_selftest(type); + tracing_selftest_running = false; + + return ret; +} + +static __init int init_trace_selftests(void) +{ + struct trace_selftests *p, *n; + struct tracer *t, **last; + int ret; + + selftests_can_run = true; + + mutex_lock(&trace_types_lock); + + if (list_empty(&postponed_selftests)) + goto out; + + pr_info("Running postponed tracer tests:\n"); + + tracing_selftest_running = true; + list_for_each_entry_safe(p, n, &postponed_selftests, list) { + /* This loop can take minutes when sanitizers are enabled, so + * lets make sure we allow RCU processing. 
+ */ + cond_resched(); + ret = run_tracer_selftest(p->type); + /* If the test fails, then warn and remove from available_tracers */ + if (ret < 0) { + WARN(1, "tracer: %s failed selftest, disabling\n", + p->type->name); + last = &trace_types; + for (t = trace_types; t; t = t->next) { + if (t == p->type) { + *last = t->next; + break; + } + last = &t->next; + } + } + list_del(&p->list); + kfree(p); + } + tracing_selftest_running = false; + + out: + mutex_unlock(&trace_types_lock); + + return 0; +} +core_initcall(init_trace_selftests); +#else +static inline int run_tracer_selftest(struct tracer *type) +{ + return 0; +} +static inline int do_run_tracer_selftest(struct tracer *type) +{ + return 0; +} +#endif /* CONFIG_FTRACE_STARTUP_TEST */ + +static void add_tracer_options(struct trace_array *tr, struct tracer *t); + +static void __init apply_trace_boot_options(void); + +/** + * register_tracer - register a tracer with the ftrace system. + * @type: the plugin for the tracer + * + * Register a new plugin tracer. + */ +int __init register_tracer(struct tracer *type) +{ + struct tracer *t; + int ret = 0; + + if (!type->name) { + pr_info("Tracer must have a name\n"); + return -1; + } + + if (strlen(type->name) >= MAX_TRACER_SIZE) { + pr_info("Tracer has a name longer than %d\n", MAX_TRACER_SIZE); + return -1; + } + + if (security_locked_down(LOCKDOWN_TRACEFS)) { + pr_warn("Can not register tracer %s due to lockdown\n", + type->name); + return -EPERM; + } + + mutex_lock(&trace_types_lock); + + for (t = trace_types; t; t = t->next) { + if (strcmp(type->name, t->name) == 0) { + /* already found */ + pr_info("Tracer %s already registered\n", + type->name); + ret = -1; + goto out; + } + } + + if (!type->set_flag) + type->set_flag = &dummy_set_flag; + if (!type->flags) { + /*allocate a dummy tracer_flags*/ + type->flags = kmalloc(sizeof(*type->flags), GFP_KERNEL); + if (!type->flags) { + ret = -ENOMEM; + goto out; + } + type->flags->val = 0; + type->flags->opts = dummy_tracer_opt; + } else + if (!type->flags->opts) + type->flags->opts = dummy_tracer_opt; + + /* store the tracer for __set_tracer_option */ + type->flags->trace = type; + + ret = do_run_tracer_selftest(type); + if (ret < 0) + goto out; + + type->next = trace_types; + trace_types = type; + add_tracer_options(&global_trace, type); + + out: + mutex_unlock(&trace_types_lock); + + if (ret || !default_bootup_tracer) + goto out_unlock; + + if (strncmp(default_bootup_tracer, type->name, MAX_TRACER_SIZE)) + goto out_unlock; + + printk(KERN_INFO "Starting tracer '%s'\n", type->name); + /* Do we want this tracer to start on bootup? */ + tracing_set_tracer(&global_trace, type->name); + default_bootup_tracer = NULL; + + apply_trace_boot_options(); + + /* disable other selftests, since this will break it. 
*/ + disable_tracing_selftest("running a tracer"); + + out_unlock: + return ret; +} + +static void tracing_reset_cpu(struct array_buffer *buf, int cpu) +{ + struct trace_buffer *buffer = buf->buffer; + + if (!buffer) + return; + + ring_buffer_record_disable(buffer); + + /* Make sure all commits have finished */ + synchronize_rcu(); + ring_buffer_reset_cpu(buffer, cpu); + + ring_buffer_record_enable(buffer); +} + +void tracing_reset_online_cpus(struct array_buffer *buf) +{ + struct trace_buffer *buffer = buf->buffer; + + if (!buffer) + return; + + ring_buffer_record_disable(buffer); + + /* Make sure all commits have finished */ + synchronize_rcu(); + + buf->time_start = buffer_ftrace_now(buf, buf->cpu); + + ring_buffer_reset_online_cpus(buffer); + + ring_buffer_record_enable(buffer); +} + +/* Must have trace_types_lock held */ +void tracing_reset_all_online_cpus_unlocked(void) +{ + struct trace_array *tr; + + lockdep_assert_held(&trace_types_lock); + + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (!tr->clear_trace) + continue; + tr->clear_trace = false; + tracing_reset_online_cpus(&tr->array_buffer); +#ifdef CONFIG_TRACER_MAX_TRACE + tracing_reset_online_cpus(&tr->max_buffer); +#endif + } +} + +void tracing_reset_all_online_cpus(void) +{ + mutex_lock(&trace_types_lock); + tracing_reset_all_online_cpus_unlocked(); + mutex_unlock(&trace_types_lock); +} + +/* + * The tgid_map array maps from pid to tgid; i.e. the value stored at index i + * is the tgid last observed corresponding to pid=i. + */ +static int *tgid_map; + +/* The maximum valid index into tgid_map. */ +static size_t tgid_map_max; + +#define SAVED_CMDLINES_DEFAULT 128 +#define NO_CMDLINE_MAP UINT_MAX +/* + * Preemption must be disabled before acquiring trace_cmdline_lock. + * The various trace_arrays' max_lock must be acquired in a context + * where interrupt is disabled. 
+ */ +static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED; +struct saved_cmdlines_buffer { + unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; + unsigned *map_cmdline_to_pid; + unsigned cmdline_num; + int cmdline_idx; + char *saved_cmdlines; +}; +static struct saved_cmdlines_buffer *savedcmd; + +static inline char *get_saved_cmdlines(int idx) +{ + return &savedcmd->saved_cmdlines[idx * TASK_COMM_LEN]; +} + +static inline void set_cmdline(int idx, const char *cmdline) +{ + strncpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN); +} + +static int allocate_cmdlines_buffer(unsigned int val, + struct saved_cmdlines_buffer *s) +{ + s->map_cmdline_to_pid = kmalloc_array(val, + sizeof(*s->map_cmdline_to_pid), + GFP_KERNEL); + if (!s->map_cmdline_to_pid) + return -ENOMEM; + + s->saved_cmdlines = kmalloc_array(TASK_COMM_LEN, val, GFP_KERNEL); + if (!s->saved_cmdlines) { + kfree(s->map_cmdline_to_pid); + return -ENOMEM; + } + + s->cmdline_idx = 0; + s->cmdline_num = val; + memset(&s->map_pid_to_cmdline, NO_CMDLINE_MAP, + sizeof(s->map_pid_to_cmdline)); + memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP, + val * sizeof(*s->map_cmdline_to_pid)); + + return 0; +} + +static int trace_create_savedcmd(void) +{ + int ret; + + savedcmd = kmalloc(sizeof(*savedcmd), GFP_KERNEL); + if (!savedcmd) + return -ENOMEM; + + ret = allocate_cmdlines_buffer(SAVED_CMDLINES_DEFAULT, savedcmd); + if (ret < 0) { + kfree(savedcmd); + savedcmd = NULL; + return -ENOMEM; + } + + return 0; +} + +int is_tracing_stopped(void) +{ + return global_trace.stop_count; +} + +static void tracing_start_tr(struct trace_array *tr) +{ + struct trace_buffer *buffer; + unsigned long flags; + + if (tracing_disabled) + return; + + raw_spin_lock_irqsave(&tr->start_lock, flags); + if (--tr->stop_count) { + if (WARN_ON_ONCE(tr->stop_count < 0)) { + /* Someone screwed up their debugging */ + tr->stop_count = 0; + } + goto out; + } + + /* Prevent the buffers from switching */ + arch_spin_lock(&tr->max_lock); + + buffer = tr->array_buffer.buffer; + if (buffer) + ring_buffer_record_enable(buffer); + +#ifdef CONFIG_TRACER_MAX_TRACE + buffer = tr->max_buffer.buffer; + if (buffer) + ring_buffer_record_enable(buffer); +#endif + + arch_spin_unlock(&tr->max_lock); + + out: + raw_spin_unlock_irqrestore(&tr->start_lock, flags); +} + +/** + * tracing_start - quick start of the tracer + * + * If tracing is enabled but was stopped by tracing_stop, + * this will start the tracer back up. + */ +void tracing_start(void) + +{ + return tracing_start_tr(&global_trace); +} + +static void tracing_stop_tr(struct trace_array *tr) +{ + struct trace_buffer *buffer; + unsigned long flags; + + raw_spin_lock_irqsave(&tr->start_lock, flags); + if (tr->stop_count++) + goto out; + + /* Prevent the buffers from switching */ + arch_spin_lock(&tr->max_lock); + + buffer = tr->array_buffer.buffer; + if (buffer) + ring_buffer_record_disable(buffer); + +#ifdef CONFIG_TRACER_MAX_TRACE + buffer = tr->max_buffer.buffer; + if (buffer) + ring_buffer_record_disable(buffer); +#endif + + arch_spin_unlock(&tr->max_lock); + + out: + raw_spin_unlock_irqrestore(&tr->start_lock, flags); +} + +/** + * tracing_stop - quick stop of the tracer + * + * Light weight way to stop tracing. Use in conjunction with + * tracing_start. 
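+ * + * For example, to freeze the ring buffers around a section of interest: + * + *     tracing_stop(); + *     ... read or dump the trace buffers ... + *     tracing_start();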
+ */ +void tracing_stop(void) +{ + return tracing_stop_tr(&global_trace); +} + +static int trace_save_cmdline(struct task_struct *tsk) +{ + unsigned tpid, idx; + + /* treat recording of idle task as a success */ + if (!tsk->pid) + return 1; + + tpid = tsk->pid & (PID_MAX_DEFAULT - 1); + + /* + * It's not the end of the world if we don't get + * the lock, but we also don't want to spin + * nor do we want to disable interrupts, + * so if we miss here, then better luck next time. + * + * This is called within the scheduler and wake up, so interrupts + * had better been disabled and run queue lock been held. + */ + lockdep_assert_preemption_disabled(); + if (!arch_spin_trylock(&trace_cmdline_lock)) + return 0; + + idx = savedcmd->map_pid_to_cmdline[tpid]; + if (idx == NO_CMDLINE_MAP) { + idx = (savedcmd->cmdline_idx + 1) % savedcmd->cmdline_num; + + savedcmd->map_pid_to_cmdline[tpid] = idx; + savedcmd->cmdline_idx = idx; + } + + savedcmd->map_cmdline_to_pid[idx] = tsk->pid; + set_cmdline(idx, tsk->comm); + + arch_spin_unlock(&trace_cmdline_lock); + + return 1; +} + +static void __trace_find_cmdline(int pid, char comm[]) +{ + unsigned map; + int tpid; + + if (!pid) { + strcpy(comm, "<idle>"); + return; + } + + if (WARN_ON_ONCE(pid < 0)) { + strcpy(comm, "<XXX>"); + return; + } + + tpid = pid & (PID_MAX_DEFAULT - 1); + map = savedcmd->map_pid_to_cmdline[tpid]; + if (map != NO_CMDLINE_MAP) { + tpid = savedcmd->map_cmdline_to_pid[map]; + if (tpid == pid) { + strscpy(comm, get_saved_cmdlines(map), TASK_COMM_LEN); + return; + } + } + strcpy(comm, "<...>"); +} + +void trace_find_cmdline(int pid, char comm[]) +{ + preempt_disable(); + arch_spin_lock(&trace_cmdline_lock); + + __trace_find_cmdline(pid, comm); + + arch_spin_unlock(&trace_cmdline_lock); + preempt_enable(); +} + +static int *trace_find_tgid_ptr(int pid) +{ + /* + * Pairs with the smp_store_release in set_tracer_flag() to ensure that + * if we observe a non-NULL tgid_map then we also observe the correct + * tgid_map_max. + */ + int *map = smp_load_acquire(&tgid_map); + + if (unlikely(!map || pid > tgid_map_max)) + return NULL; + + return &map[pid]; +} + +int trace_find_tgid(int pid) +{ + int *ptr = trace_find_tgid_ptr(pid); + + return ptr ? *ptr : 0; +} + +static int trace_save_tgid(struct task_struct *tsk) +{ + int *ptr; + + /* treat recording of idle task as a success */ + if (!tsk->pid) + return 1; + + ptr = trace_find_tgid_ptr(tsk->pid); + if (!ptr) + return 0; + + *ptr = tsk->tgid; + return 1; +} + +static bool tracing_record_taskinfo_skip(int flags) +{ + if (unlikely(!(flags & (TRACE_RECORD_CMDLINE | TRACE_RECORD_TGID)))) + return true; + if (!__this_cpu_read(trace_taskinfo_save)) + return true; + return false; +} + +/** + * tracing_record_taskinfo - record the task info of a task + * + * @task: task to record + * @flags: TRACE_RECORD_CMDLINE for recording comm + * TRACE_RECORD_TGID for recording tgid + */ +void tracing_record_taskinfo(struct task_struct *task, int flags) +{ + bool done; + + if (tracing_record_taskinfo_skip(flags)) + return; + + /* + * Record as much task information as possible. If some fail, continue + * to try to record the others. + */ + done = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(task); + done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(task); + + /* If recording any information failed, retry again soon. 
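+ * Returning without clearing trace_taskinfo_save leaves the per-CPU + * flag set, so the next call on this CPU will attempt the recording again.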
*/ + if (!done) + return; + + __this_cpu_write(trace_taskinfo_save, false); +} + +/** + * tracing_record_taskinfo_sched_switch - record task info for sched_switch + * + * @prev: previous task during sched_switch + * @next: next task during sched_switch + * @flags: TRACE_RECORD_CMDLINE for recording comm + * TRACE_RECORD_TGID for recording tgid + */ +void tracing_record_taskinfo_sched_switch(struct task_struct *prev, + struct task_struct *next, int flags) +{ + bool done; + + if (tracing_record_taskinfo_skip(flags)) + return; + + /* + * Record as much task information as possible. If some fail, continue + * to try to record the others. + */ + done = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(prev); + done &= !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(next); + done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(prev); + done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(next); + + /* If recording any information failed, retry again soon. */ + if (!done) + return; + + __this_cpu_write(trace_taskinfo_save, false); +} + +/* Helpers to record a specific task information */ +void tracing_record_cmdline(struct task_struct *task) +{ + tracing_record_taskinfo(task, TRACE_RECORD_CMDLINE); +} + +void tracing_record_tgid(struct task_struct *task) +{ + tracing_record_taskinfo(task, TRACE_RECORD_TGID); +} + +/* + * Several functions return TRACE_TYPE_PARTIAL_LINE if the trace_seq + * overflowed, and TRACE_TYPE_HANDLED otherwise. This helper function + * simplifies those functions and keeps them in sync. + */ +enum print_line_t trace_handle_return(struct trace_seq *s) +{ + return trace_seq_has_overflowed(s) ? + TRACE_TYPE_PARTIAL_LINE : TRACE_TYPE_HANDLED; +} +EXPORT_SYMBOL_GPL(trace_handle_return); + +static unsigned short migration_disable_value(void) +{ +#if defined(CONFIG_SMP) + return current->migration_disabled; +#else + return 0; +#endif +} + +unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status) +{ + unsigned int trace_flags = irqs_status; + unsigned int pc; + + pc = preempt_count(); + + if (pc & NMI_MASK) + trace_flags |= TRACE_FLAG_NMI; + if (pc & HARDIRQ_MASK) + trace_flags |= TRACE_FLAG_HARDIRQ; + if (in_serving_softirq()) + trace_flags |= TRACE_FLAG_SOFTIRQ; + if (softirq_count() >> (SOFTIRQ_SHIFT + 1)) + trace_flags |= TRACE_FLAG_BH_OFF; + + if (tif_need_resched()) + trace_flags |= TRACE_FLAG_NEED_RESCHED; + if (test_preempt_need_resched()) + trace_flags |= TRACE_FLAG_PREEMPT_RESCHED; + return (trace_flags << 16) | (min_t(unsigned int, pc & 0xff, 0xf)) | + (min_t(unsigned int, migration_disable_value(), 0xf)) << 4; +} + +struct ring_buffer_event * +trace_buffer_lock_reserve(struct trace_buffer *buffer, + int type, + unsigned long len, + unsigned int trace_ctx) +{ + return __trace_buffer_lock_reserve(buffer, type, len, trace_ctx); +} + +DEFINE_PER_CPU(struct ring_buffer_event *, trace_buffered_event); +DEFINE_PER_CPU(int, trace_buffered_event_cnt); +static int trace_buffered_event_ref; + +/** + * trace_buffered_event_enable - enable buffering events + * + * When events are being filtered, it is quicker to use a temporary + * buffer to write the event data into if there's a likely chance + * that it will not be committed. The discard of the ring buffer + * is not as fast as committing, and is much slower than copying + * a commit. 
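+ * + * Calls are reference counted and must be made with event_mutex held, + * paired with trace_buffered_event_disable(), roughly: + * + *     mutex_lock(&event_mutex); + *     trace_buffered_event_enable();      (a filter was attached) + *     mutex_unlock(&event_mutex); + *     ... + *     mutex_lock(&event_mutex); + *     trace_buffered_event_disable();     (the filter was removed) + *     mutex_unlock(&event_mutex);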
+ * + * When an event is to be filtered, allocate per cpu buffers to + * write the event data into, and if the event is filtered and discarded + * it is simply dropped, otherwise, the entire data is to be committed + * in one shot. + */ +void trace_buffered_event_enable(void) +{ + struct ring_buffer_event *event; + struct page *page; + int cpu; + + WARN_ON_ONCE(!mutex_is_locked(&event_mutex)); + + if (trace_buffered_event_ref++) + return; + + for_each_tracing_cpu(cpu) { + page = alloc_pages_node(cpu_to_node(cpu), + GFP_KERNEL | __GFP_NORETRY, 0); + /* This is just an optimization and can handle failures */ + if (!page) { + pr_err("Failed to allocate event buffer\n"); + break; + } + + event = page_address(page); + memset(event, 0, sizeof(*event)); + + per_cpu(trace_buffered_event, cpu) = event; + + preempt_disable(); + if (cpu == smp_processor_id() && + __this_cpu_read(trace_buffered_event) != + per_cpu(trace_buffered_event, cpu)) + WARN_ON_ONCE(1); + preempt_enable(); + } +} + +static void enable_trace_buffered_event(void *data) +{ + /* Probably not needed, but do it anyway */ + smp_rmb(); + this_cpu_dec(trace_buffered_event_cnt); +} + +static void disable_trace_buffered_event(void *data) +{ + this_cpu_inc(trace_buffered_event_cnt); +} + +/** + * trace_buffered_event_disable - disable buffering events + * + * When a filter is removed, it is faster to not use the buffered + * events, and to commit directly into the ring buffer. Free up + * the temp buffers when there are no more users. This requires + * special synchronization with current events. + */ +void trace_buffered_event_disable(void) +{ + int cpu; + + WARN_ON_ONCE(!mutex_is_locked(&event_mutex)); + + if (WARN_ON_ONCE(!trace_buffered_event_ref)) + return; + + if (--trace_buffered_event_ref) + return; + + /* For each CPU, set the buffer as used. */ + on_each_cpu_mask(tracing_buffer_mask, disable_trace_buffered_event, + NULL, true); + + /* Wait for all current users to finish */ + synchronize_rcu(); + + for_each_tracing_cpu(cpu) { + free_page((unsigned long)per_cpu(trace_buffered_event, cpu)); + per_cpu(trace_buffered_event, cpu) = NULL; + } + + /* + * Wait for all CPUs that potentially started checking if they can use + * their event buffer only after the previous synchronize_rcu() call and + * they still read a valid pointer from trace_buffered_event. It must be + * ensured they don't see cleared trace_buffered_event_cnt else they + * could wrongly decide to use the pointed-to buffer which is now freed. + */ + synchronize_rcu(); + + /* For each CPU, relinquish the buffer */ + on_each_cpu_mask(tracing_buffer_mask, enable_trace_buffered_event, NULL, + true); +} + +static struct trace_buffer *temp_buffer; + +struct ring_buffer_event * +trace_event_buffer_lock_reserve(struct trace_buffer **current_rb, + struct trace_event_file *trace_file, + int type, unsigned long len, + unsigned int trace_ctx) +{ + struct ring_buffer_event *entry; + struct trace_array *tr = trace_file->tr; + int val; + + *current_rb = tr->array_buffer.buffer; + + if (!tr->no_filter_buffering_ref && + (trace_file->flags & (EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED))) { + preempt_disable_notrace(); + /* + * Filtering is on, so try to use the per cpu buffer first. + * This buffer will simulate a ring_buffer_event, + * where the type_len is zero and the array[0] will + * hold the full length. + * (see include/linux/ring-buffer.h for details on + * how the ring_buffer_event is structured). 
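+ * + * Roughly, the simulated event in the per cpu buffer is laid out as: + * + *     struct ring_buffer_event { + *             u32     type_len:5, time_delta:27;   (type_len == 0) + *             u32     array[];                     (array[0] = data length) + *     }; + * + * with the event payload following array[0].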
+ * + * Using a temp buffer during filtering and copying it + * on a matched filter is quicker than writing directly + * into the ring buffer and then discarding it when + * it doesn't match. That is because the discard + * requires several atomic operations to get right. + * Copying on match and doing nothing on a failed match + * is still quicker than no copy on match, but having + * to discard out of the ring buffer on a failed match. + */ + if ((entry = __this_cpu_read(trace_buffered_event))) { + int max_len = PAGE_SIZE - struct_size(entry, array, 1); + + val = this_cpu_inc_return(trace_buffered_event_cnt); + + /* + * Preemption is disabled, but interrupts and NMIs + * can still come in now. If that happens after + * the above increment, then it will have to go + * back to the old method of allocating the event + * on the ring buffer, and if the filter fails, it + * will have to call ring_buffer_discard_commit() + * to remove it. + * + * Need to also check the unlikely case that the + * length is bigger than the temp buffer size. + * If that happens, then the reserve is pretty much + * guaranteed to fail, as the ring buffer currently + * only allows events less than a page. But that may + * change in the future, so let the ring buffer reserve + * handle the failure in that case. + */ + if (val == 1 && likely(len <= max_len)) { + trace_event_setup(entry, type, trace_ctx); + entry->array[0] = len; + /* Return with preemption disabled */ + return entry; + } + this_cpu_dec(trace_buffered_event_cnt); + } + /* __trace_buffer_lock_reserve() disables preemption */ + preempt_enable_notrace(); + } + + entry = __trace_buffer_lock_reserve(*current_rb, type, len, + trace_ctx); + /* + * If tracing is off, but we have triggers enabled + * we still need to look at the event data. Use the temp_buffer + * to store the trace event for the trigger to use. It's recursive + * safe and will not be recorded anywhere. 
+ */ + if (!entry && trace_file->flags & EVENT_FILE_FL_TRIGGER_COND) { + *current_rb = temp_buffer; + entry = __trace_buffer_lock_reserve(*current_rb, type, len, + trace_ctx); + } + return entry; +} +EXPORT_SYMBOL_GPL(trace_event_buffer_lock_reserve); + +static DEFINE_RAW_SPINLOCK(tracepoint_iter_lock); +static DEFINE_MUTEX(tracepoint_printk_mutex); + +static void output_printk(struct trace_event_buffer *fbuffer) +{ + struct trace_event_call *event_call; + struct trace_event_file *file; + struct trace_event *event; + unsigned long flags; + struct trace_iterator *iter = tracepoint_print_iter; + + /* We should never get here if iter is NULL */ + if (WARN_ON_ONCE(!iter)) + return; + + event_call = fbuffer->trace_file->event_call; + if (!event_call || !event_call->event.funcs || + !event_call->event.funcs->trace) + return; + + file = fbuffer->trace_file; + if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags) || + (unlikely(file->flags & EVENT_FILE_FL_FILTERED) && + !filter_match_preds(file->filter, fbuffer->entry))) + return; + + event = &fbuffer->trace_file->event_call->event; + + raw_spin_lock_irqsave(&tracepoint_iter_lock, flags); + trace_seq_init(&iter->seq); + iter->ent = fbuffer->entry; + event_call->event.funcs->trace(iter, 0, event); + trace_seq_putc(&iter->seq, 0); + printk("%s", iter->seq.buffer); + + raw_spin_unlock_irqrestore(&tracepoint_iter_lock, flags); +} + +int tracepoint_printk_sysctl(struct ctl_table *table, int write, + void *buffer, size_t *lenp, + loff_t *ppos) +{ + int save_tracepoint_printk; + int ret; + + mutex_lock(&tracepoint_printk_mutex); + save_tracepoint_printk = tracepoint_printk; + + ret = proc_dointvec(table, write, buffer, lenp, ppos); + + /* + * This will force exiting early, as tracepoint_printk + * is always zero when tracepoint_printk_iter is not allocated + */ + if (!tracepoint_print_iter) + tracepoint_printk = 0; + + if (save_tracepoint_printk == tracepoint_printk) + goto out; + + if (tracepoint_printk) + static_key_enable(&tracepoint_printk_key.key); + else + static_key_disable(&tracepoint_printk_key.key); + + out: + mutex_unlock(&tracepoint_printk_mutex); + + return ret; +} + +void trace_event_buffer_commit(struct trace_event_buffer *fbuffer) +{ + enum event_trigger_type tt = ETT_NONE; + struct trace_event_file *file = fbuffer->trace_file; + + if (__event_trigger_test_discard(file, fbuffer->buffer, fbuffer->event, + fbuffer->entry, &tt)) + goto discard; + + if (static_key_false(&tracepoint_printk_key.key)) + output_printk(fbuffer); + + if (static_branch_unlikely(&trace_event_exports_enabled)) + ftrace_exports(fbuffer->event, TRACE_EXPORT_EVENT); + + trace_buffer_unlock_commit_regs(file->tr, fbuffer->buffer, + fbuffer->event, fbuffer->trace_ctx, fbuffer->regs); + +discard: + if (tt) + event_triggers_post_call(file, tt); + +} +EXPORT_SYMBOL_GPL(trace_event_buffer_commit); + +/* + * Skip 3: + * + * trace_buffer_unlock_commit_regs() + * trace_event_buffer_commit() + * trace_event_raw_event_xxx() + */ +# define STACK_SKIP 3 + +void trace_buffer_unlock_commit_regs(struct trace_array *tr, + struct trace_buffer *buffer, + struct ring_buffer_event *event, + unsigned int trace_ctx, + struct pt_regs *regs) +{ + __buffer_unlock_commit(buffer, event); + + /* + * If regs is not set, then skip the necessary functions. + * Note, we can still get here via blktrace, wakeup tracer + * and mmiotrace, but that's ok if they lose a function or + * two. They are not that meaningful. + */ + ftrace_trace_stack(tr, buffer, trace_ctx, regs ? 
0 : STACK_SKIP, regs); + ftrace_trace_userstack(tr, buffer, trace_ctx); +} + +/* + * Similar to trace_buffer_unlock_commit_regs() but do not dump stack. + */ +void +trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer, + struct ring_buffer_event *event) +{ + __buffer_unlock_commit(buffer, event); +} + +void +trace_function(struct trace_array *tr, unsigned long ip, unsigned long + parent_ip, unsigned int trace_ctx) +{ + struct trace_event_call *call = &event_function; + struct trace_buffer *buffer = tr->array_buffer.buffer; + struct ring_buffer_event *event; + struct ftrace_entry *entry; + + event = __trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry), + trace_ctx); + if (!event) + return; + entry = ring_buffer_event_data(event); + entry->ip = ip; + entry->parent_ip = parent_ip; + + if (!call_filter_check_discard(call, entry, buffer, event)) { + if (static_branch_unlikely(&trace_function_exports_enabled)) + ftrace_exports(event, TRACE_EXPORT_FUNCTION); + __buffer_unlock_commit(buffer, event); + } +} + +#ifdef CONFIG_STACKTRACE + +/* Allow 4 levels of nesting: normal, softirq, irq, NMI */ +#define FTRACE_KSTACK_NESTING 4 + +#define FTRACE_KSTACK_ENTRIES (PAGE_SIZE / FTRACE_KSTACK_NESTING) + +struct ftrace_stack { + unsigned long calls[FTRACE_KSTACK_ENTRIES]; +}; + + +struct ftrace_stacks { + struct ftrace_stack stacks[FTRACE_KSTACK_NESTING]; +}; + +static DEFINE_PER_CPU(struct ftrace_stacks, ftrace_stacks); +static DEFINE_PER_CPU(int, ftrace_stack_reserve); + +static void __ftrace_trace_stack(struct trace_buffer *buffer, + unsigned int trace_ctx, + int skip, struct pt_regs *regs) +{ + struct trace_event_call *call = &event_kernel_stack; + struct ring_buffer_event *event; + unsigned int size, nr_entries; + struct ftrace_stack *fstack; + struct stack_entry *entry; + int stackidx; + + /* + * Add one, for this function and the call to save_stack_trace() + * If regs is set, then these functions will not be in the way. + */ +#ifndef CONFIG_UNWINDER_ORC + if (!regs) + skip++; +#endif + + preempt_disable_notrace(); + + stackidx = __this_cpu_inc_return(ftrace_stack_reserve) - 1; + + /* This should never happen. If it does, yell once and skip */ + if (WARN_ON_ONCE(stackidx >= FTRACE_KSTACK_NESTING)) + goto out; + + /* + * The above __this_cpu_inc_return() is 'atomic' cpu local. An + * interrupt will either see the value pre increment or post + * increment. If the interrupt happens pre increment it will have + * restored the counter when it returns. We just need a barrier to + * keep gcc from moving things around. 
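+ * + * The resulting stackidx just picks one of the per-CPU ftrace_stack + * slots, one per nesting context, e.g.: + * + *     stackidx 0: task context + *     stackidx 1: softirq that interrupted the task + *     stackidx 2: hardirq that interrupted the softirq + *     stackidx 3: NMI that interrupted the hardirq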
+ */ + barrier(); + + fstack = this_cpu_ptr(ftrace_stacks.stacks) + stackidx; + size = ARRAY_SIZE(fstack->calls); + + if (regs) { + nr_entries = stack_trace_save_regs(regs, fstack->calls, + size, skip); + } else { + nr_entries = stack_trace_save(fstack->calls, size, skip); + } + + event = __trace_buffer_lock_reserve(buffer, TRACE_STACK, + struct_size(entry, caller, nr_entries), + trace_ctx); + if (!event) + goto out; + entry = ring_buffer_event_data(event); + + entry->size = nr_entries; + memcpy(&entry->caller, fstack->calls, + flex_array_size(entry, caller, nr_entries)); + + if (!call_filter_check_discard(call, entry, buffer, event)) + __buffer_unlock_commit(buffer, event); + + out: + /* Again, don't let gcc optimize things here */ + barrier(); + __this_cpu_dec(ftrace_stack_reserve); + preempt_enable_notrace(); + +} + +static inline void ftrace_trace_stack(struct trace_array *tr, + struct trace_buffer *buffer, + unsigned int trace_ctx, + int skip, struct pt_regs *regs) +{ + if (!(tr->trace_flags & TRACE_ITER_STACKTRACE)) + return; + + __ftrace_trace_stack(buffer, trace_ctx, skip, regs); +} + +void __trace_stack(struct trace_array *tr, unsigned int trace_ctx, + int skip) +{ + struct trace_buffer *buffer = tr->array_buffer.buffer; + + if (rcu_is_watching()) { + __ftrace_trace_stack(buffer, trace_ctx, skip, NULL); + return; + } + + if (WARN_ON_ONCE(IS_ENABLED(CONFIG_GENERIC_ENTRY))) + return; + + /* + * When an NMI triggers, RCU is enabled via ct_nmi_enter(), + * but if the above rcu_is_watching() failed, then the NMI + * triggered someplace critical, and ct_irq_enter() should + * not be called from NMI. + */ + if (unlikely(in_nmi())) + return; + + ct_irq_enter_irqson(); + __ftrace_trace_stack(buffer, trace_ctx, skip, NULL); + ct_irq_exit_irqson(); +} + +/** + * trace_dump_stack - record a stack back trace in the trace buffer + * @skip: Number of functions to skip (helper handlers) + */ +void trace_dump_stack(int skip) +{ + if (tracing_disabled || tracing_selftest_running) + return; + +#ifndef CONFIG_UNWINDER_ORC + /* Skip 1 to skip this function. */ + skip++; +#endif + __ftrace_trace_stack(global_trace.array_buffer.buffer, + tracing_gen_ctx(), skip, NULL); +} +EXPORT_SYMBOL_GPL(trace_dump_stack); + +#ifdef CONFIG_USER_STACKTRACE_SUPPORT +static DEFINE_PER_CPU(int, user_stack_count); + +static void +ftrace_trace_userstack(struct trace_array *tr, + struct trace_buffer *buffer, unsigned int trace_ctx) +{ + struct trace_event_call *call = &event_user_stack; + struct ring_buffer_event *event; + struct userstack_entry *entry; + + if (!(tr->trace_flags & TRACE_ITER_USERSTACKTRACE)) + return; + + /* + * NMIs can not handle page faults, even with fix ups. + * The save user stack can (and often does) fault. + */ + if (unlikely(in_nmi())) + return; + + /* + * prevent recursion, since the user stack tracing may + * trigger other kernel events. 
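+ * The per-CPU user_stack_count below serves as that recursion guard.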
+ */ + preempt_disable(); + if (__this_cpu_read(user_stack_count)) + goto out; + + __this_cpu_inc(user_stack_count); + + event = __trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, + sizeof(*entry), trace_ctx); + if (!event) + goto out_drop_count; + entry = ring_buffer_event_data(event); + + entry->tgid = current->tgid; + memset(&entry->caller, 0, sizeof(entry->caller)); + + stack_trace_save_user(entry->caller, FTRACE_STACK_ENTRIES); + if (!call_filter_check_discard(call, entry, buffer, event)) + __buffer_unlock_commit(buffer, event); + + out_drop_count: + __this_cpu_dec(user_stack_count); + out: + preempt_enable(); +} +#else /* CONFIG_USER_STACKTRACE_SUPPORT */ +static void ftrace_trace_userstack(struct trace_array *tr, + struct trace_buffer *buffer, + unsigned int trace_ctx) +{ +} +#endif /* !CONFIG_USER_STACKTRACE_SUPPORT */ + +#endif /* CONFIG_STACKTRACE */ + +static inline void +func_repeats_set_delta_ts(struct func_repeats_entry *entry, + unsigned long long delta) +{ + entry->bottom_delta_ts = delta & U32_MAX; + entry->top_delta_ts = (delta >> 32); +} + +void trace_last_func_repeats(struct trace_array *tr, + struct trace_func_repeats *last_info, + unsigned int trace_ctx) +{ + struct trace_buffer *buffer = tr->array_buffer.buffer; + struct func_repeats_entry *entry; + struct ring_buffer_event *event; + u64 delta; + + event = __trace_buffer_lock_reserve(buffer, TRACE_FUNC_REPEATS, + sizeof(*entry), trace_ctx); + if (!event) + return; + + delta = ring_buffer_event_time_stamp(buffer, event) - + last_info->ts_last_call; + + entry = ring_buffer_event_data(event); + entry->ip = last_info->ip; + entry->parent_ip = last_info->parent_ip; + entry->count = last_info->count; + func_repeats_set_delta_ts(entry, delta); + + __buffer_unlock_commit(buffer, event); +} + +/* created for use with alloc_percpu */ +struct trace_buffer_struct { + int nesting; + char buffer[4][TRACE_BUF_SIZE]; +}; + +static struct trace_buffer_struct __percpu *trace_percpu_buffer; + +/* + * This allows for lockless recording. If we're nested too deeply, then + * this returns NULL. + */ +static char *get_trace_buf(void) +{ + struct trace_buffer_struct *buffer = this_cpu_ptr(trace_percpu_buffer); + + if (!trace_percpu_buffer || buffer->nesting >= 4) + return NULL; + + buffer->nesting++; + + /* Interrupts must see nesting incremented before we use the buffer */ + barrier(); + return &buffer->buffer[buffer->nesting - 1][0]; +} + +static void put_trace_buf(void) +{ + /* Don't let the decrement of nesting leak before this */ + barrier(); + this_cpu_dec(trace_percpu_buffer->nesting); +} + +static int alloc_percpu_trace_buffer(void) +{ + struct trace_buffer_struct __percpu *buffers; + + if (trace_percpu_buffer) + return 0; + + buffers = alloc_percpu(struct trace_buffer_struct); + if (MEM_FAIL(!buffers, "Could not allocate percpu trace_printk buffer")) + return -ENOMEM; + + trace_percpu_buffer = buffers; + return 0; +} + +static int buffers_allocated; + +void trace_printk_init_buffers(void) +{ + if (buffers_allocated) + return; + + if (alloc_percpu_trace_buffer()) + return; + + /* trace_printk() is for debug use only. Don't use it in production. */ + + pr_warn("\n"); + pr_warn("**********************************************************\n"); + pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); + pr_warn("** **\n"); + pr_warn("** trace_printk() being used. Allocating extra memory. 
**\n"); + pr_warn("** **\n"); + pr_warn("** This means that this is a DEBUG kernel and it is **\n"); + pr_warn("** unsafe for production use. **\n"); + pr_warn("** **\n"); + pr_warn("** If you see this message and you are not debugging **\n"); + pr_warn("** the kernel, report this immediately to your vendor! **\n"); + pr_warn("** **\n"); + pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); + pr_warn("**********************************************************\n"); + + /* Expand the buffers to set size */ + tracing_update_buffers(); + + buffers_allocated = 1; + + /* + * trace_printk_init_buffers() can be called by modules. + * If that happens, then we need to start cmdline recording + * directly here. If the global_trace.buffer is already + * allocated here, then this was called by module code. + */ + if (global_trace.array_buffer.buffer) + tracing_start_cmdline_record(); +} +EXPORT_SYMBOL_GPL(trace_printk_init_buffers); + +void trace_printk_start_comm(void) +{ + /* Start tracing comms if trace printk is set */ + if (!buffers_allocated) + return; + tracing_start_cmdline_record(); +} + +static void trace_printk_start_stop_comm(int enabled) +{ + if (!buffers_allocated) + return; + + if (enabled) + tracing_start_cmdline_record(); + else + tracing_stop_cmdline_record(); +} + +/** + * trace_vbprintk - write binary msg to tracing buffer + * @ip: The address of the caller + * @fmt: The string format to write to the buffer + * @args: Arguments for @fmt + */ +int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) +{ + struct trace_event_call *call = &event_bprint; + struct ring_buffer_event *event; + struct trace_buffer *buffer; + struct trace_array *tr = &global_trace; + struct bprint_entry *entry; + unsigned int trace_ctx; + char *tbuffer; + int len = 0, size; + + if (unlikely(tracing_selftest_running || tracing_disabled)) + return 0; + + /* Don't pollute graph traces with trace_vprintk internals */ + pause_graph_tracing(); + + trace_ctx = tracing_gen_ctx(); + preempt_disable_notrace(); + + tbuffer = get_trace_buf(); + if (!tbuffer) { + len = 0; + goto out_nobuffer; + } + + len = vbin_printf((u32 *)tbuffer, TRACE_BUF_SIZE/sizeof(int), fmt, args); + + if (len > TRACE_BUF_SIZE/sizeof(int) || len < 0) + goto out_put; + + size = sizeof(*entry) + sizeof(u32) * len; + buffer = tr->array_buffer.buffer; + ring_buffer_nest_start(buffer); + event = __trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size, + trace_ctx); + if (!event) + goto out; + entry = ring_buffer_event_data(event); + entry->ip = ip; + entry->fmt = fmt; + + memcpy(entry->buf, tbuffer, sizeof(u32) * len); + if (!call_filter_check_discard(call, entry, buffer, event)) { + __buffer_unlock_commit(buffer, event); + ftrace_trace_stack(tr, buffer, trace_ctx, 6, NULL); + } + +out: + ring_buffer_nest_end(buffer); +out_put: + put_trace_buf(); + +out_nobuffer: + preempt_enable_notrace(); + unpause_graph_tracing(); + + return len; +} +EXPORT_SYMBOL_GPL(trace_vbprintk); + +__printf(3, 0) +static int +__trace_array_vprintk(struct trace_buffer *buffer, + unsigned long ip, const char *fmt, va_list args) +{ + struct trace_event_call *call = &event_print; + struct ring_buffer_event *event; + int len = 0, size; + struct print_entry *entry; + unsigned int trace_ctx; + char *tbuffer; + + if (tracing_disabled) + return 0; + + /* Don't pollute graph traces with trace_vprintk internals */ + pause_graph_tracing(); + + trace_ctx = tracing_gen_ctx(); + preempt_disable_notrace(); + + + tbuffer = get_trace_buf(); + if (!tbuffer) { + len = 
0; + goto out_nobuffer; + } + + len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args); + + size = sizeof(*entry) + len + 1; + ring_buffer_nest_start(buffer); + event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, + trace_ctx); + if (!event) + goto out; + entry = ring_buffer_event_data(event); + entry->ip = ip; + + memcpy(&entry->buf, tbuffer, len + 1); + if (!call_filter_check_discard(call, entry, buffer, event)) { + __buffer_unlock_commit(buffer, event); + ftrace_trace_stack(&global_trace, buffer, trace_ctx, 6, NULL); + } + +out: + ring_buffer_nest_end(buffer); + put_trace_buf(); + +out_nobuffer: + preempt_enable_notrace(); + unpause_graph_tracing(); + + return len; +} + +__printf(3, 0) +int trace_array_vprintk(struct trace_array *tr, + unsigned long ip, const char *fmt, va_list args) +{ + if (tracing_selftest_running && tr == &global_trace) + return 0; + + return __trace_array_vprintk(tr->array_buffer.buffer, ip, fmt, args); +} + +/** + * trace_array_printk - Print a message to a specific instance + * @tr: The instance trace_array descriptor + * @ip: The instruction pointer that this is called from. + * @fmt: The format to print (printf format) + * + * If a subsystem sets up its own instance, they have the right to + * printk strings into their tracing instance buffer using this + * function. Note, this function will not write into the top level + * buffer (use trace_printk() for that), as writing into the top level + * buffer should only have events that can be individually disabled. + * trace_printk() is only used for debugging a kernel, and should not + * be ever incorporated in normal use. + * + * trace_array_printk() can be used, as it will not add noise to the + * top level tracing buffer. + * + * Note, trace_array_init_printk() must be called on @tr before this + * can be used. + */ +__printf(3, 0) +int trace_array_printk(struct trace_array *tr, + unsigned long ip, const char *fmt, ...) +{ + int ret; + va_list ap; + + if (!tr) + return -ENOENT; + + /* This is only allowed for created instances */ + if (tr == &global_trace) + return 0; + + if (!(tr->trace_flags & TRACE_ITER_PRINTK)) + return 0; + + va_start(ap, fmt); + ret = trace_array_vprintk(tr, ip, fmt, ap); + va_end(ap); + return ret; +} +EXPORT_SYMBOL_GPL(trace_array_printk); + +/** + * trace_array_init_printk - Initialize buffers for trace_array_printk() + * @tr: The trace array to initialize the buffers for + * + * As trace_array_printk() only writes into instances, they are OK to + * have in the kernel (unlike trace_printk()). This needs to be called + * before trace_array_printk() can be used on a trace_array. + */ +int trace_array_init_printk(struct trace_array *tr) +{ + if (!tr) + return -ENOENT; + + /* This is only allowed for created instances */ + if (tr == &global_trace) + return -EINVAL; + + return alloc_percpu_trace_buffer(); +} +EXPORT_SYMBOL_GPL(trace_array_init_printk); + +__printf(3, 4) +int trace_array_printk_buf(struct trace_buffer *buffer, + unsigned long ip, const char *fmt, ...) 
+{ + int ret; + va_list ap; + + if (!(global_trace.trace_flags & TRACE_ITER_PRINTK)) + return 0; + + va_start(ap, fmt); + ret = __trace_array_vprintk(buffer, ip, fmt, ap); + va_end(ap); + return ret; +} + +__printf(2, 0) +int trace_vprintk(unsigned long ip, const char *fmt, va_list args) +{ + return trace_array_vprintk(&global_trace, ip, fmt, args); +} +EXPORT_SYMBOL_GPL(trace_vprintk); + +static void trace_iterator_increment(struct trace_iterator *iter) +{ + struct ring_buffer_iter *buf_iter = trace_buffer_iter(iter, iter->cpu); + + iter->idx++; + if (buf_iter) + ring_buffer_iter_advance(buf_iter); +} + +static struct trace_entry * +peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts, + unsigned long *lost_events) +{ + struct ring_buffer_event *event; + struct ring_buffer_iter *buf_iter = trace_buffer_iter(iter, cpu); + + if (buf_iter) { + event = ring_buffer_iter_peek(buf_iter, ts); + if (lost_events) + *lost_events = ring_buffer_iter_dropped(buf_iter) ? + (unsigned long)-1 : 0; + } else { + event = ring_buffer_peek(iter->array_buffer->buffer, cpu, ts, + lost_events); + } + + if (event) { + iter->ent_size = ring_buffer_event_length(event); + return ring_buffer_event_data(event); + } + iter->ent_size = 0; + return NULL; +} + +static struct trace_entry * +__find_next_entry(struct trace_iterator *iter, int *ent_cpu, + unsigned long *missing_events, u64 *ent_ts) +{ + struct trace_buffer *buffer = iter->array_buffer->buffer; + struct trace_entry *ent, *next = NULL; + unsigned long lost_events = 0, next_lost = 0; + int cpu_file = iter->cpu_file; + u64 next_ts = 0, ts; + int next_cpu = -1; + int next_size = 0; + int cpu; + + /* + * If we are in a per_cpu trace file, don't bother by iterating over + * all cpu and peek directly. + */ + if (cpu_file > RING_BUFFER_ALL_CPUS) { + if (ring_buffer_empty_cpu(buffer, cpu_file)) + return NULL; + ent = peek_next_entry(iter, cpu_file, ent_ts, missing_events); + if (ent_cpu) + *ent_cpu = cpu_file; + + return ent; + } + + for_each_tracing_cpu(cpu) { + + if (ring_buffer_empty_cpu(buffer, cpu)) + continue; + + ent = peek_next_entry(iter, cpu, &ts, &lost_events); + + /* + * Pick the entry with the smallest timestamp: + */ + if (ent && (!next || ts < next_ts)) { + next = ent; + next_cpu = cpu; + next_ts = ts; + next_lost = lost_events; + next_size = iter->ent_size; + } + } + + iter->ent_size = next_size; + + if (ent_cpu) + *ent_cpu = next_cpu; + + if (ent_ts) + *ent_ts = next_ts; + + if (missing_events) + *missing_events = next_lost; + + return next; +} + +#define STATIC_FMT_BUF_SIZE 128 +static char static_fmt_buf[STATIC_FMT_BUF_SIZE]; + +char *trace_iter_expand_format(struct trace_iterator *iter) +{ + char *tmp; + + /* + * iter->tr is NULL when used with tp_printk, which makes + * this get called where it is not safe to call krealloc(). 
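+ * Callers treat a NULL return as "do not expand" and fall back to + * printing the format as-is.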
*/ + if (!iter->tr || iter->fmt == static_fmt_buf) + return NULL; + + tmp = krealloc(iter->fmt, iter->fmt_size + STATIC_FMT_BUF_SIZE, + GFP_KERNEL); + if (tmp) { + iter->fmt_size += STATIC_FMT_BUF_SIZE; + iter->fmt = tmp; + } + + return tmp; +} + +/* Returns true if the string is safe to dereference from an event */ +static bool trace_safe_str(struct trace_iterator *iter, const char *str, + bool star, int len) +{ + unsigned long addr = (unsigned long)str; + struct trace_event *trace_event; + struct trace_event_call *event; + + /* Ignore strings with no length */ + if (star && !len) + return true; + + /* OK if part of the event data */ + if ((addr >= (unsigned long)iter->ent) && + (addr < (unsigned long)iter->ent + iter->ent_size)) + return true; + + /* OK if part of the temp seq buffer */ + if ((addr >= (unsigned long)iter->tmp_seq.buffer) && + (addr < (unsigned long)iter->tmp_seq.buffer + PAGE_SIZE)) + return true; + + /* Core rodata can not be freed */ + if (is_kernel_rodata(addr)) + return true; + + if (trace_is_tracepoint_string(str)) + return true; + + /* + * Now this could be a module event, referencing core module + * data, which is OK. + */ + if (!iter->ent) + return false; + + trace_event = ftrace_find_event(iter->ent->type); + if (!trace_event) + return false; + + event = container_of(trace_event, struct trace_event_call, event); + if ((event->flags & TRACE_EVENT_FL_DYNAMIC) || !event->module) + return false; + + /* Would rather have rodata, but this will suffice */ + if (within_module_core(addr, event->module)) + return true; + + return false; +} + +static const char *show_buffer(struct trace_seq *s) +{ + struct seq_buf *seq = &s->seq; + + seq_buf_terminate(seq); + + return seq->buffer; +} + +static DEFINE_STATIC_KEY_FALSE(trace_no_verify); + +static int test_can_verify_check(const char *fmt, ...) +{ + char buf[16]; + va_list ap; + int ret; + + /* + * The verifier depends on vsnprintf() modifying the va_list + * passed to it, which requires that the va_list be passed by + * reference. Some architectures (like x86_32) pass it by value, + * which means that vsnprintf() does not modify the va_list passed + * to it, and the verifier would then need to be able to understand + * all the values that vsnprintf can use. If it is passed by value, + * then the verifier is disabled. + */ + va_start(ap, fmt); + vsnprintf(buf, 16, "%d", ap); + ret = va_arg(ap, int); + va_end(ap); + + return ret; +} + +static void test_can_verify(void) +{ + if (!test_can_verify_check("%d %d", 0, 1)) { + pr_info("trace event string verifier disabled\n"); + static_branch_inc(&trace_no_verify); + } +} + +/** + * trace_check_vprintf - Check dereferenced strings while writing to the seq buffer + * @iter: The iterator that holds the seq buffer and the event being printed + * @fmt: The format used to print the event + * @ap: The va_list holding the data to print from @fmt. + * + * This writes the data into the @iter->seq buffer using the data from + * @fmt and @ap. If the format has a %s, then the source of the string + * is examined to make sure it is safe to print, otherwise it will + * warn and print "[UNSAFE MEMORY]" in place of the dereferenced string + * pointer.
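+ * + * For example, a "comm=%s" format whose string pointer is no longer + * valid ends up rendered along the lines of (the pointer value here is + * made up): + * + *     comm=(0xffff8881002f4a00:<recovered bytes>)[UNSAFE-MEMORY]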
+ */ +void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, + va_list ap) +{ + const char *p = fmt; + const char *str; + int i, j; + + if (WARN_ON_ONCE(!fmt)) + return; + + if (static_branch_unlikely(&trace_no_verify)) + goto print; + + /* Don't bother checking when doing a ftrace_dump() */ + if (iter->fmt == static_fmt_buf) + goto print; + + while (*p) { + bool star = false; + int len = 0; + + j = 0; + + /* We only care about %s and variants */ + for (i = 0; p[i]; i++) { + if (i + 1 >= iter->fmt_size) { + /* + * If we can't expand the copy buffer, + * just print it. + */ + if (!trace_iter_expand_format(iter)) + goto print; + } + + if (p[i] == '\\' && p[i+1]) { + i++; + continue; + } + if (p[i] == '%') { + /* Need to test cases like %08.*s */ + for (j = 1; p[i+j]; j++) { + if (isdigit(p[i+j]) || + p[i+j] == '.') + continue; + if (p[i+j] == '*') { + star = true; + continue; + } + break; + } + if (p[i+j] == 's') + break; + star = false; + } + j = 0; + } + /* If no %s found then just print normally */ + if (!p[i]) + break; + + /* Copy up to the %s, and print that */ + strncpy(iter->fmt, p, i); + iter->fmt[i] = '\0'; + trace_seq_vprintf(&iter->seq, iter->fmt, ap); + + /* + * If iter->seq is full, the above call no longer guarantees + * that ap is in sync with fmt processing, and further calls + * to va_arg() can return wrong positional arguments. + * + * Ensure that ap is no longer used in this case. + */ + if (iter->seq.full) { + p = ""; + break; + } + + if (star) + len = va_arg(ap, int); + + /* The ap now points to the string data of the %s */ + str = va_arg(ap, const char *); + + /* + * If you hit this warning, it is likely that the + * trace event in question used %s on a string that + * was saved at the time of the event, but may not be + * around when the trace is read. Use __string(), + * __assign_str() and __get_str() helpers in the TRACE_EVENT() + * instead. See samples/trace_events/trace-events-sample.h + * for reference. 
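+ * + * That is, the event definition should carry the string itself, along + * the lines of (field and source names purely illustrative): + * + *     TP_STRUCT__entry( + *             __string(name, task->comm) + *     ), + *     TP_fast_assign( + *             __assign_str(name, task->comm); + *     ), + *     TP_printk("name=%s", __get_str(name))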
+ */ + if (WARN_ONCE(!trace_safe_str(iter, str, star, len), + "fmt: '%s' current_buffer: '%s'", + fmt, show_buffer(&iter->seq))) { + int ret; + + /* Try to safely read the string */ + if (star) { + if (len + 1 > iter->fmt_size) + len = iter->fmt_size - 1; + if (len < 0) + len = 0; + ret = copy_from_kernel_nofault(iter->fmt, str, len); + iter->fmt[len] = 0; + star = false; + } else { + ret = strncpy_from_kernel_nofault(iter->fmt, str, + iter->fmt_size); + } + if (ret < 0) + trace_seq_printf(&iter->seq, "(0x%px)", str); + else + trace_seq_printf(&iter->seq, "(0x%px:%s)", + str, iter->fmt); + str = "[UNSAFE-MEMORY]"; + strcpy(iter->fmt, "%s"); + } else { + strncpy(iter->fmt, p + i, j + 1); + iter->fmt[j+1] = '\0'; + } + if (star) + trace_seq_printf(&iter->seq, iter->fmt, len, str); + else + trace_seq_printf(&iter->seq, iter->fmt, str); + + p += i + j + 1; + } + print: + if (*p) + trace_seq_vprintf(&iter->seq, p, ap); +} + +const char *trace_event_format(struct trace_iterator *iter, const char *fmt) +{ + const char *p, *new_fmt; + char *q; + + if (WARN_ON_ONCE(!fmt)) + return fmt; + + if (!iter->tr || iter->tr->trace_flags & TRACE_ITER_HASH_PTR) + return fmt; + + p = fmt; + new_fmt = q = iter->fmt; + while (*p) { + if (unlikely(q - new_fmt + 3 > iter->fmt_size)) { + if (!trace_iter_expand_format(iter)) + return fmt; + + q += iter->fmt - new_fmt; + new_fmt = iter->fmt; + } + + *q++ = *p++; + + /* Replace %p with %px */ + if (p[-1] == '%') { + if (p[0] == '%') { + *q++ = *p++; + } else if (p[0] == 'p' && !isalnum(p[1])) { + *q++ = *p++; + *q++ = 'x'; + } + } + } + *q = '\0'; + + return new_fmt; +} + +#define STATIC_TEMP_BUF_SIZE 128 +static char static_temp_buf[STATIC_TEMP_BUF_SIZE] __aligned(4); + +/* Find the next real entry, without updating the iterator itself */ +struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, + int *ent_cpu, u64 *ent_ts) +{ + /* __find_next_entry will reset ent_size */ + int ent_size = iter->ent_size; + struct trace_entry *entry; + + /* + * If called from ftrace_dump(), then the iter->temp buffer + * will be the static_temp_buf and not created from kmalloc. + * If the entry size is greater than the buffer, we can + * not save it. Just return NULL in that case. This is only + * used to add markers when two consecutive events' time + * stamps have a large delta. See trace_print_lat_context() + */ + if (iter->temp == static_temp_buf && + STATIC_TEMP_BUF_SIZE < ent_size) + return NULL; + + /* + * The __find_next_entry() may call peek_next_entry(), which may + * call ring_buffer_peek() that may make the contents of iter->ent + * undefined. Need to copy iter->ent now. + */ + if (iter->ent && iter->ent != iter->temp) { + if ((!iter->temp || iter->temp_size < iter->ent_size) && + !WARN_ON_ONCE(iter->temp == static_temp_buf)) { + void *temp; + temp = kmalloc(iter->ent_size, GFP_KERNEL); + if (!temp) + return NULL; + kfree(iter->temp); + iter->temp = temp; + iter->temp_size = iter->ent_size; + } + memcpy(iter->temp, iter->ent, iter->ent_size); + iter->ent = iter->temp; + } + entry = __find_next_entry(iter, ent_cpu, NULL, ent_ts); + /* Put back the original ent_size */ + iter->ent_size = ent_size; + + return entry; +} + +/* Find the next real entry, and increment the iterator to the next entry */ +void *trace_find_next_entry_inc(struct trace_iterator *iter) +{ + iter->ent = __find_next_entry(iter, &iter->cpu, + &iter->lost_events, &iter->ts); + + if (iter->ent) + trace_iterator_increment(iter); + + return iter->ent ? 
iter : NULL; +} + +static void trace_consume(struct trace_iterator *iter) +{ + ring_buffer_consume(iter->array_buffer->buffer, iter->cpu, &iter->ts, + &iter->lost_events); +} + +static void *s_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct trace_iterator *iter = m->private; + int i = (int)*pos; + void *ent; + + WARN_ON_ONCE(iter->leftover); + + (*pos)++; + + /* can't go backwards */ + if (iter->idx > i) + return NULL; + + if (iter->idx < 0) + ent = trace_find_next_entry_inc(iter); + else + ent = iter; + + while (ent && iter->idx < i) + ent = trace_find_next_entry_inc(iter); + + iter->pos = *pos; + + return ent; +} + +void tracing_iter_reset(struct trace_iterator *iter, int cpu) +{ + struct ring_buffer_iter *buf_iter; + unsigned long entries = 0; + u64 ts; + + per_cpu_ptr(iter->array_buffer->data, cpu)->skipped_entries = 0; + + buf_iter = trace_buffer_iter(iter, cpu); + if (!buf_iter) + return; + + ring_buffer_iter_reset(buf_iter); + + /* + * We could have the case with the max latency tracers + * that a reset never took place on a cpu. This is evident + * by the timestamp being before the start of the buffer. + */ + while (ring_buffer_iter_peek(buf_iter, &ts)) { + if (ts >= iter->array_buffer->time_start) + break; + entries++; + ring_buffer_iter_advance(buf_iter); + } + + per_cpu_ptr(iter->array_buffer->data, cpu)->skipped_entries = entries; +} + +/* + * The current tracer is copied to avoid a global locking + * all around. + */ +static void *s_start(struct seq_file *m, loff_t *pos) +{ + struct trace_iterator *iter = m->private; + struct trace_array *tr = iter->tr; + int cpu_file = iter->cpu_file; + void *p = NULL; + loff_t l = 0; + int cpu; + + mutex_lock(&trace_types_lock); + if (unlikely(tr->current_trace != iter->trace)) { + /* Close iter->trace before switching to the new current tracer */ + if (iter->trace->close) + iter->trace->close(iter); + iter->trace = tr->current_trace; + /* Reopen the new current tracer */ + if (iter->trace->open) + iter->trace->open(iter); + } + mutex_unlock(&trace_types_lock); + +#ifdef CONFIG_TRACER_MAX_TRACE + if (iter->snapshot && iter->trace->use_max_tr) + return ERR_PTR(-EBUSY); +#endif + + if (*pos != iter->pos) { + iter->ent = NULL; + iter->cpu = 0; + iter->idx = -1; + + if (cpu_file == RING_BUFFER_ALL_CPUS) { + for_each_tracing_cpu(cpu) + tracing_iter_reset(iter, cpu); + } else + tracing_iter_reset(iter, cpu_file); + + iter->leftover = 0; + for (p = iter; p && l < *pos; p = s_next(m, p, &l)) + ; + + } else { + /* + * If we overflowed the seq_file before, then we want + * to just reuse the trace_seq buffer again. + */ + if (iter->leftover) + p = iter; + else { + l = *pos - 1; + p = s_next(m, p, &l); + } + } + + trace_event_read_lock(); + trace_access_lock(cpu_file); + return p; +} + +static void s_stop(struct seq_file *m, void *p) +{ + struct trace_iterator *iter = m->private; + +#ifdef CONFIG_TRACER_MAX_TRACE + if (iter->snapshot && iter->trace->use_max_tr) + return; +#endif + + trace_access_unlock(iter->cpu_file); + trace_event_read_unlock(); +} + +static void +get_total_entries_cpu(struct array_buffer *buf, unsigned long *total, + unsigned long *entries, int cpu) +{ + unsigned long count; + + count = ring_buffer_entries_cpu(buf->buffer, cpu); + /* + * If this buffer has skipped entries, then we hold all + * entries for the trace and we need to ignore the + * ones before the time stamp. 
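+ * In that case nothing was overwritten, so the total is simply the + * number of entries still in the buffer.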
+ */ + if (per_cpu_ptr(buf->data, cpu)->skipped_entries) { + count -= per_cpu_ptr(buf->data, cpu)->skipped_entries; + /* total is the same as the entries */ + *total = count; + } else + *total = count + + ring_buffer_overrun_cpu(buf->buffer, cpu); + *entries = count; +} + +static void +get_total_entries(struct array_buffer *buf, + unsigned long *total, unsigned long *entries) +{ + unsigned long t, e; + int cpu; + + *total = 0; + *entries = 0; + + for_each_tracing_cpu(cpu) { + get_total_entries_cpu(buf, &t, &e, cpu); + *total += t; + *entries += e; + } +} + +unsigned long trace_total_entries_cpu(struct trace_array *tr, int cpu) +{ + unsigned long total, entries; + + if (!tr) + tr = &global_trace; + + get_total_entries_cpu(&tr->array_buffer, &total, &entries, cpu); + + return entries; +} + +unsigned long trace_total_entries(struct trace_array *tr) +{ + unsigned long total, entries; + + if (!tr) + tr = &global_trace; + + get_total_entries(&tr->array_buffer, &total, &entries); + + return entries; +} + +static void print_lat_help_header(struct seq_file *m) +{ + seq_puts(m, "# _------=> CPU# \n" + "# / _-----=> irqs-off/BH-disabled\n" + "# | / _----=> need-resched \n" + "# || / _---=> hardirq/softirq \n" + "# ||| / _--=> preempt-depth \n" + "# |||| / _-=> migrate-disable \n" + "# ||||| / delay \n" + "# cmd pid |||||| time | caller \n" + "# \\ / |||||| \\ | / \n"); +} + +static void print_event_info(struct array_buffer *buf, struct seq_file *m) +{ + unsigned long total; + unsigned long entries; + + get_total_entries(buf, &total, &entries); + seq_printf(m, "# entries-in-buffer/entries-written: %lu/%lu #P:%d\n", + entries, total, num_online_cpus()); + seq_puts(m, "#\n"); +} + +static void print_func_help_header(struct array_buffer *buf, struct seq_file *m, + unsigned int flags) +{ + bool tgid = flags & TRACE_ITER_RECORD_TGID; + + print_event_info(buf, m); + + seq_printf(m, "# TASK-PID %s CPU# TIMESTAMP FUNCTION\n", tgid ? " TGID " : ""); + seq_printf(m, "# | | %s | | |\n", tgid ? " | " : ""); +} + +static void print_func_help_header_irq(struct array_buffer *buf, struct seq_file *m, + unsigned int flags) +{ + bool tgid = flags & TRACE_ITER_RECORD_TGID; + static const char space[] = " "; + int prec = tgid ? 
12 : 2; + + print_event_info(buf, m); + + seq_printf(m, "# %.*s _-----=> irqs-off/BH-disabled\n", prec, space); + seq_printf(m, "# %.*s / _----=> need-resched\n", prec, space); + seq_printf(m, "# %.*s| / _---=> hardirq/softirq\n", prec, space); + seq_printf(m, "# %.*s|| / _--=> preempt-depth\n", prec, space); + seq_printf(m, "# %.*s||| / _-=> migrate-disable\n", prec, space); + seq_printf(m, "# %.*s|||| / delay\n", prec, space); + seq_printf(m, "# TASK-PID %.*s CPU# ||||| TIMESTAMP FUNCTION\n", prec, " TGID "); + seq_printf(m, "# | | %.*s | ||||| | |\n", prec, " | "); +} + +void +print_trace_header(struct seq_file *m, struct trace_iterator *iter) +{ + unsigned long sym_flags = (global_trace.trace_flags & TRACE_ITER_SYM_MASK); + struct array_buffer *buf = iter->array_buffer; + struct trace_array_cpu *data = per_cpu_ptr(buf->data, buf->cpu); + struct tracer *type = iter->trace; + unsigned long entries; + unsigned long total; + const char *name = type->name; + + get_total_entries(buf, &total, &entries); + + seq_printf(m, "# %s latency trace v1.1.5 on %s\n", + name, UTS_RELEASE); + seq_puts(m, "# -----------------------------------" + "---------------------------------\n"); + seq_printf(m, "# latency: %lu us, #%lu/%lu, CPU#%d |" + " (M:%s VP:%d, KP:%d, SP:%d HP:%d", + nsecs_to_usecs(data->saved_latency), + entries, + total, + buf->cpu, + preempt_model_none() ? "server" : + preempt_model_voluntary() ? "desktop" : + preempt_model_full() ? "preempt" : + preempt_model_rt() ? "preempt_rt" : + "unknown", + /* These are reserved for later use */ + 0, 0, 0, 0); +#ifdef CONFIG_SMP + seq_printf(m, " #P:%d)\n", num_online_cpus()); +#else + seq_puts(m, ")\n"); +#endif + seq_puts(m, "# -----------------\n"); + seq_printf(m, "# | task: %.16s-%d " + "(uid:%d nice:%ld policy:%ld rt_prio:%ld)\n", + data->comm, data->pid, + from_kuid_munged(seq_user_ns(m), data->uid), data->nice, + data->policy, data->rt_priority); + seq_puts(m, "# -----------------\n"); + + if (data->critical_start) { + seq_puts(m, "# => started at: "); + seq_print_ip_sym(&iter->seq, data->critical_start, sym_flags); + trace_print_seq(m, &iter->seq); + seq_puts(m, "\n# => ended at: "); + seq_print_ip_sym(&iter->seq, data->critical_end, sym_flags); + trace_print_seq(m, &iter->seq); + seq_puts(m, "\n#\n"); + } + + seq_puts(m, "#\n"); +} + +static void test_cpu_buff_start(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + struct trace_array *tr = iter->tr; + + if (!(tr->trace_flags & TRACE_ITER_ANNOTATE)) + return; + + if (!(iter->iter_flags & TRACE_FILE_ANNOTATE)) + return; + + if (cpumask_available(iter->started) && + cpumask_test_cpu(iter->cpu, iter->started)) + return; + + if (per_cpu_ptr(iter->array_buffer->data, iter->cpu)->skipped_entries) + return; + + if (cpumask_available(iter->started)) + cpumask_set_cpu(iter->cpu, iter->started); + + /* Don't print started cpu buffer for the first entry of the trace */ + if (iter->idx > 1) + trace_seq_printf(s, "##### CPU %u buffer started ####\n", + iter->cpu); +} + +static enum print_line_t print_trace_fmt(struct trace_iterator *iter) +{ + struct trace_array *tr = iter->tr; + struct trace_seq *s = &iter->seq; + unsigned long sym_flags = (tr->trace_flags & TRACE_ITER_SYM_MASK); + struct trace_entry *entry; + struct trace_event *event; + + entry = iter->ent; + + test_cpu_buff_start(iter); + + event = ftrace_find_event(entry->type); + + if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) { + if (iter->iter_flags & TRACE_FILE_LAT_FMT) + trace_print_lat_context(iter); + else + 
trace_print_context(iter); + } + + if (trace_seq_has_overflowed(s)) + return TRACE_TYPE_PARTIAL_LINE; + + if (event) { + if (tr->trace_flags & TRACE_ITER_FIELDS) + return print_event_fields(iter, event); + return event->funcs->trace(iter, sym_flags, event); + } + + trace_seq_printf(s, "Unknown type %d\n", entry->type); + + return trace_handle_return(s); +} + +static enum print_line_t print_raw_fmt(struct trace_iterator *iter) +{ + struct trace_array *tr = iter->tr; + struct trace_seq *s = &iter->seq; + struct trace_entry *entry; + struct trace_event *event; + + entry = iter->ent; + + if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) + trace_seq_printf(s, "%d %d %llu ", + entry->pid, iter->cpu, iter->ts); + + if (trace_seq_has_overflowed(s)) + return TRACE_TYPE_PARTIAL_LINE; + + event = ftrace_find_event(entry->type); + if (event) + return event->funcs->raw(iter, 0, event); + + trace_seq_printf(s, "%d ?\n", entry->type); + + return trace_handle_return(s); +} + +static enum print_line_t print_hex_fmt(struct trace_iterator *iter) +{ + struct trace_array *tr = iter->tr; + struct trace_seq *s = &iter->seq; + unsigned char newline = '\n'; + struct trace_entry *entry; + struct trace_event *event; + + entry = iter->ent; + + if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) { + SEQ_PUT_HEX_FIELD(s, entry->pid); + SEQ_PUT_HEX_FIELD(s, iter->cpu); + SEQ_PUT_HEX_FIELD(s, iter->ts); + if (trace_seq_has_overflowed(s)) + return TRACE_TYPE_PARTIAL_LINE; + } + + event = ftrace_find_event(entry->type); + if (event) { + enum print_line_t ret = event->funcs->hex(iter, 0, event); + if (ret != TRACE_TYPE_HANDLED) + return ret; + } + + SEQ_PUT_FIELD(s, newline); + + return trace_handle_return(s); +} + +static enum print_line_t print_bin_fmt(struct trace_iterator *iter) +{ + struct trace_array *tr = iter->tr; + struct trace_seq *s = &iter->seq; + struct trace_entry *entry; + struct trace_event *event; + + entry = iter->ent; + + if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) { + SEQ_PUT_FIELD(s, entry->pid); + SEQ_PUT_FIELD(s, iter->cpu); + SEQ_PUT_FIELD(s, iter->ts); + if (trace_seq_has_overflowed(s)) + return TRACE_TYPE_PARTIAL_LINE; + } + + event = ftrace_find_event(entry->type); + return event ? event->funcs->binary(iter, 0, event) : + TRACE_TYPE_HANDLED; +} + +int trace_empty(struct trace_iterator *iter) +{ + struct ring_buffer_iter *buf_iter; + int cpu; + + /* If we are looking at one CPU buffer, only check that one */ + if (iter->cpu_file != RING_BUFFER_ALL_CPUS) { + cpu = iter->cpu_file; + buf_iter = trace_buffer_iter(iter, cpu); + if (buf_iter) { + if (!ring_buffer_iter_empty(buf_iter)) + return 0; + } else { + if (!ring_buffer_empty_cpu(iter->array_buffer->buffer, cpu)) + return 0; + } + return 1; + } + + for_each_tracing_cpu(cpu) { + buf_iter = trace_buffer_iter(iter, cpu); + if (buf_iter) { + if (!ring_buffer_iter_empty(buf_iter)) + return 0; + } else { + if (!ring_buffer_empty_cpu(iter->array_buffer->buffer, cpu)) + return 0; + } + } + + return 1; +} + +/* Called with trace_event_read_lock() held. 
*/ +enum print_line_t print_trace_line(struct trace_iterator *iter) +{ + struct trace_array *tr = iter->tr; + unsigned long trace_flags = tr->trace_flags; + enum print_line_t ret; + + if (iter->lost_events) { + if (iter->lost_events == (unsigned long)-1) + trace_seq_printf(&iter->seq, "CPU:%d [LOST EVENTS]\n", + iter->cpu); + else + trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n", + iter->cpu, iter->lost_events); + if (trace_seq_has_overflowed(&iter->seq)) + return TRACE_TYPE_PARTIAL_LINE; + } + + if (iter->trace && iter->trace->print_line) { + ret = iter->trace->print_line(iter); + if (ret != TRACE_TYPE_UNHANDLED) + return ret; + } + + if (iter->ent->type == TRACE_BPUTS && + trace_flags & TRACE_ITER_PRINTK && + trace_flags & TRACE_ITER_PRINTK_MSGONLY) + return trace_print_bputs_msg_only(iter); + + if (iter->ent->type == TRACE_BPRINT && + trace_flags & TRACE_ITER_PRINTK && + trace_flags & TRACE_ITER_PRINTK_MSGONLY) + return trace_print_bprintk_msg_only(iter); + + if (iter->ent->type == TRACE_PRINT && + trace_flags & TRACE_ITER_PRINTK && + trace_flags & TRACE_ITER_PRINTK_MSGONLY) + return trace_print_printk_msg_only(iter); + + if (trace_flags & TRACE_ITER_BIN) + return print_bin_fmt(iter); + + if (trace_flags & TRACE_ITER_HEX) + return print_hex_fmt(iter); + + if (trace_flags & TRACE_ITER_RAW) + return print_raw_fmt(iter); + + return print_trace_fmt(iter); +} + +void trace_latency_header(struct seq_file *m) +{ + struct trace_iterator *iter = m->private; + struct trace_array *tr = iter->tr; + + /* print nothing if the buffers are empty */ + if (trace_empty(iter)) + return; + + if (iter->iter_flags & TRACE_FILE_LAT_FMT) + print_trace_header(m, iter); + + if (!(tr->trace_flags & TRACE_ITER_VERBOSE)) + print_lat_help_header(m); +} + +void trace_default_header(struct seq_file *m) +{ + struct trace_iterator *iter = m->private; + struct trace_array *tr = iter->tr; + unsigned long trace_flags = tr->trace_flags; + + if (!(trace_flags & TRACE_ITER_CONTEXT_INFO)) + return; + + if (iter->iter_flags & TRACE_FILE_LAT_FMT) { + /* print nothing if the buffers are empty */ + if (trace_empty(iter)) + return; + print_trace_header(m, iter); + if (!(trace_flags & TRACE_ITER_VERBOSE)) + print_lat_help_header(m); + } else { + if (!(trace_flags & TRACE_ITER_VERBOSE)) { + if (trace_flags & TRACE_ITER_IRQ_INFO) + print_func_help_header_irq(iter->array_buffer, + m, trace_flags); + else + print_func_help_header(iter->array_buffer, m, + trace_flags); + } + } +} + +static void test_ftrace_alive(struct seq_file *m) +{ + if (!ftrace_is_dead()) + return; + seq_puts(m, "# WARNING: FUNCTION TRACING IS CORRUPTED\n" + "# MAY BE MISSING FUNCTION EVENTS\n"); +} + +#ifdef CONFIG_TRACER_MAX_TRACE +static void show_snapshot_main_help(struct seq_file *m) +{ + seq_puts(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n" + "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n" + "# Takes a snapshot of the main buffer.\n" + "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free)\n" + "# (Doesn't have to be '2' works with any number that\n" + "# is not a '0' or '1')\n"); +} + +static void show_snapshot_percpu_help(struct seq_file *m) +{ + seq_puts(m, "# echo 0 > snapshot : Invalid for per_cpu snapshot file.\n"); +#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP + seq_puts(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n" + "# Takes a snapshot of the main buffer for this cpu.\n"); +#else + seq_puts(m, "# echo 1 > snapshot : Not supported with this 
kernel.\n" + "# Must use main snapshot file to allocate.\n"); +#endif + seq_puts(m, "# echo 2 > snapshot : Clears this cpu's snapshot buffer (but does not allocate)\n" + "# (Doesn't have to be '2' works with any number that\n" + "# is not a '0' or '1')\n"); +} + +static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) +{ + if (iter->tr->allocated_snapshot) + seq_puts(m, "#\n# * Snapshot is allocated *\n#\n"); + else + seq_puts(m, "#\n# * Snapshot is freed *\n#\n"); + + seq_puts(m, "# Snapshot commands:\n"); + if (iter->cpu_file == RING_BUFFER_ALL_CPUS) + show_snapshot_main_help(m); + else + show_snapshot_percpu_help(m); +} +#else +/* Should never be called */ +static inline void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) { } +#endif + +static int s_show(struct seq_file *m, void *v) +{ + struct trace_iterator *iter = v; + int ret; + + if (iter->ent == NULL) { + if (iter->tr) { + seq_printf(m, "# tracer: %s\n", iter->trace->name); + seq_puts(m, "#\n"); + test_ftrace_alive(m); + } + if (iter->snapshot && trace_empty(iter)) + print_snapshot_help(m, iter); + else if (iter->trace && iter->trace->print_header) + iter->trace->print_header(m); + else + trace_default_header(m); + + } else if (iter->leftover) { + /* + * If we filled the seq_file buffer earlier, we + * want to just show it now. + */ + ret = trace_print_seq(m, &iter->seq); + + /* ret should this time be zero, but you never know */ + iter->leftover = ret; + + } else { + ret = print_trace_line(iter); + if (ret == TRACE_TYPE_PARTIAL_LINE) { + iter->seq.full = 0; + trace_seq_puts(&iter->seq, "[LINE TOO BIG]\n"); + } + ret = trace_print_seq(m, &iter->seq); + /* + * If we overflow the seq_file buffer, then it will + * ask us for this data again at start up. + * Use that instead. + * ret is 0 if seq_file write succeeded. + * -1 otherwise. + */ + iter->leftover = ret; + } + + return 0; +} + +/* + * Should be used after trace_array_get(), trace_types_lock + * ensures that i_cdev was already initialized. + */ +static inline int tracing_get_cpu(struct inode *inode) +{ + if (inode->i_cdev) /* See trace_create_cpu_file() */ + return (long)inode->i_cdev - 1; + return RING_BUFFER_ALL_CPUS; +} + +static const struct seq_operations tracer_seq_ops = { + .start = s_start, + .next = s_next, + .stop = s_stop, + .show = s_show, +}; + +/* + * Note, as iter itself can be allocated and freed in different + * ways, this function is only used to free its content, and not + * the iterator itself. The only requirement to all the allocations + * is that it must zero all fields (kzalloc), as freeing works with + * ethier allocated content or NULL. + */ +static void free_trace_iter_content(struct trace_iterator *iter) +{ + /* The fmt is either NULL, allocated or points to static_fmt_buf */ + if (iter->fmt != static_fmt_buf) + kfree(iter->fmt); + + kfree(iter->temp); + kfree(iter->buffer_iter); + mutex_destroy(&iter->mutex); + free_cpumask_var(iter->started); +} + +static struct trace_iterator * +__tracing_open(struct inode *inode, struct file *file, bool snapshot) +{ + struct trace_array *tr = inode->i_private; + struct trace_iterator *iter; + int cpu; + + if (tracing_disabled) + return ERR_PTR(-ENODEV); + + iter = __seq_open_private(file, &tracer_seq_ops, sizeof(*iter)); + if (!iter) + return ERR_PTR(-ENOMEM); + + iter->buffer_iter = kcalloc(nr_cpu_ids, sizeof(*iter->buffer_iter), + GFP_KERNEL); + if (!iter->buffer_iter) + goto release; + + /* + * trace_find_next_entry() may need to save off iter->ent. 
+ * It will place it into the iter->temp buffer. As most + * events are less than 128, allocate a buffer of that size. + * If one is greater, then trace_find_next_entry() will + * allocate a new buffer to adjust for the bigger iter->ent. + * It's not critical if it fails to get allocated here. + */ + iter->temp = kmalloc(128, GFP_KERNEL); + if (iter->temp) + iter->temp_size = 128; + + /* + * trace_event_printf() may need to modify given format + * string to replace %p with %px so that it shows real address + * instead of hash value. However, that is only for the event + * tracing, other tracer may not need. Defer the allocation + * until it is needed. + */ + iter->fmt = NULL; + iter->fmt_size = 0; + + mutex_lock(&trace_types_lock); + iter->trace = tr->current_trace; + + if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL)) + goto fail; + + iter->tr = tr; + +#ifdef CONFIG_TRACER_MAX_TRACE + /* Currently only the top directory has a snapshot */ + if (tr->current_trace->print_max || snapshot) + iter->array_buffer = &tr->max_buffer; + else +#endif + iter->array_buffer = &tr->array_buffer; + iter->snapshot = snapshot; + iter->pos = -1; + iter->cpu_file = tracing_get_cpu(inode); + mutex_init(&iter->mutex); + + /* Notify the tracer early; before we stop tracing. */ + if (iter->trace->open) + iter->trace->open(iter); + + /* Annotate start of buffers if we had overruns */ + if (ring_buffer_overruns(iter->array_buffer->buffer)) + iter->iter_flags |= TRACE_FILE_ANNOTATE; + + /* Output in nanoseconds only if we are using a clock in nanoseconds. */ + if (trace_clocks[tr->clock_id].in_ns) + iter->iter_flags |= TRACE_FILE_TIME_IN_NS; + + /* + * If pause-on-trace is enabled, then stop the trace while + * dumping, unless this is the "snapshot" file + */ + if (!iter->snapshot && (tr->trace_flags & TRACE_ITER_PAUSE_ON_TRACE)) + tracing_stop_tr(tr); + + if (iter->cpu_file == RING_BUFFER_ALL_CPUS) { + for_each_tracing_cpu(cpu) { + iter->buffer_iter[cpu] = + ring_buffer_read_prepare(iter->array_buffer->buffer, + cpu, GFP_KERNEL); + } + ring_buffer_read_prepare_sync(); + for_each_tracing_cpu(cpu) { + ring_buffer_read_start(iter->buffer_iter[cpu]); + tracing_iter_reset(iter, cpu); + } + } else { + cpu = iter->cpu_file; + iter->buffer_iter[cpu] = + ring_buffer_read_prepare(iter->array_buffer->buffer, + cpu, GFP_KERNEL); + ring_buffer_read_prepare_sync(); + ring_buffer_read_start(iter->buffer_iter[cpu]); + tracing_iter_reset(iter, cpu); + } + + mutex_unlock(&trace_types_lock); + + return iter; + + fail: + mutex_unlock(&trace_types_lock); + free_trace_iter_content(iter); +release: + seq_release_private(inode, file); + return ERR_PTR(-ENOMEM); +} + +int tracing_open_generic(struct inode *inode, struct file *filp) +{ + int ret; + + ret = tracing_check_open_get_tr(NULL); + if (ret) + return ret; + + filp->private_data = inode->i_private; + return 0; +} + +bool tracing_is_disabled(void) +{ + return (tracing_disabled) ? true: false; +} + +/* + * Open and update trace_array ref count. + * Must have the current trace_array passed to it. + */ +int tracing_open_generic_tr(struct inode *inode, struct file *filp) +{ + struct trace_array *tr = inode->i_private; + int ret; + + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; + + filp->private_data = inode->i_private; + + return 0; +} + +/* + * The private pointer of the inode is the trace_event_file. + * Update the tr ref count associated to it. 
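+ * The open path also takes event_mutex: files already marked + * EVENT_FILE_FL_FREED are rejected, otherwise event_file_get() adds + * a reference on the file itself. 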
+ */ +int tracing_open_file_tr(struct inode *inode, struct file *filp) +{ + struct trace_event_file *file = inode->i_private; + int ret; + + ret = tracing_check_open_get_tr(file->tr); + if (ret) + return ret; + + mutex_lock(&event_mutex); + + /* Fail if the file is marked for removal */ + if (file->flags & EVENT_FILE_FL_FREED) { + trace_array_put(file->tr); + ret = -ENODEV; + } else { + event_file_get(file); + } + + mutex_unlock(&event_mutex); + if (ret) + return ret; + + filp->private_data = inode->i_private; + + return 0; +} + +int tracing_release_file_tr(struct inode *inode, struct file *filp) +{ + struct trace_event_file *file = inode->i_private; + + trace_array_put(file->tr); + event_file_put(file); + + return 0; +} + +int tracing_single_release_file_tr(struct inode *inode, struct file *filp) +{ + tracing_release_file_tr(inode, filp); + return single_release(inode, filp); +} + +static int tracing_mark_open(struct inode *inode, struct file *filp) +{ + stream_open(inode, filp); + return tracing_open_generic_tr(inode, filp); +} + +static int tracing_release(struct inode *inode, struct file *file) +{ + struct trace_array *tr = inode->i_private; + struct seq_file *m = file->private_data; + struct trace_iterator *iter; + int cpu; + + if (!(file->f_mode & FMODE_READ)) { + trace_array_put(tr); + return 0; + } + + /* Writes do not use seq_file */ + iter = m->private; + mutex_lock(&trace_types_lock); + + for_each_tracing_cpu(cpu) { + if (iter->buffer_iter[cpu]) + ring_buffer_read_finish(iter->buffer_iter[cpu]); + } + + if (iter->trace && iter->trace->close) + iter->trace->close(iter); + + if (!iter->snapshot && tr->stop_count) + /* reenable tracing if it was previously enabled */ + tracing_start_tr(tr); + + __trace_array_put(tr); + + mutex_unlock(&trace_types_lock); + + free_trace_iter_content(iter); + seq_release_private(inode, file); + + return 0; +} + +static int tracing_release_generic_tr(struct inode *inode, struct file *file) +{ + struct trace_array *tr = inode->i_private; + + trace_array_put(tr); + return 0; +} + +static int tracing_single_release_tr(struct inode *inode, struct file *file) +{ + struct trace_array *tr = inode->i_private; + + trace_array_put(tr); + + return single_release(inode, file); +} + +static int tracing_open(struct inode *inode, struct file *file) +{ + struct trace_array *tr = inode->i_private; + struct trace_iterator *iter; + int ret; + + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; + + /* If this file was open for write, then erase contents */ + if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { + int cpu = tracing_get_cpu(inode); + struct array_buffer *trace_buf = &tr->array_buffer; + +#ifdef CONFIG_TRACER_MAX_TRACE + if (tr->current_trace->print_max) + trace_buf = &tr->max_buffer; +#endif + + if (cpu == RING_BUFFER_ALL_CPUS) + tracing_reset_online_cpus(trace_buf); + else + tracing_reset_cpu(trace_buf, cpu); + } + + if (file->f_mode & FMODE_READ) { + iter = __tracing_open(inode, file, false); + if (IS_ERR(iter)) + ret = PTR_ERR(iter); + else if (tr->trace_flags & TRACE_ITER_LATENCY_FMT) + iter->iter_flags |= TRACE_FILE_LAT_FMT; + } + + if (ret < 0) + trace_array_put(tr); + + return ret; +} + +/* + * Some tracers are not suitable for instance buffers. + * A tracer is always available for the global array (toplevel) + * or if it explicitly states that it is. 
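+ * A tracer states that by setting its ->allow_instances flag. 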
+ */ +static bool +trace_ok_for_array(struct tracer *t, struct trace_array *tr) +{ + return (tr->flags & TRACE_ARRAY_FL_GLOBAL) || t->allow_instances; +} + +/* Find the next tracer that this trace array may use */ +static struct tracer * +get_tracer_for_array(struct trace_array *tr, struct tracer *t) +{ + while (t && !trace_ok_for_array(t, tr)) + t = t->next; + + return t; +} + +static void * +t_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct trace_array *tr = m->private; + struct tracer *t = v; + + (*pos)++; + + if (t) + t = get_tracer_for_array(tr, t->next); + + return t; +} + +static void *t_start(struct seq_file *m, loff_t *pos) +{ + struct trace_array *tr = m->private; + struct tracer *t; + loff_t l = 0; + + mutex_lock(&trace_types_lock); + + t = get_tracer_for_array(tr, trace_types); + for (; t && l < *pos; t = t_next(m, t, &l)) + ; + + return t; +} + +static void t_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&trace_types_lock); +} + +static int t_show(struct seq_file *m, void *v) +{ + struct tracer *t = v; + + if (!t) + return 0; + + seq_puts(m, t->name); + if (t->next) + seq_putc(m, ' '); + else + seq_putc(m, '\n'); + + return 0; +} + +static const struct seq_operations show_traces_seq_ops = { + .start = t_start, + .next = t_next, + .stop = t_stop, + .show = t_show, +}; + +static int show_traces_open(struct inode *inode, struct file *file) +{ + struct trace_array *tr = inode->i_private; + struct seq_file *m; + int ret; + + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; + + ret = seq_open(file, &show_traces_seq_ops); + if (ret) { + trace_array_put(tr); + return ret; + } + + m = file->private_data; + m->private = tr; + + return 0; +} + +static int show_traces_release(struct inode *inode, struct file *file) +{ + struct trace_array *tr = inode->i_private; + + trace_array_put(tr); + return seq_release(inode, file); +} + +static ssize_t +tracing_write_stub(struct file *filp, const char __user *ubuf, + size_t count, loff_t *ppos) +{ + return count; +} + +loff_t tracing_lseek(struct file *file, loff_t offset, int whence) +{ + int ret; + + if (file->f_mode & FMODE_READ) + ret = seq_lseek(file, offset, whence); + else + file->f_pos = ret = 0; + + return ret; +} + +static const struct file_operations tracing_fops = { + .open = tracing_open, + .read = seq_read, + .read_iter = seq_read_iter, + .splice_read = copy_splice_read, + .write = tracing_write_stub, + .llseek = tracing_lseek, + .release = tracing_release, +}; + +static const struct file_operations show_traces_fops = { + .open = show_traces_open, + .read = seq_read, + .llseek = seq_lseek, + .release = show_traces_release, +}; + +static ssize_t +tracing_cpumask_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *ppos) +{ + struct trace_array *tr = file_inode(filp)->i_private; + char *mask_str; + int len; + + len = snprintf(NULL, 0, "%*pb\n", + cpumask_pr_args(tr->tracing_cpumask)) + 1; + mask_str = kmalloc(len, GFP_KERNEL); + if (!mask_str) + return -ENOMEM; + + len = snprintf(mask_str, len, "%*pb\n", + cpumask_pr_args(tr->tracing_cpumask)); + if (len >= count) { + count = -EINVAL; + goto out_err; + } + count = simple_read_from_buffer(ubuf, count, ppos, mask_str, len); + +out_err: + kfree(mask_str); + + return count; +} + +int tracing_set_cpumask(struct trace_array *tr, + cpumask_var_t tracing_cpumask_new) +{ + int cpu; + + if (!tr) + return -EINVAL; + + local_irq_disable(); + arch_spin_lock(&tr->max_lock); + for_each_tracing_cpu(cpu) { + /* + * Increase/decrease the disabled counter if we 
are + * about to flip a bit in the cpumask: + */ + if (cpumask_test_cpu(cpu, tr->tracing_cpumask) && + !cpumask_test_cpu(cpu, tracing_cpumask_new)) { + atomic_inc(&per_cpu_ptr(tr->array_buffer.data, cpu)->disabled); + ring_buffer_record_disable_cpu(tr->array_buffer.buffer, cpu); +#ifdef CONFIG_TRACER_MAX_TRACE + ring_buffer_record_disable_cpu(tr->max_buffer.buffer, cpu); +#endif + } + if (!cpumask_test_cpu(cpu, tr->tracing_cpumask) && + cpumask_test_cpu(cpu, tracing_cpumask_new)) { + atomic_dec(&per_cpu_ptr(tr->array_buffer.data, cpu)->disabled); + ring_buffer_record_enable_cpu(tr->array_buffer.buffer, cpu); +#ifdef CONFIG_TRACER_MAX_TRACE + ring_buffer_record_enable_cpu(tr->max_buffer.buffer, cpu); +#endif + } + } + arch_spin_unlock(&tr->max_lock); + local_irq_enable(); + + cpumask_copy(tr->tracing_cpumask, tracing_cpumask_new); + + return 0; +} + +static ssize_t +tracing_cpumask_write(struct file *filp, const char __user *ubuf, + size_t count, loff_t *ppos) +{ + struct trace_array *tr = file_inode(filp)->i_private; + cpumask_var_t tracing_cpumask_new; + int err; + + if (!zalloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL)) + return -ENOMEM; + + err = cpumask_parse_user(ubuf, count, tracing_cpumask_new); + if (err) + goto err_free; + + err = tracing_set_cpumask(tr, tracing_cpumask_new); + if (err) + goto err_free; + + free_cpumask_var(tracing_cpumask_new); + + return count; + +err_free: + free_cpumask_var(tracing_cpumask_new); + + return err; +} + +static const struct file_operations tracing_cpumask_fops = { + .open = tracing_open_generic_tr, + .read = tracing_cpumask_read, + .write = tracing_cpumask_write, + .release = tracing_release_generic_tr, + .llseek = generic_file_llseek, +}; + +static int tracing_trace_options_show(struct seq_file *m, void *v) +{ + struct tracer_opt *trace_opts; + struct trace_array *tr = m->private; + u32 tracer_flags; + int i; + + mutex_lock(&trace_types_lock); + tracer_flags = tr->current_trace->flags->val; + trace_opts = tr->current_trace->flags->opts; + + for (i = 0; trace_options[i]; i++) { + if (tr->trace_flags & (1 << i)) + seq_printf(m, "%s\n", trace_options[i]); + else + seq_printf(m, "no%s\n", trace_options[i]); + } + + for (i = 0; trace_opts[i].name; i++) { + if (tracer_flags & trace_opts[i].bit) + seq_printf(m, "%s\n", trace_opts[i].name); + else + seq_printf(m, "no%s\n", trace_opts[i].name); + } + mutex_unlock(&trace_types_lock); + + return 0; +} + +static int __set_tracer_option(struct trace_array *tr, + struct tracer_flags *tracer_flags, + struct tracer_opt *opts, int neg) +{ + struct tracer *trace = tracer_flags->trace; + int ret; + + ret = trace->set_flag(tr, tracer_flags->val, opts->bit, !neg); + if (ret) + return ret; + + if (neg) + tracer_flags->val &= ~opts->bit; + else + tracer_flags->val |= opts->bit; + return 0; +} + +/* Try to assign a tracer specific option */ +static int set_tracer_option(struct trace_array *tr, char *cmp, int neg) +{ + struct tracer *trace = tr->current_trace; + struct tracer_flags *tracer_flags = trace->flags; + struct tracer_opt *opts = NULL; + int i; + + for (i = 0; tracer_flags->opts[i].name; i++) { + opts = &tracer_flags->opts[i]; + + if (strcmp(cmp, opts->name) == 0) + return __set_tracer_option(tr, trace->flags, opts, neg); + } + + return -EINVAL; +} + +/* Some tracers require overwrite to stay enabled */ +int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set) +{ + if (tracer->enabled && (mask & TRACE_ITER_OVERWRITE) && !set) + return -1; + + return 0; +} + +int set_tracer_flag(struct trace_array 
*tr, unsigned int mask, int enabled) +{ + int *map; + + if ((mask == TRACE_ITER_RECORD_TGID) || + (mask == TRACE_ITER_RECORD_CMD)) + lockdep_assert_held(&event_mutex); + + /* do nothing if flag is already set */ + if (!!(tr->trace_flags & mask) == !!enabled) + return 0; + + /* Give the tracer a chance to approve the change */ + if (tr->current_trace->flag_changed) + if (tr->current_trace->flag_changed(tr, mask, !!enabled)) + return -EINVAL; + + if (enabled) + tr->trace_flags |= mask; + else + tr->trace_flags &= ~mask; + + if (mask == TRACE_ITER_RECORD_CMD) + trace_event_enable_cmd_record(enabled); + + if (mask == TRACE_ITER_RECORD_TGID) { + if (!tgid_map) { + tgid_map_max = pid_max; + map = kvcalloc(tgid_map_max + 1, sizeof(*tgid_map), + GFP_KERNEL); + + /* + * Pairs with smp_load_acquire() in + * trace_find_tgid_ptr() to ensure that if it observes + * the tgid_map we just allocated then it also observes + * the corresponding tgid_map_max value. + */ + smp_store_release(&tgid_map, map); + } + if (!tgid_map) { + tr->trace_flags &= ~TRACE_ITER_RECORD_TGID; + return -ENOMEM; + } + + trace_event_enable_tgid_record(enabled); + } + + if (mask == TRACE_ITER_EVENT_FORK) + trace_event_follow_fork(tr, enabled); + + if (mask == TRACE_ITER_FUNC_FORK) + ftrace_pid_follow_fork(tr, enabled); + + if (mask == TRACE_ITER_OVERWRITE) { + ring_buffer_change_overwrite(tr->array_buffer.buffer, enabled); +#ifdef CONFIG_TRACER_MAX_TRACE + ring_buffer_change_overwrite(tr->max_buffer.buffer, enabled); +#endif + } + + if (mask == TRACE_ITER_PRINTK) { + trace_printk_start_stop_comm(enabled); + trace_printk_control(enabled); + } + + return 0; +} + +int trace_set_options(struct trace_array *tr, char *option) +{ + char *cmp; + int neg = 0; + int ret; + size_t orig_len = strlen(option); + int len; + + cmp = strstrip(option); + + len = str_has_prefix(cmp, "no"); + if (len) + neg = 1; + + cmp += len; + + mutex_lock(&event_mutex); + mutex_lock(&trace_types_lock); + + ret = match_string(trace_options, -1, cmp); + /* If no option could be set, test the specific tracer options */ + if (ret < 0) + ret = set_tracer_option(tr, cmp, neg); + else + ret = set_tracer_flag(tr, 1 << ret, !neg); + + mutex_unlock(&trace_types_lock); + mutex_unlock(&event_mutex); + + /* + * If the first trailing whitespace is replaced with '\0' by strstrip, + * turn it back into a space. 
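+ * This keeps the option buffer at the same length the caller passed in. 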
+ */ + if (orig_len > strlen(option)) + option[strlen(option)] = ' '; + + return ret; +} + +static void __init apply_trace_boot_options(void) +{ + char *buf = trace_boot_options_buf; + char *option; + + while (true) { + option = strsep(&buf, ","); + + if (!option) + break; + + if (*option) + trace_set_options(&global_trace, option); + + /* Put back the comma to allow this to be called again */ + if (buf) + *(buf - 1) = ','; + } +} + +static ssize_t +tracing_trace_options_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct seq_file *m = filp->private_data; + struct trace_array *tr = m->private; + char buf[64]; + int ret; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + ret = trace_set_options(tr, buf); + if (ret < 0) + return ret; + + *ppos += cnt; + + return cnt; +} + +static int tracing_trace_options_open(struct inode *inode, struct file *file) +{ + struct trace_array *tr = inode->i_private; + int ret; + + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; + + ret = single_open(file, tracing_trace_options_show, inode->i_private); + if (ret < 0) + trace_array_put(tr); + + return ret; +} + +static const struct file_operations tracing_iter_fops = { + .open = tracing_trace_options_open, + .read = seq_read, + .llseek = seq_lseek, + .release = tracing_single_release_tr, + .write = tracing_trace_options_write, +}; + +static const char readme_msg[] = + "tracing mini-HOWTO:\n\n" + "# echo 0 > tracing_on : quick way to disable tracing\n" + "# echo 1 > tracing_on : quick way to re-enable tracing\n\n" + " Important files:\n" + " trace\t\t\t- The static contents of the buffer\n" + "\t\t\t To clear the buffer write into this file: echo > trace\n" + " trace_pipe\t\t- A consuming read to see the contents of the buffer\n" + " current_tracer\t- function and latency tracers\n" + " available_tracers\t- list of configured tracers for current_tracer\n" + " error_log\t- error log for failed commands (that support it)\n" + " buffer_size_kb\t- view and modify size of per cpu buffer\n" + " buffer_total_size_kb - view total size of all cpu buffers\n\n" + " trace_clock\t\t- change the clock used to order events\n" + " local: Per cpu clock but may not be synced across CPUs\n" + " global: Synced across CPUs but slows tracing down.\n" + " counter: Not a clock, but just an increment\n" + " uptime: Jiffy counter from time of boot\n" + " perf: Same clock that perf events use\n" +#ifdef CONFIG_X86_64 + " x86-tsc: TSC cycle counter\n" +#endif + "\n timestamp_mode\t- view the mode used to timestamp events\n" + " delta: Delta difference against a buffer-wide timestamp\n" + " absolute: Absolute (standalone) timestamp\n" + "\n trace_marker\t\t- Writes into this file writes into the kernel buffer\n" + "\n trace_marker_raw\t\t- Writes into this file writes binary data into the kernel buffer\n" + " tracing_cpumask\t- Limit which CPUs to trace\n" + " instances\t\t- Make sub-buffers with: mkdir instances/foo\n" + "\t\t\t Remove sub-buffer with rmdir\n" + " trace_options\t\t- Set format or modify how tracing happens\n" + "\t\t\t Disable an option by prefixing 'no' to the\n" + "\t\t\t option name\n" + " saved_cmdlines_size\t- echo command number in here to store comm-pid list\n" +#ifdef CONFIG_DYNAMIC_FTRACE + "\n available_filter_functions - list of functions that can be filtered on\n" + " set_ftrace_filter\t- echo function name in here to only trace these\n" + "\t\t\t functions\n" + "\t accepts: 
func_full_name or glob-matching-pattern\n" + "\t modules: Can select a group via module\n" + "\t Format: :mod:<module-name>\n" + "\t example: echo :mod:ext3 > set_ftrace_filter\n" + "\t triggers: a command to perform when function is hit\n" + "\t Format: <function>:<trigger>[:count]\n" + "\t trigger: traceon, traceoff\n" + "\t\t enable_event:<system>:<event>\n" + "\t\t disable_event:<system>:<event>\n" +#ifdef CONFIG_STACKTRACE + "\t\t stacktrace\n" +#endif +#ifdef CONFIG_TRACER_SNAPSHOT + "\t\t snapshot\n" +#endif + "\t\t dump\n" + "\t\t cpudump\n" + "\t example: echo do_fault:traceoff > set_ftrace_filter\n" + "\t echo do_trap:traceoff:3 > set_ftrace_filter\n" + "\t The first one will disable tracing every time do_fault is hit\n" + "\t The second will disable tracing at most 3 times when do_trap is hit\n" + "\t The first time do trap is hit and it disables tracing, the\n" + "\t counter will decrement to 2. If tracing is already disabled,\n" + "\t the counter will not decrement. It only decrements when the\n" + "\t trigger did work\n" + "\t To remove trigger without count:\n" + "\t echo '!<function>:<trigger> > set_ftrace_filter\n" + "\t To remove trigger with a count:\n" + "\t echo '!<function>:<trigger>:0 > set_ftrace_filter\n" + " set_ftrace_notrace\t- echo function name in here to never trace.\n" + "\t accepts: func_full_name, *func_end, func_begin*, *func_middle*\n" + "\t modules: Can select a group via module command :mod:\n" + "\t Does not accept triggers\n" +#endif /* CONFIG_DYNAMIC_FTRACE */ +#ifdef CONFIG_FUNCTION_TRACER + " set_ftrace_pid\t- Write pid(s) to only function trace those pids\n" + "\t\t (function)\n" + " set_ftrace_notrace_pid\t- Write pid(s) to not function trace those pids\n" + "\t\t (function)\n" +#endif +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + " set_graph_function\t- Trace the nested calls of a function (function_graph)\n" + " set_graph_notrace\t- Do not trace the nested calls of a function (function_graph)\n" + " max_graph_depth\t- Trace a limited depth of nested calls (0 is unlimited)\n" +#endif +#ifdef CONFIG_TRACER_SNAPSHOT + "\n snapshot\t\t- Like 'trace' but shows the content of the static\n" + "\t\t\t snapshot buffer. 
Read the contents for more\n" + "\t\t\t information\n" +#endif +#ifdef CONFIG_STACK_TRACER + " stack_trace\t\t- Shows the max stack trace when active\n" + " stack_max_size\t- Shows current max stack size that was traced\n" + "\t\t\t Write into this file to reset the max size (trigger a\n" + "\t\t\t new trace)\n" +#ifdef CONFIG_DYNAMIC_FTRACE + " stack_trace_filter\t- Like set_ftrace_filter but limits what stack_trace\n" + "\t\t\t traces\n" +#endif +#endif /* CONFIG_STACK_TRACER */ +#ifdef CONFIG_DYNAMIC_EVENTS + " dynamic_events\t\t- Create/append/remove/show the generic dynamic events\n" + "\t\t\t Write into this file to define/undefine new trace events.\n" +#endif +#ifdef CONFIG_KPROBE_EVENTS + " kprobe_events\t\t- Create/append/remove/show the kernel dynamic events\n" + "\t\t\t Write into this file to define/undefine new trace events.\n" +#endif +#ifdef CONFIG_UPROBE_EVENTS + " uprobe_events\t\t- Create/append/remove/show the userspace dynamic events\n" + "\t\t\t Write into this file to define/undefine new trace events.\n" +#endif +#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS) || \ + defined(CONFIG_FPROBE_EVENTS) + "\t accepts: event-definitions (one definition per line)\n" +#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS) + "\t Format: p[:[<group>/][<event>]] <place> [<args>]\n" + "\t r[maxactive][:[<group>/][<event>]] <place> [<args>]\n" +#endif +#ifdef CONFIG_FPROBE_EVENTS + "\t f[:[<group>/][<event>]] <func-name>[%return] [<args>]\n" + "\t t[:[<group>/][<event>]] <tracepoint> [<args>]\n" +#endif +#ifdef CONFIG_HIST_TRIGGERS + "\t s:[synthetic/]<event> <field> [<field>]\n" +#endif + "\t e[:[<group>/][<event>]] <attached-group>.<attached-event> [<args>] [if <filter>]\n" + "\t -:[<group>/][<event>]\n" +#ifdef CONFIG_KPROBE_EVENTS + "\t place: [<module>:]<symbol>[+<offset>]|<memaddr>\n" + "place (kretprobe): [<module>:]<symbol>[+<offset>]%return|<memaddr>\n" +#endif +#ifdef CONFIG_UPROBE_EVENTS + " place (uprobe): <path>:<offset>[%return][(ref_ctr_offset)]\n" +#endif + "\t args: <name>=fetcharg[:type]\n" + "\t fetcharg: (%<register>|$<efield>), @<address>, @<symbol>[+|-<offset>],\n" +#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API +#ifdef CONFIG_PROBE_EVENTS_BTF_ARGS + "\t $stack<index>, $stack, $retval, $comm, $arg<N>,\n" + "\t <argname>[->field[->field|.field...]],\n" +#else + "\t $stack<index>, $stack, $retval, $comm, $arg<N>,\n" +#endif +#else + "\t $stack<index>, $stack, $retval, $comm,\n" +#endif + "\t +|-[u]<offset>(<fetcharg>), \\imm-value, \\\"imm-string\"\n" + "\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, char, string, symbol,\n" + "\t b<bit-width>@<bit-offset>/<container-size>, ustring,\n" + "\t symstr, <type>\\[<array-size>\\]\n" +#ifdef CONFIG_HIST_TRIGGERS + "\t field: <stype> <name>;\n" + "\t stype: u8/u16/u32/u64, s8/s16/s32/s64, pid_t,\n" + "\t [unsigned] char/int/long\n" +#endif + "\t efield: For event probes ('e' types), the field is on of the fields\n" + "\t of the <attached-group>/<attached-event>.\n" +#endif + " events/\t\t- Directory containing all trace event subsystems:\n" + " enable\t\t- Write 0/1 to enable/disable tracing of all events\n" + " events/<system>/\t- Directory containing all trace events for <system>:\n" + " enable\t\t- Write 0/1 to enable/disable tracing of all <system>\n" + "\t\t\t events\n" + " filter\t\t- If set, only events passing filter are traced\n" + " events/<system>/<event>/\t- Directory containing control files for\n" + "\t\t\t <event>:\n" + " enable\t\t- Write 0/1 to enable/disable tracing of 
<event>\n" + " filter\t\t- If set, only events passing filter are traced\n" + " trigger\t\t- If set, a command to perform when event is hit\n" + "\t Format: <trigger>[:count][if <filter>]\n" + "\t trigger: traceon, traceoff\n" + "\t enable_event:<system>:<event>\n" + "\t disable_event:<system>:<event>\n" +#ifdef CONFIG_HIST_TRIGGERS + "\t enable_hist:<system>:<event>\n" + "\t disable_hist:<system>:<event>\n" +#endif +#ifdef CONFIG_STACKTRACE + "\t\t stacktrace\n" +#endif +#ifdef CONFIG_TRACER_SNAPSHOT + "\t\t snapshot\n" +#endif +#ifdef CONFIG_HIST_TRIGGERS + "\t\t hist (see below)\n" +#endif + "\t example: echo traceoff > events/block/block_unplug/trigger\n" + "\t echo traceoff:3 > events/block/block_unplug/trigger\n" + "\t echo 'enable_event:kmem:kmalloc:3 if nr_rq > 1' > \\\n" + "\t events/block/block_unplug/trigger\n" + "\t The first disables tracing every time block_unplug is hit.\n" + "\t The second disables tracing the first 3 times block_unplug is hit.\n" + "\t The third enables the kmalloc event the first 3 times block_unplug\n" + "\t is hit and has value of greater than 1 for the 'nr_rq' event field.\n" + "\t Like function triggers, the counter is only decremented if it\n" + "\t enabled or disabled tracing.\n" + "\t To remove a trigger without a count:\n" + "\t echo '!<trigger> > <system>/<event>/trigger\n" + "\t To remove a trigger with a count:\n" + "\t echo '!<trigger>:0 > <system>/<event>/trigger\n" + "\t Filters can be ignored when removing a trigger.\n" +#ifdef CONFIG_HIST_TRIGGERS + " hist trigger\t- If set, event hits are aggregated into a hash table\n" + "\t Format: hist:keys=<field1[,field2,...]>\n" + "\t [:<var1>=<field|var_ref|numeric_literal>[,<var2>=...]]\n" + "\t [:values=<field1[,field2,...]>]\n" + "\t [:sort=<field1[,field2,...]>]\n" + "\t [:size=#entries]\n" + "\t [:pause][:continue][:clear]\n" + "\t [:name=histname1]\n" + "\t [:nohitcount]\n" + "\t [:<handler>.<action>]\n" + "\t [if <filter>]\n\n" + "\t Note, special fields can be used as well:\n" + "\t common_timestamp - to record current timestamp\n" + "\t common_cpu - to record the CPU the event happened on\n" + "\n" + "\t A hist trigger variable can be:\n" + "\t - a reference to a field e.g. x=current_timestamp,\n" + "\t - a reference to another variable e.g. y=$x,\n" + "\t - a numeric literal: e.g. ms_per_sec=1000,\n" + "\t - an arithmetic expression: e.g. time_secs=current_timestamp/1000\n" + "\n" + "\t hist trigger arithmetic expressions support addition(+), subtraction(-),\n" + "\t multiplication(*) and division(/) operators. An operand can be either a\n" + "\t variable reference, field or numeric literal.\n" + "\n" + "\t When a matching event is hit, an entry is added to a hash\n" + "\t table using the key(s) and value(s) named, and the value of a\n" + "\t sum called 'hitcount' is incremented. Keys and values\n" + "\t correspond to fields in the event's format description. Keys\n" + "\t can be any field, or the special string 'common_stacktrace'.\n" + "\t Compound keys consisting of up to two fields can be specified\n" + "\t by the 'keys' keyword. Values must correspond to numeric\n" + "\t fields. Sort keys consisting of up to two fields can be\n" + "\t specified using the 'sort' keyword. The sort direction can\n" + "\t be modified by appending '.descending' or '.ascending' to a\n" + "\t sort field. 
The 'size' parameter can be used to specify more\n" + "\t or fewer than the default 2048 entries for the hashtable size.\n" + "\t If a hist trigger is given a name using the 'name' parameter,\n" + "\t its histogram data will be shared with other triggers of the\n" + "\t same name, and trigger hits will update this common data.\n\n" + "\t Reading the 'hist' file for the event will dump the hash\n" + "\t table in its entirety to stdout. If there are multiple hist\n" + "\t triggers attached to an event, there will be a table for each\n" + "\t trigger in the output. The table displayed for a named\n" + "\t trigger will be the same as any other instance having the\n" + "\t same name. The default format used to display a given field\n" + "\t can be modified by appending any of the following modifiers\n" + "\t to the field name, as applicable:\n\n" + "\t .hex display a number as a hex value\n" + "\t .sym display an address as a symbol\n" + "\t .sym-offset display an address as a symbol and offset\n" + "\t .execname display a common_pid as a program name\n" + "\t .syscall display a syscall id as a syscall name\n" + "\t .log2 display log2 value rather than raw number\n" + "\t .buckets=size display values in groups of size rather than raw number\n" + "\t .usecs display a common_timestamp in microseconds\n" + "\t .percent display a number of percentage value\n" + "\t .graph display a bar-graph of a value\n\n" + "\t The 'pause' parameter can be used to pause an existing hist\n" + "\t trigger or to start a hist trigger but not log any events\n" + "\t until told to do so. 'continue' can be used to start or\n" + "\t restart a paused hist trigger.\n\n" + "\t The 'clear' parameter will clear the contents of a running\n" + "\t hist trigger and leave its current paused/active state\n" + "\t unchanged.\n\n" + "\t The 'nohitcount' (or NOHC) parameter will suppress display of\n" + "\t raw hitcount in the histogram.\n\n" + "\t The enable_hist and disable_hist triggers can be used to\n" + "\t have one event conditionally start and stop another event's\n" + "\t already-attached hist trigger. The syntax is analogous to\n" + "\t the enable_event and disable_event triggers.\n\n" + "\t Hist trigger handlers and actions are executed whenever a\n" + "\t a histogram entry is added or updated. They take the form:\n\n" + "\t <handler>.<action>\n\n" + "\t The available handlers are:\n\n" + "\t onmatch(matching.event) - invoke on addition or update\n" + "\t onmax(var) - invoke if var exceeds current max\n" + "\t onchange(var) - invoke action if var changes\n\n" + "\t The available actions are:\n\n" + "\t trace(<synthetic_event>,param list) - generate synthetic event\n" + "\t save(field,...) 
- save current event fields\n" +#ifdef CONFIG_TRACER_SNAPSHOT + "\t snapshot() - snapshot the trace buffer\n\n" +#endif +#ifdef CONFIG_SYNTH_EVENTS + " events/synthetic_events\t- Create/append/remove/show synthetic events\n" + "\t Write into this file to define/undefine new synthetic events.\n" + "\t example: echo 'myevent u64 lat; char name[]; long[] stack' >> synthetic_events\n" +#endif +#endif +; + +static ssize_t +tracing_readme_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return simple_read_from_buffer(ubuf, cnt, ppos, + readme_msg, strlen(readme_msg)); +} + +static const struct file_operations tracing_readme_fops = { + .open = tracing_open_generic, + .read = tracing_readme_read, + .llseek = generic_file_llseek, +}; + +static void *saved_tgids_next(struct seq_file *m, void *v, loff_t *pos) +{ + int pid = ++(*pos); + + return trace_find_tgid_ptr(pid); +} + +static void *saved_tgids_start(struct seq_file *m, loff_t *pos) +{ + int pid = *pos; + + return trace_find_tgid_ptr(pid); +} + +static void saved_tgids_stop(struct seq_file *m, void *v) +{ +} + +static int saved_tgids_show(struct seq_file *m, void *v) +{ + int *entry = (int *)v; + int pid = entry - tgid_map; + int tgid = *entry; + + if (tgid == 0) + return SEQ_SKIP; + + seq_printf(m, "%d %d\n", pid, tgid); + return 0; +} + +static const struct seq_operations tracing_saved_tgids_seq_ops = { + .start = saved_tgids_start, + .stop = saved_tgids_stop, + .next = saved_tgids_next, + .show = saved_tgids_show, +}; + +static int tracing_saved_tgids_open(struct inode *inode, struct file *filp) +{ + int ret; + + ret = tracing_check_open_get_tr(NULL); + if (ret) + return ret; + + return seq_open(filp, &tracing_saved_tgids_seq_ops); +} + + +static const struct file_operations tracing_saved_tgids_fops = { + .open = tracing_saved_tgids_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static void *saved_cmdlines_next(struct seq_file *m, void *v, loff_t *pos) +{ + unsigned int *ptr = v; + + if (*pos || m->count) + ptr++; + + (*pos)++; + + for (; ptr < &savedcmd->map_cmdline_to_pid[savedcmd->cmdline_num]; + ptr++) { + if (*ptr == -1 || *ptr == NO_CMDLINE_MAP) + continue; + + return ptr; + } + + return NULL; +} + +static void *saved_cmdlines_start(struct seq_file *m, loff_t *pos) +{ + void *v; + loff_t l = 0; + + preempt_disable(); + arch_spin_lock(&trace_cmdline_lock); + + v = &savedcmd->map_cmdline_to_pid[0]; + while (l <= *pos) { + v = saved_cmdlines_next(m, v, &l); + if (!v) + return NULL; + } + + return v; +} + +static void saved_cmdlines_stop(struct seq_file *m, void *v) +{ + arch_spin_unlock(&trace_cmdline_lock); + preempt_enable(); +} + +static int saved_cmdlines_show(struct seq_file *m, void *v) +{ + char buf[TASK_COMM_LEN]; + unsigned int *pid = v; + + __trace_find_cmdline(*pid, buf); + seq_printf(m, "%d %s\n", *pid, buf); + return 0; +} + +static const struct seq_operations tracing_saved_cmdlines_seq_ops = { + .start = saved_cmdlines_start, + .next = saved_cmdlines_next, + .stop = saved_cmdlines_stop, + .show = saved_cmdlines_show, +}; + +static int tracing_saved_cmdlines_open(struct inode *inode, struct file *filp) +{ + int ret; + + ret = tracing_check_open_get_tr(NULL); + if (ret) + return ret; + + return seq_open(filp, &tracing_saved_cmdlines_seq_ops); +} + +static const struct file_operations tracing_saved_cmdlines_fops = { + .open = tracing_saved_cmdlines_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static ssize_t 
+tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + int r; + + preempt_disable(); + arch_spin_lock(&trace_cmdline_lock); + r = scnprintf(buf, sizeof(buf), "%u\n", savedcmd->cmdline_num); + arch_spin_unlock(&trace_cmdline_lock); + preempt_enable(); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s) +{ + kfree(s->saved_cmdlines); + kfree(s->map_cmdline_to_pid); + kfree(s); +} + +static int tracing_resize_saved_cmdlines(unsigned int val) +{ + struct saved_cmdlines_buffer *s, *savedcmd_temp; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + if (allocate_cmdlines_buffer(val, s) < 0) { + kfree(s); + return -ENOMEM; + } + + preempt_disable(); + arch_spin_lock(&trace_cmdline_lock); + savedcmd_temp = savedcmd; + savedcmd = s; + arch_spin_unlock(&trace_cmdline_lock); + preempt_enable(); + free_saved_cmdlines_buffer(savedcmd_temp); + + return 0; +} + +static ssize_t +tracing_saved_cmdlines_size_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + unsigned long val; + int ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + /* must have at least 1 entry or less than PID_MAX_DEFAULT */ + if (!val || val > PID_MAX_DEFAULT) + return -EINVAL; + + ret = tracing_resize_saved_cmdlines((unsigned int)val); + if (ret < 0) + return ret; + + *ppos += cnt; + + return cnt; +} + +static const struct file_operations tracing_saved_cmdlines_size_fops = { + .open = tracing_open_generic, + .read = tracing_saved_cmdlines_size_read, + .write = tracing_saved_cmdlines_size_write, +}; + +#ifdef CONFIG_TRACE_EVAL_MAP_FILE +static union trace_eval_map_item * +update_eval_map(union trace_eval_map_item *ptr) +{ + if (!ptr->map.eval_string) { + if (ptr->tail.next) { + ptr = ptr->tail.next; + /* Set ptr to the next real item (skip head) */ + ptr++; + } else + return NULL; + } + return ptr; +} + +static void *eval_map_next(struct seq_file *m, void *v, loff_t *pos) +{ + union trace_eval_map_item *ptr = v; + + /* + * Paranoid! If ptr points to end, we don't want to increment past it. + * This really should never happen. 
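+ * update_eval_map() returns NULL when it walks off the end of the + * list, and the WARN_ON_ONCE() below catches that case. 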
+ */ + (*pos)++; + ptr = update_eval_map(ptr); + if (WARN_ON_ONCE(!ptr)) + return NULL; + + ptr++; + ptr = update_eval_map(ptr); + + return ptr; +} + +static void *eval_map_start(struct seq_file *m, loff_t *pos) +{ + union trace_eval_map_item *v; + loff_t l = 0; + + mutex_lock(&trace_eval_mutex); + + v = trace_eval_maps; + if (v) + v++; + + while (v && l < *pos) { + v = eval_map_next(m, v, &l); + } + + return v; +} + +static void eval_map_stop(struct seq_file *m, void *v) +{ + mutex_unlock(&trace_eval_mutex); +} + +static int eval_map_show(struct seq_file *m, void *v) +{ + union trace_eval_map_item *ptr = v; + + seq_printf(m, "%s %ld (%s)\n", + ptr->map.eval_string, ptr->map.eval_value, + ptr->map.system); + + return 0; +} + +static const struct seq_operations tracing_eval_map_seq_ops = { + .start = eval_map_start, + .next = eval_map_next, + .stop = eval_map_stop, + .show = eval_map_show, +}; + +static int tracing_eval_map_open(struct inode *inode, struct file *filp) +{ + int ret; + + ret = tracing_check_open_get_tr(NULL); + if (ret) + return ret; + + return seq_open(filp, &tracing_eval_map_seq_ops); +} + +static const struct file_operations tracing_eval_map_fops = { + .open = tracing_eval_map_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static inline union trace_eval_map_item * +trace_eval_jmp_to_tail(union trace_eval_map_item *ptr) +{ + /* Return tail of array given the head */ + return ptr + ptr->head.length + 1; +} + +static void +trace_insert_eval_map_file(struct module *mod, struct trace_eval_map **start, + int len) +{ + struct trace_eval_map **stop; + struct trace_eval_map **map; + union trace_eval_map_item *map_array; + union trace_eval_map_item *ptr; + + stop = start + len; + + /* + * The trace_eval_maps contains the map plus a head and tail item, + * where the head holds the module and length of array, and the + * tail holds a pointer to the next list. 
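+ * That is why len + 2 entries are allocated below, with the final + * entry zeroed out to terminate the list. 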
+ */ + map_array = kmalloc_array(len + 2, sizeof(*map_array), GFP_KERNEL); + if (!map_array) { + pr_warn("Unable to allocate trace eval mapping\n"); + return; + } + + mutex_lock(&trace_eval_mutex); + + if (!trace_eval_maps) + trace_eval_maps = map_array; + else { + ptr = trace_eval_maps; + for (;;) { + ptr = trace_eval_jmp_to_tail(ptr); + if (!ptr->tail.next) + break; + ptr = ptr->tail.next; + + } + ptr->tail.next = map_array; + } + map_array->head.mod = mod; + map_array->head.length = len; + map_array++; + + for (map = start; (unsigned long)map < (unsigned long)stop; map++) { + map_array->map = **map; + map_array++; + } + memset(map_array, 0, sizeof(*map_array)); + + mutex_unlock(&trace_eval_mutex); +} + +static void trace_create_eval_file(struct dentry *d_tracer) +{ + trace_create_file("eval_map", TRACE_MODE_READ, d_tracer, + NULL, &tracing_eval_map_fops); +} + +#else /* CONFIG_TRACE_EVAL_MAP_FILE */ +static inline void trace_create_eval_file(struct dentry *d_tracer) { } +static inline void trace_insert_eval_map_file(struct module *mod, + struct trace_eval_map **start, int len) { } +#endif /* !CONFIG_TRACE_EVAL_MAP_FILE */ + +static void trace_insert_eval_map(struct module *mod, + struct trace_eval_map **start, int len) +{ + struct trace_eval_map **map; + + if (len <= 0) + return; + + map = start; + + trace_event_eval_update(map, len); + + trace_insert_eval_map_file(mod, start, len); +} + +static ssize_t +tracing_set_trace_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + char buf[MAX_TRACER_SIZE+2]; + int r; + + mutex_lock(&trace_types_lock); + r = sprintf(buf, "%s\n", tr->current_trace->name); + mutex_unlock(&trace_types_lock); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +int tracer_init(struct tracer *t, struct trace_array *tr) +{ + tracing_reset_online_cpus(&tr->array_buffer); + return t->init(tr); +} + +static void set_buffer_entries(struct array_buffer *buf, unsigned long val) +{ + int cpu; + + for_each_tracing_cpu(cpu) + per_cpu_ptr(buf->data, cpu)->entries = val; +} + +static void update_buffer_entries(struct array_buffer *buf, int cpu) +{ + if (cpu == RING_BUFFER_ALL_CPUS) { + set_buffer_entries(buf, ring_buffer_size(buf->buffer, 0)); + } else { + per_cpu_ptr(buf->data, cpu)->entries = ring_buffer_size(buf->buffer, cpu); + } +} + +#ifdef CONFIG_TRACER_MAX_TRACE +/* resize @tr's buffer to the size of @size_tr's entries */ +static int resize_buffer_duplicate_size(struct array_buffer *trace_buf, + struct array_buffer *size_buf, int cpu_id) +{ + int cpu, ret = 0; + + if (cpu_id == RING_BUFFER_ALL_CPUS) { + for_each_tracing_cpu(cpu) { + ret = ring_buffer_resize(trace_buf->buffer, + per_cpu_ptr(size_buf->data, cpu)->entries, cpu); + if (ret < 0) + break; + per_cpu_ptr(trace_buf->data, cpu)->entries = + per_cpu_ptr(size_buf->data, cpu)->entries; + } + } else { + ret = ring_buffer_resize(trace_buf->buffer, + per_cpu_ptr(size_buf->data, cpu_id)->entries, cpu_id); + if (ret == 0) + per_cpu_ptr(trace_buf->data, cpu_id)->entries = + per_cpu_ptr(size_buf->data, cpu_id)->entries; + } + + return ret; +} +#endif /* CONFIG_TRACER_MAX_TRACE */ + +static int __tracing_resize_ring_buffer(struct trace_array *tr, + unsigned long size, int cpu) +{ + int ret; + + /* + * If kernel or user changes the size of the ring buffer + * we use the size that was given, and we can forget about + * expanding it later. 
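+ * Setting ring_buffer_expanded records that, so tracing_update_buffers() + * will not grow the buffers to their default size later on. 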
+ */ + ring_buffer_expanded = true; + + /* May be called before buffers are initialized */ + if (!tr->array_buffer.buffer) + return 0; + + /* Do not allow tracing while resizing ring buffer */ + tracing_stop_tr(tr); + + ret = ring_buffer_resize(tr->array_buffer.buffer, size, cpu); + if (ret < 0) + goto out_start; + +#ifdef CONFIG_TRACER_MAX_TRACE + if (!tr->allocated_snapshot) + goto out; + + ret = ring_buffer_resize(tr->max_buffer.buffer, size, cpu); + if (ret < 0) { + int r = resize_buffer_duplicate_size(&tr->array_buffer, + &tr->array_buffer, cpu); + if (r < 0) { + /* + * AARGH! We are left with different + * size max buffer!!!! + * The max buffer is our "snapshot" buffer. + * When a tracer needs a snapshot (one of the + * latency tracers), it swaps the max buffer + * with the saved snap shot. We succeeded to + * update the size of the main buffer, but failed to + * update the size of the max buffer. But when we tried + * to reset the main buffer to the original size, we + * failed there too. This is very unlikely to + * happen, but if it does, warn and kill all + * tracing. + */ + WARN_ON(1); + tracing_disabled = 1; + } + goto out_start; + } + + update_buffer_entries(&tr->max_buffer, cpu); + + out: +#endif /* CONFIG_TRACER_MAX_TRACE */ + + update_buffer_entries(&tr->array_buffer, cpu); + out_start: + tracing_start_tr(tr); + return ret; +} + +ssize_t tracing_resize_ring_buffer(struct trace_array *tr, + unsigned long size, int cpu_id) +{ + int ret; + + mutex_lock(&trace_types_lock); + + if (cpu_id != RING_BUFFER_ALL_CPUS) { + /* make sure, this cpu is enabled in the mask */ + if (!cpumask_test_cpu(cpu_id, tracing_buffer_mask)) { + ret = -EINVAL; + goto out; + } + } + + ret = __tracing_resize_ring_buffer(tr, size, cpu_id); + if (ret < 0) + ret = -ENOMEM; + +out: + mutex_unlock(&trace_types_lock); + + return ret; +} + + +/** + * tracing_update_buffers - used by tracing facility to expand ring buffers + * + * To save on memory when the tracing is never used on a system with it + * configured in. The ring buffers are set to a minimum size. But once + * a user starts to use the tracing facility, then they need to grow + * to their default size. + * + * This function is to be called when a tracer is about to be used. + */ +int tracing_update_buffers(void) +{ + int ret = 0; + + mutex_lock(&trace_types_lock); + if (!ring_buffer_expanded) + ret = __tracing_resize_ring_buffer(&global_trace, trace_buf_size, + RING_BUFFER_ALL_CPUS); + mutex_unlock(&trace_types_lock); + + return ret; +} + +struct trace_option_dentry; + +static void +create_trace_option_files(struct trace_array *tr, struct tracer *tracer); + +/* + * Used to clear out the tracer before deletion of an instance. + * Must have trace_types_lock held. + */ +static void tracing_set_nop(struct trace_array *tr) +{ + if (tr->current_trace == &nop_trace) + return; + + tr->current_trace->enabled--; + + if (tr->current_trace->reset) + tr->current_trace->reset(tr); + + tr->current_trace = &nop_trace; +} + +static bool tracer_options_updated; + +static void add_tracer_options(struct trace_array *tr, struct tracer *t) +{ + /* Only enable if the directory has been created already. 
*/ + if (!tr->dir) + return; + + /* Only create trace option files after update_tracer_options finish */ + if (!tracer_options_updated) + return; + + create_trace_option_files(tr, t); +} + +int tracing_set_tracer(struct trace_array *tr, const char *buf) +{ + struct tracer *t; +#ifdef CONFIG_TRACER_MAX_TRACE + bool had_max_tr; +#endif + int ret = 0; + + mutex_lock(&trace_types_lock); + + if (!ring_buffer_expanded) { + ret = __tracing_resize_ring_buffer(tr, trace_buf_size, + RING_BUFFER_ALL_CPUS); + if (ret < 0) + goto out; + ret = 0; + } + + for (t = trace_types; t; t = t->next) { + if (strcmp(t->name, buf) == 0) + break; + } + if (!t) { + ret = -EINVAL; + goto out; + } + if (t == tr->current_trace) + goto out; + +#ifdef CONFIG_TRACER_SNAPSHOT + if (t->use_max_tr) { + local_irq_disable(); + arch_spin_lock(&tr->max_lock); + if (tr->cond_snapshot) + ret = -EBUSY; + arch_spin_unlock(&tr->max_lock); + local_irq_enable(); + if (ret) + goto out; + } +#endif + /* Some tracers won't work on kernel command line */ + if (system_state < SYSTEM_RUNNING && t->noboot) { + pr_warn("Tracer '%s' is not allowed on command line, ignored\n", + t->name); + goto out; + } + + /* Some tracers are only allowed for the top level buffer */ + if (!trace_ok_for_array(t, tr)) { + ret = -EINVAL; + goto out; + } + + /* If trace pipe files are being read, we can't change the tracer */ + if (tr->trace_ref) { + ret = -EBUSY; + goto out; + } + + trace_branch_disable(); + + tr->current_trace->enabled--; + + if (tr->current_trace->reset) + tr->current_trace->reset(tr); + +#ifdef CONFIG_TRACER_MAX_TRACE + had_max_tr = tr->current_trace->use_max_tr; + + /* Current trace needs to be nop_trace before synchronize_rcu */ + tr->current_trace = &nop_trace; + + if (had_max_tr && !t->use_max_tr) { + /* + * We need to make sure that the update_max_tr sees that + * current_trace changed to nop_trace to keep it from + * swapping the buffers after we resize it. + * The update_max_tr is called from interrupts disabled + * so a synchronized_sched() is sufficient. + */ + synchronize_rcu(); + free_snapshot(tr); + } + + if (t->use_max_tr && !tr->allocated_snapshot) { + ret = tracing_alloc_snapshot_instance(tr); + if (ret < 0) + goto out; + } +#else + tr->current_trace = &nop_trace; +#endif + + if (t->init) { + ret = tracer_init(t, tr); + if (ret) + goto out; + } + + tr->current_trace = t; + tr->current_trace->enabled++; + trace_branch_enable(tr); + out: + mutex_unlock(&trace_types_lock); + + return ret; +} + +static ssize_t +tracing_set_trace_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + char buf[MAX_TRACER_SIZE+1]; + char *name; + size_t ret; + int err; + + ret = cnt; + + if (cnt > MAX_TRACER_SIZE) + cnt = MAX_TRACER_SIZE; + + if (copy_from_user(buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + name = strim(buf); + + err = tracing_set_tracer(tr, name); + if (err) + return err; + + *ppos += ret; + + return ret; +} + +static ssize_t +tracing_nsecs_read(unsigned long *ptr, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + int r; + + r = snprintf(buf, sizeof(buf), "%ld\n", + *ptr == (unsigned long)-1 ? 
-1 : nsecs_to_usecs(*ptr)); + if (r > sizeof(buf)) + r = sizeof(buf); + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t +tracing_nsecs_write(unsigned long *ptr, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + unsigned long val; + int ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + *ptr = val * 1000; + + return cnt; +} + +static ssize_t +tracing_thresh_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return tracing_nsecs_read(&tracing_thresh, ubuf, cnt, ppos); +} + +static ssize_t +tracing_thresh_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + int ret; + + mutex_lock(&trace_types_lock); + ret = tracing_nsecs_write(&tracing_thresh, ubuf, cnt, ppos); + if (ret < 0) + goto out; + + if (tr->current_trace->update_thresh) { + ret = tr->current_trace->update_thresh(tr); + if (ret < 0) + goto out; + } + + ret = cnt; +out: + mutex_unlock(&trace_types_lock); + + return ret; +} + +#ifdef CONFIG_TRACER_MAX_TRACE + +static ssize_t +tracing_max_lat_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + + return tracing_nsecs_read(&tr->max_latency, ubuf, cnt, ppos); +} + +static ssize_t +tracing_max_lat_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + + return tracing_nsecs_write(&tr->max_latency, ubuf, cnt, ppos); +} + +#endif + +static int open_pipe_on_cpu(struct trace_array *tr, int cpu) +{ + if (cpu == RING_BUFFER_ALL_CPUS) { + if (cpumask_empty(tr->pipe_cpumask)) { + cpumask_setall(tr->pipe_cpumask); + return 0; + } + } else if (!cpumask_test_cpu(cpu, tr->pipe_cpumask)) { + cpumask_set_cpu(cpu, tr->pipe_cpumask); + return 0; + } + return -EBUSY; +} + +static void close_pipe_on_cpu(struct trace_array *tr, int cpu) +{ + if (cpu == RING_BUFFER_ALL_CPUS) { + WARN_ON(!cpumask_full(tr->pipe_cpumask)); + cpumask_clear(tr->pipe_cpumask); + } else { + WARN_ON(!cpumask_test_cpu(cpu, tr->pipe_cpumask)); + cpumask_clear_cpu(cpu, tr->pipe_cpumask); + } +} + +static int tracing_open_pipe(struct inode *inode, struct file *filp) +{ + struct trace_array *tr = inode->i_private; + struct trace_iterator *iter; + int cpu; + int ret; + + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; + + mutex_lock(&trace_types_lock); + cpu = tracing_get_cpu(inode); + ret = open_pipe_on_cpu(tr, cpu); + if (ret) + goto fail_pipe_on_cpu; + + /* create a buffer to store the information to pass to userspace */ + iter = kzalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) { + ret = -ENOMEM; + goto fail_alloc_iter; + } + + trace_seq_init(&iter->seq); + iter->trace = tr->current_trace; + + if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) { + ret = -ENOMEM; + goto fail; + } + + /* trace pipe does not show start of buffer */ + cpumask_setall(iter->started); + + if (tr->trace_flags & TRACE_ITER_LATENCY_FMT) + iter->iter_flags |= TRACE_FILE_LAT_FMT; + + /* Output in nanoseconds only if we are using a clock in nanoseconds. 
*/ + if (trace_clocks[tr->clock_id].in_ns) + iter->iter_flags |= TRACE_FILE_TIME_IN_NS; + + iter->tr = tr; + iter->array_buffer = &tr->array_buffer; + iter->cpu_file = cpu; + mutex_init(&iter->mutex); + filp->private_data = iter; + + if (iter->trace->pipe_open) + iter->trace->pipe_open(iter); + + nonseekable_open(inode, filp); + + tr->trace_ref++; + + mutex_unlock(&trace_types_lock); + return ret; + +fail: + kfree(iter); +fail_alloc_iter: + close_pipe_on_cpu(tr, cpu); +fail_pipe_on_cpu: + __trace_array_put(tr); + mutex_unlock(&trace_types_lock); + return ret; +} + +static int tracing_release_pipe(struct inode *inode, struct file *file) +{ + struct trace_iterator *iter = file->private_data; + struct trace_array *tr = inode->i_private; + + mutex_lock(&trace_types_lock); + + tr->trace_ref--; + + if (iter->trace->pipe_close) + iter->trace->pipe_close(iter); + close_pipe_on_cpu(tr, iter->cpu_file); + mutex_unlock(&trace_types_lock); + + free_trace_iter_content(iter); + kfree(iter); + + trace_array_put(tr); + + return 0; +} + +static __poll_t +trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_table) +{ + struct trace_array *tr = iter->tr; + + /* Iterators are static, they should be filled or empty */ + if (trace_buffer_iter(iter, iter->cpu_file)) + return EPOLLIN | EPOLLRDNORM; + + if (tr->trace_flags & TRACE_ITER_BLOCK) + /* + * Always select as readable when in blocking mode + */ + return EPOLLIN | EPOLLRDNORM; + else + return ring_buffer_poll_wait(iter->array_buffer->buffer, iter->cpu_file, + filp, poll_table, iter->tr->buffer_percent); +} + +static __poll_t +tracing_poll_pipe(struct file *filp, poll_table *poll_table) +{ + struct trace_iterator *iter = filp->private_data; + + return trace_poll(iter, filp, poll_table); +} + +/* Must be called with iter->mutex held. */ +static int tracing_wait_pipe(struct file *filp) +{ + struct trace_iterator *iter = filp->private_data; + int ret; + + while (trace_empty(iter)) { + + if ((filp->f_flags & O_NONBLOCK)) { + return -EAGAIN; + } + + /* + * We block until we read something and tracing is disabled. + * We still block if tracing is disabled, but we have never + * read anything. This allows a user to cat this file, and + * then enable tracing. But after we have read something, + * we give an EOF when tracing is again disabled. + * + * iter->pos will be 0 if we haven't read anything. + */ + if (!tracer_tracing_is_on(iter->tr) && iter->pos) + break; + + mutex_unlock(&iter->mutex); + + ret = wait_on_pipe(iter, 0); + + mutex_lock(&iter->mutex); + + if (ret) + return ret; + } + + return 1; +} + +/* + * Consumer reader. + */ +static ssize_t +tracing_read_pipe(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_iterator *iter = filp->private_data; + ssize_t sret; + + /* + * Avoid more than one consumer on a single file descriptor + * This is just a matter of traces coherency, the ring buffer itself + * is protected. 
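A minimal user-space consumer for the trace_pipe interface implemented by tracing_open_pipe(), tracing_wait_pipe() and the tracing_read_pipe() handler that follows might look like the sketch below. It is illustrative only and assumes tracefs at /sys/kernel/tracing; without O_NONBLOCK the read blocks until data arrives, with O_NONBLOCK an empty buffer returns -EAGAIN.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/*
	 * Only one consumer per CPU (or one "all CPUs" consumer) is allowed,
	 * see open_pipe_on_cpu(); a second open returns -EBUSY.
	 */
	int fd = open("/sys/kernel/tracing/trace_pipe", O_RDONLY);
	char buf[4096];
	ssize_t n;

	if (fd < 0)
		return 1;

	/* Blocking, consuming read: entries are removed as they are read */
	while ((n = read(fd, buf, sizeof(buf))) > 0)
		fwrite(buf, 1, n, stdout);

	close(fd);
	return 0;
}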
+ */ + mutex_lock(&iter->mutex); + + /* return any leftover data */ + sret = trace_seq_to_user(&iter->seq, ubuf, cnt); + if (sret != -EBUSY) + goto out; + + trace_seq_init(&iter->seq); + + if (iter->trace->read) { + sret = iter->trace->read(iter, filp, ubuf, cnt, ppos); + if (sret) + goto out; + } + +waitagain: + sret = tracing_wait_pipe(filp); + if (sret <= 0) + goto out; + + /* stop when tracing is finished */ + if (trace_empty(iter)) { + sret = 0; + goto out; + } + + if (cnt >= PAGE_SIZE) + cnt = PAGE_SIZE - 1; + + /* reset all but tr, trace, and overruns */ + trace_iterator_reset(iter); + cpumask_clear(iter->started); + trace_seq_init(&iter->seq); + + trace_event_read_lock(); + trace_access_lock(iter->cpu_file); + while (trace_find_next_entry_inc(iter) != NULL) { + enum print_line_t ret; + int save_len = iter->seq.seq.len; + + ret = print_trace_line(iter); + if (ret == TRACE_TYPE_PARTIAL_LINE) { + /* + * If one print_trace_line() fills entire trace_seq in one shot, + * trace_seq_to_user() will returns -EBUSY because save_len == 0, + * In this case, we need to consume it, otherwise, loop will peek + * this event next time, resulting in an infinite loop. + */ + if (save_len == 0) { + iter->seq.full = 0; + trace_seq_puts(&iter->seq, "[LINE TOO BIG]\n"); + trace_consume(iter); + break; + } + + /* In other cases, don't print partial lines */ + iter->seq.seq.len = save_len; + break; + } + if (ret != TRACE_TYPE_NO_CONSUME) + trace_consume(iter); + + if (trace_seq_used(&iter->seq) >= cnt) + break; + + /* + * Setting the full flag means we reached the trace_seq buffer + * size and we should leave by partial output condition above. + * One of the trace_seq_* functions is not used properly. + */ + WARN_ONCE(iter->seq.full, "full flag set for trace type %d", + iter->ent->type); + } + trace_access_unlock(iter->cpu_file); + trace_event_read_unlock(); + + /* Now copy what we have to the user */ + sret = trace_seq_to_user(&iter->seq, ubuf, cnt); + if (iter->seq.seq.readpos >= trace_seq_used(&iter->seq)) + trace_seq_init(&iter->seq); + + /* + * If there was nothing to send to user, in spite of consuming trace + * entries, go back to wait for more entries. + */ + if (sret == -EBUSY) + goto waitagain; + +out: + mutex_unlock(&iter->mutex); + + return sret; +} + +static void tracing_spd_release_pipe(struct splice_pipe_desc *spd, + unsigned int idx) +{ + __free_page(spd->pages[idx]); +} + +static size_t +tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter) +{ + size_t count; + int save_len; + int ret; + + /* Seq buffer is page-sized, exactly what we need. */ + for (;;) { + save_len = iter->seq.seq.len; + ret = print_trace_line(iter); + + if (trace_seq_has_overflowed(&iter->seq)) { + iter->seq.seq.len = save_len; + break; + } + + /* + * This should not be hit, because it should only + * be set if the iter->seq overflowed. But check it + * anyway to be safe. 
+ */ + if (ret == TRACE_TYPE_PARTIAL_LINE) { + iter->seq.seq.len = save_len; + break; + } + + count = trace_seq_used(&iter->seq) - save_len; + if (rem < count) { + rem = 0; + iter->seq.seq.len = save_len; + break; + } + + if (ret != TRACE_TYPE_NO_CONSUME) + trace_consume(iter); + rem -= count; + if (!trace_find_next_entry_inc(iter)) { + rem = 0; + iter->ent = NULL; + break; + } + } + + return rem; +} + +static ssize_t tracing_splice_read_pipe(struct file *filp, + loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, + unsigned int flags) +{ + struct page *pages_def[PIPE_DEF_BUFFERS]; + struct partial_page partial_def[PIPE_DEF_BUFFERS]; + struct trace_iterator *iter = filp->private_data; + struct splice_pipe_desc spd = { + .pages = pages_def, + .partial = partial_def, + .nr_pages = 0, /* This gets updated below. */ + .nr_pages_max = PIPE_DEF_BUFFERS, + .ops = &default_pipe_buf_ops, + .spd_release = tracing_spd_release_pipe, + }; + ssize_t ret; + size_t rem; + unsigned int i; + + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; + + mutex_lock(&iter->mutex); + + if (iter->trace->splice_read) { + ret = iter->trace->splice_read(iter, filp, + ppos, pipe, len, flags); + if (ret) + goto out_err; + } + + ret = tracing_wait_pipe(filp); + if (ret <= 0) + goto out_err; + + if (!iter->ent && !trace_find_next_entry_inc(iter)) { + ret = -EFAULT; + goto out_err; + } + + trace_event_read_lock(); + trace_access_lock(iter->cpu_file); + + /* Fill as many pages as possible. */ + for (i = 0, rem = len; i < spd.nr_pages_max && rem; i++) { + spd.pages[i] = alloc_page(GFP_KERNEL); + if (!spd.pages[i]) + break; + + rem = tracing_fill_pipe_page(rem, iter); + + /* Copy the data into the page, so we can start over. */ + ret = trace_seq_to_buffer(&iter->seq, + page_address(spd.pages[i]), + trace_seq_used(&iter->seq)); + if (ret < 0) { + __free_page(spd.pages[i]); + break; + } + spd.partial[i].offset = 0; + spd.partial[i].len = trace_seq_used(&iter->seq); + + trace_seq_init(&iter->seq); + } + + trace_access_unlock(iter->cpu_file); + trace_event_read_unlock(); + mutex_unlock(&iter->mutex); + + spd.nr_pages = i; + + if (i) + ret = splice_to_pipe(pipe, &spd); + else + ret = 0; +out: + splice_shrink_spd(&spd); + return ret; + +out_err: + mutex_unlock(&iter->mutex); + goto out; +} + +static ssize_t +tracing_entries_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct inode *inode = file_inode(filp); + struct trace_array *tr = inode->i_private; + int cpu = tracing_get_cpu(inode); + char buf[64]; + int r = 0; + ssize_t ret; + + mutex_lock(&trace_types_lock); + + if (cpu == RING_BUFFER_ALL_CPUS) { + int cpu, buf_size_same; + unsigned long size; + + size = 0; + buf_size_same = 1; + /* check if all cpu sizes are same */ + for_each_tracing_cpu(cpu) { + /* fill in the size from first enabled cpu */ + if (size == 0) + size = per_cpu_ptr(tr->array_buffer.data, cpu)->entries; + if (size != per_cpu_ptr(tr->array_buffer.data, cpu)->entries) { + buf_size_same = 0; + break; + } + } + + if (buf_size_same) { + if (!ring_buffer_expanded) + r = sprintf(buf, "%lu (expanded: %lu)\n", + size >> 10, + trace_buf_size >> 10); + else + r = sprintf(buf, "%lu\n", size >> 10); + } else + r = sprintf(buf, "X\n"); + } else + r = sprintf(buf, "%lu\n", per_cpu_ptr(tr->array_buffer.data, cpu)->entries >> 10); + + mutex_unlock(&trace_types_lock); + + ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); + return ret; +} + +static ssize_t +tracing_entries_write(struct file *filp, const char __user *ubuf, + size_t cnt, 
loff_t *ppos) +{ + struct inode *inode = file_inode(filp); + struct trace_array *tr = inode->i_private; + unsigned long val; + int ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + /* must have at least 1 entry */ + if (!val) + return -EINVAL; + + /* value is in KB */ + val <<= 10; + ret = tracing_resize_ring_buffer(tr, val, tracing_get_cpu(inode)); + if (ret < 0) + return ret; + + *ppos += cnt; + + return cnt; +} + +static ssize_t +tracing_total_entries_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + char buf[64]; + int r, cpu; + unsigned long size = 0, expanded_size = 0; + + mutex_lock(&trace_types_lock); + for_each_tracing_cpu(cpu) { + size += per_cpu_ptr(tr->array_buffer.data, cpu)->entries >> 10; + if (!ring_buffer_expanded) + expanded_size += trace_buf_size >> 10; + } + if (ring_buffer_expanded) + r = sprintf(buf, "%lu\n", size); + else + r = sprintf(buf, "%lu (expanded: %lu)\n", size, expanded_size); + mutex_unlock(&trace_types_lock); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t +tracing_free_buffer_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + /* + * There is no need to read what the user has written, this function + * is just to make sure that there is no error when "echo" is used + */ + + *ppos += cnt; + + return cnt; +} + +static int +tracing_free_buffer_release(struct inode *inode, struct file *filp) +{ + struct trace_array *tr = inode->i_private; + + /* disable tracing ? */ + if (tr->trace_flags & TRACE_ITER_STOP_ON_FREE) + tracer_tracing_off(tr); + /* resize the ring buffer to 0 */ + tracing_resize_ring_buffer(tr, 0, RING_BUFFER_ALL_CPUS); + + trace_array_put(tr); + + return 0; +} + +static ssize_t +tracing_mark_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *fpos) +{ + struct trace_array *tr = filp->private_data; + struct ring_buffer_event *event; + enum event_trigger_type tt = ETT_NONE; + struct trace_buffer *buffer; + struct print_entry *entry; + ssize_t written; + int size; + int len; + +/* Used in tracing_mark_raw_write() as well */ +#define FAULTED_STR "<faulted>" +#define FAULTED_SIZE (sizeof(FAULTED_STR) - 1) /* '\0' is already accounted for */ + + if (tracing_disabled) + return -EINVAL; + + if (!(tr->trace_flags & TRACE_ITER_MARKERS)) + return -EINVAL; + + if (cnt > TRACE_BUF_SIZE) + cnt = TRACE_BUF_SIZE; + + BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE); + + size = sizeof(*entry) + cnt + 2; /* add '\0' and possible '\n' */ + + /* If less than "<faulted>", then make sure we can still add that */ + if (cnt < FAULTED_SIZE) + size += FAULTED_SIZE - cnt; + + buffer = tr->array_buffer.buffer; + event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, + tracing_gen_ctx()); + if (unlikely(!event)) + /* Ring buffer disabled, return as if not open for write */ + return -EBADF; + + entry = ring_buffer_event_data(event); + entry->ip = _THIS_IP_; + + len = __copy_from_user_inatomic(&entry->buf, ubuf, cnt); + if (len) { + memcpy(&entry->buf, FAULTED_STR, FAULTED_SIZE); + cnt = FAULTED_SIZE; + written = -EFAULT; + } else + written = cnt; + + if (tr->trace_marker_file && !list_empty(&tr->trace_marker_file->triggers)) { + /* do not add \n before testing triggers, but add \0 */ + entry->buf[cnt] = '\0'; + tt = event_triggers_call(tr->trace_marker_file, buffer, entry, event); + } + + if (entry->buf[cnt - 1] != '\n') { + entry->buf[cnt] = '\n'; + entry->buf[cnt + 1] = '\0'; + } 
else + entry->buf[cnt] = '\0'; + + if (static_branch_unlikely(&trace_marker_exports_enabled)) + ftrace_exports(event, TRACE_EXPORT_MARKER); + __buffer_unlock_commit(buffer, event); + + if (tt) + event_triggers_post_call(tr->trace_marker_file, tt); + + return written; +} + +/* Limit it for now to 3K (including tag) */ +#define RAW_DATA_MAX_SIZE (1024*3) + +static ssize_t +tracing_mark_raw_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *fpos) +{ + struct trace_array *tr = filp->private_data; + struct ring_buffer_event *event; + struct trace_buffer *buffer; + struct raw_data_entry *entry; + ssize_t written; + int size; + int len; + +#define FAULT_SIZE_ID (FAULTED_SIZE + sizeof(int)) + + if (tracing_disabled) + return -EINVAL; + + if (!(tr->trace_flags & TRACE_ITER_MARKERS)) + return -EINVAL; + + /* The marker must at least have a tag id */ + if (cnt < sizeof(unsigned int) || cnt > RAW_DATA_MAX_SIZE) + return -EINVAL; + + if (cnt > TRACE_BUF_SIZE) + cnt = TRACE_BUF_SIZE; + + BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE); + + size = sizeof(*entry) + cnt; + if (cnt < FAULT_SIZE_ID) + size += FAULT_SIZE_ID - cnt; + + buffer = tr->array_buffer.buffer; + event = __trace_buffer_lock_reserve(buffer, TRACE_RAW_DATA, size, + tracing_gen_ctx()); + if (!event) + /* Ring buffer disabled, return as if not open for write */ + return -EBADF; + + entry = ring_buffer_event_data(event); + + len = __copy_from_user_inatomic(&entry->id, ubuf, cnt); + if (len) { + entry->id = -1; + memcpy(&entry->buf, FAULTED_STR, FAULTED_SIZE); + written = -EFAULT; + } else + written = cnt; + + __buffer_unlock_commit(buffer, event); + + return written; +} + +static int tracing_clock_show(struct seq_file *m, void *v) +{ + struct trace_array *tr = m->private; + int i; + + for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) + seq_printf(m, + "%s%s%s%s", i ? " " : "", + i == tr->clock_id ? "[" : "", trace_clocks[i].name, + i == tr->clock_id ? "]" : ""); + seq_putc(m, '\n'); + + return 0; +} + +int tracing_set_clock(struct trace_array *tr, const char *clockstr) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) { + if (strcmp(trace_clocks[i].name, clockstr) == 0) + break; + } + if (i == ARRAY_SIZE(trace_clocks)) + return -EINVAL; + + mutex_lock(&trace_types_lock); + + tr->clock_id = i; + + ring_buffer_set_clock(tr->array_buffer.buffer, trace_clocks[i].func); + + /* + * New clock may not be consistent with the previous clock. + * Reset the buffer so that it doesn't have incomparable timestamps. 
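Looking back at tracing_mark_write() and tracing_mark_raw_write() just above: user space can inject annotations into the trace by writing text to trace_marker, or a binary blob that starts with a tag id to trace_marker_raw. The sketch below is illustrative only; the paths, tag id and payload are invented.

#include <fcntl.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/sys/kernel/tracing/trace_marker", O_WRONLY);
	int raw = open("/sys/kernel/tracing/trace_marker_raw", O_WRONLY);

	if (fd >= 0) {
		const char *msg = "hello from user space\n";

		write(fd, msg, strlen(msg));	/* recorded as a TRACE_PRINT event */
		close(fd);
	}

	if (raw >= 0) {
		struct {
			unsigned int id;	/* tag id, required by tracing_mark_raw_write() */
			unsigned int payload;
		} rec = { .id = 42, .payload = 0xdeadbeef };

		write(raw, &rec, sizeof(rec));	/* recorded as a TRACE_RAW_DATA event */
		close(raw);
	}
	return 0;
}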
+ */ + tracing_reset_online_cpus(&tr->array_buffer); + +#ifdef CONFIG_TRACER_MAX_TRACE + if (tr->max_buffer.buffer) + ring_buffer_set_clock(tr->max_buffer.buffer, trace_clocks[i].func); + tracing_reset_online_cpus(&tr->max_buffer); +#endif + + mutex_unlock(&trace_types_lock); + + return 0; +} + +static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *fpos) +{ + struct seq_file *m = filp->private_data; + struct trace_array *tr = m->private; + char buf[64]; + const char *clockstr; + int ret; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + clockstr = strstrip(buf); + + ret = tracing_set_clock(tr, clockstr); + if (ret) + return ret; + + *fpos += cnt; + + return cnt; +} + +static int tracing_clock_open(struct inode *inode, struct file *file) +{ + struct trace_array *tr = inode->i_private; + int ret; + + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; + + ret = single_open(file, tracing_clock_show, inode->i_private); + if (ret < 0) + trace_array_put(tr); + + return ret; +} + +static int tracing_time_stamp_mode_show(struct seq_file *m, void *v) +{ + struct trace_array *tr = m->private; + + mutex_lock(&trace_types_lock); + + if (ring_buffer_time_stamp_abs(tr->array_buffer.buffer)) + seq_puts(m, "delta [absolute]\n"); + else + seq_puts(m, "[delta] absolute\n"); + + mutex_unlock(&trace_types_lock); + + return 0; +} + +static int tracing_time_stamp_mode_open(struct inode *inode, struct file *file) +{ + struct trace_array *tr = inode->i_private; + int ret; + + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; + + ret = single_open(file, tracing_time_stamp_mode_show, inode->i_private); + if (ret < 0) + trace_array_put(tr); + + return ret; +} + +u64 tracing_event_time_stamp(struct trace_buffer *buffer, struct ring_buffer_event *rbe) +{ + if (rbe == this_cpu_read(trace_buffered_event)) + return ring_buffer_time_stamp(buffer); + + return ring_buffer_event_time_stamp(buffer, rbe); +} + +/* + * Set or disable using the per CPU trace_buffer_event when possible. 
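As a user-space illustration of tracing_set_clock()/tracing_clock_write() above: reading trace_clock lists the available clocks with the current one in brackets, and writing a name switches clocks, which also resets the existing buffer contents as the comment notes. The path and the "mono" clock name are assumptions based on the usual trace_clocks table.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/sys/kernel/tracing/trace_clock", O_RDWR);
	char buf[256];
	ssize_t n;

	if (fd < 0)
		return 1;

	n = read(fd, buf, sizeof(buf) - 1);	/* e.g. "[local] global counter uptime ..." */
	if (n > 0) {
		buf[n] = '\0';
		printf("%s", buf);
	}

	/* Switching clocks resets the ring buffer so timestamps stay comparable */
	write(fd, "mono", 4);

	close(fd);
	return 0;
}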
+ */ +int tracing_set_filter_buffering(struct trace_array *tr, bool set) +{ + int ret = 0; + + mutex_lock(&trace_types_lock); + + if (set && tr->no_filter_buffering_ref++) + goto out; + + if (!set) { + if (WARN_ON_ONCE(!tr->no_filter_buffering_ref)) { + ret = -EINVAL; + goto out; + } + + --tr->no_filter_buffering_ref; + } + out: + mutex_unlock(&trace_types_lock); + + return ret; +} + +struct ftrace_buffer_info { + struct trace_iterator iter; + void *spare; + unsigned int spare_cpu; + unsigned int read; +}; + +#ifdef CONFIG_TRACER_SNAPSHOT +static int tracing_snapshot_open(struct inode *inode, struct file *file) +{ + struct trace_array *tr = inode->i_private; + struct trace_iterator *iter; + struct seq_file *m; + int ret; + + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; + + if (file->f_mode & FMODE_READ) { + iter = __tracing_open(inode, file, true); + if (IS_ERR(iter)) + ret = PTR_ERR(iter); + } else { + /* Writes still need the seq_file to hold the private data */ + ret = -ENOMEM; + m = kzalloc(sizeof(*m), GFP_KERNEL); + if (!m) + goto out; + iter = kzalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) { + kfree(m); + goto out; + } + ret = 0; + + iter->tr = tr; + iter->array_buffer = &tr->max_buffer; + iter->cpu_file = tracing_get_cpu(inode); + m->private = iter; + file->private_data = m; + } +out: + if (ret < 0) + trace_array_put(tr); + + return ret; +} + +static void tracing_swap_cpu_buffer(void *tr) +{ + update_max_tr_single((struct trace_array *)tr, current, smp_processor_id()); +} + +static ssize_t +tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct seq_file *m = filp->private_data; + struct trace_iterator *iter = m->private; + struct trace_array *tr = iter->tr; + unsigned long val; + int ret; + + ret = tracing_update_buffers(); + if (ret < 0) + return ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + mutex_lock(&trace_types_lock); + + if (tr->current_trace->use_max_tr) { + ret = -EBUSY; + goto out; + } + + local_irq_disable(); + arch_spin_lock(&tr->max_lock); + if (tr->cond_snapshot) + ret = -EBUSY; + arch_spin_unlock(&tr->max_lock); + local_irq_enable(); + if (ret) + goto out; + + switch (val) { + case 0: + if (iter->cpu_file != RING_BUFFER_ALL_CPUS) { + ret = -EINVAL; + break; + } + if (tr->allocated_snapshot) + free_snapshot(tr); + break; + case 1: +/* Only allow per-cpu swap if the ring buffer supports it */ +#ifndef CONFIG_RING_BUFFER_ALLOW_SWAP + if (iter->cpu_file != RING_BUFFER_ALL_CPUS) { + ret = -EINVAL; + break; + } +#endif + if (tr->allocated_snapshot) + ret = resize_buffer_duplicate_size(&tr->max_buffer, + &tr->array_buffer, iter->cpu_file); + else + ret = tracing_alloc_snapshot_instance(tr); + if (ret < 0) + break; + /* Now, we're going to swap */ + if (iter->cpu_file == RING_BUFFER_ALL_CPUS) { + local_irq_disable(); + update_max_tr(tr, current, smp_processor_id(), NULL); + local_irq_enable(); + } else { + smp_call_function_single(iter->cpu_file, tracing_swap_cpu_buffer, + (void *)tr, 1); + } + break; + default: + if (tr->allocated_snapshot) { + if (iter->cpu_file == RING_BUFFER_ALL_CPUS) + tracing_reset_online_cpus(&tr->max_buffer); + else + tracing_reset_cpu(&tr->max_buffer, iter->cpu_file); + } + break; + } + + if (ret >= 0) { + *ppos += cnt; + ret = cnt; + } +out: + mutex_unlock(&trace_types_lock); + return ret; +} + +static int tracing_snapshot_release(struct inode *inode, struct file *file) +{ + struct seq_file *m = file->private_data; + int ret; + + ret = 
tracing_release(inode, file); + + if (file->f_mode & FMODE_READ) + return ret; + + /* If write only, the seq_file is just a stub */ + if (m) + kfree(m->private); + kfree(m); + + return 0; +} + +static int tracing_buffers_open(struct inode *inode, struct file *filp); +static ssize_t tracing_buffers_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *ppos); +static int tracing_buffers_release(struct inode *inode, struct file *file); +static ssize_t tracing_buffers_splice_read(struct file *file, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, unsigned int flags); + +static int snapshot_raw_open(struct inode *inode, struct file *filp) +{ + struct ftrace_buffer_info *info; + int ret; + + /* The following checks for tracefs lockdown */ + ret = tracing_buffers_open(inode, filp); + if (ret < 0) + return ret; + + info = filp->private_data; + + if (info->iter.trace->use_max_tr) { + tracing_buffers_release(inode, filp); + return -EBUSY; + } + + info->iter.snapshot = true; + info->iter.array_buffer = &info->iter.tr->max_buffer; + + return ret; +} + +#endif /* CONFIG_TRACER_SNAPSHOT */ + + +static const struct file_operations tracing_thresh_fops = { + .open = tracing_open_generic, + .read = tracing_thresh_read, + .write = tracing_thresh_write, + .llseek = generic_file_llseek, +}; + +#ifdef CONFIG_TRACER_MAX_TRACE +static const struct file_operations tracing_max_lat_fops = { + .open = tracing_open_generic_tr, + .read = tracing_max_lat_read, + .write = tracing_max_lat_write, + .llseek = generic_file_llseek, + .release = tracing_release_generic_tr, +}; +#endif + +static const struct file_operations set_tracer_fops = { + .open = tracing_open_generic_tr, + .read = tracing_set_trace_read, + .write = tracing_set_trace_write, + .llseek = generic_file_llseek, + .release = tracing_release_generic_tr, +}; + +static const struct file_operations tracing_pipe_fops = { + .open = tracing_open_pipe, + .poll = tracing_poll_pipe, + .read = tracing_read_pipe, + .splice_read = tracing_splice_read_pipe, + .release = tracing_release_pipe, + .llseek = no_llseek, +}; + +static const struct file_operations tracing_entries_fops = { + .open = tracing_open_generic_tr, + .read = tracing_entries_read, + .write = tracing_entries_write, + .llseek = generic_file_llseek, + .release = tracing_release_generic_tr, +}; + +static const struct file_operations tracing_total_entries_fops = { + .open = tracing_open_generic_tr, + .read = tracing_total_entries_read, + .llseek = generic_file_llseek, + .release = tracing_release_generic_tr, +}; + +static const struct file_operations tracing_free_buffer_fops = { + .open = tracing_open_generic_tr, + .write = tracing_free_buffer_write, + .release = tracing_free_buffer_release, +}; + +static const struct file_operations tracing_mark_fops = { + .open = tracing_mark_open, + .write = tracing_mark_write, + .release = tracing_release_generic_tr, +}; + +static const struct file_operations tracing_mark_raw_fops = { + .open = tracing_mark_open, + .write = tracing_mark_raw_write, + .release = tracing_release_generic_tr, +}; + +static const struct file_operations trace_clock_fops = { + .open = tracing_clock_open, + .read = seq_read, + .llseek = seq_lseek, + .release = tracing_single_release_tr, + .write = tracing_clock_write, +}; + +static const struct file_operations trace_time_stamp_mode_fops = { + .open = tracing_time_stamp_mode_open, + .read = seq_read, + .llseek = seq_lseek, + .release = tracing_single_release_tr, +}; + +#ifdef CONFIG_TRACER_SNAPSHOT +static const struct 
file_operations snapshot_fops = { + .open = tracing_snapshot_open, + .read = seq_read, + .write = tracing_snapshot_write, + .llseek = tracing_lseek, + .release = tracing_snapshot_release, +}; + +static const struct file_operations snapshot_raw_fops = { + .open = snapshot_raw_open, + .read = tracing_buffers_read, + .release = tracing_buffers_release, + .splice_read = tracing_buffers_splice_read, + .llseek = no_llseek, +}; + +#endif /* CONFIG_TRACER_SNAPSHOT */ + +/* + * trace_min_max_write - Write a u64 value to a trace_min_max_param struct + * @filp: The active open file structure + * @ubuf: The userspace provided buffer to read value into + * @cnt: The maximum number of bytes to read + * @ppos: The current "file" position + * + * This function implements the write interface for a struct trace_min_max_param. + * The filp->private_data must point to a trace_min_max_param structure that + * defines where to write the value, the min and the max acceptable values, + * and a lock to protect the write. + */ +static ssize_t +trace_min_max_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) +{ + struct trace_min_max_param *param = filp->private_data; + u64 val; + int err; + + if (!param) + return -EFAULT; + + err = kstrtoull_from_user(ubuf, cnt, 10, &val); + if (err) + return err; + + if (param->lock) + mutex_lock(param->lock); + + if (param->min && val < *param->min) + err = -EINVAL; + + if (param->max && val > *param->max) + err = -EINVAL; + + if (!err) + *param->val = val; + + if (param->lock) + mutex_unlock(param->lock); + + if (err) + return err; + + return cnt; +} + +/* + * trace_min_max_read - Read a u64 value from a trace_min_max_param struct + * @filp: The active open file structure + * @ubuf: The userspace provided buffer to read value into + * @cnt: The maximum number of bytes to read + * @ppos: The current "file" position + * + * This function implements the read interface for a struct trace_min_max_param. + * The filp->private_data must point to a trace_min_max_param struct with valid + * data. 
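A hedged in-kernel sketch of how trace_min_max_write() above and the trace_min_max_read()/trace_min_max_fops that follow are meant to be wired up, based only on the field accesses visible in these handlers. The knob name, bounds and helper are invented; the sketch assumes it lives in kernel/trace/ where "trace.h" declares struct trace_min_max_param, trace_create_file() and trace_min_max_fops.

/* Illustrative only: a clamped u64 knob exposed through trace_min_max_fops */
static u64 example_val = 50;
static u64 example_min = 1;
static u64 example_max = 100;
static DEFINE_MUTEX(example_lock);

static struct trace_min_max_param example_param = {
	.lock	= &example_lock,
	.val	= &example_val,
	.min	= &example_min,
	.max	= &example_max,
};

static void example_create_file(struct dentry *parent)
{
	/* Writes outside [1, 100] are rejected with -EINVAL by trace_min_max_write() */
	trace_create_file("example_knob", TRACE_MODE_WRITE, parent,
			  &example_param, &trace_min_max_fops);
}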
+ */ +static ssize_t +trace_min_max_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) +{ + struct trace_min_max_param *param = filp->private_data; + char buf[U64_STR_SIZE]; + int len; + u64 val; + + if (!param) + return -EFAULT; + + val = *param->val; + + if (cnt > sizeof(buf)) + cnt = sizeof(buf); + + len = snprintf(buf, sizeof(buf), "%llu\n", val); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, len); +} + +const struct file_operations trace_min_max_fops = { + .open = tracing_open_generic, + .read = trace_min_max_read, + .write = trace_min_max_write, +}; + +#define TRACING_LOG_ERRS_MAX 8 +#define TRACING_LOG_LOC_MAX 128 + +#define CMD_PREFIX " Command: " + +struct err_info { + const char **errs; /* ptr to loc-specific array of err strings */ + u8 type; /* index into errs -> specific err string */ + u16 pos; /* caret position */ + u64 ts; +}; + +struct tracing_log_err { + struct list_head list; + struct err_info info; + char loc[TRACING_LOG_LOC_MAX]; /* err location */ + char *cmd; /* what caused err */ +}; + +static DEFINE_MUTEX(tracing_err_log_lock); + +static struct tracing_log_err *alloc_tracing_log_err(int len) +{ + struct tracing_log_err *err; + + err = kzalloc(sizeof(*err), GFP_KERNEL); + if (!err) + return ERR_PTR(-ENOMEM); + + err->cmd = kzalloc(len, GFP_KERNEL); + if (!err->cmd) { + kfree(err); + return ERR_PTR(-ENOMEM); + } + + return err; +} + +static void free_tracing_log_err(struct tracing_log_err *err) +{ + kfree(err->cmd); + kfree(err); +} + +static struct tracing_log_err *get_tracing_log_err(struct trace_array *tr, + int len) +{ + struct tracing_log_err *err; + char *cmd; + + if (tr->n_err_log_entries < TRACING_LOG_ERRS_MAX) { + err = alloc_tracing_log_err(len); + if (PTR_ERR(err) != -ENOMEM) + tr->n_err_log_entries++; + + return err; + } + cmd = kzalloc(len, GFP_KERNEL); + if (!cmd) + return ERR_PTR(-ENOMEM); + err = list_first_entry(&tr->err_log, struct tracing_log_err, list); + kfree(err->cmd); + err->cmd = cmd; + list_del(&err->list); + + return err; +} + +/** + * err_pos - find the position of a string within a command for error careting + * @cmd: The tracing command that caused the error + * @str: The string to position the caret at within @cmd + * + * Finds the position of the first occurrence of @str within @cmd. The + * return value can be passed to tracing_log_err() for caret placement + * within @cmd. + * + * Returns the index within @cmd of the first occurrence of @str or 0 + * if @str was not found. + */ +unsigned int err_pos(char *cmd, const char *str) +{ + char *found; + + if (WARN_ON(!strlen(cmd))) + return 0; + + found = strstr(cmd, str); + if (found) + return found - cmd; + + return 0; +} + +/** + * tracing_log_err - write an error to the tracing error log + * @tr: The associated trace array for the error (NULL for top level array) + * @loc: A string describing where the error occurred + * @cmd: The tracing command that caused the error + * @errs: The array of loc-specific static error strings + * @type: The index into errs[], which produces the specific static err string + * @pos: The position the caret should be placed in the cmd + * + * Writes an error into tracing/error_log of the form: + * + * <loc>: error: <text> + * Command: <cmd> + * ^ + * + * tracing/error_log is a small log file containing the last + * TRACING_LOG_ERRS_MAX errors (8). 
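From user space, the error log described in the comment above can be inspected and cleared as in this sketch (illustrative only; path assumed). Opening the file for write with truncation is the programmatic equivalent of "echo > error_log".

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[512];
	ssize_t n;
	int fd = open("/sys/kernel/tracing/error_log", O_RDONLY);

	if (fd < 0)
		return 1;

	while ((n = read(fd, buf, sizeof(buf))) > 0)
		fwrite(buf, 1, n, stdout);	/* "<loc>: error: <text>", command, caret line */
	close(fd);

	/* Clearing: open for write with O_TRUNC, i.e. "echo > error_log" */
	fd = open("/sys/kernel/tracing/error_log", O_WRONLY | O_TRUNC);
	if (fd >= 0)
		close(fd);
	return 0;
}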
Memory for errors isn't allocated + * unless there has been a tracing error, and the error log can be + * cleared and have its memory freed by writing the empty string in + * truncation mode to it i.e. echo > tracing/error_log. + * + * NOTE: the @errs array along with the @type param are used to + * produce a static error string - this string is not copied and saved + * when the error is logged - only a pointer to it is saved. See + * existing callers for examples of how static strings are typically + * defined for use with tracing_log_err(). + */ +void tracing_log_err(struct trace_array *tr, + const char *loc, const char *cmd, + const char **errs, u8 type, u16 pos) +{ + struct tracing_log_err *err; + int len = 0; + + if (!tr) + tr = &global_trace; + + len += sizeof(CMD_PREFIX) + 2 * sizeof("\n") + strlen(cmd) + 1; + + mutex_lock(&tracing_err_log_lock); + err = get_tracing_log_err(tr, len); + if (PTR_ERR(err) == -ENOMEM) { + mutex_unlock(&tracing_err_log_lock); + return; + } + + snprintf(err->loc, TRACING_LOG_LOC_MAX, "%s: error: ", loc); + snprintf(err->cmd, len, "\n" CMD_PREFIX "%s\n", cmd); + + err->info.errs = errs; + err->info.type = type; + err->info.pos = pos; + err->info.ts = local_clock(); + + list_add_tail(&err->list, &tr->err_log); + mutex_unlock(&tracing_err_log_lock); +} + +static void clear_tracing_err_log(struct trace_array *tr) +{ + struct tracing_log_err *err, *next; + + mutex_lock(&tracing_err_log_lock); + list_for_each_entry_safe(err, next, &tr->err_log, list) { + list_del(&err->list); + free_tracing_log_err(err); + } + + tr->n_err_log_entries = 0; + mutex_unlock(&tracing_err_log_lock); +} + +static void *tracing_err_log_seq_start(struct seq_file *m, loff_t *pos) +{ + struct trace_array *tr = m->private; + + mutex_lock(&tracing_err_log_lock); + + return seq_list_start(&tr->err_log, *pos); +} + +static void *tracing_err_log_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct trace_array *tr = m->private; + + return seq_list_next(v, &tr->err_log, pos); +} + +static void tracing_err_log_seq_stop(struct seq_file *m, void *v) +{ + mutex_unlock(&tracing_err_log_lock); +} + +static void tracing_err_log_show_pos(struct seq_file *m, u16 pos) +{ + u16 i; + + for (i = 0; i < sizeof(CMD_PREFIX) - 1; i++) + seq_putc(m, ' '); + for (i = 0; i < pos; i++) + seq_putc(m, ' '); + seq_puts(m, "^\n"); +} + +static int tracing_err_log_seq_show(struct seq_file *m, void *v) +{ + struct tracing_log_err *err = v; + + if (err) { + const char *err_text = err->info.errs[err->info.type]; + u64 sec = err->info.ts; + u32 nsec; + + nsec = do_div(sec, NSEC_PER_SEC); + seq_printf(m, "[%5llu.%06u] %s%s", sec, nsec / 1000, + err->loc, err_text); + seq_printf(m, "%s", err->cmd); + tracing_err_log_show_pos(m, err->info.pos); + } + + return 0; +} + +static const struct seq_operations tracing_err_log_seq_ops = { + .start = tracing_err_log_seq_start, + .next = tracing_err_log_seq_next, + .stop = tracing_err_log_seq_stop, + .show = tracing_err_log_seq_show +}; + +static int tracing_err_log_open(struct inode *inode, struct file *file) +{ + struct trace_array *tr = inode->i_private; + int ret = 0; + + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; + + /* If this file was opened for write, then erase contents */ + if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) + clear_tracing_err_log(tr); + + if (file->f_mode & FMODE_READ) { + ret = seq_open(file, &tracing_err_log_seq_ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = tr; + } else { + 
trace_array_put(tr); + } + } + return ret; +} + +static ssize_t tracing_err_log_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *ppos) +{ + return count; +} + +static int tracing_err_log_release(struct inode *inode, struct file *file) +{ + struct trace_array *tr = inode->i_private; + + trace_array_put(tr); + + if (file->f_mode & FMODE_READ) + seq_release(inode, file); + + return 0; +} + +static const struct file_operations tracing_err_log_fops = { + .open = tracing_err_log_open, + .write = tracing_err_log_write, + .read = seq_read, + .llseek = tracing_lseek, + .release = tracing_err_log_release, +}; + +static int tracing_buffers_open(struct inode *inode, struct file *filp) +{ + struct trace_array *tr = inode->i_private; + struct ftrace_buffer_info *info; + int ret; + + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; + + info = kvzalloc(sizeof(*info), GFP_KERNEL); + if (!info) { + trace_array_put(tr); + return -ENOMEM; + } + + mutex_lock(&trace_types_lock); + + info->iter.tr = tr; + info->iter.cpu_file = tracing_get_cpu(inode); + info->iter.trace = tr->current_trace; + info->iter.array_buffer = &tr->array_buffer; + info->spare = NULL; + /* Force reading ring buffer for first read */ + info->read = (unsigned int)-1; + + filp->private_data = info; + + tr->trace_ref++; + + mutex_unlock(&trace_types_lock); + + ret = nonseekable_open(inode, filp); + if (ret < 0) + trace_array_put(tr); + + return ret; +} + +static __poll_t +tracing_buffers_poll(struct file *filp, poll_table *poll_table) +{ + struct ftrace_buffer_info *info = filp->private_data; + struct trace_iterator *iter = &info->iter; + + return trace_poll(iter, filp, poll_table); +} + +static ssize_t +tracing_buffers_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *ppos) +{ + struct ftrace_buffer_info *info = filp->private_data; + struct trace_iterator *iter = &info->iter; + ssize_t ret = 0; + ssize_t size; + + if (!count) + return 0; + +#ifdef CONFIG_TRACER_MAX_TRACE + if (iter->snapshot && iter->tr->current_trace->use_max_tr) + return -EBUSY; +#endif + + if (!info->spare) { + info->spare = ring_buffer_alloc_read_page(iter->array_buffer->buffer, + iter->cpu_file); + if (IS_ERR(info->spare)) { + ret = PTR_ERR(info->spare); + info->spare = NULL; + } else { + info->spare_cpu = iter->cpu_file; + } + } + if (!info->spare) + return ret; + + /* Do we have previous read data to read? 
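The tracing_buffers_* handlers here back the per-CPU trace_pipe_raw files, which hand whole ring-buffer pages to user space in binary form. A minimal reader might look like the sketch below (illustrative; the path and page-size handling are assumptions, and the binary page format is normally parsed by tools such as trace-cmd).

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Raw binary ring-buffer pages for CPU 0; one open reader at a time */
	int fd = open("/sys/kernel/tracing/per_cpu/cpu0/trace_pipe_raw", O_RDONLY);
	long page = sysconf(_SC_PAGESIZE);
	char buf[65536];
	ssize_t n;

	if (fd < 0 || page <= 0 || page > (long)sizeof(buf))
		return 1;

	/* Each read returns at most one ring-buffer page */
	while ((n = read(fd, buf, page)) > 0)
		fprintf(stderr, "got %zd bytes of raw ring-buffer data\n", n);

	close(fd);
	return 0;
}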
*/ + if (info->read < PAGE_SIZE) + goto read; + + again: + trace_access_lock(iter->cpu_file); + ret = ring_buffer_read_page(iter->array_buffer->buffer, + &info->spare, + count, + iter->cpu_file, 0); + trace_access_unlock(iter->cpu_file); + + if (ret < 0) { + if (trace_empty(iter)) { + if ((filp->f_flags & O_NONBLOCK)) + return -EAGAIN; + + ret = wait_on_pipe(iter, 0); + if (ret) + return ret; + + goto again; + } + return 0; + } + + info->read = 0; + read: + size = PAGE_SIZE - info->read; + if (size > count) + size = count; + + ret = copy_to_user(ubuf, info->spare + info->read, size); + if (ret == size) + return -EFAULT; + + size -= ret; + + *ppos += size; + info->read += size; + + return size; +} + +static int tracing_buffers_release(struct inode *inode, struct file *file) +{ + struct ftrace_buffer_info *info = file->private_data; + struct trace_iterator *iter = &info->iter; + + mutex_lock(&trace_types_lock); + + iter->tr->trace_ref--; + + __trace_array_put(iter->tr); + + iter->wait_index++; + /* Make sure the waiters see the new wait_index */ + smp_wmb(); + + ring_buffer_wake_waiters(iter->array_buffer->buffer, iter->cpu_file); + + if (info->spare) + ring_buffer_free_read_page(iter->array_buffer->buffer, + info->spare_cpu, info->spare); + kvfree(info); + + mutex_unlock(&trace_types_lock); + + return 0; +} + +struct buffer_ref { + struct trace_buffer *buffer; + void *page; + int cpu; + refcount_t refcount; +}; + +static void buffer_ref_release(struct buffer_ref *ref) +{ + if (!refcount_dec_and_test(&ref->refcount)) + return; + ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page); + kfree(ref); +} + +static void buffer_pipe_buf_release(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + struct buffer_ref *ref = (struct buffer_ref *)buf->private; + + buffer_ref_release(ref); + buf->private = 0; +} + +static bool buffer_pipe_buf_get(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + struct buffer_ref *ref = (struct buffer_ref *)buf->private; + + if (refcount_read(&ref->refcount) > INT_MAX/2) + return false; + + refcount_inc(&ref->refcount); + return true; +} + +/* Pipe buffer operations for a buffer. */ +static const struct pipe_buf_operations buffer_pipe_buf_ops = { + .release = buffer_pipe_buf_release, + .get = buffer_pipe_buf_get, +}; + +/* + * Callback from splice_to_pipe(), if we need to release some pages + * at the end of the spd in case we error'ed out in filling the pipe. 
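The buffer_ref machinery above and tracing_buffers_splice_read() below are the zero-copy path that tools use to pull raw pages out of the ring buffer. A hedged user-space sketch using splice(2) through an intermediate pipe follows; paths, sizes and the output file are assumptions, and SPLICE_F_NONBLOCK is used so the loop ends when no data is available.

#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	int raw = open("/sys/kernel/tracing/per_cpu/cpu0/trace_pipe_raw", O_RDONLY);
	int out = open("trace_cpu0.dat", O_WRONLY | O_CREAT | O_TRUNC, 0644);
	int pfd[2];
	long page = sysconf(_SC_PAGESIZE);
	ssize_t n;

	if (raw < 0 || out < 0 || pipe(pfd) < 0 || page <= 0)
		return 1;

	/* Lengths must be page sized, see the checks on *ppos and len above */
	while ((n = splice(raw, NULL, pfd[1], NULL, page, SPLICE_F_NONBLOCK)) > 0) {
		if (splice(pfd[0], NULL, out, NULL, n, 0) < 0)
			break;
	}

	close(raw);
	close(out);
	return 0;
}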
+ */ +static void buffer_spd_release(struct splice_pipe_desc *spd, unsigned int i) +{ + struct buffer_ref *ref = + (struct buffer_ref *)spd->partial[i].private; + + buffer_ref_release(ref); + spd->partial[i].private = 0; +} + +static ssize_t +tracing_buffers_splice_read(struct file *file, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + struct ftrace_buffer_info *info = file->private_data; + struct trace_iterator *iter = &info->iter; + struct partial_page partial_def[PIPE_DEF_BUFFERS]; + struct page *pages_def[PIPE_DEF_BUFFERS]; + struct splice_pipe_desc spd = { + .pages = pages_def, + .partial = partial_def, + .nr_pages_max = PIPE_DEF_BUFFERS, + .ops = &buffer_pipe_buf_ops, + .spd_release = buffer_spd_release, + }; + struct buffer_ref *ref; + int entries, i; + ssize_t ret = 0; + +#ifdef CONFIG_TRACER_MAX_TRACE + if (iter->snapshot && iter->tr->current_trace->use_max_tr) + return -EBUSY; +#endif + + if (*ppos & (PAGE_SIZE - 1)) + return -EINVAL; + + if (len & (PAGE_SIZE - 1)) { + if (len < PAGE_SIZE) + return -EINVAL; + len &= PAGE_MASK; + } + + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; + + again: + trace_access_lock(iter->cpu_file); + entries = ring_buffer_entries_cpu(iter->array_buffer->buffer, iter->cpu_file); + + for (i = 0; i < spd.nr_pages_max && len && entries; i++, len -= PAGE_SIZE) { + struct page *page; + int r; + + ref = kzalloc(sizeof(*ref), GFP_KERNEL); + if (!ref) { + ret = -ENOMEM; + break; + } + + refcount_set(&ref->refcount, 1); + ref->buffer = iter->array_buffer->buffer; + ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file); + if (IS_ERR(ref->page)) { + ret = PTR_ERR(ref->page); + ref->page = NULL; + kfree(ref); + break; + } + ref->cpu = iter->cpu_file; + + r = ring_buffer_read_page(ref->buffer, &ref->page, + len, iter->cpu_file, 1); + if (r < 0) { + ring_buffer_free_read_page(ref->buffer, ref->cpu, + ref->page); + kfree(ref); + break; + } + + page = virt_to_page(ref->page); + + spd.pages[i] = page; + spd.partial[i].len = PAGE_SIZE; + spd.partial[i].offset = 0; + spd.partial[i].private = (unsigned long)ref; + spd.nr_pages++; + *ppos += PAGE_SIZE; + + entries = ring_buffer_entries_cpu(iter->array_buffer->buffer, iter->cpu_file); + } + + trace_access_unlock(iter->cpu_file); + spd.nr_pages = i; + + /* did we read anything? */ + if (!spd.nr_pages) { + long wait_index; + + if (ret) + goto out; + + ret = -EAGAIN; + if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) + goto out; + + wait_index = READ_ONCE(iter->wait_index); + + ret = wait_on_pipe(iter, iter->snapshot ? 
0 : iter->tr->buffer_percent); + if (ret) + goto out; + + /* No need to wait after waking up when tracing is off */ + if (!tracer_tracing_is_on(iter->tr)) + goto out; + + /* Make sure we see the new wait_index */ + smp_rmb(); + if (wait_index != iter->wait_index) + goto out; + + goto again; + } + + ret = splice_to_pipe(pipe, &spd); +out: + splice_shrink_spd(&spd); + + return ret; +} + +/* An ioctl call with cmd 0 to the ring buffer file will wake up all waiters */ +static long tracing_buffers_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct ftrace_buffer_info *info = file->private_data; + struct trace_iterator *iter = &info->iter; + + if (cmd) + return -ENOIOCTLCMD; + + mutex_lock(&trace_types_lock); + + iter->wait_index++; + /* Make sure the waiters see the new wait_index */ + smp_wmb(); + + ring_buffer_wake_waiters(iter->array_buffer->buffer, iter->cpu_file); + + mutex_unlock(&trace_types_lock); + return 0; +} + +static const struct file_operations tracing_buffers_fops = { + .open = tracing_buffers_open, + .read = tracing_buffers_read, + .poll = tracing_buffers_poll, + .release = tracing_buffers_release, + .splice_read = tracing_buffers_splice_read, + .unlocked_ioctl = tracing_buffers_ioctl, + .llseek = no_llseek, +}; + +static ssize_t +tracing_stats_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *ppos) +{ + struct inode *inode = file_inode(filp); + struct trace_array *tr = inode->i_private; + struct array_buffer *trace_buf = &tr->array_buffer; + int cpu = tracing_get_cpu(inode); + struct trace_seq *s; + unsigned long cnt; + unsigned long long t; + unsigned long usec_rem; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + trace_seq_init(s); + + cnt = ring_buffer_entries_cpu(trace_buf->buffer, cpu); + trace_seq_printf(s, "entries: %ld\n", cnt); + + cnt = ring_buffer_overrun_cpu(trace_buf->buffer, cpu); + trace_seq_printf(s, "overrun: %ld\n", cnt); + + cnt = ring_buffer_commit_overrun_cpu(trace_buf->buffer, cpu); + trace_seq_printf(s, "commit overrun: %ld\n", cnt); + + cnt = ring_buffer_bytes_cpu(trace_buf->buffer, cpu); + trace_seq_printf(s, "bytes: %ld\n", cnt); + + if (trace_clocks[tr->clock_id].in_ns) { + /* local or global for trace_clock */ + t = ns2usecs(ring_buffer_oldest_event_ts(trace_buf->buffer, cpu)); + usec_rem = do_div(t, USEC_PER_SEC); + trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n", + t, usec_rem); + + t = ns2usecs(ring_buffer_time_stamp(trace_buf->buffer)); + usec_rem = do_div(t, USEC_PER_SEC); + trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem); + } else { + /* counter or tsc mode for trace_clock */ + trace_seq_printf(s, "oldest event ts: %llu\n", + ring_buffer_oldest_event_ts(trace_buf->buffer, cpu)); + + trace_seq_printf(s, "now ts: %llu\n", + ring_buffer_time_stamp(trace_buf->buffer)); + } + + cnt = ring_buffer_dropped_events_cpu(trace_buf->buffer, cpu); + trace_seq_printf(s, "dropped events: %ld\n", cnt); + + cnt = ring_buffer_read_events_cpu(trace_buf->buffer, cpu); + trace_seq_printf(s, "read events: %ld\n", cnt); + + count = simple_read_from_buffer(ubuf, count, ppos, + s->buffer, trace_seq_used(s)); + + kfree(s); + + return count; +} + +static const struct file_operations tracing_stats_fops = { + .open = tracing_open_generic_tr, + .read = tracing_stats_read, + .llseek = generic_file_llseek, + .release = tracing_release_generic_tr, +}; + +#ifdef CONFIG_DYNAMIC_FTRACE + +static ssize_t +tracing_read_dyn_info(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + ssize_t 
ret; + char *buf; + int r; + + /* 256 should be plenty to hold the amount needed */ + buf = kmalloc(256, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + r = scnprintf(buf, 256, "%ld pages:%ld groups: %ld\n", + ftrace_update_tot_cnt, + ftrace_number_of_pages, + ftrace_number_of_groups); + + ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); + kfree(buf); + return ret; +} + +static const struct file_operations tracing_dyn_info_fops = { + .open = tracing_open_generic, + .read = tracing_read_dyn_info, + .llseek = generic_file_llseek, +}; +#endif /* CONFIG_DYNAMIC_FTRACE */ + +#if defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) +static void +ftrace_snapshot(unsigned long ip, unsigned long parent_ip, + struct trace_array *tr, struct ftrace_probe_ops *ops, + void *data) +{ + tracing_snapshot_instance(tr); +} + +static void +ftrace_count_snapshot(unsigned long ip, unsigned long parent_ip, + struct trace_array *tr, struct ftrace_probe_ops *ops, + void *data) +{ + struct ftrace_func_mapper *mapper = data; + long *count = NULL; + + if (mapper) + count = (long *)ftrace_func_mapper_find_ip(mapper, ip); + + if (count) { + + if (*count <= 0) + return; + + (*count)--; + } + + tracing_snapshot_instance(tr); +} + +static int +ftrace_snapshot_print(struct seq_file *m, unsigned long ip, + struct ftrace_probe_ops *ops, void *data) +{ + struct ftrace_func_mapper *mapper = data; + long *count = NULL; + + seq_printf(m, "%ps:", (void *)ip); + + seq_puts(m, "snapshot"); + + if (mapper) + count = (long *)ftrace_func_mapper_find_ip(mapper, ip); + + if (count) + seq_printf(m, ":count=%ld\n", *count); + else + seq_puts(m, ":unlimited\n"); + + return 0; +} + +static int +ftrace_snapshot_init(struct ftrace_probe_ops *ops, struct trace_array *tr, + unsigned long ip, void *init_data, void **data) +{ + struct ftrace_func_mapper *mapper = *data; + + if (!mapper) { + mapper = allocate_ftrace_func_mapper(); + if (!mapper) + return -ENOMEM; + *data = mapper; + } + + return ftrace_func_mapper_add_ip(mapper, ip, init_data); +} + +static void +ftrace_snapshot_free(struct ftrace_probe_ops *ops, struct trace_array *tr, + unsigned long ip, void *data) +{ + struct ftrace_func_mapper *mapper = data; + + if (!ip) { + if (!mapper) + return; + free_ftrace_func_mapper(mapper, NULL); + return; + } + + ftrace_func_mapper_remove_ip(mapper, ip); +} + +static struct ftrace_probe_ops snapshot_probe_ops = { + .func = ftrace_snapshot, + .print = ftrace_snapshot_print, +}; + +static struct ftrace_probe_ops snapshot_count_probe_ops = { + .func = ftrace_count_snapshot, + .print = ftrace_snapshot_print, + .init = ftrace_snapshot_init, + .free = ftrace_snapshot_free, +}; + +static int +ftrace_trace_snapshot_callback(struct trace_array *tr, struct ftrace_hash *hash, + char *glob, char *cmd, char *param, int enable) +{ + struct ftrace_probe_ops *ops; + void *count = (void *)-1; + char *number; + int ret; + + if (!tr) + return -ENODEV; + + /* hash funcs only work with set_ftrace_filter */ + if (!enable) + return -EINVAL; + + ops = param ? &snapshot_count_probe_ops : &snapshot_probe_ops; + + if (glob[0] == '!') + return unregister_ftrace_function_probe_func(glob+1, tr, ops); + + if (!param) + goto out_reg; + + number = strsep(¶m, ":"); + + if (!strlen(number)) + goto out_reg; + + /* + * We use the callback data field (which is a pointer) + * as our counter. 
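The snapshot probe callbacks above implement the "snapshot" command accepted by set_ftrace_filter, where the optional ":count" suffix becomes the probe's counter as the comment describes. A user-space sketch follows; the function name "kfree" and the path are only examples.

#include <fcntl.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* Take a snapshot the first time kfree() is hit, at most once (count = 1) */
	const char *cmd = "kfree:snapshot:1";
	int fd = open("/sys/kernel/tracing/set_ftrace_filter", O_WRONLY);

	if (fd < 0)
		return 1;

	write(fd, cmd, strlen(cmd));
	close(fd);

	/* Prefixing the command with '!' removes the probe again */
	return 0;
}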
+ */ + ret = kstrtoul(number, 0, (unsigned long *)&count); + if (ret) + return ret; + + out_reg: + ret = tracing_alloc_snapshot_instance(tr); + if (ret < 0) + goto out; + + ret = register_ftrace_function_probe(glob, tr, ops, count); + + out: + return ret < 0 ? ret : 0; +} + +static struct ftrace_func_command ftrace_snapshot_cmd = { + .name = "snapshot", + .func = ftrace_trace_snapshot_callback, +}; + +static __init int register_snapshot_cmd(void) +{ + return register_ftrace_command(&ftrace_snapshot_cmd); +} +#else +static inline __init int register_snapshot_cmd(void) { return 0; } +#endif /* defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) */ + +static struct dentry *tracing_get_dentry(struct trace_array *tr) +{ + if (WARN_ON(!tr->dir)) + return ERR_PTR(-ENODEV); + + /* Top directory uses NULL as the parent */ + if (tr->flags & TRACE_ARRAY_FL_GLOBAL) + return NULL; + + /* All sub buffers have a descriptor */ + return tr->dir; +} + +static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu) +{ + struct dentry *d_tracer; + + if (tr->percpu_dir) + return tr->percpu_dir; + + d_tracer = tracing_get_dentry(tr); + if (IS_ERR(d_tracer)) + return NULL; + + tr->percpu_dir = tracefs_create_dir("per_cpu", d_tracer); + + MEM_FAIL(!tr->percpu_dir, + "Could not create tracefs directory 'per_cpu/%d'\n", cpu); + + return tr->percpu_dir; +} + +static struct dentry * +trace_create_cpu_file(const char *name, umode_t mode, struct dentry *parent, + void *data, long cpu, const struct file_operations *fops) +{ + struct dentry *ret = trace_create_file(name, mode, parent, data, fops); + + if (ret) /* See tracing_get_cpu() */ + d_inode(ret)->i_cdev = (void *)(cpu + 1); + return ret; +} + +static void +tracing_init_tracefs_percpu(struct trace_array *tr, long cpu) +{ + struct dentry *d_percpu = tracing_dentry_percpu(tr, cpu); + struct dentry *d_cpu; + char cpu_dir[30]; /* 30 characters should be more than enough */ + + if (!d_percpu) + return; + + snprintf(cpu_dir, 30, "cpu%ld", cpu); + d_cpu = tracefs_create_dir(cpu_dir, d_percpu); + if (!d_cpu) { + pr_warn("Could not create tracefs '%s' entry\n", cpu_dir); + return; + } + + /* per cpu trace_pipe */ + trace_create_cpu_file("trace_pipe", TRACE_MODE_READ, d_cpu, + tr, cpu, &tracing_pipe_fops); + + /* per cpu trace */ + trace_create_cpu_file("trace", TRACE_MODE_WRITE, d_cpu, + tr, cpu, &tracing_fops); + + trace_create_cpu_file("trace_pipe_raw", TRACE_MODE_READ, d_cpu, + tr, cpu, &tracing_buffers_fops); + + trace_create_cpu_file("stats", TRACE_MODE_READ, d_cpu, + tr, cpu, &tracing_stats_fops); + + trace_create_cpu_file("buffer_size_kb", TRACE_MODE_READ, d_cpu, + tr, cpu, &tracing_entries_fops); + +#ifdef CONFIG_TRACER_SNAPSHOT + trace_create_cpu_file("snapshot", TRACE_MODE_WRITE, d_cpu, + tr, cpu, &snapshot_fops); + + trace_create_cpu_file("snapshot_raw", TRACE_MODE_READ, d_cpu, + tr, cpu, &snapshot_raw_fops); +#endif +} + +#ifdef CONFIG_FTRACE_SELFTEST +/* Let selftest have access to static functions in this file */ +#include "trace_selftest.c" +#endif + +static ssize_t +trace_options_read(struct file *filp, char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct trace_option_dentry *topt = filp->private_data; + char *buf; + + if (topt->flags->val & topt->opt->bit) + buf = "1\n"; + else + buf = "0\n"; + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, 2); +} + +static ssize_t +trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct trace_option_dentry *topt = filp->private_data; 
+ unsigned long val; + int ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + if (val != 0 && val != 1) + return -EINVAL; + + if (!!(topt->flags->val & topt->opt->bit) != val) { + mutex_lock(&trace_types_lock); + ret = __set_tracer_option(topt->tr, topt->flags, + topt->opt, !val); + mutex_unlock(&trace_types_lock); + if (ret) + return ret; + } + + *ppos += cnt; + + return cnt; +} + +static int tracing_open_options(struct inode *inode, struct file *filp) +{ + struct trace_option_dentry *topt = inode->i_private; + int ret; + + ret = tracing_check_open_get_tr(topt->tr); + if (ret) + return ret; + + filp->private_data = inode->i_private; + return 0; +} + +static int tracing_release_options(struct inode *inode, struct file *file) +{ + struct trace_option_dentry *topt = file->private_data; + + trace_array_put(topt->tr); + return 0; +} + +static const struct file_operations trace_options_fops = { + .open = tracing_open_options, + .read = trace_options_read, + .write = trace_options_write, + .llseek = generic_file_llseek, + .release = tracing_release_options, +}; + +/* + * In order to pass in both the trace_array descriptor as well as the index + * to the flag that the trace option file represents, the trace_array + * has a character array of trace_flags_index[], which holds the index + * of the bit for the flag it represents. index[0] == 0, index[1] == 1, etc. + * The address of this character array is passed to the flag option file + * read/write callbacks. + * + * In order to extract both the index and the trace_array descriptor, + * get_tr_index() uses the following algorithm. + * + * idx = *ptr; + * + * As the pointer itself contains the address of the index (remember + * index[1] == 1). + * + * Then to get the trace_array descriptor, by subtracting that index + * from the ptr, we get to the start of the index itself. + * + * ptr - idx == &index[0] + * + * Then a simple container_of() from that pointer gets us to the + * trace_array descriptor. 
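The pointer trick described in the comment above can be hard to visualize, so here is a tiny self-contained analogue in plain user-space C (all names invented) of how storing index[i] == i lets a single pointer encode both the flag index and its containing structure.

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct demo_array {
	const char *name;
	unsigned char flags_index[8];	/* flags_index[i] == i, like trace_flags_index */
};

static void decode(void *data, struct demo_array **arr, unsigned int *index)
{
	*index = *(unsigned char *)data;		/* the stored value is the index */
	*arr = container_of((char *)data - *index,	/* step back to &flags_index[0] */
			    struct demo_array, flags_index);
}

int main(void)
{
	struct demo_array d = {
		.name = "demo",
		.flags_index = { 0, 1, 2, 3, 4, 5, 6, 7 },
	};
	struct demo_array *arr;
	unsigned int idx;

	decode(&d.flags_index[5], &arr, &idx);		/* pass a pointer to one element */
	printf("%s: index %u\n", arr->name, idx);	/* -> "demo: index 5" */
	return 0;
}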
+ */ +static void get_tr_index(void *data, struct trace_array **ptr, + unsigned int *pindex) +{ + *pindex = *(unsigned char *)data; + + *ptr = container_of(data - *pindex, struct trace_array, + trace_flags_index); +} + +static ssize_t +trace_options_core_read(struct file *filp, char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + void *tr_index = filp->private_data; + struct trace_array *tr; + unsigned int index; + char *buf; + + get_tr_index(tr_index, &tr, &index); + + if (tr->trace_flags & (1 << index)) + buf = "1\n"; + else + buf = "0\n"; + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, 2); +} + +static ssize_t +trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + void *tr_index = filp->private_data; + struct trace_array *tr; + unsigned int index; + unsigned long val; + int ret; + + get_tr_index(tr_index, &tr, &index); + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + if (val != 0 && val != 1) + return -EINVAL; + + mutex_lock(&event_mutex); + mutex_lock(&trace_types_lock); + ret = set_tracer_flag(tr, 1 << index, val); + mutex_unlock(&trace_types_lock); + mutex_unlock(&event_mutex); + + if (ret < 0) + return ret; + + *ppos += cnt; + + return cnt; +} + +static const struct file_operations trace_options_core_fops = { + .open = tracing_open_generic, + .read = trace_options_core_read, + .write = trace_options_core_write, + .llseek = generic_file_llseek, +}; + +struct dentry *trace_create_file(const char *name, + umode_t mode, + struct dentry *parent, + void *data, + const struct file_operations *fops) +{ + struct dentry *ret; + + ret = tracefs_create_file(name, mode, parent, data, fops); + if (!ret) + pr_warn("Could not create tracefs '%s' entry\n", name); + + return ret; +} + + +static struct dentry *trace_options_init_dentry(struct trace_array *tr) +{ + struct dentry *d_tracer; + + if (tr->options) + return tr->options; + + d_tracer = tracing_get_dentry(tr); + if (IS_ERR(d_tracer)) + return NULL; + + tr->options = tracefs_create_dir("options", d_tracer); + if (!tr->options) { + pr_warn("Could not create tracefs directory 'options'\n"); + return NULL; + } + + return tr->options; +} + +static void +create_trace_option_file(struct trace_array *tr, + struct trace_option_dentry *topt, + struct tracer_flags *flags, + struct tracer_opt *opt) +{ + struct dentry *t_options; + + t_options = trace_options_init_dentry(tr); + if (!t_options) + return; + + topt->flags = flags; + topt->opt = opt; + topt->tr = tr; + + topt->entry = trace_create_file(opt->name, TRACE_MODE_WRITE, + t_options, topt, &trace_options_fops); + +} + +static void +create_trace_option_files(struct trace_array *tr, struct tracer *tracer) +{ + struct trace_option_dentry *topts; + struct trace_options *tr_topts; + struct tracer_flags *flags; + struct tracer_opt *opts; + int cnt; + int i; + + if (!tracer) + return; + + flags = tracer->flags; + + if (!flags || !flags->opts) + return; + + /* + * If this is an instance, only create flags for tracers + * the instance may have. + */ + if (!trace_ok_for_array(tracer, tr)) + return; + + for (i = 0; i < tr->nr_topts; i++) { + /* Make sure there's no duplicate flags. 
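From user space, the per-flag files created under the options/ directory take "0" or "1", mirroring trace_options_core_write() above. A short sketch (the "sym-offset" option name and the path are just examples):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Any value other than 0 or 1 is rejected with -EINVAL */
	int fd = open("/sys/kernel/tracing/options/sym-offset", O_RDWR);
	char buf[4];
	ssize_t n;

	if (fd < 0)
		return 1;

	write(fd, "1", 1);
	lseek(fd, 0, SEEK_SET);
	n = read(fd, buf, sizeof(buf) - 1);
	if (n > 0) {
		buf[n] = '\0';
		printf("sym-offset: %s", buf);	/* "1\n" */
	}
	close(fd);
	return 0;
}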
*/ + if (WARN_ON_ONCE(tr->topts[i].tracer->flags == tracer->flags)) + return; + } + + opts = flags->opts; + + for (cnt = 0; opts[cnt].name; cnt++) + ; + + topts = kcalloc(cnt + 1, sizeof(*topts), GFP_KERNEL); + if (!topts) + return; + + tr_topts = krealloc(tr->topts, sizeof(*tr->topts) * (tr->nr_topts + 1), + GFP_KERNEL); + if (!tr_topts) { + kfree(topts); + return; + } + + tr->topts = tr_topts; + tr->topts[tr->nr_topts].tracer = tracer; + tr->topts[tr->nr_topts].topts = topts; + tr->nr_topts++; + + for (cnt = 0; opts[cnt].name; cnt++) { + create_trace_option_file(tr, &topts[cnt], flags, + &opts[cnt]); + MEM_FAIL(topts[cnt].entry == NULL, + "Failed to create trace option: %s", + opts[cnt].name); + } +} + +static struct dentry * +create_trace_option_core_file(struct trace_array *tr, + const char *option, long index) +{ + struct dentry *t_options; + + t_options = trace_options_init_dentry(tr); + if (!t_options) + return NULL; + + return trace_create_file(option, TRACE_MODE_WRITE, t_options, + (void *)&tr->trace_flags_index[index], + &trace_options_core_fops); +} + +static void create_trace_options_dir(struct trace_array *tr) +{ + struct dentry *t_options; + bool top_level = tr == &global_trace; + int i; + + t_options = trace_options_init_dentry(tr); + if (!t_options) + return; + + for (i = 0; trace_options[i]; i++) { + if (top_level || + !((1 << i) & TOP_LEVEL_TRACE_FLAGS)) + create_trace_option_core_file(tr, trace_options[i], i); + } +} + +static ssize_t +rb_simple_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + char buf[64]; + int r; + + r = tracer_tracing_is_on(tr); + r = sprintf(buf, "%d\n", r); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t +rb_simple_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + struct trace_buffer *buffer = tr->array_buffer.buffer; + unsigned long val; + int ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + if (buffer) { + mutex_lock(&trace_types_lock); + if (!!val == tracer_tracing_is_on(tr)) { + val = 0; /* do nothing */ + } else if (val) { + tracer_tracing_on(tr); + if (tr->current_trace->start) + tr->current_trace->start(tr); + } else { + tracer_tracing_off(tr); + if (tr->current_trace->stop) + tr->current_trace->stop(tr); + /* Wake up any waiters */ + ring_buffer_wake_waiters(buffer, RING_BUFFER_ALL_CPUS); + } + mutex_unlock(&trace_types_lock); + } + + (*ppos)++; + + return cnt; +} + +static const struct file_operations rb_simple_fops = { + .open = tracing_open_generic_tr, + .read = rb_simple_read, + .write = rb_simple_write, + .release = tracing_release_generic_tr, + .llseek = default_llseek, +}; + +static ssize_t +buffer_percent_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + char buf[64]; + int r; + + r = tr->buffer_percent; + r = sprintf(buf, "%d\n", r); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t +buffer_percent_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + unsigned long val; + int ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + if (val > 100) + return -EINVAL; + + tr->buffer_percent = val; + + (*ppos)++; + + return cnt; +} + +static const struct file_operations buffer_percent_fops = { + .open = 
tracing_open_generic_tr, + .read = buffer_percent_read, + .write = buffer_percent_write, + .release = tracing_release_generic_tr, + .llseek = default_llseek, +}; + +static struct dentry *trace_instance_dir; + +static void +init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer); + +static int +allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, int size) +{ + enum ring_buffer_flags rb_flags; + + rb_flags = tr->trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0; + + buf->tr = tr; + + buf->buffer = ring_buffer_alloc(size, rb_flags); + if (!buf->buffer) + return -ENOMEM; + + buf->data = alloc_percpu(struct trace_array_cpu); + if (!buf->data) { + ring_buffer_free(buf->buffer); + buf->buffer = NULL; + return -ENOMEM; + } + + /* Allocate the first page for all buffers */ + set_buffer_entries(&tr->array_buffer, + ring_buffer_size(tr->array_buffer.buffer, 0)); + + return 0; +} + +static void free_trace_buffer(struct array_buffer *buf) +{ + if (buf->buffer) { + ring_buffer_free(buf->buffer); + buf->buffer = NULL; + free_percpu(buf->data); + buf->data = NULL; + } +} + +static int allocate_trace_buffers(struct trace_array *tr, int size) +{ + int ret; + + ret = allocate_trace_buffer(tr, &tr->array_buffer, size); + if (ret) + return ret; + +#ifdef CONFIG_TRACER_MAX_TRACE + ret = allocate_trace_buffer(tr, &tr->max_buffer, + allocate_snapshot ? size : 1); + if (MEM_FAIL(ret, "Failed to allocate trace buffer\n")) { + free_trace_buffer(&tr->array_buffer); + return -ENOMEM; + } + tr->allocated_snapshot = allocate_snapshot; + + allocate_snapshot = false; +#endif + + return 0; +} + +static void free_trace_buffers(struct trace_array *tr) +{ + if (!tr) + return; + + free_trace_buffer(&tr->array_buffer); + +#ifdef CONFIG_TRACER_MAX_TRACE + free_trace_buffer(&tr->max_buffer); +#endif +} + +static void init_trace_flags_index(struct trace_array *tr) +{ + int i; + + /* Used by the trace options files */ + for (i = 0; i < TRACE_FLAGS_MAX_SIZE; i++) + tr->trace_flags_index[i] = i; +} + +static void __update_tracer_options(struct trace_array *tr) +{ + struct tracer *t; + + for (t = trace_types; t; t = t->next) + add_tracer_options(tr, t); +} + +static void update_tracer_options(struct trace_array *tr) +{ + mutex_lock(&trace_types_lock); + tracer_options_updated = true; + __update_tracer_options(tr); + mutex_unlock(&trace_types_lock); +} + +/* Must have trace_types_lock held */ +struct trace_array *trace_array_find(const char *instance) +{ + struct trace_array *tr, *found = NULL; + + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (tr->name && strcmp(tr->name, instance) == 0) { + found = tr; + break; + } + } + + return found; +} + +struct trace_array *trace_array_find_get(const char *instance) +{ + struct trace_array *tr; + + mutex_lock(&trace_types_lock); + tr = trace_array_find(instance); + if (tr) + tr->ref++; + mutex_unlock(&trace_types_lock); + + return tr; +} + +static int trace_array_create_dir(struct trace_array *tr) +{ + int ret; + + tr->dir = tracefs_create_dir(tr->name, trace_instance_dir); + if (!tr->dir) + return -EINVAL; + + ret = event_trace_add_tracer(tr->dir, tr); + if (ret) { + tracefs_remove(tr->dir); + return ret; + } + + init_tracer_tracefs(tr, tr->dir); + __update_tracer_options(tr); + + return ret; +} + +static struct trace_array *trace_array_create(const char *name) +{ + struct trace_array *tr; + int ret; + + ret = -ENOMEM; + tr = kzalloc(sizeof(*tr), GFP_KERNEL); + if (!tr) + return ERR_PTR(ret); + + tr->name = kstrdup(name, GFP_KERNEL); + if 
(!tr->name) + goto out_free_tr; + + if (!alloc_cpumask_var(&tr->tracing_cpumask, GFP_KERNEL)) + goto out_free_tr; + + if (!zalloc_cpumask_var(&tr->pipe_cpumask, GFP_KERNEL)) + goto out_free_tr; + + tr->trace_flags = global_trace.trace_flags & ~ZEROED_TRACE_FLAGS; + + cpumask_copy(tr->tracing_cpumask, cpu_all_mask); + + raw_spin_lock_init(&tr->start_lock); + + tr->max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; + + tr->current_trace = &nop_trace; + + INIT_LIST_HEAD(&tr->systems); + INIT_LIST_HEAD(&tr->events); + INIT_LIST_HEAD(&tr->hist_vars); + INIT_LIST_HEAD(&tr->err_log); + + if (allocate_trace_buffers(tr, trace_buf_size) < 0) + goto out_free_tr; + + if (ftrace_allocate_ftrace_ops(tr) < 0) + goto out_free_tr; + + ftrace_init_trace_array(tr); + + init_trace_flags_index(tr); + + if (trace_instance_dir) { + ret = trace_array_create_dir(tr); + if (ret) + goto out_free_tr; + } else + __trace_early_add_events(tr); + + list_add(&tr->list, &ftrace_trace_arrays); + + tr->ref++; + + return tr; + + out_free_tr: + ftrace_free_ftrace_ops(tr); + free_trace_buffers(tr); + free_cpumask_var(tr->pipe_cpumask); + free_cpumask_var(tr->tracing_cpumask); + kfree(tr->name); + kfree(tr); + + return ERR_PTR(ret); +} + +static int instance_mkdir(const char *name) +{ + struct trace_array *tr; + int ret; + + mutex_lock(&event_mutex); + mutex_lock(&trace_types_lock); + + ret = -EEXIST; + if (trace_array_find(name)) + goto out_unlock; + + tr = trace_array_create(name); + + ret = PTR_ERR_OR_ZERO(tr); + +out_unlock: + mutex_unlock(&trace_types_lock); + mutex_unlock(&event_mutex); + return ret; +} + +/** + * trace_array_get_by_name - Create/Lookup a trace array, given its name. + * @name: The name of the trace array to be looked up/created. + * + * Returns pointer to trace array with given name. + * NULL, if it cannot be created. + * + * NOTE: This function increments the reference counter associated with the + * trace array returned. This makes sure it cannot be freed while in use. + * Use trace_array_put() once the trace array is no longer needed. + * If the trace_array is to be freed, trace_array_destroy() needs to + * be called after the trace_array_put(), or simply let user space delete + * it from the tracefs instances directory. But until the + * trace_array_put() is called, user space can not delete it. + * + */ +struct trace_array *trace_array_get_by_name(const char *name) +{ + struct trace_array *tr; + + mutex_lock(&event_mutex); + mutex_lock(&trace_types_lock); + + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (tr->name && strcmp(tr->name, name) == 0) + goto out_unlock; + } + + tr = trace_array_create(name); + + if (IS_ERR(tr)) + tr = NULL; +out_unlock: + if (tr) + tr->ref++; + + mutex_unlock(&trace_types_lock); + mutex_unlock(&event_mutex); + return tr; +} +EXPORT_SYMBOL_GPL(trace_array_get_by_name); + +static int __remove_instance(struct trace_array *tr) +{ + int i; + + /* Reference counter for a newly created trace array = 1. 
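+	 * A count above one means another user still holds the instance,
+	 * and a non-zero trace_ref means its buffers are still in use,
+	 * so the instance cannot be removed yet.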
*/ + if (tr->ref > 1 || (tr->current_trace && tr->trace_ref)) + return -EBUSY; + + list_del(&tr->list); + + /* Disable all the flags that were enabled coming in */ + for (i = 0; i < TRACE_FLAGS_MAX_SIZE; i++) { + if ((1 << i) & ZEROED_TRACE_FLAGS) + set_tracer_flag(tr, 1 << i, 0); + } + + tracing_set_nop(tr); + clear_ftrace_function_probes(tr); + event_trace_del_tracer(tr); + ftrace_clear_pids(tr); + ftrace_destroy_function_files(tr); + tracefs_remove(tr->dir); + free_percpu(tr->last_func_repeats); + free_trace_buffers(tr); + clear_tracing_err_log(tr); + + for (i = 0; i < tr->nr_topts; i++) { + kfree(tr->topts[i].topts); + } + kfree(tr->topts); + + free_cpumask_var(tr->pipe_cpumask); + free_cpumask_var(tr->tracing_cpumask); + kfree(tr->name); + kfree(tr); + + return 0; +} + +int trace_array_destroy(struct trace_array *this_tr) +{ + struct trace_array *tr; + int ret; + + if (!this_tr) + return -EINVAL; + + mutex_lock(&event_mutex); + mutex_lock(&trace_types_lock); + + ret = -ENODEV; + + /* Making sure trace array exists before destroying it. */ + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (tr == this_tr) { + ret = __remove_instance(tr); + break; + } + } + + mutex_unlock(&trace_types_lock); + mutex_unlock(&event_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(trace_array_destroy); + +static int instance_rmdir(const char *name) +{ + struct trace_array *tr; + int ret; + + mutex_lock(&event_mutex); + mutex_lock(&trace_types_lock); + + ret = -ENODEV; + tr = trace_array_find(name); + if (tr) + ret = __remove_instance(tr); + + mutex_unlock(&trace_types_lock); + mutex_unlock(&event_mutex); + + return ret; +} + +static __init void create_trace_instances(struct dentry *d_tracer) +{ + struct trace_array *tr; + + trace_instance_dir = tracefs_create_instance_dir("instances", d_tracer, + instance_mkdir, + instance_rmdir); + if (MEM_FAIL(!trace_instance_dir, "Failed to create instances directory\n")) + return; + + mutex_lock(&event_mutex); + mutex_lock(&trace_types_lock); + + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (!tr->name) + continue; + if (MEM_FAIL(trace_array_create_dir(tr) < 0, + "Failed to create instance directory\n")) + break; + } + + mutex_unlock(&trace_types_lock); + mutex_unlock(&event_mutex); +} + +static void +init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) +{ + struct trace_event_file *file; + int cpu; + + trace_create_file("available_tracers", TRACE_MODE_READ, d_tracer, + tr, &show_traces_fops); + + trace_create_file("current_tracer", TRACE_MODE_WRITE, d_tracer, + tr, &set_tracer_fops); + + trace_create_file("tracing_cpumask", TRACE_MODE_WRITE, d_tracer, + tr, &tracing_cpumask_fops); + + trace_create_file("trace_options", TRACE_MODE_WRITE, d_tracer, + tr, &tracing_iter_fops); + + trace_create_file("trace", TRACE_MODE_WRITE, d_tracer, + tr, &tracing_fops); + + trace_create_file("trace_pipe", TRACE_MODE_READ, d_tracer, + tr, &tracing_pipe_fops); + + trace_create_file("buffer_size_kb", TRACE_MODE_WRITE, d_tracer, + tr, &tracing_entries_fops); + + trace_create_file("buffer_total_size_kb", TRACE_MODE_READ, d_tracer, + tr, &tracing_total_entries_fops); + + trace_create_file("free_buffer", 0200, d_tracer, + tr, &tracing_free_buffer_fops); + + trace_create_file("trace_marker", 0220, d_tracer, + tr, &tracing_mark_fops); + + file = __find_event_file(tr, "ftrace", "print"); + if (file && file->ef) + eventfs_add_file("trigger", TRACE_MODE_WRITE, file->ef, + file, &event_trigger_fops); + tr->trace_marker_file = file; + + 
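+	/*
+	 * Writes to trace_marker are logged through the ftrace:print event;
+	 * keeping its event file in trace_marker_file lets triggers attached
+	 * to the "trigger" file created above fire on those writes.
+	 */
+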
trace_create_file("trace_marker_raw", 0220, d_tracer, + tr, &tracing_mark_raw_fops); + + trace_create_file("trace_clock", TRACE_MODE_WRITE, d_tracer, tr, + &trace_clock_fops); + + trace_create_file("tracing_on", TRACE_MODE_WRITE, d_tracer, + tr, &rb_simple_fops); + + trace_create_file("timestamp_mode", TRACE_MODE_READ, d_tracer, tr, + &trace_time_stamp_mode_fops); + + tr->buffer_percent = 50; + + trace_create_file("buffer_percent", TRACE_MODE_WRITE, d_tracer, + tr, &buffer_percent_fops); + + create_trace_options_dir(tr); + +#ifdef CONFIG_TRACER_MAX_TRACE + trace_create_maxlat_file(tr, d_tracer); +#endif + + if (ftrace_create_function_files(tr, d_tracer)) + MEM_FAIL(1, "Could not allocate function filter files"); + +#ifdef CONFIG_TRACER_SNAPSHOT + trace_create_file("snapshot", TRACE_MODE_WRITE, d_tracer, + tr, &snapshot_fops); +#endif + + trace_create_file("error_log", TRACE_MODE_WRITE, d_tracer, + tr, &tracing_err_log_fops); + + for_each_tracing_cpu(cpu) + tracing_init_tracefs_percpu(tr, cpu); + + ftrace_init_tracefs(tr, d_tracer); +} + +static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore) +{ + struct vfsmount *mnt; + struct file_system_type *type; + + /* + * To maintain backward compatibility for tools that mount + * debugfs to get to the tracing facility, tracefs is automatically + * mounted to the debugfs/tracing directory. + */ + type = get_fs_type("tracefs"); + if (!type) + return NULL; + mnt = vfs_submount(mntpt, type, "tracefs", NULL); + put_filesystem(type); + if (IS_ERR(mnt)) + return NULL; + mntget(mnt); + + return mnt; +} + +/** + * tracing_init_dentry - initialize top level trace array + * + * This is called when creating files or directories in the tracing + * directory. It is called via fs_initcall() by any of the boot up code + * and expects to return the dentry of the top level tracing directory. + */ +int tracing_init_dentry(void) +{ + struct trace_array *tr = &global_trace; + + if (security_locked_down(LOCKDOWN_TRACEFS)) { + pr_warn("Tracing disabled due to lockdown\n"); + return -EPERM; + } + + /* The top level trace array uses NULL as parent */ + if (tr->dir) + return 0; + + if (WARN_ON(!tracefs_initialized())) + return -ENODEV; + + /* + * As there may still be users that expect the tracing + * files to exist in debugfs/tracing, we must automount + * the tracefs file system there, so older tools still + * work with the newer kernel. 
+ */ + tr->dir = debugfs_create_automount("tracing", NULL, + trace_automount, NULL); + + return 0; +} + +extern struct trace_eval_map *__start_ftrace_eval_maps[]; +extern struct trace_eval_map *__stop_ftrace_eval_maps[]; + +static struct workqueue_struct *eval_map_wq __initdata; +static struct work_struct eval_map_work __initdata; +static struct work_struct tracerfs_init_work __initdata; + +static void __init eval_map_work_func(struct work_struct *work) +{ + int len; + + len = __stop_ftrace_eval_maps - __start_ftrace_eval_maps; + trace_insert_eval_map(NULL, __start_ftrace_eval_maps, len); +} + +static int __init trace_eval_init(void) +{ + INIT_WORK(&eval_map_work, eval_map_work_func); + + eval_map_wq = alloc_workqueue("eval_map_wq", WQ_UNBOUND, 0); + if (!eval_map_wq) { + pr_err("Unable to allocate eval_map_wq\n"); + /* Do work here */ + eval_map_work_func(&eval_map_work); + return -ENOMEM; + } + + queue_work(eval_map_wq, &eval_map_work); + return 0; +} + +subsys_initcall(trace_eval_init); + +static int __init trace_eval_sync(void) +{ + /* Make sure the eval map updates are finished */ + if (eval_map_wq) + destroy_workqueue(eval_map_wq); + return 0; +} + +late_initcall_sync(trace_eval_sync); + + +#ifdef CONFIG_MODULES +static void trace_module_add_evals(struct module *mod) +{ + if (!mod->num_trace_evals) + return; + + /* + * Modules with bad taint do not have events created, do + * not bother with enums either. + */ + if (trace_module_has_bad_taint(mod)) + return; + + trace_insert_eval_map(mod, mod->trace_evals, mod->num_trace_evals); +} + +#ifdef CONFIG_TRACE_EVAL_MAP_FILE +static void trace_module_remove_evals(struct module *mod) +{ + union trace_eval_map_item *map; + union trace_eval_map_item **last = &trace_eval_maps; + + if (!mod->num_trace_evals) + return; + + mutex_lock(&trace_eval_mutex); + + map = trace_eval_maps; + + while (map) { + if (map->head.mod == mod) + break; + map = trace_eval_jmp_to_tail(map); + last = &map->tail.next; + map = map->tail.next; + } + if (!map) + goto out; + + *last = trace_eval_jmp_to_tail(map)->tail.next; + kfree(map); + out: + mutex_unlock(&trace_eval_mutex); +} +#else +static inline void trace_module_remove_evals(struct module *mod) { } +#endif /* CONFIG_TRACE_EVAL_MAP_FILE */ + +static int trace_module_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct module *mod = data; + + switch (val) { + case MODULE_STATE_COMING: + trace_module_add_evals(mod); + break; + case MODULE_STATE_GOING: + trace_module_remove_evals(mod); + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block trace_module_nb = { + .notifier_call = trace_module_notify, + .priority = 0, +}; +#endif /* CONFIG_MODULES */ + +static __init void tracer_init_tracefs_work_func(struct work_struct *work) +{ + + event_trace_init(); + + init_tracer_tracefs(&global_trace, NULL); + ftrace_init_tracefs_toplevel(&global_trace, NULL); + + trace_create_file("tracing_thresh", TRACE_MODE_WRITE, NULL, + &global_trace, &tracing_thresh_fops); + + trace_create_file("README", TRACE_MODE_READ, NULL, + NULL, &tracing_readme_fops); + + trace_create_file("saved_cmdlines", TRACE_MODE_READ, NULL, + NULL, &tracing_saved_cmdlines_fops); + + trace_create_file("saved_cmdlines_size", TRACE_MODE_WRITE, NULL, + NULL, &tracing_saved_cmdlines_size_fops); + + trace_create_file("saved_tgids", TRACE_MODE_READ, NULL, + NULL, &tracing_saved_tgids_fops); + + trace_create_eval_file(NULL); + +#ifdef CONFIG_MODULES + register_module_notifier(&trace_module_nb); +#endif + +#ifdef 
CONFIG_DYNAMIC_FTRACE + trace_create_file("dyn_ftrace_total_info", TRACE_MODE_READ, NULL, + NULL, &tracing_dyn_info_fops); +#endif + + create_trace_instances(NULL); + + update_tracer_options(&global_trace); +} + +static __init int tracer_init_tracefs(void) +{ + int ret; + + trace_access_lock_init(); + + ret = tracing_init_dentry(); + if (ret) + return 0; + + if (eval_map_wq) { + INIT_WORK(&tracerfs_init_work, tracer_init_tracefs_work_func); + queue_work(eval_map_wq, &tracerfs_init_work); + } else { + tracer_init_tracefs_work_func(NULL); + } + + rv_init_interface(); + + return 0; +} + +fs_initcall(tracer_init_tracefs); + +static int trace_die_panic_handler(struct notifier_block *self, + unsigned long ev, void *unused); + +static struct notifier_block trace_panic_notifier = { + .notifier_call = trace_die_panic_handler, + .priority = INT_MAX - 1, +}; + +static struct notifier_block trace_die_notifier = { + .notifier_call = trace_die_panic_handler, + .priority = INT_MAX - 1, +}; + +/* + * The idea is to execute the following die/panic callback early, in order + * to avoid showing irrelevant information in the trace (like other panic + * notifier functions); we are the 2nd to run, after hung_task/rcu_stall + * warnings get disabled (to prevent potential log flooding). + */ +static int trace_die_panic_handler(struct notifier_block *self, + unsigned long ev, void *unused) +{ + if (!ftrace_dump_on_oops) + return NOTIFY_DONE; + + /* The die notifier requires DIE_OOPS to trigger */ + if (self == &trace_die_notifier && ev != DIE_OOPS) + return NOTIFY_DONE; + + ftrace_dump(ftrace_dump_on_oops); + + return NOTIFY_DONE; +} + +/* + * printk is set to max of 1024, we really don't need it that big. + * Nothing should be printing 1000 characters anyway. + */ +#define TRACE_MAX_PRINT 1000 + +/* + * Define here KERN_TRACE so that we have one place to modify + * it if we decide to change what log level the ftrace dump + * should be at. + */ +#define KERN_TRACE KERN_EMERG + +void +trace_printk_seq(struct trace_seq *s) +{ + /* Probably should print a warning here. */ + if (s->seq.len >= TRACE_MAX_PRINT) + s->seq.len = TRACE_MAX_PRINT; + + /* + * More paranoid code. Although the buffer size is set to + * PAGE_SIZE, and TRACE_MAX_PRINT is 1000, this is just + * an extra layer of protection. + */ + if (WARN_ON_ONCE(s->seq.len >= s->seq.size)) + s->seq.len = s->seq.size - 1; + + /* should be zero ended, but we are paranoid. */ + s->buffer[s->seq.len] = 0; + + printk(KERN_TRACE "%s", s->buffer); + + trace_seq_init(s); +} + +void trace_init_global_iter(struct trace_iterator *iter) +{ + iter->tr = &global_trace; + iter->trace = iter->tr->current_trace; + iter->cpu_file = RING_BUFFER_ALL_CPUS; + iter->array_buffer = &global_trace.array_buffer; + + if (iter->trace && iter->trace->open) + iter->trace->open(iter); + + /* Annotate start of buffers if we had overruns */ + if (ring_buffer_overruns(iter->array_buffer->buffer)) + iter->iter_flags |= TRACE_FILE_ANNOTATE; + + /* Output in nanoseconds only if we are using a clock in nanoseconds. 
*/ + if (trace_clocks[iter->tr->clock_id].in_ns) + iter->iter_flags |= TRACE_FILE_TIME_IN_NS; + + /* Can not use kmalloc for iter.temp and iter.fmt */ + iter->temp = static_temp_buf; + iter->temp_size = STATIC_TEMP_BUF_SIZE; + iter->fmt = static_fmt_buf; + iter->fmt_size = STATIC_FMT_BUF_SIZE; +} + +void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) +{ + /* use static because iter can be a bit big for the stack */ + static struct trace_iterator iter; + static atomic_t dump_running; + struct trace_array *tr = &global_trace; + unsigned int old_userobj; + unsigned long flags; + int cnt = 0, cpu; + + /* Only allow one dump user at a time. */ + if (atomic_inc_return(&dump_running) != 1) { + atomic_dec(&dump_running); + return; + } + + /* + * Always turn off tracing when we dump. + * We don't need to show trace output of what happens + * between multiple crashes. + * + * If the user does a sysrq-z, then they can re-enable + * tracing with echo 1 > tracing_on. + */ + tracing_off(); + + local_irq_save(flags); + + /* Simulate the iterator */ + trace_init_global_iter(&iter); + + for_each_tracing_cpu(cpu) { + atomic_inc(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled); + } + + old_userobj = tr->trace_flags & TRACE_ITER_SYM_USEROBJ; + + /* don't look at user memory in panic mode */ + tr->trace_flags &= ~TRACE_ITER_SYM_USEROBJ; + + switch (oops_dump_mode) { + case DUMP_ALL: + iter.cpu_file = RING_BUFFER_ALL_CPUS; + break; + case DUMP_ORIG: + iter.cpu_file = raw_smp_processor_id(); + break; + case DUMP_NONE: + goto out_enable; + default: + printk(KERN_TRACE "Bad dumping mode, switching to all CPUs dump\n"); + iter.cpu_file = RING_BUFFER_ALL_CPUS; + } + + printk(KERN_TRACE "Dumping ftrace buffer:\n"); + + /* Did function tracer already get disabled? */ + if (ftrace_is_dead()) { + printk("# WARNING: FUNCTION TRACING IS CORRUPTED\n"); + printk("# MAY BE MISSING FUNCTION EVENTS\n"); + } + + /* + * We need to stop all tracing on all CPUS to read + * the next buffer. This is a bit expensive, but is + * not done often. We fill all what we can read, + * and then release the locks again. 
+ */ + + while (!trace_empty(&iter)) { + + if (!cnt) + printk(KERN_TRACE "---------------------------------\n"); + + cnt++; + + trace_iterator_reset(&iter); + iter.iter_flags |= TRACE_FILE_LAT_FMT; + + if (trace_find_next_entry_inc(&iter) != NULL) { + int ret; + + ret = print_trace_line(&iter); + if (ret != TRACE_TYPE_NO_CONSUME) + trace_consume(&iter); + } + touch_nmi_watchdog(); + + trace_printk_seq(&iter.seq); + } + + if (!cnt) + printk(KERN_TRACE " (ftrace buffer empty)\n"); + else + printk(KERN_TRACE "---------------------------------\n"); + + out_enable: + tr->trace_flags |= old_userobj; + + for_each_tracing_cpu(cpu) { + atomic_dec(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled); + } + atomic_dec(&dump_running); + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(ftrace_dump); + +#define WRITE_BUFSIZE 4096 + +ssize_t trace_parse_run_command(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos, + int (*createfn)(const char *)) +{ + char *kbuf, *buf, *tmp; + int ret = 0; + size_t done = 0; + size_t size; + + kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL); + if (!kbuf) + return -ENOMEM; + + while (done < count) { + size = count - done; + + if (size >= WRITE_BUFSIZE) + size = WRITE_BUFSIZE - 1; + + if (copy_from_user(kbuf, buffer + done, size)) { + ret = -EFAULT; + goto out; + } + kbuf[size] = '\0'; + buf = kbuf; + do { + tmp = strchr(buf, '\n'); + if (tmp) { + *tmp = '\0'; + size = tmp - buf + 1; + } else { + size = strlen(buf); + if (done + size < count) { + if (buf != kbuf) + break; + /* This can accept WRITE_BUFSIZE - 2 ('\n' + '\0') */ + pr_warn("Line length is too long: Should be less than %d\n", + WRITE_BUFSIZE - 2); + ret = -EINVAL; + goto out; + } + } + done += size; + + /* Remove comments */ + tmp = strchr(buf, '#'); + + if (tmp) + *tmp = '\0'; + + ret = createfn(buf); + if (ret) + goto out; + buf += size; + + } while (done < count); + } + ret = done; + +out: + kfree(kbuf); + + return ret; +} + +#ifdef CONFIG_TRACER_MAX_TRACE +__init static bool tr_needs_alloc_snapshot(const char *name) +{ + char *test; + int len = strlen(name); + bool ret; + + if (!boot_snapshot_index) + return false; + + if (strncmp(name, boot_snapshot_info, len) == 0 && + boot_snapshot_info[len] == '\t') + return true; + + test = kmalloc(strlen(name) + 3, GFP_KERNEL); + if (!test) + return false; + + sprintf(test, "\t%s\t", name); + ret = strstr(boot_snapshot_info, test) == NULL; + kfree(test); + return ret; +} + +__init static void do_allocate_snapshot(const char *name) +{ + if (!tr_needs_alloc_snapshot(name)) + return; + + /* + * When allocate_snapshot is set, the next call to + * allocate_trace_buffers() (called by trace_array_get_by_name()) + * will allocate the snapshot buffer. That will alse clear + * this flag. 
+ */ + allocate_snapshot = true; +} +#else +static inline void do_allocate_snapshot(const char *name) { } +#endif + +__init static void enable_instances(void) +{ + struct trace_array *tr; + char *curr_str; + char *str; + char *tok; + + /* A tab is always appended */ + boot_instance_info[boot_instance_index - 1] = '\0'; + str = boot_instance_info; + + while ((curr_str = strsep(&str, "\t"))) { + + tok = strsep(&curr_str, ","); + + if (IS_ENABLED(CONFIG_TRACER_MAX_TRACE)) + do_allocate_snapshot(tok); + + tr = trace_array_get_by_name(tok); + if (!tr) { + pr_warn("Failed to create instance buffer %s\n", curr_str); + continue; + } + /* Allow user space to delete it */ + trace_array_put(tr); + + while ((tok = strsep(&curr_str, ","))) { + early_enable_events(tr, tok, true); + } + } +} + +__init static int tracer_alloc_buffers(void) +{ + int ring_buf_size; + int ret = -ENOMEM; + + + if (security_locked_down(LOCKDOWN_TRACEFS)) { + pr_warn("Tracing disabled due to lockdown\n"); + return -EPERM; + } + + /* + * Make sure we don't accidentally add more trace options + * than we have bits for. + */ + BUILD_BUG_ON(TRACE_ITER_LAST_BIT > TRACE_FLAGS_MAX_SIZE); + + if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL)) + goto out; + + if (!alloc_cpumask_var(&global_trace.tracing_cpumask, GFP_KERNEL)) + goto out_free_buffer_mask; + + /* Only allocate trace_printk buffers if a trace_printk exists */ + if (&__stop___trace_bprintk_fmt != &__start___trace_bprintk_fmt) + /* Must be called before global_trace.buffer is allocated */ + trace_printk_init_buffers(); + + /* To save memory, keep the ring buffer size to its minimum */ + if (ring_buffer_expanded) + ring_buf_size = trace_buf_size; + else + ring_buf_size = 1; + + cpumask_copy(tracing_buffer_mask, cpu_possible_mask); + cpumask_copy(global_trace.tracing_cpumask, cpu_all_mask); + + raw_spin_lock_init(&global_trace.start_lock); + + /* + * The prepare callbacks allocates some memory for the ring buffer. We + * don't free the buffer if the CPU goes down. If we were to free + * the buffer, then the user would lose any trace that was in the + * buffer. The memory will be removed once the "instance" is removed. + */ + ret = cpuhp_setup_state_multi(CPUHP_TRACE_RB_PREPARE, + "trace/RB:prepare", trace_rb_cpu_prepare, + NULL); + if (ret < 0) + goto out_free_cpumask; + /* Used for event triggers */ + ret = -ENOMEM; + temp_buffer = ring_buffer_alloc(PAGE_SIZE, RB_FL_OVERWRITE); + if (!temp_buffer) + goto out_rm_hp_state; + + if (trace_create_savedcmd() < 0) + goto out_free_temp_buffer; + + if (!zalloc_cpumask_var(&global_trace.pipe_cpumask, GFP_KERNEL)) + goto out_free_savedcmd; + + /* TODO: make the number of buffers hot pluggable with CPUS */ + if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) { + MEM_FAIL(1, "tracer: failed to allocate ring buffer!\n"); + goto out_free_pipe_cpumask; + } + if (global_trace.buffer_disabled) + tracing_off(); + + if (trace_boot_clock) { + ret = tracing_set_clock(&global_trace, trace_boot_clock); + if (ret < 0) + pr_warn("Trace clock %s not defined, going back to default\n", + trace_boot_clock); + } + + /* + * register_tracer() might reference current_trace, so it + * needs to be set before we register anything. This is + * just a bootstrap of current_trace anyway. 
+ */ + global_trace.current_trace = &nop_trace; + + global_trace.max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; + + ftrace_init_global_array_ops(&global_trace); + + init_trace_flags_index(&global_trace); + + register_tracer(&nop_trace); + + /* Function tracing may start here (via kernel command line) */ + init_function_trace(); + + /* All seems OK, enable tracing */ + tracing_disabled = 0; + + atomic_notifier_chain_register(&panic_notifier_list, + &trace_panic_notifier); + + register_die_notifier(&trace_die_notifier); + + global_trace.flags = TRACE_ARRAY_FL_GLOBAL; + + INIT_LIST_HEAD(&global_trace.systems); + INIT_LIST_HEAD(&global_trace.events); + INIT_LIST_HEAD(&global_trace.hist_vars); + INIT_LIST_HEAD(&global_trace.err_log); + list_add(&global_trace.list, &ftrace_trace_arrays); + + apply_trace_boot_options(); + + register_snapshot_cmd(); + + test_can_verify(); + + return 0; + +out_free_pipe_cpumask: + free_cpumask_var(global_trace.pipe_cpumask); +out_free_savedcmd: + free_saved_cmdlines_buffer(savedcmd); +out_free_temp_buffer: + ring_buffer_free(temp_buffer); +out_rm_hp_state: + cpuhp_remove_multi_state(CPUHP_TRACE_RB_PREPARE); +out_free_cpumask: + free_cpumask_var(global_trace.tracing_cpumask); +out_free_buffer_mask: + free_cpumask_var(tracing_buffer_mask); +out: + return ret; +} + +void __init ftrace_boot_snapshot(void) +{ +#ifdef CONFIG_TRACER_MAX_TRACE + struct trace_array *tr; + + if (!snapshot_at_boot) + return; + + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (!tr->allocated_snapshot) + continue; + + tracing_snapshot_instance(tr); + trace_array_puts(tr, "** Boot snapshot taken **\n"); + } +#endif +} + +void __init early_trace_init(void) +{ + if (tracepoint_printk) { + tracepoint_print_iter = + kzalloc(sizeof(*tracepoint_print_iter), GFP_KERNEL); + if (MEM_FAIL(!tracepoint_print_iter, + "Failed to allocate trace iterator\n")) + tracepoint_printk = 0; + else + static_key_enable(&tracepoint_printk_key.key); + } + tracer_alloc_buffers(); + + init_events(); +} + +void __init trace_init(void) +{ + trace_event_init(); + + if (boot_instance_index) + enable_instances(); +} + +__init static void clear_boot_tracer(void) +{ + /* + * The default tracer at boot buffer is an init section. + * This function is called in lateinit. If we did not + * find the boot tracer, then clear it out, to prevent + * later registration from accessing the buffer that is + * about to be freed. 
+ */ + if (!default_bootup_tracer) + return; + + printk(KERN_INFO "ftrace bootup tracer '%s' not registered.\n", + default_bootup_tracer); + default_bootup_tracer = NULL; +} + +#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK +__init static void tracing_set_default_clock(void) +{ + /* sched_clock_stable() is determined in late_initcall */ + if (!trace_boot_clock && !sched_clock_stable()) { + if (security_locked_down(LOCKDOWN_TRACEFS)) { + pr_warn("Can not set tracing clock due to lockdown\n"); + return; + } + + printk(KERN_WARNING + "Unstable clock detected, switching default tracing clock to \"global\"\n" + "If you want to keep using the local clock, then add:\n" + " \"trace_clock=local\"\n" + "on the kernel command line\n"); + tracing_set_clock(&global_trace, "global"); + } +} +#else +static inline void tracing_set_default_clock(void) { } +#endif + +__init static int late_trace_init(void) +{ + if (tracepoint_printk && tracepoint_printk_stop_on_boot) { + static_key_disable(&tracepoint_printk_key.key); + tracepoint_printk = 0; + } + + tracing_set_default_clock(); + clear_boot_tracer(); + return 0; +} + +late_initcall_sync(late_trace_init); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h new file mode 100644 index 0000000000..51c0a97033 --- /dev/null +++ b/kernel/trace/trace.h @@ -0,0 +1,2063 @@ +// SPDX-License-Identifier: GPL-2.0 + +#ifndef _LINUX_KERNEL_TRACE_H +#define _LINUX_KERNEL_TRACE_H + +#include <linux/fs.h> +#include <linux/atomic.h> +#include <linux/sched.h> +#include <linux/clocksource.h> +#include <linux/ring_buffer.h> +#include <linux/mmiotrace.h> +#include <linux/tracepoint.h> +#include <linux/ftrace.h> +#include <linux/trace.h> +#include <linux/hw_breakpoint.h> +#include <linux/trace_seq.h> +#include <linux/trace_events.h> +#include <linux/compiler.h> +#include <linux/glob.h> +#include <linux/irq_work.h> +#include <linux/workqueue.h> +#include <linux/ctype.h> +#include <linux/once_lite.h> + +#include "pid_list.h" + +#ifdef CONFIG_FTRACE_SYSCALLS +#include <asm/unistd.h> /* For NR_syscalls */ +#include <asm/syscall.h> /* some archs define it here */ +#endif + +#define TRACE_MODE_WRITE 0640 +#define TRACE_MODE_READ 0440 + +enum trace_type { + __TRACE_FIRST_TYPE = 0, + + TRACE_FN, + TRACE_CTX, + TRACE_WAKE, + TRACE_STACK, + TRACE_PRINT, + TRACE_BPRINT, + TRACE_MMIO_RW, + TRACE_MMIO_MAP, + TRACE_BRANCH, + TRACE_GRAPH_RET, + TRACE_GRAPH_ENT, + TRACE_USER_STACK, + TRACE_BLK, + TRACE_BPUTS, + TRACE_HWLAT, + TRACE_OSNOISE, + TRACE_TIMERLAT, + TRACE_RAW_DATA, + TRACE_FUNC_REPEATS, + + __TRACE_LAST_TYPE, +}; + + +#undef __field +#define __field(type, item) type item; + +#undef __field_fn +#define __field_fn(type, item) type item; + +#undef __field_struct +#define __field_struct(type, item) __field(type, item) + +#undef __field_desc +#define __field_desc(type, container, item) + +#undef __field_packed +#define __field_packed(type, container, item) + +#undef __array +#define __array(type, item, size) type item[size]; + +/* + * For backward compatibility, older user space expects to see the + * kernel_stack event with a fixed size caller field. But today the fix + * size is ignored by the kernel, and the real structure is dynamic. 
+ * Expose to user space: "unsigned long caller[8];" but the real structure + * will be "unsigned long caller[] __counted_by(size)" + */ +#undef __stack_array +#define __stack_array(type, item, size, field) type item[] __counted_by(field); + +#undef __array_desc +#define __array_desc(type, container, item, size) + +#undef __dynamic_array +#define __dynamic_array(type, item) type item[]; + +#undef __rel_dynamic_array +#define __rel_dynamic_array(type, item) type item[]; + +#undef F_STRUCT +#define F_STRUCT(args...) args + +#undef FTRACE_ENTRY +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ + struct struct_name { \ + struct trace_entry ent; \ + tstruct \ + } + +#undef FTRACE_ENTRY_DUP +#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk) + +#undef FTRACE_ENTRY_REG +#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, regfn) \ + FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print)) + +#undef FTRACE_ENTRY_PACKED +#define FTRACE_ENTRY_PACKED(name, struct_name, id, tstruct, print) \ + FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print)) __packed + +#include "trace_entries.h" + +/* Use this for memory failure errors */ +#define MEM_FAIL(condition, fmt, ...) \ + DO_ONCE_LITE_IF(condition, pr_err, "ERROR: " fmt, ##__VA_ARGS__) + +#define FAULT_STRING "(fault)" + +#define HIST_STACKTRACE_DEPTH 16 +#define HIST_STACKTRACE_SIZE (HIST_STACKTRACE_DEPTH * sizeof(unsigned long)) +#define HIST_STACKTRACE_SKIP 5 + +/* + * syscalls are special, and need special handling, this is why + * they are not included in trace_entries.h + */ +struct syscall_trace_enter { + struct trace_entry ent; + int nr; + unsigned long args[]; +}; + +struct syscall_trace_exit { + struct trace_entry ent; + int nr; + long ret; +}; + +struct kprobe_trace_entry_head { + struct trace_entry ent; + unsigned long ip; +}; + +struct eprobe_trace_entry_head { + struct trace_entry ent; +}; + +struct kretprobe_trace_entry_head { + struct trace_entry ent; + unsigned long func; + unsigned long ret_ip; +}; + +struct fentry_trace_entry_head { + struct trace_entry ent; + unsigned long ip; +}; + +struct fexit_trace_entry_head { + struct trace_entry ent; + unsigned long func; + unsigned long ret_ip; +}; + +#define TRACE_BUF_SIZE 1024 + +struct trace_array; + +/* + * The CPU trace array - it consists of thousands of trace entries + * plus some other descriptor data: (for example which task started + * the trace, etc.) 
+ */ +struct trace_array_cpu { + atomic_t disabled; + void *buffer_page; /* ring buffer spare */ + + unsigned long entries; + unsigned long saved_latency; + unsigned long critical_start; + unsigned long critical_end; + unsigned long critical_sequence; + unsigned long nice; + unsigned long policy; + unsigned long rt_priority; + unsigned long skipped_entries; + u64 preempt_timestamp; + pid_t pid; + kuid_t uid; + char comm[TASK_COMM_LEN]; + +#ifdef CONFIG_FUNCTION_TRACER + int ftrace_ignore_pid; +#endif + bool ignore_pid; +}; + +struct tracer; +struct trace_option_dentry; + +struct array_buffer { + struct trace_array *tr; + struct trace_buffer *buffer; + struct trace_array_cpu __percpu *data; + u64 time_start; + int cpu; +}; + +#define TRACE_FLAGS_MAX_SIZE 32 + +struct trace_options { + struct tracer *tracer; + struct trace_option_dentry *topts; +}; + +struct trace_pid_list *trace_pid_list_alloc(void); +void trace_pid_list_free(struct trace_pid_list *pid_list); +bool trace_pid_list_is_set(struct trace_pid_list *pid_list, unsigned int pid); +int trace_pid_list_set(struct trace_pid_list *pid_list, unsigned int pid); +int trace_pid_list_clear(struct trace_pid_list *pid_list, unsigned int pid); +int trace_pid_list_first(struct trace_pid_list *pid_list, unsigned int *pid); +int trace_pid_list_next(struct trace_pid_list *pid_list, unsigned int pid, + unsigned int *next); + +enum { + TRACE_PIDS = BIT(0), + TRACE_NO_PIDS = BIT(1), +}; + +static inline bool pid_type_enabled(int type, struct trace_pid_list *pid_list, + struct trace_pid_list *no_pid_list) +{ + /* Return true if the pid list in type has pids */ + return ((type & TRACE_PIDS) && pid_list) || + ((type & TRACE_NO_PIDS) && no_pid_list); +} + +static inline bool still_need_pid_events(int type, struct trace_pid_list *pid_list, + struct trace_pid_list *no_pid_list) +{ + /* + * Turning off what is in @type, return true if the "other" + * pid list, still has pids in it. + */ + return (!(type & TRACE_PIDS) && pid_list) || + (!(type & TRACE_NO_PIDS) && no_pid_list); +} + +typedef bool (*cond_update_fn_t)(struct trace_array *tr, void *cond_data); + +/** + * struct cond_snapshot - conditional snapshot data and callback + * + * The cond_snapshot structure encapsulates a callback function and + * data associated with the snapshot for a given tracing instance. + * + * When a snapshot is taken conditionally, by invoking + * tracing_snapshot_cond(tr, cond_data), the cond_data passed in is + * passed in turn to the cond_snapshot.update() function. That data + * can be compared by the update() implementation with the cond_data + * contained within the struct cond_snapshot instance associated with + * the trace_array. Because the tr->max_lock is held throughout the + * update() call, the update() function can directly retrieve the + * cond_snapshot and cond_data associated with the per-instance + * snapshot associated with the trace_array. + * + * The cond_snapshot.update() implementation can save data to be + * associated with the snapshot if it decides to, and returns 'true' + * in that case, or it returns 'false' if the conditional snapshot + * shouldn't be taken. + * + * The cond_snapshot instance is created and associated with the + * user-defined cond_data by tracing_cond_snapshot_enable(). + * Likewise, the cond_snapshot instance is destroyed and is no longer + * associated with the trace instance by + * tracing_cond_snapshot_disable(). + * + * The method below is required. 
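+ *
+ * As a rough sketch (the names below are illustrative only and not part
+ * of this file), an update() callback that takes a snapshot once a
+ * caller-supplied threshold stored in cond_data is crossed could be:
+ *
+ *	static bool threshold_update(struct trace_array *tr, void *cond_data)
+ *	{
+ *		unsigned long *threshold = cond_data;
+ *
+ *		return read_my_metric() > *threshold;
+ *	}
+ *
+ * where read_my_metric() stands in for whatever measurement the caller
+ * wants to gate the snapshot on.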
+ * + * @update: When a conditional snapshot is invoked, the update() + * callback function is invoked with the tr->max_lock held. The + * update() implementation signals whether or not to actually + * take the snapshot, by returning 'true' if so, 'false' if no + * snapshot should be taken. Because the max_lock is held for + * the duration of update(), the implementation is safe to + * directly retrieved and save any implementation data it needs + * to in association with the snapshot. + */ +struct cond_snapshot { + void *cond_data; + cond_update_fn_t update; +}; + +/* + * struct trace_func_repeats - used to keep track of the consecutive + * (on the same CPU) calls of a single function. + */ +struct trace_func_repeats { + unsigned long ip; + unsigned long parent_ip; + unsigned long count; + u64 ts_last_call; +}; + +/* + * The trace array - an array of per-CPU trace arrays. This is the + * highest level data structure that individual tracers deal with. + * They have on/off state as well: + */ +struct trace_array { + struct list_head list; + char *name; + struct array_buffer array_buffer; +#ifdef CONFIG_TRACER_MAX_TRACE + /* + * The max_buffer is used to snapshot the trace when a maximum + * latency is reached, or when the user initiates a snapshot. + * Some tracers will use this to store a maximum trace while + * it continues examining live traces. + * + * The buffers for the max_buffer are set up the same as the array_buffer + * When a snapshot is taken, the buffer of the max_buffer is swapped + * with the buffer of the array_buffer and the buffers are reset for + * the array_buffer so the tracing can continue. + */ + struct array_buffer max_buffer; + bool allocated_snapshot; +#endif +#ifdef CONFIG_TRACER_MAX_TRACE + unsigned long max_latency; +#ifdef CONFIG_FSNOTIFY + struct dentry *d_max_latency; + struct work_struct fsnotify_work; + struct irq_work fsnotify_irqwork; +#endif +#endif + struct trace_pid_list __rcu *filtered_pids; + struct trace_pid_list __rcu *filtered_no_pids; + /* + * max_lock is used to protect the swapping of buffers + * when taking a max snapshot. The buffers themselves are + * protected by per_cpu spinlocks. But the action of the swap + * needs its own lock. + * + * This is defined as a arch_spinlock_t in order to help + * with performance when lockdep debugging is enabled. + * + * It is also used in other places outside the update_max_tr + * so it needs to be defined outside of the + * CONFIG_TRACER_MAX_TRACE. 
+ */ + arch_spinlock_t max_lock; + int buffer_disabled; +#ifdef CONFIG_FTRACE_SYSCALLS + int sys_refcount_enter; + int sys_refcount_exit; + struct trace_event_file __rcu *enter_syscall_files[NR_syscalls]; + struct trace_event_file __rcu *exit_syscall_files[NR_syscalls]; +#endif + int stop_count; + int clock_id; + int nr_topts; + bool clear_trace; + int buffer_percent; + unsigned int n_err_log_entries; + struct tracer *current_trace; + unsigned int trace_flags; + unsigned char trace_flags_index[TRACE_FLAGS_MAX_SIZE]; + unsigned int flags; + raw_spinlock_t start_lock; + struct list_head err_log; + struct dentry *dir; + struct dentry *options; + struct dentry *percpu_dir; + struct dentry *event_dir; + struct trace_options *topts; + struct list_head systems; + struct list_head events; + struct trace_event_file *trace_marker_file; + cpumask_var_t tracing_cpumask; /* only trace on set CPUs */ + /* one per_cpu trace_pipe can be opened by only one user */ + cpumask_var_t pipe_cpumask; + int ref; + int trace_ref; +#ifdef CONFIG_FUNCTION_TRACER + struct ftrace_ops *ops; + struct trace_pid_list __rcu *function_pids; + struct trace_pid_list __rcu *function_no_pids; +#ifdef CONFIG_DYNAMIC_FTRACE + /* All of these are protected by the ftrace_lock */ + struct list_head func_probes; + struct list_head mod_trace; + struct list_head mod_notrace; +#endif + /* function tracing enabled */ + int function_enabled; +#endif + int no_filter_buffering_ref; + struct list_head hist_vars; +#ifdef CONFIG_TRACER_SNAPSHOT + struct cond_snapshot *cond_snapshot; +#endif + struct trace_func_repeats __percpu *last_func_repeats; +}; + +enum { + TRACE_ARRAY_FL_GLOBAL = (1 << 0) +}; + +extern struct list_head ftrace_trace_arrays; + +extern struct mutex trace_types_lock; + +extern int trace_array_get(struct trace_array *tr); +extern int tracing_check_open_get_tr(struct trace_array *tr); +extern struct trace_array *trace_array_find(const char *instance); +extern struct trace_array *trace_array_find_get(const char *instance); + +extern u64 tracing_event_time_stamp(struct trace_buffer *buffer, struct ring_buffer_event *rbe); +extern int tracing_set_filter_buffering(struct trace_array *tr, bool set); +extern int tracing_set_clock(struct trace_array *tr, const char *clockstr); + +extern bool trace_clock_in_ns(struct trace_array *tr); + +/* + * The global tracer (top) should be the first trace array added, + * but we check the flag anyway. + */ +static inline struct trace_array *top_trace_array(void) +{ + struct trace_array *tr; + + if (list_empty(&ftrace_trace_arrays)) + return NULL; + + tr = list_entry(ftrace_trace_arrays.prev, + typeof(*tr), list); + WARN_ON(!(tr->flags & TRACE_ARRAY_FL_GLOBAL)); + return tr; +} + +#define FTRACE_CMP_TYPE(var, type) \ + __builtin_types_compatible_p(typeof(var), type *) + +#undef IF_ASSIGN +#define IF_ASSIGN(var, entry, etype, id) \ + if (FTRACE_CMP_TYPE(var, etype)) { \ + var = (typeof(var))(entry); \ + WARN_ON(id != 0 && (entry)->type != id); \ + break; \ + } + +/* Will cause compile errors if type is not found. */ +extern void __ftrace_bad_type(void); + +/* + * The trace_assign_type is a verifier that the entry type is + * the same as the type being assigned. To add new types simply + * add a line with the following format: + * + * IF_ASSIGN(var, ent, type, id); + * + * Where "type" is the trace type that includes the trace_entry + * as the "ent" item. And "id" is the trace identifier that is + * used in the trace_type enum. + * + * If the type can have more than one id, then use zero. 
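+ *
+ * A typical use in a print handler looks roughly like:
+ *
+ *	struct ftrace_entry *field;
+ *
+ *	trace_assign_type(field, iter->ent);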
+ */ +#define trace_assign_type(var, ent) \ + do { \ + IF_ASSIGN(var, ent, struct ftrace_entry, TRACE_FN); \ + IF_ASSIGN(var, ent, struct ctx_switch_entry, 0); \ + IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK); \ + IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\ + IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ + IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \ + IF_ASSIGN(var, ent, struct bputs_entry, TRACE_BPUTS); \ + IF_ASSIGN(var, ent, struct hwlat_entry, TRACE_HWLAT); \ + IF_ASSIGN(var, ent, struct osnoise_entry, TRACE_OSNOISE);\ + IF_ASSIGN(var, ent, struct timerlat_entry, TRACE_TIMERLAT);\ + IF_ASSIGN(var, ent, struct raw_data_entry, TRACE_RAW_DATA);\ + IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ + TRACE_MMIO_RW); \ + IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ + TRACE_MMIO_MAP); \ + IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \ + IF_ASSIGN(var, ent, struct ftrace_graph_ent_entry, \ + TRACE_GRAPH_ENT); \ + IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ + TRACE_GRAPH_RET); \ + IF_ASSIGN(var, ent, struct func_repeats_entry, \ + TRACE_FUNC_REPEATS); \ + __ftrace_bad_type(); \ + } while (0) + +/* + * An option specific to a tracer. This is a boolean value. + * The bit is the bit index that sets its value on the + * flags value in struct tracer_flags. + */ +struct tracer_opt { + const char *name; /* Will appear on the trace_options file */ + u32 bit; /* Mask assigned in val field in tracer_flags */ +}; + +/* + * The set of specific options for a tracer. Your tracer + * have to set the initial value of the flags val. + */ +struct tracer_flags { + u32 val; + struct tracer_opt *opts; + struct tracer *trace; +}; + +/* Makes more easy to define a tracer opt */ +#define TRACER_OPT(s, b) .name = #s, .bit = b + + +struct trace_option_dentry { + struct tracer_opt *opt; + struct tracer_flags *flags; + struct trace_array *tr; + struct dentry *entry; +}; + +/** + * struct tracer - a specific tracer and its callbacks to interact with tracefs + * @name: the name chosen to select it on the available_tracers file + * @init: called when one switches to this tracer (echo name > current_tracer) + * @reset: called when one switches to another tracer + * @start: called when tracing is unpaused (echo 1 > tracing_on) + * @stop: called when tracing is paused (echo 0 > tracing_on) + * @update_thresh: called when tracing_thresh is updated + * @open: called when the trace file is opened + * @pipe_open: called when the trace_pipe file is opened + * @close: called when the trace file is released + * @pipe_close: called when the trace_pipe file is released + * @read: override the default read callback on trace_pipe + * @splice_read: override the default splice_read callback on trace_pipe + * @selftest: selftest to run on boot (see trace_selftest.c) + * @print_headers: override the first lines that describe your columns + * @print_line: callback that prints a trace + * @set_flag: signals one of your private flags changed (trace_options file) + * @flags: your private flags + */ +struct tracer { + const char *name; + int (*init)(struct trace_array *tr); + void (*reset)(struct trace_array *tr); + void (*start)(struct trace_array *tr); + void (*stop)(struct trace_array *tr); + int (*update_thresh)(struct trace_array *tr); + void (*open)(struct trace_iterator *iter); + void (*pipe_open)(struct trace_iterator *iter); + void (*close)(struct trace_iterator *iter); + void (*pipe_close)(struct trace_iterator *iter); + ssize_t (*read)(struct trace_iterator 
*iter, + struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos); + ssize_t (*splice_read)(struct trace_iterator *iter, + struct file *filp, + loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, + unsigned int flags); +#ifdef CONFIG_FTRACE_STARTUP_TEST + int (*selftest)(struct tracer *trace, + struct trace_array *tr); +#endif + void (*print_header)(struct seq_file *m); + enum print_line_t (*print_line)(struct trace_iterator *iter); + /* If you handled the flag setting, return 0 */ + int (*set_flag)(struct trace_array *tr, + u32 old_flags, u32 bit, int set); + /* Return 0 if OK with change, else return non-zero */ + int (*flag_changed)(struct trace_array *tr, + u32 mask, int set); + struct tracer *next; + struct tracer_flags *flags; + int enabled; + bool print_max; + bool allow_instances; +#ifdef CONFIG_TRACER_MAX_TRACE + bool use_max_tr; +#endif + /* True if tracer cannot be enabled in kernel param */ + bool noboot; +}; + +static inline struct ring_buffer_iter * +trace_buffer_iter(struct trace_iterator *iter, int cpu) +{ + return iter->buffer_iter ? iter->buffer_iter[cpu] : NULL; +} + +int tracer_init(struct tracer *t, struct trace_array *tr); +int tracing_is_enabled(void); +void tracing_reset_online_cpus(struct array_buffer *buf); +void tracing_reset_all_online_cpus(void); +void tracing_reset_all_online_cpus_unlocked(void); +int tracing_open_generic(struct inode *inode, struct file *filp); +int tracing_open_generic_tr(struct inode *inode, struct file *filp); +int tracing_open_file_tr(struct inode *inode, struct file *filp); +int tracing_release_file_tr(struct inode *inode, struct file *filp); +int tracing_single_release_file_tr(struct inode *inode, struct file *filp); +bool tracing_is_disabled(void); +bool tracer_tracing_is_on(struct trace_array *tr); +void tracer_tracing_on(struct trace_array *tr); +void tracer_tracing_off(struct trace_array *tr); +struct dentry *trace_create_file(const char *name, + umode_t mode, + struct dentry *parent, + void *data, + const struct file_operations *fops); + +int tracing_init_dentry(void); + +struct ring_buffer_event; + +struct ring_buffer_event * +trace_buffer_lock_reserve(struct trace_buffer *buffer, + int type, + unsigned long len, + unsigned int trace_ctx); + +struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, + struct trace_array_cpu *data); + +struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, + int *ent_cpu, u64 *ent_ts); + +void trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer, + struct ring_buffer_event *event); + +bool trace_is_tracepoint_string(const char *str); +const char *trace_event_format(struct trace_iterator *iter, const char *fmt); +void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, + va_list ap) __printf(2, 0); +char *trace_iter_expand_format(struct trace_iterator *iter); + +int trace_empty(struct trace_iterator *iter); + +void *trace_find_next_entry_inc(struct trace_iterator *iter); + +void trace_init_global_iter(struct trace_iterator *iter); + +void tracing_iter_reset(struct trace_iterator *iter, int cpu); + +unsigned long trace_total_entries_cpu(struct trace_array *tr, int cpu); +unsigned long trace_total_entries(struct trace_array *tr); + +void trace_function(struct trace_array *tr, + unsigned long ip, + unsigned long parent_ip, + unsigned int trace_ctx); +void trace_graph_function(struct trace_array *tr, + unsigned long ip, + unsigned long parent_ip, + unsigned int trace_ctx); +void trace_latency_header(struct seq_file *m); +void 
trace_default_header(struct seq_file *m); +void print_trace_header(struct seq_file *m, struct trace_iterator *iter); + +void trace_graph_return(struct ftrace_graph_ret *trace); +int trace_graph_entry(struct ftrace_graph_ent *trace); +void set_graph_array(struct trace_array *tr); + +void tracing_start_cmdline_record(void); +void tracing_stop_cmdline_record(void); +void tracing_start_tgid_record(void); +void tracing_stop_tgid_record(void); + +int register_tracer(struct tracer *type); +int is_tracing_stopped(void); + +loff_t tracing_lseek(struct file *file, loff_t offset, int whence); + +extern cpumask_var_t __read_mostly tracing_buffer_mask; + +#define for_each_tracing_cpu(cpu) \ + for_each_cpu(cpu, tracing_buffer_mask) + +extern unsigned long nsecs_to_usecs(unsigned long nsecs); + +extern unsigned long tracing_thresh; + +/* PID filtering */ + +extern int pid_max; + +bool trace_find_filtered_pid(struct trace_pid_list *filtered_pids, + pid_t search_pid); +bool trace_ignore_this_task(struct trace_pid_list *filtered_pids, + struct trace_pid_list *filtered_no_pids, + struct task_struct *task); +void trace_filter_add_remove_task(struct trace_pid_list *pid_list, + struct task_struct *self, + struct task_struct *task); +void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos); +void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos); +int trace_pid_show(struct seq_file *m, void *v); +int trace_pid_write(struct trace_pid_list *filtered_pids, + struct trace_pid_list **new_pid_list, + const char __user *ubuf, size_t cnt); + +#ifdef CONFIG_TRACER_MAX_TRACE +void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu, + void *cond_data); +void update_max_tr_single(struct trace_array *tr, + struct task_struct *tsk, int cpu); + +#ifdef CONFIG_FSNOTIFY +#define LATENCY_FS_NOTIFY +#endif +#endif /* CONFIG_TRACER_MAX_TRACE */ + +#ifdef LATENCY_FS_NOTIFY +void latency_fsnotify(struct trace_array *tr); +#else +static inline void latency_fsnotify(struct trace_array *tr) { } +#endif + +#ifdef CONFIG_STACKTRACE +void __trace_stack(struct trace_array *tr, unsigned int trace_ctx, int skip); +#else +static inline void __trace_stack(struct trace_array *tr, unsigned int trace_ctx, + int skip) +{ +} +#endif /* CONFIG_STACKTRACE */ + +void trace_last_func_repeats(struct trace_array *tr, + struct trace_func_repeats *last_info, + unsigned int trace_ctx); + +extern u64 ftrace_now(int cpu); + +extern void trace_find_cmdline(int pid, char comm[]); +extern int trace_find_tgid(int pid); +extern void trace_event_follow_fork(struct trace_array *tr, bool enable); + +#ifdef CONFIG_DYNAMIC_FTRACE +extern unsigned long ftrace_update_tot_cnt; +extern unsigned long ftrace_number_of_pages; +extern unsigned long ftrace_number_of_groups; +void ftrace_init_trace_array(struct trace_array *tr); +#else +static inline void ftrace_init_trace_array(struct trace_array *tr) { } +#endif +#define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func +extern int DYN_FTRACE_TEST_NAME(void); +#define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2 +extern int DYN_FTRACE_TEST_NAME2(void); + +extern bool ring_buffer_expanded; +extern bool tracing_selftest_disabled; + +#ifdef CONFIG_FTRACE_STARTUP_TEST +extern void __init disable_tracing_selftest(const char *reason); + +extern int trace_selftest_startup_function(struct tracer *trace, + struct trace_array *tr); +extern int trace_selftest_startup_function_graph(struct tracer *trace, + struct trace_array *tr); +extern int 
trace_selftest_startup_irqsoff(struct tracer *trace, + struct trace_array *tr); +extern int trace_selftest_startup_preemptoff(struct tracer *trace, + struct trace_array *tr); +extern int trace_selftest_startup_preemptirqsoff(struct tracer *trace, + struct trace_array *tr); +extern int trace_selftest_startup_wakeup(struct tracer *trace, + struct trace_array *tr); +extern int trace_selftest_startup_nop(struct tracer *trace, + struct trace_array *tr); +extern int trace_selftest_startup_branch(struct tracer *trace, + struct trace_array *tr); +/* + * Tracer data references selftest functions that only occur + * on boot up. These can be __init functions. Thus, when selftests + * are enabled, then the tracers need to reference __init functions. + */ +#define __tracer_data __refdata +#else +static inline void __init disable_tracing_selftest(const char *reason) +{ +} +/* Tracers are seldom changed. Optimize when selftests are disabled. */ +#define __tracer_data __read_mostly +#endif /* CONFIG_FTRACE_STARTUP_TEST */ + +extern void *head_page(struct trace_array_cpu *data); +extern unsigned long long ns2usecs(u64 nsec); +extern int +trace_vbprintk(unsigned long ip, const char *fmt, va_list args); +extern int +trace_vprintk(unsigned long ip, const char *fmt, va_list args); +extern int +trace_array_vprintk(struct trace_array *tr, + unsigned long ip, const char *fmt, va_list args); +int trace_array_printk_buf(struct trace_buffer *buffer, + unsigned long ip, const char *fmt, ...); +void trace_printk_seq(struct trace_seq *s); +enum print_line_t print_trace_line(struct trace_iterator *iter); + +extern char trace_find_mark(unsigned long long duration); + +struct ftrace_hash; + +struct ftrace_mod_load { + struct list_head list; + char *func; + char *module; + int enable; +}; + +enum { + FTRACE_HASH_FL_MOD = (1 << 0), +}; + +struct ftrace_hash { + unsigned long size_bits; + struct hlist_head *buckets; + unsigned long count; + unsigned long flags; + struct rcu_head rcu; +}; + +struct ftrace_func_entry * +ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip); + +static __always_inline bool ftrace_hash_empty(struct ftrace_hash *hash) +{ + return !hash || !(hash->count || (hash->flags & FTRACE_HASH_FL_MOD)); +} + +/* Standard output formatting function used for function return traces */ +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + +/* Flag options */ +#define TRACE_GRAPH_PRINT_OVERRUN 0x1 +#define TRACE_GRAPH_PRINT_CPU 0x2 +#define TRACE_GRAPH_PRINT_OVERHEAD 0x4 +#define TRACE_GRAPH_PRINT_PROC 0x8 +#define TRACE_GRAPH_PRINT_DURATION 0x10 +#define TRACE_GRAPH_PRINT_ABS_TIME 0x20 +#define TRACE_GRAPH_PRINT_REL_TIME 0x40 +#define TRACE_GRAPH_PRINT_IRQS 0x80 +#define TRACE_GRAPH_PRINT_TAIL 0x100 +#define TRACE_GRAPH_SLEEP_TIME 0x200 +#define TRACE_GRAPH_GRAPH_TIME 0x400 +#define TRACE_GRAPH_PRINT_RETVAL 0x800 +#define TRACE_GRAPH_PRINT_RETVAL_HEX 0x1000 +#define TRACE_GRAPH_PRINT_FILL_SHIFT 28 +#define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT) + +extern void ftrace_graph_sleep_time_control(bool enable); + +#ifdef CONFIG_FUNCTION_PROFILER +extern void ftrace_graph_graph_time_control(bool enable); +#else +static inline void ftrace_graph_graph_time_control(bool enable) { } +#endif + +extern enum print_line_t +print_graph_function_flags(struct trace_iterator *iter, u32 flags); +extern void print_graph_headers_flags(struct seq_file *s, u32 flags); +extern void +trace_print_graph_duration(unsigned long long duration, struct trace_seq *s); +extern void graph_trace_open(struct trace_iterator *iter); 
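+/*
+ * graph_trace_open/close and the __trace_graph_entry/return helpers below
+ * are also reused by the irqsoff and wakeup latency tracers when their
+ * display-graph option is set.
+ */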
+extern void graph_trace_close(struct trace_iterator *iter); +extern int __trace_graph_entry(struct trace_array *tr, + struct ftrace_graph_ent *trace, + unsigned int trace_ctx); +extern void __trace_graph_return(struct trace_array *tr, + struct ftrace_graph_ret *trace, + unsigned int trace_ctx); + +#ifdef CONFIG_DYNAMIC_FTRACE +extern struct ftrace_hash __rcu *ftrace_graph_hash; +extern struct ftrace_hash __rcu *ftrace_graph_notrace_hash; + +static inline int ftrace_graph_addr(struct ftrace_graph_ent *trace) +{ + unsigned long addr = trace->func; + int ret = 0; + struct ftrace_hash *hash; + + preempt_disable_notrace(); + + /* + * Have to open code "rcu_dereference_sched()" because the + * function graph tracer can be called when RCU is not + * "watching". + * Protected with schedule_on_each_cpu(ftrace_sync) + */ + hash = rcu_dereference_protected(ftrace_graph_hash, !preemptible()); + + if (ftrace_hash_empty(hash)) { + ret = 1; + goto out; + } + + if (ftrace_lookup_ip(hash, addr)) { + + /* + * This needs to be cleared on the return functions + * when the depth is zero. + */ + trace_recursion_set(TRACE_GRAPH_BIT); + trace_recursion_set_depth(trace->depth); + + /* + * If no irqs are to be traced, but a set_graph_function + * is set, and called by an interrupt handler, we still + * want to trace it. + */ + if (in_hardirq()) + trace_recursion_set(TRACE_IRQ_BIT); + else + trace_recursion_clear(TRACE_IRQ_BIT); + ret = 1; + } + +out: + preempt_enable_notrace(); + return ret; +} + +static inline void ftrace_graph_addr_finish(struct ftrace_graph_ret *trace) +{ + if (trace_recursion_test(TRACE_GRAPH_BIT) && + trace->depth == trace_recursion_depth()) + trace_recursion_clear(TRACE_GRAPH_BIT); +} + +static inline int ftrace_graph_notrace_addr(unsigned long addr) +{ + int ret = 0; + struct ftrace_hash *notrace_hash; + + preempt_disable_notrace(); + + /* + * Have to open code "rcu_dereference_sched()" because the + * function graph tracer can be called when RCU is not + * "watching". + * Protected with schedule_on_each_cpu(ftrace_sync) + */ + notrace_hash = rcu_dereference_protected(ftrace_graph_notrace_hash, + !preemptible()); + + if (ftrace_lookup_ip(notrace_hash, addr)) + ret = 1; + + preempt_enable_notrace(); + return ret; +} +#else +static inline int ftrace_graph_addr(struct ftrace_graph_ent *trace) +{ + return 1; +} + +static inline int ftrace_graph_notrace_addr(unsigned long addr) +{ + return 0; +} +static inline void ftrace_graph_addr_finish(struct ftrace_graph_ret *trace) +{ } +#endif /* CONFIG_DYNAMIC_FTRACE */ + +extern unsigned int fgraph_max_depth; + +static inline bool ftrace_graph_ignore_func(struct ftrace_graph_ent *trace) +{ + /* trace it when it is-nested-in or is a function enabled. 
*/ + return !(trace_recursion_test(TRACE_GRAPH_BIT) || + ftrace_graph_addr(trace)) || + (trace->depth < 0) || + (fgraph_max_depth && trace->depth >= fgraph_max_depth); +} + +#else /* CONFIG_FUNCTION_GRAPH_TRACER */ +static inline enum print_line_t +print_graph_function_flags(struct trace_iterator *iter, u32 flags) +{ + return TRACE_TYPE_UNHANDLED; +} +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + +extern struct list_head ftrace_pids; + +#ifdef CONFIG_FUNCTION_TRACER + +#define FTRACE_PID_IGNORE -1 +#define FTRACE_PID_TRACE -2 + +struct ftrace_func_command { + struct list_head list; + char *name; + int (*func)(struct trace_array *tr, + struct ftrace_hash *hash, + char *func, char *cmd, + char *params, int enable); +}; +extern bool ftrace_filter_param __initdata; +static inline int ftrace_trace_task(struct trace_array *tr) +{ + return this_cpu_read(tr->array_buffer.data->ftrace_ignore_pid) != + FTRACE_PID_IGNORE; +} +extern int ftrace_is_dead(void); +int ftrace_create_function_files(struct trace_array *tr, + struct dentry *parent); +void ftrace_destroy_function_files(struct trace_array *tr); +int ftrace_allocate_ftrace_ops(struct trace_array *tr); +void ftrace_free_ftrace_ops(struct trace_array *tr); +void ftrace_init_global_array_ops(struct trace_array *tr); +void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func); +void ftrace_reset_array_ops(struct trace_array *tr); +void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer); +void ftrace_init_tracefs_toplevel(struct trace_array *tr, + struct dentry *d_tracer); +void ftrace_clear_pids(struct trace_array *tr); +int init_function_trace(void); +void ftrace_pid_follow_fork(struct trace_array *tr, bool enable); +#else +static inline int ftrace_trace_task(struct trace_array *tr) +{ + return 1; +} +static inline int ftrace_is_dead(void) { return 0; } +static inline int +ftrace_create_function_files(struct trace_array *tr, + struct dentry *parent) +{ + return 0; +} +static inline int ftrace_allocate_ftrace_ops(struct trace_array *tr) +{ + return 0; +} +static inline void ftrace_free_ftrace_ops(struct trace_array *tr) { } +static inline void ftrace_destroy_function_files(struct trace_array *tr) { } +static inline __init void +ftrace_init_global_array_ops(struct trace_array *tr) { } +static inline void ftrace_reset_array_ops(struct trace_array *tr) { } +static inline void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d) { } +static inline void ftrace_init_tracefs_toplevel(struct trace_array *tr, struct dentry *d) { } +static inline void ftrace_clear_pids(struct trace_array *tr) { } +static inline int init_function_trace(void) { return 0; } +static inline void ftrace_pid_follow_fork(struct trace_array *tr, bool enable) { } +/* ftace_func_t type is not defined, use macro instead of static inline */ +#define ftrace_init_array_ops(tr, func) do { } while (0) +#endif /* CONFIG_FUNCTION_TRACER */ + +#if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE) + +struct ftrace_probe_ops { + void (*func)(unsigned long ip, + unsigned long parent_ip, + struct trace_array *tr, + struct ftrace_probe_ops *ops, + void *data); + int (*init)(struct ftrace_probe_ops *ops, + struct trace_array *tr, + unsigned long ip, void *init_data, + void **data); + void (*free)(struct ftrace_probe_ops *ops, + struct trace_array *tr, + unsigned long ip, void *data); + int (*print)(struct seq_file *m, + unsigned long ip, + struct ftrace_probe_ops *ops, + void *data); +}; + +struct ftrace_func_mapper; +typedef int 
(*ftrace_mapper_func)(void *data); + +struct ftrace_func_mapper *allocate_ftrace_func_mapper(void); +void **ftrace_func_mapper_find_ip(struct ftrace_func_mapper *mapper, + unsigned long ip); +int ftrace_func_mapper_add_ip(struct ftrace_func_mapper *mapper, + unsigned long ip, void *data); +void *ftrace_func_mapper_remove_ip(struct ftrace_func_mapper *mapper, + unsigned long ip); +void free_ftrace_func_mapper(struct ftrace_func_mapper *mapper, + ftrace_mapper_func free_func); + +extern int +register_ftrace_function_probe(char *glob, struct trace_array *tr, + struct ftrace_probe_ops *ops, void *data); +extern int +unregister_ftrace_function_probe_func(char *glob, struct trace_array *tr, + struct ftrace_probe_ops *ops); +extern void clear_ftrace_function_probes(struct trace_array *tr); + +int register_ftrace_command(struct ftrace_func_command *cmd); +int unregister_ftrace_command(struct ftrace_func_command *cmd); + +void ftrace_create_filter_files(struct ftrace_ops *ops, + struct dentry *parent); +void ftrace_destroy_filter_files(struct ftrace_ops *ops); + +extern int ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf, + int len, int reset); +extern int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, + int len, int reset); +#else +struct ftrace_func_command; + +static inline __init int register_ftrace_command(struct ftrace_func_command *cmd) +{ + return -EINVAL; +} +static inline __init int unregister_ftrace_command(char *cmd_name) +{ + return -EINVAL; +} +static inline void clear_ftrace_function_probes(struct trace_array *tr) +{ +} + +/* + * The ops parameter passed in is usually undefined. + * This must be a macro. + */ +#define ftrace_create_filter_files(ops, parent) do { } while (0) +#define ftrace_destroy_filter_files(ops) do { } while (0) +#endif /* CONFIG_FUNCTION_TRACER && CONFIG_DYNAMIC_FTRACE */ + +bool ftrace_event_is_function(struct trace_event_call *call); + +/* + * struct trace_parser - serves for reading the user input separated by spaces + * @cont: set if the input is not complete - no final space char was found + * @buffer: holds the parsed user input + * @idx: user input length + * @size: buffer size + */ +struct trace_parser { + bool cont; + char *buffer; + unsigned idx; + unsigned size; +}; + +static inline bool trace_parser_loaded(struct trace_parser *parser) +{ + return (parser->idx != 0); +} + +static inline bool trace_parser_cont(struct trace_parser *parser) +{ + return parser->cont; +} + +static inline void trace_parser_clear(struct trace_parser *parser) +{ + parser->cont = false; + parser->idx = 0; +} + +extern int trace_parser_get_init(struct trace_parser *parser, int size); +extern void trace_parser_put(struct trace_parser *parser); +extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, + size_t cnt, loff_t *ppos); + +/* + * Only create function graph options if function graph is configured. 
+ */ +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +# define FGRAPH_FLAGS \ + C(DISPLAY_GRAPH, "display-graph"), +#else +# define FGRAPH_FLAGS +#endif + +#ifdef CONFIG_BRANCH_TRACER +# define BRANCH_FLAGS \ + C(BRANCH, "branch"), +#else +# define BRANCH_FLAGS +#endif + +#ifdef CONFIG_FUNCTION_TRACER +# define FUNCTION_FLAGS \ + C(FUNCTION, "function-trace"), \ + C(FUNC_FORK, "function-fork"), +# define FUNCTION_DEFAULT_FLAGS TRACE_ITER_FUNCTION +#else +# define FUNCTION_FLAGS +# define FUNCTION_DEFAULT_FLAGS 0UL +# define TRACE_ITER_FUNC_FORK 0UL +#endif + +#ifdef CONFIG_STACKTRACE +# define STACK_FLAGS \ + C(STACKTRACE, "stacktrace"), +#else +# define STACK_FLAGS +#endif + +/* + * trace_iterator_flags is an enumeration that defines bit + * positions into trace_flags that controls the output. + * + * NOTE: These bits must match the trace_options array in + * trace.c (this macro guarantees it). + */ +#define TRACE_FLAGS \ + C(PRINT_PARENT, "print-parent"), \ + C(SYM_OFFSET, "sym-offset"), \ + C(SYM_ADDR, "sym-addr"), \ + C(VERBOSE, "verbose"), \ + C(RAW, "raw"), \ + C(HEX, "hex"), \ + C(BIN, "bin"), \ + C(BLOCK, "block"), \ + C(FIELDS, "fields"), \ + C(PRINTK, "trace_printk"), \ + C(ANNOTATE, "annotate"), \ + C(USERSTACKTRACE, "userstacktrace"), \ + C(SYM_USEROBJ, "sym-userobj"), \ + C(PRINTK_MSGONLY, "printk-msg-only"), \ + C(CONTEXT_INFO, "context-info"), /* Print pid/cpu/time */ \ + C(LATENCY_FMT, "latency-format"), \ + C(RECORD_CMD, "record-cmd"), \ + C(RECORD_TGID, "record-tgid"), \ + C(OVERWRITE, "overwrite"), \ + C(STOP_ON_FREE, "disable_on_free"), \ + C(IRQ_INFO, "irq-info"), \ + C(MARKERS, "markers"), \ + C(EVENT_FORK, "event-fork"), \ + C(PAUSE_ON_TRACE, "pause-on-trace"), \ + C(HASH_PTR, "hash-ptr"), /* Print hashed pointer */ \ + FUNCTION_FLAGS \ + FGRAPH_FLAGS \ + STACK_FLAGS \ + BRANCH_FLAGS + +/* + * By defining C, we can make TRACE_FLAGS a list of bit names + * that will define the bits for the flag masks. + */ +#undef C +#define C(a, b) TRACE_ITER_##a##_BIT + +enum trace_iterator_bits { + TRACE_FLAGS + /* Make sure we don't go more than we have bits for */ + TRACE_ITER_LAST_BIT +}; + +/* + * By redefining C, we can make TRACE_FLAGS a list of masks that + * use the bits as defined above. + */ +#undef C +#define C(a, b) TRACE_ITER_##a = (1 << TRACE_ITER_##a##_BIT) + +enum trace_iterator_flags { TRACE_FLAGS }; + +/* + * TRACE_ITER_SYM_MASK masks the options in trace_flags that + * control the output of kernel symbols. 
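+ *
+ * Each of these TRACE_ITER_* masks is generated from the TRACE_FLAGS list
+ * by the C() macros above: for example, C(SYM_ADDR, "sym-addr") expands
+ * first to the TRACE_ITER_SYM_ADDR_BIT enumerator and then to
+ * TRACE_ITER_SYM_ADDR == (1 << TRACE_ITER_SYM_ADDR_BIT), one of the bits
+ * OR'd together in the mask below.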
+ */ +#define TRACE_ITER_SYM_MASK \ + (TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR) + +extern struct tracer nop_trace; + +#ifdef CONFIG_BRANCH_TRACER +extern int enable_branch_tracing(struct trace_array *tr); +extern void disable_branch_tracing(void); +static inline int trace_branch_enable(struct trace_array *tr) +{ + if (tr->trace_flags & TRACE_ITER_BRANCH) + return enable_branch_tracing(tr); + return 0; +} +static inline void trace_branch_disable(void) +{ + /* due to races, always disable */ + disable_branch_tracing(); +} +#else +static inline int trace_branch_enable(struct trace_array *tr) +{ + return 0; +} +static inline void trace_branch_disable(void) +{ +} +#endif /* CONFIG_BRANCH_TRACER */ + +/* set ring buffers to default size if not already done so */ +int tracing_update_buffers(void); + +union trace_synth_field { + u8 as_u8; + u16 as_u16; + u32 as_u32; + u64 as_u64; + struct trace_dynamic_info as_dynamic; +}; + +struct ftrace_event_field { + struct list_head link; + const char *name; + const char *type; + int filter_type; + int offset; + int size; + int is_signed; + int len; +}; + +struct prog_entry; + +struct event_filter { + struct prog_entry __rcu *prog; + char *filter_string; +}; + +struct event_subsystem { + struct list_head list; + const char *name; + struct event_filter *filter; + int ref_count; +}; + +struct trace_subsystem_dir { + struct list_head list; + struct event_subsystem *subsystem; + struct trace_array *tr; + struct eventfs_file *ef; + int ref_count; + int nr_events; +}; + +extern int call_filter_check_discard(struct trace_event_call *call, void *rec, + struct trace_buffer *buffer, + struct ring_buffer_event *event); + +void trace_buffer_unlock_commit_regs(struct trace_array *tr, + struct trace_buffer *buffer, + struct ring_buffer_event *event, + unsigned int trcace_ctx, + struct pt_regs *regs); + +static inline void trace_buffer_unlock_commit(struct trace_array *tr, + struct trace_buffer *buffer, + struct ring_buffer_event *event, + unsigned int trace_ctx) +{ + trace_buffer_unlock_commit_regs(tr, buffer, event, trace_ctx, NULL); +} + +DECLARE_PER_CPU(struct ring_buffer_event *, trace_buffered_event); +DECLARE_PER_CPU(int, trace_buffered_event_cnt); +void trace_buffered_event_disable(void); +void trace_buffered_event_enable(void); + +void early_enable_events(struct trace_array *tr, char *buf, bool disable_first); + +static inline void +__trace_event_discard_commit(struct trace_buffer *buffer, + struct ring_buffer_event *event) +{ + if (this_cpu_read(trace_buffered_event) == event) { + /* Simply release the temp buffer and enable preemption */ + this_cpu_dec(trace_buffered_event_cnt); + preempt_enable_notrace(); + return; + } + /* ring_buffer_discard_commit() enables preemption */ + ring_buffer_discard_commit(buffer, event); +} + +/* + * Helper function for event_trigger_unlock_commit{_regs}(). + * If there are event triggers attached to this event that requires + * filtering against its fields, then they will be called as the + * entry already holds the field information of the current event. + * + * It also checks if the event should be discarded or not. + * It is to be discarded if the event is soft disabled and the + * event was only recorded to process triggers, or if the event + * filter is active and this event did not match the filters. + * + * Returns true if the event is discarded, false otherwise. 
+ */ +static inline bool +__event_trigger_test_discard(struct trace_event_file *file, + struct trace_buffer *buffer, + struct ring_buffer_event *event, + void *entry, + enum event_trigger_type *tt) +{ + unsigned long eflags = file->flags; + + if (eflags & EVENT_FILE_FL_TRIGGER_COND) + *tt = event_triggers_call(file, buffer, entry, event); + + if (likely(!(file->flags & (EVENT_FILE_FL_SOFT_DISABLED | + EVENT_FILE_FL_FILTERED | + EVENT_FILE_FL_PID_FILTER)))) + return false; + + if (file->flags & EVENT_FILE_FL_SOFT_DISABLED) + goto discard; + + if (file->flags & EVENT_FILE_FL_FILTERED && + !filter_match_preds(file->filter, entry)) + goto discard; + + if ((file->flags & EVENT_FILE_FL_PID_FILTER) && + trace_event_ignore_this_pid(file)) + goto discard; + + return false; + discard: + __trace_event_discard_commit(buffer, event); + return true; +} + +/** + * event_trigger_unlock_commit - handle triggers and finish event commit + * @file: The file pointer associated with the event + * @buffer: The ring buffer that the event is being written to + * @event: The event meta data in the ring buffer + * @entry: The event itself + * @trace_ctx: The tracing context flags. + * + * This is a helper function to handle triggers that require data + * from the event itself. It also tests the event against filters and + * if the event is soft disabled and should be discarded. + */ +static inline void +event_trigger_unlock_commit(struct trace_event_file *file, + struct trace_buffer *buffer, + struct ring_buffer_event *event, + void *entry, unsigned int trace_ctx) +{ + enum event_trigger_type tt = ETT_NONE; + + if (!__event_trigger_test_discard(file, buffer, event, entry, &tt)) + trace_buffer_unlock_commit(file->tr, buffer, event, trace_ctx); + + if (tt) + event_triggers_post_call(file, tt); +} + +#define FILTER_PRED_INVALID ((unsigned short)-1) +#define FILTER_PRED_IS_RIGHT (1 << 15) +#define FILTER_PRED_FOLD (1 << 15) + +/* + * The max preds is the size of unsigned short with + * two flags at the MSBs. One bit is used for both the IS_RIGHT + * and FOLD flags. The other is reserved. + * + * 2^14 preds is way more than enough. 
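+ *
+ * (An unsigned short holds 16 bits; reserving the top two for the flags
+ * above leaves 1 << 14 == 16384 usable entries, which is the
+ * MAX_FILTER_PRED value defined below.)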
+ */ +#define MAX_FILTER_PRED 16384 + +struct filter_pred; +struct regex; + +typedef int (*regex_match_func)(char *str, struct regex *r, int len); + +enum regex_type { + MATCH_FULL = 0, + MATCH_FRONT_ONLY, + MATCH_MIDDLE_ONLY, + MATCH_END_ONLY, + MATCH_GLOB, + MATCH_INDEX, +}; + +struct regex { + char pattern[MAX_FILTER_STR_VAL]; + int len; + int field_len; + regex_match_func match; +}; + +static inline bool is_string_field(struct ftrace_event_field *field) +{ + return field->filter_type == FILTER_DYN_STRING || + field->filter_type == FILTER_RDYN_STRING || + field->filter_type == FILTER_STATIC_STRING || + field->filter_type == FILTER_PTR_STRING || + field->filter_type == FILTER_COMM; +} + +static inline bool is_function_field(struct ftrace_event_field *field) +{ + return field->filter_type == FILTER_TRACE_FN; +} + +extern enum regex_type +filter_parse_regex(char *buff, int len, char **search, int *not); +extern void print_event_filter(struct trace_event_file *file, + struct trace_seq *s); +extern int apply_event_filter(struct trace_event_file *file, + char *filter_string); +extern int apply_subsystem_event_filter(struct trace_subsystem_dir *dir, + char *filter_string); +extern void print_subsystem_event_filter(struct event_subsystem *system, + struct trace_seq *s); +extern int filter_assign_type(const char *type); +extern int create_event_filter(struct trace_array *tr, + struct trace_event_call *call, + char *filter_str, bool set_str, + struct event_filter **filterp); +extern void free_event_filter(struct event_filter *filter); + +struct ftrace_event_field * +trace_find_event_field(struct trace_event_call *call, char *name); + +extern void trace_event_enable_cmd_record(bool enable); +extern void trace_event_enable_tgid_record(bool enable); + +extern int event_trace_init(void); +extern int init_events(void); +extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr); +extern int event_trace_del_tracer(struct trace_array *tr); +extern void __trace_early_add_events(struct trace_array *tr); + +extern struct trace_event_file *__find_event_file(struct trace_array *tr, + const char *system, + const char *event); +extern struct trace_event_file *find_event_file(struct trace_array *tr, + const char *system, + const char *event); + +static inline void *event_file_data(struct file *filp) +{ + return READ_ONCE(file_inode(filp)->i_private); +} + +extern struct mutex event_mutex; +extern struct list_head ftrace_events; + +extern const struct file_operations event_trigger_fops; +extern const struct file_operations event_hist_fops; +extern const struct file_operations event_hist_debug_fops; +extern const struct file_operations event_inject_fops; + +#ifdef CONFIG_HIST_TRIGGERS +extern int register_trigger_hist_cmd(void); +extern int register_trigger_hist_enable_disable_cmds(void); +#else +static inline int register_trigger_hist_cmd(void) { return 0; } +static inline int register_trigger_hist_enable_disable_cmds(void) { return 0; } +#endif + +extern int register_trigger_cmds(void); +extern void clear_event_triggers(struct trace_array *tr); + +enum { + EVENT_TRIGGER_FL_PROBE = BIT(0), +}; + +struct event_trigger_data { + unsigned long count; + int ref; + int flags; + struct event_trigger_ops *ops; + struct event_command *cmd_ops; + struct event_filter __rcu *filter; + char *filter_str; + void *private_data; + bool paused; + bool paused_tmp; + struct list_head list; + char *name; + struct list_head named_list; + struct event_trigger_data *named_data; +}; + +/* Avoid typos */ +#define 
ENABLE_EVENT_STR "enable_event" +#define DISABLE_EVENT_STR "disable_event" +#define ENABLE_HIST_STR "enable_hist" +#define DISABLE_HIST_STR "disable_hist" + +struct enable_trigger_data { + struct trace_event_file *file; + bool enable; + bool hist; +}; + +extern int event_enable_trigger_print(struct seq_file *m, + struct event_trigger_data *data); +extern void event_enable_trigger_free(struct event_trigger_data *data); +extern int event_enable_trigger_parse(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, char *cmd, + char *param_and_filter); +extern int event_enable_register_trigger(char *glob, + struct event_trigger_data *data, + struct trace_event_file *file); +extern void event_enable_unregister_trigger(char *glob, + struct event_trigger_data *test, + struct trace_event_file *file); +extern void trigger_data_free(struct event_trigger_data *data); +extern int event_trigger_init(struct event_trigger_data *data); +extern int trace_event_trigger_enable_disable(struct trace_event_file *file, + int trigger_enable); +extern void update_cond_flag(struct trace_event_file *file); +extern int set_trigger_filter(char *filter_str, + struct event_trigger_data *trigger_data, + struct trace_event_file *file); +extern struct event_trigger_data *find_named_trigger(const char *name); +extern bool is_named_trigger(struct event_trigger_data *test); +extern int save_named_trigger(const char *name, + struct event_trigger_data *data); +extern void del_named_trigger(struct event_trigger_data *data); +extern void pause_named_trigger(struct event_trigger_data *data); +extern void unpause_named_trigger(struct event_trigger_data *data); +extern void set_named_trigger_data(struct event_trigger_data *data, + struct event_trigger_data *named_data); +extern struct event_trigger_data * +get_named_trigger_data(struct event_trigger_data *data); +extern int register_event_command(struct event_command *cmd); +extern int unregister_event_command(struct event_command *cmd); +extern int register_trigger_hist_enable_disable_cmds(void); +extern bool event_trigger_check_remove(const char *glob); +extern bool event_trigger_empty_param(const char *param); +extern int event_trigger_separate_filter(char *param_and_filter, char **param, + char **filter, bool param_required); +extern struct event_trigger_data * +event_trigger_alloc(struct event_command *cmd_ops, + char *cmd, + char *param, + void *private_data); +extern int event_trigger_parse_num(char *trigger, + struct event_trigger_data *trigger_data); +extern int event_trigger_set_filter(struct event_command *cmd_ops, + struct trace_event_file *file, + char *param, + struct event_trigger_data *trigger_data); +extern void event_trigger_reset_filter(struct event_command *cmd_ops, + struct event_trigger_data *trigger_data); +extern int event_trigger_register(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, + struct event_trigger_data *trigger_data); +extern void event_trigger_unregister(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, + struct event_trigger_data *trigger_data); + +extern void event_file_get(struct trace_event_file *file); +extern void event_file_put(struct trace_event_file *file); + +/** + * struct event_trigger_ops - callbacks for trace event triggers + * + * The methods in this structure provide per-event trigger hooks for + * various trigger operations. 
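+ *
+ * A minimal sketch of how the callbacks plug together (the my_* names are
+ * hypothetical and only illustrative):
+ *
+ *	static void my_trigger(struct event_trigger_data *data,
+ *			       struct trace_buffer *buffer, void *rec,
+ *			       struct ring_buffer_event *rbe)
+ *	{
+ *		... act on the triggering event ...
+ *	}
+ *
+ *	static struct event_trigger_ops my_trigger_ops = {
+ *		.trigger = my_trigger,
+ *		.print	 = my_trigger_print,
+ *	};
+ *
+ * The optional @init and @free callbacks would typically point at the
+ * generic helpers described below.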
+ * + * The @init and @free methods are used during trigger setup and + * teardown, typically called from an event_command's @parse() + * function implementation. + * + * The @print method is used to print the trigger spec. + * + * The @trigger method is the function that actually implements the + * trigger and is called in the context of the triggering event + * whenever that event occurs. + * + * All the methods below, except for @init() and @free(), must be + * implemented. + * + * @trigger: The trigger 'probe' function called when the triggering + * event occurs. The data passed into this callback is the data + * that was supplied to the event_command @reg() function that + * registered the trigger (see struct event_command) along with + * the trace record, rec. + * + * @init: An optional initialization function called for the trigger + * when the trigger is registered (via the event_command reg() + * function). This can be used to perform per-trigger + * initialization such as incrementing a per-trigger reference + * count, for instance. This is usually implemented by the + * generic utility function @event_trigger_init() (see + * trace_event_triggers.c). + * + * @free: An optional de-initialization function called for the + * trigger when the trigger is unregistered (via the + * event_command @reg() function). This can be used to perform + * per-trigger de-initialization such as decrementing a + * per-trigger reference count and freeing corresponding trigger + * data, for instance. This is usually implemented by the + * generic utility function @event_trigger_free() (see + * trace_event_triggers.c). + * + * @print: The callback function invoked to have the trigger print + * itself. This is usually implemented by a wrapper function + * that calls the generic utility function @event_trigger_print() + * (see trace_event_triggers.c). + */ +struct event_trigger_ops { + void (*trigger)(struct event_trigger_data *data, + struct trace_buffer *buffer, + void *rec, + struct ring_buffer_event *rbe); + int (*init)(struct event_trigger_data *data); + void (*free)(struct event_trigger_data *data); + int (*print)(struct seq_file *m, + struct event_trigger_data *data); +}; + +/** + * struct event_command - callbacks and data members for event commands + * + * Event commands are invoked by users by writing the command name + * into the 'trigger' file associated with a trace event. The + * parameters associated with a specific invocation of an event + * command are used to create an event trigger instance, which is + * added to the list of trigger instances associated with that trace + * event. When the event is hit, the set of triggers associated with + * that event is invoked. + * + * The data members in this structure provide per-event command data + * for various event commands. + * + * All the data members below, except for @post_trigger, must be set + * for each event command. + * + * @name: The unique name that identifies the event command. This is + * the name used when setting triggers via trigger files. + * + * @trigger_type: A unique id that identifies the event command + * 'type'. This value has two purposes, the first to ensure that + * only one trigger of the same type can be set at a given time + * for a particular event e.g. it doesn't make sense to have both + * a traceon and traceoff trigger attached to a single event at + * the same time, so traceon and traceoff have the same type + * though they have different names. 
The @trigger_type value is + * also used as a bit value for deferring the actual trigger + * action until after the current event is finished. Some + * commands need to do this if they themselves log to the trace + * buffer (see the @post_trigger() member below). @trigger_type + * values are defined by adding new values to the trigger_type + * enum in include/linux/trace_events.h. + * + * @flags: See the enum event_command_flags below. + * + * All the methods below, except for @set_filter() and @unreg_all(), + * must be implemented. + * + * @parse: The callback function responsible for parsing and + * registering the trigger written to the 'trigger' file by the + * user. It allocates the trigger instance and registers it with + * the appropriate trace event. It makes use of the other + * event_command callback functions to orchestrate this, and is + * usually implemented by the generic utility function + * @event_trigger_callback() (see trace_event_triggers.c). + * + * @reg: Adds the trigger to the list of triggers associated with the + * event, and enables the event trigger itself, after + * initializing it (via the event_trigger_ops @init() function). + * This is also where commands can use the @trigger_type value to + * make the decision as to whether or not multiple instances of + * the trigger should be allowed. This is usually implemented by + * the generic utility function @register_trigger() (see + * trace_event_triggers.c). + * + * @unreg: Removes the trigger from the list of triggers associated + * with the event, and disables the event trigger itself, after + * initializing it (via the event_trigger_ops @free() function). + * This is usually implemented by the generic utility function + * @unregister_trigger() (see trace_event_triggers.c). + * + * @unreg_all: An optional function called to remove all the triggers + * from the list of triggers associated with the event. Called + * when a trigger file is opened in truncate mode. + * + * @set_filter: An optional function called to parse and set a filter + * for the trigger. If no @set_filter() method is set for the + * event command, filters set by the user for the command will be + * ignored. This is usually implemented by the generic utility + * function @set_trigger_filter() (see trace_event_triggers.c). + * + * @get_trigger_ops: The callback function invoked to retrieve the + * event_trigger_ops implementation associated with the command. + * This callback function allows a single event_command to + * support multiple trigger implementations via different sets of + * event_trigger_ops, depending on the value of the @param + * string. + */ +struct event_command { + struct list_head list; + char *name; + enum event_trigger_type trigger_type; + int flags; + int (*parse)(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, char *cmd, + char *param_and_filter); + int (*reg)(char *glob, + struct event_trigger_data *data, + struct trace_event_file *file); + void (*unreg)(char *glob, + struct event_trigger_data *data, + struct trace_event_file *file); + void (*unreg_all)(struct trace_event_file *file); + int (*set_filter)(char *filter_str, + struct event_trigger_data *data, + struct trace_event_file *file); + struct event_trigger_ops *(*get_trigger_ops)(char *cmd, char *param); +}; + +/** + * enum event_command_flags - flags for struct event_command + * + * @POST_TRIGGER: A flag that says whether or not this command needs + * to have its action delayed until after the current event has + * been closed. 
Some triggers need to avoid being invoked while + * an event is currently in the process of being logged, since + * the trigger may itself log data into the trace buffer. Thus + * we make sure the current event is committed before invoking + * those triggers. To do that, the trigger invocation is split + * in two - the first part checks the filter using the current + * trace record; if a command has the @post_trigger flag set, it + * sets a bit for itself in the return value, otherwise it + * directly invokes the trigger. Once all commands have been + * either invoked or set their return flag, the current record is + * either committed or discarded. At that point, if any commands + * have deferred their triggers, those commands are finally + * invoked following the close of the current event. In other + * words, if the event_trigger_ops @func() probe implementation + * itself logs to the trace buffer, this flag should be set, + * otherwise it can be left unspecified. + * + * @NEEDS_REC: A flag that says whether or not this command needs + * access to the trace record in order to perform its function, + * regardless of whether or not it has a filter associated with + * it (filters make a trigger require access to the trace record + * but are not always present). + */ +enum event_command_flags { + EVENT_CMD_FL_POST_TRIGGER = 1, + EVENT_CMD_FL_NEEDS_REC = 2, +}; + +static inline bool event_command_post_trigger(struct event_command *cmd_ops) +{ + return cmd_ops->flags & EVENT_CMD_FL_POST_TRIGGER; +} + +static inline bool event_command_needs_rec(struct event_command *cmd_ops) +{ + return cmd_ops->flags & EVENT_CMD_FL_NEEDS_REC; +} + +extern int trace_event_enable_disable(struct trace_event_file *file, + int enable, int soft_disable); +extern int tracing_alloc_snapshot(void); +extern void tracing_snapshot_cond(struct trace_array *tr, void *cond_data); +extern int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, cond_update_fn_t update); + +extern int tracing_snapshot_cond_disable(struct trace_array *tr); +extern void *tracing_cond_snapshot_data(struct trace_array *tr); + +extern const char *__start___trace_bprintk_fmt[]; +extern const char *__stop___trace_bprintk_fmt[]; + +extern const char *__start___tracepoint_str[]; +extern const char *__stop___tracepoint_str[]; + +void trace_printk_control(bool enabled); +void trace_printk_start_comm(void); +int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set); +int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled); + +/* Used from boot time tracer */ +extern int trace_set_options(struct trace_array *tr, char *option); +extern int tracing_set_tracer(struct trace_array *tr, const char *buf); +extern ssize_t tracing_resize_ring_buffer(struct trace_array *tr, + unsigned long size, int cpu_id); +extern int tracing_set_cpumask(struct trace_array *tr, + cpumask_var_t tracing_cpumask_new); + + +#define MAX_EVENT_NAME_LEN 64 + +extern ssize_t trace_parse_run_command(struct file *file, + const char __user *buffer, size_t count, loff_t *ppos, + int (*createfn)(const char *)); + +extern unsigned int err_pos(char *cmd, const char *str); +extern void tracing_log_err(struct trace_array *tr, + const char *loc, const char *cmd, + const char **errs, u8 type, u16 pos); + +/* + * Normal trace_printk() and friends allocates special buffers + * to do the manipulation, as well as saves the print formats + * into sections to display. 
But the trace infrastructure wants + * to use these without the added overhead at the price of being + * a bit slower (used mainly for warnings, where we don't care + * about performance). The internal_trace_puts() is for such + * a purpose. + */ +#define internal_trace_puts(str) __trace_puts(_THIS_IP_, str, strlen(str)) + +#undef FTRACE_ENTRY +#define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \ + extern struct trace_event_call \ + __aligned(4) event_##call; +#undef FTRACE_ENTRY_DUP +#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \ + FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) +#undef FTRACE_ENTRY_PACKED +#define FTRACE_ENTRY_PACKED(call, struct_name, id, tstruct, print) \ + FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) + +#include "trace_entries.h" + +#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_FUNCTION_TRACER) +int perf_ftrace_event_register(struct trace_event_call *call, + enum trace_reg type, void *data); +#else +#define perf_ftrace_event_register NULL +#endif + +#ifdef CONFIG_FTRACE_SYSCALLS +void init_ftrace_syscalls(void); +const char *get_syscall_name(int syscall); +#else +static inline void init_ftrace_syscalls(void) { } +static inline const char *get_syscall_name(int syscall) +{ + return NULL; +} +#endif + +#ifdef CONFIG_EVENT_TRACING +void trace_event_init(void); +void trace_event_eval_update(struct trace_eval_map **map, int len); +/* Used from boot time tracer */ +extern int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set); +extern int trigger_process_regex(struct trace_event_file *file, char *buff); +#else +static inline void __init trace_event_init(void) { } +static inline void trace_event_eval_update(struct trace_eval_map **map, int len) { } +#endif + +#ifdef CONFIG_TRACER_SNAPSHOT +void tracing_snapshot_instance(struct trace_array *tr); +int tracing_alloc_snapshot_instance(struct trace_array *tr); +#else +static inline void tracing_snapshot_instance(struct trace_array *tr) { } +static inline int tracing_alloc_snapshot_instance(struct trace_array *tr) +{ + return 0; +} +#endif + +#ifdef CONFIG_PREEMPT_TRACER +void tracer_preempt_on(unsigned long a0, unsigned long a1); +void tracer_preempt_off(unsigned long a0, unsigned long a1); +#else +static inline void tracer_preempt_on(unsigned long a0, unsigned long a1) { } +static inline void tracer_preempt_off(unsigned long a0, unsigned long a1) { } +#endif +#ifdef CONFIG_IRQSOFF_TRACER +void tracer_hardirqs_on(unsigned long a0, unsigned long a1); +void tracer_hardirqs_off(unsigned long a0, unsigned long a1); +#else +static inline void tracer_hardirqs_on(unsigned long a0, unsigned long a1) { } +static inline void tracer_hardirqs_off(unsigned long a0, unsigned long a1) { } +#endif + +/* + * Reset the state of the trace_iterator so that it can read consumed data. + * Normally, the trace_iterator is used for reading the data when it is not + * consumed, and must retain state. 
+ */ +static __always_inline void trace_iterator_reset(struct trace_iterator *iter) +{ + memset_startat(iter, 0, seq); + iter->pos = -1; +} + +/* Check the name is good for event/group/fields */ +static inline bool __is_good_name(const char *name, bool hash_ok) +{ + if (!isalpha(*name) && *name != '_' && (!hash_ok || *name != '-')) + return false; + while (*++name != '\0') { + if (!isalpha(*name) && !isdigit(*name) && *name != '_' && + (!hash_ok || *name != '-')) + return false; + } + return true; +} + +/* Check the name is good for event/group/fields */ +static inline bool is_good_name(const char *name) +{ + return __is_good_name(name, false); +} + +/* Check the name is good for system */ +static inline bool is_good_system_name(const char *name) +{ + return __is_good_name(name, true); +} + +/* Convert certain expected symbols into '_' when generating event names */ +static inline void sanitize_event_name(char *name) +{ + while (*name++ != '\0') + if (*name == ':' || *name == '.') + *name = '_'; +} + +/* + * This is a generic way to read and write a u64 value from a file in tracefs. + * + * The value is stored on the variable pointed by *val. The value needs + * to be at least *min and at most *max. The write is protected by an + * existing *lock. + */ +struct trace_min_max_param { + struct mutex *lock; + u64 *val; + u64 *min; + u64 *max; +}; + +#define U64_STR_SIZE 24 /* 20 digits max */ + +extern const struct file_operations trace_min_max_fops; + +#ifdef CONFIG_RV +extern int rv_init_interface(void); +#else +static inline int rv_init_interface(void) +{ + return 0; +} +#endif + +#endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c new file mode 100644 index 0000000000..54d5fa35c9 --- /dev/null +++ b/kernel/trace/trace_benchmark.c @@ -0,0 +1,229 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/delay.h> +#include <linux/module.h> +#include <linux/kthread.h> +#include <linux/trace_clock.h> + +#define CREATE_TRACE_POINTS +#include "trace_benchmark.h" + +static struct task_struct *bm_event_thread; + +static char bm_str[BENCHMARK_EVENT_STRLEN] = "START"; + +static u64 bm_total; +static u64 bm_totalsq; +static u64 bm_last; +static u64 bm_max; +static u64 bm_min; +static u64 bm_first; +static u64 bm_cnt; +static u64 bm_stddev; +static unsigned int bm_avg; +static unsigned int bm_std; + +static bool ok_to_run; + +/* + * This gets called in a loop recording the time it took to write + * the tracepoint. What it writes is the time statistics of the last + * tracepoint write. As there is nothing to write the first time + * it simply writes "START". As the first write is cold cache and + * the rest is hot, we save off that time in bm_first and it is + * reported as "first", which is shown in the second write to the + * tracepoint. The "first" field is written within the statics from + * then on but never changes. + */ +static void trace_do_benchmark(void) +{ + u64 start; + u64 stop; + u64 delta; + u64 stddev; + u64 seed; + u64 last_seed; + unsigned int avg; + unsigned int std = 0; + + /* Only run if the tracepoint is actually active */ + if (!trace_benchmark_event_enabled() || !tracing_is_on()) + return; + + local_irq_disable(); + start = trace_clock_local(); + trace_benchmark_event(bm_str, bm_last); + stop = trace_clock_local(); + local_irq_enable(); + + bm_cnt++; + + delta = stop - start; + + /* + * The first read is cold cached, keep it separate from the + * other calculations. 
+ */ + if (bm_cnt == 1) { + bm_first = delta; + scnprintf(bm_str, BENCHMARK_EVENT_STRLEN, + "first=%llu [COLD CACHED]", bm_first); + return; + } + + bm_last = delta; + + if (delta > bm_max) + bm_max = delta; + if (!bm_min || delta < bm_min) + bm_min = delta; + + /* + * When bm_cnt is greater than UINT_MAX, it breaks the statistics + * accounting. Freeze the statistics when that happens. + * We should have enough data for the avg and stddev anyway. + */ + if (bm_cnt > UINT_MAX) { + scnprintf(bm_str, BENCHMARK_EVENT_STRLEN, + "last=%llu first=%llu max=%llu min=%llu ** avg=%u std=%d std^2=%lld", + bm_last, bm_first, bm_max, bm_min, bm_avg, bm_std, bm_stddev); + return; + } + + bm_total += delta; + bm_totalsq += delta * delta; + + + if (bm_cnt > 1) { + /* + * Compute the sample variance (the square of the standard + * deviation) from the running sums: + * s^2 = 1 / (n * (n-1)) * (n * \Sum (x_i)^2 - (\Sum x_i)^2) + */ + stddev = (u64)bm_cnt * bm_totalsq - bm_total * bm_total; + do_div(stddev, (u32)bm_cnt); + do_div(stddev, (u32)bm_cnt - 1); + } else + stddev = 0; + + delta = bm_total; + do_div(delta, bm_cnt); + avg = delta; + + if (stddev > 0) { + int i = 0; + /* + * stddev is the square of standard deviation but + * we want the actual number. Use the average + * as our seed to find the std. + * + * The next try is: + * x = (x + N/x) / 2 + * + * Where N is the squared number to find the square + * root of. + */ + seed = avg; + do { + last_seed = seed; + seed = stddev; + if (!last_seed) + break; + do_div(seed, last_seed); + seed += last_seed; + do_div(seed, 2); + } while (i++ < 10 && last_seed != seed); + + std = seed; + } + + scnprintf(bm_str, BENCHMARK_EVENT_STRLEN, + "last=%llu first=%llu max=%llu min=%llu avg=%u std=%d std^2=%lld", + bm_last, bm_first, bm_max, bm_min, avg, std, stddev); + + bm_std = std; + bm_avg = avg; + bm_stddev = stddev; +} + +static int benchmark_event_kthread(void *arg) +{ + /* sleep a bit to make sure the tracepoint gets activated */ + msleep(100); + + while (!kthread_should_stop()) { + + trace_do_benchmark(); + + /* + * We don't go to sleep, but let others run as well. + * This is basically a "yield()" to let any task that + * wants to run, schedule in, but if the CPU is idle, + * we'll keep burning cycles. + * + * Note the tasks_rcu_qs() version of cond_resched() will + * notify synchronize_rcu_tasks() that this thread has + * passed a quiescent state for rcu_tasks. Otherwise + * this thread will never voluntarily schedule which would + * block synchronize_rcu_tasks() indefinitely. + */ + cond_resched_tasks_rcu_qs(); + } + + return 0; +} + +/* + * When the benchmark tracepoint is enabled, it calls this + * function and the thread that calls the tracepoint is created. + */ +int trace_benchmark_reg(void) +{ + if (!ok_to_run) { + pr_warn("trace benchmark cannot be started via kernel command line\n"); + return -EBUSY; + } + + bm_event_thread = kthread_run(benchmark_event_kthread, + NULL, "event_benchmark"); + if (IS_ERR(bm_event_thread)) { + pr_warn("trace benchmark failed to create kernel thread\n"); + return PTR_ERR(bm_event_thread); + } + + return 0; +} + +/* + * When the benchmark tracepoint is disabled, it calls this + * function and the thread that calls the tracepoint is deleted + * and all the numbers are reset. 
+ */ +void trace_benchmark_unreg(void) +{ + if (!bm_event_thread) + return; + + kthread_stop(bm_event_thread); + bm_event_thread = NULL; + + strcpy(bm_str, "START"); + bm_total = 0; + bm_totalsq = 0; + bm_last = 0; + bm_max = 0; + bm_min = 0; + bm_cnt = 0; + /* These don't need to be reset but reset them anyway */ + bm_first = 0; + bm_std = 0; + bm_avg = 0; + bm_stddev = 0; +} + +static __init int ok_to_run_trace_benchmark(void) +{ + ok_to_run = true; + + return 0; +} + +early_initcall(ok_to_run_trace_benchmark); diff --git a/kernel/trace/trace_benchmark.h b/kernel/trace/trace_benchmark.h new file mode 100644 index 0000000000..c3e91060dc --- /dev/null +++ b/kernel/trace/trace_benchmark.h @@ -0,0 +1,44 @@ +// SPDX-License-Identifier: GPL-2.0 +#undef TRACE_SYSTEM +#define TRACE_SYSTEM benchmark + +#if !defined(_TRACE_BENCHMARK_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_BENCHMARK_H + +#include <linux/tracepoint.h> + +extern int trace_benchmark_reg(void); +extern void trace_benchmark_unreg(void); + +#define BENCHMARK_EVENT_STRLEN 128 + +TRACE_EVENT_FN(benchmark_event, + + TP_PROTO(const char *str, u64 delta), + + TP_ARGS(str, delta), + + TP_STRUCT__entry( + __array( char, str, BENCHMARK_EVENT_STRLEN ) + __field( u64, delta) + ), + + TP_fast_assign( + memcpy(__entry->str, str, BENCHMARK_EVENT_STRLEN); + __entry->delta = delta; + ), + + TP_printk("%s delta=%llu", __entry->str, __entry->delta), + + trace_benchmark_reg, trace_benchmark_unreg +); + +#endif /* _TRACE_BENCHMARK_H */ + +#undef TRACE_INCLUDE_FILE +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace_benchmark + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c new file mode 100644 index 0000000000..7ccc7a8e15 --- /dev/null +++ b/kernel/trace/trace_boot.c @@ -0,0 +1,671 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * trace_boot.c + * Tracing kernel boot-time + */ + +#define pr_fmt(fmt) "trace_boot: " fmt + +#include <linux/bootconfig.h> +#include <linux/cpumask.h> +#include <linux/ftrace.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/mutex.h> +#include <linux/string.h> +#include <linux/slab.h> +#include <linux/trace.h> +#include <linux/trace_events.h> + +#include "trace.h" + +#define MAX_BUF_LEN 256 + +static void __init +trace_boot_set_instance_options(struct trace_array *tr, struct xbc_node *node) +{ + struct xbc_node *anode; + const char *p; + char buf[MAX_BUF_LEN]; + unsigned long v = 0; + + /* Common ftrace options */ + xbc_node_for_each_array_value(node, "options", anode, p) { + if (strscpy(buf, p, ARRAY_SIZE(buf)) < 0) { + pr_err("String is too long: %s\n", p); + continue; + } + + if (trace_set_options(tr, buf) < 0) + pr_err("Failed to set option: %s\n", buf); + } + + p = xbc_node_find_value(node, "tracing_on", NULL); + if (p && *p != '\0') { + if (kstrtoul(p, 10, &v)) + pr_err("Failed to set tracing on: %s\n", p); + if (v) + tracer_tracing_on(tr); + else + tracer_tracing_off(tr); + } + + p = xbc_node_find_value(node, "trace_clock", NULL); + if (p && *p != '\0') { + if (tracing_set_clock(tr, p) < 0) + pr_err("Failed to set trace clock: %s\n", p); + } + + p = xbc_node_find_value(node, "buffer_size", NULL); + if (p && *p != '\0') { + v = memparse(p, NULL); + if (v < PAGE_SIZE) + pr_err("Buffer size is too small: %s\n", p); + if (tracing_resize_ring_buffer(tr, v, RING_BUFFER_ALL_CPUS) < 0) + pr_err("Failed to resize trace buffer to %s\n", p); + } + + p = 
xbc_node_find_value(node, "cpumask", NULL); + if (p && *p != '\0') { + cpumask_var_t new_mask; + + if (alloc_cpumask_var(&new_mask, GFP_KERNEL)) { + if (cpumask_parse(p, new_mask) < 0 || + tracing_set_cpumask(tr, new_mask) < 0) + pr_err("Failed to set new CPU mask %s\n", p); + free_cpumask_var(new_mask); + } + } +} + +#ifdef CONFIG_EVENT_TRACING +static void __init +trace_boot_enable_events(struct trace_array *tr, struct xbc_node *node) +{ + struct xbc_node *anode; + char buf[MAX_BUF_LEN]; + const char *p; + + xbc_node_for_each_array_value(node, "events", anode, p) { + if (strscpy(buf, p, ARRAY_SIZE(buf)) < 0) { + pr_err("String is too long: %s\n", p); + continue; + } + + if (ftrace_set_clr_event(tr, buf, 1) < 0) + pr_err("Failed to enable event: %s\n", p); + } +} + +#ifdef CONFIG_KPROBE_EVENTS +static int __init +trace_boot_add_kprobe_event(struct xbc_node *node, const char *event) +{ + struct dynevent_cmd cmd; + struct xbc_node *anode; + char buf[MAX_BUF_LEN]; + const char *val; + int ret = 0; + + xbc_node_for_each_array_value(node, "probes", anode, val) { + kprobe_event_cmd_init(&cmd, buf, MAX_BUF_LEN); + + ret = kprobe_event_gen_cmd_start(&cmd, event, val); + if (ret) { + pr_err("Failed to generate probe: %s\n", buf); + break; + } + + ret = kprobe_event_gen_cmd_end(&cmd); + if (ret) { + pr_err("Failed to add probe: %s\n", buf); + break; + } + } + + return ret; +} +#else +static inline int __init +trace_boot_add_kprobe_event(struct xbc_node *node, const char *event) +{ + pr_err("Kprobe event is not supported.\n"); + return -ENOTSUPP; +} +#endif + +#ifdef CONFIG_SYNTH_EVENTS +static int __init +trace_boot_add_synth_event(struct xbc_node *node, const char *event) +{ + struct dynevent_cmd cmd; + struct xbc_node *anode; + char buf[MAX_BUF_LEN]; + const char *p; + int ret; + + synth_event_cmd_init(&cmd, buf, MAX_BUF_LEN); + + ret = synth_event_gen_cmd_start(&cmd, event, NULL); + if (ret) + return ret; + + xbc_node_for_each_array_value(node, "fields", anode, p) { + ret = synth_event_add_field_str(&cmd, p); + if (ret) + return ret; + } + + ret = synth_event_gen_cmd_end(&cmd); + if (ret < 0) + pr_err("Failed to add synthetic event: %s\n", buf); + + return ret; +} +#else +static inline int __init +trace_boot_add_synth_event(struct xbc_node *node, const char *event) +{ + pr_err("Synthetic event is not supported.\n"); + return -ENOTSUPP; +} +#endif + +#ifdef CONFIG_HIST_TRIGGERS +static int __init __printf(3, 4) +append_printf(char **bufp, char *end, const char *fmt, ...) 
+{ + va_list args; + int ret; + + if (*bufp == end) + return -ENOSPC; + + va_start(args, fmt); + ret = vsnprintf(*bufp, end - *bufp, fmt, args); + if (ret < end - *bufp) { + *bufp += ret; + } else { + *bufp = end; + ret = -ERANGE; + } + va_end(args); + + return ret; +} + +static int __init +append_str_nospace(char **bufp, char *end, const char *str) +{ + char *p = *bufp; + int len; + + while (p < end - 1 && *str != '\0') { + if (!isspace(*str)) + *(p++) = *str; + str++; + } + *p = '\0'; + if (p == end - 1) { + *bufp = end; + return -ENOSPC; + } + len = p - *bufp; + *bufp = p; + return (int)len; +} + +static int __init +trace_boot_hist_add_array(struct xbc_node *hnode, char **bufp, + char *end, const char *key) +{ + struct xbc_node *anode; + const char *p; + char sep; + + p = xbc_node_find_value(hnode, key, &anode); + if (p) { + if (!anode) { + pr_err("hist.%s requires value(s).\n", key); + return -EINVAL; + } + + append_printf(bufp, end, ":%s", key); + sep = '='; + xbc_array_for_each_value(anode, p) { + append_printf(bufp, end, "%c%s", sep, p); + if (sep == '=') + sep = ','; + } + } else + return -ENOENT; + + return 0; +} + +static int __init +trace_boot_hist_add_one_handler(struct xbc_node *hnode, char **bufp, + char *end, const char *handler, + const char *param) +{ + struct xbc_node *knode, *anode; + const char *p; + char sep; + + /* Compose 'handler' parameter */ + p = xbc_node_find_value(hnode, param, NULL); + if (!p) { + pr_err("hist.%s requires '%s' option.\n", + xbc_node_get_data(hnode), param); + return -EINVAL; + } + append_printf(bufp, end, ":%s(%s)", handler, p); + + /* Compose 'action' parameter */ + knode = xbc_node_find_subkey(hnode, "trace"); + if (!knode) + knode = xbc_node_find_subkey(hnode, "save"); + + if (knode) { + anode = xbc_node_get_child(knode); + if (!anode || !xbc_node_is_value(anode)) { + pr_err("hist.%s.%s requires value(s).\n", + xbc_node_get_data(hnode), + xbc_node_get_data(knode)); + return -EINVAL; + } + + append_printf(bufp, end, ".%s", xbc_node_get_data(knode)); + sep = '('; + xbc_array_for_each_value(anode, p) { + append_printf(bufp, end, "%c%s", sep, p); + if (sep == '(') + sep = ','; + } + append_printf(bufp, end, ")"); + } else if (xbc_node_find_subkey(hnode, "snapshot")) { + append_printf(bufp, end, ".snapshot()"); + } else { + pr_err("hist.%s requires an action.\n", + xbc_node_get_data(hnode)); + return -EINVAL; + } + + return 0; +} + +static int __init +trace_boot_hist_add_handlers(struct xbc_node *hnode, char **bufp, + char *end, const char *param) +{ + struct xbc_node *node; + const char *p, *handler; + int ret = 0; + + handler = xbc_node_get_data(hnode); + + xbc_node_for_each_subkey(hnode, node) { + p = xbc_node_get_data(node); + if (!isdigit(p[0])) + continue; + /* All digit started node should be instances. */ + ret = trace_boot_hist_add_one_handler(node, bufp, end, handler, param); + if (ret < 0) + break; + } + + if (xbc_node_find_subkey(hnode, param)) + ret = trace_boot_hist_add_one_handler(hnode, bufp, end, handler, param); + + return ret; +} + +/* + * Histogram boottime tracing syntax. + * + * ftrace.[instance.INSTANCE.]event.GROUP.EVENT.hist[.N] { + * keys = <KEY>[,...] + * values = <VAL>[,...] + * sort = <SORT-KEY>[,...] + * size = <ENTRIES> + * name = <HISTNAME> + * var { <VAR> = <EXPR> ... } + * pause|continue|clear + * onmax|onchange[.N] { var = <VAR>; <ACTION> [= <PARAM>] } + * onmatch[.N] { event = <EVENT>; <ACTION> [= <PARAM>] } + * filter = <FILTER> + * } + * + * Where <ACTION> are; + * + * trace = <EVENT>, <ARG1>[, ...] 
+ * save = <ARG1>[, ...] + * snapshot + */ +static int __init +trace_boot_compose_hist_cmd(struct xbc_node *hnode, char *buf, size_t size) +{ + struct xbc_node *node, *knode; + char *end = buf + size; + const char *p; + int ret = 0; + + append_printf(&buf, end, "hist"); + + ret = trace_boot_hist_add_array(hnode, &buf, end, "keys"); + if (ret < 0) { + if (ret == -ENOENT) + pr_err("hist requires keys.\n"); + return -EINVAL; + } + + ret = trace_boot_hist_add_array(hnode, &buf, end, "values"); + if (ret == -EINVAL) + return ret; + ret = trace_boot_hist_add_array(hnode, &buf, end, "sort"); + if (ret == -EINVAL) + return ret; + + p = xbc_node_find_value(hnode, "size", NULL); + if (p) + append_printf(&buf, end, ":size=%s", p); + + p = xbc_node_find_value(hnode, "name", NULL); + if (p) + append_printf(&buf, end, ":name=%s", p); + + node = xbc_node_find_subkey(hnode, "var"); + if (node) { + xbc_node_for_each_key_value(node, knode, p) { + /* Expression must not include spaces. */ + append_printf(&buf, end, ":%s=", + xbc_node_get_data(knode)); + append_str_nospace(&buf, end, p); + } + } + + /* Histogram control attributes (mutual exclusive) */ + if (xbc_node_find_value(hnode, "pause", NULL)) + append_printf(&buf, end, ":pause"); + else if (xbc_node_find_value(hnode, "continue", NULL)) + append_printf(&buf, end, ":continue"); + else if (xbc_node_find_value(hnode, "clear", NULL)) + append_printf(&buf, end, ":clear"); + + /* Histogram handler and actions */ + node = xbc_node_find_subkey(hnode, "onmax"); + if (node && trace_boot_hist_add_handlers(node, &buf, end, "var") < 0) + return -EINVAL; + node = xbc_node_find_subkey(hnode, "onchange"); + if (node && trace_boot_hist_add_handlers(node, &buf, end, "var") < 0) + return -EINVAL; + node = xbc_node_find_subkey(hnode, "onmatch"); + if (node && trace_boot_hist_add_handlers(node, &buf, end, "event") < 0) + return -EINVAL; + + p = xbc_node_find_value(hnode, "filter", NULL); + if (p) + append_printf(&buf, end, " if %s", p); + + if (buf == end) { + pr_err("hist exceeds the max command length.\n"); + return -E2BIG; + } + + return 0; +} + +static void __init +trace_boot_init_histograms(struct trace_event_file *file, + struct xbc_node *hnode, char *buf, size_t size) +{ + struct xbc_node *node; + const char *p; + char *tmp; + + xbc_node_for_each_subkey(hnode, node) { + p = xbc_node_get_data(node); + if (!isdigit(p[0])) + continue; + /* All digit started node should be instances. 
*/ + if (trace_boot_compose_hist_cmd(node, buf, size) == 0) { + tmp = kstrdup(buf, GFP_KERNEL); + if (!tmp) + return; + if (trigger_process_regex(file, buf) < 0) + pr_err("Failed to apply hist trigger: %s\n", tmp); + kfree(tmp); + } + } + + if (xbc_node_find_subkey(hnode, "keys")) { + if (trace_boot_compose_hist_cmd(hnode, buf, size) == 0) { + tmp = kstrdup(buf, GFP_KERNEL); + if (!tmp) + return; + if (trigger_process_regex(file, buf) < 0) + pr_err("Failed to apply hist trigger: %s\n", tmp); + kfree(tmp); + } + } +} +#else +static void __init +trace_boot_init_histograms(struct trace_event_file *file, + struct xbc_node *hnode, char *buf, size_t size) +{ + /* do nothing */ +} +#endif + +static void __init +trace_boot_init_one_event(struct trace_array *tr, struct xbc_node *gnode, + struct xbc_node *enode) +{ + struct trace_event_file *file; + struct xbc_node *anode; + char buf[MAX_BUF_LEN]; + const char *p, *group, *event; + + group = xbc_node_get_data(gnode); + event = xbc_node_get_data(enode); + + if (!strcmp(group, "kprobes")) + if (trace_boot_add_kprobe_event(enode, event) < 0) + return; + if (!strcmp(group, "synthetic")) + if (trace_boot_add_synth_event(enode, event) < 0) + return; + + mutex_lock(&event_mutex); + file = find_event_file(tr, group, event); + if (!file) { + pr_err("Failed to find event: %s:%s\n", group, event); + goto out; + } + + p = xbc_node_find_value(enode, "filter", NULL); + if (p && *p != '\0') { + if (strscpy(buf, p, ARRAY_SIZE(buf)) < 0) + pr_err("filter string is too long: %s\n", p); + else if (apply_event_filter(file, buf) < 0) + pr_err("Failed to apply filter: %s\n", buf); + } + + if (IS_ENABLED(CONFIG_HIST_TRIGGERS)) { + xbc_node_for_each_array_value(enode, "actions", anode, p) { + if (strscpy(buf, p, ARRAY_SIZE(buf)) < 0) + pr_err("action string is too long: %s\n", p); + else if (trigger_process_regex(file, buf) < 0) + pr_err("Failed to apply an action: %s\n", p); + } + anode = xbc_node_find_subkey(enode, "hist"); + if (anode) + trace_boot_init_histograms(file, anode, buf, ARRAY_SIZE(buf)); + } else if (xbc_node_find_value(enode, "actions", NULL)) + pr_err("Failed to apply event actions because CONFIG_HIST_TRIGGERS is not set.\n"); + + if (xbc_node_find_value(enode, "enable", NULL)) { + if (trace_event_enable_disable(file, 1, 0) < 0) + pr_err("Failed to enable event node: %s:%s\n", + group, event); + } +out: + mutex_unlock(&event_mutex); +} + +static void __init +trace_boot_init_events(struct trace_array *tr, struct xbc_node *node) +{ + struct xbc_node *gnode, *enode; + bool enable, enable_all = false; + const char *data; + + node = xbc_node_find_subkey(node, "event"); + if (!node) + return; + /* per-event key starts with "event.GROUP.EVENT" */ + xbc_node_for_each_subkey(node, gnode) { + data = xbc_node_get_data(gnode); + if (!strcmp(data, "enable")) { + enable_all = true; + continue; + } + enable = false; + xbc_node_for_each_subkey(gnode, enode) { + data = xbc_node_get_data(enode); + if (!strcmp(data, "enable")) { + enable = true; + continue; + } + trace_boot_init_one_event(tr, gnode, enode); + } + /* Event enablement must be done after event settings */ + if (enable) { + data = xbc_node_get_data(gnode); + trace_array_set_clr_event(tr, data, NULL, true); + } + } + /* Ditto */ + if (enable_all) + trace_array_set_clr_event(tr, NULL, NULL, true); +} +#else +#define trace_boot_enable_events(tr, node) do {} while (0) +#define trace_boot_init_events(tr, node) do {} while (0) +#endif + +#ifdef CONFIG_DYNAMIC_FTRACE +static void __init 
+trace_boot_set_ftrace_filter(struct trace_array *tr, struct xbc_node *node) +{ + struct xbc_node *anode; + const char *p; + char *q; + + xbc_node_for_each_array_value(node, "ftrace.filters", anode, p) { + q = kstrdup(p, GFP_KERNEL); + if (!q) + return; + if (ftrace_set_filter(tr->ops, q, strlen(q), 0) < 0) + pr_err("Failed to add %s to ftrace filter\n", p); + else + ftrace_filter_param = true; + kfree(q); + } + xbc_node_for_each_array_value(node, "ftrace.notraces", anode, p) { + q = kstrdup(p, GFP_KERNEL); + if (!q) + return; + if (ftrace_set_notrace(tr->ops, q, strlen(q), 0) < 0) + pr_err("Failed to add %s to ftrace filter\n", p); + else + ftrace_filter_param = true; + kfree(q); + } +} +#else +#define trace_boot_set_ftrace_filter(tr, node) do {} while (0) +#endif + +static void __init +trace_boot_enable_tracer(struct trace_array *tr, struct xbc_node *node) +{ + const char *p; + + trace_boot_set_ftrace_filter(tr, node); + + p = xbc_node_find_value(node, "tracer", NULL); + if (p && *p != '\0') { + if (tracing_set_tracer(tr, p) < 0) + pr_err("Failed to set given tracer: %s\n", p); + } + + /* Since tracer can free snapshot buffer, allocate snapshot here.*/ + if (xbc_node_find_value(node, "alloc_snapshot", NULL)) { + if (tracing_alloc_snapshot_instance(tr) < 0) + pr_err("Failed to allocate snapshot buffer\n"); + } +} + +static void __init +trace_boot_init_one_instance(struct trace_array *tr, struct xbc_node *node) +{ + trace_boot_set_instance_options(tr, node); + trace_boot_init_events(tr, node); + trace_boot_enable_events(tr, node); + trace_boot_enable_tracer(tr, node); +} + +static void __init +trace_boot_init_instances(struct xbc_node *node) +{ + struct xbc_node *inode; + struct trace_array *tr; + const char *p; + + node = xbc_node_find_subkey(node, "instance"); + if (!node) + return; + + xbc_node_for_each_subkey(node, inode) { + p = xbc_node_get_data(inode); + if (!p || *p == '\0') + continue; + + tr = trace_array_get_by_name(p); + if (!tr) { + pr_err("Failed to get trace instance %s\n", p); + continue; + } + trace_boot_init_one_instance(tr, inode); + trace_array_put(tr); + } +} + +static int __init trace_boot_init(void) +{ + struct xbc_node *trace_node; + struct trace_array *tr; + + trace_node = xbc_find_node("ftrace"); + if (!trace_node) + return 0; + + tr = top_trace_array(); + if (!tr) + return 0; + + /* Global trace array is also one instance */ + trace_boot_init_one_instance(tr, trace_node); + trace_boot_init_instances(trace_node); + + disable_tracing_selftest("running boot-time tracing"); + + return 0; +} +/* + * Start tracing at the end of core-initcall, so that it starts tracing + * from the beginning of postcore_initcall. 
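+ *
+ * As an illustration only, a bootconfig fragment such as
+ *
+ *	ftrace.tracer = function_graph
+ *	ftrace.instance.foo {
+ *		tracer = function
+ *		ftrace.filters = "vfs_*"
+ *	}
+ *
+ * is consumed by trace_boot_init() above: the top-level keys configure the
+ * global trace array, and each "instance.NAME" node configures (creating it
+ * if necessary) a named instance.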
+ */ +core_initcall_sync(trace_boot_init); diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c new file mode 100644 index 0000000000..e47fdb4c92 --- /dev/null +++ b/kernel/trace/trace_branch.c @@ -0,0 +1,455 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * unlikely profiler + * + * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com> + */ +#include <linux/kallsyms.h> +#include <linux/seq_file.h> +#include <linux/spinlock.h> +#include <linux/irqflags.h> +#include <linux/uaccess.h> +#include <linux/module.h> +#include <linux/ftrace.h> +#include <linux/hash.h> +#include <linux/fs.h> +#include <asm/local.h> + +#include "trace.h" +#include "trace_stat.h" +#include "trace_output.h" + +#ifdef CONFIG_BRANCH_TRACER + +static struct tracer branch_trace; +static int branch_tracing_enabled __read_mostly; +static DEFINE_MUTEX(branch_tracing_mutex); + +static struct trace_array *branch_tracer; + +static void +probe_likely_condition(struct ftrace_likely_data *f, int val, int expect) +{ + struct trace_event_call *call = &event_branch; + struct trace_array *tr = branch_tracer; + struct trace_buffer *buffer; + struct trace_array_cpu *data; + struct ring_buffer_event *event; + struct trace_branch *entry; + unsigned long flags; + unsigned int trace_ctx; + const char *p; + + if (current->trace_recursion & TRACE_BRANCH_BIT) + return; + + /* + * I would love to save just the ftrace_likely_data pointer, but + * this code can also be used by modules. Ugly things can happen + * if the module is unloaded, and then we go and read the + * pointer. This is slower, but much safer. + */ + + if (unlikely(!tr)) + return; + + raw_local_irq_save(flags); + current->trace_recursion |= TRACE_BRANCH_BIT; + data = this_cpu_ptr(tr->array_buffer.data); + if (atomic_read(&data->disabled)) + goto out; + + trace_ctx = tracing_gen_ctx_flags(flags); + buffer = tr->array_buffer.buffer; + event = trace_buffer_lock_reserve(buffer, TRACE_BRANCH, + sizeof(*entry), trace_ctx); + if (!event) + goto out; + + entry = ring_buffer_event_data(event); + + /* Strip off the path, only save the file */ + p = f->data.file + strlen(f->data.file); + while (p >= f->data.file && *p != '/') + p--; + p++; + + strncpy(entry->func, f->data.func, TRACE_FUNC_SIZE); + strncpy(entry->file, p, TRACE_FILE_SIZE); + entry->func[TRACE_FUNC_SIZE] = 0; + entry->file[TRACE_FILE_SIZE] = 0; + entry->constant = f->constant; + entry->line = f->data.line; + entry->correct = val == expect; + + if (!call_filter_check_discard(call, entry, buffer, event)) + trace_buffer_unlock_commit_nostack(buffer, event); + + out: + current->trace_recursion &= ~TRACE_BRANCH_BIT; + raw_local_irq_restore(flags); +} + +static inline +void trace_likely_condition(struct ftrace_likely_data *f, int val, int expect) +{ + if (!branch_tracing_enabled) + return; + + probe_likely_condition(f, val, expect); +} + +int enable_branch_tracing(struct trace_array *tr) +{ + mutex_lock(&branch_tracing_mutex); + branch_tracer = tr; + /* + * Must be seen before enabling. 
The reader is a condition + * where we do not need a matching rmb() + */ + smp_wmb(); + branch_tracing_enabled++; + mutex_unlock(&branch_tracing_mutex); + + return 0; +} + +void disable_branch_tracing(void) +{ + mutex_lock(&branch_tracing_mutex); + + if (!branch_tracing_enabled) + goto out_unlock; + + branch_tracing_enabled--; + + out_unlock: + mutex_unlock(&branch_tracing_mutex); +} + +static int branch_trace_init(struct trace_array *tr) +{ + return enable_branch_tracing(tr); +} + +static void branch_trace_reset(struct trace_array *tr) +{ + disable_branch_tracing(); +} + +static enum print_line_t trace_branch_print(struct trace_iterator *iter, + int flags, struct trace_event *event) +{ + struct trace_branch *field; + + trace_assign_type(field, iter->ent); + + trace_seq_printf(&iter->seq, "[%s] %s:%s:%d\n", + field->correct ? " ok " : " MISS ", + field->func, + field->file, + field->line); + + return trace_handle_return(&iter->seq); +} + +static void branch_print_header(struct seq_file *s) +{ + seq_puts(s, "# TASK-PID CPU# TIMESTAMP CORRECT" + " FUNC:FILE:LINE\n" + "# | | | | | " + " |\n"); +} + +static struct trace_event_functions trace_branch_funcs = { + .trace = trace_branch_print, +}; + +static struct trace_event trace_branch_event = { + .type = TRACE_BRANCH, + .funcs = &trace_branch_funcs, +}; + +static struct tracer branch_trace __read_mostly = +{ + .name = "branch", + .init = branch_trace_init, + .reset = branch_trace_reset, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_branch, +#endif /* CONFIG_FTRACE_SELFTEST */ + .print_header = branch_print_header, +}; + +__init static int init_branch_tracer(void) +{ + int ret; + + ret = register_trace_event(&trace_branch_event); + if (!ret) { + printk(KERN_WARNING "Warning: could not register " + "branch events\n"); + return 1; + } + return register_tracer(&branch_trace); +} +core_initcall(init_branch_tracer); + +#else +static inline +void trace_likely_condition(struct ftrace_likely_data *f, int val, int expect) +{ +} +#endif /* CONFIG_BRANCH_TRACER */ + +void ftrace_likely_update(struct ftrace_likely_data *f, int val, + int expect, int is_constant) +{ + unsigned long flags = user_access_save(); + + /* A constant is always correct */ + if (is_constant) { + f->constant++; + val = expect; + } + /* + * I would love to have a trace point here instead, but the + * trace point code is so inundated with unlikely and likely + * conditions that the recursive nightmare that exists is too + * much to try to get working. At least for now. + */ + trace_likely_condition(f, val, expect); + + /* FIXME: Make this atomic! */ + if (val == expect) + f->data.correct++; + else + f->data.incorrect++; + + user_access_restore(flags); +} +EXPORT_SYMBOL(ftrace_likely_update); + +extern unsigned long __start_annotated_branch_profile[]; +extern unsigned long __stop_annotated_branch_profile[]; + +static int annotated_branch_stat_headers(struct seq_file *m) +{ + seq_puts(m, " correct incorrect % " + " Function " + " File Line\n" + " ------- --------- - " + " -------- " + " ---- ----\n"); + return 0; +} + +static inline long get_incorrect_percent(const struct ftrace_branch_data *p) +{ + long percent; + + if (p->correct) { + percent = p->incorrect * 100; + percent /= p->correct + p->incorrect; + } else + percent = p->incorrect ? 
100 : -1; + + return percent; +} + +static const char *branch_stat_process_file(struct ftrace_branch_data *p) +{ + const char *f; + + /* Only print the file, not the path */ + f = p->file + strlen(p->file); + while (f >= p->file && *f != '/') + f--; + return ++f; +} + +static void branch_stat_show(struct seq_file *m, + struct ftrace_branch_data *p, const char *f) +{ + long percent; + + /* + * The miss is overlayed on correct, and hit on incorrect. + */ + percent = get_incorrect_percent(p); + + if (percent < 0) + seq_puts(m, " X "); + else + seq_printf(m, "%3ld ", percent); + + seq_printf(m, "%-30.30s %-20.20s %d\n", p->func, f, p->line); +} + +static int branch_stat_show_normal(struct seq_file *m, + struct ftrace_branch_data *p, const char *f) +{ + seq_printf(m, "%8lu %8lu ", p->correct, p->incorrect); + branch_stat_show(m, p, f); + return 0; +} + +static int annotate_branch_stat_show(struct seq_file *m, void *v) +{ + struct ftrace_likely_data *p = v; + const char *f; + int l; + + f = branch_stat_process_file(&p->data); + + if (!p->constant) + return branch_stat_show_normal(m, &p->data, f); + + l = snprintf(NULL, 0, "/%lu", p->constant); + l = l > 8 ? 0 : 8 - l; + + seq_printf(m, "%8lu/%lu %*lu ", + p->data.correct, p->constant, l, p->data.incorrect); + branch_stat_show(m, &p->data, f); + return 0; +} + +static void *annotated_branch_stat_start(struct tracer_stat *trace) +{ + return __start_annotated_branch_profile; +} + +static void * +annotated_branch_stat_next(void *v, int idx) +{ + struct ftrace_likely_data *p = v; + + ++p; + + if ((void *)p >= (void *)__stop_annotated_branch_profile) + return NULL; + + return p; +} + +static int annotated_branch_stat_cmp(const void *p1, const void *p2) +{ + const struct ftrace_branch_data *a = p1; + const struct ftrace_branch_data *b = p2; + + long percent_a, percent_b; + + percent_a = get_incorrect_percent(a); + percent_b = get_incorrect_percent(b); + + if (percent_a < percent_b) + return -1; + if (percent_a > percent_b) + return 1; + + if (a->incorrect < b->incorrect) + return -1; + if (a->incorrect > b->incorrect) + return 1; + + /* + * Since the above shows worse (incorrect) cases + * first, we continue that by showing best (correct) + * cases last. 
+ */ + if (a->correct > b->correct) + return -1; + if (a->correct < b->correct) + return 1; + + return 0; +} + +static struct tracer_stat annotated_branch_stats = { + .name = "branch_annotated", + .stat_start = annotated_branch_stat_start, + .stat_next = annotated_branch_stat_next, + .stat_cmp = annotated_branch_stat_cmp, + .stat_headers = annotated_branch_stat_headers, + .stat_show = annotate_branch_stat_show +}; + +__init static int init_annotated_branch_stats(void) +{ + int ret; + + ret = register_stat_tracer(&annotated_branch_stats); + if (!ret) { + printk(KERN_WARNING "Warning: could not register " + "annotated branches stats\n"); + return 1; + } + return 0; +} +fs_initcall(init_annotated_branch_stats); + +#ifdef CONFIG_PROFILE_ALL_BRANCHES + +extern unsigned long __start_branch_profile[]; +extern unsigned long __stop_branch_profile[]; + +static int all_branch_stat_headers(struct seq_file *m) +{ + seq_puts(m, " miss hit % " + " Function " + " File Line\n" + " ------- --------- - " + " -------- " + " ---- ----\n"); + return 0; +} + +static void *all_branch_stat_start(struct tracer_stat *trace) +{ + return __start_branch_profile; +} + +static void * +all_branch_stat_next(void *v, int idx) +{ + struct ftrace_branch_data *p = v; + + ++p; + + if ((void *)p >= (void *)__stop_branch_profile) + return NULL; + + return p; +} + +static int all_branch_stat_show(struct seq_file *m, void *v) +{ + struct ftrace_branch_data *p = v; + const char *f; + + f = branch_stat_process_file(p); + return branch_stat_show_normal(m, p, f); +} + +static struct tracer_stat all_branch_stats = { + .name = "branch_all", + .stat_start = all_branch_stat_start, + .stat_next = all_branch_stat_next, + .stat_headers = all_branch_stat_headers, + .stat_show = all_branch_stat_show +}; + +__init static int all_annotated_branch_stats(void) +{ + int ret; + + ret = register_stat_tracer(&all_branch_stats); + if (!ret) { + printk(KERN_WARNING "Warning: could not register " + "all branches stats\n"); + return 1; + } + return 0; +} +fs_initcall(all_annotated_branch_stats); +#endif /* CONFIG_PROFILE_ALL_BRANCHES */ diff --git a/kernel/trace/trace_btf.c b/kernel/trace/trace_btf.c new file mode 100644 index 0000000000..ca224d53bf --- /dev/null +++ b/kernel/trace/trace_btf.c @@ -0,0 +1,122 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/btf.h> +#include <linux/kernel.h> +#include <linux/slab.h> + +#include "trace_btf.h" + +/* + * Find a function proto type by name, and return the btf_type with its btf + * in *@btf_p. Return NULL if not found. + * Note that caller has to call btf_put(*@btf_p) after using the btf_type. + */ +const struct btf_type *btf_find_func_proto(const char *func_name, struct btf **btf_p) +{ + const struct btf_type *t; + s32 id; + + id = bpf_find_btf_id(func_name, BTF_KIND_FUNC, btf_p); + if (id < 0) + return NULL; + + /* Get BTF_KIND_FUNC type */ + t = btf_type_by_id(*btf_p, id); + if (!t || !btf_type_is_func(t)) + goto err; + + /* The type of BTF_KIND_FUNC is BTF_KIND_FUNC_PROTO */ + t = btf_type_by_id(*btf_p, t->type); + if (!t || !btf_type_is_func_proto(t)) + goto err; + + return t; +err: + btf_put(*btf_p); + return NULL; +} + +/* + * Get function parameter with the number of parameters. + * This can return NULL if the function has no parameters. + * It can return -EINVAL if the @func_proto is not a function proto type. 
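+ *
+ * A minimal usage sketch, with error handling trimmed and "vfs_read" as an
+ * example symbol only:
+ *
+ *	struct btf *btf;
+ *	const struct btf_type *proto;
+ *	const struct btf_param *params;
+ *	s32 nr_args;
+ *
+ *	proto = btf_find_func_proto("vfs_read", &btf);
+ *	if (proto) {
+ *		params = btf_get_func_param(proto, &nr_args);
+ *		if (!IS_ERR_OR_NULL(params))
+ *			... inspect params[0 .. nr_args - 1] ...
+ *		btf_put(btf);
+ *	}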
+ */ +const struct btf_param *btf_get_func_param(const struct btf_type *func_proto, s32 *nr) +{ + if (!btf_type_is_func_proto(func_proto)) + return ERR_PTR(-EINVAL); + + *nr = btf_type_vlen(func_proto); + if (*nr > 0) + return (const struct btf_param *)(func_proto + 1); + else + return NULL; +} + +#define BTF_ANON_STACK_MAX 16 + +struct btf_anon_stack { + u32 tid; + u32 offset; +}; + +/* + * Find a member of data structure/union by name and return it. + * Return NULL if not found, or -EINVAL if parameter is invalid. + * If the member is an member of anonymous union/structure, the offset + * of that anonymous union/structure is stored into @anon_offset. Caller + * can calculate the correct offset from the root data structure by + * adding anon_offset to the member's offset. + */ +const struct btf_member *btf_find_struct_member(struct btf *btf, + const struct btf_type *type, + const char *member_name, + u32 *anon_offset) +{ + struct btf_anon_stack *anon_stack; + const struct btf_member *member; + u32 tid, cur_offset = 0; + const char *name; + int i, top = 0; + + anon_stack = kcalloc(BTF_ANON_STACK_MAX, sizeof(*anon_stack), GFP_KERNEL); + if (!anon_stack) + return ERR_PTR(-ENOMEM); + +retry: + if (!btf_type_is_struct(type)) { + member = ERR_PTR(-EINVAL); + goto out; + } + + for_each_member(i, type, member) { + if (!member->name_off) { + /* Anonymous union/struct: push it for later use */ + type = btf_type_skip_modifiers(btf, member->type, &tid); + if (type && top < BTF_ANON_STACK_MAX) { + anon_stack[top].tid = tid; + anon_stack[top++].offset = + cur_offset + member->offset; + } + } else { + name = btf_name_by_offset(btf, member->name_off); + if (name && !strcmp(member_name, name)) { + if (anon_offset) + *anon_offset = cur_offset; + goto out; + } + } + } + if (top > 0) { + /* Pop from the anonymous stack and retry */ + tid = anon_stack[--top].tid; + cur_offset = anon_stack[top].offset; + type = btf_type_by_id(btf, tid); + goto retry; + } + member = NULL; + +out: + kfree(anon_stack); + return member; +} + diff --git a/kernel/trace/trace_btf.h b/kernel/trace/trace_btf.h new file mode 100644 index 0000000000..4bc44bc261 --- /dev/null +++ b/kernel/trace/trace_btf.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/btf.h> + +const struct btf_type *btf_find_func_proto(const char *func_name, + struct btf **btf_p); +const struct btf_param *btf_get_func_param(const struct btf_type *func_proto, + s32 *nr); +const struct btf_member *btf_find_struct_member(struct btf *btf, + const struct btf_type *type, + const char *member_name, + u32 *anon_offset); diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c new file mode 100644 index 0000000000..4702efb00f --- /dev/null +++ b/kernel/trace/trace_clock.c @@ -0,0 +1,158 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * tracing clocks + * + * Copyright (C) 2009 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> + * + * Implements 3 trace clock variants, with differing scalability/precision + * tradeoffs: + * + * - local: CPU-local trace clock + * - medium: scalable global clock with some jitter + * - global: globally monotonic, serialized clock + * + * Tracer plugins will chose a default from these clocks. 
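+ *
+ * (The clocks implemented here are typically exposed to user space through
+ *  the tracefs "trace_clock" file, e.g. as "local", "global", "uptime" and
+ *  "counter", selectable per trace instance.)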
+ */ +#include <linux/spinlock.h> +#include <linux/irqflags.h> +#include <linux/hardirq.h> +#include <linux/module.h> +#include <linux/percpu.h> +#include <linux/sched.h> +#include <linux/sched/clock.h> +#include <linux/ktime.h> +#include <linux/trace_clock.h> + +/* + * trace_clock_local(): the simplest and least coherent tracing clock. + * + * Useful for tracing that does not cross to other CPUs nor + * does it go through idle events. + */ +u64 notrace trace_clock_local(void) +{ + u64 clock; + + /* + * sched_clock() is an architecture implemented, fast, scalable, + * lockless clock. It is not guaranteed to be coherent across + * CPUs, nor across CPU idle events. + */ + preempt_disable_notrace(); + clock = sched_clock(); + preempt_enable_notrace(); + + return clock; +} +EXPORT_SYMBOL_GPL(trace_clock_local); + +/* + * trace_clock(): 'between' trace clock. Not completely serialized, + * but not completely incorrect when crossing CPUs either. + * + * This is based on cpu_clock(), which will allow at most ~1 jiffy of + * jitter between CPUs. So it's a pretty scalable clock, but there + * can be offsets in the trace data. + */ +u64 notrace trace_clock(void) +{ + return local_clock(); +} +EXPORT_SYMBOL_GPL(trace_clock); + +/* + * trace_jiffy_clock(): Simply use jiffies as a clock counter. + * Note that this use of jiffies_64 is not completely safe on + * 32-bit systems. But the window is tiny, and the effect if + * we are affected is that we will have an obviously bogus + * timestamp on a trace event - i.e. not life threatening. + */ +u64 notrace trace_clock_jiffies(void) +{ + return jiffies_64_to_clock_t(jiffies_64 - INITIAL_JIFFIES); +} +EXPORT_SYMBOL_GPL(trace_clock_jiffies); + +/* + * trace_clock_global(): special globally coherent trace clock + * + * It has higher overhead than the other trace clocks but is still + * an order of magnitude faster than GTOD derived hardware clocks. + * + * Used by plugins that need globally coherent timestamps. + */ + +/* keep prev_time and lock in the same cacheline. */ +static struct { + u64 prev_time; + arch_spinlock_t lock; +} trace_clock_struct ____cacheline_aligned_in_smp = + { + .lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED, + }; + +u64 notrace trace_clock_global(void) +{ + unsigned long flags; + int this_cpu; + u64 now, prev_time; + + raw_local_irq_save(flags); + + this_cpu = raw_smp_processor_id(); + + /* + * The global clock "guarantees" that the events are ordered + * between CPUs. But if two events on two different CPUS call + * trace_clock_global at roughly the same time, it really does + * not matter which one gets the earlier time. Just make sure + * that the same CPU will always show a monotonic clock. + * + * Use a read memory barrier to get the latest written + * time that was recorded. + */ + smp_rmb(); + prev_time = READ_ONCE(trace_clock_struct.prev_time); + now = sched_clock_cpu(this_cpu); + + /* Make sure that now is always greater than or equal to prev_time */ + if ((s64)(now - prev_time) < 0) + now = prev_time; + + /* + * If in an NMI context then dont risk lockups and simply return + * the current time. 
+ */ + if (unlikely(in_nmi())) + goto out; + + /* Tracing can cause strange recursion, always use a try lock */ + if (arch_spin_trylock(&trace_clock_struct.lock)) { + /* Reread prev_time in case it was already updated */ + prev_time = READ_ONCE(trace_clock_struct.prev_time); + if ((s64)(now - prev_time) < 0) + now = prev_time; + + trace_clock_struct.prev_time = now; + + /* The unlock acts as the wmb for the above rmb */ + arch_spin_unlock(&trace_clock_struct.lock); + } + out: + raw_local_irq_restore(flags); + + return now; +} +EXPORT_SYMBOL_GPL(trace_clock_global); + +static atomic64_t trace_counter; + +/* + * trace_clock_counter(): simply an atomic counter. + * Use the trace_counter "counter" for cases where you do not care + * about timings, but are interested in strict ordering. + */ +u64 notrace trace_clock_counter(void) +{ + return atomic64_add_return(1, &trace_counter); +} diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c new file mode 100644 index 0000000000..4376887e0d --- /dev/null +++ b/kernel/trace/trace_dynevent.c @@ -0,0 +1,483 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Generic dynamic event control interface + * + * Copyright (C) 2018 Masami Hiramatsu <mhiramat@kernel.org> + */ + +#include <linux/debugfs.h> +#include <linux/kernel.h> +#include <linux/list.h> +#include <linux/mm.h> +#include <linux/mutex.h> +#include <linux/tracefs.h> + +#include "trace.h" +#include "trace_output.h" /* for trace_event_sem */ +#include "trace_dynevent.h" + +static DEFINE_MUTEX(dyn_event_ops_mutex); +static LIST_HEAD(dyn_event_ops_list); + +bool trace_event_dyn_try_get_ref(struct trace_event_call *dyn_call) +{ + struct trace_event_call *call; + bool ret = false; + + if (WARN_ON_ONCE(!(dyn_call->flags & TRACE_EVENT_FL_DYNAMIC))) + return false; + + down_read(&trace_event_sem); + list_for_each_entry(call, &ftrace_events, list) { + if (call == dyn_call) { + atomic_inc(&dyn_call->refcnt); + ret = true; + } + } + up_read(&trace_event_sem); + return ret; +} + +void trace_event_dyn_put_ref(struct trace_event_call *call) +{ + if (WARN_ON_ONCE(!(call->flags & TRACE_EVENT_FL_DYNAMIC))) + return; + + if (WARN_ON_ONCE(atomic_read(&call->refcnt) <= 0)) { + atomic_set(&call->refcnt, 0); + return; + } + + atomic_dec(&call->refcnt); +} + +bool trace_event_dyn_busy(struct trace_event_call *call) +{ + return atomic_read(&call->refcnt) != 0; +} + +int dyn_event_register(struct dyn_event_operations *ops) +{ + if (!ops || !ops->create || !ops->show || !ops->is_busy || + !ops->free || !ops->match) + return -EINVAL; + + INIT_LIST_HEAD(&ops->list); + mutex_lock(&dyn_event_ops_mutex); + list_add_tail(&ops->list, &dyn_event_ops_list); + mutex_unlock(&dyn_event_ops_mutex); + return 0; +} + +int dyn_event_release(const char *raw_command, struct dyn_event_operations *type) +{ + struct dyn_event *pos, *n; + char *system = NULL, *event, *p; + int argc, ret = -ENOENT; + char **argv; + + argv = argv_split(GFP_KERNEL, raw_command, &argc); + if (!argv) + return -ENOMEM; + + if (argv[0][0] == '-') { + if (argv[0][1] != ':') { + ret = -EINVAL; + goto out; + } + event = &argv[0][2]; + } else { + event = strchr(argv[0], ':'); + if (!event) { + ret = -EINVAL; + goto out; + } + event++; + } + + p = strchr(event, '/'); + if (p) { + system = event; + event = p + 1; + *p = '\0'; + } + if (!system && event[0] == '\0') { + ret = -EINVAL; + goto out; + } + + mutex_lock(&event_mutex); + for_each_dyn_event_safe(pos, n) { + if (type && type != pos->ops) + continue; + if (!pos->ops->match(system, event, + argc - 1, 
(const char **)argv + 1, pos)) + continue; + + ret = pos->ops->free(pos); + if (ret) + break; + } + tracing_reset_all_online_cpus(); + mutex_unlock(&event_mutex); +out: + argv_free(argv); + return ret; +} + +static int create_dyn_event(const char *raw_command) +{ + struct dyn_event_operations *ops; + int ret = -ENODEV; + + if (raw_command[0] == '-' || raw_command[0] == '!') + return dyn_event_release(raw_command, NULL); + + mutex_lock(&dyn_event_ops_mutex); + list_for_each_entry(ops, &dyn_event_ops_list, list) { + ret = ops->create(raw_command); + if (!ret || ret != -ECANCELED) + break; + } + mutex_unlock(&dyn_event_ops_mutex); + if (ret == -ECANCELED) + ret = -EINVAL; + + return ret; +} + +/* Protected by event_mutex */ +LIST_HEAD(dyn_event_list); + +void *dyn_event_seq_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&event_mutex); + return seq_list_start(&dyn_event_list, *pos); +} + +void *dyn_event_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + return seq_list_next(v, &dyn_event_list, pos); +} + +void dyn_event_seq_stop(struct seq_file *m, void *v) +{ + mutex_unlock(&event_mutex); +} + +static int dyn_event_seq_show(struct seq_file *m, void *v) +{ + struct dyn_event *ev = v; + + if (ev && ev->ops) + return ev->ops->show(m, ev); + + return 0; +} + +static const struct seq_operations dyn_event_seq_op = { + .start = dyn_event_seq_start, + .next = dyn_event_seq_next, + .stop = dyn_event_seq_stop, + .show = dyn_event_seq_show +}; + +/* + * dyn_events_release_all - Release all specific events + * @type: the dyn_event_operations * which filters releasing events + * + * This releases all events which ->ops matches @type. If @type is NULL, + * all events are released. + * Return -EBUSY if any of them are in use, and return other errors when + * it failed to free the given event. Except for -EBUSY, event releasing + * process will be aborted at that point and there may be some other + * releasable events on the list. 
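+ *
+ * For context (an illustration; "myprobe" and "vfs_read" are example names
+ * only), this is what backs a truncating open of the tracefs
+ * "dynamic_events" file:
+ *	echo 'p:myprobe vfs_read' >> dynamic_events	(create, kprobe syntax)
+ *	echo '-:myprobe' >> dynamic_events		(release one event)
+ *	echo > dynamic_events				(release every event)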
+ */ +int dyn_events_release_all(struct dyn_event_operations *type) +{ + struct dyn_event *ev, *tmp; + int ret = 0; + + mutex_lock(&event_mutex); + for_each_dyn_event(ev) { + if (type && ev->ops != type) + continue; + if (ev->ops->is_busy(ev)) { + ret = -EBUSY; + goto out; + } + } + for_each_dyn_event_safe(ev, tmp) { + if (type && ev->ops != type) + continue; + ret = ev->ops->free(ev); + if (ret) + break; + } +out: + tracing_reset_all_online_cpus(); + mutex_unlock(&event_mutex); + + return ret; +} + +static int dyn_event_open(struct inode *inode, struct file *file) +{ + int ret; + + ret = tracing_check_open_get_tr(NULL); + if (ret) + return ret; + + if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { + ret = dyn_events_release_all(NULL); + if (ret < 0) + return ret; + } + + return seq_open(file, &dyn_event_seq_op); +} + +static ssize_t dyn_event_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) +{ + return trace_parse_run_command(file, buffer, count, ppos, + create_dyn_event); +} + +static const struct file_operations dynamic_events_ops = { + .owner = THIS_MODULE, + .open = dyn_event_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, + .write = dyn_event_write, +}; + +/* Make a tracefs interface for controlling dynamic events */ +static __init int init_dynamic_event(void) +{ + int ret; + + ret = tracing_init_dentry(); + if (ret) + return 0; + + trace_create_file("dynamic_events", TRACE_MODE_WRITE, NULL, + NULL, &dynamic_events_ops); + + return 0; +} +fs_initcall(init_dynamic_event); + +/** + * dynevent_arg_add - Add an arg to a dynevent_cmd + * @cmd: A pointer to the dynevent_cmd struct representing the new event cmd + * @arg: The argument to append to the current cmd + * @check_arg: An (optional) pointer to a function checking arg sanity + * + * Append an argument to a dynevent_cmd. The argument string will be + * appended to the current cmd string, followed by a separator, if + * applicable. Before the argument is added, the @check_arg function, + * if present, will be used to check the sanity of the current arg + * string. + * + * The cmd string and separator should be set using the + * dynevent_arg_init() before any arguments are added using this + * function. + * + * Return: 0 if successful, error otherwise. + */ +int dynevent_arg_add(struct dynevent_cmd *cmd, + struct dynevent_arg *arg, + dynevent_check_arg_fn_t check_arg) +{ + int ret = 0; + + if (check_arg) { + ret = check_arg(arg); + if (ret) + return ret; + } + + ret = seq_buf_printf(&cmd->seq, " %s%c", arg->str, arg->separator); + if (ret) { + pr_err("String is too long: %s%c\n", arg->str, arg->separator); + return -E2BIG; + } + + return ret; +} + +/** + * dynevent_arg_pair_add - Add an arg pair to a dynevent_cmd + * @cmd: A pointer to the dynevent_cmd struct representing the new event cmd + * @arg_pair: The argument pair to append to the current cmd + * @check_arg: An (optional) pointer to a function checking arg sanity + * + * Append an argument pair to a dynevent_cmd. An argument pair + * consists of a left-hand-side argument and a right-hand-side + * argument separated by an operator, which can be whitespace, all + * followed by a separator, if applicable. This can be used to add + * arguments of the form 'type variable_name;' or 'x+y'. + * + * The lhs argument string will be appended to the current cmd string, + * followed by an operator, if applicable, followed by the rhs string, + * followed finally by a separator, if applicable. 
Before the + * argument is added, the @check_arg function, if present, will be + * used to check the sanity of the current arg strings. + * + * The cmd strings, operator, and separator should be set using the + * dynevent_arg_pair_init() before any arguments are added using this + * function. + * + * Return: 0 if successful, error otherwise. + */ +int dynevent_arg_pair_add(struct dynevent_cmd *cmd, + struct dynevent_arg_pair *arg_pair, + dynevent_check_arg_fn_t check_arg) +{ + int ret = 0; + + if (check_arg) { + ret = check_arg(arg_pair); + if (ret) + return ret; + } + + ret = seq_buf_printf(&cmd->seq, " %s%c%s%c", arg_pair->lhs, + arg_pair->operator, arg_pair->rhs, + arg_pair->separator); + if (ret) { + pr_err("field string is too long: %s%c%s%c\n", arg_pair->lhs, + arg_pair->operator, arg_pair->rhs, + arg_pair->separator); + return -E2BIG; + } + + return ret; +} + +/** + * dynevent_str_add - Add a string to a dynevent_cmd + * @cmd: A pointer to the dynevent_cmd struct representing the new event cmd + * @str: The string to append to the current cmd + * + * Append a string to a dynevent_cmd. The string will be appended to + * the current cmd string as-is, with nothing prepended or appended. + * + * Return: 0 if successful, error otherwise. + */ +int dynevent_str_add(struct dynevent_cmd *cmd, const char *str) +{ + int ret = 0; + + ret = seq_buf_puts(&cmd->seq, str); + if (ret) { + pr_err("String is too long: %s\n", str); + return -E2BIG; + } + + return ret; +} + +/** + * dynevent_cmd_init - Initialize a dynevent_cmd object + * @cmd: A pointer to the dynevent_cmd struct representing the cmd + * @buf: A pointer to the buffer to generate the command into + * @maxlen: The length of the buffer the command will be generated into + * @type: The type of the cmd, checked against further operations + * @run_command: The type-specific function that will actually run the command + * + * Initialize a dynevent_cmd. A dynevent_cmd is used to build up and + * run dynamic event creation commands, such as commands for creating + * synthetic and kprobe events. Before calling any of the functions + * used to build the command, a dynevent_cmd object should be + * instantiated and initialized using this function. + * + * The initialization sets things up by saving a pointer to the + * user-supplied buffer and its length via the @buf and @maxlen + * params, and by saving the cmd-specific @type and @run_command + * params which are used to check subsequent dynevent_cmd operations + * and actually run the command when complete. + */ +void dynevent_cmd_init(struct dynevent_cmd *cmd, char *buf, int maxlen, + enum dynevent_type type, + dynevent_create_fn_t run_command) +{ + memset(cmd, '\0', sizeof(*cmd)); + + seq_buf_init(&cmd->seq, buf, maxlen); + cmd->type = type; + cmd->run_command = run_command; +} + +/** + * dynevent_arg_init - Initialize a dynevent_arg object + * @arg: A pointer to the dynevent_arg struct representing the arg + * @separator: An (optional) separator, appended after adding the arg + * + * Initialize a dynevent_arg object. A dynevent_arg represents an + * object used to append single arguments to the current command + * string. After the arg string is successfully appended to the + * command string, the optional @separator is appended. If no + * separator was specified when initializing the arg, a space will be + * appended. 
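+ *
+ * A rough sketch of how the cmd/arg helpers fit together (in-tree users
+ * normally go through the kprobe/synthetic wrappers built on top of these;
+ * the type, run command and argument string below are only placeholders):
+ *
+ *	struct dynevent_cmd cmd;
+ *	struct dynevent_arg arg;
+ *	char *buf = kzalloc(MAX_DYNEVENT_CMD_LEN, GFP_KERNEL);
+ *
+ *	dynevent_cmd_init(&cmd, buf, MAX_DYNEVENT_CMD_LEN, type, run_command);
+ *	dynevent_arg_init(&arg, 0);
+ *	arg.str = "some_argument";
+ *	if (!dynevent_arg_add(&cmd, &arg, NULL))
+ *		dynevent_create(&cmd);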
+ */ +void dynevent_arg_init(struct dynevent_arg *arg, + char separator) +{ + memset(arg, '\0', sizeof(*arg)); + + if (!separator) + separator = ' '; + arg->separator = separator; +} + +/** + * dynevent_arg_pair_init - Initialize a dynevent_arg_pair object + * @arg_pair: A pointer to the dynevent_arg_pair struct representing the arg + * @operator: An (optional) operator, appended after adding the first arg + * @separator: An (optional) separator, appended after adding the second arg + * + * Initialize a dynevent_arg_pair object. A dynevent_arg_pair + * represents an object used to append argument pairs such as 'type + * variable_name;' or 'x+y' to the current command string. An + * argument pair consists of a left-hand-side argument and a + * right-hand-side argument separated by an operator, which can be + * whitespace, all followed by a separator, if applicable. After the + * first arg string is successfully appended to the command string, + * the optional @operator is appended, followed by the second arg and + * optional @separator. If no separator was specified when + * initializing the arg, a space will be appended. + */ +void dynevent_arg_pair_init(struct dynevent_arg_pair *arg_pair, + char operator, char separator) +{ + memset(arg_pair, '\0', sizeof(*arg_pair)); + + if (!operator) + operator = ' '; + arg_pair->operator = operator; + + if (!separator) + separator = ' '; + arg_pair->separator = separator; +} + +/** + * dynevent_create - Create the dynamic event contained in dynevent_cmd + * @cmd: The dynevent_cmd object containing the dynamic event creation command + * + * Once a dynevent_cmd object has been successfully built up via the + * dynevent_cmd_init(), dynevent_arg_add() and dynevent_arg_pair_add() + * functions, this function runs the final command to actually create + * the event. + * + * Return: 0 if the event was successfully created, error otherwise. + */ +int dynevent_create(struct dynevent_cmd *cmd) +{ + return cmd->run_command(cmd); +} +EXPORT_SYMBOL_GPL(dynevent_create); diff --git a/kernel/trace/trace_dynevent.h b/kernel/trace/trace_dynevent.h new file mode 100644 index 0000000000..936477a111 --- /dev/null +++ b/kernel/trace/trace_dynevent.h @@ -0,0 +1,154 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Common header file for generic dynamic events. + */ + +#ifndef _TRACE_DYNEVENT_H +#define _TRACE_DYNEVENT_H + +#include <linux/kernel.h> +#include <linux/list.h> +#include <linux/mutex.h> +#include <linux/seq_file.h> + +#include "trace.h" + +struct dyn_event; + +/** + * struct dyn_event_operations - Methods for each type of dynamic events + * + * These methods must be set for each type, since there is no default method. + * Before using this for dyn_event_init(), it must be registered by + * dyn_event_register(). + * + * @create: Parse and create event method. This is invoked when user passes + * a event definition to dynamic_events interface. This must not destruct + * the arguments and return -ECANCELED if given arguments doesn't match its + * command prefix. + * @show: Showing method. This is invoked when user reads the event definitions + * via dynamic_events interface. + * @is_busy: Check whether given event is busy so that it can not be deleted. + * Return true if it is busy, otherwise false. + * @free: Delete the given event. Return 0 if success, otherwise error. + * @match: Check whether given event and system name match this event. The argc + * and argv is used for exact match. Return true if it matches, otherwise + * false. 
+ * + * Except for @create, these methods are called under holding event_mutex. + */ +struct dyn_event_operations { + struct list_head list; + int (*create)(const char *raw_command); + int (*show)(struct seq_file *m, struct dyn_event *ev); + bool (*is_busy)(struct dyn_event *ev); + int (*free)(struct dyn_event *ev); + bool (*match)(const char *system, const char *event, + int argc, const char **argv, struct dyn_event *ev); +}; + +/* Register new dyn_event type -- must be called at first */ +int dyn_event_register(struct dyn_event_operations *ops); + +/** + * struct dyn_event - Dynamic event list header + * + * The dyn_event structure encapsulates a list and a pointer to the operators + * for making a global list of dynamic events. + * User must includes this in each event structure, so that those events can + * be added/removed via dynamic_events interface. + */ +struct dyn_event { + struct list_head list; + struct dyn_event_operations *ops; +}; + +extern struct list_head dyn_event_list; + +static inline +int dyn_event_init(struct dyn_event *ev, struct dyn_event_operations *ops) +{ + if (!ev || !ops) + return -EINVAL; + + INIT_LIST_HEAD(&ev->list); + ev->ops = ops; + return 0; +} + +static inline int dyn_event_add(struct dyn_event *ev, + struct trace_event_call *call) +{ + lockdep_assert_held(&event_mutex); + + if (!ev || !ev->ops) + return -EINVAL; + + call->flags |= TRACE_EVENT_FL_DYNAMIC; + list_add_tail(&ev->list, &dyn_event_list); + return 0; +} + +static inline void dyn_event_remove(struct dyn_event *ev) +{ + lockdep_assert_held(&event_mutex); + list_del_init(&ev->list); +} + +void *dyn_event_seq_start(struct seq_file *m, loff_t *pos); +void *dyn_event_seq_next(struct seq_file *m, void *v, loff_t *pos); +void dyn_event_seq_stop(struct seq_file *m, void *v); +int dyn_events_release_all(struct dyn_event_operations *type); +int dyn_event_release(const char *raw_command, struct dyn_event_operations *type); + +/* + * for_each_dyn_event - iterate over the dyn_event list + * @pos: the struct dyn_event * to use as a loop cursor + * + * This is just a basement of for_each macro. Wrap this for + * each actual event structure with ops filtering. + */ +#define for_each_dyn_event(pos) \ + list_for_each_entry(pos, &dyn_event_list, list) + +/* + * for_each_dyn_event - iterate over the dyn_event list safely + * @pos: the struct dyn_event * to use as a loop cursor + * @n: the struct dyn_event * to use as temporary storage + */ +#define for_each_dyn_event_safe(pos, n) \ + list_for_each_entry_safe(pos, n, &dyn_event_list, list) + +extern void dynevent_cmd_init(struct dynevent_cmd *cmd, char *buf, int maxlen, + enum dynevent_type type, + dynevent_create_fn_t run_command); + +typedef int (*dynevent_check_arg_fn_t)(void *data); + +struct dynevent_arg { + const char *str; + char separator; /* e.g. ';', ',', or nothing */ +}; + +extern void dynevent_arg_init(struct dynevent_arg *arg, + char separator); +extern int dynevent_arg_add(struct dynevent_cmd *cmd, + struct dynevent_arg *arg, + dynevent_check_arg_fn_t check_arg); + +struct dynevent_arg_pair { + const char *lhs; + const char *rhs; + char operator; /* e.g. '=' or nothing */ + char separator; /* e.g. 
';', ',', or nothing */ +}; + +extern void dynevent_arg_pair_init(struct dynevent_arg_pair *arg_pair, + char operator, char separator); + +extern int dynevent_arg_pair_add(struct dynevent_cmd *cmd, + struct dynevent_arg_pair *arg_pair, + dynevent_check_arg_fn_t check_arg); +extern int dynevent_str_add(struct dynevent_cmd *cmd, const char *str); + +#endif diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h new file mode 100644 index 0000000000..c47422b209 --- /dev/null +++ b/kernel/trace/trace_entries.h @@ -0,0 +1,429 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This file defines the trace event structures that go into the ring + * buffer directly. They are created via macros so that changes for them + * appear in the format file. Using macros will automate this process. + * + * The macro used to create a ftrace data structure is: + * + * FTRACE_ENTRY( name, struct_name, id, structure, print ) + * + * @name: the name used the event name, as well as the name of + * the directory that holds the format file. + * + * @struct_name: the name of the structure that is created. + * + * @id: The event identifier that is used to detect what event + * this is from the ring buffer. + * + * @structure: the structure layout + * + * - __field( type, item ) + * This is equivalent to declaring + * type item; + * in the structure. + * - __array( type, item, size ) + * This is equivalent to declaring + * type item[size]; + * in the structure. + * + * * for structures within structures, the format of the internal + * structure is laid out. This allows the internal structure + * to be deciphered for the format file. Although these macros + * may become out of sync with the internal structure, they + * will create a compile error if it happens. Since the + * internal structures are just tracing helpers, this is not + * an issue. + * + * When an internal structure is used, it should use: + * + * __field_struct( type, item ) + * + * instead of __field. This will prevent it from being shown in + * the output file. The fields in the structure should use. + * + * __field_desc( type, container, item ) + * __array_desc( type, container, item, len ) + * + * type, item and len are the same as __field and __array, but + * container is added. This is the name of the item in + * __field_struct that this is describing. + * + * + * @print: the print format shown to users in the format file. 
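+ *
+ * As an illustration, the FTRACE_ENTRY_REG(function, ftrace_entry, ...)
+ * entry below corresponds to a structure of roughly this shape:
+ *
+ *	struct ftrace_entry {
+ *		struct trace_entry	ent;
+ *		unsigned long		ip;
+ *		unsigned long		parent_ip;
+ *	};
+ *
+ * with the matching description exported through the tracefs
+ * events/ftrace/function/format file.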
+ */ + +/* + * Function trace entry - function address and parent function address: + */ +FTRACE_ENTRY_REG(function, ftrace_entry, + + TRACE_FN, + + F_STRUCT( + __field_fn( unsigned long, ip ) + __field_fn( unsigned long, parent_ip ) + ), + + F_printk(" %ps <-- %ps", + (void *)__entry->ip, (void *)__entry->parent_ip), + + perf_ftrace_event_register +); + +/* Function call entry */ +FTRACE_ENTRY_PACKED(funcgraph_entry, ftrace_graph_ent_entry, + + TRACE_GRAPH_ENT, + + F_STRUCT( + __field_struct( struct ftrace_graph_ent, graph_ent ) + __field_packed( unsigned long, graph_ent, func ) + __field_packed( int, graph_ent, depth ) + ), + + F_printk("--> %ps (%d)", (void *)__entry->func, __entry->depth) +); + +/* Function return entry */ +#ifdef CONFIG_FUNCTION_GRAPH_RETVAL + +FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry, + + TRACE_GRAPH_RET, + + F_STRUCT( + __field_struct( struct ftrace_graph_ret, ret ) + __field_packed( unsigned long, ret, func ) + __field_packed( unsigned long, ret, retval ) + __field_packed( int, ret, depth ) + __field_packed( unsigned int, ret, overrun ) + __field_packed( unsigned long long, ret, calltime) + __field_packed( unsigned long long, ret, rettime ) + ), + + F_printk("<-- %ps (%d) (start: %llx end: %llx) over: %d retval: %lx", + (void *)__entry->func, __entry->depth, + __entry->calltime, __entry->rettime, + __entry->depth, __entry->retval) +); + +#else + +FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry, + + TRACE_GRAPH_RET, + + F_STRUCT( + __field_struct( struct ftrace_graph_ret, ret ) + __field_packed( unsigned long, ret, func ) + __field_packed( int, ret, depth ) + __field_packed( unsigned int, ret, overrun ) + __field_packed( unsigned long long, ret, calltime) + __field_packed( unsigned long long, ret, rettime ) + ), + + F_printk("<-- %ps (%d) (start: %llx end: %llx) over: %d", + (void *)__entry->func, __entry->depth, + __entry->calltime, __entry->rettime, + __entry->depth) +); + +#endif + +/* + * Context switch trace entry - which task (and prio) we switched from/to: + * + * This is used for both wakeup and context switches. We only want + * to create one structure, but we need two outputs for it. + */ +#define FTRACE_CTX_FIELDS \ + __field( unsigned int, prev_pid ) \ + __field( unsigned int, next_pid ) \ + __field( unsigned int, next_cpu ) \ + __field( unsigned char, prev_prio ) \ + __field( unsigned char, prev_state ) \ + __field( unsigned char, next_prio ) \ + __field( unsigned char, next_state ) + +FTRACE_ENTRY(context_switch, ctx_switch_entry, + + TRACE_CTX, + + F_STRUCT( + FTRACE_CTX_FIELDS + ), + + F_printk("%u:%u:%u ==> %u:%u:%u [%03u]", + __entry->prev_pid, __entry->prev_prio, __entry->prev_state, + __entry->next_pid, __entry->next_prio, __entry->next_state, + __entry->next_cpu) +); + +/* + * FTRACE_ENTRY_DUP only creates the format file, it will not + * create another structure. 
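+ * The "wakeup" entry below is exactly that case: it reuses ctx_switch_entry
+ * from the "context_switch" entry above and only provides a second
+ * format/print description.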
+ */ +FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry, + + TRACE_WAKE, + + F_STRUCT( + FTRACE_CTX_FIELDS + ), + + F_printk("%u:%u:%u ==+ %u:%u:%u [%03u]", + __entry->prev_pid, __entry->prev_prio, __entry->prev_state, + __entry->next_pid, __entry->next_prio, __entry->next_state, + __entry->next_cpu) +); + +/* + * Stack-trace entry: + */ + +#define FTRACE_STACK_ENTRIES 8 + +FTRACE_ENTRY(kernel_stack, stack_entry, + + TRACE_STACK, + + F_STRUCT( + __field( int, size ) + __stack_array( unsigned long, caller, FTRACE_STACK_ENTRIES, size) + ), + + F_printk("\t=> %ps\n\t=> %ps\n\t=> %ps\n" + "\t=> %ps\n\t=> %ps\n\t=> %ps\n" + "\t=> %ps\n\t=> %ps\n", + (void *)__entry->caller[0], (void *)__entry->caller[1], + (void *)__entry->caller[2], (void *)__entry->caller[3], + (void *)__entry->caller[4], (void *)__entry->caller[5], + (void *)__entry->caller[6], (void *)__entry->caller[7]) +); + +FTRACE_ENTRY(user_stack, userstack_entry, + + TRACE_USER_STACK, + + F_STRUCT( + __field( unsigned int, tgid ) + __array( unsigned long, caller, FTRACE_STACK_ENTRIES ) + ), + + F_printk("\t=> %ps\n\t=> %ps\n\t=> %ps\n" + "\t=> %ps\n\t=> %ps\n\t=> %ps\n" + "\t=> %ps\n\t=> %ps\n", + (void *)__entry->caller[0], (void *)__entry->caller[1], + (void *)__entry->caller[2], (void *)__entry->caller[3], + (void *)__entry->caller[4], (void *)__entry->caller[5], + (void *)__entry->caller[6], (void *)__entry->caller[7]) +); + +/* + * trace_printk entry: + */ +FTRACE_ENTRY(bprint, bprint_entry, + + TRACE_BPRINT, + + F_STRUCT( + __field( unsigned long, ip ) + __field( const char *, fmt ) + __dynamic_array( u32, buf ) + ), + + F_printk("%ps: %s", + (void *)__entry->ip, __entry->fmt) +); + +FTRACE_ENTRY_REG(print, print_entry, + + TRACE_PRINT, + + F_STRUCT( + __field( unsigned long, ip ) + __dynamic_array( char, buf ) + ), + + F_printk("%ps: %s", + (void *)__entry->ip, __entry->buf), + + ftrace_event_register +); + +FTRACE_ENTRY(raw_data, raw_data_entry, + + TRACE_RAW_DATA, + + F_STRUCT( + __field( unsigned int, id ) + __dynamic_array( char, buf ) + ), + + F_printk("id:%04x %08x", + __entry->id, (int)__entry->buf[0]) +); + +FTRACE_ENTRY(bputs, bputs_entry, + + TRACE_BPUTS, + + F_STRUCT( + __field( unsigned long, ip ) + __field( const char *, str ) + ), + + F_printk("%ps: %s", + (void *)__entry->ip, __entry->str) +); + +FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw, + + TRACE_MMIO_RW, + + F_STRUCT( + __field_struct( struct mmiotrace_rw, rw ) + __field_desc( resource_size_t, rw, phys ) + __field_desc( unsigned long, rw, value ) + __field_desc( unsigned long, rw, pc ) + __field_desc( int, rw, map_id ) + __field_desc( unsigned char, rw, opcode ) + __field_desc( unsigned char, rw, width ) + ), + + F_printk("%lx %lx %lx %d %x %x", + (unsigned long)__entry->phys, __entry->value, __entry->pc, + __entry->map_id, __entry->opcode, __entry->width) +); + +FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map, + + TRACE_MMIO_MAP, + + F_STRUCT( + __field_struct( struct mmiotrace_map, map ) + __field_desc( resource_size_t, map, phys ) + __field_desc( unsigned long, map, virt ) + __field_desc( unsigned long, map, len ) + __field_desc( int, map, map_id ) + __field_desc( unsigned char, map, opcode ) + ), + + F_printk("%lx %lx %lx %d %x", + (unsigned long)__entry->phys, __entry->virt, __entry->len, + __entry->map_id, __entry->opcode) +); + + +#define TRACE_FUNC_SIZE 30 +#define TRACE_FILE_SIZE 20 + +FTRACE_ENTRY(branch, trace_branch, + + TRACE_BRANCH, + + F_STRUCT( + __field( unsigned int, line ) + __array( char, func, TRACE_FUNC_SIZE+1 ) + __array( char, file, 
TRACE_FILE_SIZE+1 ) + __field( char, correct ) + __field( char, constant ) + ), + + F_printk("%u:%s:%s (%u)%s", + __entry->line, + __entry->func, __entry->file, __entry->correct, + __entry->constant ? " CONSTANT" : "") +); + + +FTRACE_ENTRY(hwlat, hwlat_entry, + + TRACE_HWLAT, + + F_STRUCT( + __field( u64, duration ) + __field( u64, outer_duration ) + __field( u64, nmi_total_ts ) + __field_struct( struct timespec64, timestamp ) + __field_desc( s64, timestamp, tv_sec ) + __field_desc( long, timestamp, tv_nsec ) + __field( unsigned int, nmi_count ) + __field( unsigned int, seqnum ) + __field( unsigned int, count ) + ), + + F_printk("cnt:%u\tts:%010llu.%010lu\tinner:%llu\touter:%llu\tcount:%d\tnmi-ts:%llu\tnmi-count:%u\n", + __entry->seqnum, + __entry->tv_sec, + __entry->tv_nsec, + __entry->duration, + __entry->outer_duration, + __entry->count, + __entry->nmi_total_ts, + __entry->nmi_count) +); + +#define FUNC_REPEATS_GET_DELTA_TS(entry) \ + (((u64)(entry)->top_delta_ts << 32) | (entry)->bottom_delta_ts) \ + +FTRACE_ENTRY(func_repeats, func_repeats_entry, + + TRACE_FUNC_REPEATS, + + F_STRUCT( + __field( unsigned long, ip ) + __field( unsigned long, parent_ip ) + __field( u16 , count ) + __field( u16 , top_delta_ts ) + __field( u32 , bottom_delta_ts ) + ), + + F_printk(" %ps <-%ps\t(repeats:%u delta: -%llu)", + (void *)__entry->ip, + (void *)__entry->parent_ip, + __entry->count, + FUNC_REPEATS_GET_DELTA_TS(__entry)) +); + +FTRACE_ENTRY(osnoise, osnoise_entry, + + TRACE_OSNOISE, + + F_STRUCT( + __field( u64, noise ) + __field( u64, runtime ) + __field( u64, max_sample ) + __field( unsigned int, hw_count ) + __field( unsigned int, nmi_count ) + __field( unsigned int, irq_count ) + __field( unsigned int, softirq_count ) + __field( unsigned int, thread_count ) + ), + + F_printk("noise:%llu\tmax_sample:%llu\thw:%u\tnmi:%u\tirq:%u\tsoftirq:%u\tthread:%u\n", + __entry->noise, + __entry->max_sample, + __entry->hw_count, + __entry->nmi_count, + __entry->irq_count, + __entry->softirq_count, + __entry->thread_count) +); + +FTRACE_ENTRY(timerlat, timerlat_entry, + + TRACE_TIMERLAT, + + F_STRUCT( + __field( unsigned int, seqnum ) + __field( int, context ) + __field( u64, timer_latency ) + ), + + F_printk("seq:%u\tcontext:%d\ttimer_latency:%llu\n", + __entry->seqnum, + __entry->context, + __entry->timer_latency) +); diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c new file mode 100644 index 0000000000..72714cbf47 --- /dev/null +++ b/kernel/trace/trace_eprobe.c @@ -0,0 +1,987 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * event probes + * + * Part of this code was copied from kernel/trace/trace_kprobe.c written by + * Masami Hiramatsu <mhiramat@kernel.org> + * + * Copyright (C) 2021, VMware Inc, Steven Rostedt <rostedt@goodmis.org> + * Copyright (C) 2021, VMware Inc, Tzvetomir Stoyanov tz.stoyanov@gmail.com> + * + */ +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/ftrace.h> + +#include "trace_dynevent.h" +#include "trace_probe.h" +#include "trace_probe_tmpl.h" +#include "trace_probe_kernel.h" + +#define EPROBE_EVENT_SYSTEM "eprobes" + +struct trace_eprobe { + /* tracepoint system */ + const char *event_system; + + /* tracepoint event */ + const char *event_name; + + /* filter string for the tracepoint */ + char *filter_str; + + struct trace_event_call *event; + + struct dyn_event devent; + struct trace_probe tp; +}; + +struct eprobe_data { + struct trace_event_file *file; + struct trace_eprobe *ep; +}; + + +#define for_each_trace_eprobe_tp(ep, _tp) \ + 
list_for_each_entry(ep, trace_probe_probe_list(_tp), tp.list) + +static int __trace_eprobe_create(int argc, const char *argv[]); + +static void trace_event_probe_cleanup(struct trace_eprobe *ep) +{ + if (!ep) + return; + trace_probe_cleanup(&ep->tp); + kfree(ep->event_name); + kfree(ep->event_system); + if (ep->event) + trace_event_put_ref(ep->event); + kfree(ep->filter_str); + kfree(ep); +} + +static struct trace_eprobe *to_trace_eprobe(struct dyn_event *ev) +{ + return container_of(ev, struct trace_eprobe, devent); +} + +static int eprobe_dyn_event_create(const char *raw_command) +{ + return trace_probe_create(raw_command, __trace_eprobe_create); +} + +static int eprobe_dyn_event_show(struct seq_file *m, struct dyn_event *ev) +{ + struct trace_eprobe *ep = to_trace_eprobe(ev); + int i; + + seq_printf(m, "e:%s/%s", trace_probe_group_name(&ep->tp), + trace_probe_name(&ep->tp)); + seq_printf(m, " %s.%s", ep->event_system, ep->event_name); + + for (i = 0; i < ep->tp.nr_args; i++) + seq_printf(m, " %s=%s", ep->tp.args[i].name, ep->tp.args[i].comm); + seq_putc(m, '\n'); + + return 0; +} + +static int unregister_trace_eprobe(struct trace_eprobe *ep) +{ + /* If other probes are on the event, just unregister eprobe */ + if (trace_probe_has_sibling(&ep->tp)) + goto unreg; + + /* Enabled event can not be unregistered */ + if (trace_probe_is_enabled(&ep->tp)) + return -EBUSY; + + /* Will fail if probe is being used by ftrace or perf */ + if (trace_probe_unregister_event_call(&ep->tp)) + return -EBUSY; + +unreg: + dyn_event_remove(&ep->devent); + trace_probe_unlink(&ep->tp); + + return 0; +} + +static int eprobe_dyn_event_release(struct dyn_event *ev) +{ + struct trace_eprobe *ep = to_trace_eprobe(ev); + int ret = unregister_trace_eprobe(ep); + + if (!ret) + trace_event_probe_cleanup(ep); + return ret; +} + +static bool eprobe_dyn_event_is_busy(struct dyn_event *ev) +{ + struct trace_eprobe *ep = to_trace_eprobe(ev); + + return trace_probe_is_enabled(&ep->tp); +} + +static bool eprobe_dyn_event_match(const char *system, const char *event, + int argc, const char **argv, struct dyn_event *ev) +{ + struct trace_eprobe *ep = to_trace_eprobe(ev); + const char *slash; + + /* + * We match the following: + * event only - match all eprobes with event name + * system and event only - match all system/event probes + * system only - match all system probes + * + * The below has the above satisfied with more arguments: + * + * attached system/event - If the arg has the system and event + * the probe is attached to, match + * probes with the attachment. + * + * If any more args are given, then it requires a full match. + */ + + /* + * If system exists, but this probe is not part of that system + * do not match. 
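+	 *
+	 * As an illustration (names are examples only), for an eprobe
+	 * "eprobes/myprobe" attached to "sched/sched_switch":
+	 * "-:myprobe", "-:eprobes/myprobe" and "-:myprobe sched.sched_switch"
+	 * all select it when written to dynamic_events, while
+	 * "-:myprobe irq.irq_handler_entry" does not.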
+ */ + if (system && strcmp(trace_probe_group_name(&ep->tp), system) != 0) + return false; + + /* Must match the event name */ + if (event[0] != '\0' && strcmp(trace_probe_name(&ep->tp), event) != 0) + return false; + + /* No arguments match all */ + if (argc < 1) + return true; + + /* First argument is the system/event the probe is attached to */ + + slash = strchr(argv[0], '/'); + if (!slash) + slash = strchr(argv[0], '.'); + if (!slash) + return false; + + if (strncmp(ep->event_system, argv[0], slash - argv[0])) + return false; + if (strcmp(ep->event_name, slash + 1)) + return false; + + argc--; + argv++; + + /* If there are no other args, then match */ + if (argc < 1) + return true; + + return trace_probe_match_command_args(&ep->tp, argc, argv); +} + +static struct dyn_event_operations eprobe_dyn_event_ops = { + .create = eprobe_dyn_event_create, + .show = eprobe_dyn_event_show, + .is_busy = eprobe_dyn_event_is_busy, + .free = eprobe_dyn_event_release, + .match = eprobe_dyn_event_match, +}; + +static struct trace_eprobe *alloc_event_probe(const char *group, + const char *this_event, + struct trace_event_call *event, + int nargs) +{ + struct trace_eprobe *ep; + const char *event_name; + const char *sys_name; + int ret = -ENOMEM; + + if (!event) + return ERR_PTR(-ENODEV); + + sys_name = event->class->system; + event_name = trace_event_name(event); + + ep = kzalloc(struct_size(ep, tp.args, nargs), GFP_KERNEL); + if (!ep) { + trace_event_put_ref(event); + goto error; + } + ep->event = event; + ep->event_name = kstrdup(event_name, GFP_KERNEL); + if (!ep->event_name) + goto error; + ep->event_system = kstrdup(sys_name, GFP_KERNEL); + if (!ep->event_system) + goto error; + + ret = trace_probe_init(&ep->tp, this_event, group, false); + if (ret < 0) + goto error; + + dyn_event_init(&ep->devent, &eprobe_dyn_event_ops); + return ep; +error: + trace_event_probe_cleanup(ep); + return ERR_PTR(ret); +} + +static int eprobe_event_define_fields(struct trace_event_call *event_call) +{ + struct eprobe_trace_entry_head field; + struct trace_probe *tp; + + tp = trace_probe_primary_from_call(event_call); + if (WARN_ON_ONCE(!tp)) + return -ENOENT; + + return traceprobe_define_arg_fields(event_call, sizeof(field), tp); +} + +static struct trace_event_fields eprobe_fields_array[] = { + { .type = TRACE_FUNCTION_TYPE, + .define_fields = eprobe_event_define_fields }, + {} +}; + +/* Event entry printers */ +static enum print_line_t +print_eprobe_event(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct eprobe_trace_entry_head *field; + struct trace_event_call *pevent; + struct trace_event *probed_event; + struct trace_seq *s = &iter->seq; + struct trace_eprobe *ep; + struct trace_probe *tp; + unsigned int type; + + field = (struct eprobe_trace_entry_head *)iter->ent; + tp = trace_probe_primary_from_call( + container_of(event, struct trace_event_call, event)); + if (WARN_ON_ONCE(!tp)) + goto out; + + ep = container_of(tp, struct trace_eprobe, tp); + type = ep->event->event.type; + + trace_seq_printf(s, "%s: (", trace_probe_name(tp)); + + probed_event = ftrace_find_event(type); + if (probed_event) { + pevent = container_of(probed_event, struct trace_event_call, event); + trace_seq_printf(s, "%s.%s", pevent->class->system, + trace_event_name(pevent)); + } else { + trace_seq_printf(s, "%u", type); + } + + trace_seq_putc(s, ')'); + + if (trace_probe_print_args(s, tp->args, tp->nr_args, + (u8 *)&field[1], field) < 0) + goto out; + + trace_seq_putc(s, '\n'); + out: + return 
trace_handle_return(s); +} + +static nokprobe_inline unsigned long +get_event_field(struct fetch_insn *code, void *rec) +{ + struct ftrace_event_field *field = code->data; + unsigned long val; + void *addr; + + addr = rec + field->offset; + + if (is_string_field(field)) { + switch (field->filter_type) { + case FILTER_DYN_STRING: + val = (unsigned long)(rec + (*(unsigned int *)addr & 0xffff)); + break; + case FILTER_RDYN_STRING: + val = (unsigned long)(addr + (*(unsigned int *)addr & 0xffff)); + break; + case FILTER_STATIC_STRING: + val = (unsigned long)addr; + break; + case FILTER_PTR_STRING: + val = (unsigned long)(*(char *)addr); + break; + default: + WARN_ON_ONCE(1); + return 0; + } + return val; + } + + switch (field->size) { + case 1: + if (field->is_signed) + val = *(char *)addr; + else + val = *(unsigned char *)addr; + break; + case 2: + if (field->is_signed) + val = *(short *)addr; + else + val = *(unsigned short *)addr; + break; + case 4: + if (field->is_signed) + val = *(int *)addr; + else + val = *(unsigned int *)addr; + break; + default: + if (field->is_signed) + val = *(long *)addr; + else + val = *(unsigned long *)addr; + break; + } + return val; +} + +static int get_eprobe_size(struct trace_probe *tp, void *rec) +{ + struct fetch_insn *code; + struct probe_arg *arg; + int i, len, ret = 0; + + for (i = 0; i < tp->nr_args; i++) { + arg = tp->args + i; + if (arg->dynamic) { + unsigned long val; + + code = arg->code; + retry: + switch (code->op) { + case FETCH_OP_TP_ARG: + val = get_event_field(code, rec); + break; + case FETCH_NOP_SYMBOL: /* Ignore a place holder */ + code++; + goto retry; + default: + if (process_common_fetch_insn(code, &val) < 0) + continue; + } + code++; + len = process_fetch_insn_bottom(code, val, NULL, NULL); + if (len > 0) + ret += len; + } + } + + return ret; +} + +/* Kprobe specific fetch functions */ + +/* Note that we don't verify it, since the code does not come from user space */ +static int +process_fetch_insn(struct fetch_insn *code, void *rec, void *dest, + void *base) +{ + unsigned long val; + int ret; + + retry: + switch (code->op) { + case FETCH_OP_TP_ARG: + val = get_event_field(code, rec); + break; + case FETCH_NOP_SYMBOL: /* Ignore a place holder */ + code++; + goto retry; + default: + ret = process_common_fetch_insn(code, &val); + if (ret < 0) + return ret; + } + code++; + return process_fetch_insn_bottom(code, val, dest, base); +} +NOKPROBE_SYMBOL(process_fetch_insn) + +/* eprobe handler */ +static inline void +__eprobe_trace_func(struct eprobe_data *edata, void *rec) +{ + struct eprobe_trace_entry_head *entry; + struct trace_event_call *call = trace_probe_event_call(&edata->ep->tp); + struct trace_event_buffer fbuffer; + int dsize; + + if (WARN_ON_ONCE(call != edata->file->event_call)) + return; + + if (trace_trigger_soft_disabled(edata->file)) + return; + + dsize = get_eprobe_size(&edata->ep->tp, rec); + + entry = trace_event_buffer_reserve(&fbuffer, edata->file, + sizeof(*entry) + edata->ep->tp.size + dsize); + + if (!entry) + return; + + entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event); + store_trace_args(&entry[1], &edata->ep->tp, rec, sizeof(*entry), dsize); + + trace_event_buffer_commit(&fbuffer); +} + +/* + * The event probe implementation uses event triggers to get access to + * the event it is attached to, but is not an actual trigger. The below + * functions are just stubs to fulfill what is needed to use the trigger + * infrastructure. 
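+ * The only callback that does real work is eprobe_trigger_func(), which
+ * the attached event's trigger path invokes with the recorded entry and
+ * which then logs the eprobe event via __eprobe_trace_func().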
+ */ +static int eprobe_trigger_init(struct event_trigger_data *data) +{ + return 0; +} + +static void eprobe_trigger_free(struct event_trigger_data *data) +{ + +} + +static int eprobe_trigger_print(struct seq_file *m, + struct event_trigger_data *data) +{ + /* Do not print eprobe event triggers */ + return 0; +} + +static void eprobe_trigger_func(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, + struct ring_buffer_event *rbe) +{ + struct eprobe_data *edata = data->private_data; + + if (unlikely(!rec)) + return; + + __eprobe_trace_func(edata, rec); +} + +static struct event_trigger_ops eprobe_trigger_ops = { + .trigger = eprobe_trigger_func, + .print = eprobe_trigger_print, + .init = eprobe_trigger_init, + .free = eprobe_trigger_free, +}; + +static int eprobe_trigger_cmd_parse(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, char *cmd, + char *param_and_filter) +{ + return -1; +} + +static int eprobe_trigger_reg_func(char *glob, + struct event_trigger_data *data, + struct trace_event_file *file) +{ + return -1; +} + +static void eprobe_trigger_unreg_func(char *glob, + struct event_trigger_data *data, + struct trace_event_file *file) +{ + +} + +static struct event_trigger_ops *eprobe_trigger_get_ops(char *cmd, + char *param) +{ + return &eprobe_trigger_ops; +} + +static struct event_command event_trigger_cmd = { + .name = "eprobe", + .trigger_type = ETT_EVENT_EPROBE, + .flags = EVENT_CMD_FL_NEEDS_REC, + .parse = eprobe_trigger_cmd_parse, + .reg = eprobe_trigger_reg_func, + .unreg = eprobe_trigger_unreg_func, + .unreg_all = NULL, + .get_trigger_ops = eprobe_trigger_get_ops, + .set_filter = NULL, +}; + +static struct event_trigger_data * +new_eprobe_trigger(struct trace_eprobe *ep, struct trace_event_file *file) +{ + struct event_trigger_data *trigger; + struct event_filter *filter = NULL; + struct eprobe_data *edata; + int ret; + + edata = kzalloc(sizeof(*edata), GFP_KERNEL); + trigger = kzalloc(sizeof(*trigger), GFP_KERNEL); + if (!trigger || !edata) { + ret = -ENOMEM; + goto error; + } + + trigger->flags = EVENT_TRIGGER_FL_PROBE; + trigger->count = -1; + trigger->ops = &eprobe_trigger_ops; + + /* + * EVENT PROBE triggers are not registered as commands with + * register_event_command(), as they are not controlled by the user + * from the trigger file + */ + trigger->cmd_ops = &event_trigger_cmd; + + INIT_LIST_HEAD(&trigger->list); + + if (ep->filter_str) { + ret = create_event_filter(file->tr, ep->event, + ep->filter_str, false, &filter); + if (ret) + goto error; + } + RCU_INIT_POINTER(trigger->filter, filter); + + edata->file = file; + edata->ep = ep; + trigger->private_data = edata; + + return trigger; +error: + free_event_filter(filter); + kfree(edata); + kfree(trigger); + return ERR_PTR(ret); +} + +static int enable_eprobe(struct trace_eprobe *ep, + struct trace_event_file *eprobe_file) +{ + struct event_trigger_data *trigger; + struct trace_event_file *file; + struct trace_array *tr = eprobe_file->tr; + + file = find_event_file(tr, ep->event_system, ep->event_name); + if (!file) + return -ENOENT; + trigger = new_eprobe_trigger(ep, eprobe_file); + if (IS_ERR(trigger)) + return PTR_ERR(trigger); + + list_add_tail_rcu(&trigger->list, &file->triggers); + + trace_event_trigger_enable_disable(file, 1); + update_cond_flag(file); + + return 0; +} + +static struct trace_event_functions eprobe_funcs = { + .trace = print_eprobe_event +}; + +static int disable_eprobe(struct trace_eprobe *ep, + struct trace_array *tr) +{ + struct 
event_trigger_data *trigger = NULL, *iter; + struct trace_event_file *file; + struct event_filter *filter; + struct eprobe_data *edata; + + file = find_event_file(tr, ep->event_system, ep->event_name); + if (!file) + return -ENOENT; + + list_for_each_entry(iter, &file->triggers, list) { + if (!(iter->flags & EVENT_TRIGGER_FL_PROBE)) + continue; + edata = iter->private_data; + if (edata->ep == ep) { + trigger = iter; + break; + } + } + if (!trigger) + return -ENODEV; + + list_del_rcu(&trigger->list); + + trace_event_trigger_enable_disable(file, 0); + update_cond_flag(file); + + /* Make sure nothing is using the edata or trigger */ + tracepoint_synchronize_unregister(); + + filter = rcu_access_pointer(trigger->filter); + + if (filter) + free_event_filter(filter); + kfree(edata); + kfree(trigger); + + return 0; +} + +static int enable_trace_eprobe(struct trace_event_call *call, + struct trace_event_file *file) +{ + struct trace_probe *tp; + struct trace_eprobe *ep; + bool enabled; + int ret = 0; + int cnt = 0; + + tp = trace_probe_primary_from_call(call); + if (WARN_ON_ONCE(!tp)) + return -ENODEV; + enabled = trace_probe_is_enabled(tp); + + /* This also changes "enabled" state */ + if (file) { + ret = trace_probe_add_file(tp, file); + if (ret) + return ret; + } else + trace_probe_set_flag(tp, TP_FLAG_PROFILE); + + if (enabled) + return 0; + + for_each_trace_eprobe_tp(ep, tp) { + ret = enable_eprobe(ep, file); + if (ret) + break; + enabled = true; + cnt++; + } + + if (ret) { + /* Failed to enable one of them. Roll back all */ + if (enabled) { + /* + * It's a bug if one failed for something other than memory + * not being available but another eprobe succeeded. + */ + WARN_ON_ONCE(ret != -ENOMEM); + + for_each_trace_eprobe_tp(ep, tp) { + disable_eprobe(ep, file->tr); + if (!--cnt) + break; + } + } + if (file) + trace_probe_remove_file(tp, file); + else + trace_probe_clear_flag(tp, TP_FLAG_PROFILE); + } + + return ret; +} + +static int disable_trace_eprobe(struct trace_event_call *call, + struct trace_event_file *file) +{ + struct trace_probe *tp; + struct trace_eprobe *ep; + + tp = trace_probe_primary_from_call(call); + if (WARN_ON_ONCE(!tp)) + return -ENODEV; + + if (file) { + if (!trace_probe_get_file_link(tp, file)) + return -ENOENT; + if (!trace_probe_has_single_file(tp)) + goto out; + trace_probe_clear_flag(tp, TP_FLAG_TRACE); + } else + trace_probe_clear_flag(tp, TP_FLAG_PROFILE); + + if (!trace_probe_is_enabled(tp)) { + for_each_trace_eprobe_tp(ep, tp) + disable_eprobe(ep, file->tr); + } + + out: + if (file) + /* + * Synchronization is done in below function. For perf event, + * file == NULL and perf_trace_event_unreg() calls + * tracepoint_synchronize_unregister() to ensure synchronize + * event. We don't need to care about it. 
+ */ + trace_probe_remove_file(tp, file); + + return 0; +} + +static int eprobe_register(struct trace_event_call *event, + enum trace_reg type, void *data) +{ + struct trace_event_file *file = data; + + switch (type) { + case TRACE_REG_REGISTER: + return enable_trace_eprobe(event, file); + case TRACE_REG_UNREGISTER: + return disable_trace_eprobe(event, file); +#ifdef CONFIG_PERF_EVENTS + case TRACE_REG_PERF_REGISTER: + case TRACE_REG_PERF_UNREGISTER: + case TRACE_REG_PERF_OPEN: + case TRACE_REG_PERF_CLOSE: + case TRACE_REG_PERF_ADD: + case TRACE_REG_PERF_DEL: + return 0; +#endif + } + return 0; +} + +static inline void init_trace_eprobe_call(struct trace_eprobe *ep) +{ + struct trace_event_call *call = trace_probe_event_call(&ep->tp); + + call->flags = TRACE_EVENT_FL_EPROBE; + call->event.funcs = &eprobe_funcs; + call->class->fields_array = eprobe_fields_array; + call->class->reg = eprobe_register; +} + +static struct trace_event_call * +find_and_get_event(const char *system, const char *event_name) +{ + struct trace_event_call *tp_event; + const char *name; + + list_for_each_entry(tp_event, &ftrace_events, list) { + /* Skip other probes and ftrace events */ + if (tp_event->flags & + (TRACE_EVENT_FL_IGNORE_ENABLE | + TRACE_EVENT_FL_KPROBE | + TRACE_EVENT_FL_UPROBE | + TRACE_EVENT_FL_EPROBE)) + continue; + if (!tp_event->class->system || + strcmp(system, tp_event->class->system)) + continue; + name = trace_event_name(tp_event); + if (!name || strcmp(event_name, name)) + continue; + if (!trace_event_try_get_ref(tp_event)) { + return NULL; + break; + } + return tp_event; + break; + } + return NULL; +} + +static int trace_eprobe_tp_update_arg(struct trace_eprobe *ep, const char *argv[], int i) +{ + struct traceprobe_parse_context ctx = { + .event = ep->event, + .flags = TPARG_FL_KERNEL | TPARG_FL_TEVENT, + }; + int ret; + + ret = traceprobe_parse_probe_arg(&ep->tp, i, argv[i], &ctx); + /* Handle symbols "@" */ + if (!ret) + ret = traceprobe_update_arg(&ep->tp.args[i]); + + traceprobe_finish_parse(&ctx); + return ret; +} + +static int trace_eprobe_parse_filter(struct trace_eprobe *ep, int argc, const char *argv[]) +{ + struct event_filter *dummy = NULL; + int i, ret, len = 0; + char *p; + + if (argc == 0) { + trace_probe_log_err(0, NO_EP_FILTER); + return -EINVAL; + } + + /* Recover the filter string */ + for (i = 0; i < argc; i++) + len += strlen(argv[i]) + 1; + + ep->filter_str = kzalloc(len, GFP_KERNEL); + if (!ep->filter_str) + return -ENOMEM; + + p = ep->filter_str; + for (i = 0; i < argc; i++) { + if (i) + ret = snprintf(p, len, " %s", argv[i]); + else + ret = snprintf(p, len, "%s", argv[i]); + p += ret; + len -= ret; + } + + /* + * Ensure the filter string can be parsed correctly. Note, this + * filter string is for the original event, not for the eprobe. 
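+ * For example (hypothetical probe), with
+ *   e:myprobe sched.sched_switch next_pid=$next_pid if next_pid > 128
+ * the "next_pid > 128" expression is validated against the fields of
+ * sched/sched_switch, not against the fields of the new eprobe.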
+ */ + ret = create_event_filter(top_trace_array(), ep->event, ep->filter_str, + true, &dummy); + free_event_filter(dummy); + if (ret) + goto error; + + return 0; +error: + kfree(ep->filter_str); + ep->filter_str = NULL; + return ret; +} + +static int __trace_eprobe_create(int argc, const char *argv[]) +{ + /* + * Argument syntax: + * e[:[GRP/][ENAME]] SYSTEM.EVENT [FETCHARGS] [if FILTER] + * Fetch args (no space): + * <name>=$<field>[:TYPE] + */ + const char *event = NULL, *group = EPROBE_EVENT_SYSTEM; + const char *sys_event = NULL, *sys_name = NULL; + struct trace_event_call *event_call; + struct trace_eprobe *ep = NULL; + char buf1[MAX_EVENT_NAME_LEN]; + char buf2[MAX_EVENT_NAME_LEN]; + char gbuf[MAX_EVENT_NAME_LEN]; + int ret = 0, filter_idx = 0; + int i, filter_cnt; + + if (argc < 2 || argv[0][0] != 'e') + return -ECANCELED; + + trace_probe_log_init("event_probe", argc, argv); + + event = strchr(&argv[0][1], ':'); + if (event) { + event++; + ret = traceprobe_parse_event_name(&event, &group, gbuf, + event - argv[0]); + if (ret) + goto parse_error; + } + + trace_probe_log_set_index(1); + sys_event = argv[1]; + ret = traceprobe_parse_event_name(&sys_event, &sys_name, buf2, 0); + if (ret || !sys_event || !sys_name) { + trace_probe_log_err(0, NO_EVENT_INFO); + goto parse_error; + } + + if (!event) { + strscpy(buf1, sys_event, MAX_EVENT_NAME_LEN); + event = buf1; + } + + for (i = 2; i < argc; i++) { + if (!strcmp(argv[i], "if")) { + filter_idx = i + 1; + filter_cnt = argc - filter_idx; + argc = i; + break; + } + } + + mutex_lock(&event_mutex); + event_call = find_and_get_event(sys_name, sys_event); + ep = alloc_event_probe(group, event, event_call, argc - 2); + mutex_unlock(&event_mutex); + + if (IS_ERR(ep)) { + ret = PTR_ERR(ep); + if (ret == -ENODEV) + trace_probe_log_err(0, BAD_ATTACH_EVENT); + /* This must return -ENOMEM or missing event, else there is a bug */ + WARN_ON_ONCE(ret != -ENOMEM && ret != -ENODEV); + ep = NULL; + goto error; + } + + if (filter_idx) { + trace_probe_log_set_index(filter_idx); + ret = trace_eprobe_parse_filter(ep, filter_cnt, argv + filter_idx); + if (ret) + goto parse_error; + } else + ep->filter_str = NULL; + + argc -= 2; argv += 2; + /* parse arguments */ + for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { + trace_probe_log_set_index(i + 2); + ret = trace_eprobe_tp_update_arg(ep, argv, i); + if (ret) + goto error; + } + ret = traceprobe_set_print_fmt(&ep->tp, PROBE_PRINT_EVENT); + if (ret < 0) + goto error; + init_trace_eprobe_call(ep); + mutex_lock(&event_mutex); + ret = trace_probe_register_event_call(&ep->tp); + if (ret) { + if (ret == -EEXIST) { + trace_probe_log_set_index(0); + trace_probe_log_err(0, EVENT_EXIST); + } + mutex_unlock(&event_mutex); + goto error; + } + ret = dyn_event_add(&ep->devent, &ep->tp.event->call); + mutex_unlock(&event_mutex); + return ret; +parse_error: + ret = -EINVAL; +error: + trace_event_probe_cleanup(ep); + return ret; +} + +/* + * Register dynevent at core_initcall. This allows kernel to setup eprobe + * events in postcore_initcall without tracefs. 
+ */ +static __init int trace_events_eprobe_init_early(void) +{ + int err = 0; + + err = dyn_event_register(&eprobe_dyn_event_ops); + if (err) + pr_warn("Could not register eprobe_dyn_event_ops\n"); + + return err; +} +core_initcall(trace_events_eprobe_init_early); diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c new file mode 100644 index 0000000000..05e7912418 --- /dev/null +++ b/kernel/trace/trace_event_perf.c @@ -0,0 +1,525 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * trace event based perf event profiling/tracing + * + * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra + * Copyright (C) 2009-2010 Frederic Weisbecker <fweisbec@gmail.com> + */ + +#include <linux/module.h> +#include <linux/kprobes.h> +#include <linux/security.h> +#include "trace.h" +#include "trace_probe.h" + +static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS]; + +/* + * Force it to be aligned to unsigned long to avoid misaligned accesses + * surprises + */ +typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)]) + perf_trace_t; + +/* Count the events in use (per event id, not per instance) */ +static int total_ref_count; + +static int perf_trace_event_perm(struct trace_event_call *tp_event, + struct perf_event *p_event) +{ + int ret; + + if (tp_event->perf_perm) { + ret = tp_event->perf_perm(tp_event, p_event); + if (ret) + return ret; + } + + /* + * We checked and allowed to create parent, + * allow children without checking. + */ + if (p_event->parent) + return 0; + + /* + * It's ok to check current process (owner) permissions in here, + * because code below is called only via perf_event_open syscall. + */ + + /* The ftrace function trace is allowed only for root. */ + if (ftrace_event_is_function(tp_event)) { + ret = perf_allow_tracepoint(&p_event->attr); + if (ret) + return ret; + + if (!is_sampling_event(p_event)) + return 0; + + /* + * We don't allow user space callchains for function trace + * event, due to issues with page faults while tracing page + * fault handler and its overall trickiness nature. + */ + if (!p_event->attr.exclude_callchain_user) + return -EINVAL; + + /* + * Same reason to disable user stack dump as for user space + * callchains above. + */ + if (p_event->attr.sample_type & PERF_SAMPLE_STACK_USER) + return -EINVAL; + } + + /* No tracing, just counting, so no obvious leak */ + if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW)) + return 0; + + /* Some events are ok to be traced by non-root users... */ + if (p_event->attach_state == PERF_ATTACH_TASK) { + if (tp_event->flags & TRACE_EVENT_FL_CAP_ANY) + return 0; + } + + /* + * ...otherwise raw tracepoint data can be a severe data leak, + * only allow root to have these. 
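+ * In practice perf_allow_tracepoint() below gates this on CAP_PERFMON
+ * (or CAP_SYS_ADMIN) or a sufficiently permissive
+ * kernel.perf_event_paranoid setting.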
+ */ + ret = perf_allow_tracepoint(&p_event->attr); + if (ret) + return ret; + + return 0; +} + +static int perf_trace_event_reg(struct trace_event_call *tp_event, + struct perf_event *p_event) +{ + struct hlist_head __percpu *list; + int ret = -ENOMEM; + int cpu; + + p_event->tp_event = tp_event; + if (tp_event->perf_refcount++ > 0) + return 0; + + list = alloc_percpu(struct hlist_head); + if (!list) + goto fail; + + for_each_possible_cpu(cpu) + INIT_HLIST_HEAD(per_cpu_ptr(list, cpu)); + + tp_event->perf_events = list; + + if (!total_ref_count) { + char __percpu *buf; + int i; + + for (i = 0; i < PERF_NR_CONTEXTS; i++) { + buf = (char __percpu *)alloc_percpu(perf_trace_t); + if (!buf) + goto fail; + + perf_trace_buf[i] = buf; + } + } + + ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER, NULL); + if (ret) + goto fail; + + total_ref_count++; + return 0; + +fail: + if (!total_ref_count) { + int i; + + for (i = 0; i < PERF_NR_CONTEXTS; i++) { + free_percpu(perf_trace_buf[i]); + perf_trace_buf[i] = NULL; + } + } + + if (!--tp_event->perf_refcount) { + free_percpu(tp_event->perf_events); + tp_event->perf_events = NULL; + } + + return ret; +} + +static void perf_trace_event_unreg(struct perf_event *p_event) +{ + struct trace_event_call *tp_event = p_event->tp_event; + int i; + + if (--tp_event->perf_refcount > 0) + return; + + tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER, NULL); + + /* + * Ensure our callback won't be called anymore. The buffers + * will be freed after that. + */ + tracepoint_synchronize_unregister(); + + free_percpu(tp_event->perf_events); + tp_event->perf_events = NULL; + + if (!--total_ref_count) { + for (i = 0; i < PERF_NR_CONTEXTS; i++) { + free_percpu(perf_trace_buf[i]); + perf_trace_buf[i] = NULL; + } + } +} + +static int perf_trace_event_open(struct perf_event *p_event) +{ + struct trace_event_call *tp_event = p_event->tp_event; + return tp_event->class->reg(tp_event, TRACE_REG_PERF_OPEN, p_event); +} + +static void perf_trace_event_close(struct perf_event *p_event) +{ + struct trace_event_call *tp_event = p_event->tp_event; + tp_event->class->reg(tp_event, TRACE_REG_PERF_CLOSE, p_event); +} + +static int perf_trace_event_init(struct trace_event_call *tp_event, + struct perf_event *p_event) +{ + int ret; + + ret = perf_trace_event_perm(tp_event, p_event); + if (ret) + return ret; + + ret = perf_trace_event_reg(tp_event, p_event); + if (ret) + return ret; + + ret = perf_trace_event_open(p_event); + if (ret) { + perf_trace_event_unreg(p_event); + return ret; + } + + return 0; +} + +int perf_trace_init(struct perf_event *p_event) +{ + struct trace_event_call *tp_event; + u64 event_id = p_event->attr.config; + int ret = -EINVAL; + + mutex_lock(&event_mutex); + list_for_each_entry(tp_event, &ftrace_events, list) { + if (tp_event->event.type == event_id && + tp_event->class && tp_event->class->reg && + trace_event_try_get_ref(tp_event)) { + ret = perf_trace_event_init(tp_event, p_event); + if (ret) + trace_event_put_ref(tp_event); + break; + } + } + mutex_unlock(&event_mutex); + + return ret; +} + +void perf_trace_destroy(struct perf_event *p_event) +{ + mutex_lock(&event_mutex); + perf_trace_event_close(p_event); + perf_trace_event_unreg(p_event); + trace_event_put_ref(p_event->tp_event); + mutex_unlock(&event_mutex); +} + +#ifdef CONFIG_KPROBE_EVENTS +int perf_kprobe_init(struct perf_event *p_event, bool is_retprobe) +{ + int ret; + char *func = NULL; + struct trace_event_call *tp_event; + + if (p_event->attr.kprobe_func) { + func = 
strndup_user(u64_to_user_ptr(p_event->attr.kprobe_func), + KSYM_NAME_LEN); + if (IS_ERR(func)) { + ret = PTR_ERR(func); + return (ret == -EINVAL) ? -E2BIG : ret; + } + + if (func[0] == '\0') { + kfree(func); + func = NULL; + } + } + + tp_event = create_local_trace_kprobe( + func, (void *)(unsigned long)(p_event->attr.kprobe_addr), + p_event->attr.probe_offset, is_retprobe); + if (IS_ERR(tp_event)) { + ret = PTR_ERR(tp_event); + goto out; + } + + mutex_lock(&event_mutex); + ret = perf_trace_event_init(tp_event, p_event); + if (ret) + destroy_local_trace_kprobe(tp_event); + mutex_unlock(&event_mutex); +out: + kfree(func); + return ret; +} + +void perf_kprobe_destroy(struct perf_event *p_event) +{ + mutex_lock(&event_mutex); + perf_trace_event_close(p_event); + perf_trace_event_unreg(p_event); + trace_event_put_ref(p_event->tp_event); + mutex_unlock(&event_mutex); + + destroy_local_trace_kprobe(p_event->tp_event); +} +#endif /* CONFIG_KPROBE_EVENTS */ + +#ifdef CONFIG_UPROBE_EVENTS +int perf_uprobe_init(struct perf_event *p_event, + unsigned long ref_ctr_offset, bool is_retprobe) +{ + int ret; + char *path = NULL; + struct trace_event_call *tp_event; + + if (!p_event->attr.uprobe_path) + return -EINVAL; + + path = strndup_user(u64_to_user_ptr(p_event->attr.uprobe_path), + PATH_MAX); + if (IS_ERR(path)) { + ret = PTR_ERR(path); + return (ret == -EINVAL) ? -E2BIG : ret; + } + if (path[0] == '\0') { + ret = -EINVAL; + goto out; + } + + tp_event = create_local_trace_uprobe(path, p_event->attr.probe_offset, + ref_ctr_offset, is_retprobe); + if (IS_ERR(tp_event)) { + ret = PTR_ERR(tp_event); + goto out; + } + + /* + * local trace_uprobe need to hold event_mutex to call + * uprobe_buffer_enable() and uprobe_buffer_disable(). + * event_mutex is not required for local trace_kprobes. + */ + mutex_lock(&event_mutex); + ret = perf_trace_event_init(tp_event, p_event); + if (ret) + destroy_local_trace_uprobe(tp_event); + mutex_unlock(&event_mutex); +out: + kfree(path); + return ret; +} + +void perf_uprobe_destroy(struct perf_event *p_event) +{ + mutex_lock(&event_mutex); + perf_trace_event_close(p_event); + perf_trace_event_unreg(p_event); + trace_event_put_ref(p_event->tp_event); + mutex_unlock(&event_mutex); + destroy_local_trace_uprobe(p_event->tp_event); +} +#endif /* CONFIG_UPROBE_EVENTS */ + +int perf_trace_add(struct perf_event *p_event, int flags) +{ + struct trace_event_call *tp_event = p_event->tp_event; + + if (!(flags & PERF_EF_START)) + p_event->hw.state = PERF_HES_STOPPED; + + /* + * If TRACE_REG_PERF_ADD returns false; no custom action was performed + * and we need to take the default action of enqueueing our event on + * the right per-cpu hlist. + */ + if (!tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event)) { + struct hlist_head __percpu *pcpu_list; + struct hlist_head *list; + + pcpu_list = tp_event->perf_events; + if (WARN_ON_ONCE(!pcpu_list)) + return -EINVAL; + + list = this_cpu_ptr(pcpu_list); + hlist_add_head_rcu(&p_event->hlist_entry, list); + } + + return 0; +} + +void perf_trace_del(struct perf_event *p_event, int flags) +{ + struct trace_event_call *tp_event = p_event->tp_event; + + /* + * If TRACE_REG_PERF_DEL returns false; no custom action was performed + * and we need to take the default action of dequeueing our event from + * the right per-cpu hlist. 
+ */ + if (!tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event)) + hlist_del_rcu(&p_event->hlist_entry); +} + +void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp) +{ + char *raw_data; + int rctx; + + BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long)); + + if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, + "perf buffer not large enough, wanted %d, have %d", + size, PERF_MAX_TRACE_SIZE)) + return NULL; + + *rctxp = rctx = perf_swevent_get_recursion_context(); + if (rctx < 0) + return NULL; + + if (regs) + *regs = this_cpu_ptr(&__perf_regs[rctx]); + raw_data = this_cpu_ptr(perf_trace_buf[rctx]); + + /* zero the dead bytes from align to not leak stack to user */ + memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64)); + return raw_data; +} +EXPORT_SYMBOL_GPL(perf_trace_buf_alloc); +NOKPROBE_SYMBOL(perf_trace_buf_alloc); + +void perf_trace_buf_update(void *record, u16 type) +{ + struct trace_entry *entry = record; + + tracing_generic_entry_update(entry, type, tracing_gen_ctx()); +} +NOKPROBE_SYMBOL(perf_trace_buf_update); + +#ifdef CONFIG_FUNCTION_TRACER +static void +perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *ops, struct ftrace_regs *fregs) +{ + struct ftrace_entry *entry; + struct perf_event *event; + struct hlist_head head; + struct pt_regs regs; + int rctx; + int bit; + + if (!rcu_is_watching()) + return; + + bit = ftrace_test_recursion_trylock(ip, parent_ip); + if (bit < 0) + return; + + if ((unsigned long)ops->private != smp_processor_id()) + goto out; + + event = container_of(ops, struct perf_event, ftrace_ops); + + /* + * @event->hlist entry is NULL (per INIT_HLIST_NODE), and all + * the perf code does is hlist_for_each_entry_rcu(), so we can + * get away with simply setting the @head.first pointer in order + * to create a singular list. 
+ */ + head.first = &event->hlist_entry; + +#define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \ + sizeof(u64)) - sizeof(u32)) + + BUILD_BUG_ON(ENTRY_SIZE > PERF_MAX_TRACE_SIZE); + + memset(&regs, 0, sizeof(regs)); + perf_fetch_caller_regs(&regs); + + entry = perf_trace_buf_alloc(ENTRY_SIZE, NULL, &rctx); + if (!entry) + goto out; + + entry->ip = ip; + entry->parent_ip = parent_ip; + perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN, + 1, &regs, &head, NULL); + +out: + ftrace_test_recursion_unlock(bit); +#undef ENTRY_SIZE +} + +static int perf_ftrace_function_register(struct perf_event *event) +{ + struct ftrace_ops *ops = &event->ftrace_ops; + + ops->func = perf_ftrace_function_call; + ops->private = (void *)(unsigned long)nr_cpu_ids; + + return register_ftrace_function(ops); +} + +static int perf_ftrace_function_unregister(struct perf_event *event) +{ + struct ftrace_ops *ops = &event->ftrace_ops; + int ret = unregister_ftrace_function(ops); + ftrace_free_filter(ops); + return ret; +} + +int perf_ftrace_event_register(struct trace_event_call *call, + enum trace_reg type, void *data) +{ + struct perf_event *event = data; + + switch (type) { + case TRACE_REG_REGISTER: + case TRACE_REG_UNREGISTER: + break; + case TRACE_REG_PERF_REGISTER: + case TRACE_REG_PERF_UNREGISTER: + return 0; + case TRACE_REG_PERF_OPEN: + return perf_ftrace_function_register(data); + case TRACE_REG_PERF_CLOSE: + return perf_ftrace_function_unregister(data); + case TRACE_REG_PERF_ADD: + event->ftrace_ops.private = (void *)(unsigned long)smp_processor_id(); + return 1; + case TRACE_REG_PERF_DEL: + event->ftrace_ops.private = (void *)(unsigned long)nr_cpu_ids; + return 1; + } + + return -EINVAL; +} +#endif /* CONFIG_FUNCTION_TRACER */ diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c new file mode 100644 index 0000000000..82cb22ad6d --- /dev/null +++ b/kernel/trace/trace_events.c @@ -0,0 +1,4173 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * event tracer + * + * Copyright (C) 2008 Red Hat Inc, Steven Rostedt <srostedt@redhat.com> + * + * - Added format output of fields of the trace point. + * This was based off of work by Tom Zanussi <tzanussi@gmail.com>. 
+ * + */ + +#define pr_fmt(fmt) fmt + +#include <linux/workqueue.h> +#include <linux/security.h> +#include <linux/spinlock.h> +#include <linux/kthread.h> +#include <linux/tracefs.h> +#include <linux/uaccess.h> +#include <linux/module.h> +#include <linux/ctype.h> +#include <linux/sort.h> +#include <linux/slab.h> +#include <linux/delay.h> + +#include <trace/events/sched.h> +#include <trace/syscall.h> + +#include <asm/setup.h> + +#include "trace_output.h" + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM "TRACE_SYSTEM" + +DEFINE_MUTEX(event_mutex); + +LIST_HEAD(ftrace_events); +static LIST_HEAD(ftrace_generic_fields); +static LIST_HEAD(ftrace_common_fields); +static bool eventdir_initialized; + +static LIST_HEAD(module_strings); + +struct module_string { + struct list_head next; + struct module *module; + char *str; +}; + +#define GFP_TRACE (GFP_KERNEL | __GFP_ZERO) + +static struct kmem_cache *field_cachep; +static struct kmem_cache *file_cachep; + +static inline int system_refcount(struct event_subsystem *system) +{ + return system->ref_count; +} + +static int system_refcount_inc(struct event_subsystem *system) +{ + return system->ref_count++; +} + +static int system_refcount_dec(struct event_subsystem *system) +{ + return --system->ref_count; +} + +/* Double loops, do not use break, only goto's work */ +#define do_for_each_event_file(tr, file) \ + list_for_each_entry(tr, &ftrace_trace_arrays, list) { \ + list_for_each_entry(file, &tr->events, list) + +#define do_for_each_event_file_safe(tr, file) \ + list_for_each_entry(tr, &ftrace_trace_arrays, list) { \ + struct trace_event_file *___n; \ + list_for_each_entry_safe(file, ___n, &tr->events, list) + +#define while_for_each_event_file() \ + } + +static struct ftrace_event_field * +__find_event_field(struct list_head *head, char *name) +{ + struct ftrace_event_field *field; + + list_for_each_entry(field, head, link) { + if (!strcmp(field->name, name)) + return field; + } + + return NULL; +} + +struct ftrace_event_field * +trace_find_event_field(struct trace_event_call *call, char *name) +{ + struct ftrace_event_field *field; + struct list_head *head; + + head = trace_get_fields(call); + field = __find_event_field(head, name); + if (field) + return field; + + field = __find_event_field(&ftrace_generic_fields, name); + if (field) + return field; + + return __find_event_field(&ftrace_common_fields, name); +} + +static int __trace_define_field(struct list_head *head, const char *type, + const char *name, int offset, int size, + int is_signed, int filter_type, int len) +{ + struct ftrace_event_field *field; + + field = kmem_cache_alloc(field_cachep, GFP_TRACE); + if (!field) + return -ENOMEM; + + field->name = name; + field->type = type; + + if (filter_type == FILTER_OTHER) + field->filter_type = filter_assign_type(type); + else + field->filter_type = filter_type; + + field->offset = offset; + field->size = size; + field->is_signed = is_signed; + field->len = len; + + list_add(&field->link, head); + + return 0; +} + +int trace_define_field(struct trace_event_call *call, const char *type, + const char *name, int offset, int size, int is_signed, + int filter_type) +{ + struct list_head *head; + + if (WARN_ON(!call->class)) + return 0; + + head = trace_get_fields(call); + return __trace_define_field(head, type, name, offset, size, + is_signed, filter_type, 0); +} +EXPORT_SYMBOL_GPL(trace_define_field); + +static int trace_define_field_ext(struct trace_event_call *call, const char *type, + const char *name, int offset, int size, int is_signed, + int 
filter_type, int len) +{ + struct list_head *head; + + if (WARN_ON(!call->class)) + return 0; + + head = trace_get_fields(call); + return __trace_define_field(head, type, name, offset, size, + is_signed, filter_type, len); +} + +#define __generic_field(type, item, filter_type) \ + ret = __trace_define_field(&ftrace_generic_fields, #type, \ + #item, 0, 0, is_signed_type(type), \ + filter_type, 0); \ + if (ret) \ + return ret; + +#define __common_field(type, item) \ + ret = __trace_define_field(&ftrace_common_fields, #type, \ + "common_" #item, \ + offsetof(typeof(ent), item), \ + sizeof(ent.item), \ + is_signed_type(type), FILTER_OTHER, 0); \ + if (ret) \ + return ret; + +static int trace_define_generic_fields(void) +{ + int ret; + + __generic_field(int, CPU, FILTER_CPU); + __generic_field(int, cpu, FILTER_CPU); + __generic_field(int, common_cpu, FILTER_CPU); + __generic_field(char *, COMM, FILTER_COMM); + __generic_field(char *, comm, FILTER_COMM); + __generic_field(char *, stacktrace, FILTER_STACKTRACE); + __generic_field(char *, STACKTRACE, FILTER_STACKTRACE); + + return ret; +} + +static int trace_define_common_fields(void) +{ + int ret; + struct trace_entry ent; + + __common_field(unsigned short, type); + __common_field(unsigned char, flags); + /* Holds both preempt_count and migrate_disable */ + __common_field(unsigned char, preempt_count); + __common_field(int, pid); + + return ret; +} + +static void trace_destroy_fields(struct trace_event_call *call) +{ + struct ftrace_event_field *field, *next; + struct list_head *head; + + head = trace_get_fields(call); + list_for_each_entry_safe(field, next, head, link) { + list_del(&field->link); + kmem_cache_free(field_cachep, field); + } +} + +/* + * run-time version of trace_event_get_offsets_<call>() that returns the last + * accessible offset of trace fields excluding __dynamic_array bytes + */ +int trace_event_get_offsets(struct trace_event_call *call) +{ + struct ftrace_event_field *tail; + struct list_head *head; + + head = trace_get_fields(call); + /* + * head->next points to the last field with the largest offset, + * since it was added last by trace_define_field() + */ + tail = list_first_entry(head, struct ftrace_event_field, link); + return tail->offset + tail->size; +} + +/* + * Check if the referenced field is an array and return true, + * as arrays are OK to dereference. + */ +static bool test_field(const char *fmt, struct trace_event_call *call) +{ + struct trace_event_fields *field = call->class->fields_array; + const char *array_descriptor; + const char *p = fmt; + int len; + + if (!(len = str_has_prefix(fmt, "REC->"))) + return false; + fmt += len; + for (p = fmt; *p; p++) { + if (!isalnum(*p) && *p != '_') + break; + } + len = p - fmt; + + for (; field->type; field++) { + if (strncmp(field->name, fmt, len) || + field->name[len]) + continue; + array_descriptor = strchr(field->type, '['); + /* This is an array and is OK to dereference. */ + return array_descriptor != NULL; + } + return false; +} + +/* + * Examine the print fmt of the event looking for unsafe dereference + * pointers using %p* that could be recorded in the trace event and + * much later referenced after the pointer was freed. Dereferencing + * pointers are OK, if it is dereferenced into the event itself. 
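+ * For example (hypothetical fields), "%pI4", REC->saddr is fine when
+ * saddr is an array recorded in the event itself, while "%pI4",
+ * REC->addr_ptr is flagged when addr_ptr is a saved pointer, since the
+ * memory it points to may already be freed when the event is printed.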
+ */ +static void test_event_printk(struct trace_event_call *call) +{ + u64 dereference_flags = 0; + bool first = true; + const char *fmt, *c, *r, *a; + int parens = 0; + char in_quote = 0; + int start_arg = 0; + int arg = 0; + int i; + + fmt = call->print_fmt; + + if (!fmt) + return; + + for (i = 0; fmt[i]; i++) { + switch (fmt[i]) { + case '\\': + i++; + if (!fmt[i]) + return; + continue; + case '"': + case '\'': + /* + * The print fmt starts with a string that + * is processed first to find %p* usage, + * then after the first string, the print fmt + * contains arguments that are used to check + * if the dereferenced %p* usage is safe. + */ + if (first) { + if (fmt[i] == '\'') + continue; + if (in_quote) { + arg = 0; + first = false; + /* + * If there was no %p* uses + * the fmt is OK. + */ + if (!dereference_flags) + return; + } + } + if (in_quote) { + if (in_quote == fmt[i]) + in_quote = 0; + } else { + in_quote = fmt[i]; + } + continue; + case '%': + if (!first || !in_quote) + continue; + i++; + if (!fmt[i]) + return; + switch (fmt[i]) { + case '%': + continue; + case 'p': + /* Find dereferencing fields */ + switch (fmt[i + 1]) { + case 'B': case 'R': case 'r': + case 'b': case 'M': case 'm': + case 'I': case 'i': case 'E': + case 'U': case 'V': case 'N': + case 'a': case 'd': case 'D': + case 'g': case 't': case 'C': + case 'O': case 'f': + if (WARN_ONCE(arg == 63, + "Too many args for event: %s", + trace_event_name(call))) + return; + dereference_flags |= 1ULL << arg; + } + break; + default: + { + bool star = false; + int j; + + /* Increment arg if %*s exists. */ + for (j = 0; fmt[i + j]; j++) { + if (isdigit(fmt[i + j]) || + fmt[i + j] == '.') + continue; + if (fmt[i + j] == '*') { + star = true; + continue; + } + if ((fmt[i + j] == 's') && star) + arg++; + break; + } + break; + } /* default */ + + } /* switch */ + arg++; + continue; + case '(': + if (in_quote) + continue; + parens++; + continue; + case ')': + if (in_quote) + continue; + parens--; + if (WARN_ONCE(parens < 0, + "Paren mismatch for event: %s\narg='%s'\n%*s", + trace_event_name(call), + fmt + start_arg, + (i - start_arg) + 5, "^")) + return; + continue; + case ',': + if (in_quote || parens) + continue; + i++; + while (isspace(fmt[i])) + i++; + start_arg = i; + if (!(dereference_flags & (1ULL << arg))) + goto next_arg; + + /* Find the REC-> in the argument */ + c = strchr(fmt + i, ','); + r = strstr(fmt + i, "REC->"); + if (r && (!c || r < c)) { + /* + * Addresses of events on the buffer, + * or an array on the buffer is + * OK to dereference. + * There's ways to fool this, but + * this is to catch common mistakes, + * not malicious code. + */ + a = strchr(fmt + i, '&'); + if ((a && (a < r)) || test_field(r, call)) + dereference_flags &= ~(1ULL << arg); + } else if ((r = strstr(fmt + i, "__get_dynamic_array(")) && + (!c || r < c)) { + dereference_flags &= ~(1ULL << arg); + } else if ((r = strstr(fmt + i, "__get_sockaddr(")) && + (!c || r < c)) { + dereference_flags &= ~(1ULL << arg); + } + + next_arg: + i--; + arg++; + } + } + + /* + * If you triggered the below warning, the trace event reported + * uses an unsafe dereference pointer %p*. As the data stored + * at the trace event time may no longer exist when the trace + * event is printed, dereferencing to the original source is + * unsafe. The source of the dereference must be copied into the + * event itself, and the dereference must access the copy instead. 
+ */ + if (WARN_ON_ONCE(dereference_flags)) { + arg = 1; + while (!(dereference_flags & 1)) { + dereference_flags >>= 1; + arg++; + } + pr_warn("event %s has unsafe dereference of argument %d\n", + trace_event_name(call), arg); + pr_warn("print_fmt: %s\n", fmt); + } +} + +int trace_event_raw_init(struct trace_event_call *call) +{ + int id; + + id = register_trace_event(&call->event); + if (!id) + return -ENODEV; + + test_event_printk(call); + + return 0; +} +EXPORT_SYMBOL_GPL(trace_event_raw_init); + +bool trace_event_ignore_this_pid(struct trace_event_file *trace_file) +{ + struct trace_array *tr = trace_file->tr; + struct trace_array_cpu *data; + struct trace_pid_list *no_pid_list; + struct trace_pid_list *pid_list; + + pid_list = rcu_dereference_raw(tr->filtered_pids); + no_pid_list = rcu_dereference_raw(tr->filtered_no_pids); + + if (!pid_list && !no_pid_list) + return false; + + data = this_cpu_ptr(tr->array_buffer.data); + + return data->ignore_pid; +} +EXPORT_SYMBOL_GPL(trace_event_ignore_this_pid); + +void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer, + struct trace_event_file *trace_file, + unsigned long len) +{ + struct trace_event_call *event_call = trace_file->event_call; + + if ((trace_file->flags & EVENT_FILE_FL_PID_FILTER) && + trace_event_ignore_this_pid(trace_file)) + return NULL; + + /* + * If CONFIG_PREEMPTION is enabled, then the tracepoint itself disables + * preemption (adding one to the preempt_count). Since we are + * interested in the preempt_count at the time the tracepoint was + * hit, we need to subtract one to offset the increment. + */ + fbuffer->trace_ctx = tracing_gen_ctx_dec(); + fbuffer->trace_file = trace_file; + + fbuffer->event = + trace_event_buffer_lock_reserve(&fbuffer->buffer, trace_file, + event_call->event.type, len, + fbuffer->trace_ctx); + if (!fbuffer->event) + return NULL; + + fbuffer->regs = NULL; + fbuffer->entry = ring_buffer_event_data(fbuffer->event); + return fbuffer->entry; +} +EXPORT_SYMBOL_GPL(trace_event_buffer_reserve); + +int trace_event_reg(struct trace_event_call *call, + enum trace_reg type, void *data) +{ + struct trace_event_file *file = data; + + WARN_ON(!(call->flags & TRACE_EVENT_FL_TRACEPOINT)); + switch (type) { + case TRACE_REG_REGISTER: + return tracepoint_probe_register(call->tp, + call->class->probe, + file); + case TRACE_REG_UNREGISTER: + tracepoint_probe_unregister(call->tp, + call->class->probe, + file); + return 0; + +#ifdef CONFIG_PERF_EVENTS + case TRACE_REG_PERF_REGISTER: + return tracepoint_probe_register(call->tp, + call->class->perf_probe, + call); + case TRACE_REG_PERF_UNREGISTER: + tracepoint_probe_unregister(call->tp, + call->class->perf_probe, + call); + return 0; + case TRACE_REG_PERF_OPEN: + case TRACE_REG_PERF_CLOSE: + case TRACE_REG_PERF_ADD: + case TRACE_REG_PERF_DEL: + return 0; +#endif + } + return 0; +} +EXPORT_SYMBOL_GPL(trace_event_reg); + +void trace_event_enable_cmd_record(bool enable) +{ + struct trace_event_file *file; + struct trace_array *tr; + + lockdep_assert_held(&event_mutex); + + do_for_each_event_file(tr, file) { + + if (!(file->flags & EVENT_FILE_FL_ENABLED)) + continue; + + if (enable) { + tracing_start_cmdline_record(); + set_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags); + } else { + tracing_stop_cmdline_record(); + clear_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags); + } + } while_for_each_event_file(); +} + +void trace_event_enable_tgid_record(bool enable) +{ + struct trace_event_file *file; + struct trace_array *tr; + + 
lockdep_assert_held(&event_mutex); + + do_for_each_event_file(tr, file) { + if (!(file->flags & EVENT_FILE_FL_ENABLED)) + continue; + + if (enable) { + tracing_start_tgid_record(); + set_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags); + } else { + tracing_stop_tgid_record(); + clear_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, + &file->flags); + } + } while_for_each_event_file(); +} + +static int __ftrace_event_enable_disable(struct trace_event_file *file, + int enable, int soft_disable) +{ + struct trace_event_call *call = file->event_call; + struct trace_array *tr = file->tr; + int ret = 0; + int disable; + + switch (enable) { + case 0: + /* + * When soft_disable is set and enable is cleared, the sm_ref + * reference counter is decremented. If it reaches 0, we want + * to clear the SOFT_DISABLED flag but leave the event in the + * state that it was. That is, if the event was enabled and + * SOFT_DISABLED isn't set, then do nothing. But if SOFT_DISABLED + * is set we do not want the event to be enabled before we + * clear the bit. + * + * When soft_disable is not set but the SOFT_MODE flag is, + * we do nothing. Do not disable the tracepoint, otherwise + * "soft enable"s (clearing the SOFT_DISABLED bit) wont work. + */ + if (soft_disable) { + if (atomic_dec_return(&file->sm_ref) > 0) + break; + disable = file->flags & EVENT_FILE_FL_SOFT_DISABLED; + clear_bit(EVENT_FILE_FL_SOFT_MODE_BIT, &file->flags); + /* Disable use of trace_buffered_event */ + trace_buffered_event_disable(); + } else + disable = !(file->flags & EVENT_FILE_FL_SOFT_MODE); + + if (disable && (file->flags & EVENT_FILE_FL_ENABLED)) { + clear_bit(EVENT_FILE_FL_ENABLED_BIT, &file->flags); + if (file->flags & EVENT_FILE_FL_RECORDED_CMD) { + tracing_stop_cmdline_record(); + clear_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags); + } + + if (file->flags & EVENT_FILE_FL_RECORDED_TGID) { + tracing_stop_tgid_record(); + clear_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags); + } + + call->class->reg(call, TRACE_REG_UNREGISTER, file); + } + /* If in SOFT_MODE, just set the SOFT_DISABLE_BIT, else clear it */ + if (file->flags & EVENT_FILE_FL_SOFT_MODE) + set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags); + else + clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags); + break; + case 1: + /* + * When soft_disable is set and enable is set, we want to + * register the tracepoint for the event, but leave the event + * as is. That means, if the event was already enabled, we do + * nothing (but set SOFT_MODE). If the event is disabled, we + * set SOFT_DISABLED before enabling the event tracepoint, so + * it still seems to be disabled. + */ + if (!soft_disable) + clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags); + else { + if (atomic_inc_return(&file->sm_ref) > 1) + break; + set_bit(EVENT_FILE_FL_SOFT_MODE_BIT, &file->flags); + /* Enable use of trace_buffered_event */ + trace_buffered_event_enable(); + } + + if (!(file->flags & EVENT_FILE_FL_ENABLED)) { + bool cmd = false, tgid = false; + + /* Keep the event disabled, when going to SOFT_MODE. 
*/ + if (soft_disable) + set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags); + + if (tr->trace_flags & TRACE_ITER_RECORD_CMD) { + cmd = true; + tracing_start_cmdline_record(); + set_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags); + } + + if (tr->trace_flags & TRACE_ITER_RECORD_TGID) { + tgid = true; + tracing_start_tgid_record(); + set_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags); + } + + ret = call->class->reg(call, TRACE_REG_REGISTER, file); + if (ret) { + if (cmd) + tracing_stop_cmdline_record(); + if (tgid) + tracing_stop_tgid_record(); + pr_info("event trace: Could not enable event " + "%s\n", trace_event_name(call)); + break; + } + set_bit(EVENT_FILE_FL_ENABLED_BIT, &file->flags); + + /* WAS_ENABLED gets set but never cleared. */ + set_bit(EVENT_FILE_FL_WAS_ENABLED_BIT, &file->flags); + } + break; + } + + return ret; +} + +int trace_event_enable_disable(struct trace_event_file *file, + int enable, int soft_disable) +{ + return __ftrace_event_enable_disable(file, enable, soft_disable); +} + +static int ftrace_event_enable_disable(struct trace_event_file *file, + int enable) +{ + return __ftrace_event_enable_disable(file, enable, 0); +} + +static void ftrace_clear_events(struct trace_array *tr) +{ + struct trace_event_file *file; + + mutex_lock(&event_mutex); + list_for_each_entry(file, &tr->events, list) { + ftrace_event_enable_disable(file, 0); + } + mutex_unlock(&event_mutex); +} + +static void +event_filter_pid_sched_process_exit(void *data, struct task_struct *task) +{ + struct trace_pid_list *pid_list; + struct trace_array *tr = data; + + pid_list = rcu_dereference_raw(tr->filtered_pids); + trace_filter_add_remove_task(pid_list, NULL, task); + + pid_list = rcu_dereference_raw(tr->filtered_no_pids); + trace_filter_add_remove_task(pid_list, NULL, task); +} + +static void +event_filter_pid_sched_process_fork(void *data, + struct task_struct *self, + struct task_struct *task) +{ + struct trace_pid_list *pid_list; + struct trace_array *tr = data; + + pid_list = rcu_dereference_sched(tr->filtered_pids); + trace_filter_add_remove_task(pid_list, self, task); + + pid_list = rcu_dereference_sched(tr->filtered_no_pids); + trace_filter_add_remove_task(pid_list, self, task); +} + +void trace_event_follow_fork(struct trace_array *tr, bool enable) +{ + if (enable) { + register_trace_prio_sched_process_fork(event_filter_pid_sched_process_fork, + tr, INT_MIN); + register_trace_prio_sched_process_free(event_filter_pid_sched_process_exit, + tr, INT_MAX); + } else { + unregister_trace_sched_process_fork(event_filter_pid_sched_process_fork, + tr); + unregister_trace_sched_process_free(event_filter_pid_sched_process_exit, + tr); + } +} + +static void +event_filter_pid_sched_switch_probe_pre(void *data, bool preempt, + struct task_struct *prev, + struct task_struct *next, + unsigned int prev_state) +{ + struct trace_array *tr = data; + struct trace_pid_list *no_pid_list; + struct trace_pid_list *pid_list; + bool ret; + + pid_list = rcu_dereference_sched(tr->filtered_pids); + no_pid_list = rcu_dereference_sched(tr->filtered_no_pids); + + /* + * Sched switch is funny, as we only want to ignore it + * in the notrace case if both prev and next should be ignored. 
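+ * For example, if only prev is listed in set_event_notrace_pid, the
+ * switch is still recorded, since next is a task that should be traced.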
+ */ + ret = trace_ignore_this_task(NULL, no_pid_list, prev) && + trace_ignore_this_task(NULL, no_pid_list, next); + + this_cpu_write(tr->array_buffer.data->ignore_pid, ret || + (trace_ignore_this_task(pid_list, NULL, prev) && + trace_ignore_this_task(pid_list, NULL, next))); +} + +static void +event_filter_pid_sched_switch_probe_post(void *data, bool preempt, + struct task_struct *prev, + struct task_struct *next, + unsigned int prev_state) +{ + struct trace_array *tr = data; + struct trace_pid_list *no_pid_list; + struct trace_pid_list *pid_list; + + pid_list = rcu_dereference_sched(tr->filtered_pids); + no_pid_list = rcu_dereference_sched(tr->filtered_no_pids); + + this_cpu_write(tr->array_buffer.data->ignore_pid, + trace_ignore_this_task(pid_list, no_pid_list, next)); +} + +static void +event_filter_pid_sched_wakeup_probe_pre(void *data, struct task_struct *task) +{ + struct trace_array *tr = data; + struct trace_pid_list *no_pid_list; + struct trace_pid_list *pid_list; + + /* Nothing to do if we are already tracing */ + if (!this_cpu_read(tr->array_buffer.data->ignore_pid)) + return; + + pid_list = rcu_dereference_sched(tr->filtered_pids); + no_pid_list = rcu_dereference_sched(tr->filtered_no_pids); + + this_cpu_write(tr->array_buffer.data->ignore_pid, + trace_ignore_this_task(pid_list, no_pid_list, task)); +} + +static void +event_filter_pid_sched_wakeup_probe_post(void *data, struct task_struct *task) +{ + struct trace_array *tr = data; + struct trace_pid_list *no_pid_list; + struct trace_pid_list *pid_list; + + /* Nothing to do if we are not tracing */ + if (this_cpu_read(tr->array_buffer.data->ignore_pid)) + return; + + pid_list = rcu_dereference_sched(tr->filtered_pids); + no_pid_list = rcu_dereference_sched(tr->filtered_no_pids); + + /* Set tracing if current is enabled */ + this_cpu_write(tr->array_buffer.data->ignore_pid, + trace_ignore_this_task(pid_list, no_pid_list, current)); +} + +static void unregister_pid_events(struct trace_array *tr) +{ + unregister_trace_sched_switch(event_filter_pid_sched_switch_probe_pre, tr); + unregister_trace_sched_switch(event_filter_pid_sched_switch_probe_post, tr); + + unregister_trace_sched_wakeup(event_filter_pid_sched_wakeup_probe_pre, tr); + unregister_trace_sched_wakeup(event_filter_pid_sched_wakeup_probe_post, tr); + + unregister_trace_sched_wakeup_new(event_filter_pid_sched_wakeup_probe_pre, tr); + unregister_trace_sched_wakeup_new(event_filter_pid_sched_wakeup_probe_post, tr); + + unregister_trace_sched_waking(event_filter_pid_sched_wakeup_probe_pre, tr); + unregister_trace_sched_waking(event_filter_pid_sched_wakeup_probe_post, tr); +} + +static void __ftrace_clear_event_pids(struct trace_array *tr, int type) +{ + struct trace_pid_list *pid_list; + struct trace_pid_list *no_pid_list; + struct trace_event_file *file; + int cpu; + + pid_list = rcu_dereference_protected(tr->filtered_pids, + lockdep_is_held(&event_mutex)); + no_pid_list = rcu_dereference_protected(tr->filtered_no_pids, + lockdep_is_held(&event_mutex)); + + /* Make sure there's something to do */ + if (!pid_type_enabled(type, pid_list, no_pid_list)) + return; + + if (!still_need_pid_events(type, pid_list, no_pid_list)) { + unregister_pid_events(tr); + + list_for_each_entry(file, &tr->events, list) { + clear_bit(EVENT_FILE_FL_PID_FILTER_BIT, &file->flags); + } + + for_each_possible_cpu(cpu) + per_cpu_ptr(tr->array_buffer.data, cpu)->ignore_pid = false; + } + + if (type & TRACE_PIDS) + rcu_assign_pointer(tr->filtered_pids, NULL); + + if (type & TRACE_NO_PIDS) + 
rcu_assign_pointer(tr->filtered_no_pids, NULL); + + /* Wait till all users are no longer using pid filtering */ + tracepoint_synchronize_unregister(); + + if ((type & TRACE_PIDS) && pid_list) + trace_pid_list_free(pid_list); + + if ((type & TRACE_NO_PIDS) && no_pid_list) + trace_pid_list_free(no_pid_list); +} + +static void ftrace_clear_event_pids(struct trace_array *tr, int type) +{ + mutex_lock(&event_mutex); + __ftrace_clear_event_pids(tr, type); + mutex_unlock(&event_mutex); +} + +static void __put_system(struct event_subsystem *system) +{ + struct event_filter *filter = system->filter; + + WARN_ON_ONCE(system_refcount(system) == 0); + if (system_refcount_dec(system)) + return; + + list_del(&system->list); + + if (filter) { + kfree(filter->filter_string); + kfree(filter); + } + kfree_const(system->name); + kfree(system); +} + +static void __get_system(struct event_subsystem *system) +{ + WARN_ON_ONCE(system_refcount(system) == 0); + system_refcount_inc(system); +} + +static void __get_system_dir(struct trace_subsystem_dir *dir) +{ + WARN_ON_ONCE(dir->ref_count == 0); + dir->ref_count++; + __get_system(dir->subsystem); +} + +static void __put_system_dir(struct trace_subsystem_dir *dir) +{ + WARN_ON_ONCE(dir->ref_count == 0); + /* If the subsystem is about to be freed, the dir must be too */ + WARN_ON_ONCE(system_refcount(dir->subsystem) == 1 && dir->ref_count != 1); + + __put_system(dir->subsystem); + if (!--dir->ref_count) + kfree(dir); +} + +static void put_system(struct trace_subsystem_dir *dir) +{ + mutex_lock(&event_mutex); + __put_system_dir(dir); + mutex_unlock(&event_mutex); +} + +static void remove_subsystem(struct trace_subsystem_dir *dir) +{ + if (!dir) + return; + + if (!--dir->nr_events) { + eventfs_remove(dir->ef); + list_del(&dir->list); + __put_system_dir(dir); + } +} + +void event_file_get(struct trace_event_file *file) +{ + atomic_inc(&file->ref); +} + +void event_file_put(struct trace_event_file *file) +{ + if (WARN_ON_ONCE(!atomic_read(&file->ref))) { + if (file->flags & EVENT_FILE_FL_FREED) + kmem_cache_free(file_cachep, file); + return; + } + + if (atomic_dec_and_test(&file->ref)) { + /* Count should only go to zero when it is freed */ + if (WARN_ON_ONCE(!(file->flags & EVENT_FILE_FL_FREED))) + return; + kmem_cache_free(file_cachep, file); + } +} + +static void remove_event_file_dir(struct trace_event_file *file) +{ + eventfs_remove(file->ef); + list_del(&file->list); + remove_subsystem(file->system); + free_event_filter(file->filter); + file->flags |= EVENT_FILE_FL_FREED; + event_file_put(file); +} + +/* + * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events. + */ +static int +__ftrace_set_clr_event_nolock(struct trace_array *tr, const char *match, + const char *sub, const char *event, int set) +{ + struct trace_event_file *file; + struct trace_event_call *call; + const char *name; + int ret = -EINVAL; + int eret = 0; + + list_for_each_entry(file, &tr->events, list) { + + call = file->event_call; + name = trace_event_name(call); + + if (!name || !call->class || !call->class->reg) + continue; + + if (call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) + continue; + + if (match && + strcmp(match, name) != 0 && + strcmp(match, call->class->system) != 0) + continue; + + if (sub && strcmp(sub, call->class->system) != 0) + continue; + + if (event && strcmp(event, name) != 0) + continue; + + ret = ftrace_event_enable_disable(file, set); + + /* + * Save the first error and return that. 
Some events + * may still have been enabled, but let the user + * know that something went wrong. + */ + if (ret && !eret) + eret = ret; + + ret = eret; + } + + return ret; +} + +static int __ftrace_set_clr_event(struct trace_array *tr, const char *match, + const char *sub, const char *event, int set) +{ + int ret; + + mutex_lock(&event_mutex); + ret = __ftrace_set_clr_event_nolock(tr, match, sub, event, set); + mutex_unlock(&event_mutex); + + return ret; +} + +int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set) +{ + char *event = NULL, *sub = NULL, *match; + int ret; + + if (!tr) + return -ENOENT; + /* + * The buf format can be <subsystem>:<event-name> + * *:<event-name> means any event by that name. + * :<event-name> is the same. + * + * <subsystem>:* means all events in that subsystem + * <subsystem>: means the same. + * + * <name> (no ':') means all events in a subsystem with + * the name <name> or any event that matches <name> + */ + + match = strsep(&buf, ":"); + if (buf) { + sub = match; + event = buf; + match = NULL; + + if (!strlen(sub) || strcmp(sub, "*") == 0) + sub = NULL; + if (!strlen(event) || strcmp(event, "*") == 0) + event = NULL; + } + + ret = __ftrace_set_clr_event(tr, match, sub, event, set); + + /* Put back the colon to allow this to be called again */ + if (buf) + *(buf - 1) = ':'; + + return ret; +} + +/** + * trace_set_clr_event - enable or disable an event + * @system: system name to match (NULL for any system) + * @event: event name to match (NULL for all events, within system) + * @set: 1 to enable, 0 to disable + * + * This is a way for other parts of the kernel to enable or disable + * event recording. + * + * Returns 0 on success, -EINVAL if the parameters do not match any + * registered events. + */ +int trace_set_clr_event(const char *system, const char *event, int set) +{ + struct trace_array *tr = top_trace_array(); + + if (!tr) + return -ENODEV; + + return __ftrace_set_clr_event(tr, NULL, system, event, set); +} +EXPORT_SYMBOL_GPL(trace_set_clr_event); + +/** + * trace_array_set_clr_event - enable or disable an event for a trace array. + * @tr: concerned trace array. + * @system: system name to match (NULL for any system) + * @event: event name to match (NULL for all events, within system) + * @enable: true to enable, false to disable + * + * This is a way for other parts of the kernel to enable or disable + * event recording. + * + * Returns 0 on success, -EINVAL if the parameters do not match any + * registered events. + */ +int trace_array_set_clr_event(struct trace_array *tr, const char *system, + const char *event, bool enable) +{ + int set; + + if (!tr) + return -ENOENT; + + set = (enable == true) ? 
1 : 0; + return __ftrace_set_clr_event(tr, NULL, system, event, set); +} +EXPORT_SYMBOL_GPL(trace_array_set_clr_event); + +/* 128 should be much more than enough */ +#define EVENT_BUF_SIZE 127 + +static ssize_t +ftrace_event_write(struct file *file, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_parser parser; + struct seq_file *m = file->private_data; + struct trace_array *tr = m->private; + ssize_t read, ret; + + if (!cnt) + return 0; + + ret = tracing_update_buffers(); + if (ret < 0) + return ret; + + if (trace_parser_get_init(&parser, EVENT_BUF_SIZE + 1)) + return -ENOMEM; + + read = trace_get_user(&parser, ubuf, cnt, ppos); + + if (read >= 0 && trace_parser_loaded((&parser))) { + int set = 1; + + if (*parser.buffer == '!') + set = 0; + + ret = ftrace_set_clr_event(tr, parser.buffer + !set, set); + if (ret) + goto out_put; + } + + ret = read; + + out_put: + trace_parser_put(&parser); + + return ret; +} + +static void * +t_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct trace_event_file *file = v; + struct trace_event_call *call; + struct trace_array *tr = m->private; + + (*pos)++; + + list_for_each_entry_continue(file, &tr->events, list) { + call = file->event_call; + /* + * The ftrace subsystem is for showing formats only. + * They can not be enabled or disabled via the event files. + */ + if (call->class && call->class->reg && + !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) + return file; + } + + return NULL; +} + +static void *t_start(struct seq_file *m, loff_t *pos) +{ + struct trace_event_file *file; + struct trace_array *tr = m->private; + loff_t l; + + mutex_lock(&event_mutex); + + file = list_entry(&tr->events, struct trace_event_file, list); + for (l = 0; l <= *pos; ) { + file = t_next(m, file, &l); + if (!file) + break; + } + return file; +} + +static void * +s_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct trace_event_file *file = v; + struct trace_array *tr = m->private; + + (*pos)++; + + list_for_each_entry_continue(file, &tr->events, list) { + if (file->flags & EVENT_FILE_FL_ENABLED) + return file; + } + + return NULL; +} + +static void *s_start(struct seq_file *m, loff_t *pos) +{ + struct trace_event_file *file; + struct trace_array *tr = m->private; + loff_t l; + + mutex_lock(&event_mutex); + + file = list_entry(&tr->events, struct trace_event_file, list); + for (l = 0; l <= *pos; ) { + file = s_next(m, file, &l); + if (!file) + break; + } + return file; +} + +static int t_show(struct seq_file *m, void *v) +{ + struct trace_event_file *file = v; + struct trace_event_call *call = file->event_call; + + if (strcmp(call->class->system, TRACE_SYSTEM) != 0) + seq_printf(m, "%s:", call->class->system); + seq_printf(m, "%s\n", trace_event_name(call)); + + return 0; +} + +static void t_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&event_mutex); +} + +static void * +__next(struct seq_file *m, void *v, loff_t *pos, int type) +{ + struct trace_array *tr = m->private; + struct trace_pid_list *pid_list; + + if (type == TRACE_PIDS) + pid_list = rcu_dereference_sched(tr->filtered_pids); + else + pid_list = rcu_dereference_sched(tr->filtered_no_pids); + + return trace_pid_next(pid_list, v, pos); +} + +static void * +p_next(struct seq_file *m, void *v, loff_t *pos) +{ + return __next(m, v, pos, TRACE_PIDS); +} + +static void * +np_next(struct seq_file *m, void *v, loff_t *pos) +{ + return __next(m, v, pos, TRACE_NO_PIDS); +} + +static void *__start(struct seq_file *m, loff_t *pos, int type) + __acquires(RCU) +{ + struct 
trace_pid_list *pid_list; + struct trace_array *tr = m->private; + + /* + * Grab the mutex, to keep calls to p_next() having the same + * tr->filtered_pids as p_start() has. + * If we just passed the tr->filtered_pids around, then RCU would + * have been enough, but doing that makes things more complex. + */ + mutex_lock(&event_mutex); + rcu_read_lock_sched(); + + if (type == TRACE_PIDS) + pid_list = rcu_dereference_sched(tr->filtered_pids); + else + pid_list = rcu_dereference_sched(tr->filtered_no_pids); + + if (!pid_list) + return NULL; + + return trace_pid_start(pid_list, pos); +} + +static void *p_start(struct seq_file *m, loff_t *pos) + __acquires(RCU) +{ + return __start(m, pos, TRACE_PIDS); +} + +static void *np_start(struct seq_file *m, loff_t *pos) + __acquires(RCU) +{ + return __start(m, pos, TRACE_NO_PIDS); +} + +static void p_stop(struct seq_file *m, void *p) + __releases(RCU) +{ + rcu_read_unlock_sched(); + mutex_unlock(&event_mutex); +} + +static ssize_t +event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct trace_event_file *file; + unsigned long flags; + char buf[4] = "0"; + + mutex_lock(&event_mutex); + file = event_file_data(filp); + if (likely(file)) + flags = file->flags; + mutex_unlock(&event_mutex); + + if (!file || flags & EVENT_FILE_FL_FREED) + return -ENODEV; + + if (flags & EVENT_FILE_FL_ENABLED && + !(flags & EVENT_FILE_FL_SOFT_DISABLED)) + strcpy(buf, "1"); + + if (flags & EVENT_FILE_FL_SOFT_DISABLED || + flags & EVENT_FILE_FL_SOFT_MODE) + strcat(buf, "*"); + + strcat(buf, "\n"); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, strlen(buf)); +} + +static ssize_t +event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct trace_event_file *file; + unsigned long val; + int ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + ret = tracing_update_buffers(); + if (ret < 0) + return ret; + + switch (val) { + case 0: + case 1: + ret = -ENODEV; + mutex_lock(&event_mutex); + file = event_file_data(filp); + if (likely(file && !(file->flags & EVENT_FILE_FL_FREED))) + ret = ftrace_event_enable_disable(file, val); + mutex_unlock(&event_mutex); + break; + + default: + return -EINVAL; + } + + *ppos += cnt; + + return ret ? ret : cnt; +} + +static ssize_t +system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + const char set_to_char[4] = { '?', '0', '1', 'X' }; + struct trace_subsystem_dir *dir = filp->private_data; + struct event_subsystem *system = dir->subsystem; + struct trace_event_call *call; + struct trace_event_file *file; + struct trace_array *tr = dir->tr; + char buf[2]; + int set = 0; + int ret; + + mutex_lock(&event_mutex); + list_for_each_entry(file, &tr->events, list) { + call = file->event_call; + if ((call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) || + !trace_event_name(call) || !call->class || !call->class->reg) + continue; + + if (system && strcmp(call->class->system, system->name) != 0) + continue; + + /* + * We need to find out if all the events are set + * or if all events or cleared, or if we have + * a mixture. + */ + set |= (1 << !!(file->flags & EVENT_FILE_FL_ENABLED)); + + /* + * If we have a mixture, no need to look further. 
+ */ + if (set == 3) + break; + } + mutex_unlock(&event_mutex); + + buf[0] = set_to_char[set]; + buf[1] = '\n'; + + ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, 2); + + return ret; +} + +static ssize_t +system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct trace_subsystem_dir *dir = filp->private_data; + struct event_subsystem *system = dir->subsystem; + const char *name = NULL; + unsigned long val; + ssize_t ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + ret = tracing_update_buffers(); + if (ret < 0) + return ret; + + if (val != 0 && val != 1) + return -EINVAL; + + /* + * Opening of "enable" adds a ref count to system, + * so the name is safe to use. + */ + if (system) + name = system->name; + + ret = __ftrace_set_clr_event(dir->tr, NULL, name, NULL, val); + if (ret) + goto out; + + ret = cnt; + +out: + *ppos += cnt; + + return ret; +} + +enum { + FORMAT_HEADER = 1, + FORMAT_FIELD_SEPERATOR = 2, + FORMAT_PRINTFMT = 3, +}; + +static void *f_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct trace_event_call *call = event_file_data(m->private); + struct list_head *common_head = &ftrace_common_fields; + struct list_head *head = trace_get_fields(call); + struct list_head *node = v; + + (*pos)++; + + switch ((unsigned long)v) { + case FORMAT_HEADER: + node = common_head; + break; + + case FORMAT_FIELD_SEPERATOR: + node = head; + break; + + case FORMAT_PRINTFMT: + /* all done */ + return NULL; + } + + node = node->prev; + if (node == common_head) + return (void *)FORMAT_FIELD_SEPERATOR; + else if (node == head) + return (void *)FORMAT_PRINTFMT; + else + return node; +} + +static int f_show(struct seq_file *m, void *v) +{ + struct trace_event_call *call = event_file_data(m->private); + struct ftrace_event_field *field; + const char *array_descriptor; + + switch ((unsigned long)v) { + case FORMAT_HEADER: + seq_printf(m, "name: %s\n", trace_event_name(call)); + seq_printf(m, "ID: %d\n", call->event.type); + seq_puts(m, "format:\n"); + return 0; + + case FORMAT_FIELD_SEPERATOR: + seq_putc(m, '\n'); + return 0; + + case FORMAT_PRINTFMT: + seq_printf(m, "\nprint fmt: %s\n", + call->print_fmt); + return 0; + } + + field = list_entry(v, struct ftrace_event_field, link); + /* + * Smartly shows the array type(except dynamic array). 
+ * Normal: + * field:TYPE VAR + * If TYPE := TYPE[LEN], it is shown: + * field:TYPE VAR[LEN] + */ + array_descriptor = strchr(field->type, '['); + + if (str_has_prefix(field->type, "__data_loc")) + array_descriptor = NULL; + + if (!array_descriptor) + seq_printf(m, "\tfield:%s %s;\toffset:%u;\tsize:%u;\tsigned:%d;\n", + field->type, field->name, field->offset, + field->size, !!field->is_signed); + else if (field->len) + seq_printf(m, "\tfield:%.*s %s[%d];\toffset:%u;\tsize:%u;\tsigned:%d;\n", + (int)(array_descriptor - field->type), + field->type, field->name, + field->len, field->offset, + field->size, !!field->is_signed); + else + seq_printf(m, "\tfield:%.*s %s[];\toffset:%u;\tsize:%u;\tsigned:%d;\n", + (int)(array_descriptor - field->type), + field->type, field->name, + field->offset, field->size, !!field->is_signed); + + return 0; +} + +static void *f_start(struct seq_file *m, loff_t *pos) +{ + void *p = (void *)FORMAT_HEADER; + loff_t l = 0; + + /* ->stop() is called even if ->start() fails */ + mutex_lock(&event_mutex); + if (!event_file_data(m->private)) + return ERR_PTR(-ENODEV); + + while (l < *pos && p) + p = f_next(m, p, &l); + + return p; +} + +static void f_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&event_mutex); +} + +static const struct seq_operations trace_format_seq_ops = { + .start = f_start, + .next = f_next, + .stop = f_stop, + .show = f_show, +}; + +static int trace_format_open(struct inode *inode, struct file *file) +{ + struct seq_file *m; + int ret; + + /* Do we want to hide event format files on tracefs lockdown? */ + + ret = seq_open(file, &trace_format_seq_ops); + if (ret < 0) + return ret; + + m = file->private_data; + m->private = file; + + return 0; +} + +static ssize_t +event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) +{ + int id = (long)event_file_data(filp); + char buf[32]; + int len; + + if (unlikely(!id)) + return -ENODEV; + + len = sprintf(buf, "%d\n", id); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, len); +} + +static ssize_t +event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct trace_event_file *file; + struct trace_seq *s; + int r = -ENODEV; + + if (*ppos) + return 0; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + + if (!s) + return -ENOMEM; + + trace_seq_init(s); + + mutex_lock(&event_mutex); + file = event_file_data(filp); + if (file && !(file->flags & EVENT_FILE_FL_FREED)) + print_event_filter(file, s); + mutex_unlock(&event_mutex); + + if (file) + r = simple_read_from_buffer(ubuf, cnt, ppos, + s->buffer, trace_seq_used(s)); + + kfree(s); + + return r; +} + +static ssize_t +event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct trace_event_file *file; + char *buf; + int err = -ENODEV; + + if (cnt >= PAGE_SIZE) + return -EINVAL; + + buf = memdup_user_nul(ubuf, cnt); + if (IS_ERR(buf)) + return PTR_ERR(buf); + + mutex_lock(&event_mutex); + file = event_file_data(filp); + if (file) + err = apply_event_filter(file, buf); + mutex_unlock(&event_mutex); + + kfree(buf); + if (err < 0) + return err; + + *ppos += cnt; + + return cnt; +} + +static LIST_HEAD(event_subsystems); + +static int subsystem_open(struct inode *inode, struct file *filp) +{ + struct trace_subsystem_dir *dir = NULL, *iter_dir; + struct trace_array *tr = NULL, *iter_tr; + struct event_subsystem *system = NULL; + int ret; + + if (tracing_is_disabled()) + return -ENODEV; + + /* Make sure the system still exists */ + mutex_lock(&event_mutex); + 
mutex_lock(&trace_types_lock); + list_for_each_entry(iter_tr, &ftrace_trace_arrays, list) { + list_for_each_entry(iter_dir, &iter_tr->systems, list) { + if (iter_dir == inode->i_private) { + /* Don't open systems with no events */ + tr = iter_tr; + dir = iter_dir; + if (dir->nr_events) { + __get_system_dir(dir); + system = dir->subsystem; + } + goto exit_loop; + } + } + } + exit_loop: + mutex_unlock(&trace_types_lock); + mutex_unlock(&event_mutex); + + if (!system) + return -ENODEV; + + /* Still need to increment the ref count of the system */ + if (trace_array_get(tr) < 0) { + put_system(dir); + return -ENODEV; + } + + ret = tracing_open_generic(inode, filp); + if (ret < 0) { + trace_array_put(tr); + put_system(dir); + } + + return ret; +} + +static int system_tr_open(struct inode *inode, struct file *filp) +{ + struct trace_subsystem_dir *dir; + struct trace_array *tr = inode->i_private; + int ret; + + /* Make a temporary dir that has no system but points to tr */ + dir = kzalloc(sizeof(*dir), GFP_KERNEL); + if (!dir) + return -ENOMEM; + + ret = tracing_open_generic_tr(inode, filp); + if (ret < 0) { + kfree(dir); + return ret; + } + dir->tr = tr; + filp->private_data = dir; + + return 0; +} + +static int subsystem_release(struct inode *inode, struct file *file) +{ + struct trace_subsystem_dir *dir = file->private_data; + + trace_array_put(dir->tr); + + /* + * If dir->subsystem is NULL, then this is a temporary + * descriptor that was made for a trace_array to enable + * all subsystems. + */ + if (dir->subsystem) + put_system(dir); + else + kfree(dir); + + return 0; +} + +static ssize_t +subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct trace_subsystem_dir *dir = filp->private_data; + struct event_subsystem *system = dir->subsystem; + struct trace_seq *s; + int r; + + if (*ppos) + return 0; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + trace_seq_init(s); + + print_subsystem_event_filter(system, s); + r = simple_read_from_buffer(ubuf, cnt, ppos, + s->buffer, trace_seq_used(s)); + + kfree(s); + + return r; +} + +static ssize_t +subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct trace_subsystem_dir *dir = filp->private_data; + char *buf; + int err; + + if (cnt >= PAGE_SIZE) + return -EINVAL; + + buf = memdup_user_nul(ubuf, cnt); + if (IS_ERR(buf)) + return PTR_ERR(buf); + + err = apply_subsystem_event_filter(dir, buf); + kfree(buf); + if (err < 0) + return err; + + *ppos += cnt; + + return cnt; +} + +static ssize_t +show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) +{ + int (*func)(struct trace_seq *s) = filp->private_data; + struct trace_seq *s; + int r; + + if (*ppos) + return 0; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + trace_seq_init(s); + + func(s); + r = simple_read_from_buffer(ubuf, cnt, ppos, + s->buffer, trace_seq_used(s)); + + kfree(s); + + return r; +} + +static void ignore_task_cpu(void *data) +{ + struct trace_array *tr = data; + struct trace_pid_list *pid_list; + struct trace_pid_list *no_pid_list; + + /* + * This function is called by on_each_cpu() while the + * event_mutex is held. 
+ */ + pid_list = rcu_dereference_protected(tr->filtered_pids, + mutex_is_locked(&event_mutex)); + no_pid_list = rcu_dereference_protected(tr->filtered_no_pids, + mutex_is_locked(&event_mutex)); + + this_cpu_write(tr->array_buffer.data->ignore_pid, + trace_ignore_this_task(pid_list, no_pid_list, current)); +} + +static void register_pid_events(struct trace_array *tr) +{ + /* + * Register a probe that is called before all other probes + * to set ignore_pid if next or prev do not match. + * Register a probe this is called after all other probes + * to only keep ignore_pid set if next pid matches. + */ + register_trace_prio_sched_switch(event_filter_pid_sched_switch_probe_pre, + tr, INT_MAX); + register_trace_prio_sched_switch(event_filter_pid_sched_switch_probe_post, + tr, 0); + + register_trace_prio_sched_wakeup(event_filter_pid_sched_wakeup_probe_pre, + tr, INT_MAX); + register_trace_prio_sched_wakeup(event_filter_pid_sched_wakeup_probe_post, + tr, 0); + + register_trace_prio_sched_wakeup_new(event_filter_pid_sched_wakeup_probe_pre, + tr, INT_MAX); + register_trace_prio_sched_wakeup_new(event_filter_pid_sched_wakeup_probe_post, + tr, 0); + + register_trace_prio_sched_waking(event_filter_pid_sched_wakeup_probe_pre, + tr, INT_MAX); + register_trace_prio_sched_waking(event_filter_pid_sched_wakeup_probe_post, + tr, 0); +} + +static ssize_t +event_pid_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos, int type) +{ + struct seq_file *m = filp->private_data; + struct trace_array *tr = m->private; + struct trace_pid_list *filtered_pids = NULL; + struct trace_pid_list *other_pids = NULL; + struct trace_pid_list *pid_list; + struct trace_event_file *file; + ssize_t ret; + + if (!cnt) + return 0; + + ret = tracing_update_buffers(); + if (ret < 0) + return ret; + + mutex_lock(&event_mutex); + + if (type == TRACE_PIDS) { + filtered_pids = rcu_dereference_protected(tr->filtered_pids, + lockdep_is_held(&event_mutex)); + other_pids = rcu_dereference_protected(tr->filtered_no_pids, + lockdep_is_held(&event_mutex)); + } else { + filtered_pids = rcu_dereference_protected(tr->filtered_no_pids, + lockdep_is_held(&event_mutex)); + other_pids = rcu_dereference_protected(tr->filtered_pids, + lockdep_is_held(&event_mutex)); + } + + ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt); + if (ret < 0) + goto out; + + if (type == TRACE_PIDS) + rcu_assign_pointer(tr->filtered_pids, pid_list); + else + rcu_assign_pointer(tr->filtered_no_pids, pid_list); + + list_for_each_entry(file, &tr->events, list) { + set_bit(EVENT_FILE_FL_PID_FILTER_BIT, &file->flags); + } + + if (filtered_pids) { + tracepoint_synchronize_unregister(); + trace_pid_list_free(filtered_pids); + } else if (pid_list && !other_pids) { + register_pid_events(tr); + } + + /* + * Ignoring of pids is done at task switch. But we have to + * check for those tasks that are currently running. + * Always do this in case a pid was appended or removed. 
+ */ + on_each_cpu(ignore_task_cpu, tr, 1); + + out: + mutex_unlock(&event_mutex); + + if (ret > 0) + *ppos += ret; + + return ret; +} + +static ssize_t +ftrace_event_pid_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return event_pid_write(filp, ubuf, cnt, ppos, TRACE_PIDS); +} + +static ssize_t +ftrace_event_npid_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return event_pid_write(filp, ubuf, cnt, ppos, TRACE_NO_PIDS); +} + +static int ftrace_event_avail_open(struct inode *inode, struct file *file); +static int ftrace_event_set_open(struct inode *inode, struct file *file); +static int ftrace_event_set_pid_open(struct inode *inode, struct file *file); +static int ftrace_event_set_npid_open(struct inode *inode, struct file *file); +static int ftrace_event_release(struct inode *inode, struct file *file); + +static const struct seq_operations show_event_seq_ops = { + .start = t_start, + .next = t_next, + .show = t_show, + .stop = t_stop, +}; + +static const struct seq_operations show_set_event_seq_ops = { + .start = s_start, + .next = s_next, + .show = t_show, + .stop = t_stop, +}; + +static const struct seq_operations show_set_pid_seq_ops = { + .start = p_start, + .next = p_next, + .show = trace_pid_show, + .stop = p_stop, +}; + +static const struct seq_operations show_set_no_pid_seq_ops = { + .start = np_start, + .next = np_next, + .show = trace_pid_show, + .stop = p_stop, +}; + +static const struct file_operations ftrace_avail_fops = { + .open = ftrace_event_avail_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static const struct file_operations ftrace_set_event_fops = { + .open = ftrace_event_set_open, + .read = seq_read, + .write = ftrace_event_write, + .llseek = seq_lseek, + .release = ftrace_event_release, +}; + +static const struct file_operations ftrace_set_event_pid_fops = { + .open = ftrace_event_set_pid_open, + .read = seq_read, + .write = ftrace_event_pid_write, + .llseek = seq_lseek, + .release = ftrace_event_release, +}; + +static const struct file_operations ftrace_set_event_notrace_pid_fops = { + .open = ftrace_event_set_npid_open, + .read = seq_read, + .write = ftrace_event_npid_write, + .llseek = seq_lseek, + .release = ftrace_event_release, +}; + +static const struct file_operations ftrace_enable_fops = { + .open = tracing_open_file_tr, + .read = event_enable_read, + .write = event_enable_write, + .release = tracing_release_file_tr, + .llseek = default_llseek, +}; + +static const struct file_operations ftrace_event_format_fops = { + .open = trace_format_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static const struct file_operations ftrace_event_id_fops = { + .read = event_id_read, + .llseek = default_llseek, +}; + +static const struct file_operations ftrace_event_filter_fops = { + .open = tracing_open_file_tr, + .read = event_filter_read, + .write = event_filter_write, + .release = tracing_release_file_tr, + .llseek = default_llseek, +}; + +static const struct file_operations ftrace_subsystem_filter_fops = { + .open = subsystem_open, + .read = subsystem_filter_read, + .write = subsystem_filter_write, + .llseek = default_llseek, + .release = subsystem_release, +}; + +static const struct file_operations ftrace_system_enable_fops = { + .open = subsystem_open, + .read = system_enable_read, + .write = system_enable_write, + .llseek = default_llseek, + .release = subsystem_release, +}; + +static const struct file_operations 
ftrace_tr_enable_fops = { + .open = system_tr_open, + .read = system_enable_read, + .write = system_enable_write, + .llseek = default_llseek, + .release = subsystem_release, +}; + +static const struct file_operations ftrace_show_header_fops = { + .open = tracing_open_generic, + .read = show_header, + .llseek = default_llseek, +}; + +static int +ftrace_event_open(struct inode *inode, struct file *file, + const struct seq_operations *seq_ops) +{ + struct seq_file *m; + int ret; + + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + + ret = seq_open(file, seq_ops); + if (ret < 0) + return ret; + m = file->private_data; + /* copy tr over to seq ops */ + m->private = inode->i_private; + + return ret; +} + +static int ftrace_event_release(struct inode *inode, struct file *file) +{ + struct trace_array *tr = inode->i_private; + + trace_array_put(tr); + + return seq_release(inode, file); +} + +static int +ftrace_event_avail_open(struct inode *inode, struct file *file) +{ + const struct seq_operations *seq_ops = &show_event_seq_ops; + + /* Checks for tracefs lockdown */ + return ftrace_event_open(inode, file, seq_ops); +} + +static int +ftrace_event_set_open(struct inode *inode, struct file *file) +{ + const struct seq_operations *seq_ops = &show_set_event_seq_ops; + struct trace_array *tr = inode->i_private; + int ret; + + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; + + if ((file->f_mode & FMODE_WRITE) && + (file->f_flags & O_TRUNC)) + ftrace_clear_events(tr); + + ret = ftrace_event_open(inode, file, seq_ops); + if (ret < 0) + trace_array_put(tr); + return ret; +} + +static int +ftrace_event_set_pid_open(struct inode *inode, struct file *file) +{ + const struct seq_operations *seq_ops = &show_set_pid_seq_ops; + struct trace_array *tr = inode->i_private; + int ret; + + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; + + if ((file->f_mode & FMODE_WRITE) && + (file->f_flags & O_TRUNC)) + ftrace_clear_event_pids(tr, TRACE_PIDS); + + ret = ftrace_event_open(inode, file, seq_ops); + if (ret < 0) + trace_array_put(tr); + return ret; +} + +static int +ftrace_event_set_npid_open(struct inode *inode, struct file *file) +{ + const struct seq_operations *seq_ops = &show_set_no_pid_seq_ops; + struct trace_array *tr = inode->i_private; + int ret; + + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; + + if ((file->f_mode & FMODE_WRITE) && + (file->f_flags & O_TRUNC)) + ftrace_clear_event_pids(tr, TRACE_NO_PIDS); + + ret = ftrace_event_open(inode, file, seq_ops); + if (ret < 0) + trace_array_put(tr); + return ret; +} + +static struct event_subsystem * +create_new_subsystem(const char *name) +{ + struct event_subsystem *system; + + /* need to create new entry */ + system = kmalloc(sizeof(*system), GFP_KERNEL); + if (!system) + return NULL; + + system->ref_count = 1; + + /* Only allocate if dynamic (kprobes and modules) */ + system->name = kstrdup_const(name, GFP_KERNEL); + if (!system->name) + goto out_free; + + system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL); + if (!system->filter) + goto out_free; + + list_add(&system->list, &event_subsystems); + + return system; + + out_free: + kfree_const(system->name); + kfree(system); + return NULL; +} + +static struct eventfs_file * +event_subsystem_dir(struct trace_array *tr, const char *name, + struct trace_event_file *file, struct dentry *parent) +{ + struct event_subsystem *system, *iter; + struct trace_subsystem_dir *dir; + struct eventfs_file *ef; + int res; + + /* First see if we did 
not already create this dir */ + list_for_each_entry(dir, &tr->systems, list) { + system = dir->subsystem; + if (strcmp(system->name, name) == 0) { + dir->nr_events++; + file->system = dir; + return dir->ef; + } + } + + /* Now see if the system itself exists. */ + system = NULL; + list_for_each_entry(iter, &event_subsystems, list) { + if (strcmp(iter->name, name) == 0) { + system = iter; + break; + } + } + + dir = kmalloc(sizeof(*dir), GFP_KERNEL); + if (!dir) + goto out_fail; + + if (!system) { + system = create_new_subsystem(name); + if (!system) + goto out_free; + } else + __get_system(system); + + ef = eventfs_add_subsystem_dir(name, parent); + if (IS_ERR(ef)) { + pr_warn("Failed to create system directory %s\n", name); + __put_system(system); + goto out_free; + } + + dir->ef = ef; + dir->tr = tr; + dir->ref_count = 1; + dir->nr_events = 1; + dir->subsystem = system; + file->system = dir; + + /* the ftrace system is special, do not create enable or filter files */ + if (strcmp(name, "ftrace") != 0) { + + res = eventfs_add_file("filter", TRACE_MODE_WRITE, + dir->ef, dir, + &ftrace_subsystem_filter_fops); + if (res) { + kfree(system->filter); + system->filter = NULL; + pr_warn("Could not create tracefs '%s/filter' entry\n", name); + } + + eventfs_add_file("enable", TRACE_MODE_WRITE, dir->ef, dir, + &ftrace_system_enable_fops); + } + + list_add(&dir->list, &tr->systems); + + return dir->ef; + + out_free: + kfree(dir); + out_fail: + /* Only print this message if failed on memory allocation */ + if (!dir || !system) + pr_warn("No memory to create event subsystem %s\n", name); + return NULL; +} + +static int +event_define_fields(struct trace_event_call *call) +{ + struct list_head *head; + int ret = 0; + + /* + * Other events may have the same class. Only update + * the fields if they are not already defined. + */ + head = trace_get_fields(call); + if (list_empty(head)) { + struct trace_event_fields *field = call->class->fields_array; + unsigned int offset = sizeof(struct trace_entry); + + for (; field->type; field++) { + if (field->type == TRACE_FUNCTION_TYPE) { + field->define_fields(call); + break; + } + + offset = ALIGN(offset, field->align); + ret = trace_define_field_ext(call, field->type, field->name, + offset, field->size, + field->is_signed, field->filter_type, + field->len); + if (WARN_ON_ONCE(ret)) { + pr_err("error code is %d\n", ret); + break; + } + + offset += field->size; + } + } + + return ret; +} + +static int +event_create_dir(struct dentry *parent, struct trace_event_file *file) +{ + struct trace_event_call *call = file->event_call; + struct eventfs_file *ef_subsystem = NULL; + struct trace_array *tr = file->tr; + struct eventfs_file *ef; + const char *name; + int ret; + + /* + * If the trace point header did not define TRACE_SYSTEM + * then the system would be called "TRACE_SYSTEM". This should + * never happen. 
+ */ + if (WARN_ON_ONCE(strcmp(call->class->system, TRACE_SYSTEM) == 0)) + return -ENODEV; + + ef_subsystem = event_subsystem_dir(tr, call->class->system, file, parent); + if (!ef_subsystem) + return -ENOMEM; + + name = trace_event_name(call); + ef = eventfs_add_dir(name, ef_subsystem); + if (IS_ERR(ef)) { + pr_warn("Could not create tracefs '%s' directory\n", name); + return -1; + } + + file->ef = ef; + + if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) + eventfs_add_file("enable", TRACE_MODE_WRITE, file->ef, file, + &ftrace_enable_fops); + +#ifdef CONFIG_PERF_EVENTS + if (call->event.type && call->class->reg) + eventfs_add_file("id", TRACE_MODE_READ, file->ef, + (void *)(long)call->event.type, + &ftrace_event_id_fops); +#endif + + ret = event_define_fields(call); + if (ret < 0) { + pr_warn("Could not initialize trace point events/%s\n", name); + return ret; + } + + /* + * Only event directories that can be enabled should have + * triggers or filters. + */ + if (!(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) { + eventfs_add_file("filter", TRACE_MODE_WRITE, file->ef, + file, &ftrace_event_filter_fops); + + eventfs_add_file("trigger", TRACE_MODE_WRITE, file->ef, + file, &event_trigger_fops); + } + +#ifdef CONFIG_HIST_TRIGGERS + eventfs_add_file("hist", TRACE_MODE_READ, file->ef, file, + &event_hist_fops); +#endif +#ifdef CONFIG_HIST_TRIGGERS_DEBUG + eventfs_add_file("hist_debug", TRACE_MODE_READ, file->ef, file, + &event_hist_debug_fops); +#endif + eventfs_add_file("format", TRACE_MODE_READ, file->ef, call, + &ftrace_event_format_fops); + +#ifdef CONFIG_TRACE_EVENT_INJECT + if (call->event.type && call->class->reg) + eventfs_add_file("inject", 0200, file->ef, file, + &event_inject_fops); +#endif + + return 0; +} + +static void remove_event_from_tracers(struct trace_event_call *call) +{ + struct trace_event_file *file; + struct trace_array *tr; + + do_for_each_event_file_safe(tr, file) { + if (file->event_call != call) + continue; + + remove_event_file_dir(file); + /* + * The do_for_each_event_file_safe() is + * a double loop. After finding the call for this + * trace_array, we use break to jump to the next + * trace_array. + */ + break; + } while_for_each_event_file(); +} + +static void event_remove(struct trace_event_call *call) +{ + struct trace_array *tr; + struct trace_event_file *file; + + do_for_each_event_file(tr, file) { + if (file->event_call != call) + continue; + + if (file->flags & EVENT_FILE_FL_WAS_ENABLED) + tr->clear_trace = true; + + ftrace_event_enable_disable(file, 0); + /* + * The do_for_each_event_file() is + * a double loop. After finding the call for this + * trace_array, we use break to jump to the next + * trace_array. 
+ */ + break; + } while_for_each_event_file(); + + if (call->event.funcs) + __unregister_trace_event(&call->event); + remove_event_from_tracers(call); + list_del(&call->list); +} + +static int event_init(struct trace_event_call *call) +{ + int ret = 0; + const char *name; + + name = trace_event_name(call); + if (WARN_ON(!name)) + return -EINVAL; + + if (call->class->raw_init) { + ret = call->class->raw_init(call); + if (ret < 0 && ret != -ENOSYS) + pr_warn("Could not initialize trace events/%s\n", name); + } + + return ret; +} + +static int +__register_event(struct trace_event_call *call, struct module *mod) +{ + int ret; + + ret = event_init(call); + if (ret < 0) + return ret; + + list_add(&call->list, &ftrace_events); + if (call->flags & TRACE_EVENT_FL_DYNAMIC) + atomic_set(&call->refcnt, 0); + else + call->module = mod; + + return 0; +} + +static char *eval_replace(char *ptr, struct trace_eval_map *map, int len) +{ + int rlen; + int elen; + + /* Find the length of the eval value as a string */ + elen = snprintf(ptr, 0, "%ld", map->eval_value); + /* Make sure there's enough room to replace the string with the value */ + if (len < elen) + return NULL; + + snprintf(ptr, elen + 1, "%ld", map->eval_value); + + /* Get the rest of the string of ptr */ + rlen = strlen(ptr + len); + memmove(ptr + elen, ptr + len, rlen); + /* Make sure we end the new string */ + ptr[elen + rlen] = 0; + + return ptr + elen; +} + +static void update_event_printk(struct trace_event_call *call, + struct trace_eval_map *map) +{ + char *ptr; + int quote = 0; + int len = strlen(map->eval_string); + + for (ptr = call->print_fmt; *ptr; ptr++) { + if (*ptr == '\\') { + ptr++; + /* paranoid */ + if (!*ptr) + break; + continue; + } + if (*ptr == '"') { + quote ^= 1; + continue; + } + if (quote) + continue; + if (isdigit(*ptr)) { + /* skip numbers */ + do { + ptr++; + /* Check for alpha chars like ULL */ + } while (isalnum(*ptr)); + if (!*ptr) + break; + /* + * A number must have some kind of delimiter after + * it, and we can ignore that too. + */ + continue; + } + if (isalpha(*ptr) || *ptr == '_') { + if (strncmp(map->eval_string, ptr, len) == 0 && + !isalnum(ptr[len]) && ptr[len] != '_') { + ptr = eval_replace(ptr, map, len); + /* enum/sizeof string smaller than value */ + if (WARN_ON_ONCE(!ptr)) + return; + /* + * No need to decrement here, as eval_replace() + * returns the pointer to the character passed + * the eval, and two evals can not be placed + * back to back without something in between. + * We can skip that something in between. + */ + continue; + } + skip_more: + do { + ptr++; + } while (isalnum(*ptr) || *ptr == '_'); + if (!*ptr) + break; + /* + * If what comes after this variable is a '.' or + * '->' then we can continue to ignore that string. + */ + if (*ptr == '.' || (ptr[0] == '-' && ptr[1] == '>')) { + ptr += *ptr == '.' ? 1 : 2; + if (!*ptr) + break; + goto skip_more; + } + /* + * Once again, we can skip the delimiter that came + * after the string. + */ + continue; + } + } +} + +static void add_str_to_module(struct module *module, char *str) +{ + struct module_string *modstr; + + modstr = kmalloc(sizeof(*modstr), GFP_KERNEL); + + /* + * If we failed to allocate memory here, then we'll just + * let the str memory leak when the module is removed. + * If this fails to allocate, there's worse problems than + * a leaked string on module removal. 
+ */ + if (WARN_ON_ONCE(!modstr)) + return; + + modstr->module = module; + modstr->str = str; + + list_add(&modstr->next, &module_strings); +} + +static void update_event_fields(struct trace_event_call *call, + struct trace_eval_map *map) +{ + struct ftrace_event_field *field; + struct list_head *head; + char *ptr; + char *str; + int len = strlen(map->eval_string); + + /* Dynamic events should never have field maps */ + if (WARN_ON_ONCE(call->flags & TRACE_EVENT_FL_DYNAMIC)) + return; + + head = trace_get_fields(call); + list_for_each_entry(field, head, link) { + ptr = strchr(field->type, '['); + if (!ptr) + continue; + ptr++; + + if (!isalpha(*ptr) && *ptr != '_') + continue; + + if (strncmp(map->eval_string, ptr, len) != 0) + continue; + + str = kstrdup(field->type, GFP_KERNEL); + if (WARN_ON_ONCE(!str)) + return; + ptr = str + (ptr - field->type); + ptr = eval_replace(ptr, map, len); + /* enum/sizeof string smaller than value */ + if (WARN_ON_ONCE(!ptr)) { + kfree(str); + continue; + } + + /* + * If the event is part of a module, then we need to free the string + * when the module is removed. Otherwise, it will stay allocated + * until a reboot. + */ + if (call->module) + add_str_to_module(call->module, str); + + field->type = str; + } +} + +void trace_event_eval_update(struct trace_eval_map **map, int len) +{ + struct trace_event_call *call, *p; + const char *last_system = NULL; + bool first = false; + int last_i; + int i; + + down_write(&trace_event_sem); + list_for_each_entry_safe(call, p, &ftrace_events, list) { + /* events are usually grouped together with systems */ + if (!last_system || call->class->system != last_system) { + first = true; + last_i = 0; + last_system = call->class->system; + } + + /* + * Since calls are grouped by systems, the likelihood that the + * next call in the iteration belongs to the same system as the + * previous call is high. As an optimization, we skip searching + * for a map[] that matches the call's system if the last call + * was from the same system. That's what last_i is for. If the + * call has the same system as the previous call, then last_i + * will be the index of the first map[] that has a matching + * system. 
+ */ + for (i = last_i; i < len; i++) { + if (call->class->system == map[i]->system) { + /* Save the first system if need be */ + if (first) { + last_i = i; + first = false; + } + update_event_printk(call, map[i]); + update_event_fields(call, map[i]); + } + } + cond_resched(); + } + up_write(&trace_event_sem); +} + +static struct trace_event_file * +trace_create_new_event(struct trace_event_call *call, + struct trace_array *tr) +{ + struct trace_pid_list *no_pid_list; + struct trace_pid_list *pid_list; + struct trace_event_file *file; + unsigned int first; + + file = kmem_cache_alloc(file_cachep, GFP_TRACE); + if (!file) + return NULL; + + pid_list = rcu_dereference_protected(tr->filtered_pids, + lockdep_is_held(&event_mutex)); + no_pid_list = rcu_dereference_protected(tr->filtered_no_pids, + lockdep_is_held(&event_mutex)); + + if (!trace_pid_list_first(pid_list, &first) || + !trace_pid_list_first(no_pid_list, &first)) + file->flags |= EVENT_FILE_FL_PID_FILTER; + + file->event_call = call; + file->tr = tr; + atomic_set(&file->sm_ref, 0); + atomic_set(&file->tm_ref, 0); + INIT_LIST_HEAD(&file->triggers); + list_add(&file->list, &tr->events); + event_file_get(file); + + return file; +} + +#define MAX_BOOT_TRIGGERS 32 + +static struct boot_triggers { + const char *event; + char *trigger; +} bootup_triggers[MAX_BOOT_TRIGGERS]; + +static char bootup_trigger_buf[COMMAND_LINE_SIZE]; +static int nr_boot_triggers; + +static __init int setup_trace_triggers(char *str) +{ + char *trigger; + char *buf; + int i; + + strscpy(bootup_trigger_buf, str, COMMAND_LINE_SIZE); + ring_buffer_expanded = true; + disable_tracing_selftest("running event triggers"); + + buf = bootup_trigger_buf; + for (i = 0; i < MAX_BOOT_TRIGGERS; i++) { + trigger = strsep(&buf, ","); + if (!trigger) + break; + bootup_triggers[i].event = strsep(&trigger, "."); + bootup_triggers[i].trigger = trigger; + if (!bootup_triggers[i].trigger) + break; + } + + nr_boot_triggers = i; + return 1; +} +__setup("trace_trigger=", setup_trace_triggers); + +/* Add an event to a trace directory */ +static int +__trace_add_new_event(struct trace_event_call *call, struct trace_array *tr) +{ + struct trace_event_file *file; + + file = trace_create_new_event(call, tr); + if (!file) + return -ENOMEM; + + if (eventdir_initialized) + return event_create_dir(tr->event_dir, file); + else + return event_define_fields(call); +} + +static void trace_early_triggers(struct trace_event_file *file, const char *name) +{ + int ret; + int i; + + for (i = 0; i < nr_boot_triggers; i++) { + if (strcmp(name, bootup_triggers[i].event)) + continue; + mutex_lock(&event_mutex); + ret = trigger_process_regex(file, bootup_triggers[i].trigger); + mutex_unlock(&event_mutex); + if (ret) + pr_err("Failed to register trigger '%s' on event %s\n", + bootup_triggers[i].trigger, + bootup_triggers[i].event); + } +} + +/* + * Just create a descriptor for early init. A descriptor is required + * for enabling events at boot. We want to enable events before + * the filesystem is initialized. 
+ */ +static int +__trace_early_add_new_event(struct trace_event_call *call, + struct trace_array *tr) +{ + struct trace_event_file *file; + int ret; + + file = trace_create_new_event(call, tr); + if (!file) + return -ENOMEM; + + ret = event_define_fields(call); + if (ret) + return ret; + + trace_early_triggers(file, trace_event_name(call)); + + return 0; +} + +struct ftrace_module_file_ops; +static void __add_event_to_tracers(struct trace_event_call *call); + +/* Add an additional event_call dynamically */ +int trace_add_event_call(struct trace_event_call *call) +{ + int ret; + lockdep_assert_held(&event_mutex); + + mutex_lock(&trace_types_lock); + + ret = __register_event(call, NULL); + if (ret >= 0) + __add_event_to_tracers(call); + + mutex_unlock(&trace_types_lock); + return ret; +} +EXPORT_SYMBOL_GPL(trace_add_event_call); + +/* + * Must be called under locking of trace_types_lock, event_mutex and + * trace_event_sem. + */ +static void __trace_remove_event_call(struct trace_event_call *call) +{ + event_remove(call); + trace_destroy_fields(call); + free_event_filter(call->filter); + call->filter = NULL; +} + +static int probe_remove_event_call(struct trace_event_call *call) +{ + struct trace_array *tr; + struct trace_event_file *file; + +#ifdef CONFIG_PERF_EVENTS + if (call->perf_refcount) + return -EBUSY; +#endif + do_for_each_event_file(tr, file) { + if (file->event_call != call) + continue; + /* + * We can't rely on ftrace_event_enable_disable(enable => 0) + * we are going to do, EVENT_FILE_FL_SOFT_MODE can suppress + * TRACE_REG_UNREGISTER. + */ + if (file->flags & EVENT_FILE_FL_ENABLED) + goto busy; + + if (file->flags & EVENT_FILE_FL_WAS_ENABLED) + tr->clear_trace = true; + /* + * The do_for_each_event_file_safe() is + * a double loop. After finding the call for this + * trace_array, we use break to jump to the next + * trace_array. 
+ */ + break; + } while_for_each_event_file(); + + __trace_remove_event_call(call); + + return 0; + busy: + /* No need to clear the trace now */ + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + tr->clear_trace = false; + } + return -EBUSY; +} + +/* Remove an event_call */ +int trace_remove_event_call(struct trace_event_call *call) +{ + int ret; + + lockdep_assert_held(&event_mutex); + + mutex_lock(&trace_types_lock); + down_write(&trace_event_sem); + ret = probe_remove_event_call(call); + up_write(&trace_event_sem); + mutex_unlock(&trace_types_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(trace_remove_event_call); + +#define for_each_event(event, start, end) \ + for (event = start; \ + (unsigned long)event < (unsigned long)end; \ + event++) + +#ifdef CONFIG_MODULES + +static void trace_module_add_events(struct module *mod) +{ + struct trace_event_call **call, **start, **end; + + if (!mod->num_trace_events) + return; + + /* Don't add infrastructure for mods without tracepoints */ + if (trace_module_has_bad_taint(mod)) { + pr_err("%s: module has bad taint, not creating trace events\n", + mod->name); + return; + } + + start = mod->trace_events; + end = mod->trace_events + mod->num_trace_events; + + for_each_event(call, start, end) { + __register_event(*call, mod); + __add_event_to_tracers(*call); + } +} + +static void trace_module_remove_events(struct module *mod) +{ + struct trace_event_call *call, *p; + struct module_string *modstr, *m; + + down_write(&trace_event_sem); + list_for_each_entry_safe(call, p, &ftrace_events, list) { + if ((call->flags & TRACE_EVENT_FL_DYNAMIC) || !call->module) + continue; + if (call->module == mod) + __trace_remove_event_call(call); + } + /* Check for any strings allocade for this module */ + list_for_each_entry_safe(modstr, m, &module_strings, next) { + if (modstr->module != mod) + continue; + list_del(&modstr->next); + kfree(modstr->str); + kfree(modstr); + } + up_write(&trace_event_sem); + + /* + * It is safest to reset the ring buffer if the module being unloaded + * registered any events that were used. The only worry is if + * a new module gets loaded, and takes on the same id as the events + * of this module. When printing out the buffer, traced events left + * over from this module may be passed to the new module events and + * unexpected results may occur. + */ + tracing_reset_all_online_cpus_unlocked(); +} + +static int trace_module_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct module *mod = data; + + mutex_lock(&event_mutex); + mutex_lock(&trace_types_lock); + switch (val) { + case MODULE_STATE_COMING: + trace_module_add_events(mod); + break; + case MODULE_STATE_GOING: + trace_module_remove_events(mod); + break; + } + mutex_unlock(&trace_types_lock); + mutex_unlock(&event_mutex); + + return NOTIFY_OK; +} + +static struct notifier_block trace_module_nb = { + .notifier_call = trace_module_notify, + .priority = 1, /* higher than trace.c module notify */ +}; +#endif /* CONFIG_MODULES */ + +/* Create a new event directory structure for a trace directory. 
*/ +static void +__trace_add_event_dirs(struct trace_array *tr) +{ + struct trace_event_call *call; + int ret; + + list_for_each_entry(call, &ftrace_events, list) { + ret = __trace_add_new_event(call, tr); + if (ret < 0) + pr_warn("Could not create directory for event %s\n", + trace_event_name(call)); + } +} + +/* Returns any file that matches the system and event */ +struct trace_event_file * +__find_event_file(struct trace_array *tr, const char *system, const char *event) +{ + struct trace_event_file *file; + struct trace_event_call *call; + const char *name; + + list_for_each_entry(file, &tr->events, list) { + + call = file->event_call; + name = trace_event_name(call); + + if (!name || !call->class) + continue; + + if (strcmp(event, name) == 0 && + strcmp(system, call->class->system) == 0) + return file; + } + return NULL; +} + +/* Returns valid trace event files that match system and event */ +struct trace_event_file * +find_event_file(struct trace_array *tr, const char *system, const char *event) +{ + struct trace_event_file *file; + + file = __find_event_file(tr, system, event); + if (!file || !file->event_call->class->reg || + file->event_call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) + return NULL; + + return file; +} + +/** + * trace_get_event_file - Find and return a trace event file + * @instance: The name of the trace instance containing the event + * @system: The name of the system containing the event + * @event: The name of the event + * + * Return a trace event file given the trace instance name, trace + * system, and trace event name. If the instance name is NULL, it + * refers to the top-level trace array. + * + * This function will look it up and return it if found, after calling + * trace_array_get() to prevent the instance from going away, and + * increment the event's module refcount to prevent it from being + * removed. + * + * To release the file, call trace_put_event_file(), which will call + * trace_array_put() and decrement the event's module refcount. + * + * Return: The trace event on success, ERR_PTR otherwise. + */ +struct trace_event_file *trace_get_event_file(const char *instance, + const char *system, + const char *event) +{ + struct trace_array *tr = top_trace_array(); + struct trace_event_file *file = NULL; + int ret = -EINVAL; + + if (instance) { + tr = trace_array_find_get(instance); + if (!tr) + return ERR_PTR(-ENOENT); + } else { + ret = trace_array_get(tr); + if (ret) + return ERR_PTR(ret); + } + + mutex_lock(&event_mutex); + + file = find_event_file(tr, system, event); + if (!file) { + trace_array_put(tr); + ret = -EINVAL; + goto out; + } + + /* Don't let event modules unload while in use */ + ret = trace_event_try_get_ref(file->event_call); + if (!ret) { + trace_array_put(tr); + ret = -EBUSY; + goto out; + } + + ret = 0; + out: + mutex_unlock(&event_mutex); + + if (ret) + file = ERR_PTR(ret); + + return file; +} +EXPORT_SYMBOL_GPL(trace_get_event_file); + +/** + * trace_put_event_file - Release a file from trace_get_event_file() + * @file: The trace event file + * + * If a file was retrieved using trace_get_event_file(), this should + * be called when it's no longer needed. It will cancel the previous + * trace_array_get() called by that function, and decrement the + * event's module refcount. 
+ */ +void trace_put_event_file(struct trace_event_file *file) +{ + mutex_lock(&event_mutex); + trace_event_put_ref(file->event_call); + mutex_unlock(&event_mutex); + + trace_array_put(file->tr); +} +EXPORT_SYMBOL_GPL(trace_put_event_file); + +#ifdef CONFIG_DYNAMIC_FTRACE + +/* Avoid typos */ +#define ENABLE_EVENT_STR "enable_event" +#define DISABLE_EVENT_STR "disable_event" + +struct event_probe_data { + struct trace_event_file *file; + unsigned long count; + int ref; + bool enable; +}; + +static void update_event_probe(struct event_probe_data *data) +{ + if (data->enable) + clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &data->file->flags); + else + set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &data->file->flags); +} + +static void +event_enable_probe(unsigned long ip, unsigned long parent_ip, + struct trace_array *tr, struct ftrace_probe_ops *ops, + void *data) +{ + struct ftrace_func_mapper *mapper = data; + struct event_probe_data *edata; + void **pdata; + + pdata = ftrace_func_mapper_find_ip(mapper, ip); + if (!pdata || !*pdata) + return; + + edata = *pdata; + update_event_probe(edata); +} + +static void +event_enable_count_probe(unsigned long ip, unsigned long parent_ip, + struct trace_array *tr, struct ftrace_probe_ops *ops, + void *data) +{ + struct ftrace_func_mapper *mapper = data; + struct event_probe_data *edata; + void **pdata; + + pdata = ftrace_func_mapper_find_ip(mapper, ip); + if (!pdata || !*pdata) + return; + + edata = *pdata; + + if (!edata->count) + return; + + /* Skip if the event is in a state we want to switch to */ + if (edata->enable == !(edata->file->flags & EVENT_FILE_FL_SOFT_DISABLED)) + return; + + if (edata->count != -1) + (edata->count)--; + + update_event_probe(edata); +} + +static int +event_enable_print(struct seq_file *m, unsigned long ip, + struct ftrace_probe_ops *ops, void *data) +{ + struct ftrace_func_mapper *mapper = data; + struct event_probe_data *edata; + void **pdata; + + pdata = ftrace_func_mapper_find_ip(mapper, ip); + + if (WARN_ON_ONCE(!pdata || !*pdata)) + return 0; + + edata = *pdata; + + seq_printf(m, "%ps:", (void *)ip); + + seq_printf(m, "%s:%s:%s", + edata->enable ? 
ENABLE_EVENT_STR : DISABLE_EVENT_STR, + edata->file->event_call->class->system, + trace_event_name(edata->file->event_call)); + + if (edata->count == -1) + seq_puts(m, ":unlimited\n"); + else + seq_printf(m, ":count=%ld\n", edata->count); + + return 0; +} + +static int +event_enable_init(struct ftrace_probe_ops *ops, struct trace_array *tr, + unsigned long ip, void *init_data, void **data) +{ + struct ftrace_func_mapper *mapper = *data; + struct event_probe_data *edata = init_data; + int ret; + + if (!mapper) { + mapper = allocate_ftrace_func_mapper(); + if (!mapper) + return -ENODEV; + *data = mapper; + } + + ret = ftrace_func_mapper_add_ip(mapper, ip, edata); + if (ret < 0) + return ret; + + edata->ref++; + + return 0; +} + +static int free_probe_data(void *data) +{ + struct event_probe_data *edata = data; + + edata->ref--; + if (!edata->ref) { + /* Remove the SOFT_MODE flag */ + __ftrace_event_enable_disable(edata->file, 0, 1); + trace_event_put_ref(edata->file->event_call); + kfree(edata); + } + return 0; +} + +static void +event_enable_free(struct ftrace_probe_ops *ops, struct trace_array *tr, + unsigned long ip, void *data) +{ + struct ftrace_func_mapper *mapper = data; + struct event_probe_data *edata; + + if (!ip) { + if (!mapper) + return; + free_ftrace_func_mapper(mapper, free_probe_data); + return; + } + + edata = ftrace_func_mapper_remove_ip(mapper, ip); + + if (WARN_ON_ONCE(!edata)) + return; + + if (WARN_ON_ONCE(edata->ref <= 0)) + return; + + free_probe_data(edata); +} + +static struct ftrace_probe_ops event_enable_probe_ops = { + .func = event_enable_probe, + .print = event_enable_print, + .init = event_enable_init, + .free = event_enable_free, +}; + +static struct ftrace_probe_ops event_enable_count_probe_ops = { + .func = event_enable_count_probe, + .print = event_enable_print, + .init = event_enable_init, + .free = event_enable_free, +}; + +static struct ftrace_probe_ops event_disable_probe_ops = { + .func = event_enable_probe, + .print = event_enable_print, + .init = event_enable_init, + .free = event_enable_free, +}; + +static struct ftrace_probe_ops event_disable_count_probe_ops = { + .func = event_enable_count_probe, + .print = event_enable_print, + .init = event_enable_init, + .free = event_enable_free, +}; + +static int +event_enable_func(struct trace_array *tr, struct ftrace_hash *hash, + char *glob, char *cmd, char *param, int enabled) +{ + struct trace_event_file *file; + struct ftrace_probe_ops *ops; + struct event_probe_data *data; + const char *system; + const char *event; + char *number; + bool enable; + int ret; + + if (!tr) + return -ENODEV; + + /* hash funcs only work with set_ftrace_filter */ + if (!enabled || !param) + return -EINVAL; + + system = strsep(&param, ":"); + if (!param) + return -EINVAL; + + event = strsep(&param, ":"); + + mutex_lock(&event_mutex); + + ret = -EINVAL; + file = find_event_file(tr, system, event); + if (!file) + goto out; + + enable = strcmp(cmd, ENABLE_EVENT_STR) == 0; + + if (enable) + ops = param ? &event_enable_count_probe_ops : &event_enable_probe_ops; + else + ops = param ?
&event_disable_count_probe_ops : &event_disable_probe_ops; + + if (glob[0] == '!') { + ret = unregister_ftrace_function_probe_func(glob+1, tr, ops); + goto out; + } + + ret = -ENOMEM; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + goto out; + + data->enable = enable; + data->count = -1; + data->file = file; + + if (!param) + goto out_reg; + + number = strsep(&param, ":"); + + ret = -EINVAL; + if (!strlen(number)) + goto out_free; + + /* + * We use the callback data field (which is a pointer) + * as our counter. + */ + ret = kstrtoul(number, 0, &data->count); + if (ret) + goto out_free; + + out_reg: + /* Don't let event modules unload while probe registered */ + ret = trace_event_try_get_ref(file->event_call); + if (!ret) { + ret = -EBUSY; + goto out_free; + } + + ret = __ftrace_event_enable_disable(file, 1, 1); + if (ret < 0) + goto out_put; + + ret = register_ftrace_function_probe(glob, tr, ops, data); + /* + * The above returns on success the # of functions enabled, + * but if it didn't find any functions it returns zero. + * Consider no functions a failure too. + */ + if (!ret) { + ret = -ENOENT; + goto out_disable; + } else if (ret < 0) + goto out_disable; + /* Just return zero, not the number of enabled functions */ + ret = 0; + out: + mutex_unlock(&event_mutex); + return ret; + + out_disable: + __ftrace_event_enable_disable(file, 0, 1); + out_put: + trace_event_put_ref(file->event_call); + out_free: + kfree(data); + goto out; +} + +static struct ftrace_func_command event_enable_cmd = { + .name = ENABLE_EVENT_STR, + .func = event_enable_func, +}; + +static struct ftrace_func_command event_disable_cmd = { + .name = DISABLE_EVENT_STR, + .func = event_enable_func, +}; + +static __init int register_event_cmds(void) +{ + int ret; + + ret = register_ftrace_command(&event_enable_cmd); + if (WARN_ON(ret < 0)) + return ret; + ret = register_ftrace_command(&event_disable_cmd); + if (WARN_ON(ret < 0)) + unregister_ftrace_command(&event_enable_cmd); + return ret; +} +#else +static inline int register_event_cmds(void) { return 0; } +#endif /* CONFIG_DYNAMIC_FTRACE */ + +/* + * The top level array and trace arrays created by boot-time tracing + * have already had their trace_event_file descriptors created in order + * to allow for early events to be recorded. + * This function is called after the tracefs has been initialized, + * and we now have to create the files associated to the events. + */ +static void __trace_early_add_event_dirs(struct trace_array *tr) +{ + struct trace_event_file *file; + int ret; + + + list_for_each_entry(file, &tr->events, list) { + ret = event_create_dir(tr->event_dir, file); + if (ret < 0) + pr_warn("Could not create directory for event %s\n", + trace_event_name(file->event_call)); + } +} + +/* + * For early boot up, the top trace array and the trace arrays created + * by boot-time tracing need to have a list of events that can be + * enabled. This must be done before the filesystem is set up in order + * to allow events to be traced early. + */ +void __trace_early_add_events(struct trace_array *tr) +{ + struct trace_event_call *call; + int ret; + + list_for_each_entry(call, &ftrace_events, list) { + /* Early boot up should not have any modules loaded */ + if (!(call->flags & TRACE_EVENT_FL_DYNAMIC) && + WARN_ON_ONCE(call->module)) + continue; + + ret = __trace_early_add_new_event(call, tr); + if (ret < 0) + pr_warn("Could not create early event %s\n", + trace_event_name(call)); + } +} + +/* Remove the event directory structure for a trace directory.
*/ +static void +__trace_remove_event_dirs(struct trace_array *tr) +{ + struct trace_event_file *file, *next; + + list_for_each_entry_safe(file, next, &tr->events, list) + remove_event_file_dir(file); +} + +static void __add_event_to_tracers(struct trace_event_call *call) +{ + struct trace_array *tr; + + list_for_each_entry(tr, &ftrace_trace_arrays, list) + __trace_add_new_event(call, tr); +} + +extern struct trace_event_call *__start_ftrace_events[]; +extern struct trace_event_call *__stop_ftrace_events[]; + +static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata; + +static __init int setup_trace_event(char *str) +{ + strscpy(bootup_event_buf, str, COMMAND_LINE_SIZE); + ring_buffer_expanded = true; + disable_tracing_selftest("running event tracing"); + + return 1; +} +__setup("trace_event=", setup_trace_event); + +/* Expects to have event_mutex held when called */ +static int +create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) +{ + struct dentry *d_events; + struct dentry *entry; + int error = 0; + + entry = trace_create_file("set_event", TRACE_MODE_WRITE, parent, + tr, &ftrace_set_event_fops); + if (!entry) + return -ENOMEM; + + d_events = eventfs_create_events_dir("events", parent); + if (IS_ERR(d_events)) { + pr_warn("Could not create tracefs 'events' directory\n"); + return -ENOMEM; + } + + error = eventfs_add_events_file("enable", TRACE_MODE_WRITE, d_events, + tr, &ftrace_tr_enable_fops); + if (error) + return -ENOMEM; + + /* There are not as crucial, just warn if they are not created */ + + trace_create_file("set_event_pid", TRACE_MODE_WRITE, parent, + tr, &ftrace_set_event_pid_fops); + + trace_create_file("set_event_notrace_pid", + TRACE_MODE_WRITE, parent, tr, + &ftrace_set_event_notrace_pid_fops); + + /* ring buffer internal formats */ + eventfs_add_events_file("header_page", TRACE_MODE_READ, d_events, + ring_buffer_print_page_header, + &ftrace_show_header_fops); + + eventfs_add_events_file("header_event", TRACE_MODE_READ, d_events, + ring_buffer_print_entry_header, + &ftrace_show_header_fops); + + tr->event_dir = d_events; + + return 0; +} + +/** + * event_trace_add_tracer - add a instance of a trace_array to events + * @parent: The parent dentry to place the files/directories for events in + * @tr: The trace array associated with these events + * + * When a new instance is created, it needs to set up its events + * directory, as well as other files associated with events. It also + * creates the event hierarchy in the @parent/events directory. + * + * Returns 0 on success. + * + * Must be called with event_mutex held. + */ +int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr) +{ + int ret; + + lockdep_assert_held(&event_mutex); + + ret = create_event_toplevel_files(parent, tr); + if (ret) + goto out; + + down_write(&trace_event_sem); + /* If tr already has the event list, it is initialized in early boot. */ + if (unlikely(!list_empty(&tr->events))) + __trace_early_add_event_dirs(tr); + else + __trace_add_event_dirs(tr); + up_write(&trace_event_sem); + + out: + return ret; +} + +/* + * The top trace array already had its file descriptors created. + * Now the files themselves need to be created. 
+ */ +static __init int +early_event_add_tracer(struct dentry *parent, struct trace_array *tr) +{ + int ret; + + mutex_lock(&event_mutex); + + ret = create_event_toplevel_files(parent, tr); + if (ret) + goto out_unlock; + + down_write(&trace_event_sem); + __trace_early_add_event_dirs(tr); + up_write(&trace_event_sem); + + out_unlock: + mutex_unlock(&event_mutex); + + return ret; +} + +/* Must be called with event_mutex held */ +int event_trace_del_tracer(struct trace_array *tr) +{ + lockdep_assert_held(&event_mutex); + + /* Disable any event triggers and associated soft-disabled events */ + clear_event_triggers(tr); + + /* Clear the pid list */ + __ftrace_clear_event_pids(tr, TRACE_PIDS | TRACE_NO_PIDS); + + /* Disable any running events */ + __ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0); + + /* Make sure no more events are being executed */ + tracepoint_synchronize_unregister(); + + down_write(&trace_event_sem); + __trace_remove_event_dirs(tr); + eventfs_remove_events_dir(tr->event_dir); + up_write(&trace_event_sem); + + tr->event_dir = NULL; + + return 0; +} + +static __init int event_trace_memsetup(void) +{ + field_cachep = KMEM_CACHE(ftrace_event_field, SLAB_PANIC); + file_cachep = KMEM_CACHE(trace_event_file, SLAB_PANIC); + return 0; +} + +__init void +early_enable_events(struct trace_array *tr, char *buf, bool disable_first) +{ + char *token; + int ret; + + while (true) { + token = strsep(&buf, ","); + + if (!token) + break; + + if (*token) { + /* Restarting syscalls requires that we stop them first */ + if (disable_first) + ftrace_set_clr_event(tr, token, 0); + + ret = ftrace_set_clr_event(tr, token, 1); + if (ret) + pr_warn("Failed to enable trace event: %s\n", token); + } + + /* Put back the comma to allow this to be called again */ + if (buf) + *(buf - 1) = ','; + } +} + +static __init int event_trace_enable(void) +{ + struct trace_array *tr = top_trace_array(); + struct trace_event_call **iter, *call; + int ret; + + if (!tr) + return -ENODEV; + + for_each_event(iter, __start_ftrace_events, __stop_ftrace_events) { + + call = *iter; + ret = event_init(call); + if (!ret) + list_add(&call->list, &ftrace_events); + } + + register_trigger_cmds(); + + /* + * We need the top trace array to have a working set of trace + * points at early init, before the debug files and directories + * are created. Create the file entries now, and attach them + * to the actual file dentries later. + */ + __trace_early_add_events(tr); + + early_enable_events(tr, bootup_event_buf, false); + + trace_printk_start_comm(); + + register_event_cmds(); + + + return 0; +} + +/* + * event_trace_enable() is called from trace_event_init() first to + * initialize events and perhaps start any events that are on the + * command line. Unfortunately, there are some events that will not + * start this early, like the system call tracepoints that need + * to set the %SYSCALL_WORK_SYSCALL_TRACEPOINT flag of pid 1. But + * event_trace_enable() is called before pid 1 starts, and this flag + * is never set, making the syscall tracepoint never get reached, but + * the event is enabled regardless (and not doing anything). 
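+ *
+ * For example, booting with trace_event=syscalls:sys_enter_openat enables the
+ * event from the command line in event_trace_enable(), but only this second
+ * pass, running once pid 1 exists, can restart it so that the
+ * %SYSCALL_WORK_SYSCALL_TRACEPOINT flag actually takes effect.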
+ */ +static __init int event_trace_enable_again(void) +{ + struct trace_array *tr; + + tr = top_trace_array(); + if (!tr) + return -ENODEV; + + early_enable_events(tr, bootup_event_buf, true); + + return 0; +} + +early_initcall(event_trace_enable_again); + +/* Init fields which doesn't related to the tracefs */ +static __init int event_trace_init_fields(void) +{ + if (trace_define_generic_fields()) + pr_warn("tracing: Failed to allocated generic fields"); + + if (trace_define_common_fields()) + pr_warn("tracing: Failed to allocate common fields"); + + return 0; +} + +__init int event_trace_init(void) +{ + struct trace_array *tr; + int ret; + + tr = top_trace_array(); + if (!tr) + return -ENODEV; + + trace_create_file("available_events", TRACE_MODE_READ, + NULL, tr, &ftrace_avail_fops); + + ret = early_event_add_tracer(NULL, tr); + if (ret) + return ret; + +#ifdef CONFIG_MODULES + ret = register_module_notifier(&trace_module_nb); + if (ret) + pr_warn("Failed to register trace events module notifier\n"); +#endif + + eventdir_initialized = true; + + return 0; +} + +void __init trace_event_init(void) +{ + event_trace_memsetup(); + init_ftrace_syscalls(); + event_trace_enable(); + event_trace_init_fields(); +} + +#ifdef CONFIG_EVENT_TRACE_STARTUP_TEST + +static DEFINE_SPINLOCK(test_spinlock); +static DEFINE_SPINLOCK(test_spinlock_irq); +static DEFINE_MUTEX(test_mutex); + +static __init void test_work(struct work_struct *dummy) +{ + spin_lock(&test_spinlock); + spin_lock_irq(&test_spinlock_irq); + udelay(1); + spin_unlock_irq(&test_spinlock_irq); + spin_unlock(&test_spinlock); + + mutex_lock(&test_mutex); + msleep(1); + mutex_unlock(&test_mutex); +} + +static __init int event_test_thread(void *unused) +{ + void *test_malloc; + + test_malloc = kmalloc(1234, GFP_KERNEL); + if (!test_malloc) + pr_info("failed to kmalloc\n"); + + schedule_on_each_cpu(test_work); + + kfree(test_malloc); + + set_current_state(TASK_INTERRUPTIBLE); + while (!kthread_should_stop()) { + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); + + return 0; +} + +/* + * Do various things that may trigger events. + */ +static __init void event_test_stuff(void) +{ + struct task_struct *test_thread; + + test_thread = kthread_run(event_test_thread, NULL, "test-events"); + msleep(1); + kthread_stop(test_thread); +} + +/* + * For every trace event defined, we will test each trace point separately, + * and then by groups, and finally all trace points. + */ +static __init void event_trace_self_tests(void) +{ + struct trace_subsystem_dir *dir; + struct trace_event_file *file; + struct trace_event_call *call; + struct event_subsystem *system; + struct trace_array *tr; + int ret; + + tr = top_trace_array(); + if (!tr) + return; + + pr_info("Running tests on trace events:\n"); + + list_for_each_entry(file, &tr->events, list) { + + call = file->event_call; + + /* Only test those that have a probe */ + if (!call->class || !call->class->probe) + continue; + +/* + * Testing syscall events here is pretty useless, but + * we still do it if configured. But this is time consuming. + * What we really need is a user thread to perform the + * syscalls as we test. + */ +#ifndef CONFIG_EVENT_TRACE_TEST_SYSCALLS + if (call->class->system && + strcmp(call->class->system, "syscalls") == 0) + continue; +#endif + + pr_info("Testing event %s: ", trace_event_name(call)); + + /* + * If an event is already enabled, someone is using + * it and the self test should not be on. 
+ */ + if (file->flags & EVENT_FILE_FL_ENABLED) { + pr_warn("Enabled event during self test!\n"); + WARN_ON_ONCE(1); + continue; + } + + ftrace_event_enable_disable(file, 1); + event_test_stuff(); + ftrace_event_enable_disable(file, 0); + + pr_cont("OK\n"); + } + + /* Now test at the sub system level */ + + pr_info("Running tests on trace event systems:\n"); + + list_for_each_entry(dir, &tr->systems, list) { + + system = dir->subsystem; + + /* the ftrace system is special, skip it */ + if (strcmp(system->name, "ftrace") == 0) + continue; + + pr_info("Testing event system %s: ", system->name); + + ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 1); + if (WARN_ON_ONCE(ret)) { + pr_warn("error enabling system %s\n", + system->name); + continue; + } + + event_test_stuff(); + + ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 0); + if (WARN_ON_ONCE(ret)) { + pr_warn("error disabling system %s\n", + system->name); + continue; + } + + pr_cont("OK\n"); + } + + /* Test with all events enabled */ + + pr_info("Running tests on all trace events:\n"); + pr_info("Testing all events: "); + + ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 1); + if (WARN_ON_ONCE(ret)) { + pr_warn("error enabling all events\n"); + return; + } + + event_test_stuff(); + + /* reset sysname */ + ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0); + if (WARN_ON_ONCE(ret)) { + pr_warn("error disabling all events\n"); + return; + } + + pr_cont("OK\n"); +} + +#ifdef CONFIG_FUNCTION_TRACER + +static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable); + +static struct trace_event_file event_trace_file __initdata; + +static void __init +function_test_events_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct ftrace_regs *regs) +{ + struct trace_buffer *buffer; + struct ring_buffer_event *event; + struct ftrace_entry *entry; + unsigned int trace_ctx; + long disabled; + int cpu; + + trace_ctx = tracing_gen_ctx(); + preempt_disable_notrace(); + cpu = raw_smp_processor_id(); + disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu)); + + if (disabled != 1) + goto out; + + event = trace_event_buffer_lock_reserve(&buffer, &event_trace_file, + TRACE_FN, sizeof(*entry), + trace_ctx); + if (!event) + goto out; + entry = ring_buffer_event_data(event); + entry->ip = ip; + entry->parent_ip = parent_ip; + + event_trigger_unlock_commit(&event_trace_file, buffer, event, + entry, trace_ctx); + out: + atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); + preempt_enable_notrace(); +} + +static struct ftrace_ops trace_ops __initdata = +{ + .func = function_test_events_call, +}; + +static __init void event_trace_self_test_with_function(void) +{ + int ret; + + event_trace_file.tr = top_trace_array(); + if (WARN_ON(!event_trace_file.tr)) + return; + + ret = register_ftrace_function(&trace_ops); + if (WARN_ON(ret < 0)) { + pr_info("Failed to enable function tracer for event tests\n"); + return; + } + pr_info("Running tests again, along with the function tracer\n"); + event_trace_self_tests(); + unregister_ftrace_function(&trace_ops); +} +#else +static __init void event_trace_self_test_with_function(void) +{ +} +#endif + +static __init int event_trace_self_tests_init(void) +{ + if (!tracing_selftest_disabled) { + event_trace_self_tests(); + event_trace_self_test_with_function(); + } + + return 0; +} + +late_initcall(event_trace_self_tests_init); + +#endif diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c new file mode 100644 index 
0000000000..0c611b281a --- /dev/null +++ b/kernel/trace/trace_events_filter.c @@ -0,0 +1,2827 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * trace_events_filter - generic event filtering + * + * Copyright (C) 2009 Tom Zanussi <tzanussi@gmail.com> + */ + +#include <linux/uaccess.h> +#include <linux/module.h> +#include <linux/ctype.h> +#include <linux/mutex.h> +#include <linux/perf_event.h> +#include <linux/slab.h> + +#include "trace.h" +#include "trace_output.h" + +#define DEFAULT_SYS_FILTER_MESSAGE \ + "### global filter ###\n" \ + "# Use this to set filters for multiple events.\n" \ + "# Only events with the given fields will be affected.\n" \ + "# If no events are modified, an error message will be displayed here" + +/* Due to token parsing '<=' must be before '<' and '>=' must be before '>' */ +#define OPS \ + C( OP_GLOB, "~" ), \ + C( OP_NE, "!=" ), \ + C( OP_EQ, "==" ), \ + C( OP_LE, "<=" ), \ + C( OP_LT, "<" ), \ + C( OP_GE, ">=" ), \ + C( OP_GT, ">" ), \ + C( OP_BAND, "&" ), \ + C( OP_MAX, NULL ) + +#undef C +#define C(a, b) a + +enum filter_op_ids { OPS }; + +#undef C +#define C(a, b) b + +static const char * ops[] = { OPS }; + +enum filter_pred_fn { + FILTER_PRED_FN_NOP, + FILTER_PRED_FN_64, + FILTER_PRED_FN_64_CPUMASK, + FILTER_PRED_FN_S64, + FILTER_PRED_FN_U64, + FILTER_PRED_FN_32, + FILTER_PRED_FN_32_CPUMASK, + FILTER_PRED_FN_S32, + FILTER_PRED_FN_U32, + FILTER_PRED_FN_16, + FILTER_PRED_FN_16_CPUMASK, + FILTER_PRED_FN_S16, + FILTER_PRED_FN_U16, + FILTER_PRED_FN_8, + FILTER_PRED_FN_8_CPUMASK, + FILTER_PRED_FN_S8, + FILTER_PRED_FN_U8, + FILTER_PRED_FN_COMM, + FILTER_PRED_FN_STRING, + FILTER_PRED_FN_STRLOC, + FILTER_PRED_FN_STRRELLOC, + FILTER_PRED_FN_PCHAR_USER, + FILTER_PRED_FN_PCHAR, + FILTER_PRED_FN_CPU, + FILTER_PRED_FN_CPU_CPUMASK, + FILTER_PRED_FN_CPUMASK, + FILTER_PRED_FN_CPUMASK_CPU, + FILTER_PRED_FN_FUNCTION, + FILTER_PRED_FN_, + FILTER_PRED_TEST_VISITED, +}; + +struct filter_pred { + struct regex *regex; + struct cpumask *mask; + unsigned short *ops; + struct ftrace_event_field *field; + u64 val; + u64 val2; + enum filter_pred_fn fn_num; + int offset; + int not; + int op; +}; + +/* + * pred functions are OP_LE, OP_LT, OP_GE, OP_GT, and OP_BAND + * pred_funcs_##type below must match the order of them above. 
+ */ +#define PRED_FUNC_START OP_LE +#define PRED_FUNC_MAX (OP_BAND - PRED_FUNC_START) + +#define ERRORS \ + C(NONE, "No error"), \ + C(INVALID_OP, "Invalid operator"), \ + C(TOO_MANY_OPEN, "Too many '('"), \ + C(TOO_MANY_CLOSE, "Too few '('"), \ + C(MISSING_QUOTE, "Missing matching quote"), \ + C(MISSING_BRACE_OPEN, "Missing '{'"), \ + C(MISSING_BRACE_CLOSE, "Missing '}'"), \ + C(OPERAND_TOO_LONG, "Operand too long"), \ + C(EXPECT_STRING, "Expecting string field"), \ + C(EXPECT_DIGIT, "Expecting numeric field"), \ + C(ILLEGAL_FIELD_OP, "Illegal operation for field type"), \ + C(FIELD_NOT_FOUND, "Field not found"), \ + C(ILLEGAL_INTVAL, "Illegal integer value"), \ + C(BAD_SUBSYS_FILTER, "Couldn't find or set field in one of a subsystem's events"), \ + C(TOO_MANY_PREDS, "Too many terms in predicate expression"), \ + C(INVALID_FILTER, "Meaningless filter expression"), \ + C(INVALID_CPULIST, "Invalid cpulist"), \ + C(IP_FIELD_ONLY, "Only 'ip' field is supported for function trace"), \ + C(INVALID_VALUE, "Invalid value (did you forget quotes)?"), \ + C(NO_FUNCTION, "Function not found"), \ + C(ERRNO, "Error"), \ + C(NO_FILTER, "No filter found") + +#undef C +#define C(a, b) FILT_ERR_##a + +enum { ERRORS }; + +#undef C +#define C(a, b) b + +static const char *err_text[] = { ERRORS }; + +/* Called after a '!' character but "!=" and "!~" are not "not"s */ +static bool is_not(const char *str) +{ + switch (str[1]) { + case '=': + case '~': + return false; + } + return true; +} + +/** + * struct prog_entry - a singe entry in the filter program + * @target: Index to jump to on a branch (actually one minus the index) + * @when_to_branch: The value of the result of the predicate to do a branch + * @pred: The predicate to execute. + */ +struct prog_entry { + int target; + int when_to_branch; + struct filter_pred *pred; +}; + +/** + * update_preds - assign a program entry a label target + * @prog: The program array + * @N: The index of the current entry in @prog + * @invert: What to assign a program entry for its branch condition + * + * The program entry at @N has a target that points to the index of a program + * entry that can have its target and when_to_branch fields updated. + * Update the current program entry denoted by index @N target field to be + * that of the updated entry. This will denote the entry to update if + * we are processing an "||" after an "&&". + */ +static void update_preds(struct prog_entry *prog, int N, int invert) +{ + int t, s; + + t = prog[N].target; + s = prog[t].target; + prog[t].when_to_branch = invert; + prog[t].target = N; + prog[N].target = s; +} + +struct filter_parse_error { + int lasterr; + int lasterr_pos; +}; + +static void parse_error(struct filter_parse_error *pe, int err, int pos) +{ + pe->lasterr = err; + pe->lasterr_pos = pos; +} + +typedef int (*parse_pred_fn)(const char *str, void *data, int pos, + struct filter_parse_error *pe, + struct filter_pred **pred); + +enum { + INVERT = 1, + PROCESS_AND = 2, + PROCESS_OR = 4, +}; + +static void free_predicate(struct filter_pred *pred) +{ + if (pred) { + kfree(pred->regex); + kfree(pred->mask); + kfree(pred); + } +} + +/* + * Without going into a formal proof, this explains the method that is used in + * parsing the logical expressions. 
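+ * These expressions are the boolean part of event filters written through
+ * tracefs, e.g. (sketch):
+ *   echo '(prio < 100 && !(comm ~ "kworker*")) || pid == 1' > events/sched/sched_wakeup/filter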
+ * + * For example, if we have: "a && !(!b || (c && g)) || d || e && !f" + * The first pass will convert it into the following program: + * + * n1: r=a; l1: if (!r) goto l4; + * n2: r=b; l2: if (!r) goto l4; + * n3: r=c; r=!r; l3: if (r) goto l4; + * n4: r=g; r=!r; l4: if (r) goto l5; + * n5: r=d; l5: if (r) goto T + * n6: r=e; l6: if (!r) goto l7; + * n7: r=f; r=!r; l7: if (!r) goto F + * T: return TRUE + * F: return FALSE + * + * To do this, we use a data structure to represent each of the above + * predicate and conditions that has: + * + * predicate, when_to_branch, invert, target + * + * The "predicate" will hold the function to determine the result "r". + * The "when_to_branch" denotes what "r" should be if a branch is to be taken + * "&&" would contain "!r" or (0) and "||" would contain "r" or (1). + * The "invert" holds whether the value should be reversed before testing. + * The "target" contains the label "l#" to jump to. + * + * A stack is created to hold values when parentheses are used. + * + * To simplify the logic, the labels will start at 0 and not 1. + * + * The possible invert values are 1 and 0. The number of "!"s that are in scope + * before the predicate determines the invert value, if the number is odd then + * the invert value is 1 and 0 otherwise. This means the invert value only + * needs to be toggled when a new "!" is introduced compared to what is stored + * on the stack, where parentheses were used. + * + * The top of the stack and "invert" are initialized to zero. + * + * ** FIRST PASS ** + * + * #1 A loop through all the tokens is done: + * + * #2 If the token is an "(", the stack is push, and the current stack value + * gets the current invert value, and the loop continues to the next token. + * The top of the stack saves the "invert" value to keep track of what + * the current inversion is. As "!(a && !b || c)" would require all + * predicates being affected separately by the "!" before the parentheses. + * And that would end up being equivalent to "(!a || b) && !c" + * + * #3 If the token is an "!", the current "invert" value gets inverted, and + * the loop continues. Note, if the next token is a predicate, then + * this "invert" value is only valid for the current program entry, + * and does not affect other predicates later on. + * + * The only other acceptable token is the predicate string. + * + * #4 A new entry into the program is added saving: the predicate and the + * current value of "invert". The target is currently assigned to the + * previous program index (this will not be its final value). + * + * #5 We now enter another loop and look at the next token. The only valid + * tokens are ")", "&&", "||" or end of the input string "\0". + * + * #6 The invert variable is reset to the current value saved on the top of + * the stack. + * + * #7 The top of the stack holds not only the current invert value, but also + * if a "&&" or "||" needs to be processed. Note, the "&&" takes higher + * precedence than "||". That is "a && b || c && d" is equivalent to + * "(a && b) || (c && d)". Thus the first thing to do is to see if "&&" needs + * to be processed. This is the case if an "&&" was the last token. If it was + * then we call update_preds(). This takes the program, the current index in + * the program, and the current value of "invert". More will be described + * below about this function. 
+ * + * #8 If the next token is "&&" then we set a flag in the top of the stack + * that denotes that "&&" needs to be processed, break out of this loop + * and continue with the outer loop. + * + * #9 Otherwise, if a "||" needs to be processed then update_preds() is called. + * This is called with the program, the current index in the program, but + * this time with an inverted value of "invert" (that is !invert). This is + * because the value taken will become the "when_to_branch" value of the + * program. + * Note, this is called when the next token is not an "&&". As stated before, + * "&&" takes higher precedence, and "||" should not be processed yet if the + * next logical operation is "&&". + * + * #10 If the next token is "||" then we set a flag in the top of the stack + * that denotes that "||" needs to be processed, break out of this loop + * and continue with the outer loop. + * + * #11 If this is the end of the input string "\0" then we break out of both + * loops. + * + * #12 Otherwise, the next token is ")", where we pop the stack and continue + * this inner loop. + * + * Now to discuss the update_pred() function, as that is key to the setting up + * of the program. Remember the "target" of the program is initialized to the + * previous index and not the "l" label. The target holds the index into the + * program that gets affected by the operand. Thus if we have something like + * "a || b && c", when we process "a" the target will be "-1" (undefined). + * When we process "b", its target is "0", which is the index of "a", as that's + * the predicate that is affected by "||". But because the next token after "b" + * is "&&" we don't call update_preds(). Instead continue to "c". As the + * next token after "c" is not "&&" but the end of input, we first process the + * "&&" by calling update_preds() for the "&&" then we process the "||" by + * calling updates_preds() with the values for processing "||". + * + * What does that mean? What update_preds() does is to first save the "target" + * of the program entry indexed by the current program entry's "target" + * (remember the "target" is initialized to previous program entry), and then + * sets that "target" to the current index which represents the label "l#". + * That entry's "when_to_branch" is set to the value passed in (the "invert" + * or "!invert"). Then it sets the current program entry's target to the saved + * "target" value (the old value of the program that had its "target" updated + * to the label). 
+ * + * Looking back at "a || b && c", we have the following steps: + * "a" - prog[0] = { "a", X, -1 } // pred, when_to_branch, target + * "||" - flag that we need to process "||"; continue outer loop + * "b" - prog[1] = { "b", X, 0 } + * "&&" - flag that we need to process "&&"; continue outer loop + * (Notice we did not process "||") + * "c" - prog[2] = { "c", X, 1 } + * update_preds(prog, 2, 0); // invert = 0 as we are processing "&&" + * t = prog[2].target; // t = 1 + * s = prog[t].target; // s = 0 + * prog[t].target = 2; // Set target to "l2" + * prog[t].when_to_branch = 0; + * prog[2].target = s; + * update_preds(prog, 2, 1); // invert = 1 as we are now processing "||" + * t = prog[2].target; // t = 0 + * s = prog[t].target; // s = -1 + * prog[t].target = 2; // Set target to "l2" + * prog[t].when_to_branch = 1; + * prog[2].target = s; + * + * #13 Which brings us to the final step of the first pass, which is to set + * the last program entry's when_to_branch and target, which will be + * when_to_branch = 0; target = N; ( the label after the program entry after + * the last program entry processed above). + * + * If we denote "TRUE" to be the entry after the last program entry processed, + * and "FALSE" the program entry after that, we are now done with the first + * pass. + * + * Making the above "a || b && c" have a program of: + * prog[0] = { "a", 1, 2 } + * prog[1] = { "b", 0, 2 } + * prog[2] = { "c", 0, 3 } + * + * Which translates into: + * n0: r = a; l0: if (r) goto l2; + * n1: r = b; l1: if (!r) goto l2; + * n2: r = c; l2: if (!r) goto l3; // Which is the same as "goto F;" + * T: return TRUE; l3: + * F: return FALSE + * + * Although, after the first pass, the program is correct, it is + * inefficient. The simple sample of "a || b && c" could be easily been + * converted into: + * n0: r = a; if (r) goto T + * n1: r = b; if (!r) goto F + * n2: r = c; if (!r) goto F + * T: return TRUE; + * F: return FALSE; + * + * The First Pass is over the input string. The next too passes are over + * the program itself. + * + * ** SECOND PASS ** + * + * Which brings us to the second pass. If a jump to a label has the + * same condition as that label, it can instead jump to its target. + * The original example of "a && !(!b || (c && g)) || d || e && !f" + * where the first pass gives us: + * + * n1: r=a; l1: if (!r) goto l4; + * n2: r=b; l2: if (!r) goto l4; + * n3: r=c; r=!r; l3: if (r) goto l4; + * n4: r=g; r=!r; l4: if (r) goto l5; + * n5: r=d; l5: if (r) goto T + * n6: r=e; l6: if (!r) goto l7; + * n7: r=f; r=!r; l7: if (!r) goto F: + * T: return TRUE; + * F: return FALSE + * + * We can see that "l3: if (r) goto l4;" and at l4, we have "if (r) goto l5;". + * And "l5: if (r) goto T", we could optimize this by converting l3 and l4 + * to go directly to T. To accomplish this, we start from the last + * entry in the program and work our way back. If the target of the entry + * has the same "when_to_branch" then we could use that entry's target. + * Doing this, the above would end up as: + * + * n1: r=a; l1: if (!r) goto l4; + * n2: r=b; l2: if (!r) goto l4; + * n3: r=c; r=!r; l3: if (r) goto T; + * n4: r=g; r=!r; l4: if (r) goto T; + * n5: r=d; l5: if (r) goto T; + * n6: r=e; l6: if (!r) goto F; + * n7: r=f; r=!r; l7: if (!r) goto F; + * T: return TRUE + * F: return FALSE + * + * In that same pass, if the "when_to_branch" doesn't match, we can simply + * go to the program entry after the label. 
That is, "l2: if (!r) goto l4;" + * where "l4: if (r) goto T;", then we can convert l2 to be: + * "l2: if (!r) goto n5;". + * + * This will have the second pass give us: + * n1: r=a; l1: if (!r) goto n5; + * n2: r=b; l2: if (!r) goto n5; + * n3: r=c; r=!r; l3: if (r) goto T; + * n4: r=g; r=!r; l4: if (r) goto T; + * n5: r=d; l5: if (r) goto T + * n6: r=e; l6: if (!r) goto F; + * n7: r=f; r=!r; l7: if (!r) goto F + * T: return TRUE + * F: return FALSE + * + * Notice, all the "l#" labels are no longer used, and they can now + * be discarded. + * + * ** THIRD PASS ** + * + * For the third pass we deal with the inverts. As they simply just + * make the "when_to_branch" get inverted, a simple loop over the + * program to that does: "when_to_branch ^= invert;" will do the + * job, leaving us with: + * n1: r=a; if (!r) goto n5; + * n2: r=b; if (!r) goto n5; + * n3: r=c: if (!r) goto T; + * n4: r=g; if (!r) goto T; + * n5: r=d; if (r) goto T + * n6: r=e; if (!r) goto F; + * n7: r=f; if (r) goto F + * T: return TRUE + * F: return FALSE + * + * As "r = a; if (!r) goto n5;" is obviously the same as + * "if (!a) goto n5;" without doing anything we can interpret the + * program as: + * n1: if (!a) goto n5; + * n2: if (!b) goto n5; + * n3: if (!c) goto T; + * n4: if (!g) goto T; + * n5: if (d) goto T + * n6: if (!e) goto F; + * n7: if (f) goto F + * T: return TRUE + * F: return FALSE + * + * Since the inverts are discarded at the end, there's no reason to store + * them in the program array (and waste memory). A separate array to hold + * the inverts is used and freed at the end. + */ +static struct prog_entry * +predicate_parse(const char *str, int nr_parens, int nr_preds, + parse_pred_fn parse_pred, void *data, + struct filter_parse_error *pe) +{ + struct prog_entry *prog_stack; + struct prog_entry *prog; + const char *ptr = str; + char *inverts = NULL; + int *op_stack; + int *top; + int invert = 0; + int ret = -ENOMEM; + int len; + int N = 0; + int i; + + nr_preds += 2; /* For TRUE and FALSE */ + + op_stack = kmalloc_array(nr_parens, sizeof(*op_stack), GFP_KERNEL); + if (!op_stack) + return ERR_PTR(-ENOMEM); + prog_stack = kcalloc(nr_preds, sizeof(*prog_stack), GFP_KERNEL); + if (!prog_stack) { + parse_error(pe, -ENOMEM, 0); + goto out_free; + } + inverts = kmalloc_array(nr_preds, sizeof(*inverts), GFP_KERNEL); + if (!inverts) { + parse_error(pe, -ENOMEM, 0); + goto out_free; + } + + top = op_stack; + prog = prog_stack; + *top = 0; + + /* First pass */ + while (*ptr) { /* #1 */ + const char *next = ptr++; + + if (isspace(*next)) + continue; + + switch (*next) { + case '(': /* #2 */ + if (top - op_stack > nr_parens) { + ret = -EINVAL; + goto out_free; + } + *(++top) = invert; + continue; + case '!': /* #3 */ + if (!is_not(next)) + break; + invert = !invert; + continue; + } + + if (N >= nr_preds) { + parse_error(pe, FILT_ERR_TOO_MANY_PREDS, next - str); + goto out_free; + } + + inverts[N] = invert; /* #4 */ + prog[N].target = N-1; + + len = parse_pred(next, data, ptr - str, pe, &prog[N].pred); + if (len < 0) { + ret = len; + goto out_free; + } + ptr = next + len; + + N++; + + ret = -1; + while (1) { /* #5 */ + next = ptr++; + if (isspace(*next)) + continue; + + switch (*next) { + case ')': + case '\0': + break; + case '&': + case '|': + /* accepting only "&&" or "||" */ + if (next[1] == next[0]) { + ptr++; + break; + } + fallthrough; + default: + parse_error(pe, FILT_ERR_TOO_MANY_PREDS, + next - str); + goto out_free; + } + + invert = *top & INVERT; + + if (*top & PROCESS_AND) { /* #7 */ + 
update_preds(prog, N - 1, invert); + *top &= ~PROCESS_AND; + } + if (*next == '&') { /* #8 */ + *top |= PROCESS_AND; + break; + } + if (*top & PROCESS_OR) { /* #9 */ + update_preds(prog, N - 1, !invert); + *top &= ~PROCESS_OR; + } + if (*next == '|') { /* #10 */ + *top |= PROCESS_OR; + break; + } + if (!*next) /* #11 */ + goto out; + + if (top == op_stack) { + ret = -1; + /* Too few '(' */ + parse_error(pe, FILT_ERR_TOO_MANY_CLOSE, ptr - str); + goto out_free; + } + top--; /* #12 */ + } + } + out: + if (top != op_stack) { + /* Too many '(' */ + parse_error(pe, FILT_ERR_TOO_MANY_OPEN, ptr - str); + goto out_free; + } + + if (!N) { + /* No program? */ + ret = -EINVAL; + parse_error(pe, FILT_ERR_NO_FILTER, ptr - str); + goto out_free; + } + + prog[N].pred = NULL; /* #13 */ + prog[N].target = 1; /* TRUE */ + prog[N+1].pred = NULL; + prog[N+1].target = 0; /* FALSE */ + prog[N-1].target = N; + prog[N-1].when_to_branch = false; + + /* Second Pass */ + for (i = N-1 ; i--; ) { + int target = prog[i].target; + if (prog[i].when_to_branch == prog[target].when_to_branch) + prog[i].target = prog[target].target; + } + + /* Third Pass */ + for (i = 0; i < N; i++) { + invert = inverts[i] ^ prog[i].when_to_branch; + prog[i].when_to_branch = invert; + /* Make sure the program always moves forward */ + if (WARN_ON(prog[i].target <= i)) { + ret = -EINVAL; + goto out_free; + } + } + + kfree(op_stack); + kfree(inverts); + return prog; +out_free: + kfree(op_stack); + kfree(inverts); + if (prog_stack) { + for (i = 0; prog_stack[i].pred; i++) + free_predicate(prog_stack[i].pred); + kfree(prog_stack); + } + return ERR_PTR(ret); +} + +static inline int +do_filter_cpumask(int op, const struct cpumask *mask, const struct cpumask *cmp) +{ + switch (op) { + case OP_EQ: + return cpumask_equal(mask, cmp); + case OP_NE: + return !cpumask_equal(mask, cmp); + case OP_BAND: + return cpumask_intersects(mask, cmp); + default: + return 0; + } +} + +/* Optimisation of do_filter_cpumask() for scalar fields */ +static inline int +do_filter_scalar_cpumask(int op, unsigned int cpu, const struct cpumask *mask) +{ + /* + * Per the weight-of-one cpumask optimisations, the mask passed in this + * function has a weight >= 2, so it is never equal to a single scalar. 
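+ * For example, a filter of 'target_cpu & CPUS{2-7}' on a scalar CPU-number
+ * field is routed here through the FILTER_PRED_FN_*_CPUMASK helpers below.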
+ */ + switch (op) { + case OP_EQ: + return false; + case OP_NE: + return true; + case OP_BAND: + return cpumask_test_cpu(cpu, mask); + default: + return 0; + } +} + +static inline int +do_filter_cpumask_scalar(int op, const struct cpumask *mask, unsigned int cpu) +{ + switch (op) { + case OP_EQ: + return cpumask_test_cpu(cpu, mask) && + cpumask_nth(1, mask) >= nr_cpu_ids; + case OP_NE: + return !cpumask_test_cpu(cpu, mask) || + cpumask_nth(1, mask) < nr_cpu_ids; + case OP_BAND: + return cpumask_test_cpu(cpu, mask); + default: + return 0; + } +} + +enum pred_cmp_types { + PRED_CMP_TYPE_NOP, + PRED_CMP_TYPE_LT, + PRED_CMP_TYPE_LE, + PRED_CMP_TYPE_GT, + PRED_CMP_TYPE_GE, + PRED_CMP_TYPE_BAND, +}; + +#define DEFINE_COMPARISON_PRED(type) \ +static int filter_pred_##type(struct filter_pred *pred, void *event) \ +{ \ + switch (pred->op) { \ + case OP_LT: { \ + type *addr = (type *)(event + pred->offset); \ + type val = (type)pred->val; \ + return *addr < val; \ + } \ + case OP_LE: { \ + type *addr = (type *)(event + pred->offset); \ + type val = (type)pred->val; \ + return *addr <= val; \ + } \ + case OP_GT: { \ + type *addr = (type *)(event + pred->offset); \ + type val = (type)pred->val; \ + return *addr > val; \ + } \ + case OP_GE: { \ + type *addr = (type *)(event + pred->offset); \ + type val = (type)pred->val; \ + return *addr >= val; \ + } \ + case OP_BAND: { \ + type *addr = (type *)(event + pred->offset); \ + type val = (type)pred->val; \ + return !!(*addr & val); \ + } \ + default: \ + return 0; \ + } \ +} + +#define DEFINE_CPUMASK_COMPARISON_PRED(size) \ +static int filter_pred_##size##_cpumask(struct filter_pred *pred, void *event) \ +{ \ + u##size *addr = (u##size *)(event + pred->offset); \ + unsigned int cpu = *addr; \ + \ + if (cpu >= nr_cpu_ids) \ + return 0; \ + \ + return do_filter_scalar_cpumask(pred->op, cpu, pred->mask); \ +} + +#define DEFINE_EQUALITY_PRED(size) \ +static int filter_pred_##size(struct filter_pred *pred, void *event) \ +{ \ + u##size *addr = (u##size *)(event + pred->offset); \ + u##size val = (u##size)pred->val; \ + int match; \ + \ + match = (val == *addr) ^ pred->not; \ + \ + return match; \ +} + +DEFINE_COMPARISON_PRED(s64); +DEFINE_COMPARISON_PRED(u64); +DEFINE_COMPARISON_PRED(s32); +DEFINE_COMPARISON_PRED(u32); +DEFINE_COMPARISON_PRED(s16); +DEFINE_COMPARISON_PRED(u16); +DEFINE_COMPARISON_PRED(s8); +DEFINE_COMPARISON_PRED(u8); + +DEFINE_CPUMASK_COMPARISON_PRED(64); +DEFINE_CPUMASK_COMPARISON_PRED(32); +DEFINE_CPUMASK_COMPARISON_PRED(16); +DEFINE_CPUMASK_COMPARISON_PRED(8); + +DEFINE_EQUALITY_PRED(64); +DEFINE_EQUALITY_PRED(32); +DEFINE_EQUALITY_PRED(16); +DEFINE_EQUALITY_PRED(8); + +/* user space strings temp buffer */ +#define USTRING_BUF_SIZE 1024 + +struct ustring_buffer { + char buffer[USTRING_BUF_SIZE]; +}; + +static __percpu struct ustring_buffer *ustring_per_cpu; + +static __always_inline char *test_string(char *str) +{ + struct ustring_buffer *ubuf; + char *kstr; + + if (!ustring_per_cpu) + return NULL; + + ubuf = this_cpu_ptr(ustring_per_cpu); + kstr = ubuf->buffer; + + /* For safety, do not trust the string pointer */ + if (!strncpy_from_kernel_nofault(kstr, str, USTRING_BUF_SIZE)) + return NULL; + return kstr; +} + +static __always_inline char *test_ustring(char *str) +{ + struct ustring_buffer *ubuf; + char __user *ustr; + char *kstr; + + if (!ustring_per_cpu) + return NULL; + + ubuf = this_cpu_ptr(ustring_per_cpu); + kstr = ubuf->buffer; + + /* user space address? 
*/ + ustr = (char __user *)str; + if (!strncpy_from_user_nofault(kstr, ustr, USTRING_BUF_SIZE)) + return NULL; + + return kstr; +} + +/* Filter predicate for fixed sized arrays of characters */ +static int filter_pred_string(struct filter_pred *pred, void *event) +{ + char *addr = (char *)(event + pred->offset); + int cmp, match; + + cmp = pred->regex->match(addr, pred->regex, pred->regex->field_len); + + match = cmp ^ pred->not; + + return match; +} + +static __always_inline int filter_pchar(struct filter_pred *pred, char *str) +{ + int cmp, match; + int len; + + len = strlen(str) + 1; /* including tailing '\0' */ + cmp = pred->regex->match(str, pred->regex, len); + + match = cmp ^ pred->not; + + return match; +} +/* Filter predicate for char * pointers */ +static int filter_pred_pchar(struct filter_pred *pred, void *event) +{ + char **addr = (char **)(event + pred->offset); + char *str; + + str = test_string(*addr); + if (!str) + return 0; + + return filter_pchar(pred, str); +} + +/* Filter predicate for char * pointers in user space*/ +static int filter_pred_pchar_user(struct filter_pred *pred, void *event) +{ + char **addr = (char **)(event + pred->offset); + char *str; + + str = test_ustring(*addr); + if (!str) + return 0; + + return filter_pchar(pred, str); +} + +/* + * Filter predicate for dynamic sized arrays of characters. + * These are implemented through a list of strings at the end + * of the entry. + * Also each of these strings have a field in the entry which + * contains its offset from the beginning of the entry. + * We have then first to get this field, dereference it + * and add it to the address of the entry, and at last we have + * the address of the string. + */ +static int filter_pred_strloc(struct filter_pred *pred, void *event) +{ + u32 str_item = *(u32 *)(event + pred->offset); + int str_loc = str_item & 0xffff; + int str_len = str_item >> 16; + char *addr = (char *)(event + str_loc); + int cmp, match; + + cmp = pred->regex->match(addr, pred->regex, str_len); + + match = cmp ^ pred->not; + + return match; +} + +/* + * Filter predicate for relative dynamic sized arrays of characters. + * These are implemented through a list of strings at the end + * of the entry as same as dynamic string. + * The difference is that the relative one records the location offset + * from the field itself, not the event entry. + */ +static int filter_pred_strrelloc(struct filter_pred *pred, void *event) +{ + u32 *item = (u32 *)(event + pred->offset); + u32 str_item = *item; + int str_loc = str_item & 0xffff; + int str_len = str_item >> 16; + char *addr = (char *)(&item[1]) + str_loc; + int cmp, match; + + cmp = pred->regex->match(addr, pred->regex, str_len); + + match = cmp ^ pred->not; + + return match; +} + +/* Filter predicate for CPUs. 
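+ * The value tested is the CPU currently recording the event, e.g. a filter
+ * of 'common_cpu == 1'.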
*/ +static int filter_pred_cpu(struct filter_pred *pred, void *event) +{ + int cpu, cmp; + + cpu = raw_smp_processor_id(); + cmp = pred->val; + + switch (pred->op) { + case OP_EQ: + return cpu == cmp; + case OP_NE: + return cpu != cmp; + case OP_LT: + return cpu < cmp; + case OP_LE: + return cpu <= cmp; + case OP_GT: + return cpu > cmp; + case OP_GE: + return cpu >= cmp; + default: + return 0; + } +} + +/* Filter predicate for current CPU vs user-provided cpumask */ +static int filter_pred_cpu_cpumask(struct filter_pred *pred, void *event) +{ + int cpu = raw_smp_processor_id(); + + return do_filter_scalar_cpumask(pred->op, cpu, pred->mask); +} + +/* Filter predicate for cpumask field vs user-provided cpumask */ +static int filter_pred_cpumask(struct filter_pred *pred, void *event) +{ + u32 item = *(u32 *)(event + pred->offset); + int loc = item & 0xffff; + const struct cpumask *mask = (event + loc); + const struct cpumask *cmp = pred->mask; + + return do_filter_cpumask(pred->op, mask, cmp); +} + +/* Filter predicate for cpumask field vs user-provided scalar */ +static int filter_pred_cpumask_cpu(struct filter_pred *pred, void *event) +{ + u32 item = *(u32 *)(event + pred->offset); + int loc = item & 0xffff; + const struct cpumask *mask = (event + loc); + unsigned int cpu = pred->val; + + return do_filter_cpumask_scalar(pred->op, mask, cpu); +} + +/* Filter predicate for COMM. */ +static int filter_pred_comm(struct filter_pred *pred, void *event) +{ + int cmp; + + cmp = pred->regex->match(current->comm, pred->regex, + TASK_COMM_LEN); + return cmp ^ pred->not; +} + +/* Filter predicate for functions. */ +static int filter_pred_function(struct filter_pred *pred, void *event) +{ + unsigned long *addr = (unsigned long *)(event + pred->offset); + unsigned long start = (unsigned long)pred->val; + unsigned long end = (unsigned long)pred->val2; + int ret = *addr >= start && *addr < end; + + return pred->op == OP_EQ ? ret : !ret; +} + +/* + * regex_match_foo - Basic regex callbacks + * + * @str: the string to be searched + * @r: the regex structure containing the pattern string + * @len: the length of the string to be searched (including '\0') + * + * Note: + * - @str might not be NULL-terminated if it's of type DYN_STRING + * RDYN_STRING, or STATIC_STRING, unless @len is zero. 
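+ *
+ * For a '~' filter, filter_parse_regex() below picks the callback from the
+ * shape of the pattern, e.g. "kworker*" -> regex_match_front(),
+ * "*bash" -> regex_match_end(), "*irq*" -> regex_match_middle(), and a
+ * pattern with '?', '[' or an interior '*' -> regex_match_glob().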
+ */ + +static int regex_match_full(char *str, struct regex *r, int len) +{ + /* len of zero means str is dynamic and ends with '\0' */ + if (!len) + return strcmp(str, r->pattern) == 0; + + return strncmp(str, r->pattern, len) == 0; +} + +static int regex_match_front(char *str, struct regex *r, int len) +{ + if (len && len < r->len) + return 0; + + return strncmp(str, r->pattern, r->len) == 0; +} + +static int regex_match_middle(char *str, struct regex *r, int len) +{ + if (!len) + return strstr(str, r->pattern) != NULL; + + return strnstr(str, r->pattern, len) != NULL; +} + +static int regex_match_end(char *str, struct regex *r, int len) +{ + int strlen = len - 1; + + if (strlen >= r->len && + memcmp(str + strlen - r->len, r->pattern, r->len) == 0) + return 1; + return 0; +} + +static int regex_match_glob(char *str, struct regex *r, int len __maybe_unused) +{ + if (glob_match(r->pattern, str)) + return 1; + return 0; +} + +/** + * filter_parse_regex - parse a basic regex + * @buff: the raw regex + * @len: length of the regex + * @search: will point to the beginning of the string to compare + * @not: tell whether the match will have to be inverted + * + * This passes in a buffer containing a regex and this function will + * set search to point to the search part of the buffer and + * return the type of search it is (see enum above). + * This does modify buff. + * + * Returns enum type. + * search returns the pointer to use for comparison. + * not returns 1 if buff started with a '!' + * 0 otherwise. + */ +enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not) +{ + int type = MATCH_FULL; + int i; + + if (buff[0] == '!') { + *not = 1; + buff++; + len--; + } else + *not = 0; + + *search = buff; + + if (isdigit(buff[0])) + return MATCH_INDEX; + + for (i = 0; i < len; i++) { + if (buff[i] == '*') { + if (!i) { + type = MATCH_END_ONLY; + } else if (i == len - 1) { + if (type == MATCH_END_ONLY) + type = MATCH_MIDDLE_ONLY; + else + type = MATCH_FRONT_ONLY; + buff[i] = 0; + break; + } else { /* pattern continues, use full glob */ + return MATCH_GLOB; + } + } else if (strchr("[?\\", buff[i])) { + return MATCH_GLOB; + } + } + if (buff[0] == '*') + *search = buff + 1; + + return type; +} + +static void filter_build_regex(struct filter_pred *pred) +{ + struct regex *r = pred->regex; + char *search; + enum regex_type type = MATCH_FULL; + + if (pred->op == OP_GLOB) { + type = filter_parse_regex(r->pattern, r->len, &search, &pred->not); + r->len = strlen(search); + memmove(r->pattern, search, r->len+1); + } + + switch (type) { + /* MATCH_INDEX should not happen, but if it does, match full */ + case MATCH_INDEX: + case MATCH_FULL: + r->match = regex_match_full; + break; + case MATCH_FRONT_ONLY: + r->match = regex_match_front; + break; + case MATCH_MIDDLE_ONLY: + r->match = regex_match_middle; + break; + case MATCH_END_ONLY: + r->match = regex_match_end; + break; + case MATCH_GLOB: + r->match = regex_match_glob; + break; + } +} + + +#ifdef CONFIG_FTRACE_STARTUP_TEST +static int test_pred_visited_fn(struct filter_pred *pred, void *event); +#else +static int test_pred_visited_fn(struct filter_pred *pred, void *event) +{ + return 0; +} +#endif + + +static int filter_pred_fn_call(struct filter_pred *pred, void *event); + +/* return 1 if event matches, 0 otherwise (discard) */ +int filter_match_preds(struct event_filter *filter, void *rec) +{ + struct prog_entry *prog; + int i; + + /* no filter is considered a match */ + if (!filter) + return 1; + + /* Protected by either 
SRCU(tracepoint_srcu) or preempt_disable */ + prog = rcu_dereference_raw(filter->prog); + if (!prog) + return 1; + + for (i = 0; prog[i].pred; i++) { + struct filter_pred *pred = prog[i].pred; + int match = filter_pred_fn_call(pred, rec); + if (match == prog[i].when_to_branch) + i = prog[i].target; + } + return prog[i].target; +} +EXPORT_SYMBOL_GPL(filter_match_preds); + +static void remove_filter_string(struct event_filter *filter) +{ + if (!filter) + return; + + kfree(filter->filter_string); + filter->filter_string = NULL; +} + +static void append_filter_err(struct trace_array *tr, + struct filter_parse_error *pe, + struct event_filter *filter) +{ + struct trace_seq *s; + int pos = pe->lasterr_pos; + char *buf; + int len; + + if (WARN_ON(!filter->filter_string)) + return; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return; + trace_seq_init(s); + + len = strlen(filter->filter_string); + if (pos > len) + pos = len; + + /* indexing is off by one */ + if (pos) + pos++; + + trace_seq_puts(s, filter->filter_string); + if (pe->lasterr > 0) { + trace_seq_printf(s, "\n%*s", pos, "^"); + trace_seq_printf(s, "\nparse_error: %s\n", err_text[pe->lasterr]); + tracing_log_err(tr, "event filter parse error", + filter->filter_string, err_text, + pe->lasterr, pe->lasterr_pos); + } else { + trace_seq_printf(s, "\nError: (%d)\n", pe->lasterr); + tracing_log_err(tr, "event filter parse error", + filter->filter_string, err_text, + FILT_ERR_ERRNO, 0); + } + trace_seq_putc(s, 0); + buf = kmemdup_nul(s->buffer, s->seq.len, GFP_KERNEL); + if (buf) { + kfree(filter->filter_string); + filter->filter_string = buf; + } + kfree(s); +} + +static inline struct event_filter *event_filter(struct trace_event_file *file) +{ + return file->filter; +} + +/* caller must hold event_mutex */ +void print_event_filter(struct trace_event_file *file, struct trace_seq *s) +{ + struct event_filter *filter = event_filter(file); + + if (filter && filter->filter_string) + trace_seq_printf(s, "%s\n", filter->filter_string); + else + trace_seq_puts(s, "none\n"); +} + +void print_subsystem_event_filter(struct event_subsystem *system, + struct trace_seq *s) +{ + struct event_filter *filter; + + mutex_lock(&event_mutex); + filter = system->filter; + if (filter && filter->filter_string) + trace_seq_printf(s, "%s\n", filter->filter_string); + else + trace_seq_puts(s, DEFAULT_SYS_FILTER_MESSAGE "\n"); + mutex_unlock(&event_mutex); +} + +static void free_prog(struct event_filter *filter) +{ + struct prog_entry *prog; + int i; + + prog = rcu_access_pointer(filter->prog); + if (!prog) + return; + + for (i = 0; prog[i].pred; i++) + free_predicate(prog[i].pred); + kfree(prog); +} + +static void filter_disable(struct trace_event_file *file) +{ + unsigned long old_flags = file->flags; + + file->flags &= ~EVENT_FILE_FL_FILTERED; + + if (old_flags != file->flags) + trace_buffered_event_disable(); +} + +static void __free_filter(struct event_filter *filter) +{ + if (!filter) + return; + + free_prog(filter); + kfree(filter->filter_string); + kfree(filter); +} + +void free_event_filter(struct event_filter *filter) +{ + __free_filter(filter); +} + +static inline void __remove_filter(struct trace_event_file *file) +{ + filter_disable(file); + remove_filter_string(file->filter); +} + +static void filter_free_subsystem_preds(struct trace_subsystem_dir *dir, + struct trace_array *tr) +{ + struct trace_event_file *file; + + list_for_each_entry(file, &tr->events, list) { + if (file->system != dir) + continue; + __remove_filter(file); + } +} + +static 
inline void __free_subsystem_filter(struct trace_event_file *file) +{ + __free_filter(file->filter); + file->filter = NULL; +} + +static void filter_free_subsystem_filters(struct trace_subsystem_dir *dir, + struct trace_array *tr) +{ + struct trace_event_file *file; + + list_for_each_entry(file, &tr->events, list) { + if (file->system != dir) + continue; + __free_subsystem_filter(file); + } +} + +int filter_assign_type(const char *type) +{ + if (strstr(type, "__data_loc")) { + if (strstr(type, "char")) + return FILTER_DYN_STRING; + if (strstr(type, "cpumask_t")) + return FILTER_CPUMASK; + } + + if (strstr(type, "__rel_loc") && strstr(type, "char")) + return FILTER_RDYN_STRING; + + if (strchr(type, '[') && strstr(type, "char")) + return FILTER_STATIC_STRING; + + if (strcmp(type, "char *") == 0 || strcmp(type, "const char *") == 0) + return FILTER_PTR_STRING; + + return FILTER_OTHER; +} + +static enum filter_pred_fn select_comparison_fn(enum filter_op_ids op, + int field_size, int field_is_signed) +{ + enum filter_pred_fn fn = FILTER_PRED_FN_NOP; + int pred_func_index = -1; + + switch (op) { + case OP_EQ: + case OP_NE: + break; + default: + if (WARN_ON_ONCE(op < PRED_FUNC_START)) + return fn; + pred_func_index = op - PRED_FUNC_START; + if (WARN_ON_ONCE(pred_func_index > PRED_FUNC_MAX)) + return fn; + } + + switch (field_size) { + case 8: + if (pred_func_index < 0) + fn = FILTER_PRED_FN_64; + else if (field_is_signed) + fn = FILTER_PRED_FN_S64; + else + fn = FILTER_PRED_FN_U64; + break; + case 4: + if (pred_func_index < 0) + fn = FILTER_PRED_FN_32; + else if (field_is_signed) + fn = FILTER_PRED_FN_S32; + else + fn = FILTER_PRED_FN_U32; + break; + case 2: + if (pred_func_index < 0) + fn = FILTER_PRED_FN_16; + else if (field_is_signed) + fn = FILTER_PRED_FN_S16; + else + fn = FILTER_PRED_FN_U16; + break; + case 1: + if (pred_func_index < 0) + fn = FILTER_PRED_FN_8; + else if (field_is_signed) + fn = FILTER_PRED_FN_S8; + else + fn = FILTER_PRED_FN_U8; + break; + } + + return fn; +} + + +static int filter_pred_fn_call(struct filter_pred *pred, void *event) +{ + switch (pred->fn_num) { + case FILTER_PRED_FN_64: + return filter_pred_64(pred, event); + case FILTER_PRED_FN_64_CPUMASK: + return filter_pred_64_cpumask(pred, event); + case FILTER_PRED_FN_S64: + return filter_pred_s64(pred, event); + case FILTER_PRED_FN_U64: + return filter_pred_u64(pred, event); + case FILTER_PRED_FN_32: + return filter_pred_32(pred, event); + case FILTER_PRED_FN_32_CPUMASK: + return filter_pred_32_cpumask(pred, event); + case FILTER_PRED_FN_S32: + return filter_pred_s32(pred, event); + case FILTER_PRED_FN_U32: + return filter_pred_u32(pred, event); + case FILTER_PRED_FN_16: + return filter_pred_16(pred, event); + case FILTER_PRED_FN_16_CPUMASK: + return filter_pred_16_cpumask(pred, event); + case FILTER_PRED_FN_S16: + return filter_pred_s16(pred, event); + case FILTER_PRED_FN_U16: + return filter_pred_u16(pred, event); + case FILTER_PRED_FN_8: + return filter_pred_8(pred, event); + case FILTER_PRED_FN_8_CPUMASK: + return filter_pred_8_cpumask(pred, event); + case FILTER_PRED_FN_S8: + return filter_pred_s8(pred, event); + case FILTER_PRED_FN_U8: + return filter_pred_u8(pred, event); + case FILTER_PRED_FN_COMM: + return filter_pred_comm(pred, event); + case FILTER_PRED_FN_STRING: + return filter_pred_string(pred, event); + case FILTER_PRED_FN_STRLOC: + return filter_pred_strloc(pred, event); + case FILTER_PRED_FN_STRRELLOC: + return filter_pred_strrelloc(pred, event); + case FILTER_PRED_FN_PCHAR_USER: + return 
filter_pred_pchar_user(pred, event); + case FILTER_PRED_FN_PCHAR: + return filter_pred_pchar(pred, event); + case FILTER_PRED_FN_CPU: + return filter_pred_cpu(pred, event); + case FILTER_PRED_FN_CPU_CPUMASK: + return filter_pred_cpu_cpumask(pred, event); + case FILTER_PRED_FN_CPUMASK: + return filter_pred_cpumask(pred, event); + case FILTER_PRED_FN_CPUMASK_CPU: + return filter_pred_cpumask_cpu(pred, event); + case FILTER_PRED_FN_FUNCTION: + return filter_pred_function(pred, event); + case FILTER_PRED_TEST_VISITED: + return test_pred_visited_fn(pred, event); + default: + return 0; + } +} + +/* Called when a predicate is encountered by predicate_parse() */ +static int parse_pred(const char *str, void *data, + int pos, struct filter_parse_error *pe, + struct filter_pred **pred_ptr) +{ + struct trace_event_call *call = data; + struct ftrace_event_field *field; + struct filter_pred *pred = NULL; + unsigned long offset; + unsigned long size; + unsigned long ip; + char num_buf[24]; /* Big enough to hold an address */ + char *field_name; + char *name; + bool function = false; + bool ustring = false; + char q; + u64 val; + int len; + int ret; + int op; + int s; + int i = 0; + + /* First find the field to associate to */ + while (isspace(str[i])) + i++; + s = i; + + while (isalnum(str[i]) || str[i] == '_') + i++; + + len = i - s; + + if (!len) + return -1; + + field_name = kmemdup_nul(str + s, len, GFP_KERNEL); + if (!field_name) + return -ENOMEM; + + /* Make sure that the field exists */ + + field = trace_find_event_field(call, field_name); + kfree(field_name); + if (!field) { + parse_error(pe, FILT_ERR_FIELD_NOT_FOUND, pos + i); + return -EINVAL; + } + + /* See if the field is a user space string */ + if ((len = str_has_prefix(str + i, ".ustring"))) { + ustring = true; + i += len; + } + + /* See if the field is a kernel function name */ + if ((len = str_has_prefix(str + i, ".function"))) { + function = true; + i += len; + } + + while (isspace(str[i])) + i++; + + /* Make sure this op is supported */ + for (op = 0; ops[op]; op++) { + /* This is why '<=' must come before '<' in ops[] */ + if (strncmp(str + i, ops[op], strlen(ops[op])) == 0) + break; + } + + if (!ops[op]) { + parse_error(pe, FILT_ERR_INVALID_OP, pos + i); + goto err_free; + } + + i += strlen(ops[op]); + + while (isspace(str[i])) + i++; + + s = i; + + pred = kzalloc(sizeof(*pred), GFP_KERNEL); + if (!pred) + return -ENOMEM; + + pred->field = field; + pred->offset = field->offset; + pred->op = op; + + if (function) { + /* The field must be the same size as long */ + if (field->size != sizeof(long)) { + parse_error(pe, FILT_ERR_ILLEGAL_FIELD_OP, pos + i); + goto err_free; + } + + /* Function only works with '==' or '!=' and an unquoted string */ + switch (op) { + case OP_NE: + case OP_EQ: + break; + default: + parse_error(pe, FILT_ERR_INVALID_OP, pos + i); + goto err_free; + } + + if (isdigit(str[i])) { + /* We allow 0xDEADBEEF */ + while (isalnum(str[i])) + i++; + + len = i - s; + /* 0xfeedfacedeadbeef is 18 chars max */ + if (len >= sizeof(num_buf)) { + parse_error(pe, FILT_ERR_OPERAND_TOO_LONG, pos + i); + goto err_free; + } + + strncpy(num_buf, str + s, len); + num_buf[len] = 0; + + ret = kstrtoul(num_buf, 0, &ip); + if (ret) { + parse_error(pe, FILT_ERR_INVALID_VALUE, pos + i); + goto err_free; + } + } else { + s = i; + for (; str[i] && !isspace(str[i]); i++) + ; + + len = i - s; + name = kmemdup_nul(str + s, len, GFP_KERNEL); + if (!name) + goto err_mem; + ip = kallsyms_lookup_name(name); + kfree(name); + if (!ip) { + 
parse_error(pe, FILT_ERR_NO_FUNCTION, pos + i); + goto err_free; + } + } + + /* Now find the function start and end address */ + if (!kallsyms_lookup_size_offset(ip, &size, &offset)) { + parse_error(pe, FILT_ERR_NO_FUNCTION, pos + i); + goto err_free; + } + + pred->fn_num = FILTER_PRED_FN_FUNCTION; + pred->val = ip - offset; + pred->val2 = pred->val + size; + + } else if (ftrace_event_is_function(call)) { + /* + * Perf does things different with function events. + * It only allows an "ip" field, and expects a string. + * But the string does not need to be surrounded by quotes. + * If it is a string, the assigned function as a nop, + * (perf doesn't use it) and grab everything. + */ + if (strcmp(field->name, "ip") != 0) { + parse_error(pe, FILT_ERR_IP_FIELD_ONLY, pos + i); + goto err_free; + } + pred->fn_num = FILTER_PRED_FN_NOP; + + /* + * Quotes are not required, but if they exist then we need + * to read them till we hit a matching one. + */ + if (str[i] == '\'' || str[i] == '"') + q = str[i]; + else + q = 0; + + for (i++; str[i]; i++) { + if (q && str[i] == q) + break; + if (!q && (str[i] == ')' || str[i] == '&' || + str[i] == '|')) + break; + } + /* Skip quotes */ + if (q) + s++; + len = i - s; + if (len >= MAX_FILTER_STR_VAL) { + parse_error(pe, FILT_ERR_OPERAND_TOO_LONG, pos + i); + goto err_free; + } + + pred->regex = kzalloc(sizeof(*pred->regex), GFP_KERNEL); + if (!pred->regex) + goto err_mem; + pred->regex->len = len; + strncpy(pred->regex->pattern, str + s, len); + pred->regex->pattern[len] = 0; + + } else if (!strncmp(str + i, "CPUS", 4)) { + unsigned int maskstart; + bool single; + char *tmp; + + switch (field->filter_type) { + case FILTER_CPUMASK: + case FILTER_CPU: + case FILTER_OTHER: + break; + default: + parse_error(pe, FILT_ERR_ILLEGAL_FIELD_OP, pos + i); + goto err_free; + } + + switch (op) { + case OP_EQ: + case OP_NE: + case OP_BAND: + break; + default: + parse_error(pe, FILT_ERR_ILLEGAL_FIELD_OP, pos + i); + goto err_free; + } + + /* Skip CPUS */ + i += 4; + if (str[i++] != '{') { + parse_error(pe, FILT_ERR_MISSING_BRACE_OPEN, pos + i); + goto err_free; + } + maskstart = i; + + /* Walk the cpulist until closing } */ + for (; str[i] && str[i] != '}'; i++) + ; + + if (str[i] != '}') { + parse_error(pe, FILT_ERR_MISSING_BRACE_CLOSE, pos + i); + goto err_free; + } + + if (maskstart == i) { + parse_error(pe, FILT_ERR_INVALID_CPULIST, pos + i); + goto err_free; + } + + /* Copy the cpulist between { and } */ + tmp = kmalloc((i - maskstart) + 1, GFP_KERNEL); + if (!tmp) + goto err_mem; + + strscpy(tmp, str + maskstart, (i - maskstart) + 1); + pred->mask = kzalloc(cpumask_size(), GFP_KERNEL); + if (!pred->mask) { + kfree(tmp); + goto err_mem; + } + + /* Now parse it */ + if (cpulist_parse(tmp, pred->mask)) { + kfree(tmp); + parse_error(pe, FILT_ERR_INVALID_CPULIST, pos + i); + goto err_free; + } + kfree(tmp); + + /* Move along */ + i++; + + /* + * Optimisation: if the user-provided mask has a weight of one + * then we can treat it as a scalar input. + */ + single = cpumask_weight(pred->mask) == 1; + if (single) { + pred->val = cpumask_first(pred->mask); + kfree(pred->mask); + pred->mask = NULL; + } + + if (field->filter_type == FILTER_CPUMASK) { + pred->fn_num = single ? 
+ FILTER_PRED_FN_CPUMASK_CPU : + FILTER_PRED_FN_CPUMASK; + } else if (field->filter_type == FILTER_CPU) { + if (single) { + if (pred->op == OP_BAND) + pred->op = OP_EQ; + + pred->fn_num = FILTER_PRED_FN_CPU; + } else { + pred->fn_num = FILTER_PRED_FN_CPU_CPUMASK; + } + } else if (single) { + if (pred->op == OP_BAND) + pred->op = OP_EQ; + + pred->fn_num = select_comparison_fn(pred->op, field->size, false); + if (pred->op == OP_NE) + pred->not = 1; + } else { + switch (field->size) { + case 8: + pred->fn_num = FILTER_PRED_FN_64_CPUMASK; + break; + case 4: + pred->fn_num = FILTER_PRED_FN_32_CPUMASK; + break; + case 2: + pred->fn_num = FILTER_PRED_FN_16_CPUMASK; + break; + case 1: + pred->fn_num = FILTER_PRED_FN_8_CPUMASK; + break; + } + } + + /* This is either a string, or an integer */ + } else if (str[i] == '\'' || str[i] == '"') { + char q = str[i]; + + /* Make sure the op is OK for strings */ + switch (op) { + case OP_NE: + pred->not = 1; + fallthrough; + case OP_GLOB: + case OP_EQ: + break; + default: + parse_error(pe, FILT_ERR_ILLEGAL_FIELD_OP, pos + i); + goto err_free; + } + + /* Make sure the field is OK for strings */ + if (!is_string_field(field)) { + parse_error(pe, FILT_ERR_EXPECT_DIGIT, pos + i); + goto err_free; + } + + for (i++; str[i]; i++) { + if (str[i] == q) + break; + } + if (!str[i]) { + parse_error(pe, FILT_ERR_MISSING_QUOTE, pos + i); + goto err_free; + } + + /* Skip quotes */ + s++; + len = i - s; + if (len >= MAX_FILTER_STR_VAL) { + parse_error(pe, FILT_ERR_OPERAND_TOO_LONG, pos + i); + goto err_free; + } + + pred->regex = kzalloc(sizeof(*pred->regex), GFP_KERNEL); + if (!pred->regex) + goto err_mem; + pred->regex->len = len; + strncpy(pred->regex->pattern, str + s, len); + pred->regex->pattern[len] = 0; + + filter_build_regex(pred); + + if (field->filter_type == FILTER_COMM) { + pred->fn_num = FILTER_PRED_FN_COMM; + + } else if (field->filter_type == FILTER_STATIC_STRING) { + pred->fn_num = FILTER_PRED_FN_STRING; + pred->regex->field_len = field->size; + + } else if (field->filter_type == FILTER_DYN_STRING) { + pred->fn_num = FILTER_PRED_FN_STRLOC; + } else if (field->filter_type == FILTER_RDYN_STRING) + pred->fn_num = FILTER_PRED_FN_STRRELLOC; + else { + + if (!ustring_per_cpu) { + /* Once allocated, keep it around for good */ + ustring_per_cpu = alloc_percpu(struct ustring_buffer); + if (!ustring_per_cpu) + goto err_mem; + } + + if (ustring) + pred->fn_num = FILTER_PRED_FN_PCHAR_USER; + else + pred->fn_num = FILTER_PRED_FN_PCHAR; + } + /* go past the last quote */ + i++; + + } else if (isdigit(str[i]) || str[i] == '-') { + + /* Make sure the field is not a string */ + if (is_string_field(field)) { + parse_error(pe, FILT_ERR_EXPECT_STRING, pos + i); + goto err_free; + } + + if (op == OP_GLOB) { + parse_error(pe, FILT_ERR_ILLEGAL_FIELD_OP, pos + i); + goto err_free; + } + + if (str[i] == '-') + i++; + + /* We allow 0xDEADBEEF */ + while (isalnum(str[i])) + i++; + + len = i - s; + /* 0xfeedfacedeadbeef is 18 chars max */ + if (len >= sizeof(num_buf)) { + parse_error(pe, FILT_ERR_OPERAND_TOO_LONG, pos + i); + goto err_free; + } + + strncpy(num_buf, str + s, len); + num_buf[len] = 0; + + /* Make sure it is a value */ + if (field->is_signed) + ret = kstrtoll(num_buf, 0, &val); + else + ret = kstrtoull(num_buf, 0, &val); + if (ret) { + parse_error(pe, FILT_ERR_ILLEGAL_INTVAL, pos + s); + goto err_free; + } + + pred->val = val; + + if (field->filter_type == FILTER_CPU) + pred->fn_num = FILTER_PRED_FN_CPU; + else { + pred->fn_num = select_comparison_fn(pred->op, 
field->size, + field->is_signed); + if (pred->op == OP_NE) + pred->not = 1; + } + + } else { + parse_error(pe, FILT_ERR_INVALID_VALUE, pos + i); + goto err_free; + } + + *pred_ptr = pred; + return i; + +err_free: + free_predicate(pred); + return -EINVAL; +err_mem: + free_predicate(pred); + return -ENOMEM; +} + +enum { + TOO_MANY_CLOSE = -1, + TOO_MANY_OPEN = -2, + MISSING_QUOTE = -3, +}; + +/* + * Read the filter string once to calculate the number of predicates + * as well as how deep the parentheses go. + * + * Returns: + * 0 - everything is fine (err is undefined) + * -1 - too many ')' + * -2 - too many '(' + * -3 - No matching quote + */ +static int calc_stack(const char *str, int *parens, int *preds, int *err) +{ + bool is_pred = false; + int nr_preds = 0; + int open = 1; /* Count the expression as "(E)" */ + int last_quote = 0; + int max_open = 1; + int quote = 0; + int i; + + *err = 0; + + for (i = 0; str[i]; i++) { + if (isspace(str[i])) + continue; + if (quote) { + if (str[i] == quote) + quote = 0; + continue; + } + + switch (str[i]) { + case '\'': + case '"': + quote = str[i]; + last_quote = i; + break; + case '|': + case '&': + if (str[i+1] != str[i]) + break; + is_pred = false; + continue; + case '(': + is_pred = false; + open++; + if (open > max_open) + max_open = open; + continue; + case ')': + is_pred = false; + if (open == 1) { + *err = i; + return TOO_MANY_CLOSE; + } + open--; + continue; + } + if (!is_pred) { + nr_preds++; + is_pred = true; + } + } + + if (quote) { + *err = last_quote; + return MISSING_QUOTE; + } + + if (open != 1) { + int level = open; + + /* find the bad open */ + for (i--; i; i--) { + if (quote) { + if (str[i] == quote) + quote = 0; + continue; + } + switch (str[i]) { + case '(': + if (level == open) { + *err = i; + return TOO_MANY_OPEN; + } + level--; + break; + case ')': + level++; + break; + case '\'': + case '"': + quote = str[i]; + break; + } + } + /* First character is the '(' with missing ')' */ + *err = 0; + return TOO_MANY_OPEN; + } + + /* Set the size of the required stacks */ + *parens = max_open; + *preds = nr_preds; + return 0; +} + +static int process_preds(struct trace_event_call *call, + const char *filter_string, + struct event_filter *filter, + struct filter_parse_error *pe) +{ + struct prog_entry *prog; + int nr_parens; + int nr_preds; + int index; + int ret; + + ret = calc_stack(filter_string, &nr_parens, &nr_preds, &index); + if (ret < 0) { + switch (ret) { + case MISSING_QUOTE: + parse_error(pe, FILT_ERR_MISSING_QUOTE, index); + break; + case TOO_MANY_OPEN: + parse_error(pe, FILT_ERR_TOO_MANY_OPEN, index); + break; + default: + parse_error(pe, FILT_ERR_TOO_MANY_CLOSE, index); + } + return ret; + } + + if (!nr_preds) + return -EINVAL; + + prog = predicate_parse(filter_string, nr_parens, nr_preds, + parse_pred, call, pe); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + rcu_assign_pointer(filter->prog, prog); + return 0; +} + +static inline void event_set_filtered_flag(struct trace_event_file *file) +{ + unsigned long old_flags = file->flags; + + file->flags |= EVENT_FILE_FL_FILTERED; + + if (old_flags != file->flags) + trace_buffered_event_enable(); +} + +static inline void event_set_filter(struct trace_event_file *file, + struct event_filter *filter) +{ + rcu_assign_pointer(file->filter, filter); +} + +static inline void event_clear_filter(struct trace_event_file *file) +{ + RCU_INIT_POINTER(file->filter, NULL); +} + +struct filter_list { + struct list_head list; + struct event_filter *filter; +}; + +static int 
process_system_preds(struct trace_subsystem_dir *dir, + struct trace_array *tr, + struct filter_parse_error *pe, + char *filter_string) +{ + struct trace_event_file *file; + struct filter_list *filter_item; + struct event_filter *filter = NULL; + struct filter_list *tmp; + LIST_HEAD(filter_list); + bool fail = true; + int err; + + list_for_each_entry(file, &tr->events, list) { + + if (file->system != dir) + continue; + + filter = kzalloc(sizeof(*filter), GFP_KERNEL); + if (!filter) + goto fail_mem; + + filter->filter_string = kstrdup(filter_string, GFP_KERNEL); + if (!filter->filter_string) + goto fail_mem; + + err = process_preds(file->event_call, filter_string, filter, pe); + if (err) { + filter_disable(file); + parse_error(pe, FILT_ERR_BAD_SUBSYS_FILTER, 0); + append_filter_err(tr, pe, filter); + } else + event_set_filtered_flag(file); + + + filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL); + if (!filter_item) + goto fail_mem; + + list_add_tail(&filter_item->list, &filter_list); + /* + * Regardless of if this returned an error, we still + * replace the filter for the call. + */ + filter_item->filter = event_filter(file); + event_set_filter(file, filter); + filter = NULL; + + fail = false; + } + + if (fail) + goto fail; + + /* + * The calls can still be using the old filters. + * Do a synchronize_rcu() and to ensure all calls are + * done with them before we free them. + */ + tracepoint_synchronize_unregister(); + list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { + __free_filter(filter_item->filter); + list_del(&filter_item->list); + kfree(filter_item); + } + return 0; + fail: + /* No call succeeded */ + list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { + list_del(&filter_item->list); + kfree(filter_item); + } + parse_error(pe, FILT_ERR_BAD_SUBSYS_FILTER, 0); + return -EINVAL; + fail_mem: + __free_filter(filter); + /* If any call succeeded, we still need to sync */ + if (!fail) + tracepoint_synchronize_unregister(); + list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { + __free_filter(filter_item->filter); + list_del(&filter_item->list); + kfree(filter_item); + } + return -ENOMEM; +} + +static int create_filter_start(char *filter_string, bool set_str, + struct filter_parse_error **pse, + struct event_filter **filterp) +{ + struct event_filter *filter; + struct filter_parse_error *pe = NULL; + int err = 0; + + if (WARN_ON_ONCE(*pse || *filterp)) + return -EINVAL; + + filter = kzalloc(sizeof(*filter), GFP_KERNEL); + if (filter && set_str) { + filter->filter_string = kstrdup(filter_string, GFP_KERNEL); + if (!filter->filter_string) + err = -ENOMEM; + } + + pe = kzalloc(sizeof(*pe), GFP_KERNEL); + + if (!filter || !pe || err) { + kfree(pe); + __free_filter(filter); + return -ENOMEM; + } + + /* we're committed to creating a new filter */ + *filterp = filter; + *pse = pe; + + return 0; +} + +static void create_filter_finish(struct filter_parse_error *pe) +{ + kfree(pe); +} + +/** + * create_filter - create a filter for a trace_event_call + * @tr: the trace array associated with these events + * @call: trace_event_call to create a filter for + * @filter_string: filter string + * @set_str: remember @filter_str and enable detailed error in filter + * @filterp: out param for created filter (always updated on return) + * Must be a pointer that references a NULL pointer. + * + * Creates a filter for @call with @filter_str. If @set_str is %true, + * @filter_str is copied and recorded in the new filter. 
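For in-kernel users, create_event_filter() just below is the exported wrapper around this helper, and the calling pattern mirrors what the CONFIG_FTRACE_STARTUP_TEST code at the bottom of this file does. A hedged fragment rather than a complete function (the trace array tr and the event call are assumed to already exist):

	struct event_filter *filter = NULL;
	char expr[] = "common_pid != 0";
	int err;

	err = create_event_filter(tr, call, expr, true, &filter);
	if (err) {
		/* With set_str == true the error text is kept in the filter. */
		__free_filter(filter);		/* tolerates NULL */
		return err;
	}
	/* ... filter_match_preds(filter, rec) now decides per record ... */
	__free_filter(filter);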
+ * + * On success, returns 0 and *@filterp points to the new filter. On + * failure, returns -errno and *@filterp may point to %NULL or to a new + * filter. In the latter case, the returned filter contains error + * information if @set_str is %true and the caller is responsible for + * freeing it. + */ +static int create_filter(struct trace_array *tr, + struct trace_event_call *call, + char *filter_string, bool set_str, + struct event_filter **filterp) +{ + struct filter_parse_error *pe = NULL; + int err; + + /* filterp must point to NULL */ + if (WARN_ON(*filterp)) + *filterp = NULL; + + err = create_filter_start(filter_string, set_str, &pe, filterp); + if (err) + return err; + + err = process_preds(call, filter_string, *filterp, pe); + if (err && set_str) + append_filter_err(tr, pe, *filterp); + create_filter_finish(pe); + + return err; +} + +int create_event_filter(struct trace_array *tr, + struct trace_event_call *call, + char *filter_str, bool set_str, + struct event_filter **filterp) +{ + return create_filter(tr, call, filter_str, set_str, filterp); +} + +/** + * create_system_filter - create a filter for an event subsystem + * @dir: the descriptor for the subsystem directory + * @filter_str: filter string + * @filterp: out param for created filter (always updated on return) + * + * Identical to create_filter() except that it creates a subsystem filter + * and always remembers @filter_str. + */ +static int create_system_filter(struct trace_subsystem_dir *dir, + char *filter_str, struct event_filter **filterp) +{ + struct filter_parse_error *pe = NULL; + int err; + + err = create_filter_start(filter_str, true, &pe, filterp); + if (!err) { + err = process_system_preds(dir, dir->tr, pe, filter_str); + if (!err) { + /* System filters just show a default message */ + kfree((*filterp)->filter_string); + (*filterp)->filter_string = NULL; + } else { + append_filter_err(dir->tr, pe, *filterp); + } + } + create_filter_finish(pe); + + return err; +} + +/* caller must hold event_mutex */ +int apply_event_filter(struct trace_event_file *file, char *filter_string) +{ + struct trace_event_call *call = file->event_call; + struct event_filter *filter = NULL; + int err; + + if (file->flags & EVENT_FILE_FL_FREED) + return -ENODEV; + + if (!strcmp(strstrip(filter_string), "0")) { + filter_disable(file); + filter = event_filter(file); + + if (!filter) + return 0; + + event_clear_filter(file); + + /* Make sure the filter is not being used */ + tracepoint_synchronize_unregister(); + __free_filter(filter); + + return 0; + } + + err = create_filter(file->tr, call, filter_string, true, &filter); + + /* + * Always swap the call filter with the new filter + * even if there was an error. 
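This is also the path user space exercises: a write to an event's tracefs "filter" file ends up in apply_event_filter(), and the literal string "0" acts as "clear" (the strcmp against "0" above). A small user-space sketch of that interface (mount point, event and expression are only examples):

#include <fcntl.h>
#include <string.h>
#include <unistd.h>

static int write_filter(const char *path, const char *expr)
{
	int fd = open(path, O_WRONLY);
	ssize_t n;

	if (fd < 0)
		return -1;
	n = write(fd, expr, strlen(expr));
	close(fd);
	return n < 0 ? -1 : 0;	/* on failure, read the file back for the error text */
}

int main(void)
{
	const char *f = "/sys/kernel/tracing/events/sched/sched_switch/filter";

	write_filter(f, "prev_comm ~ \"bash*\" && next_pid > 1");
	/* ... collect a trace ... */
	write_filter(f, "0");	/* "0" clears the filter, as handled above */
	return 0;
}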
If there was an error + * in the filter, we disable the filter and show the error + * string + */ + if (filter) { + struct event_filter *tmp; + + tmp = event_filter(file); + if (!err) + event_set_filtered_flag(file); + else + filter_disable(file); + + event_set_filter(file, filter); + + if (tmp) { + /* Make sure the call is done with the filter */ + tracepoint_synchronize_unregister(); + __free_filter(tmp); + } + } + + return err; +} + +int apply_subsystem_event_filter(struct trace_subsystem_dir *dir, + char *filter_string) +{ + struct event_subsystem *system = dir->subsystem; + struct trace_array *tr = dir->tr; + struct event_filter *filter = NULL; + int err = 0; + + mutex_lock(&event_mutex); + + /* Make sure the system still has events */ + if (!dir->nr_events) { + err = -ENODEV; + goto out_unlock; + } + + if (!strcmp(strstrip(filter_string), "0")) { + filter_free_subsystem_preds(dir, tr); + remove_filter_string(system->filter); + filter = system->filter; + system->filter = NULL; + /* Ensure all filters are no longer used */ + tracepoint_synchronize_unregister(); + filter_free_subsystem_filters(dir, tr); + __free_filter(filter); + goto out_unlock; + } + + err = create_system_filter(dir, filter_string, &filter); + if (filter) { + /* + * No event actually uses the system filter + * we can free it without synchronize_rcu(). + */ + __free_filter(system->filter); + system->filter = filter; + } +out_unlock: + mutex_unlock(&event_mutex); + + return err; +} + +#ifdef CONFIG_PERF_EVENTS + +void ftrace_profile_free_filter(struct perf_event *event) +{ + struct event_filter *filter = event->filter; + + event->filter = NULL; + __free_filter(filter); +} + +struct function_filter_data { + struct ftrace_ops *ops; + int first_filter; + int first_notrace; +}; + +#ifdef CONFIG_FUNCTION_TRACER +static char ** +ftrace_function_filter_re(char *buf, int len, int *count) +{ + char *str, **re; + + str = kstrndup(buf, len, GFP_KERNEL); + if (!str) + return NULL; + + /* + * The argv_split function takes white space + * as a separator, so convert ',' into spaces. + */ + strreplace(str, ',', ' '); + + re = argv_split(GFP_KERNEL, str, count); + kfree(str); + return re; +} + +static int ftrace_function_set_regexp(struct ftrace_ops *ops, int filter, + int reset, char *re, int len) +{ + int ret; + + if (filter) + ret = ftrace_set_filter(ops, re, len, reset); + else + ret = ftrace_set_notrace(ops, re, len, reset); + + return ret; +} + +static int __ftrace_function_set_filter(int filter, char *buf, int len, + struct function_filter_data *data) +{ + int i, re_cnt, ret = -EINVAL; + int *reset; + char **re; + + reset = filter ? &data->first_filter : &data->first_notrace; + + /* + * The 'ip' field could have multiple filters set, separated + * either by space or comma. We first cut the filter and apply + * all pieces separately. 
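The same splitting can be sketched in plain user-space C: normalize ',' to a space, then walk the pieces, with strtok_r() standing in for the kernel's strreplace() + argv_split() (the function names in the buffer are only examples):

#include <stdio.h>
#include <string.h>

int main(void)
{
	char buf[] = "schedule,vfs_read do_sys_open";
	char *p, *tok, *save;

	for (p = buf; *p; p++)		/* ',' -> ' ', as strreplace() does */
		if (*p == ',')
			*p = ' ';

	for (tok = strtok_r(buf, " ", &save); tok; tok = strtok_r(NULL, " ", &save))
		printf("filter piece: %s\n", tok);	/* schedule, vfs_read, do_sys_open */
	return 0;
}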
+ */ + re = ftrace_function_filter_re(buf, len, &re_cnt); + if (!re) + return -EINVAL; + + for (i = 0; i < re_cnt; i++) { + ret = ftrace_function_set_regexp(data->ops, filter, *reset, + re[i], strlen(re[i])); + if (ret) + break; + + if (*reset) + *reset = 0; + } + + argv_free(re); + return ret; +} + +static int ftrace_function_check_pred(struct filter_pred *pred) +{ + struct ftrace_event_field *field = pred->field; + + /* + * Check the predicate for function trace, verify: + * - only '==' and '!=' is used + * - the 'ip' field is used + */ + if ((pred->op != OP_EQ) && (pred->op != OP_NE)) + return -EINVAL; + + if (strcmp(field->name, "ip")) + return -EINVAL; + + return 0; +} + +static int ftrace_function_set_filter_pred(struct filter_pred *pred, + struct function_filter_data *data) +{ + int ret; + + /* Checking the node is valid for function trace. */ + ret = ftrace_function_check_pred(pred); + if (ret) + return ret; + + return __ftrace_function_set_filter(pred->op == OP_EQ, + pred->regex->pattern, + pred->regex->len, + data); +} + +static bool is_or(struct prog_entry *prog, int i) +{ + int target; + + /* + * Only "||" is allowed for function events, thus, + * all true branches should jump to true, and any + * false branch should jump to false. + */ + target = prog[i].target + 1; + /* True and false have NULL preds (all prog entries should jump to one */ + if (prog[target].pred) + return false; + + /* prog[target].target is 1 for TRUE, 0 for FALSE */ + return prog[i].when_to_branch == prog[target].target; +} + +static int ftrace_function_set_filter(struct perf_event *event, + struct event_filter *filter) +{ + struct prog_entry *prog = rcu_dereference_protected(filter->prog, + lockdep_is_held(&event_mutex)); + struct function_filter_data data = { + .first_filter = 1, + .first_notrace = 1, + .ops = &event->ftrace_ops, + }; + int i; + + for (i = 0; prog[i].pred; i++) { + struct filter_pred *pred = prog[i].pred; + + if (!is_or(prog, i)) + return -EINVAL; + + if (ftrace_function_set_filter_pred(pred, &data) < 0) + return -EINVAL; + } + return 0; +} +#else +static int ftrace_function_set_filter(struct perf_event *event, + struct event_filter *filter) +{ + return -ENODEV; +} +#endif /* CONFIG_FUNCTION_TRACER */ + +int ftrace_profile_set_filter(struct perf_event *event, int event_id, + char *filter_str) +{ + int err; + struct event_filter *filter = NULL; + struct trace_event_call *call; + + mutex_lock(&event_mutex); + + call = event->tp_event; + + err = -EINVAL; + if (!call) + goto out_unlock; + + err = -EEXIST; + if (event->filter) + goto out_unlock; + + err = create_filter(NULL, call, filter_str, false, &filter); + if (err) + goto free_filter; + + if (ftrace_event_is_function(call)) + err = ftrace_function_set_filter(event, filter); + else + event->filter = filter; + +free_filter: + if (err || ftrace_event_is_function(call)) + __free_filter(filter); + +out_unlock: + mutex_unlock(&event_mutex); + + return err; +} + +#endif /* CONFIG_PERF_EVENTS */ + +#ifdef CONFIG_FTRACE_STARTUP_TEST + +#include <linux/types.h> +#include <linux/tracepoint.h> + +#define CREATE_TRACE_POINTS +#include "trace_events_filter_test.h" + +#define DATA_REC(m, va, vb, vc, vd, ve, vf, vg, vh, nvisit) \ +{ \ + .filter = FILTER, \ + .rec = { .a = va, .b = vb, .c = vc, .d = vd, \ + .e = ve, .f = vf, .g = vg, .h = vh }, \ + .match = m, \ + .not_visited = nvisit, \ +} +#define YES 1 +#define NO 0 + +static struct test_filter_data_t { + char *filter; + struct trace_event_raw_ftrace_test_filter rec; + int match; + char *not_visited; 
+} test_filter_data[] = { +#define FILTER "a == 1 && b == 1 && c == 1 && d == 1 && " \ + "e == 1 && f == 1 && g == 1 && h == 1" + DATA_REC(YES, 1, 1, 1, 1, 1, 1, 1, 1, ""), + DATA_REC(NO, 0, 1, 1, 1, 1, 1, 1, 1, "bcdefgh"), + DATA_REC(NO, 1, 1, 1, 1, 1, 1, 1, 0, ""), +#undef FILTER +#define FILTER "a == 1 || b == 1 || c == 1 || d == 1 || " \ + "e == 1 || f == 1 || g == 1 || h == 1" + DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 0, ""), + DATA_REC(YES, 0, 0, 0, 0, 0, 0, 0, 1, ""), + DATA_REC(YES, 1, 0, 0, 0, 0, 0, 0, 0, "bcdefgh"), +#undef FILTER +#define FILTER "(a == 1 || b == 1) && (c == 1 || d == 1) && " \ + "(e == 1 || f == 1) && (g == 1 || h == 1)" + DATA_REC(NO, 0, 0, 1, 1, 1, 1, 1, 1, "dfh"), + DATA_REC(YES, 0, 1, 0, 1, 0, 1, 0, 1, ""), + DATA_REC(YES, 1, 0, 1, 0, 0, 1, 0, 1, "bd"), + DATA_REC(NO, 1, 0, 1, 0, 0, 1, 0, 0, "bd"), +#undef FILTER +#define FILTER "(a == 1 && b == 1) || (c == 1 && d == 1) || " \ + "(e == 1 && f == 1) || (g == 1 && h == 1)" + DATA_REC(YES, 1, 0, 1, 1, 1, 1, 1, 1, "efgh"), + DATA_REC(YES, 0, 0, 0, 0, 0, 0, 1, 1, ""), + DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 1, ""), +#undef FILTER +#define FILTER "(a == 1 && b == 1) && (c == 1 && d == 1) && " \ + "(e == 1 && f == 1) || (g == 1 && h == 1)" + DATA_REC(YES, 1, 1, 1, 1, 1, 1, 0, 0, "gh"), + DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 1, ""), + DATA_REC(YES, 1, 1, 1, 1, 1, 0, 1, 1, ""), +#undef FILTER +#define FILTER "((a == 1 || b == 1) || (c == 1 || d == 1) || " \ + "(e == 1 || f == 1)) && (g == 1 || h == 1)" + DATA_REC(YES, 1, 1, 1, 1, 1, 1, 0, 1, "bcdef"), + DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 0, ""), + DATA_REC(YES, 1, 1, 1, 1, 1, 0, 1, 1, "h"), +#undef FILTER +#define FILTER "((((((((a == 1) && (b == 1)) || (c == 1)) && (d == 1)) || " \ + "(e == 1)) && (f == 1)) || (g == 1)) && (h == 1))" + DATA_REC(YES, 1, 1, 1, 1, 1, 1, 1, 1, "ceg"), + DATA_REC(NO, 0, 1, 0, 1, 0, 1, 0, 1, ""), + DATA_REC(NO, 1, 0, 1, 0, 1, 0, 1, 0, ""), +#undef FILTER +#define FILTER "((((((((a == 1) || (b == 1)) && (c == 1)) || (d == 1)) && " \ + "(e == 1)) || (f == 1)) && (g == 1)) || (h == 1))" + DATA_REC(YES, 1, 1, 1, 1, 1, 1, 1, 1, "bdfh"), + DATA_REC(YES, 0, 1, 0, 1, 0, 1, 0, 1, ""), + DATA_REC(YES, 1, 0, 1, 0, 1, 0, 1, 0, "bdfh"), +}; + +#undef DATA_REC +#undef FILTER +#undef YES +#undef NO + +#define DATA_CNT ARRAY_SIZE(test_filter_data) + +static int test_pred_visited; + +static int test_pred_visited_fn(struct filter_pred *pred, void *event) +{ + struct ftrace_event_field *field = pred->field; + + test_pred_visited = 1; + printk(KERN_INFO "\npred visited %s\n", field->name); + return 1; +} + +static void update_pred_fn(struct event_filter *filter, char *fields) +{ + struct prog_entry *prog = rcu_dereference_protected(filter->prog, + lockdep_is_held(&event_mutex)); + int i; + + for (i = 0; prog[i].pred; i++) { + struct filter_pred *pred = prog[i].pred; + struct ftrace_event_field *field = pred->field; + + WARN_ON_ONCE(pred->fn_num == FILTER_PRED_FN_NOP); + + if (!field) { + WARN_ONCE(1, "all leafs should have field defined %d", i); + continue; + } + + if (!strchr(fields, *field->name)) + continue; + + pred->fn_num = FILTER_PRED_TEST_VISITED; + } +} + +static __init int ftrace_test_event_filter(void) +{ + int i; + + printk(KERN_INFO "Testing ftrace filter: "); + + for (i = 0; i < DATA_CNT; i++) { + struct event_filter *filter = NULL; + struct test_filter_data_t *d = &test_filter_data[i]; + int err; + + err = create_filter(NULL, &event_ftrace_test_filter, + d->filter, false, &filter); + if (err) { + printk(KERN_INFO + "Failed to get filter for '%s', err %d\n", 
+ d->filter, err); + __free_filter(filter); + break; + } + + /* Needed to dereference filter->prog */ + mutex_lock(&event_mutex); + /* + * The preemption disabling is not really needed for self + * tests, but the rcu dereference will complain without it. + */ + preempt_disable(); + if (*d->not_visited) + update_pred_fn(filter, d->not_visited); + + test_pred_visited = 0; + err = filter_match_preds(filter, &d->rec); + preempt_enable(); + + mutex_unlock(&event_mutex); + + __free_filter(filter); + + if (test_pred_visited) { + printk(KERN_INFO + "Failed, unwanted pred visited for filter %s\n", + d->filter); + break; + } + + if (err != d->match) { + printk(KERN_INFO + "Failed to match filter '%s', expected %d\n", + d->filter, d->match); + break; + } + } + + if (i == DATA_CNT) + printk(KERN_CONT "OK\n"); + + return 0; +} + +late_initcall(ftrace_test_event_filter); + +#endif /* CONFIG_FTRACE_STARTUP_TEST */ diff --git a/kernel/trace/trace_events_filter_test.h b/kernel/trace/trace_events_filter_test.h new file mode 100644 index 0000000000..e651dfbd34 --- /dev/null +++ b/kernel/trace/trace_events_filter_test.h @@ -0,0 +1,51 @@ +// SPDX-License-Identifier: GPL-2.0 +#undef TRACE_SYSTEM +#define TRACE_SYSTEM test + +#if !defined(_TRACE_TEST_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_TEST_H + +#include <linux/tracepoint.h> + +TRACE_EVENT(ftrace_test_filter, + + TP_PROTO(int a, int b, int c, int d, int e, int f, int g, int h), + + TP_ARGS(a, b, c, d, e, f, g, h), + + TP_STRUCT__entry( + __field(int, a) + __field(int, b) + __field(int, c) + __field(int, d) + __field(int, e) + __field(int, f) + __field(int, g) + __field(int, h) + ), + + TP_fast_assign( + __entry->a = a; + __entry->b = b; + __entry->c = c; + __entry->d = d; + __entry->e = e; + __entry->f = f; + __entry->g = g; + __entry->h = h; + ), + + TP_printk("a %d, b %d, c %d, d %d, e %d, f %d, g %d, h %d", + __entry->a, __entry->b, __entry->c, __entry->d, + __entry->e, __entry->f, __entry->g, __entry->h) +); + +#endif /* _TRACE_TEST_H || TRACE_HEADER_MULTI_READ */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH . 
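This header defines the one event the startup self-test above filters against: every DATA_REC() row in the table is simply an initializer for the generated struct trace_event_raw_ftrace_test_filter, which is then handed to filter_match_preds(). A hypothetical extra block in the same style (not part of the file) would look like:

#define FILTER "a == 1 || h == 1"
	/* a matches first, so the h predicate must be short-circuited */
	DATA_REC(YES, 1, 0, 0, 0, 0, 0, 0, 0, "h"),
	/* nothing matches and both predicates are visited */
	DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 0, ""),
#undef FILTER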
+#define TRACE_INCLUDE_FILE trace_events_filter_test + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c new file mode 100644 index 0000000000..68aaf0bd7a --- /dev/null +++ b/kernel/trace/trace_events_hist.c @@ -0,0 +1,6866 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * trace_events_hist - trace event hist triggers + * + * Copyright (C) 2015 Tom Zanussi <tom.zanussi@linux.intel.com> + */ + +#include <linux/module.h> +#include <linux/kallsyms.h> +#include <linux/security.h> +#include <linux/mutex.h> +#include <linux/slab.h> +#include <linux/stacktrace.h> +#include <linux/rculist.h> +#include <linux/tracefs.h> + +/* for gfp flag names */ +#include <linux/trace_events.h> +#include <trace/events/mmflags.h> + +#include "tracing_map.h" +#include "trace_synth.h" + +#define ERRORS \ + C(NONE, "No error"), \ + C(DUPLICATE_VAR, "Variable already defined"), \ + C(VAR_NOT_UNIQUE, "Variable name not unique, need to use fully qualified name (subsys.event.var) for variable"), \ + C(TOO_MANY_VARS, "Too many variables defined"), \ + C(MALFORMED_ASSIGNMENT, "Malformed assignment"), \ + C(NAMED_MISMATCH, "Named hist trigger doesn't match existing named trigger (includes variables)"), \ + C(TRIGGER_EEXIST, "Hist trigger already exists"), \ + C(TRIGGER_ENOENT_CLEAR, "Can't clear or continue a nonexistent hist trigger"), \ + C(SET_CLOCK_FAIL, "Couldn't set trace_clock"), \ + C(BAD_FIELD_MODIFIER, "Invalid field modifier"), \ + C(TOO_MANY_SUBEXPR, "Too many subexpressions (3 max)"), \ + C(TIMESTAMP_MISMATCH, "Timestamp units in expression don't match"), \ + C(TOO_MANY_FIELD_VARS, "Too many field variables defined"), \ + C(EVENT_FILE_NOT_FOUND, "Event file not found"), \ + C(HIST_NOT_FOUND, "Matching event histogram not found"), \ + C(HIST_CREATE_FAIL, "Couldn't create histogram for field"), \ + C(SYNTH_VAR_NOT_FOUND, "Couldn't find synthetic variable"), \ + C(SYNTH_EVENT_NOT_FOUND,"Couldn't find synthetic event"), \ + C(SYNTH_TYPE_MISMATCH, "Param type doesn't match synthetic event field type"), \ + C(SYNTH_COUNT_MISMATCH, "Param count doesn't match synthetic event field count"), \ + C(FIELD_VAR_PARSE_FAIL, "Couldn't parse field variable"), \ + C(VAR_CREATE_FIND_FAIL, "Couldn't create or find variable"), \ + C(ONX_NOT_VAR, "For onmax(x) or onchange(x), x must be a variable"), \ + C(ONX_VAR_NOT_FOUND, "Couldn't find onmax or onchange variable"), \ + C(ONX_VAR_CREATE_FAIL, "Couldn't create onmax or onchange variable"), \ + C(FIELD_VAR_CREATE_FAIL,"Couldn't create field variable"), \ + C(TOO_MANY_PARAMS, "Too many action params"), \ + C(PARAM_NOT_FOUND, "Couldn't find param"), \ + C(INVALID_PARAM, "Invalid action param"), \ + C(ACTION_NOT_FOUND, "No action found"), \ + C(NO_SAVE_PARAMS, "No params found for save()"), \ + C(TOO_MANY_SAVE_ACTIONS,"Can't have more than one save() action per hist"), \ + C(ACTION_MISMATCH, "Handler doesn't support action"), \ + C(NO_CLOSING_PAREN, "No closing paren found"), \ + C(SUBSYS_NOT_FOUND, "Missing subsystem"), \ + C(INVALID_SUBSYS_EVENT, "Invalid subsystem or event name"), \ + C(INVALID_REF_KEY, "Using variable references in keys not supported"), \ + C(VAR_NOT_FOUND, "Couldn't find variable"), \ + C(FIELD_NOT_FOUND, "Couldn't find field"), \ + C(EMPTY_ASSIGNMENT, "Empty assignment"), \ + C(INVALID_SORT_MODIFIER,"Invalid sort modifier"), \ + C(EMPTY_SORT_FIELD, "Empty sort field"), \ + C(TOO_MANY_SORT_FIELDS, "Too many sort fields (Max = 2)"), \ + C(INVALID_SORT_FIELD, 
"Sort field must be a key or a val"), \ + C(INVALID_STR_OPERAND, "String type can not be an operand in expression"), \ + C(EXPECT_NUMBER, "Expecting numeric literal"), \ + C(UNARY_MINUS_SUBEXPR, "Unary minus not supported in sub-expressions"), \ + C(DIVISION_BY_ZERO, "Division by zero"), \ + C(NEED_NOHC_VAL, "Non-hitcount value is required for 'nohitcount'"), + +#undef C +#define C(a, b) HIST_ERR_##a + +enum { ERRORS }; + +#undef C +#define C(a, b) b + +static const char *err_text[] = { ERRORS }; + +struct hist_field; + +typedef u64 (*hist_field_fn_t) (struct hist_field *field, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, + struct ring_buffer_event *rbe, + void *event); + +#define HIST_FIELD_OPERANDS_MAX 2 +#define HIST_FIELDS_MAX (TRACING_MAP_FIELDS_MAX + TRACING_MAP_VARS_MAX) +#define HIST_ACTIONS_MAX 8 +#define HIST_CONST_DIGITS_MAX 21 +#define HIST_DIV_SHIFT 20 /* For optimizing division by constants */ + +enum field_op_id { + FIELD_OP_NONE, + FIELD_OP_PLUS, + FIELD_OP_MINUS, + FIELD_OP_UNARY_MINUS, + FIELD_OP_DIV, + FIELD_OP_MULT, +}; + +enum hist_field_fn { + HIST_FIELD_FN_NOP, + HIST_FIELD_FN_VAR_REF, + HIST_FIELD_FN_COUNTER, + HIST_FIELD_FN_CONST, + HIST_FIELD_FN_LOG2, + HIST_FIELD_FN_BUCKET, + HIST_FIELD_FN_TIMESTAMP, + HIST_FIELD_FN_CPU, + HIST_FIELD_FN_STRING, + HIST_FIELD_FN_DYNSTRING, + HIST_FIELD_FN_RELDYNSTRING, + HIST_FIELD_FN_PSTRING, + HIST_FIELD_FN_S64, + HIST_FIELD_FN_U64, + HIST_FIELD_FN_S32, + HIST_FIELD_FN_U32, + HIST_FIELD_FN_S16, + HIST_FIELD_FN_U16, + HIST_FIELD_FN_S8, + HIST_FIELD_FN_U8, + HIST_FIELD_FN_UMINUS, + HIST_FIELD_FN_MINUS, + HIST_FIELD_FN_PLUS, + HIST_FIELD_FN_DIV, + HIST_FIELD_FN_MULT, + HIST_FIELD_FN_DIV_POWER2, + HIST_FIELD_FN_DIV_NOT_POWER2, + HIST_FIELD_FN_DIV_MULT_SHIFT, + HIST_FIELD_FN_EXECNAME, + HIST_FIELD_FN_STACK, +}; + +/* + * A hist_var (histogram variable) contains variable information for + * hist_fields having the HIST_FIELD_FL_VAR or HIST_FIELD_FL_VAR_REF + * flag set. A hist_var has a variable name e.g. ts0, and is + * associated with a given histogram trigger, as specified by + * hist_data. The hist_var idx is the unique index assigned to the + * variable by the hist trigger's tracing_map. The idx is what is + * used to set a variable's value and, by a variable reference, to + * retrieve it. + */ +struct hist_var { + char *name; + struct hist_trigger_data *hist_data; + unsigned int idx; +}; + +struct hist_field { + struct ftrace_event_field *field; + unsigned long flags; + unsigned long buckets; + const char *type; + struct hist_field *operands[HIST_FIELD_OPERANDS_MAX]; + struct hist_trigger_data *hist_data; + enum hist_field_fn fn_num; + unsigned int ref; + unsigned int size; + unsigned int offset; + unsigned int is_signed; + + /* + * Variable fields contain variable-specific info in var. + */ + struct hist_var var; + enum field_op_id operator; + char *system; + char *event_name; + + /* + * The name field is used for EXPR and VAR_REF fields. VAR + * fields contain the variable name in var.name. + */ + char *name; + + /* + * When a histogram trigger is hit, if it has any references + * to variables, the values of those variables are collected + * into a var_ref_vals array by resolve_var_refs(). The + * current value of each variable is read from the tracing_map + * using the hist field's hist_var.idx and entered into the + * var_ref_idx entry i.e. var_ref_vals[var_ref_idx]. 
+ */ + unsigned int var_ref_idx; + bool read_once; + + unsigned int var_str_idx; + + /* Numeric literals are represented as u64 */ + u64 constant; + /* Used to optimize division by constants */ + u64 div_multiplier; +}; + +static u64 hist_fn_call(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, + struct ring_buffer_event *rbe, + void *event); + +static u64 hist_field_const(struct hist_field *field, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, + struct ring_buffer_event *rbe, + void *event) +{ + return field->constant; +} + +static u64 hist_field_counter(struct hist_field *field, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, + struct ring_buffer_event *rbe, + void *event) +{ + return 1; +} + +static u64 hist_field_string(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, + struct ring_buffer_event *rbe, + void *event) +{ + char *addr = (char *)(event + hist_field->field->offset); + + return (u64)(unsigned long)addr; +} + +static u64 hist_field_dynstring(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, + struct ring_buffer_event *rbe, + void *event) +{ + u32 str_item = *(u32 *)(event + hist_field->field->offset); + int str_loc = str_item & 0xffff; + char *addr = (char *)(event + str_loc); + + return (u64)(unsigned long)addr; +} + +static u64 hist_field_reldynstring(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, + struct ring_buffer_event *rbe, + void *event) +{ + u32 *item = event + hist_field->field->offset; + u32 str_item = *item; + int str_loc = str_item & 0xffff; + char *addr = (char *)&item[1] + str_loc; + + return (u64)(unsigned long)addr; +} + +static u64 hist_field_pstring(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, + struct ring_buffer_event *rbe, + void *event) +{ + char **addr = (char **)(event + hist_field->field->offset); + + return (u64)(unsigned long)*addr; +} + +static u64 hist_field_log2(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, + struct ring_buffer_event *rbe, + void *event) +{ + struct hist_field *operand = hist_field->operands[0]; + + u64 val = hist_fn_call(operand, elt, buffer, rbe, event); + + return (u64) ilog2(roundup_pow_of_two(val)); +} + +static u64 hist_field_bucket(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, + struct ring_buffer_event *rbe, + void *event) +{ + struct hist_field *operand = hist_field->operands[0]; + unsigned long buckets = hist_field->buckets; + + u64 val = hist_fn_call(operand, elt, buffer, rbe, event); + + if (WARN_ON_ONCE(!buckets)) + return val; + + if (val >= LONG_MAX) + val = div64_ul(val, buckets); + else + val = (u64)((unsigned long)val / buckets); + return val * buckets; +} + +static u64 hist_field_plus(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, + struct ring_buffer_event *rbe, + void *event) +{ + struct hist_field *operand1 = hist_field->operands[0]; + struct hist_field *operand2 = hist_field->operands[1]; + + u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event); + u64 val2 = hist_fn_call(operand2, elt, buffer, rbe, event); + + return val1 + val2; +} + +static u64 hist_field_minus(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, + struct ring_buffer_event *rbe, + void *event) +{ + struct 
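As with the filter predicates earlier in this series, fn_num replaces a stored function pointer: hist_fn_call(), declared above and defined later in this file, switches on it and invokes the matching hist_field_* helper, which avoids an indirect call on the hot path. A stand-alone miniature of the pattern (all names invented for illustration):

#include <stdio.h>

enum demo_fn { DEMO_FN_NOP, DEMO_FN_COUNTER, DEMO_FN_DOUBLE };

struct demo_field { enum demo_fn fn_num; };

static long demo_nop(long v)     { return 0; }
static long demo_counter(long v) { return 1; }
static long demo_double(long v)  { return 2 * v; }

/* One switch instead of an indirect call through a function pointer. */
static long demo_call(struct demo_field *f, long v)
{
	switch (f->fn_num) {
	case DEMO_FN_COUNTER: return demo_counter(v);
	case DEMO_FN_DOUBLE:  return demo_double(v);
	default:              return demo_nop(v);
	}
}

int main(void)
{
	struct demo_field f = { .fn_num = DEMO_FN_DOUBLE };

	printf("%ld\n", demo_call(&f, 21));	/* prints 42 */
	return 0;
}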
hist_field *operand1 = hist_field->operands[0]; + struct hist_field *operand2 = hist_field->operands[1]; + + u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event); + u64 val2 = hist_fn_call(operand2, elt, buffer, rbe, event); + + return val1 - val2; +} + +static u64 hist_field_div(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, + struct ring_buffer_event *rbe, + void *event) +{ + struct hist_field *operand1 = hist_field->operands[0]; + struct hist_field *operand2 = hist_field->operands[1]; + + u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event); + u64 val2 = hist_fn_call(operand2, elt, buffer, rbe, event); + + /* Return -1 for the undefined case */ + if (!val2) + return -1; + + /* Use shift if the divisor is a power of 2 */ + if (!(val2 & (val2 - 1))) + return val1 >> __ffs64(val2); + + return div64_u64(val1, val2); +} + +static u64 div_by_power_of_two(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, + struct ring_buffer_event *rbe, + void *event) +{ + struct hist_field *operand1 = hist_field->operands[0]; + struct hist_field *operand2 = hist_field->operands[1]; + + u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event); + + return val1 >> __ffs64(operand2->constant); +} + +static u64 div_by_not_power_of_two(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, + struct ring_buffer_event *rbe, + void *event) +{ + struct hist_field *operand1 = hist_field->operands[0]; + struct hist_field *operand2 = hist_field->operands[1]; + + u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event); + + return div64_u64(val1, operand2->constant); +} + +static u64 div_by_mult_and_shift(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, + struct ring_buffer_event *rbe, + void *event) +{ + struct hist_field *operand1 = hist_field->operands[0]; + struct hist_field *operand2 = hist_field->operands[1]; + + u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event); + + /* + * If the divisor is a constant, do a multiplication and shift instead. + * + * Choose Z = some power of 2. If Y <= Z, then: + * X / Y = (X * (Z / Y)) / Z + * + * (Z / Y) is a constant (mult) which is calculated at parse time, so: + * X / Y = (X * mult) / Z + * + * The division by Z can be replaced by a shift since Z is a power of 2: + * X / Y = (X * mult) >> HIST_DIV_SHIFT + * + * As long, as X < Z the results will not be off by more than 1. 
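That claim is easy to check from user space with the same Z = 2^20 and the same rounding used below (the test harness itself is illustrative):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define SHIFT 20	/* mirrors HIST_DIV_SHIFT */

int main(void)
{
	uint64_t Z = 1ULL << SHIFT, X, Y;

	for (Y = 3; Y < Z; Y += 7919) {			/* arbitrary non-power-of-2 divisors */
		uint64_t mult = Z / Y;			/* computed once, at "parse time" */

		for (X = 0; X < Z; X += 1021) {
			uint64_t approx = (X * mult + (Z - 1)) >> SHIFT;
			uint64_t exact = X / Y;

			/* never below the true quotient, never more than 1 above it */
			assert(approx >= exact && approx - exact <= 1);
		}
	}
	printf("X / Y approximated within 1 for all tested X < 2^20\n");
	return 0;
}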
+ */ + if (val1 < (1 << HIST_DIV_SHIFT)) { + u64 mult = operand2->div_multiplier; + + return (val1 * mult + ((1 << HIST_DIV_SHIFT) - 1)) >> HIST_DIV_SHIFT; + } + + return div64_u64(val1, operand2->constant); +} + +static u64 hist_field_mult(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, + struct ring_buffer_event *rbe, + void *event) +{ + struct hist_field *operand1 = hist_field->operands[0]; + struct hist_field *operand2 = hist_field->operands[1]; + + u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event); + u64 val2 = hist_fn_call(operand2, elt, buffer, rbe, event); + + return val1 * val2; +} + +static u64 hist_field_unary_minus(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, + struct ring_buffer_event *rbe, + void *event) +{ + struct hist_field *operand = hist_field->operands[0]; + + s64 sval = (s64)hist_fn_call(operand, elt, buffer, rbe, event); + u64 val = (u64)-sval; + + return val; +} + +#define DEFINE_HIST_FIELD_FN(type) \ + static u64 hist_field_##type(struct hist_field *hist_field, \ + struct tracing_map_elt *elt, \ + struct trace_buffer *buffer, \ + struct ring_buffer_event *rbe, \ + void *event) \ +{ \ + type *addr = (type *)(event + hist_field->field->offset); \ + \ + return (u64)(unsigned long)*addr; \ +} + +DEFINE_HIST_FIELD_FN(s64); +DEFINE_HIST_FIELD_FN(u64); +DEFINE_HIST_FIELD_FN(s32); +DEFINE_HIST_FIELD_FN(u32); +DEFINE_HIST_FIELD_FN(s16); +DEFINE_HIST_FIELD_FN(u16); +DEFINE_HIST_FIELD_FN(s8); +DEFINE_HIST_FIELD_FN(u8); + +#define for_each_hist_field(i, hist_data) \ + for ((i) = 0; (i) < (hist_data)->n_fields; (i)++) + +#define for_each_hist_val_field(i, hist_data) \ + for ((i) = 0; (i) < (hist_data)->n_vals; (i)++) + +#define for_each_hist_key_field(i, hist_data) \ + for ((i) = (hist_data)->n_vals; (i) < (hist_data)->n_fields; (i)++) + +#define HITCOUNT_IDX 0 +#define HIST_KEY_SIZE_MAX (MAX_FILTER_STR_VAL + HIST_STACKTRACE_SIZE) + +enum hist_field_flags { + HIST_FIELD_FL_HITCOUNT = 1 << 0, + HIST_FIELD_FL_KEY = 1 << 1, + HIST_FIELD_FL_STRING = 1 << 2, + HIST_FIELD_FL_HEX = 1 << 3, + HIST_FIELD_FL_SYM = 1 << 4, + HIST_FIELD_FL_SYM_OFFSET = 1 << 5, + HIST_FIELD_FL_EXECNAME = 1 << 6, + HIST_FIELD_FL_SYSCALL = 1 << 7, + HIST_FIELD_FL_STACKTRACE = 1 << 8, + HIST_FIELD_FL_LOG2 = 1 << 9, + HIST_FIELD_FL_TIMESTAMP = 1 << 10, + HIST_FIELD_FL_TIMESTAMP_USECS = 1 << 11, + HIST_FIELD_FL_VAR = 1 << 12, + HIST_FIELD_FL_EXPR = 1 << 13, + HIST_FIELD_FL_VAR_REF = 1 << 14, + HIST_FIELD_FL_CPU = 1 << 15, + HIST_FIELD_FL_ALIAS = 1 << 16, + HIST_FIELD_FL_BUCKET = 1 << 17, + HIST_FIELD_FL_CONST = 1 << 18, + HIST_FIELD_FL_PERCENT = 1 << 19, + HIST_FIELD_FL_GRAPH = 1 << 20, +}; + +struct var_defs { + unsigned int n_vars; + char *name[TRACING_MAP_VARS_MAX]; + char *expr[TRACING_MAP_VARS_MAX]; +}; + +struct hist_trigger_attrs { + char *keys_str; + char *vals_str; + char *sort_key_str; + char *name; + char *clock; + bool pause; + bool cont; + bool clear; + bool ts_in_usecs; + bool no_hitcount; + unsigned int map_bits; + + char *assignment_str[TRACING_MAP_VARS_MAX]; + unsigned int n_assignments; + + char *action_str[HIST_ACTIONS_MAX]; + unsigned int n_actions; + + struct var_defs var_defs; +}; + +struct field_var { + struct hist_field *var; + struct hist_field *val; +}; + +struct field_var_hist { + struct hist_trigger_data *hist_data; + char *cmd; +}; + +struct hist_trigger_data { + struct hist_field *fields[HIST_FIELDS_MAX]; + unsigned int n_vals; + unsigned int n_keys; + unsigned int n_fields; + 
unsigned int n_vars; + unsigned int n_var_str; + unsigned int key_size; + struct tracing_map_sort_key sort_keys[TRACING_MAP_SORT_KEYS_MAX]; + unsigned int n_sort_keys; + struct trace_event_file *event_file; + struct hist_trigger_attrs *attrs; + struct tracing_map *map; + bool enable_timestamps; + bool remove; + struct hist_field *var_refs[TRACING_MAP_VARS_MAX]; + unsigned int n_var_refs; + + struct action_data *actions[HIST_ACTIONS_MAX]; + unsigned int n_actions; + + struct field_var *field_vars[SYNTH_FIELDS_MAX]; + unsigned int n_field_vars; + unsigned int n_field_var_str; + struct field_var_hist *field_var_hists[SYNTH_FIELDS_MAX]; + unsigned int n_field_var_hists; + + struct field_var *save_vars[SYNTH_FIELDS_MAX]; + unsigned int n_save_vars; + unsigned int n_save_var_str; +}; + +struct action_data; + +typedef void (*action_fn_t) (struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, void *rec, + struct ring_buffer_event *rbe, void *key, + struct action_data *data, u64 *var_ref_vals); + +typedef bool (*check_track_val_fn_t) (u64 track_val, u64 var_val); + +enum handler_id { + HANDLER_ONMATCH = 1, + HANDLER_ONMAX, + HANDLER_ONCHANGE, +}; + +enum action_id { + ACTION_SAVE = 1, + ACTION_TRACE, + ACTION_SNAPSHOT, +}; + +struct action_data { + enum handler_id handler; + enum action_id action; + char *action_name; + action_fn_t fn; + + unsigned int n_params; + char *params[SYNTH_FIELDS_MAX]; + + /* + * When a histogram trigger is hit, the values of any + * references to variables, including variables being passed + * as parameters to synthetic events, are collected into a + * var_ref_vals array. This var_ref_idx array is an array of + * indices into the var_ref_vals array, one for each synthetic + * event param, and is passed to the synthetic event + * invocation. + */ + unsigned int var_ref_idx[SYNTH_FIELDS_MAX]; + struct synth_event *synth_event; + bool use_trace_keyword; + char *synth_event_name; + + union { + struct { + char *event; + char *event_system; + } match_data; + + struct { + /* + * var_str contains the $-unstripped variable + * name referenced by var_ref, and used when + * printing the action. Because var_ref + * creation is deferred to create_actions(), + * we need a per-action way to save it until + * then, thus var_str. + */ + char *var_str; + + /* + * var_ref refers to the variable being + * tracked e.g onmax($var). + */ + struct hist_field *var_ref; + + /* + * track_var contains the 'invisible' tracking + * variable created to keep the current + * e.g. max value. + */ + struct hist_field *track_var; + + check_track_val_fn_t check_val; + action_fn_t save_data; + } track_data; + }; +}; + +struct track_data { + u64 track_val; + bool updated; + + unsigned int key_len; + void *key; + struct tracing_map_elt elt; + + struct action_data *action_data; + struct hist_trigger_data *hist_data; +}; + +struct hist_elt_data { + char *comm; + u64 *var_ref_vals; + char **field_var_str; + int n_field_var_str; +}; + +struct snapshot_context { + struct tracing_map_elt *elt; + void *key; +}; + +/* + * Returns the specific division function to use if the divisor + * is constant. This avoids extra branches when the trigger is hit. 
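The power-of-two case below is detected with the usual bit trick: for a non-zero div, !(div & (div - 1)) is true exactly when a single bit is set (a zero constant divisor should never reach this point, since it is rejected earlier at parse time as DIVISION_BY_ZERO in the error table above). A stand-alone illustration:

#include <stdint.h>
#include <stdio.h>

static int is_power_of_two(uint64_t div)
{
	return div && !(div & (div - 1));
}

int main(void)
{
	/* prints "1 1 0 0": 8 and 1 qualify, 12 and 0 do not */
	printf("%d %d %d %d\n", is_power_of_two(8), is_power_of_two(1),
	       is_power_of_two(12), is_power_of_two(0));
	return 0;
}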
+ */ +static enum hist_field_fn hist_field_get_div_fn(struct hist_field *divisor) +{ + u64 div = divisor->constant; + + if (!(div & (div - 1))) + return HIST_FIELD_FN_DIV_POWER2; + + /* If the divisor is too large, do a regular division */ + if (div > (1 << HIST_DIV_SHIFT)) + return HIST_FIELD_FN_DIV_NOT_POWER2; + + divisor->div_multiplier = div64_u64((u64)(1 << HIST_DIV_SHIFT), div); + return HIST_FIELD_FN_DIV_MULT_SHIFT; +} + +static void track_data_free(struct track_data *track_data) +{ + struct hist_elt_data *elt_data; + + if (!track_data) + return; + + kfree(track_data->key); + + elt_data = track_data->elt.private_data; + if (elt_data) { + kfree(elt_data->comm); + kfree(elt_data); + } + + kfree(track_data); +} + +static struct track_data *track_data_alloc(unsigned int key_len, + struct action_data *action_data, + struct hist_trigger_data *hist_data) +{ + struct track_data *data = kzalloc(sizeof(*data), GFP_KERNEL); + struct hist_elt_data *elt_data; + + if (!data) + return ERR_PTR(-ENOMEM); + + data->key = kzalloc(key_len, GFP_KERNEL); + if (!data->key) { + track_data_free(data); + return ERR_PTR(-ENOMEM); + } + + data->key_len = key_len; + data->action_data = action_data; + data->hist_data = hist_data; + + elt_data = kzalloc(sizeof(*elt_data), GFP_KERNEL); + if (!elt_data) { + track_data_free(data); + return ERR_PTR(-ENOMEM); + } + + data->elt.private_data = elt_data; + + elt_data->comm = kzalloc(TASK_COMM_LEN, GFP_KERNEL); + if (!elt_data->comm) { + track_data_free(data); + return ERR_PTR(-ENOMEM); + } + + return data; +} + +#define HIST_PREFIX "hist:" + +static char *last_cmd; +static char last_cmd_loc[MAX_FILTER_STR_VAL]; + +static int errpos(char *str) +{ + if (!str || !last_cmd) + return 0; + + return err_pos(last_cmd, str); +} + +static void last_cmd_set(struct trace_event_file *file, char *str) +{ + const char *system = NULL, *name = NULL; + struct trace_event_call *call; + int len; + + if (!str) + return; + + /* sizeof() contains the nul byte */ + len = sizeof(HIST_PREFIX) + strlen(str); + kfree(last_cmd); + last_cmd = kzalloc(len, GFP_KERNEL); + if (!last_cmd) + return; + + strcpy(last_cmd, HIST_PREFIX); + /* Again, sizeof() contains the nul byte */ + len -= sizeof(HIST_PREFIX); + strncat(last_cmd, str, len); + + if (file) { + call = file->event_call; + system = call->class->system; + if (system) { + name = trace_event_name(call); + if (!name) + system = NULL; + } + } + + if (system) + snprintf(last_cmd_loc, MAX_FILTER_STR_VAL, HIST_PREFIX "%s:%s", system, name); +} + +static void hist_err(struct trace_array *tr, u8 err_type, u16 err_pos) +{ + if (!last_cmd) + return; + + tracing_log_err(tr, last_cmd_loc, last_cmd, err_text, + err_type, err_pos); +} + +static void hist_err_clear(void) +{ + if (last_cmd) + last_cmd[0] = '\0'; + last_cmd_loc[0] = '\0'; +} + +typedef void (*synth_probe_func_t) (void *__data, u64 *var_ref_vals, + unsigned int *var_ref_idx); + +static inline void trace_synth(struct synth_event *event, u64 *var_ref_vals, + unsigned int *var_ref_idx) +{ + struct tracepoint *tp = event->tp; + + if (unlikely(atomic_read(&tp->key.enabled) > 0)) { + struct tracepoint_func *probe_func_ptr; + synth_probe_func_t probe_func; + void *__data; + + if (!(cpu_online(raw_smp_processor_id()))) + return; + + probe_func_ptr = rcu_dereference_sched((tp)->funcs); + if (probe_func_ptr) { + do { + probe_func = probe_func_ptr->func; + __data = probe_func_ptr->data; + probe_func(__data, var_ref_vals, var_ref_idx); + } while ((++probe_func_ptr)->func); + } + } +} + +static void 
action_trace(struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, void *rec, + struct ring_buffer_event *rbe, void *key, + struct action_data *data, u64 *var_ref_vals) +{ + struct synth_event *event = data->synth_event; + + trace_synth(event, var_ref_vals, data->var_ref_idx); +} + +struct hist_var_data { + struct list_head list; + struct hist_trigger_data *hist_data; +}; + +static u64 hist_field_timestamp(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, + struct ring_buffer_event *rbe, + void *event) +{ + struct hist_trigger_data *hist_data = hist_field->hist_data; + struct trace_array *tr = hist_data->event_file->tr; + + u64 ts = ring_buffer_event_time_stamp(buffer, rbe); + + if (hist_data->attrs->ts_in_usecs && trace_clock_in_ns(tr)) + ts = ns2usecs(ts); + + return ts; +} + +static u64 hist_field_cpu(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, + struct ring_buffer_event *rbe, + void *event) +{ + int cpu = smp_processor_id(); + + return cpu; +} + +/** + * check_field_for_var_ref - Check if a VAR_REF field references a variable + * @hist_field: The VAR_REF field to check + * @var_data: The hist trigger that owns the variable + * @var_idx: The trigger variable identifier + * + * Check the given VAR_REF field to see whether or not it references + * the given variable associated with the given trigger. + * + * Return: The VAR_REF field if it does reference the variable, NULL if not + */ +static struct hist_field * +check_field_for_var_ref(struct hist_field *hist_field, + struct hist_trigger_data *var_data, + unsigned int var_idx) +{ + WARN_ON(!(hist_field && hist_field->flags & HIST_FIELD_FL_VAR_REF)); + + if (hist_field && hist_field->var.idx == var_idx && + hist_field->var.hist_data == var_data) + return hist_field; + + return NULL; +} + +/** + * find_var_ref - Check if a trigger has a reference to a trigger variable + * @hist_data: The hist trigger that might have a reference to the variable + * @var_data: The hist trigger that owns the variable + * @var_idx: The trigger variable identifier + * + * Check the list of var_refs[] on the first hist trigger to see + * whether any of them are references to the variable on the second + * trigger. + * + * Return: The VAR_REF field referencing the variable if so, NULL if not + */ +static struct hist_field *find_var_ref(struct hist_trigger_data *hist_data, + struct hist_trigger_data *var_data, + unsigned int var_idx) +{ + struct hist_field *hist_field; + unsigned int i; + + for (i = 0; i < hist_data->n_var_refs; i++) { + hist_field = hist_data->var_refs[i]; + if (check_field_for_var_ref(hist_field, var_data, var_idx)) + return hist_field; + } + + return NULL; +} + +/** + * find_any_var_ref - Check if there is a reference to a given trigger variable + * @hist_data: The hist trigger + * @var_idx: The trigger variable identifier + * + * Check to see whether the given variable is currently referenced by + * any other trigger. + * + * The trigger the variable is defined on is explicitly excluded - the + * assumption being that a self-reference doesn't prevent a trigger + * from being removed. 
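In user-space terms, the references tracked here come from trigger strings in which a variable defined on one event is consumed by another, as in the wakeup-latency example in Documentation/trace/histogram.rst. A sketch of installing such a pair from C (paths, events and variable names follow that documented example but are illustrative here; error handling is minimal):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int append(const char *path, const char *s)
{
	int fd = open(path, O_WRONLY | O_APPEND);
	ssize_t n;

	if (fd < 0)
		return -1;
	n = write(fd, s, strlen(s));
	close(fd);
	return n < 0 ? -1 : 0;
}

int main(void)
{
	const char *base = "/sys/kernel/tracing/events/sched";
	char path[256];

	/* define ts0 on sched_waking ... */
	snprintf(path, sizeof(path), "%s/sched_waking/trigger", base);
	append(path, "hist:keys=pid:ts0=common_timestamp.usecs\n");

	/* ... and reference it as $ts0 from sched_switch */
	snprintf(path, sizeof(path), "%s/sched_switch/trigger", base);
	append(path, "hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0\n");
	return 0;
}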
+ * + * Return: The VAR_REF field referencing the variable if so, NULL if not + */ +static struct hist_field *find_any_var_ref(struct hist_trigger_data *hist_data, + unsigned int var_idx) +{ + struct trace_array *tr = hist_data->event_file->tr; + struct hist_field *found = NULL; + struct hist_var_data *var_data; + + list_for_each_entry(var_data, &tr->hist_vars, list) { + if (var_data->hist_data == hist_data) + continue; + found = find_var_ref(var_data->hist_data, hist_data, var_idx); + if (found) + break; + } + + return found; +} + +/** + * check_var_refs - Check if there is a reference to any of trigger's variables + * @hist_data: The hist trigger + * + * A trigger can define one or more variables. If any one of them is + * currently referenced by any other trigger, this function will + * determine that. + * + * Typically used to determine whether or not a trigger can be removed + * - if there are any references to a trigger's variables, it cannot. + * + * Return: True if there is a reference to any of trigger's variables + */ +static bool check_var_refs(struct hist_trigger_data *hist_data) +{ + struct hist_field *field; + bool found = false; + int i; + + for_each_hist_field(i, hist_data) { + field = hist_data->fields[i]; + if (field && field->flags & HIST_FIELD_FL_VAR) { + if (find_any_var_ref(hist_data, field->var.idx)) { + found = true; + break; + } + } + } + + return found; +} + +static struct hist_var_data *find_hist_vars(struct hist_trigger_data *hist_data) +{ + struct trace_array *tr = hist_data->event_file->tr; + struct hist_var_data *var_data, *found = NULL; + + list_for_each_entry(var_data, &tr->hist_vars, list) { + if (var_data->hist_data == hist_data) { + found = var_data; + break; + } + } + + return found; +} + +static bool field_has_hist_vars(struct hist_field *hist_field, + unsigned int level) +{ + int i; + + if (level > 3) + return false; + + if (!hist_field) + return false; + + if (hist_field->flags & HIST_FIELD_FL_VAR || + hist_field->flags & HIST_FIELD_FL_VAR_REF) + return true; + + for (i = 0; i < HIST_FIELD_OPERANDS_MAX; i++) { + struct hist_field *operand; + + operand = hist_field->operands[i]; + if (field_has_hist_vars(operand, level + 1)) + return true; + } + + return false; +} + +static bool has_hist_vars(struct hist_trigger_data *hist_data) +{ + struct hist_field *hist_field; + int i; + + for_each_hist_field(i, hist_data) { + hist_field = hist_data->fields[i]; + if (field_has_hist_vars(hist_field, 0)) + return true; + } + + return false; +} + +static int save_hist_vars(struct hist_trigger_data *hist_data) +{ + struct trace_array *tr = hist_data->event_file->tr; + struct hist_var_data *var_data; + + var_data = find_hist_vars(hist_data); + if (var_data) + return 0; + + if (tracing_check_open_get_tr(tr)) + return -ENODEV; + + var_data = kzalloc(sizeof(*var_data), GFP_KERNEL); + if (!var_data) { + trace_array_put(tr); + return -ENOMEM; + } + + var_data->hist_data = hist_data; + list_add(&var_data->list, &tr->hist_vars); + + return 0; +} + +static void remove_hist_vars(struct hist_trigger_data *hist_data) +{ + struct trace_array *tr = hist_data->event_file->tr; + struct hist_var_data *var_data; + + var_data = find_hist_vars(hist_data); + if (!var_data) + return; + + if (WARN_ON(check_var_refs(hist_data))) + return; + + list_del(&var_data->list); + + kfree(var_data); + + trace_array_put(tr); +} + +static struct hist_field *find_var_field(struct hist_trigger_data *hist_data, + const char *var_name) +{ + struct hist_field *hist_field, *found = NULL; + int i; + + 
for_each_hist_field(i, hist_data) { + hist_field = hist_data->fields[i]; + if (hist_field && hist_field->flags & HIST_FIELD_FL_VAR && + strcmp(hist_field->var.name, var_name) == 0) { + found = hist_field; + break; + } + } + + return found; +} + +static struct hist_field *find_var(struct hist_trigger_data *hist_data, + struct trace_event_file *file, + const char *var_name) +{ + struct hist_trigger_data *test_data; + struct event_trigger_data *test; + struct hist_field *hist_field; + + lockdep_assert_held(&event_mutex); + + hist_field = find_var_field(hist_data, var_name); + if (hist_field) + return hist_field; + + list_for_each_entry(test, &file->triggers, list) { + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { + test_data = test->private_data; + hist_field = find_var_field(test_data, var_name); + if (hist_field) + return hist_field; + } + } + + return NULL; +} + +static struct trace_event_file *find_var_file(struct trace_array *tr, + char *system, + char *event_name, + char *var_name) +{ + struct hist_trigger_data *var_hist_data; + struct hist_var_data *var_data; + struct trace_event_file *file, *found = NULL; + + if (system) + return find_event_file(tr, system, event_name); + + list_for_each_entry(var_data, &tr->hist_vars, list) { + var_hist_data = var_data->hist_data; + file = var_hist_data->event_file; + if (file == found) + continue; + + if (find_var_field(var_hist_data, var_name)) { + if (found) { + hist_err(tr, HIST_ERR_VAR_NOT_UNIQUE, errpos(var_name)); + return NULL; + } + + found = file; + } + } + + return found; +} + +static struct hist_field *find_file_var(struct trace_event_file *file, + const char *var_name) +{ + struct hist_trigger_data *test_data; + struct event_trigger_data *test; + struct hist_field *hist_field; + + lockdep_assert_held(&event_mutex); + + list_for_each_entry(test, &file->triggers, list) { + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { + test_data = test->private_data; + hist_field = find_var_field(test_data, var_name); + if (hist_field) + return hist_field; + } + } + + return NULL; +} + +static struct hist_field * +find_match_var(struct hist_trigger_data *hist_data, char *var_name) +{ + struct trace_array *tr = hist_data->event_file->tr; + struct hist_field *hist_field, *found = NULL; + struct trace_event_file *file; + unsigned int i; + + for (i = 0; i < hist_data->n_actions; i++) { + struct action_data *data = hist_data->actions[i]; + + if (data->handler == HANDLER_ONMATCH) { + char *system = data->match_data.event_system; + char *event_name = data->match_data.event; + + file = find_var_file(tr, system, event_name, var_name); + if (!file) + continue; + hist_field = find_file_var(file, var_name); + if (hist_field) { + if (found) { + hist_err(tr, HIST_ERR_VAR_NOT_UNIQUE, + errpos(var_name)); + return ERR_PTR(-EINVAL); + } + + found = hist_field; + } + } + } + return found; +} + +static struct hist_field *find_event_var(struct hist_trigger_data *hist_data, + char *system, + char *event_name, + char *var_name) +{ + struct trace_array *tr = hist_data->event_file->tr; + struct hist_field *hist_field = NULL; + struct trace_event_file *file; + + if (!system || !event_name) { + hist_field = find_match_var(hist_data, var_name); + if (IS_ERR(hist_field)) + return NULL; + if (hist_field) + return hist_field; + } + + file = find_var_file(tr, system, event_name, var_name); + if (!file) + return NULL; + + hist_field = find_file_var(file, var_name); + + return hist_field; +} + +static u64 hist_field_var_ref(struct hist_field *hist_field, + struct 
tracing_map_elt *elt, + struct trace_buffer *buffer, + struct ring_buffer_event *rbe, + void *event) +{ + struct hist_elt_data *elt_data; + u64 var_val = 0; + + if (WARN_ON_ONCE(!elt)) + return var_val; + + elt_data = elt->private_data; + var_val = elt_data->var_ref_vals[hist_field->var_ref_idx]; + + return var_val; +} + +static bool resolve_var_refs(struct hist_trigger_data *hist_data, void *key, + u64 *var_ref_vals, bool self) +{ + struct hist_trigger_data *var_data; + struct tracing_map_elt *var_elt; + struct hist_field *hist_field; + unsigned int i, var_idx; + bool resolved = true; + u64 var_val = 0; + + for (i = 0; i < hist_data->n_var_refs; i++) { + hist_field = hist_data->var_refs[i]; + var_idx = hist_field->var.idx; + var_data = hist_field->var.hist_data; + + if (var_data == NULL) { + resolved = false; + break; + } + + if ((self && var_data != hist_data) || + (!self && var_data == hist_data)) + continue; + + var_elt = tracing_map_lookup(var_data->map, key); + if (!var_elt) { + resolved = false; + break; + } + + if (!tracing_map_var_set(var_elt, var_idx)) { + resolved = false; + break; + } + + if (self || !hist_field->read_once) + var_val = tracing_map_read_var(var_elt, var_idx); + else + var_val = tracing_map_read_var_once(var_elt, var_idx); + + var_ref_vals[i] = var_val; + } + + return resolved; +} + +static const char *hist_field_name(struct hist_field *field, + unsigned int level) +{ + const char *field_name = ""; + + if (WARN_ON_ONCE(!field)) + return field_name; + + if (level > 1) + return field_name; + + if (field->field) + field_name = field->field->name; + else if (field->flags & HIST_FIELD_FL_LOG2 || + field->flags & HIST_FIELD_FL_ALIAS || + field->flags & HIST_FIELD_FL_BUCKET) + field_name = hist_field_name(field->operands[0], ++level); + else if (field->flags & HIST_FIELD_FL_CPU) + field_name = "common_cpu"; + else if (field->flags & HIST_FIELD_FL_EXPR || + field->flags & HIST_FIELD_FL_VAR_REF) { + if (field->system) { + static char full_name[MAX_FILTER_STR_VAL]; + + strcat(full_name, field->system); + strcat(full_name, "."); + strcat(full_name, field->event_name); + strcat(full_name, "."); + strcat(full_name, field->name); + field_name = full_name; + } else + field_name = field->name; + } else if (field->flags & HIST_FIELD_FL_TIMESTAMP) + field_name = "common_timestamp"; + else if (field->flags & HIST_FIELD_FL_STACKTRACE) { + if (field->field) + field_name = field->field->name; + else + field_name = "common_stacktrace"; + } else if (field->flags & HIST_FIELD_FL_HITCOUNT) + field_name = "hitcount"; + + if (field_name == NULL) + field_name = ""; + + return field_name; +} + +static enum hist_field_fn select_value_fn(int field_size, int field_is_signed) +{ + switch (field_size) { + case 8: + if (field_is_signed) + return HIST_FIELD_FN_S64; + else + return HIST_FIELD_FN_U64; + case 4: + if (field_is_signed) + return HIST_FIELD_FN_S32; + else + return HIST_FIELD_FN_U32; + case 2: + if (field_is_signed) + return HIST_FIELD_FN_S16; + else + return HIST_FIELD_FN_U16; + case 1: + if (field_is_signed) + return HIST_FIELD_FN_S8; + else + return HIST_FIELD_FN_U8; + } + + return HIST_FIELD_FN_NOP; +} + +static int parse_map_size(char *str) +{ + unsigned long size, map_bits; + int ret; + + ret = kstrtoul(str, 0, &size); + if (ret) + goto out; + + map_bits = ilog2(roundup_pow_of_two(size)); + if (map_bits < TRACING_MAP_BITS_MIN || + map_bits > TRACING_MAP_BITS_MAX) + ret = -EINVAL; + else + ret = map_bits; + out: + return ret; +} + +static void destroy_hist_trigger_attrs(struct 
hist_trigger_attrs *attrs) +{ + unsigned int i; + + if (!attrs) + return; + + for (i = 0; i < attrs->n_assignments; i++) + kfree(attrs->assignment_str[i]); + + for (i = 0; i < attrs->n_actions; i++) + kfree(attrs->action_str[i]); + + kfree(attrs->name); + kfree(attrs->sort_key_str); + kfree(attrs->keys_str); + kfree(attrs->vals_str); + kfree(attrs->clock); + kfree(attrs); +} + +static int parse_action(char *str, struct hist_trigger_attrs *attrs) +{ + int ret = -EINVAL; + + if (attrs->n_actions >= HIST_ACTIONS_MAX) + return ret; + + if ((str_has_prefix(str, "onmatch(")) || + (str_has_prefix(str, "onmax(")) || + (str_has_prefix(str, "onchange("))) { + attrs->action_str[attrs->n_actions] = kstrdup(str, GFP_KERNEL); + if (!attrs->action_str[attrs->n_actions]) { + ret = -ENOMEM; + return ret; + } + attrs->n_actions++; + ret = 0; + } + return ret; +} + +static int parse_assignment(struct trace_array *tr, + char *str, struct hist_trigger_attrs *attrs) +{ + int len, ret = 0; + + if ((len = str_has_prefix(str, "key=")) || + (len = str_has_prefix(str, "keys="))) { + attrs->keys_str = kstrdup(str + len, GFP_KERNEL); + if (!attrs->keys_str) { + ret = -ENOMEM; + goto out; + } + } else if ((len = str_has_prefix(str, "val=")) || + (len = str_has_prefix(str, "vals=")) || + (len = str_has_prefix(str, "values="))) { + attrs->vals_str = kstrdup(str + len, GFP_KERNEL); + if (!attrs->vals_str) { + ret = -ENOMEM; + goto out; + } + } else if ((len = str_has_prefix(str, "sort="))) { + attrs->sort_key_str = kstrdup(str + len, GFP_KERNEL); + if (!attrs->sort_key_str) { + ret = -ENOMEM; + goto out; + } + } else if (str_has_prefix(str, "name=")) { + attrs->name = kstrdup(str, GFP_KERNEL); + if (!attrs->name) { + ret = -ENOMEM; + goto out; + } + } else if ((len = str_has_prefix(str, "clock="))) { + str += len; + + str = strstrip(str); + attrs->clock = kstrdup(str, GFP_KERNEL); + if (!attrs->clock) { + ret = -ENOMEM; + goto out; + } + } else if ((len = str_has_prefix(str, "size="))) { + int map_bits = parse_map_size(str + len); + + if (map_bits < 0) { + ret = map_bits; + goto out; + } + attrs->map_bits = map_bits; + } else { + char *assignment; + + if (attrs->n_assignments == TRACING_MAP_VARS_MAX) { + hist_err(tr, HIST_ERR_TOO_MANY_VARS, errpos(str)); + ret = -EINVAL; + goto out; + } + + assignment = kstrdup(str, GFP_KERNEL); + if (!assignment) { + ret = -ENOMEM; + goto out; + } + + attrs->assignment_str[attrs->n_assignments++] = assignment; + } + out: + return ret; +} + +static struct hist_trigger_attrs * +parse_hist_trigger_attrs(struct trace_array *tr, char *trigger_str) +{ + struct hist_trigger_attrs *attrs; + int ret = 0; + + attrs = kzalloc(sizeof(*attrs), GFP_KERNEL); + if (!attrs) + return ERR_PTR(-ENOMEM); + + while (trigger_str) { + char *str = strsep(&trigger_str, ":"); + char *rhs; + + rhs = strchr(str, '='); + if (rhs) { + if (!strlen(++rhs)) { + ret = -EINVAL; + hist_err(tr, HIST_ERR_EMPTY_ASSIGNMENT, errpos(str)); + goto free; + } + ret = parse_assignment(tr, str, attrs); + if (ret) + goto free; + } else if (strcmp(str, "nohitcount") == 0 || + strcmp(str, "NOHC") == 0) + attrs->no_hitcount = true; + else if (strcmp(str, "pause") == 0) + attrs->pause = true; + else if ((strcmp(str, "cont") == 0) || + (strcmp(str, "continue") == 0)) + attrs->cont = true; + else if (strcmp(str, "clear") == 0) + attrs->clear = true; + else { + ret = parse_action(str, attrs); + if (ret) + goto free; + } + } + + if (!attrs->keys_str) { + ret = -EINVAL; + goto free; + } + + if (!attrs->clock) { + attrs->clock = 
kstrdup("global", GFP_KERNEL); + if (!attrs->clock) { + ret = -ENOMEM; + goto free; + } + } + + return attrs; + free: + destroy_hist_trigger_attrs(attrs); + + return ERR_PTR(ret); +} + +static inline void save_comm(char *comm, struct task_struct *task) +{ + if (!task->pid) { + strcpy(comm, "<idle>"); + return; + } + + if (WARN_ON_ONCE(task->pid < 0)) { + strcpy(comm, "<XXX>"); + return; + } + + strncpy(comm, task->comm, TASK_COMM_LEN); +} + +static void hist_elt_data_free(struct hist_elt_data *elt_data) +{ + unsigned int i; + + for (i = 0; i < elt_data->n_field_var_str; i++) + kfree(elt_data->field_var_str[i]); + + kfree(elt_data->field_var_str); + + kfree(elt_data->comm); + kfree(elt_data); +} + +static void hist_trigger_elt_data_free(struct tracing_map_elt *elt) +{ + struct hist_elt_data *elt_data = elt->private_data; + + hist_elt_data_free(elt_data); +} + +static int hist_trigger_elt_data_alloc(struct tracing_map_elt *elt) +{ + struct hist_trigger_data *hist_data = elt->map->private_data; + unsigned int size = TASK_COMM_LEN; + struct hist_elt_data *elt_data; + struct hist_field *hist_field; + unsigned int i, n_str; + + elt_data = kzalloc(sizeof(*elt_data), GFP_KERNEL); + if (!elt_data) + return -ENOMEM; + + for_each_hist_field(i, hist_data) { + hist_field = hist_data->fields[i]; + + if (hist_field->flags & HIST_FIELD_FL_EXECNAME) { + elt_data->comm = kzalloc(size, GFP_KERNEL); + if (!elt_data->comm) { + kfree(elt_data); + return -ENOMEM; + } + break; + } + } + + n_str = hist_data->n_field_var_str + hist_data->n_save_var_str + + hist_data->n_var_str; + if (n_str > SYNTH_FIELDS_MAX) { + hist_elt_data_free(elt_data); + return -EINVAL; + } + + BUILD_BUG_ON(STR_VAR_LEN_MAX & (sizeof(u64) - 1)); + + size = STR_VAR_LEN_MAX; + + elt_data->field_var_str = kcalloc(n_str, sizeof(char *), GFP_KERNEL); + if (!elt_data->field_var_str) { + hist_elt_data_free(elt_data); + return -EINVAL; + } + elt_data->n_field_var_str = n_str; + + for (i = 0; i < n_str; i++) { + elt_data->field_var_str[i] = kzalloc(size, GFP_KERNEL); + if (!elt_data->field_var_str[i]) { + hist_elt_data_free(elt_data); + return -ENOMEM; + } + } + + elt->private_data = elt_data; + + return 0; +} + +static void hist_trigger_elt_data_init(struct tracing_map_elt *elt) +{ + struct hist_elt_data *elt_data = elt->private_data; + + if (elt_data->comm) + save_comm(elt_data->comm, current); +} + +static const struct tracing_map_ops hist_trigger_elt_data_ops = { + .elt_alloc = hist_trigger_elt_data_alloc, + .elt_free = hist_trigger_elt_data_free, + .elt_init = hist_trigger_elt_data_init, +}; + +static const char *get_hist_field_flags(struct hist_field *hist_field) +{ + const char *flags_str = NULL; + + if (hist_field->flags & HIST_FIELD_FL_HEX) + flags_str = "hex"; + else if (hist_field->flags & HIST_FIELD_FL_SYM) + flags_str = "sym"; + else if (hist_field->flags & HIST_FIELD_FL_SYM_OFFSET) + flags_str = "sym-offset"; + else if (hist_field->flags & HIST_FIELD_FL_EXECNAME) + flags_str = "execname"; + else if (hist_field->flags & HIST_FIELD_FL_SYSCALL) + flags_str = "syscall"; + else if (hist_field->flags & HIST_FIELD_FL_LOG2) + flags_str = "log2"; + else if (hist_field->flags & HIST_FIELD_FL_BUCKET) + flags_str = "buckets"; + else if (hist_field->flags & HIST_FIELD_FL_TIMESTAMP_USECS) + flags_str = "usecs"; + else if (hist_field->flags & HIST_FIELD_FL_PERCENT) + flags_str = "percent"; + else if (hist_field->flags & HIST_FIELD_FL_GRAPH) + flags_str = "graph"; + else if (hist_field->flags & HIST_FIELD_FL_STACKTRACE) + flags_str = "stacktrace"; + + 
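+	/* NULL means the field carries no print modifier to show */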
return flags_str; +} + +static void expr_field_str(struct hist_field *field, char *expr) +{ + if (field->flags & HIST_FIELD_FL_VAR_REF) + strcat(expr, "$"); + else if (field->flags & HIST_FIELD_FL_CONST) { + char str[HIST_CONST_DIGITS_MAX]; + + snprintf(str, HIST_CONST_DIGITS_MAX, "%llu", field->constant); + strcat(expr, str); + } + + strcat(expr, hist_field_name(field, 0)); + + if (field->flags && !(field->flags & HIST_FIELD_FL_VAR_REF)) { + const char *flags_str = get_hist_field_flags(field); + + if (flags_str) { + strcat(expr, "."); + strcat(expr, flags_str); + } + } +} + +static char *expr_str(struct hist_field *field, unsigned int level) +{ + char *expr; + + if (level > 1) + return NULL; + + expr = kzalloc(MAX_FILTER_STR_VAL, GFP_KERNEL); + if (!expr) + return NULL; + + if (!field->operands[0]) { + expr_field_str(field, expr); + return expr; + } + + if (field->operator == FIELD_OP_UNARY_MINUS) { + char *subexpr; + + strcat(expr, "-("); + subexpr = expr_str(field->operands[0], ++level); + if (!subexpr) { + kfree(expr); + return NULL; + } + strcat(expr, subexpr); + strcat(expr, ")"); + + kfree(subexpr); + + return expr; + } + + expr_field_str(field->operands[0], expr); + + switch (field->operator) { + case FIELD_OP_MINUS: + strcat(expr, "-"); + break; + case FIELD_OP_PLUS: + strcat(expr, "+"); + break; + case FIELD_OP_DIV: + strcat(expr, "/"); + break; + case FIELD_OP_MULT: + strcat(expr, "*"); + break; + default: + kfree(expr); + return NULL; + } + + expr_field_str(field->operands[1], expr); + + return expr; +} + +/* + * If field_op != FIELD_OP_NONE, *sep points to the root operator + * of the expression tree to be evaluated. + */ +static int contains_operator(char *str, char **sep) +{ + enum field_op_id field_op = FIELD_OP_NONE; + char *minus_op, *plus_op, *div_op, *mult_op; + + + /* + * Report the last occurrence of the operators first, so that the + * expression is evaluated left to right. This is important since + * subtraction and division are not associative. + * + * e.g + * 64/8/4/2 is 1, i.e 64/8/4/2 = ((64/8)/4)/2 + * 14-7-5-2 is 0, i.e 14-7-5-2 = ((14-7)-5)-2 + */ + + /* + * First, find lower precedence addition and subtraction + * since the expression will be evaluated recursively. + */ + minus_op = strrchr(str, '-'); + if (minus_op) { + /* + * Unary minus is not supported in sub-expressions. If + * present, it is always the next root operator. + */ + if (minus_op == str) { + field_op = FIELD_OP_UNARY_MINUS; + goto out; + } + + field_op = FIELD_OP_MINUS; + } + + plus_op = strrchr(str, '+'); + if (plus_op || minus_op) { + /* + * For operators of the same precedence use to rightmost as the + * root, so that the expression is evaluated left to right. + */ + if (plus_op > minus_op) + field_op = FIELD_OP_PLUS; + goto out; + } + + /* + * Multiplication and division have higher precedence than addition and + * subtraction. + */ + div_op = strrchr(str, '/'); + if (div_op) + field_op = FIELD_OP_DIV; + + mult_op = strrchr(str, '*'); + /* + * For operators of the same precedence use to rightmost as the + * root, so that the expression is evaluated left to right. 
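+	 *
+	 * e.g 8/4*2 is 4, i.e 8/4*2 = (8/4)*2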
+ */ + if (mult_op > div_op) + field_op = FIELD_OP_MULT; + +out: + if (sep) { + switch (field_op) { + case FIELD_OP_UNARY_MINUS: + case FIELD_OP_MINUS: + *sep = minus_op; + break; + case FIELD_OP_PLUS: + *sep = plus_op; + break; + case FIELD_OP_DIV: + *sep = div_op; + break; + case FIELD_OP_MULT: + *sep = mult_op; + break; + case FIELD_OP_NONE: + default: + *sep = NULL; + break; + } + } + + return field_op; +} + +static void get_hist_field(struct hist_field *hist_field) +{ + hist_field->ref++; +} + +static void __destroy_hist_field(struct hist_field *hist_field) +{ + if (--hist_field->ref > 1) + return; + + kfree(hist_field->var.name); + kfree(hist_field->name); + + /* Can likely be a const */ + kfree_const(hist_field->type); + + kfree(hist_field->system); + kfree(hist_field->event_name); + + kfree(hist_field); +} + +static void destroy_hist_field(struct hist_field *hist_field, + unsigned int level) +{ + unsigned int i; + + if (level > 3) + return; + + if (!hist_field) + return; + + if (hist_field->flags & HIST_FIELD_FL_VAR_REF) + return; /* var refs will be destroyed separately */ + + for (i = 0; i < HIST_FIELD_OPERANDS_MAX; i++) + destroy_hist_field(hist_field->operands[i], level + 1); + + __destroy_hist_field(hist_field); +} + +static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, + struct ftrace_event_field *field, + unsigned long flags, + char *var_name) +{ + struct hist_field *hist_field; + + if (field && is_function_field(field)) + return NULL; + + hist_field = kzalloc(sizeof(struct hist_field), GFP_KERNEL); + if (!hist_field) + return NULL; + + hist_field->ref = 1; + + hist_field->hist_data = hist_data; + + if (flags & HIST_FIELD_FL_EXPR || flags & HIST_FIELD_FL_ALIAS) + goto out; /* caller will populate */ + + if (flags & HIST_FIELD_FL_VAR_REF) { + hist_field->fn_num = HIST_FIELD_FN_VAR_REF; + goto out; + } + + if (flags & HIST_FIELD_FL_HITCOUNT) { + hist_field->fn_num = HIST_FIELD_FN_COUNTER; + hist_field->size = sizeof(u64); + hist_field->type = "u64"; + goto out; + } + + if (flags & HIST_FIELD_FL_CONST) { + hist_field->fn_num = HIST_FIELD_FN_CONST; + hist_field->size = sizeof(u64); + hist_field->type = kstrdup("u64", GFP_KERNEL); + if (!hist_field->type) + goto free; + goto out; + } + + if (flags & HIST_FIELD_FL_STACKTRACE) { + if (field) + hist_field->fn_num = HIST_FIELD_FN_STACK; + else + hist_field->fn_num = HIST_FIELD_FN_NOP; + hist_field->size = HIST_STACKTRACE_SIZE; + hist_field->type = kstrdup_const("unsigned long[]", GFP_KERNEL); + if (!hist_field->type) + goto free; + goto out; + } + + if (flags & (HIST_FIELD_FL_LOG2 | HIST_FIELD_FL_BUCKET)) { + unsigned long fl = flags & ~(HIST_FIELD_FL_LOG2 | HIST_FIELD_FL_BUCKET); + hist_field->fn_num = flags & HIST_FIELD_FL_LOG2 ? 
HIST_FIELD_FN_LOG2 : + HIST_FIELD_FN_BUCKET; + hist_field->operands[0] = create_hist_field(hist_data, field, fl, NULL); + if (!hist_field->operands[0]) + goto free; + hist_field->size = hist_field->operands[0]->size; + hist_field->type = kstrdup_const(hist_field->operands[0]->type, GFP_KERNEL); + if (!hist_field->type) + goto free; + goto out; + } + + if (flags & HIST_FIELD_FL_TIMESTAMP) { + hist_field->fn_num = HIST_FIELD_FN_TIMESTAMP; + hist_field->size = sizeof(u64); + hist_field->type = "u64"; + goto out; + } + + if (flags & HIST_FIELD_FL_CPU) { + hist_field->fn_num = HIST_FIELD_FN_CPU; + hist_field->size = sizeof(int); + hist_field->type = "unsigned int"; + goto out; + } + + if (WARN_ON_ONCE(!field)) + goto out; + + /* Pointers to strings are just pointers and dangerous to dereference */ + if (is_string_field(field) && + (field->filter_type != FILTER_PTR_STRING)) { + flags |= HIST_FIELD_FL_STRING; + + hist_field->size = MAX_FILTER_STR_VAL; + hist_field->type = kstrdup_const(field->type, GFP_KERNEL); + if (!hist_field->type) + goto free; + + if (field->filter_type == FILTER_STATIC_STRING) { + hist_field->fn_num = HIST_FIELD_FN_STRING; + hist_field->size = field->size; + } else if (field->filter_type == FILTER_DYN_STRING) { + hist_field->fn_num = HIST_FIELD_FN_DYNSTRING; + } else if (field->filter_type == FILTER_RDYN_STRING) + hist_field->fn_num = HIST_FIELD_FN_RELDYNSTRING; + else + hist_field->fn_num = HIST_FIELD_FN_PSTRING; + } else { + hist_field->size = field->size; + hist_field->is_signed = field->is_signed; + hist_field->type = kstrdup_const(field->type, GFP_KERNEL); + if (!hist_field->type) + goto free; + + hist_field->fn_num = select_value_fn(field->size, + field->is_signed); + if (hist_field->fn_num == HIST_FIELD_FN_NOP) { + destroy_hist_field(hist_field, 0); + return NULL; + } + } + out: + hist_field->field = field; + hist_field->flags = flags; + + if (var_name) { + hist_field->var.name = kstrdup(var_name, GFP_KERNEL); + if (!hist_field->var.name) + goto free; + } + + return hist_field; + free: + destroy_hist_field(hist_field, 0); + return NULL; +} + +static void destroy_hist_fields(struct hist_trigger_data *hist_data) +{ + unsigned int i; + + for (i = 0; i < HIST_FIELDS_MAX; i++) { + if (hist_data->fields[i]) { + destroy_hist_field(hist_data->fields[i], 0); + hist_data->fields[i] = NULL; + } + } + + for (i = 0; i < hist_data->n_var_refs; i++) { + WARN_ON(!(hist_data->var_refs[i]->flags & HIST_FIELD_FL_VAR_REF)); + __destroy_hist_field(hist_data->var_refs[i]); + hist_data->var_refs[i] = NULL; + } +} + +static int init_var_ref(struct hist_field *ref_field, + struct hist_field *var_field, + char *system, char *event_name) +{ + int err = 0; + + ref_field->var.idx = var_field->var.idx; + ref_field->var.hist_data = var_field->hist_data; + ref_field->size = var_field->size; + ref_field->is_signed = var_field->is_signed; + ref_field->flags |= var_field->flags & + (HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS); + + if (system) { + ref_field->system = kstrdup(system, GFP_KERNEL); + if (!ref_field->system) + return -ENOMEM; + } + + if (event_name) { + ref_field->event_name = kstrdup(event_name, GFP_KERNEL); + if (!ref_field->event_name) { + err = -ENOMEM; + goto free; + } + } + + if (var_field->var.name) { + ref_field->name = kstrdup(var_field->var.name, GFP_KERNEL); + if (!ref_field->name) { + err = -ENOMEM; + goto free; + } + } else if (var_field->name) { + ref_field->name = kstrdup(var_field->name, GFP_KERNEL); + if (!ref_field->name) { + err = -ENOMEM; + goto free; + } + 
} + + ref_field->type = kstrdup_const(var_field->type, GFP_KERNEL); + if (!ref_field->type) { + err = -ENOMEM; + goto free; + } + out: + return err; + free: + kfree(ref_field->system); + ref_field->system = NULL; + kfree(ref_field->event_name); + ref_field->event_name = NULL; + kfree(ref_field->name); + ref_field->name = NULL; + + goto out; +} + +static int find_var_ref_idx(struct hist_trigger_data *hist_data, + struct hist_field *var_field) +{ + struct hist_field *ref_field; + int i; + + for (i = 0; i < hist_data->n_var_refs; i++) { + ref_field = hist_data->var_refs[i]; + if (ref_field->var.idx == var_field->var.idx && + ref_field->var.hist_data == var_field->hist_data) + return i; + } + + return -ENOENT; +} + +/** + * create_var_ref - Create a variable reference and attach it to trigger + * @hist_data: The trigger that will be referencing the variable + * @var_field: The VAR field to create a reference to + * @system: The optional system string + * @event_name: The optional event_name string + * + * Given a variable hist_field, create a VAR_REF hist_field that + * represents a reference to it. + * + * This function also adds the reference to the trigger that + * now references the variable. + * + * Return: The VAR_REF field if successful, NULL if not + */ +static struct hist_field *create_var_ref(struct hist_trigger_data *hist_data, + struct hist_field *var_field, + char *system, char *event_name) +{ + unsigned long flags = HIST_FIELD_FL_VAR_REF; + struct hist_field *ref_field; + int i; + + /* Check if the variable already exists */ + for (i = 0; i < hist_data->n_var_refs; i++) { + ref_field = hist_data->var_refs[i]; + if (ref_field->var.idx == var_field->var.idx && + ref_field->var.hist_data == var_field->hist_data) { + get_hist_field(ref_field); + return ref_field; + } + } + /* Sanity check to avoid out-of-bound write on 'hist_data->var_refs' */ + if (hist_data->n_var_refs >= TRACING_MAP_VARS_MAX) + return NULL; + ref_field = create_hist_field(var_field->hist_data, NULL, flags, NULL); + if (ref_field) { + if (init_var_ref(ref_field, var_field, system, event_name)) { + destroy_hist_field(ref_field, 0); + return NULL; + } + + hist_data->var_refs[hist_data->n_var_refs] = ref_field; + ref_field->var_ref_idx = hist_data->n_var_refs++; + } + + return ref_field; +} + +static bool is_var_ref(char *var_name) +{ + if (!var_name || strlen(var_name) < 2 || var_name[0] != '$') + return false; + + return true; +} + +static char *field_name_from_var(struct hist_trigger_data *hist_data, + char *var_name) +{ + char *name, *field; + unsigned int i; + + for (i = 0; i < hist_data->attrs->var_defs.n_vars; i++) { + name = hist_data->attrs->var_defs.name[i]; + + if (strcmp(var_name, name) == 0) { + field = hist_data->attrs->var_defs.expr[i]; + if (contains_operator(field, NULL) || is_var_ref(field)) + continue; + return field; + } + } + + return NULL; +} + +static char *local_field_var_ref(struct hist_trigger_data *hist_data, + char *system, char *event_name, + char *var_name) +{ + struct trace_event_call *call; + + if (system && event_name) { + call = hist_data->event_file->event_call; + + if (strcmp(system, call->class->system) != 0) + return NULL; + + if (strcmp(event_name, trace_event_name(call)) != 0) + return NULL; + } + + if (!!system != !!event_name) + return NULL; + + if (!is_var_ref(var_name)) + return NULL; + + var_name++; + + return field_name_from_var(hist_data, var_name); +} + +static struct hist_field *parse_var_ref(struct hist_trigger_data *hist_data, + char *system, char *event_name, + char 
*var_name) +{ + struct hist_field *var_field = NULL, *ref_field = NULL; + struct trace_array *tr = hist_data->event_file->tr; + + if (!is_var_ref(var_name)) + return NULL; + + var_name++; + + var_field = find_event_var(hist_data, system, event_name, var_name); + if (var_field) + ref_field = create_var_ref(hist_data, var_field, + system, event_name); + + if (!ref_field) + hist_err(tr, HIST_ERR_VAR_NOT_FOUND, errpos(var_name)); + + return ref_field; +} + +static struct ftrace_event_field * +parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, + char *field_str, unsigned long *flags, unsigned long *buckets) +{ + struct ftrace_event_field *field = NULL; + char *field_name, *modifier, *str; + struct trace_array *tr = file->tr; + + modifier = str = kstrdup(field_str, GFP_KERNEL); + if (!modifier) + return ERR_PTR(-ENOMEM); + + field_name = strsep(&modifier, "."); + if (modifier) { + if (strcmp(modifier, "hex") == 0) + *flags |= HIST_FIELD_FL_HEX; + else if (strcmp(modifier, "sym") == 0) + *flags |= HIST_FIELD_FL_SYM; + /* + * 'sym-offset' occurrences in the trigger string are modified + * to 'symXoffset' to simplify arithmetic expression parsing. + */ + else if (strcmp(modifier, "symXoffset") == 0) + *flags |= HIST_FIELD_FL_SYM_OFFSET; + else if ((strcmp(modifier, "execname") == 0) && + (strcmp(field_name, "common_pid") == 0)) + *flags |= HIST_FIELD_FL_EXECNAME; + else if (strcmp(modifier, "syscall") == 0) + *flags |= HIST_FIELD_FL_SYSCALL; + else if (strcmp(modifier, "stacktrace") == 0) + *flags |= HIST_FIELD_FL_STACKTRACE; + else if (strcmp(modifier, "log2") == 0) + *flags |= HIST_FIELD_FL_LOG2; + else if (strcmp(modifier, "usecs") == 0) + *flags |= HIST_FIELD_FL_TIMESTAMP_USECS; + else if (strncmp(modifier, "bucket", 6) == 0) { + int ret; + + modifier += 6; + + if (*modifier == 's') + modifier++; + if (*modifier != '=') + goto error; + modifier++; + ret = kstrtoul(modifier, 0, buckets); + if (ret || !(*buckets)) + goto error; + *flags |= HIST_FIELD_FL_BUCKET; + } else if (strncmp(modifier, "percent", 7) == 0) { + if (*flags & (HIST_FIELD_FL_VAR | HIST_FIELD_FL_KEY)) + goto error; + *flags |= HIST_FIELD_FL_PERCENT; + } else if (strncmp(modifier, "graph", 5) == 0) { + if (*flags & (HIST_FIELD_FL_VAR | HIST_FIELD_FL_KEY)) + goto error; + *flags |= HIST_FIELD_FL_GRAPH; + } else { + error: + hist_err(tr, HIST_ERR_BAD_FIELD_MODIFIER, errpos(modifier)); + field = ERR_PTR(-EINVAL); + goto out; + } + } + + if (strcmp(field_name, "common_timestamp") == 0) { + *flags |= HIST_FIELD_FL_TIMESTAMP; + hist_data->enable_timestamps = true; + if (*flags & HIST_FIELD_FL_TIMESTAMP_USECS) + hist_data->attrs->ts_in_usecs = true; + } else if (strcmp(field_name, "common_stacktrace") == 0) { + *flags |= HIST_FIELD_FL_STACKTRACE; + } else if (strcmp(field_name, "common_cpu") == 0) + *flags |= HIST_FIELD_FL_CPU; + else if (strcmp(field_name, "hitcount") == 0) + *flags |= HIST_FIELD_FL_HITCOUNT; + else { + field = trace_find_event_field(file->event_call, field_name); + if (!field || !field->size) { + /* + * For backward compatibility, if field_name + * was "cpu" or "stacktrace", then we treat this + * the same as common_cpu and common_stacktrace + * respectively. This also works for "CPU", and + * "STACKTRACE". 
+ */ + if (field && field->filter_type == FILTER_CPU) { + *flags |= HIST_FIELD_FL_CPU; + } else if (field && field->filter_type == FILTER_STACKTRACE) { + *flags |= HIST_FIELD_FL_STACKTRACE; + } else { + hist_err(tr, HIST_ERR_FIELD_NOT_FOUND, + errpos(field_name)); + field = ERR_PTR(-EINVAL); + goto out; + } + } + } + out: + kfree(str); + + return field; +} + +static struct hist_field *create_alias(struct hist_trigger_data *hist_data, + struct hist_field *var_ref, + char *var_name) +{ + struct hist_field *alias = NULL; + unsigned long flags = HIST_FIELD_FL_ALIAS | HIST_FIELD_FL_VAR; + + alias = create_hist_field(hist_data, NULL, flags, var_name); + if (!alias) + return NULL; + + alias->fn_num = var_ref->fn_num; + alias->operands[0] = var_ref; + + if (init_var_ref(alias, var_ref, var_ref->system, var_ref->event_name)) { + destroy_hist_field(alias, 0); + return NULL; + } + + alias->var_ref_idx = var_ref->var_ref_idx; + + return alias; +} + +static struct hist_field *parse_const(struct hist_trigger_data *hist_data, + char *str, char *var_name, + unsigned long *flags) +{ + struct trace_array *tr = hist_data->event_file->tr; + struct hist_field *field = NULL; + u64 constant; + + if (kstrtoull(str, 0, &constant)) { + hist_err(tr, HIST_ERR_EXPECT_NUMBER, errpos(str)); + return NULL; + } + + *flags |= HIST_FIELD_FL_CONST; + field = create_hist_field(hist_data, NULL, *flags, var_name); + if (!field) + return NULL; + + field->constant = constant; + + return field; +} + +static struct hist_field *parse_atom(struct hist_trigger_data *hist_data, + struct trace_event_file *file, char *str, + unsigned long *flags, char *var_name) +{ + char *s, *ref_system = NULL, *ref_event = NULL, *ref_var = str; + struct ftrace_event_field *field = NULL; + struct hist_field *hist_field = NULL; + unsigned long buckets = 0; + int ret = 0; + + if (isdigit(str[0])) { + hist_field = parse_const(hist_data, str, var_name, flags); + if (!hist_field) { + ret = -EINVAL; + goto out; + } + return hist_field; + } + + s = strchr(str, '.'); + if (s) { + s = strchr(++s, '.'); + if (s) { + ref_system = strsep(&str, "."); + if (!str) { + ret = -EINVAL; + goto out; + } + ref_event = strsep(&str, "."); + if (!str) { + ret = -EINVAL; + goto out; + } + ref_var = str; + } + } + + s = local_field_var_ref(hist_data, ref_system, ref_event, ref_var); + if (!s) { + hist_field = parse_var_ref(hist_data, ref_system, + ref_event, ref_var); + if (hist_field) { + if (var_name) { + hist_field = create_alias(hist_data, hist_field, var_name); + if (!hist_field) { + ret = -ENOMEM; + goto out; + } + } + return hist_field; + } + } else + str = s; + + field = parse_field(hist_data, file, str, flags, &buckets); + if (IS_ERR(field)) { + ret = PTR_ERR(field); + goto out; + } + + hist_field = create_hist_field(hist_data, field, *flags, var_name); + if (!hist_field) { + ret = -ENOMEM; + goto out; + } + hist_field->buckets = buckets; + + return hist_field; + out: + return ERR_PTR(ret); +} + +static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, + struct trace_event_file *file, + char *str, unsigned long flags, + char *var_name, unsigned int *n_subexprs); + +static struct hist_field *parse_unary(struct hist_trigger_data *hist_data, + struct trace_event_file *file, + char *str, unsigned long flags, + char *var_name, unsigned int *n_subexprs) +{ + struct hist_field *operand1, *expr = NULL; + unsigned long operand_flags; + int ret = 0; + char *s; + + /* Unary minus operator, increment n_subexprs */ + ++*n_subexprs; + + /* we support only -(xxx) i.e. 
explicit parens required */ + + if (*n_subexprs > 3) { + hist_err(file->tr, HIST_ERR_TOO_MANY_SUBEXPR, errpos(str)); + ret = -EINVAL; + goto free; + } + + str++; /* skip leading '-' */ + + s = strchr(str, '('); + if (s) + str++; + else { + ret = -EINVAL; + goto free; + } + + s = strrchr(str, ')'); + if (s) { + /* unary minus not supported in sub-expressions */ + if (*(s+1) != '\0') { + hist_err(file->tr, HIST_ERR_UNARY_MINUS_SUBEXPR, + errpos(str)); + ret = -EINVAL; + goto free; + } + *s = '\0'; + } + else { + ret = -EINVAL; /* no closing ')' */ + goto free; + } + + flags |= HIST_FIELD_FL_EXPR; + expr = create_hist_field(hist_data, NULL, flags, var_name); + if (!expr) { + ret = -ENOMEM; + goto free; + } + + operand_flags = 0; + operand1 = parse_expr(hist_data, file, str, operand_flags, NULL, n_subexprs); + if (IS_ERR(operand1)) { + ret = PTR_ERR(operand1); + goto free; + } + if (operand1->flags & HIST_FIELD_FL_STRING) { + /* String type can not be the operand of unary operator. */ + hist_err(file->tr, HIST_ERR_INVALID_STR_OPERAND, errpos(str)); + destroy_hist_field(operand1, 0); + ret = -EINVAL; + goto free; + } + + expr->flags |= operand1->flags & + (HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS); + expr->fn_num = HIST_FIELD_FN_UMINUS; + expr->operands[0] = operand1; + expr->size = operand1->size; + expr->is_signed = operand1->is_signed; + expr->operator = FIELD_OP_UNARY_MINUS; + expr->name = expr_str(expr, 0); + expr->type = kstrdup_const(operand1->type, GFP_KERNEL); + if (!expr->type) { + ret = -ENOMEM; + goto free; + } + + return expr; + free: + destroy_hist_field(expr, 0); + return ERR_PTR(ret); +} + +/* + * If the operands are var refs, return pointers the + * variable(s) referenced in var1 and var2, else NULL. + */ +static int check_expr_operands(struct trace_array *tr, + struct hist_field *operand1, + struct hist_field *operand2, + struct hist_field **var1, + struct hist_field **var2) +{ + unsigned long operand1_flags = operand1->flags; + unsigned long operand2_flags = operand2->flags; + + if ((operand1_flags & HIST_FIELD_FL_VAR_REF) || + (operand1_flags & HIST_FIELD_FL_ALIAS)) { + struct hist_field *var; + + var = find_var_field(operand1->var.hist_data, operand1->name); + if (!var) + return -EINVAL; + operand1_flags = var->flags; + *var1 = var; + } + + if ((operand2_flags & HIST_FIELD_FL_VAR_REF) || + (operand2_flags & HIST_FIELD_FL_ALIAS)) { + struct hist_field *var; + + var = find_var_field(operand2->var.hist_data, operand2->name); + if (!var) + return -EINVAL; + operand2_flags = var->flags; + *var2 = var; + } + + if ((operand1_flags & HIST_FIELD_FL_TIMESTAMP_USECS) != + (operand2_flags & HIST_FIELD_FL_TIMESTAMP_USECS)) { + hist_err(tr, HIST_ERR_TIMESTAMP_MISMATCH, 0); + return -EINVAL; + } + + return 0; +} + +static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, + struct trace_event_file *file, + char *str, unsigned long flags, + char *var_name, unsigned int *n_subexprs) +{ + struct hist_field *operand1 = NULL, *operand2 = NULL, *expr = NULL; + struct hist_field *var1 = NULL, *var2 = NULL; + unsigned long operand_flags, operand2_flags; + int field_op, ret = -EINVAL; + char *sep, *operand1_str; + enum hist_field_fn op_fn; + bool combine_consts; + + if (*n_subexprs > 3) { + hist_err(file->tr, HIST_ERR_TOO_MANY_SUBEXPR, errpos(str)); + return ERR_PTR(-EINVAL); + } + + field_op = contains_operator(str, &sep); + + if (field_op == FIELD_OP_NONE) + return parse_atom(hist_data, file, str, &flags, var_name); + + if (field_op == FIELD_OP_UNARY_MINUS) + 
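+		/* hand the whole '-(...)' expression to parse_unary() */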
return parse_unary(hist_data, file, str, flags, var_name, n_subexprs); + + /* Binary operator found, increment n_subexprs */ + ++*n_subexprs; + + /* Split the expression string at the root operator */ + if (!sep) + return ERR_PTR(-EINVAL); + + *sep = '\0'; + operand1_str = str; + str = sep+1; + + /* Binary operator requires both operands */ + if (*operand1_str == '\0' || *str == '\0') + return ERR_PTR(-EINVAL); + + operand_flags = 0; + + /* LHS of string is an expression e.g. a+b in a+b+c */ + operand1 = parse_expr(hist_data, file, operand1_str, operand_flags, NULL, n_subexprs); + if (IS_ERR(operand1)) + return ERR_CAST(operand1); + + if (operand1->flags & HIST_FIELD_FL_STRING) { + hist_err(file->tr, HIST_ERR_INVALID_STR_OPERAND, errpos(operand1_str)); + ret = -EINVAL; + goto free_op1; + } + + /* RHS of string is another expression e.g. c in a+b+c */ + operand_flags = 0; + operand2 = parse_expr(hist_data, file, str, operand_flags, NULL, n_subexprs); + if (IS_ERR(operand2)) { + ret = PTR_ERR(operand2); + goto free_op1; + } + if (operand2->flags & HIST_FIELD_FL_STRING) { + hist_err(file->tr, HIST_ERR_INVALID_STR_OPERAND, errpos(str)); + ret = -EINVAL; + goto free_operands; + } + + switch (field_op) { + case FIELD_OP_MINUS: + op_fn = HIST_FIELD_FN_MINUS; + break; + case FIELD_OP_PLUS: + op_fn = HIST_FIELD_FN_PLUS; + break; + case FIELD_OP_DIV: + op_fn = HIST_FIELD_FN_DIV; + break; + case FIELD_OP_MULT: + op_fn = HIST_FIELD_FN_MULT; + break; + default: + ret = -EINVAL; + goto free_operands; + } + + ret = check_expr_operands(file->tr, operand1, operand2, &var1, &var2); + if (ret) + goto free_operands; + + operand_flags = var1 ? var1->flags : operand1->flags; + operand2_flags = var2 ? var2->flags : operand2->flags; + + /* + * If both operands are constant, the expression can be + * collapsed to a single constant. + */ + combine_consts = operand_flags & operand2_flags & HIST_FIELD_FL_CONST; + + flags |= combine_consts ? HIST_FIELD_FL_CONST : HIST_FIELD_FL_EXPR; + + flags |= operand1->flags & + (HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS); + + expr = create_hist_field(hist_data, NULL, flags, var_name); + if (!expr) { + ret = -ENOMEM; + goto free_operands; + } + + operand1->read_once = true; + operand2->read_once = true; + + /* The operands are now owned and free'd by 'expr' */ + expr->operands[0] = operand1; + expr->operands[1] = operand2; + + if (field_op == FIELD_OP_DIV && + operand2_flags & HIST_FIELD_FL_CONST) { + u64 divisor = var2 ? 
var2->constant : operand2->constant; + + if (!divisor) { + hist_err(file->tr, HIST_ERR_DIVISION_BY_ZERO, errpos(str)); + ret = -EDOM; + goto free_expr; + } + + /* + * Copy the divisor here so we don't have to look it up + * later if this is a var ref + */ + operand2->constant = divisor; + op_fn = hist_field_get_div_fn(operand2); + } + + expr->fn_num = op_fn; + + if (combine_consts) { + if (var1) + expr->operands[0] = var1; + if (var2) + expr->operands[1] = var2; + + expr->constant = hist_fn_call(expr, NULL, NULL, NULL, NULL); + expr->fn_num = HIST_FIELD_FN_CONST; + + expr->operands[0] = NULL; + expr->operands[1] = NULL; + + /* + * var refs won't be destroyed immediately + * See: destroy_hist_field() + */ + destroy_hist_field(operand2, 0); + destroy_hist_field(operand1, 0); + + expr->name = expr_str(expr, 0); + } else { + /* The operand sizes should be the same, so just pick one */ + expr->size = operand1->size; + expr->is_signed = operand1->is_signed; + + expr->operator = field_op; + expr->type = kstrdup_const(operand1->type, GFP_KERNEL); + if (!expr->type) { + ret = -ENOMEM; + goto free_expr; + } + + expr->name = expr_str(expr, 0); + } + + return expr; + +free_operands: + destroy_hist_field(operand2, 0); +free_op1: + destroy_hist_field(operand1, 0); + return ERR_PTR(ret); + +free_expr: + destroy_hist_field(expr, 0); + return ERR_PTR(ret); +} + +static char *find_trigger_filter(struct hist_trigger_data *hist_data, + struct trace_event_file *file) +{ + struct event_trigger_data *test; + + lockdep_assert_held(&event_mutex); + + list_for_each_entry(test, &file->triggers, list) { + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { + if (test->private_data == hist_data) + return test->filter_str; + } + } + + return NULL; +} + +static struct event_command trigger_hist_cmd; +static int event_hist_trigger_parse(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, char *cmd, + char *param_and_filter); + +static bool compatible_keys(struct hist_trigger_data *target_hist_data, + struct hist_trigger_data *hist_data, + unsigned int n_keys) +{ + struct hist_field *target_hist_field, *hist_field; + unsigned int n, i, j; + + if (hist_data->n_fields - hist_data->n_vals != n_keys) + return false; + + i = hist_data->n_vals; + j = target_hist_data->n_vals; + + for (n = 0; n < n_keys; n++) { + hist_field = hist_data->fields[i + n]; + target_hist_field = target_hist_data->fields[j + n]; + + if (strcmp(hist_field->type, target_hist_field->type) != 0) + return false; + if (hist_field->size != target_hist_field->size) + return false; + if (hist_field->is_signed != target_hist_field->is_signed) + return false; + } + + return true; +} + +static struct hist_trigger_data * +find_compatible_hist(struct hist_trigger_data *target_hist_data, + struct trace_event_file *file) +{ + struct hist_trigger_data *hist_data; + struct event_trigger_data *test; + unsigned int n_keys; + + lockdep_assert_held(&event_mutex); + + n_keys = target_hist_data->n_fields - target_hist_data->n_vals; + + list_for_each_entry(test, &file->triggers, list) { + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { + hist_data = test->private_data; + + if (compatible_keys(target_hist_data, hist_data, n_keys)) + return hist_data; + } + } + + return NULL; +} + +static struct trace_event_file *event_file(struct trace_array *tr, + char *system, char *event_name) +{ + struct trace_event_file *file; + + file = __find_event_file(tr, system, event_name); + if (!file) + return ERR_PTR(-EINVAL); + + return file; +} + +static struct 
hist_field * +find_synthetic_field_var(struct hist_trigger_data *target_hist_data, + char *system, char *event_name, char *field_name) +{ + struct hist_field *event_var; + char *synthetic_name; + + synthetic_name = kzalloc(MAX_FILTER_STR_VAL, GFP_KERNEL); + if (!synthetic_name) + return ERR_PTR(-ENOMEM); + + strcpy(synthetic_name, "synthetic_"); + strcat(synthetic_name, field_name); + + event_var = find_event_var(target_hist_data, system, event_name, synthetic_name); + + kfree(synthetic_name); + + return event_var; +} + +/** + * create_field_var_hist - Automatically create a histogram and var for a field + * @target_hist_data: The target hist trigger + * @subsys_name: Optional subsystem name + * @event_name: Optional event name + * @field_name: The name of the field (and the resulting variable) + * + * Hist trigger actions fetch data from variables, not directly from + * events. However, for convenience, users are allowed to directly + * specify an event field in an action, which will be automatically + * converted into a variable on their behalf. + * + * If a user specifies a field on an event that isn't the event the + * histogram currently being defined (the target event histogram), the + * only way that can be accomplished is if a new hist trigger is + * created and the field variable defined on that. + * + * This function creates a new histogram compatible with the target + * event (meaning a histogram with the same key as the target + * histogram), and creates a variable for the specified field, but + * with 'synthetic_' prepended to the variable name in order to avoid + * collision with normal field variables. + * + * Return: The variable created for the field. + */ +static struct hist_field * +create_field_var_hist(struct hist_trigger_data *target_hist_data, + char *subsys_name, char *event_name, char *field_name) +{ + struct trace_array *tr = target_hist_data->event_file->tr; + struct hist_trigger_data *hist_data; + unsigned int i, n, first = true; + struct field_var_hist *var_hist; + struct trace_event_file *file; + struct hist_field *key_field; + struct hist_field *event_var; + char *saved_filter; + char *cmd; + int ret; + + if (target_hist_data->n_field_var_hists >= SYNTH_FIELDS_MAX) { + hist_err(tr, HIST_ERR_TOO_MANY_FIELD_VARS, errpos(field_name)); + return ERR_PTR(-EINVAL); + } + + file = event_file(tr, subsys_name, event_name); + + if (IS_ERR(file)) { + hist_err(tr, HIST_ERR_EVENT_FILE_NOT_FOUND, errpos(field_name)); + ret = PTR_ERR(file); + return ERR_PTR(ret); + } + + /* + * Look for a histogram compatible with target. We'll use the + * found histogram specification to create a new matching + * histogram with our variable on it. target_hist_data is not + * yet a registered histogram so we can't use that. 
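+	 *
+	 * The trigger created below takes the form
+	 * 'keys=<compatible hist keys>:synthetic_<field>=<field> [if <filter>]'
+	 * and is attached to the event the field belongs to.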
+ */ + hist_data = find_compatible_hist(target_hist_data, file); + if (!hist_data) { + hist_err(tr, HIST_ERR_HIST_NOT_FOUND, errpos(field_name)); + return ERR_PTR(-EINVAL); + } + + /* See if a synthetic field variable has already been created */ + event_var = find_synthetic_field_var(target_hist_data, subsys_name, + event_name, field_name); + if (!IS_ERR_OR_NULL(event_var)) + return event_var; + + var_hist = kzalloc(sizeof(*var_hist), GFP_KERNEL); + if (!var_hist) + return ERR_PTR(-ENOMEM); + + cmd = kzalloc(MAX_FILTER_STR_VAL, GFP_KERNEL); + if (!cmd) { + kfree(var_hist); + return ERR_PTR(-ENOMEM); + } + + /* Use the same keys as the compatible histogram */ + strcat(cmd, "keys="); + + for_each_hist_key_field(i, hist_data) { + key_field = hist_data->fields[i]; + if (!first) + strcat(cmd, ","); + strcat(cmd, key_field->field->name); + first = false; + } + + /* Create the synthetic field variable specification */ + strcat(cmd, ":synthetic_"); + strcat(cmd, field_name); + strcat(cmd, "="); + strcat(cmd, field_name); + + /* Use the same filter as the compatible histogram */ + saved_filter = find_trigger_filter(hist_data, file); + if (saved_filter) { + strcat(cmd, " if "); + strcat(cmd, saved_filter); + } + + var_hist->cmd = kstrdup(cmd, GFP_KERNEL); + if (!var_hist->cmd) { + kfree(cmd); + kfree(var_hist); + return ERR_PTR(-ENOMEM); + } + + /* Save the compatible histogram information */ + var_hist->hist_data = hist_data; + + /* Create the new histogram with our variable */ + ret = event_hist_trigger_parse(&trigger_hist_cmd, file, + "", "hist", cmd); + if (ret) { + kfree(cmd); + kfree(var_hist->cmd); + kfree(var_hist); + hist_err(tr, HIST_ERR_HIST_CREATE_FAIL, errpos(field_name)); + return ERR_PTR(ret); + } + + kfree(cmd); + + /* If we can't find the variable, something went wrong */ + event_var = find_synthetic_field_var(target_hist_data, subsys_name, + event_name, field_name); + if (IS_ERR_OR_NULL(event_var)) { + kfree(var_hist->cmd); + kfree(var_hist); + hist_err(tr, HIST_ERR_SYNTH_VAR_NOT_FOUND, errpos(field_name)); + return ERR_PTR(-EINVAL); + } + + n = target_hist_data->n_field_var_hists; + target_hist_data->field_var_hists[n] = var_hist; + target_hist_data->n_field_var_hists++; + + return event_var; +} + +static struct hist_field * +find_target_event_var(struct hist_trigger_data *hist_data, + char *subsys_name, char *event_name, char *var_name) +{ + struct trace_event_file *file = hist_data->event_file; + struct hist_field *hist_field = NULL; + + if (subsys_name) { + struct trace_event_call *call; + + if (!event_name) + return NULL; + + call = file->event_call; + + if (strcmp(subsys_name, call->class->system) != 0) + return NULL; + + if (strcmp(event_name, trace_event_name(call)) != 0) + return NULL; + } + + hist_field = find_var_field(hist_data, var_name); + + return hist_field; +} + +static inline void __update_field_vars(struct tracing_map_elt *elt, + struct trace_buffer *buffer, + struct ring_buffer_event *rbe, + void *rec, + struct field_var **field_vars, + unsigned int n_field_vars, + unsigned int field_var_str_start) +{ + struct hist_elt_data *elt_data = elt->private_data; + unsigned int i, j, var_idx; + u64 var_val; + + /* Make sure stacktrace can fit in the string variable length */ + BUILD_BUG_ON((HIST_STACKTRACE_DEPTH + 1) * sizeof(long) >= STR_VAR_LEN_MAX); + + for (i = 0, j = field_var_str_start; i < n_field_vars; i++) { + struct field_var *field_var = field_vars[i]; + struct hist_field *var = field_var->var; + struct hist_field *val = field_var->val; + + var_val = 
hist_fn_call(val, elt, buffer, rbe, rec); + var_idx = var->var.idx; + + if (val->flags & (HIST_FIELD_FL_STRING | + HIST_FIELD_FL_STACKTRACE)) { + char *str = elt_data->field_var_str[j++]; + char *val_str = (char *)(uintptr_t)var_val; + unsigned int size; + + if (val->flags & HIST_FIELD_FL_STRING) { + size = min(val->size, STR_VAR_LEN_MAX); + strscpy(str, val_str, size); + } else { + char *stack_start = str + sizeof(unsigned long); + int e; + + e = stack_trace_save((void *)stack_start, + HIST_STACKTRACE_DEPTH, + HIST_STACKTRACE_SKIP); + if (e < HIST_STACKTRACE_DEPTH - 1) + ((unsigned long *)stack_start)[e] = 0; + *((unsigned long *)str) = e; + } + var_val = (u64)(uintptr_t)str; + } + tracing_map_set_var(elt, var_idx, var_val); + } +} + +static void update_field_vars(struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, + struct ring_buffer_event *rbe, + void *rec) +{ + __update_field_vars(elt, buffer, rbe, rec, hist_data->field_vars, + hist_data->n_field_vars, 0); +} + +static void save_track_data_vars(struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, void *rec, + struct ring_buffer_event *rbe, void *key, + struct action_data *data, u64 *var_ref_vals) +{ + __update_field_vars(elt, buffer, rbe, rec, hist_data->save_vars, + hist_data->n_save_vars, hist_data->n_field_var_str); +} + +static struct hist_field *create_var(struct hist_trigger_data *hist_data, + struct trace_event_file *file, + char *name, int size, const char *type) +{ + struct hist_field *var; + int idx; + + if (find_var(hist_data, file, name) && !hist_data->remove) { + var = ERR_PTR(-EINVAL); + goto out; + } + + var = kzalloc(sizeof(struct hist_field), GFP_KERNEL); + if (!var) { + var = ERR_PTR(-ENOMEM); + goto out; + } + + idx = tracing_map_add_var(hist_data->map); + if (idx < 0) { + kfree(var); + var = ERR_PTR(-EINVAL); + goto out; + } + + var->ref = 1; + var->flags = HIST_FIELD_FL_VAR; + var->var.idx = idx; + var->var.hist_data = var->hist_data = hist_data; + var->size = size; + var->var.name = kstrdup(name, GFP_KERNEL); + var->type = kstrdup_const(type, GFP_KERNEL); + if (!var->var.name || !var->type) { + kfree_const(var->type); + kfree(var->var.name); + kfree(var); + var = ERR_PTR(-ENOMEM); + } + out: + return var; +} + +static struct field_var *create_field_var(struct hist_trigger_data *hist_data, + struct trace_event_file *file, + char *field_name) +{ + struct hist_field *val = NULL, *var = NULL; + unsigned long flags = HIST_FIELD_FL_VAR; + struct trace_array *tr = file->tr; + struct field_var *field_var; + int ret = 0; + + if (hist_data->n_field_vars >= SYNTH_FIELDS_MAX) { + hist_err(tr, HIST_ERR_TOO_MANY_FIELD_VARS, errpos(field_name)); + ret = -EINVAL; + goto err; + } + + val = parse_atom(hist_data, file, field_name, &flags, NULL); + if (IS_ERR(val)) { + hist_err(tr, HIST_ERR_FIELD_VAR_PARSE_FAIL, errpos(field_name)); + ret = PTR_ERR(val); + goto err; + } + + var = create_var(hist_data, file, field_name, val->size, val->type); + if (IS_ERR(var)) { + hist_err(tr, HIST_ERR_VAR_CREATE_FIND_FAIL, errpos(field_name)); + kfree(val); + ret = PTR_ERR(var); + goto err; + } + + field_var = kzalloc(sizeof(struct field_var), GFP_KERNEL); + if (!field_var) { + kfree(val); + kfree(var); + ret = -ENOMEM; + goto err; + } + + field_var->var = var; + field_var->val = val; + out: + return field_var; + err: + field_var = ERR_PTR(ret); + goto out; +} + +/** + * create_target_field_var - Automatically create a variable for a field + * 
@target_hist_data: The target hist trigger + * @subsys_name: Optional subsystem name + * @event_name: Optional event name + * @var_name: The name of the field (and the resulting variable) + * + * Hist trigger actions fetch data from variables, not directly from + * events. However, for convenience, users are allowed to directly + * specify an event field in an action, which will be automatically + * converted into a variable on their behalf. + * + * This function creates a field variable with the name var_name on + * the hist trigger currently being defined on the target event. If + * subsys_name and event_name are specified, this function simply + * verifies that they do in fact match the target event subsystem and + * event name. + * + * Return: The variable created for the field. + */ +static struct field_var * +create_target_field_var(struct hist_trigger_data *target_hist_data, + char *subsys_name, char *event_name, char *var_name) +{ + struct trace_event_file *file = target_hist_data->event_file; + + if (subsys_name) { + struct trace_event_call *call; + + if (!event_name) + return NULL; + + call = file->event_call; + + if (strcmp(subsys_name, call->class->system) != 0) + return NULL; + + if (strcmp(event_name, trace_event_name(call)) != 0) + return NULL; + } + + return create_field_var(target_hist_data, file, var_name); +} + +static bool check_track_val_max(u64 track_val, u64 var_val) +{ + if (var_val <= track_val) + return false; + + return true; +} + +static bool check_track_val_changed(u64 track_val, u64 var_val) +{ + if (var_val == track_val) + return false; + + return true; +} + +static u64 get_track_val(struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, + struct action_data *data) +{ + unsigned int track_var_idx = data->track_data.track_var->var.idx; + u64 track_val; + + track_val = tracing_map_read_var(elt, track_var_idx); + + return track_val; +} + +static void save_track_val(struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, + struct action_data *data, u64 var_val) +{ + unsigned int track_var_idx = data->track_data.track_var->var.idx; + + tracing_map_set_var(elt, track_var_idx, var_val); +} + +static void save_track_data(struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, void *rec, + struct ring_buffer_event *rbe, void *key, + struct action_data *data, u64 *var_ref_vals) +{ + if (data->track_data.save_data) + data->track_data.save_data(hist_data, elt, buffer, rec, rbe, + key, data, var_ref_vals); +} + +static bool check_track_val(struct tracing_map_elt *elt, + struct action_data *data, + u64 var_val) +{ + struct hist_trigger_data *hist_data; + u64 track_val; + + hist_data = data->track_data.track_var->hist_data; + track_val = get_track_val(hist_data, elt, data); + + return data->track_data.check_val(track_val, var_val); +} + +#ifdef CONFIG_TRACER_SNAPSHOT +static bool cond_snapshot_update(struct trace_array *tr, void *cond_data) +{ + /* called with tr->max_lock held */ + struct track_data *track_data = tr->cond_snapshot->cond_data; + struct hist_elt_data *elt_data, *track_elt_data; + struct snapshot_context *context = cond_data; + struct action_data *action; + u64 track_val; + + if (!track_data) + return false; + + action = track_data->action_data; + + track_val = get_track_val(track_data->hist_data, context->elt, + track_data->action_data); + + if (!action->track_data.check_val(track_data->track_val, track_val)) + return false; + + track_data->track_val = track_val; + memcpy(track_data->key, 
context->key, track_data->key_len); + + elt_data = context->elt->private_data; + track_elt_data = track_data->elt.private_data; + if (elt_data->comm) + strncpy(track_elt_data->comm, elt_data->comm, TASK_COMM_LEN); + + track_data->updated = true; + + return true; +} + +static void save_track_data_snapshot(struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, void *rec, + struct ring_buffer_event *rbe, void *key, + struct action_data *data, + u64 *var_ref_vals) +{ + struct trace_event_file *file = hist_data->event_file; + struct snapshot_context context; + + context.elt = elt; + context.key = key; + + tracing_snapshot_cond(file->tr, &context); +} + +static void hist_trigger_print_key(struct seq_file *m, + struct hist_trigger_data *hist_data, + void *key, + struct tracing_map_elt *elt); + +static struct action_data *snapshot_action(struct hist_trigger_data *hist_data) +{ + unsigned int i; + + if (!hist_data->n_actions) + return NULL; + + for (i = 0; i < hist_data->n_actions; i++) { + struct action_data *data = hist_data->actions[i]; + + if (data->action == ACTION_SNAPSHOT) + return data; + } + + return NULL; +} + +static void track_data_snapshot_print(struct seq_file *m, + struct hist_trigger_data *hist_data) +{ + struct trace_event_file *file = hist_data->event_file; + struct track_data *track_data; + struct action_data *action; + + track_data = tracing_cond_snapshot_data(file->tr); + if (!track_data) + return; + + if (!track_data->updated) + return; + + action = snapshot_action(hist_data); + if (!action) + return; + + seq_puts(m, "\nSnapshot taken (see tracing/snapshot). Details:\n"); + seq_printf(m, "\ttriggering value { %s(%s) }: %10llu", + action->handler == HANDLER_ONMAX ? "onmax" : "onchange", + action->track_data.var_str, track_data->track_val); + + seq_puts(m, "\ttriggered by event with key: "); + hist_trigger_print_key(m, hist_data, track_data->key, &track_data->elt); + seq_putc(m, '\n'); +} +#else +static bool cond_snapshot_update(struct trace_array *tr, void *cond_data) +{ + return false; +} +static void save_track_data_snapshot(struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, void *rec, + struct ring_buffer_event *rbe, void *key, + struct action_data *data, + u64 *var_ref_vals) {} +static void track_data_snapshot_print(struct seq_file *m, + struct hist_trigger_data *hist_data) {} +#endif /* CONFIG_TRACER_SNAPSHOT */ + +static void track_data_print(struct seq_file *m, + struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, + struct action_data *data) +{ + u64 track_val = get_track_val(hist_data, elt, data); + unsigned int i, save_var_idx; + + if (data->handler == HANDLER_ONMAX) + seq_printf(m, "\n\tmax: %10llu", track_val); + else if (data->handler == HANDLER_ONCHANGE) + seq_printf(m, "\n\tchanged: %10llu", track_val); + + if (data->action == ACTION_SNAPSHOT) + return; + + for (i = 0; i < hist_data->n_save_vars; i++) { + struct hist_field *save_val = hist_data->save_vars[i]->val; + struct hist_field *save_var = hist_data->save_vars[i]->var; + u64 val; + + save_var_idx = save_var->var.idx; + + val = tracing_map_read_var(elt, save_var_idx); + + if (save_val->flags & HIST_FIELD_FL_STRING) { + seq_printf(m, " %s: %-32s", save_var->var.name, + (char *)(uintptr_t)(val)); + } else + seq_printf(m, " %s: %10llu", save_var->var.name, val); + } +} + +static void ontrack_action(struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, void *rec, 
+ struct ring_buffer_event *rbe, void *key, + struct action_data *data, u64 *var_ref_vals) +{ + u64 var_val = var_ref_vals[data->track_data.var_ref->var_ref_idx]; + + if (check_track_val(elt, data, var_val)) { + save_track_val(hist_data, elt, data, var_val); + save_track_data(hist_data, elt, buffer, rec, rbe, + key, data, var_ref_vals); + } +} + +static void action_data_destroy(struct action_data *data) +{ + unsigned int i; + + lockdep_assert_held(&event_mutex); + + kfree(data->action_name); + + for (i = 0; i < data->n_params; i++) + kfree(data->params[i]); + + if (data->synth_event) + data->synth_event->ref--; + + kfree(data->synth_event_name); + + kfree(data); +} + +static void track_data_destroy(struct hist_trigger_data *hist_data, + struct action_data *data) +{ + struct trace_event_file *file = hist_data->event_file; + + destroy_hist_field(data->track_data.track_var, 0); + + if (data->action == ACTION_SNAPSHOT) { + struct track_data *track_data; + + track_data = tracing_cond_snapshot_data(file->tr); + if (track_data && track_data->hist_data == hist_data) { + tracing_snapshot_cond_disable(file->tr); + track_data_free(track_data); + } + } + + kfree(data->track_data.var_str); + + action_data_destroy(data); +} + +static int action_create(struct hist_trigger_data *hist_data, + struct action_data *data); + +static int track_data_create(struct hist_trigger_data *hist_data, + struct action_data *data) +{ + struct hist_field *var_field, *ref_field, *track_var = NULL; + struct trace_event_file *file = hist_data->event_file; + struct trace_array *tr = file->tr; + char *track_data_var_str; + int ret = 0; + + track_data_var_str = data->track_data.var_str; + if (track_data_var_str[0] != '$') { + hist_err(tr, HIST_ERR_ONX_NOT_VAR, errpos(track_data_var_str)); + return -EINVAL; + } + track_data_var_str++; + + var_field = find_target_event_var(hist_data, NULL, NULL, track_data_var_str); + if (!var_field) { + hist_err(tr, HIST_ERR_ONX_VAR_NOT_FOUND, errpos(track_data_var_str)); + return -EINVAL; + } + + ref_field = create_var_ref(hist_data, var_field, NULL, NULL); + if (!ref_field) + return -ENOMEM; + + data->track_data.var_ref = ref_field; + + if (data->handler == HANDLER_ONMAX) + track_var = create_var(hist_data, file, "__max", sizeof(u64), "u64"); + if (IS_ERR(track_var)) { + hist_err(tr, HIST_ERR_ONX_VAR_CREATE_FAIL, 0); + ret = PTR_ERR(track_var); + goto out; + } + + if (data->handler == HANDLER_ONCHANGE) + track_var = create_var(hist_data, file, "__change", sizeof(u64), "u64"); + if (IS_ERR(track_var)) { + hist_err(tr, HIST_ERR_ONX_VAR_CREATE_FAIL, 0); + ret = PTR_ERR(track_var); + goto out; + } + data->track_data.track_var = track_var; + + ret = action_create(hist_data, data); + out: + return ret; +} + +static int parse_action_params(struct trace_array *tr, char *params, + struct action_data *data) +{ + char *param, *saved_param; + bool first_param = true; + int ret = 0; + + while (params) { + if (data->n_params >= SYNTH_FIELDS_MAX) { + hist_err(tr, HIST_ERR_TOO_MANY_PARAMS, 0); + ret = -EINVAL; + goto out; + } + + param = strsep(&params, ","); + if (!param) { + hist_err(tr, HIST_ERR_PARAM_NOT_FOUND, 0); + ret = -EINVAL; + goto out; + } + + param = strstrip(param); + if (strlen(param) < 2) { + hist_err(tr, HIST_ERR_INVALID_PARAM, errpos(param)); + ret = -EINVAL; + goto out; + } + + saved_param = kstrdup(param, GFP_KERNEL); + if (!saved_param) { + ret = -ENOMEM; + goto out; + } + + if (first_param && data->use_trace_keyword) { + data->synth_event_name = saved_param; + first_param = false; + continue; 
+ } + first_param = false; + + data->params[data->n_params++] = saved_param; + } + out: + return ret; +} + +static int action_parse(struct trace_array *tr, char *str, struct action_data *data, + enum handler_id handler) +{ + char *action_name; + int ret = 0; + + strsep(&str, "."); + if (!str) { + hist_err(tr, HIST_ERR_ACTION_NOT_FOUND, 0); + ret = -EINVAL; + goto out; + } + + action_name = strsep(&str, "("); + if (!action_name || !str) { + hist_err(tr, HIST_ERR_ACTION_NOT_FOUND, 0); + ret = -EINVAL; + goto out; + } + + if (str_has_prefix(action_name, "save")) { + char *params = strsep(&str, ")"); + + if (!params) { + hist_err(tr, HIST_ERR_NO_SAVE_PARAMS, 0); + ret = -EINVAL; + goto out; + } + + ret = parse_action_params(tr, params, data); + if (ret) + goto out; + + if (handler == HANDLER_ONMAX) + data->track_data.check_val = check_track_val_max; + else if (handler == HANDLER_ONCHANGE) + data->track_data.check_val = check_track_val_changed; + else { + hist_err(tr, HIST_ERR_ACTION_MISMATCH, errpos(action_name)); + ret = -EINVAL; + goto out; + } + + data->track_data.save_data = save_track_data_vars; + data->fn = ontrack_action; + data->action = ACTION_SAVE; + } else if (str_has_prefix(action_name, "snapshot")) { + char *params = strsep(&str, ")"); + + if (!str) { + hist_err(tr, HIST_ERR_NO_CLOSING_PAREN, errpos(params)); + ret = -EINVAL; + goto out; + } + + if (handler == HANDLER_ONMAX) + data->track_data.check_val = check_track_val_max; + else if (handler == HANDLER_ONCHANGE) + data->track_data.check_val = check_track_val_changed; + else { + hist_err(tr, HIST_ERR_ACTION_MISMATCH, errpos(action_name)); + ret = -EINVAL; + goto out; + } + + data->track_data.save_data = save_track_data_snapshot; + data->fn = ontrack_action; + data->action = ACTION_SNAPSHOT; + } else { + char *params = strsep(&str, ")"); + + if (str_has_prefix(action_name, "trace")) + data->use_trace_keyword = true; + + if (params) { + ret = parse_action_params(tr, params, data); + if (ret) + goto out; + } + + if (handler == HANDLER_ONMAX) + data->track_data.check_val = check_track_val_max; + else if (handler == HANDLER_ONCHANGE) + data->track_data.check_val = check_track_val_changed; + + if (handler != HANDLER_ONMATCH) { + data->track_data.save_data = action_trace; + data->fn = ontrack_action; + } else + data->fn = action_trace; + + data->action = ACTION_TRACE; + } + + data->action_name = kstrdup(action_name, GFP_KERNEL); + if (!data->action_name) { + ret = -ENOMEM; + goto out; + } + + data->handler = handler; + out: + return ret; +} + +static struct action_data *track_data_parse(struct hist_trigger_data *hist_data, + char *str, enum handler_id handler) +{ + struct action_data *data; + int ret = -EINVAL; + char *var_str; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return ERR_PTR(-ENOMEM); + + var_str = strsep(&str, ")"); + if (!var_str || !str) { + ret = -EINVAL; + goto free; + } + + data->track_data.var_str = kstrdup(var_str, GFP_KERNEL); + if (!data->track_data.var_str) { + ret = -ENOMEM; + goto free; + } + + ret = action_parse(hist_data->event_file->tr, str, data, handler); + if (ret) + goto free; + out: + return data; + free: + track_data_destroy(hist_data, data); + data = ERR_PTR(ret); + goto out; +} + +static void onmatch_destroy(struct action_data *data) +{ + kfree(data->match_data.event); + kfree(data->match_data.event_system); + + action_data_destroy(data); +} + +static void destroy_field_var(struct field_var *field_var) +{ + if (!field_var) + return; + + destroy_hist_field(field_var->var, 0); + 
destroy_hist_field(field_var->val, 0); + + kfree(field_var); +} + +static void destroy_field_vars(struct hist_trigger_data *hist_data) +{ + unsigned int i; + + for (i = 0; i < hist_data->n_field_vars; i++) + destroy_field_var(hist_data->field_vars[i]); + + for (i = 0; i < hist_data->n_save_vars; i++) + destroy_field_var(hist_data->save_vars[i]); +} + +static void save_field_var(struct hist_trigger_data *hist_data, + struct field_var *field_var) +{ + hist_data->field_vars[hist_data->n_field_vars++] = field_var; + + /* Stack traces are saved in the string storage too */ + if (field_var->val->flags & (HIST_FIELD_FL_STRING | HIST_FIELD_FL_STACKTRACE)) + hist_data->n_field_var_str++; +} + + +static int check_synth_field(struct synth_event *event, + struct hist_field *hist_field, + unsigned int field_pos) +{ + struct synth_field *field; + + if (field_pos >= event->n_fields) + return -EINVAL; + + field = event->fields[field_pos]; + + /* + * A dynamic string synth field can accept static or + * dynamic. A static string synth field can only accept a + * same-sized static string, which is checked for later. + */ + if (strstr(hist_field->type, "char[") && field->is_string + && field->is_dynamic) + return 0; + + if (strstr(hist_field->type, "long[") && field->is_stack) + return 0; + + if (strcmp(field->type, hist_field->type) != 0) { + if (field->size != hist_field->size || + (!field->is_string && field->is_signed != hist_field->is_signed)) + return -EINVAL; + } + + return 0; +} + +static struct hist_field * +trace_action_find_var(struct hist_trigger_data *hist_data, + struct action_data *data, + char *system, char *event, char *var) +{ + struct trace_array *tr = hist_data->event_file->tr; + struct hist_field *hist_field; + + var++; /* skip '$' */ + + hist_field = find_target_event_var(hist_data, system, event, var); + if (!hist_field) { + if (!system && data->handler == HANDLER_ONMATCH) { + system = data->match_data.event_system; + event = data->match_data.event; + } + + hist_field = find_event_var(hist_data, system, event, var); + } + + if (!hist_field) + hist_err(tr, HIST_ERR_PARAM_NOT_FOUND, errpos(var)); + + return hist_field; +} + +static struct hist_field * +trace_action_create_field_var(struct hist_trigger_data *hist_data, + struct action_data *data, char *system, + char *event, char *var) +{ + struct hist_field *hist_field = NULL; + struct field_var *field_var; + + /* + * First try to create a field var on the target event (the + * currently being defined). This will create a variable for + * unqualified fields on the target event, or if qualified, + * target fields that have qualified names matching the target. + */ + field_var = create_target_field_var(hist_data, system, event, var); + + if (field_var && !IS_ERR(field_var)) { + save_field_var(hist_data, field_var); + hist_field = field_var->var; + } else { + field_var = NULL; + /* + * If no explicit system.event is specified, default to + * looking for fields on the onmatch(system.event.xxx) + * event. + */ + if (!system && data->handler == HANDLER_ONMATCH) { + system = data->match_data.event_system; + event = data->match_data.event; + } + + if (!event) + goto free; + /* + * At this point, we're looking at a field on another + * event. Because we can't modify a hist trigger on + * another event to add a variable for a field, we need + * to create a new trigger on that event and create the + * variable at the same time. 
+ */ + hist_field = create_field_var_hist(hist_data, system, event, var); + if (IS_ERR(hist_field)) + goto free; + } + out: + return hist_field; + free: + destroy_field_var(field_var); + hist_field = NULL; + goto out; +} + +static int trace_action_create(struct hist_trigger_data *hist_data, + struct action_data *data) +{ + struct trace_array *tr = hist_data->event_file->tr; + char *event_name, *param, *system = NULL; + struct hist_field *hist_field, *var_ref; + unsigned int i; + unsigned int field_pos = 0; + struct synth_event *event; + char *synth_event_name; + int var_ref_idx, ret = 0; + + lockdep_assert_held(&event_mutex); + + /* Sanity check to avoid out-of-bound write on 'data->var_ref_idx' */ + if (data->n_params > SYNTH_FIELDS_MAX) + return -EINVAL; + + if (data->use_trace_keyword) + synth_event_name = data->synth_event_name; + else + synth_event_name = data->action_name; + + event = find_synth_event(synth_event_name); + if (!event) { + hist_err(tr, HIST_ERR_SYNTH_EVENT_NOT_FOUND, errpos(synth_event_name)); + return -EINVAL; + } + + event->ref++; + + for (i = 0; i < data->n_params; i++) { + char *p; + + p = param = kstrdup(data->params[i], GFP_KERNEL); + if (!param) { + ret = -ENOMEM; + goto err; + } + + system = strsep(&param, "."); + if (!param) { + param = (char *)system; + system = event_name = NULL; + } else { + event_name = strsep(&param, "."); + if (!param) { + kfree(p); + ret = -EINVAL; + goto err; + } + } + + if (param[0] == '$') + hist_field = trace_action_find_var(hist_data, data, + system, event_name, + param); + else + hist_field = trace_action_create_field_var(hist_data, + data, + system, + event_name, + param); + + if (!hist_field) { + kfree(p); + ret = -EINVAL; + goto err; + } + + if (check_synth_field(event, hist_field, field_pos) == 0) { + var_ref = create_var_ref(hist_data, hist_field, + system, event_name); + if (!var_ref) { + kfree(p); + ret = -ENOMEM; + goto err; + } + + var_ref_idx = find_var_ref_idx(hist_data, var_ref); + if (WARN_ON(var_ref_idx < 0)) { + kfree(p); + ret = var_ref_idx; + goto err; + } + + data->var_ref_idx[i] = var_ref_idx; + + field_pos++; + kfree(p); + continue; + } + + hist_err(tr, HIST_ERR_SYNTH_TYPE_MISMATCH, errpos(param)); + kfree(p); + ret = -EINVAL; + goto err; + } + + if (field_pos != event->n_fields) { + hist_err(tr, HIST_ERR_SYNTH_COUNT_MISMATCH, errpos(event->name)); + ret = -EINVAL; + goto err; + } + + data->synth_event = event; + out: + return ret; + err: + event->ref--; + + goto out; +} + +static int action_create(struct hist_trigger_data *hist_data, + struct action_data *data) +{ + struct trace_event_file *file = hist_data->event_file; + struct trace_array *tr = file->tr; + struct track_data *track_data; + struct field_var *field_var; + unsigned int i; + char *param; + int ret = 0; + + if (data->action == ACTION_TRACE) + return trace_action_create(hist_data, data); + + if (data->action == ACTION_SNAPSHOT) { + track_data = track_data_alloc(hist_data->key_size, data, hist_data); + if (IS_ERR(track_data)) { + ret = PTR_ERR(track_data); + goto out; + } + + ret = tracing_snapshot_cond_enable(file->tr, track_data, + cond_snapshot_update); + if (ret) + track_data_free(track_data); + + goto out; + } + + if (data->action == ACTION_SAVE) { + if (hist_data->n_save_vars) { + ret = -EEXIST; + hist_err(tr, HIST_ERR_TOO_MANY_SAVE_ACTIONS, 0); + goto out; + } + + for (i = 0; i < data->n_params; i++) { + param = kstrdup(data->params[i], GFP_KERNEL); + if (!param) { + ret = -ENOMEM; + goto out; + } + + field_var = create_target_field_var(hist_data, 
NULL, NULL, param); + if (IS_ERR(field_var)) { + hist_err(tr, HIST_ERR_FIELD_VAR_CREATE_FAIL, + errpos(param)); + ret = PTR_ERR(field_var); + kfree(param); + goto out; + } + + hist_data->save_vars[hist_data->n_save_vars++] = field_var; + if (field_var->val->flags & + (HIST_FIELD_FL_STRING | HIST_FIELD_FL_STACKTRACE)) + hist_data->n_save_var_str++; + kfree(param); + } + } + out: + return ret; +} + +static int onmatch_create(struct hist_trigger_data *hist_data, + struct action_data *data) +{ + return action_create(hist_data, data); +} + +static struct action_data *onmatch_parse(struct trace_array *tr, char *str) +{ + char *match_event, *match_event_system; + struct action_data *data; + int ret = -EINVAL; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return ERR_PTR(-ENOMEM); + + match_event = strsep(&str, ")"); + if (!match_event || !str) { + hist_err(tr, HIST_ERR_NO_CLOSING_PAREN, errpos(match_event)); + goto free; + } + + match_event_system = strsep(&match_event, "."); + if (!match_event) { + hist_err(tr, HIST_ERR_SUBSYS_NOT_FOUND, errpos(match_event_system)); + goto free; + } + + if (IS_ERR(event_file(tr, match_event_system, match_event))) { + hist_err(tr, HIST_ERR_INVALID_SUBSYS_EVENT, errpos(match_event)); + goto free; + } + + data->match_data.event = kstrdup(match_event, GFP_KERNEL); + if (!data->match_data.event) { + ret = -ENOMEM; + goto free; + } + + data->match_data.event_system = kstrdup(match_event_system, GFP_KERNEL); + if (!data->match_data.event_system) { + ret = -ENOMEM; + goto free; + } + + ret = action_parse(tr, str, data, HANDLER_ONMATCH); + if (ret) + goto free; + out: + return data; + free: + onmatch_destroy(data); + data = ERR_PTR(ret); + goto out; +} + +static int create_hitcount_val(struct hist_trigger_data *hist_data) +{ + hist_data->fields[HITCOUNT_IDX] = + create_hist_field(hist_data, NULL, HIST_FIELD_FL_HITCOUNT, NULL); + if (!hist_data->fields[HITCOUNT_IDX]) + return -ENOMEM; + + hist_data->n_vals++; + hist_data->n_fields++; + + if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX)) + return -EINVAL; + + return 0; +} + +static int __create_val_field(struct hist_trigger_data *hist_data, + unsigned int val_idx, + struct trace_event_file *file, + char *var_name, char *field_str, + unsigned long flags) +{ + struct hist_field *hist_field; + int ret = 0, n_subexprs = 0; + + hist_field = parse_expr(hist_data, file, field_str, flags, var_name, &n_subexprs); + if (IS_ERR(hist_field)) { + ret = PTR_ERR(hist_field); + goto out; + } + + /* values and variables should not have some modifiers */ + if (hist_field->flags & HIST_FIELD_FL_VAR) { + /* Variable */ + if (hist_field->flags & (HIST_FIELD_FL_GRAPH | HIST_FIELD_FL_PERCENT | + HIST_FIELD_FL_BUCKET | HIST_FIELD_FL_LOG2)) + goto err; + } else { + /* Value */ + if (hist_field->flags & (HIST_FIELD_FL_GRAPH | HIST_FIELD_FL_PERCENT | + HIST_FIELD_FL_BUCKET | HIST_FIELD_FL_LOG2 | + HIST_FIELD_FL_SYM | HIST_FIELD_FL_SYM_OFFSET | + HIST_FIELD_FL_SYSCALL | HIST_FIELD_FL_STACKTRACE)) + goto err; + } + + hist_data->fields[val_idx] = hist_field; + + ++hist_data->n_vals; + ++hist_data->n_fields; + + if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX + TRACING_MAP_VARS_MAX)) + ret = -EINVAL; + out: + return ret; + err: + hist_err(file->tr, HIST_ERR_BAD_FIELD_MODIFIER, errpos(field_str)); + return -EINVAL; +} + +static int create_val_field(struct hist_trigger_data *hist_data, + unsigned int val_idx, + struct trace_event_file *file, + char *field_str) +{ + if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX)) + return -EINVAL; + 
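+ /* A plain value field: parsed with no variable name and no extra flags. */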
+ return __create_val_field(hist_data, val_idx, file, NULL, field_str, 0); +} + +static const char no_comm[] = "(no comm)"; + +static u64 hist_field_execname(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, + struct ring_buffer_event *rbe, + void *event) +{ + struct hist_elt_data *elt_data; + + if (WARN_ON_ONCE(!elt)) + return (u64)(unsigned long)no_comm; + + elt_data = elt->private_data; + + if (WARN_ON_ONCE(!elt_data->comm)) + return (u64)(unsigned long)no_comm; + + return (u64)(unsigned long)(elt_data->comm); +} + +static u64 hist_field_stack(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, + struct ring_buffer_event *rbe, + void *event) +{ + u32 str_item = *(u32 *)(event + hist_field->field->offset); + int str_loc = str_item & 0xffff; + char *addr = (char *)(event + str_loc); + + return (u64)(unsigned long)addr; +} + +static u64 hist_fn_call(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, + struct ring_buffer_event *rbe, + void *event) +{ + switch (hist_field->fn_num) { + case HIST_FIELD_FN_VAR_REF: + return hist_field_var_ref(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_COUNTER: + return hist_field_counter(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_CONST: + return hist_field_const(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_LOG2: + return hist_field_log2(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_BUCKET: + return hist_field_bucket(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_TIMESTAMP: + return hist_field_timestamp(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_CPU: + return hist_field_cpu(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_STRING: + return hist_field_string(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_DYNSTRING: + return hist_field_dynstring(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_RELDYNSTRING: + return hist_field_reldynstring(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_PSTRING: + return hist_field_pstring(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_S64: + return hist_field_s64(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_U64: + return hist_field_u64(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_S32: + return hist_field_s32(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_U32: + return hist_field_u32(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_S16: + return hist_field_s16(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_U16: + return hist_field_u16(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_S8: + return hist_field_s8(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_U8: + return hist_field_u8(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_UMINUS: + return hist_field_unary_minus(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_MINUS: + return hist_field_minus(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_PLUS: + return hist_field_plus(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_DIV: + return hist_field_div(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_MULT: + return hist_field_mult(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_DIV_POWER2: + return div_by_power_of_two(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_DIV_NOT_POWER2: + return div_by_not_power_of_two(hist_field, elt, buffer, rbe, event); + case 
HIST_FIELD_FN_DIV_MULT_SHIFT: + return div_by_mult_and_shift(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_EXECNAME: + return hist_field_execname(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_STACK: + return hist_field_stack(hist_field, elt, buffer, rbe, event); + default: + return 0; + } +} + +/* Convert a var that points to common_pid.execname to a string */ +static void update_var_execname(struct hist_field *hist_field) +{ + hist_field->flags = HIST_FIELD_FL_STRING | HIST_FIELD_FL_VAR | + HIST_FIELD_FL_EXECNAME; + hist_field->size = MAX_FILTER_STR_VAL; + hist_field->is_signed = 0; + + kfree_const(hist_field->type); + hist_field->type = "char[]"; + + hist_field->fn_num = HIST_FIELD_FN_EXECNAME; +} + +static int create_var_field(struct hist_trigger_data *hist_data, + unsigned int val_idx, + struct trace_event_file *file, + char *var_name, char *expr_str) +{ + struct trace_array *tr = hist_data->event_file->tr; + unsigned long flags = 0; + int ret; + + if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX + TRACING_MAP_VARS_MAX)) + return -EINVAL; + + if (find_var(hist_data, file, var_name) && !hist_data->remove) { + hist_err(tr, HIST_ERR_DUPLICATE_VAR, errpos(var_name)); + return -EINVAL; + } + + flags |= HIST_FIELD_FL_VAR; + hist_data->n_vars++; + if (WARN_ON(hist_data->n_vars > TRACING_MAP_VARS_MAX)) + return -EINVAL; + + ret = __create_val_field(hist_data, val_idx, file, var_name, expr_str, flags); + + if (!ret && hist_data->fields[val_idx]->flags & HIST_FIELD_FL_EXECNAME) + update_var_execname(hist_data->fields[val_idx]); + + if (!ret && hist_data->fields[val_idx]->flags & + (HIST_FIELD_FL_STRING | HIST_FIELD_FL_STACKTRACE)) + hist_data->fields[val_idx]->var_str_idx = hist_data->n_var_str++; + + return ret; +} + +static int create_val_fields(struct hist_trigger_data *hist_data, + struct trace_event_file *file) +{ + unsigned int i, j = 1, n_hitcount = 0; + char *fields_str, *field_str; + int ret; + + ret = create_hitcount_val(hist_data); + if (ret) + goto out; + + fields_str = hist_data->attrs->vals_str; + if (!fields_str) + goto out; + + for (i = 0, j = 1; i < TRACING_MAP_VALS_MAX && + j < TRACING_MAP_VALS_MAX; i++) { + field_str = strsep(&fields_str, ","); + if (!field_str) + break; + + if (strcmp(field_str, "hitcount") == 0) { + if (!n_hitcount++) + continue; + } + + ret = create_val_field(hist_data, j++, file, field_str); + if (ret) + goto out; + } + + if (fields_str && (strcmp(fields_str, "hitcount") != 0)) + ret = -EINVAL; + out: + /* There is only raw hitcount but nohitcount suppresses it. 
*/ + if (j == 1 && hist_data->attrs->no_hitcount) { + hist_err(hist_data->event_file->tr, HIST_ERR_NEED_NOHC_VAL, 0); + ret = -ENOENT; + } + + return ret; +} + +static int create_key_field(struct hist_trigger_data *hist_data, + unsigned int key_idx, + unsigned int key_offset, + struct trace_event_file *file, + char *field_str) +{ + struct trace_array *tr = hist_data->event_file->tr; + struct hist_field *hist_field = NULL; + unsigned long flags = 0; + unsigned int key_size; + int ret = 0, n_subexprs = 0; + + if (WARN_ON(key_idx >= HIST_FIELDS_MAX)) + return -EINVAL; + + flags |= HIST_FIELD_FL_KEY; + + if (strcmp(field_str, "stacktrace") == 0) { + flags |= HIST_FIELD_FL_STACKTRACE; + key_size = sizeof(unsigned long) * HIST_STACKTRACE_DEPTH; + hist_field = create_hist_field(hist_data, NULL, flags, NULL); + } else { + hist_field = parse_expr(hist_data, file, field_str, flags, + NULL, &n_subexprs); + if (IS_ERR(hist_field)) { + ret = PTR_ERR(hist_field); + goto out; + } + + if (field_has_hist_vars(hist_field, 0)) { + hist_err(tr, HIST_ERR_INVALID_REF_KEY, errpos(field_str)); + destroy_hist_field(hist_field, 0); + ret = -EINVAL; + goto out; + } + + key_size = hist_field->size; + } + + hist_data->fields[key_idx] = hist_field; + + key_size = ALIGN(key_size, sizeof(u64)); + hist_data->fields[key_idx]->size = key_size; + hist_data->fields[key_idx]->offset = key_offset; + + hist_data->key_size += key_size; + + if (hist_data->key_size > HIST_KEY_SIZE_MAX) { + ret = -EINVAL; + goto out; + } + + hist_data->n_keys++; + hist_data->n_fields++; + + if (WARN_ON(hist_data->n_keys > TRACING_MAP_KEYS_MAX)) + return -EINVAL; + + ret = key_size; + out: + return ret; +} + +static int create_key_fields(struct hist_trigger_data *hist_data, + struct trace_event_file *file) +{ + unsigned int i, key_offset = 0, n_vals = hist_data->n_vals; + char *fields_str, *field_str; + int ret = -EINVAL; + + fields_str = hist_data->attrs->keys_str; + if (!fields_str) + goto out; + + for (i = n_vals; i < n_vals + TRACING_MAP_KEYS_MAX; i++) { + field_str = strsep(&fields_str, ","); + if (!field_str) + break; + ret = create_key_field(hist_data, i, key_offset, + file, field_str); + if (ret < 0) + goto out; + key_offset += ret; + } + if (fields_str) { + ret = -EINVAL; + goto out; + } + ret = 0; + out: + return ret; +} + +static int create_var_fields(struct hist_trigger_data *hist_data, + struct trace_event_file *file) +{ + unsigned int i, j = hist_data->n_vals; + int ret = 0; + + unsigned int n_vars = hist_data->attrs->var_defs.n_vars; + + for (i = 0; i < n_vars; i++) { + char *var_name = hist_data->attrs->var_defs.name[i]; + char *expr = hist_data->attrs->var_defs.expr[i]; + + ret = create_var_field(hist_data, j++, file, var_name, expr); + if (ret) + goto out; + } + out: + return ret; +} + +static void free_var_defs(struct hist_trigger_data *hist_data) +{ + unsigned int i; + + for (i = 0; i < hist_data->attrs->var_defs.n_vars; i++) { + kfree(hist_data->attrs->var_defs.name[i]); + kfree(hist_data->attrs->var_defs.expr[i]); + } + + hist_data->attrs->var_defs.n_vars = 0; +} + +static int parse_var_defs(struct hist_trigger_data *hist_data) +{ + struct trace_array *tr = hist_data->event_file->tr; + char *s, *str, *var_name, *field_str; + unsigned int i, j, n_vars = 0; + int ret = 0; + + for (i = 0; i < hist_data->attrs->n_assignments; i++) { + str = hist_data->attrs->assignment_str[i]; + for (j = 0; j < TRACING_MAP_VARS_MAX; j++) { + field_str = strsep(&str, ","); + if (!field_str) + break; + + var_name = strsep(&field_str, "="); + if 
(!var_name || !field_str) { + hist_err(tr, HIST_ERR_MALFORMED_ASSIGNMENT, + errpos(var_name)); + ret = -EINVAL; + goto free; + } + + if (n_vars == TRACING_MAP_VARS_MAX) { + hist_err(tr, HIST_ERR_TOO_MANY_VARS, errpos(var_name)); + ret = -EINVAL; + goto free; + } + + s = kstrdup(var_name, GFP_KERNEL); + if (!s) { + ret = -ENOMEM; + goto free; + } + hist_data->attrs->var_defs.name[n_vars] = s; + + s = kstrdup(field_str, GFP_KERNEL); + if (!s) { + kfree(hist_data->attrs->var_defs.name[n_vars]); + hist_data->attrs->var_defs.name[n_vars] = NULL; + ret = -ENOMEM; + goto free; + } + hist_data->attrs->var_defs.expr[n_vars++] = s; + + hist_data->attrs->var_defs.n_vars = n_vars; + } + } + + return ret; + free: + free_var_defs(hist_data); + + return ret; +} + +static int create_hist_fields(struct hist_trigger_data *hist_data, + struct trace_event_file *file) +{ + int ret; + + ret = parse_var_defs(hist_data); + if (ret) + return ret; + + ret = create_val_fields(hist_data, file); + if (ret) + goto out; + + ret = create_var_fields(hist_data, file); + if (ret) + goto out; + + ret = create_key_fields(hist_data, file); + + out: + free_var_defs(hist_data); + + return ret; +} + +static int is_descending(struct trace_array *tr, const char *str) +{ + if (!str) + return 0; + + if (strcmp(str, "descending") == 0) + return 1; + + if (strcmp(str, "ascending") == 0) + return 0; + + hist_err(tr, HIST_ERR_INVALID_SORT_MODIFIER, errpos((char *)str)); + + return -EINVAL; +} + +static int create_sort_keys(struct hist_trigger_data *hist_data) +{ + struct trace_array *tr = hist_data->event_file->tr; + char *fields_str = hist_data->attrs->sort_key_str; + struct tracing_map_sort_key *sort_key; + int descending, ret = 0; + unsigned int i, j, k; + + hist_data->n_sort_keys = 1; /* we always have at least one, hitcount */ + + if (!fields_str) + goto out; + + for (i = 0; i < TRACING_MAP_SORT_KEYS_MAX; i++) { + struct hist_field *hist_field; + char *field_str, *field_name; + const char *test_name; + + sort_key = &hist_data->sort_keys[i]; + + field_str = strsep(&fields_str, ","); + if (!field_str) + break; + + if (!*field_str) { + ret = -EINVAL; + hist_err(tr, HIST_ERR_EMPTY_SORT_FIELD, errpos("sort=")); + break; + } + + if ((i == TRACING_MAP_SORT_KEYS_MAX - 1) && fields_str) { + hist_err(tr, HIST_ERR_TOO_MANY_SORT_FIELDS, errpos("sort=")); + ret = -EINVAL; + break; + } + + field_name = strsep(&field_str, "."); + if (!field_name || !*field_name) { + ret = -EINVAL; + hist_err(tr, HIST_ERR_EMPTY_SORT_FIELD, errpos("sort=")); + break; + } + + if (strcmp(field_name, "hitcount") == 0) { + descending = is_descending(tr, field_str); + if (descending < 0) { + ret = descending; + break; + } + sort_key->descending = descending; + continue; + } + + for (j = 1, k = 1; j < hist_data->n_fields; j++) { + unsigned int idx; + + hist_field = hist_data->fields[j]; + if (hist_field->flags & HIST_FIELD_FL_VAR) + continue; + + idx = k++; + + test_name = hist_field_name(hist_field, 0); + + if (strcmp(field_name, test_name) == 0) { + sort_key->field_idx = idx; + descending = is_descending(tr, field_str); + if (descending < 0) { + ret = descending; + goto out; + } + sort_key->descending = descending; + break; + } + } + if (j == hist_data->n_fields) { + ret = -EINVAL; + hist_err(tr, HIST_ERR_INVALID_SORT_FIELD, errpos(field_name)); + break; + } + } + + hist_data->n_sort_keys = i; + out: + return ret; +} + +static void destroy_actions(struct hist_trigger_data *hist_data) +{ + unsigned int i; + + for (i = 0; i < hist_data->n_actions; i++) { + struct 
action_data *data = hist_data->actions[i]; + + if (data->handler == HANDLER_ONMATCH) + onmatch_destroy(data); + else if (data->handler == HANDLER_ONMAX || + data->handler == HANDLER_ONCHANGE) + track_data_destroy(hist_data, data); + else + kfree(data); + } +} + +static int parse_actions(struct hist_trigger_data *hist_data) +{ + struct trace_array *tr = hist_data->event_file->tr; + struct action_data *data; + unsigned int i; + int ret = 0; + char *str; + int len; + + for (i = 0; i < hist_data->attrs->n_actions; i++) { + str = hist_data->attrs->action_str[i]; + + if ((len = str_has_prefix(str, "onmatch("))) { + char *action_str = str + len; + + data = onmatch_parse(tr, action_str); + if (IS_ERR(data)) { + ret = PTR_ERR(data); + break; + } + } else if ((len = str_has_prefix(str, "onmax("))) { + char *action_str = str + len; + + data = track_data_parse(hist_data, action_str, + HANDLER_ONMAX); + if (IS_ERR(data)) { + ret = PTR_ERR(data); + break; + } + } else if ((len = str_has_prefix(str, "onchange("))) { + char *action_str = str + len; + + data = track_data_parse(hist_data, action_str, + HANDLER_ONCHANGE); + if (IS_ERR(data)) { + ret = PTR_ERR(data); + break; + } + } else { + ret = -EINVAL; + break; + } + + hist_data->actions[hist_data->n_actions++] = data; + } + + return ret; +} + +static int create_actions(struct hist_trigger_data *hist_data) +{ + struct action_data *data; + unsigned int i; + int ret = 0; + + for (i = 0; i < hist_data->attrs->n_actions; i++) { + data = hist_data->actions[i]; + + if (data->handler == HANDLER_ONMATCH) { + ret = onmatch_create(hist_data, data); + if (ret) + break; + } else if (data->handler == HANDLER_ONMAX || + data->handler == HANDLER_ONCHANGE) { + ret = track_data_create(hist_data, data); + if (ret) + break; + } else { + ret = -EINVAL; + break; + } + } + + return ret; +} + +static void print_actions(struct seq_file *m, + struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt) +{ + unsigned int i; + + for (i = 0; i < hist_data->n_actions; i++) { + struct action_data *data = hist_data->actions[i]; + + if (data->action == ACTION_SNAPSHOT) + continue; + + if (data->handler == HANDLER_ONMAX || + data->handler == HANDLER_ONCHANGE) + track_data_print(m, hist_data, elt, data); + } +} + +static void print_action_spec(struct seq_file *m, + struct hist_trigger_data *hist_data, + struct action_data *data) +{ + unsigned int i; + + if (data->action == ACTION_SAVE) { + for (i = 0; i < hist_data->n_save_vars; i++) { + seq_printf(m, "%s", hist_data->save_vars[i]->var->var.name); + if (i < hist_data->n_save_vars - 1) + seq_puts(m, ","); + } + } else if (data->action == ACTION_TRACE) { + if (data->use_trace_keyword) + seq_printf(m, "%s", data->synth_event_name); + for (i = 0; i < data->n_params; i++) { + if (i || data->use_trace_keyword) + seq_puts(m, ","); + seq_printf(m, "%s", data->params[i]); + } + } +} + +static void print_track_data_spec(struct seq_file *m, + struct hist_trigger_data *hist_data, + struct action_data *data) +{ + if (data->handler == HANDLER_ONMAX) + seq_puts(m, ":onmax("); + else if (data->handler == HANDLER_ONCHANGE) + seq_puts(m, ":onchange("); + seq_printf(m, "%s", data->track_data.var_str); + seq_printf(m, ").%s(", data->action_name); + + print_action_spec(m, hist_data, data); + + seq_puts(m, ")"); +} + +static void print_onmatch_spec(struct seq_file *m, + struct hist_trigger_data *hist_data, + struct action_data *data) +{ + seq_printf(m, ":onmatch(%s.%s).", data->match_data.event_system, + data->match_data.event); + + seq_printf(m, 
"%s(", data->action_name); + + print_action_spec(m, hist_data, data); + + seq_puts(m, ")"); +} + +static bool actions_match(struct hist_trigger_data *hist_data, + struct hist_trigger_data *hist_data_test) +{ + unsigned int i, j; + + if (hist_data->n_actions != hist_data_test->n_actions) + return false; + + for (i = 0; i < hist_data->n_actions; i++) { + struct action_data *data = hist_data->actions[i]; + struct action_data *data_test = hist_data_test->actions[i]; + char *action_name, *action_name_test; + + if (data->handler != data_test->handler) + return false; + if (data->action != data_test->action) + return false; + + if (data->n_params != data_test->n_params) + return false; + + for (j = 0; j < data->n_params; j++) { + if (strcmp(data->params[j], data_test->params[j]) != 0) + return false; + } + + if (data->use_trace_keyword) + action_name = data->synth_event_name; + else + action_name = data->action_name; + + if (data_test->use_trace_keyword) + action_name_test = data_test->synth_event_name; + else + action_name_test = data_test->action_name; + + if (strcmp(action_name, action_name_test) != 0) + return false; + + if (data->handler == HANDLER_ONMATCH) { + if (strcmp(data->match_data.event_system, + data_test->match_data.event_system) != 0) + return false; + if (strcmp(data->match_data.event, + data_test->match_data.event) != 0) + return false; + } else if (data->handler == HANDLER_ONMAX || + data->handler == HANDLER_ONCHANGE) { + if (strcmp(data->track_data.var_str, + data_test->track_data.var_str) != 0) + return false; + } + } + + return true; +} + + +static void print_actions_spec(struct seq_file *m, + struct hist_trigger_data *hist_data) +{ + unsigned int i; + + for (i = 0; i < hist_data->n_actions; i++) { + struct action_data *data = hist_data->actions[i]; + + if (data->handler == HANDLER_ONMATCH) + print_onmatch_spec(m, hist_data, data); + else if (data->handler == HANDLER_ONMAX || + data->handler == HANDLER_ONCHANGE) + print_track_data_spec(m, hist_data, data); + } +} + +static void destroy_field_var_hists(struct hist_trigger_data *hist_data) +{ + unsigned int i; + + for (i = 0; i < hist_data->n_field_var_hists; i++) { + kfree(hist_data->field_var_hists[i]->cmd); + kfree(hist_data->field_var_hists[i]); + } +} + +static void destroy_hist_data(struct hist_trigger_data *hist_data) +{ + if (!hist_data) + return; + + destroy_hist_trigger_attrs(hist_data->attrs); + destroy_hist_fields(hist_data); + tracing_map_destroy(hist_data->map); + + destroy_actions(hist_data); + destroy_field_vars(hist_data); + destroy_field_var_hists(hist_data); + + kfree(hist_data); +} + +static int create_tracing_map_fields(struct hist_trigger_data *hist_data) +{ + struct tracing_map *map = hist_data->map; + struct ftrace_event_field *field; + struct hist_field *hist_field; + int i, idx = 0; + + for_each_hist_field(i, hist_data) { + hist_field = hist_data->fields[i]; + if (hist_field->flags & HIST_FIELD_FL_KEY) { + tracing_map_cmp_fn_t cmp_fn; + + field = hist_field->field; + + if (hist_field->flags & HIST_FIELD_FL_STACKTRACE) + cmp_fn = tracing_map_cmp_none; + else if (!field || hist_field->flags & HIST_FIELD_FL_CPU) + cmp_fn = tracing_map_cmp_num(hist_field->size, + hist_field->is_signed); + else if (is_string_field(field)) + cmp_fn = tracing_map_cmp_string; + else + cmp_fn = tracing_map_cmp_num(field->size, + field->is_signed); + idx = tracing_map_add_key_field(map, + hist_field->offset, + cmp_fn); + } else if (!(hist_field->flags & HIST_FIELD_FL_VAR)) + idx = tracing_map_add_sum_field(map); + + if (idx < 
0) + return idx; + + if (hist_field->flags & HIST_FIELD_FL_VAR) { + idx = tracing_map_add_var(map); + if (idx < 0) + return idx; + hist_field->var.idx = idx; + hist_field->var.hist_data = hist_data; + } + } + + return 0; +} + +static struct hist_trigger_data * +create_hist_data(unsigned int map_bits, + struct hist_trigger_attrs *attrs, + struct trace_event_file *file, + bool remove) +{ + const struct tracing_map_ops *map_ops = NULL; + struct hist_trigger_data *hist_data; + int ret = 0; + + hist_data = kzalloc(sizeof(*hist_data), GFP_KERNEL); + if (!hist_data) + return ERR_PTR(-ENOMEM); + + hist_data->attrs = attrs; + hist_data->remove = remove; + hist_data->event_file = file; + + ret = parse_actions(hist_data); + if (ret) + goto free; + + ret = create_hist_fields(hist_data, file); + if (ret) + goto free; + + ret = create_sort_keys(hist_data); + if (ret) + goto free; + + map_ops = &hist_trigger_elt_data_ops; + + hist_data->map = tracing_map_create(map_bits, hist_data->key_size, + map_ops, hist_data); + if (IS_ERR(hist_data->map)) { + ret = PTR_ERR(hist_data->map); + hist_data->map = NULL; + goto free; + } + + ret = create_tracing_map_fields(hist_data); + if (ret) + goto free; + out: + return hist_data; + free: + hist_data->attrs = NULL; + + destroy_hist_data(hist_data); + + hist_data = ERR_PTR(ret); + + goto out; +} + +static void hist_trigger_elt_update(struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, void *rec, + struct ring_buffer_event *rbe, + u64 *var_ref_vals) +{ + struct hist_elt_data *elt_data; + struct hist_field *hist_field; + unsigned int i, var_idx; + u64 hist_val; + + elt_data = elt->private_data; + elt_data->var_ref_vals = var_ref_vals; + + for_each_hist_val_field(i, hist_data) { + hist_field = hist_data->fields[i]; + hist_val = hist_fn_call(hist_field, elt, buffer, rbe, rec); + if (hist_field->flags & HIST_FIELD_FL_VAR) { + var_idx = hist_field->var.idx; + + if (hist_field->flags & + (HIST_FIELD_FL_STRING | HIST_FIELD_FL_STACKTRACE)) { + unsigned int str_start, var_str_idx, idx; + char *str, *val_str; + unsigned int size; + + str_start = hist_data->n_field_var_str + + hist_data->n_save_var_str; + var_str_idx = hist_field->var_str_idx; + idx = str_start + var_str_idx; + + str = elt_data->field_var_str[idx]; + val_str = (char *)(uintptr_t)hist_val; + + if (hist_field->flags & HIST_FIELD_FL_STRING) { + size = min(hist_field->size, STR_VAR_LEN_MAX); + strscpy(str, val_str, size); + } else { + char *stack_start = str + sizeof(unsigned long); + int e; + + e = stack_trace_save((void *)stack_start, + HIST_STACKTRACE_DEPTH, + HIST_STACKTRACE_SKIP); + if (e < HIST_STACKTRACE_DEPTH - 1) + ((unsigned long *)stack_start)[e] = 0; + *((unsigned long *)str) = e; + } + hist_val = (u64)(uintptr_t)str; + } + tracing_map_set_var(elt, var_idx, hist_val); + continue; + } + tracing_map_update_sum(elt, i, hist_val); + } + + for_each_hist_key_field(i, hist_data) { + hist_field = hist_data->fields[i]; + if (hist_field->flags & HIST_FIELD_FL_VAR) { + hist_val = hist_fn_call(hist_field, elt, buffer, rbe, rec); + var_idx = hist_field->var.idx; + tracing_map_set_var(elt, var_idx, hist_val); + } + } + + update_field_vars(hist_data, elt, buffer, rbe, rec); +} + +static inline void add_to_key(char *compound_key, void *key, + struct hist_field *key_field, void *rec) +{ + size_t size = key_field->size; + + if (key_field->flags & HIST_FIELD_FL_STRING) { + struct ftrace_event_field *field; + + field = key_field->field; + if (field->filter_type == 
FILTER_DYN_STRING || + field->filter_type == FILTER_RDYN_STRING) + size = *(u32 *)(rec + field->offset) >> 16; + else if (field->filter_type == FILTER_STATIC_STRING) + size = field->size; + + /* ensure NULL-termination */ + if (size > key_field->size - 1) + size = key_field->size - 1; + + strncpy(compound_key + key_field->offset, (char *)key, size); + } else + memcpy(compound_key + key_field->offset, key, size); +} + +static void +hist_trigger_actions(struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, void *rec, + struct ring_buffer_event *rbe, void *key, + u64 *var_ref_vals) +{ + struct action_data *data; + unsigned int i; + + for (i = 0; i < hist_data->n_actions; i++) { + data = hist_data->actions[i]; + data->fn(hist_data, elt, buffer, rec, rbe, key, data, var_ref_vals); + } +} + +static void event_hist_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, + struct ring_buffer_event *rbe) +{ + struct hist_trigger_data *hist_data = data->private_data; + bool use_compound_key = (hist_data->n_keys > 1); + unsigned long entries[HIST_STACKTRACE_DEPTH]; + u64 var_ref_vals[TRACING_MAP_VARS_MAX]; + char compound_key[HIST_KEY_SIZE_MAX]; + struct tracing_map_elt *elt = NULL; + struct hist_field *key_field; + u64 field_contents; + void *key = NULL; + unsigned int i; + + if (unlikely(!rbe)) + return; + + memset(compound_key, 0, hist_data->key_size); + + for_each_hist_key_field(i, hist_data) { + key_field = hist_data->fields[i]; + + if (key_field->flags & HIST_FIELD_FL_STACKTRACE) { + memset(entries, 0, HIST_STACKTRACE_SIZE); + if (key_field->field) { + unsigned long *stack, n_entries; + + field_contents = hist_fn_call(key_field, elt, buffer, rbe, rec); + stack = (unsigned long *)(long)field_contents; + n_entries = *stack; + memcpy(entries, ++stack, n_entries * sizeof(unsigned long)); + } else { + stack_trace_save(entries, HIST_STACKTRACE_DEPTH, + HIST_STACKTRACE_SKIP); + } + key = entries; + } else { + field_contents = hist_fn_call(key_field, elt, buffer, rbe, rec); + if (key_field->flags & HIST_FIELD_FL_STRING) { + key = (void *)(unsigned long)field_contents; + use_compound_key = true; + } else + key = (void *)&field_contents; + } + + if (use_compound_key) + add_to_key(compound_key, key, key_field, rec); + } + + if (use_compound_key) + key = compound_key; + + if (hist_data->n_var_refs && + !resolve_var_refs(hist_data, key, var_ref_vals, false)) + return; + + elt = tracing_map_insert(hist_data->map, key); + if (!elt) + return; + + hist_trigger_elt_update(hist_data, elt, buffer, rec, rbe, var_ref_vals); + + if (resolve_var_refs(hist_data, key, var_ref_vals, true)) + hist_trigger_actions(hist_data, elt, buffer, rec, rbe, key, var_ref_vals); +} + +static void hist_trigger_stacktrace_print(struct seq_file *m, + unsigned long *stacktrace_entries, + unsigned int max_entries) +{ + unsigned int spaces = 8; + unsigned int i; + + for (i = 0; i < max_entries; i++) { + if (!stacktrace_entries[i]) + return; + + seq_printf(m, "%*c", 1 + spaces, ' '); + seq_printf(m, "%pS\n", (void*)stacktrace_entries[i]); + } +} + +static void hist_trigger_print_key(struct seq_file *m, + struct hist_trigger_data *hist_data, + void *key, + struct tracing_map_elt *elt) +{ + struct hist_field *key_field; + bool multiline = false; + const char *field_name; + unsigned int i; + u64 uval; + + seq_puts(m, "{ "); + + for_each_hist_key_field(i, hist_data) { + key_field = hist_data->fields[i]; + + if (i > hist_data->n_vals) + seq_puts(m, ", "); + + field_name = 
hist_field_name(key_field, 0); + + if (key_field->flags & HIST_FIELD_FL_HEX) { + uval = *(u64 *)(key + key_field->offset); + seq_printf(m, "%s: %llx", field_name, uval); + } else if (key_field->flags & HIST_FIELD_FL_SYM) { + uval = *(u64 *)(key + key_field->offset); + seq_printf(m, "%s: [%llx] %-45ps", field_name, + uval, (void *)(uintptr_t)uval); + } else if (key_field->flags & HIST_FIELD_FL_SYM_OFFSET) { + uval = *(u64 *)(key + key_field->offset); + seq_printf(m, "%s: [%llx] %-55pS", field_name, + uval, (void *)(uintptr_t)uval); + } else if (key_field->flags & HIST_FIELD_FL_EXECNAME) { + struct hist_elt_data *elt_data = elt->private_data; + char *comm; + + if (WARN_ON_ONCE(!elt_data)) + return; + + comm = elt_data->comm; + + uval = *(u64 *)(key + key_field->offset); + seq_printf(m, "%s: %-16s[%10llu]", field_name, + comm, uval); + } else if (key_field->flags & HIST_FIELD_FL_SYSCALL) { + const char *syscall_name; + + uval = *(u64 *)(key + key_field->offset); + syscall_name = get_syscall_name(uval); + if (!syscall_name) + syscall_name = "unknown_syscall"; + + seq_printf(m, "%s: %-30s[%3llu]", field_name, + syscall_name, uval); + } else if (key_field->flags & HIST_FIELD_FL_STACKTRACE) { + if (key_field->field) + seq_printf(m, "%s.stacktrace", key_field->field->name); + else + seq_puts(m, "common_stacktrace:\n"); + hist_trigger_stacktrace_print(m, + key + key_field->offset, + HIST_STACKTRACE_DEPTH); + multiline = true; + } else if (key_field->flags & HIST_FIELD_FL_LOG2) { + seq_printf(m, "%s: ~ 2^%-2llu", field_name, + *(u64 *)(key + key_field->offset)); + } else if (key_field->flags & HIST_FIELD_FL_BUCKET) { + unsigned long buckets = key_field->buckets; + uval = *(u64 *)(key + key_field->offset); + seq_printf(m, "%s: ~ %llu-%llu", field_name, + uval, uval + buckets -1); + } else if (key_field->flags & HIST_FIELD_FL_STRING) { + seq_printf(m, "%s: %-50s", field_name, + (char *)(key + key_field->offset)); + } else { + uval = *(u64 *)(key + key_field->offset); + seq_printf(m, "%s: %10llu", field_name, uval); + } + } + + if (!multiline) + seq_puts(m, " "); + + seq_puts(m, "}"); +} + +/* Get the 100 times of the percentage of @val in @total */ +static inline unsigned int __get_percentage(u64 val, u64 total) +{ + if (!total) + goto div0; + + if (val < (U64_MAX / 10000)) + return (unsigned int)div64_ul(val * 10000, total); + + total = div64_u64(total, 10000); + if (!total) + goto div0; + + return (unsigned int)div64_ul(val, total); +div0: + return val ? 
UINT_MAX : 0; +} + +#define BAR_CHAR '#' + +static inline const char *__fill_bar_str(char *buf, int size, u64 val, u64 max) +{ + unsigned int len = __get_percentage(val, max); + int i; + + if (len == UINT_MAX) { + snprintf(buf, size, "[ERROR]"); + return buf; + } + + len = len * size / 10000; + for (i = 0; i < len && i < size; i++) + buf[i] = BAR_CHAR; + while (i < size) + buf[i++] = ' '; + buf[size] = '\0'; + + return buf; +} + +struct hist_val_stat { + u64 max; + u64 total; +}; + +static void hist_trigger_print_val(struct seq_file *m, unsigned int idx, + const char *field_name, unsigned long flags, + struct hist_val_stat *stats, + struct tracing_map_elt *elt) +{ + u64 val = tracing_map_read_sum(elt, idx); + unsigned int pc; + char bar[21]; + + if (flags & HIST_FIELD_FL_PERCENT) { + pc = __get_percentage(val, stats[idx].total); + if (pc == UINT_MAX) + seq_printf(m, " %s (%%):[ERROR]", field_name); + else + seq_printf(m, " %s (%%): %3u.%02u", field_name, + pc / 100, pc % 100); + } else if (flags & HIST_FIELD_FL_GRAPH) { + seq_printf(m, " %s: %20s", field_name, + __fill_bar_str(bar, 20, val, stats[idx].max)); + } else if (flags & HIST_FIELD_FL_HEX) { + seq_printf(m, " %s: %10llx", field_name, val); + } else { + seq_printf(m, " %s: %10llu", field_name, val); + } +} + +static void hist_trigger_entry_print(struct seq_file *m, + struct hist_trigger_data *hist_data, + struct hist_val_stat *stats, + void *key, + struct tracing_map_elt *elt) +{ + const char *field_name; + unsigned int i = HITCOUNT_IDX; + unsigned long flags; + + hist_trigger_print_key(m, hist_data, key, elt); + + /* At first, show the raw hitcount if !nohitcount */ + if (!hist_data->attrs->no_hitcount) + hist_trigger_print_val(m, i, "hitcount", 0, stats, elt); + + for (i = 1; i < hist_data->n_vals; i++) { + field_name = hist_field_name(hist_data->fields[i], 0); + flags = hist_data->fields[i]->flags; + if (flags & HIST_FIELD_FL_VAR || flags & HIST_FIELD_FL_EXPR) + continue; + + seq_puts(m, " "); + hist_trigger_print_val(m, i, field_name, flags, stats, elt); + } + + print_actions(m, hist_data, elt); + + seq_puts(m, "\n"); +} + +static int print_entries(struct seq_file *m, + struct hist_trigger_data *hist_data) +{ + struct tracing_map_sort_entry **sort_entries = NULL; + struct tracing_map *map = hist_data->map; + int i, j, n_entries; + struct hist_val_stat *stats = NULL; + u64 val; + + n_entries = tracing_map_sort_entries(map, hist_data->sort_keys, + hist_data->n_sort_keys, + &sort_entries); + if (n_entries < 0) + return n_entries; + + /* Calculate the max and the total for each field if needed. 
*/ + for (j = 0; j < hist_data->n_vals; j++) { + if (!(hist_data->fields[j]->flags & + (HIST_FIELD_FL_PERCENT | HIST_FIELD_FL_GRAPH))) + continue; + if (!stats) { + stats = kcalloc(hist_data->n_vals, sizeof(*stats), + GFP_KERNEL); + if (!stats) { + n_entries = -ENOMEM; + goto out; + } + } + for (i = 0; i < n_entries; i++) { + val = tracing_map_read_sum(sort_entries[i]->elt, j); + stats[j].total += val; + if (stats[j].max < val) + stats[j].max = val; + } + } + + for (i = 0; i < n_entries; i++) + hist_trigger_entry_print(m, hist_data, stats, + sort_entries[i]->key, + sort_entries[i]->elt); + + kfree(stats); +out: + tracing_map_destroy_sort_entries(sort_entries, n_entries); + + return n_entries; +} + +static void hist_trigger_show(struct seq_file *m, + struct event_trigger_data *data, int n) +{ + struct hist_trigger_data *hist_data; + int n_entries; + + if (n > 0) + seq_puts(m, "\n\n"); + + seq_puts(m, "# event histogram\n#\n# trigger info: "); + data->ops->print(m, data); + seq_puts(m, "#\n\n"); + + hist_data = data->private_data; + n_entries = print_entries(m, hist_data); + if (n_entries < 0) + n_entries = 0; + + track_data_snapshot_print(m, hist_data); + + seq_printf(m, "\nTotals:\n Hits: %llu\n Entries: %u\n Dropped: %llu\n", + (u64)atomic64_read(&hist_data->map->hits), + n_entries, (u64)atomic64_read(&hist_data->map->drops)); +} + +static int hist_show(struct seq_file *m, void *v) +{ + struct event_trigger_data *data; + struct trace_event_file *event_file; + int n = 0, ret = 0; + + mutex_lock(&event_mutex); + + event_file = event_file_data(m->private); + if (unlikely(!event_file)) { + ret = -ENODEV; + goto out_unlock; + } + + list_for_each_entry(data, &event_file->triggers, list) { + if (data->cmd_ops->trigger_type == ETT_EVENT_HIST) + hist_trigger_show(m, data, n++); + } + + out_unlock: + mutex_unlock(&event_mutex); + + return ret; +} + +static int event_hist_open(struct inode *inode, struct file *file) +{ + int ret; + + ret = tracing_open_file_tr(inode, file); + if (ret) + return ret; + + /* Clear private_data to avoid warning in single_open() */ + file->private_data = NULL; + return single_open(file, hist_show, file); +} + +const struct file_operations event_hist_fops = { + .open = event_hist_open, + .read = seq_read, + .llseek = seq_lseek, + .release = tracing_single_release_file_tr, +}; + +#ifdef CONFIG_HIST_TRIGGERS_DEBUG +static void hist_field_debug_show_flags(struct seq_file *m, + unsigned long flags) +{ + seq_puts(m, " flags:\n"); + + if (flags & HIST_FIELD_FL_KEY) + seq_puts(m, " HIST_FIELD_FL_KEY\n"); + else if (flags & HIST_FIELD_FL_HITCOUNT) + seq_puts(m, " VAL: HIST_FIELD_FL_HITCOUNT\n"); + else if (flags & HIST_FIELD_FL_VAR) + seq_puts(m, " HIST_FIELD_FL_VAR\n"); + else if (flags & HIST_FIELD_FL_VAR_REF) + seq_puts(m, " HIST_FIELD_FL_VAR_REF\n"); + else + seq_puts(m, " VAL: normal u64 value\n"); + + if (flags & HIST_FIELD_FL_ALIAS) + seq_puts(m, " HIST_FIELD_FL_ALIAS\n"); + else if (flags & HIST_FIELD_FL_CONST) + seq_puts(m, " HIST_FIELD_FL_CONST\n"); +} + +static int hist_field_debug_show(struct seq_file *m, + struct hist_field *field, unsigned long flags) +{ + if ((field->flags & flags) != flags) { + seq_printf(m, "ERROR: bad flags - %lx\n", flags); + return -EINVAL; + } + + hist_field_debug_show_flags(m, field->flags); + if (field->field) + seq_printf(m, " ftrace_event_field name: %s\n", + field->field->name); + + if (field->flags & HIST_FIELD_FL_VAR) { + seq_printf(m, " var.name: %s\n", field->var.name); + seq_printf(m, " var.idx (into tracing_map_elt.vars[]): %u\n", 
+ field->var.idx); + } + + if (field->flags & HIST_FIELD_FL_CONST) + seq_printf(m, " constant: %llu\n", field->constant); + + if (field->flags & HIST_FIELD_FL_ALIAS) + seq_printf(m, " var_ref_idx (into hist_data->var_refs[]): %u\n", + field->var_ref_idx); + + if (field->flags & HIST_FIELD_FL_VAR_REF) { + seq_printf(m, " name: %s\n", field->name); + seq_printf(m, " var.idx (into tracing_map_elt.vars[]): %u\n", + field->var.idx); + seq_printf(m, " var.hist_data: %p\n", field->var.hist_data); + seq_printf(m, " var_ref_idx (into hist_data->var_refs[]): %u\n", + field->var_ref_idx); + if (field->system) + seq_printf(m, " system: %s\n", field->system); + if (field->event_name) + seq_printf(m, " event_name: %s\n", field->event_name); + } + + seq_printf(m, " type: %s\n", field->type); + seq_printf(m, " size: %u\n", field->size); + seq_printf(m, " is_signed: %u\n", field->is_signed); + + return 0; +} + +static int field_var_debug_show(struct seq_file *m, + struct field_var *field_var, unsigned int i, + bool save_vars) +{ + const char *vars_name = save_vars ? "save_vars" : "field_vars"; + struct hist_field *field; + int ret = 0; + + seq_printf(m, "\n hist_data->%s[%d]:\n", vars_name, i); + + field = field_var->var; + + seq_printf(m, "\n %s[%d].var:\n", vars_name, i); + + hist_field_debug_show_flags(m, field->flags); + seq_printf(m, " var.name: %s\n", field->var.name); + seq_printf(m, " var.idx (into tracing_map_elt.vars[]): %u\n", + field->var.idx); + + field = field_var->val; + + seq_printf(m, "\n %s[%d].val:\n", vars_name, i); + if (field->field) + seq_printf(m, " ftrace_event_field name: %s\n", + field->field->name); + else { + ret = -EINVAL; + goto out; + } + + seq_printf(m, " type: %s\n", field->type); + seq_printf(m, " size: %u\n", field->size); + seq_printf(m, " is_signed: %u\n", field->is_signed); +out: + return ret; +} + +static int hist_action_debug_show(struct seq_file *m, + struct action_data *data, int i) +{ + int ret = 0; + + if (data->handler == HANDLER_ONMAX || + data->handler == HANDLER_ONCHANGE) { + seq_printf(m, "\n hist_data->actions[%d].track_data.var_ref:\n", i); + ret = hist_field_debug_show(m, data->track_data.var_ref, + HIST_FIELD_FL_VAR_REF); + if (ret) + goto out; + + seq_printf(m, "\n hist_data->actions[%d].track_data.track_var:\n", i); + ret = hist_field_debug_show(m, data->track_data.track_var, + HIST_FIELD_FL_VAR); + if (ret) + goto out; + } + + if (data->handler == HANDLER_ONMATCH) { + seq_printf(m, "\n hist_data->actions[%d].match_data.event_system: %s\n", + i, data->match_data.event_system); + seq_printf(m, " hist_data->actions[%d].match_data.event: %s\n", + i, data->match_data.event); + } +out: + return ret; +} + +static int hist_actions_debug_show(struct seq_file *m, + struct hist_trigger_data *hist_data) +{ + int i, ret = 0; + + if (hist_data->n_actions) + seq_puts(m, "\n action tracking variables (for onmax()/onchange()/onmatch()):\n"); + + for (i = 0; i < hist_data->n_actions; i++) { + struct action_data *action = hist_data->actions[i]; + + ret = hist_action_debug_show(m, action, i); + if (ret) + goto out; + } + + if (hist_data->n_save_vars) + seq_puts(m, "\n save action variables (save() params):\n"); + + for (i = 0; i < hist_data->n_save_vars; i++) { + ret = field_var_debug_show(m, hist_data->save_vars[i], i, true); + if (ret) + goto out; + } +out: + return ret; +} + +static void hist_trigger_debug_show(struct seq_file *m, + struct event_trigger_data *data, int n) +{ + struct hist_trigger_data *hist_data; + int i, ret; + + if (n > 0) + seq_puts(m, "\n\n"); + 
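+ /* Print the same header as hist_trigger_show() so the debug output can be matched to its trigger. */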
+ seq_puts(m, "# event histogram\n#\n# trigger info: "); + data->ops->print(m, data); + seq_puts(m, "#\n\n"); + + hist_data = data->private_data; + + seq_printf(m, "hist_data: %p\n\n", hist_data); + seq_printf(m, " n_vals: %u\n", hist_data->n_vals); + seq_printf(m, " n_keys: %u\n", hist_data->n_keys); + seq_printf(m, " n_fields: %u\n", hist_data->n_fields); + + seq_puts(m, "\n val fields:\n\n"); + + seq_puts(m, " hist_data->fields[0]:\n"); + ret = hist_field_debug_show(m, hist_data->fields[0], + HIST_FIELD_FL_HITCOUNT); + if (ret) + return; + + for (i = 1; i < hist_data->n_vals; i++) { + seq_printf(m, "\n hist_data->fields[%d]:\n", i); + ret = hist_field_debug_show(m, hist_data->fields[i], 0); + if (ret) + return; + } + + seq_puts(m, "\n key fields:\n"); + + for (i = hist_data->n_vals; i < hist_data->n_fields; i++) { + seq_printf(m, "\n hist_data->fields[%d]:\n", i); + ret = hist_field_debug_show(m, hist_data->fields[i], + HIST_FIELD_FL_KEY); + if (ret) + return; + } + + if (hist_data->n_var_refs) + seq_puts(m, "\n variable reference fields:\n"); + + for (i = 0; i < hist_data->n_var_refs; i++) { + seq_printf(m, "\n hist_data->var_refs[%d]:\n", i); + ret = hist_field_debug_show(m, hist_data->var_refs[i], + HIST_FIELD_FL_VAR_REF); + if (ret) + return; + } + + if (hist_data->n_field_vars) + seq_puts(m, "\n field variables:\n"); + + for (i = 0; i < hist_data->n_field_vars; i++) { + ret = field_var_debug_show(m, hist_data->field_vars[i], i, false); + if (ret) + return; + } + + ret = hist_actions_debug_show(m, hist_data); + if (ret) + return; +} + +static int hist_debug_show(struct seq_file *m, void *v) +{ + struct event_trigger_data *data; + struct trace_event_file *event_file; + int n = 0, ret = 0; + + mutex_lock(&event_mutex); + + event_file = event_file_data(m->private); + if (unlikely(!event_file)) { + ret = -ENODEV; + goto out_unlock; + } + + list_for_each_entry(data, &event_file->triggers, list) { + if (data->cmd_ops->trigger_type == ETT_EVENT_HIST) + hist_trigger_debug_show(m, data, n++); + } + + out_unlock: + mutex_unlock(&event_mutex); + + return ret; +} + +static int event_hist_debug_open(struct inode *inode, struct file *file) +{ + int ret; + + ret = tracing_open_file_tr(inode, file); + if (ret) + return ret; + + /* Clear private_data to avoid warning in single_open() */ + file->private_data = NULL; + return single_open(file, hist_debug_show, file); +} + +const struct file_operations event_hist_debug_fops = { + .open = event_hist_debug_open, + .read = seq_read, + .llseek = seq_lseek, + .release = tracing_single_release_file_tr, +}; +#endif + +static void hist_field_print(struct seq_file *m, struct hist_field *hist_field) +{ + const char *field_name = hist_field_name(hist_field, 0); + + if (hist_field->var.name) + seq_printf(m, "%s=", hist_field->var.name); + + if (hist_field->flags & HIST_FIELD_FL_CPU) + seq_puts(m, "common_cpu"); + else if (hist_field->flags & HIST_FIELD_FL_CONST) + seq_printf(m, "%llu", hist_field->constant); + else if (field_name) { + if (hist_field->flags & HIST_FIELD_FL_VAR_REF || + hist_field->flags & HIST_FIELD_FL_ALIAS) + seq_putc(m, '$'); + seq_printf(m, "%s", field_name); + } else if (hist_field->flags & HIST_FIELD_FL_TIMESTAMP) + seq_puts(m, "common_timestamp"); + + if (hist_field->flags) { + if (!(hist_field->flags & HIST_FIELD_FL_VAR_REF) && + !(hist_field->flags & HIST_FIELD_FL_EXPR) && + !(hist_field->flags & HIST_FIELD_FL_STACKTRACE)) { + const char *flags = get_hist_field_flags(hist_field); + + if (flags) + seq_printf(m, ".%s", flags); + } + } + if 
(hist_field->buckets) + seq_printf(m, "=%ld", hist_field->buckets); +} + +static int event_hist_trigger_print(struct seq_file *m, + struct event_trigger_data *data) +{ + struct hist_trigger_data *hist_data = data->private_data; + struct hist_field *field; + bool have_var = false; + bool show_val = false; + unsigned int i; + + seq_puts(m, HIST_PREFIX); + + if (data->name) + seq_printf(m, "%s:", data->name); + + seq_puts(m, "keys="); + + for_each_hist_key_field(i, hist_data) { + field = hist_data->fields[i]; + + if (i > hist_data->n_vals) + seq_puts(m, ","); + + if (field->flags & HIST_FIELD_FL_STACKTRACE) { + if (field->field) + seq_printf(m, "%s.stacktrace", field->field->name); + else + seq_puts(m, "common_stacktrace"); + } else + hist_field_print(m, field); + } + + seq_puts(m, ":vals="); + + for_each_hist_val_field(i, hist_data) { + field = hist_data->fields[i]; + if (field->flags & HIST_FIELD_FL_VAR) { + have_var = true; + continue; + } + + if (i == HITCOUNT_IDX) { + if (hist_data->attrs->no_hitcount) + continue; + seq_puts(m, "hitcount"); + } else { + if (show_val) + seq_puts(m, ","); + hist_field_print(m, field); + } + show_val = true; + } + + if (have_var) { + unsigned int n = 0; + + seq_puts(m, ":"); + + for_each_hist_val_field(i, hist_data) { + field = hist_data->fields[i]; + + if (field->flags & HIST_FIELD_FL_VAR) { + if (n++) + seq_puts(m, ","); + hist_field_print(m, field); + } + } + } + + seq_puts(m, ":sort="); + + for (i = 0; i < hist_data->n_sort_keys; i++) { + struct tracing_map_sort_key *sort_key; + unsigned int idx, first_key_idx; + + /* skip VAR vals */ + first_key_idx = hist_data->n_vals - hist_data->n_vars; + + sort_key = &hist_data->sort_keys[i]; + idx = sort_key->field_idx; + + if (WARN_ON(idx >= HIST_FIELDS_MAX)) + return -EINVAL; + + if (i > 0) + seq_puts(m, ","); + + if (idx == HITCOUNT_IDX) + seq_puts(m, "hitcount"); + else { + if (idx >= first_key_idx) + idx += hist_data->n_vars; + hist_field_print(m, hist_data->fields[idx]); + } + + if (sort_key->descending) + seq_puts(m, ".descending"); + } + seq_printf(m, ":size=%u", (1 << hist_data->map->map_bits)); + if (hist_data->enable_timestamps) + seq_printf(m, ":clock=%s", hist_data->attrs->clock); + if (hist_data->attrs->no_hitcount) + seq_puts(m, ":nohitcount"); + + print_actions_spec(m, hist_data); + + if (data->filter_str) + seq_printf(m, " if %s", data->filter_str); + + if (data->paused) + seq_puts(m, " [paused]"); + else + seq_puts(m, " [active]"); + + seq_putc(m, '\n'); + + return 0; +} + +static int event_hist_trigger_init(struct event_trigger_data *data) +{ + struct hist_trigger_data *hist_data = data->private_data; + + if (!data->ref && hist_data->attrs->name) + save_named_trigger(hist_data->attrs->name, data); + + data->ref++; + + return 0; +} + +static void unregister_field_var_hists(struct hist_trigger_data *hist_data) +{ + struct trace_event_file *file; + unsigned int i; + char *cmd; + int ret; + + for (i = 0; i < hist_data->n_field_var_hists; i++) { + file = hist_data->field_var_hists[i]->hist_data->event_file; + cmd = hist_data->field_var_hists[i]->cmd; + ret = event_hist_trigger_parse(&trigger_hist_cmd, file, + "!hist", "hist", cmd); + WARN_ON_ONCE(ret < 0); + } +} + +static void event_hist_trigger_free(struct event_trigger_data *data) +{ + struct hist_trigger_data *hist_data = data->private_data; + + if (WARN_ON_ONCE(data->ref <= 0)) + return; + + data->ref--; + if (!data->ref) { + if (data->name) + del_named_trigger(data); + + trigger_data_free(data); + + remove_hist_vars(hist_data); + + 
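/*
 * unregister_field_var_hists() (defined above) tears down the auxiliary
 * hist triggers that were created on other events to supply field
 * variables: each saved command string is replayed through
 * event_hist_trigger_parse() with a "!hist" glob, which removes it.
 */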
unregister_field_var_hists(hist_data); + + destroy_hist_data(hist_data); + } +} + +static struct event_trigger_ops event_hist_trigger_ops = { + .trigger = event_hist_trigger, + .print = event_hist_trigger_print, + .init = event_hist_trigger_init, + .free = event_hist_trigger_free, +}; + +static int event_hist_trigger_named_init(struct event_trigger_data *data) +{ + data->ref++; + + save_named_trigger(data->named_data->name, data); + + event_hist_trigger_init(data->named_data); + + return 0; +} + +static void event_hist_trigger_named_free(struct event_trigger_data *data) +{ + if (WARN_ON_ONCE(data->ref <= 0)) + return; + + event_hist_trigger_free(data->named_data); + + data->ref--; + if (!data->ref) { + del_named_trigger(data); + trigger_data_free(data); + } +} + +static struct event_trigger_ops event_hist_trigger_named_ops = { + .trigger = event_hist_trigger, + .print = event_hist_trigger_print, + .init = event_hist_trigger_named_init, + .free = event_hist_trigger_named_free, +}; + +static struct event_trigger_ops *event_hist_get_trigger_ops(char *cmd, + char *param) +{ + return &event_hist_trigger_ops; +} + +static void hist_clear(struct event_trigger_data *data) +{ + struct hist_trigger_data *hist_data = data->private_data; + + if (data->name) + pause_named_trigger(data); + + tracepoint_synchronize_unregister(); + + tracing_map_clear(hist_data->map); + + if (data->name) + unpause_named_trigger(data); +} + +static bool compatible_field(struct ftrace_event_field *field, + struct ftrace_event_field *test_field) +{ + if (field == test_field) + return true; + if (field == NULL || test_field == NULL) + return false; + if (strcmp(field->name, test_field->name) != 0) + return false; + if (strcmp(field->type, test_field->type) != 0) + return false; + if (field->size != test_field->size) + return false; + if (field->is_signed != test_field->is_signed) + return false; + + return true; +} + +static bool hist_trigger_match(struct event_trigger_data *data, + struct event_trigger_data *data_test, + struct event_trigger_data *named_data, + bool ignore_filter) +{ + struct tracing_map_sort_key *sort_key, *sort_key_test; + struct hist_trigger_data *hist_data, *hist_data_test; + struct hist_field *key_field, *key_field_test; + unsigned int i; + + if (named_data && (named_data != data_test) && + (named_data != data_test->named_data)) + return false; + + if (!named_data && is_named_trigger(data_test)) + return false; + + hist_data = data->private_data; + hist_data_test = data_test->private_data; + + if (hist_data->n_vals != hist_data_test->n_vals || + hist_data->n_fields != hist_data_test->n_fields || + hist_data->n_sort_keys != hist_data_test->n_sort_keys) + return false; + + if (!ignore_filter) { + if ((data->filter_str && !data_test->filter_str) || + (!data->filter_str && data_test->filter_str)) + return false; + } + + for_each_hist_field(i, hist_data) { + key_field = hist_data->fields[i]; + key_field_test = hist_data_test->fields[i]; + + if (key_field->flags != key_field_test->flags) + return false; + if (!compatible_field(key_field->field, key_field_test->field)) + return false; + if (key_field->offset != key_field_test->offset) + return false; + if (key_field->size != key_field_test->size) + return false; + if (key_field->is_signed != key_field_test->is_signed) + return false; + if (!!key_field->var.name != !!key_field_test->var.name) + return false; + if (key_field->var.name && + strcmp(key_field->var.name, key_field_test->var.name) != 0) + return false; + } + + for (i = 0; i < hist_data->n_sort_keys; 
i++) { + sort_key = &hist_data->sort_keys[i]; + sort_key_test = &hist_data_test->sort_keys[i]; + + if (sort_key->field_idx != sort_key_test->field_idx || + sort_key->descending != sort_key_test->descending) + return false; + } + + if (!ignore_filter && data->filter_str && + (strcmp(data->filter_str, data_test->filter_str) != 0)) + return false; + + if (!actions_match(hist_data, hist_data_test)) + return false; + + return true; +} + +static bool existing_hist_update_only(char *glob, + struct event_trigger_data *data, + struct trace_event_file *file) +{ + struct hist_trigger_data *hist_data = data->private_data; + struct event_trigger_data *test, *named_data = NULL; + bool updated = false; + + if (!hist_data->attrs->pause && !hist_data->attrs->cont && + !hist_data->attrs->clear) + goto out; + + if (hist_data->attrs->name) { + named_data = find_named_trigger(hist_data->attrs->name); + if (named_data) { + if (!hist_trigger_match(data, named_data, named_data, + true)) + goto out; + } + } + + if (hist_data->attrs->name && !named_data) + goto out; + + list_for_each_entry(test, &file->triggers, list) { + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { + if (!hist_trigger_match(data, test, named_data, false)) + continue; + if (hist_data->attrs->pause) + test->paused = true; + else if (hist_data->attrs->cont) + test->paused = false; + else if (hist_data->attrs->clear) + hist_clear(test); + updated = true; + goto out; + } + } + out: + return updated; +} + +static int hist_register_trigger(char *glob, + struct event_trigger_data *data, + struct trace_event_file *file) +{ + struct hist_trigger_data *hist_data = data->private_data; + struct event_trigger_data *test, *named_data = NULL; + struct trace_array *tr = file->tr; + int ret = 0; + + if (hist_data->attrs->name) { + named_data = find_named_trigger(hist_data->attrs->name); + if (named_data) { + if (!hist_trigger_match(data, named_data, named_data, + true)) { + hist_err(tr, HIST_ERR_NAMED_MISMATCH, errpos(hist_data->attrs->name)); + ret = -EINVAL; + goto out; + } + } + } + + if (hist_data->attrs->name && !named_data) + goto new; + + lockdep_assert_held(&event_mutex); + + list_for_each_entry(test, &file->triggers, list) { + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { + if (hist_trigger_match(data, test, named_data, false)) { + hist_err(tr, HIST_ERR_TRIGGER_EEXIST, 0); + ret = -EEXIST; + goto out; + } + } + } + new: + if (hist_data->attrs->cont || hist_data->attrs->clear) { + hist_err(tr, HIST_ERR_TRIGGER_ENOENT_CLEAR, 0); + ret = -ENOENT; + goto out; + } + + if (hist_data->attrs->pause) + data->paused = true; + + if (named_data) { + data->private_data = named_data->private_data; + set_named_trigger_data(data, named_data); + data->ops = &event_hist_trigger_named_ops; + } + + if (data->ops->init) { + ret = data->ops->init(data); + if (ret < 0) + goto out; + } + + if (hist_data->enable_timestamps) { + char *clock = hist_data->attrs->clock; + + ret = tracing_set_clock(file->tr, hist_data->attrs->clock); + if (ret) { + hist_err(tr, HIST_ERR_SET_CLOCK_FAIL, errpos(clock)); + goto out; + } + + tracing_set_filter_buffering(file->tr, true); + } + + if (named_data) + destroy_hist_data(hist_data); + out: + return ret; +} + +static int hist_trigger_enable(struct event_trigger_data *data, + struct trace_event_file *file) +{ + int ret = 0; + + list_add_tail_rcu(&data->list, &file->triggers); + + update_cond_flag(file); + + if (trace_event_trigger_enable_disable(file, 1) < 0) { + list_del_rcu(&data->list); + update_cond_flag(file); + ret--; + } + + 
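/* ret was initialized to 0, so a failed enable above yields -1 here */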
return ret; +} + +static bool have_hist_trigger_match(struct event_trigger_data *data, + struct trace_event_file *file) +{ + struct hist_trigger_data *hist_data = data->private_data; + struct event_trigger_data *test, *named_data = NULL; + bool match = false; + + lockdep_assert_held(&event_mutex); + + if (hist_data->attrs->name) + named_data = find_named_trigger(hist_data->attrs->name); + + list_for_each_entry(test, &file->triggers, list) { + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { + if (hist_trigger_match(data, test, named_data, false)) { + match = true; + break; + } + } + } + + return match; +} + +static bool hist_trigger_check_refs(struct event_trigger_data *data, + struct trace_event_file *file) +{ + struct hist_trigger_data *hist_data = data->private_data; + struct event_trigger_data *test, *named_data = NULL; + + lockdep_assert_held(&event_mutex); + + if (hist_data->attrs->name) + named_data = find_named_trigger(hist_data->attrs->name); + + list_for_each_entry(test, &file->triggers, list) { + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { + if (!hist_trigger_match(data, test, named_data, false)) + continue; + hist_data = test->private_data; + if (check_var_refs(hist_data)) + return true; + break; + } + } + + return false; +} + +static void hist_unregister_trigger(char *glob, + struct event_trigger_data *data, + struct trace_event_file *file) +{ + struct event_trigger_data *test = NULL, *iter, *named_data = NULL; + struct hist_trigger_data *hist_data = data->private_data; + + lockdep_assert_held(&event_mutex); + + if (hist_data->attrs->name) + named_data = find_named_trigger(hist_data->attrs->name); + + list_for_each_entry(iter, &file->triggers, list) { + if (iter->cmd_ops->trigger_type == ETT_EVENT_HIST) { + if (!hist_trigger_match(data, iter, named_data, false)) + continue; + test = iter; + list_del_rcu(&test->list); + trace_event_trigger_enable_disable(file, 0); + update_cond_flag(file); + break; + } + } + + if (test && test->ops->free) + test->ops->free(test); + + if (hist_data->enable_timestamps) { + if (!hist_data->remove || test) + tracing_set_filter_buffering(file->tr, false); + } +} + +static bool hist_file_check_refs(struct trace_event_file *file) +{ + struct hist_trigger_data *hist_data; + struct event_trigger_data *test; + + lockdep_assert_held(&event_mutex); + + list_for_each_entry(test, &file->triggers, list) { + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { + hist_data = test->private_data; + if (check_var_refs(hist_data)) + return true; + } + } + + return false; +} + +static void hist_unreg_all(struct trace_event_file *file) +{ + struct event_trigger_data *test, *n; + struct hist_trigger_data *hist_data; + struct synth_event *se; + const char *se_name; + + lockdep_assert_held(&event_mutex); + + if (hist_file_check_refs(file)) + return; + + list_for_each_entry_safe(test, n, &file->triggers, list) { + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { + hist_data = test->private_data; + list_del_rcu(&test->list); + trace_event_trigger_enable_disable(file, 0); + + se_name = trace_event_name(file->event_call); + se = find_synth_event(se_name); + if (se) + se->ref--; + + update_cond_flag(file); + if (hist_data->enable_timestamps) + tracing_set_filter_buffering(file->tr, false); + if (test->ops->free) + test->ops->free(test); + } + } +} + +static int event_hist_trigger_parse(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, char *cmd, + char *param_and_filter) +{ + unsigned int hist_trigger_bits = 
TRACING_MAP_BITS_DEFAULT; + struct event_trigger_data *trigger_data; + struct hist_trigger_attrs *attrs; + struct hist_trigger_data *hist_data; + char *param, *filter, *p, *start; + struct synth_event *se; + const char *se_name; + bool remove; + int ret = 0; + + lockdep_assert_held(&event_mutex); + + if (WARN_ON(!glob)) + return -EINVAL; + + if (glob[0]) { + hist_err_clear(); + last_cmd_set(file, param_and_filter); + } + + remove = event_trigger_check_remove(glob); + + if (event_trigger_empty_param(param_and_filter)) + return -EINVAL; + + /* + * separate the trigger from the filter (k:v [if filter]) + * allowing for whitespace in the trigger + */ + p = param = param_and_filter; + do { + p = strstr(p, "if"); + if (!p) + break; + if (p == param_and_filter) + return -EINVAL; + if (*(p - 1) != ' ' && *(p - 1) != '\t') { + p++; + continue; + } + if (p >= param_and_filter + strlen(param_and_filter) - (sizeof("if") - 1) - 1) + return -EINVAL; + if (*(p + sizeof("if") - 1) != ' ' && *(p + sizeof("if") - 1) != '\t') { + p++; + continue; + } + break; + } while (1); + + if (!p) + filter = NULL; + else { + *(p - 1) = '\0'; + filter = strstrip(p); + param = strstrip(param); + } + + /* + * To simplify arithmetic expression parsing, replace occurrences of + * '.sym-offset' modifier with '.symXoffset' + */ + start = strstr(param, ".sym-offset"); + while (start) { + *(start + 4) = 'X'; + start = strstr(start + 11, ".sym-offset"); + } + + attrs = parse_hist_trigger_attrs(file->tr, param); + if (IS_ERR(attrs)) + return PTR_ERR(attrs); + + if (attrs->map_bits) + hist_trigger_bits = attrs->map_bits; + + hist_data = create_hist_data(hist_trigger_bits, attrs, file, remove); + if (IS_ERR(hist_data)) { + destroy_hist_trigger_attrs(attrs); + return PTR_ERR(hist_data); + } + + trigger_data = event_trigger_alloc(cmd_ops, cmd, param, hist_data); + if (!trigger_data) { + ret = -ENOMEM; + goto out_free; + } + + ret = event_trigger_set_filter(cmd_ops, file, filter, trigger_data); + if (ret < 0) + goto out_free; + + if (remove) { + if (!have_hist_trigger_match(trigger_data, file)) + goto out_free; + + if (hist_trigger_check_refs(trigger_data, file)) { + ret = -EBUSY; + goto out_free; + } + + event_trigger_unregister(cmd_ops, file, glob+1, trigger_data); + se_name = trace_event_name(file->event_call); + se = find_synth_event(se_name); + if (se) + se->ref--; + ret = 0; + goto out_free; + } + + if (existing_hist_update_only(glob, trigger_data, file)) + goto out_free; + + ret = event_trigger_register(cmd_ops, file, glob, trigger_data); + if (ret < 0) + goto out_free; + + if (get_named_trigger_data(trigger_data)) + goto enable; + + ret = create_actions(hist_data); + if (ret) + goto out_unreg; + + if (has_hist_vars(hist_data) || hist_data->n_var_refs) { + ret = save_hist_vars(hist_data); + if (ret) + goto out_unreg; + } + + ret = tracing_map_init(hist_data->map); + if (ret) + goto out_unreg; +enable: + ret = hist_trigger_enable(trigger_data, file); + if (ret) + goto out_unreg; + + se_name = trace_event_name(file->event_call); + se = find_synth_event(se_name); + if (se) + se->ref++; + out: + if (ret == 0 && glob[0]) + hist_err_clear(); + + return ret; + out_unreg: + event_trigger_unregister(cmd_ops, file, glob+1, trigger_data); + out_free: + event_trigger_reset_filter(cmd_ops, trigger_data); + + remove_hist_vars(hist_data); + + kfree(trigger_data); + + destroy_hist_data(hist_data); + goto out; +} + +static struct event_command trigger_hist_cmd = { + .name = "hist", + .trigger_type = ETT_EVENT_HIST, + .flags = 
EVENT_CMD_FL_NEEDS_REC, + .parse = event_hist_trigger_parse, + .reg = hist_register_trigger, + .unreg = hist_unregister_trigger, + .unreg_all = hist_unreg_all, + .get_trigger_ops = event_hist_get_trigger_ops, + .set_filter = set_trigger_filter, +}; + +__init int register_trigger_hist_cmd(void) +{ + int ret; + + ret = register_event_command(&trigger_hist_cmd); + WARN_ON(ret < 0); + + return ret; +} + +static void +hist_enable_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, + struct ring_buffer_event *event) +{ + struct enable_trigger_data *enable_data = data->private_data; + struct event_trigger_data *test; + + list_for_each_entry_rcu(test, &enable_data->file->triggers, list, + lockdep_is_held(&event_mutex)) { + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { + if (enable_data->enable) + test->paused = false; + else + test->paused = true; + } + } +} + +static void +hist_enable_count_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, + struct ring_buffer_event *event) +{ + if (!data->count) + return; + + if (data->count != -1) + (data->count)--; + + hist_enable_trigger(data, buffer, rec, event); +} + +static struct event_trigger_ops hist_enable_trigger_ops = { + .trigger = hist_enable_trigger, + .print = event_enable_trigger_print, + .init = event_trigger_init, + .free = event_enable_trigger_free, +}; + +static struct event_trigger_ops hist_enable_count_trigger_ops = { + .trigger = hist_enable_count_trigger, + .print = event_enable_trigger_print, + .init = event_trigger_init, + .free = event_enable_trigger_free, +}; + +static struct event_trigger_ops hist_disable_trigger_ops = { + .trigger = hist_enable_trigger, + .print = event_enable_trigger_print, + .init = event_trigger_init, + .free = event_enable_trigger_free, +}; + +static struct event_trigger_ops hist_disable_count_trigger_ops = { + .trigger = hist_enable_count_trigger, + .print = event_enable_trigger_print, + .init = event_trigger_init, + .free = event_enable_trigger_free, +}; + +static struct event_trigger_ops * +hist_enable_get_trigger_ops(char *cmd, char *param) +{ + struct event_trigger_ops *ops; + bool enable; + + enable = (strcmp(cmd, ENABLE_HIST_STR) == 0); + + if (enable) + ops = param ? &hist_enable_count_trigger_ops : + &hist_enable_trigger_ops; + else + ops = param ? 
&hist_disable_count_trigger_ops : + &hist_disable_trigger_ops; + + return ops; +} + +static void hist_enable_unreg_all(struct trace_event_file *file) +{ + struct event_trigger_data *test, *n; + + list_for_each_entry_safe(test, n, &file->triggers, list) { + if (test->cmd_ops->trigger_type == ETT_HIST_ENABLE) { + list_del_rcu(&test->list); + update_cond_flag(file); + trace_event_trigger_enable_disable(file, 0); + if (test->ops->free) + test->ops->free(test); + } + } +} + +static struct event_command trigger_hist_enable_cmd = { + .name = ENABLE_HIST_STR, + .trigger_type = ETT_HIST_ENABLE, + .parse = event_enable_trigger_parse, + .reg = event_enable_register_trigger, + .unreg = event_enable_unregister_trigger, + .unreg_all = hist_enable_unreg_all, + .get_trigger_ops = hist_enable_get_trigger_ops, + .set_filter = set_trigger_filter, +}; + +static struct event_command trigger_hist_disable_cmd = { + .name = DISABLE_HIST_STR, + .trigger_type = ETT_HIST_ENABLE, + .parse = event_enable_trigger_parse, + .reg = event_enable_register_trigger, + .unreg = event_enable_unregister_trigger, + .unreg_all = hist_enable_unreg_all, + .get_trigger_ops = hist_enable_get_trigger_ops, + .set_filter = set_trigger_filter, +}; + +static __init void unregister_trigger_hist_enable_disable_cmds(void) +{ + unregister_event_command(&trigger_hist_enable_cmd); + unregister_event_command(&trigger_hist_disable_cmd); +} + +__init int register_trigger_hist_enable_disable_cmds(void) +{ + int ret; + + ret = register_event_command(&trigger_hist_enable_cmd); + if (WARN_ON(ret < 0)) + return ret; + ret = register_event_command(&trigger_hist_disable_cmd); + if (WARN_ON(ret < 0)) + unregister_trigger_hist_enable_disable_cmds(); + + return ret; +} diff --git a/kernel/trace/trace_events_inject.c b/kernel/trace/trace_events_inject.c new file mode 100644 index 0000000000..8650562bda --- /dev/null +++ b/kernel/trace/trace_events_inject.c @@ -0,0 +1,335 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * trace_events_inject - trace event injection + * + * Copyright (C) 2019 Cong Wang <cwang@twitter.com> + */ + +#include <linux/module.h> +#include <linux/ctype.h> +#include <linux/mutex.h> +#include <linux/slab.h> +#include <linux/rculist.h> + +#include "trace.h" + +static int +trace_inject_entry(struct trace_event_file *file, void *rec, int len) +{ + struct trace_event_buffer fbuffer; + int written = 0; + void *entry; + + rcu_read_lock_sched(); + entry = trace_event_buffer_reserve(&fbuffer, file, len); + if (entry) { + memcpy(entry, rec, len); + written = len; + trace_event_buffer_commit(&fbuffer); + } + rcu_read_unlock_sched(); + + return written; +} + +static int +parse_field(char *str, struct trace_event_call *call, + struct ftrace_event_field **pf, u64 *pv) +{ + struct ftrace_event_field *field; + char *field_name; + int s, i = 0; + int len; + u64 val; + + if (!str[i]) + return 0; + /* First find the field to associate to */ + while (isspace(str[i])) + i++; + s = i; + while (isalnum(str[i]) || str[i] == '_') + i++; + len = i - s; + if (!len) + return -EINVAL; + + field_name = kmemdup_nul(str + s, len, GFP_KERNEL); + if (!field_name) + return -ENOMEM; + field = trace_find_event_field(call, field_name); + kfree(field_name); + if (!field) + return -ENOENT; + + *pf = field; + while (isspace(str[i])) + i++; + if (str[i] != '=') + return -EINVAL; + i++; + while (isspace(str[i])) + i++; + s = i; + if (isdigit(str[i]) || str[i] == '-') { + char *num, c; + int ret; + + /* Make sure the field is not a string */ + if (is_string_field(field)) + return 
-EINVAL; + + if (str[i] == '-') + i++; + + /* We allow 0xDEADBEEF */ + while (isalnum(str[i])) + i++; + num = str + s; + c = str[i]; + if (c != '\0' && !isspace(c)) + return -EINVAL; + str[i] = '\0'; + /* Make sure it is a value */ + if (field->is_signed) + ret = kstrtoll(num, 0, &val); + else + ret = kstrtoull(num, 0, &val); + str[i] = c; + if (ret) + return ret; + + *pv = val; + return i; + } else if (str[i] == '\'' || str[i] == '"') { + char q = str[i]; + + /* Make sure the field is OK for strings */ + if (!is_string_field(field)) + return -EINVAL; + + for (i++; str[i]; i++) { + if (str[i] == '\\' && str[i + 1]) { + i++; + continue; + } + if (str[i] == q) + break; + } + if (!str[i]) + return -EINVAL; + + /* Skip quotes */ + s++; + len = i - s; + if (len >= MAX_FILTER_STR_VAL) + return -EINVAL; + + *pv = (unsigned long)(str + s); + str[i] = 0; + /* go past the last quote */ + i++; + return i; + } + + return -EINVAL; +} + +static int trace_get_entry_size(struct trace_event_call *call) +{ + struct ftrace_event_field *field; + struct list_head *head; + int size = 0; + + head = trace_get_fields(call); + list_for_each_entry(field, head, link) { + if (field->size + field->offset > size) + size = field->size + field->offset; + } + + return size; +} + +static void *trace_alloc_entry(struct trace_event_call *call, int *size) +{ + int entry_size = trace_get_entry_size(call); + struct ftrace_event_field *field; + struct list_head *head; + void *entry = NULL; + + /* We need an extra '\0' at the end. */ + entry = kzalloc(entry_size + 1, GFP_KERNEL); + if (!entry) + return NULL; + + head = trace_get_fields(call); + list_for_each_entry(field, head, link) { + if (!is_string_field(field)) + continue; + if (field->filter_type == FILTER_STATIC_STRING) + continue; + if (field->filter_type == FILTER_DYN_STRING || + field->filter_type == FILTER_RDYN_STRING) { + u32 *str_item; + int str_loc = entry_size & 0xffff; + + if (field->filter_type == FILTER_RDYN_STRING) + str_loc -= field->offset + field->size; + + str_item = (u32 *)(entry + field->offset); + *str_item = str_loc; /* string length is 0. */ + } else { + char **paddr; + + paddr = (char **)(entry + field->offset); + *paddr = ""; + } + } + + *size = entry_size + 1; + return entry; +} + +#define INJECT_STRING "STATIC STRING CAN NOT BE INJECTED" + +/* Caller is responsible to free the *pentry. 
*/ +static int parse_entry(char *str, struct trace_event_call *call, void **pentry) +{ + struct ftrace_event_field *field; + void *entry = NULL; + int entry_size; + u64 val = 0; + int len; + + entry = trace_alloc_entry(call, &entry_size); + *pentry = entry; + if (!entry) + return -ENOMEM; + + tracing_generic_entry_update(entry, call->event.type, + tracing_gen_ctx()); + + while ((len = parse_field(str, call, &field, &val)) > 0) { + if (is_function_field(field)) + return -EINVAL; + + if (is_string_field(field)) { + char *addr = (char *)(unsigned long) val; + + if (field->filter_type == FILTER_STATIC_STRING) { + strscpy(entry + field->offset, addr, field->size); + } else if (field->filter_type == FILTER_DYN_STRING || + field->filter_type == FILTER_RDYN_STRING) { + int str_len = strlen(addr) + 1; + int str_loc = entry_size & 0xffff; + u32 *str_item; + + entry_size += str_len; + *pentry = krealloc(entry, entry_size, GFP_KERNEL); + if (!*pentry) { + kfree(entry); + return -ENOMEM; + } + entry = *pentry; + + strscpy(entry + (entry_size - str_len), addr, str_len); + str_item = (u32 *)(entry + field->offset); + if (field->filter_type == FILTER_RDYN_STRING) + str_loc -= field->offset + field->size; + *str_item = (str_len << 16) | str_loc; + } else { + char **paddr; + + paddr = (char **)(entry + field->offset); + *paddr = INJECT_STRING; + } + } else { + switch (field->size) { + case 1: { + u8 tmp = (u8) val; + + memcpy(entry + field->offset, &tmp, 1); + break; + } + case 2: { + u16 tmp = (u16) val; + + memcpy(entry + field->offset, &tmp, 2); + break; + } + case 4: { + u32 tmp = (u32) val; + + memcpy(entry + field->offset, &tmp, 4); + break; + } + case 8: + memcpy(entry + field->offset, &val, 8); + break; + default: + return -EINVAL; + } + } + + str += len; + } + + if (len < 0) + return len; + + return entry_size; +} + +static ssize_t +event_inject_write(struct file *filp, const char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct trace_event_call *call; + struct trace_event_file *file; + int err = -ENODEV, size; + void *entry = NULL; + char *buf; + + if (cnt >= PAGE_SIZE) + return -EINVAL; + + buf = memdup_user_nul(ubuf, cnt); + if (IS_ERR(buf)) + return PTR_ERR(buf); + strim(buf); + + mutex_lock(&event_mutex); + file = event_file_data(filp); + if (file) { + call = file->event_call; + size = parse_entry(buf, call, &entry); + if (size < 0) + err = size; + else + err = trace_inject_entry(file, entry, size); + } + mutex_unlock(&event_mutex); + + kfree(entry); + kfree(buf); + + if (err < 0) + return err; + + *ppos += err; + return cnt; +} + +static ssize_t +event_inject_read(struct file *file, char __user *buf, size_t size, + loff_t *ppos) +{ + return -EPERM; +} + +const struct file_operations event_inject_fops = { + .open = tracing_open_file_tr, + .read = event_inject_read, + .write = event_inject_write, + .release = tracing_release_file_tr, +}; diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c new file mode 100644 index 0000000000..846e02c0fb --- /dev/null +++ b/kernel/trace/trace_events_synth.c @@ -0,0 +1,2335 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * trace_events_synth - synthetic trace events + * + * Copyright (C) 2015, 2020 Tom Zanussi <tom.zanussi@linux.intel.com> + */ + +#include <linux/module.h> +#include <linux/kallsyms.h> +#include <linux/security.h> +#include <linux/mutex.h> +#include <linux/slab.h> +#include <linux/stacktrace.h> +#include <linux/rculist.h> +#include <linux/tracefs.h> + +/* for gfp flag names */ +#include <linux/trace_events.h> 
+#include <trace/events/mmflags.h> +#include "trace_probe.h" +#include "trace_probe_kernel.h" + +#include "trace_synth.h" + +#undef ERRORS +#define ERRORS \ + C(BAD_NAME, "Illegal name"), \ + C(INVALID_CMD, "Command must be of the form: <name> field[;field] ..."),\ + C(INVALID_DYN_CMD, "Command must be of the form: s or -:[synthetic/]<name> field[;field] ..."),\ + C(EVENT_EXISTS, "Event already exists"), \ + C(TOO_MANY_FIELDS, "Too many fields"), \ + C(INCOMPLETE_TYPE, "Incomplete type"), \ + C(INVALID_TYPE, "Invalid type"), \ + C(INVALID_FIELD, "Invalid field"), \ + C(INVALID_ARRAY_SPEC, "Invalid array specification"), + +#undef C +#define C(a, b) SYNTH_ERR_##a + +enum { ERRORS }; + +#undef C +#define C(a, b) b + +static const char *err_text[] = { ERRORS }; + +static DEFINE_MUTEX(lastcmd_mutex); +static char *last_cmd; + +static int errpos(const char *str) +{ + int ret = 0; + + mutex_lock(&lastcmd_mutex); + if (!str || !last_cmd) + goto out; + + ret = err_pos(last_cmd, str); + out: + mutex_unlock(&lastcmd_mutex); + return ret; +} + +static void last_cmd_set(const char *str) +{ + if (!str) + return; + + mutex_lock(&lastcmd_mutex); + kfree(last_cmd); + last_cmd = kstrdup(str, GFP_KERNEL); + mutex_unlock(&lastcmd_mutex); +} + +static void synth_err(u8 err_type, u16 err_pos) +{ + mutex_lock(&lastcmd_mutex); + if (!last_cmd) + goto out; + + tracing_log_err(NULL, "synthetic_events", last_cmd, err_text, + err_type, err_pos); + out: + mutex_unlock(&lastcmd_mutex); +} + +static int create_synth_event(const char *raw_command); +static int synth_event_show(struct seq_file *m, struct dyn_event *ev); +static int synth_event_release(struct dyn_event *ev); +static bool synth_event_is_busy(struct dyn_event *ev); +static bool synth_event_match(const char *system, const char *event, + int argc, const char **argv, struct dyn_event *ev); + +static struct dyn_event_operations synth_event_ops = { + .create = create_synth_event, + .show = synth_event_show, + .is_busy = synth_event_is_busy, + .free = synth_event_release, + .match = synth_event_match, +}; + +static bool is_synth_event(struct dyn_event *ev) +{ + return ev->ops == &synth_event_ops; +} + +static struct synth_event *to_synth_event(struct dyn_event *ev) +{ + return container_of(ev, struct synth_event, devent); +} + +static bool synth_event_is_busy(struct dyn_event *ev) +{ + struct synth_event *event = to_synth_event(ev); + + return event->ref != 0; +} + +static bool synth_event_match(const char *system, const char *event, + int argc, const char **argv, struct dyn_event *ev) +{ + struct synth_event *sev = to_synth_event(ev); + + return strcmp(sev->name, event) == 0 && + (!system || strcmp(system, SYNTH_SYSTEM) == 0); +} + +struct synth_trace_event { + struct trace_entry ent; + union trace_synth_field fields[]; +}; + +static int synth_event_define_fields(struct trace_event_call *call) +{ + struct synth_trace_event trace; + int offset = offsetof(typeof(trace), fields); + struct synth_event *event = call->data; + unsigned int i, size, n_u64; + char *name, *type; + bool is_signed; + int ret = 0; + + for (i = 0, n_u64 = 0; i < event->n_fields; i++) { + size = event->fields[i]->size; + is_signed = event->fields[i]->is_signed; + type = event->fields[i]->type; + name = event->fields[i]->name; + ret = trace_define_field(call, type, name, offset, size, + is_signed, FILTER_OTHER); + if (ret) + break; + + event->fields[i]->offset = n_u64; + + if (event->fields[i]->is_string && !event->fields[i]->is_dynamic) { + offset += STR_VAR_LEN_MAX; + n_u64 += STR_VAR_LEN_MAX 
/ sizeof(u64); + } else { + offset += sizeof(u64); + n_u64++; + } + } + + event->n_u64 = n_u64; + + return ret; +} + +static bool synth_field_signed(char *type) +{ + if (str_has_prefix(type, "u")) + return false; + if (strcmp(type, "gfp_t") == 0) + return false; + + return true; +} + +static int synth_field_is_string(char *type) +{ + if (strstr(type, "char[") != NULL) + return true; + + return false; +} + +static int synth_field_is_stack(char *type) +{ + if (strstr(type, "long[") != NULL) + return true; + + return false; +} + +static int synth_field_string_size(char *type) +{ + char buf[4], *end, *start; + unsigned int len; + int size, err; + + start = strstr(type, "char["); + if (start == NULL) + return -EINVAL; + start += sizeof("char[") - 1; + + end = strchr(type, ']'); + if (!end || end < start || type + strlen(type) > end + 1) + return -EINVAL; + + len = end - start; + if (len > 3) + return -EINVAL; + + if (len == 0) + return 0; /* variable-length string */ + + strncpy(buf, start, len); + buf[len] = '\0'; + + err = kstrtouint(buf, 0, &size); + if (err) + return err; + + if (size > STR_VAR_LEN_MAX) + return -EINVAL; + + return size; +} + +static int synth_field_size(char *type) +{ + int size = 0; + + if (strcmp(type, "s64") == 0) + size = sizeof(s64); + else if (strcmp(type, "u64") == 0) + size = sizeof(u64); + else if (strcmp(type, "s32") == 0) + size = sizeof(s32); + else if (strcmp(type, "u32") == 0) + size = sizeof(u32); + else if (strcmp(type, "s16") == 0) + size = sizeof(s16); + else if (strcmp(type, "u16") == 0) + size = sizeof(u16); + else if (strcmp(type, "s8") == 0) + size = sizeof(s8); + else if (strcmp(type, "u8") == 0) + size = sizeof(u8); + else if (strcmp(type, "char") == 0) + size = sizeof(char); + else if (strcmp(type, "unsigned char") == 0) + size = sizeof(unsigned char); + else if (strcmp(type, "int") == 0) + size = sizeof(int); + else if (strcmp(type, "unsigned int") == 0) + size = sizeof(unsigned int); + else if (strcmp(type, "long") == 0) + size = sizeof(long); + else if (strcmp(type, "unsigned long") == 0) + size = sizeof(unsigned long); + else if (strcmp(type, "bool") == 0) + size = sizeof(bool); + else if (strcmp(type, "pid_t") == 0) + size = sizeof(pid_t); + else if (strcmp(type, "gfp_t") == 0) + size = sizeof(gfp_t); + else if (synth_field_is_string(type)) + size = synth_field_string_size(type); + else if (synth_field_is_stack(type)) + size = 0; + + return size; +} + +static const char *synth_field_fmt(char *type) +{ + const char *fmt = "%llu"; + + if (strcmp(type, "s64") == 0) + fmt = "%lld"; + else if (strcmp(type, "u64") == 0) + fmt = "%llu"; + else if (strcmp(type, "s32") == 0) + fmt = "%d"; + else if (strcmp(type, "u32") == 0) + fmt = "%u"; + else if (strcmp(type, "s16") == 0) + fmt = "%d"; + else if (strcmp(type, "u16") == 0) + fmt = "%u"; + else if (strcmp(type, "s8") == 0) + fmt = "%d"; + else if (strcmp(type, "u8") == 0) + fmt = "%u"; + else if (strcmp(type, "char") == 0) + fmt = "%d"; + else if (strcmp(type, "unsigned char") == 0) + fmt = "%u"; + else if (strcmp(type, "int") == 0) + fmt = "%d"; + else if (strcmp(type, "unsigned int") == 0) + fmt = "%u"; + else if (strcmp(type, "long") == 0) + fmt = "%ld"; + else if (strcmp(type, "unsigned long") == 0) + fmt = "%lu"; + else if (strcmp(type, "bool") == 0) + fmt = "%d"; + else if (strcmp(type, "pid_t") == 0) + fmt = "%d"; + else if (strcmp(type, "gfp_t") == 0) + fmt = "%x"; + else if (synth_field_is_string(type)) + fmt = "%.*s"; + else if (synth_field_is_stack(type)) + fmt = "%s"; + + return fmt; +} + 
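/*
 * For reference (editorial example based on Documentation/trace/histogram.rst,
 * not part of this file): the type strings handled by synth_field_size() and
 * synth_field_fmt() above are the ones accepted when a synthetic event is
 * defined, e.g. via tracefs:
 *
 *   # echo 'wakeup_latency u64 lat; pid_t pid; char comm[16]' \
 *       >> /sys/kernel/tracing/synthetic_events
 *
 * "u64" and "pid_t" map to fixed sizes, "char[16]" is a 16-byte static string
 * printed with "%.*s", and "char[]" (size 0 above) is later turned into a
 * dynamic __data_loc string field by parse_synth_field().
 */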
+static void print_synth_event_num_val(struct trace_seq *s, + char *print_fmt, char *name, + int size, union trace_synth_field *val, char *space) +{ + switch (size) { + case 1: + trace_seq_printf(s, print_fmt, name, val->as_u8, space); + break; + + case 2: + trace_seq_printf(s, print_fmt, name, val->as_u16, space); + break; + + case 4: + trace_seq_printf(s, print_fmt, name, val->as_u32, space); + break; + + default: + trace_seq_printf(s, print_fmt, name, val->as_u64, space); + break; + } +} + +static enum print_line_t print_synth_event(struct trace_iterator *iter, + int flags, + struct trace_event *event) +{ + struct trace_array *tr = iter->tr; + struct trace_seq *s = &iter->seq; + struct synth_trace_event *entry; + struct synth_event *se; + unsigned int i, j, n_u64; + char print_fmt[32]; + const char *fmt; + + entry = (struct synth_trace_event *)iter->ent; + se = container_of(event, struct synth_event, call.event); + + trace_seq_printf(s, "%s: ", se->name); + + for (i = 0, n_u64 = 0; i < se->n_fields; i++) { + if (trace_seq_has_overflowed(s)) + goto end; + + fmt = synth_field_fmt(se->fields[i]->type); + + /* parameter types */ + if (tr && tr->trace_flags & TRACE_ITER_VERBOSE) + trace_seq_printf(s, "%s ", fmt); + + snprintf(print_fmt, sizeof(print_fmt), "%%s=%s%%s", fmt); + + /* parameter values */ + if (se->fields[i]->is_string) { + if (se->fields[i]->is_dynamic) { + union trace_synth_field *data = &entry->fields[n_u64]; + + trace_seq_printf(s, print_fmt, se->fields[i]->name, + STR_VAR_LEN_MAX, + (char *)entry + data->as_dynamic.offset, + i == se->n_fields - 1 ? "" : " "); + n_u64++; + } else { + trace_seq_printf(s, print_fmt, se->fields[i]->name, + STR_VAR_LEN_MAX, + (char *)&entry->fields[n_u64].as_u64, + i == se->n_fields - 1 ? "" : " "); + n_u64 += STR_VAR_LEN_MAX / sizeof(u64); + } + } else if (se->fields[i]->is_stack) { + union trace_synth_field *data = &entry->fields[n_u64]; + unsigned long *p = (void *)entry + data->as_dynamic.offset; + + trace_seq_printf(s, "%s=STACK:\n", se->fields[i]->name); + for (j = 1; j < data->as_dynamic.len / sizeof(long); j++) + trace_seq_printf(s, "=> %pS\n", (void *)p[j]); + n_u64++; + } else { + struct trace_print_flags __flags[] = { + __def_gfpflag_names, {-1, NULL} }; + char *space = (i == se->n_fields - 1 ? 
"" : " "); + + print_synth_event_num_val(s, print_fmt, + se->fields[i]->name, + se->fields[i]->size, + &entry->fields[n_u64], + space); + + if (strcmp(se->fields[i]->type, "gfp_t") == 0) { + trace_seq_puts(s, " ("); + trace_print_flags_seq(s, "|", + entry->fields[n_u64].as_u64, + __flags); + trace_seq_putc(s, ')'); + } + n_u64++; + } + } +end: + trace_seq_putc(s, '\n'); + + return trace_handle_return(s); +} + +static struct trace_event_functions synth_event_funcs = { + .trace = print_synth_event +}; + +static unsigned int trace_string(struct synth_trace_event *entry, + struct synth_event *event, + char *str_val, + bool is_dynamic, + unsigned int data_size, + unsigned int *n_u64) +{ + unsigned int len = 0; + char *str_field; + int ret; + + if (is_dynamic) { + union trace_synth_field *data = &entry->fields[*n_u64]; + + data->as_dynamic.offset = struct_size(entry, fields, event->n_u64) + data_size; + data->as_dynamic.len = fetch_store_strlen((unsigned long)str_val); + + ret = fetch_store_string((unsigned long)str_val, &entry->fields[*n_u64], entry); + + (*n_u64)++; + } else { + str_field = (char *)&entry->fields[*n_u64].as_u64; + +#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE + if ((unsigned long)str_val < TASK_SIZE) + ret = strncpy_from_user_nofault(str_field, (const void __user *)str_val, STR_VAR_LEN_MAX); + else +#endif + ret = strncpy_from_kernel_nofault(str_field, str_val, STR_VAR_LEN_MAX); + + if (ret < 0) + strcpy(str_field, FAULT_STRING); + + (*n_u64) += STR_VAR_LEN_MAX / sizeof(u64); + } + + return len; +} + +static unsigned int trace_stack(struct synth_trace_event *entry, + struct synth_event *event, + long *stack, + unsigned int data_size, + unsigned int *n_u64) +{ + union trace_synth_field *data = &entry->fields[*n_u64]; + unsigned int len; + u32 data_offset; + void *data_loc; + + data_offset = struct_size(entry, fields, event->n_u64); + data_offset += data_size; + + for (len = 0; len < HIST_STACKTRACE_DEPTH; len++) { + if (!stack[len]) + break; + } + + len *= sizeof(long); + + /* Find the dynamic section to copy the stack into. */ + data_loc = (void *)entry + data_offset; + memcpy(data_loc, stack, len); + + /* Fill in the field that holds the offset/len combo */ + + data->as_dynamic.offset = data_offset; + data->as_dynamic.len = len; + + (*n_u64)++; + + return len; +} + +static notrace void trace_event_raw_event_synth(void *__data, + u64 *var_ref_vals, + unsigned int *var_ref_idx) +{ + unsigned int i, n_u64, val_idx, len, data_size = 0; + struct trace_event_file *trace_file = __data; + struct synth_trace_event *entry; + struct trace_event_buffer fbuffer; + struct trace_buffer *buffer; + struct synth_event *event; + int fields_size = 0; + + event = trace_file->event_call->data; + + if (trace_trigger_soft_disabled(trace_file)) + return; + + fields_size = event->n_u64 * sizeof(u64); + + for (i = 0; i < event->n_dynamic_fields; i++) { + unsigned int field_pos = event->dynamic_fields[i]->field_pos; + char *str_val; + + val_idx = var_ref_idx[field_pos]; + str_val = (char *)(long)var_ref_vals[val_idx]; + + if (event->dynamic_fields[i]->is_stack) { + /* reserve one extra element for size */ + len = *((unsigned long *)str_val) + 1; + len *= sizeof(unsigned long); + } else { + len = fetch_store_strlen((unsigned long)str_val); + } + + fields_size += len; + } + + /* + * Avoid ring buffer recursion detection, as this event + * is being performed within another event. 
+ */ + buffer = trace_file->tr->array_buffer.buffer; + ring_buffer_nest_start(buffer); + + entry = trace_event_buffer_reserve(&fbuffer, trace_file, + sizeof(*entry) + fields_size); + if (!entry) + goto out; + + for (i = 0, n_u64 = 0; i < event->n_fields; i++) { + val_idx = var_ref_idx[i]; + if (event->fields[i]->is_string) { + char *str_val = (char *)(long)var_ref_vals[val_idx]; + + len = trace_string(entry, event, str_val, + event->fields[i]->is_dynamic, + data_size, &n_u64); + data_size += len; /* only dynamic string increments */ + } else if (event->fields[i]->is_stack) { + long *stack = (long *)(long)var_ref_vals[val_idx]; + + len = trace_stack(entry, event, stack, + data_size, &n_u64); + data_size += len; + } else { + struct synth_field *field = event->fields[i]; + u64 val = var_ref_vals[val_idx]; + + switch (field->size) { + case 1: + entry->fields[n_u64].as_u8 = (u8)val; + break; + + case 2: + entry->fields[n_u64].as_u16 = (u16)val; + break; + + case 4: + entry->fields[n_u64].as_u32 = (u32)val; + break; + + default: + entry->fields[n_u64].as_u64 = val; + break; + } + n_u64++; + } + } + + trace_event_buffer_commit(&fbuffer); +out: + ring_buffer_nest_end(buffer); +} + +static void free_synth_event_print_fmt(struct trace_event_call *call) +{ + if (call) { + kfree(call->print_fmt); + call->print_fmt = NULL; + } +} + +static int __set_synth_event_print_fmt(struct synth_event *event, + char *buf, int len) +{ + const char *fmt; + int pos = 0; + int i; + + /* When len=0, we just calculate the needed length */ +#define LEN_OR_ZERO (len ? len - pos : 0) + + pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); + for (i = 0; i < event->n_fields; i++) { + fmt = synth_field_fmt(event->fields[i]->type); + pos += snprintf(buf + pos, LEN_OR_ZERO, "%s=%s%s", + event->fields[i]->name, fmt, + i == event->n_fields - 1 ? "" : ", "); + } + pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); + + for (i = 0; i < event->n_fields; i++) { + if (event->fields[i]->is_string && + event->fields[i]->is_dynamic) + pos += snprintf(buf + pos, LEN_OR_ZERO, + ", __get_str(%s)", event->fields[i]->name); + else if (event->fields[i]->is_stack) + pos += snprintf(buf + pos, LEN_OR_ZERO, + ", __get_stacktrace(%s)", event->fields[i]->name); + else + pos += snprintf(buf + pos, LEN_OR_ZERO, + ", REC->%s", event->fields[i]->name); + } + +#undef LEN_OR_ZERO + + /* return the length of print_fmt */ + return pos; +} + +static int set_synth_event_print_fmt(struct trace_event_call *call) +{ + struct synth_event *event = call->data; + char *print_fmt; + int len; + + /* First: called with 0 length to calculate the needed length */ + len = __set_synth_event_print_fmt(event, NULL, 0); + + print_fmt = kmalloc(len + 1, GFP_KERNEL); + if (!print_fmt) + return -ENOMEM; + + /* Second: actually write the @print_fmt */ + __set_synth_event_print_fmt(event, print_fmt, len + 1); + call->print_fmt = print_fmt; + + return 0; +} + +static void free_synth_field(struct synth_field *field) +{ + kfree(field->type); + kfree(field->name); + kfree(field); +} + +static int check_field_version(const char *prefix, const char *field_type, + const char *field_name) +{ + /* + * For backward compatibility, the old synthetic event command + * format did not require semicolons, and in order to not + * break user space, that old format must still work. If a new + * feature is added, then the format that uses the new feature + * will be required to have semicolons, as nothing that uses + * the old format would be using the new, yet to be created, + * feature. 
When a new feature is added, this will detect it, + * and return a number greater than 1, and require the format + * to use semicolons. + */ + return 1; +} + +static struct synth_field *parse_synth_field(int argc, char **argv, + int *consumed, int *field_version) +{ + const char *prefix = NULL, *field_type = argv[0], *field_name, *array; + struct synth_field *field; + int len, ret = -ENOMEM; + struct seq_buf s; + ssize_t size; + + if (!strcmp(field_type, "unsigned")) { + if (argc < 3) { + synth_err(SYNTH_ERR_INCOMPLETE_TYPE, errpos(field_type)); + return ERR_PTR(-EINVAL); + } + prefix = "unsigned "; + field_type = argv[1]; + field_name = argv[2]; + *consumed += 3; + } else { + field_name = argv[1]; + *consumed += 2; + } + + if (!field_name) { + synth_err(SYNTH_ERR_INVALID_FIELD, errpos(field_type)); + return ERR_PTR(-EINVAL); + } + + *field_version = check_field_version(prefix, field_type, field_name); + + field = kzalloc(sizeof(*field), GFP_KERNEL); + if (!field) + return ERR_PTR(-ENOMEM); + + len = strlen(field_name); + array = strchr(field_name, '['); + if (array) + len -= strlen(array); + + field->name = kmemdup_nul(field_name, len, GFP_KERNEL); + if (!field->name) + goto free; + + if (!is_good_name(field->name)) { + synth_err(SYNTH_ERR_BAD_NAME, errpos(field_name)); + ret = -EINVAL; + goto free; + } + + len = strlen(field_type) + 1; + + if (array) + len += strlen(array); + + if (prefix) + len += strlen(prefix); + + field->type = kzalloc(len, GFP_KERNEL); + if (!field->type) + goto free; + + seq_buf_init(&s, field->type, len); + if (prefix) + seq_buf_puts(&s, prefix); + seq_buf_puts(&s, field_type); + if (array) + seq_buf_puts(&s, array); + if (WARN_ON_ONCE(!seq_buf_buffer_left(&s))) + goto free; + + s.buffer[s.len] = '\0'; + + size = synth_field_size(field->type); + if (size < 0) { + if (array) + synth_err(SYNTH_ERR_INVALID_ARRAY_SPEC, errpos(field_name)); + else + synth_err(SYNTH_ERR_INVALID_TYPE, errpos(field_type)); + ret = -EINVAL; + goto free; + } else if (size == 0) { + if (synth_field_is_string(field->type) || + synth_field_is_stack(field->type)) { + char *type; + + len = sizeof("__data_loc ") + strlen(field->type) + 1; + type = kzalloc(len, GFP_KERNEL); + if (!type) + goto free; + + seq_buf_init(&s, type, len); + seq_buf_puts(&s, "__data_loc "); + seq_buf_puts(&s, field->type); + + if (WARN_ON_ONCE(!seq_buf_buffer_left(&s))) + goto free; + s.buffer[s.len] = '\0'; + + kfree(field->type); + field->type = type; + + field->is_dynamic = true; + size = sizeof(u64); + } else { + synth_err(SYNTH_ERR_INVALID_TYPE, errpos(field_type)); + ret = -EINVAL; + goto free; + } + } + field->size = size; + + if (synth_field_is_string(field->type)) + field->is_string = true; + else if (synth_field_is_stack(field->type)) + field->is_stack = true; + + field->is_signed = synth_field_signed(field->type); + out: + return field; + free: + free_synth_field(field); + field = ERR_PTR(ret); + goto out; +} + +static void free_synth_tracepoint(struct tracepoint *tp) +{ + if (!tp) + return; + + kfree(tp->name); + kfree(tp); +} + +static struct tracepoint *alloc_synth_tracepoint(char *name) +{ + struct tracepoint *tp; + + tp = kzalloc(sizeof(*tp), GFP_KERNEL); + if (!tp) + return ERR_PTR(-ENOMEM); + + tp->name = kstrdup(name, GFP_KERNEL); + if (!tp->name) { + kfree(tp); + return ERR_PTR(-ENOMEM); + } + + return tp; +} + +struct synth_event *find_synth_event(const char *name) +{ + struct dyn_event *pos; + struct synth_event *event; + + for_each_dyn_event(pos) { + if (!is_synth_event(pos)) + continue; + event = 
to_synth_event(pos); + if (strcmp(event->name, name) == 0) + return event; + } + + return NULL; +} + +static struct trace_event_fields synth_event_fields_array[] = { + { .type = TRACE_FUNCTION_TYPE, + .define_fields = synth_event_define_fields }, + {} +}; + +static int register_synth_event(struct synth_event *event) +{ + struct trace_event_call *call = &event->call; + int ret = 0; + + event->call.class = &event->class; + event->class.system = kstrdup(SYNTH_SYSTEM, GFP_KERNEL); + if (!event->class.system) { + ret = -ENOMEM; + goto out; + } + + event->tp = alloc_synth_tracepoint(event->name); + if (IS_ERR(event->tp)) { + ret = PTR_ERR(event->tp); + event->tp = NULL; + goto out; + } + + INIT_LIST_HEAD(&call->class->fields); + call->event.funcs = &synth_event_funcs; + call->class->fields_array = synth_event_fields_array; + + ret = register_trace_event(&call->event); + if (!ret) { + ret = -ENODEV; + goto out; + } + call->flags = TRACE_EVENT_FL_TRACEPOINT; + call->class->reg = trace_event_reg; + call->class->probe = trace_event_raw_event_synth; + call->data = event; + call->tp = event->tp; + + ret = trace_add_event_call(call); + if (ret) { + pr_warn("Failed to register synthetic event: %s\n", + trace_event_name(call)); + goto err; + } + + ret = set_synth_event_print_fmt(call); + /* unregister_trace_event() will be called inside */ + if (ret < 0) + trace_remove_event_call(call); + out: + return ret; + err: + unregister_trace_event(&call->event); + goto out; +} + +static int unregister_synth_event(struct synth_event *event) +{ + struct trace_event_call *call = &event->call; + int ret; + + ret = trace_remove_event_call(call); + + return ret; +} + +static void free_synth_event(struct synth_event *event) +{ + unsigned int i; + + if (!event) + return; + + for (i = 0; i < event->n_fields; i++) + free_synth_field(event->fields[i]); + + kfree(event->fields); + kfree(event->dynamic_fields); + kfree(event->name); + kfree(event->class.system); + free_synth_tracepoint(event->tp); + free_synth_event_print_fmt(&event->call); + kfree(event); +} + +static struct synth_event *alloc_synth_event(const char *name, int n_fields, + struct synth_field **fields) +{ + unsigned int i, j, n_dynamic_fields = 0; + struct synth_event *event; + + event = kzalloc(sizeof(*event), GFP_KERNEL); + if (!event) { + event = ERR_PTR(-ENOMEM); + goto out; + } + + event->name = kstrdup(name, GFP_KERNEL); + if (!event->name) { + kfree(event); + event = ERR_PTR(-ENOMEM); + goto out; + } + + event->fields = kcalloc(n_fields, sizeof(*event->fields), GFP_KERNEL); + if (!event->fields) { + free_synth_event(event); + event = ERR_PTR(-ENOMEM); + goto out; + } + + for (i = 0; i < n_fields; i++) + if (fields[i]->is_dynamic) + n_dynamic_fields++; + + if (n_dynamic_fields) { + event->dynamic_fields = kcalloc(n_dynamic_fields, + sizeof(*event->dynamic_fields), + GFP_KERNEL); + if (!event->dynamic_fields) { + free_synth_event(event); + event = ERR_PTR(-ENOMEM); + goto out; + } + } + + dyn_event_init(&event->devent, &synth_event_ops); + + for (i = 0, j = 0; i < n_fields; i++) { + fields[i]->field_pos = i; + event->fields[i] = fields[i]; + + if (fields[i]->is_dynamic) + event->dynamic_fields[j++] = fields[i]; + } + event->n_dynamic_fields = j; + event->n_fields = n_fields; + out: + return event; +} + +static int synth_event_check_arg_fn(void *data) +{ + struct dynevent_arg_pair *arg_pair = data; + int size; + + size = synth_field_size((char *)arg_pair->lhs); + if (size == 0) { + if (strstr((char *)arg_pair->lhs, "[")) + return 0; + } + + return size ? 
0 : -EINVAL; +} + +/** + * synth_event_add_field - Add a new field to a synthetic event cmd + * @cmd: A pointer to the dynevent_cmd struct representing the new event + * @type: The type of the new field to add + * @name: The name of the new field to add + * + * Add a new field to a synthetic event cmd object. Field ordering is in + * the same order the fields are added. + * + * See synth_field_size() for available types. If field_name contains + * [n] the field is considered to be an array. + * + * Return: 0 if successful, error otherwise. + */ +int synth_event_add_field(struct dynevent_cmd *cmd, const char *type, + const char *name) +{ + struct dynevent_arg_pair arg_pair; + int ret; + + if (cmd->type != DYNEVENT_TYPE_SYNTH) + return -EINVAL; + + if (!type || !name) + return -EINVAL; + + dynevent_arg_pair_init(&arg_pair, 0, ';'); + + arg_pair.lhs = type; + arg_pair.rhs = name; + + ret = dynevent_arg_pair_add(cmd, &arg_pair, synth_event_check_arg_fn); + if (ret) + return ret; + + if (++cmd->n_fields > SYNTH_FIELDS_MAX) + ret = -EINVAL; + + return ret; +} +EXPORT_SYMBOL_GPL(synth_event_add_field); + +/** + * synth_event_add_field_str - Add a new field to a synthetic event cmd + * @cmd: A pointer to the dynevent_cmd struct representing the new event + * @type_name: The type and name of the new field to add, as a single string + * + * Add a new field to a synthetic event cmd object, as a single + * string. The @type_name string is expected to be of the form 'type + * name', which will be appended by ';'. No sanity checking is done - + * what's passed in is assumed to already be well-formed. Field + * ordering is in the same order the fields are added. + * + * See synth_field_size() for available types. If field_name contains + * [n] the field is considered to be an array. + * + * Return: 0 if successful, error otherwise. + */ +int synth_event_add_field_str(struct dynevent_cmd *cmd, const char *type_name) +{ + struct dynevent_arg arg; + int ret; + + if (cmd->type != DYNEVENT_TYPE_SYNTH) + return -EINVAL; + + if (!type_name) + return -EINVAL; + + dynevent_arg_init(&arg, ';'); + + arg.str = type_name; + + ret = dynevent_arg_add(cmd, &arg, NULL); + if (ret) + return ret; + + if (++cmd->n_fields > SYNTH_FIELDS_MAX) + ret = -EINVAL; + + return ret; +} +EXPORT_SYMBOL_GPL(synth_event_add_field_str); + +/** + * synth_event_add_fields - Add multiple fields to a synthetic event cmd + * @cmd: A pointer to the dynevent_cmd struct representing the new event + * @fields: An array of type/name field descriptions + * @n_fields: The number of field descriptions contained in the fields array + * + * Add a new set of fields to a synthetic event cmd object. The event + * fields that will be defined for the event should be passed in as an + * array of struct synth_field_desc, and the number of elements in the + * array passed in as n_fields. Field ordering will retain the + * ordering given in the fields array. + * + * See synth_field_size() for available types. If field_name contains + * [n] the field is considered to be an array. + * + * Return: 0 if successful, error otherwise. 
+ */ +int synth_event_add_fields(struct dynevent_cmd *cmd, + struct synth_field_desc *fields, + unsigned int n_fields) +{ + unsigned int i; + int ret = 0; + + for (i = 0; i < n_fields; i++) { + if (fields[i].type == NULL || fields[i].name == NULL) { + ret = -EINVAL; + break; + } + + ret = synth_event_add_field(cmd, fields[i].type, fields[i].name); + if (ret) + break; + } + + return ret; +} +EXPORT_SYMBOL_GPL(synth_event_add_fields); + +/** + * __synth_event_gen_cmd_start - Start a synthetic event command from arg list + * @cmd: A pointer to the dynevent_cmd struct representing the new event + * @name: The name of the synthetic event + * @mod: The module creating the event, NULL if not created from a module + * @args: Variable number of arg (pairs), one pair for each field + * + * NOTE: Users normally won't want to call this function directly, but + * rather use the synth_event_gen_cmd_start() wrapper, which + * automatically adds a NULL to the end of the arg list. If this + * function is used directly, make sure the last arg in the variable + * arg list is NULL. + * + * Generate a synthetic event command to be executed by + * synth_event_gen_cmd_end(). This function can be used to generate + * the complete command or only the first part of it; in the latter + * case, synth_event_add_field(), synth_event_add_field_str(), or + * synth_event_add_fields() can be used to add more fields following + * this. + * + * There should be an even number variable args, each pair consisting + * of a type followed by a field name. + * + * See synth_field_size() for available types. If field_name contains + * [n] the field is considered to be an array. + * + * Return: 0 if successful, error otherwise. + */ +int __synth_event_gen_cmd_start(struct dynevent_cmd *cmd, const char *name, + struct module *mod, ...) +{ + struct dynevent_arg arg; + va_list args; + int ret; + + cmd->event_name = name; + cmd->private_data = mod; + + if (cmd->type != DYNEVENT_TYPE_SYNTH) + return -EINVAL; + + dynevent_arg_init(&arg, 0); + arg.str = name; + ret = dynevent_arg_add(cmd, &arg, NULL); + if (ret) + return ret; + + va_start(args, mod); + for (;;) { + const char *type, *name; + + type = va_arg(args, const char *); + if (!type) + break; + name = va_arg(args, const char *); + if (!name) + break; + + if (++cmd->n_fields > SYNTH_FIELDS_MAX) { + ret = -EINVAL; + break; + } + + ret = synth_event_add_field(cmd, type, name); + if (ret) + break; + } + va_end(args); + + return ret; +} +EXPORT_SYMBOL_GPL(__synth_event_gen_cmd_start); + +/** + * synth_event_gen_cmd_array_start - Start synthetic event command from an array + * @cmd: A pointer to the dynevent_cmd struct representing the new event + * @name: The name of the synthetic event + * @mod: The module creating the event, NULL if not created from a module + * @fields: An array of type/name field descriptions + * @n_fields: The number of field descriptions contained in the fields array + * + * Generate a synthetic event command to be executed by + * synth_event_gen_cmd_end(). This function can be used to generate + * the complete command or only the first part of it; in the latter + * case, synth_event_add_field(), synth_event_add_field_str(), or + * synth_event_add_fields() can be used to add more fields following + * this. + * + * The event fields that will be defined for the event should be + * passed in as an array of struct synth_field_desc, and the number of + * elements in the array passed in as n_fields. Field ordering will + * retain the ordering given in the fields array. 
+ * + * See synth_field_size() for available types. If field_name contains + * [n] the field is considered to be an array. + * + * Return: 0 if successful, error otherwise. + */ +int synth_event_gen_cmd_array_start(struct dynevent_cmd *cmd, const char *name, + struct module *mod, + struct synth_field_desc *fields, + unsigned int n_fields) +{ + struct dynevent_arg arg; + unsigned int i; + int ret = 0; + + cmd->event_name = name; + cmd->private_data = mod; + + if (cmd->type != DYNEVENT_TYPE_SYNTH) + return -EINVAL; + + if (n_fields > SYNTH_FIELDS_MAX) + return -EINVAL; + + dynevent_arg_init(&arg, 0); + arg.str = name; + ret = dynevent_arg_add(cmd, &arg, NULL); + if (ret) + return ret; + + for (i = 0; i < n_fields; i++) { + if (fields[i].type == NULL || fields[i].name == NULL) + return -EINVAL; + + ret = synth_event_add_field(cmd, fields[i].type, fields[i].name); + if (ret) + break; + } + + return ret; +} +EXPORT_SYMBOL_GPL(synth_event_gen_cmd_array_start); + +static int __create_synth_event(const char *name, const char *raw_fields) +{ + char **argv, *field_str, *tmp_fields, *saved_fields = NULL; + struct synth_field *field, *fields[SYNTH_FIELDS_MAX]; + int consumed, cmd_version = 1, n_fields_this_loop; + int i, argc, n_fields = 0, ret = 0; + struct synth_event *event = NULL; + + /* + * Argument syntax: + * - Add synthetic event: <event_name> field[;field] ... + * - Remove synthetic event: !<event_name> field[;field] ... + * where 'field' = type field_name + */ + + if (name[0] == '\0') { + synth_err(SYNTH_ERR_INVALID_CMD, 0); + return -EINVAL; + } + + if (!is_good_name(name)) { + synth_err(SYNTH_ERR_BAD_NAME, errpos(name)); + return -EINVAL; + } + + mutex_lock(&event_mutex); + + event = find_synth_event(name); + if (event) { + synth_err(SYNTH_ERR_EVENT_EXISTS, errpos(name)); + ret = -EEXIST; + goto err; + } + + tmp_fields = saved_fields = kstrdup(raw_fields, GFP_KERNEL); + if (!tmp_fields) { + ret = -ENOMEM; + goto err; + } + + while ((field_str = strsep(&tmp_fields, ";")) != NULL) { + argv = argv_split(GFP_KERNEL, field_str, &argc); + if (!argv) { + ret = -ENOMEM; + goto err; + } + + if (!argc) { + argv_free(argv); + continue; + } + + n_fields_this_loop = 0; + consumed = 0; + while (argc > consumed) { + int field_version; + + field = parse_synth_field(argc - consumed, + argv + consumed, &consumed, + &field_version); + if (IS_ERR(field)) { + ret = PTR_ERR(field); + goto err_free_arg; + } + + /* + * Track the highest version of any field we + * found in the command. + */ + if (field_version > cmd_version) + cmd_version = field_version; + + /* + * Now sort out what is and isn't valid for + * each supported version. + * + * If we see more than 1 field per loop, it + * means we have multiple fields between + * semicolons, and that's something we no + * longer support in a version 2 or greater + * command. 
+ */
+			if (cmd_version > 1 && n_fields_this_loop >= 1) {
+				synth_err(SYNTH_ERR_INVALID_CMD, errpos(field_str));
+				ret = -EINVAL;
+				goto err_free_arg;
+			}
+
+			if (n_fields == SYNTH_FIELDS_MAX) {
+				synth_err(SYNTH_ERR_TOO_MANY_FIELDS, 0);
+				ret = -EINVAL;
+				goto err_free_arg;
+			}
+			fields[n_fields++] = field;
+
+			n_fields_this_loop++;
+		}
+		argv_free(argv);
+
+		if (consumed < argc) {
+			synth_err(SYNTH_ERR_INVALID_CMD, 0);
+			ret = -EINVAL;
+			goto err;
+		}
+
+	}
+
+	if (n_fields == 0) {
+		synth_err(SYNTH_ERR_INVALID_CMD, 0);
+		ret = -EINVAL;
+		goto err;
+	}
+
+	event = alloc_synth_event(name, n_fields, fields);
+	if (IS_ERR(event)) {
+		ret = PTR_ERR(event);
+		event = NULL;
+		goto err;
+	}
+	ret = register_synth_event(event);
+	if (!ret)
+		dyn_event_add(&event->devent, &event->call);
+	else
+		free_synth_event(event);
+ out:
+	mutex_unlock(&event_mutex);
+
+	kfree(saved_fields);
+
+	return ret;
+ err_free_arg:
+	argv_free(argv);
+ err:
+	for (i = 0; i < n_fields; i++)
+		free_synth_field(fields[i]);
+
+	goto out;
+}
+
+/**
+ * synth_event_create - Create a new synthetic event
+ * @name: The name of the new synthetic event
+ * @fields: An array of type/name field descriptions
+ * @n_fields: The number of field descriptions contained in the fields array
+ * @mod: The module creating the event, NULL if not created from a module
+ *
+ * Create a new synthetic event with the given name under the
+ * trace/events/synthetic/ directory. The event fields that will be
+ * defined for the event should be passed in as an array of struct
+ * synth_field_desc, and the number of elements in the array passed in
+ * as n_fields. Field ordering will retain the ordering given in the
+ * fields array.
+ *
+ * If the new synthetic event is being created from a module, the mod
+ * param must be non-NULL. This will ensure that the trace buffer
+ * won't contain unreadable events.
+ *
+ * The new synth event should be deleted using the synth_event_delete()
+ * function. The new synthetic event can be generated from modules or
+ * other kernel code using synth_event_trace() and related functions.
+ *
+ * Return: 0 if successful, error otherwise.
+ */
+int synth_event_create(const char *name, struct synth_field_desc *fields,
+		       unsigned int n_fields, struct module *mod)
+{
+	struct dynevent_cmd cmd;
+	char *buf;
+	int ret;
+
+	buf = kzalloc(MAX_DYNEVENT_CMD_LEN, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	synth_event_cmd_init(&cmd, buf, MAX_DYNEVENT_CMD_LEN);
+
+	ret = synth_event_gen_cmd_array_start(&cmd, name, mod,
+					      fields, n_fields);
+	if (ret)
+		goto out;
+
+	ret = synth_event_gen_cmd_end(&cmd);
+ out:
+	kfree(buf);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(synth_event_create);
+
+static int destroy_synth_event(struct synth_event *se)
+{
+	int ret;
+
+	if (se->ref)
+		return -EBUSY;
+
+	if (trace_event_dyn_busy(&se->call))
+		return -EBUSY;
+
+	ret = unregister_synth_event(se);
+	if (!ret) {
+		dyn_event_remove(&se->devent);
+		free_synth_event(se);
+	}
+
+	return ret;
+}
+
+/**
+ * synth_event_delete - Delete a synthetic event
+ * @event_name: The name of the synthetic event to delete
+ *
+ * Delete a synthetic event that was created with synth_event_create().
+ *
+ * Return: 0 if successful, error otherwise.
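+ *
+ * For illustration only (the event and field names are made up for
+ * this example, and error handling is omitted), a module might pair
+ * synth_event_create() and synth_event_delete() like so:
+ *
+ *	static struct synth_field_desc fields[] = {
+ *		{ .type = "u64",	.name = "lat" },
+ *		{ .type = "pid_t",	.name = "pid" },
+ *	};
+ *
+ *	ret = synth_event_create("test_latency", fields,
+ *				 ARRAY_SIZE(fields), THIS_MODULE);
+ *	...
+ *	ret = synth_event_delete("test_latency");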
+ */ +int synth_event_delete(const char *event_name) +{ + struct synth_event *se = NULL; + struct module *mod = NULL; + int ret = -ENOENT; + + mutex_lock(&event_mutex); + se = find_synth_event(event_name); + if (se) { + mod = se->mod; + ret = destroy_synth_event(se); + } + mutex_unlock(&event_mutex); + + if (mod) { + /* + * It is safest to reset the ring buffer if the module + * being unloaded registered any events that were + * used. The only worry is if a new module gets + * loaded, and takes on the same id as the events of + * this module. When printing out the buffer, traced + * events left over from this module may be passed to + * the new module events and unexpected results may + * occur. + */ + tracing_reset_all_online_cpus(); + } + + return ret; +} +EXPORT_SYMBOL_GPL(synth_event_delete); + +static int check_command(const char *raw_command) +{ + char **argv = NULL, *cmd, *saved_cmd, *name_and_field; + int argc, ret = 0; + + cmd = saved_cmd = kstrdup(raw_command, GFP_KERNEL); + if (!cmd) + return -ENOMEM; + + name_and_field = strsep(&cmd, ";"); + if (!name_and_field) { + ret = -EINVAL; + goto free; + } + + if (name_and_field[0] == '!') + goto free; + + argv = argv_split(GFP_KERNEL, name_and_field, &argc); + if (!argv) { + ret = -ENOMEM; + goto free; + } + argv_free(argv); + + if (argc < 3) + ret = -EINVAL; +free: + kfree(saved_cmd); + + return ret; +} + +static int create_or_delete_synth_event(const char *raw_command) +{ + char *name = NULL, *fields, *p; + int ret = 0; + + raw_command = skip_spaces(raw_command); + if (raw_command[0] == '\0') + return ret; + + last_cmd_set(raw_command); + + ret = check_command(raw_command); + if (ret) { + synth_err(SYNTH_ERR_INVALID_CMD, 0); + return ret; + } + + p = strpbrk(raw_command, " \t"); + if (!p && raw_command[0] != '!') { + synth_err(SYNTH_ERR_INVALID_CMD, 0); + ret = -EINVAL; + goto free; + } + + name = kmemdup_nul(raw_command, p ? p - raw_command : strlen(raw_command), GFP_KERNEL); + if (!name) + return -ENOMEM; + + if (name[0] == '!') { + ret = synth_event_delete(name + 1); + goto free; + } + + fields = skip_spaces(p); + + ret = __create_synth_event(name, fields); +free: + kfree(name); + + return ret; +} + +static int synth_event_run_command(struct dynevent_cmd *cmd) +{ + struct synth_event *se; + int ret; + + ret = create_or_delete_synth_event(cmd->seq.buffer); + if (ret) + return ret; + + se = find_synth_event(cmd->event_name); + if (WARN_ON(!se)) + return -ENOENT; + + se->mod = cmd->private_data; + + return ret; +} + +/** + * synth_event_cmd_init - Initialize a synthetic event command object + * @cmd: A pointer to the dynevent_cmd struct representing the new event + * @buf: A pointer to the buffer used to build the command + * @maxlen: The length of the buffer passed in @buf + * + * Initialize a synthetic event command object. Use this before + * calling any of the other dyenvent_cmd functions. + */ +void synth_event_cmd_init(struct dynevent_cmd *cmd, char *buf, int maxlen) +{ + dynevent_cmd_init(cmd, buf, maxlen, DYNEVENT_TYPE_SYNTH, + synth_event_run_command); +} +EXPORT_SYMBOL_GPL(synth_event_cmd_init); + +static inline int +__synth_event_trace_init(struct trace_event_file *file, + struct synth_event_trace_state *trace_state) +{ + int ret = 0; + + memset(trace_state, '\0', sizeof(*trace_state)); + + /* + * Normal event tracing doesn't get called at all unless the + * ENABLED bit is set (which attaches the probe thus allowing + * this code to be called, etc). 
Because this is called + * directly by the user, we don't have that but we still need + * to honor not logging when disabled. For the iterated + * trace case, we save the enabled state upon start and just + * ignore the following data calls. + */ + if (!(file->flags & EVENT_FILE_FL_ENABLED) || + trace_trigger_soft_disabled(file)) { + trace_state->disabled = true; + ret = -ENOENT; + goto out; + } + + trace_state->event = file->event_call->data; +out: + return ret; +} + +static inline int +__synth_event_trace_start(struct trace_event_file *file, + struct synth_event_trace_state *trace_state, + int dynamic_fields_size) +{ + int entry_size, fields_size = 0; + int ret = 0; + + fields_size = trace_state->event->n_u64 * sizeof(u64); + fields_size += dynamic_fields_size; + + /* + * Avoid ring buffer recursion detection, as this event + * is being performed within another event. + */ + trace_state->buffer = file->tr->array_buffer.buffer; + ring_buffer_nest_start(trace_state->buffer); + + entry_size = sizeof(*trace_state->entry) + fields_size; + trace_state->entry = trace_event_buffer_reserve(&trace_state->fbuffer, + file, + entry_size); + if (!trace_state->entry) { + ring_buffer_nest_end(trace_state->buffer); + ret = -EINVAL; + } + + return ret; +} + +static inline void +__synth_event_trace_end(struct synth_event_trace_state *trace_state) +{ + trace_event_buffer_commit(&trace_state->fbuffer); + + ring_buffer_nest_end(trace_state->buffer); +} + +/** + * synth_event_trace - Trace a synthetic event + * @file: The trace_event_file representing the synthetic event + * @n_vals: The number of values in vals + * @args: Variable number of args containing the event values + * + * Trace a synthetic event using the values passed in the variable + * argument list. + * + * The argument list should be a list 'n_vals' u64 values. The number + * of vals must match the number of field in the synthetic event, and + * must be in the same order as the synthetic event fields. + * + * All vals should be cast to u64, and string vals are just pointers + * to strings, cast to u64. Strings will be copied into space + * reserved in the event for the string, using these pointers. + * + * Return: 0 on success, err otherwise. + */ +int synth_event_trace(struct trace_event_file *file, unsigned int n_vals, ...) 
+{ + unsigned int i, n_u64, len, data_size = 0; + struct synth_event_trace_state state; + va_list args; + int ret; + + ret = __synth_event_trace_init(file, &state); + if (ret) { + if (ret == -ENOENT) + ret = 0; /* just disabled, not really an error */ + return ret; + } + + if (state.event->n_dynamic_fields) { + va_start(args, n_vals); + + for (i = 0; i < state.event->n_fields; i++) { + u64 val = va_arg(args, u64); + + if (state.event->fields[i]->is_string && + state.event->fields[i]->is_dynamic) { + char *str_val = (char *)(long)val; + + data_size += strlen(str_val) + 1; + } + } + + va_end(args); + } + + ret = __synth_event_trace_start(file, &state, data_size); + if (ret) + return ret; + + if (n_vals != state.event->n_fields) { + ret = -EINVAL; + goto out; + } + + data_size = 0; + + va_start(args, n_vals); + for (i = 0, n_u64 = 0; i < state.event->n_fields; i++) { + u64 val; + + val = va_arg(args, u64); + + if (state.event->fields[i]->is_string) { + char *str_val = (char *)(long)val; + + len = trace_string(state.entry, state.event, str_val, + state.event->fields[i]->is_dynamic, + data_size, &n_u64); + data_size += len; /* only dynamic string increments */ + } else { + struct synth_field *field = state.event->fields[i]; + + switch (field->size) { + case 1: + state.entry->fields[n_u64].as_u8 = (u8)val; + break; + + case 2: + state.entry->fields[n_u64].as_u16 = (u16)val; + break; + + case 4: + state.entry->fields[n_u64].as_u32 = (u32)val; + break; + + default: + state.entry->fields[n_u64].as_u64 = val; + break; + } + n_u64++; + } + } + va_end(args); +out: + __synth_event_trace_end(&state); + + return ret; +} +EXPORT_SYMBOL_GPL(synth_event_trace); + +/** + * synth_event_trace_array - Trace a synthetic event from an array + * @file: The trace_event_file representing the synthetic event + * @vals: Array of values + * @n_vals: The number of values in vals + * + * Trace a synthetic event using the values passed in as 'vals'. + * + * The 'vals' array is just an array of 'n_vals' u64. The number of + * vals must match the number of field in the synthetic event, and + * must be in the same order as the synthetic event fields. + * + * All vals should be cast to u64, and string vals are just pointers + * to strings, cast to u64. Strings will be copied into space + * reserved in the event for the string, using these pointers. + * + * Return: 0 on success, err otherwise. 
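+ *
+ * A rough usage sketch (assuming a synthetic event with a u64 'lat'
+ * field and a pid_t 'pid' field, and a 'file' obtained e.g. via
+ * trace_get_event_file(); names are invented for the example):
+ *
+ *	u64 vals[2];
+ *
+ *	vals[0] = 500;
+ *	vals[1] = (u64)current->pid;
+ *	ret = synth_event_trace_array(file, vals, ARRAY_SIZE(vals));
+ *
+ * The equivalent varargs form would be
+ * synth_event_trace(file, 2, (u64)500, (u64)current->pid).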
+ */ +int synth_event_trace_array(struct trace_event_file *file, u64 *vals, + unsigned int n_vals) +{ + unsigned int i, n_u64, field_pos, len, data_size = 0; + struct synth_event_trace_state state; + char *str_val; + int ret; + + ret = __synth_event_trace_init(file, &state); + if (ret) { + if (ret == -ENOENT) + ret = 0; /* just disabled, not really an error */ + return ret; + } + + if (state.event->n_dynamic_fields) { + for (i = 0; i < state.event->n_dynamic_fields; i++) { + field_pos = state.event->dynamic_fields[i]->field_pos; + str_val = (char *)(long)vals[field_pos]; + len = strlen(str_val) + 1; + data_size += len; + } + } + + ret = __synth_event_trace_start(file, &state, data_size); + if (ret) + return ret; + + if (n_vals != state.event->n_fields) { + ret = -EINVAL; + goto out; + } + + data_size = 0; + + for (i = 0, n_u64 = 0; i < state.event->n_fields; i++) { + if (state.event->fields[i]->is_string) { + char *str_val = (char *)(long)vals[i]; + + len = trace_string(state.entry, state.event, str_val, + state.event->fields[i]->is_dynamic, + data_size, &n_u64); + data_size += len; /* only dynamic string increments */ + } else { + struct synth_field *field = state.event->fields[i]; + u64 val = vals[i]; + + switch (field->size) { + case 1: + state.entry->fields[n_u64].as_u8 = (u8)val; + break; + + case 2: + state.entry->fields[n_u64].as_u16 = (u16)val; + break; + + case 4: + state.entry->fields[n_u64].as_u32 = (u32)val; + break; + + default: + state.entry->fields[n_u64].as_u64 = val; + break; + } + n_u64++; + } + } +out: + __synth_event_trace_end(&state); + + return ret; +} +EXPORT_SYMBOL_GPL(synth_event_trace_array); + +/** + * synth_event_trace_start - Start piecewise synthetic event trace + * @file: The trace_event_file representing the synthetic event + * @trace_state: A pointer to object tracking the piecewise trace state + * + * Start the trace of a synthetic event field-by-field rather than all + * at once. + * + * This function 'opens' an event trace, which means space is reserved + * for the event in the trace buffer, after which the event's + * individual field values can be set through either + * synth_event_add_next_val() or synth_event_add_val(). + * + * A pointer to a trace_state object is passed in, which will keep + * track of the current event trace state until the event trace is + * closed (and the event finally traced) using + * synth_event_trace_end(). + * + * Note that synth_event_trace_end() must be called after all values + * have been added for each event trace, regardless of whether adding + * all field values succeeded or not. + * + * Note also that for a given event trace, all fields must be added + * using either synth_event_add_next_val() or synth_event_add_val() + * but not both together or interleaved. + * + * Return: 0 on success, err otherwise. 
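+ *
+ * A rough sketch of the piecewise flow described above (field values
+ * are invented for the example, and error handling is abbreviated):
+ *
+ *	struct synth_event_trace_state state;
+ *
+ *	ret = synth_event_trace_start(file, &state);
+ *	if (ret)
+ *		return ret;
+ *
+ *	ret = synth_event_add_next_val(500, &state);
+ *	if (!ret)
+ *		ret = synth_event_add_next_val((u64)current->pid, &state);
+ *
+ *	synth_event_trace_end(&state);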
+ */ +int synth_event_trace_start(struct trace_event_file *file, + struct synth_event_trace_state *trace_state) +{ + int ret; + + if (!trace_state) + return -EINVAL; + + ret = __synth_event_trace_init(file, trace_state); + if (ret) { + if (ret == -ENOENT) + ret = 0; /* just disabled, not really an error */ + return ret; + } + + if (trace_state->event->n_dynamic_fields) + return -ENOTSUPP; + + ret = __synth_event_trace_start(file, trace_state, 0); + + return ret; +} +EXPORT_SYMBOL_GPL(synth_event_trace_start); + +static int __synth_event_add_val(const char *field_name, u64 val, + struct synth_event_trace_state *trace_state) +{ + struct synth_field *field = NULL; + struct synth_trace_event *entry; + struct synth_event *event; + int i, ret = 0; + + if (!trace_state) { + ret = -EINVAL; + goto out; + } + + /* can't mix add_next_synth_val() with add_synth_val() */ + if (field_name) { + if (trace_state->add_next) { + ret = -EINVAL; + goto out; + } + trace_state->add_name = true; + } else { + if (trace_state->add_name) { + ret = -EINVAL; + goto out; + } + trace_state->add_next = true; + } + + if (trace_state->disabled) + goto out; + + event = trace_state->event; + if (trace_state->add_name) { + for (i = 0; i < event->n_fields; i++) { + field = event->fields[i]; + if (strcmp(field->name, field_name) == 0) + break; + } + if (!field) { + ret = -EINVAL; + goto out; + } + } else { + if (trace_state->cur_field >= event->n_fields) { + ret = -EINVAL; + goto out; + } + field = event->fields[trace_state->cur_field++]; + } + + entry = trace_state->entry; + if (field->is_string) { + char *str_val = (char *)(long)val; + char *str_field; + + if (field->is_dynamic) { /* add_val can't do dynamic strings */ + ret = -EINVAL; + goto out; + } + + if (!str_val) { + ret = -EINVAL; + goto out; + } + + str_field = (char *)&entry->fields[field->offset]; + strscpy(str_field, str_val, STR_VAR_LEN_MAX); + } else { + switch (field->size) { + case 1: + trace_state->entry->fields[field->offset].as_u8 = (u8)val; + break; + + case 2: + trace_state->entry->fields[field->offset].as_u16 = (u16)val; + break; + + case 4: + trace_state->entry->fields[field->offset].as_u32 = (u32)val; + break; + + default: + trace_state->entry->fields[field->offset].as_u64 = val; + break; + } + } + out: + return ret; +} + +/** + * synth_event_add_next_val - Add the next field's value to an open synth trace + * @val: The value to set the next field to + * @trace_state: A pointer to object tracking the piecewise trace state + * + * Set the value of the next field in an event that's been opened by + * synth_event_trace_start(). + * + * The val param should be the value cast to u64. If the value points + * to a string, the val param should be a char * cast to u64. + * + * This function assumes all the fields in an event are to be set one + * after another - successive calls to this function are made, one for + * each field, in the order of the fields in the event, until all + * fields have been set. If you'd rather set each field individually + * without regard to ordering, synth_event_add_val() can be used + * instead. + * + * Note however that synth_event_add_next_val() and + * synth_event_add_val() can't be intermixed for a given event trace - + * one or the other but not both can be used at the same time. + * + * Note also that synth_event_trace_end() must be called after all + * values have been added for each event trace, regardless of whether + * adding all field values succeeded or not. + * + * Return: 0 on success, err otherwise. 
+ */
+int synth_event_add_next_val(u64 val,
+			     struct synth_event_trace_state *trace_state)
+{
+	return __synth_event_add_val(NULL, val, trace_state);
+}
+EXPORT_SYMBOL_GPL(synth_event_add_next_val);
+
+/**
+ * synth_event_add_val - Add a named field's value to an open synth trace
+ * @field_name: The name of the synthetic event field to set
+ * @val: The value to set the named field to
+ * @trace_state: A pointer to object tracking the piecewise trace state
+ *
+ * Set the value of the named field in an event that's been opened by
+ * synth_event_trace_start().
+ *
+ * The val param should be the value cast to u64. If the value points
+ * to a string, the val param should be a char * cast to u64.
+ *
+ * This function looks up the field name, and if found, sets the field
+ * to the specified value. This lookup makes this function more
+ * expensive than synth_event_add_next_val(), so use that or the
+ * non-piecewise synth_event_trace() instead if efficiency is more
+ * important.
+ *
+ * Note however that synth_event_add_next_val() and
+ * synth_event_add_val() can't be intermixed for a given event trace -
+ * one or the other but not both can be used at the same time.
+ *
+ * Note also that synth_event_trace_end() must be called after all
+ * values have been added for each event trace, regardless of whether
+ * adding all field values succeeded or not.
+ *
+ * Return: 0 on success, err otherwise.
+ */
+int synth_event_add_val(const char *field_name, u64 val,
+			struct synth_event_trace_state *trace_state)
+{
+	return __synth_event_add_val(field_name, val, trace_state);
+}
+EXPORT_SYMBOL_GPL(synth_event_add_val);
+
+/**
+ * synth_event_trace_end - End piecewise synthetic event trace
+ * @trace_state: A pointer to object tracking the piecewise trace state
+ *
+ * End the trace of a synthetic event opened by
+ * synth_event_trace_start().
+ *
+ * This function 'closes' an event trace, which basically means that
+ * it commits the reserved event and cleans up other loose ends.
+ *
+ * A pointer to a trace_state object is passed in, which will keep
+ * track of the current event trace state opened with
+ * synth_event_trace_start().
+ *
+ * Note that this function must be called after all values have been
+ * added for each event trace, regardless of whether adding all field
+ * values succeeded or not.
+ *
+ * Return: 0 on success, err otherwise.
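+ *
+ * As a variation on the piecewise sketch shown for
+ * synth_event_trace_start(), the same fields could be set by name
+ * rather than by position, e.g.:
+ *
+ *	ret = synth_event_add_val("lat", 500, &state);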
+ */ +int synth_event_trace_end(struct synth_event_trace_state *trace_state) +{ + if (!trace_state) + return -EINVAL; + + __synth_event_trace_end(trace_state); + + return 0; +} +EXPORT_SYMBOL_GPL(synth_event_trace_end); + +static int create_synth_event(const char *raw_command) +{ + char *fields, *p; + const char *name; + int len, ret = 0; + + raw_command = skip_spaces(raw_command); + if (raw_command[0] == '\0') + return ret; + + last_cmd_set(raw_command); + + name = raw_command; + + /* Don't try to process if not our system */ + if (name[0] != 's' || name[1] != ':') + return -ECANCELED; + name += 2; + + p = strpbrk(raw_command, " \t"); + if (!p) { + synth_err(SYNTH_ERR_INVALID_CMD, 0); + return -EINVAL; + } + + fields = skip_spaces(p); + + /* This interface accepts group name prefix */ + if (strchr(name, '/')) { + len = str_has_prefix(name, SYNTH_SYSTEM "/"); + if (len == 0) { + synth_err(SYNTH_ERR_INVALID_DYN_CMD, 0); + return -EINVAL; + } + name += len; + } + + len = name - raw_command; + + ret = check_command(raw_command + len); + if (ret) { + synth_err(SYNTH_ERR_INVALID_CMD, 0); + return ret; + } + + name = kmemdup_nul(raw_command + len, p - raw_command - len, GFP_KERNEL); + if (!name) + return -ENOMEM; + + ret = __create_synth_event(name, fields); + + kfree(name); + + return ret; +} + +static int synth_event_release(struct dyn_event *ev) +{ + struct synth_event *event = to_synth_event(ev); + int ret; + + if (event->ref) + return -EBUSY; + + if (trace_event_dyn_busy(&event->call)) + return -EBUSY; + + ret = unregister_synth_event(event); + if (ret) + return ret; + + dyn_event_remove(ev); + free_synth_event(event); + return 0; +} + +static int __synth_event_show(struct seq_file *m, struct synth_event *event) +{ + struct synth_field *field; + unsigned int i; + char *type, *t; + + seq_printf(m, "%s\t", event->name); + + for (i = 0; i < event->n_fields; i++) { + field = event->fields[i]; + + type = field->type; + t = strstr(type, "__data_loc"); + if (t) { /* __data_loc belongs in format but not event desc */ + t += sizeof("__data_loc"); + type = t; + } + + /* parameter values */ + seq_printf(m, "%s %s%s", type, field->name, + i == event->n_fields - 1 ? 
"" : "; "); + } + + seq_putc(m, '\n'); + + return 0; +} + +static int synth_event_show(struct seq_file *m, struct dyn_event *ev) +{ + struct synth_event *event = to_synth_event(ev); + + seq_printf(m, "s:%s/", event->class.system); + + return __synth_event_show(m, event); +} + +static int synth_events_seq_show(struct seq_file *m, void *v) +{ + struct dyn_event *ev = v; + + if (!is_synth_event(ev)) + return 0; + + return __synth_event_show(m, to_synth_event(ev)); +} + +static const struct seq_operations synth_events_seq_op = { + .start = dyn_event_seq_start, + .next = dyn_event_seq_next, + .stop = dyn_event_seq_stop, + .show = synth_events_seq_show, +}; + +static int synth_events_open(struct inode *inode, struct file *file) +{ + int ret; + + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + + if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { + ret = dyn_events_release_all(&synth_event_ops); + if (ret < 0) + return ret; + } + + return seq_open(file, &synth_events_seq_op); +} + +static ssize_t synth_events_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *ppos) +{ + return trace_parse_run_command(file, buffer, count, ppos, + create_or_delete_synth_event); +} + +static const struct file_operations synth_events_fops = { + .open = synth_events_open, + .write = synth_events_write, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +/* + * Register dynevent at core_initcall. This allows kernel to setup kprobe + * events in postcore_initcall without tracefs. + */ +static __init int trace_events_synth_init_early(void) +{ + int err = 0; + + err = dyn_event_register(&synth_event_ops); + if (err) + pr_warn("Could not register synth_event_ops\n"); + + return err; +} +core_initcall(trace_events_synth_init_early); + +static __init int trace_events_synth_init(void) +{ + struct dentry *entry = NULL; + int err = 0; + err = tracing_init_dentry(); + if (err) + goto err; + + entry = tracefs_create_file("synthetic_events", TRACE_MODE_WRITE, + NULL, NULL, &synth_events_fops); + if (!entry) { + err = -ENODEV; + goto err; + } + + return err; + err: + pr_warn("Could not create tracefs 'synthetic_events' entry\n"); + + return err; +} + +fs_initcall(trace_events_synth_init); diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c new file mode 100644 index 0000000000..46439e3bce --- /dev/null +++ b/kernel/trace/trace_events_trigger.c @@ -0,0 +1,1997 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * trace_events_trigger - trace event triggers + * + * Copyright (C) 2013 Tom Zanussi <tom.zanussi@linux.intel.com> + */ + +#include <linux/security.h> +#include <linux/module.h> +#include <linux/ctype.h> +#include <linux/mutex.h> +#include <linux/slab.h> +#include <linux/rculist.h> + +#include "trace.h" + +static LIST_HEAD(trigger_commands); +static DEFINE_MUTEX(trigger_cmd_mutex); + +void trigger_data_free(struct event_trigger_data *data) +{ + if (data->cmd_ops->set_filter) + data->cmd_ops->set_filter(NULL, data, NULL); + + /* make sure current triggers exit before free */ + tracepoint_synchronize_unregister(); + + kfree(data); +} + +/** + * event_triggers_call - Call triggers associated with a trace event + * @file: The trace_event_file associated with the event + * @buffer: The ring buffer that the event is being written to + * @rec: The trace entry for the event, NULL for unconditional invocation + * @event: The event meta data in the ring buffer + * + * For each trigger associated with an event, invoke 
the trigger
+ * function registered with the associated trigger command. If rec is
+ * non-NULL, it means that the trigger requires further processing and
+ * shouldn't be unconditionally invoked. If rec is non-NULL and the
+ * trigger has a filter associated with it, rec will be checked against
+ * the filter, and if the record matches, the trigger will be invoked.
+ * If the trigger is a 'post_trigger', meaning it shouldn't be invoked
+ * in any case until the current event is written, the trigger
+ * function isn't invoked but the bit associated with the deferred
+ * trigger is set in the return value.
+ *
+ * Called from tracepoint handlers (with rcu_read_lock_sched() held).
+ *
+ * Return: an enum event_trigger_type value containing a set bit for
+ * any trigger that should be deferred, ETT_NONE if nothing to defer.
+ */
+enum event_trigger_type
+event_triggers_call(struct trace_event_file *file,
+		    struct trace_buffer *buffer, void *rec,
+		    struct ring_buffer_event *event)
+{
+	struct event_trigger_data *data;
+	enum event_trigger_type tt = ETT_NONE;
+	struct event_filter *filter;
+
+	if (list_empty(&file->triggers))
+		return tt;
+
+	list_for_each_entry_rcu(data, &file->triggers, list) {
+		if (data->paused)
+			continue;
+		if (!rec) {
+			data->ops->trigger(data, buffer, rec, event);
+			continue;
+		}
+		filter = rcu_dereference_sched(data->filter);
+		if (filter && !filter_match_preds(filter, rec))
+			continue;
+		if (event_command_post_trigger(data->cmd_ops)) {
+			tt |= data->cmd_ops->trigger_type;
+			continue;
+		}
+		data->ops->trigger(data, buffer, rec, event);
+	}
+	return tt;
+}
+EXPORT_SYMBOL_GPL(event_triggers_call);
+
+bool __trace_trigger_soft_disabled(struct trace_event_file *file)
+{
+	unsigned long eflags = file->flags;
+
+	if (eflags & EVENT_FILE_FL_TRIGGER_MODE)
+		event_triggers_call(file, NULL, NULL, NULL);
+	if (eflags & EVENT_FILE_FL_SOFT_DISABLED)
+		return true;
+	if (eflags & EVENT_FILE_FL_PID_FILTER)
+		return trace_event_ignore_this_pid(file);
+	return false;
+}
+EXPORT_SYMBOL_GPL(__trace_trigger_soft_disabled);
+
+/**
+ * event_triggers_post_call - Call 'post_triggers' for a trace event
+ * @file: The trace_event_file associated with the event
+ * @tt: enum event_trigger_type containing a set bit for each trigger to invoke
+ *
+ * For each trigger associated with an event, invoke the trigger
+ * function registered with the associated trigger command, if the
+ * corresponding bit is set in the tt enum passed into this function.
+ * See event_triggers_call() for details on how those bits are set.
+ *
+ * Called from tracepoint handlers (with rcu_read_lock_sched() held).
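+ *
+ * As a simplified sketch of how the two calls pair up in an
+ * event-writing path (illustrative only; the real logic lives in the
+ * trace event helpers):
+ *
+ *	if (file->flags & EVENT_FILE_FL_TRIGGER_COND)
+ *		tt = event_triggers_call(file, buffer, entry, event);
+ *
+ *	(... the event is then committed to the ring buffer ...)
+ *
+ *	if (tt)
+ *		event_triggers_post_call(file, tt);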
+ */ +void +event_triggers_post_call(struct trace_event_file *file, + enum event_trigger_type tt) +{ + struct event_trigger_data *data; + + list_for_each_entry_rcu(data, &file->triggers, list) { + if (data->paused) + continue; + if (data->cmd_ops->trigger_type & tt) + data->ops->trigger(data, NULL, NULL, NULL); + } +} +EXPORT_SYMBOL_GPL(event_triggers_post_call); + +#define SHOW_AVAILABLE_TRIGGERS (void *)(1UL) + +static void *trigger_next(struct seq_file *m, void *t, loff_t *pos) +{ + struct trace_event_file *event_file = event_file_data(m->private); + + if (t == SHOW_AVAILABLE_TRIGGERS) { + (*pos)++; + return NULL; + } + return seq_list_next(t, &event_file->triggers, pos); +} + +static bool check_user_trigger(struct trace_event_file *file) +{ + struct event_trigger_data *data; + + list_for_each_entry_rcu(data, &file->triggers, list, + lockdep_is_held(&event_mutex)) { + if (data->flags & EVENT_TRIGGER_FL_PROBE) + continue; + return true; + } + return false; +} + +static void *trigger_start(struct seq_file *m, loff_t *pos) +{ + struct trace_event_file *event_file; + + /* ->stop() is called even if ->start() fails */ + mutex_lock(&event_mutex); + event_file = event_file_data(m->private); + if (unlikely(!event_file)) + return ERR_PTR(-ENODEV); + + if (list_empty(&event_file->triggers) || !check_user_trigger(event_file)) + return *pos == 0 ? SHOW_AVAILABLE_TRIGGERS : NULL; + + return seq_list_start(&event_file->triggers, *pos); +} + +static void trigger_stop(struct seq_file *m, void *t) +{ + mutex_unlock(&event_mutex); +} + +static int trigger_show(struct seq_file *m, void *v) +{ + struct event_trigger_data *data; + struct event_command *p; + + if (v == SHOW_AVAILABLE_TRIGGERS) { + seq_puts(m, "# Available triggers:\n"); + seq_putc(m, '#'); + mutex_lock(&trigger_cmd_mutex); + list_for_each_entry_reverse(p, &trigger_commands, list) + seq_printf(m, " %s", p->name); + seq_putc(m, '\n'); + mutex_unlock(&trigger_cmd_mutex); + return 0; + } + + data = list_entry(v, struct event_trigger_data, list); + data->ops->print(m, data); + + return 0; +} + +static const struct seq_operations event_triggers_seq_ops = { + .start = trigger_start, + .next = trigger_next, + .stop = trigger_stop, + .show = trigger_show, +}; + +static int event_trigger_regex_open(struct inode *inode, struct file *file) +{ + int ret; + + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + + mutex_lock(&event_mutex); + + if (unlikely(!event_file_data(file))) { + mutex_unlock(&event_mutex); + return -ENODEV; + } + + if ((file->f_mode & FMODE_WRITE) && + (file->f_flags & O_TRUNC)) { + struct trace_event_file *event_file; + struct event_command *p; + + event_file = event_file_data(file); + + list_for_each_entry(p, &trigger_commands, list) { + if (p->unreg_all) + p->unreg_all(event_file); + } + } + + if (file->f_mode & FMODE_READ) { + ret = seq_open(file, &event_triggers_seq_ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = file; + } + } + + mutex_unlock(&event_mutex); + + return ret; +} + +int trigger_process_regex(struct trace_event_file *file, char *buff) +{ + char *command, *next; + struct event_command *p; + int ret = -EINVAL; + + next = buff = skip_spaces(buff); + command = strsep(&next, ": \t"); + if (next) { + next = skip_spaces(next); + if (!*next) + next = NULL; + } + command = (command[0] != '!') ? 
command : command + 1; + + mutex_lock(&trigger_cmd_mutex); + list_for_each_entry(p, &trigger_commands, list) { + if (strcmp(p->name, command) == 0) { + ret = p->parse(p, file, buff, command, next); + goto out_unlock; + } + } + out_unlock: + mutex_unlock(&trigger_cmd_mutex); + + return ret; +} + +static ssize_t event_trigger_regex_write(struct file *file, + const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_event_file *event_file; + ssize_t ret; + char *buf; + + if (!cnt) + return 0; + + if (cnt >= PAGE_SIZE) + return -EINVAL; + + buf = memdup_user_nul(ubuf, cnt); + if (IS_ERR(buf)) + return PTR_ERR(buf); + + strim(buf); + + mutex_lock(&event_mutex); + event_file = event_file_data(file); + if (unlikely(!event_file)) { + mutex_unlock(&event_mutex); + kfree(buf); + return -ENODEV; + } + ret = trigger_process_regex(event_file, buf); + mutex_unlock(&event_mutex); + + kfree(buf); + if (ret < 0) + goto out; + + *ppos += cnt; + ret = cnt; + out: + return ret; +} + +static int event_trigger_regex_release(struct inode *inode, struct file *file) +{ + mutex_lock(&event_mutex); + + if (file->f_mode & FMODE_READ) + seq_release(inode, file); + + mutex_unlock(&event_mutex); + + return 0; +} + +static ssize_t +event_trigger_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return event_trigger_regex_write(filp, ubuf, cnt, ppos); +} + +static int +event_trigger_open(struct inode *inode, struct file *filp) +{ + /* Checks for tracefs lockdown */ + return event_trigger_regex_open(inode, filp); +} + +static int +event_trigger_release(struct inode *inode, struct file *file) +{ + return event_trigger_regex_release(inode, file); +} + +const struct file_operations event_trigger_fops = { + .open = event_trigger_open, + .read = seq_read, + .write = event_trigger_write, + .llseek = tracing_lseek, + .release = event_trigger_release, +}; + +/* + * Currently we only register event commands from __init, so mark this + * __init too. + */ +__init int register_event_command(struct event_command *cmd) +{ + struct event_command *p; + int ret = 0; + + mutex_lock(&trigger_cmd_mutex); + list_for_each_entry(p, &trigger_commands, list) { + if (strcmp(cmd->name, p->name) == 0) { + ret = -EBUSY; + goto out_unlock; + } + } + list_add(&cmd->list, &trigger_commands); + out_unlock: + mutex_unlock(&trigger_cmd_mutex); + + return ret; +} + +/* + * Currently we only unregister event commands from __init, so mark + * this __init too. + */ +__init int unregister_event_command(struct event_command *cmd) +{ + struct event_command *p, *n; + int ret = -ENODEV; + + mutex_lock(&trigger_cmd_mutex); + list_for_each_entry_safe(p, n, &trigger_commands, list) { + if (strcmp(cmd->name, p->name) == 0) { + ret = 0; + list_del_init(&p->list); + goto out_unlock; + } + } + out_unlock: + mutex_unlock(&trigger_cmd_mutex); + + return ret; +} + +/** + * event_trigger_print - Generic event_trigger_ops @print implementation + * @name: The name of the event trigger + * @m: The seq_file being printed to + * @data: Trigger-specific data + * @filter_str: filter_str to print, if present + * + * Common implementation for event triggers to print themselves. + * + * Usually wrapped by a function that simply sets the @name of the + * trigger command and then invokes this. 
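+ *
+ * For instance, a hypothetical 'traceon' trigger's print callback
+ * could be little more than:
+ *
+ *	return event_trigger_print("traceon", m, (void *)data->count,
+ *				   data->filter_str);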
+ * + * Return: 0 on success, errno otherwise + */ +static int +event_trigger_print(const char *name, struct seq_file *m, + void *data, char *filter_str) +{ + long count = (long)data; + + seq_puts(m, name); + + if (count == -1) + seq_puts(m, ":unlimited"); + else + seq_printf(m, ":count=%ld", count); + + if (filter_str) + seq_printf(m, " if %s\n", filter_str); + else + seq_putc(m, '\n'); + + return 0; +} + +/** + * event_trigger_init - Generic event_trigger_ops @init implementation + * @data: Trigger-specific data + * + * Common implementation of event trigger initialization. + * + * Usually used directly as the @init method in event trigger + * implementations. + * + * Return: 0 on success, errno otherwise + */ +int event_trigger_init(struct event_trigger_data *data) +{ + data->ref++; + return 0; +} + +/** + * event_trigger_free - Generic event_trigger_ops @free implementation + * @data: Trigger-specific data + * + * Common implementation of event trigger de-initialization. + * + * Usually used directly as the @free method in event trigger + * implementations. + */ +static void +event_trigger_free(struct event_trigger_data *data) +{ + if (WARN_ON_ONCE(data->ref <= 0)) + return; + + data->ref--; + if (!data->ref) + trigger_data_free(data); +} + +int trace_event_trigger_enable_disable(struct trace_event_file *file, + int trigger_enable) +{ + int ret = 0; + + if (trigger_enable) { + if (atomic_inc_return(&file->tm_ref) > 1) + return ret; + set_bit(EVENT_FILE_FL_TRIGGER_MODE_BIT, &file->flags); + ret = trace_event_enable_disable(file, 1, 1); + } else { + if (atomic_dec_return(&file->tm_ref) > 0) + return ret; + clear_bit(EVENT_FILE_FL_TRIGGER_MODE_BIT, &file->flags); + ret = trace_event_enable_disable(file, 0, 1); + } + + return ret; +} + +/** + * clear_event_triggers - Clear all triggers associated with a trace array + * @tr: The trace array to clear + * + * For each trigger, the triggering event has its tm_ref decremented + * via trace_event_trigger_enable_disable(), and any associated event + * (in the case of enable/disable_event triggers) will have its sm_ref + * decremented via free()->trace_event_enable_disable(). That + * combination effectively reverses the soft-mode/trigger state added + * by trigger registration. + * + * Must be called with event_mutex held. + */ +void +clear_event_triggers(struct trace_array *tr) +{ + struct trace_event_file *file; + + list_for_each_entry(file, &tr->events, list) { + struct event_trigger_data *data, *n; + list_for_each_entry_safe(data, n, &file->triggers, list) { + trace_event_trigger_enable_disable(file, 0); + list_del_rcu(&data->list); + if (data->ops->free) + data->ops->free(data); + } + } +} + +/** + * update_cond_flag - Set or reset the TRIGGER_COND bit + * @file: The trace_event_file associated with the event + * + * If an event has triggers and any of those triggers has a filter or + * a post_trigger, trigger invocation needs to be deferred until after + * the current event has logged its data, and the event should have + * its TRIGGER_COND bit set, otherwise the TRIGGER_COND bit should be + * cleared. 
+ */ +void update_cond_flag(struct trace_event_file *file) +{ + struct event_trigger_data *data; + bool set_cond = false; + + lockdep_assert_held(&event_mutex); + + list_for_each_entry(data, &file->triggers, list) { + if (data->filter || event_command_post_trigger(data->cmd_ops) || + event_command_needs_rec(data->cmd_ops)) { + set_cond = true; + break; + } + } + + if (set_cond) + set_bit(EVENT_FILE_FL_TRIGGER_COND_BIT, &file->flags); + else + clear_bit(EVENT_FILE_FL_TRIGGER_COND_BIT, &file->flags); +} + +/** + * register_trigger - Generic event_command @reg implementation + * @glob: The raw string used to register the trigger + * @data: Trigger-specific data to associate with the trigger + * @file: The trace_event_file associated with the event + * + * Common implementation for event trigger registration. + * + * Usually used directly as the @reg method in event command + * implementations. + * + * Return: 0 on success, errno otherwise + */ +static int register_trigger(char *glob, + struct event_trigger_data *data, + struct trace_event_file *file) +{ + struct event_trigger_data *test; + int ret = 0; + + lockdep_assert_held(&event_mutex); + + list_for_each_entry(test, &file->triggers, list) { + if (test->cmd_ops->trigger_type == data->cmd_ops->trigger_type) { + ret = -EEXIST; + goto out; + } + } + + if (data->ops->init) { + ret = data->ops->init(data); + if (ret < 0) + goto out; + } + + list_add_rcu(&data->list, &file->triggers); + + update_cond_flag(file); + ret = trace_event_trigger_enable_disable(file, 1); + if (ret < 0) { + list_del_rcu(&data->list); + update_cond_flag(file); + } +out: + return ret; +} + +/** + * unregister_trigger - Generic event_command @unreg implementation + * @glob: The raw string used to register the trigger + * @test: Trigger-specific data used to find the trigger to remove + * @file: The trace_event_file associated with the event + * + * Common implementation for event trigger unregistration. + * + * Usually used directly as the @unreg method in event command + * implementations. + */ +static void unregister_trigger(char *glob, + struct event_trigger_data *test, + struct trace_event_file *file) +{ + struct event_trigger_data *data = NULL, *iter; + + lockdep_assert_held(&event_mutex); + + list_for_each_entry(iter, &file->triggers, list) { + if (iter->cmd_ops->trigger_type == test->cmd_ops->trigger_type) { + data = iter; + list_del_rcu(&data->list); + trace_event_trigger_enable_disable(file, 0); + update_cond_flag(file); + break; + } + } + + if (data && data->ops->free) + data->ops->free(data); +} + +/* + * Event trigger parsing helper functions. + * + * These functions help make it easier to write an event trigger + * parsing function i.e. the struct event_command.parse() callback + * function responsible for parsing and registering a trigger command + * written to the 'trigger' file. + * + * A trigger command (or just 'trigger' for short) takes the form: + * [trigger] [if filter] + * + * The struct event_command.parse() callback (and other struct + * event_command functions) refer to several components of a trigger + * command. Those same components are referenced by the event trigger + * parsing helper functions defined below. These components are: + * + * cmd - the trigger command name + * glob - the trigger command name optionally prefaced with '!' 
+ * param_and_filter - text following cmd and ':' + * param - text following cmd and ':' and stripped of filter + * filter - the optional filter text following (and including) 'if' + * + * To illustrate the use of these componenents, here are some concrete + * examples. For the following triggers: + * + * echo 'traceon:5 if pid == 0' > trigger + * - 'traceon' is both cmd and glob + * - '5 if pid == 0' is the param_and_filter + * - '5' is the param + * - 'if pid == 0' is the filter + * + * echo 'enable_event:sys:event:n' > trigger + * - 'enable_event' is both cmd and glob + * - 'sys:event:n' is the param_and_filter + * - 'sys:event:n' is the param + * - there is no filter + * + * echo 'hist:keys=pid if prio > 50' > trigger + * - 'hist' is both cmd and glob + * - 'keys=pid if prio > 50' is the param_and_filter + * - 'keys=pid' is the param + * - 'if prio > 50' is the filter + * + * echo '!enable_event:sys:event:n' > trigger + * - 'enable_event' the cmd + * - '!enable_event' is the glob + * - 'sys:event:n' is the param_and_filter + * - 'sys:event:n' is the param + * - there is no filter + * + * echo 'traceoff' > trigger + * - 'traceoff' is both cmd and glob + * - there is no param_and_filter + * - there is no param + * - there is no filter + * + * There are a few different categories of event trigger covered by + * these helpers: + * + * - triggers that don't require a parameter e.g. traceon + * - triggers that do require a parameter e.g. enable_event and hist + * - triggers that though they may not require a param may support an + * optional 'n' param (n = number of times the trigger should fire) + * e.g.: traceon:5 or enable_event:sys:event:n + * - triggers that do not support an 'n' param e.g. hist + * + * These functions can be used or ignored as necessary - it all + * depends on the complexity of the trigger, and the granularity of + * the functions supported reflects the fact that some implementations + * may need to customize certain aspects of their implementations and + * won't need certain functions. For instance, the hist trigger + * implementation doesn't use event_trigger_separate_filter() because + * it has special requirements for handling the filter. + */ + +/** + * event_trigger_check_remove - check whether an event trigger specifies remove + * @glob: The trigger command string, with optional remove(!) operator + * + * The event trigger callback implementations pass in 'glob' as a + * parameter. This is the command name either with or without a + * remove(!) operator. This function simply parses the glob and + * determines whether the command corresponds to a trigger removal or + * a trigger addition. + * + * Return: true if this is a remove command, false otherwise + */ +bool event_trigger_check_remove(const char *glob) +{ + return (glob && glob[0] == '!') ? true : false; +} + +/** + * event_trigger_empty_param - check whether the param is empty + * @param: The trigger param string + * + * The event trigger callback implementations pass in 'param' as a + * parameter. This corresponds to the string following the command + * name minus the command name. This function can be called by a + * callback implementation for any command that requires a param; a + * callback that doesn't require a param can ignore it. 
+ * + * Return: true if this is an empty param, false otherwise + */ +bool event_trigger_empty_param(const char *param) +{ + return !param; +} + +/** + * event_trigger_separate_filter - separate an event trigger from a filter + * @param_and_filter: String containing trigger and possibly filter + * @param: outparam, will be filled with a pointer to the trigger + * @filter: outparam, will be filled with a pointer to the filter + * @param_required: Specifies whether or not the param string is required + * + * Given a param string of the form '[trigger] [if filter]', this + * function separates the filter from the trigger and returns the + * trigger in @param and the filter in @filter. Either the @param + * or the @filter may be set to NULL by this function - if not set to + * NULL, they will contain strings corresponding to the trigger and + * filter. + * + * There are two cases that need to be handled with respect to the + * passed-in param: either the param is required, or it is not + * required. If @param_required is set, and there's no param, it will + * return -EINVAL. If @param_required is not set and there's a param + * that starts with a number, that corresponds to the case of a + * trigger with :n (n = number of times the trigger should fire) and + * the parsing continues normally; otherwise the function just returns + * and assumes param just contains a filter and there's nothing else + * to do. + * + * Return: 0 on success, errno otherwise + */ +int event_trigger_separate_filter(char *param_and_filter, char **param, + char **filter, bool param_required) +{ + int ret = 0; + + *param = *filter = NULL; + + if (!param_and_filter) { + if (param_required) + ret = -EINVAL; + goto out; + } + + /* + * Here we check for an optional param. The only legal + * optional param is :n, and if that's the case, continue + * below. Otherwise we assume what's left is a filter and + * return it as the filter string for the caller to deal with. + */ + if (!param_required && param_and_filter && !isdigit(param_and_filter[0])) { + *filter = param_and_filter; + goto out; + } + + /* + * Separate the param from the filter (param [if filter]). + * Here we have either an optional :n param or a required + * param and an optional filter. + */ + *param = strsep(¶m_and_filter, " \t"); + + /* + * Here we have a filter, though it may be empty. + */ + if (param_and_filter) { + *filter = skip_spaces(param_and_filter); + if (!**filter) + *filter = NULL; + } +out: + return ret; +} + +/** + * event_trigger_alloc - allocate and init event_trigger_data for a trigger + * @cmd_ops: The event_command operations for the trigger + * @cmd: The cmd string + * @param: The param string + * @private_data: User data to associate with the event trigger + * + * Allocate an event_trigger_data instance and initialize it. The + * @cmd_ops are used along with the @cmd and @param to get the + * trigger_ops to assign to the event_trigger_data. @private_data can + * also be passed in and associated with the event_trigger_data. + * + * Use event_trigger_free() to free an event_trigger_data object. 
+ * + * Return: The trigger_data object success, NULL otherwise + */ +struct event_trigger_data *event_trigger_alloc(struct event_command *cmd_ops, + char *cmd, + char *param, + void *private_data) +{ + struct event_trigger_data *trigger_data; + struct event_trigger_ops *trigger_ops; + + trigger_ops = cmd_ops->get_trigger_ops(cmd, param); + + trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL); + if (!trigger_data) + return NULL; + + trigger_data->count = -1; + trigger_data->ops = trigger_ops; + trigger_data->cmd_ops = cmd_ops; + trigger_data->private_data = private_data; + + INIT_LIST_HEAD(&trigger_data->list); + INIT_LIST_HEAD(&trigger_data->named_list); + RCU_INIT_POINTER(trigger_data->filter, NULL); + + return trigger_data; +} + +/** + * event_trigger_parse_num - parse and return the number param for a trigger + * @param: The param string + * @trigger_data: The trigger_data for the trigger + * + * Parse the :n (n = number of times the trigger should fire) param + * and set the count variable in the trigger_data to the parsed count. + * + * Return: 0 on success, errno otherwise + */ +int event_trigger_parse_num(char *param, + struct event_trigger_data *trigger_data) +{ + char *number; + int ret = 0; + + if (param) { + number = strsep(¶m, ":"); + + if (!strlen(number)) + return -EINVAL; + + /* + * We use the callback data field (which is a pointer) + * as our counter. + */ + ret = kstrtoul(number, 0, &trigger_data->count); + } + + return ret; +} + +/** + * event_trigger_set_filter - set an event trigger's filter + * @cmd_ops: The event_command operations for the trigger + * @file: The event file for the trigger's event + * @param: The string containing the filter + * @trigger_data: The trigger_data for the trigger + * + * Set the filter for the trigger. If the filter is NULL, just return + * without error. + * + * Return: 0 on success, errno otherwise + */ +int event_trigger_set_filter(struct event_command *cmd_ops, + struct trace_event_file *file, + char *param, + struct event_trigger_data *trigger_data) +{ + if (param && cmd_ops->set_filter) + return cmd_ops->set_filter(param, trigger_data, file); + + return 0; +} + +/** + * event_trigger_reset_filter - reset an event trigger's filter + * @cmd_ops: The event_command operations for the trigger + * @trigger_data: The trigger_data for the trigger + * + * Reset the filter for the trigger to no filter. + */ +void event_trigger_reset_filter(struct event_command *cmd_ops, + struct event_trigger_data *trigger_data) +{ + if (cmd_ops->set_filter) + cmd_ops->set_filter(NULL, trigger_data, NULL); +} + +/** + * event_trigger_register - register an event trigger + * @cmd_ops: The event_command operations for the trigger + * @file: The event file for the trigger's event + * @glob: The trigger command string, with optional remove(!) operator + * @trigger_data: The trigger_data for the trigger + * + * Register an event trigger. The @cmd_ops are used to call the + * cmd_ops->reg() function which actually does the registration. + * + * Return: 0 on success, errno otherwise + */ +int event_trigger_register(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, + struct event_trigger_data *trigger_data) +{ + return cmd_ops->reg(glob, trigger_data, file); +} + +/** + * event_trigger_unregister - unregister an event trigger + * @cmd_ops: The event_command operations for the trigger + * @file: The event file for the trigger's event + * @glob: The trigger command string, with optional remove(!) 
operator + * @trigger_data: The trigger_data for the trigger + * + * Unregister an event trigger. The @cmd_ops are used to call the + * cmd_ops->unreg() function which actually does the unregistration. + */ +void event_trigger_unregister(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, + struct event_trigger_data *trigger_data) +{ + cmd_ops->unreg(glob, trigger_data, file); +} + +/* + * End event trigger parsing helper functions. + */ + +/** + * event_trigger_parse - Generic event_command @parse implementation + * @cmd_ops: The command ops, used for trigger registration + * @file: The trace_event_file associated with the event + * @glob: The raw string used to register the trigger + * @cmd: The cmd portion of the string used to register the trigger + * @param_and_filter: The param and filter portion of the string used to register the trigger + * + * Common implementation for event command parsing and trigger + * instantiation. + * + * Usually used directly as the @parse method in event command + * implementations. + * + * Return: 0 on success, errno otherwise + */ +static int +event_trigger_parse(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, char *cmd, char *param_and_filter) +{ + struct event_trigger_data *trigger_data; + char *param, *filter; + bool remove; + int ret; + + remove = event_trigger_check_remove(glob); + + ret = event_trigger_separate_filter(param_and_filter, ¶m, &filter, false); + if (ret) + return ret; + + ret = -ENOMEM; + trigger_data = event_trigger_alloc(cmd_ops, cmd, param, file); + if (!trigger_data) + goto out; + + if (remove) { + event_trigger_unregister(cmd_ops, file, glob+1, trigger_data); + kfree(trigger_data); + ret = 0; + goto out; + } + + ret = event_trigger_parse_num(param, trigger_data); + if (ret) + goto out_free; + + ret = event_trigger_set_filter(cmd_ops, file, filter, trigger_data); + if (ret < 0) + goto out_free; + + /* Up the trigger_data count to make sure reg doesn't free it on failure */ + event_trigger_init(trigger_data); + + ret = event_trigger_register(cmd_ops, file, glob, trigger_data); + if (ret) + goto out_free; + + /* Down the counter of trigger_data or free it if not used anymore */ + event_trigger_free(trigger_data); + out: + return ret; + + out_free: + event_trigger_reset_filter(cmd_ops, trigger_data); + kfree(trigger_data); + goto out; +} + +/** + * set_trigger_filter - Generic event_command @set_filter implementation + * @filter_str: The filter string for the trigger, NULL to remove filter + * @trigger_data: Trigger-specific data + * @file: The trace_event_file associated with the event + * + * Common implementation for event command filter parsing and filter + * instantiation. + * + * Usually used directly as the @set_filter method in event command + * implementations. + * + * Also used to remove a filter (if filter_str = NULL). 
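+ *
+ * For example, a @filter_str of "if pid == 1" has the leading "if"
+ * stripped and the remainder compiled into an event filter that is
+ * then attached to the trigger under RCU.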
+ * + * Return: 0 on success, errno otherwise + */ +int set_trigger_filter(char *filter_str, + struct event_trigger_data *trigger_data, + struct trace_event_file *file) +{ + struct event_trigger_data *data = trigger_data; + struct event_filter *filter = NULL, *tmp; + int ret = -EINVAL; + char *s; + + if (!filter_str) /* clear the current filter */ + goto assign; + + s = strsep(&filter_str, " \t"); + + if (!strlen(s) || strcmp(s, "if") != 0) + goto out; + + if (!filter_str) + goto out; + + /* The filter is for the 'trigger' event, not the triggered event */ + ret = create_event_filter(file->tr, file->event_call, + filter_str, true, &filter); + + /* Only enabled set_str for error handling */ + if (filter) { + kfree(filter->filter_string); + filter->filter_string = NULL; + } + + /* + * If create_event_filter() fails, filter still needs to be freed. + * Which the calling code will do with data->filter. + */ + assign: + tmp = rcu_access_pointer(data->filter); + + rcu_assign_pointer(data->filter, filter); + + if (tmp) { + /* + * Make sure the call is done with the filter. + * It is possible that a filter could fail at boot up, + * and then this path will be called. Avoid the synchronization + * in that case. + */ + if (system_state != SYSTEM_BOOTING) + tracepoint_synchronize_unregister(); + free_event_filter(tmp); + } + + kfree(data->filter_str); + data->filter_str = NULL; + + if (filter_str) { + data->filter_str = kstrdup(filter_str, GFP_KERNEL); + if (!data->filter_str) { + free_event_filter(rcu_access_pointer(data->filter)); + data->filter = NULL; + ret = -ENOMEM; + } + } + out: + return ret; +} + +static LIST_HEAD(named_triggers); + +/** + * find_named_trigger - Find the common named trigger associated with @name + * @name: The name of the set of named triggers to find the common data for + * + * Named triggers are sets of triggers that share a common set of + * trigger data. The first named trigger registered with a given name + * owns the common trigger data that the others subsequently + * registered with the same name will reference. This function + * returns the common trigger data associated with that first + * registered instance. + * + * Return: the common trigger data for the given named trigger on + * success, NULL otherwise. + */ +struct event_trigger_data *find_named_trigger(const char *name) +{ + struct event_trigger_data *data; + + if (!name) + return NULL; + + list_for_each_entry(data, &named_triggers, named_list) { + if (data->named_data) + continue; + if (strcmp(data->name, name) == 0) + return data; + } + + return NULL; +} + +/** + * is_named_trigger - determine if a given trigger is a named trigger + * @test: The trigger data to test + * + * Return: true if 'test' is a named trigger, false otherwise. + */ +bool is_named_trigger(struct event_trigger_data *test) +{ + struct event_trigger_data *data; + + list_for_each_entry(data, &named_triggers, named_list) { + if (test == data) + return true; + } + + return false; +} + +/** + * save_named_trigger - save the trigger in the named trigger list + * @name: The name of the named trigger set + * @data: The trigger data to save + * + * Return: 0 if successful, negative error otherwise. 
+ */ +int save_named_trigger(const char *name, struct event_trigger_data *data) +{ + data->name = kstrdup(name, GFP_KERNEL); + if (!data->name) + return -ENOMEM; + + list_add(&data->named_list, &named_triggers); + + return 0; +} + +/** + * del_named_trigger - delete a trigger from the named trigger list + * @data: The trigger data to delete + */ +void del_named_trigger(struct event_trigger_data *data) +{ + kfree(data->name); + data->name = NULL; + + list_del(&data->named_list); +} + +static void __pause_named_trigger(struct event_trigger_data *data, bool pause) +{ + struct event_trigger_data *test; + + list_for_each_entry(test, &named_triggers, named_list) { + if (strcmp(test->name, data->name) == 0) { + if (pause) { + test->paused_tmp = test->paused; + test->paused = true; + } else { + test->paused = test->paused_tmp; + } + } + } +} + +/** + * pause_named_trigger - Pause all named triggers with the same name + * @data: The trigger data of a named trigger to pause + * + * Pauses a named trigger along with all other triggers having the + * same name. Because named triggers share a common set of data, + * pausing only one is meaningless, so pausing one named trigger needs + * to pause all triggers with the same name. + */ +void pause_named_trigger(struct event_trigger_data *data) +{ + __pause_named_trigger(data, true); +} + +/** + * unpause_named_trigger - Un-pause all named triggers with the same name + * @data: The trigger data of a named trigger to unpause + * + * Un-pauses a named trigger along with all other triggers having the + * same name. Because named triggers share a common set of data, + * unpausing only one is meaningless, so unpausing one named trigger + * needs to unpause all triggers with the same name. + */ +void unpause_named_trigger(struct event_trigger_data *data) +{ + __pause_named_trigger(data, false); +} + +/** + * set_named_trigger_data - Associate common named trigger data + * @data: The trigger data to associate + * @named_data: The common named trigger to be associated + * + * Named triggers are sets of triggers that share a common set of + * trigger data. The first named trigger registered with a given name + * owns the common trigger data that the others subsequently + * registered with the same name will reference. This function + * associates the common trigger data from the first trigger with the + * given trigger. 
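+ *
+ * See find_named_trigger() above for how the owning instance of a
+ * named trigger set is located.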
+ */ +void set_named_trigger_data(struct event_trigger_data *data, + struct event_trigger_data *named_data) +{ + data->named_data = named_data; +} + +struct event_trigger_data * +get_named_trigger_data(struct event_trigger_data *data) +{ + return data->named_data; +} + +static void +traceon_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, + struct ring_buffer_event *event) +{ + struct trace_event_file *file = data->private_data; + + if (file) { + if (tracer_tracing_is_on(file->tr)) + return; + + tracer_tracing_on(file->tr); + return; + } + + if (tracing_is_on()) + return; + + tracing_on(); +} + +static void +traceon_count_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, + struct ring_buffer_event *event) +{ + struct trace_event_file *file = data->private_data; + + if (file) { + if (tracer_tracing_is_on(file->tr)) + return; + } else { + if (tracing_is_on()) + return; + } + + if (!data->count) + return; + + if (data->count != -1) + (data->count)--; + + if (file) + tracer_tracing_on(file->tr); + else + tracing_on(); +} + +static void +traceoff_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, + struct ring_buffer_event *event) +{ + struct trace_event_file *file = data->private_data; + + if (file) { + if (!tracer_tracing_is_on(file->tr)) + return; + + tracer_tracing_off(file->tr); + return; + } + + if (!tracing_is_on()) + return; + + tracing_off(); +} + +static void +traceoff_count_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, + struct ring_buffer_event *event) +{ + struct trace_event_file *file = data->private_data; + + if (file) { + if (!tracer_tracing_is_on(file->tr)) + return; + } else { + if (!tracing_is_on()) + return; + } + + if (!data->count) + return; + + if (data->count != -1) + (data->count)--; + + if (file) + tracer_tracing_off(file->tr); + else + tracing_off(); +} + +static int +traceon_trigger_print(struct seq_file *m, struct event_trigger_data *data) +{ + return event_trigger_print("traceon", m, (void *)data->count, + data->filter_str); +} + +static int +traceoff_trigger_print(struct seq_file *m, struct event_trigger_data *data) +{ + return event_trigger_print("traceoff", m, (void *)data->count, + data->filter_str); +} + +static struct event_trigger_ops traceon_trigger_ops = { + .trigger = traceon_trigger, + .print = traceon_trigger_print, + .init = event_trigger_init, + .free = event_trigger_free, +}; + +static struct event_trigger_ops traceon_count_trigger_ops = { + .trigger = traceon_count_trigger, + .print = traceon_trigger_print, + .init = event_trigger_init, + .free = event_trigger_free, +}; + +static struct event_trigger_ops traceoff_trigger_ops = { + .trigger = traceoff_trigger, + .print = traceoff_trigger_print, + .init = event_trigger_init, + .free = event_trigger_free, +}; + +static struct event_trigger_ops traceoff_count_trigger_ops = { + .trigger = traceoff_count_trigger, + .print = traceoff_trigger_print, + .init = event_trigger_init, + .free = event_trigger_free, +}; + +static struct event_trigger_ops * +onoff_get_trigger_ops(char *cmd, char *param) +{ + struct event_trigger_ops *ops; + + /* we register both traceon and traceoff to this callback */ + if (strcmp(cmd, "traceon") == 0) + ops = param ? &traceon_count_trigger_ops : + &traceon_trigger_ops; + else + ops = param ? 
&traceoff_count_trigger_ops : + &traceoff_trigger_ops; + + return ops; +} + +static struct event_command trigger_traceon_cmd = { + .name = "traceon", + .trigger_type = ETT_TRACE_ONOFF, + .parse = event_trigger_parse, + .reg = register_trigger, + .unreg = unregister_trigger, + .get_trigger_ops = onoff_get_trigger_ops, + .set_filter = set_trigger_filter, +}; + +static struct event_command trigger_traceoff_cmd = { + .name = "traceoff", + .trigger_type = ETT_TRACE_ONOFF, + .flags = EVENT_CMD_FL_POST_TRIGGER, + .parse = event_trigger_parse, + .reg = register_trigger, + .unreg = unregister_trigger, + .get_trigger_ops = onoff_get_trigger_ops, + .set_filter = set_trigger_filter, +}; + +#ifdef CONFIG_TRACER_SNAPSHOT +static void +snapshot_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, + struct ring_buffer_event *event) +{ + struct trace_event_file *file = data->private_data; + + if (file) + tracing_snapshot_instance(file->tr); + else + tracing_snapshot(); +} + +static void +snapshot_count_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, + struct ring_buffer_event *event) +{ + if (!data->count) + return; + + if (data->count != -1) + (data->count)--; + + snapshot_trigger(data, buffer, rec, event); +} + +static int +register_snapshot_trigger(char *glob, + struct event_trigger_data *data, + struct trace_event_file *file) +{ + if (tracing_alloc_snapshot_instance(file->tr) != 0) + return 0; + + return register_trigger(glob, data, file); +} + +static int +snapshot_trigger_print(struct seq_file *m, struct event_trigger_data *data) +{ + return event_trigger_print("snapshot", m, (void *)data->count, + data->filter_str); +} + +static struct event_trigger_ops snapshot_trigger_ops = { + .trigger = snapshot_trigger, + .print = snapshot_trigger_print, + .init = event_trigger_init, + .free = event_trigger_free, +}; + +static struct event_trigger_ops snapshot_count_trigger_ops = { + .trigger = snapshot_count_trigger, + .print = snapshot_trigger_print, + .init = event_trigger_init, + .free = event_trigger_free, +}; + +static struct event_trigger_ops * +snapshot_get_trigger_ops(char *cmd, char *param) +{ + return param ? 
&snapshot_count_trigger_ops : &snapshot_trigger_ops; +} + +static struct event_command trigger_snapshot_cmd = { + .name = "snapshot", + .trigger_type = ETT_SNAPSHOT, + .parse = event_trigger_parse, + .reg = register_snapshot_trigger, + .unreg = unregister_trigger, + .get_trigger_ops = snapshot_get_trigger_ops, + .set_filter = set_trigger_filter, +}; + +static __init int register_trigger_snapshot_cmd(void) +{ + int ret; + + ret = register_event_command(&trigger_snapshot_cmd); + WARN_ON(ret < 0); + + return ret; +} +#else +static __init int register_trigger_snapshot_cmd(void) { return 0; } +#endif /* CONFIG_TRACER_SNAPSHOT */ + +#ifdef CONFIG_STACKTRACE +#ifdef CONFIG_UNWINDER_ORC +/* Skip 2: + * event_triggers_post_call() + * trace_event_raw_event_xxx() + */ +# define STACK_SKIP 2 +#else +/* + * Skip 4: + * stacktrace_trigger() + * event_triggers_post_call() + * trace_event_buffer_commit() + * trace_event_raw_event_xxx() + */ +#define STACK_SKIP 4 +#endif + +static void +stacktrace_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, + struct ring_buffer_event *event) +{ + struct trace_event_file *file = data->private_data; + + if (file) + __trace_stack(file->tr, tracing_gen_ctx(), STACK_SKIP); + else + trace_dump_stack(STACK_SKIP); +} + +static void +stacktrace_count_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, + struct ring_buffer_event *event) +{ + if (!data->count) + return; + + if (data->count != -1) + (data->count)--; + + stacktrace_trigger(data, buffer, rec, event); +} + +static int +stacktrace_trigger_print(struct seq_file *m, struct event_trigger_data *data) +{ + return event_trigger_print("stacktrace", m, (void *)data->count, + data->filter_str); +} + +static struct event_trigger_ops stacktrace_trigger_ops = { + .trigger = stacktrace_trigger, + .print = stacktrace_trigger_print, + .init = event_trigger_init, + .free = event_trigger_free, +}; + +static struct event_trigger_ops stacktrace_count_trigger_ops = { + .trigger = stacktrace_count_trigger, + .print = stacktrace_trigger_print, + .init = event_trigger_init, + .free = event_trigger_free, +}; + +static struct event_trigger_ops * +stacktrace_get_trigger_ops(char *cmd, char *param) +{ + return param ? 
&stacktrace_count_trigger_ops : &stacktrace_trigger_ops; +} + +static struct event_command trigger_stacktrace_cmd = { + .name = "stacktrace", + .trigger_type = ETT_STACKTRACE, + .flags = EVENT_CMD_FL_POST_TRIGGER, + .parse = event_trigger_parse, + .reg = register_trigger, + .unreg = unregister_trigger, + .get_trigger_ops = stacktrace_get_trigger_ops, + .set_filter = set_trigger_filter, +}; + +static __init int register_trigger_stacktrace_cmd(void) +{ + int ret; + + ret = register_event_command(&trigger_stacktrace_cmd); + WARN_ON(ret < 0); + + return ret; +} +#else +static __init int register_trigger_stacktrace_cmd(void) { return 0; } +#endif /* CONFIG_STACKTRACE */ + +static __init void unregister_trigger_traceon_traceoff_cmds(void) +{ + unregister_event_command(&trigger_traceon_cmd); + unregister_event_command(&trigger_traceoff_cmd); +} + +static void +event_enable_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, + struct ring_buffer_event *event) +{ + struct enable_trigger_data *enable_data = data->private_data; + + if (enable_data->enable) + clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &enable_data->file->flags); + else + set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &enable_data->file->flags); +} + +static void +event_enable_count_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, + struct ring_buffer_event *event) +{ + struct enable_trigger_data *enable_data = data->private_data; + + if (!data->count) + return; + + /* Skip if the event is in a state we want to switch to */ + if (enable_data->enable == !(enable_data->file->flags & EVENT_FILE_FL_SOFT_DISABLED)) + return; + + if (data->count != -1) + (data->count)--; + + event_enable_trigger(data, buffer, rec, event); +} + +int event_enable_trigger_print(struct seq_file *m, + struct event_trigger_data *data) +{ + struct enable_trigger_data *enable_data = data->private_data; + + seq_printf(m, "%s:%s:%s", + enable_data->hist ? + (enable_data->enable ? ENABLE_HIST_STR : DISABLE_HIST_STR) : + (enable_data->enable ? 
ENABLE_EVENT_STR : DISABLE_EVENT_STR), + enable_data->file->event_call->class->system, + trace_event_name(enable_data->file->event_call)); + + if (data->count == -1) + seq_puts(m, ":unlimited"); + else + seq_printf(m, ":count=%ld", data->count); + + if (data->filter_str) + seq_printf(m, " if %s\n", data->filter_str); + else + seq_putc(m, '\n'); + + return 0; +} + +void event_enable_trigger_free(struct event_trigger_data *data) +{ + struct enable_trigger_data *enable_data = data->private_data; + + if (WARN_ON_ONCE(data->ref <= 0)) + return; + + data->ref--; + if (!data->ref) { + /* Remove the SOFT_MODE flag */ + trace_event_enable_disable(enable_data->file, 0, 1); + trace_event_put_ref(enable_data->file->event_call); + trigger_data_free(data); + kfree(enable_data); + } +} + +static struct event_trigger_ops event_enable_trigger_ops = { + .trigger = event_enable_trigger, + .print = event_enable_trigger_print, + .init = event_trigger_init, + .free = event_enable_trigger_free, +}; + +static struct event_trigger_ops event_enable_count_trigger_ops = { + .trigger = event_enable_count_trigger, + .print = event_enable_trigger_print, + .init = event_trigger_init, + .free = event_enable_trigger_free, +}; + +static struct event_trigger_ops event_disable_trigger_ops = { + .trigger = event_enable_trigger, + .print = event_enable_trigger_print, + .init = event_trigger_init, + .free = event_enable_trigger_free, +}; + +static struct event_trigger_ops event_disable_count_trigger_ops = { + .trigger = event_enable_count_trigger, + .print = event_enable_trigger_print, + .init = event_trigger_init, + .free = event_enable_trigger_free, +}; + +int event_enable_trigger_parse(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, char *cmd, char *param_and_filter) +{ + struct trace_event_file *event_enable_file; + struct enable_trigger_data *enable_data; + struct event_trigger_data *trigger_data; + struct trace_array *tr = file->tr; + char *param, *filter; + bool enable, remove; + const char *system; + const char *event; + bool hist = false; + int ret; + + remove = event_trigger_check_remove(glob); + + if (event_trigger_empty_param(param_and_filter)) + return -EINVAL; + + ret = event_trigger_separate_filter(param_and_filter, ¶m, &filter, true); + if (ret) + return ret; + + system = strsep(¶m, ":"); + if (!param) + return -EINVAL; + + event = strsep(¶m, ":"); + + ret = -EINVAL; + event_enable_file = find_event_file(tr, system, event); + if (!event_enable_file) + goto out; + +#ifdef CONFIG_HIST_TRIGGERS + hist = ((strcmp(cmd, ENABLE_HIST_STR) == 0) || + (strcmp(cmd, DISABLE_HIST_STR) == 0)); + + enable = ((strcmp(cmd, ENABLE_EVENT_STR) == 0) || + (strcmp(cmd, ENABLE_HIST_STR) == 0)); +#else + enable = strcmp(cmd, ENABLE_EVENT_STR) == 0; +#endif + ret = -ENOMEM; + + enable_data = kzalloc(sizeof(*enable_data), GFP_KERNEL); + if (!enable_data) + goto out; + + enable_data->hist = hist; + enable_data->enable = enable; + enable_data->file = event_enable_file; + + trigger_data = event_trigger_alloc(cmd_ops, cmd, param, enable_data); + if (!trigger_data) { + kfree(enable_data); + goto out; + } + + if (remove) { + event_trigger_unregister(cmd_ops, file, glob+1, trigger_data); + kfree(trigger_data); + kfree(enable_data); + ret = 0; + goto out; + } + + /* Up the trigger_data count to make sure nothing frees it on failure */ + event_trigger_init(trigger_data); + + ret = event_trigger_parse_num(param, trigger_data); + if (ret) + goto out_free; + + ret = event_trigger_set_filter(cmd_ops, file, filter, 
trigger_data); + if (ret < 0) + goto out_free; + + /* Don't let event modules unload while probe registered */ + ret = trace_event_try_get_ref(event_enable_file->event_call); + if (!ret) { + ret = -EBUSY; + goto out_free; + } + + ret = trace_event_enable_disable(event_enable_file, 1, 1); + if (ret < 0) + goto out_put; + + ret = event_trigger_register(cmd_ops, file, glob, trigger_data); + if (ret) + goto out_disable; + + event_trigger_free(trigger_data); + out: + return ret; + out_disable: + trace_event_enable_disable(event_enable_file, 0, 1); + out_put: + trace_event_put_ref(event_enable_file->event_call); + out_free: + event_trigger_reset_filter(cmd_ops, trigger_data); + event_trigger_free(trigger_data); + kfree(enable_data); + + goto out; +} + +int event_enable_register_trigger(char *glob, + struct event_trigger_data *data, + struct trace_event_file *file) +{ + struct enable_trigger_data *enable_data = data->private_data; + struct enable_trigger_data *test_enable_data; + struct event_trigger_data *test; + int ret = 0; + + lockdep_assert_held(&event_mutex); + + list_for_each_entry(test, &file->triggers, list) { + test_enable_data = test->private_data; + if (test_enable_data && + (test->cmd_ops->trigger_type == + data->cmd_ops->trigger_type) && + (test_enable_data->file == enable_data->file)) { + ret = -EEXIST; + goto out; + } + } + + if (data->ops->init) { + ret = data->ops->init(data); + if (ret < 0) + goto out; + } + + list_add_rcu(&data->list, &file->triggers); + + update_cond_flag(file); + ret = trace_event_trigger_enable_disable(file, 1); + if (ret < 0) { + list_del_rcu(&data->list); + update_cond_flag(file); + } +out: + return ret; +} + +void event_enable_unregister_trigger(char *glob, + struct event_trigger_data *test, + struct trace_event_file *file) +{ + struct enable_trigger_data *test_enable_data = test->private_data; + struct event_trigger_data *data = NULL, *iter; + struct enable_trigger_data *enable_data; + + lockdep_assert_held(&event_mutex); + + list_for_each_entry(iter, &file->triggers, list) { + enable_data = iter->private_data; + if (enable_data && + (iter->cmd_ops->trigger_type == + test->cmd_ops->trigger_type) && + (enable_data->file == test_enable_data->file)) { + data = iter; + list_del_rcu(&data->list); + trace_event_trigger_enable_disable(file, 0); + update_cond_flag(file); + break; + } + } + + if (data && data->ops->free) + data->ops->free(data); +} + +static struct event_trigger_ops * +event_enable_get_trigger_ops(char *cmd, char *param) +{ + struct event_trigger_ops *ops; + bool enable; + +#ifdef CONFIG_HIST_TRIGGERS + enable = ((strcmp(cmd, ENABLE_EVENT_STR) == 0) || + (strcmp(cmd, ENABLE_HIST_STR) == 0)); +#else + enable = strcmp(cmd, ENABLE_EVENT_STR) == 0; +#endif + if (enable) + ops = param ? &event_enable_count_trigger_ops : + &event_enable_trigger_ops; + else + ops = param ? 
&event_disable_count_trigger_ops : + &event_disable_trigger_ops; + + return ops; +} + +static struct event_command trigger_enable_cmd = { + .name = ENABLE_EVENT_STR, + .trigger_type = ETT_EVENT_ENABLE, + .parse = event_enable_trigger_parse, + .reg = event_enable_register_trigger, + .unreg = event_enable_unregister_trigger, + .get_trigger_ops = event_enable_get_trigger_ops, + .set_filter = set_trigger_filter, +}; + +static struct event_command trigger_disable_cmd = { + .name = DISABLE_EVENT_STR, + .trigger_type = ETT_EVENT_ENABLE, + .parse = event_enable_trigger_parse, + .reg = event_enable_register_trigger, + .unreg = event_enable_unregister_trigger, + .get_trigger_ops = event_enable_get_trigger_ops, + .set_filter = set_trigger_filter, +}; + +static __init void unregister_trigger_enable_disable_cmds(void) +{ + unregister_event_command(&trigger_enable_cmd); + unregister_event_command(&trigger_disable_cmd); +} + +static __init int register_trigger_enable_disable_cmds(void) +{ + int ret; + + ret = register_event_command(&trigger_enable_cmd); + if (WARN_ON(ret < 0)) + return ret; + ret = register_event_command(&trigger_disable_cmd); + if (WARN_ON(ret < 0)) + unregister_trigger_enable_disable_cmds(); + + return ret; +} + +static __init int register_trigger_traceon_traceoff_cmds(void) +{ + int ret; + + ret = register_event_command(&trigger_traceon_cmd); + if (WARN_ON(ret < 0)) + return ret; + ret = register_event_command(&trigger_traceoff_cmd); + if (WARN_ON(ret < 0)) + unregister_trigger_traceon_traceoff_cmds(); + + return ret; +} + +__init int register_trigger_cmds(void) +{ + register_trigger_traceon_traceoff_cmds(); + register_trigger_snapshot_cmd(); + register_trigger_stacktrace_cmd(); + register_trigger_enable_disable_cmds(); + register_trigger_hist_enable_disable_cmds(); + register_trigger_hist_cmd(); + + return 0; +} diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c new file mode 100644 index 0000000000..b87f41187c --- /dev/null +++ b/kernel/trace/trace_events_user.c @@ -0,0 +1,2778 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2021, Microsoft Corporation. + * + * Authors: + * Beau Belgrave <beaub@linux.microsoft.com> + */ + +#include <linux/bitmap.h> +#include <linux/cdev.h> +#include <linux/hashtable.h> +#include <linux/list.h> +#include <linux/io.h> +#include <linux/uio.h> +#include <linux/ioctl.h> +#include <linux/jhash.h> +#include <linux/refcount.h> +#include <linux/trace_events.h> +#include <linux/tracefs.h> +#include <linux/types.h> +#include <linux/uaccess.h> +#include <linux/highmem.h> +#include <linux/init.h> +#include <linux/user_events.h> +#include "trace_dynevent.h" +#include "trace_output.h" +#include "trace.h" + +#define USER_EVENTS_PREFIX_LEN (sizeof(USER_EVENTS_PREFIX)-1) + +#define FIELD_DEPTH_TYPE 0 +#define FIELD_DEPTH_NAME 1 +#define FIELD_DEPTH_SIZE 2 + +/* Limit how long of an event name plus args within the subsystem. */ +#define MAX_EVENT_DESC 512 +#define EVENT_NAME(user_event) ((user_event)->tracepoint.name) +#define MAX_FIELD_ARRAY_SIZE 1024 + +/* + * Internal bits (kernel side only) to keep track of connected probes: + * These are used when status is requested in text form about an event. These + * bits are compared against an internal byte on the event to determine which + * probes to print out to the user. + * + * These do not reflect the mapped bytes between the user and kernel space. 
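+ *
+ * EVENT_STATUS_FTRACE, for example, is set on the event's internal
+ * status byte while an ftrace probe is connected to it.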
+ */ +#define EVENT_STATUS_FTRACE BIT(0) +#define EVENT_STATUS_PERF BIT(1) +#define EVENT_STATUS_OTHER BIT(7) + +/* + * User register flags are not allowed yet, keep them here until we are + * ready to expose them out to the user ABI. + */ +enum user_reg_flag { + /* Event will not delete upon last reference closing */ + USER_EVENT_REG_PERSIST = 1U << 0, + + /* This value or above is currently non-ABI */ + USER_EVENT_REG_MAX = 1U << 1, +}; + +/* + * Stores the system name, tables, and locks for a group of events. This + * allows isolation for events by various means. + */ +struct user_event_group { + char *system_name; + struct hlist_node node; + struct mutex reg_mutex; + DECLARE_HASHTABLE(register_table, 8); +}; + +/* Group for init_user_ns mapping, top-most group */ +static struct user_event_group *init_group; + +/* Max allowed events for the whole system */ +static unsigned int max_user_events = 32768; + +/* Current number of events on the whole system */ +static unsigned int current_user_events; + +/* + * Stores per-event properties, as users register events + * within a file a user_event might be created if it does not + * already exist. These are globally used and their lifetime + * is tied to the refcnt member. These cannot go away until the + * refcnt reaches one. + */ +struct user_event { + struct user_event_group *group; + struct tracepoint tracepoint; + struct trace_event_call call; + struct trace_event_class class; + struct dyn_event devent; + struct hlist_node node; + struct list_head fields; + struct list_head validators; + struct work_struct put_work; + refcount_t refcnt; + int min_size; + int reg_flags; + char status; +}; + +/* + * Stores per-mm/event properties that enable an address to be + * updated properly for each task. As tasks are forked, we use + * these to track enablement sites that are tied to an event. + */ +struct user_event_enabler { + struct list_head mm_enablers_link; + struct user_event *event; + unsigned long addr; + + /* Track enable bit, flags, etc. Aligned for bitops. */ + unsigned long values; +}; + +/* Bits 0-5 are for the bit to update upon enable/disable (0-63 allowed) */ +#define ENABLE_VAL_BIT_MASK 0x3F + +/* Bit 6 is for faulting status of enablement */ +#define ENABLE_VAL_FAULTING_BIT 6 + +/* Bit 7 is for freeing status of enablement */ +#define ENABLE_VAL_FREEING_BIT 7 + +/* Bit 8 is for marking 32-bit on 64-bit */ +#define ENABLE_VAL_32_ON_64_BIT 8 + +#define ENABLE_VAL_COMPAT_MASK (1 << ENABLE_VAL_32_ON_64_BIT) + +/* Only duplicate the bit and compat values */ +#define ENABLE_VAL_DUP_MASK (ENABLE_VAL_BIT_MASK | ENABLE_VAL_COMPAT_MASK) + +#define ENABLE_BITOPS(e) (&(e)->values) + +#define ENABLE_BIT(e) ((int)((e)->values & ENABLE_VAL_BIT_MASK)) + +/* Used for asynchronous faulting in of pages */ +struct user_event_enabler_fault { + struct work_struct work; + struct user_event_mm *mm; + struct user_event_enabler *enabler; + int attempt; +}; + +static struct kmem_cache *fault_cache; + +/* Global list of memory descriptors using user_events */ +static LIST_HEAD(user_event_mms); +static DEFINE_SPINLOCK(user_event_mms_lock); + +/* + * Stores per-file events references, as users register events + * within a file this structure is modified and freed via RCU. + * The lifetime of this struct is tied to the lifetime of the file. + * These are not shared and only accessible by the file that created it. 
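+ *
+ * The events[] array below is indexed by the per-file write index that
+ * is handed back to user space when an event is registered.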
+ */ +struct user_event_refs { + struct rcu_head rcu; + int count; + struct user_event *events[]; +}; + +struct user_event_file_info { + struct user_event_group *group; + struct user_event_refs *refs; +}; + +#define VALIDATOR_ENSURE_NULL (1 << 0) +#define VALIDATOR_REL (1 << 1) + +struct user_event_validator { + struct list_head user_event_link; + int offset; + int flags; +}; + +static inline void align_addr_bit(unsigned long *addr, int *bit, + unsigned long *flags) +{ + if (IS_ALIGNED(*addr, sizeof(long))) { +#ifdef __BIG_ENDIAN + /* 32 bit on BE 64 bit requires a 32 bit offset when aligned. */ + if (test_bit(ENABLE_VAL_32_ON_64_BIT, flags)) + *bit += 32; +#endif + return; + } + + *addr = ALIGN_DOWN(*addr, sizeof(long)); + + /* + * We only support 32 and 64 bit values. The only time we need + * to align is a 32 bit value on a 64 bit kernel, which on LE + * is always 32 bits, and on BE requires no change when unaligned. + */ +#ifdef __LITTLE_ENDIAN + *bit += 32; +#endif +} + +typedef void (*user_event_func_t) (struct user_event *user, struct iov_iter *i, + void *tpdata, bool *faulted); + +static int user_event_parse(struct user_event_group *group, char *name, + char *args, char *flags, + struct user_event **newuser, int reg_flags); + +static struct user_event_mm *user_event_mm_get(struct user_event_mm *mm); +static struct user_event_mm *user_event_mm_get_all(struct user_event *user); +static void user_event_mm_put(struct user_event_mm *mm); +static int destroy_user_event(struct user_event *user); + +static u32 user_event_key(char *name) +{ + return jhash(name, strlen(name), 0); +} + +static struct user_event *user_event_get(struct user_event *user) +{ + refcount_inc(&user->refcnt); + + return user; +} + +static void delayed_destroy_user_event(struct work_struct *work) +{ + struct user_event *user = container_of( + work, struct user_event, put_work); + + mutex_lock(&event_mutex); + + if (!refcount_dec_and_test(&user->refcnt)) + goto out; + + if (destroy_user_event(user)) { + /* + * The only reason this would fail here is if we cannot + * update the visibility of the event. In this case the + * event stays in the hashtable, waiting for someone to + * attempt to delete it later. + */ + pr_warn("user_events: Unable to delete event\n"); + refcount_set(&user->refcnt, 1); + } +out: + mutex_unlock(&event_mutex); +} + +static void user_event_put(struct user_event *user, bool locked) +{ + bool delete; + + if (unlikely(!user)) + return; + + /* + * When the event is not enabled for auto-delete there will always + * be at least 1 reference to the event. During the event creation + * we initially set the refcnt to 2 to achieve this. In those cases + * the caller must acquire event_mutex and after decrement check if + * the refcnt is 1, meaning this is the last reference. When auto + * delete is enabled, there will only be 1 ref, IE: refcnt will be + * only set to 1 during creation to allow the below checks to go + * through upon the last put. The last put must always be done with + * the event mutex held. + */ + if (!locked) { + lockdep_assert_not_held(&event_mutex); + delete = refcount_dec_and_mutex_lock(&user->refcnt, &event_mutex); + } else { + lockdep_assert_held(&event_mutex); + delete = refcount_dec_and_test(&user->refcnt); + } + + if (!delete) + return; + + /* + * We now have the event_mutex in all cases, which ensures that + * no new references will be taken until event_mutex is released. + * New references come through find_user_event(), which requires + * the event_mutex to be held. 
+ */ + + if (user->reg_flags & USER_EVENT_REG_PERSIST) { + /* We should not get here when persist flag is set */ + pr_alert("BUG: Auto-delete engaged on persistent event\n"); + goto out; + } + + /* + * Unfortunately we have to attempt the actual destroy in a work + * queue. This is because not all cases handle a trace_event_call + * being removed within the class->reg() operation for unregister. + */ + INIT_WORK(&user->put_work, delayed_destroy_user_event); + + /* + * Since the event is still in the hashtable, we have to re-inc + * the ref count to 1. This count will be decremented and checked + * in the work queue to ensure it's still the last ref. This is + * needed because a user-process could register the same event in + * between the time of event_mutex release and the work queue + * running the delayed destroy. If we removed the item now from + * the hashtable, this would result in a timing window where a + * user process would fail a register because the trace_event_call + * register would fail in the tracing layers. + */ + refcount_set(&user->refcnt, 1); + + if (WARN_ON_ONCE(!schedule_work(&user->put_work))) { + /* + * If we fail we must wait for an admin to attempt delete or + * another register/close of the event, whichever is first. + */ + pr_warn("user_events: Unable to queue delayed destroy\n"); + } +out: + /* Ensure if we didn't have event_mutex before we unlock it */ + if (!locked) + mutex_unlock(&event_mutex); +} + +static void user_event_group_destroy(struct user_event_group *group) +{ + kfree(group->system_name); + kfree(group); +} + +static char *user_event_group_system_name(void) +{ + char *system_name; + int len = sizeof(USER_EVENTS_SYSTEM) + 1; + + system_name = kmalloc(len, GFP_KERNEL); + + if (!system_name) + return NULL; + + snprintf(system_name, len, "%s", USER_EVENTS_SYSTEM); + + return system_name; +} + +static struct user_event_group *current_user_event_group(void) +{ + return init_group; +} + +static struct user_event_group *user_event_group_create(void) +{ + struct user_event_group *group; + + group = kzalloc(sizeof(*group), GFP_KERNEL); + + if (!group) + return NULL; + + group->system_name = user_event_group_system_name(); + + if (!group->system_name) + goto error; + + mutex_init(&group->reg_mutex); + hash_init(group->register_table); + + return group; +error: + if (group) + user_event_group_destroy(group); + + return NULL; +}; + +static void user_event_enabler_destroy(struct user_event_enabler *enabler, + bool locked) +{ + list_del_rcu(&enabler->mm_enablers_link); + + /* No longer tracking the event via the enabler */ + user_event_put(enabler->event, locked); + + kfree(enabler); +} + +static int user_event_mm_fault_in(struct user_event_mm *mm, unsigned long uaddr, + int attempt) +{ + bool unlocked; + int ret; + + /* + * Normally this is low, ensure that it cannot be taken advantage of by + * bad user processes to cause excessive looping. 
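+ *
+ * The caller passes in the running attempt count, which is checked
+ * against a small fixed limit immediately below.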
+ */ + if (attempt > 10) + return -EFAULT; + + mmap_read_lock(mm->mm); + + /* Ensure MM has tasks, cannot use after exit_mm() */ + if (refcount_read(&mm->tasks) == 0) { + ret = -ENOENT; + goto out; + } + + ret = fixup_user_fault(mm->mm, uaddr, FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE, + &unlocked); +out: + mmap_read_unlock(mm->mm); + + return ret; +} + +static int user_event_enabler_write(struct user_event_mm *mm, + struct user_event_enabler *enabler, + bool fixup_fault, int *attempt); + +static void user_event_enabler_fault_fixup(struct work_struct *work) +{ + struct user_event_enabler_fault *fault = container_of( + work, struct user_event_enabler_fault, work); + struct user_event_enabler *enabler = fault->enabler; + struct user_event_mm *mm = fault->mm; + unsigned long uaddr = enabler->addr; + int attempt = fault->attempt; + int ret; + + ret = user_event_mm_fault_in(mm, uaddr, attempt); + + if (ret && ret != -ENOENT) { + struct user_event *user = enabler->event; + + pr_warn("user_events: Fault for mm: 0x%pK @ 0x%llx event: %s\n", + mm->mm, (unsigned long long)uaddr, EVENT_NAME(user)); + } + + /* Prevent state changes from racing */ + mutex_lock(&event_mutex); + + /* User asked for enabler to be removed during fault */ + if (test_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler))) { + user_event_enabler_destroy(enabler, true); + goto out; + } + + /* + * If we managed to get the page, re-issue the write. We do not + * want to get into a possible infinite loop, which is why we only + * attempt again directly if the page came in. If we couldn't get + * the page here, then we will try again the next time the event is + * enabled/disabled. + */ + clear_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler)); + + if (!ret) { + mmap_read_lock(mm->mm); + user_event_enabler_write(mm, enabler, true, &attempt); + mmap_read_unlock(mm->mm); + } +out: + mutex_unlock(&event_mutex); + + /* In all cases we no longer need the mm or fault */ + user_event_mm_put(mm); + kmem_cache_free(fault_cache, fault); +} + +static bool user_event_enabler_queue_fault(struct user_event_mm *mm, + struct user_event_enabler *enabler, + int attempt) +{ + struct user_event_enabler_fault *fault; + + fault = kmem_cache_zalloc(fault_cache, GFP_NOWAIT | __GFP_NOWARN); + + if (!fault) + return false; + + INIT_WORK(&fault->work, user_event_enabler_fault_fixup); + fault->mm = user_event_mm_get(mm); + fault->enabler = enabler; + fault->attempt = attempt; + + /* Don't try to queue in again while we have a pending fault */ + set_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler)); + + if (!schedule_work(&fault->work)) { + /* Allow another attempt later */ + clear_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler)); + + user_event_mm_put(mm); + kmem_cache_free(fault_cache, fault); + + return false; + } + + return true; +} + +static int user_event_enabler_write(struct user_event_mm *mm, + struct user_event_enabler *enabler, + bool fixup_fault, int *attempt) +{ + unsigned long uaddr = enabler->addr; + unsigned long *ptr; + struct page *page; + void *kaddr; + int bit = ENABLE_BIT(enabler); + int ret; + + lockdep_assert_held(&event_mutex); + mmap_assert_locked(mm->mm); + + *attempt += 1; + + /* Ensure MM has tasks, cannot use after exit_mm() */ + if (refcount_read(&mm->tasks) == 0) + return -ENOENT; + + if (unlikely(test_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler)) || + test_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler)))) + return -EBUSY; + + align_addr_bit(&uaddr, &bit, ENABLE_BITOPS(enabler)); + + ret = 
pin_user_pages_remote(mm->mm, uaddr, 1, FOLL_WRITE | FOLL_NOFAULT, + &page, NULL); + + if (unlikely(ret <= 0)) { + if (!fixup_fault) + return -EFAULT; + + if (!user_event_enabler_queue_fault(mm, enabler, *attempt)) + pr_warn("user_events: Unable to queue fault handler\n"); + + return -EFAULT; + } + + kaddr = kmap_local_page(page); + ptr = kaddr + (uaddr & ~PAGE_MASK); + + /* Update bit atomically, user tracers must be atomic as well */ + if (enabler->event && enabler->event->status) + set_bit(bit, ptr); + else + clear_bit(bit, ptr); + + kunmap_local(kaddr); + unpin_user_pages_dirty_lock(&page, 1, true); + + return 0; +} + +static bool user_event_enabler_exists(struct user_event_mm *mm, + unsigned long uaddr, unsigned char bit) +{ + struct user_event_enabler *enabler; + + list_for_each_entry(enabler, &mm->enablers, mm_enablers_link) { + if (enabler->addr == uaddr && ENABLE_BIT(enabler) == bit) + return true; + } + + return false; +} + +static void user_event_enabler_update(struct user_event *user) +{ + struct user_event_enabler *enabler; + struct user_event_mm *next; + struct user_event_mm *mm; + int attempt; + + lockdep_assert_held(&event_mutex); + + /* + * We need to build a one-shot list of all the mms that have an + * enabler for the user_event passed in. This list is only valid + * while holding the event_mutex. The only reason for this is due + * to the global mm list being RCU protected and we use methods + * which can wait (mmap_read_lock and pin_user_pages_remote). + * + * NOTE: user_event_mm_get_all() increments the ref count of each + * mm that is added to the list to prevent removal timing windows. + * We must always put each mm after they are used, which may wait. + */ + mm = user_event_mm_get_all(user); + + while (mm) { + next = mm->next; + mmap_read_lock(mm->mm); + + list_for_each_entry(enabler, &mm->enablers, mm_enablers_link) { + if (enabler->event == user) { + attempt = 0; + user_event_enabler_write(mm, enabler, true, &attempt); + } + } + + mmap_read_unlock(mm->mm); + user_event_mm_put(mm); + mm = next; + } +} + +static bool user_event_enabler_dup(struct user_event_enabler *orig, + struct user_event_mm *mm) +{ + struct user_event_enabler *enabler; + + /* Skip pending frees */ + if (unlikely(test_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(orig)))) + return true; + + enabler = kzalloc(sizeof(*enabler), GFP_NOWAIT | __GFP_ACCOUNT); + + if (!enabler) + return false; + + enabler->event = user_event_get(orig->event); + enabler->addr = orig->addr; + + /* Only dup part of value (ignore future flags, etc) */ + enabler->values = orig->values & ENABLE_VAL_DUP_MASK; + + /* Enablers not exposed yet, RCU not required */ + list_add(&enabler->mm_enablers_link, &mm->enablers); + + return true; +} + +static struct user_event_mm *user_event_mm_get(struct user_event_mm *mm) +{ + refcount_inc(&mm->refcnt); + + return mm; +} + +static struct user_event_mm *user_event_mm_get_all(struct user_event *user) +{ + struct user_event_mm *found = NULL; + struct user_event_enabler *enabler; + struct user_event_mm *mm; + + /* + * We use the mm->next field to build a one-shot list from the global + * RCU protected list. To build this list the event_mutex must be held. + * This lets us build a list without requiring allocs that could fail + * when user based events are most wanted for diagnostics. 
+ */ + lockdep_assert_held(&event_mutex); + + /* + * We do not want to block fork/exec while enablements are being + * updated, so we use RCU to walk the current tasks that have used + * user_events ABI for 1 or more events. Each enabler found in each + * task that matches the event being updated has a write to reflect + * the kernel state back into the process. Waits/faults must not occur + * during this. So we scan the list under RCU for all the mm that have + * the event within it. This is needed because mm_read_lock() can wait. + * Each user mm returned has a ref inc to handle remove RCU races. + */ + rcu_read_lock(); + + list_for_each_entry_rcu(mm, &user_event_mms, mms_link) { + list_for_each_entry_rcu(enabler, &mm->enablers, mm_enablers_link) { + if (enabler->event == user) { + mm->next = found; + found = user_event_mm_get(mm); + break; + } + } + } + + rcu_read_unlock(); + + return found; +} + +static struct user_event_mm *user_event_mm_alloc(struct task_struct *t) +{ + struct user_event_mm *user_mm; + + user_mm = kzalloc(sizeof(*user_mm), GFP_KERNEL_ACCOUNT); + + if (!user_mm) + return NULL; + + user_mm->mm = t->mm; + INIT_LIST_HEAD(&user_mm->enablers); + refcount_set(&user_mm->refcnt, 1); + refcount_set(&user_mm->tasks, 1); + + /* + * The lifetime of the memory descriptor can slightly outlast + * the task lifetime if a ref to the user_event_mm is taken + * between list_del_rcu() and call_rcu(). Therefore we need + * to take a reference to it to ensure it can live this long + * under this corner case. This can also occur in clones that + * outlast the parent. + */ + mmgrab(user_mm->mm); + + return user_mm; +} + +static void user_event_mm_attach(struct user_event_mm *user_mm, struct task_struct *t) +{ + unsigned long flags; + + spin_lock_irqsave(&user_event_mms_lock, flags); + list_add_rcu(&user_mm->mms_link, &user_event_mms); + spin_unlock_irqrestore(&user_event_mms_lock, flags); + + t->user_event_mm = user_mm; +} + +static struct user_event_mm *current_user_event_mm(void) +{ + struct user_event_mm *user_mm = current->user_event_mm; + + if (user_mm) + goto inc; + + user_mm = user_event_mm_alloc(current); + + if (!user_mm) + goto error; + + user_event_mm_attach(user_mm, current); +inc: + refcount_inc(&user_mm->refcnt); +error: + return user_mm; +} + +static void user_event_mm_destroy(struct user_event_mm *mm) +{ + struct user_event_enabler *enabler, *next; + + list_for_each_entry_safe(enabler, next, &mm->enablers, mm_enablers_link) + user_event_enabler_destroy(enabler, false); + + mmdrop(mm->mm); + kfree(mm); +} + +static void user_event_mm_put(struct user_event_mm *mm) +{ + if (mm && refcount_dec_and_test(&mm->refcnt)) + user_event_mm_destroy(mm); +} + +static void delayed_user_event_mm_put(struct work_struct *work) +{ + struct user_event_mm *mm; + + mm = container_of(to_rcu_work(work), struct user_event_mm, put_rwork); + user_event_mm_put(mm); +} + +void user_event_mm_remove(struct task_struct *t) +{ + struct user_event_mm *mm; + unsigned long flags; + + might_sleep(); + + mm = t->user_event_mm; + t->user_event_mm = NULL; + + /* Clone will increment the tasks, only remove if last clone */ + if (!refcount_dec_and_test(&mm->tasks)) + return; + + /* Remove the mm from the list, so it can no longer be enabled */ + spin_lock_irqsave(&user_event_mms_lock, flags); + list_del_rcu(&mm->mms_link); + spin_unlock_irqrestore(&user_event_mms_lock, flags); + + /* + * We need to wait for currently occurring writes to stop within + * the mm. 
This is required since exit_mm() snaps the current rss + * stats and clears them. On the final mmdrop(), check_mm() will + * report a bug if these increment. + * + * All writes/pins are done under mmap_read lock, take the write + * lock to ensure in-progress faults have completed. Faults that + * are pending but yet to run will check the task count and skip + * the fault since the mm is going away. + */ + mmap_write_lock(mm->mm); + mmap_write_unlock(mm->mm); + + /* + * Put for mm must be done after RCU delay to handle new refs in + * between the list_del_rcu() and now. This ensures any get refs + * during rcu_read_lock() are accounted for during list removal. + * + * CPU A | CPU B + * --------------------------------------------------------------- + * user_event_mm_remove() | rcu_read_lock(); + * list_del_rcu() | list_for_each_entry_rcu(); + * call_rcu() | refcount_inc(); + * . | rcu_read_unlock(); + * schedule_work() | . + * user_event_mm_put() | . + * + * mmdrop() cannot be called in the softirq context of call_rcu() + * so we use a work queue after call_rcu() to run within. + */ + INIT_RCU_WORK(&mm->put_rwork, delayed_user_event_mm_put); + queue_rcu_work(system_wq, &mm->put_rwork); +} + +void user_event_mm_dup(struct task_struct *t, struct user_event_mm *old_mm) +{ + struct user_event_mm *mm = user_event_mm_alloc(t); + struct user_event_enabler *enabler; + + if (!mm) + return; + + rcu_read_lock(); + + list_for_each_entry_rcu(enabler, &old_mm->enablers, mm_enablers_link) { + if (!user_event_enabler_dup(enabler, mm)) + goto error; + } + + rcu_read_unlock(); + + user_event_mm_attach(mm, t); + return; +error: + rcu_read_unlock(); + user_event_mm_destroy(mm); +} + +static bool current_user_event_enabler_exists(unsigned long uaddr, + unsigned char bit) +{ + struct user_event_mm *user_mm = current_user_event_mm(); + bool exists; + + if (!user_mm) + return false; + + exists = user_event_enabler_exists(user_mm, uaddr, bit); + + user_event_mm_put(user_mm); + + return exists; +} + +static struct user_event_enabler +*user_event_enabler_create(struct user_reg *reg, struct user_event *user, + int *write_result) +{ + struct user_event_enabler *enabler; + struct user_event_mm *user_mm; + unsigned long uaddr = (unsigned long)reg->enable_addr; + int attempt = 0; + + user_mm = current_user_event_mm(); + + if (!user_mm) + return NULL; + + enabler = kzalloc(sizeof(*enabler), GFP_KERNEL_ACCOUNT); + + if (!enabler) + goto out; + + enabler->event = user; + enabler->addr = uaddr; + enabler->values = reg->enable_bit; + +#if BITS_PER_LONG >= 64 + if (reg->enable_size == 4) + set_bit(ENABLE_VAL_32_ON_64_BIT, ENABLE_BITOPS(enabler)); +#endif + +retry: + /* Prevents state changes from racing with new enablers */ + mutex_lock(&event_mutex); + + /* Attempt to reflect the current state within the process */ + mmap_read_lock(user_mm->mm); + *write_result = user_event_enabler_write(user_mm, enabler, false, + &attempt); + mmap_read_unlock(user_mm->mm); + + /* + * If the write works, then we will track the enabler. A ref to the + * underlying user_event is held by the enabler to prevent it going + * away while the enabler is still in use by a process. The ref is + * removed when the enabler is destroyed. This means a event cannot + * be forcefully deleted from the system until all tasks using it + * exit or run exec(), which includes forks and clones. 
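+ *
+ * If the write below fails, the page is faulted in and the write is
+ * retried via the retry label for as long as the fault-in succeeds.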
+ */ + if (!*write_result) { + user_event_get(user); + list_add_rcu(&enabler->mm_enablers_link, &user_mm->enablers); + } + + mutex_unlock(&event_mutex); + + if (*write_result) { + /* Attempt to fault-in and retry if it worked */ + if (!user_event_mm_fault_in(user_mm, uaddr, attempt)) + goto retry; + + kfree(enabler); + enabler = NULL; + } +out: + user_event_mm_put(user_mm); + + return enabler; +} + +static __always_inline __must_check +bool user_event_last_ref(struct user_event *user) +{ + int last = 0; + + if (user->reg_flags & USER_EVENT_REG_PERSIST) + last = 1; + + return refcount_read(&user->refcnt) == last; +} + +static __always_inline __must_check +size_t copy_nofault(void *addr, size_t bytes, struct iov_iter *i) +{ + size_t ret; + + pagefault_disable(); + + ret = copy_from_iter_nocache(addr, bytes, i); + + pagefault_enable(); + + return ret; +} + +static struct list_head *user_event_get_fields(struct trace_event_call *call) +{ + struct user_event *user = (struct user_event *)call->data; + + return &user->fields; +} + +/* + * Parses a register command for user_events + * Format: event_name[:FLAG1[,FLAG2...]] [field1[;field2...]] + * + * Example event named 'test' with a 20 char 'msg' field with an unsigned int + * 'id' field after: + * test char[20] msg;unsigned int id + * + * NOTE: Offsets are from the user data perspective, they are not from the + * trace_entry/buffer perspective. We automatically add the common properties + * sizes to the offset for the user. + * + * Upon success user_event has its ref count increased by 1. + */ +static int user_event_parse_cmd(struct user_event_group *group, + char *raw_command, struct user_event **newuser, + int reg_flags) +{ + char *name = raw_command; + char *args = strpbrk(name, " "); + char *flags; + + if (args) + *args++ = '\0'; + + flags = strpbrk(name, ":"); + + if (flags) + *flags++ = '\0'; + + return user_event_parse(group, name, args, flags, newuser, reg_flags); +} + +static int user_field_array_size(const char *type) +{ + const char *start = strchr(type, '['); + char val[8]; + char *bracket; + int size = 0; + + if (start == NULL) + return -EINVAL; + + if (strscpy(val, start + 1, sizeof(val)) <= 0) + return -EINVAL; + + bracket = strchr(val, ']'); + + if (!bracket) + return -EINVAL; + + *bracket = '\0'; + + if (kstrtouint(val, 0, &size)) + return -EINVAL; + + if (size > MAX_FIELD_ARRAY_SIZE) + return -EINVAL; + + return size; +} + +static int user_field_size(const char *type) +{ + /* long is not allowed from a user, since it's ambigious in size */ + if (strcmp(type, "s64") == 0) + return sizeof(s64); + if (strcmp(type, "u64") == 0) + return sizeof(u64); + if (strcmp(type, "s32") == 0) + return sizeof(s32); + if (strcmp(type, "u32") == 0) + return sizeof(u32); + if (strcmp(type, "int") == 0) + return sizeof(int); + if (strcmp(type, "unsigned int") == 0) + return sizeof(unsigned int); + if (strcmp(type, "s16") == 0) + return sizeof(s16); + if (strcmp(type, "u16") == 0) + return sizeof(u16); + if (strcmp(type, "short") == 0) + return sizeof(short); + if (strcmp(type, "unsigned short") == 0) + return sizeof(unsigned short); + if (strcmp(type, "s8") == 0) + return sizeof(s8); + if (strcmp(type, "u8") == 0) + return sizeof(u8); + if (strcmp(type, "char") == 0) + return sizeof(char); + if (strcmp(type, "unsigned char") == 0) + return sizeof(unsigned char); + if (str_has_prefix(type, "char[")) + return user_field_array_size(type); + if (str_has_prefix(type, "unsigned char[")) + return user_field_array_size(type); + if (str_has_prefix(type, 
"__data_loc ")) + return sizeof(u32); + if (str_has_prefix(type, "__rel_loc ")) + return sizeof(u32); + + /* Uknown basic type, error */ + return -EINVAL; +} + +static void user_event_destroy_validators(struct user_event *user) +{ + struct user_event_validator *validator, *next; + struct list_head *head = &user->validators; + + list_for_each_entry_safe(validator, next, head, user_event_link) { + list_del(&validator->user_event_link); + kfree(validator); + } +} + +static void user_event_destroy_fields(struct user_event *user) +{ + struct ftrace_event_field *field, *next; + struct list_head *head = &user->fields; + + list_for_each_entry_safe(field, next, head, link) { + list_del(&field->link); + kfree(field); + } +} + +static int user_event_add_field(struct user_event *user, const char *type, + const char *name, int offset, int size, + int is_signed, int filter_type) +{ + struct user_event_validator *validator; + struct ftrace_event_field *field; + int validator_flags = 0; + + field = kmalloc(sizeof(*field), GFP_KERNEL_ACCOUNT); + + if (!field) + return -ENOMEM; + + if (str_has_prefix(type, "__data_loc ")) + goto add_validator; + + if (str_has_prefix(type, "__rel_loc ")) { + validator_flags |= VALIDATOR_REL; + goto add_validator; + } + + goto add_field; + +add_validator: + if (strstr(type, "char") != NULL) + validator_flags |= VALIDATOR_ENSURE_NULL; + + validator = kmalloc(sizeof(*validator), GFP_KERNEL_ACCOUNT); + + if (!validator) { + kfree(field); + return -ENOMEM; + } + + validator->flags = validator_flags; + validator->offset = offset; + + /* Want sequential access when validating */ + list_add_tail(&validator->user_event_link, &user->validators); + +add_field: + field->type = type; + field->name = name; + field->offset = offset; + field->size = size; + field->is_signed = is_signed; + field->filter_type = filter_type; + + if (filter_type == FILTER_OTHER) + field->filter_type = filter_assign_type(type); + + list_add(&field->link, &user->fields); + + /* + * Min size from user writes that are required, this does not include + * the size of trace_entry (common fields). 
+ */ + user->min_size = (offset + size) - sizeof(struct trace_entry); + + return 0; +} + +/* + * Parses the values of a field within the description + * Format: type name [size] + */ +static int user_event_parse_field(char *field, struct user_event *user, + u32 *offset) +{ + char *part, *type, *name; + u32 depth = 0, saved_offset = *offset; + int len, size = -EINVAL; + bool is_struct = false; + + field = skip_spaces(field); + + if (*field == '\0') + return 0; + + /* Handle types that have a space within */ + len = str_has_prefix(field, "unsigned "); + if (len) + goto skip_next; + + len = str_has_prefix(field, "struct "); + if (len) { + is_struct = true; + goto skip_next; + } + + len = str_has_prefix(field, "__data_loc unsigned "); + if (len) + goto skip_next; + + len = str_has_prefix(field, "__data_loc "); + if (len) + goto skip_next; + + len = str_has_prefix(field, "__rel_loc unsigned "); + if (len) + goto skip_next; + + len = str_has_prefix(field, "__rel_loc "); + if (len) + goto skip_next; + + goto parse; +skip_next: + type = field; + field = strpbrk(field + len, " "); + + if (field == NULL) + return -EINVAL; + + *field++ = '\0'; + depth++; +parse: + name = NULL; + + while ((part = strsep(&field, " ")) != NULL) { + switch (depth++) { + case FIELD_DEPTH_TYPE: + type = part; + break; + case FIELD_DEPTH_NAME: + name = part; + break; + case FIELD_DEPTH_SIZE: + if (!is_struct) + return -EINVAL; + + if (kstrtou32(part, 10, &size)) + return -EINVAL; + break; + default: + return -EINVAL; + } + } + + if (depth < FIELD_DEPTH_SIZE || !name) + return -EINVAL; + + if (depth == FIELD_DEPTH_SIZE) + size = user_field_size(type); + + if (size == 0) + return -EINVAL; + + if (size < 0) + return size; + + *offset = saved_offset + size; + + return user_event_add_field(user, type, name, saved_offset, size, + type[0] != 'u', FILTER_OTHER); +} + +static int user_event_parse_fields(struct user_event *user, char *args) +{ + char *field; + u32 offset = sizeof(struct trace_entry); + int ret = -EINVAL; + + if (args == NULL) + return 0; + + while ((field = strsep(&args, ";")) != NULL) { + ret = user_event_parse_field(field, user, &offset); + + if (ret) + break; + } + + return ret; +} + +static struct trace_event_fields user_event_fields_array[1]; + +static const char *user_field_format(const char *type) +{ + if (strcmp(type, "s64") == 0) + return "%lld"; + if (strcmp(type, "u64") == 0) + return "%llu"; + if (strcmp(type, "s32") == 0) + return "%d"; + if (strcmp(type, "u32") == 0) + return "%u"; + if (strcmp(type, "int") == 0) + return "%d"; + if (strcmp(type, "unsigned int") == 0) + return "%u"; + if (strcmp(type, "s16") == 0) + return "%d"; + if (strcmp(type, "u16") == 0) + return "%u"; + if (strcmp(type, "short") == 0) + return "%d"; + if (strcmp(type, "unsigned short") == 0) + return "%u"; + if (strcmp(type, "s8") == 0) + return "%d"; + if (strcmp(type, "u8") == 0) + return "%u"; + if (strcmp(type, "char") == 0) + return "%d"; + if (strcmp(type, "unsigned char") == 0) + return "%u"; + if (strstr(type, "char[") != NULL) + return "%s"; + + /* Unknown, likely struct, allowed treat as 64-bit */ + return "%llu"; +} + +static bool user_field_is_dyn_string(const char *type, const char **str_func) +{ + if (str_has_prefix(type, "__data_loc ")) { + *str_func = "__get_str"; + goto check; + } + + if (str_has_prefix(type, "__rel_loc ")) { + *str_func = "__get_rel_str"; + goto check; + } + + return false; +check: + return strstr(type, "char") != NULL; +} + +#define LEN_OR_ZERO (len ? 
len - pos : 0) +static int user_dyn_field_set_string(int argc, const char **argv, int *iout, + char *buf, int len, bool *colon) +{ + int pos = 0, i = *iout; + + *colon = false; + + for (; i < argc; ++i) { + if (i != *iout) + pos += snprintf(buf + pos, LEN_OR_ZERO, " "); + + pos += snprintf(buf + pos, LEN_OR_ZERO, "%s", argv[i]); + + if (strchr(argv[i], ';')) { + ++i; + *colon = true; + break; + } + } + + /* Actual set, advance i */ + if (len != 0) + *iout = i; + + return pos + 1; +} + +static int user_field_set_string(struct ftrace_event_field *field, + char *buf, int len, bool colon) +{ + int pos = 0; + + pos += snprintf(buf + pos, LEN_OR_ZERO, "%s", field->type); + pos += snprintf(buf + pos, LEN_OR_ZERO, " "); + pos += snprintf(buf + pos, LEN_OR_ZERO, "%s", field->name); + + if (str_has_prefix(field->type, "struct ")) + pos += snprintf(buf + pos, LEN_OR_ZERO, " %d", field->size); + + if (colon) + pos += snprintf(buf + pos, LEN_OR_ZERO, ";"); + + return pos + 1; +} + +static int user_event_set_print_fmt(struct user_event *user, char *buf, int len) +{ + struct ftrace_event_field *field; + struct list_head *head = &user->fields; + int pos = 0, depth = 0; + const char *str_func; + + pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); + + list_for_each_entry_reverse(field, head, link) { + if (depth != 0) + pos += snprintf(buf + pos, LEN_OR_ZERO, " "); + + pos += snprintf(buf + pos, LEN_OR_ZERO, "%s=%s", + field->name, user_field_format(field->type)); + + depth++; + } + + pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); + + list_for_each_entry_reverse(field, head, link) { + if (user_field_is_dyn_string(field->type, &str_func)) + pos += snprintf(buf + pos, LEN_OR_ZERO, + ", %s(%s)", str_func, field->name); + else + pos += snprintf(buf + pos, LEN_OR_ZERO, + ", REC->%s", field->name); + } + + return pos + 1; +} +#undef LEN_OR_ZERO + +static int user_event_create_print_fmt(struct user_event *user) +{ + char *print_fmt; + int len; + + len = user_event_set_print_fmt(user, NULL, 0); + + print_fmt = kmalloc(len, GFP_KERNEL_ACCOUNT); + + if (!print_fmt) + return -ENOMEM; + + user_event_set_print_fmt(user, print_fmt, len); + + user->call.print_fmt = print_fmt; + + return 0; +} + +static enum print_line_t user_event_print_trace(struct trace_iterator *iter, + int flags, + struct trace_event *event) +{ + return print_event_fields(iter, event); +} + +static struct trace_event_functions user_event_funcs = { + .trace = user_event_print_trace, +}; + +static int user_event_set_call_visible(struct user_event *user, bool visible) +{ + int ret; + const struct cred *old_cred; + struct cred *cred; + + cred = prepare_creds(); + + if (!cred) + return -ENOMEM; + + /* + * While by default tracefs is locked down, systems can be configured + * to allow user_event files to be less locked down. The extreme case + * being "other" has read/write access to user_events_data/status. + * + * When not locked down, processes may not have permissions to + * add/remove calls themselves to tracefs. We need to temporarily + * switch to root file permission to allow for this scenario. 
+ */ + cred->fsuid = GLOBAL_ROOT_UID; + + old_cred = override_creds(cred); + + if (visible) + ret = trace_add_event_call(&user->call); + else + ret = trace_remove_event_call(&user->call); + + revert_creds(old_cred); + put_cred(cred); + + return ret; +} + +static int destroy_user_event(struct user_event *user) +{ + int ret = 0; + + lockdep_assert_held(&event_mutex); + + /* Must destroy fields before call removal */ + user_event_destroy_fields(user); + + ret = user_event_set_call_visible(user, false); + + if (ret) + return ret; + + dyn_event_remove(&user->devent); + hash_del(&user->node); + + user_event_destroy_validators(user); + kfree(user->call.print_fmt); + kfree(EVENT_NAME(user)); + kfree(user); + + if (current_user_events > 0) + current_user_events--; + else + pr_alert("BUG: Bad current_user_events\n"); + + return ret; +} + +static struct user_event *find_user_event(struct user_event_group *group, + char *name, u32 *outkey) +{ + struct user_event *user; + u32 key = user_event_key(name); + + *outkey = key; + + hash_for_each_possible(group->register_table, user, node, key) + if (!strcmp(EVENT_NAME(user), name)) + return user_event_get(user); + + return NULL; +} + +static int user_event_validate(struct user_event *user, void *data, int len) +{ + struct list_head *head = &user->validators; + struct user_event_validator *validator; + void *pos, *end = data + len; + u32 loc, offset, size; + + list_for_each_entry(validator, head, user_event_link) { + pos = data + validator->offset; + + /* Already done min_size check, no bounds check here */ + loc = *(u32 *)pos; + offset = loc & 0xffff; + size = loc >> 16; + + if (likely(validator->flags & VALIDATOR_REL)) + pos += offset + sizeof(loc); + else + pos = data + offset; + + pos += size; + + if (unlikely(pos > end)) + return -EFAULT; + + if (likely(validator->flags & VALIDATOR_ENSURE_NULL)) + if (unlikely(*(char *)(pos - 1) != '\0')) + return -EFAULT; + } + + return 0; +} + +/* + * Writes the user supplied payload out to a trace file. + */ +static void user_event_ftrace(struct user_event *user, struct iov_iter *i, + void *tpdata, bool *faulted) +{ + struct trace_event_file *file; + struct trace_entry *entry; + struct trace_event_buffer event_buffer; + size_t size = sizeof(*entry) + i->count; + + file = (struct trace_event_file *)tpdata; + + if (!file || + !(file->flags & EVENT_FILE_FL_ENABLED) || + trace_trigger_soft_disabled(file)) + return; + + /* Allocates and fills trace_entry, + 1 of this is data payload */ + entry = trace_event_buffer_reserve(&event_buffer, file, size); + + if (unlikely(!entry)) + return; + + if (unlikely(i->count != 0 && !copy_nofault(entry + 1, i->count, i))) + goto discard; + + if (!list_empty(&user->validators) && + unlikely(user_event_validate(user, entry, size))) + goto discard; + + trace_event_buffer_commit(&event_buffer); + + return; +discard: + *faulted = true; + __trace_event_discard_commit(event_buffer.buffer, + event_buffer.event); +} + +#ifdef CONFIG_PERF_EVENTS +/* + * Writes the user supplied payload out to perf ring buffer. 
+ */ +static void user_event_perf(struct user_event *user, struct iov_iter *i, + void *tpdata, bool *faulted) +{ + struct hlist_head *perf_head; + + perf_head = this_cpu_ptr(user->call.perf_events); + + if (perf_head && !hlist_empty(perf_head)) { + struct trace_entry *perf_entry; + struct pt_regs *regs; + size_t size = sizeof(*perf_entry) + i->count; + int context; + + perf_entry = perf_trace_buf_alloc(ALIGN(size, 8), + ®s, &context); + + if (unlikely(!perf_entry)) + return; + + perf_fetch_caller_regs(regs); + + if (unlikely(i->count != 0 && !copy_nofault(perf_entry + 1, i->count, i))) + goto discard; + + if (!list_empty(&user->validators) && + unlikely(user_event_validate(user, perf_entry, size))) + goto discard; + + perf_trace_buf_submit(perf_entry, size, context, + user->call.event.type, 1, regs, + perf_head, NULL); + + return; +discard: + *faulted = true; + perf_swevent_put_recursion_context(context); + } +} +#endif + +/* + * Update the enabled bit among all user processes. + */ +static void update_enable_bit_for(struct user_event *user) +{ + struct tracepoint *tp = &user->tracepoint; + char status = 0; + + if (atomic_read(&tp->key.enabled) > 0) { + struct tracepoint_func *probe_func_ptr; + user_event_func_t probe_func; + + rcu_read_lock_sched(); + + probe_func_ptr = rcu_dereference_sched(tp->funcs); + + if (probe_func_ptr) { + do { + probe_func = probe_func_ptr->func; + + if (probe_func == user_event_ftrace) + status |= EVENT_STATUS_FTRACE; +#ifdef CONFIG_PERF_EVENTS + else if (probe_func == user_event_perf) + status |= EVENT_STATUS_PERF; +#endif + else + status |= EVENT_STATUS_OTHER; + } while ((++probe_func_ptr)->func); + } + + rcu_read_unlock_sched(); + } + + user->status = status; + + user_event_enabler_update(user); +} + +/* + * Register callback for our events from tracing sub-systems. 
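+ *
+ * TRACE_REG_REGISTER and TRACE_REG_PERF_REGISTER take an extra reference
+ * on the user_event and refresh its status byte via update_enable_bit_for();
+ * the matching UNREGISTER paths refresh the status and drop that reference.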
+ */ +static int user_event_reg(struct trace_event_call *call, + enum trace_reg type, + void *data) +{ + struct user_event *user = (struct user_event *)call->data; + int ret = 0; + + if (!user) + return -ENOENT; + + switch (type) { + case TRACE_REG_REGISTER: + ret = tracepoint_probe_register(call->tp, + call->class->probe, + data); + if (!ret) + goto inc; + break; + + case TRACE_REG_UNREGISTER: + tracepoint_probe_unregister(call->tp, + call->class->probe, + data); + goto dec; + +#ifdef CONFIG_PERF_EVENTS + case TRACE_REG_PERF_REGISTER: + ret = tracepoint_probe_register(call->tp, + call->class->perf_probe, + data); + if (!ret) + goto inc; + break; + + case TRACE_REG_PERF_UNREGISTER: + tracepoint_probe_unregister(call->tp, + call->class->perf_probe, + data); + goto dec; + + case TRACE_REG_PERF_OPEN: + case TRACE_REG_PERF_CLOSE: + case TRACE_REG_PERF_ADD: + case TRACE_REG_PERF_DEL: + break; +#endif + } + + return ret; +inc: + user_event_get(user); + update_enable_bit_for(user); + return 0; +dec: + update_enable_bit_for(user); + user_event_put(user, true); + return 0; +} + +static int user_event_create(const char *raw_command) +{ + struct user_event_group *group; + struct user_event *user; + char *name; + int ret; + + if (!str_has_prefix(raw_command, USER_EVENTS_PREFIX)) + return -ECANCELED; + + raw_command += USER_EVENTS_PREFIX_LEN; + raw_command = skip_spaces(raw_command); + + name = kstrdup(raw_command, GFP_KERNEL_ACCOUNT); + + if (!name) + return -ENOMEM; + + group = current_user_event_group(); + + if (!group) { + kfree(name); + return -ENOENT; + } + + mutex_lock(&group->reg_mutex); + + /* Dyn events persist, otherwise they would cleanup immediately */ + ret = user_event_parse_cmd(group, name, &user, USER_EVENT_REG_PERSIST); + + if (!ret) + user_event_put(user, false); + + mutex_unlock(&group->reg_mutex); + + if (ret) + kfree(name); + + return ret; +} + +static int user_event_show(struct seq_file *m, struct dyn_event *ev) +{ + struct user_event *user = container_of(ev, struct user_event, devent); + struct ftrace_event_field *field; + struct list_head *head; + int depth = 0; + + seq_printf(m, "%s%s", USER_EVENTS_PREFIX, EVENT_NAME(user)); + + head = trace_get_fields(&user->call); + + list_for_each_entry_reverse(field, head, link) { + if (depth == 0) + seq_puts(m, " "); + else + seq_puts(m, "; "); + + seq_printf(m, "%s %s", field->type, field->name); + + if (str_has_prefix(field->type, "struct ")) + seq_printf(m, " %d", field->size); + + depth++; + } + + seq_puts(m, "\n"); + + return 0; +} + +static bool user_event_is_busy(struct dyn_event *ev) +{ + struct user_event *user = container_of(ev, struct user_event, devent); + + return !user_event_last_ref(user); +} + +static int user_event_free(struct dyn_event *ev) +{ + struct user_event *user = container_of(ev, struct user_event, devent); + + if (!user_event_last_ref(user)) + return -EBUSY; + + return destroy_user_event(user); +} + +static bool user_field_match(struct ftrace_event_field *field, int argc, + const char **argv, int *iout) +{ + char *field_name = NULL, *dyn_field_name = NULL; + bool colon = false, match = false; + int dyn_len, len; + + if (*iout >= argc) + return false; + + dyn_len = user_dyn_field_set_string(argc, argv, iout, dyn_field_name, + 0, &colon); + + len = user_field_set_string(field, field_name, 0, colon); + + if (dyn_len != len) + return false; + + dyn_field_name = kmalloc(dyn_len, GFP_KERNEL); + field_name = kmalloc(len, GFP_KERNEL); + + if (!dyn_field_name || !field_name) + goto out; + + 
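+	/*
+	 * Second pass: the calls above with a zero length only computed the
+	 * required buffer sizes. Now render both the user-supplied and the
+	 * existing field descriptions and compare them as plain strings.
+	 */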
user_dyn_field_set_string(argc, argv, iout, dyn_field_name, + dyn_len, &colon); + + user_field_set_string(field, field_name, len, colon); + + match = strcmp(dyn_field_name, field_name) == 0; +out: + kfree(dyn_field_name); + kfree(field_name); + + return match; +} + +static bool user_fields_match(struct user_event *user, int argc, + const char **argv) +{ + struct ftrace_event_field *field; + struct list_head *head = &user->fields; + int i = 0; + + list_for_each_entry_reverse(field, head, link) { + if (!user_field_match(field, argc, argv, &i)) + return false; + } + + if (i != argc) + return false; + + return true; +} + +static bool user_event_match(const char *system, const char *event, + int argc, const char **argv, struct dyn_event *ev) +{ + struct user_event *user = container_of(ev, struct user_event, devent); + bool match; + + match = strcmp(EVENT_NAME(user), event) == 0 && + (!system || strcmp(system, USER_EVENTS_SYSTEM) == 0); + + if (match && argc > 0) + match = user_fields_match(user, argc, argv); + else if (match && argc == 0) + match = list_empty(&user->fields); + + return match; +} + +static struct dyn_event_operations user_event_dops = { + .create = user_event_create, + .show = user_event_show, + .is_busy = user_event_is_busy, + .free = user_event_free, + .match = user_event_match, +}; + +static int user_event_trace_register(struct user_event *user) +{ + int ret; + + ret = register_trace_event(&user->call.event); + + if (!ret) + return -ENODEV; + + ret = user_event_set_call_visible(user, true); + + if (ret) + unregister_trace_event(&user->call.event); + + return ret; +} + +/* + * Parses the event name, arguments and flags then registers if successful. + * The name buffer lifetime is owned by this method for success cases only. + * Upon success the returned user_event has its ref count increased by 1. + */ +static int user_event_parse(struct user_event_group *group, char *name, + char *args, char *flags, + struct user_event **newuser, int reg_flags) +{ + int ret; + u32 key; + struct user_event *user; + int argc = 0; + char **argv; + + /* User register flags are not ready yet */ + if (reg_flags != 0 || flags != NULL) + return -EINVAL; + + /* Prevent dyn_event from racing */ + mutex_lock(&event_mutex); + user = find_user_event(group, name, &key); + mutex_unlock(&event_mutex); + + if (user) { + if (args) { + argv = argv_split(GFP_KERNEL, args, &argc); + if (!argv) { + ret = -ENOMEM; + goto error; + } + + ret = user_fields_match(user, argc, (const char **)argv); + argv_free(argv); + + } else + ret = list_empty(&user->fields); + + if (ret) { + *newuser = user; + /* + * Name is allocated by caller, free it since it already exists. + * Caller only worries about failure cases for freeing. 
+ */ + kfree(name); + } else { + ret = -EADDRINUSE; + goto error; + } + + return 0; +error: + user_event_put(user, false); + return ret; + } + + user = kzalloc(sizeof(*user), GFP_KERNEL_ACCOUNT); + + if (!user) + return -ENOMEM; + + INIT_LIST_HEAD(&user->class.fields); + INIT_LIST_HEAD(&user->fields); + INIT_LIST_HEAD(&user->validators); + + user->group = group; + user->tracepoint.name = name; + + ret = user_event_parse_fields(user, args); + + if (ret) + goto put_user; + + ret = user_event_create_print_fmt(user); + + if (ret) + goto put_user; + + user->call.data = user; + user->call.class = &user->class; + user->call.name = name; + user->call.flags = TRACE_EVENT_FL_TRACEPOINT; + user->call.tp = &user->tracepoint; + user->call.event.funcs = &user_event_funcs; + user->class.system = group->system_name; + + user->class.fields_array = user_event_fields_array; + user->class.get_fields = user_event_get_fields; + user->class.reg = user_event_reg; + user->class.probe = user_event_ftrace; +#ifdef CONFIG_PERF_EVENTS + user->class.perf_probe = user_event_perf; +#endif + + mutex_lock(&event_mutex); + + if (current_user_events >= max_user_events) { + ret = -EMFILE; + goto put_user_lock; + } + + ret = user_event_trace_register(user); + + if (ret) + goto put_user_lock; + + user->reg_flags = reg_flags; + + if (user->reg_flags & USER_EVENT_REG_PERSIST) { + /* Ensure we track self ref and caller ref (2) */ + refcount_set(&user->refcnt, 2); + } else { + /* Ensure we track only caller ref (1) */ + refcount_set(&user->refcnt, 1); + } + + dyn_event_init(&user->devent, &user_event_dops); + dyn_event_add(&user->devent, &user->call); + hash_add(group->register_table, &user->node, key); + current_user_events++; + + mutex_unlock(&event_mutex); + + *newuser = user; + return 0; +put_user_lock: + mutex_unlock(&event_mutex); +put_user: + user_event_destroy_fields(user); + user_event_destroy_validators(user); + kfree(user->call.print_fmt); + kfree(user); + return ret; +} + +/* + * Deletes a previously created event if it is no longer being used. + */ +static int delete_user_event(struct user_event_group *group, char *name) +{ + u32 key; + struct user_event *user = find_user_event(group, name, &key); + + if (!user) + return -ENOENT; + + user_event_put(user, true); + + if (!user_event_last_ref(user)) + return -EBUSY; + + return destroy_user_event(user); +} + +/* + * Validates the user payload and writes via iterator. + */ +static ssize_t user_events_write_core(struct file *file, struct iov_iter *i) +{ + struct user_event_file_info *info = file->private_data; + struct user_event_refs *refs; + struct user_event *user = NULL; + struct tracepoint *tp; + ssize_t ret = i->count; + int idx; + + if (unlikely(copy_from_iter(&idx, sizeof(idx), i) != sizeof(idx))) + return -EFAULT; + + if (idx < 0) + return -EINVAL; + + rcu_read_lock_sched(); + + refs = rcu_dereference_sched(info->refs); + + /* + * The refs->events array is protected by RCU, and new items may be + * added. But the user retrieved from indexing into the events array + * shall be immutable while the file is opened. + */ + if (likely(refs && idx < refs->count)) + user = refs->events[idx]; + + rcu_read_unlock_sched(); + + if (unlikely(user == NULL)) + return -ENOENT; + + if (unlikely(i->count < user->min_size)) + return -EINVAL; + + tp = &user->tracepoint; + + /* + * It's possible key.enabled disables after this check, however + * we don't mind if a few events are included in this condition. 
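+ *
+ * As a side note, the iterator reaching this point came from a plain
+ * user-space write() whose buffer starts with the 32-bit write_index
+ * returned at registration, followed by the raw field payload. A minimal,
+ * purely illustrative sketch for an event with a single u32 field:
+ *
+ *	struct { __u32 write_index; __u32 count; } buf = { idx, 1 };
+ *	write(data_fd, &buf, sizeof(buf));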
+ */ + if (likely(atomic_read(&tp->key.enabled) > 0)) { + struct tracepoint_func *probe_func_ptr; + user_event_func_t probe_func; + struct iov_iter copy; + void *tpdata; + bool faulted; + + if (unlikely(fault_in_iov_iter_readable(i, i->count))) + return -EFAULT; + + faulted = false; + + rcu_read_lock_sched(); + + probe_func_ptr = rcu_dereference_sched(tp->funcs); + + if (probe_func_ptr) { + do { + copy = *i; + probe_func = probe_func_ptr->func; + tpdata = probe_func_ptr->data; + probe_func(user, ©, tpdata, &faulted); + } while ((++probe_func_ptr)->func); + } + + rcu_read_unlock_sched(); + + if (unlikely(faulted)) + return -EFAULT; + } else + return -EBADF; + + return ret; +} + +static int user_events_open(struct inode *node, struct file *file) +{ + struct user_event_group *group; + struct user_event_file_info *info; + + group = current_user_event_group(); + + if (!group) + return -ENOENT; + + info = kzalloc(sizeof(*info), GFP_KERNEL_ACCOUNT); + + if (!info) + return -ENOMEM; + + info->group = group; + + file->private_data = info; + + return 0; +} + +static ssize_t user_events_write(struct file *file, const char __user *ubuf, + size_t count, loff_t *ppos) +{ + struct iovec iov; + struct iov_iter i; + + if (unlikely(*ppos != 0)) + return -EFAULT; + + if (unlikely(import_single_range(ITER_SOURCE, (char __user *)ubuf, + count, &iov, &i))) + return -EFAULT; + + return user_events_write_core(file, &i); +} + +static ssize_t user_events_write_iter(struct kiocb *kp, struct iov_iter *i) +{ + return user_events_write_core(kp->ki_filp, i); +} + +static int user_events_ref_add(struct user_event_file_info *info, + struct user_event *user) +{ + struct user_event_group *group = info->group; + struct user_event_refs *refs, *new_refs; + int i, size, count = 0; + + refs = rcu_dereference_protected(info->refs, + lockdep_is_held(&group->reg_mutex)); + + if (refs) { + count = refs->count; + + for (i = 0; i < count; ++i) + if (refs->events[i] == user) + return i; + } + + size = struct_size(refs, events, count + 1); + + new_refs = kzalloc(size, GFP_KERNEL_ACCOUNT); + + if (!new_refs) + return -ENOMEM; + + new_refs->count = count + 1; + + for (i = 0; i < count; ++i) + new_refs->events[i] = refs->events[i]; + + new_refs->events[i] = user_event_get(user); + + rcu_assign_pointer(info->refs, new_refs); + + if (refs) + kfree_rcu(refs, rcu); + + return i; +} + +static long user_reg_get(struct user_reg __user *ureg, struct user_reg *kreg) +{ + u32 size; + long ret; + + ret = get_user(size, &ureg->size); + + if (ret) + return ret; + + if (size > PAGE_SIZE) + return -E2BIG; + + if (size < offsetofend(struct user_reg, write_index)) + return -EINVAL; + + ret = copy_struct_from_user(kreg, sizeof(*kreg), ureg, size); + + if (ret) + return ret; + + /* Ensure only valid flags */ + if (kreg->flags & ~(USER_EVENT_REG_MAX-1)) + return -EINVAL; + + /* Ensure supported size */ + switch (kreg->enable_size) { + case 4: + /* 32-bit */ + break; +#if BITS_PER_LONG >= 64 + case 8: + /* 64-bit */ + break; +#endif + default: + return -EINVAL; + } + + /* Ensure natural alignment */ + if (kreg->enable_addr % kreg->enable_size) + return -EINVAL; + + /* Ensure bit range for size */ + if (kreg->enable_bit > (kreg->enable_size * BITS_PER_BYTE) - 1) + return -EINVAL; + + /* Ensure accessible */ + if (!access_ok((const void __user *)(uintptr_t)kreg->enable_addr, + kreg->enable_size)) + return -EFAULT; + + kreg->size = size; + + return 0; +} + +/* + * Registers a user_event on behalf of a user process. 
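+ *
+ * A minimal user-space sketch of the expected usage (the event name, the
+ * single u32 field and the "enabled" variable are illustrative only):
+ *
+ *	int enabled = 0;
+ *	struct user_reg reg = {0};
+ *
+ *	reg.size = sizeof(reg);
+ *	reg.enable_bit = 31;
+ *	reg.enable_size = sizeof(enabled);
+ *	reg.enable_addr = (__u64)&enabled;
+ *	reg.name_args = (__u64)"test u32 count";
+ *
+ *	if (ioctl(data_fd, DIAG_IOCSREG, &reg) == 0)
+ *		write_index = reg.write_index;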
+ */ +static long user_events_ioctl_reg(struct user_event_file_info *info, + unsigned long uarg) +{ + struct user_reg __user *ureg = (struct user_reg __user *)uarg; + struct user_reg reg; + struct user_event *user; + struct user_event_enabler *enabler; + char *name; + long ret; + int write_result; + + ret = user_reg_get(ureg, ®); + + if (ret) + return ret; + + /* + * Prevent users from using the same address and bit multiple times + * within the same mm address space. This can cause unexpected behavior + * for user processes that is far easier to debug if this is explictly + * an error upon registering. + */ + if (current_user_event_enabler_exists((unsigned long)reg.enable_addr, + reg.enable_bit)) + return -EADDRINUSE; + + name = strndup_user((const char __user *)(uintptr_t)reg.name_args, + MAX_EVENT_DESC); + + if (IS_ERR(name)) { + ret = PTR_ERR(name); + return ret; + } + + ret = user_event_parse_cmd(info->group, name, &user, reg.flags); + + if (ret) { + kfree(name); + return ret; + } + + ret = user_events_ref_add(info, user); + + /* No longer need parse ref, ref_add either worked or not */ + user_event_put(user, false); + + /* Positive number is index and valid */ + if (ret < 0) + return ret; + + /* + * user_events_ref_add succeeded: + * At this point we have a user_event, it's lifetime is bound by the + * reference count, not this file. If anything fails, the user_event + * still has a reference until the file is released. During release + * any remaining references (from user_events_ref_add) are decremented. + * + * Attempt to create an enabler, which too has a lifetime tied in the + * same way for the event. Once the task that caused the enabler to be + * created exits or issues exec() then the enablers it has created + * will be destroyed and the ref to the event will be decremented. + */ + enabler = user_event_enabler_create(®, user, &write_result); + + if (!enabler) + return -ENOMEM; + + /* Write failed/faulted, give error back to caller */ + if (write_result) + return write_result; + + put_user((u32)ret, &ureg->write_index); + + return 0; +} + +/* + * Deletes a user_event on behalf of a user process. 
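+ *
+ * From user space this is simply (event name illustrative):
+ *
+ *	ioctl(data_fd, DIAG_IOCSDEL, "test");
+ *
+ * and it only succeeds once no other user of the event remains.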
+ */ +static long user_events_ioctl_del(struct user_event_file_info *info, + unsigned long uarg) +{ + void __user *ubuf = (void __user *)uarg; + char *name; + long ret; + + name = strndup_user(ubuf, MAX_EVENT_DESC); + + if (IS_ERR(name)) + return PTR_ERR(name); + + /* event_mutex prevents dyn_event from racing */ + mutex_lock(&event_mutex); + ret = delete_user_event(info->group, name); + mutex_unlock(&event_mutex); + + kfree(name); + + return ret; +} + +static long user_unreg_get(struct user_unreg __user *ureg, + struct user_unreg *kreg) +{ + u32 size; + long ret; + + ret = get_user(size, &ureg->size); + + if (ret) + return ret; + + if (size > PAGE_SIZE) + return -E2BIG; + + if (size < offsetofend(struct user_unreg, disable_addr)) + return -EINVAL; + + ret = copy_struct_from_user(kreg, sizeof(*kreg), ureg, size); + + /* Ensure no reserved values, since we don't support any yet */ + if (kreg->__reserved || kreg->__reserved2) + return -EINVAL; + + return ret; +} + +static int user_event_mm_clear_bit(struct user_event_mm *user_mm, + unsigned long uaddr, unsigned char bit, + unsigned long flags) +{ + struct user_event_enabler enabler; + int result; + int attempt = 0; + + memset(&enabler, 0, sizeof(enabler)); + enabler.addr = uaddr; + enabler.values = bit | flags; +retry: + /* Prevents state changes from racing with new enablers */ + mutex_lock(&event_mutex); + + /* Force the bit to be cleared, since no event is attached */ + mmap_read_lock(user_mm->mm); + result = user_event_enabler_write(user_mm, &enabler, false, &attempt); + mmap_read_unlock(user_mm->mm); + + mutex_unlock(&event_mutex); + + if (result) { + /* Attempt to fault-in and retry if it worked */ + if (!user_event_mm_fault_in(user_mm, uaddr, attempt)) + goto retry; + } + + return result; +} + +/* + * Unregisters an enablement address/bit within a task/user mm. + */ +static long user_events_ioctl_unreg(unsigned long uarg) +{ + struct user_unreg __user *ureg = (struct user_unreg __user *)uarg; + struct user_event_mm *mm = current->user_event_mm; + struct user_event_enabler *enabler, *next; + struct user_unreg reg; + unsigned long flags; + long ret; + + ret = user_unreg_get(ureg, ®); + + if (ret) + return ret; + + if (!mm) + return -ENOENT; + + flags = 0; + ret = -ENOENT; + + /* + * Flags freeing and faulting are used to indicate if the enabler is in + * use at all. When faulting is set a page-fault is occurring asyncly. + * During async fault if freeing is set, the enabler will be destroyed. + * If no async fault is happening, we can destroy it now since we hold + * the event_mutex during these checks. + */ + mutex_lock(&event_mutex); + + list_for_each_entry_safe(enabler, next, &mm->enablers, mm_enablers_link) { + if (enabler->addr == reg.disable_addr && + ENABLE_BIT(enabler) == reg.disable_bit) { + set_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler)); + + /* We must keep compat flags for the clear */ + flags |= enabler->values & ENABLE_VAL_COMPAT_MASK; + + if (!test_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler))) + user_event_enabler_destroy(enabler, true); + + /* Removed at least one */ + ret = 0; + } + } + + mutex_unlock(&event_mutex); + + /* Ensure bit is now cleared for user, regardless of event status */ + if (!ret) + ret = user_event_mm_clear_bit(mm, reg.disable_addr, + reg.disable_bit, flags); + + return ret; +} + +/* + * Handles the ioctl from user mode to register or alter operations. 
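+ *
+ * All three commands below are serialized on the group's reg_mutex; from
+ * user space they look roughly like:
+ *
+ *	ioctl(data_fd, DIAG_IOCSREG, &reg);	register/look up an event
+ *	ioctl(data_fd, DIAG_IOCSDEL, "name");	delete an unused event
+ *	ioctl(data_fd, DIAG_IOCSUNREG, &unreg);	remove an enabler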
+ */ +static long user_events_ioctl(struct file *file, unsigned int cmd, + unsigned long uarg) +{ + struct user_event_file_info *info = file->private_data; + struct user_event_group *group = info->group; + long ret = -ENOTTY; + + switch (cmd) { + case DIAG_IOCSREG: + mutex_lock(&group->reg_mutex); + ret = user_events_ioctl_reg(info, uarg); + mutex_unlock(&group->reg_mutex); + break; + + case DIAG_IOCSDEL: + mutex_lock(&group->reg_mutex); + ret = user_events_ioctl_del(info, uarg); + mutex_unlock(&group->reg_mutex); + break; + + case DIAG_IOCSUNREG: + mutex_lock(&group->reg_mutex); + ret = user_events_ioctl_unreg(uarg); + mutex_unlock(&group->reg_mutex); + break; + } + + return ret; +} + +/* + * Handles the final close of the file from user mode. + */ +static int user_events_release(struct inode *node, struct file *file) +{ + struct user_event_file_info *info = file->private_data; + struct user_event_group *group; + struct user_event_refs *refs; + int i; + + if (!info) + return -EINVAL; + + group = info->group; + + /* + * Ensure refs cannot change under any situation by taking the + * register mutex during the final freeing of the references. + */ + mutex_lock(&group->reg_mutex); + + refs = info->refs; + + if (!refs) + goto out; + + /* + * The lifetime of refs has reached an end, it's tied to this file. + * The underlying user_events are ref counted, and cannot be freed. + * After this decrement, the user_events may be freed elsewhere. + */ + for (i = 0; i < refs->count; ++i) + user_event_put(refs->events[i], false); + +out: + file->private_data = NULL; + + mutex_unlock(&group->reg_mutex); + + kfree(refs); + kfree(info); + + return 0; +} + +static const struct file_operations user_data_fops = { + .open = user_events_open, + .write = user_events_write, + .write_iter = user_events_write_iter, + .unlocked_ioctl = user_events_ioctl, + .release = user_events_release, +}; + +static void *user_seq_start(struct seq_file *m, loff_t *pos) +{ + if (*pos) + return NULL; + + return (void *)1; +} + +static void *user_seq_next(struct seq_file *m, void *p, loff_t *pos) +{ + ++*pos; + return NULL; +} + +static void user_seq_stop(struct seq_file *m, void *p) +{ +} + +static int user_seq_show(struct seq_file *m, void *p) +{ + struct user_event_group *group = m->private; + struct user_event *user; + char status; + int i, active = 0, busy = 0; + + if (!group) + return -EINVAL; + + mutex_lock(&group->reg_mutex); + + hash_for_each(group->register_table, i, user, node) { + status = user->status; + + seq_printf(m, "%s", EVENT_NAME(user)); + + if (status != 0) + seq_puts(m, " #"); + + if (status != 0) { + seq_puts(m, " Used by"); + if (status & EVENT_STATUS_FTRACE) + seq_puts(m, " ftrace"); + if (status & EVENT_STATUS_PERF) + seq_puts(m, " perf"); + if (status & EVENT_STATUS_OTHER) + seq_puts(m, " other"); + busy++; + } + + seq_puts(m, "\n"); + active++; + } + + mutex_unlock(&group->reg_mutex); + + seq_puts(m, "\n"); + seq_printf(m, "Active: %d\n", active); + seq_printf(m, "Busy: %d\n", busy); + + return 0; +} + +static const struct seq_operations user_seq_ops = { + .start = user_seq_start, + .next = user_seq_next, + .stop = user_seq_stop, + .show = user_seq_show, +}; + +static int user_status_open(struct inode *node, struct file *file) +{ + struct user_event_group *group; + int ret; + + group = current_user_event_group(); + + if (!group) + return -ENOENT; + + ret = seq_open(file, &user_seq_ops); + + if (!ret) { + /* Chain group to seq_file */ + struct seq_file *m = file->private_data; + + m->private = group; + } + + 
return ret; +} + +static const struct file_operations user_status_fops = { + .open = user_status_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +/* + * Creates a set of tracefs files to allow user mode interactions. + */ +static int create_user_tracefs(void) +{ + struct dentry *edata, *emmap; + + edata = tracefs_create_file("user_events_data", TRACE_MODE_WRITE, + NULL, NULL, &user_data_fops); + + if (!edata) { + pr_warn("Could not create tracefs 'user_events_data' entry\n"); + goto err; + } + + emmap = tracefs_create_file("user_events_status", TRACE_MODE_READ, + NULL, NULL, &user_status_fops); + + if (!emmap) { + tracefs_remove(edata); + pr_warn("Could not create tracefs 'user_events_mmap' entry\n"); + goto err; + } + + return 0; +err: + return -ENODEV; +} + +static int set_max_user_events_sysctl(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int ret; + + mutex_lock(&event_mutex); + + ret = proc_douintvec(table, write, buffer, lenp, ppos); + + mutex_unlock(&event_mutex); + + return ret; +} + +static struct ctl_table user_event_sysctls[] = { + { + .procname = "user_events_max", + .data = &max_user_events, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = set_max_user_events_sysctl, + }, + {} +}; + +static int __init trace_events_user_init(void) +{ + int ret; + + fault_cache = KMEM_CACHE(user_event_enabler_fault, 0); + + if (!fault_cache) + return -ENOMEM; + + init_group = user_event_group_create(); + + if (!init_group) { + kmem_cache_destroy(fault_cache); + return -ENOMEM; + } + + ret = create_user_tracefs(); + + if (ret) { + pr_warn("user_events could not register with tracefs\n"); + user_event_group_destroy(init_group); + kmem_cache_destroy(fault_cache); + init_group = NULL; + return ret; + } + + if (dyn_event_register(&user_event_dops)) + pr_warn("user_events could not register with dyn_events\n"); + + register_sysctl_init("kernel", user_event_sysctls); + + return 0; +} + +fs_initcall(trace_events_user_init); diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c new file mode 100644 index 0000000000..1698fc22af --- /dev/null +++ b/kernel/trace/trace_export.c @@ -0,0 +1,201 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * trace_export.c - export basic ftrace utilities to user space + * + * Copyright (C) 2009 Steven Rostedt <srostedt@redhat.com> + */ +#include <linux/stringify.h> +#include <linux/kallsyms.h> +#include <linux/seq_file.h> +#include <linux/uaccess.h> +#include <linux/ftrace.h> +#include <linux/module.h> +#include <linux/init.h> + +#include "trace_output.h" + +/* Stub function for events with triggers */ +static int ftrace_event_register(struct trace_event_call *call, + enum trace_reg type, void *data) +{ + return 0; +} + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM ftrace + +/* + * The FTRACE_ENTRY_REG macro allows ftrace entry to define register + * function and thus become accessible via perf. 
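+ *
+ * Note that trace_entries.h is pulled in three times below, with the
+ * helper macros redefined before each pass: first to generate compile-time
+ * F_printk() checks, then to emit the trace_event_fields arrays, and
+ * finally to emit the trace_event_call definitions placed in the
+ * _ftrace_events section.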
+ */ +#undef FTRACE_ENTRY_REG +#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, regfn) \ + FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print)) + +/* not needed for this file */ +#undef __field_struct +#define __field_struct(type, item) + +#undef __field +#define __field(type, item) type item; + +#undef __field_fn +#define __field_fn(type, item) type item; + +#undef __field_desc +#define __field_desc(type, container, item) type item; + +#undef __field_packed +#define __field_packed(type, container, item) type item; + +#undef __array +#define __array(type, item, size) type item[size]; + +#undef __stack_array +#define __stack_array(type, item, size, field) __array(type, item, size) + +#undef __array_desc +#define __array_desc(type, container, item, size) type item[size]; + +#undef __dynamic_array +#define __dynamic_array(type, item) type item[]; + +#undef F_STRUCT +#define F_STRUCT(args...) args + +#undef F_printk +#define F_printk(fmt, args...) fmt, args + +#undef FTRACE_ENTRY +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ +struct ____ftrace_##name { \ + tstruct \ +}; \ +static void __always_unused ____ftrace_check_##name(void) \ +{ \ + struct ____ftrace_##name *__entry = NULL; \ + \ + /* force compile-time check on F_printk() */ \ + printk(print); \ +} + +#undef FTRACE_ENTRY_DUP +#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print) \ + FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print)) + +#include "trace_entries.h" + +#undef __field_ext +#define __field_ext(_type, _item, _filter_type) { \ + .type = #_type, .name = #_item, \ + .size = sizeof(_type), .align = __alignof__(_type), \ + is_signed_type(_type), .filter_type = _filter_type }, + + +#undef __field_ext_packed +#define __field_ext_packed(_type, _item, _filter_type) { \ + .type = #_type, .name = #_item, \ + .size = sizeof(_type), .align = 1, \ + is_signed_type(_type), .filter_type = _filter_type }, + +#undef __field +#define __field(_type, _item) __field_ext(_type, _item, FILTER_OTHER) + +#undef __field_fn +#define __field_fn(_type, _item) __field_ext(_type, _item, FILTER_TRACE_FN) + +#undef __field_desc +#define __field_desc(_type, _container, _item) __field_ext(_type, _item, FILTER_OTHER) + +#undef __field_packed +#define __field_packed(_type, _container, _item) __field_ext_packed(_type, _item, FILTER_OTHER) + +#undef __array +#define __array(_type, _item, _len) { \ + .type = #_type"["__stringify(_len)"]", .name = #_item, \ + .size = sizeof(_type[_len]), .align = __alignof__(_type), \ + is_signed_type(_type), .filter_type = FILTER_OTHER, \ + .len = _len }, + +#undef __stack_array +#define __stack_array(_type, _item, _len, _field) __array(_type, _item, _len) + +#undef __array_desc +#define __array_desc(_type, _container, _item, _len) __array(_type, _item, _len) + +#undef __dynamic_array +#define __dynamic_array(_type, _item) { \ + .type = #_type "[]", .name = #_item, \ + .size = 0, .align = __alignof__(_type), \ + is_signed_type(_type), .filter_type = FILTER_OTHER }, + +#undef FTRACE_ENTRY +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ +static struct trace_event_fields ftrace_event_fields_##name[] = { \ + tstruct \ + {} }; + +#include "trace_entries.h" + +#undef __entry +#define __entry REC + +#undef __field +#define __field(type, item) + +#undef __field_fn +#define __field_fn(type, item) + +#undef __field_desc +#define __field_desc(type, container, item) + +#undef __field_packed +#define __field_packed(type, container, item) + +#undef __array 
+#define __array(type, item, len) + +#undef __stack_array +#define __stack_array(type, item, len, field) + +#undef __array_desc +#define __array_desc(type, container, item, len) + +#undef __dynamic_array +#define __dynamic_array(type, item) + +#undef F_printk +#define F_printk(fmt, args...) __stringify(fmt) ", " __stringify(args) + +#undef FTRACE_ENTRY_REG +#define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, regfn) \ +static struct trace_event_class __refdata event_class_ftrace_##call = { \ + .system = __stringify(TRACE_SYSTEM), \ + .fields_array = ftrace_event_fields_##call, \ + .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ + .reg = regfn, \ +}; \ + \ +struct trace_event_call __used event_##call = { \ + .class = &event_class_ftrace_##call, \ + { \ + .name = #call, \ + }, \ + .event.type = etype, \ + .print_fmt = print, \ + .flags = TRACE_EVENT_FL_IGNORE_ENABLE, \ +}; \ +static struct trace_event_call __used \ +__section("_ftrace_events") *__event_##call = &event_##call; + +#undef FTRACE_ENTRY +#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print) \ + FTRACE_ENTRY_REG(call, struct_name, etype, \ + PARAMS(tstruct), PARAMS(print), NULL) + +bool ftrace_event_is_function(struct trace_event_call *call) +{ + return call == &event_function; +} + +#include "trace_entries.h" diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c new file mode 100644 index 0000000000..7d2ddbcfa3 --- /dev/null +++ b/kernel/trace/trace_fprobe.c @@ -0,0 +1,1231 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Fprobe-based tracing events + * Copyright (C) 2022 Google LLC. + */ +#define pr_fmt(fmt) "trace_fprobe: " fmt + +#include <linux/fprobe.h> +#include <linux/module.h> +#include <linux/rculist.h> +#include <linux/security.h> +#include <linux/tracepoint.h> +#include <linux/uaccess.h> + +#include "trace_dynevent.h" +#include "trace_probe.h" +#include "trace_probe_kernel.h" +#include "trace_probe_tmpl.h" + +#define FPROBE_EVENT_SYSTEM "fprobes" +#define TRACEPOINT_EVENT_SYSTEM "tracepoints" +#define RETHOOK_MAXACTIVE_MAX 4096 + +static int trace_fprobe_create(const char *raw_command); +static int trace_fprobe_show(struct seq_file *m, struct dyn_event *ev); +static int trace_fprobe_release(struct dyn_event *ev); +static bool trace_fprobe_is_busy(struct dyn_event *ev); +static bool trace_fprobe_match(const char *system, const char *event, + int argc, const char **argv, struct dyn_event *ev); + +static struct dyn_event_operations trace_fprobe_ops = { + .create = trace_fprobe_create, + .show = trace_fprobe_show, + .is_busy = trace_fprobe_is_busy, + .free = trace_fprobe_release, + .match = trace_fprobe_match, +}; + +/* + * Fprobe event core functions + */ +struct trace_fprobe { + struct dyn_event devent; + struct fprobe fp; + const char *symbol; + struct tracepoint *tpoint; + struct module *mod; + struct trace_probe tp; +}; + +static bool is_trace_fprobe(struct dyn_event *ev) +{ + return ev->ops == &trace_fprobe_ops; +} + +static struct trace_fprobe *to_trace_fprobe(struct dyn_event *ev) +{ + return container_of(ev, struct trace_fprobe, devent); +} + +/** + * for_each_trace_fprobe - iterate over the trace_fprobe list + * @pos: the struct trace_fprobe * for each entry + * @dpos: the struct dyn_event * to use as a loop cursor + */ +#define for_each_trace_fprobe(pos, dpos) \ + for_each_dyn_event(dpos) \ + if (is_trace_fprobe(dpos) && (pos = to_trace_fprobe(dpos))) + +static bool trace_fprobe_is_return(struct trace_fprobe *tf) +{ + return tf->fp.exit_handler != NULL; +} + 
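+/*
+ * Roughly, the dynamic event commands parsed by this file take the
+ * following shapes (illustrative; see Documentation/trace/fprobetrace.rst):
+ *
+ *	f[:[GRP/][EVENT]] FUNCTION [FETCHARGS]				: entry probe
+ *	f[MAXACTIVE][:[GRP/][EVENT]] FUNCTION%return [FETCHARGS]	: exit probe
+ *	t[:[GRP/][EVENT]] TRACEPOINT [FETCHARGS]			: tracepoint probe
+ *
+ * When GRP is omitted, "fprobes" or "tracepoints" is used as the group.
+ */
+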
+static bool trace_fprobe_is_tracepoint(struct trace_fprobe *tf) +{ + return tf->tpoint != NULL; +} + +static const char *trace_fprobe_symbol(struct trace_fprobe *tf) +{ + return tf->symbol ? tf->symbol : "unknown"; +} + +static bool trace_fprobe_is_busy(struct dyn_event *ev) +{ + struct trace_fprobe *tf = to_trace_fprobe(ev); + + return trace_probe_is_enabled(&tf->tp); +} + +static bool trace_fprobe_match_command_head(struct trace_fprobe *tf, + int argc, const char **argv) +{ + char buf[MAX_ARGSTR_LEN + 1]; + + if (!argc) + return true; + + snprintf(buf, sizeof(buf), "%s", trace_fprobe_symbol(tf)); + if (strcmp(buf, argv[0])) + return false; + argc--; argv++; + + return trace_probe_match_command_args(&tf->tp, argc, argv); +} + +static bool trace_fprobe_match(const char *system, const char *event, + int argc, const char **argv, struct dyn_event *ev) +{ + struct trace_fprobe *tf = to_trace_fprobe(ev); + + if (event[0] != '\0' && strcmp(trace_probe_name(&tf->tp), event)) + return false; + + if (system && strcmp(trace_probe_group_name(&tf->tp), system)) + return false; + + return trace_fprobe_match_command_head(tf, argc, argv); +} + +static bool trace_fprobe_is_registered(struct trace_fprobe *tf) +{ + return fprobe_is_registered(&tf->fp); +} + +/* + * Note that we don't verify the fetch_insn code, since it does not come + * from user space. + */ +static int +process_fetch_insn(struct fetch_insn *code, void *rec, void *dest, + void *base) +{ + struct pt_regs *regs = rec; + unsigned long val; + int ret; + +retry: + /* 1st stage: get value from context */ + switch (code->op) { + case FETCH_OP_STACK: + val = regs_get_kernel_stack_nth(regs, code->param); + break; + case FETCH_OP_STACKP: + val = kernel_stack_pointer(regs); + break; + case FETCH_OP_RETVAL: + val = regs_return_value(regs); + break; +#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API + case FETCH_OP_ARG: + val = regs_get_kernel_argument(regs, code->param); + break; +#endif + case FETCH_NOP_SYMBOL: /* Ignore a place holder */ + code++; + goto retry; + default: + ret = process_common_fetch_insn(code, &val); + if (ret < 0) + return ret; + } + code++; + + return process_fetch_insn_bottom(code, val, dest, base); +} +NOKPROBE_SYMBOL(process_fetch_insn) + +/* function entry handler */ +static nokprobe_inline void +__fentry_trace_func(struct trace_fprobe *tf, unsigned long entry_ip, + struct pt_regs *regs, + struct trace_event_file *trace_file) +{ + struct fentry_trace_entry_head *entry; + struct trace_event_call *call = trace_probe_event_call(&tf->tp); + struct trace_event_buffer fbuffer; + int dsize; + + if (WARN_ON_ONCE(call != trace_file->event_call)) + return; + + if (trace_trigger_soft_disabled(trace_file)) + return; + + dsize = __get_data_size(&tf->tp, regs); + + entry = trace_event_buffer_reserve(&fbuffer, trace_file, + sizeof(*entry) + tf->tp.size + dsize); + if (!entry) + return; + + fbuffer.regs = regs; + entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event); + entry->ip = entry_ip; + store_trace_args(&entry[1], &tf->tp, regs, sizeof(*entry), dsize); + + trace_event_buffer_commit(&fbuffer); +} + +static void +fentry_trace_func(struct trace_fprobe *tf, unsigned long entry_ip, + struct pt_regs *regs) +{ + struct event_file_link *link; + + trace_probe_for_each_link_rcu(link, &tf->tp) + __fentry_trace_func(tf, entry_ip, regs, link->file); +} +NOKPROBE_SYMBOL(fentry_trace_func); + +/* Kretprobe handler */ +static nokprobe_inline void +__fexit_trace_func(struct trace_fprobe *tf, unsigned long entry_ip, + unsigned long ret_ip, struct 
pt_regs *regs, + struct trace_event_file *trace_file) +{ + struct fexit_trace_entry_head *entry; + struct trace_event_buffer fbuffer; + struct trace_event_call *call = trace_probe_event_call(&tf->tp); + int dsize; + + if (WARN_ON_ONCE(call != trace_file->event_call)) + return; + + if (trace_trigger_soft_disabled(trace_file)) + return; + + dsize = __get_data_size(&tf->tp, regs); + + entry = trace_event_buffer_reserve(&fbuffer, trace_file, + sizeof(*entry) + tf->tp.size + dsize); + if (!entry) + return; + + fbuffer.regs = regs; + entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event); + entry->func = entry_ip; + entry->ret_ip = ret_ip; + store_trace_args(&entry[1], &tf->tp, regs, sizeof(*entry), dsize); + + trace_event_buffer_commit(&fbuffer); +} + +static void +fexit_trace_func(struct trace_fprobe *tf, unsigned long entry_ip, + unsigned long ret_ip, struct pt_regs *regs) +{ + struct event_file_link *link; + + trace_probe_for_each_link_rcu(link, &tf->tp) + __fexit_trace_func(tf, entry_ip, ret_ip, regs, link->file); +} +NOKPROBE_SYMBOL(fexit_trace_func); + +#ifdef CONFIG_PERF_EVENTS + +static int fentry_perf_func(struct trace_fprobe *tf, unsigned long entry_ip, + struct pt_regs *regs) +{ + struct trace_event_call *call = trace_probe_event_call(&tf->tp); + struct fentry_trace_entry_head *entry; + struct hlist_head *head; + int size, __size, dsize; + int rctx; + + head = this_cpu_ptr(call->perf_events); + if (hlist_empty(head)) + return 0; + + dsize = __get_data_size(&tf->tp, regs); + __size = sizeof(*entry) + tf->tp.size + dsize; + size = ALIGN(__size + sizeof(u32), sizeof(u64)); + size -= sizeof(u32); + + entry = perf_trace_buf_alloc(size, NULL, &rctx); + if (!entry) + return 0; + + entry->ip = entry_ip; + memset(&entry[1], 0, dsize); + store_trace_args(&entry[1], &tf->tp, regs, sizeof(*entry), dsize); + perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, + head, NULL); + return 0; +} +NOKPROBE_SYMBOL(fentry_perf_func); + +static void +fexit_perf_func(struct trace_fprobe *tf, unsigned long entry_ip, + unsigned long ret_ip, struct pt_regs *regs) +{ + struct trace_event_call *call = trace_probe_event_call(&tf->tp); + struct fexit_trace_entry_head *entry; + struct hlist_head *head; + int size, __size, dsize; + int rctx; + + head = this_cpu_ptr(call->perf_events); + if (hlist_empty(head)) + return; + + dsize = __get_data_size(&tf->tp, regs); + __size = sizeof(*entry) + tf->tp.size + dsize; + size = ALIGN(__size + sizeof(u32), sizeof(u64)); + size -= sizeof(u32); + + entry = perf_trace_buf_alloc(size, NULL, &rctx); + if (!entry) + return; + + entry->func = entry_ip; + entry->ret_ip = ret_ip; + store_trace_args(&entry[1], &tf->tp, regs, sizeof(*entry), dsize); + perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, + head, NULL); +} +NOKPROBE_SYMBOL(fexit_perf_func); +#endif /* CONFIG_PERF_EVENTS */ + +static int fentry_dispatcher(struct fprobe *fp, unsigned long entry_ip, + unsigned long ret_ip, struct pt_regs *regs, + void *entry_data) +{ + struct trace_fprobe *tf = container_of(fp, struct trace_fprobe, fp); + int ret = 0; + + if (trace_probe_test_flag(&tf->tp, TP_FLAG_TRACE)) + fentry_trace_func(tf, entry_ip, regs); +#ifdef CONFIG_PERF_EVENTS + if (trace_probe_test_flag(&tf->tp, TP_FLAG_PROFILE)) + ret = fentry_perf_func(tf, entry_ip, regs); +#endif + return ret; +} +NOKPROBE_SYMBOL(fentry_dispatcher); + +static void fexit_dispatcher(struct fprobe *fp, unsigned long entry_ip, + unsigned long ret_ip, struct pt_regs *regs, + void *entry_data) +{ + struct 
trace_fprobe *tf = container_of(fp, struct trace_fprobe, fp); + + if (trace_probe_test_flag(&tf->tp, TP_FLAG_TRACE)) + fexit_trace_func(tf, entry_ip, ret_ip, regs); +#ifdef CONFIG_PERF_EVENTS + if (trace_probe_test_flag(&tf->tp, TP_FLAG_PROFILE)) + fexit_perf_func(tf, entry_ip, ret_ip, regs); +#endif +} +NOKPROBE_SYMBOL(fexit_dispatcher); + +static void free_trace_fprobe(struct trace_fprobe *tf) +{ + if (tf) { + trace_probe_cleanup(&tf->tp); + kfree(tf->symbol); + kfree(tf); + } +} + +/* + * Allocate new trace_probe and initialize it (including fprobe). + */ +static struct trace_fprobe *alloc_trace_fprobe(const char *group, + const char *event, + const char *symbol, + struct tracepoint *tpoint, + int maxactive, + int nargs, bool is_return) +{ + struct trace_fprobe *tf; + int ret = -ENOMEM; + + tf = kzalloc(struct_size(tf, tp.args, nargs), GFP_KERNEL); + if (!tf) + return ERR_PTR(ret); + + tf->symbol = kstrdup(symbol, GFP_KERNEL); + if (!tf->symbol) + goto error; + + if (is_return) + tf->fp.exit_handler = fexit_dispatcher; + else + tf->fp.entry_handler = fentry_dispatcher; + + tf->tpoint = tpoint; + tf->fp.nr_maxactive = maxactive; + + ret = trace_probe_init(&tf->tp, event, group, false); + if (ret < 0) + goto error; + + dyn_event_init(&tf->devent, &trace_fprobe_ops); + return tf; +error: + free_trace_fprobe(tf); + return ERR_PTR(ret); +} + +static struct trace_fprobe *find_trace_fprobe(const char *event, + const char *group) +{ + struct dyn_event *pos; + struct trace_fprobe *tf; + + for_each_trace_fprobe(tf, pos) + if (strcmp(trace_probe_name(&tf->tp), event) == 0 && + strcmp(trace_probe_group_name(&tf->tp), group) == 0) + return tf; + return NULL; +} + +static inline int __enable_trace_fprobe(struct trace_fprobe *tf) +{ + if (trace_fprobe_is_registered(tf)) + enable_fprobe(&tf->fp); + + return 0; +} + +static void __disable_trace_fprobe(struct trace_probe *tp) +{ + struct trace_fprobe *tf; + + list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) { + if (!trace_fprobe_is_registered(tf)) + continue; + disable_fprobe(&tf->fp); + } +} + +/* + * Enable trace_probe + * if the file is NULL, enable "perf" handler, or enable "trace" handler. + */ +static int enable_trace_fprobe(struct trace_event_call *call, + struct trace_event_file *file) +{ + struct trace_probe *tp; + struct trace_fprobe *tf; + bool enabled; + int ret = 0; + + tp = trace_probe_primary_from_call(call); + if (WARN_ON_ONCE(!tp)) + return -ENODEV; + enabled = trace_probe_is_enabled(tp); + + /* This also changes "enabled" state */ + if (file) { + ret = trace_probe_add_file(tp, file); + if (ret) + return ret; + } else + trace_probe_set_flag(tp, TP_FLAG_PROFILE); + + if (!enabled) { + list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) { + /* TODO: check the fprobe is gone */ + __enable_trace_fprobe(tf); + } + } + + return 0; +} + +/* + * Disable trace_probe + * if the file is NULL, disable "perf" handler, or disable "trace" handler. + */ +static int disable_trace_fprobe(struct trace_event_call *call, + struct trace_event_file *file) +{ + struct trace_probe *tp; + + tp = trace_probe_primary_from_call(call); + if (WARN_ON_ONCE(!tp)) + return -ENODEV; + + if (file) { + if (!trace_probe_get_file_link(tp, file)) + return -ENOENT; + if (!trace_probe_has_single_file(tp)) + goto out; + trace_probe_clear_flag(tp, TP_FLAG_TRACE); + } else + trace_probe_clear_flag(tp, TP_FLAG_PROFILE); + + if (!trace_probe_is_enabled(tp)) + __disable_trace_fprobe(tp); + + out: + if (file) + /* + * Synchronization is done in below function. 
For perf event, + * file == NULL and perf_trace_event_unreg() calls + * tracepoint_synchronize_unregister() to ensure synchronize + * event. We don't need to care about it. + */ + trace_probe_remove_file(tp, file); + + return 0; +} + +/* Event entry printers */ +static enum print_line_t +print_fentry_event(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct fentry_trace_entry_head *field; + struct trace_seq *s = &iter->seq; + struct trace_probe *tp; + + field = (struct fentry_trace_entry_head *)iter->ent; + tp = trace_probe_primary_from_call( + container_of(event, struct trace_event_call, event)); + if (WARN_ON_ONCE(!tp)) + goto out; + + trace_seq_printf(s, "%s: (", trace_probe_name(tp)); + + if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET)) + goto out; + + trace_seq_putc(s, ')'); + + if (trace_probe_print_args(s, tp->args, tp->nr_args, + (u8 *)&field[1], field) < 0) + goto out; + + trace_seq_putc(s, '\n'); + out: + return trace_handle_return(s); +} + +static enum print_line_t +print_fexit_event(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct fexit_trace_entry_head *field; + struct trace_seq *s = &iter->seq; + struct trace_probe *tp; + + field = (struct fexit_trace_entry_head *)iter->ent; + tp = trace_probe_primary_from_call( + container_of(event, struct trace_event_call, event)); + if (WARN_ON_ONCE(!tp)) + goto out; + + trace_seq_printf(s, "%s: (", trace_probe_name(tp)); + + if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET)) + goto out; + + trace_seq_puts(s, " <- "); + + if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET)) + goto out; + + trace_seq_putc(s, ')'); + + if (trace_probe_print_args(s, tp->args, tp->nr_args, + (u8 *)&field[1], field) < 0) + goto out; + + trace_seq_putc(s, '\n'); + + out: + return trace_handle_return(s); +} + +static int fentry_event_define_fields(struct trace_event_call *event_call) +{ + int ret; + struct fentry_trace_entry_head field; + struct trace_probe *tp; + + tp = trace_probe_primary_from_call(event_call); + if (WARN_ON_ONCE(!tp)) + return -ENOENT; + + DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); + + return traceprobe_define_arg_fields(event_call, sizeof(field), tp); +} + +static int fexit_event_define_fields(struct trace_event_call *event_call) +{ + int ret; + struct fexit_trace_entry_head field; + struct trace_probe *tp; + + tp = trace_probe_primary_from_call(event_call); + if (WARN_ON_ONCE(!tp)) + return -ENOENT; + + DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0); + DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); + + return traceprobe_define_arg_fields(event_call, sizeof(field), tp); +} + +static struct trace_event_functions fentry_funcs = { + .trace = print_fentry_event +}; + +static struct trace_event_functions fexit_funcs = { + .trace = print_fexit_event +}; + +static struct trace_event_fields fentry_fields_array[] = { + { .type = TRACE_FUNCTION_TYPE, + .define_fields = fentry_event_define_fields }, + {} +}; + +static struct trace_event_fields fexit_fields_array[] = { + { .type = TRACE_FUNCTION_TYPE, + .define_fields = fexit_event_define_fields }, + {} +}; + +static int fprobe_register(struct trace_event_call *event, + enum trace_reg type, void *data); + +static inline void init_trace_event_call(struct trace_fprobe *tf) +{ + struct trace_event_call *call = trace_probe_event_call(&tf->tp); + + if (trace_fprobe_is_return(tf)) { + call->event.funcs = &fexit_funcs; + call->class->fields_array = 
fexit_fields_array; + } else { + call->event.funcs = &fentry_funcs; + call->class->fields_array = fentry_fields_array; + } + + call->flags = TRACE_EVENT_FL_FPROBE; + call->class->reg = fprobe_register; +} + +static int register_fprobe_event(struct trace_fprobe *tf) +{ + init_trace_event_call(tf); + + return trace_probe_register_event_call(&tf->tp); +} + +static int unregister_fprobe_event(struct trace_fprobe *tf) +{ + return trace_probe_unregister_event_call(&tf->tp); +} + +/* Internal register function - just handle fprobe and flags */ +static int __register_trace_fprobe(struct trace_fprobe *tf) +{ + int i, ret; + + /* Should we need new LOCKDOWN flag for fprobe? */ + ret = security_locked_down(LOCKDOWN_KPROBES); + if (ret) + return ret; + + if (trace_fprobe_is_registered(tf)) + return -EINVAL; + + for (i = 0; i < tf->tp.nr_args; i++) { + ret = traceprobe_update_arg(&tf->tp.args[i]); + if (ret) + return ret; + } + + /* Set/clear disabled flag according to tp->flag */ + if (trace_probe_is_enabled(&tf->tp)) + tf->fp.flags &= ~FPROBE_FL_DISABLED; + else + tf->fp.flags |= FPROBE_FL_DISABLED; + + if (trace_fprobe_is_tracepoint(tf)) { + struct tracepoint *tpoint = tf->tpoint; + unsigned long ip = (unsigned long)tpoint->probestub; + /* + * Here, we do 2 steps to enable fprobe on a tracepoint. + * At first, put __probestub_##TP function on the tracepoint + * and put a fprobe on the stub function. + */ + ret = tracepoint_probe_register_prio_may_exist(tpoint, + tpoint->probestub, NULL, 0); + if (ret < 0) + return ret; + return register_fprobe_ips(&tf->fp, &ip, 1); + } + + /* TODO: handle filter, nofilter or symbol list */ + return register_fprobe(&tf->fp, tf->symbol, NULL); +} + +/* Internal unregister function - just handle fprobe and flags */ +static void __unregister_trace_fprobe(struct trace_fprobe *tf) +{ + if (trace_fprobe_is_registered(tf)) { + unregister_fprobe(&tf->fp); + memset(&tf->fp, 0, sizeof(tf->fp)); + if (trace_fprobe_is_tracepoint(tf)) { + tracepoint_probe_unregister(tf->tpoint, + tf->tpoint->probestub, NULL); + tf->tpoint = NULL; + tf->mod = NULL; + } + } +} + +/* TODO: make this trace_*probe common function */ +/* Unregister a trace_probe and probe_event */ +static int unregister_trace_fprobe(struct trace_fprobe *tf) +{ + /* If other probes are on the event, just unregister fprobe */ + if (trace_probe_has_sibling(&tf->tp)) + goto unreg; + + /* Enabled event can not be unregistered */ + if (trace_probe_is_enabled(&tf->tp)) + return -EBUSY; + + /* If there's a reference to the dynamic event */ + if (trace_event_dyn_busy(trace_probe_event_call(&tf->tp))) + return -EBUSY; + + /* Will fail if probe is being used by ftrace or perf */ + if (unregister_fprobe_event(tf)) + return -EBUSY; + +unreg: + __unregister_trace_fprobe(tf); + dyn_event_remove(&tf->devent); + trace_probe_unlink(&tf->tp); + + return 0; +} + +static bool trace_fprobe_has_same_fprobe(struct trace_fprobe *orig, + struct trace_fprobe *comp) +{ + struct trace_probe_event *tpe = orig->tp.event; + int i; + + list_for_each_entry(orig, &tpe->probes, tp.list) { + if (strcmp(trace_fprobe_symbol(orig), + trace_fprobe_symbol(comp))) + continue; + + /* + * trace_probe_compare_arg_type() ensured that nr_args and + * each argument name and type are same. Let's compare comm. 
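(Editorial aside, illustrative only and not part of this commit: the registration paths above are what a user reaches by writing an 'f' or 't' command to the tracefs dynamic_events file, whose syntax is documented in __trace_fprobe_create() further below. A minimal user-space sketch, assuming tracefs is mounted at /sys/kernel/tracing, root privileges, and a kernel where vfs_read and the $argN/$retval fetch arguments are available:)

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/kernel/tracing/dynamic_events", "w");

	if (!f) {
		perror("dynamic_events");
		return 1;
	}
	/* fentry probe on vfs_read(), fetching its first argument */
	fprintf(f, "f:myprobes/vfs_read_entry vfs_read file=$arg1\n");
	fflush(f);
	/* fexit probe: the %return suffix turns it into an exit probe */
	fprintf(f, "f:myprobes/vfs_read_exit vfs_read%%return ret=$retval\n");
	fclose(f);
	return 0;
}

(Reading dynamic_events back lists the events in the same form emitted by trace_fprobe_show() below.)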
+ */ + for (i = 0; i < orig->tp.nr_args; i++) { + if (strcmp(orig->tp.args[i].comm, + comp->tp.args[i].comm)) + break; + } + + if (i == orig->tp.nr_args) + return true; + } + + return false; +} + +static int append_trace_fprobe(struct trace_fprobe *tf, struct trace_fprobe *to) +{ + int ret; + + if (trace_fprobe_is_return(tf) != trace_fprobe_is_return(to) || + trace_fprobe_is_tracepoint(tf) != trace_fprobe_is_tracepoint(to)) { + trace_probe_log_set_index(0); + trace_probe_log_err(0, DIFF_PROBE_TYPE); + return -EEXIST; + } + ret = trace_probe_compare_arg_type(&tf->tp, &to->tp); + if (ret) { + /* Note that argument starts index = 2 */ + trace_probe_log_set_index(ret + 1); + trace_probe_log_err(0, DIFF_ARG_TYPE); + return -EEXIST; + } + if (trace_fprobe_has_same_fprobe(to, tf)) { + trace_probe_log_set_index(0); + trace_probe_log_err(0, SAME_PROBE); + return -EEXIST; + } + + /* Append to existing event */ + ret = trace_probe_append(&tf->tp, &to->tp); + if (ret) + return ret; + + ret = __register_trace_fprobe(tf); + if (ret) + trace_probe_unlink(&tf->tp); + else + dyn_event_add(&tf->devent, trace_probe_event_call(&tf->tp)); + + return ret; +} + +/* Register a trace_probe and probe_event */ +static int register_trace_fprobe(struct trace_fprobe *tf) +{ + struct trace_fprobe *old_tf; + int ret; + + mutex_lock(&event_mutex); + + old_tf = find_trace_fprobe(trace_probe_name(&tf->tp), + trace_probe_group_name(&tf->tp)); + if (old_tf) { + ret = append_trace_fprobe(tf, old_tf); + goto end; + } + + /* Register new event */ + ret = register_fprobe_event(tf); + if (ret) { + if (ret == -EEXIST) { + trace_probe_log_set_index(0); + trace_probe_log_err(0, EVENT_EXIST); + } else + pr_warn("Failed to register probe event(%d)\n", ret); + goto end; + } + + /* Register fprobe */ + ret = __register_trace_fprobe(tf); + if (ret < 0) + unregister_fprobe_event(tf); + else + dyn_event_add(&tf->devent, trace_probe_event_call(&tf->tp)); + +end: + mutex_unlock(&event_mutex); + return ret; +} + +#ifdef CONFIG_MODULES +static int __tracepoint_probe_module_cb(struct notifier_block *self, + unsigned long val, void *data) +{ + struct tp_module *tp_mod = data; + struct trace_fprobe *tf; + struct dyn_event *pos; + + if (val != MODULE_STATE_GOING) + return NOTIFY_DONE; + + mutex_lock(&event_mutex); + for_each_trace_fprobe(tf, pos) { + if (tp_mod->mod == tf->mod) { + tracepoint_probe_unregister(tf->tpoint, + tf->tpoint->probestub, NULL); + tf->tpoint = NULL; + tf->mod = NULL; + } + } + mutex_unlock(&event_mutex); + + return NOTIFY_DONE; +} + +static struct notifier_block tracepoint_module_nb = { + .notifier_call = __tracepoint_probe_module_cb, +}; +#endif /* CONFIG_MODULES */ + +struct __find_tracepoint_cb_data { + const char *tp_name; + struct tracepoint *tpoint; +}; + +static void __find_tracepoint_cb(struct tracepoint *tp, void *priv) +{ + struct __find_tracepoint_cb_data *data = priv; + + if (!data->tpoint && !strcmp(data->tp_name, tp->name)) + data->tpoint = tp; +} + +static struct tracepoint *find_tracepoint(const char *tp_name) +{ + struct __find_tracepoint_cb_data data = { + .tp_name = tp_name, + }; + + for_each_kernel_tracepoint(__find_tracepoint_cb, &data); + + return data.tpoint; +} + +static int parse_symbol_and_return(int argc, const char *argv[], + char **symbol, bool *is_return, + bool is_tracepoint) +{ + char *tmp = strchr(argv[1], '%'); + int i; + + if (tmp) { + int len = tmp - argv[1]; + + if (!is_tracepoint && !strcmp(tmp, "%return")) { + *is_return = true; + } else { + trace_probe_log_err(len, BAD_ADDR_SUFFIX); + 
return -EINVAL; + } + *symbol = kmemdup_nul(argv[1], len, GFP_KERNEL); + } else + *symbol = kstrdup(argv[1], GFP_KERNEL); + if (!*symbol) + return -ENOMEM; + + if (*is_return) + return 0; + + /* If there is $retval, this should be a return fprobe. */ + for (i = 2; i < argc; i++) { + tmp = strstr(argv[i], "$retval"); + if (tmp && !isalnum(tmp[7]) && tmp[7] != '_') { + if (is_tracepoint) { + trace_probe_log_set_index(i); + trace_probe_log_err(tmp - argv[i], RETVAL_ON_PROBE); + return -EINVAL; + } + *is_return = true; + break; + } + } + return 0; +} + +static int __trace_fprobe_create(int argc, const char *argv[]) +{ + /* + * Argument syntax: + * - Add fentry probe: + * f[:[GRP/][EVENT]] [MOD:]KSYM [FETCHARGS] + * - Add fexit probe: + * f[N][:[GRP/][EVENT]] [MOD:]KSYM%return [FETCHARGS] + * - Add tracepoint probe: + * t[:[GRP/][EVENT]] TRACEPOINT [FETCHARGS] + * + * Fetch args: + * $retval : fetch return value + * $stack : fetch stack address + * $stackN : fetch Nth entry of stack (N:0-) + * $argN : fetch Nth argument (N:1-) + * $comm : fetch current task comm + * @ADDR : fetch memory at ADDR (ADDR should be in kernel) + * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol) + * Dereferencing memory fetch: + * +|-offs(ARG) : fetch memory at ARG +|- offs address. + * Alias name of args: + * NAME=FETCHARG : set NAME as alias of FETCHARG. + * Type of args: + * FETCHARG:TYPE : use TYPE instead of unsigned long. + */ + struct trace_fprobe *tf = NULL; + int i, len, new_argc = 0, ret = 0; + bool is_return = false; + char *symbol = NULL; + const char *event = NULL, *group = FPROBE_EVENT_SYSTEM; + const char **new_argv = NULL; + int maxactive = 0; + char buf[MAX_EVENT_NAME_LEN]; + char gbuf[MAX_EVENT_NAME_LEN]; + char sbuf[KSYM_NAME_LEN]; + char abuf[MAX_BTF_ARGS_LEN]; + bool is_tracepoint = false; + struct tracepoint *tpoint = NULL; + struct traceprobe_parse_context ctx = { + .flags = TPARG_FL_KERNEL | TPARG_FL_FPROBE, + }; + + if ((argv[0][0] != 'f' && argv[0][0] != 't') || argc < 2) + return -ECANCELED; + + if (argv[0][0] == 't') { + is_tracepoint = true; + group = TRACEPOINT_EVENT_SYSTEM; + } + + trace_probe_log_init("trace_fprobe", argc, argv); + + event = strchr(&argv[0][1], ':'); + if (event) + event++; + + if (isdigit(argv[0][1])) { + if (event) + len = event - &argv[0][1] - 1; + else + len = strlen(&argv[0][1]); + if (len > MAX_EVENT_NAME_LEN - 1) { + trace_probe_log_err(1, BAD_MAXACT); + goto parse_error; + } + memcpy(buf, &argv[0][1], len); + buf[len] = '\0'; + ret = kstrtouint(buf, 0, &maxactive); + if (ret || !maxactive) { + trace_probe_log_err(1, BAD_MAXACT); + goto parse_error; + } + /* fprobe rethook instances are iterated over via a list. The + * maximum should stay reasonable. + */ + if (maxactive > RETHOOK_MAXACTIVE_MAX) { + trace_probe_log_err(1, MAXACT_TOO_BIG); + goto parse_error; + } + } + + trace_probe_log_set_index(1); + + /* a symbol(or tracepoint) must be specified */ + ret = parse_symbol_and_return(argc, argv, &symbol, &is_return, is_tracepoint); + if (ret < 0) + goto parse_error; + + if (!is_return && maxactive) { + trace_probe_log_set_index(0); + trace_probe_log_err(1, BAD_MAXACT_TYPE); + goto parse_error; + } + + trace_probe_log_set_index(0); + if (event) { + ret = traceprobe_parse_event_name(&event, &group, gbuf, + event - argv[0]); + if (ret) + goto parse_error; + } + + if (!event) { + /* Make a new event name */ + if (is_tracepoint) + snprintf(buf, MAX_EVENT_NAME_LEN, "%s%s", + isdigit(*symbol) ? 
"_" : "", symbol); + else + snprintf(buf, MAX_EVENT_NAME_LEN, "%s__%s", symbol, + is_return ? "exit" : "entry"); + sanitize_event_name(buf); + event = buf; + } + + if (is_return) + ctx.flags |= TPARG_FL_RETURN; + else + ctx.flags |= TPARG_FL_FENTRY; + + if (is_tracepoint) { + ctx.flags |= TPARG_FL_TPOINT; + tpoint = find_tracepoint(symbol); + if (!tpoint) { + trace_probe_log_set_index(1); + trace_probe_log_err(0, NO_TRACEPOINT); + goto parse_error; + } + ctx.funcname = kallsyms_lookup( + (unsigned long)tpoint->probestub, + NULL, NULL, NULL, sbuf); + } else + ctx.funcname = symbol; + + argc -= 2; argv += 2; + new_argv = traceprobe_expand_meta_args(argc, argv, &new_argc, + abuf, MAX_BTF_ARGS_LEN, &ctx); + if (IS_ERR(new_argv)) { + ret = PTR_ERR(new_argv); + new_argv = NULL; + goto out; + } + if (new_argv) { + argc = new_argc; + argv = new_argv; + } + + /* setup a probe */ + tf = alloc_trace_fprobe(group, event, symbol, tpoint, maxactive, + argc, is_return); + if (IS_ERR(tf)) { + ret = PTR_ERR(tf); + /* This must return -ENOMEM, else there is a bug */ + WARN_ON_ONCE(ret != -ENOMEM); + goto out; /* We know tf is not allocated */ + } + + if (is_tracepoint) + tf->mod = __module_text_address( + (unsigned long)tf->tpoint->probestub); + + /* parse arguments */ + for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { + trace_probe_log_set_index(i + 2); + ctx.offset = 0; + ret = traceprobe_parse_probe_arg(&tf->tp, i, argv[i], &ctx); + if (ret) + goto error; /* This can be -ENOMEM */ + } + + ret = traceprobe_set_print_fmt(&tf->tp, + is_return ? PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL); + if (ret < 0) + goto error; + + ret = register_trace_fprobe(tf); + if (ret) { + trace_probe_log_set_index(1); + if (ret == -EILSEQ) + trace_probe_log_err(0, BAD_INSN_BNDRY); + else if (ret == -ENOENT) + trace_probe_log_err(0, BAD_PROBE_ADDR); + else if (ret != -ENOMEM && ret != -EEXIST) + trace_probe_log_err(0, FAIL_REG_PROBE); + goto error; + } + +out: + traceprobe_finish_parse(&ctx); + trace_probe_log_clear(); + kfree(new_argv); + kfree(symbol); + return ret; + +parse_error: + ret = -EINVAL; +error: + free_trace_fprobe(tf); + goto out; +} + +static int trace_fprobe_create(const char *raw_command) +{ + return trace_probe_create(raw_command, __trace_fprobe_create); +} + +static int trace_fprobe_release(struct dyn_event *ev) +{ + struct trace_fprobe *tf = to_trace_fprobe(ev); + int ret = unregister_trace_fprobe(tf); + + if (!ret) + free_trace_fprobe(tf); + return ret; +} + +static int trace_fprobe_show(struct seq_file *m, struct dyn_event *ev) +{ + struct trace_fprobe *tf = to_trace_fprobe(ev); + int i; + + if (trace_fprobe_is_tracepoint(tf)) + seq_putc(m, 't'); + else + seq_putc(m, 'f'); + if (trace_fprobe_is_return(tf) && tf->fp.nr_maxactive) + seq_printf(m, "%d", tf->fp.nr_maxactive); + seq_printf(m, ":%s/%s", trace_probe_group_name(&tf->tp), + trace_probe_name(&tf->tp)); + + seq_printf(m, " %s%s", trace_fprobe_symbol(tf), + trace_fprobe_is_return(tf) ? "%return" : ""); + + for (i = 0; i < tf->tp.nr_args; i++) + seq_printf(m, " %s=%s", tf->tp.args[i].name, tf->tp.args[i].comm); + seq_putc(m, '\n'); + + return 0; +} + +/* + * called by perf_trace_init() or __ftrace_set_clr_event() under event_mutex. 
+ */ +static int fprobe_register(struct trace_event_call *event, + enum trace_reg type, void *data) +{ + struct trace_event_file *file = data; + + switch (type) { + case TRACE_REG_REGISTER: + return enable_trace_fprobe(event, file); + case TRACE_REG_UNREGISTER: + return disable_trace_fprobe(event, file); + +#ifdef CONFIG_PERF_EVENTS + case TRACE_REG_PERF_REGISTER: + return enable_trace_fprobe(event, NULL); + case TRACE_REG_PERF_UNREGISTER: + return disable_trace_fprobe(event, NULL); + case TRACE_REG_PERF_OPEN: + case TRACE_REG_PERF_CLOSE: + case TRACE_REG_PERF_ADD: + case TRACE_REG_PERF_DEL: + return 0; +#endif + } + return 0; +} + +/* + * Register dynevent at core_initcall. This allows kernel to setup fprobe + * events in postcore_initcall without tracefs. + */ +static __init int init_fprobe_trace_early(void) +{ + int ret; + + ret = dyn_event_register(&trace_fprobe_ops); + if (ret) + return ret; + +#ifdef CONFIG_MODULES + ret = register_tracepoint_module_notifier(&tracepoint_module_nb); + if (ret) + return ret; +#endif + + return 0; +} +core_initcall(init_fprobe_trace_early); diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c new file mode 100644 index 0000000000..9f1bfbe105 --- /dev/null +++ b/kernel/trace/trace_functions.c @@ -0,0 +1,974 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ring buffer based function tracer + * + * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com> + * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com> + * + * Based on code from the latency_tracer, that is: + * + * Copyright (C) 2004-2006 Ingo Molnar + * Copyright (C) 2004 Nadia Yvette Chambers + */ +#include <linux/ring_buffer.h> +#include <linux/debugfs.h> +#include <linux/uaccess.h> +#include <linux/ftrace.h> +#include <linux/slab.h> +#include <linux/fs.h> + +#include "trace.h" + +static void tracing_start_function_trace(struct trace_array *tr); +static void tracing_stop_function_trace(struct trace_array *tr); +static void +function_trace_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct ftrace_regs *fregs); +static void +function_stack_trace_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct ftrace_regs *fregs); +static void +function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct ftrace_regs *fregs); +static void +function_stack_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, + struct ftrace_regs *fregs); +static struct tracer_flags func_flags; + +/* Our option */ +enum { + + TRACE_FUNC_NO_OPTS = 0x0, /* No flags set. */ + TRACE_FUNC_OPT_STACK = 0x1, + TRACE_FUNC_OPT_NO_REPEATS = 0x2, + + /* Update this to next highest bit. 
*/ + TRACE_FUNC_OPT_HIGHEST_BIT = 0x4 +}; + +#define TRACE_FUNC_OPT_MASK (TRACE_FUNC_OPT_HIGHEST_BIT - 1) + +int ftrace_allocate_ftrace_ops(struct trace_array *tr) +{ + struct ftrace_ops *ops; + + /* The top level array uses the "global_ops" */ + if (tr->flags & TRACE_ARRAY_FL_GLOBAL) + return 0; + + ops = kzalloc(sizeof(*ops), GFP_KERNEL); + if (!ops) + return -ENOMEM; + + /* Currently only the non stack version is supported */ + ops->func = function_trace_call; + ops->flags = FTRACE_OPS_FL_PID; + + tr->ops = ops; + ops->private = tr; + + return 0; +} + +void ftrace_free_ftrace_ops(struct trace_array *tr) +{ + kfree(tr->ops); + tr->ops = NULL; +} + +int ftrace_create_function_files(struct trace_array *tr, + struct dentry *parent) +{ + /* + * The top level array uses the "global_ops", and the files are + * created on boot up. + */ + if (tr->flags & TRACE_ARRAY_FL_GLOBAL) + return 0; + + if (!tr->ops) + return -EINVAL; + + ftrace_create_filter_files(tr->ops, parent); + + return 0; +} + +void ftrace_destroy_function_files(struct trace_array *tr) +{ + ftrace_destroy_filter_files(tr->ops); + ftrace_free_ftrace_ops(tr); +} + +static ftrace_func_t select_trace_function(u32 flags_val) +{ + switch (flags_val & TRACE_FUNC_OPT_MASK) { + case TRACE_FUNC_NO_OPTS: + return function_trace_call; + case TRACE_FUNC_OPT_STACK: + return function_stack_trace_call; + case TRACE_FUNC_OPT_NO_REPEATS: + return function_no_repeats_trace_call; + case TRACE_FUNC_OPT_STACK | TRACE_FUNC_OPT_NO_REPEATS: + return function_stack_no_repeats_trace_call; + default: + return NULL; + } +} + +static bool handle_func_repeats(struct trace_array *tr, u32 flags_val) +{ + if (!tr->last_func_repeats && + (flags_val & TRACE_FUNC_OPT_NO_REPEATS)) { + tr->last_func_repeats = alloc_percpu(struct trace_func_repeats); + if (!tr->last_func_repeats) + return false; + } + + return true; +} + +static int function_trace_init(struct trace_array *tr) +{ + ftrace_func_t func; + /* + * Instance trace_arrays get their ops allocated + * at instance creation. Unless it failed + * the allocation. 
+ */ + if (!tr->ops) + return -ENOMEM; + + func = select_trace_function(func_flags.val); + if (!func) + return -EINVAL; + + if (!handle_func_repeats(tr, func_flags.val)) + return -ENOMEM; + + ftrace_init_array_ops(tr, func); + + tr->array_buffer.cpu = raw_smp_processor_id(); + + tracing_start_cmdline_record(); + tracing_start_function_trace(tr); + return 0; +} + +static void function_trace_reset(struct trace_array *tr) +{ + tracing_stop_function_trace(tr); + tracing_stop_cmdline_record(); + ftrace_reset_array_ops(tr); +} + +static void function_trace_start(struct trace_array *tr) +{ + tracing_reset_online_cpus(&tr->array_buffer); +} + +static void +function_trace_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct ftrace_regs *fregs) +{ + struct trace_array *tr = op->private; + struct trace_array_cpu *data; + unsigned int trace_ctx; + int bit; + int cpu; + + if (unlikely(!tr->function_enabled)) + return; + + bit = ftrace_test_recursion_trylock(ip, parent_ip); + if (bit < 0) + return; + + trace_ctx = tracing_gen_ctx(); + + cpu = smp_processor_id(); + data = per_cpu_ptr(tr->array_buffer.data, cpu); + if (!atomic_read(&data->disabled)) + trace_function(tr, ip, parent_ip, trace_ctx); + + ftrace_test_recursion_unlock(bit); +} + +#ifdef CONFIG_UNWINDER_ORC +/* + * Skip 2: + * + * function_stack_trace_call() + * ftrace_call() + */ +#define STACK_SKIP 2 +#else +/* + * Skip 3: + * __trace_stack() + * function_stack_trace_call() + * ftrace_call() + */ +#define STACK_SKIP 3 +#endif + +static void +function_stack_trace_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct ftrace_regs *fregs) +{ + struct trace_array *tr = op->private; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + unsigned int trace_ctx; + + if (unlikely(!tr->function_enabled)) + return; + + /* + * Need to use raw, since this must be called before the + * recursive protection is performed. 
+ */ + local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = per_cpu_ptr(tr->array_buffer.data, cpu); + disabled = atomic_inc_return(&data->disabled); + + if (likely(disabled == 1)) { + trace_ctx = tracing_gen_ctx_flags(flags); + trace_function(tr, ip, parent_ip, trace_ctx); + __trace_stack(tr, trace_ctx, STACK_SKIP); + } + + atomic_dec(&data->disabled); + local_irq_restore(flags); +} + +static inline bool is_repeat_check(struct trace_array *tr, + struct trace_func_repeats *last_info, + unsigned long ip, unsigned long parent_ip) +{ + if (last_info->ip == ip && + last_info->parent_ip == parent_ip && + last_info->count < U16_MAX) { + last_info->ts_last_call = + ring_buffer_time_stamp(tr->array_buffer.buffer); + last_info->count++; + return true; + } + + return false; +} + +static inline void process_repeats(struct trace_array *tr, + unsigned long ip, unsigned long parent_ip, + struct trace_func_repeats *last_info, + unsigned int trace_ctx) +{ + if (last_info->count) { + trace_last_func_repeats(tr, last_info, trace_ctx); + last_info->count = 0; + } + + last_info->ip = ip; + last_info->parent_ip = parent_ip; +} + +static void +function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, + struct ftrace_regs *fregs) +{ + struct trace_func_repeats *last_info; + struct trace_array *tr = op->private; + struct trace_array_cpu *data; + unsigned int trace_ctx; + unsigned long flags; + int bit; + int cpu; + + if (unlikely(!tr->function_enabled)) + return; + + bit = ftrace_test_recursion_trylock(ip, parent_ip); + if (bit < 0) + return; + + cpu = smp_processor_id(); + data = per_cpu_ptr(tr->array_buffer.data, cpu); + if (atomic_read(&data->disabled)) + goto out; + + /* + * An interrupt may happen at any place here. But as far as I can see, + * the only damage that this can cause is to mess up the repetition + * counter without valuable data being lost. + * TODO: think about a solution that is better than just hoping to be + * lucky. + */ + last_info = per_cpu_ptr(tr->last_func_repeats, cpu); + if (is_repeat_check(tr, last_info, ip, parent_ip)) + goto out; + + local_save_flags(flags); + trace_ctx = tracing_gen_ctx_flags(flags); + process_repeats(tr, ip, parent_ip, last_info, trace_ctx); + + trace_function(tr, ip, parent_ip, trace_ctx); + +out: + ftrace_test_recursion_unlock(bit); +} + +static void +function_stack_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, + struct ftrace_regs *fregs) +{ + struct trace_func_repeats *last_info; + struct trace_array *tr = op->private; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + unsigned int trace_ctx; + + if (unlikely(!tr->function_enabled)) + return; + + /* + * Need to use raw, since this must be called before the + * recursive protection is performed. 
+ */ + local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = per_cpu_ptr(tr->array_buffer.data, cpu); + disabled = atomic_inc_return(&data->disabled); + + if (likely(disabled == 1)) { + last_info = per_cpu_ptr(tr->last_func_repeats, cpu); + if (is_repeat_check(tr, last_info, ip, parent_ip)) + goto out; + + trace_ctx = tracing_gen_ctx_flags(flags); + process_repeats(tr, ip, parent_ip, last_info, trace_ctx); + + trace_function(tr, ip, parent_ip, trace_ctx); + __trace_stack(tr, trace_ctx, STACK_SKIP); + } + + out: + atomic_dec(&data->disabled); + local_irq_restore(flags); +} + +static struct tracer_opt func_opts[] = { +#ifdef CONFIG_STACKTRACE + { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) }, +#endif + { TRACER_OPT(func-no-repeats, TRACE_FUNC_OPT_NO_REPEATS) }, + { } /* Always set a last empty entry */ +}; + +static struct tracer_flags func_flags = { + .val = TRACE_FUNC_NO_OPTS, /* By default: all flags disabled */ + .opts = func_opts +}; + +static void tracing_start_function_trace(struct trace_array *tr) +{ + tr->function_enabled = 0; + register_ftrace_function(tr->ops); + tr->function_enabled = 1; +} + +static void tracing_stop_function_trace(struct trace_array *tr) +{ + tr->function_enabled = 0; + unregister_ftrace_function(tr->ops); +} + +static struct tracer function_trace; + +static int +func_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) +{ + ftrace_func_t func; + u32 new_flags; + + /* Do nothing if already set. */ + if (!!set == !!(func_flags.val & bit)) + return 0; + + /* We can change this flag only when not running. */ + if (tr->current_trace != &function_trace) + return 0; + + new_flags = (func_flags.val & ~bit) | (set ? bit : 0); + func = select_trace_function(new_flags); + if (!func) + return -EINVAL; + + /* Check if there's anything to change. */ + if (tr->ops->func == func) + return 0; + + if (!handle_func_repeats(tr, new_flags)) + return -ENOMEM; + + unregister_ftrace_function(tr->ops); + tr->ops->func = func; + register_ftrace_function(tr->ops); + + return 0; +} + +static struct tracer function_trace __tracer_data = +{ + .name = "function", + .init = function_trace_init, + .reset = function_trace_reset, + .start = function_trace_start, + .flags = &func_flags, + .set_flag = func_set_flag, + .allow_instances = true, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_function, +#endif +}; + +#ifdef CONFIG_DYNAMIC_FTRACE +static void update_traceon_count(struct ftrace_probe_ops *ops, + unsigned long ip, + struct trace_array *tr, bool on, + void *data) +{ + struct ftrace_func_mapper *mapper = data; + long *count; + long old_count; + + /* + * Tracing gets disabled (or enabled) once per count. + * This function can be called at the same time on multiple CPUs. + * It is fine if both disable (or enable) tracing, as disabling + * (or enabling) the second time doesn't do anything as the + * state of the tracer is already disabled (or enabled). + * What needs to be synchronized in this case is that the count + * only gets decremented once, even if the tracer is disabled + * (or enabled) twice, as the second one is really a nop. + * + * The memory barriers guarantee that we only decrement the + * counter once. First the count is read to a local variable + * and a read barrier is used to make sure that it is loaded + * before checking if the tracer is in the state we want. + * If the tracer is not in the state we want, then the count + * is guaranteed to be the old count. 
+ * + * Next the tracer is set to the state we want (disabled or enabled) + * then a write memory barrier is used to make sure that + * the new state is visible before changing the counter by + * one minus the old counter. This guarantees that another CPU + * executing this code will see the new state before seeing + * the new counter value, and would not do anything if the new + * counter is seen. + * + * Note, there is no synchronization between this and a user + * setting the tracing_on file. But we currently don't care + * about that. + */ + count = (long *)ftrace_func_mapper_find_ip(mapper, ip); + old_count = *count; + + if (old_count <= 0) + return; + + /* Make sure we see count before checking tracing state */ + smp_rmb(); + + if (on == !!tracer_tracing_is_on(tr)) + return; + + if (on) + tracer_tracing_on(tr); + else + tracer_tracing_off(tr); + + /* Make sure tracing state is visible before updating count */ + smp_wmb(); + + *count = old_count - 1; +} + +static void +ftrace_traceon_count(unsigned long ip, unsigned long parent_ip, + struct trace_array *tr, struct ftrace_probe_ops *ops, + void *data) +{ + update_traceon_count(ops, ip, tr, 1, data); +} + +static void +ftrace_traceoff_count(unsigned long ip, unsigned long parent_ip, + struct trace_array *tr, struct ftrace_probe_ops *ops, + void *data) +{ + update_traceon_count(ops, ip, tr, 0, data); +} + +static void +ftrace_traceon(unsigned long ip, unsigned long parent_ip, + struct trace_array *tr, struct ftrace_probe_ops *ops, + void *data) +{ + if (tracer_tracing_is_on(tr)) + return; + + tracer_tracing_on(tr); +} + +static void +ftrace_traceoff(unsigned long ip, unsigned long parent_ip, + struct trace_array *tr, struct ftrace_probe_ops *ops, + void *data) +{ + if (!tracer_tracing_is_on(tr)) + return; + + tracer_tracing_off(tr); +} + +#ifdef CONFIG_UNWINDER_ORC +/* + * Skip 3: + * + * function_trace_probe_call() + * ftrace_ops_assist_func() + * ftrace_call() + */ +#define FTRACE_STACK_SKIP 3 +#else +/* + * Skip 5: + * + * __trace_stack() + * ftrace_stacktrace() + * function_trace_probe_call() + * ftrace_ops_assist_func() + * ftrace_call() + */ +#define FTRACE_STACK_SKIP 5 +#endif + +static __always_inline void trace_stack(struct trace_array *tr) +{ + unsigned int trace_ctx; + + trace_ctx = tracing_gen_ctx(); + + __trace_stack(tr, trace_ctx, FTRACE_STACK_SKIP); +} + +static void +ftrace_stacktrace(unsigned long ip, unsigned long parent_ip, + struct trace_array *tr, struct ftrace_probe_ops *ops, + void *data) +{ + trace_stack(tr); +} + +static void +ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, + struct trace_array *tr, struct ftrace_probe_ops *ops, + void *data) +{ + struct ftrace_func_mapper *mapper = data; + long *count; + long old_count; + long new_count; + + if (!tracing_is_on()) + return; + + /* unlimited? */ + if (!mapper) { + trace_stack(tr); + return; + } + + count = (long *)ftrace_func_mapper_find_ip(mapper, ip); + + /* + * Stack traces should only execute the number of times the + * user specified in the counter. 
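(Aside from the editor, not part of this commit: the do/while cmpxchg() loop that follows is a lock-free "decrement only while positive" pattern, so concurrent hits on the same ip cannot push the counter below zero. A rough user-space analogue using C11 atomics, offered purely as a sketch:)

#include <stdatomic.h>
#include <stdbool.h>

/* Illustrative analogue of the retry loop in ftrace_stacktrace_count(). */
static bool dec_if_positive(_Atomic long *count)
{
	long old = atomic_load(count);

	while (old > 0) {
		/*
		 * On failure the CAS reloads 'old' with the current value,
		 * so the loop re-checks it before trying again.
		 */
		if (atomic_compare_exchange_weak(count, &old, old - 1))
			return true;	/* we own this decrement; do the work */
	}
	return false;	/* counter exhausted; do nothing */
}

int main(void)
{
	_Atomic long count = 2;

	/* Only the first two calls report true. */
	return dec_if_positive(&count) && dec_if_positive(&count) &&
	       !dec_if_positive(&count) ? 0 : 1;
}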
+ */ + do { + old_count = *count; + + if (!old_count) + return; + + new_count = old_count - 1; + new_count = cmpxchg(count, old_count, new_count); + if (new_count == old_count) + trace_stack(tr); + + if (!tracing_is_on()) + return; + + } while (new_count != old_count); +} + +static int update_count(struct ftrace_probe_ops *ops, unsigned long ip, + void *data) +{ + struct ftrace_func_mapper *mapper = data; + long *count = NULL; + + if (mapper) + count = (long *)ftrace_func_mapper_find_ip(mapper, ip); + + if (count) { + if (*count <= 0) + return 0; + (*count)--; + } + + return 1; +} + +static void +ftrace_dump_probe(unsigned long ip, unsigned long parent_ip, + struct trace_array *tr, struct ftrace_probe_ops *ops, + void *data) +{ + if (update_count(ops, ip, data)) + ftrace_dump(DUMP_ALL); +} + +/* Only dump the current CPU buffer. */ +static void +ftrace_cpudump_probe(unsigned long ip, unsigned long parent_ip, + struct trace_array *tr, struct ftrace_probe_ops *ops, + void *data) +{ + if (update_count(ops, ip, data)) + ftrace_dump(DUMP_ORIG); +} + +static int +ftrace_probe_print(const char *name, struct seq_file *m, + unsigned long ip, struct ftrace_probe_ops *ops, + void *data) +{ + struct ftrace_func_mapper *mapper = data; + long *count = NULL; + + seq_printf(m, "%ps:%s", (void *)ip, name); + + if (mapper) + count = (long *)ftrace_func_mapper_find_ip(mapper, ip); + + if (count) + seq_printf(m, ":count=%ld\n", *count); + else + seq_puts(m, ":unlimited\n"); + + return 0; +} + +static int +ftrace_traceon_print(struct seq_file *m, unsigned long ip, + struct ftrace_probe_ops *ops, + void *data) +{ + return ftrace_probe_print("traceon", m, ip, ops, data); +} + +static int +ftrace_traceoff_print(struct seq_file *m, unsigned long ip, + struct ftrace_probe_ops *ops, void *data) +{ + return ftrace_probe_print("traceoff", m, ip, ops, data); +} + +static int +ftrace_stacktrace_print(struct seq_file *m, unsigned long ip, + struct ftrace_probe_ops *ops, void *data) +{ + return ftrace_probe_print("stacktrace", m, ip, ops, data); +} + +static int +ftrace_dump_print(struct seq_file *m, unsigned long ip, + struct ftrace_probe_ops *ops, void *data) +{ + return ftrace_probe_print("dump", m, ip, ops, data); +} + +static int +ftrace_cpudump_print(struct seq_file *m, unsigned long ip, + struct ftrace_probe_ops *ops, void *data) +{ + return ftrace_probe_print("cpudump", m, ip, ops, data); +} + + +static int +ftrace_count_init(struct ftrace_probe_ops *ops, struct trace_array *tr, + unsigned long ip, void *init_data, void **data) +{ + struct ftrace_func_mapper *mapper = *data; + + if (!mapper) { + mapper = allocate_ftrace_func_mapper(); + if (!mapper) + return -ENOMEM; + *data = mapper; + } + + return ftrace_func_mapper_add_ip(mapper, ip, init_data); +} + +static void +ftrace_count_free(struct ftrace_probe_ops *ops, struct trace_array *tr, + unsigned long ip, void *data) +{ + struct ftrace_func_mapper *mapper = data; + + if (!ip) { + free_ftrace_func_mapper(mapper, NULL); + return; + } + + ftrace_func_mapper_remove_ip(mapper, ip); +} + +static struct ftrace_probe_ops traceon_count_probe_ops = { + .func = ftrace_traceon_count, + .print = ftrace_traceon_print, + .init = ftrace_count_init, + .free = ftrace_count_free, +}; + +static struct ftrace_probe_ops traceoff_count_probe_ops = { + .func = ftrace_traceoff_count, + .print = ftrace_traceoff_print, + .init = ftrace_count_init, + .free = ftrace_count_free, +}; + +static struct ftrace_probe_ops stacktrace_count_probe_ops = { + .func = ftrace_stacktrace_count, + .print = 
ftrace_stacktrace_print, + .init = ftrace_count_init, + .free = ftrace_count_free, +}; + +static struct ftrace_probe_ops dump_probe_ops = { + .func = ftrace_dump_probe, + .print = ftrace_dump_print, + .init = ftrace_count_init, + .free = ftrace_count_free, +}; + +static struct ftrace_probe_ops cpudump_probe_ops = { + .func = ftrace_cpudump_probe, + .print = ftrace_cpudump_print, +}; + +static struct ftrace_probe_ops traceon_probe_ops = { + .func = ftrace_traceon, + .print = ftrace_traceon_print, +}; + +static struct ftrace_probe_ops traceoff_probe_ops = { + .func = ftrace_traceoff, + .print = ftrace_traceoff_print, +}; + +static struct ftrace_probe_ops stacktrace_probe_ops = { + .func = ftrace_stacktrace, + .print = ftrace_stacktrace_print, +}; + +static int +ftrace_trace_probe_callback(struct trace_array *tr, + struct ftrace_probe_ops *ops, + struct ftrace_hash *hash, char *glob, + char *cmd, char *param, int enable) +{ + void *count = (void *)-1; + char *number; + int ret; + + /* hash funcs only work with set_ftrace_filter */ + if (!enable) + return -EINVAL; + + if (glob[0] == '!') + return unregister_ftrace_function_probe_func(glob+1, tr, ops); + + if (!param) + goto out_reg; + + number = strsep(¶m, ":"); + + if (!strlen(number)) + goto out_reg; + + /* + * We use the callback data field (which is a pointer) + * as our counter. + */ + ret = kstrtoul(number, 0, (unsigned long *)&count); + if (ret) + return ret; + + out_reg: + ret = register_ftrace_function_probe(glob, tr, ops, count); + + return ret < 0 ? ret : 0; +} + +static int +ftrace_trace_onoff_callback(struct trace_array *tr, struct ftrace_hash *hash, + char *glob, char *cmd, char *param, int enable) +{ + struct ftrace_probe_ops *ops; + + if (!tr) + return -ENODEV; + + /* we register both traceon and traceoff to this callback */ + if (strcmp(cmd, "traceon") == 0) + ops = param ? &traceon_count_probe_ops : &traceon_probe_ops; + else + ops = param ? &traceoff_count_probe_ops : &traceoff_probe_ops; + + return ftrace_trace_probe_callback(tr, ops, hash, glob, cmd, + param, enable); +} + +static int +ftrace_stacktrace_callback(struct trace_array *tr, struct ftrace_hash *hash, + char *glob, char *cmd, char *param, int enable) +{ + struct ftrace_probe_ops *ops; + + if (!tr) + return -ENODEV; + + ops = param ? &stacktrace_count_probe_ops : &stacktrace_probe_ops; + + return ftrace_trace_probe_callback(tr, ops, hash, glob, cmd, + param, enable); +} + +static int +ftrace_dump_callback(struct trace_array *tr, struct ftrace_hash *hash, + char *glob, char *cmd, char *param, int enable) +{ + struct ftrace_probe_ops *ops; + + if (!tr) + return -ENODEV; + + ops = &dump_probe_ops; + + /* Only dump once. */ + return ftrace_trace_probe_callback(tr, ops, hash, glob, cmd, + "1", enable); +} + +static int +ftrace_cpudump_callback(struct trace_array *tr, struct ftrace_hash *hash, + char *glob, char *cmd, char *param, int enable) +{ + struct ftrace_probe_ops *ops; + + if (!tr) + return -ENODEV; + + ops = &cpudump_probe_ops; + + /* Only dump once. 
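(Editorial illustration, not part of this diff: the probe_ops and callbacks above implement the function triggers accepted by set_ftrace_filter, written as "<function>:<command>[:count]". A minimal sketch, assuming tracefs at /sys/kernel/tracing and root privileges, that arms a one-shot traceoff trigger on schedule():)

#include <fcntl.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	static const char cmd[] = "schedule:traceoff:1";
	int fd = open("/sys/kernel/tracing/set_ftrace_filter", O_WRONLY);
	int ret = 0;

	if (fd < 0)
		return 1;
	/* One write per command; prefix the same string with '!' to remove it. */
	if (write(fd, cmd, strlen(cmd)) != (ssize_t)strlen(cmd))
		ret = 1;
	close(fd);
	return ret;
}

(The stacktrace, dump and cpudump commands registered above take the same form.)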
*/ + return ftrace_trace_probe_callback(tr, ops, hash, glob, cmd, + "1", enable); +} + +static struct ftrace_func_command ftrace_traceon_cmd = { + .name = "traceon", + .func = ftrace_trace_onoff_callback, +}; + +static struct ftrace_func_command ftrace_traceoff_cmd = { + .name = "traceoff", + .func = ftrace_trace_onoff_callback, +}; + +static struct ftrace_func_command ftrace_stacktrace_cmd = { + .name = "stacktrace", + .func = ftrace_stacktrace_callback, +}; + +static struct ftrace_func_command ftrace_dump_cmd = { + .name = "dump", + .func = ftrace_dump_callback, +}; + +static struct ftrace_func_command ftrace_cpudump_cmd = { + .name = "cpudump", + .func = ftrace_cpudump_callback, +}; + +static int __init init_func_cmd_traceon(void) +{ + int ret; + + ret = register_ftrace_command(&ftrace_traceoff_cmd); + if (ret) + return ret; + + ret = register_ftrace_command(&ftrace_traceon_cmd); + if (ret) + goto out_free_traceoff; + + ret = register_ftrace_command(&ftrace_stacktrace_cmd); + if (ret) + goto out_free_traceon; + + ret = register_ftrace_command(&ftrace_dump_cmd); + if (ret) + goto out_free_stacktrace; + + ret = register_ftrace_command(&ftrace_cpudump_cmd); + if (ret) + goto out_free_dump; + + return 0; + + out_free_dump: + unregister_ftrace_command(&ftrace_dump_cmd); + out_free_stacktrace: + unregister_ftrace_command(&ftrace_stacktrace_cmd); + out_free_traceon: + unregister_ftrace_command(&ftrace_traceon_cmd); + out_free_traceoff: + unregister_ftrace_command(&ftrace_traceoff_cmd); + + return ret; +} +#else +static inline int init_func_cmd_traceon(void) +{ + return 0; +} +#endif /* CONFIG_DYNAMIC_FTRACE */ + +__init int init_function_trace(void) +{ + init_func_cmd_traceon(); + return register_tracer(&function_trace); +} diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c new file mode 100644 index 0000000000..c35fbaab2a --- /dev/null +++ b/kernel/trace/trace_functions_graph.c @@ -0,0 +1,1440 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * + * Function graph tracer. + * Copyright (c) 2008-2009 Frederic Weisbecker <fweisbec@gmail.com> + * Mostly borrowed from function tracer which + * is Copyright (c) Steven Rostedt <srostedt@redhat.com> + * + */ +#include <linux/uaccess.h> +#include <linux/ftrace.h> +#include <linux/interrupt.h> +#include <linux/slab.h> +#include <linux/fs.h> + +#include "trace.h" +#include "trace_output.h" + +/* When set, irq functions will be ignored */ +static int ftrace_graph_skip_irqs; + +struct fgraph_cpu_data { + pid_t last_pid; + int depth; + int depth_irq; + int ignore; + unsigned long enter_funcs[FTRACE_RETFUNC_DEPTH]; +}; + +struct fgraph_data { + struct fgraph_cpu_data __percpu *cpu_data; + + /* Place to preserve last processed entry. */ + struct ftrace_graph_ent_entry ent; + struct ftrace_graph_ret_entry ret; + int failed; + int cpu; +}; + +#define TRACE_GRAPH_INDENT 2 + +unsigned int fgraph_max_depth; + +static struct tracer_opt trace_opts[] = { + /* Display overruns? (for self-debug purpose) */ + { TRACER_OPT(funcgraph-overrun, TRACE_GRAPH_PRINT_OVERRUN) }, + /* Display CPU ? */ + { TRACER_OPT(funcgraph-cpu, TRACE_GRAPH_PRINT_CPU) }, + /* Display Overhead ? 
*/ + { TRACER_OPT(funcgraph-overhead, TRACE_GRAPH_PRINT_OVERHEAD) }, + /* Display proc name/pid */ + { TRACER_OPT(funcgraph-proc, TRACE_GRAPH_PRINT_PROC) }, + /* Display duration of execution */ + { TRACER_OPT(funcgraph-duration, TRACE_GRAPH_PRINT_DURATION) }, + /* Display absolute time of an entry */ + { TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) }, + /* Display interrupts */ + { TRACER_OPT(funcgraph-irqs, TRACE_GRAPH_PRINT_IRQS) }, + /* Display function name after trailing } */ + { TRACER_OPT(funcgraph-tail, TRACE_GRAPH_PRINT_TAIL) }, +#ifdef CONFIG_FUNCTION_GRAPH_RETVAL + /* Display function return value ? */ + { TRACER_OPT(funcgraph-retval, TRACE_GRAPH_PRINT_RETVAL) }, + /* Display function return value in hexadecimal format ? */ + { TRACER_OPT(funcgraph-retval-hex, TRACE_GRAPH_PRINT_RETVAL_HEX) }, +#endif + /* Include sleep time (scheduled out) between entry and return */ + { TRACER_OPT(sleep-time, TRACE_GRAPH_SLEEP_TIME) }, + +#ifdef CONFIG_FUNCTION_PROFILER + /* Include time within nested functions */ + { TRACER_OPT(graph-time, TRACE_GRAPH_GRAPH_TIME) }, +#endif + + { } /* Empty entry */ +}; + +static struct tracer_flags tracer_flags = { + /* Don't display overruns, proc, or tail by default */ + .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD | + TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS | + TRACE_GRAPH_SLEEP_TIME | TRACE_GRAPH_GRAPH_TIME, + .opts = trace_opts +}; + +static struct trace_array *graph_array; + +/* + * DURATION column is being also used to display IRQ signs, + * following values are used by print_graph_irq and others + * to fill in space into DURATION column. + */ +enum { + FLAGS_FILL_FULL = 1 << TRACE_GRAPH_PRINT_FILL_SHIFT, + FLAGS_FILL_START = 2 << TRACE_GRAPH_PRINT_FILL_SHIFT, + FLAGS_FILL_END = 3 << TRACE_GRAPH_PRINT_FILL_SHIFT, +}; + +static void +print_graph_duration(struct trace_array *tr, unsigned long long duration, + struct trace_seq *s, u32 flags); + +int __trace_graph_entry(struct trace_array *tr, + struct ftrace_graph_ent *trace, + unsigned int trace_ctx) +{ + struct trace_event_call *call = &event_funcgraph_entry; + struct ring_buffer_event *event; + struct trace_buffer *buffer = tr->array_buffer.buffer; + struct ftrace_graph_ent_entry *entry; + + event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT, + sizeof(*entry), trace_ctx); + if (!event) + return 0; + entry = ring_buffer_event_data(event); + entry->graph_ent = *trace; + if (!call_filter_check_discard(call, entry, buffer, event)) + trace_buffer_unlock_commit_nostack(buffer, event); + + return 1; +} + +static inline int ftrace_graph_ignore_irqs(void) +{ + if (!ftrace_graph_skip_irqs || trace_recursion_test(TRACE_IRQ_BIT)) + return 0; + + return in_hardirq(); +} + +int trace_graph_entry(struct ftrace_graph_ent *trace) +{ + struct trace_array *tr = graph_array; + struct trace_array_cpu *data; + unsigned long flags; + unsigned int trace_ctx; + long disabled; + int ret; + int cpu; + + if (trace_recursion_test(TRACE_GRAPH_NOTRACE_BIT)) + return 0; + + /* + * Do not trace a function if it's filtered by set_graph_notrace. + * Make the index of ret stack negative to indicate that it should + * ignore further functions. But it needs its own ret stack entry + * to recover the original index in order to continue tracing after + * returning from the function. + */ + if (ftrace_graph_notrace_addr(trace->func)) { + trace_recursion_set(TRACE_GRAPH_NOTRACE_BIT); + /* + * Need to return 1 to have the return called + * that will clear the NOTRACE bit. 
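(Editor's sketch, not part of this commit: how the tracer_opts listed at the top of this file are driven from user space, assuming tracefs at /sys/kernel/tracing and, for funcgraph-retval, a kernel built with CONFIG_FUNCTION_GRAPH_RETVAL:)

#include <stdio.h>

static int write_str(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fputs(val, f);
	return fclose(f);
}

int main(void)
{
	/* Select the function graph tracer ... */
	if (write_str("/sys/kernel/tracing/current_tracer", "function_graph"))
		return 1;
	/* ... then toggle one of the tracer_opts declared above. */
	return write_str("/sys/kernel/tracing/trace_options",
			 "funcgraph-retval") ? 1 : 0;
}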
+ */ + return 1; + } + + if (!ftrace_trace_task(tr)) + return 0; + + if (ftrace_graph_ignore_func(trace)) + return 0; + + if (ftrace_graph_ignore_irqs()) + return 0; + + /* + * Stop here if tracing_threshold is set. We only write function return + * events to the ring buffer. + */ + if (tracing_thresh) + return 1; + + local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = per_cpu_ptr(tr->array_buffer.data, cpu); + disabled = atomic_inc_return(&data->disabled); + if (likely(disabled == 1)) { + trace_ctx = tracing_gen_ctx_flags(flags); + ret = __trace_graph_entry(tr, trace, trace_ctx); + } else { + ret = 0; + } + + atomic_dec(&data->disabled); + local_irq_restore(flags); + + return ret; +} + +static void +__trace_graph_function(struct trace_array *tr, + unsigned long ip, unsigned int trace_ctx) +{ + u64 time = trace_clock_local(); + struct ftrace_graph_ent ent = { + .func = ip, + .depth = 0, + }; + struct ftrace_graph_ret ret = { + .func = ip, + .depth = 0, + .calltime = time, + .rettime = time, + }; + + __trace_graph_entry(tr, &ent, trace_ctx); + __trace_graph_return(tr, &ret, trace_ctx); +} + +void +trace_graph_function(struct trace_array *tr, + unsigned long ip, unsigned long parent_ip, + unsigned int trace_ctx) +{ + __trace_graph_function(tr, ip, trace_ctx); +} + +void __trace_graph_return(struct trace_array *tr, + struct ftrace_graph_ret *trace, + unsigned int trace_ctx) +{ + struct trace_event_call *call = &event_funcgraph_exit; + struct ring_buffer_event *event; + struct trace_buffer *buffer = tr->array_buffer.buffer; + struct ftrace_graph_ret_entry *entry; + + event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET, + sizeof(*entry), trace_ctx); + if (!event) + return; + entry = ring_buffer_event_data(event); + entry->ret = *trace; + if (!call_filter_check_discard(call, entry, buffer, event)) + trace_buffer_unlock_commit_nostack(buffer, event); +} + +void trace_graph_return(struct ftrace_graph_ret *trace) +{ + struct trace_array *tr = graph_array; + struct trace_array_cpu *data; + unsigned long flags; + unsigned int trace_ctx; + long disabled; + int cpu; + + ftrace_graph_addr_finish(trace); + + if (trace_recursion_test(TRACE_GRAPH_NOTRACE_BIT)) { + trace_recursion_clear(TRACE_GRAPH_NOTRACE_BIT); + return; + } + + local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = per_cpu_ptr(tr->array_buffer.data, cpu); + disabled = atomic_inc_return(&data->disabled); + if (likely(disabled == 1)) { + trace_ctx = tracing_gen_ctx_flags(flags); + __trace_graph_return(tr, trace, trace_ctx); + } + atomic_dec(&data->disabled); + local_irq_restore(flags); +} + +void set_graph_array(struct trace_array *tr) +{ + graph_array = tr; + + /* Make graph_array visible before we start tracing */ + + smp_mb(); +} + +static void trace_graph_thresh_return(struct ftrace_graph_ret *trace) +{ + ftrace_graph_addr_finish(trace); + + if (trace_recursion_test(TRACE_GRAPH_NOTRACE_BIT)) { + trace_recursion_clear(TRACE_GRAPH_NOTRACE_BIT); + return; + } + + if (tracing_thresh && + (trace->rettime - trace->calltime < tracing_thresh)) + return; + else + trace_graph_return(trace); +} + +static struct fgraph_ops funcgraph_thresh_ops = { + .entryfunc = &trace_graph_entry, + .retfunc = &trace_graph_thresh_return, +}; + +static struct fgraph_ops funcgraph_ops = { + .entryfunc = &trace_graph_entry, + .retfunc = &trace_graph_return, +}; + +static int graph_trace_init(struct trace_array *tr) +{ + int ret; + + set_graph_array(tr); + if (tracing_thresh) + ret = register_ftrace_graph(&funcgraph_thresh_ops); + else + ret 
= register_ftrace_graph(&funcgraph_ops); + if (ret) + return ret; + tracing_start_cmdline_record(); + + return 0; +} + +static void graph_trace_reset(struct trace_array *tr) +{ + tracing_stop_cmdline_record(); + if (tracing_thresh) + unregister_ftrace_graph(&funcgraph_thresh_ops); + else + unregister_ftrace_graph(&funcgraph_ops); +} + +static int graph_trace_update_thresh(struct trace_array *tr) +{ + graph_trace_reset(tr); + return graph_trace_init(tr); +} + +static int max_bytes_for_cpu; + +static void print_graph_cpu(struct trace_seq *s, int cpu) +{ + /* + * Start with a space character - to make it stand out + * to the right a bit when trace output is pasted into + * email: + */ + trace_seq_printf(s, " %*d) ", max_bytes_for_cpu, cpu); +} + +#define TRACE_GRAPH_PROCINFO_LENGTH 14 + +static void print_graph_proc(struct trace_seq *s, pid_t pid) +{ + char comm[TASK_COMM_LEN]; + /* sign + log10(MAX_INT) + '\0' */ + char pid_str[11]; + int spaces = 0; + int len; + int i; + + trace_find_cmdline(pid, comm); + comm[7] = '\0'; + sprintf(pid_str, "%d", pid); + + /* 1 stands for the "-" character */ + len = strlen(comm) + strlen(pid_str) + 1; + + if (len < TRACE_GRAPH_PROCINFO_LENGTH) + spaces = TRACE_GRAPH_PROCINFO_LENGTH - len; + + /* First spaces to align center */ + for (i = 0; i < spaces / 2; i++) + trace_seq_putc(s, ' '); + + trace_seq_printf(s, "%s-%s", comm, pid_str); + + /* Last spaces to align center */ + for (i = 0; i < spaces - (spaces / 2); i++) + trace_seq_putc(s, ' '); +} + + +static void print_graph_lat_fmt(struct trace_seq *s, struct trace_entry *entry) +{ + trace_seq_putc(s, ' '); + trace_print_lat_fmt(s, entry); + trace_seq_puts(s, " | "); +} + +/* If the pid changed since the last trace, output this event */ +static void +verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data) +{ + pid_t prev_pid; + pid_t *last_pid; + + if (!data) + return; + + last_pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid); + + if (*last_pid == pid) + return; + + prev_pid = *last_pid; + *last_pid = pid; + + if (prev_pid == -1) + return; +/* + * Context-switch trace line: + + ------------------------------------------ + | 1) migration/0--1 => sshd-1755 + ------------------------------------------ + + */ + trace_seq_puts(s, " ------------------------------------------\n"); + print_graph_cpu(s, cpu); + print_graph_proc(s, prev_pid); + trace_seq_puts(s, " => "); + print_graph_proc(s, pid); + trace_seq_puts(s, "\n ------------------------------------------\n\n"); +} + +static struct ftrace_graph_ret_entry * +get_return_for_leaf(struct trace_iterator *iter, + struct ftrace_graph_ent_entry *curr) +{ + struct fgraph_data *data = iter->private; + struct ring_buffer_iter *ring_iter = NULL; + struct ring_buffer_event *event; + struct ftrace_graph_ret_entry *next; + + /* + * If the previous output failed to write to the seq buffer, + * then we just reuse the data from before. + */ + if (data && data->failed) { + curr = &data->ent; + next = &data->ret; + } else { + + ring_iter = trace_buffer_iter(iter, iter->cpu); + + /* First peek to compare current entry and the next one */ + if (ring_iter) + event = ring_buffer_iter_peek(ring_iter, NULL); + else { + /* + * We need to consume the current entry to see + * the next one. 
+ */ + ring_buffer_consume(iter->array_buffer->buffer, iter->cpu, + NULL, NULL); + event = ring_buffer_peek(iter->array_buffer->buffer, iter->cpu, + NULL, NULL); + } + + if (!event) + return NULL; + + next = ring_buffer_event_data(event); + + if (data) { + /* + * Save current and next entries for later reference + * if the output fails. + */ + data->ent = *curr; + /* + * If the next event is not a return type, then + * we only care about what type it is. Otherwise we can + * safely copy the entire event. + */ + if (next->ent.type == TRACE_GRAPH_RET) + data->ret = *next; + else + data->ret.ent.type = next->ent.type; + } + } + + if (next->ent.type != TRACE_GRAPH_RET) + return NULL; + + if (curr->ent.pid != next->ent.pid || + curr->graph_ent.func != next->ret.func) + return NULL; + + /* this is a leaf, now advance the iterator */ + if (ring_iter) + ring_buffer_iter_advance(ring_iter); + + return next; +} + +static void print_graph_abs_time(u64 t, struct trace_seq *s) +{ + unsigned long usecs_rem; + + usecs_rem = do_div(t, NSEC_PER_SEC); + usecs_rem /= 1000; + + trace_seq_printf(s, "%5lu.%06lu | ", + (unsigned long)t, usecs_rem); +} + +static void +print_graph_rel_time(struct trace_iterator *iter, struct trace_seq *s) +{ + unsigned long long usecs; + + usecs = iter->ts - iter->array_buffer->time_start; + do_div(usecs, NSEC_PER_USEC); + + trace_seq_printf(s, "%9llu us | ", usecs); +} + +static void +print_graph_irq(struct trace_iterator *iter, unsigned long addr, + enum trace_type type, int cpu, pid_t pid, u32 flags) +{ + struct trace_array *tr = iter->tr; + struct trace_seq *s = &iter->seq; + struct trace_entry *ent = iter->ent; + + if (addr < (unsigned long)__irqentry_text_start || + addr >= (unsigned long)__irqentry_text_end) + return; + + if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) { + /* Absolute time */ + if (flags & TRACE_GRAPH_PRINT_ABS_TIME) + print_graph_abs_time(iter->ts, s); + + /* Relative time */ + if (flags & TRACE_GRAPH_PRINT_REL_TIME) + print_graph_rel_time(iter, s); + + /* Cpu */ + if (flags & TRACE_GRAPH_PRINT_CPU) + print_graph_cpu(s, cpu); + + /* Proc */ + if (flags & TRACE_GRAPH_PRINT_PROC) { + print_graph_proc(s, pid); + trace_seq_puts(s, " | "); + } + + /* Latency format */ + if (tr->trace_flags & TRACE_ITER_LATENCY_FMT) + print_graph_lat_fmt(s, ent); + } + + /* No overhead */ + print_graph_duration(tr, 0, s, flags | FLAGS_FILL_START); + + if (type == TRACE_GRAPH_ENT) + trace_seq_puts(s, "==========>"); + else + trace_seq_puts(s, "<=========="); + + print_graph_duration(tr, 0, s, flags | FLAGS_FILL_END); + trace_seq_putc(s, '\n'); +} + +void +trace_print_graph_duration(unsigned long long duration, struct trace_seq *s) +{ + unsigned long nsecs_rem = do_div(duration, 1000); + /* log10(ULONG_MAX) + '\0' */ + char usecs_str[21]; + char nsecs_str[5]; + int len; + int i; + + sprintf(usecs_str, "%lu", (unsigned long) duration); + + /* Print msecs */ + trace_seq_printf(s, "%s", usecs_str); + + len = strlen(usecs_str); + + /* Print nsecs (we don't want to exceed 7 numbers) */ + if (len < 7) { + size_t slen = min_t(size_t, sizeof(nsecs_str), 8UL - len); + + snprintf(nsecs_str, slen, "%03lu", nsecs_rem); + trace_seq_printf(s, ".%s", nsecs_str); + len += strlen(nsecs_str) + 1; + } + + trace_seq_puts(s, " us "); + + /* Print remaining spaces to fit the row's width */ + for (i = len; i < 8; i++) + trace_seq_putc(s, ' '); +} + +static void +print_graph_duration(struct trace_array *tr, unsigned long long duration, + struct trace_seq *s, u32 flags) +{ + if (!(flags & 
TRACE_GRAPH_PRINT_DURATION) || + !(tr->trace_flags & TRACE_ITER_CONTEXT_INFO)) + return; + + /* No real adata, just filling the column with spaces */ + switch (flags & TRACE_GRAPH_PRINT_FILL_MASK) { + case FLAGS_FILL_FULL: + trace_seq_puts(s, " | "); + return; + case FLAGS_FILL_START: + trace_seq_puts(s, " "); + return; + case FLAGS_FILL_END: + trace_seq_puts(s, " |"); + return; + } + + /* Signal a overhead of time execution to the output */ + if (flags & TRACE_GRAPH_PRINT_OVERHEAD) + trace_seq_printf(s, "%c ", trace_find_mark(duration)); + else + trace_seq_puts(s, " "); + + trace_print_graph_duration(duration, s); + trace_seq_puts(s, "| "); +} + +#ifdef CONFIG_FUNCTION_GRAPH_RETVAL + +#define __TRACE_GRAPH_PRINT_RETVAL TRACE_GRAPH_PRINT_RETVAL + +static void print_graph_retval(struct trace_seq *s, unsigned long retval, + bool leaf, void *func, bool hex_format) +{ + unsigned long err_code = 0; + + if (retval == 0 || hex_format) + goto done; + + /* Check if the return value matches the negative format */ + if (IS_ENABLED(CONFIG_64BIT) && (retval & BIT(31)) && + (((u64)retval) >> 32) == 0) { + /* sign extension */ + err_code = (unsigned long)(s32)retval; + } else { + err_code = retval; + } + + if (!IS_ERR_VALUE(err_code)) + err_code = 0; + +done: + if (leaf) { + if (hex_format || (err_code == 0)) + trace_seq_printf(s, "%ps(); /* = 0x%lx */\n", + func, retval); + else + trace_seq_printf(s, "%ps(); /* = %ld */\n", + func, err_code); + } else { + if (hex_format || (err_code == 0)) + trace_seq_printf(s, "} /* %ps = 0x%lx */\n", + func, retval); + else + trace_seq_printf(s, "} /* %ps = %ld */\n", + func, err_code); + } +} + +#else + +#define __TRACE_GRAPH_PRINT_RETVAL 0 + +#define print_graph_retval(_seq, _retval, _leaf, _func, _format) do {} while (0) + +#endif + +/* Case of a leaf function on its call entry */ +static enum print_line_t +print_graph_entry_leaf(struct trace_iterator *iter, + struct ftrace_graph_ent_entry *entry, + struct ftrace_graph_ret_entry *ret_entry, + struct trace_seq *s, u32 flags) +{ + struct fgraph_data *data = iter->private; + struct trace_array *tr = iter->tr; + struct ftrace_graph_ret *graph_ret; + struct ftrace_graph_ent *call; + unsigned long long duration; + int cpu = iter->cpu; + int i; + + graph_ret = &ret_entry->ret; + call = &entry->graph_ent; + duration = graph_ret->rettime - graph_ret->calltime; + + if (data) { + struct fgraph_cpu_data *cpu_data; + + cpu_data = per_cpu_ptr(data->cpu_data, cpu); + + /* + * Comments display at + 1 to depth. Since + * this is a leaf function, keep the comments + * equal to this depth. + */ + cpu_data->depth = call->depth - 1; + + /* No need to keep this function around for this depth */ + if (call->depth < FTRACE_RETFUNC_DEPTH && + !WARN_ON_ONCE(call->depth < 0)) + cpu_data->enter_funcs[call->depth] = 0; + } + + /* Overhead and duration */ + print_graph_duration(tr, duration, s, flags); + + /* Function */ + for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) + trace_seq_putc(s, ' '); + + /* + * Write out the function return value if the option function-retval is + * enabled. 
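(Worked example from the editor, not part of this commit: on a 64-bit kernel a function returning the 32-bit value -14 (-EFAULT) appears in the ring buffer as 0x00000000fffffff2. The user-space model below sign-extends the low word and checks it against the error range in the same way print_graph_retval() above does:)

#include <stdint.h>
#include <stdio.h>

#define MAX_ERRNO	4095
#define IS_ERR_VALUE(x)	((uint64_t)(x) >= (uint64_t)-MAX_ERRNO)

int main(void)
{
	uint64_t retval = 0x00000000fffffff2ULL;	/* -EFAULT, zero-extended */
	int64_t err_code = retval;

	/* Bit 31 set and upper 32 bits clear: treat as a truncated s32. */
	if ((retval & (1ULL << 31)) && (retval >> 32) == 0)
		err_code = (int32_t)retval;		/* sign extension */

	/* Values outside the last 4095 of the address space are not errors. */
	if (!IS_ERR_VALUE(err_code))
		err_code = 0;

	printf("raw = %#llx, decoded error = %lld\n",
	       (unsigned long long)retval, (long long)err_code);
	return 0;
}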
+ */ + if (flags & __TRACE_GRAPH_PRINT_RETVAL) + print_graph_retval(s, graph_ret->retval, true, (void *)call->func, + !!(flags & TRACE_GRAPH_PRINT_RETVAL_HEX)); + else + trace_seq_printf(s, "%ps();\n", (void *)call->func); + + print_graph_irq(iter, graph_ret->func, TRACE_GRAPH_RET, + cpu, iter->ent->pid, flags); + + return trace_handle_return(s); +} + +static enum print_line_t +print_graph_entry_nested(struct trace_iterator *iter, + struct ftrace_graph_ent_entry *entry, + struct trace_seq *s, int cpu, u32 flags) +{ + struct ftrace_graph_ent *call = &entry->graph_ent; + struct fgraph_data *data = iter->private; + struct trace_array *tr = iter->tr; + int i; + + if (data) { + struct fgraph_cpu_data *cpu_data; + int cpu = iter->cpu; + + cpu_data = per_cpu_ptr(data->cpu_data, cpu); + cpu_data->depth = call->depth; + + /* Save this function pointer to see if the exit matches */ + if (call->depth < FTRACE_RETFUNC_DEPTH && + !WARN_ON_ONCE(call->depth < 0)) + cpu_data->enter_funcs[call->depth] = call->func; + } + + /* No time */ + print_graph_duration(tr, 0, s, flags | FLAGS_FILL_FULL); + + /* Function */ + for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) + trace_seq_putc(s, ' '); + + trace_seq_printf(s, "%ps() {\n", (void *)call->func); + + if (trace_seq_has_overflowed(s)) + return TRACE_TYPE_PARTIAL_LINE; + + /* + * we already consumed the current entry to check the next one + * and see if this is a leaf. + */ + return TRACE_TYPE_NO_CONSUME; +} + +static void +print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, + int type, unsigned long addr, u32 flags) +{ + struct fgraph_data *data = iter->private; + struct trace_entry *ent = iter->ent; + struct trace_array *tr = iter->tr; + int cpu = iter->cpu; + + /* Pid */ + verif_pid(s, ent->pid, cpu, data); + + if (type) + /* Interrupt */ + print_graph_irq(iter, addr, type, cpu, ent->pid, flags); + + if (!(tr->trace_flags & TRACE_ITER_CONTEXT_INFO)) + return; + + /* Absolute time */ + if (flags & TRACE_GRAPH_PRINT_ABS_TIME) + print_graph_abs_time(iter->ts, s); + + /* Relative time */ + if (flags & TRACE_GRAPH_PRINT_REL_TIME) + print_graph_rel_time(iter, s); + + /* Cpu */ + if (flags & TRACE_GRAPH_PRINT_CPU) + print_graph_cpu(s, cpu); + + /* Proc */ + if (flags & TRACE_GRAPH_PRINT_PROC) { + print_graph_proc(s, ent->pid); + trace_seq_puts(s, " | "); + } + + /* Latency format */ + if (tr->trace_flags & TRACE_ITER_LATENCY_FMT) + print_graph_lat_fmt(s, ent); + + return; +} + +/* + * Entry check for irq code + * + * returns 1 if + * - we are inside irq code + * - we just entered irq code + * + * returns 0 if + * - funcgraph-interrupts option is set + * - we are not inside irq code + */ +static int +check_irq_entry(struct trace_iterator *iter, u32 flags, + unsigned long addr, int depth) +{ + int cpu = iter->cpu; + int *depth_irq; + struct fgraph_data *data = iter->private; + + /* + * If we are either displaying irqs, or we got called as + * a graph event and private data does not exist, + * then we bypass the irq check. + */ + if ((flags & TRACE_GRAPH_PRINT_IRQS) || + (!data)) + return 0; + + depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq); + + /* + * We are inside the irq code + */ + if (*depth_irq >= 0) + return 1; + + if ((addr < (unsigned long)__irqentry_text_start) || + (addr >= (unsigned long)__irqentry_text_end)) + return 0; + + /* + * We are entering irq code. 
+ */ + *depth_irq = depth; + return 1; +} + +/* + * Return check for irq code + * + * returns 1 if + * - we are inside irq code + * - we just left irq code + * + * returns 0 if + * - funcgraph-interrupts option is set + * - we are not inside irq code + */ +static int +check_irq_return(struct trace_iterator *iter, u32 flags, int depth) +{ + int cpu = iter->cpu; + int *depth_irq; + struct fgraph_data *data = iter->private; + + /* + * If we are either displaying irqs, or we got called as + * a graph event and private data does not exist, + * then we bypass the irq check. + */ + if ((flags & TRACE_GRAPH_PRINT_IRQS) || + (!data)) + return 0; + + depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq); + + /* + * We are not inside the irq code. + */ + if (*depth_irq == -1) + return 0; + + /* + * We are inside the irq code, and this is returning entry. + * Let's not trace it and clear the entry depth, since + * we are out of irq code. + * + * This condition ensures that we 'leave the irq code' once + * we are out of the entry depth. Thus protecting us from + * the RETURN entry loss. + */ + if (*depth_irq >= depth) { + *depth_irq = -1; + return 1; + } + + /* + * We are inside the irq code, and this is not the entry. + */ + return 1; +} + +static enum print_line_t +print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, + struct trace_iterator *iter, u32 flags) +{ + struct fgraph_data *data = iter->private; + struct ftrace_graph_ent *call = &field->graph_ent; + struct ftrace_graph_ret_entry *leaf_ret; + static enum print_line_t ret; + int cpu = iter->cpu; + + if (check_irq_entry(iter, flags, call->func, call->depth)) + return TRACE_TYPE_HANDLED; + + print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags); + + leaf_ret = get_return_for_leaf(iter, field); + if (leaf_ret) + ret = print_graph_entry_leaf(iter, field, leaf_ret, s, flags); + else + ret = print_graph_entry_nested(iter, field, s, cpu, flags); + + if (data) { + /* + * If we failed to write our output, then we need to make + * note of it. Because we already consumed our entry. + */ + if (s->full) { + data->failed = 1; + data->cpu = cpu; + } else + data->failed = 0; + } + + return ret; +} + +static enum print_line_t +print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, + struct trace_entry *ent, struct trace_iterator *iter, + u32 flags) +{ + unsigned long long duration = trace->rettime - trace->calltime; + struct fgraph_data *data = iter->private; + struct trace_array *tr = iter->tr; + pid_t pid = ent->pid; + int cpu = iter->cpu; + int func_match = 1; + int i; + + if (check_irq_return(iter, flags, trace->depth)) + return TRACE_TYPE_HANDLED; + + if (data) { + struct fgraph_cpu_data *cpu_data; + int cpu = iter->cpu; + + cpu_data = per_cpu_ptr(data->cpu_data, cpu); + + /* + * Comments display at + 1 to depth. This is the + * return from a function, we now want the comments + * to display at the same level of the bracket. 
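/*
 * Illustrative aside (not part of the patch): a standalone simulation of the
 * depth_irq bookkeeping used by check_irq_entry()/check_irq_return() above.
 * depth_irq stays at -1 while outside interrupt code; once an entry event
 * falls inside the irqentry text it latches the current call depth, and every
 * event is skipped until a return event comes back to that depth or
 * shallower.  The event stream and the in_irq flag are made up for the
 * example; in the tracer the state is per CPU.
 */
#include <stdbool.h>
#include <stdio.h>

struct sim_event {
	bool	entry;		/* true = function entry, false = return */
	int	depth;		/* call depth of the event */
	bool	in_irq_text;	/* entry address inside __irqentry_text? */
};

static int depth_irq = -1;	/* per-CPU in the real tracer */

static bool skip_event(const struct sim_event *ev)
{
	if (ev->entry) {
		if (depth_irq >= 0)		/* already inside irq code */
			return true;
		if (!ev->in_irq_text)
			return false;
		depth_irq = ev->depth;		/* entering irq code */
		return true;
	}

	if (depth_irq == -1)			/* not inside irq code */
		return false;
	if (depth_irq >= ev->depth)		/* leaving irq code */
		depth_irq = -1;
	return true;
}

int main(void)
{
	const struct sim_event stream[] = {
		{ true,  0, false },	/* normal function        -> printed */
		{ true,  1, true  },	/* irq handler entry      -> skipped */
		{ true,  2, false },	/* nested call inside irq -> skipped */
		{ false, 2, false },	/* nested return          -> skipped */
		{ false, 1, false },	/* irq handler return     -> skipped */
		{ false, 0, false },	/* back to normal code    -> printed */
	};

	for (unsigned int i = 0; i < sizeof(stream) / sizeof(stream[0]); i++)
		printf("event %u: %s\n", i,
		       skip_event(&stream[i]) ? "skipped" : "printed");
	return 0;
}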
+ */ + cpu_data->depth = trace->depth - 1; + + if (trace->depth < FTRACE_RETFUNC_DEPTH && + !WARN_ON_ONCE(trace->depth < 0)) { + if (cpu_data->enter_funcs[trace->depth] != trace->func) + func_match = 0; + cpu_data->enter_funcs[trace->depth] = 0; + } + } + + print_graph_prologue(iter, s, 0, 0, flags); + + /* Overhead and duration */ + print_graph_duration(tr, duration, s, flags); + + /* Closing brace */ + for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) + trace_seq_putc(s, ' '); + + /* + * Always write out the function name and its return value if the + * function-retval option is enabled. + */ + if (flags & __TRACE_GRAPH_PRINT_RETVAL) { + print_graph_retval(s, trace->retval, false, (void *)trace->func, + !!(flags & TRACE_GRAPH_PRINT_RETVAL_HEX)); + } else { + /* + * If the return function does not have a matching entry, + * then the entry was lost. Instead of just printing + * the '}' and letting the user guess what function this + * belongs to, write out the function name. Always do + * that if the funcgraph-tail option is enabled. + */ + if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL)) + trace_seq_puts(s, "}\n"); + else + trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func); + } + + /* Overrun */ + if (flags & TRACE_GRAPH_PRINT_OVERRUN) + trace_seq_printf(s, " (Overruns: %u)\n", + trace->overrun); + + print_graph_irq(iter, trace->func, TRACE_GRAPH_RET, + cpu, pid, flags); + + return trace_handle_return(s); +} + +static enum print_line_t +print_graph_comment(struct trace_seq *s, struct trace_entry *ent, + struct trace_iterator *iter, u32 flags) +{ + struct trace_array *tr = iter->tr; + unsigned long sym_flags = (tr->trace_flags & TRACE_ITER_SYM_MASK); + struct fgraph_data *data = iter->private; + struct trace_event *event; + int depth = 0; + int ret; + int i; + + if (data) + depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth; + + print_graph_prologue(iter, s, 0, 0, flags); + + /* No time */ + print_graph_duration(tr, 0, s, flags | FLAGS_FILL_FULL); + + /* Indentation */ + if (depth > 0) + for (i = 0; i < (depth + 1) * TRACE_GRAPH_INDENT; i++) + trace_seq_putc(s, ' '); + + /* The comment */ + trace_seq_puts(s, "/* "); + + switch (iter->ent->type) { + case TRACE_BPUTS: + ret = trace_print_bputs_msg_only(iter); + if (ret != TRACE_TYPE_HANDLED) + return ret; + break; + case TRACE_BPRINT: + ret = trace_print_bprintk_msg_only(iter); + if (ret != TRACE_TYPE_HANDLED) + return ret; + break; + case TRACE_PRINT: + ret = trace_print_printk_msg_only(iter); + if (ret != TRACE_TYPE_HANDLED) + return ret; + break; + default: + event = ftrace_find_event(ent->type); + if (!event) + return TRACE_TYPE_UNHANDLED; + + ret = event->funcs->trace(iter, sym_flags, event); + if (ret != TRACE_TYPE_HANDLED) + return ret; + } + + if (trace_seq_has_overflowed(s)) + goto out; + + /* Strip ending newline */ + if (s->buffer[s->seq.len - 1] == '\n') { + s->buffer[s->seq.len - 1] = '\0'; + s->seq.len--; + } + + trace_seq_puts(s, " */\n"); + out: + return trace_handle_return(s); +} + + +enum print_line_t +print_graph_function_flags(struct trace_iterator *iter, u32 flags) +{ + struct ftrace_graph_ent_entry *field; + struct fgraph_data *data = iter->private; + struct trace_entry *entry = iter->ent; + struct trace_seq *s = &iter->seq; + int cpu = iter->cpu; + int ret; + + if (data && per_cpu_ptr(data->cpu_data, cpu)->ignore) { + per_cpu_ptr(data->cpu_data, cpu)->ignore = 0; + return TRACE_TYPE_HANDLED; + } + + /* + * If the last output failed, there's a possibility we need + * to print out the missing entry 
which would never go out. + */ + if (data && data->failed) { + field = &data->ent; + iter->cpu = data->cpu; + ret = print_graph_entry(field, s, iter, flags); + if (ret == TRACE_TYPE_HANDLED && iter->cpu != cpu) { + per_cpu_ptr(data->cpu_data, iter->cpu)->ignore = 1; + ret = TRACE_TYPE_NO_CONSUME; + } + iter->cpu = cpu; + return ret; + } + + switch (entry->type) { + case TRACE_GRAPH_ENT: { + /* + * print_graph_entry() may consume the current event, + * thus @field may become invalid, so we need to save it. + * sizeof(struct ftrace_graph_ent_entry) is very small, + * it can be safely saved at the stack. + */ + struct ftrace_graph_ent_entry saved; + trace_assign_type(field, entry); + saved = *field; + return print_graph_entry(&saved, s, iter, flags); + } + case TRACE_GRAPH_RET: { + struct ftrace_graph_ret_entry *field; + trace_assign_type(field, entry); + return print_graph_return(&field->ret, s, entry, iter, flags); + } + case TRACE_STACK: + case TRACE_FN: + /* dont trace stack and functions as comments */ + return TRACE_TYPE_UNHANDLED; + + default: + return print_graph_comment(s, entry, iter, flags); + } + + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t +print_graph_function(struct trace_iterator *iter) +{ + return print_graph_function_flags(iter, tracer_flags.val); +} + +static enum print_line_t +print_graph_function_event(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + return print_graph_function(iter); +} + +static void print_lat_header(struct seq_file *s, u32 flags) +{ + static const char spaces[] = " " /* 16 spaces */ + " " /* 4 spaces */ + " "; /* 17 spaces */ + int size = 0; + + if (flags & TRACE_GRAPH_PRINT_ABS_TIME) + size += 16; + if (flags & TRACE_GRAPH_PRINT_REL_TIME) + size += 16; + if (flags & TRACE_GRAPH_PRINT_CPU) + size += 4; + if (flags & TRACE_GRAPH_PRINT_PROC) + size += 17; + + seq_printf(s, "#%.*s _-----=> irqs-off \n", size, spaces); + seq_printf(s, "#%.*s / _----=> need-resched \n", size, spaces); + seq_printf(s, "#%.*s| / _---=> hardirq/softirq \n", size, spaces); + seq_printf(s, "#%.*s|| / _--=> preempt-depth \n", size, spaces); + seq_printf(s, "#%.*s||| / \n", size, spaces); +} + +static void __print_graph_headers_flags(struct trace_array *tr, + struct seq_file *s, u32 flags) +{ + int lat = tr->trace_flags & TRACE_ITER_LATENCY_FMT; + + if (lat) + print_lat_header(s, flags); + + /* 1st line */ + seq_putc(s, '#'); + if (flags & TRACE_GRAPH_PRINT_ABS_TIME) + seq_puts(s, " TIME "); + if (flags & TRACE_GRAPH_PRINT_REL_TIME) + seq_puts(s, " REL TIME "); + if (flags & TRACE_GRAPH_PRINT_CPU) + seq_puts(s, " CPU"); + if (flags & TRACE_GRAPH_PRINT_PROC) + seq_puts(s, " TASK/PID "); + if (lat) + seq_puts(s, "|||| "); + if (flags & TRACE_GRAPH_PRINT_DURATION) + seq_puts(s, " DURATION "); + seq_puts(s, " FUNCTION CALLS\n"); + + /* 2nd line */ + seq_putc(s, '#'); + if (flags & TRACE_GRAPH_PRINT_ABS_TIME) + seq_puts(s, " | "); + if (flags & TRACE_GRAPH_PRINT_REL_TIME) + seq_puts(s, " | "); + if (flags & TRACE_GRAPH_PRINT_CPU) + seq_puts(s, " | "); + if (flags & TRACE_GRAPH_PRINT_PROC) + seq_puts(s, " | | "); + if (lat) + seq_puts(s, "|||| "); + if (flags & TRACE_GRAPH_PRINT_DURATION) + seq_puts(s, " | | "); + seq_puts(s, " | | | |\n"); +} + +static void print_graph_headers(struct seq_file *s) +{ + print_graph_headers_flags(s, tracer_flags.val); +} + +void print_graph_headers_flags(struct seq_file *s, u32 flags) +{ + struct trace_iterator *iter = s->private; + struct trace_array *tr = iter->tr; + + if (!(tr->trace_flags & 
TRACE_ITER_CONTEXT_INFO)) + return; + + if (tr->trace_flags & TRACE_ITER_LATENCY_FMT) { + /* print nothing if the buffers are empty */ + if (trace_empty(iter)) + return; + + print_trace_header(s, iter); + } + + __print_graph_headers_flags(tr, s, flags); +} + +void graph_trace_open(struct trace_iterator *iter) +{ + /* pid and depth on the last trace processed */ + struct fgraph_data *data; + gfp_t gfpflags; + int cpu; + + iter->private = NULL; + + /* We can be called in atomic context via ftrace_dump() */ + gfpflags = (in_atomic() || irqs_disabled()) ? GFP_ATOMIC : GFP_KERNEL; + + data = kzalloc(sizeof(*data), gfpflags); + if (!data) + goto out_err; + + data->cpu_data = alloc_percpu_gfp(struct fgraph_cpu_data, gfpflags); + if (!data->cpu_data) + goto out_err_free; + + for_each_possible_cpu(cpu) { + pid_t *pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid); + int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); + int *ignore = &(per_cpu_ptr(data->cpu_data, cpu)->ignore); + int *depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq); + + *pid = -1; + *depth = 0; + *ignore = 0; + *depth_irq = -1; + } + + iter->private = data; + + return; + + out_err_free: + kfree(data); + out_err: + pr_warn("function graph tracer: not enough memory\n"); +} + +void graph_trace_close(struct trace_iterator *iter) +{ + struct fgraph_data *data = iter->private; + + if (data) { + free_percpu(data->cpu_data); + kfree(data); + } +} + +static int +func_graph_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) +{ + if (bit == TRACE_GRAPH_PRINT_IRQS) + ftrace_graph_skip_irqs = !set; + + if (bit == TRACE_GRAPH_SLEEP_TIME) + ftrace_graph_sleep_time_control(set); + + if (bit == TRACE_GRAPH_GRAPH_TIME) + ftrace_graph_graph_time_control(set); + + return 0; +} + +static struct trace_event_functions graph_functions = { + .trace = print_graph_function_event, +}; + +static struct trace_event graph_trace_entry_event = { + .type = TRACE_GRAPH_ENT, + .funcs = &graph_functions, +}; + +static struct trace_event graph_trace_ret_event = { + .type = TRACE_GRAPH_RET, + .funcs = &graph_functions +}; + +static struct tracer graph_trace __tracer_data = { + .name = "function_graph", + .update_thresh = graph_trace_update_thresh, + .open = graph_trace_open, + .pipe_open = graph_trace_open, + .close = graph_trace_close, + .pipe_close = graph_trace_close, + .init = graph_trace_init, + .reset = graph_trace_reset, + .print_line = print_graph_function, + .print_header = print_graph_headers, + .flags = &tracer_flags, + .set_flag = func_graph_set_flag, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_function_graph, +#endif +}; + + +static ssize_t +graph_depth_write(struct file *filp, const char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + unsigned long val; + int ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + fgraph_max_depth = val; + + *ppos += cnt; + + return cnt; +} + +static ssize_t +graph_depth_read(struct file *filp, char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + char buf[15]; /* More than enough to hold UINT_MAX + "\n"*/ + int n; + + n = sprintf(buf, "%d\n", fgraph_max_depth); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, n); +} + +static const struct file_operations graph_depth_fops = { + .open = tracing_open_generic, + .write = graph_depth_write, + .read = graph_depth_read, + .llseek = generic_file_llseek, +}; + +static __init int init_graph_tracefs(void) +{ + int ret; + + ret = tracing_init_dentry(); + if (ret) + return 0; + + 
trace_create_file("max_graph_depth", TRACE_MODE_WRITE, NULL, + NULL, &graph_depth_fops); + + return 0; +} +fs_initcall(init_graph_tracefs); + +static __init int init_graph_trace(void) +{ + max_bytes_for_cpu = snprintf(NULL, 0, "%u", nr_cpu_ids - 1); + + if (!register_trace_event(&graph_trace_entry_event)) { + pr_warn("Warning: could not register graph trace events\n"); + return 1; + } + + if (!register_trace_event(&graph_trace_ret_event)) { + pr_warn("Warning: could not register graph trace events\n"); + return 1; + } + + return register_tracer(&graph_trace); +} + +core_initcall(init_graph_trace); diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c new file mode 100644 index 0000000000..b791524a65 --- /dev/null +++ b/kernel/trace/trace_hwlat.c @@ -0,0 +1,891 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * trace_hwlat.c - A simple Hardware Latency detector. + * + * Use this tracer to detect large system latencies induced by the behavior of + * certain underlying system hardware or firmware, independent of Linux itself. + * The code was developed originally to detect the presence of SMIs on Intel + * and AMD systems, although there is no dependency upon x86 herein. + * + * The classical example usage of this tracer is in detecting the presence of + * SMIs or System Management Interrupts on Intel and AMD systems. An SMI is a + * somewhat special form of hardware interrupt spawned from earlier CPU debug + * modes in which the (BIOS/EFI/etc.) firmware arranges for the South Bridge + * LPC (or other device) to generate a special interrupt under certain + * circumstances, for example, upon expiration of a special SMI timer device, + * due to certain external thermal readings, on certain I/O address accesses, + * and other situations. An SMI hits a special CPU pin, triggers a special + * SMI mode (complete with special memory map), and the OS is unaware. + * + * Although certain hardware-inducing latencies are necessary (for example, + * a modern system often requires an SMI handler for correct thermal control + * and remote management) they can wreak havoc upon any OS-level performance + * guarantees toward low-latency, especially when the OS is not even made + * aware of the presence of these interrupts. For this reason, we need a + * somewhat brute force mechanism to detect these interrupts. In this case, + * we do it by hogging all of the CPU(s) for configurable timer intervals, + * sampling the built-in CPU timer, looking for discontiguous readings. + * + * WARNING: This implementation necessarily introduces latencies. Therefore, + * you should NEVER use this tracer while running in a production + * environment requiring any kind of low-latency performance + * guarantee(s). + * + * Copyright (C) 2008-2009 Jon Masters, Red Hat, Inc. <jcm@redhat.com> + * Copyright (C) 2013-2016 Steven Rostedt, Red Hat, Inc. 
<srostedt@redhat.com> + * + * Includes useful feedback from Clark Williams <williams@redhat.com> + * + */ +#include <linux/kthread.h> +#include <linux/tracefs.h> +#include <linux/uaccess.h> +#include <linux/cpumask.h> +#include <linux/delay.h> +#include <linux/sched/clock.h> +#include "trace.h" + +static struct trace_array *hwlat_trace; + +#define U64STR_SIZE 22 /* 20 digits max */ + +#define BANNER "hwlat_detector: " +#define DEFAULT_SAMPLE_WINDOW 1000000 /* 1s */ +#define DEFAULT_SAMPLE_WIDTH 500000 /* 0.5s */ +#define DEFAULT_LAT_THRESHOLD 10 /* 10us */ + +static struct dentry *hwlat_sample_width; /* sample width us */ +static struct dentry *hwlat_sample_window; /* sample window us */ +static struct dentry *hwlat_thread_mode; /* hwlat thread mode */ + +enum { + MODE_NONE = 0, + MODE_ROUND_ROBIN, + MODE_PER_CPU, + MODE_MAX +}; +static char *thread_mode_str[] = { "none", "round-robin", "per-cpu" }; + +/* Save the previous tracing_thresh value */ +static unsigned long save_tracing_thresh; + +/* runtime kthread data */ +struct hwlat_kthread_data { + struct task_struct *kthread; + /* NMI timestamp counters */ + u64 nmi_ts_start; + u64 nmi_total_ts; + int nmi_count; + int nmi_cpu; +}; + +static struct hwlat_kthread_data hwlat_single_cpu_data; +static DEFINE_PER_CPU(struct hwlat_kthread_data, hwlat_per_cpu_data); + +/* Tells NMIs to call back to the hwlat tracer to record timestamps */ +bool trace_hwlat_callback_enabled; + +/* If the user changed threshold, remember it */ +static u64 last_tracing_thresh = DEFAULT_LAT_THRESHOLD * NSEC_PER_USEC; + +/* Individual latency samples are stored here when detected. */ +struct hwlat_sample { + u64 seqnum; /* unique sequence */ + u64 duration; /* delta */ + u64 outer_duration; /* delta (outer loop) */ + u64 nmi_total_ts; /* Total time spent in NMIs */ + struct timespec64 timestamp; /* wall time */ + int nmi_count; /* # NMIs during this sample */ + int count; /* # of iterations over thresh */ +}; + +/* keep the global state somewhere. 
*/ +static struct hwlat_data { + + struct mutex lock; /* protect changes */ + + u64 count; /* total since reset */ + + u64 sample_window; /* total sampling window (on+off) */ + u64 sample_width; /* active sampling portion of window */ + + int thread_mode; /* thread mode */ + +} hwlat_data = { + .sample_window = DEFAULT_SAMPLE_WINDOW, + .sample_width = DEFAULT_SAMPLE_WIDTH, + .thread_mode = MODE_ROUND_ROBIN +}; + +static struct hwlat_kthread_data *get_cpu_data(void) +{ + if (hwlat_data.thread_mode == MODE_PER_CPU) + return this_cpu_ptr(&hwlat_per_cpu_data); + else + return &hwlat_single_cpu_data; +} + +static bool hwlat_busy; + +static void trace_hwlat_sample(struct hwlat_sample *sample) +{ + struct trace_array *tr = hwlat_trace; + struct trace_event_call *call = &event_hwlat; + struct trace_buffer *buffer = tr->array_buffer.buffer; + struct ring_buffer_event *event; + struct hwlat_entry *entry; + + event = trace_buffer_lock_reserve(buffer, TRACE_HWLAT, sizeof(*entry), + tracing_gen_ctx()); + if (!event) + return; + entry = ring_buffer_event_data(event); + entry->seqnum = sample->seqnum; + entry->duration = sample->duration; + entry->outer_duration = sample->outer_duration; + entry->timestamp = sample->timestamp; + entry->nmi_total_ts = sample->nmi_total_ts; + entry->nmi_count = sample->nmi_count; + entry->count = sample->count; + + if (!call_filter_check_discard(call, entry, buffer, event)) + trace_buffer_unlock_commit_nostack(buffer, event); +} + +/* Macros to encapsulate the time capturing infrastructure */ +#define time_type u64 +#define time_get() trace_clock_local() +#define time_to_us(x) div_u64(x, 1000) +#define time_sub(a, b) ((a) - (b)) +#define init_time(a, b) (a = b) +#define time_u64(a) a + +void trace_hwlat_callback(bool enter) +{ + struct hwlat_kthread_data *kdata = get_cpu_data(); + + if (!kdata->kthread) + return; + + /* + * Currently trace_clock_local() calls sched_clock() and the + * generic version is not NMI safe. + */ + if (!IS_ENABLED(CONFIG_GENERIC_SCHED_CLOCK)) { + if (enter) + kdata->nmi_ts_start = time_get(); + else + kdata->nmi_total_ts += time_get() - kdata->nmi_ts_start; + } + + if (enter) + kdata->nmi_count++; +} + +/* + * hwlat_err - report a hwlat error. + */ +#define hwlat_err(msg) ({ \ + struct trace_array *tr = hwlat_trace; \ + \ + trace_array_printk_buf(tr->array_buffer.buffer, _THIS_IP_, msg); \ +}) + +/** + * get_sample - sample the CPU TSC and look for likely hardware latencies + * + * Used to repeatedly capture the CPU TSC (or similar), looking for potential + * hardware-induced latency. Called with interrupts disabled and with + * hwlat_data.lock held. 
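/*
 * Illustrative aside (not part of the patch): a userspace sketch of the
 * sampling idea implemented by get_sample() below.  Two back-to-back
 * timestamps are taken in a tight loop; the gap inside the pair (t2 - t1)
 * and the gap between consecutive pairs (previous t2 to current t1) should
 * both be tiny, so any large gap is time the CPU spent somewhere it could
 * not be observed.  In userspace the "latency" will usually just be
 * preemption, and the 100us threshold / 50ms window below are arbitrary;
 * this shows the shape of the algorithm, not a usable detector.
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>

static uint64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ull + ts.tv_nsec;
}

int main(void)
{
	const uint64_t thresh_ns = 100 * 1000;		/* 100us */
	const uint64_t width_ns  = 50 * 1000 * 1000;	/* 50ms sampling window */
	uint64_t start = now_ns(), last_t2 = 0;
	uint64_t max_inner = 0, max_outer = 0;
	uint64_t t1, t2;

	do {
		t1 = now_ns();			/* look for a discontinuity ... */
		t2 = now_ns();			/* ... between two adjacent reads */

		if (last_t2 && t1 - last_t2 > max_outer)
			max_outer = t1 - last_t2;	/* gap between loop passes */
		if (t2 - t1 > max_inner)
			max_inner = t2 - t1;		/* gap inside the pair */
		last_t2 = t2;
	} while (t2 - start <= width_ns);

	if (max_inner > thresh_ns || max_outer > thresh_ns)
		printf("latency detected: inner %" PRIu64 " ns, outer %" PRIu64 " ns\n",
		       max_inner, max_outer);
	else
		printf("no gap above %" PRIu64 " ns in this window\n", thresh_ns);
	return 0;
}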
+ */ +static int get_sample(void) +{ + struct hwlat_kthread_data *kdata = get_cpu_data(); + struct trace_array *tr = hwlat_trace; + struct hwlat_sample s; + time_type start, t1, t2, last_t2; + s64 diff, outer_diff, total, last_total = 0; + u64 sample = 0; + u64 thresh = tracing_thresh; + u64 outer_sample = 0; + int ret = -1; + unsigned int count = 0; + + do_div(thresh, NSEC_PER_USEC); /* modifies interval value */ + + kdata->nmi_total_ts = 0; + kdata->nmi_count = 0; + /* Make sure NMIs see this first */ + barrier(); + + trace_hwlat_callback_enabled = true; + + init_time(last_t2, 0); + start = time_get(); /* start timestamp */ + outer_diff = 0; + + do { + + t1 = time_get(); /* we'll look for a discontinuity */ + t2 = time_get(); + + if (time_u64(last_t2)) { + /* Check the delta from outer loop (t2 to next t1) */ + outer_diff = time_to_us(time_sub(t1, last_t2)); + /* This shouldn't happen */ + if (outer_diff < 0) { + hwlat_err(BANNER "time running backwards\n"); + goto out; + } + if (outer_diff > outer_sample) + outer_sample = outer_diff; + } + last_t2 = t2; + + total = time_to_us(time_sub(t2, start)); /* sample width */ + + /* Check for possible overflows */ + if (total < last_total) { + hwlat_err("Time total overflowed\n"); + break; + } + last_total = total; + + /* This checks the inner loop (t1 to t2) */ + diff = time_to_us(time_sub(t2, t1)); /* current diff */ + + if (diff > thresh || outer_diff > thresh) { + if (!count) + ktime_get_real_ts64(&s.timestamp); + count++; + } + + /* This shouldn't happen */ + if (diff < 0) { + hwlat_err(BANNER "time running backwards\n"); + goto out; + } + + if (diff > sample) + sample = diff; /* only want highest value */ + + } while (total <= hwlat_data.sample_width); + + barrier(); /* finish the above in the view for NMIs */ + trace_hwlat_callback_enabled = false; + barrier(); /* Make sure nmi_total_ts is no longer updated */ + + ret = 0; + + /* If we exceed the threshold value, we have found a hardware latency */ + if (sample > thresh || outer_sample > thresh) { + u64 latency; + + ret = 1; + + /* We read in microseconds */ + if (kdata->nmi_total_ts) + do_div(kdata->nmi_total_ts, NSEC_PER_USEC); + + hwlat_data.count++; + s.seqnum = hwlat_data.count; + s.duration = sample; + s.outer_duration = outer_sample; + s.nmi_total_ts = kdata->nmi_total_ts; + s.nmi_count = kdata->nmi_count; + s.count = count; + trace_hwlat_sample(&s); + + latency = max(sample, outer_sample); + + /* Keep a running maximum ever recorded hardware latency */ + if (latency > tr->max_latency) { + tr->max_latency = latency; + latency_fsnotify(tr); + } + } + +out: + return ret; +} + +static struct cpumask save_cpumask; + +static void move_to_next_cpu(void) +{ + struct cpumask *current_mask = &save_cpumask; + struct trace_array *tr = hwlat_trace; + int next_cpu; + + /* + * If for some reason the user modifies the CPU affinity + * of this thread, then stop migrating for the duration + * of the current test. + */ + if (!cpumask_equal(current_mask, current->cpus_ptr)) + goto change_mode; + + cpus_read_lock(); + cpumask_and(current_mask, cpu_online_mask, tr->tracing_cpumask); + next_cpu = cpumask_next(raw_smp_processor_id(), current_mask); + cpus_read_unlock(); + + if (next_cpu >= nr_cpu_ids) + next_cpu = cpumask_first(current_mask); + + if (next_cpu >= nr_cpu_ids) /* Shouldn't happen! 
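/*
 * Illustrative aside (not part of the patch): the round-robin CPU selection
 * done by move_to_next_cpu() above, reduced to a plain bitmask.
 * cpumask_next() finds the first allowed CPU strictly after the current one
 * and returns nr_cpu_ids when there is none, at which point the code wraps
 * around to cpumask_first().  The 8-CPU mask below is made up for the
 * example.
 */
#include <stdio.h>

#define SKETCH_NR_CPUS 8

/* First set bit at or above 'from', or SKETCH_NR_CPUS if none. */
static int mask_next(unsigned int mask, int from)
{
	for (int cpu = from; cpu < SKETCH_NR_CPUS; cpu++)
		if (mask & (1u << cpu))
			return cpu;
	return SKETCH_NR_CPUS;
}

static int next_round_robin_cpu(unsigned int allowed, int current_cpu)
{
	int next = mask_next(allowed, current_cpu + 1);	/* cpumask_next() */

	if (next >= SKETCH_NR_CPUS)
		next = mask_next(allowed, 0);		/* cpumask_first() */
	return next;	/* still SKETCH_NR_CPUS if the mask is empty */
}

int main(void)
{
	unsigned int allowed = 0x2d;	/* CPUs 0, 2, 3 and 5 allowed */
	int cpu = 0;

	for (int i = 0; i < 6; i++) {
		cpu = next_round_robin_cpu(allowed, cpu);
		printf("sample %d runs on cpu %d\n", i, cpu);	/* 2 3 5 0 2 3 */
	}
	return 0;
}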
*/ + goto change_mode; + + cpumask_clear(current_mask); + cpumask_set_cpu(next_cpu, current_mask); + + set_cpus_allowed_ptr(current, current_mask); + return; + + change_mode: + hwlat_data.thread_mode = MODE_NONE; + pr_info(BANNER "cpumask changed while in round-robin mode, switching to mode none\n"); +} + +/* + * kthread_fn - The CPU time sampling/hardware latency detection kernel thread + * + * Used to periodically sample the CPU TSC via a call to get_sample. We + * disable interrupts, which does (intentionally) introduce latency since we + * need to ensure nothing else might be running (and thus preempting). + * Obviously this should never be used in production environments. + * + * Executes one loop interaction on each CPU in tracing_cpumask sysfs file. + */ +static int kthread_fn(void *data) +{ + u64 interval; + + while (!kthread_should_stop()) { + + if (hwlat_data.thread_mode == MODE_ROUND_ROBIN) + move_to_next_cpu(); + + local_irq_disable(); + get_sample(); + local_irq_enable(); + + mutex_lock(&hwlat_data.lock); + interval = hwlat_data.sample_window - hwlat_data.sample_width; + mutex_unlock(&hwlat_data.lock); + + do_div(interval, USEC_PER_MSEC); /* modifies interval value */ + + /* Always sleep for at least 1ms */ + if (interval < 1) + interval = 1; + + if (msleep_interruptible(interval)) + break; + } + + return 0; +} + +/* + * stop_stop_kthread - Inform the hardware latency sampling/detector kthread to stop + * + * This kicks the running hardware latency sampling/detector kernel thread and + * tells it to stop sampling now. Use this on unload and at system shutdown. + */ +static void stop_single_kthread(void) +{ + struct hwlat_kthread_data *kdata = get_cpu_data(); + struct task_struct *kthread; + + cpus_read_lock(); + kthread = kdata->kthread; + + if (!kthread) + goto out_put_cpus; + + kthread_stop(kthread); + kdata->kthread = NULL; + +out_put_cpus: + cpus_read_unlock(); +} + + +/* + * start_single_kthread - Kick off the hardware latency sampling/detector kthread + * + * This starts the kernel thread that will sit and sample the CPU timestamp + * counter (TSC or similar) and look for potential hardware latencies. 
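/*
 * Illustrative aside (not part of the patch): the sleep-time arithmetic in
 * kthread_fn() above.  window and width are kept in microseconds; the thread
 * busy-samples for 'width', then sleeps for the rest of the window,
 * converted to milliseconds for msleep() and clamped to at least 1ms.  With
 * the defaults (window 1000000us, width 500000us) that is a 500ms sleep.
 */
#include <stdio.h>

static unsigned long hwlat_sleep_ms(unsigned long window_us, unsigned long width_us)
{
	unsigned long interval = window_us - width_us;	/* idle part of the window, in us */

	interval /= 1000;		/* do_div(interval, USEC_PER_MSEC) */
	if (interval < 1)
		interval = 1;		/* always sleep for at least 1ms */
	return interval;
}

int main(void)
{
	printf("%lu ms\n", hwlat_sleep_ms(1000000, 500000));	/* 500 */
	printf("%lu ms\n", hwlat_sleep_ms(2000, 1500));		/* 1 (0.5ms, clamped) */
	return 0;
}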
+ */ +static int start_single_kthread(struct trace_array *tr) +{ + struct hwlat_kthread_data *kdata = get_cpu_data(); + struct cpumask *current_mask = &save_cpumask; + struct task_struct *kthread; + int next_cpu; + + cpus_read_lock(); + if (kdata->kthread) + goto out_put_cpus; + + kthread = kthread_create(kthread_fn, NULL, "hwlatd"); + if (IS_ERR(kthread)) { + pr_err(BANNER "could not start sampling thread\n"); + cpus_read_unlock(); + return -ENOMEM; + } + + /* Just pick the first CPU on first iteration */ + cpumask_and(current_mask, cpu_online_mask, tr->tracing_cpumask); + + if (hwlat_data.thread_mode == MODE_ROUND_ROBIN) { + next_cpu = cpumask_first(current_mask); + cpumask_clear(current_mask); + cpumask_set_cpu(next_cpu, current_mask); + + } + + set_cpus_allowed_ptr(kthread, current_mask); + + kdata->kthread = kthread; + wake_up_process(kthread); + +out_put_cpus: + cpus_read_unlock(); + return 0; +} + +/* + * stop_cpu_kthread - Stop a hwlat cpu kthread + */ +static void stop_cpu_kthread(unsigned int cpu) +{ + struct task_struct *kthread; + + kthread = per_cpu(hwlat_per_cpu_data, cpu).kthread; + if (kthread) + kthread_stop(kthread); + per_cpu(hwlat_per_cpu_data, cpu).kthread = NULL; +} + +/* + * stop_per_cpu_kthreads - Inform the hardware latency sampling/detector kthread to stop + * + * This kicks the running hardware latency sampling/detector kernel threads and + * tells it to stop sampling now. Use this on unload and at system shutdown. + */ +static void stop_per_cpu_kthreads(void) +{ + unsigned int cpu; + + cpus_read_lock(); + for_each_online_cpu(cpu) + stop_cpu_kthread(cpu); + cpus_read_unlock(); +} + +/* + * start_cpu_kthread - Start a hwlat cpu kthread + */ +static int start_cpu_kthread(unsigned int cpu) +{ + struct task_struct *kthread; + + /* Do not start a new hwlatd thread if it is already running */ + if (per_cpu(hwlat_per_cpu_data, cpu).kthread) + return 0; + + kthread = kthread_run_on_cpu(kthread_fn, NULL, cpu, "hwlatd/%u"); + if (IS_ERR(kthread)) { + pr_err(BANNER "could not start sampling thread\n"); + return -ENOMEM; + } + + per_cpu(hwlat_per_cpu_data, cpu).kthread = kthread; + + return 0; +} + +#ifdef CONFIG_HOTPLUG_CPU +static void hwlat_hotplug_workfn(struct work_struct *dummy) +{ + struct trace_array *tr = hwlat_trace; + unsigned int cpu = smp_processor_id(); + + mutex_lock(&trace_types_lock); + mutex_lock(&hwlat_data.lock); + cpus_read_lock(); + + if (!hwlat_busy || hwlat_data.thread_mode != MODE_PER_CPU) + goto out_unlock; + + if (!cpumask_test_cpu(cpu, tr->tracing_cpumask)) + goto out_unlock; + + start_cpu_kthread(cpu); + +out_unlock: + cpus_read_unlock(); + mutex_unlock(&hwlat_data.lock); + mutex_unlock(&trace_types_lock); +} + +static DECLARE_WORK(hwlat_hotplug_work, hwlat_hotplug_workfn); + +/* + * hwlat_cpu_init - CPU hotplug online callback function + */ +static int hwlat_cpu_init(unsigned int cpu) +{ + schedule_work_on(cpu, &hwlat_hotplug_work); + return 0; +} + +/* + * hwlat_cpu_die - CPU hotplug offline callback function + */ +static int hwlat_cpu_die(unsigned int cpu) +{ + stop_cpu_kthread(cpu); + return 0; +} + +static void hwlat_init_hotplug_support(void) +{ + int ret; + + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "trace/hwlat:online", + hwlat_cpu_init, hwlat_cpu_die); + if (ret < 0) + pr_warn(BANNER "Error to init cpu hotplug support\n"); + + return; +} +#else /* CONFIG_HOTPLUG_CPU */ +static void hwlat_init_hotplug_support(void) +{ + return; +} +#endif /* CONFIG_HOTPLUG_CPU */ + +/* + * start_per_cpu_kthreads - Kick off the hardware latency 
sampling/detector kthreads + * + * This starts the kernel threads that will sit on potentially all cpus and + * sample the CPU timestamp counter (TSC or similar) and look for potential + * hardware latencies. + */ +static int start_per_cpu_kthreads(struct trace_array *tr) +{ + struct cpumask *current_mask = &save_cpumask; + unsigned int cpu; + int retval; + + cpus_read_lock(); + /* + * Run only on CPUs in which hwlat is allowed to run. + */ + cpumask_and(current_mask, cpu_online_mask, tr->tracing_cpumask); + + for_each_cpu(cpu, current_mask) { + retval = start_cpu_kthread(cpu); + if (retval) + goto out_error; + } + cpus_read_unlock(); + + return 0; + +out_error: + cpus_read_unlock(); + stop_per_cpu_kthreads(); + return retval; +} + +static void *s_mode_start(struct seq_file *s, loff_t *pos) +{ + int mode = *pos; + + mutex_lock(&hwlat_data.lock); + + if (mode >= MODE_MAX) + return NULL; + + return pos; +} + +static void *s_mode_next(struct seq_file *s, void *v, loff_t *pos) +{ + int mode = ++(*pos); + + if (mode >= MODE_MAX) + return NULL; + + return pos; +} + +static int s_mode_show(struct seq_file *s, void *v) +{ + loff_t *pos = v; + int mode = *pos; + + if (mode == hwlat_data.thread_mode) + seq_printf(s, "[%s]", thread_mode_str[mode]); + else + seq_printf(s, "%s", thread_mode_str[mode]); + + if (mode < MODE_MAX - 1) /* if mode is any but last */ + seq_puts(s, " "); + + return 0; +} + +static void s_mode_stop(struct seq_file *s, void *v) +{ + seq_puts(s, "\n"); + mutex_unlock(&hwlat_data.lock); +} + +static const struct seq_operations thread_mode_seq_ops = { + .start = s_mode_start, + .next = s_mode_next, + .show = s_mode_show, + .stop = s_mode_stop +}; + +static int hwlat_mode_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &thread_mode_seq_ops); +}; + +static void hwlat_tracer_start(struct trace_array *tr); +static void hwlat_tracer_stop(struct trace_array *tr); + +/** + * hwlat_mode_write - Write function for "mode" entry + * @filp: The active open file structure + * @ubuf: The user buffer that contains the value to write + * @cnt: The maximum number of bytes to write to "file" + * @ppos: The current position in @file + * + * This function provides a write implementation for the "mode" interface + * to the hardware latency detector. hwlatd has different operation modes. + * The "none" sets the allowed cpumask for a single hwlatd thread at the + * startup and lets the scheduler handle the migration. The default mode is + * the "round-robin" one, in which a single hwlatd thread runs, migrating + * among the allowed CPUs in a round-robin fashion. The "per-cpu" mode + * creates one hwlatd thread per allowed CPU. + */ +static ssize_t hwlat_mode_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = hwlat_trace; + const char *mode; + char buf[64]; + int ret, i; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + mode = strstrip(buf); + + ret = -EINVAL; + + /* + * trace_types_lock is taken to avoid concurrency on start/stop + * and hwlat_busy. 
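/*
 * Illustrative aside (not part of the patch): the mode lookup that
 * hwlat_mode_write() performs below.  The stripped user string is compared
 * against the thread_mode_str[] table and, on a match, the index becomes the
 * new thread_mode; anything else leaves the write failing with -EINVAL.  The
 * tracer is stopped before the switch and restarted afterwards so the thread
 * layout matches the new mode.  Names here are local to the sketch.
 */
#include <stdio.h>
#include <string.h>

enum { SK_MODE_NONE, SK_MODE_ROUND_ROBIN, SK_MODE_PER_CPU, SK_MODE_MAX };
static const char *const sk_mode_str[] = { "none", "round-robin", "per-cpu" };

/* Return the mode index, or -1 for an unknown mode string. */
static int sk_parse_mode(const char *buf)
{
	for (int i = 0; i < SK_MODE_MAX; i++)
		if (strcmp(buf, sk_mode_str[i]) == 0)
			return i;
	return -1;
}

int main(void)
{
	printf("%d\n", sk_parse_mode("per-cpu"));	/* 2 */
	printf("%d\n", sk_parse_mode("bogus"));		/* -1 */
	return 0;
}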
+ */ + mutex_lock(&trace_types_lock); + if (hwlat_busy) + hwlat_tracer_stop(tr); + + mutex_lock(&hwlat_data.lock); + + for (i = 0; i < MODE_MAX; i++) { + if (strcmp(mode, thread_mode_str[i]) == 0) { + hwlat_data.thread_mode = i; + ret = cnt; + } + } + + mutex_unlock(&hwlat_data.lock); + + if (hwlat_busy) + hwlat_tracer_start(tr); + mutex_unlock(&trace_types_lock); + + *ppos += cnt; + + + + return ret; +} + +/* + * The width parameter is read/write using the generic trace_min_max_param + * method. The *val is protected by the hwlat_data lock and is upper + * bounded by the window parameter. + */ +static struct trace_min_max_param hwlat_width = { + .lock = &hwlat_data.lock, + .val = &hwlat_data.sample_width, + .max = &hwlat_data.sample_window, + .min = NULL, +}; + +/* + * The window parameter is read/write using the generic trace_min_max_param + * method. The *val is protected by the hwlat_data lock and is lower + * bounded by the width parameter. + */ +static struct trace_min_max_param hwlat_window = { + .lock = &hwlat_data.lock, + .val = &hwlat_data.sample_window, + .max = NULL, + .min = &hwlat_data.sample_width, +}; + +static const struct file_operations thread_mode_fops = { + .open = hwlat_mode_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, + .write = hwlat_mode_write +}; +/** + * init_tracefs - A function to initialize the tracefs interface files + * + * This function creates entries in tracefs for "hwlat_detector". + * It creates the hwlat_detector directory in the tracing directory, + * and within that directory is the count, width and window files to + * change and view those values. + */ +static int init_tracefs(void) +{ + int ret; + struct dentry *top_dir; + + ret = tracing_init_dentry(); + if (ret) + return -ENOMEM; + + top_dir = tracefs_create_dir("hwlat_detector", NULL); + if (!top_dir) + return -ENOMEM; + + hwlat_sample_window = tracefs_create_file("window", TRACE_MODE_WRITE, + top_dir, + &hwlat_window, + &trace_min_max_fops); + if (!hwlat_sample_window) + goto err; + + hwlat_sample_width = tracefs_create_file("width", TRACE_MODE_WRITE, + top_dir, + &hwlat_width, + &trace_min_max_fops); + if (!hwlat_sample_width) + goto err; + + hwlat_thread_mode = trace_create_file("mode", TRACE_MODE_WRITE, + top_dir, + NULL, + &thread_mode_fops); + if (!hwlat_thread_mode) + goto err; + + return 0; + + err: + tracefs_remove(top_dir); + return -ENOMEM; +} + +static void hwlat_tracer_start(struct trace_array *tr) +{ + int err; + + if (hwlat_data.thread_mode == MODE_PER_CPU) + err = start_per_cpu_kthreads(tr); + else + err = start_single_kthread(tr); + if (err) + pr_err(BANNER "Cannot start hwlat kthread\n"); +} + +static void hwlat_tracer_stop(struct trace_array *tr) +{ + if (hwlat_data.thread_mode == MODE_PER_CPU) + stop_per_cpu_kthreads(); + else + stop_single_kthread(); +} + +static int hwlat_tracer_init(struct trace_array *tr) +{ + /* Only allow one instance to enable this */ + if (hwlat_busy) + return -EBUSY; + + hwlat_trace = tr; + + hwlat_data.count = 0; + tr->max_latency = 0; + save_tracing_thresh = tracing_thresh; + + /* tracing_thresh is in nsecs, we speak in usecs */ + if (!tracing_thresh) + tracing_thresh = last_tracing_thresh; + + if (tracer_tracing_is_on(tr)) + hwlat_tracer_start(tr); + + hwlat_busy = true; + + return 0; +} + +static void hwlat_tracer_reset(struct trace_array *tr) +{ + hwlat_tracer_stop(tr); + + /* the tracing threshold is static between runs */ + last_tracing_thresh = tracing_thresh; + + tracing_thresh = save_tracing_thresh; + 
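/*
 * Illustrative aside (not part of the patch): the mutual bounding set up by
 * hwlat_width/hwlat_window above.  The width file uses the current window as
 * its upper bound and the window file uses the current width as its lower
 * bound, so no write can make the busy-sampling portion longer than the
 * window that contains it.  The reject-instead-of-clamp behaviour below is
 * this sketch's assumption about how an out-of-range write fails.
 */
#include <stdio.h>

static unsigned long long sample_window = 1000000;	/* us */
static unsigned long long sample_width  = 500000;	/* us */

/* Generic min/max-checked store, in the spirit of the trace_min_max_param handler. */
static int bounded_store(unsigned long long *val, unsigned long long newval,
			 const unsigned long long *min, const unsigned long long *max)
{
	if (min && newval < *min)
		return -1;
	if (max && newval > *max)
		return -1;
	*val = newval;
	return 0;
}

int main(void)
{
	/* width is bounded above by window, window is bounded below by width */
	printf("%d\n", bounded_store(&sample_width, 2000000, NULL, &sample_window));	/* -1 */
	printf("%d\n", bounded_store(&sample_window, 2000000, &sample_width, NULL));	/*  0 */
	printf("%d\n", bounded_store(&sample_width, 1500000, NULL, &sample_window));	/*  0 */
	return 0;
}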
hwlat_busy = false; +} + +static struct tracer hwlat_tracer __read_mostly = +{ + .name = "hwlat", + .init = hwlat_tracer_init, + .reset = hwlat_tracer_reset, + .start = hwlat_tracer_start, + .stop = hwlat_tracer_stop, + .allow_instances = true, +}; + +__init static int init_hwlat_tracer(void) +{ + int ret; + + mutex_init(&hwlat_data.lock); + + ret = register_tracer(&hwlat_tracer); + if (ret) + return ret; + + hwlat_init_hotplug_support(); + + init_tracefs(); + + return 0; +} +late_initcall(init_hwlat_tracer); diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c new file mode 100644 index 0000000000..ba37f768e2 --- /dev/null +++ b/kernel/trace/trace_irqsoff.c @@ -0,0 +1,752 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * trace irqs off critical timings + * + * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com> + * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com> + * + * From code in the latency_tracer, that is: + * + * Copyright (C) 2004-2006 Ingo Molnar + * Copyright (C) 2004 Nadia Yvette Chambers + */ +#include <linux/kallsyms.h> +#include <linux/uaccess.h> +#include <linux/module.h> +#include <linux/ftrace.h> +#include <linux/kprobes.h> + +#include "trace.h" + +#include <trace/events/preemptirq.h> + +#if defined(CONFIG_IRQSOFF_TRACER) || defined(CONFIG_PREEMPT_TRACER) +static struct trace_array *irqsoff_trace __read_mostly; +static int tracer_enabled __read_mostly; + +static DEFINE_PER_CPU(int, tracing_cpu); + +static DEFINE_RAW_SPINLOCK(max_trace_lock); + +enum { + TRACER_IRQS_OFF = (1 << 1), + TRACER_PREEMPT_OFF = (1 << 2), +}; + +static int trace_type __read_mostly; + +static int save_flags; + +static void stop_irqsoff_tracer(struct trace_array *tr, int graph); +static int start_irqsoff_tracer(struct trace_array *tr, int graph); + +#ifdef CONFIG_PREEMPT_TRACER +static inline int +preempt_trace(int pc) +{ + return ((trace_type & TRACER_PREEMPT_OFF) && pc); +} +#else +# define preempt_trace(pc) (0) +#endif + +#ifdef CONFIG_IRQSOFF_TRACER +static inline int +irq_trace(void) +{ + return ((trace_type & TRACER_IRQS_OFF) && + irqs_disabled()); +} +#else +# define irq_trace() (0) +#endif + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static int irqsoff_display_graph(struct trace_array *tr, int set); +# define is_graph(tr) ((tr)->trace_flags & TRACE_ITER_DISPLAY_GRAPH) +#else +static inline int irqsoff_display_graph(struct trace_array *tr, int set) +{ + return -EINVAL; +} +# define is_graph(tr) false +#endif + +/* + * Sequence count - we record it when starting a measurement and + * skip the latency if the sequence has changed - some other section + * did a maximum and could disturb our measurement with serial console + * printouts, etc. Truly coinciding maximum latencies should be rare + * and what happens together happens separately as well, so this doesn't + * decrease the validity of the maximum found: + */ +static __cacheline_aligned_in_smp unsigned long max_sequence; + +#ifdef CONFIG_FUNCTION_TRACER +/* + * Prologue for the preempt and irqs off function tracers. + * + * Returns 1 if it is OK to continue, and data->disabled is + * incremented. + * 0 if the trace is to be ignored, and data->disabled + * is kept the same. + * + * Note, this function is also used outside this ifdef but + * inside the #ifdef of the function graph tracer below. + * This is OK, since the function graph tracer is + * dependent on the function tracer. 
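/*
 * Illustrative aside (not part of the patch): the ->disabled bookkeeping that
 * func_prolog_dec() below relies on.  The counter is bumped before touching
 * the per-CPU trace data; only the caller that moved it from 0 to 1 may
 * proceed, and anyone who finds it already raised (recursion, or tracing
 * administratively disabled) undoes the increment and backs off.  A single
 * counter stands in for the per-CPU one here.
 */
#include <stdatomic.h>
#include <stdio.h>

static atomic_int disabled;

/* Returns 1 if the caller owns the section; it must call trace_exit() later. */
static int trace_enter(void)
{
	if (atomic_fetch_add(&disabled, 1) + 1 == 1)
		return 1;

	atomic_fetch_sub(&disabled, 1);	/* someone else is in, or tracing is off */
	return 0;
}

static void trace_exit(void)
{
	atomic_fetch_sub(&disabled, 1);
}

int main(void)
{
	if (trace_enter()) {
		/* A nested attempt (e.g. from a traced helper) is rejected. */
		printf("nested enter -> %d\n", trace_enter());	/* 0 */
		trace_exit();
	}
	printf("counter back to %d\n", atomic_load(&disabled));	/* 0 */
	return 0;
}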
+ */ +static int func_prolog_dec(struct trace_array *tr, + struct trace_array_cpu **data, + unsigned long *flags) +{ + long disabled; + int cpu; + + /* + * Does not matter if we preempt. We test the flags + * afterward, to see if irqs are disabled or not. + * If we preempt and get a false positive, the flags + * test will fail. + */ + cpu = raw_smp_processor_id(); + if (likely(!per_cpu(tracing_cpu, cpu))) + return 0; + + local_save_flags(*flags); + /* + * Slight chance to get a false positive on tracing_cpu, + * although I'm starting to think there isn't a chance. + * Leave this for now just to be paranoid. + */ + if (!irqs_disabled_flags(*flags) && !preempt_count()) + return 0; + + *data = per_cpu_ptr(tr->array_buffer.data, cpu); + disabled = atomic_inc_return(&(*data)->disabled); + + if (likely(disabled == 1)) + return 1; + + atomic_dec(&(*data)->disabled); + + return 0; +} + +/* + * irqsoff uses its own tracer function to keep the overhead down: + */ +static void +irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct ftrace_regs *fregs) +{ + struct trace_array *tr = irqsoff_trace; + struct trace_array_cpu *data; + unsigned long flags; + unsigned int trace_ctx; + + if (!func_prolog_dec(tr, &data, &flags)) + return; + + trace_ctx = tracing_gen_ctx_flags(flags); + + trace_function(tr, ip, parent_ip, trace_ctx); + + atomic_dec(&data->disabled); +} +#endif /* CONFIG_FUNCTION_TRACER */ + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static int irqsoff_display_graph(struct trace_array *tr, int set) +{ + int cpu; + + if (!(is_graph(tr) ^ set)) + return 0; + + stop_irqsoff_tracer(irqsoff_trace, !set); + + for_each_possible_cpu(cpu) + per_cpu(tracing_cpu, cpu) = 0; + + tr->max_latency = 0; + tracing_reset_online_cpus(&irqsoff_trace->array_buffer); + + return start_irqsoff_tracer(irqsoff_trace, set); +} + +static int irqsoff_graph_entry(struct ftrace_graph_ent *trace) +{ + struct trace_array *tr = irqsoff_trace; + struct trace_array_cpu *data; + unsigned long flags; + unsigned int trace_ctx; + int ret; + + if (ftrace_graph_ignore_func(trace)) + return 0; + /* + * Do not trace a function if it's filtered by set_graph_notrace. + * Make the index of ret stack negative to indicate that it should + * ignore further functions. But it needs its own ret stack entry + * to recover the original index in order to continue tracing after + * returning from the function. 
+ */ + if (ftrace_graph_notrace_addr(trace->func)) + return 1; + + if (!func_prolog_dec(tr, &data, &flags)) + return 0; + + trace_ctx = tracing_gen_ctx_flags(flags); + ret = __trace_graph_entry(tr, trace, trace_ctx); + atomic_dec(&data->disabled); + + return ret; +} + +static void irqsoff_graph_return(struct ftrace_graph_ret *trace) +{ + struct trace_array *tr = irqsoff_trace; + struct trace_array_cpu *data; + unsigned long flags; + unsigned int trace_ctx; + + ftrace_graph_addr_finish(trace); + + if (!func_prolog_dec(tr, &data, &flags)) + return; + + trace_ctx = tracing_gen_ctx_flags(flags); + __trace_graph_return(tr, trace, trace_ctx); + atomic_dec(&data->disabled); +} + +static struct fgraph_ops fgraph_ops = { + .entryfunc = &irqsoff_graph_entry, + .retfunc = &irqsoff_graph_return, +}; + +static void irqsoff_trace_open(struct trace_iterator *iter) +{ + if (is_graph(iter->tr)) + graph_trace_open(iter); + else + iter->private = NULL; +} + +static void irqsoff_trace_close(struct trace_iterator *iter) +{ + if (iter->private) + graph_trace_close(iter); +} + +#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_CPU | \ + TRACE_GRAPH_PRINT_PROC | \ + TRACE_GRAPH_PRINT_REL_TIME | \ + TRACE_GRAPH_PRINT_DURATION) + +static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) +{ + /* + * In graph mode call the graph tracer output function, + * otherwise go with the TRACE_FN event handler + */ + if (is_graph(iter->tr)) + return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS); + + return TRACE_TYPE_UNHANDLED; +} + +static void irqsoff_print_header(struct seq_file *s) +{ + struct trace_array *tr = irqsoff_trace; + + if (is_graph(tr)) + print_graph_headers_flags(s, GRAPH_TRACER_FLAGS); + else + trace_default_header(s); +} + +static void +__trace_function(struct trace_array *tr, + unsigned long ip, unsigned long parent_ip, + unsigned int trace_ctx) +{ + if (is_graph(tr)) + trace_graph_function(tr, ip, parent_ip, trace_ctx); + else + trace_function(tr, ip, parent_ip, trace_ctx); +} + +#else +#define __trace_function trace_function + +static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) +{ + return TRACE_TYPE_UNHANDLED; +} + +static void irqsoff_trace_open(struct trace_iterator *iter) { } +static void irqsoff_trace_close(struct trace_iterator *iter) { } + +#ifdef CONFIG_FUNCTION_TRACER +static void irqsoff_print_header(struct seq_file *s) +{ + trace_default_header(s); +} +#else +static void irqsoff_print_header(struct seq_file *s) +{ + trace_latency_header(s); +} +#endif /* CONFIG_FUNCTION_TRACER */ +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + +/* + * Should this new latency be reported/recorded? 
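/*
 * Illustrative aside (not part of the patch): the decision report_latency()
 * makes below.  With a nonzero tracing_thresh every critical section at
 * least that long is recorded; without one, only sections longer than the
 * current per-tracer maximum are, so the trace always holds the worst case
 * seen so far.  Both values are in the same time unit.
 */
#include <stdbool.h>
#include <stdio.h>

static bool sk_report_latency(unsigned long long thresh,
			      unsigned long long max_latency,
			      unsigned long long delta)
{
	if (thresh)
		return delta >= thresh;		/* fixed threshold mode */
	return delta > max_latency;		/* new-maximum mode */
}

int main(void)
{
	printf("%d\n", sk_report_latency(0, 150, 120));		/* 0: not a new max */
	printf("%d\n", sk_report_latency(0, 150, 200));		/* 1: new max */
	printf("%d\n", sk_report_latency(100, 150, 120));	/* 1: above threshold */
	return 0;
}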
+ */ +static bool report_latency(struct trace_array *tr, u64 delta) +{ + if (tracing_thresh) { + if (delta < tracing_thresh) + return false; + } else { + if (delta <= tr->max_latency) + return false; + } + return true; +} + +static void +check_critical_timing(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long parent_ip, + int cpu) +{ + u64 T0, T1, delta; + unsigned long flags; + unsigned int trace_ctx; + + T0 = data->preempt_timestamp; + T1 = ftrace_now(cpu); + delta = T1-T0; + + trace_ctx = tracing_gen_ctx(); + + if (!report_latency(tr, delta)) + goto out; + + raw_spin_lock_irqsave(&max_trace_lock, flags); + + /* check if we are still the max latency */ + if (!report_latency(tr, delta)) + goto out_unlock; + + __trace_function(tr, CALLER_ADDR0, parent_ip, trace_ctx); + /* Skip 5 functions to get to the irq/preempt enable function */ + __trace_stack(tr, trace_ctx, 5); + + if (data->critical_sequence != max_sequence) + goto out_unlock; + + data->critical_end = parent_ip; + + if (likely(!is_tracing_stopped())) { + tr->max_latency = delta; + update_max_tr_single(tr, current, cpu); + } + + max_sequence++; + +out_unlock: + raw_spin_unlock_irqrestore(&max_trace_lock, flags); + +out: + data->critical_sequence = max_sequence; + data->preempt_timestamp = ftrace_now(cpu); + __trace_function(tr, CALLER_ADDR0, parent_ip, trace_ctx); +} + +static nokprobe_inline void +start_critical_timing(unsigned long ip, unsigned long parent_ip) +{ + int cpu; + struct trace_array *tr = irqsoff_trace; + struct trace_array_cpu *data; + + if (!tracer_enabled || !tracing_is_enabled()) + return; + + cpu = raw_smp_processor_id(); + + if (per_cpu(tracing_cpu, cpu)) + return; + + data = per_cpu_ptr(tr->array_buffer.data, cpu); + + if (unlikely(!data) || atomic_read(&data->disabled)) + return; + + atomic_inc(&data->disabled); + + data->critical_sequence = max_sequence; + data->preempt_timestamp = ftrace_now(cpu); + data->critical_start = parent_ip ? : ip; + + __trace_function(tr, ip, parent_ip, tracing_gen_ctx()); + + per_cpu(tracing_cpu, cpu) = 1; + + atomic_dec(&data->disabled); +} + +static nokprobe_inline void +stop_critical_timing(unsigned long ip, unsigned long parent_ip) +{ + int cpu; + struct trace_array *tr = irqsoff_trace; + struct trace_array_cpu *data; + unsigned int trace_ctx; + + cpu = raw_smp_processor_id(); + /* Always clear the tracing cpu on stopping the trace */ + if (unlikely(per_cpu(tracing_cpu, cpu))) + per_cpu(tracing_cpu, cpu) = 0; + else + return; + + if (!tracer_enabled || !tracing_is_enabled()) + return; + + data = per_cpu_ptr(tr->array_buffer.data, cpu); + + if (unlikely(!data) || + !data->critical_start || atomic_read(&data->disabled)) + return; + + atomic_inc(&data->disabled); + + trace_ctx = tracing_gen_ctx(); + __trace_function(tr, ip, parent_ip, trace_ctx); + check_critical_timing(tr, data, parent_ip ? 
: ip, cpu); + data->critical_start = 0; + atomic_dec(&data->disabled); +} + +/* start and stop critical timings used to for stoppage (in idle) */ +void start_critical_timings(void) +{ + if (preempt_trace(preempt_count()) || irq_trace()) + start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); +} +EXPORT_SYMBOL_GPL(start_critical_timings); +NOKPROBE_SYMBOL(start_critical_timings); + +void stop_critical_timings(void) +{ + if (preempt_trace(preempt_count()) || irq_trace()) + stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); +} +EXPORT_SYMBOL_GPL(stop_critical_timings); +NOKPROBE_SYMBOL(stop_critical_timings); + +#ifdef CONFIG_FUNCTION_TRACER +static bool function_enabled; + +static int register_irqsoff_function(struct trace_array *tr, int graph, int set) +{ + int ret; + + /* 'set' is set if TRACE_ITER_FUNCTION is about to be set */ + if (function_enabled || (!set && !(tr->trace_flags & TRACE_ITER_FUNCTION))) + return 0; + + if (graph) + ret = register_ftrace_graph(&fgraph_ops); + else + ret = register_ftrace_function(tr->ops); + + if (!ret) + function_enabled = true; + + return ret; +} + +static void unregister_irqsoff_function(struct trace_array *tr, int graph) +{ + if (!function_enabled) + return; + + if (graph) + unregister_ftrace_graph(&fgraph_ops); + else + unregister_ftrace_function(tr->ops); + + function_enabled = false; +} + +static int irqsoff_function_set(struct trace_array *tr, u32 mask, int set) +{ + if (!(mask & TRACE_ITER_FUNCTION)) + return 0; + + if (set) + register_irqsoff_function(tr, is_graph(tr), 1); + else + unregister_irqsoff_function(tr, is_graph(tr)); + return 1; +} +#else +static int register_irqsoff_function(struct trace_array *tr, int graph, int set) +{ + return 0; +} +static void unregister_irqsoff_function(struct trace_array *tr, int graph) { } +static inline int irqsoff_function_set(struct trace_array *tr, u32 mask, int set) +{ + return 0; +} +#endif /* CONFIG_FUNCTION_TRACER */ + +static int irqsoff_flag_changed(struct trace_array *tr, u32 mask, int set) +{ + struct tracer *tracer = tr->current_trace; + + if (irqsoff_function_set(tr, mask, set)) + return 0; + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + if (mask & TRACE_ITER_DISPLAY_GRAPH) + return irqsoff_display_graph(tr, set); +#endif + + return trace_keep_overwrite(tracer, mask, set); +} + +static int start_irqsoff_tracer(struct trace_array *tr, int graph) +{ + int ret; + + ret = register_irqsoff_function(tr, graph, 0); + + if (!ret && tracing_is_enabled()) + tracer_enabled = 1; + else + tracer_enabled = 0; + + return ret; +} + +static void stop_irqsoff_tracer(struct trace_array *tr, int graph) +{ + tracer_enabled = 0; + + unregister_irqsoff_function(tr, graph); +} + +static bool irqsoff_busy; + +static int __irqsoff_tracer_init(struct trace_array *tr) +{ + if (irqsoff_busy) + return -EBUSY; + + save_flags = tr->trace_flags; + + /* non overwrite screws up the latency tracers */ + set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1); + set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1); + /* without pause, we will produce garbage if another latency occurs */ + set_tracer_flag(tr, TRACE_ITER_PAUSE_ON_TRACE, 1); + + tr->max_latency = 0; + irqsoff_trace = tr; + /* make sure that the tracer is visible */ + smp_wmb(); + + ftrace_init_array_ops(tr, irqsoff_tracer_call); + + /* Only toplevel instance supports graph tracing */ + if (start_irqsoff_tracer(tr, (tr->flags & TRACE_ARRAY_FL_GLOBAL && + is_graph(tr)))) + printk(KERN_ERR "failed to start irqsoff tracer\n"); + + irqsoff_busy = true; + return 0; +} + +static void 
__irqsoff_tracer_reset(struct trace_array *tr) +{ + int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT; + int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE; + int pause_flag = save_flags & TRACE_ITER_PAUSE_ON_TRACE; + + stop_irqsoff_tracer(tr, is_graph(tr)); + + set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag); + set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag); + set_tracer_flag(tr, TRACE_ITER_PAUSE_ON_TRACE, pause_flag); + ftrace_reset_array_ops(tr); + + irqsoff_busy = false; +} + +static void irqsoff_tracer_start(struct trace_array *tr) +{ + tracer_enabled = 1; +} + +static void irqsoff_tracer_stop(struct trace_array *tr) +{ + tracer_enabled = 0; +} + +#ifdef CONFIG_IRQSOFF_TRACER +/* + * We are only interested in hardirq on/off events: + */ +void tracer_hardirqs_on(unsigned long a0, unsigned long a1) +{ + if (!preempt_trace(preempt_count()) && irq_trace()) + stop_critical_timing(a0, a1); +} +NOKPROBE_SYMBOL(tracer_hardirqs_on); + +void tracer_hardirqs_off(unsigned long a0, unsigned long a1) +{ + if (!preempt_trace(preempt_count()) && irq_trace()) + start_critical_timing(a0, a1); +} +NOKPROBE_SYMBOL(tracer_hardirqs_off); + +static int irqsoff_tracer_init(struct trace_array *tr) +{ + trace_type = TRACER_IRQS_OFF; + + return __irqsoff_tracer_init(tr); +} + +static void irqsoff_tracer_reset(struct trace_array *tr) +{ + __irqsoff_tracer_reset(tr); +} + +static struct tracer irqsoff_tracer __read_mostly = +{ + .name = "irqsoff", + .init = irqsoff_tracer_init, + .reset = irqsoff_tracer_reset, + .start = irqsoff_tracer_start, + .stop = irqsoff_tracer_stop, + .print_max = true, + .print_header = irqsoff_print_header, + .print_line = irqsoff_print_line, + .flag_changed = irqsoff_flag_changed, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_irqsoff, +#endif + .open = irqsoff_trace_open, + .close = irqsoff_trace_close, + .allow_instances = true, + .use_max_tr = true, +}; +#endif /* CONFIG_IRQSOFF_TRACER */ + +#ifdef CONFIG_PREEMPT_TRACER +void tracer_preempt_on(unsigned long a0, unsigned long a1) +{ + if (preempt_trace(preempt_count()) && !irq_trace()) + stop_critical_timing(a0, a1); +} + +void tracer_preempt_off(unsigned long a0, unsigned long a1) +{ + if (preempt_trace(preempt_count()) && !irq_trace()) + start_critical_timing(a0, a1); +} + +static int preemptoff_tracer_init(struct trace_array *tr) +{ + trace_type = TRACER_PREEMPT_OFF; + + return __irqsoff_tracer_init(tr); +} + +static void preemptoff_tracer_reset(struct trace_array *tr) +{ + __irqsoff_tracer_reset(tr); +} + +static struct tracer preemptoff_tracer __read_mostly = +{ + .name = "preemptoff", + .init = preemptoff_tracer_init, + .reset = preemptoff_tracer_reset, + .start = irqsoff_tracer_start, + .stop = irqsoff_tracer_stop, + .print_max = true, + .print_header = irqsoff_print_header, + .print_line = irqsoff_print_line, + .flag_changed = irqsoff_flag_changed, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_preemptoff, +#endif + .open = irqsoff_trace_open, + .close = irqsoff_trace_close, + .allow_instances = true, + .use_max_tr = true, +}; +#endif /* CONFIG_PREEMPT_TRACER */ + +#if defined(CONFIG_IRQSOFF_TRACER) && defined(CONFIG_PREEMPT_TRACER) + +static int preemptirqsoff_tracer_init(struct trace_array *tr) +{ + trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF; + + return __irqsoff_tracer_init(tr); +} + +static void preemptirqsoff_tracer_reset(struct trace_array *tr) +{ + __irqsoff_tracer_reset(tr); +} + +static struct tracer preemptirqsoff_tracer __read_mostly = +{ + .name 
= "preemptirqsoff", + .init = preemptirqsoff_tracer_init, + .reset = preemptirqsoff_tracer_reset, + .start = irqsoff_tracer_start, + .stop = irqsoff_tracer_stop, + .print_max = true, + .print_header = irqsoff_print_header, + .print_line = irqsoff_print_line, + .flag_changed = irqsoff_flag_changed, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_preemptirqsoff, +#endif + .open = irqsoff_trace_open, + .close = irqsoff_trace_close, + .allow_instances = true, + .use_max_tr = true, +}; +#endif + +__init static int init_irqsoff_tracer(void) +{ +#ifdef CONFIG_IRQSOFF_TRACER + register_tracer(&irqsoff_tracer); +#endif +#ifdef CONFIG_PREEMPT_TRACER + register_tracer(&preemptoff_tracer); +#endif +#if defined(CONFIG_IRQSOFF_TRACER) && defined(CONFIG_PREEMPT_TRACER) + register_tracer(&preemptirqsoff_tracer); +#endif + + return 0; +} +core_initcall(init_irqsoff_tracer); +#endif /* IRQSOFF_TRACER || PREEMPTOFF_TRACER */ diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c new file mode 100644 index 0000000000..59857a1ee4 --- /dev/null +++ b/kernel/trace/trace_kdb.c @@ -0,0 +1,164 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * kdb helper for dumping the ftrace buffer + * + * Copyright (C) 2010 Jason Wessel <jason.wessel@windriver.com> + * + * ftrace_dump_buf based on ftrace_dump: + * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com> + * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com> + * + */ +#include <linux/init.h> +#include <linux/kgdb.h> +#include <linux/kdb.h> +#include <linux/ftrace.h> + +#include "trace.h" +#include "trace_output.h" + +static struct trace_iterator iter; +static struct ring_buffer_iter *buffer_iter[CONFIG_NR_CPUS]; + +static void ftrace_dump_buf(int skip_entries, long cpu_file) +{ + struct trace_array *tr; + unsigned int old_userobj; + int cnt = 0, cpu; + + tr = iter.tr; + + old_userobj = tr->trace_flags; + + /* don't look at user memory in panic mode */ + tr->trace_flags &= ~TRACE_ITER_SYM_USEROBJ; + + kdb_printf("Dumping ftrace buffer:\n"); + if (skip_entries) + kdb_printf("(skipping %d entries)\n", skip_entries); + + trace_iterator_reset(&iter); + iter.iter_flags |= TRACE_FILE_LAT_FMT; + + if (cpu_file == RING_BUFFER_ALL_CPUS) { + for_each_tracing_cpu(cpu) { + iter.buffer_iter[cpu] = + ring_buffer_read_prepare(iter.array_buffer->buffer, + cpu, GFP_ATOMIC); + ring_buffer_read_start(iter.buffer_iter[cpu]); + tracing_iter_reset(&iter, cpu); + } + } else { + iter.cpu_file = cpu_file; + iter.buffer_iter[cpu_file] = + ring_buffer_read_prepare(iter.array_buffer->buffer, + cpu_file, GFP_ATOMIC); + ring_buffer_read_start(iter.buffer_iter[cpu_file]); + tracing_iter_reset(&iter, cpu_file); + } + + while (trace_find_next_entry_inc(&iter)) { + if (!cnt) + kdb_printf("---------------------------------\n"); + cnt++; + + if (!skip_entries) { + print_trace_line(&iter); + trace_printk_seq(&iter.seq); + } else { + skip_entries--; + } + + if (KDB_FLAG(CMD_INTERRUPT)) + goto out; + } + + if (!cnt) + kdb_printf(" (ftrace buffer empty)\n"); + else + kdb_printf("---------------------------------\n"); + +out: + tr->trace_flags = old_userobj; + + for_each_tracing_cpu(cpu) { + if (iter.buffer_iter[cpu]) { + ring_buffer_read_finish(iter.buffer_iter[cpu]); + iter.buffer_iter[cpu] = NULL; + } + } +} + +/* + * kdb_ftdump - Dump the ftrace log buffer + */ +static int kdb_ftdump(int argc, const char **argv) +{ + int skip_entries = 0; + long cpu_file; + char *cp; + int cnt; + int cpu; + + if (argc > 2) + return KDB_ARGCOUNT; + + if (argc) { + skip_entries = 
simple_strtol(argv[1], &cp, 0); + if (*cp) + skip_entries = 0; + } + + if (argc == 2) { + cpu_file = simple_strtol(argv[2], &cp, 0); + if (*cp || cpu_file >= NR_CPUS || cpu_file < 0 || + !cpu_online(cpu_file)) + return KDB_BADINT; + } else { + cpu_file = RING_BUFFER_ALL_CPUS; + } + + kdb_trap_printk++; + + trace_init_global_iter(&iter); + iter.buffer_iter = buffer_iter; + + for_each_tracing_cpu(cpu) { + atomic_inc(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled); + } + + /* A negative skip_entries means skip all but the last entries */ + if (skip_entries < 0) { + if (cpu_file == RING_BUFFER_ALL_CPUS) + cnt = trace_total_entries(NULL); + else + cnt = trace_total_entries_cpu(NULL, cpu_file); + skip_entries = max(cnt + skip_entries, 0); + } + + ftrace_dump_buf(skip_entries, cpu_file); + + for_each_tracing_cpu(cpu) { + atomic_dec(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled); + } + + kdb_trap_printk--; + + return 0; +} + +static kdbtab_t ftdump_cmd = { + .name = "ftdump", + .func = kdb_ftdump, + .usage = "[skip_#entries] [cpu]", + .help = "Dump ftrace log; -skip dumps last #entries", + .flags = KDB_ENABLE_ALWAYS_SAFE, +}; + +static __init int kdb_ftrace_register(void) +{ + kdb_register(&ftdump_cmd); + return 0; +} + +late_initcall(kdb_ftrace_register); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c new file mode 100644 index 0000000000..47812aa16b --- /dev/null +++ b/kernel/trace/trace_kprobe.c @@ -0,0 +1,2083 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Kprobes-based tracing events + * + * Created by Masami Hiramatsu <mhiramat@redhat.com> + * + */ +#define pr_fmt(fmt) "trace_kprobe: " fmt + +#include <linux/bpf-cgroup.h> +#include <linux/security.h> +#include <linux/module.h> +#include <linux/uaccess.h> +#include <linux/rculist.h> +#include <linux/error-injection.h> + +#include <asm/setup.h> /* for COMMAND_LINE_SIZE */ + +#include "trace_dynevent.h" +#include "trace_kprobe_selftest.h" +#include "trace_probe.h" +#include "trace_probe_tmpl.h" +#include "trace_probe_kernel.h" + +#define KPROBE_EVENT_SYSTEM "kprobes" +#define KRETPROBE_MAXACTIVE_MAX 4096 + +/* Kprobe early definition from command line */ +static char kprobe_boot_events_buf[COMMAND_LINE_SIZE] __initdata; + +static int __init set_kprobe_boot_events(char *str) +{ + strscpy(kprobe_boot_events_buf, str, COMMAND_LINE_SIZE); + disable_tracing_selftest("running kprobe events"); + + return 1; +} +__setup("kprobe_event=", set_kprobe_boot_events); + +static int trace_kprobe_create(const char *raw_command); +static int trace_kprobe_show(struct seq_file *m, struct dyn_event *ev); +static int trace_kprobe_release(struct dyn_event *ev); +static bool trace_kprobe_is_busy(struct dyn_event *ev); +static bool trace_kprobe_match(const char *system, const char *event, + int argc, const char **argv, struct dyn_event *ev); + +static struct dyn_event_operations trace_kprobe_ops = { + .create = trace_kprobe_create, + .show = trace_kprobe_show, + .is_busy = trace_kprobe_is_busy, + .free = trace_kprobe_release, + .match = trace_kprobe_match, +}; + +/* + * Kprobe event core functions + */ +struct trace_kprobe { + struct dyn_event devent; + struct kretprobe rp; /* Use rp.kp for kprobe use */ + unsigned long __percpu *nhit; + const char *symbol; /* symbol name */ + struct trace_probe tp; +}; + +static bool is_trace_kprobe(struct dyn_event *ev) +{ + return ev->ops == &trace_kprobe_ops; +} + +static struct trace_kprobe *to_trace_kprobe(struct dyn_event *ev) +{ + return container_of(ev, struct trace_kprobe, devent); +} + 
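+/*
+ * Editor's illustration (not part of the original patch): the
+ * "kprobe_event=" parameter handled by set_kprobe_boot_events() above lets
+ * probes be defined before tracefs exists.  Commas stand in for spaces and
+ * ';' separates events (see setup_boot_kprobe_events() near the end of this
+ * file), so - assuming do_sys_open exists on the target kernel - a command
+ * line could carry:
+ *
+ *	kprobe_event=p,do_sys_open,dfd=$arg1;r,do_sys_open,$retval
+ */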
+/** + * for_each_trace_kprobe - iterate over the trace_kprobe list + * @pos: the struct trace_kprobe * for each entry + * @dpos: the struct dyn_event * to use as a loop cursor + */ +#define for_each_trace_kprobe(pos, dpos) \ + for_each_dyn_event(dpos) \ + if (is_trace_kprobe(dpos) && (pos = to_trace_kprobe(dpos))) + +static nokprobe_inline bool trace_kprobe_is_return(struct trace_kprobe *tk) +{ + return tk->rp.handler != NULL; +} + +static nokprobe_inline const char *trace_kprobe_symbol(struct trace_kprobe *tk) +{ + return tk->symbol ? tk->symbol : "unknown"; +} + +static nokprobe_inline unsigned long trace_kprobe_offset(struct trace_kprobe *tk) +{ + return tk->rp.kp.offset; +} + +static nokprobe_inline bool trace_kprobe_has_gone(struct trace_kprobe *tk) +{ + return kprobe_gone(&tk->rp.kp); +} + +static nokprobe_inline bool trace_kprobe_within_module(struct trace_kprobe *tk, + struct module *mod) +{ + int len = strlen(module_name(mod)); + const char *name = trace_kprobe_symbol(tk); + + return strncmp(module_name(mod), name, len) == 0 && name[len] == ':'; +} + +static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk) +{ + char *p; + bool ret; + + if (!tk->symbol) + return false; + p = strchr(tk->symbol, ':'); + if (!p) + return true; + *p = '\0'; + rcu_read_lock_sched(); + ret = !!find_module(tk->symbol); + rcu_read_unlock_sched(); + *p = ':'; + + return ret; +} + +static bool trace_kprobe_is_busy(struct dyn_event *ev) +{ + struct trace_kprobe *tk = to_trace_kprobe(ev); + + return trace_probe_is_enabled(&tk->tp); +} + +static bool trace_kprobe_match_command_head(struct trace_kprobe *tk, + int argc, const char **argv) +{ + char buf[MAX_ARGSTR_LEN + 1]; + + if (!argc) + return true; + + if (!tk->symbol) + snprintf(buf, sizeof(buf), "0x%p", tk->rp.kp.addr); + else if (tk->rp.kp.offset) + snprintf(buf, sizeof(buf), "%s+%u", + trace_kprobe_symbol(tk), tk->rp.kp.offset); + else + snprintf(buf, sizeof(buf), "%s", trace_kprobe_symbol(tk)); + if (strcmp(buf, argv[0])) + return false; + argc--; argv++; + + return trace_probe_match_command_args(&tk->tp, argc, argv); +} + +static bool trace_kprobe_match(const char *system, const char *event, + int argc, const char **argv, struct dyn_event *ev) +{ + struct trace_kprobe *tk = to_trace_kprobe(ev); + + return (event[0] == '\0' || + strcmp(trace_probe_name(&tk->tp), event) == 0) && + (!system || strcmp(trace_probe_group_name(&tk->tp), system) == 0) && + trace_kprobe_match_command_head(tk, argc, argv); +} + +static nokprobe_inline unsigned long trace_kprobe_nhit(struct trace_kprobe *tk) +{ + unsigned long nhit = 0; + int cpu; + + for_each_possible_cpu(cpu) + nhit += *per_cpu_ptr(tk->nhit, cpu); + + return nhit; +} + +static nokprobe_inline bool trace_kprobe_is_registered(struct trace_kprobe *tk) +{ + return !(list_empty(&tk->rp.kp.list) && + hlist_unhashed(&tk->rp.kp.hlist)); +} + +/* Return 0 if it fails to find the symbol address */ +static nokprobe_inline +unsigned long trace_kprobe_address(struct trace_kprobe *tk) +{ + unsigned long addr; + + if (tk->symbol) { + addr = (unsigned long) + kallsyms_lookup_name(trace_kprobe_symbol(tk)); + if (addr) + addr += tk->rp.kp.offset; + } else { + addr = (unsigned long)tk->rp.kp.addr; + } + return addr; +} + +static nokprobe_inline struct trace_kprobe * +trace_kprobe_primary_from_call(struct trace_event_call *call) +{ + struct trace_probe *tp; + + tp = trace_probe_primary_from_call(call); + if (WARN_ON_ONCE(!tp)) + return NULL; + + return container_of(tp, struct trace_kprobe, tp); +} + +bool 
trace_kprobe_on_func_entry(struct trace_event_call *call) +{ + struct trace_kprobe *tk = trace_kprobe_primary_from_call(call); + + return tk ? (kprobe_on_func_entry(tk->rp.kp.addr, + tk->rp.kp.addr ? NULL : tk->rp.kp.symbol_name, + tk->rp.kp.addr ? 0 : tk->rp.kp.offset) == 0) : false; +} + +bool trace_kprobe_error_injectable(struct trace_event_call *call) +{ + struct trace_kprobe *tk = trace_kprobe_primary_from_call(call); + + return tk ? within_error_injection_list(trace_kprobe_address(tk)) : + false; +} + +static int register_kprobe_event(struct trace_kprobe *tk); +static int unregister_kprobe_event(struct trace_kprobe *tk); + +static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs); +static int kretprobe_dispatcher(struct kretprobe_instance *ri, + struct pt_regs *regs); + +static void free_trace_kprobe(struct trace_kprobe *tk) +{ + if (tk) { + trace_probe_cleanup(&tk->tp); + kfree(tk->symbol); + free_percpu(tk->nhit); + kfree(tk); + } +} + +/* + * Allocate new trace_probe and initialize it (including kprobes). + */ +static struct trace_kprobe *alloc_trace_kprobe(const char *group, + const char *event, + void *addr, + const char *symbol, + unsigned long offs, + int maxactive, + int nargs, bool is_return) +{ + struct trace_kprobe *tk; + int ret = -ENOMEM; + + tk = kzalloc(struct_size(tk, tp.args, nargs), GFP_KERNEL); + if (!tk) + return ERR_PTR(ret); + + tk->nhit = alloc_percpu(unsigned long); + if (!tk->nhit) + goto error; + + if (symbol) { + tk->symbol = kstrdup(symbol, GFP_KERNEL); + if (!tk->symbol) + goto error; + tk->rp.kp.symbol_name = tk->symbol; + tk->rp.kp.offset = offs; + } else + tk->rp.kp.addr = addr; + + if (is_return) + tk->rp.handler = kretprobe_dispatcher; + else + tk->rp.kp.pre_handler = kprobe_dispatcher; + + tk->rp.maxactive = maxactive; + INIT_HLIST_NODE(&tk->rp.kp.hlist); + INIT_LIST_HEAD(&tk->rp.kp.list); + + ret = trace_probe_init(&tk->tp, event, group, false); + if (ret < 0) + goto error; + + dyn_event_init(&tk->devent, &trace_kprobe_ops); + return tk; +error: + free_trace_kprobe(tk); + return ERR_PTR(ret); +} + +static struct trace_kprobe *find_trace_kprobe(const char *event, + const char *group) +{ + struct dyn_event *pos; + struct trace_kprobe *tk; + + for_each_trace_kprobe(tk, pos) + if (strcmp(trace_probe_name(&tk->tp), event) == 0 && + strcmp(trace_probe_group_name(&tk->tp), group) == 0) + return tk; + return NULL; +} + +static inline int __enable_trace_kprobe(struct trace_kprobe *tk) +{ + int ret = 0; + + if (trace_kprobe_is_registered(tk) && !trace_kprobe_has_gone(tk)) { + if (trace_kprobe_is_return(tk)) + ret = enable_kretprobe(&tk->rp); + else + ret = enable_kprobe(&tk->rp.kp); + } + + return ret; +} + +static void __disable_trace_kprobe(struct trace_probe *tp) +{ + struct trace_kprobe *tk; + + list_for_each_entry(tk, trace_probe_probe_list(tp), tp.list) { + if (!trace_kprobe_is_registered(tk)) + continue; + if (trace_kprobe_is_return(tk)) + disable_kretprobe(&tk->rp); + else + disable_kprobe(&tk->rp.kp); + } +} + +/* + * Enable trace_probe + * if the file is NULL, enable "perf" handler, or enable "trace" handler. 
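+ *
+ * (Editor's note, not from the original source: @file is the
+ * trace_event_file of a tracing instance, so a non-NULL @file attaches the
+ * event to that instance's ring buffer, while a NULL @file marks the probe
+ * for perf by setting TP_FLAG_PROFILE.)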
+ */ +static int enable_trace_kprobe(struct trace_event_call *call, + struct trace_event_file *file) +{ + struct trace_probe *tp; + struct trace_kprobe *tk; + bool enabled; + int ret = 0; + + tp = trace_probe_primary_from_call(call); + if (WARN_ON_ONCE(!tp)) + return -ENODEV; + enabled = trace_probe_is_enabled(tp); + + /* This also changes "enabled" state */ + if (file) { + ret = trace_probe_add_file(tp, file); + if (ret) + return ret; + } else + trace_probe_set_flag(tp, TP_FLAG_PROFILE); + + if (enabled) + return 0; + + list_for_each_entry(tk, trace_probe_probe_list(tp), tp.list) { + if (trace_kprobe_has_gone(tk)) + continue; + ret = __enable_trace_kprobe(tk); + if (ret) + break; + enabled = true; + } + + if (ret) { + /* Failed to enable one of them. Roll back all */ + if (enabled) + __disable_trace_kprobe(tp); + if (file) + trace_probe_remove_file(tp, file); + else + trace_probe_clear_flag(tp, TP_FLAG_PROFILE); + } + + return ret; +} + +/* + * Disable trace_probe + * if the file is NULL, disable "perf" handler, or disable "trace" handler. + */ +static int disable_trace_kprobe(struct trace_event_call *call, + struct trace_event_file *file) +{ + struct trace_probe *tp; + + tp = trace_probe_primary_from_call(call); + if (WARN_ON_ONCE(!tp)) + return -ENODEV; + + if (file) { + if (!trace_probe_get_file_link(tp, file)) + return -ENOENT; + if (!trace_probe_has_single_file(tp)) + goto out; + trace_probe_clear_flag(tp, TP_FLAG_TRACE); + } else + trace_probe_clear_flag(tp, TP_FLAG_PROFILE); + + if (!trace_probe_is_enabled(tp)) + __disable_trace_kprobe(tp); + + out: + if (file) + /* + * Synchronization is done in below function. For perf event, + * file == NULL and perf_trace_event_unreg() calls + * tracepoint_synchronize_unregister() to ensure synchronize + * event. We don't need to care about it. + */ + trace_probe_remove_file(tp, file); + + return 0; +} + +#if defined(CONFIG_DYNAMIC_FTRACE) && \ + !defined(CONFIG_KPROBE_EVENTS_ON_NOTRACE) +static bool __within_notrace_func(unsigned long addr) +{ + unsigned long offset, size; + + if (!addr || !kallsyms_lookup_size_offset(addr, &size, &offset)) + return false; + + /* Get the entry address of the target function */ + addr -= offset; + + /* + * Since ftrace_location_range() does inclusive range check, we need + * to subtract 1 byte from the end address. 
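+ *
+ * (Illustration added by the editor: for a function starting at @addr with
+ * size 0x40, the range checked below is [addr, addr + 0x3f].)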
+ */ + return !ftrace_location_range(addr, addr + size - 1); +} + +static bool within_notrace_func(struct trace_kprobe *tk) +{ + unsigned long addr = trace_kprobe_address(tk); + char symname[KSYM_NAME_LEN], *p; + + if (!__within_notrace_func(addr)) + return false; + + /* Check if the address is on a suffixed-symbol */ + if (!lookup_symbol_name(addr, symname)) { + p = strchr(symname, '.'); + if (!p) + return true; + *p = '\0'; + addr = (unsigned long)kprobe_lookup_name(symname, 0); + if (addr) + return __within_notrace_func(addr); + } + + return true; +} +#else +#define within_notrace_func(tk) (false) +#endif + +/* Internal register function - just handle k*probes and flags */ +static int __register_trace_kprobe(struct trace_kprobe *tk) +{ + int i, ret; + + ret = security_locked_down(LOCKDOWN_KPROBES); + if (ret) + return ret; + + if (trace_kprobe_is_registered(tk)) + return -EINVAL; + + if (within_notrace_func(tk)) { + pr_warn("Could not probe notrace function %s\n", + trace_kprobe_symbol(tk)); + return -EINVAL; + } + + for (i = 0; i < tk->tp.nr_args; i++) { + ret = traceprobe_update_arg(&tk->tp.args[i]); + if (ret) + return ret; + } + + /* Set/clear disabled flag according to tp->flag */ + if (trace_probe_is_enabled(&tk->tp)) + tk->rp.kp.flags &= ~KPROBE_FLAG_DISABLED; + else + tk->rp.kp.flags |= KPROBE_FLAG_DISABLED; + + if (trace_kprobe_is_return(tk)) + ret = register_kretprobe(&tk->rp); + else + ret = register_kprobe(&tk->rp.kp); + + return ret; +} + +/* Internal unregister function - just handle k*probes and flags */ +static void __unregister_trace_kprobe(struct trace_kprobe *tk) +{ + if (trace_kprobe_is_registered(tk)) { + if (trace_kprobe_is_return(tk)) + unregister_kretprobe(&tk->rp); + else + unregister_kprobe(&tk->rp.kp); + /* Cleanup kprobe for reuse and mark it unregistered */ + INIT_HLIST_NODE(&tk->rp.kp.hlist); + INIT_LIST_HEAD(&tk->rp.kp.list); + if (tk->rp.kp.symbol_name) + tk->rp.kp.addr = NULL; + } +} + +/* Unregister a trace_probe and probe_event */ +static int unregister_trace_kprobe(struct trace_kprobe *tk) +{ + /* If other probes are on the event, just unregister kprobe */ + if (trace_probe_has_sibling(&tk->tp)) + goto unreg; + + /* Enabled event can not be unregistered */ + if (trace_probe_is_enabled(&tk->tp)) + return -EBUSY; + + /* If there's a reference to the dynamic event */ + if (trace_event_dyn_busy(trace_probe_event_call(&tk->tp))) + return -EBUSY; + + /* Will fail if probe is being used by ftrace or perf */ + if (unregister_kprobe_event(tk)) + return -EBUSY; + +unreg: + __unregister_trace_kprobe(tk); + dyn_event_remove(&tk->devent); + trace_probe_unlink(&tk->tp); + + return 0; +} + +static bool trace_kprobe_has_same_kprobe(struct trace_kprobe *orig, + struct trace_kprobe *comp) +{ + struct trace_probe_event *tpe = orig->tp.event; + int i; + + list_for_each_entry(orig, &tpe->probes, tp.list) { + if (strcmp(trace_kprobe_symbol(orig), + trace_kprobe_symbol(comp)) || + trace_kprobe_offset(orig) != trace_kprobe_offset(comp)) + continue; + + /* + * trace_probe_compare_arg_type() ensured that nr_args and + * each argument name and type are same. Let's compare comm. 
+ */ + for (i = 0; i < orig->tp.nr_args; i++) { + if (strcmp(orig->tp.args[i].comm, + comp->tp.args[i].comm)) + break; + } + + if (i == orig->tp.nr_args) + return true; + } + + return false; +} + +static int append_trace_kprobe(struct trace_kprobe *tk, struct trace_kprobe *to) +{ + int ret; + + ret = trace_probe_compare_arg_type(&tk->tp, &to->tp); + if (ret) { + /* Note that argument starts index = 2 */ + trace_probe_log_set_index(ret + 1); + trace_probe_log_err(0, DIFF_ARG_TYPE); + return -EEXIST; + } + if (trace_kprobe_has_same_kprobe(to, tk)) { + trace_probe_log_set_index(0); + trace_probe_log_err(0, SAME_PROBE); + return -EEXIST; + } + + /* Append to existing event */ + ret = trace_probe_append(&tk->tp, &to->tp); + if (ret) + return ret; + + /* Register k*probe */ + ret = __register_trace_kprobe(tk); + if (ret == -ENOENT && !trace_kprobe_module_exist(tk)) { + pr_warn("This probe might be able to register after target module is loaded. Continue.\n"); + ret = 0; + } + + if (ret) + trace_probe_unlink(&tk->tp); + else + dyn_event_add(&tk->devent, trace_probe_event_call(&tk->tp)); + + return ret; +} + +/* Register a trace_probe and probe_event */ +static int register_trace_kprobe(struct trace_kprobe *tk) +{ + struct trace_kprobe *old_tk; + int ret; + + mutex_lock(&event_mutex); + + old_tk = find_trace_kprobe(trace_probe_name(&tk->tp), + trace_probe_group_name(&tk->tp)); + if (old_tk) { + if (trace_kprobe_is_return(tk) != trace_kprobe_is_return(old_tk)) { + trace_probe_log_set_index(0); + trace_probe_log_err(0, DIFF_PROBE_TYPE); + ret = -EEXIST; + } else { + ret = append_trace_kprobe(tk, old_tk); + } + goto end; + } + + /* Register new event */ + ret = register_kprobe_event(tk); + if (ret) { + if (ret == -EEXIST) { + trace_probe_log_set_index(0); + trace_probe_log_err(0, EVENT_EXIST); + } else + pr_warn("Failed to register probe event(%d)\n", ret); + goto end; + } + + /* Register k*probe */ + ret = __register_trace_kprobe(tk); + if (ret == -ENOENT && !trace_kprobe_module_exist(tk)) { + pr_warn("This probe might be able to register after target module is loaded. Continue.\n"); + ret = 0; + } + + if (ret < 0) + unregister_kprobe_event(tk); + else + dyn_event_add(&tk->devent, trace_probe_event_call(&tk->tp)); + +end: + mutex_unlock(&event_mutex); + return ret; +} + +/* Module notifier call back, checking event on the module */ +static int trace_kprobe_module_callback(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct module *mod = data; + struct dyn_event *pos; + struct trace_kprobe *tk; + int ret; + + if (val != MODULE_STATE_COMING) + return NOTIFY_DONE; + + /* Update probes on coming module */ + mutex_lock(&event_mutex); + for_each_trace_kprobe(tk, pos) { + if (trace_kprobe_within_module(tk, mod)) { + /* Don't need to check busy - this should have gone. 
*/ + __unregister_trace_kprobe(tk); + ret = __register_trace_kprobe(tk); + if (ret) + pr_warn("Failed to re-register probe %s on %s: %d\n", + trace_probe_name(&tk->tp), + module_name(mod), ret); + } + } + mutex_unlock(&event_mutex); + + return NOTIFY_DONE; +} + +static struct notifier_block trace_kprobe_module_nb = { + .notifier_call = trace_kprobe_module_callback, + .priority = 1 /* Invoked after kprobe module callback */ +}; + +static int count_symbols(void *data, unsigned long unused) +{ + unsigned int *count = data; + + (*count)++; + + return 0; +} + +struct sym_count_ctx { + unsigned int count; + const char *name; +}; + +static int count_mod_symbols(void *data, const char *name, unsigned long unused) +{ + struct sym_count_ctx *ctx = data; + + if (strcmp(name, ctx->name) == 0) + ctx->count++; + + return 0; +} + +static unsigned int number_of_same_symbols(char *func_name) +{ + struct sym_count_ctx ctx = { .count = 0, .name = func_name }; + + kallsyms_on_each_match_symbol(count_symbols, func_name, &ctx.count); + + module_kallsyms_on_each_symbol(NULL, count_mod_symbols, &ctx); + + return ctx.count; +} + +static int __trace_kprobe_create(int argc, const char *argv[]) +{ + /* + * Argument syntax: + * - Add kprobe: + * p[:[GRP/][EVENT]] [MOD:]KSYM[+OFFS]|KADDR [FETCHARGS] + * - Add kretprobe: + * r[MAXACTIVE][:[GRP/][EVENT]] [MOD:]KSYM[+0] [FETCHARGS] + * Or + * p[:[GRP/][EVENT]] [MOD:]KSYM[+0]%return [FETCHARGS] + * + * Fetch args: + * $retval : fetch return value + * $stack : fetch stack address + * $stackN : fetch Nth of stack (N:0-) + * $comm : fetch current task comm + * @ADDR : fetch memory at ADDR (ADDR should be in kernel) + * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol) + * %REG : fetch register REG + * Dereferencing memory fetch: + * +|-offs(ARG) : fetch memory at ARG +|- offs address. + * Alias name of args: + * NAME=FETCHARG : set NAME as alias of FETCHARG. + * Type of args: + * FETCHARG:TYPE : use TYPE instead of unsigned long. + */ + struct trace_kprobe *tk = NULL; + int i, len, new_argc = 0, ret = 0; + bool is_return = false; + char *symbol = NULL, *tmp = NULL; + const char **new_argv = NULL; + const char *event = NULL, *group = KPROBE_EVENT_SYSTEM; + enum probe_print_type ptype; + int maxactive = 0; + long offset = 0; + void *addr = NULL; + char buf[MAX_EVENT_NAME_LEN]; + char gbuf[MAX_EVENT_NAME_LEN]; + char abuf[MAX_BTF_ARGS_LEN]; + struct traceprobe_parse_context ctx = { .flags = TPARG_FL_KERNEL }; + + switch (argv[0][0]) { + case 'r': + is_return = true; + break; + case 'p': + break; + default: + return -ECANCELED; + } + if (argc < 2) + return -ECANCELED; + + trace_probe_log_init("trace_kprobe", argc, argv); + + event = strchr(&argv[0][1], ':'); + if (event) + event++; + + if (isdigit(argv[0][1])) { + if (!is_return) { + trace_probe_log_err(1, BAD_MAXACT_TYPE); + goto parse_error; + } + if (event) + len = event - &argv[0][1] - 1; + else + len = strlen(&argv[0][1]); + if (len > MAX_EVENT_NAME_LEN - 1) { + trace_probe_log_err(1, BAD_MAXACT); + goto parse_error; + } + memcpy(buf, &argv[0][1], len); + buf[len] = '\0'; + ret = kstrtouint(buf, 0, &maxactive); + if (ret || !maxactive) { + trace_probe_log_err(1, BAD_MAXACT); + goto parse_error; + } + /* kretprobes instances are iterated over via a list. The + * maximum should stay reasonable. + */ + if (maxactive > KRETPROBE_MAXACTIVE_MAX) { + trace_probe_log_err(1, MAXACT_TOO_BIG); + goto parse_error; + } + } + + /* try to parse an address. if that fails, try to read the + * input as a symbol. 
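+ * Editor's illustration (symbols are only assumed to exist):
+ * "p 0xffffffff81234567" takes the raw-address path, while
+ * "p vfs_read+0x10" or "p ext4:ext4_sync_file" fall through to the
+ * symbol-parsing path below.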
*/ + if (kstrtoul(argv[1], 0, (unsigned long *)&addr)) { + trace_probe_log_set_index(1); + /* Check whether uprobe event specified */ + if (strchr(argv[1], '/') && strchr(argv[1], ':')) { + ret = -ECANCELED; + goto error; + } + /* a symbol specified */ + symbol = kstrdup(argv[1], GFP_KERNEL); + if (!symbol) + return -ENOMEM; + + tmp = strchr(symbol, '%'); + if (tmp) { + if (!strcmp(tmp, "%return")) { + *tmp = '\0'; + is_return = true; + } else { + trace_probe_log_err(tmp - symbol, BAD_ADDR_SUFFIX); + goto parse_error; + } + } + + /* TODO: support .init module functions */ + ret = traceprobe_split_symbol_offset(symbol, &offset); + if (ret || offset < 0 || offset > UINT_MAX) { + trace_probe_log_err(0, BAD_PROBE_ADDR); + goto parse_error; + } + if (is_return) + ctx.flags |= TPARG_FL_RETURN; + ret = kprobe_on_func_entry(NULL, symbol, offset); + if (ret == 0 && !is_return) + ctx.flags |= TPARG_FL_FENTRY; + /* Defer the ENOENT case until register kprobe */ + if (ret == -EINVAL && is_return) { + trace_probe_log_err(0, BAD_RETPROBE); + goto parse_error; + } + } + + if (symbol && !strchr(symbol, ':')) { + unsigned int count; + + count = number_of_same_symbols(symbol); + if (count > 1) { + /* + * Users should use ADDR to remove the ambiguity of + * using KSYM only. + */ + trace_probe_log_err(0, NON_UNIQ_SYMBOL); + ret = -EADDRNOTAVAIL; + + goto error; + } else if (count == 0) { + /* + * We can return ENOENT earlier than when register the + * kprobe. + */ + trace_probe_log_err(0, BAD_PROBE_ADDR); + ret = -ENOENT; + + goto error; + } + } + + trace_probe_log_set_index(0); + if (event) { + ret = traceprobe_parse_event_name(&event, &group, gbuf, + event - argv[0]); + if (ret) + goto parse_error; + } + + if (!event) { + /* Make a new event name */ + if (symbol) + snprintf(buf, MAX_EVENT_NAME_LEN, "%c_%s_%ld", + is_return ? 'r' : 'p', symbol, offset); + else + snprintf(buf, MAX_EVENT_NAME_LEN, "%c_0x%p", + is_return ? 'r' : 'p', addr); + sanitize_event_name(buf); + event = buf; + } + + argc -= 2; argv += 2; + ctx.funcname = symbol; + new_argv = traceprobe_expand_meta_args(argc, argv, &new_argc, + abuf, MAX_BTF_ARGS_LEN, &ctx); + if (IS_ERR(new_argv)) { + ret = PTR_ERR(new_argv); + new_argv = NULL; + goto out; + } + if (new_argv) { + argc = new_argc; + argv = new_argv; + } + + /* setup a probe */ + tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive, + argc, is_return); + if (IS_ERR(tk)) { + ret = PTR_ERR(tk); + /* This must return -ENOMEM, else there is a bug */ + WARN_ON_ONCE(ret != -ENOMEM); + goto out; /* We know tk is not allocated */ + } + + /* parse arguments */ + for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { + trace_probe_log_set_index(i + 2); + ctx.offset = 0; + ret = traceprobe_parse_probe_arg(&tk->tp, i, argv[i], &ctx); + if (ret) + goto error; /* This can be -ENOMEM */ + } + + ptype = is_return ? 
PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL; + ret = traceprobe_set_print_fmt(&tk->tp, ptype); + if (ret < 0) + goto error; + + ret = register_trace_kprobe(tk); + if (ret) { + trace_probe_log_set_index(1); + if (ret == -EILSEQ) + trace_probe_log_err(0, BAD_INSN_BNDRY); + else if (ret == -ENOENT) + trace_probe_log_err(0, BAD_PROBE_ADDR); + else if (ret != -ENOMEM && ret != -EEXIST) + trace_probe_log_err(0, FAIL_REG_PROBE); + goto error; + } + +out: + traceprobe_finish_parse(&ctx); + trace_probe_log_clear(); + kfree(new_argv); + kfree(symbol); + return ret; + +parse_error: + ret = -EINVAL; +error: + free_trace_kprobe(tk); + goto out; +} + +static int trace_kprobe_create(const char *raw_command) +{ + return trace_probe_create(raw_command, __trace_kprobe_create); +} + +static int create_or_delete_trace_kprobe(const char *raw_command) +{ + int ret; + + if (raw_command[0] == '-') + return dyn_event_release(raw_command, &trace_kprobe_ops); + + ret = trace_kprobe_create(raw_command); + return ret == -ECANCELED ? -EINVAL : ret; +} + +static int trace_kprobe_run_command(struct dynevent_cmd *cmd) +{ + return create_or_delete_trace_kprobe(cmd->seq.buffer); +} + +/** + * kprobe_event_cmd_init - Initialize a kprobe event command object + * @cmd: A pointer to the dynevent_cmd struct representing the new event + * @buf: A pointer to the buffer used to build the command + * @maxlen: The length of the buffer passed in @buf + * + * Initialize a synthetic event command object. Use this before + * calling any of the other kprobe_event functions. + */ +void kprobe_event_cmd_init(struct dynevent_cmd *cmd, char *buf, int maxlen) +{ + dynevent_cmd_init(cmd, buf, maxlen, DYNEVENT_TYPE_KPROBE, + trace_kprobe_run_command); +} +EXPORT_SYMBOL_GPL(kprobe_event_cmd_init); + +/** + * __kprobe_event_gen_cmd_start - Generate a kprobe event command from arg list + * @cmd: A pointer to the dynevent_cmd struct representing the new event + * @kretprobe: Is this a return probe? + * @name: The name of the kprobe event + * @loc: The location of the kprobe event + * @...: Variable number of arg (pairs), one pair for each field + * + * NOTE: Users normally won't want to call this function directly, but + * rather use the kprobe_event_gen_cmd_start() wrapper, which automatically + * adds a NULL to the end of the arg list. If this function is used + * directly, make sure the last arg in the variable arg list is NULL. + * + * Generate a kprobe event command to be executed by + * kprobe_event_gen_cmd_end(). This function can be used to generate the + * complete command or only the first part of it; in the latter case, + * kprobe_event_add_fields() can be used to add more fields following this. + * + * Unlikely the synth_event_gen_cmd_start(), @loc must be specified. This + * returns -EINVAL if @loc == NULL. + * + * Return: 0 if successful, error otherwise. + */ +int __kprobe_event_gen_cmd_start(struct dynevent_cmd *cmd, bool kretprobe, + const char *name, const char *loc, ...) 
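+/*
+ * Editor's usage sketch for the kerneldoc above (hedged: the
+ * kprobe_event_gen_cmd_start()/kprobe_event_gen_cmd_end() wrappers and
+ * MAX_DYNEVENT_CMD_LEN are assumed from the dynevent API, and do_sys_open
+ * plus the %di/%si registers are only example values):
+ *
+ *	struct dynevent_cmd cmd;
+ *	char *buf = kzalloc(MAX_DYNEVENT_CMD_LEN, GFP_KERNEL);
+ *	int ret;
+ *
+ *	kprobe_event_cmd_init(&cmd, buf, MAX_DYNEVENT_CMD_LEN);
+ *	ret = kprobe_event_gen_cmd_start(&cmd, "myprobe", "do_sys_open",
+ *					 "dfd=%di", "flags=%si");
+ *	if (!ret)
+ *		ret = kprobe_event_gen_cmd_end(&cmd);
+ */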
+{ + char buf[MAX_EVENT_NAME_LEN]; + struct dynevent_arg arg; + va_list args; + int ret; + + if (cmd->type != DYNEVENT_TYPE_KPROBE) + return -EINVAL; + + if (!loc) + return -EINVAL; + + if (kretprobe) + snprintf(buf, MAX_EVENT_NAME_LEN, "r:kprobes/%s", name); + else + snprintf(buf, MAX_EVENT_NAME_LEN, "p:kprobes/%s", name); + + ret = dynevent_str_add(cmd, buf); + if (ret) + return ret; + + dynevent_arg_init(&arg, 0); + arg.str = loc; + ret = dynevent_arg_add(cmd, &arg, NULL); + if (ret) + return ret; + + va_start(args, loc); + for (;;) { + const char *field; + + field = va_arg(args, const char *); + if (!field) + break; + + if (++cmd->n_fields > MAX_TRACE_ARGS) { + ret = -EINVAL; + break; + } + + arg.str = field; + ret = dynevent_arg_add(cmd, &arg, NULL); + if (ret) + break; + } + va_end(args); + + return ret; +} +EXPORT_SYMBOL_GPL(__kprobe_event_gen_cmd_start); + +/** + * __kprobe_event_add_fields - Add probe fields to a kprobe command from arg list + * @cmd: A pointer to the dynevent_cmd struct representing the new event + * @...: Variable number of arg (pairs), one pair for each field + * + * NOTE: Users normally won't want to call this function directly, but + * rather use the kprobe_event_add_fields() wrapper, which + * automatically adds a NULL to the end of the arg list. If this + * function is used directly, make sure the last arg in the variable + * arg list is NULL. + * + * Add probe fields to an existing kprobe command using a variable + * list of args. Fields are added in the same order they're listed. + * + * Return: 0 if successful, error otherwise. + */ +int __kprobe_event_add_fields(struct dynevent_cmd *cmd, ...) +{ + struct dynevent_arg arg; + va_list args; + int ret = 0; + + if (cmd->type != DYNEVENT_TYPE_KPROBE) + return -EINVAL; + + dynevent_arg_init(&arg, 0); + + va_start(args, cmd); + for (;;) { + const char *field; + + field = va_arg(args, const char *); + if (!field) + break; + + if (++cmd->n_fields > MAX_TRACE_ARGS) { + ret = -EINVAL; + break; + } + + arg.str = field; + ret = dynevent_arg_add(cmd, &arg, NULL); + if (ret) + break; + } + va_end(args); + + return ret; +} +EXPORT_SYMBOL_GPL(__kprobe_event_add_fields); + +/** + * kprobe_event_delete - Delete a kprobe event + * @name: The name of the kprobe event to delete + * + * Delete a kprobe event with the give @name from kernel code rather + * than directly from the command line. + * + * Return: 0 if successful, error otherwise. + */ +int kprobe_event_delete(const char *name) +{ + char buf[MAX_EVENT_NAME_LEN]; + + snprintf(buf, MAX_EVENT_NAME_LEN, "-:%s", name); + + return create_or_delete_trace_kprobe(buf); +} +EXPORT_SYMBOL_GPL(kprobe_event_delete); + +static int trace_kprobe_release(struct dyn_event *ev) +{ + struct trace_kprobe *tk = to_trace_kprobe(ev); + int ret = unregister_trace_kprobe(tk); + + if (!ret) + free_trace_kprobe(tk); + return ret; +} + +static int trace_kprobe_show(struct seq_file *m, struct dyn_event *ev) +{ + struct trace_kprobe *tk = to_trace_kprobe(ev); + int i; + + seq_putc(m, trace_kprobe_is_return(tk) ? 
'r' : 'p'); + if (trace_kprobe_is_return(tk) && tk->rp.maxactive) + seq_printf(m, "%d", tk->rp.maxactive); + seq_printf(m, ":%s/%s", trace_probe_group_name(&tk->tp), + trace_probe_name(&tk->tp)); + + if (!tk->symbol) + seq_printf(m, " 0x%p", tk->rp.kp.addr); + else if (tk->rp.kp.offset) + seq_printf(m, " %s+%u", trace_kprobe_symbol(tk), + tk->rp.kp.offset); + else + seq_printf(m, " %s", trace_kprobe_symbol(tk)); + + for (i = 0; i < tk->tp.nr_args; i++) + seq_printf(m, " %s=%s", tk->tp.args[i].name, tk->tp.args[i].comm); + seq_putc(m, '\n'); + + return 0; +} + +static int probes_seq_show(struct seq_file *m, void *v) +{ + struct dyn_event *ev = v; + + if (!is_trace_kprobe(ev)) + return 0; + + return trace_kprobe_show(m, ev); +} + +static const struct seq_operations probes_seq_op = { + .start = dyn_event_seq_start, + .next = dyn_event_seq_next, + .stop = dyn_event_seq_stop, + .show = probes_seq_show +}; + +static int probes_open(struct inode *inode, struct file *file) +{ + int ret; + + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + + if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { + ret = dyn_events_release_all(&trace_kprobe_ops); + if (ret < 0) + return ret; + } + + return seq_open(file, &probes_seq_op); +} + +static ssize_t probes_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) +{ + return trace_parse_run_command(file, buffer, count, ppos, + create_or_delete_trace_kprobe); +} + +static const struct file_operations kprobe_events_ops = { + .owner = THIS_MODULE, + .open = probes_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, + .write = probes_write, +}; + +/* Probes profiling interfaces */ +static int probes_profile_seq_show(struct seq_file *m, void *v) +{ + struct dyn_event *ev = v; + struct trace_kprobe *tk; + unsigned long nmissed; + + if (!is_trace_kprobe(ev)) + return 0; + + tk = to_trace_kprobe(ev); + nmissed = trace_kprobe_is_return(tk) ? 
+ tk->rp.kp.nmissed + tk->rp.nmissed : tk->rp.kp.nmissed; + seq_printf(m, " %-44s %15lu %15lu\n", + trace_probe_name(&tk->tp), + trace_kprobe_nhit(tk), + nmissed); + + return 0; +} + +static const struct seq_operations profile_seq_op = { + .start = dyn_event_seq_start, + .next = dyn_event_seq_next, + .stop = dyn_event_seq_stop, + .show = probes_profile_seq_show +}; + +static int profile_open(struct inode *inode, struct file *file) +{ + int ret; + + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + + return seq_open(file, &profile_seq_op); +} + +static const struct file_operations kprobe_profile_ops = { + .owner = THIS_MODULE, + .open = profile_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +/* Note that we don't verify it, since the code does not come from user space */ +static int +process_fetch_insn(struct fetch_insn *code, void *rec, void *dest, + void *base) +{ + struct pt_regs *regs = rec; + unsigned long val; + int ret; + +retry: + /* 1st stage: get value from context */ + switch (code->op) { + case FETCH_OP_REG: + val = regs_get_register(regs, code->param); + break; + case FETCH_OP_STACK: + val = regs_get_kernel_stack_nth(regs, code->param); + break; + case FETCH_OP_STACKP: + val = kernel_stack_pointer(regs); + break; + case FETCH_OP_RETVAL: + val = regs_return_value(regs); + break; +#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API + case FETCH_OP_ARG: + val = regs_get_kernel_argument(regs, code->param); + break; +#endif + case FETCH_NOP_SYMBOL: /* Ignore a place holder */ + code++; + goto retry; + default: + ret = process_common_fetch_insn(code, &val); + if (ret < 0) + return ret; + } + code++; + + return process_fetch_insn_bottom(code, val, dest, base); +} +NOKPROBE_SYMBOL(process_fetch_insn) + +/* Kprobe handler */ +static nokprobe_inline void +__kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs, + struct trace_event_file *trace_file) +{ + struct kprobe_trace_entry_head *entry; + struct trace_event_call *call = trace_probe_event_call(&tk->tp); + struct trace_event_buffer fbuffer; + int dsize; + + WARN_ON(call != trace_file->event_call); + + if (trace_trigger_soft_disabled(trace_file)) + return; + + dsize = __get_data_size(&tk->tp, regs); + + entry = trace_event_buffer_reserve(&fbuffer, trace_file, + sizeof(*entry) + tk->tp.size + dsize); + if (!entry) + return; + + fbuffer.regs = regs; + entry->ip = (unsigned long)tk->rp.kp.addr; + store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize); + + trace_event_buffer_commit(&fbuffer); +} + +static void +kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs) +{ + struct event_file_link *link; + + trace_probe_for_each_link_rcu(link, &tk->tp) + __kprobe_trace_func(tk, regs, link->file); +} +NOKPROBE_SYMBOL(kprobe_trace_func); + +/* Kretprobe handler */ +static nokprobe_inline void +__kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, + struct pt_regs *regs, + struct trace_event_file *trace_file) +{ + struct kretprobe_trace_entry_head *entry; + struct trace_event_buffer fbuffer; + struct trace_event_call *call = trace_probe_event_call(&tk->tp); + int dsize; + + WARN_ON(call != trace_file->event_call); + + if (trace_trigger_soft_disabled(trace_file)) + return; + + dsize = __get_data_size(&tk->tp, regs); + + entry = trace_event_buffer_reserve(&fbuffer, trace_file, + sizeof(*entry) + tk->tp.size + dsize); + if (!entry) + return; + + fbuffer.regs = regs; + entry->func = (unsigned long)tk->rp.kp.addr; + entry->ret_ip = 
get_kretprobe_retaddr(ri); + store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize); + + trace_event_buffer_commit(&fbuffer); +} + +static void +kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, + struct pt_regs *regs) +{ + struct event_file_link *link; + + trace_probe_for_each_link_rcu(link, &tk->tp) + __kretprobe_trace_func(tk, ri, regs, link->file); +} +NOKPROBE_SYMBOL(kretprobe_trace_func); + +/* Event entry printers */ +static enum print_line_t +print_kprobe_event(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct kprobe_trace_entry_head *field; + struct trace_seq *s = &iter->seq; + struct trace_probe *tp; + + field = (struct kprobe_trace_entry_head *)iter->ent; + tp = trace_probe_primary_from_call( + container_of(event, struct trace_event_call, event)); + if (WARN_ON_ONCE(!tp)) + goto out; + + trace_seq_printf(s, "%s: (", trace_probe_name(tp)); + + if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET)) + goto out; + + trace_seq_putc(s, ')'); + + if (trace_probe_print_args(s, tp->args, tp->nr_args, + (u8 *)&field[1], field) < 0) + goto out; + + trace_seq_putc(s, '\n'); + out: + return trace_handle_return(s); +} + +static enum print_line_t +print_kretprobe_event(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct kretprobe_trace_entry_head *field; + struct trace_seq *s = &iter->seq; + struct trace_probe *tp; + + field = (struct kretprobe_trace_entry_head *)iter->ent; + tp = trace_probe_primary_from_call( + container_of(event, struct trace_event_call, event)); + if (WARN_ON_ONCE(!tp)) + goto out; + + trace_seq_printf(s, "%s: (", trace_probe_name(tp)); + + if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET)) + goto out; + + trace_seq_puts(s, " <- "); + + if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET)) + goto out; + + trace_seq_putc(s, ')'); + + if (trace_probe_print_args(s, tp->args, tp->nr_args, + (u8 *)&field[1], field) < 0) + goto out; + + trace_seq_putc(s, '\n'); + + out: + return trace_handle_return(s); +} + + +static int kprobe_event_define_fields(struct trace_event_call *event_call) +{ + int ret; + struct kprobe_trace_entry_head field; + struct trace_probe *tp; + + tp = trace_probe_primary_from_call(event_call); + if (WARN_ON_ONCE(!tp)) + return -ENOENT; + + DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); + + return traceprobe_define_arg_fields(event_call, sizeof(field), tp); +} + +static int kretprobe_event_define_fields(struct trace_event_call *event_call) +{ + int ret; + struct kretprobe_trace_entry_head field; + struct trace_probe *tp; + + tp = trace_probe_primary_from_call(event_call); + if (WARN_ON_ONCE(!tp)) + return -ENOENT; + + DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0); + DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); + + return traceprobe_define_arg_fields(event_call, sizeof(field), tp); +} + +#ifdef CONFIG_PERF_EVENTS + +/* Kprobe profile handler */ +static int +kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) +{ + struct trace_event_call *call = trace_probe_event_call(&tk->tp); + struct kprobe_trace_entry_head *entry; + struct hlist_head *head; + int size, __size, dsize; + int rctx; + + if (bpf_prog_array_valid(call)) { + unsigned long orig_ip = instruction_pointer(regs); + int ret; + + ret = trace_call_bpf(call, regs); + + /* + * We need to check and see if we modified the pc of the + * pt_regs, and if so return 1 so that we don't do the + * single stepping. 
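+ *
+ * (Editor's note, assumption: this is the error-injection path - a BPF
+ * program using bpf_override_return() rewrites regs->ip, and returning 1
+ * here tells the kprobe core to skip single-stepping the displaced
+ * instruction.)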
+ */ + if (orig_ip != instruction_pointer(regs)) + return 1; + if (!ret) + return 0; + } + + head = this_cpu_ptr(call->perf_events); + if (hlist_empty(head)) + return 0; + + dsize = __get_data_size(&tk->tp, regs); + __size = sizeof(*entry) + tk->tp.size + dsize; + size = ALIGN(__size + sizeof(u32), sizeof(u64)); + size -= sizeof(u32); + + entry = perf_trace_buf_alloc(size, NULL, &rctx); + if (!entry) + return 0; + + entry->ip = (unsigned long)tk->rp.kp.addr; + memset(&entry[1], 0, dsize); + store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize); + perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, + head, NULL); + return 0; +} +NOKPROBE_SYMBOL(kprobe_perf_func); + +/* Kretprobe profile handler */ +static void +kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, + struct pt_regs *regs) +{ + struct trace_event_call *call = trace_probe_event_call(&tk->tp); + struct kretprobe_trace_entry_head *entry; + struct hlist_head *head; + int size, __size, dsize; + int rctx; + + if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs)) + return; + + head = this_cpu_ptr(call->perf_events); + if (hlist_empty(head)) + return; + + dsize = __get_data_size(&tk->tp, regs); + __size = sizeof(*entry) + tk->tp.size + dsize; + size = ALIGN(__size + sizeof(u32), sizeof(u64)); + size -= sizeof(u32); + + entry = perf_trace_buf_alloc(size, NULL, &rctx); + if (!entry) + return; + + entry->func = (unsigned long)tk->rp.kp.addr; + entry->ret_ip = get_kretprobe_retaddr(ri); + store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize); + perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, + head, NULL); +} +NOKPROBE_SYMBOL(kretprobe_perf_func); + +int bpf_get_kprobe_info(const struct perf_event *event, u32 *fd_type, + const char **symbol, u64 *probe_offset, + u64 *probe_addr, bool perf_type_tracepoint) +{ + const char *pevent = trace_event_name(event->tp_event); + const char *group = event->tp_event->class->system; + struct trace_kprobe *tk; + + if (perf_type_tracepoint) + tk = find_trace_kprobe(pevent, group); + else + tk = trace_kprobe_primary_from_call(event->tp_event); + if (!tk) + return -EINVAL; + + *fd_type = trace_kprobe_is_return(tk) ? BPF_FD_TYPE_KRETPROBE + : BPF_FD_TYPE_KPROBE; + *probe_offset = tk->rp.kp.offset; + *probe_addr = kallsyms_show_value(current_cred()) ? + (unsigned long)tk->rp.kp.addr : 0; + *symbol = tk->symbol; + return 0; +} +#endif /* CONFIG_PERF_EVENTS */ + +/* + * called by perf_trace_init() or __ftrace_set_clr_event() under event_mutex. + * + * kprobe_trace_self_tests_init() does enable_trace_probe/disable_trace_probe + * lockless, but we can't race with this __init function. 
+ */ +static int kprobe_register(struct trace_event_call *event, + enum trace_reg type, void *data) +{ + struct trace_event_file *file = data; + + switch (type) { + case TRACE_REG_REGISTER: + return enable_trace_kprobe(event, file); + case TRACE_REG_UNREGISTER: + return disable_trace_kprobe(event, file); + +#ifdef CONFIG_PERF_EVENTS + case TRACE_REG_PERF_REGISTER: + return enable_trace_kprobe(event, NULL); + case TRACE_REG_PERF_UNREGISTER: + return disable_trace_kprobe(event, NULL); + case TRACE_REG_PERF_OPEN: + case TRACE_REG_PERF_CLOSE: + case TRACE_REG_PERF_ADD: + case TRACE_REG_PERF_DEL: + return 0; +#endif + } + return 0; +} + +static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) +{ + struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp); + int ret = 0; + + raw_cpu_inc(*tk->nhit); + + if (trace_probe_test_flag(&tk->tp, TP_FLAG_TRACE)) + kprobe_trace_func(tk, regs); +#ifdef CONFIG_PERF_EVENTS + if (trace_probe_test_flag(&tk->tp, TP_FLAG_PROFILE)) + ret = kprobe_perf_func(tk, regs); +#endif + return ret; +} +NOKPROBE_SYMBOL(kprobe_dispatcher); + +static int +kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) +{ + struct kretprobe *rp = get_kretprobe(ri); + struct trace_kprobe *tk; + + /* + * There is a small chance that get_kretprobe(ri) returns NULL when + * the kretprobe is unregister on another CPU between kretprobe's + * trampoline_handler and this function. + */ + if (unlikely(!rp)) + return 0; + + tk = container_of(rp, struct trace_kprobe, rp); + raw_cpu_inc(*tk->nhit); + + if (trace_probe_test_flag(&tk->tp, TP_FLAG_TRACE)) + kretprobe_trace_func(tk, ri, regs); +#ifdef CONFIG_PERF_EVENTS + if (trace_probe_test_flag(&tk->tp, TP_FLAG_PROFILE)) + kretprobe_perf_func(tk, ri, regs); +#endif + return 0; /* We don't tweak kernel, so just return 0 */ +} +NOKPROBE_SYMBOL(kretprobe_dispatcher); + +static struct trace_event_functions kretprobe_funcs = { + .trace = print_kretprobe_event +}; + +static struct trace_event_functions kprobe_funcs = { + .trace = print_kprobe_event +}; + +static struct trace_event_fields kretprobe_fields_array[] = { + { .type = TRACE_FUNCTION_TYPE, + .define_fields = kretprobe_event_define_fields }, + {} +}; + +static struct trace_event_fields kprobe_fields_array[] = { + { .type = TRACE_FUNCTION_TYPE, + .define_fields = kprobe_event_define_fields }, + {} +}; + +static inline void init_trace_event_call(struct trace_kprobe *tk) +{ + struct trace_event_call *call = trace_probe_event_call(&tk->tp); + + if (trace_kprobe_is_return(tk)) { + call->event.funcs = &kretprobe_funcs; + call->class->fields_array = kretprobe_fields_array; + } else { + call->event.funcs = &kprobe_funcs; + call->class->fields_array = kprobe_fields_array; + } + + call->flags = TRACE_EVENT_FL_KPROBE; + call->class->reg = kprobe_register; +} + +static int register_kprobe_event(struct trace_kprobe *tk) +{ + init_trace_event_call(tk); + + return trace_probe_register_event_call(&tk->tp); +} + +static int unregister_kprobe_event(struct trace_kprobe *tk) +{ + return trace_probe_unregister_event_call(&tk->tp); +} + +#ifdef CONFIG_PERF_EVENTS + +/* create a trace_kprobe, but don't add it to global lists */ +struct trace_event_call * +create_local_trace_kprobe(char *func, void *addr, unsigned long offs, + bool is_return) +{ + enum probe_print_type ptype; + struct trace_kprobe *tk; + int ret; + char *event; + + if (func) { + unsigned int count; + + count = number_of_same_symbols(func); + if (count > 1) + /* + * Users should use addr to remove the 
ambiguity of + * using func only. + */ + return ERR_PTR(-EADDRNOTAVAIL); + else if (count == 0) + /* + * We can return ENOENT earlier than when register the + * kprobe. + */ + return ERR_PTR(-ENOENT); + } + + /* + * local trace_kprobes are not added to dyn_event, so they are never + * searched in find_trace_kprobe(). Therefore, there is no concern of + * duplicated name here. + */ + event = func ? func : "DUMMY_EVENT"; + + tk = alloc_trace_kprobe(KPROBE_EVENT_SYSTEM, event, (void *)addr, func, + offs, 0 /* maxactive */, 0 /* nargs */, + is_return); + + if (IS_ERR(tk)) { + pr_info("Failed to allocate trace_probe.(%d)\n", + (int)PTR_ERR(tk)); + return ERR_CAST(tk); + } + + init_trace_event_call(tk); + + ptype = trace_kprobe_is_return(tk) ? + PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL; + if (traceprobe_set_print_fmt(&tk->tp, ptype) < 0) { + ret = -ENOMEM; + goto error; + } + + ret = __register_trace_kprobe(tk); + if (ret < 0) + goto error; + + return trace_probe_event_call(&tk->tp); +error: + free_trace_kprobe(tk); + return ERR_PTR(ret); +} + +void destroy_local_trace_kprobe(struct trace_event_call *event_call) +{ + struct trace_kprobe *tk; + + tk = trace_kprobe_primary_from_call(event_call); + if (unlikely(!tk)) + return; + + if (trace_probe_is_enabled(&tk->tp)) { + WARN_ON(1); + return; + } + + __unregister_trace_kprobe(tk); + + free_trace_kprobe(tk); +} +#endif /* CONFIG_PERF_EVENTS */ + +static __init void enable_boot_kprobe_events(void) +{ + struct trace_array *tr = top_trace_array(); + struct trace_event_file *file; + struct trace_kprobe *tk; + struct dyn_event *pos; + + mutex_lock(&event_mutex); + for_each_trace_kprobe(tk, pos) { + list_for_each_entry(file, &tr->events, list) + if (file->event_call == trace_probe_event_call(&tk->tp)) + trace_event_enable_disable(file, 1, 0); + } + mutex_unlock(&event_mutex); +} + +static __init void setup_boot_kprobe_events(void) +{ + char *p, *cmd = kprobe_boot_events_buf; + int ret; + + strreplace(kprobe_boot_events_buf, ',', ' '); + + while (cmd && *cmd != '\0') { + p = strchr(cmd, ';'); + if (p) + *p++ = '\0'; + + ret = create_or_delete_trace_kprobe(cmd); + if (ret) + pr_warn("Failed to add event(%d): %s\n", ret, cmd); + + cmd = p; + } + + enable_boot_kprobe_events(); +} + +/* + * Register dynevent at core_initcall. This allows kernel to setup kprobe + * events in postcore_initcall without tracefs. 
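+ *
+ * (Editor's note: initcall ordering is core_initcall -> postcore_initcall
+ * -> ... -> fs_initcall, so dynevent registration here precedes both the
+ * early users and the tracefs files created later in init_kprobe_trace().)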
+ */ +static __init int init_kprobe_trace_early(void) +{ + int ret; + + ret = dyn_event_register(&trace_kprobe_ops); + if (ret) + return ret; + + if (register_module_notifier(&trace_kprobe_module_nb)) + return -EINVAL; + + return 0; +} +core_initcall(init_kprobe_trace_early); + +/* Make a tracefs interface for controlling probe points */ +static __init int init_kprobe_trace(void) +{ + int ret; + + ret = tracing_init_dentry(); + if (ret) + return 0; + + /* Event list interface */ + trace_create_file("kprobe_events", TRACE_MODE_WRITE, + NULL, NULL, &kprobe_events_ops); + + /* Profile interface */ + trace_create_file("kprobe_profile", TRACE_MODE_READ, + NULL, NULL, &kprobe_profile_ops); + + setup_boot_kprobe_events(); + + return 0; +} +fs_initcall(init_kprobe_trace); + + +#ifdef CONFIG_FTRACE_STARTUP_TEST +static __init struct trace_event_file * +find_trace_probe_file(struct trace_kprobe *tk, struct trace_array *tr) +{ + struct trace_event_file *file; + + list_for_each_entry(file, &tr->events, list) + if (file->event_call == trace_probe_event_call(&tk->tp)) + return file; + + return NULL; +} + +/* + * Nobody but us can call enable_trace_kprobe/disable_trace_kprobe at this + * stage, we can do this lockless. + */ +static __init int kprobe_trace_self_tests_init(void) +{ + int ret, warn = 0; + int (*target)(int, int, int, int, int, int); + struct trace_kprobe *tk; + struct trace_event_file *file; + + if (tracing_is_disabled()) + return -ENODEV; + + if (tracing_selftest_disabled) + return 0; + + target = kprobe_trace_selftest_target; + + pr_info("Testing kprobe tracing: "); + + ret = create_or_delete_trace_kprobe("p:testprobe kprobe_trace_selftest_target $stack $stack0 +0($stack)"); + if (WARN_ON_ONCE(ret)) { + pr_warn("error on probing function entry.\n"); + warn++; + } else { + /* Enable trace point */ + tk = find_trace_kprobe("testprobe", KPROBE_EVENT_SYSTEM); + if (WARN_ON_ONCE(tk == NULL)) { + pr_warn("error on getting new probe.\n"); + warn++; + } else { + file = find_trace_probe_file(tk, top_trace_array()); + if (WARN_ON_ONCE(file == NULL)) { + pr_warn("error on getting probe file.\n"); + warn++; + } else + enable_trace_kprobe( + trace_probe_event_call(&tk->tp), file); + } + } + + ret = create_or_delete_trace_kprobe("r:testprobe2 kprobe_trace_selftest_target $retval"); + if (WARN_ON_ONCE(ret)) { + pr_warn("error on probing function return.\n"); + warn++; + } else { + /* Enable trace point */ + tk = find_trace_kprobe("testprobe2", KPROBE_EVENT_SYSTEM); + if (WARN_ON_ONCE(tk == NULL)) { + pr_warn("error on getting 2nd new probe.\n"); + warn++; + } else { + file = find_trace_probe_file(tk, top_trace_array()); + if (WARN_ON_ONCE(file == NULL)) { + pr_warn("error on getting probe file.\n"); + warn++; + } else + enable_trace_kprobe( + trace_probe_event_call(&tk->tp), file); + } + } + + if (warn) + goto end; + + ret = target(1, 2, 3, 4, 5, 6); + + /* + * Not expecting an error here, the check is only to prevent the + * optimizer from removing the call to target() as otherwise there + * are no side-effects and the call is never performed. 
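+ *
+ * (Editor's note: kprobe_trace_selftest_target() just sums its six
+ * arguments, so target(1, 2, 3, 4, 5, 6) must return 1+2+3+4+5+6 = 21;
+ * any other value indicates the call itself went wrong.)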
+ */ + if (ret != 21) + warn++; + + /* Disable trace points before removing it */ + tk = find_trace_kprobe("testprobe", KPROBE_EVENT_SYSTEM); + if (WARN_ON_ONCE(tk == NULL)) { + pr_warn("error on getting test probe.\n"); + warn++; + } else { + if (trace_kprobe_nhit(tk) != 1) { + pr_warn("incorrect number of testprobe hits\n"); + warn++; + } + + file = find_trace_probe_file(tk, top_trace_array()); + if (WARN_ON_ONCE(file == NULL)) { + pr_warn("error on getting probe file.\n"); + warn++; + } else + disable_trace_kprobe( + trace_probe_event_call(&tk->tp), file); + } + + tk = find_trace_kprobe("testprobe2", KPROBE_EVENT_SYSTEM); + if (WARN_ON_ONCE(tk == NULL)) { + pr_warn("error on getting 2nd test probe.\n"); + warn++; + } else { + if (trace_kprobe_nhit(tk) != 1) { + pr_warn("incorrect number of testprobe2 hits\n"); + warn++; + } + + file = find_trace_probe_file(tk, top_trace_array()); + if (WARN_ON_ONCE(file == NULL)) { + pr_warn("error on getting probe file.\n"); + warn++; + } else + disable_trace_kprobe( + trace_probe_event_call(&tk->tp), file); + } + + ret = create_or_delete_trace_kprobe("-:testprobe"); + if (WARN_ON_ONCE(ret)) { + pr_warn("error on deleting a probe.\n"); + warn++; + } + + ret = create_or_delete_trace_kprobe("-:testprobe2"); + if (WARN_ON_ONCE(ret)) { + pr_warn("error on deleting a probe.\n"); + warn++; + } + +end: + ret = dyn_events_release_all(&trace_kprobe_ops); + if (WARN_ON_ONCE(ret)) { + pr_warn("error on cleaning up probes.\n"); + warn++; + } + /* + * Wait for the optimizer work to finish. Otherwise it might fiddle + * with probes in already freed __init text. + */ + wait_for_kprobe_optimizer(); + if (warn) + pr_cont("NG: Some tests are failed. Please check them.\n"); + else + pr_cont("OK\n"); + return 0; +} + +late_initcall(kprobe_trace_self_tests_init); + +#endif diff --git a/kernel/trace/trace_kprobe_selftest.c b/kernel/trace/trace_kprobe_selftest.c new file mode 100644 index 0000000000..3851cd1e6a --- /dev/null +++ b/kernel/trace/trace_kprobe_selftest.c @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "trace_kprobe_selftest.h" + +/* + * Function used during the kprobe self test. This function is in a separate + * compile unit so it can be compile with CC_FLAGS_FTRACE to ensure that it + * can be probed by the selftests. + */ +int kprobe_trace_selftest_target(int a1, int a2, int a3, int a4, int a5, int a6) +{ + return a1 + a2 + a3 + a4 + a5 + a6; +} diff --git a/kernel/trace/trace_kprobe_selftest.h b/kernel/trace/trace_kprobe_selftest.h new file mode 100644 index 0000000000..c4fc7268ba --- /dev/null +++ b/kernel/trace/trace_kprobe_selftest.h @@ -0,0 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Function used during the kprobe self test. This function is in a separate + * compile unit so it can be compile with CC_FLAGS_FTRACE to ensure that it + * can be probed by the selftests. 
+ */ +int kprobe_trace_selftest_target(int a1, int a2, int a3, int a4, int a5, int a6); diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c new file mode 100644 index 0000000000..64e77b5136 --- /dev/null +++ b/kernel/trace/trace_mmiotrace.c @@ -0,0 +1,362 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Memory mapped I/O tracing + * + * Copyright (C) 2008 Pekka Paalanen <pq@iki.fi> + */ + +#include <linux/kernel.h> +#include <linux/mmiotrace.h> +#include <linux/pci.h> +#include <linux/slab.h> +#include <linux/time.h> + +#include <linux/atomic.h> + +#include "trace.h" +#include "trace_output.h" + +struct header_iter { + struct pci_dev *dev; +}; + +static struct trace_array *mmio_trace_array; +static bool overrun_detected; +static unsigned long prev_overruns; +static atomic_t dropped_count; + +static void mmio_reset_data(struct trace_array *tr) +{ + overrun_detected = false; + prev_overruns = 0; + + tracing_reset_online_cpus(&tr->array_buffer); +} + +static int mmio_trace_init(struct trace_array *tr) +{ + pr_debug("in %s\n", __func__); + mmio_trace_array = tr; + + mmio_reset_data(tr); + enable_mmiotrace(); + return 0; +} + +static void mmio_trace_reset(struct trace_array *tr) +{ + pr_debug("in %s\n", __func__); + + disable_mmiotrace(); + mmio_reset_data(tr); + mmio_trace_array = NULL; +} + +static void mmio_trace_start(struct trace_array *tr) +{ + pr_debug("in %s\n", __func__); + mmio_reset_data(tr); +} + +static void mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev) +{ + int i; + resource_size_t start, end; + const struct pci_driver *drv = pci_dev_driver(dev); + + trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x", + dev->bus->number, dev->devfn, + dev->vendor, dev->device, dev->irq); + for (i = 0; i < 7; i++) { + start = dev->resource[i].start; + trace_seq_printf(s, " %llx", + (unsigned long long)(start | + (dev->resource[i].flags & PCI_REGION_FLAG_MASK))); + } + for (i = 0; i < 7; i++) { + start = dev->resource[i].start; + end = dev->resource[i].end; + trace_seq_printf(s, " %llx", + dev->resource[i].start < dev->resource[i].end ? + (unsigned long long)(end - start) + 1 : 0); + } + if (drv) + trace_seq_printf(s, " %s\n", drv->name); + else + trace_seq_puts(s, " \n"); +} + +static void destroy_header_iter(struct header_iter *hiter) +{ + if (!hiter) + return; + pci_dev_put(hiter->dev); + kfree(hiter); +} + +static void mmio_pipe_open(struct trace_iterator *iter) +{ + struct header_iter *hiter; + struct trace_seq *s = &iter->seq; + + trace_seq_puts(s, "VERSION 20070824\n"); + + hiter = kzalloc(sizeof(*hiter), GFP_KERNEL); + if (!hiter) + return; + + hiter->dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, NULL); + iter->private = hiter; +} + +/* XXX: This is not called when the pipe is closed! 
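+ * (Editor's note: the private header_iter is still freed from mmio_read()
+ * once the PCI device walk is exhausted, so the missing close only matters
+ * if the pipe is abandoned early.)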
*/ +static void mmio_close(struct trace_iterator *iter) +{ + struct header_iter *hiter = iter->private; + destroy_header_iter(hiter); + iter->private = NULL; +} + +static unsigned long count_overruns(struct trace_iterator *iter) +{ + unsigned long cnt = atomic_xchg(&dropped_count, 0); + unsigned long over = ring_buffer_overruns(iter->array_buffer->buffer); + + if (over > prev_overruns) + cnt += over - prev_overruns; + prev_overruns = over; + return cnt; +} + +static ssize_t mmio_read(struct trace_iterator *iter, struct file *filp, + char __user *ubuf, size_t cnt, loff_t *ppos) +{ + ssize_t ret; + struct header_iter *hiter = iter->private; + struct trace_seq *s = &iter->seq; + unsigned long n; + + n = count_overruns(iter); + if (n) { + /* XXX: This is later than where events were lost. */ + trace_seq_printf(s, "MARK 0.000000 Lost %lu events.\n", n); + if (!overrun_detected) + pr_warn("mmiotrace has lost events\n"); + overrun_detected = true; + goto print_out; + } + + if (!hiter) + return 0; + + mmio_print_pcidev(s, hiter->dev); + hiter->dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, hiter->dev); + + if (!hiter->dev) { + destroy_header_iter(hiter); + iter->private = NULL; + } + +print_out: + ret = trace_seq_to_user(s, ubuf, cnt); + return (ret == -EBUSY) ? 0 : ret; +} + +static enum print_line_t mmio_print_rw(struct trace_iterator *iter) +{ + struct trace_entry *entry = iter->ent; + struct trace_mmiotrace_rw *field; + struct mmiotrace_rw *rw; + struct trace_seq *s = &iter->seq; + unsigned long long t = ns2usecs(iter->ts); + unsigned long usec_rem = do_div(t, USEC_PER_SEC); + unsigned secs = (unsigned long)t; + + trace_assign_type(field, entry); + rw = &field->rw; + + switch (rw->opcode) { + case MMIO_READ: + trace_seq_printf(s, + "R %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", + rw->width, secs, usec_rem, rw->map_id, + (unsigned long long)rw->phys, + rw->value, rw->pc, 0); + break; + case MMIO_WRITE: + trace_seq_printf(s, + "W %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", + rw->width, secs, usec_rem, rw->map_id, + (unsigned long long)rw->phys, + rw->value, rw->pc, 0); + break; + case MMIO_UNKNOWN_OP: + trace_seq_printf(s, + "UNKNOWN %u.%06lu %d 0x%llx %02lx,%02lx," + "%02lx 0x%lx %d\n", + secs, usec_rem, rw->map_id, + (unsigned long long)rw->phys, + (rw->value >> 16) & 0xff, (rw->value >> 8) & 0xff, + (rw->value >> 0) & 0xff, rw->pc, 0); + break; + default: + trace_seq_puts(s, "rw what?\n"); + break; + } + + return trace_handle_return(s); +} + +static enum print_line_t mmio_print_map(struct trace_iterator *iter) +{ + struct trace_entry *entry = iter->ent; + struct trace_mmiotrace_map *field; + struct mmiotrace_map *m; + struct trace_seq *s = &iter->seq; + unsigned long long t = ns2usecs(iter->ts); + unsigned long usec_rem = do_div(t, USEC_PER_SEC); + unsigned secs = (unsigned long)t; + + trace_assign_type(field, entry); + m = &field->map; + + switch (m->opcode) { + case MMIO_PROBE: + trace_seq_printf(s, + "MAP %u.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n", + secs, usec_rem, m->map_id, + (unsigned long long)m->phys, m->virt, m->len, + 0UL, 0); + break; + case MMIO_UNPROBE: + trace_seq_printf(s, + "UNMAP %u.%06lu %d 0x%lx %d\n", + secs, usec_rem, m->map_id, 0UL, 0); + break; + default: + trace_seq_puts(s, "map what?\n"); + break; + } + + return trace_handle_return(s); +} + +static enum print_line_t mmio_print_mark(struct trace_iterator *iter) +{ + struct trace_entry *entry = iter->ent; + struct print_entry *print = (struct print_entry *)entry; + const char *msg = print->buf; + struct trace_seq *s = 
&iter->seq; + unsigned long long t = ns2usecs(iter->ts); + unsigned long usec_rem = do_div(t, USEC_PER_SEC); + unsigned secs = (unsigned long)t; + + /* The trailing newline must be in the message. */ + trace_seq_printf(s, "MARK %u.%06lu %s", secs, usec_rem, msg); + + return trace_handle_return(s); +} + +static enum print_line_t mmio_print_line(struct trace_iterator *iter) +{ + switch (iter->ent->type) { + case TRACE_MMIO_RW: + return mmio_print_rw(iter); + case TRACE_MMIO_MAP: + return mmio_print_map(iter); + case TRACE_PRINT: + return mmio_print_mark(iter); + default: + return TRACE_TYPE_HANDLED; /* ignore unknown entries */ + } +} + +static struct tracer mmio_tracer __read_mostly = +{ + .name = "mmiotrace", + .init = mmio_trace_init, + .reset = mmio_trace_reset, + .start = mmio_trace_start, + .pipe_open = mmio_pipe_open, + .close = mmio_close, + .read = mmio_read, + .print_line = mmio_print_line, + .noboot = true, +}; + +__init static int init_mmio_trace(void) +{ + return register_tracer(&mmio_tracer); +} +device_initcall(init_mmio_trace); + +static void __trace_mmiotrace_rw(struct trace_array *tr, + struct trace_array_cpu *data, + struct mmiotrace_rw *rw) +{ + struct trace_event_call *call = &event_mmiotrace_rw; + struct trace_buffer *buffer = tr->array_buffer.buffer; + struct ring_buffer_event *event; + struct trace_mmiotrace_rw *entry; + unsigned int trace_ctx; + + trace_ctx = tracing_gen_ctx_flags(0); + event = trace_buffer_lock_reserve(buffer, TRACE_MMIO_RW, + sizeof(*entry), trace_ctx); + if (!event) { + atomic_inc(&dropped_count); + return; + } + entry = ring_buffer_event_data(event); + entry->rw = *rw; + + if (!call_filter_check_discard(call, entry, buffer, event)) + trace_buffer_unlock_commit(tr, buffer, event, trace_ctx); +} + +void mmio_trace_rw(struct mmiotrace_rw *rw) +{ + struct trace_array *tr = mmio_trace_array; + struct trace_array_cpu *data = per_cpu_ptr(tr->array_buffer.data, smp_processor_id()); + __trace_mmiotrace_rw(tr, data, rw); +} + +static void __trace_mmiotrace_map(struct trace_array *tr, + struct trace_array_cpu *data, + struct mmiotrace_map *map) +{ + struct trace_event_call *call = &event_mmiotrace_map; + struct trace_buffer *buffer = tr->array_buffer.buffer; + struct ring_buffer_event *event; + struct trace_mmiotrace_map *entry; + unsigned int trace_ctx; + + trace_ctx = tracing_gen_ctx_flags(0); + event = trace_buffer_lock_reserve(buffer, TRACE_MMIO_MAP, + sizeof(*entry), trace_ctx); + if (!event) { + atomic_inc(&dropped_count); + return; + } + entry = ring_buffer_event_data(event); + entry->map = *map; + + if (!call_filter_check_discard(call, entry, buffer, event)) + trace_buffer_unlock_commit(tr, buffer, event, trace_ctx); +} + +void mmio_trace_mapping(struct mmiotrace_map *map) +{ + struct trace_array *tr = mmio_trace_array; + struct trace_array_cpu *data; + + preempt_disable(); + data = per_cpu_ptr(tr->array_buffer.data, smp_processor_id()); + __trace_mmiotrace_map(tr, data, map); + preempt_enable(); +} + +int mmio_trace_printk(const char *fmt, va_list args) +{ + return trace_vprintk(0, fmt, args); +} diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c new file mode 100644 index 0000000000..50523f953a --- /dev/null +++ b/kernel/trace/trace_nop.c @@ -0,0 +1,100 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * nop tracer + * + * Copyright (C) 2008 Steven Noonan <steven@uplinklabs.net> + * + */ + +#include <linux/module.h> +#include <linux/ftrace.h> + +#include "trace.h" + +/* Our two options */ +enum { + TRACE_NOP_OPT_ACCEPT = 0x1, + 
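+	/*
+	 * Both options are exposed through the trace_options file as
+	 * test_nop_accept and test_nop_refuse (see nop_opts[] below). A
+	 * rough usage sketch, assuming tracefs is mounted at
+	 * /sys/kernel/tracing:
+	 *
+	 *	echo nop > /sys/kernel/tracing/current_tracer
+	 *	echo test_nop_accept > /sys/kernel/tracing/trace_options
+	 *	echo test_nop_refuse > /sys/kernel/tracing/trace_options  # rejected (-EINVAL)
+	 */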
TRACE_NOP_OPT_REFUSE = 0x2 +}; + +/* Options for the tracer (see trace_options file) */ +static struct tracer_opt nop_opts[] = { + /* Option that will be accepted by set_flag callback */ + { TRACER_OPT(test_nop_accept, TRACE_NOP_OPT_ACCEPT) }, + /* Option that will be refused by set_flag callback */ + { TRACER_OPT(test_nop_refuse, TRACE_NOP_OPT_REFUSE) }, + { } /* Always set a last empty entry */ +}; + +static struct tracer_flags nop_flags = { + /* You can check your flags value here when you want. */ + .val = 0, /* By default: all flags disabled */ + .opts = nop_opts +}; + +static struct trace_array *ctx_trace; + +static void start_nop_trace(struct trace_array *tr) +{ + /* Nothing to do! */ +} + +static void stop_nop_trace(struct trace_array *tr) +{ + /* Nothing to do! */ +} + +static int nop_trace_init(struct trace_array *tr) +{ + ctx_trace = tr; + start_nop_trace(tr); + return 0; +} + +static void nop_trace_reset(struct trace_array *tr) +{ + stop_nop_trace(tr); +} + +/* It only serves as a signal handler and a callback to + * accept or refuse the setting of a flag. + * If you don't implement it, then the flag setting will be + * automatically accepted. + */ +static int nop_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) +{ + /* + * Note that you don't need to update nop_flags.val yourself. + * The tracing Api will do it automatically if you return 0 + */ + if (bit == TRACE_NOP_OPT_ACCEPT) { + printk(KERN_DEBUG "nop_test_accept flag set to %d: we accept." + " Now cat trace_options to see the result\n", + set); + return 0; + } + + if (bit == TRACE_NOP_OPT_REFUSE) { + printk(KERN_DEBUG "nop_test_refuse flag set to %d: we refuse." + " Now cat trace_options to see the result\n", + set); + return -EINVAL; + } + + return 0; +} + + +struct tracer nop_trace __read_mostly = +{ + .name = "nop", + .init = nop_trace_init, + .reset = nop_trace_reset, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_nop, +#endif + .flags = &nop_flags, + .set_flag = nop_set_flag, + .allow_instances = true, +}; + diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c new file mode 100644 index 0000000000..bd0d01d00f --- /dev/null +++ b/kernel/trace/trace_osnoise.c @@ -0,0 +1,3133 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * OS Noise Tracer: computes the OS Noise suffered by a running thread. + * Timerlat Tracer: measures the wakeup latency of a timer triggered IRQ and thread. + * + * Based on "hwlat_detector" tracer by: + * Copyright (C) 2008-2009 Jon Masters, Red Hat, Inc. <jcm@redhat.com> + * Copyright (C) 2013-2016 Steven Rostedt, Red Hat, Inc. <srostedt@redhat.com> + * With feedback from Clark Williams <williams@redhat.com> + * + * And also based on the rtsl tracer presented on: + * DE OLIVEIRA, Daniel Bristot, et al. Demystifying the real-time linux + * scheduling latency. In: 32nd Euromicro Conference on Real-Time Systems + * (ECRTS 2020). Schloss Dagstuhl-Leibniz-Zentrum fur Informatik, 2020. + * + * Copyright (C) 2021 Daniel Bristot de Oliveira, Red Hat, Inc. 
<bristot@redhat.com> + */ + +#include <linux/kthread.h> +#include <linux/tracefs.h> +#include <linux/uaccess.h> +#include <linux/cpumask.h> +#include <linux/delay.h> +#include <linux/sched/clock.h> +#include <uapi/linux/sched/types.h> +#include <linux/sched.h> +#include "trace.h" + +#ifdef CONFIG_X86_LOCAL_APIC +#include <asm/trace/irq_vectors.h> +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#endif /* CONFIG_X86_LOCAL_APIC */ + +#include <trace/events/irq.h> +#include <trace/events/sched.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/osnoise.h> + +/* + * Default values. + */ +#define BANNER "osnoise: " +#define DEFAULT_SAMPLE_PERIOD 1000000 /* 1s */ +#define DEFAULT_SAMPLE_RUNTIME 1000000 /* 1s */ + +#define DEFAULT_TIMERLAT_PERIOD 1000 /* 1ms */ +#define DEFAULT_TIMERLAT_PRIO 95 /* FIFO 95 */ + +/* + * osnoise/options entries. + */ +enum osnoise_options_index { + OSN_DEFAULTS = 0, + OSN_WORKLOAD, + OSN_PANIC_ON_STOP, + OSN_PREEMPT_DISABLE, + OSN_IRQ_DISABLE, + OSN_MAX +}; + +static const char * const osnoise_options_str[OSN_MAX] = { + "DEFAULTS", + "OSNOISE_WORKLOAD", + "PANIC_ON_STOP", + "OSNOISE_PREEMPT_DISABLE", + "OSNOISE_IRQ_DISABLE" }; + +#define OSN_DEFAULT_OPTIONS 0x2 +static unsigned long osnoise_options = OSN_DEFAULT_OPTIONS; + +/* + * trace_array of the enabled osnoise/timerlat instances. + */ +struct osnoise_instance { + struct list_head list; + struct trace_array *tr; +}; + +static struct list_head osnoise_instances; + +static bool osnoise_has_registered_instances(void) +{ + return !!list_first_or_null_rcu(&osnoise_instances, + struct osnoise_instance, + list); +} + +/* + * osnoise_instance_registered - check if a tr is already registered + */ +static int osnoise_instance_registered(struct trace_array *tr) +{ + struct osnoise_instance *inst; + int found = 0; + + rcu_read_lock(); + list_for_each_entry_rcu(inst, &osnoise_instances, list) { + if (inst->tr == tr) + found = 1; + } + rcu_read_unlock(); + + return found; +} + +/* + * osnoise_register_instance - register a new trace instance + * + * Register a trace_array *tr in the list of instances running + * osnoise/timerlat tracers. + */ +static int osnoise_register_instance(struct trace_array *tr) +{ + struct osnoise_instance *inst; + + /* + * register/unregister serialization is provided by trace's + * trace_types_lock. + */ + lockdep_assert_held(&trace_types_lock); + + inst = kmalloc(sizeof(*inst), GFP_KERNEL); + if (!inst) + return -ENOMEM; + + INIT_LIST_HEAD_RCU(&inst->list); + inst->tr = tr; + list_add_tail_rcu(&inst->list, &osnoise_instances); + + return 0; +} + +/* + * osnoise_unregister_instance - unregister a registered trace instance + * + * Remove the trace_array *tr from the list of instances running + * osnoise/timerlat tracers. + */ +static void osnoise_unregister_instance(struct trace_array *tr) +{ + struct osnoise_instance *inst; + int found = 0; + + /* + * register/unregister serialization is provided by trace's + * trace_types_lock. + */ + list_for_each_entry_rcu(inst, &osnoise_instances, list, + lockdep_is_held(&trace_types_lock)) { + if (inst->tr == tr) { + list_del_rcu(&inst->list); + found = 1; + break; + } + } + + if (!found) + return; + + kvfree_rcu_mightsleep(inst); +} + +/* + * NMI runtime info. + */ +struct osn_nmi { + u64 count; + u64 delta_start; +}; + +/* + * IRQ runtime info. + */ +struct osn_irq { + u64 count; + u64 arrival_time; + u64 delta_start; +}; + +#define IRQ_CONTEXT 0 +#define THREAD_CONTEXT 1 +#define THREAD_URET 2 +/* + * sofirq runtime info. 
+ */ +struct osn_softirq { + u64 count; + u64 arrival_time; + u64 delta_start; +}; + +/* + * thread runtime info. + */ +struct osn_thread { + u64 count; + u64 arrival_time; + u64 delta_start; +}; + +/* + * Runtime information: this structure saves the runtime information used by + * one sampling thread. + */ +struct osnoise_variables { + struct task_struct *kthread; + bool sampling; + pid_t pid; + struct osn_nmi nmi; + struct osn_irq irq; + struct osn_softirq softirq; + struct osn_thread thread; + local_t int_counter; +}; + +/* + * Per-cpu runtime information. + */ +static DEFINE_PER_CPU(struct osnoise_variables, per_cpu_osnoise_var); + +/* + * this_cpu_osn_var - Return the per-cpu osnoise_variables on its relative CPU + */ +static inline struct osnoise_variables *this_cpu_osn_var(void) +{ + return this_cpu_ptr(&per_cpu_osnoise_var); +} + +#ifdef CONFIG_TIMERLAT_TRACER +/* + * Runtime information for the timer mode. + */ +struct timerlat_variables { + struct task_struct *kthread; + struct hrtimer timer; + u64 rel_period; + u64 abs_period; + bool tracing_thread; + u64 count; + bool uthread_migrate; +}; + +static DEFINE_PER_CPU(struct timerlat_variables, per_cpu_timerlat_var); + +/* + * this_cpu_tmr_var - Return the per-cpu timerlat_variables on its relative CPU + */ +static inline struct timerlat_variables *this_cpu_tmr_var(void) +{ + return this_cpu_ptr(&per_cpu_timerlat_var); +} + +/* + * tlat_var_reset - Reset the values of the given timerlat_variables + */ +static inline void tlat_var_reset(void) +{ + struct timerlat_variables *tlat_var; + int cpu; + /* + * So far, all the values are initialized as 0, so + * zeroing the structure is perfect. + */ + for_each_cpu(cpu, cpu_online_mask) { + tlat_var = per_cpu_ptr(&per_cpu_timerlat_var, cpu); + memset(tlat_var, 0, sizeof(*tlat_var)); + } +} +#else /* CONFIG_TIMERLAT_TRACER */ +#define tlat_var_reset() do {} while (0) +#endif /* CONFIG_TIMERLAT_TRACER */ + +/* + * osn_var_reset - Reset the values of the given osnoise_variables + */ +static inline void osn_var_reset(void) +{ + struct osnoise_variables *osn_var; + int cpu; + + /* + * So far, all the values are initialized as 0, so + * zeroing the structure is perfect. + */ + for_each_cpu(cpu, cpu_online_mask) { + osn_var = per_cpu_ptr(&per_cpu_osnoise_var, cpu); + memset(osn_var, 0, sizeof(*osn_var)); + } +} + +/* + * osn_var_reset_all - Reset the value of all per-cpu osnoise_variables + */ +static inline void osn_var_reset_all(void) +{ + osn_var_reset(); + tlat_var_reset(); +} + +/* + * Tells NMIs to call back to the osnoise tracer to record timestamps. + */ +bool trace_osnoise_callback_enabled; + +/* + * osnoise sample structure definition. Used to store the statistics of a + * sample run. + */ +struct osnoise_sample { + u64 runtime; /* runtime */ + u64 noise; /* noise */ + u64 max_sample; /* max single noise sample */ + int hw_count; /* # HW (incl. hypervisor) interference */ + int nmi_count; /* # NMIs during this sample */ + int irq_count; /* # IRQs during this sample */ + int softirq_count; /* # softirqs during this sample */ + int thread_count; /* # threads during this sample */ +}; + +#ifdef CONFIG_TIMERLAT_TRACER +/* + * timerlat sample structure definition. Used to store the statistics of + * a sample run. + */ +struct timerlat_sample { + u64 timer_latency; /* timer_latency */ + unsigned int seqnum; /* unique sequence */ + int context; /* timer context */ +}; +#endif + +/* + * Protect the interface. + */ +static struct mutex interface_lock; + +/* + * Tracer data. 
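+ *
+ * Note that sample_period and sample_runtime below are kept in
+ * microseconds (run_osnoise() converts them to nanoseconds before its
+ * main loop): the workload runs for sample_runtime out of every
+ * sample_period, and osnoise_sleep() sleeps for the difference. With
+ * the defaults both are 1s, so the sampling runs back to back with no
+ * pause.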
+ */ +static struct osnoise_data { + u64 sample_period; /* total sampling period */ + u64 sample_runtime; /* active sampling portion of period */ + u64 stop_tracing; /* stop trace in the internal operation (loop/irq) */ + u64 stop_tracing_total; /* stop trace in the final operation (report/thread) */ +#ifdef CONFIG_TIMERLAT_TRACER + u64 timerlat_period; /* timerlat period */ + u64 print_stack; /* print IRQ stack if total > */ + int timerlat_tracer; /* timerlat tracer */ +#endif + bool tainted; /* infor users and developers about a problem */ +} osnoise_data = { + .sample_period = DEFAULT_SAMPLE_PERIOD, + .sample_runtime = DEFAULT_SAMPLE_RUNTIME, + .stop_tracing = 0, + .stop_tracing_total = 0, +#ifdef CONFIG_TIMERLAT_TRACER + .print_stack = 0, + .timerlat_period = DEFAULT_TIMERLAT_PERIOD, + .timerlat_tracer = 0, +#endif +}; + +#ifdef CONFIG_TIMERLAT_TRACER +static inline bool timerlat_enabled(void) +{ + return osnoise_data.timerlat_tracer; +} + +static inline int timerlat_softirq_exit(struct osnoise_variables *osn_var) +{ + struct timerlat_variables *tlat_var = this_cpu_tmr_var(); + /* + * If the timerlat is enabled, but the irq handler did + * not run yet enabling timerlat_tracer, do not trace. + */ + if (!tlat_var->tracing_thread) { + osn_var->softirq.arrival_time = 0; + osn_var->softirq.delta_start = 0; + return 0; + } + return 1; +} + +static inline int timerlat_thread_exit(struct osnoise_variables *osn_var) +{ + struct timerlat_variables *tlat_var = this_cpu_tmr_var(); + /* + * If the timerlat is enabled, but the irq handler did + * not run yet enabling timerlat_tracer, do not trace. + */ + if (!tlat_var->tracing_thread) { + osn_var->thread.delta_start = 0; + osn_var->thread.arrival_time = 0; + return 0; + } + return 1; +} +#else /* CONFIG_TIMERLAT_TRACER */ +static inline bool timerlat_enabled(void) +{ + return false; +} + +static inline int timerlat_softirq_exit(struct osnoise_variables *osn_var) +{ + return 1; +} +static inline int timerlat_thread_exit(struct osnoise_variables *osn_var) +{ + return 1; +} +#endif + +#ifdef CONFIG_PREEMPT_RT +/* + * Print the osnoise header info. 
+ */ +static void print_osnoise_headers(struct seq_file *s) +{ + if (osnoise_data.tainted) + seq_puts(s, "# osnoise is tainted!\n"); + + seq_puts(s, "# _-------=> irqs-off\n"); + seq_puts(s, "# / _------=> need-resched\n"); + seq_puts(s, "# | / _-----=> need-resched-lazy\n"); + seq_puts(s, "# || / _----=> hardirq/softirq\n"); + seq_puts(s, "# ||| / _---=> preempt-depth\n"); + seq_puts(s, "# |||| / _--=> preempt-lazy-depth\n"); + seq_puts(s, "# ||||| / _-=> migrate-disable\n"); + + seq_puts(s, "# |||||| / "); + seq_puts(s, " MAX\n"); + + seq_puts(s, "# ||||| / "); + seq_puts(s, " SINGLE Interference counters:\n"); + + seq_puts(s, "# ||||||| RUNTIME "); + seq_puts(s, " NOISE %% OF CPU NOISE +-----------------------------+\n"); + + seq_puts(s, "# TASK-PID CPU# ||||||| TIMESTAMP IN US "); + seq_puts(s, " IN US AVAILABLE IN US HW NMI IRQ SIRQ THREAD\n"); + + seq_puts(s, "# | | | ||||||| | | "); + seq_puts(s, " | | | | | | | |\n"); +} +#else /* CONFIG_PREEMPT_RT */ +static void print_osnoise_headers(struct seq_file *s) +{ + if (osnoise_data.tainted) + seq_puts(s, "# osnoise is tainted!\n"); + + seq_puts(s, "# _-----=> irqs-off\n"); + seq_puts(s, "# / _----=> need-resched\n"); + seq_puts(s, "# | / _---=> hardirq/softirq\n"); + seq_puts(s, "# || / _--=> preempt-depth\n"); + seq_puts(s, "# ||| / _-=> migrate-disable "); + seq_puts(s, " MAX\n"); + seq_puts(s, "# |||| / delay "); + seq_puts(s, " SINGLE Interference counters:\n"); + + seq_puts(s, "# ||||| RUNTIME "); + seq_puts(s, " NOISE %% OF CPU NOISE +-----------------------------+\n"); + + seq_puts(s, "# TASK-PID CPU# ||||| TIMESTAMP IN US "); + seq_puts(s, " IN US AVAILABLE IN US HW NMI IRQ SIRQ THREAD\n"); + + seq_puts(s, "# | | | ||||| | | "); + seq_puts(s, " | | | | | | | |\n"); +} +#endif /* CONFIG_PREEMPT_RT */ + +/* + * osnoise_taint - report an osnoise error. + */ +#define osnoise_taint(msg) ({ \ + struct osnoise_instance *inst; \ + struct trace_buffer *buffer; \ + \ + rcu_read_lock(); \ + list_for_each_entry_rcu(inst, &osnoise_instances, list) { \ + buffer = inst->tr->array_buffer.buffer; \ + trace_array_printk_buf(buffer, _THIS_IP_, msg); \ + } \ + rcu_read_unlock(); \ + osnoise_data.tainted = true; \ +}) + +/* + * Record an osnoise_sample into the tracer buffer. + */ +static void +__trace_osnoise_sample(struct osnoise_sample *sample, struct trace_buffer *buffer) +{ + struct trace_event_call *call = &event_osnoise; + struct ring_buffer_event *event; + struct osnoise_entry *entry; + + event = trace_buffer_lock_reserve(buffer, TRACE_OSNOISE, sizeof(*entry), + tracing_gen_ctx()); + if (!event) + return; + entry = ring_buffer_event_data(event); + entry->runtime = sample->runtime; + entry->noise = sample->noise; + entry->max_sample = sample->max_sample; + entry->hw_count = sample->hw_count; + entry->nmi_count = sample->nmi_count; + entry->irq_count = sample->irq_count; + entry->softirq_count = sample->softirq_count; + entry->thread_count = sample->thread_count; + + if (!call_filter_check_discard(call, entry, buffer, event)) + trace_buffer_unlock_commit_nostack(buffer, event); +} + +/* + * Record an osnoise_sample on all osnoise instances. + */ +static void trace_osnoise_sample(struct osnoise_sample *sample) +{ + struct osnoise_instance *inst; + struct trace_buffer *buffer; + + rcu_read_lock(); + list_for_each_entry_rcu(inst, &osnoise_instances, list) { + buffer = inst->tr->array_buffer.buffer; + __trace_osnoise_sample(sample, buffer); + } + rcu_read_unlock(); +} + +#ifdef CONFIG_TIMERLAT_TRACER +/* + * Print the timerlat header info. 
+ */ +#ifdef CONFIG_PREEMPT_RT +static void print_timerlat_headers(struct seq_file *s) +{ + seq_puts(s, "# _-------=> irqs-off\n"); + seq_puts(s, "# / _------=> need-resched\n"); + seq_puts(s, "# | / _-----=> need-resched-lazy\n"); + seq_puts(s, "# || / _----=> hardirq/softirq\n"); + seq_puts(s, "# ||| / _---=> preempt-depth\n"); + seq_puts(s, "# |||| / _--=> preempt-lazy-depth\n"); + seq_puts(s, "# ||||| / _-=> migrate-disable\n"); + seq_puts(s, "# |||||| /\n"); + seq_puts(s, "# ||||||| ACTIVATION\n"); + seq_puts(s, "# TASK-PID CPU# ||||||| TIMESTAMP ID "); + seq_puts(s, " CONTEXT LATENCY\n"); + seq_puts(s, "# | | | ||||||| | | "); + seq_puts(s, " | |\n"); +} +#else /* CONFIG_PREEMPT_RT */ +static void print_timerlat_headers(struct seq_file *s) +{ + seq_puts(s, "# _-----=> irqs-off\n"); + seq_puts(s, "# / _----=> need-resched\n"); + seq_puts(s, "# | / _---=> hardirq/softirq\n"); + seq_puts(s, "# || / _--=> preempt-depth\n"); + seq_puts(s, "# ||| / _-=> migrate-disable\n"); + seq_puts(s, "# |||| / delay\n"); + seq_puts(s, "# ||||| ACTIVATION\n"); + seq_puts(s, "# TASK-PID CPU# ||||| TIMESTAMP ID "); + seq_puts(s, " CONTEXT LATENCY\n"); + seq_puts(s, "# | | | ||||| | | "); + seq_puts(s, " | |\n"); +} +#endif /* CONFIG_PREEMPT_RT */ + +static void +__trace_timerlat_sample(struct timerlat_sample *sample, struct trace_buffer *buffer) +{ + struct trace_event_call *call = &event_osnoise; + struct ring_buffer_event *event; + struct timerlat_entry *entry; + + event = trace_buffer_lock_reserve(buffer, TRACE_TIMERLAT, sizeof(*entry), + tracing_gen_ctx()); + if (!event) + return; + entry = ring_buffer_event_data(event); + entry->seqnum = sample->seqnum; + entry->context = sample->context; + entry->timer_latency = sample->timer_latency; + + if (!call_filter_check_discard(call, entry, buffer, event)) + trace_buffer_unlock_commit_nostack(buffer, event); +} + +/* + * Record an timerlat_sample into the tracer buffer. + */ +static void trace_timerlat_sample(struct timerlat_sample *sample) +{ + struct osnoise_instance *inst; + struct trace_buffer *buffer; + + rcu_read_lock(); + list_for_each_entry_rcu(inst, &osnoise_instances, list) { + buffer = inst->tr->array_buffer.buffer; + __trace_timerlat_sample(sample, buffer); + } + rcu_read_unlock(); +} + +#ifdef CONFIG_STACKTRACE + +#define MAX_CALLS 256 + +/* + * Stack trace will take place only at IRQ level, so, no need + * to control nesting here. + */ +struct trace_stack { + int stack_size; + int nr_entries; + unsigned long calls[MAX_CALLS]; +}; + +static DEFINE_PER_CPU(struct trace_stack, trace_stack); + +/* + * timerlat_save_stack - save a stack trace without printing + * + * Save the current stack trace without printing. The + * stack will be printed later, after the end of the measurement. 
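+ *
+ * The usual pairing, as done by the timerlat IRQ and thread code further
+ * below, is roughly:
+ *
+ *	timerlat_save_stack(0);
+ *	... the measurement completes ...
+ *	timerlat_dump_stack(time_to_us(diff));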
+ */ +static void timerlat_save_stack(int skip) +{ + unsigned int size, nr_entries; + struct trace_stack *fstack; + + fstack = this_cpu_ptr(&trace_stack); + + size = ARRAY_SIZE(fstack->calls); + + nr_entries = stack_trace_save(fstack->calls, size, skip); + + fstack->stack_size = nr_entries * sizeof(unsigned long); + fstack->nr_entries = nr_entries; + + return; + +} + +static void +__timerlat_dump_stack(struct trace_buffer *buffer, struct trace_stack *fstack, unsigned int size) +{ + struct trace_event_call *call = &event_osnoise; + struct ring_buffer_event *event; + struct stack_entry *entry; + + event = trace_buffer_lock_reserve(buffer, TRACE_STACK, sizeof(*entry) + size, + tracing_gen_ctx()); + if (!event) + return; + + entry = ring_buffer_event_data(event); + + memcpy(&entry->caller, fstack->calls, size); + entry->size = fstack->nr_entries; + + if (!call_filter_check_discard(call, entry, buffer, event)) + trace_buffer_unlock_commit_nostack(buffer, event); +} + +/* + * timerlat_dump_stack - dump a stack trace previously saved + */ +static void timerlat_dump_stack(u64 latency) +{ + struct osnoise_instance *inst; + struct trace_buffer *buffer; + struct trace_stack *fstack; + unsigned int size; + + /* + * trace only if latency > print_stack config, if enabled. + */ + if (!osnoise_data.print_stack || osnoise_data.print_stack > latency) + return; + + preempt_disable_notrace(); + fstack = this_cpu_ptr(&trace_stack); + size = fstack->stack_size; + + rcu_read_lock(); + list_for_each_entry_rcu(inst, &osnoise_instances, list) { + buffer = inst->tr->array_buffer.buffer; + __timerlat_dump_stack(buffer, fstack, size); + + } + rcu_read_unlock(); + preempt_enable_notrace(); +} +#else /* CONFIG_STACKTRACE */ +#define timerlat_dump_stack(u64 latency) do {} while (0) +#define timerlat_save_stack(a) do {} while (0) +#endif /* CONFIG_STACKTRACE */ +#endif /* CONFIG_TIMERLAT_TRACER */ + +/* + * Macros to encapsulate the time capturing infrastructure. + */ +#define time_get() trace_clock_local() +#define time_to_us(x) div_u64(x, 1000) +#define time_sub(a, b) ((a) - (b)) + +/* + * cond_move_irq_delta_start - Forward the delta_start of a running IRQ + * + * If an IRQ is preempted by an NMI, its delta_start is pushed forward + * to discount the NMI interference. + * + * See get_int_safe_duration(). + */ +static inline void +cond_move_irq_delta_start(struct osnoise_variables *osn_var, u64 duration) +{ + if (osn_var->irq.delta_start) + osn_var->irq.delta_start += duration; +} + +#ifndef CONFIG_PREEMPT_RT +/* + * cond_move_softirq_delta_start - Forward the delta_start of a running softirq. + * + * If a softirq is preempted by an IRQ or NMI, its delta_start is pushed + * forward to discount the interference. + * + * See get_int_safe_duration(). + */ +static inline void +cond_move_softirq_delta_start(struct osnoise_variables *osn_var, u64 duration) +{ + if (osn_var->softirq.delta_start) + osn_var->softirq.delta_start += duration; +} +#else /* CONFIG_PREEMPT_RT */ +#define cond_move_softirq_delta_start(osn_var, duration) do {} while (0) +#endif + +/* + * cond_move_thread_delta_start - Forward the delta_start of a running thread + * + * If a noisy thread is preempted by an softirq, IRQ or NMI, its delta_start + * is pushed forward to discount the interference. + * + * See get_int_safe_duration(). 
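+ *
+ * E.g., if the thread window started at time t and a 10us IRQ preempts
+ * it, delta_start is moved to t + 10us, so the later
+ * get_int_safe_duration() computation (now - delta_start) does not
+ * charge the IRQ time to the thread.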
+ */ +static inline void +cond_move_thread_delta_start(struct osnoise_variables *osn_var, u64 duration) +{ + if (osn_var->thread.delta_start) + osn_var->thread.delta_start += duration; +} + +/* + * get_int_safe_duration - Get the duration of a window + * + * The irq, softirq and thread varaibles need to have its duration without + * the interference from higher priority interrupts. Instead of keeping a + * variable to discount the interrupt interference from these variables, the + * starting time of these variables are pushed forward with the interrupt's + * duration. In this way, a single variable is used to: + * + * - Know if a given window is being measured. + * - Account its duration. + * - Discount the interference. + * + * To avoid getting inconsistent values, e.g.,: + * + * now = time_get() + * ---> interrupt! + * delta_start -= int duration; + * <--- + * duration = now - delta_start; + * + * result: negative duration if the variable duration before the + * interrupt was smaller than the interrupt execution. + * + * A counter of interrupts is used. If the counter increased, try + * to capture an interference safe duration. + */ +static inline s64 +get_int_safe_duration(struct osnoise_variables *osn_var, u64 *delta_start) +{ + u64 int_counter, now; + s64 duration; + + do { + int_counter = local_read(&osn_var->int_counter); + /* synchronize with interrupts */ + barrier(); + + now = time_get(); + duration = (now - *delta_start); + + /* synchronize with interrupts */ + barrier(); + } while (int_counter != local_read(&osn_var->int_counter)); + + /* + * This is an evidence of race conditions that cause + * a value to be "discounted" too much. + */ + if (duration < 0) + osnoise_taint("Negative duration!\n"); + + *delta_start = 0; + + return duration; +} + +/* + * + * set_int_safe_time - Save the current time on *time, aware of interference + * + * Get the time, taking into consideration a possible interference from + * higher priority interrupts. + * + * See get_int_safe_duration() for an explanation. + */ +static u64 +set_int_safe_time(struct osnoise_variables *osn_var, u64 *time) +{ + u64 int_counter; + + do { + int_counter = local_read(&osn_var->int_counter); + /* synchronize with interrupts */ + barrier(); + + *time = time_get(); + + /* synchronize with interrupts */ + barrier(); + } while (int_counter != local_read(&osn_var->int_counter)); + + return int_counter; +} + +#ifdef CONFIG_TIMERLAT_TRACER +/* + * copy_int_safe_time - Copy *src into *desc aware of interference + */ +static u64 +copy_int_safe_time(struct osnoise_variables *osn_var, u64 *dst, u64 *src) +{ + u64 int_counter; + + do { + int_counter = local_read(&osn_var->int_counter); + /* synchronize with interrupts */ + barrier(); + + *dst = *src; + + /* synchronize with interrupts */ + barrier(); + } while (int_counter != local_read(&osn_var->int_counter)); + + return int_counter; +} +#endif /* CONFIG_TIMERLAT_TRACER */ + +/* + * trace_osnoise_callback - NMI entry/exit callback + * + * This function is called at the entry and exit NMI code. The bool enter + * distinguishes between either case. This function is used to note a NMI + * occurrence, compute the noise caused by the NMI, and to remove the noise + * it is potentially causing on other interference variables. 
+ */ +void trace_osnoise_callback(bool enter) +{ + struct osnoise_variables *osn_var = this_cpu_osn_var(); + u64 duration; + + if (!osn_var->sampling) + return; + + /* + * Currently trace_clock_local() calls sched_clock() and the + * generic version is not NMI safe. + */ + if (!IS_ENABLED(CONFIG_GENERIC_SCHED_CLOCK)) { + if (enter) { + osn_var->nmi.delta_start = time_get(); + local_inc(&osn_var->int_counter); + } else { + duration = time_get() - osn_var->nmi.delta_start; + + trace_nmi_noise(osn_var->nmi.delta_start, duration); + + cond_move_irq_delta_start(osn_var, duration); + cond_move_softirq_delta_start(osn_var, duration); + cond_move_thread_delta_start(osn_var, duration); + } + } + + if (enter) + osn_var->nmi.count++; +} + +/* + * osnoise_trace_irq_entry - Note the starting of an IRQ + * + * Save the starting time of an IRQ. As IRQs are non-preemptive to other IRQs, + * it is safe to use a single variable (ons_var->irq) to save the statistics. + * The arrival_time is used to report... the arrival time. The delta_start + * is used to compute the duration at the IRQ exit handler. See + * cond_move_irq_delta_start(). + */ +void osnoise_trace_irq_entry(int id) +{ + struct osnoise_variables *osn_var = this_cpu_osn_var(); + + if (!osn_var->sampling) + return; + /* + * This value will be used in the report, but not to compute + * the execution time, so it is safe to get it unsafe. + */ + osn_var->irq.arrival_time = time_get(); + set_int_safe_time(osn_var, &osn_var->irq.delta_start); + osn_var->irq.count++; + + local_inc(&osn_var->int_counter); +} + +/* + * osnoise_irq_exit - Note the end of an IRQ, sava data and trace + * + * Computes the duration of the IRQ noise, and trace it. Also discounts the + * interference from other sources of noise could be currently being accounted. + */ +void osnoise_trace_irq_exit(int id, const char *desc) +{ + struct osnoise_variables *osn_var = this_cpu_osn_var(); + s64 duration; + + if (!osn_var->sampling) + return; + + duration = get_int_safe_duration(osn_var, &osn_var->irq.delta_start); + trace_irq_noise(id, desc, osn_var->irq.arrival_time, duration); + osn_var->irq.arrival_time = 0; + cond_move_softirq_delta_start(osn_var, duration); + cond_move_thread_delta_start(osn_var, duration); +} + +/* + * trace_irqentry_callback - Callback to the irq:irq_entry traceevent + * + * Used to note the starting of an IRQ occurece. + */ +static void trace_irqentry_callback(void *data, int irq, + struct irqaction *action) +{ + osnoise_trace_irq_entry(irq); +} + +/* + * trace_irqexit_callback - Callback to the irq:irq_exit traceevent + * + * Used to note the end of an IRQ occurece. + */ +static void trace_irqexit_callback(void *data, int irq, + struct irqaction *action, int ret) +{ + osnoise_trace_irq_exit(irq, action->name); +} + +/* + * arch specific register function. + */ +int __weak osnoise_arch_register(void) +{ + return 0; +} + +/* + * arch specific unregister function. + */ +void __weak osnoise_arch_unregister(void) +{ + return; +} + +/* + * hook_irq_events - Hook IRQ handling events + * + * This function hooks the IRQ related callbacks to the respective trace + * events. 
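+ *
+ * The callbacks attach to the irq:irq_handler_entry and
+ * irq:irq_handler_exit tracepoints, plus any arch specific vectors
+ * registered through osnoise_arch_register() (e.g. the x86 APIC trace
+ * points included above under CONFIG_X86_LOCAL_APIC).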
+ */ +static int hook_irq_events(void) +{ + int ret; + + ret = register_trace_irq_handler_entry(trace_irqentry_callback, NULL); + if (ret) + goto out_err; + + ret = register_trace_irq_handler_exit(trace_irqexit_callback, NULL); + if (ret) + goto out_unregister_entry; + + ret = osnoise_arch_register(); + if (ret) + goto out_irq_exit; + + return 0; + +out_irq_exit: + unregister_trace_irq_handler_exit(trace_irqexit_callback, NULL); +out_unregister_entry: + unregister_trace_irq_handler_entry(trace_irqentry_callback, NULL); +out_err: + return -EINVAL; +} + +/* + * unhook_irq_events - Unhook IRQ handling events + * + * This function unhooks the IRQ related callbacks to the respective trace + * events. + */ +static void unhook_irq_events(void) +{ + osnoise_arch_unregister(); + unregister_trace_irq_handler_exit(trace_irqexit_callback, NULL); + unregister_trace_irq_handler_entry(trace_irqentry_callback, NULL); +} + +#ifndef CONFIG_PREEMPT_RT +/* + * trace_softirq_entry_callback - Note the starting of a softirq + * + * Save the starting time of a softirq. As softirqs are non-preemptive to + * other softirqs, it is safe to use a single variable (ons_var->softirq) + * to save the statistics. The arrival_time is used to report... the + * arrival time. The delta_start is used to compute the duration at the + * softirq exit handler. See cond_move_softirq_delta_start(). + */ +static void trace_softirq_entry_callback(void *data, unsigned int vec_nr) +{ + struct osnoise_variables *osn_var = this_cpu_osn_var(); + + if (!osn_var->sampling) + return; + /* + * This value will be used in the report, but not to compute + * the execution time, so it is safe to get it unsafe. + */ + osn_var->softirq.arrival_time = time_get(); + set_int_safe_time(osn_var, &osn_var->softirq.delta_start); + osn_var->softirq.count++; + + local_inc(&osn_var->int_counter); +} + +/* + * trace_softirq_exit_callback - Note the end of an softirq + * + * Computes the duration of the softirq noise, and trace it. Also discounts the + * interference from other sources of noise could be currently being accounted. + */ +static void trace_softirq_exit_callback(void *data, unsigned int vec_nr) +{ + struct osnoise_variables *osn_var = this_cpu_osn_var(); + s64 duration; + + if (!osn_var->sampling) + return; + + if (unlikely(timerlat_enabled())) + if (!timerlat_softirq_exit(osn_var)) + return; + + duration = get_int_safe_duration(osn_var, &osn_var->softirq.delta_start); + trace_softirq_noise(vec_nr, osn_var->softirq.arrival_time, duration); + cond_move_thread_delta_start(osn_var, duration); + osn_var->softirq.arrival_time = 0; +} + +/* + * hook_softirq_events - Hook softirq handling events + * + * This function hooks the softirq related callbacks to the respective trace + * events. + */ +static int hook_softirq_events(void) +{ + int ret; + + ret = register_trace_softirq_entry(trace_softirq_entry_callback, NULL); + if (ret) + goto out_err; + + ret = register_trace_softirq_exit(trace_softirq_exit_callback, NULL); + if (ret) + goto out_unreg_entry; + + return 0; + +out_unreg_entry: + unregister_trace_softirq_entry(trace_softirq_entry_callback, NULL); +out_err: + return -EINVAL; +} + +/* + * unhook_softirq_events - Unhook softirq handling events + * + * This function hooks the softirq related callbacks to the respective trace + * events. 
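+ * (More precisely, it unregisters the callbacks installed by
+ * hook_softirq_events().)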
+ */ +static void unhook_softirq_events(void) +{ + unregister_trace_softirq_entry(trace_softirq_entry_callback, NULL); + unregister_trace_softirq_exit(trace_softirq_exit_callback, NULL); +} +#else /* CONFIG_PREEMPT_RT */ +/* + * softirq are threads on the PREEMPT_RT mode. + */ +static int hook_softirq_events(void) +{ + return 0; +} +static void unhook_softirq_events(void) +{ +} +#endif + +/* + * thread_entry - Record the starting of a thread noise window + * + * It saves the context switch time for a noisy thread, and increments + * the interference counters. + */ +static void +thread_entry(struct osnoise_variables *osn_var, struct task_struct *t) +{ + if (!osn_var->sampling) + return; + /* + * The arrival time will be used in the report, but not to compute + * the execution time, so it is safe to get it unsafe. + */ + osn_var->thread.arrival_time = time_get(); + + set_int_safe_time(osn_var, &osn_var->thread.delta_start); + + osn_var->thread.count++; + local_inc(&osn_var->int_counter); +} + +/* + * thread_exit - Report the end of a thread noise window + * + * It computes the total noise from a thread, tracing if needed. + */ +static void +thread_exit(struct osnoise_variables *osn_var, struct task_struct *t) +{ + s64 duration; + + if (!osn_var->sampling) + return; + + if (unlikely(timerlat_enabled())) + if (!timerlat_thread_exit(osn_var)) + return; + + duration = get_int_safe_duration(osn_var, &osn_var->thread.delta_start); + + trace_thread_noise(t, osn_var->thread.arrival_time, duration); + + osn_var->thread.arrival_time = 0; +} + +#ifdef CONFIG_TIMERLAT_TRACER +/* + * osnoise_stop_exception - Stop tracing and the tracer. + */ +static __always_inline void osnoise_stop_exception(char *msg, int cpu) +{ + struct osnoise_instance *inst; + struct trace_array *tr; + + rcu_read_lock(); + list_for_each_entry_rcu(inst, &osnoise_instances, list) { + tr = inst->tr; + trace_array_printk_buf(tr->array_buffer.buffer, _THIS_IP_, + "stop tracing hit on cpu %d due to exception: %s\n", + smp_processor_id(), + msg); + + if (test_bit(OSN_PANIC_ON_STOP, &osnoise_options)) + panic("tracer hit on cpu %d due to exception: %s\n", + smp_processor_id(), + msg); + + tracer_tracing_off(tr); + } + rcu_read_unlock(); +} + +/* + * trace_sched_migrate_callback - sched:sched_migrate_task trace event handler + * + * his function is hooked to the sched:sched_migrate_task trace event, and monitors + * timerlat user-space thread migration. + */ +static void trace_sched_migrate_callback(void *data, struct task_struct *p, int dest_cpu) +{ + struct osnoise_variables *osn_var; + long cpu = task_cpu(p); + + osn_var = per_cpu_ptr(&per_cpu_osnoise_var, cpu); + if (osn_var->pid == p->pid && dest_cpu != cpu) { + per_cpu_ptr(&per_cpu_timerlat_var, cpu)->uthread_migrate = 1; + osnoise_taint("timerlat user-thread migrated\n"); + osnoise_stop_exception("timerlat user-thread migrated", cpu); + } +} + +static int register_migration_monitor(void) +{ + int ret = 0; + + /* + * Timerlat thread migration check is only required when running timerlat in user-space. + * Thus, enable callback only if timerlat is set with no workload. 
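+	 * (The no-workload mode is the one selected by the
+	 * NO_OSNOISE_WORKLOAD option in osnoise/options; the timerlat
+	 * measurement is then driven by user-space threads waiting on the
+	 * timerlat_fd instead of by kernel threads.)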
+ */ + if (timerlat_enabled() && !test_bit(OSN_WORKLOAD, &osnoise_options)) + ret = register_trace_sched_migrate_task(trace_sched_migrate_callback, NULL); + + return ret; +} + +static void unregister_migration_monitor(void) +{ + if (timerlat_enabled() && !test_bit(OSN_WORKLOAD, &osnoise_options)) + unregister_trace_sched_migrate_task(trace_sched_migrate_callback, NULL); +} +#else +static int register_migration_monitor(void) +{ + return 0; +} +static void unregister_migration_monitor(void) {} +#endif +/* + * trace_sched_switch - sched:sched_switch trace event handler + * + * This function is hooked to the sched:sched_switch trace event, and it is + * used to record the beginning and to report the end of a thread noise window. + */ +static void +trace_sched_switch_callback(void *data, bool preempt, + struct task_struct *p, + struct task_struct *n, + unsigned int prev_state) +{ + struct osnoise_variables *osn_var = this_cpu_osn_var(); + int workload = test_bit(OSN_WORKLOAD, &osnoise_options); + + if ((p->pid != osn_var->pid) || !workload) + thread_exit(osn_var, p); + + if ((n->pid != osn_var->pid) || !workload) + thread_entry(osn_var, n); +} + +/* + * hook_thread_events - Hook the instrumentation for thread noise + * + * Hook the osnoise tracer callbacks to handle the noise from other + * threads on the necessary kernel events. + */ +static int hook_thread_events(void) +{ + int ret; + + ret = register_trace_sched_switch(trace_sched_switch_callback, NULL); + if (ret) + return -EINVAL; + + ret = register_migration_monitor(); + if (ret) + goto out_unreg; + + return 0; + +out_unreg: + unregister_trace_sched_switch(trace_sched_switch_callback, NULL); + return -EINVAL; +} + +/* + * unhook_thread_events - unhook the instrumentation for thread noise + * + * Unook the osnoise tracer callbacks to handle the noise from other + * threads on the necessary kernel events. + */ +static void unhook_thread_events(void) +{ + unregister_trace_sched_switch(trace_sched_switch_callback, NULL); + unregister_migration_monitor(); +} + +/* + * save_osn_sample_stats - Save the osnoise_sample statistics + * + * Save the osnoise_sample statistics before the sampling phase. These + * values will be used later to compute the diff betwneen the statistics + * before and after the osnoise sampling. + */ +static void +save_osn_sample_stats(struct osnoise_variables *osn_var, struct osnoise_sample *s) +{ + s->nmi_count = osn_var->nmi.count; + s->irq_count = osn_var->irq.count; + s->softirq_count = osn_var->softirq.count; + s->thread_count = osn_var->thread.count; +} + +/* + * diff_osn_sample_stats - Compute the osnoise_sample statistics + * + * After a sample period, compute the difference on the osnoise_sample + * statistics. The struct osnoise_sample *s contains the statistics saved via + * save_osn_sample_stats() before the osnoise sampling. + */ +static void +diff_osn_sample_stats(struct osnoise_variables *osn_var, struct osnoise_sample *s) +{ + s->nmi_count = osn_var->nmi.count - s->nmi_count; + s->irq_count = osn_var->irq.count - s->irq_count; + s->softirq_count = osn_var->softirq.count - s->softirq_count; + s->thread_count = osn_var->thread.count - s->thread_count; +} + +/* + * osnoise_stop_tracing - Stop tracing and the tracer. 
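+ *
+ * Called by run_osnoise() and by the timerlat handlers when a single
+ * noise/latency sample exceeds osnoise_data.stop_tracing, or when the
+ * total noise of a period exceeds osnoise_data.stop_tracing_total.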
+ */ +static __always_inline void osnoise_stop_tracing(void) +{ + struct osnoise_instance *inst; + struct trace_array *tr; + + rcu_read_lock(); + list_for_each_entry_rcu(inst, &osnoise_instances, list) { + tr = inst->tr; + trace_array_printk_buf(tr->array_buffer.buffer, _THIS_IP_, + "stop tracing hit on cpu %d\n", smp_processor_id()); + + if (test_bit(OSN_PANIC_ON_STOP, &osnoise_options)) + panic("tracer hit stop condition on CPU %d\n", smp_processor_id()); + + tracer_tracing_off(tr); + } + rcu_read_unlock(); +} + +/* + * osnoise_has_tracing_on - Check if there is at least one instance on + */ +static __always_inline int osnoise_has_tracing_on(void) +{ + struct osnoise_instance *inst; + int trace_is_on = 0; + + rcu_read_lock(); + list_for_each_entry_rcu(inst, &osnoise_instances, list) + trace_is_on += tracer_tracing_is_on(inst->tr); + rcu_read_unlock(); + + return trace_is_on; +} + +/* + * notify_new_max_latency - Notify a new max latency via fsnotify interface. + */ +static void notify_new_max_latency(u64 latency) +{ + struct osnoise_instance *inst; + struct trace_array *tr; + + rcu_read_lock(); + list_for_each_entry_rcu(inst, &osnoise_instances, list) { + tr = inst->tr; + if (tracer_tracing_is_on(tr) && tr->max_latency < latency) { + tr->max_latency = latency; + latency_fsnotify(tr); + } + } + rcu_read_unlock(); +} + +/* + * run_osnoise - Sample the time and look for osnoise + * + * Used to capture the time, looking for potential osnoise latency repeatedly. + * Different from hwlat_detector, it is called with preemption and interrupts + * enabled. This allows irqs, softirqs and threads to run, interfering on the + * osnoise sampling thread, as they would do with a regular thread. + */ +static int run_osnoise(void) +{ + bool disable_irq = test_bit(OSN_IRQ_DISABLE, &osnoise_options); + struct osnoise_variables *osn_var = this_cpu_osn_var(); + u64 start, sample, last_sample; + u64 last_int_count, int_count; + s64 noise = 0, max_noise = 0; + s64 total, last_total = 0; + struct osnoise_sample s; + bool disable_preemption; + unsigned int threshold; + u64 runtime, stop_in; + u64 sum_noise = 0; + int hw_count = 0; + int ret = -1; + + /* + * Disabling preemption is only required if IRQs are enabled, + * and the options is set on. + */ + disable_preemption = !disable_irq && test_bit(OSN_PREEMPT_DISABLE, &osnoise_options); + + /* + * Considers the current thread as the workload. + */ + osn_var->pid = current->pid; + + /* + * Save the current stats for the diff + */ + save_osn_sample_stats(osn_var, &s); + + /* + * if threshold is 0, use the default value of 5 us. + */ + threshold = tracing_thresh ? : 5000; + + /* + * Apply PREEMPT and IRQ disabled options. + */ + if (disable_irq) + local_irq_disable(); + + if (disable_preemption) + preempt_disable(); + + /* + * Make sure NMIs see sampling first + */ + osn_var->sampling = true; + barrier(); + + /* + * Transform the *_us config to nanoseconds to avoid the + * division on the main loop. + */ + runtime = osnoise_data.sample_runtime * NSEC_PER_USEC; + stop_in = osnoise_data.stop_tracing * NSEC_PER_USEC; + + /* + * Start timestemp + */ + start = time_get(); + + /* + * "previous" loop. + */ + last_int_count = set_int_safe_time(osn_var, &last_sample); + + do { + /* + * Get sample! + */ + int_count = set_int_safe_time(osn_var, &sample); + + noise = time_sub(sample, last_sample); + + /* + * This shouldn't happen. + */ + if (noise < 0) { + osnoise_taint("negative noise!"); + goto out; + } + + /* + * Sample runtime. 
+ */ + total = time_sub(sample, start); + + /* + * Check for possible overflows. + */ + if (total < last_total) { + osnoise_taint("total overflow!"); + break; + } + + last_total = total; + + if (noise >= threshold) { + int interference = int_count - last_int_count; + + if (noise > max_noise) + max_noise = noise; + + if (!interference) + hw_count++; + + sum_noise += noise; + + trace_sample_threshold(last_sample, noise, interference); + + if (osnoise_data.stop_tracing) + if (noise > stop_in) + osnoise_stop_tracing(); + } + + /* + * In some cases, notably when running on a nohz_full CPU with + * a stopped tick PREEMPT_RCU has no way to account for QSs. + * This will eventually cause unwarranted noise as PREEMPT_RCU + * will force preemption as the means of ending the current + * grace period. We avoid this problem by calling + * rcu_momentary_dyntick_idle(), which performs a zero duration + * EQS allowing PREEMPT_RCU to end the current grace period. + * This call shouldn't be wrapped inside an RCU critical + * section. + * + * Note that in non PREEMPT_RCU kernels QSs are handled through + * cond_resched() + */ + if (IS_ENABLED(CONFIG_PREEMPT_RCU)) { + if (!disable_irq) + local_irq_disable(); + + rcu_momentary_dyntick_idle(); + + if (!disable_irq) + local_irq_enable(); + } + + /* + * For the non-preemptive kernel config: let threads runs, if + * they so wish, unless set not do to so. + */ + if (!disable_irq && !disable_preemption) + cond_resched(); + + last_sample = sample; + last_int_count = int_count; + + } while (total < runtime && !kthread_should_stop()); + + /* + * Finish the above in the view for interrupts. + */ + barrier(); + + osn_var->sampling = false; + + /* + * Make sure sampling data is no longer updated. + */ + barrier(); + + /* + * Return to the preemptive state. + */ + if (disable_preemption) + preempt_enable(); + + if (disable_irq) + local_irq_enable(); + + /* + * Save noise info. + */ + s.noise = time_to_us(sum_noise); + s.runtime = time_to_us(total); + s.max_sample = time_to_us(max_noise); + s.hw_count = hw_count; + + /* Save interference stats info */ + diff_osn_sample_stats(osn_var, &s); + + trace_osnoise_sample(&s); + + notify_new_max_latency(max_noise); + + if (osnoise_data.stop_tracing_total) + if (s.noise > osnoise_data.stop_tracing_total) + osnoise_stop_tracing(); + + return 0; +out: + return ret; +} + +static struct cpumask osnoise_cpumask; +static struct cpumask save_cpumask; + +/* + * osnoise_sleep - sleep until the next period + */ +static void osnoise_sleep(bool skip_period) +{ + u64 interval; + ktime_t wake_time; + + mutex_lock(&interface_lock); + if (skip_period) + interval = osnoise_data.sample_period; + else + interval = osnoise_data.sample_period - osnoise_data.sample_runtime; + mutex_unlock(&interface_lock); + + /* + * differently from hwlat_detector, the osnoise tracer can run + * without a pause because preemption is on. + */ + if (!interval) { + /* Let synchronize_rcu_tasks() make progress */ + cond_resched_tasks_rcu_qs(); + return; + } + + wake_time = ktime_add_us(ktime_get(), interval); + __set_current_state(TASK_INTERRUPTIBLE); + + while (schedule_hrtimeout(&wake_time, HRTIMER_MODE_ABS)) { + if (kthread_should_stop()) + break; + } +} + +/* + * osnoise_migration_pending - checks if the task needs to migrate + * + * osnoise/timerlat threads are per-cpu. If there is a pending request to + * migrate the thread away from the current CPU, something bad has happened. + * Play the good citizen and leave. 
+ * + * Returns 0 if it is safe to continue, 1 otherwise. + */ +static inline int osnoise_migration_pending(void) +{ + if (!current->migration_pending) + return 0; + + /* + * If migration is pending, there is a task waiting for the + * tracer to enable migration. The tracer does not allow migration, + * thus: taint and leave to unblock the blocked thread. + */ + osnoise_taint("migration requested to osnoise threads, leaving."); + + /* + * Unset this thread from the threads managed by the interface. + * The tracers are responsible for cleaning their env before + * exiting. + */ + mutex_lock(&interface_lock); + this_cpu_osn_var()->kthread = NULL; + mutex_unlock(&interface_lock); + + return 1; +} + +/* + * osnoise_main - The osnoise detection kernel thread + * + * Calls run_osnoise() function to measure the osnoise for the configured runtime, + * every period. + */ +static int osnoise_main(void *data) +{ + unsigned long flags; + + /* + * This thread was created pinned to the CPU using PF_NO_SETAFFINITY. + * The problem is that cgroup does not allow PF_NO_SETAFFINITY thread. + * + * To work around this limitation, disable migration and remove the + * flag. + */ + migrate_disable(); + raw_spin_lock_irqsave(¤t->pi_lock, flags); + current->flags &= ~(PF_NO_SETAFFINITY); + raw_spin_unlock_irqrestore(¤t->pi_lock, flags); + + while (!kthread_should_stop()) { + if (osnoise_migration_pending()) + break; + + /* skip a period if tracing is off on all instances */ + if (!osnoise_has_tracing_on()) { + osnoise_sleep(true); + continue; + } + + run_osnoise(); + osnoise_sleep(false); + } + + migrate_enable(); + return 0; +} + +#ifdef CONFIG_TIMERLAT_TRACER +/* + * timerlat_irq - hrtimer handler for timerlat. + */ +static enum hrtimer_restart timerlat_irq(struct hrtimer *timer) +{ + struct osnoise_variables *osn_var = this_cpu_osn_var(); + struct timerlat_variables *tlat; + struct timerlat_sample s; + u64 now; + u64 diff; + + /* + * I am not sure if the timer was armed for this CPU. So, get + * the timerlat struct from the timer itself, not from this + * CPU. + */ + tlat = container_of(timer, struct timerlat_variables, timer); + + now = ktime_to_ns(hrtimer_cb_get_time(&tlat->timer)); + + /* + * Enable the osnoise: events for thread an softirq. + */ + tlat->tracing_thread = true; + + osn_var->thread.arrival_time = time_get(); + + /* + * A hardirq is running: the timer IRQ. It is for sure preempting + * a thread, and potentially preempting a softirq. + * + * At this point, it is not interesting to know the duration of the + * preempted thread (and maybe softirq), but how much time they will + * delay the beginning of the execution of the timer thread. + * + * To get the correct (net) delay added by the softirq, its delta_start + * is set as the IRQ one. In this way, at the return of the IRQ, the delta + * start of the sofitrq will be zeroed, accounting then only the time + * after that. + * + * The thread follows the same principle. However, if a softirq is + * running, the thread needs to receive the softirq delta_start. The + * reason being is that the softirq will be the last to be unfolded, + * resseting the thread delay to zero. + * + * The PREEMPT_RT is a special case, though. As softirqs run as threads + * on RT, moving the thread is enough. 
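+ *
+ * In short, on !PREEMPT_RT with a softirq currently running:
+ *
+ *	thread.delta_start  <- softirq.delta_start
+ *	softirq.delta_start <- irq.delta_start
+ *
+ * and otherwise (no softirq running, or PREEMPT_RT):
+ *
+ *	thread.delta_start  <- irq.delta_start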
+ */ + if (!IS_ENABLED(CONFIG_PREEMPT_RT) && osn_var->softirq.delta_start) { + copy_int_safe_time(osn_var, &osn_var->thread.delta_start, + &osn_var->softirq.delta_start); + + copy_int_safe_time(osn_var, &osn_var->softirq.delta_start, + &osn_var->irq.delta_start); + } else { + copy_int_safe_time(osn_var, &osn_var->thread.delta_start, + &osn_var->irq.delta_start); + } + + /* + * Compute the current time with the expected time. + */ + diff = now - tlat->abs_period; + + tlat->count++; + s.seqnum = tlat->count; + s.timer_latency = diff; + s.context = IRQ_CONTEXT; + + trace_timerlat_sample(&s); + + if (osnoise_data.stop_tracing) { + if (time_to_us(diff) >= osnoise_data.stop_tracing) { + + /* + * At this point, if stop_tracing is set and <= print_stack, + * print_stack is set and would be printed in the thread handler. + * + * Thus, print the stack trace as it is helpful to define the + * root cause of an IRQ latency. + */ + if (osnoise_data.stop_tracing <= osnoise_data.print_stack) { + timerlat_save_stack(0); + timerlat_dump_stack(time_to_us(diff)); + } + + osnoise_stop_tracing(); + notify_new_max_latency(diff); + + wake_up_process(tlat->kthread); + + return HRTIMER_NORESTART; + } + } + + wake_up_process(tlat->kthread); + + if (osnoise_data.print_stack) + timerlat_save_stack(0); + + return HRTIMER_NORESTART; +} + +/* + * wait_next_period - Wait for the next period for timerlat + */ +static int wait_next_period(struct timerlat_variables *tlat) +{ + ktime_t next_abs_period, now; + u64 rel_period = osnoise_data.timerlat_period * 1000; + + now = hrtimer_cb_get_time(&tlat->timer); + next_abs_period = ns_to_ktime(tlat->abs_period + rel_period); + + /* + * Save the next abs_period. + */ + tlat->abs_period = (u64) ktime_to_ns(next_abs_period); + + /* + * If the new abs_period is in the past, skip the activation. + */ + while (ktime_compare(now, next_abs_period) > 0) { + next_abs_period = ns_to_ktime(tlat->abs_period + rel_period); + tlat->abs_period = (u64) ktime_to_ns(next_abs_period); + } + + set_current_state(TASK_INTERRUPTIBLE); + + hrtimer_start(&tlat->timer, next_abs_period, HRTIMER_MODE_ABS_PINNED_HARD); + schedule(); + return 1; +} + +/* + * timerlat_main- Timerlat main + */ +static int timerlat_main(void *data) +{ + struct osnoise_variables *osn_var = this_cpu_osn_var(); + struct timerlat_variables *tlat = this_cpu_tmr_var(); + struct timerlat_sample s; + struct sched_param sp; + unsigned long flags; + u64 now, diff; + + /* + * Make the thread RT, that is how cyclictest is usually used. + */ + sp.sched_priority = DEFAULT_TIMERLAT_PRIO; + sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); + + /* + * This thread was created pinned to the CPU using PF_NO_SETAFFINITY. + * The problem is that cgroup does not allow PF_NO_SETAFFINITY thread. + * + * To work around this limitation, disable migration and remove the + * flag. + */ + migrate_disable(); + raw_spin_lock_irqsave(¤t->pi_lock, flags); + current->flags &= ~(PF_NO_SETAFFINITY); + raw_spin_unlock_irqrestore(¤t->pi_lock, flags); + + tlat->count = 0; + tlat->tracing_thread = false; + + hrtimer_init(&tlat->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD); + tlat->timer.function = timerlat_irq; + tlat->kthread = current; + osn_var->pid = current->pid; + /* + * Anotate the arrival time. 
+ */ + tlat->abs_period = hrtimer_cb_get_time(&tlat->timer); + + wait_next_period(tlat); + + osn_var->sampling = 1; + + while (!kthread_should_stop()) { + + now = ktime_to_ns(hrtimer_cb_get_time(&tlat->timer)); + diff = now - tlat->abs_period; + + s.seqnum = tlat->count; + s.timer_latency = diff; + s.context = THREAD_CONTEXT; + + trace_timerlat_sample(&s); + + notify_new_max_latency(diff); + + timerlat_dump_stack(time_to_us(diff)); + + tlat->tracing_thread = false; + if (osnoise_data.stop_tracing_total) + if (time_to_us(diff) >= osnoise_data.stop_tracing_total) + osnoise_stop_tracing(); + + if (osnoise_migration_pending()) + break; + + wait_next_period(tlat); + } + + hrtimer_cancel(&tlat->timer); + migrate_enable(); + return 0; +} +#else /* CONFIG_TIMERLAT_TRACER */ +static int timerlat_main(void *data) +{ + return 0; +} +#endif /* CONFIG_TIMERLAT_TRACER */ + +/* + * stop_kthread - stop a workload thread + */ +static void stop_kthread(unsigned int cpu) +{ + struct task_struct *kthread; + + kthread = per_cpu(per_cpu_osnoise_var, cpu).kthread; + if (kthread) { + if (test_bit(OSN_WORKLOAD, &osnoise_options)) { + kthread_stop(kthread); + } else { + /* + * This is a user thread waiting on the timerlat_fd. We need + * to close all users, and the best way to guarantee this is + * by killing the thread. NOTE: this is a purpose specific file. + */ + kill_pid(kthread->thread_pid, SIGKILL, 1); + put_task_struct(kthread); + } + per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL; + } else { + /* if no workload, just return */ + if (!test_bit(OSN_WORKLOAD, &osnoise_options)) { + /* + * This is set in the osnoise tracer case. + */ + per_cpu(per_cpu_osnoise_var, cpu).sampling = false; + barrier(); + return; + } + } +} + +/* + * stop_per_cpu_kthread - Stop per-cpu threads + * + * Stop the osnoise sampling htread. Use this on unload and at system + * shutdown. + */ +static void stop_per_cpu_kthreads(void) +{ + int cpu; + + cpus_read_lock(); + + for_each_online_cpu(cpu) + stop_kthread(cpu); + + cpus_read_unlock(); +} + +/* + * start_kthread - Start a workload tread + */ +static int start_kthread(unsigned int cpu) +{ + struct task_struct *kthread; + void *main = osnoise_main; + char comm[24]; + + if (timerlat_enabled()) { + snprintf(comm, 24, "timerlat/%d", cpu); + main = timerlat_main; + } else { + /* if no workload, just return */ + if (!test_bit(OSN_WORKLOAD, &osnoise_options)) { + per_cpu(per_cpu_osnoise_var, cpu).sampling = true; + barrier(); + return 0; + } + snprintf(comm, 24, "osnoise/%d", cpu); + } + + kthread = kthread_run_on_cpu(main, NULL, cpu, comm); + + if (IS_ERR(kthread)) { + pr_err(BANNER "could not start sampling thread\n"); + stop_per_cpu_kthreads(); + return -ENOMEM; + } + + per_cpu(per_cpu_osnoise_var, cpu).kthread = kthread; + + return 0; +} + +/* + * start_per_cpu_kthread - Kick off per-cpu osnoise sampling kthreads + * + * This starts the kernel thread that will look for osnoise on many + * cpus. + */ +static int start_per_cpu_kthreads(void) +{ + struct cpumask *current_mask = &save_cpumask; + int retval = 0; + int cpu; + + if (!test_bit(OSN_WORKLOAD, &osnoise_options)) { + if (timerlat_enabled()) + return 0; + } + + cpus_read_lock(); + /* + * Run only on online CPUs in which osnoise is allowed to run. 
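+	 *
+	 * For example (illustrative values): if osnoise/cpus is set to 0-7
+	 * but only CPUs 0-3 are online, the mask computed below restricts
+	 * the workload threads to CPUs 0-3.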
+ */ + cpumask_and(current_mask, cpu_online_mask, &osnoise_cpumask); + + for_each_possible_cpu(cpu) + per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL; + + for_each_cpu(cpu, current_mask) { + retval = start_kthread(cpu); + if (retval) { + cpus_read_unlock(); + stop_per_cpu_kthreads(); + return retval; + } + } + + cpus_read_unlock(); + + return retval; +} + +#ifdef CONFIG_HOTPLUG_CPU +static void osnoise_hotplug_workfn(struct work_struct *dummy) +{ + unsigned int cpu = smp_processor_id(); + + mutex_lock(&trace_types_lock); + + if (!osnoise_has_registered_instances()) + goto out_unlock_trace; + + mutex_lock(&interface_lock); + cpus_read_lock(); + + if (!cpumask_test_cpu(cpu, &osnoise_cpumask)) + goto out_unlock; + + start_kthread(cpu); + +out_unlock: + cpus_read_unlock(); + mutex_unlock(&interface_lock); +out_unlock_trace: + mutex_unlock(&trace_types_lock); +} + +static DECLARE_WORK(osnoise_hotplug_work, osnoise_hotplug_workfn); + +/* + * osnoise_cpu_init - CPU hotplug online callback function + */ +static int osnoise_cpu_init(unsigned int cpu) +{ + schedule_work_on(cpu, &osnoise_hotplug_work); + return 0; +} + +/* + * osnoise_cpu_die - CPU hotplug offline callback function + */ +static int osnoise_cpu_die(unsigned int cpu) +{ + stop_kthread(cpu); + return 0; +} + +static void osnoise_init_hotplug_support(void) +{ + int ret; + + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "trace/osnoise:online", + osnoise_cpu_init, osnoise_cpu_die); + if (ret < 0) + pr_warn(BANNER "Error to init cpu hotplug support\n"); + + return; +} +#else /* CONFIG_HOTPLUG_CPU */ +static void osnoise_init_hotplug_support(void) +{ + return; +} +#endif /* CONFIG_HOTPLUG_CPU */ + +/* + * seq file functions for the osnoise/options file. + */ +static void *s_options_start(struct seq_file *s, loff_t *pos) +{ + int option = *pos; + + mutex_lock(&interface_lock); + + if (option >= OSN_MAX) + return NULL; + + return pos; +} + +static void *s_options_next(struct seq_file *s, void *v, loff_t *pos) +{ + int option = ++(*pos); + + if (option >= OSN_MAX) + return NULL; + + return pos; +} + +static int s_options_show(struct seq_file *s, void *v) +{ + loff_t *pos = v; + int option = *pos; + + if (option == OSN_DEFAULTS) { + if (osnoise_options == OSN_DEFAULT_OPTIONS) + seq_printf(s, "%s", osnoise_options_str[option]); + else + seq_printf(s, "NO_%s", osnoise_options_str[option]); + goto out; + } + + if (test_bit(option, &osnoise_options)) + seq_printf(s, "%s", osnoise_options_str[option]); + else + seq_printf(s, "NO_%s", osnoise_options_str[option]); + +out: + if (option != OSN_MAX) + seq_puts(s, " "); + + return 0; +} + +static void s_options_stop(struct seq_file *s, void *v) +{ + seq_puts(s, "\n"); + mutex_unlock(&interface_lock); +} + +static const struct seq_operations osnoise_options_seq_ops = { + .start = s_options_start, + .next = s_options_next, + .show = s_options_show, + .stop = s_options_stop +}; + +static int osnoise_options_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &osnoise_options_seq_ops); +}; + +/** + * osnoise_options_write - Write function for "options" entry + * @filp: The active open file structure + * @ubuf: The user buffer that contains the value to write + * @cnt: The maximum number of bytes to write to "file" + * @ppos: The current position in @file + * + * Writing the option name sets the option, writing the "NO_" + * prefix in front of the option name disables it. + * + * Writing "DEFAULTS" resets the option values to the default ones. 
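+ *
+ * For example, using the OSNOISE_WORKLOAD option referenced elsewhere in
+ * this file (illustrative shell usage):
+ *
+ *	echo NO_OSNOISE_WORKLOAD > options	# disable the kernel workload
+ *	echo OSNOISE_WORKLOAD > options		# re-enable it
+ *	echo DEFAULTS > options			# restore the default options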
+ */ +static ssize_t osnoise_options_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + int running, option, enable, retval; + char buf[256], *option_str; + + if (cnt >= 256) + return -EINVAL; + + if (copy_from_user(buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + if (strncmp(buf, "NO_", 3)) { + option_str = strstrip(buf); + enable = true; + } else { + option_str = strstrip(&buf[3]); + enable = false; + } + + option = match_string(osnoise_options_str, OSN_MAX, option_str); + if (option < 0) + return -EINVAL; + + /* + * trace_types_lock is taken to avoid concurrency on start/stop. + */ + mutex_lock(&trace_types_lock); + running = osnoise_has_registered_instances(); + if (running) + stop_per_cpu_kthreads(); + + mutex_lock(&interface_lock); + /* + * avoid CPU hotplug operations that might read options. + */ + cpus_read_lock(); + + retval = cnt; + + if (enable) { + if (option == OSN_DEFAULTS) + osnoise_options = OSN_DEFAULT_OPTIONS; + else + set_bit(option, &osnoise_options); + } else { + if (option == OSN_DEFAULTS) + retval = -EINVAL; + else + clear_bit(option, &osnoise_options); + } + + cpus_read_unlock(); + mutex_unlock(&interface_lock); + + if (running) + start_per_cpu_kthreads(); + mutex_unlock(&trace_types_lock); + + return retval; +} + +/* + * osnoise_cpus_read - Read function for reading the "cpus" file + * @filp: The active open file structure + * @ubuf: The userspace provided buffer to read value into + * @cnt: The maximum number of bytes to read + * @ppos: The current "file" position + * + * Prints the "cpus" output into the user-provided buffer. + */ +static ssize_t +osnoise_cpus_read(struct file *filp, char __user *ubuf, size_t count, + loff_t *ppos) +{ + char *mask_str; + int len; + + mutex_lock(&interface_lock); + + len = snprintf(NULL, 0, "%*pbl\n", cpumask_pr_args(&osnoise_cpumask)) + 1; + mask_str = kmalloc(len, GFP_KERNEL); + if (!mask_str) { + count = -ENOMEM; + goto out_unlock; + } + + len = snprintf(mask_str, len, "%*pbl\n", cpumask_pr_args(&osnoise_cpumask)); + if (len >= count) { + count = -EINVAL; + goto out_free; + } + + count = simple_read_from_buffer(ubuf, count, ppos, mask_str, len); + +out_free: + kfree(mask_str); +out_unlock: + mutex_unlock(&interface_lock); + + return count; +} + +/* + * osnoise_cpus_write - Write function for "cpus" entry + * @filp: The active open file structure + * @ubuf: The user buffer that contains the value to write + * @cnt: The maximum number of bytes to write to "file" + * @ppos: The current position in @file + * + * This function provides a write implementation for the "cpus" + * interface to the osnoise trace. By default, it lists all CPUs, + * in this way, allowing osnoise threads to run on any online CPU + * of the system. It serves to restrict the execution of osnoise to the + * set of CPUs writing via this interface. Why not use "tracing_cpumask"? + * Because the user might be interested in tracing what is running on + * other CPUs. For instance, one might run osnoise in one HT CPU + * while observing what is running on the sibling HT CPU. 
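+ *
+ * The input is a cpulist, as accepted by cpulist_parse(). For example
+ * (illustrative values):
+ *
+ *	echo 0-3,8 > osnoise/cpus
+ *
+ * restricts the osnoise/timerlat threads to CPUs 0-3 and 8.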
+ */ +static ssize_t +osnoise_cpus_write(struct file *filp, const char __user *ubuf, size_t count, + loff_t *ppos) +{ + cpumask_var_t osnoise_cpumask_new; + int running, err; + char buf[256]; + + if (count >= 256) + return -EINVAL; + + if (copy_from_user(buf, ubuf, count)) + return -EFAULT; + + if (!zalloc_cpumask_var(&osnoise_cpumask_new, GFP_KERNEL)) + return -ENOMEM; + + err = cpulist_parse(buf, osnoise_cpumask_new); + if (err) + goto err_free; + + /* + * trace_types_lock is taken to avoid concurrency on start/stop. + */ + mutex_lock(&trace_types_lock); + running = osnoise_has_registered_instances(); + if (running) + stop_per_cpu_kthreads(); + + mutex_lock(&interface_lock); + /* + * osnoise_cpumask is read by CPU hotplug operations. + */ + cpus_read_lock(); + + cpumask_copy(&osnoise_cpumask, osnoise_cpumask_new); + + cpus_read_unlock(); + mutex_unlock(&interface_lock); + + if (running) + start_per_cpu_kthreads(); + mutex_unlock(&trace_types_lock); + + free_cpumask_var(osnoise_cpumask_new); + return count; + +err_free: + free_cpumask_var(osnoise_cpumask_new); + + return err; +} + +#ifdef CONFIG_TIMERLAT_TRACER +static int timerlat_fd_open(struct inode *inode, struct file *file) +{ + struct osnoise_variables *osn_var; + struct timerlat_variables *tlat; + long cpu = (long) inode->i_cdev; + + mutex_lock(&interface_lock); + + /* + * This file is accessible only if timerlat is enabled, and + * NO_OSNOISE_WORKLOAD is set. + */ + if (!timerlat_enabled() || test_bit(OSN_WORKLOAD, &osnoise_options)) { + mutex_unlock(&interface_lock); + return -EINVAL; + } + + migrate_disable(); + + osn_var = this_cpu_osn_var(); + + /* + * The osn_var->pid holds the single access to this file. + */ + if (osn_var->pid) { + mutex_unlock(&interface_lock); + migrate_enable(); + return -EBUSY; + } + + /* + * timerlat tracer is a per-cpu tracer. Check if the user-space too + * is pinned to a single CPU. The tracer laters monitor if the task + * migrates and then disables tracer if it does. However, it is + * worth doing this basic acceptance test to avoid obviusly wrong + * setup. + */ + if (current->nr_cpus_allowed > 1 || cpu != smp_processor_id()) { + mutex_unlock(&interface_lock); + migrate_enable(); + return -EPERM; + } + + /* + * From now on, it is good to go. + */ + file->private_data = inode->i_cdev; + + get_task_struct(current); + + osn_var->kthread = current; + osn_var->pid = current->pid; + + /* + * Setup is done. + */ + mutex_unlock(&interface_lock); + + tlat = this_cpu_tmr_var(); + tlat->count = 0; + + migrate_enable(); + return 0; +}; + +/* + * timerlat_fd_read - Read function for "timerlat_fd" file + * @file: The active open file structure + * @ubuf: The userspace provided buffer to read value into + * @cnt: The maximum number of bytes to read + * @ppos: The current "file" position + * + * Prints 1 on timerlat, the number of interferences on osnoise, -1 on error. + */ +static ssize_t +timerlat_fd_read(struct file *file, char __user *ubuf, size_t count, + loff_t *ppos) +{ + long cpu = (long) file->private_data; + struct osnoise_variables *osn_var; + struct timerlat_variables *tlat; + struct timerlat_sample s; + s64 diff; + u64 now; + + migrate_disable(); + + tlat = this_cpu_tmr_var(); + + /* + * While in user-space, the thread is migratable. There is nothing + * we can do about it. + * So, if the thread is running on another CPU, stop the machinery. 
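+	 *
+	 * (The expected user-space pattern, sketched: the reader pins itself
+	 * to the CPU matching the per_cpu/cpuN/timerlat_fd file it opened,
+	 * as required by timerlat_fd_open(), and then loops on read(), doing
+	 * its per-activation work between reads.)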
+ */ + if (cpu == smp_processor_id()) { + if (tlat->uthread_migrate) { + migrate_enable(); + return -EINVAL; + } + } else { + per_cpu_ptr(&per_cpu_timerlat_var, cpu)->uthread_migrate = 1; + osnoise_taint("timerlat user thread migrate\n"); + osnoise_stop_tracing(); + migrate_enable(); + return -EINVAL; + } + + osn_var = this_cpu_osn_var(); + + /* + * The timerlat in user-space runs in a different order: + * the read() starts from the execution of the previous occurrence, + * sleeping for the next occurrence. + * + * So, skip if we are entering on read() before the first wakeup + * from timerlat IRQ: + */ + if (likely(osn_var->sampling)) { + now = ktime_to_ns(hrtimer_cb_get_time(&tlat->timer)); + diff = now - tlat->abs_period; + + /* + * it was not a timer firing, but some other signal? + */ + if (diff < 0) + goto out; + + s.seqnum = tlat->count; + s.timer_latency = diff; + s.context = THREAD_URET; + + trace_timerlat_sample(&s); + + notify_new_max_latency(diff); + + tlat->tracing_thread = false; + if (osnoise_data.stop_tracing_total) + if (time_to_us(diff) >= osnoise_data.stop_tracing_total) + osnoise_stop_tracing(); + } else { + tlat->tracing_thread = false; + tlat->kthread = current; + + hrtimer_init(&tlat->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD); + tlat->timer.function = timerlat_irq; + + /* Annotate now to drift new period */ + tlat->abs_period = hrtimer_cb_get_time(&tlat->timer); + + osn_var->sampling = 1; + } + + /* wait for the next period */ + wait_next_period(tlat); + + /* This is the wakeup from this cycle */ + now = ktime_to_ns(hrtimer_cb_get_time(&tlat->timer)); + diff = now - tlat->abs_period; + + /* + * it was not a timer firing, but some other signal? + */ + if (diff < 0) + goto out; + + s.seqnum = tlat->count; + s.timer_latency = diff; + s.context = THREAD_CONTEXT; + + trace_timerlat_sample(&s); + + if (osnoise_data.stop_tracing_total) { + if (time_to_us(diff) >= osnoise_data.stop_tracing_total) { + timerlat_dump_stack(time_to_us(diff)); + notify_new_max_latency(diff); + osnoise_stop_tracing(); + } + } + +out: + migrate_enable(); + return 0; +} + +static int timerlat_fd_release(struct inode *inode, struct file *file) +{ + struct osnoise_variables *osn_var; + struct timerlat_variables *tlat_var; + long cpu = (long) file->private_data; + + migrate_disable(); + mutex_lock(&interface_lock); + + osn_var = per_cpu_ptr(&per_cpu_osnoise_var, cpu); + tlat_var = per_cpu_ptr(&per_cpu_timerlat_var, cpu); + + hrtimer_cancel(&tlat_var->timer); + memset(tlat_var, 0, sizeof(*tlat_var)); + + osn_var->sampling = 0; + osn_var->pid = 0; + + /* + * We are leaving, not being stopped... see stop_kthread(); + */ + if (osn_var->kthread) { + put_task_struct(osn_var->kthread); + osn_var->kthread = NULL; + } + + mutex_unlock(&interface_lock); + migrate_enable(); + return 0; +} +#endif + +/* + * osnoise/runtime_us: cannot be greater than the period. + */ +static struct trace_min_max_param osnoise_runtime = { + .lock = &interface_lock, + .val = &osnoise_data.sample_runtime, + .max = &osnoise_data.sample_period, + .min = NULL, +}; + +/* + * osnoise/period_us: cannot be smaller than the runtime. + */ +static struct trace_min_max_param osnoise_period = { + .lock = &interface_lock, + .val = &osnoise_data.sample_period, + .max = NULL, + .min = &osnoise_data.sample_runtime, +}; + +/* + * osnoise/stop_tracing_us: no limit. 
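+ * As the checks in timerlat_irq() above suggest, a value of 0 leaves the
+ * threshold disabled.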
+ */ +static struct trace_min_max_param osnoise_stop_tracing_in = { + .lock = &interface_lock, + .val = &osnoise_data.stop_tracing, + .max = NULL, + .min = NULL, +}; + +/* + * osnoise/stop_tracing_total_us: no limit. + */ +static struct trace_min_max_param osnoise_stop_tracing_total = { + .lock = &interface_lock, + .val = &osnoise_data.stop_tracing_total, + .max = NULL, + .min = NULL, +}; + +#ifdef CONFIG_TIMERLAT_TRACER +/* + * osnoise/print_stack: print the stacktrace of the IRQ handler if the total + * latency is higher than val. + */ +static struct trace_min_max_param osnoise_print_stack = { + .lock = &interface_lock, + .val = &osnoise_data.print_stack, + .max = NULL, + .min = NULL, +}; + +/* + * osnoise/timerlat_period: min 100 us, max 1 s + */ +static u64 timerlat_min_period = 100; +static u64 timerlat_max_period = 1000000; +static struct trace_min_max_param timerlat_period = { + .lock = &interface_lock, + .val = &osnoise_data.timerlat_period, + .max = &timerlat_max_period, + .min = &timerlat_min_period, +}; + +static const struct file_operations timerlat_fd_fops = { + .open = timerlat_fd_open, + .read = timerlat_fd_read, + .release = timerlat_fd_release, + .llseek = generic_file_llseek, +}; +#endif + +static const struct file_operations cpus_fops = { + .open = tracing_open_generic, + .read = osnoise_cpus_read, + .write = osnoise_cpus_write, + .llseek = generic_file_llseek, +}; + +static const struct file_operations osnoise_options_fops = { + .open = osnoise_options_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, + .write = osnoise_options_write +}; + +#ifdef CONFIG_TIMERLAT_TRACER +#ifdef CONFIG_STACKTRACE +static int init_timerlat_stack_tracefs(struct dentry *top_dir) +{ + struct dentry *tmp; + + tmp = tracefs_create_file("print_stack", TRACE_MODE_WRITE, top_dir, + &osnoise_print_stack, &trace_min_max_fops); + if (!tmp) + return -ENOMEM; + + return 0; +} +#else /* CONFIG_STACKTRACE */ +static int init_timerlat_stack_tracefs(struct dentry *top_dir) +{ + return 0; +} +#endif /* CONFIG_STACKTRACE */ + +static int osnoise_create_cpu_timerlat_fd(struct dentry *top_dir) +{ + struct dentry *timerlat_fd; + struct dentry *per_cpu; + struct dentry *cpu_dir; + char cpu_str[30]; /* see trace.c: tracing_init_tracefs_percpu() */ + long cpu; + + /* + * Why not using tracing instance per_cpu/ dir? + * + * Because osnoise/timerlat have a single workload, having + * multiple files like these are wast of memory. 
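+	 *
+	 * The resulting layout is:
+	 *
+	 *	osnoise/per_cpu/cpu0/timerlat_fd
+	 *	osnoise/per_cpu/cpu1/timerlat_fd
+	 *	...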
+ */ + per_cpu = tracefs_create_dir("per_cpu", top_dir); + if (!per_cpu) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + snprintf(cpu_str, 30, "cpu%ld", cpu); + cpu_dir = tracefs_create_dir(cpu_str, per_cpu); + if (!cpu_dir) + goto out_clean; + + timerlat_fd = trace_create_file("timerlat_fd", TRACE_MODE_READ, + cpu_dir, NULL, &timerlat_fd_fops); + if (!timerlat_fd) + goto out_clean; + + /* Record the CPU */ + d_inode(timerlat_fd)->i_cdev = (void *)(cpu); + } + + return 0; + +out_clean: + tracefs_remove(per_cpu); + return -ENOMEM; +} + +/* + * init_timerlat_tracefs - A function to initialize the timerlat interface files + */ +static int init_timerlat_tracefs(struct dentry *top_dir) +{ + struct dentry *tmp; + int retval; + + tmp = tracefs_create_file("timerlat_period_us", TRACE_MODE_WRITE, top_dir, + &timerlat_period, &trace_min_max_fops); + if (!tmp) + return -ENOMEM; + + retval = osnoise_create_cpu_timerlat_fd(top_dir); + if (retval) + return retval; + + return init_timerlat_stack_tracefs(top_dir); +} +#else /* CONFIG_TIMERLAT_TRACER */ +static int init_timerlat_tracefs(struct dentry *top_dir) +{ + return 0; +} +#endif /* CONFIG_TIMERLAT_TRACER */ + +/* + * init_tracefs - A function to initialize the tracefs interface files + * + * This function creates entries in tracefs for "osnoise" and "timerlat". + * It creates these directories in the tracing directory, and within that + * directory the use can change and view the configs. + */ +static int init_tracefs(void) +{ + struct dentry *top_dir; + struct dentry *tmp; + int ret; + + ret = tracing_init_dentry(); + if (ret) + return -ENOMEM; + + top_dir = tracefs_create_dir("osnoise", NULL); + if (!top_dir) + return 0; + + tmp = tracefs_create_file("period_us", TRACE_MODE_WRITE, top_dir, + &osnoise_period, &trace_min_max_fops); + if (!tmp) + goto err; + + tmp = tracefs_create_file("runtime_us", TRACE_MODE_WRITE, top_dir, + &osnoise_runtime, &trace_min_max_fops); + if (!tmp) + goto err; + + tmp = tracefs_create_file("stop_tracing_us", TRACE_MODE_WRITE, top_dir, + &osnoise_stop_tracing_in, &trace_min_max_fops); + if (!tmp) + goto err; + + tmp = tracefs_create_file("stop_tracing_total_us", TRACE_MODE_WRITE, top_dir, + &osnoise_stop_tracing_total, &trace_min_max_fops); + if (!tmp) + goto err; + + tmp = trace_create_file("cpus", TRACE_MODE_WRITE, top_dir, NULL, &cpus_fops); + if (!tmp) + goto err; + + tmp = trace_create_file("options", TRACE_MODE_WRITE, top_dir, NULL, + &osnoise_options_fops); + if (!tmp) + goto err; + + ret = init_timerlat_tracefs(top_dir); + if (ret) + goto err; + + return 0; + +err: + tracefs_remove(top_dir); + return -ENOMEM; +} + +static int osnoise_hook_events(void) +{ + int retval; + + /* + * Trace is already hooked, we are re-enabling from + * a stop_tracing_*. + */ + if (trace_osnoise_callback_enabled) + return 0; + + retval = hook_irq_events(); + if (retval) + return -EINVAL; + + retval = hook_softirq_events(); + if (retval) + goto out_unhook_irq; + + retval = hook_thread_events(); + /* + * All fine! + */ + if (!retval) + return 0; + + unhook_softirq_events(); +out_unhook_irq: + unhook_irq_events(); + return -EINVAL; +} + +static void osnoise_unhook_events(void) +{ + unhook_thread_events(); + unhook_softirq_events(); + unhook_irq_events(); +} + +/* + * osnoise_workload_start - start the workload and hook to events + */ +static int osnoise_workload_start(void) +{ + int retval; + + /* + * Instances need to be registered after calling workload + * start. 
Hence, if there is already an instance, the + * workload was already registered. Otherwise, this + * code is on the way to register the first instance, + * and the workload will start. + */ + if (osnoise_has_registered_instances()) + return 0; + + osn_var_reset_all(); + + retval = osnoise_hook_events(); + if (retval) + return retval; + + /* + * Make sure that ftrace_nmi_enter/exit() see reset values + * before enabling trace_osnoise_callback_enabled. + */ + barrier(); + trace_osnoise_callback_enabled = true; + + retval = start_per_cpu_kthreads(); + if (retval) { + trace_osnoise_callback_enabled = false; + /* + * Make sure that ftrace_nmi_enter/exit() see + * trace_osnoise_callback_enabled as false before continuing. + */ + barrier(); + + osnoise_unhook_events(); + return retval; + } + + return 0; +} + +/* + * osnoise_workload_stop - stop the workload and unhook the events + */ +static void osnoise_workload_stop(void) +{ + /* + * Instances need to be unregistered before calling + * stop. Hence, if there is a registered instance, more + * than one instance is running, and the workload will not + * yet stop. Otherwise, this code is on the way to disable + * the last instance, and the workload can stop. + */ + if (osnoise_has_registered_instances()) + return; + + /* + * If callbacks were already disabled in a previous stop + * call, there is no need to disable then again. + * + * For instance, this happens when tracing is stopped via: + * echo 0 > tracing_on + * echo nop > current_tracer. + */ + if (!trace_osnoise_callback_enabled) + return; + + trace_osnoise_callback_enabled = false; + /* + * Make sure that ftrace_nmi_enter/exit() see + * trace_osnoise_callback_enabled as false before continuing. + */ + barrier(); + + stop_per_cpu_kthreads(); + + osnoise_unhook_events(); +} + +static void osnoise_tracer_start(struct trace_array *tr) +{ + int retval; + + /* + * If the instance is already registered, there is no need to + * register it again. + */ + if (osnoise_instance_registered(tr)) + return; + + retval = osnoise_workload_start(); + if (retval) + pr_err(BANNER "Error starting osnoise tracer\n"); + + osnoise_register_instance(tr); +} + +static void osnoise_tracer_stop(struct trace_array *tr) +{ + osnoise_unregister_instance(tr); + osnoise_workload_stop(); +} + +static int osnoise_tracer_init(struct trace_array *tr) +{ + /* + * Only allow osnoise tracer if timerlat tracer is not running + * already. + */ + if (timerlat_enabled()) + return -EBUSY; + + tr->max_latency = 0; + + osnoise_tracer_start(tr); + return 0; +} + +static void osnoise_tracer_reset(struct trace_array *tr) +{ + osnoise_tracer_stop(tr); +} + +static struct tracer osnoise_tracer __read_mostly = { + .name = "osnoise", + .init = osnoise_tracer_init, + .reset = osnoise_tracer_reset, + .start = osnoise_tracer_start, + .stop = osnoise_tracer_stop, + .print_header = print_osnoise_headers, + .allow_instances = true, +}; + +#ifdef CONFIG_TIMERLAT_TRACER +static void timerlat_tracer_start(struct trace_array *tr) +{ + int retval; + + /* + * If the instance is already registered, there is no need to + * register it again. + */ + if (osnoise_instance_registered(tr)) + return; + + retval = osnoise_workload_start(); + if (retval) + pr_err(BANNER "Error starting timerlat tracer\n"); + + osnoise_register_instance(tr); + + return; +} + +static void timerlat_tracer_stop(struct trace_array *tr) +{ + int cpu; + + osnoise_unregister_instance(tr); + + /* + * Instruct the threads to stop only if this is the last instance. 
+ */ + if (!osnoise_has_registered_instances()) { + for_each_online_cpu(cpu) + per_cpu(per_cpu_osnoise_var, cpu).sampling = 0; + } + + osnoise_workload_stop(); +} + +static int timerlat_tracer_init(struct trace_array *tr) +{ + /* + * Only allow timerlat tracer if osnoise tracer is not running already. + */ + if (osnoise_has_registered_instances() && !osnoise_data.timerlat_tracer) + return -EBUSY; + + /* + * If this is the first instance, set timerlat_tracer to block + * osnoise tracer start. + */ + if (!osnoise_has_registered_instances()) + osnoise_data.timerlat_tracer = 1; + + tr->max_latency = 0; + timerlat_tracer_start(tr); + + return 0; +} + +static void timerlat_tracer_reset(struct trace_array *tr) +{ + timerlat_tracer_stop(tr); + + /* + * If this is the last instance, reset timerlat_tracer allowing + * osnoise to be started. + */ + if (!osnoise_has_registered_instances()) + osnoise_data.timerlat_tracer = 0; +} + +static struct tracer timerlat_tracer __read_mostly = { + .name = "timerlat", + .init = timerlat_tracer_init, + .reset = timerlat_tracer_reset, + .start = timerlat_tracer_start, + .stop = timerlat_tracer_stop, + .print_header = print_timerlat_headers, + .allow_instances = true, +}; + +__init static int init_timerlat_tracer(void) +{ + return register_tracer(&timerlat_tracer); +} +#else /* CONFIG_TIMERLAT_TRACER */ +__init static int init_timerlat_tracer(void) +{ + return 0; +} +#endif /* CONFIG_TIMERLAT_TRACER */ + +__init static int init_osnoise_tracer(void) +{ + int ret; + + mutex_init(&interface_lock); + + cpumask_copy(&osnoise_cpumask, cpu_all_mask); + + ret = register_tracer(&osnoise_tracer); + if (ret) { + pr_err(BANNER "Error registering osnoise!\n"); + return ret; + } + + ret = init_timerlat_tracer(); + if (ret) { + pr_err(BANNER "Error registering timerlat!\n"); + return ret; + } + + osnoise_init_hotplug_support(); + + INIT_LIST_HEAD_RCU(&osnoise_instances); + + init_tracefs(); + + return 0; +} +late_initcall(init_osnoise_tracer); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c new file mode 100644 index 0000000000..3b7d3e9eb6 --- /dev/null +++ b/kernel/trace/trace_output.c @@ -0,0 +1,1727 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * trace_output.c + * + * Copyright (C) 2008 Red Hat Inc, Steven Rostedt <srostedt@redhat.com> + * + */ +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/ftrace.h> +#include <linux/kprobes.h> +#include <linux/sched/clock.h> +#include <linux/sched/mm.h> +#include <linux/idr.h> + +#include "trace_output.h" + +/* must be a power of 2 */ +#define EVENT_HASHSIZE 128 + +DECLARE_RWSEM(trace_event_sem); + +static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; + +enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *entry = iter->ent; + struct bputs_entry *field; + + trace_assign_type(field, entry); + + trace_seq_puts(s, field->str); + + return trace_handle_return(s); +} + +enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *entry = iter->ent; + struct bprint_entry *field; + + trace_assign_type(field, entry); + + trace_seq_bprintf(s, field->fmt, field->buf); + + return trace_handle_return(s); +} + +enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *entry = iter->ent; + struct print_entry *field; + + trace_assign_type(field, entry); + + 
trace_seq_puts(s, field->buf); + + return trace_handle_return(s); +} + +const char * +trace_print_flags_seq(struct trace_seq *p, const char *delim, + unsigned long flags, + const struct trace_print_flags *flag_array) +{ + unsigned long mask; + const char *str; + const char *ret = trace_seq_buffer_ptr(p); + int i, first = 1; + + for (i = 0; flag_array[i].name && flags; i++) { + + mask = flag_array[i].mask; + if ((flags & mask) != mask) + continue; + + str = flag_array[i].name; + flags &= ~mask; + if (!first && delim) + trace_seq_puts(p, delim); + else + first = 0; + trace_seq_puts(p, str); + } + + /* check for left over flags */ + if (flags) { + if (!first && delim) + trace_seq_puts(p, delim); + trace_seq_printf(p, "0x%lx", flags); + } + + trace_seq_putc(p, 0); + + return ret; +} +EXPORT_SYMBOL(trace_print_flags_seq); + +const char * +trace_print_symbols_seq(struct trace_seq *p, unsigned long val, + const struct trace_print_flags *symbol_array) +{ + int i; + const char *ret = trace_seq_buffer_ptr(p); + + for (i = 0; symbol_array[i].name; i++) { + + if (val != symbol_array[i].mask) + continue; + + trace_seq_puts(p, symbol_array[i].name); + break; + } + + if (ret == (const char *)(trace_seq_buffer_ptr(p))) + trace_seq_printf(p, "0x%lx", val); + + trace_seq_putc(p, 0); + + return ret; +} +EXPORT_SYMBOL(trace_print_symbols_seq); + +#if BITS_PER_LONG == 32 +const char * +trace_print_flags_seq_u64(struct trace_seq *p, const char *delim, + unsigned long long flags, + const struct trace_print_flags_u64 *flag_array) +{ + unsigned long long mask; + const char *str; + const char *ret = trace_seq_buffer_ptr(p); + int i, first = 1; + + for (i = 0; flag_array[i].name && flags; i++) { + + mask = flag_array[i].mask; + if ((flags & mask) != mask) + continue; + + str = flag_array[i].name; + flags &= ~mask; + if (!first && delim) + trace_seq_puts(p, delim); + else + first = 0; + trace_seq_puts(p, str); + } + + /* check for left over flags */ + if (flags) { + if (!first && delim) + trace_seq_puts(p, delim); + trace_seq_printf(p, "0x%llx", flags); + } + + trace_seq_putc(p, 0); + + return ret; +} +EXPORT_SYMBOL(trace_print_flags_seq_u64); + +const char * +trace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val, + const struct trace_print_flags_u64 *symbol_array) +{ + int i; + const char *ret = trace_seq_buffer_ptr(p); + + for (i = 0; symbol_array[i].name; i++) { + + if (val != symbol_array[i].mask) + continue; + + trace_seq_puts(p, symbol_array[i].name); + break; + } + + if (ret == (const char *)(trace_seq_buffer_ptr(p))) + trace_seq_printf(p, "0x%llx", val); + + trace_seq_putc(p, 0); + + return ret; +} +EXPORT_SYMBOL(trace_print_symbols_seq_u64); +#endif + +const char * +trace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr, + unsigned int bitmask_size) +{ + const char *ret = trace_seq_buffer_ptr(p); + + trace_seq_bitmask(p, bitmask_ptr, bitmask_size * 8); + trace_seq_putc(p, 0); + + return ret; +} +EXPORT_SYMBOL_GPL(trace_print_bitmask_seq); + +/** + * trace_print_hex_seq - print buffer as hex sequence + * @p: trace seq struct to write to + * @buf: The buffer to print + * @buf_len: Length of @buf in bytes + * @concatenate: Print @buf as single hex string or with spacing + * + * Prints the passed buffer as a hex sequence either as a whole, + * single hex string if @concatenate is true or with spacing after + * each byte in case @concatenate is false. 
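+ *
+ * For example, a four byte buffer { 0xde, 0xad, 0xbe, 0xef } is printed
+ * as "deadbeef" when @concatenate is true, and as "de ad be ef" when it
+ * is false.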
+ */ +const char * +trace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len, + bool concatenate) +{ + int i; + const char *ret = trace_seq_buffer_ptr(p); + const char *fmt = concatenate ? "%*phN" : "%*ph"; + + for (i = 0; i < buf_len; i += 16) { + if (!concatenate && i != 0) + trace_seq_putc(p, ' '); + trace_seq_printf(p, fmt, min(buf_len - i, 16), &buf[i]); + } + trace_seq_putc(p, 0); + + return ret; +} +EXPORT_SYMBOL(trace_print_hex_seq); + +const char * +trace_print_array_seq(struct trace_seq *p, const void *buf, int count, + size_t el_size) +{ + const char *ret = trace_seq_buffer_ptr(p); + const char *prefix = ""; + void *ptr = (void *)buf; + size_t buf_len = count * el_size; + + trace_seq_putc(p, '{'); + + while (ptr < buf + buf_len) { + switch (el_size) { + case 1: + trace_seq_printf(p, "%s0x%x", prefix, + *(u8 *)ptr); + break; + case 2: + trace_seq_printf(p, "%s0x%x", prefix, + *(u16 *)ptr); + break; + case 4: + trace_seq_printf(p, "%s0x%x", prefix, + *(u32 *)ptr); + break; + case 8: + trace_seq_printf(p, "%s0x%llx", prefix, + *(u64 *)ptr); + break; + default: + trace_seq_printf(p, "BAD SIZE:%zu 0x%x", el_size, + *(u8 *)ptr); + el_size = 1; + } + prefix = ","; + ptr += el_size; + } + + trace_seq_putc(p, '}'); + trace_seq_putc(p, 0); + + return ret; +} +EXPORT_SYMBOL(trace_print_array_seq); + +const char * +trace_print_hex_dump_seq(struct trace_seq *p, const char *prefix_str, + int prefix_type, int rowsize, int groupsize, + const void *buf, size_t len, bool ascii) +{ + const char *ret = trace_seq_buffer_ptr(p); + + trace_seq_putc(p, '\n'); + trace_seq_hex_dump(p, prefix_str, prefix_type, + rowsize, groupsize, buf, len, ascii); + trace_seq_putc(p, 0); + return ret; +} +EXPORT_SYMBOL(trace_print_hex_dump_seq); + +int trace_raw_output_prep(struct trace_iterator *iter, + struct trace_event *trace_event) +{ + struct trace_event_call *event; + struct trace_seq *s = &iter->seq; + struct trace_seq *p = &iter->tmp_seq; + struct trace_entry *entry; + + event = container_of(trace_event, struct trace_event_call, event); + entry = iter->ent; + + if (entry->type != event->event.type) { + WARN_ON_ONCE(1); + return TRACE_TYPE_UNHANDLED; + } + + trace_seq_init(p); + trace_seq_printf(s, "%s: ", trace_event_name(event)); + + return trace_handle_return(s); +} +EXPORT_SYMBOL(trace_raw_output_prep); + +void trace_event_printf(struct trace_iterator *iter, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + trace_check_vprintf(iter, trace_event_format(iter, fmt), ap); + va_end(ap); +} +EXPORT_SYMBOL(trace_event_printf); + +static __printf(3, 0) +int trace_output_raw(struct trace_iterator *iter, char *name, + char *fmt, va_list ap) +{ + struct trace_seq *s = &iter->seq; + + trace_seq_printf(s, "%s: ", name); + trace_seq_vprintf(s, trace_event_format(iter, fmt), ap); + + return trace_handle_return(s); +} + +int trace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...) 
+{ + va_list ap; + int ret; + + va_start(ap, fmt); + ret = trace_output_raw(iter, name, fmt, ap); + va_end(ap); + + return ret; +} +EXPORT_SYMBOL_GPL(trace_output_call); + +static inline const char *kretprobed(const char *name, unsigned long addr) +{ + if (is_kretprobe_trampoline(addr)) + return "[unknown/kretprobe'd]"; + return name; +} + +void +trace_seq_print_sym(struct trace_seq *s, unsigned long address, bool offset) +{ +#ifdef CONFIG_KALLSYMS + char str[KSYM_SYMBOL_LEN]; + const char *name; + + if (offset) + sprint_symbol(str, address); + else + kallsyms_lookup(address, NULL, NULL, NULL, str); + name = kretprobed(str, address); + + if (name && strlen(name)) { + trace_seq_puts(s, name); + return; + } +#endif + trace_seq_printf(s, "0x%08lx", address); +} + +#ifndef CONFIG_64BIT +# define IP_FMT "%08lx" +#else +# define IP_FMT "%016lx" +#endif + +static int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, + unsigned long ip, unsigned long sym_flags) +{ + struct file *file = NULL; + unsigned long vmstart = 0; + int ret = 1; + + if (s->full) + return 0; + + if (mm) { + const struct vm_area_struct *vma; + + mmap_read_lock(mm); + vma = find_vma(mm, ip); + if (vma) { + file = vma->vm_file; + vmstart = vma->vm_start; + } + if (file) { + ret = trace_seq_path(s, &file->f_path); + if (ret) + trace_seq_printf(s, "[+0x%lx]", + ip - vmstart); + } + mmap_read_unlock(mm); + } + if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file)) + trace_seq_printf(s, " <" IP_FMT ">", ip); + return !trace_seq_has_overflowed(s); +} + +int +seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) +{ + if (!ip) { + trace_seq_putc(s, '0'); + goto out; + } + + trace_seq_print_sym(s, ip, sym_flags & TRACE_ITER_SYM_OFFSET); + + if (sym_flags & TRACE_ITER_SYM_ADDR) + trace_seq_printf(s, " <" IP_FMT ">", ip); + + out: + return !trace_seq_has_overflowed(s); +} + +/** + * trace_print_lat_fmt - print the irq, preempt and lockdep fields + * @s: trace seq struct to write to + * @entry: The trace entry field from the ring buffer + * + * Prints the generic fields of irqs off, in hard or softirq, preempt + * count. + */ +int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) +{ + char hardsoft_irq; + char need_resched; + char irqs_off; + int hardirq; + int softirq; + int bh_off; + int nmi; + + nmi = entry->flags & TRACE_FLAG_NMI; + hardirq = entry->flags & TRACE_FLAG_HARDIRQ; + softirq = entry->flags & TRACE_FLAG_SOFTIRQ; + bh_off = entry->flags & TRACE_FLAG_BH_OFF; + + irqs_off = + (entry->flags & TRACE_FLAG_IRQS_OFF && bh_off) ? 'D' : + (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : + bh_off ? 'b' : + (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : + '.'; + + switch (entry->flags & (TRACE_FLAG_NEED_RESCHED | + TRACE_FLAG_PREEMPT_RESCHED)) { + case TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_PREEMPT_RESCHED: + need_resched = 'N'; + break; + case TRACE_FLAG_NEED_RESCHED: + need_resched = 'n'; + break; + case TRACE_FLAG_PREEMPT_RESCHED: + need_resched = 'p'; + break; + default: + need_resched = '.'; + break; + } + + hardsoft_irq = + (nmi && hardirq) ? 'Z' : + nmi ? 'z' : + (hardirq && softirq) ? 'H' : + hardirq ? 'h' : + softirq ? 's' : + '.' 
; + + trace_seq_printf(s, "%c%c%c", + irqs_off, need_resched, hardsoft_irq); + + if (entry->preempt_count & 0xf) + trace_seq_printf(s, "%x", entry->preempt_count & 0xf); + else + trace_seq_putc(s, '.'); + + if (entry->preempt_count & 0xf0) + trace_seq_printf(s, "%x", entry->preempt_count >> 4); + else + trace_seq_putc(s, '.'); + + return !trace_seq_has_overflowed(s); +} + +static int +lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) +{ + char comm[TASK_COMM_LEN]; + + trace_find_cmdline(entry->pid, comm); + + trace_seq_printf(s, "%8.8s-%-7d %3d", + comm, entry->pid, cpu); + + return trace_print_lat_fmt(s, entry); +} + +#undef MARK +#define MARK(v, s) {.val = v, .sym = s} +/* trace overhead mark */ +static const struct trace_mark { + unsigned long long val; /* unit: nsec */ + char sym; +} mark[] = { + MARK(1000000000ULL , '$'), /* 1 sec */ + MARK(100000000ULL , '@'), /* 100 msec */ + MARK(10000000ULL , '*'), /* 10 msec */ + MARK(1000000ULL , '#'), /* 1000 usecs */ + MARK(100000ULL , '!'), /* 100 usecs */ + MARK(10000ULL , '+'), /* 10 usecs */ +}; +#undef MARK + +char trace_find_mark(unsigned long long d) +{ + int i; + int size = ARRAY_SIZE(mark); + + for (i = 0; i < size; i++) { + if (d > mark[i].val) + break; + } + + return (i == size) ? ' ' : mark[i].sym; +} + +static int +lat_print_timestamp(struct trace_iterator *iter, u64 next_ts) +{ + struct trace_array *tr = iter->tr; + unsigned long verbose = tr->trace_flags & TRACE_ITER_VERBOSE; + unsigned long in_ns = iter->iter_flags & TRACE_FILE_TIME_IN_NS; + unsigned long long abs_ts = iter->ts - iter->array_buffer->time_start; + unsigned long long rel_ts = next_ts - iter->ts; + struct trace_seq *s = &iter->seq; + + if (in_ns) { + abs_ts = ns2usecs(abs_ts); + rel_ts = ns2usecs(rel_ts); + } + + if (verbose && in_ns) { + unsigned long abs_usec = do_div(abs_ts, USEC_PER_MSEC); + unsigned long abs_msec = (unsigned long)abs_ts; + unsigned long rel_usec = do_div(rel_ts, USEC_PER_MSEC); + unsigned long rel_msec = (unsigned long)rel_ts; + + trace_seq_printf( + s, "[%08llx] %ld.%03ldms (+%ld.%03ldms): ", + ns2usecs(iter->ts), + abs_msec, abs_usec, + rel_msec, rel_usec); + + } else if (verbose && !in_ns) { + trace_seq_printf( + s, "[%016llx] %lld (+%lld): ", + iter->ts, abs_ts, rel_ts); + + } else if (!verbose && in_ns) { + trace_seq_printf( + s, " %4lldus%c: ", + abs_ts, + trace_find_mark(rel_ts * NSEC_PER_USEC)); + + } else { /* !verbose && !in_ns */ + trace_seq_printf(s, " %4lld: ", abs_ts); + } + + return !trace_seq_has_overflowed(s); +} + +static void trace_print_time(struct trace_seq *s, struct trace_iterator *iter, + unsigned long long ts) +{ + unsigned long secs, usec_rem; + unsigned long long t; + + if (iter->iter_flags & TRACE_FILE_TIME_IN_NS) { + t = ns2usecs(ts); + usec_rem = do_div(t, USEC_PER_SEC); + secs = (unsigned long)t; + trace_seq_printf(s, " %5lu.%06lu", secs, usec_rem); + } else + trace_seq_printf(s, " %12llu", ts); +} + +int trace_print_context(struct trace_iterator *iter) +{ + struct trace_array *tr = iter->tr; + struct trace_seq *s = &iter->seq; + struct trace_entry *entry = iter->ent; + char comm[TASK_COMM_LEN]; + + trace_find_cmdline(entry->pid, comm); + + trace_seq_printf(s, "%16s-%-7d ", comm, entry->pid); + + if (tr->trace_flags & TRACE_ITER_RECORD_TGID) { + unsigned int tgid = trace_find_tgid(entry->pid); + + if (!tgid) + trace_seq_printf(s, "(-------) "); + else + trace_seq_printf(s, "(%7d) ", tgid); + } + + trace_seq_printf(s, "[%03d] ", iter->cpu); + + if (tr->trace_flags & TRACE_ITER_IRQ_INFO) + 
trace_print_lat_fmt(s, entry); + + trace_print_time(s, iter, iter->ts); + trace_seq_puts(s, ": "); + + return !trace_seq_has_overflowed(s); +} + +int trace_print_lat_context(struct trace_iterator *iter) +{ + struct trace_entry *entry, *next_entry; + struct trace_array *tr = iter->tr; + struct trace_seq *s = &iter->seq; + unsigned long verbose = (tr->trace_flags & TRACE_ITER_VERBOSE); + u64 next_ts; + + next_entry = trace_find_next_entry(iter, NULL, &next_ts); + if (!next_entry) + next_ts = iter->ts; + + /* trace_find_next_entry() may change iter->ent */ + entry = iter->ent; + + if (verbose) { + char comm[TASK_COMM_LEN]; + + trace_find_cmdline(entry->pid, comm); + + trace_seq_printf( + s, "%16s %7d %3d %d %08x %08lx ", + comm, entry->pid, iter->cpu, entry->flags, + entry->preempt_count & 0xf, iter->idx); + } else { + lat_print_generic(s, entry, iter->cpu); + } + + lat_print_timestamp(iter, next_ts); + + return !trace_seq_has_overflowed(s); +} + +/** + * ftrace_find_event - find a registered event + * @type: the type of event to look for + * + * Returns an event of type @type otherwise NULL + * Called with trace_event_read_lock() held. + */ +struct trace_event *ftrace_find_event(int type) +{ + struct trace_event *event; + unsigned key; + + key = type & (EVENT_HASHSIZE - 1); + + hlist_for_each_entry(event, &event_hash[key], node) { + if (event->type == type) + return event; + } + + return NULL; +} + +static DEFINE_IDA(trace_event_ida); + +static void free_trace_event_type(int type) +{ + if (type >= __TRACE_LAST_TYPE) + ida_free(&trace_event_ida, type); +} + +static int alloc_trace_event_type(void) +{ + int next; + + /* Skip static defined type numbers */ + next = ida_alloc_range(&trace_event_ida, __TRACE_LAST_TYPE, + TRACE_EVENT_TYPE_MAX, GFP_KERNEL); + if (next < 0) + return 0; + return next; +} + +void trace_event_read_lock(void) +{ + down_read(&trace_event_sem); +} + +void trace_event_read_unlock(void) +{ + up_read(&trace_event_sem); +} + +/** + * register_trace_event - register output for an event type + * @event: the event type to register + * + * Event types are stored in a hash and this hash is used to + * find a way to print an event. If the @event->type is set + * then it will use that type, otherwise it will assign a + * type to use. + * + * If you assign your own type, please make sure it is added + * to the trace_type enum in trace.h, to avoid collisions + * with the dynamic types. + * + * Returns the event type number or zero on error. 
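+ *
+ * For example, the static events defined later in this file (such as
+ * trace_fn_event) pre-set @event->type; a caller that leaves the type as
+ * zero gets one allocated from the dynamic range instead.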
+ */ +int register_trace_event(struct trace_event *event) +{ + unsigned key; + int ret = 0; + + down_write(&trace_event_sem); + + if (WARN_ON(!event)) + goto out; + + if (WARN_ON(!event->funcs)) + goto out; + + if (!event->type) { + event->type = alloc_trace_event_type(); + if (!event->type) + goto out; + } else if (WARN(event->type > __TRACE_LAST_TYPE, + "Need to add type to trace.h")) { + goto out; + } else { + /* Is this event already used */ + if (ftrace_find_event(event->type)) + goto out; + } + + if (event->funcs->trace == NULL) + event->funcs->trace = trace_nop_print; + if (event->funcs->raw == NULL) + event->funcs->raw = trace_nop_print; + if (event->funcs->hex == NULL) + event->funcs->hex = trace_nop_print; + if (event->funcs->binary == NULL) + event->funcs->binary = trace_nop_print; + + key = event->type & (EVENT_HASHSIZE - 1); + + hlist_add_head(&event->node, &event_hash[key]); + + ret = event->type; + out: + up_write(&trace_event_sem); + + return ret; +} +EXPORT_SYMBOL_GPL(register_trace_event); + +/* + * Used by module code with the trace_event_sem held for write. + */ +int __unregister_trace_event(struct trace_event *event) +{ + hlist_del(&event->node); + free_trace_event_type(event->type); + return 0; +} + +/** + * unregister_trace_event - remove a no longer used event + * @event: the event to remove + */ +int unregister_trace_event(struct trace_event *event) +{ + down_write(&trace_event_sem); + __unregister_trace_event(event); + up_write(&trace_event_sem); + + return 0; +} +EXPORT_SYMBOL_GPL(unregister_trace_event); + +/* + * Standard events + */ + +static void print_array(struct trace_iterator *iter, void *pos, + struct ftrace_event_field *field) +{ + int offset; + int len; + int i; + + offset = *(int *)pos & 0xffff; + len = *(int *)pos >> 16; + + if (field) + offset += field->offset + sizeof(int); + + if (offset + len > iter->ent_size) { + trace_seq_puts(&iter->seq, "<OVERFLOW>"); + return; + } + + pos = (void *)iter->ent + offset; + + for (i = 0; i < len; i++, pos++) { + if (i) + trace_seq_putc(&iter->seq, ','); + trace_seq_printf(&iter->seq, "%02x", *(unsigned char *)pos); + } +} + +static void print_fields(struct trace_iterator *iter, struct trace_event_call *call, + struct list_head *head) +{ + struct ftrace_event_field *field; + int offset; + int len; + int ret; + void *pos; + + list_for_each_entry_reverse(field, head, link) { + trace_seq_printf(&iter->seq, " %s=", field->name); + if (field->offset + field->size > iter->ent_size) { + trace_seq_puts(&iter->seq, "<OVERFLOW>"); + continue; + } + pos = (void *)iter->ent + field->offset; + + switch (field->filter_type) { + case FILTER_COMM: + case FILTER_STATIC_STRING: + trace_seq_printf(&iter->seq, "%.*s", field->size, (char *)pos); + break; + case FILTER_RDYN_STRING: + case FILTER_DYN_STRING: + offset = *(int *)pos & 0xffff; + len = *(int *)pos >> 16; + + if (field->filter_type == FILTER_RDYN_STRING) + offset += field->offset + sizeof(int); + + if (offset + len > iter->ent_size) { + trace_seq_puts(&iter->seq, "<OVERFLOW>"); + break; + } + pos = (void *)iter->ent + offset; + trace_seq_printf(&iter->seq, "%.*s", len, (char *)pos); + break; + case FILTER_PTR_STRING: + if (!iter->fmt_size) + trace_iter_expand_format(iter); + pos = *(void **)pos; + ret = strncpy_from_kernel_nofault(iter->fmt, pos, + iter->fmt_size); + if (ret < 0) + trace_seq_printf(&iter->seq, "(0x%px)", pos); + else + trace_seq_printf(&iter->seq, "(0x%px:%s)", + pos, iter->fmt); + break; + case FILTER_TRACE_FN: + pos = *(void **)pos; + 
trace_seq_printf(&iter->seq, "%pS", pos); + break; + case FILTER_CPU: + case FILTER_OTHER: + switch (field->size) { + case 1: + if (isprint(*(char *)pos)) { + trace_seq_printf(&iter->seq, "'%c'", + *(unsigned char *)pos); + } + trace_seq_printf(&iter->seq, "(%d)", + *(unsigned char *)pos); + break; + case 2: + trace_seq_printf(&iter->seq, "0x%x (%d)", + *(unsigned short *)pos, + *(unsigned short *)pos); + break; + case 4: + /* dynamic array info is 4 bytes */ + if (strstr(field->type, "__data_loc")) { + print_array(iter, pos, NULL); + break; + } + + if (strstr(field->type, "__rel_loc")) { + print_array(iter, pos, field); + break; + } + + trace_seq_printf(&iter->seq, "0x%x (%d)", + *(unsigned int *)pos, + *(unsigned int *)pos); + break; + case 8: + trace_seq_printf(&iter->seq, "0x%llx (%lld)", + *(unsigned long long *)pos, + *(unsigned long long *)pos); + break; + default: + trace_seq_puts(&iter->seq, "<INVALID-SIZE>"); + break; + } + break; + default: + trace_seq_puts(&iter->seq, "<INVALID-TYPE>"); + } + } + trace_seq_putc(&iter->seq, '\n'); +} + +enum print_line_t print_event_fields(struct trace_iterator *iter, + struct trace_event *event) +{ + struct trace_event_call *call; + struct list_head *head; + + /* ftrace defined events have separate call structures */ + if (event->type <= __TRACE_LAST_TYPE) { + bool found = false; + + down_read(&trace_event_sem); + list_for_each_entry(call, &ftrace_events, list) { + if (call->event.type == event->type) { + found = true; + break; + } + /* No need to search all events */ + if (call->event.type > __TRACE_LAST_TYPE) + break; + } + up_read(&trace_event_sem); + if (!found) { + trace_seq_printf(&iter->seq, "UNKNOWN TYPE %d\n", event->type); + goto out; + } + } else { + call = container_of(event, struct trace_event_call, event); + } + head = trace_get_fields(call); + + trace_seq_printf(&iter->seq, "%s:", trace_event_name(call)); + + if (head && !list_empty(head)) + print_fields(iter, call, head); + else + trace_seq_puts(&iter->seq, "No fields found\n"); + + out: + return trace_handle_return(&iter->seq); +} + +enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + trace_seq_printf(&iter->seq, "type: %d\n", iter->ent->type); + + return trace_handle_return(&iter->seq); +} + +static void print_fn_trace(struct trace_seq *s, unsigned long ip, + unsigned long parent_ip, int flags) +{ + seq_print_ip_sym(s, ip, flags); + + if ((flags & TRACE_ITER_PRINT_PARENT) && parent_ip) { + trace_seq_puts(s, " <-"); + seq_print_ip_sym(s, parent_ip, flags); + } +} + +/* TRACE_FN */ +static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct ftrace_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + + print_fn_trace(s, field->ip, field->parent_ip, flags); + trace_seq_putc(s, '\n'); + + return trace_handle_return(s); +} + +static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct ftrace_entry *field; + + trace_assign_type(field, iter->ent); + + trace_seq_printf(&iter->seq, "%lx %lx\n", + field->ip, + field->parent_ip); + + return trace_handle_return(&iter->seq); +} + +static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct ftrace_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + + SEQ_PUT_HEX_FIELD(s, field->ip); + SEQ_PUT_HEX_FIELD(s, field->parent_ip); + + 
return trace_handle_return(s); +} + +static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct ftrace_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + + SEQ_PUT_FIELD(s, field->ip); + SEQ_PUT_FIELD(s, field->parent_ip); + + return trace_handle_return(s); +} + +static struct trace_event_functions trace_fn_funcs = { + .trace = trace_fn_trace, + .raw = trace_fn_raw, + .hex = trace_fn_hex, + .binary = trace_fn_bin, +}; + +static struct trace_event trace_fn_event = { + .type = TRACE_FN, + .funcs = &trace_fn_funcs, +}; + +/* TRACE_CTX an TRACE_WAKE */ +static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter, + char *delim) +{ + struct ctx_switch_entry *field; + char comm[TASK_COMM_LEN]; + int S, T; + + + trace_assign_type(field, iter->ent); + + T = task_index_to_char(field->next_state); + S = task_index_to_char(field->prev_state); + trace_find_cmdline(field->next_pid, comm); + trace_seq_printf(&iter->seq, + " %7d:%3d:%c %s [%03d] %7d:%3d:%c %s\n", + field->prev_pid, + field->prev_prio, + S, delim, + field->next_cpu, + field->next_pid, + field->next_prio, + T, comm); + + return trace_handle_return(&iter->seq); +} + +static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + return trace_ctxwake_print(iter, "==>"); +} + +static enum print_line_t trace_wake_print(struct trace_iterator *iter, + int flags, struct trace_event *event) +{ + return trace_ctxwake_print(iter, " +"); +} + +static int trace_ctxwake_raw(struct trace_iterator *iter, char S) +{ + struct ctx_switch_entry *field; + int T; + + trace_assign_type(field, iter->ent); + + if (!S) + S = task_index_to_char(field->prev_state); + T = task_index_to_char(field->next_state); + trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n", + field->prev_pid, + field->prev_prio, + S, + field->next_cpu, + field->next_pid, + field->next_prio, + T); + + return trace_handle_return(&iter->seq); +} + +static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + return trace_ctxwake_raw(iter, 0); +} + +static enum print_line_t trace_wake_raw(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + return trace_ctxwake_raw(iter, '+'); +} + + +static int trace_ctxwake_hex(struct trace_iterator *iter, char S) +{ + struct ctx_switch_entry *field; + struct trace_seq *s = &iter->seq; + int T; + + trace_assign_type(field, iter->ent); + + if (!S) + S = task_index_to_char(field->prev_state); + T = task_index_to_char(field->next_state); + + SEQ_PUT_HEX_FIELD(s, field->prev_pid); + SEQ_PUT_HEX_FIELD(s, field->prev_prio); + SEQ_PUT_HEX_FIELD(s, S); + SEQ_PUT_HEX_FIELD(s, field->next_cpu); + SEQ_PUT_HEX_FIELD(s, field->next_pid); + SEQ_PUT_HEX_FIELD(s, field->next_prio); + SEQ_PUT_HEX_FIELD(s, T); + + return trace_handle_return(s); +} + +static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + return trace_ctxwake_hex(iter, 0); +} + +static enum print_line_t trace_wake_hex(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + return trace_ctxwake_hex(iter, '+'); +} + +static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter, + int flags, struct trace_event *event) +{ + struct ctx_switch_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + + SEQ_PUT_FIELD(s, field->prev_pid); + SEQ_PUT_FIELD(s, 
field->prev_prio); + SEQ_PUT_FIELD(s, field->prev_state); + SEQ_PUT_FIELD(s, field->next_cpu); + SEQ_PUT_FIELD(s, field->next_pid); + SEQ_PUT_FIELD(s, field->next_prio); + SEQ_PUT_FIELD(s, field->next_state); + + return trace_handle_return(s); +} + +static struct trace_event_functions trace_ctx_funcs = { + .trace = trace_ctx_print, + .raw = trace_ctx_raw, + .hex = trace_ctx_hex, + .binary = trace_ctxwake_bin, +}; + +static struct trace_event trace_ctx_event = { + .type = TRACE_CTX, + .funcs = &trace_ctx_funcs, +}; + +static struct trace_event_functions trace_wake_funcs = { + .trace = trace_wake_print, + .raw = trace_wake_raw, + .hex = trace_wake_hex, + .binary = trace_ctxwake_bin, +}; + +static struct trace_event trace_wake_event = { + .type = TRACE_WAKE, + .funcs = &trace_wake_funcs, +}; + +/* TRACE_STACK */ + +static enum print_line_t trace_stack_print(struct trace_iterator *iter, + int flags, struct trace_event *event) +{ + struct stack_entry *field; + struct trace_seq *s = &iter->seq; + unsigned long *p; + unsigned long *end; + + trace_assign_type(field, iter->ent); + end = (unsigned long *)((long)iter->ent + iter->ent_size); + + trace_seq_puts(s, "<stack trace>\n"); + + for (p = field->caller; p && p < end && *p != ULONG_MAX; p++) { + + if (trace_seq_has_overflowed(s)) + break; + + trace_seq_puts(s, " => "); + seq_print_ip_sym(s, *p, flags); + trace_seq_putc(s, '\n'); + } + + return trace_handle_return(s); +} + +static struct trace_event_functions trace_stack_funcs = { + .trace = trace_stack_print, +}; + +static struct trace_event trace_stack_event = { + .type = TRACE_STACK, + .funcs = &trace_stack_funcs, +}; + +/* TRACE_USER_STACK */ +static enum print_line_t trace_user_stack_print(struct trace_iterator *iter, + int flags, struct trace_event *event) +{ + struct trace_array *tr = iter->tr; + struct userstack_entry *field; + struct trace_seq *s = &iter->seq; + struct mm_struct *mm = NULL; + unsigned int i; + + trace_assign_type(field, iter->ent); + + trace_seq_puts(s, "<user stack trace>\n"); + + if (tr->trace_flags & TRACE_ITER_SYM_USEROBJ) { + struct task_struct *task; + /* + * we do the lookup on the thread group leader, + * since individual threads might have already quit! + */ + rcu_read_lock(); + task = find_task_by_vpid(field->tgid); + if (task) + mm = get_task_mm(task); + rcu_read_unlock(); + } + + for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { + unsigned long ip = field->caller[i]; + + if (!ip || trace_seq_has_overflowed(s)) + break; + + trace_seq_puts(s, " => "); + seq_print_user_ip(s, mm, ip, flags); + trace_seq_putc(s, '\n'); + } + + if (mm) + mmput(mm); + + return trace_handle_return(s); +} + +static struct trace_event_functions trace_user_stack_funcs = { + .trace = trace_user_stack_print, +}; + +static struct trace_event trace_user_stack_event = { + .type = TRACE_USER_STACK, + .funcs = &trace_user_stack_funcs, +}; + +/* TRACE_HWLAT */ +static enum print_line_t +trace_hwlat_print(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct trace_entry *entry = iter->ent; + struct trace_seq *s = &iter->seq; + struct hwlat_entry *field; + + trace_assign_type(field, entry); + + trace_seq_printf(s, "#%-5u inner/outer(us): %4llu/%-5llu ts:%lld.%09ld count:%d", + field->seqnum, + field->duration, + field->outer_duration, + (long long)field->timestamp.tv_sec, + field->timestamp.tv_nsec, field->count); + + if (field->nmi_count) { + /* + * The generic sched_clock() is not NMI safe, thus + * we only record the count and not the time. 
+ */ + if (!IS_ENABLED(CONFIG_GENERIC_SCHED_CLOCK)) + trace_seq_printf(s, " nmi-total:%llu", + field->nmi_total_ts); + trace_seq_printf(s, " nmi-count:%u", + field->nmi_count); + } + + trace_seq_putc(s, '\n'); + + return trace_handle_return(s); +} + +static enum print_line_t +trace_hwlat_raw(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct hwlat_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + + trace_seq_printf(s, "%llu %lld %lld %09ld %u\n", + field->duration, + field->outer_duration, + (long long)field->timestamp.tv_sec, + field->timestamp.tv_nsec, + field->seqnum); + + return trace_handle_return(s); +} + +static struct trace_event_functions trace_hwlat_funcs = { + .trace = trace_hwlat_print, + .raw = trace_hwlat_raw, +}; + +static struct trace_event trace_hwlat_event = { + .type = TRACE_HWLAT, + .funcs = &trace_hwlat_funcs, +}; + +/* TRACE_OSNOISE */ +static enum print_line_t +trace_osnoise_print(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct trace_entry *entry = iter->ent; + struct trace_seq *s = &iter->seq; + struct osnoise_entry *field; + u64 ratio, ratio_dec; + u64 net_runtime; + + trace_assign_type(field, entry); + + /* + * compute the available % of cpu time. + */ + net_runtime = field->runtime - field->noise; + ratio = net_runtime * 10000000; + do_div(ratio, field->runtime); + ratio_dec = do_div(ratio, 100000); + + trace_seq_printf(s, "%llu %10llu %3llu.%05llu %7llu", + field->runtime, + field->noise, + ratio, ratio_dec, + field->max_sample); + + trace_seq_printf(s, " %6u", field->hw_count); + trace_seq_printf(s, " %6u", field->nmi_count); + trace_seq_printf(s, " %6u", field->irq_count); + trace_seq_printf(s, " %6u", field->softirq_count); + trace_seq_printf(s, " %6u", field->thread_count); + + trace_seq_putc(s, '\n'); + + return trace_handle_return(s); +} + +static enum print_line_t +trace_osnoise_raw(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct osnoise_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + + trace_seq_printf(s, "%lld %llu %llu %u %u %u %u %u\n", + field->runtime, + field->noise, + field->max_sample, + field->hw_count, + field->nmi_count, + field->irq_count, + field->softirq_count, + field->thread_count); + + return trace_handle_return(s); +} + +static struct trace_event_functions trace_osnoise_funcs = { + .trace = trace_osnoise_print, + .raw = trace_osnoise_raw, +}; + +static struct trace_event trace_osnoise_event = { + .type = TRACE_OSNOISE, + .funcs = &trace_osnoise_funcs, +}; + +/* TRACE_TIMERLAT */ + +static char *timerlat_lat_context[] = {"irq", "thread", "user-ret"}; +static enum print_line_t +trace_timerlat_print(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct trace_entry *entry = iter->ent; + struct trace_seq *s = &iter->seq; + struct timerlat_entry *field; + + trace_assign_type(field, entry); + + trace_seq_printf(s, "#%-5u context %6s timer_latency %9llu ns\n", + field->seqnum, + timerlat_lat_context[field->context], + field->timer_latency); + + return trace_handle_return(s); +} + +static enum print_line_t +trace_timerlat_raw(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct timerlat_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + + trace_seq_printf(s, "%u %d %llu\n", + field->seqnum, + field->context, + field->timer_latency); + + return trace_handle_return(s); +} 
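The per-type output handlers above all follow one pattern: a print callback formats the entry into iter->seq and returns through trace_handle_return(), the variants are bundled in a struct trace_event_functions, and a struct trace_event keyed by the entry type is later registered via register_trace_event() from init_events(). The short sketch below (not part of this file) illustrates that minimal wiring for a hypothetical TRACE_EXAMPLE entry type with an assumed struct example_entry; the names trace_example_*, example_entry and TRACE_EXAMPLE are placeholders for illustration only.

/* Illustrative sketch, assuming a hypothetical TRACE_EXAMPLE entry type. */
static enum print_line_t trace_example_print(struct trace_iterator *iter,
					     int flags, struct trace_event *event)
{
	struct example_entry *field;		/* assumed entry layout with a u64 'value' */
	struct trace_seq *s = &iter->seq;

	trace_assign_type(field, iter->ent);	/* same helper the real handlers use */
	trace_seq_printf(s, "example: %llu\n", field->value);

	return trace_handle_return(s);
}

static struct trace_event_functions trace_example_funcs = {
	.trace	= trace_example_print,		/* unset variants fall back to defaults */
};

static struct trace_event trace_example_event = {
	.type	= TRACE_EXAMPLE,		/* hypothetical entry type id */
	.funcs	= &trace_example_funcs,
};

As with the real entry types in this file, such an event would also be added to the events[] array so that init_events() registers it at boot.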
+ +static struct trace_event_functions trace_timerlat_funcs = { + .trace = trace_timerlat_print, + .raw = trace_timerlat_raw, +}; + +static struct trace_event trace_timerlat_event = { + .type = TRACE_TIMERLAT, + .funcs = &trace_timerlat_funcs, +}; + +/* TRACE_BPUTS */ +static enum print_line_t +trace_bputs_print(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct trace_entry *entry = iter->ent; + struct trace_seq *s = &iter->seq; + struct bputs_entry *field; + + trace_assign_type(field, entry); + + seq_print_ip_sym(s, field->ip, flags); + trace_seq_puts(s, ": "); + trace_seq_puts(s, field->str); + + return trace_handle_return(s); +} + + +static enum print_line_t +trace_bputs_raw(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct bputs_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + + trace_seq_printf(s, ": %lx : ", field->ip); + trace_seq_puts(s, field->str); + + return trace_handle_return(s); +} + +static struct trace_event_functions trace_bputs_funcs = { + .trace = trace_bputs_print, + .raw = trace_bputs_raw, +}; + +static struct trace_event trace_bputs_event = { + .type = TRACE_BPUTS, + .funcs = &trace_bputs_funcs, +}; + +/* TRACE_BPRINT */ +static enum print_line_t +trace_bprint_print(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct trace_entry *entry = iter->ent; + struct trace_seq *s = &iter->seq; + struct bprint_entry *field; + + trace_assign_type(field, entry); + + seq_print_ip_sym(s, field->ip, flags); + trace_seq_puts(s, ": "); + trace_seq_bprintf(s, field->fmt, field->buf); + + return trace_handle_return(s); +} + + +static enum print_line_t +trace_bprint_raw(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct bprint_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + + trace_seq_printf(s, ": %lx : ", field->ip); + trace_seq_bprintf(s, field->fmt, field->buf); + + return trace_handle_return(s); +} + +static struct trace_event_functions trace_bprint_funcs = { + .trace = trace_bprint_print, + .raw = trace_bprint_raw, +}; + +static struct trace_event trace_bprint_event = { + .type = TRACE_BPRINT, + .funcs = &trace_bprint_funcs, +}; + +/* TRACE_PRINT */ +static enum print_line_t trace_print_print(struct trace_iterator *iter, + int flags, struct trace_event *event) +{ + struct print_entry *field; + struct trace_seq *s = &iter->seq; + int max = iter->ent_size - offsetof(struct print_entry, buf); + + trace_assign_type(field, iter->ent); + + seq_print_ip_sym(s, field->ip, flags); + trace_seq_printf(s, ": %.*s", max, field->buf); + + return trace_handle_return(s); +} + +static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct print_entry *field; + int max = iter->ent_size - offsetof(struct print_entry, buf); + + trace_assign_type(field, iter->ent); + + trace_seq_printf(&iter->seq, "# %lx %.*s", field->ip, max, field->buf); + + return trace_handle_return(&iter->seq); +} + +static struct trace_event_functions trace_print_funcs = { + .trace = trace_print_print, + .raw = trace_print_raw, +}; + +static struct trace_event trace_print_event = { + .type = TRACE_PRINT, + .funcs = &trace_print_funcs, +}; + +static enum print_line_t trace_raw_data(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct raw_data_entry *field; + int i; + + trace_assign_type(field, iter->ent); + + trace_seq_printf(&iter->seq, "# 
%x buf:", field->id); + + for (i = 0; i < iter->ent_size - offsetof(struct raw_data_entry, buf); i++) + trace_seq_printf(&iter->seq, " %02x", + (unsigned char)field->buf[i]); + + trace_seq_putc(&iter->seq, '\n'); + + return trace_handle_return(&iter->seq); +} + +static struct trace_event_functions trace_raw_data_funcs = { + .trace = trace_raw_data, + .raw = trace_raw_data, +}; + +static struct trace_event trace_raw_data_event = { + .type = TRACE_RAW_DATA, + .funcs = &trace_raw_data_funcs, +}; + +static enum print_line_t +trace_func_repeats_raw(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct func_repeats_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + + trace_seq_printf(s, "%lu %lu %u %llu\n", + field->ip, + field->parent_ip, + field->count, + FUNC_REPEATS_GET_DELTA_TS(field)); + + return trace_handle_return(s); +} + +static enum print_line_t +trace_func_repeats_print(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct func_repeats_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + + print_fn_trace(s, field->ip, field->parent_ip, flags); + trace_seq_printf(s, " (repeats: %u, last_ts:", field->count); + trace_print_time(s, iter, + iter->ts - FUNC_REPEATS_GET_DELTA_TS(field)); + trace_seq_puts(s, ")\n"); + + return trace_handle_return(s); +} + +static struct trace_event_functions trace_func_repeats_funcs = { + .trace = trace_func_repeats_print, + .raw = trace_func_repeats_raw, +}; + +static struct trace_event trace_func_repeats_event = { + .type = TRACE_FUNC_REPEATS, + .funcs = &trace_func_repeats_funcs, +}; + +static struct trace_event *events[] __initdata = { + &trace_fn_event, + &trace_ctx_event, + &trace_wake_event, + &trace_stack_event, + &trace_user_stack_event, + &trace_bputs_event, + &trace_bprint_event, + &trace_print_event, + &trace_hwlat_event, + &trace_osnoise_event, + &trace_timerlat_event, + &trace_raw_data_event, + &trace_func_repeats_event, + NULL +}; + +__init int init_events(void) +{ + struct trace_event *event; + int i, ret; + + for (i = 0; events[i]; i++) { + event = events[i]; + ret = register_trace_event(event); + WARN_ONCE(!ret, "event %d failed to register", event->type); + } + + return 0; +} diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h new file mode 100644 index 0000000000..dca40f1f1d --- /dev/null +++ b/kernel/trace/trace_output.h @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef __TRACE_EVENTS_H +#define __TRACE_EVENTS_H + +#include <linux/trace_seq.h> +#include "trace.h" + +extern enum print_line_t +trace_print_bputs_msg_only(struct trace_iterator *iter); +extern enum print_line_t +trace_print_bprintk_msg_only(struct trace_iterator *iter); +extern enum print_line_t +trace_print_printk_msg_only(struct trace_iterator *iter); + +extern int +seq_print_ip_sym(struct trace_seq *s, unsigned long ip, + unsigned long sym_flags); + +extern void trace_seq_print_sym(struct trace_seq *s, unsigned long address, bool offset); +extern int trace_print_context(struct trace_iterator *iter); +extern int trace_print_lat_context(struct trace_iterator *iter); +extern enum print_line_t print_event_fields(struct trace_iterator *iter, + struct trace_event *event); + +extern void trace_event_read_lock(void); +extern void trace_event_read_unlock(void); +extern struct trace_event *ftrace_find_event(int type); + +extern enum print_line_t trace_nop_print(struct trace_iterator *iter, + int flags, struct 
trace_event *event); +extern int +trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry); + +/* used by module unregistering */ +extern int __unregister_trace_event(struct trace_event *event); +extern struct rw_semaphore trace_event_sem; + +#define SEQ_PUT_FIELD(s, x) \ + trace_seq_putmem(s, &(x), sizeof(x)) + +#define SEQ_PUT_HEX_FIELD(s, x) \ + trace_seq_putmem_hex(s, &(x), sizeof(x)) + +#endif + diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c new file mode 100644 index 0000000000..e37446f791 --- /dev/null +++ b/kernel/trace/trace_preemptirq.c @@ -0,0 +1,112 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * preemptoff and irqoff tracepoints + * + * Copyright (C) Joel Fernandes (Google) <joel@joelfernandes.org> + */ + +#include <linux/kallsyms.h> +#include <linux/uaccess.h> +#include <linux/module.h> +#include <linux/ftrace.h> +#include <linux/kprobes.h> +#include "trace.h" + +#define CREATE_TRACE_POINTS +#include <trace/events/preemptirq.h> + +/* + * Use regular trace points on architectures that implement noinstr + * tooling: these calls will only happen with RCU enabled, which can + * use a regular tracepoint. + * + * On older architectures, use the rcuidle tracing methods (which + * aren't NMI-safe - so exclude NMI contexts): + */ +#ifdef CONFIG_ARCH_WANTS_NO_INSTR +#define trace(point) trace_##point +#else +#define trace(point) if (!in_nmi()) trace_##point##_rcuidle +#endif + +#ifdef CONFIG_TRACE_IRQFLAGS +/* Per-cpu variable to prevent redundant calls when IRQs already off */ +static DEFINE_PER_CPU(int, tracing_irq_cpu); + +/* + * Like trace_hardirqs_on() but without the lockdep invocation. This is + * used in the low level entry code where the ordering vs. RCU is important + * and lockdep uses a staged approach which splits the lockdep hardirq + * tracking into a RCU on and a RCU off section. + */ +void trace_hardirqs_on_prepare(void) +{ + if (this_cpu_read(tracing_irq_cpu)) { + trace(irq_enable)(CALLER_ADDR0, CALLER_ADDR1); + tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1); + this_cpu_write(tracing_irq_cpu, 0); + } +} +EXPORT_SYMBOL(trace_hardirqs_on_prepare); +NOKPROBE_SYMBOL(trace_hardirqs_on_prepare); + +void trace_hardirqs_on(void) +{ + if (this_cpu_read(tracing_irq_cpu)) { + trace(irq_enable)(CALLER_ADDR0, CALLER_ADDR1); + tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1); + this_cpu_write(tracing_irq_cpu, 0); + } + + lockdep_hardirqs_on_prepare(); + lockdep_hardirqs_on(CALLER_ADDR0); +} +EXPORT_SYMBOL(trace_hardirqs_on); +NOKPROBE_SYMBOL(trace_hardirqs_on); + +/* + * Like trace_hardirqs_off() but without the lockdep invocation. This is + * used in the low level entry code where the ordering vs. RCU is important + * and lockdep uses a staged approach which splits the lockdep hardirq + * tracking into a RCU on and a RCU off section. 
+ */ +void trace_hardirqs_off_finish(void) +{ + if (!this_cpu_read(tracing_irq_cpu)) { + this_cpu_write(tracing_irq_cpu, 1); + tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1); + trace(irq_disable)(CALLER_ADDR0, CALLER_ADDR1); + } + +} +EXPORT_SYMBOL(trace_hardirqs_off_finish); +NOKPROBE_SYMBOL(trace_hardirqs_off_finish); + +void trace_hardirqs_off(void) +{ + lockdep_hardirqs_off(CALLER_ADDR0); + + if (!this_cpu_read(tracing_irq_cpu)) { + this_cpu_write(tracing_irq_cpu, 1); + tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1); + trace(irq_disable)(CALLER_ADDR0, CALLER_ADDR1); + } +} +EXPORT_SYMBOL(trace_hardirqs_off); +NOKPROBE_SYMBOL(trace_hardirqs_off); +#endif /* CONFIG_TRACE_IRQFLAGS */ + +#ifdef CONFIG_TRACE_PREEMPT_TOGGLE + +void trace_preempt_on(unsigned long a0, unsigned long a1) +{ + trace(preempt_enable)(a0, a1); + tracer_preempt_on(a0, a1); +} + +void trace_preempt_off(unsigned long a0, unsigned long a1) +{ + trace(preempt_disable)(a0, a1); + tracer_preempt_off(a0, a1); +} +#endif diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c new file mode 100644 index 0000000000..29f6e95439 --- /dev/null +++ b/kernel/trace/trace_printk.c @@ -0,0 +1,400 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * trace binary printk + * + * Copyright (C) 2008 Lai Jiangshan <laijs@cn.fujitsu.com> + * + */ +#include <linux/seq_file.h> +#include <linux/security.h> +#include <linux/uaccess.h> +#include <linux/kernel.h> +#include <linux/ftrace.h> +#include <linux/string.h> +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/ctype.h> +#include <linux/list.h> +#include <linux/slab.h> + +#include "trace.h" + +#ifdef CONFIG_MODULES + +/* + * modules trace_printk()'s formats are autosaved in struct trace_bprintk_fmt + * which are queued on trace_bprintk_fmt_list. 
+ */ +static LIST_HEAD(trace_bprintk_fmt_list); + +/* serialize accesses to trace_bprintk_fmt_list */ +static DEFINE_MUTEX(btrace_mutex); + +struct trace_bprintk_fmt { + struct list_head list; + const char *fmt; +}; + +static inline struct trace_bprintk_fmt *lookup_format(const char *fmt) +{ + struct trace_bprintk_fmt *pos; + + if (!fmt) + return ERR_PTR(-EINVAL); + + list_for_each_entry(pos, &trace_bprintk_fmt_list, list) { + if (!strcmp(pos->fmt, fmt)) + return pos; + } + return NULL; +} + +static +void hold_module_trace_bprintk_format(const char **start, const char **end) +{ + const char **iter; + char *fmt; + + /* allocate the trace_printk per cpu buffers */ + if (start != end) + trace_printk_init_buffers(); + + mutex_lock(&btrace_mutex); + for (iter = start; iter < end; iter++) { + struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter); + if (tb_fmt) { + if (!IS_ERR(tb_fmt)) + *iter = tb_fmt->fmt; + continue; + } + + fmt = NULL; + tb_fmt = kmalloc(sizeof(*tb_fmt), GFP_KERNEL); + if (tb_fmt) { + fmt = kmalloc(strlen(*iter) + 1, GFP_KERNEL); + if (fmt) { + list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list); + strcpy(fmt, *iter); + tb_fmt->fmt = fmt; + } else + kfree(tb_fmt); + } + *iter = fmt; + + } + mutex_unlock(&btrace_mutex); +} + +static int module_trace_bprintk_format_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct module *mod = data; + if (mod->num_trace_bprintk_fmt) { + const char **start = mod->trace_bprintk_fmt_start; + const char **end = start + mod->num_trace_bprintk_fmt; + + if (val == MODULE_STATE_COMING) + hold_module_trace_bprintk_format(start, end); + } + return NOTIFY_OK; +} + +/* + * The debugfs/tracing/printk_formats file maps the addresses with + * the ASCII formats that are used in the bprintk events in the + * buffer. For userspace tools to be able to decode the events from + * the buffer, they need to be able to map the address with the format. + * + * The addresses of the bprintk formats are in their own section + * __trace_printk_fmt. But for modules we copy them into a link list. + * The code to print the formats and their addresses passes around the + * address of the fmt string. If the fmt address passed into the seq + * functions is within the kernel core __trace_printk_fmt section, then + * it simply uses the next pointer in the list. + * + * When the fmt pointer is outside the kernel core __trace_printk_fmt + * section, then we need to read the link list pointers. The trick is + * we pass the address of the string to the seq function just like + * we do for the kernel core formats. To get back the structure that + * holds the format, we simply use container_of() and then go to the + * next format in the list. + */ +static const char ** +find_next_mod_format(int start_index, void *v, const char **fmt, loff_t *pos) +{ + struct trace_bprintk_fmt *mod_fmt; + + if (list_empty(&trace_bprintk_fmt_list)) + return NULL; + + /* + * v will point to the address of the fmt record from t_next + * v will be NULL from t_start. + * If this is the first pointer or called from start + * then we need to walk the list. + */ + if (!v || start_index == *pos) { + struct trace_bprintk_fmt *p; + + /* search the module list */ + list_for_each_entry(p, &trace_bprintk_fmt_list, list) { + if (start_index == *pos) + return &p->fmt; + start_index++; + } + /* pos > index */ + return NULL; + } + + /* + * v points to the address of the fmt field in the mod list + * structure that holds the module print format. 
+ */ + mod_fmt = container_of(v, typeof(*mod_fmt), fmt); + if (mod_fmt->list.next == &trace_bprintk_fmt_list) + return NULL; + + mod_fmt = container_of(mod_fmt->list.next, typeof(*mod_fmt), list); + + return &mod_fmt->fmt; +} + +static void format_mod_start(void) +{ + mutex_lock(&btrace_mutex); +} + +static void format_mod_stop(void) +{ + mutex_unlock(&btrace_mutex); +} + +#else /* !CONFIG_MODULES */ +__init static int +module_trace_bprintk_format_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + return NOTIFY_OK; +} +static inline const char ** +find_next_mod_format(int start_index, void *v, const char **fmt, loff_t *pos) +{ + return NULL; +} +static inline void format_mod_start(void) { } +static inline void format_mod_stop(void) { } +#endif /* CONFIG_MODULES */ + +static bool __read_mostly trace_printk_enabled = true; + +void trace_printk_control(bool enabled) +{ + trace_printk_enabled = enabled; +} + +__initdata_or_module static +struct notifier_block module_trace_bprintk_format_nb = { + .notifier_call = module_trace_bprintk_format_notify, +}; + +int __trace_bprintk(unsigned long ip, const char *fmt, ...) +{ + int ret; + va_list ap; + + if (unlikely(!fmt)) + return 0; + + if (!trace_printk_enabled) + return 0; + + va_start(ap, fmt); + ret = trace_vbprintk(ip, fmt, ap); + va_end(ap); + return ret; +} +EXPORT_SYMBOL_GPL(__trace_bprintk); + +int __ftrace_vbprintk(unsigned long ip, const char *fmt, va_list ap) +{ + if (unlikely(!fmt)) + return 0; + + if (!trace_printk_enabled) + return 0; + + return trace_vbprintk(ip, fmt, ap); +} +EXPORT_SYMBOL_GPL(__ftrace_vbprintk); + +int __trace_printk(unsigned long ip, const char *fmt, ...) +{ + int ret; + va_list ap; + + if (!trace_printk_enabled) + return 0; + + va_start(ap, fmt); + ret = trace_vprintk(ip, fmt, ap); + va_end(ap); + return ret; +} +EXPORT_SYMBOL_GPL(__trace_printk); + +int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap) +{ + if (!trace_printk_enabled) + return 0; + + return trace_vprintk(ip, fmt, ap); +} +EXPORT_SYMBOL_GPL(__ftrace_vprintk); + +bool trace_is_tracepoint_string(const char *str) +{ + const char **ptr = __start___tracepoint_str; + + for (ptr = __start___tracepoint_str; ptr < __stop___tracepoint_str; ptr++) { + if (str == *ptr) + return true; + } + return false; +} + +static const char **find_next(void *v, loff_t *pos) +{ + const char **fmt = v; + int start_index; + int last_index; + + start_index = __stop___trace_bprintk_fmt - __start___trace_bprintk_fmt; + + if (*pos < start_index) + return __start___trace_bprintk_fmt + *pos; + + /* + * The __tracepoint_str section is treated the same as the + * __trace_printk_fmt section. The difference is that the + * __trace_printk_fmt section should only be used by trace_printk() + * in a debugging environment, as if anything exists in that section + * the trace_prink() helper buffers are allocated, which would just + * waste space in a production environment. + * + * The __tracepoint_str sections on the other hand are used by + * tracepoints which need to map pointers to their strings to + * the ASCII text for userspace. 
+ */ + last_index = start_index; + start_index = __stop___tracepoint_str - __start___tracepoint_str; + + if (*pos < last_index + start_index) + return __start___tracepoint_str + (*pos - last_index); + + start_index += last_index; + return find_next_mod_format(start_index, v, fmt, pos); +} + +static void * +t_start(struct seq_file *m, loff_t *pos) +{ + format_mod_start(); + return find_next(NULL, pos); +} + +static void *t_next(struct seq_file *m, void * v, loff_t *pos) +{ + (*pos)++; + return find_next(v, pos); +} + +static int t_show(struct seq_file *m, void *v) +{ + const char **fmt = v; + const char *str = *fmt; + int i; + + if (!*fmt) + return 0; + + seq_printf(m, "0x%lx : \"", *(unsigned long *)fmt); + + /* + * Tabs and new lines need to be converted. + */ + for (i = 0; str[i]; i++) { + switch (str[i]) { + case '\n': + seq_puts(m, "\\n"); + break; + case '\t': + seq_puts(m, "\\t"); + break; + case '\\': + seq_putc(m, '\\'); + break; + case '"': + seq_puts(m, "\\\""); + break; + default: + seq_putc(m, str[i]); + } + } + seq_puts(m, "\"\n"); + + return 0; +} + +static void t_stop(struct seq_file *m, void *p) +{ + format_mod_stop(); +} + +static const struct seq_operations show_format_seq_ops = { + .start = t_start, + .next = t_next, + .show = t_show, + .stop = t_stop, +}; + +static int +ftrace_formats_open(struct inode *inode, struct file *file) +{ + int ret; + + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + + return seq_open(file, &show_format_seq_ops); +} + +static const struct file_operations ftrace_formats_fops = { + .open = ftrace_formats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static __init int init_trace_printk_function_export(void) +{ + int ret; + + ret = tracing_init_dentry(); + if (ret) + return 0; + + trace_create_file("printk_formats", TRACE_MODE_READ, NULL, + NULL, &ftrace_formats_fops); + + return 0; +} + +fs_initcall(init_trace_printk_function_export); + +static __init int init_trace_printk(void) +{ + return register_module_notifier(&module_trace_bprintk_format_nb); +} + +early_initcall(init_trace_printk); diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c new file mode 100644 index 0000000000..4dc74d73fc --- /dev/null +++ b/kernel/trace/trace_probe.c @@ -0,0 +1,1980 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Common code for probe-based Dynamic events. 
+ * + * This code was copied from kernel/trace/trace_kprobe.c written by + * Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com> + * + * Updates to make this generic: + * Copyright (C) IBM Corporation, 2010-2011 + * Author: Srikar Dronamraju + */ +#define pr_fmt(fmt) "trace_probe: " fmt + +#include <linux/bpf.h> +#include "trace_btf.h" + +#include "trace_probe.h" + +#undef C +#define C(a, b) b + +static const char *trace_probe_err_text[] = { ERRORS }; + +static const char *reserved_field_names[] = { + "common_type", + "common_flags", + "common_preempt_count", + "common_pid", + "common_tgid", + FIELD_STRING_IP, + FIELD_STRING_RETIP, + FIELD_STRING_FUNC, +}; + +/* Printing in basic type function template */ +#define DEFINE_BASIC_PRINT_TYPE_FUNC(tname, type, fmt) \ +int PRINT_TYPE_FUNC_NAME(tname)(struct trace_seq *s, void *data, void *ent)\ +{ \ + trace_seq_printf(s, fmt, *(type *)data); \ + return !trace_seq_has_overflowed(s); \ +} \ +const char PRINT_TYPE_FMT_NAME(tname)[] = fmt; + +DEFINE_BASIC_PRINT_TYPE_FUNC(u8, u8, "%u") +DEFINE_BASIC_PRINT_TYPE_FUNC(u16, u16, "%u") +DEFINE_BASIC_PRINT_TYPE_FUNC(u32, u32, "%u") +DEFINE_BASIC_PRINT_TYPE_FUNC(u64, u64, "%Lu") +DEFINE_BASIC_PRINT_TYPE_FUNC(s8, s8, "%d") +DEFINE_BASIC_PRINT_TYPE_FUNC(s16, s16, "%d") +DEFINE_BASIC_PRINT_TYPE_FUNC(s32, s32, "%d") +DEFINE_BASIC_PRINT_TYPE_FUNC(s64, s64, "%Ld") +DEFINE_BASIC_PRINT_TYPE_FUNC(x8, u8, "0x%x") +DEFINE_BASIC_PRINT_TYPE_FUNC(x16, u16, "0x%x") +DEFINE_BASIC_PRINT_TYPE_FUNC(x32, u32, "0x%x") +DEFINE_BASIC_PRINT_TYPE_FUNC(x64, u64, "0x%Lx") +DEFINE_BASIC_PRINT_TYPE_FUNC(char, u8, "'%c'") + +int PRINT_TYPE_FUNC_NAME(symbol)(struct trace_seq *s, void *data, void *ent) +{ + trace_seq_printf(s, "%pS", (void *)*(unsigned long *)data); + return !trace_seq_has_overflowed(s); +} +const char PRINT_TYPE_FMT_NAME(symbol)[] = "%pS"; + +/* Print type function for string type */ +int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, void *data, void *ent) +{ + int len = *(u32 *)data >> 16; + + if (!len) + trace_seq_puts(s, FAULT_STRING); + else + trace_seq_printf(s, "\"%s\"", + (const char *)get_loc_data(data, ent)); + return !trace_seq_has_overflowed(s); +} + +const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\""; + +/* Fetch type information table */ +static const struct fetch_type probe_fetch_types[] = { + /* Special types */ + __ASSIGN_FETCH_TYPE("string", string, string, sizeof(u32), 1, 1, + "__data_loc char[]"), + __ASSIGN_FETCH_TYPE("ustring", string, string, sizeof(u32), 1, 1, + "__data_loc char[]"), + __ASSIGN_FETCH_TYPE("symstr", string, string, sizeof(u32), 1, 1, + "__data_loc char[]"), + /* Basic types */ + ASSIGN_FETCH_TYPE(u8, u8, 0), + ASSIGN_FETCH_TYPE(u16, u16, 0), + ASSIGN_FETCH_TYPE(u32, u32, 0), + ASSIGN_FETCH_TYPE(u64, u64, 0), + ASSIGN_FETCH_TYPE(s8, u8, 1), + ASSIGN_FETCH_TYPE(s16, u16, 1), + ASSIGN_FETCH_TYPE(s32, u32, 1), + ASSIGN_FETCH_TYPE(s64, u64, 1), + ASSIGN_FETCH_TYPE_ALIAS(x8, u8, u8, 0), + ASSIGN_FETCH_TYPE_ALIAS(x16, u16, u16, 0), + ASSIGN_FETCH_TYPE_ALIAS(x32, u32, u32, 0), + ASSIGN_FETCH_TYPE_ALIAS(x64, u64, u64, 0), + ASSIGN_FETCH_TYPE_ALIAS(char, u8, u8, 0), + ASSIGN_FETCH_TYPE_ALIAS(symbol, ADDR_FETCH_TYPE, ADDR_FETCH_TYPE, 0), + + ASSIGN_FETCH_TYPE_END +}; + +static const struct fetch_type *find_fetch_type(const char *type, unsigned long flags) +{ + int i; + + /* Reject the symbol/symstr for uprobes */ + if (type && (flags & TPARG_FL_USER) && + (!strcmp(type, "symbol") || !strcmp(type, "symstr"))) + return NULL; + + if (!type) + type = DEFAULT_FETCH_TYPE_STR; + + /* Special 
case: bitfield */ + if (*type == 'b') { + unsigned long bs; + + type = strchr(type, '/'); + if (!type) + goto fail; + + type++; + if (kstrtoul(type, 0, &bs)) + goto fail; + + switch (bs) { + case 8: + return find_fetch_type("u8", flags); + case 16: + return find_fetch_type("u16", flags); + case 32: + return find_fetch_type("u32", flags); + case 64: + return find_fetch_type("u64", flags); + default: + goto fail; + } + } + + for (i = 0; probe_fetch_types[i].name; i++) { + if (strcmp(type, probe_fetch_types[i].name) == 0) + return &probe_fetch_types[i]; + } + +fail: + return NULL; +} + +static struct trace_probe_log trace_probe_log; + +void trace_probe_log_init(const char *subsystem, int argc, const char **argv) +{ + trace_probe_log.subsystem = subsystem; + trace_probe_log.argc = argc; + trace_probe_log.argv = argv; + trace_probe_log.index = 0; +} + +void trace_probe_log_clear(void) +{ + memset(&trace_probe_log, 0, sizeof(trace_probe_log)); +} + +void trace_probe_log_set_index(int index) +{ + trace_probe_log.index = index; +} + +void __trace_probe_log_err(int offset, int err_type) +{ + char *command, *p; + int i, len = 0, pos = 0; + + if (!trace_probe_log.argv) + return; + + /* Recalculate the length and allocate buffer */ + for (i = 0; i < trace_probe_log.argc; i++) { + if (i == trace_probe_log.index) + pos = len; + len += strlen(trace_probe_log.argv[i]) + 1; + } + command = kzalloc(len, GFP_KERNEL); + if (!command) + return; + + if (trace_probe_log.index >= trace_probe_log.argc) { + /** + * Set the error position is next to the last arg + space. + * Note that len includes the terminal null and the cursor + * appears at pos + 1. + */ + pos = len; + offset = 0; + } + + /* And make a command string from argv array */ + p = command; + for (i = 0; i < trace_probe_log.argc; i++) { + len = strlen(trace_probe_log.argv[i]); + strcpy(p, trace_probe_log.argv[i]); + p[len] = ' '; + p += len + 1; + } + *(p - 1) = '\0'; + + tracing_log_err(NULL, trace_probe_log.subsystem, command, + trace_probe_err_text, err_type, pos + offset); + + kfree(command); +} + +/* Split symbol and offset. 
*/ +int traceprobe_split_symbol_offset(char *symbol, long *offset) +{ + char *tmp; + int ret; + + if (!offset) + return -EINVAL; + + tmp = strpbrk(symbol, "+-"); + if (tmp) { + ret = kstrtol(tmp, 0, offset); + if (ret) + return ret; + *tmp = '\0'; + } else + *offset = 0; + + return 0; +} + +/* @buf must has MAX_EVENT_NAME_LEN size */ +int traceprobe_parse_event_name(const char **pevent, const char **pgroup, + char *buf, int offset) +{ + const char *slash, *event = *pevent; + int len; + + slash = strchr(event, '/'); + if (!slash) + slash = strchr(event, '.'); + + if (slash) { + if (slash == event) { + trace_probe_log_err(offset, NO_GROUP_NAME); + return -EINVAL; + } + if (slash - event + 1 > MAX_EVENT_NAME_LEN) { + trace_probe_log_err(offset, GROUP_TOO_LONG); + return -EINVAL; + } + strscpy(buf, event, slash - event + 1); + if (!is_good_system_name(buf)) { + trace_probe_log_err(offset, BAD_GROUP_NAME); + return -EINVAL; + } + *pgroup = buf; + *pevent = slash + 1; + offset += slash - event + 1; + event = *pevent; + } + len = strlen(event); + if (len == 0) { + if (slash) { + *pevent = NULL; + return 0; + } + trace_probe_log_err(offset, NO_EVENT_NAME); + return -EINVAL; + } else if (len > MAX_EVENT_NAME_LEN) { + trace_probe_log_err(offset, EVENT_TOO_LONG); + return -EINVAL; + } + if (!is_good_name(event)) { + trace_probe_log_err(offset, BAD_EVENT_NAME); + return -EINVAL; + } + return 0; +} + +static int parse_trace_event_arg(char *arg, struct fetch_insn *code, + struct traceprobe_parse_context *ctx) +{ + struct ftrace_event_field *field; + struct list_head *head; + + head = trace_get_fields(ctx->event); + list_for_each_entry(field, head, link) { + if (!strcmp(arg, field->name)) { + code->op = FETCH_OP_TP_ARG; + code->data = field; + return 0; + } + } + return -ENOENT; +} + +#ifdef CONFIG_PROBE_EVENTS_BTF_ARGS + +static u32 btf_type_int(const struct btf_type *t) +{ + return *(u32 *)(t + 1); +} + +static bool btf_type_is_char_ptr(struct btf *btf, const struct btf_type *type) +{ + const struct btf_type *real_type; + u32 intdata; + s32 tid; + + real_type = btf_type_skip_modifiers(btf, type->type, &tid); + if (!real_type) + return false; + + if (BTF_INFO_KIND(real_type->info) != BTF_KIND_INT) + return false; + + intdata = btf_type_int(real_type); + return !(BTF_INT_ENCODING(intdata) & BTF_INT_SIGNED) + && BTF_INT_BITS(intdata) == 8; +} + +static bool btf_type_is_char_array(struct btf *btf, const struct btf_type *type) +{ + const struct btf_type *real_type; + const struct btf_array *array; + u32 intdata; + s32 tid; + + if (BTF_INFO_KIND(type->info) != BTF_KIND_ARRAY) + return false; + + array = (const struct btf_array *)(type + 1); + + real_type = btf_type_skip_modifiers(btf, array->type, &tid); + + intdata = btf_type_int(real_type); + return !(BTF_INT_ENCODING(intdata) & BTF_INT_SIGNED) + && BTF_INT_BITS(intdata) == 8; +} + +static int check_prepare_btf_string_fetch(char *typename, + struct fetch_insn **pcode, + struct traceprobe_parse_context *ctx) +{ + struct btf *btf = ctx->btf; + + if (!btf || !ctx->last_type) + return 0; + + /* char [] does not need any change. */ + if (btf_type_is_char_array(btf, ctx->last_type)) + return 0; + + /* char * requires dereference the pointer. 
*/ + if (btf_type_is_char_ptr(btf, ctx->last_type)) { + struct fetch_insn *code = *pcode + 1; + + if (code->op == FETCH_OP_END) { + trace_probe_log_err(ctx->offset, TOO_MANY_OPS); + return -E2BIG; + } + if (typename[0] == 'u') + code->op = FETCH_OP_UDEREF; + else + code->op = FETCH_OP_DEREF; + code->offset = 0; + *pcode = code; + return 0; + } + /* Other types are not available for string */ + trace_probe_log_err(ctx->offset, BAD_TYPE4STR); + return -EINVAL; +} + +static const char *fetch_type_from_btf_type(struct btf *btf, + const struct btf_type *type, + struct traceprobe_parse_context *ctx) +{ + u32 intdata; + + /* TODO: const char * could be converted as a string */ + switch (BTF_INFO_KIND(type->info)) { + case BTF_KIND_ENUM: + /* enum is "int", so convert to "s32" */ + return "s32"; + case BTF_KIND_ENUM64: + return "s64"; + case BTF_KIND_PTR: + /* pointer will be converted to "x??" */ + if (IS_ENABLED(CONFIG_64BIT)) + return "x64"; + else + return "x32"; + case BTF_KIND_INT: + intdata = btf_type_int(type); + if (BTF_INT_ENCODING(intdata) & BTF_INT_SIGNED) { + switch (BTF_INT_BITS(intdata)) { + case 8: + return "s8"; + case 16: + return "s16"; + case 32: + return "s32"; + case 64: + return "s64"; + } + } else { /* unsigned */ + switch (BTF_INT_BITS(intdata)) { + case 8: + return "u8"; + case 16: + return "u16"; + case 32: + return "u32"; + case 64: + return "u64"; + } + /* bitfield, size is encoded in the type */ + ctx->last_bitsize = BTF_INT_BITS(intdata); + ctx->last_bitoffs += BTF_INT_OFFSET(intdata); + return "u64"; + } + } + /* TODO: support other types */ + + return NULL; +} + +static int query_btf_context(struct traceprobe_parse_context *ctx) +{ + const struct btf_param *param; + const struct btf_type *type; + struct btf *btf; + s32 nr; + + if (ctx->btf) + return 0; + + if (!ctx->funcname) + return -EINVAL; + + type = btf_find_func_proto(ctx->funcname, &btf); + if (!type) + return -ENOENT; + + ctx->btf = btf; + ctx->proto = type; + + /* ctx->params is optional, since func(void) will not have params. */ + nr = 0; + param = btf_get_func_param(type, &nr); + if (!IS_ERR_OR_NULL(param)) { + /* Hide the first 'data' argument of tracepoint */ + if (ctx->flags & TPARG_FL_TPOINT) { + nr--; + param++; + } + } + + if (nr > 0) { + ctx->nr_params = nr; + ctx->params = param; + } else { + ctx->nr_params = 0; + ctx->params = NULL; + } + + return 0; +} + +static void clear_btf_context(struct traceprobe_parse_context *ctx) +{ + if (ctx->btf) { + btf_put(ctx->btf); + ctx->btf = NULL; + ctx->proto = NULL; + ctx->params = NULL; + ctx->nr_params = 0; + } +} + +/* Return 1 if the field separater is arrow operator ('->') */ +static int split_next_field(char *varname, char **next_field, + struct traceprobe_parse_context *ctx) +{ + char *field; + int ret = 0; + + field = strpbrk(varname, ".-"); + if (field) { + if (field[0] == '-' && field[1] == '>') { + field[0] = '\0'; + field += 2; + ret = 1; + } else if (field[0] == '.') { + field[0] = '\0'; + field += 1; + } else { + trace_probe_log_err(ctx->offset + field - varname, BAD_HYPHEN); + return -EINVAL; + } + *next_field = field; + } + + return ret; +} + +/* + * Parse the field of data structure. The @type must be a pointer type + * pointing the target data structure type. 
+ */ +static int parse_btf_field(char *fieldname, const struct btf_type *type, + struct fetch_insn **pcode, struct fetch_insn *end, + struct traceprobe_parse_context *ctx) +{ + struct fetch_insn *code = *pcode; + const struct btf_member *field; + u32 bitoffs, anon_offs; + char *next; + int is_ptr; + s32 tid; + + do { + /* Outer loop for solving arrow operator ('->') */ + if (BTF_INFO_KIND(type->info) != BTF_KIND_PTR) { + trace_probe_log_err(ctx->offset, NO_PTR_STRCT); + return -EINVAL; + } + /* Convert a struct pointer type to a struct type */ + type = btf_type_skip_modifiers(ctx->btf, type->type, &tid); + if (!type) { + trace_probe_log_err(ctx->offset, BAD_BTF_TID); + return -EINVAL; + } + + bitoffs = 0; + do { + /* Inner loop for solving dot operator ('.') */ + next = NULL; + is_ptr = split_next_field(fieldname, &next, ctx); + if (is_ptr < 0) + return is_ptr; + + anon_offs = 0; + field = btf_find_struct_member(ctx->btf, type, fieldname, + &anon_offs); + if (!field) { + trace_probe_log_err(ctx->offset, NO_BTF_FIELD); + return -ENOENT; + } + /* Add anonymous structure/union offset */ + bitoffs += anon_offs; + + /* Accumulate the bit-offsets of the dot-connected fields */ + if (btf_type_kflag(type)) { + bitoffs += BTF_MEMBER_BIT_OFFSET(field->offset); + ctx->last_bitsize = BTF_MEMBER_BITFIELD_SIZE(field->offset); + } else { + bitoffs += field->offset; + ctx->last_bitsize = 0; + } + + type = btf_type_skip_modifiers(ctx->btf, field->type, &tid); + if (!type) { + trace_probe_log_err(ctx->offset, BAD_BTF_TID); + return -EINVAL; + } + + ctx->offset += next - fieldname; + fieldname = next; + } while (!is_ptr && fieldname); + + if (++code == end) { + trace_probe_log_err(ctx->offset, TOO_MANY_OPS); + return -EINVAL; + } + code->op = FETCH_OP_DEREF; /* TODO: user deref support */ + code->offset = bitoffs / 8; + *pcode = code; + + ctx->last_bitoffs = bitoffs % 8; + ctx->last_type = type; + } while (fieldname); + + return 0; +} + +static int parse_btf_arg(char *varname, + struct fetch_insn **pcode, struct fetch_insn *end, + struct traceprobe_parse_context *ctx) +{ + struct fetch_insn *code = *pcode; + const struct btf_param *params; + const struct btf_type *type; + char *field = NULL; + int i, is_ptr, ret; + u32 tid; + + if (WARN_ON_ONCE(!ctx->funcname)) + return -EINVAL; + + is_ptr = split_next_field(varname, &field, ctx); + if (is_ptr < 0) + return is_ptr; + if (!is_ptr && field) { + /* dot-connected field on an argument is not supported. 
*/ + trace_probe_log_err(ctx->offset + field - varname, + NOSUP_DAT_ARG); + return -EOPNOTSUPP; + } + + if (ctx->flags & TPARG_FL_RETURN) { + if (strcmp(varname, "$retval") != 0) { + trace_probe_log_err(ctx->offset, NO_BTFARG); + return -ENOENT; + } + code->op = FETCH_OP_RETVAL; + /* Check whether the function return type is not void */ + if (query_btf_context(ctx) == 0) { + if (ctx->proto->type == 0) { + trace_probe_log_err(ctx->offset, NO_RETVAL); + return -ENOENT; + } + tid = ctx->proto->type; + goto found; + } + if (field) { + trace_probe_log_err(ctx->offset + field - varname, + NO_BTF_ENTRY); + return -ENOENT; + } + return 0; + } + + if (!ctx->btf) { + ret = query_btf_context(ctx); + if (ret < 0 || ctx->nr_params == 0) { + trace_probe_log_err(ctx->offset, NO_BTF_ENTRY); + return PTR_ERR(params); + } + } + params = ctx->params; + + for (i = 0; i < ctx->nr_params; i++) { + const char *name = btf_name_by_offset(ctx->btf, params[i].name_off); + + if (name && !strcmp(name, varname)) { + code->op = FETCH_OP_ARG; + if (ctx->flags & TPARG_FL_TPOINT) + code->param = i + 1; + else + code->param = i; + tid = params[i].type; + goto found; + } + } + trace_probe_log_err(ctx->offset, NO_BTFARG); + return -ENOENT; + +found: + type = btf_type_skip_modifiers(ctx->btf, tid, &tid); + if (!type) { + trace_probe_log_err(ctx->offset, BAD_BTF_TID); + return -EINVAL; + } + /* Initialize the last type information */ + ctx->last_type = type; + ctx->last_bitoffs = 0; + ctx->last_bitsize = 0; + if (field) { + ctx->offset += field - varname; + return parse_btf_field(field, type, pcode, end, ctx); + } + return 0; +} + +static const struct fetch_type *find_fetch_type_from_btf_type( + struct traceprobe_parse_context *ctx) +{ + struct btf *btf = ctx->btf; + const char *typestr = NULL; + + if (btf && ctx->last_type) + typestr = fetch_type_from_btf_type(btf, ctx->last_type, ctx); + + return find_fetch_type(typestr, ctx->flags); +} + +static int parse_btf_bitfield(struct fetch_insn **pcode, + struct traceprobe_parse_context *ctx) +{ + struct fetch_insn *code = *pcode; + + if ((ctx->last_bitsize % 8 == 0) && ctx->last_bitoffs == 0) + return 0; + + code++; + if (code->op != FETCH_OP_NOP) { + trace_probe_log_err(ctx->offset, TOO_MANY_OPS); + return -EINVAL; + } + *pcode = code; + + code->op = FETCH_OP_MOD_BF; + code->lshift = 64 - (ctx->last_bitsize + ctx->last_bitoffs); + code->rshift = 64 - ctx->last_bitsize; + code->basesize = 64 / 8; + return 0; +} + +#else +static void clear_btf_context(struct traceprobe_parse_context *ctx) +{ + ctx->btf = NULL; +} + +static int query_btf_context(struct traceprobe_parse_context *ctx) +{ + return -EOPNOTSUPP; +} + +static int parse_btf_arg(char *varname, + struct fetch_insn **pcode, struct fetch_insn *end, + struct traceprobe_parse_context *ctx) +{ + trace_probe_log_err(ctx->offset, NOSUP_BTFARG); + return -EOPNOTSUPP; +} + +static int parse_btf_bitfield(struct fetch_insn **pcode, + struct traceprobe_parse_context *ctx) +{ + trace_probe_log_err(ctx->offset, NOSUP_BTFARG); + return -EOPNOTSUPP; +} + +#define find_fetch_type_from_btf_type(ctx) \ + find_fetch_type(NULL, ctx->flags) + +static int check_prepare_btf_string_fetch(char *typename, + struct fetch_insn **pcode, + struct traceprobe_parse_context *ctx) +{ + return 0; +} + +#endif + +#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) + +/* Parse $vars. 
@orig_arg points '$', which syncs to @ctx->offset */ +static int parse_probe_vars(char *orig_arg, const struct fetch_type *t, + struct fetch_insn **pcode, + struct fetch_insn *end, + struct traceprobe_parse_context *ctx) +{ + struct fetch_insn *code = *pcode; + int err = TP_ERR_BAD_VAR; + char *arg = orig_arg + 1; + unsigned long param; + int ret = 0; + int len; + + if (ctx->flags & TPARG_FL_TEVENT) { + if (code->data) + return -EFAULT; + ret = parse_trace_event_arg(arg, code, ctx); + if (!ret) + return 0; + if (strcmp(arg, "comm") == 0 || strcmp(arg, "COMM") == 0) { + code->op = FETCH_OP_COMM; + return 0; + } + /* backward compatibility */ + ctx->offset = 0; + goto inval; + } + + if (str_has_prefix(arg, "retval")) { + if (!(ctx->flags & TPARG_FL_RETURN)) { + err = TP_ERR_RETVAL_ON_PROBE; + goto inval; + } + if (!(ctx->flags & TPARG_FL_KERNEL) || + !IS_ENABLED(CONFIG_PROBE_EVENTS_BTF_ARGS)) { + code->op = FETCH_OP_RETVAL; + return 0; + } + return parse_btf_arg(orig_arg, pcode, end, ctx); + } + + len = str_has_prefix(arg, "stack"); + if (len) { + + if (arg[len] == '\0') { + code->op = FETCH_OP_STACKP; + return 0; + } + + if (isdigit(arg[len])) { + ret = kstrtoul(arg + len, 10, ¶m); + if (ret) + goto inval; + + if ((ctx->flags & TPARG_FL_KERNEL) && + param > PARAM_MAX_STACK) { + err = TP_ERR_BAD_STACK_NUM; + goto inval; + } + code->op = FETCH_OP_STACK; + code->param = (unsigned int)param; + return 0; + } + goto inval; + } + + if (strcmp(arg, "comm") == 0 || strcmp(arg, "COMM") == 0) { + code->op = FETCH_OP_COMM; + return 0; + } + +#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API + len = str_has_prefix(arg, "arg"); + if (len && tparg_is_function_entry(ctx->flags)) { + ret = kstrtoul(arg + len, 10, ¶m); + if (ret) + goto inval; + + if (!param || param > PARAM_MAX_STACK) { + err = TP_ERR_BAD_ARG_NUM; + goto inval; + } + + code->op = FETCH_OP_ARG; + code->param = (unsigned int)param - 1; + /* + * The tracepoint probe will probe a stub function, and the + * first parameter of the stub is a dummy and should be ignored. 
+ */ + if (ctx->flags & TPARG_FL_TPOINT) + code->param++; + return 0; + } +#endif + +inval: + __trace_probe_log_err(ctx->offset, err); + return -EINVAL; +} + +static int str_to_immediate(char *str, unsigned long *imm) +{ + if (isdigit(str[0])) + return kstrtoul(str, 0, imm); + else if (str[0] == '-') + return kstrtol(str, 0, (long *)imm); + else if (str[0] == '+') + return kstrtol(str + 1, 0, (long *)imm); + return -EINVAL; +} + +static int __parse_imm_string(char *str, char **pbuf, int offs) +{ + size_t len = strlen(str); + + if (str[len - 1] != '"') { + trace_probe_log_err(offs + len, IMMSTR_NO_CLOSE); + return -EINVAL; + } + *pbuf = kstrndup(str, len - 1, GFP_KERNEL); + if (!*pbuf) + return -ENOMEM; + return 0; +} + +/* Recursive argument parser */ +static int +parse_probe_arg(char *arg, const struct fetch_type *type, + struct fetch_insn **pcode, struct fetch_insn *end, + struct traceprobe_parse_context *ctx) +{ + struct fetch_insn *code = *pcode; + unsigned long param; + int deref = FETCH_OP_DEREF; + long offset = 0; + char *tmp; + int ret = 0; + + switch (arg[0]) { + case '$': + ret = parse_probe_vars(arg, type, pcode, end, ctx); + break; + + case '%': /* named register */ + if (ctx->flags & (TPARG_FL_TEVENT | TPARG_FL_FPROBE)) { + /* eprobe and fprobe do not handle registers */ + trace_probe_log_err(ctx->offset, BAD_VAR); + break; + } + ret = regs_query_register_offset(arg + 1); + if (ret >= 0) { + code->op = FETCH_OP_REG; + code->param = (unsigned int)ret; + ret = 0; + } else + trace_probe_log_err(ctx->offset, BAD_REG_NAME); + break; + + case '@': /* memory, file-offset or symbol */ + if (isdigit(arg[1])) { + ret = kstrtoul(arg + 1, 0, ¶m); + if (ret) { + trace_probe_log_err(ctx->offset, BAD_MEM_ADDR); + break; + } + /* load address */ + code->op = FETCH_OP_IMM; + code->immediate = param; + } else if (arg[1] == '+') { + /* kprobes don't support file offsets */ + if (ctx->flags & TPARG_FL_KERNEL) { + trace_probe_log_err(ctx->offset, FILE_ON_KPROBE); + return -EINVAL; + } + ret = kstrtol(arg + 2, 0, &offset); + if (ret) { + trace_probe_log_err(ctx->offset, BAD_FILE_OFFS); + break; + } + + code->op = FETCH_OP_FOFFS; + code->immediate = (unsigned long)offset; // imm64? + } else { + /* uprobes don't support symbols */ + if (!(ctx->flags & TPARG_FL_KERNEL)) { + trace_probe_log_err(ctx->offset, SYM_ON_UPROBE); + return -EINVAL; + } + /* Preserve symbol for updating */ + code->op = FETCH_NOP_SYMBOL; + code->data = kstrdup(arg + 1, GFP_KERNEL); + if (!code->data) + return -ENOMEM; + if (++code == end) { + trace_probe_log_err(ctx->offset, TOO_MANY_OPS); + return -EINVAL; + } + code->op = FETCH_OP_IMM; + code->immediate = 0; + } + /* These are fetching from memory */ + if (++code == end) { + trace_probe_log_err(ctx->offset, TOO_MANY_OPS); + return -EINVAL; + } + *pcode = code; + code->op = FETCH_OP_DEREF; + code->offset = offset; + break; + + case '+': /* deref memory */ + case '-': + if (arg[1] == 'u') { + deref = FETCH_OP_UDEREF; + arg[1] = arg[0]; + arg++; + } + if (arg[0] == '+') + arg++; /* Skip '+', because kstrtol() rejects it. */ + tmp = strchr(arg, '('); + if (!tmp) { + trace_probe_log_err(ctx->offset, DEREF_NEED_BRACE); + return -EINVAL; + } + *tmp = '\0'; + ret = kstrtol(arg, 0, &offset); + if (ret) { + trace_probe_log_err(ctx->offset, BAD_DEREF_OFFS); + break; + } + ctx->offset += (tmp + 1 - arg) + (arg[0] != '-' ? 
1 : 0); + arg = tmp + 1; + tmp = strrchr(arg, ')'); + if (!tmp) { + trace_probe_log_err(ctx->offset + strlen(arg), + DEREF_OPEN_BRACE); + return -EINVAL; + } else { + const struct fetch_type *t2 = find_fetch_type(NULL, ctx->flags); + int cur_offs = ctx->offset; + + *tmp = '\0'; + ret = parse_probe_arg(arg, t2, &code, end, ctx); + if (ret) + break; + ctx->offset = cur_offs; + if (code->op == FETCH_OP_COMM || + code->op == FETCH_OP_DATA) { + trace_probe_log_err(ctx->offset, COMM_CANT_DEREF); + return -EINVAL; + } + if (++code == end) { + trace_probe_log_err(ctx->offset, TOO_MANY_OPS); + return -EINVAL; + } + *pcode = code; + + code->op = deref; + code->offset = offset; + /* Reset the last type if used */ + ctx->last_type = NULL; + } + break; + case '\\': /* Immediate value */ + if (arg[1] == '"') { /* Immediate string */ + ret = __parse_imm_string(arg + 2, &tmp, ctx->offset + 2); + if (ret) + break; + code->op = FETCH_OP_DATA; + code->data = tmp; + } else { + ret = str_to_immediate(arg + 1, &code->immediate); + if (ret) + trace_probe_log_err(ctx->offset + 1, BAD_IMM); + else + code->op = FETCH_OP_IMM; + } + break; + default: + if (isalpha(arg[0]) || arg[0] == '_') { /* BTF variable */ + if (!tparg_is_function_entry(ctx->flags)) { + trace_probe_log_err(ctx->offset, NOSUP_BTFARG); + return -EINVAL; + } + ret = parse_btf_arg(arg, pcode, end, ctx); + break; + } + } + if (!ret && code->op == FETCH_OP_NOP) { + /* Parsed, but do not find fetch method */ + trace_probe_log_err(ctx->offset, BAD_FETCH_ARG); + ret = -EINVAL; + } + return ret; +} + +#define BYTES_TO_BITS(nb) ((BITS_PER_LONG * (nb)) / sizeof(long)) + +/* Bitfield type needs to be parsed into a fetch function */ +static int __parse_bitfield_probe_arg(const char *bf, + const struct fetch_type *t, + struct fetch_insn **pcode) +{ + struct fetch_insn *code = *pcode; + unsigned long bw, bo; + char *tail; + + if (*bf != 'b') + return 0; + + bw = simple_strtoul(bf + 1, &tail, 0); /* Use simple one */ + + if (bw == 0 || *tail != '@') + return -EINVAL; + + bf = tail + 1; + bo = simple_strtoul(bf, &tail, 0); + + if (tail == bf || *tail != '/') + return -EINVAL; + code++; + if (code->op != FETCH_OP_NOP) + return -EINVAL; + *pcode = code; + + code->op = FETCH_OP_MOD_BF; + code->lshift = BYTES_TO_BITS(t->size) - (bw + bo); + code->rshift = BYTES_TO_BITS(t->size) - bw; + code->basesize = t->size; + + return (BYTES_TO_BITS(t->size) < (bw + bo)) ? 
-EINVAL : 0; +} + +/* String length checking wrapper */ +static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, + struct probe_arg *parg, + struct traceprobe_parse_context *ctx) +{ + struct fetch_insn *code, *scode, *tmp = NULL; + char *t, *t2, *t3; + int ret, len; + char *arg; + + arg = kstrdup(argv, GFP_KERNEL); + if (!arg) + return -ENOMEM; + + ret = -EINVAL; + len = strlen(arg); + if (len > MAX_ARGSTR_LEN) { + trace_probe_log_err(ctx->offset, ARG_TOO_LONG); + goto out; + } else if (len == 0) { + trace_probe_log_err(ctx->offset, NO_ARG_BODY); + goto out; + } + + ret = -ENOMEM; + parg->comm = kstrdup(arg, GFP_KERNEL); + if (!parg->comm) + goto out; + + ret = -EINVAL; + t = strchr(arg, ':'); + if (t) { + *t = '\0'; + t2 = strchr(++t, '['); + if (t2) { + *t2++ = '\0'; + t3 = strchr(t2, ']'); + if (!t3) { + int offs = t2 + strlen(t2) - arg; + + trace_probe_log_err(ctx->offset + offs, + ARRAY_NO_CLOSE); + goto out; + } else if (t3[1] != '\0') { + trace_probe_log_err(ctx->offset + t3 + 1 - arg, + BAD_ARRAY_SUFFIX); + goto out; + } + *t3 = '\0'; + if (kstrtouint(t2, 0, &parg->count) || !parg->count) { + trace_probe_log_err(ctx->offset + t2 - arg, + BAD_ARRAY_NUM); + goto out; + } + if (parg->count > MAX_ARRAY_LEN) { + trace_probe_log_err(ctx->offset + t2 - arg, + ARRAY_TOO_BIG); + goto out; + } + } + } + + /* + * Since $comm and immediate string can not be dereferenced, + * we can find those by strcmp. But ignore for eprobes. + */ + if (!(ctx->flags & TPARG_FL_TEVENT) && + (strcmp(arg, "$comm") == 0 || strcmp(arg, "$COMM") == 0 || + strncmp(arg, "\\\"", 2) == 0)) { + /* The type of $comm must be "string", and not an array. */ + if (parg->count || (t && strcmp(t, "string"))) + goto out; + parg->type = find_fetch_type("string", ctx->flags); + } else + parg->type = find_fetch_type(t, ctx->flags); + if (!parg->type) { + trace_probe_log_err(ctx->offset + (t ? (t - arg) : 0), BAD_TYPE); + goto out; + } + parg->offset = *size; + *size += parg->type->size * (parg->count ?: 1); + + ret = -ENOMEM; + if (parg->count) { + len = strlen(parg->type->fmttype) + 6; + parg->fmt = kmalloc(len, GFP_KERNEL); + if (!parg->fmt) + goto out; + snprintf(parg->fmt, len, "%s[%d]", parg->type->fmttype, + parg->count); + } + + code = tmp = kcalloc(FETCH_INSN_MAX, sizeof(*code), GFP_KERNEL); + if (!code) + goto out; + code[FETCH_INSN_MAX - 1].op = FETCH_OP_END; + + ctx->last_type = NULL; + ret = parse_probe_arg(arg, parg->type, &code, &code[FETCH_INSN_MAX - 1], + ctx); + if (ret) + goto fail; + + /* Update storing type if BTF is available */ + if (IS_ENABLED(CONFIG_PROBE_EVENTS_BTF_ARGS) && + ctx->last_type) { + if (!t) { + parg->type = find_fetch_type_from_btf_type(ctx); + } else if (strstr(t, "string")) { + ret = check_prepare_btf_string_fetch(t, &code, ctx); + if (ret) + goto fail; + } + } + + ret = -EINVAL; + /* Store operation */ + if (parg->type->is_string) { + if (!strcmp(parg->type->name, "symstr")) { + if (code->op != FETCH_OP_REG && code->op != FETCH_OP_STACK && + code->op != FETCH_OP_RETVAL && code->op != FETCH_OP_ARG && + code->op != FETCH_OP_DEREF && code->op != FETCH_OP_TP_ARG) { + trace_probe_log_err(ctx->offset + (t ? (t - arg) : 0), + BAD_SYMSTRING); + goto fail; + } + } else { + if (code->op != FETCH_OP_DEREF && code->op != FETCH_OP_UDEREF && + code->op != FETCH_OP_IMM && code->op != FETCH_OP_COMM && + code->op != FETCH_OP_DATA && code->op != FETCH_OP_TP_ARG) { + trace_probe_log_err(ctx->offset + (t ? 
(t - arg) : 0), + BAD_STRING); + goto fail; + } + } + if (!strcmp(parg->type->name, "symstr") || + (code->op == FETCH_OP_IMM || code->op == FETCH_OP_COMM || + code->op == FETCH_OP_DATA) || code->op == FETCH_OP_TP_ARG || + parg->count) { + /* + * IMM, DATA and COMM is pointing actual address, those + * must be kept, and if parg->count != 0, this is an + * array of string pointers instead of string address + * itself. + * For the symstr, it doesn't need to dereference, thus + * it just get the value. + */ + code++; + if (code->op != FETCH_OP_NOP) { + trace_probe_log_err(ctx->offset, TOO_MANY_OPS); + goto fail; + } + } + /* If op == DEREF, replace it with STRING */ + if (!strcmp(parg->type->name, "ustring") || + code->op == FETCH_OP_UDEREF) + code->op = FETCH_OP_ST_USTRING; + else if (!strcmp(parg->type->name, "symstr")) + code->op = FETCH_OP_ST_SYMSTR; + else + code->op = FETCH_OP_ST_STRING; + code->size = parg->type->size; + parg->dynamic = true; + } else if (code->op == FETCH_OP_DEREF) { + code->op = FETCH_OP_ST_MEM; + code->size = parg->type->size; + } else if (code->op == FETCH_OP_UDEREF) { + code->op = FETCH_OP_ST_UMEM; + code->size = parg->type->size; + } else { + code++; + if (code->op != FETCH_OP_NOP) { + trace_probe_log_err(ctx->offset, TOO_MANY_OPS); + goto fail; + } + code->op = FETCH_OP_ST_RAW; + code->size = parg->type->size; + } + scode = code; + /* Modify operation */ + if (t != NULL) { + ret = __parse_bitfield_probe_arg(t, parg->type, &code); + if (ret) { + trace_probe_log_err(ctx->offset + t - arg, BAD_BITFIELD); + goto fail; + } + } else if (IS_ENABLED(CONFIG_PROBE_EVENTS_BTF_ARGS) && + ctx->last_type) { + ret = parse_btf_bitfield(&code, ctx); + if (ret) + goto fail; + } + ret = -EINVAL; + /* Loop(Array) operation */ + if (parg->count) { + if (scode->op != FETCH_OP_ST_MEM && + scode->op != FETCH_OP_ST_STRING && + scode->op != FETCH_OP_ST_USTRING) { + trace_probe_log_err(ctx->offset + (t ? (t - arg) : 0), + BAD_STRING); + goto fail; + } + code++; + if (code->op != FETCH_OP_NOP) { + trace_probe_log_err(ctx->offset, TOO_MANY_OPS); + goto fail; + } + code->op = FETCH_OP_LP_ARRAY; + code->param = parg->count; + } + code++; + code->op = FETCH_OP_END; + + ret = 0; + /* Shrink down the code buffer */ + parg->code = kcalloc(code - tmp + 1, sizeof(*code), GFP_KERNEL); + if (!parg->code) + ret = -ENOMEM; + else + memcpy(parg->code, tmp, sizeof(*code) * (code - tmp + 1)); + +fail: + if (ret) { + for (code = tmp; code < tmp + FETCH_INSN_MAX; code++) + if (code->op == FETCH_NOP_SYMBOL || + code->op == FETCH_OP_DATA) + kfree(code->data); + } + kfree(tmp); +out: + kfree(arg); + + return ret; +} + +/* Return 1 if name is reserved or already used by another argument */ +static int traceprobe_conflict_field_name(const char *name, + struct probe_arg *args, int narg) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(reserved_field_names); i++) + if (strcmp(reserved_field_names[i], name) == 0) + return 1; + + for (i = 0; i < narg; i++) + if (strcmp(args[i].name, name) == 0) + return 1; + + return 0; +} + +static char *generate_probe_arg_name(const char *arg, int idx) +{ + char *name = NULL; + const char *end; + + /* + * If argument name is omitted, try arg as a name (BTF variable) + * or "argN". 
+ */ + if (IS_ENABLED(CONFIG_PROBE_EVENTS_BTF_ARGS)) { + end = strchr(arg, ':'); + if (!end) + end = arg + strlen(arg); + + name = kmemdup_nul(arg, end - arg, GFP_KERNEL); + if (!name || !is_good_name(name)) { + kfree(name); + name = NULL; + } + } + + if (!name) + name = kasprintf(GFP_KERNEL, "arg%d", idx + 1); + + return name; +} + +int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, const char *arg, + struct traceprobe_parse_context *ctx) +{ + struct probe_arg *parg = &tp->args[i]; + const char *body; + + /* Increment count for freeing args in error case */ + tp->nr_args++; + + body = strchr(arg, '='); + if (body) { + if (body - arg > MAX_ARG_NAME_LEN) { + trace_probe_log_err(0, ARG_NAME_TOO_LONG); + return -EINVAL; + } else if (body == arg) { + trace_probe_log_err(0, NO_ARG_NAME); + return -EINVAL; + } + parg->name = kmemdup_nul(arg, body - arg, GFP_KERNEL); + body++; + } else { + parg->name = generate_probe_arg_name(arg, i); + body = arg; + } + if (!parg->name) + return -ENOMEM; + + if (!is_good_name(parg->name)) { + trace_probe_log_err(0, BAD_ARG_NAME); + return -EINVAL; + } + if (traceprobe_conflict_field_name(parg->name, tp->args, i)) { + trace_probe_log_err(0, USED_ARG_NAME); + return -EINVAL; + } + ctx->offset = body - arg; + /* Parse fetch argument */ + return traceprobe_parse_probe_arg_body(body, &tp->size, parg, ctx); +} + +void traceprobe_free_probe_arg(struct probe_arg *arg) +{ + struct fetch_insn *code = arg->code; + + while (code && code->op != FETCH_OP_END) { + if (code->op == FETCH_NOP_SYMBOL || + code->op == FETCH_OP_DATA) + kfree(code->data); + code++; + } + kfree(arg->code); + kfree(arg->name); + kfree(arg->comm); + kfree(arg->fmt); +} + +static int argv_has_var_arg(int argc, const char *argv[], int *args_idx, + struct traceprobe_parse_context *ctx) +{ + int i, found = 0; + + for (i = 0; i < argc; i++) + if (str_has_prefix(argv[i], "$arg")) { + trace_probe_log_set_index(i + 2); + + if (!tparg_is_function_entry(ctx->flags)) { + trace_probe_log_err(0, NOFENTRY_ARGS); + return -EINVAL; + } + + if (isdigit(argv[i][4])) { + found = 1; + continue; + } + + if (argv[i][4] != '*') { + trace_probe_log_err(0, BAD_VAR); + return -EINVAL; + } + + if (*args_idx >= 0 && *args_idx < argc) { + trace_probe_log_err(0, DOUBLE_ARGS); + return -EINVAL; + } + found = 1; + *args_idx = i; + } + + return found; +} + +static int sprint_nth_btf_arg(int idx, const char *type, + char *buf, int bufsize, + struct traceprobe_parse_context *ctx) +{ + const char *name; + int ret; + + if (idx >= ctx->nr_params) { + trace_probe_log_err(0, NO_BTFARG); + return -ENOENT; + } + name = btf_name_by_offset(ctx->btf, ctx->params[idx].name_off); + if (!name) { + trace_probe_log_err(0, NO_BTF_ENTRY); + return -ENOENT; + } + ret = snprintf(buf, bufsize, "%s%s", name, type); + if (ret >= bufsize) { + trace_probe_log_err(0, ARGS_2LONG); + return -E2BIG; + } + return ret; +} + +/* Return new_argv which must be freed after use */ +const char **traceprobe_expand_meta_args(int argc, const char *argv[], + int *new_argc, char *buf, int bufsize, + struct traceprobe_parse_context *ctx) +{ + const struct btf_param *params = NULL; + int i, j, n, used, ret, args_idx = -1; + const char **new_argv = NULL; + + ret = argv_has_var_arg(argc, argv, &args_idx, ctx); + if (ret < 0) + return ERR_PTR(ret); + + if (!ret) { + *new_argc = argc; + return NULL; + } + + ret = query_btf_context(ctx); + if (ret < 0 || ctx->nr_params == 0) { + if (args_idx != -1) { + /* $arg* requires BTF info */ + trace_probe_log_err(0, NOSUP_BTFARG); + 
return (const char **)params; + } + *new_argc = argc; + return NULL; + } + + if (args_idx >= 0) + *new_argc = argc + ctx->nr_params - 1; + else + *new_argc = argc; + + new_argv = kcalloc(*new_argc, sizeof(char *), GFP_KERNEL); + if (!new_argv) + return ERR_PTR(-ENOMEM); + + used = 0; + for (i = 0, j = 0; i < argc; i++) { + trace_probe_log_set_index(i + 2); + if (i == args_idx) { + for (n = 0; n < ctx->nr_params; n++) { + ret = sprint_nth_btf_arg(n, "", buf + used, + bufsize - used, ctx); + if (ret < 0) + goto error; + + new_argv[j++] = buf + used; + used += ret + 1; + } + continue; + } + + if (str_has_prefix(argv[i], "$arg")) { + char *type = NULL; + + n = simple_strtoul(argv[i] + 4, &type, 10); + if (type && !(*type == ':' || *type == '\0')) { + trace_probe_log_err(0, BAD_VAR); + ret = -ENOENT; + goto error; + } + /* Note: $argN starts from $arg1 */ + ret = sprint_nth_btf_arg(n - 1, type, buf + used, + bufsize - used, ctx); + if (ret < 0) + goto error; + new_argv[j++] = buf + used; + used += ret + 1; + } else + new_argv[j++] = argv[i]; + } + + return new_argv; + +error: + kfree(new_argv); + return ERR_PTR(ret); +} + +void traceprobe_finish_parse(struct traceprobe_parse_context *ctx) +{ + clear_btf_context(ctx); +} + +int traceprobe_update_arg(struct probe_arg *arg) +{ + struct fetch_insn *code = arg->code; + long offset; + char *tmp; + char c; + int ret = 0; + + while (code && code->op != FETCH_OP_END) { + if (code->op == FETCH_NOP_SYMBOL) { + if (code[1].op != FETCH_OP_IMM) + return -EINVAL; + + tmp = strpbrk(code->data, "+-"); + if (tmp) + c = *tmp; + ret = traceprobe_split_symbol_offset(code->data, + &offset); + if (ret) + return ret; + + code[1].immediate = + (unsigned long)kallsyms_lookup_name(code->data); + if (tmp) + *tmp = c; + if (!code[1].immediate) + return -ENOENT; + code[1].immediate += offset; + } + code++; + } + return 0; +} + +/* When len=0, we just calculate the needed length */ +#define LEN_OR_ZERO (len ? 
len - pos : 0) +static int __set_print_fmt(struct trace_probe *tp, char *buf, int len, + enum probe_print_type ptype) +{ + struct probe_arg *parg; + int i, j; + int pos = 0; + const char *fmt, *arg; + + switch (ptype) { + case PROBE_PRINT_NORMAL: + fmt = "(%lx)"; + arg = ", REC->" FIELD_STRING_IP; + break; + case PROBE_PRINT_RETURN: + fmt = "(%lx <- %lx)"; + arg = ", REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP; + break; + case PROBE_PRINT_EVENT: + fmt = ""; + arg = ""; + break; + default: + WARN_ON_ONCE(1); + return 0; + } + + pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt); + + for (i = 0; i < tp->nr_args; i++) { + parg = tp->args + i; + pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=", parg->name); + if (parg->count) { + pos += snprintf(buf + pos, LEN_OR_ZERO, "{%s", + parg->type->fmt); + for (j = 1; j < parg->count; j++) + pos += snprintf(buf + pos, LEN_OR_ZERO, ",%s", + parg->type->fmt); + pos += snprintf(buf + pos, LEN_OR_ZERO, "}"); + } else + pos += snprintf(buf + pos, LEN_OR_ZERO, "%s", + parg->type->fmt); + } + + pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", arg); + + for (i = 0; i < tp->nr_args; i++) { + parg = tp->args + i; + if (parg->count) { + if (parg->type->is_string) + fmt = ", __get_str(%s[%d])"; + else + fmt = ", REC->%s[%d]"; + for (j = 0; j < parg->count; j++) + pos += snprintf(buf + pos, LEN_OR_ZERO, + fmt, parg->name, j); + } else { + if (parg->type->is_string) + fmt = ", __get_str(%s)"; + else + fmt = ", REC->%s"; + pos += snprintf(buf + pos, LEN_OR_ZERO, + fmt, parg->name); + } + } + + /* return the length of print_fmt */ + return pos; +} +#undef LEN_OR_ZERO + +int traceprobe_set_print_fmt(struct trace_probe *tp, enum probe_print_type ptype) +{ + struct trace_event_call *call = trace_probe_event_call(tp); + int len; + char *print_fmt; + + /* First: called with 0 length to calculate the needed length */ + len = __set_print_fmt(tp, NULL, 0, ptype); + print_fmt = kmalloc(len + 1, GFP_KERNEL); + if (!print_fmt) + return -ENOMEM; + + /* Second: actually write the @print_fmt */ + __set_print_fmt(tp, print_fmt, len + 1, ptype); + call->print_fmt = print_fmt; + + return 0; +} + +int traceprobe_define_arg_fields(struct trace_event_call *event_call, + size_t offset, struct trace_probe *tp) +{ + int ret, i; + + /* Set argument names as fields */ + for (i = 0; i < tp->nr_args; i++) { + struct probe_arg *parg = &tp->args[i]; + const char *fmt = parg->type->fmttype; + int size = parg->type->size; + + if (parg->fmt) + fmt = parg->fmt; + if (parg->count) + size *= parg->count; + ret = trace_define_field(event_call, fmt, parg->name, + offset + parg->offset, size, + parg->type->is_signed, + FILTER_OTHER); + if (ret) + return ret; + } + return 0; +} + +static void trace_probe_event_free(struct trace_probe_event *tpe) +{ + kfree(tpe->class.system); + kfree(tpe->call.name); + kfree(tpe->call.print_fmt); + kfree(tpe); +} + +int trace_probe_append(struct trace_probe *tp, struct trace_probe *to) +{ + if (trace_probe_has_sibling(tp)) + return -EBUSY; + + list_del_init(&tp->list); + trace_probe_event_free(tp->event); + + tp->event = to->event; + list_add_tail(&tp->list, trace_probe_probe_list(to)); + + return 0; +} + +void trace_probe_unlink(struct trace_probe *tp) +{ + list_del_init(&tp->list); + if (list_empty(trace_probe_probe_list(tp))) + trace_probe_event_free(tp->event); + tp->event = NULL; +} + +void trace_probe_cleanup(struct trace_probe *tp) +{ + int i; + + for (i = 0; i < tp->nr_args; i++) + traceprobe_free_probe_arg(&tp->args[i]); + + if (tp->event) + 
trace_probe_unlink(tp); +} + +int trace_probe_init(struct trace_probe *tp, const char *event, + const char *group, bool alloc_filter) +{ + struct trace_event_call *call; + size_t size = sizeof(struct trace_probe_event); + int ret = 0; + + if (!event || !group) + return -EINVAL; + + if (alloc_filter) + size += sizeof(struct trace_uprobe_filter); + + tp->event = kzalloc(size, GFP_KERNEL); + if (!tp->event) + return -ENOMEM; + + INIT_LIST_HEAD(&tp->event->files); + INIT_LIST_HEAD(&tp->event->class.fields); + INIT_LIST_HEAD(&tp->event->probes); + INIT_LIST_HEAD(&tp->list); + list_add(&tp->list, &tp->event->probes); + + call = trace_probe_event_call(tp); + call->class = &tp->event->class; + call->name = kstrdup(event, GFP_KERNEL); + if (!call->name) { + ret = -ENOMEM; + goto error; + } + + tp->event->class.system = kstrdup(group, GFP_KERNEL); + if (!tp->event->class.system) { + ret = -ENOMEM; + goto error; + } + + return 0; + +error: + trace_probe_cleanup(tp); + return ret; +} + +static struct trace_event_call * +find_trace_event_call(const char *system, const char *event_name) +{ + struct trace_event_call *tp_event; + const char *name; + + list_for_each_entry(tp_event, &ftrace_events, list) { + if (!tp_event->class->system || + strcmp(system, tp_event->class->system)) + continue; + name = trace_event_name(tp_event); + if (!name || strcmp(event_name, name)) + continue; + return tp_event; + } + + return NULL; +} + +int trace_probe_register_event_call(struct trace_probe *tp) +{ + struct trace_event_call *call = trace_probe_event_call(tp); + int ret; + + lockdep_assert_held(&event_mutex); + + if (find_trace_event_call(trace_probe_group_name(tp), + trace_probe_name(tp))) + return -EEXIST; + + ret = register_trace_event(&call->event); + if (!ret) + return -ENODEV; + + ret = trace_add_event_call(call); + if (ret) + unregister_trace_event(&call->event); + + return ret; +} + +int trace_probe_add_file(struct trace_probe *tp, struct trace_event_file *file) +{ + struct event_file_link *link; + + link = kmalloc(sizeof(*link), GFP_KERNEL); + if (!link) + return -ENOMEM; + + link->file = file; + INIT_LIST_HEAD(&link->list); + list_add_tail_rcu(&link->list, &tp->event->files); + trace_probe_set_flag(tp, TP_FLAG_TRACE); + return 0; +} + +struct event_file_link *trace_probe_get_file_link(struct trace_probe *tp, + struct trace_event_file *file) +{ + struct event_file_link *link; + + trace_probe_for_each_link(link, tp) { + if (link->file == file) + return link; + } + + return NULL; +} + +int trace_probe_remove_file(struct trace_probe *tp, + struct trace_event_file *file) +{ + struct event_file_link *link; + + link = trace_probe_get_file_link(tp, file); + if (!link) + return -ENOENT; + + list_del_rcu(&link->list); + kvfree_rcu_mightsleep(link); + + if (list_empty(&tp->event->files)) + trace_probe_clear_flag(tp, TP_FLAG_TRACE); + + return 0; +} + +/* + * Return the smallest index of different type argument (start from 1). + * If all argument types and name are same, return 0. 
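+ * e.g. if the second argument of one probe is "u32" and that of the other
+ * is "u64", this returns 2.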
+ */ +int trace_probe_compare_arg_type(struct trace_probe *a, struct trace_probe *b) +{ + int i; + + /* In case of more arguments */ + if (a->nr_args < b->nr_args) + return a->nr_args + 1; + if (a->nr_args > b->nr_args) + return b->nr_args + 1; + + for (i = 0; i < a->nr_args; i++) { + if ((b->nr_args <= i) || + ((a->args[i].type != b->args[i].type) || + (a->args[i].count != b->args[i].count) || + strcmp(a->args[i].name, b->args[i].name))) + return i + 1; + } + + return 0; +} + +bool trace_probe_match_command_args(struct trace_probe *tp, + int argc, const char **argv) +{ + char buf[MAX_ARGSTR_LEN + 1]; + int i; + + if (tp->nr_args < argc) + return false; + + for (i = 0; i < argc; i++) { + snprintf(buf, sizeof(buf), "%s=%s", + tp->args[i].name, tp->args[i].comm); + if (strcmp(buf, argv[i])) + return false; + } + return true; +} + +int trace_probe_create(const char *raw_command, int (*createfn)(int, const char **)) +{ + int argc = 0, ret = 0; + char **argv; + + argv = argv_split(GFP_KERNEL, raw_command, &argc); + if (!argv) + return -ENOMEM; + + if (argc) + ret = createfn(argc, (const char **)argv); + + argv_free(argv); + + return ret; +} + +int trace_probe_print_args(struct trace_seq *s, struct probe_arg *args, int nr_args, + u8 *data, void *field) +{ + void *p; + int i, j; + + for (i = 0; i < nr_args; i++) { + struct probe_arg *a = args + i; + + trace_seq_printf(s, " %s=", a->name); + if (likely(!a->count)) { + if (!a->type->print(s, data + a->offset, field)) + return -ENOMEM; + continue; + } + trace_seq_putc(s, '{'); + p = data + a->offset; + for (j = 0; j < a->count; j++) { + if (!a->type->print(s, p, field)) + return -ENOMEM; + trace_seq_putc(s, j == a->count - 1 ? '}' : ','); + p += a->type->size; + } + } + return 0; +} diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h new file mode 100644 index 0000000000..850d9ecb67 --- /dev/null +++ b/kernel/trace/trace_probe.h @@ -0,0 +1,546 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Common header file for probe-based Dynamic events. 
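+ * Shared by the kprobe and uprobe based dynamic events, among others.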
+ * + * This code was copied from kernel/trace/trace_kprobe.h written by + * Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com> + * + * Updates to make this generic: + * Copyright (C) IBM Corporation, 2010-2011 + * Author: Srikar Dronamraju + */ + +#include <linux/seq_file.h> +#include <linux/slab.h> +#include <linux/smp.h> +#include <linux/tracefs.h> +#include <linux/types.h> +#include <linux/string.h> +#include <linux/ptrace.h> +#include <linux/perf_event.h> +#include <linux/kprobes.h> +#include <linux/stringify.h> +#include <linux/limits.h> +#include <linux/uaccess.h> +#include <linux/bitops.h> +#include <linux/btf.h> +#include <asm/bitsperlong.h> + +#include "trace.h" +#include "trace_output.h" + +#define MAX_TRACE_ARGS 128 +#define MAX_ARGSTR_LEN 63 +#define MAX_ARRAY_LEN 64 +#define MAX_ARG_NAME_LEN 32 +#define MAX_BTF_ARGS_LEN 128 +#define MAX_STRING_SIZE PATH_MAX +#define MAX_ARG_BUF_LEN (MAX_TRACE_ARGS * MAX_ARG_NAME_LEN) + +/* Reserved field names */ +#define FIELD_STRING_IP "__probe_ip" +#define FIELD_STRING_RETIP "__probe_ret_ip" +#define FIELD_STRING_FUNC "__probe_func" + +#undef DEFINE_FIELD +#define DEFINE_FIELD(type, item, name, is_signed) \ + do { \ + ret = trace_define_field(event_call, #type, name, \ + offsetof(typeof(field), item), \ + sizeof(field.item), is_signed, \ + FILTER_OTHER); \ + if (ret) \ + return ret; \ + } while (0) + + +/* Flags for trace_probe */ +#define TP_FLAG_TRACE 1 +#define TP_FLAG_PROFILE 2 + +/* data_loc: data location, compatible with u32 */ +#define make_data_loc(len, offs) \ + (((u32)(len) << 16) | ((u32)(offs) & 0xffff)) +#define get_loc_len(dl) ((u32)(dl) >> 16) +#define get_loc_offs(dl) ((u32)(dl) & 0xffff) + +static nokprobe_inline void *get_loc_data(u32 *dl, void *ent) +{ + return (u8 *)ent + get_loc_offs(*dl); +} + +static nokprobe_inline u32 update_data_loc(u32 loc, int consumed) +{ + u32 maxlen = get_loc_len(loc); + u32 offset = get_loc_offs(loc); + + return make_data_loc(maxlen - consumed, offset + consumed); +} + +/* Printing function type */ +typedef int (*print_type_func_t)(struct trace_seq *, void *, void *); + +enum fetch_op { + FETCH_OP_NOP = 0, + // Stage 1 (load) ops + FETCH_OP_REG, /* Register : .param = offset */ + FETCH_OP_STACK, /* Stack : .param = index */ + FETCH_OP_STACKP, /* Stack pointer */ + FETCH_OP_RETVAL, /* Return value */ + FETCH_OP_IMM, /* Immediate : .immediate */ + FETCH_OP_COMM, /* Current comm */ + FETCH_OP_ARG, /* Function argument : .param */ + FETCH_OP_FOFFS, /* File offset: .immediate */ + FETCH_OP_DATA, /* Allocated data: .data */ + // Stage 2 (dereference) op + FETCH_OP_DEREF, /* Dereference: .offset */ + FETCH_OP_UDEREF, /* User-space Dereference: .offset */ + // Stage 3 (store) ops + FETCH_OP_ST_RAW, /* Raw: .size */ + FETCH_OP_ST_MEM, /* Mem: .offset, .size */ + FETCH_OP_ST_UMEM, /* Mem: .offset, .size */ + FETCH_OP_ST_STRING, /* String: .offset, .size */ + FETCH_OP_ST_USTRING, /* User String: .offset, .size */ + FETCH_OP_ST_SYMSTR, /* Kernel Symbol String: .offset, .size */ + // Stage 4 (modify) op + FETCH_OP_MOD_BF, /* Bitfield: .basesize, .lshift, .rshift */ + // Stage 5 (loop) op + FETCH_OP_LP_ARRAY, /* Array: .param = loop count */ + FETCH_OP_TP_ARG, /* Trace Point argument */ + FETCH_OP_END, + FETCH_NOP_SYMBOL, /* Unresolved Symbol holder */ +}; + +struct fetch_insn { + enum fetch_op op; + union { + unsigned int param; + struct { + unsigned int size; + int offset; + }; + struct { + unsigned char basesize; + unsigned char lshift; + unsigned char rshift; + }; + unsigned long immediate; + void 
*data; + }; +}; + +/* fetch + deref*N + store + mod + end <= 16, this allows N=12, enough */ +#define FETCH_INSN_MAX 16 +#define FETCH_TOKEN_COMM (-ECOMM) + +/* Fetch type information table */ +struct fetch_type { + const char *name; /* Name of type */ + size_t size; /* Byte size of type */ + bool is_signed; /* Signed flag */ + bool is_string; /* String flag */ + print_type_func_t print; /* Print functions */ + const char *fmt; /* Format string */ + const char *fmttype; /* Name in format file */ +}; + +/* For defining macros, define string/string_size types */ +typedef u32 string; +typedef u32 string_size; + +#define PRINT_TYPE_FUNC_NAME(type) print_type_##type +#define PRINT_TYPE_FMT_NAME(type) print_type_format_##type + +/* Printing in basic type function template */ +#define DECLARE_BASIC_PRINT_TYPE_FUNC(type) \ +int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, void *data, void *ent);\ +extern const char PRINT_TYPE_FMT_NAME(type)[] + +DECLARE_BASIC_PRINT_TYPE_FUNC(u8); +DECLARE_BASIC_PRINT_TYPE_FUNC(u16); +DECLARE_BASIC_PRINT_TYPE_FUNC(u32); +DECLARE_BASIC_PRINT_TYPE_FUNC(u64); +DECLARE_BASIC_PRINT_TYPE_FUNC(s8); +DECLARE_BASIC_PRINT_TYPE_FUNC(s16); +DECLARE_BASIC_PRINT_TYPE_FUNC(s32); +DECLARE_BASIC_PRINT_TYPE_FUNC(s64); +DECLARE_BASIC_PRINT_TYPE_FUNC(x8); +DECLARE_BASIC_PRINT_TYPE_FUNC(x16); +DECLARE_BASIC_PRINT_TYPE_FUNC(x32); +DECLARE_BASIC_PRINT_TYPE_FUNC(x64); + +DECLARE_BASIC_PRINT_TYPE_FUNC(char); +DECLARE_BASIC_PRINT_TYPE_FUNC(string); +DECLARE_BASIC_PRINT_TYPE_FUNC(symbol); + +/* Default (unsigned long) fetch type */ +#define __DEFAULT_FETCH_TYPE(t) x##t +#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) +#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG) +#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE) + +#define __ADDR_FETCH_TYPE(t) u##t +#define _ADDR_FETCH_TYPE(t) __ADDR_FETCH_TYPE(t) +#define ADDR_FETCH_TYPE _ADDR_FETCH_TYPE(BITS_PER_LONG) + +#define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, str, _fmttype) \ + {.name = _name, \ + .size = _size, \ + .is_signed = (bool)sign, \ + .is_string = (bool)str, \ + .print = PRINT_TYPE_FUNC_NAME(ptype), \ + .fmt = PRINT_TYPE_FMT_NAME(ptype), \ + .fmttype = _fmttype, \ + } + +/* Non string types can use these macros */ +#define _ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \ + __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, 0, #_fmttype) +#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \ + _ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, ptype) + +/* If ptype is an alias of atype, use this macro (show atype in format) */ +#define ASSIGN_FETCH_TYPE_ALIAS(ptype, atype, ftype, sign) \ + _ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, atype) + +#define ASSIGN_FETCH_TYPE_END {} +#define MAX_ARRAY_LEN 64 + +#ifdef CONFIG_KPROBE_EVENTS +bool trace_kprobe_on_func_entry(struct trace_event_call *call); +bool trace_kprobe_error_injectable(struct trace_event_call *call); +#else +static inline bool trace_kprobe_on_func_entry(struct trace_event_call *call) +{ + return false; +} + +static inline bool trace_kprobe_error_injectable(struct trace_event_call *call) +{ + return false; +} +#endif /* CONFIG_KPROBE_EVENTS */ + +struct probe_arg { + struct fetch_insn *code; + bool dynamic;/* Dynamic array (string) is used */ + unsigned int offset; /* Offset from argument entry */ + unsigned int count; /* Array count */ + const char *name; /* Name of this argument */ + const char *comm; /* Command of this argument */ + char *fmt; /* Format string if needed */ + const 
struct fetch_type *type; /* Type of this argument */ +}; + +struct trace_uprobe_filter { + rwlock_t rwlock; + int nr_systemwide; + struct list_head perf_events; +}; + +/* Event call and class holder */ +struct trace_probe_event { + unsigned int flags; /* For TP_FLAG_* */ + struct trace_event_class class; + struct trace_event_call call; + struct list_head files; + struct list_head probes; + struct trace_uprobe_filter filter[]; +}; + +struct trace_probe { + struct list_head list; + struct trace_probe_event *event; + ssize_t size; /* trace entry size */ + unsigned int nr_args; + struct probe_arg args[]; +}; + +struct event_file_link { + struct trace_event_file *file; + struct list_head list; +}; + +static inline bool trace_probe_test_flag(struct trace_probe *tp, + unsigned int flag) +{ + return !!(tp->event->flags & flag); +} + +static inline void trace_probe_set_flag(struct trace_probe *tp, + unsigned int flag) +{ + tp->event->flags |= flag; +} + +static inline void trace_probe_clear_flag(struct trace_probe *tp, + unsigned int flag) +{ + tp->event->flags &= ~flag; +} + +static inline bool trace_probe_is_enabled(struct trace_probe *tp) +{ + return trace_probe_test_flag(tp, TP_FLAG_TRACE | TP_FLAG_PROFILE); +} + +static inline const char *trace_probe_name(struct trace_probe *tp) +{ + return trace_event_name(&tp->event->call); +} + +static inline const char *trace_probe_group_name(struct trace_probe *tp) +{ + return tp->event->call.class->system; +} + +static inline struct trace_event_call * + trace_probe_event_call(struct trace_probe *tp) +{ + return &tp->event->call; +} + +static inline struct trace_probe_event * +trace_probe_event_from_call(struct trace_event_call *event_call) +{ + return container_of(event_call, struct trace_probe_event, call); +} + +static inline struct trace_probe * +trace_probe_primary_from_call(struct trace_event_call *call) +{ + struct trace_probe_event *tpe = trace_probe_event_from_call(call); + + return list_first_entry_or_null(&tpe->probes, struct trace_probe, list); +} + +static inline struct list_head *trace_probe_probe_list(struct trace_probe *tp) +{ + return &tp->event->probes; +} + +static inline bool trace_probe_has_sibling(struct trace_probe *tp) +{ + struct list_head *list = trace_probe_probe_list(tp); + + return !list_empty(list) && !list_is_singular(list); +} + +static inline int trace_probe_unregister_event_call(struct trace_probe *tp) +{ + /* tp->event is unregistered in trace_remove_event_call() */ + return trace_remove_event_call(&tp->event->call); +} + +static inline bool trace_probe_has_single_file(struct trace_probe *tp) +{ + return !!list_is_singular(&tp->event->files); +} + +int trace_probe_init(struct trace_probe *tp, const char *event, + const char *group, bool alloc_filter); +void trace_probe_cleanup(struct trace_probe *tp); +int trace_probe_append(struct trace_probe *tp, struct trace_probe *to); +void trace_probe_unlink(struct trace_probe *tp); +int trace_probe_register_event_call(struct trace_probe *tp); +int trace_probe_add_file(struct trace_probe *tp, struct trace_event_file *file); +int trace_probe_remove_file(struct trace_probe *tp, + struct trace_event_file *file); +struct event_file_link *trace_probe_get_file_link(struct trace_probe *tp, + struct trace_event_file *file); +int trace_probe_compare_arg_type(struct trace_probe *a, struct trace_probe *b); +bool trace_probe_match_command_args(struct trace_probe *tp, + int argc, const char **argv); +int trace_probe_create(const char *raw_command, int (*createfn)(int, const char **)); +int 
trace_probe_print_args(struct trace_seq *s, struct probe_arg *args, int nr_args, + u8 *data, void *field); + +#define trace_probe_for_each_link(pos, tp) \ + list_for_each_entry(pos, &(tp)->event->files, list) +#define trace_probe_for_each_link_rcu(pos, tp) \ + list_for_each_entry_rcu(pos, &(tp)->event->files, list) + +/* + * The flags used for parsing trace_probe arguments. + * TPARG_FL_RETURN, TPARG_FL_FENTRY and TPARG_FL_TEVENT are mutually exclusive. + * TPARG_FL_KERNEL and TPARG_FL_USER are also mutually exclusive. + * TPARG_FL_FPROBE and TPARG_FL_TPOINT are optional but it should be with + * TPARG_FL_KERNEL. + */ +#define TPARG_FL_RETURN BIT(0) +#define TPARG_FL_KERNEL BIT(1) +#define TPARG_FL_FENTRY BIT(2) +#define TPARG_FL_TEVENT BIT(3) +#define TPARG_FL_USER BIT(4) +#define TPARG_FL_FPROBE BIT(5) +#define TPARG_FL_TPOINT BIT(6) +#define TPARG_FL_LOC_MASK GENMASK(4, 0) + +static inline bool tparg_is_function_entry(unsigned int flags) +{ + return (flags & TPARG_FL_LOC_MASK) == (TPARG_FL_KERNEL | TPARG_FL_FENTRY); +} + +struct traceprobe_parse_context { + struct trace_event_call *event; + /* BTF related parameters */ + const char *funcname; /* Function name in BTF */ + const struct btf_type *proto; /* Prototype of the function */ + const struct btf_param *params; /* Parameter of the function */ + s32 nr_params; /* The number of the parameters */ + struct btf *btf; /* The BTF to be used */ + const struct btf_type *last_type; /* Saved type */ + u32 last_bitoffs; /* Saved bitoffs */ + u32 last_bitsize; /* Saved bitsize */ + unsigned int flags; + int offset; +}; + +extern int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, + const char *argv, + struct traceprobe_parse_context *ctx); +const char **traceprobe_expand_meta_args(int argc, const char *argv[], + int *new_argc, char *buf, int bufsize, + struct traceprobe_parse_context *ctx); + +extern int traceprobe_update_arg(struct probe_arg *arg); +extern void traceprobe_free_probe_arg(struct probe_arg *arg); + +/* + * If either traceprobe_parse_probe_arg() or traceprobe_expand_meta_args() is called, + * this MUST be called for clean up the context and return a resource. 
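+ * It currently just drops the BTF context cached while parsing (see
+ * clear_btf_context()).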
+ */ +void traceprobe_finish_parse(struct traceprobe_parse_context *ctx); + +extern int traceprobe_split_symbol_offset(char *symbol, long *offset); +int traceprobe_parse_event_name(const char **pevent, const char **pgroup, + char *buf, int offset); + +enum probe_print_type { + PROBE_PRINT_NORMAL, + PROBE_PRINT_RETURN, + PROBE_PRINT_EVENT, +}; + +extern int traceprobe_set_print_fmt(struct trace_probe *tp, enum probe_print_type ptype); + +#ifdef CONFIG_PERF_EVENTS +extern struct trace_event_call * +create_local_trace_kprobe(char *func, void *addr, unsigned long offs, + bool is_return); +extern void destroy_local_trace_kprobe(struct trace_event_call *event_call); + +extern struct trace_event_call * +create_local_trace_uprobe(char *name, unsigned long offs, + unsigned long ref_ctr_offset, bool is_return); +extern void destroy_local_trace_uprobe(struct trace_event_call *event_call); +#endif +extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, + size_t offset, struct trace_probe *tp); + +#undef ERRORS +#define ERRORS \ + C(FILE_NOT_FOUND, "Failed to find the given file"), \ + C(NO_REGULAR_FILE, "Not a regular file"), \ + C(BAD_REFCNT, "Invalid reference counter offset"), \ + C(REFCNT_OPEN_BRACE, "Reference counter brace is not closed"), \ + C(BAD_REFCNT_SUFFIX, "Reference counter has wrong suffix"), \ + C(BAD_UPROBE_OFFS, "Invalid uprobe offset"), \ + C(BAD_MAXACT_TYPE, "Maxactive is only for function exit"), \ + C(BAD_MAXACT, "Invalid maxactive number"), \ + C(MAXACT_TOO_BIG, "Maxactive is too big"), \ + C(BAD_PROBE_ADDR, "Invalid probed address or symbol"), \ + C(NON_UNIQ_SYMBOL, "The symbol is not unique"), \ + C(BAD_RETPROBE, "Retprobe address must be an function entry"), \ + C(NO_TRACEPOINT, "Tracepoint is not found"), \ + C(BAD_ADDR_SUFFIX, "Invalid probed address suffix"), \ + C(NO_GROUP_NAME, "Group name is not specified"), \ + C(GROUP_TOO_LONG, "Group name is too long"), \ + C(BAD_GROUP_NAME, "Group name must follow the same rules as C identifiers"), \ + C(NO_EVENT_NAME, "Event name is not specified"), \ + C(EVENT_TOO_LONG, "Event name is too long"), \ + C(BAD_EVENT_NAME, "Event name must follow the same rules as C identifiers"), \ + C(EVENT_EXIST, "Given group/event name is already used by another event"), \ + C(RETVAL_ON_PROBE, "$retval is not available on probe"), \ + C(NO_RETVAL, "This function returns 'void' type"), \ + C(BAD_STACK_NUM, "Invalid stack number"), \ + C(BAD_ARG_NUM, "Invalid argument number"), \ + C(BAD_VAR, "Invalid $-valiable specified"), \ + C(BAD_REG_NAME, "Invalid register name"), \ + C(BAD_MEM_ADDR, "Invalid memory address"), \ + C(BAD_IMM, "Invalid immediate value"), \ + C(IMMSTR_NO_CLOSE, "String is not closed with '\"'"), \ + C(FILE_ON_KPROBE, "File offset is not available with kprobe"), \ + C(BAD_FILE_OFFS, "Invalid file offset value"), \ + C(SYM_ON_UPROBE, "Symbol is not available with uprobe"), \ + C(TOO_MANY_OPS, "Dereference is too much nested"), \ + C(DEREF_NEED_BRACE, "Dereference needs a brace"), \ + C(BAD_DEREF_OFFS, "Invalid dereference offset"), \ + C(DEREF_OPEN_BRACE, "Dereference brace is not closed"), \ + C(COMM_CANT_DEREF, "$comm can not be dereferenced"), \ + C(BAD_FETCH_ARG, "Invalid fetch argument"), \ + C(ARRAY_NO_CLOSE, "Array is not closed"), \ + C(BAD_ARRAY_SUFFIX, "Array has wrong suffix"), \ + C(BAD_ARRAY_NUM, "Invalid array size"), \ + C(ARRAY_TOO_BIG, "Array number is too big"), \ + C(BAD_TYPE, "Unknown type is specified"), \ + C(BAD_STRING, "String accepts only memory argument"), \ + C(BAD_SYMSTRING, "Symbol 
String doesn't accept data/userdata"), \ + C(BAD_BITFIELD, "Invalid bitfield"), \ + C(ARG_NAME_TOO_LONG, "Argument name is too long"), \ + C(NO_ARG_NAME, "Argument name is not specified"), \ + C(BAD_ARG_NAME, "Argument name must follow the same rules as C identifiers"), \ + C(USED_ARG_NAME, "This argument name is already used"), \ + C(ARG_TOO_LONG, "Argument expression is too long"), \ + C(NO_ARG_BODY, "No argument expression"), \ + C(BAD_INSN_BNDRY, "Probe point is not an instruction boundary"),\ + C(FAIL_REG_PROBE, "Failed to register probe event"),\ + C(DIFF_PROBE_TYPE, "Probe type is different from existing probe"),\ + C(DIFF_ARG_TYPE, "Argument type or name is different from existing probe"),\ + C(SAME_PROBE, "There is already the exact same probe event"),\ + C(NO_EVENT_INFO, "This requires both group and event name to attach"),\ + C(BAD_ATTACH_EVENT, "Attached event does not exist"),\ + C(BAD_ATTACH_ARG, "Attached event does not have this field"),\ + C(NO_EP_FILTER, "No filter rule after 'if'"), \ + C(NOSUP_BTFARG, "BTF is not available or not supported"), \ + C(NO_BTFARG, "This variable is not found at this probe point"),\ + C(NO_BTF_ENTRY, "No BTF entry for this probe point"), \ + C(BAD_VAR_ARGS, "$arg* must be an independent parameter without name etc."),\ + C(NOFENTRY_ARGS, "$arg* can be used only on function entry"), \ + C(DOUBLE_ARGS, "$arg* can be used only once in the parameters"), \ + C(ARGS_2LONG, "$arg* failed because the argument list is too long"), \ + C(ARGIDX_2BIG, "$argN index is too big"), \ + C(NO_PTR_STRCT, "This is not a pointer to union/structure."), \ + C(NOSUP_DAT_ARG, "Non pointer structure/union argument is not supported."),\ + C(BAD_HYPHEN, "Failed to parse single hyphen. Forgot '>'?"), \ + C(NO_BTF_FIELD, "This field is not found."), \ + C(BAD_BTF_TID, "Failed to get BTF type info."),\ + C(BAD_TYPE4STR, "This type does not fit for string."), + +#undef C +#define C(a, b) TP_ERR_##a + +/* Define TP_ERR_ */ +enum { ERRORS }; + +/* Error text is defined in trace_probe.c */ + +struct trace_probe_log { + const char *subsystem; + const char **argv; + int argc; + int index; +}; + +void trace_probe_log_init(const char *subsystem, int argc, const char **argv); +void trace_probe_log_set_index(int index); +void trace_probe_log_clear(void); +void __trace_probe_log_err(int offset, int err); + +#define trace_probe_log_err(offs, err) \ + __trace_probe_log_err(offs, TP_ERR_##err) + +struct uprobe_dispatch_data { + struct trace_uprobe *tu; + unsigned long bp_addr; +}; diff --git a/kernel/trace/trace_probe_kernel.h b/kernel/trace/trace_probe_kernel.h new file mode 100644 index 0000000000..bb723eefd7 --- /dev/null +++ b/kernel/trace/trace_probe_kernel.h @@ -0,0 +1,119 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __TRACE_PROBE_KERNEL_H_ +#define __TRACE_PROBE_KERNEL_H_ + +/* + * This depends on trace_probe.h, but can not include it due to + * the way trace_probe_tmpl.h is used by trace_kprobe.c and trace_eprobe.c. + * Which means that any other user must include trace_probe.h before including + * this file. 
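+ * e.g.:
+ *	#include "trace_probe.h"
+ *	#include "trace_probe_kernel.h"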
+ */ +/* Return the length of string -- including null terminal byte */ +static nokprobe_inline int +fetch_store_strlen_user(unsigned long addr) +{ + const void __user *uaddr = (__force const void __user *)addr; + + return strnlen_user_nofault(uaddr, MAX_STRING_SIZE); +} + +/* Return the length of string -- including null terminal byte */ +static nokprobe_inline int +fetch_store_strlen(unsigned long addr) +{ + int ret, len = 0; + u8 c; + +#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE + if (addr < TASK_SIZE) + return fetch_store_strlen_user(addr); +#endif + + do { + ret = copy_from_kernel_nofault(&c, (u8 *)addr + len, 1); + len++; + } while (c && ret == 0 && len < MAX_STRING_SIZE); + + return (ret < 0) ? ret : len; +} + +static nokprobe_inline void set_data_loc(int ret, void *dest, void *__dest, void *base) +{ + if (ret < 0) + ret = 0; + *(u32 *)dest = make_data_loc(ret, __dest - base); +} + +/* + * Fetch a null-terminated string from user. Caller MUST set *(u32 *)buf + * with max length and relative data location. + */ +static nokprobe_inline int +fetch_store_string_user(unsigned long addr, void *dest, void *base) +{ + const void __user *uaddr = (__force const void __user *)addr; + int maxlen = get_loc_len(*(u32 *)dest); + void *__dest; + long ret; + + if (unlikely(!maxlen)) + return -ENOMEM; + + __dest = get_loc_data(dest, base); + + ret = strncpy_from_user_nofault(__dest, uaddr, maxlen); + set_data_loc(ret, dest, __dest, base); + + return ret; +} + +/* + * Fetch a null-terminated string. Caller MUST set *(u32 *)buf with max + * length and relative data location. + */ +static nokprobe_inline int +fetch_store_string(unsigned long addr, void *dest, void *base) +{ + int maxlen = get_loc_len(*(u32 *)dest); + void *__dest; + long ret; + +#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE + if ((unsigned long)addr < TASK_SIZE) + return fetch_store_string_user(addr, dest, base); +#endif + + if (unlikely(!maxlen)) + return -ENOMEM; + + __dest = get_loc_data(dest, base); + + /* + * Try to get string again, since the string can be changed while + * probing. 
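+	 * (The earlier fetch_store_strlen() pass only sized the buffer; this
+	 * copy is what ends up in the trace record.)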
+ */ + ret = strncpy_from_kernel_nofault(__dest, (void *)addr, maxlen); + set_data_loc(ret, dest, __dest, base); + + return ret; +} + +static nokprobe_inline int +probe_mem_read_user(void *dest, void *src, size_t size) +{ + const void __user *uaddr = (__force const void __user *)src; + + return copy_from_user_nofault(dest, uaddr, size); +} + +static nokprobe_inline int +probe_mem_read(void *dest, void *src, size_t size) +{ +#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE + if ((unsigned long)src < TASK_SIZE) + return probe_mem_read_user(dest, src, size); +#endif + return copy_from_kernel_nofault(dest, src, size); +} + +#endif /* __TRACE_PROBE_KERNEL_H_ */ diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h new file mode 100644 index 0000000000..3935b347f8 --- /dev/null +++ b/kernel/trace/trace_probe_tmpl.h @@ -0,0 +1,275 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Traceprobe fetch helper inlines + */ + +static nokprobe_inline void +fetch_store_raw(unsigned long val, struct fetch_insn *code, void *buf) +{ + switch (code->size) { + case 1: + *(u8 *)buf = (u8)val; + break; + case 2: + *(u16 *)buf = (u16)val; + break; + case 4: + *(u32 *)buf = (u32)val; + break; + case 8: + //TBD: 32bit signed + *(u64 *)buf = (u64)val; + break; + default: + *(unsigned long *)buf = val; + } +} + +static nokprobe_inline void +fetch_apply_bitfield(struct fetch_insn *code, void *buf) +{ + switch (code->basesize) { + case 1: + *(u8 *)buf <<= code->lshift; + *(u8 *)buf >>= code->rshift; + break; + case 2: + *(u16 *)buf <<= code->lshift; + *(u16 *)buf >>= code->rshift; + break; + case 4: + *(u32 *)buf <<= code->lshift; + *(u32 *)buf >>= code->rshift; + break; + case 8: + *(u64 *)buf <<= code->lshift; + *(u64 *)buf >>= code->rshift; + break; + } +} + +/* + * These functions must be defined for each callsite. + * Return consumed dynamic data size (>= 0), or error (< 0). + * If dest is NULL, don't store result and return required dynamic data size. + */ +static int +process_fetch_insn(struct fetch_insn *code, void *rec, + void *dest, void *base); +static nokprobe_inline int fetch_store_strlen(unsigned long addr); +static nokprobe_inline int +fetch_store_string(unsigned long addr, void *dest, void *base); +static nokprobe_inline int fetch_store_strlen_user(unsigned long addr); +static nokprobe_inline int +fetch_store_string_user(unsigned long addr, void *dest, void *base); +static nokprobe_inline int +probe_mem_read(void *dest, void *src, size_t size); +static nokprobe_inline int +probe_mem_read_user(void *dest, void *src, size_t size); + +static nokprobe_inline int +fetch_store_symstrlen(unsigned long addr) +{ + char namebuf[KSYM_SYMBOL_LEN]; + int ret; + + ret = sprint_symbol(namebuf, addr); + if (ret < 0) + return 0; + + return ret + 1; +} + +/* + * Fetch a null-terminated symbol string + offset. Caller MUST set *(u32 *)buf + * with max length and relative data location. 
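+ * (store_trace_args() sets up that header with make_data_loc(maxlen, offs)
+ * before the fetch instructions run.)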
+ */ +static nokprobe_inline int +fetch_store_symstring(unsigned long addr, void *dest, void *base) +{ + int maxlen = get_loc_len(*(u32 *)dest); + void *__dest; + + if (unlikely(!maxlen)) + return -ENOMEM; + + __dest = get_loc_data(dest, base); + + return sprint_symbol(__dest, addr); +} + +/* common part of process_fetch_insn*/ +static nokprobe_inline int +process_common_fetch_insn(struct fetch_insn *code, unsigned long *val) +{ + switch (code->op) { + case FETCH_OP_IMM: + *val = code->immediate; + break; + case FETCH_OP_COMM: + *val = (unsigned long)current->comm; + break; + case FETCH_OP_DATA: + *val = (unsigned long)code->data; + break; + default: + return -EILSEQ; + } + return 0; +} + +/* From the 2nd stage, routine is same */ +static nokprobe_inline int +process_fetch_insn_bottom(struct fetch_insn *code, unsigned long val, + void *dest, void *base) +{ + struct fetch_insn *s3 = NULL; + int total = 0, ret = 0, i = 0; + u32 loc = 0; + unsigned long lval = val; + +stage2: + /* 2nd stage: dereference memory if needed */ + do { + if (code->op == FETCH_OP_DEREF) { + lval = val; + ret = probe_mem_read(&val, (void *)val + code->offset, + sizeof(val)); + } else if (code->op == FETCH_OP_UDEREF) { + lval = val; + ret = probe_mem_read_user(&val, + (void *)val + code->offset, sizeof(val)); + } else + break; + if (ret) + return ret; + code++; + } while (1); + + s3 = code; +stage3: + /* 3rd stage: store value to buffer */ + if (unlikely(!dest)) { + switch (code->op) { + case FETCH_OP_ST_STRING: + ret = fetch_store_strlen(val + code->offset); + code++; + goto array; + case FETCH_OP_ST_USTRING: + ret = fetch_store_strlen_user(val + code->offset); + code++; + goto array; + case FETCH_OP_ST_SYMSTR: + ret = fetch_store_symstrlen(val + code->offset); + code++; + goto array; + default: + return -EILSEQ; + } + } + + switch (code->op) { + case FETCH_OP_ST_RAW: + fetch_store_raw(val, code, dest); + break; + case FETCH_OP_ST_MEM: + probe_mem_read(dest, (void *)val + code->offset, code->size); + break; + case FETCH_OP_ST_UMEM: + probe_mem_read_user(dest, (void *)val + code->offset, code->size); + break; + case FETCH_OP_ST_STRING: + loc = *(u32 *)dest; + ret = fetch_store_string(val + code->offset, dest, base); + break; + case FETCH_OP_ST_USTRING: + loc = *(u32 *)dest; + ret = fetch_store_string_user(val + code->offset, dest, base); + break; + case FETCH_OP_ST_SYMSTR: + loc = *(u32 *)dest; + ret = fetch_store_symstring(val + code->offset, dest, base); + break; + default: + return -EILSEQ; + } + code++; + + /* 4th stage: modify stored value if needed */ + if (code->op == FETCH_OP_MOD_BF) { + fetch_apply_bitfield(code, dest); + code++; + } + +array: + /* the last stage: Loop on array */ + if (code->op == FETCH_OP_LP_ARRAY) { + if (ret < 0) + ret = 0; + total += ret; + if (++i < code->param) { + code = s3; + if (s3->op != FETCH_OP_ST_STRING && + s3->op != FETCH_OP_ST_USTRING) { + dest += s3->size; + val += s3->size; + goto stage3; + } + code--; + val = lval + sizeof(char *); + if (dest) { + dest += sizeof(u32); + *(u32 *)dest = update_data_loc(loc, ret); + } + goto stage2; + } + code++; + ret = total; + } + + return code->op == FETCH_OP_END ? 
ret : -EILSEQ; +} + +/* Sum up total data length for dynamic arrays (strings) */ +static nokprobe_inline int +__get_data_size(struct trace_probe *tp, struct pt_regs *regs) +{ + struct probe_arg *arg; + int i, len, ret = 0; + + for (i = 0; i < tp->nr_args; i++) { + arg = tp->args + i; + if (unlikely(arg->dynamic)) { + len = process_fetch_insn(arg->code, regs, NULL, NULL); + if (len > 0) + ret += len; + } + } + + return ret; +} + +/* Store the value of each argument */ +static nokprobe_inline void +store_trace_args(void *data, struct trace_probe *tp, void *rec, + int header_size, int maxlen) +{ + struct probe_arg *arg; + void *base = data - header_size; + void *dyndata = data + tp->size; + u32 *dl; /* Data location */ + int ret, i; + + for (i = 0; i < tp->nr_args; i++) { + arg = tp->args + i; + dl = data + arg->offset; + /* Point the dynamic data area if needed */ + if (unlikely(arg->dynamic)) + *dl = make_data_loc(maxlen, dyndata - base); + ret = process_fetch_insn(arg->code, rec, dl, base); + if (arg->dynamic && likely(ret > 0)) { + dyndata += ret; + maxlen -= ret; + } + } +} diff --git a/kernel/trace/trace_recursion_record.c b/kernel/trace/trace_recursion_record.c new file mode 100644 index 0000000000..a520b11afb --- /dev/null +++ b/kernel/trace/trace_recursion_record.c @@ -0,0 +1,233 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/seq_file.h> +#include <linux/kallsyms.h> +#include <linux/module.h> +#include <linux/ftrace.h> +#include <linux/fs.h> + +#include "trace_output.h" + +struct recursed_functions { + unsigned long ip; + unsigned long parent_ip; +}; + +static struct recursed_functions recursed_functions[CONFIG_FTRACE_RECORD_RECURSION_SIZE]; +static atomic_t nr_records; + +/* + * Cache the last found function. Yes, updates to this is racey, but + * so is memory cache ;-) + */ +static unsigned long cached_function; + +void ftrace_record_recursion(unsigned long ip, unsigned long parent_ip) +{ + int index = 0; + int i; + unsigned long old; + + again: + /* First check the last one recorded */ + if (ip == cached_function) + return; + + i = atomic_read(&nr_records); + /* nr_records is -1 when clearing records */ + smp_mb__after_atomic(); + if (i < 0) + return; + + /* + * If there's two writers and this writer comes in second, + * the cmpxchg() below to update the ip will fail. Then this + * writer will try again. It is possible that index will now + * be greater than nr_records. This is because the writer + * that succeeded has not updated the nr_records yet. + * This writer could keep trying again until the other writer + * updates nr_records. But if the other writer takes an + * interrupt, and that interrupt locks up that CPU, we do + * not want this CPU to lock up due to the recursion protection, + * and have a bug report showing this CPU as the cause of + * locking up the computer. To not lose this record, this + * writer will simply use the next position to update the + * recursed_functions, and it will update the nr_records + * accordingly. + */ + if (index < i) + index = i; + if (index >= CONFIG_FTRACE_RECORD_RECURSION_SIZE) + return; + + for (i = index - 1; i >= 0; i--) { + if (recursed_functions[i].ip == ip) { + cached_function = ip; + return; + } + } + + cached_function = ip; + + /* + * We only want to add a function if it hasn't been added before. + * Add to the current location before incrementing the count. + * If it fails to add, then increment the index (save in i) + * and try again. 
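+	 * A free slot still holds a zero ip, which is what the cmpxchg() below
+	 * compares against when claiming it.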
+ */ + old = cmpxchg(&recursed_functions[index].ip, 0, ip); + if (old != 0) { + /* Did something else already added this for us? */ + if (old == ip) + return; + /* Try the next location (use i for the next index) */ + index++; + goto again; + } + + recursed_functions[index].parent_ip = parent_ip; + + /* + * It's still possible that we could race with the clearing + * CPU0 CPU1 + * ---- ---- + * ip = func + * nr_records = -1; + * recursed_functions[0] = 0; + * i = -1 + * if (i < 0) + * nr_records = 0; + * (new recursion detected) + * recursed_functions[0] = func + * cmpxchg(recursed_functions[0], + * func, 0) + * + * But the worse that could happen is that we get a zero in + * the recursed_functions array, and it's likely that "func" will + * be recorded again. + */ + i = atomic_read(&nr_records); + smp_mb__after_atomic(); + if (i < 0) + cmpxchg(&recursed_functions[index].ip, ip, 0); + else if (i <= index) + atomic_cmpxchg(&nr_records, i, index + 1); +} +EXPORT_SYMBOL_GPL(ftrace_record_recursion); + +static DEFINE_MUTEX(recursed_function_lock); +static struct trace_seq *tseq; + +static void *recursed_function_seq_start(struct seq_file *m, loff_t *pos) +{ + void *ret = NULL; + int index; + + mutex_lock(&recursed_function_lock); + index = atomic_read(&nr_records); + if (*pos < index) { + ret = &recursed_functions[*pos]; + } + + tseq = kzalloc(sizeof(*tseq), GFP_KERNEL); + if (!tseq) + return ERR_PTR(-ENOMEM); + + trace_seq_init(tseq); + + return ret; +} + +static void *recursed_function_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + int index; + int p; + + index = atomic_read(&nr_records); + p = ++(*pos); + + return p < index ? &recursed_functions[p] : NULL; +} + +static void recursed_function_seq_stop(struct seq_file *m, void *v) +{ + kfree(tseq); + mutex_unlock(&recursed_function_lock); +} + +static int recursed_function_seq_show(struct seq_file *m, void *v) +{ + struct recursed_functions *record = v; + int ret = 0; + + if (record) { + trace_seq_print_sym(tseq, record->parent_ip, true); + trace_seq_puts(tseq, ":\t"); + trace_seq_print_sym(tseq, record->ip, true); + trace_seq_putc(tseq, '\n'); + ret = trace_print_seq(m, tseq); + } + + return ret; +} + +static const struct seq_operations recursed_function_seq_ops = { + .start = recursed_function_seq_start, + .next = recursed_function_seq_next, + .stop = recursed_function_seq_stop, + .show = recursed_function_seq_show +}; + +static int recursed_function_open(struct inode *inode, struct file *file) +{ + int ret = 0; + + mutex_lock(&recursed_function_lock); + /* If this file was opened for write, then erase contents */ + if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { + /* disable updating records */ + atomic_set(&nr_records, -1); + smp_mb__after_atomic(); + memset(recursed_functions, 0, sizeof(recursed_functions)); + smp_wmb(); + /* enable them again */ + atomic_set(&nr_records, 0); + } + if (file->f_mode & FMODE_READ) + ret = seq_open(file, &recursed_function_seq_ops); + mutex_unlock(&recursed_function_lock); + + return ret; +} + +static ssize_t recursed_function_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *ppos) +{ + return count; +} + +static int recursed_function_release(struct inode *inode, struct file *file) +{ + if (file->f_mode & FMODE_READ) + seq_release(inode, file); + return 0; +} + +static const struct file_operations recursed_functions_fops = { + .open = recursed_function_open, + .write = recursed_function_write, + .read = seq_read, + .llseek = seq_lseek, + .release = 
recursed_function_release, +}; + +__init static int create_recursed_functions(void) +{ + + trace_create_file("recursed_functions", TRACE_MODE_WRITE, + NULL, NULL, &recursed_functions_fops); + return 0; +} + +fs_initcall(create_recursed_functions); diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c new file mode 100644 index 0000000000..c9ffdcfe62 --- /dev/null +++ b/kernel/trace/trace_sched_switch.c @@ -0,0 +1,150 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * trace context switch + * + * Copyright (C) 2007 Steven Rostedt <srostedt@redhat.com> + * + */ +#include <linux/module.h> +#include <linux/kallsyms.h> +#include <linux/uaccess.h> +#include <linux/ftrace.h> +#include <trace/events/sched.h> + +#include "trace.h" + +#define RECORD_CMDLINE 1 +#define RECORD_TGID 2 + +static int sched_cmdline_ref; +static int sched_tgid_ref; +static DEFINE_MUTEX(sched_register_mutex); + +static void +probe_sched_switch(void *ignore, bool preempt, + struct task_struct *prev, struct task_struct *next, + unsigned int prev_state) +{ + int flags; + + flags = (RECORD_TGID * !!sched_tgid_ref) + + (RECORD_CMDLINE * !!sched_cmdline_ref); + + if (!flags) + return; + tracing_record_taskinfo_sched_switch(prev, next, flags); +} + +static void +probe_sched_wakeup(void *ignore, struct task_struct *wakee) +{ + int flags; + + flags = (RECORD_TGID * !!sched_tgid_ref) + + (RECORD_CMDLINE * !!sched_cmdline_ref); + + if (!flags) + return; + tracing_record_taskinfo_sched_switch(current, wakee, flags); +} + +static int tracing_sched_register(void) +{ + int ret; + + ret = register_trace_sched_wakeup(probe_sched_wakeup, NULL); + if (ret) { + pr_info("wakeup trace: Couldn't activate tracepoint" + " probe to kernel_sched_wakeup\n"); + return ret; + } + + ret = register_trace_sched_wakeup_new(probe_sched_wakeup, NULL); + if (ret) { + pr_info("wakeup trace: Couldn't activate tracepoint" + " probe to kernel_sched_wakeup_new\n"); + goto fail_deprobe; + } + + ret = register_trace_sched_switch(probe_sched_switch, NULL); + if (ret) { + pr_info("sched trace: Couldn't activate tracepoint" + " probe to kernel_sched_switch\n"); + goto fail_deprobe_wake_new; + } + + return ret; +fail_deprobe_wake_new: + unregister_trace_sched_wakeup_new(probe_sched_wakeup, NULL); +fail_deprobe: + unregister_trace_sched_wakeup(probe_sched_wakeup, NULL); + return ret; +} + +static void tracing_sched_unregister(void) +{ + unregister_trace_sched_switch(probe_sched_switch, NULL); + unregister_trace_sched_wakeup_new(probe_sched_wakeup, NULL); + unregister_trace_sched_wakeup(probe_sched_wakeup, NULL); +} + +static void tracing_start_sched_switch(int ops) +{ + bool sched_register; + + mutex_lock(&sched_register_mutex); + sched_register = (!sched_cmdline_ref && !sched_tgid_ref); + + switch (ops) { + case RECORD_CMDLINE: + sched_cmdline_ref++; + break; + + case RECORD_TGID: + sched_tgid_ref++; + break; + } + + if (sched_register && (sched_cmdline_ref || sched_tgid_ref)) + tracing_sched_register(); + mutex_unlock(&sched_register_mutex); +} + +static void tracing_stop_sched_switch(int ops) +{ + mutex_lock(&sched_register_mutex); + + switch (ops) { + case RECORD_CMDLINE: + sched_cmdline_ref--; + break; + + case RECORD_TGID: + sched_tgid_ref--; + break; + } + + if (!sched_cmdline_ref && !sched_tgid_ref) + tracing_sched_unregister(); + mutex_unlock(&sched_register_mutex); +} + +void tracing_start_cmdline_record(void) +{ + tracing_start_sched_switch(RECORD_CMDLINE); +} + +void tracing_stop_cmdline_record(void) +{ + 
tracing_stop_sched_switch(RECORD_CMDLINE); +} + +void tracing_start_tgid_record(void) +{ + tracing_start_sched_switch(RECORD_TGID); +} + +void tracing_stop_tgid_record(void) +{ + tracing_stop_sched_switch(RECORD_TGID); +} diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c new file mode 100644 index 0000000000..0469a04a35 --- /dev/null +++ b/kernel/trace/trace_sched_wakeup.c @@ -0,0 +1,820 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * trace task wakeup timings + * + * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com> + * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com> + * + * Based on code from the latency_tracer, that is: + * + * Copyright (C) 2004-2006 Ingo Molnar + * Copyright (C) 2004 Nadia Yvette Chambers + */ +#include <linux/module.h> +#include <linux/kallsyms.h> +#include <linux/uaccess.h> +#include <linux/ftrace.h> +#include <linux/sched/rt.h> +#include <linux/sched/deadline.h> +#include <trace/events/sched.h> +#include "trace.h" + +static struct trace_array *wakeup_trace; +static int __read_mostly tracer_enabled; + +static struct task_struct *wakeup_task; +static int wakeup_cpu; +static int wakeup_current_cpu; +static unsigned wakeup_prio = -1; +static bool wakeup_rt; +static bool wakeup_dl; +static bool tracing_dl; + +static arch_spinlock_t wakeup_lock = + (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; + +static void wakeup_reset(struct trace_array *tr); +static void __wakeup_reset(struct trace_array *tr); +static int start_func_tracer(struct trace_array *tr, int graph); +static void stop_func_tracer(struct trace_array *tr, int graph); + +static int save_flags; + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +# define is_graph(tr) ((tr)->trace_flags & TRACE_ITER_DISPLAY_GRAPH) +#else +# define is_graph(tr) false +#endif + +#ifdef CONFIG_FUNCTION_TRACER + +static bool function_enabled; + +/* + * Prologue for the wakeup function tracers. + * + * Returns 1 if it is OK to continue, and preemption + * is disabled and data->disabled is incremented. + * 0 if the trace is to be ignored, and preemption + * is not disabled and data->disabled is + * kept the same. + * + * Note, this function is also used outside this ifdef but + * inside the #ifdef of the function graph tracer below. + * This is OK, since the function graph tracer is + * dependent on the function tracer. + */ +static int +func_prolog_preempt_disable(struct trace_array *tr, + struct trace_array_cpu **data, + unsigned int *trace_ctx) +{ + long disabled; + int cpu; + + if (likely(!wakeup_task)) + return 0; + + *trace_ctx = tracing_gen_ctx(); + preempt_disable_notrace(); + + cpu = raw_smp_processor_id(); + if (cpu != wakeup_current_cpu) + goto out_enable; + + *data = per_cpu_ptr(tr->array_buffer.data, cpu); + disabled = atomic_inc_return(&(*data)->disabled); + if (unlikely(disabled != 1)) + goto out; + + return 1; + +out: + atomic_dec(&(*data)->disabled); + +out_enable: + preempt_enable_notrace(); + return 0; +} + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + +static int wakeup_display_graph(struct trace_array *tr, int set) +{ + if (!(is_graph(tr) ^ set)) + return 0; + + stop_func_tracer(tr, !set); + + wakeup_reset(wakeup_trace); + tr->max_latency = 0; + + return start_func_tracer(tr, set); +} + +static int wakeup_graph_entry(struct ftrace_graph_ent *trace) +{ + struct trace_array *tr = wakeup_trace; + struct trace_array_cpu *data; + unsigned int trace_ctx; + int ret = 0; + + if (ftrace_graph_ignore_func(trace)) + return 0; + /* + * Do not trace a function if it's filtered by set_graph_notrace. 
+ * Make the index of ret stack negative to indicate that it should + * ignore further functions. But it needs its own ret stack entry + * to recover the original index in order to continue tracing after + * returning from the function. + */ + if (ftrace_graph_notrace_addr(trace->func)) + return 1; + + if (!func_prolog_preempt_disable(tr, &data, &trace_ctx)) + return 0; + + ret = __trace_graph_entry(tr, trace, trace_ctx); + atomic_dec(&data->disabled); + preempt_enable_notrace(); + + return ret; +} + +static void wakeup_graph_return(struct ftrace_graph_ret *trace) +{ + struct trace_array *tr = wakeup_trace; + struct trace_array_cpu *data; + unsigned int trace_ctx; + + ftrace_graph_addr_finish(trace); + + if (!func_prolog_preempt_disable(tr, &data, &trace_ctx)) + return; + + __trace_graph_return(tr, trace, trace_ctx); + atomic_dec(&data->disabled); + + preempt_enable_notrace(); + return; +} + +static struct fgraph_ops fgraph_wakeup_ops = { + .entryfunc = &wakeup_graph_entry, + .retfunc = &wakeup_graph_return, +}; + +static void wakeup_trace_open(struct trace_iterator *iter) +{ + if (is_graph(iter->tr)) + graph_trace_open(iter); + else + iter->private = NULL; +} + +static void wakeup_trace_close(struct trace_iterator *iter) +{ + if (iter->private) + graph_trace_close(iter); +} + +#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC | \ + TRACE_GRAPH_PRINT_CPU | \ + TRACE_GRAPH_PRINT_REL_TIME | \ + TRACE_GRAPH_PRINT_DURATION | \ + TRACE_GRAPH_PRINT_OVERHEAD | \ + TRACE_GRAPH_PRINT_IRQS) + +static enum print_line_t wakeup_print_line(struct trace_iterator *iter) +{ + /* + * In graph mode call the graph tracer output function, + * otherwise go with the TRACE_FN event handler + */ + if (is_graph(iter->tr)) + return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS); + + return TRACE_TYPE_UNHANDLED; +} + +static void wakeup_print_header(struct seq_file *s) +{ + if (is_graph(wakeup_trace)) + print_graph_headers_flags(s, GRAPH_TRACER_FLAGS); + else + trace_default_header(s); +} +#endif /* else CONFIG_FUNCTION_GRAPH_TRACER */ + +/* + * wakeup uses its own tracer function to keep the overhead down: + */ +static void +wakeup_tracer_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct ftrace_regs *fregs) +{ + struct trace_array *tr = wakeup_trace; + struct trace_array_cpu *data; + unsigned long flags; + unsigned int trace_ctx; + + if (!func_prolog_preempt_disable(tr, &data, &trace_ctx)) + return; + + local_irq_save(flags); + trace_function(tr, ip, parent_ip, trace_ctx); + local_irq_restore(flags); + + atomic_dec(&data->disabled); + preempt_enable_notrace(); +} + +static int register_wakeup_function(struct trace_array *tr, int graph, int set) +{ + int ret; + + /* 'set' is set if TRACE_ITER_FUNCTION is about to be set */ + if (function_enabled || (!set && !(tr->trace_flags & TRACE_ITER_FUNCTION))) + return 0; + + if (graph) + ret = register_ftrace_graph(&fgraph_wakeup_ops); + else + ret = register_ftrace_function(tr->ops); + + if (!ret) + function_enabled = true; + + return ret; +} + +static void unregister_wakeup_function(struct trace_array *tr, int graph) +{ + if (!function_enabled) + return; + + if (graph) + unregister_ftrace_graph(&fgraph_wakeup_ops); + else + unregister_ftrace_function(tr->ops); + + function_enabled = false; +} + +static int wakeup_function_set(struct trace_array *tr, u32 mask, int set) +{ + if (!(mask & TRACE_ITER_FUNCTION)) + return 0; + + if (set) + register_wakeup_function(tr, is_graph(tr), 1); + else + unregister_wakeup_function(tr, is_graph(tr)); + 
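/* A non-zero return tells wakeup_flag_changed() that the flag was handled here. */
+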
return 1; +} +#else /* CONFIG_FUNCTION_TRACER */ +static int register_wakeup_function(struct trace_array *tr, int graph, int set) +{ + return 0; +} +static void unregister_wakeup_function(struct trace_array *tr, int graph) { } +static int wakeup_function_set(struct trace_array *tr, u32 mask, int set) +{ + return 0; +} +#endif /* else CONFIG_FUNCTION_TRACER */ + +#ifndef CONFIG_FUNCTION_GRAPH_TRACER +static enum print_line_t wakeup_print_line(struct trace_iterator *iter) +{ + return TRACE_TYPE_UNHANDLED; +} + +static void wakeup_trace_open(struct trace_iterator *iter) { } +static void wakeup_trace_close(struct trace_iterator *iter) { } + +static void wakeup_print_header(struct seq_file *s) +{ + trace_default_header(s); +} +#endif /* !CONFIG_FUNCTION_GRAPH_TRACER */ + +static void +__trace_function(struct trace_array *tr, + unsigned long ip, unsigned long parent_ip, + unsigned int trace_ctx) +{ + if (is_graph(tr)) + trace_graph_function(tr, ip, parent_ip, trace_ctx); + else + trace_function(tr, ip, parent_ip, trace_ctx); +} + +static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set) +{ + struct tracer *tracer = tr->current_trace; + + if (wakeup_function_set(tr, mask, set)) + return 0; + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + if (mask & TRACE_ITER_DISPLAY_GRAPH) + return wakeup_display_graph(tr, set); +#endif + + return trace_keep_overwrite(tracer, mask, set); +} + +static int start_func_tracer(struct trace_array *tr, int graph) +{ + int ret; + + ret = register_wakeup_function(tr, graph, 0); + + if (!ret && tracing_is_enabled()) + tracer_enabled = 1; + else + tracer_enabled = 0; + + return ret; +} + +static void stop_func_tracer(struct trace_array *tr, int graph) +{ + tracer_enabled = 0; + + unregister_wakeup_function(tr, graph); +} + +/* + * Should this new latency be reported/recorded? 
+ */ +static bool report_latency(struct trace_array *tr, u64 delta) +{ + if (tracing_thresh) { + if (delta < tracing_thresh) + return false; + } else { + if (delta <= tr->max_latency) + return false; + } + return true; +} + +static void +probe_wakeup_migrate_task(void *ignore, struct task_struct *task, int cpu) +{ + if (task != wakeup_task) + return; + + wakeup_current_cpu = cpu; +} + +static void +tracing_sched_switch_trace(struct trace_array *tr, + struct task_struct *prev, + struct task_struct *next, + unsigned int trace_ctx) +{ + struct trace_event_call *call = &event_context_switch; + struct trace_buffer *buffer = tr->array_buffer.buffer; + struct ring_buffer_event *event; + struct ctx_switch_entry *entry; + + event = trace_buffer_lock_reserve(buffer, TRACE_CTX, + sizeof(*entry), trace_ctx); + if (!event) + return; + entry = ring_buffer_event_data(event); + entry->prev_pid = prev->pid; + entry->prev_prio = prev->prio; + entry->prev_state = task_state_index(prev); + entry->next_pid = next->pid; + entry->next_prio = next->prio; + entry->next_state = task_state_index(next); + entry->next_cpu = task_cpu(next); + + if (!call_filter_check_discard(call, entry, buffer, event)) + trace_buffer_unlock_commit(tr, buffer, event, trace_ctx); +} + +static void +tracing_sched_wakeup_trace(struct trace_array *tr, + struct task_struct *wakee, + struct task_struct *curr, + unsigned int trace_ctx) +{ + struct trace_event_call *call = &event_wakeup; + struct ring_buffer_event *event; + struct ctx_switch_entry *entry; + struct trace_buffer *buffer = tr->array_buffer.buffer; + + event = trace_buffer_lock_reserve(buffer, TRACE_WAKE, + sizeof(*entry), trace_ctx); + if (!event) + return; + entry = ring_buffer_event_data(event); + entry->prev_pid = curr->pid; + entry->prev_prio = curr->prio; + entry->prev_state = task_state_index(curr); + entry->next_pid = wakee->pid; + entry->next_prio = wakee->prio; + entry->next_state = task_state_index(wakee); + entry->next_cpu = task_cpu(wakee); + + if (!call_filter_check_discard(call, entry, buffer, event)) + trace_buffer_unlock_commit(tr, buffer, event, trace_ctx); +} + +static void notrace +probe_wakeup_sched_switch(void *ignore, bool preempt, + struct task_struct *prev, struct task_struct *next, + unsigned int prev_state) +{ + struct trace_array_cpu *data; + u64 T0, T1, delta; + unsigned long flags; + long disabled; + int cpu; + unsigned int trace_ctx; + + tracing_record_cmdline(prev); + + if (unlikely(!tracer_enabled)) + return; + + /* + * When we start a new trace, we set wakeup_task to NULL + * and then set tracer_enabled = 1. We want to make sure + * that another CPU does not see the tracer_enabled = 1 + * and the wakeup_task with an older task, that might + * actually be the same as next. 
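(Illustration, not part of the patch.) report_latency() above implements two reporting modes: with tracing_thresh set, every latency at or above the threshold counts; without it, only a new maximum counts. A self-contained C sketch of the same decision (made-up names):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t tracing_thresh;	/* 0 means "track the running maximum" */
static uint64_t max_latency;

static bool report(uint64_t delta)
{
	if (tracing_thresh)
		return delta >= tracing_thresh;
	return delta > max_latency;
}

int main(void)
{
	max_latency = 100;
	printf("%d %d\n", report(100), report(150));	/* 0 1: only a new max */

	tracing_thresh = 50;
	printf("%d %d\n", report(49), report(50));	/* 0 1: threshold mode */
	return 0;
}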
+ */ + smp_rmb(); + + if (next != wakeup_task) + return; + + /* disable local data, not wakeup_cpu data */ + cpu = raw_smp_processor_id(); + disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled); + if (likely(disabled != 1)) + goto out; + + local_irq_save(flags); + trace_ctx = tracing_gen_ctx_flags(flags); + + arch_spin_lock(&wakeup_lock); + + /* We could race with grabbing wakeup_lock */ + if (unlikely(!tracer_enabled || next != wakeup_task)) + goto out_unlock; + + /* The task we are waiting for is waking up */ + data = per_cpu_ptr(wakeup_trace->array_buffer.data, wakeup_cpu); + + __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, trace_ctx); + tracing_sched_switch_trace(wakeup_trace, prev, next, trace_ctx); + __trace_stack(wakeup_trace, trace_ctx, 0); + + T0 = data->preempt_timestamp; + T1 = ftrace_now(cpu); + delta = T1-T0; + + if (!report_latency(wakeup_trace, delta)) + goto out_unlock; + + if (likely(!is_tracing_stopped())) { + wakeup_trace->max_latency = delta; + update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu, NULL); + } + +out_unlock: + __wakeup_reset(wakeup_trace); + arch_spin_unlock(&wakeup_lock); + local_irq_restore(flags); +out: + atomic_dec(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled); +} + +static void __wakeup_reset(struct trace_array *tr) +{ + wakeup_cpu = -1; + wakeup_prio = -1; + tracing_dl = false; + + if (wakeup_task) + put_task_struct(wakeup_task); + + wakeup_task = NULL; +} + +static void wakeup_reset(struct trace_array *tr) +{ + unsigned long flags; + + tracing_reset_online_cpus(&tr->array_buffer); + + local_irq_save(flags); + arch_spin_lock(&wakeup_lock); + __wakeup_reset(tr); + arch_spin_unlock(&wakeup_lock); + local_irq_restore(flags); +} + +static void +probe_wakeup(void *ignore, struct task_struct *p) +{ + struct trace_array_cpu *data; + int cpu = smp_processor_id(); + long disabled; + unsigned int trace_ctx; + + if (likely(!tracer_enabled)) + return; + + tracing_record_cmdline(p); + tracing_record_cmdline(current); + + /* + * Semantic is like this: + * - wakeup tracer handles all tasks in the system, independently + * from their scheduling class; + * - wakeup_rt tracer handles tasks belonging to sched_dl and + * sched_rt class; + * - wakeup_dl handles tasks belonging to sched_dl class only. + */ + if (tracing_dl || (wakeup_dl && !dl_task(p)) || + (wakeup_rt && !dl_task(p) && !rt_task(p)) || + (!dl_task(p) && (p->prio >= wakeup_prio || p->prio >= current->prio))) + return; + + disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled); + if (unlikely(disabled != 1)) + goto out; + + trace_ctx = tracing_gen_ctx(); + + /* interrupts should be off from try_to_wake_up */ + arch_spin_lock(&wakeup_lock); + + /* check for races. */ + if (!tracer_enabled || tracing_dl || + (!dl_task(p) && p->prio >= wakeup_prio)) + goto out_locked; + + /* reset the trace */ + __wakeup_reset(wakeup_trace); + + wakeup_cpu = task_cpu(p); + wakeup_current_cpu = wakeup_cpu; + wakeup_prio = p->prio; + + /* + * Once you start tracing a -deadline task, don't bother tracing + * another task until the first one wakes up. 
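(Illustration, not part of the patch.) The filter at the top of probe_wakeup() above encodes which wakeups each tracer flavour cares about: wakeup considers any task that beats both the currently traced priority and the current task's priority, wakeup_rt only RT or deadline tasks, wakeup_dl only deadline tasks, and nothing is considered while a deadline task is already being traced. A userspace C sketch of that predicate (made-up types and names; a lower prio value means higher priority):

#include <stdbool.h>
#include <stdio.h>

struct task { bool dl; bool rt; int prio; };

static bool wakeup_dl, wakeup_rt;	/* active tracer flavour */
static bool tracing_dl;			/* already latched onto a -deadline task */
static int wakeup_prio = 999;		/* prio of the task currently traced */

static bool candidate(const struct task *p, const struct task *curr)
{
	if (tracing_dl)
		return false;
	if (wakeup_dl && !p->dl)
		return false;
	if (wakeup_rt && !p->dl && !p->rt)
		return false;
	if (!p->dl && (p->prio >= wakeup_prio || p->prio >= curr->prio))
		return false;
	return true;
}

int main(void)
{
	struct task curr = { .prio = 120 };
	struct task rt = { .rt = true, .prio = 10 };
	struct task normal = { .prio = 130 };

	wakeup_rt = true;
	printf("rt task considered:     %d\n", candidate(&rt, &curr));		/* 1 */
	printf("normal task considered: %d\n", candidate(&normal, &curr));	/* 0 */
	return 0;
}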
+ */ + if (dl_task(p)) + tracing_dl = true; + else + tracing_dl = false; + + wakeup_task = get_task_struct(p); + + data = per_cpu_ptr(wakeup_trace->array_buffer.data, wakeup_cpu); + data->preempt_timestamp = ftrace_now(cpu); + tracing_sched_wakeup_trace(wakeup_trace, p, current, trace_ctx); + __trace_stack(wakeup_trace, trace_ctx, 0); + + /* + * We must be careful in using CALLER_ADDR2. But since wake_up + * is not called by an assembly function (where as schedule is) + * it should be safe to use it here. + */ + __trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, trace_ctx); + +out_locked: + arch_spin_unlock(&wakeup_lock); +out: + atomic_dec(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled); +} + +static void start_wakeup_tracer(struct trace_array *tr) +{ + int ret; + + ret = register_trace_sched_wakeup(probe_wakeup, NULL); + if (ret) { + pr_info("wakeup trace: Couldn't activate tracepoint" + " probe to kernel_sched_wakeup\n"); + return; + } + + ret = register_trace_sched_wakeup_new(probe_wakeup, NULL); + if (ret) { + pr_info("wakeup trace: Couldn't activate tracepoint" + " probe to kernel_sched_wakeup_new\n"); + goto fail_deprobe; + } + + ret = register_trace_sched_switch(probe_wakeup_sched_switch, NULL); + if (ret) { + pr_info("sched trace: Couldn't activate tracepoint" + " probe to kernel_sched_switch\n"); + goto fail_deprobe_wake_new; + } + + ret = register_trace_sched_migrate_task(probe_wakeup_migrate_task, NULL); + if (ret) { + pr_info("wakeup trace: Couldn't activate tracepoint" + " probe to kernel_sched_migrate_task\n"); + goto fail_deprobe_sched_switch; + } + + wakeup_reset(tr); + + /* + * Don't let the tracer_enabled = 1 show up before + * the wakeup_task is reset. This may be overkill since + * wakeup_reset does a spin_unlock after setting the + * wakeup_task to NULL, but I want to be safe. + * This is a slow path anyway. 
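(Illustration, not part of the patch.) start_wakeup_tracer() registers four tracepoint probes in sequence and, if a later registration fails, unwinds only the probes already attached via the fail_deprobe* labels below. A small standalone C sketch of that goto-based unwinding idiom (made-up names, with one failure simulated):

#include <stdio.h>

static int register_probe(const char *name, int fail)
{
	if (fail) {
		printf("failed to register %s\n", name);
		return -1;
	}
	printf("registered %s\n", name);
	return 0;
}

static void unregister_probe(const char *name)
{
	printf("unregistered %s\n", name);
}

static int start_probes(void)
{
	if (register_probe("sched_wakeup", 0))
		return -1;
	if (register_probe("sched_wakeup_new", 0))
		goto fail_deprobe;
	if (register_probe("sched_switch", 1))		/* simulated failure */
		goto fail_deprobe_wake_new;
	if (register_probe("sched_migrate_task", 0))
		goto fail_deprobe_sched_switch;
	return 0;

fail_deprobe_sched_switch:
	unregister_probe("sched_switch");
fail_deprobe_wake_new:
	unregister_probe("sched_wakeup_new");
fail_deprobe:
	unregister_probe("sched_wakeup");
	return -1;
}

int main(void)
{
	return start_probes() ? 1 : 0;
}

Each label tears down one probe and falls through to the earlier ones, so the cleanup always matches exactly what was registered.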
+ */ + smp_wmb(); + + if (start_func_tracer(tr, is_graph(tr))) + printk(KERN_ERR "failed to start wakeup tracer\n"); + + return; +fail_deprobe_sched_switch: + unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL); +fail_deprobe_wake_new: + unregister_trace_sched_wakeup_new(probe_wakeup, NULL); +fail_deprobe: + unregister_trace_sched_wakeup(probe_wakeup, NULL); +} + +static void stop_wakeup_tracer(struct trace_array *tr) +{ + tracer_enabled = 0; + stop_func_tracer(tr, is_graph(tr)); + unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL); + unregister_trace_sched_wakeup_new(probe_wakeup, NULL); + unregister_trace_sched_wakeup(probe_wakeup, NULL); + unregister_trace_sched_migrate_task(probe_wakeup_migrate_task, NULL); +} + +static bool wakeup_busy; + +static int __wakeup_tracer_init(struct trace_array *tr) +{ + save_flags = tr->trace_flags; + + /* non overwrite screws up the latency tracers */ + set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1); + set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1); + + tr->max_latency = 0; + wakeup_trace = tr; + ftrace_init_array_ops(tr, wakeup_tracer_call); + start_wakeup_tracer(tr); + + wakeup_busy = true; + return 0; +} + +static int wakeup_tracer_init(struct trace_array *tr) +{ + if (wakeup_busy) + return -EBUSY; + + wakeup_dl = false; + wakeup_rt = false; + return __wakeup_tracer_init(tr); +} + +static int wakeup_rt_tracer_init(struct trace_array *tr) +{ + if (wakeup_busy) + return -EBUSY; + + wakeup_dl = false; + wakeup_rt = true; + return __wakeup_tracer_init(tr); +} + +static int wakeup_dl_tracer_init(struct trace_array *tr) +{ + if (wakeup_busy) + return -EBUSY; + + wakeup_dl = true; + wakeup_rt = false; + return __wakeup_tracer_init(tr); +} + +static void wakeup_tracer_reset(struct trace_array *tr) +{ + int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT; + int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE; + + stop_wakeup_tracer(tr); + /* make sure we put back any tasks we are tracing */ + wakeup_reset(tr); + + set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag); + set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag); + ftrace_reset_array_ops(tr); + wakeup_busy = false; +} + +static void wakeup_tracer_start(struct trace_array *tr) +{ + wakeup_reset(tr); + tracer_enabled = 1; +} + +static void wakeup_tracer_stop(struct trace_array *tr) +{ + tracer_enabled = 0; +} + +static struct tracer wakeup_tracer __read_mostly = +{ + .name = "wakeup", + .init = wakeup_tracer_init, + .reset = wakeup_tracer_reset, + .start = wakeup_tracer_start, + .stop = wakeup_tracer_stop, + .print_max = true, + .print_header = wakeup_print_header, + .print_line = wakeup_print_line, + .flag_changed = wakeup_flag_changed, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_wakeup, +#endif + .open = wakeup_trace_open, + .close = wakeup_trace_close, + .allow_instances = true, + .use_max_tr = true, +}; + +static struct tracer wakeup_rt_tracer __read_mostly = +{ + .name = "wakeup_rt", + .init = wakeup_rt_tracer_init, + .reset = wakeup_tracer_reset, + .start = wakeup_tracer_start, + .stop = wakeup_tracer_stop, + .print_max = true, + .print_header = wakeup_print_header, + .print_line = wakeup_print_line, + .flag_changed = wakeup_flag_changed, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_wakeup, +#endif + .open = wakeup_trace_open, + .close = wakeup_trace_close, + .allow_instances = true, + .use_max_tr = true, +}; + +static struct tracer wakeup_dl_tracer __read_mostly = +{ + .name = "wakeup_dl", + .init = wakeup_dl_tracer_init, 
+ .reset = wakeup_tracer_reset, + .start = wakeup_tracer_start, + .stop = wakeup_tracer_stop, + .print_max = true, + .print_header = wakeup_print_header, + .print_line = wakeup_print_line, + .flag_changed = wakeup_flag_changed, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_wakeup, +#endif + .open = wakeup_trace_open, + .close = wakeup_trace_close, + .allow_instances = true, + .use_max_tr = true, +}; + +__init static int init_wakeup_tracer(void) +{ + int ret; + + ret = register_tracer(&wakeup_tracer); + if (ret) + return ret; + + ret = register_tracer(&wakeup_rt_tracer); + if (ret) + return ret; + + ret = register_tracer(&wakeup_dl_tracer); + if (ret) + return ret; + + return 0; +} +core_initcall(init_wakeup_tracer); diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c new file mode 100644 index 0000000000..529590499b --- /dev/null +++ b/kernel/trace/trace_selftest.c @@ -0,0 +1,1299 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Include in trace.c */ + +#include <uapi/linux/sched/types.h> +#include <linux/stringify.h> +#include <linux/kthread.h> +#include <linux/delay.h> +#include <linux/slab.h> + +static inline int trace_valid_entry(struct trace_entry *entry) +{ + switch (entry->type) { + case TRACE_FN: + case TRACE_CTX: + case TRACE_WAKE: + case TRACE_STACK: + case TRACE_PRINT: + case TRACE_BRANCH: + case TRACE_GRAPH_ENT: + case TRACE_GRAPH_RET: + return 1; + } + return 0; +} + +static int trace_test_buffer_cpu(struct array_buffer *buf, int cpu) +{ + struct ring_buffer_event *event; + struct trace_entry *entry; + unsigned int loops = 0; + + while ((event = ring_buffer_consume(buf->buffer, cpu, NULL, NULL))) { + entry = ring_buffer_event_data(event); + + /* + * The ring buffer is a size of trace_buf_size, if + * we loop more than the size, there's something wrong + * with the ring buffer. + */ + if (loops++ > trace_buf_size) { + printk(KERN_CONT ".. bad ring buffer "); + goto failed; + } + if (!trace_valid_entry(entry)) { + printk(KERN_CONT ".. invalid entry %d ", + entry->type); + goto failed; + } + } + return 0; + + failed: + /* disable tracing */ + tracing_disabled = 1; + printk(KERN_CONT ".. corrupted trace buffer .. "); + return -1; +} + +/* + * Test the trace buffer to see if all the elements + * are still sane. + */ +static int __maybe_unused trace_test_buffer(struct array_buffer *buf, unsigned long *count) +{ + unsigned long flags, cnt = 0; + int cpu, ret = 0; + + /* Don't allow flipping of max traces now */ + local_irq_save(flags); + arch_spin_lock(&buf->tr->max_lock); + + cnt = ring_buffer_entries(buf->buffer); + + /* + * The trace_test_buffer_cpu runs a while loop to consume all data. + * If the calling tracer is broken, and is constantly filling + * the buffer, this will run forever, and hard lock the box. + * We disable the ring buffer while we do this test to prevent + * a hard lock up. 
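(Illustration, not part of the patch.) trace_test_buffer_cpu() above consumes every entry on one CPU, rejects unknown entry types, and gives up if the loop runs longer than the buffer could possibly hold, so a broken tracer cannot wedge the selftest. A compact userspace C version of that consume-and-validate loop (made-up types):

#include <stdio.h>

enum entry_type { TRACE_FN, TRACE_CTX, TRACE_WAKE, TRACE_BOGUS };

static int valid(enum entry_type t)
{
	switch (t) {
	case TRACE_FN:
	case TRACE_CTX:
	case TRACE_WAKE:
		return 1;
	default:
		return 0;
	}
}

static int check(const enum entry_type *entries, int n, int buf_size)
{
	int loops = 0;

	for (int i = 0; i < n; i++) {
		if (loops++ > buf_size) {
			printf(".. bad ring buffer ");
			return -1;
		}
		if (!valid(entries[i])) {
			printf(".. invalid entry %d ", entries[i]);
			return -1;
		}
	}
	return 0;
}

int main(void)
{
	enum entry_type good[] = { TRACE_FN, TRACE_CTX, TRACE_WAKE };
	enum entry_type bad[] = { TRACE_FN, TRACE_BOGUS };

	printf("good buffer: %d\n", check(good, 3, 16));
	printf("bad buffer:  %d\n", check(bad, 2, 16));
	return 0;
}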
+ */ + tracing_off(); + for_each_possible_cpu(cpu) { + ret = trace_test_buffer_cpu(buf, cpu); + if (ret) + break; + } + tracing_on(); + arch_spin_unlock(&buf->tr->max_lock); + local_irq_restore(flags); + + if (count) + *count = cnt; + + return ret; +} + +static inline void warn_failed_init_tracer(struct tracer *trace, int init_ret) +{ + printk(KERN_WARNING "Failed to init %s tracer, init returned %d\n", + trace->name, init_ret); +} +#ifdef CONFIG_FUNCTION_TRACER + +#ifdef CONFIG_DYNAMIC_FTRACE + +static int trace_selftest_test_probe1_cnt; +static void trace_selftest_test_probe1_func(unsigned long ip, + unsigned long pip, + struct ftrace_ops *op, + struct ftrace_regs *fregs) +{ + trace_selftest_test_probe1_cnt++; +} + +static int trace_selftest_test_probe2_cnt; +static void trace_selftest_test_probe2_func(unsigned long ip, + unsigned long pip, + struct ftrace_ops *op, + struct ftrace_regs *fregs) +{ + trace_selftest_test_probe2_cnt++; +} + +static int trace_selftest_test_probe3_cnt; +static void trace_selftest_test_probe3_func(unsigned long ip, + unsigned long pip, + struct ftrace_ops *op, + struct ftrace_regs *fregs) +{ + trace_selftest_test_probe3_cnt++; +} + +static int trace_selftest_test_global_cnt; +static void trace_selftest_test_global_func(unsigned long ip, + unsigned long pip, + struct ftrace_ops *op, + struct ftrace_regs *fregs) +{ + trace_selftest_test_global_cnt++; +} + +static int trace_selftest_test_dyn_cnt; +static void trace_selftest_test_dyn_func(unsigned long ip, + unsigned long pip, + struct ftrace_ops *op, + struct ftrace_regs *fregs) +{ + trace_selftest_test_dyn_cnt++; +} + +static struct ftrace_ops test_probe1 = { + .func = trace_selftest_test_probe1_func, +}; + +static struct ftrace_ops test_probe2 = { + .func = trace_selftest_test_probe2_func, +}; + +static struct ftrace_ops test_probe3 = { + .func = trace_selftest_test_probe3_func, +}; + +static void print_counts(void) +{ + printk("(%d %d %d %d %d) ", + trace_selftest_test_probe1_cnt, + trace_selftest_test_probe2_cnt, + trace_selftest_test_probe3_cnt, + trace_selftest_test_global_cnt, + trace_selftest_test_dyn_cnt); +} + +static void reset_counts(void) +{ + trace_selftest_test_probe1_cnt = 0; + trace_selftest_test_probe2_cnt = 0; + trace_selftest_test_probe3_cnt = 0; + trace_selftest_test_global_cnt = 0; + trace_selftest_test_dyn_cnt = 0; +} + +static int trace_selftest_ops(struct trace_array *tr, int cnt) +{ + int save_ftrace_enabled = ftrace_enabled; + struct ftrace_ops *dyn_ops; + char *func1_name; + char *func2_name; + int len1; + int len2; + int ret = -1; + + printk(KERN_CONT "PASSED\n"); + pr_info("Testing dynamic ftrace ops #%d: ", cnt); + + ftrace_enabled = 1; + reset_counts(); + + /* Handle PPC64 '.' name */ + func1_name = "*" __stringify(DYN_FTRACE_TEST_NAME); + func2_name = "*" __stringify(DYN_FTRACE_TEST_NAME2); + len1 = strlen(func1_name); + len2 = strlen(func2_name); + + /* + * Probe 1 will trace function 1. + * Probe 2 will trace function 2. + * Probe 3 will trace functions 1 and 2. 
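(Illustration, not part of the patch.) The expected counts checked further down follow directly from which probe filters match which test function. A rough standalone C model of filtered callbacks accumulating hit counts (this is not the ftrace dispatch path, just the counting logic; names are made up):

#include <stdio.h>
#include <string.h>

struct ops {
	const char *filter;	/* NULL: match every function */
	int hits;
};

static void call(struct ops *ops, int nops, const char *func)
{
	for (int i = 0; i < nops; i++)
		if (!ops[i].filter || !strcmp(ops[i].filter, func))
			ops[i].hits++;
}

int main(void)
{
	struct ops probes[] = {
		{ "func1", 0 },	/* like test_probe1 */
		{ "func2", 0 },	/* like test_probe2 */
		{ NULL, 0 },	/* matches both, like test_probe3 here */
	};

	call(probes, 3, "func1");
	call(probes, 3, "func2");
	call(probes, 3, "func1");

	printf("probe1=%d probe2=%d probe3=%d\n",
	       probes[0].hits, probes[1].hits, probes[2].hits);	/* 2 1 3 */
	return 0;
}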
+ */ + ftrace_set_filter(&test_probe1, func1_name, len1, 1); + ftrace_set_filter(&test_probe2, func2_name, len2, 1); + ftrace_set_filter(&test_probe3, func1_name, len1, 1); + ftrace_set_filter(&test_probe3, func2_name, len2, 0); + + register_ftrace_function(&test_probe1); + register_ftrace_function(&test_probe2); + register_ftrace_function(&test_probe3); + /* First time we are running with main function */ + if (cnt > 1) { + ftrace_init_array_ops(tr, trace_selftest_test_global_func); + register_ftrace_function(tr->ops); + } + + DYN_FTRACE_TEST_NAME(); + + print_counts(); + + if (trace_selftest_test_probe1_cnt != 1) + goto out; + if (trace_selftest_test_probe2_cnt != 0) + goto out; + if (trace_selftest_test_probe3_cnt != 1) + goto out; + if (cnt > 1) { + if (trace_selftest_test_global_cnt == 0) + goto out; + } + + DYN_FTRACE_TEST_NAME2(); + + print_counts(); + + if (trace_selftest_test_probe1_cnt != 1) + goto out; + if (trace_selftest_test_probe2_cnt != 1) + goto out; + if (trace_selftest_test_probe3_cnt != 2) + goto out; + + /* Add a dynamic probe */ + dyn_ops = kzalloc(sizeof(*dyn_ops), GFP_KERNEL); + if (!dyn_ops) { + printk("MEMORY ERROR "); + goto out; + } + + dyn_ops->func = trace_selftest_test_dyn_func; + + register_ftrace_function(dyn_ops); + + trace_selftest_test_global_cnt = 0; + + DYN_FTRACE_TEST_NAME(); + + print_counts(); + + if (trace_selftest_test_probe1_cnt != 2) + goto out_free; + if (trace_selftest_test_probe2_cnt != 1) + goto out_free; + if (trace_selftest_test_probe3_cnt != 3) + goto out_free; + if (cnt > 1) { + if (trace_selftest_test_global_cnt == 0) + goto out_free; + } + if (trace_selftest_test_dyn_cnt == 0) + goto out_free; + + DYN_FTRACE_TEST_NAME2(); + + print_counts(); + + if (trace_selftest_test_probe1_cnt != 2) + goto out_free; + if (trace_selftest_test_probe2_cnt != 2) + goto out_free; + if (trace_selftest_test_probe3_cnt != 4) + goto out_free; + + /* Remove trace function from probe 3 */ + func1_name = "!" 
__stringify(DYN_FTRACE_TEST_NAME); + len1 = strlen(func1_name); + + ftrace_set_filter(&test_probe3, func1_name, len1, 0); + + DYN_FTRACE_TEST_NAME(); + + print_counts(); + + if (trace_selftest_test_probe1_cnt != 3) + goto out_free; + if (trace_selftest_test_probe2_cnt != 2) + goto out_free; + if (trace_selftest_test_probe3_cnt != 4) + goto out_free; + if (cnt > 1) { + if (trace_selftest_test_global_cnt == 0) + goto out_free; + } + if (trace_selftest_test_dyn_cnt == 0) + goto out_free; + + DYN_FTRACE_TEST_NAME2(); + + print_counts(); + + if (trace_selftest_test_probe1_cnt != 3) + goto out_free; + if (trace_selftest_test_probe2_cnt != 3) + goto out_free; + if (trace_selftest_test_probe3_cnt != 5) + goto out_free; + + ret = 0; + out_free: + unregister_ftrace_function(dyn_ops); + kfree(dyn_ops); + + out: + /* Purposely unregister in the same order */ + unregister_ftrace_function(&test_probe1); + unregister_ftrace_function(&test_probe2); + unregister_ftrace_function(&test_probe3); + if (cnt > 1) + unregister_ftrace_function(tr->ops); + ftrace_reset_array_ops(tr); + + /* Make sure everything is off */ + reset_counts(); + DYN_FTRACE_TEST_NAME(); + DYN_FTRACE_TEST_NAME(); + + if (trace_selftest_test_probe1_cnt || + trace_selftest_test_probe2_cnt || + trace_selftest_test_probe3_cnt || + trace_selftest_test_global_cnt || + trace_selftest_test_dyn_cnt) + ret = -1; + + ftrace_enabled = save_ftrace_enabled; + + return ret; +} + +/* Test dynamic code modification and ftrace filters */ +static int trace_selftest_startup_dynamic_tracing(struct tracer *trace, + struct trace_array *tr, + int (*func)(void)) +{ + int save_ftrace_enabled = ftrace_enabled; + unsigned long count; + char *func_name; + int ret; + + /* The ftrace test PASSED */ + printk(KERN_CONT "PASSED\n"); + pr_info("Testing dynamic ftrace: "); + + /* enable tracing, and record the filter function */ + ftrace_enabled = 1; + + /* passed in by parameter to fool gcc from optimizing */ + func(); + + /* + * Some archs *cough*PowerPC*cough* add characters to the + * start of the function names. We simply put a '*' to + * accommodate them. + */ + func_name = "*" __stringify(DYN_FTRACE_TEST_NAME); + + /* filter only on our function */ + ftrace_set_global_filter(func_name, strlen(func_name), 1); + + /* enable tracing */ + ret = tracer_init(trace, tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + goto out; + } + + /* Sleep for a 1/10 of a second */ + msleep(100); + + /* we should have nothing in the buffer */ + ret = trace_test_buffer(&tr->array_buffer, &count); + if (ret) + goto out; + + if (count) { + ret = -1; + printk(KERN_CONT ".. filter did not filter .. "); + goto out; + } + + /* call our function again */ + func(); + + /* sleep again */ + msleep(100); + + /* stop the tracing. */ + tracing_stop(); + ftrace_enabled = 0; + + /* check the trace buffer */ + ret = trace_test_buffer(&tr->array_buffer, &count); + + ftrace_enabled = 1; + tracing_start(); + + /* we should only have one item */ + if (!ret && count != 1) { + trace->reset(tr); + printk(KERN_CONT ".. 
filter failed count=%ld ..", count); + ret = -1; + goto out; + } + + /* Test the ops with global tracing running */ + ret = trace_selftest_ops(tr, 1); + trace->reset(tr); + + out: + ftrace_enabled = save_ftrace_enabled; + + /* Enable tracing on all functions again */ + ftrace_set_global_filter(NULL, 0, 1); + + /* Test the ops with global tracing off */ + if (!ret) + ret = trace_selftest_ops(tr, 2); + + return ret; +} + +static int trace_selftest_recursion_cnt; +static void trace_selftest_test_recursion_func(unsigned long ip, + unsigned long pip, + struct ftrace_ops *op, + struct ftrace_regs *fregs) +{ + /* + * This function is registered without the recursion safe flag. + * The ftrace infrastructure should provide the recursion + * protection. If not, this will crash the kernel! + */ + if (trace_selftest_recursion_cnt++ > 10) + return; + DYN_FTRACE_TEST_NAME(); +} + +static void trace_selftest_test_recursion_safe_func(unsigned long ip, + unsigned long pip, + struct ftrace_ops *op, + struct ftrace_regs *fregs) +{ + /* + * We said we would provide our own recursion. By calling + * this function again, we should recurse back into this function + * and count again. But this only happens if the arch supports + * all of ftrace features and nothing else is using the function + * tracing utility. + */ + if (trace_selftest_recursion_cnt++) + return; + DYN_FTRACE_TEST_NAME(); +} + +static struct ftrace_ops test_rec_probe = { + .func = trace_selftest_test_recursion_func, + .flags = FTRACE_OPS_FL_RECURSION, +}; + +static struct ftrace_ops test_recsafe_probe = { + .func = trace_selftest_test_recursion_safe_func, +}; + +static int +trace_selftest_function_recursion(void) +{ + int save_ftrace_enabled = ftrace_enabled; + char *func_name; + int len; + int ret; + + /* The previous test PASSED */ + pr_cont("PASSED\n"); + pr_info("Testing ftrace recursion: "); + + + /* enable tracing, and record the filter function */ + ftrace_enabled = 1; + + /* Handle PPC64 '.' name */ + func_name = "*" __stringify(DYN_FTRACE_TEST_NAME); + len = strlen(func_name); + + ret = ftrace_set_filter(&test_rec_probe, func_name, len, 1); + if (ret) { + pr_cont("*Could not set filter* "); + goto out; + } + + ret = register_ftrace_function(&test_rec_probe); + if (ret) { + pr_cont("*could not register callback* "); + goto out; + } + + DYN_FTRACE_TEST_NAME(); + + unregister_ftrace_function(&test_rec_probe); + + ret = -1; + /* + * Recursion allows for transitions between context, + * and may call the callback twice. 
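(Illustration, not part of the patch.) The test above registers a callback that deliberately calls back into a traced function; without ftrace's recursion protection that would recurse forever. The core of that protection is just a per-context "already in the callback" flag, sketched here as a standalone C program (made-up names, a thread-local flag standing in for the per-context bits):

#include <stdio.h>

static _Thread_local int in_callback;
static int calls;

static void traced_function(void);

static void callback(void)
{
	if (in_callback)
		return;		/* recursion detected, back off */
	in_callback = 1;
	calls++;
	traced_function();	/* re-enters the callback, which now bails out */
	in_callback = 0;
}

static void traced_function(void)
{
	callback();		/* stands in for the mcount/fentry hook */
}

int main(void)
{
	traced_function();
	printf("callback ran %d time(s)\n", calls);	/* prints 1 */
	return 0;
}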
+ */ + if (trace_selftest_recursion_cnt != 1 && + trace_selftest_recursion_cnt != 2) { + pr_cont("*callback not called once (or twice) (%d)* ", + trace_selftest_recursion_cnt); + goto out; + } + + trace_selftest_recursion_cnt = 1; + + pr_cont("PASSED\n"); + pr_info("Testing ftrace recursion safe: "); + + ret = ftrace_set_filter(&test_recsafe_probe, func_name, len, 1); + if (ret) { + pr_cont("*Could not set filter* "); + goto out; + } + + ret = register_ftrace_function(&test_recsafe_probe); + if (ret) { + pr_cont("*could not register callback* "); + goto out; + } + + DYN_FTRACE_TEST_NAME(); + + unregister_ftrace_function(&test_recsafe_probe); + + ret = -1; + if (trace_selftest_recursion_cnt != 2) { + pr_cont("*callback not called expected 2 times (%d)* ", + trace_selftest_recursion_cnt); + goto out; + } + + ret = 0; +out: + ftrace_enabled = save_ftrace_enabled; + + return ret; +} +#else +# define trace_selftest_startup_dynamic_tracing(trace, tr, func) ({ 0; }) +# define trace_selftest_function_recursion() ({ 0; }) +#endif /* CONFIG_DYNAMIC_FTRACE */ + +static enum { + TRACE_SELFTEST_REGS_START, + TRACE_SELFTEST_REGS_FOUND, + TRACE_SELFTEST_REGS_NOT_FOUND, +} trace_selftest_regs_stat; + +static void trace_selftest_test_regs_func(unsigned long ip, + unsigned long pip, + struct ftrace_ops *op, + struct ftrace_regs *fregs) +{ + struct pt_regs *regs = ftrace_get_regs(fregs); + + if (regs) + trace_selftest_regs_stat = TRACE_SELFTEST_REGS_FOUND; + else + trace_selftest_regs_stat = TRACE_SELFTEST_REGS_NOT_FOUND; +} + +static struct ftrace_ops test_regs_probe = { + .func = trace_selftest_test_regs_func, + .flags = FTRACE_OPS_FL_SAVE_REGS, +}; + +static int +trace_selftest_function_regs(void) +{ + int save_ftrace_enabled = ftrace_enabled; + char *func_name; + int len; + int ret; + int supported = 0; + +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS + supported = 1; +#endif + + /* The previous test PASSED */ + pr_cont("PASSED\n"); + pr_info("Testing ftrace regs%s: ", + !supported ? "(no arch support)" : ""); + + /* enable tracing, and record the filter function */ + ftrace_enabled = 1; + + /* Handle PPC64 '.' name */ + func_name = "*" __stringify(DYN_FTRACE_TEST_NAME); + len = strlen(func_name); + + ret = ftrace_set_filter(&test_regs_probe, func_name, len, 1); + /* + * If DYNAMIC_FTRACE is not set, then we just trace all functions. + * This test really doesn't care. + */ + if (ret && ret != -ENODEV) { + pr_cont("*Could not set filter* "); + goto out; + } + + ret = register_ftrace_function(&test_regs_probe); + /* + * Now if the arch does not support passing regs, then this should + * have failed. + */ + if (!supported) { + if (!ret) { + pr_cont("*registered save-regs without arch support* "); + goto out; + } + test_regs_probe.flags |= FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED; + ret = register_ftrace_function(&test_regs_probe); + } + if (ret) { + pr_cont("*could not register callback* "); + goto out; + } + + + DYN_FTRACE_TEST_NAME(); + + unregister_ftrace_function(&test_regs_probe); + + ret = -1; + + switch (trace_selftest_regs_stat) { + case TRACE_SELFTEST_REGS_START: + pr_cont("*callback never called* "); + goto out; + + case TRACE_SELFTEST_REGS_FOUND: + if (supported) + break; + pr_cont("*callback received regs without arch support* "); + goto out; + + case TRACE_SELFTEST_REGS_NOT_FOUND: + if (!supported) + break; + pr_cont("*callback received NULL regs* "); + goto out; + } + + ret = 0; +out: + ftrace_enabled = save_ftrace_enabled; + + return ret; +} + +/* + * Simple verification test of ftrace function tracer. 
+ * Enable ftrace, sleep 1/10 second, and then read the trace + * buffer to see if all is in order. + */ +__init int +trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) +{ + int save_ftrace_enabled = ftrace_enabled; + unsigned long count; + int ret; + +#ifdef CONFIG_DYNAMIC_FTRACE + if (ftrace_filter_param) { + printk(KERN_CONT " ... kernel command line filter set: force PASS ... "); + return 0; + } +#endif + + /* make sure msleep has been recorded */ + msleep(1); + + /* start the tracing */ + ftrace_enabled = 1; + + ret = tracer_init(trace, tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + goto out; + } + + /* Sleep for a 1/10 of a second */ + msleep(100); + /* stop the tracing. */ + tracing_stop(); + ftrace_enabled = 0; + + /* check the trace buffer */ + ret = trace_test_buffer(&tr->array_buffer, &count); + + ftrace_enabled = 1; + trace->reset(tr); + tracing_start(); + + if (!ret && !count) { + printk(KERN_CONT ".. no entries found .."); + ret = -1; + goto out; + } + + ret = trace_selftest_startup_dynamic_tracing(trace, tr, + DYN_FTRACE_TEST_NAME); + if (ret) + goto out; + + ret = trace_selftest_function_recursion(); + if (ret) + goto out; + + ret = trace_selftest_function_regs(); + out: + ftrace_enabled = save_ftrace_enabled; + + /* kill ftrace totally if we failed */ + if (ret) + ftrace_kill(); + + return ret; +} +#endif /* CONFIG_FUNCTION_TRACER */ + + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + +/* Maximum number of functions to trace before diagnosing a hang */ +#define GRAPH_MAX_FUNC_TEST 100000000 + +static unsigned int graph_hang_thresh; + +/* Wrap the real function entry probe to avoid possible hanging */ +static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace) +{ + /* This is harmlessly racy, we want to approximately detect a hang */ + if (unlikely(++graph_hang_thresh > GRAPH_MAX_FUNC_TEST)) { + ftrace_graph_stop(); + printk(KERN_WARNING "BUG: Function graph tracer hang!\n"); + if (ftrace_dump_on_oops) { + ftrace_dump(DUMP_ALL); + /* ftrace_dump() disables tracing */ + tracing_on(); + } + return 0; + } + + return trace_graph_entry(trace); +} + +static struct fgraph_ops fgraph_ops __initdata = { + .entryfunc = &trace_graph_entry_watchdog, + .retfunc = &trace_graph_return, +}; + +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +static struct ftrace_ops direct; +#endif + +/* + * Pretty much the same than for the function tracer from which the selftest + * has been borrowed. + */ +__init int +trace_selftest_startup_function_graph(struct tracer *trace, + struct trace_array *tr) +{ + int ret; + unsigned long count; + char *func_name __maybe_unused; + +#ifdef CONFIG_DYNAMIC_FTRACE + if (ftrace_filter_param) { + printk(KERN_CONT " ... kernel command line filter set: force PASS ... "); + return 0; + } +#endif + + /* + * Simulate the init() callback but we attach a watchdog callback + * to detect and recover from possible hangs + */ + tracing_reset_online_cpus(&tr->array_buffer); + set_graph_array(tr); + ret = register_ftrace_graph(&fgraph_ops); + if (ret) { + warn_failed_init_tracer(trace, ret); + goto out; + } + tracing_start_cmdline_record(); + + /* Sleep for a 1/10 of a second */ + msleep(100); + + /* Have we just recovered from a hang? 
*/ + if (graph_hang_thresh > GRAPH_MAX_FUNC_TEST) { + disable_tracing_selftest("recovering from a hang"); + ret = -1; + goto out; + } + + tracing_stop(); + + /* check the trace buffer */ + ret = trace_test_buffer(&tr->array_buffer, &count); + + /* Need to also simulate the tr->reset to remove this fgraph_ops */ + tracing_stop_cmdline_record(); + unregister_ftrace_graph(&fgraph_ops); + + tracing_start(); + + if (!ret && !count) { + printk(KERN_CONT ".. no entries found .."); + ret = -1; + goto out; + } + +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + /* + * These tests can take some time to run. Make sure on non PREEMPT + * kernels, we do not trigger the softlockup detector. + */ + cond_resched(); + + tracing_reset_online_cpus(&tr->array_buffer); + set_graph_array(tr); + + /* + * Some archs *cough*PowerPC*cough* add characters to the + * start of the function names. We simply put a '*' to + * accommodate them. + */ + func_name = "*" __stringify(DYN_FTRACE_TEST_NAME); + ftrace_set_global_filter(func_name, strlen(func_name), 1); + + /* + * Register direct function together with graph tracer + * and make sure we get graph trace. + */ + ftrace_set_filter_ip(&direct, (unsigned long)DYN_FTRACE_TEST_NAME, 0, 0); + ret = register_ftrace_direct(&direct, + (unsigned long)ftrace_stub_direct_tramp); + if (ret) + goto out; + + cond_resched(); + + ret = register_ftrace_graph(&fgraph_ops); + if (ret) { + warn_failed_init_tracer(trace, ret); + goto out; + } + + DYN_FTRACE_TEST_NAME(); + + count = 0; + + tracing_stop(); + /* check the trace buffer */ + ret = trace_test_buffer(&tr->array_buffer, &count); + + unregister_ftrace_graph(&fgraph_ops); + + ret = unregister_ftrace_direct(&direct, + (unsigned long)ftrace_stub_direct_tramp, + true); + if (ret) + goto out; + + cond_resched(); + + tracing_start(); + + if (!ret && !count) { + ret = -1; + goto out; + } + + /* Enable tracing on all functions again */ + ftrace_set_global_filter(NULL, 0, 1); +#endif + + /* Don't test dynamic tracing, the function tracer already did */ +out: + /* Stop it if we failed */ + if (ret) + ftrace_graph_stop(); + + return ret; +} +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + + +#ifdef CONFIG_IRQSOFF_TRACER +int +trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) +{ + unsigned long save_max = tr->max_latency; + unsigned long count; + int ret; + + /* start the tracing */ + ret = tracer_init(trace, tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + return ret; + } + + /* reset the max latency */ + tr->max_latency = 0; + /* disable interrupts for a bit */ + local_irq_disable(); + udelay(100); + local_irq_enable(); + + /* + * Stop the tracer to avoid a warning subsequent + * to buffer flipping failure because tracing_stop() + * disables the tr and max buffers, making flipping impossible + * in case of parallels max irqs off latencies. + */ + trace->stop(tr); + /* stop the tracing. */ + tracing_stop(); + /* check both trace buffers */ + ret = trace_test_buffer(&tr->array_buffer, NULL); + if (!ret) + ret = trace_test_buffer(&tr->max_buffer, &count); + trace->reset(tr); + tracing_start(); + + if (!ret && !count) { + printk(KERN_CONT ".. 
no entries found .."); + ret = -1; + } + + tr->max_latency = save_max; + + return ret; +} +#endif /* CONFIG_IRQSOFF_TRACER */ + +#ifdef CONFIG_PREEMPT_TRACER +int +trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) +{ + unsigned long save_max = tr->max_latency; + unsigned long count; + int ret; + + /* + * Now that the big kernel lock is no longer preemptible, + * and this is called with the BKL held, it will always + * fail. If preemption is already disabled, simply + * pass the test. When the BKL is removed, or becomes + * preemptible again, we will once again test this, + * so keep it in. + */ + if (preempt_count()) { + printk(KERN_CONT "can not test ... force "); + return 0; + } + + /* start the tracing */ + ret = tracer_init(trace, tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + return ret; + } + + /* reset the max latency */ + tr->max_latency = 0; + /* disable preemption for a bit */ + preempt_disable(); + udelay(100); + preempt_enable(); + + /* + * Stop the tracer to avoid a warning subsequent + * to buffer flipping failure because tracing_stop() + * disables the tr and max buffers, making flipping impossible + * in case of parallels max preempt off latencies. + */ + trace->stop(tr); + /* stop the tracing. */ + tracing_stop(); + /* check both trace buffers */ + ret = trace_test_buffer(&tr->array_buffer, NULL); + if (!ret) + ret = trace_test_buffer(&tr->max_buffer, &count); + trace->reset(tr); + tracing_start(); + + if (!ret && !count) { + printk(KERN_CONT ".. no entries found .."); + ret = -1; + } + + tr->max_latency = save_max; + + return ret; +} +#endif /* CONFIG_PREEMPT_TRACER */ + +#if defined(CONFIG_IRQSOFF_TRACER) && defined(CONFIG_PREEMPT_TRACER) +int +trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *tr) +{ + unsigned long save_max = tr->max_latency; + unsigned long count; + int ret; + + /* + * Now that the big kernel lock is no longer preemptible, + * and this is called with the BKL held, it will always + * fail. If preemption is already disabled, simply + * pass the test. When the BKL is removed, or becomes + * preemptible again, we will once again test this, + * so keep it in. + */ + if (preempt_count()) { + printk(KERN_CONT "can not test ... force "); + return 0; + } + + /* start the tracing */ + ret = tracer_init(trace, tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + goto out_no_start; + } + + /* reset the max latency */ + tr->max_latency = 0; + + /* disable preemption and interrupts for a bit */ + preempt_disable(); + local_irq_disable(); + udelay(100); + preempt_enable(); + /* reverse the order of preempt vs irqs */ + local_irq_enable(); + + /* + * Stop the tracer to avoid a warning subsequent + * to buffer flipping failure because tracing_stop() + * disables the tr and max buffers, making flipping impossible + * in case of parallels max irqs/preempt off latencies. + */ + trace->stop(tr); + /* stop the tracing. */ + tracing_stop(); + /* check both trace buffers */ + ret = trace_test_buffer(&tr->array_buffer, NULL); + if (ret) + goto out; + + ret = trace_test_buffer(&tr->max_buffer, &count); + if (ret) + goto out; + + if (!ret && !count) { + printk(KERN_CONT ".. 
no entries found .."); + ret = -1; + goto out; + } + + /* do the test by disabling interrupts first this time */ + tr->max_latency = 0; + tracing_start(); + trace->start(tr); + + preempt_disable(); + local_irq_disable(); + udelay(100); + preempt_enable(); + /* reverse the order of preempt vs irqs */ + local_irq_enable(); + + trace->stop(tr); + /* stop the tracing. */ + tracing_stop(); + /* check both trace buffers */ + ret = trace_test_buffer(&tr->array_buffer, NULL); + if (ret) + goto out; + + ret = trace_test_buffer(&tr->max_buffer, &count); + + if (!ret && !count) { + printk(KERN_CONT ".. no entries found .."); + ret = -1; + goto out; + } + +out: + tracing_start(); +out_no_start: + trace->reset(tr); + tr->max_latency = save_max; + + return ret; +} +#endif /* CONFIG_IRQSOFF_TRACER && CONFIG_PREEMPT_TRACER */ + +#ifdef CONFIG_NOP_TRACER +int +trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr) +{ + /* What could possibly go wrong? */ + return 0; +} +#endif + +#ifdef CONFIG_SCHED_TRACER + +struct wakeup_test_data { + struct completion is_ready; + int go; +}; + +static int trace_wakeup_test_thread(void *data) +{ + /* Make this a -deadline thread */ + static const struct sched_attr attr = { + .sched_policy = SCHED_DEADLINE, + .sched_runtime = 100000ULL, + .sched_deadline = 10000000ULL, + .sched_period = 10000000ULL + }; + struct wakeup_test_data *x = data; + + sched_setattr(current, &attr); + + /* Make it know we have a new prio */ + complete(&x->is_ready); + + /* now go to sleep and let the test wake us up */ + set_current_state(TASK_INTERRUPTIBLE); + while (!x->go) { + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } + + complete(&x->is_ready); + + set_current_state(TASK_INTERRUPTIBLE); + + /* we are awake, now wait to disappear */ + while (!kthread_should_stop()) { + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } + + __set_current_state(TASK_RUNNING); + + return 0; +} +int +trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) +{ + unsigned long save_max = tr->max_latency; + struct task_struct *p; + struct wakeup_test_data data; + unsigned long count; + int ret; + + memset(&data, 0, sizeof(data)); + + init_completion(&data.is_ready); + + /* create a -deadline thread */ + p = kthread_run(trace_wakeup_test_thread, &data, "ftrace-test"); + if (IS_ERR(p)) { + printk(KERN_CONT "Failed to create ftrace wakeup test thread "); + return -1; + } + + /* make sure the thread is running at -deadline policy */ + wait_for_completion(&data.is_ready); + + /* start the tracing */ + ret = tracer_init(trace, tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + return ret; + } + + /* reset the max latency */ + tr->max_latency = 0; + + while (p->on_rq) { + /* + * Sleep to make sure the -deadline thread is asleep too. + * On virtual machines we can't rely on timings, + * but we want to make sure this test still works. + */ + msleep(100); + } + + init_completion(&data.is_ready); + + data.go = 1; + /* memory barrier is in the wake_up_process() */ + + wake_up_process(p); + + /* Wait for the task to wake up */ + wait_for_completion(&data.is_ready); + + /* stop the tracing. */ + tracing_stop(); + /* check both trace buffers */ + ret = trace_test_buffer(&tr->array_buffer, NULL); + if (!ret) + ret = trace_test_buffer(&tr->max_buffer, &count); + + + trace->reset(tr); + tracing_start(); + + tr->max_latency = save_max; + + /* kill the thread */ + kthread_stop(p); + + if (!ret && !count) { + printk(KERN_CONT ".. 
no entries found .."); + ret = -1; + } + + return ret; +} +#endif /* CONFIG_SCHED_TRACER */ + +#ifdef CONFIG_BRANCH_TRACER +int +trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr) +{ + unsigned long count; + int ret; + + /* start the tracing */ + ret = tracer_init(trace, tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + return ret; + } + + /* Sleep for a 1/10 of a second */ + msleep(100); + /* stop the tracing. */ + tracing_stop(); + /* check the trace buffer */ + ret = trace_test_buffer(&tr->array_buffer, &count); + trace->reset(tr); + tracing_start(); + + if (!ret && !count) { + printk(KERN_CONT ".. no entries found .."); + ret = -1; + } + + return ret; +} +#endif /* CONFIG_BRANCH_TRACER */ + diff --git a/kernel/trace/trace_selftest_dynamic.c b/kernel/trace/trace_selftest_dynamic.c new file mode 100644 index 0000000000..c364cf777e --- /dev/null +++ b/kernel/trace/trace_selftest_dynamic.c @@ -0,0 +1,15 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/compiler.h> +#include "trace.h" + +noinline __noclone int DYN_FTRACE_TEST_NAME(void) +{ + /* used to call mcount */ + return 0; +} + +noinline __noclone int DYN_FTRACE_TEST_NAME2(void) +{ + /* used to call mcount */ + return 0; +} diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c new file mode 100644 index 0000000000..bac06ee3b9 --- /dev/null +++ b/kernel/trace/trace_seq.c @@ -0,0 +1,429 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * trace_seq.c + * + * Copyright (C) 2008-2014 Red Hat Inc, Steven Rostedt <srostedt@redhat.com> + * + * The trace_seq is a handy tool that allows you to pass a descriptor around + * to a buffer that other functions can write to. It is similar to the + * seq_file functionality but has some differences. + * + * To use it, the trace_seq must be initialized with trace_seq_init(). + * This will set up the counters within the descriptor. You can call + * trace_seq_init() more than once to reset the trace_seq to start + * from scratch. + * + * The buffer size is currently PAGE_SIZE, although it may become dynamic + * in the future. + * + * A write to the buffer will either succeed or fail. That is, unlike + * sprintf() there will not be a partial write (well it may write into + * the buffer but it wont update the pointers). This allows users to + * try to write something into the trace_seq buffer and if it fails + * they can flush it and try again. + * + */ +#include <linux/uaccess.h> +#include <linux/seq_file.h> +#include <linux/trace_seq.h> + +/* How much buffer is left on the trace_seq? */ +#define TRACE_SEQ_BUF_LEFT(s) seq_buf_buffer_left(&(s)->seq) + +/* + * trace_seq should work with being initialized with 0s. + */ +static inline void __trace_seq_init(struct trace_seq *s) +{ + if (unlikely(!s->seq.size)) + trace_seq_init(s); +} + +/** + * trace_print_seq - move the contents of trace_seq into a seq_file + * @m: the seq_file descriptor that is the destination + * @s: the trace_seq descriptor that is the source. + * + * Returns 0 on success and non zero on error. If it succeeds to + * write to the seq_file it will reset the trace_seq, otherwise + * it does not modify the trace_seq to let the caller try again. + */ +int trace_print_seq(struct seq_file *m, struct trace_seq *s) +{ + int ret; + + __trace_seq_init(s); + + ret = seq_buf_print_seq(m, &s->seq); + + /* + * Only reset this buffer if we successfully wrote to the + * seq_file buffer. This lets the caller try again or + * do something else with the contents. 
+ */ + if (!ret) + trace_seq_init(s); + + return ret; +} + +/** + * trace_seq_printf - sequence printing of trace information + * @s: trace sequence descriptor + * @fmt: printf format string + * + * The tracer may use either sequence operations or its own + * copy to user routines. To simplify formatting of a trace + * trace_seq_printf() is used to store strings into a special + * buffer (@s). Then the output may be either used by + * the sequencer or pulled into another buffer. + */ +void trace_seq_printf(struct trace_seq *s, const char *fmt, ...) +{ + unsigned int save_len = s->seq.len; + va_list ap; + + if (s->full) + return; + + __trace_seq_init(s); + + va_start(ap, fmt); + seq_buf_vprintf(&s->seq, fmt, ap); + va_end(ap); + + /* If we can't write it all, don't bother writing anything */ + if (unlikely(seq_buf_has_overflowed(&s->seq))) { + s->seq.len = save_len; + s->full = 1; + } +} +EXPORT_SYMBOL_GPL(trace_seq_printf); + +/** + * trace_seq_bitmask - write a bitmask array in its ASCII representation + * @s: trace sequence descriptor + * @maskp: points to an array of unsigned longs that represent a bitmask + * @nmaskbits: The number of bits that are valid in @maskp + * + * Writes a ASCII representation of a bitmask string into @s. + */ +void trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp, + int nmaskbits) +{ + unsigned int save_len = s->seq.len; + + if (s->full) + return; + + __trace_seq_init(s); + + seq_buf_printf(&s->seq, "%*pb", nmaskbits, maskp); + + if (unlikely(seq_buf_has_overflowed(&s->seq))) { + s->seq.len = save_len; + s->full = 1; + } +} +EXPORT_SYMBOL_GPL(trace_seq_bitmask); + +/** + * trace_seq_vprintf - sequence printing of trace information + * @s: trace sequence descriptor + * @fmt: printf format string + * @args: Arguments for the format string + * + * The tracer may use either sequence operations or its own + * copy to user routines. To simplify formatting of a trace + * trace_seq_printf is used to store strings into a special + * buffer (@s). Then the output may be either used by + * the sequencer or pulled into another buffer. + */ +void trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args) +{ + unsigned int save_len = s->seq.len; + + if (s->full) + return; + + __trace_seq_init(s); + + seq_buf_vprintf(&s->seq, fmt, args); + + /* If we can't write it all, don't bother writing anything */ + if (unlikely(seq_buf_has_overflowed(&s->seq))) { + s->seq.len = save_len; + s->full = 1; + } +} +EXPORT_SYMBOL_GPL(trace_seq_vprintf); + +/** + * trace_seq_bprintf - Write the printf string from binary arguments + * @s: trace sequence descriptor + * @fmt: The format string for the @binary arguments + * @binary: The binary arguments for @fmt. + * + * When recording in a fast path, a printf may be recorded with just + * saving the format and the arguments as they were passed to the + * function, instead of wasting cycles converting the arguments into + * ASCII characters. Instead, the arguments are saved in a 32 bit + * word array that is defined by the format string constraints. + * + * This function will take the format and the binary array and finish + * the conversion into the ASCII string within the buffer. 
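(Illustration, not part of the patch.) trace_seq_printf() and friends above are all-or-nothing: the length is saved before the write, and on overflow the length is restored and the sequence is marked full, so callers never see a partial write. A self-contained userspace C sketch of that behaviour built on vsnprintf (made-up names, 32-byte buffer):

#include <stdarg.h>
#include <stdio.h>

struct seq {
	char buf[32];
	size_t len;
	int full;
};

static void seq_printf_all_or_nothing(struct seq *s, const char *fmt, ...)
{
	size_t save_len = s->len;
	va_list ap;
	int n;

	if (s->full)
		return;

	va_start(ap, fmt);
	n = vsnprintf(s->buf + s->len, sizeof(s->buf) - s->len, fmt, ap);
	va_end(ap);

	/* Did not fit: drop the partial write and mark the sequence full. */
	if (n < 0 || (size_t)n >= sizeof(s->buf) - s->len) {
		s->buf[save_len] = '\0';
		s->len = save_len;
		s->full = 1;
		return;
	}
	s->len += n;
}

int main(void)
{
	struct seq s = { .len = 0 };

	seq_printf_all_or_nothing(&s, "wakeup latency %d us", 120);
	seq_printf_all_or_nothing(&s, " plus a line that cannot possibly fit in here");
	printf("full=%d buf=\"%s\"\n", s.full, s.buf);
	return 0;
}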
+ */ +void trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) +{ + unsigned int save_len = s->seq.len; + + if (s->full) + return; + + __trace_seq_init(s); + + seq_buf_bprintf(&s->seq, fmt, binary); + + /* If we can't write it all, don't bother writing anything */ + if (unlikely(seq_buf_has_overflowed(&s->seq))) { + s->seq.len = save_len; + s->full = 1; + return; + } +} +EXPORT_SYMBOL_GPL(trace_seq_bprintf); + +/** + * trace_seq_puts - trace sequence printing of simple string + * @s: trace sequence descriptor + * @str: simple string to record + * + * The tracer may use either the sequence operations or its own + * copy to user routines. This function records a simple string + * into a special buffer (@s) for later retrieval by a sequencer + * or other mechanism. + */ +void trace_seq_puts(struct trace_seq *s, const char *str) +{ + unsigned int len = strlen(str); + + if (s->full) + return; + + __trace_seq_init(s); + + if (len > TRACE_SEQ_BUF_LEFT(s)) { + s->full = 1; + return; + } + + seq_buf_putmem(&s->seq, str, len); +} +EXPORT_SYMBOL_GPL(trace_seq_puts); + +/** + * trace_seq_putc - trace sequence printing of simple character + * @s: trace sequence descriptor + * @c: simple character to record + * + * The tracer may use either the sequence operations or its own + * copy to user routines. This function records a simple character + * into a special buffer (@s) for later retrieval by a sequencer + * or other mechanism. + */ +void trace_seq_putc(struct trace_seq *s, unsigned char c) +{ + if (s->full) + return; + + __trace_seq_init(s); + + if (TRACE_SEQ_BUF_LEFT(s) < 1) { + s->full = 1; + return; + } + + seq_buf_putc(&s->seq, c); +} +EXPORT_SYMBOL_GPL(trace_seq_putc); + +/** + * trace_seq_putmem - write raw data into the trace_seq buffer + * @s: trace sequence descriptor + * @mem: The raw memory to copy into the buffer + * @len: The length of the raw memory to copy (in bytes) + * + * There may be cases where raw memory needs to be written into the + * buffer and a strcpy() would not work. Using this function allows + * for such cases. + */ +void trace_seq_putmem(struct trace_seq *s, const void *mem, unsigned int len) +{ + if (s->full) + return; + + __trace_seq_init(s); + + if (len > TRACE_SEQ_BUF_LEFT(s)) { + s->full = 1; + return; + } + + seq_buf_putmem(&s->seq, mem, len); +} +EXPORT_SYMBOL_GPL(trace_seq_putmem); + +/** + * trace_seq_putmem_hex - write raw memory into the buffer in ASCII hex + * @s: trace sequence descriptor + * @mem: The raw memory to write its hex ASCII representation of + * @len: The length of the raw memory to copy (in bytes) + * + * This is similar to trace_seq_putmem() except instead of just copying the + * raw memory into the buffer it writes its ASCII representation of it + * in hex characters. + */ +void trace_seq_putmem_hex(struct trace_seq *s, const void *mem, + unsigned int len) +{ + unsigned int save_len = s->seq.len; + + if (s->full) + return; + + __trace_seq_init(s); + + /* Each byte is represented by two chars */ + if (len * 2 > TRACE_SEQ_BUF_LEFT(s)) { + s->full = 1; + return; + } + + /* The added spaces can still cause an overflow */ + seq_buf_putmem_hex(&s->seq, mem, len); + + if (unlikely(seq_buf_has_overflowed(&s->seq))) { + s->seq.len = save_len; + s->full = 1; + return; + } +} +EXPORT_SYMBOL_GPL(trace_seq_putmem_hex); + +/** + * trace_seq_path - copy a path into the sequence buffer + * @s: trace sequence descriptor + * @path: path to write into the sequence buffer. + * + * Write a path name into the sequence buffer. 
+ * + * Returns 1 if we successfully written all the contents to + * the buffer. + * Returns 0 if we the length to write is bigger than the + * reserved buffer space. In this case, nothing gets written. + */ +int trace_seq_path(struct trace_seq *s, const struct path *path) +{ + unsigned int save_len = s->seq.len; + + if (s->full) + return 0; + + __trace_seq_init(s); + + if (TRACE_SEQ_BUF_LEFT(s) < 1) { + s->full = 1; + return 0; + } + + seq_buf_path(&s->seq, path, "\n"); + + if (unlikely(seq_buf_has_overflowed(&s->seq))) { + s->seq.len = save_len; + s->full = 1; + return 0; + } + + return 1; +} +EXPORT_SYMBOL_GPL(trace_seq_path); + +/** + * trace_seq_to_user - copy the sequence buffer to user space + * @s: trace sequence descriptor + * @ubuf: The userspace memory location to copy to + * @cnt: The amount to copy + * + * Copies the sequence buffer into the userspace memory pointed to + * by @ubuf. It starts from the last read position (@s->readpos) + * and writes up to @cnt characters or till it reaches the end of + * the content in the buffer (@s->len), which ever comes first. + * + * On success, it returns a positive number of the number of bytes + * it copied. + * + * On failure it returns -EBUSY if all of the content in the + * sequence has been already read, which includes nothing in the + * sequence (@s->len == @s->readpos). + * + * Returns -EFAULT if the copy to userspace fails. + */ +int trace_seq_to_user(struct trace_seq *s, char __user *ubuf, int cnt) +{ + __trace_seq_init(s); + return seq_buf_to_user(&s->seq, ubuf, cnt); +} +EXPORT_SYMBOL_GPL(trace_seq_to_user); + +int trace_seq_hex_dump(struct trace_seq *s, const char *prefix_str, + int prefix_type, int rowsize, int groupsize, + const void *buf, size_t len, bool ascii) +{ + unsigned int save_len = s->seq.len; + + if (s->full) + return 0; + + __trace_seq_init(s); + + if (TRACE_SEQ_BUF_LEFT(s) < 1) { + s->full = 1; + return 0; + } + + seq_buf_hex_dump(&(s->seq), prefix_str, + prefix_type, rowsize, groupsize, + buf, len, ascii); + + if (unlikely(seq_buf_has_overflowed(&s->seq))) { + s->seq.len = save_len; + s->full = 1; + return 0; + } + + return 1; +} +EXPORT_SYMBOL(trace_seq_hex_dump); + +/* + * trace_seq_acquire - acquire seq buffer with size len + * @s: trace sequence descriptor + * @len: size of buffer to be acquired + * + * acquire buffer with size of @len from trace_seq for output usage, + * user can fill string into that buffer. + * + * Returns start address of acquired buffer. + * + * it allow multiple usage in one trace output function call. 
+ */ +char *trace_seq_acquire(struct trace_seq *s, unsigned int len) +{ + char *ret = trace_seq_buffer_ptr(s); + + if (!WARN_ON_ONCE(seq_buf_buffer_left(&s->seq) < len)) + seq_buf_commit(&s->seq, len); + + return ret; +} +EXPORT_SYMBOL(trace_seq_acquire); diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c new file mode 100644 index 0000000000..5a48dba912 --- /dev/null +++ b/kernel/trace/trace_stack.c @@ -0,0 +1,582 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com> + * + */ +#include <linux/sched/task_stack.h> +#include <linux/stacktrace.h> +#include <linux/security.h> +#include <linux/kallsyms.h> +#include <linux/seq_file.h> +#include <linux/spinlock.h> +#include <linux/uaccess.h> +#include <linux/ftrace.h> +#include <linux/module.h> +#include <linux/sysctl.h> +#include <linux/init.h> + +#include <asm/setup.h> + +#include "trace.h" + +#define STACK_TRACE_ENTRIES 500 + +static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES]; +static unsigned stack_trace_index[STACK_TRACE_ENTRIES]; + +static unsigned int stack_trace_nr_entries; +static unsigned long stack_trace_max_size; +static arch_spinlock_t stack_trace_max_lock = + (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; + +DEFINE_PER_CPU(int, disable_stack_tracer); +static DEFINE_MUTEX(stack_sysctl_mutex); + +int stack_tracer_enabled; + +static void print_max_stack(void) +{ + long i; + int size; + + pr_emerg(" Depth Size Location (%d entries)\n" + " ----- ---- --------\n", + stack_trace_nr_entries); + + for (i = 0; i < stack_trace_nr_entries; i++) { + if (i + 1 == stack_trace_nr_entries) + size = stack_trace_index[i]; + else + size = stack_trace_index[i] - stack_trace_index[i+1]; + + pr_emerg("%3ld) %8d %5d %pS\n", i, stack_trace_index[i], + size, (void *)stack_dump_trace[i]); + } +} + +/* + * The stack tracer looks for a maximum stack at each call from a function. It + * registers a callback from ftrace, and in that callback it examines the stack + * size. It determines the stack size from the variable passed in, which is the + * address of a local variable in the stack_trace_call() callback function. + * The stack size is calculated by the address of the local variable to the top + * of the current stack. If that size is smaller than the currently saved max + * stack size, nothing more is done. + * + * If the size of the stack is greater than the maximum recorded size, then the + * following algorithm takes place. + * + * For architectures (like x86) that store the function's return address before + * saving the function's local variables, the stack will look something like + * this: + * + * [ top of stack ] + * 0: sys call entry frame + * 10: return addr to entry code + * 11: start of sys_foo frame + * 20: return addr to sys_foo + * 21: start of kernel_func_bar frame + * 30: return addr to kernel_func_bar + * 31: [ do trace stack here ] + * + * The save_stack_trace() is called returning all the functions it finds in the + * current stack. Which would be (from the bottom of the stack to the top): + * + * return addr to kernel_func_bar + * return addr to sys_foo + * return addr to entry code + * + * Now to figure out how much each of these functions' local variable size is, + * a search of the stack is made to find these values. When a match is made, it + * is added to the stack_dump_trace[] array. The offset into the stack is saved + * in the stack_trace_index[] array. 
The above example would show: + * + * stack_dump_trace[] | stack_trace_index[] + * ------------------ + ------------------- + * return addr to kernel_func_bar | 30 + * return addr to sys_foo | 20 + * return addr to entry | 10 + * + * The print_max_stack() function above, uses these values to print the size of + * each function's portion of the stack. + * + * for (i = 0; i < nr_entries; i++) { + * size = i == nr_entries - 1 ? stack_trace_index[i] : + * stack_trace_index[i] - stack_trace_index[i+1] + * print "%d %d %d %s\n", i, stack_trace_index[i], size, stack_dump_trace[i]); + * } + * + * The above shows + * + * depth size location + * ----- ---- -------- + * 0 30 10 kernel_func_bar + * 1 20 10 sys_foo + * 2 10 10 entry code + * + * Now for architectures that might save the return address after the functions + * local variables (saving the link register before calling nested functions), + * this will cause the stack to look a little different: + * + * [ top of stack ] + * 0: sys call entry frame + * 10: start of sys_foo_frame + * 19: return addr to entry code << lr saved before calling kernel_func_bar + * 20: start of kernel_func_bar frame + * 29: return addr to sys_foo_frame << lr saved before calling next function + * 30: [ do trace stack here ] + * + * Although the functions returned by save_stack_trace() may be the same, the + * placement in the stack will be different. Using the same algorithm as above + * would yield: + * + * stack_dump_trace[] | stack_trace_index[] + * ------------------ + ------------------- + * return addr to kernel_func_bar | 30 + * return addr to sys_foo | 29 + * return addr to entry | 19 + * + * Where the mapping is off by one: + * + * kernel_func_bar stack frame size is 29 - 19 not 30 - 29! + * + * To fix this, if the architecture sets ARCH_RET_ADDR_AFTER_LOCAL_VARS the + * values in stack_trace_index[] are shifted by one to and the number of + * stack trace entries is decremented by one. + * + * stack_dump_trace[] | stack_trace_index[] + * ------------------ + ------------------- + * return addr to kernel_func_bar | 29 + * return addr to sys_foo | 19 + * + * Although the entry function is not displayed, the first function (sys_foo) + * will still include the stack size of it. + */ +static void check_stack(unsigned long ip, unsigned long *stack) +{ + unsigned long this_size, flags; unsigned long *p, *top, *start; + static int tracer_frame; + int frame_size = READ_ONCE(tracer_frame); + int i, x; + + this_size = ((unsigned long)stack) & (THREAD_SIZE-1); + this_size = THREAD_SIZE - this_size; + /* Remove the frame of the tracer */ + this_size -= frame_size; + + if (this_size <= stack_trace_max_size) + return; + + /* we do not handle interrupt stacks yet */ + if (!object_is_on_stack(stack)) + return; + + /* Can't do this from NMI context (can cause deadlocks) */ + if (in_nmi()) + return; + + local_irq_save(flags); + arch_spin_lock(&stack_trace_max_lock); + + /* In case another CPU set the tracer_frame on us */ + if (unlikely(!frame_size)) + this_size -= tracer_frame; + + /* a race could have already updated it */ + if (this_size <= stack_trace_max_size) + goto out; + + stack_trace_max_size = this_size; + + stack_trace_nr_entries = stack_trace_save(stack_dump_trace, + ARRAY_SIZE(stack_dump_trace) - 1, + 0); + + /* Skip over the overhead of the stack tracer itself */ + for (i = 0; i < stack_trace_nr_entries; i++) { + if (stack_dump_trace[i] == ip) + break; + } + + /* + * Some archs may not have the passed in ip in the dump. 
+ * If that happens, we need to show everything. + */ + if (i == stack_trace_nr_entries) + i = 0; + + /* + * Now find where in the stack these are. + */ + x = 0; + start = stack; + top = (unsigned long *) + (((unsigned long)start & ~(THREAD_SIZE-1)) + THREAD_SIZE); + + /* + * Loop through all the entries. One of the entries may + * for some reason be missed on the stack, so we may + * have to account for them. If they are all there, this + * loop will only happen once. This code only takes place + * on a new max, so it is far from a fast path. + */ + while (i < stack_trace_nr_entries) { + int found = 0; + + stack_trace_index[x] = this_size; + p = start; + + for (; p < top && i < stack_trace_nr_entries; p++) { + /* + * The READ_ONCE_NOCHECK is used to let KASAN know that + * this is not a stack-out-of-bounds error. + */ + if ((READ_ONCE_NOCHECK(*p)) == stack_dump_trace[i]) { + stack_dump_trace[x] = stack_dump_trace[i++]; + this_size = stack_trace_index[x++] = + (top - p) * sizeof(unsigned long); + found = 1; + /* Start the search from here */ + start = p + 1; + /* + * We do not want to show the overhead + * of the stack tracer stack in the + * max stack. If we haven't figured + * out what that is, then figure it out + * now. + */ + if (unlikely(!tracer_frame)) { + tracer_frame = (p - stack) * + sizeof(unsigned long); + stack_trace_max_size -= tracer_frame; + } + } + } + + if (!found) + i++; + } + +#ifdef ARCH_FTRACE_SHIFT_STACK_TRACER + /* + * Some archs will store the link register before calling + * nested functions. This means the saved return address + * comes after the local storage, and we need to shift + * for that. + */ + if (x > 1) { + memmove(&stack_trace_index[0], &stack_trace_index[1], + sizeof(stack_trace_index[0]) * (x - 1)); + x--; + } +#endif + + stack_trace_nr_entries = x; + + if (task_stack_end_corrupted(current)) { + print_max_stack(); + BUG(); + } + + out: + arch_spin_unlock(&stack_trace_max_lock); + local_irq_restore(flags); +} + +/* Some archs may not define MCOUNT_INSN_SIZE */ +#ifndef MCOUNT_INSN_SIZE +# define MCOUNT_INSN_SIZE 0 +#endif + +static void +stack_trace_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct ftrace_regs *fregs) +{ + unsigned long stack; + + preempt_disable_notrace(); + + /* no atomic needed, we only modify this variable by this cpu */ + __this_cpu_inc(disable_stack_tracer); + if (__this_cpu_read(disable_stack_tracer) != 1) + goto out; + + /* If rcu is not watching, then save stack trace can fail */ + if (!rcu_is_watching()) + goto out; + + ip += MCOUNT_INSN_SIZE; + + check_stack(ip, &stack); + + out: + __this_cpu_dec(disable_stack_tracer); + /* prevent recursion in schedule */ + preempt_enable_notrace(); +} + +static struct ftrace_ops trace_ops __read_mostly = +{ + .func = stack_trace_call, +}; + +static ssize_t +stack_max_size_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *ppos) +{ + unsigned long *ptr = filp->private_data; + char buf[64]; + int r; + + r = snprintf(buf, sizeof(buf), "%ld\n", *ptr); + if (r > sizeof(buf)) + r = sizeof(buf); + return simple_read_from_buffer(ubuf, count, ppos, buf, r); +} + +static ssize_t +stack_max_size_write(struct file *filp, const char __user *ubuf, + size_t count, loff_t *ppos) +{ + long *ptr = filp->private_data; + unsigned long val, flags; + int ret; + + ret = kstrtoul_from_user(ubuf, count, 10, &val); + if (ret) + return ret; + + local_irq_save(flags); + + /* + * In case we trace inside arch_spin_lock() or after (NMI), + * we will cause circular lock, so 
we also need to increase + * the percpu disable_stack_tracer here. + */ + __this_cpu_inc(disable_stack_tracer); + + arch_spin_lock(&stack_trace_max_lock); + *ptr = val; + arch_spin_unlock(&stack_trace_max_lock); + + __this_cpu_dec(disable_stack_tracer); + local_irq_restore(flags); + + return count; +} + +static const struct file_operations stack_max_size_fops = { + .open = tracing_open_generic, + .read = stack_max_size_read, + .write = stack_max_size_write, + .llseek = default_llseek, +}; + +static void * +__next(struct seq_file *m, loff_t *pos) +{ + long n = *pos - 1; + + if (n >= stack_trace_nr_entries) + return NULL; + + m->private = (void *)n; + return &m->private; +} + +static void * +t_next(struct seq_file *m, void *v, loff_t *pos) +{ + (*pos)++; + return __next(m, pos); +} + +static void *t_start(struct seq_file *m, loff_t *pos) +{ + local_irq_disable(); + + __this_cpu_inc(disable_stack_tracer); + + arch_spin_lock(&stack_trace_max_lock); + + if (*pos == 0) + return SEQ_START_TOKEN; + + return __next(m, pos); +} + +static void t_stop(struct seq_file *m, void *p) +{ + arch_spin_unlock(&stack_trace_max_lock); + + __this_cpu_dec(disable_stack_tracer); + + local_irq_enable(); +} + +static void trace_lookup_stack(struct seq_file *m, long i) +{ + unsigned long addr = stack_dump_trace[i]; + + seq_printf(m, "%pS\n", (void *)addr); +} + +static void print_disabled(struct seq_file *m) +{ + seq_puts(m, "#\n" + "# Stack tracer disabled\n" + "#\n" + "# To enable the stack tracer, either add 'stacktrace' to the\n" + "# kernel command line\n" + "# or 'echo 1 > /proc/sys/kernel/stack_tracer_enabled'\n" + "#\n"); +} + +static int t_show(struct seq_file *m, void *v) +{ + long i; + int size; + + if (v == SEQ_START_TOKEN) { + seq_printf(m, " Depth Size Location" + " (%d entries)\n" + " ----- ---- --------\n", + stack_trace_nr_entries); + + if (!stack_tracer_enabled && !stack_trace_max_size) + print_disabled(m); + + return 0; + } + + i = *(long *)v; + + if (i >= stack_trace_nr_entries) + return 0; + + if (i + 1 == stack_trace_nr_entries) + size = stack_trace_index[i]; + else + size = stack_trace_index[i] - stack_trace_index[i+1]; + + seq_printf(m, "%3ld) %8d %5d ", i, stack_trace_index[i], size); + + trace_lookup_stack(m, i); + + return 0; +} + +static const struct seq_operations stack_trace_seq_ops = { + .start = t_start, + .next = t_next, + .stop = t_stop, + .show = t_show, +}; + +static int stack_trace_open(struct inode *inode, struct file *file) +{ + int ret; + + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + + return seq_open(file, &stack_trace_seq_ops); +} + +static const struct file_operations stack_trace_fops = { + .open = stack_trace_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +#ifdef CONFIG_DYNAMIC_FTRACE + +static int +stack_trace_filter_open(struct inode *inode, struct file *file) +{ + struct ftrace_ops *ops = inode->i_private; + + /* Checks for tracefs lockdown */ + return ftrace_regex_open(ops, FTRACE_ITER_FILTER, + inode, file); +} + +static const struct file_operations stack_trace_filter_fops = { + .open = stack_trace_filter_open, + .read = seq_read, + .write = ftrace_filter_write, + .llseek = tracing_lseek, + .release = ftrace_regex_release, +}; + +#endif /* CONFIG_DYNAMIC_FTRACE */ + +int +stack_trace_sysctl(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) +{ + int was_enabled; + int ret; + + mutex_lock(&stack_sysctl_mutex); + was_enabled = !!stack_tracer_enabled; + + ret = proc_dointvec(table, 
write, buffer, lenp, ppos); + + if (ret || !write || (was_enabled == !!stack_tracer_enabled)) + goto out; + + if (stack_tracer_enabled) + register_ftrace_function(&trace_ops); + else + unregister_ftrace_function(&trace_ops); + out: + mutex_unlock(&stack_sysctl_mutex); + return ret; +} + +static char stack_trace_filter_buf[COMMAND_LINE_SIZE+1] __initdata; + +static __init int enable_stacktrace(char *str) +{ + int len; + + if ((len = str_has_prefix(str, "_filter="))) + strncpy(stack_trace_filter_buf, str + len, COMMAND_LINE_SIZE); + + stack_tracer_enabled = 1; + return 1; +} +__setup("stacktrace", enable_stacktrace); + +static __init int stack_trace_init(void) +{ + int ret; + + ret = tracing_init_dentry(); + if (ret) + return 0; + + trace_create_file("stack_max_size", TRACE_MODE_WRITE, NULL, + &stack_trace_max_size, &stack_max_size_fops); + + trace_create_file("stack_trace", TRACE_MODE_READ, NULL, + NULL, &stack_trace_fops); + +#ifdef CONFIG_DYNAMIC_FTRACE + trace_create_file("stack_trace_filter", TRACE_MODE_WRITE, NULL, + &trace_ops, &stack_trace_filter_fops); +#endif + + if (stack_trace_filter_buf[0]) + ftrace_set_early_filter(&trace_ops, stack_trace_filter_buf, 1); + + if (stack_tracer_enabled) + register_ftrace_function(&trace_ops); + + return 0; +} + +device_initcall(stack_trace_init); diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c new file mode 100644 index 0000000000..bb247beec4 --- /dev/null +++ b/kernel/trace/trace_stat.c @@ -0,0 +1,364 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Infrastructure for statistic tracing (histogram output). + * + * Copyright (C) 2008-2009 Frederic Weisbecker <fweisbec@gmail.com> + * + * Based on the code from trace_branch.c which is + * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com> + * + */ + +#include <linux/security.h> +#include <linux/list.h> +#include <linux/slab.h> +#include <linux/rbtree.h> +#include <linux/tracefs.h> +#include "trace_stat.h" +#include "trace.h" + + +/* + * List of stat red-black nodes from a tracer + * We use a such tree to sort quickly the stat + * entries from the tracer. + */ +struct stat_node { + struct rb_node node; + void *stat; +}; + +/* A stat session is the stats output in one file */ +struct stat_session { + struct list_head session_list; + struct tracer_stat *ts; + struct rb_root stat_root; + struct mutex stat_mutex; + struct dentry *file; +}; + +/* All of the sessions currently in use. 
Each stat file embed one session */ +static LIST_HEAD(all_stat_sessions); +static DEFINE_MUTEX(all_stat_sessions_mutex); + +/* The root directory for all stat files */ +static struct dentry *stat_dir; + +static void __reset_stat_session(struct stat_session *session) +{ + struct stat_node *snode, *n; + + rbtree_postorder_for_each_entry_safe(snode, n, &session->stat_root, node) { + if (session->ts->stat_release) + session->ts->stat_release(snode->stat); + kfree(snode); + } + + session->stat_root = RB_ROOT; +} + +static void reset_stat_session(struct stat_session *session) +{ + mutex_lock(&session->stat_mutex); + __reset_stat_session(session); + mutex_unlock(&session->stat_mutex); +} + +static void destroy_session(struct stat_session *session) +{ + tracefs_remove(session->file); + __reset_stat_session(session); + mutex_destroy(&session->stat_mutex); + kfree(session); +} + +static int insert_stat(struct rb_root *root, void *stat, cmp_func_t cmp) +{ + struct rb_node **new = &(root->rb_node), *parent = NULL; + struct stat_node *data; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + data->stat = stat; + + /* + * Figure out where to put new node + * This is a descendent sorting + */ + while (*new) { + struct stat_node *this; + int result; + + this = container_of(*new, struct stat_node, node); + result = cmp(data->stat, this->stat); + + parent = *new; + if (result >= 0) + new = &((*new)->rb_left); + else + new = &((*new)->rb_right); + } + + rb_link_node(&data->node, parent, new); + rb_insert_color(&data->node, root); + return 0; +} + +/* + * For tracers that don't provide a stat_cmp callback. + * This one will force an insertion as right-most node + * in the rbtree. + */ +static int dummy_cmp(const void *p1, const void *p2) +{ + return -1; +} + +/* + * Initialize the stat rbtree at each trace_stat file opening. + * All of these copies and sorting are required on all opening + * since the stats could have changed between two file sessions. + */ +static int stat_seq_init(struct stat_session *session) +{ + struct tracer_stat *ts = session->ts; + struct rb_root *root = &session->stat_root; + void *stat; + int ret = 0; + int i; + + mutex_lock(&session->stat_mutex); + __reset_stat_session(session); + + if (!ts->stat_cmp) + ts->stat_cmp = dummy_cmp; + + stat = ts->stat_start(ts); + if (!stat) + goto exit; + + ret = insert_stat(root, stat, ts->stat_cmp); + if (ret) + goto exit; + + /* + * Iterate over the tracer stat entries and store them in an rbtree. 
+ */ + for (i = 1; ; i++) { + stat = ts->stat_next(stat, i); + + /* End of insertion */ + if (!stat) + break; + + ret = insert_stat(root, stat, ts->stat_cmp); + if (ret) + goto exit_free_rbtree; + } + +exit: + mutex_unlock(&session->stat_mutex); + return ret; + +exit_free_rbtree: + __reset_stat_session(session); + mutex_unlock(&session->stat_mutex); + return ret; +} + + +static void *stat_seq_start(struct seq_file *s, loff_t *pos) +{ + struct stat_session *session = s->private; + struct rb_node *node; + int n = *pos; + int i; + + /* Prevent from tracer switch or rbtree modification */ + mutex_lock(&session->stat_mutex); + + /* If we are in the beginning of the file, print the headers */ + if (session->ts->stat_headers) { + if (n == 0) + return SEQ_START_TOKEN; + n--; + } + + node = rb_first(&session->stat_root); + for (i = 0; node && i < n; i++) + node = rb_next(node); + + return node; +} + +static void *stat_seq_next(struct seq_file *s, void *p, loff_t *pos) +{ + struct stat_session *session = s->private; + struct rb_node *node = p; + + (*pos)++; + + if (p == SEQ_START_TOKEN) + return rb_first(&session->stat_root); + + return rb_next(node); +} + +static void stat_seq_stop(struct seq_file *s, void *p) +{ + struct stat_session *session = s->private; + mutex_unlock(&session->stat_mutex); +} + +static int stat_seq_show(struct seq_file *s, void *v) +{ + struct stat_session *session = s->private; + struct stat_node *l = container_of(v, struct stat_node, node); + + if (v == SEQ_START_TOKEN) + return session->ts->stat_headers(s); + + return session->ts->stat_show(s, l->stat); +} + +static const struct seq_operations trace_stat_seq_ops = { + .start = stat_seq_start, + .next = stat_seq_next, + .stop = stat_seq_stop, + .show = stat_seq_show +}; + +/* The session stat is refilled and resorted at each stat file opening */ +static int tracing_stat_open(struct inode *inode, struct file *file) +{ + int ret; + struct seq_file *m; + struct stat_session *session = inode->i_private; + + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + + ret = stat_seq_init(session); + if (ret) + return ret; + + ret = seq_open(file, &trace_stat_seq_ops); + if (ret) { + reset_stat_session(session); + return ret; + } + + m = file->private_data; + m->private = session; + return ret; +} + +/* + * Avoid consuming memory with our now useless rbtree. 
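The stat_seq_start()/stat_seq_next()/stat_seq_stop() callbacks above follow the usual seq_file contract: start() takes the lock and maps a read position to a node, returning SEQ_START_TOKEN once so the header row gets its own show() call, next() advances, and stop() is always reached so the lock is dropped even on error paths. Below is a stripped-down user-space model of that contract, with invented names and a plain array standing in for the rbtree.

#include <stdio.h>
#include <pthread.h>

#define START_TOKEN ((void *)1)	/* stands in for SEQ_START_TOKEN */

static pthread_mutex_t stat_lock = PTHREAD_MUTEX_INITIALIZER;
static int stats[] = { 40, 30, 10 };
#define NR_STATS 3

/* start: lock the data and map the position to an entry (or the header token) */
static void *it_start(long *pos)
{
	pthread_mutex_lock(&stat_lock);
	if (*pos == 0)
		return START_TOKEN;
	return (*pos <= NR_STATS) ? &stats[*pos - 1] : NULL;
}

/* next: advance the position and return the matching entry, or NULL at the end */
static void *it_next(void *v, long *pos)
{
	(*pos)++;
	if (v == START_TOKEN)
		return &stats[0];
	return (*pos <= NR_STATS) ? &stats[*pos - 1] : NULL;
}

/* stop: always called, so the lock taken in start() is released */
static void it_stop(void *v)
{
	(void)v;
	pthread_mutex_unlock(&stat_lock);
}

static void it_show(void *v)
{
	if (v == START_TOKEN)
		printf("value\n-----\n");
	else
		printf("%d\n", *(int *)v);
}

int main(void)
{
	long pos = 0;
	void *v = it_start(&pos);

	while (v) {
		it_show(v);
		v = it_next(v, &pos);
	}
	it_stop(NULL);
	return 0;
}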
+ */ +static int tracing_stat_release(struct inode *i, struct file *f) +{ + struct stat_session *session = i->i_private; + + reset_stat_session(session); + + return seq_release(i, f); +} + +static const struct file_operations tracing_stat_fops = { + .open = tracing_stat_open, + .read = seq_read, + .llseek = seq_lseek, + .release = tracing_stat_release +}; + +static int tracing_stat_init(void) +{ + int ret; + + ret = tracing_init_dentry(); + if (ret) + return -ENODEV; + + stat_dir = tracefs_create_dir("trace_stat", NULL); + if (!stat_dir) { + pr_warn("Could not create tracefs 'trace_stat' entry\n"); + return -ENOMEM; + } + return 0; +} + +static int init_stat_file(struct stat_session *session) +{ + int ret; + + if (!stat_dir && (ret = tracing_stat_init())) + return ret; + + session->file = tracefs_create_file(session->ts->name, TRACE_MODE_WRITE, + stat_dir, session, + &tracing_stat_fops); + if (!session->file) + return -ENOMEM; + return 0; +} + +int register_stat_tracer(struct tracer_stat *trace) +{ + struct stat_session *session, *node; + int ret = -EINVAL; + + if (!trace) + return -EINVAL; + + if (!trace->stat_start || !trace->stat_next || !trace->stat_show) + return -EINVAL; + + /* Already registered? */ + mutex_lock(&all_stat_sessions_mutex); + list_for_each_entry(node, &all_stat_sessions, session_list) { + if (node->ts == trace) + goto out; + } + + ret = -ENOMEM; + /* Init the session */ + session = kzalloc(sizeof(*session), GFP_KERNEL); + if (!session) + goto out; + + session->ts = trace; + INIT_LIST_HEAD(&session->session_list); + mutex_init(&session->stat_mutex); + + ret = init_stat_file(session); + if (ret) { + destroy_session(session); + goto out; + } + + ret = 0; + /* Register */ + list_add_tail(&session->session_list, &all_stat_sessions); + out: + mutex_unlock(&all_stat_sessions_mutex); + + return ret; +} + +void unregister_stat_tracer(struct tracer_stat *trace) +{ + struct stat_session *node, *tmp; + + mutex_lock(&all_stat_sessions_mutex); + list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) { + if (node->ts == trace) { + list_del(&node->session_list); + destroy_session(node); + break; + } + } + mutex_unlock(&all_stat_sessions_mutex); +} diff --git a/kernel/trace/trace_stat.h b/kernel/trace/trace_stat.h new file mode 100644 index 0000000000..31d7dc5bf1 --- /dev/null +++ b/kernel/trace/trace_stat.h @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef __TRACE_STAT_H +#define __TRACE_STAT_H + +#include <linux/seq_file.h> + +/* + * If you want to provide a stat file (one-shot statistics), fill + * an iterator with stat_start/stat_next and a stat_show callbacks. + * The others callbacks are optional. 
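register_stat_tracer() above takes all_stat_sessions_mutex, walks the session list to reject a duplicate registration, then allocates and links a new session; unregister_stat_tracer() walks the same list to unlink and destroy it. The sketch below models that register/duplicate-check/unregister shape in user space; the tracer_desc and session types are stand-ins for the kernel structures, and the locking is omitted because the example is single-threaded.

#include <stdio.h>
#include <stdlib.h>

struct tracer_desc {		/* stand-in for struct tracer_stat */
	const char *name;
};

struct session {		/* stand-in for struct stat_session */
	struct tracer_desc *ts;
	struct session *next;
};

static struct session *sessions;	/* head of the registration list */

static int register_tracer(struct tracer_desc *ts)
{
	struct session *s;

	for (s = sessions; s; s = s->next)	/* already registered? */
		if (s->ts == ts)
			return -1;

	s = calloc(1, sizeof(*s));
	if (!s)
		return -1;
	s->ts = ts;
	s->next = sessions;
	sessions = s;
	return 0;
}

static void unregister_tracer(struct tracer_desc *ts)
{
	struct session **p = &sessions;

	while (*p) {
		if ((*p)->ts == ts) {
			struct session *dead = *p;
			*p = dead->next;
			free(dead);
			return;
		}
		p = &(*p)->next;
	}
}

int main(void)
{
	struct tracer_desc branch = { .name = "branch_stats" };

	printf("first register:  %d\n", register_tracer(&branch));	/* 0 */
	printf("second register: %d\n", register_tracer(&branch));	/* -1: duplicate */
	unregister_tracer(&branch);
	printf("re-register:     %d\n", register_tracer(&branch));	/* 0 */
	return 0;
}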
+ */ +struct tracer_stat { + /* The name of your stat file */ + const char *name; + /* Iteration over statistic entries */ + void *(*stat_start)(struct tracer_stat *trace); + void *(*stat_next)(void *prev, int idx); + /* Compare two entries for stats sorting */ + cmp_func_t stat_cmp; + /* Print a stat entry */ + int (*stat_show)(struct seq_file *s, void *p); + /* Release an entry */ + void (*stat_release)(void *stat); + /* Print the headers of your stat entries */ + int (*stat_headers)(struct seq_file *s); +}; + +/* + * Destroy or create a stat file + */ +extern int register_stat_tracer(struct tracer_stat *trace); +extern void unregister_stat_tracer(struct tracer_stat *trace); + +#endif /* __TRACE_STAT_H */ diff --git a/kernel/trace/trace_synth.h b/kernel/trace/trace_synth.h new file mode 100644 index 0000000000..43f6fb6078 --- /dev/null +++ b/kernel/trace/trace_synth.h @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef __TRACE_SYNTH_H +#define __TRACE_SYNTH_H + +#include "trace_dynevent.h" + +#define SYNTH_SYSTEM "synthetic" +#define SYNTH_FIELDS_MAX 64 + +#define STR_VAR_LEN_MAX MAX_FILTER_STR_VAL /* must be multiple of sizeof(u64) */ + +struct synth_field { + char *type; + char *name; + size_t size; + unsigned int offset; + unsigned int field_pos; + bool is_signed; + bool is_string; + bool is_dynamic; + bool is_stack; +}; + +struct synth_event { + struct dyn_event devent; + int ref; + char *name; + struct synth_field **fields; + unsigned int n_fields; + struct synth_field **dynamic_fields; + unsigned int n_dynamic_fields; + unsigned int n_u64; + struct trace_event_class class; + struct trace_event_call call; + struct tracepoint *tp; + struct module *mod; +}; + +extern struct synth_event *find_synth_event(const char *name); + +#endif /* __TRACE_SYNTH_H */ diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c new file mode 100644 index 0000000000..de753403cd --- /dev/null +++ b/kernel/trace/trace_syscalls.c @@ -0,0 +1,812 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <trace/syscall.h> +#include <trace/events/syscalls.h> +#include <linux/syscalls.h> +#include <linux/slab.h> +#include <linux/kernel.h> +#include <linux/module.h> /* for MODULE_NAME_LEN via KSYM_SYMBOL_LEN */ +#include <linux/ftrace.h> +#include <linux/perf_event.h> +#include <linux/xarray.h> +#include <asm/syscall.h> + +#include "trace_output.h" +#include "trace.h" + +static DEFINE_MUTEX(syscall_trace_lock); + +static int syscall_enter_register(struct trace_event_call *event, + enum trace_reg type, void *data); +static int syscall_exit_register(struct trace_event_call *event, + enum trace_reg type, void *data); + +static struct list_head * +syscall_get_enter_fields(struct trace_event_call *call) +{ + struct syscall_metadata *entry = call->data; + + return &entry->enter_fields; +} + +extern struct syscall_metadata *__start_syscalls_metadata[]; +extern struct syscall_metadata *__stop_syscalls_metadata[]; + +static DEFINE_XARRAY(syscalls_metadata_sparse); +static struct syscall_metadata **syscalls_metadata; + +#ifndef ARCH_HAS_SYSCALL_MATCH_SYM_NAME +static inline bool arch_syscall_match_sym_name(const char *sym, const char *name) +{ + /* + * Only compare after the "sys" prefix. Archs that use + * syscall wrappers may have syscalls symbols aliases prefixed + * with ".SyS" or ".sys" instead of "sys", leading to an unwanted + * mismatch. 
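The strcmp(sym + 3, name + 3) comparison that implements this skips the first three characters on both sides, so a wrapper alias such as "SyS_openat" still matches the recorded metadata name "sys_openat". A tiny user-space illustration, with symbol names chosen only for the example:

#include <stdio.h>
#include <string.h>

/* Compare symbol and metadata name while ignoring the 3-byte
 * "sys"/"SyS" prefix, as the generic helper above does. */
static int match_sym_name(const char *sym, const char *name)
{
	return strcmp(sym + 3, name + 3) == 0;
}

int main(void)
{
	printf("%d\n", match_sym_name("SyS_openat", "sys_openat"));	/* 1 */
	printf("%d\n", match_sym_name("sys_openat", "sys_openat"));	/* 1 */
	printf("%d\n", match_sym_name("sys_openat", "sys_read"));	/* 0 */
	return 0;
}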
+ */ + return !strcmp(sym + 3, name + 3); +} +#endif + +#ifdef ARCH_TRACE_IGNORE_COMPAT_SYSCALLS +/* + * Some architectures that allow for 32bit applications + * to run on a 64bit kernel, do not map the syscalls for + * the 32bit tasks the same as they do for 64bit tasks. + * + * *cough*x86*cough* + * + * In such a case, instead of reporting the wrong syscalls, + * simply ignore them. + * + * For an arch to ignore the compat syscalls it needs to + * define ARCH_TRACE_IGNORE_COMPAT_SYSCALLS as well as + * define the function arch_trace_is_compat_syscall() to let + * the tracing system know that it should ignore it. + */ +static int +trace_get_syscall_nr(struct task_struct *task, struct pt_regs *regs) +{ + if (unlikely(arch_trace_is_compat_syscall(regs))) + return -1; + + return syscall_get_nr(task, regs); +} +#else +static inline int +trace_get_syscall_nr(struct task_struct *task, struct pt_regs *regs) +{ + return syscall_get_nr(task, regs); +} +#endif /* ARCH_TRACE_IGNORE_COMPAT_SYSCALLS */ + +static __init struct syscall_metadata * +find_syscall_meta(unsigned long syscall) +{ + struct syscall_metadata **start; + struct syscall_metadata **stop; + char str[KSYM_SYMBOL_LEN]; + + + start = __start_syscalls_metadata; + stop = __stop_syscalls_metadata; + kallsyms_lookup(syscall, NULL, NULL, NULL, str); + + if (arch_syscall_match_sym_name(str, "sys_ni_syscall")) + return NULL; + + for ( ; start < stop; start++) { + if ((*start)->name && arch_syscall_match_sym_name(str, (*start)->name)) + return *start; + } + return NULL; +} + +static struct syscall_metadata *syscall_nr_to_meta(int nr) +{ + if (IS_ENABLED(CONFIG_HAVE_SPARSE_SYSCALL_NR)) + return xa_load(&syscalls_metadata_sparse, (unsigned long)nr); + + if (!syscalls_metadata || nr >= NR_syscalls || nr < 0) + return NULL; + + return syscalls_metadata[nr]; +} + +const char *get_syscall_name(int syscall) +{ + struct syscall_metadata *entry; + + entry = syscall_nr_to_meta(syscall); + if (!entry) + return NULL; + + return entry->name; +} + +static enum print_line_t +print_syscall_enter(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct trace_array *tr = iter->tr; + struct trace_seq *s = &iter->seq; + struct trace_entry *ent = iter->ent; + struct syscall_trace_enter *trace; + struct syscall_metadata *entry; + int i, syscall; + + trace = (typeof(trace))ent; + syscall = trace->nr; + entry = syscall_nr_to_meta(syscall); + + if (!entry) + goto end; + + if (entry->enter_event->event.type != ent->type) { + WARN_ON_ONCE(1); + goto end; + } + + trace_seq_printf(s, "%s(", entry->name); + + for (i = 0; i < entry->nb_args; i++) { + + if (trace_seq_has_overflowed(s)) + goto end; + + /* parameter types */ + if (tr && tr->trace_flags & TRACE_ITER_VERBOSE) + trace_seq_printf(s, "%s ", entry->types[i]); + + /* parameter values */ + trace_seq_printf(s, "%s: %lx%s", entry->args[i], + trace->args[i], + i == entry->nb_args - 1 ? 
"" : ", "); + } + + trace_seq_putc(s, ')'); +end: + trace_seq_putc(s, '\n'); + + return trace_handle_return(s); +} + +static enum print_line_t +print_syscall_exit(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *ent = iter->ent; + struct syscall_trace_exit *trace; + int syscall; + struct syscall_metadata *entry; + + trace = (typeof(trace))ent; + syscall = trace->nr; + entry = syscall_nr_to_meta(syscall); + + if (!entry) { + trace_seq_putc(s, '\n'); + goto out; + } + + if (entry->exit_event->event.type != ent->type) { + WARN_ON_ONCE(1); + return TRACE_TYPE_UNHANDLED; + } + + trace_seq_printf(s, "%s -> 0x%lx\n", entry->name, + trace->ret); + + out: + return trace_handle_return(s); +} + +#define SYSCALL_FIELD(_type, _name) { \ + .type = #_type, .name = #_name, \ + .size = sizeof(_type), .align = __alignof__(_type), \ + .is_signed = is_signed_type(_type), .filter_type = FILTER_OTHER } + +static int __init +__set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len) +{ + int i; + int pos = 0; + + /* When len=0, we just calculate the needed length */ +#define LEN_OR_ZERO (len ? len - pos : 0) + + pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); + for (i = 0; i < entry->nb_args; i++) { + pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx%s", + entry->args[i], sizeof(unsigned long), + i == entry->nb_args - 1 ? "" : ", "); + } + pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); + + for (i = 0; i < entry->nb_args; i++) { + pos += snprintf(buf + pos, LEN_OR_ZERO, + ", ((unsigned long)(REC->%s))", entry->args[i]); + } + +#undef LEN_OR_ZERO + + /* return the length of print_fmt */ + return pos; +} + +static int __init set_syscall_print_fmt(struct trace_event_call *call) +{ + char *print_fmt; + int len; + struct syscall_metadata *entry = call->data; + + if (entry->enter_event != call) { + call->print_fmt = "\"0x%lx\", REC->ret"; + return 0; + } + + /* First: called with 0 length to calculate the needed length */ + len = __set_enter_print_fmt(entry, NULL, 0); + + print_fmt = kmalloc(len + 1, GFP_KERNEL); + if (!print_fmt) + return -ENOMEM; + + /* Second: actually write the @print_fmt */ + __set_enter_print_fmt(entry, print_fmt, len + 1); + call->print_fmt = print_fmt; + + return 0; +} + +static void __init free_syscall_print_fmt(struct trace_event_call *call) +{ + struct syscall_metadata *entry = call->data; + + if (entry->enter_event == call) + kfree(call->print_fmt); +} + +static int __init syscall_enter_define_fields(struct trace_event_call *call) +{ + struct syscall_trace_enter trace; + struct syscall_metadata *meta = call->data; + int offset = offsetof(typeof(trace), args); + int ret = 0; + int i; + + for (i = 0; i < meta->nb_args; i++) { + ret = trace_define_field(call, meta->types[i], + meta->args[i], offset, + sizeof(unsigned long), 0, + FILTER_OTHER); + if (ret) + break; + offset += sizeof(unsigned long); + } + + return ret; +} + +static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) +{ + struct trace_array *tr = data; + struct trace_event_file *trace_file; + struct syscall_trace_enter *entry; + struct syscall_metadata *sys_data; + struct trace_event_buffer fbuffer; + unsigned long args[6]; + int syscall_nr; + int size; + + syscall_nr = trace_get_syscall_nr(current, regs); + if (syscall_nr < 0 || syscall_nr >= NR_syscalls) + return; + + /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE) */ + trace_file = rcu_dereference_sched(tr->enter_syscall_files[syscall_nr]); + 
if (!trace_file) + return; + + if (trace_trigger_soft_disabled(trace_file)) + return; + + sys_data = syscall_nr_to_meta(syscall_nr); + if (!sys_data) + return; + + size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; + + entry = trace_event_buffer_reserve(&fbuffer, trace_file, size); + if (!entry) + return; + + entry = ring_buffer_event_data(fbuffer.event); + entry->nr = syscall_nr; + syscall_get_arguments(current, regs, args); + memcpy(entry->args, args, sizeof(unsigned long) * sys_data->nb_args); + + trace_event_buffer_commit(&fbuffer); +} + +static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) +{ + struct trace_array *tr = data; + struct trace_event_file *trace_file; + struct syscall_trace_exit *entry; + struct syscall_metadata *sys_data; + struct trace_event_buffer fbuffer; + int syscall_nr; + + syscall_nr = trace_get_syscall_nr(current, regs); + if (syscall_nr < 0 || syscall_nr >= NR_syscalls) + return; + + /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE()) */ + trace_file = rcu_dereference_sched(tr->exit_syscall_files[syscall_nr]); + if (!trace_file) + return; + + if (trace_trigger_soft_disabled(trace_file)) + return; + + sys_data = syscall_nr_to_meta(syscall_nr); + if (!sys_data) + return; + + entry = trace_event_buffer_reserve(&fbuffer, trace_file, sizeof(*entry)); + if (!entry) + return; + + entry = ring_buffer_event_data(fbuffer.event); + entry->nr = syscall_nr; + entry->ret = syscall_get_return_value(current, regs); + + trace_event_buffer_commit(&fbuffer); +} + +static int reg_event_syscall_enter(struct trace_event_file *file, + struct trace_event_call *call) +{ + struct trace_array *tr = file->tr; + int ret = 0; + int num; + + num = ((struct syscall_metadata *)call->data)->syscall_nr; + if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) + return -ENOSYS; + mutex_lock(&syscall_trace_lock); + if (!tr->sys_refcount_enter) + ret = register_trace_sys_enter(ftrace_syscall_enter, tr); + if (!ret) { + rcu_assign_pointer(tr->enter_syscall_files[num], file); + tr->sys_refcount_enter++; + } + mutex_unlock(&syscall_trace_lock); + return ret; +} + +static void unreg_event_syscall_enter(struct trace_event_file *file, + struct trace_event_call *call) +{ + struct trace_array *tr = file->tr; + int num; + + num = ((struct syscall_metadata *)call->data)->syscall_nr; + if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) + return; + mutex_lock(&syscall_trace_lock); + tr->sys_refcount_enter--; + RCU_INIT_POINTER(tr->enter_syscall_files[num], NULL); + if (!tr->sys_refcount_enter) + unregister_trace_sys_enter(ftrace_syscall_enter, tr); + mutex_unlock(&syscall_trace_lock); +} + +static int reg_event_syscall_exit(struct trace_event_file *file, + struct trace_event_call *call) +{ + struct trace_array *tr = file->tr; + int ret = 0; + int num; + + num = ((struct syscall_metadata *)call->data)->syscall_nr; + if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) + return -ENOSYS; + mutex_lock(&syscall_trace_lock); + if (!tr->sys_refcount_exit) + ret = register_trace_sys_exit(ftrace_syscall_exit, tr); + if (!ret) { + rcu_assign_pointer(tr->exit_syscall_files[num], file); + tr->sys_refcount_exit++; + } + mutex_unlock(&syscall_trace_lock); + return ret; +} + +static void unreg_event_syscall_exit(struct trace_event_file *file, + struct trace_event_call *call) +{ + struct trace_array *tr = file->tr; + int num; + + num = ((struct syscall_metadata *)call->data)->syscall_nr; + if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) + return; + mutex_lock(&syscall_trace_lock); + 
tr->sys_refcount_exit--; + RCU_INIT_POINTER(tr->exit_syscall_files[num], NULL); + if (!tr->sys_refcount_exit) + unregister_trace_sys_exit(ftrace_syscall_exit, tr); + mutex_unlock(&syscall_trace_lock); +} + +static int __init init_syscall_trace(struct trace_event_call *call) +{ + int id; + int num; + + num = ((struct syscall_metadata *)call->data)->syscall_nr; + if (num < 0 || num >= NR_syscalls) { + pr_debug("syscall %s metadata not mapped, disabling ftrace event\n", + ((struct syscall_metadata *)call->data)->name); + return -ENOSYS; + } + + if (set_syscall_print_fmt(call) < 0) + return -ENOMEM; + + id = trace_event_raw_init(call); + + if (id < 0) { + free_syscall_print_fmt(call); + return id; + } + + return id; +} + +static struct trace_event_fields __refdata syscall_enter_fields_array[] = { + SYSCALL_FIELD(int, __syscall_nr), + { .type = TRACE_FUNCTION_TYPE, + .define_fields = syscall_enter_define_fields }, + {} +}; + +struct trace_event_functions enter_syscall_print_funcs = { + .trace = print_syscall_enter, +}; + +struct trace_event_functions exit_syscall_print_funcs = { + .trace = print_syscall_exit, +}; + +struct trace_event_class __refdata event_class_syscall_enter = { + .system = "syscalls", + .reg = syscall_enter_register, + .fields_array = syscall_enter_fields_array, + .get_fields = syscall_get_enter_fields, + .raw_init = init_syscall_trace, +}; + +struct trace_event_class __refdata event_class_syscall_exit = { + .system = "syscalls", + .reg = syscall_exit_register, + .fields_array = (struct trace_event_fields[]){ + SYSCALL_FIELD(int, __syscall_nr), + SYSCALL_FIELD(long, ret), + {} + }, + .fields = LIST_HEAD_INIT(event_class_syscall_exit.fields), + .raw_init = init_syscall_trace, +}; + +unsigned long __init __weak arch_syscall_addr(int nr) +{ + return (unsigned long)sys_call_table[nr]; +} + +void __init init_ftrace_syscalls(void) +{ + struct syscall_metadata *meta; + unsigned long addr; + int i; + void *ret; + + if (!IS_ENABLED(CONFIG_HAVE_SPARSE_SYSCALL_NR)) { + syscalls_metadata = kcalloc(NR_syscalls, + sizeof(*syscalls_metadata), + GFP_KERNEL); + if (!syscalls_metadata) { + WARN_ON(1); + return; + } + } + + for (i = 0; i < NR_syscalls; i++) { + addr = arch_syscall_addr(i); + meta = find_syscall_meta(addr); + if (!meta) + continue; + + meta->syscall_nr = i; + + if (!IS_ENABLED(CONFIG_HAVE_SPARSE_SYSCALL_NR)) { + syscalls_metadata[i] = meta; + } else { + ret = xa_store(&syscalls_metadata_sparse, i, meta, + GFP_KERNEL); + WARN(xa_is_err(ret), + "Syscall memory allocation failed\n"); + } + + } +} + +#ifdef CONFIG_PERF_EVENTS + +static DECLARE_BITMAP(enabled_perf_enter_syscalls, NR_syscalls); +static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls); +static int sys_perf_refcount_enter; +static int sys_perf_refcount_exit; + +static int perf_call_bpf_enter(struct trace_event_call *call, struct pt_regs *regs, + struct syscall_metadata *sys_data, + struct syscall_trace_enter *rec) +{ + struct syscall_tp_t { + struct trace_entry ent; + unsigned long syscall_nr; + unsigned long args[SYSCALL_DEFINE_MAXARGS]; + } __aligned(8) param; + int i; + + BUILD_BUG_ON(sizeof(param.ent) < sizeof(void *)); + + /* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. 
&param) */
+	*(struct pt_regs **)&param = regs;
+	param.syscall_nr = rec->nr;
+	for (i = 0; i < sys_data->nb_args; i++)
+		param.args[i] = rec->args[i];
+	return trace_call_bpf(call, &param);
+}
+
+static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
+{
+	struct syscall_metadata *sys_data;
+	struct syscall_trace_enter *rec;
+	struct hlist_head *head;
+	unsigned long args[6];
+	bool valid_prog_array;
+	int syscall_nr;
+	int rctx;
+	int size;
+
+	syscall_nr = trace_get_syscall_nr(current, regs);
+	if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
+		return;
+	if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))
+		return;
+
+	sys_data = syscall_nr_to_meta(syscall_nr);
+	if (!sys_data)
+		return;
+
+	head = this_cpu_ptr(sys_data->enter_event->perf_events);
+	valid_prog_array = bpf_prog_array_valid(sys_data->enter_event);
+	if (!valid_prog_array && hlist_empty(head))
+		return;
+
+	/* get the size after alignment with the u32 buffer size field */
+	size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);
+	size = ALIGN(size + sizeof(u32), sizeof(u64));
+	size -= sizeof(u32);
+
+	rec = perf_trace_buf_alloc(size, NULL, &rctx);
+	if (!rec)
+		return;
+
+	rec->nr = syscall_nr;
+	syscall_get_arguments(current, regs, args);
+	memcpy(&rec->args, args, sizeof(unsigned long) * sys_data->nb_args);
+
+	if ((valid_prog_array &&
+	     !perf_call_bpf_enter(sys_data->enter_event, regs, sys_data, rec)) ||
+	    hlist_empty(head)) {
+		perf_swevent_put_recursion_context(rctx);
+		return;
+	}
+
+	perf_trace_buf_submit(rec, size, rctx,
+			      sys_data->enter_event->event.type, 1, regs,
+			      head, NULL);
+}
+
+static int perf_sysenter_enable(struct trace_event_call *call)
+{
+	int ret = 0;
+	int num;
+
+	num = ((struct syscall_metadata *)call->data)->syscall_nr;
+
+	mutex_lock(&syscall_trace_lock);
+	if (!sys_perf_refcount_enter)
+		ret = register_trace_sys_enter(perf_syscall_enter, NULL);
+	if (ret) {
+		pr_info("event trace: Could not activate syscall entry trace point");
+	} else {
+		set_bit(num, enabled_perf_enter_syscalls);
+		sys_perf_refcount_enter++;
+	}
+	mutex_unlock(&syscall_trace_lock);
+	return ret;
+}
+
+static void perf_sysenter_disable(struct trace_event_call *call)
+{
+	int num;
+
+	num = ((struct syscall_metadata *)call->data)->syscall_nr;
+
+	mutex_lock(&syscall_trace_lock);
+	sys_perf_refcount_enter--;
+	clear_bit(num, enabled_perf_enter_syscalls);
+	if (!sys_perf_refcount_enter)
+		unregister_trace_sys_enter(perf_syscall_enter, NULL);
+	mutex_unlock(&syscall_trace_lock);
+}
+
+static int perf_call_bpf_exit(struct trace_event_call *call, struct pt_regs *regs,
+			      struct syscall_trace_exit *rec)
+{
+	struct syscall_tp_t {
+		struct trace_entry ent;
+		unsigned long syscall_nr;
+		unsigned long ret;
+	} __aligned(8) param;
+
+	/* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. &param) */
+	*(struct pt_regs **)&param = regs;
+	param.syscall_nr = rec->nr;
+	param.ret = rec->ret;
+	return trace_call_bpf(call, &param);
+}
+
+static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
+{
+	struct syscall_metadata *sys_data;
+	struct syscall_trace_exit *rec;
+	struct hlist_head *head;
+	bool valid_prog_array;
+	int syscall_nr;
+	int rctx;
+	int size;
+
+	syscall_nr = trace_get_syscall_nr(current, regs);
+	if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
+		return;
+	if (!test_bit(syscall_nr, enabled_perf_exit_syscalls))
+		return;
+
+	sys_data = syscall_nr_to_meta(syscall_nr);
+	if (!sys_data)
+		return;
+
+	head = this_cpu_ptr(sys_data->exit_event->perf_events);
+	valid_prog_array = bpf_prog_array_valid(sys_data->exit_event);
+	if (!valid_prog_array && hlist_empty(head))
+		return;
+
+	/* We can probably do that at build time */
+	size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
+	size -= sizeof(u32);
+
+	rec = perf_trace_buf_alloc(size, NULL, &rctx);
+	if (!rec)
+		return;
+
+	rec->nr = syscall_nr;
+	rec->ret = syscall_get_return_value(current, regs);
+
+	if ((valid_prog_array &&
+	     !perf_call_bpf_exit(sys_data->exit_event, regs, rec)) ||
+	    hlist_empty(head)) {
+		perf_swevent_put_recursion_context(rctx);
+		return;
+	}
+
+	perf_trace_buf_submit(rec, size, rctx, sys_data->exit_event->event.type,
+			      1, regs, head, NULL);
+}
+
+static int perf_sysexit_enable(struct trace_event_call *call)
+{
+	int ret = 0;
+	int num;
+
+	num = ((struct syscall_metadata *)call->data)->syscall_nr;
+
+	mutex_lock(&syscall_trace_lock);
+	if (!sys_perf_refcount_exit)
+		ret = register_trace_sys_exit(perf_syscall_exit, NULL);
+	if (ret) {
+		pr_info("event trace: Could not activate syscall exit trace point");
+	} else {
+		set_bit(num, enabled_perf_exit_syscalls);
+		sys_perf_refcount_exit++;
+	}
+	mutex_unlock(&syscall_trace_lock);
+	return ret;
+}
+
+static void perf_sysexit_disable(struct trace_event_call *call)
+{
+	int num;
+
+	num = ((struct syscall_metadata *)call->data)->syscall_nr;
+
+	mutex_lock(&syscall_trace_lock);
+	sys_perf_refcount_exit--;
+	clear_bit(num, enabled_perf_exit_syscalls);
+	if (!sys_perf_refcount_exit)
+		unregister_trace_sys_exit(perf_syscall_exit, NULL);
+	mutex_unlock(&syscall_trace_lock);
+}
+
+#endif /* CONFIG_PERF_EVENTS */
+
+static int syscall_enter_register(struct trace_event_call *event,
+				  enum trace_reg type, void *data)
+{
+	struct trace_event_file *file = data;
+
+	switch (type) {
+	case TRACE_REG_REGISTER:
+		return reg_event_syscall_enter(file, event);
+	case TRACE_REG_UNREGISTER:
+		unreg_event_syscall_enter(file, event);
+		return 0;
+
+#ifdef CONFIG_PERF_EVENTS
+	case TRACE_REG_PERF_REGISTER:
+		return perf_sysenter_enable(event);
+	case TRACE_REG_PERF_UNREGISTER:
+		perf_sysenter_disable(event);
+		return 0;
+	case TRACE_REG_PERF_OPEN:
+	case TRACE_REG_PERF_CLOSE:
+	case TRACE_REG_PERF_ADD:
+	case TRACE_REG_PERF_DEL:
+		return 0;
+#endif
+	}
+	return 0;
+}
+
+static int syscall_exit_register(struct trace_event_call *event,
+				 enum trace_reg type, void *data)
+{
+	struct trace_event_file *file = data;
+
+	switch (type) {
+	case TRACE_REG_REGISTER:
+		return reg_event_syscall_exit(file, event);
+	case TRACE_REG_UNREGISTER:
+		unreg_event_syscall_exit(file, event);
+		return 0;
+
+#ifdef CONFIG_PERF_EVENTS
+	case TRACE_REG_PERF_REGISTER:
+		return perf_sysexit_enable(event);
+	case TRACE_REG_PERF_UNREGISTER:
+		perf_sysexit_disable(event);
+		return 0;
+	case TRACE_REG_PERF_OPEN:
+	case TRACE_REG_PERF_CLOSE:
+	case TRACE_REG_PERF_ADD:
+	case TRACE_REG_PERF_DEL:
return 0; +#endif + } + return 0; +} diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c new file mode 100644 index 0000000000..99c051de41 --- /dev/null +++ b/kernel/trace/trace_uprobe.c @@ -0,0 +1,1665 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * uprobes-based tracing events + * + * Copyright (C) IBM Corporation, 2010-2012 + * Author: Srikar Dronamraju <srikar@linux.vnet.ibm.com> + */ +#define pr_fmt(fmt) "trace_uprobe: " fmt + +#include <linux/bpf-cgroup.h> +#include <linux/security.h> +#include <linux/ctype.h> +#include <linux/module.h> +#include <linux/uaccess.h> +#include <linux/uprobes.h> +#include <linux/namei.h> +#include <linux/string.h> +#include <linux/rculist.h> +#include <linux/filter.h> + +#include "trace_dynevent.h" +#include "trace_probe.h" +#include "trace_probe_tmpl.h" + +#define UPROBE_EVENT_SYSTEM "uprobes" + +struct uprobe_trace_entry_head { + struct trace_entry ent; + unsigned long vaddr[]; +}; + +#define SIZEOF_TRACE_ENTRY(is_return) \ + (sizeof(struct uprobe_trace_entry_head) + \ + sizeof(unsigned long) * (is_return ? 2 : 1)) + +#define DATAOF_TRACE_ENTRY(entry, is_return) \ + ((void*)(entry) + SIZEOF_TRACE_ENTRY(is_return)) + +static int trace_uprobe_create(const char *raw_command); +static int trace_uprobe_show(struct seq_file *m, struct dyn_event *ev); +static int trace_uprobe_release(struct dyn_event *ev); +static bool trace_uprobe_is_busy(struct dyn_event *ev); +static bool trace_uprobe_match(const char *system, const char *event, + int argc, const char **argv, struct dyn_event *ev); + +static struct dyn_event_operations trace_uprobe_ops = { + .create = trace_uprobe_create, + .show = trace_uprobe_show, + .is_busy = trace_uprobe_is_busy, + .free = trace_uprobe_release, + .match = trace_uprobe_match, +}; + +/* + * uprobe event core functions + */ +struct trace_uprobe { + struct dyn_event devent; + struct uprobe_consumer consumer; + struct path path; + struct inode *inode; + char *filename; + unsigned long offset; + unsigned long ref_ctr_offset; + unsigned long nhit; + struct trace_probe tp; +}; + +static bool is_trace_uprobe(struct dyn_event *ev) +{ + return ev->ops == &trace_uprobe_ops; +} + +static struct trace_uprobe *to_trace_uprobe(struct dyn_event *ev) +{ + return container_of(ev, struct trace_uprobe, devent); +} + +/** + * for_each_trace_uprobe - iterate over the trace_uprobe list + * @pos: the struct trace_uprobe * for each entry + * @dpos: the struct dyn_event * to use as a loop cursor + */ +#define for_each_trace_uprobe(pos, dpos) \ + for_each_dyn_event(dpos) \ + if (is_trace_uprobe(dpos) && (pos = to_trace_uprobe(dpos))) + +static int register_uprobe_event(struct trace_uprobe *tu); +static int unregister_uprobe_event(struct trace_uprobe *tu); + +static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs); +static int uretprobe_dispatcher(struct uprobe_consumer *con, + unsigned long func, struct pt_regs *regs); + +#ifdef CONFIG_STACK_GROWSUP +static unsigned long adjust_stack_addr(unsigned long addr, unsigned int n) +{ + return addr - (n * sizeof(long)); +} +#else +static unsigned long adjust_stack_addr(unsigned long addr, unsigned int n) +{ + return addr + (n * sizeof(long)); +} +#endif + +static unsigned long get_user_stack_nth(struct pt_regs *regs, unsigned int n) +{ + unsigned long ret; + unsigned long addr = user_stack_pointer(regs); + + addr = adjust_stack_addr(addr, n); + + if (copy_from_user(&ret, (void __force __user *) addr, sizeof(ret))) + return 0; + + return ret; +} + +/* + * Uprobes-specific fetch 
functions + */ +static nokprobe_inline int +probe_mem_read(void *dest, void *src, size_t size) +{ + void __user *vaddr = (void __force __user *)src; + + return copy_from_user(dest, vaddr, size) ? -EFAULT : 0; +} + +static nokprobe_inline int +probe_mem_read_user(void *dest, void *src, size_t size) +{ + return probe_mem_read(dest, src, size); +} + +/* + * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max + * length and relative data location. + */ +static nokprobe_inline int +fetch_store_string(unsigned long addr, void *dest, void *base) +{ + long ret; + u32 loc = *(u32 *)dest; + int maxlen = get_loc_len(loc); + u8 *dst = get_loc_data(dest, base); + void __user *src = (void __force __user *) addr; + + if (unlikely(!maxlen)) + return -ENOMEM; + + if (addr == FETCH_TOKEN_COMM) + ret = strlcpy(dst, current->comm, maxlen); + else + ret = strncpy_from_user(dst, src, maxlen); + if (ret >= 0) { + if (ret == maxlen) + dst[ret - 1] = '\0'; + else + /* + * Include the terminating null byte. In this case it + * was copied by strncpy_from_user but not accounted + * for in ret. + */ + ret++; + *(u32 *)dest = make_data_loc(ret, (void *)dst - base); + } else + *(u32 *)dest = make_data_loc(0, (void *)dst - base); + + return ret; +} + +static nokprobe_inline int +fetch_store_string_user(unsigned long addr, void *dest, void *base) +{ + return fetch_store_string(addr, dest, base); +} + +/* Return the length of string -- including null terminal byte */ +static nokprobe_inline int +fetch_store_strlen(unsigned long addr) +{ + int len; + void __user *vaddr = (void __force __user *) addr; + + if (addr == FETCH_TOKEN_COMM) + len = strlen(current->comm) + 1; + else + len = strnlen_user(vaddr, MAX_STRING_SIZE); + + return (len > MAX_STRING_SIZE) ? 0 : len; +} + +static nokprobe_inline int +fetch_store_strlen_user(unsigned long addr) +{ + return fetch_store_strlen(addr); +} + +static unsigned long translate_user_vaddr(unsigned long file_offset) +{ + unsigned long base_addr; + struct uprobe_dispatch_data *udd; + + udd = (void *) current->utask->vaddr; + + base_addr = udd->bp_addr - udd->tu->offset; + return base_addr + file_offset; +} + +/* Note that we don't verify it, since the code does not come from user space */ +static int +process_fetch_insn(struct fetch_insn *code, void *rec, void *dest, + void *base) +{ + struct pt_regs *regs = rec; + unsigned long val; + int ret; + + /* 1st stage: get value from context */ + switch (code->op) { + case FETCH_OP_REG: + val = regs_get_register(regs, code->param); + break; + case FETCH_OP_STACK: + val = get_user_stack_nth(regs, code->param); + break; + case FETCH_OP_STACKP: + val = user_stack_pointer(regs); + break; + case FETCH_OP_RETVAL: + val = regs_return_value(regs); + break; + case FETCH_OP_COMM: + val = FETCH_TOKEN_COMM; + break; + case FETCH_OP_FOFFS: + val = translate_user_vaddr(code->immediate); + break; + default: + ret = process_common_fetch_insn(code, &val); + if (ret < 0) + return ret; + } + code++; + + return process_fetch_insn_bottom(code, val, dest, base); +} +NOKPROBE_SYMBOL(process_fetch_insn) + +static inline void init_trace_uprobe_filter(struct trace_uprobe_filter *filter) +{ + rwlock_init(&filter->rwlock); + filter->nr_systemwide = 0; + INIT_LIST_HEAD(&filter->perf_events); +} + +static inline bool uprobe_filter_is_empty(struct trace_uprobe_filter *filter) +{ + return !filter->nr_systemwide && list_empty(&filter->perf_events); +} + +static inline bool is_ret_probe(struct trace_uprobe *tu) +{ + return tu->consumer.ret_handler != NULL; +} + 
+static bool trace_uprobe_is_busy(struct dyn_event *ev) +{ + struct trace_uprobe *tu = to_trace_uprobe(ev); + + return trace_probe_is_enabled(&tu->tp); +} + +static bool trace_uprobe_match_command_head(struct trace_uprobe *tu, + int argc, const char **argv) +{ + char buf[MAX_ARGSTR_LEN + 1]; + int len; + + if (!argc) + return true; + + len = strlen(tu->filename); + if (strncmp(tu->filename, argv[0], len) || argv[0][len] != ':') + return false; + + if (tu->ref_ctr_offset == 0) + snprintf(buf, sizeof(buf), "0x%0*lx", + (int)(sizeof(void *) * 2), tu->offset); + else + snprintf(buf, sizeof(buf), "0x%0*lx(0x%lx)", + (int)(sizeof(void *) * 2), tu->offset, + tu->ref_ctr_offset); + if (strcmp(buf, &argv[0][len + 1])) + return false; + + argc--; argv++; + + return trace_probe_match_command_args(&tu->tp, argc, argv); +} + +static bool trace_uprobe_match(const char *system, const char *event, + int argc, const char **argv, struct dyn_event *ev) +{ + struct trace_uprobe *tu = to_trace_uprobe(ev); + + return (event[0] == '\0' || + strcmp(trace_probe_name(&tu->tp), event) == 0) && + (!system || strcmp(trace_probe_group_name(&tu->tp), system) == 0) && + trace_uprobe_match_command_head(tu, argc, argv); +} + +static nokprobe_inline struct trace_uprobe * +trace_uprobe_primary_from_call(struct trace_event_call *call) +{ + struct trace_probe *tp; + + tp = trace_probe_primary_from_call(call); + if (WARN_ON_ONCE(!tp)) + return NULL; + + return container_of(tp, struct trace_uprobe, tp); +} + +/* + * Allocate new trace_uprobe and initialize it (including uprobes). + */ +static struct trace_uprobe * +alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret) +{ + struct trace_uprobe *tu; + int ret; + + tu = kzalloc(struct_size(tu, tp.args, nargs), GFP_KERNEL); + if (!tu) + return ERR_PTR(-ENOMEM); + + ret = trace_probe_init(&tu->tp, event, group, true); + if (ret < 0) + goto error; + + dyn_event_init(&tu->devent, &trace_uprobe_ops); + tu->consumer.handler = uprobe_dispatcher; + if (is_ret) + tu->consumer.ret_handler = uretprobe_dispatcher; + init_trace_uprobe_filter(tu->tp.event->filter); + return tu; + +error: + kfree(tu); + + return ERR_PTR(ret); +} + +static void free_trace_uprobe(struct trace_uprobe *tu) +{ + if (!tu) + return; + + path_put(&tu->path); + trace_probe_cleanup(&tu->tp); + kfree(tu->filename); + kfree(tu); +} + +static struct trace_uprobe *find_probe_event(const char *event, const char *group) +{ + struct dyn_event *pos; + struct trace_uprobe *tu; + + for_each_trace_uprobe(tu, pos) + if (strcmp(trace_probe_name(&tu->tp), event) == 0 && + strcmp(trace_probe_group_name(&tu->tp), group) == 0) + return tu; + + return NULL; +} + +/* Unregister a trace_uprobe and probe_event */ +static int unregister_trace_uprobe(struct trace_uprobe *tu) +{ + int ret; + + if (trace_probe_has_sibling(&tu->tp)) + goto unreg; + + /* If there's a reference to the dynamic event */ + if (trace_event_dyn_busy(trace_probe_event_call(&tu->tp))) + return -EBUSY; + + ret = unregister_uprobe_event(tu); + if (ret) + return ret; + +unreg: + dyn_event_remove(&tu->devent); + trace_probe_unlink(&tu->tp); + free_trace_uprobe(tu); + return 0; +} + +static bool trace_uprobe_has_same_uprobe(struct trace_uprobe *orig, + struct trace_uprobe *comp) +{ + struct trace_probe_event *tpe = orig->tp.event; + struct inode *comp_inode = d_real_inode(comp->path.dentry); + int i; + + list_for_each_entry(orig, &tpe->probes, tp.list) { + if (comp_inode != d_real_inode(orig->path.dentry) || + comp->offset != orig->offset) + continue; 
+ + /* + * trace_probe_compare_arg_type() ensured that nr_args and + * each argument name and type are same. Let's compare comm. + */ + for (i = 0; i < orig->tp.nr_args; i++) { + if (strcmp(orig->tp.args[i].comm, + comp->tp.args[i].comm)) + break; + } + + if (i == orig->tp.nr_args) + return true; + } + + return false; +} + +static int append_trace_uprobe(struct trace_uprobe *tu, struct trace_uprobe *to) +{ + int ret; + + ret = trace_probe_compare_arg_type(&tu->tp, &to->tp); + if (ret) { + /* Note that argument starts index = 2 */ + trace_probe_log_set_index(ret + 1); + trace_probe_log_err(0, DIFF_ARG_TYPE); + return -EEXIST; + } + if (trace_uprobe_has_same_uprobe(to, tu)) { + trace_probe_log_set_index(0); + trace_probe_log_err(0, SAME_PROBE); + return -EEXIST; + } + + /* Append to existing event */ + ret = trace_probe_append(&tu->tp, &to->tp); + if (!ret) + dyn_event_add(&tu->devent, trace_probe_event_call(&tu->tp)); + + return ret; +} + +/* + * Uprobe with multiple reference counter is not allowed. i.e. + * If inode and offset matches, reference counter offset *must* + * match as well. Though, there is one exception: If user is + * replacing old trace_uprobe with new one(same group/event), + * then we allow same uprobe with new reference counter as far + * as the new one does not conflict with any other existing + * ones. + */ +static int validate_ref_ctr_offset(struct trace_uprobe *new) +{ + struct dyn_event *pos; + struct trace_uprobe *tmp; + struct inode *new_inode = d_real_inode(new->path.dentry); + + for_each_trace_uprobe(tmp, pos) { + if (new_inode == d_real_inode(tmp->path.dentry) && + new->offset == tmp->offset && + new->ref_ctr_offset != tmp->ref_ctr_offset) { + pr_warn("Reference counter offset mismatch."); + return -EINVAL; + } + } + return 0; +} + +/* Register a trace_uprobe and probe_event */ +static int register_trace_uprobe(struct trace_uprobe *tu) +{ + struct trace_uprobe *old_tu; + int ret; + + mutex_lock(&event_mutex); + + ret = validate_ref_ctr_offset(tu); + if (ret) + goto end; + + /* register as an event */ + old_tu = find_probe_event(trace_probe_name(&tu->tp), + trace_probe_group_name(&tu->tp)); + if (old_tu) { + if (is_ret_probe(tu) != is_ret_probe(old_tu)) { + trace_probe_log_set_index(0); + trace_probe_log_err(0, DIFF_PROBE_TYPE); + ret = -EEXIST; + } else { + ret = append_trace_uprobe(tu, old_tu); + } + goto end; + } + + ret = register_uprobe_event(tu); + if (ret) { + if (ret == -EEXIST) { + trace_probe_log_set_index(0); + trace_probe_log_err(0, EVENT_EXIST); + } else + pr_warn("Failed to register probe event(%d)\n", ret); + goto end; + } + + dyn_event_add(&tu->devent, trace_probe_event_call(&tu->tp)); + +end: + mutex_unlock(&event_mutex); + + return ret; +} + +/* + * Argument syntax: + * - Add uprobe: p|r[:[GRP/][EVENT]] PATH:OFFSET[%return][(REF)] [FETCHARGS] + */ +static int __trace_uprobe_create(int argc, const char **argv) +{ + struct trace_uprobe *tu; + const char *event = NULL, *group = UPROBE_EVENT_SYSTEM; + char *arg, *filename, *rctr, *rctr_end, *tmp; + char buf[MAX_EVENT_NAME_LEN]; + char gbuf[MAX_EVENT_NAME_LEN]; + enum probe_print_type ptype; + struct path path; + unsigned long offset, ref_ctr_offset; + bool is_return = false; + int i, ret; + + ref_ctr_offset = 0; + + switch (argv[0][0]) { + case 'r': + is_return = true; + break; + case 'p': + break; + default: + return -ECANCELED; + } + + if (argc < 2) + return -ECANCELED; + + if (argv[0][1] == ':') + event = &argv[0][2]; + + if (!strchr(argv[1], '/')) + return -ECANCELED; + + filename = 
kstrdup(argv[1], GFP_KERNEL); + if (!filename) + return -ENOMEM; + + /* Find the last occurrence, in case the path contains ':' too. */ + arg = strrchr(filename, ':'); + if (!arg || !isdigit(arg[1])) { + kfree(filename); + return -ECANCELED; + } + + trace_probe_log_init("trace_uprobe", argc, argv); + trace_probe_log_set_index(1); /* filename is the 2nd argument */ + + *arg++ = '\0'; + ret = kern_path(filename, LOOKUP_FOLLOW, &path); + if (ret) { + trace_probe_log_err(0, FILE_NOT_FOUND); + kfree(filename); + trace_probe_log_clear(); + return ret; + } + if (!d_is_reg(path.dentry)) { + trace_probe_log_err(0, NO_REGULAR_FILE); + ret = -EINVAL; + goto fail_address_parse; + } + + /* Parse reference counter offset if specified. */ + rctr = strchr(arg, '('); + if (rctr) { + rctr_end = strchr(rctr, ')'); + if (!rctr_end) { + ret = -EINVAL; + rctr_end = rctr + strlen(rctr); + trace_probe_log_err(rctr_end - filename, + REFCNT_OPEN_BRACE); + goto fail_address_parse; + } else if (rctr_end[1] != '\0') { + ret = -EINVAL; + trace_probe_log_err(rctr_end + 1 - filename, + BAD_REFCNT_SUFFIX); + goto fail_address_parse; + } + + *rctr++ = '\0'; + *rctr_end = '\0'; + ret = kstrtoul(rctr, 0, &ref_ctr_offset); + if (ret) { + trace_probe_log_err(rctr - filename, BAD_REFCNT); + goto fail_address_parse; + } + } + + /* Check if there is %return suffix */ + tmp = strchr(arg, '%'); + if (tmp) { + if (!strcmp(tmp, "%return")) { + *tmp = '\0'; + is_return = true; + } else { + trace_probe_log_err(tmp - filename, BAD_ADDR_SUFFIX); + ret = -EINVAL; + goto fail_address_parse; + } + } + + /* Parse uprobe offset. */ + ret = kstrtoul(arg, 0, &offset); + if (ret) { + trace_probe_log_err(arg - filename, BAD_UPROBE_OFFS); + goto fail_address_parse; + } + + /* setup a probe */ + trace_probe_log_set_index(0); + if (event) { + ret = traceprobe_parse_event_name(&event, &group, gbuf, + event - argv[0]); + if (ret) + goto fail_address_parse; + } + + if (!event) { + char *tail; + char *ptr; + + tail = kstrdup(kbasename(filename), GFP_KERNEL); + if (!tail) { + ret = -ENOMEM; + goto fail_address_parse; + } + + ptr = strpbrk(tail, ".-_"); + if (ptr) + *ptr = '\0'; + + snprintf(buf, MAX_EVENT_NAME_LEN, "%c_%s_0x%lx", 'p', tail, offset); + event = buf; + kfree(tail); + } + + argc -= 2; + argv += 2; + + tu = alloc_trace_uprobe(group, event, argc, is_return); + if (IS_ERR(tu)) { + ret = PTR_ERR(tu); + /* This must return -ENOMEM otherwise there is a bug */ + WARN_ON_ONCE(ret != -ENOMEM); + goto fail_address_parse; + } + tu->offset = offset; + tu->ref_ctr_offset = ref_ctr_offset; + tu->path = path; + tu->filename = filename; + + /* parse arguments */ + for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { + struct traceprobe_parse_context ctx = { + .flags = (is_return ? TPARG_FL_RETURN : 0) | TPARG_FL_USER, + }; + + trace_probe_log_set_index(i + 2); + ret = traceprobe_parse_probe_arg(&tu->tp, i, argv[i], &ctx); + traceprobe_finish_parse(&ctx); + if (ret) + goto error; + } + + ptype = is_ret_probe(tu) ? 
PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL; + ret = traceprobe_set_print_fmt(&tu->tp, ptype); + if (ret < 0) + goto error; + + ret = register_trace_uprobe(tu); + if (!ret) + goto out; + +error: + free_trace_uprobe(tu); +out: + trace_probe_log_clear(); + return ret; + +fail_address_parse: + trace_probe_log_clear(); + path_put(&path); + kfree(filename); + + return ret; +} + +int trace_uprobe_create(const char *raw_command) +{ + return trace_probe_create(raw_command, __trace_uprobe_create); +} + +static int create_or_delete_trace_uprobe(const char *raw_command) +{ + int ret; + + if (raw_command[0] == '-') + return dyn_event_release(raw_command, &trace_uprobe_ops); + + ret = trace_uprobe_create(raw_command); + return ret == -ECANCELED ? -EINVAL : ret; +} + +static int trace_uprobe_release(struct dyn_event *ev) +{ + struct trace_uprobe *tu = to_trace_uprobe(ev); + + return unregister_trace_uprobe(tu); +} + +/* Probes listing interfaces */ +static int trace_uprobe_show(struct seq_file *m, struct dyn_event *ev) +{ + struct trace_uprobe *tu = to_trace_uprobe(ev); + char c = is_ret_probe(tu) ? 'r' : 'p'; + int i; + + seq_printf(m, "%c:%s/%s %s:0x%0*lx", c, trace_probe_group_name(&tu->tp), + trace_probe_name(&tu->tp), tu->filename, + (int)(sizeof(void *) * 2), tu->offset); + + if (tu->ref_ctr_offset) + seq_printf(m, "(0x%lx)", tu->ref_ctr_offset); + + for (i = 0; i < tu->tp.nr_args; i++) + seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm); + + seq_putc(m, '\n'); + return 0; +} + +static int probes_seq_show(struct seq_file *m, void *v) +{ + struct dyn_event *ev = v; + + if (!is_trace_uprobe(ev)) + return 0; + + return trace_uprobe_show(m, ev); +} + +static const struct seq_operations probes_seq_op = { + .start = dyn_event_seq_start, + .next = dyn_event_seq_next, + .stop = dyn_event_seq_stop, + .show = probes_seq_show +}; + +static int probes_open(struct inode *inode, struct file *file) +{ + int ret; + + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + + if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { + ret = dyn_events_release_all(&trace_uprobe_ops); + if (ret) + return ret; + } + + return seq_open(file, &probes_seq_op); +} + +static ssize_t probes_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) +{ + return trace_parse_run_command(file, buffer, count, ppos, + create_or_delete_trace_uprobe); +} + +static const struct file_operations uprobe_events_ops = { + .owner = THIS_MODULE, + .open = probes_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, + .write = probes_write, +}; + +/* Probes profiling interfaces */ +static int probes_profile_seq_show(struct seq_file *m, void *v) +{ + struct dyn_event *ev = v; + struct trace_uprobe *tu; + + if (!is_trace_uprobe(ev)) + return 0; + + tu = to_trace_uprobe(ev); + seq_printf(m, " %s %-44s %15lu\n", tu->filename, + trace_probe_name(&tu->tp), tu->nhit); + return 0; +} + +static const struct seq_operations profile_seq_op = { + .start = dyn_event_seq_start, + .next = dyn_event_seq_next, + .stop = dyn_event_seq_stop, + .show = probes_profile_seq_show +}; + +static int profile_open(struct inode *inode, struct file *file) +{ + int ret; + + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + + return seq_open(file, &profile_seq_op); +} + +static const struct file_operations uprobe_profile_ops = { + .owner = THIS_MODULE, + .open = profile_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +struct 
uprobe_cpu_buffer { + struct mutex mutex; + void *buf; +}; +static struct uprobe_cpu_buffer __percpu *uprobe_cpu_buffer; +static int uprobe_buffer_refcnt; + +static int uprobe_buffer_init(void) +{ + int cpu, err_cpu; + + uprobe_cpu_buffer = alloc_percpu(struct uprobe_cpu_buffer); + if (uprobe_cpu_buffer == NULL) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + struct page *p = alloc_pages_node(cpu_to_node(cpu), + GFP_KERNEL, 0); + if (p == NULL) { + err_cpu = cpu; + goto err; + } + per_cpu_ptr(uprobe_cpu_buffer, cpu)->buf = page_address(p); + mutex_init(&per_cpu_ptr(uprobe_cpu_buffer, cpu)->mutex); + } + + return 0; + +err: + for_each_possible_cpu(cpu) { + if (cpu == err_cpu) + break; + free_page((unsigned long)per_cpu_ptr(uprobe_cpu_buffer, cpu)->buf); + } + + free_percpu(uprobe_cpu_buffer); + return -ENOMEM; +} + +static int uprobe_buffer_enable(void) +{ + int ret = 0; + + BUG_ON(!mutex_is_locked(&event_mutex)); + + if (uprobe_buffer_refcnt++ == 0) { + ret = uprobe_buffer_init(); + if (ret < 0) + uprobe_buffer_refcnt--; + } + + return ret; +} + +static void uprobe_buffer_disable(void) +{ + int cpu; + + BUG_ON(!mutex_is_locked(&event_mutex)); + + if (--uprobe_buffer_refcnt == 0) { + for_each_possible_cpu(cpu) + free_page((unsigned long)per_cpu_ptr(uprobe_cpu_buffer, + cpu)->buf); + + free_percpu(uprobe_cpu_buffer); + uprobe_cpu_buffer = NULL; + } +} + +static struct uprobe_cpu_buffer *uprobe_buffer_get(void) +{ + struct uprobe_cpu_buffer *ucb; + int cpu; + + cpu = raw_smp_processor_id(); + ucb = per_cpu_ptr(uprobe_cpu_buffer, cpu); + + /* + * Use per-cpu buffers for fastest access, but we might migrate + * so the mutex makes sure we have sole access to it. + */ + mutex_lock(&ucb->mutex); + + return ucb; +} + +static void uprobe_buffer_put(struct uprobe_cpu_buffer *ucb) +{ + mutex_unlock(&ucb->mutex); +} + +static void __uprobe_trace_func(struct trace_uprobe *tu, + unsigned long func, struct pt_regs *regs, + struct uprobe_cpu_buffer *ucb, int dsize, + struct trace_event_file *trace_file) +{ + struct uprobe_trace_entry_head *entry; + struct trace_event_buffer fbuffer; + void *data; + int size, esize; + struct trace_event_call *call = trace_probe_event_call(&tu->tp); + + WARN_ON(call != trace_file->event_call); + + if (WARN_ON_ONCE(tu->tp.size + dsize > PAGE_SIZE)) + return; + + if (trace_trigger_soft_disabled(trace_file)) + return; + + esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); + size = esize + tu->tp.size + dsize; + entry = trace_event_buffer_reserve(&fbuffer, trace_file, size); + if (!entry) + return; + + if (is_ret_probe(tu)) { + entry->vaddr[0] = func; + entry->vaddr[1] = instruction_pointer(regs); + data = DATAOF_TRACE_ENTRY(entry, true); + } else { + entry->vaddr[0] = instruction_pointer(regs); + data = DATAOF_TRACE_ENTRY(entry, false); + } + + memcpy(data, ucb->buf, tu->tp.size + dsize); + + trace_event_buffer_commit(&fbuffer); +} + +/* uprobe handler */ +static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs, + struct uprobe_cpu_buffer *ucb, int dsize) +{ + struct event_file_link *link; + + if (is_ret_probe(tu)) + return 0; + + rcu_read_lock(); + trace_probe_for_each_link_rcu(link, &tu->tp) + __uprobe_trace_func(tu, 0, regs, ucb, dsize, link->file); + rcu_read_unlock(); + + return 0; +} + +static void uretprobe_trace_func(struct trace_uprobe *tu, unsigned long func, + struct pt_regs *regs, + struct uprobe_cpu_buffer *ucb, int dsize) +{ + struct event_file_link *link; + + rcu_read_lock(); + trace_probe_for_each_link_rcu(link, &tu->tp) + 
__uprobe_trace_func(tu, func, regs, ucb, dsize, link->file); + rcu_read_unlock(); +} + +/* Event entry printers */ +static enum print_line_t +print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *event) +{ + struct uprobe_trace_entry_head *entry; + struct trace_seq *s = &iter->seq; + struct trace_uprobe *tu; + u8 *data; + + entry = (struct uprobe_trace_entry_head *)iter->ent; + tu = trace_uprobe_primary_from_call( + container_of(event, struct trace_event_call, event)); + if (unlikely(!tu)) + goto out; + + if (is_ret_probe(tu)) { + trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)", + trace_probe_name(&tu->tp), + entry->vaddr[1], entry->vaddr[0]); + data = DATAOF_TRACE_ENTRY(entry, true); + } else { + trace_seq_printf(s, "%s: (0x%lx)", + trace_probe_name(&tu->tp), + entry->vaddr[0]); + data = DATAOF_TRACE_ENTRY(entry, false); + } + + if (trace_probe_print_args(s, tu->tp.args, tu->tp.nr_args, data, entry) < 0) + goto out; + + trace_seq_putc(s, '\n'); + + out: + return trace_handle_return(s); +} + +typedef bool (*filter_func_t)(struct uprobe_consumer *self, + enum uprobe_filter_ctx ctx, + struct mm_struct *mm); + +static int trace_uprobe_enable(struct trace_uprobe *tu, filter_func_t filter) +{ + int ret; + + tu->consumer.filter = filter; + tu->inode = d_real_inode(tu->path.dentry); + + if (tu->ref_ctr_offset) + ret = uprobe_register_refctr(tu->inode, tu->offset, + tu->ref_ctr_offset, &tu->consumer); + else + ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); + + if (ret) + tu->inode = NULL; + + return ret; +} + +static void __probe_event_disable(struct trace_probe *tp) +{ + struct trace_uprobe *tu; + + tu = container_of(tp, struct trace_uprobe, tp); + WARN_ON(!uprobe_filter_is_empty(tu->tp.event->filter)); + + list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) { + if (!tu->inode) + continue; + + uprobe_unregister(tu->inode, tu->offset, &tu->consumer); + tu->inode = NULL; + } +} + +static int probe_event_enable(struct trace_event_call *call, + struct trace_event_file *file, filter_func_t filter) +{ + struct trace_probe *tp; + struct trace_uprobe *tu; + bool enabled; + int ret; + + tp = trace_probe_primary_from_call(call); + if (WARN_ON_ONCE(!tp)) + return -ENODEV; + enabled = trace_probe_is_enabled(tp); + + /* This may also change "enabled" state */ + if (file) { + if (trace_probe_test_flag(tp, TP_FLAG_PROFILE)) + return -EINTR; + + ret = trace_probe_add_file(tp, file); + if (ret < 0) + return ret; + } else { + if (trace_probe_test_flag(tp, TP_FLAG_TRACE)) + return -EINTR; + + trace_probe_set_flag(tp, TP_FLAG_PROFILE); + } + + tu = container_of(tp, struct trace_uprobe, tp); + WARN_ON(!uprobe_filter_is_empty(tu->tp.event->filter)); + + if (enabled) + return 0; + + ret = uprobe_buffer_enable(); + if (ret) + goto err_flags; + + list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) { + ret = trace_uprobe_enable(tu, filter); + if (ret) { + __probe_event_disable(tp); + goto err_buffer; + } + } + + return 0; + + err_buffer: + uprobe_buffer_disable(); + + err_flags: + if (file) + trace_probe_remove_file(tp, file); + else + trace_probe_clear_flag(tp, TP_FLAG_PROFILE); + + return ret; +} + +static void probe_event_disable(struct trace_event_call *call, + struct trace_event_file *file) +{ + struct trace_probe *tp; + + tp = trace_probe_primary_from_call(call); + if (WARN_ON_ONCE(!tp)) + return; + + if (!trace_probe_is_enabled(tp)) + return; + + if (file) { + if (trace_probe_remove_file(tp, file) < 0) + return; + + if (trace_probe_is_enabled(tp)) + return; + } 
else + trace_probe_clear_flag(tp, TP_FLAG_PROFILE); + + __probe_event_disable(tp); + uprobe_buffer_disable(); +} + +static int uprobe_event_define_fields(struct trace_event_call *event_call) +{ + int ret, size; + struct uprobe_trace_entry_head field; + struct trace_uprobe *tu; + + tu = trace_uprobe_primary_from_call(event_call); + if (unlikely(!tu)) + return -ENODEV; + + if (is_ret_probe(tu)) { + DEFINE_FIELD(unsigned long, vaddr[0], FIELD_STRING_FUNC, 0); + DEFINE_FIELD(unsigned long, vaddr[1], FIELD_STRING_RETIP, 0); + size = SIZEOF_TRACE_ENTRY(true); + } else { + DEFINE_FIELD(unsigned long, vaddr[0], FIELD_STRING_IP, 0); + size = SIZEOF_TRACE_ENTRY(false); + } + + return traceprobe_define_arg_fields(event_call, size, &tu->tp); +} + +#ifdef CONFIG_PERF_EVENTS +static bool +__uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm) +{ + struct perf_event *event; + + if (filter->nr_systemwide) + return true; + + list_for_each_entry(event, &filter->perf_events, hw.tp_list) { + if (event->hw.target->mm == mm) + return true; + } + + return false; +} + +static inline bool +trace_uprobe_filter_event(struct trace_uprobe_filter *filter, + struct perf_event *event) +{ + return __uprobe_perf_filter(filter, event->hw.target->mm); +} + +static bool trace_uprobe_filter_remove(struct trace_uprobe_filter *filter, + struct perf_event *event) +{ + bool done; + + write_lock(&filter->rwlock); + if (event->hw.target) { + list_del(&event->hw.tp_list); + done = filter->nr_systemwide || + (event->hw.target->flags & PF_EXITING) || + trace_uprobe_filter_event(filter, event); + } else { + filter->nr_systemwide--; + done = filter->nr_systemwide; + } + write_unlock(&filter->rwlock); + + return done; +} + +/* This returns true if the filter always covers target mm */ +static bool trace_uprobe_filter_add(struct trace_uprobe_filter *filter, + struct perf_event *event) +{ + bool done; + + write_lock(&filter->rwlock); + if (event->hw.target) { + /* + * event->parent != NULL means copy_process(), we can avoid + * uprobe_apply(). current->mm must be probed and we can rely + * on dup_mmap() which preserves the already installed bp's. + * + * attr.enable_on_exec means that exec/mmap will install the + * breakpoints we need. 
+ */ + done = filter->nr_systemwide || + event->parent || event->attr.enable_on_exec || + trace_uprobe_filter_event(filter, event); + list_add(&event->hw.tp_list, &filter->perf_events); + } else { + done = filter->nr_systemwide; + filter->nr_systemwide++; + } + write_unlock(&filter->rwlock); + + return done; +} + +static int uprobe_perf_close(struct trace_event_call *call, + struct perf_event *event) +{ + struct trace_probe *tp; + struct trace_uprobe *tu; + int ret = 0; + + tp = trace_probe_primary_from_call(call); + if (WARN_ON_ONCE(!tp)) + return -ENODEV; + + tu = container_of(tp, struct trace_uprobe, tp); + if (trace_uprobe_filter_remove(tu->tp.event->filter, event)) + return 0; + + list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) { + ret = uprobe_apply(tu->inode, tu->offset, &tu->consumer, false); + if (ret) + break; + } + + return ret; +} + +static int uprobe_perf_open(struct trace_event_call *call, + struct perf_event *event) +{ + struct trace_probe *tp; + struct trace_uprobe *tu; + int err = 0; + + tp = trace_probe_primary_from_call(call); + if (WARN_ON_ONCE(!tp)) + return -ENODEV; + + tu = container_of(tp, struct trace_uprobe, tp); + if (trace_uprobe_filter_add(tu->tp.event->filter, event)) + return 0; + + list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) { + err = uprobe_apply(tu->inode, tu->offset, &tu->consumer, true); + if (err) { + uprobe_perf_close(call, event); + break; + } + } + + return err; +} + +static bool uprobe_perf_filter(struct uprobe_consumer *uc, + enum uprobe_filter_ctx ctx, struct mm_struct *mm) +{ + struct trace_uprobe_filter *filter; + struct trace_uprobe *tu; + int ret; + + tu = container_of(uc, struct trace_uprobe, consumer); + filter = tu->tp.event->filter; + + read_lock(&filter->rwlock); + ret = __uprobe_perf_filter(filter, mm); + read_unlock(&filter->rwlock); + + return ret; +} + +static void __uprobe_perf_func(struct trace_uprobe *tu, + unsigned long func, struct pt_regs *regs, + struct uprobe_cpu_buffer *ucb, int dsize) +{ + struct trace_event_call *call = trace_probe_event_call(&tu->tp); + struct uprobe_trace_entry_head *entry; + struct hlist_head *head; + void *data; + int size, esize; + int rctx; + +#ifdef CONFIG_BPF_EVENTS + if (bpf_prog_array_valid(call)) { + u32 ret; + + ret = bpf_prog_run_array_uprobe(call->prog_array, regs, bpf_prog_run); + if (!ret) + return; + } +#endif /* CONFIG_BPF_EVENTS */ + + esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); + + size = esize + tu->tp.size + dsize; + size = ALIGN(size + sizeof(u32), sizeof(u64)) - sizeof(u32); + if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) + return; + + preempt_disable(); + head = this_cpu_ptr(call->perf_events); + if (hlist_empty(head)) + goto out; + + entry = perf_trace_buf_alloc(size, NULL, &rctx); + if (!entry) + goto out; + + if (is_ret_probe(tu)) { + entry->vaddr[0] = func; + entry->vaddr[1] = instruction_pointer(regs); + data = DATAOF_TRACE_ENTRY(entry, true); + } else { + entry->vaddr[0] = instruction_pointer(regs); + data = DATAOF_TRACE_ENTRY(entry, false); + } + + memcpy(data, ucb->buf, tu->tp.size + dsize); + + if (size - esize > tu->tp.size + dsize) { + int len = tu->tp.size + dsize; + + memset(data + len, 0, size - esize - len); + } + + perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, + head, NULL); + out: + preempt_enable(); +} + +/* uprobe profile handler */ +static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs, + struct uprobe_cpu_buffer *ucb, int dsize) +{ + if 
(!uprobe_perf_filter(&tu->consumer, 0, current->mm)) + return UPROBE_HANDLER_REMOVE; + + if (!is_ret_probe(tu)) + __uprobe_perf_func(tu, 0, regs, ucb, dsize); + return 0; +} + +static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func, + struct pt_regs *regs, + struct uprobe_cpu_buffer *ucb, int dsize) +{ + __uprobe_perf_func(tu, func, regs, ucb, dsize); +} + +int bpf_get_uprobe_info(const struct perf_event *event, u32 *fd_type, + const char **filename, u64 *probe_offset, + u64 *probe_addr, bool perf_type_tracepoint) +{ + const char *pevent = trace_event_name(event->tp_event); + const char *group = event->tp_event->class->system; + struct trace_uprobe *tu; + + if (perf_type_tracepoint) + tu = find_probe_event(pevent, group); + else + tu = trace_uprobe_primary_from_call(event->tp_event); + if (!tu) + return -EINVAL; + + *fd_type = is_ret_probe(tu) ? BPF_FD_TYPE_URETPROBE + : BPF_FD_TYPE_UPROBE; + *filename = tu->filename; + *probe_offset = tu->offset; + *probe_addr = 0; + return 0; +} +#endif /* CONFIG_PERF_EVENTS */ + +static int +trace_uprobe_register(struct trace_event_call *event, enum trace_reg type, + void *data) +{ + struct trace_event_file *file = data; + + switch (type) { + case TRACE_REG_REGISTER: + return probe_event_enable(event, file, NULL); + + case TRACE_REG_UNREGISTER: + probe_event_disable(event, file); + return 0; + +#ifdef CONFIG_PERF_EVENTS + case TRACE_REG_PERF_REGISTER: + return probe_event_enable(event, NULL, uprobe_perf_filter); + + case TRACE_REG_PERF_UNREGISTER: + probe_event_disable(event, NULL); + return 0; + + case TRACE_REG_PERF_OPEN: + return uprobe_perf_open(event, data); + + case TRACE_REG_PERF_CLOSE: + return uprobe_perf_close(event, data); + +#endif + default: + return 0; + } +} + +static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) +{ + struct trace_uprobe *tu; + struct uprobe_dispatch_data udd; + struct uprobe_cpu_buffer *ucb; + int dsize, esize; + int ret = 0; + + + tu = container_of(con, struct trace_uprobe, consumer); + tu->nhit++; + + udd.tu = tu; + udd.bp_addr = instruction_pointer(regs); + + current->utask->vaddr = (unsigned long) &udd; + + if (WARN_ON_ONCE(!uprobe_cpu_buffer)) + return 0; + + dsize = __get_data_size(&tu->tp, regs); + esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); + + ucb = uprobe_buffer_get(); + store_trace_args(ucb->buf, &tu->tp, regs, esize, dsize); + + if (trace_probe_test_flag(&tu->tp, TP_FLAG_TRACE)) + ret |= uprobe_trace_func(tu, regs, ucb, dsize); + +#ifdef CONFIG_PERF_EVENTS + if (trace_probe_test_flag(&tu->tp, TP_FLAG_PROFILE)) + ret |= uprobe_perf_func(tu, regs, ucb, dsize); +#endif + uprobe_buffer_put(ucb); + return ret; +} + +static int uretprobe_dispatcher(struct uprobe_consumer *con, + unsigned long func, struct pt_regs *regs) +{ + struct trace_uprobe *tu; + struct uprobe_dispatch_data udd; + struct uprobe_cpu_buffer *ucb; + int dsize, esize; + + tu = container_of(con, struct trace_uprobe, consumer); + + udd.tu = tu; + udd.bp_addr = func; + + current->utask->vaddr = (unsigned long) &udd; + + if (WARN_ON_ONCE(!uprobe_cpu_buffer)) + return 0; + + dsize = __get_data_size(&tu->tp, regs); + esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); + + ucb = uprobe_buffer_get(); + store_trace_args(ucb->buf, &tu->tp, regs, esize, dsize); + + if (trace_probe_test_flag(&tu->tp, TP_FLAG_TRACE)) + uretprobe_trace_func(tu, func, regs, ucb, dsize); + +#ifdef CONFIG_PERF_EVENTS + if (trace_probe_test_flag(&tu->tp, TP_FLAG_PROFILE)) + uretprobe_perf_func(tu, func, regs, ucb, dsize); +#endif + 
uprobe_buffer_put(ucb); + return 0; +} + +static struct trace_event_functions uprobe_funcs = { + .trace = print_uprobe_event +}; + +static struct trace_event_fields uprobe_fields_array[] = { + { .type = TRACE_FUNCTION_TYPE, + .define_fields = uprobe_event_define_fields }, + {} +}; + +static inline void init_trace_event_call(struct trace_uprobe *tu) +{ + struct trace_event_call *call = trace_probe_event_call(&tu->tp); + call->event.funcs = &uprobe_funcs; + call->class->fields_array = uprobe_fields_array; + + call->flags = TRACE_EVENT_FL_UPROBE | TRACE_EVENT_FL_CAP_ANY; + call->class->reg = trace_uprobe_register; +} + +static int register_uprobe_event(struct trace_uprobe *tu) +{ + init_trace_event_call(tu); + + return trace_probe_register_event_call(&tu->tp); +} + +static int unregister_uprobe_event(struct trace_uprobe *tu) +{ + return trace_probe_unregister_event_call(&tu->tp); +} + +#ifdef CONFIG_PERF_EVENTS +struct trace_event_call * +create_local_trace_uprobe(char *name, unsigned long offs, + unsigned long ref_ctr_offset, bool is_return) +{ + enum probe_print_type ptype; + struct trace_uprobe *tu; + struct path path; + int ret; + + ret = kern_path(name, LOOKUP_FOLLOW, &path); + if (ret) + return ERR_PTR(ret); + + if (!d_is_reg(path.dentry)) { + path_put(&path); + return ERR_PTR(-EINVAL); + } + + /* + * local trace_kprobes are not added to dyn_event, so they are never + * searched in find_trace_kprobe(). Therefore, there is no concern of + * duplicated name "DUMMY_EVENT" here. + */ + tu = alloc_trace_uprobe(UPROBE_EVENT_SYSTEM, "DUMMY_EVENT", 0, + is_return); + + if (IS_ERR(tu)) { + pr_info("Failed to allocate trace_uprobe.(%d)\n", + (int)PTR_ERR(tu)); + path_put(&path); + return ERR_CAST(tu); + } + + tu->offset = offs; + tu->path = path; + tu->ref_ctr_offset = ref_ctr_offset; + tu->filename = kstrdup(name, GFP_KERNEL); + if (!tu->filename) { + ret = -ENOMEM; + goto error; + } + + init_trace_event_call(tu); + + ptype = is_ret_probe(tu) ? PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL; + if (traceprobe_set_print_fmt(&tu->tp, ptype) < 0) { + ret = -ENOMEM; + goto error; + } + + return trace_probe_event_call(&tu->tp); +error: + free_trace_uprobe(tu); + return ERR_PTR(ret); +} + +void destroy_local_trace_uprobe(struct trace_event_call *event_call) +{ + struct trace_uprobe *tu; + + tu = trace_uprobe_primary_from_call(event_call); + + free_trace_uprobe(tu); +} +#endif /* CONFIG_PERF_EVENTS */ + +/* Make a trace interface for controlling probe points */ +static __init int init_uprobe_trace(void) +{ + int ret; + + ret = dyn_event_register(&trace_uprobe_ops); + if (ret) + return ret; + + ret = tracing_init_dentry(); + if (ret) + return 0; + + trace_create_file("uprobe_events", TRACE_MODE_WRITE, NULL, + NULL, &uprobe_events_ops); + /* Profile interface */ + trace_create_file("uprobe_profile", TRACE_MODE_READ, NULL, + NULL, &uprobe_profile_ops); + return 0; +} + +fs_initcall(init_uprobe_trace); diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c new file mode 100644 index 0000000000..a4dcf0f243 --- /dev/null +++ b/kernel/trace/tracing_map.c @@ -0,0 +1,1139 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * tracing_map - lock-free map for tracing + * + * Copyright (C) 2015 Tom Zanussi <tom.zanussi@linux.intel.com> + * + * tracing_map implementation inspired by lock-free map algorithms + * originated by Dr. 
Cliff Click: + * + * http://www.azulsystems.com/blog/cliff/2007-03-26-non-blocking-hashtable + * http://www.azulsystems.com/events/javaone_2007/2007_LockFreeHash.pdf + */ + +#include <linux/vmalloc.h> +#include <linux/jhash.h> +#include <linux/slab.h> +#include <linux/sort.h> +#include <linux/kmemleak.h> + +#include "tracing_map.h" +#include "trace.h" + +/* + * NOTE: For a detailed description of the data structures used by + * these functions (such as tracing_map_elt) please see the overview + * of tracing_map data structures at the beginning of tracing_map.h. + */ + +/** + * tracing_map_update_sum - Add a value to a tracing_map_elt's sum field + * @elt: The tracing_map_elt + * @i: The index of the given sum associated with the tracing_map_elt + * @n: The value to add to the sum + * + * Add n to sum i associated with the specified tracing_map_elt + * instance. The index i is the index returned by the call to + * tracing_map_add_sum_field() when the tracing map was set up. + */ +void tracing_map_update_sum(struct tracing_map_elt *elt, unsigned int i, u64 n) +{ + atomic64_add(n, &elt->fields[i].sum); +} + +/** + * tracing_map_read_sum - Return the value of a tracing_map_elt's sum field + * @elt: The tracing_map_elt + * @i: The index of the given sum associated with the tracing_map_elt + * + * Retrieve the value of the sum i associated with the specified + * tracing_map_elt instance. The index i is the index returned by the + * call to tracing_map_add_sum_field() when the tracing map was set + * up. + * + * Return: The sum associated with field i for elt. + */ +u64 tracing_map_read_sum(struct tracing_map_elt *elt, unsigned int i) +{ + return (u64)atomic64_read(&elt->fields[i].sum); +} + +/** + * tracing_map_set_var - Assign a tracing_map_elt's variable field + * @elt: The tracing_map_elt + * @i: The index of the given variable associated with the tracing_map_elt + * @n: The value to assign + * + * Assign n to variable i associated with the specified tracing_map_elt + * instance. The index i is the index returned by the call to + * tracing_map_add_var() when the tracing map was set up. + */ +void tracing_map_set_var(struct tracing_map_elt *elt, unsigned int i, u64 n) +{ + atomic64_set(&elt->vars[i], n); + elt->var_set[i] = true; +} + +/** + * tracing_map_var_set - Return whether or not a variable has been set + * @elt: The tracing_map_elt + * @i: The index of the given variable associated with the tracing_map_elt + * + * Return true if the variable has been set, false otherwise. The + * index i is the index returned by the call to tracing_map_add_var() + * when the tracing map was set up. + */ +bool tracing_map_var_set(struct tracing_map_elt *elt, unsigned int i) +{ + return elt->var_set[i]; +} + +/** + * tracing_map_read_var - Return the value of a tracing_map_elt's variable field + * @elt: The tracing_map_elt + * @i: The index of the given variable associated with the tracing_map_elt + * + * Retrieve the value of the variable i associated with the specified + * tracing_map_elt instance. The index i is the index returned by the + * call to tracing_map_add_var() when the tracing map was set + * up. + * + * Return: The variable value associated with field i for elt. 
+ */ +u64 tracing_map_read_var(struct tracing_map_elt *elt, unsigned int i) +{ + return (u64)atomic64_read(&elt->vars[i]); +} + +/** + * tracing_map_read_var_once - Return and reset a tracing_map_elt's variable field + * @elt: The tracing_map_elt + * @i: The index of the given variable associated with the tracing_map_elt + * + * Retrieve the value of the variable i associated with the specified + * tracing_map_elt instance, and reset the variable to the 'not set' + * state. The index i is the index returned by the call to + * tracing_map_add_var() when the tracing map was set up. The reset + * essentially makes the variable a read-once variable if it's only + * accessed using this function. + * + * Return: The variable value associated with field i for elt. + */ +u64 tracing_map_read_var_once(struct tracing_map_elt *elt, unsigned int i) +{ + elt->var_set[i] = false; + return (u64)atomic64_read(&elt->vars[i]); +} + +int tracing_map_cmp_string(void *val_a, void *val_b) +{ + char *a = val_a; + char *b = val_b; + + return strcmp(a, b); +} + +int tracing_map_cmp_none(void *val_a, void *val_b) +{ + return 0; +} + +static int tracing_map_cmp_atomic64(void *val_a, void *val_b) +{ + u64 a = atomic64_read((atomic64_t *)val_a); + u64 b = atomic64_read((atomic64_t *)val_b); + + return (a > b) ? 1 : ((a < b) ? -1 : 0); +} + +#define DEFINE_TRACING_MAP_CMP_FN(type) \ +static int tracing_map_cmp_##type(void *val_a, void *val_b) \ +{ \ + type a = (type)(*(u64 *)val_a); \ + type b = (type)(*(u64 *)val_b); \ + \ + return (a > b) ? 1 : ((a < b) ? -1 : 0); \ +} + +DEFINE_TRACING_MAP_CMP_FN(s64); +DEFINE_TRACING_MAP_CMP_FN(u64); +DEFINE_TRACING_MAP_CMP_FN(s32); +DEFINE_TRACING_MAP_CMP_FN(u32); +DEFINE_TRACING_MAP_CMP_FN(s16); +DEFINE_TRACING_MAP_CMP_FN(u16); +DEFINE_TRACING_MAP_CMP_FN(s8); +DEFINE_TRACING_MAP_CMP_FN(u8); + +tracing_map_cmp_fn_t tracing_map_cmp_num(int field_size, + int field_is_signed) +{ + tracing_map_cmp_fn_t fn = tracing_map_cmp_none; + + switch (field_size) { + case 8: + if (field_is_signed) + fn = tracing_map_cmp_s64; + else + fn = tracing_map_cmp_u64; + break; + case 4: + if (field_is_signed) + fn = tracing_map_cmp_s32; + else + fn = tracing_map_cmp_u32; + break; + case 2: + if (field_is_signed) + fn = tracing_map_cmp_s16; + else + fn = tracing_map_cmp_u16; + break; + case 1: + if (field_is_signed) + fn = tracing_map_cmp_s8; + else + fn = tracing_map_cmp_u8; + break; + } + + return fn; +} + +static int tracing_map_add_field(struct tracing_map *map, + tracing_map_cmp_fn_t cmp_fn) +{ + int ret = -EINVAL; + + if (map->n_fields < TRACING_MAP_FIELDS_MAX) { + ret = map->n_fields; + map->fields[map->n_fields++].cmp_fn = cmp_fn; + } + + return ret; +} + +/** + * tracing_map_add_sum_field - Add a field describing a tracing_map sum + * @map: The tracing_map + * + * Add a sum field to the key and return the index identifying it in + * the map and associated tracing_map_elts. This is the index used + * for instance to update a sum for a particular tracing_map_elt using + * tracing_map_update_sum() or reading it via tracing_map_read_sum(). + * + * Return: The index identifying the field in the map and associated + * tracing_map_elts, or -EINVAL on error. + */ +int tracing_map_add_sum_field(struct tracing_map *map) +{ + return tracing_map_add_field(map, tracing_map_cmp_atomic64); +} + +/** + * tracing_map_add_var - Add a field describing a tracing_map var + * @map: The tracing_map + * + * Add a var to the map and return the index identifying it in the map + * and associated tracing_map_elts. 
This is the index used for + * instance to update a var for a particular tracing_map_elt using + * tracing_map_update_var() or reading it via tracing_map_read_var(). + * + * Return: The index identifying the var in the map and associated + * tracing_map_elts, or -EINVAL on error. + */ +int tracing_map_add_var(struct tracing_map *map) +{ + int ret = -EINVAL; + + if (map->n_vars < TRACING_MAP_VARS_MAX) + ret = map->n_vars++; + + return ret; +} + +/** + * tracing_map_add_key_field - Add a field describing a tracing_map key + * @map: The tracing_map + * @offset: The offset within the key + * @cmp_fn: The comparison function that will be used to sort on the key + * + * Let the map know there is a key and that if it's used as a sort key + * to use cmp_fn. + * + * A key can be a subset of a compound key; for that purpose, the + * offset param is used to describe where within the compound key + * the key referenced by this key field resides. + * + * Return: The index identifying the field in the map and associated + * tracing_map_elts, or -EINVAL on error. + */ +int tracing_map_add_key_field(struct tracing_map *map, + unsigned int offset, + tracing_map_cmp_fn_t cmp_fn) + +{ + int idx = tracing_map_add_field(map, cmp_fn); + + if (idx < 0) + return idx; + + map->fields[idx].offset = offset; + + map->key_idx[map->n_keys++] = idx; + + return idx; +} + +static void tracing_map_array_clear(struct tracing_map_array *a) +{ + unsigned int i; + + if (!a->pages) + return; + + for (i = 0; i < a->n_pages; i++) + memset(a->pages[i], 0, PAGE_SIZE); +} + +static void tracing_map_array_free(struct tracing_map_array *a) +{ + unsigned int i; + + if (!a) + return; + + if (!a->pages) + goto free; + + for (i = 0; i < a->n_pages; i++) { + if (!a->pages[i]) + break; + kmemleak_free(a->pages[i]); + free_page((unsigned long)a->pages[i]); + } + + kfree(a->pages); + + free: + kfree(a); +} + +static struct tracing_map_array *tracing_map_array_alloc(unsigned int n_elts, + unsigned int entry_size) +{ + struct tracing_map_array *a; + unsigned int i; + + a = kzalloc(sizeof(*a), GFP_KERNEL); + if (!a) + return NULL; + + a->entry_size_shift = fls(roundup_pow_of_two(entry_size) - 1); + a->entries_per_page = PAGE_SIZE / (1 << a->entry_size_shift); + a->n_pages = n_elts / a->entries_per_page; + if (!a->n_pages) + a->n_pages = 1; + a->entry_shift = fls(a->entries_per_page) - 1; + a->entry_mask = (1 << a->entry_shift) - 1; + + a->pages = kcalloc(a->n_pages, sizeof(void *), GFP_KERNEL); + if (!a->pages) + goto free; + + for (i = 0; i < a->n_pages; i++) { + a->pages[i] = (void *)get_zeroed_page(GFP_KERNEL); + if (!a->pages[i]) + goto free; + kmemleak_alloc(a->pages[i], PAGE_SIZE, 1, GFP_KERNEL); + } + out: + return a; + free: + tracing_map_array_free(a); + a = NULL; + + goto out; +} + +static void tracing_map_elt_clear(struct tracing_map_elt *elt) +{ + unsigned i; + + for (i = 0; i < elt->map->n_fields; i++) + if (elt->fields[i].cmp_fn == tracing_map_cmp_atomic64) + atomic64_set(&elt->fields[i].sum, 0); + + for (i = 0; i < elt->map->n_vars; i++) { + atomic64_set(&elt->vars[i], 0); + elt->var_set[i] = false; + } + + if (elt->map->ops && elt->map->ops->elt_clear) + elt->map->ops->elt_clear(elt); +} + +static void tracing_map_elt_init_fields(struct tracing_map_elt *elt) +{ + unsigned int i; + + tracing_map_elt_clear(elt); + + for (i = 0; i < elt->map->n_fields; i++) { + elt->fields[i].cmp_fn = elt->map->fields[i].cmp_fn; + + if (elt->fields[i].cmp_fn != tracing_map_cmp_atomic64) + elt->fields[i].offset = elt->map->fields[i].offset; + } +} + 
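+/*
+ * Illustrative usage sketch (an assumption for illustration, not part of
+ * the upstream file): it ties together the setup flow described in the
+ * kerneldoc above -- create the map, add a sum field and a key field,
+ * call tracing_map_init(), then insert and aggregate from the hot path.
+ * The u32 key, the "fail" label and the local variable names are
+ * hypothetical; only the tracing_map_*() calls shown here come from this
+ * file's API.
+ *
+ *	struct tracing_map *map;
+ *	struct tracing_map_elt *elt;
+ *	int hit_idx, key_idx;
+ *	u32 key = 0;
+ *
+ *	map = tracing_map_create(TRACING_MAP_BITS_DEFAULT, sizeof(u32),
+ *				 NULL, NULL);
+ *	if (IS_ERR(map))
+ *		return PTR_ERR(map);
+ *
+ *	hit_idx = tracing_map_add_sum_field(map);
+ *	key_idx = tracing_map_add_key_field(map, 0,
+ *					    tracing_map_cmp_num(sizeof(u32), 0));
+ *	if (hit_idx < 0 || key_idx < 0 || tracing_map_init(map) < 0)
+ *		goto fail;	// fail: would call tracing_map_destroy(map)
+ *
+ *	elt = tracing_map_insert(map, &key);	// lock-free, usable from any context
+ *	if (elt)
+ *		tracing_map_update_sum(elt, hit_idx, 1);
+ */
+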
+static void tracing_map_elt_free(struct tracing_map_elt *elt) +{ + if (!elt) + return; + + if (elt->map->ops && elt->map->ops->elt_free) + elt->map->ops->elt_free(elt); + kfree(elt->fields); + kfree(elt->vars); + kfree(elt->var_set); + kfree(elt->key); + kfree(elt); +} + +static struct tracing_map_elt *tracing_map_elt_alloc(struct tracing_map *map) +{ + struct tracing_map_elt *elt; + int err = 0; + + elt = kzalloc(sizeof(*elt), GFP_KERNEL); + if (!elt) + return ERR_PTR(-ENOMEM); + + elt->map = map; + + elt->key = kzalloc(map->key_size, GFP_KERNEL); + if (!elt->key) { + err = -ENOMEM; + goto free; + } + + elt->fields = kcalloc(map->n_fields, sizeof(*elt->fields), GFP_KERNEL); + if (!elt->fields) { + err = -ENOMEM; + goto free; + } + + elt->vars = kcalloc(map->n_vars, sizeof(*elt->vars), GFP_KERNEL); + if (!elt->vars) { + err = -ENOMEM; + goto free; + } + + elt->var_set = kcalloc(map->n_vars, sizeof(*elt->var_set), GFP_KERNEL); + if (!elt->var_set) { + err = -ENOMEM; + goto free; + } + + tracing_map_elt_init_fields(elt); + + if (map->ops && map->ops->elt_alloc) { + err = map->ops->elt_alloc(elt); + if (err) + goto free; + } + return elt; + free: + tracing_map_elt_free(elt); + + return ERR_PTR(err); +} + +static struct tracing_map_elt *get_free_elt(struct tracing_map *map) +{ + struct tracing_map_elt *elt = NULL; + int idx; + + idx = atomic_inc_return(&map->next_elt); + if (idx < map->max_elts) { + elt = *(TRACING_MAP_ELT(map->elts, idx)); + if (map->ops && map->ops->elt_init) + map->ops->elt_init(elt); + } + + return elt; +} + +static void tracing_map_free_elts(struct tracing_map *map) +{ + unsigned int i; + + if (!map->elts) + return; + + for (i = 0; i < map->max_elts; i++) { + tracing_map_elt_free(*(TRACING_MAP_ELT(map->elts, i))); + *(TRACING_MAP_ELT(map->elts, i)) = NULL; + } + + tracing_map_array_free(map->elts); + map->elts = NULL; +} + +static int tracing_map_alloc_elts(struct tracing_map *map) +{ + unsigned int i; + + map->elts = tracing_map_array_alloc(map->max_elts, + sizeof(struct tracing_map_elt *)); + if (!map->elts) + return -ENOMEM; + + for (i = 0; i < map->max_elts; i++) { + *(TRACING_MAP_ELT(map->elts, i)) = tracing_map_elt_alloc(map); + if (IS_ERR(*(TRACING_MAP_ELT(map->elts, i)))) { + *(TRACING_MAP_ELT(map->elts, i)) = NULL; + tracing_map_free_elts(map); + + return -ENOMEM; + } + } + + return 0; +} + +static inline bool keys_match(void *key, void *test_key, unsigned key_size) +{ + bool match = true; + + if (memcmp(key, test_key, key_size)) + match = false; + + return match; +} + +static inline struct tracing_map_elt * +__tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only) +{ + u32 idx, key_hash, test_key; + int dup_try = 0; + struct tracing_map_entry *entry; + struct tracing_map_elt *val; + + key_hash = jhash(key, map->key_size, 0); + if (key_hash == 0) + key_hash = 1; + idx = key_hash >> (32 - (map->map_bits + 1)); + + while (1) { + idx &= (map->map_size - 1); + entry = TRACING_MAP_ENTRY(map->map, idx); + test_key = entry->key; + + if (test_key && test_key == key_hash) { + val = READ_ONCE(entry->val); + if (val && + keys_match(key, val->key, map->key_size)) { + if (!lookup_only) + atomic64_inc(&map->hits); + return val; + } else if (unlikely(!val)) { + /* + * The key is present. But, val (pointer to elt + * struct) is still NULL. which means some other + * thread is in the process of inserting an + * element. + * + * On top of that, it's key_hash is same as the + * one being inserted right now. 
So, it's + * possible that the element has the same + * key as well. + */ + + dup_try++; + if (dup_try > map->map_size) { + atomic64_inc(&map->drops); + break; + } + continue; + } + } + + if (!test_key) { + if (lookup_only) + break; + + if (!cmpxchg(&entry->key, 0, key_hash)) { + struct tracing_map_elt *elt; + + elt = get_free_elt(map); + if (!elt) { + atomic64_inc(&map->drops); + entry->key = 0; + break; + } + + memcpy(elt->key, key, map->key_size); + /* + * Ensure the initialization is visible and + * publish the elt. + */ + smp_wmb(); + WRITE_ONCE(entry->val, elt); + atomic64_inc(&map->hits); + + return entry->val; + } else { + /* + * cmpxchg() failed. Loop around once + * more to check what key was inserted. + */ + dup_try++; + continue; + } + } + + idx++; + } + + return NULL; +} + +/** + * tracing_map_insert - Insert key and/or retrieve val from a tracing_map + * @map: The tracing_map to insert into + * @key: The key to insert + * + * Inserts a key into a tracing_map and creates and returns a new + * tracing_map_elt for it, or if the key has already been inserted by + * a previous call, returns the tracing_map_elt already associated + * with it. When the map was created, the number of elements to be + * allocated for the map was specified (internally maintained as + * 'max_elts' in struct tracing_map), and that number of + * tracing_map_elts was created by tracing_map_init(). This is the + * pre-allocated pool of tracing_map_elts that tracing_map_insert() + * will allocate from when adding new keys. Once that pool is + * exhausted, tracing_map_insert() is useless and will return NULL to + * signal that state. There are two user-visible tracing_map + * variables, 'hits' and 'drops', which are updated by this function. + * Every time an element is either successfully inserted or retrieved, + * the 'hits' value is incremented. Every time an element insertion + * fails, the 'drops' value is incremented. + * + * This is a lock-free tracing map insertion function implementing a + * modified form of Cliff Click's basic insertion algorithm. It + * requires the table size be a power of two. To prevent any + * possibility of an infinite loop we always make the internal table + * size double the size of the requested table size (max_elts * 2). + * Likewise, we never reuse a slot or resize or delete elements - when + * we've reached max_elts entries, we simply return NULL once we've + * run out of entries. Readers can at any point in time traverse the + * tracing map and safely access the key/val pairs. + * + * Return: the tracing_map_elt pointer val associated with the key. + * If this was a newly inserted key, the val will be a newly allocated + * and associated tracing_map_elt pointer val. If the key wasn't + * found and the pool of tracing_map_elts has been exhausted, NULL is + * returned and no further insertions will succeed. + */ +struct tracing_map_elt *tracing_map_insert(struct tracing_map *map, void *key) +{ + return __tracing_map_insert(map, key, false); +} + +/** + * tracing_map_lookup - Retrieve val from a tracing_map + * @map: The tracing_map to perform the lookup on + * @key: The key to look up + * + * Looks up key in tracing_map and if found returns the matching + * tracing_map_elt. This is a lock-free lookup; see + * tracing_map_insert() for details on tracing_map and how it works. + * Every time an element is retrieved, the 'hits' value is + * incremented. There is one user-visible tracing_map variable, + * 'hits', which is updated by this function. 
Every time an element + * is successfully retrieved, the 'hits' value is incremented. The + * 'drops' value is never updated by this function. + * + * Return: the tracing_map_elt pointer val associated with the key. + * If the key wasn't found, NULL is returned. + */ +struct tracing_map_elt *tracing_map_lookup(struct tracing_map *map, void *key) +{ + return __tracing_map_insert(map, key, true); +} + +/** + * tracing_map_destroy - Destroy a tracing_map + * @map: The tracing_map to destroy + * + * Frees a tracing_map along with its associated array of + * tracing_map_elts. + * + * Callers should make sure there are no readers or writers actively + * reading or inserting into the map before calling this. + */ +void tracing_map_destroy(struct tracing_map *map) +{ + if (!map) + return; + + tracing_map_free_elts(map); + + tracing_map_array_free(map->map); + kfree(map); +} + +/** + * tracing_map_clear - Clear a tracing_map + * @map: The tracing_map to clear + * + * Resets the tracing map to a cleared or initial state. The + * tracing_map_elts are all cleared, and the array of struct + * tracing_map_entry is reset to an initialized state. + * + * Callers should make sure there are no writers actively inserting + * into the map before calling this. + */ +void tracing_map_clear(struct tracing_map *map) +{ + unsigned int i; + + atomic_set(&map->next_elt, -1); + atomic64_set(&map->hits, 0); + atomic64_set(&map->drops, 0); + + tracing_map_array_clear(map->map); + + for (i = 0; i < map->max_elts; i++) + tracing_map_elt_clear(*(TRACING_MAP_ELT(map->elts, i))); +} + +static void set_sort_key(struct tracing_map *map, + struct tracing_map_sort_key *sort_key) +{ + map->sort_key = *sort_key; +} + +/** + * tracing_map_create - Create a lock-free map and element pool + * @map_bits: The size of the map (2 ** map_bits) + * @key_size: The size of the key for the map in bytes + * @ops: Optional client-defined tracing_map_ops instance + * @private_data: Client data associated with the map + * + * Creates and sets up a map to contain 2 ** map_bits number of + * elements (internally maintained as 'max_elts' in struct + * tracing_map). Before using, map fields should be added to the map + * with tracing_map_add_sum_field() and tracing_map_add_key_field(). + * tracing_map_init() should then be called to allocate the array of + * tracing_map_elts, in order to avoid allocating anything in the map + * insertion path. The user-specified map size reflects the maximum + * number of elements that can be contained in the table requested by + * the user - internally we double that in order to keep the table + * sparse and keep collisions manageable. + * + * A tracing_map is a special-purpose map designed to aggregate or + * 'sum' one or more values associated with a specific object of type + * tracing_map_elt, which is attached by the map to a given key. + * + * tracing_map_create() sets up the map itself, and provides + * operations for inserting tracing_map_elts, but doesn't allocate the + * tracing_map_elts themselves, or provide a means for describing the + * keys or sums associated with the tracing_map_elts. All + * tracing_map_elts for a given map have the same set of sums and + * keys, which are defined by the client using the functions + * tracing_map_add_key_field() and tracing_map_add_sum_field(). Once + * the fields are defined, the pool of elements allocated for the map + * can be created, which occurs when the client code calls + * tracing_map_init(). 
+ * + * When tracing_map_init() returns, tracing_map_elt elements can be + * inserted into the map using tracing_map_insert(). When called, + * tracing_map_insert() grabs a free tracing_map_elt from the pool, or + * finds an existing match in the map and in either case returns it. + * The client can then use tracing_map_update_sum() and + * tracing_map_read_sum() to update or read a given sum field for the + * tracing_map_elt. + * + * The client can at any point retrieve and traverse the current set + * of inserted tracing_map_elts in a tracing_map, via + * tracing_map_sort_entries(). Sorting can be done on any field, + * including keys. + * + * See tracing_map.h for a description of tracing_map_ops. + * + * Return: the tracing_map pointer if successful, ERR_PTR if not. + */ +struct tracing_map *tracing_map_create(unsigned int map_bits, + unsigned int key_size, + const struct tracing_map_ops *ops, + void *private_data) +{ + struct tracing_map *map; + unsigned int i; + + if (map_bits < TRACING_MAP_BITS_MIN || + map_bits > TRACING_MAP_BITS_MAX) + return ERR_PTR(-EINVAL); + + map = kzalloc(sizeof(*map), GFP_KERNEL); + if (!map) + return ERR_PTR(-ENOMEM); + + map->map_bits = map_bits; + map->max_elts = (1 << map_bits); + atomic_set(&map->next_elt, -1); + + map->map_size = (1 << (map_bits + 1)); + map->ops = ops; + + map->private_data = private_data; + + map->map = tracing_map_array_alloc(map->map_size, + sizeof(struct tracing_map_entry)); + if (!map->map) + goto free; + + map->key_size = key_size; + for (i = 0; i < TRACING_MAP_KEYS_MAX; i++) + map->key_idx[i] = -1; + out: + return map; + free: + tracing_map_destroy(map); + map = ERR_PTR(-ENOMEM); + + goto out; +} + +/** + * tracing_map_init - Allocate and clear a map's tracing_map_elts + * @map: The tracing_map to initialize + * + * Allocates a clears a pool of tracing_map_elts equal to the + * user-specified size of 2 ** map_bits (internally maintained as + * 'max_elts' in struct tracing_map). Before using, the map fields + * should be added to the map with tracing_map_add_sum_field() and + * tracing_map_add_key_field(). tracing_map_init() should then be + * called to allocate the array of tracing_map_elts, in order to avoid + * allocating anything in the map insertion path. The user-specified + * map size reflects the max number of elements requested by the user + * - internally we double that in order to keep the table sparse and + * keep collisions manageable. + * + * See tracing_map.h for a description of tracing_map_ops. + * + * Return: the tracing_map pointer if successful, ERR_PTR if not. 
+ */ +int tracing_map_init(struct tracing_map *map) +{ + int err; + + if (map->n_fields < 2) + return -EINVAL; /* need at least 1 key and 1 val */ + + err = tracing_map_alloc_elts(map); + if (err) + return err; + + tracing_map_clear(map); + + return err; +} + +static int cmp_entries_dup(const void *A, const void *B) +{ + const struct tracing_map_sort_entry *a, *b; + int ret = 0; + + a = *(const struct tracing_map_sort_entry **)A; + b = *(const struct tracing_map_sort_entry **)B; + + if (memcmp(a->key, b->key, a->elt->map->key_size)) + ret = 1; + + return ret; +} + +static int cmp_entries_sum(const void *A, const void *B) +{ + const struct tracing_map_elt *elt_a, *elt_b; + const struct tracing_map_sort_entry *a, *b; + struct tracing_map_sort_key *sort_key; + struct tracing_map_field *field; + tracing_map_cmp_fn_t cmp_fn; + void *val_a, *val_b; + int ret = 0; + + a = *(const struct tracing_map_sort_entry **)A; + b = *(const struct tracing_map_sort_entry **)B; + + elt_a = a->elt; + elt_b = b->elt; + + sort_key = &elt_a->map->sort_key; + + field = &elt_a->fields[sort_key->field_idx]; + cmp_fn = field->cmp_fn; + + val_a = &elt_a->fields[sort_key->field_idx].sum; + val_b = &elt_b->fields[sort_key->field_idx].sum; + + ret = cmp_fn(val_a, val_b); + if (sort_key->descending) + ret = -ret; + + return ret; +} + +static int cmp_entries_key(const void *A, const void *B) +{ + const struct tracing_map_elt *elt_a, *elt_b; + const struct tracing_map_sort_entry *a, *b; + struct tracing_map_sort_key *sort_key; + struct tracing_map_field *field; + tracing_map_cmp_fn_t cmp_fn; + void *val_a, *val_b; + int ret = 0; + + a = *(const struct tracing_map_sort_entry **)A; + b = *(const struct tracing_map_sort_entry **)B; + + elt_a = a->elt; + elt_b = b->elt; + + sort_key = &elt_a->map->sort_key; + + field = &elt_a->fields[sort_key->field_idx]; + + cmp_fn = field->cmp_fn; + + val_a = elt_a->key + field->offset; + val_b = elt_b->key + field->offset; + + ret = cmp_fn(val_a, val_b); + if (sort_key->descending) + ret = -ret; + + return ret; +} + +static void destroy_sort_entry(struct tracing_map_sort_entry *entry) +{ + if (!entry) + return; + + if (entry->elt_copied) + tracing_map_elt_free(entry->elt); + + kfree(entry); +} + +/** + * tracing_map_destroy_sort_entries - Destroy an array of sort entries + * @entries: The entries to destroy + * @n_entries: The number of entries in the array + * + * Destroy the elements returned by a tracing_map_sort_entries() call. 
+ */ +void tracing_map_destroy_sort_entries(struct tracing_map_sort_entry **entries, + unsigned int n_entries) +{ + unsigned int i; + + for (i = 0; i < n_entries; i++) + destroy_sort_entry(entries[i]); + + vfree(entries); +} + +static struct tracing_map_sort_entry * +create_sort_entry(void *key, struct tracing_map_elt *elt) +{ + struct tracing_map_sort_entry *sort_entry; + + sort_entry = kzalloc(sizeof(*sort_entry), GFP_KERNEL); + if (!sort_entry) + return NULL; + + sort_entry->key = key; + sort_entry->elt = elt; + + return sort_entry; +} + +static void detect_dups(struct tracing_map_sort_entry **sort_entries, + int n_entries, unsigned int key_size) +{ + unsigned int total_dups = 0; + int i; + void *key; + + if (n_entries < 2) + return; + + sort(sort_entries, n_entries, sizeof(struct tracing_map_sort_entry *), + (int (*)(const void *, const void *))cmp_entries_dup, NULL); + + key = sort_entries[0]->key; + for (i = 1; i < n_entries; i++) { + if (!memcmp(sort_entries[i]->key, key, key_size)) { + total_dups++; + continue; + } + key = sort_entries[i]->key; + } + + WARN_ONCE(total_dups > 0, + "Duplicates detected: %d\n", total_dups); +} + +static bool is_key(struct tracing_map *map, unsigned int field_idx) +{ + unsigned int i; + + for (i = 0; i < map->n_keys; i++) + if (map->key_idx[i] == field_idx) + return true; + return false; +} + +static void sort_secondary(struct tracing_map *map, + const struct tracing_map_sort_entry **entries, + unsigned int n_entries, + struct tracing_map_sort_key *primary_key, + struct tracing_map_sort_key *secondary_key) +{ + int (*primary_fn)(const void *, const void *); + int (*secondary_fn)(const void *, const void *); + unsigned i, start = 0, n_sub = 1; + + if (is_key(map, primary_key->field_idx)) + primary_fn = cmp_entries_key; + else + primary_fn = cmp_entries_sum; + + if (is_key(map, secondary_key->field_idx)) + secondary_fn = cmp_entries_key; + else + secondary_fn = cmp_entries_sum; + + for (i = 0; i < n_entries - 1; i++) { + const struct tracing_map_sort_entry **a = &entries[i]; + const struct tracing_map_sort_entry **b = &entries[i + 1]; + + if (primary_fn(a, b) == 0) { + n_sub++; + if (i < n_entries - 2) + continue; + } + + if (n_sub < 2) { + start = i + 1; + n_sub = 1; + continue; + } + + set_sort_key(map, secondary_key); + sort(&entries[start], n_sub, + sizeof(struct tracing_map_sort_entry *), + (int (*)(const void *, const void *))secondary_fn, NULL); + set_sort_key(map, primary_key); + + start = i + 1; + n_sub = 1; + } +} + +/** + * tracing_map_sort_entries - Sort the current set of tracing_map_elts in a map + * @map: The tracing_map + * @sort_keys: The sort key to use for sorting + * @n_sort_keys: hitcount, always have at least one + * @sort_entries: outval: pointer to allocated and sorted array of entries + * + * tracing_map_sort_entries() sorts the current set of entries in the + * map and returns the list of tracing_map_sort_entries containing + * them to the client in the sort_entries param. The client can + * access the struct tracing_map_elt element of interest directly as + * the 'elt' field of a returned struct tracing_map_sort_entry object. + * + * The sort_key has only two fields: idx and descending. 'idx' refers + * to the index of the field added via tracing_map_add_sum_field() or + * tracing_map_add_key_field() when the tracing_map was initialized. + * 'descending' is a flag that if set reverses the sort order, which + * by default is ascending. 
+ * + * The client should not hold on to the returned array but should use + * it and call tracing_map_destroy_sort_entries() when done. + * + * Return: the number of sort_entries in the struct tracing_map_sort_entry + * array, negative on error + */ +int tracing_map_sort_entries(struct tracing_map *map, + struct tracing_map_sort_key *sort_keys, + unsigned int n_sort_keys, + struct tracing_map_sort_entry ***sort_entries) +{ + int (*cmp_entries_fn)(const void *, const void *); + struct tracing_map_sort_entry *sort_entry, **entries; + int i, n_entries, ret; + + entries = vmalloc(array_size(sizeof(sort_entry), map->max_elts)); + if (!entries) + return -ENOMEM; + + for (i = 0, n_entries = 0; i < map->map_size; i++) { + struct tracing_map_entry *entry; + + entry = TRACING_MAP_ENTRY(map->map, i); + + if (!entry->key || !entry->val) + continue; + + entries[n_entries] = create_sort_entry(entry->val->key, + entry->val); + if (!entries[n_entries++]) { + ret = -ENOMEM; + goto free; + } + } + + if (n_entries == 0) { + ret = 0; + goto free; + } + + if (n_entries == 1) { + *sort_entries = entries; + return 1; + } + + detect_dups(entries, n_entries, map->key_size); + + if (is_key(map, sort_keys[0].field_idx)) + cmp_entries_fn = cmp_entries_key; + else + cmp_entries_fn = cmp_entries_sum; + + set_sort_key(map, &sort_keys[0]); + + sort(entries, n_entries, sizeof(struct tracing_map_sort_entry *), + (int (*)(const void *, const void *))cmp_entries_fn, NULL); + + if (n_sort_keys > 1) + sort_secondary(map, + (const struct tracing_map_sort_entry **)entries, + n_entries, + &sort_keys[0], + &sort_keys[1]); + + *sort_entries = entries; + + return n_entries; + free: + tracing_map_destroy_sort_entries(entries, n_entries); + + return ret; +} diff --git a/kernel/trace/tracing_map.h b/kernel/trace/tracing_map.h new file mode 100644 index 0000000000..99c37eeebc --- /dev/null +++ b/kernel/trace/tracing_map.h @@ -0,0 +1,284 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef __TRACING_MAP_H +#define __TRACING_MAP_H + +#define TRACING_MAP_BITS_DEFAULT 11 +#define TRACING_MAP_BITS_MAX 17 +#define TRACING_MAP_BITS_MIN 7 + +#define TRACING_MAP_KEYS_MAX 3 +#define TRACING_MAP_VALS_MAX 3 +#define TRACING_MAP_FIELDS_MAX (TRACING_MAP_KEYS_MAX + \ + TRACING_MAP_VALS_MAX) +#define TRACING_MAP_VARS_MAX 16 +#define TRACING_MAP_SORT_KEYS_MAX 2 + +typedef int (*tracing_map_cmp_fn_t) (void *val_a, void *val_b); + +/* + * This is an overview of the tracing_map data structures and how they + * relate to the tracing_map API. The details of the algorithms + * aren't discussed here - this is just a general overview of the data + * structures and how they interact with the API. + * + * The central data structure of the tracing_map is an initially + * zeroed array of struct tracing_map_entry (stored in the map field + * of struct tracing_map). tracing_map_entry is a very simple data + * structure containing only two fields: a 32-bit unsigned 'key' + * variable and a pointer named 'val'. This array of struct + * tracing_map_entry is essentially a hash table which will be + * modified by a single function, tracing_map_insert(), but which can + * be traversed and read by a user at any time (though the user does + * this indirectly via an array of tracing_map_sort_entry - see the + * explanation of that data structure in the discussion of the + * sorting-related data structures below). + * + * The central function of the tracing_map API is + * tracing_map_insert(). 
tracing_map_insert() hashes the + * arbitrarily-sized key passed into it into a 32-bit unsigned key. + * It then uses this key, truncated to the array size, as an index + * into the array of tracing_map_entries. If the value of the 'key' + * field of the tracing_map_entry found at that location is 0, then + * that entry is considered to be free and can be claimed, by + * replacing the 0 in the 'key' field of the tracing_map_entry with + * the new 32-bit hashed key. Once claimed, that tracing_map_entry's + * 'val' field is then used to store a unique element which will be + * forever associated with that 32-bit hashed key in the + * tracing_map_entry. + * + * That unique element now in the tracing_map_entry's 'val' field is + * an instance of tracing_map_elt, where 'elt' in the latter part of + * that variable name is short for 'element'. The purpose of a + * tracing_map_elt is to hold values specific to the particular + * 32-bit hashed key it's associated with. These include the unique + * set of aggregated sums associated with the 32-bit hashed key, along + * with a copy of the full key associated with the entry, which was + * used to produce the 32-bit hashed key. + * + * When tracing_map_create() is called to create the tracing map, the + * user specifies (indirectly via the map_bits param, the details are + * unimportant for this discussion) the maximum number of elements + * that the map can hold (stored in the max_elts field of struct + * tracing_map). This is the maximum possible number of + * tracing_map_entries in the tracing_map_entry array which can be + * 'claimed' as described in the above discussion, and therefore is + * also the maximum number of tracing_map_elts that can be associated + * with the tracing_map_entry array in the tracing_map. Because of + * the way the insertion algorithm works, the size of the allocated + * tracing_map_entry array is always twice the maximum number of + * elements (2 * max_elts). This value is stored in the map_size + * field of struct tracing_map. + * + * Because tracing_map_insert() needs to work from any context, + * including from within the memory allocation functions themselves, + * both the tracing_map_entry array and a pool of max_elts + * tracing_map_elts are pre-allocated before any call is made to + * tracing_map_insert(). + * + * The tracing_map_entry array is allocated as a single block by + * tracing_map_create(). + * + * Because the tracing_map_elts are much larger objects and can't + * generally be allocated together as a single large array without + * failure, they're allocated individually, by tracing_map_init(). + * + * The pool of tracing_map_elts is allocated by tracing_map_init() + * rather than by tracing_map_create() because at the time + * tracing_map_create() is called, there isn't enough information to + * create the tracing_map_elts. Specifically, the user first needs to + * tell the tracing_map implementation how many fields the + * tracing_map_elts contain, and which types of fields they are (key + * or sum). The user does this via the tracing_map_add_sum_field() + * and tracing_map_add_key_field() functions, following which the user + * calls tracing_map_init() to finish up the tracing map setup. The + * array holding the pointers which make up the pre-allocated pool of + * tracing_map_elts is allocated as a single block and is stored in + * the elts field of struct tracing_map. + * + * There is also a set of structures used for sorting that might + * benefit from some minimal explanation. 
+ * + * struct tracing_map_sort_key is used to drive the sort at any given + * time. By 'any given time' we mean that a different + * tracing_map_sort_key will be used at different times depending on + * whether the sort currently being performed is a primary or a + * secondary sort. + * + * The sort key is very simple, consisting of the field index of the + * tracing_map_elt field to sort on (which the user saved when adding + * the field), and whether the sort should be done in an ascending or + * descending order. + * + * For the convenience of the sorting code, a tracing_map_sort_entry + * is created for each tracing_map_elt, again individually allocated + * to avoid failures that might be expected if allocated as a single + * large array of struct tracing_map_sort_entry. + * tracing_map_sort_entry instances are the objects expected by the + * various internal sorting functions, and are also what the user + * ultimately receives after calling tracing_map_sort_entries(). + * Because it doesn't make sense for users to access an unordered and + * sparsely populated tracing_map directly, the + * tracing_map_sort_entries() function is provided so that users can + * retrieve a sorted list of all existing elements. In addition to + * the associated tracing_map_elt 'elt' field contained within the + * tracing_map_sort_entry, which is the object of interest to the + * user, tracing_map_sort_entry objects contain a number of additional + * fields which are used for caching and internal purposes and can + * safely be ignored. +*/ + +struct tracing_map_field { + tracing_map_cmp_fn_t cmp_fn; + union { + atomic64_t sum; + unsigned int offset; + }; +}; + +struct tracing_map_elt { + struct tracing_map *map; + struct tracing_map_field *fields; + atomic64_t *vars; + bool *var_set; + void *key; + void *private_data; +}; + +struct tracing_map_entry { + u32 key; + struct tracing_map_elt *val; +}; + +struct tracing_map_sort_key { + unsigned int field_idx; + bool descending; +}; + +struct tracing_map_sort_entry { + void *key; + struct tracing_map_elt *elt; + bool elt_copied; + bool dup; +}; + +struct tracing_map_array { + unsigned int entries_per_page; + unsigned int entry_size_shift; + unsigned int entry_shift; + unsigned int entry_mask; + unsigned int n_pages; + void **pages; +}; + +#define TRACING_MAP_ARRAY_ELT(array, idx) \ + (array->pages[idx >> array->entry_shift] + \ + ((idx & array->entry_mask) << array->entry_size_shift)) + +#define TRACING_MAP_ENTRY(array, idx) \ + ((struct tracing_map_entry *)TRACING_MAP_ARRAY_ELT(array, idx)) + +#define TRACING_MAP_ELT(array, idx) \ + ((struct tracing_map_elt **)TRACING_MAP_ARRAY_ELT(array, idx)) + +struct tracing_map { + unsigned int key_size; + unsigned int map_bits; + unsigned int map_size; + unsigned int max_elts; + atomic_t next_elt; + struct tracing_map_array *elts; + struct tracing_map_array *map; + const struct tracing_map_ops *ops; + void *private_data; + struct tracing_map_field fields[TRACING_MAP_FIELDS_MAX]; + unsigned int n_fields; + int key_idx[TRACING_MAP_KEYS_MAX]; + unsigned int n_keys; + struct tracing_map_sort_key sort_key; + unsigned int n_vars; + atomic64_t hits; + atomic64_t drops; +}; + +/** + * struct tracing_map_ops - callbacks for tracing_map + * + * The methods in this structure define callback functions for various + * operations on a tracing_map or objects related to a tracing_map. + * + * For a detailed description of tracing_map_elt objects please see + * the overview of tracing_map data structures at the beginning of + * this file. 
+ * + * All the methods below are optional. + * + * @elt_alloc: When a tracing_map_elt is allocated, this function, if + * defined, will be called and gives clients the opportunity to + * allocate additional data and attach it to the element + * (tracing_map_elt->private_data is meant for that purpose). + * Element allocation occurs before tracing begins, when the + * tracing_map_init() call is made by client code. + * + * @elt_free: When a tracing_map_elt is freed, this function is called + * and allows client-allocated per-element data to be freed. + * + * @elt_clear: This callback allows per-element client-defined data to + * be cleared, if applicable. + * + * @elt_init: This callback allows per-element client-defined data to + * be initialized when used, i.e. when the element is actually + * claimed by tracing_map_insert() in the context of the map + * insertion. + */ +struct tracing_map_ops { + int (*elt_alloc)(struct tracing_map_elt *elt); + void (*elt_free)(struct tracing_map_elt *elt); + void (*elt_clear)(struct tracing_map_elt *elt); + void (*elt_init)(struct tracing_map_elt *elt); +}; + +extern struct tracing_map * +tracing_map_create(unsigned int map_bits, + unsigned int key_size, + const struct tracing_map_ops *ops, + void *private_data); +extern int tracing_map_init(struct tracing_map *map); + +extern int tracing_map_add_sum_field(struct tracing_map *map); +extern int tracing_map_add_var(struct tracing_map *map); +extern int tracing_map_add_key_field(struct tracing_map *map, + unsigned int offset, + tracing_map_cmp_fn_t cmp_fn); + +extern void tracing_map_destroy(struct tracing_map *map); +extern void tracing_map_clear(struct tracing_map *map); + +extern struct tracing_map_elt * +tracing_map_insert(struct tracing_map *map, void *key); +extern struct tracing_map_elt * +tracing_map_lookup(struct tracing_map *map, void *key); + +extern tracing_map_cmp_fn_t tracing_map_cmp_num(int field_size, + int field_is_signed); +extern int tracing_map_cmp_string(void *val_a, void *val_b); +extern int tracing_map_cmp_none(void *val_a, void *val_b); + +extern void tracing_map_update_sum(struct tracing_map_elt *elt, + unsigned int i, u64 n); +extern void tracing_map_set_var(struct tracing_map_elt *elt, + unsigned int i, u64 n); +extern bool tracing_map_var_set(struct tracing_map_elt *elt, unsigned int i); +extern u64 tracing_map_read_sum(struct tracing_map_elt *elt, unsigned int i); +extern u64 tracing_map_read_var(struct tracing_map_elt *elt, unsigned int i); +extern u64 tracing_map_read_var_once(struct tracing_map_elt *elt, unsigned int i); + +extern int +tracing_map_sort_entries(struct tracing_map *map, + struct tracing_map_sort_key *sort_keys, + unsigned int n_sort_keys, + struct tracing_map_sort_entry ***sort_entries); + +extern void +tracing_map_destroy_sort_entries(struct tracing_map_sort_entry **entries, + unsigned int n_entries); +#endif /* __TRACING_MAP_H */
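Editor's note, for illustration only (not part of the patch above): a minimal sketch of how a client might drive the tracing_map API declared in this header -- create a map, describe its key and sum fields, pre-allocate the element pool, insert and aggregate from the hot path, then take a sorted snapshot. Every 'my_*' name and the single-pid key layout are hypothetical; the sketch assumes the field indices returned by tracing_map_add_key_field()/tracing_map_add_sum_field() are used both as a sort key's 'field_idx' and as the 'i' argument of tracing_map_update_sum()/tracing_map_read_sum(), and error handling is simplified.

/* Hypothetical client code; assumes "tracing_map.h" and the usual kernel headers */
struct my_key {
	int	pid;				/* one numeric key field at offset 0 */
};

static int my_sum_idx;				/* field index from tracing_map_add_sum_field() */

static struct tracing_map *my_map_setup(void)
{
	struct tracing_map *map;
	int key_idx;

	/* 2^TRACING_MAP_BITS_DEFAULT entries, keys are sizeof(struct my_key) bytes */
	map = tracing_map_create(TRACING_MAP_BITS_DEFAULT,
				 sizeof(struct my_key), NULL, NULL);
	if (IS_ERR_OR_NULL(map))
		return NULL;

	key_idx = tracing_map_add_key_field(map, offsetof(struct my_key, pid),
					    tracing_map_cmp_num(sizeof(int), 1));
	my_sum_idx = tracing_map_add_sum_field(map);

	/* Now that the fields are known, pre-allocate the tracing_map_elt pool */
	if (key_idx < 0 || my_sum_idx < 0 || tracing_map_init(map) < 0) {
		tracing_map_destroy(map);
		return NULL;
	}

	return map;
}

/* Hot path: claim (or find) the element for this key and bump its sum */
static void my_map_account(struct tracing_map *map, int pid, u64 bytes)
{
	struct my_key key = { .pid = pid };
	struct tracing_map_elt *elt;

	elt = tracing_map_insert(map, &key);
	if (elt)
		tracing_map_update_sum(elt, my_sum_idx, bytes);
}

/* Reader side: sorted snapshot, largest sum first */
static void my_map_show(struct tracing_map *map)
{
	struct tracing_map_sort_key sort_key = {
		.field_idx	= my_sum_idx,
		.descending	= true,
	};
	struct tracing_map_sort_entry **entries;
	int i, n;

	n = tracing_map_sort_entries(map, &sort_key, 1, &entries);
	for (i = 0; i < n; i++) {
		struct my_key *key = entries[i]->key;

		pr_info("pid %d: %llu bytes\n", key->pid,
			tracing_map_read_sum(entries[i]->elt, my_sum_idx));
	}

	if (n > 0)
		tracing_map_destroy_sort_entries(entries, n);
}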
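In the same illustrative spirit, the optional tracing_map_ops callbacks documented above could be used to hang client-private data off each element via tracing_map_elt->private_data. Again, everything named 'my_*' below is hypothetical:

/* Hypothetical per-element client data */
struct my_elt_data {
	u64	first_seen_ns;
};

static int my_elt_alloc(struct tracing_map_elt *elt)
{
	/* Runs from tracing_map_init(), before tracing begins, so a
	 * sleeping allocation is acceptable here. */
	elt->private_data = kzalloc(sizeof(struct my_elt_data), GFP_KERNEL);

	return elt->private_data ? 0 : -ENOMEM;
}

static void my_elt_free(struct tracing_map_elt *elt)
{
	kfree(elt->private_data);
}

static void my_elt_init(struct tracing_map_elt *elt)
{
	/* Runs when the element is first claimed by tracing_map_insert() */
	struct my_elt_data *data = elt->private_data;

	data->first_seen_ns = ktime_get_ns();
}

static const struct tracing_map_ops my_map_ops = {
	.elt_alloc	= my_elt_alloc,
	.elt_free	= my_elt_free,
	.elt_init	= my_elt_init,
};

The ops structure would then be passed as the third argument of tracing_map_create(), in place of the NULL used in the previous sketch.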