Diffstat
-rw-r--r-- | bin/xcapture.bt | 201
1 file changed, 201 insertions, 0 deletions
diff --git a/bin/xcapture.bt b/bin/xcapture.bt
new file mode 100644
index 0000000..a39bca9
--- /dev/null
+++ b/bin/xcapture.bt
@@ -0,0 +1,201 @@
+/*
+ * 0x.Tools xcapture.bt v0.4 - Proof-of-concept prototype for sampling
+ * Linux thread activity using eBPF [0x.tools]
+ *
+ * Copyright 2019-2023 Tanel Poder
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ *
+ */
+
+// This is a PoC prototype for demonstrating feasibility of the custom, programmable
+// task state object populating + sampling approach. This script is not complete and
+// it probably has bugs. I have plenty of improvements in mind. It's not a finished tool or a product.
+//
+// To avoid the extremely slow stack address to symbol resolution in bpftrace, enable
+// symbol caching, for example:
+//
+//   sudo BPFTRACE_CACHE_USER_SYMBOLS=1 bpftrace xcapture.bt
+// or
+//   sudo BPFTRACE_CACHE_USER_SYMBOLS=1 bpftrace -f json xcapture.bt > out.json
+
+BEGIN {
+    @TASK_STATES[0x00] = "R"; // "(running)"
+    @TASK_STATES[0x01] = "S"; // "(sleeping)"
+    @TASK_STATES[0x02] = "D"; // "(disk sleep)"
+    @TASK_STATES[0x04] = "T"; // "(stopped)"
+    @TASK_STATES[0x08] = "t"; // "(tracing stop)"
+    @TASK_STATES[0x10] = "X"; // "(dead)"
+    @TASK_STATES[0x20] = "Z"; // "(zombie)"
+    @TASK_STATES[0x40] = "P"; // "(parked)"
+    @TASK_STATES[0x80] = "I"; // "(idle)"
+}
+
+
+// record system calls by threads into the thread state array
+// ideally/eventually the pid/uid/gid (and perhaps comm) assignments should move out of the syscall probe
+tracepoint:raw_syscalls:sys_enter {
+    // maps keyed on [tid] act as thread-local storage, cleaned out automatically on thread exit
+    @pid           [tid] = pid;   // in bpftrace, tid means thread ID (task ID), pid means process ID (thread group ID)
+    @uid           [tid] = uid;
+    @gid           [tid] = gid;
+    @comm          [tid] = comm;
+    @cmdline       [tid] = str(uptr(curtask->mm->arg_start));
+    @task_state    [tid] = @TASK_STATES[curtask->__state & 0xff];
+    @syscall_id    [tid] = args->id;
+    @syscall_args  [tid] = (args->args[0], args->args[1], args->args[2], args->args[3], args->args[4], args->args[5]);
+    @syscall_ustack[tid] = ustack();
+}
+
+tracepoint:raw_syscalls:sys_exit {
+    delete(@syscall_id[tid]); // or: @syscall_id[tid] = -1;
+}
+
+
+// thread requests going off CPU
+// by the time schedule() is called, the caller has already set the new task state
kprobe:schedule {
+    @task_state   [tid] = @TASK_STATES[curtask->__state & 0xff];
+    @offcpu_ustack[tid] = ustack();
+    @offcpu_kstack[tid] = kstack();
+}
+
+// thread has been put back on CPU
+// newer kernels have the "isra" version of this function name, thus the * wildcard
+kprobe:finish_task_switch* {
+    @task_state[tid] = @TASK_STATES[curtask->__state & 0xff];
+    delete(@offcpu_ustack[tid]);
+    delete(@offcpu_kstack[tid]);
+}
+
+// sampled profiling of on-CPU threads
+// update the stack ids of threads currently running on (any) CPU
+profile:hz:1 {
+    @task_state    [tid] = @TASK_STATES[curtask->__state & 0xff];
+    @profile_ustack[tid] = ustack();
+    @profile_kstack[tid] = kstack();
+}
+
+// Context enrichment example (kernel): tasks waiting in the CPU runqueue
+tracepoint:sched:sched_wakeup,
+tracepoint:sched:sched_wakeup_new {
+    @sched_wakeup[args->pid] = 1;
+}
+
+tracepoint:sched:sched_switch {
+    delete(@sched_wakeup[args->next_pid]); // or: @sched_wakeup[args->next_pid] = -1;
+}
+
+tracepoint:sched:sched_process_exit {
+    delete(@pid           [args->pid]);
+    delete(@uid           [args->pid]);
+    delete(@gid           [args->pid]);
+    delete(@comm          [args->pid]);
+    delete(@cmdline       [args->pid]);
+    delete(@task_state    [args->pid]);
+    delete(@syscall_id    [args->pid]);
+    delete(@syscall_args  [args->pid]);
+    delete(@syscall_ustack[args->pid]);
+    delete(@sched_wakeup  [args->pid]);
+}
+
+
+// Context enrichment example (application): Oracle database wait events
+uprobe:/u01/app/oracle/product/19.0.0/dbhome_1/bin/oracle:kskthbwt {
+    $EVENT_NAME_ARRAY_START = (uint64 *) *uptr(0x600069f0); // uaddr("ksledt_") gave an error...
+    $EVENT_NAME_SLOT_SIZE   = (uint64) 56;                  // sizeof(struct)
+
+    @oracle_wait_event[tid] = str(*uptr($EVENT_NAME_ARRAY_START + ($EVENT_NAME_SLOT_SIZE * arg1)/8));
+}
+
+uprobe:/u01/app/oracle/product/19.0.0/dbhome_1/bin/oracle:kskthewt {
+    delete(@oracle_wait_event[tid]); // or: @oracle_wait_event[tid] = -1;
+}
+
+
+// write out SAMPLES of thread states & activity
+// the interval probe executes on 1 CPU only, so we won't emit duplicates
+interval:hz:1 {
+    @SAMPLE_TIME = strftime("\"%Y-%m-%dT%H:%M:%S.%f\"", nsecs); // extra "" for json output
+    print(@SAMPLE_TIME);
+
+    print(@pid);
+    print(@comm);
+    print(@cmdline);
+    print(@task_state);
+    print(@syscall_id);
+    print(@syscall_args);
+    print(@profile_ustack);
+    print(@profile_kstack);
+    print(@syscall_ustack);
+    print(@offcpu_ustack);
+    print(@offcpu_kstack);
+    print(@sched_wakeup);
+    print(@oracle_wait_event);
+}
+
+END {
+    clear(@SAMPLE_TIME);
+    clear(@TASK_STATES);
+    clear(@pid);
+    clear(@uid);
+    clear(@gid);
+    clear(@comm);
+    clear(@cmdline);
+    clear(@profile_ustack);
+    clear(@profile_kstack);
+    clear(@syscall_ustack);
+    clear(@offcpu_ustack);
+    clear(@offcpu_kstack);
+    clear(@sched_wakeup);
+    clear(@syscall_id);
+    clear(@syscall_args);
+    clear(@task_state);
+    clear(@oracle_wait_event);
+}
+
+// TODO:
+// ---------------------------------------------------------------------------------------------
+// There's *plenty* to do! If you know bcc/libbpf and are interested in helping out, ping me :-)
+//
+// Email: tanel@tanelpoder.com
+//
+// PRINTOUT NOTES:
+// ----------------------------------------------------------------------------------------------
+// "Kernel: 5.3  bpftrace supports C style while loops:
+//    bpftrace -e 'i:ms:100 { $i = 0; while ($i <= 100) { printf("%d ", $i); $i++} exit(); }'
+//  Loops can be short circuited by using the continue and break keywords."
+//
+// Unfortunately bpftrace doesn't (yet?) support iterating through only the existing (populated)
+// elements in hash maps, and we don't want to loop from 1 to pid_max every time we emit output!
+//
+// Thus, we need to use bcc/libbpf for the sampling loops, or use bpftool to dump or mount
+// the kernel eBPF maps as files and do our reading / sampling from there.
+//
+// Since we don't want to always emit/print every single task, but would rather have some
+// conditional logic & intelligence about which threads are interesting (a la only print R & D states
+// and some specific syscalls under S state), it's better to push this decision logic down into the
+// kernel. This means bcc or, more likely, libbpf as mentioned above.
+//
+// DATA STRUCTURE NOTES:
+// ----------------------------------------------------------------------------------------------
+// With bcc/libbpf it's likely possible to use a hashmap of structs (or a hashmap of maps) for
+// storing each thread's complete state in a single thread state "array", under a single TID key.
+// This should reduce any timing & "read consistency" issues when sampling/emitting records too.
+//
+/* vi:syntax=c */
+/* vi:filetype=c */
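
Editor's note: to make the DATA STRUCTURE NOTES concrete, here is a minimal, hypothetical libbpf (CO-RE style) kernel-side sketch of the "hashmap of structs keyed by TID" idea. It mirrors only the raw_syscalls:sys_enter probe of the script above; the names task_sample, task_states and handle_sys_enter are made up for illustration, and it assumes a vmlinux.h generated with "bpftool btf dump file /sys/kernel/btf/vmlinux format c" plus libbpf's bpf_helpers.h.

// Hypothetical libbpf sketch: one struct value per thread, keyed by TID,
// updated on syscall entry much like tracepoint:raw_syscalls:sys_enter above.
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

struct task_sample {
    __u32 pid;                 /* thread group id ("pid" in bpftrace terms) */
    __u32 tid;                 /* thread id ("tid" in bpftrace terms) */
    __u32 uid;
    char  comm[16];
    __s64 syscall_id;
    __u64 syscall_args[6];
};

struct {
    __uint(type, BPF_MAP_TYPE_HASH);
    __uint(max_entries, 65536); /* roughly pid_max sized; tune as needed */
    __type(key, __u32);         /* TID */
    __type(value, struct task_sample);
} task_states SEC(".maps");

SEC("tracepoint/raw_syscalls/sys_enter")
int handle_sys_enter(struct trace_event_raw_sys_enter *ctx)
{
    struct task_sample s = {};
    __u64 id = bpf_get_current_pid_tgid();

    s.tid = (__u32)id;
    s.pid = id >> 32;
    s.uid = (__u32)bpf_get_current_uid_gid();
    bpf_get_current_comm(&s.comm, sizeof(s.comm));
    s.syscall_id = ctx->id;
    for (int i = 0; i < 6; i++)
        s.syscall_args[i] = ctx->args[i];

    /* the whole thread state lands in one map slot, under a single TID key */
    bpf_map_update_elem(&task_states, &s.tid, &s, BPF_ANY);
    return 0;
}

char LICENSE[] SEC("license") = "GPL";

Keeping the whole record in one map value is also what should reduce the cross-map timing / "read consistency" issues mentioned above: a sampler either sees the complete struct for a TID or nothing.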
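
And a correspondingly hypothetical user-space sampling loop over such a map, showing the "iterate only the populated entries" capability that the PRINTOUT NOTES say is missing from bpftrace. It assumes the loader pinned the map at /sys/fs/bpf/task_states; the pin path and struct layout are illustrative, not part of xcapture.bt.

// Hypothetical user-space sampler: walks only existing map entries once per second.
#include <bpf/bpf.h>
#include <stdio.h>
#include <stdint.h>
#include <unistd.h>

struct task_sample {
    uint32_t pid;
    uint32_t tid;
    uint32_t uid;
    char     comm[16];
    int64_t  syscall_id;
    uint64_t syscall_args[6];
};

int main(void)
{
    int map_fd = bpf_obj_get("/sys/fs/bpf/task_states"); /* illustrative pin path */
    if (map_fd < 0) {
        perror("bpf_obj_get");
        return 1;
    }

    for (;;) {
        uint32_t key, next_key;
        uint32_t *prev = NULL;   /* NULL prev key -> kernel returns the first key */
        struct task_sample s;

        /* iterate only existing (populated) TIDs, not 1..pid_max */
        while (bpf_map_get_next_key(map_fd, prev, &next_key) == 0) {
            if (bpf_map_lookup_elem(map_fd, &next_key, &s) == 0)
                printf("tid=%u pid=%u comm=%s syscall=%lld\n",
                       s.tid, s.pid, s.comm, (long long)s.syscall_id);
            key  = next_key;
            prev = &key;
        }
        sleep(1);
    }
    return 0;
}

The same pinned map could also be inspected ad hoc with bpftool (e.g. "bpftool map dump pinned /sys/fs/bpf/task_states"), matching the "use bpftool to dump the kernel eBPF maps as files" option mentioned in the notes.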