summaryrefslogtreecommitdiffstats
path: root/bin/xcapture.bt
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--bin/xcapture.bt201
1 files changed, 201 insertions, 0 deletions
diff --git a/bin/xcapture.bt b/bin/xcapture.bt
new file mode 100644
index 0000000..a39bca9
--- /dev/null
+++ b/bin/xcapture.bt
@@ -0,0 +1,201 @@
+/*
+ * 0x.Tools xcapture.bt v0.4 - Proof-of-concept prototype for sampling
+ * Linux thread activity using eBPF [0x.tools]
+ *
+ * Copyright 2019-2023 Tanel Poder
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ *
+ */
+
+// This is a PoC prototype for demonstrating feasibility of the custom, programmable
+// task state object populating + sampling approach. This script is not complete and
+// it probably has bugs. I have plenty of improvements in mind. It's not a finished tool or a product.
+//
+// To avoid the extremely slow stack address to symbol resolution in bpftrace, enable
+// symbol caching, for example:
+//
+// sudo BPFTRACE_CACHE_USER_SYMBOLS=1 bpftrace xcapture.bt
+// or
+// sudo BPFTRACE_CACHE_USER_SYMBOLS=1 bpftrace -f json xcapture.bt > out.json
+
+BEGIN { // build the lookup table the probes below index with (curtask->__state & 0xff): state bit -> one-letter code
+ @TASK_STATES[0x00] = "R"; // "(running)"
+ @TASK_STATES[0x01] = "S"; // "(sleeping)"
+ @TASK_STATES[0x02] = "D"; // "(disk sleep)"
+ @TASK_STATES[0x04] = "T"; // "(stopped)"
+ @TASK_STATES[0x08] = "t"; // "(tracing stop)"
+ @TASK_STATES[0x10] = "X"; // "(dead)"
+ @TASK_STATES[0x20] = "Z"; // "(zombie)"
+ @TASK_STATES[0x40] = "P"; // "(parked)"
+ @TASK_STATES[0x80] = "I"; // "(idle)" -- NOTE(review): confirm 0x80 really means idle in the raw __state of the target kernel
+}
+
+
+// Per-thread syscall tracking: on every syscall entry, refresh this thread's rows in the state maps.
+// TODO: ideally/eventually move the pid/uid/gid (and perhaps comm) assignment out of the syscall probe.
+tracepoint:raw_syscalls:sys_enter {
+ // every map is keyed by tid; *in bpftrace* tid is the thread (task) ID, pid is the process (thread group) ID
+ @syscall_id[tid] = args->id;
+ @syscall_args[tid] = (args->args[0], args->args[1], args->args[2], args->args[3], args->args[4], args->args[5]);
+ @syscall_ustack[tid] = ustack();
+ @task_state[tid] = @TASK_STATES[curtask->__state & 0xff];
+ @pid[tid] = pid;
+ @uid[tid] = uid;
+ @gid[tid] = gid;
+ @comm[tid] = comm;
+ @cmdline[tid] = str(uptr(curtask->mm->arg_start));
+}
+
+tracepoint:raw_syscalls:sys_exit {
+ delete(@syscall_id[tid]); // the thread left its syscall: drop the entry (alternative: @syscall_id[tid] = -1)
+}
+
+
+// A thread asks to go off CPU: remember where it was, in both user and kernel space.
+// The caller sets the new task state before calling schedule(), so __state is already updated here.
+kprobe:schedule {
+ @offcpu_kstack[tid] = kstack();
+ @offcpu_ustack[tid] = ustack();
+ @task_state[tid] = @TASK_STATES[curtask->__state & 0xff];
+}
+
+// A thread has been put back on CPU: drop its off-CPU stacks and refresh its state.
+// The * wildcard is needed because newer kernels carry an "isra" variant of this function name.
+kprobe:finish_task_switch* {
+ delete(@offcpu_kstack[tid]);
+ delete(@offcpu_ustack[tid]);
+ @task_state[tid] = @TASK_STATES[curtask->__state & 0xff];
+}
+
+// Sampled profiling of on-CPU threads: at 1 Hz, record the state and the current
+// user/kernel stacks of whichever thread is running on the sampled CPU.
+profile:hz:1 {
+ @profile_kstack[tid] = kstack();
+ @profile_ustack[tid] = ustack();
+ @task_state[tid] = @TASK_STATES[curtask->__state & 0xff];
+}
+
+// Context enrichment example (kernel): flag woken-up tasks, i.e. tasks waiting in the CPU runqueue
+tracepoint:sched:sched_wakeup_new,
+tracepoint:sched:sched_wakeup {
+ @sched_wakeup[args->pid] = 1;
+}
+
+tracepoint:sched:sched_switch {
+ delete(@sched_wakeup[args->next_pid]); // the task being switched in drops its wakeup flag; or: @sched_wakeup[args->next_pid] = -1;
+}
+
+tracepoint:sched:sched_process_exit {
+ // A thread is exiting: remove its row from every per-thread map (args->pid is used as the same key as tid).
+ delete(@pid[args->pid]);            delete(@uid[args->pid]);
+ delete(@gid[args->pid]);            delete(@comm[args->pid]);
+ delete(@cmdline[args->pid]);        delete(@task_state[args->pid]);
+ delete(@syscall_id[args->pid]);     delete(@syscall_args[args->pid]);
+ delete(@syscall_ustack[args->pid]); delete(@sched_wakeup[args->pid]);
+ // Stack and wait-event rows must go too, otherwise exited threads stay in those maps and keep being printed.
+ delete(@offcpu_ustack[args->pid]);  delete(@offcpu_kstack[args->pid]);
+ delete(@profile_ustack[args->pid]); delete(@profile_kstack[args->pid]);
+ delete(@oracle_wait_event[args->pid]);
+}
+
+
+// Context enrichment example (application): Oracle database wait events -- on entry to kskthbwt, record an event name string for this thread
+uprobe:/u01/app/oracle/product/19.0.0/dbhome_1/bin/oracle:kskthbwt {
+ $EVENT_NAME_ARRAY_START=(uint64 *) *uptr(0x600069f0); // uaddr("ksledt_") gave error... so a hard-coded address is read instead; NOTE(review): presumably specific to this Oracle build -- confirm
+ $EVENT_NAME_SLOT_SIZE=(uint64) 56; // sizeof(struct)
+
+ @oracle_wait_event[tid] = str(*uptr($EVENT_NAME_ARRAY_START + ($EVENT_NAME_SLOT_SIZE * arg1)/8)); // arg1 picks the slot; the /8 assumes uint64* addition advances in 8-byte steps -- TODO confirm
+}
+
+uprobe:/u01/app/oracle/product/19.0.0/dbhome_1/bin/oracle:kskthewt {
+ delete(@oracle_wait_event[tid]); // drop the event name recorded by the kskthbwt probe; or: @oracle_wait_event[tid] = -1;
+}
+
+
+// write out SAMPLES of thread states & activity: once per second, dump the sample time followed by each map in full
+// interval is executed on 1 CPU only, so we won't emit duplicates
+interval:hz:1 {
+ @SAMPLE_TIME=strftime("\"%Y-%m-%dT%H:%M:%S.%f\"", nsecs); // extra "" for json output
+ print(@SAMPLE_TIME); // the timestamp is emitted first, ahead of the map dumps below
+
+ print(@pid); // note: @uid and @gid are populated in the sys_enter probe but are not printed here
+ print(@comm);
+ print(@cmdline);
+ print(@task_state);
+ print(@syscall_id);
+ print(@syscall_args);
+ print(@profile_ustack);
+ print(@profile_kstack);
+ print(@syscall_ustack);
+ print(@offcpu_ustack);
+ print(@offcpu_kstack);
+ print(@sched_wakeup);
+ print(@oracle_wait_event);
+}
+
+END { // empty every map this script uses (presumably so bpftrace does not dump them all again on exit)
+ clear(@TASK_STATES);
+ clear(@SAMPLE_TIME);
+ clear(@pid);
+ clear(@uid);
+ clear(@gid);
+ clear(@comm);
+ clear(@cmdline);
+ clear(@task_state);
+ clear(@syscall_id);
+ clear(@syscall_args);
+ clear(@syscall_ustack);
+ clear(@offcpu_ustack);
+ clear(@offcpu_kstack);
+ clear(@profile_ustack);
+ clear(@profile_kstack);
+ clear(@sched_wakeup);
+ clear(@oracle_wait_event);
+}
+
+// TODO:
+// ---------------------------------------------------------------------------------------------
+// There's *plenty* to do! If you know bcc/libbpf and are interested in helping out, ping me :-)
+//
+// Email: tanel@tanelpoder.com
+//
+// PRINTOUT NOTES:
+// ----------------------------------------------------------------------------------------------
+// "Kernel: 5.3 bpftrace supports C style while loops:
+// bpftrace -e 'i:ms:100 { $i = 0; while ($i <= 100) { printf("%d ", $i); $i++} exit(); }'
+// Loops can be short circuited by using the continue and break keywords."
+//
+// Unfortunately bpftrace doesn't (yet?) support iterating through only the existing (populated)
+// elements in hash maps, and we don't want to loop from 1 to pid_max every time we emit
+// output!
+//
+// Thus, we need to use bcc/libbpf for the sampling loops or use bpftool to dump or mount
+// the kernel ebpf maps as files and do our reading / sampling from there.
+//
+// Since we don't want to always emit/print every single task, but would rather have some
+// conditional logic & intelligence of what threads are interesting (e.g. only print R & D states
+// and some specific syscalls under S state), it's better to push this decision logic down to
+// the kernel. This means bcc or more likely libbpf as mentioned above.
+//
+// DATA STRUCTURE NOTES:
+// ----------------------------------------------------------------------------------------------
+// With bcc/libbpf it's likely possible to use a hashmap of structs (or hashmap of maps) for
+// storing each thread's complete state in a single thread state "array", under a single TID key.
+// This should reduce any timing & "read consistency" issues when sampling/emitting records too.
+//
+/* vi:syntax=c */
+/* vi:filetype=c */