Diffstat (limited to 'tools/perf/util/bpf_skel')
-rw-r--r--   tools/perf/util/bpf_skel/.gitignore                |   3
-rw-r--r--   tools/perf/util/bpf_skel/bperf_cgroup.bpf.c        | 227
-rw-r--r--   tools/perf/util/bpf_skel/bperf_follower.bpf.c      |  78
-rw-r--r--   tools/perf/util/bpf_skel/bperf_leader.bpf.c        |  55
-rw-r--r--   tools/perf/util/bpf_skel/bperf_u.h                 |  14
-rw-r--r--   tools/perf/util/bpf_skel/bpf_prog_profiler.bpf.c   |  92
-rw-r--r--   tools/perf/util/bpf_skel/func_latency.bpf.c        | 116
-rw-r--r--   tools/perf/util/bpf_skel/kwork_trace.bpf.c         | 383
-rw-r--r--   tools/perf/util/bpf_skel/lock_contention.bpf.c     | 176
-rw-r--r--   tools/perf/util/bpf_skel/off_cpu.bpf.c             | 283
10 files changed, 1427 insertions(+), 0 deletions(-)
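Editor's note: the sources below are compiled with clang and post-processed by bpftool into generated *.skel.h headers (which is why the new .gitignore hides them). As a rough orientation, here is a minimal userspace sketch of how such a skeleton is typically consumed with libbpf, using bperf_leader.bpf.c as the example. This is not part of the patch and not perf's actual code: the bperf_leader_bpf__* names are only what "bpftool gen skeleton" would derive from the file name, and the real tool additionally creates the perf events and stores their fds in the "events" map before load, which is omitted here.

// SPDX-License-Identifier: GPL-2.0
/* Illustrative only: read the per-cpu delta counters published by
 * bperf_leader.bpf.c.  Without perf event fds in the "events" map the
 * BPF program's bpf_perf_event_read_value() fails and the deltas stay 0. */
#include <stdio.h>
#include <bpf/bpf.h>
#include <bpf/libbpf.h>
#include "bperf_leader.skel.h"	/* hypothetical generated skeleton */

static int read_deltas(int map_fd)
{
	int ncpus = libbpf_num_possible_cpus();
	__u32 key = 0;		/* the per-cpu arrays have a single slot */
	__u64 counter = 0;

	if (ncpus <= 0)
		return -1;

	struct bpf_perf_event_value vals[ncpus];

	/* a userspace lookup in a BPF_MAP_TYPE_PERCPU_ARRAY returns one
	 * value per possible CPU */
	if (bpf_map_lookup_elem(map_fd, &key, vals))
		return -1;

	for (int i = 0; i < ncpus; i++)
		counter += vals[i].counter;

	printf("counter delta: %llu\n", (unsigned long long)counter);
	return 0;
}

int main(void)
{
	struct bperf_leader_bpf *skel;
	int err;

	skel = bperf_leader_bpf__open();
	if (!skel)
		return 1;

	/* the "events" map carries no max_entries in the BPF source; the
	 * loader sizes it (and fills in perf event fds) before load */
	bpf_map__set_max_entries(skel->maps.events, libbpf_num_possible_cpus());

	err = bperf_leader_bpf__load(skel);
	if (!err)
		err = bperf_leader_bpf__attach(skel);
	if (!err)
		read_deltas(bpf_map__fd(skel->maps.diff_readings));

	bperf_leader_bpf__destroy(skel);
	return err ? 1 : 0;
}

In the real design the leader only snapshots per-cpu deltas on sched_switch; a follower program (bperf_follower.bpf.c below) adds them into accum_readings keyed by the chosen filter.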
diff --git a/tools/perf/util/bpf_skel/.gitignore b/tools/perf/util/bpf_skel/.gitignore
new file mode 100644
index 000000000..5263e9e6c
--- /dev/null
+++ b/tools/perf/util/bpf_skel/.gitignore
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
+.tmp
+*.skel.h
\ No newline at end of file diff --git a/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c b/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c new file mode 100644 index 000000000..6a438e010 --- /dev/null +++ b/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c @@ -0,0 +1,227 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +// Copyright (c) 2021 Facebook +// Copyright (c) 2021 Google +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include <bpf/bpf_core_read.h> + +#define MAX_LEVELS 10 // max cgroup hierarchy level: arbitrary +#define MAX_EVENTS 32 // max events per cgroup: arbitrary + +// NOTE: many of map and global data will be modified before loading +// from the userspace (perf tool) using the skeleton helpers. + +// single set of global perf events to measure +struct { + __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(int)); + __uint(max_entries, 1); +} events SEC(".maps"); + +// from cgroup id to event index +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(key_size, sizeof(__u64)); + __uint(value_size, sizeof(__u32)); + __uint(max_entries, 1); +} cgrp_idx SEC(".maps"); + +// per-cpu event snapshots to calculate delta +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(struct bpf_perf_event_value)); +} prev_readings SEC(".maps"); + +// aggregated event values for each cgroup (per-cpu) +// will be read from the user-space +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(struct bpf_perf_event_value)); +} cgrp_readings SEC(".maps"); + +/* new kernel cgroup definition */ +struct cgroup___new { + int level; + struct cgroup *ancestors[]; +} __attribute__((preserve_access_index)); + +/* old kernel cgroup definition */ +struct cgroup___old { + int level; + u64 ancestor_ids[]; +} __attribute__((preserve_access_index)); + +const volatile __u32 num_events = 1; +const volatile __u32 num_cpus = 1; + +int enabled = 0; +int use_cgroup_v2 = 0; +int perf_subsys_id = -1; + +static inline __u64 get_cgroup_v1_ancestor_id(struct cgroup *cgrp, int level) +{ + /* recast pointer to capture new type for compiler */ + struct cgroup___new *cgrp_new = (void *)cgrp; + + if (bpf_core_field_exists(cgrp_new->ancestors)) { + return BPF_CORE_READ(cgrp_new, ancestors[level], kn, id); + } else { + /* recast pointer to capture old type for compiler */ + struct cgroup___old *cgrp_old = (void *)cgrp; + + return BPF_CORE_READ(cgrp_old, ancestor_ids[level]); + } +} + +static inline int get_cgroup_v1_idx(__u32 *cgrps, int size) +{ + struct task_struct *p = (void *)bpf_get_current_task(); + struct cgroup *cgrp; + register int i = 0; + __u32 *elem; + int level; + int cnt; + + if (perf_subsys_id == -1) { +#if __has_builtin(__builtin_preserve_enum_value) + perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id, + perf_event_cgrp_id); +#else + perf_subsys_id = perf_event_cgrp_id; +#endif + } + cgrp = BPF_CORE_READ(p, cgroups, subsys[perf_subsys_id], cgroup); + level = BPF_CORE_READ(cgrp, level); + + for (cnt = 0; i < MAX_LEVELS; i++) { + __u64 cgrp_id; + + if (i > level) + break; + + // convert cgroup-id to a map index + cgrp_id = get_cgroup_v1_ancestor_id(cgrp, i); + elem = bpf_map_lookup_elem(&cgrp_idx, &cgrp_id); + if (!elem) + continue; + + cgrps[cnt++] = *elem; + if (cnt == size) + break; + } + + return cnt; +} + +static inline int get_cgroup_v2_idx(__u32 *cgrps, int size) +{ + register int i = 0; 
+ __u32 *elem; + int cnt; + + for (cnt = 0; i < MAX_LEVELS; i++) { + __u64 cgrp_id = bpf_get_current_ancestor_cgroup_id(i); + + if (cgrp_id == 0) + break; + + // convert cgroup-id to a map index + elem = bpf_map_lookup_elem(&cgrp_idx, &cgrp_id); + if (!elem) + continue; + + cgrps[cnt++] = *elem; + if (cnt == size) + break; + } + + return cnt; +} + +static int bperf_cgroup_count(void) +{ + register __u32 idx = 0; // to have it in a register to pass BPF verifier + register int c = 0; + struct bpf_perf_event_value val, delta, *prev_val, *cgrp_val; + __u32 cpu = bpf_get_smp_processor_id(); + __u32 cgrp_idx[MAX_LEVELS]; + int cgrp_cnt; + __u32 key, cgrp; + long err; + + if (use_cgroup_v2) + cgrp_cnt = get_cgroup_v2_idx(cgrp_idx, MAX_LEVELS); + else + cgrp_cnt = get_cgroup_v1_idx(cgrp_idx, MAX_LEVELS); + + for ( ; idx < MAX_EVENTS; idx++) { + if (idx == num_events) + break; + + // XXX: do not pass idx directly (for verifier) + key = idx; + // this is per-cpu array for diff + prev_val = bpf_map_lookup_elem(&prev_readings, &key); + if (!prev_val) { + val.counter = val.enabled = val.running = 0; + bpf_map_update_elem(&prev_readings, &key, &val, BPF_ANY); + + prev_val = bpf_map_lookup_elem(&prev_readings, &key); + if (!prev_val) + continue; + } + + // read from global perf_event array + key = idx * num_cpus + cpu; + err = bpf_perf_event_read_value(&events, key, &val, sizeof(val)); + if (err) + continue; + + if (enabled) { + delta.counter = val.counter - prev_val->counter; + delta.enabled = val.enabled - prev_val->enabled; + delta.running = val.running - prev_val->running; + + for (c = 0; c < MAX_LEVELS; c++) { + if (c == cgrp_cnt) + break; + + cgrp = cgrp_idx[c]; + + // aggregate the result by cgroup + key = cgrp * num_events + idx; + cgrp_val = bpf_map_lookup_elem(&cgrp_readings, &key); + if (cgrp_val) { + cgrp_val->counter += delta.counter; + cgrp_val->enabled += delta.enabled; + cgrp_val->running += delta.running; + } else { + bpf_map_update_elem(&cgrp_readings, &key, + &delta, BPF_ANY); + } + } + } + + *prev_val = val; + } + return 0; +} + +// This will be attached to cgroup-switches event for each cpu +SEC("perf_event") +int BPF_PROG(on_cgrp_switch) +{ + return bperf_cgroup_count(); +} + +SEC("raw_tp/sched_switch") +int BPF_PROG(trigger_read) +{ + return bperf_cgroup_count(); +} + +char LICENSE[] SEC("license") = "Dual BSD/GPL"; diff --git a/tools/perf/util/bpf_skel/bperf_follower.bpf.c b/tools/perf/util/bpf_skel/bperf_follower.bpf.c new file mode 100644 index 000000000..f19399853 --- /dev/null +++ b/tools/perf/util/bpf_skel/bperf_follower.bpf.c @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +// Copyright (c) 2021 Facebook +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include "bperf_u.h" + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(struct bpf_perf_event_value)); + __uint(max_entries, 1); +} diff_readings SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(struct bpf_perf_event_value)); + __uint(max_entries, 1); +} accum_readings SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u32)); +} filter SEC(".maps"); + +enum bperf_filter_type type = 0; +int enabled = 0; + +SEC("fexit/XXX") +int BPF_PROG(fexit_XXX) +{ + struct bpf_perf_event_value *diff_val, *accum_val; + __u32 filter_key, zero = 0; + __u32 
*accum_key; + + if (!enabled) + return 0; + + switch (type) { + case BPERF_FILTER_GLOBAL: + accum_key = &zero; + goto do_add; + case BPERF_FILTER_CPU: + filter_key = bpf_get_smp_processor_id(); + break; + case BPERF_FILTER_PID: + filter_key = bpf_get_current_pid_tgid() & 0xffffffff; + break; + case BPERF_FILTER_TGID: + filter_key = bpf_get_current_pid_tgid() >> 32; + break; + default: + return 0; + } + + accum_key = bpf_map_lookup_elem(&filter, &filter_key); + if (!accum_key) + return 0; + +do_add: + diff_val = bpf_map_lookup_elem(&diff_readings, &zero); + if (!diff_val) + return 0; + + accum_val = bpf_map_lookup_elem(&accum_readings, accum_key); + if (!accum_val) + return 0; + + accum_val->counter += diff_val->counter; + accum_val->enabled += diff_val->enabled; + accum_val->running += diff_val->running; + + return 0; +} + +char LICENSE[] SEC("license") = "Dual BSD/GPL"; diff --git a/tools/perf/util/bpf_skel/bperf_leader.bpf.c b/tools/perf/util/bpf_skel/bperf_leader.bpf.c new file mode 100644 index 000000000..e2a2d4cd7 --- /dev/null +++ b/tools/perf/util/bpf_skel/bperf_leader.bpf.c @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +// Copyright (c) 2021 Facebook +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +struct { + __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(int)); + __uint(map_flags, BPF_F_PRESERVE_ELEMS); +} events SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(struct bpf_perf_event_value)); + __uint(max_entries, 1); +} prev_readings SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(struct bpf_perf_event_value)); + __uint(max_entries, 1); +} diff_readings SEC(".maps"); + +SEC("raw_tp/sched_switch") +int BPF_PROG(on_switch) +{ + struct bpf_perf_event_value val, *prev_val, *diff_val; + __u32 key = bpf_get_smp_processor_id(); + __u32 zero = 0; + long err; + + prev_val = bpf_map_lookup_elem(&prev_readings, &zero); + if (!prev_val) + return 0; + + diff_val = bpf_map_lookup_elem(&diff_readings, &zero); + if (!diff_val) + return 0; + + err = bpf_perf_event_read_value(&events, key, &val, sizeof(val)); + if (err) + return 0; + + diff_val->counter = val.counter - prev_val->counter; + diff_val->enabled = val.enabled - prev_val->enabled; + diff_val->running = val.running - prev_val->running; + *prev_val = val; + return 0; +} + +char LICENSE[] SEC("license") = "Dual BSD/GPL"; diff --git a/tools/perf/util/bpf_skel/bperf_u.h b/tools/perf/util/bpf_skel/bperf_u.h new file mode 100644 index 000000000..1ce0c2c90 --- /dev/null +++ b/tools/perf/util/bpf_skel/bperf_u.h @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +// Copyright (c) 2021 Facebook + +#ifndef __BPERF_STAT_U_H +#define __BPERF_STAT_U_H + +enum bperf_filter_type { + BPERF_FILTER_GLOBAL = 1, + BPERF_FILTER_CPU, + BPERF_FILTER_PID, + BPERF_FILTER_TGID, +}; + +#endif /* __BPERF_STAT_U_H */ diff --git a/tools/perf/util/bpf_skel/bpf_prog_profiler.bpf.c b/tools/perf/util/bpf_skel/bpf_prog_profiler.bpf.c new file mode 100644 index 000000000..97037d3b3 --- /dev/null +++ b/tools/perf/util/bpf_skel/bpf_prog_profiler.bpf.c @@ -0,0 +1,92 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +// Copyright (c) 2020 Facebook +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +/* map of perf event 
fds, num_cpu * num_metric entries */ +struct { + __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(int)); +} events SEC(".maps"); + +/* readings at fentry */ +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(struct bpf_perf_event_value)); + __uint(max_entries, 1); +} fentry_readings SEC(".maps"); + +/* accumulated readings */ +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(struct bpf_perf_event_value)); + __uint(max_entries, 1); +} accum_readings SEC(".maps"); + +const volatile __u32 num_cpu = 1; + +SEC("fentry/XXX") +int BPF_PROG(fentry_XXX) +{ + __u32 key = bpf_get_smp_processor_id(); + struct bpf_perf_event_value *ptr; + __u32 zero = 0; + long err; + + /* look up before reading, to reduce error */ + ptr = bpf_map_lookup_elem(&fentry_readings, &zero); + if (!ptr) + return 0; + + err = bpf_perf_event_read_value(&events, key, ptr, sizeof(*ptr)); + if (err) + return 0; + + return 0; +} + +static inline void +fexit_update_maps(struct bpf_perf_event_value *after) +{ + struct bpf_perf_event_value *before, diff; + __u32 zero = 0; + + before = bpf_map_lookup_elem(&fentry_readings, &zero); + /* only account samples with a valid fentry_reading */ + if (before && before->counter) { + struct bpf_perf_event_value *accum; + + diff.counter = after->counter - before->counter; + diff.enabled = after->enabled - before->enabled; + diff.running = after->running - before->running; + + accum = bpf_map_lookup_elem(&accum_readings, &zero); + if (accum) { + accum->counter += diff.counter; + accum->enabled += diff.enabled; + accum->running += diff.running; + } + } +} + +SEC("fexit/XXX") +int BPF_PROG(fexit_XXX) +{ + struct bpf_perf_event_value reading; + __u32 cpu = bpf_get_smp_processor_id(); + int err; + + /* read all events before updating the maps, to reduce error */ + err = bpf_perf_event_read_value(&events, cpu, &reading, sizeof(reading)); + if (err) + return 0; + + fexit_update_maps(&reading); + return 0; +} + +char LICENSE[] SEC("license") = "Dual BSD/GPL"; diff --git a/tools/perf/util/bpf_skel/func_latency.bpf.c b/tools/perf/util/bpf_skel/func_latency.bpf.c new file mode 100644 index 000000000..9d01e3af7 --- /dev/null +++ b/tools/perf/util/bpf_skel/func_latency.bpf.c @@ -0,0 +1,116 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +// Copyright (c) 2021 Google +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +// This should be in sync with "util/ftrace.h" +#define NUM_BUCKET 22 + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(key_size, sizeof(__u64)); + __uint(value_size, sizeof(__u64)); + __uint(max_entries, 10000); +} functime SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u8)); + __uint(max_entries, 1); +} cpu_filter SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u8)); + __uint(max_entries, 1); +} task_filter SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u64)); + __uint(max_entries, NUM_BUCKET); +} latency SEC(".maps"); + + +int enabled = 0; +int has_cpu = 0; +int has_task = 0; +int use_nsec = 0; + +SEC("kprobe/func") +int BPF_PROG(func_begin) +{ + __u64 key, now; + + if (!enabled) + return 0; + + key = 
bpf_get_current_pid_tgid(); + + if (has_cpu) { + __u32 cpu = bpf_get_smp_processor_id(); + __u8 *ok; + + ok = bpf_map_lookup_elem(&cpu_filter, &cpu); + if (!ok) + return 0; + } + + if (has_task) { + __u32 pid = key & 0xffffffff; + __u8 *ok; + + ok = bpf_map_lookup_elem(&task_filter, &pid); + if (!ok) + return 0; + } + + now = bpf_ktime_get_ns(); + + // overwrite timestamp for nested functions + bpf_map_update_elem(&functime, &key, &now, BPF_ANY); + return 0; +} + +SEC("kretprobe/func") +int BPF_PROG(func_end) +{ + __u64 tid; + __u64 *start; + __u64 cmp_base = use_nsec ? 1 : 1000; + + if (!enabled) + return 0; + + tid = bpf_get_current_pid_tgid(); + + start = bpf_map_lookup_elem(&functime, &tid); + if (start) { + __s64 delta = bpf_ktime_get_ns() - *start; + __u32 key; + __u64 *hist; + + bpf_map_delete_elem(&functime, &tid); + + if (delta < 0) + return 0; + + // calculate index using delta + for (key = 0; key < (NUM_BUCKET - 1); key++) { + if (delta < (cmp_base << key)) + break; + } + + hist = bpf_map_lookup_elem(&latency, &key); + if (!hist) + return 0; + + *hist += 1; + } + + return 0; +} diff --git a/tools/perf/util/bpf_skel/kwork_trace.bpf.c b/tools/perf/util/bpf_skel/kwork_trace.bpf.c new file mode 100644 index 000000000..063c124e0 --- /dev/null +++ b/tools/perf/util/bpf_skel/kwork_trace.bpf.c @@ -0,0 +1,383 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +// Copyright (c) 2022, Huawei + +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +#define KWORK_COUNT 100 +#define MAX_KWORKNAME 128 + +/* + * This should be in sync with "util/kwork.h" + */ +enum kwork_class_type { + KWORK_CLASS_IRQ, + KWORK_CLASS_SOFTIRQ, + KWORK_CLASS_WORKQUEUE, + KWORK_CLASS_MAX, +}; + +struct work_key { + __u32 type; + __u32 cpu; + __u64 id; +}; + +struct report_data { + __u64 nr; + __u64 total_time; + __u64 max_time; + __u64 max_time_start; + __u64 max_time_end; +}; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(key_size, sizeof(struct work_key)); + __uint(value_size, MAX_KWORKNAME); + __uint(max_entries, KWORK_COUNT); +} perf_kwork_names SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(key_size, sizeof(struct work_key)); + __uint(value_size, sizeof(__u64)); + __uint(max_entries, KWORK_COUNT); +} perf_kwork_time SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(key_size, sizeof(struct work_key)); + __uint(value_size, sizeof(struct report_data)); + __uint(max_entries, KWORK_COUNT); +} perf_kwork_report SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u8)); + __uint(max_entries, 1); +} perf_kwork_cpu_filter SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(key_size, sizeof(__u32)); + __uint(value_size, MAX_KWORKNAME); + __uint(max_entries, 1); +} perf_kwork_name_filter SEC(".maps"); + +int enabled = 0; +int has_cpu_filter = 0; +int has_name_filter = 0; + +static __always_inline int local_strncmp(const char *s1, + unsigned int sz, const char *s2) +{ + int ret = 0; + unsigned int i; + + for (i = 0; i < sz; i++) { + ret = (unsigned char)s1[i] - (unsigned char)s2[i]; + if (ret || !s1[i] || !s2[i]) + break; + } + + return ret; +} + +static __always_inline int trace_event_match(struct work_key *key, char *name) +{ + __u8 *cpu_val; + char *name_val; + __u32 zero = 0; + __u32 cpu = bpf_get_smp_processor_id(); + + if (!enabled) + return 0; + + if (has_cpu_filter) { + cpu_val = bpf_map_lookup_elem(&perf_kwork_cpu_filter, &cpu); + 
if (!cpu_val) + return 0; + } + + if (has_name_filter && (name != NULL)) { + name_val = bpf_map_lookup_elem(&perf_kwork_name_filter, &zero); + if (name_val && + (local_strncmp(name_val, MAX_KWORKNAME, name) != 0)) { + return 0; + } + } + + return 1; +} + +static __always_inline void do_update_time(void *map, struct work_key *key, + __u64 time_start, __u64 time_end) +{ + struct report_data zero, *data; + __s64 delta = time_end - time_start; + + if (delta < 0) + return; + + data = bpf_map_lookup_elem(map, key); + if (!data) { + __builtin_memset(&zero, 0, sizeof(zero)); + bpf_map_update_elem(map, key, &zero, BPF_NOEXIST); + data = bpf_map_lookup_elem(map, key); + if (!data) + return; + } + + if ((delta > data->max_time) || + (data->max_time == 0)) { + data->max_time = delta; + data->max_time_start = time_start; + data->max_time_end = time_end; + } + + data->total_time += delta; + data->nr++; +} + +static __always_inline void do_update_timestart(void *map, struct work_key *key) +{ + __u64 ts = bpf_ktime_get_ns(); + + bpf_map_update_elem(map, key, &ts, BPF_ANY); +} + +static __always_inline void do_update_timeend(void *report_map, void *time_map, + struct work_key *key) +{ + __u64 *time = bpf_map_lookup_elem(time_map, key); + + if (time) { + bpf_map_delete_elem(time_map, key); + do_update_time(report_map, key, *time, bpf_ktime_get_ns()); + } +} + +static __always_inline void do_update_name(void *map, + struct work_key *key, char *name) +{ + if (!bpf_map_lookup_elem(map, key)) + bpf_map_update_elem(map, key, name, BPF_ANY); +} + +static __always_inline int update_timestart(void *map, struct work_key *key) +{ + if (!trace_event_match(key, NULL)) + return 0; + + do_update_timestart(map, key); + return 0; +} + +static __always_inline int update_timestart_and_name(void *time_map, + void *names_map, + struct work_key *key, + char *name) +{ + if (!trace_event_match(key, name)) + return 0; + + do_update_timestart(time_map, key); + do_update_name(names_map, key, name); + + return 0; +} + +static __always_inline int update_timeend(void *report_map, + void *time_map, struct work_key *key) +{ + if (!trace_event_match(key, NULL)) + return 0; + + do_update_timeend(report_map, time_map, key); + + return 0; +} + +static __always_inline int update_timeend_and_name(void *report_map, + void *time_map, + void *names_map, + struct work_key *key, + char *name) +{ + if (!trace_event_match(key, name)) + return 0; + + do_update_timeend(report_map, time_map, key); + do_update_name(names_map, key, name); + + return 0; +} + +SEC("tracepoint/irq/irq_handler_entry") +int report_irq_handler_entry(struct trace_event_raw_irq_handler_entry *ctx) +{ + char name[MAX_KWORKNAME]; + struct work_key key = { + .type = KWORK_CLASS_IRQ, + .cpu = bpf_get_smp_processor_id(), + .id = (__u64)ctx->irq, + }; + void *name_addr = (void *)ctx + (ctx->__data_loc_name & 0xffff); + + bpf_probe_read_kernel_str(name, sizeof(name), name_addr); + + return update_timestart_and_name(&perf_kwork_time, + &perf_kwork_names, &key, name); +} + +SEC("tracepoint/irq/irq_handler_exit") +int report_irq_handler_exit(struct trace_event_raw_irq_handler_exit *ctx) +{ + struct work_key key = { + .type = KWORK_CLASS_IRQ, + .cpu = bpf_get_smp_processor_id(), + .id = (__u64)ctx->irq, + }; + + return update_timeend(&perf_kwork_report, &perf_kwork_time, &key); +} + +static char softirq_name_list[NR_SOFTIRQS][MAX_KWORKNAME] = { + { "HI" }, + { "TIMER" }, + { "NET_TX" }, + { "NET_RX" }, + { "BLOCK" }, + { "IRQ_POLL" }, + { "TASKLET" }, + { "SCHED" }, + { "HRTIMER" }, + { 
"RCU" }, +}; + +SEC("tracepoint/irq/softirq_entry") +int report_softirq_entry(struct trace_event_raw_softirq *ctx) +{ + unsigned int vec = ctx->vec; + struct work_key key = { + .type = KWORK_CLASS_SOFTIRQ, + .cpu = bpf_get_smp_processor_id(), + .id = (__u64)vec, + }; + + if (vec < NR_SOFTIRQS) { + return update_timestart_and_name(&perf_kwork_time, + &perf_kwork_names, &key, + softirq_name_list[vec]); + } + + return 0; +} + +SEC("tracepoint/irq/softirq_exit") +int report_softirq_exit(struct trace_event_raw_softirq *ctx) +{ + struct work_key key = { + .type = KWORK_CLASS_SOFTIRQ, + .cpu = bpf_get_smp_processor_id(), + .id = (__u64)ctx->vec, + }; + + return update_timeend(&perf_kwork_report, &perf_kwork_time, &key); +} + +SEC("tracepoint/irq/softirq_raise") +int latency_softirq_raise(struct trace_event_raw_softirq *ctx) +{ + unsigned int vec = ctx->vec; + struct work_key key = { + .type = KWORK_CLASS_SOFTIRQ, + .cpu = bpf_get_smp_processor_id(), + .id = (__u64)vec, + }; + + if (vec < NR_SOFTIRQS) { + return update_timestart_and_name(&perf_kwork_time, + &perf_kwork_names, &key, + softirq_name_list[vec]); + } + + return 0; +} + +SEC("tracepoint/irq/softirq_entry") +int latency_softirq_entry(struct trace_event_raw_softirq *ctx) +{ + struct work_key key = { + .type = KWORK_CLASS_SOFTIRQ, + .cpu = bpf_get_smp_processor_id(), + .id = (__u64)ctx->vec, + }; + + return update_timeend(&perf_kwork_report, &perf_kwork_time, &key); +} + +SEC("tracepoint/workqueue/workqueue_execute_start") +int report_workqueue_execute_start(struct trace_event_raw_workqueue_execute_start *ctx) +{ + struct work_key key = { + .type = KWORK_CLASS_WORKQUEUE, + .cpu = bpf_get_smp_processor_id(), + .id = (__u64)ctx->work, + }; + + return update_timestart(&perf_kwork_time, &key); +} + +SEC("tracepoint/workqueue/workqueue_execute_end") +int report_workqueue_execute_end(struct trace_event_raw_workqueue_execute_end *ctx) +{ + char name[MAX_KWORKNAME]; + struct work_key key = { + .type = KWORK_CLASS_WORKQUEUE, + .cpu = bpf_get_smp_processor_id(), + .id = (__u64)ctx->work, + }; + unsigned long long func_addr = (unsigned long long)ctx->function; + + __builtin_memset(name, 0, sizeof(name)); + bpf_snprintf(name, sizeof(name), "%ps", &func_addr, sizeof(func_addr)); + + return update_timeend_and_name(&perf_kwork_report, &perf_kwork_time, + &perf_kwork_names, &key, name); +} + +SEC("tracepoint/workqueue/workqueue_activate_work") +int latency_workqueue_activate_work(struct trace_event_raw_workqueue_activate_work *ctx) +{ + struct work_key key = { + .type = KWORK_CLASS_WORKQUEUE, + .cpu = bpf_get_smp_processor_id(), + .id = (__u64)ctx->work, + }; + + return update_timestart(&perf_kwork_time, &key); +} + +SEC("tracepoint/workqueue/workqueue_execute_start") +int latency_workqueue_execute_start(struct trace_event_raw_workqueue_execute_start *ctx) +{ + char name[MAX_KWORKNAME]; + struct work_key key = { + .type = KWORK_CLASS_WORKQUEUE, + .cpu = bpf_get_smp_processor_id(), + .id = (__u64)ctx->work, + }; + unsigned long long func_addr = (unsigned long long)ctx->function; + + __builtin_memset(name, 0, sizeof(name)); + bpf_snprintf(name, sizeof(name), "%ps", &func_addr, sizeof(func_addr)); + + return update_timeend_and_name(&perf_kwork_report, &perf_kwork_time, + &perf_kwork_names, &key, name); +} + +char LICENSE[] SEC("license") = "Dual BSD/GPL"; diff --git a/tools/perf/util/bpf_skel/lock_contention.bpf.c b/tools/perf/util/bpf_skel/lock_contention.bpf.c new file mode 100644 index 000000000..1bb8628e7 --- /dev/null +++ 
b/tools/perf/util/bpf_skel/lock_contention.bpf.c @@ -0,0 +1,176 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +// Copyright (c) 2022 Google +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include <bpf/bpf_core_read.h> + +/* maximum stack trace depth */ +#define MAX_STACKS 8 + +/* default buffer size */ +#define MAX_ENTRIES 10240 + +struct contention_key { + __s32 stack_id; +}; + +struct contention_data { + __u64 total_time; + __u64 min_time; + __u64 max_time; + __u32 count; + __u32 flags; +}; + +struct tstamp_data { + __u64 timestamp; + __u64 lock; + __u32 flags; + __s32 stack_id; +}; + +/* callstack storage */ +struct { + __uint(type, BPF_MAP_TYPE_STACK_TRACE); + __uint(key_size, sizeof(__u32)); + __uint(value_size, MAX_STACKS * sizeof(__u64)); + __uint(max_entries, MAX_ENTRIES); +} stacks SEC(".maps"); + +/* maintain timestamp at the beginning of contention */ +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct tstamp_data); +} tstamp SEC(".maps"); + +/* actual lock contention statistics */ +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(key_size, sizeof(struct contention_key)); + __uint(value_size, sizeof(struct contention_data)); + __uint(max_entries, MAX_ENTRIES); +} lock_stat SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u8)); + __uint(max_entries, 1); +} cpu_filter SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u8)); + __uint(max_entries, 1); +} task_filter SEC(".maps"); + +/* control flags */ +int enabled; +int has_cpu; +int has_task; +int stack_skip; + +/* error stat */ +int lost; + +static inline int can_record(void) +{ + if (has_cpu) { + __u32 cpu = bpf_get_smp_processor_id(); + __u8 *ok; + + ok = bpf_map_lookup_elem(&cpu_filter, &cpu); + if (!ok) + return 0; + } + + if (has_task) { + __u8 *ok; + __u32 pid = bpf_get_current_pid_tgid(); + + ok = bpf_map_lookup_elem(&task_filter, &pid); + if (!ok) + return 0; + } + + return 1; +} + +SEC("tp_btf/contention_begin") +int contention_begin(u64 *ctx) +{ + struct task_struct *curr; + struct tstamp_data *pelem; + + if (!enabled || !can_record()) + return 0; + + curr = bpf_get_current_task_btf(); + pelem = bpf_task_storage_get(&tstamp, curr, NULL, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (!pelem || pelem->lock) + return 0; + + pelem->timestamp = bpf_ktime_get_ns(); + pelem->lock = (__u64)ctx[0]; + pelem->flags = (__u32)ctx[1]; + pelem->stack_id = bpf_get_stackid(ctx, &stacks, BPF_F_FAST_STACK_CMP | stack_skip); + + if (pelem->stack_id < 0) + lost++; + return 0; +} + +SEC("tp_btf/contention_end") +int contention_end(u64 *ctx) +{ + struct task_struct *curr; + struct tstamp_data *pelem; + struct contention_key key; + struct contention_data *data; + __u64 duration; + + if (!enabled) + return 0; + + curr = bpf_get_current_task_btf(); + pelem = bpf_task_storage_get(&tstamp, curr, NULL, 0); + if (!pelem || pelem->lock != ctx[0]) + return 0; + + duration = bpf_ktime_get_ns() - pelem->timestamp; + + key.stack_id = pelem->stack_id; + data = bpf_map_lookup_elem(&lock_stat, &key); + if (!data) { + struct contention_data first = { + .total_time = duration, + .max_time = duration, + .min_time = duration, + .count = 1, + .flags = pelem->flags, + }; + + bpf_map_update_elem(&lock_stat, &key, &first, BPF_NOEXIST); + pelem->lock = 0; + return 0; + } + + 
__sync_fetch_and_add(&data->total_time, duration); + __sync_fetch_and_add(&data->count, 1); + + /* FIXME: need atomic operations */ + if (data->max_time < duration) + data->max_time = duration; + if (data->min_time > duration) + data->min_time = duration; + + pelem->lock = 0; + return 0; +} + +char LICENSE[] SEC("license") = "Dual BSD/GPL"; diff --git a/tools/perf/util/bpf_skel/off_cpu.bpf.c b/tools/perf/util/bpf_skel/off_cpu.bpf.c new file mode 100644 index 000000000..38e3b287d --- /dev/null +++ b/tools/perf/util/bpf_skel/off_cpu.bpf.c @@ -0,0 +1,283 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +// Copyright (c) 2022 Google +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include <bpf/bpf_core_read.h> + +/* task->flags for off-cpu analysis */ +#define PF_KTHREAD 0x00200000 /* I am a kernel thread */ + +/* task->state for off-cpu analysis */ +#define TASK_INTERRUPTIBLE 0x0001 +#define TASK_UNINTERRUPTIBLE 0x0002 + +/* create a new thread */ +#define CLONE_THREAD 0x10000 + +#define MAX_STACKS 32 +#define MAX_ENTRIES 102400 + +struct tstamp_data { + __u32 stack_id; + __u32 state; + __u64 timestamp; +}; + +struct offcpu_key { + __u32 pid; + __u32 tgid; + __u32 stack_id; + __u32 state; + __u64 cgroup_id; +}; + +struct { + __uint(type, BPF_MAP_TYPE_STACK_TRACE); + __uint(key_size, sizeof(__u32)); + __uint(value_size, MAX_STACKS * sizeof(__u64)); + __uint(max_entries, MAX_ENTRIES); +} stacks SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct tstamp_data); +} tstamp SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(key_size, sizeof(struct offcpu_key)); + __uint(value_size, sizeof(__u64)); + __uint(max_entries, MAX_ENTRIES); +} off_cpu SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u8)); + __uint(max_entries, 1); +} cpu_filter SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u8)); + __uint(max_entries, 1); +} task_filter SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(key_size, sizeof(__u64)); + __uint(value_size, sizeof(__u8)); + __uint(max_entries, 1); +} cgroup_filter SEC(".maps"); + +/* new kernel task_struct definition */ +struct task_struct___new { + long __state; +} __attribute__((preserve_access_index)); + +/* old kernel task_struct definition */ +struct task_struct___old { + long state; +} __attribute__((preserve_access_index)); + +int enabled = 0; +int has_cpu = 0; +int has_task = 0; +int has_cgroup = 0; +int uses_tgid = 0; + +const volatile bool has_prev_state = false; +const volatile bool needs_cgroup = false; +const volatile bool uses_cgroup_v1 = false; + +int perf_subsys_id = -1; + +/* + * Old kernel used to call it task_struct->state and now it's '__state'. 
+ * Use BPF CO-RE "ignored suffix rule" to deal with it like below: + * + * https://nakryiko.com/posts/bpf-core-reference-guide/#handling-incompatible-field-and-type-changes + */ +static inline int get_task_state(struct task_struct *t) +{ + /* recast pointer to capture new type for compiler */ + struct task_struct___new *t_new = (void *)t; + + if (bpf_core_field_exists(t_new->__state)) { + return BPF_CORE_READ(t_new, __state); + } else { + /* recast pointer to capture old type for compiler */ + struct task_struct___old *t_old = (void *)t; + + return BPF_CORE_READ(t_old, state); + } +} + +static inline __u64 get_cgroup_id(struct task_struct *t) +{ + struct cgroup *cgrp; + + if (!uses_cgroup_v1) + return BPF_CORE_READ(t, cgroups, dfl_cgrp, kn, id); + + if (perf_subsys_id == -1) { +#if __has_builtin(__builtin_preserve_enum_value) + perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id, + perf_event_cgrp_id); +#else + perf_subsys_id = perf_event_cgrp_id; +#endif + } + + cgrp = BPF_CORE_READ(t, cgroups, subsys[perf_subsys_id], cgroup); + return BPF_CORE_READ(cgrp, kn, id); +} + +static inline int can_record(struct task_struct *t, int state) +{ + /* kernel threads don't have user stack */ + if (t->flags & PF_KTHREAD) + return 0; + + if (state != TASK_INTERRUPTIBLE && + state != TASK_UNINTERRUPTIBLE) + return 0; + + if (has_cpu) { + __u32 cpu = bpf_get_smp_processor_id(); + __u8 *ok; + + ok = bpf_map_lookup_elem(&cpu_filter, &cpu); + if (!ok) + return 0; + } + + if (has_task) { + __u8 *ok; + __u32 pid; + + if (uses_tgid) + pid = t->tgid; + else + pid = t->pid; + + ok = bpf_map_lookup_elem(&task_filter, &pid); + if (!ok) + return 0; + } + + if (has_cgroup) { + __u8 *ok; + __u64 cgrp_id = get_cgroup_id(t); + + ok = bpf_map_lookup_elem(&cgroup_filter, &cgrp_id); + if (!ok) + return 0; + } + + return 1; +} + +static int off_cpu_stat(u64 *ctx, struct task_struct *prev, + struct task_struct *next, int state) +{ + __u64 ts; + __u32 stack_id; + struct tstamp_data *pelem; + + ts = bpf_ktime_get_ns(); + + if (!can_record(prev, state)) + goto next; + + stack_id = bpf_get_stackid(ctx, &stacks, + BPF_F_FAST_STACK_CMP | BPF_F_USER_STACK); + + pelem = bpf_task_storage_get(&tstamp, prev, NULL, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (!pelem) + goto next; + + pelem->timestamp = ts; + pelem->state = state; + pelem->stack_id = stack_id; + +next: + pelem = bpf_task_storage_get(&tstamp, next, NULL, 0); + + if (pelem && pelem->timestamp) { + struct offcpu_key key = { + .pid = next->pid, + .tgid = next->tgid, + .stack_id = pelem->stack_id, + .state = pelem->state, + .cgroup_id = needs_cgroup ? 
get_cgroup_id(next) : 0, + }; + __u64 delta = ts - pelem->timestamp; + __u64 *total; + + total = bpf_map_lookup_elem(&off_cpu, &key); + if (total) + *total += delta; + else + bpf_map_update_elem(&off_cpu, &key, &delta, BPF_ANY); + + /* prevent to reuse the timestamp later */ + pelem->timestamp = 0; + } + + return 0; +} + +SEC("tp_btf/task_newtask") +int on_newtask(u64 *ctx) +{ + struct task_struct *task; + u64 clone_flags; + u32 pid; + u8 val = 1; + + if (!uses_tgid) + return 0; + + task = (struct task_struct *)bpf_get_current_task(); + + pid = BPF_CORE_READ(task, tgid); + if (!bpf_map_lookup_elem(&task_filter, &pid)) + return 0; + + task = (struct task_struct *)ctx[0]; + clone_flags = ctx[1]; + + pid = task->tgid; + if (!(clone_flags & CLONE_THREAD)) + bpf_map_update_elem(&task_filter, &pid, &val, BPF_NOEXIST); + + return 0; +} + +SEC("tp_btf/sched_switch") +int on_switch(u64 *ctx) +{ + struct task_struct *prev, *next; + int prev_state; + + if (!enabled) + return 0; + + prev = (struct task_struct *)ctx[1]; + next = (struct task_struct *)ctx[2]; + + if (has_prev_state) + prev_state = (int)ctx[3]; + else + prev_state = get_task_state(prev); + + return off_cpu_stat(ctx, prev, next, prev_state); +} + +char LICENSE[] SEC("license") = "Dual BSD/GPL"; |
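Editor's note: the off_cpu map above accumulates, per (pid, tgid, stack_id, state, cgroup_id) key, the total nanoseconds a task spent off-CPU; userspace presumably walks it once the recording session ends. Below is a minimal sketch of such a walk using the generic libbpf map-iteration calls. It is not part of the patch: the key struct is simply re-declared to mirror off_cpu.bpf.c, and resolving stack_id would take a second lookup in the "stacks" map, which is not shown.

// SPDX-License-Identifier: GPL-2.0
/* Illustrative only: dump the accumulated off-CPU time per key.
 * map_fd is the fd of the "off_cpu" BPF hash map above. */
#include <stdio.h>
#include <bpf/bpf.h>

/* must match struct offcpu_key in off_cpu.bpf.c */
struct offcpu_key {
	__u32 pid;
	__u32 tgid;
	__u32 stack_id;
	__u32 state;
	__u64 cgroup_id;
};

void dump_off_cpu(int map_fd)
{
	struct offcpu_key key, next;
	struct offcpu_key *prev = NULL;
	__u64 total_ns;

	/* iterate every (task, stack, state) bucket recorded so far */
	while (!bpf_map_get_next_key(map_fd, prev, &next)) {
		key = next;
		prev = &key;

		if (bpf_map_lookup_elem(map_fd, &key, &total_ns))
			continue;

		printf("pid=%u tgid=%u stack=%u state=%u: %llu ns off-cpu\n",
		       key.pid, key.tgid, key.stack_id, key.state,
		       (unsigned long long)total_ns);
	}
}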