summaryrefslogtreecommitdiffstats
path: root/kernel/trace
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/trace')
-rw-r--r--kernel/trace/Kconfig17
-rw-r--r--kernel/trace/blktrace.c3
-rw-r--r--kernel/trace/bpf_trace.c162
-rw-r--r--kernel/trace/bpf_trace.h2
-rw-r--r--kernel/trace/fgraph.c11
-rw-r--r--kernel/trace/ftrace.c46
-rw-r--r--kernel/trace/pid_list.c4
-rw-r--r--kernel/trace/rethook.c4
-rw-r--r--kernel/trace/ring_buffer.c487
-rw-r--r--kernel/trace/trace.c106
-rw-r--r--kernel/trace/trace.h24
-rw-r--r--kernel/trace/trace_benchmark.c2
-rw-r--r--kernel/trace/trace_events.c33
-rw-r--r--kernel/trace/trace_events_hist.c4
-rw-r--r--kernel/trace/trace_events_inject.c2
-rw-r--r--kernel/trace/trace_events_trigger.c6
-rw-r--r--kernel/trace/trace_events_user.c1
-rw-r--r--kernel/trace/trace_fprobe.c6
-rw-r--r--kernel/trace/trace_kprobe.c26
-rw-r--r--kernel/trace/trace_probe.c65
-rw-r--r--kernel/trace/trace_probe.h2
-rw-r--r--kernel/trace/trace_uprobe.c107
-rw-r--r--kernel/trace/tracing_map.c6
23 files changed, 938 insertions, 188 deletions
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 34804c7152..721c3b2210 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -163,7 +163,7 @@ config TRACING
select BINARY_PRINTF
select EVENT_TRACING
select TRACE_CLOCK
- select TASKS_RCU if PREEMPTION
+ select NEED_TASKS_RCU
config GENERIC_TRACER
bool
@@ -204,7 +204,7 @@ config FUNCTION_TRACER
select GENERIC_TRACER
select CONTEXT_SWITCH_TRACER
select GLOB
- select TASKS_RCU if PREEMPTION
+ select NEED_TASKS_RCU
select TASKS_RUDE_RCU
help
Enable the kernel to trace every kernel function. This is done
@@ -974,6 +974,19 @@ config FTRACE_RECORD_RECURSION_SIZE
This file can be reset, but the limit can not change in
size at runtime.
+config FTRACE_VALIDATE_RCU_IS_WATCHING
+ bool "Validate RCU is on during ftrace execution"
+ depends on FUNCTION_TRACER
+ depends on ARCH_WANTS_NO_INSTR
+ help
+ All callbacks that attach to the function tracing have some sort of
+ protection against recursion. This option is only to verify that
+ ftrace (and other users of ftrace_test_recursion_trylock()) are not
+ called outside of RCU, as if they are, it can cause a race. But it
+ also has a noticeable overhead when enabled.
+
+ If unsure, say N
+
config RING_BUFFER_RECORD_RECURSION
bool "Record functions that recurse in the ring buffer"
depends on FTRACE_RECORD_RECURSION
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index d5d94510af..8fd292d34d 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -524,8 +524,7 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
if (!buts->buf_size || !buts->buf_nr)
return -EINVAL;
- strncpy(buts->name, name, BLKTRACE_BDEV_SIZE);
- buts->name[BLKTRACE_BDEV_SIZE - 1] = '\0';
+ strscpy_pad(buts->name, name, BLKTRACE_BDEV_SIZE);
/*
* some device names have larger paths - convert the slashes
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 5d8f918c98..d1daeab1bb 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1053,9 +1053,15 @@ static unsigned long get_entry_ip(unsigned long fentry_ip)
{
u32 instr;
- /* Being extra safe in here in case entry ip is on the page-edge. */
- if (get_kernel_nofault(instr, (u32 *) fentry_ip - 1))
- return fentry_ip;
+ /* We want to be extra safe in case entry ip is on the page edge,
+ * but otherwise we need to avoid get_kernel_nofault()'s overhead.
+ */
+ if ((fentry_ip & ~PAGE_MASK) < ENDBR_INSN_SIZE) {
+ if (get_kernel_nofault(instr, (u32 *)(fentry_ip - ENDBR_INSN_SIZE)))
+ return fentry_ip;
+ } else {
+ instr = *(u32 *)(fentry_ip - ENDBR_INSN_SIZE);
+ }
if (is_endbr(instr))
fentry_ip -= ENDBR_INSN_SIZE;
return fentry_ip;
@@ -1182,9 +1188,6 @@ static const struct bpf_func_proto bpf_get_attach_cookie_proto_tracing = {
BPF_CALL_3(bpf_get_branch_snapshot, void *, buf, u32, size, u64, flags)
{
-#ifndef CONFIG_X86
- return -ENOENT;
-#else
static const u32 br_entry_size = sizeof(struct perf_branch_entry);
u32 entry_cnt = size / br_entry_size;
@@ -1197,7 +1200,6 @@ BPF_CALL_3(bpf_get_branch_snapshot, void *, buf, u32, size, u64, flags)
return -ENOENT;
return entry_cnt * br_entry_size;
-#endif
}
static const struct bpf_func_proto bpf_get_branch_snapshot_proto = {
@@ -1525,8 +1527,6 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_ktime_get_boot_ns_proto;
case BPF_FUNC_tail_call:
return &bpf_tail_call_proto;
- case BPF_FUNC_get_current_pid_tgid:
- return &bpf_get_current_pid_tgid_proto;
case BPF_FUNC_get_current_task:
return &bpf_get_current_task_proto;
case BPF_FUNC_get_current_task_btf:
@@ -1582,8 +1582,6 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_send_signal_thread_proto;
case BPF_FUNC_perf_event_read_value:
return &bpf_perf_event_read_value_proto;
- case BPF_FUNC_get_ns_current_pid_tgid:
- return &bpf_get_ns_current_pid_tgid_proto;
case BPF_FUNC_ringbuf_output:
return &bpf_ringbuf_output_proto;
case BPF_FUNC_ringbuf_reserve:
@@ -1633,6 +1631,17 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
}
}
+static bool is_kprobe_multi(const struct bpf_prog *prog)
+{
+ return prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI ||
+ prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION;
+}
+
+static inline bool is_kprobe_session(const struct bpf_prog *prog)
+{
+ return prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION;
+}
+
static const struct bpf_func_proto *
kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
@@ -1648,13 +1657,13 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_override_return_proto;
#endif
case BPF_FUNC_get_func_ip:
- if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI)
+ if (is_kprobe_multi(prog))
return &bpf_get_func_ip_proto_kprobe_multi;
if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI)
return &bpf_get_func_ip_proto_uprobe_multi;
return &bpf_get_func_ip_proto_kprobe;
case BPF_FUNC_get_attach_cookie:
- if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI)
+ if (is_kprobe_multi(prog))
return &bpf_get_attach_cookie_proto_kmulti;
if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI)
return &bpf_get_attach_cookie_proto_umulti;
@@ -2008,6 +2017,8 @@ raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_stackid_proto_raw_tp;
case BPF_FUNC_get_stack:
return &bpf_get_stack_proto_raw_tp;
+ case BPF_FUNC_get_attach_cookie:
+ return &bpf_get_attach_cookie_proto_tracing;
default:
return bpf_tracing_func_proto(func_id, prog);
}
@@ -2070,6 +2081,9 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_get_func_arg_cnt:
return bpf_prog_has_trampoline(prog) ? &bpf_get_func_arg_cnt_proto : NULL;
case BPF_FUNC_get_attach_cookie:
+ if (prog->type == BPF_PROG_TYPE_TRACING &&
+ prog->expected_attach_type == BPF_TRACE_RAW_TP)
+ return &bpf_get_attach_cookie_proto_tracing;
return bpf_prog_has_trampoline(prog) ? &bpf_get_attach_cookie_proto_tracing : NULL;
default:
fn = raw_tp_prog_func_proto(func_id, prog);
@@ -2370,16 +2384,26 @@ void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp)
}
static __always_inline
-void __bpf_trace_run(struct bpf_prog *prog, u64 *args)
+void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args)
{
+ struct bpf_prog *prog = link->link.prog;
+ struct bpf_run_ctx *old_run_ctx;
+ struct bpf_trace_run_ctx run_ctx;
+
cant_sleep();
if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
bpf_prog_inc_misses_counter(prog);
goto out;
}
+
+ run_ctx.bpf_cookie = link->cookie;
+ old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
+
rcu_read_lock();
(void) bpf_prog_run(prog, args);
rcu_read_unlock();
+
+ bpf_reset_run_ctx(old_run_ctx);
out:
this_cpu_dec(*(prog->active));
}
@@ -2408,12 +2432,12 @@ out:
#define __SEQ_0_11 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
#define BPF_TRACE_DEFN_x(x) \
- void bpf_trace_run##x(struct bpf_prog *prog, \
+ void bpf_trace_run##x(struct bpf_raw_tp_link *link, \
REPEAT(x, SARG, __DL_COM, __SEQ_0_11)) \
{ \
u64 args[x]; \
REPEAT(x, COPY, __DL_SEM, __SEQ_0_11); \
- __bpf_trace_run(prog, args); \
+ __bpf_trace_run(link, args); \
} \
EXPORT_SYMBOL_GPL(bpf_trace_run##x)
BPF_TRACE_DEFN_x(1);
@@ -2429,9 +2453,10 @@ BPF_TRACE_DEFN_x(10);
BPF_TRACE_DEFN_x(11);
BPF_TRACE_DEFN_x(12);
-static int __bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
+int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_raw_tp_link *link)
{
struct tracepoint *tp = btp->tp;
+ struct bpf_prog *prog = link->link.prog;
/*
* check that program doesn't access arguments beyond what's
@@ -2443,18 +2468,12 @@ static int __bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *
if (prog->aux->max_tp_access > btp->writable_size)
return -EINVAL;
- return tracepoint_probe_register_may_exist(tp, (void *)btp->bpf_func,
- prog);
+ return tracepoint_probe_register_may_exist(tp, (void *)btp->bpf_func, link);
}
-int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
+int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_raw_tp_link *link)
{
- return __bpf_probe_register(btp, prog);
-}
-
-int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
-{
- return tracepoint_probe_unregister(btp->tp, (void *)btp->bpf_func, prog);
+ return tracepoint_probe_unregister(btp->tp, (void *)btp->bpf_func, link);
}
int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
@@ -2577,6 +2596,12 @@ static int __init bpf_event_init(void)
fs_initcall(bpf_event_init);
#endif /* CONFIG_MODULES */
+struct bpf_session_run_ctx {
+ struct bpf_run_ctx run_ctx;
+ bool is_return;
+ void *data;
+};
+
#ifdef CONFIG_FPROBE
struct bpf_kprobe_multi_link {
struct bpf_link link;
@@ -2590,7 +2615,7 @@ struct bpf_kprobe_multi_link {
};
struct bpf_kprobe_multi_run_ctx {
- struct bpf_run_ctx run_ctx;
+ struct bpf_session_run_ctx session_ctx;
struct bpf_kprobe_multi_link *link;
unsigned long entry_ip;
};
@@ -2769,7 +2794,8 @@ static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx)
if (WARN_ON_ONCE(!ctx))
return 0;
- run_ctx = container_of(current->bpf_ctx, struct bpf_kprobe_multi_run_ctx, run_ctx);
+ run_ctx = container_of(current->bpf_ctx, struct bpf_kprobe_multi_run_ctx,
+ session_ctx.run_ctx);
link = run_ctx->link;
if (!link->cookies)
return 0;
@@ -2786,15 +2812,21 @@ static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx)
{
struct bpf_kprobe_multi_run_ctx *run_ctx;
- run_ctx = container_of(current->bpf_ctx, struct bpf_kprobe_multi_run_ctx, run_ctx);
+ run_ctx = container_of(current->bpf_ctx, struct bpf_kprobe_multi_run_ctx,
+ session_ctx.run_ctx);
return run_ctx->entry_ip;
}
static int
kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link,
- unsigned long entry_ip, struct pt_regs *regs)
+ unsigned long entry_ip, struct pt_regs *regs,
+ bool is_return, void *data)
{
struct bpf_kprobe_multi_run_ctx run_ctx = {
+ .session_ctx = {
+ .is_return = is_return,
+ .data = data,
+ },
.link = link,
.entry_ip = entry_ip,
};
@@ -2809,7 +2841,7 @@ kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link,
migrate_disable();
rcu_read_lock();
- old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
+ old_run_ctx = bpf_set_run_ctx(&run_ctx.session_ctx.run_ctx);
err = bpf_prog_run(link->link.prog, regs);
bpf_reset_run_ctx(old_run_ctx);
rcu_read_unlock();
@@ -2826,10 +2858,11 @@ kprobe_multi_link_handler(struct fprobe *fp, unsigned long fentry_ip,
void *data)
{
struct bpf_kprobe_multi_link *link;
+ int err;
link = container_of(fp, struct bpf_kprobe_multi_link, fp);
- kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs);
- return 0;
+ err = kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs, false, data);
+ return is_kprobe_session(link->link.prog) ? err : 0;
}
static void
@@ -2840,7 +2873,7 @@ kprobe_multi_link_exit_handler(struct fprobe *fp, unsigned long fentry_ip,
struct bpf_kprobe_multi_link *link;
link = container_of(fp, struct bpf_kprobe_multi_link, fp);
- kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs);
+ kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs, true, data);
}
static int symbols_cmp_r(const void *a, const void *b, const void *priv)
@@ -2973,7 +3006,7 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
if (sizeof(u64) != sizeof(void *))
return -EOPNOTSUPP;
- if (prog->expected_attach_type != BPF_TRACE_KPROBE_MULTI)
+ if (!is_kprobe_multi(prog))
return -EINVAL;
flags = attr->link_create.kprobe_multi.flags;
@@ -3054,10 +3087,12 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
if (err)
goto error;
- if (flags & BPF_F_KPROBE_MULTI_RETURN)
- link->fp.exit_handler = kprobe_multi_link_exit_handler;
- else
+ if (!(flags & BPF_F_KPROBE_MULTI_RETURN))
link->fp.entry_handler = kprobe_multi_link_handler;
+ if ((flags & BPF_F_KPROBE_MULTI_RETURN) || is_kprobe_session(prog))
+ link->fp.exit_handler = kprobe_multi_link_exit_handler;
+ if (is_kprobe_session(prog))
+ link->fp.entry_data_size = sizeof(u64);
link->addrs = addrs;
link->cookies = cookies;
@@ -3388,9 +3423,7 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
}
if (pid) {
- rcu_read_lock();
task = get_pid_task(find_vpid(pid), PIDTYPE_TGID);
- rcu_read_unlock();
if (!task) {
err = -ESRCH;
goto error_path_put;
@@ -3483,3 +3516,52 @@ static u64 bpf_uprobe_multi_entry_ip(struct bpf_run_ctx *ctx)
return 0;
}
#endif /* CONFIG_UPROBES */
+
+__bpf_kfunc_start_defs();
+
+__bpf_kfunc bool bpf_session_is_return(void)
+{
+ struct bpf_session_run_ctx *session_ctx;
+
+ session_ctx = container_of(current->bpf_ctx, struct bpf_session_run_ctx, run_ctx);
+ return session_ctx->is_return;
+}
+
+__bpf_kfunc __u64 *bpf_session_cookie(void)
+{
+ struct bpf_session_run_ctx *session_ctx;
+
+ session_ctx = container_of(current->bpf_ctx, struct bpf_session_run_ctx, run_ctx);
+ return session_ctx->data;
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(kprobe_multi_kfunc_set_ids)
+BTF_ID_FLAGS(func, bpf_session_is_return)
+BTF_ID_FLAGS(func, bpf_session_cookie)
+BTF_KFUNCS_END(kprobe_multi_kfunc_set_ids)
+
+static int bpf_kprobe_multi_filter(const struct bpf_prog *prog, u32 kfunc_id)
+{
+ if (!btf_id_set8_contains(&kprobe_multi_kfunc_set_ids, kfunc_id))
+ return 0;
+
+ if (!is_kprobe_session(prog))
+ return -EACCES;
+
+ return 0;
+}
+
+static const struct btf_kfunc_id_set bpf_kprobe_multi_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &kprobe_multi_kfunc_set_ids,
+ .filter = bpf_kprobe_multi_filter,
+};
+
+static int __init bpf_kprobe_multi_kfuncs_init(void)
+{
+ return register_btf_kfunc_id_set(BPF_PROG_TYPE_KPROBE, &bpf_kprobe_multi_kfunc_set);
+}
+
+late_initcall(bpf_kprobe_multi_kfuncs_init);
diff --git a/kernel/trace/bpf_trace.h b/kernel/trace/bpf_trace.h
index 9acbc11ac7..c4075b56be 100644
--- a/kernel/trace/bpf_trace.h
+++ b/kernel/trace/bpf_trace.h
@@ -19,7 +19,7 @@ TRACE_EVENT(bpf_trace_printk,
),
TP_fast_assign(
- __assign_str(bpf_string, bpf_string);
+ __assign_str(bpf_string);
),
TP_printk("%s", __get_str(bpf_string))
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index c83c005e65..a130b2d898 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -125,17 +125,6 @@ int function_graph_enter(unsigned long ret, unsigned long func,
{
struct ftrace_graph_ent trace;
-#ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS
- /*
- * Skip graph tracing if the return location is served by direct trampoline,
- * since call sequence and return addresses are unpredictable anyway.
- * Ex: BPF trampoline may call original function and may skip frame
- * depending on type of BPF programs attached.
- */
- if (ftrace_direct_func_count &&
- ftrace_find_rec_direct(ret - MCOUNT_INSN_SIZE))
- return -EBUSY;
-#endif
trace.func = func;
trace.depth = ++current->curr_ret_depth;
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 2e11236722..eacab40205 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2538,7 +2538,6 @@ ftrace_find_unique_ops(struct dyn_ftrace *rec)
/* Protected by rcu_tasks for reading, and direct_mutex for writing */
static struct ftrace_hash __rcu *direct_functions = EMPTY_HASH;
static DEFINE_MUTEX(direct_mutex);
-int ftrace_direct_func_count;
/*
* Search the direct_functions hash to see if the given instruction pointer
@@ -3157,8 +3156,7 @@ out:
* synchronize_rcu_tasks() will wait for those tasks to
* execute and either schedule voluntarily or enter user space.
*/
- if (IS_ENABLED(CONFIG_PREEMPTION))
- synchronize_rcu_tasks();
+ synchronize_rcu_tasks();
ftrace_trampoline_free(ops);
}
@@ -4202,12 +4200,12 @@ static int
add_rec_by_index(struct ftrace_hash *hash, struct ftrace_glob *func_g,
int clear_filter)
{
- long index = simple_strtoul(func_g->search, NULL, 0);
+ long index;
struct ftrace_page *pg;
struct dyn_ftrace *rec;
/* The index starts at 1 */
- if (--index < 0)
+ if (kstrtoul(func_g->search, 0, &index) || --index < 0)
return 0;
do_for_each_ftrace_rec(pg, rec) {
@@ -5318,14 +5316,6 @@ ftrace_set_addr(struct ftrace_ops *ops, unsigned long *ips, unsigned int cnt,
#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
-struct ftrace_direct_func {
- struct list_head next;
- unsigned long addr;
- int count;
-};
-
-static LIST_HEAD(ftrace_direct_funcs);
-
static int register_ftrace_function_nolock(struct ftrace_ops *ops);
/*
@@ -5366,6 +5356,13 @@ static void remove_direct_functions_hash(struct ftrace_hash *hash, unsigned long
}
}
+static void register_ftrace_direct_cb(struct rcu_head *rhp)
+{
+ struct ftrace_hash *fhp = container_of(rhp, struct ftrace_hash, rcu);
+
+ free_ftrace_hash(fhp);
+}
+
/**
* register_ftrace_direct - Call a custom trampoline directly
* for multiple functions registered in @ops
@@ -5464,10 +5461,8 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
out_unlock:
mutex_unlock(&direct_mutex);
- if (free_hash && free_hash != EMPTY_HASH) {
- synchronize_rcu_tasks();
- free_ftrace_hash(free_hash);
- }
+ if (free_hash && free_hash != EMPTY_HASH)
+ call_rcu_tasks(&free_hash->rcu, register_ftrace_direct_cb);
if (new_hash)
free_ftrace_hash(new_hash);
@@ -5817,9 +5812,8 @@ __setup("ftrace_graph_notrace=", set_graph_notrace_function);
static int __init set_graph_max_depth_function(char *str)
{
- if (!str)
+ if (!str || kstrtouint(str, 0, &fgraph_max_depth))
return 0;
- fgraph_max_depth = simple_strtoul(str, NULL, 0);
return 1;
}
__setup("ftrace_graph_max_depth=", set_graph_max_depth_function);
@@ -6975,7 +6969,7 @@ allocate_ftrace_mod_map(struct module *mod,
return mod_map;
}
-static const char *
+static int
ftrace_func_address_lookup(struct ftrace_mod_map *mod_map,
unsigned long addr, unsigned long *size,
unsigned long *off, char *sym)
@@ -6996,21 +6990,18 @@ ftrace_func_address_lookup(struct ftrace_mod_map *mod_map,
*size = found_func->size;
if (off)
*off = addr - found_func->ip;
- if (sym)
- strscpy(sym, found_func->name, KSYM_NAME_LEN);
-
- return found_func->name;
+ return strscpy(sym, found_func->name, KSYM_NAME_LEN);
}
- return NULL;
+ return 0;
}
-const char *
+int
ftrace_mod_address_lookup(unsigned long addr, unsigned long *size,
unsigned long *off, char **modname, char *sym)
{
struct ftrace_mod_map *mod_map;
- const char *ret = NULL;
+ int ret = 0;
/* mod_map is freed via call_rcu() */
preempt_disable();
@@ -8278,7 +8269,6 @@ static struct ctl_table ftrace_sysctls[] = {
.mode = 0644,
.proc_handler = ftrace_enable_sysctl,
},
- {}
};
static int __init ftrace_sysctl_init(void)
diff --git a/kernel/trace/pid_list.c b/kernel/trace/pid_list.c
index 95106d02b3..85de221c0b 100644
--- a/kernel/trace/pid_list.c
+++ b/kernel/trace/pid_list.c
@@ -354,7 +354,7 @@ static void pid_list_refill_irq(struct irq_work *iwork)
while (upper_count-- > 0) {
union upper_chunk *chunk;
- chunk = kzalloc(sizeof(*chunk), GFP_KERNEL);
+ chunk = kzalloc(sizeof(*chunk), GFP_NOWAIT);
if (!chunk)
break;
*upper_next = chunk;
@@ -365,7 +365,7 @@ static void pid_list_refill_irq(struct irq_work *iwork)
while (lower_count-- > 0) {
union lower_chunk *chunk;
- chunk = kzalloc(sizeof(*chunk), GFP_KERNEL);
+ chunk = kzalloc(sizeof(*chunk), GFP_NOWAIT);
if (!chunk)
break;
*lower_next = chunk;
diff --git a/kernel/trace/rethook.c b/kernel/trace/rethook.c
index fa03094e9e..30d2249468 100644
--- a/kernel/trace/rethook.c
+++ b/kernel/trace/rethook.c
@@ -166,6 +166,7 @@ struct rethook_node *rethook_try_get(struct rethook *rh)
if (unlikely(!handler))
return NULL;
+#if defined(CONFIG_FTRACE_VALIDATE_RCU_IS_WATCHING) || defined(CONFIG_KPROBE_EVENTS_ON_NOTRACE)
/*
* This expects the caller will set up a rethook on a function entry.
* When the function returns, the rethook will eventually be reclaimed
@@ -174,6 +175,7 @@ struct rethook_node *rethook_try_get(struct rethook *rh)
*/
if (unlikely(!rcu_is_watching()))
return NULL;
+#endif
return (struct rethook_node *)objpool_pop(&rh->pool);
}
@@ -248,7 +250,7 @@ unsigned long rethook_find_ret_addr(struct task_struct *tsk, unsigned long frame
if (WARN_ON_ONCE(!cur))
return 0;
- if (WARN_ON_ONCE(tsk != current && task_is_running(tsk)))
+ if (tsk != current && task_is_running(tsk))
return 0;
do {
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 54887f4c35..28853966aa 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -9,6 +9,7 @@
#include <linux/ring_buffer.h>
#include <linux/trace_clock.h>
#include <linux/sched/clock.h>
+#include <linux/cacheflush.h>
#include <linux/trace_seq.h>
#include <linux/spinlock.h>
#include <linux/irq_work.h>
@@ -26,6 +27,7 @@
#include <linux/list.h>
#include <linux/cpu.h>
#include <linux/oom.h>
+#include <linux/mm.h>
#include <asm/local64.h>
#include <asm/local.h>
@@ -312,6 +314,8 @@ static u64 rb_event_time_stamp(struct ring_buffer_event *event)
/* Missed count stored at end */
#define RB_MISSED_STORED (1 << 30)
+#define RB_MISSED_MASK (3 << 30)
+
struct buffer_data_page {
u64 time_stamp; /* page time stamp */
local_t commit; /* write committed index */
@@ -338,6 +342,7 @@ struct buffer_page {
local_t entries; /* entries on this page */
unsigned long real_end; /* real end of data */
unsigned order; /* order of the page */
+ u32 id; /* ID for external mapping */
struct buffer_data_page *page; /* Actual data page */
};
@@ -484,6 +489,12 @@ struct ring_buffer_per_cpu {
u64 read_stamp;
/* pages removed since last reset */
unsigned long pages_removed;
+
+ unsigned int mapped;
+ struct mutex mapping_lock;
+ unsigned long *subbuf_ids; /* ID to subbuf VA */
+ struct trace_buffer_meta *meta_page;
+
/* ring buffer pages to update, > 0 to add, < 0 to remove */
long nr_pages_to_update;
struct list_head new_pages; /* new pages to add */
@@ -1529,7 +1540,7 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
list_add(&bpage->list, pages);
page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu),
- mflags | __GFP_ZERO,
+ mflags | __GFP_COMP | __GFP_ZERO,
cpu_buffer->buffer->subbuf_order);
if (!page)
goto free_pages;
@@ -1604,6 +1615,7 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters);
init_waitqueue_head(&cpu_buffer->irq_work.waiters);
init_waitqueue_head(&cpu_buffer->irq_work.full_waiters);
+ mutex_init(&cpu_buffer->mapping_lock);
bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
GFP_KERNEL, cpu_to_node(cpu));
@@ -1614,7 +1626,7 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
cpu_buffer->reader_page = bpage;
- page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_ZERO,
+ page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_COMP | __GFP_ZERO,
cpu_buffer->buffer->subbuf_order);
if (!page)
goto fail_free_reader;
@@ -1794,8 +1806,6 @@ bool ring_buffer_time_stamp_abs(struct trace_buffer *buffer)
return buffer->time_stamp_abs;
}
-static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer);
-
static inline unsigned long rb_page_entries(struct buffer_page *bpage)
{
return local_read(&bpage->entries) & RB_WRITE_MASK;
@@ -2327,7 +2337,7 @@ rb_iter_head_event(struct ring_buffer_iter *iter)
/* Size is determined by what has been committed */
static __always_inline unsigned rb_page_size(struct buffer_page *bpage)
{
- return rb_page_commit(bpage);
+ return rb_page_commit(bpage) & ~RB_MISSED_MASK;
}
static __always_inline unsigned
@@ -3954,7 +3964,7 @@ static bool rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
return true;
/* Reader should exhaust content in reader page */
- if (reader->read != rb_page_commit(reader))
+ if (reader->read != rb_page_size(reader))
return false;
/*
@@ -4425,7 +4435,7 @@ int ring_buffer_iter_empty(struct ring_buffer_iter *iter)
return ((iter->head_page == commit_page && iter->head >= commit) ||
(iter->head_page == reader && commit_page == head_page &&
head_page->read == commit &&
- iter->head == rb_page_commit(cpu_buffer->reader_page)));
+ iter->head == rb_page_size(cpu_buffer->reader_page)));
}
EXPORT_SYMBOL_GPL(ring_buffer_iter_empty);
@@ -5045,13 +5055,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_consume);
* @flags: gfp flags to use for memory allocation
*
* This performs the initial preparations necessary to iterate
- * through the buffer. Memory is allocated, buffer recording
+ * through the buffer. Memory is allocated, buffer resizing
* is disabled, and the iterator pointer is returned to the caller.
*
- * Disabling buffer recording prevents the reading from being
- * corrupted. This is not a consuming read, so a producer is not
- * expected.
- *
* After a sequence of ring_buffer_read_prepare calls, the user is
* expected to make at least one call to ring_buffer_read_prepare_sync.
* Afterwards, ring_buffer_read_start is invoked to get things going
@@ -5138,8 +5144,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_read_start);
* ring_buffer_read_finish - finish reading the iterator of the buffer
* @iter: The iterator retrieved by ring_buffer_start
*
- * This re-enables the recording to the buffer, and frees the
- * iterator.
+ * This re-enables resizing of the buffer, and frees the iterator.
*/
void
ring_buffer_read_finish(struct ring_buffer_iter *iter)
@@ -5147,12 +5152,7 @@ ring_buffer_read_finish(struct ring_buffer_iter *iter)
struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
unsigned long flags;
- /*
- * Ring buffer is disabled from recording, here's a good place
- * to check the integrity of the ring buffer.
- * Must prevent readers from trying to read, as the check
- * clears the HEAD page and readers require it.
- */
+ /* Use this opportunity to check the integrity of the ring buffer. */
raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
rb_check_pages(cpu_buffer);
raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
@@ -5220,6 +5220,22 @@ static void rb_clear_buffer_page(struct buffer_page *page)
page->read = 0;
}
+static void rb_update_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct trace_buffer_meta *meta = cpu_buffer->meta_page;
+
+ meta->reader.read = cpu_buffer->reader_page->read;
+ meta->reader.id = cpu_buffer->reader_page->id;
+ meta->reader.lost_events = cpu_buffer->lost_events;
+
+ meta->entries = local_read(&cpu_buffer->entries);
+ meta->overrun = local_read(&cpu_buffer->overrun);
+ meta->read = cpu_buffer->read;
+
+ /* Some archs do not have data cache coherency between kernel and user-space */
+ flush_dcache_folio(virt_to_folio(cpu_buffer->meta_page));
+}
+
static void
rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
{
@@ -5264,6 +5280,9 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
cpu_buffer->lost_events = 0;
cpu_buffer->last_overrun = 0;
+ if (cpu_buffer->mapped)
+ rb_update_meta_page(cpu_buffer);
+
rb_head_page_activate(cpu_buffer);
cpu_buffer->pages_removed = 0;
}
@@ -5478,6 +5497,12 @@ int ring_buffer_swap_cpu(struct trace_buffer *buffer_a,
cpu_buffer_a = buffer_a->buffers[cpu];
cpu_buffer_b = buffer_b->buffers[cpu];
+ /* It's up to the callers to not try to swap mapped buffers */
+ if (WARN_ON_ONCE(cpu_buffer_a->mapped || cpu_buffer_b->mapped)) {
+ ret = -EBUSY;
+ goto out;
+ }
+
/* At least make sure the two buffers are somewhat the same */
if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages)
goto out;
@@ -5588,7 +5613,7 @@ ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu)
goto out;
page = alloc_pages_node(cpu_to_node(cpu),
- GFP_KERNEL | __GFP_NORETRY | __GFP_ZERO,
+ GFP_KERNEL | __GFP_NORETRY | __GFP_COMP | __GFP_ZERO,
cpu_buffer->buffer->subbuf_order);
if (!page) {
kfree(bpage);
@@ -5729,7 +5754,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
event = rb_reader_event(cpu_buffer);
read = reader->read;
- commit = rb_page_commit(reader);
+ commit = rb_page_size(reader);
/* Check if any events were dropped */
missed_events = cpu_buffer->lost_events;
@@ -5742,7 +5767,8 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
* Otherwise, we can simply swap the page with the one passed in.
*/
if (read || (len < (commit - read)) ||
- cpu_buffer->reader_page == cpu_buffer->commit_page) {
+ cpu_buffer->reader_page == cpu_buffer->commit_page ||
+ cpu_buffer->mapped) {
struct buffer_data_page *rpage = cpu_buffer->reader_page->page;
unsigned int rpos = read;
unsigned int pos = 0;
@@ -5805,7 +5831,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
} else {
/* update the entry counter */
cpu_buffer->read += rb_page_entries(reader);
- cpu_buffer->read_bytes += rb_page_commit(reader);
+ cpu_buffer->read_bytes += rb_page_size(reader);
/* swap the pages */
rb_init_page(bpage);
@@ -5965,6 +5991,11 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order)
cpu_buffer = buffer->buffers[cpu];
+ if (cpu_buffer->mapped) {
+ err = -EBUSY;
+ goto error;
+ }
+
/* Update the number of pages to match the new size */
nr_pages = old_size * buffer->buffers[cpu]->nr_pages;
nr_pages = DIV_ROUND_UP(nr_pages, buffer->subbuf_size);
@@ -6066,6 +6097,414 @@ error:
}
EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_set);
+static int rb_alloc_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct page *page;
+
+ if (cpu_buffer->meta_page)
+ return 0;
+
+ page = alloc_page(GFP_USER | __GFP_ZERO);
+ if (!page)
+ return -ENOMEM;
+
+ cpu_buffer->meta_page = page_to_virt(page);
+
+ return 0;
+}
+
+static void rb_free_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ unsigned long addr = (unsigned long)cpu_buffer->meta_page;
+
+ free_page(addr);
+ cpu_buffer->meta_page = NULL;
+}
+
+static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer,
+ unsigned long *subbuf_ids)
+{
+ struct trace_buffer_meta *meta = cpu_buffer->meta_page;
+ unsigned int nr_subbufs = cpu_buffer->nr_pages + 1;
+ struct buffer_page *first_subbuf, *subbuf;
+ int id = 0;
+
+ subbuf_ids[id] = (unsigned long)cpu_buffer->reader_page->page;
+ cpu_buffer->reader_page->id = id++;
+
+ first_subbuf = subbuf = rb_set_head_page(cpu_buffer);
+ do {
+ if (WARN_ON(id >= nr_subbufs))
+ break;
+
+ subbuf_ids[id] = (unsigned long)subbuf->page;
+ subbuf->id = id;
+
+ rb_inc_page(&subbuf);
+ id++;
+ } while (subbuf != first_subbuf);
+
+ /* install subbuf ID to kern VA translation */
+ cpu_buffer->subbuf_ids = subbuf_ids;
+
+ meta->meta_page_size = PAGE_SIZE;
+ meta->meta_struct_len = sizeof(*meta);
+ meta->nr_subbufs = nr_subbufs;
+ meta->subbuf_size = cpu_buffer->buffer->subbuf_size + BUF_PAGE_HDR_SIZE;
+
+ rb_update_meta_page(cpu_buffer);
+}
+
+static struct ring_buffer_per_cpu *
+rb_get_mapped_buffer(struct trace_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return ERR_PTR(-EINVAL);
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ mutex_lock(&cpu_buffer->mapping_lock);
+
+ if (!cpu_buffer->mapped) {
+ mutex_unlock(&cpu_buffer->mapping_lock);
+ return ERR_PTR(-ENODEV);
+ }
+
+ return cpu_buffer;
+}
+
+static void rb_put_mapped_buffer(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ mutex_unlock(&cpu_buffer->mapping_lock);
+}
+
+/*
+ * Fast-path for rb_buffer_(un)map(). Called whenever the meta-page doesn't need
+ * to be set-up or torn-down.
+ */
+static int __rb_inc_dec_mapped(struct ring_buffer_per_cpu *cpu_buffer,
+ bool inc)
+{
+ unsigned long flags;
+
+ lockdep_assert_held(&cpu_buffer->mapping_lock);
+
+ if (inc && cpu_buffer->mapped == UINT_MAX)
+ return -EBUSY;
+
+ if (WARN_ON(!inc && cpu_buffer->mapped == 0))
+ return -EINVAL;
+
+ mutex_lock(&cpu_buffer->buffer->mutex);
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+
+ if (inc)
+ cpu_buffer->mapped++;
+ else
+ cpu_buffer->mapped--;
+
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+ mutex_unlock(&cpu_buffer->buffer->mutex);
+
+ return 0;
+}
+
+/*
+ * +--------------+ pgoff == 0
+ * | meta page |
+ * +--------------+ pgoff == 1
+ * | subbuffer 0 |
+ * | |
+ * +--------------+ pgoff == (1 + (1 << subbuf_order))
+ * | subbuffer 1 |
+ * | |
+ * ...
+ */
+#ifdef CONFIG_MMU
+static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer,
+ struct vm_area_struct *vma)
+{
+ unsigned long nr_subbufs, nr_pages, vma_pages, pgoff = vma->vm_pgoff;
+ unsigned int subbuf_pages, subbuf_order;
+ struct page **pages;
+ int p = 0, s = 0;
+ int err;
+
+ /* Refuse MP_PRIVATE or writable mappings */
+ if (vma->vm_flags & VM_WRITE || vma->vm_flags & VM_EXEC ||
+ !(vma->vm_flags & VM_MAYSHARE))
+ return -EPERM;
+
+ /*
+ * Make sure the mapping cannot become writable later. Also tell the VM
+ * to not touch these pages (VM_DONTCOPY | VM_DONTEXPAND).
+ */
+ vm_flags_mod(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP,
+ VM_MAYWRITE);
+
+ lockdep_assert_held(&cpu_buffer->mapping_lock);
+
+ subbuf_order = cpu_buffer->buffer->subbuf_order;
+ subbuf_pages = 1 << subbuf_order;
+
+ nr_subbufs = cpu_buffer->nr_pages + 1; /* + reader-subbuf */
+ nr_pages = ((nr_subbufs) << subbuf_order) - pgoff + 1; /* + meta-page */
+
+ vma_pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
+ if (!vma_pages || vma_pages > nr_pages)
+ return -EINVAL;
+
+ nr_pages = vma_pages;
+
+ pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL);
+ if (!pages)
+ return -ENOMEM;
+
+ if (!pgoff) {
+ pages[p++] = virt_to_page(cpu_buffer->meta_page);
+
+ /*
+ * TODO: Align sub-buffers on their size, once
+ * vm_insert_pages() supports the zero-page.
+ */
+ } else {
+ /* Skip the meta-page */
+ pgoff--;
+
+ if (pgoff % subbuf_pages) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ s += pgoff / subbuf_pages;
+ }
+
+ while (p < nr_pages) {
+ struct page *page = virt_to_page((void *)cpu_buffer->subbuf_ids[s]);
+ int off = 0;
+
+ if (WARN_ON_ONCE(s >= nr_subbufs)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ for (; off < (1 << (subbuf_order)); off++, page++) {
+ if (p >= nr_pages)
+ break;
+
+ pages[p++] = page;
+ }
+ s++;
+ }
+
+ err = vm_insert_pages(vma, vma->vm_start, pages, &nr_pages);
+
+out:
+ kfree(pages);
+
+ return err;
+}
+#else
+static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer,
+ struct vm_area_struct *vma)
+{
+ return -EOPNOTSUPP;
+}
+#endif
+
+int ring_buffer_map(struct trace_buffer *buffer, int cpu,
+ struct vm_area_struct *vma)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long flags, *subbuf_ids;
+ int err = 0;
+
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return -EINVAL;
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ mutex_lock(&cpu_buffer->mapping_lock);
+
+ if (cpu_buffer->mapped) {
+ err = __rb_map_vma(cpu_buffer, vma);
+ if (!err)
+ err = __rb_inc_dec_mapped(cpu_buffer, true);
+ mutex_unlock(&cpu_buffer->mapping_lock);
+ return err;
+ }
+
+ /* prevent another thread from changing buffer/sub-buffer sizes */
+ mutex_lock(&buffer->mutex);
+
+ err = rb_alloc_meta_page(cpu_buffer);
+ if (err)
+ goto unlock;
+
+ /* subbuf_ids include the reader while nr_pages does not */
+ subbuf_ids = kcalloc(cpu_buffer->nr_pages + 1, sizeof(*subbuf_ids), GFP_KERNEL);
+ if (!subbuf_ids) {
+ rb_free_meta_page(cpu_buffer);
+ err = -ENOMEM;
+ goto unlock;
+ }
+
+ atomic_inc(&cpu_buffer->resize_disabled);
+
+ /*
+ * Lock all readers to block any subbuf swap until the subbuf IDs are
+ * assigned.
+ */
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ rb_setup_ids_meta_page(cpu_buffer, subbuf_ids);
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+ err = __rb_map_vma(cpu_buffer, vma);
+ if (!err) {
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ cpu_buffer->mapped = 1;
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+ } else {
+ kfree(cpu_buffer->subbuf_ids);
+ cpu_buffer->subbuf_ids = NULL;
+ rb_free_meta_page(cpu_buffer);
+ }
+
+unlock:
+ mutex_unlock(&buffer->mutex);
+ mutex_unlock(&cpu_buffer->mapping_lock);
+
+ return err;
+}
+
+int ring_buffer_unmap(struct trace_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long flags;
+ int err = 0;
+
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return -EINVAL;
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ mutex_lock(&cpu_buffer->mapping_lock);
+
+ if (!cpu_buffer->mapped) {
+ err = -ENODEV;
+ goto out;
+ } else if (cpu_buffer->mapped > 1) {
+ __rb_inc_dec_mapped(cpu_buffer, false);
+ goto out;
+ }
+
+ mutex_lock(&buffer->mutex);
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+
+ cpu_buffer->mapped = 0;
+
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+ kfree(cpu_buffer->subbuf_ids);
+ cpu_buffer->subbuf_ids = NULL;
+ rb_free_meta_page(cpu_buffer);
+ atomic_dec(&cpu_buffer->resize_disabled);
+
+ mutex_unlock(&buffer->mutex);
+
+out:
+ mutex_unlock(&cpu_buffer->mapping_lock);
+
+ return err;
+}
+
+int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct buffer_page *reader;
+ unsigned long missed_events;
+ unsigned long reader_size;
+ unsigned long flags;
+
+ cpu_buffer = rb_get_mapped_buffer(buffer, cpu);
+ if (IS_ERR(cpu_buffer))
+ return (int)PTR_ERR(cpu_buffer);
+
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+
+consume:
+ if (rb_per_cpu_empty(cpu_buffer))
+ goto out;
+
+ reader_size = rb_page_size(cpu_buffer->reader_page);
+
+ /*
+ * There are data to be read on the current reader page, we can
+ * return to the caller. But before that, we assume the latter will read
+ * everything. Let's update the kernel reader accordingly.
+ */
+ if (cpu_buffer->reader_page->read < reader_size) {
+ while (cpu_buffer->reader_page->read < reader_size)
+ rb_advance_reader(cpu_buffer);
+ goto out;
+ }
+
+ reader = rb_get_reader_page(cpu_buffer);
+ if (WARN_ON(!reader))
+ goto out;
+
+ /* Check if any events were dropped */
+ missed_events = cpu_buffer->lost_events;
+
+ if (cpu_buffer->reader_page != cpu_buffer->commit_page) {
+ if (missed_events) {
+ struct buffer_data_page *bpage = reader->page;
+ unsigned int commit;
+ /*
+ * Use the real_end for the data size,
+ * This gives us a chance to store the lost events
+ * on the page.
+ */
+ if (reader->real_end)
+ local_set(&bpage->commit, reader->real_end);
+ /*
+ * If there is room at the end of the page to save the
+ * missed events, then record it there.
+ */
+ commit = rb_page_size(reader);
+ if (buffer->subbuf_size - commit >= sizeof(missed_events)) {
+ memcpy(&bpage->data[commit], &missed_events,
+ sizeof(missed_events));
+ local_add(RB_MISSED_STORED, &bpage->commit);
+ }
+ local_add(RB_MISSED_EVENTS, &bpage->commit);
+ }
+ } else {
+ /*
+ * There really shouldn't be any missed events if the commit
+ * is on the reader page.
+ */
+ WARN_ON_ONCE(missed_events);
+ }
+
+ cpu_buffer->lost_events = 0;
+
+ goto consume;
+
+out:
+ /* Some archs do not have data cache coherency between kernel and user-space */
+ flush_dcache_folio(virt_to_folio(cpu_buffer->reader_page->page));
+
+ rb_update_meta_page(cpu_buffer);
+
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+ rb_put_mapped_buffer(cpu_buffer);
+
+ return 0;
+}
+
/*
* We only allocate new buffers, never free them if the CPU goes down.
* If we were to free the buffer, then the user would lose any trace that was in
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 233d1af39f..578a49ff5c 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1191,6 +1191,12 @@ static void tracing_snapshot_instance_cond(struct trace_array *tr,
return;
}
+ if (tr->mapped) {
+ trace_array_puts(tr, "*** BUFFER MEMORY MAPPED ***\n");
+ trace_array_puts(tr, "*** Can not use snapshot (sorry) ***\n");
+ return;
+ }
+
local_irq_save(flags);
update_max_tr(tr, current, smp_processor_id(), cond_data);
local_irq_restore(flags);
@@ -1323,7 +1329,7 @@ static int tracing_arm_snapshot_locked(struct trace_array *tr)
lockdep_assert_held(&trace_types_lock);
spin_lock(&tr->snapshot_trigger_lock);
- if (tr->snapshot == UINT_MAX) {
+ if (tr->snapshot == UINT_MAX || tr->mapped) {
spin_unlock(&tr->snapshot_trigger_lock);
return -EBUSY;
}
@@ -5540,7 +5546,7 @@ static const char readme_msg[] =
"\t kernel return probes support: $retval, $arg<N>, $comm\n"
"\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, char, string, symbol,\n"
"\t b<bit-width>@<bit-offset>/<container-size>, ustring,\n"
- "\t symstr, <type>\\[<array-size>\\]\n"
+ "\t symstr, %pd/%pD, <type>\\[<array-size>\\]\n"
#ifdef CONFIG_HIST_TRIGGERS
"\t field: <stype> <name>;\n"
"\t stype: u8/u16/u32/u64, s8/s16/s32/s64, pid_t,\n"
@@ -6068,7 +6074,7 @@ static void tracing_set_nop(struct trace_array *tr)
{
if (tr->current_trace == &nop_trace)
return;
-
+
tr->current_trace->enabled--;
if (tr->current_trace->reset)
@@ -8194,15 +8200,32 @@ out:
return ret;
}
-/* An ioctl call with cmd 0 to the ring buffer file will wake up all waiters */
static long tracing_buffers_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
struct ftrace_buffer_info *info = file->private_data;
struct trace_iterator *iter = &info->iter;
+ int err;
+
+ if (cmd == TRACE_MMAP_IOCTL_GET_READER) {
+ if (!(file->f_flags & O_NONBLOCK)) {
+ err = ring_buffer_wait(iter->array_buffer->buffer,
+ iter->cpu_file,
+ iter->tr->buffer_percent,
+ NULL, NULL);
+ if (err)
+ return err;
+ }
- if (cmd)
- return -ENOIOCTLCMD;
+ return ring_buffer_map_get_reader(iter->array_buffer->buffer,
+ iter->cpu_file);
+ } else if (cmd) {
+ return -ENOTTY;
+ }
+ /*
+ * An ioctl call with cmd 0 to the ring buffer file will wake up all
+ * waiters
+ */
mutex_lock(&trace_types_lock);
/* Make sure the waiters see the new wait_index */
@@ -8214,6 +8237,76 @@ static long tracing_buffers_ioctl(struct file *file, unsigned int cmd, unsigned
return 0;
}
+#ifdef CONFIG_TRACER_MAX_TRACE
+static int get_snapshot_map(struct trace_array *tr)
+{
+ int err = 0;
+
+ /*
+ * Called with mmap_lock held. lockdep would be unhappy if we would now
+ * take trace_types_lock. Instead use the specific
+ * snapshot_trigger_lock.
+ */
+ spin_lock(&tr->snapshot_trigger_lock);
+
+ if (tr->snapshot || tr->mapped == UINT_MAX)
+ err = -EBUSY;
+ else
+ tr->mapped++;
+
+ spin_unlock(&tr->snapshot_trigger_lock);
+
+ /* Wait for update_max_tr() to observe iter->tr->mapped */
+ if (tr->mapped == 1)
+ synchronize_rcu();
+
+ return err;
+
+}
+static void put_snapshot_map(struct trace_array *tr)
+{
+ spin_lock(&tr->snapshot_trigger_lock);
+ if (!WARN_ON(!tr->mapped))
+ tr->mapped--;
+ spin_unlock(&tr->snapshot_trigger_lock);
+}
+#else
+static inline int get_snapshot_map(struct trace_array *tr) { return 0; }
+static inline void put_snapshot_map(struct trace_array *tr) { }
+#endif
+
+static void tracing_buffers_mmap_close(struct vm_area_struct *vma)
+{
+ struct ftrace_buffer_info *info = vma->vm_file->private_data;
+ struct trace_iterator *iter = &info->iter;
+
+ WARN_ON(ring_buffer_unmap(iter->array_buffer->buffer, iter->cpu_file));
+ put_snapshot_map(iter->tr);
+}
+
+static const struct vm_operations_struct tracing_buffers_vmops = {
+ .close = tracing_buffers_mmap_close,
+};
+
+static int tracing_buffers_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+ struct ftrace_buffer_info *info = filp->private_data;
+ struct trace_iterator *iter = &info->iter;
+ int ret = 0;
+
+ ret = get_snapshot_map(iter->tr);
+ if (ret)
+ return ret;
+
+ ret = ring_buffer_map(iter->array_buffer->buffer, iter->cpu_file, vma);
+ if (ret)
+ put_snapshot_map(iter->tr);
+
+ vma->vm_ops = &tracing_buffers_vmops;
+
+ return ret;
+}
+
static const struct file_operations tracing_buffers_fops = {
.open = tracing_buffers_open,
.read = tracing_buffers_read,
@@ -8223,6 +8316,7 @@ static const struct file_operations tracing_buffers_fops = {
.splice_read = tracing_buffers_splice_read,
.unlocked_ioctl = tracing_buffers_ioctl,
.llseek = no_llseek,
+ .mmap = tracing_buffers_mmap,
};
static ssize_t
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 64450615ca..8edab43580 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -336,6 +336,7 @@ struct trace_array {
bool allocated_snapshot;
spinlock_t snapshot_trigger_lock;
unsigned int snapshot;
+ unsigned int mapped;
unsigned long max_latency;
#ifdef CONFIG_FSNOTIFY
struct dentry *d_max_latency;
@@ -1572,6 +1573,29 @@ static inline void *event_file_data(struct file *filp)
extern struct mutex event_mutex;
extern struct list_head ftrace_events;
+/*
+ * When the trace_event_file is the filp->i_private pointer,
+ * it must be taken under the event_mutex lock, and then checked
+ * if the EVENT_FILE_FL_FREED flag is set. If it is, then the
+ * data pointed to by the trace_event_file can not be trusted.
+ *
+ * Use the event_file_file() to access the trace_event_file from
+ * the filp the first time under the event_mutex and check for
+ * NULL. If it is needed to be retrieved again and the event_mutex
+ * is still held, then the event_file_data() can be used and it
+ * is guaranteed to be valid.
+ */
+static inline struct trace_event_file *event_file_file(struct file *filp)
+{
+ struct trace_event_file *file;
+
+ lockdep_assert_held(&event_mutex);
+ file = READ_ONCE(file_inode(filp)->i_private);
+ if (!file || file->flags & EVENT_FILE_FL_FREED)
+ return NULL;
+ return file;
+}
+
extern const struct file_operations event_trigger_fops;
extern const struct file_operations event_hist_fops;
extern const struct file_operations event_hist_debug_fops;
diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c
index 811b084394..e19c32f2a9 100644
--- a/kernel/trace/trace_benchmark.c
+++ b/kernel/trace/trace_benchmark.c
@@ -104,7 +104,7 @@ static void trace_do_benchmark(void)
stddev = 0;
delta = bm_total;
- delta = div64_u64(delta, bm_cnt);
+ do_div(delta, (u32)bm_cnt);
avg = delta;
if (stddev > 0) {
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 6ef29eba90..f08fbaf8ca 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1386,12 +1386,12 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
char buf[4] = "0";
mutex_lock(&event_mutex);
- file = event_file_data(filp);
+ file = event_file_file(filp);
if (likely(file))
flags = file->flags;
mutex_unlock(&event_mutex);
- if (!file || flags & EVENT_FILE_FL_FREED)
+ if (!file)
return -ENODEV;
if (flags & EVENT_FILE_FL_ENABLED &&
@@ -1424,8 +1424,8 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
case 1:
ret = -ENODEV;
mutex_lock(&event_mutex);
- file = event_file_data(filp);
- if (likely(file && !(file->flags & EVENT_FILE_FL_FREED))) {
+ file = event_file_file(filp);
+ if (likely(file)) {
ret = tracing_update_buffers(file->tr);
if (ret < 0) {
mutex_unlock(&event_mutex);
@@ -1540,7 +1540,8 @@ enum {
static void *f_next(struct seq_file *m, void *v, loff_t *pos)
{
- struct trace_event_call *call = event_file_data(m->private);
+ struct trace_event_file *file = event_file_data(m->private);
+ struct trace_event_call *call = file->event_call;
struct list_head *common_head = &ftrace_common_fields;
struct list_head *head = trace_get_fields(call);
struct list_head *node = v;
@@ -1572,7 +1573,8 @@ static void *f_next(struct seq_file *m, void *v, loff_t *pos)
static int f_show(struct seq_file *m, void *v)
{
- struct trace_event_call *call = event_file_data(m->private);
+ struct trace_event_file *file = event_file_data(m->private);
+ struct trace_event_call *call = file->event_call;
struct ftrace_event_field *field;
const char *array_descriptor;
@@ -1627,12 +1629,14 @@ static int f_show(struct seq_file *m, void *v)
static void *f_start(struct seq_file *m, loff_t *pos)
{
+ struct trace_event_file *file;
void *p = (void *)FORMAT_HEADER;
loff_t l = 0;
/* ->stop() is called even if ->start() fails */
mutex_lock(&event_mutex);
- if (!event_file_data(m->private))
+ file = event_file_file(m->private);
+ if (!file)
return ERR_PTR(-ENODEV);
while (l < *pos && p)
@@ -1706,8 +1710,8 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
trace_seq_init(s);
mutex_lock(&event_mutex);
- file = event_file_data(filp);
- if (file && !(file->flags & EVENT_FILE_FL_FREED))
+ file = event_file_file(filp);
+ if (file)
print_event_filter(file, s);
mutex_unlock(&event_mutex);
@@ -1736,9 +1740,13 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
return PTR_ERR(buf);
mutex_lock(&event_mutex);
- file = event_file_data(filp);
- if (file)
- err = apply_event_filter(file, buf);
+ file = event_file_file(filp);
+ if (file) {
+ if (file->flags & EVENT_FILE_FL_FREED)
+ err = -ENODEV;
+ else
+ err = apply_event_filter(file, buf);
+ }
mutex_unlock(&event_mutex);
kfree(buf);
@@ -2485,7 +2493,6 @@ static int event_callback(const char *name, umode_t *mode, void **data,
if (strcmp(name, "format") == 0) {
*mode = TRACE_MODE_READ;
*fops = &ftrace_event_format_fops;
- *data = call;
return 1;
}
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 6ece1308d3..5f9119eb7c 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -5601,7 +5601,7 @@ static int hist_show(struct seq_file *m, void *v)
mutex_lock(&event_mutex);
- event_file = event_file_data(m->private);
+ event_file = event_file_file(m->private);
if (unlikely(!event_file)) {
ret = -ENODEV;
goto out_unlock;
@@ -5880,7 +5880,7 @@ static int hist_debug_show(struct seq_file *m, void *v)
mutex_lock(&event_mutex);
- event_file = event_file_data(m->private);
+ event_file = event_file_file(m->private);
if (unlikely(!event_file)) {
ret = -ENODEV;
goto out_unlock;
diff --git a/kernel/trace/trace_events_inject.c b/kernel/trace/trace_events_inject.c
index 8650562bda..a8f076809d 100644
--- a/kernel/trace/trace_events_inject.c
+++ b/kernel/trace/trace_events_inject.c
@@ -299,7 +299,7 @@ event_inject_write(struct file *filp, const char __user *ubuf, size_t cnt,
strim(buf);
mutex_lock(&event_mutex);
- file = event_file_data(filp);
+ file = event_file_file(filp);
if (file) {
call = file->event_call;
size = parse_entry(buf, call, &entry);
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 4bec043c86..a5e3d6acf1 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -159,7 +159,7 @@ static void *trigger_start(struct seq_file *m, loff_t *pos)
/* ->stop() is called even if ->start() fails */
mutex_lock(&event_mutex);
- event_file = event_file_data(m->private);
+ event_file = event_file_file(m->private);
if (unlikely(!event_file))
return ERR_PTR(-ENODEV);
@@ -213,7 +213,7 @@ static int event_trigger_regex_open(struct inode *inode, struct file *file)
mutex_lock(&event_mutex);
- if (unlikely(!event_file_data(file))) {
+ if (unlikely(!event_file_file(file))) {
mutex_unlock(&event_mutex);
return -ENODEV;
}
@@ -293,7 +293,7 @@ static ssize_t event_trigger_regex_write(struct file *file,
strim(buf);
mutex_lock(&event_mutex);
- event_file = event_file_data(file);
+ event_file = event_file_file(file);
if (unlikely(!event_file)) {
mutex_unlock(&event_mutex);
kfree(buf);
diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index 82b191f33a..3a2b46847c 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -2907,7 +2907,6 @@ static struct ctl_table user_event_sysctls[] = {
.mode = 0644,
.proc_handler = set_max_user_events_sysctl,
},
- {}
};
static int __init trace_events_user_init(void)
diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c
index 4f42808155..62e6a8f4aa 100644
--- a/kernel/trace/trace_fprobe.c
+++ b/kernel/trace/trace_fprobe.c
@@ -994,6 +994,7 @@ static int __trace_fprobe_create(int argc, const char *argv[])
char gbuf[MAX_EVENT_NAME_LEN];
char sbuf[KSYM_NAME_LEN];
char abuf[MAX_BTF_ARGS_LEN];
+ char *dbuf = NULL;
bool is_tracepoint = false;
struct tracepoint *tpoint = NULL;
struct traceprobe_parse_context ctx = {
@@ -1104,6 +1105,10 @@ static int __trace_fprobe_create(int argc, const char *argv[])
argv = new_argv;
}
+ ret = traceprobe_expand_dentry_args(argc, argv, &dbuf);
+ if (ret)
+ goto out;
+
/* setup a probe */
tf = alloc_trace_fprobe(group, event, symbol, tpoint, maxactive,
argc, is_return);
@@ -1154,6 +1159,7 @@ out:
trace_probe_log_clear();
kfree(new_argv);
kfree(symbol);
+ kfree(dbuf);
return ret;
parse_error:
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 14099cc17f..16383247bd 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -111,6 +111,7 @@ static nokprobe_inline bool trace_kprobe_within_module(struct trace_kprobe *tk,
return strncmp(module_name(mod), name, len) == 0 && name[len] == ':';
}
+#ifdef CONFIG_MODULES
static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk)
{
char *p;
@@ -129,6 +130,12 @@ static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk)
return ret;
}
+#else
+static inline bool trace_kprobe_module_exist(struct trace_kprobe *tk)
+{
+ return false;
+}
+#endif
static bool trace_kprobe_is_busy(struct dyn_event *ev)
{
@@ -670,6 +677,7 @@ end:
return ret;
}
+#ifdef CONFIG_MODULES
/* Module notifier call back, checking event on the module */
static int trace_kprobe_module_callback(struct notifier_block *nb,
unsigned long val, void *data)
@@ -704,6 +712,16 @@ static struct notifier_block trace_kprobe_module_nb = {
.notifier_call = trace_kprobe_module_callback,
.priority = 1 /* Invoked after kprobe module callback */
};
+static int trace_kprobe_register_module_notifier(void)
+{
+ return register_module_notifier(&trace_kprobe_module_nb);
+}
+#else
+static int trace_kprobe_register_module_notifier(void)
+{
+ return 0;
+}
+#endif /* CONFIG_MODULES */
static int count_symbols(void *data, unsigned long unused)
{
@@ -782,6 +800,7 @@ static int __trace_kprobe_create(int argc, const char *argv[])
char buf[MAX_EVENT_NAME_LEN];
char gbuf[MAX_EVENT_NAME_LEN];
char abuf[MAX_BTF_ARGS_LEN];
+ char *dbuf = NULL;
struct traceprobe_parse_context ctx = { .flags = TPARG_FL_KERNEL };
switch (argv[0][0]) {
@@ -933,6 +952,10 @@ static int __trace_kprobe_create(int argc, const char *argv[])
argv = new_argv;
}
+ ret = traceprobe_expand_dentry_args(argc, argv, &dbuf);
+ if (ret)
+ goto out;
+
/* setup a probe */
tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive,
argc, is_return);
@@ -979,6 +1002,7 @@ out:
trace_probe_log_clear();
kfree(new_argv);
kfree(symbol);
+ kfree(dbuf);
return ret;
parse_error:
@@ -1933,7 +1957,7 @@ static __init int init_kprobe_trace_early(void)
if (ret)
return ret;
- if (register_module_notifier(&trace_kprobe_module_nb))
+ if (trace_kprobe_register_module_notifier())
return -EINVAL;
return 0;
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 1a7e7cf944..39877c80d6 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -12,6 +12,7 @@
#define pr_fmt(fmt) "trace_probe: " fmt
#include <linux/bpf.h>
+#include <linux/fs.h>
#include "trace_btf.h"
#include "trace_probe.h"
@@ -1184,8 +1185,6 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
return ret;
}
-#define BYTES_TO_BITS(nb) ((BITS_PER_LONG * (nb)) / sizeof(long))
-
/* Bitfield type needs to be parsed into a fetch function */
static int __parse_bitfield_probe_arg(const char *bf,
const struct fetch_type *t,
@@ -1743,6 +1742,68 @@ error:
return ERR_PTR(ret);
}
+/* @buf: *buf must be equal to NULL. Caller must to free *buf */
+int traceprobe_expand_dentry_args(int argc, const char *argv[], char **buf)
+{
+ int i, used, ret;
+ const int bufsize = MAX_DENTRY_ARGS_LEN;
+ char *tmpbuf = NULL;
+
+ if (*buf)
+ return -EINVAL;
+
+ used = 0;
+ for (i = 0; i < argc; i++) {
+ char *tmp;
+ char *equal;
+ size_t arg_len;
+
+ if (!glob_match("*:%p[dD]", argv[i]))
+ continue;
+
+ if (!tmpbuf) {
+ tmpbuf = kmalloc(bufsize, GFP_KERNEL);
+ if (!tmpbuf)
+ return -ENOMEM;
+ }
+
+ tmp = kstrdup(argv[i], GFP_KERNEL);
+ if (!tmp)
+ goto nomem;
+
+ equal = strchr(tmp, '=');
+ if (equal)
+ *equal = '\0';
+ arg_len = strlen(argv[i]);
+ tmp[arg_len - 4] = '\0';
+ if (argv[i][arg_len - 1] == 'd')
+ ret = snprintf(tmpbuf + used, bufsize - used,
+ "%s%s+0x0(+0x%zx(%s)):string",
+ equal ? tmp : "", equal ? "=" : "",
+ offsetof(struct dentry, d_name.name),
+ equal ? equal + 1 : tmp);
+ else
+ ret = snprintf(tmpbuf + used, bufsize - used,
+ "%s%s+0x0(+0x%zx(+0x%zx(%s))):string",
+ equal ? tmp : "", equal ? "=" : "",
+ offsetof(struct dentry, d_name.name),
+ offsetof(struct file, f_path.dentry),
+ equal ? equal + 1 : tmp);
+
+ kfree(tmp);
+ if (ret >= bufsize - used)
+ goto nomem;
+ argv[i] = tmpbuf + used;
+ used += ret + 1;
+ }
+
+ *buf = tmpbuf;
+ return 0;
+nomem:
+ kfree(tmpbuf);
+ return -ENOMEM;
+}
+
void traceprobe_finish_parse(struct traceprobe_parse_context *ctx)
{
clear_btf_context(ctx);
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index cef3a50628..5803e6a415 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -34,6 +34,7 @@
#define MAX_ARRAY_LEN 64
#define MAX_ARG_NAME_LEN 32
#define MAX_BTF_ARGS_LEN 128
+#define MAX_DENTRY_ARGS_LEN 256
#define MAX_STRING_SIZE PATH_MAX
#define MAX_ARG_BUF_LEN (MAX_TRACE_ARGS * MAX_ARG_NAME_LEN)
@@ -428,6 +429,7 @@ extern int traceprobe_parse_probe_arg(struct trace_probe *tp, int i,
const char **traceprobe_expand_meta_args(int argc, const char *argv[],
int *new_argc, char *buf, int bufsize,
struct traceprobe_parse_context *ctx);
+extern int traceprobe_expand_dentry_args(int argc, const char *argv[], char **buf);
extern int traceprobe_update_arg(struct probe_arg *arg);
extern void traceprobe_free_probe_arg(struct probe_arg *arg);
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 9e46136245..c98e3b3386 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -854,6 +854,7 @@ static const struct file_operations uprobe_profile_ops = {
struct uprobe_cpu_buffer {
struct mutex mutex;
void *buf;
+ int dsize;
};
static struct uprobe_cpu_buffer __percpu *uprobe_cpu_buffer;
static int uprobe_buffer_refcnt;
@@ -940,12 +941,36 @@ static struct uprobe_cpu_buffer *uprobe_buffer_get(void)
static void uprobe_buffer_put(struct uprobe_cpu_buffer *ucb)
{
+ if (!ucb)
+ return;
mutex_unlock(&ucb->mutex);
}
+static struct uprobe_cpu_buffer *prepare_uprobe_buffer(struct trace_uprobe *tu,
+ struct pt_regs *regs,
+ struct uprobe_cpu_buffer **ucbp)
+{
+ struct uprobe_cpu_buffer *ucb;
+ int dsize, esize;
+
+ if (*ucbp)
+ return *ucbp;
+
+ esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
+ dsize = __get_data_size(&tu->tp, regs, NULL);
+
+ ucb = uprobe_buffer_get();
+ ucb->dsize = tu->tp.size + dsize;
+
+ store_trace_args(ucb->buf, &tu->tp, regs, NULL, esize, dsize);
+
+ *ucbp = ucb;
+ return ucb;
+}
+
static void __uprobe_trace_func(struct trace_uprobe *tu,
unsigned long func, struct pt_regs *regs,
- struct uprobe_cpu_buffer *ucb, int dsize,
+ struct uprobe_cpu_buffer *ucb,
struct trace_event_file *trace_file)
{
struct uprobe_trace_entry_head *entry;
@@ -956,14 +981,14 @@ static void __uprobe_trace_func(struct trace_uprobe *tu,
WARN_ON(call != trace_file->event_call);
- if (WARN_ON_ONCE(tu->tp.size + dsize > PAGE_SIZE))
+ if (WARN_ON_ONCE(ucb->dsize > PAGE_SIZE))
return;
if (trace_trigger_soft_disabled(trace_file))
return;
esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
- size = esize + tu->tp.size + dsize;
+ size = esize + ucb->dsize;
entry = trace_event_buffer_reserve(&fbuffer, trace_file, size);
if (!entry)
return;
@@ -977,23 +1002,26 @@ static void __uprobe_trace_func(struct trace_uprobe *tu,
data = DATAOF_TRACE_ENTRY(entry, false);
}
- memcpy(data, ucb->buf, tu->tp.size + dsize);
+ memcpy(data, ucb->buf, ucb->dsize);
trace_event_buffer_commit(&fbuffer);
}
/* uprobe handler */
static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs,
- struct uprobe_cpu_buffer *ucb, int dsize)
+ struct uprobe_cpu_buffer **ucbp)
{
struct event_file_link *link;
+ struct uprobe_cpu_buffer *ucb;
if (is_ret_probe(tu))
return 0;
+ ucb = prepare_uprobe_buffer(tu, regs, ucbp);
+
rcu_read_lock();
trace_probe_for_each_link_rcu(link, &tu->tp)
- __uprobe_trace_func(tu, 0, regs, ucb, dsize, link->file);
+ __uprobe_trace_func(tu, 0, regs, ucb, link->file);
rcu_read_unlock();
return 0;
@@ -1001,13 +1029,16 @@ static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs,
static void uretprobe_trace_func(struct trace_uprobe *tu, unsigned long func,
struct pt_regs *regs,
- struct uprobe_cpu_buffer *ucb, int dsize)
+ struct uprobe_cpu_buffer **ucbp)
{
struct event_file_link *link;
+ struct uprobe_cpu_buffer *ucb;
+
+ ucb = prepare_uprobe_buffer(tu, regs, ucbp);
rcu_read_lock();
trace_probe_for_each_link_rcu(link, &tu->tp)
- __uprobe_trace_func(tu, func, regs, ucb, dsize, link->file);
+ __uprobe_trace_func(tu, func, regs, ucb, link->file);
rcu_read_unlock();
}
@@ -1199,9 +1230,6 @@ __uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm)
{
struct perf_event *event;
- if (filter->nr_systemwide)
- return true;
-
list_for_each_entry(event, &filter->perf_events, hw.tp_list) {
if (event->hw.target->mm == mm)
return true;
@@ -1326,6 +1354,13 @@ static bool uprobe_perf_filter(struct uprobe_consumer *uc,
tu = container_of(uc, struct trace_uprobe, consumer);
filter = tu->tp.event->filter;
+ /*
+ * speculative short-circuiting check to avoid unnecessarily taking
+ * filter->rwlock below, if the uprobe has system-wide consumer
+ */
+ if (READ_ONCE(filter->nr_systemwide))
+ return true;
+
read_lock(&filter->rwlock);
ret = __uprobe_perf_filter(filter, mm);
read_unlock(&filter->rwlock);
@@ -1335,10 +1370,11 @@ static bool uprobe_perf_filter(struct uprobe_consumer *uc,
static void __uprobe_perf_func(struct trace_uprobe *tu,
unsigned long func, struct pt_regs *regs,
- struct uprobe_cpu_buffer *ucb, int dsize)
+ struct uprobe_cpu_buffer **ucbp)
{
struct trace_event_call *call = trace_probe_event_call(&tu->tp);
struct uprobe_trace_entry_head *entry;
+ struct uprobe_cpu_buffer *ucb;
struct hlist_head *head;
void *data;
int size, esize;
@@ -1356,7 +1392,8 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
- size = esize + tu->tp.size + dsize;
+ ucb = prepare_uprobe_buffer(tu, regs, ucbp);
+ size = esize + ucb->dsize;
size = ALIGN(size + sizeof(u32), sizeof(u64)) - sizeof(u32);
if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough"))
return;
@@ -1379,13 +1416,10 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
data = DATAOF_TRACE_ENTRY(entry, false);
}
- memcpy(data, ucb->buf, tu->tp.size + dsize);
-
- if (size - esize > tu->tp.size + dsize) {
- int len = tu->tp.size + dsize;
+ memcpy(data, ucb->buf, ucb->dsize);
- memset(data + len, 0, size - esize - len);
- }
+ if (size - esize > ucb->dsize)
+ memset(data + ucb->dsize, 0, size - esize - ucb->dsize);
perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
head, NULL);
@@ -1395,21 +1429,21 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
/* uprobe profile handler */
static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs,
- struct uprobe_cpu_buffer *ucb, int dsize)
+ struct uprobe_cpu_buffer **ucbp)
{
if (!uprobe_perf_filter(&tu->consumer, 0, current->mm))
return UPROBE_HANDLER_REMOVE;
if (!is_ret_probe(tu))
- __uprobe_perf_func(tu, 0, regs, ucb, dsize);
+ __uprobe_perf_func(tu, 0, regs, ucbp);
return 0;
}
static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func,
struct pt_regs *regs,
- struct uprobe_cpu_buffer *ucb, int dsize)
+ struct uprobe_cpu_buffer **ucbp)
{
- __uprobe_perf_func(tu, func, regs, ucb, dsize);
+ __uprobe_perf_func(tu, func, regs, ucbp);
}
int bpf_get_uprobe_info(const struct perf_event *event, u32 *fd_type,
@@ -1474,11 +1508,9 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
{
struct trace_uprobe *tu;
struct uprobe_dispatch_data udd;
- struct uprobe_cpu_buffer *ucb;
- int dsize, esize;
+ struct uprobe_cpu_buffer *ucb = NULL;
int ret = 0;
-
tu = container_of(con, struct trace_uprobe, consumer);
tu->nhit++;
@@ -1490,18 +1522,12 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
if (WARN_ON_ONCE(!uprobe_cpu_buffer))
return 0;
- dsize = __get_data_size(&tu->tp, regs, NULL);
- esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
-
- ucb = uprobe_buffer_get();
- store_trace_args(ucb->buf, &tu->tp, regs, NULL, esize, dsize);
-
if (trace_probe_test_flag(&tu->tp, TP_FLAG_TRACE))
- ret |= uprobe_trace_func(tu, regs, ucb, dsize);
+ ret |= uprobe_trace_func(tu, regs, &ucb);
#ifdef CONFIG_PERF_EVENTS
if (trace_probe_test_flag(&tu->tp, TP_FLAG_PROFILE))
- ret |= uprobe_perf_func(tu, regs, ucb, dsize);
+ ret |= uprobe_perf_func(tu, regs, &ucb);
#endif
uprobe_buffer_put(ucb);
return ret;
@@ -1512,8 +1538,7 @@ static int uretprobe_dispatcher(struct uprobe_consumer *con,
{
struct trace_uprobe *tu;
struct uprobe_dispatch_data udd;
- struct uprobe_cpu_buffer *ucb;
- int dsize, esize;
+ struct uprobe_cpu_buffer *ucb = NULL;
tu = container_of(con, struct trace_uprobe, consumer);
@@ -1525,18 +1550,12 @@ static int uretprobe_dispatcher(struct uprobe_consumer *con,
if (WARN_ON_ONCE(!uprobe_cpu_buffer))
return 0;
- dsize = __get_data_size(&tu->tp, regs, NULL);
- esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
-
- ucb = uprobe_buffer_get();
- store_trace_args(ucb->buf, &tu->tp, regs, NULL, esize, dsize);
-
if (trace_probe_test_flag(&tu->tp, TP_FLAG_TRACE))
- uretprobe_trace_func(tu, func, regs, ucb, dsize);
+ uretprobe_trace_func(tu, func, regs, &ucb);
#ifdef CONFIG_PERF_EVENTS
if (trace_probe_test_flag(&tu->tp, TP_FLAG_PROFILE))
- uretprobe_perf_func(tu, func, regs, ucb, dsize);
+ uretprobe_perf_func(tu, func, regs, &ucb);
#endif
uprobe_buffer_put(ucb);
return 0;
diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
index a4dcf0f243..3a56e7c8aa 100644
--- a/kernel/trace/tracing_map.c
+++ b/kernel/trace/tracing_map.c
@@ -454,7 +454,7 @@ static struct tracing_map_elt *get_free_elt(struct tracing_map *map)
struct tracing_map_elt *elt = NULL;
int idx;
- idx = atomic_inc_return(&map->next_elt);
+ idx = atomic_fetch_add_unless(&map->next_elt, 1, map->max_elts);
if (idx < map->max_elts) {
elt = *(TRACING_MAP_ELT(map->elts, idx));
if (map->ops && map->ops->elt_init)
@@ -699,7 +699,7 @@ void tracing_map_clear(struct tracing_map *map)
{
unsigned int i;
- atomic_set(&map->next_elt, -1);
+ atomic_set(&map->next_elt, 0);
atomic64_set(&map->hits, 0);
atomic64_set(&map->drops, 0);
@@ -783,7 +783,7 @@ struct tracing_map *tracing_map_create(unsigned int map_bits,
map->map_bits = map_bits;
map->max_elts = (1 << map_bits);
- atomic_set(&map->next_elt, -1);
+ atomic_set(&map->next_elt, 0);
map->map_size = (1 << (map_bits + 1));
map->ops = ops;