summaryrefslogtreecommitdiffstats
path: root/kernel/trace
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-18 18:50:03 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-18 18:50:03 +0000
commit01a69402cf9d38ff180345d55c2ee51c7e89fbc7 (patch)
treeb406c5242a088c4f59c6e4b719b783f43aca6ae9 /kernel/trace
parentAdding upstream version 6.7.12. (diff)
downloadlinux-01a69402cf9d38ff180345d55c2ee51c7e89fbc7.tar.xz
linux-01a69402cf9d38ff180345d55c2ee51c7e89fbc7.zip
Adding upstream version 6.8.9.upstream/6.8.9
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'kernel/trace')
-rw-r--r--kernel/trace/bpf_trace.c183
-rw-r--r--kernel/trace/ring_buffer.c784
-rw-r--r--kernel/trace/ring_buffer_benchmark.c10
-rw-r--r--kernel/trace/trace.c277
-rw-r--r--kernel/trace/trace.h2
-rw-r--r--kernel/trace/trace_boot.c2
-rw-r--r--kernel/trace/trace_events.c111
-rw-r--r--kernel/trace/trace_events_hist.c49
-rw-r--r--kernel/trace/trace_events_user.c4
-rw-r--r--kernel/trace/trace_kprobe.c4
-rw-r--r--kernel/trace/trace_output.c6
-rw-r--r--kernel/trace/trace_seq.c3
-rw-r--r--kernel/trace/trace_uprobe.c2
13 files changed, 1012 insertions, 425 deletions
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 652c40a14d..c8d1ebc438 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -24,6 +24,7 @@
#include <linux/key.h>
#include <linux/verification.h>
#include <linux/namei.h>
+#include <linux/fileattr.h>
#include <net/bpf_sk_storage.h>
@@ -1379,6 +1380,8 @@ __bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr_kern *data_ptr,
struct bpf_dynptr_kern *sig_ptr,
struct bpf_key *trusted_keyring)
{
+ const void *data, *sig;
+ u32 data_len, sig_len;
int ret;
if (trusted_keyring->has_ref) {
@@ -1395,10 +1398,12 @@ __bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr_kern *data_ptr,
return ret;
}
- return verify_pkcs7_signature(data_ptr->data,
- __bpf_dynptr_size(data_ptr),
- sig_ptr->data,
- __bpf_dynptr_size(sig_ptr),
+ data_len = __bpf_dynptr_size(data_ptr);
+ data = __bpf_dynptr_data(data_ptr, data_len);
+ sig_len = __bpf_dynptr_size(sig_ptr);
+ sig = __bpf_dynptr_data(sig_ptr, sig_len);
+
+ return verify_pkcs7_signature(data, data_len, sig, sig_len,
trusted_keyring->key,
VERIFYING_UNSPECIFIED_SIGNATURE, NULL,
NULL);
@@ -1430,6 +1435,72 @@ static int __init bpf_key_sig_kfuncs_init(void)
late_initcall(bpf_key_sig_kfuncs_init);
#endif /* CONFIG_KEYS */
+/* filesystem kfuncs */
+__bpf_kfunc_start_defs();
+
+/**
+ * bpf_get_file_xattr - get xattr of a file
+ * @file: file to get xattr from
+ * @name__str: name of the xattr
+ * @value_ptr: output buffer of the xattr value
+ *
+ * Get xattr *name__str* of *file* and store the output in *value_ptr*.
+ *
+ * For security reasons, only *name__str* with prefix "user." is allowed.
+ *
+ * Return: 0 on success, a negative value on error.
+ */
+__bpf_kfunc int bpf_get_file_xattr(struct file *file, const char *name__str,
+ struct bpf_dynptr_kern *value_ptr)
+{
+ struct dentry *dentry;
+ u32 value_len;
+ void *value;
+ int ret;
+
+ if (strncmp(name__str, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN))
+ return -EPERM;
+
+ value_len = __bpf_dynptr_size(value_ptr);
+ value = __bpf_dynptr_data_rw(value_ptr, value_len);
+ if (!value)
+ return -EINVAL;
+
+ dentry = file_dentry(file);
+ ret = inode_permission(&nop_mnt_idmap, dentry->d_inode, MAY_READ);
+ if (ret)
+ return ret;
+ return __vfs_getxattr(dentry, dentry->d_inode, name__str, value, value_len);
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_SET8_START(fs_kfunc_set_ids)
+BTF_ID_FLAGS(func, bpf_get_file_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS)
+BTF_SET8_END(fs_kfunc_set_ids)
+
+static int bpf_get_file_xattr_filter(const struct bpf_prog *prog, u32 kfunc_id)
+{
+ if (!btf_id_set8_contains(&fs_kfunc_set_ids, kfunc_id))
+ return 0;
+
+ /* Only allow to attach from LSM hooks, to avoid recursion */
+ return prog->type != BPF_PROG_TYPE_LSM ? -EACCES : 0;
+}
+
+static const struct btf_kfunc_id_set bpf_fs_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &fs_kfunc_set_ids,
+ .filter = bpf_get_file_xattr_filter,
+};
+
+static int __init bpf_fs_kfuncs_init(void)
+{
+ return register_btf_kfunc_id_set(BPF_PROG_TYPE_LSM, &bpf_fs_kfunc_set);
+}
+
+late_initcall(bpf_fs_kfuncs_init);
+
static const struct bpf_func_proto *
bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
@@ -2642,7 +2713,7 @@ static int bpf_kprobe_multi_link_fill_link_info(const struct bpf_link *link,
static const struct bpf_link_ops bpf_kprobe_multi_link_lops = {
.release = bpf_kprobe_multi_link_release,
- .dealloc = bpf_kprobe_multi_link_dealloc,
+ .dealloc_deferred = bpf_kprobe_multi_link_dealloc,
.fill_link_info = bpf_kprobe_multi_link_fill_link_info,
};
@@ -3034,6 +3105,7 @@ struct bpf_uprobe_multi_link;
struct bpf_uprobe {
struct bpf_uprobe_multi_link *link;
loff_t offset;
+ unsigned long ref_ctr_offset;
u64 cookie;
struct uprobe_consumer consumer;
};
@@ -3042,6 +3114,7 @@ struct bpf_uprobe_multi_link {
struct path path;
struct bpf_link link;
u32 cnt;
+ u32 flags;
struct bpf_uprobe *uprobes;
struct task_struct *task;
};
@@ -3069,6 +3142,9 @@ static void bpf_uprobe_multi_link_release(struct bpf_link *link)
umulti_link = container_of(link, struct bpf_uprobe_multi_link, link);
bpf_uprobe_unregister(&umulti_link->path, umulti_link->uprobes, umulti_link->cnt);
+ if (umulti_link->task)
+ put_task_struct(umulti_link->task);
+ path_put(&umulti_link->path);
}
static void bpf_uprobe_multi_link_dealloc(struct bpf_link *link)
@@ -3076,16 +3152,83 @@ static void bpf_uprobe_multi_link_dealloc(struct bpf_link *link)
struct bpf_uprobe_multi_link *umulti_link;
umulti_link = container_of(link, struct bpf_uprobe_multi_link, link);
- if (umulti_link->task)
- put_task_struct(umulti_link->task);
- path_put(&umulti_link->path);
kvfree(umulti_link->uprobes);
kfree(umulti_link);
}
+static int bpf_uprobe_multi_link_fill_link_info(const struct bpf_link *link,
+ struct bpf_link_info *info)
+{
+ u64 __user *uref_ctr_offsets = u64_to_user_ptr(info->uprobe_multi.ref_ctr_offsets);
+ u64 __user *ucookies = u64_to_user_ptr(info->uprobe_multi.cookies);
+ u64 __user *uoffsets = u64_to_user_ptr(info->uprobe_multi.offsets);
+ u64 __user *upath = u64_to_user_ptr(info->uprobe_multi.path);
+ u32 upath_size = info->uprobe_multi.path_size;
+ struct bpf_uprobe_multi_link *umulti_link;
+ u32 ucount = info->uprobe_multi.count;
+ int err = 0, i;
+ long left;
+
+ if (!upath ^ !upath_size)
+ return -EINVAL;
+
+ if ((uoffsets || uref_ctr_offsets || ucookies) && !ucount)
+ return -EINVAL;
+
+ umulti_link = container_of(link, struct bpf_uprobe_multi_link, link);
+ info->uprobe_multi.count = umulti_link->cnt;
+ info->uprobe_multi.flags = umulti_link->flags;
+ info->uprobe_multi.pid = umulti_link->task ?
+ task_pid_nr_ns(umulti_link->task, task_active_pid_ns(current)) : 0;
+
+ if (upath) {
+ char *p, *buf;
+
+ upath_size = min_t(u32, upath_size, PATH_MAX);
+
+ buf = kmalloc(upath_size, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+ p = d_path(&umulti_link->path, buf, upath_size);
+ if (IS_ERR(p)) {
+ kfree(buf);
+ return PTR_ERR(p);
+ }
+ upath_size = buf + upath_size - p;
+ left = copy_to_user(upath, p, upath_size);
+ kfree(buf);
+ if (left)
+ return -EFAULT;
+ info->uprobe_multi.path_size = upath_size;
+ }
+
+ if (!uoffsets && !ucookies && !uref_ctr_offsets)
+ return 0;
+
+ if (ucount < umulti_link->cnt)
+ err = -ENOSPC;
+ else
+ ucount = umulti_link->cnt;
+
+ for (i = 0; i < ucount; i++) {
+ if (uoffsets &&
+ put_user(umulti_link->uprobes[i].offset, uoffsets + i))
+ return -EFAULT;
+ if (uref_ctr_offsets &&
+ put_user(umulti_link->uprobes[i].ref_ctr_offset, uref_ctr_offsets + i))
+ return -EFAULT;
+ if (ucookies &&
+ put_user(umulti_link->uprobes[i].cookie, ucookies + i))
+ return -EFAULT;
+ }
+
+ return err;
+}
+
static const struct bpf_link_ops bpf_uprobe_multi_link_lops = {
.release = bpf_uprobe_multi_link_release,
- .dealloc = bpf_uprobe_multi_link_dealloc,
+ .dealloc_deferred = bpf_uprobe_multi_link_dealloc,
+ .fill_link_info = bpf_uprobe_multi_link_fill_link_info,
};
static int uprobe_prog_run(struct bpf_uprobe *uprobe,
@@ -3173,7 +3316,6 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
{
struct bpf_uprobe_multi_link *link = NULL;
unsigned long __user *uref_ctr_offsets;
- unsigned long *ref_ctr_offsets = NULL;
struct bpf_link_primer link_primer;
struct bpf_uprobe *uprobes = NULL;
struct task_struct *task = NULL;
@@ -3248,22 +3390,20 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
if (!uprobes || !link)
goto error_free;
- if (uref_ctr_offsets) {
- ref_ctr_offsets = kvcalloc(cnt, sizeof(*ref_ctr_offsets), GFP_KERNEL);
- if (!ref_ctr_offsets)
- goto error_free;
- }
-
for (i = 0; i < cnt; i++) {
- if (ucookies && __get_user(uprobes[i].cookie, ucookies + i)) {
+ if (__get_user(uprobes[i].offset, uoffsets + i)) {
err = -EFAULT;
goto error_free;
}
- if (uref_ctr_offsets && __get_user(ref_ctr_offsets[i], uref_ctr_offsets + i)) {
+ if (uprobes[i].offset < 0) {
+ err = -EINVAL;
+ goto error_free;
+ }
+ if (uref_ctr_offsets && __get_user(uprobes[i].ref_ctr_offset, uref_ctr_offsets + i)) {
err = -EFAULT;
goto error_free;
}
- if (__get_user(uprobes[i].offset, uoffsets + i)) {
+ if (ucookies && __get_user(uprobes[i].cookie, ucookies + i)) {
err = -EFAULT;
goto error_free;
}
@@ -3283,6 +3423,7 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
link->uprobes = uprobes;
link->path = path;
link->task = task;
+ link->flags = flags;
bpf_link_init(&link->link, BPF_LINK_TYPE_UPROBE_MULTI,
&bpf_uprobe_multi_link_lops, prog);
@@ -3290,7 +3431,7 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
for (i = 0; i < cnt; i++) {
err = uprobe_register_refctr(d_real_inode(link->path.dentry),
uprobes[i].offset,
- ref_ctr_offsets ? ref_ctr_offsets[i] : 0,
+ uprobes[i].ref_ctr_offset,
&uprobes[i].consumer);
if (err) {
bpf_uprobe_unregister(&path, uprobes, i);
@@ -3302,11 +3443,9 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
if (err)
goto error_free;
- kvfree(ref_ctr_offsets);
return bpf_link_settle(&link_primer);
error_free:
- kvfree(ref_ctr_offsets);
kvfree(uprobes);
kfree(link);
if (task)
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 140f8eed83..faf56d9a9e 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -27,6 +27,7 @@
#include <linux/cpu.h>
#include <linux/oom.h>
+#include <asm/local64.h>
#include <asm/local.h>
/*
@@ -317,6 +318,11 @@ struct buffer_data_page {
unsigned char data[] RB_ALIGN_DATA; /* data of buffer page */
};
+struct buffer_data_read_page {
+ unsigned order; /* order of the page */
+ struct buffer_data_page *data; /* actual data, stored in this page */
+};
+
/*
* Note, the buffer_page list must be first. The buffer pages
* are allocated in cache lines, which means that each buffer
@@ -331,6 +337,7 @@ struct buffer_page {
unsigned read; /* index for next read */
local_t entries; /* entries on this page */
unsigned long real_end; /* real end of data */
+ unsigned order; /* order of the page */
struct buffer_data_page *page; /* Actual data page */
};
@@ -361,7 +368,7 @@ static __always_inline unsigned int rb_page_commit(struct buffer_page *bpage)
static void free_buffer_page(struct buffer_page *bpage)
{
- free_page((unsigned long)bpage->page);
+ free_pages((unsigned long)bpage->page, bpage->order);
kfree(bpage);
}
@@ -373,45 +380,11 @@ static inline bool test_time_stamp(u64 delta)
return !!(delta & TS_DELTA_TEST);
}
-#define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE)
-
-/* Max payload is BUF_PAGE_SIZE - header (8bytes) */
-#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2))
-
-int ring_buffer_print_page_header(struct trace_seq *s)
-{
- struct buffer_data_page field;
-
- trace_seq_printf(s, "\tfield: u64 timestamp;\t"
- "offset:0;\tsize:%u;\tsigned:%u;\n",
- (unsigned int)sizeof(field.time_stamp),
- (unsigned int)is_signed_type(u64));
-
- trace_seq_printf(s, "\tfield: local_t commit;\t"
- "offset:%u;\tsize:%u;\tsigned:%u;\n",
- (unsigned int)offsetof(typeof(field), commit),
- (unsigned int)sizeof(field.commit),
- (unsigned int)is_signed_type(long));
-
- trace_seq_printf(s, "\tfield: int overwrite;\t"
- "offset:%u;\tsize:%u;\tsigned:%u;\n",
- (unsigned int)offsetof(typeof(field), commit),
- 1,
- (unsigned int)is_signed_type(long));
-
- trace_seq_printf(s, "\tfield: char data;\t"
- "offset:%u;\tsize:%u;\tsigned:%u;\n",
- (unsigned int)offsetof(typeof(field), data),
- (unsigned int)BUF_PAGE_SIZE,
- (unsigned int)is_signed_type(char));
-
- return !trace_seq_has_overflowed(s);
-}
-
struct rb_irq_work {
struct irq_work work;
wait_queue_head_t waiters;
wait_queue_head_t full_waiters;
+ atomic_t seq;
bool waiters_pending;
bool full_waiters_pending;
bool wakeup_full;
@@ -462,27 +435,9 @@ enum {
RB_CTX_MAX
};
-#if BITS_PER_LONG == 32
-#define RB_TIME_32
-#endif
-
-/* To test on 64 bit machines */
-//#define RB_TIME_32
-
-#ifdef RB_TIME_32
-
-struct rb_time_struct {
- local_t cnt;
- local_t top;
- local_t bottom;
- local_t msb;
-};
-#else
-#include <asm/local64.h>
struct rb_time_struct {
local64_t time;
};
-#endif
typedef struct rb_time_struct rb_time_t;
#define MAX_NEST 5
@@ -556,6 +511,10 @@ struct trace_buffer {
struct rb_irq_work irq_work;
bool time_stamp_abs;
+
+ unsigned int subbuf_size;
+ unsigned int subbuf_order;
+ unsigned int max_data_size;
};
struct ring_buffer_iter {
@@ -569,150 +528,48 @@ struct ring_buffer_iter {
u64 read_stamp;
u64 page_stamp;
struct ring_buffer_event *event;
+ size_t event_size;
int missed_events;
};
-#ifdef RB_TIME_32
-
-/*
- * On 32 bit machines, local64_t is very expensive. As the ring
- * buffer doesn't need all the features of a true 64 bit atomic,
- * on 32 bit, it uses these functions (64 still uses local64_t).
- *
- * For the ring buffer, 64 bit required operations for the time is
- * the following:
- *
- * - Reads may fail if it interrupted a modification of the time stamp.
- * It will succeed if it did not interrupt another write even if
- * the read itself is interrupted by a write.
- * It returns whether it was successful or not.
- *
- * - Writes always succeed and will overwrite other writes and writes
- * that were done by events interrupting the current write.
- *
- * - A write followed by a read of the same time stamp will always succeed,
- * but may not contain the same value.
- *
- * - A cmpxchg will fail if it interrupted another write or cmpxchg.
- * Other than that, it acts like a normal cmpxchg.
- *
- * The 60 bit time stamp is broken up by 30 bits in a top and bottom half
- * (bottom being the least significant 30 bits of the 60 bit time stamp).
- *
- * The two most significant bits of each half holds a 2 bit counter (0-3).
- * Each update will increment this counter by one.
- * When reading the top and bottom, if the two counter bits match then the
- * top and bottom together make a valid 60 bit number.
- */
-#define RB_TIME_SHIFT 30
-#define RB_TIME_VAL_MASK ((1 << RB_TIME_SHIFT) - 1)
-#define RB_TIME_MSB_SHIFT 60
-
-static inline int rb_time_cnt(unsigned long val)
-{
- return (val >> RB_TIME_SHIFT) & 3;
-}
-
-static inline u64 rb_time_val(unsigned long top, unsigned long bottom)
-{
- u64 val;
-
- val = top & RB_TIME_VAL_MASK;
- val <<= RB_TIME_SHIFT;
- val |= bottom & RB_TIME_VAL_MASK;
-
- return val;
-}
-
-static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *cnt)
-{
- unsigned long top, bottom, msb;
- unsigned long c;
-
- /*
- * If the read is interrupted by a write, then the cnt will
- * be different. Loop until both top and bottom have been read
- * without interruption.
- */
- do {
- c = local_read(&t->cnt);
- top = local_read(&t->top);
- bottom = local_read(&t->bottom);
- msb = local_read(&t->msb);
- } while (c != local_read(&t->cnt));
-
- *cnt = rb_time_cnt(top);
-
- /* If top, msb or bottom counts don't match, this interrupted a write */
- if (*cnt != rb_time_cnt(msb) || *cnt != rb_time_cnt(bottom))
- return false;
-
- /* The shift to msb will lose its cnt bits */
- *ret = rb_time_val(top, bottom) | ((u64)msb << RB_TIME_MSB_SHIFT);
- return true;
-}
-
-static bool rb_time_read(rb_time_t *t, u64 *ret)
-{
- unsigned long cnt;
-
- return __rb_time_read(t, ret, &cnt);
-}
-
-static inline unsigned long rb_time_val_cnt(unsigned long val, unsigned long cnt)
+int ring_buffer_print_page_header(struct trace_buffer *buffer, struct trace_seq *s)
{
- return (val & RB_TIME_VAL_MASK) | ((cnt & 3) << RB_TIME_SHIFT);
-}
+ struct buffer_data_page field;
-static inline void rb_time_split(u64 val, unsigned long *top, unsigned long *bottom,
- unsigned long *msb)
-{
- *top = (unsigned long)((val >> RB_TIME_SHIFT) & RB_TIME_VAL_MASK);
- *bottom = (unsigned long)(val & RB_TIME_VAL_MASK);
- *msb = (unsigned long)(val >> RB_TIME_MSB_SHIFT);
-}
+ trace_seq_printf(s, "\tfield: u64 timestamp;\t"
+ "offset:0;\tsize:%u;\tsigned:%u;\n",
+ (unsigned int)sizeof(field.time_stamp),
+ (unsigned int)is_signed_type(u64));
-static inline void rb_time_val_set(local_t *t, unsigned long val, unsigned long cnt)
-{
- val = rb_time_val_cnt(val, cnt);
- local_set(t, val);
-}
+ trace_seq_printf(s, "\tfield: local_t commit;\t"
+ "offset:%u;\tsize:%u;\tsigned:%u;\n",
+ (unsigned int)offsetof(typeof(field), commit),
+ (unsigned int)sizeof(field.commit),
+ (unsigned int)is_signed_type(long));
-static void rb_time_set(rb_time_t *t, u64 val)
-{
- unsigned long cnt, top, bottom, msb;
+ trace_seq_printf(s, "\tfield: int overwrite;\t"
+ "offset:%u;\tsize:%u;\tsigned:%u;\n",
+ (unsigned int)offsetof(typeof(field), commit),
+ 1,
+ (unsigned int)is_signed_type(long));
- rb_time_split(val, &top, &bottom, &msb);
+ trace_seq_printf(s, "\tfield: char data;\t"
+ "offset:%u;\tsize:%u;\tsigned:%u;\n",
+ (unsigned int)offsetof(typeof(field), data),
+ (unsigned int)buffer->subbuf_size,
+ (unsigned int)is_signed_type(char));
- /* Writes always succeed with a valid number even if it gets interrupted. */
- do {
- cnt = local_inc_return(&t->cnt);
- rb_time_val_set(&t->top, top, cnt);
- rb_time_val_set(&t->bottom, bottom, cnt);
- rb_time_val_set(&t->msb, val >> RB_TIME_MSB_SHIFT, cnt);
- } while (cnt != local_read(&t->cnt));
+ return !trace_seq_has_overflowed(s);
}
-static inline bool
-rb_time_read_cmpxchg(local_t *l, unsigned long expect, unsigned long set)
-{
- return local_try_cmpxchg(l, &expect, set);
-}
-
-#else /* 64 bits */
-
-/* local64_t always succeeds */
-
-static inline bool rb_time_read(rb_time_t *t, u64 *ret)
+static inline void rb_time_read(rb_time_t *t, u64 *ret)
{
*ret = local64_read(&t->time);
- return true;
}
static void rb_time_set(rb_time_t *t, u64 val)
{
local64_set(&t->time, val);
}
-#endif
/*
* Enable this to make sure that the event passed to
@@ -819,10 +676,7 @@ u64 ring_buffer_event_time_stamp(struct trace_buffer *buffer,
WARN_ONCE(1, "nest (%d) greater than max", nest);
fail:
- /* Can only fail on 32 bit */
- if (!rb_time_read(&cpu_buffer->write_stamp, &ts))
- /* Screw it, just read the current time */
- ts = rb_time_stamp(cpu_buffer->buffer);
+ rb_time_read(&cpu_buffer->write_stamp, &ts);
return ts;
}
@@ -900,6 +754,9 @@ static void rb_wake_up_waiters(struct irq_work *work)
{
struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work);
+ /* For waiters waiting for the first wake up */
+ (void)atomic_fetch_inc_release(&rbwork->seq);
+
wake_up_all(&rbwork->waiters);
if (rbwork->full_waiters_pending || rbwork->wakeup_full) {
/* Only cpu_buffer sets the above flags */
@@ -1028,20 +885,21 @@ rb_wait_cond(struct rb_irq_work *rbwork, struct trace_buffer *buffer,
return false;
}
+struct rb_wait_data {
+ struct rb_irq_work *irq_work;
+ int seq;
+};
+
/*
* The default wait condition for ring_buffer_wait() is to just to exit the
* wait loop the first time it is woken up.
*/
static bool rb_wait_once(void *data)
{
- long *once = data;
-
- /* wait_event() actually calls this twice before scheduling*/
- if (*once > 1)
- return true;
+ struct rb_wait_data *rdata = data;
+ struct rb_irq_work *rbwork = rdata->irq_work;
- (*once)++;
- return false;
+ return atomic_read_acquire(&rbwork->seq) != rdata->seq;
}
/**
@@ -1049,24 +907,22 @@ static bool rb_wait_once(void *data)
* @buffer: buffer to wait on
* @cpu: the cpu buffer to wait on
* @full: wait until the percentage of pages are available, if @cpu != RING_BUFFER_ALL_CPUS
+ * @cond: condition function to break out of wait (NULL to run once)
+ * @data: the data to pass to @cond.
*
* If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon
* as data is added to any of the @buffer's cpu buffers. Otherwise
* it will wait for data to be added to a specific cpu buffer.
*/
-int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
+int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full,
+ ring_buffer_cond_fn cond, void *data)
{
struct ring_buffer_per_cpu *cpu_buffer;
struct wait_queue_head *waitq;
- ring_buffer_cond_fn cond;
struct rb_irq_work *rbwork;
- void *data;
- long once = 0;
+ struct rb_wait_data rdata;
int ret = 0;
- cond = rb_wait_once;
- data = &once;
-
/*
* Depending on what the caller is waiting for, either any
* data in any cpu buffer, or a specific buffer, put the
@@ -1088,6 +944,14 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
else
waitq = &rbwork->waiters;
+ /* Set up to exit loop as soon as it is woken */
+ if (!cond) {
+ cond = rb_wait_once;
+ rdata.irq_work = rbwork;
+ rdata.seq = atomic_read_acquire(&rbwork->seq);
+ data = &rdata;
+ }
+
ret = wait_event_interruptible((*waitq),
rb_wait_cond(rbwork, buffer, cpu, full, cond, data));
@@ -1536,7 +1400,6 @@ static void rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer,
old_write = local_add_return(RB_WRITE_INTCNT, &next_page->write);
old_entries = local_add_return(RB_WRITE_INTCNT, &next_page->entries);
- local_inc(&cpu_buffer->pages_touched);
/*
* Just make sure we have seen our old_write and synchronize
* with any interrupts that come in.
@@ -1573,8 +1436,9 @@ static void rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer,
*/
local_set(&next_page->page->commit, 0);
- /* Again, either we update tail_page or an interrupt does */
- (void)cmpxchg(&cpu_buffer->tail_page, tail_page, next_page);
+ /* Either we update tail_page or an interrupt does */
+ if (try_cmpxchg(&cpu_buffer->tail_page, &tail_page, next_page))
+ local_inc(&cpu_buffer->pages_touched);
}
}
@@ -1666,10 +1530,12 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
list_add(&bpage->list, pages);
- page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), mflags, 0);
+ page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), mflags,
+ cpu_buffer->buffer->subbuf_order);
if (!page)
goto free_pages;
bpage->page = page_address(page);
+ bpage->order = cpu_buffer->buffer->subbuf_order;
rb_init_page(bpage->page);
if (user_thread && fatal_signal_pending(current))
@@ -1748,7 +1614,8 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
rb_check_bpage(cpu_buffer, bpage);
cpu_buffer->reader_page = bpage;
- page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, 0);
+
+ page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, cpu_buffer->buffer->subbuf_order);
if (!page)
goto fail_free_reader;
bpage->page = page_address(page);
@@ -1831,7 +1698,14 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
if (!zalloc_cpumask_var(&buffer->cpumask, GFP_KERNEL))
goto fail_free_buffer;
- nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
+ /* Default buffer page size - one system page */
+ buffer->subbuf_order = 0;
+ buffer->subbuf_size = PAGE_SIZE - BUF_PAGE_HDR_SIZE;
+
+ /* Max payload is buffer page size - header (8bytes) */
+ buffer->max_data_size = buffer->subbuf_size - (sizeof(u32) * 2);
+
+ nr_pages = DIV_ROUND_UP(size, buffer->subbuf_size);
buffer->flags = flags;
buffer->clock = trace_clock_local;
buffer->reader_lock_key = key;
@@ -2150,7 +2024,7 @@ static void update_pages_handler(struct work_struct *work)
* @size: the new size.
* @cpu_id: the cpu buffer to resize
*
- * Minimum size is 2 * BUF_PAGE_SIZE.
+ * Minimum size is 2 * buffer->subbuf_size.
*
* Returns 0 on success and < 0 on failure.
*/
@@ -2172,7 +2046,7 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
!cpumask_test_cpu(cpu_id, buffer->cpumask))
return 0;
- nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
+ nr_pages = DIV_ROUND_UP(size, buffer->subbuf_size);
/* we need a minimum of two pages */
if (nr_pages < 2)
@@ -2419,7 +2293,7 @@ rb_iter_head_event(struct ring_buffer_iter *iter)
*/
barrier();
- if ((iter->head + length) > commit || length > BUF_PAGE_SIZE)
+ if ((iter->head + length) > commit || length > iter->event_size)
/* Writer corrupted the read? */
goto reset;
@@ -2459,11 +2333,13 @@ rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer)
}
static __always_inline unsigned
-rb_event_index(struct ring_buffer_event *event)
+rb_event_index(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event)
{
unsigned long addr = (unsigned long)event;
- return (addr & ~PAGE_MASK) - BUF_PAGE_HDR_SIZE;
+ addr &= (PAGE_SIZE << cpu_buffer->buffer->subbuf_order) - 1;
+
+ return addr - BUF_PAGE_HDR_SIZE;
}
static void rb_inc_iter(struct ring_buffer_iter *iter)
@@ -2652,6 +2528,7 @@ static inline void
rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
unsigned long tail, struct rb_event_info *info)
{
+ unsigned long bsize = READ_ONCE(cpu_buffer->buffer->subbuf_size);
struct buffer_page *tail_page = info->tail_page;
struct ring_buffer_event *event;
unsigned long length = info->length;
@@ -2660,13 +2537,13 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
* Only the event that crossed the page boundary
* must fill the old tail_page with padding.
*/
- if (tail >= BUF_PAGE_SIZE) {
+ if (tail >= bsize) {
/*
* If the page was filled, then we still need
* to update the real_end. Reset it to zero
* and the reader will ignore it.
*/
- if (tail == BUF_PAGE_SIZE)
+ if (tail == bsize)
tail_page->real_end = 0;
local_sub(length, &tail_page->write);
@@ -2694,7 +2571,7 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
* If we are less than the minimum size, we don't need to
* worry about it.
*/
- if (tail > (BUF_PAGE_SIZE - RB_EVNT_MIN_SIZE)) {
+ if (tail > (bsize - RB_EVNT_MIN_SIZE)) {
/* No room for any events */
/* Mark the rest of the page with padding */
@@ -2709,19 +2586,19 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
}
/* Put in a discarded event */
- event->array[0] = (BUF_PAGE_SIZE - tail) - RB_EVNT_HDR_SIZE;
+ event->array[0] = (bsize - tail) - RB_EVNT_HDR_SIZE;
event->type_len = RINGBUF_TYPE_PADDING;
/* time delta must be non zero */
event->time_delta = 1;
/* account for padding bytes */
- local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes);
+ local_add(bsize - tail, &cpu_buffer->entries_bytes);
/* Make sure the padding is visible before the tail_page->write update */
smp_wmb();
/* Set write to end of buffer */
- length = (tail + length) - BUF_PAGE_SIZE;
+ length = (tail + length) - bsize;
local_sub(length, &tail_page->write);
}
@@ -2835,7 +2712,8 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
/* Slow path */
static struct ring_buffer_event *
-rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs)
+rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
+ struct ring_buffer_event *event, u64 delta, bool abs)
{
if (abs)
event->type_len = RINGBUF_TYPE_TIME_STAMP;
@@ -2843,7 +2721,7 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs)
event->type_len = RINGBUF_TYPE_TIME_EXTEND;
/* Not the first event on the page, or not delta? */
- if (abs || rb_event_index(event)) {
+ if (abs || rb_event_index(cpu_buffer, event)) {
event->time_delta = delta & TS_MASK;
event->array[0] = delta >> TS_SHIFT;
} else {
@@ -2873,7 +2751,7 @@ rb_check_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
(unsigned long long)info->ts,
(unsigned long long)info->before,
(unsigned long long)info->after,
- (unsigned long long)(rb_time_read(&cpu_buffer->write_stamp, &write_stamp) ? write_stamp : 0),
+ (unsigned long long)({rb_time_read(&cpu_buffer->write_stamp, &write_stamp); write_stamp;}),
sched_clock_stable() ? "" :
"If you just came from a suspend/resume,\n"
"please switch to the trace global clock:\n"
@@ -2917,7 +2795,7 @@ static void rb_add_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
if (!abs)
info->delta = 0;
}
- *event = rb_add_time_stamp(*event, info->delta, abs);
+ *event = rb_add_time_stamp(cpu_buffer, *event, info->delta, abs);
*length -= RB_LEN_TIME_EXTEND;
*delta = 0;
}
@@ -3001,10 +2879,10 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
struct buffer_page *bpage;
unsigned long addr;
- new_index = rb_event_index(event);
+ new_index = rb_event_index(cpu_buffer, event);
old_index = new_index + rb_event_ts_length(event);
addr = (unsigned long)event;
- addr &= PAGE_MASK;
+ addr &= ~((PAGE_SIZE << cpu_buffer->buffer->subbuf_order) - 1);
bpage = READ_ONCE(cpu_buffer->tail_page);
@@ -3391,6 +3269,76 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
#define CHECK_FULL_PAGE 1L
#ifdef CONFIG_RING_BUFFER_VALIDATE_TIME_DELTAS
+
+static const char *show_irq_str(int bits)
+{
+ const char *type[] = {
+ ".", // 0
+ "s", // 1
+ "h", // 2
+ "Hs", // 3
+ "n", // 4
+ "Ns", // 5
+ "Nh", // 6
+ "NHs", // 7
+ };
+
+ return type[bits];
+}
+
+/* Assume this is an trace event */
+static const char *show_flags(struct ring_buffer_event *event)
+{
+ struct trace_entry *entry;
+ int bits = 0;
+
+ if (rb_event_data_length(event) - RB_EVNT_HDR_SIZE < sizeof(*entry))
+ return "X";
+
+ entry = ring_buffer_event_data(event);
+
+ if (entry->flags & TRACE_FLAG_SOFTIRQ)
+ bits |= 1;
+
+ if (entry->flags & TRACE_FLAG_HARDIRQ)
+ bits |= 2;
+
+ if (entry->flags & TRACE_FLAG_NMI)
+ bits |= 4;
+
+ return show_irq_str(bits);
+}
+
+static const char *show_irq(struct ring_buffer_event *event)
+{
+ struct trace_entry *entry;
+
+ if (rb_event_data_length(event) - RB_EVNT_HDR_SIZE < sizeof(*entry))
+ return "";
+
+ entry = ring_buffer_event_data(event);
+ if (entry->flags & TRACE_FLAG_IRQS_OFF)
+ return "d";
+ return "";
+}
+
+static const char *show_interrupt_level(void)
+{
+ unsigned long pc = preempt_count();
+ unsigned char level = 0;
+
+ if (pc & SOFTIRQ_OFFSET)
+ level |= 1;
+
+ if (pc & HARDIRQ_MASK)
+ level |= 2;
+
+ if (pc & NMI_MASK)
+ level |= 4;
+
+ return show_irq_str(level);
+}
+
static void dump_buffer_page(struct buffer_data_page *bpage,
struct rb_event_info *info,
unsigned long tail)
@@ -3411,34 +3359,57 @@ static void dump_buffer_page(struct buffer_data_page *bpage,
case RINGBUF_TYPE_TIME_EXTEND:
delta = rb_event_time_stamp(event);
ts += delta;
- pr_warn(" [%lld] delta:%lld TIME EXTEND\n", ts, delta);
+ pr_warn(" 0x%x: [%lld] delta:%lld TIME EXTEND\n",
+ e, ts, delta);
break;
case RINGBUF_TYPE_TIME_STAMP:
delta = rb_event_time_stamp(event);
ts = rb_fix_abs_ts(delta, ts);
- pr_warn(" [%lld] absolute:%lld TIME STAMP\n", ts, delta);
+ pr_warn(" 0x%x: [%lld] absolute:%lld TIME STAMP\n",
+ e, ts, delta);
break;
case RINGBUF_TYPE_PADDING:
ts += event->time_delta;
- pr_warn(" [%lld] delta:%d PADDING\n", ts, event->time_delta);
+ pr_warn(" 0x%x: [%lld] delta:%d PADDING\n",
+ e, ts, event->time_delta);
break;
case RINGBUF_TYPE_DATA:
ts += event->time_delta;
- pr_warn(" [%lld] delta:%d\n", ts, event->time_delta);
+ pr_warn(" 0x%x: [%lld] delta:%d %s%s\n",
+ e, ts, event->time_delta,
+ show_flags(event), show_irq(event));
break;
default:
break;
}
}
+ pr_warn("expected end:0x%lx last event actually ended at:0x%x\n", tail, e);
}
static DEFINE_PER_CPU(atomic_t, checking);
static atomic_t ts_dump;
+#define buffer_warn_return(fmt, ...) \
+ do { \
+ /* If another report is happening, ignore this one */ \
+ if (atomic_inc_return(&ts_dump) != 1) { \
+ atomic_dec(&ts_dump); \
+ goto out; \
+ } \
+ atomic_inc(&cpu_buffer->record_disabled); \
+ pr_warn(fmt, ##__VA_ARGS__); \
+ dump_buffer_page(bpage, info, tail); \
+ atomic_dec(&ts_dump); \
+ /* There's some cases in boot up that this can happen */ \
+ if (WARN_ON_ONCE(system_state != SYSTEM_BOOTING)) \
+ /* Do not re-enable checking */ \
+ return; \
+ } while (0)
+
/*
* Check if the current event time stamp matches the deltas on
* the buffer page.
@@ -3492,7 +3463,12 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer,
case RINGBUF_TYPE_TIME_STAMP:
delta = rb_event_time_stamp(event);
- ts = rb_fix_abs_ts(delta, ts);
+ delta = rb_fix_abs_ts(delta, ts);
+ if (delta < ts) {
+ buffer_warn_return("[CPU: %d]ABSOLUTE TIME WENT BACKWARDS: last ts: %lld absolute ts: %lld\n",
+ cpu_buffer->cpu, ts, delta);
+ }
+ ts = delta;
break;
case RINGBUF_TYPE_PADDING:
@@ -3509,23 +3485,11 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer,
}
if ((full && ts > info->ts) ||
(!full && ts + info->delta != info->ts)) {
- /* If another report is happening, ignore this one */
- if (atomic_inc_return(&ts_dump) != 1) {
- atomic_dec(&ts_dump);
- goto out;
- }
- atomic_inc(&cpu_buffer->record_disabled);
- /* There's some cases in boot up that this can happen */
- WARN_ON_ONCE(system_state != SYSTEM_BOOTING);
- pr_warn("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld before:%lld after:%lld%s\n",
- cpu_buffer->cpu,
- ts + info->delta, info->ts, info->delta,
- info->before, info->after,
- full ? " (full)" : "");
- dump_buffer_page(bpage, info, tail);
- atomic_dec(&ts_dump);
- /* Do not re-enable checking */
- return;
+ buffer_warn_return("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld before:%lld after:%lld%s context:%s\n",
+ cpu_buffer->cpu,
+ ts + info->delta, info->ts, info->delta,
+ info->before, info->after,
+ full ? " (full)" : "", show_interrupt_level());
}
out:
atomic_dec(this_cpu_ptr(&checking));
@@ -3545,16 +3509,14 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
struct ring_buffer_event *event;
struct buffer_page *tail_page;
unsigned long tail, write, w;
- bool a_ok;
- bool b_ok;
/* Don't let the compiler play games with cpu_buffer->tail_page */
tail_page = info->tail_page = READ_ONCE(cpu_buffer->tail_page);
/*A*/ w = local_read(&tail_page->write) & RB_WRITE_MASK;
barrier();
- b_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before);
- a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
+ rb_time_read(&cpu_buffer->before_stamp, &info->before);
+ rb_time_read(&cpu_buffer->write_stamp, &info->after);
barrier();
info->ts = rb_time_stamp(cpu_buffer->buffer);
@@ -3569,7 +3531,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
if (!w) {
/* Use the sub-buffer timestamp */
info->delta = 0;
- } else if (unlikely(!a_ok || !b_ok || info->before != info->after)) {
+ } else if (unlikely(info->before != info->after)) {
info->add_timestamp |= RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND;
info->length += RB_LEN_TIME_EXTEND;
} else {
@@ -3591,7 +3553,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
tail = write - info->length;
/* See if we shot pass the end of this buffer page */
- if (unlikely(write > BUF_PAGE_SIZE)) {
+ if (unlikely(write > cpu_buffer->buffer->subbuf_size)) {
check_buffer(cpu_buffer, info, CHECK_FULL_PAGE);
return rb_move_tail(cpu_buffer, tail, info);
}
@@ -3618,8 +3580,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
/* SLOW PATH - Interrupted between A and C */
/* Save the old before_stamp */
- a_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before);
- RB_WARN_ON(cpu_buffer, !a_ok);
+ rb_time_read(&cpu_buffer->before_stamp, &info->before);
/*
* Read a new timestamp and update the before_stamp to make
@@ -3631,9 +3592,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
rb_time_set(&cpu_buffer->before_stamp, ts);
barrier();
- /*E*/ a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
- /* Was interrupted before here, write_stamp must be valid */
- RB_WARN_ON(cpu_buffer, !a_ok);
+ /*E*/ rb_time_read(&cpu_buffer->write_stamp, &info->after);
barrier();
/*F*/ if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) &&
info->after == info->before && info->after < ts) {
@@ -3725,7 +3684,7 @@ rb_reserve_next_event(struct trace_buffer *buffer,
if (ring_buffer_time_stamp_abs(cpu_buffer->buffer)) {
add_ts_default = RB_ADD_STAMP_ABSOLUTE;
info.length += RB_LEN_TIME_EXTEND;
- if (info.length > BUF_MAX_DATA_SIZE)
+ if (info.length > cpu_buffer->buffer->max_data_size)
goto out_fail;
} else {
add_ts_default = RB_ADD_STAMP_NONE;
@@ -3800,7 +3759,7 @@ ring_buffer_lock_reserve(struct trace_buffer *buffer, unsigned long length)
if (unlikely(atomic_read(&cpu_buffer->record_disabled)))
goto out;
- if (unlikely(length > BUF_MAX_DATA_SIZE))
+ if (unlikely(length > buffer->max_data_size))
goto out;
if (unlikely(trace_recursive_lock(cpu_buffer)))
@@ -3834,7 +3793,7 @@ rb_decrement_entry(struct ring_buffer_per_cpu *cpu_buffer,
struct buffer_page *bpage = cpu_buffer->commit_page;
struct buffer_page *start;
- addr &= PAGE_MASK;
+ addr &= ~((PAGE_SIZE << cpu_buffer->buffer->subbuf_order) - 1);
/* Do the likely case first */
if (likely(bpage->page == (void *)addr)) {
@@ -3950,7 +3909,7 @@ int ring_buffer_write(struct trace_buffer *buffer,
if (atomic_read(&cpu_buffer->record_disabled))
goto out;
- if (length > BUF_MAX_DATA_SIZE)
+ if (length > buffer->max_data_size)
goto out;
if (unlikely(trace_recursive_lock(cpu_buffer)))
@@ -4436,7 +4395,7 @@ int ring_buffer_iter_empty(struct ring_buffer_iter *iter)
cpu_buffer = iter->cpu_buffer;
reader = cpu_buffer->reader_page;
head_page = cpu_buffer->head_page;
- commit_page = cpu_buffer->commit_page;
+ commit_page = READ_ONCE(cpu_buffer->commit_page);
commit_ts = commit_page->page->time_stamp;
/*
@@ -4530,6 +4489,7 @@ static struct buffer_page *
rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
{
struct buffer_page *reader = NULL;
+ unsigned long bsize = READ_ONCE(cpu_buffer->buffer->subbuf_size);
unsigned long overwrite;
unsigned long flags;
int nr_loops = 0;
@@ -4665,7 +4625,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
#define USECS_WAIT 1000000
for (nr_loops = 0; nr_loops < USECS_WAIT; nr_loops++) {
/* If the write is past the end of page, a writer is still updating it */
- if (likely(!reader || rb_page_write(reader) <= BUF_PAGE_SIZE))
+ if (likely(!reader || rb_page_write(reader) <= bsize))
break;
udelay(1);
@@ -5109,7 +5069,8 @@ ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags)
return NULL;
/* Holds the entire event: data and meta data */
- iter->event = kmalloc(BUF_PAGE_SIZE, flags);
+ iter->event_size = buffer->subbuf_size;
+ iter->event = kmalloc(iter->event_size, flags);
if (!iter->event) {
kfree(iter);
return NULL;
@@ -5225,19 +5186,28 @@ EXPORT_SYMBOL_GPL(ring_buffer_iter_advance);
*/
unsigned long ring_buffer_size(struct trace_buffer *buffer, int cpu)
{
- /*
- * Earlier, this method returned
- * BUF_PAGE_SIZE * buffer->nr_pages
- * Since the nr_pages field is now removed, we have converted this to
- * return the per cpu buffer value.
- */
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return 0;
- return BUF_PAGE_SIZE * buffer->buffers[cpu]->nr_pages;
+ return buffer->subbuf_size * buffer->buffers[cpu]->nr_pages;
}
EXPORT_SYMBOL_GPL(ring_buffer_size);
+/**
+ * ring_buffer_max_event_size - return the max data size of an event
+ * @buffer: The ring buffer.
+ *
+ * Returns the maximum size an event can be.
+ */
+unsigned long ring_buffer_max_event_size(struct trace_buffer *buffer)
+{
+ /* If abs timestamp is requested, events have a timestamp too */
+ if (ring_buffer_time_stamp_abs(buffer))
+ return buffer->max_data_size - RB_LEN_TIME_EXTEND;
+ return buffer->max_data_size;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_max_event_size);
+
static void rb_clear_buffer_page(struct buffer_page *page)
{
local_set(&page->write, 0);
@@ -5508,6 +5478,9 @@ int ring_buffer_swap_cpu(struct trace_buffer *buffer_a,
if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages)
goto out;
+ if (buffer_a->subbuf_order != buffer_b->subbuf_order)
+ goto out;
+
ret = -EAGAIN;
if (atomic_read(&buffer_a->record_disabled))
@@ -5579,40 +5552,48 @@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
* Returns:
* The page allocated, or ERR_PTR
*/
-void *ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu)
+struct buffer_data_read_page *
+ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
- struct buffer_data_page *bpage = NULL;
+ struct buffer_data_read_page *bpage = NULL;
unsigned long flags;
struct page *page;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return ERR_PTR(-ENODEV);
+ bpage = kzalloc(sizeof(*bpage), GFP_KERNEL);
+ if (!bpage)
+ return ERR_PTR(-ENOMEM);
+
+ bpage->order = buffer->subbuf_order;
cpu_buffer = buffer->buffers[cpu];
local_irq_save(flags);
arch_spin_lock(&cpu_buffer->lock);
if (cpu_buffer->free_page) {
- bpage = cpu_buffer->free_page;
+ bpage->data = cpu_buffer->free_page;
cpu_buffer->free_page = NULL;
}
arch_spin_unlock(&cpu_buffer->lock);
local_irq_restore(flags);
- if (bpage)
+ if (bpage->data)
goto out;
- page = alloc_pages_node(cpu_to_node(cpu),
- GFP_KERNEL | __GFP_NORETRY, 0);
- if (!page)
+ page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_NORETRY,
+ cpu_buffer->buffer->subbuf_order);
+ if (!page) {
+ kfree(bpage);
return ERR_PTR(-ENOMEM);
+ }
- bpage = page_address(page);
+ bpage->data = page_address(page);
out:
- rb_init_page(bpage);
+ rb_init_page(bpage->data);
return bpage;
}
@@ -5622,14 +5603,15 @@ EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page);
* ring_buffer_free_read_page - free an allocated read page
* @buffer: the buffer the page was allocate for
* @cpu: the cpu buffer the page came from
- * @data: the page to free
+ * @data_page: the page to free
*
* Free a page allocated from ring_buffer_alloc_read_page.
*/
-void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, void *data)
+void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu,
+ struct buffer_data_read_page *data_page)
{
struct ring_buffer_per_cpu *cpu_buffer;
- struct buffer_data_page *bpage = data;
+ struct buffer_data_page *bpage = data_page->data;
struct page *page = virt_to_page(bpage);
unsigned long flags;
@@ -5638,8 +5620,12 @@ void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, void *data
cpu_buffer = buffer->buffers[cpu];
- /* If the page is still in use someplace else, we can't reuse it */
- if (page_ref_count(page) > 1)
+ /*
+ * If the page is still in use someplace else, or order of the page
+ * is different from the subbuffer order of the buffer -
+ * we can't reuse it
+ */
+ if (page_ref_count(page) > 1 || data_page->order != buffer->subbuf_order)
goto out;
local_irq_save(flags);
@@ -5654,7 +5640,8 @@ void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, void *data
local_irq_restore(flags);
out:
- free_page((unsigned long)bpage);
+ free_pages((unsigned long)bpage, data_page->order);
+ kfree(data_page);
}
EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
@@ -5675,9 +5662,10 @@ EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
* rpage = ring_buffer_alloc_read_page(buffer, cpu);
* if (IS_ERR(rpage))
* return PTR_ERR(rpage);
- * ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0);
+ * ret = ring_buffer_read_page(buffer, rpage, len, cpu, 0);
* if (ret >= 0)
- * process_page(rpage, ret);
+ * process_page(ring_buffer_read_page_data(rpage), ret);
+ * ring_buffer_free_read_page(buffer, cpu, rpage);
*
* When @full is set, the function will not return true unless
* the writer is off the reader page.
@@ -5692,7 +5680,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
* <0 if no data has been transferred.
*/
int ring_buffer_read_page(struct trace_buffer *buffer,
- void **data_page, size_t len, int cpu, int full)
+ struct buffer_data_read_page *data_page,
+ size_t len, int cpu, int full)
{
struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
struct ring_buffer_event *event;
@@ -5717,10 +5706,12 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
len -= BUF_PAGE_HDR_SIZE;
- if (!data_page)
+ if (!data_page || !data_page->data)
+ goto out;
+ if (data_page->order != buffer->subbuf_order)
goto out;
- bpage = *data_page;
+ bpage = data_page->data;
if (!bpage)
goto out;
@@ -5814,11 +5805,11 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
/* swap the pages */
rb_init_page(bpage);
bpage = reader->page;
- reader->page = *data_page;
+ reader->page = data_page->data;
local_set(&reader->write, 0);
local_set(&reader->entries, 0);
reader->read = 0;
- *data_page = bpage;
+ data_page->data = bpage;
/*
* Use the real_end for the data size,
@@ -5840,7 +5831,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
/* If there is room at the end of the page to save the
* missed events, then record it there.
*/
- if (BUF_PAGE_SIZE - commit >= sizeof(missed_events)) {
+ if (buffer->subbuf_size - commit >= sizeof(missed_events)) {
memcpy(&bpage->data[commit], &missed_events,
sizeof(missed_events));
local_add(RB_MISSED_STORED, &bpage->commit);
@@ -5852,8 +5843,8 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
/*
* This page may be off to user land. Zero it out here.
*/
- if (commit < BUF_PAGE_SIZE)
- memset(&bpage->data[commit], 0, BUF_PAGE_SIZE - commit);
+ if (commit < buffer->subbuf_size)
+ memset(&bpage->data[commit], 0, buffer->subbuf_size - commit);
out_unlock:
raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
@@ -5863,6 +5854,213 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
}
EXPORT_SYMBOL_GPL(ring_buffer_read_page);
+/**
+ * ring_buffer_read_page_data - get pointer to the data in the page.
+ * @page: the page to get the data from
+ *
+ * Returns pointer to the actual data in this page.
+ */
+void *ring_buffer_read_page_data(struct buffer_data_read_page *page)
+{
+ return page->data;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_read_page_data);
+
+/**
+ * ring_buffer_subbuf_size_get - get size of the sub buffer.
+ * @buffer: the buffer to get the sub buffer size from
+ *
+ * Returns size of the sub buffer, in bytes.
+ */
+int ring_buffer_subbuf_size_get(struct trace_buffer *buffer)
+{
+ return buffer->subbuf_size + BUF_PAGE_HDR_SIZE;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_subbuf_size_get);
+
+/**
+ * ring_buffer_subbuf_order_get - get order of system sub pages in one buffer page.
+ * @buffer: The ring_buffer to get the system sub page order from
+ *
+ * By default, one ring buffer sub page equals to one system page. This parameter
+ * is configurable, per ring buffer. The size of the ring buffer sub page can be
+ * extended, but must be an order of system page size.
+ *
+ * Returns the order of buffer sub page size, in system pages:
+ * 0 means the sub buffer size is 1 system page and so forth.
+ * In case of an error < 0 is returned.
+ */
+int ring_buffer_subbuf_order_get(struct trace_buffer *buffer)
+{
+ if (!buffer)
+ return -EINVAL;
+
+ return buffer->subbuf_order;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_get);
+
+/**
+ * ring_buffer_subbuf_order_set - set the size of ring buffer sub page.
+ * @buffer: The ring_buffer to set the new page size.
+ * @order: Order of the system pages in one sub buffer page
+ *
+ * By default, one ring buffer pages equals to one system page. This API can be
+ * used to set new size of the ring buffer page. The size must be order of
+ * system page size, that's why the input parameter @order is the order of
+ * system pages that are allocated for one ring buffer page:
+ * 0 - 1 system page
+ * 1 - 2 system pages
+ * 3 - 4 system pages
+ * ...
+ *
+ * Returns 0 on success or < 0 in case of an error.
+ */
+int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct buffer_page *bpage, *tmp;
+ int old_order, old_size;
+ int nr_pages;
+ int psize;
+ int err;
+ int cpu;
+
+ if (!buffer || order < 0)
+ return -EINVAL;
+
+ if (buffer->subbuf_order == order)
+ return 0;
+
+ psize = (1 << order) * PAGE_SIZE;
+ if (psize <= BUF_PAGE_HDR_SIZE)
+ return -EINVAL;
+
+ /* Size of a subbuf cannot be greater than the write counter */
+ if (psize > RB_WRITE_MASK + 1)
+ return -EINVAL;
+
+ old_order = buffer->subbuf_order;
+ old_size = buffer->subbuf_size;
+
+ /* prevent another thread from changing buffer sizes */
+ mutex_lock(&buffer->mutex);
+ atomic_inc(&buffer->record_disabled);
+
+ /* Make sure all commits have finished */
+ synchronize_rcu();
+
+ buffer->subbuf_order = order;
+ buffer->subbuf_size = psize - BUF_PAGE_HDR_SIZE;
+
+ /* Make sure all new buffers are allocated, before deleting the old ones */
+ for_each_buffer_cpu(buffer, cpu) {
+
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ continue;
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ /* Update the number of pages to match the new size */
+ nr_pages = old_size * buffer->buffers[cpu]->nr_pages;
+ nr_pages = DIV_ROUND_UP(nr_pages, buffer->subbuf_size);
+
+ /* we need a minimum of two pages */
+ if (nr_pages < 2)
+ nr_pages = 2;
+
+ cpu_buffer->nr_pages_to_update = nr_pages;
+
+ /* Include the reader page */
+ nr_pages++;
+
+ /* Allocate the new size buffer */
+ INIT_LIST_HEAD(&cpu_buffer->new_pages);
+ if (__rb_allocate_pages(cpu_buffer, nr_pages,
+ &cpu_buffer->new_pages)) {
+ /* not enough memory for new pages */
+ err = -ENOMEM;
+ goto error;
+ }
+ }
+
+ for_each_buffer_cpu(buffer, cpu) {
+
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ continue;
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ /* Clear the head bit to make the link list normal to read */
+ rb_head_page_deactivate(cpu_buffer);
+
+ /* Now walk the list and free all the old sub buffers */
+ list_for_each_entry_safe(bpage, tmp, cpu_buffer->pages, list) {
+ list_del_init(&bpage->list);
+ free_buffer_page(bpage);
+ }
+ /* The above loop stopped an the last page needing to be freed */
+ bpage = list_entry(cpu_buffer->pages, struct buffer_page, list);
+ free_buffer_page(bpage);
+
+ /* Free the current reader page */
+ free_buffer_page(cpu_buffer->reader_page);
+
+ /* One page was allocated for the reader page */
+ cpu_buffer->reader_page = list_entry(cpu_buffer->new_pages.next,
+ struct buffer_page, list);
+ list_del_init(&cpu_buffer->reader_page->list);
+
+ /* The cpu_buffer pages are a link list with no head */
+ cpu_buffer->pages = cpu_buffer->new_pages.next;
+ cpu_buffer->new_pages.next->prev = cpu_buffer->new_pages.prev;
+ cpu_buffer->new_pages.prev->next = cpu_buffer->new_pages.next;
+
+ /* Clear the new_pages list */
+ INIT_LIST_HEAD(&cpu_buffer->new_pages);
+
+ cpu_buffer->head_page
+ = list_entry(cpu_buffer->pages, struct buffer_page, list);
+ cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page;
+
+ cpu_buffer->nr_pages = cpu_buffer->nr_pages_to_update;
+ cpu_buffer->nr_pages_to_update = 0;
+
+ free_pages((unsigned long)cpu_buffer->free_page, old_order);
+ cpu_buffer->free_page = NULL;
+
+ rb_head_page_activate(cpu_buffer);
+
+ rb_check_pages(cpu_buffer);
+ }
+
+ atomic_dec(&buffer->record_disabled);
+ mutex_unlock(&buffer->mutex);
+
+ return 0;
+
+error:
+ buffer->subbuf_order = old_order;
+ buffer->subbuf_size = old_size;
+
+ atomic_dec(&buffer->record_disabled);
+ mutex_unlock(&buffer->mutex);
+
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+
+ if (!cpu_buffer->nr_pages_to_update)
+ continue;
+
+ list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages, list) {
+ list_del_init(&bpage->list);
+ free_buffer_page(bpage);
+ }
+ }
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_set);
+
/*
* We only allocate new buffers, never free them if the CPU goes down.
* If we were to free the buffer, then the user would lose any trace that was in
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index aef34673d7..008187ebd7 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -104,10 +104,11 @@ static enum event_status read_event(int cpu)
static enum event_status read_page(int cpu)
{
+ struct buffer_data_read_page *bpage;
struct ring_buffer_event *event;
struct rb_page *rpage;
unsigned long commit;
- void *bpage;
+ int page_size;
int *entry;
int ret;
int inc;
@@ -117,14 +118,15 @@ static enum event_status read_page(int cpu)
if (IS_ERR(bpage))
return EVENT_DROPPED;
- ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1);
+ page_size = ring_buffer_subbuf_size_get(buffer);
+ ret = ring_buffer_read_page(buffer, bpage, page_size, cpu, 1);
if (ret >= 0) {
- rpage = bpage;
+ rpage = ring_buffer_read_page_data(bpage);
/* The commit may have missed event flags set, clear them */
commit = local_read(&rpage->commit) & 0xfffff;
for (i = 0; i < commit && !test_error ; i += inc) {
- if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) {
+ if (i >= (page_size - offsetof(struct rb_page, data))) {
TEST_ERROR();
break;
}
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index e03960f9f4..d390fea3a6 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1264,10 +1264,17 @@ static void set_buffer_entries(struct array_buffer *buf, unsigned long val);
int tracing_alloc_snapshot_instance(struct trace_array *tr)
{
+ int order;
int ret;
if (!tr->allocated_snapshot) {
+ /* Make the snapshot buffer have the same order as main buffer */
+ order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer);
+ ret = ring_buffer_subbuf_order_set(tr->max_buffer.buffer, order);
+ if (ret < 0)
+ return ret;
+
/* allocate spare buffer */
ret = resize_buffer_duplicate_size(&tr->max_buffer,
&tr->array_buffer, RING_BUFFER_ALL_CPUS);
@@ -1287,6 +1294,7 @@ static void free_snapshot(struct trace_array *tr)
* The max_tr ring buffer has some state (e.g. ring->clock) and
* we want preserve it.
*/
+ ring_buffer_subbuf_order_set(tr->max_buffer.buffer, 0);
ring_buffer_resize(tr->max_buffer.buffer, 1, RING_BUFFER_ALL_CPUS);
set_buffer_entries(&tr->max_buffer, 1);
tracing_reset_online_cpus(&tr->max_buffer);
@@ -1525,7 +1533,7 @@ void disable_trace_on_warning(void)
bool tracer_tracing_is_on(struct trace_array *tr)
{
if (tr->array_buffer.buffer)
- return ring_buffer_record_is_on(tr->array_buffer.buffer);
+ return ring_buffer_record_is_set_on(tr->array_buffer.buffer);
return !tr->buffer_disabled;
}
@@ -1947,15 +1955,36 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
#endif /* CONFIG_TRACER_MAX_TRACE */
+struct pipe_wait {
+ struct trace_iterator *iter;
+ int wait_index;
+};
+
+static bool wait_pipe_cond(void *data)
+{
+ struct pipe_wait *pwait = data;
+ struct trace_iterator *iter = pwait->iter;
+
+ if (atomic_read_acquire(&iter->wait_index) != pwait->wait_index)
+ return true;
+
+ return iter->closed;
+}
+
static int wait_on_pipe(struct trace_iterator *iter, int full)
{
+ struct pipe_wait pwait;
int ret;
/* Iterators are static, they should be filled or empty */
if (trace_buffer_iter(iter, iter->cpu_file))
return 0;
- ret = ring_buffer_wait(iter->array_buffer->buffer, iter->cpu_file, full);
+ pwait.wait_index = atomic_read_acquire(&iter->wait_index);
+ pwait.iter = iter;
+
+ ret = ring_buffer_wait(iter->array_buffer->buffer, iter->cpu_file, full,
+ wait_pipe_cond, &pwait);
#ifdef CONFIG_TRACER_MAX_TRACE
/*
@@ -3781,7 +3810,7 @@ static bool trace_safe_str(struct trace_iterator *iter, const char *str,
/* OK if part of the temp seq buffer */
if ((addr >= (unsigned long)iter->tmp_seq.buffer) &&
- (addr < (unsigned long)iter->tmp_seq.buffer + PAGE_SIZE))
+ (addr < (unsigned long)iter->tmp_seq.buffer + TRACE_SEQ_BUFFER_SIZE))
return true;
/* Core rodata can not be freed */
@@ -5046,7 +5075,7 @@ static int tracing_release(struct inode *inode, struct file *file)
return 0;
}
-static int tracing_release_generic_tr(struct inode *inode, struct file *file)
+int tracing_release_generic_tr(struct inode *inode, struct file *file)
{
struct trace_array *tr = inode->i_private;
@@ -6948,8 +6977,8 @@ waitagain:
goto out;
}
- if (cnt >= PAGE_SIZE)
- cnt = PAGE_SIZE - 1;
+ if (cnt >= TRACE_SEQ_BUFFER_SIZE)
+ cnt = TRACE_SEQ_BUFFER_SIZE - 1;
/* reset all but tr, trace, and overruns */
trace_iterator_reset(iter);
@@ -7285,6 +7314,8 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp)
return 0;
}
+#define TRACE_MARKER_MAX_SIZE 4096
+
static ssize_t
tracing_mark_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *fpos)
@@ -7294,8 +7325,9 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
enum event_trigger_type tt = ETT_NONE;
struct trace_buffer *buffer;
struct print_entry *entry;
+ int meta_size;
ssize_t written;
- int size;
+ size_t size;
int len;
/* Used in tracing_mark_raw_write() as well */
@@ -7308,12 +7340,15 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
if (!(tr->trace_flags & TRACE_ITER_MARKERS))
return -EINVAL;
- if (cnt > TRACE_BUF_SIZE)
- cnt = TRACE_BUF_SIZE;
+ if ((ssize_t)cnt < 0)
+ return -EINVAL;
- BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE);
+ if (cnt > TRACE_MARKER_MAX_SIZE)
+ cnt = TRACE_MARKER_MAX_SIZE;
- size = sizeof(*entry) + cnt + 2; /* add '\0' and possible '\n' */
+ meta_size = sizeof(*entry) + 2; /* add '\0' and possible '\n' */
+ again:
+ size = cnt + meta_size;
/* If less than "<faulted>", then make sure we can still add that */
if (cnt < FAULTED_SIZE)
@@ -7322,9 +7357,25 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
buffer = tr->array_buffer.buffer;
event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
tracing_gen_ctx());
- if (unlikely(!event))
+ if (unlikely(!event)) {
+ /*
+ * If the size was greater than what was allowed, then
+ * make it smaller and try again.
+ */
+ if (size > ring_buffer_max_event_size(buffer)) {
+ /* cnt < FAULTED size should never be bigger than max */
+ if (WARN_ON_ONCE(cnt < FAULTED_SIZE))
+ return -EBADF;
+ cnt = ring_buffer_max_event_size(buffer) - meta_size;
+ /* The above should only happen once */
+ if (WARN_ON_ONCE(cnt + meta_size == size))
+ return -EBADF;
+ goto again;
+ }
+
/* Ring buffer disabled, return as if not open for write */
return -EBADF;
+ }
entry = ring_buffer_event_data(event);
entry->ip = _THIS_IP_;
@@ -7359,9 +7410,6 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
return written;
}
-/* Limit it for now to 3K (including tag) */
-#define RAW_DATA_MAX_SIZE (1024*3)
-
static ssize_t
tracing_mark_raw_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *fpos)
@@ -7383,19 +7431,18 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf,
return -EINVAL;
/* The marker must at least have a tag id */
- if (cnt < sizeof(unsigned int) || cnt > RAW_DATA_MAX_SIZE)
+ if (cnt < sizeof(unsigned int))
return -EINVAL;
- if (cnt > TRACE_BUF_SIZE)
- cnt = TRACE_BUF_SIZE;
-
- BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE);
-
size = sizeof(*entry) + cnt;
if (cnt < FAULT_SIZE_ID)
size += FAULT_SIZE_ID - cnt;
buffer = tr->array_buffer.buffer;
+
+ if (size > ring_buffer_max_event_size(buffer))
+ return -EINVAL;
+
event = __trace_buffer_lock_reserve(buffer, TRACE_RAW_DATA, size,
tracing_gen_ctx());
if (!event)
@@ -7580,6 +7627,7 @@ struct ftrace_buffer_info {
struct trace_iterator iter;
void *spare;
unsigned int spare_cpu;
+ unsigned int spare_size;
unsigned int read;
};
@@ -8284,6 +8332,8 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
{
struct ftrace_buffer_info *info = filp->private_data;
struct trace_iterator *iter = &info->iter;
+ void *trace_data;
+ int page_size;
ssize_t ret = 0;
ssize_t size;
@@ -8295,6 +8345,17 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
return -EBUSY;
#endif
+ page_size = ring_buffer_subbuf_size_get(iter->array_buffer->buffer);
+
+ /* Make sure the spare matches the current sub buffer size */
+ if (info->spare) {
+ if (page_size != info->spare_size) {
+ ring_buffer_free_read_page(iter->array_buffer->buffer,
+ info->spare_cpu, info->spare);
+ info->spare = NULL;
+ }
+ }
+
if (!info->spare) {
info->spare = ring_buffer_alloc_read_page(iter->array_buffer->buffer,
iter->cpu_file);
@@ -8303,19 +8364,20 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
info->spare = NULL;
} else {
info->spare_cpu = iter->cpu_file;
+ info->spare_size = page_size;
}
}
if (!info->spare)
return ret;
/* Do we have previous read data to read? */
- if (info->read < PAGE_SIZE)
+ if (info->read < page_size)
goto read;
again:
trace_access_lock(iter->cpu_file);
ret = ring_buffer_read_page(iter->array_buffer->buffer,
- &info->spare,
+ info->spare,
count,
iter->cpu_file, 0);
trace_access_unlock(iter->cpu_file);
@@ -8336,11 +8398,11 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
info->read = 0;
read:
- size = PAGE_SIZE - info->read;
+ size = page_size - info->read;
if (size > count)
size = count;
-
- ret = copy_to_user(ubuf, info->spare + info->read, size);
+ trace_data = ring_buffer_read_page_data(info->spare);
+ ret = copy_to_user(ubuf, trace_data + info->read, size);
if (ret == size)
return -EFAULT;
@@ -8357,9 +8419,9 @@ static int tracing_buffers_flush(struct file *file, fl_owner_t id)
struct ftrace_buffer_info *info = file->private_data;
struct trace_iterator *iter = &info->iter;
- iter->wait_index++;
+ iter->closed = true;
/* Make sure the waiters see the new wait_index */
- smp_wmb();
+ (void)atomic_fetch_inc_release(&iter->wait_index);
ring_buffer_wake_waiters(iter->array_buffer->buffer, iter->cpu_file);
@@ -8459,6 +8521,8 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
.spd_release = buffer_spd_release,
};
struct buffer_ref *ref;
+ bool woken = false;
+ int page_size;
int entries, i;
ssize_t ret = 0;
@@ -8467,13 +8531,14 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
return -EBUSY;
#endif
- if (*ppos & (PAGE_SIZE - 1))
+ page_size = ring_buffer_subbuf_size_get(iter->array_buffer->buffer);
+ if (*ppos & (page_size - 1))
return -EINVAL;
- if (len & (PAGE_SIZE - 1)) {
- if (len < PAGE_SIZE)
+ if (len & (page_size - 1)) {
+ if (len < page_size)
return -EINVAL;
- len &= PAGE_MASK;
+ len &= (~(page_size - 1));
}
if (splice_grow_spd(pipe, &spd))
@@ -8483,7 +8548,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
trace_access_lock(iter->cpu_file);
entries = ring_buffer_entries_cpu(iter->array_buffer->buffer, iter->cpu_file);
- for (i = 0; i < spd.nr_pages_max && len && entries; i++, len -= PAGE_SIZE) {
+ for (i = 0; i < spd.nr_pages_max && len && entries; i++, len -= page_size) {
struct page *page;
int r;
@@ -8504,7 +8569,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
}
ref->cpu = iter->cpu_file;
- r = ring_buffer_read_page(ref->buffer, &ref->page,
+ r = ring_buffer_read_page(ref->buffer, ref->page,
len, iter->cpu_file, 1);
if (r < 0) {
ring_buffer_free_read_page(ref->buffer, ref->cpu,
@@ -8513,14 +8578,14 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
break;
}
- page = virt_to_page(ref->page);
+ page = virt_to_page(ring_buffer_read_page_data(ref->page));
spd.pages[i] = page;
- spd.partial[i].len = PAGE_SIZE;
+ spd.partial[i].len = page_size;
spd.partial[i].offset = 0;
spd.partial[i].private = (unsigned long)ref;
spd.nr_pages++;
- *ppos += PAGE_SIZE;
+ *ppos += page_size;
entries = ring_buffer_entries_cpu(iter->array_buffer->buffer, iter->cpu_file);
}
@@ -8530,17 +8595,17 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
/* did we read anything? */
if (!spd.nr_pages) {
- long wait_index;
if (ret)
goto out;
+ if (woken)
+ goto out;
+
ret = -EAGAIN;
if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK))
goto out;
- wait_index = READ_ONCE(iter->wait_index);
-
ret = wait_on_pipe(iter, iter->snapshot ? 0 : iter->tr->buffer_percent);
if (ret)
goto out;
@@ -8549,10 +8614,8 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
if (!tracer_tracing_is_on(iter->tr))
goto out;
- /* Make sure we see the new wait_index */
- smp_rmb();
- if (wait_index != iter->wait_index)
- goto out;
+ /* Iterate one more time to collect any new data then exit */
+ woken = true;
goto again;
}
@@ -8575,9 +8638,8 @@ static long tracing_buffers_ioctl(struct file *file, unsigned int cmd, unsigned
mutex_lock(&trace_types_lock);
- iter->wait_index++;
/* Make sure the waiters see the new wait_index */
- smp_wmb();
+ (void)atomic_fetch_inc_release(&iter->wait_index);
ring_buffer_wake_waiters(iter->array_buffer->buffer, iter->cpu_file);
@@ -9365,6 +9427,103 @@ static const struct file_operations buffer_percent_fops = {
.llseek = default_llseek,
};
+static ssize_t
+buffer_subbuf_size_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+ struct trace_array *tr = filp->private_data;
+ size_t size;
+ char buf[64];
+ int order;
+ int r;
+
+ order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer);
+ size = (PAGE_SIZE << order) / 1024;
+
+ r = sprintf(buf, "%zd\n", size);
+
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+static ssize_t
+buffer_subbuf_size_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ struct trace_array *tr = filp->private_data;
+ unsigned long val;
+ int old_order;
+ int order;
+ int pages;
+ int ret;
+
+ ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+ if (ret)
+ return ret;
+
+ val *= 1024; /* value passed in is in KB */
+
+ pages = DIV_ROUND_UP(val, PAGE_SIZE);
+ order = fls(pages - 1);
+
+ /* limit between 1 and 128 system pages */
+ if (order < 0 || order > 7)
+ return -EINVAL;
+
+ /* Do not allow tracing while changing the order of the ring buffer */
+ tracing_stop_tr(tr);
+
+ old_order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer);
+ if (old_order == order)
+ goto out;
+
+ ret = ring_buffer_subbuf_order_set(tr->array_buffer.buffer, order);
+ if (ret)
+ goto out;
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+
+ if (!tr->allocated_snapshot)
+ goto out_max;
+
+ ret = ring_buffer_subbuf_order_set(tr->max_buffer.buffer, order);
+ if (ret) {
+ /* Put back the old order */
+ cnt = ring_buffer_subbuf_order_set(tr->array_buffer.buffer, old_order);
+ if (WARN_ON_ONCE(cnt)) {
+ /*
+ * AARGH! We are left with different orders!
+ * The max buffer is our "snapshot" buffer.
+ * When a tracer needs a snapshot (one of the
+ * latency tracers), it swaps the max buffer
+ * with the saved snap shot. We succeeded to
+ * update the order of the main buffer, but failed to
+ * update the order of the max buffer. But when we tried
+ * to reset the main buffer to the original size, we
+ * failed there too. This is very unlikely to
+ * happen, but if it does, warn and kill all
+ * tracing.
+ */
+ tracing_disabled = 1;
+ }
+ goto out;
+ }
+ out_max:
+#endif
+ (*ppos)++;
+ out:
+ if (ret)
+ cnt = ret;
+ tracing_start_tr(tr);
+ return cnt;
+}
+
+static const struct file_operations buffer_subbuf_size_fops = {
+ .open = tracing_open_generic_tr,
+ .read = buffer_subbuf_size_read,
+ .write = buffer_subbuf_size_write,
+ .release = tracing_release_generic_tr,
+ .llseek = default_llseek,
+};
+
static struct dentry *trace_instance_dir;
static void
@@ -9515,7 +9674,8 @@ static int trace_array_create_dir(struct trace_array *tr)
return ret;
}
-static struct trace_array *trace_array_create(const char *name)
+static struct trace_array *
+trace_array_create_systems(const char *name, const char *systems)
{
struct trace_array *tr;
int ret;
@@ -9535,6 +9695,12 @@ static struct trace_array *trace_array_create(const char *name)
if (!zalloc_cpumask_var(&tr->pipe_cpumask, GFP_KERNEL))
goto out_free_tr;
+ if (systems) {
+ tr->system_names = kstrdup_const(systems, GFP_KERNEL);
+ if (!tr->system_names)
+ goto out_free_tr;
+ }
+
tr->trace_flags = global_trace.trace_flags & ~ZEROED_TRACE_FLAGS;
cpumask_copy(tr->tracing_cpumask, cpu_all_mask);
@@ -9581,12 +9747,18 @@ static struct trace_array *trace_array_create(const char *name)
free_trace_buffers(tr);
free_cpumask_var(tr->pipe_cpumask);
free_cpumask_var(tr->tracing_cpumask);
+ kfree_const(tr->system_names);
kfree(tr->name);
kfree(tr);
return ERR_PTR(ret);
}
+static struct trace_array *trace_array_create(const char *name)
+{
+ return trace_array_create_systems(name, NULL);
+}
+
static int instance_mkdir(const char *name)
{
struct trace_array *tr;
@@ -9612,6 +9784,7 @@ out_unlock:
/**
* trace_array_get_by_name - Create/Lookup a trace array, given its name.
* @name: The name of the trace array to be looked up/created.
+ * @systems: A list of systems to create event directories for (NULL for all)
*
* Returns pointer to trace array with given name.
* NULL, if it cannot be created.
@@ -9625,7 +9798,7 @@ out_unlock:
* trace_array_put() is called, user space can not delete it.
*
*/
-struct trace_array *trace_array_get_by_name(const char *name)
+struct trace_array *trace_array_get_by_name(const char *name, const char *systems)
{
struct trace_array *tr;
@@ -9637,7 +9810,7 @@ struct trace_array *trace_array_get_by_name(const char *name)
goto out_unlock;
}
- tr = trace_array_create(name);
+ tr = trace_array_create_systems(name, systems);
if (IS_ERR(tr))
tr = NULL;
@@ -9684,6 +9857,7 @@ static int __remove_instance(struct trace_array *tr)
free_cpumask_var(tr->pipe_cpumask);
free_cpumask_var(tr->tracing_cpumask);
+ kfree_const(tr->system_names);
kfree(tr->name);
kfree(tr);
@@ -9816,6 +9990,9 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
trace_create_file("buffer_percent", TRACE_MODE_WRITE, d_tracer,
tr, &buffer_percent_fops);
+ trace_create_file("buffer_subbuf_size_kb", TRACE_MODE_WRITE, d_tracer,
+ tr, &buffer_subbuf_size_fops);
+
create_trace_options_dir(tr);
#ifdef CONFIG_TRACER_MAX_TRACE
@@ -10402,7 +10579,7 @@ __init static void enable_instances(void)
if (IS_ENABLED(CONFIG_TRACER_MAX_TRACE))
do_allocate_snapshot(tok);
- tr = trace_array_get_by_name(tok);
+ tr = trace_array_get_by_name(tok, NULL);
if (!tr) {
pr_warn("Failed to create instance buffer %s\n", curr_str);
continue;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 0489e72c81..00f873910c 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -377,6 +377,7 @@ struct trace_array {
unsigned char trace_flags_index[TRACE_FLAGS_MAX_SIZE];
unsigned int flags;
raw_spinlock_t start_lock;
+ const char *system_names;
struct list_head err_log;
struct dentry *dir;
struct dentry *options;
@@ -615,6 +616,7 @@ void tracing_reset_all_online_cpus(void);
void tracing_reset_all_online_cpus_unlocked(void);
int tracing_open_generic(struct inode *inode, struct file *filp);
int tracing_open_generic_tr(struct inode *inode, struct file *filp);
+int tracing_release_generic_tr(struct inode *inode, struct file *file);
int tracing_open_file_tr(struct inode *inode, struct file *filp);
int tracing_release_file_tr(struct inode *inode, struct file *filp);
int tracing_single_release_file_tr(struct inode *inode, struct file *filp);
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index 7ccc7a8e15..dbe29b4c6a 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -633,7 +633,7 @@ trace_boot_init_instances(struct xbc_node *node)
if (!p || *p == '\0')
continue;
- tr = trace_array_get_by_name(p);
+ tr = trace_array_get_by_name(p, NULL);
if (!tr) {
pr_err("Failed to get trace instance %s\n", p);
continue;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index f29e815ca5..52f75c36bb 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1670,6 +1670,7 @@ static int trace_format_open(struct inode *inode, struct file *file)
return 0;
}
+#ifdef CONFIG_PERF_EVENTS
static ssize_t
event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
{
@@ -1684,6 +1685,7 @@ event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
return simple_read_from_buffer(ubuf, cnt, ppos, buf, len);
}
+#endif
static ssize_t
event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
@@ -1893,9 +1895,9 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
}
static ssize_t
-show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
+show_header_page_file(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
{
- int (*func)(struct trace_seq *s) = filp->private_data;
+ struct trace_array *tr = filp->private_data;
struct trace_seq *s;
int r;
@@ -1908,7 +1910,31 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
trace_seq_init(s);
- func(s);
+ ring_buffer_print_page_header(tr->array_buffer.buffer, s);
+ r = simple_read_from_buffer(ubuf, cnt, ppos,
+ s->buffer, trace_seq_used(s));
+
+ kfree(s);
+
+ return r;
+}
+
+static ssize_t
+show_header_event_file(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+ struct trace_seq *s;
+ int r;
+
+ if (*ppos)
+ return 0;
+
+ s = kmalloc(sizeof(*s), GFP_KERNEL);
+ if (!s)
+ return -ENOMEM;
+
+ trace_seq_init(s);
+
+ ring_buffer_print_entry_header(s);
r = simple_read_from_buffer(ubuf, cnt, ppos,
s->buffer, trace_seq_used(s));
@@ -2128,10 +2154,12 @@ static const struct file_operations ftrace_event_format_fops = {
.release = seq_release,
};
+#ifdef CONFIG_PERF_EVENTS
static const struct file_operations ftrace_event_id_fops = {
.read = event_id_read,
.llseek = default_llseek,
};
+#endif
static const struct file_operations ftrace_event_filter_fops = {
.open = tracing_open_file_tr,
@@ -2165,10 +2193,18 @@ static const struct file_operations ftrace_tr_enable_fops = {
.release = subsystem_release,
};
-static const struct file_operations ftrace_show_header_fops = {
- .open = tracing_open_generic,
- .read = show_header,
+static const struct file_operations ftrace_show_header_page_fops = {
+ .open = tracing_open_generic_tr,
+ .read = show_header_page_file,
+ .llseek = default_llseek,
+ .release = tracing_release_generic_tr,
+};
+
+static const struct file_operations ftrace_show_header_event_fops = {
+ .open = tracing_open_generic_tr,
+ .read = show_header_event_file,
.llseek = default_llseek,
+ .release = tracing_release_generic_tr,
};
static int
@@ -2896,6 +2932,27 @@ void trace_event_eval_update(struct trace_eval_map **map, int len)
up_write(&trace_event_sem);
}
+static bool event_in_systems(struct trace_event_call *call,
+ const char *systems)
+{
+ const char *system;
+ const char *p;
+
+ if (!systems)
+ return true;
+
+ system = call->class->system;
+ p = strstr(systems, system);
+ if (!p)
+ return false;
+
+ if (p != systems && !isspace(*(p - 1)) && *(p - 1) != ',')
+ return false;
+
+ p += strlen(system);
+ return !*p || isspace(*p) || *p == ',';
+}
+
static struct trace_event_file *
trace_create_new_event(struct trace_event_call *call,
struct trace_array *tr)
@@ -2905,9 +2962,12 @@ trace_create_new_event(struct trace_event_call *call,
struct trace_event_file *file;
unsigned int first;
+ if (!event_in_systems(call, tr->system_names))
+ return NULL;
+
file = kmem_cache_alloc(file_cachep, GFP_TRACE);
if (!file)
- return NULL;
+ return ERR_PTR(-ENOMEM);
pid_list = rcu_dereference_protected(tr->filtered_pids,
lockdep_is_held(&event_mutex));
@@ -2972,8 +3032,17 @@ __trace_add_new_event(struct trace_event_call *call, struct trace_array *tr)
struct trace_event_file *file;
file = trace_create_new_event(call, tr);
+ /*
+ * trace_create_new_event() returns ERR_PTR(-ENOMEM) if failed
+ * allocation, or NULL if the event is not part of the tr->system_names.
+ * When the event is not part of the tr->system_names, return zero, not
+ * an error.
+ */
if (!file)
- return -ENOMEM;
+ return 0;
+
+ if (IS_ERR(file))
+ return PTR_ERR(file);
if (eventdir_initialized)
return event_create_dir(tr->event_dir, file);
@@ -3012,8 +3081,17 @@ __trace_early_add_new_event(struct trace_event_call *call,
int ret;
file = trace_create_new_event(call, tr);
+ /*
+ * trace_create_new_event() returns ERR_PTR(-ENOMEM) if failed
+ * allocation, or NULL if the event is not part of the tr->system_names.
+ * When the event is not part of the tr->system_names, return zero, not
+ * an error.
+ */
if (!file)
- return -ENOMEM;
+ return 0;
+
+ if (IS_ERR(file))
+ return PTR_ERR(file);
ret = event_define_fields(call);
if (ret)
@@ -3752,17 +3830,16 @@ static int events_callback(const char *name, umode_t *mode, void **data,
return 1;
}
- if (strcmp(name, "header_page") == 0)
- *data = ring_buffer_print_page_header;
-
- else if (strcmp(name, "header_event") == 0)
- *data = ring_buffer_print_entry_header;
+ if (strcmp(name, "header_page") == 0) {
+ *mode = TRACE_MODE_READ;
+ *fops = &ftrace_show_header_page_fops;
- else
+ } else if (strcmp(name, "header_event") == 0) {
+ *mode = TRACE_MODE_READ;
+ *fops = &ftrace_show_header_event_fops;
+ } else
return 0;
- *mode = TRACE_MODE_READ;
- *fops = &ftrace_show_header_fops;
return 1;
}
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 5ecf3c8bde..6ece1308d3 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -4805,36 +4805,35 @@ static int parse_actions(struct hist_trigger_data *hist_data)
int len;
for (i = 0; i < hist_data->attrs->n_actions; i++) {
+ enum handler_id hid = 0;
+ char *action_str;
+
str = hist_data->attrs->action_str[i];
- if ((len = str_has_prefix(str, "onmatch("))) {
- char *action_str = str + len;
+ if ((len = str_has_prefix(str, "onmatch(")))
+ hid = HANDLER_ONMATCH;
+ else if ((len = str_has_prefix(str, "onmax(")))
+ hid = HANDLER_ONMAX;
+ else if ((len = str_has_prefix(str, "onchange(")))
+ hid = HANDLER_ONCHANGE;
- data = onmatch_parse(tr, action_str);
- if (IS_ERR(data)) {
- ret = PTR_ERR(data);
- break;
- }
- } else if ((len = str_has_prefix(str, "onmax("))) {
- char *action_str = str + len;
+ action_str = str + len;
- data = track_data_parse(hist_data, action_str,
- HANDLER_ONMAX);
- if (IS_ERR(data)) {
- ret = PTR_ERR(data);
- break;
- }
- } else if ((len = str_has_prefix(str, "onchange("))) {
- char *action_str = str + len;
+ switch (hid) {
+ case HANDLER_ONMATCH:
+ data = onmatch_parse(tr, action_str);
+ break;
+ case HANDLER_ONMAX:
+ case HANDLER_ONCHANGE:
+ data = track_data_parse(hist_data, action_str, hid);
+ break;
+ default:
+ data = ERR_PTR(-EINVAL);
+ break;
+ }
- data = track_data_parse(hist_data, action_str,
- HANDLER_ONCHANGE);
- if (IS_ERR(data)) {
- ret = PTR_ERR(data);
- break;
- }
- } else {
- ret = -EINVAL;
+ if (IS_ERR(data)) {
+ ret = PTR_ERR(data);
break;
}
diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index 9365ce4074..e76f5e1efd 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -2177,14 +2177,12 @@ static int user_events_open(struct inode *node, struct file *file)
static ssize_t user_events_write(struct file *file, const char __user *ubuf,
size_t count, loff_t *ppos)
{
- struct iovec iov;
struct iov_iter i;
if (unlikely(*ppos != 0))
return -EFAULT;
- if (unlikely(import_single_range(ITER_SOURCE, (char __user *)ubuf,
- count, &iov, &i)))
+ if (unlikely(import_ubuf(ITER_SOURCE, (char __user *)ubuf, count, &i)))
return -EFAULT;
return user_events_write_core(file, &i);
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 52f8b537dd..c4c6e0e006 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -487,8 +487,8 @@ static int __register_trace_kprobe(struct trace_kprobe *tk)
return -EINVAL;
if (within_notrace_func(tk)) {
- pr_warn("Could not probe notrace function %s\n",
- trace_kprobe_symbol(tk));
+ pr_warn("Could not probe notrace function %ps\n",
+ (void *)trace_kprobe_address(tk));
return -EINVAL;
}
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 3e7fa44dc2..d8b302d010 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -1587,12 +1587,11 @@ static enum print_line_t trace_print_print(struct trace_iterator *iter,
{
struct print_entry *field;
struct trace_seq *s = &iter->seq;
- int max = iter->ent_size - offsetof(struct print_entry, buf);
trace_assign_type(field, iter->ent);
seq_print_ip_sym(s, field->ip, flags);
- trace_seq_printf(s, ": %.*s", max, field->buf);
+ trace_seq_printf(s, ": %s", field->buf);
return trace_handle_return(s);
}
@@ -1601,11 +1600,10 @@ static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags,
struct trace_event *event)
{
struct print_entry *field;
- int max = iter->ent_size - offsetof(struct print_entry, buf);
trace_assign_type(field, iter->ent);
- trace_seq_printf(&iter->seq, "# %lx %.*s", field->ip, max, field->buf);
+ trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf);
return trace_handle_return(&iter->seq);
}
diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c
index 7be97229dd..c158d65a8a 100644
--- a/kernel/trace/trace_seq.c
+++ b/kernel/trace/trace_seq.c
@@ -13,9 +13,6 @@
* trace_seq_init() more than once to reset the trace_seq to start
* from scratch.
*
- * The buffer size is currently PAGE_SIZE, although it may become dynamic
- * in the future.
- *
* A write to the buffer will either succeed or fail. That is, unlike
* sprintf() there will not be a partial write (well it may write into
* the buffer but it wont update the pointers). This allows users to
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 99c051de41..a84b85d8aa 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -151,7 +151,7 @@ fetch_store_string(unsigned long addr, void *dest, void *base)
return -ENOMEM;
if (addr == FETCH_TOKEN_COMM)
- ret = strlcpy(dst, current->comm, maxlen);
+ ret = strscpy(dst, current->comm, maxlen);
else
ret = strncpy_from_user(dst, src, maxlen);
if (ret >= 0) {