diff options
Diffstat (limited to 'kernel/trace')
-rw-r--r-- | kernel/trace/bpf_trace.c | 183 | ||||
-rw-r--r-- | kernel/trace/ring_buffer.c | 784 | ||||
-rw-r--r-- | kernel/trace/ring_buffer_benchmark.c | 10 | ||||
-rw-r--r-- | kernel/trace/trace.c | 277 | ||||
-rw-r--r-- | kernel/trace/trace.h | 2 | ||||
-rw-r--r-- | kernel/trace/trace_boot.c | 2 | ||||
-rw-r--r-- | kernel/trace/trace_events.c | 111 | ||||
-rw-r--r-- | kernel/trace/trace_events_hist.c | 49 | ||||
-rw-r--r-- | kernel/trace/trace_events_user.c | 4 | ||||
-rw-r--r-- | kernel/trace/trace_kprobe.c | 4 | ||||
-rw-r--r-- | kernel/trace/trace_output.c | 6 | ||||
-rw-r--r-- | kernel/trace/trace_seq.c | 3 | ||||
-rw-r--r-- | kernel/trace/trace_uprobe.c | 2 |
13 files changed, 1012 insertions, 425 deletions
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 652c40a14d..c8d1ebc438 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -24,6 +24,7 @@ #include <linux/key.h> #include <linux/verification.h> #include <linux/namei.h> +#include <linux/fileattr.h> #include <net/bpf_sk_storage.h> @@ -1379,6 +1380,8 @@ __bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr_kern *data_ptr, struct bpf_dynptr_kern *sig_ptr, struct bpf_key *trusted_keyring) { + const void *data, *sig; + u32 data_len, sig_len; int ret; if (trusted_keyring->has_ref) { @@ -1395,10 +1398,12 @@ __bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr_kern *data_ptr, return ret; } - return verify_pkcs7_signature(data_ptr->data, - __bpf_dynptr_size(data_ptr), - sig_ptr->data, - __bpf_dynptr_size(sig_ptr), + data_len = __bpf_dynptr_size(data_ptr); + data = __bpf_dynptr_data(data_ptr, data_len); + sig_len = __bpf_dynptr_size(sig_ptr); + sig = __bpf_dynptr_data(sig_ptr, sig_len); + + return verify_pkcs7_signature(data, data_len, sig, sig_len, trusted_keyring->key, VERIFYING_UNSPECIFIED_SIGNATURE, NULL, NULL); @@ -1430,6 +1435,72 @@ static int __init bpf_key_sig_kfuncs_init(void) late_initcall(bpf_key_sig_kfuncs_init); #endif /* CONFIG_KEYS */ +/* filesystem kfuncs */ +__bpf_kfunc_start_defs(); + +/** + * bpf_get_file_xattr - get xattr of a file + * @file: file to get xattr from + * @name__str: name of the xattr + * @value_ptr: output buffer of the xattr value + * + * Get xattr *name__str* of *file* and store the output in *value_ptr*. + * + * For security reasons, only *name__str* with prefix "user." is allowed. + * + * Return: 0 on success, a negative value on error. + */ +__bpf_kfunc int bpf_get_file_xattr(struct file *file, const char *name__str, + struct bpf_dynptr_kern *value_ptr) +{ + struct dentry *dentry; + u32 value_len; + void *value; + int ret; + + if (strncmp(name__str, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) + return -EPERM; + + value_len = __bpf_dynptr_size(value_ptr); + value = __bpf_dynptr_data_rw(value_ptr, value_len); + if (!value) + return -EINVAL; + + dentry = file_dentry(file); + ret = inode_permission(&nop_mnt_idmap, dentry->d_inode, MAY_READ); + if (ret) + return ret; + return __vfs_getxattr(dentry, dentry->d_inode, name__str, value, value_len); +} + +__bpf_kfunc_end_defs(); + +BTF_SET8_START(fs_kfunc_set_ids) +BTF_ID_FLAGS(func, bpf_get_file_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS) +BTF_SET8_END(fs_kfunc_set_ids) + +static int bpf_get_file_xattr_filter(const struct bpf_prog *prog, u32 kfunc_id) +{ + if (!btf_id_set8_contains(&fs_kfunc_set_ids, kfunc_id)) + return 0; + + /* Only allow to attach from LSM hooks, to avoid recursion */ + return prog->type != BPF_PROG_TYPE_LSM ? -EACCES : 0; +} + +static const struct btf_kfunc_id_set bpf_fs_kfunc_set = { + .owner = THIS_MODULE, + .set = &fs_kfunc_set_ids, + .filter = bpf_get_file_xattr_filter, +}; + +static int __init bpf_fs_kfuncs_init(void) +{ + return register_btf_kfunc_id_set(BPF_PROG_TYPE_LSM, &bpf_fs_kfunc_set); +} + +late_initcall(bpf_fs_kfuncs_init); + static const struct bpf_func_proto * bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -2642,7 +2713,7 @@ static int bpf_kprobe_multi_link_fill_link_info(const struct bpf_link *link, static const struct bpf_link_ops bpf_kprobe_multi_link_lops = { .release = bpf_kprobe_multi_link_release, - .dealloc = bpf_kprobe_multi_link_dealloc, + .dealloc_deferred = bpf_kprobe_multi_link_dealloc, .fill_link_info = bpf_kprobe_multi_link_fill_link_info, }; @@ -3034,6 +3105,7 @@ struct bpf_uprobe_multi_link; struct bpf_uprobe { struct bpf_uprobe_multi_link *link; loff_t offset; + unsigned long ref_ctr_offset; u64 cookie; struct uprobe_consumer consumer; }; @@ -3042,6 +3114,7 @@ struct bpf_uprobe_multi_link { struct path path; struct bpf_link link; u32 cnt; + u32 flags; struct bpf_uprobe *uprobes; struct task_struct *task; }; @@ -3069,6 +3142,9 @@ static void bpf_uprobe_multi_link_release(struct bpf_link *link) umulti_link = container_of(link, struct bpf_uprobe_multi_link, link); bpf_uprobe_unregister(&umulti_link->path, umulti_link->uprobes, umulti_link->cnt); + if (umulti_link->task) + put_task_struct(umulti_link->task); + path_put(&umulti_link->path); } static void bpf_uprobe_multi_link_dealloc(struct bpf_link *link) @@ -3076,16 +3152,83 @@ static void bpf_uprobe_multi_link_dealloc(struct bpf_link *link) struct bpf_uprobe_multi_link *umulti_link; umulti_link = container_of(link, struct bpf_uprobe_multi_link, link); - if (umulti_link->task) - put_task_struct(umulti_link->task); - path_put(&umulti_link->path); kvfree(umulti_link->uprobes); kfree(umulti_link); } +static int bpf_uprobe_multi_link_fill_link_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + u64 __user *uref_ctr_offsets = u64_to_user_ptr(info->uprobe_multi.ref_ctr_offsets); + u64 __user *ucookies = u64_to_user_ptr(info->uprobe_multi.cookies); + u64 __user *uoffsets = u64_to_user_ptr(info->uprobe_multi.offsets); + u64 __user *upath = u64_to_user_ptr(info->uprobe_multi.path); + u32 upath_size = info->uprobe_multi.path_size; + struct bpf_uprobe_multi_link *umulti_link; + u32 ucount = info->uprobe_multi.count; + int err = 0, i; + long left; + + if (!upath ^ !upath_size) + return -EINVAL; + + if ((uoffsets || uref_ctr_offsets || ucookies) && !ucount) + return -EINVAL; + + umulti_link = container_of(link, struct bpf_uprobe_multi_link, link); + info->uprobe_multi.count = umulti_link->cnt; + info->uprobe_multi.flags = umulti_link->flags; + info->uprobe_multi.pid = umulti_link->task ? + task_pid_nr_ns(umulti_link->task, task_active_pid_ns(current)) : 0; + + if (upath) { + char *p, *buf; + + upath_size = min_t(u32, upath_size, PATH_MAX); + + buf = kmalloc(upath_size, GFP_KERNEL); + if (!buf) + return -ENOMEM; + p = d_path(&umulti_link->path, buf, upath_size); + if (IS_ERR(p)) { + kfree(buf); + return PTR_ERR(p); + } + upath_size = buf + upath_size - p; + left = copy_to_user(upath, p, upath_size); + kfree(buf); + if (left) + return -EFAULT; + info->uprobe_multi.path_size = upath_size; + } + + if (!uoffsets && !ucookies && !uref_ctr_offsets) + return 0; + + if (ucount < umulti_link->cnt) + err = -ENOSPC; + else + ucount = umulti_link->cnt; + + for (i = 0; i < ucount; i++) { + if (uoffsets && + put_user(umulti_link->uprobes[i].offset, uoffsets + i)) + return -EFAULT; + if (uref_ctr_offsets && + put_user(umulti_link->uprobes[i].ref_ctr_offset, uref_ctr_offsets + i)) + return -EFAULT; + if (ucookies && + put_user(umulti_link->uprobes[i].cookie, ucookies + i)) + return -EFAULT; + } + + return err; +} + static const struct bpf_link_ops bpf_uprobe_multi_link_lops = { .release = bpf_uprobe_multi_link_release, - .dealloc = bpf_uprobe_multi_link_dealloc, + .dealloc_deferred = bpf_uprobe_multi_link_dealloc, + .fill_link_info = bpf_uprobe_multi_link_fill_link_info, }; static int uprobe_prog_run(struct bpf_uprobe *uprobe, @@ -3173,7 +3316,6 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr { struct bpf_uprobe_multi_link *link = NULL; unsigned long __user *uref_ctr_offsets; - unsigned long *ref_ctr_offsets = NULL; struct bpf_link_primer link_primer; struct bpf_uprobe *uprobes = NULL; struct task_struct *task = NULL; @@ -3248,22 +3390,20 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr if (!uprobes || !link) goto error_free; - if (uref_ctr_offsets) { - ref_ctr_offsets = kvcalloc(cnt, sizeof(*ref_ctr_offsets), GFP_KERNEL); - if (!ref_ctr_offsets) - goto error_free; - } - for (i = 0; i < cnt; i++) { - if (ucookies && __get_user(uprobes[i].cookie, ucookies + i)) { + if (__get_user(uprobes[i].offset, uoffsets + i)) { err = -EFAULT; goto error_free; } - if (uref_ctr_offsets && __get_user(ref_ctr_offsets[i], uref_ctr_offsets + i)) { + if (uprobes[i].offset < 0) { + err = -EINVAL; + goto error_free; + } + if (uref_ctr_offsets && __get_user(uprobes[i].ref_ctr_offset, uref_ctr_offsets + i)) { err = -EFAULT; goto error_free; } - if (__get_user(uprobes[i].offset, uoffsets + i)) { + if (ucookies && __get_user(uprobes[i].cookie, ucookies + i)) { err = -EFAULT; goto error_free; } @@ -3283,6 +3423,7 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr link->uprobes = uprobes; link->path = path; link->task = task; + link->flags = flags; bpf_link_init(&link->link, BPF_LINK_TYPE_UPROBE_MULTI, &bpf_uprobe_multi_link_lops, prog); @@ -3290,7 +3431,7 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr for (i = 0; i < cnt; i++) { err = uprobe_register_refctr(d_real_inode(link->path.dentry), uprobes[i].offset, - ref_ctr_offsets ? ref_ctr_offsets[i] : 0, + uprobes[i].ref_ctr_offset, &uprobes[i].consumer); if (err) { bpf_uprobe_unregister(&path, uprobes, i); @@ -3302,11 +3443,9 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr if (err) goto error_free; - kvfree(ref_ctr_offsets); return bpf_link_settle(&link_primer); error_free: - kvfree(ref_ctr_offsets); kvfree(uprobes); kfree(link); if (task) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 140f8eed83..faf56d9a9e 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -27,6 +27,7 @@ #include <linux/cpu.h> #include <linux/oom.h> +#include <asm/local64.h> #include <asm/local.h> /* @@ -317,6 +318,11 @@ struct buffer_data_page { unsigned char data[] RB_ALIGN_DATA; /* data of buffer page */ }; +struct buffer_data_read_page { + unsigned order; /* order of the page */ + struct buffer_data_page *data; /* actual data, stored in this page */ +}; + /* * Note, the buffer_page list must be first. The buffer pages * are allocated in cache lines, which means that each buffer @@ -331,6 +337,7 @@ struct buffer_page { unsigned read; /* index for next read */ local_t entries; /* entries on this page */ unsigned long real_end; /* real end of data */ + unsigned order; /* order of the page */ struct buffer_data_page *page; /* Actual data page */ }; @@ -361,7 +368,7 @@ static __always_inline unsigned int rb_page_commit(struct buffer_page *bpage) static void free_buffer_page(struct buffer_page *bpage) { - free_page((unsigned long)bpage->page); + free_pages((unsigned long)bpage->page, bpage->order); kfree(bpage); } @@ -373,45 +380,11 @@ static inline bool test_time_stamp(u64 delta) return !!(delta & TS_DELTA_TEST); } -#define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE) - -/* Max payload is BUF_PAGE_SIZE - header (8bytes) */ -#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2)) - -int ring_buffer_print_page_header(struct trace_seq *s) -{ - struct buffer_data_page field; - - trace_seq_printf(s, "\tfield: u64 timestamp;\t" - "offset:0;\tsize:%u;\tsigned:%u;\n", - (unsigned int)sizeof(field.time_stamp), - (unsigned int)is_signed_type(u64)); - - trace_seq_printf(s, "\tfield: local_t commit;\t" - "offset:%u;\tsize:%u;\tsigned:%u;\n", - (unsigned int)offsetof(typeof(field), commit), - (unsigned int)sizeof(field.commit), - (unsigned int)is_signed_type(long)); - - trace_seq_printf(s, "\tfield: int overwrite;\t" - "offset:%u;\tsize:%u;\tsigned:%u;\n", - (unsigned int)offsetof(typeof(field), commit), - 1, - (unsigned int)is_signed_type(long)); - - trace_seq_printf(s, "\tfield: char data;\t" - "offset:%u;\tsize:%u;\tsigned:%u;\n", - (unsigned int)offsetof(typeof(field), data), - (unsigned int)BUF_PAGE_SIZE, - (unsigned int)is_signed_type(char)); - - return !trace_seq_has_overflowed(s); -} - struct rb_irq_work { struct irq_work work; wait_queue_head_t waiters; wait_queue_head_t full_waiters; + atomic_t seq; bool waiters_pending; bool full_waiters_pending; bool wakeup_full; @@ -462,27 +435,9 @@ enum { RB_CTX_MAX }; -#if BITS_PER_LONG == 32 -#define RB_TIME_32 -#endif - -/* To test on 64 bit machines */ -//#define RB_TIME_32 - -#ifdef RB_TIME_32 - -struct rb_time_struct { - local_t cnt; - local_t top; - local_t bottom; - local_t msb; -}; -#else -#include <asm/local64.h> struct rb_time_struct { local64_t time; }; -#endif typedef struct rb_time_struct rb_time_t; #define MAX_NEST 5 @@ -556,6 +511,10 @@ struct trace_buffer { struct rb_irq_work irq_work; bool time_stamp_abs; + + unsigned int subbuf_size; + unsigned int subbuf_order; + unsigned int max_data_size; }; struct ring_buffer_iter { @@ -569,150 +528,48 @@ struct ring_buffer_iter { u64 read_stamp; u64 page_stamp; struct ring_buffer_event *event; + size_t event_size; int missed_events; }; -#ifdef RB_TIME_32 - -/* - * On 32 bit machines, local64_t is very expensive. As the ring - * buffer doesn't need all the features of a true 64 bit atomic, - * on 32 bit, it uses these functions (64 still uses local64_t). - * - * For the ring buffer, 64 bit required operations for the time is - * the following: - * - * - Reads may fail if it interrupted a modification of the time stamp. - * It will succeed if it did not interrupt another write even if - * the read itself is interrupted by a write. - * It returns whether it was successful or not. - * - * - Writes always succeed and will overwrite other writes and writes - * that were done by events interrupting the current write. - * - * - A write followed by a read of the same time stamp will always succeed, - * but may not contain the same value. - * - * - A cmpxchg will fail if it interrupted another write or cmpxchg. - * Other than that, it acts like a normal cmpxchg. - * - * The 60 bit time stamp is broken up by 30 bits in a top and bottom half - * (bottom being the least significant 30 bits of the 60 bit time stamp). - * - * The two most significant bits of each half holds a 2 bit counter (0-3). - * Each update will increment this counter by one. - * When reading the top and bottom, if the two counter bits match then the - * top and bottom together make a valid 60 bit number. - */ -#define RB_TIME_SHIFT 30 -#define RB_TIME_VAL_MASK ((1 << RB_TIME_SHIFT) - 1) -#define RB_TIME_MSB_SHIFT 60 - -static inline int rb_time_cnt(unsigned long val) -{ - return (val >> RB_TIME_SHIFT) & 3; -} - -static inline u64 rb_time_val(unsigned long top, unsigned long bottom) -{ - u64 val; - - val = top & RB_TIME_VAL_MASK; - val <<= RB_TIME_SHIFT; - val |= bottom & RB_TIME_VAL_MASK; - - return val; -} - -static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *cnt) -{ - unsigned long top, bottom, msb; - unsigned long c; - - /* - * If the read is interrupted by a write, then the cnt will - * be different. Loop until both top and bottom have been read - * without interruption. - */ - do { - c = local_read(&t->cnt); - top = local_read(&t->top); - bottom = local_read(&t->bottom); - msb = local_read(&t->msb); - } while (c != local_read(&t->cnt)); - - *cnt = rb_time_cnt(top); - - /* If top, msb or bottom counts don't match, this interrupted a write */ - if (*cnt != rb_time_cnt(msb) || *cnt != rb_time_cnt(bottom)) - return false; - - /* The shift to msb will lose its cnt bits */ - *ret = rb_time_val(top, bottom) | ((u64)msb << RB_TIME_MSB_SHIFT); - return true; -} - -static bool rb_time_read(rb_time_t *t, u64 *ret) -{ - unsigned long cnt; - - return __rb_time_read(t, ret, &cnt); -} - -static inline unsigned long rb_time_val_cnt(unsigned long val, unsigned long cnt) +int ring_buffer_print_page_header(struct trace_buffer *buffer, struct trace_seq *s) { - return (val & RB_TIME_VAL_MASK) | ((cnt & 3) << RB_TIME_SHIFT); -} + struct buffer_data_page field; -static inline void rb_time_split(u64 val, unsigned long *top, unsigned long *bottom, - unsigned long *msb) -{ - *top = (unsigned long)((val >> RB_TIME_SHIFT) & RB_TIME_VAL_MASK); - *bottom = (unsigned long)(val & RB_TIME_VAL_MASK); - *msb = (unsigned long)(val >> RB_TIME_MSB_SHIFT); -} + trace_seq_printf(s, "\tfield: u64 timestamp;\t" + "offset:0;\tsize:%u;\tsigned:%u;\n", + (unsigned int)sizeof(field.time_stamp), + (unsigned int)is_signed_type(u64)); -static inline void rb_time_val_set(local_t *t, unsigned long val, unsigned long cnt) -{ - val = rb_time_val_cnt(val, cnt); - local_set(t, val); -} + trace_seq_printf(s, "\tfield: local_t commit;\t" + "offset:%u;\tsize:%u;\tsigned:%u;\n", + (unsigned int)offsetof(typeof(field), commit), + (unsigned int)sizeof(field.commit), + (unsigned int)is_signed_type(long)); -static void rb_time_set(rb_time_t *t, u64 val) -{ - unsigned long cnt, top, bottom, msb; + trace_seq_printf(s, "\tfield: int overwrite;\t" + "offset:%u;\tsize:%u;\tsigned:%u;\n", + (unsigned int)offsetof(typeof(field), commit), + 1, + (unsigned int)is_signed_type(long)); - rb_time_split(val, &top, &bottom, &msb); + trace_seq_printf(s, "\tfield: char data;\t" + "offset:%u;\tsize:%u;\tsigned:%u;\n", + (unsigned int)offsetof(typeof(field), data), + (unsigned int)buffer->subbuf_size, + (unsigned int)is_signed_type(char)); - /* Writes always succeed with a valid number even if it gets interrupted. */ - do { - cnt = local_inc_return(&t->cnt); - rb_time_val_set(&t->top, top, cnt); - rb_time_val_set(&t->bottom, bottom, cnt); - rb_time_val_set(&t->msb, val >> RB_TIME_MSB_SHIFT, cnt); - } while (cnt != local_read(&t->cnt)); + return !trace_seq_has_overflowed(s); } -static inline bool -rb_time_read_cmpxchg(local_t *l, unsigned long expect, unsigned long set) -{ - return local_try_cmpxchg(l, &expect, set); -} - -#else /* 64 bits */ - -/* local64_t always succeeds */ - -static inline bool rb_time_read(rb_time_t *t, u64 *ret) +static inline void rb_time_read(rb_time_t *t, u64 *ret) { *ret = local64_read(&t->time); - return true; } static void rb_time_set(rb_time_t *t, u64 val) { local64_set(&t->time, val); } -#endif /* * Enable this to make sure that the event passed to @@ -819,10 +676,7 @@ u64 ring_buffer_event_time_stamp(struct trace_buffer *buffer, WARN_ONCE(1, "nest (%d) greater than max", nest); fail: - /* Can only fail on 32 bit */ - if (!rb_time_read(&cpu_buffer->write_stamp, &ts)) - /* Screw it, just read the current time */ - ts = rb_time_stamp(cpu_buffer->buffer); + rb_time_read(&cpu_buffer->write_stamp, &ts); return ts; } @@ -900,6 +754,9 @@ static void rb_wake_up_waiters(struct irq_work *work) { struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work); + /* For waiters waiting for the first wake up */ + (void)atomic_fetch_inc_release(&rbwork->seq); + wake_up_all(&rbwork->waiters); if (rbwork->full_waiters_pending || rbwork->wakeup_full) { /* Only cpu_buffer sets the above flags */ @@ -1028,20 +885,21 @@ rb_wait_cond(struct rb_irq_work *rbwork, struct trace_buffer *buffer, return false; } +struct rb_wait_data { + struct rb_irq_work *irq_work; + int seq; +}; + /* * The default wait condition for ring_buffer_wait() is to just to exit the * wait loop the first time it is woken up. */ static bool rb_wait_once(void *data) { - long *once = data; - - /* wait_event() actually calls this twice before scheduling*/ - if (*once > 1) - return true; + struct rb_wait_data *rdata = data; + struct rb_irq_work *rbwork = rdata->irq_work; - (*once)++; - return false; + return atomic_read_acquire(&rbwork->seq) != rdata->seq; } /** @@ -1049,24 +907,22 @@ static bool rb_wait_once(void *data) * @buffer: buffer to wait on * @cpu: the cpu buffer to wait on * @full: wait until the percentage of pages are available, if @cpu != RING_BUFFER_ALL_CPUS + * @cond: condition function to break out of wait (NULL to run once) + * @data: the data to pass to @cond. * * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon * as data is added to any of the @buffer's cpu buffers. Otherwise * it will wait for data to be added to a specific cpu buffer. */ -int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full) +int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full, + ring_buffer_cond_fn cond, void *data) { struct ring_buffer_per_cpu *cpu_buffer; struct wait_queue_head *waitq; - ring_buffer_cond_fn cond; struct rb_irq_work *rbwork; - void *data; - long once = 0; + struct rb_wait_data rdata; int ret = 0; - cond = rb_wait_once; - data = &once; - /* * Depending on what the caller is waiting for, either any * data in any cpu buffer, or a specific buffer, put the @@ -1088,6 +944,14 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full) else waitq = &rbwork->waiters; + /* Set up to exit loop as soon as it is woken */ + if (!cond) { + cond = rb_wait_once; + rdata.irq_work = rbwork; + rdata.seq = atomic_read_acquire(&rbwork->seq); + data = &rdata; + } + ret = wait_event_interruptible((*waitq), rb_wait_cond(rbwork, buffer, cpu, full, cond, data)); @@ -1536,7 +1400,6 @@ static void rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer, old_write = local_add_return(RB_WRITE_INTCNT, &next_page->write); old_entries = local_add_return(RB_WRITE_INTCNT, &next_page->entries); - local_inc(&cpu_buffer->pages_touched); /* * Just make sure we have seen our old_write and synchronize * with any interrupts that come in. @@ -1573,8 +1436,9 @@ static void rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer, */ local_set(&next_page->page->commit, 0); - /* Again, either we update tail_page or an interrupt does */ - (void)cmpxchg(&cpu_buffer->tail_page, tail_page, next_page); + /* Either we update tail_page or an interrupt does */ + if (try_cmpxchg(&cpu_buffer->tail_page, &tail_page, next_page)) + local_inc(&cpu_buffer->pages_touched); } } @@ -1666,10 +1530,12 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, list_add(&bpage->list, pages); - page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), mflags, 0); + page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), mflags, + cpu_buffer->buffer->subbuf_order); if (!page) goto free_pages; bpage->page = page_address(page); + bpage->order = cpu_buffer->buffer->subbuf_order; rb_init_page(bpage->page); if (user_thread && fatal_signal_pending(current)) @@ -1748,7 +1614,8 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) rb_check_bpage(cpu_buffer, bpage); cpu_buffer->reader_page = bpage; - page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, 0); + + page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, cpu_buffer->buffer->subbuf_order); if (!page) goto fail_free_reader; bpage->page = page_address(page); @@ -1831,7 +1698,14 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, if (!zalloc_cpumask_var(&buffer->cpumask, GFP_KERNEL)) goto fail_free_buffer; - nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); + /* Default buffer page size - one system page */ + buffer->subbuf_order = 0; + buffer->subbuf_size = PAGE_SIZE - BUF_PAGE_HDR_SIZE; + + /* Max payload is buffer page size - header (8bytes) */ + buffer->max_data_size = buffer->subbuf_size - (sizeof(u32) * 2); + + nr_pages = DIV_ROUND_UP(size, buffer->subbuf_size); buffer->flags = flags; buffer->clock = trace_clock_local; buffer->reader_lock_key = key; @@ -2150,7 +2024,7 @@ static void update_pages_handler(struct work_struct *work) * @size: the new size. * @cpu_id: the cpu buffer to resize * - * Minimum size is 2 * BUF_PAGE_SIZE. + * Minimum size is 2 * buffer->subbuf_size. * * Returns 0 on success and < 0 on failure. */ @@ -2172,7 +2046,7 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, !cpumask_test_cpu(cpu_id, buffer->cpumask)) return 0; - nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); + nr_pages = DIV_ROUND_UP(size, buffer->subbuf_size); /* we need a minimum of two pages */ if (nr_pages < 2) @@ -2419,7 +2293,7 @@ rb_iter_head_event(struct ring_buffer_iter *iter) */ barrier(); - if ((iter->head + length) > commit || length > BUF_PAGE_SIZE) + if ((iter->head + length) > commit || length > iter->event_size) /* Writer corrupted the read? */ goto reset; @@ -2459,11 +2333,13 @@ rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer) } static __always_inline unsigned -rb_event_index(struct ring_buffer_event *event) +rb_event_index(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) { unsigned long addr = (unsigned long)event; - return (addr & ~PAGE_MASK) - BUF_PAGE_HDR_SIZE; + addr &= (PAGE_SIZE << cpu_buffer->buffer->subbuf_order) - 1; + + return addr - BUF_PAGE_HDR_SIZE; } static void rb_inc_iter(struct ring_buffer_iter *iter) @@ -2652,6 +2528,7 @@ static inline void rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, unsigned long tail, struct rb_event_info *info) { + unsigned long bsize = READ_ONCE(cpu_buffer->buffer->subbuf_size); struct buffer_page *tail_page = info->tail_page; struct ring_buffer_event *event; unsigned long length = info->length; @@ -2660,13 +2537,13 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, * Only the event that crossed the page boundary * must fill the old tail_page with padding. */ - if (tail >= BUF_PAGE_SIZE) { + if (tail >= bsize) { /* * If the page was filled, then we still need * to update the real_end. Reset it to zero * and the reader will ignore it. */ - if (tail == BUF_PAGE_SIZE) + if (tail == bsize) tail_page->real_end = 0; local_sub(length, &tail_page->write); @@ -2694,7 +2571,7 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, * If we are less than the minimum size, we don't need to * worry about it. */ - if (tail > (BUF_PAGE_SIZE - RB_EVNT_MIN_SIZE)) { + if (tail > (bsize - RB_EVNT_MIN_SIZE)) { /* No room for any events */ /* Mark the rest of the page with padding */ @@ -2709,19 +2586,19 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, } /* Put in a discarded event */ - event->array[0] = (BUF_PAGE_SIZE - tail) - RB_EVNT_HDR_SIZE; + event->array[0] = (bsize - tail) - RB_EVNT_HDR_SIZE; event->type_len = RINGBUF_TYPE_PADDING; /* time delta must be non zero */ event->time_delta = 1; /* account for padding bytes */ - local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes); + local_add(bsize - tail, &cpu_buffer->entries_bytes); /* Make sure the padding is visible before the tail_page->write update */ smp_wmb(); /* Set write to end of buffer */ - length = (tail + length) - BUF_PAGE_SIZE; + length = (tail + length) - bsize; local_sub(length, &tail_page->write); } @@ -2835,7 +2712,8 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, /* Slow path */ static struct ring_buffer_event * -rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs) +rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event, u64 delta, bool abs) { if (abs) event->type_len = RINGBUF_TYPE_TIME_STAMP; @@ -2843,7 +2721,7 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs) event->type_len = RINGBUF_TYPE_TIME_EXTEND; /* Not the first event on the page, or not delta? */ - if (abs || rb_event_index(event)) { + if (abs || rb_event_index(cpu_buffer, event)) { event->time_delta = delta & TS_MASK; event->array[0] = delta >> TS_SHIFT; } else { @@ -2873,7 +2751,7 @@ rb_check_timestamp(struct ring_buffer_per_cpu *cpu_buffer, (unsigned long long)info->ts, (unsigned long long)info->before, (unsigned long long)info->after, - (unsigned long long)(rb_time_read(&cpu_buffer->write_stamp, &write_stamp) ? write_stamp : 0), + (unsigned long long)({rb_time_read(&cpu_buffer->write_stamp, &write_stamp); write_stamp;}), sched_clock_stable() ? "" : "If you just came from a suspend/resume,\n" "please switch to the trace global clock:\n" @@ -2917,7 +2795,7 @@ static void rb_add_timestamp(struct ring_buffer_per_cpu *cpu_buffer, if (!abs) info->delta = 0; } - *event = rb_add_time_stamp(*event, info->delta, abs); + *event = rb_add_time_stamp(cpu_buffer, *event, info->delta, abs); *length -= RB_LEN_TIME_EXTEND; *delta = 0; } @@ -3001,10 +2879,10 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *bpage; unsigned long addr; - new_index = rb_event_index(event); + new_index = rb_event_index(cpu_buffer, event); old_index = new_index + rb_event_ts_length(event); addr = (unsigned long)event; - addr &= PAGE_MASK; + addr &= ~((PAGE_SIZE << cpu_buffer->buffer->subbuf_order) - 1); bpage = READ_ONCE(cpu_buffer->tail_page); @@ -3391,6 +3269,76 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); #define CHECK_FULL_PAGE 1L #ifdef CONFIG_RING_BUFFER_VALIDATE_TIME_DELTAS + +static const char *show_irq_str(int bits) +{ + const char *type[] = { + ".", // 0 + "s", // 1 + "h", // 2 + "Hs", // 3 + "n", // 4 + "Ns", // 5 + "Nh", // 6 + "NHs", // 7 + }; + + return type[bits]; +} + +/* Assume this is an trace event */ +static const char *show_flags(struct ring_buffer_event *event) +{ + struct trace_entry *entry; + int bits = 0; + + if (rb_event_data_length(event) - RB_EVNT_HDR_SIZE < sizeof(*entry)) + return "X"; + + entry = ring_buffer_event_data(event); + + if (entry->flags & TRACE_FLAG_SOFTIRQ) + bits |= 1; + + if (entry->flags & TRACE_FLAG_HARDIRQ) + bits |= 2; + + if (entry->flags & TRACE_FLAG_NMI) + bits |= 4; + + return show_irq_str(bits); +} + +static const char *show_irq(struct ring_buffer_event *event) +{ + struct trace_entry *entry; + + if (rb_event_data_length(event) - RB_EVNT_HDR_SIZE < sizeof(*entry)) + return ""; + + entry = ring_buffer_event_data(event); + if (entry->flags & TRACE_FLAG_IRQS_OFF) + return "d"; + return ""; +} + +static const char *show_interrupt_level(void) +{ + unsigned long pc = preempt_count(); + unsigned char level = 0; + + if (pc & SOFTIRQ_OFFSET) + level |= 1; + + if (pc & HARDIRQ_MASK) + level |= 2; + + if (pc & NMI_MASK) + level |= 4; + + return show_irq_str(level); +} + static void dump_buffer_page(struct buffer_data_page *bpage, struct rb_event_info *info, unsigned long tail) @@ -3411,34 +3359,57 @@ static void dump_buffer_page(struct buffer_data_page *bpage, case RINGBUF_TYPE_TIME_EXTEND: delta = rb_event_time_stamp(event); ts += delta; - pr_warn(" [%lld] delta:%lld TIME EXTEND\n", ts, delta); + pr_warn(" 0x%x: [%lld] delta:%lld TIME EXTEND\n", + e, ts, delta); break; case RINGBUF_TYPE_TIME_STAMP: delta = rb_event_time_stamp(event); ts = rb_fix_abs_ts(delta, ts); - pr_warn(" [%lld] absolute:%lld TIME STAMP\n", ts, delta); + pr_warn(" 0x%x: [%lld] absolute:%lld TIME STAMP\n", + e, ts, delta); break; case RINGBUF_TYPE_PADDING: ts += event->time_delta; - pr_warn(" [%lld] delta:%d PADDING\n", ts, event->time_delta); + pr_warn(" 0x%x: [%lld] delta:%d PADDING\n", + e, ts, event->time_delta); break; case RINGBUF_TYPE_DATA: ts += event->time_delta; - pr_warn(" [%lld] delta:%d\n", ts, event->time_delta); + pr_warn(" 0x%x: [%lld] delta:%d %s%s\n", + e, ts, event->time_delta, + show_flags(event), show_irq(event)); break; default: break; } } + pr_warn("expected end:0x%lx last event actually ended at:0x%x\n", tail, e); } static DEFINE_PER_CPU(atomic_t, checking); static atomic_t ts_dump; +#define buffer_warn_return(fmt, ...) \ + do { \ + /* If another report is happening, ignore this one */ \ + if (atomic_inc_return(&ts_dump) != 1) { \ + atomic_dec(&ts_dump); \ + goto out; \ + } \ + atomic_inc(&cpu_buffer->record_disabled); \ + pr_warn(fmt, ##__VA_ARGS__); \ + dump_buffer_page(bpage, info, tail); \ + atomic_dec(&ts_dump); \ + /* There's some cases in boot up that this can happen */ \ + if (WARN_ON_ONCE(system_state != SYSTEM_BOOTING)) \ + /* Do not re-enable checking */ \ + return; \ + } while (0) + /* * Check if the current event time stamp matches the deltas on * the buffer page. @@ -3492,7 +3463,12 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer, case RINGBUF_TYPE_TIME_STAMP: delta = rb_event_time_stamp(event); - ts = rb_fix_abs_ts(delta, ts); + delta = rb_fix_abs_ts(delta, ts); + if (delta < ts) { + buffer_warn_return("[CPU: %d]ABSOLUTE TIME WENT BACKWARDS: last ts: %lld absolute ts: %lld\n", + cpu_buffer->cpu, ts, delta); + } + ts = delta; break; case RINGBUF_TYPE_PADDING: @@ -3509,23 +3485,11 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer, } if ((full && ts > info->ts) || (!full && ts + info->delta != info->ts)) { - /* If another report is happening, ignore this one */ - if (atomic_inc_return(&ts_dump) != 1) { - atomic_dec(&ts_dump); - goto out; - } - atomic_inc(&cpu_buffer->record_disabled); - /* There's some cases in boot up that this can happen */ - WARN_ON_ONCE(system_state != SYSTEM_BOOTING); - pr_warn("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld before:%lld after:%lld%s\n", - cpu_buffer->cpu, - ts + info->delta, info->ts, info->delta, - info->before, info->after, - full ? " (full)" : ""); - dump_buffer_page(bpage, info, tail); - atomic_dec(&ts_dump); - /* Do not re-enable checking */ - return; + buffer_warn_return("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld before:%lld after:%lld%s context:%s\n", + cpu_buffer->cpu, + ts + info->delta, info->ts, info->delta, + info->before, info->after, + full ? " (full)" : "", show_interrupt_level()); } out: atomic_dec(this_cpu_ptr(&checking)); @@ -3545,16 +3509,14 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event; struct buffer_page *tail_page; unsigned long tail, write, w; - bool a_ok; - bool b_ok; /* Don't let the compiler play games with cpu_buffer->tail_page */ tail_page = info->tail_page = READ_ONCE(cpu_buffer->tail_page); /*A*/ w = local_read(&tail_page->write) & RB_WRITE_MASK; barrier(); - b_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before); - a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after); + rb_time_read(&cpu_buffer->before_stamp, &info->before); + rb_time_read(&cpu_buffer->write_stamp, &info->after); barrier(); info->ts = rb_time_stamp(cpu_buffer->buffer); @@ -3569,7 +3531,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, if (!w) { /* Use the sub-buffer timestamp */ info->delta = 0; - } else if (unlikely(!a_ok || !b_ok || info->before != info->after)) { + } else if (unlikely(info->before != info->after)) { info->add_timestamp |= RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND; info->length += RB_LEN_TIME_EXTEND; } else { @@ -3591,7 +3553,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, tail = write - info->length; /* See if we shot pass the end of this buffer page */ - if (unlikely(write > BUF_PAGE_SIZE)) { + if (unlikely(write > cpu_buffer->buffer->subbuf_size)) { check_buffer(cpu_buffer, info, CHECK_FULL_PAGE); return rb_move_tail(cpu_buffer, tail, info); } @@ -3618,8 +3580,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, /* SLOW PATH - Interrupted between A and C */ /* Save the old before_stamp */ - a_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before); - RB_WARN_ON(cpu_buffer, !a_ok); + rb_time_read(&cpu_buffer->before_stamp, &info->before); /* * Read a new timestamp and update the before_stamp to make @@ -3631,9 +3592,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, rb_time_set(&cpu_buffer->before_stamp, ts); barrier(); - /*E*/ a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after); - /* Was interrupted before here, write_stamp must be valid */ - RB_WARN_ON(cpu_buffer, !a_ok); + /*E*/ rb_time_read(&cpu_buffer->write_stamp, &info->after); barrier(); /*F*/ if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) && info->after == info->before && info->after < ts) { @@ -3725,7 +3684,7 @@ rb_reserve_next_event(struct trace_buffer *buffer, if (ring_buffer_time_stamp_abs(cpu_buffer->buffer)) { add_ts_default = RB_ADD_STAMP_ABSOLUTE; info.length += RB_LEN_TIME_EXTEND; - if (info.length > BUF_MAX_DATA_SIZE) + if (info.length > cpu_buffer->buffer->max_data_size) goto out_fail; } else { add_ts_default = RB_ADD_STAMP_NONE; @@ -3800,7 +3759,7 @@ ring_buffer_lock_reserve(struct trace_buffer *buffer, unsigned long length) if (unlikely(atomic_read(&cpu_buffer->record_disabled))) goto out; - if (unlikely(length > BUF_MAX_DATA_SIZE)) + if (unlikely(length > buffer->max_data_size)) goto out; if (unlikely(trace_recursive_lock(cpu_buffer))) @@ -3834,7 +3793,7 @@ rb_decrement_entry(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *bpage = cpu_buffer->commit_page; struct buffer_page *start; - addr &= PAGE_MASK; + addr &= ~((PAGE_SIZE << cpu_buffer->buffer->subbuf_order) - 1); /* Do the likely case first */ if (likely(bpage->page == (void *)addr)) { @@ -3950,7 +3909,7 @@ int ring_buffer_write(struct trace_buffer *buffer, if (atomic_read(&cpu_buffer->record_disabled)) goto out; - if (length > BUF_MAX_DATA_SIZE) + if (length > buffer->max_data_size) goto out; if (unlikely(trace_recursive_lock(cpu_buffer))) @@ -4436,7 +4395,7 @@ int ring_buffer_iter_empty(struct ring_buffer_iter *iter) cpu_buffer = iter->cpu_buffer; reader = cpu_buffer->reader_page; head_page = cpu_buffer->head_page; - commit_page = cpu_buffer->commit_page; + commit_page = READ_ONCE(cpu_buffer->commit_page); commit_ts = commit_page->page->time_stamp; /* @@ -4530,6 +4489,7 @@ static struct buffer_page * rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) { struct buffer_page *reader = NULL; + unsigned long bsize = READ_ONCE(cpu_buffer->buffer->subbuf_size); unsigned long overwrite; unsigned long flags; int nr_loops = 0; @@ -4665,7 +4625,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) #define USECS_WAIT 1000000 for (nr_loops = 0; nr_loops < USECS_WAIT; nr_loops++) { /* If the write is past the end of page, a writer is still updating it */ - if (likely(!reader || rb_page_write(reader) <= BUF_PAGE_SIZE)) + if (likely(!reader || rb_page_write(reader) <= bsize)) break; udelay(1); @@ -5109,7 +5069,8 @@ ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags) return NULL; /* Holds the entire event: data and meta data */ - iter->event = kmalloc(BUF_PAGE_SIZE, flags); + iter->event_size = buffer->subbuf_size; + iter->event = kmalloc(iter->event_size, flags); if (!iter->event) { kfree(iter); return NULL; @@ -5225,19 +5186,28 @@ EXPORT_SYMBOL_GPL(ring_buffer_iter_advance); */ unsigned long ring_buffer_size(struct trace_buffer *buffer, int cpu) { - /* - * Earlier, this method returned - * BUF_PAGE_SIZE * buffer->nr_pages - * Since the nr_pages field is now removed, we have converted this to - * return the per cpu buffer value. - */ if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 0; - return BUF_PAGE_SIZE * buffer->buffers[cpu]->nr_pages; + return buffer->subbuf_size * buffer->buffers[cpu]->nr_pages; } EXPORT_SYMBOL_GPL(ring_buffer_size); +/** + * ring_buffer_max_event_size - return the max data size of an event + * @buffer: The ring buffer. + * + * Returns the maximum size an event can be. + */ +unsigned long ring_buffer_max_event_size(struct trace_buffer *buffer) +{ + /* If abs timestamp is requested, events have a timestamp too */ + if (ring_buffer_time_stamp_abs(buffer)) + return buffer->max_data_size - RB_LEN_TIME_EXTEND; + return buffer->max_data_size; +} +EXPORT_SYMBOL_GPL(ring_buffer_max_event_size); + static void rb_clear_buffer_page(struct buffer_page *page) { local_set(&page->write, 0); @@ -5508,6 +5478,9 @@ int ring_buffer_swap_cpu(struct trace_buffer *buffer_a, if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages) goto out; + if (buffer_a->subbuf_order != buffer_b->subbuf_order) + goto out; + ret = -EAGAIN; if (atomic_read(&buffer_a->record_disabled)) @@ -5579,40 +5552,48 @@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); * Returns: * The page allocated, or ERR_PTR */ -void *ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu) +struct buffer_data_read_page * +ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; - struct buffer_data_page *bpage = NULL; + struct buffer_data_read_page *bpage = NULL; unsigned long flags; struct page *page; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return ERR_PTR(-ENODEV); + bpage = kzalloc(sizeof(*bpage), GFP_KERNEL); + if (!bpage) + return ERR_PTR(-ENOMEM); + + bpage->order = buffer->subbuf_order; cpu_buffer = buffer->buffers[cpu]; local_irq_save(flags); arch_spin_lock(&cpu_buffer->lock); if (cpu_buffer->free_page) { - bpage = cpu_buffer->free_page; + bpage->data = cpu_buffer->free_page; cpu_buffer->free_page = NULL; } arch_spin_unlock(&cpu_buffer->lock); local_irq_restore(flags); - if (bpage) + if (bpage->data) goto out; - page = alloc_pages_node(cpu_to_node(cpu), - GFP_KERNEL | __GFP_NORETRY, 0); - if (!page) + page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_NORETRY, + cpu_buffer->buffer->subbuf_order); + if (!page) { + kfree(bpage); return ERR_PTR(-ENOMEM); + } - bpage = page_address(page); + bpage->data = page_address(page); out: - rb_init_page(bpage); + rb_init_page(bpage->data); return bpage; } @@ -5622,14 +5603,15 @@ EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page); * ring_buffer_free_read_page - free an allocated read page * @buffer: the buffer the page was allocate for * @cpu: the cpu buffer the page came from - * @data: the page to free + * @data_page: the page to free * * Free a page allocated from ring_buffer_alloc_read_page. */ -void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, void *data) +void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, + struct buffer_data_read_page *data_page) { struct ring_buffer_per_cpu *cpu_buffer; - struct buffer_data_page *bpage = data; + struct buffer_data_page *bpage = data_page->data; struct page *page = virt_to_page(bpage); unsigned long flags; @@ -5638,8 +5620,12 @@ void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, void *data cpu_buffer = buffer->buffers[cpu]; - /* If the page is still in use someplace else, we can't reuse it */ - if (page_ref_count(page) > 1) + /* + * If the page is still in use someplace else, or order of the page + * is different from the subbuffer order of the buffer - + * we can't reuse it + */ + if (page_ref_count(page) > 1 || data_page->order != buffer->subbuf_order) goto out; local_irq_save(flags); @@ -5654,7 +5640,8 @@ void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, void *data local_irq_restore(flags); out: - free_page((unsigned long)bpage); + free_pages((unsigned long)bpage, data_page->order); + kfree(data_page); } EXPORT_SYMBOL_GPL(ring_buffer_free_read_page); @@ -5675,9 +5662,10 @@ EXPORT_SYMBOL_GPL(ring_buffer_free_read_page); * rpage = ring_buffer_alloc_read_page(buffer, cpu); * if (IS_ERR(rpage)) * return PTR_ERR(rpage); - * ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0); + * ret = ring_buffer_read_page(buffer, rpage, len, cpu, 0); * if (ret >= 0) - * process_page(rpage, ret); + * process_page(ring_buffer_read_page_data(rpage), ret); + * ring_buffer_free_read_page(buffer, cpu, rpage); * * When @full is set, the function will not return true unless * the writer is off the reader page. @@ -5692,7 +5680,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_free_read_page); * <0 if no data has been transferred. */ int ring_buffer_read_page(struct trace_buffer *buffer, - void **data_page, size_t len, int cpu, int full) + struct buffer_data_read_page *data_page, + size_t len, int cpu, int full) { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; struct ring_buffer_event *event; @@ -5717,10 +5706,12 @@ int ring_buffer_read_page(struct trace_buffer *buffer, len -= BUF_PAGE_HDR_SIZE; - if (!data_page) + if (!data_page || !data_page->data) + goto out; + if (data_page->order != buffer->subbuf_order) goto out; - bpage = *data_page; + bpage = data_page->data; if (!bpage) goto out; @@ -5814,11 +5805,11 @@ int ring_buffer_read_page(struct trace_buffer *buffer, /* swap the pages */ rb_init_page(bpage); bpage = reader->page; - reader->page = *data_page; + reader->page = data_page->data; local_set(&reader->write, 0); local_set(&reader->entries, 0); reader->read = 0; - *data_page = bpage; + data_page->data = bpage; /* * Use the real_end for the data size, @@ -5840,7 +5831,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer, /* If there is room at the end of the page to save the * missed events, then record it there. */ - if (BUF_PAGE_SIZE - commit >= sizeof(missed_events)) { + if (buffer->subbuf_size - commit >= sizeof(missed_events)) { memcpy(&bpage->data[commit], &missed_events, sizeof(missed_events)); local_add(RB_MISSED_STORED, &bpage->commit); @@ -5852,8 +5843,8 @@ int ring_buffer_read_page(struct trace_buffer *buffer, /* * This page may be off to user land. Zero it out here. */ - if (commit < BUF_PAGE_SIZE) - memset(&bpage->data[commit], 0, BUF_PAGE_SIZE - commit); + if (commit < buffer->subbuf_size) + memset(&bpage->data[commit], 0, buffer->subbuf_size - commit); out_unlock: raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); @@ -5863,6 +5854,213 @@ int ring_buffer_read_page(struct trace_buffer *buffer, } EXPORT_SYMBOL_GPL(ring_buffer_read_page); +/** + * ring_buffer_read_page_data - get pointer to the data in the page. + * @page: the page to get the data from + * + * Returns pointer to the actual data in this page. + */ +void *ring_buffer_read_page_data(struct buffer_data_read_page *page) +{ + return page->data; +} +EXPORT_SYMBOL_GPL(ring_buffer_read_page_data); + +/** + * ring_buffer_subbuf_size_get - get size of the sub buffer. + * @buffer: the buffer to get the sub buffer size from + * + * Returns size of the sub buffer, in bytes. + */ +int ring_buffer_subbuf_size_get(struct trace_buffer *buffer) +{ + return buffer->subbuf_size + BUF_PAGE_HDR_SIZE; +} +EXPORT_SYMBOL_GPL(ring_buffer_subbuf_size_get); + +/** + * ring_buffer_subbuf_order_get - get order of system sub pages in one buffer page. + * @buffer: The ring_buffer to get the system sub page order from + * + * By default, one ring buffer sub page equals to one system page. This parameter + * is configurable, per ring buffer. The size of the ring buffer sub page can be + * extended, but must be an order of system page size. + * + * Returns the order of buffer sub page size, in system pages: + * 0 means the sub buffer size is 1 system page and so forth. + * In case of an error < 0 is returned. + */ +int ring_buffer_subbuf_order_get(struct trace_buffer *buffer) +{ + if (!buffer) + return -EINVAL; + + return buffer->subbuf_order; +} +EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_get); + +/** + * ring_buffer_subbuf_order_set - set the size of ring buffer sub page. + * @buffer: The ring_buffer to set the new page size. + * @order: Order of the system pages in one sub buffer page + * + * By default, one ring buffer pages equals to one system page. This API can be + * used to set new size of the ring buffer page. The size must be order of + * system page size, that's why the input parameter @order is the order of + * system pages that are allocated for one ring buffer page: + * 0 - 1 system page + * 1 - 2 system pages + * 3 - 4 system pages + * ... + * + * Returns 0 on success or < 0 in case of an error. + */ +int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) +{ + struct ring_buffer_per_cpu *cpu_buffer; + struct buffer_page *bpage, *tmp; + int old_order, old_size; + int nr_pages; + int psize; + int err; + int cpu; + + if (!buffer || order < 0) + return -EINVAL; + + if (buffer->subbuf_order == order) + return 0; + + psize = (1 << order) * PAGE_SIZE; + if (psize <= BUF_PAGE_HDR_SIZE) + return -EINVAL; + + /* Size of a subbuf cannot be greater than the write counter */ + if (psize > RB_WRITE_MASK + 1) + return -EINVAL; + + old_order = buffer->subbuf_order; + old_size = buffer->subbuf_size; + + /* prevent another thread from changing buffer sizes */ + mutex_lock(&buffer->mutex); + atomic_inc(&buffer->record_disabled); + + /* Make sure all commits have finished */ + synchronize_rcu(); + + buffer->subbuf_order = order; + buffer->subbuf_size = psize - BUF_PAGE_HDR_SIZE; + + /* Make sure all new buffers are allocated, before deleting the old ones */ + for_each_buffer_cpu(buffer, cpu) { + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + continue; + + cpu_buffer = buffer->buffers[cpu]; + + /* Update the number of pages to match the new size */ + nr_pages = old_size * buffer->buffers[cpu]->nr_pages; + nr_pages = DIV_ROUND_UP(nr_pages, buffer->subbuf_size); + + /* we need a minimum of two pages */ + if (nr_pages < 2) + nr_pages = 2; + + cpu_buffer->nr_pages_to_update = nr_pages; + + /* Include the reader page */ + nr_pages++; + + /* Allocate the new size buffer */ + INIT_LIST_HEAD(&cpu_buffer->new_pages); + if (__rb_allocate_pages(cpu_buffer, nr_pages, + &cpu_buffer->new_pages)) { + /* not enough memory for new pages */ + err = -ENOMEM; + goto error; + } + } + + for_each_buffer_cpu(buffer, cpu) { + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + continue; + + cpu_buffer = buffer->buffers[cpu]; + + /* Clear the head bit to make the link list normal to read */ + rb_head_page_deactivate(cpu_buffer); + + /* Now walk the list and free all the old sub buffers */ + list_for_each_entry_safe(bpage, tmp, cpu_buffer->pages, list) { + list_del_init(&bpage->list); + free_buffer_page(bpage); + } + /* The above loop stopped an the last page needing to be freed */ + bpage = list_entry(cpu_buffer->pages, struct buffer_page, list); + free_buffer_page(bpage); + + /* Free the current reader page */ + free_buffer_page(cpu_buffer->reader_page); + + /* One page was allocated for the reader page */ + cpu_buffer->reader_page = list_entry(cpu_buffer->new_pages.next, + struct buffer_page, list); + list_del_init(&cpu_buffer->reader_page->list); + + /* The cpu_buffer pages are a link list with no head */ + cpu_buffer->pages = cpu_buffer->new_pages.next; + cpu_buffer->new_pages.next->prev = cpu_buffer->new_pages.prev; + cpu_buffer->new_pages.prev->next = cpu_buffer->new_pages.next; + + /* Clear the new_pages list */ + INIT_LIST_HEAD(&cpu_buffer->new_pages); + + cpu_buffer->head_page + = list_entry(cpu_buffer->pages, struct buffer_page, list); + cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page; + + cpu_buffer->nr_pages = cpu_buffer->nr_pages_to_update; + cpu_buffer->nr_pages_to_update = 0; + + free_pages((unsigned long)cpu_buffer->free_page, old_order); + cpu_buffer->free_page = NULL; + + rb_head_page_activate(cpu_buffer); + + rb_check_pages(cpu_buffer); + } + + atomic_dec(&buffer->record_disabled); + mutex_unlock(&buffer->mutex); + + return 0; + +error: + buffer->subbuf_order = old_order; + buffer->subbuf_size = old_size; + + atomic_dec(&buffer->record_disabled); + mutex_unlock(&buffer->mutex); + + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + + if (!cpu_buffer->nr_pages_to_update) + continue; + + list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages, list) { + list_del_init(&bpage->list); + free_buffer_page(bpage); + } + } + + return err; +} +EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_set); + /* * We only allocate new buffers, never free them if the CPU goes down. * If we were to free the buffer, then the user would lose any trace that was in diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index aef34673d7..008187ebd7 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -104,10 +104,11 @@ static enum event_status read_event(int cpu) static enum event_status read_page(int cpu) { + struct buffer_data_read_page *bpage; struct ring_buffer_event *event; struct rb_page *rpage; unsigned long commit; - void *bpage; + int page_size; int *entry; int ret; int inc; @@ -117,14 +118,15 @@ static enum event_status read_page(int cpu) if (IS_ERR(bpage)) return EVENT_DROPPED; - ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1); + page_size = ring_buffer_subbuf_size_get(buffer); + ret = ring_buffer_read_page(buffer, bpage, page_size, cpu, 1); if (ret >= 0) { - rpage = bpage; + rpage = ring_buffer_read_page_data(bpage); /* The commit may have missed event flags set, clear them */ commit = local_read(&rpage->commit) & 0xfffff; for (i = 0; i < commit && !test_error ; i += inc) { - if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) { + if (i >= (page_size - offsetof(struct rb_page, data))) { TEST_ERROR(); break; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e03960f9f4..d390fea3a6 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1264,10 +1264,17 @@ static void set_buffer_entries(struct array_buffer *buf, unsigned long val); int tracing_alloc_snapshot_instance(struct trace_array *tr) { + int order; int ret; if (!tr->allocated_snapshot) { + /* Make the snapshot buffer have the same order as main buffer */ + order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer); + ret = ring_buffer_subbuf_order_set(tr->max_buffer.buffer, order); + if (ret < 0) + return ret; + /* allocate spare buffer */ ret = resize_buffer_duplicate_size(&tr->max_buffer, &tr->array_buffer, RING_BUFFER_ALL_CPUS); @@ -1287,6 +1294,7 @@ static void free_snapshot(struct trace_array *tr) * The max_tr ring buffer has some state (e.g. ring->clock) and * we want preserve it. */ + ring_buffer_subbuf_order_set(tr->max_buffer.buffer, 0); ring_buffer_resize(tr->max_buffer.buffer, 1, RING_BUFFER_ALL_CPUS); set_buffer_entries(&tr->max_buffer, 1); tracing_reset_online_cpus(&tr->max_buffer); @@ -1525,7 +1533,7 @@ void disable_trace_on_warning(void) bool tracer_tracing_is_on(struct trace_array *tr) { if (tr->array_buffer.buffer) - return ring_buffer_record_is_on(tr->array_buffer.buffer); + return ring_buffer_record_is_set_on(tr->array_buffer.buffer); return !tr->buffer_disabled; } @@ -1947,15 +1955,36 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) #endif /* CONFIG_TRACER_MAX_TRACE */ +struct pipe_wait { + struct trace_iterator *iter; + int wait_index; +}; + +static bool wait_pipe_cond(void *data) +{ + struct pipe_wait *pwait = data; + struct trace_iterator *iter = pwait->iter; + + if (atomic_read_acquire(&iter->wait_index) != pwait->wait_index) + return true; + + return iter->closed; +} + static int wait_on_pipe(struct trace_iterator *iter, int full) { + struct pipe_wait pwait; int ret; /* Iterators are static, they should be filled or empty */ if (trace_buffer_iter(iter, iter->cpu_file)) return 0; - ret = ring_buffer_wait(iter->array_buffer->buffer, iter->cpu_file, full); + pwait.wait_index = atomic_read_acquire(&iter->wait_index); + pwait.iter = iter; + + ret = ring_buffer_wait(iter->array_buffer->buffer, iter->cpu_file, full, + wait_pipe_cond, &pwait); #ifdef CONFIG_TRACER_MAX_TRACE /* @@ -3781,7 +3810,7 @@ static bool trace_safe_str(struct trace_iterator *iter, const char *str, /* OK if part of the temp seq buffer */ if ((addr >= (unsigned long)iter->tmp_seq.buffer) && - (addr < (unsigned long)iter->tmp_seq.buffer + PAGE_SIZE)) + (addr < (unsigned long)iter->tmp_seq.buffer + TRACE_SEQ_BUFFER_SIZE)) return true; /* Core rodata can not be freed */ @@ -5046,7 +5075,7 @@ static int tracing_release(struct inode *inode, struct file *file) return 0; } -static int tracing_release_generic_tr(struct inode *inode, struct file *file) +int tracing_release_generic_tr(struct inode *inode, struct file *file) { struct trace_array *tr = inode->i_private; @@ -6948,8 +6977,8 @@ waitagain: goto out; } - if (cnt >= PAGE_SIZE) - cnt = PAGE_SIZE - 1; + if (cnt >= TRACE_SEQ_BUFFER_SIZE) + cnt = TRACE_SEQ_BUFFER_SIZE - 1; /* reset all but tr, trace, and overruns */ trace_iterator_reset(iter); @@ -7285,6 +7314,8 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp) return 0; } +#define TRACE_MARKER_MAX_SIZE 4096 + static ssize_t tracing_mark_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *fpos) @@ -7294,8 +7325,9 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, enum event_trigger_type tt = ETT_NONE; struct trace_buffer *buffer; struct print_entry *entry; + int meta_size; ssize_t written; - int size; + size_t size; int len; /* Used in tracing_mark_raw_write() as well */ @@ -7308,12 +7340,15 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, if (!(tr->trace_flags & TRACE_ITER_MARKERS)) return -EINVAL; - if (cnt > TRACE_BUF_SIZE) - cnt = TRACE_BUF_SIZE; + if ((ssize_t)cnt < 0) + return -EINVAL; - BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE); + if (cnt > TRACE_MARKER_MAX_SIZE) + cnt = TRACE_MARKER_MAX_SIZE; - size = sizeof(*entry) + cnt + 2; /* add '\0' and possible '\n' */ + meta_size = sizeof(*entry) + 2; /* add '\0' and possible '\n' */ + again: + size = cnt + meta_size; /* If less than "<faulted>", then make sure we can still add that */ if (cnt < FAULTED_SIZE) @@ -7322,9 +7357,25 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, buffer = tr->array_buffer.buffer; event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, tracing_gen_ctx()); - if (unlikely(!event)) + if (unlikely(!event)) { + /* + * If the size was greater than what was allowed, then + * make it smaller and try again. + */ + if (size > ring_buffer_max_event_size(buffer)) { + /* cnt < FAULTED size should never be bigger than max */ + if (WARN_ON_ONCE(cnt < FAULTED_SIZE)) + return -EBADF; + cnt = ring_buffer_max_event_size(buffer) - meta_size; + /* The above should only happen once */ + if (WARN_ON_ONCE(cnt + meta_size == size)) + return -EBADF; + goto again; + } + /* Ring buffer disabled, return as if not open for write */ return -EBADF; + } entry = ring_buffer_event_data(event); entry->ip = _THIS_IP_; @@ -7359,9 +7410,6 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, return written; } -/* Limit it for now to 3K (including tag) */ -#define RAW_DATA_MAX_SIZE (1024*3) - static ssize_t tracing_mark_raw_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *fpos) @@ -7383,19 +7431,18 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf, return -EINVAL; /* The marker must at least have a tag id */ - if (cnt < sizeof(unsigned int) || cnt > RAW_DATA_MAX_SIZE) + if (cnt < sizeof(unsigned int)) return -EINVAL; - if (cnt > TRACE_BUF_SIZE) - cnt = TRACE_BUF_SIZE; - - BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE); - size = sizeof(*entry) + cnt; if (cnt < FAULT_SIZE_ID) size += FAULT_SIZE_ID - cnt; buffer = tr->array_buffer.buffer; + + if (size > ring_buffer_max_event_size(buffer)) + return -EINVAL; + event = __trace_buffer_lock_reserve(buffer, TRACE_RAW_DATA, size, tracing_gen_ctx()); if (!event) @@ -7580,6 +7627,7 @@ struct ftrace_buffer_info { struct trace_iterator iter; void *spare; unsigned int spare_cpu; + unsigned int spare_size; unsigned int read; }; @@ -8284,6 +8332,8 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, { struct ftrace_buffer_info *info = filp->private_data; struct trace_iterator *iter = &info->iter; + void *trace_data; + int page_size; ssize_t ret = 0; ssize_t size; @@ -8295,6 +8345,17 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, return -EBUSY; #endif + page_size = ring_buffer_subbuf_size_get(iter->array_buffer->buffer); + + /* Make sure the spare matches the current sub buffer size */ + if (info->spare) { + if (page_size != info->spare_size) { + ring_buffer_free_read_page(iter->array_buffer->buffer, + info->spare_cpu, info->spare); + info->spare = NULL; + } + } + if (!info->spare) { info->spare = ring_buffer_alloc_read_page(iter->array_buffer->buffer, iter->cpu_file); @@ -8303,19 +8364,20 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, info->spare = NULL; } else { info->spare_cpu = iter->cpu_file; + info->spare_size = page_size; } } if (!info->spare) return ret; /* Do we have previous read data to read? */ - if (info->read < PAGE_SIZE) + if (info->read < page_size) goto read; again: trace_access_lock(iter->cpu_file); ret = ring_buffer_read_page(iter->array_buffer->buffer, - &info->spare, + info->spare, count, iter->cpu_file, 0); trace_access_unlock(iter->cpu_file); @@ -8336,11 +8398,11 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, info->read = 0; read: - size = PAGE_SIZE - info->read; + size = page_size - info->read; if (size > count) size = count; - - ret = copy_to_user(ubuf, info->spare + info->read, size); + trace_data = ring_buffer_read_page_data(info->spare); + ret = copy_to_user(ubuf, trace_data + info->read, size); if (ret == size) return -EFAULT; @@ -8357,9 +8419,9 @@ static int tracing_buffers_flush(struct file *file, fl_owner_t id) struct ftrace_buffer_info *info = file->private_data; struct trace_iterator *iter = &info->iter; - iter->wait_index++; + iter->closed = true; /* Make sure the waiters see the new wait_index */ - smp_wmb(); + (void)atomic_fetch_inc_release(&iter->wait_index); ring_buffer_wake_waiters(iter->array_buffer->buffer, iter->cpu_file); @@ -8459,6 +8521,8 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, .spd_release = buffer_spd_release, }; struct buffer_ref *ref; + bool woken = false; + int page_size; int entries, i; ssize_t ret = 0; @@ -8467,13 +8531,14 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, return -EBUSY; #endif - if (*ppos & (PAGE_SIZE - 1)) + page_size = ring_buffer_subbuf_size_get(iter->array_buffer->buffer); + if (*ppos & (page_size - 1)) return -EINVAL; - if (len & (PAGE_SIZE - 1)) { - if (len < PAGE_SIZE) + if (len & (page_size - 1)) { + if (len < page_size) return -EINVAL; - len &= PAGE_MASK; + len &= (~(page_size - 1)); } if (splice_grow_spd(pipe, &spd)) @@ -8483,7 +8548,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, trace_access_lock(iter->cpu_file); entries = ring_buffer_entries_cpu(iter->array_buffer->buffer, iter->cpu_file); - for (i = 0; i < spd.nr_pages_max && len && entries; i++, len -= PAGE_SIZE) { + for (i = 0; i < spd.nr_pages_max && len && entries; i++, len -= page_size) { struct page *page; int r; @@ -8504,7 +8569,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, } ref->cpu = iter->cpu_file; - r = ring_buffer_read_page(ref->buffer, &ref->page, + r = ring_buffer_read_page(ref->buffer, ref->page, len, iter->cpu_file, 1); if (r < 0) { ring_buffer_free_read_page(ref->buffer, ref->cpu, @@ -8513,14 +8578,14 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, break; } - page = virt_to_page(ref->page); + page = virt_to_page(ring_buffer_read_page_data(ref->page)); spd.pages[i] = page; - spd.partial[i].len = PAGE_SIZE; + spd.partial[i].len = page_size; spd.partial[i].offset = 0; spd.partial[i].private = (unsigned long)ref; spd.nr_pages++; - *ppos += PAGE_SIZE; + *ppos += page_size; entries = ring_buffer_entries_cpu(iter->array_buffer->buffer, iter->cpu_file); } @@ -8530,17 +8595,17 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, /* did we read anything? */ if (!spd.nr_pages) { - long wait_index; if (ret) goto out; + if (woken) + goto out; + ret = -EAGAIN; if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) goto out; - wait_index = READ_ONCE(iter->wait_index); - ret = wait_on_pipe(iter, iter->snapshot ? 0 : iter->tr->buffer_percent); if (ret) goto out; @@ -8549,10 +8614,8 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, if (!tracer_tracing_is_on(iter->tr)) goto out; - /* Make sure we see the new wait_index */ - smp_rmb(); - if (wait_index != iter->wait_index) - goto out; + /* Iterate one more time to collect any new data then exit */ + woken = true; goto again; } @@ -8575,9 +8638,8 @@ static long tracing_buffers_ioctl(struct file *file, unsigned int cmd, unsigned mutex_lock(&trace_types_lock); - iter->wait_index++; /* Make sure the waiters see the new wait_index */ - smp_wmb(); + (void)atomic_fetch_inc_release(&iter->wait_index); ring_buffer_wake_waiters(iter->array_buffer->buffer, iter->cpu_file); @@ -9365,6 +9427,103 @@ static const struct file_operations buffer_percent_fops = { .llseek = default_llseek, }; +static ssize_t +buffer_subbuf_size_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + size_t size; + char buf[64]; + int order; + int r; + + order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer); + size = (PAGE_SIZE << order) / 1024; + + r = sprintf(buf, "%zd\n", size); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t +buffer_subbuf_size_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + unsigned long val; + int old_order; + int order; + int pages; + int ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + val *= 1024; /* value passed in is in KB */ + + pages = DIV_ROUND_UP(val, PAGE_SIZE); + order = fls(pages - 1); + + /* limit between 1 and 128 system pages */ + if (order < 0 || order > 7) + return -EINVAL; + + /* Do not allow tracing while changing the order of the ring buffer */ + tracing_stop_tr(tr); + + old_order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer); + if (old_order == order) + goto out; + + ret = ring_buffer_subbuf_order_set(tr->array_buffer.buffer, order); + if (ret) + goto out; + +#ifdef CONFIG_TRACER_MAX_TRACE + + if (!tr->allocated_snapshot) + goto out_max; + + ret = ring_buffer_subbuf_order_set(tr->max_buffer.buffer, order); + if (ret) { + /* Put back the old order */ + cnt = ring_buffer_subbuf_order_set(tr->array_buffer.buffer, old_order); + if (WARN_ON_ONCE(cnt)) { + /* + * AARGH! We are left with different orders! + * The max buffer is our "snapshot" buffer. + * When a tracer needs a snapshot (one of the + * latency tracers), it swaps the max buffer + * with the saved snap shot. We succeeded to + * update the order of the main buffer, but failed to + * update the order of the max buffer. But when we tried + * to reset the main buffer to the original size, we + * failed there too. This is very unlikely to + * happen, but if it does, warn and kill all + * tracing. + */ + tracing_disabled = 1; + } + goto out; + } + out_max: +#endif + (*ppos)++; + out: + if (ret) + cnt = ret; + tracing_start_tr(tr); + return cnt; +} + +static const struct file_operations buffer_subbuf_size_fops = { + .open = tracing_open_generic_tr, + .read = buffer_subbuf_size_read, + .write = buffer_subbuf_size_write, + .release = tracing_release_generic_tr, + .llseek = default_llseek, +}; + static struct dentry *trace_instance_dir; static void @@ -9515,7 +9674,8 @@ static int trace_array_create_dir(struct trace_array *tr) return ret; } -static struct trace_array *trace_array_create(const char *name) +static struct trace_array * +trace_array_create_systems(const char *name, const char *systems) { struct trace_array *tr; int ret; @@ -9535,6 +9695,12 @@ static struct trace_array *trace_array_create(const char *name) if (!zalloc_cpumask_var(&tr->pipe_cpumask, GFP_KERNEL)) goto out_free_tr; + if (systems) { + tr->system_names = kstrdup_const(systems, GFP_KERNEL); + if (!tr->system_names) + goto out_free_tr; + } + tr->trace_flags = global_trace.trace_flags & ~ZEROED_TRACE_FLAGS; cpumask_copy(tr->tracing_cpumask, cpu_all_mask); @@ -9581,12 +9747,18 @@ static struct trace_array *trace_array_create(const char *name) free_trace_buffers(tr); free_cpumask_var(tr->pipe_cpumask); free_cpumask_var(tr->tracing_cpumask); + kfree_const(tr->system_names); kfree(tr->name); kfree(tr); return ERR_PTR(ret); } +static struct trace_array *trace_array_create(const char *name) +{ + return trace_array_create_systems(name, NULL); +} + static int instance_mkdir(const char *name) { struct trace_array *tr; @@ -9612,6 +9784,7 @@ out_unlock: /** * trace_array_get_by_name - Create/Lookup a trace array, given its name. * @name: The name of the trace array to be looked up/created. + * @systems: A list of systems to create event directories for (NULL for all) * * Returns pointer to trace array with given name. * NULL, if it cannot be created. @@ -9625,7 +9798,7 @@ out_unlock: * trace_array_put() is called, user space can not delete it. * */ -struct trace_array *trace_array_get_by_name(const char *name) +struct trace_array *trace_array_get_by_name(const char *name, const char *systems) { struct trace_array *tr; @@ -9637,7 +9810,7 @@ struct trace_array *trace_array_get_by_name(const char *name) goto out_unlock; } - tr = trace_array_create(name); + tr = trace_array_create_systems(name, systems); if (IS_ERR(tr)) tr = NULL; @@ -9684,6 +9857,7 @@ static int __remove_instance(struct trace_array *tr) free_cpumask_var(tr->pipe_cpumask); free_cpumask_var(tr->tracing_cpumask); + kfree_const(tr->system_names); kfree(tr->name); kfree(tr); @@ -9816,6 +9990,9 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) trace_create_file("buffer_percent", TRACE_MODE_WRITE, d_tracer, tr, &buffer_percent_fops); + trace_create_file("buffer_subbuf_size_kb", TRACE_MODE_WRITE, d_tracer, + tr, &buffer_subbuf_size_fops); + create_trace_options_dir(tr); #ifdef CONFIG_TRACER_MAX_TRACE @@ -10402,7 +10579,7 @@ __init static void enable_instances(void) if (IS_ENABLED(CONFIG_TRACER_MAX_TRACE)) do_allocate_snapshot(tok); - tr = trace_array_get_by_name(tok); + tr = trace_array_get_by_name(tok, NULL); if (!tr) { pr_warn("Failed to create instance buffer %s\n", curr_str); continue; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 0489e72c81..00f873910c 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -377,6 +377,7 @@ struct trace_array { unsigned char trace_flags_index[TRACE_FLAGS_MAX_SIZE]; unsigned int flags; raw_spinlock_t start_lock; + const char *system_names; struct list_head err_log; struct dentry *dir; struct dentry *options; @@ -615,6 +616,7 @@ void tracing_reset_all_online_cpus(void); void tracing_reset_all_online_cpus_unlocked(void); int tracing_open_generic(struct inode *inode, struct file *filp); int tracing_open_generic_tr(struct inode *inode, struct file *filp); +int tracing_release_generic_tr(struct inode *inode, struct file *file); int tracing_open_file_tr(struct inode *inode, struct file *filp); int tracing_release_file_tr(struct inode *inode, struct file *filp); int tracing_single_release_file_tr(struct inode *inode, struct file *filp); diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 7ccc7a8e15..dbe29b4c6a 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -633,7 +633,7 @@ trace_boot_init_instances(struct xbc_node *node) if (!p || *p == '\0') continue; - tr = trace_array_get_by_name(p); + tr = trace_array_get_by_name(p, NULL); if (!tr) { pr_err("Failed to get trace instance %s\n", p); continue; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f29e815ca5..52f75c36bb 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1670,6 +1670,7 @@ static int trace_format_open(struct inode *inode, struct file *file) return 0; } +#ifdef CONFIG_PERF_EVENTS static ssize_t event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { @@ -1684,6 +1685,7 @@ event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) return simple_read_from_buffer(ubuf, cnt, ppos, buf, len); } +#endif static ssize_t event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, @@ -1893,9 +1895,9 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, } static ssize_t -show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) +show_header_page_file(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - int (*func)(struct trace_seq *s) = filp->private_data; + struct trace_array *tr = filp->private_data; struct trace_seq *s; int r; @@ -1908,7 +1910,31 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) trace_seq_init(s); - func(s); + ring_buffer_print_page_header(tr->array_buffer.buffer, s); + r = simple_read_from_buffer(ubuf, cnt, ppos, + s->buffer, trace_seq_used(s)); + + kfree(s); + + return r; +} + +static ssize_t +show_header_event_file(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) +{ + struct trace_seq *s; + int r; + + if (*ppos) + return 0; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + trace_seq_init(s); + + ring_buffer_print_entry_header(s); r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, trace_seq_used(s)); @@ -2128,10 +2154,12 @@ static const struct file_operations ftrace_event_format_fops = { .release = seq_release, }; +#ifdef CONFIG_PERF_EVENTS static const struct file_operations ftrace_event_id_fops = { .read = event_id_read, .llseek = default_llseek, }; +#endif static const struct file_operations ftrace_event_filter_fops = { .open = tracing_open_file_tr, @@ -2165,10 +2193,18 @@ static const struct file_operations ftrace_tr_enable_fops = { .release = subsystem_release, }; -static const struct file_operations ftrace_show_header_fops = { - .open = tracing_open_generic, - .read = show_header, +static const struct file_operations ftrace_show_header_page_fops = { + .open = tracing_open_generic_tr, + .read = show_header_page_file, + .llseek = default_llseek, + .release = tracing_release_generic_tr, +}; + +static const struct file_operations ftrace_show_header_event_fops = { + .open = tracing_open_generic_tr, + .read = show_header_event_file, .llseek = default_llseek, + .release = tracing_release_generic_tr, }; static int @@ -2896,6 +2932,27 @@ void trace_event_eval_update(struct trace_eval_map **map, int len) up_write(&trace_event_sem); } +static bool event_in_systems(struct trace_event_call *call, + const char *systems) +{ + const char *system; + const char *p; + + if (!systems) + return true; + + system = call->class->system; + p = strstr(systems, system); + if (!p) + return false; + + if (p != systems && !isspace(*(p - 1)) && *(p - 1) != ',') + return false; + + p += strlen(system); + return !*p || isspace(*p) || *p == ','; +} + static struct trace_event_file * trace_create_new_event(struct trace_event_call *call, struct trace_array *tr) @@ -2905,9 +2962,12 @@ trace_create_new_event(struct trace_event_call *call, struct trace_event_file *file; unsigned int first; + if (!event_in_systems(call, tr->system_names)) + return NULL; + file = kmem_cache_alloc(file_cachep, GFP_TRACE); if (!file) - return NULL; + return ERR_PTR(-ENOMEM); pid_list = rcu_dereference_protected(tr->filtered_pids, lockdep_is_held(&event_mutex)); @@ -2972,8 +3032,17 @@ __trace_add_new_event(struct trace_event_call *call, struct trace_array *tr) struct trace_event_file *file; file = trace_create_new_event(call, tr); + /* + * trace_create_new_event() returns ERR_PTR(-ENOMEM) if failed + * allocation, or NULL if the event is not part of the tr->system_names. + * When the event is not part of the tr->system_names, return zero, not + * an error. + */ if (!file) - return -ENOMEM; + return 0; + + if (IS_ERR(file)) + return PTR_ERR(file); if (eventdir_initialized) return event_create_dir(tr->event_dir, file); @@ -3012,8 +3081,17 @@ __trace_early_add_new_event(struct trace_event_call *call, int ret; file = trace_create_new_event(call, tr); + /* + * trace_create_new_event() returns ERR_PTR(-ENOMEM) if failed + * allocation, or NULL if the event is not part of the tr->system_names. + * When the event is not part of the tr->system_names, return zero, not + * an error. + */ if (!file) - return -ENOMEM; + return 0; + + if (IS_ERR(file)) + return PTR_ERR(file); ret = event_define_fields(call); if (ret) @@ -3752,17 +3830,16 @@ static int events_callback(const char *name, umode_t *mode, void **data, return 1; } - if (strcmp(name, "header_page") == 0) - *data = ring_buffer_print_page_header; - - else if (strcmp(name, "header_event") == 0) - *data = ring_buffer_print_entry_header; + if (strcmp(name, "header_page") == 0) { + *mode = TRACE_MODE_READ; + *fops = &ftrace_show_header_page_fops; - else + } else if (strcmp(name, "header_event") == 0) { + *mode = TRACE_MODE_READ; + *fops = &ftrace_show_header_event_fops; + } else return 0; - *mode = TRACE_MODE_READ; - *fops = &ftrace_show_header_fops; return 1; } diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 5ecf3c8bde..6ece1308d3 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -4805,36 +4805,35 @@ static int parse_actions(struct hist_trigger_data *hist_data) int len; for (i = 0; i < hist_data->attrs->n_actions; i++) { + enum handler_id hid = 0; + char *action_str; + str = hist_data->attrs->action_str[i]; - if ((len = str_has_prefix(str, "onmatch("))) { - char *action_str = str + len; + if ((len = str_has_prefix(str, "onmatch("))) + hid = HANDLER_ONMATCH; + else if ((len = str_has_prefix(str, "onmax("))) + hid = HANDLER_ONMAX; + else if ((len = str_has_prefix(str, "onchange("))) + hid = HANDLER_ONCHANGE; - data = onmatch_parse(tr, action_str); - if (IS_ERR(data)) { - ret = PTR_ERR(data); - break; - } - } else if ((len = str_has_prefix(str, "onmax("))) { - char *action_str = str + len; + action_str = str + len; - data = track_data_parse(hist_data, action_str, - HANDLER_ONMAX); - if (IS_ERR(data)) { - ret = PTR_ERR(data); - break; - } - } else if ((len = str_has_prefix(str, "onchange("))) { - char *action_str = str + len; + switch (hid) { + case HANDLER_ONMATCH: + data = onmatch_parse(tr, action_str); + break; + case HANDLER_ONMAX: + case HANDLER_ONCHANGE: + data = track_data_parse(hist_data, action_str, hid); + break; + default: + data = ERR_PTR(-EINVAL); + break; + } - data = track_data_parse(hist_data, action_str, - HANDLER_ONCHANGE); - if (IS_ERR(data)) { - ret = PTR_ERR(data); - break; - } - } else { - ret = -EINVAL; + if (IS_ERR(data)) { + ret = PTR_ERR(data); break; } diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 9365ce4074..e76f5e1efd 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -2177,14 +2177,12 @@ static int user_events_open(struct inode *node, struct file *file) static ssize_t user_events_write(struct file *file, const char __user *ubuf, size_t count, loff_t *ppos) { - struct iovec iov; struct iov_iter i; if (unlikely(*ppos != 0)) return -EFAULT; - if (unlikely(import_single_range(ITER_SOURCE, (char __user *)ubuf, - count, &iov, &i))) + if (unlikely(import_ubuf(ITER_SOURCE, (char __user *)ubuf, count, &i))) return -EFAULT; return user_events_write_core(file, &i); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 52f8b537dd..c4c6e0e006 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -487,8 +487,8 @@ static int __register_trace_kprobe(struct trace_kprobe *tk) return -EINVAL; if (within_notrace_func(tk)) { - pr_warn("Could not probe notrace function %s\n", - trace_kprobe_symbol(tk)); + pr_warn("Could not probe notrace function %ps\n", + (void *)trace_kprobe_address(tk)); return -EINVAL; } diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 3e7fa44dc2..d8b302d010 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -1587,12 +1587,11 @@ static enum print_line_t trace_print_print(struct trace_iterator *iter, { struct print_entry *field; struct trace_seq *s = &iter->seq; - int max = iter->ent_size - offsetof(struct print_entry, buf); trace_assign_type(field, iter->ent); seq_print_ip_sym(s, field->ip, flags); - trace_seq_printf(s, ": %.*s", max, field->buf); + trace_seq_printf(s, ": %s", field->buf); return trace_handle_return(s); } @@ -1601,11 +1600,10 @@ static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags, struct trace_event *event) { struct print_entry *field; - int max = iter->ent_size - offsetof(struct print_entry, buf); trace_assign_type(field, iter->ent); - trace_seq_printf(&iter->seq, "# %lx %.*s", field->ip, max, field->buf); + trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf); return trace_handle_return(&iter->seq); } diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c index 7be97229dd..c158d65a8a 100644 --- a/kernel/trace/trace_seq.c +++ b/kernel/trace/trace_seq.c @@ -13,9 +13,6 @@ * trace_seq_init() more than once to reset the trace_seq to start * from scratch. * - * The buffer size is currently PAGE_SIZE, although it may become dynamic - * in the future. - * * A write to the buffer will either succeed or fail. That is, unlike * sprintf() there will not be a partial write (well it may write into * the buffer but it wont update the pointers). This allows users to diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 99c051de41..a84b85d8aa 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -151,7 +151,7 @@ fetch_store_string(unsigned long addr, void *dest, void *base) return -ENOMEM; if (addr == FETCH_TOKEN_COMM) - ret = strlcpy(dst, current->comm, maxlen); + ret = strscpy(dst, current->comm, maxlen); else ret = strncpy_from_user(dst, src, maxlen); if (ret >= 0) { |