diff options
Diffstat (limited to 'kernel/trace')
-rw-r--r-- | kernel/trace/ring_buffer.c | 195 | ||||
-rw-r--r-- | kernel/trace/trace.c | 78 | ||||
-rw-r--r-- | kernel/trace/trace_event_perf.c | 3 | ||||
-rw-r--r-- | kernel/trace/tracing_map.c | 7 |
4 files changed, 159 insertions, 124 deletions
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 041b91c2b..2df8e13a2 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -415,7 +415,6 @@ struct rb_irq_work { struct irq_work work; wait_queue_head_t waiters; wait_queue_head_t full_waiters; - long wait_index; bool waiters_pending; bool full_waiters_pending; bool wakeup_full; @@ -832,8 +831,19 @@ static void rb_wake_up_waiters(struct irq_work *work) wake_up_all(&rbwork->waiters); if (rbwork->full_waiters_pending || rbwork->wakeup_full) { + /* Only cpu_buffer sets the above flags */ + struct ring_buffer_per_cpu *cpu_buffer = + container_of(rbwork, struct ring_buffer_per_cpu, irq_work); + + /* Called from interrupt context */ + raw_spin_lock(&cpu_buffer->reader_lock); rbwork->wakeup_full = false; rbwork->full_waiters_pending = false; + + /* Waking up all waiters, they will reset the shortest full */ + cpu_buffer->shortest_full = 0; + raw_spin_unlock(&cpu_buffer->reader_lock); + wake_up_all(&rbwork->full_waiters); } } @@ -862,14 +872,41 @@ void ring_buffer_wake_waiters(struct trace_buffer *buffer, int cpu) rbwork = &cpu_buffer->irq_work; } - rbwork->wait_index++; - /* make sure the waiters see the new index */ - smp_wmb(); - /* This can be called in any context */ irq_work_queue(&rbwork->work); } +static bool rb_watermark_hit(struct trace_buffer *buffer, int cpu, int full) +{ + struct ring_buffer_per_cpu *cpu_buffer; + bool ret = false; + + /* Reads of all CPUs always waits for any data */ + if (cpu == RING_BUFFER_ALL_CPUS) + return !ring_buffer_empty(buffer); + + cpu_buffer = buffer->buffers[cpu]; + + if (!ring_buffer_empty_cpu(buffer, cpu)) { + unsigned long flags; + bool pagebusy; + + if (!full) + return true; + + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page; + ret = !pagebusy && full_hit(buffer, cpu, full); + + if (!ret && (!cpu_buffer->shortest_full || + cpu_buffer->shortest_full > full)) { + cpu_buffer->shortest_full = full; + } + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + } + return ret; +} + /** * ring_buffer_wait - wait for input to the ring buffer * @buffer: buffer to wait on @@ -885,7 +922,6 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full) struct ring_buffer_per_cpu *cpu_buffer; DEFINE_WAIT(wait); struct rb_irq_work *work; - long wait_index; int ret = 0; /* @@ -904,81 +940,54 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full) work = &cpu_buffer->irq_work; } - wait_index = READ_ONCE(work->wait_index); - - while (true) { - if (full) - prepare_to_wait(&work->full_waiters, &wait, TASK_INTERRUPTIBLE); - else - prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE); - - /* - * The events can happen in critical sections where - * checking a work queue can cause deadlocks. - * After adding a task to the queue, this flag is set - * only to notify events to try to wake up the queue - * using irq_work. - * - * We don't clear it even if the buffer is no longer - * empty. The flag only causes the next event to run - * irq_work to do the work queue wake up. The worse - * that can happen if we race with !trace_empty() is that - * an event will cause an irq_work to try to wake up - * an empty queue. - * - * There's no reason to protect this flag either, as - * the work queue and irq_work logic will do the necessary - * synchronization for the wake ups. The only thing - * that is necessary is that the wake up happens after - * a task has been queued. It's OK for spurious wake ups. - */ - if (full) - work->full_waiters_pending = true; - else - work->waiters_pending = true; - - if (signal_pending(current)) { - ret = -EINTR; - break; - } - - if (cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) - break; - - if (cpu != RING_BUFFER_ALL_CPUS && - !ring_buffer_empty_cpu(buffer, cpu)) { - unsigned long flags; - bool pagebusy; - bool done; - - if (!full) - break; - - raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); - pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page; - done = !pagebusy && full_hit(buffer, cpu, full); + if (full) + prepare_to_wait(&work->full_waiters, &wait, TASK_INTERRUPTIBLE); + else + prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE); - if (!cpu_buffer->shortest_full || - cpu_buffer->shortest_full > full) - cpu_buffer->shortest_full = full; - raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - if (done) - break; - } + /* + * The events can happen in critical sections where + * checking a work queue can cause deadlocks. + * After adding a task to the queue, this flag is set + * only to notify events to try to wake up the queue + * using irq_work. + * + * We don't clear it even if the buffer is no longer + * empty. The flag only causes the next event to run + * irq_work to do the work queue wake up. The worse + * that can happen if we race with !trace_empty() is that + * an event will cause an irq_work to try to wake up + * an empty queue. + * + * There's no reason to protect this flag either, as + * the work queue and irq_work logic will do the necessary + * synchronization for the wake ups. The only thing + * that is necessary is that the wake up happens after + * a task has been queued. It's OK for spurious wake ups. + */ + if (full) + work->full_waiters_pending = true; + else + work->waiters_pending = true; - schedule(); + if (rb_watermark_hit(buffer, cpu, full)) + goto out; - /* Make sure to see the new wait index */ - smp_rmb(); - if (wait_index != work->wait_index) - break; + if (signal_pending(current)) { + ret = -EINTR; + goto out; } + schedule(); + out: if (full) finish_wait(&work->full_waiters, &wait); else finish_wait(&work->waiters, &wait); + if (!ret && !rb_watermark_hit(buffer, cpu, full) && signal_pending(current)) + ret = -EINTR; + return ret; } @@ -1001,30 +1010,51 @@ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu, struct file *filp, poll_table *poll_table, int full) { struct ring_buffer_per_cpu *cpu_buffer; - struct rb_irq_work *work; + struct rb_irq_work *rbwork; if (cpu == RING_BUFFER_ALL_CPUS) { - work = &buffer->irq_work; + rbwork = &buffer->irq_work; full = 0; } else { if (!cpumask_test_cpu(cpu, buffer->cpumask)) - return -EINVAL; + return EPOLLERR; cpu_buffer = buffer->buffers[cpu]; - work = &cpu_buffer->irq_work; + rbwork = &cpu_buffer->irq_work; } if (full) { - poll_wait(filp, &work->full_waiters, poll_table); - work->full_waiters_pending = true; + unsigned long flags; + + poll_wait(filp, &rbwork->full_waiters, poll_table); + + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); if (!cpu_buffer->shortest_full || cpu_buffer->shortest_full > full) cpu_buffer->shortest_full = full; - } else { - poll_wait(filp, &work->waiters, poll_table); - work->waiters_pending = true; + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + if (full_hit(buffer, cpu, full)) + return EPOLLIN | EPOLLRDNORM; + /* + * Only allow full_waiters_pending update to be seen after + * the shortest_full is set. If the writer sees the + * full_waiters_pending flag set, it will compare the + * amount in the ring buffer to shortest_full. If the amount + * in the ring buffer is greater than the shortest_full + * percent, it will call the irq_work handler to wake up + * this list. The irq_handler will reset shortest_full + * back to zero. That's done under the reader_lock, but + * the below smp_mb() makes sure that the update to + * full_waiters_pending doesn't leak up into the above. + */ + smp_mb(); + rbwork->full_waiters_pending = true; + return 0; } + poll_wait(filp, &rbwork->waiters, poll_table); + rbwork->waiters_pending = true; + /* * There's a tight race between setting the waiters_pending and * checking if the ring buffer is empty. Once the waiters_pending bit @@ -1040,9 +1070,6 @@ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu, */ smp_mb(); - if (full) - return full_hit(buffer, cpu, full) ? EPOLLIN | EPOLLRDNORM : 0; - if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) || (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu))) return EPOLLIN | EPOLLRDNORM; @@ -4184,7 +4211,7 @@ int ring_buffer_iter_empty(struct ring_buffer_iter *iter) cpu_buffer = iter->cpu_buffer; reader = cpu_buffer->reader_page; head_page = cpu_buffer->head_page; - commit_page = cpu_buffer->commit_page; + commit_page = READ_ONCE(cpu_buffer->commit_page); commit_ts = commit_page->page->time_stamp; /* diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 548f694fc..22e1e5711 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -39,6 +39,7 @@ #include <linux/slab.h> #include <linux/ctype.h> #include <linux/init.h> +#include <linux/kmemleak.h> #include <linux/poll.h> #include <linux/nmi.h> #include <linux/fs.h> @@ -2239,7 +2240,7 @@ struct saved_cmdlines_buffer { unsigned *map_cmdline_to_pid; unsigned cmdline_num; int cmdline_idx; - char *saved_cmdlines; + char saved_cmdlines[]; }; static struct saved_cmdlines_buffer *savedcmd; @@ -2253,47 +2254,60 @@ static inline void set_cmdline(int idx, const char *cmdline) strncpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN); } -static int allocate_cmdlines_buffer(unsigned int val, - struct saved_cmdlines_buffer *s) +static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s) { + int order = get_order(sizeof(*s) + s->cmdline_num * TASK_COMM_LEN); + + kfree(s->map_cmdline_to_pid); + kmemleak_free(s); + free_pages((unsigned long)s, order); +} + +static struct saved_cmdlines_buffer *allocate_cmdlines_buffer(unsigned int val) +{ + struct saved_cmdlines_buffer *s; + struct page *page; + int orig_size, size; + int order; + + /* Figure out how much is needed to hold the given number of cmdlines */ + orig_size = sizeof(*s) + val * TASK_COMM_LEN; + order = get_order(orig_size); + size = 1 << (order + PAGE_SHIFT); + page = alloc_pages(GFP_KERNEL, order); + if (!page) + return NULL; + + s = page_address(page); + kmemleak_alloc(s, size, 1, GFP_KERNEL); + memset(s, 0, sizeof(*s)); + + /* Round up to actual allocation */ + val = (size - sizeof(*s)) / TASK_COMM_LEN; + s->cmdline_num = val; + s->map_cmdline_to_pid = kmalloc_array(val, sizeof(*s->map_cmdline_to_pid), GFP_KERNEL); - if (!s->map_cmdline_to_pid) - return -ENOMEM; - - s->saved_cmdlines = kmalloc_array(TASK_COMM_LEN, val, GFP_KERNEL); - if (!s->saved_cmdlines) { - kfree(s->map_cmdline_to_pid); - return -ENOMEM; + if (!s->map_cmdline_to_pid) { + free_saved_cmdlines_buffer(s); + return NULL; } s->cmdline_idx = 0; - s->cmdline_num = val; memset(&s->map_pid_to_cmdline, NO_CMDLINE_MAP, sizeof(s->map_pid_to_cmdline)); memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP, val * sizeof(*s->map_cmdline_to_pid)); - return 0; + return s; } static int trace_create_savedcmd(void) { - int ret; - - savedcmd = kmalloc(sizeof(*savedcmd), GFP_KERNEL); - if (!savedcmd) - return -ENOMEM; + savedcmd = allocate_cmdlines_buffer(SAVED_CMDLINES_DEFAULT); - ret = allocate_cmdlines_buffer(SAVED_CMDLINES_DEFAULT, savedcmd); - if (ret < 0) { - kfree(savedcmd); - savedcmd = NULL; - return -ENOMEM; - } - - return 0; + return savedcmd ? 0 : -ENOMEM; } int is_tracing_stopped(void) @@ -5603,26 +5617,14 @@ tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf, return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } -static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s) -{ - kfree(s->saved_cmdlines); - kfree(s->map_cmdline_to_pid); - kfree(s); -} - static int tracing_resize_saved_cmdlines(unsigned int val) { struct saved_cmdlines_buffer *s, *savedcmd_temp; - s = kmalloc(sizeof(*s), GFP_KERNEL); + s = allocate_cmdlines_buffer(val); if (!s) return -ENOMEM; - if (allocate_cmdlines_buffer(val, s) < 0) { - kfree(s); - return -ENOMEM; - } - preempt_disable(); arch_spin_lock(&trace_cmdline_lock); savedcmd_temp = savedcmd; diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 643e0b199..eb81ad523 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -400,7 +400,8 @@ void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp) BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long)); if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, - "perf buffer not large enough")) + "perf buffer not large enough, wanted %d, have %d", + size, PERF_MAX_TRACE_SIZE)) return NULL; *rctxp = rctx = perf_swevent_get_recursion_context(); diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c index 51a9d1185..d47641f97 100644 --- a/kernel/trace/tracing_map.c +++ b/kernel/trace/tracing_map.c @@ -574,7 +574,12 @@ __tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only) } memcpy(elt->key, key, map->key_size); - entry->val = elt; + /* + * Ensure the initialization is visible and + * publish the elt. + */ + smp_wmb(); + WRITE_ONCE(entry->val, elt); atomic64_inc(&map->hits); return entry->val; |