summaryrefslogtreecommitdiffstats
path: root/kernel/trace
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/trace')
-rw-r--r--kernel/trace/ring_buffer.c195
-rw-r--r--kernel/trace/trace.c78
-rw-r--r--kernel/trace/trace_event_perf.c3
-rw-r--r--kernel/trace/tracing_map.c7
4 files changed, 159 insertions, 124 deletions
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 041b91c2b..2df8e13a2 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -415,7 +415,6 @@ struct rb_irq_work {
struct irq_work work;
wait_queue_head_t waiters;
wait_queue_head_t full_waiters;
- long wait_index;
bool waiters_pending;
bool full_waiters_pending;
bool wakeup_full;
@@ -832,8 +831,19 @@ static void rb_wake_up_waiters(struct irq_work *work)
wake_up_all(&rbwork->waiters);
if (rbwork->full_waiters_pending || rbwork->wakeup_full) {
+ /* Only cpu_buffer sets the above flags */
+ struct ring_buffer_per_cpu *cpu_buffer =
+ container_of(rbwork, struct ring_buffer_per_cpu, irq_work);
+
+ /* Called from interrupt context */
+ raw_spin_lock(&cpu_buffer->reader_lock);
rbwork->wakeup_full = false;
rbwork->full_waiters_pending = false;
+
+ /* Waking up all waiters, they will reset the shortest full */
+ cpu_buffer->shortest_full = 0;
+ raw_spin_unlock(&cpu_buffer->reader_lock);
+
wake_up_all(&rbwork->full_waiters);
}
}
@@ -862,14 +872,41 @@ void ring_buffer_wake_waiters(struct trace_buffer *buffer, int cpu)
rbwork = &cpu_buffer->irq_work;
}
- rbwork->wait_index++;
- /* make sure the waiters see the new index */
- smp_wmb();
-
/* This can be called in any context */
irq_work_queue(&rbwork->work);
}
+static bool rb_watermark_hit(struct trace_buffer *buffer, int cpu, int full)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ bool ret = false;
+
+ /* Reads of all CPUs always waits for any data */
+ if (cpu == RING_BUFFER_ALL_CPUS)
+ return !ring_buffer_empty(buffer);
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ if (!ring_buffer_empty_cpu(buffer, cpu)) {
+ unsigned long flags;
+ bool pagebusy;
+
+ if (!full)
+ return true;
+
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page;
+ ret = !pagebusy && full_hit(buffer, cpu, full);
+
+ if (!ret && (!cpu_buffer->shortest_full ||
+ cpu_buffer->shortest_full > full)) {
+ cpu_buffer->shortest_full = full;
+ }
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+ }
+ return ret;
+}
+
/**
* ring_buffer_wait - wait for input to the ring buffer
* @buffer: buffer to wait on
@@ -885,7 +922,6 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
struct ring_buffer_per_cpu *cpu_buffer;
DEFINE_WAIT(wait);
struct rb_irq_work *work;
- long wait_index;
int ret = 0;
/*
@@ -904,81 +940,54 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
work = &cpu_buffer->irq_work;
}
- wait_index = READ_ONCE(work->wait_index);
-
- while (true) {
- if (full)
- prepare_to_wait(&work->full_waiters, &wait, TASK_INTERRUPTIBLE);
- else
- prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE);
-
- /*
- * The events can happen in critical sections where
- * checking a work queue can cause deadlocks.
- * After adding a task to the queue, this flag is set
- * only to notify events to try to wake up the queue
- * using irq_work.
- *
- * We don't clear it even if the buffer is no longer
- * empty. The flag only causes the next event to run
- * irq_work to do the work queue wake up. The worse
- * that can happen if we race with !trace_empty() is that
- * an event will cause an irq_work to try to wake up
- * an empty queue.
- *
- * There's no reason to protect this flag either, as
- * the work queue and irq_work logic will do the necessary
- * synchronization for the wake ups. The only thing
- * that is necessary is that the wake up happens after
- * a task has been queued. It's OK for spurious wake ups.
- */
- if (full)
- work->full_waiters_pending = true;
- else
- work->waiters_pending = true;
-
- if (signal_pending(current)) {
- ret = -EINTR;
- break;
- }
-
- if (cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer))
- break;
-
- if (cpu != RING_BUFFER_ALL_CPUS &&
- !ring_buffer_empty_cpu(buffer, cpu)) {
- unsigned long flags;
- bool pagebusy;
- bool done;
-
- if (!full)
- break;
-
- raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
- pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page;
- done = !pagebusy && full_hit(buffer, cpu, full);
+ if (full)
+ prepare_to_wait(&work->full_waiters, &wait, TASK_INTERRUPTIBLE);
+ else
+ prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE);
- if (!cpu_buffer->shortest_full ||
- cpu_buffer->shortest_full > full)
- cpu_buffer->shortest_full = full;
- raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
- if (done)
- break;
- }
+ /*
+ * The events can happen in critical sections where
+ * checking a work queue can cause deadlocks.
+ * After adding a task to the queue, this flag is set
+ * only to notify events to try to wake up the queue
+ * using irq_work.
+ *
+ * We don't clear it even if the buffer is no longer
+ * empty. The flag only causes the next event to run
+ * irq_work to do the work queue wake up. The worse
+ * that can happen if we race with !trace_empty() is that
+ * an event will cause an irq_work to try to wake up
+ * an empty queue.
+ *
+ * There's no reason to protect this flag either, as
+ * the work queue and irq_work logic will do the necessary
+ * synchronization for the wake ups. The only thing
+ * that is necessary is that the wake up happens after
+ * a task has been queued. It's OK for spurious wake ups.
+ */
+ if (full)
+ work->full_waiters_pending = true;
+ else
+ work->waiters_pending = true;
- schedule();
+ if (rb_watermark_hit(buffer, cpu, full))
+ goto out;
- /* Make sure to see the new wait index */
- smp_rmb();
- if (wait_index != work->wait_index)
- break;
+ if (signal_pending(current)) {
+ ret = -EINTR;
+ goto out;
}
+ schedule();
+ out:
if (full)
finish_wait(&work->full_waiters, &wait);
else
finish_wait(&work->waiters, &wait);
+ if (!ret && !rb_watermark_hit(buffer, cpu, full) && signal_pending(current))
+ ret = -EINTR;
+
return ret;
}
@@ -1001,30 +1010,51 @@ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu,
struct file *filp, poll_table *poll_table, int full)
{
struct ring_buffer_per_cpu *cpu_buffer;
- struct rb_irq_work *work;
+ struct rb_irq_work *rbwork;
if (cpu == RING_BUFFER_ALL_CPUS) {
- work = &buffer->irq_work;
+ rbwork = &buffer->irq_work;
full = 0;
} else {
if (!cpumask_test_cpu(cpu, buffer->cpumask))
- return -EINVAL;
+ return EPOLLERR;
cpu_buffer = buffer->buffers[cpu];
- work = &cpu_buffer->irq_work;
+ rbwork = &cpu_buffer->irq_work;
}
if (full) {
- poll_wait(filp, &work->full_waiters, poll_table);
- work->full_waiters_pending = true;
+ unsigned long flags;
+
+ poll_wait(filp, &rbwork->full_waiters, poll_table);
+
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
if (!cpu_buffer->shortest_full ||
cpu_buffer->shortest_full > full)
cpu_buffer->shortest_full = full;
- } else {
- poll_wait(filp, &work->waiters, poll_table);
- work->waiters_pending = true;
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+ if (full_hit(buffer, cpu, full))
+ return EPOLLIN | EPOLLRDNORM;
+ /*
+ * Only allow full_waiters_pending update to be seen after
+ * the shortest_full is set. If the writer sees the
+ * full_waiters_pending flag set, it will compare the
+ * amount in the ring buffer to shortest_full. If the amount
+ * in the ring buffer is greater than the shortest_full
+ * percent, it will call the irq_work handler to wake up
+ * this list. The irq_handler will reset shortest_full
+ * back to zero. That's done under the reader_lock, but
+ * the below smp_mb() makes sure that the update to
+ * full_waiters_pending doesn't leak up into the above.
+ */
+ smp_mb();
+ rbwork->full_waiters_pending = true;
+ return 0;
}
+ poll_wait(filp, &rbwork->waiters, poll_table);
+ rbwork->waiters_pending = true;
+
/*
* There's a tight race between setting the waiters_pending and
* checking if the ring buffer is empty. Once the waiters_pending bit
@@ -1040,9 +1070,6 @@ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu,
*/
smp_mb();
- if (full)
- return full_hit(buffer, cpu, full) ? EPOLLIN | EPOLLRDNORM : 0;
-
if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) ||
(cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu)))
return EPOLLIN | EPOLLRDNORM;
@@ -4184,7 +4211,7 @@ int ring_buffer_iter_empty(struct ring_buffer_iter *iter)
cpu_buffer = iter->cpu_buffer;
reader = cpu_buffer->reader_page;
head_page = cpu_buffer->head_page;
- commit_page = cpu_buffer->commit_page;
+ commit_page = READ_ONCE(cpu_buffer->commit_page);
commit_ts = commit_page->page->time_stamp;
/*
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 548f694fc..22e1e5711 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -39,6 +39,7 @@
#include <linux/slab.h>
#include <linux/ctype.h>
#include <linux/init.h>
+#include <linux/kmemleak.h>
#include <linux/poll.h>
#include <linux/nmi.h>
#include <linux/fs.h>
@@ -2239,7 +2240,7 @@ struct saved_cmdlines_buffer {
unsigned *map_cmdline_to_pid;
unsigned cmdline_num;
int cmdline_idx;
- char *saved_cmdlines;
+ char saved_cmdlines[];
};
static struct saved_cmdlines_buffer *savedcmd;
@@ -2253,47 +2254,60 @@ static inline void set_cmdline(int idx, const char *cmdline)
strncpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN);
}
-static int allocate_cmdlines_buffer(unsigned int val,
- struct saved_cmdlines_buffer *s)
+static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s)
{
+ int order = get_order(sizeof(*s) + s->cmdline_num * TASK_COMM_LEN);
+
+ kfree(s->map_cmdline_to_pid);
+ kmemleak_free(s);
+ free_pages((unsigned long)s, order);
+}
+
+static struct saved_cmdlines_buffer *allocate_cmdlines_buffer(unsigned int val)
+{
+ struct saved_cmdlines_buffer *s;
+ struct page *page;
+ int orig_size, size;
+ int order;
+
+ /* Figure out how much is needed to hold the given number of cmdlines */
+ orig_size = sizeof(*s) + val * TASK_COMM_LEN;
+ order = get_order(orig_size);
+ size = 1 << (order + PAGE_SHIFT);
+ page = alloc_pages(GFP_KERNEL, order);
+ if (!page)
+ return NULL;
+
+ s = page_address(page);
+ kmemleak_alloc(s, size, 1, GFP_KERNEL);
+ memset(s, 0, sizeof(*s));
+
+ /* Round up to actual allocation */
+ val = (size - sizeof(*s)) / TASK_COMM_LEN;
+ s->cmdline_num = val;
+
s->map_cmdline_to_pid = kmalloc_array(val,
sizeof(*s->map_cmdline_to_pid),
GFP_KERNEL);
- if (!s->map_cmdline_to_pid)
- return -ENOMEM;
-
- s->saved_cmdlines = kmalloc_array(TASK_COMM_LEN, val, GFP_KERNEL);
- if (!s->saved_cmdlines) {
- kfree(s->map_cmdline_to_pid);
- return -ENOMEM;
+ if (!s->map_cmdline_to_pid) {
+ free_saved_cmdlines_buffer(s);
+ return NULL;
}
s->cmdline_idx = 0;
- s->cmdline_num = val;
memset(&s->map_pid_to_cmdline, NO_CMDLINE_MAP,
sizeof(s->map_pid_to_cmdline));
memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP,
val * sizeof(*s->map_cmdline_to_pid));
- return 0;
+ return s;
}
static int trace_create_savedcmd(void)
{
- int ret;
-
- savedcmd = kmalloc(sizeof(*savedcmd), GFP_KERNEL);
- if (!savedcmd)
- return -ENOMEM;
+ savedcmd = allocate_cmdlines_buffer(SAVED_CMDLINES_DEFAULT);
- ret = allocate_cmdlines_buffer(SAVED_CMDLINES_DEFAULT, savedcmd);
- if (ret < 0) {
- kfree(savedcmd);
- savedcmd = NULL;
- return -ENOMEM;
- }
-
- return 0;
+ return savedcmd ? 0 : -ENOMEM;
}
int is_tracing_stopped(void)
@@ -5603,26 +5617,14 @@ tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf,
return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
}
-static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s)
-{
- kfree(s->saved_cmdlines);
- kfree(s->map_cmdline_to_pid);
- kfree(s);
-}
-
static int tracing_resize_saved_cmdlines(unsigned int val)
{
struct saved_cmdlines_buffer *s, *savedcmd_temp;
- s = kmalloc(sizeof(*s), GFP_KERNEL);
+ s = allocate_cmdlines_buffer(val);
if (!s)
return -ENOMEM;
- if (allocate_cmdlines_buffer(val, s) < 0) {
- kfree(s);
- return -ENOMEM;
- }
-
preempt_disable();
arch_spin_lock(&trace_cmdline_lock);
savedcmd_temp = savedcmd;
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 643e0b199..eb81ad523 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -400,7 +400,8 @@ void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp)
BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long));
if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
- "perf buffer not large enough"))
+ "perf buffer not large enough, wanted %d, have %d",
+ size, PERF_MAX_TRACE_SIZE))
return NULL;
*rctxp = rctx = perf_swevent_get_recursion_context();
diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
index 51a9d1185..d47641f97 100644
--- a/kernel/trace/tracing_map.c
+++ b/kernel/trace/tracing_map.c
@@ -574,7 +574,12 @@ __tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only)
}
memcpy(elt->key, key, map->key_size);
- entry->val = elt;
+ /*
+ * Ensure the initialization is visible and
+ * publish the elt.
+ */
+ smp_wmb();
+ WRITE_ONCE(entry->val, elt);
atomic64_inc(&map->hits);
return entry->val;