1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
|
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Tue, 12 Mar 2024 19:01:50 +0100
Subject: [PATCH 2/4] perf: Enqueue SIGTRAP always via task_work.
Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/6.8/older/patches-6.8.2-rt11.tar.xz
A signal is delivered by raising irq_work() which works from any context
including NMI. irq_work() can be delayed if the architecture does not
provide an interrupt vector. In order not to lose a signal, the signal
is injected via task_work during event_sched_out().
Instead going via irq_work, the signal could be added directly via
task_work. The signal is sent to current and can be enqueued on its
return path to userland instead of triggering irq_work. A dummy IRQ is
required in the NMI case to ensure the task_work is handled before
returning to user land. For this irq_work is used. An alternative would
be just raising an interrupt like arch_send_call_function_single_ipi().
During testing with `remove_on_exec' it become visible that the event
can be enqueued via NMI during execve(). The task_work must not be kept
because free_event() will complain later. Also the new task will not
have a sighandler installed.
Queue signal via task_work. Remove perf_event::pending_sigtrap and
and use perf_event::pending_work instead. Raise irq_work in the NMI case
for a dummy interrupt. Remove the task_work if the event is freed.
Tested-by: Marco Elver <elver@google.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Reported-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Link: https://lore.kernel.org/r/20240312180814.3373778-3-bigeasy@linutronix.de
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/perf_event.h | 3 --
kernel/events/core.c | 57 +++++++++++++++++++++++++--------------------
2 files changed, 33 insertions(+), 27 deletions(-)
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -781,7 +781,6 @@ struct perf_event {
unsigned int pending_wakeup;
unsigned int pending_kill;
unsigned int pending_disable;
- unsigned int pending_sigtrap;
unsigned long pending_addr; /* SIGTRAP */
struct irq_work pending_irq;
struct callback_head pending_task;
@@ -959,7 +958,7 @@ struct perf_event_context {
struct rcu_head rcu_head;
/*
- * Sum (event->pending_sigtrap + event->pending_work)
+ * Sum (event->pending_work + event->pending_work)
*
* The SIGTRAP is targeted at ctx->task, as such it won't do changing
* that until the signal is delivered.
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2283,21 +2283,6 @@ event_sched_out(struct perf_event *event
state = PERF_EVENT_STATE_OFF;
}
- if (event->pending_sigtrap) {
- bool dec = true;
-
- event->pending_sigtrap = 0;
- if (state != PERF_EVENT_STATE_OFF &&
- !event->pending_work) {
- event->pending_work = 1;
- dec = false;
- WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount));
- task_work_add(current, &event->pending_task, TWA_RESUME);
- }
- if (dec)
- local_dec(&event->ctx->nr_pending);
- }
-
perf_event_set_state(event, state);
if (!is_software_event(event))
@@ -6741,11 +6726,6 @@ static void __perf_pending_irq(struct pe
* Yay, we hit home and are in the context of the event.
*/
if (cpu == smp_processor_id()) {
- if (event->pending_sigtrap) {
- event->pending_sigtrap = 0;
- perf_sigtrap(event);
- local_dec(&event->ctx->nr_pending);
- }
if (event->pending_disable) {
event->pending_disable = 0;
perf_event_disable_local(event);
@@ -9592,14 +9572,23 @@ static int __perf_event_overflow(struct
if (regs)
pending_id = hash32_ptr((void *)instruction_pointer(regs)) ?: 1;
- if (!event->pending_sigtrap) {
- event->pending_sigtrap = pending_id;
+ if (!event->pending_work) {
+ event->pending_work = pending_id;
local_inc(&event->ctx->nr_pending);
- irq_work_queue(&event->pending_irq);
+ WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount));
+ task_work_add(current, &event->pending_task, TWA_RESUME);
+ /*
+ * The NMI path returns directly to userland. The
+ * irq_work is raised as a dummy interrupt to ensure
+ * regular return path to user is taken and task_work
+ * is processed.
+ */
+ if (in_nmi())
+ irq_work_queue(&event->pending_irq);
} else if (event->attr.exclude_kernel && valid_sample) {
/*
* Should not be able to return to user space without
- * consuming pending_sigtrap; with exceptions:
+ * consuming pending_work; with exceptions:
*
* 1. Where !exclude_kernel, events can overflow again
* in the kernel without returning to user space.
@@ -9609,7 +9598,7 @@ static int __perf_event_overflow(struct
* To approximate progress (with false negatives),
* check 32-bit hash of the current IP.
*/
- WARN_ON_ONCE(event->pending_sigtrap != pending_id);
+ WARN_ON_ONCE(event->pending_work != pending_id);
}
event->pending_addr = 0;
@@ -13049,6 +13038,13 @@ static void sync_child_event(struct perf
&parent_event->child_total_time_running);
}
+static bool task_work_cb_match(struct callback_head *cb, void *data)
+{
+ struct perf_event *event = container_of(cb, struct perf_event, pending_task);
+
+ return event == data;
+}
+
static void
perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
{
@@ -13088,6 +13084,17 @@ perf_event_exit_event(struct perf_event
* Kick perf_poll() for is_event_hup();
*/
perf_event_wakeup(parent_event);
+ /*
+ * Cancel pending task_work and update counters if it has not
+ * yet been delivered to userland. free_event() expects the
+ * reference counter at 1 and keeping the event around until the
+ * task return to userland will be a unexpected.
+ */
+ if (event->pending_work &&
+ task_work_cancel_match(current, task_work_cb_match, event)) {
+ put_event(event);
+ local_dec(&event->ctx->nr_pending);
+ }
free_event(event);
put_event(parent_event);
return;
|