1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
|
From: Anna-Maria Gleixner <anna-maria@linutronix.de>
Date: Mon, 31 Oct 2022 16:50:04 +0100
Subject: [PATCH 348/353] timers: Prepare support for PREEMPT_RT
Origin: https://git.kernel.org/cgit/linux/kernel/git/rt/linux-stable-rt.git/commit?id=f90995ffd999e658dbb9146f2436b4fde40ed9d9
Upstream commit 030dcdd197d77374879bb5603d091eee7d8aba80
When PREEMPT_RT is enabled, the soft interrupt thread can be preempted. If
the soft interrupt thread is preempted in the middle of a timer callback,
then calling del_timer_sync() can lead to two issues:
- If the caller is on a remote CPU then it has to spin wait for the timer
handler to complete. This can result in unbounded priority inversion.
- If the caller originates from the task which preempted the timer
handler on the same CPU, then spin waiting for the timer handler to
complete is never going to end.
To avoid these issues, add a new lock to the timer base which is held
around the execution of the timer callbacks. If del_timer_sync() detects
that the timer callback is currently running, it blocks on the expiry
lock. When the callback is finished, the expiry lock is dropped by the
softirq thread which wakes up the waiter and the system makes progress.
This addresses both the priority inversion and the livelock issues.
This mechanism is not used for timers which are marked IRQSAFE as for those
preemption is disabled across the callback and therefore this situation
cannot happen. The callbacks for such timers need to be individually
audited for RT compliance.
The same issue can happen in virtual machines when the vCPU which runs a
timer callback is scheduled out. If a second vCPU of the same guest calls
del_timer_sync() it will spin wait for the other vCPU to be scheduled back
in. The expiry lock mechanism would avoid that. It'd be trivial to enable
this when paravirt spinlocks are enabled in a guest, but it's not clear
whether this is an actual problem in the wild, so for now it's an RT only
mechanism.
As the softirq thread can be preempted with PREEMPT_RT=y, the SMP variant
of del_timer_sync() needs to be used on UP as well.
[ tglx: Refactored it for mainline ]
Signed-off-by: Anna-Maria Gleixner <anna-maria@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20190726185753.832418500@linutronix.de
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Daniel Wagner <wagi@monom.org>
---
kernel/time/timer.c | 130 ++++++++++++++++++++++++++++++--------------
1 file changed, 88 insertions(+), 42 deletions(-)
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 0a6d60b3e67c..b859ecf6424b 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -198,7 +198,10 @@ EXPORT_SYMBOL(jiffies_64);
struct timer_base {
raw_spinlock_t lock;
struct timer_list *running_timer;
+#ifdef CONFIG_PREEMPT_RT
spinlock_t expiry_lock;
+ atomic_t timer_waiters;
+#endif
unsigned long clk;
unsigned long next_expiry;
unsigned int cpu;
@@ -1227,8 +1230,14 @@ int del_timer(struct timer_list *timer)
}
EXPORT_SYMBOL(del_timer);
-static int __try_to_del_timer_sync(struct timer_list *timer,
- struct timer_base **basep)
+/**
+ * try_to_del_timer_sync - Try to deactivate a timer
+ * @timer: timer to delete
+ *
+ * This function tries to deactivate a timer. Upon successful (ret >= 0)
+ * exit the timer is not queued and the handler is not running on any CPU.
+ */
+int try_to_del_timer_sync(struct timer_list *timer)
{
struct timer_base *base;
unsigned long flags;
@@ -1236,7 +1245,7 @@ static int __try_to_del_timer_sync(struct timer_list *timer,
debug_assert_init(timer);
- *basep = base = lock_timer_base(timer, &flags);
+ base = lock_timer_base(timer, &flags);
if (base->running_timer != timer)
ret = detach_if_pending(timer, base, true);
@@ -1245,45 +1254,80 @@ static int __try_to_del_timer_sync(struct timer_list *timer,
return ret;
}
+EXPORT_SYMBOL(try_to_del_timer_sync);
-/**
- * try_to_del_timer_sync - Try to deactivate a timer
- * @timer: timer to delete
- *
- * This function tries to deactivate a timer. Upon successful (ret >= 0)
- * exit the timer is not queued and the handler is not running on any CPU.
- */
-int try_to_del_timer_sync(struct timer_list *timer)
+#ifdef CONFIG_PREEMPT_RT
+static __init void timer_base_init_expiry_lock(struct timer_base *base)
{
- struct timer_base *base;
+ spin_lock_init(&base->expiry_lock);
+}
- return __try_to_del_timer_sync(timer, &base);
+static inline void timer_base_lock_expiry(struct timer_base *base)
+{
+ spin_lock(&base->expiry_lock);
}
-EXPORT_SYMBOL(try_to_del_timer_sync);
-#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
-static int __del_timer_sync(struct timer_list *timer)
+static inline void timer_base_unlock_expiry(struct timer_base *base)
{
- struct timer_base *base;
- int ret;
+ spin_unlock(&base->expiry_lock);
+}
- for (;;) {
- ret = __try_to_del_timer_sync(timer, &base);
- if (ret >= 0)
- return ret;
+/*
+ * The counterpart to del_timer_wait_running().
+ *
+ * If there is a waiter for base->expiry_lock, then it was waiting for the
+ * timer callback to finish. Drop expiry_lock and reacquire it. That allows
+ * the waiter to acquire the lock and make progress.
+ */
+static void timer_sync_wait_running(struct timer_base *base)
+{
+ if (atomic_read(&base->timer_waiters)) {
+ spin_unlock(&base->expiry_lock);
+ spin_lock(&base->expiry_lock);
+ }
+}
- if (READ_ONCE(timer->flags) & TIMER_IRQSAFE)
- continue;
+/*
+ * This function is called on PREEMPT_RT kernels when the fast path
+ * deletion of a timer failed because the timer callback function was
+ * running.
+ *
+ * This prevents priority inversion, if the softirq thread on a remote CPU
+ * got preempted, and it prevents a livelock when the task which tries to
+ * delete a timer preempted the softirq thread running the timer callback
+ * function.
+ */
+static void del_timer_wait_running(struct timer_list *timer)
+{
+ u32 tf;
+
+ tf = READ_ONCE(timer->flags);
+ if (!(tf & TIMER_MIGRATING)) {
+ struct timer_base *base = get_timer_base(tf);
/*
- * When accessing the lock, timers of base are no longer expired
- * and so timer is no longer running.
+ * Mark the base as contended and grab the expiry lock,
+ * which is held by the softirq across the timer
+ * callback. Drop the lock immediately so the softirq can
+ * expire the next timer. In theory the timer could already
+ * be running again, but that's more than unlikely and just
+ * causes another wait loop.
*/
- spin_lock(&base->expiry_lock);
- spin_unlock(&base->expiry_lock);
+ atomic_inc(&base->timer_waiters);
+ spin_lock_bh(&base->expiry_lock);
+ atomic_dec(&base->timer_waiters);
+ spin_unlock_bh(&base->expiry_lock);
}
}
+#else
+static inline void timer_base_init_expiry_lock(struct timer_base *base) { }
+static inline void timer_base_lock_expiry(struct timer_base *base) { }
+static inline void timer_base_unlock_expiry(struct timer_base *base) { }
+static inline void timer_sync_wait_running(struct timer_base *base) { }
+static inline void del_timer_wait_running(struct timer_list *timer) { }
+#endif
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
/**
* del_timer_sync - deactivate a timer and wait for the handler to finish.
* @timer: the timer to be deactivated
@@ -1322,6 +1366,8 @@ static int __del_timer_sync(struct timer_list *timer)
*/
int del_timer_sync(struct timer_list *timer)
{
+ int ret;
+
#ifdef CONFIG_LOCKDEP
unsigned long flags;
@@ -1339,14 +1385,17 @@ int del_timer_sync(struct timer_list *timer)
* could lead to deadlock.
*/
WARN_ON(in_irq() && !(timer->flags & TIMER_IRQSAFE));
- /*
- * Must be able to sleep on PREEMPT_RT because of the slowpath in
- * __del_timer_sync().
- */
- if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(timer->flags & TIMER_IRQSAFE))
- might_sleep();
- return __del_timer_sync(timer);
+ do {
+ ret = try_to_del_timer_sync(timer);
+
+ if (unlikely(ret < 0)) {
+ del_timer_wait_running(timer);
+ cpu_relax();
+ }
+ } while (ret < 0);
+
+ return ret;
}
EXPORT_SYMBOL(del_timer_sync);
#endif
@@ -1410,15 +1459,12 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head)
raw_spin_unlock(&base->lock);
call_timer_fn(timer, fn);
base->running_timer = NULL;
- spin_unlock(&base->expiry_lock);
- spin_lock(&base->expiry_lock);
raw_spin_lock(&base->lock);
} else {
raw_spin_unlock_irq(&base->lock);
call_timer_fn(timer, fn);
base->running_timer = NULL;
- spin_unlock(&base->expiry_lock);
- spin_lock(&base->expiry_lock);
+ timer_sync_wait_running(base);
raw_spin_lock_irq(&base->lock);
}
}
@@ -1715,7 +1761,7 @@ static inline void __run_timers(struct timer_base *base)
if (!time_after_eq(jiffies, base->clk))
return;
- spin_lock(&base->expiry_lock);
+ timer_base_lock_expiry(base);
raw_spin_lock_irq(&base->lock);
/*
@@ -1743,7 +1789,7 @@ static inline void __run_timers(struct timer_base *base)
expire_timers(base, heads + levels);
}
raw_spin_unlock_irq(&base->lock);
- spin_unlock(&base->expiry_lock);
+ timer_base_unlock_expiry(base);
}
/*
@@ -1990,7 +2036,7 @@ static void __init init_timer_cpu(int cpu)
base->cpu = cpu;
raw_spin_lock_init(&base->lock);
base->clk = jiffies;
- spin_lock_init(&base->expiry_lock);
+ timer_base_init_expiry_lock(base);
}
}
|