diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-27 10:06:00 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-27 10:06:00 +0000 |
commit | b15a952c52a6825376d3e7f6c1bf5c886c6d8b74 (patch) | |
tree | 1500f2f8f276908a36d8126cb632c0d6b1276764 /debian/patches-rt | |
parent | Adding upstream version 5.10.209. (diff) | |
download | linux-b15a952c52a6825376d3e7f6c1bf5c886c6d8b74.tar.xz linux-b15a952c52a6825376d3e7f6c1bf5c886c6d8b74.zip |
Adding debian version 5.10.209-2.debian/5.10.209-2
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to '')
324 files changed, 43253 insertions, 0 deletions
diff --git a/debian/patches-rt/0001-z3fold-remove-preempt-disabled-sections-for-RT.patch b/debian/patches-rt/0001-z3fold-remove-preempt-disabled-sections-for-RT.patch new file mode 100644 index 000000000..c161007b7 --- /dev/null +++ b/debian/patches-rt/0001-z3fold-remove-preempt-disabled-sections-for-RT.patch @@ -0,0 +1,86 @@ +From 373cc1c1427a46b4bf77f0d782d8bd8b2d00bc54 Mon Sep 17 00:00:00 2001 +From: Vitaly Wool <vitaly.wool@konsulko.com> +Date: Mon, 14 Dec 2020 19:12:36 -0800 +Subject: [PATCH 001/323] z3fold: remove preempt disabled sections for RT +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Replace get_cpu_ptr() with migrate_disable()+this_cpu_ptr() so RT can take +spinlocks that become sleeping locks. + +Signed-off-by Mike Galbraith <efault@gmx.de> + +Link: https://lkml.kernel.org/r/20201209145151.18994-3-vitaly.wool@konsulko.com +Signed-off-by: Vitaly Wool <vitaly.wool@konsulko.com> +Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Signed-off-by: Andrew Morton <akpm@linux-foundation.org> +Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + mm/z3fold.c | 17 ++++++++++------- + 1 file changed, 10 insertions(+), 7 deletions(-) + +diff --git a/mm/z3fold.c b/mm/z3fold.c +index 912ac9a64a15..f3d875fcaeb7 100644 +--- a/mm/z3fold.c ++++ b/mm/z3fold.c +@@ -623,14 +623,16 @@ static inline void add_to_unbuddied(struct z3fold_pool *pool, + { + if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 || + zhdr->middle_chunks == 0) { +- struct list_head *unbuddied = get_cpu_ptr(pool->unbuddied); +- ++ struct list_head *unbuddied; + int freechunks = num_free_chunks(zhdr); ++ ++ migrate_disable(); ++ unbuddied = this_cpu_ptr(pool->unbuddied); + spin_lock(&pool->lock); + list_add(&zhdr->buddy, &unbuddied[freechunks]); + spin_unlock(&pool->lock); + zhdr->cpu = smp_processor_id(); +- put_cpu_ptr(pool->unbuddied); ++ migrate_enable(); + } + } + +@@ -880,8 +882,9 @@ static inline struct z3fold_header *__z3fold_alloc(struct z3fold_pool *pool, + int chunks = size_to_chunks(size), i; + + lookup: ++ migrate_disable(); + /* First, try to find an unbuddied z3fold page. */ +- unbuddied = get_cpu_ptr(pool->unbuddied); ++ unbuddied = this_cpu_ptr(pool->unbuddied); + for_each_unbuddied_list(i, chunks) { + struct list_head *l = &unbuddied[i]; + +@@ -899,7 +902,7 @@ static inline struct z3fold_header *__z3fold_alloc(struct z3fold_pool *pool, + !z3fold_page_trylock(zhdr)) { + spin_unlock(&pool->lock); + zhdr = NULL; +- put_cpu_ptr(pool->unbuddied); ++ migrate_enable(); + if (can_sleep) + cond_resched(); + goto lookup; +@@ -913,7 +916,7 @@ static inline struct z3fold_header *__z3fold_alloc(struct z3fold_pool *pool, + test_bit(PAGE_CLAIMED, &page->private)) { + z3fold_page_unlock(zhdr); + zhdr = NULL; +- put_cpu_ptr(pool->unbuddied); ++ migrate_enable(); + if (can_sleep) + cond_resched(); + goto lookup; +@@ -928,7 +931,7 @@ static inline struct z3fold_header *__z3fold_alloc(struct z3fold_pool *pool, + kref_get(&zhdr->refcount); + break; + } +- put_cpu_ptr(pool->unbuddied); ++ migrate_enable(); + + if (!zhdr) { + int cpu; +-- +2.43.0 + diff --git a/debian/patches-rt/0002-stop_machine-Add-function-and-caller-debug-info.patch b/debian/patches-rt/0002-stop_machine-Add-function-and-caller-debug-info.patch new file mode 100644 index 000000000..f68789cd5 --- /dev/null +++ b/debian/patches-rt/0002-stop_machine-Add-function-and-caller-debug-info.patch @@ -0,0 +1,135 @@ +From 863bc087d69814dc4113ca9ec91ce1895b53480e Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra <peterz@infradead.org> +Date: Fri, 23 Oct 2020 12:11:59 +0200 +Subject: [PATCH 002/323] stop_machine: Add function and caller debug info +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Crashes in stop-machine are hard to connect to the calling code, add a +little something to help with that. + +Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/linux/stop_machine.h | 5 +++++ + kernel/stop_machine.c | 23 ++++++++++++++++++++--- + lib/dump_stack.c | 2 ++ + 3 files changed, 27 insertions(+), 3 deletions(-) + +--- a/include/linux/stop_machine.h ++++ b/include/linux/stop_machine.h +@@ -24,6 +24,7 @@ + struct cpu_stop_work { + struct list_head list; /* cpu_stopper->works */ + cpu_stop_fn_t fn; ++ unsigned long caller; + void *arg; + struct cpu_stop_done *done; + }; +@@ -36,6 +37,8 @@ + void stop_machine_unpark(int cpu); + void stop_machine_yield(const struct cpumask *cpumask); + ++extern void print_stop_info(const char *log_lvl, struct task_struct *task); ++ + #else /* CONFIG_SMP */ + + #include <linux/workqueue.h> +@@ -80,6 +83,8 @@ + return false; + } + ++static inline void print_stop_info(const char *log_lvl, struct task_struct *task) { } ++ + #endif /* CONFIG_SMP */ + + /* +--- a/kernel/stop_machine.c ++++ b/kernel/stop_machine.c +@@ -42,11 +42,23 @@ + struct list_head works; /* list of pending works */ + + struct cpu_stop_work stop_work; /* for stop_cpus */ ++ unsigned long caller; ++ cpu_stop_fn_t fn; + }; + + static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); + static bool stop_machine_initialized = false; + ++void print_stop_info(const char *log_lvl, struct task_struct *task) ++{ ++ struct cpu_stopper *stopper = this_cpu_ptr(&cpu_stopper); ++ ++ if (task != stopper->thread) ++ return; ++ ++ printk("%sStopper: %pS <- %pS\n", log_lvl, stopper->fn, (void *)stopper->caller); ++} ++ + /* static data for stop_cpus */ + static DEFINE_MUTEX(stop_cpus_mutex); + static bool stop_cpus_in_progress; +@@ -123,7 +135,7 @@ + int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg) + { + struct cpu_stop_done done; +- struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done }; ++ struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done, .caller = _RET_IP_ }; + + cpu_stop_init_done(&done, 1); + if (!cpu_stop_queue_work(cpu, &work)) +@@ -331,7 +343,8 @@ + work1 = work2 = (struct cpu_stop_work){ + .fn = multi_cpu_stop, + .arg = &msdata, +- .done = &done ++ .done = &done, ++ .caller = _RET_IP_, + }; + + cpu_stop_init_done(&done, 2); +@@ -367,7 +380,7 @@ + bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, + struct cpu_stop_work *work_buf) + { +- *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, }; ++ *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, .caller = _RET_IP_, }; + return cpu_stop_queue_work(cpu, work_buf); + } + +@@ -487,6 +500,8 @@ + int ret; + + /* cpu stop callbacks must not sleep, make in_atomic() == T */ ++ stopper->caller = work->caller; ++ stopper->fn = fn; + preempt_count_inc(); + ret = fn(arg); + if (done) { +@@ -495,6 +510,8 @@ + cpu_stop_signal_done(done); + } + preempt_count_dec(); ++ stopper->fn = NULL; ++ stopper->caller = 0; + WARN_ONCE(preempt_count(), + "cpu_stop: %ps(%p) leaked preempt count\n", fn, arg); + goto repeat; +--- a/lib/dump_stack.c ++++ b/lib/dump_stack.c +@@ -12,6 +12,7 @@ + #include <linux/atomic.h> + #include <linux/kexec.h> + #include <linux/utsname.h> ++#include <linux/stop_machine.h> + #include <generated/package.h> + + static char dump_stack_arch_desc_str[128]; +@@ -59,6 +60,7 @@ + log_lvl, dump_stack_arch_desc_str); + + print_worker_info(log_lvl, current); ++ print_stop_info(log_lvl, current); + } + + /** diff --git a/debian/patches-rt/0003-sched-Fix-balance_callback.patch b/debian/patches-rt/0003-sched-Fix-balance_callback.patch new file mode 100644 index 000000000..265608440 --- /dev/null +++ b/debian/patches-rt/0003-sched-Fix-balance_callback.patch @@ -0,0 +1,235 @@ +From db31f287a66b122733214b4a2bac9d59a564b77d Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra <peterz@infradead.org> +Date: Fri, 23 Oct 2020 12:12:00 +0200 +Subject: [PATCH 003/323] sched: Fix balance_callback() +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +The intent of balance_callback() has always been to delay executing +balancing operations until the end of the current rq->lock section. +This is because balance operations must often drop rq->lock, and that +isn't safe in general. + +However, as noted by Scott, there were a few holes in that scheme; +balance_callback() was called after rq->lock was dropped, which means +another CPU can interleave and touch the callback list. + +Rework code to call the balance callbacks before dropping rq->lock +where possible, and otherwise splice the balance list onto a local +stack. + +This guarantees that the balance list must be empty when we take +rq->lock. IOW, we'll only ever run our own balance callbacks. + +Reported-by: Scott Wood <swood@redhat.com> +Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + kernel/sched/core.c | 119 +++++++++++++++++++++++++++---------------- + kernel/sched/sched.h | 3 ++ + 2 files changed, 78 insertions(+), 44 deletions(-) + +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 40f40f359c5d..ab8b7fd46334 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -3511,6 +3511,69 @@ static inline void finish_task(struct task_struct *prev) + #endif + } + ++#ifdef CONFIG_SMP ++ ++static void do_balance_callbacks(struct rq *rq, struct callback_head *head) ++{ ++ void (*func)(struct rq *rq); ++ struct callback_head *next; ++ ++ lockdep_assert_held(&rq->lock); ++ ++ while (head) { ++ func = (void (*)(struct rq *))head->func; ++ next = head->next; ++ head->next = NULL; ++ head = next; ++ ++ func(rq); ++ } ++} ++ ++static inline struct callback_head *splice_balance_callbacks(struct rq *rq) ++{ ++ struct callback_head *head = rq->balance_callback; ++ ++ lockdep_assert_held(&rq->lock); ++ if (head) ++ rq->balance_callback = NULL; ++ ++ return head; ++} ++ ++static void __balance_callbacks(struct rq *rq) ++{ ++ do_balance_callbacks(rq, splice_balance_callbacks(rq)); ++} ++ ++static inline void balance_callbacks(struct rq *rq, struct callback_head *head) ++{ ++ unsigned long flags; ++ ++ if (unlikely(head)) { ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ do_balance_callbacks(rq, head); ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ } ++} ++ ++#else ++ ++static inline void __balance_callbacks(struct rq *rq) ++{ ++} ++ ++static inline struct callback_head *splice_balance_callbacks(struct rq *rq) ++{ ++ return NULL; ++} ++ ++static inline void balance_callbacks(struct rq *rq, struct callback_head *head) ++{ ++} ++ ++#endif ++ + static inline void + prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf) + { +@@ -3536,6 +3599,7 @@ static inline void finish_lock_switch(struct rq *rq) + * prev into current: + */ + spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); ++ __balance_callbacks(rq); + raw_spin_unlock_irq(&rq->lock); + } + +@@ -3677,43 +3741,6 @@ static struct rq *finish_task_switch(struct task_struct *prev) + return rq; + } + +-#ifdef CONFIG_SMP +- +-/* rq->lock is NOT held, but preemption is disabled */ +-static void __balance_callback(struct rq *rq) +-{ +- struct callback_head *head, *next; +- void (*func)(struct rq *rq); +- unsigned long flags; +- +- raw_spin_lock_irqsave(&rq->lock, flags); +- head = rq->balance_callback; +- rq->balance_callback = NULL; +- while (head) { +- func = (void (*)(struct rq *))head->func; +- next = head->next; +- head->next = NULL; +- head = next; +- +- func(rq); +- } +- raw_spin_unlock_irqrestore(&rq->lock, flags); +-} +- +-static inline void balance_callback(struct rq *rq) +-{ +- if (unlikely(rq->balance_callback)) +- __balance_callback(rq); +-} +- +-#else +- +-static inline void balance_callback(struct rq *rq) +-{ +-} +- +-#endif +- + /** + * schedule_tail - first thing a freshly forked thread must call. + * @prev: the thread we just switched away from. +@@ -3733,7 +3760,6 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) + */ + + rq = finish_task_switch(prev); +- balance_callback(rq); + preempt_enable(); + + if (current->set_child_tid) +@@ -4548,10 +4574,11 @@ static void __sched notrace __schedule(bool preempt) + rq = context_switch(rq, prev, next, &rf); + } else { + rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); +- rq_unlock_irq(rq, &rf); +- } + +- balance_callback(rq); ++ rq_unpin_lock(rq, &rf); ++ __balance_callbacks(rq); ++ raw_spin_unlock_irq(&rq->lock); ++ } + } + + void __noreturn do_task_dead(void) +@@ -4972,9 +4999,11 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) + out_unlock: + /* Avoid rq from going away on us: */ + preempt_disable(); +- __task_rq_unlock(rq, &rf); + +- balance_callback(rq); ++ rq_unpin_lock(rq, &rf); ++ __balance_callbacks(rq); ++ raw_spin_unlock(&rq->lock); ++ + preempt_enable(); + } + #else +@@ -5217,6 +5246,7 @@ static int __sched_setscheduler(struct task_struct *p, + int oldpolicy = -1, policy = attr->sched_policy; + int retval, oldprio, newprio, queued, running; + const struct sched_class *prev_class; ++ struct callback_head *head; + struct rq_flags rf; + int reset_on_fork; + int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; +@@ -5466,6 +5496,7 @@ static int __sched_setscheduler(struct task_struct *p, + + /* Avoid rq from going away on us: */ + preempt_disable(); ++ head = splice_balance_callbacks(rq); + task_rq_unlock(rq, p, &rf); + + if (pi) { +@@ -5475,7 +5506,7 @@ static int __sched_setscheduler(struct task_struct *p, + } + + /* Run balance callbacks after we've adjusted the PI chain: */ +- balance_callback(rq); ++ balance_callbacks(rq, head); + preempt_enable(); + + return 0; +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 8de07aba8bdd..fd71da3a4f0f 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -1225,6 +1225,9 @@ static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf) + rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); + rf->clock_update_flags = 0; + #endif ++#ifdef CONFIG_SMP ++ SCHED_WARN_ON(rq->balance_callback); ++#endif + } + + static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf) +-- +2.43.0 + diff --git a/debian/patches-rt/0004-sched-hotplug-Ensure-only-per-cpu-kthreads-run-durin.patch b/debian/patches-rt/0004-sched-hotplug-Ensure-only-per-cpu-kthreads-run-durin.patch new file mode 100644 index 000000000..6cb47e482 --- /dev/null +++ b/debian/patches-rt/0004-sched-hotplug-Ensure-only-per-cpu-kthreads-run-durin.patch @@ -0,0 +1,244 @@ +From bc9c6ea411da55a929b5bc3663c0a89449613d47 Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra <peterz@infradead.org> +Date: Fri, 23 Oct 2020 12:12:01 +0200 +Subject: [PATCH 004/323] sched/hotplug: Ensure only per-cpu kthreads run + during hotplug +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +In preparation for migrate_disable(), make sure only per-cpu kthreads +are allowed to run on !active CPUs. + +This is ran (as one of the very first steps) from the cpu-hotplug +task which is a per-cpu kthread and completion of the hotplug +operation only requires such tasks. + +This constraint enables the migrate_disable() implementation to wait +for completion of all migrate_disable regions on this CPU at hotplug +time without fear of any new ones starting. + +This replaces the unlikely(rq->balance_callbacks) test at the tail of +context_switch with an unlikely(rq->balance_work), the fast path is +not affected. + +Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + kernel/sched/core.c | 114 ++++++++++++++++++++++++++++++++++++++++++- + kernel/sched/sched.h | 7 ++- + 2 files changed, 118 insertions(+), 3 deletions(-) + +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index ab8b7fd46334..32c3acef5781 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -3535,8 +3535,10 @@ static inline struct callback_head *splice_balance_callbacks(struct rq *rq) + struct callback_head *head = rq->balance_callback; + + lockdep_assert_held(&rq->lock); +- if (head) ++ if (head) { + rq->balance_callback = NULL; ++ rq->balance_flags &= ~BALANCE_WORK; ++ } + + return head; + } +@@ -3557,6 +3559,21 @@ static inline void balance_callbacks(struct rq *rq, struct callback_head *head) + } + } + ++static void balance_push(struct rq *rq); ++ ++static inline void balance_switch(struct rq *rq) ++{ ++ if (likely(!rq->balance_flags)) ++ return; ++ ++ if (rq->balance_flags & BALANCE_PUSH) { ++ balance_push(rq); ++ return; ++ } ++ ++ __balance_callbacks(rq); ++} ++ + #else + + static inline void __balance_callbacks(struct rq *rq) +@@ -3572,6 +3589,10 @@ static inline void balance_callbacks(struct rq *rq, struct callback_head *head) + { + } + ++static inline void balance_switch(struct rq *rq) ++{ ++} ++ + #endif + + static inline void +@@ -3599,7 +3620,7 @@ static inline void finish_lock_switch(struct rq *rq) + * prev into current: + */ + spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); +- __balance_callbacks(rq); ++ balance_switch(rq); + raw_spin_unlock_irq(&rq->lock); + } + +@@ -6833,6 +6854,90 @@ static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf) + + rq->stop = stop; + } ++ ++static int __balance_push_cpu_stop(void *arg) ++{ ++ struct task_struct *p = arg; ++ struct rq *rq = this_rq(); ++ struct rq_flags rf; ++ int cpu; ++ ++ raw_spin_lock_irq(&p->pi_lock); ++ rq_lock(rq, &rf); ++ ++ update_rq_clock(rq); ++ ++ if (task_rq(p) == rq && task_on_rq_queued(p)) { ++ cpu = select_fallback_rq(rq->cpu, p); ++ rq = __migrate_task(rq, &rf, p, cpu); ++ } ++ ++ rq_unlock(rq, &rf); ++ raw_spin_unlock_irq(&p->pi_lock); ++ ++ put_task_struct(p); ++ ++ return 0; ++} ++ ++static DEFINE_PER_CPU(struct cpu_stop_work, push_work); ++ ++/* ++ * Ensure we only run per-cpu kthreads once the CPU goes !active. ++ */ ++static void balance_push(struct rq *rq) ++{ ++ struct task_struct *push_task = rq->curr; ++ ++ lockdep_assert_held(&rq->lock); ++ SCHED_WARN_ON(rq->cpu != smp_processor_id()); ++ ++ /* ++ * Both the cpu-hotplug and stop task are in this case and are ++ * required to complete the hotplug process. ++ */ ++ if (is_per_cpu_kthread(push_task)) ++ return; ++ ++ get_task_struct(push_task); ++ /* ++ * Temporarily drop rq->lock such that we can wake-up the stop task. ++ * Both preemption and IRQs are still disabled. ++ */ ++ raw_spin_unlock(&rq->lock); ++ stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, push_task, ++ this_cpu_ptr(&push_work)); ++ /* ++ * At this point need_resched() is true and we'll take the loop in ++ * schedule(). The next pick is obviously going to be the stop task ++ * which is_per_cpu_kthread() and will push this task away. ++ */ ++ raw_spin_lock(&rq->lock); ++} ++ ++static void balance_push_set(int cpu, bool on) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ struct rq_flags rf; ++ ++ rq_lock_irqsave(rq, &rf); ++ if (on) ++ rq->balance_flags |= BALANCE_PUSH; ++ else ++ rq->balance_flags &= ~BALANCE_PUSH; ++ rq_unlock_irqrestore(rq, &rf); ++} ++ ++#else ++ ++static inline void balance_push(struct rq *rq) ++{ ++} ++ ++static inline void balance_push_set(int cpu, bool on) ++{ ++} ++ + #endif /* CONFIG_HOTPLUG_CPU */ + + void set_rq_online(struct rq *rq) +@@ -6920,6 +7025,8 @@ int sched_cpu_activate(unsigned int cpu) + struct rq *rq = cpu_rq(cpu); + struct rq_flags rf; + ++ balance_push_set(cpu, false); ++ + #ifdef CONFIG_SCHED_SMT + /* + * When going up, increment the number of cores with SMT present. +@@ -6967,6 +7074,8 @@ int sched_cpu_deactivate(unsigned int cpu) + */ + synchronize_rcu(); + ++ balance_push_set(cpu, true); ++ + #ifdef CONFIG_SCHED_SMT + /* + * When going down, decrement the number of cores with SMT present. +@@ -6980,6 +7089,7 @@ int sched_cpu_deactivate(unsigned int cpu) + + ret = cpuset_cpu_inactive(cpu); + if (ret) { ++ balance_push_set(cpu, false); + set_cpu_active(cpu, true); + return ret; + } +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index fd71da3a4f0f..81dc4212423a 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -976,6 +976,7 @@ struct rq { + unsigned long cpu_capacity_inverted; + + struct callback_head *balance_callback; ++ unsigned char balance_flags; + + unsigned char nohz_idle_balance; + unsigned char idle_balance; +@@ -1389,6 +1390,9 @@ init_numa_balancing(unsigned long clone_flags, struct task_struct *p) + + #ifdef CONFIG_SMP + ++#define BALANCE_WORK 0x01 ++#define BALANCE_PUSH 0x02 ++ + static inline void + queue_balance_callback(struct rq *rq, + struct callback_head *head, +@@ -1396,12 +1400,13 @@ queue_balance_callback(struct rq *rq, + { + lockdep_assert_held(&rq->lock); + +- if (unlikely(head->next)) ++ if (unlikely(head->next || (rq->balance_flags & BALANCE_PUSH))) + return; + + head->func = (void (*)(struct callback_head *))func; + head->next = rq->balance_callback; + rq->balance_callback = head; ++ rq->balance_flags |= BALANCE_WORK; + } + + #define rcu_dereference_check_sched_domain(p) \ +-- +2.43.0 + diff --git a/debian/patches-rt/0005-sched-core-Wait-for-tasks-being-pushed-away-on-hotpl.patch b/debian/patches-rt/0005-sched-core-Wait-for-tasks-being-pushed-away-on-hotpl.patch new file mode 100644 index 000000000..abd6b3875 --- /dev/null +++ b/debian/patches-rt/0005-sched-core-Wait-for-tasks-being-pushed-away-on-hotpl.patch @@ -0,0 +1,124 @@ +From 94b59ab31222fc252603987e1ee316264426a015 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Fri, 23 Oct 2020 12:12:02 +0200 +Subject: [PATCH 005/323] sched/core: Wait for tasks being pushed away on + hotplug +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +RT kernels need to ensure that all tasks which are not per CPU kthreads +have left the outgoing CPU to guarantee that no tasks are force migrated +within a migrate disabled section. + +There is also some desire to (ab)use fine grained CPU hotplug control to +clear a CPU from active state to force migrate tasks which are not per CPU +kthreads away for power control purposes. + +Add a mechanism which waits until all tasks which should leave the CPU +after the CPU active flag is cleared have moved to a different online CPU. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + kernel/sched/core.c | 40 +++++++++++++++++++++++++++++++++++++++- + kernel/sched/sched.h | 4 ++++ + 2 files changed, 43 insertions(+), 1 deletion(-) + +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 32c3acef5781..b902755615d7 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -6896,8 +6896,21 @@ static void balance_push(struct rq *rq) + * Both the cpu-hotplug and stop task are in this case and are + * required to complete the hotplug process. + */ +- if (is_per_cpu_kthread(push_task)) ++ if (is_per_cpu_kthread(push_task)) { ++ /* ++ * If this is the idle task on the outgoing CPU try to wake ++ * up the hotplug control thread which might wait for the ++ * last task to vanish. The rcuwait_active() check is ++ * accurate here because the waiter is pinned on this CPU ++ * and can't obviously be running in parallel. ++ */ ++ if (!rq->nr_running && rcuwait_active(&rq->hotplug_wait)) { ++ raw_spin_unlock(&rq->lock); ++ rcuwait_wake_up(&rq->hotplug_wait); ++ raw_spin_lock(&rq->lock); ++ } + return; ++ } + + get_task_struct(push_task); + /* +@@ -6928,6 +6941,20 @@ static void balance_push_set(int cpu, bool on) + rq_unlock_irqrestore(rq, &rf); + } + ++/* ++ * Invoked from a CPUs hotplug control thread after the CPU has been marked ++ * inactive. All tasks which are not per CPU kernel threads are either ++ * pushed off this CPU now via balance_push() or placed on a different CPU ++ * during wakeup. Wait until the CPU is quiescent. ++ */ ++static void balance_hotplug_wait(void) ++{ ++ struct rq *rq = this_rq(); ++ ++ rcuwait_wait_event(&rq->hotplug_wait, rq->nr_running == 1, ++ TASK_UNINTERRUPTIBLE); ++} ++ + #else + + static inline void balance_push(struct rq *rq) +@@ -6938,6 +6965,10 @@ static inline void balance_push_set(int cpu, bool on) + { + } + ++static inline void balance_hotplug_wait(void) ++{ ++} ++ + #endif /* CONFIG_HOTPLUG_CPU */ + + void set_rq_online(struct rq *rq) +@@ -7094,6 +7125,10 @@ int sched_cpu_deactivate(unsigned int cpu) + return ret; + } + sched_domains_numa_masks_clear(cpu); ++ ++ /* Wait for all non per CPU kernel threads to vanish. */ ++ balance_hotplug_wait(); ++ + return 0; + } + +@@ -7334,6 +7369,9 @@ void __init sched_init(void) + + rq_csd_init(rq, &rq->nohz_csd, nohz_csd_func); + #endif ++#ifdef CONFIG_HOTPLUG_CPU ++ rcuwait_init(&rq->hotplug_wait); ++#endif + #endif /* CONFIG_SMP */ + hrtick_rq_init(rq); + atomic_set(&rq->nr_iowait, 0); +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 81dc4212423a..a72464d370cd 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -1007,6 +1007,10 @@ struct rq { + + /* This is used to determine avg_idle's max value */ + u64 max_idle_balance_cost; ++ ++#ifdef CONFIG_HOTPLUG_CPU ++ struct rcuwait hotplug_wait; ++#endif + #endif /* CONFIG_SMP */ + + #ifdef CONFIG_IRQ_TIME_ACCOUNTING +-- +2.43.0 + diff --git a/debian/patches-rt/0006-workqueue-Manually-break-affinity-on-hotplug.patch b/debian/patches-rt/0006-workqueue-Manually-break-affinity-on-hotplug.patch new file mode 100644 index 000000000..7eb185ce2 --- /dev/null +++ b/debian/patches-rt/0006-workqueue-Manually-break-affinity-on-hotplug.patch @@ -0,0 +1,34 @@ +From e3372f0d43a35f1070d1689e6aa1f84d6cd9784b Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra <peterz@infradead.org> +Date: Fri, 23 Oct 2020 12:12:03 +0200 +Subject: [PATCH 006/323] workqueue: Manually break affinity on hotplug +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Don't rely on the scheduler to force break affinity for us -- it will +stop doing that for per-cpu-kthreads. + +Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> +Acked-by: Tejun Heo <tj@kernel.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + kernel/workqueue.c | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/kernel/workqueue.c b/kernel/workqueue.c +index 63140e4dd5df..7c14773dc4c4 100644 +--- a/kernel/workqueue.c ++++ b/kernel/workqueue.c +@@ -4941,6 +4941,10 @@ static void unbind_workers(int cpu) + pool->flags |= POOL_DISASSOCIATED; + + raw_spin_unlock_irq(&pool->lock); ++ ++ for_each_pool_worker(worker, pool) ++ WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_active_mask) < 0); ++ + mutex_unlock(&wq_pool_attach_mutex); + + /* +-- +2.43.0 + diff --git a/debian/patches-rt/0007-sched-hotplug-Consolidate-task-migration-on-CPU-unpl.patch b/debian/patches-rt/0007-sched-hotplug-Consolidate-task-migration-on-CPU-unpl.patch new file mode 100644 index 000000000..606e32fa1 --- /dev/null +++ b/debian/patches-rt/0007-sched-hotplug-Consolidate-task-migration-on-CPU-unpl.patch @@ -0,0 +1,283 @@ +From 61ebefbbf2500f6ade2182806061526bbde76a28 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Fri, 23 Oct 2020 12:12:04 +0200 +Subject: [PATCH 007/323] sched/hotplug: Consolidate task migration on CPU + unplug +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +With the new mechanism which kicks tasks off the outgoing CPU at the end of +schedule() the situation on an outgoing CPU right before the stopper thread +brings it down completely is: + + - All user tasks and all unbound kernel threads have either been migrated + away or are not running and the next wakeup will move them to a online CPU. + + - All per CPU kernel threads, except cpu hotplug thread and the stopper + thread have either been unbound or parked by the responsible CPU hotplug + callback. + +That means that at the last step before the stopper thread is invoked the +cpu hotplug thread is the last legitimate running task on the outgoing +CPU. + +Add a final wait step right before the stopper thread is kicked which +ensures that any still running tasks on the way to park or on the way to +kick themself of the CPU are either sleeping or gone. + +This allows to remove the migrate_tasks() crutch in sched_cpu_dying(). If +sched_cpu_dying() detects that there is still another running task aside of +the stopper thread then it will explode with the appropriate fireworks. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/linux/cpuhotplug.h | 1 + + include/linux/sched/hotplug.h | 2 + + kernel/cpu.c | 9 +- + kernel/sched/core.c | 154 ++++++++-------------------------- + 4 files changed, 46 insertions(+), 120 deletions(-) + +diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h +index f5a5df3a8cfd..daf39c04a552 100644 +--- a/include/linux/cpuhotplug.h ++++ b/include/linux/cpuhotplug.h +@@ -155,6 +155,7 @@ enum cpuhp_state { + CPUHP_AP_ONLINE, + CPUHP_TEARDOWN_CPU, + CPUHP_AP_ONLINE_IDLE, ++ CPUHP_AP_SCHED_WAIT_EMPTY, + CPUHP_AP_SMPBOOT_THREADS, + CPUHP_AP_X86_VDSO_VMA_ONLINE, + CPUHP_AP_IRQ_AFFINITY_ONLINE, +diff --git a/include/linux/sched/hotplug.h b/include/linux/sched/hotplug.h +index 9a62ffdd296f..412cdaba33eb 100644 +--- a/include/linux/sched/hotplug.h ++++ b/include/linux/sched/hotplug.h +@@ -11,8 +11,10 @@ extern int sched_cpu_activate(unsigned int cpu); + extern int sched_cpu_deactivate(unsigned int cpu); + + #ifdef CONFIG_HOTPLUG_CPU ++extern int sched_cpu_wait_empty(unsigned int cpu); + extern int sched_cpu_dying(unsigned int cpu); + #else ++# define sched_cpu_wait_empty NULL + # define sched_cpu_dying NULL + #endif + +diff --git a/kernel/cpu.c b/kernel/cpu.c +index abf717c4f57c..f8280edb679f 100644 +--- a/kernel/cpu.c ++++ b/kernel/cpu.c +@@ -1674,7 +1674,7 @@ static struct cpuhp_step cpuhp_hp_states[] = { + .name = "ap:online", + }, + /* +- * Handled on controll processor until the plugged processor manages ++ * Handled on control processor until the plugged processor manages + * this itself. + */ + [CPUHP_TEARDOWN_CPU] = { +@@ -1683,6 +1683,13 @@ static struct cpuhp_step cpuhp_hp_states[] = { + .teardown.single = takedown_cpu, + .cant_stop = true, + }, ++ ++ [CPUHP_AP_SCHED_WAIT_EMPTY] = { ++ .name = "sched:waitempty", ++ .startup.single = NULL, ++ .teardown.single = sched_cpu_wait_empty, ++ }, ++ + /* Handle smpboot threads park/unpark */ + [CPUHP_AP_SMPBOOT_THREADS] = { + .name = "smpboot/threads:online", +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index b902755615d7..a26a82c3e939 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -6741,120 +6741,6 @@ void idle_task_exit(void) + /* finish_cpu(), as ran on the BP, will clean up the active_mm state */ + } + +-/* +- * Since this CPU is going 'away' for a while, fold any nr_active delta +- * we might have. Assumes we're called after migrate_tasks() so that the +- * nr_active count is stable. We need to take the teardown thread which +- * is calling this into account, so we hand in adjust = 1 to the load +- * calculation. +- * +- * Also see the comment "Global load-average calculations". +- */ +-static void calc_load_migrate(struct rq *rq) +-{ +- long delta = calc_load_fold_active(rq, 1); +- if (delta) +- atomic_long_add(delta, &calc_load_tasks); +-} +- +-static struct task_struct *__pick_migrate_task(struct rq *rq) +-{ +- const struct sched_class *class; +- struct task_struct *next; +- +- for_each_class(class) { +- next = class->pick_next_task(rq); +- if (next) { +- next->sched_class->put_prev_task(rq, next); +- return next; +- } +- } +- +- /* The idle class should always have a runnable task */ +- BUG(); +-} +- +-/* +- * Migrate all tasks from the rq, sleeping tasks will be migrated by +- * try_to_wake_up()->select_task_rq(). +- * +- * Called with rq->lock held even though we'er in stop_machine() and +- * there's no concurrency possible, we hold the required locks anyway +- * because of lock validation efforts. +- */ +-static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf) +-{ +- struct rq *rq = dead_rq; +- struct task_struct *next, *stop = rq->stop; +- struct rq_flags orf = *rf; +- int dest_cpu; +- +- /* +- * Fudge the rq selection such that the below task selection loop +- * doesn't get stuck on the currently eligible stop task. +- * +- * We're currently inside stop_machine() and the rq is either stuck +- * in the stop_machine_cpu_stop() loop, or we're executing this code, +- * either way we should never end up calling schedule() until we're +- * done here. +- */ +- rq->stop = NULL; +- +- /* +- * put_prev_task() and pick_next_task() sched +- * class method both need to have an up-to-date +- * value of rq->clock[_task] +- */ +- update_rq_clock(rq); +- +- for (;;) { +- /* +- * There's this thread running, bail when that's the only +- * remaining thread: +- */ +- if (rq->nr_running == 1) +- break; +- +- next = __pick_migrate_task(rq); +- +- /* +- * Rules for changing task_struct::cpus_mask are holding +- * both pi_lock and rq->lock, such that holding either +- * stabilizes the mask. +- * +- * Drop rq->lock is not quite as disastrous as it usually is +- * because !cpu_active at this point, which means load-balance +- * will not interfere. Also, stop-machine. +- */ +- rq_unlock(rq, rf); +- raw_spin_lock(&next->pi_lock); +- rq_relock(rq, rf); +- +- /* +- * Since we're inside stop-machine, _nothing_ should have +- * changed the task, WARN if weird stuff happened, because in +- * that case the above rq->lock drop is a fail too. +- */ +- if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) { +- raw_spin_unlock(&next->pi_lock); +- continue; +- } +- +- /* Find suitable destination for @next, with force if needed. */ +- dest_cpu = select_fallback_rq(dead_rq->cpu, next); +- rq = __migrate_task(rq, rf, next, dest_cpu); +- if (rq != dead_rq) { +- rq_unlock(rq, rf); +- rq = dead_rq; +- *rf = orf; +- rq_relock(rq, rf); +- } +- raw_spin_unlock(&next->pi_lock); +- } +- +- rq->stop = stop; +-} +- + static int __balance_push_cpu_stop(void *arg) + { + struct task_struct *p = arg; +@@ -7125,10 +7011,6 @@ int sched_cpu_deactivate(unsigned int cpu) + return ret; + } + sched_domains_numa_masks_clear(cpu); +- +- /* Wait for all non per CPU kernel threads to vanish. */ +- balance_hotplug_wait(); +- + return 0; + } + +@@ -7148,6 +7030,41 @@ int sched_cpu_starting(unsigned int cpu) + } + + #ifdef CONFIG_HOTPLUG_CPU ++ ++/* ++ * Invoked immediately before the stopper thread is invoked to bring the ++ * CPU down completely. At this point all per CPU kthreads except the ++ * hotplug thread (current) and the stopper thread (inactive) have been ++ * either parked or have been unbound from the outgoing CPU. Ensure that ++ * any of those which might be on the way out are gone. ++ * ++ * If after this point a bound task is being woken on this CPU then the ++ * responsible hotplug callback has failed to do it's job. ++ * sched_cpu_dying() will catch it with the appropriate fireworks. ++ */ ++int sched_cpu_wait_empty(unsigned int cpu) ++{ ++ balance_hotplug_wait(); ++ return 0; ++} ++ ++/* ++ * Since this CPU is going 'away' for a while, fold any nr_active delta we ++ * might have. Called from the CPU stopper task after ensuring that the ++ * stopper is the last running task on the CPU, so nr_active count is ++ * stable. We need to take the teardown thread which is calling this into ++ * account, so we hand in adjust = 1 to the load calculation. ++ * ++ * Also see the comment "Global load-average calculations". ++ */ ++static void calc_load_migrate(struct rq *rq) ++{ ++ long delta = calc_load_fold_active(rq, 1); ++ ++ if (delta) ++ atomic_long_add(delta, &calc_load_tasks); ++} ++ + int sched_cpu_dying(unsigned int cpu) + { + struct rq *rq = cpu_rq(cpu); +@@ -7161,7 +7078,6 @@ int sched_cpu_dying(unsigned int cpu) + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); + set_rq_offline(rq); + } +- migrate_tasks(rq, &rf); + BUG_ON(rq->nr_running != 1); + rq_unlock_irqrestore(rq, &rf); + +-- +2.43.0 + diff --git a/debian/patches-rt/0008-sched-Fix-hotplug-vs-CPU-bandwidth-control.patch b/debian/patches-rt/0008-sched-Fix-hotplug-vs-CPU-bandwidth-control.patch new file mode 100644 index 000000000..6c09cbe86 --- /dev/null +++ b/debian/patches-rt/0008-sched-Fix-hotplug-vs-CPU-bandwidth-control.patch @@ -0,0 +1,94 @@ +From 36d2d778f786cb7fd55c549911a6055d6b6f40ef Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra <peterz@infradead.org> +Date: Fri, 23 Oct 2020 12:12:05 +0200 +Subject: [PATCH 008/323] sched: Fix hotplug vs CPU bandwidth control +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Since we now migrate tasks away before DYING, we should also move +bandwidth unthrottle, otherwise we can gain tasks from unthrottle +after we expect all tasks to be gone already. + +Also; it looks like the RT balancers don't respect cpu_active() and +instead rely on rq->online in part, complete this. This too requires +we do set_rq_offline() earlier to match the cpu_active() semantics. +(The bigger patch is to convert RT to cpu_active() entirely) + +Since set_rq_online() is called from sched_cpu_activate(), place +set_rq_offline() in sched_cpu_deactivate(). + +Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + kernel/sched/core.c | 14 ++++++++++---- + kernel/sched/deadline.c | 2 +- + kernel/sched/rt.c | 2 +- + 3 files changed, 12 insertions(+), 6 deletions(-) + +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index a26a82c3e939..c5d5576c67fb 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -6979,6 +6979,8 @@ int sched_cpu_activate(unsigned int cpu) + + int sched_cpu_deactivate(unsigned int cpu) + { ++ struct rq *rq = cpu_rq(cpu); ++ struct rq_flags rf; + int ret; + + set_cpu_active(cpu, false); +@@ -6993,6 +6995,14 @@ int sched_cpu_deactivate(unsigned int cpu) + + balance_push_set(cpu, true); + ++ rq_lock_irqsave(rq, &rf); ++ if (rq->rd) { ++ update_rq_clock(rq); ++ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); ++ set_rq_offline(rq); ++ } ++ rq_unlock_irqrestore(rq, &rf); ++ + #ifdef CONFIG_SCHED_SMT + /* + * When going down, decrement the number of cores with SMT present. +@@ -7074,10 +7084,6 @@ int sched_cpu_dying(unsigned int cpu) + sched_tick_stop(cpu); + + rq_lock_irqsave(rq, &rf); +- if (rq->rd) { +- BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); +- set_rq_offline(rq); +- } + BUG_ON(rq->nr_running != 1); + rq_unlock_irqrestore(rq, &rf); + +diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c +index d91295d3059f..af8569dbdc9c 100644 +--- a/kernel/sched/deadline.c ++++ b/kernel/sched/deadline.c +@@ -566,7 +566,7 @@ static int push_dl_task(struct rq *rq); + + static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev) + { +- return dl_task(prev); ++ return rq->online && dl_task(prev); + } + + static DEFINE_PER_CPU(struct callback_head, dl_push_head); +diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c +index f690f901b6cc..fdcce04913db 100644 +--- a/kernel/sched/rt.c ++++ b/kernel/sched/rt.c +@@ -270,7 +270,7 @@ static void pull_rt_task(struct rq *this_rq); + static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev) + { + /* Try to pull RT tasks here if we lower this rq's prio */ +- return rq->rt.highest_prio.curr > prev->prio; ++ return rq->online && rq->rt.highest_prio.curr > prev->prio; + } + + static inline int rt_overloaded(struct rq *rq) +-- +2.43.0 + diff --git a/debian/patches-rt/0009-sched-Massage-set_cpus_allowed.patch b/debian/patches-rt/0009-sched-Massage-set_cpus_allowed.patch new file mode 100644 index 000000000..24fc3200e --- /dev/null +++ b/debian/patches-rt/0009-sched-Massage-set_cpus_allowed.patch @@ -0,0 +1,175 @@ +From 1125b34b63aa303af592b0ea5be730dc92ce6d53 Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra <peterz@infradead.org> +Date: Fri, 23 Oct 2020 12:12:06 +0200 +Subject: [PATCH 009/323] sched: Massage set_cpus_allowed() +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Thread a u32 flags word through the *set_cpus_allowed*() callchain. +This will allow adding behavioural tweaks for future users. + +Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + kernel/sched/core.c | 28 ++++++++++++++++++---------- + kernel/sched/deadline.c | 5 +++-- + kernel/sched/sched.h | 7 +++++-- + 3 files changed, 26 insertions(+), 14 deletions(-) + +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index c5d5576c67fb..569cc5e48e68 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -1838,13 +1838,14 @@ static int migration_cpu_stop(void *data) + * sched_class::set_cpus_allowed must do the below, but is not required to + * actually call this function. + */ +-void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) ++void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags) + { + cpumask_copy(&p->cpus_mask, new_mask); + p->nr_cpus_allowed = cpumask_weight(new_mask); + } + +-void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) ++static void ++__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags) + { + struct rq *rq = task_rq(p); + bool queued, running; +@@ -1865,7 +1866,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) + if (running) + put_prev_task(rq, p); + +- p->sched_class->set_cpus_allowed(p, new_mask); ++ p->sched_class->set_cpus_allowed(p, new_mask, flags); + + if (queued) + enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); +@@ -1873,6 +1874,11 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) + set_next_task(rq, p); + } + ++void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ __do_set_cpus_allowed(p, new_mask, 0); ++} ++ + /* + * Change a given task's CPU affinity. Migrate the thread to a + * proper CPU and schedule it away if the CPU it's executing on +@@ -1883,7 +1889,8 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) + * call is not atomic; no spinlocks may be held. + */ + static int __set_cpus_allowed_ptr(struct task_struct *p, +- const struct cpumask *new_mask, bool check) ++ const struct cpumask *new_mask, ++ u32 flags) + { + const struct cpumask *cpu_valid_mask = cpu_active_mask; + unsigned int dest_cpu; +@@ -1905,7 +1912,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, + * Must re-check here, to close a race against __kthread_bind(), + * sched_setaffinity() is not guaranteed to observe the flag. + */ +- if (check && (p->flags & PF_NO_SETAFFINITY)) { ++ if ((flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) { + ret = -EINVAL; + goto out; + } +@@ -1924,7 +1931,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, + goto out; + } + +- do_set_cpus_allowed(p, new_mask); ++ __do_set_cpus_allowed(p, new_mask, flags); + + if (p->flags & PF_KTHREAD) { + /* +@@ -1961,7 +1968,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, + + int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) + { +- return __set_cpus_allowed_ptr(p, new_mask, false); ++ return __set_cpus_allowed_ptr(p, new_mask, 0); + } + EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); + +@@ -2420,7 +2427,8 @@ void sched_set_stop_task(int cpu, struct task_struct *stop) + #else + + static inline int __set_cpus_allowed_ptr(struct task_struct *p, +- const struct cpumask *new_mask, bool check) ++ const struct cpumask *new_mask, ++ u32 flags) + { + return set_cpus_allowed_ptr(p, new_mask); + } +@@ -6022,7 +6030,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) + } + #endif + again: +- retval = __set_cpus_allowed_ptr(p, new_mask, true); ++ retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK); + + if (!retval) { + cpuset_cpus_allowed(p, cpus_allowed); +@@ -6598,7 +6606,7 @@ void __init init_idle(struct task_struct *idle, int cpu) + * + * And since this is boot we can forgo the serialization. + */ +- set_cpus_allowed_common(idle, cpumask_of(cpu)); ++ set_cpus_allowed_common(idle, cpumask_of(cpu), 0); + #endif + /* + * We're having a chicken and egg problem, even though we are +diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c +index af8569dbdc9c..5566f157640b 100644 +--- a/kernel/sched/deadline.c ++++ b/kernel/sched/deadline.c +@@ -2311,7 +2311,8 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p) + } + + static void set_cpus_allowed_dl(struct task_struct *p, +- const struct cpumask *new_mask) ++ const struct cpumask *new_mask, ++ u32 flags) + { + struct root_domain *src_rd; + struct rq *rq; +@@ -2340,7 +2341,7 @@ static void set_cpus_allowed_dl(struct task_struct *p, + raw_spin_unlock(&src_dl_b->lock); + } + +- set_cpus_allowed_common(p, new_mask); ++ set_cpus_allowed_common(p, new_mask, flags); + } + + /* Assumes rq->lock is held */ +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index a72464d370cd..f0f396cc1bee 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -1818,7 +1818,8 @@ struct sched_class { + void (*task_woken)(struct rq *this_rq, struct task_struct *task); + + void (*set_cpus_allowed)(struct task_struct *p, +- const struct cpumask *newmask); ++ const struct cpumask *newmask, ++ u32 flags); + + void (*rq_online)(struct rq *rq); + void (*rq_offline)(struct rq *rq); +@@ -1911,7 +1912,9 @@ extern void update_group_capacity(struct sched_domain *sd, int cpu); + + extern void trigger_load_balance(struct rq *rq); + +-extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask); ++#define SCA_CHECK 0x01 ++ ++extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags); + + #endif + +-- +2.43.0 + diff --git a/debian/patches-rt/0010-sched-Add-migrate_disable.patch b/debian/patches-rt/0010-sched-Add-migrate_disable.patch new file mode 100644 index 000000000..d0ea31279 --- /dev/null +++ b/debian/patches-rt/0010-sched-Add-migrate_disable.patch @@ -0,0 +1,356 @@ +From e9f7c2225ee3e1ce9317762393618c1c81a8febe Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra <peterz@infradead.org> +Date: Fri, 23 Oct 2020 12:12:07 +0200 +Subject: [PATCH 010/323] sched: Add migrate_disable() +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Add the base migrate_disable() support (under protest). + +While migrate_disable() is (currently) required for PREEMPT_RT, it is +also one of the biggest flaws in the system. + +Notably this is just the base implementation, it is broken vs +sched_setaffinity() and hotplug, both solved in additional patches for +ease of review. + +Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/linux/preempt.h | 65 +++++++++++++++++++++++ + include/linux/sched.h | 3 ++ + kernel/sched/core.c | 112 +++++++++++++++++++++++++++++++++++++--- + kernel/sched/sched.h | 6 ++- + lib/smp_processor_id.c | 5 ++ + 5 files changed, 183 insertions(+), 8 deletions(-) + +diff --git a/include/linux/preempt.h b/include/linux/preempt.h +index 7d9c1c0e149c..97ba7c920653 100644 +--- a/include/linux/preempt.h ++++ b/include/linux/preempt.h +@@ -322,6 +322,69 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier, + + #endif + ++#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT) ++ ++/* ++ * Migrate-Disable and why it is (strongly) undesired. ++ * ++ * The premise of the Real-Time schedulers we have on Linux ++ * (SCHED_FIFO/SCHED_DEADLINE) is that M CPUs can/will run M tasks ++ * concurrently, provided there are sufficient runnable tasks, also known as ++ * work-conserving. For instance SCHED_DEADLINE tries to schedule the M ++ * earliest deadline threads, and SCHED_FIFO the M highest priority threads. ++ * ++ * The correctness of various scheduling models depends on this, but is it ++ * broken by migrate_disable() that doesn't imply preempt_disable(). Where ++ * preempt_disable() implies an immediate priority ceiling, preemptible ++ * migrate_disable() allows nesting. ++ * ++ * The worst case is that all tasks preempt one another in a migrate_disable() ++ * region and stack on a single CPU. This then reduces the available bandwidth ++ * to a single CPU. And since Real-Time schedulability theory considers the ++ * Worst-Case only, all Real-Time analysis shall revert to single-CPU ++ * (instantly solving the SMP analysis problem). ++ * ++ * ++ * The reason we have it anyway. ++ * ++ * PREEMPT_RT breaks a number of assumptions traditionally held. By forcing a ++ * number of primitives into becoming preemptible, they would also allow ++ * migration. This turns out to break a bunch of per-cpu usage. To this end, ++ * all these primitives employ migirate_disable() to restore this implicit ++ * assumption. ++ * ++ * This is a 'temporary' work-around at best. The correct solution is getting ++ * rid of the above assumptions and reworking the code to employ explicit ++ * per-cpu locking or short preempt-disable regions. ++ * ++ * The end goal must be to get rid of migrate_disable(), alternatively we need ++ * a schedulability theory that does not depend on abritrary migration. ++ * ++ * ++ * Notes on the implementation. ++ * ++ * The implementation is particularly tricky since existing code patterns ++ * dictate neither migrate_disable() nor migrate_enable() is allowed to block. ++ * This means that it cannot use cpus_read_lock() to serialize against hotplug, ++ * nor can it easily migrate itself into a pending affinity mask change on ++ * migrate_enable(). ++ * ++ * ++ * Note: even non-work-conserving schedulers like semi-partitioned depends on ++ * migration, so migrate_disable() is not only a problem for ++ * work-conserving schedulers. ++ * ++ */ ++extern void migrate_disable(void); ++extern void migrate_enable(void); ++ ++#elif defined(CONFIG_PREEMPT_RT) ++ ++static inline void migrate_disable(void) { } ++static inline void migrate_enable(void) { } ++ ++#else /* !CONFIG_PREEMPT_RT */ ++ + /** + * migrate_disable - Prevent migration of the current task + * +@@ -352,4 +415,6 @@ static __always_inline void migrate_enable(void) + preempt_enable(); + } + ++#endif /* CONFIG_SMP && CONFIG_PREEMPT_RT */ ++ + #endif /* __LINUX_PREEMPT_H */ +diff --git a/include/linux/sched.h b/include/linux/sched.h +index aa015416c569..76907e9876d5 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -726,6 +726,9 @@ struct task_struct { + int nr_cpus_allowed; + const cpumask_t *cpus_ptr; + cpumask_t cpus_mask; ++#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT) ++ int migration_disabled; ++#endif + + #ifdef CONFIG_PREEMPT_RCU + int rcu_read_lock_nesting; +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 569cc5e48e68..5c9db4b2b6ec 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -1710,6 +1710,61 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) + + #ifdef CONFIG_SMP + ++#ifdef CONFIG_PREEMPT_RT ++ ++static void ++__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags); ++ ++static int __set_cpus_allowed_ptr(struct task_struct *p, ++ const struct cpumask *new_mask, ++ u32 flags); ++ ++static void migrate_disable_switch(struct rq *rq, struct task_struct *p) ++{ ++ if (likely(!p->migration_disabled)) ++ return; ++ ++ if (p->cpus_ptr != &p->cpus_mask) ++ return; ++ ++ /* ++ * Violates locking rules! see comment in __do_set_cpus_allowed(). ++ */ ++ __do_set_cpus_allowed(p, cpumask_of(rq->cpu), SCA_MIGRATE_DISABLE); ++} ++ ++void migrate_disable(void) ++{ ++ if (current->migration_disabled++) ++ return; ++ ++ barrier(); ++} ++EXPORT_SYMBOL_GPL(migrate_disable); ++ ++void migrate_enable(void) ++{ ++ struct task_struct *p = current; ++ ++ if (--p->migration_disabled) ++ return; ++ ++ barrier(); ++ ++ if (p->cpus_ptr == &p->cpus_mask) ++ return; ++ ++ __set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE); ++} ++EXPORT_SYMBOL_GPL(migrate_enable); ++ ++static inline bool is_migration_disabled(struct task_struct *p) ++{ ++ return p->migration_disabled; ++} ++ ++#endif ++ + /* + * Per-CPU kthreads are allowed to run on !active && online CPUs, see + * __set_cpus_allowed_ptr() and select_fallback_rq(). +@@ -1719,7 +1774,7 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu) + if (!cpumask_test_cpu(cpu, p->cpus_ptr)) + return false; + +- if (is_per_cpu_kthread(p)) ++ if (is_per_cpu_kthread(p) || is_migration_disabled(p)) + return cpu_online(cpu); + + return cpu_active(cpu); +@@ -1840,6 +1895,11 @@ static int migration_cpu_stop(void *data) + */ + void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags) + { ++ if (flags & (SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) { ++ p->cpus_ptr = new_mask; ++ return; ++ } ++ + cpumask_copy(&p->cpus_mask, new_mask); + p->nr_cpus_allowed = cpumask_weight(new_mask); + } +@@ -1850,7 +1910,22 @@ __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 + struct rq *rq = task_rq(p); + bool queued, running; + +- lockdep_assert_held(&p->pi_lock); ++ /* ++ * This here violates the locking rules for affinity, since we're only ++ * supposed to change these variables while holding both rq->lock and ++ * p->pi_lock. ++ * ++ * HOWEVER, it magically works, because ttwu() is the only code that ++ * accesses these variables under p->pi_lock and only does so after ++ * smp_cond_load_acquire(&p->on_cpu, !VAL), and we're in __schedule() ++ * before finish_task(). ++ * ++ * XXX do further audits, this smells like something putrid. ++ */ ++ if (flags & SCA_MIGRATE_DISABLE) ++ SCHED_WARN_ON(!p->on_cpu); ++ else ++ lockdep_assert_held(&p->pi_lock); + + queued = task_on_rq_queued(p); + running = task_current(rq, p); +@@ -1901,9 +1976,14 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, + rq = task_rq_lock(p, &rf); + update_rq_clock(rq); + +- if (p->flags & PF_KTHREAD) { ++ if (p->flags & PF_KTHREAD || is_migration_disabled(p)) { + /* +- * Kernel threads are allowed on online && !active CPUs ++ * Kernel threads are allowed on online && !active CPUs. ++ * ++ * Specifically, migration_disabled() tasks must not fail the ++ * cpumask_any_and_distribute() pick below, esp. so on ++ * SCA_MIGRATE_ENABLE, otherwise we'll not call ++ * set_cpus_allowed_common() and actually reset p->cpus_ptr. + */ + cpu_valid_mask = cpu_online_mask; + } +@@ -1917,7 +1997,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, + goto out; + } + +- if (cpumask_equal(&p->cpus_mask, new_mask)) ++ if (!(flags & SCA_MIGRATE_ENABLE) && cpumask_equal(&p->cpus_mask, new_mask)) + goto out; + + /* +@@ -2009,6 +2089,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) + * Clearly, migrating tasks to offline CPUs is a fairly daft thing. + */ + WARN_ON_ONCE(!cpu_online(new_cpu)); ++ ++ WARN_ON_ONCE(is_migration_disabled(p)); + #endif + + trace_sched_migrate_task(p, new_cpu); +@@ -2339,6 +2421,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p) + } + fallthrough; + case possible: ++ /* ++ * XXX When called from select_task_rq() we only ++ * hold p->pi_lock and again violate locking order. ++ * ++ * More yuck to audit. ++ */ + do_set_cpus_allowed(p, cpu_possible_mask); + state = fail; + break; +@@ -2373,7 +2461,7 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) + { + lockdep_assert_held(&p->pi_lock); + +- if (p->nr_cpus_allowed > 1) ++ if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p)) + cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); + else + cpu = cpumask_any(p->cpus_ptr); +@@ -2435,6 +2523,17 @@ static inline int __set_cpus_allowed_ptr(struct task_struct *p, + + #endif /* CONFIG_SMP */ + ++#if !defined(CONFIG_SMP) || !defined(CONFIG_PREEMPT_RT) ++ ++static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { } ++ ++static inline bool is_migration_disabled(struct task_struct *p) ++{ ++ return false; ++} ++ ++#endif ++ + static void + ttwu_stat(struct task_struct *p, int cpu, int wake_flags) + { +@@ -4595,6 +4694,7 @@ static void __sched notrace __schedule(bool preempt) + */ + ++*switch_count; + ++ migrate_disable_switch(rq, prev); + psi_sched_switch(prev, next, !task_on_rq_queued(prev)); + + trace_sched_switch(preempt, prev, next); +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index f0f396cc1bee..f3109adda484 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -1906,14 +1906,16 @@ static inline bool sched_fair_runnable(struct rq *rq) + extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); + extern struct task_struct *pick_next_task_idle(struct rq *rq); + ++#define SCA_CHECK 0x01 ++#define SCA_MIGRATE_DISABLE 0x02 ++#define SCA_MIGRATE_ENABLE 0x04 ++ + #ifdef CONFIG_SMP + + extern void update_group_capacity(struct sched_domain *sd, int cpu); + + extern void trigger_load_balance(struct rq *rq); + +-#define SCA_CHECK 0x01 +- + extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags); + + #endif +diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c +index 2916606a9333..dbb96ebf661f 100644 +--- a/lib/smp_processor_id.c ++++ b/lib/smp_processor_id.c +@@ -26,6 +26,11 @@ unsigned int check_preemption_disabled(const char *what1, const char *what2) + if (current->nr_cpus_allowed == 1) + goto out; + ++#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT) ++ if (current->migration_disabled) ++ goto out; ++#endif ++ + /* + * It is valid to assume CPU-locality during early bootup: + */ +-- +2.43.0 + diff --git a/debian/patches-rt/0011-sched-Fix-migrate_disable-vs-set_cpus_allowed_ptr.patch b/debian/patches-rt/0011-sched-Fix-migrate_disable-vs-set_cpus_allowed_ptr.patch new file mode 100644 index 000000000..d89f86138 --- /dev/null +++ b/debian/patches-rt/0011-sched-Fix-migrate_disable-vs-set_cpus_allowed_ptr.patch @@ -0,0 +1,370 @@ +From f6fcadc60ec427b9eeb4b734b77b7b110c050b83 Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra <peterz@infradead.org> +Date: Fri, 23 Oct 2020 12:12:08 +0200 +Subject: [PATCH 011/323] sched: Fix migrate_disable() vs + set_cpus_allowed_ptr() +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Concurrent migrate_disable() and set_cpus_allowed_ptr() has +interesting features. We rely on set_cpus_allowed_ptr() to not return +until the task runs inside the provided mask. This expectation is +exported to userspace. + +This means that any set_cpus_allowed_ptr() caller must wait until +migrate_enable() allows migrations. + +At the same time, we don't want migrate_enable() to schedule, due to +patterns like: + + preempt_disable(); + migrate_disable(); + ... + migrate_enable(); + preempt_enable(); + +And: + + raw_spin_lock(&B); + spin_unlock(&A); + +this means that when migrate_enable() must restore the affinity +mask, it cannot wait for completion thereof. Luck will have it that +that is exactly the case where there is a pending +set_cpus_allowed_ptr(), so let that provide storage for the async stop +machine. + +Much thanks to Valentin who used TLA+ most effective and found lots of +'interesting' cases. + +Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/linux/sched.h | 1 + + kernel/sched/core.c | 234 ++++++++++++++++++++++++++++++++++++------ + 2 files changed, 205 insertions(+), 30 deletions(-) + +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 76907e9876d5..5b5c194f5a62 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -726,6 +726,7 @@ struct task_struct { + int nr_cpus_allowed; + const cpumask_t *cpus_ptr; + cpumask_t cpus_mask; ++ void *migration_pending; + #if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT) + int migration_disabled; + #endif +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 5c9db4b2b6ec..3af7c42896c9 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -1746,15 +1746,26 @@ void migrate_enable(void) + { + struct task_struct *p = current; + +- if (--p->migration_disabled) ++ if (p->migration_disabled > 1) { ++ p->migration_disabled--; + return; ++ } + ++ /* ++ * Ensure stop_task runs either before or after this, and that ++ * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule(). ++ */ ++ preempt_disable(); ++ if (p->cpus_ptr != &p->cpus_mask) ++ __set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE); ++ /* ++ * Mustn't clear migration_disabled() until cpus_ptr points back at the ++ * regular cpus_mask, otherwise things that race (eg. ++ * select_fallback_rq) get confused. ++ */ + barrier(); +- +- if (p->cpus_ptr == &p->cpus_mask) +- return; +- +- __set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE); ++ p->migration_disabled = 0; ++ preempt_enable(); + } + EXPORT_SYMBOL_GPL(migrate_enable); + +@@ -1819,8 +1830,16 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf, + } + + struct migration_arg { +- struct task_struct *task; +- int dest_cpu; ++ struct task_struct *task; ++ int dest_cpu; ++ struct set_affinity_pending *pending; ++}; ++ ++struct set_affinity_pending { ++ refcount_t refs; ++ struct completion done; ++ struct cpu_stop_work stop_work; ++ struct migration_arg arg; + }; + + /* +@@ -1852,16 +1871,19 @@ static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf, + */ + static int migration_cpu_stop(void *data) + { ++ struct set_affinity_pending *pending; + struct migration_arg *arg = data; + struct task_struct *p = arg->task; ++ int dest_cpu = arg->dest_cpu; + struct rq *rq = this_rq(); ++ bool complete = false; + struct rq_flags rf; + + /* + * The original target CPU might have gone down and we might + * be on another CPU but it doesn't matter. + */ +- local_irq_disable(); ++ local_irq_save(rf.flags); + /* + * We need to explicitly wake pending tasks before running + * __migrate_task() such that we will not miss enforcing cpus_ptr +@@ -1871,21 +1893,83 @@ static int migration_cpu_stop(void *data) + + raw_spin_lock(&p->pi_lock); + rq_lock(rq, &rf); ++ ++ pending = p->migration_pending; + /* + * If task_rq(p) != rq, it cannot be migrated here, because we're + * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because + * we're holding p->pi_lock. + */ + if (task_rq(p) == rq) { ++ if (is_migration_disabled(p)) ++ goto out; ++ ++ if (pending) { ++ p->migration_pending = NULL; ++ complete = true; ++ } ++ ++ /* migrate_enable() -- we must not race against SCA */ ++ if (dest_cpu < 0) { ++ /* ++ * When this was migrate_enable() but we no longer ++ * have a @pending, a concurrent SCA 'fixed' things ++ * and we should be valid again. Nothing to do. ++ */ ++ if (!pending) { ++ WARN_ON_ONCE(!is_cpu_allowed(p, cpu_of(rq))); ++ goto out; ++ } ++ ++ dest_cpu = cpumask_any_distribute(&p->cpus_mask); ++ } ++ + if (task_on_rq_queued(p)) +- rq = __migrate_task(rq, &rf, p, arg->dest_cpu); ++ rq = __migrate_task(rq, &rf, p, dest_cpu); + else +- p->wake_cpu = arg->dest_cpu; ++ p->wake_cpu = dest_cpu; ++ ++ } else if (dest_cpu < 0) { ++ /* ++ * This happens when we get migrated between migrate_enable()'s ++ * preempt_enable() and scheduling the stopper task. At that ++ * point we're a regular task again and not current anymore. ++ * ++ * A !PREEMPT kernel has a giant hole here, which makes it far ++ * more likely. ++ */ ++ ++ /* ++ * When this was migrate_enable() but we no longer have an ++ * @pending, a concurrent SCA 'fixed' things and we should be ++ * valid again. Nothing to do. ++ */ ++ if (!pending) { ++ WARN_ON_ONCE(!is_cpu_allowed(p, cpu_of(rq))); ++ goto out; ++ } ++ ++ /* ++ * When migrate_enable() hits a rq mis-match we can't reliably ++ * determine is_migration_disabled() and so have to chase after ++ * it. ++ */ ++ task_rq_unlock(rq, p, &rf); ++ stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop, ++ &pending->arg, &pending->stop_work); ++ return 0; + } +- rq_unlock(rq, &rf); +- raw_spin_unlock(&p->pi_lock); ++out: ++ task_rq_unlock(rq, p, &rf); ++ ++ if (complete) ++ complete_all(&pending->done); ++ ++ /* For pending->{arg,stop_work} */ ++ pending = arg->pending; ++ if (pending && refcount_dec_and_test(&pending->refs)) ++ wake_up_var(&pending->refs); + +- local_irq_enable(); + return 0; + } + +@@ -1954,6 +2038,110 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) + __do_set_cpus_allowed(p, new_mask, 0); + } + ++/* ++ * This function is wildly self concurrent, consider at least 3 times. ++ */ ++static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flags *rf, ++ int dest_cpu, unsigned int flags) ++{ ++ struct set_affinity_pending my_pending = { }, *pending = NULL; ++ struct migration_arg arg = { ++ .task = p, ++ .dest_cpu = dest_cpu, ++ }; ++ bool complete = false; ++ ++ /* Can the task run on the task's current CPU? If so, we're done */ ++ if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) { ++ pending = p->migration_pending; ++ if (pending) { ++ refcount_inc(&pending->refs); ++ p->migration_pending = NULL; ++ complete = true; ++ } ++ task_rq_unlock(rq, p, rf); ++ ++ if (complete) ++ goto do_complete; ++ ++ return 0; ++ } ++ ++ if (!(flags & SCA_MIGRATE_ENABLE)) { ++ /* serialized by p->pi_lock */ ++ if (!p->migration_pending) { ++ refcount_set(&my_pending.refs, 1); ++ init_completion(&my_pending.done); ++ p->migration_pending = &my_pending; ++ } else { ++ pending = p->migration_pending; ++ refcount_inc(&pending->refs); ++ } ++ } ++ pending = p->migration_pending; ++ /* ++ * - !MIGRATE_ENABLE: ++ * we'll have installed a pending if there wasn't one already. ++ * ++ * - MIGRATE_ENABLE: ++ * we're here because the current CPU isn't matching anymore, ++ * the only way that can happen is because of a concurrent ++ * set_cpus_allowed_ptr() call, which should then still be ++ * pending completion. ++ * ++ * Either way, we really should have a @pending here. ++ */ ++ if (WARN_ON_ONCE(!pending)) ++ return -EINVAL; ++ ++ if (flags & SCA_MIGRATE_ENABLE) { ++ ++ refcount_inc(&pending->refs); /* pending->{arg,stop_work} */ ++ task_rq_unlock(rq, p, rf); ++ ++ pending->arg = (struct migration_arg) { ++ .task = p, ++ .dest_cpu = -1, ++ .pending = pending, ++ }; ++ ++ stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop, ++ &pending->arg, &pending->stop_work); ++ ++ return 0; ++ } ++ ++ if (task_running(rq, p) || p->state == TASK_WAKING) { ++ ++ task_rq_unlock(rq, p, rf); ++ stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); ++ ++ } else { ++ ++ if (!is_migration_disabled(p)) { ++ if (task_on_rq_queued(p)) ++ rq = move_queued_task(rq, rf, p, dest_cpu); ++ ++ p->migration_pending = NULL; ++ complete = true; ++ } ++ task_rq_unlock(rq, p, rf); ++ ++do_complete: ++ if (complete) ++ complete_all(&pending->done); ++ } ++ ++ wait_for_completion(&pending->done); ++ ++ if (refcount_dec_and_test(&pending->refs)) ++ wake_up_var(&pending->refs); ++ ++ wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs)); ++ ++ return 0; ++} ++ + /* + * Change a given task's CPU affinity. Migrate the thread to a + * proper CPU and schedule it away if the CPU it's executing on +@@ -2023,23 +2211,8 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, + p->nr_cpus_allowed != 1); + } + +- /* Can the task run on the task's current CPU? If so, we're done */ +- if (cpumask_test_cpu(task_cpu(p), new_mask)) +- goto out; ++ return affine_move_task(rq, p, &rf, dest_cpu, flags); + +- if (task_running(rq, p) || p->state == TASK_WAKING) { +- struct migration_arg arg = { p, dest_cpu }; +- /* Need help from migration thread: drop lock and wait. */ +- task_rq_unlock(rq, p, &rf); +- stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); +- return 0; +- } else if (task_on_rq_queued(p)) { +- /* +- * OK, since we're going to drop the lock immediately +- * afterwards anyway. +- */ +- rq = move_queued_task(rq, &rf, p, dest_cpu); +- } + out: + task_rq_unlock(rq, p, &rf); + +@@ -3230,6 +3403,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) + init_numa_balancing(clone_flags, p); + #ifdef CONFIG_SMP + p->wake_entry.u_flags = CSD_TYPE_TTWU; ++ p->migration_pending = NULL; + #endif + } + +-- +2.43.0 + diff --git a/debian/patches-rt/0012-sched-core-Make-migrate-disable-and-CPU-hotplug-coop.patch b/debian/patches-rt/0012-sched-core-Make-migrate-disable-and-CPU-hotplug-coop.patch new file mode 100644 index 000000000..e89668431 --- /dev/null +++ b/debian/patches-rt/0012-sched-core-Make-migrate-disable-and-CPU-hotplug-coop.patch @@ -0,0 +1,137 @@ +From 02829fffd43c5fe3e617d07e0a94d5164324449b Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Fri, 23 Oct 2020 12:12:09 +0200 +Subject: [PATCH 012/323] sched/core: Make migrate disable and CPU hotplug + cooperative +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +On CPU unplug tasks which are in a migrate disabled region cannot be pushed +to a different CPU until they returned to migrateable state. + +Account the number of tasks on a runqueue which are in a migrate disabled +section and make the hotplug wait mechanism respect that. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + kernel/sched/core.c | 36 ++++++++++++++++++++++++++++++------ + kernel/sched/sched.h | 4 ++++ + 2 files changed, 34 insertions(+), 6 deletions(-) + +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 3af7c42896c9..2517a003295b 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -1735,10 +1735,17 @@ static void migrate_disable_switch(struct rq *rq, struct task_struct *p) + + void migrate_disable(void) + { +- if (current->migration_disabled++) ++ struct task_struct *p = current; ++ ++ if (p->migration_disabled) { ++ p->migration_disabled++; + return; ++ } + +- barrier(); ++ preempt_disable(); ++ this_rq()->nr_pinned++; ++ p->migration_disabled = 1; ++ preempt_enable(); + } + EXPORT_SYMBOL_GPL(migrate_disable); + +@@ -1765,6 +1772,7 @@ void migrate_enable(void) + */ + barrier(); + p->migration_disabled = 0; ++ this_rq()->nr_pinned--; + preempt_enable(); + } + EXPORT_SYMBOL_GPL(migrate_enable); +@@ -1774,6 +1782,11 @@ static inline bool is_migration_disabled(struct task_struct *p) + return p->migration_disabled; + } + ++static inline bool rq_has_pinned_tasks(struct rq *rq) ++{ ++ return rq->nr_pinned; ++} ++ + #endif + + /* +@@ -2705,6 +2718,11 @@ static inline bool is_migration_disabled(struct task_struct *p) + return false; + } + ++static inline bool rq_has_pinned_tasks(struct rq *rq) ++{ ++ return false; ++} ++ + #endif + + static void +@@ -7064,15 +7082,20 @@ static void balance_push(struct rq *rq) + * Both the cpu-hotplug and stop task are in this case and are + * required to complete the hotplug process. + */ +- if (is_per_cpu_kthread(push_task)) { ++ if (is_per_cpu_kthread(push_task) || is_migration_disabled(push_task)) { + /* + * If this is the idle task on the outgoing CPU try to wake + * up the hotplug control thread which might wait for the + * last task to vanish. The rcuwait_active() check is + * accurate here because the waiter is pinned on this CPU + * and can't obviously be running in parallel. ++ * ++ * On RT kernels this also has to check whether there are ++ * pinned and scheduled out tasks on the runqueue. They ++ * need to leave the migrate disabled section first. + */ +- if (!rq->nr_running && rcuwait_active(&rq->hotplug_wait)) { ++ if (!rq->nr_running && !rq_has_pinned_tasks(rq) && ++ rcuwait_active(&rq->hotplug_wait)) { + raw_spin_unlock(&rq->lock); + rcuwait_wake_up(&rq->hotplug_wait); + raw_spin_lock(&rq->lock); +@@ -7119,7 +7142,8 @@ static void balance_hotplug_wait(void) + { + struct rq *rq = this_rq(); + +- rcuwait_wait_event(&rq->hotplug_wait, rq->nr_running == 1, ++ rcuwait_wait_event(&rq->hotplug_wait, ++ rq->nr_running == 1 && !rq_has_pinned_tasks(rq), + TASK_UNINTERRUPTIBLE); + } + +@@ -7366,7 +7390,7 @@ int sched_cpu_dying(unsigned int cpu) + sched_tick_stop(cpu); + + rq_lock_irqsave(rq, &rf); +- BUG_ON(rq->nr_running != 1); ++ BUG_ON(rq->nr_running != 1 || rq_has_pinned_tasks(rq)); + rq_unlock_irqrestore(rq, &rf); + + calc_load_migrate(rq); +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index f3109adda484..8237c9ab2bb8 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -1057,6 +1057,10 @@ struct rq { + /* Must be inspected within a rcu lock section */ + struct cpuidle_state *idle_state; + #endif ++ ++#if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP) ++ unsigned int nr_pinned; ++#endif + }; + + #ifdef CONFIG_FAIR_GROUP_SCHED +-- +2.43.0 + diff --git a/debian/patches-rt/0013-sched-rt-Use-cpumask_any-_distribute.patch b/debian/patches-rt/0013-sched-rt-Use-cpumask_any-_distribute.patch new file mode 100644 index 000000000..b190e58d4 --- /dev/null +++ b/debian/patches-rt/0013-sched-rt-Use-cpumask_any-_distribute.patch @@ -0,0 +1,121 @@ +From 0d3b4a8d9391d1eb1efb998bfcaff013a01466bf Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra <peterz@infradead.org> +Date: Fri, 23 Oct 2020 12:12:10 +0200 +Subject: [PATCH 013/323] sched,rt: Use cpumask_any*_distribute() +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Replace a bunch of cpumask_any*() instances with +cpumask_any*_distribute(), by injecting this little bit of random in +cpu selection, we reduce the chance two competing balance operations +working off the same lowest_mask pick the same CPU. + +Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/linux/cpumask.h | 6 ++++++ + kernel/sched/deadline.c | 6 +++--- + kernel/sched/rt.c | 6 +++--- + lib/cpumask.c | 18 ++++++++++++++++++ + 4 files changed, 30 insertions(+), 6 deletions(-) + +diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h +index f0d895d6ac39..383684e30f12 100644 +--- a/include/linux/cpumask.h ++++ b/include/linux/cpumask.h +@@ -199,6 +199,11 @@ static inline int cpumask_any_and_distribute(const struct cpumask *src1p, + return cpumask_next_and(-1, src1p, src2p); + } + ++static inline int cpumask_any_distribute(const struct cpumask *srcp) ++{ ++ return cpumask_first(srcp); ++} ++ + #define for_each_cpu(cpu, mask) \ + for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask) + #define for_each_cpu_not(cpu, mask) \ +@@ -252,6 +257,7 @@ int cpumask_any_but(const struct cpumask *mask, unsigned int cpu); + unsigned int cpumask_local_spread(unsigned int i, int node); + int cpumask_any_and_distribute(const struct cpumask *src1p, + const struct cpumask *src2p); ++int cpumask_any_distribute(const struct cpumask *srcp); + + /** + * for_each_cpu - iterate over every cpu in a mask +diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c +index 5566f157640b..e64d378e4e87 100644 +--- a/kernel/sched/deadline.c ++++ b/kernel/sched/deadline.c +@@ -2012,8 +2012,8 @@ static int find_later_rq(struct task_struct *task) + return this_cpu; + } + +- best_cpu = cpumask_first_and(later_mask, +- sched_domain_span(sd)); ++ best_cpu = cpumask_any_and_distribute(later_mask, ++ sched_domain_span(sd)); + /* + * Last chance: if a CPU being in both later_mask + * and current sd span is valid, that becomes our +@@ -2035,7 +2035,7 @@ static int find_later_rq(struct task_struct *task) + if (this_cpu != -1) + return this_cpu; + +- cpu = cpumask_any(later_mask); ++ cpu = cpumask_any_distribute(later_mask); + if (cpu < nr_cpu_ids) + return cpu; + +diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c +index fdcce04913db..695526a54a89 100644 +--- a/kernel/sched/rt.c ++++ b/kernel/sched/rt.c +@@ -1766,8 +1766,8 @@ static int find_lowest_rq(struct task_struct *task) + return this_cpu; + } + +- best_cpu = cpumask_first_and(lowest_mask, +- sched_domain_span(sd)); ++ best_cpu = cpumask_any_and_distribute(lowest_mask, ++ sched_domain_span(sd)); + if (best_cpu < nr_cpu_ids) { + rcu_read_unlock(); + return best_cpu; +@@ -1784,7 +1784,7 @@ static int find_lowest_rq(struct task_struct *task) + if (this_cpu != -1) + return this_cpu; + +- cpu = cpumask_any(lowest_mask); ++ cpu = cpumask_any_distribute(lowest_mask); + if (cpu < nr_cpu_ids) + return cpu; + +diff --git a/lib/cpumask.c b/lib/cpumask.c +index fb22fb266f93..c3c76b833384 100644 +--- a/lib/cpumask.c ++++ b/lib/cpumask.c +@@ -261,3 +261,21 @@ int cpumask_any_and_distribute(const struct cpumask *src1p, + return next; + } + EXPORT_SYMBOL(cpumask_any_and_distribute); ++ ++int cpumask_any_distribute(const struct cpumask *srcp) ++{ ++ int next, prev; ++ ++ /* NOTE: our first selection will skip 0. */ ++ prev = __this_cpu_read(distribute_cpu_mask_prev); ++ ++ next = cpumask_next(prev, srcp); ++ if (next >= nr_cpu_ids) ++ next = cpumask_first(srcp); ++ ++ if (next < nr_cpu_ids) ++ __this_cpu_write(distribute_cpu_mask_prev, next); ++ ++ return next; ++} ++EXPORT_SYMBOL(cpumask_any_distribute); +-- +2.43.0 + diff --git a/debian/patches-rt/0014-sched-rt-Use-the-full-cpumask-for-balancing.patch b/debian/patches-rt/0014-sched-rt-Use-the-full-cpumask-for-balancing.patch new file mode 100644 index 000000000..c2817a0b7 --- /dev/null +++ b/debian/patches-rt/0014-sched-rt-Use-the-full-cpumask-for-balancing.patch @@ -0,0 +1,105 @@ +From 75c028b5537f7205cfe5aecf8cf6bb61ef1428f6 Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra <peterz@infradead.org> +Date: Fri, 23 Oct 2020 12:12:11 +0200 +Subject: [PATCH 014/323] sched,rt: Use the full cpumask for balancing +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +We want migrate_disable() tasks to get PULLs in order for them to PUSH +away the higher priority task. + +Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + kernel/sched/cpudeadline.c | 4 ++-- + kernel/sched/cpupri.c | 4 ++-- + kernel/sched/deadline.c | 4 ++-- + kernel/sched/rt.c | 4 ++-- + 4 files changed, 8 insertions(+), 8 deletions(-) + +diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c +index 8cb06c8c7eb1..ceb03d76c0cc 100644 +--- a/kernel/sched/cpudeadline.c ++++ b/kernel/sched/cpudeadline.c +@@ -120,7 +120,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, + const struct sched_dl_entity *dl_se = &p->dl; + + if (later_mask && +- cpumask_and(later_mask, cp->free_cpus, p->cpus_ptr)) { ++ cpumask_and(later_mask, cp->free_cpus, &p->cpus_mask)) { + unsigned long cap, max_cap = 0; + int cpu, max_cpu = -1; + +@@ -151,7 +151,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, + + WARN_ON(best_cpu != -1 && !cpu_present(best_cpu)); + +- if (cpumask_test_cpu(best_cpu, p->cpus_ptr) && ++ if (cpumask_test_cpu(best_cpu, &p->cpus_mask) && + dl_time_before(dl_se->deadline, cp->elements[0].dl)) { + if (later_mask) + cpumask_set_cpu(best_cpu, later_mask); +diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c +index 0033731a0797..11c4df2010de 100644 +--- a/kernel/sched/cpupri.c ++++ b/kernel/sched/cpupri.c +@@ -73,11 +73,11 @@ static inline int __cpupri_find(struct cpupri *cp, struct task_struct *p, + if (skip) + return 0; + +- if (cpumask_any_and(p->cpus_ptr, vec->mask) >= nr_cpu_ids) ++ if (cpumask_any_and(&p->cpus_mask, vec->mask) >= nr_cpu_ids) + return 0; + + if (lowest_mask) { +- cpumask_and(lowest_mask, p->cpus_ptr, vec->mask); ++ cpumask_and(lowest_mask, &p->cpus_mask, vec->mask); + + /* + * We have to ensure that we have at least one bit +diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c +index e64d378e4e87..94977a6ced8b 100644 +--- a/kernel/sched/deadline.c ++++ b/kernel/sched/deadline.c +@@ -1922,7 +1922,7 @@ static void task_fork_dl(struct task_struct *p) + static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) + { + if (!task_running(rq, p) && +- cpumask_test_cpu(cpu, p->cpus_ptr)) ++ cpumask_test_cpu(cpu, &p->cpus_mask)) + return 1; + return 0; + } +@@ -2072,7 +2072,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) + /* Retry if something changed. */ + if (double_lock_balance(rq, later_rq)) { + if (unlikely(task_rq(task) != rq || +- !cpumask_test_cpu(later_rq->cpu, task->cpus_ptr) || ++ !cpumask_test_cpu(later_rq->cpu, &task->cpus_mask) || + task_running(rq, task) || + !dl_task(task) || + !task_on_rq_queued(task))) { +diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c +index 695526a54a89..f91339c0dee8 100644 +--- a/kernel/sched/rt.c ++++ b/kernel/sched/rt.c +@@ -1672,7 +1672,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) + static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) + { + if (!task_running(rq, p) && +- cpumask_test_cpu(cpu, p->cpus_ptr)) ++ cpumask_test_cpu(cpu, &p->cpus_mask)) + return 1; + + return 0; +@@ -1825,7 +1825,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) + * Also make sure that it wasn't scheduled on its rq. + */ + if (unlikely(task_rq(task) != rq || +- !cpumask_test_cpu(lowest_rq->cpu, task->cpus_ptr) || ++ !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask) || + task_running(rq, task) || + !rt_task(task) || + !task_on_rq_queued(task))) { +-- +2.43.0 + diff --git a/debian/patches-rt/0015-sched-lockdep-Annotate-pi_lock-recursion.patch b/debian/patches-rt/0015-sched-lockdep-Annotate-pi_lock-recursion.patch new file mode 100644 index 000000000..05030516b --- /dev/null +++ b/debian/patches-rt/0015-sched-lockdep-Annotate-pi_lock-recursion.patch @@ -0,0 +1,52 @@ +From c0e34a5ef4e854220a216e934ab90f04d4432bdd Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra <peterz@infradead.org> +Date: Fri, 23 Oct 2020 12:12:12 +0200 +Subject: [PATCH 015/323] sched, lockdep: Annotate ->pi_lock recursion +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +There's a valid ->pi_lock recursion issue where the actual PI code +tries to wake up the stop task. Make lockdep aware so it doesn't +complain about this. + +Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + kernel/sched/core.c | 15 +++++++++++++++ + 1 file changed, 15 insertions(+) + +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 2517a003295b..abbf01f77a76 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -2670,6 +2670,7 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) + + void sched_set_stop_task(int cpu, struct task_struct *stop) + { ++ static struct lock_class_key stop_pi_lock; + struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; + struct task_struct *old_stop = cpu_rq(cpu)->stop; + +@@ -2685,6 +2686,20 @@ void sched_set_stop_task(int cpu, struct task_struct *stop) + sched_setscheduler_nocheck(stop, SCHED_FIFO, ¶m); + + stop->sched_class = &stop_sched_class; ++ ++ /* ++ * The PI code calls rt_mutex_setprio() with ->pi_lock held to ++ * adjust the effective priority of a task. As a result, ++ * rt_mutex_setprio() can trigger (RT) balancing operations, ++ * which can then trigger wakeups of the stop thread to push ++ * around the current task. ++ * ++ * The stop task itself will never be part of the PI-chain, it ++ * never blocks, therefore that ->pi_lock recursion is safe. ++ * Tell lockdep about this by placing the stop->pi_lock in its ++ * own class. ++ */ ++ lockdep_set_class(&stop->pi_lock, &stop_pi_lock); + } + + cpu_rq(cpu)->stop = stop; +-- +2.43.0 + diff --git a/debian/patches-rt/0016-sched-Fix-migrate_disable-vs-rt-dl-balancing.patch b/debian/patches-rt/0016-sched-Fix-migrate_disable-vs-rt-dl-balancing.patch new file mode 100644 index 000000000..b665e6d22 --- /dev/null +++ b/debian/patches-rt/0016-sched-Fix-migrate_disable-vs-rt-dl-balancing.patch @@ -0,0 +1,495 @@ +From 8121011ded919f172bbbc7f5c095ca29d83a5aed Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra <peterz@infradead.org> +Date: Fri, 23 Oct 2020 12:12:13 +0200 +Subject: [PATCH 016/323] sched: Fix migrate_disable() vs rt/dl balancing +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +In order to minimize the interference of migrate_disable() on lower +priority tasks, which can be deprived of runtime due to being stuck +below a higher priority task. Teach the RT/DL balancers to push away +these higher priority tasks when a lower priority task gets selected +to run on a freshly demoted CPU (pull). + +This adds migration interference to the higher priority task, but +restores bandwidth to system that would otherwise be irrevocably lost. +Without this it would be possible to have all tasks on the system +stuck on a single CPU, each task preempted in a migrate_disable() +section with a single high priority task running. + +This way we can still approximate running the M highest priority tasks +on the system. + +Migrating the top task away is (ofcourse) still subject to +migrate_disable() too, which means the lower task is subject to an +interference equivalent to the worst case migrate_disable() section. + +Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/linux/preempt.h | 40 +++++++++++++----------- + include/linux/sched.h | 3 +- + kernel/sched/core.c | 67 +++++++++++++++++++++++++++++++++++------ + kernel/sched/deadline.c | 29 +++++++++++++----- + kernel/sched/rt.c | 63 ++++++++++++++++++++++++++++++-------- + kernel/sched/sched.h | 32 ++++++++++++++++++++ + 6 files changed, 186 insertions(+), 48 deletions(-) + +diff --git a/include/linux/preempt.h b/include/linux/preempt.h +index 97ba7c920653..8b43922e65df 100644 +--- a/include/linux/preempt.h ++++ b/include/linux/preempt.h +@@ -325,24 +325,28 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier, + #if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT) + + /* +- * Migrate-Disable and why it is (strongly) undesired. +- * +- * The premise of the Real-Time schedulers we have on Linux +- * (SCHED_FIFO/SCHED_DEADLINE) is that M CPUs can/will run M tasks +- * concurrently, provided there are sufficient runnable tasks, also known as +- * work-conserving. For instance SCHED_DEADLINE tries to schedule the M +- * earliest deadline threads, and SCHED_FIFO the M highest priority threads. +- * +- * The correctness of various scheduling models depends on this, but is it +- * broken by migrate_disable() that doesn't imply preempt_disable(). Where +- * preempt_disable() implies an immediate priority ceiling, preemptible +- * migrate_disable() allows nesting. +- * +- * The worst case is that all tasks preempt one another in a migrate_disable() +- * region and stack on a single CPU. This then reduces the available bandwidth +- * to a single CPU. And since Real-Time schedulability theory considers the +- * Worst-Case only, all Real-Time analysis shall revert to single-CPU +- * (instantly solving the SMP analysis problem). ++ * Migrate-Disable and why it is undesired. ++ * ++ * When a preempted task becomes elegible to run under the ideal model (IOW it ++ * becomes one of the M highest priority tasks), it might still have to wait ++ * for the preemptee's migrate_disable() section to complete. Thereby suffering ++ * a reduction in bandwidth in the exact duration of the migrate_disable() ++ * section. ++ * ++ * Per this argument, the change from preempt_disable() to migrate_disable() ++ * gets us: ++ * ++ * - a higher priority tasks gains reduced wake-up latency; with preempt_disable() ++ * it would have had to wait for the lower priority task. ++ * ++ * - a lower priority tasks; which under preempt_disable() could've instantly ++ * migrated away when another CPU becomes available, is now constrained ++ * by the ability to push the higher priority task away, which might itself be ++ * in a migrate_disable() section, reducing it's available bandwidth. ++ * ++ * IOW it trades latency / moves the interference term, but it stays in the ++ * system, and as long as it remains unbounded, the system is not fully ++ * deterministic. + * + * + * The reason we have it anyway. +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 5b5c194f5a62..7ca1f3e740dd 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -728,8 +728,9 @@ struct task_struct { + cpumask_t cpus_mask; + void *migration_pending; + #if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT) +- int migration_disabled; ++ unsigned short migration_disabled; + #endif ++ unsigned short migration_flags; + + #ifdef CONFIG_PREEMPT_RCU + int rcu_read_lock_nesting; +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index abbf01f77a76..452fc1dfb143 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -1777,11 +1777,6 @@ void migrate_enable(void) + } + EXPORT_SYMBOL_GPL(migrate_enable); + +-static inline bool is_migration_disabled(struct task_struct *p) +-{ +- return p->migration_disabled; +-} +- + static inline bool rq_has_pinned_tasks(struct rq *rq) + { + return rq->nr_pinned; +@@ -1986,6 +1981,49 @@ static int migration_cpu_stop(void *data) + return 0; + } + ++int push_cpu_stop(void *arg) ++{ ++ struct rq *lowest_rq = NULL, *rq = this_rq(); ++ struct task_struct *p = arg; ++ ++ raw_spin_lock_irq(&p->pi_lock); ++ raw_spin_lock(&rq->lock); ++ ++ if (task_rq(p) != rq) ++ goto out_unlock; ++ ++ if (is_migration_disabled(p)) { ++ p->migration_flags |= MDF_PUSH; ++ goto out_unlock; ++ } ++ ++ p->migration_flags &= ~MDF_PUSH; ++ ++ if (p->sched_class->find_lock_rq) ++ lowest_rq = p->sched_class->find_lock_rq(p, rq); ++ ++ if (!lowest_rq) ++ goto out_unlock; ++ ++ // XXX validate p is still the highest prio task ++ if (task_rq(p) == rq) { ++ deactivate_task(rq, p, 0); ++ set_task_cpu(p, lowest_rq->cpu); ++ activate_task(lowest_rq, p, 0); ++ resched_curr(lowest_rq); ++ } ++ ++ double_unlock_balance(rq, lowest_rq); ++ ++out_unlock: ++ rq->push_busy = false; ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock_irq(&p->pi_lock); ++ ++ put_task_struct(p); ++ return 0; ++} ++ + /* + * sched_class::set_cpus_allowed must do the below, but is not required to + * actually call this function. +@@ -2066,6 +2104,14 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag + + /* Can the task run on the task's current CPU? If so, we're done */ + if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) { ++ struct task_struct *push_task = NULL; ++ ++ if ((flags & SCA_MIGRATE_ENABLE) && ++ (p->migration_flags & MDF_PUSH) && !rq->push_busy) { ++ rq->push_busy = true; ++ push_task = get_task_struct(p); ++ } ++ + pending = p->migration_pending; + if (pending) { + refcount_inc(&pending->refs); +@@ -2074,6 +2120,11 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag + } + task_rq_unlock(rq, p, rf); + ++ if (push_task) { ++ stop_one_cpu_nowait(rq->cpu, push_cpu_stop, ++ p, &rq->push_work); ++ } ++ + if (complete) + goto do_complete; + +@@ -2110,6 +2161,7 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag + if (flags & SCA_MIGRATE_ENABLE) { + + refcount_inc(&pending->refs); /* pending->{arg,stop_work} */ ++ p->migration_flags &= ~MDF_PUSH; + task_rq_unlock(rq, p, rf); + + pending->arg = (struct migration_arg) { +@@ -2728,11 +2780,6 @@ static inline int __set_cpus_allowed_ptr(struct task_struct *p, + + static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { } + +-static inline bool is_migration_disabled(struct task_struct *p) +-{ +- return false; +-} +- + static inline bool rq_has_pinned_tasks(struct rq *rq) + { + return false; +diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c +index 94977a6ced8b..7cf3248894a9 100644 +--- a/kernel/sched/deadline.c ++++ b/kernel/sched/deadline.c +@@ -2139,6 +2139,9 @@ static int push_dl_task(struct rq *rq) + return 0; + + retry: ++ if (is_migration_disabled(next_task)) ++ return 0; ++ + if (WARN_ON(next_task == rq->curr)) + return 0; + +@@ -2216,7 +2219,7 @@ static void push_dl_tasks(struct rq *rq) + static void pull_dl_task(struct rq *this_rq) + { + int this_cpu = this_rq->cpu, cpu; +- struct task_struct *p; ++ struct task_struct *p, *push_task; + bool resched = false; + struct rq *src_rq; + u64 dmin = LONG_MAX; +@@ -2246,6 +2249,7 @@ static void pull_dl_task(struct rq *this_rq) + continue; + + /* Might drop this_rq->lock */ ++ push_task = NULL; + double_lock_balance(this_rq, src_rq); + + /* +@@ -2277,17 +2281,27 @@ static void pull_dl_task(struct rq *this_rq) + src_rq->curr->dl.deadline)) + goto skip; + +- resched = true; +- +- deactivate_task(src_rq, p, 0); +- set_task_cpu(p, this_cpu); +- activate_task(this_rq, p, 0); +- dmin = p->dl.deadline; ++ if (is_migration_disabled(p)) { ++ push_task = get_push_task(src_rq); ++ } else { ++ deactivate_task(src_rq, p, 0); ++ set_task_cpu(p, this_cpu); ++ activate_task(this_rq, p, 0); ++ dmin = p->dl.deadline; ++ resched = true; ++ } + + /* Is there any other task even earlier? */ + } + skip: + double_unlock_balance(this_rq, src_rq); ++ ++ if (push_task) { ++ raw_spin_unlock(&this_rq->lock); ++ stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop, ++ push_task, &src_rq->push_work); ++ raw_spin_lock(&this_rq->lock); ++ } + } + + if (resched) +@@ -2548,6 +2562,7 @@ const struct sched_class dl_sched_class + .rq_online = rq_online_dl, + .rq_offline = rq_offline_dl, + .task_woken = task_woken_dl, ++ .find_lock_rq = find_lock_later_rq, + #endif + + .task_tick = task_tick_dl, +diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c +index f91339c0dee8..c25e35f41555 100644 +--- a/kernel/sched/rt.c ++++ b/kernel/sched/rt.c +@@ -1873,7 +1873,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq) + * running task can migrate over to a CPU that is running a task + * of lesser priority. + */ +-static int push_rt_task(struct rq *rq) ++static int push_rt_task(struct rq *rq, bool pull) + { + struct task_struct *next_task; + struct rq *lowest_rq; +@@ -1887,6 +1887,34 @@ static int push_rt_task(struct rq *rq) + return 0; + + retry: ++ if (is_migration_disabled(next_task)) { ++ struct task_struct *push_task = NULL; ++ int cpu; ++ ++ if (!pull || rq->push_busy) ++ return 0; ++ ++ cpu = find_lowest_rq(rq->curr); ++ if (cpu == -1 || cpu == rq->cpu) ++ return 0; ++ ++ /* ++ * Given we found a CPU with lower priority than @next_task, ++ * therefore it should be running. However we cannot migrate it ++ * to this other CPU, instead attempt to push the current ++ * running task on this CPU away. ++ */ ++ push_task = get_push_task(rq); ++ if (push_task) { ++ raw_spin_unlock(&rq->lock); ++ stop_one_cpu_nowait(rq->cpu, push_cpu_stop, ++ push_task, &rq->push_work); ++ raw_spin_lock(&rq->lock); ++ } ++ ++ return 0; ++ } ++ + if (WARN_ON(next_task == rq->curr)) + return 0; + +@@ -1941,12 +1969,10 @@ static int push_rt_task(struct rq *rq) + deactivate_task(rq, next_task, 0); + set_task_cpu(next_task, lowest_rq->cpu); + activate_task(lowest_rq, next_task, 0); +- ret = 1; +- + resched_curr(lowest_rq); ++ ret = 1; + + double_unlock_balance(rq, lowest_rq); +- + out: + put_task_struct(next_task); + +@@ -1956,7 +1982,7 @@ static int push_rt_task(struct rq *rq) + static void push_rt_tasks(struct rq *rq) + { + /* push_rt_task will return true if it moved an RT */ +- while (push_rt_task(rq)) ++ while (push_rt_task(rq, false)) + ; + } + +@@ -2109,7 +2135,8 @@ void rto_push_irq_work_func(struct irq_work *work) + */ + if (has_pushable_tasks(rq)) { + raw_spin_lock(&rq->lock); +- push_rt_tasks(rq); ++ while (push_rt_task(rq, true)) ++ ; + raw_spin_unlock(&rq->lock); + } + +@@ -2134,7 +2161,7 @@ static void pull_rt_task(struct rq *this_rq) + { + int this_cpu = this_rq->cpu, cpu; + bool resched = false; +- struct task_struct *p; ++ struct task_struct *p, *push_task; + struct rq *src_rq; + int rt_overload_count = rt_overloaded(this_rq); + +@@ -2181,6 +2208,7 @@ static void pull_rt_task(struct rq *this_rq) + * double_lock_balance, and another CPU could + * alter this_rq + */ ++ push_task = NULL; + double_lock_balance(this_rq, src_rq); + + /* +@@ -2208,11 +2236,14 @@ static void pull_rt_task(struct rq *this_rq) + if (p->prio < src_rq->curr->prio) + goto skip; + +- resched = true; +- +- deactivate_task(src_rq, p, 0); +- set_task_cpu(p, this_cpu); +- activate_task(this_rq, p, 0); ++ if (is_migration_disabled(p)) { ++ push_task = get_push_task(src_rq); ++ } else { ++ deactivate_task(src_rq, p, 0); ++ set_task_cpu(p, this_cpu); ++ activate_task(this_rq, p, 0); ++ resched = true; ++ } + /* + * We continue with the search, just in + * case there's an even higher prio task +@@ -2222,6 +2253,13 @@ static void pull_rt_task(struct rq *this_rq) + } + skip: + double_unlock_balance(this_rq, src_rq); ++ ++ if (push_task) { ++ raw_spin_unlock(&this_rq->lock); ++ stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop, ++ push_task, &src_rq->push_work); ++ raw_spin_lock(&this_rq->lock); ++ } + } + + if (resched) +@@ -2470,6 +2508,7 @@ const struct sched_class rt_sched_class + .rq_offline = rq_offline_rt, + .task_woken = task_woken_rt, + .switched_from = switched_from_rt, ++ .find_lock_rq = find_lock_lowest_rq, + #endif + + .task_tick = task_tick_rt, +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 8237c9ab2bb8..69ef7cac3d29 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -1061,6 +1061,8 @@ struct rq { + #if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP) + unsigned int nr_pinned; + #endif ++ unsigned int push_busy; ++ struct cpu_stop_work push_work; + }; + + #ifdef CONFIG_FAIR_GROUP_SCHED +@@ -1088,6 +1090,16 @@ static inline int cpu_of(struct rq *rq) + #endif + } + ++#define MDF_PUSH 0x01 ++ ++static inline bool is_migration_disabled(struct task_struct *p) ++{ ++#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT) ++ return p->migration_disabled; ++#else ++ return false; ++#endif ++} + + #ifdef CONFIG_SCHED_SMT + extern void __update_idle_core(struct rq *rq); +@@ -1827,6 +1839,8 @@ struct sched_class { + + void (*rq_online)(struct rq *rq); + void (*rq_offline)(struct rq *rq); ++ ++ struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq); + #endif + + void (*task_tick)(struct rq *rq, struct task_struct *p, int queued); +@@ -1922,6 +1936,24 @@ extern void trigger_load_balance(struct rq *rq); + + extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags); + ++static inline struct task_struct *get_push_task(struct rq *rq) ++{ ++ struct task_struct *p = rq->curr; ++ ++ lockdep_assert_held(&rq->lock); ++ ++ if (rq->push_busy) ++ return NULL; ++ ++ if (p->nr_cpus_allowed == 1) ++ return NULL; ++ ++ rq->push_busy = true; ++ return get_task_struct(p); ++} ++ ++extern int push_cpu_stop(void *arg); ++ + #endif + + #ifdef CONFIG_CPU_IDLE +-- +2.43.0 + diff --git a/debian/patches-rt/0017-sched-proc-Print-accurate-cpumask-vs-migrate_disable.patch b/debian/patches-rt/0017-sched-proc-Print-accurate-cpumask-vs-migrate_disable.patch new file mode 100644 index 000000000..44d87463d --- /dev/null +++ b/debian/patches-rt/0017-sched-proc-Print-accurate-cpumask-vs-migrate_disable.patch @@ -0,0 +1,35 @@ +From e8ed7f739c2f8113927b0b3489bc357c36e27040 Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra <peterz@infradead.org> +Date: Fri, 23 Oct 2020 12:12:14 +0200 +Subject: [PATCH 017/323] sched/proc: Print accurate cpumask vs + migrate_disable() +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Ensure /proc/*/status doesn't print 'random' cpumasks due to +migrate_disable(). + +Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + fs/proc/array.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/fs/proc/array.c b/fs/proc/array.c +index 18a4588c35be..decaa7768044 100644 +--- a/fs/proc/array.c ++++ b/fs/proc/array.c +@@ -384,9 +384,9 @@ static inline void task_context_switch_counts(struct seq_file *m, + static void task_cpus_allowed(struct seq_file *m, struct task_struct *task) + { + seq_printf(m, "Cpus_allowed:\t%*pb\n", +- cpumask_pr_args(task->cpus_ptr)); ++ cpumask_pr_args(&task->cpus_mask)); + seq_printf(m, "Cpus_allowed_list:\t%*pbl\n", +- cpumask_pr_args(task->cpus_ptr)); ++ cpumask_pr_args(&task->cpus_mask)); + } + + static inline void task_core_dumping(struct seq_file *m, struct mm_struct *mm) +-- +2.43.0 + diff --git a/debian/patches-rt/0018-sched-Add-migrate_disable-tracepoints.patch b/debian/patches-rt/0018-sched-Add-migrate_disable-tracepoints.patch new file mode 100644 index 000000000..0f315ce9d --- /dev/null +++ b/debian/patches-rt/0018-sched-Add-migrate_disable-tracepoints.patch @@ -0,0 +1,110 @@ +From 8b4906578ce683b9bce7df984b4179519152345f Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra <peterz@infradead.org> +Date: Fri, 23 Oct 2020 12:12:15 +0200 +Subject: [PATCH 018/323] sched: Add migrate_disable() tracepoints +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +XXX write a tracer: + + - 'migirate_disable() -> migrate_enable()' time in task_sched_runtime() + - 'migrate_pull -> sched-in' time in task_sched_runtime() + +The first will give worst case for the second, which is the actual +interference experienced by the task to due migration constraints of +migrate_disable(). + +Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/trace/events/sched.h | 12 ++++++++++++ + kernel/sched/core.c | 4 ++++ + kernel/sched/deadline.c | 1 + + kernel/sched/rt.c | 8 +++++++- + 4 files changed, 24 insertions(+), 1 deletion(-) + +diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h +index c96a4337afe6..e48f584abf5f 100644 +--- a/include/trace/events/sched.h ++++ b/include/trace/events/sched.h +@@ -650,6 +650,18 @@ DECLARE_TRACE(sched_update_nr_running_tp, + TP_PROTO(struct rq *rq, int change), + TP_ARGS(rq, change)); + ++DECLARE_TRACE(sched_migrate_disable_tp, ++ TP_PROTO(struct task_struct *p), ++ TP_ARGS(p)); ++ ++DECLARE_TRACE(sched_migrate_enable_tp, ++ TP_PROTO(struct task_struct *p), ++ TP_ARGS(p)); ++ ++DECLARE_TRACE(sched_migrate_pull_tp, ++ TP_PROTO(struct task_struct *p), ++ TP_ARGS(p)); ++ + #endif /* _TRACE_SCHED_H */ + + /* This part must be outside protection */ +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 452fc1dfb143..b5f35b512577 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -1742,6 +1742,8 @@ void migrate_disable(void) + return; + } + ++ trace_sched_migrate_disable_tp(p); ++ + preempt_disable(); + this_rq()->nr_pinned++; + p->migration_disabled = 1; +@@ -1774,6 +1776,8 @@ void migrate_enable(void) + p->migration_disabled = 0; + this_rq()->nr_pinned--; + preempt_enable(); ++ ++ trace_sched_migrate_enable_tp(p); + } + EXPORT_SYMBOL_GPL(migrate_enable); + +diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c +index 7cf3248894a9..fcf546cd2eac 100644 +--- a/kernel/sched/deadline.c ++++ b/kernel/sched/deadline.c +@@ -2282,6 +2282,7 @@ static void pull_dl_task(struct rq *this_rq) + goto skip; + + if (is_migration_disabled(p)) { ++ trace_sched_migrate_pull_tp(p); + push_task = get_push_task(src_rq); + } else { + deactivate_task(src_rq, p, 0); +diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c +index c25e35f41555..c5038d680c2c 100644 +--- a/kernel/sched/rt.c ++++ b/kernel/sched/rt.c +@@ -1891,7 +1891,12 @@ static int push_rt_task(struct rq *rq, bool pull) + struct task_struct *push_task = NULL; + int cpu; + +- if (!pull || rq->push_busy) ++ if (!pull) ++ return 0; ++ ++ trace_sched_migrate_pull_tp(next_task); ++ ++ if (rq->push_busy) + return 0; + + cpu = find_lowest_rq(rq->curr); +@@ -2237,6 +2242,7 @@ static void pull_rt_task(struct rq *this_rq) + goto skip; + + if (is_migration_disabled(p)) { ++ trace_sched_migrate_pull_tp(p); + push_task = get_push_task(src_rq); + } else { + deactivate_task(src_rq, p, 0); +-- +2.43.0 + diff --git a/debian/patches-rt/0019-sched-Deny-self-issued-__set_cpus_allowed_ptr-when-m.patch b/debian/patches-rt/0019-sched-Deny-self-issued-__set_cpus_allowed_ptr-when-m.patch new file mode 100644 index 000000000..ca7d5923b --- /dev/null +++ b/debian/patches-rt/0019-sched-Deny-self-issued-__set_cpus_allowed_ptr-when-m.patch @@ -0,0 +1,46 @@ +From 5857482efa6b272402feb4b063c4273e70dd67fb Mon Sep 17 00:00:00 2001 +From: Valentin Schneider <valentin.schneider@arm.com> +Date: Fri, 23 Oct 2020 12:12:16 +0200 +Subject: [PATCH 019/323] sched: Deny self-issued __set_cpus_allowed_ptr() when + migrate_disable() +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + + migrate_disable(); + set_cpus_allowed_ptr(current, {something excluding task_cpu(current)}); + affine_move_task(); <-- never returns + +Signed-off-by: Valentin Schneider <valentin.schneider@arm.com> +Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> +Link: https://lkml.kernel.org/r/20201013140116.26651-1-valentin.schneider@arm.com +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + kernel/sched/core.c | 13 +++++++++++-- + 1 file changed, 11 insertions(+), 2 deletions(-) + +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index b5f35b512577..3b7bb01eecc0 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -2254,8 +2254,17 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, + goto out; + } + +- if (!(flags & SCA_MIGRATE_ENABLE) && cpumask_equal(&p->cpus_mask, new_mask)) +- goto out; ++ if (!(flags & SCA_MIGRATE_ENABLE)) { ++ if (cpumask_equal(&p->cpus_mask, new_mask)) ++ goto out; ++ ++ if (WARN_ON_ONCE(p == current && ++ is_migration_disabled(p) && ++ !cpumask_test_cpu(task_cpu(p), new_mask))) { ++ ret = -EBUSY; ++ goto out; ++ } ++ } + + /* + * Picking a ~random cpu helps in cases where we are changing affinity +-- +2.43.0 + diff --git a/debian/patches-rt/0020-sched-Comment-affine_move_task.patch b/debian/patches-rt/0020-sched-Comment-affine_move_task.patch new file mode 100644 index 000000000..ce16fe3bc --- /dev/null +++ b/debian/patches-rt/0020-sched-Comment-affine_move_task.patch @@ -0,0 +1,130 @@ +From ef79ed54f47d3ce342809e4914f4d9cf400f1ae0 Mon Sep 17 00:00:00 2001 +From: Valentin Schneider <valentin.schneider@arm.com> +Date: Fri, 23 Oct 2020 12:12:17 +0200 +Subject: [PATCH 020/323] sched: Comment affine_move_task() +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Signed-off-by: Valentin Schneider <valentin.schneider@arm.com> +Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> +Link: https://lkml.kernel.org/r/20201013140116.26651-2-valentin.schneider@arm.com +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + kernel/sched/core.c | 81 +++++++++++++++++++++++++++++++++++++++++++-- + 1 file changed, 79 insertions(+), 2 deletions(-) + +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 3b7bb01eecc0..565d8011c832 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -2094,7 +2094,75 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) + } + + /* +- * This function is wildly self concurrent, consider at least 3 times. ++ * This function is wildly self concurrent; here be dragons. ++ * ++ * ++ * When given a valid mask, __set_cpus_allowed_ptr() must block until the ++ * designated task is enqueued on an allowed CPU. If that task is currently ++ * running, we have to kick it out using the CPU stopper. ++ * ++ * Migrate-Disable comes along and tramples all over our nice sandcastle. ++ * Consider: ++ * ++ * Initial conditions: P0->cpus_mask = [0, 1] ++ * ++ * P0@CPU0 P1 ++ * ++ * migrate_disable(); ++ * <preempted> ++ * set_cpus_allowed_ptr(P0, [1]); ++ * ++ * P1 *cannot* return from this set_cpus_allowed_ptr() call until P0 executes ++ * its outermost migrate_enable() (i.e. it exits its Migrate-Disable region). ++ * This means we need the following scheme: ++ * ++ * P0@CPU0 P1 ++ * ++ * migrate_disable(); ++ * <preempted> ++ * set_cpus_allowed_ptr(P0, [1]); ++ * <blocks> ++ * <resumes> ++ * migrate_enable(); ++ * __set_cpus_allowed_ptr(); ++ * <wakes local stopper> ++ * `--> <woken on migration completion> ++ * ++ * Now the fun stuff: there may be several P1-like tasks, i.e. multiple ++ * concurrent set_cpus_allowed_ptr(P0, [*]) calls. CPU affinity changes of any ++ * task p are serialized by p->pi_lock, which we can leverage: the one that ++ * should come into effect at the end of the Migrate-Disable region is the last ++ * one. This means we only need to track a single cpumask (i.e. p->cpus_mask), ++ * but we still need to properly signal those waiting tasks at the appropriate ++ * moment. ++ * ++ * This is implemented using struct set_affinity_pending. The first ++ * __set_cpus_allowed_ptr() caller within a given Migrate-Disable region will ++ * setup an instance of that struct and install it on the targeted task_struct. ++ * Any and all further callers will reuse that instance. Those then wait for ++ * a completion signaled at the tail of the CPU stopper callback (1), triggered ++ * on the end of the Migrate-Disable region (i.e. outermost migrate_enable()). ++ * ++ * ++ * (1) In the cases covered above. There is one more where the completion is ++ * signaled within affine_move_task() itself: when a subsequent affinity request ++ * cancels the need for an active migration. Consider: ++ * ++ * Initial conditions: P0->cpus_mask = [0, 1] ++ * ++ * P0@CPU0 P1 P2 ++ * ++ * migrate_disable(); ++ * <preempted> ++ * set_cpus_allowed_ptr(P0, [1]); ++ * <blocks> ++ * set_cpus_allowed_ptr(P0, [0, 1]); ++ * <signal completion> ++ * <awakes> ++ * ++ * Note that the above is safe vs a concurrent migrate_enable(), as any ++ * pending affinity completion is preceded an uninstallion of ++ * p->migration_pending done with p->pi_lock held. + */ + static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flags *rf, + int dest_cpu, unsigned int flags) +@@ -2138,6 +2206,7 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag + if (!(flags & SCA_MIGRATE_ENABLE)) { + /* serialized by p->pi_lock */ + if (!p->migration_pending) { ++ /* Install the request */ + refcount_set(&my_pending.refs, 1); + init_completion(&my_pending.done); + p->migration_pending = &my_pending; +@@ -2181,7 +2250,11 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag + } + + if (task_running(rq, p) || p->state == TASK_WAKING) { +- ++ /* ++ * Lessen races (and headaches) by delegating ++ * is_migration_disabled(p) checks to the stopper, which will ++ * run on the same CPU as said p. ++ */ + task_rq_unlock(rq, p, rf); + stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); + +@@ -2206,6 +2279,10 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag + if (refcount_dec_and_test(&pending->refs)) + wake_up_var(&pending->refs); + ++ /* ++ * Block the original owner of &pending until all subsequent callers ++ * have seen the completion and decremented the refcount ++ */ + wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs)); + + return 0; +-- +2.43.0 + diff --git a/debian/patches-rt/0021-sched-Unlock-the-rq-in-affine_move_task-error-path.patch b/debian/patches-rt/0021-sched-Unlock-the-rq-in-affine_move_task-error-path.patch new file mode 100644 index 000000000..bc78d7729 --- /dev/null +++ b/debian/patches-rt/0021-sched-Unlock-the-rq-in-affine_move_task-error-path.patch @@ -0,0 +1,34 @@ +From 9a34ae4ec3b4af5cc98c8405ea590096a1380af9 Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Mon, 9 Nov 2020 15:54:03 +0100 +Subject: [PATCH 021/323] sched: Unlock the rq in affine_move_task() error path +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Unlock the rq if returned early in the error path. + +Reported-by: Joe Korty <joe.korty@concurrent-rt.com> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Link: https://lkml.kernel.org/r/20201106203921.GA48461@zipoli.concurrent-rt.com +--- + kernel/sched/core.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 565d8011c832..d12d91510789 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -2228,8 +2228,10 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag + * + * Either way, we really should have a @pending here. + */ +- if (WARN_ON_ONCE(!pending)) ++ if (WARN_ON_ONCE(!pending)) { ++ task_rq_unlock(rq, p, rf); + return -EINVAL; ++ } + + if (flags & SCA_MIGRATE_ENABLE) { + +-- +2.43.0 + diff --git a/debian/patches-rt/0022-sched-Fix-migration_cpu_stop-WARN.patch b/debian/patches-rt/0022-sched-Fix-migration_cpu_stop-WARN.patch new file mode 100644 index 000000000..6f1a0cb21 --- /dev/null +++ b/debian/patches-rt/0022-sched-Fix-migration_cpu_stop-WARN.patch @@ -0,0 +1,47 @@ +From 09d09da2b8c97fd09f50deb278f9d4a25b99ad55 Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra <peterz@infradead.org> +Date: Tue, 17 Nov 2020 12:14:51 +0100 +Subject: [PATCH 022/323] sched: Fix migration_cpu_stop() WARN +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Oleksandr reported hitting the WARN in the 'task_rq(p) != rq' branch +of migration_cpu_stop(). Valentin noted that using cpu_of(rq) in that +case is just plain wrong to begin with, since per the earlier branch +that isn't the actual CPU of the task. + +Replace both instances of is_cpu_allowed() by a direct p->cpus_mask +test using task_cpu(). + +Reported-by: Oleksandr Natalenko <oleksandr@natalenko.name> +Debugged-by: Valentin Schneider <valentin.schneider@arm.com> +Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + kernel/sched/core.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index d12d91510789..7d67a0e03f9a 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -1929,7 +1929,7 @@ static int migration_cpu_stop(void *data) + * and we should be valid again. Nothing to do. + */ + if (!pending) { +- WARN_ON_ONCE(!is_cpu_allowed(p, cpu_of(rq))); ++ WARN_ON_ONCE(!cpumask_test_cpu(task_cpu(p), &p->cpus_mask)); + goto out; + } + +@@ -1957,7 +1957,7 @@ static int migration_cpu_stop(void *data) + * valid again. Nothing to do. + */ + if (!pending) { +- WARN_ON_ONCE(!is_cpu_allowed(p, cpu_of(rq))); ++ WARN_ON_ONCE(!cpumask_test_cpu(task_cpu(p), &p->cpus_mask)); + goto out; + } + +-- +2.43.0 + diff --git a/debian/patches-rt/0023-sched-core-Add-missing-completion-for-affine_move_ta.patch b/debian/patches-rt/0023-sched-core-Add-missing-completion-for-affine_move_ta.patch new file mode 100644 index 000000000..19fcddc66 --- /dev/null +++ b/debian/patches-rt/0023-sched-core-Add-missing-completion-for-affine_move_ta.patch @@ -0,0 +1,79 @@ +From 5188e4c01f37c8c179cf88261297d546f5d2502e Mon Sep 17 00:00:00 2001 +From: Valentin Schneider <valentin.schneider@arm.com> +Date: Fri, 13 Nov 2020 11:24:14 +0000 +Subject: [PATCH 023/323] sched/core: Add missing completion for + affine_move_task() waiters +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Qian reported that some fuzzer issuing sched_setaffinity() ends up stuck on +a wait_for_completion(). The problematic pattern seems to be: + + affine_move_task() + // task_running() case + stop_one_cpu(); + wait_for_completion(&pending->done); + +Combined with, on the stopper side: + + migration_cpu_stop() + // Task moved between unlocks and scheduling the stopper + task_rq(p) != rq && + // task_running() case + dest_cpu >= 0 + + => no complete_all() + +This can happen with both PREEMPT and !PREEMPT, although !PREEMPT should +be more likely to see this given the targeted task has a much bigger window +to block and be woken up elsewhere before the stopper runs. + +Make migration_cpu_stop() always look at pending affinity requests; signal +their completion if the stopper hits a rq mismatch but the task is +still within its allowed mask. When Migrate-Disable isn't involved, this +matches the previous set_cpus_allowed_ptr() vs migration_cpu_stop() +behaviour. + +Fixes: 6d337eab041d ("sched: Fix migrate_disable() vs set_cpus_allowed_ptr()") +Reported-by: Qian Cai <cai@redhat.com> +Signed-off-by: Valentin Schneider <valentin.schneider@arm.com> +Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> +Link: https://lore.kernel.org/lkml/8b62fd1ad1b18def27f18e2ee2df3ff5b36d0762.camel@redhat.com +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + kernel/sched/core.c | 13 ++++++++++++- + 1 file changed, 12 insertions(+), 1 deletion(-) + +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 7d67a0e03f9a..c1e52319669d 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -1941,7 +1941,7 @@ static int migration_cpu_stop(void *data) + else + p->wake_cpu = dest_cpu; + +- } else if (dest_cpu < 0) { ++ } else if (dest_cpu < 0 || pending) { + /* + * This happens when we get migrated between migrate_enable()'s + * preempt_enable() and scheduling the stopper task. At that +@@ -1951,6 +1951,17 @@ static int migration_cpu_stop(void *data) + * more likely. + */ + ++ /* ++ * The task moved before the stopper got to run. We're holding ++ * ->pi_lock, so the allowed mask is stable - if it got ++ * somewhere allowed, we're done. ++ */ ++ if (pending && cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) { ++ p->migration_pending = NULL; ++ complete = true; ++ goto out; ++ } ++ + /* + * When this was migrate_enable() but we no longer have an + * @pending, a concurrent SCA 'fixed' things and we should be +-- +2.43.0 + diff --git a/debian/patches-rt/0024-mm-highmem-Un-EXPORT-__kmap_atomic_idx.patch b/debian/patches-rt/0024-mm-highmem-Un-EXPORT-__kmap_atomic_idx.patch new file mode 100644 index 000000000..ba252d03e --- /dev/null +++ b/debian/patches-rt/0024-mm-highmem-Un-EXPORT-__kmap_atomic_idx.patch @@ -0,0 +1,33 @@ +From 44027cc10df8025d820ea864076dae75fb26626d Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Tue, 3 Nov 2020 10:27:13 +0100 +Subject: [PATCH 024/323] mm/highmem: Un-EXPORT __kmap_atomic_idx() +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Nothing in modules can use that. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Christoph Hellwig <hch@lst.de> +Cc: Andrew Morton <akpm@linux-foundation.org> +Cc: linux-mm@kvack.org +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + mm/highmem.c | 2 -- + 1 file changed, 2 deletions(-) + +diff --git a/mm/highmem.c b/mm/highmem.c +index 1352a27951e3..6abfd762eee7 100644 +--- a/mm/highmem.c ++++ b/mm/highmem.c +@@ -108,8 +108,6 @@ static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color) + atomic_long_t _totalhigh_pages __read_mostly; + EXPORT_SYMBOL(_totalhigh_pages); + +-EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx); +- + unsigned int nr_free_highpages (void) + { + struct zone *zone; +-- +2.43.0 + diff --git a/debian/patches-rt/0025-highmem-Remove-unused-functions.patch b/debian/patches-rt/0025-highmem-Remove-unused-functions.patch new file mode 100644 index 000000000..55deed269 --- /dev/null +++ b/debian/patches-rt/0025-highmem-Remove-unused-functions.patch @@ -0,0 +1,43 @@ +From 2060c9ea726ebd649d8d2e596fa2acebef644cd8 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Tue, 3 Nov 2020 10:27:14 +0100 +Subject: [PATCH 025/323] highmem: Remove unused functions +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Nothing uses totalhigh_pages_dec() and totalhigh_pages_set(). + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/linux/highmem.h | 10 ---------- + 1 file changed, 10 deletions(-) + +diff --git a/include/linux/highmem.h b/include/linux/highmem.h +index b25df1f8d48d..3297bfca78ed 100644 +--- a/include/linux/highmem.h ++++ b/include/linux/highmem.h +@@ -104,21 +104,11 @@ static inline void totalhigh_pages_inc(void) + atomic_long_inc(&_totalhigh_pages); + } + +-static inline void totalhigh_pages_dec(void) +-{ +- atomic_long_dec(&_totalhigh_pages); +-} +- + static inline void totalhigh_pages_add(long count) + { + atomic_long_add(count, &_totalhigh_pages); + } + +-static inline void totalhigh_pages_set(long val) +-{ +- atomic_long_set(&_totalhigh_pages, val); +-} +- + void kmap_flush_unused(void); + + struct page *kmap_to_page(void *addr); +-- +2.43.0 + diff --git a/debian/patches-rt/0026-fs-Remove-asm-kmap_types.h-includes.patch b/debian/patches-rt/0026-fs-Remove-asm-kmap_types.h-includes.patch new file mode 100644 index 000000000..3cbd4847e --- /dev/null +++ b/debian/patches-rt/0026-fs-Remove-asm-kmap_types.h-includes.patch @@ -0,0 +1,50 @@ +From 280d9322736bea3870ae507592e1d2880bff8bd9 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Tue, 3 Nov 2020 10:27:15 +0100 +Subject: [PATCH 026/323] fs: Remove asm/kmap_types.h includes +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Historical leftovers from the time where kmap() had fixed slots. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Alexander Viro <viro@zeniv.linux.org.uk> +Cc: Benjamin LaHaise <bcrl@kvack.org> +Cc: linux-fsdevel@vger.kernel.org +Cc: linux-aio@kvack.org +Cc: Chris Mason <clm@fb.com> +Cc: Josef Bacik <josef@toxicpanda.com> +Cc: David Sterba <dsterba@suse.com> +Cc: linux-btrfs@vger.kernel.org +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + fs/aio.c | 1 - + fs/btrfs/ctree.h | 1 - + 2 files changed, 2 deletions(-) + +diff --git a/fs/aio.c b/fs/aio.c +index 5934ea84b499..c90e045a37bc 100644 +--- a/fs/aio.c ++++ b/fs/aio.c +@@ -43,7 +43,6 @@ + #include <linux/mount.h> + #include <linux/pseudo_fs.h> + +-#include <asm/kmap_types.h> + #include <linux/uaccess.h> + #include <linux/nospec.h> + +diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h +index 67831868ef0d..17c920aa6bc2 100644 +--- a/fs/btrfs/ctree.h ++++ b/fs/btrfs/ctree.h +@@ -17,7 +17,6 @@ + #include <linux/wait.h> + #include <linux/slab.h> + #include <trace/events/btrfs.h> +-#include <asm/kmap_types.h> + #include <asm/unaligned.h> + #include <linux/pagemap.h> + #include <linux/btrfs.h> +-- +2.43.0 + diff --git a/debian/patches-rt/0027-sh-highmem-Remove-all-traces-of-unused-cruft.patch b/debian/patches-rt/0027-sh-highmem-Remove-all-traces-of-unused-cruft.patch new file mode 100644 index 000000000..bcf3a9b1f --- /dev/null +++ b/debian/patches-rt/0027-sh-highmem-Remove-all-traces-of-unused-cruft.patch @@ -0,0 +1,94 @@ +From 61e1ac8c3bd29dec41ae2f631bb142e26b790de2 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Tue, 3 Nov 2020 10:27:16 +0100 +Subject: [PATCH 027/323] sh/highmem: Remove all traces of unused cruft +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +For whatever reasons SH has highmem bits all over the place but does +not enable it via Kconfig. Remove the bitrot. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + arch/sh/include/asm/fixmap.h | 8 -------- + arch/sh/include/asm/kmap_types.h | 15 --------------- + arch/sh/mm/init.c | 8 -------- + 3 files changed, 31 deletions(-) + delete mode 100644 arch/sh/include/asm/kmap_types.h + +diff --git a/arch/sh/include/asm/fixmap.h b/arch/sh/include/asm/fixmap.h +index f38adc189b83..b07fbc7f7bc6 100644 +--- a/arch/sh/include/asm/fixmap.h ++++ b/arch/sh/include/asm/fixmap.h +@@ -13,9 +13,6 @@ + #include <linux/kernel.h> + #include <linux/threads.h> + #include <asm/page.h> +-#ifdef CONFIG_HIGHMEM +-#include <asm/kmap_types.h> +-#endif + + /* + * Here we define all the compile-time 'special' virtual +@@ -53,11 +50,6 @@ enum fixed_addresses { + FIX_CMAP_BEGIN, + FIX_CMAP_END = FIX_CMAP_BEGIN + (FIX_N_COLOURS * NR_CPUS) - 1, + +-#ifdef CONFIG_HIGHMEM +- FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */ +- FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_TYPE_NR * NR_CPUS) - 1, +-#endif +- + #ifdef CONFIG_IOREMAP_FIXED + /* + * FIX_IOREMAP entries are useful for mapping physical address +diff --git a/arch/sh/include/asm/kmap_types.h b/arch/sh/include/asm/kmap_types.h +deleted file mode 100644 +index b78107f923dd..000000000000 +--- a/arch/sh/include/asm/kmap_types.h ++++ /dev/null +@@ -1,15 +0,0 @@ +-/* SPDX-License-Identifier: GPL-2.0 */ +-#ifndef __SH_KMAP_TYPES_H +-#define __SH_KMAP_TYPES_H +- +-/* Dummy header just to define km_type. */ +- +-#ifdef CONFIG_DEBUG_HIGHMEM +-#define __WITH_KM_FENCE +-#endif +- +-#include <asm-generic/kmap_types.h> +- +-#undef __WITH_KM_FENCE +- +-#endif +diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c +index 3348e0c4d769..0db6919af8d3 100644 +--- a/arch/sh/mm/init.c ++++ b/arch/sh/mm/init.c +@@ -362,9 +362,6 @@ void __init mem_init(void) + mem_init_print_info(NULL); + pr_info("virtual kernel memory layout:\n" + " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n" +-#ifdef CONFIG_HIGHMEM +- " pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n" +-#endif + " vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n" + " lowmem : 0x%08lx - 0x%08lx (%4ld MB) (cached)\n" + #ifdef CONFIG_UNCACHED_MAPPING +@@ -376,11 +373,6 @@ void __init mem_init(void) + FIXADDR_START, FIXADDR_TOP, + (FIXADDR_TOP - FIXADDR_START) >> 10, + +-#ifdef CONFIG_HIGHMEM +- PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, +- (LAST_PKMAP*PAGE_SIZE) >> 10, +-#endif +- + (unsigned long)VMALLOC_START, VMALLOC_END, + (VMALLOC_END - VMALLOC_START) >> 20, + +-- +2.43.0 + diff --git a/debian/patches-rt/0028-asm-generic-Provide-kmap_size.h.patch b/debian/patches-rt/0028-asm-generic-Provide-kmap_size.h.patch new file mode 100644 index 000000000..bb5d5f717 --- /dev/null +++ b/debian/patches-rt/0028-asm-generic-Provide-kmap_size.h.patch @@ -0,0 +1,70 @@ +From 5ed7d8c8905e639e711f1a21e5f978fc21585f8c Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Tue, 3 Nov 2020 10:27:17 +0100 +Subject: [PATCH 028/323] asm-generic: Provide kmap_size.h +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +kmap_types.h is a misnomer because the old atomic MAP based array does not +exist anymore and the whole indirection of architectures including +kmap_types.h is inconinstent and does not allow to provide guard page +debugging for this misfeature. + +Add a common header file which defines the mapping stack size for all +architectures. Will be used when converting architectures over to a +generic kmap_local/atomic implementation. + +The array size is chosen with the following constraints in mind: + + - The deepest nest level in one context is 3 according to code + inspection. + + - The worst case nesting for the upcoming reemptible version would be: + + 2 maps in task context and a fault inside + 2 maps in the fault handler + 3 maps in softirq + 2 maps in interrupt + +So a total of 16 is sufficient and probably overestimated. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/asm-generic/Kbuild | 1 + + include/asm-generic/kmap_size.h | 12 ++++++++++++ + 2 files changed, 13 insertions(+) + create mode 100644 include/asm-generic/kmap_size.h + +diff --git a/include/asm-generic/Kbuild b/include/asm-generic/Kbuild +index d1300c6e0a47..3114a6da7e56 100644 +--- a/include/asm-generic/Kbuild ++++ b/include/asm-generic/Kbuild +@@ -31,6 +31,7 @@ mandatory-y += irq_regs.h + mandatory-y += irq_work.h + mandatory-y += kdebug.h + mandatory-y += kmap_types.h ++mandatory-y += kmap_size.h + mandatory-y += kprobes.h + mandatory-y += linkage.h + mandatory-y += local.h +diff --git a/include/asm-generic/kmap_size.h b/include/asm-generic/kmap_size.h +new file mode 100644 +index 000000000000..9d6c7786a645 +--- /dev/null ++++ b/include/asm-generic/kmap_size.h +@@ -0,0 +1,12 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _ASM_GENERIC_KMAP_SIZE_H ++#define _ASM_GENERIC_KMAP_SIZE_H ++ ++/* For debug this provides guard pages between the maps */ ++#ifdef CONFIG_DEBUG_HIGHMEM ++# define KM_MAX_IDX 33 ++#else ++# define KM_MAX_IDX 16 ++#endif ++ ++#endif +-- +2.43.0 + diff --git a/debian/patches-rt/0029-highmem-Provide-generic-variant-of-kmap_atomic.patch b/debian/patches-rt/0029-highmem-Provide-generic-variant-of-kmap_atomic.patch new file mode 100644 index 000000000..828708904 --- /dev/null +++ b/debian/patches-rt/0029-highmem-Provide-generic-variant-of-kmap_atomic.patch @@ -0,0 +1,346 @@ +From 4e1b14787f7a2c71a9347db23c402f5dbe2da206 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Tue, 3 Nov 2020 10:27:18 +0100 +Subject: [PATCH 029/323] highmem: Provide generic variant of kmap_atomic* +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +The kmap_atomic* interfaces in all architectures are pretty much the same +except for post map operations (flush) and pre- and post unmap operations. + +Provide a generic variant for that. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Andrew Morton <akpm@linux-foundation.org> +Cc: linux-mm@kvack.org +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/linux/highmem.h | 82 ++++++++++++++++++----- + mm/Kconfig | 3 + + mm/highmem.c | 144 +++++++++++++++++++++++++++++++++++++++- + 3 files changed, 211 insertions(+), 18 deletions(-) + +diff --git a/include/linux/highmem.h b/include/linux/highmem.h +index 3297bfca78ed..14d5b4020c8c 100644 +--- a/include/linux/highmem.h ++++ b/include/linux/highmem.h +@@ -31,9 +31,16 @@ static inline void invalidate_kernel_vmap_range(void *vaddr, int size) + + #include <asm/kmap_types.h> + ++/* ++ * Outside of CONFIG_HIGHMEM to support X86 32bit iomap_atomic() cruft. ++ */ ++#ifdef CONFIG_KMAP_LOCAL ++void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot); ++void *__kmap_local_page_prot(struct page *page, pgprot_t prot); ++void kunmap_local_indexed(void *vaddr); ++#endif ++ + #ifdef CONFIG_HIGHMEM +-extern void *kmap_atomic_high_prot(struct page *page, pgprot_t prot); +-extern void kunmap_atomic_high(void *kvaddr); + #include <asm/highmem.h> + + #ifndef ARCH_HAS_KMAP_FLUSH_TLB +@@ -81,6 +88,11 @@ static inline void kunmap(struct page *page) + * be used in IRQ contexts, so in some (very limited) cases we need + * it. + */ ++ ++#ifndef CONFIG_KMAP_LOCAL ++void *kmap_atomic_high_prot(struct page *page, pgprot_t prot); ++void kunmap_atomic_high(void *kvaddr); ++ + static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) + { + preempt_disable(); +@@ -89,7 +101,38 @@ static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) + return page_address(page); + return kmap_atomic_high_prot(page, prot); + } +-#define kmap_atomic(page) kmap_atomic_prot(page, kmap_prot) ++ ++static inline void __kunmap_atomic(void *vaddr) ++{ ++ kunmap_atomic_high(vaddr); ++} ++#else /* !CONFIG_KMAP_LOCAL */ ++ ++static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) ++{ ++ preempt_disable(); ++ pagefault_disable(); ++ return __kmap_local_page_prot(page, prot); ++} ++ ++static inline void *kmap_atomic_pfn(unsigned long pfn) ++{ ++ preempt_disable(); ++ pagefault_disable(); ++ return __kmap_local_pfn_prot(pfn, kmap_prot); ++} ++ ++static inline void __kunmap_atomic(void *addr) ++{ ++ kunmap_local_indexed(addr); ++} ++ ++#endif /* CONFIG_KMAP_LOCAL */ ++ ++static inline void *kmap_atomic(struct page *page) ++{ ++ return kmap_atomic_prot(page, kmap_prot); ++} + + /* declarations for linux/mm/highmem.c */ + unsigned int nr_free_highpages(void); +@@ -147,25 +190,33 @@ static inline void *kmap_atomic(struct page *page) + pagefault_disable(); + return page_address(page); + } +-#define kmap_atomic_prot(page, prot) kmap_atomic(page) + +-static inline void kunmap_atomic_high(void *addr) ++static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) ++{ ++ return kmap_atomic(page); ++} ++ ++static inline void *kmap_atomic_pfn(unsigned long pfn) ++{ ++ return kmap_atomic(pfn_to_page(pfn)); ++} ++ ++static inline void __kunmap_atomic(void *addr) + { + /* + * Mostly nothing to do in the CONFIG_HIGHMEM=n case as kunmap_atomic() +- * handles re-enabling faults + preemption ++ * handles re-enabling faults and preemption + */ + #ifdef ARCH_HAS_FLUSH_ON_KUNMAP + kunmap_flush_on_unmap(addr); + #endif + } + +-#define kmap_atomic_pfn(pfn) kmap_atomic(pfn_to_page(pfn)) +- + #define kmap_flush_unused() do {} while(0) + + #endif /* CONFIG_HIGHMEM */ + ++#if !defined(CONFIG_KMAP_LOCAL) + #if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32) + + DECLARE_PER_CPU(int, __kmap_atomic_idx); +@@ -196,22 +247,21 @@ static inline void kmap_atomic_idx_pop(void) + __this_cpu_dec(__kmap_atomic_idx); + #endif + } +- ++#endif + #endif + + /* + * Prevent people trying to call kunmap_atomic() as if it were kunmap() + * kunmap_atomic() should get the return value of kmap_atomic, not the page. + */ +-#define kunmap_atomic(addr) \ +-do { \ +- BUILD_BUG_ON(__same_type((addr), struct page *)); \ +- kunmap_atomic_high(addr); \ +- pagefault_enable(); \ +- preempt_enable(); \ ++#define kunmap_atomic(__addr) \ ++do { \ ++ BUILD_BUG_ON(__same_type((__addr), struct page *)); \ ++ __kunmap_atomic(__addr); \ ++ pagefault_enable(); \ ++ preempt_enable(); \ + } while (0) + +- + /* when CONFIG_HIGHMEM is not set these will be plain clear/copy_page */ + #ifndef clear_user_highpage + static inline void clear_user_highpage(struct page *page, unsigned long vaddr) +diff --git a/mm/Kconfig b/mm/Kconfig +index 390165ffbb0f..8c49d09da214 100644 +--- a/mm/Kconfig ++++ b/mm/Kconfig +@@ -859,4 +859,7 @@ config ARCH_HAS_HUGEPD + config MAPPING_DIRTY_HELPERS + bool + ++config KMAP_LOCAL ++ bool ++ + endmenu +diff --git a/mm/highmem.c b/mm/highmem.c +index 6abfd762eee7..bb4ce13ee7e7 100644 +--- a/mm/highmem.c ++++ b/mm/highmem.c +@@ -31,9 +31,11 @@ + #include <asm/tlbflush.h> + #include <linux/vmalloc.h> + ++#ifndef CONFIG_KMAP_LOCAL + #if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32) + DEFINE_PER_CPU(int, __kmap_atomic_idx); + #endif ++#endif + + /* + * Virtual_count is not a pure "count". +@@ -365,9 +367,147 @@ void kunmap_high(struct page *page) + if (need_wakeup) + wake_up(pkmap_map_wait); + } +- + EXPORT_SYMBOL(kunmap_high); +-#endif /* CONFIG_HIGHMEM */ ++#endif /* CONFIG_HIGHMEM */ ++ ++#ifdef CONFIG_KMAP_LOCAL ++ ++#include <asm/kmap_size.h> ++ ++static DEFINE_PER_CPU(int, __kmap_local_idx); ++ ++static inline int kmap_local_idx_push(void) ++{ ++ int idx = __this_cpu_inc_return(__kmap_local_idx) - 1; ++ ++ WARN_ON_ONCE(in_irq() && !irqs_disabled()); ++ BUG_ON(idx >= KM_MAX_IDX); ++ return idx; ++} ++ ++static inline int kmap_local_idx(void) ++{ ++ return __this_cpu_read(__kmap_local_idx) - 1; ++} ++ ++static inline void kmap_local_idx_pop(void) ++{ ++ int idx = __this_cpu_dec_return(__kmap_local_idx); ++ ++ BUG_ON(idx < 0); ++} ++ ++#ifndef arch_kmap_local_post_map ++# define arch_kmap_local_post_map(vaddr, pteval) do { } while (0) ++#endif ++#ifndef arch_kmap_local_pre_unmap ++# define arch_kmap_local_pre_unmap(vaddr) do { } while (0) ++#endif ++ ++#ifndef arch_kmap_local_post_unmap ++# define arch_kmap_local_post_unmap(vaddr) do { } while (0) ++#endif ++ ++#ifndef arch_kmap_local_map_idx ++#define arch_kmap_local_map_idx(idx, pfn) kmap_local_calc_idx(idx) ++#endif ++ ++#ifndef arch_kmap_local_unmap_idx ++#define arch_kmap_local_unmap_idx(idx, vaddr) kmap_local_calc_idx(idx) ++#endif ++ ++#ifndef arch_kmap_local_high_get ++static inline void *arch_kmap_local_high_get(struct page *page) ++{ ++ return NULL; ++} ++#endif ++ ++/* Unmap a local mapping which was obtained by kmap_high_get() */ ++static inline void kmap_high_unmap_local(unsigned long vaddr) ++{ ++#ifdef ARCH_NEEDS_KMAP_HIGH_GET ++ if (vaddr >= PKMAP_ADDR(0) && vaddr < PKMAP_ADDR(LAST_PKMAP)) ++ kunmap_high(pte_page(pkmap_page_table[PKMAP_NR(vaddr)])); ++#endif ++} ++ ++static inline int kmap_local_calc_idx(int idx) ++{ ++ return idx + KM_MAX_IDX * smp_processor_id(); ++} ++ ++static pte_t *__kmap_pte; ++ ++static pte_t *kmap_get_pte(void) ++{ ++ if (!__kmap_pte) ++ __kmap_pte = virt_to_kpte(__fix_to_virt(FIX_KMAP_BEGIN)); ++ return __kmap_pte; ++} ++ ++void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot) ++{ ++ pte_t pteval, *kmap_pte = kmap_get_pte(); ++ unsigned long vaddr; ++ int idx; ++ ++ preempt_disable(); ++ idx = arch_kmap_local_map_idx(kmap_local_idx_push(), pfn); ++ vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); ++ BUG_ON(!pte_none(*(kmap_pte - idx))); ++ pteval = pfn_pte(pfn, prot); ++ set_pte_at(&init_mm, vaddr, kmap_pte - idx, pteval); ++ arch_kmap_local_post_map(vaddr, pteval); ++ preempt_enable(); ++ ++ return (void *)vaddr; ++} ++EXPORT_SYMBOL_GPL(__kmap_local_pfn_prot); ++ ++void *__kmap_local_page_prot(struct page *page, pgprot_t prot) ++{ ++ void *kmap; ++ ++ if (!PageHighMem(page)) ++ return page_address(page); ++ ++ /* Try kmap_high_get() if architecture has it enabled */ ++ kmap = arch_kmap_local_high_get(page); ++ if (kmap) ++ return kmap; ++ ++ return __kmap_local_pfn_prot(page_to_pfn(page), prot); ++} ++EXPORT_SYMBOL(__kmap_local_page_prot); ++ ++void kunmap_local_indexed(void *vaddr) ++{ ++ unsigned long addr = (unsigned long) vaddr & PAGE_MASK; ++ pte_t *kmap_pte = kmap_get_pte(); ++ int idx; ++ ++ if (addr < __fix_to_virt(FIX_KMAP_END) || ++ addr > __fix_to_virt(FIX_KMAP_BEGIN)) { ++ WARN_ON_ONCE(addr < PAGE_OFFSET); ++ ++ /* Handle mappings which were obtained by kmap_high_get() */ ++ kmap_high_unmap_local(addr); ++ return; ++ } ++ ++ preempt_disable(); ++ idx = arch_kmap_local_unmap_idx(kmap_local_idx(), addr); ++ WARN_ON_ONCE(addr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); ++ ++ arch_kmap_local_pre_unmap(addr); ++ pte_clear(&init_mm, addr, kmap_pte - idx); ++ arch_kmap_local_post_unmap(addr); ++ kmap_local_idx_pop(); ++ preempt_enable(); ++} ++EXPORT_SYMBOL(kunmap_local_indexed); ++#endif + + #if defined(HASHED_PAGE_VIRTUAL) + +-- +2.43.0 + diff --git a/debian/patches-rt/0030-highmem-Make-DEBUG_HIGHMEM-functional.patch b/debian/patches-rt/0030-highmem-Make-DEBUG_HIGHMEM-functional.patch new file mode 100644 index 000000000..139157832 --- /dev/null +++ b/debian/patches-rt/0030-highmem-Make-DEBUG_HIGHMEM-functional.patch @@ -0,0 +1,61 @@ +From 4c9348bc5b225c2de1552b059787335345d30d2c Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Tue, 3 Nov 2020 10:27:19 +0100 +Subject: [PATCH 030/323] highmem: Make DEBUG_HIGHMEM functional +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +For some obscure reason when CONFIG_DEBUG_HIGHMEM is enabled the stack +depth is increased from 20 to 41. But the only thing DEBUG_HIGHMEM does is +to enable a few BUG_ON()'s in the mapping code. + +That's a leftover from the historical mapping code which had fixed entries +for various purposes. DEBUG_HIGHMEM inserted guard mappings between the map +types. But that got all ditched when kmap_atomic() switched to a stack +based map management. Though the WITH_KM_FENCE magic survived without being +functional. All the thing does today is to increase the stack depth. + +Add a working implementation to the generic kmap_local* implementation. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + mm/highmem.c | 14 ++++++++++++-- + 1 file changed, 12 insertions(+), 2 deletions(-) + +diff --git a/mm/highmem.c b/mm/highmem.c +index bb4ce13ee7e7..67d2d5983cb0 100644 +--- a/mm/highmem.c ++++ b/mm/highmem.c +@@ -376,9 +376,19 @@ EXPORT_SYMBOL(kunmap_high); + + static DEFINE_PER_CPU(int, __kmap_local_idx); + ++/* ++ * With DEBUG_HIGHMEM the stack depth is doubled and every second ++ * slot is unused which acts as a guard page ++ */ ++#ifdef CONFIG_DEBUG_HIGHMEM ++# define KM_INCR 2 ++#else ++# define KM_INCR 1 ++#endif ++ + static inline int kmap_local_idx_push(void) + { +- int idx = __this_cpu_inc_return(__kmap_local_idx) - 1; ++ int idx = __this_cpu_add_return(__kmap_local_idx, KM_INCR) - 1; + + WARN_ON_ONCE(in_irq() && !irqs_disabled()); + BUG_ON(idx >= KM_MAX_IDX); +@@ -392,7 +402,7 @@ static inline int kmap_local_idx(void) + + static inline void kmap_local_idx_pop(void) + { +- int idx = __this_cpu_dec_return(__kmap_local_idx); ++ int idx = __this_cpu_sub_return(__kmap_local_idx, KM_INCR); + + BUG_ON(idx < 0); + } +-- +2.43.0 + diff --git a/debian/patches-rt/0031-x86-mm-highmem-Use-generic-kmap-atomic-implementatio.patch b/debian/patches-rt/0031-x86-mm-highmem-Use-generic-kmap-atomic-implementatio.patch new file mode 100644 index 000000000..d381e36e5 --- /dev/null +++ b/debian/patches-rt/0031-x86-mm-highmem-Use-generic-kmap-atomic-implementatio.patch @@ -0,0 +1,389 @@ +From a6456b1e46c0a3b8ad0a9dd3afaeb69c037ad289 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Tue, 3 Nov 2020 10:27:20 +0100 +Subject: [PATCH 031/323] x86/mm/highmem: Use generic kmap atomic + implementation +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Convert X86 to the generic kmap atomic implementation and make the +iomap_atomic() naming convention consistent while at it. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: x86@kernel.org +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + arch/x86/Kconfig | 1 + + arch/x86/include/asm/fixmap.h | 5 +-- + arch/x86/include/asm/highmem.h | 13 ++++-- + arch/x86/include/asm/iomap.h | 18 ++++---- + arch/x86/include/asm/kmap_types.h | 13 ------ + arch/x86/include/asm/paravirt_types.h | 1 - + arch/x86/mm/highmem_32.c | 59 --------------------------- + arch/x86/mm/init_32.c | 15 ------- + arch/x86/mm/iomap_32.c | 59 +++------------------------ + include/linux/highmem.h | 2 +- + include/linux/io-mapping.h | 2 +- + mm/highmem.c | 2 +- + 12 files changed, 30 insertions(+), 160 deletions(-) + delete mode 100644 arch/x86/include/asm/kmap_types.h + +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index 6dc670e36393..54e5284a6ae1 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -15,6 +15,7 @@ config X86_32 + select CLKSRC_I8253 + select CLONE_BACKWARDS + select HAVE_DEBUG_STACKOVERFLOW ++ select KMAP_LOCAL + select MODULES_USE_ELF_REL + select OLD_SIGACTION + select GENERIC_VDSO_32 +diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h +index 77217bd292bd..8eba66a33e39 100644 +--- a/arch/x86/include/asm/fixmap.h ++++ b/arch/x86/include/asm/fixmap.h +@@ -31,7 +31,7 @@ + #include <asm/pgtable_types.h> + #ifdef CONFIG_X86_32 + #include <linux/threads.h> +-#include <asm/kmap_types.h> ++#include <asm/kmap_size.h> + #else + #include <uapi/asm/vsyscall.h> + #endif +@@ -94,7 +94,7 @@ enum fixed_addresses { + #endif + #ifdef CONFIG_X86_32 + FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */ +- FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1, ++ FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_MAX_IDX * NR_CPUS) - 1, + #ifdef CONFIG_PCI_MMCONFIG + FIX_PCIE_MCFG, + #endif +@@ -151,7 +151,6 @@ extern void reserve_top_address(unsigned long reserve); + + extern int fixmaps_set; + +-extern pte_t *kmap_pte; + extern pte_t *pkmap_page_table; + + void __native_set_fixmap(enum fixed_addresses idx, pte_t pte); +diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h +index 0f420b24e0fc..032e020853aa 100644 +--- a/arch/x86/include/asm/highmem.h ++++ b/arch/x86/include/asm/highmem.h +@@ -23,7 +23,6 @@ + + #include <linux/interrupt.h> + #include <linux/threads.h> +-#include <asm/kmap_types.h> + #include <asm/tlbflush.h> + #include <asm/paravirt.h> + #include <asm/fixmap.h> +@@ -58,11 +57,17 @@ extern unsigned long highstart_pfn, highend_pfn; + #define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) + #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) + +-void *kmap_atomic_pfn(unsigned long pfn); +-void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot); +- + #define flush_cache_kmaps() do { } while (0) + ++#define arch_kmap_local_post_map(vaddr, pteval) \ ++ arch_flush_lazy_mmu_mode() ++ ++#define arch_kmap_local_post_unmap(vaddr) \ ++ do { \ ++ flush_tlb_one_kernel((vaddr)); \ ++ arch_flush_lazy_mmu_mode(); \ ++ } while (0) ++ + extern void add_highpages_with_active_regions(int nid, unsigned long start_pfn, + unsigned long end_pfn); + +diff --git a/arch/x86/include/asm/iomap.h b/arch/x86/include/asm/iomap.h +index bacf68c4d70e..0be7a30fd6bc 100644 +--- a/arch/x86/include/asm/iomap.h ++++ b/arch/x86/include/asm/iomap.h +@@ -9,19 +9,21 @@ + #include <linux/fs.h> + #include <linux/mm.h> + #include <linux/uaccess.h> ++#include <linux/highmem.h> + #include <asm/cacheflush.h> + #include <asm/tlbflush.h> + +-void __iomem * +-iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot); ++void __iomem *iomap_atomic_pfn_prot(unsigned long pfn, pgprot_t prot); + +-void +-iounmap_atomic(void __iomem *kvaddr); ++static inline void iounmap_atomic(void __iomem *vaddr) ++{ ++ kunmap_local_indexed((void __force *)vaddr); ++ pagefault_enable(); ++ preempt_enable(); ++} + +-int +-iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot); ++int iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot); + +-void +-iomap_free(resource_size_t base, unsigned long size); ++void iomap_free(resource_size_t base, unsigned long size); + + #endif /* _ASM_X86_IOMAP_H */ +diff --git a/arch/x86/include/asm/kmap_types.h b/arch/x86/include/asm/kmap_types.h +deleted file mode 100644 +index 04ab8266e347..000000000000 +--- a/arch/x86/include/asm/kmap_types.h ++++ /dev/null +@@ -1,13 +0,0 @@ +-/* SPDX-License-Identifier: GPL-2.0 */ +-#ifndef _ASM_X86_KMAP_TYPES_H +-#define _ASM_X86_KMAP_TYPES_H +- +-#if defined(CONFIG_X86_32) && defined(CONFIG_DEBUG_HIGHMEM) +-#define __WITH_KM_FENCE +-#endif +- +-#include <asm-generic/kmap_types.h> +- +-#undef __WITH_KM_FENCE +- +-#endif /* _ASM_X86_KMAP_TYPES_H */ +diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h +index 903d71884fa2..130f428b0cc8 100644 +--- a/arch/x86/include/asm/paravirt_types.h ++++ b/arch/x86/include/asm/paravirt_types.h +@@ -41,7 +41,6 @@ + #ifndef __ASSEMBLY__ + + #include <asm/desc_defs.h> +-#include <asm/kmap_types.h> + #include <asm/pgtable_types.h> + #include <asm/nospec-branch.h> + +diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c +index 075fe51317b0..2c54b76d8f84 100644 +--- a/arch/x86/mm/highmem_32.c ++++ b/arch/x86/mm/highmem_32.c +@@ -4,65 +4,6 @@ + #include <linux/swap.h> /* for totalram_pages */ + #include <linux/memblock.h> + +-void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) +-{ +- unsigned long vaddr; +- int idx, type; +- +- type = kmap_atomic_idx_push(); +- idx = type + KM_TYPE_NR*smp_processor_id(); +- vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); +- BUG_ON(!pte_none(*(kmap_pte-idx))); +- set_pte(kmap_pte-idx, mk_pte(page, prot)); +- arch_flush_lazy_mmu_mode(); +- +- return (void *)vaddr; +-} +-EXPORT_SYMBOL(kmap_atomic_high_prot); +- +-/* +- * This is the same as kmap_atomic() but can map memory that doesn't +- * have a struct page associated with it. +- */ +-void *kmap_atomic_pfn(unsigned long pfn) +-{ +- return kmap_atomic_prot_pfn(pfn, kmap_prot); +-} +-EXPORT_SYMBOL_GPL(kmap_atomic_pfn); +- +-void kunmap_atomic_high(void *kvaddr) +-{ +- unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; +- +- if (vaddr >= __fix_to_virt(FIX_KMAP_END) && +- vaddr <= __fix_to_virt(FIX_KMAP_BEGIN)) { +- int idx, type; +- +- type = kmap_atomic_idx(); +- idx = type + KM_TYPE_NR * smp_processor_id(); +- +-#ifdef CONFIG_DEBUG_HIGHMEM +- WARN_ON_ONCE(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); +-#endif +- /* +- * Force other mappings to Oops if they'll try to access this +- * pte without first remap it. Keeping stale mappings around +- * is a bad idea also, in case the page changes cacheability +- * attributes or becomes a protected page in a hypervisor. +- */ +- kpte_clear_flush(kmap_pte-idx, vaddr); +- kmap_atomic_idx_pop(); +- arch_flush_lazy_mmu_mode(); +- } +-#ifdef CONFIG_DEBUG_HIGHMEM +- else { +- BUG_ON(vaddr < PAGE_OFFSET); +- BUG_ON(vaddr >= (unsigned long)high_memory); +- } +-#endif +-} +-EXPORT_SYMBOL(kunmap_atomic_high); +- + void __init set_highmem_pages_init(void) + { + struct zone *zone; +diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c +index 7c055259de3a..da31c2635ee4 100644 +--- a/arch/x86/mm/init_32.c ++++ b/arch/x86/mm/init_32.c +@@ -394,19 +394,6 @@ kernel_physical_mapping_init(unsigned long start, + return last_map_addr; + } + +-pte_t *kmap_pte; +- +-static void __init kmap_init(void) +-{ +- unsigned long kmap_vstart; +- +- /* +- * Cache the first kmap pte: +- */ +- kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN); +- kmap_pte = virt_to_kpte(kmap_vstart); +-} +- + #ifdef CONFIG_HIGHMEM + static void __init permanent_kmaps_init(pgd_t *pgd_base) + { +@@ -712,8 +699,6 @@ void __init paging_init(void) + + __flush_tlb_all(); + +- kmap_init(); +- + /* + * NOTE: at this point the bootmem allocator is fully available. + */ +diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c +index f60398aeb644..e0a40d7cc66c 100644 +--- a/arch/x86/mm/iomap_32.c ++++ b/arch/x86/mm/iomap_32.c +@@ -44,28 +44,7 @@ void iomap_free(resource_size_t base, unsigned long size) + } + EXPORT_SYMBOL_GPL(iomap_free); + +-void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) +-{ +- unsigned long vaddr; +- int idx, type; +- +- preempt_disable(); +- pagefault_disable(); +- +- type = kmap_atomic_idx_push(); +- idx = type + KM_TYPE_NR * smp_processor_id(); +- vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); +- set_pte(kmap_pte - idx, pfn_pte(pfn, prot)); +- arch_flush_lazy_mmu_mode(); +- +- return (void *)vaddr; +-} +- +-/* +- * Map 'pfn' using protections 'prot' +- */ +-void __iomem * +-iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) ++void __iomem *iomap_atomic_pfn_prot(unsigned long pfn, pgprot_t prot) + { + /* + * For non-PAT systems, translate non-WB request to UC- just in +@@ -81,36 +60,8 @@ iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) + /* Filter out unsupported __PAGE_KERNEL* bits: */ + pgprot_val(prot) &= __default_kernel_pte_mask; + +- return (void __force __iomem *) kmap_atomic_prot_pfn(pfn, prot); +-} +-EXPORT_SYMBOL_GPL(iomap_atomic_prot_pfn); +- +-void +-iounmap_atomic(void __iomem *kvaddr) +-{ +- unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; +- +- if (vaddr >= __fix_to_virt(FIX_KMAP_END) && +- vaddr <= __fix_to_virt(FIX_KMAP_BEGIN)) { +- int idx, type; +- +- type = kmap_atomic_idx(); +- idx = type + KM_TYPE_NR * smp_processor_id(); +- +-#ifdef CONFIG_DEBUG_HIGHMEM +- WARN_ON_ONCE(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); +-#endif +- /* +- * Force other mappings to Oops if they'll try to access this +- * pte without first remap it. Keeping stale mappings around +- * is a bad idea also, in case the page changes cacheability +- * attributes or becomes a protected page in a hypervisor. +- */ +- kpte_clear_flush(kmap_pte-idx, vaddr); +- kmap_atomic_idx_pop(); +- } +- +- pagefault_enable(); +- preempt_enable(); ++ preempt_disable(); ++ pagefault_disable(); ++ return (void __force __iomem *)__kmap_local_pfn_prot(pfn, prot); + } +-EXPORT_SYMBOL_GPL(iounmap_atomic); ++EXPORT_SYMBOL_GPL(iomap_atomic_pfn_prot); +diff --git a/include/linux/highmem.h b/include/linux/highmem.h +index 14d5b4020c8c..fbede783dc34 100644 +--- a/include/linux/highmem.h ++++ b/include/linux/highmem.h +@@ -217,7 +217,7 @@ static inline void __kunmap_atomic(void *addr) + #endif /* CONFIG_HIGHMEM */ + + #if !defined(CONFIG_KMAP_LOCAL) +-#if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32) ++#if defined(CONFIG_HIGHMEM) + + DECLARE_PER_CPU(int, __kmap_atomic_idx); + +diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h +index c75e4d3d8833..3b0940be72e9 100644 +--- a/include/linux/io-mapping.h ++++ b/include/linux/io-mapping.h +@@ -69,7 +69,7 @@ io_mapping_map_atomic_wc(struct io_mapping *mapping, + + BUG_ON(offset >= mapping->size); + phys_addr = mapping->base + offset; +- return iomap_atomic_prot_pfn(PHYS_PFN(phys_addr), mapping->prot); ++ return iomap_atomic_pfn_prot(PHYS_PFN(phys_addr), mapping->prot); + } + + static inline void +diff --git a/mm/highmem.c b/mm/highmem.c +index 67d2d5983cb0..77677c6844f7 100644 +--- a/mm/highmem.c ++++ b/mm/highmem.c +@@ -32,7 +32,7 @@ + #include <linux/vmalloc.h> + + #ifndef CONFIG_KMAP_LOCAL +-#if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32) ++#ifdef CONFIG_HIGHMEM + DEFINE_PER_CPU(int, __kmap_atomic_idx); + #endif + #endif +-- +2.43.0 + diff --git a/debian/patches-rt/0032-arc-mm-highmem-Use-generic-kmap-atomic-implementatio.patch b/debian/patches-rt/0032-arc-mm-highmem-Use-generic-kmap-atomic-implementatio.patch new file mode 100644 index 000000000..581f1da69 --- /dev/null +++ b/debian/patches-rt/0032-arc-mm-highmem-Use-generic-kmap-atomic-implementatio.patch @@ -0,0 +1,212 @@ +From 7eeaddc0ba0792dd0d8a1f5f167ff44230b77855 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Tue, 3 Nov 2020 10:27:21 +0100 +Subject: [PATCH 032/323] arc/mm/highmem: Use generic kmap atomic + implementation +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Adopt the map ordering to match the other architectures and the generic +code. Also make the maximum entries limited and not dependend on the number +of CPUs. With the original implementation did the following calculation: + + nr_slots = mapsize >> PAGE_SHIFT; + +The results in either 512 or 1024 total slots depending on +configuration. The total slots have to be divided by the number of CPUs to +get the number of slots per CPU (former KM_TYPE_NR). ARC supports up to 4k +CPUs, so this just falls apart in random ways depending on the number of +CPUs and the actual kmap (atomic) nesting. The comment in highmem.c: + + * - fixmap anyhow needs a limited number of mappings. So 2M kvaddr == 256 PTE + * slots across NR_CPUS would be more than sufficient (generic code defines + * KM_TYPE_NR as 20). + +is just wrong. KM_TYPE_NR (now KM_MAX_IDX) is the number of slots per CPU +because kmap_local/atomic() needs to support nested mappings (thread, +softirq, interrupt). While KM_MAX_IDX might be overestimated, the above +reasoning is just wrong and clearly the highmem code was never tested with +any system with more than a few CPUs. + +Use the default number of slots and fail the build when it does not +fit. Randomly failing at runtime is not a really good option. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Vineet Gupta <vgupta@synopsys.com> +Cc: linux-snps-arc@lists.infradead.org +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + arch/arc/Kconfig | 1 + + arch/arc/include/asm/highmem.h | 26 +++++++++++---- + arch/arc/include/asm/kmap_types.h | 14 -------- + arch/arc/mm/highmem.c | 54 +++---------------------------- + 4 files changed, 26 insertions(+), 69 deletions(-) + delete mode 100644 arch/arc/include/asm/kmap_types.h + +diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig +index 0a89cc9def65..d8804001d550 100644 +--- a/arch/arc/Kconfig ++++ b/arch/arc/Kconfig +@@ -507,6 +507,7 @@ config LINUX_RAM_BASE + config HIGHMEM + bool "High Memory Support" + select ARCH_DISCONTIGMEM_ENABLE ++ select KMAP_LOCAL + help + With ARC 2G:2G address split, only upper 2G is directly addressable by + kernel. Enable this to potentially allow access to rest of 2G and PAE +diff --git a/arch/arc/include/asm/highmem.h b/arch/arc/include/asm/highmem.h +index 6e5eafb3afdd..a6b8e2c352c4 100644 +--- a/arch/arc/include/asm/highmem.h ++++ b/arch/arc/include/asm/highmem.h +@@ -9,17 +9,29 @@ + #ifdef CONFIG_HIGHMEM + + #include <uapi/asm/page.h> +-#include <asm/kmap_types.h> ++#include <asm/kmap_size.h> ++ ++#define FIXMAP_SIZE PGDIR_SIZE ++#define PKMAP_SIZE PGDIR_SIZE + + /* start after vmalloc area */ + #define FIXMAP_BASE (PAGE_OFFSET - FIXMAP_SIZE - PKMAP_SIZE) +-#define FIXMAP_SIZE PGDIR_SIZE /* only 1 PGD worth */ +-#define KM_TYPE_NR ((FIXMAP_SIZE >> PAGE_SHIFT)/NR_CPUS) +-#define FIXMAP_ADDR(nr) (FIXMAP_BASE + ((nr) << PAGE_SHIFT)) ++ ++#define FIX_KMAP_SLOTS (KM_MAX_IDX * NR_CPUS) ++#define FIX_KMAP_BEGIN (0UL) ++#define FIX_KMAP_END ((FIX_KMAP_BEGIN + FIX_KMAP_SLOTS) - 1) ++ ++#define FIXADDR_TOP (FIXMAP_BASE + (FIX_KMAP_END << PAGE_SHIFT)) ++ ++/* ++ * This should be converted to the asm-generic version, but of course this ++ * is needlessly different from all other architectures. Sigh - tglx ++ */ ++#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT)) ++#define __virt_to_fix(x) (((FIXADDR_TOP - ((x) & PAGE_MASK))) >> PAGE_SHIFT) + + /* start after fixmap area */ + #define PKMAP_BASE (FIXMAP_BASE + FIXMAP_SIZE) +-#define PKMAP_SIZE PGDIR_SIZE + #define LAST_PKMAP (PKMAP_SIZE >> PAGE_SHIFT) + #define LAST_PKMAP_MASK (LAST_PKMAP - 1) + #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) +@@ -29,11 +41,13 @@ + + extern void kmap_init(void); + ++#define arch_kmap_local_post_unmap(vaddr) \ ++ local_flush_tlb_kernel_range(vaddr, vaddr + PAGE_SIZE) ++ + static inline void flush_cache_kmaps(void) + { + flush_cache_all(); + } +- + #endif + + #endif +diff --git a/arch/arc/include/asm/kmap_types.h b/arch/arc/include/asm/kmap_types.h +deleted file mode 100644 +index fecf7851ec32..000000000000 +--- a/arch/arc/include/asm/kmap_types.h ++++ /dev/null +@@ -1,14 +0,0 @@ +-/* SPDX-License-Identifier: GPL-2.0-only */ +-/* +- * Copyright (C) 2015 Synopsys, Inc. (www.synopsys.com) +- */ +- +-#ifndef _ASM_KMAP_TYPES_H +-#define _ASM_KMAP_TYPES_H +- +-/* +- * We primarily need to define KM_TYPE_NR here but that in turn +- * is a function of PGDIR_SIZE etc. +- * To avoid circular deps issue, put everything in asm/highmem.h +- */ +-#endif +diff --git a/arch/arc/mm/highmem.c b/arch/arc/mm/highmem.c +index 1b9f473c6369..c79912a6b196 100644 +--- a/arch/arc/mm/highmem.c ++++ b/arch/arc/mm/highmem.c +@@ -36,9 +36,8 @@ + * This means each only has 1 PGDIR_SIZE worth of kvaddr mappings, which means + * 2M of kvaddr space for typical config (8K page and 11:8:13 traversal split) + * +- * - fixmap anyhow needs a limited number of mappings. So 2M kvaddr == 256 PTE +- * slots across NR_CPUS would be more than sufficient (generic code defines +- * KM_TYPE_NR as 20). ++ * - The fixed KMAP slots for kmap_local/atomic() require KM_MAX_IDX slots per ++ * CPU. So the number of CPUs sharing a single PTE page is limited. + * + * - pkmap being preemptible, in theory could do with more than 256 concurrent + * mappings. However, generic pkmap code: map_new_virtual(), doesn't traverse +@@ -47,48 +46,6 @@ + */ + + extern pte_t * pkmap_page_table; +-static pte_t * fixmap_page_table; +- +-void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) +-{ +- int idx, cpu_idx; +- unsigned long vaddr; +- +- cpu_idx = kmap_atomic_idx_push(); +- idx = cpu_idx + KM_TYPE_NR * smp_processor_id(); +- vaddr = FIXMAP_ADDR(idx); +- +- set_pte_at(&init_mm, vaddr, fixmap_page_table + idx, +- mk_pte(page, prot)); +- +- return (void *)vaddr; +-} +-EXPORT_SYMBOL(kmap_atomic_high_prot); +- +-void kunmap_atomic_high(void *kv) +-{ +- unsigned long kvaddr = (unsigned long)kv; +- +- if (kvaddr >= FIXMAP_BASE && kvaddr < (FIXMAP_BASE + FIXMAP_SIZE)) { +- +- /* +- * Because preemption is disabled, this vaddr can be associated +- * with the current allocated index. +- * But in case of multiple live kmap_atomic(), it still relies on +- * callers to unmap in right order. +- */ +- int cpu_idx = kmap_atomic_idx(); +- int idx = cpu_idx + KM_TYPE_NR * smp_processor_id(); +- +- WARN_ON(kvaddr != FIXMAP_ADDR(idx)); +- +- pte_clear(&init_mm, kvaddr, fixmap_page_table + idx); +- local_flush_tlb_kernel_range(kvaddr, kvaddr + PAGE_SIZE); +- +- kmap_atomic_idx_pop(); +- } +-} +-EXPORT_SYMBOL(kunmap_atomic_high); + + static noinline pte_t * __init alloc_kmap_pgtable(unsigned long kvaddr) + { +@@ -108,10 +65,9 @@ void __init kmap_init(void) + { + /* Due to recursive include hell, we can't do this in processor.h */ + BUILD_BUG_ON(PAGE_OFFSET < (VMALLOC_END + FIXMAP_SIZE + PKMAP_SIZE)); ++ BUILD_BUG_ON(LAST_PKMAP > PTRS_PER_PTE); ++ BUILD_BUG_ON(FIX_KMAP_SLOTS > PTRS_PER_PTE); + +- BUILD_BUG_ON(KM_TYPE_NR > PTRS_PER_PTE); + pkmap_page_table = alloc_kmap_pgtable(PKMAP_BASE); +- +- BUILD_BUG_ON(LAST_PKMAP > PTRS_PER_PTE); +- fixmap_page_table = alloc_kmap_pgtable(FIXMAP_BASE); ++ alloc_kmap_pgtable(FIXMAP_BASE); + } +-- +2.43.0 + diff --git a/debian/patches-rt/0033-ARM-highmem-Switch-to-generic-kmap-atomic.patch b/debian/patches-rt/0033-ARM-highmem-Switch-to-generic-kmap-atomic.patch new file mode 100644 index 000000000..7f4d188e7 --- /dev/null +++ b/debian/patches-rt/0033-ARM-highmem-Switch-to-generic-kmap-atomic.patch @@ -0,0 +1,271 @@ +From 007968641a299ffd9fd631d7f67fc119aaf9ac5d Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Tue, 3 Nov 2020 10:27:22 +0100 +Subject: [PATCH 033/323] ARM: highmem: Switch to generic kmap atomic +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +No reason having the same code in every architecture. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Russell King <linux@armlinux.org.uk> +Cc: Arnd Bergmann <arnd@arndb.de> +Cc: linux-arm-kernel@lists.infradead.org +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + arch/arm/Kconfig | 1 + + arch/arm/include/asm/fixmap.h | 4 +- + arch/arm/include/asm/highmem.h | 34 ++++++--- + arch/arm/include/asm/kmap_types.h | 10 --- + arch/arm/mm/Makefile | 1 - + arch/arm/mm/highmem.c | 121 ------------------------------ + 6 files changed, 27 insertions(+), 144 deletions(-) + delete mode 100644 arch/arm/include/asm/kmap_types.h + delete mode 100644 arch/arm/mm/highmem.c + +diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig +index 335308aff6ce..c01251683018 100644 +--- a/arch/arm/Kconfig ++++ b/arch/arm/Kconfig +@@ -1497,6 +1497,7 @@ config HAVE_ARCH_PFN_VALID + config HIGHMEM + bool "High Memory Support" + depends on MMU ++ select KMAP_LOCAL + help + The address space of ARM processors is only 4 Gigabytes large + and it has to accommodate user address space, kernel address +diff --git a/arch/arm/include/asm/fixmap.h b/arch/arm/include/asm/fixmap.h +index 9575b404019c..707068f852c2 100644 +--- a/arch/arm/include/asm/fixmap.h ++++ b/arch/arm/include/asm/fixmap.h +@@ -7,14 +7,14 @@ + #define FIXADDR_TOP (FIXADDR_END - PAGE_SIZE) + + #include <linux/pgtable.h> +-#include <asm/kmap_types.h> ++#include <asm/kmap_size.h> + + enum fixed_addresses { + FIX_EARLYCON_MEM_BASE, + __end_of_permanent_fixed_addresses, + + FIX_KMAP_BEGIN = __end_of_permanent_fixed_addresses, +- FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_TYPE_NR * NR_CPUS) - 1, ++ FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_MAX_IDX * NR_CPUS) - 1, + + /* Support writing RO kernel text via kprobes, jump labels, etc. */ + FIX_TEXT_POKE0, +diff --git a/arch/arm/include/asm/highmem.h b/arch/arm/include/asm/highmem.h +index 31811be38d78..b22dffa8c7eb 100644 +--- a/arch/arm/include/asm/highmem.h ++++ b/arch/arm/include/asm/highmem.h +@@ -2,7 +2,8 @@ + #ifndef _ASM_HIGHMEM_H + #define _ASM_HIGHMEM_H + +-#include <asm/kmap_types.h> ++#include <asm/kmap_size.h> ++#include <asm/fixmap.h> + + #define PKMAP_BASE (PAGE_OFFSET - PMD_SIZE) + #define LAST_PKMAP PTRS_PER_PTE +@@ -46,19 +47,32 @@ extern pte_t *pkmap_page_table; + + #ifdef ARCH_NEEDS_KMAP_HIGH_GET + extern void *kmap_high_get(struct page *page); +-#else ++ ++static inline void *arch_kmap_local_high_get(struct page *page) ++{ ++ if (IS_ENABLED(CONFIG_DEBUG_HIGHMEM) && !cache_is_vivt()) ++ return NULL; ++ return kmap_high_get(page); ++} ++#define arch_kmap_local_high_get arch_kmap_local_high_get ++ ++#else /* ARCH_NEEDS_KMAP_HIGH_GET */ + static inline void *kmap_high_get(struct page *page) + { + return NULL; + } +-#endif ++#endif /* !ARCH_NEEDS_KMAP_HIGH_GET */ + +-/* +- * The following functions are already defined by <linux/highmem.h> +- * when CONFIG_HIGHMEM is not set. +- */ +-#ifdef CONFIG_HIGHMEM +-extern void *kmap_atomic_pfn(unsigned long pfn); +-#endif ++#define arch_kmap_local_post_map(vaddr, pteval) \ ++ local_flush_tlb_kernel_page(vaddr) ++ ++#define arch_kmap_local_pre_unmap(vaddr) \ ++do { \ ++ if (cache_is_vivt()) \ ++ __cpuc_flush_dcache_area((void *)vaddr, PAGE_SIZE); \ ++} while (0) ++ ++#define arch_kmap_local_post_unmap(vaddr) \ ++ local_flush_tlb_kernel_page(vaddr) + + #endif +diff --git a/arch/arm/include/asm/kmap_types.h b/arch/arm/include/asm/kmap_types.h +deleted file mode 100644 +index 5590940ee43d..000000000000 +--- a/arch/arm/include/asm/kmap_types.h ++++ /dev/null +@@ -1,10 +0,0 @@ +-/* SPDX-License-Identifier: GPL-2.0 */ +-#ifndef __ARM_KMAP_TYPES_H +-#define __ARM_KMAP_TYPES_H +- +-/* +- * This is the "bare minimum". AIO seems to require this. +- */ +-#define KM_TYPE_NR 16 +- +-#endif +diff --git a/arch/arm/mm/Makefile b/arch/arm/mm/Makefile +index 7cb1699fbfc4..c4ce477c5261 100644 +--- a/arch/arm/mm/Makefile ++++ b/arch/arm/mm/Makefile +@@ -19,7 +19,6 @@ obj-$(CONFIG_MODULES) += proc-syms.o + obj-$(CONFIG_DEBUG_VIRTUAL) += physaddr.o + + obj-$(CONFIG_ALIGNMENT_TRAP) += alignment.o +-obj-$(CONFIG_HIGHMEM) += highmem.o + obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o + obj-$(CONFIG_ARM_PV_FIXUP) += pv-fixup-asm.o + +diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c +deleted file mode 100644 +index 187fab227b50..000000000000 +--- a/arch/arm/mm/highmem.c ++++ /dev/null +@@ -1,121 +0,0 @@ +-// SPDX-License-Identifier: GPL-2.0-only +-/* +- * arch/arm/mm/highmem.c -- ARM highmem support +- * +- * Author: Nicolas Pitre +- * Created: september 8, 2008 +- * Copyright: Marvell Semiconductors Inc. +- */ +- +-#include <linux/module.h> +-#include <linux/highmem.h> +-#include <linux/interrupt.h> +-#include <asm/fixmap.h> +-#include <asm/cacheflush.h> +-#include <asm/tlbflush.h> +-#include "mm.h" +- +-static inline void set_fixmap_pte(int idx, pte_t pte) +-{ +- unsigned long vaddr = __fix_to_virt(idx); +- pte_t *ptep = virt_to_kpte(vaddr); +- +- set_pte_ext(ptep, pte, 0); +- local_flush_tlb_kernel_page(vaddr); +-} +- +-static inline pte_t get_fixmap_pte(unsigned long vaddr) +-{ +- pte_t *ptep = virt_to_kpte(vaddr); +- +- return *ptep; +-} +- +-void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) +-{ +- unsigned int idx; +- unsigned long vaddr; +- void *kmap; +- int type; +- +-#ifdef CONFIG_DEBUG_HIGHMEM +- /* +- * There is no cache coherency issue when non VIVT, so force the +- * dedicated kmap usage for better debugging purposes in that case. +- */ +- if (!cache_is_vivt()) +- kmap = NULL; +- else +-#endif +- kmap = kmap_high_get(page); +- if (kmap) +- return kmap; +- +- type = kmap_atomic_idx_push(); +- +- idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id(); +- vaddr = __fix_to_virt(idx); +-#ifdef CONFIG_DEBUG_HIGHMEM +- /* +- * With debugging enabled, kunmap_atomic forces that entry to 0. +- * Make sure it was indeed properly unmapped. +- */ +- BUG_ON(!pte_none(get_fixmap_pte(vaddr))); +-#endif +- /* +- * When debugging is off, kunmap_atomic leaves the previous mapping +- * in place, so the contained TLB flush ensures the TLB is updated +- * with the new mapping. +- */ +- set_fixmap_pte(idx, mk_pte(page, prot)); +- +- return (void *)vaddr; +-} +-EXPORT_SYMBOL(kmap_atomic_high_prot); +- +-void kunmap_atomic_high(void *kvaddr) +-{ +- unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; +- int idx, type; +- +- if (kvaddr >= (void *)FIXADDR_START) { +- type = kmap_atomic_idx(); +- idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id(); +- +- if (cache_is_vivt()) +- __cpuc_flush_dcache_area((void *)vaddr, PAGE_SIZE); +-#ifdef CONFIG_DEBUG_HIGHMEM +- BUG_ON(vaddr != __fix_to_virt(idx)); +- set_fixmap_pte(idx, __pte(0)); +-#else +- (void) idx; /* to kill a warning */ +-#endif +- kmap_atomic_idx_pop(); +- } else if (vaddr >= PKMAP_ADDR(0) && vaddr < PKMAP_ADDR(LAST_PKMAP)) { +- /* this address was obtained through kmap_high_get() */ +- kunmap_high(pte_page(pkmap_page_table[PKMAP_NR(vaddr)])); +- } +-} +-EXPORT_SYMBOL(kunmap_atomic_high); +- +-void *kmap_atomic_pfn(unsigned long pfn) +-{ +- unsigned long vaddr; +- int idx, type; +- struct page *page = pfn_to_page(pfn); +- +- preempt_disable(); +- pagefault_disable(); +- if (!PageHighMem(page)) +- return page_address(page); +- +- type = kmap_atomic_idx_push(); +- idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id(); +- vaddr = __fix_to_virt(idx); +-#ifdef CONFIG_DEBUG_HIGHMEM +- BUG_ON(!pte_none(get_fixmap_pte(vaddr))); +-#endif +- set_fixmap_pte(idx, pfn_pte(pfn, kmap_prot)); +- +- return (void *)vaddr; +-} +-- +2.43.0 + diff --git a/debian/patches-rt/0034-csky-mm-highmem-Switch-to-generic-kmap-atomic.patch b/debian/patches-rt/0034-csky-mm-highmem-Switch-to-generic-kmap-atomic.patch new file mode 100644 index 000000000..667836a1e --- /dev/null +++ b/debian/patches-rt/0034-csky-mm-highmem-Switch-to-generic-kmap-atomic.patch @@ -0,0 +1,179 @@ +From 3af9ca89d4398239a71471cffb488e3104990e23 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Tue, 3 Nov 2020 10:27:23 +0100 +Subject: [PATCH 034/323] csky/mm/highmem: Switch to generic kmap atomic +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +No reason having the same code in every architecture. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: linux-csky@vger.kernel.org +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + arch/csky/Kconfig | 1 + + arch/csky/include/asm/fixmap.h | 4 +- + arch/csky/include/asm/highmem.h | 6 ++- + arch/csky/mm/highmem.c | 75 +-------------------------------- + 4 files changed, 8 insertions(+), 78 deletions(-) + +diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig +index 7bf0a617e94c..c9f2533cc53d 100644 +--- a/arch/csky/Kconfig ++++ b/arch/csky/Kconfig +@@ -286,6 +286,7 @@ config NR_CPUS + config HIGHMEM + bool "High Memory Support" + depends on !CPU_CK610 ++ select KMAP_LOCAL + default y + + config FORCE_MAX_ZONEORDER +diff --git a/arch/csky/include/asm/fixmap.h b/arch/csky/include/asm/fixmap.h +index 81f9477d5330..4b589cc20900 100644 +--- a/arch/csky/include/asm/fixmap.h ++++ b/arch/csky/include/asm/fixmap.h +@@ -8,7 +8,7 @@ + #include <asm/memory.h> + #ifdef CONFIG_HIGHMEM + #include <linux/threads.h> +-#include <asm/kmap_types.h> ++#include <asm/kmap_size.h> + #endif + + enum fixed_addresses { +@@ -17,7 +17,7 @@ enum fixed_addresses { + #endif + #ifdef CONFIG_HIGHMEM + FIX_KMAP_BEGIN, +- FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_TYPE_NR * NR_CPUS) - 1, ++ FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_MAX_IDX * NR_CPUS) - 1, + #endif + __end_of_fixed_addresses + }; +diff --git a/arch/csky/include/asm/highmem.h b/arch/csky/include/asm/highmem.h +index 14645e3d5cd5..1f4ed3f4c0d9 100644 +--- a/arch/csky/include/asm/highmem.h ++++ b/arch/csky/include/asm/highmem.h +@@ -9,7 +9,7 @@ + #include <linux/init.h> + #include <linux/interrupt.h> + #include <linux/uaccess.h> +-#include <asm/kmap_types.h> ++#include <asm/kmap_size.h> + #include <asm/cache.h> + + /* undef for production */ +@@ -32,10 +32,12 @@ extern pte_t *pkmap_page_table; + + #define ARCH_HAS_KMAP_FLUSH_TLB + extern void kmap_flush_tlb(unsigned long addr); +-extern void *kmap_atomic_pfn(unsigned long pfn); + + #define flush_cache_kmaps() do {} while (0) + ++#define arch_kmap_local_post_map(vaddr, pteval) kmap_flush_tlb(vaddr) ++#define arch_kmap_local_post_unmap(vaddr) kmap_flush_tlb(vaddr) ++ + extern void kmap_init(void); + + #endif /* __KERNEL__ */ +diff --git a/arch/csky/mm/highmem.c b/arch/csky/mm/highmem.c +index 89c10800a002..4161df3c6c15 100644 +--- a/arch/csky/mm/highmem.c ++++ b/arch/csky/mm/highmem.c +@@ -9,8 +9,6 @@ + #include <asm/tlbflush.h> + #include <asm/cacheflush.h> + +-static pte_t *kmap_pte; +- + unsigned long highstart_pfn, highend_pfn; + + void kmap_flush_tlb(unsigned long addr) +@@ -19,67 +17,7 @@ void kmap_flush_tlb(unsigned long addr) + } + EXPORT_SYMBOL(kmap_flush_tlb); + +-void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) +-{ +- unsigned long vaddr; +- int idx, type; +- +- type = kmap_atomic_idx_push(); +- idx = type + KM_TYPE_NR*smp_processor_id(); +- vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); +-#ifdef CONFIG_DEBUG_HIGHMEM +- BUG_ON(!pte_none(*(kmap_pte - idx))); +-#endif +- set_pte(kmap_pte-idx, mk_pte(page, prot)); +- flush_tlb_one((unsigned long)vaddr); +- +- return (void *)vaddr; +-} +-EXPORT_SYMBOL(kmap_atomic_high_prot); +- +-void kunmap_atomic_high(void *kvaddr) +-{ +- unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; +- int idx; +- +- if (vaddr < FIXADDR_START) +- return; +- +-#ifdef CONFIG_DEBUG_HIGHMEM +- idx = KM_TYPE_NR*smp_processor_id() + kmap_atomic_idx(); +- +- BUG_ON(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); +- +- pte_clear(&init_mm, vaddr, kmap_pte - idx); +- flush_tlb_one(vaddr); +-#else +- (void) idx; /* to kill a warning */ +-#endif +- kmap_atomic_idx_pop(); +-} +-EXPORT_SYMBOL(kunmap_atomic_high); +- +-/* +- * This is the same as kmap_atomic() but can map memory that doesn't +- * have a struct page associated with it. +- */ +-void *kmap_atomic_pfn(unsigned long pfn) +-{ +- unsigned long vaddr; +- int idx, type; +- +- pagefault_disable(); +- +- type = kmap_atomic_idx_push(); +- idx = type + KM_TYPE_NR*smp_processor_id(); +- vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); +- set_pte(kmap_pte-idx, pfn_pte(pfn, PAGE_KERNEL)); +- flush_tlb_one(vaddr); +- +- return (void *) vaddr; +-} +- +-static void __init kmap_pages_init(void) ++void __init kmap_init(void) + { + unsigned long vaddr; + pgd_t *pgd; +@@ -96,14 +34,3 @@ static void __init kmap_pages_init(void) + pte = pte_offset_kernel(pmd, vaddr); + pkmap_page_table = pte; + } +- +-void __init kmap_init(void) +-{ +- unsigned long vaddr; +- +- kmap_pages_init(); +- +- vaddr = __fix_to_virt(FIX_KMAP_BEGIN); +- +- kmap_pte = pte_offset_kernel((pmd_t *)pgd_offset_k(vaddr), vaddr); +-} +-- +2.43.0 + diff --git a/debian/patches-rt/0035-microblaze-mm-highmem-Switch-to-generic-kmap-atomic.patch b/debian/patches-rt/0035-microblaze-mm-highmem-Switch-to-generic-kmap-atomic.patch new file mode 100644 index 000000000..80bd103c5 --- /dev/null +++ b/debian/patches-rt/0035-microblaze-mm-highmem-Switch-to-generic-kmap-atomic.patch @@ -0,0 +1,197 @@ +From bfdad55346f7206cf980bbd6cbc264082c54ae0c Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Tue, 3 Nov 2020 10:27:24 +0100 +Subject: [PATCH 035/323] microblaze/mm/highmem: Switch to generic kmap atomic +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +No reason having the same code in every architecture. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Michal Simek <monstr@monstr.eu> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + arch/microblaze/Kconfig | 1 + + arch/microblaze/include/asm/fixmap.h | 4 +- + arch/microblaze/include/asm/highmem.h | 6 ++- + arch/microblaze/mm/Makefile | 1 - + arch/microblaze/mm/highmem.c | 78 --------------------------- + arch/microblaze/mm/init.c | 6 --- + 6 files changed, 8 insertions(+), 88 deletions(-) + delete mode 100644 arch/microblaze/mm/highmem.c + +diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig +index 33925ffed68f..7f6ca0ab4f81 100644 +--- a/arch/microblaze/Kconfig ++++ b/arch/microblaze/Kconfig +@@ -155,6 +155,7 @@ config XILINX_UNCACHED_SHADOW + config HIGHMEM + bool "High memory support" + depends on MMU ++ select KMAP_LOCAL + help + The address space of Microblaze processors is only 4 Gigabytes large + and it has to accommodate user address space, kernel address +diff --git a/arch/microblaze/include/asm/fixmap.h b/arch/microblaze/include/asm/fixmap.h +index 0379ce5229e3..e6e9288bff76 100644 +--- a/arch/microblaze/include/asm/fixmap.h ++++ b/arch/microblaze/include/asm/fixmap.h +@@ -20,7 +20,7 @@ + #include <asm/page.h> + #ifdef CONFIG_HIGHMEM + #include <linux/threads.h> +-#include <asm/kmap_types.h> ++#include <asm/kmap_size.h> + #endif + + #define FIXADDR_TOP ((unsigned long)(-PAGE_SIZE)) +@@ -47,7 +47,7 @@ enum fixed_addresses { + FIX_HOLE, + #ifdef CONFIG_HIGHMEM + FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */ +- FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_TYPE_NR * num_possible_cpus()) - 1, ++ FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_MAX_IDX * num_possible_cpus()) - 1, + #endif + __end_of_fixed_addresses + }; +diff --git a/arch/microblaze/include/asm/highmem.h b/arch/microblaze/include/asm/highmem.h +index 284ca8fb54c1..4418633fb163 100644 +--- a/arch/microblaze/include/asm/highmem.h ++++ b/arch/microblaze/include/asm/highmem.h +@@ -25,7 +25,6 @@ + #include <linux/uaccess.h> + #include <asm/fixmap.h> + +-extern pte_t *kmap_pte; + extern pte_t *pkmap_page_table; + + /* +@@ -52,6 +51,11 @@ extern pte_t *pkmap_page_table; + + #define flush_cache_kmaps() { flush_icache(); flush_dcache(); } + ++#define arch_kmap_local_post_map(vaddr, pteval) \ ++ local_flush_tlb_page(NULL, vaddr); ++#define arch_kmap_local_post_unmap(vaddr) \ ++ local_flush_tlb_page(NULL, vaddr); ++ + #endif /* __KERNEL__ */ + + #endif /* _ASM_HIGHMEM_H */ +diff --git a/arch/microblaze/mm/Makefile b/arch/microblaze/mm/Makefile +index 1b16875cea70..8ced71100047 100644 +--- a/arch/microblaze/mm/Makefile ++++ b/arch/microblaze/mm/Makefile +@@ -6,4 +6,3 @@ + obj-y := consistent.o init.o + + obj-$(CONFIG_MMU) += pgtable.o mmu_context.o fault.o +-obj-$(CONFIG_HIGHMEM) += highmem.o +diff --git a/arch/microblaze/mm/highmem.c b/arch/microblaze/mm/highmem.c +deleted file mode 100644 +index 92e0890416c9..000000000000 +--- a/arch/microblaze/mm/highmem.c ++++ /dev/null +@@ -1,78 +0,0 @@ +-// SPDX-License-Identifier: GPL-2.0 +-/* +- * highmem.c: virtual kernel memory mappings for high memory +- * +- * PowerPC version, stolen from the i386 version. +- * +- * Used in CONFIG_HIGHMEM systems for memory pages which +- * are not addressable by direct kernel virtual addresses. +- * +- * Copyright (C) 1999 Gerhard Wichert, Siemens AG +- * Gerhard.Wichert@pdb.siemens.de +- * +- * +- * Redesigned the x86 32-bit VM architecture to deal with +- * up to 16 Terrabyte physical memory. With current x86 CPUs +- * we now support up to 64 Gigabytes physical RAM. +- * +- * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com> +- * +- * Reworked for PowerPC by various contributors. Moved from +- * highmem.h by Benjamin Herrenschmidt (c) 2009 IBM Corp. +- */ +- +-#include <linux/export.h> +-#include <linux/highmem.h> +- +-/* +- * The use of kmap_atomic/kunmap_atomic is discouraged - kmap/kunmap +- * gives a more generic (and caching) interface. But kmap_atomic can +- * be used in IRQ contexts, so in some (very limited) cases we need +- * it. +- */ +-#include <asm/tlbflush.h> +- +-void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) +-{ +- +- unsigned long vaddr; +- int idx, type; +- +- type = kmap_atomic_idx_push(); +- idx = type + KM_TYPE_NR*smp_processor_id(); +- vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); +-#ifdef CONFIG_DEBUG_HIGHMEM +- BUG_ON(!pte_none(*(kmap_pte-idx))); +-#endif +- set_pte_at(&init_mm, vaddr, kmap_pte-idx, mk_pte(page, prot)); +- local_flush_tlb_page(NULL, vaddr); +- +- return (void *) vaddr; +-} +-EXPORT_SYMBOL(kmap_atomic_high_prot); +- +-void kunmap_atomic_high(void *kvaddr) +-{ +- unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; +- int type; +- unsigned int idx; +- +- if (vaddr < __fix_to_virt(FIX_KMAP_END)) +- return; +- +- type = kmap_atomic_idx(); +- +- idx = type + KM_TYPE_NR * smp_processor_id(); +-#ifdef CONFIG_DEBUG_HIGHMEM +- BUG_ON(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); +-#endif +- /* +- * force other mappings to Oops if they'll try to access +- * this pte without first remap it +- */ +- pte_clear(&init_mm, vaddr, kmap_pte-idx); +- local_flush_tlb_page(NULL, vaddr); +- +- kmap_atomic_idx_pop(); +-} +-EXPORT_SYMBOL(kunmap_atomic_high); +diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c +index 45da639bd22c..1f4b5b34e600 100644 +--- a/arch/microblaze/mm/init.c ++++ b/arch/microblaze/mm/init.c +@@ -49,17 +49,11 @@ unsigned long lowmem_size; + EXPORT_SYMBOL(min_low_pfn); + EXPORT_SYMBOL(max_low_pfn); + +-#ifdef CONFIG_HIGHMEM +-pte_t *kmap_pte; +-EXPORT_SYMBOL(kmap_pte); +- + static void __init highmem_init(void) + { + pr_debug("%x\n", (u32)PKMAP_BASE); + map_page(PKMAP_BASE, 0, 0); /* XXX gross */ + pkmap_page_table = virt_to_kpte(PKMAP_BASE); +- +- kmap_pte = virt_to_kpte(__fix_to_virt(FIX_KMAP_BEGIN)); + } + + static void highmem_setup(void) +-- +2.43.0 + diff --git a/debian/patches-rt/0036-mips-mm-highmem-Switch-to-generic-kmap-atomic.patch b/debian/patches-rt/0036-mips-mm-highmem-Switch-to-generic-kmap-atomic.patch new file mode 100644 index 000000000..998662451 --- /dev/null +++ b/debian/patches-rt/0036-mips-mm-highmem-Switch-to-generic-kmap-atomic.patch @@ -0,0 +1,219 @@ +From 5ff7160041ca1461c70a70237f3c6a4b09d54472 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Tue, 3 Nov 2020 10:27:25 +0100 +Subject: [PATCH 036/323] mips/mm/highmem: Switch to generic kmap atomic +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +No reason having the same code in every architecture + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de> +Cc: linux-mips@vger.kernel.org +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + arch/mips/Kconfig | 1 + + arch/mips/include/asm/fixmap.h | 4 +- + arch/mips/include/asm/highmem.h | 6 +-- + arch/mips/include/asm/kmap_types.h | 13 ----- + arch/mips/mm/highmem.c | 77 ------------------------------ + arch/mips/mm/init.c | 4 -- + 6 files changed, 6 insertions(+), 99 deletions(-) + delete mode 100644 arch/mips/include/asm/kmap_types.h + +diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig +index 18ebacf29889..0e02b2e6ee06 100644 +--- a/arch/mips/Kconfig ++++ b/arch/mips/Kconfig +@@ -2733,6 +2733,7 @@ config WAR_MIPS34K_MISSED_ITLB + config HIGHMEM + bool "High Memory Support" + depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM && !CPU_MIPS32_3_5_EVA ++ select KMAP_LOCAL + + config CPU_SUPPORTS_HIGHMEM + bool +diff --git a/arch/mips/include/asm/fixmap.h b/arch/mips/include/asm/fixmap.h +index 743535be7528..beea14761cef 100644 +--- a/arch/mips/include/asm/fixmap.h ++++ b/arch/mips/include/asm/fixmap.h +@@ -17,7 +17,7 @@ + #include <spaces.h> + #ifdef CONFIG_HIGHMEM + #include <linux/threads.h> +-#include <asm/kmap_types.h> ++#include <asm/kmap_size.h> + #endif + + /* +@@ -52,7 +52,7 @@ enum fixed_addresses { + #ifdef CONFIG_HIGHMEM + /* reserved pte's for temporary kernel mappings */ + FIX_KMAP_BEGIN = FIX_CMAP_END + 1, +- FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1, ++ FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_MAX_IDX * NR_CPUS) - 1, + #endif + __end_of_fixed_addresses + }; +diff --git a/arch/mips/include/asm/highmem.h b/arch/mips/include/asm/highmem.h +index 9f021cf51aa7..1716181ea66d 100644 +--- a/arch/mips/include/asm/highmem.h ++++ b/arch/mips/include/asm/highmem.h +@@ -24,7 +24,7 @@ + #include <linux/interrupt.h> + #include <linux/uaccess.h> + #include <asm/cpu-features.h> +-#include <asm/kmap_types.h> ++#include <asm/kmap_size.h> + + /* declarations for highmem.c */ + extern unsigned long highstart_pfn, highend_pfn; +@@ -48,11 +48,11 @@ extern pte_t *pkmap_page_table; + + #define ARCH_HAS_KMAP_FLUSH_TLB + extern void kmap_flush_tlb(unsigned long addr); +-extern void *kmap_atomic_pfn(unsigned long pfn); + + #define flush_cache_kmaps() BUG_ON(cpu_has_dc_aliases) + +-extern void kmap_init(void); ++#define arch_kmap_local_post_map(vaddr, pteval) local_flush_tlb_one(vaddr) ++#define arch_kmap_local_post_unmap(vaddr) local_flush_tlb_one(vaddr) + + #endif /* __KERNEL__ */ + +diff --git a/arch/mips/include/asm/kmap_types.h b/arch/mips/include/asm/kmap_types.h +deleted file mode 100644 +index 16665dc2431b..000000000000 +--- a/arch/mips/include/asm/kmap_types.h ++++ /dev/null +@@ -1,13 +0,0 @@ +-/* SPDX-License-Identifier: GPL-2.0 */ +-#ifndef _ASM_KMAP_TYPES_H +-#define _ASM_KMAP_TYPES_H +- +-#ifdef CONFIG_DEBUG_HIGHMEM +-#define __WITH_KM_FENCE +-#endif +- +-#include <asm-generic/kmap_types.h> +- +-#undef __WITH_KM_FENCE +- +-#endif +diff --git a/arch/mips/mm/highmem.c b/arch/mips/mm/highmem.c +index 5fec7f45d79a..57e2f08f00d0 100644 +--- a/arch/mips/mm/highmem.c ++++ b/arch/mips/mm/highmem.c +@@ -8,8 +8,6 @@ + #include <asm/fixmap.h> + #include <asm/tlbflush.h> + +-static pte_t *kmap_pte; +- + unsigned long highstart_pfn, highend_pfn; + + void kmap_flush_tlb(unsigned long addr) +@@ -17,78 +15,3 @@ void kmap_flush_tlb(unsigned long addr) + flush_tlb_one(addr); + } + EXPORT_SYMBOL(kmap_flush_tlb); +- +-void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) +-{ +- unsigned long vaddr; +- int idx, type; +- +- type = kmap_atomic_idx_push(); +- idx = type + KM_TYPE_NR*smp_processor_id(); +- vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); +-#ifdef CONFIG_DEBUG_HIGHMEM +- BUG_ON(!pte_none(*(kmap_pte - idx))); +-#endif +- set_pte(kmap_pte-idx, mk_pte(page, prot)); +- local_flush_tlb_one((unsigned long)vaddr); +- +- return (void*) vaddr; +-} +-EXPORT_SYMBOL(kmap_atomic_high_prot); +- +-void kunmap_atomic_high(void *kvaddr) +-{ +- unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; +- int type __maybe_unused; +- +- if (vaddr < FIXADDR_START) +- return; +- +- type = kmap_atomic_idx(); +-#ifdef CONFIG_DEBUG_HIGHMEM +- { +- int idx = type + KM_TYPE_NR * smp_processor_id(); +- +- BUG_ON(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); +- +- /* +- * force other mappings to Oops if they'll try to access +- * this pte without first remap it +- */ +- pte_clear(&init_mm, vaddr, kmap_pte-idx); +- local_flush_tlb_one(vaddr); +- } +-#endif +- kmap_atomic_idx_pop(); +-} +-EXPORT_SYMBOL(kunmap_atomic_high); +- +-/* +- * This is the same as kmap_atomic() but can map memory that doesn't +- * have a struct page associated with it. +- */ +-void *kmap_atomic_pfn(unsigned long pfn) +-{ +- unsigned long vaddr; +- int idx, type; +- +- preempt_disable(); +- pagefault_disable(); +- +- type = kmap_atomic_idx_push(); +- idx = type + KM_TYPE_NR*smp_processor_id(); +- vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); +- set_pte(kmap_pte-idx, pfn_pte(pfn, PAGE_KERNEL)); +- flush_tlb_one(vaddr); +- +- return (void*) vaddr; +-} +- +-void __init kmap_init(void) +-{ +- unsigned long kmap_vstart; +- +- /* cache the first kmap pte */ +- kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN); +- kmap_pte = virt_to_kpte(kmap_vstart); +-} +diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c +index 07e84a774938..bc80893e5c0f 100644 +--- a/arch/mips/mm/init.c ++++ b/arch/mips/mm/init.c +@@ -36,7 +36,6 @@ + #include <asm/cachectl.h> + #include <asm/cpu.h> + #include <asm/dma.h> +-#include <asm/kmap_types.h> + #include <asm/maar.h> + #include <asm/mmu_context.h> + #include <asm/sections.h> +@@ -402,9 +401,6 @@ void __init paging_init(void) + + pagetable_init(); + +-#ifdef CONFIG_HIGHMEM +- kmap_init(); +-#endif + #ifdef CONFIG_ZONE_DMA + max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; + #endif +-- +2.43.0 + diff --git a/debian/patches-rt/0037-nds32-mm-highmem-Switch-to-generic-kmap-atomic.patch b/debian/patches-rt/0037-nds32-mm-highmem-Switch-to-generic-kmap-atomic.patch new file mode 100644 index 000000000..08d1dc198 --- /dev/null +++ b/debian/patches-rt/0037-nds32-mm-highmem-Switch-to-generic-kmap-atomic.patch @@ -0,0 +1,167 @@ +From bc36ab505c69bbf7d37b57cb983ca97b198c44df Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Tue, 3 Nov 2020 10:27:26 +0100 +Subject: [PATCH 037/323] nds32/mm/highmem: Switch to generic kmap atomic +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +The mapping code is odd and looks broken. See FIXME in the comment. + +Also fix the harmless off by one in the FIX_KMAP_END define. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Nick Hu <nickhu@andestech.com> +Cc: Greentime Hu <green.hu@gmail.com> +Cc: Vincent Chen <deanbo422@gmail.com> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + arch/nds32/Kconfig.cpu | 1 + + arch/nds32/include/asm/fixmap.h | 4 +-- + arch/nds32/include/asm/highmem.h | 22 +++++++++++---- + arch/nds32/mm/Makefile | 1 - + arch/nds32/mm/highmem.c | 48 -------------------------------- + 5 files changed, 19 insertions(+), 57 deletions(-) + delete mode 100644 arch/nds32/mm/highmem.c + +diff --git a/arch/nds32/Kconfig.cpu b/arch/nds32/Kconfig.cpu +index f88a12fdf0f3..c10759952485 100644 +--- a/arch/nds32/Kconfig.cpu ++++ b/arch/nds32/Kconfig.cpu +@@ -157,6 +157,7 @@ config HW_SUPPORT_UNALIGNMENT_ACCESS + config HIGHMEM + bool "High Memory Support" + depends on MMU && !CPU_CACHE_ALIASING ++ select KMAP_LOCAL + help + The address space of Andes processors is only 4 Gigabytes large + and it has to accommodate user address space, kernel address +diff --git a/arch/nds32/include/asm/fixmap.h b/arch/nds32/include/asm/fixmap.h +index 5a4bf11e5800..2fa09a2de428 100644 +--- a/arch/nds32/include/asm/fixmap.h ++++ b/arch/nds32/include/asm/fixmap.h +@@ -6,7 +6,7 @@ + + #ifdef CONFIG_HIGHMEM + #include <linux/threads.h> +-#include <asm/kmap_types.h> ++#include <asm/kmap_size.h> + #endif + + enum fixed_addresses { +@@ -14,7 +14,7 @@ enum fixed_addresses { + FIX_KMAP_RESERVED, + FIX_KMAP_BEGIN, + #ifdef CONFIG_HIGHMEM +- FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_TYPE_NR * NR_CPUS), ++ FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_MAX_IDX * NR_CPUS) - 1, + #endif + FIX_EARLYCON_MEM_BASE, + __end_of_fixed_addresses +diff --git a/arch/nds32/include/asm/highmem.h b/arch/nds32/include/asm/highmem.h +index fe986d0e6e3f..16159a8716f2 100644 +--- a/arch/nds32/include/asm/highmem.h ++++ b/arch/nds32/include/asm/highmem.h +@@ -5,7 +5,6 @@ + #define _ASM_HIGHMEM_H + + #include <asm/proc-fns.h> +-#include <asm/kmap_types.h> + #include <asm/fixmap.h> + + /* +@@ -45,11 +44,22 @@ extern pte_t *pkmap_page_table; + extern void kmap_init(void); + + /* +- * The following functions are already defined by <linux/highmem.h> +- * when CONFIG_HIGHMEM is not set. ++ * FIXME: The below looks broken vs. a kmap_atomic() in task context which ++ * is interupted and another kmap_atomic() happens in interrupt context. ++ * But what do I know about nds32. -- tglx + */ +-#ifdef CONFIG_HIGHMEM +-extern void *kmap_atomic_pfn(unsigned long pfn); +-#endif ++#define arch_kmap_local_post_map(vaddr, pteval) \ ++ do { \ ++ __nds32__tlbop_inv(vaddr); \ ++ __nds32__mtsr_dsb(vaddr, NDS32_SR_TLB_VPN); \ ++ __nds32__tlbop_rwr(pteval); \ ++ __nds32__isb(); \ ++ } while (0) ++ ++#define arch_kmap_local_pre_unmap(vaddr) \ ++ do { \ ++ __nds32__tlbop_inv(vaddr); \ ++ __nds32__isb(); \ ++ } while (0) + + #endif +diff --git a/arch/nds32/mm/Makefile b/arch/nds32/mm/Makefile +index 897ecaf5cf54..14fb2e8eb036 100644 +--- a/arch/nds32/mm/Makefile ++++ b/arch/nds32/mm/Makefile +@@ -3,7 +3,6 @@ obj-y := extable.o tlb.o fault.o init.o mmap.o \ + mm-nds32.o cacheflush.o proc.o + + obj-$(CONFIG_ALIGNMENT_TRAP) += alignment.o +-obj-$(CONFIG_HIGHMEM) += highmem.o + + ifdef CONFIG_FUNCTION_TRACER + CFLAGS_REMOVE_proc.o = $(CC_FLAGS_FTRACE) +diff --git a/arch/nds32/mm/highmem.c b/arch/nds32/mm/highmem.c +deleted file mode 100644 +index 4284cd59e21a..000000000000 +--- a/arch/nds32/mm/highmem.c ++++ /dev/null +@@ -1,48 +0,0 @@ +-// SPDX-License-Identifier: GPL-2.0 +-// Copyright (C) 2005-2017 Andes Technology Corporation +- +-#include <linux/export.h> +-#include <linux/highmem.h> +-#include <linux/sched.h> +-#include <linux/smp.h> +-#include <linux/interrupt.h> +-#include <linux/memblock.h> +-#include <asm/fixmap.h> +-#include <asm/tlbflush.h> +- +-void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) +-{ +- unsigned int idx; +- unsigned long vaddr, pte; +- int type; +- pte_t *ptep; +- +- type = kmap_atomic_idx_push(); +- +- idx = type + KM_TYPE_NR * smp_processor_id(); +- vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); +- pte = (page_to_pfn(page) << PAGE_SHIFT) | prot; +- ptep = pte_offset_kernel(pmd_off_k(vaddr), vaddr); +- set_pte(ptep, pte); +- +- __nds32__tlbop_inv(vaddr); +- __nds32__mtsr_dsb(vaddr, NDS32_SR_TLB_VPN); +- __nds32__tlbop_rwr(pte); +- __nds32__isb(); +- return (void *)vaddr; +-} +-EXPORT_SYMBOL(kmap_atomic_high_prot); +- +-void kunmap_atomic_high(void *kvaddr) +-{ +- if (kvaddr >= (void *)FIXADDR_START) { +- unsigned long vaddr = (unsigned long)kvaddr; +- pte_t *ptep; +- kmap_atomic_idx_pop(); +- __nds32__tlbop_inv(vaddr); +- __nds32__isb(); +- ptep = pte_offset_kernel(pmd_off_k(vaddr), vaddr); +- set_pte(ptep, 0); +- } +-} +-EXPORT_SYMBOL(kunmap_atomic_high); +-- +2.43.0 + diff --git a/debian/patches-rt/0038-powerpc-mm-highmem-Switch-to-generic-kmap-atomic.patch b/debian/patches-rt/0038-powerpc-mm-highmem-Switch-to-generic-kmap-atomic.patch new file mode 100644 index 000000000..92e962c1f --- /dev/null +++ b/debian/patches-rt/0038-powerpc-mm-highmem-Switch-to-generic-kmap-atomic.patch @@ -0,0 +1,221 @@ +From 6d7d749a6b27ada63e8be5d1903d336538b40702 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Tue, 3 Nov 2020 10:27:27 +0100 +Subject: [PATCH 038/323] powerpc/mm/highmem: Switch to generic kmap atomic +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +No reason having the same code in every architecture + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Michael Ellerman <mpe@ellerman.id.au> +Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> +Cc: Paul Mackerras <paulus@samba.org> +Cc: linuxppc-dev@lists.ozlabs.org +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + arch/powerpc/Kconfig | 1 + + arch/powerpc/include/asm/fixmap.h | 4 +- + arch/powerpc/include/asm/highmem.h | 7 ++- + arch/powerpc/include/asm/kmap_types.h | 13 ------ + arch/powerpc/mm/Makefile | 1 - + arch/powerpc/mm/highmem.c | 67 --------------------------- + arch/powerpc/mm/mem.c | 7 --- + 7 files changed, 8 insertions(+), 92 deletions(-) + delete mode 100644 arch/powerpc/include/asm/kmap_types.h + delete mode 100644 arch/powerpc/mm/highmem.c + +diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig +index 78dd6be8b31d..b3ab6c2d9f66 100644 +--- a/arch/powerpc/Kconfig ++++ b/arch/powerpc/Kconfig +@@ -410,6 +410,7 @@ menu "Kernel options" + config HIGHMEM + bool "High memory support" + depends on PPC32 ++ select KMAP_LOCAL + + source "kernel/Kconfig.hz" + +diff --git a/arch/powerpc/include/asm/fixmap.h b/arch/powerpc/include/asm/fixmap.h +index 897cc68758d4..a832aeafe560 100644 +--- a/arch/powerpc/include/asm/fixmap.h ++++ b/arch/powerpc/include/asm/fixmap.h +@@ -20,7 +20,7 @@ + #include <asm/page.h> + #ifdef CONFIG_HIGHMEM + #include <linux/threads.h> +-#include <asm/kmap_types.h> ++#include <asm/kmap_size.h> + #endif + + #ifdef CONFIG_PPC64 +@@ -61,7 +61,7 @@ enum fixed_addresses { + FIX_EARLY_DEBUG_BASE = FIX_EARLY_DEBUG_TOP+(ALIGN(SZ_128K, PAGE_SIZE)/PAGE_SIZE)-1, + #ifdef CONFIG_HIGHMEM + FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */ +- FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1, ++ FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_MAX_IDX * NR_CPUS) - 1, + #endif + #ifdef CONFIG_PPC_8xx + /* For IMMR we need an aligned 512K area */ +diff --git a/arch/powerpc/include/asm/highmem.h b/arch/powerpc/include/asm/highmem.h +index 104026f7d6bc..80a5ae771c65 100644 +--- a/arch/powerpc/include/asm/highmem.h ++++ b/arch/powerpc/include/asm/highmem.h +@@ -24,12 +24,10 @@ + #ifdef __KERNEL__ + + #include <linux/interrupt.h> +-#include <asm/kmap_types.h> + #include <asm/cacheflush.h> + #include <asm/page.h> + #include <asm/fixmap.h> + +-extern pte_t *kmap_pte; + extern pte_t *pkmap_page_table; + + /* +@@ -60,6 +58,11 @@ extern pte_t *pkmap_page_table; + + #define flush_cache_kmaps() flush_cache_all() + ++#define arch_kmap_local_post_map(vaddr, pteval) \ ++ local_flush_tlb_page(NULL, vaddr) ++#define arch_kmap_local_post_unmap(vaddr) \ ++ local_flush_tlb_page(NULL, vaddr) ++ + #endif /* __KERNEL__ */ + + #endif /* _ASM_HIGHMEM_H */ +diff --git a/arch/powerpc/include/asm/kmap_types.h b/arch/powerpc/include/asm/kmap_types.h +deleted file mode 100644 +index c8fa182d48c8..000000000000 +--- a/arch/powerpc/include/asm/kmap_types.h ++++ /dev/null +@@ -1,13 +0,0 @@ +-/* SPDX-License-Identifier: GPL-2.0-or-later */ +-#ifndef _ASM_POWERPC_KMAP_TYPES_H +-#define _ASM_POWERPC_KMAP_TYPES_H +- +-#ifdef __KERNEL__ +- +-/* +- */ +- +-#define KM_TYPE_NR 16 +- +-#endif /* __KERNEL__ */ +-#endif /* _ASM_POWERPC_KMAP_TYPES_H */ +diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile +index 55b4a8bd408a..3b4e9e4e25ea 100644 +--- a/arch/powerpc/mm/Makefile ++++ b/arch/powerpc/mm/Makefile +@@ -16,7 +16,6 @@ obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o + obj-$(CONFIG_PPC_MM_SLICES) += slice.o + obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o + obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o +-obj-$(CONFIG_HIGHMEM) += highmem.o + obj-$(CONFIG_PPC_COPRO_BASE) += copro_fault.o + obj-$(CONFIG_PPC_PTDUMP) += ptdump/ + obj-$(CONFIG_KASAN) += kasan/ +diff --git a/arch/powerpc/mm/highmem.c b/arch/powerpc/mm/highmem.c +deleted file mode 100644 +index 624b4438aff9..000000000000 +--- a/arch/powerpc/mm/highmem.c ++++ /dev/null +@@ -1,67 +0,0 @@ +-// SPDX-License-Identifier: GPL-2.0 +-/* +- * highmem.c: virtual kernel memory mappings for high memory +- * +- * PowerPC version, stolen from the i386 version. +- * +- * Used in CONFIG_HIGHMEM systems for memory pages which +- * are not addressable by direct kernel virtual addresses. +- * +- * Copyright (C) 1999 Gerhard Wichert, Siemens AG +- * Gerhard.Wichert@pdb.siemens.de +- * +- * +- * Redesigned the x86 32-bit VM architecture to deal with +- * up to 16 Terrabyte physical memory. With current x86 CPUs +- * we now support up to 64 Gigabytes physical RAM. +- * +- * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com> +- * +- * Reworked for PowerPC by various contributors. Moved from +- * highmem.h by Benjamin Herrenschmidt (c) 2009 IBM Corp. +- */ +- +-#include <linux/highmem.h> +-#include <linux/module.h> +- +-void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) +-{ +- unsigned long vaddr; +- int idx, type; +- +- type = kmap_atomic_idx_push(); +- idx = type + KM_TYPE_NR*smp_processor_id(); +- vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); +- WARN_ON(IS_ENABLED(CONFIG_DEBUG_HIGHMEM) && !pte_none(*(kmap_pte - idx))); +- __set_pte_at(&init_mm, vaddr, kmap_pte-idx, mk_pte(page, prot), 1); +- local_flush_tlb_page(NULL, vaddr); +- +- return (void*) vaddr; +-} +-EXPORT_SYMBOL(kmap_atomic_high_prot); +- +-void kunmap_atomic_high(void *kvaddr) +-{ +- unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; +- +- if (vaddr < __fix_to_virt(FIX_KMAP_END)) +- return; +- +- if (IS_ENABLED(CONFIG_DEBUG_HIGHMEM)) { +- int type = kmap_atomic_idx(); +- unsigned int idx; +- +- idx = type + KM_TYPE_NR * smp_processor_id(); +- WARN_ON(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); +- +- /* +- * force other mappings to Oops if they'll try to access +- * this pte without first remap it +- */ +- pte_clear(&init_mm, vaddr, kmap_pte-idx); +- local_flush_tlb_page(NULL, vaddr); +- } +- +- kmap_atomic_idx_pop(); +-} +-EXPORT_SYMBOL(kunmap_atomic_high); +diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c +index 08e3422eb792..1faa0789871d 100644 +--- a/arch/powerpc/mm/mem.c ++++ b/arch/powerpc/mm/mem.c +@@ -63,11 +63,6 @@ + unsigned long long memory_limit; + bool init_mem_is_free; + +-#ifdef CONFIG_HIGHMEM +-pte_t *kmap_pte; +-EXPORT_SYMBOL(kmap_pte); +-#endif +- + pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, + unsigned long size, pgprot_t vma_prot) + { +@@ -237,8 +232,6 @@ void __init paging_init(void) + + map_kernel_page(PKMAP_BASE, 0, __pgprot(0)); /* XXX gross */ + pkmap_page_table = virt_to_kpte(PKMAP_BASE); +- +- kmap_pte = virt_to_kpte(__fix_to_virt(FIX_KMAP_BEGIN)); + #endif /* CONFIG_HIGHMEM */ + + printk(KERN_DEBUG "Top of RAM: 0x%llx, Total RAM: 0x%llx\n", +-- +2.43.0 + diff --git a/debian/patches-rt/0039-sparc-mm-highmem-Switch-to-generic-kmap-atomic.patch b/debian/patches-rt/0039-sparc-mm-highmem-Switch-to-generic-kmap-atomic.patch new file mode 100644 index 000000000..5ac60d218 --- /dev/null +++ b/debian/patches-rt/0039-sparc-mm-highmem-Switch-to-generic-kmap-atomic.patch @@ -0,0 +1,254 @@ +From 859a29e0206dcd137ea0db376a846246f1071bfb Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Tue, 3 Nov 2020 10:27:28 +0100 +Subject: [PATCH 039/323] sparc/mm/highmem: Switch to generic kmap atomic +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +No reason having the same code in every architecture + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: "David S. Miller" <davem@davemloft.net> +Cc: sparclinux@vger.kernel.org +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + arch/sparc/Kconfig | 1 + + arch/sparc/include/asm/highmem.h | 8 +- + arch/sparc/include/asm/kmap_types.h | 11 --- + arch/sparc/include/asm/vaddrs.h | 4 +- + arch/sparc/mm/Makefile | 3 - + arch/sparc/mm/highmem.c | 115 ---------------------------- + arch/sparc/mm/srmmu.c | 2 - + 7 files changed, 8 insertions(+), 136 deletions(-) + delete mode 100644 arch/sparc/include/asm/kmap_types.h + delete mode 100644 arch/sparc/mm/highmem.c + +diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig +index 7e2ce4c8d657..1a6e4b187861 100644 +--- a/arch/sparc/Kconfig ++++ b/arch/sparc/Kconfig +@@ -140,6 +140,7 @@ config MMU + config HIGHMEM + bool + default y if SPARC32 ++ select KMAP_LOCAL + + config ZONE_DMA + bool +diff --git a/arch/sparc/include/asm/highmem.h b/arch/sparc/include/asm/highmem.h +index 6c35f0d27ee1..875116209ec1 100644 +--- a/arch/sparc/include/asm/highmem.h ++++ b/arch/sparc/include/asm/highmem.h +@@ -24,7 +24,6 @@ + #include <linux/interrupt.h> + #include <linux/pgtable.h> + #include <asm/vaddrs.h> +-#include <asm/kmap_types.h> + #include <asm/pgtsrmmu.h> + + /* declarations for highmem.c */ +@@ -33,8 +32,6 @@ extern unsigned long highstart_pfn, highend_pfn; + #define kmap_prot __pgprot(SRMMU_ET_PTE | SRMMU_PRIV | SRMMU_CACHE) + extern pte_t *pkmap_page_table; + +-void kmap_init(void) __init; +- + /* + * Right now we initialize only a single pte table. It can be extended + * easily, subsequent pte tables have to be allocated in one physical +@@ -53,6 +50,11 @@ void kmap_init(void) __init; + + #define flush_cache_kmaps() flush_cache_all() + ++/* FIXME: Use __flush_tlb_one(vaddr) instead of flush_cache_all() -- Anton */ ++#define arch_kmap_local_post_map(vaddr, pteval) flush_cache_all() ++#define arch_kmap_local_post_unmap(vaddr) flush_cache_all() ++ ++ + #endif /* __KERNEL__ */ + + #endif /* _ASM_HIGHMEM_H */ +diff --git a/arch/sparc/include/asm/kmap_types.h b/arch/sparc/include/asm/kmap_types.h +deleted file mode 100644 +index 55a99b6bd91e..000000000000 +--- a/arch/sparc/include/asm/kmap_types.h ++++ /dev/null +@@ -1,11 +0,0 @@ +-/* SPDX-License-Identifier: GPL-2.0 */ +-#ifndef _ASM_KMAP_TYPES_H +-#define _ASM_KMAP_TYPES_H +- +-/* Dummy header just to define km_type. None of this +- * is actually used on sparc. -DaveM +- */ +- +-#include <asm-generic/kmap_types.h> +- +-#endif +diff --git a/arch/sparc/include/asm/vaddrs.h b/arch/sparc/include/asm/vaddrs.h +index 84d054b07a6f..4fec0341e2a8 100644 +--- a/arch/sparc/include/asm/vaddrs.h ++++ b/arch/sparc/include/asm/vaddrs.h +@@ -32,13 +32,13 @@ + #define SRMMU_NOCACHE_ALCRATIO 64 /* 256 pages per 64MB of system RAM */ + + #ifndef __ASSEMBLY__ +-#include <asm/kmap_types.h> ++#include <asm/kmap_size.h> + + enum fixed_addresses { + FIX_HOLE, + #ifdef CONFIG_HIGHMEM + FIX_KMAP_BEGIN, +- FIX_KMAP_END = (KM_TYPE_NR * NR_CPUS), ++ FIX_KMAP_END = (KM_MAX_IDX * NR_CPUS), + #endif + __end_of_fixed_addresses + }; +diff --git a/arch/sparc/mm/Makefile b/arch/sparc/mm/Makefile +index b078205b70e0..68db1f859b02 100644 +--- a/arch/sparc/mm/Makefile ++++ b/arch/sparc/mm/Makefile +@@ -15,6 +15,3 @@ obj-$(CONFIG_SPARC32) += leon_mm.o + + # Only used by sparc64 + obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o +- +-# Only used by sparc32 +-obj-$(CONFIG_HIGHMEM) += highmem.o +diff --git a/arch/sparc/mm/highmem.c b/arch/sparc/mm/highmem.c +deleted file mode 100644 +index 8f2a2afb048a..000000000000 +--- a/arch/sparc/mm/highmem.c ++++ /dev/null +@@ -1,115 +0,0 @@ +-// SPDX-License-Identifier: GPL-2.0 +-/* +- * highmem.c: virtual kernel memory mappings for high memory +- * +- * Provides kernel-static versions of atomic kmap functions originally +- * found as inlines in include/asm-sparc/highmem.h. These became +- * needed as kmap_atomic() and kunmap_atomic() started getting +- * called from within modules. +- * -- Tomas Szepe <szepe@pinerecords.com>, September 2002 +- * +- * But kmap_atomic() and kunmap_atomic() cannot be inlined in +- * modules because they are loaded with btfixup-ped functions. +- */ +- +-/* +- * The use of kmap_atomic/kunmap_atomic is discouraged - kmap/kunmap +- * gives a more generic (and caching) interface. But kmap_atomic can +- * be used in IRQ contexts, so in some (very limited) cases we need it. +- * +- * XXX This is an old text. Actually, it's good to use atomic kmaps, +- * provided you remember that they are atomic and not try to sleep +- * with a kmap taken, much like a spinlock. Non-atomic kmaps are +- * shared by CPUs, and so precious, and establishing them requires IPI. +- * Atomic kmaps are lightweight and we may have NCPUS more of them. +- */ +-#include <linux/highmem.h> +-#include <linux/export.h> +-#include <linux/mm.h> +- +-#include <asm/cacheflush.h> +-#include <asm/tlbflush.h> +-#include <asm/vaddrs.h> +- +-static pte_t *kmap_pte; +- +-void __init kmap_init(void) +-{ +- unsigned long address = __fix_to_virt(FIX_KMAP_BEGIN); +- +- /* cache the first kmap pte */ +- kmap_pte = virt_to_kpte(address); +-} +- +-void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) +-{ +- unsigned long vaddr; +- long idx, type; +- +- type = kmap_atomic_idx_push(); +- idx = type + KM_TYPE_NR*smp_processor_id(); +- vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); +- +-/* XXX Fix - Anton */ +-#if 0 +- __flush_cache_one(vaddr); +-#else +- flush_cache_all(); +-#endif +- +-#ifdef CONFIG_DEBUG_HIGHMEM +- BUG_ON(!pte_none(*(kmap_pte-idx))); +-#endif +- set_pte(kmap_pte-idx, mk_pte(page, prot)); +-/* XXX Fix - Anton */ +-#if 0 +- __flush_tlb_one(vaddr); +-#else +- flush_tlb_all(); +-#endif +- +- return (void*) vaddr; +-} +-EXPORT_SYMBOL(kmap_atomic_high_prot); +- +-void kunmap_atomic_high(void *kvaddr) +-{ +- unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; +- int type; +- +- if (vaddr < FIXADDR_START) +- return; +- +- type = kmap_atomic_idx(); +- +-#ifdef CONFIG_DEBUG_HIGHMEM +- { +- unsigned long idx; +- +- idx = type + KM_TYPE_NR * smp_processor_id(); +- BUG_ON(vaddr != __fix_to_virt(FIX_KMAP_BEGIN+idx)); +- +- /* XXX Fix - Anton */ +-#if 0 +- __flush_cache_one(vaddr); +-#else +- flush_cache_all(); +-#endif +- +- /* +- * force other mappings to Oops if they'll try to access +- * this pte without first remap it +- */ +- pte_clear(&init_mm, vaddr, kmap_pte-idx); +- /* XXX Fix - Anton */ +-#if 0 +- __flush_tlb_one(vaddr); +-#else +- flush_tlb_all(); +-#endif +- } +-#endif +- +- kmap_atomic_idx_pop(); +-} +-EXPORT_SYMBOL(kunmap_atomic_high); +diff --git a/arch/sparc/mm/srmmu.c b/arch/sparc/mm/srmmu.c +index 0070f8b9a753..a03caa5f6628 100644 +--- a/arch/sparc/mm/srmmu.c ++++ b/arch/sparc/mm/srmmu.c +@@ -971,8 +971,6 @@ void __init srmmu_paging_init(void) + + sparc_context_init(num_contexts); + +- kmap_init(); +- + { + unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; + +-- +2.43.0 + diff --git a/debian/patches-rt/0040-xtensa-mm-highmem-Switch-to-generic-kmap-atomic.patch b/debian/patches-rt/0040-xtensa-mm-highmem-Switch-to-generic-kmap-atomic.patch new file mode 100644 index 000000000..c17ef447f --- /dev/null +++ b/debian/patches-rt/0040-xtensa-mm-highmem-Switch-to-generic-kmap-atomic.patch @@ -0,0 +1,166 @@ +From 716b526a73eff1c6b44a898c5e66ee2569f7280d Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Tue, 3 Nov 2020 10:27:29 +0100 +Subject: [PATCH 040/323] xtensa/mm/highmem: Switch to generic kmap atomic +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +No reason having the same code in every architecture + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Chris Zankel <chris@zankel.net> +Cc: Max Filippov <jcmvbkbc@gmail.com> +Cc: linux-xtensa@linux-xtensa.org +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + arch/xtensa/Kconfig | 1 + + arch/xtensa/include/asm/fixmap.h | 4 +-- + arch/xtensa/include/asm/highmem.h | 12 ++++++-- + arch/xtensa/mm/highmem.c | 46 ++++--------------------------- + 4 files changed, 18 insertions(+), 45 deletions(-) + +diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig +index 87e08ad38ea7..03cbf6b53622 100644 +--- a/arch/xtensa/Kconfig ++++ b/arch/xtensa/Kconfig +@@ -666,6 +666,7 @@ endchoice + config HIGHMEM + bool "High Memory Support" + depends on MMU ++ select KMAP_LOCAL + help + Linux can use the full amount of RAM in the system by + default. However, the default MMUv2 setup only maps the +diff --git a/arch/xtensa/include/asm/fixmap.h b/arch/xtensa/include/asm/fixmap.h +index a06ffb0c61c7..92049b61c351 100644 +--- a/arch/xtensa/include/asm/fixmap.h ++++ b/arch/xtensa/include/asm/fixmap.h +@@ -16,7 +16,7 @@ + #ifdef CONFIG_HIGHMEM + #include <linux/threads.h> + #include <linux/pgtable.h> +-#include <asm/kmap_types.h> ++#include <asm/kmap_size.h> + #endif + + /* +@@ -39,7 +39,7 @@ enum fixed_addresses { + /* reserved pte's for temporary kernel mappings */ + FIX_KMAP_BEGIN, + FIX_KMAP_END = FIX_KMAP_BEGIN + +- (KM_TYPE_NR * NR_CPUS * DCACHE_N_COLORS) - 1, ++ (KM_MAX_IDX * NR_CPUS * DCACHE_N_COLORS) - 1, + #endif + __end_of_fixed_addresses + }; +diff --git a/arch/xtensa/include/asm/highmem.h b/arch/xtensa/include/asm/highmem.h +index eac503215f17..0fc3b1cebc56 100644 +--- a/arch/xtensa/include/asm/highmem.h ++++ b/arch/xtensa/include/asm/highmem.h +@@ -16,9 +16,8 @@ + #include <linux/pgtable.h> + #include <asm/cacheflush.h> + #include <asm/fixmap.h> +-#include <asm/kmap_types.h> + +-#define PKMAP_BASE ((FIXADDR_START - \ ++#define PKMAP_BASE ((FIXADDR_START - \ + (LAST_PKMAP + 1) * PAGE_SIZE) & PMD_MASK) + #define LAST_PKMAP (PTRS_PER_PTE * DCACHE_N_COLORS) + #define LAST_PKMAP_MASK (LAST_PKMAP - 1) +@@ -68,6 +67,15 @@ static inline void flush_cache_kmaps(void) + flush_cache_all(); + } + ++enum fixed_addresses kmap_local_map_idx(int type, unsigned long pfn); ++#define arch_kmap_local_map_idx kmap_local_map_idx ++ ++enum fixed_addresses kmap_local_unmap_idx(int type, unsigned long addr); ++#define arch_kmap_local_unmap_idx kmap_local_unmap_idx ++ ++#define arch_kmap_local_post_unmap(vaddr) \ ++ local_flush_tlb_kernel_range(vaddr, vaddr + PAGE_SIZE) ++ + void kmap_init(void); + + #endif +diff --git a/arch/xtensa/mm/highmem.c b/arch/xtensa/mm/highmem.c +index 673196fe862e..0735ca5e8f86 100644 +--- a/arch/xtensa/mm/highmem.c ++++ b/arch/xtensa/mm/highmem.c +@@ -12,8 +12,6 @@ + #include <linux/highmem.h> + #include <asm/tlbflush.h> + +-static pte_t *kmap_pte; +- + #if DCACHE_WAY_SIZE > PAGE_SIZE + unsigned int last_pkmap_nr_arr[DCACHE_N_COLORS]; + wait_queue_head_t pkmap_map_wait_arr[DCACHE_N_COLORS]; +@@ -33,59 +31,25 @@ static inline void kmap_waitqueues_init(void) + + static inline enum fixed_addresses kmap_idx(int type, unsigned long color) + { +- return (type + KM_TYPE_NR * smp_processor_id()) * DCACHE_N_COLORS + ++ return (type + KM_MAX_IDX * smp_processor_id()) * DCACHE_N_COLORS + + color; + } + +-void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) ++enum fixed_addresses kmap_local_map_idx(int type, unsigned long pfn) + { +- enum fixed_addresses idx; +- unsigned long vaddr; +- +- idx = kmap_idx(kmap_atomic_idx_push(), +- DCACHE_ALIAS(page_to_phys(page))); +- vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); +-#ifdef CONFIG_DEBUG_HIGHMEM +- BUG_ON(!pte_none(*(kmap_pte + idx))); +-#endif +- set_pte(kmap_pte + idx, mk_pte(page, prot)); +- +- return (void *)vaddr; ++ return kmap_idx(type, DCACHE_ALIAS(pfn << PAGE_SHIFT)); + } +-EXPORT_SYMBOL(kmap_atomic_high_prot); + +-void kunmap_atomic_high(void *kvaddr) ++enum fixed_addresses kmap_local_unmap_idx(int type, unsigned long addr) + { +- if (kvaddr >= (void *)FIXADDR_START && +- kvaddr < (void *)FIXADDR_TOP) { +- int idx = kmap_idx(kmap_atomic_idx(), +- DCACHE_ALIAS((unsigned long)kvaddr)); +- +- /* +- * Force other mappings to Oops if they'll try to access this +- * pte without first remap it. Keeping stale mappings around +- * is a bad idea also, in case the page changes cacheability +- * attributes or becomes a protected page in a hypervisor. +- */ +- pte_clear(&init_mm, kvaddr, kmap_pte + idx); +- local_flush_tlb_kernel_range((unsigned long)kvaddr, +- (unsigned long)kvaddr + PAGE_SIZE); +- +- kmap_atomic_idx_pop(); +- } ++ return kmap_idx(type, DCACHE_ALIAS(addr)); + } +-EXPORT_SYMBOL(kunmap_atomic_high); + + void __init kmap_init(void) + { +- unsigned long kmap_vstart; +- + /* Check if this memory layout is broken because PKMAP overlaps + * page table. + */ + BUILD_BUG_ON(PKMAP_BASE < TLBTEMP_BASE_1 + TLBTEMP_SIZE); +- /* cache the first kmap pte */ +- kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN); +- kmap_pte = virt_to_kpte(kmap_vstart); + kmap_waitqueues_init(); + } +-- +2.43.0 + diff --git a/debian/patches-rt/0041-highmem-Get-rid-of-kmap_types.h.patch b/debian/patches-rt/0041-highmem-Get-rid-of-kmap_types.h.patch new file mode 100644 index 000000000..d492fb27f --- /dev/null +++ b/debian/patches-rt/0041-highmem-Get-rid-of-kmap_types.h.patch @@ -0,0 +1,189 @@ +From 78d1ee2f2f990b4876ce2e64c637a660911a903d Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Tue, 3 Nov 2020 10:27:30 +0100 +Subject: [PATCH 041/323] highmem: Get rid of kmap_types.h +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +The header is not longer used and on alpha, ia64, openrisc, parisc and um +it was completely unused anyway as these architectures have no highmem +support. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + arch/alpha/include/asm/kmap_types.h | 15 --------------- + arch/ia64/include/asm/kmap_types.h | 13 ------------- + arch/openrisc/mm/init.c | 1 - + arch/openrisc/mm/ioremap.c | 1 - + arch/parisc/include/asm/kmap_types.h | 13 ------------- + arch/um/include/asm/fixmap.h | 1 - + arch/um/include/asm/kmap_types.h | 13 ------------- + include/asm-generic/Kbuild | 1 - + include/asm-generic/kmap_types.h | 11 ----------- + include/linux/highmem.h | 2 -- + 10 files changed, 71 deletions(-) + delete mode 100644 arch/alpha/include/asm/kmap_types.h + delete mode 100644 arch/ia64/include/asm/kmap_types.h + delete mode 100644 arch/parisc/include/asm/kmap_types.h + delete mode 100644 arch/um/include/asm/kmap_types.h + delete mode 100644 include/asm-generic/kmap_types.h + +diff --git a/arch/alpha/include/asm/kmap_types.h b/arch/alpha/include/asm/kmap_types.h +deleted file mode 100644 +index 651714b45729..000000000000 +--- a/arch/alpha/include/asm/kmap_types.h ++++ /dev/null +@@ -1,15 +0,0 @@ +-/* SPDX-License-Identifier: GPL-2.0 */ +-#ifndef _ASM_KMAP_TYPES_H +-#define _ASM_KMAP_TYPES_H +- +-/* Dummy header just to define km_type. */ +- +-#ifdef CONFIG_DEBUG_HIGHMEM +-#define __WITH_KM_FENCE +-#endif +- +-#include <asm-generic/kmap_types.h> +- +-#undef __WITH_KM_FENCE +- +-#endif +diff --git a/arch/ia64/include/asm/kmap_types.h b/arch/ia64/include/asm/kmap_types.h +deleted file mode 100644 +index 5c268cf7c2bd..000000000000 +--- a/arch/ia64/include/asm/kmap_types.h ++++ /dev/null +@@ -1,13 +0,0 @@ +-/* SPDX-License-Identifier: GPL-2.0 */ +-#ifndef _ASM_IA64_KMAP_TYPES_H +-#define _ASM_IA64_KMAP_TYPES_H +- +-#ifdef CONFIG_DEBUG_HIGHMEM +-#define __WITH_KM_FENCE +-#endif +- +-#include <asm-generic/kmap_types.h> +- +-#undef __WITH_KM_FENCE +- +-#endif /* _ASM_IA64_KMAP_TYPES_H */ +diff --git a/arch/openrisc/mm/init.c b/arch/openrisc/mm/init.c +index 5e88c351e6a4..f3fa02b8838a 100644 +--- a/arch/openrisc/mm/init.c ++++ b/arch/openrisc/mm/init.c +@@ -33,7 +33,6 @@ + #include <asm/io.h> + #include <asm/tlb.h> + #include <asm/mmu_context.h> +-#include <asm/kmap_types.h> + #include <asm/fixmap.h> + #include <asm/tlbflush.h> + #include <asm/sections.h> +diff --git a/arch/openrisc/mm/ioremap.c b/arch/openrisc/mm/ioremap.c +index a978590d802d..5aed97a18bac 100644 +--- a/arch/openrisc/mm/ioremap.c ++++ b/arch/openrisc/mm/ioremap.c +@@ -15,7 +15,6 @@ + #include <linux/io.h> + #include <linux/pgtable.h> + #include <asm/pgalloc.h> +-#include <asm/kmap_types.h> + #include <asm/fixmap.h> + #include <asm/bug.h> + #include <linux/sched.h> +diff --git a/arch/parisc/include/asm/kmap_types.h b/arch/parisc/include/asm/kmap_types.h +deleted file mode 100644 +index 3e70b5cd1123..000000000000 +--- a/arch/parisc/include/asm/kmap_types.h ++++ /dev/null +@@ -1,13 +0,0 @@ +-/* SPDX-License-Identifier: GPL-2.0 */ +-#ifndef _ASM_KMAP_TYPES_H +-#define _ASM_KMAP_TYPES_H +- +-#ifdef CONFIG_DEBUG_HIGHMEM +-#define __WITH_KM_FENCE +-#endif +- +-#include <asm-generic/kmap_types.h> +- +-#undef __WITH_KM_FENCE +- +-#endif +diff --git a/arch/um/include/asm/fixmap.h b/arch/um/include/asm/fixmap.h +index 2c697a145ac1..2efac5827188 100644 +--- a/arch/um/include/asm/fixmap.h ++++ b/arch/um/include/asm/fixmap.h +@@ -3,7 +3,6 @@ + #define __UM_FIXMAP_H + + #include <asm/processor.h> +-#include <asm/kmap_types.h> + #include <asm/archparam.h> + #include <asm/page.h> + #include <linux/threads.h> +diff --git a/arch/um/include/asm/kmap_types.h b/arch/um/include/asm/kmap_types.h +deleted file mode 100644 +index b0bd12de1d23..000000000000 +--- a/arch/um/include/asm/kmap_types.h ++++ /dev/null +@@ -1,13 +0,0 @@ +-/* SPDX-License-Identifier: GPL-2.0 */ +-/* +- * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) +- */ +- +-#ifndef __UM_KMAP_TYPES_H +-#define __UM_KMAP_TYPES_H +- +-/* No more #include "asm/arch/kmap_types.h" ! */ +- +-#define KM_TYPE_NR 14 +- +-#endif +diff --git a/include/asm-generic/Kbuild b/include/asm-generic/Kbuild +index 3114a6da7e56..267f6dfb8960 100644 +--- a/include/asm-generic/Kbuild ++++ b/include/asm-generic/Kbuild +@@ -30,7 +30,6 @@ mandatory-y += irq.h + mandatory-y += irq_regs.h + mandatory-y += irq_work.h + mandatory-y += kdebug.h +-mandatory-y += kmap_types.h + mandatory-y += kmap_size.h + mandatory-y += kprobes.h + mandatory-y += linkage.h +diff --git a/include/asm-generic/kmap_types.h b/include/asm-generic/kmap_types.h +deleted file mode 100644 +index 9f95b7b63d19..000000000000 +--- a/include/asm-generic/kmap_types.h ++++ /dev/null +@@ -1,11 +0,0 @@ +-/* SPDX-License-Identifier: GPL-2.0 */ +-#ifndef _ASM_GENERIC_KMAP_TYPES_H +-#define _ASM_GENERIC_KMAP_TYPES_H +- +-#ifdef __WITH_KM_FENCE +-# define KM_TYPE_NR 41 +-#else +-# define KM_TYPE_NR 20 +-#endif +- +-#endif +diff --git a/include/linux/highmem.h b/include/linux/highmem.h +index fbede783dc34..a5ce45dceae0 100644 +--- a/include/linux/highmem.h ++++ b/include/linux/highmem.h +@@ -29,8 +29,6 @@ static inline void invalidate_kernel_vmap_range(void *vaddr, int size) + } + #endif + +-#include <asm/kmap_types.h> +- + /* + * Outside of CONFIG_HIGHMEM to support X86 32bit iomap_atomic() cruft. + */ +-- +2.43.0 + diff --git a/debian/patches-rt/0042-mm-highmem-Remove-the-old-kmap_atomic-cruft.patch b/debian/patches-rt/0042-mm-highmem-Remove-the-old-kmap_atomic-cruft.patch new file mode 100644 index 000000000..112ed0a87 --- /dev/null +++ b/debian/patches-rt/0042-mm-highmem-Remove-the-old-kmap_atomic-cruft.patch @@ -0,0 +1,139 @@ +From 2840eec439f3e1788047259d10994b555020de40 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Tue, 3 Nov 2020 10:27:31 +0100 +Subject: [PATCH 042/323] mm/highmem: Remove the old kmap_atomic cruft +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +All users gone. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/linux/highmem.h | 63 +++-------------------------------------- + mm/highmem.c | 7 +---- + 2 files changed, 5 insertions(+), 65 deletions(-) + +diff --git a/include/linux/highmem.h b/include/linux/highmem.h +index a5ce45dceae0..e632774cce87 100644 +--- a/include/linux/highmem.h ++++ b/include/linux/highmem.h +@@ -86,31 +86,16 @@ static inline void kunmap(struct page *page) + * be used in IRQ contexts, so in some (very limited) cases we need + * it. + */ +- +-#ifndef CONFIG_KMAP_LOCAL +-void *kmap_atomic_high_prot(struct page *page, pgprot_t prot); +-void kunmap_atomic_high(void *kvaddr); +- + static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) + { + preempt_disable(); + pagefault_disable(); +- if (!PageHighMem(page)) +- return page_address(page); +- return kmap_atomic_high_prot(page, prot); +-} +- +-static inline void __kunmap_atomic(void *vaddr) +-{ +- kunmap_atomic_high(vaddr); ++ return __kmap_local_page_prot(page, prot); + } +-#else /* !CONFIG_KMAP_LOCAL */ + +-static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) ++static inline void *kmap_atomic(struct page *page) + { +- preempt_disable(); +- pagefault_disable(); +- return __kmap_local_page_prot(page, prot); ++ return kmap_atomic_prot(page, kmap_prot); + } + + static inline void *kmap_atomic_pfn(unsigned long pfn) +@@ -125,13 +110,6 @@ static inline void __kunmap_atomic(void *addr) + kunmap_local_indexed(addr); + } + +-#endif /* CONFIG_KMAP_LOCAL */ +- +-static inline void *kmap_atomic(struct page *page) +-{ +- return kmap_atomic_prot(page, kmap_prot); +-} +- + /* declarations for linux/mm/highmem.c */ + unsigned int nr_free_highpages(void); + extern atomic_long_t _totalhigh_pages; +@@ -212,41 +190,8 @@ static inline void __kunmap_atomic(void *addr) + + #define kmap_flush_unused() do {} while(0) + +-#endif /* CONFIG_HIGHMEM */ +- +-#if !defined(CONFIG_KMAP_LOCAL) +-#if defined(CONFIG_HIGHMEM) +- +-DECLARE_PER_CPU(int, __kmap_atomic_idx); +- +-static inline int kmap_atomic_idx_push(void) +-{ +- int idx = __this_cpu_inc_return(__kmap_atomic_idx) - 1; +- +-#ifdef CONFIG_DEBUG_HIGHMEM +- WARN_ON_ONCE(in_irq() && !irqs_disabled()); +- BUG_ON(idx >= KM_TYPE_NR); +-#endif +- return idx; +-} +- +-static inline int kmap_atomic_idx(void) +-{ +- return __this_cpu_read(__kmap_atomic_idx) - 1; +-} + +-static inline void kmap_atomic_idx_pop(void) +-{ +-#ifdef CONFIG_DEBUG_HIGHMEM +- int idx = __this_cpu_dec_return(__kmap_atomic_idx); +- +- BUG_ON(idx < 0); +-#else +- __this_cpu_dec(__kmap_atomic_idx); +-#endif +-} +-#endif +-#endif ++#endif /* CONFIG_HIGHMEM */ + + /* + * Prevent people trying to call kunmap_atomic() as if it were kunmap() +diff --git a/mm/highmem.c b/mm/highmem.c +index 77677c6844f7..499dfafd36b7 100644 +--- a/mm/highmem.c ++++ b/mm/highmem.c +@@ -31,12 +31,6 @@ + #include <asm/tlbflush.h> + #include <linux/vmalloc.h> + +-#ifndef CONFIG_KMAP_LOCAL +-#ifdef CONFIG_HIGHMEM +-DEFINE_PER_CPU(int, __kmap_atomic_idx); +-#endif +-#endif +- + /* + * Virtual_count is not a pure "count". + * 0 means that it is not mapped, and has not been mapped +@@ -410,6 +404,7 @@ static inline void kmap_local_idx_pop(void) + #ifndef arch_kmap_local_post_map + # define arch_kmap_local_post_map(vaddr, pteval) do { } while (0) + #endif ++ + #ifndef arch_kmap_local_pre_unmap + # define arch_kmap_local_pre_unmap(vaddr) do { } while (0) + #endif +-- +2.43.0 + diff --git a/debian/patches-rt/0043-io-mapping-Cleanup-atomic-iomap.patch b/debian/patches-rt/0043-io-mapping-Cleanup-atomic-iomap.patch new file mode 100644 index 000000000..08ec0b108 --- /dev/null +++ b/debian/patches-rt/0043-io-mapping-Cleanup-atomic-iomap.patch @@ -0,0 +1,90 @@ +From d1b454a9410a5cb2d9dcd0ef4bc330d04847f302 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Tue, 3 Nov 2020 10:27:32 +0100 +Subject: [PATCH 043/323] io-mapping: Cleanup atomic iomap +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Switch the atomic iomap implementation over to kmap_local and stick the +preempt/pagefault mechanics into the generic code similar to the +kmap_atomic variants. + +Rename the x86 map function in preparation for a non-atomic variant. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + arch/x86/include/asm/iomap.h | 9 +-------- + arch/x86/mm/iomap_32.c | 6 ++---- + include/linux/io-mapping.h | 8 ++++++-- + 3 files changed, 9 insertions(+), 14 deletions(-) + +diff --git a/arch/x86/include/asm/iomap.h b/arch/x86/include/asm/iomap.h +index 0be7a30fd6bc..e2de092fc38c 100644 +--- a/arch/x86/include/asm/iomap.h ++++ b/arch/x86/include/asm/iomap.h +@@ -13,14 +13,7 @@ + #include <asm/cacheflush.h> + #include <asm/tlbflush.h> + +-void __iomem *iomap_atomic_pfn_prot(unsigned long pfn, pgprot_t prot); +- +-static inline void iounmap_atomic(void __iomem *vaddr) +-{ +- kunmap_local_indexed((void __force *)vaddr); +- pagefault_enable(); +- preempt_enable(); +-} ++void __iomem *__iomap_local_pfn_prot(unsigned long pfn, pgprot_t prot); + + int iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot); + +diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c +index e0a40d7cc66c..9aaa756ddf21 100644 +--- a/arch/x86/mm/iomap_32.c ++++ b/arch/x86/mm/iomap_32.c +@@ -44,7 +44,7 @@ void iomap_free(resource_size_t base, unsigned long size) + } + EXPORT_SYMBOL_GPL(iomap_free); + +-void __iomem *iomap_atomic_pfn_prot(unsigned long pfn, pgprot_t prot) ++void __iomem *__iomap_local_pfn_prot(unsigned long pfn, pgprot_t prot) + { + /* + * For non-PAT systems, translate non-WB request to UC- just in +@@ -60,8 +60,6 @@ void __iomem *iomap_atomic_pfn_prot(unsigned long pfn, pgprot_t prot) + /* Filter out unsupported __PAGE_KERNEL* bits: */ + pgprot_val(prot) &= __default_kernel_pte_mask; + +- preempt_disable(); +- pagefault_disable(); + return (void __force __iomem *)__kmap_local_pfn_prot(pfn, prot); + } +-EXPORT_SYMBOL_GPL(iomap_atomic_pfn_prot); ++EXPORT_SYMBOL_GPL(__iomap_local_pfn_prot); +diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h +index 3b0940be72e9..60e7c83e4904 100644 +--- a/include/linux/io-mapping.h ++++ b/include/linux/io-mapping.h +@@ -69,13 +69,17 @@ io_mapping_map_atomic_wc(struct io_mapping *mapping, + + BUG_ON(offset >= mapping->size); + phys_addr = mapping->base + offset; +- return iomap_atomic_pfn_prot(PHYS_PFN(phys_addr), mapping->prot); ++ preempt_disable(); ++ pagefault_disable(); ++ return __iomap_local_pfn_prot(PHYS_PFN(phys_addr), mapping->prot); + } + + static inline void + io_mapping_unmap_atomic(void __iomem *vaddr) + { +- iounmap_atomic(vaddr); ++ kunmap_local_indexed((void __force *)vaddr); ++ pagefault_enable(); ++ preempt_enable(); + } + + static inline void __iomem * +-- +2.43.0 + diff --git a/debian/patches-rt/0044-Documentation-io-mapping-Remove-outdated-blurb.patch b/debian/patches-rt/0044-Documentation-io-mapping-Remove-outdated-blurb.patch new file mode 100644 index 000000000..4ea660fff --- /dev/null +++ b/debian/patches-rt/0044-Documentation-io-mapping-Remove-outdated-blurb.patch @@ -0,0 +1,48 @@ +From edd5c644d78fbd912f6386c97ed2b4eaf2b58b47 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Tue, 3 Nov 2020 10:27:33 +0100 +Subject: [PATCH 044/323] Documentation/io-mapping: Remove outdated blurb +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +The implementation details in the documentation are outdated and not really +helpful. Remove them. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + Documentation/driver-api/io-mapping.rst | 22 ---------------------- + 1 file changed, 22 deletions(-) + +diff --git a/Documentation/driver-api/io-mapping.rst b/Documentation/driver-api/io-mapping.rst +index a966239f04e4..e33b88268554 100644 +--- a/Documentation/driver-api/io-mapping.rst ++++ b/Documentation/driver-api/io-mapping.rst +@@ -73,25 +73,3 @@ for pages mapped with io_mapping_map_wc. + At driver close time, the io_mapping object must be freed:: + + void io_mapping_free(struct io_mapping *mapping) +- +-Current Implementation +-====================== +- +-The initial implementation of these functions uses existing mapping +-mechanisms and so provides only an abstraction layer and no new +-functionality. +- +-On 64-bit processors, io_mapping_create_wc calls ioremap_wc for the whole +-range, creating a permanent kernel-visible mapping to the resource. The +-map_atomic and map functions add the requested offset to the base of the +-virtual address returned by ioremap_wc. +- +-On 32-bit processors with HIGHMEM defined, io_mapping_map_atomic_wc uses +-kmap_atomic_pfn to map the specified page in an atomic fashion; +-kmap_atomic_pfn isn't really supposed to be used with device pages, but it +-provides an efficient mapping for this usage. +- +-On 32-bit processors without HIGHMEM defined, io_mapping_map_atomic_wc and +-io_mapping_map_wc both use ioremap_wc, a terribly inefficient function which +-performs an IPI to inform all processors about the new mapping. This results +-in a significant performance penalty. +-- +2.43.0 + diff --git a/debian/patches-rt/0045-highmem-High-implementation-details-and-document-API.patch b/debian/patches-rt/0045-highmem-High-implementation-details-and-document-API.patch new file mode 100644 index 000000000..723dd45e7 --- /dev/null +++ b/debian/patches-rt/0045-highmem-High-implementation-details-and-document-API.patch @@ -0,0 +1,544 @@ +From e9f16e3d48ba9b1d37ae050de3e84e147133b84c Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Tue, 3 Nov 2020 10:27:34 +0100 +Subject: [PATCH 045/323] highmem: High implementation details and document API +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Move the gory details of kmap & al into a private header and only document +the interfaces which are usable by drivers. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/linux/highmem-internal.h | 174 ++++++++++++++++++++ + include/linux/highmem.h | 266 +++++++++++-------------------- + mm/highmem.c | 11 +- + 3 files changed, 274 insertions(+), 177 deletions(-) + create mode 100644 include/linux/highmem-internal.h + +diff --git a/include/linux/highmem-internal.h b/include/linux/highmem-internal.h +new file mode 100644 +index 000000000000..6ceed907b14e +--- /dev/null ++++ b/include/linux/highmem-internal.h +@@ -0,0 +1,174 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _LINUX_HIGHMEM_INTERNAL_H ++#define _LINUX_HIGHMEM_INTERNAL_H ++ ++/* ++ * Outside of CONFIG_HIGHMEM to support X86 32bit iomap_atomic() cruft. ++ */ ++#ifdef CONFIG_KMAP_LOCAL ++void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot); ++void *__kmap_local_page_prot(struct page *page, pgprot_t prot); ++void kunmap_local_indexed(void *vaddr); ++#endif ++ ++#ifdef CONFIG_HIGHMEM ++#include <asm/highmem.h> ++ ++#ifndef ARCH_HAS_KMAP_FLUSH_TLB ++static inline void kmap_flush_tlb(unsigned long addr) { } ++#endif ++ ++#ifndef kmap_prot ++#define kmap_prot PAGE_KERNEL ++#endif ++ ++void *kmap_high(struct page *page); ++void kunmap_high(struct page *page); ++void __kmap_flush_unused(void); ++struct page *__kmap_to_page(void *addr); ++ ++static inline void *kmap(struct page *page) ++{ ++ void *addr; ++ ++ might_sleep(); ++ if (!PageHighMem(page)) ++ addr = page_address(page); ++ else ++ addr = kmap_high(page); ++ kmap_flush_tlb((unsigned long)addr); ++ return addr; ++} ++ ++static inline void kunmap(struct page *page) ++{ ++ might_sleep(); ++ if (!PageHighMem(page)) ++ return; ++ kunmap_high(page); ++} ++ ++static inline struct page *kmap_to_page(void *addr) ++{ ++ return __kmap_to_page(addr); ++} ++ ++static inline void kmap_flush_unused(void) ++{ ++ __kmap_flush_unused(); ++} ++ ++static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) ++{ ++ preempt_disable(); ++ pagefault_disable(); ++ return __kmap_local_page_prot(page, prot); ++} ++ ++static inline void *kmap_atomic(struct page *page) ++{ ++ return kmap_atomic_prot(page, kmap_prot); ++} ++ ++static inline void *kmap_atomic_pfn(unsigned long pfn) ++{ ++ preempt_disable(); ++ pagefault_disable(); ++ return __kmap_local_pfn_prot(pfn, kmap_prot); ++} ++ ++static inline void __kunmap_atomic(void *addr) ++{ ++ kunmap_local_indexed(addr); ++ pagefault_enable(); ++ preempt_enable(); ++} ++ ++unsigned int __nr_free_highpages(void); ++extern atomic_long_t _totalhigh_pages; ++ ++static inline unsigned int nr_free_highpages(void) ++{ ++ return __nr_free_highpages(); ++} ++ ++static inline unsigned long totalhigh_pages(void) ++{ ++ return (unsigned long)atomic_long_read(&_totalhigh_pages); ++} ++ ++static inline void totalhigh_pages_inc(void) ++{ ++ atomic_long_inc(&_totalhigh_pages); ++} ++ ++static inline void totalhigh_pages_add(long count) ++{ ++ atomic_long_add(count, &_totalhigh_pages); ++} ++ ++#else /* CONFIG_HIGHMEM */ ++ ++static inline struct page *kmap_to_page(void *addr) ++{ ++ return virt_to_page(addr); ++} ++ ++static inline void *kmap(struct page *page) ++{ ++ might_sleep(); ++ return page_address(page); ++} ++ ++static inline void kunmap_high(struct page *page) { } ++static inline void kmap_flush_unused(void) { } ++ ++static inline void kunmap(struct page *page) ++{ ++#ifdef ARCH_HAS_FLUSH_ON_KUNMAP ++ kunmap_flush_on_unmap(page_address(page)); ++#endif ++} ++ ++static inline void *kmap_atomic(struct page *page) ++{ ++ preempt_disable(); ++ pagefault_disable(); ++ return page_address(page); ++} ++ ++static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) ++{ ++ return kmap_atomic(page); ++} ++ ++static inline void *kmap_atomic_pfn(unsigned long pfn) ++{ ++ return kmap_atomic(pfn_to_page(pfn)); ++} ++ ++static inline void __kunmap_atomic(void *addr) ++{ ++#ifdef ARCH_HAS_FLUSH_ON_KUNMAP ++ kunmap_flush_on_unmap(addr); ++#endif ++ pagefault_enable(); ++ preempt_enable(); ++} ++ ++static inline unsigned int nr_free_highpages(void) { return 0; } ++static inline unsigned long totalhigh_pages(void) { return 0UL; } ++ ++#endif /* CONFIG_HIGHMEM */ ++ ++/* ++ * Prevent people trying to call kunmap_atomic() as if it were kunmap() ++ * kunmap_atomic() should get the return value of kmap_atomic, not the page. ++ */ ++#define kunmap_atomic(__addr) \ ++do { \ ++ BUILD_BUG_ON(__same_type((__addr), struct page *)); \ ++ __kunmap_atomic(__addr); \ ++} while (0) ++ ++#endif +diff --git a/include/linux/highmem.h b/include/linux/highmem.h +index e632774cce87..5c888525b4c5 100644 +--- a/include/linux/highmem.h ++++ b/include/linux/highmem.h +@@ -11,199 +11,125 @@ + + #include <asm/cacheflush.h> + +-#ifndef ARCH_HAS_FLUSH_ANON_PAGE +-static inline void flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned long vmaddr) +-{ +-} +-#endif +- +-#ifndef ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE +-static inline void flush_kernel_dcache_page(struct page *page) +-{ +-} +-static inline void flush_kernel_vmap_range(void *vaddr, int size) +-{ +-} +-static inline void invalidate_kernel_vmap_range(void *vaddr, int size) +-{ +-} +-#endif ++#include "highmem-internal.h" + +-/* +- * Outside of CONFIG_HIGHMEM to support X86 32bit iomap_atomic() cruft. ++/** ++ * kmap - Map a page for long term usage ++ * @page: Pointer to the page to be mapped ++ * ++ * Returns: The virtual address of the mapping ++ * ++ * Can only be invoked from preemptible task context because on 32bit ++ * systems with CONFIG_HIGHMEM enabled this function might sleep. ++ * ++ * For systems with CONFIG_HIGHMEM=n and for pages in the low memory area ++ * this returns the virtual address of the direct kernel mapping. ++ * ++ * The returned virtual address is globally visible and valid up to the ++ * point where it is unmapped via kunmap(). The pointer can be handed to ++ * other contexts. ++ * ++ * For highmem pages on 32bit systems this can be slow as the mapping space ++ * is limited and protected by a global lock. In case that there is no ++ * mapping slot available the function blocks until a slot is released via ++ * kunmap(). + */ +-#ifdef CONFIG_KMAP_LOCAL +-void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot); +-void *__kmap_local_page_prot(struct page *page, pgprot_t prot); +-void kunmap_local_indexed(void *vaddr); +-#endif +- +-#ifdef CONFIG_HIGHMEM +-#include <asm/highmem.h> ++static inline void *kmap(struct page *page); + +-#ifndef ARCH_HAS_KMAP_FLUSH_TLB +-static inline void kmap_flush_tlb(unsigned long addr) { } +-#endif +- +-#ifndef kmap_prot +-#define kmap_prot PAGE_KERNEL +-#endif +- +-void *kmap_high(struct page *page); +-static inline void *kmap(struct page *page) +-{ +- void *addr; +- +- might_sleep(); +- if (!PageHighMem(page)) +- addr = page_address(page); +- else +- addr = kmap_high(page); +- kmap_flush_tlb((unsigned long)addr); +- return addr; +-} ++/** ++ * kunmap - Unmap the virtual address mapped by kmap() ++ * @addr: Virtual address to be unmapped ++ * ++ * Counterpart to kmap(). A NOOP for CONFIG_HIGHMEM=n and for mappings of ++ * pages in the low memory area. ++ */ ++static inline void kunmap(struct page *page); + +-void kunmap_high(struct page *page); ++/** ++ * kmap_to_page - Get the page for a kmap'ed address ++ * @addr: The address to look up ++ * ++ * Returns: The page which is mapped to @addr. ++ */ ++static inline struct page *kmap_to_page(void *addr); + +-static inline void kunmap(struct page *page) +-{ +- might_sleep(); +- if (!PageHighMem(page)) +- return; +- kunmap_high(page); +-} ++/** ++ * kmap_flush_unused - Flush all unused kmap mappings in order to ++ * remove stray mappings ++ */ ++static inline void kmap_flush_unused(void); + +-/* +- * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because +- * no global lock is needed and because the kmap code must perform a global TLB +- * invalidation when the kmap pool wraps. ++/** ++ * kmap_atomic - Atomically map a page for temporary usage ++ * @page: Pointer to the page to be mapped ++ * ++ * Returns: The virtual address of the mapping ++ * ++ * Side effect: On return pagefaults and preemption are disabled. ++ * ++ * Can be invoked from any context. + * +- * However when holding an atomic kmap it is not legal to sleep, so atomic +- * kmaps are appropriate for short, tight code paths only. ++ * Requires careful handling when nesting multiple mappings because the map ++ * management is stack based. The unmap has to be in the reverse order of ++ * the map operation: + * +- * The use of kmap_atomic/kunmap_atomic is discouraged - kmap/kunmap +- * gives a more generic (and caching) interface. But kmap_atomic can +- * be used in IRQ contexts, so in some (very limited) cases we need +- * it. ++ * addr1 = kmap_atomic(page1); ++ * addr2 = kmap_atomic(page2); ++ * ... ++ * kunmap_atomic(addr2); ++ * kunmap_atomic(addr1); ++ * ++ * Unmapping addr1 before addr2 is invalid and causes malfunction. ++ * ++ * Contrary to kmap() mappings the mapping is only valid in the context of ++ * the caller and cannot be handed to other contexts. ++ * ++ * On CONFIG_HIGHMEM=n kernels and for low memory pages this returns the ++ * virtual address of the direct mapping. Only real highmem pages are ++ * temporarily mapped. ++ * ++ * While it is significantly faster than kmap() it comes with restrictions ++ * about the pointer validity and the side effects of disabling page faults ++ * and preemption. Use it only when absolutely necessary, e.g. from non ++ * preemptible contexts. + */ +-static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) +-{ +- preempt_disable(); +- pagefault_disable(); +- return __kmap_local_page_prot(page, prot); +-} ++static inline void *kmap_atomic(struct page *page); + +-static inline void *kmap_atomic(struct page *page) +-{ +- return kmap_atomic_prot(page, kmap_prot); +-} +- +-static inline void *kmap_atomic_pfn(unsigned long pfn) +-{ +- preempt_disable(); +- pagefault_disable(); +- return __kmap_local_pfn_prot(pfn, kmap_prot); +-} +- +-static inline void __kunmap_atomic(void *addr) +-{ +- kunmap_local_indexed(addr); +-} +- +-/* declarations for linux/mm/highmem.c */ +-unsigned int nr_free_highpages(void); +-extern atomic_long_t _totalhigh_pages; +-static inline unsigned long totalhigh_pages(void) +-{ +- return (unsigned long)atomic_long_read(&_totalhigh_pages); +-} +- +-static inline void totalhigh_pages_inc(void) +-{ +- atomic_long_inc(&_totalhigh_pages); +-} +- +-static inline void totalhigh_pages_add(long count) +-{ +- atomic_long_add(count, &_totalhigh_pages); +-} +- +-void kmap_flush_unused(void); +- +-struct page *kmap_to_page(void *addr); +- +-#else /* CONFIG_HIGHMEM */ +- +-static inline unsigned int nr_free_highpages(void) { return 0; } +- +-static inline struct page *kmap_to_page(void *addr) +-{ +- return virt_to_page(addr); +-} +- +-static inline unsigned long totalhigh_pages(void) { return 0UL; } ++/** ++ * kunmap_atomic - Unmap the virtual address mapped by kmap_atomic() ++ * @addr: Virtual address to be unmapped ++ * ++ * Counterpart to kmap_atomic(). ++ * ++ * Undoes the side effects of kmap_atomic(), i.e. reenabling pagefaults and ++ * preemption. ++ * ++ * Other than that a NOOP for CONFIG_HIGHMEM=n and for mappings of pages ++ * in the low memory area. For real highmen pages the mapping which was ++ * established with kmap_atomic() is destroyed. ++ */ + +-static inline void *kmap(struct page *page) +-{ +- might_sleep(); +- return page_address(page); +-} ++/* Highmem related interfaces for management code */ ++static inline unsigned int nr_free_highpages(void); ++static inline unsigned long totalhigh_pages(void); + +-static inline void kunmap_high(struct page *page) ++#ifndef ARCH_HAS_FLUSH_ANON_PAGE ++static inline void flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned long vmaddr) + { + } +- +-static inline void kunmap(struct page *page) +-{ +-#ifdef ARCH_HAS_FLUSH_ON_KUNMAP +- kunmap_flush_on_unmap(page_address(page)); + #endif +-} + +-static inline void *kmap_atomic(struct page *page) ++#ifndef ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE ++static inline void flush_kernel_dcache_page(struct page *page) + { +- preempt_disable(); +- pagefault_disable(); +- return page_address(page); + } +- +-static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) ++static inline void flush_kernel_vmap_range(void *vaddr, int size) + { +- return kmap_atomic(page); + } +- +-static inline void *kmap_atomic_pfn(unsigned long pfn) ++static inline void invalidate_kernel_vmap_range(void *vaddr, int size) + { +- return kmap_atomic(pfn_to_page(pfn)); + } +- +-static inline void __kunmap_atomic(void *addr) +-{ +- /* +- * Mostly nothing to do in the CONFIG_HIGHMEM=n case as kunmap_atomic() +- * handles re-enabling faults and preemption +- */ +-#ifdef ARCH_HAS_FLUSH_ON_KUNMAP +- kunmap_flush_on_unmap(addr); + #endif +-} +- +-#define kmap_flush_unused() do {} while(0) +- +- +-#endif /* CONFIG_HIGHMEM */ +- +-/* +- * Prevent people trying to call kunmap_atomic() as if it were kunmap() +- * kunmap_atomic() should get the return value of kmap_atomic, not the page. +- */ +-#define kunmap_atomic(__addr) \ +-do { \ +- BUILD_BUG_ON(__same_type((__addr), struct page *)); \ +- __kunmap_atomic(__addr); \ +- pagefault_enable(); \ +- preempt_enable(); \ +-} while (0) + + /* when CONFIG_HIGHMEM is not set these will be plain clear/copy_page */ + #ifndef clear_user_highpage +diff --git a/mm/highmem.c b/mm/highmem.c +index 499dfafd36b7..54bd233846c9 100644 +--- a/mm/highmem.c ++++ b/mm/highmem.c +@@ -104,7 +104,7 @@ static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color) + atomic_long_t _totalhigh_pages __read_mostly; + EXPORT_SYMBOL(_totalhigh_pages); + +-unsigned int nr_free_highpages (void) ++unsigned int __nr_free_highpages (void) + { + struct zone *zone; + unsigned int pages = 0; +@@ -141,7 +141,7 @@ pte_t * pkmap_page_table; + do { spin_unlock(&kmap_lock); (void)(flags); } while (0) + #endif + +-struct page *kmap_to_page(void *vaddr) ++struct page *__kmap_to_page(void *vaddr) + { + unsigned long addr = (unsigned long)vaddr; + +@@ -152,7 +152,7 @@ struct page *kmap_to_page(void *vaddr) + + return virt_to_page(addr); + } +-EXPORT_SYMBOL(kmap_to_page); ++EXPORT_SYMBOL(__kmap_to_page); + + static void flush_all_zero_pkmaps(void) + { +@@ -194,10 +194,7 @@ static void flush_all_zero_pkmaps(void) + flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP)); + } + +-/** +- * kmap_flush_unused - flush all unused kmap mappings in order to remove stray mappings +- */ +-void kmap_flush_unused(void) ++void __kmap_flush_unused(void) + { + lock_kmap(); + flush_all_zero_pkmaps(); +-- +2.43.0 + diff --git a/debian/patches-rt/0046-sched-Make-migrate_disable-enable-independent-of-RT.patch b/debian/patches-rt/0046-sched-Make-migrate_disable-enable-independent-of-RT.patch new file mode 100644 index 000000000..ed3660a63 --- /dev/null +++ b/debian/patches-rt/0046-sched-Make-migrate_disable-enable-independent-of-RT.patch @@ -0,0 +1,293 @@ +From 2a81a49976b336ce93ef823a5db96ce960096478 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Tue, 3 Nov 2020 10:27:35 +0100 +Subject: [PATCH 046/323] sched: Make migrate_disable/enable() independent of + RT +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Now that the scheduler can deal with migrate disable properly, there is no +real compelling reason to make it only available for RT. + +There are quite some code pathes which needlessly disable preemption in +order to prevent migration and some constructs like kmap_atomic() enforce +it implicitly. + +Making it available independent of RT allows to provide a preemptible +variant of kmap_atomic() and makes the code more consistent in general. + +FIXME: Rework the comment in preempt.h + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Peter Zijlstra <peterz@infradead.org> +Cc: Ingo Molnar <mingo@kernel.org> +Cc: Juri Lelli <juri.lelli@redhat.com> +Cc: Vincent Guittot <vincent.guittot@linaro.org> +Cc: Dietmar Eggemann <dietmar.eggemann@arm.com> +Cc: Steven Rostedt <rostedt@goodmis.org> +Cc: Ben Segall <bsegall@google.com> +Cc: Mel Gorman <mgorman@suse.de> +Cc: Daniel Bristot de Oliveira <bristot@redhat.com> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/linux/kernel.h | 21 ++++++++++++------- + include/linux/preempt.h | 38 +++------------------------------- + include/linux/sched.h | 2 +- + kernel/sched/core.c | 45 ++++++++++++++++++++++++++++++++--------- + kernel/sched/sched.h | 4 ++-- + lib/smp_processor_id.c | 2 +- + 6 files changed, 56 insertions(+), 56 deletions(-) + +diff --git a/include/linux/kernel.h b/include/linux/kernel.h +index cdd6ed5bbcf2..55d48d5627c7 100644 +--- a/include/linux/kernel.h ++++ b/include/linux/kernel.h +@@ -207,6 +207,7 @@ extern int _cond_resched(void); + extern void ___might_sleep(const char *file, int line, int preempt_offset); + extern void __might_sleep(const char *file, int line, int preempt_offset); + extern void __cant_sleep(const char *file, int line, int preempt_offset); ++extern void __cant_migrate(const char *file, int line); + + /** + * might_sleep - annotation for functions that can sleep +@@ -230,6 +231,18 @@ extern void __cant_sleep(const char *file, int line, int preempt_offset); + # define cant_sleep() \ + do { __cant_sleep(__FILE__, __LINE__, 0); } while (0) + # define sched_annotate_sleep() (current->task_state_change = 0) ++ ++/** ++ * cant_migrate - annotation for functions that cannot migrate ++ * ++ * Will print a stack trace if executed in code which is migratable ++ */ ++# define cant_migrate() \ ++ do { \ ++ if (IS_ENABLED(CONFIG_SMP)) \ ++ __cant_migrate(__FILE__, __LINE__); \ ++ } while (0) ++ + /** + * non_block_start - annotate the start of section where sleeping is prohibited + * +@@ -254,6 +267,7 @@ extern void __cant_sleep(const char *file, int line, int preempt_offset); + int preempt_offset) { } + # define might_sleep() do { might_resched(); } while (0) + # define cant_sleep() do { } while (0) ++# define cant_migrate() do { } while (0) + # define sched_annotate_sleep() do { } while (0) + # define non_block_start() do { } while (0) + # define non_block_end() do { } while (0) +@@ -261,13 +275,6 @@ extern void __cant_sleep(const char *file, int line, int preempt_offset); + + #define might_sleep_if(cond) do { if (cond) might_sleep(); } while (0) + +-#ifndef CONFIG_PREEMPT_RT +-# define cant_migrate() cant_sleep() +-#else +- /* Placeholder for now */ +-# define cant_migrate() do { } while (0) +-#endif +- + /** + * abs - return absolute value of an argument + * @x: the value. If it is unsigned type, it is converted to signed type first. +diff --git a/include/linux/preempt.h b/include/linux/preempt.h +index 8b43922e65df..6df63cbe8bb0 100644 +--- a/include/linux/preempt.h ++++ b/include/linux/preempt.h +@@ -322,7 +322,7 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier, + + #endif + +-#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT) ++#ifdef CONFIG_SMP + + /* + * Migrate-Disable and why it is undesired. +@@ -382,43 +382,11 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier, + extern void migrate_disable(void); + extern void migrate_enable(void); + +-#elif defined(CONFIG_PREEMPT_RT) ++#else + + static inline void migrate_disable(void) { } + static inline void migrate_enable(void) { } + +-#else /* !CONFIG_PREEMPT_RT */ +- +-/** +- * migrate_disable - Prevent migration of the current task +- * +- * Maps to preempt_disable() which also disables preemption. Use +- * migrate_disable() to annotate that the intent is to prevent migration, +- * but not necessarily preemption. +- * +- * Can be invoked nested like preempt_disable() and needs the corresponding +- * number of migrate_enable() invocations. +- */ +-static __always_inline void migrate_disable(void) +-{ +- preempt_disable(); +-} +- +-/** +- * migrate_enable - Allow migration of the current task +- * +- * Counterpart to migrate_disable(). +- * +- * As migrate_disable() can be invoked nested, only the outermost invocation +- * reenables migration. +- * +- * Currently mapped to preempt_enable(). +- */ +-static __always_inline void migrate_enable(void) +-{ +- preempt_enable(); +-} +- +-#endif /* CONFIG_SMP && CONFIG_PREEMPT_RT */ ++#endif /* CONFIG_SMP */ + + #endif /* __LINUX_PREEMPT_H */ +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 7ca1f3e740dd..bff48e9f32db 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -727,7 +727,7 @@ struct task_struct { + const cpumask_t *cpus_ptr; + cpumask_t cpus_mask; + void *migration_pending; +-#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT) ++#ifdef CONFIG_SMP + unsigned short migration_disabled; + #endif + unsigned short migration_flags; +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index c1e52319669d..8a6135a0b2ee 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -1710,8 +1710,6 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) + + #ifdef CONFIG_SMP + +-#ifdef CONFIG_PREEMPT_RT +- + static void + __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags); + +@@ -1786,8 +1784,6 @@ static inline bool rq_has_pinned_tasks(struct rq *rq) + return rq->nr_pinned; + } + +-#endif +- + /* + * Per-CPU kthreads are allowed to run on !active && online CPUs, see + * __set_cpus_allowed_ptr() and select_fallback_rq(). +@@ -2868,7 +2864,7 @@ void sched_set_stop_task(int cpu, struct task_struct *stop) + } + } + +-#else ++#else /* CONFIG_SMP */ + + static inline int __set_cpus_allowed_ptr(struct task_struct *p, + const struct cpumask *new_mask, +@@ -2877,10 +2873,6 @@ static inline int __set_cpus_allowed_ptr(struct task_struct *p, + return set_cpus_allowed_ptr(p, new_mask); + } + +-#endif /* CONFIG_SMP */ +- +-#if !defined(CONFIG_SMP) || !defined(CONFIG_PREEMPT_RT) +- + static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { } + + static inline bool rq_has_pinned_tasks(struct rq *rq) +@@ -2888,7 +2880,7 @@ static inline bool rq_has_pinned_tasks(struct rq *rq) + return false; + } + +-#endif ++#endif /* !CONFIG_SMP */ + + static void + ttwu_stat(struct task_struct *p, int cpu, int wake_flags) +@@ -7902,6 +7894,39 @@ void __cant_sleep(const char *file, int line, int preempt_offset) + add_taint(TAINT_WARN, LOCKDEP_STILL_OK); + } + EXPORT_SYMBOL_GPL(__cant_sleep); ++ ++#ifdef CONFIG_SMP ++void __cant_migrate(const char *file, int line) ++{ ++ static unsigned long prev_jiffy; ++ ++ if (irqs_disabled()) ++ return; ++ ++ if (is_migration_disabled(current)) ++ return; ++ ++ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) ++ return; ++ ++ if (preempt_count() > 0) ++ return; ++ ++ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) ++ return; ++ prev_jiffy = jiffies; ++ ++ pr_err("BUG: assuming non migratable context at %s:%d\n", file, line); ++ pr_err("in_atomic(): %d, irqs_disabled(): %d, migration_disabled() %u pid: %d, name: %s\n", ++ in_atomic(), irqs_disabled(), is_migration_disabled(current), ++ current->pid, current->comm); ++ ++ debug_show_held_locks(current); ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++EXPORT_SYMBOL_GPL(__cant_migrate); ++#endif + #endif + + #ifdef CONFIG_MAGIC_SYSRQ +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 69ef7cac3d29..c26b1c7009f4 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -1058,7 +1058,7 @@ struct rq { + struct cpuidle_state *idle_state; + #endif + +-#if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP) ++#ifdef CONFIG_SMP + unsigned int nr_pinned; + #endif + unsigned int push_busy; +@@ -1094,7 +1094,7 @@ static inline int cpu_of(struct rq *rq) + + static inline bool is_migration_disabled(struct task_struct *p) + { +-#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT) ++#ifdef CONFIG_SMP + return p->migration_disabled; + #else + return false; +diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c +index dbb96ebf661f..0c0c42b14370 100644 +--- a/lib/smp_processor_id.c ++++ b/lib/smp_processor_id.c +@@ -26,7 +26,7 @@ unsigned int check_preemption_disabled(const char *what1, const char *what2) + if (current->nr_cpus_allowed == 1) + goto out; + +-#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT) ++#ifdef CONFIG_SMP + if (current->migration_disabled) + goto out; + #endif +-- +2.43.0 + diff --git a/debian/patches-rt/0047-sched-highmem-Store-local-kmaps-in-task-struct.patch b/debian/patches-rt/0047-sched-highmem-Store-local-kmaps-in-task-struct.patch new file mode 100644 index 000000000..ecd08de62 --- /dev/null +++ b/debian/patches-rt/0047-sched-highmem-Store-local-kmaps-in-task-struct.patch @@ -0,0 +1,309 @@ +From 7aa261f53a405c0862f4857b46ef3344bbe6385f Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Tue, 3 Nov 2020 10:27:36 +0100 +Subject: [PATCH 047/323] sched: highmem: Store local kmaps in task struct +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Instead of storing the map per CPU provide and use per task storage. That +prepares for local kmaps which are preemptible. + +The context switch code is preparatory and not yet in use because +kmap_atomic() runs with preemption disabled. Will be made usable in the +next step. + +The context switch logic is safe even when an interrupt happens after +clearing or before restoring the kmaps. The kmap index in task struct is +not modified so any nesting kmap in an interrupt will use unused indices +and on return the counter is the same as before. + +Also add an assert into the return to user space code. Going back to user +space with an active kmap local is a nono. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/linux/highmem-internal.h | 10 ++++ + include/linux/sched.h | 9 +++ + kernel/entry/common.c | 2 + + kernel/fork.c | 1 + + kernel/sched/core.c | 18 ++++++ + mm/highmem.c | 99 ++++++++++++++++++++++++++++---- + 6 files changed, 129 insertions(+), 10 deletions(-) + +diff --git a/include/linux/highmem-internal.h b/include/linux/highmem-internal.h +index 6ceed907b14e..c5a22177db85 100644 +--- a/include/linux/highmem-internal.h ++++ b/include/linux/highmem-internal.h +@@ -9,6 +9,16 @@ + void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot); + void *__kmap_local_page_prot(struct page *page, pgprot_t prot); + void kunmap_local_indexed(void *vaddr); ++void kmap_local_fork(struct task_struct *tsk); ++void __kmap_local_sched_out(void); ++void __kmap_local_sched_in(void); ++static inline void kmap_assert_nomap(void) ++{ ++ DEBUG_LOCKS_WARN_ON(current->kmap_ctrl.idx); ++} ++#else ++static inline void kmap_local_fork(struct task_struct *tsk) { } ++static inline void kmap_assert_nomap(void) { } + #endif + + #ifdef CONFIG_HIGHMEM +diff --git a/include/linux/sched.h b/include/linux/sched.h +index bff48e9f32db..82de1ab42497 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -34,6 +34,7 @@ + #include <linux/rseq.h> + #include <linux/seqlock.h> + #include <linux/kcsan.h> ++#include <asm/kmap_size.h> + + /* task_struct member predeclarations (sorted alphabetically): */ + struct audit_context; +@@ -641,6 +642,13 @@ struct wake_q_node { + struct wake_q_node *next; + }; + ++struct kmap_ctrl { ++#ifdef CONFIG_KMAP_LOCAL ++ int idx; ++ pte_t pteval[KM_MAX_IDX]; ++#endif ++}; ++ + struct task_struct { + #ifdef CONFIG_THREAD_INFO_IN_TASK + /* +@@ -1323,6 +1331,7 @@ struct task_struct { + unsigned int sequential_io; + unsigned int sequential_io_avg; + #endif ++ struct kmap_ctrl kmap_ctrl; + #ifdef CONFIG_DEBUG_ATOMIC_SLEEP + unsigned long task_state_change; + #endif +diff --git a/kernel/entry/common.c b/kernel/entry/common.c +index 09f58853f692..e6a66de1202a 100644 +--- a/kernel/entry/common.c ++++ b/kernel/entry/common.c +@@ -2,6 +2,7 @@ + + #include <linux/context_tracking.h> + #include <linux/entry-common.h> ++#include <linux/highmem.h> + #include <linux/livepatch.h> + #include <linux/audit.h> + +@@ -202,6 +203,7 @@ static void exit_to_user_mode_prepare(struct pt_regs *regs) + + /* Ensure that the address limit is intact and no locks are held */ + addr_limit_user_check(); ++ kmap_assert_nomap(); + lockdep_assert_irqs_disabled(); + lockdep_sys_exit(); + } +diff --git a/kernel/fork.c b/kernel/fork.c +index 633b0af1d1a7..32b9d7205ac1 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -942,6 +942,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) + account_kernel_stack(tsk, 1); + + kcov_task_init(tsk); ++ kmap_local_fork(tsk); + + #ifdef CONFIG_FAULT_INJECTION + tsk->fail_nth = 0; +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 8a6135a0b2ee..390b51366f5e 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -4092,6 +4092,22 @@ static inline void finish_lock_switch(struct rq *rq) + # define finish_arch_post_lock_switch() do { } while (0) + #endif + ++static inline void kmap_local_sched_out(void) ++{ ++#ifdef CONFIG_KMAP_LOCAL ++ if (unlikely(current->kmap_ctrl.idx)) ++ __kmap_local_sched_out(); ++#endif ++} ++ ++static inline void kmap_local_sched_in(void) ++{ ++#ifdef CONFIG_KMAP_LOCAL ++ if (unlikely(current->kmap_ctrl.idx)) ++ __kmap_local_sched_in(); ++#endif ++} ++ + /** + * prepare_task_switch - prepare to switch tasks + * @rq: the runqueue preparing to switch +@@ -4114,6 +4130,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, + perf_event_task_sched_out(prev, next); + rseq_preempt(prev); + fire_sched_out_preempt_notifiers(prev, next); ++ kmap_local_sched_out(); + prepare_task(next); + prepare_arch_switch(next); + } +@@ -4180,6 +4197,7 @@ static struct rq *finish_task_switch(struct task_struct *prev) + finish_lock_switch(rq); + finish_arch_post_lock_switch(); + kcov_finish_switch(current); ++ kmap_local_sched_in(); + + fire_sched_in_preempt_notifiers(current); + /* +diff --git a/mm/highmem.c b/mm/highmem.c +index 54bd233846c9..d7a1c80001d0 100644 +--- a/mm/highmem.c ++++ b/mm/highmem.c +@@ -365,8 +365,6 @@ EXPORT_SYMBOL(kunmap_high); + + #include <asm/kmap_size.h> + +-static DEFINE_PER_CPU(int, __kmap_local_idx); +- + /* + * With DEBUG_HIGHMEM the stack depth is doubled and every second + * slot is unused which acts as a guard page +@@ -379,23 +377,21 @@ static DEFINE_PER_CPU(int, __kmap_local_idx); + + static inline int kmap_local_idx_push(void) + { +- int idx = __this_cpu_add_return(__kmap_local_idx, KM_INCR) - 1; +- + WARN_ON_ONCE(in_irq() && !irqs_disabled()); +- BUG_ON(idx >= KM_MAX_IDX); +- return idx; ++ current->kmap_ctrl.idx += KM_INCR; ++ BUG_ON(current->kmap_ctrl.idx >= KM_MAX_IDX); ++ return current->kmap_ctrl.idx - 1; + } + + static inline int kmap_local_idx(void) + { +- return __this_cpu_read(__kmap_local_idx) - 1; ++ return current->kmap_ctrl.idx - 1; + } + + static inline void kmap_local_idx_pop(void) + { +- int idx = __this_cpu_sub_return(__kmap_local_idx, KM_INCR); +- +- BUG_ON(idx < 0); ++ current->kmap_ctrl.idx -= KM_INCR; ++ BUG_ON(current->kmap_ctrl.idx < 0); + } + + #ifndef arch_kmap_local_post_map +@@ -461,6 +457,7 @@ void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot) + pteval = pfn_pte(pfn, prot); + set_pte_at(&init_mm, vaddr, kmap_pte - idx, pteval); + arch_kmap_local_post_map(vaddr, pteval); ++ current->kmap_ctrl.pteval[kmap_local_idx()] = pteval; + preempt_enable(); + + return (void *)vaddr; +@@ -505,10 +502,92 @@ void kunmap_local_indexed(void *vaddr) + arch_kmap_local_pre_unmap(addr); + pte_clear(&init_mm, addr, kmap_pte - idx); + arch_kmap_local_post_unmap(addr); ++ current->kmap_ctrl.pteval[kmap_local_idx()] = __pte(0); + kmap_local_idx_pop(); + preempt_enable(); + } + EXPORT_SYMBOL(kunmap_local_indexed); ++ ++/* ++ * Invoked before switch_to(). This is safe even when during or after ++ * clearing the maps an interrupt which needs a kmap_local happens because ++ * the task::kmap_ctrl.idx is not modified by the unmapping code so a ++ * nested kmap_local will use the next unused index and restore the index ++ * on unmap. The already cleared kmaps of the outgoing task are irrelevant ++ * because the interrupt context does not know about them. The same applies ++ * when scheduling back in for an interrupt which happens before the ++ * restore is complete. ++ */ ++void __kmap_local_sched_out(void) ++{ ++ struct task_struct *tsk = current; ++ pte_t *kmap_pte = kmap_get_pte(); ++ int i; ++ ++ /* Clear kmaps */ ++ for (i = 0; i < tsk->kmap_ctrl.idx; i++) { ++ pte_t pteval = tsk->kmap_ctrl.pteval[i]; ++ unsigned long addr; ++ int idx; ++ ++ /* With debug all even slots are unmapped and act as guard */ ++ if (IS_ENABLED(CONFIG_DEBUG_HIGHMEM) && !(i & 0x01)) { ++ WARN_ON_ONCE(!pte_none(pteval)); ++ continue; ++ } ++ if (WARN_ON_ONCE(pte_none(pteval))) ++ continue; ++ ++ /* ++ * This is a horrible hack for XTENSA to calculate the ++ * coloured PTE index. Uses the PFN encoded into the pteval ++ * and the map index calculation because the actual mapped ++ * virtual address is not stored in task::kmap_ctrl. ++ * For any sane architecture this is optimized out. ++ */ ++ idx = arch_kmap_local_map_idx(i, pte_pfn(pteval)); ++ ++ addr = __fix_to_virt(FIX_KMAP_BEGIN + idx); ++ arch_kmap_local_pre_unmap(addr); ++ pte_clear(&init_mm, addr, kmap_pte - idx); ++ arch_kmap_local_post_unmap(addr); ++ } ++} ++ ++void __kmap_local_sched_in(void) ++{ ++ struct task_struct *tsk = current; ++ pte_t *kmap_pte = kmap_get_pte(); ++ int i; ++ ++ /* Restore kmaps */ ++ for (i = 0; i < tsk->kmap_ctrl.idx; i++) { ++ pte_t pteval = tsk->kmap_ctrl.pteval[i]; ++ unsigned long addr; ++ int idx; ++ ++ /* With debug all even slots are unmapped and act as guard */ ++ if (IS_ENABLED(CONFIG_DEBUG_HIGHMEM) && !(i & 0x01)) { ++ WARN_ON_ONCE(!pte_none(pteval)); ++ continue; ++ } ++ if (WARN_ON_ONCE(pte_none(pteval))) ++ continue; ++ ++ /* See comment in __kmap_local_sched_out() */ ++ idx = arch_kmap_local_map_idx(i, pte_pfn(pteval)); ++ addr = __fix_to_virt(FIX_KMAP_BEGIN + idx); ++ set_pte_at(&init_mm, addr, kmap_pte - idx, pteval); ++ arch_kmap_local_post_map(addr, pteval); ++ } ++} ++ ++void kmap_local_fork(struct task_struct *tsk) ++{ ++ if (WARN_ON_ONCE(tsk->kmap_ctrl.idx)) ++ memset(&tsk->kmap_ctrl, 0, sizeof(tsk->kmap_ctrl)); ++} ++ + #endif + + #if defined(HASHED_PAGE_VIRTUAL) +-- +2.43.0 + diff --git a/debian/patches-rt/0048-mm-highmem-Provide-kmap_local.patch b/debian/patches-rt/0048-mm-highmem-Provide-kmap_local.patch new file mode 100644 index 000000000..aa15ab4ec --- /dev/null +++ b/debian/patches-rt/0048-mm-highmem-Provide-kmap_local.patch @@ -0,0 +1,207 @@ +From 452c4c8536ea017ed0f82287834e7cfa2f751488 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Tue, 3 Nov 2020 10:27:37 +0100 +Subject: [PATCH 048/323] mm/highmem: Provide kmap_local* +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Now that the kmap atomic index is stored in task struct provide a +preemptible variant. On context switch the maps of an outgoing task are +removed and the map of the incoming task are restored. That's obviously +slow, but highmem is slow anyway. + +The kmap_local.*() functions can be invoked from both preemptible and +atomic context. kmap local sections disable migration to keep the resulting +virtual mapping address correct, but disable neither pagefaults nor +preemption. + +A wholesale conversion of kmap_atomic to be fully preemptible is not +possible because some of the usage sites might rely on the preemption +disable for serialization or on the implicit pagefault disable. Needs to be +done on a case by case basis. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/linux/highmem-internal.h | 48 ++++++++++++++++++++++++++++++++ + include/linux/highmem.h | 43 +++++++++++++++++----------- + mm/highmem.c | 6 ++++ + 3 files changed, 81 insertions(+), 16 deletions(-) + +diff --git a/include/linux/highmem-internal.h b/include/linux/highmem-internal.h +index c5a22177db85..1bbe96dc8be6 100644 +--- a/include/linux/highmem-internal.h ++++ b/include/linux/highmem-internal.h +@@ -68,6 +68,26 @@ static inline void kmap_flush_unused(void) + __kmap_flush_unused(); + } + ++static inline void *kmap_local_page(struct page *page) ++{ ++ return __kmap_local_page_prot(page, kmap_prot); ++} ++ ++static inline void *kmap_local_page_prot(struct page *page, pgprot_t prot) ++{ ++ return __kmap_local_page_prot(page, prot); ++} ++ ++static inline void *kmap_local_pfn(unsigned long pfn) ++{ ++ return __kmap_local_pfn_prot(pfn, kmap_prot); ++} ++ ++static inline void __kunmap_local(void *vaddr) ++{ ++ kunmap_local_indexed(vaddr); ++} ++ + static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) + { + preempt_disable(); +@@ -140,6 +160,28 @@ static inline void kunmap(struct page *page) + #endif + } + ++static inline void *kmap_local_page(struct page *page) ++{ ++ return page_address(page); ++} ++ ++static inline void *kmap_local_page_prot(struct page *page, pgprot_t prot) ++{ ++ return kmap_local_page(page); ++} ++ ++static inline void *kmap_local_pfn(unsigned long pfn) ++{ ++ return kmap_local_page(pfn_to_page(pfn)); ++} ++ ++static inline void __kunmap_local(void *addr) ++{ ++#ifdef ARCH_HAS_FLUSH_ON_KUNMAP ++ kunmap_flush_on_unmap(addr); ++#endif ++} ++ + static inline void *kmap_atomic(struct page *page) + { + preempt_disable(); +@@ -181,4 +223,10 @@ do { \ + __kunmap_atomic(__addr); \ + } while (0) + ++#define kunmap_local(__addr) \ ++do { \ ++ BUILD_BUG_ON(__same_type((__addr), struct page *)); \ ++ __kunmap_local(__addr); \ ++} while (0) ++ + #endif +diff --git a/include/linux/highmem.h b/include/linux/highmem.h +index 5c888525b4c5..7a3c6d4b79d8 100644 +--- a/include/linux/highmem.h ++++ b/include/linux/highmem.h +@@ -60,24 +60,22 @@ static inline struct page *kmap_to_page(void *addr); + static inline void kmap_flush_unused(void); + + /** +- * kmap_atomic - Atomically map a page for temporary usage ++ * kmap_local_page - Map a page for temporary usage + * @page: Pointer to the page to be mapped + * + * Returns: The virtual address of the mapping + * +- * Side effect: On return pagefaults and preemption are disabled. +- * + * Can be invoked from any context. + * + * Requires careful handling when nesting multiple mappings because the map + * management is stack based. The unmap has to be in the reverse order of + * the map operation: + * +- * addr1 = kmap_atomic(page1); +- * addr2 = kmap_atomic(page2); ++ * addr1 = kmap_local_page(page1); ++ * addr2 = kmap_local_page(page2); + * ... +- * kunmap_atomic(addr2); +- * kunmap_atomic(addr1); ++ * kunmap_local(addr2); ++ * kunmap_local(addr1); + * + * Unmapping addr1 before addr2 is invalid and causes malfunction. + * +@@ -88,10 +86,26 @@ static inline void kmap_flush_unused(void); + * virtual address of the direct mapping. Only real highmem pages are + * temporarily mapped. + * +- * While it is significantly faster than kmap() it comes with restrictions +- * about the pointer validity and the side effects of disabling page faults +- * and preemption. Use it only when absolutely necessary, e.g. from non +- * preemptible contexts. ++ * While it is significantly faster than kmap() for the higmem case it ++ * comes with restrictions about the pointer validity. Only use when really ++ * necessary. ++ * ++ * On HIGHMEM enabled systems mapping a highmem page has the side effect of ++ * disabling migration in order to keep the virtual address stable across ++ * preemption. No caller of kmap_local_page() can rely on this side effect. ++ */ ++static inline void *kmap_local_page(struct page *page); ++ ++/** ++ * kmap_atomic - Atomically map a page for temporary usage - Deprecated! ++ * @page: Pointer to the page to be mapped ++ * ++ * Returns: The virtual address of the mapping ++ * ++ * Effectively a wrapper around kmap_local_page() which disables pagefaults ++ * and preemption. ++ * ++ * Do not use in new code. Use kmap_local_page() instead. + */ + static inline void *kmap_atomic(struct page *page); + +@@ -101,12 +115,9 @@ static inline void *kmap_atomic(struct page *page); + * + * Counterpart to kmap_atomic(). + * +- * Undoes the side effects of kmap_atomic(), i.e. reenabling pagefaults and ++ * Effectively a wrapper around kunmap_local() which additionally undoes ++ * the side effects of kmap_atomic(), i.e. reenabling pagefaults and + * preemption. +- * +- * Other than that a NOOP for CONFIG_HIGHMEM=n and for mappings of pages +- * in the low memory area. For real highmen pages the mapping which was +- * established with kmap_atomic() is destroyed. + */ + + /* Highmem related interfaces for management code */ +diff --git a/mm/highmem.c b/mm/highmem.c +index d7a1c80001d0..8db577e5290c 100644 +--- a/mm/highmem.c ++++ b/mm/highmem.c +@@ -450,6 +450,11 @@ void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot) + unsigned long vaddr; + int idx; + ++ /* ++ * Disable migration so resulting virtual address is stable ++ * accross preemption. ++ */ ++ migrate_disable(); + preempt_disable(); + idx = arch_kmap_local_map_idx(kmap_local_idx_push(), pfn); + vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); +@@ -505,6 +510,7 @@ void kunmap_local_indexed(void *vaddr) + current->kmap_ctrl.pteval[kmap_local_idx()] = __pte(0); + kmap_local_idx_pop(); + preempt_enable(); ++ migrate_enable(); + } + EXPORT_SYMBOL(kunmap_local_indexed); + +-- +2.43.0 + diff --git a/debian/patches-rt/0049-io-mapping-Provide-iomap_local-variant.patch b/debian/patches-rt/0049-io-mapping-Provide-iomap_local-variant.patch new file mode 100644 index 000000000..1419b16fb --- /dev/null +++ b/debian/patches-rt/0049-io-mapping-Provide-iomap_local-variant.patch @@ -0,0 +1,179 @@ +From 223ef1707ad3b008f96df3f4d263d2cdeed3ef65 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Tue, 3 Nov 2020 10:27:38 +0100 +Subject: [PATCH 049/323] io-mapping: Provide iomap_local variant +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Similar to kmap local provide a iomap local variant which only disables +migration, but neither disables pagefaults nor preemption. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + Documentation/driver-api/io-mapping.rst | 74 +++++++++++++++---------- + include/linux/io-mapping.h | 30 +++++++++- + 2 files changed, 73 insertions(+), 31 deletions(-) + +diff --git a/Documentation/driver-api/io-mapping.rst b/Documentation/driver-api/io-mapping.rst +index e33b88268554..a0cfb15988df 100644 +--- a/Documentation/driver-api/io-mapping.rst ++++ b/Documentation/driver-api/io-mapping.rst +@@ -20,55 +20,71 @@ A mapping object is created during driver initialization using:: + mappable, while 'size' indicates how large a mapping region to + enable. Both are in bytes. + +-This _wc variant provides a mapping which may only be used +-with the io_mapping_map_atomic_wc or io_mapping_map_wc. ++This _wc variant provides a mapping which may only be used with ++io_mapping_map_atomic_wc(), io_mapping_map_local_wc() or ++io_mapping_map_wc(). + +-With this mapping object, individual pages can be mapped either atomically +-or not, depending on the necessary scheduling environment. Of course, atomic +-maps are more efficient:: ++With this mapping object, individual pages can be mapped either temporarily ++or long term, depending on the requirements. Of course, temporary maps are ++more efficient. They come in two flavours:: ++ ++ void *io_mapping_map_local_wc(struct io_mapping *mapping, ++ unsigned long offset) + + void *io_mapping_map_atomic_wc(struct io_mapping *mapping, + unsigned long offset) + +-'offset' is the offset within the defined mapping region. +-Accessing addresses beyond the region specified in the +-creation function yields undefined results. Using an offset +-which is not page aligned yields an undefined result. The +-return value points to a single page in CPU address space. ++'offset' is the offset within the defined mapping region. Accessing ++addresses beyond the region specified in the creation function yields ++undefined results. Using an offset which is not page aligned yields an ++undefined result. The return value points to a single page in CPU address ++space. + +-This _wc variant returns a write-combining map to the +-page and may only be used with mappings created by +-io_mapping_create_wc ++This _wc variant returns a write-combining map to the page and may only be ++used with mappings created by io_mapping_create_wc() + +-Note that the task may not sleep while holding this page +-mapped. ++Temporary mappings are only valid in the context of the caller. The mapping ++is not guaranteed to be globaly visible. + +-:: ++io_mapping_map_local_wc() has a side effect on X86 32bit as it disables ++migration to make the mapping code work. No caller can rely on this side ++effect. + +- void io_mapping_unmap_atomic(void *vaddr) ++io_mapping_map_atomic_wc() has the side effect of disabling preemption and ++pagefaults. Don't use in new code. Use io_mapping_map_local_wc() instead. + +-'vaddr' must be the value returned by the last +-io_mapping_map_atomic_wc call. This unmaps the specified +-page and allows the task to sleep once again. ++Nested mappings need to be undone in reverse order because the mapping ++code uses a stack for keeping track of them:: + +-If you need to sleep while holding the lock, you can use the non-atomic +-variant, although they may be significantly slower. ++ addr1 = io_mapping_map_local_wc(map1, offset1); ++ addr2 = io_mapping_map_local_wc(map2, offset2); ++ ... ++ io_mapping_unmap_local(addr2); ++ io_mapping_unmap_local(addr1); + +-:: ++The mappings are released with:: ++ ++ void io_mapping_unmap_local(void *vaddr) ++ void io_mapping_unmap_atomic(void *vaddr) ++ ++'vaddr' must be the value returned by the last io_mapping_map_local_wc() or ++io_mapping_map_atomic_wc() call. This unmaps the specified mapping and ++undoes the side effects of the mapping functions. ++ ++If you need to sleep while holding a mapping, you can use the regular ++variant, although this may be significantly slower:: + + void *io_mapping_map_wc(struct io_mapping *mapping, + unsigned long offset) + +-This works like io_mapping_map_atomic_wc except it allows +-the task to sleep while holding the page mapped. +- ++This works like io_mapping_map_atomic/local_wc() except it has no side ++effects and the pointer is globaly visible. + +-:: ++The mappings are released with:: + + void io_mapping_unmap(void *vaddr) + +-This works like io_mapping_unmap_atomic, except it is used +-for pages mapped with io_mapping_map_wc. ++Use for pages mapped with io_mapping_map_wc(). + + At driver close time, the io_mapping object must be freed:: + +diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h +index 60e7c83e4904..c093e81310a9 100644 +--- a/include/linux/io-mapping.h ++++ b/include/linux/io-mapping.h +@@ -82,6 +82,21 @@ io_mapping_unmap_atomic(void __iomem *vaddr) + preempt_enable(); + } + ++static inline void __iomem * ++io_mapping_map_local_wc(struct io_mapping *mapping, unsigned long offset) ++{ ++ resource_size_t phys_addr; ++ ++ BUG_ON(offset >= mapping->size); ++ phys_addr = mapping->base + offset; ++ return __iomap_local_pfn_prot(PHYS_PFN(phys_addr), mapping->prot); ++} ++ ++static inline void io_mapping_unmap_local(void __iomem *vaddr) ++{ ++ kunmap_local_indexed((void __force *)vaddr); ++} ++ + static inline void __iomem * + io_mapping_map_wc(struct io_mapping *mapping, + unsigned long offset, +@@ -101,7 +116,7 @@ io_mapping_unmap(void __iomem *vaddr) + iounmap(vaddr); + } + +-#else ++#else /* HAVE_ATOMIC_IOMAP */ + + #include <linux/uaccess.h> + +@@ -166,7 +181,18 @@ io_mapping_unmap_atomic(void __iomem *vaddr) + preempt_enable(); + } + +-#endif /* HAVE_ATOMIC_IOMAP */ ++static inline void __iomem * ++io_mapping_map_local_wc(struct io_mapping *mapping, unsigned long offset) ++{ ++ return io_mapping_map_wc(mapping, offset, PAGE_SIZE); ++} ++ ++static inline void io_mapping_unmap_local(void __iomem *vaddr) ++{ ++ io_mapping_unmap(vaddr); ++} ++ ++#endif /* !HAVE_ATOMIC_IOMAP */ + + static inline struct io_mapping * + io_mapping_create_wc(resource_size_t base, +-- +2.43.0 + diff --git a/debian/patches-rt/0050-x86-crashdump-32-Simplify-copy_oldmem_page.patch b/debian/patches-rt/0050-x86-crashdump-32-Simplify-copy_oldmem_page.patch new file mode 100644 index 000000000..6f57ef059 --- /dev/null +++ b/debian/patches-rt/0050-x86-crashdump-32-Simplify-copy_oldmem_page.patch @@ -0,0 +1,99 @@ +From 9d9dd47b77d859a1261ac0dc98d94018bb5bb6a0 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Tue, 3 Nov 2020 10:27:39 +0100 +Subject: [PATCH 050/323] x86/crashdump/32: Simplify copy_oldmem_page() +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Replace kmap_atomic_pfn() with kmap_local_pfn() which is preemptible and +can take page faults. + +Remove the indirection of the dump page and the related cruft which is not +longer required. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + arch/x86/kernel/crash_dump_32.c | 48 +++++++-------------------------- + 1 file changed, 10 insertions(+), 38 deletions(-) + +diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c +index 33ee47670b99..5fcac46aaf6b 100644 +--- a/arch/x86/kernel/crash_dump_32.c ++++ b/arch/x86/kernel/crash_dump_32.c +@@ -13,8 +13,6 @@ + + #include <linux/uaccess.h> + +-static void *kdump_buf_page; +- + static inline bool is_crashed_pfn_valid(unsigned long pfn) + { + #ifndef CONFIG_X86_PAE +@@ -41,15 +39,11 @@ static inline bool is_crashed_pfn_valid(unsigned long pfn) + * @userbuf: if set, @buf is in user address space, use copy_to_user(), + * otherwise @buf is in kernel address space, use memcpy(). + * +- * Copy a page from "oldmem". For this page, there is no pte mapped +- * in the current kernel. We stitch up a pte, similar to kmap_atomic. +- * +- * Calling copy_to_user() in atomic context is not desirable. Hence first +- * copying the data to a pre-allocated kernel page and then copying to user +- * space in non-atomic context. ++ * Copy a page from "oldmem". For this page, there might be no pte mapped ++ * in the current kernel. + */ +-ssize_t copy_oldmem_page(unsigned long pfn, char *buf, +- size_t csize, unsigned long offset, int userbuf) ++ssize_t copy_oldmem_page(unsigned long pfn, char *buf, size_t csize, ++ unsigned long offset, int userbuf) + { + void *vaddr; + +@@ -59,38 +53,16 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, + if (!is_crashed_pfn_valid(pfn)) + return -EFAULT; + +- vaddr = kmap_atomic_pfn(pfn); ++ vaddr = kmap_local_pfn(pfn); + + if (!userbuf) { +- memcpy(buf, (vaddr + offset), csize); +- kunmap_atomic(vaddr); ++ memcpy(buf, vaddr + offset, csize); + } else { +- if (!kdump_buf_page) { +- printk(KERN_WARNING "Kdump: Kdump buffer page not" +- " allocated\n"); +- kunmap_atomic(vaddr); +- return -EFAULT; +- } +- copy_page(kdump_buf_page, vaddr); +- kunmap_atomic(vaddr); +- if (copy_to_user(buf, (kdump_buf_page + offset), csize)) +- return -EFAULT; ++ if (copy_to_user(buf, vaddr + offset, csize)) ++ csize = -EFAULT; + } + +- return csize; +-} ++ kunmap_local(vaddr); + +-static int __init kdump_buf_page_init(void) +-{ +- int ret = 0; +- +- kdump_buf_page = kmalloc(PAGE_SIZE, GFP_KERNEL); +- if (!kdump_buf_page) { +- printk(KERN_WARNING "Kdump: Failed to allocate kdump buffer" +- " page\n"); +- ret = -ENOMEM; +- } +- +- return ret; ++ return csize; + } +-arch_initcall(kdump_buf_page_init); +-- +2.43.0 + diff --git a/debian/patches-rt/0051-mips-crashdump-Simplify-copy_oldmem_page.patch b/debian/patches-rt/0051-mips-crashdump-Simplify-copy_oldmem_page.patch new file mode 100644 index 000000000..837a4a04e --- /dev/null +++ b/debian/patches-rt/0051-mips-crashdump-Simplify-copy_oldmem_page.patch @@ -0,0 +1,95 @@ +From 1aeac707162ce43fc0ae9c45df5a8a4535de2332 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Tue, 3 Nov 2020 10:27:40 +0100 +Subject: [PATCH 051/323] mips/crashdump: Simplify copy_oldmem_page() +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Replace kmap_atomic_pfn() with kmap_local_pfn() which is preemptible and +can take page faults. + +Remove the indirection of the dump page and the related cruft which is not +longer required. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de> +Cc: linux-mips@vger.kernel.org +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + arch/mips/kernel/crash_dump.c | 42 ++++++----------------------------- + 1 file changed, 7 insertions(+), 35 deletions(-) + +diff --git a/arch/mips/kernel/crash_dump.c b/arch/mips/kernel/crash_dump.c +index 01b2bd95ba1f..9aba83e1eeb4 100644 +--- a/arch/mips/kernel/crash_dump.c ++++ b/arch/mips/kernel/crash_dump.c +@@ -5,8 +5,6 @@ + #include <linux/uaccess.h> + #include <linux/slab.h> + +-static void *kdump_buf_page; +- + /** + * copy_oldmem_page - copy one page from "oldmem" + * @pfn: page frame number to be copied +@@ -17,51 +15,25 @@ static void *kdump_buf_page; + * @userbuf: if set, @buf is in user address space, use copy_to_user(), + * otherwise @buf is in kernel address space, use memcpy(). + * +- * Copy a page from "oldmem". For this page, there is no pte mapped ++ * Copy a page from "oldmem". For this page, there might be no pte mapped + * in the current kernel. +- * +- * Calling copy_to_user() in atomic context is not desirable. Hence first +- * copying the data to a pre-allocated kernel page and then copying to user +- * space in non-atomic context. + */ +-ssize_t copy_oldmem_page(unsigned long pfn, char *buf, +- size_t csize, unsigned long offset, int userbuf) ++ssize_t copy_oldmem_page(unsigned long pfn, char *buf, size_t csize, ++ unsigned long offset, int userbuf) + { + void *vaddr; + + if (!csize) + return 0; + +- vaddr = kmap_atomic_pfn(pfn); ++ vaddr = kmap_local_pfn(pfn); + + if (!userbuf) { +- memcpy(buf, (vaddr + offset), csize); +- kunmap_atomic(vaddr); ++ memcpy(buf, vaddr + offset, csize); + } else { +- if (!kdump_buf_page) { +- pr_warn("Kdump: Kdump buffer page not allocated\n"); +- +- return -EFAULT; +- } +- copy_page(kdump_buf_page, vaddr); +- kunmap_atomic(vaddr); +- if (copy_to_user(buf, (kdump_buf_page + offset), csize)) +- return -EFAULT; ++ if (copy_to_user(buf, vaddr + offset, csize)) ++ csize = -EFAULT; + } + + return csize; + } +- +-static int __init kdump_buf_page_init(void) +-{ +- int ret = 0; +- +- kdump_buf_page = kmalloc(PAGE_SIZE, GFP_KERNEL); +- if (!kdump_buf_page) { +- pr_warn("Kdump: Failed to allocate kdump buffer page\n"); +- ret = -ENOMEM; +- } +- +- return ret; +-} +-arch_initcall(kdump_buf_page_init); +-- +2.43.0 + diff --git a/debian/patches-rt/0052-ARM-mm-Replace-kmap_atomic_pfn.patch b/debian/patches-rt/0052-ARM-mm-Replace-kmap_atomic_pfn.patch new file mode 100644 index 000000000..ba46ccc69 --- /dev/null +++ b/debian/patches-rt/0052-ARM-mm-Replace-kmap_atomic_pfn.patch @@ -0,0 +1,71 @@ +From d74735ac58543c778598c75bccd45aa527acf03c Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Tue, 3 Nov 2020 10:27:41 +0100 +Subject: [PATCH 052/323] ARM: mm: Replace kmap_atomic_pfn() +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +There is no requirement to disable pagefaults and preemption for these +cache management mappings. + +Replace kmap_atomic_pfn() with kmap_local_pfn(). This allows to remove +kmap_atomic_pfn() in the next step. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Russell King <linux@armlinux.org.uk> +Cc: linux-arm-kernel@lists.infradead.org +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + arch/arm/mm/cache-feroceon-l2.c | 6 +++--- + arch/arm/mm/cache-xsc3l2.c | 4 ++-- + 2 files changed, 5 insertions(+), 5 deletions(-) + +diff --git a/arch/arm/mm/cache-feroceon-l2.c b/arch/arm/mm/cache-feroceon-l2.c +index 5c1b7a7b9af6..87328766e910 100644 +--- a/arch/arm/mm/cache-feroceon-l2.c ++++ b/arch/arm/mm/cache-feroceon-l2.c +@@ -49,9 +49,9 @@ static inline unsigned long l2_get_va(unsigned long paddr) + * we simply install a virtual mapping for it only for the + * TLB lookup to occur, hence no need to flush the untouched + * memory mapping afterwards (note: a cache flush may happen +- * in some circumstances depending on the path taken in kunmap_atomic). ++ * in some circumstances depending on the path taken in kunmap_local). + */ +- void *vaddr = kmap_atomic_pfn(paddr >> PAGE_SHIFT); ++ void *vaddr = kmap_local_pfn(paddr >> PAGE_SHIFT); + return (unsigned long)vaddr + (paddr & ~PAGE_MASK); + #else + return __phys_to_virt(paddr); +@@ -61,7 +61,7 @@ static inline unsigned long l2_get_va(unsigned long paddr) + static inline void l2_put_va(unsigned long vaddr) + { + #ifdef CONFIG_HIGHMEM +- kunmap_atomic((void *)vaddr); ++ kunmap_local((void *)vaddr); + #endif + } + +diff --git a/arch/arm/mm/cache-xsc3l2.c b/arch/arm/mm/cache-xsc3l2.c +index d20d7af02d10..0e0a3abd8174 100644 +--- a/arch/arm/mm/cache-xsc3l2.c ++++ b/arch/arm/mm/cache-xsc3l2.c +@@ -59,7 +59,7 @@ static inline void l2_unmap_va(unsigned long va) + { + #ifdef CONFIG_HIGHMEM + if (va != -1) +- kunmap_atomic((void *)va); ++ kunmap_local((void *)va); + #endif + } + +@@ -75,7 +75,7 @@ static inline unsigned long l2_map_va(unsigned long pa, unsigned long prev_va) + * in place for it. + */ + l2_unmap_va(prev_va); +- va = (unsigned long)kmap_atomic_pfn(pa >> PAGE_SHIFT); ++ va = (unsigned long)kmap_local_pfn(pa >> PAGE_SHIFT); + } + return va + (pa_offset >> (32 - PAGE_SHIFT)); + #else +-- +2.43.0 + diff --git a/debian/patches-rt/0053-highmem-Remove-kmap_atomic_pfn.patch b/debian/patches-rt/0053-highmem-Remove-kmap_atomic_pfn.patch new file mode 100644 index 000000000..a43566a1c --- /dev/null +++ b/debian/patches-rt/0053-highmem-Remove-kmap_atomic_pfn.patch @@ -0,0 +1,47 @@ +From 766aabba108ec5697bed86d6ec633151b7f3618e Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Tue, 3 Nov 2020 10:27:42 +0100 +Subject: [PATCH 053/323] highmem: Remove kmap_atomic_pfn() +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +No more users. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/linux/highmem-internal.h | 12 ------------ + 1 file changed, 12 deletions(-) + +diff --git a/include/linux/highmem-internal.h b/include/linux/highmem-internal.h +index 1bbe96dc8be6..3590af5aad96 100644 +--- a/include/linux/highmem-internal.h ++++ b/include/linux/highmem-internal.h +@@ -100,13 +100,6 @@ static inline void *kmap_atomic(struct page *page) + return kmap_atomic_prot(page, kmap_prot); + } + +-static inline void *kmap_atomic_pfn(unsigned long pfn) +-{ +- preempt_disable(); +- pagefault_disable(); +- return __kmap_local_pfn_prot(pfn, kmap_prot); +-} +- + static inline void __kunmap_atomic(void *addr) + { + kunmap_local_indexed(addr); +@@ -194,11 +187,6 @@ static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) + return kmap_atomic(page); + } + +-static inline void *kmap_atomic_pfn(unsigned long pfn) +-{ +- return kmap_atomic(pfn_to_page(pfn)); +-} +- + static inline void __kunmap_atomic(void *addr) + { + #ifdef ARCH_HAS_FLUSH_ON_KUNMAP +-- +2.43.0 + diff --git a/debian/patches-rt/0054-drm-ttm-Replace-kmap_atomic-usage.patch b/debian/patches-rt/0054-drm-ttm-Replace-kmap_atomic-usage.patch new file mode 100644 index 000000000..3043aa642 --- /dev/null +++ b/debian/patches-rt/0054-drm-ttm-Replace-kmap_atomic-usage.patch @@ -0,0 +1,74 @@ +From 818c332a89ac4310c59f7ad122bdb4a6e0aac518 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Tue, 3 Nov 2020 10:27:43 +0100 +Subject: [PATCH 054/323] drm/ttm: Replace kmap_atomic() usage +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +There is no reason to disable pagefaults and preemption as a side effect of +kmap_atomic_prot(). + +Use kmap_local_page_prot() instead and document the reasoning for the +mapping usage with the given pgprot. + +Remove the NULL pointer check for the map. These functions return a valid +address for valid pages and the return was bogus anyway as it would have +left preemption and pagefaults disabled. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Christian Koenig <christian.koenig@amd.com> +Cc: Huang Rui <ray.huang@amd.com> +Cc: David Airlie <airlied@linux.ie> +Cc: Daniel Vetter <daniel@ffwll.ch> +Cc: dri-devel@lists.freedesktop.org +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + drivers/gpu/drm/ttm/ttm_bo_util.c | 20 ++++++++++++-------- + 1 file changed, 12 insertions(+), 8 deletions(-) + +diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.c b/drivers/gpu/drm/ttm/ttm_bo_util.c +index 1968df9743fc..79b9c883431a 100644 +--- a/drivers/gpu/drm/ttm/ttm_bo_util.c ++++ b/drivers/gpu/drm/ttm/ttm_bo_util.c +@@ -181,13 +181,15 @@ static int ttm_copy_io_ttm_page(struct ttm_tt *ttm, void *src, + return -ENOMEM; + + src = (void *)((unsigned long)src + (page << PAGE_SHIFT)); +- dst = kmap_atomic_prot(d, prot); +- if (!dst) +- return -ENOMEM; ++ /* ++ * Ensure that a highmem page is mapped with the correct ++ * pgprot. For non highmem the mapping is already there. ++ */ ++ dst = kmap_local_page_prot(d, prot); + + memcpy_fromio(dst, src, PAGE_SIZE); + +- kunmap_atomic(dst); ++ kunmap_local(dst); + + return 0; + } +@@ -203,13 +205,15 @@ static int ttm_copy_ttm_io_page(struct ttm_tt *ttm, void *dst, + return -ENOMEM; + + dst = (void *)((unsigned long)dst + (page << PAGE_SHIFT)); +- src = kmap_atomic_prot(s, prot); +- if (!src) +- return -ENOMEM; ++ /* ++ * Ensure that a highmem page is mapped with the correct ++ * pgprot. For non highmem the mapping is already there. ++ */ ++ src = kmap_local_page_prot(s, prot); + + memcpy_toio(dst, src, PAGE_SIZE); + +- kunmap_atomic(src); ++ kunmap_local(src); + + return 0; + } +-- +2.43.0 + diff --git a/debian/patches-rt/0055-drm-vmgfx-Replace-kmap_atomic.patch b/debian/patches-rt/0055-drm-vmgfx-Replace-kmap_atomic.patch new file mode 100644 index 000000000..613e94a91 --- /dev/null +++ b/debian/patches-rt/0055-drm-vmgfx-Replace-kmap_atomic.patch @@ -0,0 +1,104 @@ +From 3c21a2a16d681c2be78f257f660f1c8c1b50f7c3 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Tue, 3 Nov 2020 10:27:44 +0100 +Subject: [PATCH 055/323] drm/vmgfx: Replace kmap_atomic() +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +There is no reason to disable pagefaults and preemption as a side effect of +kmap_atomic_prot(). + +Use kmap_local_page_prot() instead and document the reasoning for the +mapping usage with the given pgprot. + +Remove the NULL pointer check for the map. These functions return a valid +address for valid pages and the return was bogus anyway as it would have +left preemption and pagefaults disabled. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: VMware Graphics <linux-graphics-maintainer@vmware.com> +Cc: Roland Scheidegger <sroland@vmware.com> +Cc: David Airlie <airlied@linux.ie> +Cc: Daniel Vetter <daniel@ffwll.ch> +Cc: dri-devel@lists.freedesktop.org +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + drivers/gpu/drm/vmwgfx/vmwgfx_blit.c | 30 +++++++++++----------------- + 1 file changed, 12 insertions(+), 18 deletions(-) + +diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_blit.c b/drivers/gpu/drm/vmwgfx/vmwgfx_blit.c +index e8d66182cd7b..71dba228f68e 100644 +--- a/drivers/gpu/drm/vmwgfx/vmwgfx_blit.c ++++ b/drivers/gpu/drm/vmwgfx/vmwgfx_blit.c +@@ -375,12 +375,12 @@ static int vmw_bo_cpu_blit_line(struct vmw_bo_blit_line_data *d, + copy_size = min_t(u32, copy_size, PAGE_SIZE - src_page_offset); + + if (unmap_src) { +- kunmap_atomic(d->src_addr); ++ kunmap_local(d->src_addr); + d->src_addr = NULL; + } + + if (unmap_dst) { +- kunmap_atomic(d->dst_addr); ++ kunmap_local(d->dst_addr); + d->dst_addr = NULL; + } + +@@ -388,12 +388,8 @@ static int vmw_bo_cpu_blit_line(struct vmw_bo_blit_line_data *d, + if (WARN_ON_ONCE(dst_page >= d->dst_num_pages)) + return -EINVAL; + +- d->dst_addr = +- kmap_atomic_prot(d->dst_pages[dst_page], +- d->dst_prot); +- if (!d->dst_addr) +- return -ENOMEM; +- ++ d->dst_addr = kmap_local_page_prot(d->dst_pages[dst_page], ++ d->dst_prot); + d->mapped_dst = dst_page; + } + +@@ -401,12 +397,8 @@ static int vmw_bo_cpu_blit_line(struct vmw_bo_blit_line_data *d, + if (WARN_ON_ONCE(src_page >= d->src_num_pages)) + return -EINVAL; + +- d->src_addr = +- kmap_atomic_prot(d->src_pages[src_page], +- d->src_prot); +- if (!d->src_addr) +- return -ENOMEM; +- ++ d->src_addr = kmap_local_page_prot(d->src_pages[src_page], ++ d->src_prot); + d->mapped_src = src_page; + } + diff->do_cpy(diff, d->dst_addr + dst_page_offset, +@@ -436,8 +428,10 @@ static int vmw_bo_cpu_blit_line(struct vmw_bo_blit_line_data *d, + * + * Performs a CPU blit from one buffer object to another avoiding a full + * bo vmap which may exhaust- or fragment vmalloc space. +- * On supported architectures (x86), we're using kmap_atomic which avoids +- * cross-processor TLB- and cache flushes and may, on non-HIGHMEM systems ++ * ++ * On supported architectures (x86), we're using kmap_local_prot() which ++ * avoids cross-processor TLB- and cache flushes. kmap_local_prot() will ++ * either map a highmem page with the proper pgprot on HIGHMEM=y systems or + * reference already set-up mappings. + * + * Neither of the buffer objects may be placed in PCI memory +@@ -500,9 +494,9 @@ int vmw_bo_cpu_blit(struct ttm_buffer_object *dst, + } + out: + if (d.src_addr) +- kunmap_atomic(d.src_addr); ++ kunmap_local(d.src_addr); + if (d.dst_addr) +- kunmap_atomic(d.dst_addr); ++ kunmap_local(d.dst_addr); + + return ret; + } +-- +2.43.0 + diff --git a/debian/patches-rt/0056-highmem-Remove-kmap_atomic_prot.patch b/debian/patches-rt/0056-highmem-Remove-kmap_atomic_prot.patch new file mode 100644 index 000000000..f9038b30b --- /dev/null +++ b/debian/patches-rt/0056-highmem-Remove-kmap_atomic_prot.patch @@ -0,0 +1,52 @@ +From acac3af33fa7522e3be0f030ec1d80cca2cf215d Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Tue, 3 Nov 2020 10:27:45 +0100 +Subject: [PATCH 056/323] highmem: Remove kmap_atomic_prot() +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +No more users. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/linux/highmem-internal.h | 14 ++------------ + 1 file changed, 2 insertions(+), 12 deletions(-) + +diff --git a/include/linux/highmem-internal.h b/include/linux/highmem-internal.h +index 3590af5aad96..bd15bf9164c2 100644 +--- a/include/linux/highmem-internal.h ++++ b/include/linux/highmem-internal.h +@@ -88,16 +88,11 @@ static inline void __kunmap_local(void *vaddr) + kunmap_local_indexed(vaddr); + } + +-static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) ++static inline void *kmap_atomic(struct page *page) + { + preempt_disable(); + pagefault_disable(); +- return __kmap_local_page_prot(page, prot); +-} +- +-static inline void *kmap_atomic(struct page *page) +-{ +- return kmap_atomic_prot(page, kmap_prot); ++ return __kmap_local_page_prot(page, kmap_prot); + } + + static inline void __kunmap_atomic(void *addr) +@@ -182,11 +177,6 @@ static inline void *kmap_atomic(struct page *page) + return page_address(page); + } + +-static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) +-{ +- return kmap_atomic(page); +-} +- + static inline void __kunmap_atomic(void *addr) + { + #ifdef ARCH_HAS_FLUSH_ON_KUNMAP +-- +2.43.0 + diff --git a/debian/patches-rt/0057-drm-qxl-Replace-io_mapping_map_atomic_wc.patch b/debian/patches-rt/0057-drm-qxl-Replace-io_mapping_map_atomic_wc.patch new file mode 100644 index 000000000..3d78f15fd --- /dev/null +++ b/debian/patches-rt/0057-drm-qxl-Replace-io_mapping_map_atomic_wc.patch @@ -0,0 +1,257 @@ +From 2c90fccccea4cffb6ab5cd779da693cfe82194b8 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Tue, 3 Nov 2020 10:27:46 +0100 +Subject: [PATCH 057/323] drm/qxl: Replace io_mapping_map_atomic_wc() +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +None of these mapping requires the side effect of disabling pagefaults and +preemption. + +Use io_mapping_map_local_wc() instead, rename the related functions +accordingly and clean up qxl_process_single_command() to use a plain +copy_from_user() as the local maps are not disabling pagefaults. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Dave Airlie <airlied@redhat.com> +Cc: Gerd Hoffmann <kraxel@redhat.com> +Cc: David Airlie <airlied@linux.ie> +Cc: Daniel Vetter <daniel@ffwll.ch> +Cc: virtualization@lists.linux-foundation.org +Cc: spice-devel@lists.freedesktop.org +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + drivers/gpu/drm/qxl/qxl_image.c | 18 +++++++++--------- + drivers/gpu/drm/qxl/qxl_ioctl.c | 27 +++++++++++++-------------- + drivers/gpu/drm/qxl/qxl_object.c | 12 ++++++------ + drivers/gpu/drm/qxl/qxl_object.h | 4 ++-- + drivers/gpu/drm/qxl/qxl_release.c | 4 ++-- + 5 files changed, 32 insertions(+), 33 deletions(-) + +diff --git a/drivers/gpu/drm/qxl/qxl_image.c b/drivers/gpu/drm/qxl/qxl_image.c +index 60ab7151b84d..93f92ccd42e5 100644 +--- a/drivers/gpu/drm/qxl/qxl_image.c ++++ b/drivers/gpu/drm/qxl/qxl_image.c +@@ -124,12 +124,12 @@ qxl_image_init_helper(struct qxl_device *qdev, + wrong (check the bitmaps are sent correctly + first) */ + +- ptr = qxl_bo_kmap_atomic_page(qdev, chunk_bo, 0); ++ ptr = qxl_bo_kmap_local_page(qdev, chunk_bo, 0); + chunk = ptr; + chunk->data_size = height * chunk_stride; + chunk->prev_chunk = 0; + chunk->next_chunk = 0; +- qxl_bo_kunmap_atomic_page(qdev, chunk_bo, ptr); ++ qxl_bo_kunmap_local_page(qdev, chunk_bo, ptr); + + { + void *k_data, *i_data; +@@ -143,7 +143,7 @@ qxl_image_init_helper(struct qxl_device *qdev, + i_data = (void *)data; + + while (remain > 0) { +- ptr = qxl_bo_kmap_atomic_page(qdev, chunk_bo, page << PAGE_SHIFT); ++ ptr = qxl_bo_kmap_local_page(qdev, chunk_bo, page << PAGE_SHIFT); + + if (page == 0) { + chunk = ptr; +@@ -157,7 +157,7 @@ qxl_image_init_helper(struct qxl_device *qdev, + + memcpy(k_data, i_data, size); + +- qxl_bo_kunmap_atomic_page(qdev, chunk_bo, ptr); ++ qxl_bo_kunmap_local_page(qdev, chunk_bo, ptr); + i_data += size; + remain -= size; + page++; +@@ -175,10 +175,10 @@ qxl_image_init_helper(struct qxl_device *qdev, + page_offset = offset_in_page(out_offset); + size = min((int)(PAGE_SIZE - page_offset), remain); + +- ptr = qxl_bo_kmap_atomic_page(qdev, chunk_bo, page_base); ++ ptr = qxl_bo_kmap_local_page(qdev, chunk_bo, page_base); + k_data = ptr + page_offset; + memcpy(k_data, i_data, size); +- qxl_bo_kunmap_atomic_page(qdev, chunk_bo, ptr); ++ qxl_bo_kunmap_local_page(qdev, chunk_bo, ptr); + remain -= size; + i_data += size; + out_offset += size; +@@ -189,7 +189,7 @@ qxl_image_init_helper(struct qxl_device *qdev, + qxl_bo_kunmap(chunk_bo); + + image_bo = dimage->bo; +- ptr = qxl_bo_kmap_atomic_page(qdev, image_bo, 0); ++ ptr = qxl_bo_kmap_local_page(qdev, image_bo, 0); + image = ptr; + + image->descriptor.id = 0; +@@ -212,7 +212,7 @@ qxl_image_init_helper(struct qxl_device *qdev, + break; + default: + DRM_ERROR("unsupported image bit depth\n"); +- qxl_bo_kunmap_atomic_page(qdev, image_bo, ptr); ++ qxl_bo_kunmap_local_page(qdev, image_bo, ptr); + return -EINVAL; + } + image->u.bitmap.flags = QXL_BITMAP_TOP_DOWN; +@@ -222,7 +222,7 @@ qxl_image_init_helper(struct qxl_device *qdev, + image->u.bitmap.palette = 0; + image->u.bitmap.data = qxl_bo_physical_address(qdev, chunk_bo, 0); + +- qxl_bo_kunmap_atomic_page(qdev, image_bo, ptr); ++ qxl_bo_kunmap_local_page(qdev, image_bo, ptr); + + return 0; + } +diff --git a/drivers/gpu/drm/qxl/qxl_ioctl.c b/drivers/gpu/drm/qxl/qxl_ioctl.c +index 5cea6eea72ab..785023081b79 100644 +--- a/drivers/gpu/drm/qxl/qxl_ioctl.c ++++ b/drivers/gpu/drm/qxl/qxl_ioctl.c +@@ -89,11 +89,11 @@ apply_reloc(struct qxl_device *qdev, struct qxl_reloc_info *info) + { + void *reloc_page; + +- reloc_page = qxl_bo_kmap_atomic_page(qdev, info->dst_bo, info->dst_offset & PAGE_MASK); ++ reloc_page = qxl_bo_kmap_local_page(qdev, info->dst_bo, info->dst_offset & PAGE_MASK); + *(uint64_t *)(reloc_page + (info->dst_offset & ~PAGE_MASK)) = qxl_bo_physical_address(qdev, + info->src_bo, + info->src_offset); +- qxl_bo_kunmap_atomic_page(qdev, info->dst_bo, reloc_page); ++ qxl_bo_kunmap_local_page(qdev, info->dst_bo, reloc_page); + } + + static void +@@ -105,9 +105,9 @@ apply_surf_reloc(struct qxl_device *qdev, struct qxl_reloc_info *info) + if (info->src_bo && !info->src_bo->is_primary) + id = info->src_bo->surface_id; + +- reloc_page = qxl_bo_kmap_atomic_page(qdev, info->dst_bo, info->dst_offset & PAGE_MASK); ++ reloc_page = qxl_bo_kmap_local_page(qdev, info->dst_bo, info->dst_offset & PAGE_MASK); + *(uint32_t *)(reloc_page + (info->dst_offset & ~PAGE_MASK)) = id; +- qxl_bo_kunmap_atomic_page(qdev, info->dst_bo, reloc_page); ++ qxl_bo_kunmap_local_page(qdev, info->dst_bo, reloc_page); + } + + /* return holding the reference to this object */ +@@ -149,7 +149,6 @@ static int qxl_process_single_command(struct qxl_device *qdev, + struct qxl_bo *cmd_bo; + void *fb_cmd; + int i, ret, num_relocs; +- int unwritten; + + switch (cmd->type) { + case QXL_CMD_DRAW: +@@ -185,21 +184,21 @@ static int qxl_process_single_command(struct qxl_device *qdev, + goto out_free_reloc; + + /* TODO copy slow path code from i915 */ +- fb_cmd = qxl_bo_kmap_atomic_page(qdev, cmd_bo, (release->release_offset & PAGE_MASK)); +- unwritten = __copy_from_user_inatomic_nocache +- (fb_cmd + sizeof(union qxl_release_info) + (release->release_offset & ~PAGE_MASK), +- u64_to_user_ptr(cmd->command), cmd->command_size); ++ fb_cmd = qxl_bo_kmap_local_page(qdev, cmd_bo, (release->release_offset & PAGE_MASK)); + +- { ++ if (copy_from_user(fb_cmd + sizeof(union qxl_release_info) + ++ (release->release_offset & ~PAGE_MASK), ++ u64_to_user_ptr(cmd->command), cmd->command_size)) { ++ ret = -EFAULT; ++ } else { + struct qxl_drawable *draw = fb_cmd; + + draw->mm_time = qdev->rom->mm_clock; + } + +- qxl_bo_kunmap_atomic_page(qdev, cmd_bo, fb_cmd); +- if (unwritten) { +- DRM_ERROR("got unwritten %d\n", unwritten); +- ret = -EFAULT; ++ qxl_bo_kunmap_local_page(qdev, cmd_bo, fb_cmd); ++ if (ret) { ++ DRM_ERROR("copy from user failed %d\n", ret); + goto out_free_release; + } + +diff --git a/drivers/gpu/drm/qxl/qxl_object.c b/drivers/gpu/drm/qxl/qxl_object.c +index 544a9e4df2a8..5ee5171d46ef 100644 +--- a/drivers/gpu/drm/qxl/qxl_object.c ++++ b/drivers/gpu/drm/qxl/qxl_object.c +@@ -173,8 +173,8 @@ int qxl_bo_kmap(struct qxl_bo *bo, void **ptr) + return 0; + } + +-void *qxl_bo_kmap_atomic_page(struct qxl_device *qdev, +- struct qxl_bo *bo, int page_offset) ++void *qxl_bo_kmap_local_page(struct qxl_device *qdev, ++ struct qxl_bo *bo, int page_offset) + { + unsigned long offset; + void *rptr; +@@ -189,7 +189,7 @@ void *qxl_bo_kmap_atomic_page(struct qxl_device *qdev, + goto fallback; + + offset = bo->tbo.mem.start << PAGE_SHIFT; +- return io_mapping_map_atomic_wc(map, offset + page_offset); ++ return io_mapping_map_local_wc(map, offset + page_offset); + fallback: + if (bo->kptr) { + rptr = bo->kptr + (page_offset * PAGE_SIZE); +@@ -215,14 +215,14 @@ void qxl_bo_kunmap(struct qxl_bo *bo) + ttm_bo_kunmap(&bo->kmap); + } + +-void qxl_bo_kunmap_atomic_page(struct qxl_device *qdev, +- struct qxl_bo *bo, void *pmap) ++void qxl_bo_kunmap_local_page(struct qxl_device *qdev, ++ struct qxl_bo *bo, void *pmap) + { + if ((bo->tbo.mem.mem_type != TTM_PL_VRAM) && + (bo->tbo.mem.mem_type != TTM_PL_PRIV)) + goto fallback; + +- io_mapping_unmap_atomic(pmap); ++ io_mapping_unmap_local(pmap); + return; + fallback: + qxl_bo_kunmap(bo); +diff --git a/drivers/gpu/drm/qxl/qxl_object.h b/drivers/gpu/drm/qxl/qxl_object.h +index 5762ea40d047..6ae89b1b36f4 100644 +--- a/drivers/gpu/drm/qxl/qxl_object.h ++++ b/drivers/gpu/drm/qxl/qxl_object.h +@@ -89,8 +89,8 @@ extern int qxl_bo_create(struct qxl_device *qdev, + struct qxl_bo **bo_ptr); + extern int qxl_bo_kmap(struct qxl_bo *bo, void **ptr); + extern void qxl_bo_kunmap(struct qxl_bo *bo); +-void *qxl_bo_kmap_atomic_page(struct qxl_device *qdev, struct qxl_bo *bo, int page_offset); +-void qxl_bo_kunmap_atomic_page(struct qxl_device *qdev, struct qxl_bo *bo, void *map); ++void *qxl_bo_kmap_local_page(struct qxl_device *qdev, struct qxl_bo *bo, int page_offset); ++void qxl_bo_kunmap_local_page(struct qxl_device *qdev, struct qxl_bo *bo, void *map); + extern struct qxl_bo *qxl_bo_ref(struct qxl_bo *bo); + extern void qxl_bo_unref(struct qxl_bo **bo); + extern int qxl_bo_pin(struct qxl_bo *bo); +diff --git a/drivers/gpu/drm/qxl/qxl_release.c b/drivers/gpu/drm/qxl/qxl_release.c +index b2a475a0ca4a..b665a33b449b 100644 +--- a/drivers/gpu/drm/qxl/qxl_release.c ++++ b/drivers/gpu/drm/qxl/qxl_release.c +@@ -414,7 +414,7 @@ union qxl_release_info *qxl_release_map(struct qxl_device *qdev, + union qxl_release_info *info; + struct qxl_bo *bo = release->release_bo; + +- ptr = qxl_bo_kmap_atomic_page(qdev, bo, release->release_offset & PAGE_MASK); ++ ptr = qxl_bo_kmap_local_page(qdev, bo, release->release_offset & PAGE_MASK); + if (!ptr) + return NULL; + info = ptr + (release->release_offset & ~PAGE_MASK); +@@ -429,7 +429,7 @@ void qxl_release_unmap(struct qxl_device *qdev, + void *ptr; + + ptr = ((void *)info) - (release->release_offset & ~PAGE_MASK); +- qxl_bo_kunmap_atomic_page(qdev, bo, ptr); ++ qxl_bo_kunmap_local_page(qdev, bo, ptr); + } + + void qxl_release_fence_buffer_objects(struct qxl_release *release) +-- +2.43.0 + diff --git a/debian/patches-rt/0058-drm-nouveau-device-Replace-io_mapping_map_atomic_wc.patch b/debian/patches-rt/0058-drm-nouveau-device-Replace-io_mapping_map_atomic_wc.patch new file mode 100644 index 000000000..1fbd4e19c --- /dev/null +++ b/debian/patches-rt/0058-drm-nouveau-device-Replace-io_mapping_map_atomic_wc.patch @@ -0,0 +1,54 @@ +From fa246d15bcd167ad99886a776912530c0ca8d6a7 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Tue, 3 Nov 2020 10:27:47 +0100 +Subject: [PATCH 058/323] drm/nouveau/device: Replace + io_mapping_map_atomic_wc() +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Neither fbmem_peek() nor fbmem_poke() require to disable pagefaults and +preemption as a side effect of io_mapping_map_atomic_wc(). + +Use io_mapping_map_local_wc() instead. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Ben Skeggs <bskeggs@redhat.com> +Cc: David Airlie <airlied@linux.ie> +Cc: Daniel Vetter <daniel@ffwll.ch> +Cc: dri-devel@lists.freedesktop.org +Cc: nouveau@lists.freedesktop.org +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + drivers/gpu/drm/nouveau/nvkm/subdev/devinit/fbmem.h | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/fbmem.h b/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/fbmem.h +index 6c5bbff12eb4..411f91ee20fa 100644 +--- a/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/fbmem.h ++++ b/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/fbmem.h +@@ -60,19 +60,19 @@ fbmem_fini(struct io_mapping *fb) + static inline u32 + fbmem_peek(struct io_mapping *fb, u32 off) + { +- u8 __iomem *p = io_mapping_map_atomic_wc(fb, off & PAGE_MASK); ++ u8 __iomem *p = io_mapping_map_local_wc(fb, off & PAGE_MASK); + u32 val = ioread32(p + (off & ~PAGE_MASK)); +- io_mapping_unmap_atomic(p); ++ io_mapping_unmap_local(p); + return val; + } + + static inline void + fbmem_poke(struct io_mapping *fb, u32 off, u32 val) + { +- u8 __iomem *p = io_mapping_map_atomic_wc(fb, off & PAGE_MASK); ++ u8 __iomem *p = io_mapping_map_local_wc(fb, off & PAGE_MASK); + iowrite32(val, p + (off & ~PAGE_MASK)); + wmb(); +- io_mapping_unmap_atomic(p); ++ io_mapping_unmap_local(p); + } + + static inline bool +-- +2.43.0 + diff --git a/debian/patches-rt/0059-drm-i915-Replace-io_mapping_map_atomic_wc.patch b/debian/patches-rt/0059-drm-i915-Replace-io_mapping_map_atomic_wc.patch new file mode 100644 index 000000000..91fbfdde2 --- /dev/null +++ b/debian/patches-rt/0059-drm-i915-Replace-io_mapping_map_atomic_wc.patch @@ -0,0 +1,173 @@ +From 012c96110910f68af41e3914aabb92283a885215 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Tue, 3 Nov 2020 10:27:48 +0100 +Subject: [PATCH 059/323] drm/i915: Replace io_mapping_map_atomic_wc() +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +None of these mapping requires the side effect of disabling pagefaults and +preemption. + +Use io_mapping_map_local_wc() instead, and clean up gtt_user_read() and +gtt_user_write() to use a plain copy_from_user() as the local maps are not +disabling pagefaults. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Jani Nikula <jani.nikula@linux.intel.com> +Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> +Cc: Rodrigo Vivi <rodrigo.vivi@intel.com> +Cc: David Airlie <airlied@linux.ie> +Cc: Daniel Vetter <daniel@ffwll.ch> +Cc: intel-gfx@lists.freedesktop.org +Cc: dri-devel@lists.freedesktop.org +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + .../gpu/drm/i915/gem/i915_gem_execbuffer.c | 7 ++-- + drivers/gpu/drm/i915/i915_gem.c | 40 ++++++------------- + drivers/gpu/drm/i915/selftests/i915_gem.c | 4 +- + drivers/gpu/drm/i915/selftests/i915_gem_gtt.c | 8 ++-- + 4 files changed, 22 insertions(+), 37 deletions(-) + +diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c +index 0c083af5a59d..2abf043d3d9d 100644 +--- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c ++++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c +@@ -1080,7 +1080,7 @@ static void reloc_cache_reset(struct reloc_cache *cache, struct i915_execbuffer + struct i915_ggtt *ggtt = cache_to_ggtt(cache); + + intel_gt_flush_ggtt_writes(ggtt->vm.gt); +- io_mapping_unmap_atomic((void __iomem *)vaddr); ++ io_mapping_unmap_local((void __iomem *)vaddr); + + if (drm_mm_node_allocated(&cache->node)) { + ggtt->vm.clear_range(&ggtt->vm, +@@ -1146,7 +1146,7 @@ static void *reloc_iomap(struct drm_i915_gem_object *obj, + + if (cache->vaddr) { + intel_gt_flush_ggtt_writes(ggtt->vm.gt); +- io_mapping_unmap_atomic((void __force __iomem *) unmask_page(cache->vaddr)); ++ io_mapping_unmap_local((void __force __iomem *) unmask_page(cache->vaddr)); + } else { + struct i915_vma *vma; + int err; +@@ -1194,8 +1194,7 @@ static void *reloc_iomap(struct drm_i915_gem_object *obj, + offset += page << PAGE_SHIFT; + } + +- vaddr = (void __force *)io_mapping_map_atomic_wc(&ggtt->iomap, +- offset); ++ vaddr = (void __force *)io_mapping_map_local_wc(&ggtt->iomap, offset); + cache->page = page; + cache->vaddr = (unsigned long)vaddr; + +diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c +index 58276694c848..88944c3b1bc8 100644 +--- a/drivers/gpu/drm/i915/i915_gem.c ++++ b/drivers/gpu/drm/i915/i915_gem.c +@@ -355,22 +355,15 @@ gtt_user_read(struct io_mapping *mapping, + char __user *user_data, int length) + { + void __iomem *vaddr; +- unsigned long unwritten; ++ bool fail = false; + + /* We can use the cpu mem copy function because this is X86. */ +- vaddr = io_mapping_map_atomic_wc(mapping, base); +- unwritten = __copy_to_user_inatomic(user_data, +- (void __force *)vaddr + offset, +- length); +- io_mapping_unmap_atomic(vaddr); +- if (unwritten) { +- vaddr = io_mapping_map_wc(mapping, base, PAGE_SIZE); +- unwritten = copy_to_user(user_data, +- (void __force *)vaddr + offset, +- length); +- io_mapping_unmap(vaddr); +- } +- return unwritten; ++ vaddr = io_mapping_map_local_wc(mapping, base); ++ if (copy_to_user(user_data, (void __force *)vaddr + offset, length)) ++ fail = true; ++ io_mapping_unmap_local(vaddr); ++ ++ return fail; + } + + static int +@@ -539,21 +532,14 @@ ggtt_write(struct io_mapping *mapping, + char __user *user_data, int length) + { + void __iomem *vaddr; +- unsigned long unwritten; ++ bool fail = false; + + /* We can use the cpu mem copy function because this is X86. */ +- vaddr = io_mapping_map_atomic_wc(mapping, base); +- unwritten = __copy_from_user_inatomic_nocache((void __force *)vaddr + offset, +- user_data, length); +- io_mapping_unmap_atomic(vaddr); +- if (unwritten) { +- vaddr = io_mapping_map_wc(mapping, base, PAGE_SIZE); +- unwritten = copy_from_user((void __force *)vaddr + offset, +- user_data, length); +- io_mapping_unmap(vaddr); +- } +- +- return unwritten; ++ vaddr = io_mapping_map_local_wc(mapping, base); ++ if (copy_from_user((void __force *)vaddr + offset, user_data, length)) ++ fail = true; ++ io_mapping_unmap_local(vaddr); ++ return fail; + } + + /** +diff --git a/drivers/gpu/drm/i915/selftests/i915_gem.c b/drivers/gpu/drm/i915/selftests/i915_gem.c +index 412e21604a05..432493183d20 100644 +--- a/drivers/gpu/drm/i915/selftests/i915_gem.c ++++ b/drivers/gpu/drm/i915/selftests/i915_gem.c +@@ -57,12 +57,12 @@ static void trash_stolen(struct drm_i915_private *i915) + + ggtt->vm.insert_page(&ggtt->vm, dma, slot, I915_CACHE_NONE, 0); + +- s = io_mapping_map_atomic_wc(&ggtt->iomap, slot); ++ s = io_mapping_map_local_wc(&ggtt->iomap, slot); + for (x = 0; x < PAGE_SIZE / sizeof(u32); x++) { + prng = next_pseudo_random32(prng); + iowrite32(prng, &s[x]); + } +- io_mapping_unmap_atomic(s); ++ io_mapping_unmap_local(s); + } + + ggtt->vm.clear_range(&ggtt->vm, slot, PAGE_SIZE); +diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c b/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c +index 65e28c4cd4ce..ca483285f267 100644 +--- a/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c ++++ b/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c +@@ -1201,9 +1201,9 @@ static int igt_ggtt_page(void *arg) + u64 offset = tmp.start + order[n] * PAGE_SIZE; + u32 __iomem *vaddr; + +- vaddr = io_mapping_map_atomic_wc(&ggtt->iomap, offset); ++ vaddr = io_mapping_map_local_wc(&ggtt->iomap, offset); + iowrite32(n, vaddr + n); +- io_mapping_unmap_atomic(vaddr); ++ io_mapping_unmap_local(vaddr); + } + intel_gt_flush_ggtt_writes(ggtt->vm.gt); + +@@ -1213,9 +1213,9 @@ static int igt_ggtt_page(void *arg) + u32 __iomem *vaddr; + u32 val; + +- vaddr = io_mapping_map_atomic_wc(&ggtt->iomap, offset); ++ vaddr = io_mapping_map_local_wc(&ggtt->iomap, offset); + val = ioread32(vaddr + n); +- io_mapping_unmap_atomic(vaddr); ++ io_mapping_unmap_local(vaddr); + + if (val != n) { + pr_err("insert page failed: found %d, expected %d\n", +-- +2.43.0 + diff --git a/debian/patches-rt/0060-io-mapping-Remove-io_mapping_map_atomic_wc.patch b/debian/patches-rt/0060-io-mapping-Remove-io_mapping_map_atomic_wc.patch new file mode 100644 index 000000000..f8374350f --- /dev/null +++ b/debian/patches-rt/0060-io-mapping-Remove-io_mapping_map_atomic_wc.patch @@ -0,0 +1,140 @@ +From 925f92c53f78436e9242942d1a31a1ecc61ba740 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Tue, 3 Nov 2020 10:27:49 +0100 +Subject: [PATCH 060/323] io-mapping: Remove io_mapping_map_atomic_wc() +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +No more users. Get rid of it and remove the traces in documentation. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + Documentation/driver-api/io-mapping.rst | 22 +++++-------- + include/linux/io-mapping.h | 42 ++----------------------- + 2 files changed, 9 insertions(+), 55 deletions(-) + +diff --git a/Documentation/driver-api/io-mapping.rst b/Documentation/driver-api/io-mapping.rst +index a0cfb15988df..a7830c59481f 100644 +--- a/Documentation/driver-api/io-mapping.rst ++++ b/Documentation/driver-api/io-mapping.rst +@@ -21,19 +21,15 @@ mappable, while 'size' indicates how large a mapping region to + enable. Both are in bytes. + + This _wc variant provides a mapping which may only be used with +-io_mapping_map_atomic_wc(), io_mapping_map_local_wc() or +-io_mapping_map_wc(). ++io_mapping_map_local_wc() or io_mapping_map_wc(). + + With this mapping object, individual pages can be mapped either temporarily + or long term, depending on the requirements. Of course, temporary maps are +-more efficient. They come in two flavours:: ++more efficient. + + void *io_mapping_map_local_wc(struct io_mapping *mapping, + unsigned long offset) + +- void *io_mapping_map_atomic_wc(struct io_mapping *mapping, +- unsigned long offset) +- + 'offset' is the offset within the defined mapping region. Accessing + addresses beyond the region specified in the creation function yields + undefined results. Using an offset which is not page aligned yields an +@@ -50,9 +46,6 @@ io_mapping_map_local_wc() has a side effect on X86 32bit as it disables + migration to make the mapping code work. No caller can rely on this side + effect. + +-io_mapping_map_atomic_wc() has the side effect of disabling preemption and +-pagefaults. Don't use in new code. Use io_mapping_map_local_wc() instead. +- + Nested mappings need to be undone in reverse order because the mapping + code uses a stack for keeping track of them:: + +@@ -65,11 +58,10 @@ code uses a stack for keeping track of them:: + The mappings are released with:: + + void io_mapping_unmap_local(void *vaddr) +- void io_mapping_unmap_atomic(void *vaddr) + +-'vaddr' must be the value returned by the last io_mapping_map_local_wc() or +-io_mapping_map_atomic_wc() call. This unmaps the specified mapping and +-undoes the side effects of the mapping functions. ++'vaddr' must be the value returned by the last io_mapping_map_local_wc() ++call. This unmaps the specified mapping and undoes eventual side effects of ++the mapping function. + + If you need to sleep while holding a mapping, you can use the regular + variant, although this may be significantly slower:: +@@ -77,8 +69,8 @@ variant, although this may be significantly slower:: + void *io_mapping_map_wc(struct io_mapping *mapping, + unsigned long offset) + +-This works like io_mapping_map_atomic/local_wc() except it has no side +-effects and the pointer is globaly visible. ++This works like io_mapping_map_local_wc() except it has no side effects and ++the pointer is globaly visible. + + The mappings are released with:: + +diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h +index c093e81310a9..4bb8223f2f82 100644 +--- a/include/linux/io-mapping.h ++++ b/include/linux/io-mapping.h +@@ -60,28 +60,7 @@ io_mapping_fini(struct io_mapping *mapping) + iomap_free(mapping->base, mapping->size); + } + +-/* Atomic map/unmap */ +-static inline void __iomem * +-io_mapping_map_atomic_wc(struct io_mapping *mapping, +- unsigned long offset) +-{ +- resource_size_t phys_addr; +- +- BUG_ON(offset >= mapping->size); +- phys_addr = mapping->base + offset; +- preempt_disable(); +- pagefault_disable(); +- return __iomap_local_pfn_prot(PHYS_PFN(phys_addr), mapping->prot); +-} +- +-static inline void +-io_mapping_unmap_atomic(void __iomem *vaddr) +-{ +- kunmap_local_indexed((void __force *)vaddr); +- pagefault_enable(); +- preempt_enable(); +-} +- ++/* Temporary mappings which are only valid in the current context */ + static inline void __iomem * + io_mapping_map_local_wc(struct io_mapping *mapping, unsigned long offset) + { +@@ -163,24 +142,7 @@ io_mapping_unmap(void __iomem *vaddr) + { + } + +-/* Atomic map/unmap */ +-static inline void __iomem * +-io_mapping_map_atomic_wc(struct io_mapping *mapping, +- unsigned long offset) +-{ +- preempt_disable(); +- pagefault_disable(); +- return io_mapping_map_wc(mapping, offset, PAGE_SIZE); +-} +- +-static inline void +-io_mapping_unmap_atomic(void __iomem *vaddr) +-{ +- io_mapping_unmap(vaddr); +- pagefault_enable(); +- preempt_enable(); +-} +- ++/* Temporary mappings which are only valid in the current context */ + static inline void __iomem * + io_mapping_map_local_wc(struct io_mapping *mapping, unsigned long offset) + { +-- +2.43.0 + diff --git a/debian/patches-rt/0061-mm-highmem-Take-kmap_high_get-properly-into-account.patch b/debian/patches-rt/0061-mm-highmem-Take-kmap_high_get-properly-into-account.patch new file mode 100644 index 000000000..5970ff263 --- /dev/null +++ b/debian/patches-rt/0061-mm-highmem-Take-kmap_high_get-properly-into-account.patch @@ -0,0 +1,72 @@ +From 3df53841fc978eae47eb80dbb4ccaefa880589a7 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Thu, 12 Nov 2020 11:59:32 +0100 +Subject: [PATCH 061/323] mm/highmem: Take kmap_high_get() properly into + account +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +kunmap_local() warns when the virtual address to unmap is below +PAGE_OFFSET. This is correct except for the case that the mapping was +obtained via kmap_high_get() because the PKMAP addresses are right below +PAGE_OFFSET. + +Cure it by skipping the WARN_ON() when the unmap was handled by +kunmap_high(). + +Fixes: 298fa1ad5571 ("highmem: Provide generic variant of kmap_atomic*") +Reported-by: vtolkm@googlemail.com +Reported-by: Marek Szyprowski <m.szyprowski@samsung.com> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Tested-by: Marek Szyprowski <m.szyprowski@samsung.com> +Tested-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Cc: Andrew Morton <akpm@linux-foundation.org> +Link: https://lore.kernel.org/r/87y2j6n8mj.fsf@nanos.tec.linutronix.de +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + mm/highmem.c | 19 +++++++++++++------ + 1 file changed, 13 insertions(+), 6 deletions(-) + +diff --git a/mm/highmem.c b/mm/highmem.c +index 8db577e5290c..72b9a2d95c72 100644 +--- a/mm/highmem.c ++++ b/mm/highmem.c +@@ -422,12 +422,15 @@ static inline void *arch_kmap_local_high_get(struct page *page) + #endif + + /* Unmap a local mapping which was obtained by kmap_high_get() */ +-static inline void kmap_high_unmap_local(unsigned long vaddr) ++static inline bool kmap_high_unmap_local(unsigned long vaddr) + { + #ifdef ARCH_NEEDS_KMAP_HIGH_GET +- if (vaddr >= PKMAP_ADDR(0) && vaddr < PKMAP_ADDR(LAST_PKMAP)) ++ if (vaddr >= PKMAP_ADDR(0) && vaddr < PKMAP_ADDR(LAST_PKMAP)) { + kunmap_high(pte_page(pkmap_page_table[PKMAP_NR(vaddr)])); ++ return true; ++ } + #endif ++ return false; + } + + static inline int kmap_local_calc_idx(int idx) +@@ -493,10 +496,14 @@ void kunmap_local_indexed(void *vaddr) + + if (addr < __fix_to_virt(FIX_KMAP_END) || + addr > __fix_to_virt(FIX_KMAP_BEGIN)) { +- WARN_ON_ONCE(addr < PAGE_OFFSET); +- +- /* Handle mappings which were obtained by kmap_high_get() */ +- kmap_high_unmap_local(addr); ++ /* ++ * Handle mappings which were obtained by kmap_high_get() ++ * first as the virtual address of such mappings is below ++ * PAGE_OFFSET. Warn for all other addresses which are in ++ * the user space part of the virtual address space. ++ */ ++ if (!kmap_high_unmap_local(addr)) ++ WARN_ON_ONCE(addr < PAGE_OFFSET); + return; + } + +-- +2.43.0 + diff --git a/debian/patches-rt/0062-highmem-Don-t-disable-preemption-on-RT-in-kmap_atomi.patch b/debian/patches-rt/0062-highmem-Don-t-disable-preemption-on-RT-in-kmap_atomi.patch new file mode 100644 index 000000000..cecb34498 --- /dev/null +++ b/debian/patches-rt/0062-highmem-Don-t-disable-preemption-on-RT-in-kmap_atomi.patch @@ -0,0 +1,71 @@ +From 09d7bcb6eee1e7c9351535a20f60929daca8632d Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Fri, 30 Oct 2020 13:59:06 +0100 +Subject: [PATCH 062/323] highmem: Don't disable preemption on RT in + kmap_atomic() +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Disabling preemption makes it impossible to acquire sleeping locks within +kmap_atomic() section. +For PREEMPT_RT it is sufficient to disable migration. + +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/linux/highmem-internal.h | 20 ++++++++++++++++---- + 1 file changed, 16 insertions(+), 4 deletions(-) + +diff --git a/include/linux/highmem-internal.h b/include/linux/highmem-internal.h +index bd15bf9164c2..f9bc6acd3679 100644 +--- a/include/linux/highmem-internal.h ++++ b/include/linux/highmem-internal.h +@@ -90,7 +90,10 @@ static inline void __kunmap_local(void *vaddr) + + static inline void *kmap_atomic(struct page *page) + { +- preempt_disable(); ++ if (IS_ENABLED(CONFIG_PREEMPT_RT)) ++ migrate_disable(); ++ else ++ preempt_disable(); + pagefault_disable(); + return __kmap_local_page_prot(page, kmap_prot); + } +@@ -99,7 +102,10 @@ static inline void __kunmap_atomic(void *addr) + { + kunmap_local_indexed(addr); + pagefault_enable(); +- preempt_enable(); ++ if (IS_ENABLED(CONFIG_PREEMPT_RT)) ++ migrate_enable(); ++ else ++ preempt_enable(); + } + + unsigned int __nr_free_highpages(void); +@@ -172,7 +178,10 @@ static inline void __kunmap_local(void *addr) + + static inline void *kmap_atomic(struct page *page) + { +- preempt_disable(); ++ if (IS_ENABLED(CONFIG_PREEMPT_RT)) ++ migrate_disable(); ++ else ++ preempt_disable(); + pagefault_disable(); + return page_address(page); + } +@@ -183,7 +192,10 @@ static inline void __kunmap_atomic(void *addr) + kunmap_flush_on_unmap(addr); + #endif + pagefault_enable(); +- preempt_enable(); ++ if (IS_ENABLED(CONFIG_PREEMPT_RT)) ++ migrate_enable(); ++ else ++ preempt_enable(); + } + + static inline unsigned int nr_free_highpages(void) { return 0; } +-- +2.43.0 + diff --git a/debian/patches-rt/0063-blk-mq-Don-t-complete-on-a-remote-CPU-in-force-threa.patch b/debian/patches-rt/0063-blk-mq-Don-t-complete-on-a-remote-CPU-in-force-threa.patch new file mode 100644 index 000000000..8623209ab --- /dev/null +++ b/debian/patches-rt/0063-blk-mq-Don-t-complete-on-a-remote-CPU-in-force-threa.patch @@ -0,0 +1,48 @@ +From f7d87d26b4c9bfd083484542889f392867c1c85b Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Fri, 4 Dec 2020 20:13:54 +0100 +Subject: [PATCH 063/323] blk-mq: Don't complete on a remote CPU in force + threaded mode +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +With force threaded interrupts enabled, raising softirq from an SMP +function call will always result in waking the ksoftirqd thread. This is +not optimal given that the thread runs at SCHED_OTHER priority. + +Completing the request in hard IRQ-context on PREEMPT_RT (which enforces +the force threaded mode) is bad because the completion handler may +acquire sleeping locks which violate the locking context. + +Disable request completing on a remote CPU in force threaded mode. + +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Reviewed-by: Christoph Hellwig <hch@lst.de> +Reviewed-by: Daniel Wagner <dwagner@suse.de> +Signed-off-by: Jens Axboe <axboe@kernel.dk> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + block/blk-mq.c | 8 ++++++++ + 1 file changed, 8 insertions(+) + +diff --git a/block/blk-mq.c b/block/blk-mq.c +index e153a36c9ba3..cc41373cf346 100644 +--- a/block/blk-mq.c ++++ b/block/blk-mq.c +@@ -652,6 +652,14 @@ static inline bool blk_mq_complete_need_ipi(struct request *rq) + if (!IS_ENABLED(CONFIG_SMP) || + !test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) + return false; ++ /* ++ * With force threaded interrupts enabled, raising softirq from an SMP ++ * function call will always result in waking the ksoftirqd thread. ++ * This is probably worse than completing the request on a different ++ * cache domain. ++ */ ++ if (force_irqthreads) ++ return false; + + /* same CPU or cache domain? Complete locally */ + if (cpu == rq->mq_ctx->cpu || +-- +2.43.0 + diff --git a/debian/patches-rt/0064-blk-mq-Always-complete-remote-completions-requests-i.patch b/debian/patches-rt/0064-blk-mq-Always-complete-remote-completions-requests-i.patch new file mode 100644 index 000000000..92574e87b --- /dev/null +++ b/debian/patches-rt/0064-blk-mq-Always-complete-remote-completions-requests-i.patch @@ -0,0 +1,49 @@ +From 4397e4a75ac6456a0dd8cf0a99c8250845b022d6 Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Sat, 23 Jan 2021 21:10:26 +0100 +Subject: [PATCH 064/323] blk-mq: Always complete remote completions requests + in softirq +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Controllers with multiple queues have their IRQ-handelers pinned to a +CPU. The core shouldn't need to complete the request on a remote CPU. + +Remove this case and always raise the softirq to complete the request. + +Reviewed-by: Christoph Hellwig <hch@lst.de> +Reviewed-by: Daniel Wagner <dwagner@suse.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Signed-off-by: Jens Axboe <axboe@kernel.dk> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + block/blk-mq.c | 14 +------------- + 1 file changed, 1 insertion(+), 13 deletions(-) + +diff --git a/block/blk-mq.c b/block/blk-mq.c +index cc41373cf346..87b8377a74fc 100644 +--- a/block/blk-mq.c ++++ b/block/blk-mq.c +@@ -630,19 +630,7 @@ static void __blk_mq_complete_request_remote(void *data) + { + struct request *rq = data; + +- /* +- * For most of single queue controllers, there is only one irq vector +- * for handling I/O completion, and the only irq's affinity is set +- * to all possible CPUs. On most of ARCHs, this affinity means the irq +- * is handled on one specific CPU. +- * +- * So complete I/O requests in softirq context in case of single queue +- * devices to avoid degrading I/O performance due to irqsoff latency. +- */ +- if (rq->q->nr_hw_queues == 1) +- blk_mq_trigger_softirq(rq); +- else +- rq->q->mq_ops->complete(rq); ++ blk_mq_trigger_softirq(rq); + } + + static inline bool blk_mq_complete_need_ipi(struct request *rq) +-- +2.43.0 + diff --git a/debian/patches-rt/0065-blk-mq-Use-llist_head-for-blk_cpu_done.patch b/debian/patches-rt/0065-blk-mq-Use-llist_head-for-blk_cpu_done.patch new file mode 100644 index 000000000..b7faea536 --- /dev/null +++ b/debian/patches-rt/0065-blk-mq-Use-llist_head-for-blk_cpu_done.patch @@ -0,0 +1,201 @@ +From a958b1b96b5419f3d9372de0c7851daae941bf47 Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Sat, 23 Jan 2021 21:10:27 +0100 +Subject: [PATCH 065/323] blk-mq: Use llist_head for blk_cpu_done +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +With llist_head it is possible to avoid the locking (the irq-off region) +when items are added. This makes it possible to add items on a remote +CPU without additional locking. +llist_add() returns true if the list was previously empty. This can be +used to invoke the SMP function call / raise sofirq only if the first +item was added (otherwise it is already pending). +This simplifies the code a little and reduces the IRQ-off regions. + +blk_mq_raise_softirq() needs a preempt-disable section to ensure the +request is enqueued on the same CPU as the softirq is raised. +Some callers (USB-storage) invoke this path in preemptible context. + +Reviewed-by: Christoph Hellwig <hch@lst.de> +Reviewed-by: Daniel Wagner <dwagner@suse.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Signed-off-by: Jens Axboe <axboe@kernel.dk> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + block/blk-mq.c | 101 ++++++++++++++++++----------------------- + include/linux/blkdev.h | 2 +- + 2 files changed, 44 insertions(+), 59 deletions(-) + +diff --git a/block/blk-mq.c b/block/blk-mq.c +index 87b8377a74fc..a6618bf45992 100644 +--- a/block/blk-mq.c ++++ b/block/blk-mq.c +@@ -41,7 +41,7 @@ + #include "blk-mq-sched.h" + #include "blk-rq-qos.h" + +-static DEFINE_PER_CPU(struct list_head, blk_cpu_done); ++static DEFINE_PER_CPU(struct llist_head, blk_cpu_done); + + static void blk_mq_poll_stats_start(struct request_queue *q); + static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb); +@@ -569,68 +569,29 @@ void blk_mq_end_request(struct request *rq, blk_status_t error) + } + EXPORT_SYMBOL(blk_mq_end_request); + +-/* +- * Softirq action handler - move entries to local list and loop over them +- * while passing them to the queue registered handler. +- */ +-static __latent_entropy void blk_done_softirq(struct softirq_action *h) ++static void blk_complete_reqs(struct llist_head *list) + { +- struct list_head *cpu_list, local_list; +- +- local_irq_disable(); +- cpu_list = this_cpu_ptr(&blk_cpu_done); +- list_replace_init(cpu_list, &local_list); +- local_irq_enable(); +- +- while (!list_empty(&local_list)) { +- struct request *rq; ++ struct llist_node *entry = llist_reverse_order(llist_del_all(list)); ++ struct request *rq, *next; + +- rq = list_entry(local_list.next, struct request, ipi_list); +- list_del_init(&rq->ipi_list); ++ llist_for_each_entry_safe(rq, next, entry, ipi_list) + rq->q->mq_ops->complete(rq); +- } + } + +-static void blk_mq_trigger_softirq(struct request *rq) ++static __latent_entropy void blk_done_softirq(struct softirq_action *h) + { +- struct list_head *list; +- unsigned long flags; +- +- local_irq_save(flags); +- list = this_cpu_ptr(&blk_cpu_done); +- list_add_tail(&rq->ipi_list, list); +- +- /* +- * If the list only contains our just added request, signal a raise of +- * the softirq. If there are already entries there, someone already +- * raised the irq but it hasn't run yet. +- */ +- if (list->next == &rq->ipi_list) +- raise_softirq_irqoff(BLOCK_SOFTIRQ); +- local_irq_restore(flags); ++ blk_complete_reqs(this_cpu_ptr(&blk_cpu_done)); + } + + static int blk_softirq_cpu_dead(unsigned int cpu) + { +- /* +- * If a CPU goes away, splice its entries to the current CPU +- * and trigger a run of the softirq +- */ +- local_irq_disable(); +- list_splice_init(&per_cpu(blk_cpu_done, cpu), +- this_cpu_ptr(&blk_cpu_done)); +- raise_softirq_irqoff(BLOCK_SOFTIRQ); +- local_irq_enable(); +- ++ blk_complete_reqs(&per_cpu(blk_cpu_done, cpu)); + return 0; + } + +- + static void __blk_mq_complete_request_remote(void *data) + { +- struct request *rq = data; +- +- blk_mq_trigger_softirq(rq); ++ __raise_softirq_irqoff(BLOCK_SOFTIRQ); + } + + static inline bool blk_mq_complete_need_ipi(struct request *rq) +@@ -659,6 +620,32 @@ static inline bool blk_mq_complete_need_ipi(struct request *rq) + return cpu_online(rq->mq_ctx->cpu); + } + ++static void blk_mq_complete_send_ipi(struct request *rq) ++{ ++ struct llist_head *list; ++ unsigned int cpu; ++ ++ cpu = rq->mq_ctx->cpu; ++ list = &per_cpu(blk_cpu_done, cpu); ++ if (llist_add(&rq->ipi_list, list)) { ++ rq->csd.func = __blk_mq_complete_request_remote; ++ rq->csd.info = rq; ++ rq->csd.flags = 0; ++ smp_call_function_single_async(cpu, &rq->csd); ++ } ++} ++ ++static void blk_mq_raise_softirq(struct request *rq) ++{ ++ struct llist_head *list; ++ ++ preempt_disable(); ++ list = this_cpu_ptr(&blk_cpu_done); ++ if (llist_add(&rq->ipi_list, list)) ++ raise_softirq(BLOCK_SOFTIRQ); ++ preempt_enable(); ++} ++ + bool blk_mq_complete_request_remote(struct request *rq) + { + WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); +@@ -671,17 +658,15 @@ bool blk_mq_complete_request_remote(struct request *rq) + return false; + + if (blk_mq_complete_need_ipi(rq)) { +- rq->csd.func = __blk_mq_complete_request_remote; +- rq->csd.info = rq; +- rq->csd.flags = 0; +- smp_call_function_single_async(rq->mq_ctx->cpu, &rq->csd); +- } else { +- if (rq->q->nr_hw_queues > 1) +- return false; +- blk_mq_trigger_softirq(rq); ++ blk_mq_complete_send_ipi(rq); ++ return true; + } + +- return true; ++ if (rq->q->nr_hw_queues == 1) { ++ blk_mq_raise_softirq(rq); ++ return true; ++ } ++ return false; + } + EXPORT_SYMBOL_GPL(blk_mq_complete_request_remote); + +@@ -3980,7 +3965,7 @@ static int __init blk_mq_init(void) + int i; + + for_each_possible_cpu(i) +- INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i)); ++ init_llist_head(&per_cpu(blk_cpu_done, i)); + open_softirq(BLOCK_SOFTIRQ, blk_done_softirq); + + cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD, +diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h +index 98fdf5a31fd6..b35193aec5e3 100644 +--- a/include/linux/blkdev.h ++++ b/include/linux/blkdev.h +@@ -161,7 +161,7 @@ struct request { + */ + union { + struct hlist_node hash; /* merge hash */ +- struct list_head ipi_list; ++ struct llist_node ipi_list; + }; + + /* +-- +2.43.0 + diff --git a/debian/patches-rt/0066-lib-test_lockup-Minimum-fix-to-get-it-compiled-on-PR.patch b/debian/patches-rt/0066-lib-test_lockup-Minimum-fix-to-get-it-compiled-on-PR.patch new file mode 100644 index 000000000..e52831303 --- /dev/null +++ b/debian/patches-rt/0066-lib-test_lockup-Minimum-fix-to-get-it-compiled-on-PR.patch @@ -0,0 +1,65 @@ +From 11e632bdd63b66d70c365424a1ceb502cb0c02ef Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Wed, 28 Oct 2020 18:55:27 +0100 +Subject: [PATCH 066/323] lib/test_lockup: Minimum fix to get it compiled on + PREEMPT_RT +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +On PREEMPT_RT the locks are quite different so they can't be tested as +it is done below. The alternative is test for the waitlock within +rtmutex. + +This is the bare minim to get it compiled. Problems which exists on +PREEMP_RT: +- none of the locks (spinlock_t, rwlock_t, mutex_t, rw_semaphore) may be + acquired with disabled preemption or interrupts. + If I read the code correct the it is possible to acquire a mutex with + disabled interrupts. + I don't know how to obtain a lock pointer. Technically they are not + exported to userland. + +- memory can not be allocated with disabled premption or interrupts even + with GFP_ATOMIC. + +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + lib/test_lockup.c | 16 ++++++++++++++++ + 1 file changed, 16 insertions(+) + +diff --git a/lib/test_lockup.c b/lib/test_lockup.c +index 78a630bbd03d..d27a80502204 100644 +--- a/lib/test_lockup.c ++++ b/lib/test_lockup.c +@@ -485,6 +485,21 @@ static int __init test_lockup_init(void) + return -EINVAL; + + #ifdef CONFIG_DEBUG_SPINLOCK ++#ifdef CONFIG_PREEMPT_RT ++ if (test_magic(lock_spinlock_ptr, ++ offsetof(spinlock_t, lock.wait_lock.magic), ++ SPINLOCK_MAGIC) || ++ test_magic(lock_rwlock_ptr, ++ offsetof(rwlock_t, rtmutex.wait_lock.magic), ++ SPINLOCK_MAGIC) || ++ test_magic(lock_mutex_ptr, ++ offsetof(struct mutex, lock.wait_lock.magic), ++ SPINLOCK_MAGIC) || ++ test_magic(lock_rwsem_ptr, ++ offsetof(struct rw_semaphore, rtmutex.wait_lock.magic), ++ SPINLOCK_MAGIC)) ++ return -EINVAL; ++#else + if (test_magic(lock_spinlock_ptr, + offsetof(spinlock_t, rlock.magic), + SPINLOCK_MAGIC) || +@@ -498,6 +513,7 @@ static int __init test_lockup_init(void) + offsetof(struct rw_semaphore, wait_lock.magic), + SPINLOCK_MAGIC)) + return -EINVAL; ++#endif + #endif + + if ((wait_state != TASK_RUNNING || +-- +2.43.0 + diff --git a/debian/patches-rt/0067-timers-Don-t-block-on-expiry_lock-for-TIMER_IRQSAFE.patch b/debian/patches-rt/0067-timers-Don-t-block-on-expiry_lock-for-TIMER_IRQSAFE.patch new file mode 100644 index 000000000..01c62e677 --- /dev/null +++ b/debian/patches-rt/0067-timers-Don-t-block-on-expiry_lock-for-TIMER_IRQSAFE.patch @@ -0,0 +1,60 @@ +From a7d720657001cb31fc09699861f12726418a4bf8 Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Mon, 2 Nov 2020 14:14:24 +0100 +Subject: [PATCH 067/323] timers: Don't block on ->expiry_lock for + TIMER_IRQSAFE +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +PREEMPT_RT does not spin and wait until a running timer completes its +callback but instead it blocks on a sleeping lock to prevent a deadlock. + +This blocking can not be done for workqueue's IRQ_SAFE timer which will +be canceled in an IRQ-off region. It has to happen to in IRQ-off region +because changing the PENDING bit and clearing the timer must not be +interrupted to avoid a busy-loop. + +The callback invocation of IRQSAFE timer is not preempted on PREEMPT_RT +so there is no need to synchronize on timer_base::expiry_lock. + +Don't acquire the timer_base::expiry_lock for TIMER_IRQSAFE flagged +timer. +Add a lockdep annotation to ensure that this function is always invoked +in preemptible context on PREEMPT_RT. + +Reported-by: Mike Galbraith <efault@gmx.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Cc: stable-rt@vger.kernel.org +--- + kernel/time/timer.c | 9 ++++++++- + 1 file changed, 8 insertions(+), 1 deletion(-) + +diff --git a/kernel/time/timer.c b/kernel/time/timer.c +index e87e638c31bd..a4fdc7cfb723 100644 +--- a/kernel/time/timer.c ++++ b/kernel/time/timer.c +@@ -1287,7 +1287,7 @@ static void del_timer_wait_running(struct timer_list *timer) + u32 tf; + + tf = READ_ONCE(timer->flags); +- if (!(tf & TIMER_MIGRATING)) { ++ if (!(tf & (TIMER_MIGRATING | TIMER_IRQSAFE))) { + struct timer_base *base = get_timer_base(tf); + + /* +@@ -1371,6 +1371,13 @@ int del_timer_sync(struct timer_list *timer) + */ + WARN_ON(in_irq() && !(timer->flags & TIMER_IRQSAFE)); + ++ /* ++ * Must be able to sleep on PREEMPT_RT because of the slowpath in ++ * del_timer_wait_running(). ++ */ ++ if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(timer->flags & TIMER_IRQSAFE)) ++ lockdep_assert_preemption_enabled(); ++ + do { + ret = try_to_del_timer_sync(timer); + +-- +2.43.0 + diff --git a/debian/patches-rt/0068-kthread-Move-prio-affinite-change-into-the-newly-cre.patch b/debian/patches-rt/0068-kthread-Move-prio-affinite-change-into-the-newly-cre.patch new file mode 100644 index 000000000..ee2883f8e --- /dev/null +++ b/debian/patches-rt/0068-kthread-Move-prio-affinite-change-into-the-newly-cre.patch @@ -0,0 +1,86 @@ +From b0fd4a90cb328b61c6fe14df3a7fc4a79745fd2f Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Mon, 9 Nov 2020 21:30:41 +0100 +Subject: [PATCH 068/323] kthread: Move prio/affinite change into the newly + created thread +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +With enabled threaded interrupts the nouveau driver reported the +following: +| Chain exists of: +| &mm->mmap_lock#2 --> &device->mutex --> &cpuset_rwsem +| +| Possible unsafe locking scenario: +| +| CPU0 CPU1 +| ---- ---- +| lock(&cpuset_rwsem); +| lock(&device->mutex); +| lock(&cpuset_rwsem); +| lock(&mm->mmap_lock#2); + +The device->mutex is nvkm_device::mutex. + +Unblocking the lockchain at `cpuset_rwsem' is probably the easiest thing +to do. +Move the priority reset to the start of the newly created thread. + +Fixes: 710da3c8ea7df ("sched/core: Prevent race condition between cpuset and __sched_setscheduler()") +Reported-by: Mike Galbraith <efault@gmx.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Link: https://lkml.kernel.org/r/a23a826af7c108ea5651e73b8fbae5e653f16e86.camel@gmx.de +--- + kernel/kthread.c | 16 ++++++++-------- + 1 file changed, 8 insertions(+), 8 deletions(-) + +diff --git a/kernel/kthread.c b/kernel/kthread.c +index 508fe5278285..3ce6a31db7b4 100644 +--- a/kernel/kthread.c ++++ b/kernel/kthread.c +@@ -264,6 +264,7 @@ EXPORT_SYMBOL_GPL(kthread_parkme); + + static int kthread(void *_create) + { ++ static const struct sched_param param = { .sched_priority = 0 }; + /* Copy data: it's on kthread's stack */ + struct kthread_create_info *create = _create; + int (*threadfn)(void *data) = create->threadfn; +@@ -294,6 +295,13 @@ static int kthread(void *_create) + init_completion(&self->parked); + current->vfork_done = &self->exited; + ++ /* ++ * The new thread inherited kthreadd's priority and CPU mask. Reset ++ * back to default in case they have been changed. ++ */ ++ sched_setscheduler_nocheck(current, SCHED_NORMAL, ¶m); ++ set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_KTHREAD)); ++ + /* OK, tell user we're spawned, wait for stop or wakeup */ + __set_current_state(TASK_UNINTERRUPTIBLE); + create->result = current; +@@ -391,7 +399,6 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), + } + task = create->result; + if (!IS_ERR(task)) { +- static const struct sched_param param = { .sched_priority = 0 }; + char name[TASK_COMM_LEN]; + + /* +@@ -400,13 +407,6 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), + */ + vsnprintf(name, sizeof(name), namefmt, args); + set_task_comm(task, name); +- /* +- * root may have changed our (kthreadd's) priority or CPU mask. +- * The kernel thread should not inherit these properties. +- */ +- sched_setscheduler_nocheck(task, SCHED_NORMAL, ¶m); +- set_cpus_allowed_ptr(task, +- housekeeping_cpumask(HK_FLAG_KTHREAD)); + } + kfree(create); + return task; +-- +2.43.0 + diff --git a/debian/patches-rt/0069-genirq-Move-prio-assignment-into-the-newly-created-t.patch b/debian/patches-rt/0069-genirq-Move-prio-assignment-into-the-newly-created-t.patch new file mode 100644 index 000000000..f09b701ab --- /dev/null +++ b/debian/patches-rt/0069-genirq-Move-prio-assignment-into-the-newly-created-t.patch @@ -0,0 +1,63 @@ +From 617294b2f33cec584f48c12d914fcc982f317be5 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Mon, 9 Nov 2020 23:32:39 +0100 +Subject: [PATCH 069/323] genirq: Move prio assignment into the newly created + thread +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +With enabled threaded interrupts the nouveau driver reported the +following: +| Chain exists of: +| &mm->mmap_lock#2 --> &device->mutex --> &cpuset_rwsem +| +| Possible unsafe locking scenario: +| +| CPU0 CPU1 +| ---- ---- +| lock(&cpuset_rwsem); +| lock(&device->mutex); +| lock(&cpuset_rwsem); +| lock(&mm->mmap_lock#2); + +The device->mutex is nvkm_device::mutex. + +Unblocking the lockchain at `cpuset_rwsem' is probably the easiest thing +to do. +Move the priority assignment to the start of the newly created thread. + +Fixes: 710da3c8ea7df ("sched/core: Prevent race condition between cpuset and __sched_setscheduler()") +Reported-by: Mike Galbraith <efault@gmx.de> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +[bigeasy: Patch description] +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Link: https://lkml.kernel.org/r/a23a826af7c108ea5651e73b8fbae5e653f16e86.camel@gmx.de +Signed-off-by: Luis Claudio R. Goncalves <lgoncalv@redhat.com> +--- + kernel/irq/manage.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c +index 0159925054fa..5ec167fb77b9 100644 +--- a/kernel/irq/manage.c ++++ b/kernel/irq/manage.c +@@ -1201,6 +1201,8 @@ static int irq_thread(void *data) + + irq_thread_set_ready(desc, action); + ++ sched_set_fifo(current); ++ + if (force_irqthreads && test_bit(IRQTF_FORCED_THREAD, + &action->thread_flags)) + handler_fn = irq_forced_thread_fn; +@@ -1366,8 +1368,6 @@ setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary) + if (IS_ERR(t)) + return PTR_ERR(t); + +- sched_set_fifo(t); +- + /* + * We keep the reference to the task struct even if + * the thread dies to avoid that the interrupt code +-- +2.43.0 + diff --git a/debian/patches-rt/0070-notifier-Make-atomic_notifiers-use-raw_spinlock.patch b/debian/patches-rt/0070-notifier-Make-atomic_notifiers-use-raw_spinlock.patch new file mode 100644 index 000000000..74ee5109b --- /dev/null +++ b/debian/patches-rt/0070-notifier-Make-atomic_notifiers-use-raw_spinlock.patch @@ -0,0 +1,132 @@ +From f603fbc6eaf37d836e3498b47cef43965730bcda Mon Sep 17 00:00:00 2001 +From: Valentin Schneider <valentin.schneider@arm.com> +Date: Sun, 22 Nov 2020 20:19:04 +0000 +Subject: [PATCH 070/323] notifier: Make atomic_notifiers use raw_spinlock +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Booting a recent PREEMPT_RT kernel (v5.10-rc3-rt7-rebase) on my arm64 Juno +leads to the idle task blocking on an RT sleeping spinlock down some +notifier path: + + [ 1.809101] BUG: scheduling while atomic: swapper/5/0/0x00000002 + [ 1.809116] Modules linked in: + [ 1.809123] Preemption disabled at: + [ 1.809125] secondary_start_kernel (arch/arm64/kernel/smp.c:227) + [ 1.809146] CPU: 5 PID: 0 Comm: swapper/5 Tainted: G W 5.10.0-rc3-rt7 #168 + [ 1.809153] Hardware name: ARM Juno development board (r0) (DT) + [ 1.809158] Call trace: + [ 1.809160] dump_backtrace (arch/arm64/kernel/stacktrace.c:100 (discriminator 1)) + [ 1.809170] show_stack (arch/arm64/kernel/stacktrace.c:198) + [ 1.809178] dump_stack (lib/dump_stack.c:122) + [ 1.809188] __schedule_bug (kernel/sched/core.c:4886) + [ 1.809197] __schedule (./arch/arm64/include/asm/preempt.h:18 kernel/sched/core.c:4913 kernel/sched/core.c:5040) + [ 1.809204] preempt_schedule_lock (kernel/sched/core.c:5365 (discriminator 1)) + [ 1.809210] rt_spin_lock_slowlock_locked (kernel/locking/rtmutex.c:1072) + [ 1.809217] rt_spin_lock_slowlock (kernel/locking/rtmutex.c:1110) + [ 1.809224] rt_spin_lock (./include/linux/rcupdate.h:647 kernel/locking/rtmutex.c:1139) + [ 1.809231] atomic_notifier_call_chain_robust (kernel/notifier.c:71 kernel/notifier.c:118 kernel/notifier.c:186) + [ 1.809240] cpu_pm_enter (kernel/cpu_pm.c:39 kernel/cpu_pm.c:93) + [ 1.809249] psci_enter_idle_state (drivers/cpuidle/cpuidle-psci.c:52 drivers/cpuidle/cpuidle-psci.c:129) + [ 1.809258] cpuidle_enter_state (drivers/cpuidle/cpuidle.c:238) + [ 1.809267] cpuidle_enter (drivers/cpuidle/cpuidle.c:353) + [ 1.809275] do_idle (kernel/sched/idle.c:132 kernel/sched/idle.c:213 kernel/sched/idle.c:273) + [ 1.809282] cpu_startup_entry (kernel/sched/idle.c:368 (discriminator 1)) + [ 1.809288] secondary_start_kernel (arch/arm64/kernel/smp.c:273) + +Two points worth noting: + +1) That this is conceptually the same issue as pointed out in: + 313c8c16ee62 ("PM / CPU: replace raw_notifier with atomic_notifier") +2) Only the _robust() variant of atomic_notifier callchains suffer from + this + +AFAICT only the cpu_pm_notifier_chain really needs to be changed, but +singling it out would mean introducing a new (truly) non-blocking API. At +the same time, callers that are fine with any blocking within the call +chain should use blocking notifiers, so patching up all atomic_notifier's +doesn't seem *too* crazy to me. + +Fixes: 70d932985757 ("notifier: Fix broken error handling pattern") +Signed-off-by: Valentin Schneider <valentin.schneider@arm.com> +Reviewed-by: Daniel Bristot de Oliveira <bristot@redhat.com> +Link: https://lkml.kernel.org/r/20201122201904.30940-1-valentin.schneider@arm.com +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/linux/notifier.h | 6 +++--- + kernel/notifier.c | 12 ++++++------ + 2 files changed, 9 insertions(+), 9 deletions(-) + +diff --git a/include/linux/notifier.h b/include/linux/notifier.h +index 2fb373a5c1ed..723bc2df6388 100644 +--- a/include/linux/notifier.h ++++ b/include/linux/notifier.h +@@ -58,7 +58,7 @@ struct notifier_block { + }; + + struct atomic_notifier_head { +- spinlock_t lock; ++ raw_spinlock_t lock; + struct notifier_block __rcu *head; + }; + +@@ -78,7 +78,7 @@ struct srcu_notifier_head { + }; + + #define ATOMIC_INIT_NOTIFIER_HEAD(name) do { \ +- spin_lock_init(&(name)->lock); \ ++ raw_spin_lock_init(&(name)->lock); \ + (name)->head = NULL; \ + } while (0) + #define BLOCKING_INIT_NOTIFIER_HEAD(name) do { \ +@@ -95,7 +95,7 @@ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh); + cleanup_srcu_struct(&(name)->srcu); + + #define ATOMIC_NOTIFIER_INIT(name) { \ +- .lock = __SPIN_LOCK_UNLOCKED(name.lock), \ ++ .lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock), \ + .head = NULL } + #define BLOCKING_NOTIFIER_INIT(name) { \ + .rwsem = __RWSEM_INITIALIZER((name).rwsem), \ +diff --git a/kernel/notifier.c b/kernel/notifier.c +index 1b019cbca594..c20782f07643 100644 +--- a/kernel/notifier.c ++++ b/kernel/notifier.c +@@ -142,9 +142,9 @@ int atomic_notifier_chain_register(struct atomic_notifier_head *nh, + unsigned long flags; + int ret; + +- spin_lock_irqsave(&nh->lock, flags); ++ raw_spin_lock_irqsave(&nh->lock, flags); + ret = notifier_chain_register(&nh->head, n); +- spin_unlock_irqrestore(&nh->lock, flags); ++ raw_spin_unlock_irqrestore(&nh->lock, flags); + return ret; + } + EXPORT_SYMBOL_GPL(atomic_notifier_chain_register); +@@ -164,9 +164,9 @@ int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh, + unsigned long flags; + int ret; + +- spin_lock_irqsave(&nh->lock, flags); ++ raw_spin_lock_irqsave(&nh->lock, flags); + ret = notifier_chain_unregister(&nh->head, n); +- spin_unlock_irqrestore(&nh->lock, flags); ++ raw_spin_unlock_irqrestore(&nh->lock, flags); + synchronize_rcu(); + return ret; + } +@@ -182,9 +182,9 @@ int atomic_notifier_call_chain_robust(struct atomic_notifier_head *nh, + * Musn't use RCU; because then the notifier list can + * change between the up and down traversal. + */ +- spin_lock_irqsave(&nh->lock, flags); ++ raw_spin_lock_irqsave(&nh->lock, flags); + ret = notifier_call_chain_robust(&nh->head, val_up, val_down, v); +- spin_unlock_irqrestore(&nh->lock, flags); ++ raw_spin_unlock_irqrestore(&nh->lock, flags); + + return ret; + } +-- +2.43.0 + diff --git a/debian/patches-rt/0071-rcu-Make-RCU_BOOST-default-on-CONFIG_PREEMPT_RT.patch b/debian/patches-rt/0071-rcu-Make-RCU_BOOST-default-on-CONFIG_PREEMPT_RT.patch new file mode 100644 index 000000000..c6be2d81c --- /dev/null +++ b/debian/patches-rt/0071-rcu-Make-RCU_BOOST-default-on-CONFIG_PREEMPT_RT.patch @@ -0,0 +1,41 @@ +From b9c110646f281f521cc06039710f979b363478b9 Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Tue, 15 Dec 2020 15:16:45 +0100 +Subject: [PATCH 071/323] rcu: Make RCU_BOOST default on CONFIG_PREEMPT_RT +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +On PREEMPT_RT kernels, RCU callbacks are deferred to the `rcuc' kthread. +This can stall RCU grace periods due to lengthy preemption not only of RCU +readers but also of 'rcuc' kthreads, either of which prevent grace periods +from completing, which can in turn result in OOM. Because PREEMPT_RT +kernels have more kthreads that can block grace periods, it is more +important for such kernels to enable RCU_BOOST. + +This commit therefore makes RCU_BOOST the default on PREEMPT_RT. +RCU_BOOST can still be manually disabled if need be. + +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Signed-off-by: Paul E. McKenney <paulmck@kernel.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + kernel/rcu/Kconfig | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig +index cd6e11403f1b..0c71197857f6 100644 +--- a/kernel/rcu/Kconfig ++++ b/kernel/rcu/Kconfig +@@ -189,8 +189,8 @@ config RCU_FAST_NO_HZ + + config RCU_BOOST + bool "Enable RCU priority boosting" +- depends on RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT +- default n ++ depends on (RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT) || PREEMPT_RT ++ default y if PREEMPT_RT + help + This option boosts the priority of preempted RCU readers that + block the current preemptible RCU grace period for too long. +-- +2.43.0 + diff --git a/debian/patches-rt/0072-rcu-Unconditionally-use-rcuc-threads-on-PREEMPT_RT.patch b/debian/patches-rt/0072-rcu-Unconditionally-use-rcuc-threads-on-PREEMPT_RT.patch new file mode 100644 index 000000000..2c85d9dd0 --- /dev/null +++ b/debian/patches-rt/0072-rcu-Unconditionally-use-rcuc-threads-on-PREEMPT_RT.patch @@ -0,0 +1,66 @@ +From 8cc685c333f0deef6d724211a7f0ff37648d24c7 Mon Sep 17 00:00:00 2001 +From: Scott Wood <swood@redhat.com> +Date: Tue, 15 Dec 2020 15:16:46 +0100 +Subject: [PATCH 072/323] rcu: Unconditionally use rcuc threads on PREEMPT_RT +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +PREEMPT_RT systems have long used the rcutree.use_softirq kernel +boot parameter to avoid use of RCU_SOFTIRQ handlers, which can disrupt +real-time applications by invoking callbacks during return from interrupts +that arrived while executing time-critical code. This kernel boot +parameter instead runs RCU core processing in an 'rcuc' kthread, thus +allowing the scheduler to do its job of avoiding disrupting time-critical +code. + +This commit therefore disables the rcutree.use_softirq kernel boot +parameter on PREEMPT_RT systems, thus forcing such systems to do RCU +core processing in 'rcuc' kthreads. This approach has long been in +use by users of the -rt patchset, and there have been no complaints. +There is therefore no way for the system administrator to override this +choice, at least without modifying and rebuilding the kernel. + +Signed-off-by: Scott Wood <swood@redhat.com> +[bigeasy: Reword commit message] +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +[ paulmck: Update kernel-parameters.txt accordingly. ] +Signed-off-by: Paul E. McKenney <paulmck@kernel.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + Documentation/admin-guide/kernel-parameters.txt | 4 ++++ + kernel/rcu/tree.c | 4 +++- + 2 files changed, 7 insertions(+), 1 deletion(-) + +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index f1f7c068cf65..c2afb3304825 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -4191,6 +4191,10 @@ + value, meaning that RCU_SOFTIRQ is used by default. + Specify rcutree.use_softirq=0 to use rcuc kthreads. + ++ But note that CONFIG_PREEMPT_RT=y kernels disable ++ this kernel boot parameter, forcibly setting it ++ to zero. ++ + rcutree.rcu_fanout_exact= [KNL] + Disable autobalancing of the rcu_node combining + tree. This is used by rcutorture, and might +diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c +index 06bfe61d3cd3..39b3b56ad92d 100644 +--- a/kernel/rcu/tree.c ++++ b/kernel/rcu/tree.c +@@ -101,8 +101,10 @@ static struct rcu_state rcu_state = { + static bool dump_tree; + module_param(dump_tree, bool, 0444); + /* By default, use RCU_SOFTIRQ instead of rcuc kthreads. */ +-static bool use_softirq = true; ++static bool use_softirq = !IS_ENABLED(CONFIG_PREEMPT_RT); ++#ifndef CONFIG_PREEMPT_RT + module_param(use_softirq, bool, 0444); ++#endif + /* Control rcu_node-tree auto-balancing at boot time. */ + static bool rcu_fanout_exact; + module_param(rcu_fanout_exact, bool, 0444); +-- +2.43.0 + diff --git a/debian/patches-rt/0073-rcu-Enable-rcu_normal_after_boot-unconditionally-for.patch b/debian/patches-rt/0073-rcu-Enable-rcu_normal_after_boot-unconditionally-for.patch new file mode 100644 index 000000000..ec9afba35 --- /dev/null +++ b/debian/patches-rt/0073-rcu-Enable-rcu_normal_after_boot-unconditionally-for.patch @@ -0,0 +1,72 @@ +From be1e6f72a91c1c6b65594bba32401b029c7699b8 Mon Sep 17 00:00:00 2001 +From: Julia Cartwright <julia@ni.com> +Date: Tue, 15 Dec 2020 15:16:47 +0100 +Subject: [PATCH 073/323] rcu: Enable rcu_normal_after_boot unconditionally for + RT +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Expedited RCU grace periods send IPIs to all non-idle CPUs, and thus can +disrupt time-critical code in real-time applications. However, there +is a portion of boot-time processing (presumably before any real-time +applications have started) where expedited RCU grace periods are the only +option. And so it is that experience with the -rt patchset indicates that +PREEMPT_RT systems should always set the rcupdate.rcu_normal_after_boot +kernel boot parameter. + +This commit therefore makes the post-boot application environment safe +for real-time applications by making PREEMPT_RT systems disable the +rcupdate.rcu_normal_after_boot kernel boot parameter and acting as +if this parameter had been set. This means that post-boot calls to +synchronize_rcu_expedited() will be treated as if they were instead +calls to synchronize_rcu(), thus preventing the IPIs, and thus avoiding +disrupting real-time applications. + +Suggested-by: Luiz Capitulino <lcapitulino@redhat.com> +Acked-by: Paul E. McKenney <paulmck@linux.ibm.com> +Signed-off-by: Julia Cartwright <julia@ni.com> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +[ paulmck: Update kernel-parameters.txt accordingly. ] +Signed-off-by: Paul E. McKenney <paulmck@kernel.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + Documentation/admin-guide/kernel-parameters.txt | 7 +++++++ + kernel/rcu/update.c | 4 +++- + 2 files changed, 10 insertions(+), 1 deletion(-) + +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index c2afb3304825..23fd09f5351d 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -4573,6 +4573,13 @@ + only normal grace-period primitives. No effect + on CONFIG_TINY_RCU kernels. + ++ But note that CONFIG_PREEMPT_RT=y kernels enables ++ this kernel boot parameter, forcibly setting ++ it to the value one, that is, converting any ++ post-boot attempt at an expedited RCU grace ++ period to instead use normal non-expedited ++ grace-period processing. ++ + rcupdate.rcu_task_ipi_delay= [KNL] + Set time in jiffies during which RCU tasks will + avoid sending IPIs, starting with the beginning +diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c +index 849f0aa99333..dd94a602a6d2 100644 +--- a/kernel/rcu/update.c ++++ b/kernel/rcu/update.c +@@ -56,8 +56,10 @@ + #ifndef CONFIG_TINY_RCU + module_param(rcu_expedited, int, 0); + module_param(rcu_normal, int, 0); +-static int rcu_normal_after_boot; ++static int rcu_normal_after_boot = IS_ENABLED(CONFIG_PREEMPT_RT); ++#ifndef CONFIG_PREEMPT_RT + module_param(rcu_normal_after_boot, int, 0); ++#endif + #endif /* #ifndef CONFIG_TINY_RCU */ + + #ifdef CONFIG_DEBUG_LOCK_ALLOC +-- +2.43.0 + diff --git a/debian/patches-rt/0074-doc-Update-RCU-s-requirements-page-about-the-PREEMPT.patch b/debian/patches-rt/0074-doc-Update-RCU-s-requirements-page-about-the-PREEMPT.patch new file mode 100644 index 000000000..81d863443 --- /dev/null +++ b/debian/patches-rt/0074-doc-Update-RCU-s-requirements-page-about-the-PREEMPT.patch @@ -0,0 +1,35 @@ +From 98e9f8e6202660f2965bc68d54447861280e81b9 Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Tue, 15 Dec 2020 15:16:48 +0100 +Subject: [PATCH 074/323] doc: Update RCU's requirements page about the + PREEMPT_RT wiki. +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +The PREEMPT_RT wiki moved from kernel.org to the Linux Foundation wiki. +The kernel.org wiki is read only. + +This commit therefore updates the URL of the active PREEMPT_RT wiki. + +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Signed-off-by: Paul E. McKenney <paulmck@kernel.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + Documentation/RCU/Design/Requirements/Requirements.rst | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/Documentation/RCU/Design/Requirements/Requirements.rst b/Documentation/RCU/Design/Requirements/Requirements.rst +index 1ae79a10a8de..0f7e0237ea14 100644 +--- a/Documentation/RCU/Design/Requirements/Requirements.rst ++++ b/Documentation/RCU/Design/Requirements/Requirements.rst +@@ -2289,7 +2289,7 @@ decides to throw at it. + + The Linux kernel is used for real-time workloads, especially in + conjunction with the `-rt +-patchset <https://rt.wiki.kernel.org/index.php/Main_Page>`__. The ++patchset <https://wiki.linuxfoundation.org/realtime/>`__. The + real-time-latency response requirements are such that the traditional + approach of disabling preemption across RCU read-side critical sections + is inappropriate. Kernels built with ``CONFIG_PREEMPT=y`` therefore use +-- +2.43.0 + diff --git a/debian/patches-rt/0075-doc-Use-CONFIG_PREEMPTION.patch b/debian/patches-rt/0075-doc-Use-CONFIG_PREEMPTION.patch new file mode 100644 index 000000000..3ddcf14fa --- /dev/null +++ b/debian/patches-rt/0075-doc-Use-CONFIG_PREEMPTION.patch @@ -0,0 +1,250 @@ +From d9780d88d268b12562427ef709de6ab2b8c85188 Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Tue, 15 Dec 2020 15:16:49 +0100 +Subject: [PATCH 075/323] doc: Use CONFIG_PREEMPTION +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +CONFIG_PREEMPTION is selected by CONFIG_PREEMPT and by CONFIG_PREEMPT_RT. +Both PREEMPT and PREEMPT_RT require the same functionality which today +depends on CONFIG_PREEMPT. + +Update the documents and mention CONFIG_PREEMPTION. Spell out +CONFIG_PREEMPT_RT (instead PREEMPT_RT) since it is an option now. + +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Signed-off-by: Paul E. McKenney <paulmck@kernel.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + .../Expedited-Grace-Periods.rst | 4 ++-- + .../RCU/Design/Requirements/Requirements.rst | 24 +++++++++---------- + Documentation/RCU/checklist.rst | 2 +- + Documentation/RCU/rcubarrier.rst | 6 ++--- + Documentation/RCU/stallwarn.rst | 4 ++-- + Documentation/RCU/whatisRCU.rst | 10 ++++---- + 6 files changed, 25 insertions(+), 25 deletions(-) + +diff --git a/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst b/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst +index 72f0f6fbd53c..6f89cf1e567d 100644 +--- a/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst ++++ b/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst +@@ -38,7 +38,7 @@ sections. + RCU-preempt Expedited Grace Periods + =================================== + +-``CONFIG_PREEMPT=y`` kernels implement RCU-preempt. ++``CONFIG_PREEMPTION=y`` kernels implement RCU-preempt. + The overall flow of the handling of a given CPU by an RCU-preempt + expedited grace period is shown in the following diagram: + +@@ -112,7 +112,7 @@ things. + RCU-sched Expedited Grace Periods + --------------------------------- + +-``CONFIG_PREEMPT=n`` kernels implement RCU-sched. The overall flow of ++``CONFIG_PREEMPTION=n`` kernels implement RCU-sched. The overall flow of + the handling of a given CPU by an RCU-sched expedited grace period is + shown in the following diagram: + +diff --git a/Documentation/RCU/Design/Requirements/Requirements.rst b/Documentation/RCU/Design/Requirements/Requirements.rst +index 0f7e0237ea14..17d38480ef5c 100644 +--- a/Documentation/RCU/Design/Requirements/Requirements.rst ++++ b/Documentation/RCU/Design/Requirements/Requirements.rst +@@ -78,7 +78,7 @@ RCU treats a nested set as one big RCU read-side critical section. + Production-quality implementations of ``rcu_read_lock()`` and + ``rcu_read_unlock()`` are extremely lightweight, and in fact have + exactly zero overhead in Linux kernels built for production use with +-``CONFIG_PREEMPT=n``. ++``CONFIG_PREEMPTION=n``. + + This guarantee allows ordering to be enforced with extremely low + overhead to readers, for example: +@@ -1182,7 +1182,7 @@ and has become decreasingly so as memory sizes have expanded and memory + costs have plummeted. However, as I learned from Matt Mackall's + `bloatwatch <http://elinux.org/Linux_Tiny-FAQ>`__ efforts, memory + footprint is critically important on single-CPU systems with +-non-preemptible (``CONFIG_PREEMPT=n``) kernels, and thus `tiny ++non-preemptible (``CONFIG_PREEMPTION=n``) kernels, and thus `tiny + RCU <https://lkml.kernel.org/g/20090113221724.GA15307@linux.vnet.ibm.com>`__ + was born. Josh Triplett has since taken over the small-memory banner + with his `Linux kernel tinification <https://tiny.wiki.kernel.org/>`__ +@@ -1498,7 +1498,7 @@ limitations. + + Implementations of RCU for which ``rcu_read_lock()`` and + ``rcu_read_unlock()`` generate no code, such as Linux-kernel RCU when +-``CONFIG_PREEMPT=n``, can be nested arbitrarily deeply. After all, there ++``CONFIG_PREEMPTION=n``, can be nested arbitrarily deeply. After all, there + is no overhead. Except that if all these instances of + ``rcu_read_lock()`` and ``rcu_read_unlock()`` are visible to the + compiler, compilation will eventually fail due to exhausting memory, +@@ -1771,7 +1771,7 @@ implementation can be a no-op. + + However, once the scheduler has spawned its first kthread, this early + boot trick fails for ``synchronize_rcu()`` (as well as for +-``synchronize_rcu_expedited()``) in ``CONFIG_PREEMPT=y`` kernels. The ++``synchronize_rcu_expedited()``) in ``CONFIG_PREEMPTION=y`` kernels. The + reason is that an RCU read-side critical section might be preempted, + which means that a subsequent ``synchronize_rcu()`` really does have to + wait for something, as opposed to simply returning immediately. +@@ -2010,7 +2010,7 @@ the following: + 5 rcu_read_unlock(); + 6 do_something_with(v, user_v); + +-If the compiler did make this transformation in a ``CONFIG_PREEMPT=n`` kernel ++If the compiler did make this transformation in a ``CONFIG_PREEMPTION=n`` kernel + build, and if ``get_user()`` did page fault, the result would be a quiescent + state in the middle of an RCU read-side critical section. This misplaced + quiescent state could result in line 4 being a use-after-free access, +@@ -2292,7 +2292,7 @@ conjunction with the `-rt + patchset <https://wiki.linuxfoundation.org/realtime/>`__. The + real-time-latency response requirements are such that the traditional + approach of disabling preemption across RCU read-side critical sections +-is inappropriate. Kernels built with ``CONFIG_PREEMPT=y`` therefore use ++is inappropriate. Kernels built with ``CONFIG_PREEMPTION=y`` therefore use + an RCU implementation that allows RCU read-side critical sections to be + preempted. This requirement made its presence known after users made it + clear that an earlier `real-time +@@ -2414,7 +2414,7 @@ includes ``rcu_read_lock_bh()``, ``rcu_read_unlock_bh()``, + ``call_rcu_bh()``, ``rcu_barrier_bh()``, and + ``rcu_read_lock_bh_held()``. However, the update-side APIs are now + simple wrappers for other RCU flavors, namely RCU-sched in +-CONFIG_PREEMPT=n kernels and RCU-preempt otherwise. ++CONFIG_PREEMPTION=n kernels and RCU-preempt otherwise. + + Sched Flavor (Historical) + ~~~~~~~~~~~~~~~~~~~~~~~~~ +@@ -2432,11 +2432,11 @@ not have this property, given that any point in the code outside of an + RCU read-side critical section can be a quiescent state. Therefore, + *RCU-sched* was created, which follows “classic” RCU in that an + RCU-sched grace period waits for pre-existing interrupt and NMI +-handlers. In kernels built with ``CONFIG_PREEMPT=n``, the RCU and ++handlers. In kernels built with ``CONFIG_PREEMPTION=n``, the RCU and + RCU-sched APIs have identical implementations, while kernels built with +-``CONFIG_PREEMPT=y`` provide a separate implementation for each. ++``CONFIG_PREEMPTION=y`` provide a separate implementation for each. + +-Note well that in ``CONFIG_PREEMPT=y`` kernels, ++Note well that in ``CONFIG_PREEMPTION=y`` kernels, + ``rcu_read_lock_sched()`` and ``rcu_read_unlock_sched()`` disable and + re-enable preemption, respectively. This means that if there was a + preemption attempt during the RCU-sched read-side critical section, +@@ -2599,10 +2599,10 @@ userspace execution also delimit tasks-RCU read-side critical sections. + + The tasks-RCU API is quite compact, consisting only of + ``call_rcu_tasks()``, ``synchronize_rcu_tasks()``, and +-``rcu_barrier_tasks()``. In ``CONFIG_PREEMPT=n`` kernels, trampolines ++``rcu_barrier_tasks()``. In ``CONFIG_PREEMPTION=n`` kernels, trampolines + cannot be preempted, so these APIs map to ``call_rcu()``, + ``synchronize_rcu()``, and ``rcu_barrier()``, respectively. In +-``CONFIG_PREEMPT=y`` kernels, trampolines can be preempted, and these ++``CONFIG_PREEMPTION=y`` kernels, trampolines can be preempted, and these + three APIs are therefore implemented by separate functions that check + for voluntary context switches. + +diff --git a/Documentation/RCU/checklist.rst b/Documentation/RCU/checklist.rst +index 2efed9926c3f..7ed4956043bd 100644 +--- a/Documentation/RCU/checklist.rst ++++ b/Documentation/RCU/checklist.rst +@@ -214,7 +214,7 @@ over a rather long period of time, but improvements are always welcome! + the rest of the system. + + 7. As of v4.20, a given kernel implements only one RCU flavor, +- which is RCU-sched for PREEMPT=n and RCU-preempt for PREEMPT=y. ++ which is RCU-sched for PREEMPTION=n and RCU-preempt for PREEMPTION=y. + If the updater uses call_rcu() or synchronize_rcu(), + then the corresponding readers my use rcu_read_lock() and + rcu_read_unlock(), rcu_read_lock_bh() and rcu_read_unlock_bh(), +diff --git a/Documentation/RCU/rcubarrier.rst b/Documentation/RCU/rcubarrier.rst +index f64f4413a47c..3b4a24877496 100644 +--- a/Documentation/RCU/rcubarrier.rst ++++ b/Documentation/RCU/rcubarrier.rst +@@ -9,7 +9,7 @@ RCU (read-copy update) is a synchronization mechanism that can be thought + of as a replacement for read-writer locking (among other things), but with + very low-overhead readers that are immune to deadlock, priority inversion, + and unbounded latency. RCU read-side critical sections are delimited +-by rcu_read_lock() and rcu_read_unlock(), which, in non-CONFIG_PREEMPT ++by rcu_read_lock() and rcu_read_unlock(), which, in non-CONFIG_PREEMPTION + kernels, generate no code whatsoever. + + This means that RCU writers are unaware of the presence of concurrent +@@ -329,10 +329,10 @@ Answer: This cannot happen. The reason is that on_each_cpu() has its last + to smp_call_function() and further to smp_call_function_on_cpu(), + causing this latter to spin until the cross-CPU invocation of + rcu_barrier_func() has completed. This by itself would prevent +- a grace period from completing on non-CONFIG_PREEMPT kernels, ++ a grace period from completing on non-CONFIG_PREEMPTION kernels, + since each CPU must undergo a context switch (or other quiescent + state) before the grace period can complete. However, this is +- of no use in CONFIG_PREEMPT kernels. ++ of no use in CONFIG_PREEMPTION kernels. + + Therefore, on_each_cpu() disables preemption across its call + to smp_call_function() and also across the local call to +diff --git a/Documentation/RCU/stallwarn.rst b/Documentation/RCU/stallwarn.rst +index c9ab6af4d3be..e97d1b4876ef 100644 +--- a/Documentation/RCU/stallwarn.rst ++++ b/Documentation/RCU/stallwarn.rst +@@ -25,7 +25,7 @@ warnings: + + - A CPU looping with bottom halves disabled. + +-- For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the kernel ++- For !CONFIG_PREEMPTION kernels, a CPU looping anywhere in the kernel + without invoking schedule(). If the looping in the kernel is + really expected and desirable behavior, you might need to add + some calls to cond_resched(). +@@ -44,7 +44,7 @@ warnings: + result in the ``rcu_.*kthread starved for`` console-log message, + which will include additional debugging information. + +-- A CPU-bound real-time task in a CONFIG_PREEMPT kernel, which might ++- A CPU-bound real-time task in a CONFIG_PREEMPTION kernel, which might + happen to preempt a low-priority task in the middle of an RCU + read-side critical section. This is especially damaging if + that low-priority task is not permitted to run on any other CPU, +diff --git a/Documentation/RCU/whatisRCU.rst b/Documentation/RCU/whatisRCU.rst +index fb3ff76c3e73..3b2b1479fd0f 100644 +--- a/Documentation/RCU/whatisRCU.rst ++++ b/Documentation/RCU/whatisRCU.rst +@@ -684,7 +684,7 @@ Quick Quiz #1: + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + This section presents a "toy" RCU implementation that is based on + "classic RCU". It is also short on performance (but only for updates) and +-on features such as hotplug CPU and the ability to run in CONFIG_PREEMPT ++on features such as hotplug CPU and the ability to run in CONFIG_PREEMPTION + kernels. The definitions of rcu_dereference() and rcu_assign_pointer() + are the same as those shown in the preceding section, so they are omitted. + :: +@@ -740,7 +740,7 @@ Quick Quiz #2: + Quick Quiz #3: + If it is illegal to block in an RCU read-side + critical section, what the heck do you do in +- PREEMPT_RT, where normal spinlocks can block??? ++ CONFIG_PREEMPT_RT, where normal spinlocks can block??? + + :ref:`Answers to Quick Quiz <8_whatisRCU>` + +@@ -1094,7 +1094,7 @@ Quick Quiz #2: + overhead is **negative**. + + Answer: +- Imagine a single-CPU system with a non-CONFIG_PREEMPT ++ Imagine a single-CPU system with a non-CONFIG_PREEMPTION + kernel where a routing table is used by process-context + code, but can be updated by irq-context code (for example, + by an "ICMP REDIRECT" packet). The usual way of handling +@@ -1121,10 +1121,10 @@ Answer: + Quick Quiz #3: + If it is illegal to block in an RCU read-side + critical section, what the heck do you do in +- PREEMPT_RT, where normal spinlocks can block??? ++ CONFIG_PREEMPT_RT, where normal spinlocks can block??? + + Answer: +- Just as PREEMPT_RT permits preemption of spinlock ++ Just as CONFIG_PREEMPT_RT permits preemption of spinlock + critical sections, it permits preemption of RCU + read-side critical sections. It also permits + spinlocks blocking while in RCU read-side critical +-- +2.43.0 + diff --git a/debian/patches-rt/0076-tracing-Merge-irqflags-preempt-counter.patch b/debian/patches-rt/0076-tracing-Merge-irqflags-preempt-counter.patch new file mode 100644 index 000000000..8cd083750 --- /dev/null +++ b/debian/patches-rt/0076-tracing-Merge-irqflags-preempt-counter.patch @@ -0,0 +1,1900 @@ +From 1ab76ef52a7db4158a5d86604c2a318cfe847e9d Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Wed, 3 Feb 2021 11:05:23 -0500 +Subject: [PATCH 076/323] tracing: Merge irqflags + preempt counter. +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +The state of the interrupts (irqflags) and the preemption counter are +both passed down to tracing_generic_entry_update(). Only one bit of +irqflags is actually required: The on/off state. The complete 32bit +of the preemption counter isn't needed. Just whether of the upper bits +(softirq, hardirq and NMI) are set and the preemption depth is needed. + +The irqflags and the preemption counter could be evaluated early and the +information stored in an integer `trace_ctx'. +tracing_generic_entry_update() would use the upper bits as the +TRACE_FLAG_* and the lower 8bit as the disabled-preemption depth +(considering that one must be substracted from the counter in one +special cases). + +The actual preemption value is not used except for the tracing record. +The `irqflags' variable is mostly used only for the tracing record. An +exception here is for instance wakeup_tracer_call() or +probe_wakeup_sched_switch() which explicilty disable interrupts and use +that `irqflags' to save (and restore) the IRQ state and to record the +state. + +Struct trace_event_buffer has also the `pc' and flags' members which can +be replaced with `trace_ctx' since their actual value is not used +outside of trace recording. + +This will reduce tracing_generic_entry_update() to simply assign values +to struct trace_entry. The evaluation of the TRACE_FLAG_* bits is moved +to _tracing_gen_ctx_flags() which replaces preempt_count() and +local_save_flags() invocations. + +As an example, ftrace_syscall_enter() may invoke: +- trace_buffer_lock_reserve() -> … -> tracing_generic_entry_update() +- event_trigger_unlock_commit() + -> ftrace_trace_stack() -> … -> tracing_generic_entry_update() + -> ftrace_trace_userstack() -> … -> tracing_generic_entry_update() + +In this case the TRACE_FLAG_* bits were evaluated three times. By using +the `trace_ctx' they are evaluated once and assigned three times. + +A build with all tracers enabled on x86-64 with and without the patch: + + text data bss dec hex filename +21970669 17084168 7639260 46694097 2c87ed1 vmlinux.old +21970293 17084168 7639260 46693721 2c87d59 vmlinux.new + +text shrank by 379 bytes, data remained constant. + +Link: https://lkml.kernel.org/r/20210125194511.3924915-2-bigeasy@linutronix.de + +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/linux/trace_events.h | 25 +++- + kernel/trace/blktrace.c | 17 +-- + kernel/trace/trace.c | 206 +++++++++++++++------------ + kernel/trace/trace.h | 38 +++-- + kernel/trace/trace_branch.c | 6 +- + kernel/trace/trace_event_perf.c | 5 +- + kernel/trace/trace_events.c | 18 +-- + kernel/trace/trace_events_inject.c | 6 +- + kernel/trace/trace_functions.c | 28 ++-- + kernel/trace/trace_functions_graph.c | 32 ++--- + kernel/trace/trace_hwlat.c | 7 +- + kernel/trace/trace_irqsoff.c | 86 +++++------ + kernel/trace/trace_kprobe.c | 10 +- + kernel/trace/trace_mmiotrace.c | 14 +- + kernel/trace/trace_sched_wakeup.c | 71 +++++---- + kernel/trace/trace_syscalls.c | 20 ++- + kernel/trace/trace_uprobe.c | 4 +- + 17 files changed, 286 insertions(+), 307 deletions(-) + +diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h +index f7ed0471d5a8..2a98c40526a0 100644 +--- a/include/linux/trace_events.h ++++ b/include/linux/trace_events.h +@@ -152,17 +152,29 @@ enum print_line_t { + + enum print_line_t trace_handle_return(struct trace_seq *s); + +-void tracing_generic_entry_update(struct trace_entry *entry, +- unsigned short type, +- unsigned long flags, +- int pc); ++static inline void tracing_generic_entry_update(struct trace_entry *entry, ++ unsigned short type, ++ unsigned int trace_ctx) ++{ ++ struct task_struct *tsk = current; ++ ++ entry->preempt_count = trace_ctx & 0xff; ++ entry->pid = (tsk) ? tsk->pid : 0; ++ entry->type = type; ++ entry->flags = trace_ctx >> 16; ++} ++ ++unsigned int tracing_gen_ctx_flags(unsigned long irqflags); ++unsigned int tracing_gen_ctx(void); ++unsigned int tracing_gen_ctx_dec(void); ++ + struct trace_event_file; + + struct ring_buffer_event * + trace_event_buffer_lock_reserve(struct trace_buffer **current_buffer, + struct trace_event_file *trace_file, + int type, unsigned long len, +- unsigned long flags, int pc); ++ unsigned int trace_ctx); + + #define TRACE_RECORD_CMDLINE BIT(0) + #define TRACE_RECORD_TGID BIT(1) +@@ -236,8 +248,7 @@ struct trace_event_buffer { + struct ring_buffer_event *event; + struct trace_event_file *trace_file; + void *entry; +- unsigned long flags; +- int pc; ++ unsigned int trace_ctx; + struct pt_regs *regs; + }; + +diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c +index ab912cc60760..a95a2027eefd 100644 +--- a/kernel/trace/blktrace.c ++++ b/kernel/trace/blktrace.c +@@ -72,17 +72,17 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action, + struct blk_io_trace *t; + struct ring_buffer_event *event = NULL; + struct trace_buffer *buffer = NULL; +- int pc = 0; ++ unsigned int trace_ctx = 0; + int cpu = smp_processor_id(); + bool blk_tracer = blk_tracer_enabled; + ssize_t cgid_len = cgid ? sizeof(cgid) : 0; + + if (blk_tracer) { + buffer = blk_tr->array_buffer.buffer; +- pc = preempt_count(); ++ trace_ctx = tracing_gen_ctx_flags(0); + event = trace_buffer_lock_reserve(buffer, TRACE_BLK, + sizeof(*t) + len + cgid_len, +- 0, pc); ++ trace_ctx); + if (!event) + return; + t = ring_buffer_event_data(event); +@@ -107,7 +107,7 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action, + memcpy((void *) t + sizeof(*t) + cgid_len, data, len); + + if (blk_tracer) +- trace_buffer_unlock_commit(blk_tr, buffer, event, 0, pc); ++ trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx); + } + } + +@@ -222,8 +222,9 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, + struct blk_io_trace *t; + unsigned long flags = 0; + unsigned long *sequence; ++ unsigned int trace_ctx = 0; + pid_t pid; +- int cpu, pc = 0; ++ int cpu; + bool blk_tracer = blk_tracer_enabled; + ssize_t cgid_len = cgid ? sizeof(cgid) : 0; + +@@ -252,10 +253,10 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, + tracing_record_cmdline(current); + + buffer = blk_tr->array_buffer.buffer; +- pc = preempt_count(); ++ trace_ctx = tracing_gen_ctx_flags(0); + event = trace_buffer_lock_reserve(buffer, TRACE_BLK, + sizeof(*t) + pdu_len + cgid_len, +- 0, pc); ++ trace_ctx); + if (!event) + return; + t = ring_buffer_event_data(event); +@@ -301,7 +302,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, + memcpy((void *)t + sizeof(*t) + cgid_len, pdu_data, pdu_len); + + if (blk_tracer) { +- trace_buffer_unlock_commit(blk_tr, buffer, event, 0, pc); ++ trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx); + return; + } + } +diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c +index 4e0411b19ef9..376eb8a1c913 100644 +--- a/kernel/trace/trace.c ++++ b/kernel/trace/trace.c +@@ -176,7 +176,7 @@ static union trace_eval_map_item *trace_eval_maps; + int tracing_set_tracer(struct trace_array *tr, const char *buf); + static void ftrace_trace_userstack(struct trace_array *tr, + struct trace_buffer *buffer, +- unsigned long flags, int pc); ++ unsigned int trace_ctx); + + #define MAX_TRACER_SIZE 100 + static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata; +@@ -909,23 +909,23 @@ static inline void trace_access_lock_init(void) + + #ifdef CONFIG_STACKTRACE + static void __ftrace_trace_stack(struct trace_buffer *buffer, +- unsigned long flags, +- int skip, int pc, struct pt_regs *regs); ++ unsigned int trace_ctx, ++ int skip, struct pt_regs *regs); + static inline void ftrace_trace_stack(struct trace_array *tr, + struct trace_buffer *buffer, +- unsigned long flags, +- int skip, int pc, struct pt_regs *regs); ++ unsigned int trace_ctx, ++ int skip, struct pt_regs *regs); + + #else + static inline void __ftrace_trace_stack(struct trace_buffer *buffer, +- unsigned long flags, +- int skip, int pc, struct pt_regs *regs) ++ unsigned int trace_ctx, ++ int skip, struct pt_regs *regs) + { + } + static inline void ftrace_trace_stack(struct trace_array *tr, + struct trace_buffer *buffer, +- unsigned long flags, +- int skip, int pc, struct pt_regs *regs) ++ unsigned long trace_ctx, ++ int skip, struct pt_regs *regs) + { + } + +@@ -933,24 +933,24 @@ static inline void ftrace_trace_stack(struct trace_array *tr, + + static __always_inline void + trace_event_setup(struct ring_buffer_event *event, +- int type, unsigned long flags, int pc) ++ int type, unsigned int trace_ctx) + { + struct trace_entry *ent = ring_buffer_event_data(event); + +- tracing_generic_entry_update(ent, type, flags, pc); ++ tracing_generic_entry_update(ent, type, trace_ctx); + } + + static __always_inline struct ring_buffer_event * + __trace_buffer_lock_reserve(struct trace_buffer *buffer, + int type, + unsigned long len, +- unsigned long flags, int pc) ++ unsigned int trace_ctx) + { + struct ring_buffer_event *event; + + event = ring_buffer_lock_reserve(buffer, len); + if (event != NULL) +- trace_event_setup(event, type, flags, pc); ++ trace_event_setup(event, type, trace_ctx); + + return event; + } +@@ -1011,25 +1011,22 @@ int __trace_puts(unsigned long ip, const char *str, int size) + struct ring_buffer_event *event; + struct trace_buffer *buffer; + struct print_entry *entry; +- unsigned long irq_flags; ++ unsigned int trace_ctx; + int alloc; +- int pc; + + if (!(global_trace.trace_flags & TRACE_ITER_PRINTK)) + return 0; + +- pc = preempt_count(); +- + if (unlikely(tracing_selftest_running || tracing_disabled)) + return 0; + + alloc = sizeof(*entry) + size + 2; /* possible \n added */ + +- local_save_flags(irq_flags); ++ trace_ctx = tracing_gen_ctx(); + buffer = global_trace.array_buffer.buffer; + ring_buffer_nest_start(buffer); +- event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc, +- irq_flags, pc); ++ event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc, ++ trace_ctx); + if (!event) { + size = 0; + goto out; +@@ -1048,7 +1045,7 @@ int __trace_puts(unsigned long ip, const char *str, int size) + entry->buf[size] = '\0'; + + __buffer_unlock_commit(buffer, event); +- ftrace_trace_stack(&global_trace, buffer, irq_flags, 4, pc, NULL); ++ ftrace_trace_stack(&global_trace, buffer, trace_ctx, 4, NULL); + out: + ring_buffer_nest_end(buffer); + return size; +@@ -1065,25 +1062,22 @@ int __trace_bputs(unsigned long ip, const char *str) + struct ring_buffer_event *event; + struct trace_buffer *buffer; + struct bputs_entry *entry; +- unsigned long irq_flags; ++ unsigned int trace_ctx; + int size = sizeof(struct bputs_entry); + int ret = 0; +- int pc; + + if (!(global_trace.trace_flags & TRACE_ITER_PRINTK)) + return 0; + +- pc = preempt_count(); +- + if (unlikely(tracing_selftest_running || tracing_disabled)) + return 0; + +- local_save_flags(irq_flags); ++ trace_ctx = tracing_gen_ctx(); + buffer = global_trace.array_buffer.buffer; + + ring_buffer_nest_start(buffer); + event = __trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size, +- irq_flags, pc); ++ trace_ctx); + if (!event) + goto out; + +@@ -1092,7 +1086,7 @@ int __trace_bputs(unsigned long ip, const char *str) + entry->str = str; + + __buffer_unlock_commit(buffer, event); +- ftrace_trace_stack(&global_trace, buffer, irq_flags, 4, pc, NULL); ++ ftrace_trace_stack(&global_trace, buffer, trace_ctx, 4, NULL); + + ret = 1; + out: +@@ -2581,36 +2575,69 @@ enum print_line_t trace_handle_return(struct trace_seq *s) + } + EXPORT_SYMBOL_GPL(trace_handle_return); + +-void +-tracing_generic_entry_update(struct trace_entry *entry, unsigned short type, +- unsigned long flags, int pc) ++unsigned int tracing_gen_ctx_flags(unsigned long irqflags) + { +- struct task_struct *tsk = current; ++ unsigned int trace_flags = 0; ++ unsigned int pc; ++ ++ pc = preempt_count(); + +- entry->preempt_count = pc & 0xff; +- entry->pid = (tsk) ? tsk->pid : 0; +- entry->type = type; +- entry->flags = + #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT +- (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | ++ if (irqs_disabled_flags(irqflags)) ++ trace_flags |= TRACE_FLAG_IRQS_OFF; + #else +- TRACE_FLAG_IRQS_NOSUPPORT | ++ trace_flags |= TRACE_FLAG_IRQS_NOSUPPORT; + #endif +- ((pc & NMI_MASK ) ? TRACE_FLAG_NMI : 0) | +- ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | +- ((pc & SOFTIRQ_OFFSET) ? TRACE_FLAG_SOFTIRQ : 0) | +- (tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) | +- (test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0); ++ ++ if (pc & NMI_MASK) ++ trace_flags |= TRACE_FLAG_NMI; ++ if (pc & HARDIRQ_MASK) ++ trace_flags |= TRACE_FLAG_HARDIRQ; ++ ++ if (pc & SOFTIRQ_OFFSET) ++ trace_flags |= TRACE_FLAG_SOFTIRQ; ++ ++ if (tif_need_resched()) ++ trace_flags |= TRACE_FLAG_NEED_RESCHED; ++ if (test_preempt_need_resched()) ++ trace_flags |= TRACE_FLAG_PREEMPT_RESCHED; ++ return (trace_flags << 16) | (pc & 0xff); ++} ++ ++unsigned int tracing_gen_ctx(void) ++{ ++ unsigned long irqflags; ++ ++#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT ++ local_save_flags(irqflags); ++#else ++ irqflags = 0; ++#endif ++ return tracing_gen_ctx_flags(irqflags); ++} ++ ++unsigned int tracing_gen_ctx_dec(void) ++{ ++ unsigned int trace_ctx; ++ ++ trace_ctx = tracing_gen_ctx(); ++ ++ /* ++ * Subtract one from the preeption counter if preemption is enabled, ++ * see trace_event_buffer_reserve()for details. ++ */ ++ if (IS_ENABLED(CONFIG_PREEMPTION)) ++ trace_ctx--; ++ return trace_ctx; + } +-EXPORT_SYMBOL_GPL(tracing_generic_entry_update); + + struct ring_buffer_event * + trace_buffer_lock_reserve(struct trace_buffer *buffer, + int type, + unsigned long len, +- unsigned long flags, int pc) ++ unsigned int trace_ctx) + { +- return __trace_buffer_lock_reserve(buffer, type, len, flags, pc); ++ return __trace_buffer_lock_reserve(buffer, type, len, trace_ctx); + } + + DEFINE_PER_CPU(struct ring_buffer_event *, trace_buffered_event); +@@ -2729,7 +2756,7 @@ struct ring_buffer_event * + trace_event_buffer_lock_reserve(struct trace_buffer **current_rb, + struct trace_event_file *trace_file, + int type, unsigned long len, +- unsigned long flags, int pc) ++ unsigned int trace_ctx) + { + struct ring_buffer_event *entry; + int val; +@@ -2742,7 +2769,7 @@ trace_event_buffer_lock_reserve(struct trace_buffer **current_rb, + /* Try to use the per cpu buffer first */ + val = this_cpu_inc_return(trace_buffered_event_cnt); + if ((len < (PAGE_SIZE - sizeof(*entry) - sizeof(entry->array[0]))) && val == 1) { +- trace_event_setup(entry, type, flags, pc); ++ trace_event_setup(entry, type, trace_ctx); + entry->array[0] = len; + return entry; + } +@@ -2750,7 +2777,7 @@ trace_event_buffer_lock_reserve(struct trace_buffer **current_rb, + } + + entry = __trace_buffer_lock_reserve(*current_rb, +- type, len, flags, pc); ++ type, len, trace_ctx); + /* + * If tracing is off, but we have triggers enabled + * we still need to look at the event data. Use the temp_buffer +@@ -2759,8 +2786,8 @@ trace_event_buffer_lock_reserve(struct trace_buffer **current_rb, + */ + if (!entry && trace_file->flags & EVENT_FILE_FL_TRIGGER_COND) { + *current_rb = temp_buffer; +- entry = __trace_buffer_lock_reserve(*current_rb, +- type, len, flags, pc); ++ entry = __trace_buffer_lock_reserve(*current_rb, type, len, ++ trace_ctx); + } + return entry; + } +@@ -2846,7 +2873,7 @@ void trace_event_buffer_commit(struct trace_event_buffer *fbuffer) + ftrace_exports(fbuffer->event, TRACE_EXPORT_EVENT); + event_trigger_unlock_commit_regs(fbuffer->trace_file, fbuffer->buffer, + fbuffer->event, fbuffer->entry, +- fbuffer->flags, fbuffer->pc, fbuffer->regs); ++ fbuffer->trace_ctx, fbuffer->regs); + } + EXPORT_SYMBOL_GPL(trace_event_buffer_commit); + +@@ -2862,7 +2889,7 @@ EXPORT_SYMBOL_GPL(trace_event_buffer_commit); + void trace_buffer_unlock_commit_regs(struct trace_array *tr, + struct trace_buffer *buffer, + struct ring_buffer_event *event, +- unsigned long flags, int pc, ++ unsigned int trace_ctx, + struct pt_regs *regs) + { + __buffer_unlock_commit(buffer, event); +@@ -2873,8 +2900,8 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr, + * and mmiotrace, but that's ok if they lose a function or + * two. They are not that meaningful. + */ +- ftrace_trace_stack(tr, buffer, flags, regs ? 0 : STACK_SKIP, pc, regs); +- ftrace_trace_userstack(tr, buffer, flags, pc); ++ ftrace_trace_stack(tr, buffer, trace_ctx, regs ? 0 : STACK_SKIP, regs); ++ ftrace_trace_userstack(tr, buffer, trace_ctx); + } + + /* +@@ -2888,9 +2915,8 @@ trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer, + } + + void +-trace_function(struct trace_array *tr, +- unsigned long ip, unsigned long parent_ip, unsigned long flags, +- int pc) ++trace_function(struct trace_array *tr, unsigned long ip, unsigned long ++ parent_ip, unsigned int trace_ctx) + { + struct trace_event_call *call = &event_function; + struct trace_buffer *buffer = tr->array_buffer.buffer; +@@ -2898,7 +2924,7 @@ trace_function(struct trace_array *tr, + struct ftrace_entry *entry; + + event = __trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry), +- flags, pc); ++ trace_ctx); + if (!event) + return; + entry = ring_buffer_event_data(event); +@@ -2932,8 +2958,8 @@ static DEFINE_PER_CPU(struct ftrace_stacks, ftrace_stacks); + static DEFINE_PER_CPU(int, ftrace_stack_reserve); + + static void __ftrace_trace_stack(struct trace_buffer *buffer, +- unsigned long flags, +- int skip, int pc, struct pt_regs *regs) ++ unsigned int trace_ctx, ++ int skip, struct pt_regs *regs) + { + struct trace_event_call *call = &event_kernel_stack; + struct ring_buffer_event *event; +@@ -2981,7 +3007,7 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer, + size = nr_entries * sizeof(unsigned long); + event = __trace_buffer_lock_reserve(buffer, TRACE_STACK, + (sizeof(*entry) - sizeof(entry->caller)) + size, +- flags, pc); ++ trace_ctx); + if (!event) + goto out; + entry = ring_buffer_event_data(event); +@@ -3002,22 +3028,22 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer, + + static inline void ftrace_trace_stack(struct trace_array *tr, + struct trace_buffer *buffer, +- unsigned long flags, +- int skip, int pc, struct pt_regs *regs) ++ unsigned int trace_ctx, ++ int skip, struct pt_regs *regs) + { + if (!(tr->trace_flags & TRACE_ITER_STACKTRACE)) + return; + +- __ftrace_trace_stack(buffer, flags, skip, pc, regs); ++ __ftrace_trace_stack(buffer, trace_ctx, skip, regs); + } + +-void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, +- int pc) ++void __trace_stack(struct trace_array *tr, unsigned int trace_ctx, ++ int skip) + { + struct trace_buffer *buffer = tr->array_buffer.buffer; + + if (rcu_is_watching()) { +- __ftrace_trace_stack(buffer, flags, skip, pc, NULL); ++ __ftrace_trace_stack(buffer, trace_ctx, skip, NULL); + return; + } + +@@ -3031,7 +3057,7 @@ void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, + return; + + rcu_irq_enter_irqson(); +- __ftrace_trace_stack(buffer, flags, skip, pc, NULL); ++ __ftrace_trace_stack(buffer, trace_ctx, skip, NULL); + rcu_irq_exit_irqson(); + } + +@@ -3041,19 +3067,15 @@ void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, + */ + void trace_dump_stack(int skip) + { +- unsigned long flags; +- + if (tracing_disabled || tracing_selftest_running) + return; + +- local_save_flags(flags); +- + #ifndef CONFIG_UNWINDER_ORC + /* Skip 1 to skip this function. */ + skip++; + #endif + __ftrace_trace_stack(global_trace.array_buffer.buffer, +- flags, skip, preempt_count(), NULL); ++ tracing_gen_ctx(), skip, NULL); + } + EXPORT_SYMBOL_GPL(trace_dump_stack); + +@@ -3062,7 +3084,7 @@ static DEFINE_PER_CPU(int, user_stack_count); + + static void + ftrace_trace_userstack(struct trace_array *tr, +- struct trace_buffer *buffer, unsigned long flags, int pc) ++ struct trace_buffer *buffer, unsigned int trace_ctx) + { + struct trace_event_call *call = &event_user_stack; + struct ring_buffer_event *event; +@@ -3089,7 +3111,7 @@ ftrace_trace_userstack(struct trace_array *tr, + __this_cpu_inc(user_stack_count); + + event = __trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, +- sizeof(*entry), flags, pc); ++ sizeof(*entry), trace_ctx); + if (!event) + goto out_drop_count; + entry = ring_buffer_event_data(event); +@@ -3109,7 +3131,7 @@ ftrace_trace_userstack(struct trace_array *tr, + #else /* CONFIG_USER_STACKTRACE_SUPPORT */ + static void ftrace_trace_userstack(struct trace_array *tr, + struct trace_buffer *buffer, +- unsigned long flags, int pc) ++ unsigned int trace_ctx) + { + } + #endif /* !CONFIG_USER_STACKTRACE_SUPPORT */ +@@ -3239,9 +3261,9 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) + struct trace_buffer *buffer; + struct trace_array *tr = &global_trace; + struct bprint_entry *entry; +- unsigned long flags; ++ unsigned int trace_ctx; + char *tbuffer; +- int len = 0, size, pc; ++ int len = 0, size; + + if (unlikely(tracing_selftest_running || tracing_disabled)) + return 0; +@@ -3249,7 +3271,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) + /* Don't pollute graph traces with trace_vprintk internals */ + pause_graph_tracing(); + +- pc = preempt_count(); ++ trace_ctx = tracing_gen_ctx(); + preempt_disable_notrace(); + + tbuffer = get_trace_buf(); +@@ -3263,12 +3285,11 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) + if (len > TRACE_BUF_SIZE/sizeof(int) || len < 0) + goto out_put; + +- local_save_flags(flags); + size = sizeof(*entry) + sizeof(u32) * len; + buffer = tr->array_buffer.buffer; + ring_buffer_nest_start(buffer); + event = __trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size, +- flags, pc); ++ trace_ctx); + if (!event) + goto out; + entry = ring_buffer_event_data(event); +@@ -3278,7 +3299,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) + memcpy(entry->buf, tbuffer, sizeof(u32) * len); + if (!call_filter_check_discard(call, entry, buffer, event)) { + __buffer_unlock_commit(buffer, event); +- ftrace_trace_stack(tr, buffer, flags, 6, pc, NULL); ++ ftrace_trace_stack(tr, buffer, trace_ctx, 6, NULL); + } + + out: +@@ -3301,9 +3322,9 @@ __trace_array_vprintk(struct trace_buffer *buffer, + { + struct trace_event_call *call = &event_print; + struct ring_buffer_event *event; +- int len = 0, size, pc; ++ int len = 0, size; + struct print_entry *entry; +- unsigned long flags; ++ unsigned int trace_ctx; + char *tbuffer; + + if (tracing_disabled || tracing_selftest_running) +@@ -3312,7 +3333,7 @@ __trace_array_vprintk(struct trace_buffer *buffer, + /* Don't pollute graph traces with trace_vprintk internals */ + pause_graph_tracing(); + +- pc = preempt_count(); ++ trace_ctx = tracing_gen_ctx(); + preempt_disable_notrace(); + + +@@ -3324,11 +3345,10 @@ __trace_array_vprintk(struct trace_buffer *buffer, + + len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args); + +- local_save_flags(flags); + size = sizeof(*entry) + len + 1; + ring_buffer_nest_start(buffer); + event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, +- flags, pc); ++ trace_ctx); + if (!event) + goto out; + entry = ring_buffer_event_data(event); +@@ -3337,7 +3357,7 @@ __trace_array_vprintk(struct trace_buffer *buffer, + memcpy(&entry->buf, tbuffer, len + 1); + if (!call_filter_check_discard(call, entry, buffer, event)) { + __buffer_unlock_commit(buffer, event); +- ftrace_trace_stack(&global_trace, buffer, flags, 6, pc, NULL); ++ ftrace_trace_stack(&global_trace, buffer, trace_ctx, 6, NULL); + } + + out: +@@ -6831,7 +6851,6 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, + enum event_trigger_type tt = ETT_NONE; + struct trace_buffer *buffer; + struct print_entry *entry; +- unsigned long irq_flags; + ssize_t written; + int size; + int len; +@@ -6851,7 +6870,6 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, + + BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE); + +- local_save_flags(irq_flags); + size = sizeof(*entry) + cnt + 2; /* add '\0' and possible '\n' */ + + /* If less than "<faulted>", then make sure we can still add that */ +@@ -6860,7 +6878,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, + + buffer = tr->array_buffer.buffer; + event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, +- irq_flags, preempt_count()); ++ tracing_gen_ctx()); + if (unlikely(!event)) + /* Ring buffer disabled, return as if not open for write */ + return -EBADF; +@@ -6912,7 +6930,6 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf, + struct ring_buffer_event *event; + struct trace_buffer *buffer; + struct raw_data_entry *entry; +- unsigned long irq_flags; + ssize_t written; + int size; + int len; +@@ -6934,14 +6951,13 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf, + + BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE); + +- local_save_flags(irq_flags); + size = sizeof(*entry) + cnt; + if (cnt < FAULT_SIZE_ID) + size += FAULT_SIZE_ID - cnt; + + buffer = tr->array_buffer.buffer; + event = __trace_buffer_lock_reserve(buffer, TRACE_RAW_DATA, size, +- irq_flags, preempt_count()); ++ tracing_gen_ctx()); + if (!event) + /* Ring buffer disabled, return as if not open for write */ + return -EBADF; +diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h +index 7c90872f2435..27ed42bccd7f 100644 +--- a/kernel/trace/trace.h ++++ b/kernel/trace/trace.h +@@ -750,8 +750,7 @@ struct ring_buffer_event * + trace_buffer_lock_reserve(struct trace_buffer *buffer, + int type, + unsigned long len, +- unsigned long flags, +- int pc); ++ unsigned int trace_ctx); + + struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, + struct trace_array_cpu *data); +@@ -778,11 +777,11 @@ unsigned long trace_total_entries(struct trace_array *tr); + void trace_function(struct trace_array *tr, + unsigned long ip, + unsigned long parent_ip, +- unsigned long flags, int pc); ++ unsigned int trace_ctx); + void trace_graph_function(struct trace_array *tr, + unsigned long ip, + unsigned long parent_ip, +- unsigned long flags, int pc); ++ unsigned int trace_ctx); + void trace_latency_header(struct seq_file *m); + void trace_default_header(struct seq_file *m); + void print_trace_header(struct seq_file *m, struct trace_iterator *iter); +@@ -850,11 +849,10 @@ static inline void latency_fsnotify(struct trace_array *tr) { } + #endif + + #ifdef CONFIG_STACKTRACE +-void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, +- int pc); ++void __trace_stack(struct trace_array *tr, unsigned int trace_ctx, int skip); + #else +-static inline void __trace_stack(struct trace_array *tr, unsigned long flags, +- int skip, int pc) ++static inline void __trace_stack(struct trace_array *tr, unsigned int trace_ctx, ++ int skip) + { + } + #endif /* CONFIG_STACKTRACE */ +@@ -994,10 +992,10 @@ extern void graph_trace_open(struct trace_iterator *iter); + extern void graph_trace_close(struct trace_iterator *iter); + extern int __trace_graph_entry(struct trace_array *tr, + struct ftrace_graph_ent *trace, +- unsigned long flags, int pc); ++ unsigned int trace_ctx); + extern void __trace_graph_return(struct trace_array *tr, + struct ftrace_graph_ret *trace, +- unsigned long flags, int pc); ++ unsigned int trace_ctx); + + #ifdef CONFIG_DYNAMIC_FTRACE + extern struct ftrace_hash __rcu *ftrace_graph_hash; +@@ -1460,15 +1458,15 @@ extern int call_filter_check_discard(struct trace_event_call *call, void *rec, + void trace_buffer_unlock_commit_regs(struct trace_array *tr, + struct trace_buffer *buffer, + struct ring_buffer_event *event, +- unsigned long flags, int pc, ++ unsigned int trcace_ctx, + struct pt_regs *regs); + + static inline void trace_buffer_unlock_commit(struct trace_array *tr, + struct trace_buffer *buffer, + struct ring_buffer_event *event, +- unsigned long flags, int pc) ++ unsigned int trace_ctx) + { +- trace_buffer_unlock_commit_regs(tr, buffer, event, flags, pc, NULL); ++ trace_buffer_unlock_commit_regs(tr, buffer, event, trace_ctx, NULL); + } + + DECLARE_PER_CPU(struct ring_buffer_event *, trace_buffered_event); +@@ -1541,8 +1539,7 @@ __event_trigger_test_discard(struct trace_event_file *file, + * @buffer: The ring buffer that the event is being written to + * @event: The event meta data in the ring buffer + * @entry: The event itself +- * @irq_flags: The state of the interrupts at the start of the event +- * @pc: The state of the preempt count at the start of the event. ++ * @trace_ctx: The tracing context flags. + * + * This is a helper function to handle triggers that require data + * from the event itself. It also tests the event against filters and +@@ -1552,12 +1549,12 @@ static inline void + event_trigger_unlock_commit(struct trace_event_file *file, + struct trace_buffer *buffer, + struct ring_buffer_event *event, +- void *entry, unsigned long irq_flags, int pc) ++ void *entry, unsigned int trace_ctx) + { + enum event_trigger_type tt = ETT_NONE; + + if (!__event_trigger_test_discard(file, buffer, event, entry, &tt)) +- trace_buffer_unlock_commit(file->tr, buffer, event, irq_flags, pc); ++ trace_buffer_unlock_commit(file->tr, buffer, event, trace_ctx); + + if (tt) + event_triggers_post_call(file, tt); +@@ -1569,8 +1566,7 @@ event_trigger_unlock_commit(struct trace_event_file *file, + * @buffer: The ring buffer that the event is being written to + * @event: The event meta data in the ring buffer + * @entry: The event itself +- * @irq_flags: The state of the interrupts at the start of the event +- * @pc: The state of the preempt count at the start of the event. ++ * @trace_ctx: The tracing context flags. + * + * This is a helper function to handle triggers that require data + * from the event itself. It also tests the event against filters and +@@ -1583,14 +1579,14 @@ static inline void + event_trigger_unlock_commit_regs(struct trace_event_file *file, + struct trace_buffer *buffer, + struct ring_buffer_event *event, +- void *entry, unsigned long irq_flags, int pc, ++ void *entry, unsigned int trace_ctx, + struct pt_regs *regs) + { + enum event_trigger_type tt = ETT_NONE; + + if (!__event_trigger_test_discard(file, buffer, event, entry, &tt)) + trace_buffer_unlock_commit_regs(file->tr, buffer, event, +- irq_flags, pc, regs); ++ trace_ctx, regs); + + if (tt) + event_triggers_post_call(file, tt); +diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c +index eff099123aa2..e47fdb4c92fb 100644 +--- a/kernel/trace/trace_branch.c ++++ b/kernel/trace/trace_branch.c +@@ -37,7 +37,7 @@ probe_likely_condition(struct ftrace_likely_data *f, int val, int expect) + struct ring_buffer_event *event; + struct trace_branch *entry; + unsigned long flags; +- int pc; ++ unsigned int trace_ctx; + const char *p; + + if (current->trace_recursion & TRACE_BRANCH_BIT) +@@ -59,10 +59,10 @@ probe_likely_condition(struct ftrace_likely_data *f, int val, int expect) + if (atomic_read(&data->disabled)) + goto out; + +- pc = preempt_count(); ++ trace_ctx = tracing_gen_ctx_flags(flags); + buffer = tr->array_buffer.buffer; + event = trace_buffer_lock_reserve(buffer, TRACE_BRANCH, +- sizeof(*entry), flags, pc); ++ sizeof(*entry), trace_ctx); + if (!event) + goto out; + +diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c +index 643e0b19920d..0443dd61667b 100644 +--- a/kernel/trace/trace_event_perf.c ++++ b/kernel/trace/trace_event_perf.c +@@ -421,11 +421,8 @@ NOKPROBE_SYMBOL(perf_trace_buf_alloc); + void perf_trace_buf_update(void *record, u16 type) + { + struct trace_entry *entry = record; +- int pc = preempt_count(); +- unsigned long flags; + +- local_save_flags(flags); +- tracing_generic_entry_update(entry, type, flags, pc); ++ tracing_generic_entry_update(entry, type, tracing_gen_ctx()); + } + NOKPROBE_SYMBOL(perf_trace_buf_update); + +diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c +index 4b5a8d7275be..df64b92c5edc 100644 +--- a/kernel/trace/trace_events.c ++++ b/kernel/trace/trace_events.c +@@ -259,22 +259,19 @@ void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer, + trace_event_ignore_this_pid(trace_file)) + return NULL; + +- local_save_flags(fbuffer->flags); +- fbuffer->pc = preempt_count(); + /* + * If CONFIG_PREEMPTION is enabled, then the tracepoint itself disables + * preemption (adding one to the preempt_count). Since we are + * interested in the preempt_count at the time the tracepoint was + * hit, we need to subtract one to offset the increment. + */ +- if (IS_ENABLED(CONFIG_PREEMPTION)) +- fbuffer->pc--; ++ fbuffer->trace_ctx = tracing_gen_ctx_dec(); + fbuffer->trace_file = trace_file; + + fbuffer->event = + trace_event_buffer_lock_reserve(&fbuffer->buffer, trace_file, + event_call->event.type, len, +- fbuffer->flags, fbuffer->pc); ++ fbuffer->trace_ctx); + if (!fbuffer->event) + return NULL; + +@@ -3709,12 +3706,11 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip, + struct trace_buffer *buffer; + struct ring_buffer_event *event; + struct ftrace_entry *entry; +- unsigned long flags; ++ unsigned int trace_ctx; + long disabled; + int cpu; +- int pc; + +- pc = preempt_count(); ++ trace_ctx = tracing_gen_ctx(); + preempt_disable_notrace(); + cpu = raw_smp_processor_id(); + disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu)); +@@ -3722,11 +3718,9 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip, + if (disabled != 1) + goto out; + +- local_save_flags(flags); +- + event = trace_event_buffer_lock_reserve(&buffer, &event_trace_file, + TRACE_FN, sizeof(*entry), +- flags, pc); ++ trace_ctx); + if (!event) + goto out; + entry = ring_buffer_event_data(event); +@@ -3734,7 +3728,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip, + entry->parent_ip = parent_ip; + + event_trigger_unlock_commit(&event_trace_file, buffer, event, +- entry, flags, pc); ++ entry, trace_ctx); + out: + atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); + preempt_enable_notrace(); +diff --git a/kernel/trace/trace_events_inject.c b/kernel/trace/trace_events_inject.c +index 149c7dc6a447..b1fce64e126c 100644 +--- a/kernel/trace/trace_events_inject.c ++++ b/kernel/trace/trace_events_inject.c +@@ -192,7 +192,6 @@ static void *trace_alloc_entry(struct trace_event_call *call, int *size) + static int parse_entry(char *str, struct trace_event_call *call, void **pentry) + { + struct ftrace_event_field *field; +- unsigned long irq_flags; + void *entry = NULL; + int entry_size; + u64 val = 0; +@@ -203,9 +202,8 @@ static int parse_entry(char *str, struct trace_event_call *call, void **pentry) + if (!entry) + return -ENOMEM; + +- local_save_flags(irq_flags); +- tracing_generic_entry_update(entry, call->event.type, irq_flags, +- preempt_count()); ++ tracing_generic_entry_update(entry, call->event.type, ++ tracing_gen_ctx()); + + while ((len = parse_field(str, call, &field, &val)) > 0) { + if (is_function_field(field)) +diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c +index 93e20ed642e5..8606cb73341e 100644 +--- a/kernel/trace/trace_functions.c ++++ b/kernel/trace/trace_functions.c +@@ -133,15 +133,14 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, + { + struct trace_array *tr = op->private; + struct trace_array_cpu *data; +- unsigned long flags; ++ unsigned int trace_ctx; + int bit; + int cpu; +- int pc; + + if (unlikely(!tr->function_enabled)) + return; + +- pc = preempt_count(); ++ trace_ctx = tracing_gen_ctx(); + preempt_disable_notrace(); + + bit = trace_test_and_set_recursion(TRACE_FTRACE_START); +@@ -150,10 +149,9 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, + + cpu = smp_processor_id(); + data = per_cpu_ptr(tr->array_buffer.data, cpu); +- if (!atomic_read(&data->disabled)) { +- local_save_flags(flags); +- trace_function(tr, ip, parent_ip, flags, pc); +- } ++ if (!atomic_read(&data->disabled)) ++ trace_function(tr, ip, parent_ip, trace_ctx); ++ + trace_clear_recursion(bit); + + out: +@@ -187,7 +185,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip, + unsigned long flags; + long disabled; + int cpu; +- int pc; ++ unsigned int trace_ctx; + + if (unlikely(!tr->function_enabled)) + return; +@@ -202,9 +200,9 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip, + disabled = atomic_inc_return(&data->disabled); + + if (likely(disabled == 1)) { +- pc = preempt_count(); +- trace_function(tr, ip, parent_ip, flags, pc); +- __trace_stack(tr, flags, STACK_SKIP, pc); ++ trace_ctx = tracing_gen_ctx_flags(flags); ++ trace_function(tr, ip, parent_ip, trace_ctx); ++ __trace_stack(tr, trace_ctx, STACK_SKIP); + } + + atomic_dec(&data->disabled); +@@ -407,13 +405,11 @@ ftrace_traceoff(unsigned long ip, unsigned long parent_ip, + + static __always_inline void trace_stack(struct trace_array *tr) + { +- unsigned long flags; +- int pc; ++ unsigned int trace_ctx; + +- local_save_flags(flags); +- pc = preempt_count(); ++ trace_ctx = tracing_gen_ctx(); + +- __trace_stack(tr, flags, FTRACE_STACK_SKIP, pc); ++ __trace_stack(tr, trace_ctx, FTRACE_STACK_SKIP); + } + + static void +diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c +index 60d66278aa0d..b086ba8bb3d6 100644 +--- a/kernel/trace/trace_functions_graph.c ++++ b/kernel/trace/trace_functions_graph.c +@@ -96,8 +96,7 @@ print_graph_duration(struct trace_array *tr, unsigned long long duration, + + int __trace_graph_entry(struct trace_array *tr, + struct ftrace_graph_ent *trace, +- unsigned long flags, +- int pc) ++ unsigned int trace_ctx) + { + struct trace_event_call *call = &event_funcgraph_entry; + struct ring_buffer_event *event; +@@ -105,7 +104,7 @@ int __trace_graph_entry(struct trace_array *tr, + struct ftrace_graph_ent_entry *entry; + + event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT, +- sizeof(*entry), flags, pc); ++ sizeof(*entry), trace_ctx); + if (!event) + return 0; + entry = ring_buffer_event_data(event); +@@ -129,10 +128,10 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) + struct trace_array *tr = graph_array; + struct trace_array_cpu *data; + unsigned long flags; ++ unsigned int trace_ctx; + long disabled; + int ret; + int cpu; +- int pc; + + if (trace_recursion_test(TRACE_GRAPH_NOTRACE_BIT)) + return 0; +@@ -174,8 +173,8 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) + data = per_cpu_ptr(tr->array_buffer.data, cpu); + disabled = atomic_inc_return(&data->disabled); + if (likely(disabled == 1)) { +- pc = preempt_count(); +- ret = __trace_graph_entry(tr, trace, flags, pc); ++ trace_ctx = tracing_gen_ctx_flags(flags); ++ ret = __trace_graph_entry(tr, trace, trace_ctx); + } else { + ret = 0; + } +@@ -188,7 +187,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) + + static void + __trace_graph_function(struct trace_array *tr, +- unsigned long ip, unsigned long flags, int pc) ++ unsigned long ip, unsigned int trace_ctx) + { + u64 time = trace_clock_local(); + struct ftrace_graph_ent ent = { +@@ -202,22 +201,21 @@ __trace_graph_function(struct trace_array *tr, + .rettime = time, + }; + +- __trace_graph_entry(tr, &ent, flags, pc); +- __trace_graph_return(tr, &ret, flags, pc); ++ __trace_graph_entry(tr, &ent, trace_ctx); ++ __trace_graph_return(tr, &ret, trace_ctx); + } + + void + trace_graph_function(struct trace_array *tr, + unsigned long ip, unsigned long parent_ip, +- unsigned long flags, int pc) ++ unsigned int trace_ctx) + { +- __trace_graph_function(tr, ip, flags, pc); ++ __trace_graph_function(tr, ip, trace_ctx); + } + + void __trace_graph_return(struct trace_array *tr, + struct ftrace_graph_ret *trace, +- unsigned long flags, +- int pc) ++ unsigned int trace_ctx) + { + struct trace_event_call *call = &event_funcgraph_exit; + struct ring_buffer_event *event; +@@ -225,7 +223,7 @@ void __trace_graph_return(struct trace_array *tr, + struct ftrace_graph_ret_entry *entry; + + event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET, +- sizeof(*entry), flags, pc); ++ sizeof(*entry), trace_ctx); + if (!event) + return; + entry = ring_buffer_event_data(event); +@@ -239,9 +237,9 @@ void trace_graph_return(struct ftrace_graph_ret *trace) + struct trace_array *tr = graph_array; + struct trace_array_cpu *data; + unsigned long flags; ++ unsigned int trace_ctx; + long disabled; + int cpu; +- int pc; + + ftrace_graph_addr_finish(trace); + +@@ -255,8 +253,8 @@ void trace_graph_return(struct ftrace_graph_ret *trace) + data = per_cpu_ptr(tr->array_buffer.data, cpu); + disabled = atomic_inc_return(&data->disabled); + if (likely(disabled == 1)) { +- pc = preempt_count(); +- __trace_graph_return(tr, trace, flags, pc); ++ trace_ctx = tracing_gen_ctx_flags(flags); ++ __trace_graph_return(tr, trace, trace_ctx); + } + atomic_dec(&data->disabled); + local_irq_restore(flags); +diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c +index d071fc271eef..4c01c5d8b9a7 100644 +--- a/kernel/trace/trace_hwlat.c ++++ b/kernel/trace/trace_hwlat.c +@@ -108,14 +108,9 @@ static void trace_hwlat_sample(struct hwlat_sample *sample) + struct trace_buffer *buffer = tr->array_buffer.buffer; + struct ring_buffer_event *event; + struct hwlat_entry *entry; +- unsigned long flags; +- int pc; +- +- pc = preempt_count(); +- local_save_flags(flags); + + event = trace_buffer_lock_reserve(buffer, TRACE_HWLAT, sizeof(*entry), +- flags, pc); ++ tracing_gen_ctx()); + if (!event) + return; + entry = ring_buffer_event_data(event); +diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c +index 619a60944bb6..4a11967c8daa 100644 +--- a/kernel/trace/trace_irqsoff.c ++++ b/kernel/trace/trace_irqsoff.c +@@ -143,11 +143,14 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip, + struct trace_array *tr = irqsoff_trace; + struct trace_array_cpu *data; + unsigned long flags; ++ unsigned int trace_ctx; + + if (!func_prolog_dec(tr, &data, &flags)) + return; + +- trace_function(tr, ip, parent_ip, flags, preempt_count()); ++ trace_ctx = tracing_gen_ctx_flags(flags); ++ ++ trace_function(tr, ip, parent_ip, trace_ctx); + + atomic_dec(&data->disabled); + } +@@ -177,8 +180,8 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace) + struct trace_array *tr = irqsoff_trace; + struct trace_array_cpu *data; + unsigned long flags; ++ unsigned int trace_ctx; + int ret; +- int pc; + + if (ftrace_graph_ignore_func(trace)) + return 0; +@@ -195,8 +198,8 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace) + if (!func_prolog_dec(tr, &data, &flags)) + return 0; + +- pc = preempt_count(); +- ret = __trace_graph_entry(tr, trace, flags, pc); ++ trace_ctx = tracing_gen_ctx_flags(flags); ++ ret = __trace_graph_entry(tr, trace, trace_ctx); + atomic_dec(&data->disabled); + + return ret; +@@ -207,15 +210,15 @@ static void irqsoff_graph_return(struct ftrace_graph_ret *trace) + struct trace_array *tr = irqsoff_trace; + struct trace_array_cpu *data; + unsigned long flags; +- int pc; ++ unsigned int trace_ctx; + + ftrace_graph_addr_finish(trace); + + if (!func_prolog_dec(tr, &data, &flags)) + return; + +- pc = preempt_count(); +- __trace_graph_return(tr, trace, flags, pc); ++ trace_ctx = tracing_gen_ctx_flags(flags); ++ __trace_graph_return(tr, trace, trace_ctx); + atomic_dec(&data->disabled); + } + +@@ -268,12 +271,12 @@ static void irqsoff_print_header(struct seq_file *s) + static void + __trace_function(struct trace_array *tr, + unsigned long ip, unsigned long parent_ip, +- unsigned long flags, int pc) ++ unsigned int trace_ctx) + { + if (is_graph(tr)) +- trace_graph_function(tr, ip, parent_ip, flags, pc); ++ trace_graph_function(tr, ip, parent_ip, trace_ctx); + else +- trace_function(tr, ip, parent_ip, flags, pc); ++ trace_function(tr, ip, parent_ip, trace_ctx); + } + + #else +@@ -323,15 +326,13 @@ check_critical_timing(struct trace_array *tr, + { + u64 T0, T1, delta; + unsigned long flags; +- int pc; ++ unsigned int trace_ctx; + + T0 = data->preempt_timestamp; + T1 = ftrace_now(cpu); + delta = T1-T0; + +- local_save_flags(flags); +- +- pc = preempt_count(); ++ trace_ctx = tracing_gen_ctx(); + + if (!report_latency(tr, delta)) + goto out; +@@ -342,9 +343,9 @@ check_critical_timing(struct trace_array *tr, + if (!report_latency(tr, delta)) + goto out_unlock; + +- __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); ++ __trace_function(tr, CALLER_ADDR0, parent_ip, trace_ctx); + /* Skip 5 functions to get to the irq/preempt enable function */ +- __trace_stack(tr, flags, 5, pc); ++ __trace_stack(tr, trace_ctx, 5); + + if (data->critical_sequence != max_sequence) + goto out_unlock; +@@ -364,16 +365,15 @@ check_critical_timing(struct trace_array *tr, + out: + data->critical_sequence = max_sequence; + data->preempt_timestamp = ftrace_now(cpu); +- __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); ++ __trace_function(tr, CALLER_ADDR0, parent_ip, trace_ctx); + } + + static nokprobe_inline void +-start_critical_timing(unsigned long ip, unsigned long parent_ip, int pc) ++start_critical_timing(unsigned long ip, unsigned long parent_ip) + { + int cpu; + struct trace_array *tr = irqsoff_trace; + struct trace_array_cpu *data; +- unsigned long flags; + + if (!tracer_enabled || !tracing_is_enabled()) + return; +@@ -394,9 +394,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip, int pc) + data->preempt_timestamp = ftrace_now(cpu); + data->critical_start = parent_ip ? : ip; + +- local_save_flags(flags); +- +- __trace_function(tr, ip, parent_ip, flags, pc); ++ __trace_function(tr, ip, parent_ip, tracing_gen_ctx()); + + per_cpu(tracing_cpu, cpu) = 1; + +@@ -404,12 +402,12 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip, int pc) + } + + static nokprobe_inline void +-stop_critical_timing(unsigned long ip, unsigned long parent_ip, int pc) ++stop_critical_timing(unsigned long ip, unsigned long parent_ip) + { + int cpu; + struct trace_array *tr = irqsoff_trace; + struct trace_array_cpu *data; +- unsigned long flags; ++ unsigned int trace_ctx; + + cpu = raw_smp_processor_id(); + /* Always clear the tracing cpu on stopping the trace */ +@@ -429,8 +427,8 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip, int pc) + + atomic_inc(&data->disabled); + +- local_save_flags(flags); +- __trace_function(tr, ip, parent_ip, flags, pc); ++ trace_ctx = tracing_gen_ctx(); ++ __trace_function(tr, ip, parent_ip, trace_ctx); + check_critical_timing(tr, data, parent_ip ? : ip, cpu); + data->critical_start = 0; + atomic_dec(&data->disabled); +@@ -439,20 +437,16 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip, int pc) + /* start and stop critical timings used to for stoppage (in idle) */ + void start_critical_timings(void) + { +- int pc = preempt_count(); +- +- if (preempt_trace(pc) || irq_trace()) +- start_critical_timing(CALLER_ADDR0, CALLER_ADDR1, pc); ++ if (preempt_trace(preempt_count()) || irq_trace()) ++ start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); + } + EXPORT_SYMBOL_GPL(start_critical_timings); + NOKPROBE_SYMBOL(start_critical_timings); + + void stop_critical_timings(void) + { +- int pc = preempt_count(); +- +- if (preempt_trace(pc) || irq_trace()) +- stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1, pc); ++ if (preempt_trace(preempt_count()) || irq_trace()) ++ stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); + } + EXPORT_SYMBOL_GPL(stop_critical_timings); + NOKPROBE_SYMBOL(stop_critical_timings); +@@ -614,19 +608,15 @@ static void irqsoff_tracer_stop(struct trace_array *tr) + */ + void tracer_hardirqs_on(unsigned long a0, unsigned long a1) + { +- unsigned int pc = preempt_count(); +- +- if (!preempt_trace(pc) && irq_trace()) +- stop_critical_timing(a0, a1, pc); ++ if (!preempt_trace(preempt_count()) && irq_trace()) ++ stop_critical_timing(a0, a1); + } + NOKPROBE_SYMBOL(tracer_hardirqs_on); + + void tracer_hardirqs_off(unsigned long a0, unsigned long a1) + { +- unsigned int pc = preempt_count(); +- +- if (!preempt_trace(pc) && irq_trace()) +- start_critical_timing(a0, a1, pc); ++ if (!preempt_trace(preempt_count()) && irq_trace()) ++ start_critical_timing(a0, a1); + } + NOKPROBE_SYMBOL(tracer_hardirqs_off); + +@@ -666,18 +656,14 @@ static struct tracer irqsoff_tracer __read_mostly = + #ifdef CONFIG_PREEMPT_TRACER + void tracer_preempt_on(unsigned long a0, unsigned long a1) + { +- int pc = preempt_count(); +- +- if (preempt_trace(pc) && !irq_trace()) +- stop_critical_timing(a0, a1, pc); ++ if (preempt_trace(preempt_count()) && !irq_trace()) ++ stop_critical_timing(a0, a1); + } + + void tracer_preempt_off(unsigned long a0, unsigned long a1) + { +- int pc = preempt_count(); +- +- if (preempt_trace(pc) && !irq_trace()) +- start_critical_timing(a0, a1, pc); ++ if (preempt_trace(preempt_count()) && !irq_trace()) ++ start_critical_timing(a0, a1); + } + + static int preemptoff_tracer_init(struct trace_array *tr) +diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c +index 718357289899..a2478605e761 100644 +--- a/kernel/trace/trace_kprobe.c ++++ b/kernel/trace/trace_kprobe.c +@@ -1394,8 +1394,7 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs, + if (trace_trigger_soft_disabled(trace_file)) + return; + +- local_save_flags(fbuffer.flags); +- fbuffer.pc = preempt_count(); ++ fbuffer.trace_ctx = tracing_gen_ctx(); + fbuffer.trace_file = trace_file; + + dsize = __get_data_size(&tk->tp, regs); +@@ -1404,7 +1403,7 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs, + trace_event_buffer_lock_reserve(&fbuffer.buffer, trace_file, + call->event.type, + sizeof(*entry) + tk->tp.size + dsize, +- fbuffer.flags, fbuffer.pc); ++ fbuffer.trace_ctx); + if (!fbuffer.event) + return; + +@@ -1442,8 +1441,7 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, + if (trace_trigger_soft_disabled(trace_file)) + return; + +- local_save_flags(fbuffer.flags); +- fbuffer.pc = preempt_count(); ++ fbuffer.trace_ctx = tracing_gen_ctx(); + fbuffer.trace_file = trace_file; + + dsize = __get_data_size(&tk->tp, regs); +@@ -1451,7 +1449,7 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, + trace_event_buffer_lock_reserve(&fbuffer.buffer, trace_file, + call->event.type, + sizeof(*entry) + tk->tp.size + dsize, +- fbuffer.flags, fbuffer.pc); ++ fbuffer.trace_ctx); + if (!fbuffer.event) + return; + +diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c +index 84582bf1ed5f..7221ae0b4c47 100644 +--- a/kernel/trace/trace_mmiotrace.c ++++ b/kernel/trace/trace_mmiotrace.c +@@ -300,10 +300,11 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, + struct trace_buffer *buffer = tr->array_buffer.buffer; + struct ring_buffer_event *event; + struct trace_mmiotrace_rw *entry; +- int pc = preempt_count(); ++ unsigned int trace_ctx; + ++ trace_ctx = tracing_gen_ctx_flags(0); + event = trace_buffer_lock_reserve(buffer, TRACE_MMIO_RW, +- sizeof(*entry), 0, pc); ++ sizeof(*entry), trace_ctx); + if (!event) { + atomic_inc(&dropped_count); + return; +@@ -312,7 +313,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, + entry->rw = *rw; + + if (!call_filter_check_discard(call, entry, buffer, event)) +- trace_buffer_unlock_commit(tr, buffer, event, 0, pc); ++ trace_buffer_unlock_commit(tr, buffer, event, trace_ctx); + } + + void mmio_trace_rw(struct mmiotrace_rw *rw) +@@ -330,10 +331,11 @@ static void __trace_mmiotrace_map(struct trace_array *tr, + struct trace_buffer *buffer = tr->array_buffer.buffer; + struct ring_buffer_event *event; + struct trace_mmiotrace_map *entry; +- int pc = preempt_count(); ++ unsigned int trace_ctx; + ++ trace_ctx = tracing_gen_ctx_flags(0); + event = trace_buffer_lock_reserve(buffer, TRACE_MMIO_MAP, +- sizeof(*entry), 0, pc); ++ sizeof(*entry), trace_ctx); + if (!event) { + atomic_inc(&dropped_count); + return; +@@ -342,7 +344,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr, + entry->map = *map; + + if (!call_filter_check_discard(call, entry, buffer, event)) +- trace_buffer_unlock_commit(tr, buffer, event, 0, pc); ++ trace_buffer_unlock_commit(tr, buffer, event, trace_ctx); + } + + void mmio_trace_mapping(struct mmiotrace_map *map) +diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c +index 037e1e863b17..c1f582e0e97f 100644 +--- a/kernel/trace/trace_sched_wakeup.c ++++ b/kernel/trace/trace_sched_wakeup.c +@@ -67,7 +67,7 @@ static bool function_enabled; + static int + func_prolog_preempt_disable(struct trace_array *tr, + struct trace_array_cpu **data, +- int *pc) ++ unsigned int *trace_ctx) + { + long disabled; + int cpu; +@@ -75,7 +75,7 @@ func_prolog_preempt_disable(struct trace_array *tr, + if (likely(!wakeup_task)) + return 0; + +- *pc = preempt_count(); ++ *trace_ctx = tracing_gen_ctx(); + preempt_disable_notrace(); + + cpu = raw_smp_processor_id(); +@@ -116,8 +116,8 @@ static int wakeup_graph_entry(struct ftrace_graph_ent *trace) + { + struct trace_array *tr = wakeup_trace; + struct trace_array_cpu *data; +- unsigned long flags; +- int pc, ret = 0; ++ unsigned int trace_ctx; ++ int ret = 0; + + if (ftrace_graph_ignore_func(trace)) + return 0; +@@ -131,11 +131,10 @@ static int wakeup_graph_entry(struct ftrace_graph_ent *trace) + if (ftrace_graph_notrace_addr(trace->func)) + return 1; + +- if (!func_prolog_preempt_disable(tr, &data, &pc)) ++ if (!func_prolog_preempt_disable(tr, &data, &trace_ctx)) + return 0; + +- local_save_flags(flags); +- ret = __trace_graph_entry(tr, trace, flags, pc); ++ ret = __trace_graph_entry(tr, trace, trace_ctx); + atomic_dec(&data->disabled); + preempt_enable_notrace(); + +@@ -146,16 +145,14 @@ static void wakeup_graph_return(struct ftrace_graph_ret *trace) + { + struct trace_array *tr = wakeup_trace; + struct trace_array_cpu *data; +- unsigned long flags; +- int pc; ++ unsigned int trace_ctx; + + ftrace_graph_addr_finish(trace); + +- if (!func_prolog_preempt_disable(tr, &data, &pc)) ++ if (!func_prolog_preempt_disable(tr, &data, &trace_ctx)) + return; + +- local_save_flags(flags); +- __trace_graph_return(tr, trace, flags, pc); ++ __trace_graph_return(tr, trace, trace_ctx); + atomic_dec(&data->disabled); + + preempt_enable_notrace(); +@@ -219,13 +216,13 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip, + struct trace_array *tr = wakeup_trace; + struct trace_array_cpu *data; + unsigned long flags; +- int pc; ++ unsigned int trace_ctx; + +- if (!func_prolog_preempt_disable(tr, &data, &pc)) ++ if (!func_prolog_preempt_disable(tr, &data, &trace_ctx)) + return; + + local_irq_save(flags); +- trace_function(tr, ip, parent_ip, flags, pc); ++ trace_function(tr, ip, parent_ip, trace_ctx); + local_irq_restore(flags); + + atomic_dec(&data->disabled); +@@ -305,12 +302,12 @@ static void wakeup_print_header(struct seq_file *s) + static void + __trace_function(struct trace_array *tr, + unsigned long ip, unsigned long parent_ip, +- unsigned long flags, int pc) ++ unsigned int trace_ctx) + { + if (is_graph(tr)) +- trace_graph_function(tr, ip, parent_ip, flags, pc); ++ trace_graph_function(tr, ip, parent_ip, trace_ctx); + else +- trace_function(tr, ip, parent_ip, flags, pc); ++ trace_function(tr, ip, parent_ip, trace_ctx); + } + + static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set) +@@ -377,7 +374,7 @@ static void + tracing_sched_switch_trace(struct trace_array *tr, + struct task_struct *prev, + struct task_struct *next, +- unsigned long flags, int pc) ++ unsigned int trace_ctx) + { + struct trace_event_call *call = &event_context_switch; + struct trace_buffer *buffer = tr->array_buffer.buffer; +@@ -385,7 +382,7 @@ tracing_sched_switch_trace(struct trace_array *tr, + struct ctx_switch_entry *entry; + + event = trace_buffer_lock_reserve(buffer, TRACE_CTX, +- sizeof(*entry), flags, pc); ++ sizeof(*entry), trace_ctx); + if (!event) + return; + entry = ring_buffer_event_data(event); +@@ -398,14 +395,14 @@ tracing_sched_switch_trace(struct trace_array *tr, + entry->next_cpu = task_cpu(next); + + if (!call_filter_check_discard(call, entry, buffer, event)) +- trace_buffer_unlock_commit(tr, buffer, event, flags, pc); ++ trace_buffer_unlock_commit(tr, buffer, event, trace_ctx); + } + + static void + tracing_sched_wakeup_trace(struct trace_array *tr, + struct task_struct *wakee, + struct task_struct *curr, +- unsigned long flags, int pc) ++ unsigned int trace_ctx) + { + struct trace_event_call *call = &event_wakeup; + struct ring_buffer_event *event; +@@ -413,7 +410,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, + struct trace_buffer *buffer = tr->array_buffer.buffer; + + event = trace_buffer_lock_reserve(buffer, TRACE_WAKE, +- sizeof(*entry), flags, pc); ++ sizeof(*entry), trace_ctx); + if (!event) + return; + entry = ring_buffer_event_data(event); +@@ -426,7 +423,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, + entry->next_cpu = task_cpu(wakee); + + if (!call_filter_check_discard(call, entry, buffer, event)) +- trace_buffer_unlock_commit(tr, buffer, event, flags, pc); ++ trace_buffer_unlock_commit(tr, buffer, event, trace_ctx); + } + + static void notrace +@@ -438,7 +435,7 @@ probe_wakeup_sched_switch(void *ignore, bool preempt, + unsigned long flags; + long disabled; + int cpu; +- int pc; ++ unsigned int trace_ctx; + + tracing_record_cmdline(prev); + +@@ -457,8 +454,6 @@ probe_wakeup_sched_switch(void *ignore, bool preempt, + if (next != wakeup_task) + return; + +- pc = preempt_count(); +- + /* disable local data, not wakeup_cpu data */ + cpu = raw_smp_processor_id(); + disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled); +@@ -466,6 +461,8 @@ probe_wakeup_sched_switch(void *ignore, bool preempt, + goto out; + + local_irq_save(flags); ++ trace_ctx = tracing_gen_ctx_flags(flags); ++ + arch_spin_lock(&wakeup_lock); + + /* We could race with grabbing wakeup_lock */ +@@ -475,9 +472,9 @@ probe_wakeup_sched_switch(void *ignore, bool preempt, + /* The task we are waiting for is waking up */ + data = per_cpu_ptr(wakeup_trace->array_buffer.data, wakeup_cpu); + +- __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); +- tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); +- __trace_stack(wakeup_trace, flags, 0, pc); ++ __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, trace_ctx); ++ tracing_sched_switch_trace(wakeup_trace, prev, next, trace_ctx); ++ __trace_stack(wakeup_trace, trace_ctx, 0); + + T0 = data->preempt_timestamp; + T1 = ftrace_now(cpu); +@@ -529,9 +526,8 @@ probe_wakeup(void *ignore, struct task_struct *p) + { + struct trace_array_cpu *data; + int cpu = smp_processor_id(); +- unsigned long flags; + long disabled; +- int pc; ++ unsigned int trace_ctx; + + if (likely(!tracer_enabled)) + return; +@@ -552,11 +548,12 @@ probe_wakeup(void *ignore, struct task_struct *p) + (!dl_task(p) && (p->prio >= wakeup_prio || p->prio >= current->prio))) + return; + +- pc = preempt_count(); + disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->array_buffer.data, cpu)->disabled); + if (unlikely(disabled != 1)) + goto out; + ++ trace_ctx = tracing_gen_ctx(); ++ + /* interrupts should be off from try_to_wake_up */ + arch_spin_lock(&wakeup_lock); + +@@ -583,19 +580,17 @@ probe_wakeup(void *ignore, struct task_struct *p) + + wakeup_task = get_task_struct(p); + +- local_save_flags(flags); +- + data = per_cpu_ptr(wakeup_trace->array_buffer.data, wakeup_cpu); + data->preempt_timestamp = ftrace_now(cpu); +- tracing_sched_wakeup_trace(wakeup_trace, p, current, flags, pc); +- __trace_stack(wakeup_trace, flags, 0, pc); ++ tracing_sched_wakeup_trace(wakeup_trace, p, current, trace_ctx); ++ __trace_stack(wakeup_trace, trace_ctx, 0); + + /* + * We must be careful in using CALLER_ADDR2. But since wake_up + * is not called by an assembly function (where as schedule is) + * it should be safe to use it here. + */ +- __trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc); ++ __trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, trace_ctx); + + out_locked: + arch_spin_unlock(&wakeup_lock); +diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c +index d85a2f0f316b..8bfcd3b09422 100644 +--- a/kernel/trace/trace_syscalls.c ++++ b/kernel/trace/trace_syscalls.c +@@ -298,9 +298,8 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) + struct syscall_metadata *sys_data; + struct ring_buffer_event *event; + struct trace_buffer *buffer; +- unsigned long irq_flags; ++ unsigned int trace_ctx; + unsigned long args[6]; +- int pc; + int syscall_nr; + int size; + +@@ -322,12 +321,11 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) + + size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; + +- local_save_flags(irq_flags); +- pc = preempt_count(); ++ trace_ctx = tracing_gen_ctx(); + + buffer = tr->array_buffer.buffer; + event = trace_buffer_lock_reserve(buffer, +- sys_data->enter_event->event.type, size, irq_flags, pc); ++ sys_data->enter_event->event.type, size, trace_ctx); + if (!event) + return; + +@@ -337,7 +335,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) + memcpy(entry->args, args, sizeof(unsigned long) * sys_data->nb_args); + + event_trigger_unlock_commit(trace_file, buffer, event, entry, +- irq_flags, pc); ++ trace_ctx); + } + + static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) +@@ -348,8 +346,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) + struct syscall_metadata *sys_data; + struct ring_buffer_event *event; + struct trace_buffer *buffer; +- unsigned long irq_flags; +- int pc; ++ unsigned int trace_ctx; + int syscall_nr; + + syscall_nr = trace_get_syscall_nr(current, regs); +@@ -368,13 +365,12 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) + if (!sys_data) + return; + +- local_save_flags(irq_flags); +- pc = preempt_count(); ++ trace_ctx = tracing_gen_ctx(); + + buffer = tr->array_buffer.buffer; + event = trace_buffer_lock_reserve(buffer, + sys_data->exit_event->event.type, sizeof(*entry), +- irq_flags, pc); ++ trace_ctx); + if (!event) + return; + +@@ -383,7 +379,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) + entry->ret = syscall_get_return_value(current, regs); + + event_trigger_unlock_commit(trace_file, buffer, event, entry, +- irq_flags, pc); ++ trace_ctx); + } + + static int reg_event_syscall_enter(struct trace_event_file *file, +diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c +index 60ff36f5d7f9..0b07bb07127d 100644 +--- a/kernel/trace/trace_uprobe.c ++++ b/kernel/trace/trace_uprobe.c +@@ -966,7 +966,7 @@ static void __uprobe_trace_func(struct trace_uprobe *tu, + esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); + size = esize + tu->tp.size + dsize; + event = trace_event_buffer_lock_reserve(&buffer, trace_file, +- call->event.type, size, 0, 0); ++ call->event.type, size, 0); + if (!event) + return; + +@@ -982,7 +982,7 @@ static void __uprobe_trace_func(struct trace_uprobe *tu, + + memcpy(data, ucb->buf, tu->tp.size + dsize); + +- event_trigger_unlock_commit(trace_file, buffer, event, entry, 0, 0); ++ event_trigger_unlock_commit(trace_file, buffer, event, entry, 0); + } + + /* uprobe handler */ +-- +2.43.0 + diff --git a/debian/patches-rt/0077-tracing-Inline-tracing_gen_ctx_flags.patch b/debian/patches-rt/0077-tracing-Inline-tracing_gen_ctx_flags.patch new file mode 100644 index 000000000..38993b6b8 --- /dev/null +++ b/debian/patches-rt/0077-tracing-Inline-tracing_gen_ctx_flags.patch @@ -0,0 +1,184 @@ +From 49d76ea7e195eb08b5c73a3a11dba066afd40070 Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Wed, 3 Feb 2021 11:05:24 -0500 +Subject: [PATCH 077/323] tracing: Inline tracing_gen_ctx_flags() +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Inline tracing_gen_ctx_flags(). This allows to have one ifdef +CONFIG_TRACE_IRQFLAGS_SUPPORT. + +This requires to move `trace_flag_type' so tracing_gen_ctx_flags() can +use it. + +Link: https://lkml.kernel.org/r/20210125194511.3924915-3-bigeasy@linutronix.de + +Suggested-by: Steven Rostedt <rostedt@goodmis.org> +Link: https://lkml.kernel.org/r/20210125140323.6b1ff20c@gandalf.local.home +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/linux/trace_events.h | 54 ++++++++++++++++++++++++++++++++++-- + kernel/trace/trace.c | 38 ++----------------------- + kernel/trace/trace.h | 19 ------------- + 3 files changed, 53 insertions(+), 58 deletions(-) + +diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h +index 2a98c40526a0..c4b0524582d8 100644 +--- a/include/linux/trace_events.h ++++ b/include/linux/trace_events.h +@@ -164,9 +164,57 @@ static inline void tracing_generic_entry_update(struct trace_entry *entry, + entry->flags = trace_ctx >> 16; + } + +-unsigned int tracing_gen_ctx_flags(unsigned long irqflags); +-unsigned int tracing_gen_ctx(void); +-unsigned int tracing_gen_ctx_dec(void); ++unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status); ++ ++enum trace_flag_type { ++ TRACE_FLAG_IRQS_OFF = 0x01, ++ TRACE_FLAG_IRQS_NOSUPPORT = 0x02, ++ TRACE_FLAG_NEED_RESCHED = 0x04, ++ TRACE_FLAG_HARDIRQ = 0x08, ++ TRACE_FLAG_SOFTIRQ = 0x10, ++ TRACE_FLAG_PREEMPT_RESCHED = 0x20, ++ TRACE_FLAG_NMI = 0x40, ++}; ++ ++#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT ++static inline unsigned int tracing_gen_ctx_flags(unsigned long irqflags) ++{ ++ unsigned int irq_status = irqs_disabled_flags(irqflags) ? ++ TRACE_FLAG_IRQS_OFF : 0; ++ return tracing_gen_ctx_irq_test(irq_status); ++} ++static inline unsigned int tracing_gen_ctx(void) ++{ ++ unsigned long irqflags; ++ ++ local_save_flags(irqflags); ++ return tracing_gen_ctx_flags(irqflags); ++} ++#else ++ ++static inline unsigned int tracing_gen_ctx_flags(unsigned long irqflags) ++{ ++ return tracing_gen_ctx_irq_test(TRACE_FLAG_IRQS_NOSUPPORT); ++} ++static inline unsigned int tracing_gen_ctx(void) ++{ ++ return tracing_gen_ctx_irq_test(TRACE_FLAG_IRQS_NOSUPPORT); ++} ++#endif ++ ++static inline unsigned int tracing_gen_ctx_dec(void) ++{ ++ unsigned int trace_ctx; ++ ++ trace_ctx = tracing_gen_ctx(); ++ /* ++ * Subtract one from the preeption counter if preemption is enabled, ++ * see trace_event_buffer_reserve()for details. ++ */ ++ if (IS_ENABLED(CONFIG_PREEMPTION)) ++ trace_ctx--; ++ return trace_ctx; ++} + + struct trace_event_file; + +diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c +index 376eb8a1c913..733ef9230c73 100644 +--- a/kernel/trace/trace.c ++++ b/kernel/trace/trace.c +@@ -2575,20 +2575,13 @@ enum print_line_t trace_handle_return(struct trace_seq *s) + } + EXPORT_SYMBOL_GPL(trace_handle_return); + +-unsigned int tracing_gen_ctx_flags(unsigned long irqflags) ++unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status) + { +- unsigned int trace_flags = 0; ++ unsigned int trace_flags = irqs_status; + unsigned int pc; + + pc = preempt_count(); + +-#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT +- if (irqs_disabled_flags(irqflags)) +- trace_flags |= TRACE_FLAG_IRQS_OFF; +-#else +- trace_flags |= TRACE_FLAG_IRQS_NOSUPPORT; +-#endif +- + if (pc & NMI_MASK) + trace_flags |= TRACE_FLAG_NMI; + if (pc & HARDIRQ_MASK) +@@ -2604,33 +2597,6 @@ unsigned int tracing_gen_ctx_flags(unsigned long irqflags) + return (trace_flags << 16) | (pc & 0xff); + } + +-unsigned int tracing_gen_ctx(void) +-{ +- unsigned long irqflags; +- +-#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT +- local_save_flags(irqflags); +-#else +- irqflags = 0; +-#endif +- return tracing_gen_ctx_flags(irqflags); +-} +- +-unsigned int tracing_gen_ctx_dec(void) +-{ +- unsigned int trace_ctx; +- +- trace_ctx = tracing_gen_ctx(); +- +- /* +- * Subtract one from the preeption counter if preemption is enabled, +- * see trace_event_buffer_reserve()for details. +- */ +- if (IS_ENABLED(CONFIG_PREEMPTION)) +- trace_ctx--; +- return trace_ctx; +-} +- + struct ring_buffer_event * + trace_buffer_lock_reserve(struct trace_buffer *buffer, + int type, +diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h +index 27ed42bccd7f..40e5ee987578 100644 +--- a/kernel/trace/trace.h ++++ b/kernel/trace/trace.h +@@ -136,25 +136,6 @@ struct kretprobe_trace_entry_head { + unsigned long ret_ip; + }; + +-/* +- * trace_flag_type is an enumeration that holds different +- * states when a trace occurs. These are: +- * IRQS_OFF - interrupts were disabled +- * IRQS_NOSUPPORT - arch does not support irqs_disabled_flags +- * NEED_RESCHED - reschedule is requested +- * HARDIRQ - inside an interrupt handler +- * SOFTIRQ - inside a softirq handler +- */ +-enum trace_flag_type { +- TRACE_FLAG_IRQS_OFF = 0x01, +- TRACE_FLAG_IRQS_NOSUPPORT = 0x02, +- TRACE_FLAG_NEED_RESCHED = 0x04, +- TRACE_FLAG_HARDIRQ = 0x08, +- TRACE_FLAG_SOFTIRQ = 0x10, +- TRACE_FLAG_PREEMPT_RESCHED = 0x20, +- TRACE_FLAG_NMI = 0x40, +-}; +- + #define TRACE_BUF_SIZE 1024 + + struct trace_array; +-- +2.43.0 + diff --git a/debian/patches-rt/0078-tracing-Use-in_serving_softirq-to-deduct-softirq-sta.patch b/debian/patches-rt/0078-tracing-Use-in_serving_softirq-to-deduct-softirq-sta.patch new file mode 100644 index 000000000..32411e1ff --- /dev/null +++ b/debian/patches-rt/0078-tracing-Use-in_serving_softirq-to-deduct-softirq-sta.patch @@ -0,0 +1,48 @@ +From 1a49f21b61e919dbef7c948009870a1f1ba31270 Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Wed, 3 Feb 2021 11:05:25 -0500 +Subject: [PATCH 078/323] tracing: Use in_serving_softirq() to deduct softirq + status. +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +PREEMPT_RT does not report "serving softirq" because the tracing core +looks at the preemption counter while PREEMPT_RT does not update it +while processing softirqs in order to remain preemptible. The +information is stored somewhere else. +The in_serving_softirq() macro and the SOFTIRQ_OFFSET define are still +working but not on the preempt-counter. + +Use in_serving_softirq() macro which works on PREEMPT_RT. On !PREEMPT_RT +the compiler (gcc-10 / clang-11) is smart enough to optimize the +in_serving_softirq() related read of the preemption counter away. +The only difference I noticed by using in_serving_softirq() on +!PREEMPT_RT is that gcc-10 implemented tracing_gen_ctx_flags() as +reading FLAG, jmp _tracing_gen_ctx_flags(). Without in_serving_softirq() +it inlined _tracing_gen_ctx_flags() into tracing_gen_ctx_flags(). + +Link: https://lkml.kernel.org/r/20210125194511.3924915-4-bigeasy@linutronix.de + +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + kernel/trace/trace.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c +index 733ef9230c73..70ab6b46ef8f 100644 +--- a/kernel/trace/trace.c ++++ b/kernel/trace/trace.c +@@ -2586,8 +2586,7 @@ unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status) + trace_flags |= TRACE_FLAG_NMI; + if (pc & HARDIRQ_MASK) + trace_flags |= TRACE_FLAG_HARDIRQ; +- +- if (pc & SOFTIRQ_OFFSET) ++ if (in_serving_softirq()) + trace_flags |= TRACE_FLAG_SOFTIRQ; + + if (tif_need_resched()) +-- +2.43.0 + diff --git a/debian/patches-rt/0079-tracing-Remove-NULL-check-from-current-in-tracing_ge.patch b/debian/patches-rt/0079-tracing-Remove-NULL-check-from-current-in-tracing_ge.patch new file mode 100644 index 000000000..5853f5af4 --- /dev/null +++ b/debian/patches-rt/0079-tracing-Remove-NULL-check-from-current-in-tracing_ge.patch @@ -0,0 +1,43 @@ +From 76993b5cd21ad0898e6d940ea5fe0449dec94465 Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Wed, 3 Feb 2021 11:05:26 -0500 +Subject: [PATCH 079/323] tracing: Remove NULL check from current in + tracing_generic_entry_update(). +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +I can't imagine when or why `current' would return a NULL pointer. This +check was added in commit + 72829bc3d63cd ("ftrace: move enums to ftrace.h and make helper function global") + +but it doesn't give me hint why it was needed. + +Assume `current' never returns a NULL pointer and remove the check. + +Link: https://lkml.kernel.org/r/20210125194511.3924915-5-bigeasy@linutronix.de + +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/linux/trace_events.h | 4 +--- + 1 file changed, 1 insertion(+), 3 deletions(-) + +diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h +index c4b0524582d8..2fea9fcd4d4e 100644 +--- a/include/linux/trace_events.h ++++ b/include/linux/trace_events.h +@@ -156,10 +156,8 @@ static inline void tracing_generic_entry_update(struct trace_entry *entry, + unsigned short type, + unsigned int trace_ctx) + { +- struct task_struct *tsk = current; +- + entry->preempt_count = trace_ctx & 0xff; +- entry->pid = (tsk) ? tsk->pid : 0; ++ entry->pid = current->pid; + entry->type = type; + entry->flags = trace_ctx >> 16; + } +-- +2.43.0 + diff --git a/debian/patches-rt/0080-printk-inline-log_output-log_store-in-vprintk_store.patch b/debian/patches-rt/0080-printk-inline-log_output-log_store-in-vprintk_store.patch new file mode 100644 index 000000000..3a67cf2f5 --- /dev/null +++ b/debian/patches-rt/0080-printk-inline-log_output-log_store-in-vprintk_store.patch @@ -0,0 +1,201 @@ +From fe124333809d2e0a0926d5609f78bafffc62f539 Mon Sep 17 00:00:00 2001 +From: John Ogness <john.ogness@linutronix.de> +Date: Wed, 9 Dec 2020 01:50:52 +0106 +Subject: [PATCH 080/323] printk: inline log_output(),log_store() in + vprintk_store() +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +In preparation for removing logbuf_lock, inline log_output() +and log_store() into vprintk_store(). This will simplify dealing +with the various code branches and fallbacks that are possible. + +Signed-off-by: John Ogness <john.ogness@linutronix.de> +Reviewed-by: Petr Mladek <pmladek@suse.com> +Reviewed-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com> +Signed-off-by: Petr Mladek <pmladek@suse.com> +Link: https://lore.kernel.org/r/20201209004453.17720-2-john.ogness@linutronix.de +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + kernel/printk/printk.c | 145 +++++++++++++++++++---------------------- + 1 file changed, 67 insertions(+), 78 deletions(-) + +diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c +index 17a310dcb6d9..9a7c2b561e6d 100644 +--- a/kernel/printk/printk.c ++++ b/kernel/printk/printk.c +@@ -493,52 +493,6 @@ static void truncate_msg(u16 *text_len, u16 *trunc_msg_len) + *trunc_msg_len = 0; + } + +-/* insert record into the buffer, discard old ones, update heads */ +-static int log_store(u32 caller_id, int facility, int level, +- enum log_flags flags, u64 ts_nsec, +- const struct dev_printk_info *dev_info, +- const char *text, u16 text_len) +-{ +- struct prb_reserved_entry e; +- struct printk_record r; +- u16 trunc_msg_len = 0; +- +- prb_rec_init_wr(&r, text_len); +- +- if (!prb_reserve(&e, prb, &r)) { +- /* truncate the message if it is too long for empty buffer */ +- truncate_msg(&text_len, &trunc_msg_len); +- prb_rec_init_wr(&r, text_len + trunc_msg_len); +- /* survive when the log buffer is too small for trunc_msg */ +- if (!prb_reserve(&e, prb, &r)) +- return 0; +- } +- +- /* fill message */ +- memcpy(&r.text_buf[0], text, text_len); +- if (trunc_msg_len) +- memcpy(&r.text_buf[text_len], trunc_msg, trunc_msg_len); +- r.info->text_len = text_len + trunc_msg_len; +- r.info->facility = facility; +- r.info->level = level & 7; +- r.info->flags = flags & 0x1f; +- if (ts_nsec > 0) +- r.info->ts_nsec = ts_nsec; +- else +- r.info->ts_nsec = local_clock(); +- r.info->caller_id = caller_id; +- if (dev_info) +- memcpy(&r.info->dev_info, dev_info, sizeof(r.info->dev_info)); +- +- /* A message without a trailing newline can be continued. */ +- if (!(flags & LOG_NEWLINE)) +- prb_commit(&e); +- else +- prb_final_commit(&e); +- +- return (text_len + trunc_msg_len); +-} +- + int dmesg_restrict = IS_ENABLED(CONFIG_SECURITY_DMESG_RESTRICT); + + static int syslog_action_restricted(int type) +@@ -1933,44 +1887,28 @@ static inline u32 printk_caller_id(void) + 0x80000000 + raw_smp_processor_id(); + } + +-static size_t log_output(int facility, int level, enum log_flags lflags, +- const struct dev_printk_info *dev_info, +- char *text, size_t text_len) +-{ +- const u32 caller_id = printk_caller_id(); +- +- if (lflags & LOG_CONT) { +- struct prb_reserved_entry e; +- struct printk_record r; +- +- prb_rec_init_wr(&r, text_len); +- if (prb_reserve_in_last(&e, prb, &r, caller_id, LOG_LINE_MAX)) { +- memcpy(&r.text_buf[r.info->text_len], text, text_len); +- r.info->text_len += text_len; +- if (lflags & LOG_NEWLINE) { +- r.info->flags |= LOG_NEWLINE; +- prb_final_commit(&e); +- } else { +- prb_commit(&e); +- } +- return text_len; +- } +- } +- +- /* Store it in the record log */ +- return log_store(caller_id, facility, level, lflags, 0, +- dev_info, text, text_len); +-} +- + /* Must be called under logbuf_lock. */ + int vprintk_store(int facility, int level, + const struct dev_printk_info *dev_info, + const char *fmt, va_list args) + { ++ const u32 caller_id = printk_caller_id(); + static char textbuf[LOG_LINE_MAX]; +- char *text = textbuf; +- size_t text_len; ++ struct prb_reserved_entry e; + enum log_flags lflags = 0; ++ struct printk_record r; ++ u16 trunc_msg_len = 0; ++ char *text = textbuf; ++ u16 text_len; ++ u64 ts_nsec; ++ ++ /* ++ * Since the duration of printk() can vary depending on the message ++ * and state of the ringbuffer, grab the timestamp now so that it is ++ * close to the call of printk(). This provides a more deterministic ++ * timestamp with respect to the caller. ++ */ ++ ts_nsec = local_clock(); + + /* + * The printf needs to come first; we need the syslog +@@ -2009,7 +1947,58 @@ int vprintk_store(int facility, int level, + if (dev_info) + lflags |= LOG_NEWLINE; + +- return log_output(facility, level, lflags, dev_info, text, text_len); ++ if (lflags & LOG_CONT) { ++ prb_rec_init_wr(&r, text_len); ++ if (prb_reserve_in_last(&e, prb, &r, caller_id, LOG_LINE_MAX)) { ++ memcpy(&r.text_buf[r.info->text_len], text, text_len); ++ r.info->text_len += text_len; ++ ++ if (lflags & LOG_NEWLINE) { ++ r.info->flags |= LOG_NEWLINE; ++ prb_final_commit(&e); ++ } else { ++ prb_commit(&e); ++ } ++ ++ return text_len; ++ } ++ } ++ ++ /* ++ * Explicitly initialize the record before every prb_reserve() call. ++ * prb_reserve_in_last() and prb_reserve() purposely invalidate the ++ * structure when they fail. ++ */ ++ prb_rec_init_wr(&r, text_len); ++ if (!prb_reserve(&e, prb, &r)) { ++ /* truncate the message if it is too long for empty buffer */ ++ truncate_msg(&text_len, &trunc_msg_len); ++ ++ prb_rec_init_wr(&r, text_len + trunc_msg_len); ++ if (!prb_reserve(&e, prb, &r)) ++ return 0; ++ } ++ ++ /* fill message */ ++ memcpy(&r.text_buf[0], text, text_len); ++ if (trunc_msg_len) ++ memcpy(&r.text_buf[text_len], trunc_msg, trunc_msg_len); ++ r.info->text_len = text_len + trunc_msg_len; ++ r.info->facility = facility; ++ r.info->level = level & 7; ++ r.info->flags = lflags & 0x1f; ++ r.info->ts_nsec = ts_nsec; ++ r.info->caller_id = caller_id; ++ if (dev_info) ++ memcpy(&r.info->dev_info, dev_info, sizeof(r.info->dev_info)); ++ ++ /* A message without a trailing newline can be continued. */ ++ if (!(lflags & LOG_NEWLINE)) ++ prb_commit(&e); ++ else ++ prb_final_commit(&e); ++ ++ return (text_len + trunc_msg_len); + } + + asmlinkage int vprintk_emit(int facility, int level, +-- +2.43.0 + diff --git a/debian/patches-rt/0081-printk-remove-logbuf_lock-writer-protection-of-ringb.patch b/debian/patches-rt/0081-printk-remove-logbuf_lock-writer-protection-of-ringb.patch new file mode 100644 index 000000000..15ea7d1b6 --- /dev/null +++ b/debian/patches-rt/0081-printk-remove-logbuf_lock-writer-protection-of-ringb.patch @@ -0,0 +1,251 @@ +From c2d6c379547a29aa25202641659917ecd9ae7712 Mon Sep 17 00:00:00 2001 +From: John Ogness <john.ogness@linutronix.de> +Date: Wed, 9 Dec 2020 01:50:53 +0106 +Subject: [PATCH 081/323] printk: remove logbuf_lock writer-protection of + ringbuffer +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Since the ringbuffer is lockless, there is no need for it to be +protected by @logbuf_lock. Remove @logbuf_lock writer-protection of +the ringbuffer. The reader-protection is not removed because some +variables, used by readers, are using @logbuf_lock for synchronization: +@syslog_seq, @syslog_time, @syslog_partial, @console_seq, +struct kmsg_dumper. + +For PRINTK_NMI_DIRECT_CONTEXT_MASK, @logbuf_lock usage is not removed +because it may be used for dumper synchronization. + +Without @logbuf_lock synchronization of vprintk_store() it is no +longer possible to use the single static buffer for temporarily +sprint'ing the message. Instead, use vsnprintf() to determine the +length and perform the real vscnprintf() using the area reserved from +the ringbuffer. This leads to suboptimal packing of the message data, +but will result in less wasted storage than multiple per-cpu buffers +to support lockless temporary sprint'ing. + +Signed-off-by: John Ogness <john.ogness@linutronix.de> +Reviewed-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com> +Reviewed-by: Petr Mladek <pmladek@suse.com> +Signed-off-by: Petr Mladek <pmladek@suse.com> +Link: https://lore.kernel.org/r/20201209004453.17720-3-john.ogness@linutronix.de +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + kernel/printk/printk.c | 138 +++++++++++++++++++++++++++++------------ + 1 file changed, 98 insertions(+), 40 deletions(-) + +diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c +index 9a7c2b561e6d..b0a3f7827819 100644 +--- a/kernel/printk/printk.c ++++ b/kernel/printk/printk.c +@@ -1129,7 +1129,7 @@ void __init setup_log_buf(int early) + new_descs, ilog2(new_descs_count), + new_infos); + +- logbuf_lock_irqsave(flags); ++ printk_safe_enter_irqsave(flags); + + log_buf_len = new_log_buf_len; + log_buf = new_log_buf; +@@ -1146,7 +1146,7 @@ void __init setup_log_buf(int early) + */ + prb = &printk_rb_dynamic; + +- logbuf_unlock_irqrestore(flags); ++ printk_safe_exit_irqrestore(flags); + + if (seq != prb_next_seq(&printk_rb_static)) { + pr_err("dropped %llu messages\n", +@@ -1887,18 +1887,90 @@ static inline u32 printk_caller_id(void) + 0x80000000 + raw_smp_processor_id(); + } + +-/* Must be called under logbuf_lock. */ ++/** ++ * parse_prefix - Parse level and control flags. ++ * ++ * @text: The terminated text message. ++ * @level: A pointer to the current level value, will be updated. ++ * @lflags: A pointer to the current log flags, will be updated. ++ * ++ * @level may be NULL if the caller is not interested in the parsed value. ++ * Otherwise the variable pointed to by @level must be set to ++ * LOGLEVEL_DEFAULT in order to be updated with the parsed value. ++ * ++ * @lflags may be NULL if the caller is not interested in the parsed value. ++ * Otherwise the variable pointed to by @lflags will be OR'd with the parsed ++ * value. ++ * ++ * Return: The length of the parsed level and control flags. ++ */ ++static u16 parse_prefix(char *text, int *level, enum log_flags *lflags) ++{ ++ u16 prefix_len = 0; ++ int kern_level; ++ ++ while (*text) { ++ kern_level = printk_get_level(text); ++ if (!kern_level) ++ break; ++ ++ switch (kern_level) { ++ case '0' ... '7': ++ if (level && *level == LOGLEVEL_DEFAULT) ++ *level = kern_level - '0'; ++ break; ++ case 'c': /* KERN_CONT */ ++ if (lflags) ++ *lflags |= LOG_CONT; ++ } ++ ++ prefix_len += 2; ++ text += 2; ++ } ++ ++ return prefix_len; ++} ++ ++static u16 printk_sprint(char *text, u16 size, int facility, enum log_flags *lflags, ++ const char *fmt, va_list args) ++{ ++ u16 text_len; ++ ++ text_len = vscnprintf(text, size, fmt, args); ++ ++ /* Mark and strip a trailing newline. */ ++ if (text_len && text[text_len - 1] == '\n') { ++ text_len--; ++ *lflags |= LOG_NEWLINE; ++ } ++ ++ /* Strip log level and control flags. */ ++ if (facility == 0) { ++ u16 prefix_len; ++ ++ prefix_len = parse_prefix(text, NULL, NULL); ++ if (prefix_len) { ++ text_len -= prefix_len; ++ memmove(text, text + prefix_len, text_len); ++ } ++ } ++ ++ return text_len; ++} ++ ++__printf(4, 0) + int vprintk_store(int facility, int level, + const struct dev_printk_info *dev_info, + const char *fmt, va_list args) + { + const u32 caller_id = printk_caller_id(); +- static char textbuf[LOG_LINE_MAX]; + struct prb_reserved_entry e; + enum log_flags lflags = 0; + struct printk_record r; + u16 trunc_msg_len = 0; +- char *text = textbuf; ++ char prefix_buf[8]; ++ u16 reserve_size; ++ va_list args2; + u16 text_len; + u64 ts_nsec; + +@@ -1911,35 +1983,21 @@ int vprintk_store(int facility, int level, + ts_nsec = local_clock(); + + /* +- * The printf needs to come first; we need the syslog +- * prefix which might be passed-in as a parameter. ++ * The sprintf needs to come first since the syslog prefix might be ++ * passed in as a parameter. An extra byte must be reserved so that ++ * later the vscnprintf() into the reserved buffer has room for the ++ * terminating '\0', which is not counted by vsnprintf(). + */ +- text_len = vscnprintf(text, sizeof(textbuf), fmt, args); +- +- /* mark and strip a trailing newline */ +- if (text_len && text[text_len-1] == '\n') { +- text_len--; +- lflags |= LOG_NEWLINE; +- } +- +- /* strip kernel syslog prefix and extract log level or control flags */ +- if (facility == 0) { +- int kern_level; ++ va_copy(args2, args); ++ reserve_size = vsnprintf(&prefix_buf[0], sizeof(prefix_buf), fmt, args2) + 1; ++ va_end(args2); + +- while ((kern_level = printk_get_level(text)) != 0) { +- switch (kern_level) { +- case '0' ... '7': +- if (level == LOGLEVEL_DEFAULT) +- level = kern_level - '0'; +- break; +- case 'c': /* KERN_CONT */ +- lflags |= LOG_CONT; +- } ++ if (reserve_size > LOG_LINE_MAX) ++ reserve_size = LOG_LINE_MAX; + +- text_len -= 2; +- text += 2; +- } +- } ++ /* Extract log level or control flags. */ ++ if (facility == 0) ++ parse_prefix(&prefix_buf[0], &level, &lflags); + + if (level == LOGLEVEL_DEFAULT) + level = default_message_loglevel; +@@ -1948,9 +2006,10 @@ int vprintk_store(int facility, int level, + lflags |= LOG_NEWLINE; + + if (lflags & LOG_CONT) { +- prb_rec_init_wr(&r, text_len); ++ prb_rec_init_wr(&r, reserve_size); + if (prb_reserve_in_last(&e, prb, &r, caller_id, LOG_LINE_MAX)) { +- memcpy(&r.text_buf[r.info->text_len], text, text_len); ++ text_len = printk_sprint(&r.text_buf[r.info->text_len], reserve_size, ++ facility, &lflags, fmt, args); + r.info->text_len += text_len; + + if (lflags & LOG_NEWLINE) { +@@ -1969,18 +2028,18 @@ int vprintk_store(int facility, int level, + * prb_reserve_in_last() and prb_reserve() purposely invalidate the + * structure when they fail. + */ +- prb_rec_init_wr(&r, text_len); ++ prb_rec_init_wr(&r, reserve_size); + if (!prb_reserve(&e, prb, &r)) { + /* truncate the message if it is too long for empty buffer */ +- truncate_msg(&text_len, &trunc_msg_len); ++ truncate_msg(&reserve_size, &trunc_msg_len); + +- prb_rec_init_wr(&r, text_len + trunc_msg_len); ++ prb_rec_init_wr(&r, reserve_size + trunc_msg_len); + if (!prb_reserve(&e, prb, &r)) + return 0; + } + + /* fill message */ +- memcpy(&r.text_buf[0], text, text_len); ++ text_len = printk_sprint(&r.text_buf[0], reserve_size, facility, &lflags, fmt, args); + if (trunc_msg_len) + memcpy(&r.text_buf[text_len], trunc_msg, trunc_msg_len); + r.info->text_len = text_len + trunc_msg_len; +@@ -2021,10 +2080,9 @@ asmlinkage int vprintk_emit(int facility, int level, + boot_delay_msec(level); + printk_delay(); + +- /* This stops the holder of console_sem just where we want him */ +- logbuf_lock_irqsave(flags); ++ printk_safe_enter_irqsave(flags); + printed_len = vprintk_store(facility, level, dev_info, fmt, args); +- logbuf_unlock_irqrestore(flags); ++ printk_safe_exit_irqrestore(flags); + + /* If called from the scheduler, we can not call up(). */ + if (!in_sched) { +-- +2.43.0 + diff --git a/debian/patches-rt/0082-printk-limit-second-loop-of-syslog_print_all.patch b/debian/patches-rt/0082-printk-limit-second-loop-of-syslog_print_all.patch new file mode 100644 index 000000000..41d143e99 --- /dev/null +++ b/debian/patches-rt/0082-printk-limit-second-loop-of-syslog_print_all.patch @@ -0,0 +1,56 @@ +From 6769883ba30e543644f62ababe6910c6774f1885 Mon Sep 17 00:00:00 2001 +From: John Ogness <john.ogness@linutronix.de> +Date: Wed, 17 Feb 2021 16:15:31 +0100 +Subject: [PATCH 082/323] printk: limit second loop of syslog_print_all +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +The second loop of syslog_print_all() subtracts lengths that were +added in the first loop. With commit b031a684bfd0 ("printk: remove +logbuf_lock writer-protection of ringbuffer") it is possible that +records are (over)written during syslog_print_all(). This allows the +possibility of the second loop subtracting lengths that were never +added in the first loop. + +This situation can result in syslog_print_all() filling the buffer +starting from a later record, even though there may have been room +to fit the earlier record(s) as well. + +Fixes: b031a684bfd0 ("printk: remove logbuf_lock writer-protection of ringbuffer") +Signed-off-by: John Ogness <john.ogness@linutronix.de> +Reviewed-by: Petr Mladek <pmladek@suse.com> +--- + kernel/printk/printk.c | 9 ++++++++- + 1 file changed, 8 insertions(+), 1 deletion(-) + +diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c +index b0a3f7827819..044bb3791896 100644 +--- a/kernel/printk/printk.c ++++ b/kernel/printk/printk.c +@@ -1497,6 +1497,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) + struct printk_info info; + unsigned int line_count; + struct printk_record r; ++ u64 max_seq; + char *text; + int len = 0; + u64 seq; +@@ -1515,9 +1516,15 @@ static int syslog_print_all(char __user *buf, int size, bool clear) + prb_for_each_info(clear_seq, prb, seq, &info, &line_count) + len += get_record_print_text_size(&info, line_count, true, time); + ++ /* ++ * Set an upper bound for the next loop to avoid subtracting lengths ++ * that were never added. ++ */ ++ max_seq = seq; ++ + /* move first record forward until length fits into the buffer */ + prb_for_each_info(clear_seq, prb, seq, &info, &line_count) { +- if (len <= size) ++ if (len <= size || info.seq >= max_seq) + break; + len -= get_record_print_text_size(&info, line_count, true, time); + } +-- +2.43.0 + diff --git a/debian/patches-rt/0083-printk-kmsg_dump-remove-unused-fields.patch b/debian/patches-rt/0083-printk-kmsg_dump-remove-unused-fields.patch new file mode 100644 index 000000000..8b219b627 --- /dev/null +++ b/debian/patches-rt/0083-printk-kmsg_dump-remove-unused-fields.patch @@ -0,0 +1,43 @@ +From dabfa71dd963396a46aee0066bb3eff206de08f7 Mon Sep 17 00:00:00 2001 +From: John Ogness <john.ogness@linutronix.de> +Date: Mon, 21 Dec 2020 11:19:39 +0106 +Subject: [PATCH 083/323] printk: kmsg_dump: remove unused fields +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +struct kmsg_dumper still contains some fields that were used to +iterate the old ringbuffer. They are no longer used. Remove them +and update the struct documentation. + +Signed-off-by: John Ogness <john.ogness@linutronix.de> +Reviewed-by: Petr Mladek <pmladek@suse.com> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/linux/kmsg_dump.h | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +diff --git a/include/linux/kmsg_dump.h b/include/linux/kmsg_dump.h +index 3378bcbe585e..235c50982c2d 100644 +--- a/include/linux/kmsg_dump.h ++++ b/include/linux/kmsg_dump.h +@@ -36,6 +36,9 @@ enum kmsg_dump_reason { + * through the record iterator + * @max_reason: filter for highest reason number that should be dumped + * @registered: Flag that specifies if this is already registered ++ * @active: Flag that specifies if this is currently dumping ++ * @cur_seq: Points to the oldest message to dump (private) ++ * @next_seq: Points after the newest message to dump (private) + */ + struct kmsg_dumper { + struct list_head list; +@@ -45,8 +48,6 @@ struct kmsg_dumper { + bool registered; + + /* private state of the kmsg iterator */ +- u32 cur_idx; +- u32 next_idx; + u64 cur_seq; + u64 next_seq; + }; +-- +2.43.0 + diff --git a/debian/patches-rt/0084-printk-refactor-kmsg_dump_get_buffer.patch b/debian/patches-rt/0084-printk-refactor-kmsg_dump_get_buffer.patch new file mode 100644 index 000000000..dd1390103 --- /dev/null +++ b/debian/patches-rt/0084-printk-refactor-kmsg_dump_get_buffer.patch @@ -0,0 +1,145 @@ +From b6e17f3557ef5e32212483ca92910ea28eb21eeb Mon Sep 17 00:00:00 2001 +From: John Ogness <john.ogness@linutronix.de> +Date: Mon, 30 Nov 2020 01:41:56 +0106 +Subject: [PATCH 084/323] printk: refactor kmsg_dump_get_buffer() +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +kmsg_dump_get_buffer() requires nearly the same logic as +syslog_print_all(), but uses different variable names and +does not make use of the ringbuffer loop macros. Modify +kmsg_dump_get_buffer() so that the implementation is as similar +to syslog_print_all() as possible. + +A follow-up commit will move this common logic into a +separate helper function. + +Signed-off-by: John Ogness <john.ogness@linutronix.de> +Reviewed-by: Petr Mladek <pmladek@suse.com> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/linux/kmsg_dump.h | 2 +- + kernel/printk/printk.c | 60 +++++++++++++++++++++------------------ + 2 files changed, 33 insertions(+), 29 deletions(-) + +diff --git a/include/linux/kmsg_dump.h b/include/linux/kmsg_dump.h +index 235c50982c2d..4095a34db0fa 100644 +--- a/include/linux/kmsg_dump.h ++++ b/include/linux/kmsg_dump.h +@@ -62,7 +62,7 @@ bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, + char *line, size_t size, size_t *len); + + bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, +- char *buf, size_t size, size_t *len); ++ char *buf, size_t size, size_t *len_out); + + void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper); + +diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c +index 044bb3791896..ad7edcb693d4 100644 +--- a/kernel/printk/printk.c ++++ b/kernel/printk/printk.c +@@ -3429,7 +3429,7 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_line); + * read. + */ + bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, +- char *buf, size_t size, size_t *len) ++ char *buf, size_t size, size_t *len_out) + { + struct printk_info info; + unsigned int line_count; +@@ -3437,12 +3437,10 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, + unsigned long flags; + u64 seq; + u64 next_seq; +- size_t l = 0; ++ size_t len = 0; + bool ret = false; + bool time = printk_time; + +- prb_rec_init_rd(&r, &info, buf, size); +- + if (!dumper->active || !buf || !size) + goto out; + +@@ -3460,48 +3458,54 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, + goto out; + } + +- /* calculate length of entire buffer */ +- seq = dumper->cur_seq; +- while (prb_read_valid_info(prb, seq, &info, &line_count)) { +- if (r.info->seq >= dumper->next_seq) ++ /* ++ * Find first record that fits, including all following records, ++ * into the user-provided buffer for this dump. ++ */ ++ ++ prb_for_each_info(dumper->cur_seq, prb, seq, &info, &line_count) { ++ if (info.seq >= dumper->next_seq) + break; +- l += get_record_print_text_size(&info, line_count, syslog, time); +- seq = r.info->seq + 1; ++ len += get_record_print_text_size(&info, line_count, syslog, time); + } + +- /* move first record forward until length fits into the buffer */ +- seq = dumper->cur_seq; +- while (l >= size && prb_read_valid_info(prb, seq, +- &info, &line_count)) { +- if (r.info->seq >= dumper->next_seq) ++ /* ++ * Move first record forward until length fits into the buffer. Ignore ++ * newest messages that were not counted in the above cycle. Messages ++ * might appear and get lost in the meantime. This is the best effort ++ * that prevents an infinite loop. ++ */ ++ prb_for_each_info(dumper->cur_seq, prb, seq, &info, &line_count) { ++ if (len < size || info.seq >= dumper->next_seq) + break; +- l -= get_record_print_text_size(&info, line_count, syslog, time); +- seq = r.info->seq + 1; ++ len -= get_record_print_text_size(&info, line_count, syslog, time); + } + +- /* last message in next interation */ ++ /* ++ * Next kmsg_dump_get_buffer() invocation will dump block of ++ * older records stored right before this one. ++ */ + next_seq = seq; + +- /* actually read text into the buffer now */ +- l = 0; +- while (prb_read_valid(prb, seq, &r)) { ++ prb_rec_init_rd(&r, &info, buf, size); ++ ++ len = 0; ++ prb_for_each_record(seq, prb, seq, &r) { + if (r.info->seq >= dumper->next_seq) + break; + +- l += record_print_text(&r, syslog, time); +- +- /* adjust record to store to remaining buffer space */ +- prb_rec_init_rd(&r, &info, buf + l, size - l); ++ len += record_print_text(&r, syslog, time); + +- seq = r.info->seq + 1; ++ /* Adjust record to store to remaining buffer space. */ ++ prb_rec_init_rd(&r, &info, buf + len, size - len); + } + + dumper->next_seq = next_seq; + ret = true; + logbuf_unlock_irqrestore(flags); + out: +- if (len) +- *len = l; ++ if (len_out) ++ *len_out = len; + return ret; + } + EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer); +-- +2.43.0 + diff --git a/debian/patches-rt/0085-printk-consolidate-kmsg_dump_get_buffer-syslog_print.patch b/debian/patches-rt/0085-printk-consolidate-kmsg_dump_get_buffer-syslog_print.patch new file mode 100644 index 000000000..acc8de768 --- /dev/null +++ b/debian/patches-rt/0085-printk-consolidate-kmsg_dump_get_buffer-syslog_print.patch @@ -0,0 +1,147 @@ +From 922cda1ee8d183c2e98429cdabf907a92df72465 Mon Sep 17 00:00:00 2001 +From: John Ogness <john.ogness@linutronix.de> +Date: Wed, 13 Jan 2021 11:29:53 +0106 +Subject: [PATCH 085/323] printk: consolidate + kmsg_dump_get_buffer/syslog_print_all code +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +The logic for finding records to fit into a buffer is the same for +kmsg_dump_get_buffer() and syslog_print_all(). Introduce a helper +function find_first_fitting_seq() to handle this logic. + +Signed-off-by: John Ogness <john.ogness@linutronix.de> +--- + kernel/printk/printk.c | 87 ++++++++++++++++++++++++------------------ + 1 file changed, 50 insertions(+), 37 deletions(-) + +diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c +index ad7edcb693d4..1f710a2a40b6 100644 +--- a/kernel/printk/printk.c ++++ b/kernel/printk/printk.c +@@ -1424,6 +1424,50 @@ static size_t get_record_print_text_size(struct printk_info *info, + return ((prefix_len * line_count) + info->text_len + 1); + } + ++/* ++ * Beginning with @start_seq, find the first record where it and all following ++ * records up to (but not including) @max_seq fit into @size. ++ * ++ * @max_seq is simply an upper bound and does not need to exist. If the caller ++ * does not require an upper bound, -1 can be used for @max_seq. ++ */ ++static u64 find_first_fitting_seq(u64 start_seq, u64 max_seq, size_t size, ++ bool syslog, bool time) ++{ ++ struct printk_info info; ++ unsigned int line_count; ++ size_t len = 0; ++ u64 seq; ++ ++ /* Determine the size of the records up to @max_seq. */ ++ prb_for_each_info(start_seq, prb, seq, &info, &line_count) { ++ if (info.seq >= max_seq) ++ break; ++ len += get_record_print_text_size(&info, line_count, syslog, time); ++ } ++ ++ /* ++ * Adjust the upper bound for the next loop to avoid subtracting ++ * lengths that were never added. ++ */ ++ if (seq < max_seq) ++ max_seq = seq; ++ ++ /* ++ * Move first record forward until length fits into the buffer. Ignore ++ * newest messages that were not counted in the above cycle. Messages ++ * might appear and get lost in the meantime. This is a best effort ++ * that prevents an infinite loop that could occur with a retry. ++ */ ++ prb_for_each_info(start_seq, prb, seq, &info, &line_count) { ++ if (len <= size || info.seq >= max_seq) ++ break; ++ len -= get_record_print_text_size(&info, line_count, syslog, time); ++ } ++ ++ return seq; ++} ++ + static int syslog_print(char __user *buf, int size) + { + struct printk_info info; +@@ -1495,9 +1539,7 @@ static int syslog_print(char __user *buf, int size) + static int syslog_print_all(char __user *buf, int size, bool clear) + { + struct printk_info info; +- unsigned int line_count; + struct printk_record r; +- u64 max_seq; + char *text; + int len = 0; + u64 seq; +@@ -1513,21 +1555,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) + * Find first record that fits, including all following records, + * into the user-provided buffer for this dump. + */ +- prb_for_each_info(clear_seq, prb, seq, &info, &line_count) +- len += get_record_print_text_size(&info, line_count, true, time); +- +- /* +- * Set an upper bound for the next loop to avoid subtracting lengths +- * that were never added. +- */ +- max_seq = seq; +- +- /* move first record forward until length fits into the buffer */ +- prb_for_each_info(clear_seq, prb, seq, &info, &line_count) { +- if (len <= size || info.seq >= max_seq) +- break; +- len -= get_record_print_text_size(&info, line_count, true, time); +- } ++ seq = find_first_fitting_seq(clear_seq, -1, size, true, time); + + prb_rec_init_rd(&r, &info, text, LOG_LINE_MAX + PREFIX_MAX); + +@@ -3432,7 +3460,6 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, + char *buf, size_t size, size_t *len_out) + { + struct printk_info info; +- unsigned int line_count; + struct printk_record r; + unsigned long flags; + u64 seq; +@@ -3460,26 +3487,12 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, + + /* + * Find first record that fits, including all following records, +- * into the user-provided buffer for this dump. ++ * into the user-provided buffer for this dump. Pass in size-1 ++ * because this function (by way of record_print_text()) will ++ * not write more than size-1 bytes of text into @buf. + */ +- +- prb_for_each_info(dumper->cur_seq, prb, seq, &info, &line_count) { +- if (info.seq >= dumper->next_seq) +- break; +- len += get_record_print_text_size(&info, line_count, syslog, time); +- } +- +- /* +- * Move first record forward until length fits into the buffer. Ignore +- * newest messages that were not counted in the above cycle. Messages +- * might appear and get lost in the meantime. This is the best effort +- * that prevents an infinite loop. +- */ +- prb_for_each_info(dumper->cur_seq, prb, seq, &info, &line_count) { +- if (len < size || info.seq >= dumper->next_seq) +- break; +- len -= get_record_print_text_size(&info, line_count, syslog, time); +- } ++ seq = find_first_fitting_seq(dumper->cur_seq, dumper->next_seq, ++ size - 1, syslog, time); + + /* + * Next kmsg_dump_get_buffer() invocation will dump block of +-- +2.43.0 + diff --git a/debian/patches-rt/0086-printk-introduce-CONSOLE_LOG_MAX-for-improved-multi-.patch b/debian/patches-rt/0086-printk-introduce-CONSOLE_LOG_MAX-for-improved-multi-.patch new file mode 100644 index 000000000..876f0920e --- /dev/null +++ b/debian/patches-rt/0086-printk-introduce-CONSOLE_LOG_MAX-for-improved-multi-.patch @@ -0,0 +1,95 @@ +From 2b148f755fdd9d276d191e982d525f330cffad0b Mon Sep 17 00:00:00 2001 +From: John Ogness <john.ogness@linutronix.de> +Date: Thu, 10 Dec 2020 12:48:01 +0106 +Subject: [PATCH 086/323] printk: introduce CONSOLE_LOG_MAX for improved + multi-line support +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Instead of using "LOG_LINE_MAX + PREFIX_MAX" for temporary buffer +sizes, introduce CONSOLE_LOG_MAX. This represents the maximum size +that is allowed to be printed to the console for a single record. + +Rather than setting CONSOLE_LOG_MAX to "LOG_LINE_MAX + PREFIX_MAX" +(1024), increase it to 4096. With a larger buffer size, multi-line +records that are nearly LOG_LINE_MAX in length will have a better +chance of being fully printed. (When formatting a record for the +console, each line of a multi-line record is prepended with a copy +of the prefix.) + +Signed-off-by: John Ogness <john.ogness@linutronix.de> +--- + kernel/printk/printk.c | 18 +++++++++++------- + 1 file changed, 11 insertions(+), 7 deletions(-) + +diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c +index 1f710a2a40b6..1bc4ff19c0d2 100644 +--- a/kernel/printk/printk.c ++++ b/kernel/printk/printk.c +@@ -412,8 +412,13 @@ static u64 clear_seq; + #else + #define PREFIX_MAX 32 + #endif ++ ++/* the maximum size allowed to be reserved for a record */ + #define LOG_LINE_MAX (1024 - PREFIX_MAX) + ++/* the maximum size of a formatted record (i.e. with prefix added per line) */ ++#define CONSOLE_LOG_MAX 4096 ++ + #define LOG_LEVEL(v) ((v) & 0x07) + #define LOG_FACILITY(v) ((v) >> 3 & 0xff) + +@@ -1475,11 +1480,11 @@ static int syslog_print(char __user *buf, int size) + char *text; + int len = 0; + +- text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); ++ text = kmalloc(CONSOLE_LOG_MAX, GFP_KERNEL); + if (!text) + return -ENOMEM; + +- prb_rec_init_rd(&r, &info, text, LOG_LINE_MAX + PREFIX_MAX); ++ prb_rec_init_rd(&r, &info, text, CONSOLE_LOG_MAX); + + while (size > 0) { + size_t n; +@@ -1545,7 +1550,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) + u64 seq; + bool time; + +- text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); ++ text = kmalloc(CONSOLE_LOG_MAX, GFP_KERNEL); + if (!text) + return -ENOMEM; + +@@ -1557,7 +1562,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) + */ + seq = find_first_fitting_seq(clear_seq, -1, size, true, time); + +- prb_rec_init_rd(&r, &info, text, LOG_LINE_MAX + PREFIX_MAX); ++ prb_rec_init_rd(&r, &info, text, CONSOLE_LOG_MAX); + + len = 0; + prb_for_each_record(seq, prb, seq, &r) { +@@ -2190,8 +2195,7 @@ EXPORT_SYMBOL(printk); + + #else /* CONFIG_PRINTK */ + +-#define LOG_LINE_MAX 0 +-#define PREFIX_MAX 0 ++#define CONSOLE_LOG_MAX 0 + #define printk_time false + + #define prb_read_valid(rb, seq, r) false +@@ -2509,7 +2513,7 @@ static inline int can_use_console(void) + void console_unlock(void) + { + static char ext_text[CONSOLE_EXT_LOG_MAX]; +- static char text[LOG_LINE_MAX + PREFIX_MAX]; ++ static char text[CONSOLE_LOG_MAX]; + unsigned long flags; + bool do_cond_resched, retry; + struct printk_info info; +-- +2.43.0 + diff --git a/debian/patches-rt/0087-printk-use-seqcount_latch-for-clear_seq.patch b/debian/patches-rt/0087-printk-use-seqcount_latch-for-clear_seq.patch new file mode 100644 index 000000000..f34000021 --- /dev/null +++ b/debian/patches-rt/0087-printk-use-seqcount_latch-for-clear_seq.patch @@ -0,0 +1,147 @@ +From 0b877c3f36fc909dc2aec9f70e2e80ad0c69a60b Mon Sep 17 00:00:00 2001 +From: John Ogness <john.ogness@linutronix.de> +Date: Mon, 30 Nov 2020 01:41:58 +0106 +Subject: [PATCH 087/323] printk: use seqcount_latch for clear_seq +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +kmsg_dump_rewind_nolock() locklessly reads @clear_seq. However, +this is not done atomically. Since @clear_seq is 64-bit, this +cannot be an atomic operation for all platforms. Therefore, use +a seqcount_latch to allow readers to always read a consistent +value. + +Signed-off-by: John Ogness <john.ogness@linutronix.de> +Reviewed-by: Petr Mladek <pmladek@suse.com> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + kernel/printk/printk.c | 58 ++++++++++++++++++++++++++++++++++++------ + 1 file changed, 50 insertions(+), 8 deletions(-) + +diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c +index 1bc4ff19c0d2..4444b3e292d5 100644 +--- a/kernel/printk/printk.c ++++ b/kernel/printk/printk.c +@@ -404,8 +404,21 @@ static u64 console_seq; + static u64 exclusive_console_stop_seq; + static unsigned long console_dropped; + +-/* the next printk record to read after the last 'clear' command */ +-static u64 clear_seq; ++struct latched_seq { ++ seqcount_latch_t latch; ++ u64 val[2]; ++}; ++ ++/* ++ * The next printk record to read after the last 'clear' command. There are ++ * two copies (updated with seqcount_latch) so that reads can locklessly ++ * access a valid value. Writers are synchronized by @logbuf_lock. ++ */ ++static struct latched_seq clear_seq = { ++ .latch = SEQCNT_LATCH_ZERO(clear_seq.latch), ++ .val[0] = 0, ++ .val[1] = 0, ++}; + + #ifdef CONFIG_PRINTK_CALLER + #define PREFIX_MAX 48 +@@ -459,6 +472,31 @@ bool printk_percpu_data_ready(void) + return __printk_percpu_data_ready; + } + ++/* Must be called under logbuf_lock. */ ++static void latched_seq_write(struct latched_seq *ls, u64 val) ++{ ++ raw_write_seqcount_latch(&ls->latch); ++ ls->val[0] = val; ++ raw_write_seqcount_latch(&ls->latch); ++ ls->val[1] = val; ++} ++ ++/* Can be called from any context. */ ++static u64 latched_seq_read_nolock(struct latched_seq *ls) ++{ ++ unsigned int seq; ++ unsigned int idx; ++ u64 val; ++ ++ do { ++ seq = raw_read_seqcount_latch(&ls->latch); ++ idx = seq & 0x1; ++ val = ls->val[idx]; ++ } while (read_seqcount_latch_retry(&ls->latch, seq)); ++ ++ return val; ++} ++ + /* Return log buffer address */ + char *log_buf_addr_get(void) + { +@@ -804,7 +842,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) + * like issued by 'dmesg -c'. Reading /dev/kmsg itself + * changes no global state, and does not clear anything. + */ +- user->seq = clear_seq; ++ user->seq = latched_seq_read_nolock(&clear_seq); + break; + case SEEK_END: + /* after the last record */ +@@ -963,6 +1001,9 @@ void log_buf_vmcoreinfo_setup(void) + + VMCOREINFO_SIZE(atomic_long_t); + VMCOREINFO_TYPE_OFFSET(atomic_long_t, counter); ++ ++ VMCOREINFO_STRUCT_SIZE(latched_seq); ++ VMCOREINFO_OFFSET(latched_seq, val); + } + #endif + +@@ -1560,7 +1601,8 @@ static int syslog_print_all(char __user *buf, int size, bool clear) + * Find first record that fits, including all following records, + * into the user-provided buffer for this dump. + */ +- seq = find_first_fitting_seq(clear_seq, -1, size, true, time); ++ seq = find_first_fitting_seq(latched_seq_read_nolock(&clear_seq), -1, ++ size, true, time); + + prb_rec_init_rd(&r, &info, text, CONSOLE_LOG_MAX); + +@@ -1587,7 +1629,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) + } + + if (clear) +- clear_seq = seq; ++ latched_seq_write(&clear_seq, seq); + logbuf_unlock_irq(); + + kfree(text); +@@ -1597,7 +1639,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) + static void syslog_clear(void) + { + logbuf_lock_irq(); +- clear_seq = prb_next_seq(prb); ++ latched_seq_write(&clear_seq, prb_next_seq(prb)); + logbuf_unlock_irq(); + } + +@@ -3341,7 +3383,7 @@ void kmsg_dump(enum kmsg_dump_reason reason) + dumper->active = true; + + logbuf_lock_irqsave(flags); +- dumper->cur_seq = clear_seq; ++ dumper->cur_seq = latched_seq_read_nolock(&clear_seq); + dumper->next_seq = prb_next_seq(prb); + logbuf_unlock_irqrestore(flags); + +@@ -3539,7 +3581,7 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer); + */ + void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper) + { +- dumper->cur_seq = clear_seq; ++ dumper->cur_seq = latched_seq_read_nolock(&clear_seq); + dumper->next_seq = prb_next_seq(prb); + } + +-- +2.43.0 + diff --git a/debian/patches-rt/0088-printk-use-atomic64_t-for-devkmsg_user.seq.patch b/debian/patches-rt/0088-printk-use-atomic64_t-for-devkmsg_user.seq.patch new file mode 100644 index 000000000..d5c3de71c --- /dev/null +++ b/debian/patches-rt/0088-printk-use-atomic64_t-for-devkmsg_user.seq.patch @@ -0,0 +1,112 @@ +From ccc444abbce9df5f0747db8dd10bd39388d58836 Mon Sep 17 00:00:00 2001 +From: John Ogness <john.ogness@linutronix.de> +Date: Thu, 10 Dec 2020 15:33:40 +0106 +Subject: [PATCH 088/323] printk: use atomic64_t for devkmsg_user.seq +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +@user->seq is indirectly protected by @logbuf_lock. Once @logbuf_lock +is removed, @user->seq will be no longer safe from an atomicity point +of view. + +In preparation for the removal of @logbuf_lock, change it to +atomic64_t to provide this safety. + +Signed-off-by: John Ogness <john.ogness@linutronix.de> +--- + kernel/printk/printk.c | 22 +++++++++++----------- + 1 file changed, 11 insertions(+), 11 deletions(-) + +diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c +index 4444b3e292d5..a351ed400c04 100644 +--- a/kernel/printk/printk.c ++++ b/kernel/printk/printk.c +@@ -664,7 +664,7 @@ static ssize_t msg_print_ext_body(char *buf, size_t size, + + /* /dev/kmsg - userspace message inject/listen interface */ + struct devkmsg_user { +- u64 seq; ++ atomic64_t seq; + struct ratelimit_state rs; + struct mutex lock; + char buf[CONSOLE_EXT_LOG_MAX]; +@@ -766,7 +766,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, + return ret; + + logbuf_lock_irq(); +- if (!prb_read_valid(prb, user->seq, r)) { ++ if (!prb_read_valid(prb, atomic64_read(&user->seq), r)) { + if (file->f_flags & O_NONBLOCK) { + ret = -EAGAIN; + logbuf_unlock_irq(); +@@ -775,15 +775,15 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, + + logbuf_unlock_irq(); + ret = wait_event_interruptible(log_wait, +- prb_read_valid(prb, user->seq, r)); ++ prb_read_valid(prb, atomic64_read(&user->seq), r)); + if (ret) + goto out; + logbuf_lock_irq(); + } + +- if (r->info->seq != user->seq) { ++ if (r->info->seq != atomic64_read(&user->seq)) { + /* our last seen message is gone, return error and reset */ +- user->seq = r->info->seq; ++ atomic64_set(&user->seq, r->info->seq); + ret = -EPIPE; + logbuf_unlock_irq(); + goto out; +@@ -794,7 +794,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, + &r->text_buf[0], r->info->text_len, + &r->info->dev_info); + +- user->seq = r->info->seq + 1; ++ atomic64_set(&user->seq, r->info->seq + 1); + logbuf_unlock_irq(); + + if (len > count) { +@@ -834,7 +834,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) + switch (whence) { + case SEEK_SET: + /* the first record */ +- user->seq = prb_first_valid_seq(prb); ++ atomic64_set(&user->seq, prb_first_valid_seq(prb)); + break; + case SEEK_DATA: + /* +@@ -842,11 +842,11 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) + * like issued by 'dmesg -c'. Reading /dev/kmsg itself + * changes no global state, and does not clear anything. + */ +- user->seq = latched_seq_read_nolock(&clear_seq); ++ atomic64_set(&user->seq, latched_seq_read_nolock(&clear_seq)); + break; + case SEEK_END: + /* after the last record */ +- user->seq = prb_next_seq(prb); ++ atomic64_set(&user->seq, prb_next_seq(prb)); + break; + default: + ret = -EINVAL; +@@ -869,7 +869,7 @@ static __poll_t devkmsg_poll(struct file *file, poll_table *wait) + logbuf_lock_irq(); + if (prb_read_valid_info(prb, user->seq, &info, NULL)) { + /* return error when data has vanished underneath us */ +- if (info.seq != user->seq) ++ if (info.seq != atomic64_read(&user->seq)) + ret = EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI; + else + ret = EPOLLIN|EPOLLRDNORM; +@@ -908,7 +908,7 @@ static int devkmsg_open(struct inode *inode, struct file *file) + &user->text_buf[0], sizeof(user->text_buf)); + + logbuf_lock_irq(); +- user->seq = prb_first_valid_seq(prb); ++ atomic64_set(&user->seq, prb_first_valid_seq(prb)); + logbuf_unlock_irq(); + + file->private_data = user; +-- +2.43.0 + diff --git a/debian/patches-rt/0089-printk-add-syslog_lock.patch b/debian/patches-rt/0089-printk-add-syslog_lock.patch new file mode 100644 index 000000000..42e77e5ab --- /dev/null +++ b/debian/patches-rt/0089-printk-add-syslog_lock.patch @@ -0,0 +1,159 @@ +From 5521ed0e06ffc572d9f0ebf5421ab23288899804 Mon Sep 17 00:00:00 2001 +From: John Ogness <john.ogness@linutronix.de> +Date: Thu, 10 Dec 2020 16:58:02 +0106 +Subject: [PATCH 089/323] printk: add syslog_lock +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +The global variables @syslog_seq, @syslog_partial, @syslog_time +and write access to @clear_seq are protected by @logbuf_lock. +Once @logbuf_lock is removed, these variables will need their +own synchronization method. Introduce @syslog_lock for this +purpose. + +@syslog_lock is a raw_spin_lock for now. This simplifies the +transition to removing @logbuf_lock. Once @logbuf_lock and the +safe buffers are removed, @syslog_lock can change to spin_lock. + +Signed-off-by: John Ogness <john.ogness@linutronix.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + kernel/printk/printk.c | 41 +++++++++++++++++++++++++++++++++++++---- + 1 file changed, 37 insertions(+), 4 deletions(-) + +diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c +index a351ed400c04..986fc9fad210 100644 +--- a/kernel/printk/printk.c ++++ b/kernel/printk/printk.c +@@ -392,8 +392,12 @@ DEFINE_RAW_SPINLOCK(logbuf_lock); + printk_safe_exit_irqrestore(flags); \ + } while (0) + ++/* syslog_lock protects syslog_* variables and write access to clear_seq. */ ++static DEFINE_RAW_SPINLOCK(syslog_lock); ++ + #ifdef CONFIG_PRINTK + DECLARE_WAIT_QUEUE_HEAD(log_wait); ++/* All 3 protected by @syslog_lock. */ + /* the next printk record to read by syslog(READ) or /proc/kmsg */ + static u64 syslog_seq; + static size_t syslog_partial; +@@ -412,7 +416,7 @@ struct latched_seq { + /* + * The next printk record to read after the last 'clear' command. There are + * two copies (updated with seqcount_latch) so that reads can locklessly +- * access a valid value. Writers are synchronized by @logbuf_lock. ++ * access a valid value. Writers are synchronized by @syslog_lock. + */ + static struct latched_seq clear_seq = { + .latch = SEQCNT_LATCH_ZERO(clear_seq.latch), +@@ -472,7 +476,7 @@ bool printk_percpu_data_ready(void) + return __printk_percpu_data_ready; + } + +-/* Must be called under logbuf_lock. */ ++/* Must be called under syslog_lock. */ + static void latched_seq_write(struct latched_seq *ls, u64 val) + { + raw_write_seqcount_latch(&ls->latch); +@@ -1532,7 +1536,9 @@ static int syslog_print(char __user *buf, int size) + size_t skip; + + logbuf_lock_irq(); ++ raw_spin_lock(&syslog_lock); + if (!prb_read_valid(prb, syslog_seq, &r)) { ++ raw_spin_unlock(&syslog_lock); + logbuf_unlock_irq(); + break; + } +@@ -1562,6 +1568,7 @@ static int syslog_print(char __user *buf, int size) + syslog_partial += n; + } else + n = 0; ++ raw_spin_unlock(&syslog_lock); + logbuf_unlock_irq(); + + if (!n) +@@ -1628,8 +1635,11 @@ static int syslog_print_all(char __user *buf, int size, bool clear) + break; + } + +- if (clear) ++ if (clear) { ++ raw_spin_lock(&syslog_lock); + latched_seq_write(&clear_seq, seq); ++ raw_spin_unlock(&syslog_lock); ++ } + logbuf_unlock_irq(); + + kfree(text); +@@ -1639,10 +1649,24 @@ static int syslog_print_all(char __user *buf, int size, bool clear) + static void syslog_clear(void) + { + logbuf_lock_irq(); ++ raw_spin_lock(&syslog_lock); + latched_seq_write(&clear_seq, prb_next_seq(prb)); ++ raw_spin_unlock(&syslog_lock); + logbuf_unlock_irq(); + } + ++/* Return a consistent copy of @syslog_seq. */ ++static u64 read_syslog_seq_irq(void) ++{ ++ u64 seq; ++ ++ raw_spin_lock_irq(&syslog_lock); ++ seq = syslog_seq; ++ raw_spin_unlock_irq(&syslog_lock); ++ ++ return seq; ++} ++ + int do_syslog(int type, char __user *buf, int len, int source) + { + struct printk_info info; +@@ -1666,8 +1690,9 @@ int do_syslog(int type, char __user *buf, int len, int source) + return 0; + if (!access_ok(buf, len)) + return -EFAULT; ++ + error = wait_event_interruptible(log_wait, +- prb_read_valid(prb, syslog_seq, NULL)); ++ prb_read_valid(prb, read_syslog_seq_irq(), NULL)); + if (error) + return error; + error = syslog_print(buf, len); +@@ -1716,8 +1741,10 @@ int do_syslog(int type, char __user *buf, int len, int source) + /* Number of chars in the log buffer */ + case SYSLOG_ACTION_SIZE_UNREAD: + logbuf_lock_irq(); ++ raw_spin_lock(&syslog_lock); + if (!prb_read_valid_info(prb, syslog_seq, &info, NULL)) { + /* No unread messages. */ ++ raw_spin_unlock(&syslog_lock); + logbuf_unlock_irq(); + return 0; + } +@@ -1746,6 +1773,7 @@ int do_syslog(int type, char __user *buf, int len, int source) + } + error -= syslog_partial; + } ++ raw_spin_unlock(&syslog_lock); + logbuf_unlock_irq(); + break; + /* Size of the log buffer */ +@@ -2995,7 +3023,12 @@ void register_console(struct console *newcon) + */ + exclusive_console = newcon; + exclusive_console_stop_seq = console_seq; ++ ++ /* Get a consistent copy of @syslog_seq. */ ++ raw_spin_lock(&syslog_lock); + console_seq = syslog_seq; ++ raw_spin_unlock(&syslog_lock); ++ + logbuf_unlock_irqrestore(flags); + } + console_unlock(); +-- +2.43.0 + diff --git a/debian/patches-rt/0090-printk-introduce-a-kmsg_dump-iterator.patch b/debian/patches-rt/0090-printk-introduce-a-kmsg_dump-iterator.patch new file mode 100644 index 000000000..018c1949d --- /dev/null +++ b/debian/patches-rt/0090-printk-introduce-a-kmsg_dump-iterator.patch @@ -0,0 +1,561 @@ +From 5bb123ebb8e508baa987e9bc17029e2eaa64575b Mon Sep 17 00:00:00 2001 +From: John Ogness <john.ogness@linutronix.de> +Date: Fri, 18 Dec 2020 11:40:08 +0000 +Subject: [PATCH 090/323] printk: introduce a kmsg_dump iterator +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Rather than store the iterator information into the registered +kmsg_dump structure, create a separate iterator structure. The +kmsg_dump_iter structure can reside on the stack of the caller, +thus allowing lockless use of the kmsg_dump functions. + +This is in preparation for removal of @logbuf_lock. + +Signed-off-by: John Ogness <john.ogness@linutronix.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + arch/powerpc/kernel/nvram_64.c | 12 ++-- + arch/powerpc/platforms/powernv/opal-kmsg.c | 3 +- + arch/powerpc/xmon/xmon.c | 6 +- + arch/um/kernel/kmsg_dump.c | 5 +- + drivers/hv/vmbus_drv.c | 5 +- + drivers/mtd/mtdoops.c | 5 +- + fs/pstore/platform.c | 5 +- + include/linux/kmsg_dump.h | 43 +++++++------- + kernel/debug/kdb/kdb_main.c | 10 ++-- + kernel/printk/printk.c | 65 +++++++++++----------- + 10 files changed, 84 insertions(+), 75 deletions(-) + +diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c +index 532f22637783..1ef55f4b389a 100644 +--- a/arch/powerpc/kernel/nvram_64.c ++++ b/arch/powerpc/kernel/nvram_64.c +@@ -73,7 +73,8 @@ static const char *nvram_os_partitions[] = { + }; + + static void oops_to_nvram(struct kmsg_dumper *dumper, +- enum kmsg_dump_reason reason); ++ enum kmsg_dump_reason reason, ++ struct kmsg_dumper_iter *iter); + + static struct kmsg_dumper nvram_kmsg_dumper = { + .dump = oops_to_nvram +@@ -643,7 +644,8 @@ void __init nvram_init_oops_partition(int rtas_partition_exists) + * partition. If that's too much, go back and capture uncompressed text. + */ + static void oops_to_nvram(struct kmsg_dumper *dumper, +- enum kmsg_dump_reason reason) ++ enum kmsg_dump_reason reason, ++ struct kmsg_dumper_iter *iter) + { + struct oops_log_info *oops_hdr = (struct oops_log_info *)oops_buf; + static unsigned int oops_count = 0; +@@ -681,13 +683,13 @@ static void oops_to_nvram(struct kmsg_dumper *dumper, + return; + + if (big_oops_buf) { +- kmsg_dump_get_buffer(dumper, false, ++ kmsg_dump_get_buffer(iter, false, + big_oops_buf, big_oops_buf_sz, &text_len); + rc = zip_oops(text_len); + } + if (rc != 0) { +- kmsg_dump_rewind(dumper); +- kmsg_dump_get_buffer(dumper, false, ++ kmsg_dump_rewind(iter); ++ kmsg_dump_get_buffer(iter, false, + oops_data, oops_data_sz, &text_len); + err_type = ERR_TYPE_KERNEL_PANIC; + oops_hdr->version = cpu_to_be16(OOPS_HDR_VERSION); +diff --git a/arch/powerpc/platforms/powernv/opal-kmsg.c b/arch/powerpc/platforms/powernv/opal-kmsg.c +index 6c3bc4b4da98..ec862846bc82 100644 +--- a/arch/powerpc/platforms/powernv/opal-kmsg.c ++++ b/arch/powerpc/platforms/powernv/opal-kmsg.c +@@ -20,7 +20,8 @@ + * message, it just ensures that OPAL completely flushes the console buffer. + */ + static void kmsg_dump_opal_console_flush(struct kmsg_dumper *dumper, +- enum kmsg_dump_reason reason) ++ enum kmsg_dump_reason reason, ++ struct kmsg_dumper_iter *iter) + { + /* + * Outside of a panic context the pollers will continue to run, +diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c +index 3de2adc0a807..9f62cca6c594 100644 +--- a/arch/powerpc/xmon/xmon.c ++++ b/arch/powerpc/xmon/xmon.c +@@ -3003,7 +3003,7 @@ print_address(unsigned long addr) + static void + dump_log_buf(void) + { +- struct kmsg_dumper dumper = { .active = 1 }; ++ struct kmsg_dumper_iter iter = { .active = 1 }; + unsigned char buf[128]; + size_t len; + +@@ -3015,9 +3015,9 @@ dump_log_buf(void) + catch_memory_errors = 1; + sync(); + +- kmsg_dump_rewind_nolock(&dumper); ++ kmsg_dump_rewind_nolock(&iter); + xmon_start_pagination(); +- while (kmsg_dump_get_line_nolock(&dumper, false, buf, sizeof(buf), &len)) { ++ while (kmsg_dump_get_line_nolock(&iter, false, buf, sizeof(buf), &len)) { + buf[len] = '\0'; + printf("%s", buf); + } +diff --git a/arch/um/kernel/kmsg_dump.c b/arch/um/kernel/kmsg_dump.c +index e4abac6c9727..f38349ad00ea 100644 +--- a/arch/um/kernel/kmsg_dump.c ++++ b/arch/um/kernel/kmsg_dump.c +@@ -6,7 +6,8 @@ + #include <os.h> + + static void kmsg_dumper_stdout(struct kmsg_dumper *dumper, +- enum kmsg_dump_reason reason) ++ enum kmsg_dump_reason reason, ++ struct kmsg_dumper_iter *iter) + { + static char line[1024]; + struct console *con; +@@ -25,7 +26,7 @@ static void kmsg_dumper_stdout(struct kmsg_dumper *dumper, + return; + + printf("kmsg_dump:\n"); +- while (kmsg_dump_get_line(dumper, true, line, sizeof(line), &len)) { ++ while (kmsg_dump_get_line(iter, true, line, sizeof(line), &len)) { + line[len] = '\0'; + printf("%s", line); + } +diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c +index e99400f3ae1d..db39c96bbf43 100644 +--- a/drivers/hv/vmbus_drv.c ++++ b/drivers/hv/vmbus_drv.c +@@ -1359,7 +1359,8 @@ static void vmbus_isr(void) + * buffer and call into Hyper-V to transfer the data. + */ + static void hv_kmsg_dump(struct kmsg_dumper *dumper, +- enum kmsg_dump_reason reason) ++ enum kmsg_dump_reason reason, ++ struct kmsg_dumper_iter *iter) + { + size_t bytes_written; + phys_addr_t panic_pa; +@@ -1374,7 +1375,7 @@ static void hv_kmsg_dump(struct kmsg_dumper *dumper, + * Write dump contents to the page. No need to synchronize; panic should + * be single-threaded. + */ +- kmsg_dump_get_buffer(dumper, false, hv_panic_page, HV_HYP_PAGE_SIZE, ++ kmsg_dump_get_buffer(iter, false, hv_panic_page, HV_HYP_PAGE_SIZE, + &bytes_written); + if (bytes_written) + hyperv_report_panic_msg(panic_pa, bytes_written); +diff --git a/drivers/mtd/mtdoops.c b/drivers/mtd/mtdoops.c +index 774970bfcf85..6bc2c728adb7 100644 +--- a/drivers/mtd/mtdoops.c ++++ b/drivers/mtd/mtdoops.c +@@ -267,7 +267,8 @@ static void find_next_position(struct mtdoops_context *cxt) + } + + static void mtdoops_do_dump(struct kmsg_dumper *dumper, +- enum kmsg_dump_reason reason) ++ enum kmsg_dump_reason reason, ++ struct kmsg_dumper_iter *iter) + { + struct mtdoops_context *cxt = container_of(dumper, + struct mtdoops_context, dump); +@@ -276,7 +277,7 @@ static void mtdoops_do_dump(struct kmsg_dumper *dumper, + if (reason == KMSG_DUMP_OOPS && !dump_oops) + return; + +- kmsg_dump_get_buffer(dumper, true, cxt->oops_buf + MTDOOPS_HEADER_SIZE, ++ kmsg_dump_get_buffer(iter, true, cxt->oops_buf + MTDOOPS_HEADER_SIZE, + record_size - MTDOOPS_HEADER_SIZE, NULL); + + if (reason != KMSG_DUMP_OOPS) { +diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c +index d59f13b1fb96..64f3f85d2a5d 100644 +--- a/fs/pstore/platform.c ++++ b/fs/pstore/platform.c +@@ -384,7 +384,8 @@ void pstore_record_init(struct pstore_record *record, + * end of the buffer. + */ + static void pstore_dump(struct kmsg_dumper *dumper, +- enum kmsg_dump_reason reason) ++ enum kmsg_dump_reason reason, ++ struct kmsg_dumper_iter *iter) + { + unsigned long total = 0; + const char *why; +@@ -434,7 +435,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, + dst_size -= header_size; + + /* Write dump contents. */ +- if (!kmsg_dump_get_buffer(dumper, true, dst + header_size, ++ if (!kmsg_dump_get_buffer(iter, true, dst + header_size, + dst_size, &dump_size)) + break; + +diff --git a/include/linux/kmsg_dump.h b/include/linux/kmsg_dump.h +index 4095a34db0fa..2fdb10ab1799 100644 +--- a/include/linux/kmsg_dump.h ++++ b/include/linux/kmsg_dump.h +@@ -29,6 +29,18 @@ enum kmsg_dump_reason { + KMSG_DUMP_MAX + }; + ++/** ++ * struct kmsg_dumper_iter - iterator for kernel crash message dumper ++ * @active: Flag that specifies if this is currently dumping ++ * @cur_seq: Points to the oldest message to dump (private) ++ * @next_seq: Points after the newest message to dump (private) ++ */ ++struct kmsg_dumper_iter { ++ bool active; ++ u64 cur_seq; ++ u64 next_seq; ++}; ++ + /** + * struct kmsg_dumper - kernel crash message dumper structure + * @list: Entry in the dumper list (private) +@@ -36,37 +48,30 @@ enum kmsg_dump_reason { + * through the record iterator + * @max_reason: filter for highest reason number that should be dumped + * @registered: Flag that specifies if this is already registered +- * @active: Flag that specifies if this is currently dumping +- * @cur_seq: Points to the oldest message to dump (private) +- * @next_seq: Points after the newest message to dump (private) + */ + struct kmsg_dumper { + struct list_head list; +- void (*dump)(struct kmsg_dumper *dumper, enum kmsg_dump_reason reason); ++ void (*dump)(struct kmsg_dumper *dumper, enum kmsg_dump_reason reason, ++ struct kmsg_dumper_iter *iter); + enum kmsg_dump_reason max_reason; +- bool active; + bool registered; +- +- /* private state of the kmsg iterator */ +- u64 cur_seq; +- u64 next_seq; + }; + + #ifdef CONFIG_PRINTK + void kmsg_dump(enum kmsg_dump_reason reason); + +-bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog, ++bool kmsg_dump_get_line_nolock(struct kmsg_dumper_iter *iter, bool syslog, + char *line, size_t size, size_t *len); + +-bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, ++bool kmsg_dump_get_line(struct kmsg_dumper_iter *iter, bool syslog, + char *line, size_t size, size_t *len); + +-bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, ++bool kmsg_dump_get_buffer(struct kmsg_dumper_iter *iter, bool syslog, + char *buf, size_t size, size_t *len_out); + +-void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper); ++void kmsg_dump_rewind_nolock(struct kmsg_dumper_iter *iter); + +-void kmsg_dump_rewind(struct kmsg_dumper *dumper); ++void kmsg_dump_rewind(struct kmsg_dumper_iter *dumper_iter); + + int kmsg_dump_register(struct kmsg_dumper *dumper); + +@@ -78,30 +83,30 @@ static inline void kmsg_dump(enum kmsg_dump_reason reason) + { + } + +-static inline bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, ++static inline bool kmsg_dump_get_line_nolock(struct kmsg_dumper_iter *iter, + bool syslog, const char *line, + size_t size, size_t *len) + { + return false; + } + +-static inline bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, ++static inline bool kmsg_dump_get_line(struct kmsg_dumper_iter *iter, bool syslog, + const char *line, size_t size, size_t *len) + { + return false; + } + +-static inline bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, ++static inline bool kmsg_dump_get_buffer(struct kmsg_dumper_iter *iter, bool syslog, + char *buf, size_t size, size_t *len) + { + return false; + } + +-static inline void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper) ++static inline void kmsg_dump_rewind_nolock(struct kmsg_dumper_iter *iter) + { + } + +-static inline void kmsg_dump_rewind(struct kmsg_dumper *dumper) ++static inline void kmsg_dump_rewind(struct kmsg_dumper_iter *iter) + { + } + +diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c +index 4e09fab52faf..048baadd7a41 100644 +--- a/kernel/debug/kdb/kdb_main.c ++++ b/kernel/debug/kdb/kdb_main.c +@@ -2157,7 +2157,7 @@ static int kdb_dmesg(int argc, const char **argv) + int adjust = 0; + int n = 0; + int skip = 0; +- struct kmsg_dumper dumper = { .active = 1 }; ++ struct kmsg_dumper_iter iter = { .active = 1 }; + size_t len; + char buf[201]; + +@@ -2182,8 +2182,8 @@ static int kdb_dmesg(int argc, const char **argv) + kdb_set(2, setargs); + } + +- kmsg_dump_rewind_nolock(&dumper); +- while (kmsg_dump_get_line_nolock(&dumper, 1, NULL, 0, NULL)) ++ kmsg_dump_rewind_nolock(&iter); ++ while (kmsg_dump_get_line_nolock(&iter, 1, NULL, 0, NULL)) + n++; + + if (lines < 0) { +@@ -2215,8 +2215,8 @@ static int kdb_dmesg(int argc, const char **argv) + if (skip >= n || skip < 0) + return 0; + +- kmsg_dump_rewind_nolock(&dumper); +- while (kmsg_dump_get_line_nolock(&dumper, 1, buf, sizeof(buf), &len)) { ++ kmsg_dump_rewind_nolock(&iter); ++ while (kmsg_dump_get_line_nolock(&iter, 1, buf, sizeof(buf), &len)) { + if (skip) { + skip--; + continue; +diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c +index 986fc9fad210..b992be31824f 100644 +--- a/kernel/printk/printk.c ++++ b/kernel/printk/printk.c +@@ -3394,6 +3394,7 @@ EXPORT_SYMBOL_GPL(kmsg_dump_reason_str); + */ + void kmsg_dump(enum kmsg_dump_reason reason) + { ++ struct kmsg_dumper_iter iter; + struct kmsg_dumper *dumper; + unsigned long flags; + +@@ -3413,25 +3414,21 @@ void kmsg_dump(enum kmsg_dump_reason reason) + continue; + + /* initialize iterator with data about the stored records */ +- dumper->active = true; +- ++ iter.active = true; + logbuf_lock_irqsave(flags); +- dumper->cur_seq = latched_seq_read_nolock(&clear_seq); +- dumper->next_seq = prb_next_seq(prb); ++ iter.cur_seq = latched_seq_read_nolock(&clear_seq); ++ iter.next_seq = prb_next_seq(prb); + logbuf_unlock_irqrestore(flags); + + /* invoke dumper which will iterate over records */ +- dumper->dump(dumper, reason); +- +- /* reset iterator */ +- dumper->active = false; ++ dumper->dump(dumper, reason, &iter); + } + rcu_read_unlock(); + } + + /** + * kmsg_dump_get_line_nolock - retrieve one kmsg log line (unlocked version) +- * @dumper: registered kmsg dumper ++ * @iter: kmsg dumper iterator + * @syslog: include the "<4>" prefixes + * @line: buffer to copy the line to + * @size: maximum size of the buffer +@@ -3448,7 +3445,7 @@ void kmsg_dump(enum kmsg_dump_reason reason) + * + * The function is similar to kmsg_dump_get_line(), but grabs no locks. + */ +-bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog, ++bool kmsg_dump_get_line_nolock(struct kmsg_dumper_iter *iter, bool syslog, + char *line, size_t size, size_t *len) + { + struct printk_info info; +@@ -3459,16 +3456,16 @@ bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog, + + prb_rec_init_rd(&r, &info, line, size); + +- if (!dumper->active) ++ if (!iter->active) + goto out; + + /* Read text or count text lines? */ + if (line) { +- if (!prb_read_valid(prb, dumper->cur_seq, &r)) ++ if (!prb_read_valid(prb, iter->cur_seq, &r)) + goto out; + l = record_print_text(&r, syslog, printk_time); + } else { +- if (!prb_read_valid_info(prb, dumper->cur_seq, ++ if (!prb_read_valid_info(prb, iter->cur_seq, + &info, &line_count)) { + goto out; + } +@@ -3477,7 +3474,7 @@ bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog, + + } + +- dumper->cur_seq = r.info->seq + 1; ++ iter->cur_seq = r.info->seq + 1; + ret = true; + out: + if (len) +@@ -3487,7 +3484,7 @@ bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog, + + /** + * kmsg_dump_get_line - retrieve one kmsg log line +- * @dumper: registered kmsg dumper ++ * @iter: kmsg dumper iterator + * @syslog: include the "<4>" prefixes + * @line: buffer to copy the line to + * @size: maximum size of the buffer +@@ -3502,14 +3499,14 @@ bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog, + * A return value of FALSE indicates that there are no more records to + * read. + */ +-bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, ++bool kmsg_dump_get_line(struct kmsg_dumper_iter *iter, bool syslog, + char *line, size_t size, size_t *len) + { + unsigned long flags; + bool ret; + + logbuf_lock_irqsave(flags); +- ret = kmsg_dump_get_line_nolock(dumper, syslog, line, size, len); ++ ret = kmsg_dump_get_line_nolock(iter, syslog, line, size, len); + logbuf_unlock_irqrestore(flags); + + return ret; +@@ -3518,7 +3515,7 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_line); + + /** + * kmsg_dump_get_buffer - copy kmsg log lines +- * @dumper: registered kmsg dumper ++ * @iter: kmsg dumper iterator + * @syslog: include the "<4>" prefixes + * @buf: buffer to copy the line to + * @size: maximum size of the buffer +@@ -3535,7 +3532,7 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_line); + * A return value of FALSE indicates that there are no more records to + * read. + */ +-bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, ++bool kmsg_dump_get_buffer(struct kmsg_dumper_iter *iter, bool syslog, + char *buf, size_t size, size_t *len_out) + { + struct printk_info info; +@@ -3547,19 +3544,19 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, + bool ret = false; + bool time = printk_time; + +- if (!dumper->active || !buf || !size) ++ if (!iter->active || !buf || !size) + goto out; + + logbuf_lock_irqsave(flags); +- if (prb_read_valid_info(prb, dumper->cur_seq, &info, NULL)) { +- if (info.seq != dumper->cur_seq) { ++ if (prb_read_valid_info(prb, iter->cur_seq, &info, NULL)) { ++ if (info.seq != iter->cur_seq) { + /* messages are gone, move to first available one */ +- dumper->cur_seq = info.seq; ++ iter->cur_seq = info.seq; + } + } + + /* last entry */ +- if (dumper->cur_seq >= dumper->next_seq) { ++ if (iter->cur_seq >= iter->next_seq) { + logbuf_unlock_irqrestore(flags); + goto out; + } +@@ -3570,7 +3567,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, + * because this function (by way of record_print_text()) will + * not write more than size-1 bytes of text into @buf. + */ +- seq = find_first_fitting_seq(dumper->cur_seq, dumper->next_seq, ++ seq = find_first_fitting_seq(iter->cur_seq, iter->next_seq, + size - 1, syslog, time); + + /* +@@ -3583,7 +3580,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, + + len = 0; + prb_for_each_record(seq, prb, seq, &r) { +- if (r.info->seq >= dumper->next_seq) ++ if (r.info->seq >= iter->next_seq) + break; + + len += record_print_text(&r, syslog, time); +@@ -3592,7 +3589,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, + prb_rec_init_rd(&r, &info, buf + len, size - len); + } + +- dumper->next_seq = next_seq; ++ iter->next_seq = next_seq; + ret = true; + logbuf_unlock_irqrestore(flags); + out: +@@ -3604,7 +3601,7 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer); + + /** + * kmsg_dump_rewind_nolock - reset the iterator (unlocked version) +- * @dumper: registered kmsg dumper ++ * @iter: kmsg dumper iterator + * + * Reset the dumper's iterator so that kmsg_dump_get_line() and + * kmsg_dump_get_buffer() can be called again and used multiple +@@ -3612,26 +3609,26 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer); + * + * The function is similar to kmsg_dump_rewind(), but grabs no locks. + */ +-void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper) ++void kmsg_dump_rewind_nolock(struct kmsg_dumper_iter *iter) + { +- dumper->cur_seq = latched_seq_read_nolock(&clear_seq); +- dumper->next_seq = prb_next_seq(prb); ++ iter->cur_seq = latched_seq_read_nolock(&clear_seq); ++ iter->next_seq = prb_next_seq(prb); + } + + /** + * kmsg_dump_rewind - reset the iterator +- * @dumper: registered kmsg dumper ++ * @iter: kmsg dumper iterator + * + * Reset the dumper's iterator so that kmsg_dump_get_line() and + * kmsg_dump_get_buffer() can be called again and used multiple + * times within the same dumper.dump() callback. + */ +-void kmsg_dump_rewind(struct kmsg_dumper *dumper) ++void kmsg_dump_rewind(struct kmsg_dumper_iter *iter) + { + unsigned long flags; + + logbuf_lock_irqsave(flags); +- kmsg_dump_rewind_nolock(dumper); ++ kmsg_dump_rewind_nolock(iter); + logbuf_unlock_irqrestore(flags); + } + EXPORT_SYMBOL_GPL(kmsg_dump_rewind); +-- +2.43.0 + diff --git a/debian/patches-rt/0091-um-synchronize-kmsg_dumper.patch b/debian/patches-rt/0091-um-synchronize-kmsg_dumper.patch new file mode 100644 index 000000000..88b6f08ca --- /dev/null +++ b/debian/patches-rt/0091-um-synchronize-kmsg_dumper.patch @@ -0,0 +1,61 @@ +From 7790fa6baa700a285007a6e8ddf99393ba72acc2 Mon Sep 17 00:00:00 2001 +From: John Ogness <john.ogness@linutronix.de> +Date: Mon, 21 Dec 2020 11:10:03 +0106 +Subject: [PATCH 091/323] um: synchronize kmsg_dumper +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +The kmsg_dumper can be called from any context and CPU, possibly +from multiple CPUs simultaneously. Since a static buffer is used +to retrieve the kernel logs, this buffer must be protected against +simultaneous dumping. + +Cc: Richard Weinberger <richard@nod.at> +Signed-off-by: John Ogness <john.ogness@linutronix.de> +Reviewed-by: Petr Mladek <pmladek@suse.com> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + arch/um/kernel/kmsg_dump.c | 8 ++++++++ + 1 file changed, 8 insertions(+) + +diff --git a/arch/um/kernel/kmsg_dump.c b/arch/um/kernel/kmsg_dump.c +index f38349ad00ea..173999422ed8 100644 +--- a/arch/um/kernel/kmsg_dump.c ++++ b/arch/um/kernel/kmsg_dump.c +@@ -1,5 +1,6 @@ + // SPDX-License-Identifier: GPL-2.0 + #include <linux/kmsg_dump.h> ++#include <linux/spinlock.h> + #include <linux/console.h> + #include <shared/init.h> + #include <shared/kern.h> +@@ -9,8 +10,10 @@ static void kmsg_dumper_stdout(struct kmsg_dumper *dumper, + enum kmsg_dump_reason reason, + struct kmsg_dumper_iter *iter) + { ++ static DEFINE_SPINLOCK(lock); + static char line[1024]; + struct console *con; ++ unsigned long flags; + size_t len = 0; + + /* only dump kmsg when no console is available */ +@@ -25,11 +28,16 @@ static void kmsg_dumper_stdout(struct kmsg_dumper *dumper, + if (con) + return; + ++ if (!spin_trylock_irqsave(&lock, flags)) ++ return; ++ + printf("kmsg_dump:\n"); + while (kmsg_dump_get_line(iter, true, line, sizeof(line), &len)) { + line[len] = '\0'; + printf("%s", line); + } ++ ++ spin_unlock_irqrestore(&lock, flags); + } + + static struct kmsg_dumper kmsg_dumper = { +-- +2.43.0 + diff --git a/debian/patches-rt/0092-printk-remove-logbuf_lock.patch b/debian/patches-rt/0092-printk-remove-logbuf_lock.patch new file mode 100644 index 000000000..6f45ee8b3 --- /dev/null +++ b/debian/patches-rt/0092-printk-remove-logbuf_lock.patch @@ -0,0 +1,486 @@ +From 74a8c8a0d6b64df822c5946a582ca3adeec24e63 Mon Sep 17 00:00:00 2001 +From: John Ogness <john.ogness@linutronix.de> +Date: Tue, 26 Jan 2021 17:43:19 +0106 +Subject: [PATCH 092/323] printk: remove logbuf_lock +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Since the ringbuffer is lockless, there is no need for it to be +protected by @logbuf_lock. Remove @logbuf_lock. + +This means that printk_nmi_direct and printk_safe_flush_on_panic() +no longer need to acquire any lock to run. + +@console_seq, @exclusive_console_stop_seq, @console_dropped are +protected by @console_lock. + +Signed-off-by: John Ogness <john.ogness@linutronix.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + kernel/printk/internal.h | 4 +- + kernel/printk/printk.c | 118 ++++++++++++------------------------ + kernel/printk/printk_safe.c | 29 +++------ + 3 files changed, 48 insertions(+), 103 deletions(-) + +diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h +index 3a8fd491758c..e7acc2888c8e 100644 +--- a/kernel/printk/internal.h ++++ b/kernel/printk/internal.h +@@ -12,8 +12,6 @@ + + #define PRINTK_NMI_CONTEXT_OFFSET 0x010000000 + +-extern raw_spinlock_t logbuf_lock; +- + __printf(4, 0) + int vprintk_store(int facility, int level, + const struct dev_printk_info *dev_info, +@@ -59,7 +57,7 @@ void defer_console_output(void); + __printf(1, 0) int vprintk_func(const char *fmt, va_list args) { return 0; } + + /* +- * In !PRINTK builds we still export logbuf_lock spin_lock, console_sem ++ * In !PRINTK builds we still export console_sem + * semaphore and some of console functions (console_unlock()/etc.), so + * printk-safe must preserve the existing local IRQ guarantees. + */ +diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c +index b992be31824f..dde1696d7564 100644 +--- a/kernel/printk/printk.c ++++ b/kernel/printk/printk.c +@@ -357,41 +357,6 @@ enum log_flags { + LOG_CONT = 8, /* text is a fragment of a continuation line */ + }; + +-/* +- * The logbuf_lock protects kmsg buffer, indices, counters. This can be taken +- * within the scheduler's rq lock. It must be released before calling +- * console_unlock() or anything else that might wake up a process. +- */ +-DEFINE_RAW_SPINLOCK(logbuf_lock); +- +-/* +- * Helper macros to lock/unlock logbuf_lock and switch between +- * printk-safe/unsafe modes. +- */ +-#define logbuf_lock_irq() \ +- do { \ +- printk_safe_enter_irq(); \ +- raw_spin_lock(&logbuf_lock); \ +- } while (0) +- +-#define logbuf_unlock_irq() \ +- do { \ +- raw_spin_unlock(&logbuf_lock); \ +- printk_safe_exit_irq(); \ +- } while (0) +- +-#define logbuf_lock_irqsave(flags) \ +- do { \ +- printk_safe_enter_irqsave(flags); \ +- raw_spin_lock(&logbuf_lock); \ +- } while (0) +- +-#define logbuf_unlock_irqrestore(flags) \ +- do { \ +- raw_spin_unlock(&logbuf_lock); \ +- printk_safe_exit_irqrestore(flags); \ +- } while (0) +- + /* syslog_lock protects syslog_* variables and write access to clear_seq. */ + static DEFINE_RAW_SPINLOCK(syslog_lock); + +@@ -403,6 +368,7 @@ static u64 syslog_seq; + static size_t syslog_partial; + static bool syslog_time; + ++/* All 3 protected by @console_sem. */ + /* the next printk record to write to the console */ + static u64 console_seq; + static u64 exclusive_console_stop_seq; +@@ -769,27 +735,27 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, + if (ret) + return ret; + +- logbuf_lock_irq(); ++ printk_safe_enter_irq(); + if (!prb_read_valid(prb, atomic64_read(&user->seq), r)) { + if (file->f_flags & O_NONBLOCK) { + ret = -EAGAIN; +- logbuf_unlock_irq(); ++ printk_safe_exit_irq(); + goto out; + } + +- logbuf_unlock_irq(); ++ printk_safe_exit_irq(); + ret = wait_event_interruptible(log_wait, + prb_read_valid(prb, atomic64_read(&user->seq), r)); + if (ret) + goto out; +- logbuf_lock_irq(); ++ printk_safe_enter_irq(); + } + + if (r->info->seq != atomic64_read(&user->seq)) { + /* our last seen message is gone, return error and reset */ + atomic64_set(&user->seq, r->info->seq); + ret = -EPIPE; +- logbuf_unlock_irq(); ++ printk_safe_exit_irq(); + goto out; + } + +@@ -799,7 +765,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, + &r->info->dev_info); + + atomic64_set(&user->seq, r->info->seq + 1); +- logbuf_unlock_irq(); ++ printk_safe_exit_irq(); + + if (len > count) { + ret = -EINVAL; +@@ -834,7 +800,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) + if (offset) + return -ESPIPE; + +- logbuf_lock_irq(); ++ printk_safe_enter_irq(); + switch (whence) { + case SEEK_SET: + /* the first record */ +@@ -855,7 +821,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) + default: + ret = -EINVAL; + } +- logbuf_unlock_irq(); ++ printk_safe_exit_irq(); + return ret; + } + +@@ -870,15 +836,15 @@ static __poll_t devkmsg_poll(struct file *file, poll_table *wait) + + poll_wait(file, &log_wait, wait); + +- logbuf_lock_irq(); +- if (prb_read_valid_info(prb, user->seq, &info, NULL)) { ++ printk_safe_enter_irq(); ++ if (prb_read_valid_info(prb, atomic64_read(&user->seq), &info, NULL)) { + /* return error when data has vanished underneath us */ + if (info.seq != atomic64_read(&user->seq)) + ret = EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI; + else + ret = EPOLLIN|EPOLLRDNORM; + } +- logbuf_unlock_irq(); ++ printk_safe_exit_irq(); + + return ret; + } +@@ -911,9 +877,9 @@ static int devkmsg_open(struct inode *inode, struct file *file) + prb_rec_init_rd(&user->record, &user->info, + &user->text_buf[0], sizeof(user->text_buf)); + +- logbuf_lock_irq(); ++ printk_safe_enter_irq(); + atomic64_set(&user->seq, prb_first_valid_seq(prb)); +- logbuf_unlock_irq(); ++ printk_safe_exit_irq(); + + file->private_data = user; + return 0; +@@ -1535,11 +1501,11 @@ static int syslog_print(char __user *buf, int size) + size_t n; + size_t skip; + +- logbuf_lock_irq(); ++ printk_safe_enter_irq(); + raw_spin_lock(&syslog_lock); + if (!prb_read_valid(prb, syslog_seq, &r)) { + raw_spin_unlock(&syslog_lock); +- logbuf_unlock_irq(); ++ printk_safe_exit_irq(); + break; + } + if (r.info->seq != syslog_seq) { +@@ -1569,7 +1535,7 @@ static int syslog_print(char __user *buf, int size) + } else + n = 0; + raw_spin_unlock(&syslog_lock); +- logbuf_unlock_irq(); ++ printk_safe_exit_irq(); + + if (!n) + break; +@@ -1603,7 +1569,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) + return -ENOMEM; + + time = printk_time; +- logbuf_lock_irq(); ++ printk_safe_enter_irq(); + /* + * Find first record that fits, including all following records, + * into the user-provided buffer for this dump. +@@ -1624,12 +1590,12 @@ static int syslog_print_all(char __user *buf, int size, bool clear) + break; + } + +- logbuf_unlock_irq(); ++ printk_safe_exit_irq(); + if (copy_to_user(buf + len, text, textlen)) + len = -EFAULT; + else + len += textlen; +- logbuf_lock_irq(); ++ printk_safe_enter_irq(); + + if (len < 0) + break; +@@ -1640,7 +1606,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) + latched_seq_write(&clear_seq, seq); + raw_spin_unlock(&syslog_lock); + } +- logbuf_unlock_irq(); ++ printk_safe_exit_irq(); + + kfree(text); + return len; +@@ -1648,11 +1614,11 @@ static int syslog_print_all(char __user *buf, int size, bool clear) + + static void syslog_clear(void) + { +- logbuf_lock_irq(); ++ printk_safe_enter_irq(); + raw_spin_lock(&syslog_lock); + latched_seq_write(&clear_seq, prb_next_seq(prb)); + raw_spin_unlock(&syslog_lock); +- logbuf_unlock_irq(); ++ printk_safe_exit_irq(); + } + + /* Return a consistent copy of @syslog_seq. */ +@@ -1740,12 +1706,12 @@ int do_syslog(int type, char __user *buf, int len, int source) + break; + /* Number of chars in the log buffer */ + case SYSLOG_ACTION_SIZE_UNREAD: +- logbuf_lock_irq(); ++ printk_safe_enter_irq(); + raw_spin_lock(&syslog_lock); + if (!prb_read_valid_info(prb, syslog_seq, &info, NULL)) { + /* No unread messages. */ + raw_spin_unlock(&syslog_lock); +- logbuf_unlock_irq(); ++ printk_safe_exit_irq(); + return 0; + } + if (info.seq != syslog_seq) { +@@ -1774,7 +1740,7 @@ int do_syslog(int type, char __user *buf, int len, int source) + error -= syslog_partial; + } + raw_spin_unlock(&syslog_lock); +- logbuf_unlock_irq(); ++ printk_safe_exit_irq(); + break; + /* Size of the log buffer */ + case SYSLOG_ACTION_SIZE_BUFFER: +@@ -2630,7 +2596,6 @@ void console_unlock(void) + size_t len; + + printk_safe_enter_irqsave(flags); +- raw_spin_lock(&logbuf_lock); + skip: + if (!prb_read_valid(prb, console_seq, &r)) + break; +@@ -2674,7 +2639,6 @@ void console_unlock(void) + console_msg_format & MSG_FORMAT_SYSLOG, + printk_time); + console_seq++; +- raw_spin_unlock(&logbuf_lock); + + /* + * While actively printing out messages, if another printk() +@@ -2701,8 +2665,6 @@ void console_unlock(void) + + console_locked = 0; + +- raw_spin_unlock(&logbuf_lock); +- + up_console_sem(); + + /* +@@ -2711,9 +2673,7 @@ void console_unlock(void) + * there's a new owner and the console_unlock() from them will do the + * flush, no worries. + */ +- raw_spin_lock(&logbuf_lock); + retry = prb_read_valid(prb, console_seq, NULL); +- raw_spin_unlock(&logbuf_lock); + printk_safe_exit_irqrestore(flags); + + if (retry && console_trylock()) +@@ -2780,9 +2740,9 @@ void console_flush_on_panic(enum con_flush_mode mode) + if (mode == CONSOLE_REPLAY_ALL) { + unsigned long flags; + +- logbuf_lock_irqsave(flags); ++ printk_safe_enter_irqsave(flags); + console_seq = prb_first_valid_seq(prb); +- logbuf_unlock_irqrestore(flags); ++ printk_safe_exit_irqrestore(flags); + } + console_unlock(); + } +@@ -3011,7 +2971,7 @@ void register_console(struct console *newcon) + * console_unlock(); will print out the buffered messages + * for us. + */ +- logbuf_lock_irqsave(flags); ++ printk_safe_enter_irqsave(flags); + /* + * We're about to replay the log buffer. Only do this to the + * just-registered console to avoid excessive message spam to +@@ -3029,7 +2989,7 @@ void register_console(struct console *newcon) + console_seq = syslog_seq; + raw_spin_unlock(&syslog_lock); + +- logbuf_unlock_irqrestore(flags); ++ printk_safe_exit_irqrestore(flags); + } + console_unlock(); + console_sysfs_notify(); +@@ -3415,10 +3375,10 @@ void kmsg_dump(enum kmsg_dump_reason reason) + + /* initialize iterator with data about the stored records */ + iter.active = true; +- logbuf_lock_irqsave(flags); ++ printk_safe_enter_irqsave(flags); + iter.cur_seq = latched_seq_read_nolock(&clear_seq); + iter.next_seq = prb_next_seq(prb); +- logbuf_unlock_irqrestore(flags); ++ printk_safe_exit_irqrestore(flags); + + /* invoke dumper which will iterate over records */ + dumper->dump(dumper, reason, &iter); +@@ -3505,9 +3465,9 @@ bool kmsg_dump_get_line(struct kmsg_dumper_iter *iter, bool syslog, + unsigned long flags; + bool ret; + +- logbuf_lock_irqsave(flags); ++ printk_safe_enter_irqsave(flags); + ret = kmsg_dump_get_line_nolock(iter, syslog, line, size, len); +- logbuf_unlock_irqrestore(flags); ++ printk_safe_exit_irqrestore(flags); + + return ret; + } +@@ -3547,7 +3507,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper_iter *iter, bool syslog, + if (!iter->active || !buf || !size) + goto out; + +- logbuf_lock_irqsave(flags); ++ printk_safe_enter_irqsave(flags); + if (prb_read_valid_info(prb, iter->cur_seq, &info, NULL)) { + if (info.seq != iter->cur_seq) { + /* messages are gone, move to first available one */ +@@ -3557,7 +3517,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper_iter *iter, bool syslog, + + /* last entry */ + if (iter->cur_seq >= iter->next_seq) { +- logbuf_unlock_irqrestore(flags); ++ printk_safe_exit_irqrestore(flags); + goto out; + } + +@@ -3591,7 +3551,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper_iter *iter, bool syslog, + + iter->next_seq = next_seq; + ret = true; +- logbuf_unlock_irqrestore(flags); ++ printk_safe_exit_irqrestore(flags); + out: + if (len_out) + *len_out = len; +@@ -3627,9 +3587,9 @@ void kmsg_dump_rewind(struct kmsg_dumper_iter *iter) + { + unsigned long flags; + +- logbuf_lock_irqsave(flags); ++ printk_safe_enter_irqsave(flags); + kmsg_dump_rewind_nolock(iter); +- logbuf_unlock_irqrestore(flags); ++ printk_safe_exit_irqrestore(flags); + } + EXPORT_SYMBOL_GPL(kmsg_dump_rewind); + +diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c +index 2e9e3ed7d63e..7df8a88d4115 100644 +--- a/kernel/printk/printk_safe.c ++++ b/kernel/printk/printk_safe.c +@@ -16,7 +16,7 @@ + #include "internal.h" + + /* +- * printk() could not take logbuf_lock in NMI context. Instead, ++ * In NMI and safe mode, printk() avoids taking locks. Instead, + * it uses an alternative implementation that temporary stores + * the strings into a per-CPU buffer. The content of the buffer + * is later flushed into the main ring buffer via IRQ work. +@@ -266,18 +266,6 @@ void printk_safe_flush(void) + */ + void printk_safe_flush_on_panic(void) + { +- /* +- * Make sure that we could access the main ring buffer. +- * Do not risk a double release when more CPUs are up. +- */ +- if (raw_spin_is_locked(&logbuf_lock)) { +- if (num_online_cpus() > 1) +- return; +- +- debug_locks_off(); +- raw_spin_lock_init(&logbuf_lock); +- } +- + if (raw_spin_is_locked(&safe_read_lock)) { + if (num_online_cpus() > 1) + return; +@@ -319,9 +307,7 @@ void noinstr printk_nmi_exit(void) + * reordering. + * + * It has effect only when called in NMI context. Then printk() +- * will try to store the messages into the main logbuf directly +- * and use the per-CPU buffers only as a fallback when the lock +- * is not available. ++ * will store the messages into the main logbuf directly. + */ + void printk_nmi_direct_enter(void) + { +@@ -376,20 +362,21 @@ __printf(1, 0) int vprintk_func(const char *fmt, va_list args) + #endif + + /* +- * Try to use the main logbuf even in NMI. But avoid calling console ++ * Use the main logbuf even in NMI. But avoid calling console + * drivers that might have their own locks. + */ +- if ((this_cpu_read(printk_context) & PRINTK_NMI_DIRECT_CONTEXT_MASK) && +- raw_spin_trylock(&logbuf_lock)) { ++ if ((this_cpu_read(printk_context) & PRINTK_NMI_DIRECT_CONTEXT_MASK)) { ++ unsigned long flags; + int len; + ++ printk_safe_enter_irqsave(flags); + len = vprintk_store(0, LOGLEVEL_DEFAULT, NULL, fmt, args); +- raw_spin_unlock(&logbuf_lock); ++ printk_safe_exit_irqrestore(flags); + defer_console_output(); + return len; + } + +- /* Use extra buffer in NMI when logbuf_lock is taken or in safe mode. */ ++ /* Use extra buffer in NMI. */ + if (this_cpu_read(printk_context) & PRINTK_NMI_CONTEXT_MASK) + return vprintk_nmi(fmt, args); + +-- +2.43.0 + diff --git a/debian/patches-rt/0093-printk-kmsg_dump-remove-_nolock-variants.patch b/debian/patches-rt/0093-printk-kmsg_dump-remove-_nolock-variants.patch new file mode 100644 index 000000000..6ad034c1a --- /dev/null +++ b/debian/patches-rt/0093-printk-kmsg_dump-remove-_nolock-variants.patch @@ -0,0 +1,226 @@ +From 171ee108014d67150846f1b8f978921cefc7b1e3 Mon Sep 17 00:00:00 2001 +From: John Ogness <john.ogness@linutronix.de> +Date: Mon, 21 Dec 2020 10:27:58 +0106 +Subject: [PATCH 093/323] printk: kmsg_dump: remove _nolock() variants +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +kmsg_dump_rewind() and kmsg_dump_get_line() are lockless, so there is +no need for _nolock() variants. Remove these functions and switch all +callers of the _nolock() variants. + +The functions without _nolock() were chosen because they are already +exported to kernel modules. + +Signed-off-by: John Ogness <john.ogness@linutronix.de> +--- + arch/powerpc/xmon/xmon.c | 4 +-- + include/linux/kmsg_dump.h | 18 +---------- + kernel/debug/kdb/kdb_main.c | 8 ++--- + kernel/printk/printk.c | 60 +++++-------------------------------- + 4 files changed, 15 insertions(+), 75 deletions(-) + +diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c +index 9f62cca6c594..2eef587bf51d 100644 +--- a/arch/powerpc/xmon/xmon.c ++++ b/arch/powerpc/xmon/xmon.c +@@ -3015,9 +3015,9 @@ dump_log_buf(void) + catch_memory_errors = 1; + sync(); + +- kmsg_dump_rewind_nolock(&iter); ++ kmsg_dump_rewind(&iter); + xmon_start_pagination(); +- while (kmsg_dump_get_line_nolock(&iter, false, buf, sizeof(buf), &len)) { ++ while (kmsg_dump_get_line(&iter, false, buf, sizeof(buf), &len)) { + buf[len] = '\0'; + printf("%s", buf); + } +diff --git a/include/linux/kmsg_dump.h b/include/linux/kmsg_dump.h +index 2fdb10ab1799..86673930c8ea 100644 +--- a/include/linux/kmsg_dump.h ++++ b/include/linux/kmsg_dump.h +@@ -60,18 +60,13 @@ struct kmsg_dumper { + #ifdef CONFIG_PRINTK + void kmsg_dump(enum kmsg_dump_reason reason); + +-bool kmsg_dump_get_line_nolock(struct kmsg_dumper_iter *iter, bool syslog, +- char *line, size_t size, size_t *len); +- + bool kmsg_dump_get_line(struct kmsg_dumper_iter *iter, bool syslog, + char *line, size_t size, size_t *len); + + bool kmsg_dump_get_buffer(struct kmsg_dumper_iter *iter, bool syslog, + char *buf, size_t size, size_t *len_out); + +-void kmsg_dump_rewind_nolock(struct kmsg_dumper_iter *iter); +- +-void kmsg_dump_rewind(struct kmsg_dumper_iter *dumper_iter); ++void kmsg_dump_rewind(struct kmsg_dumper_iter *iter); + + int kmsg_dump_register(struct kmsg_dumper *dumper); + +@@ -83,13 +78,6 @@ static inline void kmsg_dump(enum kmsg_dump_reason reason) + { + } + +-static inline bool kmsg_dump_get_line_nolock(struct kmsg_dumper_iter *iter, +- bool syslog, const char *line, +- size_t size, size_t *len) +-{ +- return false; +-} +- + static inline bool kmsg_dump_get_line(struct kmsg_dumper_iter *iter, bool syslog, + const char *line, size_t size, size_t *len) + { +@@ -102,10 +90,6 @@ static inline bool kmsg_dump_get_buffer(struct kmsg_dumper_iter *iter, bool sysl + return false; + } + +-static inline void kmsg_dump_rewind_nolock(struct kmsg_dumper_iter *iter) +-{ +-} +- + static inline void kmsg_dump_rewind(struct kmsg_dumper_iter *iter) + { + } +diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c +index 048baadd7a41..1f5c577b926e 100644 +--- a/kernel/debug/kdb/kdb_main.c ++++ b/kernel/debug/kdb/kdb_main.c +@@ -2182,8 +2182,8 @@ static int kdb_dmesg(int argc, const char **argv) + kdb_set(2, setargs); + } + +- kmsg_dump_rewind_nolock(&iter); +- while (kmsg_dump_get_line_nolock(&iter, 1, NULL, 0, NULL)) ++ kmsg_dump_rewind(&iter); ++ while (kmsg_dump_get_line(&iter, 1, NULL, 0, NULL)) + n++; + + if (lines < 0) { +@@ -2215,8 +2215,8 @@ static int kdb_dmesg(int argc, const char **argv) + if (skip >= n || skip < 0) + return 0; + +- kmsg_dump_rewind_nolock(&iter); +- while (kmsg_dump_get_line_nolock(&iter, 1, buf, sizeof(buf), &len)) { ++ kmsg_dump_rewind(&iter); ++ while (kmsg_dump_get_line(&iter, 1, buf, sizeof(buf), &len)) { + if (skip) { + skip--; + continue; +diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c +index dde1696d7564..031ce3f00141 100644 +--- a/kernel/printk/printk.c ++++ b/kernel/printk/printk.c +@@ -3387,7 +3387,7 @@ void kmsg_dump(enum kmsg_dump_reason reason) + } + + /** +- * kmsg_dump_get_line_nolock - retrieve one kmsg log line (unlocked version) ++ * kmsg_dump_get_line - retrieve one kmsg log line + * @iter: kmsg dumper iterator + * @syslog: include the "<4>" prefixes + * @line: buffer to copy the line to +@@ -3402,18 +3402,18 @@ void kmsg_dump(enum kmsg_dump_reason reason) + * + * A return value of FALSE indicates that there are no more records to + * read. +- * +- * The function is similar to kmsg_dump_get_line(), but grabs no locks. + */ +-bool kmsg_dump_get_line_nolock(struct kmsg_dumper_iter *iter, bool syslog, +- char *line, size_t size, size_t *len) ++bool kmsg_dump_get_line(struct kmsg_dumper_iter *iter, bool syslog, ++ char *line, size_t size, size_t *len) + { + struct printk_info info; + unsigned int line_count; + struct printk_record r; ++ unsigned long flags; + size_t l = 0; + bool ret = false; + ++ printk_safe_enter_irqsave(flags); + prb_rec_init_rd(&r, &info, line, size); + + if (!iter->active) +@@ -3437,40 +3437,11 @@ bool kmsg_dump_get_line_nolock(struct kmsg_dumper_iter *iter, bool syslog, + iter->cur_seq = r.info->seq + 1; + ret = true; + out: ++ printk_safe_exit_irqrestore(flags); + if (len) + *len = l; + return ret; + } +- +-/** +- * kmsg_dump_get_line - retrieve one kmsg log line +- * @iter: kmsg dumper iterator +- * @syslog: include the "<4>" prefixes +- * @line: buffer to copy the line to +- * @size: maximum size of the buffer +- * @len: length of line placed into buffer +- * +- * Start at the beginning of the kmsg buffer, with the oldest kmsg +- * record, and copy one record into the provided buffer. +- * +- * Consecutive calls will return the next available record moving +- * towards the end of the buffer with the youngest messages. +- * +- * A return value of FALSE indicates that there are no more records to +- * read. +- */ +-bool kmsg_dump_get_line(struct kmsg_dumper_iter *iter, bool syslog, +- char *line, size_t size, size_t *len) +-{ +- unsigned long flags; +- bool ret; +- +- printk_safe_enter_irqsave(flags); +- ret = kmsg_dump_get_line_nolock(iter, syslog, line, size, len); +- printk_safe_exit_irqrestore(flags); +- +- return ret; +-} + EXPORT_SYMBOL_GPL(kmsg_dump_get_line); + + /** +@@ -3559,22 +3530,6 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper_iter *iter, bool syslog, + } + EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer); + +-/** +- * kmsg_dump_rewind_nolock - reset the iterator (unlocked version) +- * @iter: kmsg dumper iterator +- * +- * Reset the dumper's iterator so that kmsg_dump_get_line() and +- * kmsg_dump_get_buffer() can be called again and used multiple +- * times within the same dumper.dump() callback. +- * +- * The function is similar to kmsg_dump_rewind(), but grabs no locks. +- */ +-void kmsg_dump_rewind_nolock(struct kmsg_dumper_iter *iter) +-{ +- iter->cur_seq = latched_seq_read_nolock(&clear_seq); +- iter->next_seq = prb_next_seq(prb); +-} +- + /** + * kmsg_dump_rewind - reset the iterator + * @iter: kmsg dumper iterator +@@ -3588,7 +3543,8 @@ void kmsg_dump_rewind(struct kmsg_dumper_iter *iter) + unsigned long flags; + + printk_safe_enter_irqsave(flags); +- kmsg_dump_rewind_nolock(iter); ++ iter->cur_seq = latched_seq_read_nolock(&clear_seq); ++ iter->next_seq = prb_next_seq(prb); + printk_safe_exit_irqrestore(flags); + } + EXPORT_SYMBOL_GPL(kmsg_dump_rewind); +-- +2.43.0 + diff --git a/debian/patches-rt/0094-printk-kmsg_dump-use-kmsg_dump_rewind.patch b/debian/patches-rt/0094-printk-kmsg_dump-use-kmsg_dump_rewind.patch new file mode 100644 index 000000000..7ff974d08 --- /dev/null +++ b/debian/patches-rt/0094-printk-kmsg_dump-use-kmsg_dump_rewind.patch @@ -0,0 +1,42 @@ +From 66a45e662dc7ecb7e9b519d3475c7893c0c1dc60 Mon Sep 17 00:00:00 2001 +From: John Ogness <john.ogness@linutronix.de> +Date: Wed, 17 Feb 2021 18:23:16 +0100 +Subject: [PATCH 094/323] printk: kmsg_dump: use kmsg_dump_rewind +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +kmsg_dump() is open coding the kmsg_dump_rewind(). Call +kmsg_dump_rewind() instead. + +Signed-off-by: John Ogness <john.ogness@linutronix.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + kernel/printk/printk.c | 6 +----- + 1 file changed, 1 insertion(+), 5 deletions(-) + +diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c +index 031ce3f00141..c553b235d068 100644 +--- a/kernel/printk/printk.c ++++ b/kernel/printk/printk.c +@@ -3356,7 +3356,6 @@ void kmsg_dump(enum kmsg_dump_reason reason) + { + struct kmsg_dumper_iter iter; + struct kmsg_dumper *dumper; +- unsigned long flags; + + rcu_read_lock(); + list_for_each_entry_rcu(dumper, &dump_list, list) { +@@ -3375,10 +3374,7 @@ void kmsg_dump(enum kmsg_dump_reason reason) + + /* initialize iterator with data about the stored records */ + iter.active = true; +- printk_safe_enter_irqsave(flags); +- iter.cur_seq = latched_seq_read_nolock(&clear_seq); +- iter.next_seq = prb_next_seq(prb); +- printk_safe_exit_irqrestore(flags); ++ kmsg_dump_rewind(&iter); + + /* invoke dumper which will iterate over records */ + dumper->dump(dumper, reason, &iter); +-- +2.43.0 + diff --git a/debian/patches-rt/0095-printk-console-remove-unnecessary-safe-buffer-usage.patch b/debian/patches-rt/0095-printk-console-remove-unnecessary-safe-buffer-usage.patch new file mode 100644 index 000000000..2b93c47dd --- /dev/null +++ b/debian/patches-rt/0095-printk-console-remove-unnecessary-safe-buffer-usage.patch @@ -0,0 +1,48 @@ +From 259c5c359d06d8bbbddebc6cc12fc2abc6bb2783 Mon Sep 17 00:00:00 2001 +From: John Ogness <john.ogness@linutronix.de> +Date: Wed, 17 Feb 2021 18:28:05 +0100 +Subject: [PATCH 095/323] printk: console: remove unnecessary safe buffer usage +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Upon registering a console, safe buffers are activated when setting +up the sequence number to replay the log. However, these are already +protected by @console_sem and @syslog_lock. Remove the unnecessary +safe buffer usage. + +Signed-off-by: John Ogness <john.ogness@linutronix.de> +--- + kernel/printk/printk.c | 10 +++------- + 1 file changed, 3 insertions(+), 7 deletions(-) + +diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c +index c553b235d068..aebc9e31b36a 100644 +--- a/kernel/printk/printk.c ++++ b/kernel/printk/printk.c +@@ -2970,9 +2970,7 @@ void register_console(struct console *newcon) + /* + * console_unlock(); will print out the buffered messages + * for us. +- */ +- printk_safe_enter_irqsave(flags); +- /* ++ * + * We're about to replay the log buffer. Only do this to the + * just-registered console to avoid excessive message spam to + * the already-registered consoles. +@@ -2985,11 +2983,9 @@ void register_console(struct console *newcon) + exclusive_console_stop_seq = console_seq; + + /* Get a consistent copy of @syslog_seq. */ +- raw_spin_lock(&syslog_lock); ++ raw_spin_lock_irqsave(&syslog_lock, flags); + console_seq = syslog_seq; +- raw_spin_unlock(&syslog_lock); +- +- printk_safe_exit_irqrestore(flags); ++ raw_spin_unlock_irqrestore(&syslog_lock, flags); + } + console_unlock(); + console_sysfs_notify(); +-- +2.43.0 + diff --git a/debian/patches-rt/0096-printk-track-limit-recursion.patch b/debian/patches-rt/0096-printk-track-limit-recursion.patch new file mode 100644 index 000000000..ee26831be --- /dev/null +++ b/debian/patches-rt/0096-printk-track-limit-recursion.patch @@ -0,0 +1,143 @@ +From 05db2b7a33a6aed0cb1ce99370c436dc2b72b521 Mon Sep 17 00:00:00 2001 +From: John Ogness <john.ogness@linutronix.de> +Date: Fri, 11 Dec 2020 00:55:25 +0106 +Subject: [PATCH 096/323] printk: track/limit recursion +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Limit printk() recursion to 1 level. This is enough to print a +stacktrace for the printk call, should a WARN or BUG occur. + +Signed-off-by: John Ogness <john.ogness@linutronix.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + kernel/printk/printk.c | 74 ++++++++++++++++++++++++++++++++++++++++-- + 1 file changed, 71 insertions(+), 3 deletions(-) + +diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c +index aebc9e31b36a..31a2b7a116a7 100644 +--- a/kernel/printk/printk.c ++++ b/kernel/printk/printk.c +@@ -1943,6 +1943,65 @@ static void call_console_drivers(const char *ext_text, size_t ext_len, + } + } + ++#ifdef CONFIG_PRINTK_NMI ++#define NUM_RECURSION_CTX 2 ++#else ++#define NUM_RECURSION_CTX 1 ++#endif ++ ++struct printk_recursion { ++ char count[NUM_RECURSION_CTX]; ++}; ++ ++static DEFINE_PER_CPU(struct printk_recursion, percpu_printk_recursion); ++static char printk_recursion_count[NUM_RECURSION_CTX]; ++ ++static char *printk_recursion_counter(void) ++{ ++ struct printk_recursion *rec; ++ char *count; ++ ++ if (!printk_percpu_data_ready()) { ++ count = &printk_recursion_count[0]; ++ } else { ++ rec = this_cpu_ptr(&percpu_printk_recursion); ++ ++ count = &rec->count[0]; ++ } ++ ++#ifdef CONFIG_PRINTK_NMI ++ if (in_nmi()) ++ count++; ++#endif ++ ++ return count; ++} ++ ++static bool printk_enter_irqsave(unsigned long *flags) ++{ ++ char *count; ++ ++ local_irq_save(*flags); ++ count = printk_recursion_counter(); ++ /* Only 1 level of recursion allowed. */ ++ if (*count > 1) { ++ local_irq_restore(*flags); ++ return false; ++ } ++ (*count)++; ++ ++ return true; ++} ++ ++static void printk_exit_irqrestore(unsigned long flags) ++{ ++ char *count; ++ ++ count = printk_recursion_counter(); ++ (*count)--; ++ local_irq_restore(flags); ++} ++ + int printk_delay_msec __read_mostly; + + static inline void printk_delay(void) +@@ -2043,11 +2102,13 @@ int vprintk_store(int facility, int level, + struct prb_reserved_entry e; + enum log_flags lflags = 0; + struct printk_record r; ++ unsigned long irqflags; + u16 trunc_msg_len = 0; + char prefix_buf[8]; + u16 reserve_size; + va_list args2; + u16 text_len; ++ int ret = 0; + u64 ts_nsec; + + /* +@@ -2058,6 +2119,9 @@ int vprintk_store(int facility, int level, + */ + ts_nsec = local_clock(); + ++ if (!printk_enter_irqsave(&irqflags)) ++ return 0; ++ + /* + * The sprintf needs to come first since the syslog prefix might be + * passed in as a parameter. An extra byte must be reserved so that +@@ -2095,7 +2159,8 @@ int vprintk_store(int facility, int level, + prb_commit(&e); + } + +- return text_len; ++ ret = text_len; ++ goto out; + } + } + +@@ -2111,7 +2176,7 @@ int vprintk_store(int facility, int level, + + prb_rec_init_wr(&r, reserve_size + trunc_msg_len); + if (!prb_reserve(&e, prb, &r)) +- return 0; ++ goto out; + } + + /* fill message */ +@@ -2133,7 +2198,10 @@ int vprintk_store(int facility, int level, + else + prb_final_commit(&e); + +- return (text_len + trunc_msg_len); ++ ret = text_len + trunc_msg_len; ++out: ++ printk_exit_irqrestore(irqflags); ++ return ret; + } + + asmlinkage int vprintk_emit(int facility, int level, +-- +2.43.0 + diff --git a/debian/patches-rt/0097-printk-remove-safe-buffers.patch b/debian/patches-rt/0097-printk-remove-safe-buffers.patch new file mode 100644 index 000000000..df3318972 --- /dev/null +++ b/debian/patches-rt/0097-printk-remove-safe-buffers.patch @@ -0,0 +1,877 @@ +From c0ee407b7db3b7ac5c24337a9d365a35450c233b Mon Sep 17 00:00:00 2001 +From: John Ogness <john.ogness@linutronix.de> +Date: Mon, 30 Nov 2020 01:42:00 +0106 +Subject: [PATCH 097/323] printk: remove safe buffers +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +With @logbuf_lock removed, the high level printk functions for +storing messages are lockless. Messages can be stored from any +context, so there is no need for the NMI and safe buffers anymore. + +Remove the NMI and safe buffers. In NMI or safe contexts, store +the message immediately but still use irq_work to defer the console +printing. + +Signed-off-by: John Ogness <john.ogness@linutronix.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + arch/powerpc/kernel/traps.c | 1 - + arch/powerpc/kernel/watchdog.c | 5 - + include/linux/printk.h | 10 - + kernel/kexec_core.c | 1 - + kernel/panic.c | 3 - + kernel/printk/internal.h | 2 - + kernel/printk/printk.c | 85 ++------- + kernel/printk/printk_safe.c | 329 +-------------------------------- + lib/nmi_backtrace.c | 6 - + 9 files changed, 17 insertions(+), 425 deletions(-) + +diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c +index 5e5a2448ae79..d39a4a6b4ddf 100644 +--- a/arch/powerpc/kernel/traps.c ++++ b/arch/powerpc/kernel/traps.c +@@ -170,7 +170,6 @@ extern void panic_flush_kmsg_start(void) + + extern void panic_flush_kmsg_end(void) + { +- printk_safe_flush_on_panic(); + kmsg_dump(KMSG_DUMP_PANIC); + bust_spinlocks(0); + debug_locks_off(); +diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c +index 75b2a6c4db5a..db40e20d0c54 100644 +--- a/arch/powerpc/kernel/watchdog.c ++++ b/arch/powerpc/kernel/watchdog.c +@@ -185,11 +185,6 @@ static void watchdog_smp_panic(int cpu, u64 tb) + + wd_smp_unlock(&flags); + +- printk_safe_flush(); +- /* +- * printk_safe_flush() seems to require another print +- * before anything actually goes out to console. +- */ + if (sysctl_hardlockup_all_cpu_backtrace) + trigger_allbutself_cpu_backtrace(); + +diff --git a/include/linux/printk.h b/include/linux/printk.h +index 344f6da3d4c3..c6bb48f0134c 100644 +--- a/include/linux/printk.h ++++ b/include/linux/printk.h +@@ -207,8 +207,6 @@ __printf(1, 2) void dump_stack_set_arch_desc(const char *fmt, ...); + void dump_stack_print_info(const char *log_lvl); + void show_regs_print_info(const char *log_lvl); + extern asmlinkage void dump_stack(void) __cold; +-extern void printk_safe_flush(void); +-extern void printk_safe_flush_on_panic(void); + #else + static inline __printf(1, 0) + int vprintk(const char *s, va_list args) +@@ -272,14 +270,6 @@ static inline void show_regs_print_info(const char *log_lvl) + static inline void dump_stack(void) + { + } +- +-static inline void printk_safe_flush(void) +-{ +-} +- +-static inline void printk_safe_flush_on_panic(void) +-{ +-} + #endif + + extern int kptr_restrict; +diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c +index 3a37fc62dc95..78a1fd9c3c8a 100644 +--- a/kernel/kexec_core.c ++++ b/kernel/kexec_core.c +@@ -978,7 +978,6 @@ void crash_kexec(struct pt_regs *regs) + old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu); + if (old_cpu == PANIC_CPU_INVALID) { + /* This is the 1st CPU which comes here, so go ahead. */ +- printk_safe_flush_on_panic(); + __crash_kexec(regs); + + /* +diff --git a/kernel/panic.c b/kernel/panic.c +index bc39e2b27d31..7965f1e31224 100644 +--- a/kernel/panic.c ++++ b/kernel/panic.c +@@ -324,7 +324,6 @@ void panic(const char *fmt, ...) + * Bypass the panic_cpu check and call __crash_kexec directly. + */ + if (!_crash_kexec_post_notifiers) { +- printk_safe_flush_on_panic(); + __crash_kexec(NULL); + + /* +@@ -348,8 +347,6 @@ void panic(const char *fmt, ...) + */ + atomic_notifier_call_chain(&panic_notifier_list, 0, buf); + +- /* Call flush even twice. It tries harder with a single online CPU */ +- printk_safe_flush_on_panic(); + kmsg_dump(KMSG_DUMP_PANIC); + + /* +diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h +index e7acc2888c8e..e108b2ece8c7 100644 +--- a/kernel/printk/internal.h ++++ b/kernel/printk/internal.h +@@ -23,7 +23,6 @@ __printf(1, 0) int vprintk_func(const char *fmt, va_list args); + void __printk_safe_enter(void); + void __printk_safe_exit(void); + +-void printk_safe_init(void); + bool printk_percpu_data_ready(void); + + #define printk_safe_enter_irqsave(flags) \ +@@ -67,6 +66,5 @@ __printf(1, 0) int vprintk_func(const char *fmt, va_list args) { return 0; } + #define printk_safe_enter_irq() local_irq_disable() + #define printk_safe_exit_irq() local_irq_enable() + +-static inline void printk_safe_init(void) { } + static inline bool printk_percpu_data_ready(void) { return false; } + #endif /* CONFIG_PRINTK */ +diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c +index 31a2b7a116a7..90a4cf4c23a4 100644 +--- a/kernel/printk/printk.c ++++ b/kernel/printk/printk.c +@@ -735,27 +735,22 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, + if (ret) + return ret; + +- printk_safe_enter_irq(); + if (!prb_read_valid(prb, atomic64_read(&user->seq), r)) { + if (file->f_flags & O_NONBLOCK) { + ret = -EAGAIN; +- printk_safe_exit_irq(); + goto out; + } + +- printk_safe_exit_irq(); + ret = wait_event_interruptible(log_wait, + prb_read_valid(prb, atomic64_read(&user->seq), r)); + if (ret) + goto out; +- printk_safe_enter_irq(); + } + + if (r->info->seq != atomic64_read(&user->seq)) { + /* our last seen message is gone, return error and reset */ + atomic64_set(&user->seq, r->info->seq); + ret = -EPIPE; +- printk_safe_exit_irq(); + goto out; + } + +@@ -765,7 +760,6 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, + &r->info->dev_info); + + atomic64_set(&user->seq, r->info->seq + 1); +- printk_safe_exit_irq(); + + if (len > count) { + ret = -EINVAL; +@@ -800,7 +794,6 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) + if (offset) + return -ESPIPE; + +- printk_safe_enter_irq(); + switch (whence) { + case SEEK_SET: + /* the first record */ +@@ -821,7 +814,6 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) + default: + ret = -EINVAL; + } +- printk_safe_exit_irq(); + return ret; + } + +@@ -836,7 +828,6 @@ static __poll_t devkmsg_poll(struct file *file, poll_table *wait) + + poll_wait(file, &log_wait, wait); + +- printk_safe_enter_irq(); + if (prb_read_valid_info(prb, atomic64_read(&user->seq), &info, NULL)) { + /* return error when data has vanished underneath us */ + if (info.seq != atomic64_read(&user->seq)) +@@ -844,7 +835,6 @@ static __poll_t devkmsg_poll(struct file *file, poll_table *wait) + else + ret = EPOLLIN|EPOLLRDNORM; + } +- printk_safe_exit_irq(); + + return ret; + } +@@ -877,9 +867,7 @@ static int devkmsg_open(struct inode *inode, struct file *file) + prb_rec_init_rd(&user->record, &user->info, + &user->text_buf[0], sizeof(user->text_buf)); + +- printk_safe_enter_irq(); + atomic64_set(&user->seq, prb_first_valid_seq(prb)); +- printk_safe_exit_irq(); + + file->private_data = user; + return 0; +@@ -1045,9 +1033,6 @@ static inline void log_buf_add_cpu(void) {} + + static void __init set_percpu_data_ready(void) + { +- printk_safe_init(); +- /* Make sure we set this flag only after printk_safe() init is done */ +- barrier(); + __printk_percpu_data_ready = true; + } + +@@ -1145,8 +1130,6 @@ void __init setup_log_buf(int early) + new_descs, ilog2(new_descs_count), + new_infos); + +- printk_safe_enter_irqsave(flags); +- + log_buf_len = new_log_buf_len; + log_buf = new_log_buf; + new_log_buf_len = 0; +@@ -1162,8 +1145,6 @@ void __init setup_log_buf(int early) + */ + prb = &printk_rb_dynamic; + +- printk_safe_exit_irqrestore(flags); +- + if (seq != prb_next_seq(&printk_rb_static)) { + pr_err("dropped %llu messages\n", + prb_next_seq(&printk_rb_static) - seq); +@@ -1501,11 +1482,9 @@ static int syslog_print(char __user *buf, int size) + size_t n; + size_t skip; + +- printk_safe_enter_irq(); +- raw_spin_lock(&syslog_lock); ++ raw_spin_lock_irq(&syslog_lock); + if (!prb_read_valid(prb, syslog_seq, &r)) { +- raw_spin_unlock(&syslog_lock); +- printk_safe_exit_irq(); ++ raw_spin_unlock_irq(&syslog_lock); + break; + } + if (r.info->seq != syslog_seq) { +@@ -1534,8 +1513,7 @@ static int syslog_print(char __user *buf, int size) + syslog_partial += n; + } else + n = 0; +- raw_spin_unlock(&syslog_lock); +- printk_safe_exit_irq(); ++ raw_spin_unlock_irq(&syslog_lock); + + if (!n) + break; +@@ -1569,7 +1547,6 @@ static int syslog_print_all(char __user *buf, int size, bool clear) + return -ENOMEM; + + time = printk_time; +- printk_safe_enter_irq(); + /* + * Find first record that fits, including all following records, + * into the user-provided buffer for this dump. +@@ -1590,23 +1567,20 @@ static int syslog_print_all(char __user *buf, int size, bool clear) + break; + } + +- printk_safe_exit_irq(); + if (copy_to_user(buf + len, text, textlen)) + len = -EFAULT; + else + len += textlen; +- printk_safe_enter_irq(); + + if (len < 0) + break; + } + + if (clear) { +- raw_spin_lock(&syslog_lock); ++ raw_spin_lock_irq(&syslog_lock); + latched_seq_write(&clear_seq, seq); +- raw_spin_unlock(&syslog_lock); ++ raw_spin_unlock_irq(&syslog_lock); + } +- printk_safe_exit_irq(); + + kfree(text); + return len; +@@ -1614,11 +1588,9 @@ static int syslog_print_all(char __user *buf, int size, bool clear) + + static void syslog_clear(void) + { +- printk_safe_enter_irq(); +- raw_spin_lock(&syslog_lock); ++ raw_spin_lock_irq(&syslog_lock); + latched_seq_write(&clear_seq, prb_next_seq(prb)); +- raw_spin_unlock(&syslog_lock); +- printk_safe_exit_irq(); ++ raw_spin_unlock_irq(&syslog_lock); + } + + /* Return a consistent copy of @syslog_seq. */ +@@ -1706,12 +1678,10 @@ int do_syslog(int type, char __user *buf, int len, int source) + break; + /* Number of chars in the log buffer */ + case SYSLOG_ACTION_SIZE_UNREAD: +- printk_safe_enter_irq(); +- raw_spin_lock(&syslog_lock); ++ raw_spin_lock_irq(&syslog_lock); + if (!prb_read_valid_info(prb, syslog_seq, &info, NULL)) { + /* No unread messages. */ +- raw_spin_unlock(&syslog_lock); +- printk_safe_exit_irq(); ++ raw_spin_unlock_irq(&syslog_lock); + return 0; + } + if (info.seq != syslog_seq) { +@@ -1739,8 +1709,7 @@ int do_syslog(int type, char __user *buf, int len, int source) + } + error -= syslog_partial; + } +- raw_spin_unlock(&syslog_lock); +- printk_safe_exit_irq(); ++ raw_spin_unlock_irq(&syslog_lock); + break; + /* Size of the log buffer */ + case SYSLOG_ACTION_SIZE_BUFFER: +@@ -2210,7 +2179,6 @@ asmlinkage int vprintk_emit(int facility, int level, + { + int printed_len; + bool in_sched = false; +- unsigned long flags; + + /* Suppress unimportant messages after panic happens */ + if (unlikely(suppress_printk)) +@@ -2224,9 +2192,7 @@ asmlinkage int vprintk_emit(int facility, int level, + boot_delay_msec(level); + printk_delay(); + +- printk_safe_enter_irqsave(flags); + printed_len = vprintk_store(facility, level, dev_info, fmt, args); +- printk_safe_exit_irqrestore(flags); + + /* If called from the scheduler, we can not call up(). */ + if (!in_sched) { +@@ -2618,7 +2584,6 @@ void console_unlock(void) + { + static char ext_text[CONSOLE_EXT_LOG_MAX]; + static char text[CONSOLE_LOG_MAX]; +- unsigned long flags; + bool do_cond_resched, retry; + struct printk_info info; + struct printk_record r; +@@ -2663,7 +2628,6 @@ void console_unlock(void) + size_t ext_len = 0; + size_t len; + +- printk_safe_enter_irqsave(flags); + skip: + if (!prb_read_valid(prb, console_seq, &r)) + break; +@@ -2720,12 +2684,8 @@ void console_unlock(void) + call_console_drivers(ext_text, ext_len, text, len); + start_critical_timings(); + +- if (console_lock_spinning_disable_and_check()) { +- printk_safe_exit_irqrestore(flags); ++ if (console_lock_spinning_disable_and_check()) + return; +- } +- +- printk_safe_exit_irqrestore(flags); + + if (do_cond_resched) + cond_resched(); +@@ -2742,8 +2702,6 @@ void console_unlock(void) + * flush, no worries. + */ + retry = prb_read_valid(prb, console_seq, NULL); +- printk_safe_exit_irqrestore(flags); +- + if (retry && console_trylock()) + goto again; + } +@@ -2805,13 +2763,8 @@ void console_flush_on_panic(enum con_flush_mode mode) + console_trylock(); + console_may_schedule = 0; + +- if (mode == CONSOLE_REPLAY_ALL) { +- unsigned long flags; +- +- printk_safe_enter_irqsave(flags); ++ if (mode == CONSOLE_REPLAY_ALL) + console_seq = prb_first_valid_seq(prb); +- printk_safe_exit_irqrestore(flags); +- } + console_unlock(); + } + +@@ -3469,11 +3422,9 @@ bool kmsg_dump_get_line(struct kmsg_dumper_iter *iter, bool syslog, + struct printk_info info; + unsigned int line_count; + struct printk_record r; +- unsigned long flags; + size_t l = 0; + bool ret = false; + +- printk_safe_enter_irqsave(flags); + prb_rec_init_rd(&r, &info, line, size); + + if (!iter->active) +@@ -3497,7 +3448,6 @@ bool kmsg_dump_get_line(struct kmsg_dumper_iter *iter, bool syslog, + iter->cur_seq = r.info->seq + 1; + ret = true; + out: +- printk_safe_exit_irqrestore(flags); + if (len) + *len = l; + return ret; +@@ -3528,7 +3478,6 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper_iter *iter, bool syslog, + { + struct printk_info info; + struct printk_record r; +- unsigned long flags; + u64 seq; + u64 next_seq; + size_t len = 0; +@@ -3538,7 +3487,6 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper_iter *iter, bool syslog, + if (!iter->active || !buf || !size) + goto out; + +- printk_safe_enter_irqsave(flags); + if (prb_read_valid_info(prb, iter->cur_seq, &info, NULL)) { + if (info.seq != iter->cur_seq) { + /* messages are gone, move to first available one */ +@@ -3547,10 +3495,8 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper_iter *iter, bool syslog, + } + + /* last entry */ +- if (iter->cur_seq >= iter->next_seq) { +- printk_safe_exit_irqrestore(flags); ++ if (iter->cur_seq >= iter->next_seq) + goto out; +- } + + /* + * Find first record that fits, including all following records, +@@ -3582,7 +3528,6 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper_iter *iter, bool syslog, + + iter->next_seq = next_seq; + ret = true; +- printk_safe_exit_irqrestore(flags); + out: + if (len_out) + *len_out = len; +@@ -3600,12 +3545,8 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer); + */ + void kmsg_dump_rewind(struct kmsg_dumper_iter *iter) + { +- unsigned long flags; +- +- printk_safe_enter_irqsave(flags); + iter->cur_seq = latched_seq_read_nolock(&clear_seq); + iter->next_seq = prb_next_seq(prb); +- printk_safe_exit_irqrestore(flags); + } + EXPORT_SYMBOL_GPL(kmsg_dump_rewind); + +diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c +index 7df8a88d4115..c23b127a6545 100644 +--- a/kernel/printk/printk_safe.c ++++ b/kernel/printk/printk_safe.c +@@ -15,282 +15,9 @@ + + #include "internal.h" + +-/* +- * In NMI and safe mode, printk() avoids taking locks. Instead, +- * it uses an alternative implementation that temporary stores +- * the strings into a per-CPU buffer. The content of the buffer +- * is later flushed into the main ring buffer via IRQ work. +- * +- * The alternative implementation is chosen transparently +- * by examining current printk() context mask stored in @printk_context +- * per-CPU variable. +- * +- * The implementation allows to flush the strings also from another CPU. +- * There are situations when we want to make sure that all buffers +- * were handled or when IRQs are blocked. +- */ +- +-#define SAFE_LOG_BUF_LEN ((1 << CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT) - \ +- sizeof(atomic_t) - \ +- sizeof(atomic_t) - \ +- sizeof(struct irq_work)) +- +-struct printk_safe_seq_buf { +- atomic_t len; /* length of written data */ +- atomic_t message_lost; +- struct irq_work work; /* IRQ work that flushes the buffer */ +- unsigned char buffer[SAFE_LOG_BUF_LEN]; +-}; +- +-static DEFINE_PER_CPU(struct printk_safe_seq_buf, safe_print_seq); + static DEFINE_PER_CPU(int, printk_context); + +-static DEFINE_RAW_SPINLOCK(safe_read_lock); +- +-#ifdef CONFIG_PRINTK_NMI +-static DEFINE_PER_CPU(struct printk_safe_seq_buf, nmi_print_seq); +-#endif +- +-/* Get flushed in a more safe context. */ +-static void queue_flush_work(struct printk_safe_seq_buf *s) +-{ +- if (printk_percpu_data_ready()) +- irq_work_queue(&s->work); +-} +- +-/* +- * Add a message to per-CPU context-dependent buffer. NMI and printk-safe +- * have dedicated buffers, because otherwise printk-safe preempted by +- * NMI-printk would have overwritten the NMI messages. +- * +- * The messages are flushed from irq work (or from panic()), possibly, +- * from other CPU, concurrently with printk_safe_log_store(). Should this +- * happen, printk_safe_log_store() will notice the buffer->len mismatch +- * and repeat the write. +- */ +-static __printf(2, 0) int printk_safe_log_store(struct printk_safe_seq_buf *s, +- const char *fmt, va_list args) +-{ +- int add; +- size_t len; +- va_list ap; +- +-again: +- len = atomic_read(&s->len); +- +- /* The trailing '\0' is not counted into len. */ +- if (len >= sizeof(s->buffer) - 1) { +- atomic_inc(&s->message_lost); +- queue_flush_work(s); +- return 0; +- } +- +- /* +- * Make sure that all old data have been read before the buffer +- * was reset. This is not needed when we just append data. +- */ +- if (!len) +- smp_rmb(); +- +- va_copy(ap, args); +- add = vscnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, ap); +- va_end(ap); +- if (!add) +- return 0; +- +- /* +- * Do it once again if the buffer has been flushed in the meantime. +- * Note that atomic_cmpxchg() is an implicit memory barrier that +- * makes sure that the data were written before updating s->len. +- */ +- if (atomic_cmpxchg(&s->len, len, len + add) != len) +- goto again; +- +- queue_flush_work(s); +- return add; +-} +- +-static inline void printk_safe_flush_line(const char *text, int len) +-{ +- /* +- * Avoid any console drivers calls from here, because we may be +- * in NMI or printk_safe context (when in panic). The messages +- * must go only into the ring buffer at this stage. Consoles will +- * get explicitly called later when a crashdump is not generated. +- */ +- printk_deferred("%.*s", len, text); +-} +- +-/* printk part of the temporary buffer line by line */ +-static int printk_safe_flush_buffer(const char *start, size_t len) +-{ +- const char *c, *end; +- bool header; +- +- c = start; +- end = start + len; +- header = true; +- +- /* Print line by line. */ +- while (c < end) { +- if (*c == '\n') { +- printk_safe_flush_line(start, c - start + 1); +- start = ++c; +- header = true; +- continue; +- } +- +- /* Handle continuous lines or missing new line. */ +- if ((c + 1 < end) && printk_get_level(c)) { +- if (header) { +- c = printk_skip_level(c); +- continue; +- } +- +- printk_safe_flush_line(start, c - start); +- start = c++; +- header = true; +- continue; +- } +- +- header = false; +- c++; +- } +- +- /* Check if there was a partial line. Ignore pure header. */ +- if (start < end && !header) { +- static const char newline[] = KERN_CONT "\n"; +- +- printk_safe_flush_line(start, end - start); +- printk_safe_flush_line(newline, strlen(newline)); +- } +- +- return len; +-} +- +-static void report_message_lost(struct printk_safe_seq_buf *s) +-{ +- int lost = atomic_xchg(&s->message_lost, 0); +- +- if (lost) +- printk_deferred("Lost %d message(s)!\n", lost); +-} +- +-/* +- * Flush data from the associated per-CPU buffer. The function +- * can be called either via IRQ work or independently. +- */ +-static void __printk_safe_flush(struct irq_work *work) +-{ +- struct printk_safe_seq_buf *s = +- container_of(work, struct printk_safe_seq_buf, work); +- unsigned long flags; +- size_t len; +- int i; +- +- /* +- * The lock has two functions. First, one reader has to flush all +- * available message to make the lockless synchronization with +- * writers easier. Second, we do not want to mix messages from +- * different CPUs. This is especially important when printing +- * a backtrace. +- */ +- raw_spin_lock_irqsave(&safe_read_lock, flags); +- +- i = 0; +-more: +- len = atomic_read(&s->len); +- +- /* +- * This is just a paranoid check that nobody has manipulated +- * the buffer an unexpected way. If we printed something then +- * @len must only increase. Also it should never overflow the +- * buffer size. +- */ +- if ((i && i >= len) || len > sizeof(s->buffer)) { +- const char *msg = "printk_safe_flush: internal error\n"; +- +- printk_safe_flush_line(msg, strlen(msg)); +- len = 0; +- } +- +- if (!len) +- goto out; /* Someone else has already flushed the buffer. */ +- +- /* Make sure that data has been written up to the @len */ +- smp_rmb(); +- i += printk_safe_flush_buffer(s->buffer + i, len - i); +- +- /* +- * Check that nothing has got added in the meantime and truncate +- * the buffer. Note that atomic_cmpxchg() is an implicit memory +- * barrier that makes sure that the data were copied before +- * updating s->len. +- */ +- if (atomic_cmpxchg(&s->len, len, 0) != len) +- goto more; +- +-out: +- report_message_lost(s); +- raw_spin_unlock_irqrestore(&safe_read_lock, flags); +-} +- +-/** +- * printk_safe_flush - flush all per-cpu nmi buffers. +- * +- * The buffers are flushed automatically via IRQ work. This function +- * is useful only when someone wants to be sure that all buffers have +- * been flushed at some point. +- */ +-void printk_safe_flush(void) +-{ +- int cpu; +- +- for_each_possible_cpu(cpu) { +-#ifdef CONFIG_PRINTK_NMI +- __printk_safe_flush(&per_cpu(nmi_print_seq, cpu).work); +-#endif +- __printk_safe_flush(&per_cpu(safe_print_seq, cpu).work); +- } +-} +- +-/** +- * printk_safe_flush_on_panic - flush all per-cpu nmi buffers when the system +- * goes down. +- * +- * Similar to printk_safe_flush() but it can be called even in NMI context when +- * the system goes down. It does the best effort to get NMI messages into +- * the main ring buffer. +- * +- * Note that it could try harder when there is only one CPU online. +- */ +-void printk_safe_flush_on_panic(void) +-{ +- if (raw_spin_is_locked(&safe_read_lock)) { +- if (num_online_cpus() > 1) +- return; +- +- debug_locks_off(); +- raw_spin_lock_init(&safe_read_lock); +- } +- +- printk_safe_flush(); +-} +- + #ifdef CONFIG_PRINTK_NMI +-/* +- * Safe printk() for NMI context. It uses a per-CPU buffer to +- * store the message. NMIs are not nested, so there is always only +- * one writer running. But the buffer might get flushed from another +- * CPU, so we need to be careful. +- */ +-static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args) +-{ +- struct printk_safe_seq_buf *s = this_cpu_ptr(&nmi_print_seq); +- +- return printk_safe_log_store(s, fmt, args); +-} +- + void noinstr printk_nmi_enter(void) + { + this_cpu_add(printk_context, PRINTK_NMI_CONTEXT_OFFSET); +@@ -305,9 +32,6 @@ void noinstr printk_nmi_exit(void) + * Marks a code that might produce many messages in NMI context + * and the risk of losing them is more critical than eventual + * reordering. +- * +- * It has effect only when called in NMI context. Then printk() +- * will store the messages into the main logbuf directly. + */ + void printk_nmi_direct_enter(void) + { +@@ -320,27 +44,8 @@ void printk_nmi_direct_exit(void) + this_cpu_and(printk_context, ~PRINTK_NMI_DIRECT_CONTEXT_MASK); + } + +-#else +- +-static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args) +-{ +- return 0; +-} +- + #endif /* CONFIG_PRINTK_NMI */ + +-/* +- * Lock-less printk(), to avoid deadlocks should the printk() recurse +- * into itself. It uses a per-CPU buffer to store the message, just like +- * NMI. +- */ +-static __printf(1, 0) int vprintk_safe(const char *fmt, va_list args) +-{ +- struct printk_safe_seq_buf *s = this_cpu_ptr(&safe_print_seq); +- +- return printk_safe_log_store(s, fmt, args); +-} +- + /* Can be preempted by NMI. */ + void __printk_safe_enter(void) + { +@@ -365,8 +70,10 @@ __printf(1, 0) int vprintk_func(const char *fmt, va_list args) + * Use the main logbuf even in NMI. But avoid calling console + * drivers that might have their own locks. + */ +- if ((this_cpu_read(printk_context) & PRINTK_NMI_DIRECT_CONTEXT_MASK)) { +- unsigned long flags; ++ if (this_cpu_read(printk_context) & ++ (PRINTK_NMI_DIRECT_CONTEXT_MASK | ++ PRINTK_NMI_CONTEXT_MASK | ++ PRINTK_SAFE_CONTEXT_MASK)) { + int len; + + printk_safe_enter_irqsave(flags); +@@ -376,34 +83,6 @@ __printf(1, 0) int vprintk_func(const char *fmt, va_list args) + return len; + } + +- /* Use extra buffer in NMI. */ +- if (this_cpu_read(printk_context) & PRINTK_NMI_CONTEXT_MASK) +- return vprintk_nmi(fmt, args); +- +- /* Use extra buffer to prevent a recursion deadlock in safe mode. */ +- if (this_cpu_read(printk_context) & PRINTK_SAFE_CONTEXT_MASK) +- return vprintk_safe(fmt, args); +- + /* No obstacles. */ + return vprintk_default(fmt, args); + } +- +-void __init printk_safe_init(void) +-{ +- int cpu; +- +- for_each_possible_cpu(cpu) { +- struct printk_safe_seq_buf *s; +- +- s = &per_cpu(safe_print_seq, cpu); +- init_irq_work(&s->work, __printk_safe_flush); +- +-#ifdef CONFIG_PRINTK_NMI +- s = &per_cpu(nmi_print_seq, cpu); +- init_irq_work(&s->work, __printk_safe_flush); +-#endif +- } +- +- /* Flush pending messages that did not have scheduled IRQ works. */ +- printk_safe_flush(); +-} +diff --git a/lib/nmi_backtrace.c b/lib/nmi_backtrace.c +index 8abe1870dba4..b09a490f5f70 100644 +--- a/lib/nmi_backtrace.c ++++ b/lib/nmi_backtrace.c +@@ -75,12 +75,6 @@ void nmi_trigger_cpumask_backtrace(const cpumask_t *mask, + touch_softlockup_watchdog(); + } + +- /* +- * Force flush any remote buffers that might be stuck in IRQ context +- * and therefore could not run their irq_work. +- */ +- printk_safe_flush(); +- + clear_bit_unlock(0, &backtrace_flag); + put_cpu(); + } +-- +2.43.0 + diff --git a/debian/patches-rt/0098-printk-convert-syslog_lock-to-spin_lock.patch b/debian/patches-rt/0098-printk-convert-syslog_lock-to-spin_lock.patch new file mode 100644 index 000000000..639c0cf84 --- /dev/null +++ b/debian/patches-rt/0098-printk-convert-syslog_lock-to-spin_lock.patch @@ -0,0 +1,119 @@ +From 8e6eb9e51cf4c4efeeb61910c35b01dcce2ec502 Mon Sep 17 00:00:00 2001 +From: John Ogness <john.ogness@linutronix.de> +Date: Thu, 18 Feb 2021 17:37:41 +0100 +Subject: [PATCH 098/323] printk: convert @syslog_lock to spin_lock +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Signed-off-by: John Ogness <john.ogness@linutronix.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + kernel/printk/printk.c | 30 +++++++++++++++--------------- + 1 file changed, 15 insertions(+), 15 deletions(-) + +diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c +index 90a4cf4c23a4..57f3b8d7f35c 100644 +--- a/kernel/printk/printk.c ++++ b/kernel/printk/printk.c +@@ -358,7 +358,7 @@ enum log_flags { + }; + + /* syslog_lock protects syslog_* variables and write access to clear_seq. */ +-static DEFINE_RAW_SPINLOCK(syslog_lock); ++static DEFINE_SPINLOCK(syslog_lock); + + #ifdef CONFIG_PRINTK + DECLARE_WAIT_QUEUE_HEAD(log_wait); +@@ -1482,9 +1482,9 @@ static int syslog_print(char __user *buf, int size) + size_t n; + size_t skip; + +- raw_spin_lock_irq(&syslog_lock); ++ spin_lock_irq(&syslog_lock); + if (!prb_read_valid(prb, syslog_seq, &r)) { +- raw_spin_unlock_irq(&syslog_lock); ++ spin_unlock_irq(&syslog_lock); + break; + } + if (r.info->seq != syslog_seq) { +@@ -1513,7 +1513,7 @@ static int syslog_print(char __user *buf, int size) + syslog_partial += n; + } else + n = 0; +- raw_spin_unlock_irq(&syslog_lock); ++ spin_unlock_irq(&syslog_lock); + + if (!n) + break; +@@ -1577,9 +1577,9 @@ static int syslog_print_all(char __user *buf, int size, bool clear) + } + + if (clear) { +- raw_spin_lock_irq(&syslog_lock); ++ spin_lock_irq(&syslog_lock); + latched_seq_write(&clear_seq, seq); +- raw_spin_unlock_irq(&syslog_lock); ++ spin_unlock_irq(&syslog_lock); + } + + kfree(text); +@@ -1588,9 +1588,9 @@ static int syslog_print_all(char __user *buf, int size, bool clear) + + static void syslog_clear(void) + { +- raw_spin_lock_irq(&syslog_lock); ++ spin_lock_irq(&syslog_lock); + latched_seq_write(&clear_seq, prb_next_seq(prb)); +- raw_spin_unlock_irq(&syslog_lock); ++ spin_unlock_irq(&syslog_lock); + } + + /* Return a consistent copy of @syslog_seq. */ +@@ -1598,9 +1598,9 @@ static u64 read_syslog_seq_irq(void) + { + u64 seq; + +- raw_spin_lock_irq(&syslog_lock); ++ spin_lock_irq(&syslog_lock); + seq = syslog_seq; +- raw_spin_unlock_irq(&syslog_lock); ++ spin_unlock_irq(&syslog_lock); + + return seq; + } +@@ -1678,10 +1678,10 @@ int do_syslog(int type, char __user *buf, int len, int source) + break; + /* Number of chars in the log buffer */ + case SYSLOG_ACTION_SIZE_UNREAD: +- raw_spin_lock_irq(&syslog_lock); ++ spin_lock_irq(&syslog_lock); + if (!prb_read_valid_info(prb, syslog_seq, &info, NULL)) { + /* No unread messages. */ +- raw_spin_unlock_irq(&syslog_lock); ++ spin_unlock_irq(&syslog_lock); + return 0; + } + if (info.seq != syslog_seq) { +@@ -1709,7 +1709,7 @@ int do_syslog(int type, char __user *buf, int len, int source) + } + error -= syslog_partial; + } +- raw_spin_unlock_irq(&syslog_lock); ++ spin_unlock_irq(&syslog_lock); + break; + /* Size of the log buffer */ + case SYSLOG_ACTION_SIZE_BUFFER: +@@ -3004,9 +3004,9 @@ void register_console(struct console *newcon) + exclusive_console_stop_seq = console_seq; + + /* Get a consistent copy of @syslog_seq. */ +- raw_spin_lock_irqsave(&syslog_lock, flags); ++ spin_lock_irqsave(&syslog_lock, flags); + console_seq = syslog_seq; +- raw_spin_unlock_irqrestore(&syslog_lock, flags); ++ spin_unlock_irqrestore(&syslog_lock, flags); + } + console_unlock(); + console_sysfs_notify(); +-- +2.43.0 + diff --git a/debian/patches-rt/0099-console-add-write_atomic-interface.patch b/debian/patches-rt/0099-console-add-write_atomic-interface.patch new file mode 100644 index 000000000..86f553013 --- /dev/null +++ b/debian/patches-rt/0099-console-add-write_atomic-interface.patch @@ -0,0 +1,163 @@ +From a021f069828ba9181d19f5ee9a977cf305ddaf19 Mon Sep 17 00:00:00 2001 +From: John Ogness <john.ogness@linutronix.de> +Date: Mon, 30 Nov 2020 01:42:01 +0106 +Subject: [PATCH 099/323] console: add write_atomic interface +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Add a write_atomic() callback to the console. This is an optional +function for console drivers. The function must be atomic (including +NMI safe) for writing to the console. + +Console drivers must still implement the write() callback. The +write_atomic() callback will only be used in special situations, +such as when the kernel panics. + +Creating an NMI safe write_atomic() that must synchronize with +write() requires a careful implementation of the console driver. To +aid with the implementation, a set of console_atomic_*() functions +are provided: + + void console_atomic_lock(unsigned int *flags); + void console_atomic_unlock(unsigned int flags); + +These functions synchronize using a processor-reentrant spinlock +(called a cpulock). + +Signed-off-by: John Ogness <john.ogness@linutronix.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/linux/console.h | 4 ++ + kernel/printk/printk.c | 100 ++++++++++++++++++++++++++++++++++++++++ + 2 files changed, 104 insertions(+) + +diff --git a/include/linux/console.h b/include/linux/console.h +index bc2a749e6f0d..613df76903f5 100644 +--- a/include/linux/console.h ++++ b/include/linux/console.h +@@ -141,6 +141,7 @@ static inline int con_debug_leave(void) + struct console { + char name[16]; + void (*write)(struct console *, const char *, unsigned); ++ void (*write_atomic)(struct console *co, const char *s, unsigned int count); + int (*read)(struct console *, char *, unsigned); + struct tty_driver *(*device)(struct console *, int *); + void (*unblank)(void); +@@ -232,4 +233,7 @@ extern void console_init(void); + void dummycon_register_output_notifier(struct notifier_block *nb); + void dummycon_unregister_output_notifier(struct notifier_block *nb); + ++extern void console_atomic_lock(unsigned int *flags); ++extern void console_atomic_unlock(unsigned int flags); ++ + #endif /* _LINUX_CONSOLE_H */ +diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c +index 57f3b8d7f35c..8768473712b2 100644 +--- a/kernel/printk/printk.c ++++ b/kernel/printk/printk.c +@@ -3551,3 +3551,103 @@ void kmsg_dump_rewind(struct kmsg_dumper_iter *iter) + EXPORT_SYMBOL_GPL(kmsg_dump_rewind); + + #endif ++ ++struct prb_cpulock { ++ atomic_t owner; ++ unsigned long __percpu *irqflags; ++}; ++ ++#define DECLARE_STATIC_PRINTKRB_CPULOCK(name) \ ++static DEFINE_PER_CPU(unsigned long, _##name##_percpu_irqflags); \ ++static struct prb_cpulock name = { \ ++ .owner = ATOMIC_INIT(-1), \ ++ .irqflags = &_##name##_percpu_irqflags, \ ++} ++ ++static bool __prb_trylock(struct prb_cpulock *cpu_lock, ++ unsigned int *cpu_store) ++{ ++ unsigned long *flags; ++ unsigned int cpu; ++ ++ cpu = get_cpu(); ++ ++ *cpu_store = atomic_read(&cpu_lock->owner); ++ /* memory barrier to ensure the current lock owner is visible */ ++ smp_rmb(); ++ if (*cpu_store == -1) { ++ flags = per_cpu_ptr(cpu_lock->irqflags, cpu); ++ local_irq_save(*flags); ++ if (atomic_try_cmpxchg_acquire(&cpu_lock->owner, ++ cpu_store, cpu)) { ++ return true; ++ } ++ local_irq_restore(*flags); ++ } else if (*cpu_store == cpu) { ++ return true; ++ } ++ ++ put_cpu(); ++ return false; ++} ++ ++/* ++ * prb_lock: Perform a processor-reentrant spin lock. ++ * @cpu_lock: A pointer to the lock object. ++ * @cpu_store: A "flags" pointer to store lock status information. ++ * ++ * If no processor has the lock, the calling processor takes the lock and ++ * becomes the owner. If the calling processor is already the owner of the ++ * lock, this function succeeds immediately. If lock is locked by another ++ * processor, this function spins until the calling processor becomes the ++ * owner. ++ * ++ * It is safe to call this function from any context and state. ++ */ ++static void prb_lock(struct prb_cpulock *cpu_lock, unsigned int *cpu_store) ++{ ++ for (;;) { ++ if (__prb_trylock(cpu_lock, cpu_store)) ++ break; ++ cpu_relax(); ++ } ++} ++ ++/* ++ * prb_unlock: Perform a processor-reentrant spin unlock. ++ * @cpu_lock: A pointer to the lock object. ++ * @cpu_store: A "flags" object storing lock status information. ++ * ++ * Release the lock. The calling processor must be the owner of the lock. ++ * ++ * It is safe to call this function from any context and state. ++ */ ++static void prb_unlock(struct prb_cpulock *cpu_lock, unsigned int cpu_store) ++{ ++ unsigned long *flags; ++ unsigned int cpu; ++ ++ cpu = atomic_read(&cpu_lock->owner); ++ atomic_set_release(&cpu_lock->owner, cpu_store); ++ ++ if (cpu_store == -1) { ++ flags = per_cpu_ptr(cpu_lock->irqflags, cpu); ++ local_irq_restore(*flags); ++ } ++ ++ put_cpu(); ++} ++ ++DECLARE_STATIC_PRINTKRB_CPULOCK(printk_cpulock); ++ ++void console_atomic_lock(unsigned int *flags) ++{ ++ prb_lock(&printk_cpulock, flags); ++} ++EXPORT_SYMBOL(console_atomic_lock); ++ ++void console_atomic_unlock(unsigned int flags) ++{ ++ prb_unlock(&printk_cpulock, flags); ++} ++EXPORT_SYMBOL(console_atomic_unlock); +-- +2.43.0 + diff --git a/debian/patches-rt/0100-serial-8250-implement-write_atomic.patch b/debian/patches-rt/0100-serial-8250-implement-write_atomic.patch new file mode 100644 index 000000000..ad692ff78 --- /dev/null +++ b/debian/patches-rt/0100-serial-8250-implement-write_atomic.patch @@ -0,0 +1,500 @@ +From 713182575337acb0891fcfaef75c3a36408d1766 Mon Sep 17 00:00:00 2001 +From: John Ogness <john.ogness@linutronix.de> +Date: Mon, 30 Nov 2020 01:42:02 +0106 +Subject: [PATCH 100/323] serial: 8250: implement write_atomic +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Implement a non-sleeping NMI-safe write_atomic() console function in +order to support emergency console printing. + +Since interrupts need to be disabled during transmit, all usage of +the IER register is wrapped with access functions that use the +console_atomic_lock() function to synchronize register access while +tracking the state of the interrupts. This is necessary because +write_atomic() can be called from an NMI context that has preempted +write_atomic(). + +Signed-off-by: John Ogness <john.ogness@linutronix.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + drivers/tty/serial/8250/8250.h | 47 ++++++++++++- + drivers/tty/serial/8250/8250_core.c | 17 +++-- + drivers/tty/serial/8250/8250_fsl.c | 9 +++ + drivers/tty/serial/8250/8250_ingenic.c | 7 ++ + drivers/tty/serial/8250/8250_mtk.c | 29 +++++++- + drivers/tty/serial/8250/8250_port.c | 92 ++++++++++++++++---------- + include/linux/serial_8250.h | 5 ++ + 7 files changed, 162 insertions(+), 44 deletions(-) + +diff --git a/drivers/tty/serial/8250/8250.h b/drivers/tty/serial/8250/8250.h +index 61b11490ae5b..32c534b87397 100644 +--- a/drivers/tty/serial/8250/8250.h ++++ b/drivers/tty/serial/8250/8250.h +@@ -152,12 +152,55 @@ static inline void serial_dl_write(struct uart_8250_port *up, int value) + up->dl_write(up, value); + } + ++static inline void serial8250_set_IER(struct uart_8250_port *up, ++ unsigned char ier) ++{ ++ struct uart_port *port = &up->port; ++ unsigned int flags; ++ bool is_console; ++ ++ is_console = uart_console(port); ++ ++ if (is_console) ++ console_atomic_lock(&flags); ++ ++ serial_out(up, UART_IER, ier); ++ ++ if (is_console) ++ console_atomic_unlock(flags); ++} ++ ++static inline unsigned char serial8250_clear_IER(struct uart_8250_port *up) ++{ ++ struct uart_port *port = &up->port; ++ unsigned int clearval = 0; ++ unsigned int prior; ++ unsigned int flags; ++ bool is_console; ++ ++ is_console = uart_console(port); ++ ++ if (up->capabilities & UART_CAP_UUE) ++ clearval = UART_IER_UUE; ++ ++ if (is_console) ++ console_atomic_lock(&flags); ++ ++ prior = serial_port_in(port, UART_IER); ++ serial_port_out(port, UART_IER, clearval); ++ ++ if (is_console) ++ console_atomic_unlock(flags); ++ ++ return prior; ++} ++ + static inline bool serial8250_set_THRI(struct uart_8250_port *up) + { + if (up->ier & UART_IER_THRI) + return false; + up->ier |= UART_IER_THRI; +- serial_out(up, UART_IER, up->ier); ++ serial8250_set_IER(up, up->ier); + return true; + } + +@@ -166,7 +209,7 @@ static inline bool serial8250_clear_THRI(struct uart_8250_port *up) + if (!(up->ier & UART_IER_THRI)) + return false; + up->ier &= ~UART_IER_THRI; +- serial_out(up, UART_IER, up->ier); ++ serial8250_set_IER(up, up->ier); + return true; + } + +diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c +index 43f2eed6df78..687119fe2f8c 100644 +--- a/drivers/tty/serial/8250/8250_core.c ++++ b/drivers/tty/serial/8250/8250_core.c +@@ -275,10 +275,8 @@ static void serial8250_backup_timeout(struct timer_list *t) + * Must disable interrupts or else we risk racing with the interrupt + * based handler. + */ +- if (up->port.irq) { +- ier = serial_in(up, UART_IER); +- serial_out(up, UART_IER, 0); +- } ++ if (up->port.irq) ++ ier = serial8250_clear_IER(up); + + iir = serial_in(up, UART_IIR); + +@@ -301,7 +299,7 @@ static void serial8250_backup_timeout(struct timer_list *t) + serial8250_tx_chars(up); + + if (up->port.irq) +- serial_out(up, UART_IER, ier); ++ serial8250_set_IER(up, ier); + + spin_unlock_irqrestore(&up->port.lock, flags); + +@@ -588,6 +586,14 @@ serial8250_register_ports(struct uart_driver *drv, struct device *dev) + + #ifdef CONFIG_SERIAL_8250_CONSOLE + ++static void univ8250_console_write_atomic(struct console *co, const char *s, ++ unsigned int count) ++{ ++ struct uart_8250_port *up = &serial8250_ports[co->index]; ++ ++ serial8250_console_write_atomic(up, s, count); ++} ++ + static void univ8250_console_write(struct console *co, const char *s, + unsigned int count) + { +@@ -681,6 +687,7 @@ static int univ8250_console_match(struct console *co, char *name, int idx, + + static struct console univ8250_console = { + .name = "ttyS", ++ .write_atomic = univ8250_console_write_atomic, + .write = univ8250_console_write, + .device = uart_console_device, + .setup = univ8250_console_setup, +diff --git a/drivers/tty/serial/8250/8250_fsl.c b/drivers/tty/serial/8250/8250_fsl.c +index fbcc90c31ca1..b33cb454ce03 100644 +--- a/drivers/tty/serial/8250/8250_fsl.c ++++ b/drivers/tty/serial/8250/8250_fsl.c +@@ -60,9 +60,18 @@ int fsl8250_handle_irq(struct uart_port *port) + + /* Stop processing interrupts on input overrun */ + if ((orig_lsr & UART_LSR_OE) && (up->overrun_backoff_time_ms > 0)) { ++ unsigned int ca_flags; + unsigned long delay; ++ bool is_console; + ++ is_console = uart_console(port); ++ ++ if (is_console) ++ console_atomic_lock(&ca_flags); + up->ier = port->serial_in(port, UART_IER); ++ if (is_console) ++ console_atomic_unlock(ca_flags); ++ + if (up->ier & (UART_IER_RLSI | UART_IER_RDI)) { + port->ops->stop_rx(port); + } else { +diff --git a/drivers/tty/serial/8250/8250_ingenic.c b/drivers/tty/serial/8250/8250_ingenic.c +index 988bf6bcce42..bcd26d672539 100644 +--- a/drivers/tty/serial/8250/8250_ingenic.c ++++ b/drivers/tty/serial/8250/8250_ingenic.c +@@ -146,6 +146,8 @@ OF_EARLYCON_DECLARE(x1000_uart, "ingenic,x1000-uart", + + static void ingenic_uart_serial_out(struct uart_port *p, int offset, int value) + { ++ unsigned int flags; ++ bool is_console; + int ier; + + switch (offset) { +@@ -167,7 +169,12 @@ static void ingenic_uart_serial_out(struct uart_port *p, int offset, int value) + * If we have enabled modem status IRQs we should enable + * modem mode. + */ ++ is_console = uart_console(p); ++ if (is_console) ++ console_atomic_lock(&flags); + ier = p->serial_in(p, UART_IER); ++ if (is_console) ++ console_atomic_unlock(flags); + + if (ier & UART_IER_MSI) + value |= UART_MCR_MDCE | UART_MCR_FCM; +diff --git a/drivers/tty/serial/8250/8250_mtk.c b/drivers/tty/serial/8250/8250_mtk.c +index de48a58460f4..d246f2755fed 100644 +--- a/drivers/tty/serial/8250/8250_mtk.c ++++ b/drivers/tty/serial/8250/8250_mtk.c +@@ -222,12 +222,37 @@ static void mtk8250_shutdown(struct uart_port *port) + + static void mtk8250_disable_intrs(struct uart_8250_port *up, int mask) + { +- serial_out(up, UART_IER, serial_in(up, UART_IER) & (~mask)); ++ struct uart_port *port = &up->port; ++ unsigned int flags; ++ unsigned int ier; ++ bool is_console; ++ ++ is_console = uart_console(port); ++ ++ if (is_console) ++ console_atomic_lock(&flags); ++ ++ ier = serial_in(up, UART_IER); ++ serial_out(up, UART_IER, ier & (~mask)); ++ ++ if (is_console) ++ console_atomic_unlock(flags); + } + + static void mtk8250_enable_intrs(struct uart_8250_port *up, int mask) + { +- serial_out(up, UART_IER, serial_in(up, UART_IER) | mask); ++ struct uart_port *port = &up->port; ++ unsigned int flags; ++ unsigned int ier; ++ ++ if (uart_console(port)) ++ console_atomic_lock(&flags); ++ ++ ier = serial_in(up, UART_IER); ++ serial_out(up, UART_IER, ier | mask); ++ ++ if (uart_console(port)) ++ console_atomic_unlock(flags); + } + + static void mtk8250_set_flow_ctrl(struct uart_8250_port *up, int mode) +diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c +index 8b49ac4856d2..947737d0e46b 100644 +--- a/drivers/tty/serial/8250/8250_port.c ++++ b/drivers/tty/serial/8250/8250_port.c +@@ -730,7 +730,7 @@ static void serial8250_set_sleep(struct uart_8250_port *p, int sleep) + serial_out(p, UART_EFR, UART_EFR_ECB); + serial_out(p, UART_LCR, 0); + } +- serial_out(p, UART_IER, sleep ? UART_IERX_SLEEP : 0); ++ serial8250_set_IER(p, sleep ? UART_IERX_SLEEP : 0); + if (p->capabilities & UART_CAP_EFR) { + serial_out(p, UART_LCR, UART_LCR_CONF_MODE_B); + serial_out(p, UART_EFR, efr); +@@ -1405,7 +1405,7 @@ static void serial8250_stop_rx(struct uart_port *port) + + up->ier &= ~(UART_IER_RLSI | UART_IER_RDI); + up->port.read_status_mask &= ~UART_LSR_DR; +- serial_port_out(port, UART_IER, up->ier); ++ serial8250_set_IER(up, up->ier); + + serial8250_rpm_put(up); + } +@@ -1435,7 +1435,7 @@ void serial8250_em485_stop_tx(struct uart_8250_port *p) + serial8250_clear_and_reinit_fifos(p); + + p->ier |= UART_IER_RLSI | UART_IER_RDI; +- serial_port_out(&p->port, UART_IER, p->ier); ++ serial8250_set_IER(p, p->ier); + } + } + EXPORT_SYMBOL_GPL(serial8250_em485_stop_tx); +@@ -1677,7 +1677,7 @@ static void serial8250_disable_ms(struct uart_port *port) + mctrl_gpio_disable_ms(up->gpios); + + up->ier &= ~UART_IER_MSI; +- serial_port_out(port, UART_IER, up->ier); ++ serial8250_set_IER(up, up->ier); + } + + static void serial8250_enable_ms(struct uart_port *port) +@@ -1693,7 +1693,7 @@ static void serial8250_enable_ms(struct uart_port *port) + up->ier |= UART_IER_MSI; + + serial8250_rpm_get(up); +- serial_port_out(port, UART_IER, up->ier); ++ serial8250_set_IER(up, up->ier); + serial8250_rpm_put(up); + } + +@@ -2129,14 +2129,7 @@ static void serial8250_put_poll_char(struct uart_port *port, + struct uart_8250_port *up = up_to_u8250p(port); + + serial8250_rpm_get(up); +- /* +- * First save the IER then disable the interrupts +- */ +- ier = serial_port_in(port, UART_IER); +- if (up->capabilities & UART_CAP_UUE) +- serial_port_out(port, UART_IER, UART_IER_UUE); +- else +- serial_port_out(port, UART_IER, 0); ++ ier = serial8250_clear_IER(up); + + wait_for_xmitr(up, BOTH_EMPTY); + /* +@@ -2149,7 +2142,7 @@ static void serial8250_put_poll_char(struct uart_port *port, + * and restore the IER + */ + wait_for_xmitr(up, BOTH_EMPTY); +- serial_port_out(port, UART_IER, ier); ++ serial8250_set_IER(up, ier); + serial8250_rpm_put(up); + } + +@@ -2454,7 +2447,7 @@ void serial8250_do_shutdown(struct uart_port *port) + */ + spin_lock_irqsave(&port->lock, flags); + up->ier = 0; +- serial_port_out(port, UART_IER, 0); ++ serial8250_set_IER(up, 0); + spin_unlock_irqrestore(&port->lock, flags); + + synchronize_irq(port->irq); +@@ -2806,7 +2799,7 @@ serial8250_do_set_termios(struct uart_port *port, struct ktermios *termios, + if (up->capabilities & UART_CAP_RTOIE) + up->ier |= UART_IER_RTOIE; + +- serial_port_out(port, UART_IER, up->ier); ++ serial8250_set_IER(up, up->ier); + + if (up->capabilities & UART_CAP_EFR) { + unsigned char efr = 0; +@@ -3271,7 +3264,7 @@ EXPORT_SYMBOL_GPL(serial8250_set_defaults); + + #ifdef CONFIG_SERIAL_8250_CONSOLE + +-static void serial8250_console_putchar(struct uart_port *port, int ch) ++static void serial8250_console_putchar_locked(struct uart_port *port, int ch) + { + struct uart_8250_port *up = up_to_u8250p(port); + +@@ -3279,6 +3272,18 @@ static void serial8250_console_putchar(struct uart_port *port, int ch) + serial_port_out(port, UART_TX, ch); + } + ++static void serial8250_console_putchar(struct uart_port *port, int ch) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ unsigned int flags; ++ ++ wait_for_xmitr(up, UART_LSR_THRE); ++ ++ console_atomic_lock(&flags); ++ serial8250_console_putchar_locked(port, ch); ++ console_atomic_unlock(flags); ++} ++ + /* + * Restore serial console when h/w power-off detected + */ +@@ -3305,6 +3310,32 @@ static void serial8250_console_restore(struct uart_8250_port *up) + serial8250_out_MCR(up, up->mcr | UART_MCR_DTR | UART_MCR_RTS); + } + ++void serial8250_console_write_atomic(struct uart_8250_port *up, ++ const char *s, unsigned int count) ++{ ++ struct uart_port *port = &up->port; ++ unsigned int flags; ++ unsigned int ier; ++ ++ console_atomic_lock(&flags); ++ ++ touch_nmi_watchdog(); ++ ++ ier = serial8250_clear_IER(up); ++ ++ if (atomic_fetch_inc(&up->console_printing)) { ++ uart_console_write(port, "\n", 1, ++ serial8250_console_putchar_locked); ++ } ++ uart_console_write(port, s, count, serial8250_console_putchar_locked); ++ atomic_dec(&up->console_printing); ++ ++ wait_for_xmitr(up, BOTH_EMPTY); ++ serial8250_set_IER(up, ier); ++ ++ console_atomic_unlock(flags); ++} ++ + /* + * Print a string to the serial port trying not to disturb + * any possible real use of the port... +@@ -3321,24 +3352,12 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s, + struct uart_port *port = &up->port; + unsigned long flags; + unsigned int ier; +- int locked = 1; + + touch_nmi_watchdog(); + +- if (oops_in_progress) +- locked = spin_trylock_irqsave(&port->lock, flags); +- else +- spin_lock_irqsave(&port->lock, flags); +- +- /* +- * First save the IER then disable the interrupts +- */ +- ier = serial_port_in(port, UART_IER); ++ spin_lock_irqsave(&port->lock, flags); + +- if (up->capabilities & UART_CAP_UUE) +- serial_port_out(port, UART_IER, UART_IER_UUE); +- else +- serial_port_out(port, UART_IER, 0); ++ ier = serial8250_clear_IER(up); + + /* check scratch reg to see if port powered off during system sleep */ + if (up->canary && (up->canary != serial_port_in(port, UART_SCR))) { +@@ -3352,7 +3371,9 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s, + mdelay(port->rs485.delay_rts_before_send); + } + ++ atomic_inc(&up->console_printing); + uart_console_write(port, s, count, serial8250_console_putchar); ++ atomic_dec(&up->console_printing); + + /* + * Finally, wait for transmitter to become empty +@@ -3365,8 +3386,7 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s, + if (em485->tx_stopped) + up->rs485_stop_tx(up); + } +- +- serial_port_out(port, UART_IER, ier); ++ serial8250_set_IER(up, ier); + + /* + * The receive handling will happen properly because the +@@ -3378,8 +3398,7 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s, + if (up->msr_saved_flags) + serial8250_modem_status(up); + +- if (locked) +- spin_unlock_irqrestore(&port->lock, flags); ++ spin_unlock_irqrestore(&port->lock, flags); + } + + static unsigned int probe_baud(struct uart_port *port) +@@ -3399,6 +3418,7 @@ static unsigned int probe_baud(struct uart_port *port) + + int serial8250_console_setup(struct uart_port *port, char *options, bool probe) + { ++ struct uart_8250_port *up = up_to_u8250p(port); + int baud = 9600; + int bits = 8; + int parity = 'n'; +@@ -3408,6 +3428,8 @@ int serial8250_console_setup(struct uart_port *port, char *options, bool probe) + if (!port->iobase && !port->membase) + return -ENODEV; + ++ atomic_set(&up->console_printing, 0); ++ + if (options) + uart_parse_options(options, &baud, &parity, &bits, &flow); + else if (probe) +diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h +index abb928361270..77de68ed2d60 100644 +--- a/include/linux/serial_8250.h ++++ b/include/linux/serial_8250.h +@@ -7,6 +7,7 @@ + #ifndef _LINUX_SERIAL_8250_H + #define _LINUX_SERIAL_8250_H + ++#include <linux/atomic.h> + #include <linux/serial_core.h> + #include <linux/serial_reg.h> + #include <linux/platform_device.h> +@@ -125,6 +126,8 @@ struct uart_8250_port { + #define MSR_SAVE_FLAGS UART_MSR_ANY_DELTA + unsigned char msr_saved_flags; + ++ atomic_t console_printing; ++ + struct uart_8250_dma *dma; + const struct uart_8250_ops *ops; + +@@ -180,6 +183,8 @@ void serial8250_init_port(struct uart_8250_port *up); + void serial8250_set_defaults(struct uart_8250_port *up); + void serial8250_console_write(struct uart_8250_port *up, const char *s, + unsigned int count); ++void serial8250_console_write_atomic(struct uart_8250_port *up, const char *s, ++ unsigned int count); + int serial8250_console_setup(struct uart_port *port, char *options, bool probe); + int serial8250_console_exit(struct uart_port *port); + +-- +2.43.0 + diff --git a/debian/patches-rt/0101-printk-relocate-printk_delay-and-vprintk_default.patch b/debian/patches-rt/0101-printk-relocate-printk_delay-and-vprintk_default.patch new file mode 100644 index 000000000..27a3bd9d4 --- /dev/null +++ b/debian/patches-rt/0101-printk-relocate-printk_delay-and-vprintk_default.patch @@ -0,0 +1,89 @@ +From 598543b96b715ed360718ca3597ad4a2eda53241 Mon Sep 17 00:00:00 2001 +From: John Ogness <john.ogness@linutronix.de> +Date: Mon, 30 Nov 2020 01:42:03 +0106 +Subject: [PATCH 101/323] printk: relocate printk_delay() and vprintk_default() +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Move printk_delay() and vprintk_default() "as is" further up so that +they can be used by new functions in an upcoming commit. + +Signed-off-by: John Ogness <john.ogness@linutronix.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + kernel/printk/printk.c | 40 ++++++++++++++++++++-------------------- + 1 file changed, 20 insertions(+), 20 deletions(-) + +diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c +index 8768473712b2..c3fc6706a118 100644 +--- a/kernel/printk/printk.c ++++ b/kernel/printk/printk.c +@@ -1728,6 +1728,20 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) + return do_syslog(type, buf, len, SYSLOG_FROM_READER); + } + ++int printk_delay_msec __read_mostly; ++ ++static inline void printk_delay(void) ++{ ++ if (unlikely(printk_delay_msec)) { ++ int m = printk_delay_msec; ++ ++ while (m--) { ++ mdelay(1); ++ touch_nmi_watchdog(); ++ } ++ } ++} ++ + /* + * Special console_lock variants that help to reduce the risk of soft-lockups. + * They allow to pass console_lock to another printk() call using a busy wait. +@@ -1971,20 +1985,6 @@ static void printk_exit_irqrestore(unsigned long flags) + local_irq_restore(flags); + } + +-int printk_delay_msec __read_mostly; +- +-static inline void printk_delay(void) +-{ +- if (unlikely(printk_delay_msec)) { +- int m = printk_delay_msec; +- +- while (m--) { +- mdelay(1); +- touch_nmi_watchdog(); +- } +- } +-} +- + static inline u32 printk_caller_id(void) + { + return in_task() ? task_pid_nr(current) : +@@ -2217,18 +2217,18 @@ asmlinkage int vprintk_emit(int facility, int level, + } + EXPORT_SYMBOL(vprintk_emit); + +-asmlinkage int vprintk(const char *fmt, va_list args) +-{ +- return vprintk_func(fmt, args); +-} +-EXPORT_SYMBOL(vprintk); +- + int vprintk_default(const char *fmt, va_list args) + { + return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, fmt, args); + } + EXPORT_SYMBOL_GPL(vprintk_default); + ++asmlinkage int vprintk(const char *fmt, va_list args) ++{ ++ return vprintk_func(fmt, args); ++} ++EXPORT_SYMBOL(vprintk); ++ + /** + * printk - print a kernel message + * @fmt: format string +-- +2.43.0 + diff --git a/debian/patches-rt/0102-printk-combine-boot_delay_msec-into-printk_delay.patch b/debian/patches-rt/0102-printk-combine-boot_delay_msec-into-printk_delay.patch new file mode 100644 index 000000000..64dda79fa --- /dev/null +++ b/debian/patches-rt/0102-printk-combine-boot_delay_msec-into-printk_delay.patch @@ -0,0 +1,44 @@ +From 17a9ff9877863dc267a2b0ead2ab3f792b5e405c Mon Sep 17 00:00:00 2001 +From: John Ogness <john.ogness@linutronix.de> +Date: Mon, 30 Nov 2020 01:42:04 +0106 +Subject: [PATCH 102/323] printk: combine boot_delay_msec() into printk_delay() +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +boot_delay_msec() is always called immediately before printk_delay() +so just combine the two. + +Signed-off-by: John Ogness <john.ogness@linutronix.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + kernel/printk/printk.c | 7 ++++--- + 1 file changed, 4 insertions(+), 3 deletions(-) + +diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c +index c3fc6706a118..0ae184675e86 100644 +--- a/kernel/printk/printk.c ++++ b/kernel/printk/printk.c +@@ -1730,8 +1730,10 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) + + int printk_delay_msec __read_mostly; + +-static inline void printk_delay(void) ++static inline void printk_delay(int level) + { ++ boot_delay_msec(level); ++ + if (unlikely(printk_delay_msec)) { + int m = printk_delay_msec; + +@@ -2189,8 +2191,7 @@ asmlinkage int vprintk_emit(int facility, int level, + in_sched = true; + } + +- boot_delay_msec(level); +- printk_delay(); ++ printk_delay(level); + + printed_len = vprintk_store(facility, level, dev_info, fmt, args); + +-- +2.43.0 + diff --git a/debian/patches-rt/0103-printk-change-console_seq-to-atomic64_t.patch b/debian/patches-rt/0103-printk-change-console_seq-to-atomic64_t.patch new file mode 100644 index 000000000..6647ecd6c --- /dev/null +++ b/debian/patches-rt/0103-printk-change-console_seq-to-atomic64_t.patch @@ -0,0 +1,132 @@ +From ebdec06c58c4330925d5603649542827bf7bff25 Mon Sep 17 00:00:00 2001 +From: John Ogness <john.ogness@linutronix.de> +Date: Mon, 30 Nov 2020 01:42:05 +0106 +Subject: [PATCH 103/323] printk: change @console_seq to atomic64_t +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +In preparation for atomic printing, change @console_seq to atomic +so that it can be accessed without requiring @console_sem. + +Signed-off-by: John Ogness <john.ogness@linutronix.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + kernel/printk/printk.c | 34 +++++++++++++++++++--------------- + 1 file changed, 19 insertions(+), 15 deletions(-) + +diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c +index 0ae184675e86..2bc9904fd8ab 100644 +--- a/kernel/printk/printk.c ++++ b/kernel/printk/printk.c +@@ -368,12 +368,13 @@ static u64 syslog_seq; + static size_t syslog_partial; + static bool syslog_time; + +-/* All 3 protected by @console_sem. */ +-/* the next printk record to write to the console */ +-static u64 console_seq; ++/* Both protected by @console_sem. */ + static u64 exclusive_console_stop_seq; + static unsigned long console_dropped; + ++/* the next printk record to write to the console */ ++static atomic64_t console_seq = ATOMIC64_INIT(0); ++ + struct latched_seq { + seqcount_latch_t latch; + u64 val[2]; +@@ -2273,7 +2274,7 @@ EXPORT_SYMBOL(printk); + #define prb_first_valid_seq(rb) 0 + + static u64 syslog_seq; +-static u64 console_seq; ++static atomic64_t console_seq = ATOMIC64_INIT(0); + static u64 exclusive_console_stop_seq; + static unsigned long console_dropped; + +@@ -2588,6 +2589,7 @@ void console_unlock(void) + bool do_cond_resched, retry; + struct printk_info info; + struct printk_record r; ++ u64 seq; + + if (console_suspended) { + up_console_sem(); +@@ -2630,12 +2632,14 @@ void console_unlock(void) + size_t len; + + skip: +- if (!prb_read_valid(prb, console_seq, &r)) ++ seq = atomic64_read(&console_seq); ++ if (!prb_read_valid(prb, seq, &r)) + break; + +- if (console_seq != r.info->seq) { +- console_dropped += r.info->seq - console_seq; +- console_seq = r.info->seq; ++ if (seq != r.info->seq) { ++ console_dropped += r.info->seq - seq; ++ atomic64_set(&console_seq, r.info->seq); ++ seq = r.info->seq; + } + + if (suppress_message_printing(r.info->level)) { +@@ -2644,13 +2648,13 @@ void console_unlock(void) + * directly to the console when we received it, and + * record that has level above the console loglevel. + */ +- console_seq++; ++ atomic64_set(&console_seq, seq + 1); + goto skip; + } + + /* Output to all consoles once old messages replayed. */ + if (unlikely(exclusive_console && +- console_seq >= exclusive_console_stop_seq)) { ++ seq >= exclusive_console_stop_seq)) { + exclusive_console = NULL; + } + +@@ -2671,7 +2675,7 @@ void console_unlock(void) + len = record_print_text(&r, + console_msg_format & MSG_FORMAT_SYSLOG, + printk_time); +- console_seq++; ++ atomic64_set(&console_seq, seq + 1); + + /* + * While actively printing out messages, if another printk() +@@ -2702,7 +2706,7 @@ void console_unlock(void) + * there's a new owner and the console_unlock() from them will do the + * flush, no worries. + */ +- retry = prb_read_valid(prb, console_seq, NULL); ++ retry = prb_read_valid(prb, atomic64_read(&console_seq), NULL); + if (retry && console_trylock()) + goto again; + } +@@ -2765,7 +2769,7 @@ void console_flush_on_panic(enum con_flush_mode mode) + console_may_schedule = 0; + + if (mode == CONSOLE_REPLAY_ALL) +- console_seq = prb_first_valid_seq(prb); ++ atomic64_set(&console_seq, prb_first_valid_seq(prb)); + console_unlock(); + } + +@@ -3002,11 +3006,11 @@ void register_console(struct console *newcon) + * ignores console_lock. + */ + exclusive_console = newcon; +- exclusive_console_stop_seq = console_seq; ++ exclusive_console_stop_seq = atomic64_read(&console_seq); + + /* Get a consistent copy of @syslog_seq. */ + spin_lock_irqsave(&syslog_lock, flags); +- console_seq = syslog_seq; ++ atomic64_set(&console_seq, syslog_seq); + spin_unlock_irqrestore(&syslog_lock, flags); + } + console_unlock(); +-- +2.43.0 + diff --git a/debian/patches-rt/0104-printk-introduce-kernel-sync-mode.patch b/debian/patches-rt/0104-printk-introduce-kernel-sync-mode.patch new file mode 100644 index 000000000..3dfd1b707 --- /dev/null +++ b/debian/patches-rt/0104-printk-introduce-kernel-sync-mode.patch @@ -0,0 +1,310 @@ +From fcf68eb3d589c57f73e4a79b7df2f6ec8080b1ce Mon Sep 17 00:00:00 2001 +From: John Ogness <john.ogness@linutronix.de> +Date: Mon, 30 Nov 2020 01:42:06 +0106 +Subject: [PATCH 104/323] printk: introduce kernel sync mode +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +When the kernel performs an OOPS, enter into "sync mode": + +- only atomic consoles (write_atomic() callback) will print +- printing occurs within vprintk_store() instead of console_unlock() + +CONSOLE_LOG_MAX is moved to printk.h to support the per-console +buffer used in sync mode. + +Signed-off-by: John Ogness <john.ogness@linutronix.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Signed-off-by: Luis Claudio R. Goncalves <lgoncalv@redhat.com> +--- + include/linux/console.h | 4 ++ + include/linux/printk.h | 6 ++ + kernel/printk/printk.c | 133 ++++++++++++++++++++++++++++++++++++++-- + 3 files changed, 137 insertions(+), 6 deletions(-) + +diff --git a/include/linux/console.h b/include/linux/console.h +index 613df76903f5..fcc98354f733 100644 +--- a/include/linux/console.h ++++ b/include/linux/console.h +@@ -16,6 +16,7 @@ + + #include <linux/atomic.h> + #include <linux/types.h> ++#include <linux/printk.h> + + struct vc_data; + struct console_font_op; +@@ -151,6 +152,9 @@ struct console { + short flags; + short index; + int cflag; ++#ifdef CONFIG_PRINTK ++ char sync_buf[CONSOLE_LOG_MAX]; ++#endif + uint ispeed; + uint ospeed; + void *data; +diff --git a/include/linux/printk.h b/include/linux/printk.h +index c6bb48f0134c..3738374e2b3b 100644 +--- a/include/linux/printk.h ++++ b/include/linux/printk.h +@@ -46,6 +46,12 @@ static inline const char *printk_skip_headers(const char *buffer) + + #define CONSOLE_EXT_LOG_MAX 8192 + ++/* ++ * The maximum size of a record formatted for console printing ++ * (i.e. with the prefix prepended to every line). ++ */ ++#define CONSOLE_LOG_MAX 4096 ++ + /* printk's without a loglevel use this.. */ + #define MESSAGE_LOGLEVEL_DEFAULT CONFIG_MESSAGE_LOGLEVEL_DEFAULT + +diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c +index 2bc9904fd8ab..8a0af76b6f69 100644 +--- a/kernel/printk/printk.c ++++ b/kernel/printk/printk.c +@@ -44,6 +44,7 @@ + #include <linux/irq_work.h> + #include <linux/ctype.h> + #include <linux/uio.h> ++#include <linux/clocksource.h> + #include <linux/sched/clock.h> + #include <linux/sched/debug.h> + #include <linux/sched/task_stack.h> +@@ -361,6 +362,9 @@ enum log_flags { + static DEFINE_SPINLOCK(syslog_lock); + + #ifdef CONFIG_PRINTK ++/* Set to enable sync mode. Once set, it is never cleared. */ ++static bool sync_mode; ++ + DECLARE_WAIT_QUEUE_HEAD(log_wait); + /* All 3 protected by @syslog_lock. */ + /* the next printk record to read by syslog(READ) or /proc/kmsg */ +@@ -400,9 +404,6 @@ static struct latched_seq clear_seq = { + /* the maximum size allowed to be reserved for a record */ + #define LOG_LINE_MAX (1024 - PREFIX_MAX) + +-/* the maximum size of a formatted record (i.e. with prefix added per line) */ +-#define CONSOLE_LOG_MAX 4096 +- + #define LOG_LEVEL(v) ((v) & 0x07) + #define LOG_FACILITY(v) ((v) >> 3 & 0xff) + +@@ -1745,6 +1746,91 @@ static inline void printk_delay(int level) + } + } + ++static bool kernel_sync_mode(void) ++{ ++ return (oops_in_progress || sync_mode); ++} ++ ++static bool console_can_sync(struct console *con) ++{ ++ if (!(con->flags & CON_ENABLED)) ++ return false; ++ if (con->write_atomic && kernel_sync_mode()) ++ return true; ++ return false; ++} ++ ++static bool call_sync_console_driver(struct console *con, const char *text, size_t text_len) ++{ ++ if (!(con->flags & CON_ENABLED)) ++ return false; ++ if (con->write_atomic && kernel_sync_mode()) ++ con->write_atomic(con, text, text_len); ++ else ++ return false; ++ ++ return true; ++} ++ ++static bool have_atomic_console(void) ++{ ++ struct console *con; ++ ++ for_each_console(con) { ++ if (!(con->flags & CON_ENABLED)) ++ continue; ++ if (con->write_atomic) ++ return true; ++ } ++ return false; ++} ++ ++static bool print_sync(struct console *con, u64 *seq) ++{ ++ struct printk_info info; ++ struct printk_record r; ++ size_t text_len; ++ ++ prb_rec_init_rd(&r, &info, &con->sync_buf[0], sizeof(con->sync_buf)); ++ ++ if (!prb_read_valid(prb, *seq, &r)) ++ return false; ++ ++ text_len = record_print_text(&r, console_msg_format & MSG_FORMAT_SYSLOG, printk_time); ++ ++ if (!call_sync_console_driver(con, &con->sync_buf[0], text_len)) ++ return false; ++ ++ *seq = r.info->seq; ++ ++ touch_softlockup_watchdog_sync(); ++ clocksource_touch_watchdog(); ++ rcu_cpu_stall_reset(); ++ touch_nmi_watchdog(); ++ ++ if (text_len) ++ printk_delay(r.info->level); ++ ++ return true; ++} ++ ++static void print_sync_until(struct console *con, u64 seq) ++{ ++ unsigned int flags; ++ u64 printk_seq; ++ ++ console_atomic_lock(&flags); ++ for (;;) { ++ printk_seq = atomic64_read(&console_seq); ++ if (printk_seq >= seq) ++ break; ++ if (!print_sync(con, &printk_seq)) ++ break; ++ atomic64_set(&console_seq, printk_seq + 1); ++ } ++ console_atomic_unlock(flags); ++} ++ + /* + * Special console_lock variants that help to reduce the risk of soft-lockups. + * They allow to pass console_lock to another printk() call using a busy wait. +@@ -1919,6 +2005,8 @@ static void call_console_drivers(const char *ext_text, size_t ext_len, + if (!cpu_online(smp_processor_id()) && + !(con->flags & CON_ANYTIME)) + continue; ++ if (kernel_sync_mode()) ++ continue; + if (con->flags & CON_EXTENDED) + con->write(con, ext_text, ext_len); + else { +@@ -2073,6 +2161,7 @@ int vprintk_store(int facility, int level, + const u32 caller_id = printk_caller_id(); + struct prb_reserved_entry e; + enum log_flags lflags = 0; ++ bool final_commit = false; + struct printk_record r; + unsigned long irqflags; + u16 trunc_msg_len = 0; +@@ -2082,6 +2171,7 @@ int vprintk_store(int facility, int level, + u16 text_len; + int ret = 0; + u64 ts_nsec; ++ u64 seq; + + /* + * Since the duration of printk() can vary depending on the message +@@ -2120,6 +2210,7 @@ int vprintk_store(int facility, int level, + if (lflags & LOG_CONT) { + prb_rec_init_wr(&r, reserve_size); + if (prb_reserve_in_last(&e, prb, &r, caller_id, LOG_LINE_MAX)) { ++ seq = r.info->seq; + text_len = printk_sprint(&r.text_buf[r.info->text_len], reserve_size, + facility, &lflags, fmt, args); + r.info->text_len += text_len; +@@ -2127,6 +2218,7 @@ int vprintk_store(int facility, int level, + if (lflags & LOG_NEWLINE) { + r.info->flags |= LOG_NEWLINE; + prb_final_commit(&e); ++ final_commit = true; + } else { + prb_commit(&e); + } +@@ -2151,6 +2243,8 @@ int vprintk_store(int facility, int level, + goto out; + } + ++ seq = r.info->seq; ++ + /* fill message */ + text_len = printk_sprint(&r.text_buf[0], reserve_size, facility, &lflags, fmt, args); + if (trunc_msg_len) +@@ -2165,13 +2259,25 @@ int vprintk_store(int facility, int level, + memcpy(&r.info->dev_info, dev_info, sizeof(r.info->dev_info)); + + /* A message without a trailing newline can be continued. */ +- if (!(lflags & LOG_NEWLINE)) ++ if (!(lflags & LOG_NEWLINE)) { + prb_commit(&e); +- else ++ } else { + prb_final_commit(&e); ++ final_commit = true; ++ } + + ret = text_len + trunc_msg_len; + out: ++ /* only the kernel may perform synchronous printing */ ++ if (facility == 0 && final_commit) { ++ struct console *con; ++ ++ for_each_console(con) { ++ if (console_can_sync(con)) ++ print_sync_until(con, seq + 1); ++ } ++ } ++ + printk_exit_irqrestore(irqflags); + return ret; + } +@@ -2267,12 +2373,13 @@ EXPORT_SYMBOL(printk); + + #else /* CONFIG_PRINTK */ + +-#define CONSOLE_LOG_MAX 0 + #define printk_time false + + #define prb_read_valid(rb, seq, r) false + #define prb_first_valid_seq(rb) 0 + ++#define kernel_sync_mode() false ++ + static u64 syslog_seq; + static atomic64_t console_seq = ATOMIC64_INIT(0); + static u64 exclusive_console_stop_seq; +@@ -2565,6 +2672,8 @@ static int have_callable_console(void) + */ + static inline int can_use_console(void) + { ++ if (kernel_sync_mode()) ++ return false; + return cpu_online(raw_smp_processor_id()) || have_callable_console(); + } + +@@ -3379,6 +3488,18 @@ void kmsg_dump(enum kmsg_dump_reason reason) + struct kmsg_dumper_iter iter; + struct kmsg_dumper *dumper; + ++ if (!oops_in_progress) { ++ /* ++ * If atomic consoles are available, activate kernel sync mode ++ * to make sure any final messages are visible. The trailing ++ * printk message is important to flush any pending messages. ++ */ ++ if (have_atomic_console()) { ++ sync_mode = true; ++ pr_info("enabled sync mode\n"); ++ } ++ } ++ + rcu_read_lock(); + list_for_each_entry_rcu(dumper, &dump_list, list) { + enum kmsg_dump_reason max_reason = dumper->max_reason; +-- +2.43.0 + diff --git a/debian/patches-rt/0105-printk-move-console-printing-to-kthreads.patch b/debian/patches-rt/0105-printk-move-console-printing-to-kthreads.patch new file mode 100644 index 000000000..5f6075baf --- /dev/null +++ b/debian/patches-rt/0105-printk-move-console-printing-to-kthreads.patch @@ -0,0 +1,848 @@ +From 24c419661c9c605b6f1ddccf1683c2c0f8c5dcf4 Mon Sep 17 00:00:00 2001 +From: John Ogness <john.ogness@linutronix.de> +Date: Mon, 30 Nov 2020 01:42:07 +0106 +Subject: [PATCH 105/323] printk: move console printing to kthreads +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Create a kthread for each console to perform console printing. Now +all console printing is fully asynchronous except for the boot +console and when the kernel enters sync mode (and there are atomic +consoles available). + +The console_lock() and console_unlock() functions now only do what +their name says... locking and unlocking of the console. + +Signed-off-by: John Ogness <john.ogness@linutronix.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Signed-off-by: Luis Claudio R. Goncalves <lgoncalv@redhat.com> +--- + include/linux/console.h | 2 + + kernel/printk/printk.c | 625 ++++++++++++---------------------------- + 2 files changed, 186 insertions(+), 441 deletions(-) + +diff --git a/include/linux/console.h b/include/linux/console.h +index fcc98354f733..3e99359e0660 100644 +--- a/include/linux/console.h ++++ b/include/linux/console.h +@@ -155,6 +155,8 @@ struct console { + #ifdef CONFIG_PRINTK + char sync_buf[CONSOLE_LOG_MAX]; + #endif ++ atomic64_t printk_seq; ++ struct task_struct *thread; + uint ispeed; + uint ospeed; + void *data; +diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c +index 8a0af76b6f69..232ce58e94c2 100644 +--- a/kernel/printk/printk.c ++++ b/kernel/printk/printk.c +@@ -44,6 +44,7 @@ + #include <linux/irq_work.h> + #include <linux/ctype.h> + #include <linux/uio.h> ++#include <linux/kthread.h> + #include <linux/clocksource.h> + #include <linux/sched/clock.h> + #include <linux/sched/debug.h> +@@ -269,11 +270,6 @@ static void __up_console_sem(unsigned long ip) + */ + static int console_locked, console_suspended; + +-/* +- * If exclusive_console is non-NULL then only this console is to be printed to. +- */ +-static struct console *exclusive_console; +- + /* + * Array of consoles built from command line options (console=) + */ +@@ -358,10 +354,10 @@ enum log_flags { + LOG_CONT = 8, /* text is a fragment of a continuation line */ + }; + ++#ifdef CONFIG_PRINTK + /* syslog_lock protects syslog_* variables and write access to clear_seq. */ + static DEFINE_SPINLOCK(syslog_lock); + +-#ifdef CONFIG_PRINTK + /* Set to enable sync mode. Once set, it is never cleared. */ + static bool sync_mode; + +@@ -372,13 +368,6 @@ static u64 syslog_seq; + static size_t syslog_partial; + static bool syslog_time; + +-/* Both protected by @console_sem. */ +-static u64 exclusive_console_stop_seq; +-static unsigned long console_dropped; +- +-/* the next printk record to write to the console */ +-static atomic64_t console_seq = ATOMIC64_INIT(0); +- + struct latched_seq { + seqcount_latch_t latch; + u64 val[2]; +@@ -1757,6 +1746,8 @@ static bool console_can_sync(struct console *con) + return false; + if (con->write_atomic && kernel_sync_mode()) + return true; ++ if (con->write && (con->flags & CON_BOOT) && !con->thread) ++ return true; + return false; + } + +@@ -1766,6 +1757,8 @@ static bool call_sync_console_driver(struct console *con, const char *text, size + return false; + if (con->write_atomic && kernel_sync_mode()) + con->write_atomic(con, text, text_len); ++ else if (con->write && (con->flags & CON_BOOT) && !con->thread) ++ con->write(con, text, text_len); + else + return false; + +@@ -1821,202 +1814,16 @@ static void print_sync_until(struct console *con, u64 seq) + + console_atomic_lock(&flags); + for (;;) { +- printk_seq = atomic64_read(&console_seq); ++ printk_seq = atomic64_read(&con->printk_seq); + if (printk_seq >= seq) + break; + if (!print_sync(con, &printk_seq)) + break; +- atomic64_set(&console_seq, printk_seq + 1); ++ atomic64_set(&con->printk_seq, printk_seq + 1); + } + console_atomic_unlock(flags); + } + +-/* +- * Special console_lock variants that help to reduce the risk of soft-lockups. +- * They allow to pass console_lock to another printk() call using a busy wait. +- */ +- +-#ifdef CONFIG_LOCKDEP +-static struct lockdep_map console_owner_dep_map = { +- .name = "console_owner" +-}; +-#endif +- +-static DEFINE_RAW_SPINLOCK(console_owner_lock); +-static struct task_struct *console_owner; +-static bool console_waiter; +- +-/** +- * console_lock_spinning_enable - mark beginning of code where another +- * thread might safely busy wait +- * +- * This basically converts console_lock into a spinlock. This marks +- * the section where the console_lock owner can not sleep, because +- * there may be a waiter spinning (like a spinlock). Also it must be +- * ready to hand over the lock at the end of the section. +- */ +-static void console_lock_spinning_enable(void) +-{ +- raw_spin_lock(&console_owner_lock); +- console_owner = current; +- raw_spin_unlock(&console_owner_lock); +- +- /* The waiter may spin on us after setting console_owner */ +- spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_); +-} +- +-/** +- * console_lock_spinning_disable_and_check - mark end of code where another +- * thread was able to busy wait and check if there is a waiter +- * +- * This is called at the end of the section where spinning is allowed. +- * It has two functions. First, it is a signal that it is no longer +- * safe to start busy waiting for the lock. Second, it checks if +- * there is a busy waiter and passes the lock rights to her. +- * +- * Important: Callers lose the lock if there was a busy waiter. +- * They must not touch items synchronized by console_lock +- * in this case. +- * +- * Return: 1 if the lock rights were passed, 0 otherwise. +- */ +-static int console_lock_spinning_disable_and_check(void) +-{ +- int waiter; +- +- raw_spin_lock(&console_owner_lock); +- waiter = READ_ONCE(console_waiter); +- console_owner = NULL; +- raw_spin_unlock(&console_owner_lock); +- +- if (!waiter) { +- spin_release(&console_owner_dep_map, _THIS_IP_); +- return 0; +- } +- +- /* The waiter is now free to continue */ +- WRITE_ONCE(console_waiter, false); +- +- spin_release(&console_owner_dep_map, _THIS_IP_); +- +- /* +- * Hand off console_lock to waiter. The waiter will perform +- * the up(). After this, the waiter is the console_lock owner. +- */ +- mutex_release(&console_lock_dep_map, _THIS_IP_); +- return 1; +-} +- +-/** +- * console_trylock_spinning - try to get console_lock by busy waiting +- * +- * This allows to busy wait for the console_lock when the current +- * owner is running in specially marked sections. It means that +- * the current owner is running and cannot reschedule until it +- * is ready to lose the lock. +- * +- * Return: 1 if we got the lock, 0 othrewise +- */ +-static int console_trylock_spinning(void) +-{ +- struct task_struct *owner = NULL; +- bool waiter; +- bool spin = false; +- unsigned long flags; +- +- if (console_trylock()) +- return 1; +- +- printk_safe_enter_irqsave(flags); +- +- raw_spin_lock(&console_owner_lock); +- owner = READ_ONCE(console_owner); +- waiter = READ_ONCE(console_waiter); +- if (!waiter && owner && owner != current) { +- WRITE_ONCE(console_waiter, true); +- spin = true; +- } +- raw_spin_unlock(&console_owner_lock); +- +- /* +- * If there is an active printk() writing to the +- * consoles, instead of having it write our data too, +- * see if we can offload that load from the active +- * printer, and do some printing ourselves. +- * Go into a spin only if there isn't already a waiter +- * spinning, and there is an active printer, and +- * that active printer isn't us (recursive printk?). +- */ +- if (!spin) { +- printk_safe_exit_irqrestore(flags); +- return 0; +- } +- +- /* We spin waiting for the owner to release us */ +- spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_); +- /* Owner will clear console_waiter on hand off */ +- while (READ_ONCE(console_waiter)) +- cpu_relax(); +- spin_release(&console_owner_dep_map, _THIS_IP_); +- +- printk_safe_exit_irqrestore(flags); +- /* +- * The owner passed the console lock to us. +- * Since we did not spin on console lock, annotate +- * this as a trylock. Otherwise lockdep will +- * complain. +- */ +- mutex_acquire(&console_lock_dep_map, 0, 1, _THIS_IP_); +- +- return 1; +-} +- +-/* +- * Call the console drivers, asking them to write out +- * log_buf[start] to log_buf[end - 1]. +- * The console_lock must be held. +- */ +-static void call_console_drivers(const char *ext_text, size_t ext_len, +- const char *text, size_t len) +-{ +- static char dropped_text[64]; +- size_t dropped_len = 0; +- struct console *con; +- +- trace_console_rcuidle(text, len); +- +- if (!console_drivers) +- return; +- +- if (console_dropped) { +- dropped_len = snprintf(dropped_text, sizeof(dropped_text), +- "** %lu printk messages dropped **\n", +- console_dropped); +- console_dropped = 0; +- } +- +- for_each_console(con) { +- if (exclusive_console && con != exclusive_console) +- continue; +- if (!(con->flags & CON_ENABLED)) +- continue; +- if (!con->write) +- continue; +- if (!cpu_online(smp_processor_id()) && +- !(con->flags & CON_ANYTIME)) +- continue; +- if (kernel_sync_mode()) +- continue; +- if (con->flags & CON_EXTENDED) +- con->write(con, ext_text, ext_len); +- else { +- if (dropped_len) +- con->write(con, dropped_text, dropped_len); +- con->write(con, text, len); +- } +- } +-} +- + #ifdef CONFIG_PRINTK_NMI + #define NUM_RECURSION_CTX 2 + #else +@@ -2287,39 +2094,16 @@ asmlinkage int vprintk_emit(int facility, int level, + const char *fmt, va_list args) + { + int printed_len; +- bool in_sched = false; + + /* Suppress unimportant messages after panic happens */ + if (unlikely(suppress_printk)) + return 0; + +- if (level == LOGLEVEL_SCHED) { ++ if (level == LOGLEVEL_SCHED) + level = LOGLEVEL_DEFAULT; +- in_sched = true; +- } +- +- printk_delay(level); + + printed_len = vprintk_store(facility, level, dev_info, fmt, args); + +- /* If called from the scheduler, we can not call up(). */ +- if (!in_sched) { +- /* +- * Disable preemption to avoid being preempted while holding +- * console_sem which would prevent anyone from printing to +- * console +- */ +- preempt_disable(); +- /* +- * Try to acquire and then immediately release the console +- * semaphore. The release will print out buffers and wake up +- * /dev/kmsg and syslog() users. +- */ +- if (console_trylock_spinning()) +- console_unlock(); +- preempt_enable(); +- } +- + wake_up_klogd(); + return printed_len; + } +@@ -2371,38 +2155,158 @@ asmlinkage __visible int printk(const char *fmt, ...) + } + EXPORT_SYMBOL(printk); + +-#else /* CONFIG_PRINTK */ ++static int printk_kthread_func(void *data) ++{ ++ struct console *con = data; ++ unsigned long dropped = 0; ++ char *dropped_text = NULL; ++ struct printk_info info; ++ struct printk_record r; ++ char *ext_text = NULL; ++ size_t dropped_len; ++ int ret = -ENOMEM; ++ char *text = NULL; ++ char *write_text; ++ u64 printk_seq; ++ size_t len; ++ int error; ++ u64 seq; + +-#define printk_time false ++ if (con->flags & CON_EXTENDED) { ++ ext_text = kmalloc(CONSOLE_EXT_LOG_MAX, GFP_KERNEL); ++ if (!ext_text) ++ goto out; ++ } ++ text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); ++ dropped_text = kmalloc(64, GFP_KERNEL); ++ if (!text || !dropped_text) ++ goto out; + +-#define prb_read_valid(rb, seq, r) false +-#define prb_first_valid_seq(rb) 0 ++ if (con->flags & CON_EXTENDED) ++ write_text = ext_text; ++ else ++ write_text = text; + +-#define kernel_sync_mode() false ++ seq = atomic64_read(&con->printk_seq); + +-static u64 syslog_seq; +-static atomic64_t console_seq = ATOMIC64_INIT(0); +-static u64 exclusive_console_stop_seq; +-static unsigned long console_dropped; ++ prb_rec_init_rd(&r, &info, text, LOG_LINE_MAX + PREFIX_MAX); ++ ++ for (;;) { ++ error = wait_event_interruptible(log_wait, ++ prb_read_valid(prb, seq, &r) || kthread_should_stop()); ++ ++ if (kthread_should_stop()) ++ break; ++ ++ if (error) ++ continue; ++ ++ if (seq != r.info->seq) { ++ dropped += r.info->seq - seq; ++ seq = r.info->seq; ++ } ++ ++ seq++; ++ ++ if (!(con->flags & CON_ENABLED)) ++ continue; ++ ++ if (suppress_message_printing(r.info->level)) ++ continue; ++ ++ if (con->flags & CON_EXTENDED) { ++ len = info_print_ext_header(ext_text, ++ CONSOLE_EXT_LOG_MAX, ++ r.info); ++ len += msg_print_ext_body(ext_text + len, ++ CONSOLE_EXT_LOG_MAX - len, ++ &r.text_buf[0], r.info->text_len, ++ &r.info->dev_info); ++ } else { ++ len = record_print_text(&r, ++ console_msg_format & MSG_FORMAT_SYSLOG, ++ printk_time); ++ } ++ ++ printk_seq = atomic64_read(&con->printk_seq); + +-static size_t record_print_text(const struct printk_record *r, +- bool syslog, bool time) ++ console_lock(); ++ console_may_schedule = 0; ++ ++ if (kernel_sync_mode() && con->write_atomic) { ++ console_unlock(); ++ break; ++ } ++ ++ if (!(con->flags & CON_EXTENDED) && dropped) { ++ dropped_len = snprintf(dropped_text, 64, ++ "** %lu printk messages dropped **\n", ++ dropped); ++ dropped = 0; ++ ++ con->write(con, dropped_text, dropped_len); ++ printk_delay(r.info->level); ++ } ++ ++ con->write(con, write_text, len); ++ if (len) ++ printk_delay(r.info->level); ++ ++ atomic64_cmpxchg_relaxed(&con->printk_seq, printk_seq, seq); ++ ++ console_unlock(); ++ } ++out: ++ kfree(dropped_text); ++ kfree(text); ++ kfree(ext_text); ++ pr_info("%sconsole [%s%d]: printing thread stopped\n", ++ (con->flags & CON_BOOT) ? "boot" : "", ++ con->name, con->index); ++ return ret; ++} ++ ++/* Must be called within console_lock(). */ ++static void start_printk_kthread(struct console *con) + { +- return 0; ++ con->thread = kthread_run(printk_kthread_func, con, ++ "pr/%s%d", con->name, con->index); ++ if (IS_ERR(con->thread)) { ++ pr_err("%sconsole [%s%d]: unable to start printing thread\n", ++ (con->flags & CON_BOOT) ? "boot" : "", ++ con->name, con->index); ++ return; ++ } ++ pr_info("%sconsole [%s%d]: printing thread started\n", ++ (con->flags & CON_BOOT) ? "boot" : "", ++ con->name, con->index); + } +-static ssize_t info_print_ext_header(char *buf, size_t size, +- struct printk_info *info) ++ ++/* protected by console_lock */ ++static bool kthreads_started; ++ ++/* Must be called within console_lock(). */ ++static void console_try_thread(struct console *con) + { +- return 0; ++ if (kthreads_started) { ++ start_printk_kthread(con); ++ return; ++ } ++ ++ /* ++ * The printing threads have not been started yet. If this console ++ * can print synchronously, print all unprinted messages. ++ */ ++ if (console_can_sync(con)) ++ print_sync_until(con, prb_next_seq(prb)); + } +-static ssize_t msg_print_ext_body(char *buf, size_t size, +- char *text, size_t text_len, +- struct dev_printk_info *dev_info) { return 0; } +-static void console_lock_spinning_enable(void) { } +-static int console_lock_spinning_disable_and_check(void) { return 0; } +-static void call_console_drivers(const char *ext_text, size_t ext_len, +- const char *text, size_t len) {} +-static bool suppress_message_printing(int level) { return false; } ++ ++#else /* CONFIG_PRINTK */ ++ ++#define prb_first_valid_seq(rb) 0 ++#define prb_next_seq(rb) 0 ++ ++#define console_try_thread(con) + + #endif /* CONFIG_PRINTK */ + +@@ -2647,36 +2551,6 @@ int is_console_locked(void) + } + EXPORT_SYMBOL(is_console_locked); + +-/* +- * Check if we have any console that is capable of printing while cpu is +- * booting or shutting down. Requires console_sem. +- */ +-static int have_callable_console(void) +-{ +- struct console *con; +- +- for_each_console(con) +- if ((con->flags & CON_ENABLED) && +- (con->flags & CON_ANYTIME)) +- return 1; +- +- return 0; +-} +- +-/* +- * Can we actually use the console at this time on this cpu? +- * +- * Console drivers may assume that per-cpu resources have been allocated. So +- * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't +- * call them until this CPU is officially up. +- */ +-static inline int can_use_console(void) +-{ +- if (kernel_sync_mode()) +- return false; +- return cpu_online(raw_smp_processor_id()) || have_callable_console(); +-} +- + /** + * console_unlock - unlock the console system + * +@@ -2693,131 +2567,14 @@ static inline int can_use_console(void) + */ + void console_unlock(void) + { +- static char ext_text[CONSOLE_EXT_LOG_MAX]; +- static char text[CONSOLE_LOG_MAX]; +- bool do_cond_resched, retry; +- struct printk_info info; +- struct printk_record r; +- u64 seq; +- + if (console_suspended) { + up_console_sem(); + return; + } + +- prb_rec_init_rd(&r, &info, text, sizeof(text)); +- +- /* +- * Console drivers are called with interrupts disabled, so +- * @console_may_schedule should be cleared before; however, we may +- * end up dumping a lot of lines, for example, if called from +- * console registration path, and should invoke cond_resched() +- * between lines if allowable. Not doing so can cause a very long +- * scheduling stall on a slow console leading to RCU stall and +- * softlockup warnings which exacerbate the issue with more +- * messages practically incapacitating the system. +- * +- * console_trylock() is not able to detect the preemptive +- * context reliably. Therefore the value must be stored before +- * and cleared after the "again" goto label. +- */ +- do_cond_resched = console_may_schedule; +-again: +- console_may_schedule = 0; +- +- /* +- * We released the console_sem lock, so we need to recheck if +- * cpu is online and (if not) is there at least one CON_ANYTIME +- * console. +- */ +- if (!can_use_console()) { +- console_locked = 0; +- up_console_sem(); +- return; +- } +- +- for (;;) { +- size_t ext_len = 0; +- size_t len; +- +-skip: +- seq = atomic64_read(&console_seq); +- if (!prb_read_valid(prb, seq, &r)) +- break; +- +- if (seq != r.info->seq) { +- console_dropped += r.info->seq - seq; +- atomic64_set(&console_seq, r.info->seq); +- seq = r.info->seq; +- } +- +- if (suppress_message_printing(r.info->level)) { +- /* +- * Skip record we have buffered and already printed +- * directly to the console when we received it, and +- * record that has level above the console loglevel. +- */ +- atomic64_set(&console_seq, seq + 1); +- goto skip; +- } +- +- /* Output to all consoles once old messages replayed. */ +- if (unlikely(exclusive_console && +- seq >= exclusive_console_stop_seq)) { +- exclusive_console = NULL; +- } +- +- /* +- * Handle extended console text first because later +- * record_print_text() will modify the record buffer in-place. +- */ +- if (nr_ext_console_drivers) { +- ext_len = info_print_ext_header(ext_text, +- sizeof(ext_text), +- r.info); +- ext_len += msg_print_ext_body(ext_text + ext_len, +- sizeof(ext_text) - ext_len, +- &r.text_buf[0], +- r.info->text_len, +- &r.info->dev_info); +- } +- len = record_print_text(&r, +- console_msg_format & MSG_FORMAT_SYSLOG, +- printk_time); +- atomic64_set(&console_seq, seq + 1); +- +- /* +- * While actively printing out messages, if another printk() +- * were to occur on another CPU, it may wait for this one to +- * finish. This task can not be preempted if there is a +- * waiter waiting to take over. +- */ +- console_lock_spinning_enable(); +- +- stop_critical_timings(); /* don't trace print latency */ +- call_console_drivers(ext_text, ext_len, text, len); +- start_critical_timings(); +- +- if (console_lock_spinning_disable_and_check()) +- return; +- +- if (do_cond_resched) +- cond_resched(); +- } +- + console_locked = 0; + + up_console_sem(); +- +- /* +- * Someone could have filled up the buffer again, so re-check if there's +- * something to flush. In case we cannot trylock the console_sem again, +- * there's a new owner and the console_unlock() from them will do the +- * flush, no worries. +- */ +- retry = prb_read_valid(prb, atomic64_read(&console_seq), NULL); +- if (retry && console_trylock()) +- goto again; + } + EXPORT_SYMBOL(console_unlock); + +@@ -2867,18 +2624,20 @@ void console_unblank(void) + */ + void console_flush_on_panic(enum con_flush_mode mode) + { +- /* +- * If someone else is holding the console lock, trylock will fail +- * and may_schedule may be set. Ignore and proceed to unlock so +- * that messages are flushed out. As this can be called from any +- * context and we don't want to get preempted while flushing, +- * ensure may_schedule is cleared. +- */ +- console_trylock(); ++ struct console *c; ++ u64 seq; ++ ++ if (!console_trylock()) ++ return; ++ + console_may_schedule = 0; + +- if (mode == CONSOLE_REPLAY_ALL) +- atomic64_set(&console_seq, prb_first_valid_seq(prb)); ++ if (mode == CONSOLE_REPLAY_ALL) { ++ seq = prb_first_valid_seq(prb); ++ for_each_console(c) ++ atomic64_set(&c->printk_seq, seq); ++ } ++ + console_unlock(); + } + +@@ -3013,7 +2772,6 @@ static int try_enable_new_console(struct console *newcon, bool user_specified) + */ + void register_console(struct console *newcon) + { +- unsigned long flags; + struct console *bcon = NULL; + int err; + +@@ -3037,6 +2795,8 @@ void register_console(struct console *newcon) + } + } + ++ newcon->thread = NULL; ++ + if (console_drivers && console_drivers->flags & CON_BOOT) + bcon = console_drivers; + +@@ -3101,27 +2861,12 @@ void register_console(struct console *newcon) + if (newcon->flags & CON_EXTENDED) + nr_ext_console_drivers++; + +- if (newcon->flags & CON_PRINTBUFFER) { +- /* +- * console_unlock(); will print out the buffered messages +- * for us. +- * +- * We're about to replay the log buffer. Only do this to the +- * just-registered console to avoid excessive message spam to +- * the already-registered consoles. +- * +- * Set exclusive_console with disabled interrupts to reduce +- * race window with eventual console_flush_on_panic() that +- * ignores console_lock. +- */ +- exclusive_console = newcon; +- exclusive_console_stop_seq = atomic64_read(&console_seq); ++ if (newcon->flags & CON_PRINTBUFFER) ++ atomic64_set(&newcon->printk_seq, 0); ++ else ++ atomic64_set(&newcon->printk_seq, prb_next_seq(prb)); + +- /* Get a consistent copy of @syslog_seq. */ +- spin_lock_irqsave(&syslog_lock, flags); +- atomic64_set(&console_seq, syslog_seq); +- spin_unlock_irqrestore(&syslog_lock, flags); +- } ++ console_try_thread(newcon); + console_unlock(); + console_sysfs_notify(); + +@@ -3195,6 +2940,9 @@ int unregister_console(struct console *console) + console_unlock(); + console_sysfs_notify(); + ++ if (console->thread && !IS_ERR(console->thread)) ++ kthread_stop(console->thread); ++ + if (console->exit) + res = console->exit(console); + +@@ -3277,6 +3025,15 @@ static int __init printk_late_init(void) + unregister_console(con); + } + } ++ ++#ifdef CONFIG_PRINTK ++ console_lock(); ++ for_each_console(con) ++ start_printk_kthread(con); ++ kthreads_started = true; ++ console_unlock(); ++#endif ++ + ret = cpuhp_setup_state_nocalls(CPUHP_PRINTK_DEAD, "printk:dead", NULL, + console_cpu_notify); + WARN_ON(ret < 0); +@@ -3292,7 +3049,6 @@ late_initcall(printk_late_init); + * Delayed printk version, for scheduler-internal messages: + */ + #define PRINTK_PENDING_WAKEUP 0x01 +-#define PRINTK_PENDING_OUTPUT 0x02 + + static DEFINE_PER_CPU(int, printk_pending); + +@@ -3300,14 +3056,8 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work) + { + int pending = __this_cpu_xchg(printk_pending, 0); + +- if (pending & PRINTK_PENDING_OUTPUT) { +- /* If trylock fails, someone else is doing the printing */ +- if (console_trylock()) +- console_unlock(); +- } +- + if (pending & PRINTK_PENDING_WAKEUP) +- wake_up_interruptible(&log_wait); ++ wake_up_interruptible_all(&log_wait); + } + + static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = { +@@ -3330,13 +3080,6 @@ void wake_up_klogd(void) + + void defer_console_output(void) + { +- if (!printk_percpu_data_ready()) +- return; +- +- preempt_disable(); +- __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT); +- irq_work_queue(this_cpu_ptr(&wake_up_klogd_work)); +- preempt_enable(); + } + + int vprintk_deferred(const char *fmt, va_list args) +-- +2.43.0 + diff --git a/debian/patches-rt/0106-printk-remove-deferred-printing.patch b/debian/patches-rt/0106-printk-remove-deferred-printing.patch new file mode 100644 index 000000000..b65397f57 --- /dev/null +++ b/debian/patches-rt/0106-printk-remove-deferred-printing.patch @@ -0,0 +1,432 @@ +From f1e8f72e9d5c1df1ddaac09999645f7255219d77 Mon Sep 17 00:00:00 2001 +From: John Ogness <john.ogness@linutronix.de> +Date: Mon, 30 Nov 2020 01:42:08 +0106 +Subject: [PATCH 106/323] printk: remove deferred printing +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Since printing occurs either atomically or from the printing +kthread, there is no need for any deferring or tracking possible +recursion paths. Remove all printk context tracking. + +Signed-off-by: John Ogness <john.ogness@linutronix.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + arch/arm/kernel/smp.c | 2 - + arch/powerpc/kexec/crash.c | 3 -- + include/linux/hardirq.h | 2 - + include/linux/printk.h | 12 ----- + kernel/printk/Makefile | 1 - + kernel/printk/internal.h | 70 ----------------------------- + kernel/printk/printk.c | 58 ++++++++++-------------- + kernel/printk/printk_safe.c | 88 ------------------------------------- + kernel/trace/trace.c | 2 - + 9 files changed, 22 insertions(+), 216 deletions(-) + delete mode 100644 kernel/printk/internal.h + delete mode 100644 kernel/printk/printk_safe.c + +diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c +index 8aa7fa949c23..3693706ba0a1 100644 +--- a/arch/arm/kernel/smp.c ++++ b/arch/arm/kernel/smp.c +@@ -671,9 +671,7 @@ static void do_handle_IPI(int ipinr) + break; + + case IPI_CPU_BACKTRACE: +- printk_nmi_enter(); + nmi_cpu_backtrace(get_irq_regs()); +- printk_nmi_exit(); + break; + + default: +diff --git a/arch/powerpc/kexec/crash.c b/arch/powerpc/kexec/crash.c +index c9a889880214..d488311efab1 100644 +--- a/arch/powerpc/kexec/crash.c ++++ b/arch/powerpc/kexec/crash.c +@@ -311,9 +311,6 @@ void default_machine_crash_shutdown(struct pt_regs *regs) + unsigned int i; + int (*old_handler)(struct pt_regs *regs); + +- /* Avoid hardlocking with irresponsive CPU holding logbuf_lock */ +- printk_nmi_enter(); +- + /* + * This function is only called after the system + * has panicked or is otherwise in a critical state. +diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h +index 754f67ac4326..c35b71f8644a 100644 +--- a/include/linux/hardirq.h ++++ b/include/linux/hardirq.h +@@ -115,7 +115,6 @@ extern void rcu_nmi_exit(void); + do { \ + lockdep_off(); \ + arch_nmi_enter(); \ +- printk_nmi_enter(); \ + BUG_ON(in_nmi() == NMI_MASK); \ + __preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET); \ + } while (0) +@@ -134,7 +133,6 @@ extern void rcu_nmi_exit(void); + do { \ + BUG_ON(!in_nmi()); \ + __preempt_count_sub(NMI_OFFSET + HARDIRQ_OFFSET); \ +- printk_nmi_exit(); \ + arch_nmi_exit(); \ + lockdep_on(); \ + } while (0) +diff --git a/include/linux/printk.h b/include/linux/printk.h +index 3738374e2b3b..410435ac15ea 100644 +--- a/include/linux/printk.h ++++ b/include/linux/printk.h +@@ -155,18 +155,6 @@ static inline __printf(1, 2) __cold + void early_printk(const char *s, ...) { } + #endif + +-#ifdef CONFIG_PRINTK_NMI +-extern void printk_nmi_enter(void); +-extern void printk_nmi_exit(void); +-extern void printk_nmi_direct_enter(void); +-extern void printk_nmi_direct_exit(void); +-#else +-static inline void printk_nmi_enter(void) { } +-static inline void printk_nmi_exit(void) { } +-static inline void printk_nmi_direct_enter(void) { } +-static inline void printk_nmi_direct_exit(void) { } +-#endif /* PRINTK_NMI */ +- + struct dev_printk_info; + + #ifdef CONFIG_PRINTK +diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile +index eee3dc9b60a9..59cb24e25f00 100644 +--- a/kernel/printk/Makefile ++++ b/kernel/printk/Makefile +@@ -1,5 +1,4 @@ + # SPDX-License-Identifier: GPL-2.0-only + obj-y = printk.o +-obj-$(CONFIG_PRINTK) += printk_safe.o + obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o + obj-$(CONFIG_PRINTK) += printk_ringbuffer.o +diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h +deleted file mode 100644 +index e108b2ece8c7..000000000000 +--- a/kernel/printk/internal.h ++++ /dev/null +@@ -1,70 +0,0 @@ +-/* SPDX-License-Identifier: GPL-2.0-or-later */ +-/* +- * internal.h - printk internal definitions +- */ +-#include <linux/percpu.h> +- +-#ifdef CONFIG_PRINTK +- +-#define PRINTK_SAFE_CONTEXT_MASK 0x007ffffff +-#define PRINTK_NMI_DIRECT_CONTEXT_MASK 0x008000000 +-#define PRINTK_NMI_CONTEXT_MASK 0xff0000000 +- +-#define PRINTK_NMI_CONTEXT_OFFSET 0x010000000 +- +-__printf(4, 0) +-int vprintk_store(int facility, int level, +- const struct dev_printk_info *dev_info, +- const char *fmt, va_list args); +- +-__printf(1, 0) int vprintk_default(const char *fmt, va_list args); +-__printf(1, 0) int vprintk_deferred(const char *fmt, va_list args); +-__printf(1, 0) int vprintk_func(const char *fmt, va_list args); +-void __printk_safe_enter(void); +-void __printk_safe_exit(void); +- +-bool printk_percpu_data_ready(void); +- +-#define printk_safe_enter_irqsave(flags) \ +- do { \ +- local_irq_save(flags); \ +- __printk_safe_enter(); \ +- } while (0) +- +-#define printk_safe_exit_irqrestore(flags) \ +- do { \ +- __printk_safe_exit(); \ +- local_irq_restore(flags); \ +- } while (0) +- +-#define printk_safe_enter_irq() \ +- do { \ +- local_irq_disable(); \ +- __printk_safe_enter(); \ +- } while (0) +- +-#define printk_safe_exit_irq() \ +- do { \ +- __printk_safe_exit(); \ +- local_irq_enable(); \ +- } while (0) +- +-void defer_console_output(void); +- +-#else +- +-__printf(1, 0) int vprintk_func(const char *fmt, va_list args) { return 0; } +- +-/* +- * In !PRINTK builds we still export console_sem +- * semaphore and some of console functions (console_unlock()/etc.), so +- * printk-safe must preserve the existing local IRQ guarantees. +- */ +-#define printk_safe_enter_irqsave(flags) local_irq_save(flags) +-#define printk_safe_exit_irqrestore(flags) local_irq_restore(flags) +- +-#define printk_safe_enter_irq() local_irq_disable() +-#define printk_safe_exit_irq() local_irq_enable() +- +-static inline bool printk_percpu_data_ready(void) { return false; } +-#endif /* CONFIG_PRINTK */ +diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c +index 232ce58e94c2..2b110b9ae34a 100644 +--- a/kernel/printk/printk.c ++++ b/kernel/printk/printk.c +@@ -45,6 +45,7 @@ + #include <linux/ctype.h> + #include <linux/uio.h> + #include <linux/kthread.h> ++#include <linux/kdb.h> + #include <linux/clocksource.h> + #include <linux/sched/clock.h> + #include <linux/sched/debug.h> +@@ -60,7 +61,6 @@ + #include "printk_ringbuffer.h" + #include "console_cmdline.h" + #include "braille.h" +-#include "internal.h" + + int console_printk[4] = { + CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */ +@@ -229,19 +229,7 @@ static int nr_ext_console_drivers; + + static int __down_trylock_console_sem(unsigned long ip) + { +- int lock_failed; +- unsigned long flags; +- +- /* +- * Here and in __up_console_sem() we need to be in safe mode, +- * because spindump/WARN/etc from under console ->lock will +- * deadlock in printk()->down_trylock_console_sem() otherwise. +- */ +- printk_safe_enter_irqsave(flags); +- lock_failed = down_trylock(&console_sem); +- printk_safe_exit_irqrestore(flags); +- +- if (lock_failed) ++ if (down_trylock(&console_sem)) + return 1; + mutex_acquire(&console_lock_dep_map, 0, 1, ip); + return 0; +@@ -250,13 +238,9 @@ static int __down_trylock_console_sem(unsigned long ip) + + static void __up_console_sem(unsigned long ip) + { +- unsigned long flags; +- + mutex_release(&console_lock_dep_map, ip); + +- printk_safe_enter_irqsave(flags); + up(&console_sem); +- printk_safe_exit_irqrestore(flags); + } + #define up_console_sem() __up_console_sem(_RET_IP_) + +@@ -428,7 +412,7 @@ static struct printk_ringbuffer *prb = &printk_rb_static; + */ + static bool __printk_percpu_data_ready __read_mostly; + +-bool printk_percpu_data_ready(void) ++static bool printk_percpu_data_ready(void) + { + return __printk_percpu_data_ready; + } +@@ -1063,7 +1047,6 @@ void __init setup_log_buf(int early) + struct printk_record r; + size_t new_descs_size; + size_t new_infos_size; +- unsigned long flags; + char *new_log_buf; + unsigned int free; + u64 seq; +@@ -1961,9 +1944,9 @@ static u16 printk_sprint(char *text, u16 size, int facility, enum log_flags *lfl + } + + __printf(4, 0) +-int vprintk_store(int facility, int level, +- const struct dev_printk_info *dev_info, +- const char *fmt, va_list args) ++static int vprintk_store(int facility, int level, ++ const struct dev_printk_info *dev_info, ++ const char *fmt, va_list args) + { + const u32 caller_id = printk_caller_id(); + struct prb_reserved_entry e; +@@ -2109,11 +2092,22 @@ asmlinkage int vprintk_emit(int facility, int level, + } + EXPORT_SYMBOL(vprintk_emit); + +-int vprintk_default(const char *fmt, va_list args) ++__printf(1, 0) ++static int vprintk_default(const char *fmt, va_list args) + { + return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, fmt, args); + } +-EXPORT_SYMBOL_GPL(vprintk_default); ++ ++__printf(1, 0) ++static int vprintk_func(const char *fmt, va_list args) ++{ ++#ifdef CONFIG_KGDB_KDB ++ /* Allow to pass printk() to kdb but avoid a recursion. */ ++ if (unlikely(kdb_trap_printk && kdb_printf_cpu < 0)) ++ return vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args); ++#endif ++ return vprintk_default(fmt, args); ++} + + asmlinkage int vprintk(const char *fmt, va_list args) + { +@@ -3078,18 +3072,10 @@ void wake_up_klogd(void) + preempt_enable(); + } + +-void defer_console_output(void) +-{ +-} +- +-int vprintk_deferred(const char *fmt, va_list args) ++__printf(1, 0) ++static int vprintk_deferred(const char *fmt, va_list args) + { +- int r; +- +- r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, fmt, args); +- defer_console_output(); +- +- return r; ++ return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, fmt, args); + } + + int printk_deferred(const char *fmt, ...) +diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c +deleted file mode 100644 +index c23b127a6545..000000000000 +--- a/kernel/printk/printk_safe.c ++++ /dev/null +@@ -1,88 +0,0 @@ +-// SPDX-License-Identifier: GPL-2.0-or-later +-/* +- * printk_safe.c - Safe printk for printk-deadlock-prone contexts +- */ +- +-#include <linux/preempt.h> +-#include <linux/spinlock.h> +-#include <linux/debug_locks.h> +-#include <linux/kdb.h> +-#include <linux/smp.h> +-#include <linux/cpumask.h> +-#include <linux/irq_work.h> +-#include <linux/printk.h> +-#include <linux/kprobes.h> +- +-#include "internal.h" +- +-static DEFINE_PER_CPU(int, printk_context); +- +-#ifdef CONFIG_PRINTK_NMI +-void noinstr printk_nmi_enter(void) +-{ +- this_cpu_add(printk_context, PRINTK_NMI_CONTEXT_OFFSET); +-} +- +-void noinstr printk_nmi_exit(void) +-{ +- this_cpu_sub(printk_context, PRINTK_NMI_CONTEXT_OFFSET); +-} +- +-/* +- * Marks a code that might produce many messages in NMI context +- * and the risk of losing them is more critical than eventual +- * reordering. +- */ +-void printk_nmi_direct_enter(void) +-{ +- if (this_cpu_read(printk_context) & PRINTK_NMI_CONTEXT_MASK) +- this_cpu_or(printk_context, PRINTK_NMI_DIRECT_CONTEXT_MASK); +-} +- +-void printk_nmi_direct_exit(void) +-{ +- this_cpu_and(printk_context, ~PRINTK_NMI_DIRECT_CONTEXT_MASK); +-} +- +-#endif /* CONFIG_PRINTK_NMI */ +- +-/* Can be preempted by NMI. */ +-void __printk_safe_enter(void) +-{ +- this_cpu_inc(printk_context); +-} +- +-/* Can be preempted by NMI. */ +-void __printk_safe_exit(void) +-{ +- this_cpu_dec(printk_context); +-} +- +-__printf(1, 0) int vprintk_func(const char *fmt, va_list args) +-{ +-#ifdef CONFIG_KGDB_KDB +- /* Allow to pass printk() to kdb but avoid a recursion. */ +- if (unlikely(kdb_trap_printk && kdb_printf_cpu < 0)) +- return vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args); +-#endif +- +- /* +- * Use the main logbuf even in NMI. But avoid calling console +- * drivers that might have their own locks. +- */ +- if (this_cpu_read(printk_context) & +- (PRINTK_NMI_DIRECT_CONTEXT_MASK | +- PRINTK_NMI_CONTEXT_MASK | +- PRINTK_SAFE_CONTEXT_MASK)) { +- int len; +- +- printk_safe_enter_irqsave(flags); +- len = vprintk_store(0, LOGLEVEL_DEFAULT, NULL, fmt, args); +- printk_safe_exit_irqrestore(flags); +- defer_console_output(); +- return len; +- } +- +- /* No obstacles. */ +- return vprintk_default(fmt, args); +-} +diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c +index 70ab6b46ef8f..5d6730589823 100644 +--- a/kernel/trace/trace.c ++++ b/kernel/trace/trace.c +@@ -9517,7 +9517,6 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) + tracing_off(); + + local_irq_save(flags); +- printk_nmi_direct_enter(); + + /* Simulate the iterator */ + trace_init_global_iter(&iter); +@@ -9594,7 +9593,6 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) + atomic_dec(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled); + } + atomic_dec(&dump_running); +- printk_nmi_direct_exit(); + local_irq_restore(flags); + } + EXPORT_SYMBOL_GPL(ftrace_dump); +-- +2.43.0 + diff --git a/debian/patches-rt/0107-printk-add-console-handover.patch b/debian/patches-rt/0107-printk-add-console-handover.patch new file mode 100644 index 000000000..9abc4efd5 --- /dev/null +++ b/debian/patches-rt/0107-printk-add-console-handover.patch @@ -0,0 +1,76 @@ +From ead093d95a7917fa12f9d4653032eac383d21c1d Mon Sep 17 00:00:00 2001 +From: John Ogness <john.ogness@linutronix.de> +Date: Mon, 30 Nov 2020 01:42:09 +0106 +Subject: [PATCH 107/323] printk: add console handover +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +If earlyprintk is used, a boot console will print directly to the +console immediately. The boot console will unregister itself as soon +as a non-boot console registers. However, the non-boot console does +not begin printing until its kthread has started. Since this happens +much later, there is a long pause in the console output. If the +ringbuffer is small, messages could even be dropped during the +pause. + +Add a new CON_HANDOVER console flag to be used internally by printk +in order to track which non-boot console took over from a boot +console. If handover consoles have implemented write_atomic(), they +are allowed to print directly to the console until their kthread can +take over. + +Signed-off-by: John Ogness <john.ogness@linutronix.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/linux/console.h | 1 + + kernel/printk/printk.c | 8 +++++++- + 2 files changed, 8 insertions(+), 1 deletion(-) + +diff --git a/include/linux/console.h b/include/linux/console.h +index 3e99359e0660..027278792eea 100644 +--- a/include/linux/console.h ++++ b/include/linux/console.h +@@ -138,6 +138,7 @@ static inline int con_debug_leave(void) + #define CON_ANYTIME (16) /* Safe to call when cpu is offline */ + #define CON_BRL (32) /* Used for a braille device */ + #define CON_EXTENDED (64) /* Use the extended output format a la /dev/kmsg */ ++#define CON_HANDOVER (128) /* Device was previously a boot console. */ + + struct console { + char name[16]; +diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c +index 2b110b9ae34a..58da007d7c99 100644 +--- a/kernel/printk/printk.c ++++ b/kernel/printk/printk.c +@@ -1729,6 +1729,8 @@ static bool console_can_sync(struct console *con) + return false; + if (con->write_atomic && kernel_sync_mode()) + return true; ++ if (con->write_atomic && (con->flags & CON_HANDOVER) && !con->thread) ++ return true; + if (con->write && (con->flags & CON_BOOT) && !con->thread) + return true; + return false; +@@ -1740,6 +1742,8 @@ static bool call_sync_console_driver(struct console *con, const char *text, size + return false; + if (con->write_atomic && kernel_sync_mode()) + con->write_atomic(con, text, text_len); ++ else if (con->write_atomic && (con->flags & CON_HANDOVER) && !con->thread) ++ con->write_atomic(con, text, text_len); + else if (con->write && (con->flags & CON_BOOT) && !con->thread) + con->write(con, text, text_len); + else +@@ -2832,8 +2836,10 @@ void register_console(struct console *newcon) + * the real console are the same physical device, it's annoying to + * see the beginning boot messages twice + */ +- if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) ++ if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) { + newcon->flags &= ~CON_PRINTBUFFER; ++ newcon->flags |= CON_HANDOVER; ++ } + + /* + * Put this console in the list - keep the +-- +2.43.0 + diff --git a/debian/patches-rt/0108-printk-add-pr_flush.patch b/debian/patches-rt/0108-printk-add-pr_flush.patch new file mode 100644 index 000000000..013525f54 --- /dev/null +++ b/debian/patches-rt/0108-printk-add-pr_flush.patch @@ -0,0 +1,213 @@ +From cf83f550753a28a3f2a5a7ddd45456fb3ccd03ed Mon Sep 17 00:00:00 2001 +From: John Ogness <john.ogness@linutronix.de> +Date: Mon, 30 Nov 2020 01:42:10 +0106 +Subject: [PATCH 108/323] printk: add pr_flush() +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Provide a function to allow waiting for console printers to catch +up to the latest logged message. + +Use pr_flush() to give console printers a chance to finish in +critical situations if no atomic console is available. For now +pr_flush() is only used in the most common error paths: +panic(), print_oops_end_marker(), report_bug(), kmsg_dump(). + +Signed-off-by: John Ogness <john.ogness@linutronix.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Signed-off-by: Luis Claudio R. Goncalves <lgoncalv@redhat.com> +--- + include/linux/printk.h | 2 ++ + kernel/panic.c | 28 +++++++++------ + kernel/printk/printk.c | 79 ++++++++++++++++++++++++++++++++++++++++++ + lib/bug.c | 1 + + 4 files changed, 99 insertions(+), 11 deletions(-) + +diff --git a/include/linux/printk.h b/include/linux/printk.h +index 410435ac15ea..83c7734e9802 100644 +--- a/include/linux/printk.h ++++ b/include/linux/printk.h +@@ -481,6 +481,8 @@ extern int kptr_restrict; + no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) + #endif + ++bool pr_flush(int timeout_ms, bool reset_on_progress); ++ + /* + * ratelimited messages with local ratelimit_state, + * no local ratelimit_state used in the !PRINTK case +diff --git a/kernel/panic.c b/kernel/panic.c +index 7965f1e31224..5a1a4bf2feb3 100644 +--- a/kernel/panic.c ++++ b/kernel/panic.c +@@ -244,6 +244,7 @@ void check_panic_on_warn(const char *origin) + void panic(const char *fmt, ...) + { + static char buf[1024]; ++ va_list args2; + va_list args; + long i, i_next = 0, len; + int state = 0; +@@ -260,6 +261,21 @@ void panic(const char *fmt, ...) + panic_on_warn = 0; + } + ++ console_verbose(); ++ pr_emerg("Kernel panic - not syncing:\n"); ++ va_start(args2, fmt); ++ va_copy(args, args2); ++ vprintk(fmt, args2); ++ va_end(args2); ++#ifdef CONFIG_DEBUG_BUGVERBOSE ++ /* ++ * Avoid nested stack-dumping if a panic occurs during oops processing ++ */ ++ if (!test_taint(TAINT_DIE) && oops_in_progress <= 1) ++ dump_stack(); ++#endif ++ pr_flush(1000, true); ++ + /* + * Disable local interrupts. This will prevent panic_smp_self_stop + * from deadlocking the first cpu that invokes the panic, since +@@ -290,24 +306,13 @@ void panic(const char *fmt, ...) + if (old_cpu != PANIC_CPU_INVALID && old_cpu != this_cpu) + panic_smp_self_stop(); + +- console_verbose(); + bust_spinlocks(1); +- va_start(args, fmt); + len = vscnprintf(buf, sizeof(buf), fmt, args); + va_end(args); + + if (len && buf[len - 1] == '\n') + buf[len - 1] = '\0'; + +- pr_emerg("Kernel panic - not syncing: %s\n", buf); +-#ifdef CONFIG_DEBUG_BUGVERBOSE +- /* +- * Avoid nested stack-dumping if a panic occurs during oops processing +- */ +- if (!test_taint(TAINT_DIE) && oops_in_progress <= 1) +- dump_stack(); +-#endif +- + /* + * If kgdb is enabled, give it a chance to run before we stop all + * the other CPUs or else we won't be able to debug processes left +@@ -629,6 +634,7 @@ static void print_oops_end_marker(void) + { + init_oops_id(); + pr_warn("---[ end trace %016llx ]---\n", (unsigned long long)oops_id); ++ pr_flush(1000, true); + } + + /* +diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c +index 58da007d7c99..14cb111fe9f0 100644 +--- a/kernel/printk/printk.c ++++ b/kernel/printk/printk.c +@@ -3233,6 +3233,12 @@ void kmsg_dump(enum kmsg_dump_reason reason) + sync_mode = true; + pr_info("enabled sync mode\n"); + } ++ ++ /* ++ * Give the printing threads time to flush, allowing up to ++ * 1s of no printing forward progress before giving up. ++ */ ++ pr_flush(1000, true); + } + + rcu_read_lock(); +@@ -3512,3 +3518,76 @@ void console_atomic_unlock(unsigned int flags) + prb_unlock(&printk_cpulock, flags); + } + EXPORT_SYMBOL(console_atomic_unlock); ++ ++static void pr_msleep(bool may_sleep, int ms) ++{ ++ if (may_sleep) { ++ msleep(ms); ++ } else { ++ while (ms--) ++ udelay(1000); ++ } ++} ++ ++/** ++ * pr_flush() - Wait for printing threads to catch up. ++ * ++ * @timeout_ms: The maximum time (in ms) to wait. ++ * @reset_on_progress: Reset the timeout if forward progress is seen. ++ * ++ * A value of 0 for @timeout_ms means no waiting will occur. A value of -1 ++ * represents infinite waiting. ++ * ++ * If @reset_on_progress is true, the timeout will be reset whenever any ++ * printer has been seen to make some forward progress. ++ * ++ * Context: Any context. ++ * Return: true if all enabled printers are caught up. ++ */ ++bool pr_flush(int timeout_ms, bool reset_on_progress) ++{ ++ int remaining = timeout_ms; ++ struct console *con; ++ u64 last_diff = 0; ++ bool may_sleep; ++ u64 printk_seq; ++ u64 diff; ++ u64 seq; ++ ++ may_sleep = (preemptible() && !in_softirq()); ++ ++ seq = prb_next_seq(prb); ++ ++ for (;;) { ++ diff = 0; ++ ++ for_each_console(con) { ++ if (!(con->flags & CON_ENABLED)) ++ continue; ++ printk_seq = atomic64_read(&con->printk_seq); ++ if (printk_seq < seq) ++ diff += seq - printk_seq; ++ } ++ ++ if (diff != last_diff && reset_on_progress) ++ remaining = timeout_ms; ++ ++ if (!diff || remaining == 0) ++ break; ++ ++ if (remaining < 0) { ++ pr_msleep(may_sleep, 100); ++ } else if (remaining < 100) { ++ pr_msleep(may_sleep, remaining); ++ remaining = 0; ++ } else { ++ pr_msleep(may_sleep, 100); ++ remaining -= 100; ++ } ++ ++ last_diff = diff; ++ } ++ ++ return (diff == 0); ++} ++EXPORT_SYMBOL(pr_flush); +diff --git a/lib/bug.c b/lib/bug.c +index 4ab398a2de93..9c681f29e61e 100644 +--- a/lib/bug.c ++++ b/lib/bug.c +@@ -202,6 +202,7 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs) + else + pr_crit("Kernel BUG at %pB [verbose debug info unavailable]\n", + (void *)bugaddr); ++ pr_flush(1000, true); + + return BUG_TRAP_TYPE_BUG; + } +-- +2.43.0 + diff --git a/debian/patches-rt/0109-cgroup-use-irqsave-in-cgroup_rstat_flush_locked.patch b/debian/patches-rt/0109-cgroup-use-irqsave-in-cgroup_rstat_flush_locked.patch new file mode 100644 index 000000000..3fad12a67 --- /dev/null +++ b/debian/patches-rt/0109-cgroup-use-irqsave-in-cgroup_rstat_flush_locked.patch @@ -0,0 +1,50 @@ +From 5c99ffb2025e5ae87404c3b9641b9dafaec9336e Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Tue, 3 Jul 2018 18:19:48 +0200 +Subject: [PATCH 109/323] cgroup: use irqsave in cgroup_rstat_flush_locked() +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +All callers of cgroup_rstat_flush_locked() acquire cgroup_rstat_lock +either with spin_lock_irq() or spin_lock_irqsave(). +cgroup_rstat_flush_locked() itself acquires cgroup_rstat_cpu_lock which +is a raw_spin_lock. This lock is also acquired in cgroup_rstat_updated() +in IRQ context and therefore requires _irqsave() locking suffix in +cgroup_rstat_flush_locked(). +Since there is no difference between spin_lock_t and raw_spin_lock_t +on !RT lockdep does not complain here. On RT lockdep complains because +the interrupts were not disabled here and a deadlock is possible. + +Acquire the raw_spin_lock_t with disabled interrupts. + +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + kernel/cgroup/rstat.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c +index 89ca9b61aa0d..753dc346520a 100644 +--- a/kernel/cgroup/rstat.c ++++ b/kernel/cgroup/rstat.c +@@ -149,8 +149,9 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep) + raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, + cpu); + struct cgroup *pos = NULL; ++ unsigned long flags; + +- raw_spin_lock(cpu_lock); ++ raw_spin_lock_irqsave(cpu_lock, flags); + while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) { + struct cgroup_subsys_state *css; + +@@ -162,7 +163,7 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep) + css->ss->css_rstat_flush(css, cpu); + rcu_read_unlock(); + } +- raw_spin_unlock(cpu_lock); ++ raw_spin_unlock_irqrestore(cpu_lock, flags); + + /* if @may_sleep, play nice and yield if necessary */ + if (may_sleep && (need_resched() || +-- +2.43.0 + diff --git a/debian/patches-rt/0110-mm-workingset-replace-IRQ-off-check-with-a-lockdep-a.patch b/debian/patches-rt/0110-mm-workingset-replace-IRQ-off-check-with-a-lockdep-a.patch new file mode 100644 index 000000000..e9c3583b7 --- /dev/null +++ b/debian/patches-rt/0110-mm-workingset-replace-IRQ-off-check-with-a-lockdep-a.patch @@ -0,0 +1,49 @@ +From e215644e77fc149727b2ed1c3412c2e7668ff5aa Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Mon, 11 Feb 2019 10:40:46 +0100 +Subject: [PATCH 110/323] mm: workingset: replace IRQ-off check with a lockdep + assert. +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Commit + + 68d48e6a2df57 ("mm: workingset: add vmstat counter for shadow nodes") + +introduced an IRQ-off check to ensure that a lock is held which also +disabled interrupts. This does not work the same way on -RT because none +of the locks, that are held, disable interrupts. +Replace this check with a lockdep assert which ensures that the lock is +held. + +Cc: Peter Zijlstra <peterz@infradead.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + mm/workingset.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +diff --git a/mm/workingset.c b/mm/workingset.c +index 975a4d2dd02e..c3d098c01052 100644 +--- a/mm/workingset.c ++++ b/mm/workingset.c +@@ -432,6 +432,8 @@ static struct list_lru shadow_nodes; + + void workingset_update_node(struct xa_node *node) + { ++ struct address_space *mapping; ++ + /* + * Track non-empty nodes that contain only shadow entries; + * unlink those that contain pages or are being freed. +@@ -440,7 +442,8 @@ void workingset_update_node(struct xa_node *node) + * already where they should be. The list_empty() test is safe + * as node->private_list is protected by the i_pages lock. + */ +- VM_WARN_ON_ONCE(!irqs_disabled()); /* For __inc_lruvec_page_state */ ++ mapping = container_of(node->array, struct address_space, i_pages); ++ lockdep_assert_held(&mapping->i_pages.xa_lock); + + if (node->count && node->count == node->nr_values) { + if (list_empty(&node->private_list)) { +-- +2.43.0 + diff --git a/debian/patches-rt/0111-tpm-remove-tpm_dev_wq_lock.patch b/debian/patches-rt/0111-tpm-remove-tpm_dev_wq_lock.patch new file mode 100644 index 000000000..f69a126e3 --- /dev/null +++ b/debian/patches-rt/0111-tpm-remove-tpm_dev_wq_lock.patch @@ -0,0 +1,35 @@ +From 74373c599ec7fa9991304d680c34ec5b591d4cad Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Mon, 11 Feb 2019 11:33:11 +0100 +Subject: [PATCH 111/323] tpm: remove tpm_dev_wq_lock +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Added in commit + + 9e1b74a63f776 ("tpm: add support for nonblocking operation") + +but never actually used it. + +Cc: Philip Tricca <philip.b.tricca@intel.com> +Cc: Tadeusz Struk <tadeusz.struk@intel.com> +Cc: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + drivers/char/tpm/tpm-dev-common.c | 1 - + 1 file changed, 1 deletion(-) + +diff --git a/drivers/char/tpm/tpm-dev-common.c b/drivers/char/tpm/tpm-dev-common.c +index b99e1941c52c..dc4c0a0a5129 100644 +--- a/drivers/char/tpm/tpm-dev-common.c ++++ b/drivers/char/tpm/tpm-dev-common.c +@@ -20,7 +20,6 @@ + #include "tpm-dev.h" + + static struct workqueue_struct *tpm_dev_wq; +-static DEFINE_MUTEX(tpm_dev_wq_lock); + + static ssize_t tpm_dev_transmit(struct tpm_chip *chip, struct tpm_space *space, + u8 *buf, size_t bufsiz) +-- +2.43.0 + diff --git a/debian/patches-rt/0112-shmem-Use-raw_spinlock_t-for-stat_lock.patch b/debian/patches-rt/0112-shmem-Use-raw_spinlock_t-for-stat_lock.patch new file mode 100644 index 000000000..41738b8a2 --- /dev/null +++ b/debian/patches-rt/0112-shmem-Use-raw_spinlock_t-for-stat_lock.patch @@ -0,0 +1,147 @@ +From 35616363024adeffc42af4d2ca91e4550d6db59c Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Fri, 14 Aug 2020 18:53:34 +0200 +Subject: [PATCH 112/323] shmem: Use raw_spinlock_t for ->stat_lock +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Each CPU has SHMEM_INO_BATCH inodes available in `->ino_batch' which is +per-CPU. Access here is serialized by disabling preemption. If the pool is +empty, it gets reloaded from `->next_ino'. Access here is serialized by +->stat_lock which is a spinlock_t and can not be acquired with disabled +preemption. +One way around it would make per-CPU ino_batch struct containing the inode +number a local_lock_t. +Another sollution is to promote ->stat_lock to a raw_spinlock_t. The critical +sections are short. The mpol_put() should be moved outside of the critical +section to avoid invoking the destrutor with disabled preemption. + +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/linux/shmem_fs.h | 2 +- + mm/shmem.c | 31 +++++++++++++++++-------------- + 2 files changed, 18 insertions(+), 15 deletions(-) + +diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h +index a5a5d1d4d7b1..0470d1582b09 100644 +--- a/include/linux/shmem_fs.h ++++ b/include/linux/shmem_fs.h +@@ -31,7 +31,7 @@ struct shmem_sb_info { + struct percpu_counter used_blocks; /* How many are allocated */ + unsigned long max_inodes; /* How many inodes are allowed */ + unsigned long free_inodes; /* How many are left for allocation */ +- spinlock_t stat_lock; /* Serialize shmem_sb_info changes */ ++ raw_spinlock_t stat_lock; /* Serialize shmem_sb_info changes */ + umode_t mode; /* Mount mode for root directory */ + unsigned char huge; /* Whether to try for hugepages */ + kuid_t uid; /* Mount uid for root directory */ +diff --git a/mm/shmem.c b/mm/shmem.c +index e173d83b4448..94c0964f0d1f 100644 +--- a/mm/shmem.c ++++ b/mm/shmem.c +@@ -278,10 +278,10 @@ static int shmem_reserve_inode(struct super_block *sb, ino_t *inop) + ino_t ino; + + if (!(sb->s_flags & SB_KERNMOUNT)) { +- spin_lock(&sbinfo->stat_lock); ++ raw_spin_lock(&sbinfo->stat_lock); + if (sbinfo->max_inodes) { + if (!sbinfo->free_inodes) { +- spin_unlock(&sbinfo->stat_lock); ++ raw_spin_unlock(&sbinfo->stat_lock); + return -ENOSPC; + } + sbinfo->free_inodes--; +@@ -304,7 +304,7 @@ static int shmem_reserve_inode(struct super_block *sb, ino_t *inop) + } + *inop = ino; + } +- spin_unlock(&sbinfo->stat_lock); ++ raw_spin_unlock(&sbinfo->stat_lock); + } else if (inop) { + /* + * __shmem_file_setup, one of our callers, is lock-free: it +@@ -319,13 +319,14 @@ static int shmem_reserve_inode(struct super_block *sb, ino_t *inop) + * to worry about things like glibc compatibility. + */ + ino_t *next_ino; ++ + next_ino = per_cpu_ptr(sbinfo->ino_batch, get_cpu()); + ino = *next_ino; + if (unlikely(ino % SHMEM_INO_BATCH == 0)) { +- spin_lock(&sbinfo->stat_lock); ++ raw_spin_lock(&sbinfo->stat_lock); + ino = sbinfo->next_ino; + sbinfo->next_ino += SHMEM_INO_BATCH; +- spin_unlock(&sbinfo->stat_lock); ++ raw_spin_unlock(&sbinfo->stat_lock); + if (unlikely(is_zero_ino(ino))) + ino++; + } +@@ -341,9 +342,9 @@ static void shmem_free_inode(struct super_block *sb) + { + struct shmem_sb_info *sbinfo = SHMEM_SB(sb); + if (sbinfo->max_inodes) { +- spin_lock(&sbinfo->stat_lock); ++ raw_spin_lock(&sbinfo->stat_lock); + sbinfo->free_inodes++; +- spin_unlock(&sbinfo->stat_lock); ++ raw_spin_unlock(&sbinfo->stat_lock); + } + } + +@@ -1484,10 +1485,10 @@ static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) + { + struct mempolicy *mpol = NULL; + if (sbinfo->mpol) { +- spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */ ++ raw_spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */ + mpol = sbinfo->mpol; + mpol_get(mpol); +- spin_unlock(&sbinfo->stat_lock); ++ raw_spin_unlock(&sbinfo->stat_lock); + } + return mpol; + } +@@ -3613,9 +3614,10 @@ static int shmem_reconfigure(struct fs_context *fc) + struct shmem_options *ctx = fc->fs_private; + struct shmem_sb_info *sbinfo = SHMEM_SB(fc->root->d_sb); + unsigned long inodes; ++ struct mempolicy *mpol = NULL; + const char *err; + +- spin_lock(&sbinfo->stat_lock); ++ raw_spin_lock(&sbinfo->stat_lock); + inodes = sbinfo->max_inodes - sbinfo->free_inodes; + if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) { + if (!sbinfo->max_blocks) { +@@ -3660,14 +3662,15 @@ static int shmem_reconfigure(struct fs_context *fc) + * Preserve previous mempolicy unless mpol remount option was specified. + */ + if (ctx->mpol) { +- mpol_put(sbinfo->mpol); ++ mpol = sbinfo->mpol; + sbinfo->mpol = ctx->mpol; /* transfers initial ref */ + ctx->mpol = NULL; + } +- spin_unlock(&sbinfo->stat_lock); ++ raw_spin_unlock(&sbinfo->stat_lock); ++ mpol_put(mpol); + return 0; + out: +- spin_unlock(&sbinfo->stat_lock); ++ raw_spin_unlock(&sbinfo->stat_lock); + return invalfc(fc, "%s", err); + } + +@@ -3784,7 +3787,7 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) + sbinfo->mpol = ctx->mpol; + ctx->mpol = NULL; + +- spin_lock_init(&sbinfo->stat_lock); ++ raw_spin_lock_init(&sbinfo->stat_lock); + if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL)) + goto failed; + spin_lock_init(&sbinfo->shrinklist_lock); +-- +2.43.0 + diff --git a/debian/patches-rt/0113-net-Move-lockdep-where-it-belongs.patch b/debian/patches-rt/0113-net-Move-lockdep-where-it-belongs.patch new file mode 100644 index 000000000..b7b6765b1 --- /dev/null +++ b/debian/patches-rt/0113-net-Move-lockdep-where-it-belongs.patch @@ -0,0 +1,46 @@ +From b3ad938c1be05a261ce70af8888271d0b85cd955 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Tue, 8 Sep 2020 07:32:20 +0200 +Subject: [PATCH 113/323] net: Move lockdep where it belongs +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +--- + net/core/sock.c | 6 ++---- + 1 file changed, 2 insertions(+), 4 deletions(-) + +diff --git a/net/core/sock.c b/net/core/sock.c +index a069b5476df4..8839cfea108a 100644 +--- a/net/core/sock.c ++++ b/net/core/sock.c +@@ -3082,12 +3082,11 @@ void lock_sock_nested(struct sock *sk, int subclass) + if (sk->sk_lock.owned) + __lock_sock(sk); + sk->sk_lock.owned = 1; +- spin_unlock(&sk->sk_lock.slock); ++ spin_unlock_bh(&sk->sk_lock.slock); + /* + * The sk_lock has mutex_lock() semantics here: + */ + mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_); +- local_bh_enable(); + } + EXPORT_SYMBOL(lock_sock_nested); + +@@ -3136,12 +3135,11 @@ bool lock_sock_fast(struct sock *sk) + + __lock_sock(sk); + sk->sk_lock.owned = 1; +- spin_unlock(&sk->sk_lock.slock); ++ spin_unlock_bh(&sk->sk_lock.slock); + /* + * The sk_lock has mutex_lock() semantics here: + */ + mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_); +- local_bh_enable(); + return true; + } + EXPORT_SYMBOL(lock_sock_fast); +-- +2.43.0 + diff --git a/debian/patches-rt/0114-parisc-Remove-bogus-__IRQ_STAT-macro.patch b/debian/patches-rt/0114-parisc-Remove-bogus-__IRQ_STAT-macro.patch new file mode 100644 index 000000000..29bae354a --- /dev/null +++ b/debian/patches-rt/0114-parisc-Remove-bogus-__IRQ_STAT-macro.patch @@ -0,0 +1,31 @@ +From a73391bb3950b16f44dda9989980e77e07febe01 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Fri, 13 Nov 2020 15:02:08 +0100 +Subject: [PATCH 114/323] parisc: Remove bogus __IRQ_STAT macro +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +This is a leftover from a historical array based implementation and unused. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Frederic Weisbecker <frederic@kernel.org> +Link: https://lore.kernel.org/r/20201113141732.680780121@linutronix.de +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + arch/parisc/include/asm/hardirq.h | 1 - + 1 file changed, 1 deletion(-) + +diff --git a/arch/parisc/include/asm/hardirq.h b/arch/parisc/include/asm/hardirq.h +index 7f7039516e53..fad29aa6f45f 100644 +--- a/arch/parisc/include/asm/hardirq.h ++++ b/arch/parisc/include/asm/hardirq.h +@@ -32,7 +32,6 @@ typedef struct { + DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); + + #define __ARCH_IRQ_STAT +-#define __IRQ_STAT(cpu, member) (irq_stat[cpu].member) + #define inc_irq_stat(member) this_cpu_inc(irq_stat.member) + #define __inc_irq_stat(member) __this_cpu_inc(irq_stat.member) + #define ack_bad_irq(irq) WARN(1, "unexpected IRQ trap at vector %02x\n", irq) +-- +2.43.0 + diff --git a/debian/patches-rt/0115-sh-Get-rid-of-nmi_count.patch b/debian/patches-rt/0115-sh-Get-rid-of-nmi_count.patch new file mode 100644 index 000000000..59cbda0e0 --- /dev/null +++ b/debian/patches-rt/0115-sh-Get-rid-of-nmi_count.patch @@ -0,0 +1,47 @@ +From 4d4b0ae9c82f34e6cc445f9b04248ae49346a73e Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Fri, 13 Nov 2020 15:02:09 +0100 +Subject: [PATCH 115/323] sh: Get rid of nmi_count() +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +nmi_count() is a historical leftover and SH is the only user. Replace it +with regular per cpu accessors. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Frederic Weisbecker <frederic@kernel.org> +Link: https://lore.kernel.org/r/20201113141732.844232404@linutronix.de +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + arch/sh/kernel/irq.c | 2 +- + arch/sh/kernel/traps.c | 2 +- + 2 files changed, 2 insertions(+), 2 deletions(-) + +diff --git a/arch/sh/kernel/irq.c b/arch/sh/kernel/irq.c +index 5717c7cbdd97..5addcb2c2da0 100644 +--- a/arch/sh/kernel/irq.c ++++ b/arch/sh/kernel/irq.c +@@ -44,7 +44,7 @@ int arch_show_interrupts(struct seq_file *p, int prec) + + seq_printf(p, "%*s: ", prec, "NMI"); + for_each_online_cpu(j) +- seq_printf(p, "%10u ", nmi_count(j)); ++ seq_printf(p, "%10u ", per_cpu(irq_stat.__nmi_count, j); + seq_printf(p, " Non-maskable interrupts\n"); + + seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count)); +diff --git a/arch/sh/kernel/traps.c b/arch/sh/kernel/traps.c +index 4efffc18c851..be15e41489b2 100644 +--- a/arch/sh/kernel/traps.c ++++ b/arch/sh/kernel/traps.c +@@ -186,7 +186,7 @@ BUILD_TRAP_HANDLER(nmi) + arch_ftrace_nmi_enter(); + + nmi_enter(); +- nmi_count(cpu)++; ++ this_cpu_inc(irq_stat.__nmi_count); + + switch (notify_die(DIE_NMI, "NMI", regs, 0, vec & 0xff, SIGINT)) { + case NOTIFY_OK: +-- +2.43.0 + diff --git a/debian/patches-rt/0116-irqstat-Get-rid-of-nmi_count-and-__IRQ_STAT.patch b/debian/patches-rt/0116-irqstat-Get-rid-of-nmi_count-and-__IRQ_STAT.patch new file mode 100644 index 000000000..870e1af5f --- /dev/null +++ b/debian/patches-rt/0116-irqstat-Get-rid-of-nmi_count-and-__IRQ_STAT.patch @@ -0,0 +1,34 @@ +From c190c29cf4e02d1cfa67a9744ef94c2037ed071e Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Fri, 13 Nov 2020 15:02:10 +0100 +Subject: [PATCH 116/323] irqstat: Get rid of nmi_count() and __IRQ_STAT() +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Nothing uses this anymore. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Frederic Weisbecker <frederic@kernel.org> +Link: https://lore.kernel.org/r/20201113141733.005212732@linutronix.de +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/linux/irq_cpustat.h | 4 ---- + 1 file changed, 4 deletions(-) + +diff --git a/include/linux/irq_cpustat.h b/include/linux/irq_cpustat.h +index 6e8895cd4d92..78fb2de3ea4d 100644 +--- a/include/linux/irq_cpustat.h ++++ b/include/linux/irq_cpustat.h +@@ -19,10 +19,6 @@ + + #ifndef __ARCH_IRQ_STAT + DECLARE_PER_CPU_ALIGNED(irq_cpustat_t, irq_stat); /* defined in asm/hardirq.h */ +-#define __IRQ_STAT(cpu, member) (per_cpu(irq_stat.member, cpu)) + #endif + +-/* arch dependent irq_stat fields */ +-#define nmi_count(cpu) __IRQ_STAT((cpu), __nmi_count) /* i386 */ +- + #endif /* __irq_cpustat_h */ +-- +2.43.0 + diff --git a/debian/patches-rt/0117-um-irqstat-Get-rid-of-the-duplicated-declarations.patch b/debian/patches-rt/0117-um-irqstat-Get-rid-of-the-duplicated-declarations.patch new file mode 100644 index 000000000..ee24a1117 --- /dev/null +++ b/debian/patches-rt/0117-um-irqstat-Get-rid-of-the-duplicated-declarations.patch @@ -0,0 +1,48 @@ +From 44cfe18ac1cb81f3617a962640846d232bed6d65 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Fri, 13 Nov 2020 15:02:11 +0100 +Subject: [PATCH 117/323] um/irqstat: Get rid of the duplicated declarations +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +irq_cpustat_t and ack_bad_irq() are exactly the same as the asm-generic +ones. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Frederic Weisbecker <frederic@kernel.org> +Link: https://lore.kernel.org/r/20201113141733.156361337@linutronix.de +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + arch/um/include/asm/hardirq.h | 17 +---------------- + 1 file changed, 1 insertion(+), 16 deletions(-) + +diff --git a/arch/um/include/asm/hardirq.h b/arch/um/include/asm/hardirq.h +index b426796d26fd..52e2c36267a9 100644 +--- a/arch/um/include/asm/hardirq.h ++++ b/arch/um/include/asm/hardirq.h +@@ -2,22 +2,7 @@ + #ifndef __ASM_UM_HARDIRQ_H + #define __ASM_UM_HARDIRQ_H + +-#include <linux/cache.h> +-#include <linux/threads.h> +- +-typedef struct { +- unsigned int __softirq_pending; +-} ____cacheline_aligned irq_cpustat_t; +- +-#include <linux/irq_cpustat.h> /* Standard mappings for irq_cpustat_t above */ +-#include <linux/irq.h> +- +-#ifndef ack_bad_irq +-static inline void ack_bad_irq(unsigned int irq) +-{ +- printk(KERN_CRIT "unexpected IRQ trap at vector %02x\n", irq); +-} +-#endif ++#include <asm-generic/hardirq.h> + + #define __ARCH_IRQ_EXIT_IRQS_DISABLED 1 + +-- +2.43.0 + diff --git a/debian/patches-rt/0118-ARM-irqstat-Get-rid-of-duplicated-declaration.patch b/debian/patches-rt/0118-ARM-irqstat-Get-rid-of-duplicated-declaration.patch new file mode 100644 index 000000000..a7b949523 --- /dev/null +++ b/debian/patches-rt/0118-ARM-irqstat-Get-rid-of-duplicated-declaration.patch @@ -0,0 +1,59 @@ +From 903c27e244a901b4c66dc1d423a6d9b3214fd868 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Fri, 13 Nov 2020 15:02:12 +0100 +Subject: [PATCH 118/323] ARM: irqstat: Get rid of duplicated declaration +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +irq_cpustat_t is exactly the same as the asm-generic one. Define +ack_bad_irq so the generic header does not emit the generic version of it. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Frederic Weisbecker <frederic@kernel.org> +Reviewed-by: Valentin Schneider <valentin.schneider@arm.com> +Link: https://lore.kernel.org/r/20201113141733.276505871@linutronix.de +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + arch/arm/include/asm/hardirq.h | 11 +++-------- + arch/arm/include/asm/irq.h | 2 ++ + 2 files changed, 5 insertions(+), 8 deletions(-) + +diff --git a/arch/arm/include/asm/hardirq.h b/arch/arm/include/asm/hardirq.h +index b95848ed2bc7..706efafbf972 100644 +--- a/arch/arm/include/asm/hardirq.h ++++ b/arch/arm/include/asm/hardirq.h +@@ -2,16 +2,11 @@ + #ifndef __ASM_HARDIRQ_H + #define __ASM_HARDIRQ_H + +-#include <linux/cache.h> +-#include <linux/threads.h> + #include <asm/irq.h> + +-typedef struct { +- unsigned int __softirq_pending; +-} ____cacheline_aligned irq_cpustat_t; +- +-#include <linux/irq_cpustat.h> /* Standard mappings for irq_cpustat_t above */ +- + #define __ARCH_IRQ_EXIT_IRQS_DISABLED 1 ++#define ack_bad_irq ack_bad_irq ++ ++#include <asm-generic/hardirq.h> + + #endif /* __ASM_HARDIRQ_H */ +diff --git a/arch/arm/include/asm/irq.h b/arch/arm/include/asm/irq.h +index 46d41140df27..1cbcc462b07e 100644 +--- a/arch/arm/include/asm/irq.h ++++ b/arch/arm/include/asm/irq.h +@@ -31,6 +31,8 @@ void handle_IRQ(unsigned int, struct pt_regs *); + void init_IRQ(void); + + #ifdef CONFIG_SMP ++#include <linux/cpumask.h> ++ + extern void arch_trigger_cpumask_backtrace(const cpumask_t *mask, + bool exclude_self); + #define arch_trigger_cpumask_backtrace arch_trigger_cpumask_backtrace +-- +2.43.0 + diff --git a/debian/patches-rt/0119-arm64-irqstat-Get-rid-of-duplicated-declaration.patch b/debian/patches-rt/0119-arm64-irqstat-Get-rid-of-duplicated-declaration.patch new file mode 100644 index 000000000..015978430 --- /dev/null +++ b/debian/patches-rt/0119-arm64-irqstat-Get-rid-of-duplicated-declaration.patch @@ -0,0 +1,40 @@ +From 9d8ad1996dd7a9a4250e55ee988eac5f7e52bd8e Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Fri, 13 Nov 2020 15:02:13 +0100 +Subject: [PATCH 119/323] arm64: irqstat: Get rid of duplicated declaration +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +irq_cpustat_t is exactly the same as the asm-generic one. Define +ack_bad_irq so the generic header does not emit the generic version of it. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Frederic Weisbecker <frederic@kernel.org> +Acked-by: Will Deacon <will@kernel.org> +Acked-by: Marc Zyngier <maz@kernel.org> +Link: https://lore.kernel.org/r/20201113141733.392015387@linutronix.de +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + arch/arm64/include/asm/hardirq.h | 7 ++----- + 1 file changed, 2 insertions(+), 5 deletions(-) + +diff --git a/arch/arm64/include/asm/hardirq.h b/arch/arm64/include/asm/hardirq.h +index 5ffa4bacdad3..cbfa7b6f2e09 100644 +--- a/arch/arm64/include/asm/hardirq.h ++++ b/arch/arm64/include/asm/hardirq.h +@@ -13,11 +13,8 @@ + #include <asm/kvm_arm.h> + #include <asm/sysreg.h> + +-typedef struct { +- unsigned int __softirq_pending; +-} ____cacheline_aligned irq_cpustat_t; +- +-#include <linux/irq_cpustat.h> /* Standard mappings for irq_cpustat_t above */ ++#define ack_bad_irq ack_bad_irq ++#include <asm-generic/hardirq.h> + + #define __ARCH_IRQ_EXIT_IRQS_DISABLED 1 + +-- +2.43.0 + diff --git a/debian/patches-rt/0120-asm-generic-irqstat-Add-optional-__nmi_count-member.patch b/debian/patches-rt/0120-asm-generic-irqstat-Add-optional-__nmi_count-member.patch new file mode 100644 index 000000000..6eddb073f --- /dev/null +++ b/debian/patches-rt/0120-asm-generic-irqstat-Add-optional-__nmi_count-member.patch @@ -0,0 +1,34 @@ +From 9daa9b58eb0e0b5b143d719b4c465e8ee233def0 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Fri, 13 Nov 2020 15:02:14 +0100 +Subject: [PATCH 120/323] asm-generic/irqstat: Add optional __nmi_count member +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Add an optional __nmi_count member to irq_cpustat_t so more architectures +can use the generic version. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Frederic Weisbecker <frederic@kernel.org> +Link: https://lore.kernel.org/r/20201113141733.501611990@linutronix.de +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/asm-generic/hardirq.h | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/include/asm-generic/hardirq.h b/include/asm-generic/hardirq.h +index d14214dfc10b..f5dd99781e3c 100644 +--- a/include/asm-generic/hardirq.h ++++ b/include/asm-generic/hardirq.h +@@ -7,6 +7,9 @@ + + typedef struct { + unsigned int __softirq_pending; ++#ifdef ARCH_WANTS_NMI_IRQSTAT ++ unsigned int __nmi_count; ++#endif + } ____cacheline_aligned irq_cpustat_t; + + #include <linux/irq_cpustat.h> /* Standard mappings for irq_cpustat_t above */ +-- +2.43.0 + diff --git a/debian/patches-rt/0121-sh-irqstat-Use-the-generic-irq_cpustat_t.patch b/debian/patches-rt/0121-sh-irqstat-Use-the-generic-irq_cpustat_t.patch new file mode 100644 index 000000000..a141aaddf --- /dev/null +++ b/debian/patches-rt/0121-sh-irqstat-Use-the-generic-irq_cpustat_t.patch @@ -0,0 +1,45 @@ +From 36f4507e26065641dd4b5e9fab744596a072b3a6 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Fri, 13 Nov 2020 15:02:15 +0100 +Subject: [PATCH 121/323] sh: irqstat: Use the generic irq_cpustat_t +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +SH can now use the generic irq_cpustat_t. Define ack_bad_irq so the generic +header does not emit the generic version of it. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Frederic Weisbecker <frederic@kernel.org> +Link: https://lore.kernel.org/r/20201113141733.625146223@linutronix.de +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + arch/sh/include/asm/hardirq.h | 14 ++++---------- + 1 file changed, 4 insertions(+), 10 deletions(-) + +diff --git a/arch/sh/include/asm/hardirq.h b/arch/sh/include/asm/hardirq.h +index edaea3559a23..9fe4495a8e90 100644 +--- a/arch/sh/include/asm/hardirq.h ++++ b/arch/sh/include/asm/hardirq.h +@@ -2,16 +2,10 @@ + #ifndef __ASM_SH_HARDIRQ_H + #define __ASM_SH_HARDIRQ_H + +-#include <linux/threads.h> +-#include <linux/irq.h> +- +-typedef struct { +- unsigned int __softirq_pending; +- unsigned int __nmi_count; /* arch dependent */ +-} ____cacheline_aligned irq_cpustat_t; +- +-#include <linux/irq_cpustat.h> /* Standard mappings for irq_cpustat_t above */ +- + extern void ack_bad_irq(unsigned int irq); ++#define ack_bad_irq ack_bad_irq ++#define ARCH_WANTS_NMI_IRQSTAT ++ ++#include <asm-generic/hardirq.h> + + #endif /* __ASM_SH_HARDIRQ_H */ +-- +2.43.0 + diff --git a/debian/patches-rt/0122-irqstat-Move-declaration-into-asm-generic-hardirq.h.patch b/debian/patches-rt/0122-irqstat-Move-declaration-into-asm-generic-hardirq.h.patch new file mode 100644 index 000000000..7918409ff --- /dev/null +++ b/debian/patches-rt/0122-irqstat-Move-declaration-into-asm-generic-hardirq.h.patch @@ -0,0 +1,66 @@ +From 4d8e6962b3b3f9acc509359631d4396d9b729064 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Fri, 13 Nov 2020 15:02:16 +0100 +Subject: [PATCH 122/323] irqstat: Move declaration into asm-generic/hardirq.h +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Move the declaration of the irq_cpustat per cpu variable to +asm-generic/hardirq.h and remove the now empty linux/irq_cpustat.h header. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Frederic Weisbecker <frederic@kernel.org> +Link: https://lore.kernel.org/r/20201113141733.737377332@linutronix.de +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/asm-generic/hardirq.h | 3 ++- + include/linux/irq_cpustat.h | 24 ------------------------ + 2 files changed, 2 insertions(+), 25 deletions(-) + delete mode 100644 include/linux/irq_cpustat.h + +diff --git a/include/asm-generic/hardirq.h b/include/asm-generic/hardirq.h +index f5dd99781e3c..7317e8258b48 100644 +--- a/include/asm-generic/hardirq.h ++++ b/include/asm-generic/hardirq.h +@@ -12,7 +12,8 @@ typedef struct { + #endif + } ____cacheline_aligned irq_cpustat_t; + +-#include <linux/irq_cpustat.h> /* Standard mappings for irq_cpustat_t above */ ++DECLARE_PER_CPU_ALIGNED(irq_cpustat_t, irq_stat); ++ + #include <linux/irq.h> + + #ifndef ack_bad_irq +diff --git a/include/linux/irq_cpustat.h b/include/linux/irq_cpustat.h +deleted file mode 100644 +index 78fb2de3ea4d..000000000000 +--- a/include/linux/irq_cpustat.h ++++ /dev/null +@@ -1,24 +0,0 @@ +-/* SPDX-License-Identifier: GPL-2.0 */ +-#ifndef __irq_cpustat_h +-#define __irq_cpustat_h +- +-/* +- * Contains default mappings for irq_cpustat_t, used by almost every +- * architecture. Some arch (like s390) have per cpu hardware pages and +- * they define their own mappings for irq_stat. +- * +- * Keith Owens <kaos@ocs.com.au> July 2000. +- */ +- +- +-/* +- * Simple wrappers reducing source bloat. Define all irq_stat fields +- * here, even ones that are arch dependent. That way we get common +- * definitions instead of differing sets for each arch. +- */ +- +-#ifndef __ARCH_IRQ_STAT +-DECLARE_PER_CPU_ALIGNED(irq_cpustat_t, irq_stat); /* defined in asm/hardirq.h */ +-#endif +- +-#endif /* __irq_cpustat_h */ +-- +2.43.0 + diff --git a/debian/patches-rt/0123-preempt-Cleanup-the-macro-maze-a-bit.patch b/debian/patches-rt/0123-preempt-Cleanup-the-macro-maze-a-bit.patch new file mode 100644 index 000000000..9e6469ec2 --- /dev/null +++ b/debian/patches-rt/0123-preempt-Cleanup-the-macro-maze-a-bit.patch @@ -0,0 +1,78 @@ +From b743317882baa4d44bdc55dea99865bf53623193 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Fri, 13 Nov 2020 15:02:17 +0100 +Subject: [PATCH 123/323] preempt: Cleanup the macro maze a bit +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Make the macro maze consistent and prepare it for adding the RT variant for +BH accounting. + + - Use nmi_count() for the NMI portion of preempt count + - Introduce in_hardirq() to make the naming consistent and non-ambiguos + - Use the macros to create combined checks (e.g. in_task()) so the + softirq representation for RT just falls into place. + - Update comments and move the deprecated macros aside + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Frederic Weisbecker <frederic@kernel.org> +Link: https://lore.kernel.org/r/20201113141733.864469886@linutronix.de +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/linux/preempt.h | 30 ++++++++++++++++-------------- + 1 file changed, 16 insertions(+), 14 deletions(-) + +diff --git a/include/linux/preempt.h b/include/linux/preempt.h +index 6df63cbe8bb0..69cc8b64aa3a 100644 +--- a/include/linux/preempt.h ++++ b/include/linux/preempt.h +@@ -77,31 +77,33 @@ + /* preempt_count() and related functions, depends on PREEMPT_NEED_RESCHED */ + #include <asm/preempt.h> + ++#define nmi_count() (preempt_count() & NMI_MASK) + #define hardirq_count() (preempt_count() & HARDIRQ_MASK) + #define softirq_count() (preempt_count() & SOFTIRQ_MASK) +-#define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \ +- | NMI_MASK)) ++#define irq_count() (nmi_count() | hardirq_count() | softirq_count()) + + /* +- * Are we doing bottom half or hardware interrupt processing? ++ * Macros to retrieve the current execution context: + * +- * in_irq() - We're in (hard) IRQ context ++ * in_nmi() - We're in NMI context ++ * in_hardirq() - We're in hard IRQ context ++ * in_serving_softirq() - We're in softirq context ++ * in_task() - We're in task context ++ */ ++#define in_nmi() (nmi_count()) ++#define in_hardirq() (hardirq_count()) ++#define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET) ++#define in_task() (!(in_nmi() | in_hardirq() | in_serving_softirq())) ++ ++/* ++ * The following macros are deprecated and should not be used in new code: ++ * in_irq() - Obsolete version of in_hardirq() + * in_softirq() - We have BH disabled, or are processing softirqs + * in_interrupt() - We're in NMI,IRQ,SoftIRQ context or have BH disabled +- * in_serving_softirq() - We're in softirq context +- * in_nmi() - We're in NMI context +- * in_task() - We're in task context +- * +- * Note: due to the BH disabled confusion: in_softirq(),in_interrupt() really +- * should not be used in new code. + */ + #define in_irq() (hardirq_count()) + #define in_softirq() (softirq_count()) + #define in_interrupt() (irq_count()) +-#define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET) +-#define in_nmi() (preempt_count() & NMI_MASK) +-#define in_task() (!(preempt_count() & \ +- (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET))) + + /* + * The preempt_count offset after preempt_disable(); +-- +2.43.0 + diff --git a/debian/patches-rt/0124-softirq-Move-related-code-into-one-section.patch b/debian/patches-rt/0124-softirq-Move-related-code-into-one-section.patch new file mode 100644 index 000000000..fb937d24b --- /dev/null +++ b/debian/patches-rt/0124-softirq-Move-related-code-into-one-section.patch @@ -0,0 +1,169 @@ +From e40ca378bfbd05514961947d57b163831bdc3bbe Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Fri, 13 Nov 2020 15:02:18 +0100 +Subject: [PATCH 124/323] softirq: Move related code into one section +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +To prepare for adding a RT aware variant of softirq serialization and +processing move related code into one section so the necessary #ifdeffery +is reduced to one. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: Frederic Weisbecker <frederic@kernel.org> +Link: https://lore.kernel.org/r/20201113141733.974214480@linutronix.de +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + kernel/softirq.c | 107 ++++++++++++++++++++++++----------------------- + 1 file changed, 54 insertions(+), 53 deletions(-) + +diff --git a/kernel/softirq.c b/kernel/softirq.c +index 09229ad82209..617009ccd82c 100644 +--- a/kernel/softirq.c ++++ b/kernel/softirq.c +@@ -92,6 +92,13 @@ static bool ksoftirqd_running(unsigned long pending) + !__kthread_should_park(tsk); + } + ++#ifdef CONFIG_TRACE_IRQFLAGS ++DEFINE_PER_CPU(int, hardirqs_enabled); ++DEFINE_PER_CPU(int, hardirq_context); ++EXPORT_PER_CPU_SYMBOL_GPL(hardirqs_enabled); ++EXPORT_PER_CPU_SYMBOL_GPL(hardirq_context); ++#endif ++ + /* + * preempt_count and SOFTIRQ_OFFSET usage: + * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving +@@ -102,17 +109,11 @@ static bool ksoftirqd_running(unsigned long pending) + * softirq and whether we just have bh disabled. + */ + ++#ifdef CONFIG_TRACE_IRQFLAGS + /* +- * This one is for softirq.c-internal use, +- * where hardirqs are disabled legitimately: ++ * This is for softirq.c-internal use, where hardirqs are disabled ++ * legitimately: + */ +-#ifdef CONFIG_TRACE_IRQFLAGS +- +-DEFINE_PER_CPU(int, hardirqs_enabled); +-DEFINE_PER_CPU(int, hardirq_context); +-EXPORT_PER_CPU_SYMBOL_GPL(hardirqs_enabled); +-EXPORT_PER_CPU_SYMBOL_GPL(hardirq_context); +- + void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) + { + unsigned long flags; +@@ -203,6 +204,50 @@ void __local_bh_enable_ip(unsigned long ip, unsigned int cnt) + } + EXPORT_SYMBOL(__local_bh_enable_ip); + ++static inline void invoke_softirq(void) ++{ ++ if (ksoftirqd_running(local_softirq_pending())) ++ return; ++ ++ if (!force_irqthreads) { ++#ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK ++ /* ++ * We can safely execute softirq on the current stack if ++ * it is the irq stack, because it should be near empty ++ * at this stage. ++ */ ++ __do_softirq(); ++#else ++ /* ++ * Otherwise, irq_exit() is called on the task stack that can ++ * be potentially deep already. So call softirq in its own stack ++ * to prevent from any overrun. ++ */ ++ do_softirq_own_stack(); ++#endif ++ } else { ++ wakeup_softirqd(); ++ } ++} ++ ++asmlinkage __visible void do_softirq(void) ++{ ++ __u32 pending; ++ unsigned long flags; ++ ++ if (in_interrupt()) ++ return; ++ ++ local_irq_save(flags); ++ ++ pending = local_softirq_pending(); ++ ++ if (pending && !ksoftirqd_running(pending)) ++ do_softirq_own_stack(); ++ ++ local_irq_restore(flags); ++} ++ + /* + * We restart softirq processing for at most MAX_SOFTIRQ_RESTART times, + * but break the loop if need_resched() is set or after 2 ms. +@@ -327,24 +372,6 @@ asmlinkage __visible void __softirq_entry __do_softirq(void) + current_restore_flags(old_flags, PF_MEMALLOC); + } + +-asmlinkage __visible void do_softirq(void) +-{ +- __u32 pending; +- unsigned long flags; +- +- if (in_interrupt()) +- return; +- +- local_irq_save(flags); +- +- pending = local_softirq_pending(); +- +- if (pending && !ksoftirqd_running(pending)) +- do_softirq_own_stack(); +- +- local_irq_restore(flags); +-} +- + /** + * irq_enter_rcu - Enter an interrupt context with RCU watching + */ +@@ -371,32 +398,6 @@ void irq_enter(void) + irq_enter_rcu(); + } + +-static inline void invoke_softirq(void) +-{ +- if (ksoftirqd_running(local_softirq_pending())) +- return; +- +- if (!force_irqthreads) { +-#ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK +- /* +- * We can safely execute softirq on the current stack if +- * it is the irq stack, because it should be near empty +- * at this stage. +- */ +- __do_softirq(); +-#else +- /* +- * Otherwise, irq_exit() is called on the task stack that can +- * be potentially deep already. So call softirq in its own stack +- * to prevent from any overrun. +- */ +- do_softirq_own_stack(); +-#endif +- } else { +- wakeup_softirqd(); +- } +-} +- + static inline void tick_irq_exit(void) + { + #ifdef CONFIG_NO_HZ_COMMON +-- +2.43.0 + diff --git a/debian/patches-rt/0125-sh-irq-Add-missing-closing-parentheses-in-arch_show_.patch b/debian/patches-rt/0125-sh-irq-Add-missing-closing-parentheses-in-arch_show_.patch new file mode 100644 index 000000000..4259c009f --- /dev/null +++ b/debian/patches-rt/0125-sh-irq-Add-missing-closing-parentheses-in-arch_show_.patch @@ -0,0 +1,40 @@ +From e8f074aae182d9f11f092885cc1a1dfaedbe8807 Mon Sep 17 00:00:00 2001 +From: Geert Uytterhoeven <geert+renesas@glider.be> +Date: Tue, 24 Nov 2020 14:06:56 +0100 +Subject: [PATCH 125/323] sh/irq: Add missing closing parentheses in + arch_show_interrupts() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + + arch/sh/kernel/irq.c: In function ‘arch_show_interrupts’: + arch/sh/kernel/irq.c:47:58: error: expected ‘)’ before ‘;’ token + 47 | seq_printf(p, "%10u ", per_cpu(irq_stat.__nmi_count, j); + | ^ + +Fixes: fe3f1d5d7cd3062c ("sh: Get rid of nmi_count()") +Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Link: https://lore.kernel.org/r/20201124130656.2741743-1-geert+renesas@glider.be +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + arch/sh/kernel/irq.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/arch/sh/kernel/irq.c b/arch/sh/kernel/irq.c +index 5addcb2c2da0..ab5f790b0cd2 100644 +--- a/arch/sh/kernel/irq.c ++++ b/arch/sh/kernel/irq.c +@@ -44,7 +44,7 @@ int arch_show_interrupts(struct seq_file *p, int prec) + + seq_printf(p, "%*s: ", prec, "NMI"); + for_each_online_cpu(j) +- seq_printf(p, "%10u ", per_cpu(irq_stat.__nmi_count, j); ++ seq_printf(p, "%10u ", per_cpu(irq_stat.__nmi_count, j)); + seq_printf(p, " Non-maskable interrupts\n"); + + seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count)); +-- +2.43.0 + diff --git a/debian/patches-rt/0126-sched-cputime-Remove-symbol-exports-from-IRQ-time-ac.patch b/debian/patches-rt/0126-sched-cputime-Remove-symbol-exports-from-IRQ-time-ac.patch new file mode 100644 index 000000000..3926556d0 --- /dev/null +++ b/debian/patches-rt/0126-sched-cputime-Remove-symbol-exports-from-IRQ-time-ac.patch @@ -0,0 +1,73 @@ +From d15b621168c7c422b8e43819c45aa749f3611fa6 Mon Sep 17 00:00:00 2001 +From: Frederic Weisbecker <frederic@kernel.org> +Date: Wed, 2 Dec 2020 12:57:28 +0100 +Subject: [PATCH 126/323] sched/cputime: Remove symbol exports from IRQ time + accounting +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +account_irq_enter_time() and account_irq_exit_time() are not called +from modules. EXPORT_SYMBOL_GPL() can be safely removed from the IRQ +cputime accounting functions called from there. + +Signed-off-by: Frederic Weisbecker <frederic@kernel.org> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Link: https://lore.kernel.org/r/20201202115732.27827-2-frederic@kernel.org +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + arch/s390/kernel/vtime.c | 10 +++++----- + kernel/sched/cputime.c | 2 -- + 2 files changed, 5 insertions(+), 7 deletions(-) + +diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c +index 579ec3a8c816..710135905deb 100644 +--- a/arch/s390/kernel/vtime.c ++++ b/arch/s390/kernel/vtime.c +@@ -227,7 +227,7 @@ void vtime_flush(struct task_struct *tsk) + * Update process times based on virtual cpu times stored by entry.S + * to the lowcore fields user_timer, system_timer & steal_clock. + */ +-void vtime_account_irq_enter(struct task_struct *tsk) ++void vtime_account_kernel(struct task_struct *tsk) + { + u64 timer; + +@@ -246,12 +246,12 @@ void vtime_account_irq_enter(struct task_struct *tsk) + + virt_timer_forward(timer); + } +-EXPORT_SYMBOL_GPL(vtime_account_irq_enter); +- +-void vtime_account_kernel(struct task_struct *tsk) +-__attribute__((alias("vtime_account_irq_enter"))); + EXPORT_SYMBOL_GPL(vtime_account_kernel); + ++void vtime_account_irq_enter(struct task_struct *tsk) ++__attribute__((alias("vtime_account_kernel"))); ++ ++ + /* + * Sorted add to a list. List is linear searched until first bigger + * element is found. +diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c +index ca0eef7d3852..e03568c5f584 100644 +--- a/kernel/sched/cputime.c ++++ b/kernel/sched/cputime.c +@@ -71,7 +71,6 @@ void irqtime_account_irq(struct task_struct *curr) + else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) + irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ); + } +-EXPORT_SYMBOL_GPL(irqtime_account_irq); + + static u64 irqtime_tick_accounted(u64 maxtime) + { +@@ -434,7 +433,6 @@ void vtime_account_irq_enter(struct task_struct *tsk) + else + vtime_account_kernel(tsk); + } +-EXPORT_SYMBOL_GPL(vtime_account_irq_enter); + #endif /* __ARCH_HAS_VTIME_ACCOUNT */ + + void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, +-- +2.43.0 + diff --git a/debian/patches-rt/0127-s390-vtime-Use-the-generic-IRQ-entry-accounting.patch b/debian/patches-rt/0127-s390-vtime-Use-the-generic-IRQ-entry-accounting.patch new file mode 100644 index 000000000..e4a67c29c --- /dev/null +++ b/debian/patches-rt/0127-s390-vtime-Use-the-generic-IRQ-entry-accounting.patch @@ -0,0 +1,126 @@ +From 433f928a0fd33d67682020e134ac00885b38a2a9 Mon Sep 17 00:00:00 2001 +From: Frederic Weisbecker <frederic@kernel.org> +Date: Wed, 2 Dec 2020 12:57:29 +0100 +Subject: [PATCH 127/323] s390/vtime: Use the generic IRQ entry accounting +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +s390 has its own version of IRQ entry accounting because it doesn't +account the idle time the same way the other architectures do. Only +the actual idle sleep time is accounted as idle time, the rest of the +idle task execution is accounted as system time. + +Make the generic IRQ entry accounting aware of architectures that have +their own way of accounting idle time and convert s390 to use it. + +This prepares s390 to get involved in further consolidations of IRQ +time accounting. + +Signed-off-by: Frederic Weisbecker <frederic@kernel.org> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Link: https://lore.kernel.org/r/20201202115732.27827-3-frederic@kernel.org +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + arch/Kconfig | 7 ++++++- + arch/s390/Kconfig | 1 + + arch/s390/include/asm/vtime.h | 1 - + arch/s390/kernel/vtime.c | 4 ---- + kernel/sched/cputime.c | 13 ++----------- + 5 files changed, 9 insertions(+), 17 deletions(-) + +diff --git a/arch/Kconfig b/arch/Kconfig +index 240277d5626c..628e1e7fe302 100644 +--- a/arch/Kconfig ++++ b/arch/Kconfig +@@ -646,6 +646,12 @@ config HAVE_TIF_NOHZ + config HAVE_VIRT_CPU_ACCOUNTING + bool + ++config HAVE_VIRT_CPU_ACCOUNTING_IDLE ++ bool ++ help ++ Architecture has its own way to account idle CPU time and therefore ++ doesn't implement vtime_account_idle(). ++ + config ARCH_HAS_SCALED_CPUTIME + bool + +@@ -660,7 +666,6 @@ config HAVE_VIRT_CPU_ACCOUNTING_GEN + some 32-bit arches may require multiple accesses, so proper + locking is needed to protect against concurrent accesses. + +- + config HAVE_IRQ_TIME_ACCOUNTING + bool + help +diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig +index 878993982e39..2e78071551e7 100644 +--- a/arch/s390/Kconfig ++++ b/arch/s390/Kconfig +@@ -183,6 +183,7 @@ config S390 + select HAVE_RSEQ + select HAVE_SYSCALL_TRACEPOINTS + select HAVE_VIRT_CPU_ACCOUNTING ++ select HAVE_VIRT_CPU_ACCOUNTING_IDLE + select IOMMU_HELPER if PCI + select IOMMU_SUPPORT if PCI + select MODULES_USE_ELF_RELA +diff --git a/arch/s390/include/asm/vtime.h b/arch/s390/include/asm/vtime.h +index 3622d4ebc73a..fac6a67988eb 100644 +--- a/arch/s390/include/asm/vtime.h ++++ b/arch/s390/include/asm/vtime.h +@@ -2,7 +2,6 @@ + #ifndef _S390_VTIME_H + #define _S390_VTIME_H + +-#define __ARCH_HAS_VTIME_ACCOUNT + #define __ARCH_HAS_VTIME_TASK_SWITCH + + #endif /* _S390_VTIME_H */ +diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c +index 710135905deb..18a97631af43 100644 +--- a/arch/s390/kernel/vtime.c ++++ b/arch/s390/kernel/vtime.c +@@ -248,10 +248,6 @@ void vtime_account_kernel(struct task_struct *tsk) + } + EXPORT_SYMBOL_GPL(vtime_account_kernel); + +-void vtime_account_irq_enter(struct task_struct *tsk) +-__attribute__((alias("vtime_account_kernel"))); +- +- + /* + * Sorted add to a list. List is linear searched until first bigger + * element is found. +diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c +index e03568c5f584..1d78b835ec8b 100644 +--- a/kernel/sched/cputime.c ++++ b/kernel/sched/cputime.c +@@ -417,23 +417,14 @@ void vtime_task_switch(struct task_struct *prev) + } + # endif + +-/* +- * Archs that account the whole time spent in the idle task +- * (outside irq) as idle time can rely on this and just implement +- * vtime_account_kernel() and vtime_account_idle(). Archs that +- * have other meaning of the idle time (s390 only includes the +- * time spent by the CPU when it's in low power mode) must override +- * vtime_account(). +- */ +-#ifndef __ARCH_HAS_VTIME_ACCOUNT + void vtime_account_irq_enter(struct task_struct *tsk) + { +- if (!in_interrupt() && is_idle_task(tsk)) ++ if (!IS_ENABLED(CONFIG_HAVE_VIRT_CPU_ACCOUNTING_IDLE) && ++ !in_interrupt() && is_idle_task(tsk)) + vtime_account_idle(tsk); + else + vtime_account_kernel(tsk); + } +-#endif /* __ARCH_HAS_VTIME_ACCOUNT */ + + void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, + u64 *ut, u64 *st) +-- +2.43.0 + diff --git a/debian/patches-rt/0128-sched-vtime-Consolidate-IRQ-time-accounting.patch b/debian/patches-rt/0128-sched-vtime-Consolidate-IRQ-time-accounting.patch new file mode 100644 index 000000000..bb1e03d33 --- /dev/null +++ b/debian/patches-rt/0128-sched-vtime-Consolidate-IRQ-time-accounting.patch @@ -0,0 +1,303 @@ +From 749e0a8d4b10240d5834e135f34c47d107aa1442 Mon Sep 17 00:00:00 2001 +From: Frederic Weisbecker <frederic@kernel.org> +Date: Wed, 2 Dec 2020 12:57:30 +0100 +Subject: [PATCH 128/323] sched/vtime: Consolidate IRQ time accounting +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +The 3 architectures implementing CONFIG_VIRT_CPU_ACCOUNTING_NATIVE +all have their own version of irq time accounting that dispatch the +cputime to the appropriate index: hardirq, softirq, system, idle, +guest... from an all-in-one function. + +Instead of having these ad-hoc versions, move the cputime destination +dispatch decision to the core code and leave only the actual per-index +cputime accounting to the architecture. + +Signed-off-by: Frederic Weisbecker <frederic@kernel.org> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Link: https://lore.kernel.org/r/20201202115732.27827-4-frederic@kernel.org +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + arch/ia64/kernel/time.c | 20 ++++++++++---- + arch/powerpc/kernel/time.c | 56 +++++++++++++++++++++++++++----------- + arch/s390/kernel/vtime.c | 45 +++++++++++++++++++++--------- + include/linux/vtime.h | 16 ++++------- + kernel/sched/cputime.c | 13 ++++++--- + 5 files changed, 102 insertions(+), 48 deletions(-) + +diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c +index 7abc5f37bfaf..733e0e3324b8 100644 +--- a/arch/ia64/kernel/time.c ++++ b/arch/ia64/kernel/time.c +@@ -138,12 +138,8 @@ void vtime_account_kernel(struct task_struct *tsk) + struct thread_info *ti = task_thread_info(tsk); + __u64 stime = vtime_delta(tsk); + +- if ((tsk->flags & PF_VCPU) && !irq_count()) ++ if (tsk->flags & PF_VCPU) + ti->gtime += stime; +- else if (hardirq_count()) +- ti->hardirq_time += stime; +- else if (in_serving_softirq()) +- ti->softirq_time += stime; + else + ti->stime += stime; + } +@@ -156,6 +152,20 @@ void vtime_account_idle(struct task_struct *tsk) + ti->idle_time += vtime_delta(tsk); + } + ++void vtime_account_softirq(struct task_struct *tsk) ++{ ++ struct thread_info *ti = task_thread_info(tsk); ++ ++ ti->softirq_time += vtime_delta(tsk); ++} ++ ++void vtime_account_hardirq(struct task_struct *tsk) ++{ ++ struct thread_info *ti = task_thread_info(tsk); ++ ++ ti->hardirq_time += vtime_delta(tsk); ++} ++ + #endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ + + static irqreturn_t +diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c +index ba9b54d35f57..588e081ca55b 100644 +--- a/arch/powerpc/kernel/time.c ++++ b/arch/powerpc/kernel/time.c +@@ -312,12 +312,11 @@ static unsigned long vtime_delta_scaled(struct cpu_accounting_data *acct, + return stime_scaled; + } + +-static unsigned long vtime_delta(struct task_struct *tsk, ++static unsigned long vtime_delta(struct cpu_accounting_data *acct, + unsigned long *stime_scaled, + unsigned long *steal_time) + { + unsigned long now, stime; +- struct cpu_accounting_data *acct = get_accounting(tsk); + + WARN_ON_ONCE(!irqs_disabled()); + +@@ -332,29 +331,30 @@ static unsigned long vtime_delta(struct task_struct *tsk, + return stime; + } + ++static void vtime_delta_kernel(struct cpu_accounting_data *acct, ++ unsigned long *stime, unsigned long *stime_scaled) ++{ ++ unsigned long steal_time; ++ ++ *stime = vtime_delta(acct, stime_scaled, &steal_time); ++ *stime -= min(*stime, steal_time); ++ acct->steal_time += steal_time; ++} ++ + void vtime_account_kernel(struct task_struct *tsk) + { +- unsigned long stime, stime_scaled, steal_time; + struct cpu_accounting_data *acct = get_accounting(tsk); ++ unsigned long stime, stime_scaled; + +- stime = vtime_delta(tsk, &stime_scaled, &steal_time); +- +- stime -= min(stime, steal_time); +- acct->steal_time += steal_time; ++ vtime_delta_kernel(acct, &stime, &stime_scaled); + +- if ((tsk->flags & PF_VCPU) && !irq_count()) { ++ if (tsk->flags & PF_VCPU) { + acct->gtime += stime; + #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME + acct->utime_scaled += stime_scaled; + #endif + } else { +- if (hardirq_count()) +- acct->hardirq_time += stime; +- else if (in_serving_softirq()) +- acct->softirq_time += stime; +- else +- acct->stime += stime; +- ++ acct->stime += stime; + #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME + acct->stime_scaled += stime_scaled; + #endif +@@ -367,10 +367,34 @@ void vtime_account_idle(struct task_struct *tsk) + unsigned long stime, stime_scaled, steal_time; + struct cpu_accounting_data *acct = get_accounting(tsk); + +- stime = vtime_delta(tsk, &stime_scaled, &steal_time); ++ stime = vtime_delta(acct, &stime_scaled, &steal_time); + acct->idle_time += stime + steal_time; + } + ++static void vtime_account_irq_field(struct cpu_accounting_data *acct, ++ unsigned long *field) ++{ ++ unsigned long stime, stime_scaled; ++ ++ vtime_delta_kernel(acct, &stime, &stime_scaled); ++ *field += stime; ++#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME ++ acct->stime_scaled += stime_scaled; ++#endif ++} ++ ++void vtime_account_softirq(struct task_struct *tsk) ++{ ++ struct cpu_accounting_data *acct = get_accounting(tsk); ++ vtime_account_irq_field(acct, &acct->softirq_time); ++} ++ ++void vtime_account_hardirq(struct task_struct *tsk) ++{ ++ struct cpu_accounting_data *acct = get_accounting(tsk); ++ vtime_account_irq_field(acct, &acct->hardirq_time); ++} ++ + static void vtime_flush_scaled(struct task_struct *tsk, + struct cpu_accounting_data *acct) + { +diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c +index 18a97631af43..9b3c5978b668 100644 +--- a/arch/s390/kernel/vtime.c ++++ b/arch/s390/kernel/vtime.c +@@ -223,31 +223,50 @@ void vtime_flush(struct task_struct *tsk) + S390_lowcore.avg_steal_timer = avg_steal; + } + ++static u64 vtime_delta(void) ++{ ++ u64 timer = S390_lowcore.last_update_timer; ++ ++ S390_lowcore.last_update_timer = get_vtimer(); ++ ++ return timer - S390_lowcore.last_update_timer; ++} ++ + /* + * Update process times based on virtual cpu times stored by entry.S + * to the lowcore fields user_timer, system_timer & steal_clock. + */ + void vtime_account_kernel(struct task_struct *tsk) + { +- u64 timer; +- +- timer = S390_lowcore.last_update_timer; +- S390_lowcore.last_update_timer = get_vtimer(); +- timer -= S390_lowcore.last_update_timer; ++ u64 delta = vtime_delta(); + +- if ((tsk->flags & PF_VCPU) && (irq_count() == 0)) +- S390_lowcore.guest_timer += timer; +- else if (hardirq_count()) +- S390_lowcore.hardirq_timer += timer; +- else if (in_serving_softirq()) +- S390_lowcore.softirq_timer += timer; ++ if (tsk->flags & PF_VCPU) ++ S390_lowcore.guest_timer += delta; + else +- S390_lowcore.system_timer += timer; ++ S390_lowcore.system_timer += delta; + +- virt_timer_forward(timer); ++ virt_timer_forward(delta); + } + EXPORT_SYMBOL_GPL(vtime_account_kernel); + ++void vtime_account_softirq(struct task_struct *tsk) ++{ ++ u64 delta = vtime_delta(); ++ ++ S390_lowcore.softirq_timer += delta; ++ ++ virt_timer_forward(delta); ++} ++ ++void vtime_account_hardirq(struct task_struct *tsk) ++{ ++ u64 delta = vtime_delta(); ++ ++ S390_lowcore.hardirq_timer += delta; ++ ++ virt_timer_forward(delta); ++} ++ + /* + * Sorted add to a list. List is linear searched until first bigger + * element is found. +diff --git a/include/linux/vtime.h b/include/linux/vtime.h +index 2cdeca062db3..6c9867419615 100644 +--- a/include/linux/vtime.h ++++ b/include/linux/vtime.h +@@ -83,16 +83,12 @@ static inline void vtime_init_idle(struct task_struct *tsk, int cpu) { } + #endif + + #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE +-extern void vtime_account_irq_enter(struct task_struct *tsk); +-static inline void vtime_account_irq_exit(struct task_struct *tsk) +-{ +- /* On hard|softirq exit we always account to hard|softirq cputime */ +- vtime_account_kernel(tsk); +-} ++extern void vtime_account_irq(struct task_struct *tsk); ++extern void vtime_account_softirq(struct task_struct *tsk); ++extern void vtime_account_hardirq(struct task_struct *tsk); + extern void vtime_flush(struct task_struct *tsk); + #else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ +-static inline void vtime_account_irq_enter(struct task_struct *tsk) { } +-static inline void vtime_account_irq_exit(struct task_struct *tsk) { } ++static inline void vtime_account_irq(struct task_struct *tsk) { } + static inline void vtime_flush(struct task_struct *tsk) { } + #endif + +@@ -105,13 +101,13 @@ static inline void irqtime_account_irq(struct task_struct *tsk) { } + + static inline void account_irq_enter_time(struct task_struct *tsk) + { +- vtime_account_irq_enter(tsk); ++ vtime_account_irq(tsk); + irqtime_account_irq(tsk); + } + + static inline void account_irq_exit_time(struct task_struct *tsk) + { +- vtime_account_irq_exit(tsk); ++ vtime_account_irq(tsk); + irqtime_account_irq(tsk); + } + +diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c +index 1d78b835ec8b..60129af16ed6 100644 +--- a/kernel/sched/cputime.c ++++ b/kernel/sched/cputime.c +@@ -417,13 +417,18 @@ void vtime_task_switch(struct task_struct *prev) + } + # endif + +-void vtime_account_irq_enter(struct task_struct *tsk) ++void vtime_account_irq(struct task_struct *tsk) + { +- if (!IS_ENABLED(CONFIG_HAVE_VIRT_CPU_ACCOUNTING_IDLE) && +- !in_interrupt() && is_idle_task(tsk)) ++ if (hardirq_count()) { ++ vtime_account_hardirq(tsk); ++ } else if (in_serving_softirq()) { ++ vtime_account_softirq(tsk); ++ } else if (!IS_ENABLED(CONFIG_HAVE_VIRT_CPU_ACCOUNTING_IDLE) && ++ is_idle_task(tsk)) { + vtime_account_idle(tsk); +- else ++ } else { + vtime_account_kernel(tsk); ++ } + } + + void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, +-- +2.43.0 + diff --git a/debian/patches-rt/0129-irqtime-Move-irqtime-entry-accounting-after-irq-offs.patch b/debian/patches-rt/0129-irqtime-Move-irqtime-entry-accounting-after-irq-offs.patch new file mode 100644 index 000000000..838a13489 --- /dev/null +++ b/debian/patches-rt/0129-irqtime-Move-irqtime-entry-accounting-after-irq-offs.patch @@ -0,0 +1,213 @@ +From e387d228099075ac3f762ccfbbe29f9ee59678c8 Mon Sep 17 00:00:00 2001 +From: Frederic Weisbecker <frederic@kernel.org> +Date: Wed, 2 Dec 2020 12:57:31 +0100 +Subject: [PATCH 129/323] irqtime: Move irqtime entry accounting after irq + offset incrementation +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +IRQ time entry is currently accounted before HARDIRQ_OFFSET or +SOFTIRQ_OFFSET are incremented. This is convenient to decide to which +index the cputime to account is dispatched. + +Unfortunately it prevents tick_irq_enter() from being called under +HARDIRQ_OFFSET because tick_irq_enter() has to be called before the IRQ +entry accounting due to the necessary clock catch up. As a result we +don't benefit from appropriate lockdep coverage on tick_irq_enter(). + +To prepare for fixing this, move the IRQ entry cputime accounting after +the preempt offset is incremented. This requires the cputime dispatch +code to handle the extra offset. + +Signed-off-by: Frederic Weisbecker <frederic@kernel.org> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> +Link: https://lore.kernel.org/r/20201202115732.27827-5-frederic@kernel.org +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/linux/hardirq.h | 4 ++-- + include/linux/vtime.h | 34 ++++++++++++++++++++++++---------- + kernel/sched/cputime.c | 18 +++++++++++------- + kernel/softirq.c | 6 +++--- + 4 files changed, 40 insertions(+), 22 deletions(-) + +diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h +index c35b71f8644a..0926e9ca4d85 100644 +--- a/include/linux/hardirq.h ++++ b/include/linux/hardirq.h +@@ -32,9 +32,9 @@ static __always_inline void rcu_irq_enter_check_tick(void) + */ + #define __irq_enter() \ + do { \ +- account_irq_enter_time(current); \ + preempt_count_add(HARDIRQ_OFFSET); \ + lockdep_hardirq_enter(); \ ++ account_hardirq_enter(current); \ + } while (0) + + /* +@@ -62,8 +62,8 @@ void irq_enter_rcu(void); + */ + #define __irq_exit() \ + do { \ ++ account_hardirq_exit(current); \ + lockdep_hardirq_exit(); \ +- account_irq_exit_time(current); \ + preempt_count_sub(HARDIRQ_OFFSET); \ + } while (0) + +diff --git a/include/linux/vtime.h b/include/linux/vtime.h +index 6c9867419615..041d6524d144 100644 +--- a/include/linux/vtime.h ++++ b/include/linux/vtime.h +@@ -83,32 +83,46 @@ static inline void vtime_init_idle(struct task_struct *tsk, int cpu) { } + #endif + + #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE +-extern void vtime_account_irq(struct task_struct *tsk); ++extern void vtime_account_irq(struct task_struct *tsk, unsigned int offset); + extern void vtime_account_softirq(struct task_struct *tsk); + extern void vtime_account_hardirq(struct task_struct *tsk); + extern void vtime_flush(struct task_struct *tsk); + #else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ +-static inline void vtime_account_irq(struct task_struct *tsk) { } ++static inline void vtime_account_irq(struct task_struct *tsk, unsigned int offset) { } ++static inline void vtime_account_softirq(struct task_struct *tsk) { } ++static inline void vtime_account_hardirq(struct task_struct *tsk) { } + static inline void vtime_flush(struct task_struct *tsk) { } + #endif + + + #ifdef CONFIG_IRQ_TIME_ACCOUNTING +-extern void irqtime_account_irq(struct task_struct *tsk); ++extern void irqtime_account_irq(struct task_struct *tsk, unsigned int offset); + #else +-static inline void irqtime_account_irq(struct task_struct *tsk) { } ++static inline void irqtime_account_irq(struct task_struct *tsk, unsigned int offset) { } + #endif + +-static inline void account_irq_enter_time(struct task_struct *tsk) ++static inline void account_softirq_enter(struct task_struct *tsk) + { +- vtime_account_irq(tsk); +- irqtime_account_irq(tsk); ++ vtime_account_irq(tsk, SOFTIRQ_OFFSET); ++ irqtime_account_irq(tsk, SOFTIRQ_OFFSET); + } + +-static inline void account_irq_exit_time(struct task_struct *tsk) ++static inline void account_softirq_exit(struct task_struct *tsk) + { +- vtime_account_irq(tsk); +- irqtime_account_irq(tsk); ++ vtime_account_softirq(tsk); ++ irqtime_account_irq(tsk, 0); ++} ++ ++static inline void account_hardirq_enter(struct task_struct *tsk) ++{ ++ vtime_account_irq(tsk, HARDIRQ_OFFSET); ++ irqtime_account_irq(tsk, HARDIRQ_OFFSET); ++} ++ ++static inline void account_hardirq_exit(struct task_struct *tsk) ++{ ++ vtime_account_hardirq(tsk); ++ irqtime_account_irq(tsk, 0); + } + + #endif /* _LINUX_KERNEL_VTIME_H */ +diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c +index 60129af16ed6..02a7932c7383 100644 +--- a/kernel/sched/cputime.c ++++ b/kernel/sched/cputime.c +@@ -44,12 +44,13 @@ static void irqtime_account_delta(struct irqtime *irqtime, u64 delta, + } + + /* +- * Called before incrementing preempt_count on {soft,}irq_enter ++ * Called after incrementing preempt_count on {soft,}irq_enter + * and before decrementing preempt_count on {soft,}irq_exit. + */ +-void irqtime_account_irq(struct task_struct *curr) ++void irqtime_account_irq(struct task_struct *curr, unsigned int offset) + { + struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime); ++ unsigned int pc; + s64 delta; + int cpu; + +@@ -59,6 +60,7 @@ void irqtime_account_irq(struct task_struct *curr) + cpu = smp_processor_id(); + delta = sched_clock_cpu(cpu) - irqtime->irq_start_time; + irqtime->irq_start_time += delta; ++ pc = preempt_count() - offset; + + /* + * We do not account for softirq time from ksoftirqd here. +@@ -66,9 +68,9 @@ void irqtime_account_irq(struct task_struct *curr) + * in that case, so as not to confuse scheduler with a special task + * that do not consume any time, but still wants to run. + */ +- if (hardirq_count()) ++ if (pc & HARDIRQ_MASK) + irqtime_account_delta(irqtime, delta, CPUTIME_IRQ); +- else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) ++ else if ((pc & SOFTIRQ_OFFSET) && curr != this_cpu_ksoftirqd()) + irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ); + } + +@@ -417,11 +419,13 @@ void vtime_task_switch(struct task_struct *prev) + } + # endif + +-void vtime_account_irq(struct task_struct *tsk) ++void vtime_account_irq(struct task_struct *tsk, unsigned int offset) + { +- if (hardirq_count()) { ++ unsigned int pc = preempt_count() - offset; ++ ++ if (pc & HARDIRQ_OFFSET) { + vtime_account_hardirq(tsk); +- } else if (in_serving_softirq()) { ++ } else if (pc & SOFTIRQ_OFFSET) { + vtime_account_softirq(tsk); + } else if (!IS_ENABLED(CONFIG_HAVE_VIRT_CPU_ACCOUNTING_IDLE) && + is_idle_task(tsk)) { +diff --git a/kernel/softirq.c b/kernel/softirq.c +index 617009ccd82c..b8f42b3ba8ca 100644 +--- a/kernel/softirq.c ++++ b/kernel/softirq.c +@@ -315,10 +315,10 @@ asmlinkage __visible void __softirq_entry __do_softirq(void) + current->flags &= ~PF_MEMALLOC; + + pending = local_softirq_pending(); +- account_irq_enter_time(current); + + __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET); + in_hardirq = lockdep_softirq_start(); ++ account_softirq_enter(current); + + restart: + /* Reset the pending bitmask before enabling irqs */ +@@ -365,8 +365,8 @@ asmlinkage __visible void __softirq_entry __do_softirq(void) + wakeup_softirqd(); + } + ++ account_softirq_exit(current); + lockdep_softirq_end(in_hardirq); +- account_irq_exit_time(current); + __local_bh_enable(SOFTIRQ_OFFSET); + WARN_ON_ONCE(in_interrupt()); + current_restore_flags(old_flags, PF_MEMALLOC); +@@ -418,7 +418,7 @@ static inline void __irq_exit_rcu(void) + #else + lockdep_assert_irqs_disabled(); + #endif +- account_irq_exit_time(current); ++ account_hardirq_exit(current); + preempt_count_sub(HARDIRQ_OFFSET); + if (!in_interrupt() && local_softirq_pending()) + invoke_softirq(); +-- +2.43.0 + diff --git a/debian/patches-rt/0130-irq-Call-tick_irq_enter-inside-HARDIRQ_OFFSET.patch b/debian/patches-rt/0130-irq-Call-tick_irq_enter-inside-HARDIRQ_OFFSET.patch new file mode 100644 index 000000000..304fef0fa --- /dev/null +++ b/debian/patches-rt/0130-irq-Call-tick_irq_enter-inside-HARDIRQ_OFFSET.patch @@ -0,0 +1,51 @@ +From 5e2b7533340eb96a841068c309aac4985bd8180c Mon Sep 17 00:00:00 2001 +From: Frederic Weisbecker <frederic@kernel.org> +Date: Wed, 2 Dec 2020 12:57:32 +0100 +Subject: [PATCH 130/323] irq: Call tick_irq_enter() inside HARDIRQ_OFFSET +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Now that account_hardirq_enter() is called after HARDIRQ_OFFSET has +been incremented, there is nothing left that prevents us from also +moving tick_irq_enter() after HARDIRQ_OFFSET is incremented. + +The desired outcome is to remove the nasty hack that prevents softirqs +from being raised through ksoftirqd instead of the hardirq bottom half. +Also tick_irq_enter() then becomes appropriately covered by lockdep. + +Signed-off-by: Frederic Weisbecker <frederic@kernel.org> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Link: https://lore.kernel.org/r/20201202115732.27827-6-frederic@kernel.org +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + kernel/softirq.c | 14 +++++--------- + 1 file changed, 5 insertions(+), 9 deletions(-) + +diff --git a/kernel/softirq.c b/kernel/softirq.c +index b8f42b3ba8ca..d5bfd5e661fc 100644 +--- a/kernel/softirq.c ++++ b/kernel/softirq.c +@@ -377,16 +377,12 @@ asmlinkage __visible void __softirq_entry __do_softirq(void) + */ + void irq_enter_rcu(void) + { +- if (is_idle_task(current) && !in_interrupt()) { +- /* +- * Prevent raise_softirq from needlessly waking up ksoftirqd +- * here, as softirq will be serviced on return from interrupt. +- */ +- local_bh_disable(); ++ __irq_enter_raw(); ++ ++ if (is_idle_task(current) && (irq_count() == HARDIRQ_OFFSET)) + tick_irq_enter(); +- _local_bh_enable(); +- } +- __irq_enter(); ++ ++ account_hardirq_enter(current); + } + + /** +-- +2.43.0 + diff --git a/debian/patches-rt/0131-smp-Wake-ksoftirqd-on-PREEMPT_RT-instead-do_softirq.patch b/debian/patches-rt/0131-smp-Wake-ksoftirqd-on-PREEMPT_RT-instead-do_softirq.patch new file mode 100644 index 000000000..19d4f6615 --- /dev/null +++ b/debian/patches-rt/0131-smp-Wake-ksoftirqd-on-PREEMPT_RT-instead-do_softirq.patch @@ -0,0 +1,48 @@ +From d6a969bae816bcd5ac549038a4582d300ede6b99 Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Mon, 15 Feb 2021 18:44:12 +0100 +Subject: [PATCH 131/323] smp: Wake ksoftirqd on PREEMPT_RT instead + do_softirq(). +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +The softirq implementation on PREEMPT_RT does not provide do_softirq(). +The other user of do_softirq() is replaced with a local_bh_disable() ++ enable() around the possible raise-softirq invocation. This can not be +done here because migration_cpu_stop() is invoked with disabled +preemption. + +Wake the softirq thread on PREEMPT_RT if there are any pending softirqs. + +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + kernel/smp.c | 14 ++++++++++++-- + 1 file changed, 12 insertions(+), 2 deletions(-) + +diff --git a/kernel/smp.c b/kernel/smp.c +index b0684b4c111e..4aaf11aa729c 100644 +--- a/kernel/smp.c ++++ b/kernel/smp.c +@@ -450,8 +450,18 @@ void flush_smp_call_function_from_idle(void) + + local_irq_save(flags); + flush_smp_call_function_queue(true); +- if (local_softirq_pending()) +- do_softirq(); ++ ++ if (local_softirq_pending()) { ++ ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { ++ do_softirq(); ++ } else { ++ struct task_struct *ksoftirqd = this_cpu_ksoftirqd(); ++ ++ if (ksoftirqd && ksoftirqd->state != TASK_RUNNING) ++ wake_up_process(ksoftirqd); ++ } ++ } + + local_irq_restore(flags); + } +-- +2.43.0 + diff --git a/debian/patches-rt/0132-tasklets-Replace-barrier-with-cpu_relax-in-tasklet_u.patch b/debian/patches-rt/0132-tasklets-Replace-barrier-with-cpu_relax-in-tasklet_u.patch new file mode 100644 index 000000000..8908254e0 --- /dev/null +++ b/debian/patches-rt/0132-tasklets-Replace-barrier-with-cpu_relax-in-tasklet_u.patch @@ -0,0 +1,35 @@ +From 8127bb05947dbb12f9537095932d6a7a7a07d09e Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Tue, 9 Mar 2021 09:42:04 +0100 +Subject: [PATCH 132/323] tasklets: Replace barrier() with cpu_relax() in + tasklet_unlock_wait() +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +A barrier() in a tight loop which waits for something to happen on a remote +CPU is a pointless exercise. Replace it with cpu_relax() which allows HT +siblings to make progress. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Tested-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/linux/interrupt.h | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h +index 0652b4858ba6..fad0eb9d42ab 100644 +--- a/include/linux/interrupt.h ++++ b/include/linux/interrupt.h +@@ -672,7 +672,8 @@ static inline void tasklet_unlock(struct tasklet_struct *t) + + static inline void tasklet_unlock_wait(struct tasklet_struct *t) + { +- while (test_bit(TASKLET_STATE_RUN, &(t)->state)) { barrier(); } ++ while (test_bit(TASKLET_STATE_RUN, &t->state)) ++ cpu_relax(); + } + #else + #define tasklet_trylock(t) 1 +-- +2.43.0 + diff --git a/debian/patches-rt/0133-tasklets-Use-static-inlines-for-stub-implementations.patch b/debian/patches-rt/0133-tasklets-Use-static-inlines-for-stub-implementations.patch new file mode 100644 index 000000000..57ec50a2e --- /dev/null +++ b/debian/patches-rt/0133-tasklets-Use-static-inlines-for-stub-implementations.patch @@ -0,0 +1,35 @@ +From fa31367f235c42f78b2375d5ade6130a3b296e22 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Tue, 9 Mar 2021 09:42:05 +0100 +Subject: [PATCH 133/323] tasklets: Use static inlines for stub implementations +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Inlines exist for a reason. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Tested-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/linux/interrupt.h | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h +index fad0eb9d42ab..75c4380afe9b 100644 +--- a/include/linux/interrupt.h ++++ b/include/linux/interrupt.h +@@ -676,9 +676,9 @@ static inline void tasklet_unlock_wait(struct tasklet_struct *t) + cpu_relax(); + } + #else +-#define tasklet_trylock(t) 1 +-#define tasklet_unlock_wait(t) do { } while (0) +-#define tasklet_unlock(t) do { } while (0) ++static inline int tasklet_trylock(struct tasklet_struct *t) { return 1; } ++static inline void tasklet_unlock(struct tasklet_struct *t) { } ++static inline void tasklet_unlock_wait(struct tasklet_struct *t) { } + #endif + + extern void __tasklet_schedule(struct tasklet_struct *t); +-- +2.43.0 + diff --git a/debian/patches-rt/0134-tasklets-Provide-tasklet_disable_in_atomic.patch b/debian/patches-rt/0134-tasklets-Provide-tasklet_disable_in_atomic.patch new file mode 100644 index 000000000..2f6fb1ab9 --- /dev/null +++ b/debian/patches-rt/0134-tasklets-Provide-tasklet_disable_in_atomic.patch @@ -0,0 +1,68 @@ +From 35c080a5db30acfdfa20ca84bf9d0482e6ec1409 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Tue, 9 Mar 2021 09:42:06 +0100 +Subject: [PATCH 134/323] tasklets: Provide tasklet_disable_in_atomic() +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Replacing the spin wait loops in tasklet_unlock_wait() with +wait_var_event() is not possible as a handful of tasklet_disable() +invocations are happening in atomic context. All other invocations are in +teardown paths which can sleep. + +Provide tasklet_disable_in_atomic() and tasklet_unlock_spin_wait() to +convert the few atomic use cases over, which allows to change +tasklet_disable() and tasklet_unlock_wait() in a later step. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/linux/interrupt.h | 22 ++++++++++++++++++++++ + 1 file changed, 22 insertions(+) + +diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h +index 75c4380afe9b..abba3eff4f86 100644 +--- a/include/linux/interrupt.h ++++ b/include/linux/interrupt.h +@@ -675,10 +675,21 @@ static inline void tasklet_unlock_wait(struct tasklet_struct *t) + while (test_bit(TASKLET_STATE_RUN, &t->state)) + cpu_relax(); + } ++ ++/* ++ * Do not use in new code. Waiting for tasklets from atomic contexts is ++ * error prone and should be avoided. ++ */ ++static inline void tasklet_unlock_spin_wait(struct tasklet_struct *t) ++{ ++ while (test_bit(TASKLET_STATE_RUN, &t->state)) ++ cpu_relax(); ++} + #else + static inline int tasklet_trylock(struct tasklet_struct *t) { return 1; } + static inline void tasklet_unlock(struct tasklet_struct *t) { } + static inline void tasklet_unlock_wait(struct tasklet_struct *t) { } ++static inline void tasklet_unlock_spin_wait(struct tasklet_struct *t) { } + #endif + + extern void __tasklet_schedule(struct tasklet_struct *t); +@@ -703,6 +714,17 @@ static inline void tasklet_disable_nosync(struct tasklet_struct *t) + smp_mb__after_atomic(); + } + ++/* ++ * Do not use in new code. Disabling tasklets from atomic contexts is ++ * error prone and should be avoided. ++ */ ++static inline void tasklet_disable_in_atomic(struct tasklet_struct *t) ++{ ++ tasklet_disable_nosync(t); ++ tasklet_unlock_spin_wait(t); ++ smp_mb(); ++} ++ + static inline void tasklet_disable(struct tasklet_struct *t) + { + tasklet_disable_nosync(t); +-- +2.43.0 + diff --git a/debian/patches-rt/0135-tasklets-Use-spin-wait-in-tasklet_disable-temporaril.patch b/debian/patches-rt/0135-tasklets-Use-spin-wait-in-tasklet_disable-temporaril.patch new file mode 100644 index 000000000..7e32a2a7c --- /dev/null +++ b/debian/patches-rt/0135-tasklets-Use-spin-wait-in-tasklet_disable-temporaril.patch @@ -0,0 +1,33 @@ +From 90d88360f539e57921ec9675603c700c351a0078 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Tue, 9 Mar 2021 09:42:07 +0100 +Subject: [PATCH 135/323] tasklets: Use spin wait in tasklet_disable() + temporarily +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +To ease the transition use spin waiting in tasklet_disable() until all +usage sites from atomic context have been cleaned up. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/linux/interrupt.h | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h +index abba3eff4f86..fe085c46f210 100644 +--- a/include/linux/interrupt.h ++++ b/include/linux/interrupt.h +@@ -728,7 +728,8 @@ static inline void tasklet_disable_in_atomic(struct tasklet_struct *t) + static inline void tasklet_disable(struct tasklet_struct *t) + { + tasklet_disable_nosync(t); +- tasklet_unlock_wait(t); ++ /* Spin wait until all atomic users are converted */ ++ tasklet_unlock_spin_wait(t); + smp_mb(); + } + +-- +2.43.0 + diff --git a/debian/patches-rt/0136-tasklets-Replace-spin-wait-in-tasklet_unlock_wait.patch b/debian/patches-rt/0136-tasklets-Replace-spin-wait-in-tasklet_unlock_wait.patch new file mode 100644 index 000000000..c244c4010 --- /dev/null +++ b/debian/patches-rt/0136-tasklets-Replace-spin-wait-in-tasklet_unlock_wait.patch @@ -0,0 +1,90 @@ +From 1e2be8ee6d91630326a3a3bc32925205b41d73a5 Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra <peterz@infradead.org> +Date: Tue, 9 Mar 2021 09:42:08 +0100 +Subject: [PATCH 136/323] tasklets: Replace spin wait in tasklet_unlock_wait() +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +tasklet_unlock_wait() spin waits for TASKLET_STATE_RUN to be cleared. This +is wasting CPU cycles in a tight loop which is especially painful in a +guest when the CPU running the tasklet is scheduled out. + +tasklet_unlock_wait() is invoked from tasklet_kill() which is used in +teardown paths and not performance critical at all. Replace the spin wait +with wait_var_event(). + +There are no users of tasklet_unlock_wait() which are invoked from atomic +contexts. The usage in tasklet_disable() has been replaced temporarily with +the spin waiting variant until the atomic users are fixed up and will be +converted to the sleep wait variant later. + +Signed-off-by: Peter Zijlstra <peterz@infradead.org> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/linux/interrupt.h | 13 ++----------- + kernel/softirq.c | 18 ++++++++++++++++++ + 2 files changed, 20 insertions(+), 11 deletions(-) + +diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h +index fe085c46f210..c4fafbfa28a6 100644 +--- a/include/linux/interrupt.h ++++ b/include/linux/interrupt.h +@@ -664,17 +664,8 @@ static inline int tasklet_trylock(struct tasklet_struct *t) + return !test_and_set_bit(TASKLET_STATE_RUN, &(t)->state); + } + +-static inline void tasklet_unlock(struct tasklet_struct *t) +-{ +- smp_mb__before_atomic(); +- clear_bit(TASKLET_STATE_RUN, &(t)->state); +-} +- +-static inline void tasklet_unlock_wait(struct tasklet_struct *t) +-{ +- while (test_bit(TASKLET_STATE_RUN, &t->state)) +- cpu_relax(); +-} ++void tasklet_unlock(struct tasklet_struct *t); ++void tasklet_unlock_wait(struct tasklet_struct *t); + + /* + * Do not use in new code. Waiting for tasklets from atomic contexts is +diff --git a/kernel/softirq.c b/kernel/softirq.c +index d5bfd5e661fc..06bca024ce45 100644 +--- a/kernel/softirq.c ++++ b/kernel/softirq.c +@@ -25,6 +25,7 @@ + #include <linux/smpboot.h> + #include <linux/tick.h> + #include <linux/irq.h> ++#include <linux/wait_bit.h> + + #define CREATE_TRACE_POINTS + #include <trace/events/irq.h> +@@ -619,6 +620,23 @@ void tasklet_kill(struct tasklet_struct *t) + } + EXPORT_SYMBOL(tasklet_kill); + ++#ifdef CONFIG_SMP ++void tasklet_unlock(struct tasklet_struct *t) ++{ ++ smp_mb__before_atomic(); ++ clear_bit(TASKLET_STATE_RUN, &t->state); ++ smp_mb__after_atomic(); ++ wake_up_var(&t->state); ++} ++EXPORT_SYMBOL_GPL(tasklet_unlock); ++ ++void tasklet_unlock_wait(struct tasklet_struct *t) ++{ ++ wait_var_event(&t->state, !test_bit(TASKLET_STATE_RUN, &t->state)); ++} ++EXPORT_SYMBOL_GPL(tasklet_unlock_wait); ++#endif ++ + void __init softirq_init(void) + { + int cpu; +-- +2.43.0 + diff --git a/debian/patches-rt/0137-tasklets-Replace-spin-wait-in-tasklet_kill.patch b/debian/patches-rt/0137-tasklets-Replace-spin-wait-in-tasklet_kill.patch new file mode 100644 index 000000000..619e62b2a --- /dev/null +++ b/debian/patches-rt/0137-tasklets-Replace-spin-wait-in-tasklet_kill.patch @@ -0,0 +1,74 @@ +From cc2fc16258410420e97a4873f760cffabcff49b0 Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra <peterz@infradead.org> +Date: Tue, 9 Mar 2021 09:42:09 +0100 +Subject: [PATCH 137/323] tasklets: Replace spin wait in tasklet_kill() +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +tasklet_kill() spin waits for TASKLET_STATE_SCHED to be cleared invoking +yield() from inside the loop. yield() is an ill defined mechanism and the +result might still be wasting CPU cycles in a tight loop which is +especially painful in a guest when the CPU running the tasklet is scheduled +out. + +tasklet_kill() is used in teardown paths and not performance critical at +all. Replace the spin wait with wait_var_event(). + +Signed-off-by: Peter Zijlstra <peterz@infradead.org> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + kernel/softirq.c | 23 +++++++++++++++-------- + 1 file changed, 15 insertions(+), 8 deletions(-) + +diff --git a/kernel/softirq.c b/kernel/softirq.c +index 06bca024ce45..ecc3ac4091c8 100644 +--- a/kernel/softirq.c ++++ b/kernel/softirq.c +@@ -530,6 +530,16 @@ void __tasklet_hi_schedule(struct tasklet_struct *t) + } + EXPORT_SYMBOL(__tasklet_hi_schedule); + ++static inline bool tasklet_clear_sched(struct tasklet_struct *t) ++{ ++ if (test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) { ++ wake_up_var(&t->state); ++ return true; ++ } ++ ++ return false; ++} ++ + static void tasklet_action_common(struct softirq_action *a, + struct tasklet_head *tl_head, + unsigned int softirq_nr) +@@ -549,8 +559,7 @@ static void tasklet_action_common(struct softirq_action *a, + + if (tasklet_trylock(t)) { + if (!atomic_read(&t->count)) { +- if (!test_and_clear_bit(TASKLET_STATE_SCHED, +- &t->state)) ++ if (!tasklet_clear_sched(t)) + BUG(); + if (t->use_callback) + t->callback(t); +@@ -610,13 +619,11 @@ void tasklet_kill(struct tasklet_struct *t) + if (in_interrupt()) + pr_notice("Attempt to kill tasklet from interrupt\n"); + +- while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) { +- do { +- yield(); +- } while (test_bit(TASKLET_STATE_SCHED, &t->state)); +- } ++ while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) ++ wait_var_event(&t->state, !test_bit(TASKLET_STATE_SCHED, &t->state)); ++ + tasklet_unlock_wait(t); +- clear_bit(TASKLET_STATE_SCHED, &t->state); ++ tasklet_clear_sched(t); + } + EXPORT_SYMBOL(tasklet_kill); + +-- +2.43.0 + diff --git a/debian/patches-rt/0138-tasklets-Prevent-tasklet_unlock_spin_wait-deadlock-o.patch b/debian/patches-rt/0138-tasklets-Prevent-tasklet_unlock_spin_wait-deadlock-o.patch new file mode 100644 index 000000000..9f446b54f --- /dev/null +++ b/debian/patches-rt/0138-tasklets-Prevent-tasklet_unlock_spin_wait-deadlock-o.patch @@ -0,0 +1,109 @@ +From 42114155db1a5548923ac48a09b3f228b29d3b09 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Tue, 9 Mar 2021 09:42:10 +0100 +Subject: [PATCH 138/323] tasklets: Prevent tasklet_unlock_spin_wait() deadlock + on RT +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +tasklet_unlock_spin_wait() spin waits for the TASKLET_STATE_SCHED bit in +the tasklet state to be cleared. This works on !RT nicely because the +corresponding execution can only happen on a different CPU. + +On RT softirq processing is preemptible, therefore a task preempting the +softirq processing thread can spin forever. + +Prevent this by invoking local_bh_disable()/enable() inside the loop. In +case that the softirq processing thread was preempted by the current task, +current will block on the local lock which yields the CPU to the preempted +softirq processing thread. If the tasklet is processed on a different CPU +then the local_bh_disable()/enable() pair is just a waste of processor +cycles. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Tested-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/linux/interrupt.h | 12 ++---------- + kernel/softirq.c | 28 +++++++++++++++++++++++++++- + 2 files changed, 29 insertions(+), 11 deletions(-) + +diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h +index c4fafbfa28a6..6eae70ec3a17 100644 +--- a/include/linux/interrupt.h ++++ b/include/linux/interrupt.h +@@ -658,7 +658,7 @@ enum + TASKLET_STATE_RUN /* Tasklet is running (SMP only) */ + }; + +-#ifdef CONFIG_SMP ++#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT) + static inline int tasklet_trylock(struct tasklet_struct *t) + { + return !test_and_set_bit(TASKLET_STATE_RUN, &(t)->state); +@@ -666,16 +666,8 @@ static inline int tasklet_trylock(struct tasklet_struct *t) + + void tasklet_unlock(struct tasklet_struct *t); + void tasklet_unlock_wait(struct tasklet_struct *t); ++void tasklet_unlock_spin_wait(struct tasklet_struct *t); + +-/* +- * Do not use in new code. Waiting for tasklets from atomic contexts is +- * error prone and should be avoided. +- */ +-static inline void tasklet_unlock_spin_wait(struct tasklet_struct *t) +-{ +- while (test_bit(TASKLET_STATE_RUN, &t->state)) +- cpu_relax(); +-} + #else + static inline int tasklet_trylock(struct tasklet_struct *t) { return 1; } + static inline void tasklet_unlock(struct tasklet_struct *t) { } +diff --git a/kernel/softirq.c b/kernel/softirq.c +index ecc3ac4091c8..fcb201ceed71 100644 +--- a/kernel/softirq.c ++++ b/kernel/softirq.c +@@ -614,6 +614,32 @@ void tasklet_init(struct tasklet_struct *t, + } + EXPORT_SYMBOL(tasklet_init); + ++#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT) ++/* ++ * Do not use in new code. Waiting for tasklets from atomic contexts is ++ * error prone and should be avoided. ++ */ ++void tasklet_unlock_spin_wait(struct tasklet_struct *t) ++{ ++ while (test_bit(TASKLET_STATE_RUN, &(t)->state)) { ++ if (IS_ENABLED(CONFIG_PREEMPT_RT)) { ++ /* ++ * Prevent a live lock when current preempted soft ++ * interrupt processing or prevents ksoftirqd from ++ * running. If the tasklet runs on a different CPU ++ * then this has no effect other than doing the BH ++ * disable/enable dance for nothing. ++ */ ++ local_bh_disable(); ++ local_bh_enable(); ++ } else { ++ cpu_relax(); ++ } ++ } ++} ++EXPORT_SYMBOL(tasklet_unlock_spin_wait); ++#endif ++ + void tasklet_kill(struct tasklet_struct *t) + { + if (in_interrupt()) +@@ -627,7 +653,7 @@ void tasklet_kill(struct tasklet_struct *t) + } + EXPORT_SYMBOL(tasklet_kill); + +-#ifdef CONFIG_SMP ++#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT) + void tasklet_unlock(struct tasklet_struct *t) + { + smp_mb__before_atomic(); +-- +2.43.0 + diff --git a/debian/patches-rt/0139-net-jme-Replace-link-change-tasklet-with-work.patch b/debian/patches-rt/0139-net-jme-Replace-link-change-tasklet-with-work.patch new file mode 100644 index 000000000..927a8ce66 --- /dev/null +++ b/debian/patches-rt/0139-net-jme-Replace-link-change-tasklet-with-work.patch @@ -0,0 +1,88 @@ +From 8aad52579188d8ae11265d495209d51029482f38 Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Tue, 9 Mar 2021 09:42:11 +0100 +Subject: [PATCH 139/323] net: jme: Replace link-change tasklet with work +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +The link change tasklet disables the tasklets for tx/rx processing while +upating hw parameters and then enables the tasklets again. + +This update can also be pushed into a workqueue where it can be performed +in preemptible context. This allows tasklet_disable() to become sleeping. + +Replace the linkch_task tasklet with a work. + +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + drivers/net/ethernet/jme.c | 10 +++++----- + drivers/net/ethernet/jme.h | 2 +- + 2 files changed, 6 insertions(+), 6 deletions(-) + +diff --git a/drivers/net/ethernet/jme.c b/drivers/net/ethernet/jme.c +index e9efe074edc1..f1b9284e0bea 100644 +--- a/drivers/net/ethernet/jme.c ++++ b/drivers/net/ethernet/jme.c +@@ -1265,9 +1265,9 @@ jme_stop_shutdown_timer(struct jme_adapter *jme) + jwrite32f(jme, JME_APMC, apmc); + } + +-static void jme_link_change_tasklet(struct tasklet_struct *t) ++static void jme_link_change_work(struct work_struct *work) + { +- struct jme_adapter *jme = from_tasklet(jme, t, linkch_task); ++ struct jme_adapter *jme = container_of(work, struct jme_adapter, linkch_task); + struct net_device *netdev = jme->dev; + int rc; + +@@ -1510,7 +1510,7 @@ jme_intr_msi(struct jme_adapter *jme, u32 intrstat) + * all other events are ignored + */ + jwrite32(jme, JME_IEVE, intrstat); +- tasklet_schedule(&jme->linkch_task); ++ schedule_work(&jme->linkch_task); + goto out_reenable; + } + +@@ -1832,7 +1832,6 @@ jme_open(struct net_device *netdev) + jme_clear_pm_disable_wol(jme); + JME_NAPI_ENABLE(jme); + +- tasklet_setup(&jme->linkch_task, jme_link_change_tasklet); + tasklet_setup(&jme->txclean_task, jme_tx_clean_tasklet); + tasklet_setup(&jme->rxclean_task, jme_rx_clean_tasklet); + tasklet_setup(&jme->rxempty_task, jme_rx_empty_tasklet); +@@ -1920,7 +1919,7 @@ jme_close(struct net_device *netdev) + + JME_NAPI_DISABLE(jme); + +- tasklet_kill(&jme->linkch_task); ++ cancel_work_sync(&jme->linkch_task); + tasklet_kill(&jme->txclean_task); + tasklet_kill(&jme->rxclean_task); + tasklet_kill(&jme->rxempty_task); +@@ -3035,6 +3034,7 @@ jme_init_one(struct pci_dev *pdev, + atomic_set(&jme->rx_empty, 1); + + tasklet_setup(&jme->pcc_task, jme_pcc_tasklet); ++ INIT_WORK(&jme->linkch_task, jme_link_change_work); + jme->dpi.cur = PCC_P1; + + jme->reg_ghc = 0; +diff --git a/drivers/net/ethernet/jme.h b/drivers/net/ethernet/jme.h +index a2c3b00d939d..2af76329b4a2 100644 +--- a/drivers/net/ethernet/jme.h ++++ b/drivers/net/ethernet/jme.h +@@ -411,7 +411,7 @@ struct jme_adapter { + struct tasklet_struct rxempty_task; + struct tasklet_struct rxclean_task; + struct tasklet_struct txclean_task; +- struct tasklet_struct linkch_task; ++ struct work_struct linkch_task; + struct tasklet_struct pcc_task; + unsigned long flags; + u32 reg_txcs; +-- +2.43.0 + diff --git a/debian/patches-rt/0140-net-sundance-Use-tasklet_disable_in_atomic.patch b/debian/patches-rt/0140-net-sundance-Use-tasklet_disable_in_atomic.patch new file mode 100644 index 000000000..cc8eade7d --- /dev/null +++ b/debian/patches-rt/0140-net-sundance-Use-tasklet_disable_in_atomic.patch @@ -0,0 +1,39 @@ +From b8d19bf463893a07bab9120bd628798515f3e6e0 Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Tue, 9 Mar 2021 09:42:12 +0100 +Subject: [PATCH 140/323] net: sundance: Use tasklet_disable_in_atomic(). +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +tasklet_disable() is used in the timer callback. This might be distangled, +but without access to the hardware that's a bit risky. + +Replace it with tasklet_disable_in_atomic() so tasklet_disable() can be +changed to a sleep wait once all remaining atomic users are converted. + +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Denis Kirjanov <kda@linux-powerpc.org> +Cc: "David S. Miller" <davem@davemloft.net> +Cc: Jakub Kicinski <kuba@kernel.org> +Cc: netdev@vger.kernel.org +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + drivers/net/ethernet/dlink/sundance.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/drivers/net/ethernet/dlink/sundance.c b/drivers/net/ethernet/dlink/sundance.c +index e3a8858915b3..df0eab479d51 100644 +--- a/drivers/net/ethernet/dlink/sundance.c ++++ b/drivers/net/ethernet/dlink/sundance.c +@@ -963,7 +963,7 @@ static void tx_timeout(struct net_device *dev, unsigned int txqueue) + unsigned long flag; + + netif_stop_queue(dev); +- tasklet_disable(&np->tx_tasklet); ++ tasklet_disable_in_atomic(&np->tx_tasklet); + iowrite16(0, ioaddr + IntrEnable); + printk(KERN_WARNING "%s: Transmit timed out, TxStatus %2.2x " + "TxFrameId %2.2x," +-- +2.43.0 + diff --git a/debian/patches-rt/0141-ath9k-Use-tasklet_disable_in_atomic.patch b/debian/patches-rt/0141-ath9k-Use-tasklet_disable_in_atomic.patch new file mode 100644 index 000000000..8e80fa205 --- /dev/null +++ b/debian/patches-rt/0141-ath9k-Use-tasklet_disable_in_atomic.patch @@ -0,0 +1,48 @@ +From 3e2ecd3c09d77800b53b99090c6b6372af1922e7 Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Tue, 9 Mar 2021 09:42:13 +0100 +Subject: [PATCH 141/323] ath9k: Use tasklet_disable_in_atomic() +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +All callers of ath9k_beacon_ensure_primary_slot() are preemptible / +acquire a mutex except for this callchain: + + spin_lock_bh(&sc->sc_pcu_lock); + ath_complete_reset() + -> ath9k_calculate_summary_state() + -> ath9k_beacon_ensure_primary_slot() + +It's unclear how that can be distangled, so use tasklet_disable_in_atomic() +for now. This allows tasklet_disable() to become sleepable once the +remaining atomic users are cleaned up. + +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: ath9k-devel@qca.qualcomm.com +Cc: Kalle Valo <kvalo@codeaurora.org> +Cc: "David S. Miller" <davem@davemloft.net> +Cc: Jakub Kicinski <kuba@kernel.org> +Cc: linux-wireless@vger.kernel.org +Cc: netdev@vger.kernel.org +Acked-by: Kalle Valo <kvalo@codeaurora.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + drivers/net/wireless/ath/ath9k/beacon.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/drivers/net/wireless/ath/ath9k/beacon.c b/drivers/net/wireless/ath/ath9k/beacon.c +index 71e2ada86793..72e2e71aac0e 100644 +--- a/drivers/net/wireless/ath/ath9k/beacon.c ++++ b/drivers/net/wireless/ath/ath9k/beacon.c +@@ -251,7 +251,7 @@ void ath9k_beacon_ensure_primary_slot(struct ath_softc *sc) + int first_slot = ATH_BCBUF; + int slot; + +- tasklet_disable(&sc->bcon_tasklet); ++ tasklet_disable_in_atomic(&sc->bcon_tasklet); + + /* Find first taken slot. */ + for (slot = 0; slot < ATH_BCBUF; slot++) { +-- +2.43.0 + diff --git a/debian/patches-rt/0142-atm-eni-Use-tasklet_disable_in_atomic-in-the-send-ca.patch b/debian/patches-rt/0142-atm-eni-Use-tasklet_disable_in_atomic-in-the-send-ca.patch new file mode 100644 index 000000000..45236c9da --- /dev/null +++ b/debian/patches-rt/0142-atm-eni-Use-tasklet_disable_in_atomic-in-the-send-ca.patch @@ -0,0 +1,42 @@ +From f8cac7fb539abd4d961c942dde93a04956f9615d Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Tue, 9 Mar 2021 09:42:14 +0100 +Subject: [PATCH 142/323] atm: eni: Use tasklet_disable_in_atomic() in the + send() callback +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +The atmdev_ops::send callback which calls tasklet_disable() is invoked with +bottom halfs disabled from net_device_ops::ndo_start_xmit(). All other +invocations of tasklet_disable() in this driver happen in preemptible +context. + +Change the send() call to use tasklet_disable_in_atomic() which allows +tasklet_disable() to be made sleepable once the remaining atomic context +usage sites are cleaned up. + +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Chas Williams <3chas3@gmail.com> +Cc: linux-atm-general@lists.sourceforge.net +Cc: netdev@vger.kernel.org +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + drivers/atm/eni.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/drivers/atm/eni.c b/drivers/atm/eni.c +index 9fcc49be499f..a31ffe16e626 100644 +--- a/drivers/atm/eni.c ++++ b/drivers/atm/eni.c +@@ -2056,7 +2056,7 @@ static int eni_send(struct atm_vcc *vcc,struct sk_buff *skb) + } + submitted++; + ATM_SKB(skb)->vcc = vcc; +- tasklet_disable(&ENI_DEV(vcc->dev)->task); ++ tasklet_disable_in_atomic(&ENI_DEV(vcc->dev)->task); + res = do_tx(skb); + tasklet_enable(&ENI_DEV(vcc->dev)->task); + if (res == enq_ok) return 0; +-- +2.43.0 + diff --git a/debian/patches-rt/0143-PCI-hv-Use-tasklet_disable_in_atomic.patch b/debian/patches-rt/0143-PCI-hv-Use-tasklet_disable_in_atomic.patch new file mode 100644 index 000000000..e4c764804 --- /dev/null +++ b/debian/patches-rt/0143-PCI-hv-Use-tasklet_disable_in_atomic.patch @@ -0,0 +1,46 @@ +From 267b1086437ac0cca8ca59197e5cc8cbe8fc6a83 Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Tue, 9 Mar 2021 09:42:15 +0100 +Subject: [PATCH 143/323] PCI: hv: Use tasklet_disable_in_atomic() +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +The hv_compose_msi_msg() callback in irq_chip::irq_compose_msi_msg is +invoked via irq_chip_compose_msi_msg(), which itself is always invoked from +atomic contexts from the guts of the interrupt core code. + +There is no way to change this w/o rewriting the whole driver, so use +tasklet_disable_in_atomic() which allows to make tasklet_disable() +sleepable once the remaining atomic users are addressed. + +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: "K. Y. Srinivasan" <kys@microsoft.com> +Cc: Haiyang Zhang <haiyangz@microsoft.com> +Cc: Stephen Hemminger <sthemmin@microsoft.com> +Cc: Wei Liu <wei.liu@kernel.org> +Cc: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com> +Cc: Rob Herring <robh@kernel.org> +Cc: Bjorn Helgaas <bhelgaas@google.com> +Cc: linux-hyperv@vger.kernel.org +Cc: linux-pci@vger.kernel.org +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + drivers/pci/controller/pci-hyperv.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c +index 2d6c77dcc815..7daba964f54e 100644 +--- a/drivers/pci/controller/pci-hyperv.c ++++ b/drivers/pci/controller/pci-hyperv.c +@@ -1518,7 +1518,7 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) + * Prevents hv_pci_onchannelcallback() from running concurrently + * in the tasklet. + */ +- tasklet_disable(&channel->callback_event); ++ tasklet_disable_in_atomic(&channel->callback_event); + + /* + * Since this function is called with IRQ locks held, can't +-- +2.43.0 + diff --git a/debian/patches-rt/0144-firewire-ohci-Use-tasklet_disable_in_atomic-where-re.patch b/debian/patches-rt/0144-firewire-ohci-Use-tasklet_disable_in_atomic-where-re.patch new file mode 100644 index 000000000..184d80276 --- /dev/null +++ b/debian/patches-rt/0144-firewire-ohci-Use-tasklet_disable_in_atomic-where-re.patch @@ -0,0 +1,61 @@ +From 090a450216e555382abe0be8e2941ecfcc527c44 Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Tue, 9 Mar 2021 09:42:16 +0100 +Subject: [PATCH 144/323] firewire: ohci: Use tasklet_disable_in_atomic() where + required +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +tasklet_disable() is invoked in several places. Some of them are in atomic +context which prevents a conversion of tasklet_disable() to a sleepable +function. + +The atomic callchains are: + + ar_context_tasklet() + ohci_cancel_packet() + tasklet_disable() + + ... + ohci_flush_iso_completions() + tasklet_disable() + +The invocation of tasklet_disable() from at_context_flush() is always in +preemptible context. + +Use tasklet_disable_in_atomic() for the two invocations in +ohci_cancel_packet() and ohci_flush_iso_completions(). + +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Stefan Richter <stefanr@s5r6.in-berlin.de> +Cc: linux1394-devel@lists.sourceforge.net +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + drivers/firewire/ohci.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/drivers/firewire/ohci.c b/drivers/firewire/ohci.c +index 9811c40956e5..17c9d825188b 100644 +--- a/drivers/firewire/ohci.c ++++ b/drivers/firewire/ohci.c +@@ -2545,7 +2545,7 @@ static int ohci_cancel_packet(struct fw_card *card, struct fw_packet *packet) + struct driver_data *driver_data = packet->driver_data; + int ret = -ENOENT; + +- tasklet_disable(&ctx->tasklet); ++ tasklet_disable_in_atomic(&ctx->tasklet); + + if (packet->ack != 0) + goto out; +@@ -3465,7 +3465,7 @@ static int ohci_flush_iso_completions(struct fw_iso_context *base) + struct iso_context *ctx = container_of(base, struct iso_context, base); + int ret = 0; + +- tasklet_disable(&ctx->context.tasklet); ++ tasklet_disable_in_atomic(&ctx->context.tasklet); + + if (!test_and_set_bit_lock(0, &ctx->flushing_completions)) { + context_tasklet((unsigned long)&ctx->context); +-- +2.43.0 + diff --git a/debian/patches-rt/0145-tasklets-Switch-tasklet_disable-to-the-sleep-wait-va.patch b/debian/patches-rt/0145-tasklets-Switch-tasklet_disable-to-the-sleep-wait-va.patch new file mode 100644 index 000000000..5b9765685 --- /dev/null +++ b/debian/patches-rt/0145-tasklets-Switch-tasklet_disable-to-the-sleep-wait-va.patch @@ -0,0 +1,35 @@ +From 24fc43821f54c5f95095fb5154422b54ba2777e2 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Tue, 9 Mar 2021 09:42:17 +0100 +Subject: [PATCH 145/323] tasklets: Switch tasklet_disable() to the sleep wait + variant +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + + -- NOT FOR IMMEDIATE MERGING -- + +Now that all users of tasklet_disable() are invoked from sleepable context, +convert it to use tasklet_unlock_wait() which might sleep. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/linux/interrupt.h | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h +index 6eae70ec3a17..a1c64d969532 100644 +--- a/include/linux/interrupt.h ++++ b/include/linux/interrupt.h +@@ -711,8 +711,7 @@ static inline void tasklet_disable_in_atomic(struct tasklet_struct *t) + static inline void tasklet_disable(struct tasklet_struct *t) + { + tasklet_disable_nosync(t); +- /* Spin wait until all atomic users are converted */ +- tasklet_unlock_spin_wait(t); ++ tasklet_unlock_wait(t); + smp_mb(); + } + +-- +2.43.0 + diff --git a/debian/patches-rt/0146-softirq-Add-RT-specific-softirq-accounting.patch b/debian/patches-rt/0146-softirq-Add-RT-specific-softirq-accounting.patch new file mode 100644 index 000000000..c4bcb9bd1 --- /dev/null +++ b/debian/patches-rt/0146-softirq-Add-RT-specific-softirq-accounting.patch @@ -0,0 +1,75 @@ +From a69df864da6d43ae3de6a4d995eaebca51a582b4 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Tue, 9 Mar 2021 09:55:53 +0100 +Subject: [PATCH 146/323] softirq: Add RT specific softirq accounting +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +RT requires the softirq processing and local bottomhalf disabled regions to +be preemptible. Using the normal preempt count based serialization is +therefore not possible because this implicitely disables preemption. + +RT kernels use a per CPU local lock to serialize bottomhalfs. As +local_bh_disable() can nest the lock can only be acquired on the outermost +invocation of local_bh_disable() and released when the nest count becomes +zero. Tasks which hold the local lock can be preempted so its required to +keep track of the nest count per task. + +Add a RT only counter to task struct and adjust the relevant macros in +preempt.h. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Tested-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Reviewed-by: Frederic Weisbecker <frederic@kernel.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/linux/hardirq.h | 1 + + include/linux/preempt.h | 6 +++++- + include/linux/sched.h | 3 +++ + 3 files changed, 9 insertions(+), 1 deletion(-) + +diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h +index 0926e9ca4d85..76878b357ffa 100644 +--- a/include/linux/hardirq.h ++++ b/include/linux/hardirq.h +@@ -6,6 +6,7 @@ + #include <linux/preempt.h> + #include <linux/lockdep.h> + #include <linux/ftrace_irq.h> ++#include <linux/sched.h> + #include <linux/vtime.h> + #include <asm/hardirq.h> + +diff --git a/include/linux/preempt.h b/include/linux/preempt.h +index 69cc8b64aa3a..9881eac0698f 100644 +--- a/include/linux/preempt.h ++++ b/include/linux/preempt.h +@@ -79,7 +79,11 @@ + + #define nmi_count() (preempt_count() & NMI_MASK) + #define hardirq_count() (preempt_count() & HARDIRQ_MASK) +-#define softirq_count() (preempt_count() & SOFTIRQ_MASK) ++#ifdef CONFIG_PREEMPT_RT ++# define softirq_count() (current->softirq_disable_cnt & SOFTIRQ_MASK) ++#else ++# define softirq_count() (preempt_count() & SOFTIRQ_MASK) ++#endif + #define irq_count() (nmi_count() | hardirq_count() | softirq_count()) + + /* +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 82de1ab42497..d31da4867bb2 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -1046,6 +1046,9 @@ struct task_struct { + int softirq_context; + int irq_config; + #endif ++#ifdef CONFIG_PREEMPT_RT ++ int softirq_disable_cnt; ++#endif + + #ifdef CONFIG_LOCKDEP + # define MAX_LOCK_DEPTH 48UL +-- +2.43.0 + diff --git a/debian/patches-rt/0147-irqtime-Make-accounting-correct-on-RT.patch b/debian/patches-rt/0147-irqtime-Make-accounting-correct-on-RT.patch new file mode 100644 index 000000000..576b5391b --- /dev/null +++ b/debian/patches-rt/0147-irqtime-Make-accounting-correct-on-RT.patch @@ -0,0 +1,54 @@ +From a4b2fa58f0b8dabae3f30be80fe317da32c5b121 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Tue, 9 Mar 2021 09:55:54 +0100 +Subject: [PATCH 147/323] irqtime: Make accounting correct on RT +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +vtime_account_irq and irqtime_account_irq() base checks on preempt_count() +which fails on RT because preempt_count() does not contain the softirq +accounting which is seperate on RT. + +These checks do not need the full preempt count as they only operate on the +hard and softirq sections. + +Use irq_count() instead which provides the correct value on both RT and non +RT kernels. The compiler is clever enough to fold the masking for !RT: + + 99b: 65 8b 05 00 00 00 00 mov %gs:0x0(%rip),%eax + - 9a2: 25 ff ff ff 7f and $0x7fffffff,%eax + + 9a2: 25 00 ff ff 00 and $0xffff00,%eax + +Reported-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Tested-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Reviewed-by: Frederic Weisbecker <frederic@kernel.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + kernel/sched/cputime.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c +index 02a7932c7383..02a5aa60fe7e 100644 +--- a/kernel/sched/cputime.c ++++ b/kernel/sched/cputime.c +@@ -60,7 +60,7 @@ void irqtime_account_irq(struct task_struct *curr, unsigned int offset) + cpu = smp_processor_id(); + delta = sched_clock_cpu(cpu) - irqtime->irq_start_time; + irqtime->irq_start_time += delta; +- pc = preempt_count() - offset; ++ pc = irq_count() - offset; + + /* + * We do not account for softirq time from ksoftirqd here. +@@ -421,7 +421,7 @@ void vtime_task_switch(struct task_struct *prev) + + void vtime_account_irq(struct task_struct *tsk, unsigned int offset) + { +- unsigned int pc = preempt_count() - offset; ++ unsigned int pc = irq_count() - offset; + + if (pc & HARDIRQ_OFFSET) { + vtime_account_hardirq(tsk); +-- +2.43.0 + diff --git a/debian/patches-rt/0148-softirq-Move-various-protections-into-inline-helpers.patch b/debian/patches-rt/0148-softirq-Move-various-protections-into-inline-helpers.patch new file mode 100644 index 000000000..e08794144 --- /dev/null +++ b/debian/patches-rt/0148-softirq-Move-various-protections-into-inline-helpers.patch @@ -0,0 +1,108 @@ +From 7d59f358f639cc9e535a4049d663d9347aef6380 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Tue, 9 Mar 2021 09:55:55 +0100 +Subject: [PATCH 148/323] softirq: Move various protections into inline helpers +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +To allow reuse of the bulk of softirq processing code for RT and to avoid +#ifdeffery all over the place, split protections for various code sections +out into inline helpers so the RT variant can just replace them in one go. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Tested-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Reviewed-by: Frederic Weisbecker <frederic@kernel.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + kernel/softirq.c | 39 ++++++++++++++++++++++++++++++++------- + 1 file changed, 32 insertions(+), 7 deletions(-) + +diff --git a/kernel/softirq.c b/kernel/softirq.c +index fcb201ceed71..87fac6ac0c32 100644 +--- a/kernel/softirq.c ++++ b/kernel/softirq.c +@@ -205,6 +205,32 @@ void __local_bh_enable_ip(unsigned long ip, unsigned int cnt) + } + EXPORT_SYMBOL(__local_bh_enable_ip); + ++static inline void softirq_handle_begin(void) ++{ ++ __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET); ++} ++ ++static inline void softirq_handle_end(void) ++{ ++ __local_bh_enable(SOFTIRQ_OFFSET); ++ WARN_ON_ONCE(in_interrupt()); ++} ++ ++static inline void ksoftirqd_run_begin(void) ++{ ++ local_irq_disable(); ++} ++ ++static inline void ksoftirqd_run_end(void) ++{ ++ local_irq_enable(); ++} ++ ++static inline bool should_wake_ksoftirqd(void) ++{ ++ return true; ++} ++ + static inline void invoke_softirq(void) + { + if (ksoftirqd_running(local_softirq_pending())) +@@ -317,7 +343,7 @@ asmlinkage __visible void __softirq_entry __do_softirq(void) + + pending = local_softirq_pending(); + +- __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET); ++ softirq_handle_begin(); + in_hardirq = lockdep_softirq_start(); + account_softirq_enter(current); + +@@ -368,8 +394,7 @@ asmlinkage __visible void __softirq_entry __do_softirq(void) + + account_softirq_exit(current); + lockdep_softirq_end(in_hardirq); +- __local_bh_enable(SOFTIRQ_OFFSET); +- WARN_ON_ONCE(in_interrupt()); ++ softirq_handle_end(); + current_restore_flags(old_flags, PF_MEMALLOC); + } + +@@ -464,7 +489,7 @@ inline void raise_softirq_irqoff(unsigned int nr) + * Otherwise we wake up ksoftirqd to make sure we + * schedule the softirq soon. + */ +- if (!in_interrupt()) ++ if (!in_interrupt() && should_wake_ksoftirqd()) + wakeup_softirqd(); + } + +@@ -692,18 +717,18 @@ static int ksoftirqd_should_run(unsigned int cpu) + + static void run_ksoftirqd(unsigned int cpu) + { +- local_irq_disable(); ++ ksoftirqd_run_begin(); + if (local_softirq_pending()) { + /* + * We can safely run softirq on inline stack, as we are not deep + * in the task stack here. + */ + __do_softirq(); +- local_irq_enable(); ++ ksoftirqd_run_end(); + cond_resched(); + return; + } +- local_irq_enable(); ++ ksoftirqd_run_end(); + } + + #ifdef CONFIG_HOTPLUG_CPU +-- +2.43.0 + diff --git a/debian/patches-rt/0149-softirq-Make-softirq-control-and-processing-RT-aware.patch b/debian/patches-rt/0149-softirq-Make-softirq-control-and-processing-RT-aware.patch new file mode 100644 index 000000000..22a88a79c --- /dev/null +++ b/debian/patches-rt/0149-softirq-Make-softirq-control-and-processing-RT-aware.patch @@ -0,0 +1,267 @@ +From d2c5f9de8094418b5ebade0ff54219371ec0a5ef Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Tue, 9 Mar 2021 09:55:56 +0100 +Subject: [PATCH 149/323] softirq: Make softirq control and processing RT aware +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Provide a local lock based serialization for soft interrupts on RT which +allows the local_bh_disabled() sections and servicing soft interrupts to be +preemptible. + +Provide the necessary inline helpers which allow to reuse the bulk of the +softirq processing code. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Tested-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Reviewed-by: Frederic Weisbecker <frederic@kernel.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/linux/bottom_half.h | 2 +- + kernel/softirq.c | 188 ++++++++++++++++++++++++++++++++++-- + 2 files changed, 182 insertions(+), 8 deletions(-) + +diff --git a/include/linux/bottom_half.h b/include/linux/bottom_half.h +index a19519f4241d..e4dd613a070e 100644 +--- a/include/linux/bottom_half.h ++++ b/include/linux/bottom_half.h +@@ -4,7 +4,7 @@ + + #include <linux/preempt.h> + +-#ifdef CONFIG_TRACE_IRQFLAGS ++#if defined(CONFIG_PREEMPT_RT) || defined(CONFIG_TRACE_IRQFLAGS) + extern void __local_bh_disable_ip(unsigned long ip, unsigned int cnt); + #else + static __always_inline void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) +diff --git a/kernel/softirq.c b/kernel/softirq.c +index 87fac6ac0c32..ed13f6097de8 100644 +--- a/kernel/softirq.c ++++ b/kernel/softirq.c +@@ -13,6 +13,7 @@ + #include <linux/kernel_stat.h> + #include <linux/interrupt.h> + #include <linux/init.h> ++#include <linux/local_lock.h> + #include <linux/mm.h> + #include <linux/notifier.h> + #include <linux/percpu.h> +@@ -101,20 +102,189 @@ EXPORT_PER_CPU_SYMBOL_GPL(hardirq_context); + #endif + + /* +- * preempt_count and SOFTIRQ_OFFSET usage: +- * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving +- * softirq processing. +- * - preempt_count is changed by SOFTIRQ_DISABLE_OFFSET (= 2 * SOFTIRQ_OFFSET) ++ * SOFTIRQ_OFFSET usage: ++ * ++ * On !RT kernels 'count' is the preempt counter, on RT kernels this applies ++ * to a per CPU counter and to task::softirqs_disabled_cnt. ++ * ++ * - count is changed by SOFTIRQ_OFFSET on entering or leaving softirq ++ * processing. ++ * ++ * - count is changed by SOFTIRQ_DISABLE_OFFSET (= 2 * SOFTIRQ_OFFSET) + * on local_bh_disable or local_bh_enable. ++ * + * This lets us distinguish between whether we are currently processing + * softirq and whether we just have bh disabled. + */ ++#ifdef CONFIG_PREEMPT_RT + +-#ifdef CONFIG_TRACE_IRQFLAGS + /* +- * This is for softirq.c-internal use, where hardirqs are disabled ++ * RT accounts for BH disabled sections in task::softirqs_disabled_cnt and ++ * also in per CPU softirq_ctrl::cnt. This is necessary to allow tasks in a ++ * softirq disabled section to be preempted. ++ * ++ * The per task counter is used for softirq_count(), in_softirq() and ++ * in_serving_softirqs() because these counts are only valid when the task ++ * holding softirq_ctrl::lock is running. ++ * ++ * The per CPU counter prevents pointless wakeups of ksoftirqd in case that ++ * the task which is in a softirq disabled section is preempted or blocks. ++ */ ++struct softirq_ctrl { ++ local_lock_t lock; ++ int cnt; ++}; ++ ++static DEFINE_PER_CPU(struct softirq_ctrl, softirq_ctrl) = { ++ .lock = INIT_LOCAL_LOCK(softirq_ctrl.lock), ++}; ++ ++void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) ++{ ++ unsigned long flags; ++ int newcnt; ++ ++ WARN_ON_ONCE(in_hardirq()); ++ ++ /* First entry of a task into a BH disabled section? */ ++ if (!current->softirq_disable_cnt) { ++ if (preemptible()) { ++ local_lock(&softirq_ctrl.lock); ++ /* Required to meet the RCU bottomhalf requirements. */ ++ rcu_read_lock(); ++ } else { ++ DEBUG_LOCKS_WARN_ON(this_cpu_read(softirq_ctrl.cnt)); ++ } ++ } ++ ++ /* ++ * Track the per CPU softirq disabled state. On RT this is per CPU ++ * state to allow preemption of bottom half disabled sections. ++ */ ++ newcnt = __this_cpu_add_return(softirq_ctrl.cnt, cnt); ++ /* ++ * Reflect the result in the task state to prevent recursion on the ++ * local lock and to make softirq_count() & al work. ++ */ ++ current->softirq_disable_cnt = newcnt; ++ ++ if (IS_ENABLED(CONFIG_TRACE_IRQFLAGS) && newcnt == cnt) { ++ raw_local_irq_save(flags); ++ lockdep_softirqs_off(ip); ++ raw_local_irq_restore(flags); ++ } ++} ++EXPORT_SYMBOL(__local_bh_disable_ip); ++ ++static void __local_bh_enable(unsigned int cnt, bool unlock) ++{ ++ unsigned long flags; ++ int newcnt; ++ ++ DEBUG_LOCKS_WARN_ON(current->softirq_disable_cnt != ++ this_cpu_read(softirq_ctrl.cnt)); ++ ++ if (IS_ENABLED(CONFIG_TRACE_IRQFLAGS) && softirq_count() == cnt) { ++ raw_local_irq_save(flags); ++ lockdep_softirqs_on(_RET_IP_); ++ raw_local_irq_restore(flags); ++ } ++ ++ newcnt = __this_cpu_sub_return(softirq_ctrl.cnt, cnt); ++ current->softirq_disable_cnt = newcnt; ++ ++ if (!newcnt && unlock) { ++ rcu_read_unlock(); ++ local_unlock(&softirq_ctrl.lock); ++ } ++} ++ ++void __local_bh_enable_ip(unsigned long ip, unsigned int cnt) ++{ ++ bool preempt_on = preemptible(); ++ unsigned long flags; ++ u32 pending; ++ int curcnt; ++ ++ WARN_ON_ONCE(in_irq()); ++ lockdep_assert_irqs_enabled(); ++ ++ local_irq_save(flags); ++ curcnt = __this_cpu_read(softirq_ctrl.cnt); ++ ++ /* ++ * If this is not reenabling soft interrupts, no point in trying to ++ * run pending ones. ++ */ ++ if (curcnt != cnt) ++ goto out; ++ ++ pending = local_softirq_pending(); ++ if (!pending || ksoftirqd_running(pending)) ++ goto out; ++ ++ /* ++ * If this was called from non preemptible context, wake up the ++ * softirq daemon. ++ */ ++ if (!preempt_on) { ++ wakeup_softirqd(); ++ goto out; ++ } ++ ++ /* ++ * Adjust softirq count to SOFTIRQ_OFFSET which makes ++ * in_serving_softirq() become true. ++ */ ++ cnt = SOFTIRQ_OFFSET; ++ __local_bh_enable(cnt, false); ++ __do_softirq(); ++ ++out: ++ __local_bh_enable(cnt, preempt_on); ++ local_irq_restore(flags); ++} ++EXPORT_SYMBOL(__local_bh_enable_ip); ++ ++/* ++ * Invoked from ksoftirqd_run() outside of the interrupt disabled section ++ * to acquire the per CPU local lock for reentrancy protection. ++ */ ++static inline void ksoftirqd_run_begin(void) ++{ ++ __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET); ++ local_irq_disable(); ++} ++ ++/* Counterpart to ksoftirqd_run_begin() */ ++static inline void ksoftirqd_run_end(void) ++{ ++ __local_bh_enable(SOFTIRQ_OFFSET, true); ++ WARN_ON_ONCE(in_interrupt()); ++ local_irq_enable(); ++} ++ ++static inline void softirq_handle_begin(void) { } ++static inline void softirq_handle_end(void) { } ++ ++static inline bool should_wake_ksoftirqd(void) ++{ ++ return !this_cpu_read(softirq_ctrl.cnt); ++} ++ ++static inline void invoke_softirq(void) ++{ ++ if (should_wake_ksoftirqd()) ++ wakeup_softirqd(); ++} ++ ++#else /* CONFIG_PREEMPT_RT */ ++ ++/* ++ * This one is for softirq.c-internal use, where hardirqs are disabled + * legitimately: + */ ++#ifdef CONFIG_TRACE_IRQFLAGS + void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) + { + unsigned long flags; +@@ -275,6 +445,8 @@ asmlinkage __visible void do_softirq(void) + local_irq_restore(flags); + } + ++#endif /* !CONFIG_PREEMPT_RT */ ++ + /* + * We restart softirq processing for at most MAX_SOFTIRQ_RESTART times, + * but break the loop if need_resched() is set or after 2 ms. +@@ -379,8 +551,10 @@ asmlinkage __visible void __softirq_entry __do_softirq(void) + pending >>= softirq_bit; + } + +- if (__this_cpu_read(ksoftirqd) == current) ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT) && ++ __this_cpu_read(ksoftirqd) == current) + rcu_softirq_qs(); ++ + local_irq_disable(); + + pending = local_softirq_pending(); +-- +2.43.0 + diff --git a/debian/patches-rt/0150-tick-sched-Prevent-false-positive-softirq-pending-wa.patch b/debian/patches-rt/0150-tick-sched-Prevent-false-positive-softirq-pending-wa.patch new file mode 100644 index 000000000..179b20d7b --- /dev/null +++ b/debian/patches-rt/0150-tick-sched-Prevent-false-positive-softirq-pending-wa.patch @@ -0,0 +1,84 @@ +From 8f4c53c804fbc30a305bf13376f5748a55ec4944 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Tue, 9 Mar 2021 09:55:57 +0100 +Subject: [PATCH 150/323] tick/sched: Prevent false positive softirq pending + warnings on RT +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +On RT a task which has soft interrupts disabled can block on a lock and +schedule out to idle while soft interrupts are pending. This triggers the +warning in the NOHZ idle code which complains about going idle with pending +soft interrupts. But as the task is blocked soft interrupt processing is +temporarily blocked as well which means that such a warning is a false +positive. + +To prevent that check the per CPU state which indicates that a scheduled +out task has soft interrupts disabled. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Tested-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Reviewed-by: Frederic Weisbecker <frederic@kernel.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/linux/bottom_half.h | 6 ++++++ + kernel/softirq.c | 15 +++++++++++++++ + kernel/time/tick-sched.c | 2 +- + 3 files changed, 22 insertions(+), 1 deletion(-) + +diff --git a/include/linux/bottom_half.h b/include/linux/bottom_half.h +index e4dd613a070e..eed86eb0a1de 100644 +--- a/include/linux/bottom_half.h ++++ b/include/linux/bottom_half.h +@@ -32,4 +32,10 @@ static inline void local_bh_enable(void) + __local_bh_enable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET); + } + ++#ifdef CONFIG_PREEMPT_RT ++extern bool local_bh_blocked(void); ++#else ++static inline bool local_bh_blocked(void) { return false; } ++#endif ++ + #endif /* _LINUX_BH_H */ +diff --git a/kernel/softirq.c b/kernel/softirq.c +index ed13f6097de8..c9adc5c46248 100644 +--- a/kernel/softirq.c ++++ b/kernel/softirq.c +@@ -139,6 +139,21 @@ static DEFINE_PER_CPU(struct softirq_ctrl, softirq_ctrl) = { + .lock = INIT_LOCAL_LOCK(softirq_ctrl.lock), + }; + ++/** ++ * local_bh_blocked() - Check for idle whether BH processing is blocked ++ * ++ * Returns false if the per CPU softirq::cnt is 0 otherwise true. ++ * ++ * This is invoked from the idle task to guard against false positive ++ * softirq pending warnings, which would happen when the task which holds ++ * softirq_ctrl::lock was the only running task on the CPU and blocks on ++ * some other lock. ++ */ ++bool local_bh_blocked(void) ++{ ++ return __this_cpu_read(softirq_ctrl.cnt) != 0; ++} ++ + void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) + { + unsigned long flags; +diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c +index fc79b04b5947..fb10be8e5e92 100644 +--- a/kernel/time/tick-sched.c ++++ b/kernel/time/tick-sched.c +@@ -982,7 +982,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) + if (unlikely(local_softirq_pending())) { + static int ratelimit; + +- if (ratelimit < 10 && ++ if (ratelimit < 10 && !local_bh_blocked() && + (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) { + pr_warn("NOHZ tick-stop error: Non-RCU local softirq work is pending, handler #%02x!!!\n", + (unsigned int) local_softirq_pending()); +-- +2.43.0 + diff --git a/debian/patches-rt/0151-rcu-Prevent-false-positive-softirq-warning-on-RT.patch b/debian/patches-rt/0151-rcu-Prevent-false-positive-softirq-warning-on-RT.patch new file mode 100644 index 000000000..14d81c8a1 --- /dev/null +++ b/debian/patches-rt/0151-rcu-Prevent-false-positive-softirq-warning-on-RT.patch @@ -0,0 +1,35 @@ +From cce7be95962e51a5270439e8c31500ed38123c49 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Tue, 9 Mar 2021 09:55:58 +0100 +Subject: [PATCH 151/323] rcu: Prevent false positive softirq warning on RT +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Soft interrupt disabled sections can legitimately be preempted or schedule +out when blocking on a lock on RT enabled kernels so the RCU preempt check +warning has to be disabled for RT kernels. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Tested-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Reviewed-by: Paul E. McKenney <paulmck@kernel.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/linux/rcupdate.h | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h +index ef8d56b18da6..bb9681309e0d 100644 +--- a/include/linux/rcupdate.h ++++ b/include/linux/rcupdate.h +@@ -338,7 +338,8 @@ static inline void rcu_preempt_sleep_check(void) { } + #define rcu_sleep_check() \ + do { \ + rcu_preempt_sleep_check(); \ +- RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map), \ ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) \ ++ RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map), \ + "Illegal context switch in RCU-bh read-side critical section"); \ + RCU_LOCKDEP_WARN(lock_is_held(&rcu_sched_lock_map), \ + "Illegal context switch in RCU-sched read-side critical section"); \ +-- +2.43.0 + diff --git a/debian/patches-rt/0152-chelsio-cxgb-Replace-the-workqueue-with-threaded-int.patch b/debian/patches-rt/0152-chelsio-cxgb-Replace-the-workqueue-with-threaded-int.patch new file mode 100644 index 000000000..45fec4e55 --- /dev/null +++ b/debian/patches-rt/0152-chelsio-cxgb-Replace-the-workqueue-with-threaded-int.patch @@ -0,0 +1,271 @@ +From a3a5c955eb9767e09f62d7d4c1af2f8f5b0fe565 Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Tue, 2 Feb 2021 18:01:03 +0100 +Subject: [PATCH 152/323] chelsio: cxgb: Replace the workqueue with threaded + interrupt +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +The external interrupt (F_PL_INTR_EXT) needs to be handled in a process +context and this is accomplished by utilizing a workqueue. + +The process context can also be provided by a threaded interrupt instead +of a workqueue. The threaded interrupt can be used later for other +interrupt related processing which require non-atomic context without +using yet another workqueue. free_irq() also ensures that the thread is +done which is currently missing (the worker could continue after the +module has been removed). + +Save pending flags in pending_thread_intr. Use the same mechanism +to disable F_PL_INTR_EXT as interrupt source like it is used before the +worker is scheduled. Enable the interrupt again once +t1_elmer0_ext_intr_handler() is done. + +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + drivers/net/ethernet/chelsio/cxgb/common.h | 5 +-- + drivers/net/ethernet/chelsio/cxgb/cxgb2.c | 44 ++-------------------- + drivers/net/ethernet/chelsio/cxgb/sge.c | 33 ++++++++++++++-- + drivers/net/ethernet/chelsio/cxgb/sge.h | 1 + + drivers/net/ethernet/chelsio/cxgb/subr.c | 26 +++++++++---- + 5 files changed, 55 insertions(+), 54 deletions(-) + +diff --git a/drivers/net/ethernet/chelsio/cxgb/common.h b/drivers/net/ethernet/chelsio/cxgb/common.h +index 6475060649e9..e999a9b9fe6c 100644 +--- a/drivers/net/ethernet/chelsio/cxgb/common.h ++++ b/drivers/net/ethernet/chelsio/cxgb/common.h +@@ -238,7 +238,6 @@ struct adapter { + int msg_enable; + u32 mmio_len; + +- struct work_struct ext_intr_handler_task; + struct adapter_params params; + + /* Terminator modules. */ +@@ -257,6 +256,7 @@ struct adapter { + + /* guards async operations */ + spinlock_t async_lock ____cacheline_aligned; ++ u32 pending_thread_intr; + u32 slow_intr_mask; + int t1powersave; + }; +@@ -334,8 +334,7 @@ void t1_interrupts_enable(adapter_t *adapter); + void t1_interrupts_disable(adapter_t *adapter); + void t1_interrupts_clear(adapter_t *adapter); + int t1_elmer0_ext_intr_handler(adapter_t *adapter); +-void t1_elmer0_ext_intr(adapter_t *adapter); +-int t1_slow_intr_handler(adapter_t *adapter); ++irqreturn_t t1_slow_intr_handler(adapter_t *adapter); + + int t1_link_start(struct cphy *phy, struct cmac *mac, struct link_config *lc); + const struct board_info *t1_get_board_info(unsigned int board_id); +diff --git a/drivers/net/ethernet/chelsio/cxgb/cxgb2.c b/drivers/net/ethernet/chelsio/cxgb/cxgb2.c +index c6db85fe1629..737c24136e2f 100644 +--- a/drivers/net/ethernet/chelsio/cxgb/cxgb2.c ++++ b/drivers/net/ethernet/chelsio/cxgb/cxgb2.c +@@ -211,9 +211,10 @@ static int cxgb_up(struct adapter *adapter) + t1_interrupts_clear(adapter); + + adapter->params.has_msi = !disable_msi && !pci_enable_msi(adapter->pdev); +- err = request_irq(adapter->pdev->irq, t1_interrupt, +- adapter->params.has_msi ? 0 : IRQF_SHARED, +- adapter->name, adapter); ++ err = request_threaded_irq(adapter->pdev->irq, t1_interrupt, ++ t1_interrupt_thread, ++ adapter->params.has_msi ? 0 : IRQF_SHARED, ++ adapter->name, adapter); + if (err) { + if (adapter->params.has_msi) + pci_disable_msi(adapter->pdev); +@@ -916,41 +917,6 @@ static void mac_stats_task(struct work_struct *work) + spin_unlock(&adapter->work_lock); + } + +-/* +- * Processes elmer0 external interrupts in process context. +- */ +-static void ext_intr_task(struct work_struct *work) +-{ +- struct adapter *adapter = +- container_of(work, struct adapter, ext_intr_handler_task); +- +- t1_elmer0_ext_intr_handler(adapter); +- +- /* Now reenable external interrupts */ +- spin_lock_irq(&adapter->async_lock); +- adapter->slow_intr_mask |= F_PL_INTR_EXT; +- writel(F_PL_INTR_EXT, adapter->regs + A_PL_CAUSE); +- writel(adapter->slow_intr_mask | F_PL_INTR_SGE_DATA, +- adapter->regs + A_PL_ENABLE); +- spin_unlock_irq(&adapter->async_lock); +-} +- +-/* +- * Interrupt-context handler for elmer0 external interrupts. +- */ +-void t1_elmer0_ext_intr(struct adapter *adapter) +-{ +- /* +- * Schedule a task to handle external interrupts as we require +- * a process context. We disable EXT interrupts in the interim +- * and let the task reenable them when it's done. +- */ +- adapter->slow_intr_mask &= ~F_PL_INTR_EXT; +- writel(adapter->slow_intr_mask | F_PL_INTR_SGE_DATA, +- adapter->regs + A_PL_ENABLE); +- schedule_work(&adapter->ext_intr_handler_task); +-} +- + void t1_fatal_err(struct adapter *adapter) + { + if (adapter->flags & FULL_INIT_DONE) { +@@ -1062,8 +1028,6 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) + spin_lock_init(&adapter->async_lock); + spin_lock_init(&adapter->mac_lock); + +- INIT_WORK(&adapter->ext_intr_handler_task, +- ext_intr_task); + INIT_DELAYED_WORK(&adapter->stats_update_task, + mac_stats_task); + +diff --git a/drivers/net/ethernet/chelsio/cxgb/sge.c b/drivers/net/ethernet/chelsio/cxgb/sge.c +index 2d9c2b5a690a..5aef9ae1ecfe 100644 +--- a/drivers/net/ethernet/chelsio/cxgb/sge.c ++++ b/drivers/net/ethernet/chelsio/cxgb/sge.c +@@ -1619,11 +1619,38 @@ int t1_poll(struct napi_struct *napi, int budget) + return work_done; + } + ++irqreturn_t t1_interrupt_thread(int irq, void *data) ++{ ++ struct adapter *adapter = data; ++ u32 pending_thread_intr; ++ ++ spin_lock_irq(&adapter->async_lock); ++ pending_thread_intr = adapter->pending_thread_intr; ++ adapter->pending_thread_intr = 0; ++ spin_unlock_irq(&adapter->async_lock); ++ ++ if (!pending_thread_intr) ++ return IRQ_NONE; ++ ++ if (pending_thread_intr & F_PL_INTR_EXT) ++ t1_elmer0_ext_intr_handler(adapter); ++ ++ spin_lock_irq(&adapter->async_lock); ++ adapter->slow_intr_mask |= F_PL_INTR_EXT; ++ ++ writel(F_PL_INTR_EXT, adapter->regs + A_PL_CAUSE); ++ writel(adapter->slow_intr_mask | F_PL_INTR_SGE_DATA, ++ adapter->regs + A_PL_ENABLE); ++ spin_unlock_irq(&adapter->async_lock); ++ ++ return IRQ_HANDLED; ++} ++ + irqreturn_t t1_interrupt(int irq, void *data) + { + struct adapter *adapter = data; + struct sge *sge = adapter->sge; +- int handled; ++ irqreturn_t handled; + + if (likely(responses_pending(adapter))) { + writel(F_PL_INTR_SGE_DATA, adapter->regs + A_PL_CAUSE); +@@ -1645,10 +1672,10 @@ irqreturn_t t1_interrupt(int irq, void *data) + handled = t1_slow_intr_handler(adapter); + spin_unlock(&adapter->async_lock); + +- if (!handled) ++ if (handled == IRQ_NONE) + sge->stats.unhandled_irqs++; + +- return IRQ_RETVAL(handled != 0); ++ return handled; + } + + /* +diff --git a/drivers/net/ethernet/chelsio/cxgb/sge.h b/drivers/net/ethernet/chelsio/cxgb/sge.h +index a1ba591b3431..76516d2a8aa9 100644 +--- a/drivers/net/ethernet/chelsio/cxgb/sge.h ++++ b/drivers/net/ethernet/chelsio/cxgb/sge.h +@@ -74,6 +74,7 @@ struct sge *t1_sge_create(struct adapter *, struct sge_params *); + int t1_sge_configure(struct sge *, struct sge_params *); + int t1_sge_set_coalesce_params(struct sge *, struct sge_params *); + void t1_sge_destroy(struct sge *); ++irqreturn_t t1_interrupt_thread(int irq, void *data); + irqreturn_t t1_interrupt(int irq, void *cookie); + int t1_poll(struct napi_struct *, int); + +diff --git a/drivers/net/ethernet/chelsio/cxgb/subr.c b/drivers/net/ethernet/chelsio/cxgb/subr.c +index ea0f8741d7cf..d90ad07ff1a4 100644 +--- a/drivers/net/ethernet/chelsio/cxgb/subr.c ++++ b/drivers/net/ethernet/chelsio/cxgb/subr.c +@@ -210,7 +210,7 @@ static int fpga_phy_intr_handler(adapter_t *adapter) + /* + * Slow path interrupt handler for FPGAs. + */ +-static int fpga_slow_intr(adapter_t *adapter) ++static irqreturn_t fpga_slow_intr(adapter_t *adapter) + { + u32 cause = readl(adapter->regs + A_PL_CAUSE); + +@@ -238,7 +238,7 @@ static int fpga_slow_intr(adapter_t *adapter) + if (cause) + writel(cause, adapter->regs + A_PL_CAUSE); + +- return cause != 0; ++ return cause == 0 ? IRQ_NONE : IRQ_HANDLED; + } + #endif + +@@ -842,13 +842,14 @@ void t1_interrupts_clear(adapter_t* adapter) + /* + * Slow path interrupt handler for ASICs. + */ +-static int asic_slow_intr(adapter_t *adapter) ++static irqreturn_t asic_slow_intr(adapter_t *adapter) + { + u32 cause = readl(adapter->regs + A_PL_CAUSE); ++ irqreturn_t ret = IRQ_HANDLED; + + cause &= adapter->slow_intr_mask; + if (!cause) +- return 0; ++ return IRQ_NONE; + if (cause & F_PL_INTR_SGE_ERR) + t1_sge_intr_error_handler(adapter->sge); + if (cause & F_PL_INTR_TP) +@@ -857,16 +858,25 @@ static int asic_slow_intr(adapter_t *adapter) + t1_espi_intr_handler(adapter->espi); + if (cause & F_PL_INTR_PCIX) + t1_pci_intr_handler(adapter); +- if (cause & F_PL_INTR_EXT) +- t1_elmer0_ext_intr(adapter); ++ if (cause & F_PL_INTR_EXT) { ++ /* Wake the threaded interrupt to handle external interrupts as ++ * we require a process context. We disable EXT interrupts in ++ * the interim and let the thread reenable them when it's done. ++ */ ++ adapter->pending_thread_intr |= F_PL_INTR_EXT; ++ adapter->slow_intr_mask &= ~F_PL_INTR_EXT; ++ writel(adapter->slow_intr_mask | F_PL_INTR_SGE_DATA, ++ adapter->regs + A_PL_ENABLE); ++ ret = IRQ_WAKE_THREAD; ++ } + + /* Clear the interrupts just processed. */ + writel(cause, adapter->regs + A_PL_CAUSE); + readl(adapter->regs + A_PL_CAUSE); /* flush writes */ +- return 1; ++ return ret; + } + +-int t1_slow_intr_handler(adapter_t *adapter) ++irqreturn_t t1_slow_intr_handler(adapter_t *adapter) + { + #ifdef CONFIG_CHELSIO_T1_1G + if (!t1_is_asic(adapter)) +-- +2.43.0 + diff --git a/debian/patches-rt/0153-chelsio-cxgb-Disable-the-card-on-error-in-threaded-i.patch b/debian/patches-rt/0153-chelsio-cxgb-Disable-the-card-on-error-in-threaded-i.patch new file mode 100644 index 000000000..910927009 --- /dev/null +++ b/debian/patches-rt/0153-chelsio-cxgb-Disable-the-card-on-error-in-threaded-i.patch @@ -0,0 +1,215 @@ +From 676e60d5a1de49b6188285212b7655ccd463f984 Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Tue, 2 Feb 2021 18:01:04 +0100 +Subject: [PATCH 153/323] chelsio: cxgb: Disable the card on error in threaded + interrupt +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +t1_fatal_err() is invoked from the interrupt handler. The bad part is +that it invokes (via t1_sge_stop()) del_timer_sync() and tasklet_kill(). +Both functions must not be called from an interrupt because it is +possible that it will wait for the completion of the timer/tasklet it +just interrupted. + +In case of a fatal error, use t1_interrupts_disable() to disable all +interrupt sources and then wake the interrupt thread with +F_PL_INTR_SGE_ERR as pending flag. The threaded-interrupt will stop the +card via t1_sge_stop() and not re-enable the interrupts again. + +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + drivers/net/ethernet/chelsio/cxgb/common.h | 1 - + drivers/net/ethernet/chelsio/cxgb/cxgb2.c | 10 ------ + drivers/net/ethernet/chelsio/cxgb/sge.c | 20 +++++++++--- + drivers/net/ethernet/chelsio/cxgb/sge.h | 2 +- + drivers/net/ethernet/chelsio/cxgb/subr.c | 38 +++++++++++++++------- + 5 files changed, 44 insertions(+), 27 deletions(-) + +diff --git a/drivers/net/ethernet/chelsio/cxgb/common.h b/drivers/net/ethernet/chelsio/cxgb/common.h +index e999a9b9fe6c..0321be77366c 100644 +--- a/drivers/net/ethernet/chelsio/cxgb/common.h ++++ b/drivers/net/ethernet/chelsio/cxgb/common.h +@@ -346,7 +346,6 @@ int t1_get_board_rev(adapter_t *adapter, const struct board_info *bi, + int t1_init_hw_modules(adapter_t *adapter); + int t1_init_sw_modules(adapter_t *adapter, const struct board_info *bi); + void t1_free_sw_modules(adapter_t *adapter); +-void t1_fatal_err(adapter_t *adapter); + void t1_link_changed(adapter_t *adapter, int port_id); + void t1_link_negotiated(adapter_t *adapter, int port_id, int link_stat, + int speed, int duplex, int pause); +diff --git a/drivers/net/ethernet/chelsio/cxgb/cxgb2.c b/drivers/net/ethernet/chelsio/cxgb/cxgb2.c +index 737c24136e2f..2a28a38da036 100644 +--- a/drivers/net/ethernet/chelsio/cxgb/cxgb2.c ++++ b/drivers/net/ethernet/chelsio/cxgb/cxgb2.c +@@ -917,16 +917,6 @@ static void mac_stats_task(struct work_struct *work) + spin_unlock(&adapter->work_lock); + } + +-void t1_fatal_err(struct adapter *adapter) +-{ +- if (adapter->flags & FULL_INIT_DONE) { +- t1_sge_stop(adapter->sge); +- t1_interrupts_disable(adapter); +- } +- pr_alert("%s: encountered fatal error, operation suspended\n", +- adapter->name); +-} +- + static const struct net_device_ops cxgb_netdev_ops = { + .ndo_open = cxgb_open, + .ndo_stop = cxgb_close, +diff --git a/drivers/net/ethernet/chelsio/cxgb/sge.c b/drivers/net/ethernet/chelsio/cxgb/sge.c +index 5aef9ae1ecfe..cda01f22c71c 100644 +--- a/drivers/net/ethernet/chelsio/cxgb/sge.c ++++ b/drivers/net/ethernet/chelsio/cxgb/sge.c +@@ -940,10 +940,11 @@ void t1_sge_intr_clear(struct sge *sge) + /* + * SGE 'Error' interrupt handler + */ +-int t1_sge_intr_error_handler(struct sge *sge) ++bool t1_sge_intr_error_handler(struct sge *sge) + { + struct adapter *adapter = sge->adapter; + u32 cause = readl(adapter->regs + A_SG_INT_CAUSE); ++ bool wake = false; + + if (adapter->port[0].dev->hw_features & NETIF_F_TSO) + cause &= ~F_PACKET_TOO_BIG; +@@ -967,11 +968,14 @@ int t1_sge_intr_error_handler(struct sge *sge) + sge->stats.pkt_mismatch++; + pr_alert("%s: SGE packet mismatch\n", adapter->name); + } +- if (cause & SGE_INT_FATAL) +- t1_fatal_err(adapter); ++ if (cause & SGE_INT_FATAL) { ++ t1_interrupts_disable(adapter); ++ adapter->pending_thread_intr |= F_PL_INTR_SGE_ERR; ++ wake = true; ++ } + + writel(cause, adapter->regs + A_SG_INT_CAUSE); +- return 0; ++ return wake; + } + + const struct sge_intr_counts *t1_sge_get_intr_counts(const struct sge *sge) +@@ -1635,6 +1639,14 @@ irqreturn_t t1_interrupt_thread(int irq, void *data) + if (pending_thread_intr & F_PL_INTR_EXT) + t1_elmer0_ext_intr_handler(adapter); + ++ /* This error is fatal, interrupts remain off */ ++ if (pending_thread_intr & F_PL_INTR_SGE_ERR) { ++ pr_alert("%s: encountered fatal error, operation suspended\n", ++ adapter->name); ++ t1_sge_stop(adapter->sge); ++ return IRQ_HANDLED; ++ } ++ + spin_lock_irq(&adapter->async_lock); + adapter->slow_intr_mask |= F_PL_INTR_EXT; + +diff --git a/drivers/net/ethernet/chelsio/cxgb/sge.h b/drivers/net/ethernet/chelsio/cxgb/sge.h +index 76516d2a8aa9..716705b96f26 100644 +--- a/drivers/net/ethernet/chelsio/cxgb/sge.h ++++ b/drivers/net/ethernet/chelsio/cxgb/sge.h +@@ -82,7 +82,7 @@ netdev_tx_t t1_start_xmit(struct sk_buff *skb, struct net_device *dev); + void t1_vlan_mode(struct adapter *adapter, netdev_features_t features); + void t1_sge_start(struct sge *); + void t1_sge_stop(struct sge *); +-int t1_sge_intr_error_handler(struct sge *); ++bool t1_sge_intr_error_handler(struct sge *sge); + void t1_sge_intr_enable(struct sge *); + void t1_sge_intr_disable(struct sge *); + void t1_sge_intr_clear(struct sge *); +diff --git a/drivers/net/ethernet/chelsio/cxgb/subr.c b/drivers/net/ethernet/chelsio/cxgb/subr.c +index d90ad07ff1a4..310add28fcf5 100644 +--- a/drivers/net/ethernet/chelsio/cxgb/subr.c ++++ b/drivers/net/ethernet/chelsio/cxgb/subr.c +@@ -170,7 +170,7 @@ void t1_link_changed(adapter_t *adapter, int port_id) + t1_link_negotiated(adapter, port_id, link_ok, speed, duplex, fc); + } + +-static int t1_pci_intr_handler(adapter_t *adapter) ++static bool t1_pci_intr_handler(adapter_t *adapter) + { + u32 pcix_cause; + +@@ -179,9 +179,13 @@ static int t1_pci_intr_handler(adapter_t *adapter) + if (pcix_cause) { + pci_write_config_dword(adapter->pdev, A_PCICFG_INTR_CAUSE, + pcix_cause); +- t1_fatal_err(adapter); /* PCI errors are fatal */ ++ /* PCI errors are fatal */ ++ t1_interrupts_disable(adapter); ++ adapter->pending_thread_intr |= F_PL_INTR_SGE_ERR; ++ pr_alert("%s: PCI error encountered.\n", adapter->name); ++ return true; + } +- return 0; ++ return false; + } + + #ifdef CONFIG_CHELSIO_T1_1G +@@ -213,10 +217,13 @@ static int fpga_phy_intr_handler(adapter_t *adapter) + static irqreturn_t fpga_slow_intr(adapter_t *adapter) + { + u32 cause = readl(adapter->regs + A_PL_CAUSE); ++ irqreturn_t ret = IRQ_NONE; + + cause &= ~F_PL_INTR_SGE_DATA; +- if (cause & F_PL_INTR_SGE_ERR) +- t1_sge_intr_error_handler(adapter->sge); ++ if (cause & F_PL_INTR_SGE_ERR) { ++ if (t1_sge_intr_error_handler(adapter->sge)) ++ ret = IRQ_WAKE_THREAD; ++ } + + if (cause & FPGA_PCIX_INTERRUPT_GMAC) + fpga_phy_intr_handler(adapter); +@@ -231,13 +238,18 @@ static irqreturn_t fpga_slow_intr(adapter_t *adapter) + /* Clear TP interrupt */ + writel(tp_cause, adapter->regs + FPGA_TP_ADDR_INTERRUPT_CAUSE); + } +- if (cause & FPGA_PCIX_INTERRUPT_PCIX) +- t1_pci_intr_handler(adapter); ++ if (cause & FPGA_PCIX_INTERRUPT_PCIX) { ++ if (t1_pci_intr_handler(adapter)) ++ ret = IRQ_WAKE_THREAD; ++ } + + /* Clear the interrupts just processed. */ + if (cause) + writel(cause, adapter->regs + A_PL_CAUSE); + ++ if (ret != IRQ_NONE) ++ return ret; ++ + return cause == 0 ? IRQ_NONE : IRQ_HANDLED; + } + #endif +@@ -850,14 +862,18 @@ static irqreturn_t asic_slow_intr(adapter_t *adapter) + cause &= adapter->slow_intr_mask; + if (!cause) + return IRQ_NONE; +- if (cause & F_PL_INTR_SGE_ERR) +- t1_sge_intr_error_handler(adapter->sge); ++ if (cause & F_PL_INTR_SGE_ERR) { ++ if (t1_sge_intr_error_handler(adapter->sge)) ++ ret = IRQ_WAKE_THREAD; ++ } + if (cause & F_PL_INTR_TP) + t1_tp_intr_handler(adapter->tp); + if (cause & F_PL_INTR_ESPI) + t1_espi_intr_handler(adapter->espi); +- if (cause & F_PL_INTR_PCIX) +- t1_pci_intr_handler(adapter); ++ if (cause & F_PL_INTR_PCIX) { ++ if (t1_pci_intr_handler(adapter)) ++ ret = IRQ_WAKE_THREAD; ++ } + if (cause & F_PL_INTR_EXT) { + /* Wake the threaded interrupt to handle external interrupts as + * we require a process context. We disable EXT interrupts in +-- +2.43.0 + diff --git a/debian/patches-rt/0154-x86-fpu-Simplify-fpregs_-un-lock.patch b/debian/patches-rt/0154-x86-fpu-Simplify-fpregs_-un-lock.patch new file mode 100644 index 000000000..1cde7fb8e --- /dev/null +++ b/debian/patches-rt/0154-x86-fpu-Simplify-fpregs_-un-lock.patch @@ -0,0 +1,47 @@ +From 8ca0011428a338becf8c390a74dba375610d693f Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Tue, 27 Oct 2020 11:09:50 +0100 +Subject: [PATCH 154/323] x86/fpu: Simplify fpregs_[un]lock() +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +There is no point in disabling preemption and then disabling bottom +halfs. + +Just disabling bottom halfs is sufficient as it implicitly disables +preemption on !RT kernels. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Link: https://lore.kernel.org/r/20201027101349.455380473@linutronix.de +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + arch/x86/include/asm/fpu/api.h | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h +index 8b9bfaad6e66..5174c0a640ef 100644 +--- a/arch/x86/include/asm/fpu/api.h ++++ b/arch/x86/include/asm/fpu/api.h +@@ -40,17 +40,18 @@ static inline void kernel_fpu_begin(void) + * A context switch will (and softirq might) save CPU's FPU registers to + * fpu->state and set TIF_NEED_FPU_LOAD leaving CPU's FPU registers in + * a random state. ++ * ++ * local_bh_disable() protects against both preemption and soft interrupts ++ * on !RT kernels. + */ + static inline void fpregs_lock(void) + { +- preempt_disable(); + local_bh_disable(); + } + + static inline void fpregs_unlock(void) + { + local_bh_enable(); +- preempt_enable(); + } + + #ifdef CONFIG_X86_DEBUG_FPU +-- +2.43.0 + diff --git a/debian/patches-rt/0155-x86-fpu-Make-kernel-FPU-protection-RT-friendly.patch b/debian/patches-rt/0155-x86-fpu-Make-kernel-FPU-protection-RT-friendly.patch new file mode 100644 index 000000000..f28c1aace --- /dev/null +++ b/debian/patches-rt/0155-x86-fpu-Make-kernel-FPU-protection-RT-friendly.patch @@ -0,0 +1,64 @@ +From f6a993a6144299a8693fa151d042c3997238cd54 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Tue, 27 Oct 2020 11:09:51 +0100 +Subject: [PATCH 155/323] x86/fpu: Make kernel FPU protection RT friendly +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Non RT kernels need to protect FPU against preemption and bottom half +processing. This is achieved by disabling bottom halfs via +local_bh_disable() which implictly disables preemption. + +On RT kernels this protection mechanism is not sufficient because +local_bh_disable() does not disable preemption. It serializes bottom half +related processing via a CPU local lock. + +As bottom halfs are running always in thread context on RT kernels +disabling preemption is the proper choice as it implicitly prevents bottom +half processing. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Link: https://lore.kernel.org/r/20201027101349.588965083@linutronix.de +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + arch/x86/include/asm/fpu/api.h | 18 ++++++++++++++++-- + 1 file changed, 16 insertions(+), 2 deletions(-) + +diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h +index 5174c0a640ef..ad2fee785310 100644 +--- a/arch/x86/include/asm/fpu/api.h ++++ b/arch/x86/include/asm/fpu/api.h +@@ -43,15 +43,29 @@ static inline void kernel_fpu_begin(void) + * + * local_bh_disable() protects against both preemption and soft interrupts + * on !RT kernels. ++ * ++ * On RT kernels local_bh_disable() is not sufficient because it only ++ * serializes soft interrupt related sections via a local lock, but stays ++ * preemptible. Disabling preemption is the right choice here as bottom ++ * half processing is always in thread context on RT kernels so it ++ * implicitly prevents bottom half processing as well. ++ * ++ * Disabling preemption also serializes against kernel_fpu_begin(). + */ + static inline void fpregs_lock(void) + { +- local_bh_disable(); ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ++ local_bh_disable(); ++ else ++ preempt_disable(); + } + + static inline void fpregs_unlock(void) + { +- local_bh_enable(); ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ++ local_bh_enable(); ++ else ++ preempt_enable(); + } + + #ifdef CONFIG_X86_DEBUG_FPU +-- +2.43.0 + diff --git a/debian/patches-rt/0156-locking-rtmutex-Remove-cruft.patch b/debian/patches-rt/0156-locking-rtmutex-Remove-cruft.patch new file mode 100644 index 000000000..a39953384 --- /dev/null +++ b/debian/patches-rt/0156-locking-rtmutex-Remove-cruft.patch @@ -0,0 +1,99 @@ +From 46072fa492b76df5aaf47fea23d280f72d7100a8 Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Tue, 29 Sep 2020 15:21:17 +0200 +Subject: [PATCH 156/323] locking/rtmutex: Remove cruft +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Most of this is around since the very beginning. I'm not sure if this +was used while the rtmutex-deadlock-tester was around but today it seems +to only waste memory: +- save_state: No users +- name: Assigned and printed if a dead lock was detected. I'm keeping it + but want to point out that lockdep has the same information. +- file + line: Printed if ::name was NULL. This is only used for + in-kernel locks so it ::name shouldn't be NULL and then ::file and + ::line isn't used. +- magic: Assigned to NULL by rt_mutex_destroy(). + +Remove members of rt_mutex which are not used. + +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/linux/rtmutex.h | 7 ++----- + kernel/locking/rtmutex-debug.c | 7 +------ + kernel/locking/rtmutex.c | 3 --- + kernel/locking/rtmutex_common.h | 1 - + 4 files changed, 3 insertions(+), 15 deletions(-) + +diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h +index 6fd615a0eea9..16f974a22f51 100644 +--- a/include/linux/rtmutex.h ++++ b/include/linux/rtmutex.h +@@ -32,10 +32,7 @@ struct rt_mutex { + struct rb_root_cached waiters; + struct task_struct *owner; + #ifdef CONFIG_DEBUG_RT_MUTEXES +- int save_state; +- const char *name, *file; +- int line; +- void *magic; ++ const char *name; + #endif + #ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +@@ -60,7 +57,7 @@ struct hrtimer_sleeper; + + #ifdef CONFIG_DEBUG_RT_MUTEXES + # define __DEBUG_RT_MUTEX_INITIALIZER(mutexname) \ +- , .name = #mutexname, .file = __FILE__, .line = __LINE__ ++ , .name = #mutexname + + # define rt_mutex_init(mutex) \ + do { \ +diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c +index 36e69100e8e0..7e411b946d4c 100644 +--- a/kernel/locking/rtmutex-debug.c ++++ b/kernel/locking/rtmutex-debug.c +@@ -42,12 +42,7 @@ static void printk_task(struct task_struct *p) + + static void printk_lock(struct rt_mutex *lock, int print_owner) + { +- if (lock->name) +- printk(" [%p] {%s}\n", +- lock, lock->name); +- else +- printk(" [%p] {%s:%d}\n", +- lock, lock->file, lock->line); ++ printk(" [%p] {%s}\n", lock, lock->name); + + if (print_owner && rt_mutex_owner(lock)) { + printk(".. ->owner: %p\n", lock->owner); +diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c +index f00dd928fc71..4792fc7abe7d 100644 +--- a/kernel/locking/rtmutex.c ++++ b/kernel/locking/rtmutex.c +@@ -1655,9 +1655,6 @@ void __sched rt_mutex_futex_unlock(struct rt_mutex *lock) + void rt_mutex_destroy(struct rt_mutex *lock) + { + WARN_ON(rt_mutex_is_locked(lock)); +-#ifdef CONFIG_DEBUG_RT_MUTEXES +- lock->magic = NULL; +-#endif + } + EXPORT_SYMBOL_GPL(rt_mutex_destroy); + +diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h +index ca6fb489007b..e6913103d7ff 100644 +--- a/kernel/locking/rtmutex_common.h ++++ b/kernel/locking/rtmutex_common.h +@@ -30,7 +30,6 @@ struct rt_mutex_waiter { + struct task_struct *task; + struct rt_mutex *lock; + #ifdef CONFIG_DEBUG_RT_MUTEXES +- unsigned long ip; + struct pid *deadlock_task_pid; + struct rt_mutex *deadlock_lock; + #endif +-- +2.43.0 + diff --git a/debian/patches-rt/0157-locking-rtmutex-Remove-output-from-deadlock-detector.patch b/debian/patches-rt/0157-locking-rtmutex-Remove-output-from-deadlock-detector.patch new file mode 100644 index 000000000..6f0878b80 --- /dev/null +++ b/debian/patches-rt/0157-locking-rtmutex-Remove-output-from-deadlock-detector.patch @@ -0,0 +1,312 @@ +From e9b488ea6cb82e796f8d331b037cb7de76d15657 Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Tue, 29 Sep 2020 16:05:11 +0200 +Subject: [PATCH 157/323] locking/rtmutex: Remove output from deadlock + detector. +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +In commit + f5694788ad8da ("rt_mutex: Add lockdep annotations") + +rtmutex gained lockdep annotation for rt_mutex_lock() and and related +functions. +lockdep will see the locking order and may complain about a deadlock +before rtmutex' own mechanism gets a chance to detect it. +The rtmutex deadlock detector will only complain locks with the +RT_MUTEX_MIN_CHAINWALK and a waiter must be pending. That means it +works only for in-kernel locks because the futex interface always uses +RT_MUTEX_FULL_CHAINWALK. +The requirement for an active waiter limits the detector to actual +deadlocks and makes it possible to report potential deadlocks like +lockdep does. +It looks like lockdep is better suited for reporting deadlocks. + +Remove rtmutex' debug print on deadlock detection. + +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/linux/rtmutex.h | 7 --- + kernel/locking/rtmutex-debug.c | 97 --------------------------------- + kernel/locking/rtmutex-debug.h | 11 ---- + kernel/locking/rtmutex.c | 9 --- + kernel/locking/rtmutex.h | 7 --- + kernel/locking/rtmutex_common.h | 4 -- + 6 files changed, 135 deletions(-) + +diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h +index 16f974a22f51..88a0ba806066 100644 +--- a/include/linux/rtmutex.h ++++ b/include/linux/rtmutex.h +@@ -31,9 +31,6 @@ struct rt_mutex { + raw_spinlock_t wait_lock; + struct rb_root_cached waiters; + struct task_struct *owner; +-#ifdef CONFIG_DEBUG_RT_MUTEXES +- const char *name; +-#endif + #ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; + #endif +@@ -56,8 +53,6 @@ struct hrtimer_sleeper; + #endif + + #ifdef CONFIG_DEBUG_RT_MUTEXES +-# define __DEBUG_RT_MUTEX_INITIALIZER(mutexname) \ +- , .name = #mutexname + + # define rt_mutex_init(mutex) \ + do { \ +@@ -67,7 +62,6 @@ do { \ + + extern void rt_mutex_debug_task_free(struct task_struct *tsk); + #else +-# define __DEBUG_RT_MUTEX_INITIALIZER(mutexname) + # define rt_mutex_init(mutex) __rt_mutex_init(mutex, NULL, NULL) + # define rt_mutex_debug_task_free(t) do { } while (0) + #endif +@@ -83,7 +77,6 @@ do { \ + { .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \ + , .waiters = RB_ROOT_CACHED \ + , .owner = NULL \ +- __DEBUG_RT_MUTEX_INITIALIZER(mutexname) \ + __DEP_MAP_RT_MUTEX_INITIALIZER(mutexname)} + + #define DEFINE_RT_MUTEX(mutexname) \ +diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c +index 7e411b946d4c..fb150100335f 100644 +--- a/kernel/locking/rtmutex-debug.c ++++ b/kernel/locking/rtmutex-debug.c +@@ -32,105 +32,12 @@ + + #include "rtmutex_common.h" + +-static void printk_task(struct task_struct *p) +-{ +- if (p) +- printk("%16s:%5d [%p, %3d]", p->comm, task_pid_nr(p), p, p->prio); +- else +- printk("<none>"); +-} +- +-static void printk_lock(struct rt_mutex *lock, int print_owner) +-{ +- printk(" [%p] {%s}\n", lock, lock->name); +- +- if (print_owner && rt_mutex_owner(lock)) { +- printk(".. ->owner: %p\n", lock->owner); +- printk(".. held by: "); +- printk_task(rt_mutex_owner(lock)); +- printk("\n"); +- } +-} +- + void rt_mutex_debug_task_free(struct task_struct *task) + { + DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters.rb_root)); + DEBUG_LOCKS_WARN_ON(task->pi_blocked_on); + } + +-/* +- * We fill out the fields in the waiter to store the information about +- * the deadlock. We print when we return. act_waiter can be NULL in +- * case of a remove waiter operation. +- */ +-void debug_rt_mutex_deadlock(enum rtmutex_chainwalk chwalk, +- struct rt_mutex_waiter *act_waiter, +- struct rt_mutex *lock) +-{ +- struct task_struct *task; +- +- if (!debug_locks || chwalk == RT_MUTEX_FULL_CHAINWALK || !act_waiter) +- return; +- +- task = rt_mutex_owner(act_waiter->lock); +- if (task && task != current) { +- act_waiter->deadlock_task_pid = get_pid(task_pid(task)); +- act_waiter->deadlock_lock = lock; +- } +-} +- +-void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) +-{ +- struct task_struct *task; +- +- if (!waiter->deadlock_lock || !debug_locks) +- return; +- +- rcu_read_lock(); +- task = pid_task(waiter->deadlock_task_pid, PIDTYPE_PID); +- if (!task) { +- rcu_read_unlock(); +- return; +- } +- +- if (!debug_locks_off()) { +- rcu_read_unlock(); +- return; +- } +- +- pr_warn("\n"); +- pr_warn("============================================\n"); +- pr_warn("WARNING: circular locking deadlock detected!\n"); +- pr_warn("%s\n", print_tainted()); +- pr_warn("--------------------------------------------\n"); +- printk("%s/%d is deadlocking current task %s/%d\n\n", +- task->comm, task_pid_nr(task), +- current->comm, task_pid_nr(current)); +- +- printk("\n1) %s/%d is trying to acquire this lock:\n", +- current->comm, task_pid_nr(current)); +- printk_lock(waiter->lock, 1); +- +- printk("\n2) %s/%d is blocked on this lock:\n", +- task->comm, task_pid_nr(task)); +- printk_lock(waiter->deadlock_lock, 1); +- +- debug_show_held_locks(current); +- debug_show_held_locks(task); +- +- printk("\n%s/%d's [blocked] stackdump:\n\n", +- task->comm, task_pid_nr(task)); +- show_stack(task, NULL, KERN_DEFAULT); +- printk("\n%s/%d's [current] stackdump:\n\n", +- current->comm, task_pid_nr(current)); +- dump_stack(); +- debug_show_all_locks(); +- rcu_read_unlock(); +- +- printk("[ turning off deadlock detection." +- "Please report this trace. ]\n\n"); +-} +- + void debug_rt_mutex_lock(struct rt_mutex *lock) + { + } +@@ -153,12 +60,10 @@ void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock) + void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) + { + memset(waiter, 0x11, sizeof(*waiter)); +- waiter->deadlock_task_pid = NULL; + } + + void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) + { +- put_pid(waiter->deadlock_task_pid); + memset(waiter, 0x22, sizeof(*waiter)); + } + +@@ -168,10 +73,8 @@ void debug_rt_mutex_init(struct rt_mutex *lock, const char *name, struct lock_cl + * Make sure we are not reinitializing a held lock: + */ + debug_check_no_locks_freed((void *)lock, sizeof(*lock)); +- lock->name = name; + + #ifdef CONFIG_DEBUG_LOCK_ALLOC + lockdep_init_map(&lock->dep_map, name, key, 0); + #endif + } +- +diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h +index fc549713bba3..659e93e256c6 100644 +--- a/kernel/locking/rtmutex-debug.h ++++ b/kernel/locking/rtmutex-debug.h +@@ -18,20 +18,9 @@ extern void debug_rt_mutex_unlock(struct rt_mutex *lock); + extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock, + struct task_struct *powner); + extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock); +-extern void debug_rt_mutex_deadlock(enum rtmutex_chainwalk chwalk, +- struct rt_mutex_waiter *waiter, +- struct rt_mutex *lock); +-extern void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter); +-# define debug_rt_mutex_reset_waiter(w) \ +- do { (w)->deadlock_lock = NULL; } while (0) + + static inline bool debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter, + enum rtmutex_chainwalk walk) + { + return (waiter != NULL); + } +- +-static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w) +-{ +- debug_rt_mutex_print_deadlock(w); +-} +diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c +index 4792fc7abe7d..9aa4d545020a 100644 +--- a/kernel/locking/rtmutex.c ++++ b/kernel/locking/rtmutex.c +@@ -597,7 +597,6 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, + * walk, we detected a deadlock. + */ + if (lock == orig_lock || rt_mutex_owner(lock) == top_task) { +- debug_rt_mutex_deadlock(chwalk, orig_waiter, lock); + raw_spin_unlock(&lock->wait_lock); + ret = -EDEADLK; + goto out_unlock_pi; +@@ -1189,8 +1188,6 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, + + raw_spin_unlock_irq(&lock->wait_lock); + +- debug_rt_mutex_print_deadlock(waiter); +- + schedule(); + + raw_spin_lock_irq(&lock->wait_lock); +@@ -1211,10 +1208,6 @@ static void rt_mutex_handle_deadlock(int res, int detect_deadlock, + if (res != -EDEADLOCK || detect_deadlock) + return; + +- /* +- * Yell lowdly and stop the task right here. +- */ +- rt_mutex_print_deadlock(w); + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + schedule(); +@@ -1763,8 +1756,6 @@ int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, + ret = 0; + } + +- debug_rt_mutex_print_deadlock(waiter); +- + return ret; + } + +diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h +index 732f96abf462..338ccd29119a 100644 +--- a/kernel/locking/rtmutex.h ++++ b/kernel/locking/rtmutex.h +@@ -19,15 +19,8 @@ + #define debug_rt_mutex_proxy_unlock(l) do { } while (0) + #define debug_rt_mutex_unlock(l) do { } while (0) + #define debug_rt_mutex_init(m, n, k) do { } while (0) +-#define debug_rt_mutex_deadlock(d, a ,l) do { } while (0) +-#define debug_rt_mutex_print_deadlock(w) do { } while (0) + #define debug_rt_mutex_reset_waiter(w) do { } while (0) + +-static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w) +-{ +- WARN(1, "rtmutex deadlock detected\n"); +-} +- + static inline bool debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *w, + enum rtmutex_chainwalk walk) + { +diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h +index e6913103d7ff..b1455dc2366f 100644 +--- a/kernel/locking/rtmutex_common.h ++++ b/kernel/locking/rtmutex_common.h +@@ -29,10 +29,6 @@ struct rt_mutex_waiter { + struct rb_node pi_tree_entry; + struct task_struct *task; + struct rt_mutex *lock; +-#ifdef CONFIG_DEBUG_RT_MUTEXES +- struct pid *deadlock_task_pid; +- struct rt_mutex *deadlock_lock; +-#endif + int prio; + u64 deadline; + }; +-- +2.43.0 + diff --git a/debian/patches-rt/0158-locking-rtmutex-Move-rt_mutex_init-outside-of-CONFIG.patch b/debian/patches-rt/0158-locking-rtmutex-Move-rt_mutex_init-outside-of-CONFIG.patch new file mode 100644 index 000000000..91e3d2b55 --- /dev/null +++ b/debian/patches-rt/0158-locking-rtmutex-Move-rt_mutex_init-outside-of-CONFIG.patch @@ -0,0 +1,60 @@ +From e25aa4319847e6a6769075a1f38c98d8cf3246f9 Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Tue, 29 Sep 2020 16:32:49 +0200 +Subject: [PATCH 158/323] locking/rtmutex: Move rt_mutex_init() outside of + CONFIG_DEBUG_RT_MUTEXES +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +rt_mutex_init() only initializes lockdep if CONFIG_DEBUG_RT_MUTEXES is +enabled. The static initializer (DEFINE_RT_MUTEX) does not have such a +restriction. + +Move rt_mutex_init() outside of CONFIG_DEBUG_RT_MUTEXES. +Move the remaining functions in this CONFIG_DEBUG_RT_MUTEXES block to +the upper block. + +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/linux/rtmutex.h | 12 +++--------- + 1 file changed, 3 insertions(+), 9 deletions(-) + +diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h +index 88a0ba806066..2dc10b582d4a 100644 +--- a/include/linux/rtmutex.h ++++ b/include/linux/rtmutex.h +@@ -43,6 +43,7 @@ struct hrtimer_sleeper; + extern int rt_mutex_debug_check_no_locks_freed(const void *from, + unsigned long len); + extern void rt_mutex_debug_check_no_locks_held(struct task_struct *task); ++ extern void rt_mutex_debug_task_free(struct task_struct *tsk); + #else + static inline int rt_mutex_debug_check_no_locks_freed(const void *from, + unsigned long len) +@@ -50,22 +51,15 @@ struct hrtimer_sleeper; + return 0; + } + # define rt_mutex_debug_check_no_locks_held(task) do { } while (0) ++# define rt_mutex_debug_task_free(t) do { } while (0) + #endif + +-#ifdef CONFIG_DEBUG_RT_MUTEXES +- +-# define rt_mutex_init(mutex) \ ++#define rt_mutex_init(mutex) \ + do { \ + static struct lock_class_key __key; \ + __rt_mutex_init(mutex, __func__, &__key); \ + } while (0) + +- extern void rt_mutex_debug_task_free(struct task_struct *tsk); +-#else +-# define rt_mutex_init(mutex) __rt_mutex_init(mutex, NULL, NULL) +-# define rt_mutex_debug_task_free(t) do { } while (0) +-#endif +- + #ifdef CONFIG_DEBUG_LOCK_ALLOC + #define __DEP_MAP_RT_MUTEX_INITIALIZER(mutexname) \ + , .dep_map = { .name = #mutexname } +-- +2.43.0 + diff --git a/debian/patches-rt/0159-locking-rtmutex-Remove-rt_mutex_timed_lock.patch b/debian/patches-rt/0159-locking-rtmutex-Remove-rt_mutex_timed_lock.patch new file mode 100644 index 000000000..5573d0e56 --- /dev/null +++ b/debian/patches-rt/0159-locking-rtmutex-Remove-rt_mutex_timed_lock.patch @@ -0,0 +1,98 @@ +From 6ec5055da1da31b64ab24ba9ff1739d05e46bb7f Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Wed, 7 Oct 2020 12:11:33 +0200 +Subject: [PATCH 159/323] locking/rtmutex: Remove rt_mutex_timed_lock() +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +rt_mutex_timed_lock() has no callers since commit + c051b21f71d1f ("rtmutex: Confine deadlock logic to futex") + +Remove rt_mutex_timed_lock(). + +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/linux/rtmutex.h | 3 --- + kernel/locking/rtmutex.c | 46 ---------------------------------------- + 2 files changed, 49 deletions(-) + +diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h +index 2dc10b582d4a..243fabc2c85f 100644 +--- a/include/linux/rtmutex.h ++++ b/include/linux/rtmutex.h +@@ -99,9 +99,6 @@ extern void rt_mutex_lock(struct rt_mutex *lock); + #endif + + extern int rt_mutex_lock_interruptible(struct rt_mutex *lock); +-extern int rt_mutex_timed_lock(struct rt_mutex *lock, +- struct hrtimer_sleeper *timeout); +- + extern int rt_mutex_trylock(struct rt_mutex *lock); + + extern void rt_mutex_unlock(struct rt_mutex *lock); +diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c +index 9aa4d545020a..b5d5c570a8ac 100644 +--- a/kernel/locking/rtmutex.c ++++ b/kernel/locking/rtmutex.c +@@ -1405,21 +1405,6 @@ rt_mutex_fastlock(struct rt_mutex *lock, int state, + return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK); + } + +-static inline int +-rt_mutex_timed_fastlock(struct rt_mutex *lock, int state, +- struct hrtimer_sleeper *timeout, +- enum rtmutex_chainwalk chwalk, +- int (*slowfn)(struct rt_mutex *lock, int state, +- struct hrtimer_sleeper *timeout, +- enum rtmutex_chainwalk chwalk)) +-{ +- if (chwalk == RT_MUTEX_MIN_CHAINWALK && +- likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) +- return 0; +- +- return slowfn(lock, state, timeout, chwalk); +-} +- + static inline int + rt_mutex_fasttrylock(struct rt_mutex *lock, + int (*slowfn)(struct rt_mutex *lock)) +@@ -1527,37 +1512,6 @@ int __sched __rt_mutex_futex_trylock(struct rt_mutex *lock) + return __rt_mutex_slowtrylock(lock); + } + +-/** +- * rt_mutex_timed_lock - lock a rt_mutex interruptible +- * the timeout structure is provided +- * by the caller +- * +- * @lock: the rt_mutex to be locked +- * @timeout: timeout structure or NULL (no timeout) +- * +- * Returns: +- * 0 on success +- * -EINTR when interrupted by a signal +- * -ETIMEDOUT when the timeout expired +- */ +-int +-rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout) +-{ +- int ret; +- +- might_sleep(); +- +- mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); +- ret = rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout, +- RT_MUTEX_MIN_CHAINWALK, +- rt_mutex_slowlock); +- if (ret) +- mutex_release(&lock->dep_map, _RET_IP_); +- +- return ret; +-} +-EXPORT_SYMBOL_GPL(rt_mutex_timed_lock); +- + /** + * rt_mutex_trylock - try to lock a rt_mutex + * +-- +2.43.0 + diff --git a/debian/patches-rt/0160-locking-rtmutex-Handle-the-various-new-futex-race-co.patch b/debian/patches-rt/0160-locking-rtmutex-Handle-the-various-new-futex-race-co.patch new file mode 100644 index 000000000..3e02c70df --- /dev/null +++ b/debian/patches-rt/0160-locking-rtmutex-Handle-the-various-new-futex-race-co.patch @@ -0,0 +1,255 @@ +From faad74e290468f037de33422cb1799c05477242d Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Fri, 10 Jun 2011 11:04:15 +0200 +Subject: [PATCH 160/323] locking/rtmutex: Handle the various new futex race + conditions +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +RT opens a few new interesting race conditions in the rtmutex/futex +combo due to futex hash bucket lock being a 'sleeping' spinlock and +therefor not disabling preemption. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +--- + kernel/futex/core.c | 77 ++++++++++++++++++++++++++------- + kernel/locking/rtmutex.c | 36 ++++++++++++--- + kernel/locking/rtmutex_common.h | 2 + + 3 files changed, 94 insertions(+), 21 deletions(-) + +diff --git a/kernel/futex/core.c b/kernel/futex/core.c +index cde0ca876b93..706cd446ec71 100644 +--- a/kernel/futex/core.c ++++ b/kernel/futex/core.c +@@ -2165,6 +2165,16 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, + */ + requeue_pi_wake_futex(this, &key2, hb2); + continue; ++ } else if (ret == -EAGAIN) { ++ /* ++ * Waiter was woken by timeout or ++ * signal and has set pi_blocked_on to ++ * PI_WAKEUP_INPROGRESS before we ++ * tried to enqueue it on the rtmutex. ++ */ ++ this->pi_state = NULL; ++ put_pi_state(pi_state); ++ continue; + } else if (ret) { + /* + * rt_mutex_start_proxy_lock() detected a +@@ -3182,7 +3192,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, + { + struct hrtimer_sleeper timeout, *to; + struct rt_mutex_waiter rt_waiter; +- struct futex_hash_bucket *hb; ++ struct futex_hash_bucket *hb, *hb2; + union futex_key key2 = FUTEX_KEY_INIT; + struct futex_q q = futex_q_init; + int res, ret; +@@ -3234,20 +3244,55 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, + /* Queue the futex_q, drop the hb lock, wait for wakeup. */ + futex_wait_queue_me(hb, &q, to); + +- spin_lock(&hb->lock); +- ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to); +- spin_unlock(&hb->lock); +- if (ret) +- goto out; ++ /* ++ * On RT we must avoid races with requeue and trying to block ++ * on two mutexes (hb->lock and uaddr2's rtmutex) by ++ * serializing access to pi_blocked_on with pi_lock. ++ */ ++ raw_spin_lock_irq(¤t->pi_lock); ++ if (current->pi_blocked_on) { ++ /* ++ * We have been requeued or are in the process of ++ * being requeued. ++ */ ++ raw_spin_unlock_irq(¤t->pi_lock); ++ } else { ++ /* ++ * Setting pi_blocked_on to PI_WAKEUP_INPROGRESS ++ * prevents a concurrent requeue from moving us to the ++ * uaddr2 rtmutex. After that we can safely acquire ++ * (and possibly block on) hb->lock. ++ */ ++ current->pi_blocked_on = PI_WAKEUP_INPROGRESS; ++ raw_spin_unlock_irq(¤t->pi_lock); ++ ++ spin_lock(&hb->lock); ++ ++ /* ++ * Clean up pi_blocked_on. We might leak it otherwise ++ * when we succeeded with the hb->lock in the fast ++ * path. ++ */ ++ raw_spin_lock_irq(¤t->pi_lock); ++ current->pi_blocked_on = NULL; ++ raw_spin_unlock_irq(¤t->pi_lock); ++ ++ ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to); ++ spin_unlock(&hb->lock); ++ if (ret) ++ goto out; ++ } + + /* +- * In order for us to be here, we know our q.key == key2, and since +- * we took the hb->lock above, we also know that futex_requeue() has +- * completed and we no longer have to concern ourselves with a wakeup +- * race with the atomic proxy lock acquisition by the requeue code. The +- * futex_requeue dropped our key1 reference and incremented our key2 +- * reference count. ++ * In order to be here, we have either been requeued, are in ++ * the process of being requeued, or requeue successfully ++ * acquired uaddr2 on our behalf. If pi_blocked_on was ++ * non-null above, we may be racing with a requeue. Do not ++ * rely on q->lock_ptr to be hb2->lock until after blocking on ++ * hb->lock or hb2->lock. The futex_requeue dropped our key1 ++ * reference and incremented our key2 reference count. + */ ++ hb2 = hash_futex(&key2); + + /* Check if the requeue code acquired the second futex for us. */ + if (!q.rt_waiter) { +@@ -3256,14 +3301,15 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, + * did a lock-steal - fix up the PI-state in that case. + */ + if (q.pi_state && (q.pi_state->owner != current)) { +- spin_lock(q.lock_ptr); ++ spin_lock(&hb2->lock); ++ BUG_ON(&hb2->lock != q.lock_ptr); + ret = fixup_pi_state_owner(uaddr2, &q, current); + /* + * Drop the reference to the pi state which + * the requeue_pi() code acquired for us. + */ + put_pi_state(q.pi_state); +- spin_unlock(q.lock_ptr); ++ spin_unlock(&hb2->lock); + /* + * Adjust the return value. It's either -EFAULT or + * success (1) but the caller expects 0 for success. +@@ -3282,7 +3328,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, + pi_mutex = &q.pi_state->pi_mutex; + ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter); + +- spin_lock(q.lock_ptr); ++ spin_lock(&hb2->lock); ++ BUG_ON(&hb2->lock != q.lock_ptr); + if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter)) + ret = 0; + +diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c +index b5d5c570a8ac..2ef5a6eb95da 100644 +--- a/kernel/locking/rtmutex.c ++++ b/kernel/locking/rtmutex.c +@@ -136,6 +136,11 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock) + WRITE_ONCE(*p, owner & ~RT_MUTEX_HAS_WAITERS); + } + ++static int rt_mutex_real_waiter(struct rt_mutex_waiter *waiter) ++{ ++ return waiter && waiter != PI_WAKEUP_INPROGRESS; ++} ++ + /* + * We can speed up the acquire/release, if there's no debugging state to be + * set up. +@@ -378,7 +383,8 @@ int max_lock_depth = 1024; + + static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p) + { +- return p->pi_blocked_on ? p->pi_blocked_on->lock : NULL; ++ return rt_mutex_real_waiter(p->pi_blocked_on) ? ++ p->pi_blocked_on->lock : NULL; + } + + /* +@@ -514,7 +520,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, + * reached or the state of the chain has changed while we + * dropped the locks. + */ +- if (!waiter) ++ if (!rt_mutex_real_waiter(waiter)) + goto out_unlock_pi; + + /* +@@ -947,6 +953,22 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, + return -EDEADLK; + + raw_spin_lock(&task->pi_lock); ++ /* ++ * In the case of futex requeue PI, this will be a proxy ++ * lock. The task will wake unaware that it is enqueueed on ++ * this lock. Avoid blocking on two locks and corrupting ++ * pi_blocked_on via the PI_WAKEUP_INPROGRESS ++ * flag. futex_wait_requeue_pi() sets this when it wakes up ++ * before requeue (due to a signal or timeout). Do not enqueue ++ * the task if PI_WAKEUP_INPROGRESS is set. ++ */ ++ if (task != current && task->pi_blocked_on == PI_WAKEUP_INPROGRESS) { ++ raw_spin_unlock(&task->pi_lock); ++ return -EAGAIN; ++ } ++ ++ BUG_ON(rt_mutex_real_waiter(task->pi_blocked_on)); ++ + waiter->task = task; + waiter->lock = lock; + waiter->prio = task->prio; +@@ -970,7 +992,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, + rt_mutex_enqueue_pi(owner, waiter); + + rt_mutex_adjust_prio(owner); +- if (owner->pi_blocked_on) ++ if (rt_mutex_real_waiter(owner->pi_blocked_on)) + chain_walk = 1; + } else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) { + chain_walk = 1; +@@ -1066,7 +1088,7 @@ static void remove_waiter(struct rt_mutex *lock, + { + bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock)); + struct task_struct *owner = rt_mutex_owner(lock); +- struct rt_mutex *next_lock; ++ struct rt_mutex *next_lock = NULL; + + lockdep_assert_held(&lock->wait_lock); + +@@ -1092,7 +1114,8 @@ static void remove_waiter(struct rt_mutex *lock, + rt_mutex_adjust_prio(owner); + + /* Store the lock on which owner is blocked or NULL */ +- next_lock = task_blocked_on_lock(owner); ++ if (rt_mutex_real_waiter(owner->pi_blocked_on)) ++ next_lock = task_blocked_on_lock(owner); + + raw_spin_unlock(&owner->pi_lock); + +@@ -1128,7 +1151,8 @@ void rt_mutex_adjust_pi(struct task_struct *task) + raw_spin_lock_irqsave(&task->pi_lock, flags); + + waiter = task->pi_blocked_on; +- if (!waiter || rt_mutex_waiter_equal(waiter, task_to_waiter(task))) { ++ if (!rt_mutex_real_waiter(waiter) || ++ rt_mutex_waiter_equal(waiter, task_to_waiter(task))) { + raw_spin_unlock_irqrestore(&task->pi_lock, flags); + return; + } +diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h +index b1455dc2366f..096b16cfb096 100644 +--- a/kernel/locking/rtmutex_common.h ++++ b/kernel/locking/rtmutex_common.h +@@ -125,6 +125,8 @@ enum rtmutex_chainwalk { + /* + * PI-futex support (proxy locking functions, etc.): + */ ++#define PI_WAKEUP_INPROGRESS ((struct rt_mutex_waiter *) 1) ++ + extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock); + extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock, + struct task_struct *proxy_owner); +-- +2.43.0 + diff --git a/debian/patches-rt/0161-futex-Fix-bug-on-when-a-requeued-RT-task-times-out.patch b/debian/patches-rt/0161-futex-Fix-bug-on-when-a-requeued-RT-task-times-out.patch new file mode 100644 index 000000000..bab95baf2 --- /dev/null +++ b/debian/patches-rt/0161-futex-Fix-bug-on-when-a-requeued-RT-task-times-out.patch @@ -0,0 +1,118 @@ +From 787b0032cdfc388293dcde2c4bc3a2c40b6d9cf2 Mon Sep 17 00:00:00 2001 +From: Steven Rostedt <rostedt@goodmis.org> +Date: Tue, 14 Jul 2015 14:26:34 +0200 +Subject: [PATCH 161/323] futex: Fix bug on when a requeued RT task times out +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Requeue with timeout causes a bug with PREEMPT_RT. + +The bug comes from a timed out condition. + + TASK 1 TASK 2 + ------ ------ + futex_wait_requeue_pi() + futex_wait_queue_me() + <timed out> + + double_lock_hb(); + + raw_spin_lock(pi_lock); + if (current->pi_blocked_on) { + } else { + current->pi_blocked_on = PI_WAKE_INPROGRESS; + run_spin_unlock(pi_lock); + spin_lock(hb->lock); <-- blocked! + + plist_for_each_entry_safe(this) { + rt_mutex_start_proxy_lock(); + task_blocks_on_rt_mutex(); + BUG_ON(task->pi_blocked_on)!!!! + +The BUG_ON() actually has a check for PI_WAKE_INPROGRESS, but the +problem is that, after TASK 1 sets PI_WAKE_INPROGRESS, it then tries to +grab the hb->lock, which it fails to do so. As the hb->lock is a mutex, +it will block and set the "pi_blocked_on" to the hb->lock. + +When TASK 2 goes to requeue it, the check for PI_WAKE_INPROGESS fails +because the task1's pi_blocked_on is no longer set to that, but instead, +set to the hb->lock. + +The fix: + +When calling rt_mutex_start_proxy_lock() a check is made to see +if the proxy tasks pi_blocked_on is set. If so, exit out early. +Otherwise set it to a new flag PI_REQUEUE_INPROGRESS, which notifies +the proxy task that it is being requeued, and will handle things +appropriately. + +Signed-off-by: Steven Rostedt <rostedt@goodmis.org> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +--- + kernel/locking/rtmutex.c | 31 ++++++++++++++++++++++++++++++- + kernel/locking/rtmutex_common.h | 1 + + 2 files changed, 31 insertions(+), 1 deletion(-) + +diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c +index 2ef5a6eb95da..d8755d5ef227 100644 +--- a/kernel/locking/rtmutex.c ++++ b/kernel/locking/rtmutex.c +@@ -138,7 +138,8 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock) + + static int rt_mutex_real_waiter(struct rt_mutex_waiter *waiter) + { +- return waiter && waiter != PI_WAKEUP_INPROGRESS; ++ return waiter && waiter != PI_WAKEUP_INPROGRESS && ++ waiter != PI_REQUEUE_INPROGRESS; + } + + /* +@@ -1720,6 +1721,34 @@ int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, + if (try_to_take_rt_mutex(lock, task, NULL)) + return 1; + ++#ifdef CONFIG_PREEMPT_RT ++ /* ++ * In PREEMPT_RT there's an added race. ++ * If the task, that we are about to requeue, times out, ++ * it can set the PI_WAKEUP_INPROGRESS. This tells the requeue ++ * to skip this task. But right after the task sets ++ * its pi_blocked_on to PI_WAKEUP_INPROGRESS it can then ++ * block on the spin_lock(&hb->lock), which in RT is an rtmutex. ++ * This will replace the PI_WAKEUP_INPROGRESS with the actual ++ * lock that it blocks on. We *must not* place this task ++ * on this proxy lock in that case. ++ * ++ * To prevent this race, we first take the task's pi_lock ++ * and check if it has updated its pi_blocked_on. If it has, ++ * we assume that it woke up and we return -EAGAIN. ++ * Otherwise, we set the task's pi_blocked_on to ++ * PI_REQUEUE_INPROGRESS, so that if the task is waking up ++ * it will know that we are in the process of requeuing it. ++ */ ++ raw_spin_lock(&task->pi_lock); ++ if (task->pi_blocked_on) { ++ raw_spin_unlock(&task->pi_lock); ++ return -EAGAIN; ++ } ++ task->pi_blocked_on = PI_REQUEUE_INPROGRESS; ++ raw_spin_unlock(&task->pi_lock); ++#endif ++ + /* We enforce deadlock detection for futexes */ + ret = task_blocks_on_rt_mutex(lock, waiter, task, + RT_MUTEX_FULL_CHAINWALK); +diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h +index 096b16cfb096..37cd6b3bf6f4 100644 +--- a/kernel/locking/rtmutex_common.h ++++ b/kernel/locking/rtmutex_common.h +@@ -126,6 +126,7 @@ enum rtmutex_chainwalk { + * PI-futex support (proxy locking functions, etc.): + */ + #define PI_WAKEUP_INPROGRESS ((struct rt_mutex_waiter *) 1) ++#define PI_REQUEUE_INPROGRESS ((struct rt_mutex_waiter *) 2) + + extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock); + extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock, +-- +2.43.0 + diff --git a/debian/patches-rt/0162-locking-rtmutex-Make-lock_killable-work.patch b/debian/patches-rt/0162-locking-rtmutex-Make-lock_killable-work.patch new file mode 100644 index 000000000..aebd5f60e --- /dev/null +++ b/debian/patches-rt/0162-locking-rtmutex-Make-lock_killable-work.patch @@ -0,0 +1,50 @@ +From 09e10e723c3bc6ceb0d1b64e0a6b3b9e2ec52225 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Sat, 1 Apr 2017 12:50:59 +0200 +Subject: [PATCH 162/323] locking/rtmutex: Make lock_killable work +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Locking an rt mutex killable does not work because signal handling is +restricted to TASK_INTERRUPTIBLE. + +Use signal_pending_state() unconditionally. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + kernel/locking/rtmutex.c | 19 +++++++------------ + 1 file changed, 7 insertions(+), 12 deletions(-) + +diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c +index d8755d5ef227..c3f3c23fefef 100644 +--- a/kernel/locking/rtmutex.c ++++ b/kernel/locking/rtmutex.c +@@ -1197,18 +1197,13 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, + if (try_to_take_rt_mutex(lock, current, waiter)) + break; + +- /* +- * TASK_INTERRUPTIBLE checks for signals and +- * timeout. Ignored otherwise. +- */ +- if (likely(state == TASK_INTERRUPTIBLE)) { +- /* Signal pending? */ +- if (signal_pending(current)) +- ret = -EINTR; +- if (timeout && !timeout->task) +- ret = -ETIMEDOUT; +- if (ret) +- break; ++ if (timeout && !timeout->task) { ++ ret = -ETIMEDOUT; ++ break; ++ } ++ if (signal_pending_state(state, current)) { ++ ret = -EINTR; ++ break; + } + + raw_spin_unlock_irq(&lock->wait_lock); +-- +2.43.0 + diff --git a/debian/patches-rt/0163-locking-spinlock-Split-the-lock-types-header.patch b/debian/patches-rt/0163-locking-spinlock-Split-the-lock-types-header.patch new file mode 100644 index 000000000..a28543235 --- /dev/null +++ b/debian/patches-rt/0163-locking-spinlock-Split-the-lock-types-header.patch @@ -0,0 +1,253 @@ +From 79fca819370ecec3ce47fe9cb5f70efbacb613ad Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Wed, 29 Jun 2011 19:34:01 +0200 +Subject: [PATCH 163/323] locking/spinlock: Split the lock types header +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Split raw_spinlock into its own file and the remaining spinlock_t into +its own non-RT header. The non-RT header will be replaced later by sleeping +spinlocks. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +--- + include/linux/rwlock_types.h | 4 ++ + include/linux/spinlock_types.h | 87 +---------------------------- + include/linux/spinlock_types_nort.h | 39 +++++++++++++ + include/linux/spinlock_types_raw.h | 65 +++++++++++++++++++++ + 4 files changed, 110 insertions(+), 85 deletions(-) + create mode 100644 include/linux/spinlock_types_nort.h + create mode 100644 include/linux/spinlock_types_raw.h + +diff --git a/include/linux/rwlock_types.h b/include/linux/rwlock_types.h +index 3bd03e18061c..0ad226b5d8fd 100644 +--- a/include/linux/rwlock_types.h ++++ b/include/linux/rwlock_types.h +@@ -1,6 +1,10 @@ + #ifndef __LINUX_RWLOCK_TYPES_H + #define __LINUX_RWLOCK_TYPES_H + ++#if !defined(__LINUX_SPINLOCK_TYPES_H) ++# error "Do not include directly, include spinlock_types.h" ++#endif ++ + /* + * include/linux/rwlock_types.h - generic rwlock type definitions + * and initializers +diff --git a/include/linux/spinlock_types.h b/include/linux/spinlock_types.h +index b981caafe8bf..5c8664d57fb8 100644 +--- a/include/linux/spinlock_types.h ++++ b/include/linux/spinlock_types.h +@@ -9,92 +9,9 @@ + * Released under the General Public License (GPL). + */ + +-#if defined(CONFIG_SMP) +-# include <asm/spinlock_types.h> +-#else +-# include <linux/spinlock_types_up.h> +-#endif ++#include <linux/spinlock_types_raw.h> + +-#include <linux/lockdep_types.h> +- +-typedef struct raw_spinlock { +- arch_spinlock_t raw_lock; +-#ifdef CONFIG_DEBUG_SPINLOCK +- unsigned int magic, owner_cpu; +- void *owner; +-#endif +-#ifdef CONFIG_DEBUG_LOCK_ALLOC +- struct lockdep_map dep_map; +-#endif +-} raw_spinlock_t; +- +-#define SPINLOCK_MAGIC 0xdead4ead +- +-#define SPINLOCK_OWNER_INIT ((void *)-1L) +- +-#ifdef CONFIG_DEBUG_LOCK_ALLOC +-# define RAW_SPIN_DEP_MAP_INIT(lockname) \ +- .dep_map = { \ +- .name = #lockname, \ +- .wait_type_inner = LD_WAIT_SPIN, \ +- } +-# define SPIN_DEP_MAP_INIT(lockname) \ +- .dep_map = { \ +- .name = #lockname, \ +- .wait_type_inner = LD_WAIT_CONFIG, \ +- } +-#else +-# define RAW_SPIN_DEP_MAP_INIT(lockname) +-# define SPIN_DEP_MAP_INIT(lockname) +-#endif +- +-#ifdef CONFIG_DEBUG_SPINLOCK +-# define SPIN_DEBUG_INIT(lockname) \ +- .magic = SPINLOCK_MAGIC, \ +- .owner_cpu = -1, \ +- .owner = SPINLOCK_OWNER_INIT, +-#else +-# define SPIN_DEBUG_INIT(lockname) +-#endif +- +-#define __RAW_SPIN_LOCK_INITIALIZER(lockname) \ +- { \ +- .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED, \ +- SPIN_DEBUG_INIT(lockname) \ +- RAW_SPIN_DEP_MAP_INIT(lockname) } +- +-#define __RAW_SPIN_LOCK_UNLOCKED(lockname) \ +- (raw_spinlock_t) __RAW_SPIN_LOCK_INITIALIZER(lockname) +- +-#define DEFINE_RAW_SPINLOCK(x) raw_spinlock_t x = __RAW_SPIN_LOCK_UNLOCKED(x) +- +-typedef struct spinlock { +- union { +- struct raw_spinlock rlock; +- +-#ifdef CONFIG_DEBUG_LOCK_ALLOC +-# define LOCK_PADSIZE (offsetof(struct raw_spinlock, dep_map)) +- struct { +- u8 __padding[LOCK_PADSIZE]; +- struct lockdep_map dep_map; +- }; +-#endif +- }; +-} spinlock_t; +- +-#define ___SPIN_LOCK_INITIALIZER(lockname) \ +- { \ +- .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED, \ +- SPIN_DEBUG_INIT(lockname) \ +- SPIN_DEP_MAP_INIT(lockname) } +- +-#define __SPIN_LOCK_INITIALIZER(lockname) \ +- { { .rlock = ___SPIN_LOCK_INITIALIZER(lockname) } } +- +-#define __SPIN_LOCK_UNLOCKED(lockname) \ +- (spinlock_t) __SPIN_LOCK_INITIALIZER(lockname) +- +-#define DEFINE_SPINLOCK(x) spinlock_t x = __SPIN_LOCK_UNLOCKED(x) ++#include <linux/spinlock_types_nort.h> + + #include <linux/rwlock_types.h> + +diff --git a/include/linux/spinlock_types_nort.h b/include/linux/spinlock_types_nort.h +new file mode 100644 +index 000000000000..e4549f0dd197 +--- /dev/null ++++ b/include/linux/spinlock_types_nort.h +@@ -0,0 +1,39 @@ ++#ifndef __LINUX_SPINLOCK_TYPES_NORT_H ++#define __LINUX_SPINLOCK_TYPES_NORT_H ++ ++#ifndef __LINUX_SPINLOCK_TYPES_H ++#error "Do not include directly. Include spinlock_types.h instead" ++#endif ++ ++/* ++ * The non RT version maps spinlocks to raw_spinlocks ++ */ ++typedef struct spinlock { ++ union { ++ struct raw_spinlock rlock; ++ ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++# define LOCK_PADSIZE (offsetof(struct raw_spinlock, dep_map)) ++ struct { ++ u8 __padding[LOCK_PADSIZE]; ++ struct lockdep_map dep_map; ++ }; ++#endif ++ }; ++} spinlock_t; ++ ++#define ___SPIN_LOCK_INITIALIZER(lockname) \ ++{ \ ++ .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED, \ ++ SPIN_DEBUG_INIT(lockname) \ ++ SPIN_DEP_MAP_INIT(lockname) } ++ ++#define __SPIN_LOCK_INITIALIZER(lockname) \ ++ { { .rlock = ___SPIN_LOCK_INITIALIZER(lockname) } } ++ ++#define __SPIN_LOCK_UNLOCKED(lockname) \ ++ (spinlock_t) __SPIN_LOCK_INITIALIZER(lockname) ++ ++#define DEFINE_SPINLOCK(x) spinlock_t x = __SPIN_LOCK_UNLOCKED(x) ++ ++#endif +diff --git a/include/linux/spinlock_types_raw.h b/include/linux/spinlock_types_raw.h +new file mode 100644 +index 000000000000..1d4a180e983d +--- /dev/null ++++ b/include/linux/spinlock_types_raw.h +@@ -0,0 +1,65 @@ ++#ifndef __LINUX_SPINLOCK_TYPES_RAW_H ++#define __LINUX_SPINLOCK_TYPES_RAW_H ++ ++#include <linux/types.h> ++ ++#if defined(CONFIG_SMP) ++# include <asm/spinlock_types.h> ++#else ++# include <linux/spinlock_types_up.h> ++#endif ++ ++#include <linux/lockdep_types.h> ++ ++typedef struct raw_spinlock { ++ arch_spinlock_t raw_lock; ++#ifdef CONFIG_DEBUG_SPINLOCK ++ unsigned int magic, owner_cpu; ++ void *owner; ++#endif ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++ struct lockdep_map dep_map; ++#endif ++} raw_spinlock_t; ++ ++#define SPINLOCK_MAGIC 0xdead4ead ++ ++#define SPINLOCK_OWNER_INIT ((void *)-1L) ++ ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++# define RAW_SPIN_DEP_MAP_INIT(lockname) \ ++ .dep_map = { \ ++ .name = #lockname, \ ++ .wait_type_inner = LD_WAIT_SPIN, \ ++ } ++# define SPIN_DEP_MAP_INIT(lockname) \ ++ .dep_map = { \ ++ .name = #lockname, \ ++ .wait_type_inner = LD_WAIT_CONFIG, \ ++ } ++#else ++# define RAW_SPIN_DEP_MAP_INIT(lockname) ++# define SPIN_DEP_MAP_INIT(lockname) ++#endif ++ ++#ifdef CONFIG_DEBUG_SPINLOCK ++# define SPIN_DEBUG_INIT(lockname) \ ++ .magic = SPINLOCK_MAGIC, \ ++ .owner_cpu = -1, \ ++ .owner = SPINLOCK_OWNER_INIT, ++#else ++# define SPIN_DEBUG_INIT(lockname) ++#endif ++ ++#define __RAW_SPIN_LOCK_INITIALIZER(lockname) \ ++{ \ ++ .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED, \ ++ SPIN_DEBUG_INIT(lockname) \ ++ RAW_SPIN_DEP_MAP_INIT(lockname) } ++ ++#define __RAW_SPIN_LOCK_UNLOCKED(lockname) \ ++ (raw_spinlock_t) __RAW_SPIN_LOCK_INITIALIZER(lockname) ++ ++#define DEFINE_RAW_SPINLOCK(x) raw_spinlock_t x = __RAW_SPIN_LOCK_UNLOCKED(x) ++ ++#endif +-- +2.43.0 + diff --git a/debian/patches-rt/0164-locking-rtmutex-Avoid-include-hell.patch b/debian/patches-rt/0164-locking-rtmutex-Avoid-include-hell.patch new file mode 100644 index 000000000..09d1bf4dd --- /dev/null +++ b/debian/patches-rt/0164-locking-rtmutex-Avoid-include-hell.patch @@ -0,0 +1,30 @@ +From 1b45f1cea53e6a886ba8bfab1c7617c32c66555d Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Wed, 29 Jun 2011 20:06:39 +0200 +Subject: [PATCH 164/323] locking/rtmutex: Avoid include hell +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Include only the required raw types. This avoids pulling in the +complete spinlock header which in turn requires rtmutex.h at some point. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +--- + include/linux/rtmutex.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h +index 243fabc2c85f..add1dab27df5 100644 +--- a/include/linux/rtmutex.h ++++ b/include/linux/rtmutex.h +@@ -15,7 +15,7 @@ + + #include <linux/linkage.h> + #include <linux/rbtree.h> +-#include <linux/spinlock_types.h> ++#include <linux/spinlock_types_raw.h> + + extern int max_lock_depth; /* for sysctl */ + +-- +2.43.0 + diff --git a/debian/patches-rt/0165-lockdep-Reduce-header-files-in-debug_locks.h.patch b/debian/patches-rt/0165-lockdep-Reduce-header-files-in-debug_locks.h.patch new file mode 100644 index 000000000..bfe1d5ecd --- /dev/null +++ b/debian/patches-rt/0165-lockdep-Reduce-header-files-in-debug_locks.h.patch @@ -0,0 +1,33 @@ +From 740bc44f8756ed7c76a697de3eb3eae130c2de77 Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Fri, 14 Aug 2020 16:55:25 +0200 +Subject: [PATCH 165/323] lockdep: Reduce header files in debug_locks.h +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +The inclusion of printk.h leads to circular dependency if spinlock_t is +based on rt_mutex. + +Include only atomic.h (xchg()) and cache.h (__read_mostly). + +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/linux/debug_locks.h | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +diff --git a/include/linux/debug_locks.h b/include/linux/debug_locks.h +index edb5c186b0b7..3f49e65169c6 100644 +--- a/include/linux/debug_locks.h ++++ b/include/linux/debug_locks.h +@@ -3,8 +3,7 @@ + #define __LINUX_DEBUG_LOCKING_H + + #include <linux/atomic.h> +-#include <linux/bug.h> +-#include <linux/printk.h> ++#include <linux/cache.h> + + struct task_struct; + +-- +2.43.0 + diff --git a/debian/patches-rt/0166-locking-split-out-the-rbtree-definition.patch b/debian/patches-rt/0166-locking-split-out-the-rbtree-definition.patch new file mode 100644 index 000000000..d8189ef50 --- /dev/null +++ b/debian/patches-rt/0166-locking-split-out-the-rbtree-definition.patch @@ -0,0 +1,120 @@ +From b9604fd91b7d02c92ddb504c330e1793f0694318 Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Fri, 14 Aug 2020 17:08:41 +0200 +Subject: [PATCH 166/323] locking: split out the rbtree definition +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +rtmutex.h needs the definition for rb_root_cached. By including kernel.h +we will get to spinlock.h which requires rtmutex.h again. + +Split out the required struct definition and move it into its own header +file which can be included by rtmutex.h + +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/linux/rbtree.h | 27 +-------------------------- + include/linux/rbtree_type.h | 31 +++++++++++++++++++++++++++++++ + include/linux/rtmutex.h | 2 +- + 3 files changed, 33 insertions(+), 27 deletions(-) + create mode 100644 include/linux/rbtree_type.h + +diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h +index d7db17996322..c33b0e16d04b 100644 +--- a/include/linux/rbtree.h ++++ b/include/linux/rbtree.h +@@ -19,19 +19,9 @@ + + #include <linux/kernel.h> + #include <linux/stddef.h> ++#include <linux/rbtree_type.h> + #include <linux/rcupdate.h> + +-struct rb_node { +- unsigned long __rb_parent_color; +- struct rb_node *rb_right; +- struct rb_node *rb_left; +-} __attribute__((aligned(sizeof(long)))); +- /* The alignment might seem pointless, but allegedly CRIS needs it */ +- +-struct rb_root { +- struct rb_node *rb_node; +-}; +- + #define rb_parent(r) ((struct rb_node *)((r)->__rb_parent_color & ~3)) + + #define RB_ROOT (struct rb_root) { NULL, } +@@ -112,21 +102,6 @@ static inline void rb_link_node_rcu(struct rb_node *node, struct rb_node *parent + typeof(*pos), field); 1; }); \ + pos = n) + +-/* +- * Leftmost-cached rbtrees. +- * +- * We do not cache the rightmost node based on footprint +- * size vs number of potential users that could benefit +- * from O(1) rb_last(). Just not worth it, users that want +- * this feature can always implement the logic explicitly. +- * Furthermore, users that want to cache both pointers may +- * find it a bit asymmetric, but that's ok. +- */ +-struct rb_root_cached { +- struct rb_root rb_root; +- struct rb_node *rb_leftmost; +-}; +- + #define RB_ROOT_CACHED (struct rb_root_cached) { {NULL, }, NULL } + + /* Same as rb_first(), but O(1) */ +diff --git a/include/linux/rbtree_type.h b/include/linux/rbtree_type.h +new file mode 100644 +index 000000000000..77a89dd2c7c6 +--- /dev/null ++++ b/include/linux/rbtree_type.h +@@ -0,0 +1,31 @@ ++/* SPDX-License-Identifier: GPL-2.0-or-later */ ++#ifndef _LINUX_RBTREE_TYPE_H ++#define _LINUX_RBTREE_TYPE_H ++ ++struct rb_node { ++ unsigned long __rb_parent_color; ++ struct rb_node *rb_right; ++ struct rb_node *rb_left; ++} __attribute__((aligned(sizeof(long)))); ++/* The alignment might seem pointless, but allegedly CRIS needs it */ ++ ++struct rb_root { ++ struct rb_node *rb_node; ++}; ++ ++/* ++ * Leftmost-cached rbtrees. ++ * ++ * We do not cache the rightmost node based on footprint ++ * size vs number of potential users that could benefit ++ * from O(1) rb_last(). Just not worth it, users that want ++ * this feature can always implement the logic explicitly. ++ * Furthermore, users that want to cache both pointers may ++ * find it a bit asymmetric, but that's ok. ++ */ ++struct rb_root_cached { ++ struct rb_root rb_root; ++ struct rb_node *rb_leftmost; ++}; ++ ++#endif +diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h +index add1dab27df5..b828b938c876 100644 +--- a/include/linux/rtmutex.h ++++ b/include/linux/rtmutex.h +@@ -14,7 +14,7 @@ + #define __LINUX_RT_MUTEX_H + + #include <linux/linkage.h> +-#include <linux/rbtree.h> ++#include <linux/rbtree_type.h> + #include <linux/spinlock_types_raw.h> + + extern int max_lock_depth; /* for sysctl */ +-- +2.43.0 + diff --git a/debian/patches-rt/0167-locking-rtmutex-Provide-rt_mutex_slowlock_locked.patch b/debian/patches-rt/0167-locking-rtmutex-Provide-rt_mutex_slowlock_locked.patch new file mode 100644 index 000000000..125846fe5 --- /dev/null +++ b/debian/patches-rt/0167-locking-rtmutex-Provide-rt_mutex_slowlock_locked.patch @@ -0,0 +1,145 @@ +From dbd55916b6a3efb4b812322d7502b174b4cceb69 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Thu, 12 Oct 2017 16:14:22 +0200 +Subject: [PATCH 167/323] locking/rtmutex: Provide rt_mutex_slowlock_locked() +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +This is the inner-part of rt_mutex_slowlock(), required for rwsem-rt. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + kernel/locking/rtmutex.c | 67 +++++++++++++++++++-------------- + kernel/locking/rtmutex_common.h | 7 ++++ + 2 files changed, 45 insertions(+), 29 deletions(-) + +diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c +index c3f3c23fefef..395a3e8b2463 100644 +--- a/kernel/locking/rtmutex.c ++++ b/kernel/locking/rtmutex.c +@@ -1234,35 +1234,16 @@ static void rt_mutex_handle_deadlock(int res, int detect_deadlock, + } + } + +-/* +- * Slow path lock function: +- */ +-static int __sched +-rt_mutex_slowlock(struct rt_mutex *lock, int state, +- struct hrtimer_sleeper *timeout, +- enum rtmutex_chainwalk chwalk) ++int __sched rt_mutex_slowlock_locked(struct rt_mutex *lock, int state, ++ struct hrtimer_sleeper *timeout, ++ enum rtmutex_chainwalk chwalk, ++ struct rt_mutex_waiter *waiter) + { +- struct rt_mutex_waiter waiter; +- unsigned long flags; +- int ret = 0; +- +- rt_mutex_init_waiter(&waiter); +- +- /* +- * Technically we could use raw_spin_[un]lock_irq() here, but this can +- * be called in early boot if the cmpxchg() fast path is disabled +- * (debug, no architecture support). In this case we will acquire the +- * rtmutex with lock->wait_lock held. But we cannot unconditionally +- * enable interrupts in that early boot case. So we need to use the +- * irqsave/restore variants. +- */ +- raw_spin_lock_irqsave(&lock->wait_lock, flags); ++ int ret; + + /* Try to acquire the lock again: */ +- if (try_to_take_rt_mutex(lock, current, NULL)) { +- raw_spin_unlock_irqrestore(&lock->wait_lock, flags); ++ if (try_to_take_rt_mutex(lock, current, NULL)) + return 0; +- } + + set_current_state(state); + +@@ -1270,16 +1251,16 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, + if (unlikely(timeout)) + hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS); + +- ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk); ++ ret = task_blocks_on_rt_mutex(lock, waiter, current, chwalk); + + if (likely(!ret)) + /* sleep on the mutex */ +- ret = __rt_mutex_slowlock(lock, state, timeout, &waiter); ++ ret = __rt_mutex_slowlock(lock, state, timeout, waiter); + + if (unlikely(ret)) { + __set_current_state(TASK_RUNNING); +- remove_waiter(lock, &waiter); +- rt_mutex_handle_deadlock(ret, chwalk, &waiter); ++ remove_waiter(lock, waiter); ++ rt_mutex_handle_deadlock(ret, chwalk, waiter); + } + + /* +@@ -1287,6 +1268,34 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, + * unconditionally. We might have to fix that up. + */ + fixup_rt_mutex_waiters(lock); ++ return ret; ++} ++ ++/* ++ * Slow path lock function: ++ */ ++static int __sched ++rt_mutex_slowlock(struct rt_mutex *lock, int state, ++ struct hrtimer_sleeper *timeout, ++ enum rtmutex_chainwalk chwalk) ++{ ++ struct rt_mutex_waiter waiter; ++ unsigned long flags; ++ int ret = 0; ++ ++ rt_mutex_init_waiter(&waiter); ++ ++ /* ++ * Technically we could use raw_spin_[un]lock_irq() here, but this can ++ * be called in early boot if the cmpxchg() fast path is disabled ++ * (debug, no architecture support). In this case we will acquire the ++ * rtmutex with lock->wait_lock held. But we cannot unconditionally ++ * enable interrupts in that early boot case. So we need to use the ++ * irqsave/restore variants. ++ */ ++ raw_spin_lock_irqsave(&lock->wait_lock, flags); ++ ++ ret = rt_mutex_slowlock_locked(lock, state, timeout, chwalk, &waiter); + + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); + +diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h +index 37cd6b3bf6f4..b5a2affa59d5 100644 +--- a/kernel/locking/rtmutex_common.h ++++ b/kernel/locking/rtmutex_common.h +@@ -15,6 +15,7 @@ + + #include <linux/rtmutex.h> + #include <linux/sched/wake_q.h> ++#include <linux/sched/debug.h> + + /* + * This is the control structure for tasks blocked on a rt_mutex, +@@ -153,6 +154,12 @@ extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock, + struct wake_q_head *wqh); + + extern void rt_mutex_postunlock(struct wake_q_head *wake_q); ++/* RW semaphore special interface */ ++ ++int __sched rt_mutex_slowlock_locked(struct rt_mutex *lock, int state, ++ struct hrtimer_sleeper *timeout, ++ enum rtmutex_chainwalk chwalk, ++ struct rt_mutex_waiter *waiter); + + #ifdef CONFIG_DEBUG_RT_MUTEXES + # include "rtmutex-debug.h" +-- +2.43.0 + diff --git a/debian/patches-rt/0168-locking-rtmutex-export-lockdep-less-version-of-rt_mu.patch b/debian/patches-rt/0168-locking-rtmutex-export-lockdep-less-version-of-rt_mu.patch new file mode 100644 index 000000000..6978baba3 --- /dev/null +++ b/debian/patches-rt/0168-locking-rtmutex-export-lockdep-less-version-of-rt_mu.patch @@ -0,0 +1,130 @@ +From 0472803e0c929a5b0c4cb02a1e667d745e12fc51 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Thu, 12 Oct 2017 16:36:39 +0200 +Subject: [PATCH 168/323] locking/rtmutex: export lockdep-less version of + rt_mutex's lock, trylock and unlock +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Required for lock implementation ontop of rtmutex. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + kernel/locking/rtmutex.c | 54 +++++++++++++++++++++++---------- + kernel/locking/rtmutex_common.h | 3 ++ + 2 files changed, 41 insertions(+), 16 deletions(-) + +diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c +index 395a3e8b2463..40ae934c6caa 100644 +--- a/kernel/locking/rtmutex.c ++++ b/kernel/locking/rtmutex.c +@@ -1469,12 +1469,33 @@ rt_mutex_fastunlock(struct rt_mutex *lock, + rt_mutex_postunlock(&wake_q); + } + +-static inline void __rt_mutex_lock(struct rt_mutex *lock, unsigned int subclass) ++int __sched __rt_mutex_lock_state(struct rt_mutex *lock, int state) + { + might_sleep(); ++ return rt_mutex_fastlock(lock, state, rt_mutex_slowlock); ++} ++ ++/** ++ * rt_mutex_lock_state - lock a rt_mutex with a given state ++ * ++ * @lock: The rt_mutex to be locked ++ * @state: The state to set when blocking on the rt_mutex ++ */ ++static inline int __sched rt_mutex_lock_state(struct rt_mutex *lock, ++ unsigned int subclass, int state) ++{ ++ int ret; + + mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_); +- rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, rt_mutex_slowlock); ++ ret = __rt_mutex_lock_state(lock, state); ++ if (ret) ++ mutex_release(&lock->dep_map, _RET_IP_); ++ return ret; ++} ++ ++static inline void __rt_mutex_lock(struct rt_mutex *lock, unsigned int subclass) ++{ ++ rt_mutex_lock_state(lock, subclass, TASK_UNINTERRUPTIBLE); + } + + #ifdef CONFIG_DEBUG_LOCK_ALLOC +@@ -1515,16 +1536,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_lock); + */ + int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock) + { +- int ret; +- +- might_sleep(); +- +- mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); +- ret = rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock); +- if (ret) +- mutex_release(&lock->dep_map, _RET_IP_); +- +- return ret; ++ return rt_mutex_lock_state(lock, 0, TASK_INTERRUPTIBLE); + } + EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); + +@@ -1541,6 +1553,14 @@ int __sched __rt_mutex_futex_trylock(struct rt_mutex *lock) + return __rt_mutex_slowtrylock(lock); + } + ++int __sched __rt_mutex_trylock(struct rt_mutex *lock) ++{ ++ if (WARN_ON_ONCE(in_irq() || in_nmi() || in_serving_softirq())) ++ return 0; ++ ++ return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock); ++} ++ + /** + * rt_mutex_trylock - try to lock a rt_mutex + * +@@ -1556,10 +1576,7 @@ int __sched rt_mutex_trylock(struct rt_mutex *lock) + { + int ret; + +- if (WARN_ON_ONCE(in_irq() || in_nmi() || in_serving_softirq())) +- return 0; +- +- ret = rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock); ++ ret = __rt_mutex_trylock(lock); + if (ret) + mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); + +@@ -1567,6 +1584,11 @@ int __sched rt_mutex_trylock(struct rt_mutex *lock) + } + EXPORT_SYMBOL_GPL(rt_mutex_trylock); + ++void __sched __rt_mutex_unlock(struct rt_mutex *lock) ++{ ++ rt_mutex_fastunlock(lock, rt_mutex_slowunlock); ++} ++ + /** + * rt_mutex_unlock - unlock a rt_mutex + * +diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h +index b5a2affa59d5..9d1e974ca9c3 100644 +--- a/kernel/locking/rtmutex_common.h ++++ b/kernel/locking/rtmutex_common.h +@@ -156,6 +156,9 @@ extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock, + extern void rt_mutex_postunlock(struct wake_q_head *wake_q); + /* RW semaphore special interface */ + ++extern int __rt_mutex_lock_state(struct rt_mutex *lock, int state); ++extern int __rt_mutex_trylock(struct rt_mutex *lock); ++extern void __rt_mutex_unlock(struct rt_mutex *lock); + int __sched rt_mutex_slowlock_locked(struct rt_mutex *lock, int state, + struct hrtimer_sleeper *timeout, + enum rtmutex_chainwalk chwalk, +-- +2.43.0 + diff --git a/debian/patches-rt/0169-sched-Add-saved_state-for-tasks-blocked-on-sleeping-.patch b/debian/patches-rt/0169-sched-Add-saved_state-for-tasks-blocked-on-sleeping-.patch new file mode 100644 index 000000000..83106676c --- /dev/null +++ b/debian/patches-rt/0169-sched-Add-saved_state-for-tasks-blocked-on-sleeping-.patch @@ -0,0 +1,116 @@ +From 622996bf8475bf1d06cbb523dacc7469e7b9c864 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Sat, 25 Jun 2011 09:21:04 +0200 +Subject: [PATCH 169/323] sched: Add saved_state for tasks blocked on sleeping + locks +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Spinlocks are state preserving in !RT. RT changes the state when a +task gets blocked on a lock. So we need to remember the state before +the lock contention. If a regular wakeup (not a RTmutex related +wakeup) happens, the saved_state is updated to running. When the lock +sleep is done, the saved state is restored. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +--- + include/linux/sched.h | 3 +++ + kernel/sched/core.c | 34 ++++++++++++++++++++++++++++++++-- + kernel/sched/sched.h | 1 + + 3 files changed, 36 insertions(+), 2 deletions(-) + +diff --git a/include/linux/sched.h b/include/linux/sched.h +index d31da4867bb2..73defe42fd23 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -659,6 +659,8 @@ struct task_struct { + #endif + /* -1 unrunnable, 0 runnable, >0 stopped: */ + volatile long state; ++ /* saved state for "spinlock sleepers" */ ++ volatile long saved_state; + + /* + * This begins the randomizable portion of task_struct. Only +@@ -1782,6 +1784,7 @@ extern struct task_struct *find_get_task_by_vpid(pid_t nr); + + extern int wake_up_state(struct task_struct *tsk, unsigned int state); + extern int wake_up_process(struct task_struct *tsk); ++extern int wake_up_lock_sleeper(struct task_struct *tsk); + extern void wake_up_new_task(struct task_struct *tsk); + + #ifdef CONFIG_SMP +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 390b51366f5e..f6d40256c0d4 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -3305,7 +3305,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) + int cpu, success = 0; + + preempt_disable(); +- if (p == current) { ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT) && p == current) { + /* + * We're waking current, this means 'p->on_rq' and 'task_cpu(p) + * == smp_processor_id()'. Together this means we can special +@@ -3335,8 +3335,26 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) + */ + raw_spin_lock_irqsave(&p->pi_lock, flags); + smp_mb__after_spinlock(); +- if (!(p->state & state)) ++ if (!(p->state & state)) { ++ /* ++ * The task might be running due to a spinlock sleeper ++ * wakeup. Check the saved state and set it to running ++ * if the wakeup condition is true. ++ */ ++ if (!(wake_flags & WF_LOCK_SLEEPER)) { ++ if (p->saved_state & state) { ++ p->saved_state = TASK_RUNNING; ++ success = 1; ++ } ++ } + goto unlock; ++ } ++ /* ++ * If this is a regular wakeup, then we can unconditionally ++ * clear the saved state of a "lock sleeper". ++ */ ++ if (!(wake_flags & WF_LOCK_SLEEPER)) ++ p->saved_state = TASK_RUNNING; + + trace_sched_waking(p); + +@@ -3525,6 +3543,18 @@ int wake_up_process(struct task_struct *p) + } + EXPORT_SYMBOL(wake_up_process); + ++/** ++ * wake_up_lock_sleeper - Wake up a specific process blocked on a "sleeping lock" ++ * @p: The process to be woken up. ++ * ++ * Same as wake_up_process() above, but wake_flags=WF_LOCK_SLEEPER to indicate ++ * the nature of the wakeup. ++ */ ++int wake_up_lock_sleeper(struct task_struct *p) ++{ ++ return try_to_wake_up(p, TASK_UNINTERRUPTIBLE, WF_LOCK_SLEEPER); ++} ++ + int wake_up_state(struct task_struct *p, unsigned int state) + { + return try_to_wake_up(p, state, 0); +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index c26b1c7009f4..d4bfc51358d3 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -1753,6 +1753,7 @@ static inline int task_on_rq_migrating(struct task_struct *p) + #define WF_FORK 0x02 /* Child wakeup after fork */ + #define WF_MIGRATED 0x04 /* Internal use, task got migrated */ + #define WF_ON_CPU 0x08 /* Wakee is on_cpu */ ++#define WF_LOCK_SLEEPER 0x10 /* Wakeup spinlock "sleeper" */ + + /* + * To aid in avoiding the subversion of "niceness" due to uneven distribution +-- +2.43.0 + diff --git a/debian/patches-rt/0170-locking-rtmutex-add-sleeping-lock-implementation.patch b/debian/patches-rt/0170-locking-rtmutex-add-sleeping-lock-implementation.patch new file mode 100644 index 000000000..e8755a734 --- /dev/null +++ b/debian/patches-rt/0170-locking-rtmutex-add-sleeping-lock-implementation.patch @@ -0,0 +1,1214 @@ +From 4c85c347eeb7dae59269d3a4a07388a00de563bf Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Thu, 12 Oct 2017 17:11:19 +0200 +Subject: [PATCH 170/323] locking/rtmutex: add sleeping lock implementation +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Signed-off-by: Luis Claudio R. Goncalves <lgoncalv@redhat.com> +--- + include/linux/kernel.h | 5 + + include/linux/preempt.h | 4 + + include/linux/rtmutex.h | 19 +- + include/linux/sched.h | 7 + + include/linux/sched/wake_q.h | 13 +- + include/linux/spinlock_rt.h | 155 +++++++++++ + include/linux/spinlock_types_rt.h | 38 +++ + kernel/futex/core.c | 10 +- + kernel/locking/rtmutex.c | 444 +++++++++++++++++++++++++++--- + kernel/locking/rtmutex_common.h | 14 +- + kernel/sched/core.c | 39 ++- + 11 files changed, 693 insertions(+), 55 deletions(-) + create mode 100644 include/linux/spinlock_rt.h + create mode 100644 include/linux/spinlock_types_rt.h + +diff --git a/include/linux/kernel.h b/include/linux/kernel.h +index 55d48d5627c7..7b4fdd5b2f7b 100644 +--- a/include/linux/kernel.h ++++ b/include/linux/kernel.h +@@ -223,6 +223,10 @@ extern void __cant_migrate(const char *file, int line); + */ + # define might_sleep() \ + do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0) ++ ++# define might_sleep_no_state_check() \ ++ do { ___might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0) ++ + /** + * cant_sleep - annotation for functions that cannot sleep + * +@@ -266,6 +270,7 @@ extern void __cant_migrate(const char *file, int line); + static inline void __might_sleep(const char *file, int line, + int preempt_offset) { } + # define might_sleep() do { might_resched(); } while (0) ++# define might_sleep_no_state_check() do { might_resched(); } while (0) + # define cant_sleep() do { } while (0) + # define cant_migrate() do { } while (0) + # define sched_annotate_sleep() do { } while (0) +diff --git a/include/linux/preempt.h b/include/linux/preempt.h +index 9881eac0698f..4d244e295e85 100644 +--- a/include/linux/preempt.h ++++ b/include/linux/preempt.h +@@ -121,7 +121,11 @@ + /* + * The preempt_count offset after spin_lock() + */ ++#if !defined(CONFIG_PREEMPT_RT) + #define PREEMPT_LOCK_OFFSET PREEMPT_DISABLE_OFFSET ++#else ++#define PREEMPT_LOCK_OFFSET 0 ++#endif + + /* + * The preempt_count offset needed for things like: +diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h +index b828b938c876..b02009f53026 100644 +--- a/include/linux/rtmutex.h ++++ b/include/linux/rtmutex.h +@@ -19,6 +19,10 @@ + + extern int max_lock_depth; /* for sysctl */ + ++#ifdef CONFIG_DEBUG_MUTEXES ++#include <linux/debug_locks.h> ++#endif ++ + /** + * The rt_mutex structure + * +@@ -31,6 +35,7 @@ struct rt_mutex { + raw_spinlock_t wait_lock; + struct rb_root_cached waiters; + struct task_struct *owner; ++ int save_state; + #ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; + #endif +@@ -67,11 +72,19 @@ do { \ + #define __DEP_MAP_RT_MUTEX_INITIALIZER(mutexname) + #endif + +-#define __RT_MUTEX_INITIALIZER(mutexname) \ +- { .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \ ++#define __RT_MUTEX_INITIALIZER_PLAIN(mutexname) \ ++ .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \ + , .waiters = RB_ROOT_CACHED \ + , .owner = NULL \ +- __DEP_MAP_RT_MUTEX_INITIALIZER(mutexname)} ++ __DEP_MAP_RT_MUTEX_INITIALIZER(mutexname) ++ ++#define __RT_MUTEX_INITIALIZER(mutexname) \ ++ { __RT_MUTEX_INITIALIZER_PLAIN(mutexname) \ ++ , .save_state = 0 } ++ ++#define __RT_MUTEX_INITIALIZER_SAVE_STATE(mutexname) \ ++ { __RT_MUTEX_INITIALIZER_PLAIN(mutexname) \ ++ , .save_state = 1 } + + #define DEFINE_RT_MUTEX(mutexname) \ + struct rt_mutex mutexname = __RT_MUTEX_INITIALIZER(mutexname) +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 73defe42fd23..3650fdaac4ca 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -141,6 +141,9 @@ struct io_uring_task; + smp_store_mb(current->state, (state_value)); \ + } while (0) + ++#define __set_current_state_no_track(state_value) \ ++ current->state = (state_value); ++ + #define set_special_state(state_value) \ + do { \ + unsigned long flags; /* may shadow */ \ +@@ -194,6 +197,9 @@ struct io_uring_task; + #define set_current_state(state_value) \ + smp_store_mb(current->state, (state_value)) + ++#define __set_current_state_no_track(state_value) \ ++ __set_current_state(state_value) ++ + /* + * set_special_state() should be used for those states when the blocking task + * can not use the regular condition based wait-loop. In that case we must +@@ -1021,6 +1027,7 @@ struct task_struct { + raw_spinlock_t pi_lock; + + struct wake_q_node wake_q; ++ struct wake_q_node wake_q_sleeper; + + #ifdef CONFIG_RT_MUTEXES + /* PI waiters blocked on a rt_mutex held by this task: */ +diff --git a/include/linux/sched/wake_q.h b/include/linux/sched/wake_q.h +index 26a2013ac39c..6e2dff721547 100644 +--- a/include/linux/sched/wake_q.h ++++ b/include/linux/sched/wake_q.h +@@ -58,6 +58,17 @@ static inline bool wake_q_empty(struct wake_q_head *head) + + extern void wake_q_add(struct wake_q_head *head, struct task_struct *task); + extern void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task); +-extern void wake_up_q(struct wake_q_head *head); ++extern void wake_q_add_sleeper(struct wake_q_head *head, struct task_struct *task); ++extern void __wake_up_q(struct wake_q_head *head, bool sleeper); ++ ++static inline void wake_up_q(struct wake_q_head *head) ++{ ++ __wake_up_q(head, false); ++} ++ ++static inline void wake_up_q_sleeper(struct wake_q_head *head) ++{ ++ __wake_up_q(head, true); ++} + + #endif /* _LINUX_SCHED_WAKE_Q_H */ +diff --git a/include/linux/spinlock_rt.h b/include/linux/spinlock_rt.h +new file mode 100644 +index 000000000000..3085132eae38 +--- /dev/null ++++ b/include/linux/spinlock_rt.h +@@ -0,0 +1,155 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++#ifndef __LINUX_SPINLOCK_RT_H ++#define __LINUX_SPINLOCK_RT_H ++ ++#ifndef __LINUX_SPINLOCK_H ++#error Do not include directly. Use spinlock.h ++#endif ++ ++#include <linux/bug.h> ++ ++extern void ++__rt_spin_lock_init(spinlock_t *lock, const char *name, struct lock_class_key *key); ++ ++#define spin_lock_init(slock) \ ++do { \ ++ static struct lock_class_key __key; \ ++ \ ++ rt_mutex_init(&(slock)->lock); \ ++ __rt_spin_lock_init(slock, #slock, &__key); \ ++} while (0) ++ ++extern void __lockfunc rt_spin_lock(spinlock_t *lock); ++extern void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass); ++extern void __lockfunc rt_spin_lock_nest_lock(spinlock_t *lock, struct lockdep_map *nest_lock); ++extern void __lockfunc rt_spin_unlock(spinlock_t *lock); ++extern void __lockfunc rt_spin_lock_unlock(spinlock_t *lock); ++extern int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags); ++extern int __lockfunc rt_spin_trylock_bh(spinlock_t *lock); ++extern int __lockfunc rt_spin_trylock(spinlock_t *lock); ++extern int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock); ++ ++/* ++ * lockdep-less calls, for derived types like rwlock: ++ * (for trylock they can use rt_mutex_trylock() directly. ++ * Migrate disable handling must be done at the call site. ++ */ ++extern void __lockfunc __rt_spin_lock(struct rt_mutex *lock); ++extern void __lockfunc __rt_spin_trylock(struct rt_mutex *lock); ++extern void __lockfunc __rt_spin_unlock(struct rt_mutex *lock); ++ ++#define spin_lock(lock) rt_spin_lock(lock) ++ ++#define spin_lock_bh(lock) \ ++ do { \ ++ local_bh_disable(); \ ++ rt_spin_lock(lock); \ ++ } while (0) ++ ++#define spin_lock_irq(lock) spin_lock(lock) ++ ++#define spin_do_trylock(lock) __cond_lock(lock, rt_spin_trylock(lock)) ++ ++#define spin_trylock(lock) \ ++({ \ ++ int __locked; \ ++ __locked = spin_do_trylock(lock); \ ++ __locked; \ ++}) ++ ++#ifdef CONFIG_LOCKDEP ++# define spin_lock_nested(lock, subclass) \ ++ do { \ ++ rt_spin_lock_nested(lock, subclass); \ ++ } while (0) ++ ++#define spin_lock_bh_nested(lock, subclass) \ ++ do { \ ++ local_bh_disable(); \ ++ rt_spin_lock_nested(lock, subclass); \ ++ } while (0) ++ ++# define spin_lock_nest_lock(lock, subclass) \ ++ do { \ ++ typecheck(struct lockdep_map *, &(subclass)->dep_map); \ ++ rt_spin_lock_nest_lock(lock, &(subclass)->dep_map); \ ++ } while (0) ++ ++# define spin_lock_irqsave_nested(lock, flags, subclass) \ ++ do { \ ++ typecheck(unsigned long, flags); \ ++ flags = 0; \ ++ rt_spin_lock_nested(lock, subclass); \ ++ } while (0) ++#else ++# define spin_lock_nested(lock, subclass) spin_lock(((void)(subclass), (lock))) ++# define spin_lock_nest_lock(lock, subclass) spin_lock(((void)(subclass), (lock))) ++# define spin_lock_bh_nested(lock, subclass) spin_lock_bh(((void)(subclass), (lock))) ++ ++# define spin_lock_irqsave_nested(lock, flags, subclass) \ ++ do { \ ++ typecheck(unsigned long, flags); \ ++ flags = 0; \ ++ spin_lock(((void)(subclass), (lock))); \ ++ } while (0) ++#endif ++ ++#define spin_lock_irqsave(lock, flags) \ ++ do { \ ++ typecheck(unsigned long, flags); \ ++ flags = 0; \ ++ spin_lock(lock); \ ++ } while (0) ++ ++#define spin_unlock(lock) rt_spin_unlock(lock) ++ ++#define spin_unlock_bh(lock) \ ++ do { \ ++ rt_spin_unlock(lock); \ ++ local_bh_enable(); \ ++ } while (0) ++ ++#define spin_unlock_irq(lock) spin_unlock(lock) ++ ++#define spin_unlock_irqrestore(lock, flags) \ ++ do { \ ++ typecheck(unsigned long, flags); \ ++ (void) flags; \ ++ spin_unlock(lock); \ ++ } while (0) ++ ++#define spin_trylock_bh(lock) __cond_lock(lock, rt_spin_trylock_bh(lock)) ++#define spin_trylock_irq(lock) spin_trylock(lock) ++ ++#define spin_trylock_irqsave(lock, flags) \ ++({ \ ++ int __locked; \ ++ \ ++ typecheck(unsigned long, flags); \ ++ flags = 0; \ ++ __locked = spin_trylock(lock); \ ++ __locked; \ ++}) ++ ++#ifdef CONFIG_GENERIC_LOCKBREAK ++# define spin_is_contended(lock) ((lock)->break_lock) ++#else ++# define spin_is_contended(lock) (((void)(lock), 0)) ++#endif ++ ++static inline int spin_can_lock(spinlock_t *lock) ++{ ++ return !rt_mutex_is_locked(&lock->lock); ++} ++ ++static inline int spin_is_locked(spinlock_t *lock) ++{ ++ return rt_mutex_is_locked(&lock->lock); ++} ++ ++static inline void assert_spin_locked(spinlock_t *lock) ++{ ++ BUG_ON(!spin_is_locked(lock)); ++} ++ ++#endif +diff --git a/include/linux/spinlock_types_rt.h b/include/linux/spinlock_types_rt.h +new file mode 100644 +index 000000000000..446da786e5d5 +--- /dev/null ++++ b/include/linux/spinlock_types_rt.h +@@ -0,0 +1,38 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++#ifndef __LINUX_SPINLOCK_TYPES_RT_H ++#define __LINUX_SPINLOCK_TYPES_RT_H ++ ++#ifndef __LINUX_SPINLOCK_TYPES_H ++#error "Do not include directly. Include spinlock_types.h instead" ++#endif ++ ++#include <linux/cache.h> ++ ++/* ++ * PREEMPT_RT: spinlocks - an RT mutex plus lock-break field: ++ */ ++typedef struct spinlock { ++ struct rt_mutex lock; ++ unsigned int break_lock; ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++ struct lockdep_map dep_map; ++#endif ++} spinlock_t; ++ ++#define __RT_SPIN_INITIALIZER(name) \ ++ { \ ++ .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \ ++ .save_state = 1, \ ++ } ++/* ++.wait_list = PLIST_HEAD_INIT_RAW((name).lock.wait_list, (name).lock.wait_lock) ++*/ ++ ++#define __SPIN_LOCK_UNLOCKED(name) \ ++ { .lock = __RT_SPIN_INITIALIZER(name.lock), \ ++ SPIN_DEP_MAP_INIT(name) } ++ ++#define DEFINE_SPINLOCK(name) \ ++ spinlock_t name = __SPIN_LOCK_UNLOCKED(name) ++ ++#endif +diff --git a/kernel/futex/core.c b/kernel/futex/core.c +index 706cd446ec71..909dcd708a52 100644 +--- a/kernel/futex/core.c ++++ b/kernel/futex/core.c +@@ -1508,6 +1508,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_ + struct task_struct *new_owner; + bool postunlock = false; + DEFINE_WAKE_Q(wake_q); ++ DEFINE_WAKE_Q(wake_sleeper_q); + int ret = 0; + + new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); +@@ -1557,14 +1558,15 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_ + * not fail. + */ + pi_state_update_owner(pi_state, new_owner); +- postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q); ++ postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q, ++ &wake_sleeper_q); + } + + out_unlock: + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); + + if (postunlock) +- rt_mutex_postunlock(&wake_q); ++ rt_mutex_postunlock(&wake_q, &wake_sleeper_q); + + return ret; + } +@@ -2867,7 +2869,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, + goto no_block; + } + +- rt_mutex_init_waiter(&rt_waiter); ++ rt_mutex_init_waiter(&rt_waiter, false); + + /* + * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not +@@ -3213,7 +3215,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, + * The waiter is allocated on our stack, manipulated by the requeue + * code while we sleep on uaddr. + */ +- rt_mutex_init_waiter(&rt_waiter); ++ rt_mutex_init_waiter(&rt_waiter, false); + + ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE); + if (unlikely(ret != 0)) +diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c +index 40ae934c6caa..35cdbbb5daa9 100644 +--- a/kernel/locking/rtmutex.c ++++ b/kernel/locking/rtmutex.c +@@ -8,6 +8,11 @@ + * Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> + * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt + * Copyright (C) 2006 Esben Nielsen ++ * Adaptive Spinlocks: ++ * Copyright (C) 2008 Novell, Inc., Gregory Haskins, Sven Dietrich, ++ * and Peter Morreale, ++ * Adaptive Spinlocks simplification: ++ * Copyright (C) 2008 Red Hat, Inc., Steven Rostedt <srostedt@redhat.com> + * + * See Documentation/locking/rt-mutex-design.rst for details. + */ +@@ -233,7 +238,7 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, + * Only use with rt_mutex_waiter_{less,equal}() + */ + #define task_to_waiter(p) \ +- &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline } ++ &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline, .task = (p) } + + static inline int + rt_mutex_waiter_less(struct rt_mutex_waiter *left, +@@ -273,6 +278,27 @@ rt_mutex_waiter_equal(struct rt_mutex_waiter *left, + return 1; + } + ++#define STEAL_NORMAL 0 ++#define STEAL_LATERAL 1 ++ ++static inline int ++rt_mutex_steal(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, int mode) ++{ ++ struct rt_mutex_waiter *top_waiter = rt_mutex_top_waiter(lock); ++ ++ if (waiter == top_waiter || rt_mutex_waiter_less(waiter, top_waiter)) ++ return 1; ++ ++ /* ++ * Note that RT tasks are excluded from lateral-steals ++ * to prevent the introduction of an unbounded latency. ++ */ ++ if (mode == STEAL_NORMAL || rt_task(waiter->task)) ++ return 0; ++ ++ return rt_mutex_waiter_equal(waiter, top_waiter); ++} ++ + static void + rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) + { +@@ -377,6 +403,14 @@ static bool rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter, + return debug_rt_mutex_detect_deadlock(waiter, chwalk); + } + ++static void rt_mutex_wake_waiter(struct rt_mutex_waiter *waiter) ++{ ++ if (waiter->savestate) ++ wake_up_lock_sleeper(waiter->task); ++ else ++ wake_up_process(waiter->task); ++} ++ + /* + * Max number of times we'll walk the boosting chain: + */ +@@ -700,13 +734,16 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, + * follow here. This is the end of the chain we are walking. + */ + if (!rt_mutex_owner(lock)) { ++ struct rt_mutex_waiter *lock_top_waiter; ++ + /* + * If the requeue [7] above changed the top waiter, + * then we need to wake the new top waiter up to try + * to get the lock. + */ +- if (prerequeue_top_waiter != rt_mutex_top_waiter(lock)) +- wake_up_process(rt_mutex_top_waiter(lock)->task); ++ lock_top_waiter = rt_mutex_top_waiter(lock); ++ if (prerequeue_top_waiter != lock_top_waiter) ++ rt_mutex_wake_waiter(lock_top_waiter); + raw_spin_unlock_irq(&lock->wait_lock); + return 0; + } +@@ -807,9 +844,11 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, + * @task: The task which wants to acquire the lock + * @waiter: The waiter that is queued to the lock's wait tree if the + * callsite called task_blocked_on_lock(), otherwise NULL ++ * @mode: Lock steal mode (STEAL_NORMAL, STEAL_LATERAL) + */ +-static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, +- struct rt_mutex_waiter *waiter) ++static int __try_to_take_rt_mutex(struct rt_mutex *lock, ++ struct task_struct *task, ++ struct rt_mutex_waiter *waiter, int mode) + { + lockdep_assert_held(&lock->wait_lock); + +@@ -845,12 +884,11 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, + */ + if (waiter) { + /* +- * If waiter is not the highest priority waiter of +- * @lock, give up. ++ * If waiter is not the highest priority waiter of @lock, ++ * or its peer when lateral steal is allowed, give up. + */ +- if (waiter != rt_mutex_top_waiter(lock)) ++ if (!rt_mutex_steal(lock, waiter, mode)) + return 0; +- + /* + * We can acquire the lock. Remove the waiter from the + * lock waiters tree. +@@ -868,14 +906,12 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, + */ + if (rt_mutex_has_waiters(lock)) { + /* +- * If @task->prio is greater than or equal to +- * the top waiter priority (kernel view), +- * @task lost. ++ * If @task->prio is greater than the top waiter ++ * priority (kernel view), or equal to it when a ++ * lateral steal is forbidden, @task lost. + */ +- if (!rt_mutex_waiter_less(task_to_waiter(task), +- rt_mutex_top_waiter(lock))) ++ if (!rt_mutex_steal(lock, task_to_waiter(task), mode)) + return 0; +- + /* + * The current top waiter stays enqueued. We + * don't have to change anything in the lock +@@ -922,6 +958,289 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, + return 1; + } + ++#ifdef CONFIG_PREEMPT_RT ++/* ++ * preemptible spin_lock functions: ++ */ ++static inline void rt_spin_lock_fastlock(struct rt_mutex *lock, ++ void (*slowfn)(struct rt_mutex *lock)) ++{ ++ might_sleep_no_state_check(); ++ ++ if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) ++ return; ++ else ++ slowfn(lock); ++} ++ ++static inline void rt_spin_lock_fastunlock(struct rt_mutex *lock, ++ void (*slowfn)(struct rt_mutex *lock)) ++{ ++ if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) ++ return; ++ else ++ slowfn(lock); ++} ++#ifdef CONFIG_SMP ++/* ++ * Note that owner is a speculative pointer and dereferencing relies ++ * on rcu_read_lock() and the check against the lock owner. ++ */ ++static int adaptive_wait(struct rt_mutex *lock, ++ struct task_struct *owner) ++{ ++ int res = 0; ++ ++ rcu_read_lock(); ++ for (;;) { ++ if (owner != rt_mutex_owner(lock)) ++ break; ++ /* ++ * Ensure that owner->on_cpu is dereferenced _after_ ++ * checking the above to be valid. ++ */ ++ barrier(); ++ if (!owner->on_cpu) { ++ res = 1; ++ break; ++ } ++ cpu_relax(); ++ } ++ rcu_read_unlock(); ++ return res; ++} ++#else ++static int adaptive_wait(struct rt_mutex *lock, ++ struct task_struct *orig_owner) ++{ ++ return 1; ++} ++#endif ++ ++static int task_blocks_on_rt_mutex(struct rt_mutex *lock, ++ struct rt_mutex_waiter *waiter, ++ struct task_struct *task, ++ enum rtmutex_chainwalk chwalk); ++/* ++ * Slow path lock function spin_lock style: this variant is very ++ * careful not to miss any non-lock wakeups. ++ * ++ * We store the current state under p->pi_lock in p->saved_state and ++ * the try_to_wake_up() code handles this accordingly. ++ */ ++void __sched rt_spin_lock_slowlock_locked(struct rt_mutex *lock, ++ struct rt_mutex_waiter *waiter, ++ unsigned long flags) ++{ ++ struct task_struct *lock_owner, *self = current; ++ struct rt_mutex_waiter *top_waiter; ++ int ret; ++ ++ if (__try_to_take_rt_mutex(lock, self, NULL, STEAL_LATERAL)) ++ return; ++ ++ BUG_ON(rt_mutex_owner(lock) == self); ++ ++ /* ++ * We save whatever state the task is in and we'll restore it ++ * after acquiring the lock taking real wakeups into account ++ * as well. We are serialized via pi_lock against wakeups. See ++ * try_to_wake_up(). ++ */ ++ raw_spin_lock(&self->pi_lock); ++ self->saved_state = self->state; ++ __set_current_state_no_track(TASK_UNINTERRUPTIBLE); ++ raw_spin_unlock(&self->pi_lock); ++ ++ ret = task_blocks_on_rt_mutex(lock, waiter, self, RT_MUTEX_MIN_CHAINWALK); ++ BUG_ON(ret); ++ ++ for (;;) { ++ /* Try to acquire the lock again. */ ++ if (__try_to_take_rt_mutex(lock, self, waiter, STEAL_LATERAL)) ++ break; ++ ++ top_waiter = rt_mutex_top_waiter(lock); ++ lock_owner = rt_mutex_owner(lock); ++ ++ raw_spin_unlock_irqrestore(&lock->wait_lock, flags); ++ ++ if (top_waiter != waiter || adaptive_wait(lock, lock_owner)) ++ schedule(); ++ ++ raw_spin_lock_irqsave(&lock->wait_lock, flags); ++ ++ raw_spin_lock(&self->pi_lock); ++ __set_current_state_no_track(TASK_UNINTERRUPTIBLE); ++ raw_spin_unlock(&self->pi_lock); ++ } ++ ++ /* ++ * Restore the task state to current->saved_state. We set it ++ * to the original state above and the try_to_wake_up() code ++ * has possibly updated it when a real (non-rtmutex) wakeup ++ * happened while we were blocked. Clear saved_state so ++ * try_to_wakeup() does not get confused. ++ */ ++ raw_spin_lock(&self->pi_lock); ++ __set_current_state_no_track(self->saved_state); ++ self->saved_state = TASK_RUNNING; ++ raw_spin_unlock(&self->pi_lock); ++ ++ /* ++ * try_to_take_rt_mutex() sets the waiter bit ++ * unconditionally. We might have to fix that up: ++ */ ++ fixup_rt_mutex_waiters(lock); ++ ++ BUG_ON(rt_mutex_has_waiters(lock) && waiter == rt_mutex_top_waiter(lock)); ++ BUG_ON(!RB_EMPTY_NODE(&waiter->tree_entry)); ++} ++ ++static void noinline __sched rt_spin_lock_slowlock(struct rt_mutex *lock) ++{ ++ struct rt_mutex_waiter waiter; ++ unsigned long flags; ++ ++ rt_mutex_init_waiter(&waiter, true); ++ ++ raw_spin_lock_irqsave(&lock->wait_lock, flags); ++ rt_spin_lock_slowlock_locked(lock, &waiter, flags); ++ raw_spin_unlock_irqrestore(&lock->wait_lock, flags); ++ debug_rt_mutex_free_waiter(&waiter); ++} ++ ++static bool __sched __rt_mutex_unlock_common(struct rt_mutex *lock, ++ struct wake_q_head *wake_q, ++ struct wake_q_head *wq_sleeper); ++/* ++ * Slow path to release a rt_mutex spin_lock style ++ */ ++void __sched rt_spin_lock_slowunlock(struct rt_mutex *lock) ++{ ++ unsigned long flags; ++ DEFINE_WAKE_Q(wake_q); ++ DEFINE_WAKE_Q(wake_sleeper_q); ++ bool postunlock; ++ ++ raw_spin_lock_irqsave(&lock->wait_lock, flags); ++ postunlock = __rt_mutex_unlock_common(lock, &wake_q, &wake_sleeper_q); ++ raw_spin_unlock_irqrestore(&lock->wait_lock, flags); ++ ++ if (postunlock) ++ rt_mutex_postunlock(&wake_q, &wake_sleeper_q); ++} ++ ++void __lockfunc rt_spin_lock(spinlock_t *lock) ++{ ++ spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); ++ rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock); ++ migrate_disable(); ++} ++EXPORT_SYMBOL(rt_spin_lock); ++ ++void __lockfunc __rt_spin_lock(struct rt_mutex *lock) ++{ ++ rt_spin_lock_fastlock(lock, rt_spin_lock_slowlock); ++} ++ ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass) ++{ ++ spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); ++ rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock); ++ migrate_disable(); ++} ++EXPORT_SYMBOL(rt_spin_lock_nested); ++ ++void __lockfunc rt_spin_lock_nest_lock(spinlock_t *lock, ++ struct lockdep_map *nest_lock) ++{ ++ spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_); ++ rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock); ++ migrate_disable(); ++} ++EXPORT_SYMBOL(rt_spin_lock_nest_lock); ++#endif ++ ++void __lockfunc rt_spin_unlock(spinlock_t *lock) ++{ ++ /* NOTE: we always pass in '1' for nested, for simplicity */ ++ spin_release(&lock->dep_map, _RET_IP_); ++ migrate_enable(); ++ rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock); ++} ++EXPORT_SYMBOL(rt_spin_unlock); ++ ++void __lockfunc __rt_spin_unlock(struct rt_mutex *lock) ++{ ++ rt_spin_lock_fastunlock(lock, rt_spin_lock_slowunlock); ++} ++EXPORT_SYMBOL(__rt_spin_unlock); ++ ++/* ++ * Wait for the lock to get unlocked: instead of polling for an unlock ++ * (like raw spinlocks do), we lock and unlock, to force the kernel to ++ * schedule if there's contention: ++ */ ++void __lockfunc rt_spin_lock_unlock(spinlock_t *lock) ++{ ++ spin_lock(lock); ++ spin_unlock(lock); ++} ++EXPORT_SYMBOL(rt_spin_lock_unlock); ++ ++int __lockfunc rt_spin_trylock(spinlock_t *lock) ++{ ++ int ret; ++ ++ ret = __rt_mutex_trylock(&lock->lock); ++ if (ret) { ++ spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); ++ migrate_disable(); ++ } ++ return ret; ++} ++EXPORT_SYMBOL(rt_spin_trylock); ++ ++int __lockfunc rt_spin_trylock_bh(spinlock_t *lock) ++{ ++ int ret; ++ ++ local_bh_disable(); ++ ret = __rt_mutex_trylock(&lock->lock); ++ if (ret) { ++ spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); ++ migrate_disable(); ++ } else { ++ local_bh_enable(); ++ } ++ return ret; ++} ++EXPORT_SYMBOL(rt_spin_trylock_bh); ++ ++void ++__rt_spin_lock_init(spinlock_t *lock, const char *name, struct lock_class_key *key) ++{ ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++ /* ++ * Make sure we are not reinitializing a held lock: ++ */ ++ debug_check_no_locks_freed((void *)lock, sizeof(*lock)); ++ lockdep_init_map(&lock->dep_map, name, key, 0); ++#endif ++} ++EXPORT_SYMBOL(__rt_spin_lock_init); ++ ++#endif /* PREEMPT_RT */ ++ ++static inline int ++try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, ++ struct rt_mutex_waiter *waiter) ++{ ++ return __try_to_take_rt_mutex(lock, task, waiter, STEAL_NORMAL); ++} ++ + /* + * Task blocks on lock. + * +@@ -1035,6 +1354,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, + * Called with lock->wait_lock held and interrupts disabled. + */ + static void mark_wakeup_next_waiter(struct wake_q_head *wake_q, ++ struct wake_q_head *wake_sleeper_q, + struct rt_mutex *lock) + { + struct rt_mutex_waiter *waiter; +@@ -1074,7 +1394,10 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q, + * Pairs with preempt_enable() in rt_mutex_postunlock(); + */ + preempt_disable(); +- wake_q_add(wake_q, waiter->task); ++ if (waiter->savestate) ++ wake_q_add_sleeper(wake_sleeper_q, waiter->task); ++ else ++ wake_q_add(wake_q, waiter->task); + raw_spin_unlock(¤t->pi_lock); + } + +@@ -1158,21 +1481,22 @@ void rt_mutex_adjust_pi(struct task_struct *task) + return; + } + next_lock = waiter->lock; +- raw_spin_unlock_irqrestore(&task->pi_lock, flags); + + /* gets dropped in rt_mutex_adjust_prio_chain()! */ + get_task_struct(task); + ++ raw_spin_unlock_irqrestore(&task->pi_lock, flags); + rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL, + next_lock, NULL, task); + } + +-void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) ++void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter, bool savestate) + { + debug_rt_mutex_init_waiter(waiter); + RB_CLEAR_NODE(&waiter->pi_tree_entry); + RB_CLEAR_NODE(&waiter->tree_entry); + waiter->task = NULL; ++ waiter->savestate = savestate; + } + + /** +@@ -1283,7 +1607,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, + unsigned long flags; + int ret = 0; + +- rt_mutex_init_waiter(&waiter); ++ rt_mutex_init_waiter(&waiter, false); + + /* + * Technically we could use raw_spin_[un]lock_irq() here, but this can +@@ -1356,7 +1680,8 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock) + * Return whether the current task needs to call rt_mutex_postunlock(). + */ + static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock, +- struct wake_q_head *wake_q) ++ struct wake_q_head *wake_q, ++ struct wake_q_head *wake_sleeper_q) + { + unsigned long flags; + +@@ -1410,7 +1735,7 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock, + * + * Queue the next waiter for wakeup once we release the wait_lock. + */ +- mark_wakeup_next_waiter(wake_q, lock); ++ mark_wakeup_next_waiter(wake_q, wake_sleeper_q, lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); + + return true; /* call rt_mutex_postunlock() */ +@@ -1447,9 +1772,11 @@ rt_mutex_fasttrylock(struct rt_mutex *lock, + /* + * Performs the wakeup of the top-waiter and re-enables preemption. + */ +-void rt_mutex_postunlock(struct wake_q_head *wake_q) ++void rt_mutex_postunlock(struct wake_q_head *wake_q, ++ struct wake_q_head *wake_sleeper_q) + { + wake_up_q(wake_q); ++ wake_up_q_sleeper(wake_sleeper_q); + + /* Pairs with preempt_disable() in rt_mutex_slowunlock() */ + preempt_enable(); +@@ -1458,15 +1785,17 @@ void rt_mutex_postunlock(struct wake_q_head *wake_q) + static inline void + rt_mutex_fastunlock(struct rt_mutex *lock, + bool (*slowfn)(struct rt_mutex *lock, +- struct wake_q_head *wqh)) ++ struct wake_q_head *wqh, ++ struct wake_q_head *wq_sleeper)) + { + DEFINE_WAKE_Q(wake_q); ++ DEFINE_WAKE_Q(wake_sleeper_q); + + if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) + return; + +- if (slowfn(lock, &wake_q)) +- rt_mutex_postunlock(&wake_q); ++ if (slowfn(lock, &wake_q, &wake_sleeper_q)) ++ rt_mutex_postunlock(&wake_q, &wake_sleeper_q); + } + + int __sched __rt_mutex_lock_state(struct rt_mutex *lock, int state) +@@ -1597,16 +1926,13 @@ void __sched __rt_mutex_unlock(struct rt_mutex *lock) + void __sched rt_mutex_unlock(struct rt_mutex *lock) + { + mutex_release(&lock->dep_map, _RET_IP_); +- rt_mutex_fastunlock(lock, rt_mutex_slowunlock); ++ __rt_mutex_unlock(lock); + } + EXPORT_SYMBOL_GPL(rt_mutex_unlock); + +-/** +- * Futex variant, that since futex variants do not use the fast-path, can be +- * simple and will not need to retry. +- */ +-bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock, +- struct wake_q_head *wake_q) ++static bool __sched __rt_mutex_unlock_common(struct rt_mutex *lock, ++ struct wake_q_head *wake_q, ++ struct wake_q_head *wq_sleeper) + { + lockdep_assert_held(&lock->wait_lock); + +@@ -1623,23 +1949,35 @@ bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock, + * avoid inversion prior to the wakeup. preempt_disable() + * therein pairs with rt_mutex_postunlock(). + */ +- mark_wakeup_next_waiter(wake_q, lock); ++ mark_wakeup_next_waiter(wake_q, wq_sleeper, lock); + + return true; /* call postunlock() */ + } + ++/** ++ * Futex variant, that since futex variants do not use the fast-path, can be ++ * simple and will not need to retry. ++ */ ++bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock, ++ struct wake_q_head *wake_q, ++ struct wake_q_head *wq_sleeper) ++{ ++ return __rt_mutex_unlock_common(lock, wake_q, wq_sleeper); ++} ++ + void __sched rt_mutex_futex_unlock(struct rt_mutex *lock) + { + DEFINE_WAKE_Q(wake_q); ++ DEFINE_WAKE_Q(wake_sleeper_q); + unsigned long flags; + bool postunlock; + + raw_spin_lock_irqsave(&lock->wait_lock, flags); +- postunlock = __rt_mutex_futex_unlock(lock, &wake_q); ++ postunlock = __rt_mutex_futex_unlock(lock, &wake_q, &wake_sleeper_q); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); + + if (postunlock) +- rt_mutex_postunlock(&wake_q); ++ rt_mutex_postunlock(&wake_q, &wake_sleeper_q); + } + + /** +@@ -1675,7 +2013,7 @@ void __rt_mutex_init(struct rt_mutex *lock, const char *name, + if (name && key) + debug_rt_mutex_init(lock, name, key); + } +-EXPORT_SYMBOL_GPL(__rt_mutex_init); ++EXPORT_SYMBOL(__rt_mutex_init); + + /** + * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a +@@ -1695,6 +2033,14 @@ void rt_mutex_init_proxy_locked(struct rt_mutex *lock, + struct task_struct *proxy_owner) + { + __rt_mutex_init(lock, NULL, NULL); ++#ifdef CONFIG_DEBUG_SPINLOCK ++ /* ++ * get another key class for the wait_lock. LOCK_PI and UNLOCK_PI is ++ * holding the ->wait_lock of the proxy_lock while unlocking a sleeping ++ * lock. ++ */ ++ raw_spin_lock_init(&lock->wait_lock); ++#endif + debug_rt_mutex_proxy_lock(lock, proxy_owner); + rt_mutex_set_owner(lock, proxy_owner); + } +@@ -1717,6 +2063,26 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock) + rt_mutex_set_owner(lock, NULL); + } + ++static void fixup_rt_mutex_blocked(struct rt_mutex *lock) ++{ ++ struct task_struct *tsk = current; ++ /* ++ * RT has a problem here when the wait got interrupted by a timeout ++ * or a signal. task->pi_blocked_on is still set. The task must ++ * acquire the hash bucket lock when returning from this function. ++ * ++ * If the hash bucket lock is contended then the ++ * BUG_ON(rt_mutex_real_waiter(task->pi_blocked_on)) in ++ * task_blocks_on_rt_mutex() will trigger. This can be avoided by ++ * clearing task->pi_blocked_on which removes the task from the ++ * boosting chain of the rtmutex. That's correct because the task ++ * is not longer blocked on it. ++ */ ++ raw_spin_lock(&tsk->pi_lock); ++ tsk->pi_blocked_on = NULL; ++ raw_spin_unlock(&tsk->pi_lock); ++} ++ + /** + * __rt_mutex_start_proxy_lock() - Start lock acquisition for another task + * @lock: the rt_mutex to take +@@ -1789,6 +2155,9 @@ int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, + ret = 0; + } + ++ if (ret) ++ fixup_rt_mutex_blocked(lock); ++ + return ret; + } + +@@ -1878,6 +2247,9 @@ int rt_mutex_wait_proxy_lock(struct rt_mutex *lock, + * have to fix that up. + */ + fixup_rt_mutex_waiters(lock); ++ if (ret) ++ fixup_rt_mutex_blocked(lock); ++ + raw_spin_unlock_irq(&lock->wait_lock); + + return ret; +diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h +index 9d1e974ca9c3..c1a280167e3c 100644 +--- a/kernel/locking/rtmutex_common.h ++++ b/kernel/locking/rtmutex_common.h +@@ -31,6 +31,7 @@ struct rt_mutex_waiter { + struct task_struct *task; + struct rt_mutex *lock; + int prio; ++ bool savestate; + u64 deadline; + }; + +@@ -133,7 +134,7 @@ extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock); + extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock, + struct task_struct *proxy_owner); + extern void rt_mutex_proxy_unlock(struct rt_mutex *lock); +-extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter); ++extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter, bool savetate); + extern int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter, + struct task_struct *task); +@@ -151,9 +152,12 @@ extern int __rt_mutex_futex_trylock(struct rt_mutex *l); + + extern void rt_mutex_futex_unlock(struct rt_mutex *lock); + extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock, +- struct wake_q_head *wqh); ++ struct wake_q_head *wqh, ++ struct wake_q_head *wq_sleeper); ++ ++extern void rt_mutex_postunlock(struct wake_q_head *wake_q, ++ struct wake_q_head *wake_sleeper_q); + +-extern void rt_mutex_postunlock(struct wake_q_head *wake_q); + /* RW semaphore special interface */ + + extern int __rt_mutex_lock_state(struct rt_mutex *lock, int state); +@@ -163,6 +167,10 @@ int __sched rt_mutex_slowlock_locked(struct rt_mutex *lock, int state, + struct hrtimer_sleeper *timeout, + enum rtmutex_chainwalk chwalk, + struct rt_mutex_waiter *waiter); ++void __sched rt_spin_lock_slowlock_locked(struct rt_mutex *lock, ++ struct rt_mutex_waiter *waiter, ++ unsigned long flags); ++void __sched rt_spin_lock_slowunlock(struct rt_mutex *lock); + + #ifdef CONFIG_DEBUG_RT_MUTEXES + # include "rtmutex-debug.h" +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index f6d40256c0d4..be5d41ed6ff2 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -511,9 +511,15 @@ static bool set_nr_if_polling(struct task_struct *p) + #endif + #endif + +-static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task) ++static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task, ++ bool sleeper) + { +- struct wake_q_node *node = &task->wake_q; ++ struct wake_q_node *node; ++ ++ if (sleeper) ++ node = &task->wake_q_sleeper; ++ else ++ node = &task->wake_q; + + /* + * Atomically grab the task, if ->wake_q is !nil already it means +@@ -549,7 +555,13 @@ static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task) + */ + void wake_q_add(struct wake_q_head *head, struct task_struct *task) + { +- if (__wake_q_add(head, task)) ++ if (__wake_q_add(head, task, false)) ++ get_task_struct(task); ++} ++ ++void wake_q_add_sleeper(struct wake_q_head *head, struct task_struct *task) ++{ ++ if (__wake_q_add(head, task, true)) + get_task_struct(task); + } + +@@ -572,28 +584,39 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task) + */ + void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task) + { +- if (!__wake_q_add(head, task)) ++ if (!__wake_q_add(head, task, false)) + put_task_struct(task); + } + +-void wake_up_q(struct wake_q_head *head) ++void __wake_up_q(struct wake_q_head *head, bool sleeper) + { + struct wake_q_node *node = head->first; + + while (node != WAKE_Q_TAIL) { + struct task_struct *task; + +- task = container_of(node, struct task_struct, wake_q); ++ if (sleeper) ++ task = container_of(node, struct task_struct, wake_q_sleeper); ++ else ++ task = container_of(node, struct task_struct, wake_q); ++ + BUG_ON(!task); + /* Task can safely be re-inserted now: */ + node = node->next; +- task->wake_q.next = NULL; + ++ if (sleeper) ++ task->wake_q_sleeper.next = NULL; ++ else ++ task->wake_q.next = NULL; + /* + * wake_up_process() executes a full barrier, which pairs with + * the queueing in wake_q_add() so as not to miss wakeups. + */ +- wake_up_process(task); ++ if (sleeper) ++ wake_up_lock_sleeper(task); ++ else ++ wake_up_process(task); ++ + put_task_struct(task); + } + } +-- +2.43.0 + diff --git a/debian/patches-rt/0171-locking-rtmutex-Allow-rt_mutex_trylock-on-PREEMPT_RT.patch b/debian/patches-rt/0171-locking-rtmutex-Allow-rt_mutex_trylock-on-PREEMPT_RT.patch new file mode 100644 index 000000000..4c2c1b803 --- /dev/null +++ b/debian/patches-rt/0171-locking-rtmutex-Allow-rt_mutex_trylock-on-PREEMPT_RT.patch @@ -0,0 +1,37 @@ +From bcc9f5e2b7eb56660512649787ab651fc138d9e4 Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Wed, 2 Dec 2015 11:34:07 +0100 +Subject: [PATCH 171/323] locking/rtmutex: Allow rt_mutex_trylock() on + PREEMPT_RT +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Non PREEMPT_RT kernel can deadlock on rt_mutex_trylock() in softirq +context. +On PREEMPT_RT the softirq context is handled in thread context. This +avoids the deadlock in the slow path and PI-boosting will be done on the +correct thread. + +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + kernel/locking/rtmutex.c | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c +index 35cdbbb5daa9..1bf7a04688c5 100644 +--- a/kernel/locking/rtmutex.c ++++ b/kernel/locking/rtmutex.c +@@ -1884,7 +1884,11 @@ int __sched __rt_mutex_futex_trylock(struct rt_mutex *lock) + + int __sched __rt_mutex_trylock(struct rt_mutex *lock) + { ++#ifdef CONFIG_PREEMPT_RT ++ if (WARN_ON_ONCE(in_irq() || in_nmi())) ++#else + if (WARN_ON_ONCE(in_irq() || in_nmi() || in_serving_softirq())) ++#endif + return 0; + + return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock); +-- +2.43.0 + diff --git a/debian/patches-rt/0172-locking-rtmutex-add-mutex-implementation-based-on-rt.patch b/debian/patches-rt/0172-locking-rtmutex-add-mutex-implementation-based-on-rt.patch new file mode 100644 index 000000000..655fd8534 --- /dev/null +++ b/debian/patches-rt/0172-locking-rtmutex-add-mutex-implementation-based-on-rt.patch @@ -0,0 +1,385 @@ +From c363a86a2fab38d43a50a8841e2b6d64b87ae69b Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Thu, 12 Oct 2017 17:17:03 +0200 +Subject: [PATCH 172/323] locking/rtmutex: add mutex implementation based on + rtmutex +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/linux/mutex_rt.h | 130 ++++++++++++++++++++++ + kernel/locking/mutex-rt.c | 224 ++++++++++++++++++++++++++++++++++++++ + 2 files changed, 354 insertions(+) + create mode 100644 include/linux/mutex_rt.h + create mode 100644 kernel/locking/mutex-rt.c + +diff --git a/include/linux/mutex_rt.h b/include/linux/mutex_rt.h +new file mode 100644 +index 000000000000..f0b2e07cd5c5 +--- /dev/null ++++ b/include/linux/mutex_rt.h +@@ -0,0 +1,130 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++#ifndef __LINUX_MUTEX_RT_H ++#define __LINUX_MUTEX_RT_H ++ ++#ifndef __LINUX_MUTEX_H ++#error "Please include mutex.h" ++#endif ++ ++#include <linux/rtmutex.h> ++ ++/* FIXME: Just for __lockfunc */ ++#include <linux/spinlock.h> ++ ++struct mutex { ++ struct rt_mutex lock; ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++ struct lockdep_map dep_map; ++#endif ++}; ++ ++#define __MUTEX_INITIALIZER(mutexname) \ ++ { \ ++ .lock = __RT_MUTEX_INITIALIZER(mutexname.lock) \ ++ __DEP_MAP_MUTEX_INITIALIZER(mutexname) \ ++ } ++ ++#define DEFINE_MUTEX(mutexname) \ ++ struct mutex mutexname = __MUTEX_INITIALIZER(mutexname) ++ ++extern void __mutex_do_init(struct mutex *lock, const char *name, struct lock_class_key *key); ++extern void __lockfunc _mutex_lock(struct mutex *lock); ++extern void __lockfunc _mutex_lock_io_nested(struct mutex *lock, int subclass); ++extern int __lockfunc _mutex_lock_interruptible(struct mutex *lock); ++extern int __lockfunc _mutex_lock_killable(struct mutex *lock); ++extern void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass); ++extern void __lockfunc _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest_lock); ++extern int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass); ++extern int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass); ++extern int __lockfunc _mutex_trylock(struct mutex *lock); ++extern void __lockfunc _mutex_unlock(struct mutex *lock); ++ ++#define mutex_is_locked(l) rt_mutex_is_locked(&(l)->lock) ++#define mutex_lock(l) _mutex_lock(l) ++#define mutex_lock_interruptible(l) _mutex_lock_interruptible(l) ++#define mutex_lock_killable(l) _mutex_lock_killable(l) ++#define mutex_trylock(l) _mutex_trylock(l) ++#define mutex_unlock(l) _mutex_unlock(l) ++#define mutex_lock_io(l) _mutex_lock_io_nested(l, 0); ++ ++#define __mutex_owner(l) ((l)->lock.owner) ++ ++#ifdef CONFIG_DEBUG_MUTEXES ++#define mutex_destroy(l) rt_mutex_destroy(&(l)->lock) ++#else ++static inline void mutex_destroy(struct mutex *lock) {} ++#endif ++ ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++# define mutex_lock_nested(l, s) _mutex_lock_nested(l, s) ++# define mutex_lock_interruptible_nested(l, s) \ ++ _mutex_lock_interruptible_nested(l, s) ++# define mutex_lock_killable_nested(l, s) \ ++ _mutex_lock_killable_nested(l, s) ++# define mutex_lock_io_nested(l, s) _mutex_lock_io_nested(l, s) ++ ++# define mutex_lock_nest_lock(lock, nest_lock) \ ++do { \ ++ typecheck(struct lockdep_map *, &(nest_lock)->dep_map); \ ++ _mutex_lock_nest_lock(lock, &(nest_lock)->dep_map); \ ++} while (0) ++ ++#else ++# define mutex_lock_nested(l, s) _mutex_lock(l) ++# define mutex_lock_interruptible_nested(l, s) \ ++ _mutex_lock_interruptible(l) ++# define mutex_lock_killable_nested(l, s) \ ++ _mutex_lock_killable(l) ++# define mutex_lock_nest_lock(lock, nest_lock) mutex_lock(lock) ++# define mutex_lock_io_nested(l, s) _mutex_lock_io_nested(l, s) ++#endif ++ ++# define mutex_init(mutex) \ ++do { \ ++ static struct lock_class_key __key; \ ++ \ ++ rt_mutex_init(&(mutex)->lock); \ ++ __mutex_do_init((mutex), #mutex, &__key); \ ++} while (0) ++ ++# define __mutex_init(mutex, name, key) \ ++do { \ ++ rt_mutex_init(&(mutex)->lock); \ ++ __mutex_do_init((mutex), name, key); \ ++} while (0) ++ ++/** ++ * These values are chosen such that FAIL and SUCCESS match the ++ * values of the regular mutex_trylock(). ++ */ ++enum mutex_trylock_recursive_enum { ++ MUTEX_TRYLOCK_FAILED = 0, ++ MUTEX_TRYLOCK_SUCCESS = 1, ++ MUTEX_TRYLOCK_RECURSIVE, ++}; ++/** ++ * mutex_trylock_recursive - trylock variant that allows recursive locking ++ * @lock: mutex to be locked ++ * ++ * This function should not be used, _ever_. It is purely for hysterical GEM ++ * raisins, and once those are gone this will be removed. ++ * ++ * Returns: ++ * MUTEX_TRYLOCK_FAILED - trylock failed, ++ * MUTEX_TRYLOCK_SUCCESS - lock acquired, ++ * MUTEX_TRYLOCK_RECURSIVE - we already owned the lock. ++ */ ++int __rt_mutex_owner_current(struct rt_mutex *lock); ++ ++static inline /* __deprecated */ __must_check enum mutex_trylock_recursive_enum ++mutex_trylock_recursive(struct mutex *lock) ++{ ++ if (unlikely(__rt_mutex_owner_current(&lock->lock))) ++ return MUTEX_TRYLOCK_RECURSIVE; ++ ++ return mutex_trylock(lock); ++} ++ ++extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock); ++ ++#endif +diff --git a/kernel/locking/mutex-rt.c b/kernel/locking/mutex-rt.c +new file mode 100644 +index 000000000000..2b849e6b9b4a +--- /dev/null ++++ b/kernel/locking/mutex-rt.c +@@ -0,0 +1,224 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++/* ++ * Real-Time Preemption Support ++ * ++ * started by Ingo Molnar: ++ * ++ * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> ++ * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> ++ * ++ * historic credit for proving that Linux spinlocks can be implemented via ++ * RT-aware mutexes goes to many people: The Pmutex project (Dirk Grambow ++ * and others) who prototyped it on 2.4 and did lots of comparative ++ * research and analysis; TimeSys, for proving that you can implement a ++ * fully preemptible kernel via the use of IRQ threading and mutexes; ++ * Bill Huey for persuasively arguing on lkml that the mutex model is the ++ * right one; and to MontaVista, who ported pmutexes to 2.6. ++ * ++ * This code is a from-scratch implementation and is not based on pmutexes, ++ * but the idea of converting spinlocks to mutexes is used here too. ++ * ++ * lock debugging, locking tree, deadlock detection: ++ * ++ * Copyright (C) 2004, LynuxWorks, Inc., Igor Manyilov, Bill Huey ++ * Released under the General Public License (GPL). ++ * ++ * Includes portions of the generic R/W semaphore implementation from: ++ * ++ * Copyright (c) 2001 David Howells (dhowells@redhat.com). ++ * - Derived partially from idea by Andrea Arcangeli <andrea@suse.de> ++ * - Derived also from comments by Linus ++ * ++ * Pending ownership of locks and ownership stealing: ++ * ++ * Copyright (C) 2005, Kihon Technologies Inc., Steven Rostedt ++ * ++ * (also by Steven Rostedt) ++ * - Converted single pi_lock to individual task locks. ++ * ++ * By Esben Nielsen: ++ * Doing priority inheritance with help of the scheduler. ++ * ++ * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> ++ * - major rework based on Esben Nielsens initial patch ++ * - replaced thread_info references by task_struct refs ++ * - removed task->pending_owner dependency ++ * - BKL drop/reacquire for semaphore style locks to avoid deadlocks ++ * in the scheduler return path as discussed with Steven Rostedt ++ * ++ * Copyright (C) 2006, Kihon Technologies Inc. ++ * Steven Rostedt <rostedt@goodmis.org> ++ * - debugged and patched Thomas Gleixner's rework. ++ * - added back the cmpxchg to the rework. ++ * - turned atomic require back on for SMP. ++ */ ++ ++#include <linux/spinlock.h> ++#include <linux/rtmutex.h> ++#include <linux/sched.h> ++#include <linux/delay.h> ++#include <linux/module.h> ++#include <linux/kallsyms.h> ++#include <linux/syscalls.h> ++#include <linux/interrupt.h> ++#include <linux/plist.h> ++#include <linux/fs.h> ++#include <linux/futex.h> ++#include <linux/hrtimer.h> ++#include <linux/blkdev.h> ++ ++#include "rtmutex_common.h" ++ ++/* ++ * struct mutex functions ++ */ ++void __mutex_do_init(struct mutex *mutex, const char *name, ++ struct lock_class_key *key) ++{ ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++ /* ++ * Make sure we are not reinitializing a held lock: ++ */ ++ debug_check_no_locks_freed((void *)mutex, sizeof(*mutex)); ++ lockdep_init_map(&mutex->dep_map, name, key, 0); ++#endif ++ mutex->lock.save_state = 0; ++} ++EXPORT_SYMBOL(__mutex_do_init); ++ ++static int _mutex_lock_blk_flush(struct mutex *lock, int state) ++{ ++ /* ++ * Flush blk before ->pi_blocked_on is set. At schedule() time it is too ++ * late if one of the callbacks needs to acquire a sleeping lock. ++ */ ++ if (blk_needs_flush_plug(current)) ++ blk_schedule_flush_plug(current); ++ return __rt_mutex_lock_state(&lock->lock, state); ++} ++ ++void __lockfunc _mutex_lock(struct mutex *lock) ++{ ++ mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); ++ _mutex_lock_blk_flush(lock, TASK_UNINTERRUPTIBLE); ++} ++EXPORT_SYMBOL(_mutex_lock); ++ ++void __lockfunc _mutex_lock_io_nested(struct mutex *lock, int subclass) ++{ ++ int token; ++ ++ token = io_schedule_prepare(); ++ ++ mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_); ++ __rt_mutex_lock_state(&lock->lock, TASK_UNINTERRUPTIBLE); ++ ++ io_schedule_finish(token); ++} ++EXPORT_SYMBOL_GPL(_mutex_lock_io_nested); ++ ++int __lockfunc _mutex_lock_interruptible(struct mutex *lock) ++{ ++ int ret; ++ ++ mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); ++ ret = _mutex_lock_blk_flush(lock, TASK_INTERRUPTIBLE); ++ if (ret) ++ mutex_release(&lock->dep_map, _RET_IP_); ++ return ret; ++} ++EXPORT_SYMBOL(_mutex_lock_interruptible); ++ ++int __lockfunc _mutex_lock_killable(struct mutex *lock) ++{ ++ int ret; ++ ++ mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); ++ ret = _mutex_lock_blk_flush(lock, TASK_KILLABLE); ++ if (ret) ++ mutex_release(&lock->dep_map, _RET_IP_); ++ return ret; ++} ++EXPORT_SYMBOL(_mutex_lock_killable); ++ ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass) ++{ ++ mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_); ++ _mutex_lock_blk_flush(lock, TASK_UNINTERRUPTIBLE); ++} ++EXPORT_SYMBOL(_mutex_lock_nested); ++ ++void __lockfunc _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest) ++{ ++ mutex_acquire_nest(&lock->dep_map, 0, 0, nest, _RET_IP_); ++ _mutex_lock_blk_flush(lock, TASK_UNINTERRUPTIBLE); ++} ++EXPORT_SYMBOL(_mutex_lock_nest_lock); ++ ++int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass) ++{ ++ int ret; ++ ++ mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_); ++ ret = _mutex_lock_blk_flush(lock, TASK_INTERRUPTIBLE); ++ if (ret) ++ mutex_release(&lock->dep_map, _RET_IP_); ++ return ret; ++} ++EXPORT_SYMBOL(_mutex_lock_interruptible_nested); ++ ++int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass) ++{ ++ int ret; ++ ++ mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_); ++ ret = _mutex_lock_blk_flush(lock, TASK_KILLABLE); ++ if (ret) ++ mutex_release(&lock->dep_map, _RET_IP_); ++ return ret; ++} ++EXPORT_SYMBOL(_mutex_lock_killable_nested); ++#endif ++ ++int __lockfunc _mutex_trylock(struct mutex *lock) ++{ ++ int ret = __rt_mutex_trylock(&lock->lock); ++ ++ if (ret) ++ mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); ++ ++ return ret; ++} ++EXPORT_SYMBOL(_mutex_trylock); ++ ++void __lockfunc _mutex_unlock(struct mutex *lock) ++{ ++ mutex_release(&lock->dep_map, _RET_IP_); ++ __rt_mutex_unlock(&lock->lock); ++} ++EXPORT_SYMBOL(_mutex_unlock); ++ ++/** ++ * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0 ++ * @cnt: the atomic which we are to dec ++ * @lock: the mutex to return holding if we dec to 0 ++ * ++ * return true and hold lock if we dec to 0, return false otherwise ++ */ ++int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock) ++{ ++ /* dec if we can't possibly hit 0 */ ++ if (atomic_add_unless(cnt, -1, 1)) ++ return 0; ++ /* we might hit 0, so take the lock */ ++ mutex_lock(lock); ++ if (!atomic_dec_and_test(cnt)) { ++ /* when we actually did the dec, we didn't hit 0 */ ++ mutex_unlock(lock); ++ return 0; ++ } ++ /* we hit 0, and we hold the lock */ ++ return 1; ++} ++EXPORT_SYMBOL(atomic_dec_and_mutex_lock); +-- +2.43.0 + diff --git a/debian/patches-rt/0173-locking-rtmutex-add-rwsem-implementation-based-on-rt.patch b/debian/patches-rt/0173-locking-rtmutex-add-rwsem-implementation-based-on-rt.patch new file mode 100644 index 000000000..d1595ab45 --- /dev/null +++ b/debian/patches-rt/0173-locking-rtmutex-add-rwsem-implementation-based-on-rt.patch @@ -0,0 +1,455 @@ +From 9095a37abced402fb5197fef8d126069f63ebfb4 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Thu, 12 Oct 2017 17:28:34 +0200 +Subject: [PATCH 173/323] locking/rtmutex: add rwsem implementation based on + rtmutex +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +The RT specific R/W semaphore implementation restricts the number of readers +to one because a writer cannot block on multiple readers and inherit its +priority or budget. + +The single reader restricting is painful in various ways: + + - Performance bottleneck for multi-threaded applications in the page fault + path (mmap sem) + + - Progress blocker for drivers which are carefully crafted to avoid the + potential reader/writer deadlock in mainline. + +The analysis of the writer code paths shows, that properly written RT tasks +should not take them. Syscalls like mmap(), file access which take mmap sem +write locked have unbound latencies which are completely unrelated to mmap +sem. Other R/W sem users like graphics drivers are not suitable for RT tasks +either. + +So there is little risk to hurt RT tasks when the RT rwsem implementation is +changed in the following way: + + - Allow concurrent readers + + - Make writers block until the last reader left the critical section. This + blocking is not subject to priority/budget inheritance. + + - Readers blocked on a writer inherit their priority/budget in the normal + way. + +There is a drawback with this scheme. R/W semaphores become writer unfair +though the applications which have triggered writer starvation (mostly on +mmap_sem) in the past are not really the typical workloads running on a RT +system. So while it's unlikely to hit writer starvation, it's possible. If +there are unexpected workloads on RT systems triggering it, we need to rethink +the approach. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/linux/rwsem-rt.h | 70 +++++++++ + kernel/locking/rwsem-rt.c | 318 ++++++++++++++++++++++++++++++++++++++ + 2 files changed, 388 insertions(+) + create mode 100644 include/linux/rwsem-rt.h + create mode 100644 kernel/locking/rwsem-rt.c + +diff --git a/include/linux/rwsem-rt.h b/include/linux/rwsem-rt.h +new file mode 100644 +index 000000000000..0ba8aae9a198 +--- /dev/null ++++ b/include/linux/rwsem-rt.h +@@ -0,0 +1,70 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++#ifndef _LINUX_RWSEM_RT_H ++#define _LINUX_RWSEM_RT_H ++ ++#ifndef _LINUX_RWSEM_H ++#error "Include rwsem.h" ++#endif ++ ++#include <linux/rtmutex.h> ++#include <linux/swait.h> ++ ++#define READER_BIAS (1U << 31) ++#define WRITER_BIAS (1U << 30) ++ ++struct rw_semaphore { ++ atomic_t readers; ++ struct rt_mutex rtmutex; ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++ struct lockdep_map dep_map; ++#endif ++}; ++ ++#define __RWSEM_INITIALIZER(name) \ ++{ \ ++ .readers = ATOMIC_INIT(READER_BIAS), \ ++ .rtmutex = __RT_MUTEX_INITIALIZER(name.rtmutex), \ ++ RW_DEP_MAP_INIT(name) \ ++} ++ ++#define DECLARE_RWSEM(lockname) \ ++ struct rw_semaphore lockname = __RWSEM_INITIALIZER(lockname) ++ ++extern void __rwsem_init(struct rw_semaphore *rwsem, const char *name, ++ struct lock_class_key *key); ++ ++#define __init_rwsem(sem, name, key) \ ++do { \ ++ rt_mutex_init(&(sem)->rtmutex); \ ++ __rwsem_init((sem), (name), (key)); \ ++} while (0) ++ ++#define init_rwsem(sem) \ ++do { \ ++ static struct lock_class_key __key; \ ++ \ ++ __init_rwsem((sem), #sem, &__key); \ ++} while (0) ++ ++static inline int rwsem_is_locked(struct rw_semaphore *sem) ++{ ++ return atomic_read(&sem->readers) != READER_BIAS; ++} ++ ++static inline int rwsem_is_contended(struct rw_semaphore *sem) ++{ ++ return atomic_read(&sem->readers) > 0; ++} ++ ++extern void __down_read(struct rw_semaphore *sem); ++extern int __down_read_interruptible(struct rw_semaphore *sem); ++extern int __down_read_killable(struct rw_semaphore *sem); ++extern int __down_read_trylock(struct rw_semaphore *sem); ++extern void __down_write(struct rw_semaphore *sem); ++extern int __must_check __down_write_killable(struct rw_semaphore *sem); ++extern int __down_write_trylock(struct rw_semaphore *sem); ++extern void __up_read(struct rw_semaphore *sem); ++extern void __up_write(struct rw_semaphore *sem); ++extern void __downgrade_write(struct rw_semaphore *sem); ++ ++#endif +diff --git a/kernel/locking/rwsem-rt.c b/kernel/locking/rwsem-rt.c +new file mode 100644 +index 000000000000..a0771c150041 +--- /dev/null ++++ b/kernel/locking/rwsem-rt.c +@@ -0,0 +1,318 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++#include <linux/rwsem.h> ++#include <linux/sched/debug.h> ++#include <linux/sched/signal.h> ++#include <linux/export.h> ++#include <linux/blkdev.h> ++ ++#include "rtmutex_common.h" ++ ++/* ++ * RT-specific reader/writer semaphores ++ * ++ * down_write() ++ * 1) Lock sem->rtmutex ++ * 2) Remove the reader BIAS to force readers into the slow path ++ * 3) Wait until all readers have left the critical region ++ * 4) Mark it write locked ++ * ++ * up_write() ++ * 1) Remove the write locked marker ++ * 2) Set the reader BIAS so readers can use the fast path again ++ * 3) Unlock sem->rtmutex to release blocked readers ++ * ++ * down_read() ++ * 1) Try fast path acquisition (reader BIAS is set) ++ * 2) Take sem->rtmutex.wait_lock which protects the writelocked flag ++ * 3) If !writelocked, acquire it for read ++ * 4) If writelocked, block on sem->rtmutex ++ * 5) unlock sem->rtmutex, goto 1) ++ * ++ * up_read() ++ * 1) Try fast path release (reader count != 1) ++ * 2) Wake the writer waiting in down_write()#3 ++ * ++ * down_read()#3 has the consequence, that rw semaphores on RT are not writer ++ * fair, but writers, which should be avoided in RT tasks (think mmap_sem), ++ * are subject to the rtmutex priority/DL inheritance mechanism. ++ * ++ * It's possible to make the rw semaphores writer fair by keeping a list of ++ * active readers. A blocked writer would force all newly incoming readers to ++ * block on the rtmutex, but the rtmutex would have to be proxy locked for one ++ * reader after the other. We can't use multi-reader inheritance because there ++ * is no way to support that with SCHED_DEADLINE. Implementing the one by one ++ * reader boosting/handover mechanism is a major surgery for a very dubious ++ * value. ++ * ++ * The risk of writer starvation is there, but the pathological use cases ++ * which trigger it are not necessarily the typical RT workloads. ++ */ ++ ++void __rwsem_init(struct rw_semaphore *sem, const char *name, ++ struct lock_class_key *key) ++{ ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++ /* ++ * Make sure we are not reinitializing a held semaphore: ++ */ ++ debug_check_no_locks_freed((void *)sem, sizeof(*sem)); ++ lockdep_init_map(&sem->dep_map, name, key, 0); ++#endif ++ atomic_set(&sem->readers, READER_BIAS); ++} ++EXPORT_SYMBOL(__rwsem_init); ++ ++int __down_read_trylock(struct rw_semaphore *sem) ++{ ++ int r, old; ++ ++ /* ++ * Increment reader count, if sem->readers < 0, i.e. READER_BIAS is ++ * set. ++ */ ++ for (r = atomic_read(&sem->readers); r < 0;) { ++ old = atomic_cmpxchg(&sem->readers, r, r + 1); ++ if (likely(old == r)) ++ return 1; ++ r = old; ++ } ++ return 0; ++} ++ ++static int __sched __down_read_common(struct rw_semaphore *sem, int state) ++{ ++ struct rt_mutex *m = &sem->rtmutex; ++ struct rt_mutex_waiter waiter; ++ int ret; ++ ++ if (__down_read_trylock(sem)) ++ return 0; ++ ++ /* ++ * Flush blk before ->pi_blocked_on is set. At schedule() time it is too ++ * late if one of the callbacks needs to acquire a sleeping lock. ++ */ ++ if (blk_needs_flush_plug(current)) ++ blk_schedule_flush_plug(current); ++ ++ might_sleep(); ++ raw_spin_lock_irq(&m->wait_lock); ++ /* ++ * Allow readers as long as the writer has not completely ++ * acquired the semaphore for write. ++ */ ++ if (atomic_read(&sem->readers) != WRITER_BIAS) { ++ atomic_inc(&sem->readers); ++ raw_spin_unlock_irq(&m->wait_lock); ++ return 0; ++ } ++ ++ /* ++ * Call into the slow lock path with the rtmutex->wait_lock ++ * held, so this can't result in the following race: ++ * ++ * Reader1 Reader2 Writer ++ * down_read() ++ * down_write() ++ * rtmutex_lock(m) ++ * swait() ++ * down_read() ++ * unlock(m->wait_lock) ++ * up_read() ++ * swake() ++ * lock(m->wait_lock) ++ * sem->writelocked=true ++ * unlock(m->wait_lock) ++ * ++ * up_write() ++ * sem->writelocked=false ++ * rtmutex_unlock(m) ++ * down_read() ++ * down_write() ++ * rtmutex_lock(m) ++ * swait() ++ * rtmutex_lock(m) ++ * ++ * That would put Reader1 behind the writer waiting on ++ * Reader2 to call up_read() which might be unbound. ++ */ ++ rt_mutex_init_waiter(&waiter, false); ++ ret = rt_mutex_slowlock_locked(m, state, NULL, RT_MUTEX_MIN_CHAINWALK, ++ &waiter); ++ /* ++ * The slowlock() above is guaranteed to return with the rtmutex (for ++ * ret = 0) is now held, so there can't be a writer active. Increment ++ * the reader count and immediately drop the rtmutex again. ++ * For ret != 0 we don't hold the rtmutex and need unlock the wait_lock. ++ * We don't own the lock then. ++ */ ++ if (!ret) ++ atomic_inc(&sem->readers); ++ raw_spin_unlock_irq(&m->wait_lock); ++ if (!ret) ++ __rt_mutex_unlock(m); ++ ++ debug_rt_mutex_free_waiter(&waiter); ++ return ret; ++} ++ ++void __down_read(struct rw_semaphore *sem) ++{ ++ int ret; ++ ++ ret = __down_read_common(sem, TASK_UNINTERRUPTIBLE); ++ WARN_ON_ONCE(ret); ++} ++ ++int __down_read_interruptible(struct rw_semaphore *sem) ++{ ++ int ret; ++ ++ ret = __down_read_common(sem, TASK_INTERRUPTIBLE); ++ if (likely(!ret)) ++ return ret; ++ WARN_ONCE(ret != -EINTR, "Unexpected state: %d\n", ret); ++ return -EINTR; ++} ++ ++int __down_read_killable(struct rw_semaphore *sem) ++{ ++ int ret; ++ ++ ret = __down_read_common(sem, TASK_KILLABLE); ++ if (likely(!ret)) ++ return ret; ++ WARN_ONCE(ret != -EINTR, "Unexpected state: %d\n", ret); ++ return -EINTR; ++} ++ ++void __up_read(struct rw_semaphore *sem) ++{ ++ struct rt_mutex *m = &sem->rtmutex; ++ struct task_struct *tsk; ++ ++ /* ++ * sem->readers can only hit 0 when a writer is waiting for the ++ * active readers to leave the critical region. ++ */ ++ if (!atomic_dec_and_test(&sem->readers)) ++ return; ++ ++ might_sleep(); ++ raw_spin_lock_irq(&m->wait_lock); ++ /* ++ * Wake the writer, i.e. the rtmutex owner. It might release the ++ * rtmutex concurrently in the fast path (due to a signal), but to ++ * clean up the rwsem it needs to acquire m->wait_lock. The worst ++ * case which can happen is a spurious wakeup. ++ */ ++ tsk = rt_mutex_owner(m); ++ if (tsk) ++ wake_up_process(tsk); ++ ++ raw_spin_unlock_irq(&m->wait_lock); ++} ++ ++static void __up_write_unlock(struct rw_semaphore *sem, int bias, ++ unsigned long flags) ++{ ++ struct rt_mutex *m = &sem->rtmutex; ++ ++ atomic_add(READER_BIAS - bias, &sem->readers); ++ raw_spin_unlock_irqrestore(&m->wait_lock, flags); ++ __rt_mutex_unlock(m); ++} ++ ++static int __sched __down_write_common(struct rw_semaphore *sem, int state) ++{ ++ struct rt_mutex *m = &sem->rtmutex; ++ unsigned long flags; ++ ++ /* ++ * Flush blk before ->pi_blocked_on is set. At schedule() time it is too ++ * late if one of the callbacks needs to acquire a sleeping lock. ++ */ ++ if (blk_needs_flush_plug(current)) ++ blk_schedule_flush_plug(current); ++ ++ /* Take the rtmutex as a first step */ ++ if (__rt_mutex_lock_state(m, state)) ++ return -EINTR; ++ ++ /* Force readers into slow path */ ++ atomic_sub(READER_BIAS, &sem->readers); ++ might_sleep(); ++ ++ set_current_state(state); ++ for (;;) { ++ raw_spin_lock_irqsave(&m->wait_lock, flags); ++ /* Have all readers left the critical region? */ ++ if (!atomic_read(&sem->readers)) { ++ atomic_set(&sem->readers, WRITER_BIAS); ++ __set_current_state(TASK_RUNNING); ++ raw_spin_unlock_irqrestore(&m->wait_lock, flags); ++ return 0; ++ } ++ ++ if (signal_pending_state(state, current)) { ++ __set_current_state(TASK_RUNNING); ++ __up_write_unlock(sem, 0, flags); ++ return -EINTR; ++ } ++ raw_spin_unlock_irqrestore(&m->wait_lock, flags); ++ ++ if (atomic_read(&sem->readers) != 0) { ++ schedule(); ++ set_current_state(state); ++ } ++ } ++} ++ ++void __sched __down_write(struct rw_semaphore *sem) ++{ ++ __down_write_common(sem, TASK_UNINTERRUPTIBLE); ++} ++ ++int __sched __down_write_killable(struct rw_semaphore *sem) ++{ ++ return __down_write_common(sem, TASK_KILLABLE); ++} ++ ++int __down_write_trylock(struct rw_semaphore *sem) ++{ ++ struct rt_mutex *m = &sem->rtmutex; ++ unsigned long flags; ++ ++ if (!__rt_mutex_trylock(m)) ++ return 0; ++ ++ atomic_sub(READER_BIAS, &sem->readers); ++ ++ raw_spin_lock_irqsave(&m->wait_lock, flags); ++ if (!atomic_read(&sem->readers)) { ++ atomic_set(&sem->readers, WRITER_BIAS); ++ raw_spin_unlock_irqrestore(&m->wait_lock, flags); ++ return 1; ++ } ++ __up_write_unlock(sem, 0, flags); ++ return 0; ++} ++ ++void __up_write(struct rw_semaphore *sem) ++{ ++ struct rt_mutex *m = &sem->rtmutex; ++ unsigned long flags; ++ ++ raw_spin_lock_irqsave(&m->wait_lock, flags); ++ __up_write_unlock(sem, WRITER_BIAS, flags); ++} ++ ++void __downgrade_write(struct rw_semaphore *sem) ++{ ++ struct rt_mutex *m = &sem->rtmutex; ++ unsigned long flags; ++ ++ raw_spin_lock_irqsave(&m->wait_lock, flags); ++ /* Release it and account current as reader */ ++ __up_write_unlock(sem, WRITER_BIAS - 1, flags); ++} +-- +2.43.0 + diff --git a/debian/patches-rt/0174-locking-rtmutex-add-rwlock-implementation-based-on-r.patch b/debian/patches-rt/0174-locking-rtmutex-add-rwlock-implementation-based-on-r.patch new file mode 100644 index 000000000..481610364 --- /dev/null +++ b/debian/patches-rt/0174-locking-rtmutex-add-rwlock-implementation-based-on-r.patch @@ -0,0 +1,548 @@ +From 85ee193953134a79ee60def58e6d8c98b8cd8e55 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Thu, 12 Oct 2017 17:18:06 +0200 +Subject: [PATCH 174/323] locking/rtmutex: add rwlock implementation based on + rtmutex +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +The implementation is bias-based, similar to the rwsem implementation. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/linux/rwlock_rt.h | 109 +++++++++++ + include/linux/rwlock_types_rt.h | 56 ++++++ + kernel/Kconfig.locks | 2 +- + kernel/locking/rwlock-rt.c | 328 ++++++++++++++++++++++++++++++++ + 4 files changed, 494 insertions(+), 1 deletion(-) + create mode 100644 include/linux/rwlock_rt.h + create mode 100644 include/linux/rwlock_types_rt.h + create mode 100644 kernel/locking/rwlock-rt.c + +diff --git a/include/linux/rwlock_rt.h b/include/linux/rwlock_rt.h +new file mode 100644 +index 000000000000..aafdb0a685d5 +--- /dev/null ++++ b/include/linux/rwlock_rt.h +@@ -0,0 +1,109 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++#ifndef __LINUX_RWLOCK_RT_H ++#define __LINUX_RWLOCK_RT_H ++ ++#ifndef __LINUX_SPINLOCK_H ++#error Do not include directly. Use spinlock.h ++#endif ++ ++extern void __lockfunc rt_write_lock(rwlock_t *rwlock); ++extern void __lockfunc rt_read_lock(rwlock_t *rwlock); ++extern int __lockfunc rt_write_trylock(rwlock_t *rwlock); ++extern int __lockfunc rt_read_trylock(rwlock_t *rwlock); ++extern void __lockfunc rt_write_unlock(rwlock_t *rwlock); ++extern void __lockfunc rt_read_unlock(rwlock_t *rwlock); ++extern int __lockfunc rt_read_can_lock(rwlock_t *rwlock); ++extern int __lockfunc rt_write_can_lock(rwlock_t *rwlock); ++extern void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key); ++ ++#define read_can_lock(rwlock) rt_read_can_lock(rwlock) ++#define write_can_lock(rwlock) rt_write_can_lock(rwlock) ++ ++#define read_trylock(lock) __cond_lock(lock, rt_read_trylock(lock)) ++#define write_trylock(lock) __cond_lock(lock, rt_write_trylock(lock)) ++ ++static inline int __write_trylock_rt_irqsave(rwlock_t *lock, unsigned long *flags) ++{ ++ *flags = 0; ++ return rt_write_trylock(lock); ++} ++ ++#define write_trylock_irqsave(lock, flags) \ ++ __cond_lock(lock, __write_trylock_rt_irqsave(lock, &(flags))) ++ ++#define read_lock_irqsave(lock, flags) \ ++ do { \ ++ typecheck(unsigned long, flags); \ ++ rt_read_lock(lock); \ ++ flags = 0; \ ++ } while (0) ++ ++#define write_lock_irqsave(lock, flags) \ ++ do { \ ++ typecheck(unsigned long, flags); \ ++ rt_write_lock(lock); \ ++ flags = 0; \ ++ } while (0) ++ ++#define read_lock(lock) rt_read_lock(lock) ++ ++#define read_lock_bh(lock) \ ++ do { \ ++ local_bh_disable(); \ ++ rt_read_lock(lock); \ ++ } while (0) ++ ++#define read_lock_irq(lock) read_lock(lock) ++ ++#define write_lock(lock) rt_write_lock(lock) ++ ++#define write_lock_bh(lock) \ ++ do { \ ++ local_bh_disable(); \ ++ rt_write_lock(lock); \ ++ } while (0) ++ ++#define write_lock_irq(lock) write_lock(lock) ++ ++#define read_unlock(lock) rt_read_unlock(lock) ++ ++#define read_unlock_bh(lock) \ ++ do { \ ++ rt_read_unlock(lock); \ ++ local_bh_enable(); \ ++ } while (0) ++ ++#define read_unlock_irq(lock) read_unlock(lock) ++ ++#define write_unlock(lock) rt_write_unlock(lock) ++ ++#define write_unlock_bh(lock) \ ++ do { \ ++ rt_write_unlock(lock); \ ++ local_bh_enable(); \ ++ } while (0) ++ ++#define write_unlock_irq(lock) write_unlock(lock) ++ ++#define read_unlock_irqrestore(lock, flags) \ ++ do { \ ++ typecheck(unsigned long, flags); \ ++ (void) flags; \ ++ rt_read_unlock(lock); \ ++ } while (0) ++ ++#define write_unlock_irqrestore(lock, flags) \ ++ do { \ ++ typecheck(unsigned long, flags); \ ++ (void) flags; \ ++ rt_write_unlock(lock); \ ++ } while (0) ++ ++#define rwlock_init(rwl) \ ++do { \ ++ static struct lock_class_key __key; \ ++ \ ++ __rt_rwlock_init(rwl, #rwl, &__key); \ ++} while (0) ++ ++#endif +diff --git a/include/linux/rwlock_types_rt.h b/include/linux/rwlock_types_rt.h +new file mode 100644 +index 000000000000..4762391d659b +--- /dev/null ++++ b/include/linux/rwlock_types_rt.h +@@ -0,0 +1,56 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++#ifndef __LINUX_RWLOCK_TYPES_RT_H ++#define __LINUX_RWLOCK_TYPES_RT_H ++ ++#ifndef __LINUX_SPINLOCK_TYPES_H ++#error "Do not include directly. Include spinlock_types.h instead" ++#endif ++ ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++# define RW_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname } ++#else ++# define RW_DEP_MAP_INIT(lockname) ++#endif ++ ++typedef struct rt_rw_lock rwlock_t; ++ ++#define __RW_LOCK_UNLOCKED(name) __RWLOCK_RT_INITIALIZER(name) ++ ++#define DEFINE_RWLOCK(name) \ ++ rwlock_t name = __RW_LOCK_UNLOCKED(name) ++ ++/* ++ * A reader biased implementation primarily for CPU pinning. ++ * ++ * Can be selected as general replacement for the single reader RT rwlock ++ * variant ++ */ ++struct rt_rw_lock { ++ struct rt_mutex rtmutex; ++ atomic_t readers; ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++ struct lockdep_map dep_map; ++#endif ++}; ++ ++#define READER_BIAS (1U << 31) ++#define WRITER_BIAS (1U << 30) ++ ++#define __RWLOCK_RT_INITIALIZER(name) \ ++{ \ ++ .readers = ATOMIC_INIT(READER_BIAS), \ ++ .rtmutex = __RT_MUTEX_INITIALIZER_SAVE_STATE(name.rtmutex), \ ++ RW_DEP_MAP_INIT(name) \ ++} ++ ++void __rwlock_biased_rt_init(struct rt_rw_lock *lock, const char *name, ++ struct lock_class_key *key); ++ ++#define rwlock_biased_rt_init(rwlock) \ ++ do { \ ++ static struct lock_class_key __key; \ ++ \ ++ __rwlock_biased_rt_init((rwlock), #rwlock, &__key); \ ++ } while (0) ++ ++#endif +diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks +index 3de8fd11873b..4198f0273ecd 100644 +--- a/kernel/Kconfig.locks ++++ b/kernel/Kconfig.locks +@@ -251,7 +251,7 @@ config ARCH_USE_QUEUED_RWLOCKS + + config QUEUED_RWLOCKS + def_bool y if ARCH_USE_QUEUED_RWLOCKS +- depends on SMP ++ depends on SMP && !PREEMPT_RT + + config ARCH_HAS_MMIOWB + bool +diff --git a/kernel/locking/rwlock-rt.c b/kernel/locking/rwlock-rt.c +new file mode 100644 +index 000000000000..1ee16b8fedd7 +--- /dev/null ++++ b/kernel/locking/rwlock-rt.c +@@ -0,0 +1,328 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++#include <linux/sched/debug.h> ++#include <linux/export.h> ++ ++#include "rtmutex_common.h" ++#include <linux/rwlock_types_rt.h> ++ ++/* ++ * RT-specific reader/writer locks ++ * ++ * write_lock() ++ * 1) Lock lock->rtmutex ++ * 2) Remove the reader BIAS to force readers into the slow path ++ * 3) Wait until all readers have left the critical region ++ * 4) Mark it write locked ++ * ++ * write_unlock() ++ * 1) Remove the write locked marker ++ * 2) Set the reader BIAS so readers can use the fast path again ++ * 3) Unlock lock->rtmutex to release blocked readers ++ * ++ * read_lock() ++ * 1) Try fast path acquisition (reader BIAS is set) ++ * 2) Take lock->rtmutex.wait_lock which protects the writelocked flag ++ * 3) If !writelocked, acquire it for read ++ * 4) If writelocked, block on lock->rtmutex ++ * 5) unlock lock->rtmutex, goto 1) ++ * ++ * read_unlock() ++ * 1) Try fast path release (reader count != 1) ++ * 2) Wake the writer waiting in write_lock()#3 ++ * ++ * read_lock()#3 has the consequence, that rw locks on RT are not writer ++ * fair, but writers, which should be avoided in RT tasks (think tasklist ++ * lock), are subject to the rtmutex priority/DL inheritance mechanism. ++ * ++ * It's possible to make the rw locks writer fair by keeping a list of ++ * active readers. A blocked writer would force all newly incoming readers ++ * to block on the rtmutex, but the rtmutex would have to be proxy locked ++ * for one reader after the other. We can't use multi-reader inheritance ++ * because there is no way to support that with ++ * SCHED_DEADLINE. Implementing the one by one reader boosting/handover ++ * mechanism is a major surgery for a very dubious value. ++ * ++ * The risk of writer starvation is there, but the pathological use cases ++ * which trigger it are not necessarily the typical RT workloads. ++ */ ++ ++void __rwlock_biased_rt_init(struct rt_rw_lock *lock, const char *name, ++ struct lock_class_key *key) ++{ ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++ /* ++ * Make sure we are not reinitializing a held semaphore: ++ */ ++ debug_check_no_locks_freed((void *)lock, sizeof(*lock)); ++ lockdep_init_map(&lock->dep_map, name, key, 0); ++#endif ++ atomic_set(&lock->readers, READER_BIAS); ++ rt_mutex_init(&lock->rtmutex); ++ lock->rtmutex.save_state = 1; ++} ++ ++static int __read_rt_trylock(struct rt_rw_lock *lock) ++{ ++ int r, old; ++ ++ /* ++ * Increment reader count, if lock->readers < 0, i.e. READER_BIAS is ++ * set. ++ */ ++ for (r = atomic_read(&lock->readers); r < 0;) { ++ old = atomic_cmpxchg(&lock->readers, r, r + 1); ++ if (likely(old == r)) ++ return 1; ++ r = old; ++ } ++ return 0; ++} ++ ++static void __read_rt_lock(struct rt_rw_lock *lock) ++{ ++ struct rt_mutex *m = &lock->rtmutex; ++ struct rt_mutex_waiter waiter; ++ unsigned long flags; ++ ++ if (__read_rt_trylock(lock)) ++ return; ++ ++ raw_spin_lock_irqsave(&m->wait_lock, flags); ++ /* ++ * Allow readers as long as the writer has not completely ++ * acquired the semaphore for write. ++ */ ++ if (atomic_read(&lock->readers) != WRITER_BIAS) { ++ atomic_inc(&lock->readers); ++ raw_spin_unlock_irqrestore(&m->wait_lock, flags); ++ return; ++ } ++ ++ /* ++ * Call into the slow lock path with the rtmutex->wait_lock ++ * held, so this can't result in the following race: ++ * ++ * Reader1 Reader2 Writer ++ * read_lock() ++ * write_lock() ++ * rtmutex_lock(m) ++ * swait() ++ * read_lock() ++ * unlock(m->wait_lock) ++ * read_unlock() ++ * swake() ++ * lock(m->wait_lock) ++ * lock->writelocked=true ++ * unlock(m->wait_lock) ++ * ++ * write_unlock() ++ * lock->writelocked=false ++ * rtmutex_unlock(m) ++ * read_lock() ++ * write_lock() ++ * rtmutex_lock(m) ++ * swait() ++ * rtmutex_lock(m) ++ * ++ * That would put Reader1 behind the writer waiting on ++ * Reader2 to call read_unlock() which might be unbound. ++ */ ++ rt_mutex_init_waiter(&waiter, true); ++ rt_spin_lock_slowlock_locked(m, &waiter, flags); ++ /* ++ * The slowlock() above is guaranteed to return with the rtmutex is ++ * now held, so there can't be a writer active. Increment the reader ++ * count and immediately drop the rtmutex again. ++ */ ++ atomic_inc(&lock->readers); ++ raw_spin_unlock_irqrestore(&m->wait_lock, flags); ++ rt_spin_lock_slowunlock(m); ++ ++ debug_rt_mutex_free_waiter(&waiter); ++} ++ ++static void __read_rt_unlock(struct rt_rw_lock *lock) ++{ ++ struct rt_mutex *m = &lock->rtmutex; ++ struct task_struct *tsk; ++ ++ /* ++ * sem->readers can only hit 0 when a writer is waiting for the ++ * active readers to leave the critical region. ++ */ ++ if (!atomic_dec_and_test(&lock->readers)) ++ return; ++ ++ raw_spin_lock_irq(&m->wait_lock); ++ /* ++ * Wake the writer, i.e. the rtmutex owner. It might release the ++ * rtmutex concurrently in the fast path, but to clean up the rw ++ * lock it needs to acquire m->wait_lock. The worst case which can ++ * happen is a spurious wakeup. ++ */ ++ tsk = rt_mutex_owner(m); ++ if (tsk) ++ wake_up_process(tsk); ++ ++ raw_spin_unlock_irq(&m->wait_lock); ++} ++ ++static void __write_unlock_common(struct rt_rw_lock *lock, int bias, ++ unsigned long flags) ++{ ++ struct rt_mutex *m = &lock->rtmutex; ++ ++ atomic_add(READER_BIAS - bias, &lock->readers); ++ raw_spin_unlock_irqrestore(&m->wait_lock, flags); ++ rt_spin_lock_slowunlock(m); ++} ++ ++static void __write_rt_lock(struct rt_rw_lock *lock) ++{ ++ struct rt_mutex *m = &lock->rtmutex; ++ struct task_struct *self = current; ++ unsigned long flags; ++ ++ /* Take the rtmutex as a first step */ ++ __rt_spin_lock(m); ++ ++ /* Force readers into slow path */ ++ atomic_sub(READER_BIAS, &lock->readers); ++ ++ raw_spin_lock_irqsave(&m->wait_lock, flags); ++ ++ raw_spin_lock(&self->pi_lock); ++ self->saved_state = self->state; ++ __set_current_state_no_track(TASK_UNINTERRUPTIBLE); ++ raw_spin_unlock(&self->pi_lock); ++ ++ for (;;) { ++ /* Have all readers left the critical region? */ ++ if (!atomic_read(&lock->readers)) { ++ atomic_set(&lock->readers, WRITER_BIAS); ++ raw_spin_lock(&self->pi_lock); ++ __set_current_state_no_track(self->saved_state); ++ self->saved_state = TASK_RUNNING; ++ raw_spin_unlock(&self->pi_lock); ++ raw_spin_unlock_irqrestore(&m->wait_lock, flags); ++ return; ++ } ++ ++ raw_spin_unlock_irqrestore(&m->wait_lock, flags); ++ ++ if (atomic_read(&lock->readers) != 0) ++ schedule(); ++ ++ raw_spin_lock_irqsave(&m->wait_lock, flags); ++ ++ raw_spin_lock(&self->pi_lock); ++ __set_current_state_no_track(TASK_UNINTERRUPTIBLE); ++ raw_spin_unlock(&self->pi_lock); ++ } ++} ++ ++static int __write_rt_trylock(struct rt_rw_lock *lock) ++{ ++ struct rt_mutex *m = &lock->rtmutex; ++ unsigned long flags; ++ ++ if (!__rt_mutex_trylock(m)) ++ return 0; ++ ++ atomic_sub(READER_BIAS, &lock->readers); ++ ++ raw_spin_lock_irqsave(&m->wait_lock, flags); ++ if (!atomic_read(&lock->readers)) { ++ atomic_set(&lock->readers, WRITER_BIAS); ++ raw_spin_unlock_irqrestore(&m->wait_lock, flags); ++ return 1; ++ } ++ __write_unlock_common(lock, 0, flags); ++ return 0; ++} ++ ++static void __write_rt_unlock(struct rt_rw_lock *lock) ++{ ++ struct rt_mutex *m = &lock->rtmutex; ++ unsigned long flags; ++ ++ raw_spin_lock_irqsave(&m->wait_lock, flags); ++ __write_unlock_common(lock, WRITER_BIAS, flags); ++} ++ ++int __lockfunc rt_read_can_lock(rwlock_t *rwlock) ++{ ++ return atomic_read(&rwlock->readers) < 0; ++} ++ ++int __lockfunc rt_write_can_lock(rwlock_t *rwlock) ++{ ++ return atomic_read(&rwlock->readers) == READER_BIAS; ++} ++ ++/* ++ * The common functions which get wrapped into the rwlock API. ++ */ ++int __lockfunc rt_read_trylock(rwlock_t *rwlock) ++{ ++ int ret; ++ ++ ret = __read_rt_trylock(rwlock); ++ if (ret) { ++ rwlock_acquire_read(&rwlock->dep_map, 0, 1, _RET_IP_); ++ migrate_disable(); ++ } ++ return ret; ++} ++EXPORT_SYMBOL(rt_read_trylock); ++ ++int __lockfunc rt_write_trylock(rwlock_t *rwlock) ++{ ++ int ret; ++ ++ ret = __write_rt_trylock(rwlock); ++ if (ret) { ++ rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_); ++ migrate_disable(); ++ } ++ return ret; ++} ++EXPORT_SYMBOL(rt_write_trylock); ++ ++void __lockfunc rt_read_lock(rwlock_t *rwlock) ++{ ++ rwlock_acquire_read(&rwlock->dep_map, 0, 0, _RET_IP_); ++ __read_rt_lock(rwlock); ++ migrate_disable(); ++} ++EXPORT_SYMBOL(rt_read_lock); ++ ++void __lockfunc rt_write_lock(rwlock_t *rwlock) ++{ ++ rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_); ++ __write_rt_lock(rwlock); ++ migrate_disable(); ++} ++EXPORT_SYMBOL(rt_write_lock); ++ ++void __lockfunc rt_read_unlock(rwlock_t *rwlock) ++{ ++ rwlock_release(&rwlock->dep_map, _RET_IP_); ++ migrate_enable(); ++ __read_rt_unlock(rwlock); ++} ++EXPORT_SYMBOL(rt_read_unlock); ++ ++void __lockfunc rt_write_unlock(rwlock_t *rwlock) ++{ ++ rwlock_release(&rwlock->dep_map, _RET_IP_); ++ migrate_enable(); ++ __write_rt_unlock(rwlock); ++} ++EXPORT_SYMBOL(rt_write_unlock); ++ ++void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key) ++{ ++ __rwlock_biased_rt_init(rwlock, name, key); ++} ++EXPORT_SYMBOL(__rt_rwlock_init); +-- +2.43.0 + diff --git a/debian/patches-rt/0175-locking-rtmutex-wire-up-RT-s-locking.patch b/debian/patches-rt/0175-locking-rtmutex-wire-up-RT-s-locking.patch new file mode 100644 index 000000000..4a55d2f15 --- /dev/null +++ b/debian/patches-rt/0175-locking-rtmutex-wire-up-RT-s-locking.patch @@ -0,0 +1,348 @@ +From 48720bc07f81cfe49fe8e2a7afa1f18acd9227dd Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Thu, 12 Oct 2017 17:31:14 +0200 +Subject: [PATCH 175/323] locking/rtmutex: wire up RT's locking +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/linux/mutex.h | 26 ++++++++++++++++---------- + include/linux/rwsem.h | 12 ++++++++++++ + include/linux/spinlock.h | 12 +++++++++++- + include/linux/spinlock_api_smp.h | 4 +++- + include/linux/spinlock_types.h | 11 ++++++++--- + include/linux/spinlock_types_up.h | 2 +- + kernel/Kconfig.preempt | 1 + + kernel/locking/Makefile | 10 +++++++--- + kernel/locking/rwsem.c | 6 ++++++ + kernel/locking/spinlock.c | 7 +++++++ + kernel/locking/spinlock_debug.c | 5 +++++ + 11 files changed, 77 insertions(+), 19 deletions(-) + +diff --git a/include/linux/mutex.h b/include/linux/mutex.h +index 4d671fba3cab..e45774a337d2 100644 +--- a/include/linux/mutex.h ++++ b/include/linux/mutex.h +@@ -22,6 +22,20 @@ + + struct ww_acquire_ctx; + ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \ ++ , .dep_map = { \ ++ .name = #lockname, \ ++ .wait_type_inner = LD_WAIT_SLEEP, \ ++ } ++#else ++# define __DEP_MAP_MUTEX_INITIALIZER(lockname) ++#endif ++ ++#ifdef CONFIG_PREEMPT_RT ++# include <linux/mutex_rt.h> ++#else ++ + /* + * Simple, straightforward mutexes with strict semantics: + * +@@ -119,16 +133,6 @@ do { \ + __mutex_init((mutex), #mutex, &__key); \ + } while (0) + +-#ifdef CONFIG_DEBUG_LOCK_ALLOC +-# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \ +- , .dep_map = { \ +- .name = #lockname, \ +- .wait_type_inner = LD_WAIT_SLEEP, \ +- } +-#else +-# define __DEP_MAP_MUTEX_INITIALIZER(lockname) +-#endif +- + #define __MUTEX_INITIALIZER(lockname) \ + { .owner = ATOMIC_LONG_INIT(0) \ + , .wait_lock = __SPIN_LOCK_UNLOCKED(lockname.wait_lock) \ +@@ -224,4 +228,6 @@ enum mutex_trylock_recursive_enum { + extern /* __deprecated */ __must_check enum mutex_trylock_recursive_enum + mutex_trylock_recursive(struct mutex *lock); + ++#endif /* !PREEMPT_RT */ ++ + #endif /* __LINUX_MUTEX_H */ +diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h +index 4c715be48717..9323af8a9244 100644 +--- a/include/linux/rwsem.h ++++ b/include/linux/rwsem.h +@@ -16,6 +16,11 @@ + #include <linux/spinlock.h> + #include <linux/atomic.h> + #include <linux/err.h> ++ ++#ifdef CONFIG_PREEMPT_RT ++#include <linux/rwsem-rt.h> ++#else /* PREEMPT_RT */ ++ + #ifdef CONFIG_RWSEM_SPIN_ON_OWNER + #include <linux/osq_lock.h> + #endif +@@ -119,6 +124,13 @@ static inline int rwsem_is_contended(struct rw_semaphore *sem) + return !list_empty(&sem->wait_list); + } + ++#endif /* !PREEMPT_RT */ ++ ++/* ++ * The functions below are the same for all rwsem implementations including ++ * the RT specific variant. ++ */ ++ + /* + * lock for reading + */ +diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h +index 79897841a2cc..c3c70291b46c 100644 +--- a/include/linux/spinlock.h ++++ b/include/linux/spinlock.h +@@ -309,7 +309,11 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock) + }) + + /* Include rwlock functions */ +-#include <linux/rwlock.h> ++#ifdef CONFIG_PREEMPT_RT ++# include <linux/rwlock_rt.h> ++#else ++# include <linux/rwlock.h> ++#endif + + /* + * Pull the _spin_*()/_read_*()/_write_*() functions/declarations: +@@ -320,6 +324,10 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock) + # include <linux/spinlock_api_up.h> + #endif + ++#ifdef CONFIG_PREEMPT_RT ++# include <linux/spinlock_rt.h> ++#else /* PREEMPT_RT */ ++ + /* + * Map the spin_lock functions to the raw variants for PREEMPT_RT=n + */ +@@ -454,6 +462,8 @@ static __always_inline int spin_is_contended(spinlock_t *lock) + + #define assert_spin_locked(lock) assert_raw_spin_locked(&(lock)->rlock) + ++#endif /* !PREEMPT_RT */ ++ + /* + * Pull the atomic_t declaration: + * (asm-mips/atomic.h needs above definitions) +diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h +index 19a9be9d97ee..da38149f2843 100644 +--- a/include/linux/spinlock_api_smp.h ++++ b/include/linux/spinlock_api_smp.h +@@ -187,6 +187,8 @@ static inline int __raw_spin_trylock_bh(raw_spinlock_t *lock) + return 0; + } + +-#include <linux/rwlock_api_smp.h> ++#ifndef CONFIG_PREEMPT_RT ++# include <linux/rwlock_api_smp.h> ++#endif + + #endif /* __LINUX_SPINLOCK_API_SMP_H */ +diff --git a/include/linux/spinlock_types.h b/include/linux/spinlock_types.h +index 5c8664d57fb8..8d896d3e1a01 100644 +--- a/include/linux/spinlock_types.h ++++ b/include/linux/spinlock_types.h +@@ -11,8 +11,13 @@ + + #include <linux/spinlock_types_raw.h> + +-#include <linux/spinlock_types_nort.h> +- +-#include <linux/rwlock_types.h> ++#ifndef CONFIG_PREEMPT_RT ++# include <linux/spinlock_types_nort.h> ++# include <linux/rwlock_types.h> ++#else ++# include <linux/rtmutex.h> ++# include <linux/spinlock_types_rt.h> ++# include <linux/rwlock_types_rt.h> ++#endif + + #endif /* __LINUX_SPINLOCK_TYPES_H */ +diff --git a/include/linux/spinlock_types_up.h b/include/linux/spinlock_types_up.h +index c09b6407ae1b..d9b371fa13e0 100644 +--- a/include/linux/spinlock_types_up.h ++++ b/include/linux/spinlock_types_up.h +@@ -1,7 +1,7 @@ + #ifndef __LINUX_SPINLOCK_TYPES_UP_H + #define __LINUX_SPINLOCK_TYPES_UP_H + +-#ifndef __LINUX_SPINLOCK_TYPES_H ++#if !defined(__LINUX_SPINLOCK_TYPES_H) && !defined(__LINUX_RT_MUTEX_H) + # error "please don't include this file directly" + #endif + +diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt +index bf82259cff96..cbe3aa495519 100644 +--- a/kernel/Kconfig.preempt ++++ b/kernel/Kconfig.preempt +@@ -59,6 +59,7 @@ config PREEMPT_RT + bool "Fully Preemptible Kernel (Real-Time)" + depends on EXPERT && ARCH_SUPPORTS_RT + select PREEMPTION ++ select RT_MUTEXES + help + This option turns the kernel into a real-time kernel by replacing + various locking primitives (spinlocks, rwlocks, etc.) with +diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile +index 6d11cfb9b41f..c7fbf737e16e 100644 +--- a/kernel/locking/Makefile ++++ b/kernel/locking/Makefile +@@ -3,7 +3,7 @@ + # and is generally not a function of system call inputs. + KCOV_INSTRUMENT := n + +-obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o ++obj-y += semaphore.o rwsem.o percpu-rwsem.o + + # Avoid recursion lockdep -> KCSAN -> ... -> lockdep. + KCSAN_SANITIZE_lockdep.o := n +@@ -15,19 +15,23 @@ CFLAGS_REMOVE_mutex-debug.o = $(CC_FLAGS_FTRACE) + CFLAGS_REMOVE_rtmutex-debug.o = $(CC_FLAGS_FTRACE) + endif + +-obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o + obj-$(CONFIG_LOCKDEP) += lockdep.o + ifeq ($(CONFIG_PROC_FS),y) + obj-$(CONFIG_LOCKDEP) += lockdep_proc.o + endif + obj-$(CONFIG_SMP) += spinlock.o +-obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o + obj-$(CONFIG_PROVE_LOCKING) += spinlock.o + obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o + obj-$(CONFIG_RT_MUTEXES) += rtmutex.o + obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o + obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o + obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o ++ifneq ($(CONFIG_PREEMPT_RT),y) ++obj-y += mutex.o ++obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o ++obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o ++endif ++obj-$(CONFIG_PREEMPT_RT) += mutex-rt.o rwsem-rt.o rwlock-rt.o + obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o + obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o + obj-$(CONFIG_WW_MUTEX_SELFTEST) += test-ww_mutex.o +diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c +index cc5cc889b5b7..f7c909ef1261 100644 +--- a/kernel/locking/rwsem.c ++++ b/kernel/locking/rwsem.c +@@ -28,6 +28,7 @@ + #include <linux/rwsem.h> + #include <linux/atomic.h> + ++#ifndef CONFIG_PREEMPT_RT + #include "lock_events.h" + + /* +@@ -1494,6 +1495,7 @@ static inline void __downgrade_write(struct rw_semaphore *sem) + if (tmp & RWSEM_FLAG_WAITERS) + rwsem_downgrade_wake(sem); + } ++#endif + + /* + * lock for reading +@@ -1657,7 +1659,9 @@ void down_read_non_owner(struct rw_semaphore *sem) + { + might_sleep(); + __down_read(sem); ++#ifndef CONFIG_PREEMPT_RT + __rwsem_set_reader_owned(sem, NULL); ++#endif + } + EXPORT_SYMBOL(down_read_non_owner); + +@@ -1686,7 +1690,9 @@ EXPORT_SYMBOL(down_write_killable_nested); + + void up_read_non_owner(struct rw_semaphore *sem) + { ++#ifndef CONFIG_PREEMPT_RT + DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); ++#endif + __up_read(sem); + } + EXPORT_SYMBOL(up_read_non_owner); +diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c +index 0ff08380f531..45445a2f1799 100644 +--- a/kernel/locking/spinlock.c ++++ b/kernel/locking/spinlock.c +@@ -124,8 +124,11 @@ void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock) \ + * __[spin|read|write]_lock_bh() + */ + BUILD_LOCK_OPS(spin, raw_spinlock); ++ ++#ifndef CONFIG_PREEMPT_RT + BUILD_LOCK_OPS(read, rwlock); + BUILD_LOCK_OPS(write, rwlock); ++#endif + + #endif + +@@ -209,6 +212,8 @@ void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock) + EXPORT_SYMBOL(_raw_spin_unlock_bh); + #endif + ++#ifndef CONFIG_PREEMPT_RT ++ + #ifndef CONFIG_INLINE_READ_TRYLOCK + int __lockfunc _raw_read_trylock(rwlock_t *lock) + { +@@ -353,6 +358,8 @@ void __lockfunc _raw_write_unlock_bh(rwlock_t *lock) + EXPORT_SYMBOL(_raw_write_unlock_bh); + #endif + ++#endif /* !PREEMPT_RT */ ++ + #ifdef CONFIG_DEBUG_LOCK_ALLOC + + void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass) +diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c +index b9d93087ee66..72e306e0e8a3 100644 +--- a/kernel/locking/spinlock_debug.c ++++ b/kernel/locking/spinlock_debug.c +@@ -31,6 +31,7 @@ void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name, + + EXPORT_SYMBOL(__raw_spin_lock_init); + ++#ifndef CONFIG_PREEMPT_RT + void __rwlock_init(rwlock_t *lock, const char *name, + struct lock_class_key *key) + { +@@ -48,6 +49,7 @@ void __rwlock_init(rwlock_t *lock, const char *name, + } + + EXPORT_SYMBOL(__rwlock_init); ++#endif + + static void spin_dump(raw_spinlock_t *lock, const char *msg) + { +@@ -139,6 +141,7 @@ void do_raw_spin_unlock(raw_spinlock_t *lock) + arch_spin_unlock(&lock->raw_lock); + } + ++#ifndef CONFIG_PREEMPT_RT + static void rwlock_bug(rwlock_t *lock, const char *msg) + { + if (!debug_locks_off()) +@@ -228,3 +231,5 @@ void do_raw_write_unlock(rwlock_t *lock) + debug_write_unlock(lock); + arch_write_unlock(&lock->raw_lock); + } ++ ++#endif +-- +2.43.0 + diff --git a/debian/patches-rt/0176-locking-rtmutex-add-ww_mutex-addon-for-mutex-rt.patch b/debian/patches-rt/0176-locking-rtmutex-add-ww_mutex-addon-for-mutex-rt.patch new file mode 100644 index 000000000..4199d82f9 --- /dev/null +++ b/debian/patches-rt/0176-locking-rtmutex-add-ww_mutex-addon-for-mutex-rt.patch @@ -0,0 +1,456 @@ +From 47ba586f2ade0d9f8cc9281cb08529146bfc3c5d Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Thu, 12 Oct 2017 17:34:38 +0200 +Subject: [PATCH 176/323] locking/rtmutex: add ww_mutex addon for mutex-rt +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/linux/mutex.h | 8 - + include/linux/ww_mutex.h | 8 + + kernel/locking/rtmutex.c | 262 ++++++++++++++++++++++++++++++-- + kernel/locking/rtmutex_common.h | 2 + + kernel/locking/rwsem-rt.c | 2 +- + 5 files changed, 262 insertions(+), 20 deletions(-) + +diff --git a/include/linux/mutex.h b/include/linux/mutex.h +index e45774a337d2..90923d3008fc 100644 +--- a/include/linux/mutex.h ++++ b/include/linux/mutex.h +@@ -82,14 +82,6 @@ struct mutex { + struct ww_class; + struct ww_acquire_ctx; + +-struct ww_mutex { +- struct mutex base; +- struct ww_acquire_ctx *ctx; +-#ifdef CONFIG_DEBUG_MUTEXES +- struct ww_class *ww_class; +-#endif +-}; +- + /* + * This is the control structure for tasks blocked on mutex, + * which resides on the blocked task's kernel stack: +diff --git a/include/linux/ww_mutex.h b/include/linux/ww_mutex.h +index 6ecf2a0220db..3145de598645 100644 +--- a/include/linux/ww_mutex.h ++++ b/include/linux/ww_mutex.h +@@ -28,6 +28,14 @@ struct ww_class { + unsigned int is_wait_die; + }; + ++struct ww_mutex { ++ struct mutex base; ++ struct ww_acquire_ctx *ctx; ++#ifdef CONFIG_DEBUG_MUTEXES ++ struct ww_class *ww_class; ++#endif ++}; ++ + struct ww_acquire_ctx { + struct task_struct *task; + unsigned long stamp; +diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c +index 1bf7a04688c5..c095d1b92f70 100644 +--- a/kernel/locking/rtmutex.c ++++ b/kernel/locking/rtmutex.c +@@ -24,6 +24,7 @@ + #include <linux/sched/wake_q.h> + #include <linux/sched/debug.h> + #include <linux/timer.h> ++#include <linux/ww_mutex.h> + + #include "rtmutex_common.h" + +@@ -1234,6 +1235,40 @@ EXPORT_SYMBOL(__rt_spin_lock_init); + + #endif /* PREEMPT_RT */ + ++#ifdef CONFIG_PREEMPT_RT ++ static inline int __sched ++__mutex_lock_check_stamp(struct rt_mutex *lock, struct ww_acquire_ctx *ctx) ++{ ++ struct ww_mutex *ww = container_of(lock, struct ww_mutex, base.lock); ++ struct ww_acquire_ctx *hold_ctx = READ_ONCE(ww->ctx); ++ ++ if (!hold_ctx) ++ return 0; ++ ++ if (unlikely(ctx == hold_ctx)) ++ return -EALREADY; ++ ++ if (ctx->stamp - hold_ctx->stamp <= LONG_MAX && ++ (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) { ++#ifdef CONFIG_DEBUG_MUTEXES ++ DEBUG_LOCKS_WARN_ON(ctx->contending_lock); ++ ctx->contending_lock = ww; ++#endif ++ return -EDEADLK; ++ } ++ ++ return 0; ++} ++#else ++ static inline int __sched ++__mutex_lock_check_stamp(struct rt_mutex *lock, struct ww_acquire_ctx *ctx) ++{ ++ BUG(); ++ return 0; ++} ++ ++#endif ++ + static inline int + try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, + struct rt_mutex_waiter *waiter) +@@ -1512,7 +1547,8 @@ void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter, bool savestate) + static int __sched + __rt_mutex_slowlock(struct rt_mutex *lock, int state, + struct hrtimer_sleeper *timeout, +- struct rt_mutex_waiter *waiter) ++ struct rt_mutex_waiter *waiter, ++ struct ww_acquire_ctx *ww_ctx) + { + int ret = 0; + +@@ -1530,6 +1566,12 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, + break; + } + ++ if (ww_ctx && ww_ctx->acquired > 0) { ++ ret = __mutex_lock_check_stamp(lock, ww_ctx); ++ if (ret) ++ break; ++ } ++ + raw_spin_unlock_irq(&lock->wait_lock); + + schedule(); +@@ -1558,16 +1600,106 @@ static void rt_mutex_handle_deadlock(int res, int detect_deadlock, + } + } + ++static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww, ++ struct ww_acquire_ctx *ww_ctx) ++{ ++#ifdef CONFIG_DEBUG_MUTEXES ++ /* ++ * If this WARN_ON triggers, you used ww_mutex_lock to acquire, ++ * but released with a normal mutex_unlock in this call. ++ * ++ * This should never happen, always use ww_mutex_unlock. ++ */ ++ DEBUG_LOCKS_WARN_ON(ww->ctx); ++ ++ /* ++ * Not quite done after calling ww_acquire_done() ? ++ */ ++ DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire); ++ ++ if (ww_ctx->contending_lock) { ++ /* ++ * After -EDEADLK you tried to ++ * acquire a different ww_mutex? Bad! ++ */ ++ DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww); ++ ++ /* ++ * You called ww_mutex_lock after receiving -EDEADLK, ++ * but 'forgot' to unlock everything else first? ++ */ ++ DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0); ++ ww_ctx->contending_lock = NULL; ++ } ++ ++ /* ++ * Naughty, using a different class will lead to undefined behavior! ++ */ ++ DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class); ++#endif ++ ww_ctx->acquired++; ++} ++ ++#ifdef CONFIG_PREEMPT_RT ++static void ww_mutex_account_lock(struct rt_mutex *lock, ++ struct ww_acquire_ctx *ww_ctx) ++{ ++ struct ww_mutex *ww = container_of(lock, struct ww_mutex, base.lock); ++ struct rt_mutex_waiter *waiter, *n; ++ ++ /* ++ * This branch gets optimized out for the common case, ++ * and is only important for ww_mutex_lock. ++ */ ++ ww_mutex_lock_acquired(ww, ww_ctx); ++ ww->ctx = ww_ctx; ++ ++ /* ++ * Give any possible sleeping processes the chance to wake up, ++ * so they can recheck if they have to back off. ++ */ ++ rbtree_postorder_for_each_entry_safe(waiter, n, &lock->waiters.rb_root, ++ tree_entry) { ++ /* XXX debug rt mutex waiter wakeup */ ++ ++ BUG_ON(waiter->lock != lock); ++ rt_mutex_wake_waiter(waiter); ++ } ++} ++ ++#else ++ ++static void ww_mutex_account_lock(struct rt_mutex *lock, ++ struct ww_acquire_ctx *ww_ctx) ++{ ++ BUG(); ++} ++#endif ++ + int __sched rt_mutex_slowlock_locked(struct rt_mutex *lock, int state, + struct hrtimer_sleeper *timeout, + enum rtmutex_chainwalk chwalk, ++ struct ww_acquire_ctx *ww_ctx, + struct rt_mutex_waiter *waiter) + { + int ret; + ++#ifdef CONFIG_PREEMPT_RT ++ if (ww_ctx) { ++ struct ww_mutex *ww; ++ ++ ww = container_of(lock, struct ww_mutex, base.lock); ++ if (unlikely(ww_ctx == READ_ONCE(ww->ctx))) ++ return -EALREADY; ++ } ++#endif ++ + /* Try to acquire the lock again: */ +- if (try_to_take_rt_mutex(lock, current, NULL)) ++ if (try_to_take_rt_mutex(lock, current, NULL)) { ++ if (ww_ctx) ++ ww_mutex_account_lock(lock, ww_ctx); + return 0; ++ } + + set_current_state(state); + +@@ -1577,14 +1709,24 @@ int __sched rt_mutex_slowlock_locked(struct rt_mutex *lock, int state, + + ret = task_blocks_on_rt_mutex(lock, waiter, current, chwalk); + +- if (likely(!ret)) ++ if (likely(!ret)) { + /* sleep on the mutex */ +- ret = __rt_mutex_slowlock(lock, state, timeout, waiter); ++ ret = __rt_mutex_slowlock(lock, state, timeout, waiter, ++ ww_ctx); ++ } else if (ww_ctx) { ++ /* ww_mutex received EDEADLK, let it become EALREADY */ ++ ret = __mutex_lock_check_stamp(lock, ww_ctx); ++ BUG_ON(!ret); ++ } + + if (unlikely(ret)) { + __set_current_state(TASK_RUNNING); + remove_waiter(lock, waiter); +- rt_mutex_handle_deadlock(ret, chwalk, waiter); ++ /* ww_mutex wants to report EDEADLK/EALREADY, let it */ ++ if (!ww_ctx) ++ rt_mutex_handle_deadlock(ret, chwalk, waiter); ++ } else if (ww_ctx) { ++ ww_mutex_account_lock(lock, ww_ctx); + } + + /* +@@ -1601,7 +1743,8 @@ int __sched rt_mutex_slowlock_locked(struct rt_mutex *lock, int state, + static int __sched + rt_mutex_slowlock(struct rt_mutex *lock, int state, + struct hrtimer_sleeper *timeout, +- enum rtmutex_chainwalk chwalk) ++ enum rtmutex_chainwalk chwalk, ++ struct ww_acquire_ctx *ww_ctx) + { + struct rt_mutex_waiter waiter; + unsigned long flags; +@@ -1619,7 +1762,8 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, + */ + raw_spin_lock_irqsave(&lock->wait_lock, flags); + +- ret = rt_mutex_slowlock_locked(lock, state, timeout, chwalk, &waiter); ++ ret = rt_mutex_slowlock_locked(lock, state, timeout, chwalk, ww_ctx, ++ &waiter); + + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); + +@@ -1749,14 +1893,16 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock, + */ + static inline int + rt_mutex_fastlock(struct rt_mutex *lock, int state, ++ struct ww_acquire_ctx *ww_ctx, + int (*slowfn)(struct rt_mutex *lock, int state, + struct hrtimer_sleeper *timeout, +- enum rtmutex_chainwalk chwalk)) ++ enum rtmutex_chainwalk chwalk, ++ struct ww_acquire_ctx *ww_ctx)) + { + if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) + return 0; + +- return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK); ++ return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK, ww_ctx); + } + + static inline int +@@ -1801,7 +1947,7 @@ rt_mutex_fastunlock(struct rt_mutex *lock, + int __sched __rt_mutex_lock_state(struct rt_mutex *lock, int state) + { + might_sleep(); +- return rt_mutex_fastlock(lock, state, rt_mutex_slowlock); ++ return rt_mutex_fastlock(lock, state, NULL, rt_mutex_slowlock); + } + + /** +@@ -2245,7 +2391,7 @@ int rt_mutex_wait_proxy_lock(struct rt_mutex *lock, + raw_spin_lock_irq(&lock->wait_lock); + /* sleep on the mutex */ + set_current_state(TASK_INTERRUPTIBLE); +- ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter); ++ ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, NULL); + /* + * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might + * have to fix that up. +@@ -2315,3 +2461,97 @@ bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock, + + return cleanup; + } ++ ++static inline int ++ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) ++{ ++#ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH ++ unsigned int tmp; ++ ++ if (ctx->deadlock_inject_countdown-- == 0) { ++ tmp = ctx->deadlock_inject_interval; ++ if (tmp > UINT_MAX/4) ++ tmp = UINT_MAX; ++ else ++ tmp = tmp*2 + tmp + tmp/2; ++ ++ ctx->deadlock_inject_interval = tmp; ++ ctx->deadlock_inject_countdown = tmp; ++ ctx->contending_lock = lock; ++ ++ ww_mutex_unlock(lock); ++ ++ return -EDEADLK; ++ } ++#endif ++ ++ return 0; ++} ++ ++#ifdef CONFIG_PREEMPT_RT ++int __sched ++ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) ++{ ++ int ret; ++ ++ might_sleep(); ++ ++ mutex_acquire_nest(&lock->base.dep_map, 0, 0, ++ ctx ? &ctx->dep_map : NULL, _RET_IP_); ++ ret = rt_mutex_slowlock(&lock->base.lock, TASK_INTERRUPTIBLE, NULL, 0, ++ ctx); ++ if (ret) ++ mutex_release(&lock->base.dep_map, _RET_IP_); ++ else if (!ret && ctx && ctx->acquired > 1) ++ return ww_mutex_deadlock_injection(lock, ctx); ++ ++ return ret; ++} ++EXPORT_SYMBOL_GPL(ww_mutex_lock_interruptible); ++ ++int __sched ++ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) ++{ ++ int ret; ++ ++ might_sleep(); ++ ++ mutex_acquire_nest(&lock->base.dep_map, 0, 0, ++ ctx ? &ctx->dep_map : NULL, _RET_IP_); ++ ret = rt_mutex_slowlock(&lock->base.lock, TASK_UNINTERRUPTIBLE, NULL, 0, ++ ctx); ++ if (ret) ++ mutex_release(&lock->base.dep_map, _RET_IP_); ++ else if (!ret && ctx && ctx->acquired > 1) ++ return ww_mutex_deadlock_injection(lock, ctx); ++ ++ return ret; ++} ++EXPORT_SYMBOL_GPL(ww_mutex_lock); ++ ++void __sched ww_mutex_unlock(struct ww_mutex *lock) ++{ ++ /* ++ * The unlocking fastpath is the 0->1 transition from 'locked' ++ * into 'unlocked' state: ++ */ ++ if (lock->ctx) { ++#ifdef CONFIG_DEBUG_MUTEXES ++ DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired); ++#endif ++ if (lock->ctx->acquired > 0) ++ lock->ctx->acquired--; ++ lock->ctx = NULL; ++ } ++ ++ mutex_release(&lock->base.dep_map, _RET_IP_); ++ __rt_mutex_unlock(&lock->base.lock); ++} ++EXPORT_SYMBOL(ww_mutex_unlock); ++ ++int __rt_mutex_owner_current(struct rt_mutex *lock) ++{ ++ return rt_mutex_owner(lock) == current; ++} ++EXPORT_SYMBOL(__rt_mutex_owner_current); ++#endif +diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h +index c1a280167e3c..248a7d91583b 100644 +--- a/kernel/locking/rtmutex_common.h ++++ b/kernel/locking/rtmutex_common.h +@@ -159,6 +159,7 @@ extern void rt_mutex_postunlock(struct wake_q_head *wake_q, + struct wake_q_head *wake_sleeper_q); + + /* RW semaphore special interface */ ++struct ww_acquire_ctx; + + extern int __rt_mutex_lock_state(struct rt_mutex *lock, int state); + extern int __rt_mutex_trylock(struct rt_mutex *lock); +@@ -166,6 +167,7 @@ extern void __rt_mutex_unlock(struct rt_mutex *lock); + int __sched rt_mutex_slowlock_locked(struct rt_mutex *lock, int state, + struct hrtimer_sleeper *timeout, + enum rtmutex_chainwalk chwalk, ++ struct ww_acquire_ctx *ww_ctx, + struct rt_mutex_waiter *waiter); + void __sched rt_spin_lock_slowlock_locked(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter, +diff --git a/kernel/locking/rwsem-rt.c b/kernel/locking/rwsem-rt.c +index a0771c150041..274172d5bb3a 100644 +--- a/kernel/locking/rwsem-rt.c ++++ b/kernel/locking/rwsem-rt.c +@@ -138,7 +138,7 @@ static int __sched __down_read_common(struct rw_semaphore *sem, int state) + */ + rt_mutex_init_waiter(&waiter, false); + ret = rt_mutex_slowlock_locked(m, state, NULL, RT_MUTEX_MIN_CHAINWALK, +- &waiter); ++ NULL, &waiter); + /* + * The slowlock() above is guaranteed to return with the rtmutex (for + * ret = 0) is now held, so there can't be a writer active. Increment +-- +2.43.0 + diff --git a/debian/patches-rt/0177-locking-rtmutex-Use-custom-scheduling-function-for-s.patch b/debian/patches-rt/0177-locking-rtmutex-Use-custom-scheduling-function-for-s.patch new file mode 100644 index 000000000..9cabce468 --- /dev/null +++ b/debian/patches-rt/0177-locking-rtmutex-Use-custom-scheduling-function-for-s.patch @@ -0,0 +1,243 @@ +From 0a34a9993d2798ba232792f998e5ae5fe7519730 Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Tue, 6 Oct 2020 13:07:17 +0200 +Subject: [PATCH 177/323] locking/rtmutex: Use custom scheduling function for + spin-schedule() +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +PREEMPT_RT builds the rwsem, mutex, spinlock and rwlock typed locks on +top of a rtmutex lock. While blocked task->pi_blocked_on is set +(tsk_is_pi_blocked()) and task needs to schedule away while waiting. + +The schedule process must distinguish between blocking on a regular +sleeping lock (rwsem and mutex) and a RT-only sleeping lock (spinlock +and rwlock): +- rwsem and mutex must flush block requests (blk_schedule_flush_plug()) + even if blocked on a lock. This can not deadlock because this also + happens for non-RT. + There should be a warning if the scheduling point is within a RCU read + section. + +- spinlock and rwlock must not flush block requests. This will deadlock + if the callback attempts to acquire a lock which is already acquired. + Similarly to being preempted, there should be no warning if the + scheduling point is within a RCU read section. + +Add preempt_schedule_lock() which is invoked if scheduling is required +while blocking on a PREEMPT_RT-only sleeping lock. +Remove tsk_is_pi_blocked() from the scheduler path which is no longer +needed with the additional scheduler entry point. + +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + arch/arm64/include/asm/preempt.h | 3 +++ + arch/x86/include/asm/preempt.h | 3 +++ + include/asm-generic/preempt.h | 3 +++ + include/linux/sched/rt.h | 8 -------- + kernel/locking/rtmutex.c | 2 +- + kernel/locking/rwlock-rt.c | 2 +- + kernel/sched/core.c | 32 +++++++++++++++++++++----------- + 7 files changed, 32 insertions(+), 21 deletions(-) + +diff --git a/arch/arm64/include/asm/preempt.h b/arch/arm64/include/asm/preempt.h +index e83f0982b99c..f1486b32502c 100644 +--- a/arch/arm64/include/asm/preempt.h ++++ b/arch/arm64/include/asm/preempt.h +@@ -81,6 +81,9 @@ static inline bool should_resched(int preempt_offset) + + #ifdef CONFIG_PREEMPTION + void preempt_schedule(void); ++#ifdef CONFIG_PREEMPT_RT ++void preempt_schedule_lock(void); ++#endif + #define __preempt_schedule() preempt_schedule() + void preempt_schedule_notrace(void); + #define __preempt_schedule_notrace() preempt_schedule_notrace() +diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h +index a334dd0d7c42..50e0c0ab7b97 100644 +--- a/arch/x86/include/asm/preempt.h ++++ b/arch/x86/include/asm/preempt.h +@@ -103,6 +103,9 @@ static __always_inline bool should_resched(int preempt_offset) + } + + #ifdef CONFIG_PREEMPTION ++#ifdef CONFIG_PREEMPT_RT ++ extern void preempt_schedule_lock(void); ++#endif + extern asmlinkage void preempt_schedule_thunk(void); + # define __preempt_schedule() \ + asm volatile ("call preempt_schedule_thunk" : ASM_CALL_CONSTRAINT) +diff --git a/include/asm-generic/preempt.h b/include/asm-generic/preempt.h +index b4d43a4af5f7..ac255e889462 100644 +--- a/include/asm-generic/preempt.h ++++ b/include/asm-generic/preempt.h +@@ -79,6 +79,9 @@ static __always_inline bool should_resched(int preempt_offset) + } + + #ifdef CONFIG_PREEMPTION ++#ifdef CONFIG_PREEMPT_RT ++extern void preempt_schedule_lock(void); ++#endif + extern asmlinkage void preempt_schedule(void); + #define __preempt_schedule() preempt_schedule() + extern asmlinkage void preempt_schedule_notrace(void); +diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h +index e5af028c08b4..994c25640e15 100644 +--- a/include/linux/sched/rt.h ++++ b/include/linux/sched/rt.h +@@ -39,20 +39,12 @@ static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *p) + } + extern void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task); + extern void rt_mutex_adjust_pi(struct task_struct *p); +-static inline bool tsk_is_pi_blocked(struct task_struct *tsk) +-{ +- return tsk->pi_blocked_on != NULL; +-} + #else + static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *task) + { + return NULL; + } + # define rt_mutex_adjust_pi(p) do { } while (0) +-static inline bool tsk_is_pi_blocked(struct task_struct *tsk) +-{ +- return false; +-} + #endif + + extern void normalize_rt_tasks(void); +diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c +index c095d1b92f70..2fe178651254 100644 +--- a/kernel/locking/rtmutex.c ++++ b/kernel/locking/rtmutex.c +@@ -1067,7 +1067,7 @@ void __sched rt_spin_lock_slowlock_locked(struct rt_mutex *lock, + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); + + if (top_waiter != waiter || adaptive_wait(lock, lock_owner)) +- schedule(); ++ preempt_schedule_lock(); + + raw_spin_lock_irqsave(&lock->wait_lock, flags); + +diff --git a/kernel/locking/rwlock-rt.c b/kernel/locking/rwlock-rt.c +index 1ee16b8fedd7..16be7111aae7 100644 +--- a/kernel/locking/rwlock-rt.c ++++ b/kernel/locking/rwlock-rt.c +@@ -211,7 +211,7 @@ static void __write_rt_lock(struct rt_rw_lock *lock) + raw_spin_unlock_irqrestore(&m->wait_lock, flags); + + if (atomic_read(&lock->readers) != 0) +- schedule(); ++ preempt_schedule_lock(); + + raw_spin_lock_irqsave(&m->wait_lock, flags); + +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index be5d41ed6ff2..aaeed4b14278 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -5001,7 +5001,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + * + * WARNING: must be called with preemption disabled! + */ +-static void __sched notrace __schedule(bool preempt) ++static void __sched notrace __schedule(bool preempt, bool spinning_lock) + { + struct task_struct *prev, *next; + unsigned long *switch_count; +@@ -5054,7 +5054,7 @@ static void __sched notrace __schedule(bool preempt) + * - ptrace_{,un}freeze_traced() can change ->state underneath us. + */ + prev_state = prev->state; +- if (!preempt && prev_state) { ++ if ((!preempt || spinning_lock) && prev_state) { + if (signal_pending_state(prev_state, prev)) { + prev->state = TASK_RUNNING; + } else { +@@ -5138,7 +5138,7 @@ void __noreturn do_task_dead(void) + /* Tell freezer to ignore us: */ + current->flags |= PF_NOFREEZE; + +- __schedule(false); ++ __schedule(false, false); + BUG(); + + /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */ +@@ -5171,9 +5171,6 @@ static inline void sched_submit_work(struct task_struct *tsk) + preempt_enable_no_resched(); + } + +- if (tsk_is_pi_blocked(tsk)) +- return; +- + /* + * If we are going to sleep and we have plugged IO queued, + * make sure to submit it to avoid deadlocks. +@@ -5199,7 +5196,7 @@ asmlinkage __visible void __sched schedule(void) + sched_submit_work(tsk); + do { + preempt_disable(); +- __schedule(false); ++ __schedule(false, false); + sched_preempt_enable_no_resched(); + } while (need_resched()); + sched_update_worker(tsk); +@@ -5227,7 +5224,7 @@ void __sched schedule_idle(void) + */ + WARN_ON_ONCE(current->state); + do { +- __schedule(false); ++ __schedule(false, false); + } while (need_resched()); + } + +@@ -5280,7 +5277,7 @@ static void __sched notrace preempt_schedule_common(void) + */ + preempt_disable_notrace(); + preempt_latency_start(1); +- __schedule(true); ++ __schedule(true, false); + preempt_latency_stop(1); + preempt_enable_no_resched_notrace(); + +@@ -5310,6 +5307,19 @@ asmlinkage __visible void __sched notrace preempt_schedule(void) + NOKPROBE_SYMBOL(preempt_schedule); + EXPORT_SYMBOL(preempt_schedule); + ++#ifdef CONFIG_PREEMPT_RT ++void __sched notrace preempt_schedule_lock(void) ++{ ++ do { ++ preempt_disable(); ++ __schedule(true, true); ++ sched_preempt_enable_no_resched(); ++ } while (need_resched()); ++} ++NOKPROBE_SYMBOL(preempt_schedule_lock); ++EXPORT_SYMBOL(preempt_schedule_lock); ++#endif ++ + /** + * preempt_schedule_notrace - preempt_schedule called by tracing + * +@@ -5353,7 +5363,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) + * an infinite recursion. + */ + prev_ctx = exception_enter(); +- __schedule(true); ++ __schedule(true, false); + exception_exit(prev_ctx); + + preempt_latency_stop(1); +@@ -5382,7 +5392,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void) + do { + preempt_disable(); + local_irq_enable(); +- __schedule(true); ++ __schedule(true, false); + local_irq_disable(); + sched_preempt_enable_no_resched(); + } while (need_resched()); +-- +2.43.0 + diff --git a/debian/patches-rt/0178-signal-Revert-ptrace-preempt-magic.patch b/debian/patches-rt/0178-signal-Revert-ptrace-preempt-magic.patch new file mode 100644 index 000000000..c90e0baa5 --- /dev/null +++ b/debian/patches-rt/0178-signal-Revert-ptrace-preempt-magic.patch @@ -0,0 +1,39 @@ +From 3d8d7beacea4da55fdd1b45bc8d5c19da1438dcb Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Wed, 21 Sep 2011 19:57:12 +0200 +Subject: [PATCH 178/323] signal: Revert ptrace preempt magic +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Upstream commit '53da1d9456fe7f8 fix ptrace slowness' is nothing more +than a bandaid around the ptrace design trainwreck. It's not a +correctness issue, it's merily a cosmetic bandaid. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +--- + kernel/signal.c | 8 -------- + 1 file changed, 8 deletions(-) + +diff --git a/kernel/signal.c b/kernel/signal.c +index e487c4660921..5ab628e2afc5 100644 +--- a/kernel/signal.c ++++ b/kernel/signal.c +@@ -2193,16 +2193,8 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t + if (gstop_done && ptrace_reparented(current)) + do_notify_parent_cldstop(current, false, why); + +- /* +- * Don't want to allow preemption here, because +- * sys_ptrace() needs this task to be inactive. +- * +- * XXX: implement read_unlock_no_resched(). +- */ +- preempt_disable(); + read_unlock(&tasklist_lock); + cgroup_enter_frozen(); +- preempt_enable_no_resched(); + freezable_schedule(); + cgroup_leave_frozen(true); + } else { +-- +2.43.0 + diff --git a/debian/patches-rt/0179-preempt-Provide-preempt_-_-no-rt-variants.patch b/debian/patches-rt/0179-preempt-Provide-preempt_-_-no-rt-variants.patch new file mode 100644 index 000000000..044b63df6 --- /dev/null +++ b/debian/patches-rt/0179-preempt-Provide-preempt_-_-no-rt-variants.patch @@ -0,0 +1,53 @@ +From 500d1733493dabbf5baf698854070d3cb1b0990f Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Fri, 24 Jul 2009 12:38:56 +0200 +Subject: [PATCH 179/323] preempt: Provide preempt_*_(no)rt variants +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +RT needs a few preempt_disable/enable points which are not necessary +otherwise. Implement variants to avoid #ifdeffery. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +--- + include/linux/preempt.h | 18 +++++++++++++++++- + 1 file changed, 17 insertions(+), 1 deletion(-) + +diff --git a/include/linux/preempt.h b/include/linux/preempt.h +index 4d244e295e85..5ceac863e729 100644 +--- a/include/linux/preempt.h ++++ b/include/linux/preempt.h +@@ -188,7 +188,11 @@ do { \ + preempt_count_dec(); \ + } while (0) + +-#define preempt_enable_no_resched() sched_preempt_enable_no_resched() ++#ifdef CONFIG_PREEMPT_RT ++# define preempt_enable_no_resched() sched_preempt_enable_no_resched() ++#else ++# define preempt_enable_no_resched() preempt_enable() ++#endif + + #define preemptible() (preempt_count() == 0 && !irqs_disabled()) + +@@ -282,6 +286,18 @@ do { \ + set_preempt_need_resched(); \ + } while (0) + ++#ifdef CONFIG_PREEMPT_RT ++# define preempt_disable_rt() preempt_disable() ++# define preempt_enable_rt() preempt_enable() ++# define preempt_disable_nort() barrier() ++# define preempt_enable_nort() barrier() ++#else ++# define preempt_disable_rt() barrier() ++# define preempt_enable_rt() barrier() ++# define preempt_disable_nort() preempt_disable() ++# define preempt_enable_nort() preempt_enable() ++#endif ++ + #ifdef CONFIG_PREEMPT_NOTIFIERS + + struct preempt_notifier; +-- +2.43.0 + diff --git a/debian/patches-rt/0180-mm-vmstat-Protect-per-cpu-variables-with-preempt-dis.patch b/debian/patches-rt/0180-mm-vmstat-Protect-per-cpu-variables-with-preempt-dis.patch new file mode 100644 index 000000000..6512c18a9 --- /dev/null +++ b/debian/patches-rt/0180-mm-vmstat-Protect-per-cpu-variables-with-preempt-dis.patch @@ -0,0 +1,145 @@ +From a066e44619e1a6f661cd861015a5b658adead8bc Mon Sep 17 00:00:00 2001 +From: Ingo Molnar <mingo@elte.hu> +Date: Fri, 3 Jul 2009 08:30:13 -0500 +Subject: [PATCH 180/323] mm/vmstat: Protect per cpu variables with preempt + disable on RT +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Disable preemption on -RT for the vmstat code. On vanila the code runs in +IRQ-off regions while on -RT it is not. "preempt_disable" ensures that the +same ressources is not updated in parallel due to preemption. + +Signed-off-by: Ingo Molnar <mingo@elte.hu> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +--- + include/linux/vmstat.h | 4 ++++ + mm/vmstat.c | 12 ++++++++++++ + 2 files changed, 16 insertions(+) + +diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h +index 322dcbfcc933..9a3a10ea3e3c 100644 +--- a/include/linux/vmstat.h ++++ b/include/linux/vmstat.h +@@ -63,7 +63,9 @@ DECLARE_PER_CPU(struct vm_event_state, vm_event_states); + */ + static inline void __count_vm_event(enum vm_event_item item) + { ++ preempt_disable_rt(); + raw_cpu_inc(vm_event_states.event[item]); ++ preempt_enable_rt(); + } + + static inline void count_vm_event(enum vm_event_item item) +@@ -73,7 +75,9 @@ static inline void count_vm_event(enum vm_event_item item) + + static inline void __count_vm_events(enum vm_event_item item, long delta) + { ++ preempt_disable_rt(); + raw_cpu_add(vm_event_states.event[item], delta); ++ preempt_enable_rt(); + } + + static inline void count_vm_events(enum vm_event_item item, long delta) +diff --git a/mm/vmstat.c b/mm/vmstat.c +index e292e63afebf..598e9317c7e0 100644 +--- a/mm/vmstat.c ++++ b/mm/vmstat.c +@@ -321,6 +321,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, + long x; + long t; + ++ preempt_disable_rt(); + x = delta + __this_cpu_read(*p); + + t = __this_cpu_read(pcp->stat_threshold); +@@ -330,6 +331,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, + x = 0; + } + __this_cpu_write(*p, x); ++ preempt_enable_rt(); + } + EXPORT_SYMBOL(__mod_zone_page_state); + +@@ -346,6 +348,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, + delta >>= PAGE_SHIFT; + } + ++ preempt_disable_rt(); + x = delta + __this_cpu_read(*p); + + t = __this_cpu_read(pcp->stat_threshold); +@@ -355,6 +358,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, + x = 0; + } + __this_cpu_write(*p, x); ++ preempt_enable_rt(); + } + EXPORT_SYMBOL(__mod_node_page_state); + +@@ -387,6 +391,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item) + s8 __percpu *p = pcp->vm_stat_diff + item; + s8 v, t; + ++ preempt_disable_rt(); + v = __this_cpu_inc_return(*p); + t = __this_cpu_read(pcp->stat_threshold); + if (unlikely(v > t)) { +@@ -395,6 +400,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item) + zone_page_state_add(v + overstep, zone, item); + __this_cpu_write(*p, -overstep); + } ++ preempt_enable_rt(); + } + + void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) +@@ -405,6 +411,7 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) + + VM_WARN_ON_ONCE(vmstat_item_in_bytes(item)); + ++ preempt_disable_rt(); + v = __this_cpu_inc_return(*p); + t = __this_cpu_read(pcp->stat_threshold); + if (unlikely(v > t)) { +@@ -413,6 +420,7 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) + node_page_state_add(v + overstep, pgdat, item); + __this_cpu_write(*p, -overstep); + } ++ preempt_enable_rt(); + } + + void __inc_zone_page_state(struct page *page, enum zone_stat_item item) +@@ -433,6 +441,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item) + s8 __percpu *p = pcp->vm_stat_diff + item; + s8 v, t; + ++ preempt_disable_rt(); + v = __this_cpu_dec_return(*p); + t = __this_cpu_read(pcp->stat_threshold); + if (unlikely(v < - t)) { +@@ -441,6 +450,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item) + zone_page_state_add(v - overstep, zone, item); + __this_cpu_write(*p, overstep); + } ++ preempt_enable_rt(); + } + + void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item) +@@ -451,6 +461,7 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item) + + VM_WARN_ON_ONCE(vmstat_item_in_bytes(item)); + ++ preempt_disable_rt(); + v = __this_cpu_dec_return(*p); + t = __this_cpu_read(pcp->stat_threshold); + if (unlikely(v < - t)) { +@@ -459,6 +470,7 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item) + node_page_state_add(v - overstep, pgdat, item); + __this_cpu_write(*p, overstep); + } ++ preempt_enable_rt(); + } + + void __dec_zone_page_state(struct page *page, enum zone_stat_item item) +-- +2.43.0 + diff --git a/debian/patches-rt/0181-mm-memcontrol-Disable-preemption-in-__mod_memcg_lruv.patch b/debian/patches-rt/0181-mm-memcontrol-Disable-preemption-in-__mod_memcg_lruv.patch new file mode 100644 index 000000000..2df18df08 --- /dev/null +++ b/debian/patches-rt/0181-mm-memcontrol-Disable-preemption-in-__mod_memcg_lruv.patch @@ -0,0 +1,44 @@ +From 6f35e0e3a90abef410034d9900a21e68131aa5d5 Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Wed, 28 Oct 2020 18:15:32 +0100 +Subject: [PATCH 181/323] mm/memcontrol: Disable preemption in + __mod_memcg_lruvec_state() +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +The callers expect disabled preemption/interrupts while invoking +__mod_memcg_lruvec_state(). This works mainline because a lock of +somekind is acquired. + +Use preempt_disable_rt() where per-CPU variables are accessed and a +stable pointer is expected. This is also done in __mod_zone_page_state() +for the same reason. + +Cc: stable-rt@vger.kernel.org +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + mm/memcontrol.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/mm/memcontrol.c b/mm/memcontrol.c +index ddc8ed096dec..49566afaef1c 100644 +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -816,6 +816,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, + pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); + memcg = pn->memcg; + ++ preempt_disable_rt(); + /* Update memcg */ + __mod_memcg_state(memcg, idx, val); + +@@ -835,6 +836,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, + x = 0; + } + __this_cpu_write(pn->lruvec_stat_cpu->count[idx], x); ++ preempt_enable_rt(); + } + + /** +-- +2.43.0 + diff --git a/debian/patches-rt/0182-xfrm-Use-sequence-counter-with-associated-spinlock.patch b/debian/patches-rt/0182-xfrm-Use-sequence-counter-with-associated-spinlock.patch new file mode 100644 index 000000000..abce9d9d0 --- /dev/null +++ b/debian/patches-rt/0182-xfrm-Use-sequence-counter-with-associated-spinlock.patch @@ -0,0 +1,46 @@ +From 17aae888f2940f55065824bd7c267df06be47fbb Mon Sep 17 00:00:00 2001 +From: "Ahmed S. Darwish" <a.darwish@linutronix.de> +Date: Wed, 10 Jun 2020 12:53:22 +0200 +Subject: [PATCH 182/323] xfrm: Use sequence counter with associated spinlock +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +A sequence counter write side critical section must be protected by some +form of locking to serialize writers. A plain seqcount_t does not +contain the information of which lock must be held when entering a write +side critical section. + +Use the new seqcount_spinlock_t data type, which allows to associate a +spinlock with the sequence counter. This enables lockdep to verify that +the spinlock used for writer serialization is held when the write side +critical section is entered. + +If lockdep is disabled this lock association is compiled out and has +neither storage size nor runtime overhead. + +Upstream-status: The xfrm locking used for seqcoun writer serialization +appears to be broken. If that's the case, a proper fix will need to be +submitted upstream. (e.g. make the seqcount per network namespace?) + +Signed-off-by: Ahmed S. Darwish <a.darwish@linutronix.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + net/xfrm/xfrm_state.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c +index ba73014805a4..14e1f36bdda5 100644 +--- a/net/xfrm/xfrm_state.c ++++ b/net/xfrm/xfrm_state.c +@@ -2691,7 +2691,8 @@ int __net_init xfrm_state_init(struct net *net) + net->xfrm.state_num = 0; + INIT_WORK(&net->xfrm.state_hash_work, xfrm_hash_resize); + spin_lock_init(&net->xfrm.xfrm_state_lock); +- seqcount_init(&net->xfrm.xfrm_state_hash_generation); ++ seqcount_spinlock_init(&net->xfrm.xfrm_state_hash_generation, ++ &net->xfrm.xfrm_state_lock); + return 0; + + out_byspi: +-- +2.43.0 + diff --git a/debian/patches-rt/0183-u64_stats-Disable-preemption-on-32bit-UP-SMP-with-RT.patch b/debian/patches-rt/0183-u64_stats-Disable-preemption-on-32bit-UP-SMP-with-RT.patch new file mode 100644 index 000000000..f100e0717 --- /dev/null +++ b/debian/patches-rt/0183-u64_stats-Disable-preemption-on-32bit-UP-SMP-with-RT.patch @@ -0,0 +1,152 @@ +From e6ec60749d80fcdde8451a7a544a218f7c5ef393 Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Mon, 17 Aug 2020 12:28:10 +0200 +Subject: [PATCH 183/323] u64_stats: Disable preemption on 32bit-UP/SMP with RT + during updates +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +On RT the seqcount_t is required even on UP because the softirq can be +preempted. The IRQ handler is threaded so it is also preemptible. + +Disable preemption on 32bit-RT during value updates. There is no need to +disable interrupts on RT because the handler is run threaded. Therefore +disabling preemption is enough to guarantee that the update is not +interruped. + +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/linux/u64_stats_sync.h | 42 ++++++++++++++++++++++------------ + 1 file changed, 28 insertions(+), 14 deletions(-) + +diff --git a/include/linux/u64_stats_sync.h b/include/linux/u64_stats_sync.h +index e81856c0ba13..66eb968a09d4 100644 +--- a/include/linux/u64_stats_sync.h ++++ b/include/linux/u64_stats_sync.h +@@ -66,7 +66,7 @@ + #include <linux/seqlock.h> + + struct u64_stats_sync { +-#if BITS_PER_LONG==32 && defined(CONFIG_SMP) ++#if BITS_PER_LONG==32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) + seqcount_t seq; + #endif + }; +@@ -115,7 +115,7 @@ static inline void u64_stats_inc(u64_stats_t *p) + } + #endif + +-#if BITS_PER_LONG == 32 && defined(CONFIG_SMP) ++#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) + #define u64_stats_init(syncp) seqcount_init(&(syncp)->seq) + #else + static inline void u64_stats_init(struct u64_stats_sync *syncp) +@@ -125,15 +125,19 @@ static inline void u64_stats_init(struct u64_stats_sync *syncp) + + static inline void u64_stats_update_begin(struct u64_stats_sync *syncp) + { +-#if BITS_PER_LONG==32 && defined(CONFIG_SMP) ++#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) ++ if (IS_ENABLED(CONFIG_PREEMPT_RT)) ++ preempt_disable(); + write_seqcount_begin(&syncp->seq); + #endif + } + + static inline void u64_stats_update_end(struct u64_stats_sync *syncp) + { +-#if BITS_PER_LONG==32 && defined(CONFIG_SMP) ++#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) + write_seqcount_end(&syncp->seq); ++ if (IS_ENABLED(CONFIG_PREEMPT_RT)) ++ preempt_enable(); + #endif + } + +@@ -142,8 +146,11 @@ u64_stats_update_begin_irqsave(struct u64_stats_sync *syncp) + { + unsigned long flags = 0; + +-#if BITS_PER_LONG==32 && defined(CONFIG_SMP) +- local_irq_save(flags); ++#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) ++ if (IS_ENABLED(CONFIG_PREEMPT_RT)) ++ preempt_disable(); ++ else ++ local_irq_save(flags); + write_seqcount_begin(&syncp->seq); + #endif + return flags; +@@ -153,15 +160,18 @@ static inline void + u64_stats_update_end_irqrestore(struct u64_stats_sync *syncp, + unsigned long flags) + { +-#if BITS_PER_LONG==32 && defined(CONFIG_SMP) ++#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) + write_seqcount_end(&syncp->seq); +- local_irq_restore(flags); ++ if (IS_ENABLED(CONFIG_PREEMPT_RT)) ++ preempt_enable(); ++ else ++ local_irq_restore(flags); + #endif + } + + static inline unsigned int __u64_stats_fetch_begin(const struct u64_stats_sync *syncp) + { +-#if BITS_PER_LONG==32 && defined(CONFIG_SMP) ++#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) + return read_seqcount_begin(&syncp->seq); + #else + return 0; +@@ -170,7 +180,7 @@ static inline unsigned int __u64_stats_fetch_begin(const struct u64_stats_sync * + + static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *syncp) + { +-#if BITS_PER_LONG==32 && !defined(CONFIG_SMP) ++#if BITS_PER_LONG == 32 && (!defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_RT)) + preempt_disable(); + #endif + return __u64_stats_fetch_begin(syncp); +@@ -179,7 +189,7 @@ static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *sy + static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp, + unsigned int start) + { +-#if BITS_PER_LONG==32 && defined(CONFIG_SMP) ++#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) + return read_seqcount_retry(&syncp->seq, start); + #else + return false; +@@ -189,7 +199,7 @@ static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp, + static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp, + unsigned int start) + { +-#if BITS_PER_LONG==32 && !defined(CONFIG_SMP) ++#if BITS_PER_LONG == 32 && (!defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_RT)) + preempt_enable(); + #endif + return __u64_stats_fetch_retry(syncp, start); +@@ -203,7 +213,9 @@ static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp, + */ + static inline unsigned int u64_stats_fetch_begin_irq(const struct u64_stats_sync *syncp) + { +-#if BITS_PER_LONG==32 && !defined(CONFIG_SMP) ++#if BITS_PER_LONG == 32 && defined(CONFIG_PREEMPT_RT) ++ preempt_disable(); ++#elif BITS_PER_LONG == 32 && !defined(CONFIG_SMP) + local_irq_disable(); + #endif + return __u64_stats_fetch_begin(syncp); +@@ -212,7 +224,9 @@ static inline unsigned int u64_stats_fetch_begin_irq(const struct u64_stats_sync + static inline bool u64_stats_fetch_retry_irq(const struct u64_stats_sync *syncp, + unsigned int start) + { +-#if BITS_PER_LONG==32 && !defined(CONFIG_SMP) ++#if BITS_PER_LONG == 32 && defined(CONFIG_PREEMPT_RT) ++ preempt_enable(); ++#elif BITS_PER_LONG == 32 && !defined(CONFIG_SMP) + local_irq_enable(); + #endif + return __u64_stats_fetch_retry(syncp, start); +-- +2.43.0 + diff --git a/debian/patches-rt/0184-fs-dcache-use-swait_queue-instead-of-waitqueue.patch b/debian/patches-rt/0184-fs-dcache-use-swait_queue-instead-of-waitqueue.patch new file mode 100644 index 000000000..8a43098f4 --- /dev/null +++ b/debian/patches-rt/0184-fs-dcache-use-swait_queue-instead-of-waitqueue.patch @@ -0,0 +1,263 @@ +From aeadd35a13f446ec84a832549b2815fc3de30917 Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Wed, 14 Sep 2016 14:35:49 +0200 +Subject: [PATCH 184/323] fs/dcache: use swait_queue instead of waitqueue +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +__d_lookup_done() invokes wake_up_all() while holding a hlist_bl_lock() +which disables preemption. As a workaround convert it to swait. + +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + fs/afs/dir_silly.c | 2 +- + fs/cifs/readdir.c | 2 +- + fs/dcache.c | 27 +++++++++++++++------------ + fs/fuse/readdir.c | 2 +- + fs/namei.c | 4 ++-- + fs/nfs/dir.c | 4 ++-- + fs/nfs/unlink.c | 4 ++-- + fs/proc/base.c | 3 ++- + fs/proc/proc_sysctl.c | 2 +- + include/linux/dcache.h | 4 ++-- + include/linux/nfs_xdr.h | 2 +- + kernel/sched/swait.c | 1 + + 12 files changed, 31 insertions(+), 26 deletions(-) + +diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c +index dae9a57d7ec0..9a6a0ec4d1fb 100644 +--- a/fs/afs/dir_silly.c ++++ b/fs/afs/dir_silly.c +@@ -239,7 +239,7 @@ int afs_silly_iput(struct dentry *dentry, struct inode *inode) + struct dentry *alias; + int ret; + +- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); ++ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); + + _enter("%p{%pd},%llx", dentry, dentry, vnode->fid.vnode); + +diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c +index 799be3a5d25e..d5165a7da071 100644 +--- a/fs/cifs/readdir.c ++++ b/fs/cifs/readdir.c +@@ -81,7 +81,7 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name, + struct inode *inode; + struct super_block *sb = parent->d_sb; + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); +- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); ++ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); + + cifs_dbg(FYI, "%s: for %s\n", __func__, name->name); + +diff --git a/fs/dcache.c b/fs/dcache.c +index ea0485861d93..1f4255ef8722 100644 +--- a/fs/dcache.c ++++ b/fs/dcache.c +@@ -2518,21 +2518,24 @@ static inline void end_dir_add(struct inode *dir, unsigned n) + + static void d_wait_lookup(struct dentry *dentry) + { +- if (d_in_lookup(dentry)) { +- DECLARE_WAITQUEUE(wait, current); +- add_wait_queue(dentry->d_wait, &wait); +- do { +- set_current_state(TASK_UNINTERRUPTIBLE); +- spin_unlock(&dentry->d_lock); +- schedule(); +- spin_lock(&dentry->d_lock); +- } while (d_in_lookup(dentry)); +- } ++ struct swait_queue __wait; ++ ++ if (!d_in_lookup(dentry)) ++ return; ++ ++ INIT_LIST_HEAD(&__wait.task_list); ++ do { ++ prepare_to_swait_exclusive(dentry->d_wait, &__wait, TASK_UNINTERRUPTIBLE); ++ spin_unlock(&dentry->d_lock); ++ schedule(); ++ spin_lock(&dentry->d_lock); ++ } while (d_in_lookup(dentry)); ++ finish_swait(dentry->d_wait, &__wait); + } + + struct dentry *d_alloc_parallel(struct dentry *parent, + const struct qstr *name, +- wait_queue_head_t *wq) ++ struct swait_queue_head *wq) + { + unsigned int hash = name->hash; + struct hlist_bl_head *b = in_lookup_hash(parent, hash); +@@ -2647,7 +2650,7 @@ void __d_lookup_done(struct dentry *dentry) + hlist_bl_lock(b); + dentry->d_flags &= ~DCACHE_PAR_LOOKUP; + __hlist_bl_del(&dentry->d_u.d_in_lookup_hash); +- wake_up_all(dentry->d_wait); ++ swake_up_all(dentry->d_wait); + dentry->d_wait = NULL; + hlist_bl_unlock(b); + INIT_HLIST_NODE(&dentry->d_u.d_alias); +diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c +index 14e99ffa57af..eb899feaf82d 100644 +--- a/fs/fuse/readdir.c ++++ b/fs/fuse/readdir.c +@@ -160,7 +160,7 @@ static int fuse_direntplus_link(struct file *file, + struct inode *dir = d_inode(parent); + struct fuse_conn *fc; + struct inode *inode; +- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); ++ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); + + if (!o->nodeid) { + /* +diff --git a/fs/namei.c b/fs/namei.c +index 3ff954a2bbd1..01e3f8195ee1 100644 +--- a/fs/namei.c ++++ b/fs/namei.c +@@ -1532,7 +1532,7 @@ static struct dentry *__lookup_slow(const struct qstr *name, + { + struct dentry *dentry, *old; + struct inode *inode = dir->d_inode; +- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); ++ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); + + /* Don't go there if it's already dead */ + if (unlikely(IS_DEADDIR(inode))) +@@ -3085,7 +3085,7 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file, + struct dentry *dentry; + int error, create_error = 0; + umode_t mode = op->mode; +- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); ++ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); + + if (unlikely(IS_DEADDIR(dir_inode))) + return ERR_PTR(-ENOENT); +diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c +index 9f88ca7b2001..bc8a78ecfe1c 100644 +--- a/fs/nfs/dir.c ++++ b/fs/nfs/dir.c +@@ -484,7 +484,7 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry, + unsigned long dir_verifier) + { + struct qstr filename = QSTR_INIT(entry->name, entry->len); +- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); ++ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); + struct dentry *dentry; + struct dentry *alias; + struct inode *inode; +@@ -1660,7 +1660,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, + struct file *file, unsigned open_flags, + umode_t mode) + { +- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); ++ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); + struct nfs_open_context *ctx; + struct dentry *res; + struct iattr attr = { .ia_valid = ATTR_OPEN }; +diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c +index b27ebdccef70..f86c98a7ed04 100644 +--- a/fs/nfs/unlink.c ++++ b/fs/nfs/unlink.c +@@ -13,7 +13,7 @@ + #include <linux/sunrpc/clnt.h> + #include <linux/nfs_fs.h> + #include <linux/sched.h> +-#include <linux/wait.h> ++#include <linux/swait.h> + #include <linux/namei.h> + #include <linux/fsnotify.h> + +@@ -180,7 +180,7 @@ nfs_async_unlink(struct dentry *dentry, const struct qstr *name) + + data->cred = get_current_cred(); + data->res.dir_attr = &data->dir_attr; +- init_waitqueue_head(&data->wq); ++ init_swait_queue_head(&data->wq); + + status = -EBUSY; + spin_lock(&dentry->d_lock); +diff --git a/fs/proc/base.c b/fs/proc/base.c +index 712948e97991..585d0afd1af6 100644 +--- a/fs/proc/base.c ++++ b/fs/proc/base.c +@@ -96,6 +96,7 @@ + #include <linux/posix-timers.h> + #include <linux/time_namespace.h> + #include <linux/resctrl.h> ++#include <linux/swait.h> + #include <trace/events/oom.h> + #include "internal.h" + #include "fd.h" +@@ -2066,7 +2067,7 @@ bool proc_fill_cache(struct file *file, struct dir_context *ctx, + + child = d_hash_and_lookup(dir, &qname); + if (!child) { +- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); ++ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); + child = d_alloc_parallel(dir, &qname, &wq); + if (IS_ERR(child)) + goto end_instantiate; +diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c +index aff9593feb73..d1a29668bff8 100644 +--- a/fs/proc/proc_sysctl.c ++++ b/fs/proc/proc_sysctl.c +@@ -684,7 +684,7 @@ static bool proc_sys_fill_cache(struct file *file, + + child = d_lookup(dir, &qname); + if (!child) { +- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); ++ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); + child = d_alloc_parallel(dir, &qname, &wq); + if (IS_ERR(child)) + return false; +diff --git a/include/linux/dcache.h b/include/linux/dcache.h +index 6f95c3300cbb..c1290db778bd 100644 +--- a/include/linux/dcache.h ++++ b/include/linux/dcache.h +@@ -106,7 +106,7 @@ struct dentry { + + union { + struct list_head d_lru; /* LRU list */ +- wait_queue_head_t *d_wait; /* in-lookup ones only */ ++ struct swait_queue_head *d_wait; /* in-lookup ones only */ + }; + struct list_head d_child; /* child of parent list */ + struct list_head d_subdirs; /* our children */ +@@ -238,7 +238,7 @@ extern void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op + extern struct dentry * d_alloc(struct dentry *, const struct qstr *); + extern struct dentry * d_alloc_anon(struct super_block *); + extern struct dentry * d_alloc_parallel(struct dentry *, const struct qstr *, +- wait_queue_head_t *); ++ struct swait_queue_head *); + extern struct dentry * d_splice_alias(struct inode *, struct dentry *); + extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *); + extern struct dentry * d_exact_alias(struct dentry *, struct inode *); +diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h +index 33442fd018a0..4612bb5be6ca 100644 +--- a/include/linux/nfs_xdr.h ++++ b/include/linux/nfs_xdr.h +@@ -1675,7 +1675,7 @@ struct nfs_unlinkdata { + struct nfs_removeargs args; + struct nfs_removeres res; + struct dentry *dentry; +- wait_queue_head_t wq; ++ struct swait_queue_head wq; + const struct cred *cred; + struct nfs_fattr dir_attr; + long timeout; +diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c +index e1c655f928c7..f230b1ac7f91 100644 +--- a/kernel/sched/swait.c ++++ b/kernel/sched/swait.c +@@ -64,6 +64,7 @@ void swake_up_all(struct swait_queue_head *q) + struct swait_queue *curr; + LIST_HEAD(tmp); + ++ WARN_ON(irqs_disabled()); + raw_spin_lock_irq(&q->lock); + list_splice_init(&q->task_list, &tmp); + while (!list_empty(&tmp)) { +-- +2.43.0 + diff --git a/debian/patches-rt/0185-fs-dcache-disable-preemption-on-i_dir_seq-s-write-si.patch b/debian/patches-rt/0185-fs-dcache-disable-preemption-on-i_dir_seq-s-write-si.patch new file mode 100644 index 000000000..e687b4d2d --- /dev/null +++ b/debian/patches-rt/0185-fs-dcache-disable-preemption-on-i_dir_seq-s-write-si.patch @@ -0,0 +1,99 @@ +From 06c5210bfc8eff16e0d1df2430c7fcb07eae6c67 Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Fri, 20 Oct 2017 11:29:53 +0200 +Subject: [PATCH 185/323] fs/dcache: disable preemption on i_dir_seq's write + side +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +i_dir_seq is an opencoded seqcounter. Based on the code it looks like we +could have two writers in parallel despite the fact that the d_lock is +held. The problem is that during the write process on RT the preemption +is still enabled and if this process is interrupted by a reader with RT +priority then we lock up. +To avoid that lock up I am disabling the preemption during the update. +The rename of i_dir_seq is here to ensure to catch new write sides in +future. + +Cc: stable-rt@vger.kernel.org +Reported-by: Oleg.Karfich@wago.com +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + fs/dcache.c | 12 +++++++----- + fs/inode.c | 2 +- + include/linux/fs.h | 2 +- + 3 files changed, 9 insertions(+), 7 deletions(-) + +diff --git a/fs/dcache.c b/fs/dcache.c +index 1f4255ef8722..26a187abf13a 100644 +--- a/fs/dcache.c ++++ b/fs/dcache.c +@@ -2503,9 +2503,10 @@ EXPORT_SYMBOL(d_rehash); + static inline unsigned start_dir_add(struct inode *dir) + { + ++ preempt_disable_rt(); + for (;;) { +- unsigned n = dir->i_dir_seq; +- if (!(n & 1) && cmpxchg(&dir->i_dir_seq, n, n + 1) == n) ++ unsigned n = dir->__i_dir_seq; ++ if (!(n & 1) && cmpxchg(&dir->__i_dir_seq, n, n + 1) == n) + return n; + cpu_relax(); + } +@@ -2513,7 +2514,8 @@ static inline unsigned start_dir_add(struct inode *dir) + + static inline void end_dir_add(struct inode *dir, unsigned n) + { +- smp_store_release(&dir->i_dir_seq, n + 2); ++ smp_store_release(&dir->__i_dir_seq, n + 2); ++ preempt_enable_rt(); + } + + static void d_wait_lookup(struct dentry *dentry) +@@ -2549,7 +2551,7 @@ struct dentry *d_alloc_parallel(struct dentry *parent, + + retry: + rcu_read_lock(); +- seq = smp_load_acquire(&parent->d_inode->i_dir_seq); ++ seq = smp_load_acquire(&parent->d_inode->__i_dir_seq); + r_seq = read_seqbegin(&rename_lock); + dentry = __d_lookup_rcu(parent, name, &d_seq); + if (unlikely(dentry)) { +@@ -2577,7 +2579,7 @@ struct dentry *d_alloc_parallel(struct dentry *parent, + } + + hlist_bl_lock(b); +- if (unlikely(READ_ONCE(parent->d_inode->i_dir_seq) != seq)) { ++ if (unlikely(READ_ONCE(parent->d_inode->__i_dir_seq) != seq)) { + hlist_bl_unlock(b); + rcu_read_unlock(); + goto retry; +diff --git a/fs/inode.c b/fs/inode.c +index 5c7139aa2bda..4ee8239c055f 100644 +--- a/fs/inode.c ++++ b/fs/inode.c +@@ -158,7 +158,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) + inode->i_bdev = NULL; + inode->i_cdev = NULL; + inode->i_link = NULL; +- inode->i_dir_seq = 0; ++ inode->__i_dir_seq = 0; + inode->i_rdev = 0; + inode->dirtied_when = 0; + +diff --git a/include/linux/fs.h b/include/linux/fs.h +index 82316863c71f..a29a0b8a1eca 100644 +--- a/include/linux/fs.h ++++ b/include/linux/fs.h +@@ -699,7 +699,7 @@ struct inode { + struct block_device *i_bdev; + struct cdev *i_cdev; + char *i_link; +- unsigned i_dir_seq; ++ unsigned __i_dir_seq; + }; + + __u32 i_generation; +-- +2.43.0 + diff --git a/debian/patches-rt/0186-net-Qdisc-use-a-seqlock-instead-seqcount.patch b/debian/patches-rt/0186-net-Qdisc-use-a-seqlock-instead-seqcount.patch new file mode 100644 index 000000000..a289ccc4f --- /dev/null +++ b/debian/patches-rt/0186-net-Qdisc-use-a-seqlock-instead-seqcount.patch @@ -0,0 +1,299 @@ +From a996bcbd8188409882599175697a464fa6265193 Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Wed, 14 Sep 2016 17:36:35 +0200 +Subject: [PATCH 186/323] net/Qdisc: use a seqlock instead seqcount +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +The seqcount disables preemption on -RT while it is held which can't +remove. Also we don't want the reader to spin for ages if the writer is +scheduled out. The seqlock on the other hand will serialize / sleep on +the lock while writer is active. + +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/net/gen_stats.h | 11 ++++++----- + include/net/net_seq_lock.h | 24 ++++++++++++++++++++++++ + include/net/sch_generic.h | 19 +++++++++++++++++-- + net/core/gen_estimator.c | 6 +++--- + net/core/gen_stats.c | 12 ++++++------ + net/sched/sch_api.c | 2 +- + net/sched/sch_generic.c | 10 ++++++++++ + 7 files changed, 67 insertions(+), 17 deletions(-) + create mode 100644 include/net/net_seq_lock.h + +diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h +index 1424e02cef90..163f8415e5db 100644 +--- a/include/net/gen_stats.h ++++ b/include/net/gen_stats.h +@@ -6,6 +6,7 @@ + #include <linux/socket.h> + #include <linux/rtnetlink.h> + #include <linux/pkt_sched.h> ++#include <net/net_seq_lock.h> + + /* Note: this used to be in include/uapi/linux/gen_stats.h */ + struct gnet_stats_basic_packed { +@@ -42,15 +43,15 @@ int gnet_stats_start_copy_compat(struct sk_buff *skb, int type, + spinlock_t *lock, struct gnet_dump *d, + int padattr); + +-int gnet_stats_copy_basic(const seqcount_t *running, ++int gnet_stats_copy_basic(net_seqlock_t *running, + struct gnet_dump *d, + struct gnet_stats_basic_cpu __percpu *cpu, + struct gnet_stats_basic_packed *b); +-void __gnet_stats_copy_basic(const seqcount_t *running, ++void __gnet_stats_copy_basic(net_seqlock_t *running, + struct gnet_stats_basic_packed *bstats, + struct gnet_stats_basic_cpu __percpu *cpu, + struct gnet_stats_basic_packed *b); +-int gnet_stats_copy_basic_hw(const seqcount_t *running, ++int gnet_stats_copy_basic_hw(net_seqlock_t *running, + struct gnet_dump *d, + struct gnet_stats_basic_cpu __percpu *cpu, + struct gnet_stats_basic_packed *b); +@@ -70,13 +71,13 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats, + struct gnet_stats_basic_cpu __percpu *cpu_bstats, + struct net_rate_estimator __rcu **rate_est, + spinlock_t *lock, +- seqcount_t *running, struct nlattr *opt); ++ net_seqlock_t *running, struct nlattr *opt); + void gen_kill_estimator(struct net_rate_estimator __rcu **ptr); + int gen_replace_estimator(struct gnet_stats_basic_packed *bstats, + struct gnet_stats_basic_cpu __percpu *cpu_bstats, + struct net_rate_estimator __rcu **ptr, + spinlock_t *lock, +- seqcount_t *running, struct nlattr *opt); ++ net_seqlock_t *running, struct nlattr *opt); + bool gen_estimator_active(struct net_rate_estimator __rcu **ptr); + bool gen_estimator_read(struct net_rate_estimator __rcu **ptr, + struct gnet_stats_rate_est64 *sample); +diff --git a/include/net/net_seq_lock.h b/include/net/net_seq_lock.h +new file mode 100644 +index 000000000000..95a497a72e51 +--- /dev/null ++++ b/include/net/net_seq_lock.h +@@ -0,0 +1,24 @@ ++#ifndef __NET_NET_SEQ_LOCK_H__ ++#define __NET_NET_SEQ_LOCK_H__ ++ ++#ifdef CONFIG_PREEMPT_RT ++# define net_seqlock_t seqlock_t ++# define net_seq_begin(__r) read_seqbegin(__r) ++# define net_seq_retry(__r, __s) read_seqretry(__r, __s) ++ ++static inline int try_write_seqlock(seqlock_t *sl) ++{ ++ if (spin_trylock(&sl->lock)) { ++ write_seqcount_begin(&sl->seqcount); ++ return 1; ++ } ++ return 0; ++} ++ ++#else ++# define net_seqlock_t seqcount_t ++# define net_seq_begin(__r) read_seqcount_begin(__r) ++# define net_seq_retry(__r, __s) read_seqcount_retry(__r, __s) ++#endif ++ ++#endif +diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h +index a62677be7452..8ce663a9b4f4 100644 +--- a/include/net/sch_generic.h ++++ b/include/net/sch_generic.h +@@ -10,6 +10,7 @@ + #include <linux/percpu.h> + #include <linux/dynamic_queue_limits.h> + #include <linux/list.h> ++#include <net/net_seq_lock.h> + #include <linux/refcount.h> + #include <linux/workqueue.h> + #include <linux/mutex.h> +@@ -101,7 +102,7 @@ struct Qdisc { + struct sk_buff_head gso_skb ____cacheline_aligned_in_smp; + struct qdisc_skb_head q; + struct gnet_stats_basic_packed bstats; +- seqcount_t running; ++ net_seqlock_t running; + struct gnet_stats_queue qstats; + unsigned long state; + struct Qdisc *next_sched; +@@ -142,7 +143,11 @@ static inline bool qdisc_is_running(struct Qdisc *qdisc) + { + if (qdisc->flags & TCQ_F_NOLOCK) + return spin_is_locked(&qdisc->seqlock); ++#ifdef CONFIG_PREEMPT_RT ++ return spin_is_locked(&qdisc->running.lock) ? true : false; ++#else + return (raw_read_seqcount(&qdisc->running) & 1) ? true : false; ++#endif + } + + static inline bool qdisc_is_percpu_stats(const struct Qdisc *q) +@@ -183,17 +188,27 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc) + } else if (qdisc_is_running(qdisc)) { + return false; + } ++#ifdef CONFIG_PREEMPT_RT ++ if (try_write_seqlock(&qdisc->running)) ++ return true; ++ return false; ++#else + /* Variant of write_seqcount_begin() telling lockdep a trylock + * was attempted. + */ + raw_write_seqcount_begin(&qdisc->running); + seqcount_acquire(&qdisc->running.dep_map, 0, 1, _RET_IP_); + return true; ++#endif + } + + static inline void qdisc_run_end(struct Qdisc *qdisc) + { ++#ifdef CONFIG_PREEMPT_RT ++ write_sequnlock(&qdisc->running); ++#else + write_seqcount_end(&qdisc->running); ++#endif + if (qdisc->flags & TCQ_F_NOLOCK) { + spin_unlock(&qdisc->seqlock); + +@@ -583,7 +598,7 @@ static inline spinlock_t *qdisc_root_sleeping_lock(const struct Qdisc *qdisc) + return qdisc_lock(root); + } + +-static inline seqcount_t *qdisc_root_sleeping_running(const struct Qdisc *qdisc) ++static inline net_seqlock_t *qdisc_root_sleeping_running(const struct Qdisc *qdisc) + { + struct Qdisc *root = qdisc_root_sleeping(qdisc); + +diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c +index 8e582e29a41e..e51f4854d8b2 100644 +--- a/net/core/gen_estimator.c ++++ b/net/core/gen_estimator.c +@@ -42,7 +42,7 @@ + struct net_rate_estimator { + struct gnet_stats_basic_packed *bstats; + spinlock_t *stats_lock; +- seqcount_t *running; ++ net_seqlock_t *running; + struct gnet_stats_basic_cpu __percpu *cpu_bstats; + u8 ewma_log; + u8 intvl_log; /* period : (250ms << intvl_log) */ +@@ -125,7 +125,7 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats, + struct gnet_stats_basic_cpu __percpu *cpu_bstats, + struct net_rate_estimator __rcu **rate_est, + spinlock_t *lock, +- seqcount_t *running, ++ net_seqlock_t *running, + struct nlattr *opt) + { + struct gnet_estimator *parm = nla_data(opt); +@@ -226,7 +226,7 @@ int gen_replace_estimator(struct gnet_stats_basic_packed *bstats, + struct gnet_stats_basic_cpu __percpu *cpu_bstats, + struct net_rate_estimator __rcu **rate_est, + spinlock_t *lock, +- seqcount_t *running, struct nlattr *opt) ++ net_seqlock_t *running, struct nlattr *opt) + { + return gen_new_estimator(bstats, cpu_bstats, rate_est, + lock, running, opt); +diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c +index e491b083b348..ef432cea2e10 100644 +--- a/net/core/gen_stats.c ++++ b/net/core/gen_stats.c +@@ -137,7 +137,7 @@ __gnet_stats_copy_basic_cpu(struct gnet_stats_basic_packed *bstats, + } + + void +-__gnet_stats_copy_basic(const seqcount_t *running, ++__gnet_stats_copy_basic(net_seqlock_t *running, + struct gnet_stats_basic_packed *bstats, + struct gnet_stats_basic_cpu __percpu *cpu, + struct gnet_stats_basic_packed *b) +@@ -150,15 +150,15 @@ __gnet_stats_copy_basic(const seqcount_t *running, + } + do { + if (running) +- seq = read_seqcount_begin(running); ++ seq = net_seq_begin(running); + bstats->bytes = b->bytes; + bstats->packets = b->packets; +- } while (running && read_seqcount_retry(running, seq)); ++ } while (running && net_seq_retry(running, seq)); + } + EXPORT_SYMBOL(__gnet_stats_copy_basic); + + static int +-___gnet_stats_copy_basic(const seqcount_t *running, ++___gnet_stats_copy_basic(net_seqlock_t *running, + struct gnet_dump *d, + struct gnet_stats_basic_cpu __percpu *cpu, + struct gnet_stats_basic_packed *b, +@@ -204,7 +204,7 @@ ___gnet_stats_copy_basic(const seqcount_t *running, + * if the room in the socket buffer was not sufficient. + */ + int +-gnet_stats_copy_basic(const seqcount_t *running, ++gnet_stats_copy_basic(net_seqlock_t *running, + struct gnet_dump *d, + struct gnet_stats_basic_cpu __percpu *cpu, + struct gnet_stats_basic_packed *b) +@@ -228,7 +228,7 @@ EXPORT_SYMBOL(gnet_stats_copy_basic); + * if the room in the socket buffer was not sufficient. + */ + int +-gnet_stats_copy_basic_hw(const seqcount_t *running, ++gnet_stats_copy_basic_hw(net_seqlock_t *running, + struct gnet_dump *d, + struct gnet_stats_basic_cpu __percpu *cpu, + struct gnet_stats_basic_packed *b) +diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c +index 5c2d230790db..a5430619ca5f 100644 +--- a/net/sched/sch_api.c ++++ b/net/sched/sch_api.c +@@ -1275,7 +1275,7 @@ static struct Qdisc *qdisc_create(struct net_device *dev, + rcu_assign_pointer(sch->stab, stab); + } + if (tca[TCA_RATE]) { +- seqcount_t *running; ++ net_seqlock_t *running; + + err = -EOPNOTSUPP; + if (sch->flags & TCQ_F_MQROOT) { +diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c +index ecdd9e83f2f4..73b5aa797645 100644 +--- a/net/sched/sch_generic.c ++++ b/net/sched/sch_generic.c +@@ -578,7 +578,11 @@ struct Qdisc noop_qdisc = { + .ops = &noop_qdisc_ops, + .q.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock), + .dev_queue = &noop_netdev_queue, ++#ifdef CONFIG_PREEMPT_RT ++ .running = __SEQLOCK_UNLOCKED(noop_qdisc.running), ++#else + .running = SEQCNT_ZERO(noop_qdisc.running), ++#endif + .busylock = __SPIN_LOCK_UNLOCKED(noop_qdisc.busylock), + .gso_skb = { + .next = (struct sk_buff *)&noop_qdisc.gso_skb, +@@ -889,9 +893,15 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, + lockdep_set_class(&sch->seqlock, + dev->qdisc_tx_busylock ?: &qdisc_tx_busylock); + ++#ifdef CONFIG_PREEMPT_RT ++ seqlock_init(&sch->running); ++ lockdep_set_class(&sch->running.lock, ++ dev->qdisc_running_key ?: &qdisc_running_key); ++#else + seqcount_init(&sch->running); + lockdep_set_class(&sch->running, + dev->qdisc_running_key ?: &qdisc_running_key); ++#endif + + sch->ops = ops; + sch->flags = ops->static_flags; +-- +2.43.0 + diff --git a/debian/patches-rt/0187-net-Properly-annotate-the-try-lock-for-the-seqlock.patch b/debian/patches-rt/0187-net-Properly-annotate-the-try-lock-for-the-seqlock.patch new file mode 100644 index 000000000..130a8cb78 --- /dev/null +++ b/debian/patches-rt/0187-net-Properly-annotate-the-try-lock-for-the-seqlock.patch @@ -0,0 +1,71 @@ +From 442e014107fc616d529c73b0f0436980b817d021 Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Tue, 8 Sep 2020 16:57:11 +0200 +Subject: [PATCH 187/323] net: Properly annotate the try-lock for the seqlock +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +In patch + ("net/Qdisc: use a seqlock instead seqcount") + +the seqcount has been replaced with a seqlock to allow to reader to +boost the preempted writer. +The try_write_seqlock() acquired the lock with a try-lock but the +seqcount annotation was "lock". + +Opencode write_seqcount_t_begin() and use the try-lock annotation for +lockdep. + +Reported-by: Mike Galbraith <efault@gmx.de> +Cc: stable-rt@vger.kernel.org +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Signed-off-by: Luis Claudio R. Goncalves <lgoncalv@redhat.com> +--- + include/net/net_seq_lock.h | 9 --------- + include/net/sch_generic.h | 10 +++++++++- + 2 files changed, 9 insertions(+), 10 deletions(-) + +diff --git a/include/net/net_seq_lock.h b/include/net/net_seq_lock.h +index 95a497a72e51..67710bace741 100644 +--- a/include/net/net_seq_lock.h ++++ b/include/net/net_seq_lock.h +@@ -6,15 +6,6 @@ + # define net_seq_begin(__r) read_seqbegin(__r) + # define net_seq_retry(__r, __s) read_seqretry(__r, __s) + +-static inline int try_write_seqlock(seqlock_t *sl) +-{ +- if (spin_trylock(&sl->lock)) { +- write_seqcount_begin(&sl->seqcount); +- return 1; +- } +- return 0; +-} +- + #else + # define net_seqlock_t seqcount_t + # define net_seq_begin(__r) read_seqcount_begin(__r) +diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h +index 8ce663a9b4f4..eee11a1c9321 100644 +--- a/include/net/sch_generic.h ++++ b/include/net/sch_generic.h +@@ -189,8 +189,16 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc) + return false; + } + #ifdef CONFIG_PREEMPT_RT +- if (try_write_seqlock(&qdisc->running)) ++ if (spin_trylock(&qdisc->running.lock)) { ++ seqcount_t *s = &qdisc->running.seqcount.seqcount; ++ /* ++ * Variant of write_seqcount_t_begin() telling lockdep that a ++ * trylock was attempted. ++ */ ++ do_raw_write_seqcount_begin(s); ++ seqcount_acquire(&s->dep_map, 0, 1, _RET_IP_); + return true; ++ } + return false; + #else + /* Variant of write_seqcount_begin() telling lockdep a trylock +-- +2.43.0 + diff --git a/debian/patches-rt/0188-kconfig-Disable-config-options-which-are-not-RT-comp.patch b/debian/patches-rt/0188-kconfig-Disable-config-options-which-are-not-RT-comp.patch new file mode 100644 index 000000000..27d06fa44 --- /dev/null +++ b/debian/patches-rt/0188-kconfig-Disable-config-options-which-are-not-RT-comp.patch @@ -0,0 +1,43 @@ +From f4fb1a364b1dc97c110e6103ffe75b21e46a0e63 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Sun, 24 Jul 2011 12:11:43 +0200 +Subject: [PATCH 188/323] kconfig: Disable config options which are not RT + compatible +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Disable stuff which is known to have issues on RT + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +--- + arch/Kconfig | 1 + + mm/Kconfig | 2 +- + 2 files changed, 2 insertions(+), 1 deletion(-) + +diff --git a/arch/Kconfig b/arch/Kconfig +index 628e1e7fe302..8bbbca8839e1 100644 +--- a/arch/Kconfig ++++ b/arch/Kconfig +@@ -37,6 +37,7 @@ config OPROFILE + tristate "OProfile system profiling" + depends on PROFILING + depends on HAVE_OPROFILE ++ depends on !PREEMPT_RT + select RING_BUFFER + select RING_BUFFER_ALLOW_SWAP + help +diff --git a/mm/Kconfig b/mm/Kconfig +index 8c49d09da214..c8cbcb5118b0 100644 +--- a/mm/Kconfig ++++ b/mm/Kconfig +@@ -387,7 +387,7 @@ config NOMMU_INITIAL_TRIM_EXCESS + + config TRANSPARENT_HUGEPAGE + bool "Transparent Hugepage Support" +- depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE ++ depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE && !PREEMPT_RT + select COMPACTION + select XARRAY_MULTI + help +-- +2.43.0 + diff --git a/debian/patches-rt/0189-mm-Allow-only-SLUB-on-RT.patch b/debian/patches-rt/0189-mm-Allow-only-SLUB-on-RT.patch new file mode 100644 index 000000000..9fc3d7351 --- /dev/null +++ b/debian/patches-rt/0189-mm-Allow-only-SLUB-on-RT.patch @@ -0,0 +1,47 @@ +From 1da220d86ade67051c7c89743d6d037902c3839d Mon Sep 17 00:00:00 2001 +From: Ingo Molnar <mingo@elte.hu> +Date: Fri, 3 Jul 2009 08:44:03 -0500 +Subject: [PATCH 189/323] mm: Allow only SLUB on RT +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Memory allocation disables interrupts as part of the allocation and freeing +process. For -RT it is important that this section remain short and don't +depend on the size of the request or an internal state of the memory allocator. +At the beginning the SLAB memory allocator was adopted for RT's needs and it +required substantial changes. Later, with the addition of the SLUB memory +allocator we adopted this one as well and the changes were smaller. More +important, due to the design of the SLUB allocator it performs better and its +worst case latency was smaller. In the end only SLUB remained supported. + +Disable SLAB and SLOB on -RT. Only SLUB is adopted to -RT needs. + +Signed-off-by: Ingo Molnar <mingo@elte.hu> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + init/Kconfig | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/init/Kconfig b/init/Kconfig +index 9807c66b24bb..df8d0d0f800f 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -1922,6 +1922,7 @@ choice + + config SLAB + bool "SLAB" ++ depends on !PREEMPT_RT + select HAVE_HARDENED_USERCOPY_ALLOCATOR + help + The regular slab allocator that is established and known to work +@@ -1942,6 +1943,7 @@ config SLUB + config SLOB + depends on EXPERT + bool "SLOB (Simple Allocator)" ++ depends on !PREEMPT_RT + help + SLOB replaces the stock allocator with a drastically simpler + allocator. SLOB is generally more space efficient but +-- +2.43.0 + diff --git a/debian/patches-rt/0190-sched-Disable-CONFIG_RT_GROUP_SCHED-on-RT.patch b/debian/patches-rt/0190-sched-Disable-CONFIG_RT_GROUP_SCHED-on-RT.patch new file mode 100644 index 000000000..eb62300e8 --- /dev/null +++ b/debian/patches-rt/0190-sched-Disable-CONFIG_RT_GROUP_SCHED-on-RT.patch @@ -0,0 +1,35 @@ +From 4e23e134063807a9288e9b547e1955a9124ac5b7 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Mon, 18 Jul 2011 17:03:52 +0200 +Subject: [PATCH 190/323] sched: Disable CONFIG_RT_GROUP_SCHED on RT +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Carsten reported problems when running: + + taskset 01 chrt -f 1 sleep 1 + +from within rc.local on a F15 machine. The task stays running and +never gets on the run queue because some of the run queues have +rt_throttled=1 which does not go away. Works nice from a ssh login +shell. Disabling CONFIG_RT_GROUP_SCHED solves that as well. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +--- + init/Kconfig | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/init/Kconfig b/init/Kconfig +index df8d0d0f800f..7e9578a0176f 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -985,6 +985,7 @@ config CFS_BANDWIDTH + config RT_GROUP_SCHED + bool "Group scheduling for SCHED_RR/FIFO" + depends on CGROUP_SCHED ++ depends on !PREEMPT_RT + default n + help + This feature lets you explicitly allocate real CPU bandwidth +-- +2.43.0 + diff --git a/debian/patches-rt/0191-net-core-disable-NET_RX_BUSY_POLL-on-RT.patch b/debian/patches-rt/0191-net-core-disable-NET_RX_BUSY_POLL-on-RT.patch new file mode 100644 index 000000000..aef82ca64 --- /dev/null +++ b/debian/patches-rt/0191-net-core-disable-NET_RX_BUSY_POLL-on-RT.patch @@ -0,0 +1,44 @@ +From 311a1cf04650b065502758f992df44702d23f6cd Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Sat, 27 May 2017 19:02:06 +0200 +Subject: [PATCH 191/323] net/core: disable NET_RX_BUSY_POLL on RT +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +napi_busy_loop() disables preemption and performs a NAPI poll. We can't acquire +sleeping locks with disabled preemption so we would have to work around this +and add explicit locking for synchronisation against ksoftirqd. +Without explicit synchronisation a low priority process would "own" the NAPI +state (by setting NAPIF_STATE_SCHED) and could be scheduled out (no +preempt_disable() and BH is preemptible on RT). +In case a network packages arrives then the interrupt handler would set +NAPIF_STATE_MISSED and the system would wait until the task owning the NAPI +would be scheduled in again. +Should a task with RT priority busy poll then it would consume the CPU instead +allowing tasks with lower priority to run. + +The NET_RX_BUSY_POLL is disabled by default (the system wide sysctls for +poll/read are set to zero) so disable NET_RX_BUSY_POLL on RT to avoid wrong +locking context on RT. Should this feature be considered useful on RT systems +then it could be enabled again with proper locking and synchronisation. + +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + net/Kconfig | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/net/Kconfig b/net/Kconfig +index a22c3fb88564..5a17bded7136 100644 +--- a/net/Kconfig ++++ b/net/Kconfig +@@ -280,7 +280,7 @@ config CGROUP_NET_CLASSID + + config NET_RX_BUSY_POLL + bool +- default y ++ default y if !PREEMPT_RT + + config BQL + bool +-- +2.43.0 + diff --git a/debian/patches-rt/0192-efi-Disable-runtime-services-on-RT.patch b/debian/patches-rt/0192-efi-Disable-runtime-services-on-RT.patch new file mode 100644 index 000000000..ed06b5934 --- /dev/null +++ b/debian/patches-rt/0192-efi-Disable-runtime-services-on-RT.patch @@ -0,0 +1,46 @@ +From 6c6afc933fea370e5c90a34ed6b622bcc274af5a Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Thu, 26 Jul 2018 15:03:16 +0200 +Subject: [PATCH 192/323] efi: Disable runtime services on RT +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Based on meassurements the EFI functions get_variable / +get_next_variable take up to 2us which looks okay. +The functions get_time, set_time take around 10ms. Those 10ms are too +much. Even one ms would be too much. +Ard mentioned that SetVariable might even trigger larger latencies if +the firware will erase flash blocks on NOR. + +The time-functions are used by efi-rtc and can be triggered during +runtimed (either via explicit read/write or ntp sync). + +The variable write could be used by pstore. +These functions can be disabled without much of a loss. The poweroff / +reboot hooks may be provided by PSCI. + +Disable EFI's runtime wrappers. + +This was observed on "EFI v2.60 by SoftIron Overdrive 1000". + +Acked-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + drivers/firmware/efi/efi.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c +index 332739f3eded..8d390091203f 100644 +--- a/drivers/firmware/efi/efi.c ++++ b/drivers/firmware/efi/efi.c +@@ -66,7 +66,7 @@ struct mm_struct efi_mm = { + + struct workqueue_struct *efi_rts_wq; + +-static bool disable_runtime; ++static bool disable_runtime = IS_ENABLED(CONFIG_PREEMPT_RT); + static int __init setup_noefi(char *arg) + { + disable_runtime = true; +-- +2.43.0 + diff --git a/debian/patches-rt/0193-efi-Allow-efi-runtime.patch b/debian/patches-rt/0193-efi-Allow-efi-runtime.patch new file mode 100644 index 000000000..f52a8c240 --- /dev/null +++ b/debian/patches-rt/0193-efi-Allow-efi-runtime.patch @@ -0,0 +1,32 @@ +From 712d19894b0524bdc524334f6d76babef13db194 Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Thu, 26 Jul 2018 15:06:10 +0200 +Subject: [PATCH 193/323] efi: Allow efi=runtime +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +In case the command line option "efi=noruntime" is default at built-time, the user +could overwrite its state by `efi=runtime' and allow it again. + +Acked-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + drivers/firmware/efi/efi.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c +index 8d390091203f..8589df0e8c1f 100644 +--- a/drivers/firmware/efi/efi.c ++++ b/drivers/firmware/efi/efi.c +@@ -97,6 +97,9 @@ static int __init parse_efi_cmdline(char *str) + if (parse_option_str(str, "noruntime")) + disable_runtime = true; + ++ if (parse_option_str(str, "runtime")) ++ disable_runtime = false; ++ + if (parse_option_str(str, "nosoftreserve")) + set_bit(EFI_MEM_NO_SOFT_RESERVE, &efi.flags); + +-- +2.43.0 + diff --git a/debian/patches-rt/0194-rt-Add-local-irq-locks.patch b/debian/patches-rt/0194-rt-Add-local-irq-locks.patch new file mode 100644 index 000000000..53b3fd552 --- /dev/null +++ b/debian/patches-rt/0194-rt-Add-local-irq-locks.patch @@ -0,0 +1,182 @@ +From 0d3e0b0db513aac6e4cd2e2d084b7a76f4d28c44 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Mon, 20 Jun 2011 09:03:47 +0200 +Subject: [PATCH 194/323] rt: Add local irq locks +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Introduce locallock. For !RT this maps to preempt_disable()/ +local_irq_disable() so there is not much that changes. For RT this will +map to a spinlock. This makes preemption possible and locked "ressource" +gets the lockdep anotation it wouldn't have otherwise. The locks are +recursive for owner == current. Also, all locks user migrate_disable() +which ensures that the task is not migrated to another CPU while the lock +is held and the owner is preempted. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +--- + include/linux/local_lock_internal.h | 111 ++++++++++++++++++++++++++-- + 1 file changed, 103 insertions(+), 8 deletions(-) + +diff --git a/include/linux/local_lock_internal.h b/include/linux/local_lock_internal.h +index 3f02b818625e..1b8ae034946f 100644 +--- a/include/linux/local_lock_internal.h ++++ b/include/linux/local_lock_internal.h +@@ -7,13 +7,39 @@ + #include <linux/lockdep.h> + + typedef struct { +-#ifdef CONFIG_DEBUG_LOCK_ALLOC ++#ifdef CONFIG_PREEMPT_RT ++ spinlock_t lock; ++ struct task_struct *owner; ++ int nestcnt; ++ ++#elif defined(CONFIG_DEBUG_LOCK_ALLOC) + struct lockdep_map dep_map; + struct task_struct *owner; + #endif + } local_lock_t; + +-#ifdef CONFIG_DEBUG_LOCK_ALLOC ++#ifdef CONFIG_PREEMPT_RT ++ ++#define INIT_LOCAL_LOCK(lockname) { \ ++ __SPIN_LOCK_UNLOCKED((lockname).lock), \ ++ .owner = NULL, \ ++ .nestcnt = 0, \ ++ } ++ ++static inline void ___local_lock_init(local_lock_t *l) ++{ ++ l->owner = NULL; ++ l->nestcnt = 0; ++} ++ ++#define __local_lock_init(l) \ ++do { \ ++ spin_lock_init(&(l)->lock); \ ++ ___local_lock_init(l); \ ++} while (0) ++ ++#elif defined(CONFIG_DEBUG_LOCK_ALLOC) ++ + # define LOCAL_LOCK_DEBUG_INIT(lockname) \ + .dep_map = { \ + .name = #lockname, \ +@@ -21,7 +47,33 @@ typedef struct { + .lock_type = LD_LOCK_PERCPU, \ + }, \ + .owner = NULL, ++#endif ++ ++#ifdef CONFIG_PREEMPT_RT + ++static inline void local_lock_acquire(local_lock_t *l) ++{ ++ if (l->owner != current) { ++ spin_lock(&l->lock); ++ DEBUG_LOCKS_WARN_ON(l->owner); ++ DEBUG_LOCKS_WARN_ON(l->nestcnt); ++ l->owner = current; ++ } ++ l->nestcnt++; ++} ++ ++static inline void local_lock_release(local_lock_t *l) ++{ ++ DEBUG_LOCKS_WARN_ON(l->nestcnt == 0); ++ DEBUG_LOCKS_WARN_ON(l->owner != current); ++ if (--l->nestcnt) ++ return; ++ ++ l->owner = NULL; ++ spin_unlock(&l->lock); ++} ++ ++#elif defined(CONFIG_DEBUG_LOCK_ALLOC) + static inline void local_lock_acquire(local_lock_t *l) + { + lock_map_acquire(&l->dep_map); +@@ -47,6 +99,47 @@ static inline void local_lock_release(local_lock_t *l) { } + static inline void local_lock_debug_init(local_lock_t *l) { } + #endif /* !CONFIG_DEBUG_LOCK_ALLOC */ + ++#ifdef CONFIG_PREEMPT_RT ++ ++#define __local_lock(lock) \ ++ do { \ ++ migrate_disable(); \ ++ local_lock_acquire(this_cpu_ptr(lock)); \ ++ } while (0) ++ ++#define __local_unlock(lock) \ ++ do { \ ++ local_lock_release(this_cpu_ptr(lock)); \ ++ migrate_enable(); \ ++ } while (0) ++ ++#define __local_lock_irq(lock) \ ++ do { \ ++ migrate_disable(); \ ++ local_lock_acquire(this_cpu_ptr(lock)); \ ++ } while (0) ++ ++#define __local_lock_irqsave(lock, flags) \ ++ do { \ ++ migrate_disable(); \ ++ flags = 0; \ ++ local_lock_acquire(this_cpu_ptr(lock)); \ ++ } while (0) ++ ++#define __local_unlock_irq(lock) \ ++ do { \ ++ local_lock_release(this_cpu_ptr(lock)); \ ++ migrate_enable(); \ ++ } while (0) ++ ++#define __local_unlock_irqrestore(lock, flags) \ ++ do { \ ++ local_lock_release(this_cpu_ptr(lock)); \ ++ migrate_enable(); \ ++ } while (0) ++ ++#else ++ + #define INIT_LOCAL_LOCK(lockname) { LOCAL_LOCK_DEBUG_INIT(lockname) } + + #define __local_lock_init(lock) \ +@@ -66,6 +159,12 @@ do { \ + local_lock_acquire(this_cpu_ptr(lock)); \ + } while (0) + ++#define __local_unlock(lock) \ ++ do { \ ++ local_lock_release(this_cpu_ptr(lock)); \ ++ preempt_enable(); \ ++ } while (0) ++ + #define __local_lock_irq(lock) \ + do { \ + local_irq_disable(); \ +@@ -78,12 +177,6 @@ do { \ + local_lock_acquire(this_cpu_ptr(lock)); \ + } while (0) + +-#define __local_unlock(lock) \ +- do { \ +- local_lock_release(this_cpu_ptr(lock)); \ +- preempt_enable(); \ +- } while (0) +- + #define __local_unlock_irq(lock) \ + do { \ + local_lock_release(this_cpu_ptr(lock)); \ +@@ -95,3 +188,5 @@ do { \ + local_lock_release(this_cpu_ptr(lock)); \ + local_irq_restore(flags); \ + } while (0) ++ ++#endif +-- +2.43.0 + diff --git a/debian/patches-rt/0195-signal-x86-Delay-calling-signals-in-atomic.patch b/debian/patches-rt/0195-signal-x86-Delay-calling-signals-in-atomic.patch new file mode 100644 index 000000000..af5ab46e8 --- /dev/null +++ b/debian/patches-rt/0195-signal-x86-Delay-calling-signals-in-atomic.patch @@ -0,0 +1,140 @@ +From f4ff44284474fd78b52414326c579dcd8196fc47 Mon Sep 17 00:00:00 2001 +From: Oleg Nesterov <oleg@redhat.com> +Date: Tue, 14 Jul 2015 14:26:34 +0200 +Subject: [PATCH 195/323] signal/x86: Delay calling signals in atomic +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +On x86_64 we must disable preemption before we enable interrupts +for stack faults, int3 and debugging, because the current task is using +a per CPU debug stack defined by the IST. If we schedule out, another task +can come in and use the same stack and cause the stack to be corrupted +and crash the kernel on return. + +When CONFIG_PREEMPT_RT is enabled, spin_locks become mutexes, and +one of these is the spin lock used in signal handling. + +Some of the debug code (int3) causes do_trap() to send a signal. +This function calls a spin lock that has been converted to a mutex +and has the possibility to sleep. If this happens, the above issues with +the corrupted stack is possible. + +Instead of calling the signal right away, for PREEMPT_RT and x86_64, +the signal information is stored on the stacks task_struct and +TIF_NOTIFY_RESUME is set. Then on exit of the trap, the signal resume +code will send the signal when preemption is enabled. + +[ rostedt: Switched from #ifdef CONFIG_PREEMPT_RT to + ARCH_RT_DELAYS_SIGNAL_SEND and added comments to the code. ] + +Signed-off-by: Oleg Nesterov <oleg@redhat.com> +Signed-off-by: Steven Rostedt <rostedt@goodmis.org> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +[bigeasy: also needed on 32bit as per Yang Shi <yang.shi@linaro.org>] +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + arch/x86/include/asm/signal.h | 13 +++++++++++++ + include/linux/sched.h | 4 ++++ + kernel/entry/common.c | 8 ++++++++ + kernel/signal.c | 28 ++++++++++++++++++++++++++++ + 4 files changed, 53 insertions(+) + +diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h +index 6fd8410a3910..f3bf2f515edb 100644 +--- a/arch/x86/include/asm/signal.h ++++ b/arch/x86/include/asm/signal.h +@@ -28,6 +28,19 @@ typedef struct { + #define SA_IA32_ABI 0x02000000u + #define SA_X32_ABI 0x01000000u + ++/* ++ * Because some traps use the IST stack, we must keep preemption ++ * disabled while calling do_trap(), but do_trap() may call ++ * force_sig_info() which will grab the signal spin_locks for the ++ * task, which in PREEMPT_RT are mutexes. By defining ++ * ARCH_RT_DELAYS_SIGNAL_SEND the force_sig_info() will set ++ * TIF_NOTIFY_RESUME and set up the signal to be sent on exit of the ++ * trap. ++ */ ++#if defined(CONFIG_PREEMPT_RT) ++#define ARCH_RT_DELAYS_SIGNAL_SEND ++#endif ++ + #ifndef CONFIG_COMPAT + typedef sigset_t compat_sigset_t; + #endif +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 3650fdaac4ca..d259126f46cf 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -1001,6 +1001,10 @@ struct task_struct { + /* Restored if set_restore_sigmask() was used: */ + sigset_t saved_sigmask; + struct sigpending pending; ++#ifdef CONFIG_PREEMPT_RT ++ /* TODO: move me into ->restart_block ? */ ++ struct kernel_siginfo forced_info; ++#endif + unsigned long sas_ss_sp; + size_t sas_ss_size; + unsigned int sas_ss_flags; +diff --git a/kernel/entry/common.c b/kernel/entry/common.c +index e6a66de1202a..e6da86039ccf 100644 +--- a/kernel/entry/common.c ++++ b/kernel/entry/common.c +@@ -160,6 +160,14 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, + if (ti_work & _TIF_NEED_RESCHED) + schedule(); + ++#ifdef ARCH_RT_DELAYS_SIGNAL_SEND ++ if (unlikely(current->forced_info.si_signo)) { ++ struct task_struct *t = current; ++ force_sig_info(&t->forced_info); ++ t->forced_info.si_signo = 0; ++ } ++#endif ++ + if (ti_work & _TIF_UPROBE) + uprobe_notify_resume(regs); + +diff --git a/kernel/signal.c b/kernel/signal.c +index 5ab628e2afc5..e8819aabe3cd 100644 +--- a/kernel/signal.c ++++ b/kernel/signal.c +@@ -1314,6 +1314,34 @@ force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t) + struct k_sigaction *action; + int sig = info->si_signo; + ++ /* ++ * On some archs, PREEMPT_RT has to delay sending a signal from a trap ++ * since it can not enable preemption, and the signal code's spin_locks ++ * turn into mutexes. Instead, it must set TIF_NOTIFY_RESUME which will ++ * send the signal on exit of the trap. ++ */ ++#ifdef ARCH_RT_DELAYS_SIGNAL_SEND ++ if (in_atomic()) { ++ struct task_struct *t = current; ++ ++ if (WARN_ON_ONCE(t->forced_info.si_signo)) ++ return 0; ++ ++ if (is_si_special(info)) { ++ WARN_ON_ONCE(info != SEND_SIG_PRIV); ++ t->forced_info.si_signo = info->si_signo; ++ t->forced_info.si_errno = 0; ++ t->forced_info.si_code = SI_KERNEL; ++ t->forced_info.si_pid = 0; ++ t->forced_info.si_uid = 0; ++ } else { ++ t->forced_info = *info; ++ } ++ ++ set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); ++ return 0; ++ } ++#endif + spin_lock_irqsave(&t->sighand->siglock, flags); + action = &t->sighand->action[sig-1]; + ignored = action->sa.sa_handler == SIG_IGN; +-- +2.43.0 + diff --git a/debian/patches-rt/0196-Split-IRQ-off-and-zone-lock-while-freeing-pages-from.patch b/debian/patches-rt/0196-Split-IRQ-off-and-zone-lock-while-freeing-pages-from.patch new file mode 100644 index 000000000..999298a0e --- /dev/null +++ b/debian/patches-rt/0196-Split-IRQ-off-and-zone-lock-while-freeing-pages-from.patch @@ -0,0 +1,172 @@ +From 1e362e9e053f608c55deb05577371d61d5db1a92 Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra <peterz@infradead.org> +Date: Mon, 28 May 2018 15:24:20 +0200 +Subject: [PATCH 196/323] Split IRQ-off and zone->lock while freeing pages from + PCP list #1 +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Split the IRQ-off section while accessing the PCP list from zone->lock +while freeing pages. +Introcude isolate_pcp_pages() which separates the pages from the PCP +list onto a temporary list and then free the temporary list via +free_pcppages_bulk(). + +Signed-off-by: Peter Zijlstra <peterz@infradead.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + mm/page_alloc.c | 81 +++++++++++++++++++++++++++++++------------------ + 1 file changed, 51 insertions(+), 30 deletions(-) + +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index 124ab9324610..0dc0eb767fb6 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -1331,7 +1331,7 @@ static inline void prefetch_buddy(struct page *page) + } + + /* +- * Frees a number of pages from the PCP lists ++ * Frees a number of pages which have been collected from the pcp lists. + * Assumes all pages on list are in same zone, and of same order. + * count is the number of pages to free. + * +@@ -1342,14 +1342,40 @@ static inline void prefetch_buddy(struct page *page) + * pinned" detection logic. + */ + static void free_pcppages_bulk(struct zone *zone, int count, +- struct per_cpu_pages *pcp) ++ struct list_head *head) ++{ ++ bool isolated_pageblocks; ++ struct page *page, *tmp; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&zone->lock, flags); ++ isolated_pageblocks = has_isolate_pageblock(zone); ++ ++ /* ++ * Use safe version since after __free_one_page(), ++ * page->lru.next will not point to original list. ++ */ ++ list_for_each_entry_safe(page, tmp, head, lru) { ++ int mt = get_pcppage_migratetype(page); ++ /* MIGRATE_ISOLATE page should not go to pcplists */ ++ VM_BUG_ON_PAGE(is_migrate_isolate(mt), page); ++ /* Pageblock could have been isolated meanwhile */ ++ if (unlikely(isolated_pageblocks)) ++ mt = get_pageblock_migratetype(page); ++ ++ __free_one_page(page, page_to_pfn(page), zone, 0, mt, FPI_NONE); ++ trace_mm_page_pcpu_drain(page, 0, mt); ++ } ++ spin_unlock_irqrestore(&zone->lock, flags); ++} ++ ++static void isolate_pcp_pages(int count, struct per_cpu_pages *pcp, ++ struct list_head *dst) + { + int migratetype = 0; + int batch_free = 0; + int prefetch_nr = 0; +- bool isolated_pageblocks; +- struct page *page, *tmp; +- LIST_HEAD(head); ++ struct page *page; + + /* + * Ensure proper count is passed which otherwise would stuck in the +@@ -1386,7 +1412,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, + if (bulkfree_pcp_prepare(page)) + continue; + +- list_add_tail(&page->lru, &head); ++ list_add_tail(&page->lru, dst); + + /* + * We are going to put the page back to the global +@@ -1401,26 +1427,6 @@ static void free_pcppages_bulk(struct zone *zone, int count, + prefetch_buddy(page); + } while (--count && --batch_free && !list_empty(list)); + } +- +- spin_lock(&zone->lock); +- isolated_pageblocks = has_isolate_pageblock(zone); +- +- /* +- * Use safe version since after __free_one_page(), +- * page->lru.next will not point to original list. +- */ +- list_for_each_entry_safe(page, tmp, &head, lru) { +- int mt = get_pcppage_migratetype(page); +- /* MIGRATE_ISOLATE page should not go to pcplists */ +- VM_BUG_ON_PAGE(is_migrate_isolate(mt), page); +- /* Pageblock could have been isolated meanwhile */ +- if (unlikely(isolated_pageblocks)) +- mt = get_pageblock_migratetype(page); +- +- __free_one_page(page, page_to_pfn(page), zone, 0, mt, FPI_NONE); +- trace_mm_page_pcpu_drain(page, 0, mt); +- } +- spin_unlock(&zone->lock); + } + + static void free_one_page(struct zone *zone, +@@ -2938,13 +2944,18 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) + { + unsigned long flags; + int to_drain, batch; ++ LIST_HEAD(dst); + + local_irq_save(flags); + batch = READ_ONCE(pcp->batch); + to_drain = min(pcp->count, batch); + if (to_drain > 0) +- free_pcppages_bulk(zone, to_drain, pcp); ++ isolate_pcp_pages(to_drain, pcp, &dst); ++ + local_irq_restore(flags); ++ ++ if (to_drain > 0) ++ free_pcppages_bulk(zone, to_drain, &dst); + } + #endif + +@@ -2960,14 +2971,21 @@ static void drain_pages_zone(unsigned int cpu, struct zone *zone) + unsigned long flags; + struct per_cpu_pageset *pset; + struct per_cpu_pages *pcp; ++ LIST_HEAD(dst); ++ int count; + + local_irq_save(flags); + pset = per_cpu_ptr(zone->pageset, cpu); + + pcp = &pset->pcp; +- if (pcp->count) +- free_pcppages_bulk(zone, pcp->count, pcp); ++ count = pcp->count; ++ if (count) ++ isolate_pcp_pages(count, pcp, &dst); ++ + local_irq_restore(flags); ++ ++ if (count) ++ free_pcppages_bulk(zone, count, &dst); + } + + /* +@@ -3196,7 +3214,10 @@ static void free_unref_page_commit(struct page *page, unsigned long pfn) + pcp->count++; + if (pcp->count >= pcp->high) { + unsigned long batch = READ_ONCE(pcp->batch); +- free_pcppages_bulk(zone, batch, pcp); ++ LIST_HEAD(dst); ++ ++ isolate_pcp_pages(batch, pcp, &dst); ++ free_pcppages_bulk(zone, batch, &dst); + } + } + +-- +2.43.0 + diff --git a/debian/patches-rt/0197-Split-IRQ-off-and-zone-lock-while-freeing-pages-from.patch b/debian/patches-rt/0197-Split-IRQ-off-and-zone-lock-while-freeing-pages-from.patch new file mode 100644 index 000000000..60c805684 --- /dev/null +++ b/debian/patches-rt/0197-Split-IRQ-off-and-zone-lock-while-freeing-pages-from.patch @@ -0,0 +1,172 @@ +From 3bb0aee1e418d977b819e6d0632d71cbb7cd1138 Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra <peterz@infradead.org> +Date: Mon, 28 May 2018 15:24:21 +0200 +Subject: [PATCH 197/323] Split IRQ-off and zone->lock while freeing pages from + PCP list #2 +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Split the IRQ-off section while accessing the PCP list from zone->lock +while freeing pages. +Introcude isolate_pcp_pages() which separates the pages from the PCP +list onto a temporary list and then free the temporary list via +free_pcppages_bulk(). + +Signed-off-by: Peter Zijlstra <peterz@infradead.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + mm/page_alloc.c | 60 ++++++++++++++++++++++++++++++++++++++++--------- + 1 file changed, 50 insertions(+), 10 deletions(-) + +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index 0dc0eb767fb6..a3f1e4c6bdc8 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -1341,8 +1341,8 @@ static inline void prefetch_buddy(struct page *page) + * And clear the zone's pages_scanned counter, to hold off the "all pages are + * pinned" detection logic. + */ +-static void free_pcppages_bulk(struct zone *zone, int count, +- struct list_head *head) ++static void free_pcppages_bulk(struct zone *zone, struct list_head *head, ++ bool zone_retry) + { + bool isolated_pageblocks; + struct page *page, *tmp; +@@ -1357,12 +1357,27 @@ static void free_pcppages_bulk(struct zone *zone, int count, + */ + list_for_each_entry_safe(page, tmp, head, lru) { + int mt = get_pcppage_migratetype(page); ++ ++ if (page_zone(page) != zone) { ++ /* ++ * free_unref_page_list() sorts pages by zone. If we end ++ * up with pages from a different NUMA nodes belonging ++ * to the same ZONE index then we need to redo with the ++ * correct ZONE pointer. Skip the page for now, redo it ++ * on the next iteration. ++ */ ++ WARN_ON_ONCE(zone_retry == false); ++ if (zone_retry) ++ continue; ++ } ++ + /* MIGRATE_ISOLATE page should not go to pcplists */ + VM_BUG_ON_PAGE(is_migrate_isolate(mt), page); + /* Pageblock could have been isolated meanwhile */ + if (unlikely(isolated_pageblocks)) + mt = get_pageblock_migratetype(page); + ++ list_del(&page->lru); + __free_one_page(page, page_to_pfn(page), zone, 0, mt, FPI_NONE); + trace_mm_page_pcpu_drain(page, 0, mt); + } +@@ -2955,7 +2970,7 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) + local_irq_restore(flags); + + if (to_drain > 0) +- free_pcppages_bulk(zone, to_drain, &dst); ++ free_pcppages_bulk(zone, &dst, false); + } + #endif + +@@ -2985,7 +3000,7 @@ static void drain_pages_zone(unsigned int cpu, struct zone *zone) + local_irq_restore(flags); + + if (count) +- free_pcppages_bulk(zone, count, &dst); ++ free_pcppages_bulk(zone, &dst, false); + } + + /* +@@ -3184,7 +3199,8 @@ static bool free_unref_page_prepare(struct page *page, unsigned long pfn) + return true; + } + +-static void free_unref_page_commit(struct page *page, unsigned long pfn) ++static void free_unref_page_commit(struct page *page, unsigned long pfn, ++ struct list_head *dst) + { + struct zone *zone = page_zone(page); + struct per_cpu_pages *pcp; +@@ -3214,10 +3230,8 @@ static void free_unref_page_commit(struct page *page, unsigned long pfn) + pcp->count++; + if (pcp->count >= pcp->high) { + unsigned long batch = READ_ONCE(pcp->batch); +- LIST_HEAD(dst); + +- isolate_pcp_pages(batch, pcp, &dst); +- free_pcppages_bulk(zone, batch, &dst); ++ isolate_pcp_pages(batch, pcp, dst); + } + } + +@@ -3228,13 +3242,17 @@ void free_unref_page(struct page *page) + { + unsigned long flags; + unsigned long pfn = page_to_pfn(page); ++ struct zone *zone = page_zone(page); ++ LIST_HEAD(dst); + + if (!free_unref_page_prepare(page, pfn)) + return; + + local_irq_save(flags); +- free_unref_page_commit(page, pfn); ++ free_unref_page_commit(page, pfn, &dst); + local_irq_restore(flags); ++ if (!list_empty(&dst)) ++ free_pcppages_bulk(zone, &dst, false); + } + + /* +@@ -3245,6 +3263,11 @@ void free_unref_page_list(struct list_head *list) + struct page *page, *next; + unsigned long flags, pfn; + int batch_count = 0; ++ struct list_head dsts[__MAX_NR_ZONES]; ++ int i; ++ ++ for (i = 0; i < __MAX_NR_ZONES; i++) ++ INIT_LIST_HEAD(&dsts[i]); + + /* Prepare pages for freeing */ + list_for_each_entry_safe(page, next, list, lru) { +@@ -3257,10 +3280,12 @@ void free_unref_page_list(struct list_head *list) + local_irq_save(flags); + list_for_each_entry_safe(page, next, list, lru) { + unsigned long pfn = page_private(page); ++ enum zone_type type; + + set_page_private(page, 0); + trace_mm_page_free_batched(page); +- free_unref_page_commit(page, pfn); ++ type = page_zonenum(page); ++ free_unref_page_commit(page, pfn, &dsts[type]); + + /* + * Guard against excessive IRQ disabled times when we get +@@ -3273,6 +3298,21 @@ void free_unref_page_list(struct list_head *list) + } + } + local_irq_restore(flags); ++ ++ for (i = 0; i < __MAX_NR_ZONES; ) { ++ struct page *page; ++ struct zone *zone; ++ ++ if (list_empty(&dsts[i])) { ++ i++; ++ continue; ++ } ++ ++ page = list_first_entry(&dsts[i], struct page, lru); ++ zone = page_zone(page); ++ ++ free_pcppages_bulk(zone, &dsts[i], true); ++ } + } + + /* +-- +2.43.0 + diff --git a/debian/patches-rt/0198-mm-SLxB-change-list_lock-to-raw_spinlock_t.patch b/debian/patches-rt/0198-mm-SLxB-change-list_lock-to-raw_spinlock_t.patch new file mode 100644 index 000000000..903ce7fd5 --- /dev/null +++ b/debian/patches-rt/0198-mm-SLxB-change-list_lock-to-raw_spinlock_t.patch @@ -0,0 +1,603 @@ +From 24eecb5c4b97d1a359a8b16be1dae2a499720795 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Mon, 28 May 2018 15:24:22 +0200 +Subject: [PATCH 198/323] mm/SLxB: change list_lock to raw_spinlock_t +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +The list_lock is used with used with IRQs off on RT. Make it a raw_spinlock_t +otherwise the interrupts won't be disabled on -RT. The locking rules remain +the same on !RT. +This patch changes it for SLAB and SLUB since both share the same header +file for struct kmem_cache_node defintion. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + mm/slab.c | 90 +++++++++++++++++++++++++++---------------------------- + mm/slab.h | 2 +- + mm/slub.c | 50 +++++++++++++++---------------- + 3 files changed, 71 insertions(+), 71 deletions(-) + +diff --git a/mm/slab.c b/mm/slab.c +index b2cc2cf7d8a3..677c0651ef66 100644 +--- a/mm/slab.c ++++ b/mm/slab.c +@@ -233,7 +233,7 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent) + parent->shared = NULL; + parent->alien = NULL; + parent->colour_next = 0; +- spin_lock_init(&parent->list_lock); ++ raw_spin_lock_init(&parent->list_lock); + parent->free_objects = 0; + parent->free_touched = 0; + } +@@ -558,9 +558,9 @@ static noinline void cache_free_pfmemalloc(struct kmem_cache *cachep, + page_node = page_to_nid(page); + n = get_node(cachep, page_node); + +- spin_lock(&n->list_lock); ++ raw_spin_lock(&n->list_lock); + free_block(cachep, &objp, 1, page_node, &list); +- spin_unlock(&n->list_lock); ++ raw_spin_unlock(&n->list_lock); + + slabs_destroy(cachep, &list); + } +@@ -698,7 +698,7 @@ static void __drain_alien_cache(struct kmem_cache *cachep, + struct kmem_cache_node *n = get_node(cachep, node); + + if (ac->avail) { +- spin_lock(&n->list_lock); ++ raw_spin_lock(&n->list_lock); + /* + * Stuff objects into the remote nodes shared array first. + * That way we could avoid the overhead of putting the objects +@@ -709,7 +709,7 @@ static void __drain_alien_cache(struct kmem_cache *cachep, + + free_block(cachep, ac->entry, ac->avail, node, list); + ac->avail = 0; +- spin_unlock(&n->list_lock); ++ raw_spin_unlock(&n->list_lock); + } + } + +@@ -782,9 +782,9 @@ static int __cache_free_alien(struct kmem_cache *cachep, void *objp, + slabs_destroy(cachep, &list); + } else { + n = get_node(cachep, page_node); +- spin_lock(&n->list_lock); ++ raw_spin_lock(&n->list_lock); + free_block(cachep, &objp, 1, page_node, &list); +- spin_unlock(&n->list_lock); ++ raw_spin_unlock(&n->list_lock); + slabs_destroy(cachep, &list); + } + return 1; +@@ -825,10 +825,10 @@ static int init_cache_node(struct kmem_cache *cachep, int node, gfp_t gfp) + */ + n = get_node(cachep, node); + if (n) { +- spin_lock_irq(&n->list_lock); ++ raw_spin_lock_irq(&n->list_lock); + n->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount + + cachep->num; +- spin_unlock_irq(&n->list_lock); ++ raw_spin_unlock_irq(&n->list_lock); + + return 0; + } +@@ -907,7 +907,7 @@ static int setup_kmem_cache_node(struct kmem_cache *cachep, + goto fail; + + n = get_node(cachep, node); +- spin_lock_irq(&n->list_lock); ++ raw_spin_lock_irq(&n->list_lock); + if (n->shared && force_change) { + free_block(cachep, n->shared->entry, + n->shared->avail, node, &list); +@@ -925,7 +925,7 @@ static int setup_kmem_cache_node(struct kmem_cache *cachep, + new_alien = NULL; + } + +- spin_unlock_irq(&n->list_lock); ++ raw_spin_unlock_irq(&n->list_lock); + slabs_destroy(cachep, &list); + + /* +@@ -964,7 +964,7 @@ static void cpuup_canceled(long cpu) + if (!n) + continue; + +- spin_lock_irq(&n->list_lock); ++ raw_spin_lock_irq(&n->list_lock); + + /* Free limit for this kmem_cache_node */ + n->free_limit -= cachep->batchcount; +@@ -975,7 +975,7 @@ static void cpuup_canceled(long cpu) + nc->avail = 0; + + if (!cpumask_empty(mask)) { +- spin_unlock_irq(&n->list_lock); ++ raw_spin_unlock_irq(&n->list_lock); + goto free_slab; + } + +@@ -989,7 +989,7 @@ static void cpuup_canceled(long cpu) + alien = n->alien; + n->alien = NULL; + +- spin_unlock_irq(&n->list_lock); ++ raw_spin_unlock_irq(&n->list_lock); + + kfree(shared); + if (alien) { +@@ -1173,7 +1173,7 @@ static void __init init_list(struct kmem_cache *cachep, struct kmem_cache_node * + /* + * Do not assume that spinlocks can be initialized via memcpy: + */ +- spin_lock_init(&ptr->list_lock); ++ raw_spin_lock_init(&ptr->list_lock); + + MAKE_ALL_LISTS(cachep, ptr, nodeid); + cachep->node[nodeid] = ptr; +@@ -1344,11 +1344,11 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) + for_each_kmem_cache_node(cachep, node, n) { + unsigned long total_slabs, free_slabs, free_objs; + +- spin_lock_irqsave(&n->list_lock, flags); ++ raw_spin_lock_irqsave(&n->list_lock, flags); + total_slabs = n->total_slabs; + free_slabs = n->free_slabs; + free_objs = n->free_objects; +- spin_unlock_irqrestore(&n->list_lock, flags); ++ raw_spin_unlock_irqrestore(&n->list_lock, flags); + + pr_warn(" node %d: slabs: %ld/%ld, objs: %ld/%ld\n", + node, total_slabs - free_slabs, total_slabs, +@@ -2105,7 +2105,7 @@ static void check_spinlock_acquired(struct kmem_cache *cachep) + { + #ifdef CONFIG_SMP + check_irq_off(); +- assert_spin_locked(&get_node(cachep, numa_mem_id())->list_lock); ++ assert_raw_spin_locked(&get_node(cachep, numa_mem_id())->list_lock); + #endif + } + +@@ -2113,7 +2113,7 @@ static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node) + { + #ifdef CONFIG_SMP + check_irq_off(); +- assert_spin_locked(&get_node(cachep, node)->list_lock); ++ assert_raw_spin_locked(&get_node(cachep, node)->list_lock); + #endif + } + +@@ -2153,9 +2153,9 @@ static void do_drain(void *arg) + check_irq_off(); + ac = cpu_cache_get(cachep); + n = get_node(cachep, node); +- spin_lock(&n->list_lock); ++ raw_spin_lock(&n->list_lock); + free_block(cachep, ac->entry, ac->avail, node, &list); +- spin_unlock(&n->list_lock); ++ raw_spin_unlock(&n->list_lock); + ac->avail = 0; + slabs_destroy(cachep, &list); + } +@@ -2173,9 +2173,9 @@ static void drain_cpu_caches(struct kmem_cache *cachep) + drain_alien_cache(cachep, n->alien); + + for_each_kmem_cache_node(cachep, node, n) { +- spin_lock_irq(&n->list_lock); ++ raw_spin_lock_irq(&n->list_lock); + drain_array_locked(cachep, n->shared, node, true, &list); +- spin_unlock_irq(&n->list_lock); ++ raw_spin_unlock_irq(&n->list_lock); + + slabs_destroy(cachep, &list); + } +@@ -2197,10 +2197,10 @@ static int drain_freelist(struct kmem_cache *cache, + nr_freed = 0; + while (nr_freed < tofree && !list_empty(&n->slabs_free)) { + +- spin_lock_irq(&n->list_lock); ++ raw_spin_lock_irq(&n->list_lock); + p = n->slabs_free.prev; + if (p == &n->slabs_free) { +- spin_unlock_irq(&n->list_lock); ++ raw_spin_unlock_irq(&n->list_lock); + goto out; + } + +@@ -2213,7 +2213,7 @@ static int drain_freelist(struct kmem_cache *cache, + * to the cache. + */ + n->free_objects -= cache->num; +- spin_unlock_irq(&n->list_lock); ++ raw_spin_unlock_irq(&n->list_lock); + slab_destroy(cache, page); + nr_freed++; + } +@@ -2649,7 +2649,7 @@ static void cache_grow_end(struct kmem_cache *cachep, struct page *page) + INIT_LIST_HEAD(&page->slab_list); + n = get_node(cachep, page_to_nid(page)); + +- spin_lock(&n->list_lock); ++ raw_spin_lock(&n->list_lock); + n->total_slabs++; + if (!page->active) { + list_add_tail(&page->slab_list, &n->slabs_free); +@@ -2659,7 +2659,7 @@ static void cache_grow_end(struct kmem_cache *cachep, struct page *page) + + STATS_INC_GROWN(cachep); + n->free_objects += cachep->num - page->active; +- spin_unlock(&n->list_lock); ++ raw_spin_unlock(&n->list_lock); + + fixup_objfreelist_debug(cachep, &list); + } +@@ -2825,7 +2825,7 @@ static struct page *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc) + { + struct page *page; + +- assert_spin_locked(&n->list_lock); ++ assert_raw_spin_locked(&n->list_lock); + page = list_first_entry_or_null(&n->slabs_partial, struct page, + slab_list); + if (!page) { +@@ -2852,10 +2852,10 @@ static noinline void *cache_alloc_pfmemalloc(struct kmem_cache *cachep, + if (!gfp_pfmemalloc_allowed(flags)) + return NULL; + +- spin_lock(&n->list_lock); ++ raw_spin_lock(&n->list_lock); + page = get_first_slab(n, true); + if (!page) { +- spin_unlock(&n->list_lock); ++ raw_spin_unlock(&n->list_lock); + return NULL; + } + +@@ -2864,7 +2864,7 @@ static noinline void *cache_alloc_pfmemalloc(struct kmem_cache *cachep, + + fixup_slab_list(cachep, n, page, &list); + +- spin_unlock(&n->list_lock); ++ raw_spin_unlock(&n->list_lock); + fixup_objfreelist_debug(cachep, &list); + + return obj; +@@ -2923,7 +2923,7 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) + if (!n->free_objects && (!shared || !shared->avail)) + goto direct_grow; + +- spin_lock(&n->list_lock); ++ raw_spin_lock(&n->list_lock); + shared = READ_ONCE(n->shared); + + /* See if we can refill from the shared array */ +@@ -2947,7 +2947,7 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) + must_grow: + n->free_objects -= ac->avail; + alloc_done: +- spin_unlock(&n->list_lock); ++ raw_spin_unlock(&n->list_lock); + fixup_objfreelist_debug(cachep, &list); + + direct_grow: +@@ -3172,7 +3172,7 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, + BUG_ON(!n); + + check_irq_off(); +- spin_lock(&n->list_lock); ++ raw_spin_lock(&n->list_lock); + page = get_first_slab(n, false); + if (!page) + goto must_grow; +@@ -3190,12 +3190,12 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, + + fixup_slab_list(cachep, n, page, &list); + +- spin_unlock(&n->list_lock); ++ raw_spin_unlock(&n->list_lock); + fixup_objfreelist_debug(cachep, &list); + return obj; + + must_grow: +- spin_unlock(&n->list_lock); ++ raw_spin_unlock(&n->list_lock); + page = cache_grow_begin(cachep, gfp_exact_node(flags), nodeid); + if (page) { + /* This slab isn't counted yet so don't update free_objects */ +@@ -3373,7 +3373,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) + + check_irq_off(); + n = get_node(cachep, node); +- spin_lock(&n->list_lock); ++ raw_spin_lock(&n->list_lock); + if (n->shared) { + struct array_cache *shared_array = n->shared; + int max = shared_array->limit - shared_array->avail; +@@ -3402,7 +3402,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) + STATS_SET_FREEABLE(cachep, i); + } + #endif +- spin_unlock(&n->list_lock); ++ raw_spin_unlock(&n->list_lock); + ac->avail -= batchcount; + memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail); + slabs_destroy(cachep, &list); +@@ -3831,9 +3831,9 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, + + node = cpu_to_mem(cpu); + n = get_node(cachep, node); +- spin_lock_irq(&n->list_lock); ++ raw_spin_lock_irq(&n->list_lock); + free_block(cachep, ac->entry, ac->avail, node, &list); +- spin_unlock_irq(&n->list_lock); ++ raw_spin_unlock_irq(&n->list_lock); + slabs_destroy(cachep, &list); + } + free_percpu(prev); +@@ -3928,9 +3928,9 @@ static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n, + return; + } + +- spin_lock_irq(&n->list_lock); ++ raw_spin_lock_irq(&n->list_lock); + drain_array_locked(cachep, ac, node, false, &list); +- spin_unlock_irq(&n->list_lock); ++ raw_spin_unlock_irq(&n->list_lock); + + slabs_destroy(cachep, &list); + } +@@ -4014,7 +4014,7 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) + + for_each_kmem_cache_node(cachep, node, n) { + check_irq_on(); +- spin_lock_irq(&n->list_lock); ++ raw_spin_lock_irq(&n->list_lock); + + total_slabs += n->total_slabs; + free_slabs += n->free_slabs; +@@ -4023,7 +4023,7 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) + if (n->shared) + shared_avail += n->shared->avail; + +- spin_unlock_irq(&n->list_lock); ++ raw_spin_unlock_irq(&n->list_lock); + } + num_objs = total_slabs * cachep->num; + active_slabs = total_slabs - free_slabs; +diff --git a/mm/slab.h b/mm/slab.h +index 6952e10cf33b..ed5dd6e9e0cc 100644 +--- a/mm/slab.h ++++ b/mm/slab.h +@@ -543,7 +543,7 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, + * The slab lists for all objects. + */ + struct kmem_cache_node { +- spinlock_t list_lock; ++ raw_spinlock_t list_lock; + + #ifdef CONFIG_SLAB + struct list_head slabs_partial; /* partial list first, better asm code */ +diff --git a/mm/slub.c b/mm/slub.c +index b0f637519ac9..863554db3323 100644 +--- a/mm/slub.c ++++ b/mm/slub.c +@@ -1214,7 +1214,7 @@ static noinline int free_debug_processing( + unsigned long flags; + int ret = 0; + +- spin_lock_irqsave(&n->list_lock, flags); ++ raw_spin_lock_irqsave(&n->list_lock, flags); + slab_lock(page); + + if (s->flags & SLAB_CONSISTENCY_CHECKS) { +@@ -1249,7 +1249,7 @@ static noinline int free_debug_processing( + bulk_cnt, cnt); + + slab_unlock(page); +- spin_unlock_irqrestore(&n->list_lock, flags); ++ raw_spin_unlock_irqrestore(&n->list_lock, flags); + if (!ret) + slab_fix(s, "Object at 0x%p not freed", object); + return ret; +@@ -1967,7 +1967,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, + if (!n || !n->nr_partial) + return NULL; + +- spin_lock(&n->list_lock); ++ raw_spin_lock(&n->list_lock); + list_for_each_entry_safe(page, page2, &n->partial, slab_list) { + void *t; + +@@ -1992,7 +1992,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, + break; + + } +- spin_unlock(&n->list_lock); ++ raw_spin_unlock(&n->list_lock); + return object; + } + +@@ -2246,7 +2246,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page, + * that acquire_slab() will see a slab page that + * is frozen + */ +- spin_lock(&n->list_lock); ++ raw_spin_lock(&n->list_lock); + } + } else { + m = M_FULL; +@@ -2258,7 +2258,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page, + * slabs from diagnostic functions will not see + * any frozen slabs. + */ +- spin_lock(&n->list_lock); ++ raw_spin_lock(&n->list_lock); + } + #endif + } +@@ -2283,7 +2283,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page, + goto redo; + + if (lock) +- spin_unlock(&n->list_lock); ++ raw_spin_unlock(&n->list_lock); + + if (m == M_PARTIAL) + stat(s, tail); +@@ -2323,10 +2323,10 @@ static void unfreeze_partials(struct kmem_cache *s, + n2 = get_node(s, page_to_nid(page)); + if (n != n2) { + if (n) +- spin_unlock(&n->list_lock); ++ raw_spin_unlock(&n->list_lock); + + n = n2; +- spin_lock(&n->list_lock); ++ raw_spin_lock(&n->list_lock); + } + + do { +@@ -2355,7 +2355,7 @@ static void unfreeze_partials(struct kmem_cache *s, + } + + if (n) +- spin_unlock(&n->list_lock); ++ raw_spin_unlock(&n->list_lock); + + while (discard_page) { + page = discard_page; +@@ -2520,10 +2520,10 @@ static unsigned long count_partial(struct kmem_cache_node *n, + unsigned long x = 0; + struct page *page; + +- spin_lock_irqsave(&n->list_lock, flags); ++ raw_spin_lock_irqsave(&n->list_lock, flags); + list_for_each_entry(page, &n->partial, slab_list) + x += get_count(page); +- spin_unlock_irqrestore(&n->list_lock, flags); ++ raw_spin_unlock_irqrestore(&n->list_lock, flags); + return x; + } + #endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */ +@@ -2983,7 +2983,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, + + do { + if (unlikely(n)) { +- spin_unlock_irqrestore(&n->list_lock, flags); ++ raw_spin_unlock_irqrestore(&n->list_lock, flags); + n = NULL; + } + prior = page->freelist; +@@ -3015,7 +3015,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, + * Otherwise the list_lock will synchronize with + * other processors updating the list of slabs. + */ +- spin_lock_irqsave(&n->list_lock, flags); ++ raw_spin_lock_irqsave(&n->list_lock, flags); + + } + } +@@ -3057,7 +3057,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, + add_partial(n, page, DEACTIVATE_TO_TAIL); + stat(s, FREE_ADD_PARTIAL); + } +- spin_unlock_irqrestore(&n->list_lock, flags); ++ raw_spin_unlock_irqrestore(&n->list_lock, flags); + return; + + slab_empty: +@@ -3072,7 +3072,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, + remove_full(s, n, page); + } + +- spin_unlock_irqrestore(&n->list_lock, flags); ++ raw_spin_unlock_irqrestore(&n->list_lock, flags); + stat(s, FREE_SLAB); + discard_slab(s, page); + } +@@ -3479,7 +3479,7 @@ static void + init_kmem_cache_node(struct kmem_cache_node *n) + { + n->nr_partial = 0; +- spin_lock_init(&n->list_lock); ++ raw_spin_lock_init(&n->list_lock); + INIT_LIST_HEAD(&n->partial); + #ifdef CONFIG_SLUB_DEBUG + atomic_long_set(&n->nr_slabs, 0); +@@ -3874,7 +3874,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) + struct page *page, *h; + + BUG_ON(irqs_disabled()); +- spin_lock_irq(&n->list_lock); ++ raw_spin_lock_irq(&n->list_lock); + list_for_each_entry_safe(page, h, &n->partial, slab_list) { + if (!page->inuse) { + remove_partial(n, page); +@@ -3884,7 +3884,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) + "Objects remaining in %s on __kmem_cache_shutdown()"); + } + } +- spin_unlock_irq(&n->list_lock); ++ raw_spin_unlock_irq(&n->list_lock); + + list_for_each_entry_safe(page, h, &discard, slab_list) + discard_slab(s, page); +@@ -4155,7 +4155,7 @@ int __kmem_cache_shrink(struct kmem_cache *s) + for (i = 0; i < SHRINK_PROMOTE_MAX; i++) + INIT_LIST_HEAD(promote + i); + +- spin_lock_irqsave(&n->list_lock, flags); ++ raw_spin_lock_irqsave(&n->list_lock, flags); + + /* + * Build lists of slabs to discard or promote. +@@ -4186,7 +4186,7 @@ int __kmem_cache_shrink(struct kmem_cache *s) + for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--) + list_splice(promote + i, &n->partial); + +- spin_unlock_irqrestore(&n->list_lock, flags); ++ raw_spin_unlock_irqrestore(&n->list_lock, flags); + + /* Release empty slabs */ + list_for_each_entry_safe(page, t, &discard, slab_list) +@@ -4548,7 +4548,7 @@ static int validate_slab_node(struct kmem_cache *s, + struct page *page; + unsigned long flags; + +- spin_lock_irqsave(&n->list_lock, flags); ++ raw_spin_lock_irqsave(&n->list_lock, flags); + + list_for_each_entry(page, &n->partial, slab_list) { + validate_slab(s, page); +@@ -4570,7 +4570,7 @@ static int validate_slab_node(struct kmem_cache *s, + s->name, count, atomic_long_read(&n->nr_slabs)); + + out: +- spin_unlock_irqrestore(&n->list_lock, flags); ++ raw_spin_unlock_irqrestore(&n->list_lock, flags); + return count; + } + +@@ -4749,12 +4749,12 @@ static int list_locations(struct kmem_cache *s, char *buf, + if (!atomic_long_read(&n->nr_slabs)) + continue; + +- spin_lock_irqsave(&n->list_lock, flags); ++ raw_spin_lock_irqsave(&n->list_lock, flags); + list_for_each_entry(page, &n->partial, slab_list) + process_slab(&t, s, page, alloc); + list_for_each_entry(page, &n->full, slab_list) + process_slab(&t, s, page, alloc); +- spin_unlock_irqrestore(&n->list_lock, flags); ++ raw_spin_unlock_irqrestore(&n->list_lock, flags); + } + + for (i = 0; i < t.count; i++) { +-- +2.43.0 + diff --git a/debian/patches-rt/0199-mm-SLUB-delay-giving-back-empty-slubs-to-IRQ-enabled.patch b/debian/patches-rt/0199-mm-SLUB-delay-giving-back-empty-slubs-to-IRQ-enabled.patch new file mode 100644 index 000000000..8af579dea --- /dev/null +++ b/debian/patches-rt/0199-mm-SLUB-delay-giving-back-empty-slubs-to-IRQ-enabled.patch @@ -0,0 +1,223 @@ +From e620c8c3d37a6e6ff999f303521997d2950a71cd Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Thu, 21 Jun 2018 17:29:19 +0200 +Subject: [PATCH 199/323] mm/SLUB: delay giving back empty slubs to IRQ enabled + regions +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +__free_slab() is invoked with disabled interrupts which increases the +irq-off time while __free_pages() is doing the work. +Allow __free_slab() to be invoked with enabled interrupts and move +everything from interrupts-off invocations to a temporary per-CPU list +so it can be processed later. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + mm/slub.c | 74 +++++++++++++++++++++++++++++++++++++++++++++++++++---- + 1 file changed, 69 insertions(+), 5 deletions(-) + +diff --git a/mm/slub.c b/mm/slub.c +index 863554db3323..faba29039375 100644 +--- a/mm/slub.c ++++ b/mm/slub.c +@@ -1494,6 +1494,12 @@ static bool freelist_corrupted(struct kmem_cache *s, struct page *page, + } + #endif /* CONFIG_SLUB_DEBUG */ + ++struct slub_free_list { ++ raw_spinlock_t lock; ++ struct list_head list; ++}; ++static DEFINE_PER_CPU(struct slub_free_list, slub_free_list); ++ + /* + * Hooks for other subsystems that check memory allocations. In a typical + * production configuration these hooks all should produce no code at all. +@@ -1849,6 +1855,16 @@ static void __free_slab(struct kmem_cache *s, struct page *page) + __free_pages(page, order); + } + ++static void free_delayed(struct list_head *h) ++{ ++ while (!list_empty(h)) { ++ struct page *page = list_first_entry(h, struct page, lru); ++ ++ list_del(&page->lru); ++ __free_slab(page->slab_cache, page); ++ } ++} ++ + static void rcu_free_slab(struct rcu_head *h) + { + struct page *page = container_of(h, struct page, rcu_head); +@@ -1860,6 +1876,12 @@ static void free_slab(struct kmem_cache *s, struct page *page) + { + if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU)) { + call_rcu(&page->rcu_head, rcu_free_slab); ++ } else if (irqs_disabled()) { ++ struct slub_free_list *f = this_cpu_ptr(&slub_free_list); ++ ++ raw_spin_lock(&f->lock); ++ list_add(&page->lru, &f->list); ++ raw_spin_unlock(&f->lock); + } else + __free_slab(s, page); + } +@@ -2392,14 +2414,21 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) + pobjects = oldpage->pobjects; + pages = oldpage->pages; + if (drain && pobjects > slub_cpu_partial(s)) { ++ struct slub_free_list *f; + unsigned long flags; ++ LIST_HEAD(tofree); + /* + * partial array is full. Move the existing + * set to the per node partial list. + */ + local_irq_save(flags); + unfreeze_partials(s, this_cpu_ptr(s->cpu_slab)); ++ f = this_cpu_ptr(&slub_free_list); ++ raw_spin_lock(&f->lock); ++ list_splice_init(&f->list, &tofree); ++ raw_spin_unlock(&f->lock); + local_irq_restore(flags); ++ free_delayed(&tofree); + oldpage = NULL; + pobjects = 0; + pages = 0; +@@ -2465,7 +2494,22 @@ static bool has_cpu_slab(int cpu, void *info) + + static void flush_all(struct kmem_cache *s) + { ++ LIST_HEAD(tofree); ++ int cpu; ++ + on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1); ++ for_each_online_cpu(cpu) { ++ struct slub_free_list *f; ++ ++ if (!has_cpu_slab(cpu, s)) ++ continue; ++ ++ f = &per_cpu(slub_free_list, cpu); ++ raw_spin_lock_irq(&f->lock); ++ list_splice_init(&f->list, &tofree); ++ raw_spin_unlock_irq(&f->lock); ++ free_delayed(&tofree); ++ } + } + + /* +@@ -2662,8 +2706,10 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page) + * already disabled (which is the case for bulk allocation). + */ + static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, +- unsigned long addr, struct kmem_cache_cpu *c) ++ unsigned long addr, struct kmem_cache_cpu *c, ++ struct list_head *to_free) + { ++ struct slub_free_list *f; + void *freelist; + struct page *page; + +@@ -2732,6 +2778,13 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, + VM_BUG_ON(!c->page->frozen); + c->freelist = get_freepointer(s, freelist); + c->tid = next_tid(c->tid); ++ ++out: ++ f = this_cpu_ptr(&slub_free_list); ++ raw_spin_lock(&f->lock); ++ list_splice_init(&f->list, to_free); ++ raw_spin_unlock(&f->lock); ++ + return freelist; + + new_slab: +@@ -2747,7 +2800,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, + + if (unlikely(!freelist)) { + slab_out_of_memory(s, gfpflags, node); +- return NULL; ++ goto out; + } + + page = c->page; +@@ -2760,7 +2813,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, + goto new_slab; /* Slab failed checks. Next slab needed */ + + deactivate_slab(s, page, get_freepointer(s, freelist), c); +- return freelist; ++ goto out; + } + + /* +@@ -2772,6 +2825,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, + { + void *p; + unsigned long flags; ++ LIST_HEAD(tofree); + + local_irq_save(flags); + #ifdef CONFIG_PREEMPTION +@@ -2783,8 +2837,9 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, + c = this_cpu_ptr(s->cpu_slab); + #endif + +- p = ___slab_alloc(s, gfpflags, node, addr, c); ++ p = ___slab_alloc(s, gfpflags, node, addr, c, &tofree); + local_irq_restore(flags); ++ free_delayed(&tofree); + return p; + } + +@@ -3282,6 +3337,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, + void **p) + { + struct kmem_cache_cpu *c; ++ LIST_HEAD(to_free); + int i; + struct obj_cgroup *objcg = NULL; + +@@ -3315,7 +3371,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, + * of re-populating per CPU c->freelist + */ + p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE, +- _RET_IP_, c); ++ _RET_IP_, c, &to_free); + if (unlikely(!p[i])) + goto error; + +@@ -3330,6 +3386,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, + } + c->tid = next_tid(c->tid); + local_irq_enable(); ++ free_delayed(&to_free); + + /* Clear memory outside IRQ disabled fastpath loop */ + if (unlikely(slab_want_init_on_alloc(flags, s))) { +@@ -3344,6 +3401,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, + return i; + error: + local_irq_enable(); ++ free_delayed(&to_free); + slab_post_alloc_hook(s, objcg, flags, i, p); + __kmem_cache_free_bulk(s, i, p); + return 0; +@@ -4361,6 +4419,12 @@ void __init kmem_cache_init(void) + { + static __initdata struct kmem_cache boot_kmem_cache, + boot_kmem_cache_node; ++ int cpu; ++ ++ for_each_possible_cpu(cpu) { ++ raw_spin_lock_init(&per_cpu(slub_free_list, cpu).lock); ++ INIT_LIST_HEAD(&per_cpu(slub_free_list, cpu).list); ++ } + + if (debug_guardpage_minorder()) + slub_max_order = 0; +-- +2.43.0 + diff --git a/debian/patches-rt/0200-mm-slub-Always-flush-the-delayed-empty-slubs-in-flus.patch b/debian/patches-rt/0200-mm-slub-Always-flush-the-delayed-empty-slubs-in-flus.patch new file mode 100644 index 000000000..0fd8180bd --- /dev/null +++ b/debian/patches-rt/0200-mm-slub-Always-flush-the-delayed-empty-slubs-in-flus.patch @@ -0,0 +1,61 @@ +From d903b2377e443ae1df59e15824b683d326e7c24b Mon Sep 17 00:00:00 2001 +From: Kevin Hao <haokexin@gmail.com> +Date: Mon, 4 May 2020 11:34:07 +0800 +Subject: [PATCH 200/323] mm: slub: Always flush the delayed empty slubs in + flush_all() +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +After commit f0b231101c94 ("mm/SLUB: delay giving back empty slubs to +IRQ enabled regions"), when the free_slab() is invoked with the IRQ +disabled, the empty slubs are moved to a per-CPU list and will be +freed after IRQ enabled later. But in the current codes, there is +a check to see if there really has the cpu slub on a specific cpu +before flushing the delayed empty slubs, this may cause a reference +of already released kmem_cache in a scenario like below: + cpu 0 cpu 1 + kmem_cache_destroy() + flush_all() + --->IPI flush_cpu_slab() + flush_slab() + deactivate_slab() + discard_slab() + free_slab() + c->page = NULL; + for_each_online_cpu(cpu) + if (!has_cpu_slab(1, s)) + continue + this skip to flush the delayed + empty slub released by cpu1 + kmem_cache_free(kmem_cache, s) + + kmalloc() + __slab_alloc() + free_delayed() + __free_slab() + reference to released kmem_cache + +Fixes: f0b231101c94 ("mm/SLUB: delay giving back empty slubs to IRQ enabled regions") +Signed-off-by: Kevin Hao <haokexin@gmail.com> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Cc: stable-rt@vger.kernel.org +--- + mm/slub.c | 3 --- + 1 file changed, 3 deletions(-) + +diff --git a/mm/slub.c b/mm/slub.c +index faba29039375..cb414c1f9ef6 100644 +--- a/mm/slub.c ++++ b/mm/slub.c +@@ -2501,9 +2501,6 @@ static void flush_all(struct kmem_cache *s) + for_each_online_cpu(cpu) { + struct slub_free_list *f; + +- if (!has_cpu_slab(cpu, s)) +- continue; +- + f = &per_cpu(slub_free_list, cpu); + raw_spin_lock_irq(&f->lock); + list_splice_init(&f->list, &tofree); +-- +2.43.0 + diff --git a/debian/patches-rt/0201-mm-slub-Don-t-resize-the-location-tracking-cache-on-.patch b/debian/patches-rt/0201-mm-slub-Don-t-resize-the-location-tracking-cache-on-.patch new file mode 100644 index 000000000..c233be3ef --- /dev/null +++ b/debian/patches-rt/0201-mm-slub-Don-t-resize-the-location-tracking-cache-on-.patch @@ -0,0 +1,37 @@ +From 6f5d1f04864beb65079068f48faf17c269c8509b Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Fri, 26 Feb 2021 17:26:04 +0100 +Subject: [PATCH 201/323] mm: slub: Don't resize the location tracking cache on + PREEMPT_RT +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +The location tracking cache has a size of a page and is resized if its +current size is too small. +This allocation happens with disabled interrupts and can't happen on +PREEMPT_RT. +Should one page be too small, then we have to allocate more at the +beginning. The only downside is that less callers will be visible. + +Cc: stable-rt@vger.kernel.org +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + mm/slub.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/mm/slub.c b/mm/slub.c +index cb414c1f9ef6..57c598dd7454 100644 +--- a/mm/slub.c ++++ b/mm/slub.c +@@ -4682,6 +4682,9 @@ static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags) + struct location *l; + int order; + ++ if (IS_ENABLED(CONFIG_PREEMPT_RT) && flags == GFP_ATOMIC) ++ return 0; ++ + order = get_order(sizeof(struct location) * max); + + l = (void *)__get_free_pages(flags, order); +-- +2.43.0 + diff --git a/debian/patches-rt/0202-mm-page_alloc-Use-migrate_disable-in-drain_local_pag.patch b/debian/patches-rt/0202-mm-page_alloc-Use-migrate_disable-in-drain_local_pag.patch new file mode 100644 index 000000000..486873119 --- /dev/null +++ b/debian/patches-rt/0202-mm-page_alloc-Use-migrate_disable-in-drain_local_pag.patch @@ -0,0 +1,39 @@ +From 4c4d7886b6c8d16df841ac103b4742481ad5f014 Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Thu, 2 Jul 2020 14:27:23 +0200 +Subject: [PATCH 202/323] mm/page_alloc: Use migrate_disable() in + drain_local_pages_wq() +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +drain_local_pages_wq() disables preemption to avoid CPU migration during +CPU hotplug. +Using migrate_disable() makes the function preemptible on PREEMPT_RT but +still avoids CPU migrations during CPU-hotplug. On !PREEMPT_RT it +behaves like preempt_disable(). + +Use migrate_disable() in drain_local_pages_wq(). + +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + mm/page_alloc.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index a3f1e4c6bdc8..508650843235 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -3048,9 +3048,9 @@ static void drain_local_pages_wq(struct work_struct *work) + * cpu which is allright but we also have to make sure to not move to + * a different one. + */ +- preempt_disable(); ++ migrate_disable(); + drain_local_pages(drain->zone); +- preempt_enable(); ++ migrate_enable(); + } + + /* +-- +2.43.0 + diff --git a/debian/patches-rt/0203-mm-page_alloc-rt-friendly-per-cpu-pages.patch b/debian/patches-rt/0203-mm-page_alloc-rt-friendly-per-cpu-pages.patch new file mode 100644 index 000000000..8102e89d2 --- /dev/null +++ b/debian/patches-rt/0203-mm-page_alloc-rt-friendly-per-cpu-pages.patch @@ -0,0 +1,197 @@ +From 9f33cce55eb2594c4602653658103381064572d7 Mon Sep 17 00:00:00 2001 +From: Ingo Molnar <mingo@elte.hu> +Date: Fri, 3 Jul 2009 08:29:37 -0500 +Subject: [PATCH 203/323] mm: page_alloc: rt-friendly per-cpu pages +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +rt-friendly per-cpu pages: convert the irqs-off per-cpu locking +method into a preemptible, explicit-per-cpu-locks method. + +Contains fixes from: + Peter Zijlstra <a.p.zijlstra@chello.nl> + Thomas Gleixner <tglx@linutronix.de> + +Signed-off-by: Ingo Molnar <mingo@elte.hu> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +--- + mm/page_alloc.c | 47 ++++++++++++++++++++++++++++------------------- + 1 file changed, 28 insertions(+), 19 deletions(-) + +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index 508650843235..c5eb7d6844ae 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -61,6 +61,7 @@ + #include <linux/hugetlb.h> + #include <linux/sched/rt.h> + #include <linux/sched/mm.h> ++#include <linux/local_lock.h> + #include <linux/page_owner.h> + #include <linux/kthread.h> + #include <linux/memcontrol.h> +@@ -386,6 +387,13 @@ EXPORT_SYMBOL(nr_node_ids); + EXPORT_SYMBOL(nr_online_nodes); + #endif + ++struct pa_lock { ++ local_lock_t l; ++}; ++static DEFINE_PER_CPU(struct pa_lock, pa_lock) = { ++ .l = INIT_LOCAL_LOCK(l), ++}; ++ + int page_group_by_mobility_disabled __read_mostly; + + #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT +@@ -1543,11 +1551,11 @@ static void __free_pages_ok(struct page *page, unsigned int order, + return; + + migratetype = get_pfnblock_migratetype(page, pfn); +- local_irq_save(flags); ++ local_lock_irqsave(&pa_lock.l, flags); + __count_vm_events(PGFREE, 1 << order); + free_one_page(page_zone(page), page, pfn, order, migratetype, + fpi_flags); +- local_irq_restore(flags); ++ local_unlock_irqrestore(&pa_lock.l, flags); + } + + void __free_pages_core(struct page *page, unsigned int order) +@@ -2961,13 +2969,13 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) + int to_drain, batch; + LIST_HEAD(dst); + +- local_irq_save(flags); ++ local_lock_irqsave(&pa_lock.l, flags); + batch = READ_ONCE(pcp->batch); + to_drain = min(pcp->count, batch); + if (to_drain > 0) + isolate_pcp_pages(to_drain, pcp, &dst); + +- local_irq_restore(flags); ++ local_unlock_irqrestore(&pa_lock.l, flags); + + if (to_drain > 0) + free_pcppages_bulk(zone, &dst, false); +@@ -2989,7 +2997,7 @@ static void drain_pages_zone(unsigned int cpu, struct zone *zone) + LIST_HEAD(dst); + int count; + +- local_irq_save(flags); ++ local_lock_irqsave(&pa_lock.l, flags); + pset = per_cpu_ptr(zone->pageset, cpu); + + pcp = &pset->pcp; +@@ -2997,7 +3005,7 @@ static void drain_pages_zone(unsigned int cpu, struct zone *zone) + if (count) + isolate_pcp_pages(count, pcp, &dst); + +- local_irq_restore(flags); ++ local_unlock_irqrestore(&pa_lock.l, flags); + + if (count) + free_pcppages_bulk(zone, &dst, false); +@@ -3248,9 +3256,9 @@ void free_unref_page(struct page *page) + if (!free_unref_page_prepare(page, pfn)) + return; + +- local_irq_save(flags); ++ local_lock_irqsave(&pa_lock.l, flags); + free_unref_page_commit(page, pfn, &dst); +- local_irq_restore(flags); ++ local_unlock_irqrestore(&pa_lock.l, flags); + if (!list_empty(&dst)) + free_pcppages_bulk(zone, &dst, false); + } +@@ -3277,7 +3285,7 @@ void free_unref_page_list(struct list_head *list) + set_page_private(page, pfn); + } + +- local_irq_save(flags); ++ local_lock_irqsave(&pa_lock.l, flags); + list_for_each_entry_safe(page, next, list, lru) { + unsigned long pfn = page_private(page); + enum zone_type type; +@@ -3292,12 +3300,12 @@ void free_unref_page_list(struct list_head *list) + * a large list of pages to free. + */ + if (++batch_count == SWAP_CLUSTER_MAX) { +- local_irq_restore(flags); ++ local_unlock_irqrestore(&pa_lock.l, flags); + batch_count = 0; +- local_irq_save(flags); ++ local_lock_irqsave(&pa_lock.l, flags); + } + } +- local_irq_restore(flags); ++ local_unlock_irqrestore(&pa_lock.l, flags); + + for (i = 0; i < __MAX_NR_ZONES; ) { + struct page *page; +@@ -3468,7 +3476,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, + struct page *page; + unsigned long flags; + +- local_irq_save(flags); ++ local_lock_irqsave(&pa_lock.l, flags); + pcp = &this_cpu_ptr(zone->pageset)->pcp; + list = &pcp->lists[migratetype]; + page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp, list); +@@ -3476,7 +3484,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, + __count_zid_vm_events(PGALLOC, page_zonenum(page), 1); + zone_statistics(preferred_zone, zone); + } +- local_irq_restore(flags); ++ local_unlock_irqrestore(&pa_lock.l, flags); + return page; + } + +@@ -3510,7 +3518,8 @@ struct page *rmqueue(struct zone *preferred_zone, + * allocate greater than order-1 page units with __GFP_NOFAIL. + */ + WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1)); +- spin_lock_irqsave(&zone->lock, flags); ++ local_lock_irqsave(&pa_lock.l, flags); ++ spin_lock(&zone->lock); + + do { + page = NULL; +@@ -3536,7 +3545,7 @@ struct page *rmqueue(struct zone *preferred_zone, + + __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); + zone_statistics(preferred_zone, zone); +- local_irq_restore(flags); ++ local_unlock_irqrestore(&pa_lock.l, flags); + + out: + /* Separate test+clear to avoid unnecessary atomics */ +@@ -3549,7 +3558,7 @@ struct page *rmqueue(struct zone *preferred_zone, + return page; + + failed: +- local_irq_restore(flags); ++ local_unlock_irqrestore(&pa_lock.l, flags); + return NULL; + } + +@@ -8892,7 +8901,7 @@ void zone_pcp_reset(struct zone *zone) + struct per_cpu_pageset *pset; + + /* avoid races with drain_pages() */ +- local_irq_save(flags); ++ local_lock_irqsave(&pa_lock.l, flags); + if (zone->pageset != &boot_pageset) { + for_each_online_cpu(cpu) { + pset = per_cpu_ptr(zone->pageset, cpu); +@@ -8901,7 +8910,7 @@ void zone_pcp_reset(struct zone *zone) + free_percpu(zone->pageset); + zone->pageset = &boot_pageset; + } +- local_irq_restore(flags); ++ local_unlock_irqrestore(&pa_lock.l, flags); + } + + #ifdef CONFIG_MEMORY_HOTREMOVE +-- +2.43.0 + diff --git a/debian/patches-rt/0204-mm-slub-Make-object_map_lock-a-raw_spinlock_t.patch b/debian/patches-rt/0204-mm-slub-Make-object_map_lock-a-raw_spinlock_t.patch new file mode 100644 index 000000000..28c14ea49 --- /dev/null +++ b/debian/patches-rt/0204-mm-slub-Make-object_map_lock-a-raw_spinlock_t.patch @@ -0,0 +1,50 @@ +From d2d45bd13ad6112afefb224d46501b76473196eb Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Thu, 16 Jul 2020 18:47:50 +0200 +Subject: [PATCH 204/323] mm/slub: Make object_map_lock a raw_spinlock_t +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +The variable object_map is protected by object_map_lock. The lock is always +acquired in debug code and within already atomic context + +Make object_map_lock a raw_spinlock_t. + +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + mm/slub.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/mm/slub.c b/mm/slub.c +index 57c598dd7454..ac2ddf6a4220 100644 +--- a/mm/slub.c ++++ b/mm/slub.c +@@ -435,7 +435,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, + + #ifdef CONFIG_SLUB_DEBUG + static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)]; +-static DEFINE_SPINLOCK(object_map_lock); ++static DEFINE_RAW_SPINLOCK(object_map_lock); + + /* + * Determine a map of object in use on a page. +@@ -451,7 +451,7 @@ static unsigned long *get_map(struct kmem_cache *s, struct page *page) + + VM_BUG_ON(!irqs_disabled()); + +- spin_lock(&object_map_lock); ++ raw_spin_lock(&object_map_lock); + + bitmap_zero(object_map, page->objects); + +@@ -464,7 +464,7 @@ static unsigned long *get_map(struct kmem_cache *s, struct page *page) + static void put_map(unsigned long *map) __releases(&object_map_lock) + { + VM_BUG_ON(map != object_map); +- spin_unlock(&object_map_lock); ++ raw_spin_unlock(&object_map_lock); + } + + static inline unsigned int size_from_object(struct kmem_cache *s) +-- +2.43.0 + diff --git a/debian/patches-rt/0205-slub-Enable-irqs-for-__GFP_WAIT.patch b/debian/patches-rt/0205-slub-Enable-irqs-for-__GFP_WAIT.patch new file mode 100644 index 000000000..d3496f78e --- /dev/null +++ b/debian/patches-rt/0205-slub-Enable-irqs-for-__GFP_WAIT.patch @@ -0,0 +1,76 @@ +From c0f57e0b6278391f1fec71ccb1e25fe46deade2b Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Wed, 9 Jan 2013 12:08:15 +0100 +Subject: [PATCH 205/323] slub: Enable irqs for __GFP_WAIT +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +SYSTEM_RUNNING might be too late for enabling interrupts. Allocations +with GFP_WAIT can happen before that. So use this as an indicator. + +[bigeasy: Add warning on RT for allocations in atomic context. + Don't enable interrupts on allocations during SYSTEM_SUSPEND. This is done + during suspend by ACPI, noticed by Liwei Song <liwei.song@windriver.com> +] + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +--- + mm/slub.c | 18 +++++++++++++++++- + 1 file changed, 17 insertions(+), 1 deletion(-) + +diff --git a/mm/slub.c b/mm/slub.c +index ac2ddf6a4220..00f1d1206dbc 100644 +--- a/mm/slub.c ++++ b/mm/slub.c +@@ -1750,10 +1750,18 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) + void *start, *p, *next; + int idx; + bool shuffle; ++ bool enableirqs = false; + + flags &= gfp_allowed_mask; + + if (gfpflags_allow_blocking(flags)) ++ enableirqs = true; ++ ++#ifdef CONFIG_PREEMPT_RT ++ if (system_state > SYSTEM_BOOTING && system_state < SYSTEM_SUSPEND) ++ enableirqs = true; ++#endif ++ if (enableirqs) + local_irq_enable(); + + flags |= s->allocflags; +@@ -1812,7 +1820,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) + page->frozen = 1; + + out: +- if (gfpflags_allow_blocking(flags)) ++ if (enableirqs) + local_irq_disable(); + if (!page) + return NULL; +@@ -2870,6 +2878,10 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s, + unsigned long tid; + struct obj_cgroup *objcg = NULL; + ++ if (IS_ENABLED(CONFIG_PREEMPT_RT) && IS_ENABLED(CONFIG_DEBUG_ATOMIC_SLEEP)) ++ WARN_ON_ONCE(!preemptible() && ++ (system_state > SYSTEM_BOOTING && system_state < SYSTEM_SUSPEND)); ++ + s = slab_pre_alloc_hook(s, &objcg, 1, gfpflags); + if (!s) + return NULL; +@@ -3338,6 +3350,10 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, + int i; + struct obj_cgroup *objcg = NULL; + ++ if (IS_ENABLED(CONFIG_PREEMPT_RT) && IS_ENABLED(CONFIG_DEBUG_ATOMIC_SLEEP)) ++ WARN_ON_ONCE(!preemptible() && ++ (system_state > SYSTEM_BOOTING && system_state < SYSTEM_SUSPEND)); ++ + /* memcg and kmem_cache debug support */ + s = slab_pre_alloc_hook(s, &objcg, size, flags); + if (unlikely(!s)) +-- +2.43.0 + diff --git a/debian/patches-rt/0206-slub-Disable-SLUB_CPU_PARTIAL.patch b/debian/patches-rt/0206-slub-Disable-SLUB_CPU_PARTIAL.patch new file mode 100644 index 000000000..9f521b566 --- /dev/null +++ b/debian/patches-rt/0206-slub-Disable-SLUB_CPU_PARTIAL.patch @@ -0,0 +1,54 @@ +From 8da7619853b38005d113af771657ad188006e1d5 Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Wed, 15 Apr 2015 19:00:47 +0200 +Subject: [PATCH 206/323] slub: Disable SLUB_CPU_PARTIAL +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +|BUG: sleeping function called from invalid context at kernel/locking/rtmutex.c:915 +|in_atomic(): 1, irqs_disabled(): 0, pid: 87, name: rcuop/7 +|1 lock held by rcuop/7/87: +| #0: (rcu_callback){......}, at: [<ffffffff8112c76a>] rcu_nocb_kthread+0x1ca/0x5d0 +|Preemption disabled at:[<ffffffff811eebd9>] put_cpu_partial+0x29/0x220 +| +|CPU: 0 PID: 87 Comm: rcuop/7 Tainted: G W 4.0.0-rt0+ #477 +|Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.7.5-20140531_083030-gandalf 04/01/2014 +| 000000000007a9fc ffff88013987baf8 ffffffff817441c7 0000000000000007 +| 0000000000000000 ffff88013987bb18 ffffffff810eee51 0000000000000000 +| ffff88013fc10200 ffff88013987bb48 ffffffff8174a1c4 000000000007a9fc +|Call Trace: +| [<ffffffff817441c7>] dump_stack+0x4f/0x90 +| [<ffffffff810eee51>] ___might_sleep+0x121/0x1b0 +| [<ffffffff8174a1c4>] rt_spin_lock+0x24/0x60 +| [<ffffffff811a689a>] __free_pages_ok+0xaa/0x540 +| [<ffffffff811a729d>] __free_pages+0x1d/0x30 +| [<ffffffff811eddd5>] __free_slab+0xc5/0x1e0 +| [<ffffffff811edf46>] free_delayed+0x56/0x70 +| [<ffffffff811eecfd>] put_cpu_partial+0x14d/0x220 +| [<ffffffff811efc98>] __slab_free+0x158/0x2c0 +| [<ffffffff811f0021>] kmem_cache_free+0x221/0x2d0 +| [<ffffffff81204d0c>] file_free_rcu+0x2c/0x40 +| [<ffffffff8112c7e3>] rcu_nocb_kthread+0x243/0x5d0 +| [<ffffffff810e951c>] kthread+0xfc/0x120 +| [<ffffffff8174abc8>] ret_from_fork+0x58/0x90 + +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + init/Kconfig | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/init/Kconfig b/init/Kconfig +index 7e9578a0176f..c5f276d782de 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -2011,7 +2011,7 @@ config SHUFFLE_PAGE_ALLOCATOR + + config SLUB_CPU_PARTIAL + default y +- depends on SLUB && SMP ++ depends on SLUB && SMP && !PREEMPT_RT + bool "SLUB per cpu partial cache" + help + Per cpu partial caches accelerate objects allocation and freeing +-- +2.43.0 + diff --git a/debian/patches-rt/0207-mm-memcontrol-Provide-a-local_lock-for-per-CPU-memcg.patch b/debian/patches-rt/0207-mm-memcontrol-Provide-a-local_lock-for-per-CPU-memcg.patch new file mode 100644 index 000000000..73fbdfd8d --- /dev/null +++ b/debian/patches-rt/0207-mm-memcontrol-Provide-a-local_lock-for-per-CPU-memcg.patch @@ -0,0 +1,144 @@ +From 7a4ec53aa79ebc416eb752e204669e0d4dc53a06 Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Tue, 18 Aug 2020 10:30:00 +0200 +Subject: [PATCH 207/323] mm: memcontrol: Provide a local_lock for per-CPU + memcg_stock +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +The interrupts are disabled to ensure CPU-local access to the per-CPU +variable `memcg_stock'. +As the code inside the interrupt disabled section acquires regular +spinlocks, which are converted to 'sleeping' spinlocks on a PREEMPT_RT +kernel, this conflicts with the RT semantics. + +Convert it to a local_lock which allows RT kernels to substitute them with +a real per CPU lock. On non RT kernels this maps to local_irq_save() as +before, but provides also lockdep coverage of the critical region. +No functional change. + +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + mm/memcontrol.c | 31 ++++++++++++++++++------------- + 1 file changed, 18 insertions(+), 13 deletions(-) + +diff --git a/mm/memcontrol.c b/mm/memcontrol.c +index 49566afaef1c..d2a47428831b 100644 +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -2202,6 +2202,7 @@ void unlock_page_memcg(struct page *page) + EXPORT_SYMBOL(unlock_page_memcg); + + struct memcg_stock_pcp { ++ local_lock_t lock; + struct mem_cgroup *cached; /* this never be root cgroup */ + unsigned int nr_pages; + +@@ -2253,7 +2254,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) + if (nr_pages > MEMCG_CHARGE_BATCH) + return ret; + +- local_irq_save(flags); ++ local_lock_irqsave(&memcg_stock.lock, flags); + + stock = this_cpu_ptr(&memcg_stock); + if (memcg == stock->cached && stock->nr_pages >= nr_pages) { +@@ -2261,7 +2262,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) + ret = true; + } + +- local_irq_restore(flags); ++ local_unlock_irqrestore(&memcg_stock.lock, flags); + + return ret; + } +@@ -2296,14 +2297,14 @@ static void drain_local_stock(struct work_struct *dummy) + * The only protection from memory hotplug vs. drain_stock races is + * that we always operate on local CPU stock here with IRQ disabled + */ +- local_irq_save(flags); ++ local_lock_irqsave(&memcg_stock.lock, flags); + + stock = this_cpu_ptr(&memcg_stock); + drain_obj_stock(stock); + drain_stock(stock); + clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); + +- local_irq_restore(flags); ++ local_unlock_irqrestore(&memcg_stock.lock, flags); + } + + /* +@@ -2315,7 +2316,7 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) + struct memcg_stock_pcp *stock; + unsigned long flags; + +- local_irq_save(flags); ++ local_lock_irqsave(&memcg_stock.lock, flags); + + stock = this_cpu_ptr(&memcg_stock); + if (stock->cached != memcg) { /* reset if necessary */ +@@ -2328,7 +2329,7 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) + if (stock->nr_pages > MEMCG_CHARGE_BATCH) + drain_stock(stock); + +- local_irq_restore(flags); ++ local_unlock_irqrestore(&memcg_stock.lock, flags); + } + + /* +@@ -3137,7 +3138,7 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) + unsigned long flags; + bool ret = false; + +- local_irq_save(flags); ++ local_lock_irqsave(&memcg_stock.lock, flags); + + stock = this_cpu_ptr(&memcg_stock); + if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) { +@@ -3145,7 +3146,7 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) + ret = true; + } + +- local_irq_restore(flags); ++ local_unlock_irqrestore(&memcg_stock.lock, flags); + + return ret; + } +@@ -3212,7 +3213,7 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) + struct memcg_stock_pcp *stock; + unsigned long flags; + +- local_irq_save(flags); ++ local_lock_irqsave(&memcg_stock.lock, flags); + + stock = this_cpu_ptr(&memcg_stock); + if (stock->cached_objcg != objcg) { /* reset if necessary */ +@@ -3226,7 +3227,7 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) + if (stock->nr_bytes > PAGE_SIZE) + drain_obj_stock(stock); + +- local_irq_restore(flags); ++ local_unlock_irqrestore(&memcg_stock.lock, flags); + } + + int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size) +@@ -7161,9 +7162,13 @@ static int __init mem_cgroup_init(void) + cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL, + memcg_hotplug_cpu_dead); + +- for_each_possible_cpu(cpu) +- INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work, +- drain_local_stock); ++ for_each_possible_cpu(cpu) { ++ struct memcg_stock_pcp *stock; ++ ++ stock = per_cpu_ptr(&memcg_stock, cpu); ++ INIT_WORK(&stock->work, drain_local_stock); ++ local_lock_init(&stock->lock); ++ } + + for_each_node(node) { + struct mem_cgroup_tree_per_node *rtpn; +-- +2.43.0 + diff --git a/debian/patches-rt/0208-mm-memcontrol-Don-t-call-schedule_work_on-in-preempt.patch b/debian/patches-rt/0208-mm-memcontrol-Don-t-call-schedule_work_on-in-preempt.patch new file mode 100644 index 000000000..e1de69b43 --- /dev/null +++ b/debian/patches-rt/0208-mm-memcontrol-Don-t-call-schedule_work_on-in-preempt.patch @@ -0,0 +1,74 @@ +From 915ee75a3da5732cf20be8f92b31d3ce33042c5b Mon Sep 17 00:00:00 2001 +From: Yang Shi <yang.shi@windriver.com> +Date: Wed, 30 Oct 2013 11:48:33 -0700 +Subject: [PATCH 208/323] mm/memcontrol: Don't call schedule_work_on in + preemption disabled context +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +The following trace is triggered when running ltp oom test cases: + +BUG: sleeping function called from invalid context at kernel/rtmutex.c:659 +in_atomic(): 1, irqs_disabled(): 0, pid: 17188, name: oom03 +Preemption disabled at:[<ffffffff8112ba70>] mem_cgroup_reclaim+0x90/0xe0 + +CPU: 2 PID: 17188 Comm: oom03 Not tainted 3.10.10-rt3 #2 +Hardware name: Intel Corporation Calpella platform/MATXM-CORE-411-B, BIOS 4.6.3 08/18/2010 +ffff88007684d730 ffff880070df9b58 ffffffff8169918d ffff880070df9b70 +ffffffff8106db31 ffff88007688b4a0 ffff880070df9b88 ffffffff8169d9c0 +ffff88007688b4a0 ffff880070df9bc8 ffffffff81059da1 0000000170df9bb0 +Call Trace: +[<ffffffff8169918d>] dump_stack+0x19/0x1b +[<ffffffff8106db31>] __might_sleep+0xf1/0x170 +[<ffffffff8169d9c0>] rt_spin_lock+0x20/0x50 +[<ffffffff81059da1>] queue_work_on+0x61/0x100 +[<ffffffff8112b361>] drain_all_stock+0xe1/0x1c0 +[<ffffffff8112ba70>] mem_cgroup_reclaim+0x90/0xe0 +[<ffffffff8112beda>] __mem_cgroup_try_charge+0x41a/0xc40 +[<ffffffff810f1c91>] ? release_pages+0x1b1/0x1f0 +[<ffffffff8106f200>] ? sched_exec+0x40/0xb0 +[<ffffffff8112cc87>] mem_cgroup_charge_common+0x37/0x70 +[<ffffffff8112e2c6>] mem_cgroup_newpage_charge+0x26/0x30 +[<ffffffff8110af68>] handle_pte_fault+0x618/0x840 +[<ffffffff8103ecf6>] ? unpin_current_cpu+0x16/0x70 +[<ffffffff81070f94>] ? migrate_enable+0xd4/0x200 +[<ffffffff8110cde5>] handle_mm_fault+0x145/0x1e0 +[<ffffffff810301e1>] __do_page_fault+0x1a1/0x4c0 +[<ffffffff8169c9eb>] ? preempt_schedule_irq+0x4b/0x70 +[<ffffffff8169e3b7>] ? retint_kernel+0x37/0x40 +[<ffffffff8103053e>] do_page_fault+0xe/0x10 +[<ffffffff8169e4c2>] page_fault+0x22/0x30 + +So, to prevent schedule_work_on from being called in preempt disabled context, +replace the pair of get/put_cpu() to get/put_cpu_light(). + +Signed-off-by: Yang Shi <yang.shi@windriver.com> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + mm/memcontrol.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/mm/memcontrol.c b/mm/memcontrol.c +index d2a47428831b..53522a52ff15 100644 +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -2349,7 +2349,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg) + * as well as workers from this path always operate on the local + * per-cpu data. CPU up doesn't touch memcg_stock at all. + */ +- curcpu = get_cpu(); ++ curcpu = get_cpu_light(); + for_each_online_cpu(cpu) { + struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); + struct mem_cgroup *memcg; +@@ -2372,7 +2372,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg) + schedule_work_on(cpu, &stock->work); + } + } +- put_cpu(); ++ put_cpu_light(); + mutex_unlock(&percpu_charge_mutex); + } + +-- +2.43.0 + diff --git a/debian/patches-rt/0209-mm-memcontrol-Replace-local_irq_disable-with-local-l.patch b/debian/patches-rt/0209-mm-memcontrol-Replace-local_irq_disable-with-local-l.patch new file mode 100644 index 000000000..143299bc1 --- /dev/null +++ b/debian/patches-rt/0209-mm-memcontrol-Replace-local_irq_disable-with-local-l.patch @@ -0,0 +1,123 @@ +From aed289fe7a9923b6c8a98fa08048d56aa97c71c4 Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Wed, 28 Jan 2015 17:14:16 +0100 +Subject: [PATCH 209/323] mm/memcontrol: Replace local_irq_disable with local + locks +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +There are a few local_irq_disable() which then take sleeping locks. This +patch converts them local locks. + +[bigeasy: Move unlock after memcg_check_events() in mem_cgroup_swapout(), + pointed out by Matt Fleming <matt@codeblueprint.co.uk>] +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + mm/memcontrol.c | 29 +++++++++++++++++++++-------- + 1 file changed, 21 insertions(+), 8 deletions(-) + +diff --git a/mm/memcontrol.c b/mm/memcontrol.c +index 53522a52ff15..5bf696318643 100644 +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -63,6 +63,7 @@ + #include <net/sock.h> + #include <net/ip.h> + #include "slab.h" ++#include <linux/local_lock.h> + + #include <linux/uaccess.h> + +@@ -93,6 +94,13 @@ bool cgroup_memory_noswap __read_mostly; + static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq); + #endif + ++struct event_lock { ++ local_lock_t l; ++}; ++static DEFINE_PER_CPU(struct event_lock, event_lock) = { ++ .l = INIT_LOCAL_LOCK(l), ++}; ++ + /* Whether legacy memory+swap accounting is active */ + static bool do_memsw_account(void) + { +@@ -5747,12 +5755,12 @@ static int mem_cgroup_move_account(struct page *page, + + ret = 0; + +- local_irq_disable(); ++ local_lock_irq(&event_lock.l); + mem_cgroup_charge_statistics(to, page, nr_pages); + memcg_check_events(to, page); + mem_cgroup_charge_statistics(from, page, -nr_pages); + memcg_check_events(from, page); +- local_irq_enable(); ++ local_unlock_irq(&event_lock.l); + out_unlock: + unlock_page(page); + out: +@@ -6822,10 +6830,10 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) + css_get(&memcg->css); + commit_charge(page, memcg); + +- local_irq_disable(); ++ local_lock_irq(&event_lock.l); + mem_cgroup_charge_statistics(memcg, page, nr_pages); + memcg_check_events(memcg, page); +- local_irq_enable(); ++ local_unlock_irq(&event_lock.l); + + /* + * Cgroup1's unified memory+swap counter has been charged with the +@@ -6881,11 +6889,11 @@ static void uncharge_batch(const struct uncharge_gather *ug) + memcg_oom_recover(ug->memcg); + } + +- local_irq_save(flags); ++ local_lock_irqsave(&event_lock.l, flags); + __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout); + __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_pages); + memcg_check_events(ug->memcg, ug->dummy_page); +- local_irq_restore(flags); ++ local_unlock_irqrestore(&event_lock.l, flags); + + /* drop reference from uncharge_page */ + css_put(&ug->memcg->css); +@@ -7039,10 +7047,10 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage) + css_get(&memcg->css); + commit_charge(newpage, memcg); + +- local_irq_save(flags); ++ local_lock_irqsave(&event_lock.l, flags); + mem_cgroup_charge_statistics(memcg, newpage, nr_pages); + memcg_check_events(memcg, newpage); +- local_irq_restore(flags); ++ local_unlock_irqrestore(&event_lock.l, flags); + } + + DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key); +@@ -7217,6 +7225,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) + struct mem_cgroup *memcg, *swap_memcg; + unsigned int nr_entries; + unsigned short oldid; ++ unsigned long flags; + + VM_BUG_ON_PAGE(PageLRU(page), page); + VM_BUG_ON_PAGE(page_count(page), page); +@@ -7262,9 +7271,13 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) + * important here to have the interrupts disabled because it is the + * only synchronisation we have for updating the per-CPU variables. + */ ++ local_lock_irqsave(&event_lock.l, flags); ++#ifndef CONFIG_PREEMPT_RT + VM_BUG_ON(!irqs_disabled()); ++#endif + mem_cgroup_charge_statistics(memcg, page, -nr_entries); + memcg_check_events(memcg, page); ++ local_unlock_irqrestore(&event_lock.l, flags); + + css_put(&memcg->css); + } +-- +2.43.0 + diff --git a/debian/patches-rt/0210-mm-zsmalloc-copy-with-get_cpu_var-and-locking.patch b/debian/patches-rt/0210-mm-zsmalloc-copy-with-get_cpu_var-and-locking.patch new file mode 100644 index 000000000..f48595a8d --- /dev/null +++ b/debian/patches-rt/0210-mm-zsmalloc-copy-with-get_cpu_var-and-locking.patch @@ -0,0 +1,212 @@ +From 7e86ac4a42c020de9b35b02a3ef02f2fc0dee2ed Mon Sep 17 00:00:00 2001 +From: Mike Galbraith <umgwanakikbuti@gmail.com> +Date: Tue, 22 Mar 2016 11:16:09 +0100 +Subject: [PATCH 210/323] mm/zsmalloc: copy with get_cpu_var() and locking +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +get_cpu_var() disables preemption and triggers a might_sleep() splat later. +This is replaced with get_locked_var(). +This bitspinlocks are replaced with a proper mutex which requires a slightly +larger struct to allocate. + +Signed-off-by: Mike Galbraith <umgwanakikbuti@gmail.com> +[bigeasy: replace the bitspin_lock() with a mutex, get_locked_var(). Mike then +fixed the size magic] +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + mm/zsmalloc.c | 85 +++++++++++++++++++++++++++++++++++++++++++++++---- + 1 file changed, 79 insertions(+), 6 deletions(-) + +diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c +index c18dc8e61d35..7dad2ff3e778 100644 +--- a/mm/zsmalloc.c ++++ b/mm/zsmalloc.c +@@ -57,6 +57,7 @@ + #include <linux/wait.h> + #include <linux/pagemap.h> + #include <linux/fs.h> ++#include <linux/local_lock.h> + + #define ZSPAGE_MAGIC 0x58 + +@@ -77,6 +78,20 @@ + + #define ZS_HANDLE_SIZE (sizeof(unsigned long)) + ++#ifdef CONFIG_PREEMPT_RT ++ ++struct zsmalloc_handle { ++ unsigned long addr; ++ struct mutex lock; ++}; ++ ++#define ZS_HANDLE_ALLOC_SIZE (sizeof(struct zsmalloc_handle)) ++ ++#else ++ ++#define ZS_HANDLE_ALLOC_SIZE (sizeof(unsigned long)) ++#endif ++ + /* + * Object location (<PFN>, <obj_idx>) is encoded as + * a single (unsigned long) handle value. +@@ -293,6 +308,7 @@ struct zspage { + }; + + struct mapping_area { ++ local_lock_t lock; + char *vm_buf; /* copy buffer for objects that span pages */ + char *vm_addr; /* address of kmap_atomic()'ed pages */ + enum zs_mapmode vm_mm; /* mapping mode */ +@@ -322,7 +338,7 @@ static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) {} + + static int create_cache(struct zs_pool *pool) + { +- pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE, ++ pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_ALLOC_SIZE, + 0, 0, NULL); + if (!pool->handle_cachep) + return 1; +@@ -346,9 +362,26 @@ static void destroy_cache(struct zs_pool *pool) + + static unsigned long cache_alloc_handle(struct zs_pool *pool, gfp_t gfp) + { +- return (unsigned long)kmem_cache_alloc(pool->handle_cachep, +- gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE)); ++ void *p; ++ ++ p = kmem_cache_alloc(pool->handle_cachep, ++ gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE)); ++#ifdef CONFIG_PREEMPT_RT ++ if (p) { ++ struct zsmalloc_handle *zh = p; ++ ++ mutex_init(&zh->lock); ++ } ++#endif ++ return (unsigned long)p; ++} ++ ++#ifdef CONFIG_PREEMPT_RT ++static struct zsmalloc_handle *zs_get_pure_handle(unsigned long handle) ++{ ++ return (void *)(handle &~((1 << OBJ_TAG_BITS) - 1)); + } ++#endif + + static void cache_free_handle(struct zs_pool *pool, unsigned long handle) + { +@@ -368,12 +401,18 @@ static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage) + + static void record_obj(unsigned long handle, unsigned long obj) + { ++#ifdef CONFIG_PREEMPT_RT ++ struct zsmalloc_handle *zh = zs_get_pure_handle(handle); ++ ++ WRITE_ONCE(zh->addr, obj); ++#else + /* + * lsb of @obj represents handle lock while other bits + * represent object value the handle is pointing so + * updating shouldn't do store tearing. + */ + WRITE_ONCE(*(unsigned long *)handle, obj); ++#endif + } + + /* zpool driver */ +@@ -455,7 +494,10 @@ MODULE_ALIAS("zpool-zsmalloc"); + #endif /* CONFIG_ZPOOL */ + + /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ +-static DEFINE_PER_CPU(struct mapping_area, zs_map_area); ++static DEFINE_PER_CPU(struct mapping_area, zs_map_area) = { ++ /* XXX remove this and use a spin_lock_t in pin_tag() */ ++ .lock = INIT_LOCAL_LOCK(lock), ++}; + + static bool is_zspage_isolated(struct zspage *zspage) + { +@@ -865,7 +907,13 @@ static unsigned long location_to_obj(struct page *page, unsigned int obj_idx) + + static unsigned long handle_to_obj(unsigned long handle) + { ++#ifdef CONFIG_PREEMPT_RT ++ struct zsmalloc_handle *zh = zs_get_pure_handle(handle); ++ ++ return zh->addr; ++#else + return *(unsigned long *)handle; ++#endif + } + + static unsigned long obj_to_head(struct page *page, void *obj) +@@ -879,22 +927,46 @@ static unsigned long obj_to_head(struct page *page, void *obj) + + static inline int testpin_tag(unsigned long handle) + { ++#ifdef CONFIG_PREEMPT_RT ++ struct zsmalloc_handle *zh = zs_get_pure_handle(handle); ++ ++ return mutex_is_locked(&zh->lock); ++#else + return bit_spin_is_locked(HANDLE_PIN_BIT, (unsigned long *)handle); ++#endif + } + + static inline int trypin_tag(unsigned long handle) + { ++#ifdef CONFIG_PREEMPT_RT ++ struct zsmalloc_handle *zh = zs_get_pure_handle(handle); ++ ++ return mutex_trylock(&zh->lock); ++#else + return bit_spin_trylock(HANDLE_PIN_BIT, (unsigned long *)handle); ++#endif + } + + static void pin_tag(unsigned long handle) __acquires(bitlock) + { ++#ifdef CONFIG_PREEMPT_RT ++ struct zsmalloc_handle *zh = zs_get_pure_handle(handle); ++ ++ return mutex_lock(&zh->lock); ++#else + bit_spin_lock(HANDLE_PIN_BIT, (unsigned long *)handle); ++#endif + } + + static void unpin_tag(unsigned long handle) __releases(bitlock) + { ++#ifdef CONFIG_PREEMPT_RT ++ struct zsmalloc_handle *zh = zs_get_pure_handle(handle); ++ ++ return mutex_unlock(&zh->lock); ++#else + bit_spin_unlock(HANDLE_PIN_BIT, (unsigned long *)handle); ++#endif + } + + static void reset_page(struct page *page) +@@ -1278,7 +1350,8 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, + class = pool->size_class[class_idx]; + off = (class->size * obj_idx) & ~PAGE_MASK; + +- area = &get_cpu_var(zs_map_area); ++ local_lock(&zs_map_area.lock); ++ area = this_cpu_ptr(&zs_map_area); + area->vm_mm = mm; + if (off + class->size <= PAGE_SIZE) { + /* this object is contained entirely within a page */ +@@ -1332,7 +1405,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) + + __zs_unmap_object(area, pages, off, class->size); + } +- put_cpu_var(zs_map_area); ++ local_unlock(&zs_map_area.lock); + + migrate_read_unlock(zspage); + unpin_tag(handle); +-- +2.43.0 + diff --git a/debian/patches-rt/0211-mm-zswap-Use-local-lock-to-protect-per-CPU-data.patch b/debian/patches-rt/0211-mm-zswap-Use-local-lock-to-protect-per-CPU-data.patch new file mode 100644 index 000000000..772cff44a --- /dev/null +++ b/debian/patches-rt/0211-mm-zswap-Use-local-lock-to-protect-per-CPU-data.patch @@ -0,0 +1,150 @@ +From 5bfa3eb76d50818c90253df970e7afbc747211c2 Mon Sep 17 00:00:00 2001 +From: "Luis Claudio R. Goncalves" <lgoncalv@redhat.com> +Date: Tue, 25 Jun 2019 11:28:04 -0300 +Subject: [PATCH 211/323] mm/zswap: Use local lock to protect per-CPU data +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +zwap uses per-CPU compression. The per-CPU data pointer is acquired with +get_cpu_ptr() which implicitly disables preemption. It allocates +memory inside the preempt disabled region which conflicts with the +PREEMPT_RT semantics. + +Replace the implicit preemption control with an explicit local lock. +This allows RT kernels to substitute it with a real per CPU lock, which +serializes the access but keeps the code section preemptible. On non RT +kernels this maps to preempt_disable() as before, i.e. no functional +change. + +[bigeasy: Use local_lock(), additional hunks, patch description] + +Cc: Seth Jennings <sjenning@redhat.com> +Cc: Dan Streetman <ddstreet@ieee.org> +Cc: Vitaly Wool <vitaly.wool@konsulko.com> +Cc: Andrew Morton <akpm@linux-foundation.org> +Cc: linux-mm@kvack.org +Signed-off-by: Luis Claudio R. Goncalves <lgoncalv@redhat.com> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + mm/zswap.c | 43 ++++++++++++++++++++++++++++--------------- + 1 file changed, 28 insertions(+), 15 deletions(-) + +diff --git a/mm/zswap.c b/mm/zswap.c +index fbb782924ccc..b24f761b9241 100644 +--- a/mm/zswap.c ++++ b/mm/zswap.c +@@ -18,6 +18,7 @@ + #include <linux/highmem.h> + #include <linux/slab.h> + #include <linux/spinlock.h> ++#include <linux/local_lock.h> + #include <linux/types.h> + #include <linux/atomic.h> + #include <linux/frontswap.h> +@@ -387,27 +388,37 @@ static struct zswap_entry *zswap_entry_find_get(struct rb_root *root, + /********************************* + * per-cpu code + **********************************/ +-static DEFINE_PER_CPU(u8 *, zswap_dstmem); ++struct zswap_comp { ++ /* Used for per-CPU dstmem and tfm */ ++ local_lock_t lock; ++ u8 *dstmem; ++}; ++ ++static DEFINE_PER_CPU(struct zswap_comp, zswap_comp) = { ++ .lock = INIT_LOCAL_LOCK(lock), ++}; + + static int zswap_dstmem_prepare(unsigned int cpu) + { ++ struct zswap_comp *zcomp; + u8 *dst; + + dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); + if (!dst) + return -ENOMEM; + +- per_cpu(zswap_dstmem, cpu) = dst; ++ zcomp = per_cpu_ptr(&zswap_comp, cpu); ++ zcomp->dstmem = dst; + return 0; + } + + static int zswap_dstmem_dead(unsigned int cpu) + { +- u8 *dst; ++ struct zswap_comp *zcomp; + +- dst = per_cpu(zswap_dstmem, cpu); +- kfree(dst); +- per_cpu(zswap_dstmem, cpu) = NULL; ++ zcomp = per_cpu_ptr(&zswap_comp, cpu); ++ kfree(zcomp->dstmem); ++ zcomp->dstmem = NULL; + + return 0; + } +@@ -919,10 +930,11 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) + dlen = PAGE_SIZE; + src = (u8 *)zhdr + sizeof(struct zswap_header); + dst = kmap_atomic(page); +- tfm = *get_cpu_ptr(entry->pool->tfm); ++ local_lock(&zswap_comp.lock); ++ tfm = *this_cpu_ptr(entry->pool->tfm); + ret = crypto_comp_decompress(tfm, src, entry->length, + dst, &dlen); +- put_cpu_ptr(entry->pool->tfm); ++ local_unlock(&zswap_comp.lock); + kunmap_atomic(dst); + BUG_ON(ret); + BUG_ON(dlen != PAGE_SIZE); +@@ -1074,12 +1086,12 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, + } + + /* compress */ +- dst = get_cpu_var(zswap_dstmem); +- tfm = *get_cpu_ptr(entry->pool->tfm); ++ local_lock(&zswap_comp.lock); ++ dst = *this_cpu_ptr(&zswap_comp.dstmem); ++ tfm = *this_cpu_ptr(entry->pool->tfm); + src = kmap_atomic(page); + ret = crypto_comp_compress(tfm, src, PAGE_SIZE, dst, &dlen); + kunmap_atomic(src); +- put_cpu_ptr(entry->pool->tfm); + if (ret) { + ret = -EINVAL; + goto put_dstmem; +@@ -1103,7 +1115,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, + memcpy(buf, &zhdr, hlen); + memcpy(buf + hlen, dst, dlen); + zpool_unmap_handle(entry->pool->zpool, handle); +- put_cpu_var(zswap_dstmem); ++ local_unlock(&zswap_comp.lock); + + /* populate entry */ + entry->offset = offset; +@@ -1131,7 +1143,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, + return 0; + + put_dstmem: +- put_cpu_var(zswap_dstmem); ++ local_unlock(&zswap_comp.lock); + zswap_pool_put(entry->pool); + freepage: + zswap_entry_cache_free(entry); +@@ -1176,9 +1188,10 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, + if (zpool_evictable(entry->pool->zpool)) + src += sizeof(struct zswap_header); + dst = kmap_atomic(page); +- tfm = *get_cpu_ptr(entry->pool->tfm); ++ local_lock(&zswap_comp.lock); ++ tfm = *this_cpu_ptr(entry->pool->tfm); + ret = crypto_comp_decompress(tfm, src, entry->length, dst, &dlen); +- put_cpu_ptr(entry->pool->tfm); ++ local_unlock(&zswap_comp.lock); + kunmap_atomic(dst); + zpool_unmap_handle(entry->pool->zpool, entry->handle); + BUG_ON(ret); +-- +2.43.0 + diff --git a/debian/patches-rt/0212-x86-kvm-Require-const-tsc-for-RT.patch b/debian/patches-rt/0212-x86-kvm-Require-const-tsc-for-RT.patch new file mode 100644 index 000000000..461bb6d81 --- /dev/null +++ b/debian/patches-rt/0212-x86-kvm-Require-const-tsc-for-RT.patch @@ -0,0 +1,38 @@ +From 6ef3a23d8f37d55fa94b9f1212830a3b8018c76f Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Sun, 6 Nov 2011 12:26:18 +0100 +Subject: [PATCH 212/323] x86: kvm Require const tsc for RT +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Non constant TSC is a nightmare on bare metal already, but with +virtualization it becomes a complete disaster because the workarounds +are horrible latency wise. That's also a preliminary for running RT in +a guest on top of a RT host. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +--- + arch/x86/kvm/x86.c | 8 ++++++++ + 1 file changed, 8 insertions(+) + +diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c +index 13e4699a0744..21a1aa76eb23 100644 +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -8167,6 +8167,14 @@ int kvm_arch_init(void *opaque) + goto out; + } + ++#ifdef CONFIG_PREEMPT_RT ++ if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { ++ pr_err("RT requires X86_FEATURE_CONSTANT_TSC\n"); ++ r = -EOPNOTSUPP; ++ goto out; ++ } ++#endif ++ + r = -ENOMEM; + x86_fpu_cache = kmem_cache_create("x86_fpu", sizeof(struct fpu), + __alignof__(struct fpu), SLAB_ACCOUNT, +-- +2.43.0 + diff --git a/debian/patches-rt/0213-wait.h-include-atomic.h.patch b/debian/patches-rt/0213-wait.h-include-atomic.h.patch new file mode 100644 index 000000000..1d060be29 --- /dev/null +++ b/debian/patches-rt/0213-wait.h-include-atomic.h.patch @@ -0,0 +1,42 @@ +From 25ddd3a9a10e8b30e5c6ff85e57532a5f20ffa9d Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Mon, 28 Oct 2013 12:19:57 +0100 +Subject: [PATCH 213/323] wait.h: include atomic.h +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +| CC init/main.o +|In file included from include/linux/mmzone.h:9:0, +| from include/linux/gfp.h:4, +| from include/linux/kmod.h:22, +| from include/linux/module.h:13, +| from init/main.c:15: +|include/linux/wait.h: In function ‘wait_on_atomic_t’: +|include/linux/wait.h:982:2: error: implicit declaration of function ‘atomic_read’ [-Werror=implicit-function-declaration] +| if (atomic_read(val) == 0) +| ^ + +This pops up on ARM. Non-RT gets its atomic.h include from spinlock.h + +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/linux/wait.h | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/include/linux/wait.h b/include/linux/wait.h +index 1663e47681a3..20aae69387aa 100644 +--- a/include/linux/wait.h ++++ b/include/linux/wait.h +@@ -10,6 +10,7 @@ + + #include <asm/current.h> + #include <uapi/linux/wait.h> ++#include <linux/atomic.h> + + typedef struct wait_queue_entry wait_queue_entry_t; + +-- +2.43.0 + diff --git a/debian/patches-rt/0214-sched-Limit-the-number-of-task-migrations-per-batch.patch b/debian/patches-rt/0214-sched-Limit-the-number-of-task-migrations-per-batch.patch new file mode 100644 index 000000000..c157da12c --- /dev/null +++ b/debian/patches-rt/0214-sched-Limit-the-number-of-task-migrations-per-batch.patch @@ -0,0 +1,33 @@ +From a178dbd2a3f8eeedf39cde9582a29bc95f335b17 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Mon, 6 Jun 2011 12:12:51 +0200 +Subject: [PATCH 214/323] sched: Limit the number of task migrations per batch +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Put an upper limit on the number of tasks which are migrated per batch +to avoid large latencies. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +--- + kernel/sched/core.c | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index aaeed4b14278..3d7e9ecbd376 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -65,7 +65,11 @@ const_debug unsigned int sysctl_sched_features = + * Number of tasks to iterate in a single balance run. + * Limited because this is done with IRQs disabled. + */ ++#ifdef CONFIG_PREEMPT_RT ++const_debug unsigned int sysctl_sched_nr_migrate = 8; ++#else + const_debug unsigned int sysctl_sched_nr_migrate = 32; ++#endif + + /* + * period over which we measure -rt task CPU usage in us. +-- +2.43.0 + diff --git a/debian/patches-rt/0215-sched-Move-mmdrop-to-RCU-on-RT.patch b/debian/patches-rt/0215-sched-Move-mmdrop-to-RCU-on-RT.patch new file mode 100644 index 000000000..caf0a9bc6 --- /dev/null +++ b/debian/patches-rt/0215-sched-Move-mmdrop-to-RCU-on-RT.patch @@ -0,0 +1,115 @@ +From e29feec1bad5cc39575600e68d6f5523b17c189a Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Mon, 6 Jun 2011 12:20:33 +0200 +Subject: [PATCH 215/323] sched: Move mmdrop to RCU on RT +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Takes sleeping locks and calls into the memory allocator, so nothing +we want to do in task switch and oder atomic contexts. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +--- + include/linux/mm_types.h | 4 ++++ + include/linux/sched/mm.h | 11 +++++++++++ + kernel/fork.c | 13 +++++++++++++ + kernel/sched/core.c | 7 ++++++- + 4 files changed, 34 insertions(+), 1 deletion(-) + +diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h +index 4eb38918da8f..2a83d965b00e 100644 +--- a/include/linux/mm_types.h ++++ b/include/linux/mm_types.h +@@ -12,6 +12,7 @@ + #include <linux/completion.h> + #include <linux/cpumask.h> + #include <linux/uprobes.h> ++#include <linux/rcupdate.h> + #include <linux/page-flags-layout.h> + #include <linux/workqueue.h> + #include <linux/seqlock.h> +@@ -570,6 +571,9 @@ struct mm_struct { + bool tlb_flush_batched; + #endif + struct uprobes_state uprobes_state; ++#ifdef CONFIG_PREEMPT_RT ++ struct rcu_head delayed_drop; ++#endif + #ifdef CONFIG_HUGETLB_PAGE + atomic_long_t hugetlb_usage; + #endif +diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h +index e3e5e149b00e..6d39ad0f55bb 100644 +--- a/include/linux/sched/mm.h ++++ b/include/linux/sched/mm.h +@@ -49,6 +49,17 @@ static inline void mmdrop(struct mm_struct *mm) + __mmdrop(mm); + } + ++#ifdef CONFIG_PREEMPT_RT ++extern void __mmdrop_delayed(struct rcu_head *rhp); ++static inline void mmdrop_delayed(struct mm_struct *mm) ++{ ++ if (atomic_dec_and_test(&mm->mm_count)) ++ call_rcu(&mm->delayed_drop, __mmdrop_delayed); ++} ++#else ++# define mmdrop_delayed(mm) mmdrop(mm) ++#endif ++ + /** + * mmget() - Pin the address space associated with a &struct mm_struct. + * @mm: The address space to pin. +diff --git a/kernel/fork.c b/kernel/fork.c +index 32b9d7205ac1..078058436f2f 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -691,6 +691,19 @@ void __mmdrop(struct mm_struct *mm) + } + EXPORT_SYMBOL_GPL(__mmdrop); + ++#ifdef CONFIG_PREEMPT_RT ++/* ++ * RCU callback for delayed mm drop. Not strictly rcu, but we don't ++ * want another facility to make this work. ++ */ ++void __mmdrop_delayed(struct rcu_head *rhp) ++{ ++ struct mm_struct *mm = container_of(rhp, struct mm_struct, delayed_drop); ++ ++ __mmdrop(mm); ++} ++#endif ++ + static void mmdrop_async_fn(struct work_struct *work) + { + struct mm_struct *mm; +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 3d7e9ecbd376..5e5d28f5d970 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -4269,9 +4269,13 @@ static struct rq *finish_task_switch(struct task_struct *prev) + * provided by mmdrop(), + * - a sync_core for SYNC_CORE. + */ ++ /* ++ * We use mmdrop_delayed() here so we don't have to do the ++ * full __mmdrop() when we are the last user. ++ */ + if (mm) { + membarrier_mm_sync_core_before_usermode(mm); +- mmdrop(mm); ++ mmdrop_delayed(mm); + } + if (unlikely(prev_state == TASK_DEAD)) { + if (prev->sched_class->task_dead) +@@ -7264,6 +7268,7 @@ void sched_setnuma(struct task_struct *p, int nid) + #endif /* CONFIG_NUMA_BALANCING */ + + #ifdef CONFIG_HOTPLUG_CPU ++ + /* + * Ensure that the idle task is using init_mm right before its CPU goes + * offline. +-- +2.43.0 + diff --git a/debian/patches-rt/0216-kernel-sched-move-stack-kprobe-clean-up-to-__put_tas.patch b/debian/patches-rt/0216-kernel-sched-move-stack-kprobe-clean-up-to-__put_tas.patch new file mode 100644 index 000000000..c420a7438 --- /dev/null +++ b/debian/patches-rt/0216-kernel-sched-move-stack-kprobe-clean-up-to-__put_tas.patch @@ -0,0 +1,81 @@ +From 0d19bdfa59a3f9976aacac9944e5c2519f1a508b Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Mon, 21 Nov 2016 19:31:08 +0100 +Subject: [PATCH 216/323] kernel/sched: move stack + kprobe clean up to + __put_task_struct() +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +There is no need to free the stack before the task struct (except for reasons +mentioned in commit 68f24b08ee89 ("sched/core: Free the stack early if +CONFIG_THREAD_INFO_IN_TASK")). This also comes handy on -RT because we can't +free memory in preempt disabled region. +vfree_atomic() delays the memory cleanup to a worker. Since we move everything +to the RCU callback, we can also free it immediately. + +Cc: stable-rt@vger.kernel.org #for kprobe_flush_task() +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + kernel/fork.c | 12 +++++++++++- + kernel/sched/core.c | 9 --------- + 2 files changed, 11 insertions(+), 10 deletions(-) + +diff --git a/kernel/fork.c b/kernel/fork.c +index 078058436f2f..2a11bf5f9e30 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -42,6 +42,7 @@ + #include <linux/mmu_notifier.h> + #include <linux/fs.h> + #include <linux/mm.h> ++#include <linux/kprobes.h> + #include <linux/vmacache.h> + #include <linux/nsproxy.h> + #include <linux/capability.h> +@@ -288,7 +289,7 @@ static inline void free_thread_stack(struct task_struct *tsk) + return; + } + +- vfree_atomic(tsk->stack); ++ vfree(tsk->stack); + return; + } + #endif +@@ -745,6 +746,15 @@ void __put_task_struct(struct task_struct *tsk) + WARN_ON(refcount_read(&tsk->usage)); + WARN_ON(tsk == current); + ++ /* ++ * Remove function-return probe instances associated with this ++ * task and put them back on the free list. ++ */ ++ kprobe_flush_task(tsk); ++ ++ /* Task is done with its stack. */ ++ put_task_stack(tsk); ++ + io_uring_free(tsk); + cgroup_free(tsk); + task_numa_free(tsk, true); +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 5e5d28f5d970..81bc9efbd191 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -4281,15 +4281,6 @@ static struct rq *finish_task_switch(struct task_struct *prev) + if (prev->sched_class->task_dead) + prev->sched_class->task_dead(prev); + +- /* +- * Remove function-return probe instances associated with this +- * task and put them back on the free list. +- */ +- kprobe_flush_task(prev); +- +- /* Task is done with its stack. */ +- put_task_stack(prev); +- + put_task_struct_rcu_user(prev); + } + +-- +2.43.0 + diff --git a/debian/patches-rt/0217-sched-Do-not-account-rcu_preempt_depth-on-RT-in-migh.patch b/debian/patches-rt/0217-sched-Do-not-account-rcu_preempt_depth-on-RT-in-migh.patch new file mode 100644 index 000000000..71686f72d --- /dev/null +++ b/debian/patches-rt/0217-sched-Do-not-account-rcu_preempt_depth-on-RT-in-migh.patch @@ -0,0 +1,57 @@ +From 55d4bd8636745b8d1b987b6561ae2b14cebace78 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Tue, 7 Jun 2011 09:19:06 +0200 +Subject: [PATCH 217/323] sched: Do not account rcu_preempt_depth on RT in + might_sleep() +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +RT changes the rcu_preempt_depth semantics, so we cannot check for it +in might_sleep(). + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +--- + include/linux/rcupdate.h | 7 +++++++ + kernel/sched/core.c | 2 +- + 2 files changed, 8 insertions(+), 1 deletion(-) + +diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h +index bb9681309e0d..cbeea5c61a3a 100644 +--- a/include/linux/rcupdate.h ++++ b/include/linux/rcupdate.h +@@ -54,6 +54,11 @@ void __rcu_read_unlock(void); + * types of kernel builds, the rcu_read_lock() nesting depth is unknowable. + */ + #define rcu_preempt_depth() (current->rcu_read_lock_nesting) ++#ifndef CONFIG_PREEMPT_RT ++#define sched_rcu_preempt_depth() rcu_preempt_depth() ++#else ++static inline int sched_rcu_preempt_depth(void) { return 0; } ++#endif + + #else /* #ifdef CONFIG_PREEMPT_RCU */ + +@@ -79,6 +84,8 @@ static inline int rcu_preempt_depth(void) + return 0; + } + ++#define sched_rcu_preempt_depth() rcu_preempt_depth() ++ + #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ + + /* Internal to kernel */ +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 81bc9efbd191..2f689b4fa68b 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -7878,7 +7878,7 @@ void __init sched_init(void) + #ifdef CONFIG_DEBUG_ATOMIC_SLEEP + static inline int preempt_count_equals(int preempt_offset) + { +- int nested = preempt_count() + rcu_preempt_depth(); ++ int nested = preempt_count() + sched_rcu_preempt_depth(); + + return (nested == preempt_offset); + } +-- +2.43.0 + diff --git a/debian/patches-rt/0218-sched-Disable-TTWU_QUEUE-on-RT.patch b/debian/patches-rt/0218-sched-Disable-TTWU_QUEUE-on-RT.patch new file mode 100644 index 000000000..77eb3b029 --- /dev/null +++ b/debian/patches-rt/0218-sched-Disable-TTWU_QUEUE-on-RT.patch @@ -0,0 +1,38 @@ +From 5b867d72795af288909e6370aabef927a70c8d78 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Tue, 13 Sep 2011 16:42:35 +0200 +Subject: [PATCH 218/323] sched: Disable TTWU_QUEUE on RT +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +The queued remote wakeup mechanism can introduce rather large +latencies if the number of migrated tasks is high. Disable it for RT. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +--- + kernel/sched/features.h | 5 +++++ + 1 file changed, 5 insertions(+) + +diff --git a/kernel/sched/features.h b/kernel/sched/features.h +index f1bf5e12d889..402fd37fb340 100644 +--- a/kernel/sched/features.h ++++ b/kernel/sched/features.h +@@ -45,11 +45,16 @@ SCHED_FEAT(DOUBLE_TICK, false) + */ + SCHED_FEAT(NONTASK_CAPACITY, true) + ++#ifdef CONFIG_PREEMPT_RT ++SCHED_FEAT(TTWU_QUEUE, false) ++#else ++ + /* + * Queue remote wakeups on the target CPU and process them + * using the scheduler IPI. Reduces rq->lock contention/bounces. + */ + SCHED_FEAT(TTWU_QUEUE, true) ++#endif + + /* + * When doing wakeups, attempt to limit superfluous scans of the LLC domain. +-- +2.43.0 + diff --git a/debian/patches-rt/0219-softirq-Check-preemption-after-reenabling-interrupts.patch b/debian/patches-rt/0219-softirq-Check-preemption-after-reenabling-interrupts.patch new file mode 100644 index 000000000..173c118c2 --- /dev/null +++ b/debian/patches-rt/0219-softirq-Check-preemption-after-reenabling-interrupts.patch @@ -0,0 +1,151 @@ +From c63275b524df4d3556c7cf1690bc874d4a8cd72e Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Sun, 13 Nov 2011 17:17:09 +0100 +Subject: [PATCH 219/323] softirq: Check preemption after reenabling interrupts +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +raise_softirq_irqoff() disables interrupts and wakes the softirq +daemon, but after reenabling interrupts there is no preemption check, +so the execution of the softirq thread might be delayed arbitrarily. + +In principle we could add that check to local_irq_enable/restore, but +that's overkill as the rasie_softirq_irqoff() sections are the only +ones which show this behaviour. + +Reported-by: Carsten Emde <cbe@osadl.org> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +--- + include/linux/preempt.h | 3 +++ + lib/irq_poll.c | 5 +++++ + net/core/dev.c | 7 +++++++ + 3 files changed, 15 insertions(+) + +diff --git a/include/linux/preempt.h b/include/linux/preempt.h +index 5ceac863e729..fb140e00f74d 100644 +--- a/include/linux/preempt.h ++++ b/include/linux/preempt.h +@@ -190,8 +190,10 @@ do { \ + + #ifdef CONFIG_PREEMPT_RT + # define preempt_enable_no_resched() sched_preempt_enable_no_resched() ++# define preempt_check_resched_rt() preempt_check_resched() + #else + # define preempt_enable_no_resched() preempt_enable() ++# define preempt_check_resched_rt() barrier(); + #endif + + #define preemptible() (preempt_count() == 0 && !irqs_disabled()) +@@ -262,6 +264,7 @@ do { \ + #define preempt_disable_notrace() barrier() + #define preempt_enable_no_resched_notrace() barrier() + #define preempt_enable_notrace() barrier() ++#define preempt_check_resched_rt() barrier() + #define preemptible() 0 + + #endif /* CONFIG_PREEMPT_COUNT */ +diff --git a/lib/irq_poll.c b/lib/irq_poll.c +index 2f17b488d58e..7557bf7ecf1f 100644 +--- a/lib/irq_poll.c ++++ b/lib/irq_poll.c +@@ -37,6 +37,7 @@ void irq_poll_sched(struct irq_poll *iop) + list_add_tail(&iop->list, this_cpu_ptr(&blk_cpu_iopoll)); + raise_softirq_irqoff(IRQ_POLL_SOFTIRQ); + local_irq_restore(flags); ++ preempt_check_resched_rt(); + } + EXPORT_SYMBOL(irq_poll_sched); + +@@ -72,6 +73,7 @@ void irq_poll_complete(struct irq_poll *iop) + local_irq_save(flags); + __irq_poll_complete(iop); + local_irq_restore(flags); ++ preempt_check_resched_rt(); + } + EXPORT_SYMBOL(irq_poll_complete); + +@@ -96,6 +98,7 @@ static void __latent_entropy irq_poll_softirq(struct softirq_action *h) + } + + local_irq_enable(); ++ preempt_check_resched_rt(); + + /* Even though interrupts have been re-enabled, this + * access is safe because interrupts can only add new +@@ -133,6 +136,7 @@ static void __latent_entropy irq_poll_softirq(struct softirq_action *h) + __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ); + + local_irq_enable(); ++ preempt_check_resched_rt(); + } + + /** +@@ -196,6 +200,7 @@ static int irq_poll_cpu_dead(unsigned int cpu) + this_cpu_ptr(&blk_cpu_iopoll)); + __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ); + local_irq_enable(); ++ preempt_check_resched_rt(); + + return 0; + } +diff --git a/net/core/dev.c b/net/core/dev.c +index 3fc27b52bf42..3f23ac2ad497 100644 +--- a/net/core/dev.c ++++ b/net/core/dev.c +@@ -3052,6 +3052,7 @@ static void __netif_reschedule(struct Qdisc *q) + sd->output_queue_tailp = &q->next_sched; + raise_softirq_irqoff(NET_TX_SOFTIRQ); + local_irq_restore(flags); ++ preempt_check_resched_rt(); + } + + void __netif_schedule(struct Qdisc *q) +@@ -3114,6 +3115,7 @@ void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason) + __this_cpu_write(softnet_data.completion_queue, skb); + raise_softirq_irqoff(NET_TX_SOFTIRQ); + local_irq_restore(flags); ++ preempt_check_resched_rt(); + } + EXPORT_SYMBOL(__dev_kfree_skb_irq); + +@@ -4596,6 +4598,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, + rps_unlock(sd); + + local_irq_restore(flags); ++ preempt_check_resched_rt(); + + atomic_long_inc(&skb->dev->rx_dropped); + kfree_skb(skb); +@@ -6348,12 +6351,14 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd) + sd->rps_ipi_list = NULL; + + local_irq_enable(); ++ preempt_check_resched_rt(); + + /* Send pending IPI's to kick RPS processing on remote cpus. */ + net_rps_send_ipi(remsd); + } else + #endif + local_irq_enable(); ++ preempt_check_resched_rt(); + } + + static bool sd_has_rps_ipi_waiting(struct softnet_data *sd) +@@ -6431,6 +6436,7 @@ void __napi_schedule(struct napi_struct *n) + local_irq_save(flags); + ____napi_schedule(this_cpu_ptr(&softnet_data), n); + local_irq_restore(flags); ++ preempt_check_resched_rt(); + } + EXPORT_SYMBOL(__napi_schedule); + +@@ -10962,6 +10968,7 @@ static int dev_cpu_dead(unsigned int oldcpu) + + raise_softirq_irqoff(NET_TX_SOFTIRQ); + local_irq_enable(); ++ preempt_check_resched_rt(); + + #ifdef CONFIG_RPS + remsd = oldsd->rps_ipi_list; +-- +2.43.0 + diff --git a/debian/patches-rt/0220-softirq-Disable-softirq-stacks-for-RT.patch b/debian/patches-rt/0220-softirq-Disable-softirq-stacks-for-RT.patch new file mode 100644 index 000000000..90e5f9abf --- /dev/null +++ b/debian/patches-rt/0220-softirq-Disable-softirq-stacks-for-RT.patch @@ -0,0 +1,168 @@ +From 539938b186e370ddf9bfc1d68703cfddf80a3f42 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Mon, 18 Jul 2011 13:59:17 +0200 +Subject: [PATCH 220/323] softirq: Disable softirq stacks for RT +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Disable extra stacks for softirqs. We want to preempt softirqs and +having them on special IRQ-stack does not make this easier. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +--- + arch/powerpc/kernel/irq.c | 2 ++ + arch/powerpc/kernel/misc_32.S | 2 ++ + arch/powerpc/kernel/misc_64.S | 2 ++ + arch/sh/kernel/irq.c | 2 ++ + arch/sparc/kernel/irq_64.c | 2 ++ + arch/x86/kernel/irq_32.c | 2 ++ + arch/x86/kernel/irq_64.c | 2 ++ + include/linux/interrupt.h | 2 +- + 8 files changed, 15 insertions(+), 1 deletion(-) + +diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c +index e8a548447dd6..5ad4f27cba10 100644 +--- a/arch/powerpc/kernel/irq.c ++++ b/arch/powerpc/kernel/irq.c +@@ -753,10 +753,12 @@ void *mcheckirq_ctx[NR_CPUS] __read_mostly; + void *softirq_ctx[NR_CPUS] __read_mostly; + void *hardirq_ctx[NR_CPUS] __read_mostly; + ++#ifndef CONFIG_PREEMPT_RT + void do_softirq_own_stack(void) + { + call_do_softirq(softirq_ctx[smp_processor_id()]); + } ++#endif + + irq_hw_number_t virq_to_hw(unsigned int virq) + { +diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S +index 717e658b90fd..08ee95ad6593 100644 +--- a/arch/powerpc/kernel/misc_32.S ++++ b/arch/powerpc/kernel/misc_32.S +@@ -31,6 +31,7 @@ + * We store the saved ksp_limit in the unused part + * of the STACK_FRAME_OVERHEAD + */ ++#ifndef CONFIG_PREEMPT_RT + _GLOBAL(call_do_softirq) + mflr r0 + stw r0,4(r1) +@@ -46,6 +47,7 @@ _GLOBAL(call_do_softirq) + stw r10,THREAD+KSP_LIMIT(r2) + mtlr r0 + blr ++#endif + + /* + * void call_do_irq(struct pt_regs *regs, void *sp); +diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S +index 070465825c21..a6b33f7b3264 100644 +--- a/arch/powerpc/kernel/misc_64.S ++++ b/arch/powerpc/kernel/misc_64.S +@@ -27,6 +27,7 @@ + + .text + ++#ifndef CONFIG_PREEMPT_RT + _GLOBAL(call_do_softirq) + mflr r0 + std r0,16(r1) +@@ -37,6 +38,7 @@ _GLOBAL(call_do_softirq) + ld r0,16(r1) + mtlr r0 + blr ++#endif + + _GLOBAL(call_do_irq) + mflr r0 +diff --git a/arch/sh/kernel/irq.c b/arch/sh/kernel/irq.c +index ab5f790b0cd2..5db7af565dec 100644 +--- a/arch/sh/kernel/irq.c ++++ b/arch/sh/kernel/irq.c +@@ -148,6 +148,7 @@ void irq_ctx_exit(int cpu) + hardirq_ctx[cpu] = NULL; + } + ++#ifndef CONFIG_PREEMPT_RT + void do_softirq_own_stack(void) + { + struct thread_info *curctx; +@@ -175,6 +176,7 @@ void do_softirq_own_stack(void) + "r5", "r6", "r7", "r8", "r9", "r15", "t", "pr" + ); + } ++#endif + #else + static inline void handle_one_irq(unsigned int irq) + { +diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c +index 3ec9f1402aad..eb21682abfcb 100644 +--- a/arch/sparc/kernel/irq_64.c ++++ b/arch/sparc/kernel/irq_64.c +@@ -854,6 +854,7 @@ void __irq_entry handler_irq(int pil, struct pt_regs *regs) + set_irq_regs(old_regs); + } + ++#ifndef CONFIG_PREEMPT_RT + void do_softirq_own_stack(void) + { + void *orig_sp, *sp = softirq_stack[smp_processor_id()]; +@@ -868,6 +869,7 @@ void do_softirq_own_stack(void) + __asm__ __volatile__("mov %0, %%sp" + : : "r" (orig_sp)); + } ++#endif + + #ifdef CONFIG_HOTPLUG_CPU + void fixup_irqs(void) +diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c +index 0b79efc87be5..93c6b88b382a 100644 +--- a/arch/x86/kernel/irq_32.c ++++ b/arch/x86/kernel/irq_32.c +@@ -131,6 +131,7 @@ int irq_init_percpu_irqstack(unsigned int cpu) + return 0; + } + ++#ifndef CONFIG_PREEMPT_RT + void do_softirq_own_stack(void) + { + struct irq_stack *irqstk; +@@ -147,6 +148,7 @@ void do_softirq_own_stack(void) + + call_on_stack(__do_softirq, isp); + } ++#endif + + void __handle_irq(struct irq_desc *desc, struct pt_regs *regs) + { +diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c +index 440eed558558..7cfc4e6b7c94 100644 +--- a/arch/x86/kernel/irq_64.c ++++ b/arch/x86/kernel/irq_64.c +@@ -72,7 +72,9 @@ int irq_init_percpu_irqstack(unsigned int cpu) + return map_irq_stack(cpu); + } + ++#ifndef CONFIG_PREEMPT_RT + void do_softirq_own_stack(void) + { + run_on_irqstack_cond(__do_softirq, NULL); + } ++#endif +diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h +index a1c64d969532..7f185482ea22 100644 +--- a/include/linux/interrupt.h ++++ b/include/linux/interrupt.h +@@ -564,7 +564,7 @@ struct softirq_action + asmlinkage void do_softirq(void); + asmlinkage void __do_softirq(void); + +-#ifdef __ARCH_HAS_DO_SOFTIRQ ++#if defined(__ARCH_HAS_DO_SOFTIRQ) && !defined(CONFIG_PREEMPT_RT) + void do_softirq_own_stack(void); + #else + static inline void do_softirq_own_stack(void) +-- +2.43.0 + diff --git a/debian/patches-rt/0221-net-core-use-local_bh_disable-in-netif_rx_ni.patch b/debian/patches-rt/0221-net-core-use-local_bh_disable-in-netif_rx_ni.patch new file mode 100644 index 000000000..aeba8e9de --- /dev/null +++ b/debian/patches-rt/0221-net-core-use-local_bh_disable-in-netif_rx_ni.patch @@ -0,0 +1,41 @@ +From 3f41204b42e2804a8eff4dbae9f811c5112a5e49 Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Fri, 16 Jun 2017 19:03:16 +0200 +Subject: [PATCH 221/323] net/core: use local_bh_disable() in netif_rx_ni() +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +In 2004 netif_rx_ni() gained a preempt_disable() section around +netif_rx() and its do_softirq() + testing for it. The do_softirq() part +is required because netif_rx() raises the softirq but does not invoke +it. The preempt_disable() is required to remain on the same CPU which added the +skb to the per-CPU list. +All this can be avoided be putting this into a local_bh_disable()ed +section. The local_bh_enable() part will invoke do_softirq() if +required. + +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + net/core/dev.c | 6 ++---- + 1 file changed, 2 insertions(+), 4 deletions(-) + +diff --git a/net/core/dev.c b/net/core/dev.c +index 3f23ac2ad497..f973f53c7a1f 100644 +--- a/net/core/dev.c ++++ b/net/core/dev.c +@@ -4870,11 +4870,9 @@ int netif_rx_ni(struct sk_buff *skb) + + trace_netif_rx_ni_entry(skb); + +- preempt_disable(); ++ local_bh_disable(); + err = netif_rx_internal(skb); +- if (local_softirq_pending()) +- do_softirq(); +- preempt_enable(); ++ local_bh_enable(); + trace_netif_rx_ni_exit(err); + + return err; +-- +2.43.0 + diff --git a/debian/patches-rt/0222-pid.h-include-atomic.h.patch b/debian/patches-rt/0222-pid.h-include-atomic.h.patch new file mode 100644 index 000000000..159e5afbe --- /dev/null +++ b/debian/patches-rt/0222-pid.h-include-atomic.h.patch @@ -0,0 +1,43 @@ +From 281d6228df38fdd331290ccf7753daee6ee7e520 Mon Sep 17 00:00:00 2001 +From: Grygorii Strashko <Grygorii.Strashko@linaro.org> +Date: Tue, 21 Jul 2015 19:43:56 +0300 +Subject: [PATCH 222/323] pid.h: include atomic.h +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +This patch fixes build error: + CC kernel/pid_namespace.o +In file included from kernel/pid_namespace.c:11:0: +include/linux/pid.h: In function 'get_pid': +include/linux/pid.h:78:3: error: implicit declaration of function 'atomic_inc' [-Werror=implicit-function-declaration] + atomic_inc(&pid->count); + ^ +which happens when + CONFIG_PROVE_LOCKING=n + CONFIG_DEBUG_SPINLOCK=n + CONFIG_DEBUG_MUTEXES=n + CONFIG_DEBUG_LOCK_ALLOC=n + CONFIG_PID_NS=y + +Vanilla gets this via spinlock.h. + +Signed-off-by: Grygorii Strashko <Grygorii.Strashko@linaro.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/linux/pid.h | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/include/linux/pid.h b/include/linux/pid.h +index fa10acb8d6a4..2f86f84e9fc1 100644 +--- a/include/linux/pid.h ++++ b/include/linux/pid.h +@@ -3,6 +3,7 @@ + #define _LINUX_PID_H + + #include <linux/rculist.h> ++#include <linux/atomic.h> + #include <linux/wait.h> + #include <linux/refcount.h> + +-- +2.43.0 + diff --git a/debian/patches-rt/0223-ptrace-fix-ptrace-vs-tasklist_lock-race.patch b/debian/patches-rt/0223-ptrace-fix-ptrace-vs-tasklist_lock-race.patch new file mode 100644 index 000000000..efff232da --- /dev/null +++ b/debian/patches-rt/0223-ptrace-fix-ptrace-vs-tasklist_lock-race.patch @@ -0,0 +1,166 @@ +From 2b742ac53e22db703a46b21ff3817c797b7cb071 Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Thu, 29 Aug 2013 18:21:04 +0200 +Subject: [PATCH 223/323] ptrace: fix ptrace vs tasklist_lock race +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +As explained by Alexander Fyodorov <halcy@yandex.ru>: + +|read_lock(&tasklist_lock) in ptrace_stop() is converted to mutex on RT kernel, +|and it can remove __TASK_TRACED from task->state (by moving it to +|task->saved_state). If parent does wait() on child followed by a sys_ptrace +|call, the following race can happen: +| +|- child sets __TASK_TRACED in ptrace_stop() +|- parent does wait() which eventually calls wait_task_stopped() and returns +| child's pid +|- child blocks on read_lock(&tasklist_lock) in ptrace_stop() and moves +| __TASK_TRACED flag to saved_state +|- parent calls sys_ptrace, which calls ptrace_check_attach() and wait_task_inactive() + +The patch is based on his initial patch where an additional check is +added in case the __TASK_TRACED moved to ->saved_state. The pi_lock is +taken in case the caller is interrupted between looking into ->state and +->saved_state. + +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/linux/sched.h | 49 +++++++++++++++++++++++++++++++++++++++---- + kernel/ptrace.c | 9 +++++++- + kernel/sched/core.c | 17 +++++++++++++-- + 3 files changed, 68 insertions(+), 7 deletions(-) + +diff --git a/include/linux/sched.h b/include/linux/sched.h +index d259126f46cf..bd0c9c633438 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -112,12 +112,8 @@ struct io_uring_task; + __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE | \ + TASK_PARKED) + +-#define task_is_traced(task) ((task->state & __TASK_TRACED) != 0) +- + #define task_is_stopped(task) ((task->state & __TASK_STOPPED) != 0) + +-#define task_is_stopped_or_traced(task) ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0) +- + #ifdef CONFIG_DEBUG_ATOMIC_SLEEP + + /* +@@ -1886,6 +1882,51 @@ static inline int test_tsk_need_resched(struct task_struct *tsk) + return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED)); + } + ++static inline bool __task_is_stopped_or_traced(struct task_struct *task) ++{ ++ if (task->state & (__TASK_STOPPED | __TASK_TRACED)) ++ return true; ++#ifdef CONFIG_PREEMPT_RT ++ if (task->saved_state & (__TASK_STOPPED | __TASK_TRACED)) ++ return true; ++#endif ++ return false; ++} ++ ++static inline bool task_is_stopped_or_traced(struct task_struct *task) ++{ ++ bool traced_stopped; ++ ++#ifdef CONFIG_PREEMPT_RT ++ unsigned long flags; ++ ++ raw_spin_lock_irqsave(&task->pi_lock, flags); ++ traced_stopped = __task_is_stopped_or_traced(task); ++ raw_spin_unlock_irqrestore(&task->pi_lock, flags); ++#else ++ traced_stopped = __task_is_stopped_or_traced(task); ++#endif ++ return traced_stopped; ++} ++ ++static inline bool task_is_traced(struct task_struct *task) ++{ ++ bool traced = false; ++ ++ if (task->state & __TASK_TRACED) ++ return true; ++#ifdef CONFIG_PREEMPT_RT ++ /* in case the task is sleeping on tasklist_lock */ ++ raw_spin_lock_irq(&task->pi_lock); ++ if (task->state & __TASK_TRACED) ++ traced = true; ++ else if (task->saved_state & __TASK_TRACED) ++ traced = true; ++ raw_spin_unlock_irq(&task->pi_lock); ++#endif ++ return traced; ++} ++ + /* + * cond_resched() and cond_resched_lock(): latency reduction via + * explicit rescheduling in places that are safe. The return +diff --git a/kernel/ptrace.c b/kernel/ptrace.c +index aab480e24bd6..71acb6341e63 100644 +--- a/kernel/ptrace.c ++++ b/kernel/ptrace.c +@@ -196,7 +196,14 @@ static bool ptrace_freeze_traced(struct task_struct *task) + spin_lock_irq(&task->sighand->siglock); + if (task_is_traced(task) && !looks_like_a_spurious_pid(task) && + !__fatal_signal_pending(task)) { +- task->state = __TASK_TRACED; ++ unsigned long flags; ++ ++ raw_spin_lock_irqsave(&task->pi_lock, flags); ++ if (task->state & __TASK_TRACED) ++ task->state = __TASK_TRACED; ++ else ++ task->saved_state = __TASK_TRACED; ++ raw_spin_unlock_irqrestore(&task->pi_lock, flags); + ret = true; + } + spin_unlock_irq(&task->sighand->siglock); +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 2f689b4fa68b..8ac1f0526476 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -2587,6 +2587,18 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p, + } + #endif /* CONFIG_NUMA_BALANCING */ + ++static bool check_task_state(struct task_struct *p, long match_state) ++{ ++ bool match = false; ++ ++ raw_spin_lock_irq(&p->pi_lock); ++ if (p->state == match_state || p->saved_state == match_state) ++ match = true; ++ raw_spin_unlock_irq(&p->pi_lock); ++ ++ return match; ++} ++ + /* + * wait_task_inactive - wait for a thread to unschedule. + * +@@ -2631,7 +2643,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) + * is actually now running somewhere else! + */ + while (task_running(rq, p)) { +- if (match_state && unlikely(p->state != match_state)) ++ if (match_state && !check_task_state(p, match_state)) + return 0; + cpu_relax(); + } +@@ -2646,7 +2658,8 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) + running = task_running(rq, p); + queued = task_on_rq_queued(p); + ncsw = 0; +- if (!match_state || p->state == match_state) ++ if (!match_state || p->state == match_state || ++ p->saved_state == match_state) + ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ + task_rq_unlock(rq, p, &rf); + +-- +2.43.0 + diff --git a/debian/patches-rt/0224-ptrace-fix-ptrace_unfreeze_traced-race-with-rt-lock.patch b/debian/patches-rt/0224-ptrace-fix-ptrace_unfreeze_traced-race-with-rt-lock.patch new file mode 100644 index 000000000..4e61137e3 --- /dev/null +++ b/debian/patches-rt/0224-ptrace-fix-ptrace_unfreeze_traced-race-with-rt-lock.patch @@ -0,0 +1,65 @@ +From 10d5d8e25cce1695bb99f7149038d97b9a160201 Mon Sep 17 00:00:00 2001 +From: Oleg Nesterov <oleg@redhat.com> +Date: Tue, 3 Nov 2020 12:39:01 +0100 +Subject: [PATCH 224/323] ptrace: fix ptrace_unfreeze_traced() race with + rt-lock +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +The patch "ptrace: fix ptrace vs tasklist_lock race" changed +ptrace_freeze_traced() to take task->saved_state into account, but +ptrace_unfreeze_traced() has the same problem and needs a similar fix: +it should check/update both ->state and ->saved_state. + +Reported-by: Luis Claudio R. Goncalves <lgoncalv@redhat.com> +Fixes: "ptrace: fix ptrace vs tasklist_lock race" +Signed-off-by: Oleg Nesterov <oleg@redhat.com> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Cc: stable-rt@vger.kernel.org +--- + kernel/ptrace.c | 23 +++++++++++++++-------- + 1 file changed, 15 insertions(+), 8 deletions(-) + +diff --git a/kernel/ptrace.c b/kernel/ptrace.c +index 71acb6341e63..fb5d1a19507d 100644 +--- a/kernel/ptrace.c ++++ b/kernel/ptrace.c +@@ -213,8 +213,8 @@ static bool ptrace_freeze_traced(struct task_struct *task) + + static void ptrace_unfreeze_traced(struct task_struct *task) + { +- if (task->state != __TASK_TRACED) +- return; ++ unsigned long flags; ++ bool frozen = true; + + WARN_ON(!task->ptrace || task->parent != current); + +@@ -223,12 +223,19 @@ static void ptrace_unfreeze_traced(struct task_struct *task) + * Recheck state under the lock to close this race. + */ + spin_lock_irq(&task->sighand->siglock); +- if (task->state == __TASK_TRACED) { +- if (__fatal_signal_pending(task)) +- wake_up_state(task, __TASK_TRACED); +- else +- task->state = TASK_TRACED; +- } ++ ++ raw_spin_lock_irqsave(&task->pi_lock, flags); ++ if (task->state == __TASK_TRACED) ++ task->state = TASK_TRACED; ++ else if (task->saved_state == __TASK_TRACED) ++ task->saved_state = TASK_TRACED; ++ else ++ frozen = false; ++ raw_spin_unlock_irqrestore(&task->pi_lock, flags); ++ ++ if (frozen && __fatal_signal_pending(task)) ++ wake_up_state(task, __TASK_TRACED); ++ + spin_unlock_irq(&task->sighand->siglock); + } + +-- +2.43.0 + diff --git a/debian/patches-rt/0225-kernel-sched-add-put-get-_cpu_light.patch b/debian/patches-rt/0225-kernel-sched-add-put-get-_cpu_light.patch new file mode 100644 index 000000000..6a6e0620c --- /dev/null +++ b/debian/patches-rt/0225-kernel-sched-add-put-get-_cpu_light.patch @@ -0,0 +1,28 @@ +From f5e1368135f9f3d520828544dc31762f19f43848 Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Sat, 27 May 2017 19:02:06 +0200 +Subject: [PATCH 225/323] kernel/sched: add {put|get}_cpu_light() +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/linux/smp.h | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/include/linux/smp.h b/include/linux/smp.h +index 04f44e0aa2e0..cae66815f9e2 100644 +--- a/include/linux/smp.h ++++ b/include/linux/smp.h +@@ -239,6 +239,9 @@ static inline int get_boot_cpu_id(void) + #define get_cpu() ({ preempt_disable(); __smp_processor_id(); }) + #define put_cpu() preempt_enable() + ++#define get_cpu_light() ({ migrate_disable(); __smp_processor_id(); }) ++#define put_cpu_light() migrate_enable() ++ + /* + * Callback to arch code if there's nosmp or maxcpus=0 on the + * boot command line: +-- +2.43.0 + diff --git a/debian/patches-rt/0226-trace-Add-migrate-disabled-counter-to-tracing-output.patch b/debian/patches-rt/0226-trace-Add-migrate-disabled-counter-to-tracing-output.patch new file mode 100644 index 000000000..23ba35a68 --- /dev/null +++ b/debian/patches-rt/0226-trace-Add-migrate-disabled-counter-to-tracing-output.patch @@ -0,0 +1,123 @@ +From 61f802c21cb2e063c6e8b39e9f497f2d38440b5a Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Sun, 17 Jul 2011 21:56:42 +0200 +Subject: [PATCH 226/323] trace: Add migrate-disabled counter to tracing output +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +--- + include/linux/trace_events.h | 2 ++ + kernel/trace/trace.c | 26 +++++++++++++++++++------- + kernel/trace/trace_events.c | 1 + + kernel/trace/trace_output.c | 5 +++++ + 4 files changed, 27 insertions(+), 7 deletions(-) + +diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h +index 2fea9fcd4d4e..2151524a10f0 100644 +--- a/include/linux/trace_events.h ++++ b/include/linux/trace_events.h +@@ -69,6 +69,7 @@ struct trace_entry { + unsigned char flags; + unsigned char preempt_count; + int pid; ++ unsigned char migrate_disable; + }; + + #define TRACE_EVENT_TYPE_MAX \ +@@ -157,6 +158,7 @@ static inline void tracing_generic_entry_update(struct trace_entry *entry, + unsigned int trace_ctx) + { + entry->preempt_count = trace_ctx & 0xff; ++ entry->migrate_disable = (trace_ctx >> 8) & 0xff; + entry->pid = current->pid; + entry->type = type; + entry->flags = trace_ctx >> 16; +diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c +index 5d6730589823..02dffb1862b8 100644 +--- a/kernel/trace/trace.c ++++ b/kernel/trace/trace.c +@@ -2575,6 +2575,15 @@ enum print_line_t trace_handle_return(struct trace_seq *s) + } + EXPORT_SYMBOL_GPL(trace_handle_return); + ++static unsigned short migration_disable_value(void) ++{ ++#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT) ++ return current->migration_disabled; ++#else ++ return 0; ++#endif ++} ++ + unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status) + { + unsigned int trace_flags = irqs_status; +@@ -2593,7 +2602,8 @@ unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status) + trace_flags |= TRACE_FLAG_NEED_RESCHED; + if (test_preempt_need_resched()) + trace_flags |= TRACE_FLAG_PREEMPT_RESCHED; +- return (trace_flags << 16) | (pc & 0xff); ++ return (trace_flags << 16) | (pc & 0xff) | ++ (migration_disable_value() & 0xff) << 8; + } + + struct ring_buffer_event * +@@ -3856,9 +3866,10 @@ static void print_lat_help_header(struct seq_file *m) + "# | / _----=> need-resched \n" + "# || / _---=> hardirq/softirq \n" + "# ||| / _--=> preempt-depth \n" +- "# |||| / delay \n" +- "# cmd pid ||||| time | caller \n" +- "# \\ / ||||| \\ | / \n"); ++ "# |||| / _-=> migrate-disable \n" ++ "# ||||| / delay \n" ++ "# cmd pid |||||| time | caller \n" ++ "# \\ / |||||| \\ | / \n"); + } + + static void print_event_info(struct array_buffer *buf, struct seq_file *m) +@@ -3896,9 +3907,10 @@ static void print_func_help_header_irq(struct array_buffer *buf, struct seq_file + seq_printf(m, "# %.*s / _----=> need-resched\n", prec, space); + seq_printf(m, "# %.*s| / _---=> hardirq/softirq\n", prec, space); + seq_printf(m, "# %.*s|| / _--=> preempt-depth\n", prec, space); +- seq_printf(m, "# %.*s||| / delay\n", prec, space); +- seq_printf(m, "# TASK-PID %.*s CPU# |||| TIMESTAMP FUNCTION\n", prec, " TGID "); +- seq_printf(m, "# | | %.*s | |||| | |\n", prec, " | "); ++ seq_printf(m, "# %.*s||| / _-=> migrate-disable\n", prec, space); ++ seq_printf(m, "# %.*s|||| / delay\n", prec, space); ++ seq_printf(m, "# TASK-PID %.*s CPU# ||||| TIMESTAMP FUNCTION\n", prec, " TGID "); ++ seq_printf(m, "# | | %.*s | ||||| | |\n", prec, " | "); + } + + void +diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c +index df64b92c5edc..636fb7df3714 100644 +--- a/kernel/trace/trace_events.c ++++ b/kernel/trace/trace_events.c +@@ -184,6 +184,7 @@ static int trace_define_common_fields(void) + __common_field(unsigned char, flags); + __common_field(unsigned char, preempt_count); + __common_field(int, pid); ++ __common_field(unsigned char, migrate_disable); + + return ret; + } +diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c +index 94b0991717b6..b3619b21217c 100644 +--- a/kernel/trace/trace_output.c ++++ b/kernel/trace/trace_output.c +@@ -497,6 +497,11 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) + else + trace_seq_putc(s, '.'); + ++ if (entry->migrate_disable) ++ trace_seq_printf(s, "%x", entry->migrate_disable); ++ else ++ trace_seq_putc(s, '.'); ++ + return !trace_seq_has_overflowed(s); + } + +-- +2.43.0 + diff --git a/debian/patches-rt/0227-locking-don-t-check-for-__LINUX_SPINLOCK_TYPES_H-on-.patch b/debian/patches-rt/0227-locking-don-t-check-for-__LINUX_SPINLOCK_TYPES_H-on-.patch new file mode 100644 index 000000000..aeea997ea --- /dev/null +++ b/debian/patches-rt/0227-locking-don-t-check-for-__LINUX_SPINLOCK_TYPES_H-on-.patch @@ -0,0 +1,166 @@ +From d9e2f0098c8afde2f68ff94ea5dcdf818bc454d9 Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Fri, 4 Aug 2017 17:40:42 +0200 +Subject: [PATCH 227/323] locking: don't check for __LINUX_SPINLOCK_TYPES_H on + -RT archs +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Upstream uses arch_spinlock_t within spinlock_t and requests that +spinlock_types.h header file is included first. +On -RT we have the rt_mutex with its raw_lock wait_lock which needs +architectures' spinlock_types.h header file for its definition. However +we need rt_mutex first because it is used to build the spinlock_t so +that check does not work for us. +Therefore I am dropping that check. + +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + arch/alpha/include/asm/spinlock_types.h | 4 ---- + arch/arm/include/asm/spinlock_types.h | 4 ---- + arch/arm64/include/asm/spinlock_types.h | 4 ---- + arch/hexagon/include/asm/spinlock_types.h | 4 ---- + arch/ia64/include/asm/spinlock_types.h | 4 ---- + arch/powerpc/include/asm/spinlock_types.h | 4 ---- + arch/s390/include/asm/spinlock_types.h | 4 ---- + arch/sh/include/asm/spinlock_types.h | 4 ---- + arch/xtensa/include/asm/spinlock_types.h | 4 ---- + 9 files changed, 36 deletions(-) + +diff --git a/arch/alpha/include/asm/spinlock_types.h b/arch/alpha/include/asm/spinlock_types.h +index 1d5716bc060b..6883bc952d22 100644 +--- a/arch/alpha/include/asm/spinlock_types.h ++++ b/arch/alpha/include/asm/spinlock_types.h +@@ -2,10 +2,6 @@ + #ifndef _ALPHA_SPINLOCK_TYPES_H + #define _ALPHA_SPINLOCK_TYPES_H + +-#ifndef __LINUX_SPINLOCK_TYPES_H +-# error "please don't include this file directly" +-#endif +- + typedef struct { + volatile unsigned int lock; + } arch_spinlock_t; +diff --git a/arch/arm/include/asm/spinlock_types.h b/arch/arm/include/asm/spinlock_types.h +index 5976958647fe..a37c0803954b 100644 +--- a/arch/arm/include/asm/spinlock_types.h ++++ b/arch/arm/include/asm/spinlock_types.h +@@ -2,10 +2,6 @@ + #ifndef __ASM_SPINLOCK_TYPES_H + #define __ASM_SPINLOCK_TYPES_H + +-#ifndef __LINUX_SPINLOCK_TYPES_H +-# error "please don't include this file directly" +-#endif +- + #define TICKET_SHIFT 16 + + typedef struct { +diff --git a/arch/arm64/include/asm/spinlock_types.h b/arch/arm64/include/asm/spinlock_types.h +index 18782f0c4721..6672b05350b4 100644 +--- a/arch/arm64/include/asm/spinlock_types.h ++++ b/arch/arm64/include/asm/spinlock_types.h +@@ -5,10 +5,6 @@ + #ifndef __ASM_SPINLOCK_TYPES_H + #define __ASM_SPINLOCK_TYPES_H + +-#if !defined(__LINUX_SPINLOCK_TYPES_H) && !defined(__ASM_SPINLOCK_H) +-# error "please don't include this file directly" +-#endif +- + #include <asm-generic/qspinlock_types.h> + #include <asm-generic/qrwlock_types.h> + +diff --git a/arch/hexagon/include/asm/spinlock_types.h b/arch/hexagon/include/asm/spinlock_types.h +index 19d233497ba5..de72fb23016d 100644 +--- a/arch/hexagon/include/asm/spinlock_types.h ++++ b/arch/hexagon/include/asm/spinlock_types.h +@@ -8,10 +8,6 @@ + #ifndef _ASM_SPINLOCK_TYPES_H + #define _ASM_SPINLOCK_TYPES_H + +-#ifndef __LINUX_SPINLOCK_TYPES_H +-# error "please don't include this file directly" +-#endif +- + typedef struct { + volatile unsigned int lock; + } arch_spinlock_t; +diff --git a/arch/ia64/include/asm/spinlock_types.h b/arch/ia64/include/asm/spinlock_types.h +index 6e345fefcdca..681408d6816f 100644 +--- a/arch/ia64/include/asm/spinlock_types.h ++++ b/arch/ia64/include/asm/spinlock_types.h +@@ -2,10 +2,6 @@ + #ifndef _ASM_IA64_SPINLOCK_TYPES_H + #define _ASM_IA64_SPINLOCK_TYPES_H + +-#ifndef __LINUX_SPINLOCK_TYPES_H +-# error "please don't include this file directly" +-#endif +- + typedef struct { + volatile unsigned int lock; + } arch_spinlock_t; +diff --git a/arch/powerpc/include/asm/spinlock_types.h b/arch/powerpc/include/asm/spinlock_types.h +index c5d742f18021..cc6922a011ba 100644 +--- a/arch/powerpc/include/asm/spinlock_types.h ++++ b/arch/powerpc/include/asm/spinlock_types.h +@@ -2,10 +2,6 @@ + #ifndef _ASM_POWERPC_SPINLOCK_TYPES_H + #define _ASM_POWERPC_SPINLOCK_TYPES_H + +-#ifndef __LINUX_SPINLOCK_TYPES_H +-# error "please don't include this file directly" +-#endif +- + #ifdef CONFIG_PPC_QUEUED_SPINLOCKS + #include <asm-generic/qspinlock_types.h> + #include <asm-generic/qrwlock_types.h> +diff --git a/arch/s390/include/asm/spinlock_types.h b/arch/s390/include/asm/spinlock_types.h +index cfed272e4fd5..8e28e8176ec8 100644 +--- a/arch/s390/include/asm/spinlock_types.h ++++ b/arch/s390/include/asm/spinlock_types.h +@@ -2,10 +2,6 @@ + #ifndef __ASM_SPINLOCK_TYPES_H + #define __ASM_SPINLOCK_TYPES_H + +-#ifndef __LINUX_SPINLOCK_TYPES_H +-# error "please don't include this file directly" +-#endif +- + typedef struct { + int lock; + } __attribute__ ((aligned (4))) arch_spinlock_t; +diff --git a/arch/sh/include/asm/spinlock_types.h b/arch/sh/include/asm/spinlock_types.h +index e82369f286a2..22ca9a98bbb8 100644 +--- a/arch/sh/include/asm/spinlock_types.h ++++ b/arch/sh/include/asm/spinlock_types.h +@@ -2,10 +2,6 @@ + #ifndef __ASM_SH_SPINLOCK_TYPES_H + #define __ASM_SH_SPINLOCK_TYPES_H + +-#ifndef __LINUX_SPINLOCK_TYPES_H +-# error "please don't include this file directly" +-#endif +- + typedef struct { + volatile unsigned int lock; + } arch_spinlock_t; +diff --git a/arch/xtensa/include/asm/spinlock_types.h b/arch/xtensa/include/asm/spinlock_types.h +index 64c9389254f1..dc846323b1cd 100644 +--- a/arch/xtensa/include/asm/spinlock_types.h ++++ b/arch/xtensa/include/asm/spinlock_types.h +@@ -2,10 +2,6 @@ + #ifndef __ASM_SPINLOCK_TYPES_H + #define __ASM_SPINLOCK_TYPES_H + +-#if !defined(__LINUX_SPINLOCK_TYPES_H) && !defined(__ASM_SPINLOCK_H) +-# error "please don't include this file directly" +-#endif +- + #include <asm-generic/qspinlock_types.h> + #include <asm-generic/qrwlock_types.h> + +-- +2.43.0 + diff --git a/debian/patches-rt/0228-locking-Make-spinlock_t-and-rwlock_t-a-RCU-section-o.patch b/debian/patches-rt/0228-locking-Make-spinlock_t-and-rwlock_t-a-RCU-section-o.patch new file mode 100644 index 000000000..77ce72c24 --- /dev/null +++ b/debian/patches-rt/0228-locking-Make-spinlock_t-and-rwlock_t-a-RCU-section-o.patch @@ -0,0 +1,126 @@ +From 166485c3d38dd51a1a9a35edb7b2d3c1054a2418 Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Tue, 19 Nov 2019 09:25:04 +0100 +Subject: [PATCH 228/323] locking: Make spinlock_t and rwlock_t a RCU section + on RT +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +On !RT a locked spinlock_t and rwlock_t disables preemption which +implies a RCU read section. There is code that relies on that behaviour. + +Add an explicit RCU read section on RT while a sleeping lock (a lock +which would disables preemption on !RT) acquired. + +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + kernel/locking/rtmutex.c | 6 ++++++ + kernel/locking/rwlock-rt.c | 6 ++++++ + 2 files changed, 12 insertions(+) + +diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c +index 2fe178651254..31b374d36d04 100644 +--- a/kernel/locking/rtmutex.c ++++ b/kernel/locking/rtmutex.c +@@ -1136,6 +1136,7 @@ void __lockfunc rt_spin_lock(spinlock_t *lock) + { + spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); + rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock); ++ rcu_read_lock(); + migrate_disable(); + } + EXPORT_SYMBOL(rt_spin_lock); +@@ -1150,6 +1151,7 @@ void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass) + { + spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); + rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock); ++ rcu_read_lock(); + migrate_disable(); + } + EXPORT_SYMBOL(rt_spin_lock_nested); +@@ -1159,6 +1161,7 @@ void __lockfunc rt_spin_lock_nest_lock(spinlock_t *lock, + { + spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_); + rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock); ++ rcu_read_lock(); + migrate_disable(); + } + EXPORT_SYMBOL(rt_spin_lock_nest_lock); +@@ -1169,6 +1172,7 @@ void __lockfunc rt_spin_unlock(spinlock_t *lock) + /* NOTE: we always pass in '1' for nested, for simplicity */ + spin_release(&lock->dep_map, _RET_IP_); + migrate_enable(); ++ rcu_read_unlock(); + rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock); + } + EXPORT_SYMBOL(rt_spin_unlock); +@@ -1198,6 +1202,7 @@ int __lockfunc rt_spin_trylock(spinlock_t *lock) + ret = __rt_mutex_trylock(&lock->lock); + if (ret) { + spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); ++ rcu_read_lock(); + migrate_disable(); + } + return ret; +@@ -1212,6 +1217,7 @@ int __lockfunc rt_spin_trylock_bh(spinlock_t *lock) + ret = __rt_mutex_trylock(&lock->lock); + if (ret) { + spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); ++ rcu_read_lock(); + migrate_disable(); + } else { + local_bh_enable(); +diff --git a/kernel/locking/rwlock-rt.c b/kernel/locking/rwlock-rt.c +index 16be7111aae7..3d2d1f14b513 100644 +--- a/kernel/locking/rwlock-rt.c ++++ b/kernel/locking/rwlock-rt.c +@@ -270,6 +270,7 @@ int __lockfunc rt_read_trylock(rwlock_t *rwlock) + ret = __read_rt_trylock(rwlock); + if (ret) { + rwlock_acquire_read(&rwlock->dep_map, 0, 1, _RET_IP_); ++ rcu_read_lock(); + migrate_disable(); + } + return ret; +@@ -283,6 +284,7 @@ int __lockfunc rt_write_trylock(rwlock_t *rwlock) + ret = __write_rt_trylock(rwlock); + if (ret) { + rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_); ++ rcu_read_lock(); + migrate_disable(); + } + return ret; +@@ -293,6 +295,7 @@ void __lockfunc rt_read_lock(rwlock_t *rwlock) + { + rwlock_acquire_read(&rwlock->dep_map, 0, 0, _RET_IP_); + __read_rt_lock(rwlock); ++ rcu_read_lock(); + migrate_disable(); + } + EXPORT_SYMBOL(rt_read_lock); +@@ -301,6 +304,7 @@ void __lockfunc rt_write_lock(rwlock_t *rwlock) + { + rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_); + __write_rt_lock(rwlock); ++ rcu_read_lock(); + migrate_disable(); + } + EXPORT_SYMBOL(rt_write_lock); +@@ -309,6 +313,7 @@ void __lockfunc rt_read_unlock(rwlock_t *rwlock) + { + rwlock_release(&rwlock->dep_map, _RET_IP_); + migrate_enable(); ++ rcu_read_unlock(); + __read_rt_unlock(rwlock); + } + EXPORT_SYMBOL(rt_read_unlock); +@@ -317,6 +322,7 @@ void __lockfunc rt_write_unlock(rwlock_t *rwlock) + { + rwlock_release(&rwlock->dep_map, _RET_IP_); + migrate_enable(); ++ rcu_read_unlock(); + __write_rt_unlock(rwlock); + } + EXPORT_SYMBOL(rt_write_unlock); +-- +2.43.0 + diff --git a/debian/patches-rt/0229-mm-vmalloc-Another-preempt-disable-region-which-suck.patch b/debian/patches-rt/0229-mm-vmalloc-Another-preempt-disable-region-which-suck.patch new file mode 100644 index 000000000..c699a1ab7 --- /dev/null +++ b/debian/patches-rt/0229-mm-vmalloc-Another-preempt-disable-region-which-suck.patch @@ -0,0 +1,73 @@ +From 61aced1d729a22bbaf66e1f68806e04058bafb3f Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Tue, 12 Jul 2011 11:39:36 +0200 +Subject: [PATCH 229/323] mm/vmalloc: Another preempt disable region which + sucks +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Avoid the preempt disable version of get_cpu_var(). The inner-lock should +provide enough serialisation. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +--- + mm/vmalloc.c | 13 ++++++++----- + 1 file changed, 8 insertions(+), 5 deletions(-) + +diff --git a/mm/vmalloc.c b/mm/vmalloc.c +index d6a4794fa8ca..8113e4f0d2f2 100644 +--- a/mm/vmalloc.c ++++ b/mm/vmalloc.c +@@ -1542,7 +1542,7 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask) + struct vmap_block *vb; + struct vmap_area *va; + unsigned long vb_idx; +- int node, err; ++ int node, err, cpu; + void *vaddr; + + node = numa_node_id(); +@@ -1579,11 +1579,12 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask) + return ERR_PTR(err); + } + +- vbq = &get_cpu_var(vmap_block_queue); ++ cpu = get_cpu_light(); ++ vbq = this_cpu_ptr(&vmap_block_queue); + spin_lock(&vbq->lock); + list_add_tail_rcu(&vb->free_list, &vbq->free); + spin_unlock(&vbq->lock); +- put_cpu_var(vmap_block_queue); ++ put_cpu_light(); + + return vaddr; + } +@@ -1648,6 +1649,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) + struct vmap_block *vb; + void *vaddr = NULL; + unsigned int order; ++ int cpu; + + BUG_ON(offset_in_page(size)); + BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); +@@ -1662,7 +1664,8 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) + order = get_order(size); + + rcu_read_lock(); +- vbq = &get_cpu_var(vmap_block_queue); ++ cpu = get_cpu_light(); ++ vbq = this_cpu_ptr(&vmap_block_queue); + list_for_each_entry_rcu(vb, &vbq->free, free_list) { + unsigned long pages_off; + +@@ -1685,7 +1688,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) + break; + } + +- put_cpu_var(vmap_block_queue); ++ put_cpu_light(); + rcu_read_unlock(); + + /* Allocate new block if nothing was found */ +-- +2.43.0 + diff --git a/debian/patches-rt/0230-block-mq-do-not-invoke-preempt_disable.patch b/debian/patches-rt/0230-block-mq-do-not-invoke-preempt_disable.patch new file mode 100644 index 000000000..d1f8234d5 --- /dev/null +++ b/debian/patches-rt/0230-block-mq-do-not-invoke-preempt_disable.patch @@ -0,0 +1,40 @@ +From 0470292c7761cc1cd2847e60b43b202b1be7ee29 Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Tue, 14 Jul 2015 14:26:34 +0200 +Subject: [PATCH 230/323] block/mq: do not invoke preempt_disable() +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +preempt_disable() and get_cpu() don't play well together with the sleeping +locks it tries to allocate later. +It seems to be enough to replace it with get_cpu_light() and migrate_disable(). + +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + block/blk-mq.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/block/blk-mq.c b/block/blk-mq.c +index a6618bf45992..37466b3a4ba5 100644 +--- a/block/blk-mq.c ++++ b/block/blk-mq.c +@@ -1575,14 +1575,14 @@ static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async, + return; + + if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) { +- int cpu = get_cpu(); ++ int cpu = get_cpu_light(); + if (cpumask_test_cpu(cpu, hctx->cpumask)) { + __blk_mq_run_hw_queue(hctx); +- put_cpu(); ++ put_cpu_light(); + return; + } + +- put_cpu(); ++ put_cpu_light(); + } + + kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work, +-- +2.43.0 + diff --git a/debian/patches-rt/0231-md-raid5-Make-raid5_percpu-handling-RT-aware.patch b/debian/patches-rt/0231-md-raid5-Make-raid5_percpu-handling-RT-aware.patch new file mode 100644 index 000000000..33a40d5fc --- /dev/null +++ b/debian/patches-rt/0231-md-raid5-Make-raid5_percpu-handling-RT-aware.patch @@ -0,0 +1,69 @@ +From 4af1829664e98b9b3ec542a03ee30aeee8f47f6c Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Tue, 6 Apr 2010 16:51:31 +0200 +Subject: [PATCH 231/323] md: raid5: Make raid5_percpu handling RT aware +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +__raid_run_ops() disables preemption with get_cpu() around the access +to the raid5_percpu variables. That causes scheduling while atomic +spews on RT. + +Serialize the access to the percpu data with a lock and keep the code +preemptible. + +Reported-by: Udo van den Heuvel <udovdh@xs4all.nl> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Tested-by: Udo van den Heuvel <udovdh@xs4all.nl> +--- + drivers/md/raid5.c | 7 +++++-- + drivers/md/raid5.h | 1 + + 2 files changed, 6 insertions(+), 2 deletions(-) + +diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c +index 9f114b9d8dc6..7e0eb8defeaf 100644 +--- a/drivers/md/raid5.c ++++ b/drivers/md/raid5.c +@@ -2218,8 +2218,9 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) + struct raid5_percpu *percpu; + unsigned long cpu; + +- cpu = get_cpu(); ++ cpu = get_cpu_light(); + percpu = per_cpu_ptr(conf->percpu, cpu); ++ spin_lock(&percpu->lock); + if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { + ops_run_biofill(sh); + overlap_clear++; +@@ -2278,7 +2279,8 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) + if (test_and_clear_bit(R5_Overlap, &dev->flags)) + wake_up(&sh->raid_conf->wait_for_overlap); + } +- put_cpu(); ++ spin_unlock(&percpu->lock); ++ put_cpu_light(); + } + + static void free_stripe(struct kmem_cache *sc, struct stripe_head *sh) +@@ -7108,6 +7110,7 @@ static int raid456_cpu_up_prepare(unsigned int cpu, struct hlist_node *node) + __func__, cpu); + return -ENOMEM; + } ++ spin_lock_init(&per_cpu_ptr(conf->percpu, cpu)->lock); + return 0; + } + +diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h +index 5c05acf20e1f..665fe138ab4f 100644 +--- a/drivers/md/raid5.h ++++ b/drivers/md/raid5.h +@@ -635,6 +635,7 @@ struct r5conf { + int recovery_disabled; + /* per cpu variables */ + struct raid5_percpu { ++ spinlock_t lock; /* Protection for -RT */ + struct page *spare_page; /* Used when checking P/Q in raid6 */ + void *scribble; /* space for constructing buffer + * lists and performing address +-- +2.43.0 + diff --git a/debian/patches-rt/0232-scsi-fcoe-Make-RT-aware.patch b/debian/patches-rt/0232-scsi-fcoe-Make-RT-aware.patch new file mode 100644 index 000000000..dbe99481a --- /dev/null +++ b/debian/patches-rt/0232-scsi-fcoe-Make-RT-aware.patch @@ -0,0 +1,116 @@ +From a2eaf0c262e1151f5978da03d033819e82104cce Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Sat, 12 Nov 2011 14:00:48 +0100 +Subject: [PATCH 232/323] scsi/fcoe: Make RT aware. +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Do not disable preemption while taking sleeping locks. All user look safe +for migrate_diable() only. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +--- + drivers/scsi/fcoe/fcoe.c | 16 ++++++++-------- + drivers/scsi/fcoe/fcoe_ctlr.c | 4 ++-- + drivers/scsi/libfc/fc_exch.c | 4 ++-- + 3 files changed, 12 insertions(+), 12 deletions(-) + +diff --git a/drivers/scsi/fcoe/fcoe.c b/drivers/scsi/fcoe/fcoe.c +index 30afcbbe1f86..4ae5b8152ece 100644 +--- a/drivers/scsi/fcoe/fcoe.c ++++ b/drivers/scsi/fcoe/fcoe.c +@@ -1452,11 +1452,11 @@ static int fcoe_rcv(struct sk_buff *skb, struct net_device *netdev, + static int fcoe_alloc_paged_crc_eof(struct sk_buff *skb, int tlen) + { + struct fcoe_percpu_s *fps; +- int rc; ++ int rc, cpu = get_cpu_light(); + +- fps = &get_cpu_var(fcoe_percpu); ++ fps = &per_cpu(fcoe_percpu, cpu); + rc = fcoe_get_paged_crc_eof(skb, tlen, fps); +- put_cpu_var(fcoe_percpu); ++ put_cpu_light(); + + return rc; + } +@@ -1641,11 +1641,11 @@ static inline int fcoe_filter_frames(struct fc_lport *lport, + return 0; + } + +- stats = per_cpu_ptr(lport->stats, get_cpu()); ++ stats = per_cpu_ptr(lport->stats, get_cpu_light()); + stats->InvalidCRCCount++; + if (stats->InvalidCRCCount < 5) + printk(KERN_WARNING "fcoe: dropping frame with CRC error\n"); +- put_cpu(); ++ put_cpu_light(); + return -EINVAL; + } + +@@ -1686,7 +1686,7 @@ static void fcoe_recv_frame(struct sk_buff *skb) + */ + hp = (struct fcoe_hdr *) skb_network_header(skb); + +- stats = per_cpu_ptr(lport->stats, get_cpu()); ++ stats = per_cpu_ptr(lport->stats, get_cpu_light()); + if (unlikely(FC_FCOE_DECAPS_VER(hp) != FC_FCOE_VER)) { + if (stats->ErrorFrames < 5) + printk(KERN_WARNING "fcoe: FCoE version " +@@ -1718,13 +1718,13 @@ static void fcoe_recv_frame(struct sk_buff *skb) + goto drop; + + if (!fcoe_filter_frames(lport, fp)) { +- put_cpu(); ++ put_cpu_light(); + fc_exch_recv(lport, fp); + return; + } + drop: + stats->ErrorFrames++; +- put_cpu(); ++ put_cpu_light(); + kfree_skb(skb); + } + +diff --git a/drivers/scsi/fcoe/fcoe_ctlr.c b/drivers/scsi/fcoe/fcoe_ctlr.c +index a2d60ad2a683..3fdc403bc7d8 100644 +--- a/drivers/scsi/fcoe/fcoe_ctlr.c ++++ b/drivers/scsi/fcoe/fcoe_ctlr.c +@@ -830,7 +830,7 @@ static unsigned long fcoe_ctlr_age_fcfs(struct fcoe_ctlr *fip) + + INIT_LIST_HEAD(&del_list); + +- stats = per_cpu_ptr(fip->lp->stats, get_cpu()); ++ stats = per_cpu_ptr(fip->lp->stats, get_cpu_light()); + + list_for_each_entry_safe(fcf, next, &fip->fcfs, list) { + deadline = fcf->time + fcf->fka_period + fcf->fka_period / 2; +@@ -866,7 +866,7 @@ static unsigned long fcoe_ctlr_age_fcfs(struct fcoe_ctlr *fip) + sel_time = fcf->time; + } + } +- put_cpu(); ++ put_cpu_light(); + + list_for_each_entry_safe(fcf, next, &del_list, list) { + /* Removes fcf from current list */ +diff --git a/drivers/scsi/libfc/fc_exch.c b/drivers/scsi/libfc/fc_exch.c +index 4261380af97b..65160eaaa929 100644 +--- a/drivers/scsi/libfc/fc_exch.c ++++ b/drivers/scsi/libfc/fc_exch.c +@@ -826,10 +826,10 @@ static struct fc_exch *fc_exch_em_alloc(struct fc_lport *lport, + } + memset(ep, 0, sizeof(*ep)); + +- cpu = get_cpu(); ++ cpu = get_cpu_light(); + pool = per_cpu_ptr(mp->pool, cpu); + spin_lock_bh(&pool->lock); +- put_cpu(); ++ put_cpu_light(); + + /* peek cache of free slot */ + if (pool->left != FC_XID_UNKNOWN) { +-- +2.43.0 + diff --git a/debian/patches-rt/0233-sunrpc-Make-svc_xprt_do_enqueue-use-get_cpu_light.patch b/debian/patches-rt/0233-sunrpc-Make-svc_xprt_do_enqueue-use-get_cpu_light.patch new file mode 100644 index 000000000..22b2f6973 --- /dev/null +++ b/debian/patches-rt/0233-sunrpc-Make-svc_xprt_do_enqueue-use-get_cpu_light.patch @@ -0,0 +1,60 @@ +From 971df27cc65fb6805b2253bd66e81bd244887ecc Mon Sep 17 00:00:00 2001 +From: Mike Galbraith <umgwanakikbuti@gmail.com> +Date: Wed, 18 Feb 2015 16:05:28 +0100 +Subject: [PATCH 233/323] sunrpc: Make svc_xprt_do_enqueue() use + get_cpu_light() +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +|BUG: sleeping function called from invalid context at kernel/locking/rtmutex.c:915 +|in_atomic(): 1, irqs_disabled(): 0, pid: 3194, name: rpc.nfsd +|Preemption disabled at:[<ffffffffa06bf0bb>] svc_xprt_received+0x4b/0xc0 [sunrpc] +|CPU: 6 PID: 3194 Comm: rpc.nfsd Not tainted 3.18.7-rt1 #9 +|Hardware name: MEDION MS-7848/MS-7848, BIOS M7848W08.404 11/06/2014 +| ffff880409630000 ffff8800d9a33c78 ffffffff815bdeb5 0000000000000002 +| 0000000000000000 ffff8800d9a33c98 ffffffff81073c86 ffff880408dd6008 +| ffff880408dd6000 ffff8800d9a33cb8 ffffffff815c3d84 ffff88040b3ac000 +|Call Trace: +| [<ffffffff815bdeb5>] dump_stack+0x4f/0x9e +| [<ffffffff81073c86>] __might_sleep+0xe6/0x150 +| [<ffffffff815c3d84>] rt_spin_lock+0x24/0x50 +| [<ffffffffa06beec0>] svc_xprt_do_enqueue+0x80/0x230 [sunrpc] +| [<ffffffffa06bf0bb>] svc_xprt_received+0x4b/0xc0 [sunrpc] +| [<ffffffffa06c03ed>] svc_add_new_perm_xprt+0x6d/0x80 [sunrpc] +| [<ffffffffa06b2693>] svc_addsock+0x143/0x200 [sunrpc] +| [<ffffffffa072e69c>] write_ports+0x28c/0x340 [nfsd] +| [<ffffffffa072d2ac>] nfsctl_transaction_write+0x4c/0x80 [nfsd] +| [<ffffffff8117ee83>] vfs_write+0xb3/0x1d0 +| [<ffffffff8117f889>] SyS_write+0x49/0xb0 +| [<ffffffff815c4556>] system_call_fastpath+0x16/0x1b + +Signed-off-by: Mike Galbraith <umgwanakikbuti@gmail.com> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + net/sunrpc/svc_xprt.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c +index 06e503466c32..16bd1278a989 100644 +--- a/net/sunrpc/svc_xprt.c ++++ b/net/sunrpc/svc_xprt.c +@@ -422,7 +422,7 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt) + if (test_and_set_bit(XPT_BUSY, &xprt->xpt_flags)) + return; + +- cpu = get_cpu(); ++ cpu = get_cpu_light(); + pool = svc_pool_for_cpu(xprt->xpt_server, cpu); + + atomic_long_inc(&pool->sp_stats.packets); +@@ -446,7 +446,7 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt) + rqstp = NULL; + out_unlock: + rcu_read_unlock(); +- put_cpu(); ++ put_cpu_light(); + trace_svc_xprt_do_enqueue(xprt, rqstp); + } + EXPORT_SYMBOL_GPL(svc_xprt_do_enqueue); +-- +2.43.0 + diff --git a/debian/patches-rt/0234-rt-Introduce-cpu_chill.patch b/debian/patches-rt/0234-rt-Introduce-cpu_chill.patch new file mode 100644 index 000000000..f2a8eeee5 --- /dev/null +++ b/debian/patches-rt/0234-rt-Introduce-cpu_chill.patch @@ -0,0 +1,122 @@ +From 38e8018fba96ecd6e7c7a47e18ed06324fd1d0e2 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Wed, 7 Mar 2012 20:51:03 +0100 +Subject: [PATCH 234/323] rt: Introduce cpu_chill() +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Retry loops on RT might loop forever when the modifying side was +preempted. Add cpu_chill() to replace cpu_relax(). cpu_chill() +defaults to cpu_relax() for non RT. On RT it puts the looping task to +sleep for a tick so the preempted task can make progress. + +Steven Rostedt changed it to use a hrtimer instead of msleep(): +| +|Ulrich Obergfell pointed out that cpu_chill() calls msleep() which is woken +|up by the ksoftirqd running the TIMER softirq. But as the cpu_chill() is +|called from softirq context, it may block the ksoftirqd() from running, in +|which case, it may never wake up the msleep() causing the deadlock. + ++ bigeasy later changed to schedule_hrtimeout() +|If a task calls cpu_chill() and gets woken up by a regular or spurious +|wakeup and has a signal pending, then it exits the sleep loop in +|do_nanosleep() and sets up the restart block. If restart->nanosleep.type is +|not TI_NONE then this results in accessing a stale user pointer from a +|previously interrupted syscall and a copy to user based on the stale +|pointer or a BUG() when 'type' is not supported in nanosleep_copyout(). + ++ bigeasy: add PF_NOFREEZE: +| [....] Waiting for /dev to be fully populated... +| ===================================== +| [ BUG: udevd/229 still has locks held! ] +| 3.12.11-rt17 #23 Not tainted +| ------------------------------------- +| 1 lock held by udevd/229: +| #0: (&type->i_mutex_dir_key#2){+.+.+.}, at: lookup_slow+0x28/0x98 +| +| stack backtrace: +| CPU: 0 PID: 229 Comm: udevd Not tainted 3.12.11-rt17 #23 +| (unwind_backtrace+0x0/0xf8) from (show_stack+0x10/0x14) +| (show_stack+0x10/0x14) from (dump_stack+0x74/0xbc) +| (dump_stack+0x74/0xbc) from (do_nanosleep+0x120/0x160) +| (do_nanosleep+0x120/0x160) from (hrtimer_nanosleep+0x90/0x110) +| (hrtimer_nanosleep+0x90/0x110) from (cpu_chill+0x30/0x38) +| (cpu_chill+0x30/0x38) from (dentry_kill+0x158/0x1ec) +| (dentry_kill+0x158/0x1ec) from (dput+0x74/0x15c) +| (dput+0x74/0x15c) from (lookup_real+0x4c/0x50) +| (lookup_real+0x4c/0x50) from (__lookup_hash+0x34/0x44) +| (__lookup_hash+0x34/0x44) from (lookup_slow+0x38/0x98) +| (lookup_slow+0x38/0x98) from (path_lookupat+0x208/0x7fc) +| (path_lookupat+0x208/0x7fc) from (filename_lookup+0x20/0x60) +| (filename_lookup+0x20/0x60) from (user_path_at_empty+0x50/0x7c) +| (user_path_at_empty+0x50/0x7c) from (user_path_at+0x14/0x1c) +| (user_path_at+0x14/0x1c) from (vfs_fstatat+0x48/0x94) +| (vfs_fstatat+0x48/0x94) from (SyS_stat64+0x14/0x30) +| (SyS_stat64+0x14/0x30) from (ret_fast_syscall+0x0/0x48) + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Signed-off-by: Steven Rostedt <rostedt@goodmis.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/linux/delay.h | 6 ++++++ + kernel/time/hrtimer.c | 30 ++++++++++++++++++++++++++++++ + 2 files changed, 36 insertions(+) + +diff --git a/include/linux/delay.h b/include/linux/delay.h +index 1d0e2ce6b6d9..02b37178b54f 100644 +--- a/include/linux/delay.h ++++ b/include/linux/delay.h +@@ -76,4 +76,10 @@ static inline void fsleep(unsigned long usecs) + msleep(DIV_ROUND_UP(usecs, 1000)); + } + ++#ifdef CONFIG_PREEMPT_RT ++extern void cpu_chill(void); ++#else ++# define cpu_chill() cpu_relax() ++#endif ++ + #endif /* defined(_LINUX_DELAY_H) */ +diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c +index ede09dda36e9..9dcc6215599f 100644 +--- a/kernel/time/hrtimer.c ++++ b/kernel/time/hrtimer.c +@@ -2054,6 +2054,36 @@ SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp, + } + #endif + ++#ifdef CONFIG_PREEMPT_RT ++/* ++ * Sleep for 1 ms in hope whoever holds what we want will let it go. ++ */ ++void cpu_chill(void) ++{ ++ unsigned int freeze_flag = current->flags & PF_NOFREEZE; ++ struct task_struct *self = current; ++ ktime_t chill_time; ++ ++ raw_spin_lock_irq(&self->pi_lock); ++ self->saved_state = self->state; ++ __set_current_state_no_track(TASK_UNINTERRUPTIBLE); ++ raw_spin_unlock_irq(&self->pi_lock); ++ ++ chill_time = ktime_set(0, NSEC_PER_MSEC); ++ ++ current->flags |= PF_NOFREEZE; ++ schedule_hrtimeout(&chill_time, HRTIMER_MODE_REL_HARD); ++ if (!freeze_flag) ++ current->flags &= ~PF_NOFREEZE; ++ ++ raw_spin_lock_irq(&self->pi_lock); ++ __set_current_state_no_track(self->saved_state); ++ self->saved_state = TASK_RUNNING; ++ raw_spin_unlock_irq(&self->pi_lock); ++} ++EXPORT_SYMBOL(cpu_chill); ++#endif ++ + /* + * Functions related to boot-time initialization: + */ +-- +2.43.0 + diff --git a/debian/patches-rt/0235-fs-namespace-Use-cpu_chill-in-trylock-loops.patch b/debian/patches-rt/0235-fs-namespace-Use-cpu_chill-in-trylock-loops.patch new file mode 100644 index 000000000..046c48516 --- /dev/null +++ b/debian/patches-rt/0235-fs-namespace-Use-cpu_chill-in-trylock-loops.patch @@ -0,0 +1,44 @@ +From e174bc8c285602601d09d4e7997654e66a18bc87 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Wed, 7 Mar 2012 21:00:34 +0100 +Subject: [PATCH 235/323] fs: namespace: Use cpu_chill() in trylock loops +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Retry loops on RT might loop forever when the modifying side was +preempted. Use cpu_chill() instead of cpu_relax() to let the system +make progress. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +--- + fs/namespace.c | 8 ++++++-- + 1 file changed, 6 insertions(+), 2 deletions(-) + +diff --git a/fs/namespace.c b/fs/namespace.c +index 046b084136c5..6a550e342d6b 100644 +--- a/fs/namespace.c ++++ b/fs/namespace.c +@@ -14,6 +14,7 @@ + #include <linux/mnt_namespace.h> + #include <linux/user_namespace.h> + #include <linux/namei.h> ++#include <linux/delay.h> + #include <linux/security.h> + #include <linux/cred.h> + #include <linux/idr.h> +@@ -321,8 +322,11 @@ int __mnt_want_write(struct vfsmount *m) + * incremented count after it has set MNT_WRITE_HOLD. + */ + smp_mb(); +- while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) +- cpu_relax(); ++ while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) { ++ preempt_enable(); ++ cpu_chill(); ++ preempt_disable(); ++ } + /* + * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will + * be set to match its requirements. So we must not load that until +-- +2.43.0 + diff --git a/debian/patches-rt/0236-net-Use-skbufhead-with-raw-lock.patch b/debian/patches-rt/0236-net-Use-skbufhead-with-raw-lock.patch new file mode 100644 index 000000000..48d95d023 --- /dev/null +++ b/debian/patches-rt/0236-net-Use-skbufhead-with-raw-lock.patch @@ -0,0 +1,74 @@ +From d58fbc3fc363cee50de0ac64be0b90fd2666956a Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Tue, 12 Jul 2011 15:38:34 +0200 +Subject: [PATCH 236/323] net: Use skbufhead with raw lock +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Use the rps lock as rawlock so we can keep irq-off regions. It looks low +latency. However we can't kfree() from this context therefore we defer this +to the softirq and use the tofree_queue list for it (similar to process_queue). + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +--- + include/linux/skbuff.h | 7 +++++++ + net/core/dev.c | 6 +++--- + 2 files changed, 10 insertions(+), 3 deletions(-) + +diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h +index a210f1995862..af89bd5e7d56 100644 +--- a/include/linux/skbuff.h ++++ b/include/linux/skbuff.h +@@ -296,6 +296,7 @@ struct sk_buff_head { + + __u32 qlen; + spinlock_t lock; ++ raw_spinlock_t raw_lock; + }; + + struct sk_buff; +@@ -1892,6 +1893,12 @@ static inline void skb_queue_head_init(struct sk_buff_head *list) + __skb_queue_head_init(list); + } + ++static inline void skb_queue_head_init_raw(struct sk_buff_head *list) ++{ ++ raw_spin_lock_init(&list->raw_lock); ++ __skb_queue_head_init(list); ++} ++ + static inline void skb_queue_head_init_class(struct sk_buff_head *list, + struct lock_class_key *class) + { +diff --git a/net/core/dev.c b/net/core/dev.c +index f973f53c7a1f..895eb42bb5d6 100644 +--- a/net/core/dev.c ++++ b/net/core/dev.c +@@ -221,14 +221,14 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) + static inline void rps_lock(struct softnet_data *sd) + { + #ifdef CONFIG_RPS +- spin_lock(&sd->input_pkt_queue.lock); ++ raw_spin_lock(&sd->input_pkt_queue.raw_lock); + #endif + } + + static inline void rps_unlock(struct softnet_data *sd) + { + #ifdef CONFIG_RPS +- spin_unlock(&sd->input_pkt_queue.lock); ++ raw_spin_unlock(&sd->input_pkt_queue.raw_lock); + #endif + } + +@@ -11296,7 +11296,7 @@ static int __init net_dev_init(void) + + INIT_WORK(flush, flush_backlog); + +- skb_queue_head_init(&sd->input_pkt_queue); ++ skb_queue_head_init_raw(&sd->input_pkt_queue); + skb_queue_head_init(&sd->process_queue); + #ifdef CONFIG_XFRM_OFFLOAD + skb_queue_head_init(&sd->xfrm_backlog); +-- +2.43.0 + diff --git a/debian/patches-rt/0237-net-Dequeue-in-dev_cpu_dead-without-the-lock.patch b/debian/patches-rt/0237-net-Dequeue-in-dev_cpu_dead-without-the-lock.patch new file mode 100644 index 000000000..36e0d28d2 --- /dev/null +++ b/debian/patches-rt/0237-net-Dequeue-in-dev_cpu_dead-without-the-lock.patch @@ -0,0 +1,37 @@ +From 95dbb10c81619c9a35a63f73dadd7379d95fdec0 Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Wed, 16 Sep 2020 16:15:39 +0200 +Subject: [PATCH 237/323] net: Dequeue in dev_cpu_dead() without the lock +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Upstream uses skb_dequeue() to acquire lock of `input_pkt_queue'. The reason is +to synchronize against a remote CPU which still thinks that the CPU is online +enqueues packets to this CPU. +There are no guarantees that the packet is enqueued before the callback is run, +it just hope. +RT however complains about an not initialized lock because it uses another lock +for `input_pkt_queue' due to the IRQ-off nature of the context. + +Use the unlocked dequeue version for `input_pkt_queue'. + +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + net/core/dev.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/net/core/dev.c b/net/core/dev.c +index 895eb42bb5d6..c55993848233 100644 +--- a/net/core/dev.c ++++ b/net/core/dev.c +@@ -10980,7 +10980,7 @@ static int dev_cpu_dead(unsigned int oldcpu) + netif_rx_ni(skb); + input_queue_head_incr(oldsd); + } +- while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) { ++ while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) { + netif_rx_ni(skb); + input_queue_head_incr(oldsd); + } +-- +2.43.0 + diff --git a/debian/patches-rt/0238-net-dev-always-take-qdisc-s-busylock-in-__dev_xmit_s.patch b/debian/patches-rt/0238-net-dev-always-take-qdisc-s-busylock-in-__dev_xmit_s.patch new file mode 100644 index 000000000..fe05e1e78 --- /dev/null +++ b/debian/patches-rt/0238-net-dev-always-take-qdisc-s-busylock-in-__dev_xmit_s.patch @@ -0,0 +1,42 @@ +From 3a7ff2036ac29effdf8fc39fc7ea9610f98ca5e4 Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Wed, 30 Mar 2016 13:36:29 +0200 +Subject: [PATCH 238/323] net: dev: always take qdisc's busylock in + __dev_xmit_skb() +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +The root-lock is dropped before dev_hard_start_xmit() is invoked and after +setting the __QDISC___STATE_RUNNING bit. If this task is now pushed away +by a task with a higher priority then the task with the higher priority +won't be able to submit packets to the NIC directly instead they will be +enqueued into the Qdisc. The NIC will remain idle until the task(s) with +higher priority leave the CPU and the task with lower priority gets back +and finishes the job. + +If we take always the busylock we ensure that the RT task can boost the +low-prio task and submit the packet. + +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + net/core/dev.c | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/net/core/dev.c b/net/core/dev.c +index c55993848233..c3b4e5e320ca 100644 +--- a/net/core/dev.c ++++ b/net/core/dev.c +@@ -3796,7 +3796,11 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, + * This permits qdisc->running owner to get the lock more + * often and dequeue packets faster. + */ ++#ifdef CONFIG_PREEMPT_RT ++ contended = true; ++#else + contended = qdisc_is_running(q); ++#endif + if (unlikely(contended)) + spin_lock(&q->busylock); + +-- +2.43.0 + diff --git a/debian/patches-rt/0239-irqwork-push-most-work-into-softirq-context.patch b/debian/patches-rt/0239-irqwork-push-most-work-into-softirq-context.patch new file mode 100644 index 000000000..1ac41992f --- /dev/null +++ b/debian/patches-rt/0239-irqwork-push-most-work-into-softirq-context.patch @@ -0,0 +1,189 @@ +From 57453ffdf0bcba3fa27ca0b19f622ea371c2c686 Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Tue, 23 Jun 2015 15:32:51 +0200 +Subject: [PATCH 239/323] irqwork: push most work into softirq context +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Initially we defered all irqwork into softirq because we didn't want the +latency spikes if perf or another user was busy and delayed the RT task. +The NOHZ trigger (nohz_full_kick_work) was the first user that did not work +as expected if it did not run in the original irqwork context so we had to +bring it back somehow for it. push_irq_work_func is the second one that +requires this. + +This patch adds the IRQ_WORK_HARD_IRQ which makes sure the callback runs +in raw-irq context. Everything else is defered into softirq context. Without +-RT we have the orignal behavior. + +This patch incorporates tglx orignal work which revoked a little bringing back +the arch_irq_work_raise() if possible and a few fixes from Steven Rostedt and +Mike Galbraith, + +[bigeasy: melt tglx's irq_work_tick_soft() which splits irq_work_tick() into a + hard and soft variant] +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/linux/irq_work.h | 6 +++++ + kernel/irq_work.c | 58 +++++++++++++++++++++++++++++++--------- + kernel/sched/topology.c | 1 + + kernel/time/timer.c | 2 ++ + 4 files changed, 55 insertions(+), 12 deletions(-) + +diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h +index 30823780c192..f941f2d7d71c 100644 +--- a/include/linux/irq_work.h ++++ b/include/linux/irq_work.h +@@ -55,4 +55,10 @@ static inline void irq_work_run(void) { } + static inline void irq_work_single(void *arg) { } + #endif + ++#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_PREEMPT_RT) ++void irq_work_tick_soft(void); ++#else ++static inline void irq_work_tick_soft(void) { } ++#endif ++ + #endif /* _LINUX_IRQ_WORK_H */ +diff --git a/kernel/irq_work.c b/kernel/irq_work.c +index eca83965b631..8183d30e1bb1 100644 +--- a/kernel/irq_work.c ++++ b/kernel/irq_work.c +@@ -18,6 +18,7 @@ + #include <linux/cpu.h> + #include <linux/notifier.h> + #include <linux/smp.h> ++#include <linux/interrupt.h> + #include <asm/processor.h> + + +@@ -52,13 +53,19 @@ void __weak arch_irq_work_raise(void) + /* Enqueue on current CPU, work must already be claimed and preempt disabled */ + static void __irq_work_queue_local(struct irq_work *work) + { ++ struct llist_head *list; ++ bool lazy_work, realtime = IS_ENABLED(CONFIG_PREEMPT_RT); ++ ++ lazy_work = atomic_read(&work->flags) & IRQ_WORK_LAZY; ++ + /* If the work is "lazy", handle it from next tick if any */ +- if (atomic_read(&work->flags) & IRQ_WORK_LAZY) { +- if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) && +- tick_nohz_tick_stopped()) +- arch_irq_work_raise(); +- } else { +- if (llist_add(&work->llnode, this_cpu_ptr(&raised_list))) ++ if (lazy_work || (realtime && !(atomic_read(&work->flags) & IRQ_WORK_HARD_IRQ))) ++ list = this_cpu_ptr(&lazy_list); ++ else ++ list = this_cpu_ptr(&raised_list); ++ ++ if (llist_add(&work->llnode, list)) { ++ if (!lazy_work || tick_nohz_tick_stopped()) + arch_irq_work_raise(); + } + } +@@ -102,7 +109,13 @@ bool irq_work_queue_on(struct irq_work *work, int cpu) + if (cpu != smp_processor_id()) { + /* Arch remote IPI send/receive backend aren't NMI safe */ + WARN_ON_ONCE(in_nmi()); +- __smp_call_single_queue(cpu, &work->llnode); ++ ++ if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(atomic_read(&work->flags) & IRQ_WORK_HARD_IRQ)) { ++ if (llist_add(&work->llnode, &per_cpu(lazy_list, cpu))) ++ arch_send_call_function_single_ipi(cpu); ++ } else { ++ __smp_call_single_queue(cpu, &work->llnode); ++ } + } else { + __irq_work_queue_local(work); + } +@@ -120,9 +133,8 @@ bool irq_work_needs_cpu(void) + raised = this_cpu_ptr(&raised_list); + lazy = this_cpu_ptr(&lazy_list); + +- if (llist_empty(raised) || arch_irq_work_has_interrupt()) +- if (llist_empty(lazy)) +- return false; ++ if (llist_empty(raised) && llist_empty(lazy)) ++ return false; + + /* All work should have been flushed before going offline */ + WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); +@@ -160,8 +172,12 @@ static void irq_work_run_list(struct llist_head *list) + struct irq_work *work, *tmp; + struct llist_node *llnode; + ++#ifndef CONFIG_PREEMPT_RT ++ /* ++ * nort: On RT IRQ-work may run in SOFTIRQ context. ++ */ + BUG_ON(!irqs_disabled()); +- ++#endif + if (llist_empty(list)) + return; + +@@ -177,7 +193,16 @@ static void irq_work_run_list(struct llist_head *list) + void irq_work_run(void) + { + irq_work_run_list(this_cpu_ptr(&raised_list)); +- irq_work_run_list(this_cpu_ptr(&lazy_list)); ++ if (IS_ENABLED(CONFIG_PREEMPT_RT)) { ++ /* ++ * NOTE: we raise softirq via IPI for safety, ++ * and execute in irq_work_tick() to move the ++ * overhead from hard to soft irq context. ++ */ ++ if (!llist_empty(this_cpu_ptr(&lazy_list))) ++ raise_softirq(TIMER_SOFTIRQ); ++ } else ++ irq_work_run_list(this_cpu_ptr(&lazy_list)); + } + EXPORT_SYMBOL_GPL(irq_work_run); + +@@ -187,8 +212,17 @@ void irq_work_tick(void) + + if (!llist_empty(raised) && !arch_irq_work_has_interrupt()) + irq_work_run_list(raised); ++ ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ++ irq_work_run_list(this_cpu_ptr(&lazy_list)); ++} ++ ++#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_PREEMPT_RT) ++void irq_work_tick_soft(void) ++{ + irq_work_run_list(this_cpu_ptr(&lazy_list)); + } ++#endif + + /* + * Synchronize against the irq_work @entry, ensures the entry is not +diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c +index ff2c6d3ba6c7..2f1bea4f7dba 100644 +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -514,6 +514,7 @@ static int init_rootdomain(struct root_domain *rd) + rd->rto_cpu = -1; + raw_spin_lock_init(&rd->rto_lock); + init_irq_work(&rd->rto_push_work, rto_push_irq_work_func); ++ atomic_or(IRQ_WORK_HARD_IRQ, &rd->rto_push_work.flags); + #endif + + init_dl_bw(&rd->dl_bw); +diff --git a/kernel/time/timer.c b/kernel/time/timer.c +index a4fdc7cfb723..1cad0efd635c 100644 +--- a/kernel/time/timer.c ++++ b/kernel/time/timer.c +@@ -1770,6 +1770,8 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h) + { + struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); + ++ irq_work_tick_soft(); ++ + __run_timers(base); + if (IS_ENABLED(CONFIG_NO_HZ_COMMON)) + __run_timers(this_cpu_ptr(&timer_bases[BASE_DEF])); +-- +2.43.0 + diff --git a/debian/patches-rt/0240-x86-crypto-Reduce-preempt-disabled-regions.patch b/debian/patches-rt/0240-x86-crypto-Reduce-preempt-disabled-regions.patch new file mode 100644 index 000000000..b84653cfb --- /dev/null +++ b/debian/patches-rt/0240-x86-crypto-Reduce-preempt-disabled-regions.patch @@ -0,0 +1,118 @@ +From 523f64f1a87e6454e46f5e25ae797b6a7050a6a8 Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra <peterz@infradead.org> +Date: Mon, 14 Nov 2011 18:19:27 +0100 +Subject: [PATCH 240/323] x86: crypto: Reduce preempt disabled regions +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Restrict the preempt disabled regions to the actual floating point +operations and enable preemption for the administrative actions. + +This is necessary on RT to avoid that kfree and other operations are +called with preemption disabled. + +Reported-and-tested-by: Carsten Emde <cbe@osadl.org> +Signed-off-by: Peter Zijlstra <peterz@infradead.org> + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +--- + arch/x86/crypto/aesni-intel_glue.c | 22 ++++++++++++---------- + 1 file changed, 12 insertions(+), 10 deletions(-) + +diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c +index be891fdf8d17..29c716ed103f 100644 +--- a/arch/x86/crypto/aesni-intel_glue.c ++++ b/arch/x86/crypto/aesni-intel_glue.c +@@ -379,14 +379,14 @@ static int ecb_encrypt(struct skcipher_request *req) + + err = skcipher_walk_virt(&walk, req, true); + +- kernel_fpu_begin(); + while ((nbytes = walk.nbytes)) { ++ kernel_fpu_begin(); + aesni_ecb_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr, + nbytes & AES_BLOCK_MASK); ++ kernel_fpu_end(); + nbytes &= AES_BLOCK_SIZE - 1; + err = skcipher_walk_done(&walk, nbytes); + } +- kernel_fpu_end(); + + return err; + } +@@ -401,14 +401,14 @@ static int ecb_decrypt(struct skcipher_request *req) + + err = skcipher_walk_virt(&walk, req, true); + +- kernel_fpu_begin(); + while ((nbytes = walk.nbytes)) { ++ kernel_fpu_begin(); + aesni_ecb_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr, + nbytes & AES_BLOCK_MASK); ++ kernel_fpu_end(); + nbytes &= AES_BLOCK_SIZE - 1; + err = skcipher_walk_done(&walk, nbytes); + } +- kernel_fpu_end(); + + return err; + } +@@ -423,14 +423,14 @@ static int cbc_encrypt(struct skcipher_request *req) + + err = skcipher_walk_virt(&walk, req, true); + +- kernel_fpu_begin(); + while ((nbytes = walk.nbytes)) { ++ kernel_fpu_begin(); + aesni_cbc_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr, + nbytes & AES_BLOCK_MASK, walk.iv); ++ kernel_fpu_end(); + nbytes &= AES_BLOCK_SIZE - 1; + err = skcipher_walk_done(&walk, nbytes); + } +- kernel_fpu_end(); + + return err; + } +@@ -445,14 +445,14 @@ static int cbc_decrypt(struct skcipher_request *req) + + err = skcipher_walk_virt(&walk, req, true); + +- kernel_fpu_begin(); + while ((nbytes = walk.nbytes)) { ++ kernel_fpu_begin(); + aesni_cbc_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr, + nbytes & AES_BLOCK_MASK, walk.iv); ++ kernel_fpu_end(); + nbytes &= AES_BLOCK_SIZE - 1; + err = skcipher_walk_done(&walk, nbytes); + } +- kernel_fpu_end(); + + return err; + } +@@ -500,18 +500,20 @@ static int ctr_crypt(struct skcipher_request *req) + + err = skcipher_walk_virt(&walk, req, true); + +- kernel_fpu_begin(); + while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) { ++ kernel_fpu_begin(); + aesni_ctr_enc_tfm(ctx, walk.dst.virt.addr, walk.src.virt.addr, + nbytes & AES_BLOCK_MASK, walk.iv); ++ kernel_fpu_end(); + nbytes &= AES_BLOCK_SIZE - 1; + err = skcipher_walk_done(&walk, nbytes); + } + if (walk.nbytes) { ++ kernel_fpu_begin(); + ctr_crypt_final(ctx, &walk); ++ kernel_fpu_end(); + err = skcipher_walk_done(&walk, 0); + } +- kernel_fpu_end(); + + return err; + } +-- +2.43.0 + diff --git a/debian/patches-rt/0241-crypto-Reduce-preempt-disabled-regions-more-algos.patch b/debian/patches-rt/0241-crypto-Reduce-preempt-disabled-regions-more-algos.patch new file mode 100644 index 000000000..f02d88086 --- /dev/null +++ b/debian/patches-rt/0241-crypto-Reduce-preempt-disabled-regions-more-algos.patch @@ -0,0 +1,240 @@ +From e9bc715aa25df5aa654a5243e90ee31fb1b1e010 Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Fri, 21 Feb 2014 17:24:04 +0100 +Subject: [PATCH 241/323] crypto: Reduce preempt disabled regions, more algos +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Don Estabrook reported +| kernel: WARNING: CPU: 2 PID: 858 at kernel/sched/core.c:2428 migrate_disable+0xed/0x100() +| kernel: WARNING: CPU: 2 PID: 858 at kernel/sched/core.c:2462 migrate_enable+0x17b/0x200() +| kernel: WARNING: CPU: 3 PID: 865 at kernel/sched/core.c:2428 migrate_disable+0xed/0x100() + +and his backtrace showed some crypto functions which looked fine. + +The problem is the following sequence: + +glue_xts_crypt_128bit() +{ + blkcipher_walk_virt(); /* normal migrate_disable() */ + + glue_fpu_begin(); /* get atomic */ + + while (nbytes) { + __glue_xts_crypt_128bit(); + blkcipher_walk_done(); /* with nbytes = 0, migrate_enable() + * while we are atomic */ + }; + glue_fpu_end() /* no longer atomic */ +} + +and this is why the counter get out of sync and the warning is printed. +The other problem is that we are non-preemptible between +glue_fpu_begin() and glue_fpu_end() and the latency grows. To fix this, +I shorten the FPU off region and ensure blkcipher_walk_done() is called +with preemption enabled. This might hurt the performance because we now +enable/disable the FPU state more often but we gain lower latency and +the bug is gone. + +Reported-by: Don Estabrook <don.estabrook@gmail.com> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + arch/x86/crypto/cast5_avx_glue.c | 21 +++++++++------------ + arch/x86/crypto/glue_helper.c | 26 +++++++++++++++----------- + 2 files changed, 24 insertions(+), 23 deletions(-) + +diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c +index 384ccb00f9e1..2f8df8ef8644 100644 +--- a/arch/x86/crypto/cast5_avx_glue.c ++++ b/arch/x86/crypto/cast5_avx_glue.c +@@ -46,7 +46,7 @@ static inline void cast5_fpu_end(bool fpu_enabled) + + static int ecb_crypt(struct skcipher_request *req, bool enc) + { +- bool fpu_enabled = false; ++ bool fpu_enabled; + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct cast5_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; +@@ -61,7 +61,7 @@ static int ecb_crypt(struct skcipher_request *req, bool enc) + u8 *wsrc = walk.src.virt.addr; + u8 *wdst = walk.dst.virt.addr; + +- fpu_enabled = cast5_fpu_begin(fpu_enabled, &walk, nbytes); ++ fpu_enabled = cast5_fpu_begin(false, &walk, nbytes); + + /* Process multi-block batch */ + if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) { +@@ -90,10 +90,9 @@ static int ecb_crypt(struct skcipher_request *req, bool enc) + } while (nbytes >= bsize); + + done: ++ cast5_fpu_end(fpu_enabled); + err = skcipher_walk_done(&walk, nbytes); + } +- +- cast5_fpu_end(fpu_enabled); + return err; + } + +@@ -197,7 +196,7 @@ static int cbc_decrypt(struct skcipher_request *req) + { + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct cast5_ctx *ctx = crypto_skcipher_ctx(tfm); +- bool fpu_enabled = false; ++ bool fpu_enabled; + struct skcipher_walk walk; + unsigned int nbytes; + int err; +@@ -205,12 +204,11 @@ static int cbc_decrypt(struct skcipher_request *req) + err = skcipher_walk_virt(&walk, req, false); + + while ((nbytes = walk.nbytes)) { +- fpu_enabled = cast5_fpu_begin(fpu_enabled, &walk, nbytes); ++ fpu_enabled = cast5_fpu_begin(false, &walk, nbytes); + nbytes = __cbc_decrypt(ctx, &walk); ++ cast5_fpu_end(fpu_enabled); + err = skcipher_walk_done(&walk, nbytes); + } +- +- cast5_fpu_end(fpu_enabled); + return err; + } + +@@ -277,7 +275,7 @@ static int ctr_crypt(struct skcipher_request *req) + { + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct cast5_ctx *ctx = crypto_skcipher_ctx(tfm); +- bool fpu_enabled = false; ++ bool fpu_enabled; + struct skcipher_walk walk; + unsigned int nbytes; + int err; +@@ -285,13 +283,12 @@ static int ctr_crypt(struct skcipher_request *req) + err = skcipher_walk_virt(&walk, req, false); + + while ((nbytes = walk.nbytes) >= CAST5_BLOCK_SIZE) { +- fpu_enabled = cast5_fpu_begin(fpu_enabled, &walk, nbytes); ++ fpu_enabled = cast5_fpu_begin(false, &walk, nbytes); + nbytes = __ctr_crypt(&walk, ctx); ++ cast5_fpu_end(fpu_enabled); + err = skcipher_walk_done(&walk, nbytes); + } + +- cast5_fpu_end(fpu_enabled); +- + if (walk.nbytes) { + ctr_crypt_final(&walk, ctx); + err = skcipher_walk_done(&walk, 0); +diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c +index d3d91a0abf88..6d0774721514 100644 +--- a/arch/x86/crypto/glue_helper.c ++++ b/arch/x86/crypto/glue_helper.c +@@ -24,7 +24,7 @@ int glue_ecb_req_128bit(const struct common_glue_ctx *gctx, + void *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req)); + const unsigned int bsize = 128 / 8; + struct skcipher_walk walk; +- bool fpu_enabled = false; ++ bool fpu_enabled; + unsigned int nbytes; + int err; + +@@ -37,7 +37,7 @@ int glue_ecb_req_128bit(const struct common_glue_ctx *gctx, + unsigned int i; + + fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, +- &walk, fpu_enabled, nbytes); ++ &walk, false, nbytes); + for (i = 0; i < gctx->num_funcs; i++) { + func_bytes = bsize * gctx->funcs[i].num_blocks; + +@@ -55,10 +55,9 @@ int glue_ecb_req_128bit(const struct common_glue_ctx *gctx, + if (nbytes < bsize) + break; + } ++ glue_fpu_end(fpu_enabled); + err = skcipher_walk_done(&walk, nbytes); + } +- +- glue_fpu_end(fpu_enabled); + return err; + } + EXPORT_SYMBOL_GPL(glue_ecb_req_128bit); +@@ -101,7 +100,7 @@ int glue_cbc_decrypt_req_128bit(const struct common_glue_ctx *gctx, + void *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req)); + const unsigned int bsize = 128 / 8; + struct skcipher_walk walk; +- bool fpu_enabled = false; ++ bool fpu_enabled; + unsigned int nbytes; + int err; + +@@ -115,7 +114,7 @@ int glue_cbc_decrypt_req_128bit(const struct common_glue_ctx *gctx, + u128 last_iv; + + fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, +- &walk, fpu_enabled, nbytes); ++ &walk, false, nbytes); + /* Start of the last block. */ + src += nbytes / bsize - 1; + dst += nbytes / bsize - 1; +@@ -148,10 +147,10 @@ int glue_cbc_decrypt_req_128bit(const struct common_glue_ctx *gctx, + done: + u128_xor(dst, dst, (u128 *)walk.iv); + *(u128 *)walk.iv = last_iv; ++ glue_fpu_end(fpu_enabled); + err = skcipher_walk_done(&walk, nbytes); + } + +- glue_fpu_end(fpu_enabled); + return err; + } + EXPORT_SYMBOL_GPL(glue_cbc_decrypt_req_128bit); +@@ -162,7 +161,7 @@ int glue_ctr_req_128bit(const struct common_glue_ctx *gctx, + void *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req)); + const unsigned int bsize = 128 / 8; + struct skcipher_walk walk; +- bool fpu_enabled = false; ++ bool fpu_enabled; + unsigned int nbytes; + int err; + +@@ -176,7 +175,7 @@ int glue_ctr_req_128bit(const struct common_glue_ctx *gctx, + le128 ctrblk; + + fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, +- &walk, fpu_enabled, nbytes); ++ &walk, false, nbytes); + + be128_to_le128(&ctrblk, (be128 *)walk.iv); + +@@ -202,11 +201,10 @@ int glue_ctr_req_128bit(const struct common_glue_ctx *gctx, + } + + le128_to_be128((be128 *)walk.iv, &ctrblk); ++ glue_fpu_end(fpu_enabled); + err = skcipher_walk_done(&walk, nbytes); + } + +- glue_fpu_end(fpu_enabled); +- + if (nbytes) { + le128 ctrblk; + u128 tmp; +@@ -306,8 +304,14 @@ int glue_xts_req_128bit(const struct common_glue_ctx *gctx, + tweak_fn(tweak_ctx, walk.iv, walk.iv); + + while (nbytes) { ++ fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, ++ &walk, fpu_enabled, ++ nbytes < bsize ? bsize : nbytes); + nbytes = __glue_xts_req_128bit(gctx, crypt_ctx, &walk); + ++ glue_fpu_end(fpu_enabled); ++ fpu_enabled = false; ++ + err = skcipher_walk_done(&walk, nbytes); + nbytes = walk.nbytes; + } +-- +2.43.0 + diff --git a/debian/patches-rt/0242-crypto-limit-more-FPU-enabled-sections.patch b/debian/patches-rt/0242-crypto-limit-more-FPU-enabled-sections.patch new file mode 100644 index 000000000..9f98cd223 --- /dev/null +++ b/debian/patches-rt/0242-crypto-limit-more-FPU-enabled-sections.patch @@ -0,0 +1,74 @@ +From dd8a48342b4ff4b80a32a4ae49115db93ad069c1 Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Thu, 30 Nov 2017 13:40:10 +0100 +Subject: [PATCH 242/323] crypto: limit more FPU-enabled sections +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Those crypto drivers use SSE/AVX/… for their crypto work and in order to +do so in kernel they need to enable the "FPU" in kernel mode which +disables preemption. +There are two problems with the way they are used: +- the while loop which processes X bytes may create latency spikes and + should be avoided or limited. +- the cipher-walk-next part may allocate/free memory and may use + kmap_atomic(). + +The whole kernel_fpu_begin()/end() processing isn't probably that cheap. +It most likely makes sense to process as much of those as possible in one +go. The new *_fpu_sched_rt() schedules only if a RT task is pending. + +Probably we should measure the performance those ciphers in pure SW +mode and with this optimisations to see if it makes sense to keep them +for RT. + +This kernel_fpu_resched() makes the code more preemptible which might hurt +performance. + +Cc: stable-rt@vger.kernel.org +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + arch/x86/include/asm/fpu/api.h | 1 + + arch/x86/kernel/fpu/core.c | 12 ++++++++++++ + 2 files changed, 13 insertions(+) + +diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h +index ad2fee785310..d31b0886592a 100644 +--- a/arch/x86/include/asm/fpu/api.h ++++ b/arch/x86/include/asm/fpu/api.h +@@ -28,6 +28,7 @@ extern void kernel_fpu_begin_mask(unsigned int kfpu_mask); + extern void kernel_fpu_end(void); + extern bool irq_fpu_usable(void); + extern void fpregs_mark_activate(void); ++extern void kernel_fpu_resched(void); + + /* Code that is unaware of kernel_fpu_begin_mask() can use this */ + static inline void kernel_fpu_begin(void) +diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c +index 835b948095cd..8cfc2f972fe5 100644 +--- a/arch/x86/kernel/fpu/core.c ++++ b/arch/x86/kernel/fpu/core.c +@@ -144,6 +144,18 @@ void kernel_fpu_end(void) + } + EXPORT_SYMBOL_GPL(kernel_fpu_end); + ++void kernel_fpu_resched(void) ++{ ++ WARN_ON_FPU(!this_cpu_read(in_kernel_fpu)); ++ ++ if (should_resched(PREEMPT_OFFSET)) { ++ kernel_fpu_end(); ++ cond_resched(); ++ kernel_fpu_begin(); ++ } ++} ++EXPORT_SYMBOL_GPL(kernel_fpu_resched); ++ + /* + * Save the FPU state (mark it for reload if necessary): + * +-- +2.43.0 + diff --git a/debian/patches-rt/0243-panic-skip-get_random_bytes-for-RT_FULL-in-init_oops.patch b/debian/patches-rt/0243-panic-skip-get_random_bytes-for-RT_FULL-in-init_oops.patch new file mode 100644 index 000000000..fbcd9407a --- /dev/null +++ b/debian/patches-rt/0243-panic-skip-get_random_bytes-for-RT_FULL-in-init_oops.patch @@ -0,0 +1,34 @@ +From e0a748983b07205d9f2f52230c897f49a2c84c8d Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Tue, 14 Jul 2015 14:26:34 +0200 +Subject: [PATCH 243/323] panic: skip get_random_bytes for RT_FULL in + init_oops_id +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Disable on -RT. If this is invoked from irq-context we will have problems +to acquire the sleeping lock. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +--- + kernel/panic.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/kernel/panic.c b/kernel/panic.c +index 5a1a4bf2feb3..aa36078657be 100644 +--- a/kernel/panic.c ++++ b/kernel/panic.c +@@ -621,9 +621,11 @@ static u64 oops_id; + + static int init_oops_id(void) + { ++#ifndef CONFIG_PREEMPT_RT + if (!oops_id) + get_random_bytes(&oops_id, sizeof(oops_id)); + else ++#endif + oops_id++; + + return 0; +-- +2.43.0 + diff --git a/debian/patches-rt/0244-x86-stackprotector-Avoid-random-pool-on-rt.patch b/debian/patches-rt/0244-x86-stackprotector-Avoid-random-pool-on-rt.patch new file mode 100644 index 000000000..c8e46ef39 --- /dev/null +++ b/debian/patches-rt/0244-x86-stackprotector-Avoid-random-pool-on-rt.patch @@ -0,0 +1,51 @@ +From f7c4220e56be1d8fc2bc339984fb5919ee02cf4f Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Thu, 16 Dec 2010 14:25:18 +0100 +Subject: [PATCH 244/323] x86: stackprotector: Avoid random pool on rt +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +CPU bringup calls into the random pool to initialize the stack +canary. During boot that works nicely even on RT as the might sleep +checks are disabled. During CPU hotplug the might sleep checks +trigger. Making the locks in random raw is a major PITA, so avoid the +call on RT is the only sensible solution. This is basically the same +randomness which we get during boot where the random pool has no +entropy and we rely on the TSC randomnness. + +Reported-by: Carsten Emde <carsten.emde@osadl.org> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +--- + arch/x86/include/asm/stackprotector.h | 8 +++++++- + 1 file changed, 7 insertions(+), 1 deletion(-) + +diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h +index 7fb482f0f25b..3df0a95c9e13 100644 +--- a/arch/x86/include/asm/stackprotector.h ++++ b/arch/x86/include/asm/stackprotector.h +@@ -65,7 +65,7 @@ + */ + static __always_inline void boot_init_stack_canary(void) + { +- u64 canary; ++ u64 canary = 0; + u64 tsc; + + #ifdef CONFIG_X86_64 +@@ -76,8 +76,14 @@ static __always_inline void boot_init_stack_canary(void) + * of randomness. The TSC only matters for very early init, + * there it already has some randomness on most systems. Later + * on during the bootup the random pool has true entropy too. ++ * For preempt-rt we need to weaken the randomness a bit, as ++ * we can't call into the random generator from atomic context ++ * due to locking constraints. We just leave canary ++ * uninitialized and use the TSC based randomness on top of it. + */ ++#ifndef CONFIG_PREEMPT_RT + get_random_bytes(&canary, sizeof(canary)); ++#endif + tsc = rdtsc(); + canary += tsc + (tsc << 32UL); + canary &= CANARY_MASK; +-- +2.43.0 + diff --git a/debian/patches-rt/0245-net-Remove-preemption-disabling-in-netif_rx.patch b/debian/patches-rt/0245-net-Remove-preemption-disabling-in-netif_rx.patch new file mode 100644 index 000000000..88ffbf8ad --- /dev/null +++ b/debian/patches-rt/0245-net-Remove-preemption-disabling-in-netif_rx.patch @@ -0,0 +1,68 @@ +From ceda0e136831589d22c9bd35844b3fafcc722e7c Mon Sep 17 00:00:00 2001 +From: Priyanka Jain <Priyanka.Jain@freescale.com> +Date: Thu, 17 May 2012 09:35:11 +0530 +Subject: [PATCH 245/323] net: Remove preemption disabling in netif_rx() +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +1)enqueue_to_backlog() (called from netif_rx) should be + bind to a particluar CPU. This can be achieved by + disabling migration. No need to disable preemption + +2)Fixes crash "BUG: scheduling while atomic: ksoftirqd" + in case of RT. + If preemption is disabled, enqueue_to_backog() is called + in atomic context. And if backlog exceeds its count, + kfree_skb() is called. But in RT, kfree_skb() might + gets scheduled out, so it expects non atomic context. + +-Replace preempt_enable(), preempt_disable() with + migrate_enable(), migrate_disable() respectively +-Replace get_cpu(), put_cpu() with get_cpu_light(), + put_cpu_light() respectively + +Signed-off-by: Priyanka Jain <Priyanka.Jain@freescale.com> +Acked-by: Rajan Srivastava <Rajan.Srivastava@freescale.com> +Cc: <rostedt@goodmis.orgn> +Link: http://lkml.kernel.org/r/1337227511-2271-1-git-send-email-Priyanka.Jain@freescale.com + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +[bigeasy: Remove assumption about migrate_disable() from the description.] +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + net/core/dev.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/net/core/dev.c b/net/core/dev.c +index c3b4e5e320ca..fb00a777f559 100644 +--- a/net/core/dev.c ++++ b/net/core/dev.c +@@ -4818,7 +4818,7 @@ static int netif_rx_internal(struct sk_buff *skb) + struct rps_dev_flow voidflow, *rflow = &voidflow; + int cpu; + +- preempt_disable(); ++ migrate_disable(); + rcu_read_lock(); + + cpu = get_rps_cpu(skb->dev, skb, &rflow); +@@ -4828,14 +4828,14 @@ static int netif_rx_internal(struct sk_buff *skb) + ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); + + rcu_read_unlock(); +- preempt_enable(); ++ migrate_enable(); + } else + #endif + { + unsigned int qtail; + +- ret = enqueue_to_backlog(skb, get_cpu(), &qtail); +- put_cpu(); ++ ret = enqueue_to_backlog(skb, get_cpu_light(), &qtail); ++ put_cpu_light(); + } + return ret; + } +-- +2.43.0 + diff --git a/debian/patches-rt/0246-lockdep-Make-it-RT-aware.patch b/debian/patches-rt/0246-lockdep-Make-it-RT-aware.patch new file mode 100644 index 000000000..6c7174f34 --- /dev/null +++ b/debian/patches-rt/0246-lockdep-Make-it-RT-aware.patch @@ -0,0 +1,78 @@ +From 9d9c323e950c13cde64d5ae87dd12e0e3b81abd8 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Sun, 17 Jul 2011 18:51:23 +0200 +Subject: [PATCH 246/323] lockdep: Make it RT aware +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +teach lockdep that we don't really do softirqs on -RT. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +--- + include/linux/irqflags.h | 23 +++++++++++++++-------- + kernel/locking/lockdep.c | 2 ++ + 2 files changed, 17 insertions(+), 8 deletions(-) + +diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h +index 3ed4e8771b64..a437b2e70d37 100644 +--- a/include/linux/irqflags.h ++++ b/include/linux/irqflags.h +@@ -71,14 +71,6 @@ do { \ + do { \ + __this_cpu_dec(hardirq_context); \ + } while (0) +-# define lockdep_softirq_enter() \ +-do { \ +- current->softirq_context++; \ +-} while (0) +-# define lockdep_softirq_exit() \ +-do { \ +- current->softirq_context--; \ +-} while (0) + + # define lockdep_hrtimer_enter(__hrtimer) \ + ({ \ +@@ -140,6 +132,21 @@ do { \ + # define lockdep_irq_work_exit(__work) do { } while (0) + #endif + ++#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PREEMPT_RT) ++# define lockdep_softirq_enter() \ ++do { \ ++ current->softirq_context++; \ ++} while (0) ++# define lockdep_softirq_exit() \ ++do { \ ++ current->softirq_context--; \ ++} while (0) ++ ++#else ++# define lockdep_softirq_enter() do { } while (0) ++# define lockdep_softirq_exit() do { } while (0) ++#endif ++ + #if defined(CONFIG_IRQSOFF_TRACER) || \ + defined(CONFIG_PREEMPT_TRACER) + extern void stop_critical_timings(void); +diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c +index 7471d85f54ae..4fb9e6301018 100644 +--- a/kernel/locking/lockdep.c ++++ b/kernel/locking/lockdep.c +@@ -5414,6 +5414,7 @@ static noinstr void check_flags(unsigned long flags) + } + } + ++#ifndef CONFIG_PREEMPT_RT + /* + * We dont accurately track softirq state in e.g. + * hardirq contexts (such as on 4KSTACKS), so only +@@ -5428,6 +5429,7 @@ static noinstr void check_flags(unsigned long flags) + DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled); + } + } ++#endif + + if (!debug_locks) + print_irqtrace_events(current); +-- +2.43.0 + diff --git a/debian/patches-rt/0247-lockdep-selftest-Only-do-hardirq-context-test-for-ra.patch b/debian/patches-rt/0247-lockdep-selftest-Only-do-hardirq-context-test-for-ra.patch new file mode 100644 index 000000000..c4ca02928 --- /dev/null +++ b/debian/patches-rt/0247-lockdep-selftest-Only-do-hardirq-context-test-for-ra.patch @@ -0,0 +1,62 @@ +From 98d28c479e8b02a795109f6bc0b6aa1da54f6ede Mon Sep 17 00:00:00 2001 +From: Yong Zhang <yong.zhang@windriver.com> +Date: Mon, 16 Apr 2012 15:01:56 +0800 +Subject: [PATCH 247/323] lockdep: selftest: Only do hardirq context test for + raw spinlock +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +On -rt there is no softirq context any more and rwlock is sleepable, +disable softirq context test and rwlock+irq test. + +Signed-off-by: Yong Zhang <yong.zhang0@gmail.com> +Cc: Yong Zhang <yong.zhang@windriver.com> +Link: http://lkml.kernel.org/r/1334559716-18447-3-git-send-email-yong.zhang0@gmail.com +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +--- + lib/locking-selftest.c | 23 +++++++++++++++++++++++ + 1 file changed, 23 insertions(+) + +diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c +index 76c52b0b76d3..663c071bc445 100644 +--- a/lib/locking-selftest.c ++++ b/lib/locking-selftest.c +@@ -2456,6 +2456,7 @@ void locking_selftest(void) + + printk(" --------------------------------------------------------------------------\n"); + ++#ifndef CONFIG_PREEMPT_RT + /* + * irq-context testcases: + */ +@@ -2470,6 +2471,28 @@ void locking_selftest(void) + DO_TESTCASE_6x2x2RW("irq read-recursion #2", irq_read_recursion2); + DO_TESTCASE_6x2x2RW("irq read-recursion #3", irq_read_recursion3); + ++#else ++ /* On -rt, we only do hardirq context test for raw spinlock */ ++ DO_TESTCASE_1B("hard-irqs-on + irq-safe-A", irqsafe1_hard_spin, 12); ++ DO_TESTCASE_1B("hard-irqs-on + irq-safe-A", irqsafe1_hard_spin, 21); ++ ++ DO_TESTCASE_1B("hard-safe-A + irqs-on", irqsafe2B_hard_spin, 12); ++ DO_TESTCASE_1B("hard-safe-A + irqs-on", irqsafe2B_hard_spin, 21); ++ ++ DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 123); ++ DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 132); ++ DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 213); ++ DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 231); ++ DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 312); ++ DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 321); ++ ++ DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 123); ++ DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 132); ++ DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 213); ++ DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 231); ++ DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 312); ++ DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 321); ++#endif + ww_tests(); + + force_read_lock_recursive = 0; +-- +2.43.0 + diff --git a/debian/patches-rt/0248-lockdep-selftest-fix-warnings-due-to-missing-PREEMPT.patch b/debian/patches-rt/0248-lockdep-selftest-fix-warnings-due-to-missing-PREEMPT.patch new file mode 100644 index 000000000..dbb683c42 --- /dev/null +++ b/debian/patches-rt/0248-lockdep-selftest-fix-warnings-due-to-missing-PREEMPT.patch @@ -0,0 +1,149 @@ +From c10790e859050b8a0a251e2e57b7fffb9ffa75df Mon Sep 17 00:00:00 2001 +From: Josh Cartwright <josh.cartwright@ni.com> +Date: Wed, 28 Jan 2015 13:08:45 -0600 +Subject: [PATCH 248/323] lockdep: selftest: fix warnings due to missing + PREEMPT_RT conditionals +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +"lockdep: Selftest: Only do hardirq context test for raw spinlock" +disabled the execution of certain tests with PREEMPT_RT, but did +not prevent the tests from still being defined. This leads to warnings +like: + + ./linux/lib/locking-selftest.c:574:1: warning: 'irqsafe1_hard_rlock_12' defined but not used [-Wunused-function] + ./linux/lib/locking-selftest.c:574:1: warning: 'irqsafe1_hard_rlock_21' defined but not used [-Wunused-function] + ./linux/lib/locking-selftest.c:577:1: warning: 'irqsafe1_hard_wlock_12' defined but not used [-Wunused-function] + ./linux/lib/locking-selftest.c:577:1: warning: 'irqsafe1_hard_wlock_21' defined but not used [-Wunused-function] + ./linux/lib/locking-selftest.c:580:1: warning: 'irqsafe1_soft_spin_12' defined but not used [-Wunused-function] + ... + +Fixed by wrapping the test definitions in #ifndef CONFIG_PREEMPT_RT +conditionals. + +Signed-off-by: Josh Cartwright <josh.cartwright@ni.com> +Signed-off-by: Xander Huff <xander.huff@ni.com> +Acked-by: Gratian Crisan <gratian.crisan@ni.com> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + lib/locking-selftest.c | 28 ++++++++++++++++++++++++++++ + 1 file changed, 28 insertions(+) + +diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c +index 663c071bc445..98c376b02dff 100644 +--- a/lib/locking-selftest.c ++++ b/lib/locking-selftest.c +@@ -787,6 +787,8 @@ GENERATE_TESTCASE(init_held_rtmutex); + #include "locking-selftest-spin-hardirq.h" + GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_spin) + ++#ifndef CONFIG_PREEMPT_RT ++ + #include "locking-selftest-rlock-hardirq.h" + GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_rlock) + +@@ -802,9 +804,12 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_rlock) + #include "locking-selftest-wlock-softirq.h" + GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_wlock) + ++#endif ++ + #undef E1 + #undef E2 + ++#ifndef CONFIG_PREEMPT_RT + /* + * Enabling hardirqs with a softirq-safe lock held: + */ +@@ -837,6 +842,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2A_rlock) + #undef E1 + #undef E2 + ++#endif ++ + /* + * Enabling irqs with an irq-safe lock held: + */ +@@ -860,6 +867,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2A_rlock) + #include "locking-selftest-spin-hardirq.h" + GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_spin) + ++#ifndef CONFIG_PREEMPT_RT ++ + #include "locking-selftest-rlock-hardirq.h" + GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_rlock) + +@@ -875,6 +884,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_rlock) + #include "locking-selftest-wlock-softirq.h" + GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_wlock) + ++#endif ++ + #undef E1 + #undef E2 + +@@ -906,6 +917,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_wlock) + #include "locking-selftest-spin-hardirq.h" + GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_spin) + ++#ifndef CONFIG_PREEMPT_RT ++ + #include "locking-selftest-rlock-hardirq.h" + GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_rlock) + +@@ -921,6 +934,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_rlock) + #include "locking-selftest-wlock-softirq.h" + GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_wlock) + ++#endif ++ + #undef E1 + #undef E2 + #undef E3 +@@ -954,6 +969,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_wlock) + #include "locking-selftest-spin-hardirq.h" + GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_spin) + ++#ifndef CONFIG_PREEMPT_RT ++ + #include "locking-selftest-rlock-hardirq.h" + GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_rlock) + +@@ -969,10 +986,14 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_rlock) + #include "locking-selftest-wlock-softirq.h" + GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_wlock) + ++#endif ++ + #undef E1 + #undef E2 + #undef E3 + ++#ifndef CONFIG_PREEMPT_RT ++ + /* + * read-lock / write-lock irq inversion. + * +@@ -1162,6 +1183,11 @@ GENERATE_PERMUTATIONS_3_EVENTS(W1W2_R2R3_R3W1) + #undef E1 + #undef E2 + #undef E3 ++ ++#endif ++ ++#ifndef CONFIG_PREEMPT_RT ++ + /* + * read-lock / write-lock recursion that is actually safe. + */ +@@ -1208,6 +1234,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion_soft_wlock) + #undef E2 + #undef E3 + ++#endif ++ + /* + * read-lock / write-lock recursion that is unsafe. + */ +-- +2.43.0 + diff --git a/debian/patches-rt/0249-lockdep-disable-self-test.patch b/debian/patches-rt/0249-lockdep-disable-self-test.patch new file mode 100644 index 000000000..bd19a4f00 --- /dev/null +++ b/debian/patches-rt/0249-lockdep-disable-self-test.patch @@ -0,0 +1,35 @@ +From da32b1f1c20ebd096acfb511cf3a608169d210c7 Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Tue, 17 Oct 2017 16:36:18 +0200 +Subject: [PATCH 249/323] lockdep: disable self-test +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +The self-test wasn't always 100% accurate for RT. We disabled a few +tests which failed because they had a different semantic for RT. Some +still reported false positives. Now the selftest locks up the system +during boot and it needs to be investigated… + +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + lib/Kconfig.debug | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug +index 24ca61cf86dd..70172d5ab82a 100644 +--- a/lib/Kconfig.debug ++++ b/lib/Kconfig.debug +@@ -1373,7 +1373,7 @@ config DEBUG_ATOMIC_SLEEP + + config DEBUG_LOCKING_API_SELFTESTS + bool "Locking API boot-time self-tests" +- depends on DEBUG_KERNEL ++ depends on DEBUG_KERNEL && !PREEMPT_RT + help + Say Y here if you want the kernel to run a short self-test during + bootup. The self-test checks whether common types of locking bugs +-- +2.43.0 + diff --git a/debian/patches-rt/0250-drm-radeon-i915-Use-preempt_disable-enable_rt-where-.patch b/debian/patches-rt/0250-drm-radeon-i915-Use-preempt_disable-enable_rt-where-.patch new file mode 100644 index 000000000..c0e7434f2 --- /dev/null +++ b/debian/patches-rt/0250-drm-radeon-i915-Use-preempt_disable-enable_rt-where-.patch @@ -0,0 +1,61 @@ +From 2c69acb6875156b092516fc81bd2a05b96973c2b Mon Sep 17 00:00:00 2001 +From: Mike Galbraith <umgwanakikbuti@gmail.com> +Date: Sat, 27 Feb 2016 08:09:11 +0100 +Subject: [PATCH 250/323] drm,radeon,i915: Use preempt_disable/enable_rt() + where recommended +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +DRM folks identified the spots, so use them. + +Signed-off-by: Mike Galbraith <umgwanakikbuti@gmail.com> +Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Cc: linux-rt-users <linux-rt-users@vger.kernel.org> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +--- + drivers/gpu/drm/i915/i915_irq.c | 2 ++ + drivers/gpu/drm/radeon/radeon_display.c | 2 ++ + 2 files changed, 4 insertions(+) + +diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c +index 759f523c6a6b..7339a42ab2b8 100644 +--- a/drivers/gpu/drm/i915/i915_irq.c ++++ b/drivers/gpu/drm/i915/i915_irq.c +@@ -847,6 +847,7 @@ static bool i915_get_crtc_scanoutpos(struct drm_crtc *_crtc, + spin_lock_irqsave(&dev_priv->uncore.lock, irqflags); + + /* preempt_disable_rt() should go right here in PREEMPT_RT patchset. */ ++ preempt_disable_rt(); + + /* Get optional system timestamp before query. */ + if (stime) +@@ -898,6 +899,7 @@ static bool i915_get_crtc_scanoutpos(struct drm_crtc *_crtc, + *etime = ktime_get(); + + /* preempt_enable_rt() should go right here in PREEMPT_RT patchset. */ ++ preempt_enable_rt(); + + spin_unlock_irqrestore(&dev_priv->uncore.lock, irqflags); + +diff --git a/drivers/gpu/drm/radeon/radeon_display.c b/drivers/gpu/drm/radeon/radeon_display.c +index 71bdafac9210..95ce311f434b 100644 +--- a/drivers/gpu/drm/radeon/radeon_display.c ++++ b/drivers/gpu/drm/radeon/radeon_display.c +@@ -1823,6 +1823,7 @@ int radeon_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe, + struct radeon_device *rdev = dev->dev_private; + + /* preempt_disable_rt() should go right here in PREEMPT_RT patchset. */ ++ preempt_disable_rt(); + + /* Get optional system timestamp before query. */ + if (stime) +@@ -1915,6 +1916,7 @@ int radeon_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe, + *etime = ktime_get(); + + /* preempt_enable_rt() should go right here in PREEMPT_RT patchset. */ ++ preempt_enable_rt(); + + /* Decode into vertical and horizontal scanout position. */ + *vpos = position & 0x1fff; +-- +2.43.0 + diff --git a/debian/patches-rt/0251-drm-i915-Don-t-disable-interrupts-on-PREEMPT_RT-duri.patch b/debian/patches-rt/0251-drm-i915-Don-t-disable-interrupts-on-PREEMPT_RT-duri.patch new file mode 100644 index 000000000..dd9329591 --- /dev/null +++ b/debian/patches-rt/0251-drm-i915-Don-t-disable-interrupts-on-PREEMPT_RT-duri.patch @@ -0,0 +1,80 @@ +From 940e64f8d4b797c592e6736a1fae0d0ff2087e95 Mon Sep 17 00:00:00 2001 +From: Mike Galbraith <umgwanakikbuti@gmail.com> +Date: Sat, 27 Feb 2016 09:01:42 +0100 +Subject: [PATCH 251/323] drm/i915: Don't disable interrupts on PREEMPT_RT + during atomic updates +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Commit + 8d7849db3eab7 ("drm/i915: Make sprite updates atomic") + +started disabling interrupts across atomic updates. This breaks on PREEMPT_RT +because within this section the code attempt to acquire spinlock_t locks which +are sleeping locks on PREEMPT_RT. + +According to the comment the interrupts are disabled to avoid random delays and +not required for protection or synchronisation. + +Don't disable interrupts on PREEMPT_RT during atomic updates. + +[bigeasy: drop local locks, commit message] + +Signed-off-by: Mike Galbraith <umgwanakikbuti@gmail.com> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + drivers/gpu/drm/i915/display/intel_sprite.c | 15 ++++++++++----- + 1 file changed, 10 insertions(+), 5 deletions(-) + +diff --git a/drivers/gpu/drm/i915/display/intel_sprite.c b/drivers/gpu/drm/i915/display/intel_sprite.c +index 12f7128b777f..a65061e3e1d3 100644 +--- a/drivers/gpu/drm/i915/display/intel_sprite.c ++++ b/drivers/gpu/drm/i915/display/intel_sprite.c +@@ -118,7 +118,8 @@ void intel_pipe_update_start(const struct intel_crtc_state *new_crtc_state) + "PSR idle timed out 0x%x, atomic update may fail\n", + psr_status); + +- local_irq_disable(); ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ++ local_irq_disable(); + + crtc->debug.min_vbl = min; + crtc->debug.max_vbl = max; +@@ -143,11 +144,13 @@ void intel_pipe_update_start(const struct intel_crtc_state *new_crtc_state) + break; + } + +- local_irq_enable(); ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ++ local_irq_enable(); + + timeout = schedule_timeout(timeout); + +- local_irq_disable(); ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ++ local_irq_disable(); + } + + finish_wait(wq, &wait); +@@ -180,7 +183,8 @@ void intel_pipe_update_start(const struct intel_crtc_state *new_crtc_state) + return; + + irq_disable: +- local_irq_disable(); ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ++ local_irq_disable(); + } + + /** +@@ -218,7 +222,8 @@ void intel_pipe_update_end(struct intel_crtc_state *new_crtc_state) + new_crtc_state->uapi.event = NULL; + } + +- local_irq_enable(); ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ++ local_irq_enable(); + + if (intel_vgpu_active(dev_priv)) + return; +-- +2.43.0 + diff --git a/debian/patches-rt/0252-drm-i915-disable-tracing-on-RT.patch b/debian/patches-rt/0252-drm-i915-disable-tracing-on-RT.patch new file mode 100644 index 000000000..833c9d37f --- /dev/null +++ b/debian/patches-rt/0252-drm-i915-disable-tracing-on-RT.patch @@ -0,0 +1,47 @@ +From 0e88cde8cbd40cc7e519bccaf1ba847de2d9bd02 Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Thu, 6 Dec 2018 09:52:20 +0100 +Subject: [PATCH 252/323] drm/i915: disable tracing on -RT +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Luca Abeni reported this: +| BUG: scheduling while atomic: kworker/u8:2/15203/0x00000003 +| CPU: 1 PID: 15203 Comm: kworker/u8:2 Not tainted 4.19.1-rt3 #10 +| Call Trace: +| rt_spin_lock+0x3f/0x50 +| gen6_read32+0x45/0x1d0 [i915] +| g4x_get_vblank_counter+0x36/0x40 [i915] +| trace_event_raw_event_i915_pipe_update_start+0x7d/0xf0 [i915] + +The tracing events use trace_i915_pipe_update_start() among other events +use functions acquire spin locks. A few trace points use +intel_get_crtc_scanline(), others use ->get_vblank_counter() wich also +might acquire a sleeping lock. + +Based on this I don't see any other way than disable trace points on RT. + +Cc: stable-rt@vger.kernel.org +Reported-by: Luca Abeni <lucabe72@gmail.com> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + drivers/gpu/drm/i915/i915_trace.h | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/drivers/gpu/drm/i915/i915_trace.h b/drivers/gpu/drm/i915/i915_trace.h +index a4addcc64978..0ba5a0a0fd25 100644 +--- a/drivers/gpu/drm/i915/i915_trace.h ++++ b/drivers/gpu/drm/i915/i915_trace.h +@@ -2,6 +2,10 @@ + #if !defined(_I915_TRACE_H_) || defined(TRACE_HEADER_MULTI_READ) + #define _I915_TRACE_H_ + ++#ifdef CONFIG_PREEMPT_RT ++#define NOTRACE ++#endif ++ + #include <linux/stringify.h> + #include <linux/types.h> + #include <linux/tracepoint.h> +-- +2.43.0 + diff --git a/debian/patches-rt/0253-drm-i915-skip-DRM_I915_LOW_LEVEL_TRACEPOINTS-with-NO.patch b/debian/patches-rt/0253-drm-i915-skip-DRM_I915_LOW_LEVEL_TRACEPOINTS-with-NO.patch new file mode 100644 index 000000000..9aa6af966 --- /dev/null +++ b/debian/patches-rt/0253-drm-i915-skip-DRM_I915_LOW_LEVEL_TRACEPOINTS-with-NO.patch @@ -0,0 +1,33 @@ +From 2c550981e43f1612f6999bbc04634e0d6eb16760 Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Wed, 19 Dec 2018 10:47:02 +0100 +Subject: [PATCH 253/323] drm/i915: skip DRM_I915_LOW_LEVEL_TRACEPOINTS with + NOTRACE +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +The order of the header files is important. If this header file is +included after tracepoint.h was included then the NOTRACE here becomes a +nop. Currently this happens for two .c files which use the tracepoitns +behind DRM_I915_LOW_LEVEL_TRACEPOINTS. + +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + drivers/gpu/drm/i915/i915_trace.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/drivers/gpu/drm/i915/i915_trace.h b/drivers/gpu/drm/i915/i915_trace.h +index 0ba5a0a0fd25..396b6598694d 100644 +--- a/drivers/gpu/drm/i915/i915_trace.h ++++ b/drivers/gpu/drm/i915/i915_trace.h +@@ -782,7 +782,7 @@ DEFINE_EVENT(i915_request, i915_request_add, + TP_ARGS(rq) + ); + +-#if defined(CONFIG_DRM_I915_LOW_LEVEL_TRACEPOINTS) ++#if defined(CONFIG_DRM_I915_LOW_LEVEL_TRACEPOINTS) && !defined(NOTRACE) + DEFINE_EVENT(i915_request, i915_request_submit, + TP_PROTO(struct i915_request *rq), + TP_ARGS(rq) +-- +2.43.0 + diff --git a/debian/patches-rt/0254-drm-i915-gt-Only-disable-interrupts-for-the-timeline.patch b/debian/patches-rt/0254-drm-i915-gt-Only-disable-interrupts-for-the-timeline.patch new file mode 100644 index 000000000..722c5567e --- /dev/null +++ b/debian/patches-rt/0254-drm-i915-gt-Only-disable-interrupts-for-the-timeline.patch @@ -0,0 +1,52 @@ +From 63884dc87002ed5187007558a7356e7a4f570e75 Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Tue, 7 Jul 2020 12:25:11 +0200 +Subject: [PATCH 254/323] drm/i915/gt: Only disable interrupts for the timeline + lock on !force-threaded +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +According to commit + d67739268cf0e ("drm/i915/gt: Mark up the nested engine-pm timeline lock as irqsafe") + +the intrrupts are disabled the code may be called from an interrupt +handler and from preemptible context. +With `force_irqthreads' set the timeline mutex is never observed in IRQ +context so it is not neede to disable interrupts. + +Disable only interrupts if not in `force_irqthreads' mode. + +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + drivers/gpu/drm/i915/gt/intel_engine_pm.c | 8 +++++--- + 1 file changed, 5 insertions(+), 3 deletions(-) + +diff --git a/drivers/gpu/drm/i915/gt/intel_engine_pm.c b/drivers/gpu/drm/i915/gt/intel_engine_pm.c +index f7b2e07e2229..313d8a28e776 100644 +--- a/drivers/gpu/drm/i915/gt/intel_engine_pm.c ++++ b/drivers/gpu/drm/i915/gt/intel_engine_pm.c +@@ -60,9 +60,10 @@ static int __engine_unpark(struct intel_wakeref *wf) + + static inline unsigned long __timeline_mark_lock(struct intel_context *ce) + { +- unsigned long flags; ++ unsigned long flags = 0; + +- local_irq_save(flags); ++ if (!force_irqthreads) ++ local_irq_save(flags); + mutex_acquire(&ce->timeline->mutex.dep_map, 2, 0, _THIS_IP_); + + return flags; +@@ -72,7 +73,8 @@ static inline void __timeline_mark_unlock(struct intel_context *ce, + unsigned long flags) + { + mutex_release(&ce->timeline->mutex.dep_map, _THIS_IP_); +- local_irq_restore(flags); ++ if (!force_irqthreads) ++ local_irq_restore(flags); + } + + #else +-- +2.43.0 + diff --git a/debian/patches-rt/0255-cpuset-Convert-callback_lock-to-raw_spinlock_t.patch b/debian/patches-rt/0255-cpuset-Convert-callback_lock-to-raw_spinlock_t.patch new file mode 100644 index 000000000..bb6e6895a --- /dev/null +++ b/debian/patches-rt/0255-cpuset-Convert-callback_lock-to-raw_spinlock_t.patch @@ -0,0 +1,367 @@ +From ebc27a7d1ceef838efef4feecd2bcb490552f0ec Mon Sep 17 00:00:00 2001 +From: Mike Galbraith <efault@gmx.de> +Date: Sun, 8 Jan 2017 09:32:25 +0100 +Subject: [PATCH 255/323] cpuset: Convert callback_lock to raw_spinlock_t +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +The two commits below add up to a cpuset might_sleep() splat for RT: + +8447a0fee974 cpuset: convert callback_mutex to a spinlock +344736f29b35 cpuset: simplify cpuset_node_allowed API + +BUG: sleeping function called from invalid context at kernel/locking/rtmutex.c:995 +in_atomic(): 0, irqs_disabled(): 1, pid: 11718, name: cset +CPU: 135 PID: 11718 Comm: cset Tainted: G E 4.10.0-rt1-rt #4 +Hardware name: Intel Corporation BRICKLAND/BRICKLAND, BIOS BRHSXSD1.86B.0056.R01.1409242327 09/24/2014 +Call Trace: + ? dump_stack+0x5c/0x81 + ? ___might_sleep+0xf4/0x170 + ? rt_spin_lock+0x1c/0x50 + ? __cpuset_node_allowed+0x66/0xc0 + ? ___slab_alloc+0x390/0x570 <disables IRQs> + ? anon_vma_fork+0x8f/0x140 + ? copy_page_range+0x6cf/0xb00 + ? anon_vma_fork+0x8f/0x140 + ? __slab_alloc.isra.74+0x5a/0x81 + ? anon_vma_fork+0x8f/0x140 + ? kmem_cache_alloc+0x1b5/0x1f0 + ? anon_vma_fork+0x8f/0x140 + ? copy_process.part.35+0x1670/0x1ee0 + ? _do_fork+0xdd/0x3f0 + ? _do_fork+0xdd/0x3f0 + ? do_syscall_64+0x61/0x170 + ? entry_SYSCALL64_slow_path+0x25/0x25 + +The later ensured that a NUMA box WILL take callback_lock in atomic +context by removing the allocator and reclaim path __GFP_HARDWALL +usage which prevented such contexts from taking callback_mutex. + +One option would be to reinstate __GFP_HARDWALL protections for +RT, however, as the 8447a0fee974 changelog states: + +The callback_mutex is only used to synchronize reads/updates of cpusets' +flags and cpu/node masks. These operations should always proceed fast so +there's no reason why we can't use a spinlock instead of the mutex. + +Cc: stable-rt@vger.kernel.org +Signed-off-by: Mike Galbraith <efault@gmx.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Signed-off-by: Luis Claudio R. Goncalves <lgoncalv@redhat.com> +--- + kernel/cgroup/cpuset.c | 82 +++++++++++++++++++++--------------------- + 1 file changed, 41 insertions(+), 41 deletions(-) + +diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c +index 195f9cccab20..fd371215b81f 100644 +--- a/kernel/cgroup/cpuset.c ++++ b/kernel/cgroup/cpuset.c +@@ -368,7 +368,7 @@ void cpuset_unlock(void) + mutex_unlock(&cpuset_mutex); + } + +-static DEFINE_SPINLOCK(callback_lock); ++static DEFINE_RAW_SPINLOCK(callback_lock); + + static struct workqueue_struct *cpuset_migrate_mm_wq; + +@@ -1316,7 +1316,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, + * Newly added CPUs will be removed from effective_cpus and + * newly deleted ones will be added back to effective_cpus. + */ +- spin_lock_irq(&callback_lock); ++ raw_spin_lock_irq(&callback_lock); + if (adding) { + cpumask_or(parent->subparts_cpus, + parent->subparts_cpus, tmp->addmask); +@@ -1338,7 +1338,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, + + if (cpuset->partition_root_state != new_prs) + cpuset->partition_root_state = new_prs; +- spin_unlock_irq(&callback_lock); ++ raw_spin_unlock_irq(&callback_lock); + + return cmd == partcmd_update; + } +@@ -1441,7 +1441,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) + continue; + rcu_read_unlock(); + +- spin_lock_irq(&callback_lock); ++ raw_spin_lock_irq(&callback_lock); + + cpumask_copy(cp->effective_cpus, tmp->new_cpus); + if (cp->nr_subparts_cpus && (new_prs != PRS_ENABLED)) { +@@ -1475,7 +1475,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) + if (new_prs != cp->partition_root_state) + cp->partition_root_state = new_prs; + +- spin_unlock_irq(&callback_lock); ++ raw_spin_unlock_irq(&callback_lock); + + WARN_ON(!is_in_v2_mode() && + !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); +@@ -1603,7 +1603,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, + return -EINVAL; + } + +- spin_lock_irq(&callback_lock); ++ raw_spin_lock_irq(&callback_lock); + cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); + + /* +@@ -1613,7 +1613,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, + cpumask_and(cs->subparts_cpus, cs->subparts_cpus, cs->cpus_allowed); + cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus); + } +- spin_unlock_irq(&callback_lock); ++ raw_spin_unlock_irq(&callback_lock); + + update_cpumasks_hier(cs, &tmp); + +@@ -1807,9 +1807,9 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) + continue; + rcu_read_unlock(); + +- spin_lock_irq(&callback_lock); ++ raw_spin_lock_irq(&callback_lock); + cp->effective_mems = *new_mems; +- spin_unlock_irq(&callback_lock); ++ raw_spin_unlock_irq(&callback_lock); + + WARN_ON(!is_in_v2_mode() && + !nodes_equal(cp->mems_allowed, cp->effective_mems)); +@@ -1877,9 +1877,9 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, + if (retval < 0) + goto done; + +- spin_lock_irq(&callback_lock); ++ raw_spin_lock_irq(&callback_lock); + cs->mems_allowed = trialcs->mems_allowed; +- spin_unlock_irq(&callback_lock); ++ raw_spin_unlock_irq(&callback_lock); + + /* use trialcs->mems_allowed as a temp variable */ + update_nodemasks_hier(cs, &trialcs->mems_allowed); +@@ -1970,9 +1970,9 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, + spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs)) + || (is_spread_page(cs) != is_spread_page(trialcs))); + +- spin_lock_irq(&callback_lock); ++ raw_spin_lock_irq(&callback_lock); + cs->flags = trialcs->flags; +- spin_unlock_irq(&callback_lock); ++ raw_spin_unlock_irq(&callback_lock); + + if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) + rebuild_sched_domains_locked(); +@@ -2058,9 +2058,9 @@ static int update_prstate(struct cpuset *cs, int new_prs) + rebuild_sched_domains_locked(); + out: + if (!err) { +- spin_lock_irq(&callback_lock); ++ raw_spin_lock_irq(&callback_lock); + cs->partition_root_state = new_prs; +- spin_unlock_irq(&callback_lock); ++ raw_spin_unlock_irq(&callback_lock); + } + + free_cpumasks(NULL, &tmpmask); +@@ -2527,7 +2527,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v) + cpuset_filetype_t type = seq_cft(sf)->private; + int ret = 0; + +- spin_lock_irq(&callback_lock); ++ raw_spin_lock_irq(&callback_lock); + + switch (type) { + case FILE_CPULIST: +@@ -2549,7 +2549,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v) + ret = -EINVAL; + } + +- spin_unlock_irq(&callback_lock); ++ raw_spin_unlock_irq(&callback_lock); + return ret; + } + +@@ -2862,14 +2862,14 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) + + cpuset_inc(); + +- spin_lock_irq(&callback_lock); ++ raw_spin_lock_irq(&callback_lock); + if (is_in_v2_mode()) { + cpumask_copy(cs->effective_cpus, parent->effective_cpus); + cs->effective_mems = parent->effective_mems; + cs->use_parent_ecpus = true; + parent->child_ecpus_count++; + } +- spin_unlock_irq(&callback_lock); ++ raw_spin_unlock_irq(&callback_lock); + + if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) + goto out_unlock; +@@ -2896,12 +2896,12 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) + } + rcu_read_unlock(); + +- spin_lock_irq(&callback_lock); ++ raw_spin_lock_irq(&callback_lock); + cs->mems_allowed = parent->mems_allowed; + cs->effective_mems = parent->mems_allowed; + cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); + cpumask_copy(cs->effective_cpus, parent->cpus_allowed); +- spin_unlock_irq(&callback_lock); ++ raw_spin_unlock_irq(&callback_lock); + out_unlock: + mutex_unlock(&cpuset_mutex); + put_online_cpus(); +@@ -2957,7 +2957,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css) + static void cpuset_bind(struct cgroup_subsys_state *root_css) + { + mutex_lock(&cpuset_mutex); +- spin_lock_irq(&callback_lock); ++ raw_spin_lock_irq(&callback_lock); + + if (is_in_v2_mode()) { + cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask); +@@ -2968,7 +2968,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css) + top_cpuset.mems_allowed = top_cpuset.effective_mems; + } + +- spin_unlock_irq(&callback_lock); ++ raw_spin_unlock_irq(&callback_lock); + mutex_unlock(&cpuset_mutex); + } + +@@ -3063,12 +3063,12 @@ hotplug_update_tasks_legacy(struct cpuset *cs, + { + bool is_empty; + +- spin_lock_irq(&callback_lock); ++ raw_spin_lock_irq(&callback_lock); + cpumask_copy(cs->cpus_allowed, new_cpus); + cpumask_copy(cs->effective_cpus, new_cpus); + cs->mems_allowed = *new_mems; + cs->effective_mems = *new_mems; +- spin_unlock_irq(&callback_lock); ++ raw_spin_unlock_irq(&callback_lock); + + /* + * Don't call update_tasks_cpumask() if the cpuset becomes empty, +@@ -3105,10 +3105,10 @@ hotplug_update_tasks(struct cpuset *cs, + if (nodes_empty(*new_mems)) + *new_mems = parent_cs(cs)->effective_mems; + +- spin_lock_irq(&callback_lock); ++ raw_spin_lock_irq(&callback_lock); + cpumask_copy(cs->effective_cpus, new_cpus); + cs->effective_mems = *new_mems; +- spin_unlock_irq(&callback_lock); ++ raw_spin_unlock_irq(&callback_lock); + + if (cpus_updated) + update_tasks_cpumask(cs); +@@ -3175,10 +3175,10 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp) + if (is_partition_root(cs) && (cpumask_empty(&new_cpus) || + (parent->partition_root_state == PRS_ERROR))) { + if (cs->nr_subparts_cpus) { +- spin_lock_irq(&callback_lock); ++ raw_spin_lock_irq(&callback_lock); + cs->nr_subparts_cpus = 0; + cpumask_clear(cs->subparts_cpus); +- spin_unlock_irq(&callback_lock); ++ raw_spin_unlock_irq(&callback_lock); + compute_effective_cpumask(&new_cpus, cs, parent); + } + +@@ -3192,9 +3192,9 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp) + cpumask_empty(&new_cpus)) { + update_parent_subparts_cpumask(cs, partcmd_disable, + NULL, tmp); +- spin_lock_irq(&callback_lock); ++ raw_spin_lock_irq(&callback_lock); + cs->partition_root_state = PRS_ERROR; +- spin_unlock_irq(&callback_lock); ++ raw_spin_unlock_irq(&callback_lock); + } + cpuset_force_rebuild(); + } +@@ -3274,7 +3274,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) + + /* synchronize cpus_allowed to cpu_active_mask */ + if (cpus_updated) { +- spin_lock_irq(&callback_lock); ++ raw_spin_lock_irq(&callback_lock); + if (!on_dfl) + cpumask_copy(top_cpuset.cpus_allowed, &new_cpus); + /* +@@ -3294,17 +3294,17 @@ static void cpuset_hotplug_workfn(struct work_struct *work) + } + } + cpumask_copy(top_cpuset.effective_cpus, &new_cpus); +- spin_unlock_irq(&callback_lock); ++ raw_spin_unlock_irq(&callback_lock); + /* we don't mess with cpumasks of tasks in top_cpuset */ + } + + /* synchronize mems_allowed to N_MEMORY */ + if (mems_updated) { +- spin_lock_irq(&callback_lock); ++ raw_spin_lock_irq(&callback_lock); + if (!on_dfl) + top_cpuset.mems_allowed = new_mems; + top_cpuset.effective_mems = new_mems; +- spin_unlock_irq(&callback_lock); ++ raw_spin_unlock_irq(&callback_lock); + update_tasks_nodemask(&top_cpuset); + } + +@@ -3408,11 +3408,11 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) + { + unsigned long flags; + +- spin_lock_irqsave(&callback_lock, flags); ++ raw_spin_lock_irqsave(&callback_lock, flags); + rcu_read_lock(); + guarantee_online_cpus(task_cs(tsk), pmask); + rcu_read_unlock(); +- spin_unlock_irqrestore(&callback_lock, flags); ++ raw_spin_unlock_irqrestore(&callback_lock, flags); + } + + /** +@@ -3473,11 +3473,11 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk) + nodemask_t mask; + unsigned long flags; + +- spin_lock_irqsave(&callback_lock, flags); ++ raw_spin_lock_irqsave(&callback_lock, flags); + rcu_read_lock(); + guarantee_online_mems(task_cs(tsk), &mask); + rcu_read_unlock(); +- spin_unlock_irqrestore(&callback_lock, flags); ++ raw_spin_unlock_irqrestore(&callback_lock, flags); + + return mask; + } +@@ -3569,14 +3569,14 @@ bool __cpuset_node_allowed(int node, gfp_t gfp_mask) + return true; + + /* Not hardwall and node outside mems_allowed: scan up cpusets */ +- spin_lock_irqsave(&callback_lock, flags); ++ raw_spin_lock_irqsave(&callback_lock, flags); + + rcu_read_lock(); + cs = nearest_hardwall_ancestor(task_cs(current)); + allowed = node_isset(node, cs->mems_allowed); + rcu_read_unlock(); + +- spin_unlock_irqrestore(&callback_lock, flags); ++ raw_spin_unlock_irqrestore(&callback_lock, flags); + return allowed; + } + +-- +2.43.0 + diff --git a/debian/patches-rt/0256-x86-Allow-to-enable-RT.patch b/debian/patches-rt/0256-x86-Allow-to-enable-RT.patch new file mode 100644 index 000000000..50535647c --- /dev/null +++ b/debian/patches-rt/0256-x86-Allow-to-enable-RT.patch @@ -0,0 +1,28 @@ +From 6ab508c3cdd4fa46f17f940bd56007189d4984e6 Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Wed, 7 Aug 2019 18:15:38 +0200 +Subject: [PATCH 256/323] x86: Allow to enable RT +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Allow to select RT. + +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + arch/x86/Kconfig | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index 54e5284a6ae1..c72d66a0e840 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -27,6 +27,7 @@ config X86_64 + # Options that are inherently 64-bit kernel only: + select ARCH_HAS_GIGANTIC_PAGE + select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 ++ select ARCH_SUPPORTS_RT + select ARCH_USE_CMPXCHG_LOCKREF + select HAVE_ARCH_SOFT_DIRTY + select MODULES_USE_ELF_RELA +-- +2.43.0 + diff --git a/debian/patches-rt/0257-mm-scatterlist-Do-not-disable-irqs-on-RT.patch b/debian/patches-rt/0257-mm-scatterlist-Do-not-disable-irqs-on-RT.patch new file mode 100644 index 000000000..c97153016 --- /dev/null +++ b/debian/patches-rt/0257-mm-scatterlist-Do-not-disable-irqs-on-RT.patch @@ -0,0 +1,30 @@ +From 767e9014ec3dbfeee9323cfcc34acac5315c262e Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Fri, 3 Jul 2009 08:44:34 -0500 +Subject: [PATCH 257/323] mm/scatterlist: Do not disable irqs on RT +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +For -RT it is enough to keep pagefault disabled (which is currently handled by +kmap_atomic()). + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +--- + lib/scatterlist.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/lib/scatterlist.c b/lib/scatterlist.c +index a59778946404..907f59045998 100644 +--- a/lib/scatterlist.c ++++ b/lib/scatterlist.c +@@ -892,7 +892,7 @@ void sg_miter_stop(struct sg_mapping_iter *miter) + flush_kernel_dcache_page(miter->page); + + if (miter->__flags & SG_MITER_ATOMIC) { +- WARN_ON_ONCE(preemptible()); ++ WARN_ON_ONCE(!pagefault_disabled()); + kunmap_atomic(miter->addr); + } else + kunmap(miter->page); +-- +2.43.0 + diff --git a/debian/patches-rt/0258-sched-Add-support-for-lazy-preemption.patch b/debian/patches-rt/0258-sched-Add-support-for-lazy-preemption.patch new file mode 100644 index 000000000..12d064860 --- /dev/null +++ b/debian/patches-rt/0258-sched-Add-support-for-lazy-preemption.patch @@ -0,0 +1,691 @@ +From e8b4a64bf2eb6ff6330544895c0f69696059ffda Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Fri, 26 Oct 2012 18:50:54 +0100 +Subject: [PATCH 258/323] sched: Add support for lazy preemption +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +It has become an obsession to mitigate the determinism vs. throughput +loss of RT. Looking at the mainline semantics of preemption points +gives a hint why RT sucks throughput wise for ordinary SCHED_OTHER +tasks. One major issue is the wakeup of tasks which are right away +preempting the waking task while the waking task holds a lock on which +the woken task will block right after having preempted the wakee. In +mainline this is prevented due to the implicit preemption disable of +spin/rw_lock held regions. On RT this is not possible due to the fully +preemptible nature of sleeping spinlocks. + +Though for a SCHED_OTHER task preempting another SCHED_OTHER task this +is really not a correctness issue. RT folks are concerned about +SCHED_FIFO/RR tasks preemption and not about the purely fairness +driven SCHED_OTHER preemption latencies. + +So I introduced a lazy preemption mechanism which only applies to +SCHED_OTHER tasks preempting another SCHED_OTHER task. Aside of the +existing preempt_count each tasks sports now a preempt_lazy_count +which is manipulated on lock acquiry and release. This is slightly +incorrect as for lazyness reasons I coupled this on +migrate_disable/enable so some other mechanisms get the same treatment +(e.g. get_cpu_light). + +Now on the scheduler side instead of setting NEED_RESCHED this sets +NEED_RESCHED_LAZY in case of a SCHED_OTHER/SCHED_OTHER preemption and +therefor allows to exit the waking task the lock held region before +the woken task preempts. That also works better for cross CPU wakeups +as the other side can stay in the adaptive spinning loop. + +For RT class preemption there is no change. This simply sets +NEED_RESCHED and forgoes the lazy preemption counter. + + Initial test do not expose any observable latency increasement, but +history shows that I've been proven wrong before :) + +The lazy preemption mode is per default on, but with +CONFIG_SCHED_DEBUG enabled it can be disabled via: + + # echo NO_PREEMPT_LAZY >/sys/kernel/debug/sched_features + +and reenabled via + + # echo PREEMPT_LAZY >/sys/kernel/debug/sched_features + +The test results so far are very machine and workload dependent, but +there is a clear trend that it enhances the non RT workload +performance. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +--- + include/linux/preempt.h | 54 ++++++++++++++++++++++-- + include/linux/sched.h | 38 +++++++++++++++++ + include/linux/thread_info.h | 12 +++++- + include/linux/trace_events.h | 5 ++- + kernel/Kconfig.preempt | 6 +++ + kernel/sched/core.c | 82 +++++++++++++++++++++++++++++++++++- + kernel/sched/fair.c | 16 +++---- + kernel/sched/features.h | 3 ++ + kernel/sched/sched.h | 9 ++++ + kernel/trace/trace.c | 50 +++++++++++++--------- + kernel/trace/trace_events.c | 1 + + kernel/trace/trace_output.c | 14 +++++- + 12 files changed, 254 insertions(+), 36 deletions(-) + +diff --git a/include/linux/preempt.h b/include/linux/preempt.h +index fb140e00f74d..af39859f02ee 100644 +--- a/include/linux/preempt.h ++++ b/include/linux/preempt.h +@@ -174,6 +174,20 @@ extern void preempt_count_sub(int val); + #define preempt_count_inc() preempt_count_add(1) + #define preempt_count_dec() preempt_count_sub(1) + ++#ifdef CONFIG_PREEMPT_LAZY ++#define add_preempt_lazy_count(val) do { preempt_lazy_count() += (val); } while (0) ++#define sub_preempt_lazy_count(val) do { preempt_lazy_count() -= (val); } while (0) ++#define inc_preempt_lazy_count() add_preempt_lazy_count(1) ++#define dec_preempt_lazy_count() sub_preempt_lazy_count(1) ++#define preempt_lazy_count() (current_thread_info()->preempt_lazy_count) ++#else ++#define add_preempt_lazy_count(val) do { } while (0) ++#define sub_preempt_lazy_count(val) do { } while (0) ++#define inc_preempt_lazy_count() do { } while (0) ++#define dec_preempt_lazy_count() do { } while (0) ++#define preempt_lazy_count() (0) ++#endif ++ + #ifdef CONFIG_PREEMPT_COUNT + + #define preempt_disable() \ +@@ -182,6 +196,12 @@ do { \ + barrier(); \ + } while (0) + ++#define preempt_lazy_disable() \ ++do { \ ++ inc_preempt_lazy_count(); \ ++ barrier(); \ ++} while (0) ++ + #define sched_preempt_enable_no_resched() \ + do { \ + barrier(); \ +@@ -219,6 +239,18 @@ do { \ + __preempt_schedule(); \ + } while (0) + ++/* ++ * open code preempt_check_resched() because it is not exported to modules and ++ * used by local_unlock() or bpf_enable_instrumentation(). ++ */ ++#define preempt_lazy_enable() \ ++do { \ ++ dec_preempt_lazy_count(); \ ++ barrier(); \ ++ if (should_resched(0)) \ ++ __preempt_schedule(); \ ++} while (0) ++ + #else /* !CONFIG_PREEMPTION */ + #define preempt_enable() \ + do { \ +@@ -226,6 +258,12 @@ do { \ + preempt_count_dec(); \ + } while (0) + ++#define preempt_lazy_enable() \ ++do { \ ++ dec_preempt_lazy_count(); \ ++ barrier(); \ ++} while (0) ++ + #define preempt_enable_notrace() \ + do { \ + barrier(); \ +@@ -267,6 +305,9 @@ do { \ + #define preempt_check_resched_rt() barrier() + #define preemptible() 0 + ++#define preempt_lazy_disable() barrier() ++#define preempt_lazy_enable() barrier() ++ + #endif /* CONFIG_PREEMPT_COUNT */ + + #ifdef MODULE +@@ -285,7 +326,7 @@ do { \ + } while (0) + #define preempt_fold_need_resched() \ + do { \ +- if (tif_need_resched()) \ ++ if (tif_need_resched_now()) \ + set_preempt_need_resched(); \ + } while (0) + +@@ -413,8 +454,15 @@ extern void migrate_enable(void); + + #else + +-static inline void migrate_disable(void) { } +-static inline void migrate_enable(void) { } ++static inline void migrate_disable(void) ++{ ++ preempt_lazy_disable(); ++} ++ ++static inline void migrate_enable(void) ++{ ++ preempt_lazy_enable(); ++} + + #endif /* CONFIG_SMP */ + +diff --git a/include/linux/sched.h b/include/linux/sched.h +index bd0c9c633438..665a17e4f69b 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -1882,6 +1882,44 @@ static inline int test_tsk_need_resched(struct task_struct *tsk) + return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED)); + } + ++#ifdef CONFIG_PREEMPT_LAZY ++static inline void set_tsk_need_resched_lazy(struct task_struct *tsk) ++{ ++ set_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY); ++} ++ ++static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk) ++{ ++ clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY); ++} ++ ++static inline int test_tsk_need_resched_lazy(struct task_struct *tsk) ++{ ++ return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY)); ++} ++ ++static inline int need_resched_lazy(void) ++{ ++ return test_thread_flag(TIF_NEED_RESCHED_LAZY); ++} ++ ++static inline int need_resched_now(void) ++{ ++ return test_thread_flag(TIF_NEED_RESCHED); ++} ++ ++#else ++static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk) { } ++static inline int need_resched_lazy(void) { return 0; } ++ ++static inline int need_resched_now(void) ++{ ++ return test_thread_flag(TIF_NEED_RESCHED); ++} ++ ++#endif ++ ++ + static inline bool __task_is_stopped_or_traced(struct task_struct *task) + { + if (task->state & (__TASK_STOPPED | __TASK_TRACED)) +diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h +index f3040b0b4b23..3cb02ced141b 100644 +--- a/include/linux/thread_info.h ++++ b/include/linux/thread_info.h +@@ -110,7 +110,17 @@ static inline int test_ti_thread_flag(struct thread_info *ti, int flag) + #define test_thread_flag(flag) \ + test_ti_thread_flag(current_thread_info(), flag) + +-#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED) ++#ifdef CONFIG_PREEMPT_LAZY ++#define tif_need_resched() (test_thread_flag(TIF_NEED_RESCHED) || \ ++ test_thread_flag(TIF_NEED_RESCHED_LAZY)) ++#define tif_need_resched_now() (test_thread_flag(TIF_NEED_RESCHED)) ++#define tif_need_resched_lazy() test_thread_flag(TIF_NEED_RESCHED_LAZY)) ++ ++#else ++#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED) ++#define tif_need_resched_now() test_thread_flag(TIF_NEED_RESCHED) ++#define tif_need_resched_lazy() 0 ++#endif + + #ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES + static inline int arch_within_stack_frames(const void * const stack, +diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h +index 2151524a10f0..e7afd9fe35e5 100644 +--- a/include/linux/trace_events.h ++++ b/include/linux/trace_events.h +@@ -70,6 +70,7 @@ struct trace_entry { + unsigned char preempt_count; + int pid; + unsigned char migrate_disable; ++ unsigned char preempt_lazy_count; + }; + + #define TRACE_EVENT_TYPE_MAX \ +@@ -159,9 +160,10 @@ static inline void tracing_generic_entry_update(struct trace_entry *entry, + { + entry->preempt_count = trace_ctx & 0xff; + entry->migrate_disable = (trace_ctx >> 8) & 0xff; ++ entry->preempt_lazy_count = (trace_ctx >> 16) & 0xff; + entry->pid = current->pid; + entry->type = type; +- entry->flags = trace_ctx >> 16; ++ entry->flags = trace_ctx >> 24; + } + + unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status); +@@ -174,6 +176,7 @@ enum trace_flag_type { + TRACE_FLAG_SOFTIRQ = 0x10, + TRACE_FLAG_PREEMPT_RESCHED = 0x20, + TRACE_FLAG_NMI = 0x40, ++ TRACE_FLAG_NEED_RESCHED_LAZY = 0x80, + }; + + #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT +diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt +index cbe3aa495519..b5cd1e278eb5 100644 +--- a/kernel/Kconfig.preempt ++++ b/kernel/Kconfig.preempt +@@ -1,5 +1,11 @@ + # SPDX-License-Identifier: GPL-2.0-only + ++config HAVE_PREEMPT_LAZY ++ bool ++ ++config PREEMPT_LAZY ++ def_bool y if HAVE_PREEMPT_LAZY && PREEMPT_RT ++ + choice + prompt "Preemption Model" + default PREEMPT_NONE +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 8ac1f0526476..c847d17e3b04 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -656,6 +656,48 @@ void resched_curr(struct rq *rq) + trace_sched_wake_idle_without_ipi(cpu); + } + ++#ifdef CONFIG_PREEMPT_LAZY ++ ++static int tsk_is_polling(struct task_struct *p) ++{ ++#ifdef TIF_POLLING_NRFLAG ++ return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG); ++#else ++ return 0; ++#endif ++} ++ ++void resched_curr_lazy(struct rq *rq) ++{ ++ struct task_struct *curr = rq->curr; ++ int cpu; ++ ++ if (!sched_feat(PREEMPT_LAZY)) { ++ resched_curr(rq); ++ return; ++ } ++ ++ lockdep_assert_held(&rq->lock); ++ ++ if (test_tsk_need_resched(curr)) ++ return; ++ ++ if (test_tsk_need_resched_lazy(curr)) ++ return; ++ ++ set_tsk_need_resched_lazy(curr); ++ ++ cpu = cpu_of(rq); ++ if (cpu == smp_processor_id()) ++ return; ++ ++ /* NEED_RESCHED_LAZY must be visible before we test polling */ ++ smp_mb(); ++ if (!tsk_is_polling(curr)) ++ smp_send_reschedule(cpu); ++} ++#endif ++ + void resched_cpu(int cpu) + { + struct rq *rq = cpu_rq(cpu); +@@ -1772,6 +1814,7 @@ void migrate_disable(void) + preempt_disable(); + this_rq()->nr_pinned++; + p->migration_disabled = 1; ++ preempt_lazy_disable(); + preempt_enable(); + } + EXPORT_SYMBOL_GPL(migrate_disable); +@@ -1800,6 +1843,7 @@ void migrate_enable(void) + barrier(); + p->migration_disabled = 0; + this_rq()->nr_pinned--; ++ preempt_lazy_enable(); + preempt_enable(); + + trace_sched_migrate_enable_tp(p); +@@ -3822,6 +3866,9 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) + p->on_cpu = 0; + #endif + init_task_preempt_count(p); ++#ifdef CONFIG_HAVE_PREEMPT_LAZY ++ task_thread_info(p)->preempt_lazy_count = 0; ++#endif + #ifdef CONFIG_SMP + plist_node_init(&p->pushable_tasks, MAX_PRIO); + RB_CLEAR_NODE(&p->pushable_dl_tasks); +@@ -5101,6 +5148,7 @@ static void __sched notrace __schedule(bool preempt, bool spinning_lock) + + next = pick_next_task(rq, prev, &rf); + clear_tsk_need_resched(prev); ++ clear_tsk_need_resched_lazy(prev); + clear_preempt_need_resched(); + + if (likely(prev != next)) { +@@ -5300,6 +5348,30 @@ static void __sched notrace preempt_schedule_common(void) + } while (need_resched()); + } + ++#ifdef CONFIG_PREEMPT_LAZY ++/* ++ * If TIF_NEED_RESCHED is then we allow to be scheduled away since this is ++ * set by a RT task. Oterwise we try to avoid beeing scheduled out as long as ++ * preempt_lazy_count counter >0. ++ */ ++static __always_inline int preemptible_lazy(void) ++{ ++ if (test_thread_flag(TIF_NEED_RESCHED)) ++ return 1; ++ if (current_thread_info()->preempt_lazy_count) ++ return 0; ++ return 1; ++} ++ ++#else ++ ++static inline int preemptible_lazy(void) ++{ ++ return 1; ++} ++ ++#endif ++ + #ifdef CONFIG_PREEMPTION + /* + * This is the entry point to schedule() from in-kernel preemption +@@ -5313,7 +5385,8 @@ asmlinkage __visible void __sched notrace preempt_schedule(void) + */ + if (likely(!preemptible())) + return; +- ++ if (!preemptible_lazy()) ++ return; + preempt_schedule_common(); + } + NOKPROBE_SYMBOL(preempt_schedule); +@@ -5353,6 +5426,9 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) + if (likely(!preemptible())) + return; + ++ if (!preemptible_lazy()) ++ return; ++ + do { + /* + * Because the function tracer can trace preempt_count_sub() +@@ -7175,7 +7251,9 @@ void __init init_idle(struct task_struct *idle, int cpu) + + /* Set the preempt count _outside_ the spinlocks! */ + init_idle_preempt_count(idle, cpu); +- ++#ifdef CONFIG_HAVE_PREEMPT_LAZY ++ task_thread_info(idle)->preempt_lazy_count = 0; ++#endif + /* + * The idle tasks have their own, simple scheduling class: + */ +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 73a89fbd81be..f4928e5b6611 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -4570,7 +4570,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) + ideal_runtime = sched_slice(cfs_rq, curr); + delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; + if (delta_exec > ideal_runtime) { +- resched_curr(rq_of(cfs_rq)); ++ resched_curr_lazy(rq_of(cfs_rq)); + /* + * The current task ran long enough, ensure it doesn't get + * re-elected due to buddy favours. +@@ -4594,7 +4594,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) + return; + + if (delta > ideal_runtime) +- resched_curr(rq_of(cfs_rq)); ++ resched_curr_lazy(rq_of(cfs_rq)); + } + + static void +@@ -4737,7 +4737,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) + * validating it and just reschedule. + */ + if (queued) { +- resched_curr(rq_of(cfs_rq)); ++ resched_curr_lazy(rq_of(cfs_rq)); + return; + } + /* +@@ -4874,7 +4874,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) + * hierarchy can be throttled + */ + if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) +- resched_curr(rq_of(cfs_rq)); ++ resched_curr_lazy(rq_of(cfs_rq)); + } + + static __always_inline +@@ -5609,7 +5609,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) + + if (delta < 0) { + if (rq->curr == p) +- resched_curr(rq); ++ resched_curr_lazy(rq); + return; + } + hrtick_start(rq, delta); +@@ -7218,7 +7218,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ + return; + + preempt: +- resched_curr(rq); ++ resched_curr_lazy(rq); + /* + * Only set the backward buddy when the current task is still + * on the rq. This can happen when a wakeup gets interleaved +@@ -11059,7 +11059,7 @@ static void task_fork_fair(struct task_struct *p) + * 'current' within the tree based on its new key value. + */ + swap(curr->vruntime, se->vruntime); +- resched_curr(rq); ++ resched_curr_lazy(rq); + } + + se->vruntime -= cfs_rq->min_vruntime; +@@ -11086,7 +11086,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) + */ + if (rq->curr == p) { + if (p->prio > oldprio) +- resched_curr(rq); ++ resched_curr_lazy(rq); + } else + check_preempt_curr(rq, p, 0); + } +diff --git a/kernel/sched/features.h b/kernel/sched/features.h +index 402fd37fb340..bc2466af142e 100644 +--- a/kernel/sched/features.h ++++ b/kernel/sched/features.h +@@ -47,6 +47,9 @@ SCHED_FEAT(NONTASK_CAPACITY, true) + + #ifdef CONFIG_PREEMPT_RT + SCHED_FEAT(TTWU_QUEUE, false) ++# ifdef CONFIG_PREEMPT_LAZY ++SCHED_FEAT(PREEMPT_LAZY, true) ++# endif + #else + + /* +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index d4bfc51358d3..ad854a670701 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -1997,6 +1997,15 @@ extern void reweight_task(struct task_struct *p, int prio); + extern void resched_curr(struct rq *rq); + extern void resched_cpu(int cpu); + ++#ifdef CONFIG_PREEMPT_LAZY ++extern void resched_curr_lazy(struct rq *rq); ++#else ++static inline void resched_curr_lazy(struct rq *rq) ++{ ++ resched_curr(rq); ++} ++#endif ++ + extern struct rt_bandwidth def_rt_bandwidth; + extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); + +diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c +index 02dffb1862b8..7caae85af03d 100644 +--- a/kernel/trace/trace.c ++++ b/kernel/trace/trace.c +@@ -2602,8 +2602,16 @@ unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status) + trace_flags |= TRACE_FLAG_NEED_RESCHED; + if (test_preempt_need_resched()) + trace_flags |= TRACE_FLAG_PREEMPT_RESCHED; +- return (trace_flags << 16) | (pc & 0xff) | +- (migration_disable_value() & 0xff) << 8; ++ ++#ifdef CONFIG_PREEMPT_LAZY ++ if (need_resched_lazy()) ++ trace_flags |= TRACE_FLAG_NEED_RESCHED_LAZY; ++#endif ++ ++ return (pc & 0xff) | ++ (migration_disable_value() & 0xff) << 8 | ++ (preempt_lazy_count() & 0xff) << 16 | ++ (trace_flags << 24); + } + + struct ring_buffer_event * +@@ -3861,15 +3869,17 @@ unsigned long trace_total_entries(struct trace_array *tr) + + static void print_lat_help_header(struct seq_file *m) + { +- seq_puts(m, "# _------=> CPU# \n" +- "# / _-----=> irqs-off \n" +- "# | / _----=> need-resched \n" +- "# || / _---=> hardirq/softirq \n" +- "# ||| / _--=> preempt-depth \n" +- "# |||| / _-=> migrate-disable \n" +- "# ||||| / delay \n" +- "# cmd pid |||||| time | caller \n" +- "# \\ / |||||| \\ | / \n"); ++ seq_puts(m, "# _--------=> CPU# \n" ++ "# / _-------=> irqs-off \n" ++ "# | / _------=> need-resched \n" ++ "# || / _-----=> need-resched-lazy\n" ++ "# ||| / _----=> hardirq/softirq \n" ++ "# |||| / _---=> preempt-depth \n" ++ "# ||||| / _--=> preempt-lazy-depth\n" ++ "# |||||| / _-=> migrate-disable \n" ++ "# ||||||| / delay \n" ++ "# cmd pid |||||||| time | caller \n" ++ "# \\ / |||||||| \\ | / \n"); + } + + static void print_event_info(struct array_buffer *buf, struct seq_file *m) +@@ -3903,14 +3913,16 @@ static void print_func_help_header_irq(struct array_buffer *buf, struct seq_file + + print_event_info(buf, m); + +- seq_printf(m, "# %.*s _-----=> irqs-off\n", prec, space); +- seq_printf(m, "# %.*s / _----=> need-resched\n", prec, space); +- seq_printf(m, "# %.*s| / _---=> hardirq/softirq\n", prec, space); +- seq_printf(m, "# %.*s|| / _--=> preempt-depth\n", prec, space); +- seq_printf(m, "# %.*s||| / _-=> migrate-disable\n", prec, space); +- seq_printf(m, "# %.*s|||| / delay\n", prec, space); +- seq_printf(m, "# TASK-PID %.*s CPU# ||||| TIMESTAMP FUNCTION\n", prec, " TGID "); +- seq_printf(m, "# | | %.*s | ||||| | |\n", prec, " | "); ++ seq_printf(m, "# %.*s _-------=> irqs-off\n", prec, space); ++ seq_printf(m, "# %.*s / _------=> need-resched\n", prec, space); ++ seq_printf(m, "# %.*s| / _-----=> need-resched-lazy\n", prec, space); ++ seq_printf(m, "# %.*s|| / _----=> hardirq/softirq\n", prec, space); ++ seq_printf(m, "# %.*s||| / _---=> preempt-depth\n", prec, space); ++ seq_printf(m, "# %.*s|||| / _--=> preempt-lazy-depth\n", prec, space); ++ seq_printf(m, "# %.*s||||| / _-=> migrate-disable\n", prec, space); ++ seq_printf(m, "# %.*s|||||| / delay\n", prec, space); ++ seq_printf(m, "# TASK-PID %.*s CPU# ||||||| TIMESTAMP FUNCTION\n", prec, " TGID "); ++ seq_printf(m, "# | | %.*s | ||||||| | |\n", prec, " | "); + } + + void +diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c +index 636fb7df3714..245b8289ecdf 100644 +--- a/kernel/trace/trace_events.c ++++ b/kernel/trace/trace_events.c +@@ -185,6 +185,7 @@ static int trace_define_common_fields(void) + __common_field(unsigned char, preempt_count); + __common_field(int, pid); + __common_field(unsigned char, migrate_disable); ++ __common_field(unsigned char, preempt_lazy_count); + + return ret; + } +diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c +index b3619b21217c..5a71964ade3a 100644 +--- a/kernel/trace/trace_output.c ++++ b/kernel/trace/trace_output.c +@@ -451,6 +451,7 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) + { + char hardsoft_irq; + char need_resched; ++ char need_resched_lazy; + char irqs_off; + int hardirq; + int softirq; +@@ -481,6 +482,9 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) + break; + } + ++ need_resched_lazy = ++ (entry->flags & TRACE_FLAG_NEED_RESCHED_LAZY) ? 'L' : '.'; ++ + hardsoft_irq = + (nmi && hardirq) ? 'Z' : + nmi ? 'z' : +@@ -489,14 +493,20 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) + softirq ? 's' : + '.' ; + +- trace_seq_printf(s, "%c%c%c", +- irqs_off, need_resched, hardsoft_irq); ++ trace_seq_printf(s, "%c%c%c%c", ++ irqs_off, need_resched, need_resched_lazy, ++ hardsoft_irq); + + if (entry->preempt_count) + trace_seq_printf(s, "%x", entry->preempt_count); + else + trace_seq_putc(s, '.'); + ++ if (entry->preempt_lazy_count) ++ trace_seq_printf(s, "%x", entry->preempt_lazy_count); ++ else ++ trace_seq_putc(s, '.'); ++ + if (entry->migrate_disable) + trace_seq_printf(s, "%x", entry->migrate_disable); + else +-- +2.43.0 + diff --git a/debian/patches-rt/0259-x86-entry-Use-should_resched-in-idtentry_exit_cond_r.patch b/debian/patches-rt/0259-x86-entry-Use-should_resched-in-idtentry_exit_cond_r.patch new file mode 100644 index 000000000..44c4cc8c2 --- /dev/null +++ b/debian/patches-rt/0259-x86-entry-Use-should_resched-in-idtentry_exit_cond_r.patch @@ -0,0 +1,35 @@ +From 367fe43dc09205ecc5fcdca95276fc481e7ccd9f Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Tue, 30 Jun 2020 11:45:14 +0200 +Subject: [PATCH 259/323] x86/entry: Use should_resched() in + idtentry_exit_cond_resched() +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +The TIF_NEED_RESCHED bit is inlined on x86 into the preemption counter. +By using should_resched(0) instead of need_resched() the same check can +be performed which uses the same variable as 'preempt_count()` which was +issued before. + +Use should_resched(0) instead need_resched(). + +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + kernel/entry/common.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/kernel/entry/common.c b/kernel/entry/common.c +index e6da86039ccf..f32250adb681 100644 +--- a/kernel/entry/common.c ++++ b/kernel/entry/common.c +@@ -371,7 +371,7 @@ void irqentry_exit_cond_resched(void) + rcu_irq_exit_check_preempt(); + if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) + WARN_ON_ONCE(!on_thread_stack()); +- if (need_resched()) ++ if (should_resched(0)) + preempt_schedule_irq(); + } + } +-- +2.43.0 + diff --git a/debian/patches-rt/0260-x86-Support-for-lazy-preemption.patch b/debian/patches-rt/0260-x86-Support-for-lazy-preemption.patch new file mode 100644 index 000000000..1b4da37ab --- /dev/null +++ b/debian/patches-rt/0260-x86-Support-for-lazy-preemption.patch @@ -0,0 +1,171 @@ +From 3e75dc5bbc430b2a1db27fe101ed508b067af7b0 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Thu, 1 Nov 2012 11:03:47 +0100 +Subject: [PATCH 260/323] x86: Support for lazy preemption +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Implement the x86 pieces for lazy preempt. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Signed-off-by: Luis Claudio R. Goncalves <lgoncalv@redhat.com> +--- + arch/x86/Kconfig | 1 + + arch/x86/include/asm/preempt.h | 33 +++++++++++++++++++++++++++++- + arch/x86/include/asm/thread_info.h | 11 ++++++++++ + include/linux/entry-common.h | 6 +++--- + kernel/entry/common.c | 2 +- + 5 files changed, 48 insertions(+), 5 deletions(-) + +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index c72d66a0e840..b9f68b01a8c8 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -213,6 +213,7 @@ config X86 + select HAVE_PCI + select HAVE_PERF_REGS + select HAVE_PERF_USER_STACK_DUMP ++ select HAVE_PREEMPT_LAZY + select MMU_GATHER_RCU_TABLE_FREE if PARAVIRT + select HAVE_POSIX_CPU_TIMERS_TASK_WORK + select HAVE_REGS_AND_STACK_ACCESS_API +diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h +index 50e0c0ab7b97..afe37a8c6c24 100644 +--- a/arch/x86/include/asm/preempt.h ++++ b/arch/x86/include/asm/preempt.h +@@ -89,17 +89,48 @@ static __always_inline void __preempt_count_sub(int val) + * a decrement which hits zero means we have no preempt_count and should + * reschedule. + */ +-static __always_inline bool __preempt_count_dec_and_test(void) ++static __always_inline bool ____preempt_count_dec_and_test(void) + { + return GEN_UNARY_RMWcc("decl", __preempt_count, e, __percpu_arg([var])); + } + ++static __always_inline bool __preempt_count_dec_and_test(void) ++{ ++ if (____preempt_count_dec_and_test()) ++ return true; ++#ifdef CONFIG_PREEMPT_LAZY ++ if (preempt_count()) ++ return false; ++ if (current_thread_info()->preempt_lazy_count) ++ return false; ++ return test_thread_flag(TIF_NEED_RESCHED_LAZY); ++#else ++ return false; ++#endif ++} ++ + /* + * Returns true when we need to resched and can (barring IRQ state). + */ + static __always_inline bool should_resched(int preempt_offset) + { ++#ifdef CONFIG_PREEMPT_LAZY ++ u32 tmp; ++ tmp = raw_cpu_read_4(__preempt_count); ++ if (tmp == preempt_offset) ++ return true; ++ ++ /* preempt count == 0 ? */ ++ tmp &= ~PREEMPT_NEED_RESCHED; ++ if (tmp != preempt_offset) ++ return false; ++ /* XXX PREEMPT_LOCK_OFFSET */ ++ if (current_thread_info()->preempt_lazy_count) ++ return false; ++ return test_thread_flag(TIF_NEED_RESCHED_LAZY); ++#else + return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset); ++#endif + } + + #ifdef CONFIG_PREEMPTION +diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h +index 012c8ee93b67..d77e99ba7610 100644 +--- a/arch/x86/include/asm/thread_info.h ++++ b/arch/x86/include/asm/thread_info.h +@@ -56,17 +56,24 @@ struct task_struct; + struct thread_info { + unsigned long flags; /* low level flags */ + u32 status; /* thread synchronous flags */ ++ int preempt_lazy_count; /* 0 => lazy preemptable ++ <0 => BUG */ + }; + + #define INIT_THREAD_INFO(tsk) \ + { \ + .flags = 0, \ ++ .preempt_lazy_count = 0, \ + } + + #else /* !__ASSEMBLY__ */ + + #include <asm/asm-offsets.h> + ++#define GET_THREAD_INFO(reg) \ ++ _ASM_MOV PER_CPU_VAR(cpu_current_top_of_stack),reg ; \ ++ _ASM_SUB $(THREAD_SIZE),reg ; ++ + #endif + + /* +@@ -103,6 +110,7 @@ struct thread_info { + #define TIF_SYSCALL_TRACEPOINT 28 /* syscall tracepoint instrumentation */ + #define TIF_ADDR32 29 /* 32-bit address space on 64 bits */ + #define TIF_X32 30 /* 32-bit native x86-64 binary */ ++#define TIF_NEED_RESCHED_LAZY 31 /* lazy rescheduling necessary */ + + #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) + #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) +@@ -124,6 +132,7 @@ struct thread_info { + #define _TIF_IA32 (1 << TIF_IA32) + #define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) + #define _TIF_SLD (1 << TIF_SLD) ++#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY) + #define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) + #define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) + #define _TIF_FORCED_TF (1 << TIF_FORCED_TF) +@@ -156,6 +165,8 @@ struct thread_info { + + #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW) + ++#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY) ++ + #define STACK_WARN (THREAD_SIZE/8) + + /* +diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h +index 46c42479f950..b902af13d05f 100644 +--- a/include/linux/entry-common.h ++++ b/include/linux/entry-common.h +@@ -67,9 +67,9 @@ + # define ARCH_EXIT_TO_USER_MODE_WORK (0) + #endif + +-#define EXIT_TO_USER_MODE_WORK \ +- (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ +- _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | \ ++#define EXIT_TO_USER_MODE_WORK \ ++ (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ ++ _TIF_NEED_RESCHED_MASK | _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | \ + ARCH_EXIT_TO_USER_MODE_WORK) + + /** +diff --git a/kernel/entry/common.c b/kernel/entry/common.c +index f32250adb681..cdf97ea30b4c 100644 +--- a/kernel/entry/common.c ++++ b/kernel/entry/common.c +@@ -157,7 +157,7 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, + + local_irq_enable_exit_to_user(ti_work); + +- if (ti_work & _TIF_NEED_RESCHED) ++ if (ti_work & _TIF_NEED_RESCHED_MASK) + schedule(); + + #ifdef ARCH_RT_DELAYS_SIGNAL_SEND +-- +2.43.0 + diff --git a/debian/patches-rt/0261-arm-Add-support-for-lazy-preemption.patch b/debian/patches-rt/0261-arm-Add-support-for-lazy-preemption.patch new file mode 100644 index 000000000..18012c8d8 --- /dev/null +++ b/debian/patches-rt/0261-arm-Add-support-for-lazy-preemption.patch @@ -0,0 +1,153 @@ +From 125d4e163b1aa93781f9cf4673f729053af2bc53 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Wed, 31 Oct 2012 12:04:11 +0100 +Subject: [PATCH 261/323] arm: Add support for lazy preemption +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Implement the arm pieces for lazy preempt. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Signed-off-by: Luis Claudio R. Goncalves <lgoncalv@redhat.com> +--- + arch/arm/Kconfig | 1 + + arch/arm/include/asm/thread_info.h | 5 ++++- + arch/arm/kernel/asm-offsets.c | 1 + + arch/arm/kernel/entry-armv.S | 19 ++++++++++++++++--- + arch/arm/kernel/entry-common.S | 1 + + arch/arm/kernel/signal.c | 3 ++- + 6 files changed, 25 insertions(+), 5 deletions(-) + +diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig +index c01251683018..ee2cbe8ad502 100644 +--- a/arch/arm/Kconfig ++++ b/arch/arm/Kconfig +@@ -107,6 +107,7 @@ config ARM + select HAVE_PERF_EVENTS + select HAVE_PERF_REGS + select HAVE_PERF_USER_STACK_DUMP ++ select HAVE_PREEMPT_LAZY + select MMU_GATHER_RCU_TABLE_FREE if SMP && ARM_LPAE + select HAVE_REGS_AND_STACK_ACCESS_API + select HAVE_RSEQ +diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h +index fcccf35f5cf9..3b981ad0c322 100644 +--- a/arch/arm/include/asm/thread_info.h ++++ b/arch/arm/include/asm/thread_info.h +@@ -46,6 +46,7 @@ struct cpu_context_save { + struct thread_info { + unsigned long flags; /* low level flags */ + int preempt_count; /* 0 => preemptable, <0 => bug */ ++ int preempt_lazy_count; /* 0 => preemptable, <0 => bug */ + mm_segment_t addr_limit; /* address limit */ + struct task_struct *task; /* main task structure */ + __u32 cpu; /* cpu */ +@@ -134,6 +135,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp *, + #define TIF_NOTIFY_RESUME 2 /* callback before returning to user */ + #define TIF_UPROBE 3 /* breakpointed or singlestepping */ + #define TIF_NOTIFY_SIGNAL 4 /* signal notifications exist */ ++#define TIF_NEED_RESCHED_LAZY 5 + + #define TIF_USING_IWMMXT 17 + #define TIF_MEMDIE 18 /* is terminating due to OOM killer */ +@@ -143,10 +145,10 @@ extern int vfp_restore_user_hwstate(struct user_vfp *, + #define TIF_SYSCALL_TRACEPOINT 22 /* syscall tracepoint instrumentation */ + #define TIF_SECCOMP 23 /* seccomp syscall filtering active */ + +- + #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) + #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) + #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) ++#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY) + #define _TIF_UPROBE (1 << TIF_UPROBE) + #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) + #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) +@@ -164,6 +166,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp *, + */ + #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ + _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ ++ _TIF_NEED_RESCHED_LAZY | \ + _TIF_NOTIFY_SIGNAL) + + #endif /* __KERNEL__ */ +diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c +index 70993af22d80..024c65c3a0f2 100644 +--- a/arch/arm/kernel/asm-offsets.c ++++ b/arch/arm/kernel/asm-offsets.c +@@ -43,6 +43,7 @@ int main(void) + BLANK(); + DEFINE(TI_FLAGS, offsetof(struct thread_info, flags)); + DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count)); ++ DEFINE(TI_PREEMPT_LAZY, offsetof(struct thread_info, preempt_lazy_count)); + DEFINE(TI_ADDR_LIMIT, offsetof(struct thread_info, addr_limit)); + DEFINE(TI_TASK, offsetof(struct thread_info, task)); + DEFINE(TI_CPU, offsetof(struct thread_info, cpu)); +diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S +index 030351d169aa..d6086559168d 100644 +--- a/arch/arm/kernel/entry-armv.S ++++ b/arch/arm/kernel/entry-armv.S +@@ -206,11 +206,18 @@ __irq_svc: + + #ifdef CONFIG_PREEMPTION + ldr r8, [tsk, #TI_PREEMPT] @ get preempt count +- ldr r0, [tsk, #TI_FLAGS] @ get flags + teq r8, #0 @ if preempt count != 0 ++ bne 1f @ return from exeption ++ ldr r0, [tsk, #TI_FLAGS] @ get flags ++ tst r0, #_TIF_NEED_RESCHED @ if NEED_RESCHED is set ++ blne svc_preempt @ preempt! ++ ++ ldr r8, [tsk, #TI_PREEMPT_LAZY] @ get preempt lazy count ++ teq r8, #0 @ if preempt lazy count != 0 + movne r0, #0 @ force flags to 0 +- tst r0, #_TIF_NEED_RESCHED ++ tst r0, #_TIF_NEED_RESCHED_LAZY + blne svc_preempt ++1: + #endif + + svc_exit r5, irq = 1 @ return from exception +@@ -225,8 +232,14 @@ svc_preempt: + 1: bl preempt_schedule_irq @ irq en/disable is done inside + ldr r0, [tsk, #TI_FLAGS] @ get new tasks TI_FLAGS + tst r0, #_TIF_NEED_RESCHED ++ bne 1b ++ tst r0, #_TIF_NEED_RESCHED_LAZY + reteq r8 @ go again +- b 1b ++ ldr r0, [tsk, #TI_PREEMPT_LAZY] @ get preempt lazy count ++ teq r0, #0 @ if preempt lazy count != 0 ++ beq 1b ++ ret r8 @ go again ++ + #endif + + __und_fault: +diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S +index 9b3c737575e9..a30b1a1cc4d1 100644 +--- a/arch/arm/kernel/entry-common.S ++++ b/arch/arm/kernel/entry-common.S +@@ -92,6 +92,7 @@ __ret_fast_syscall: + ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing + movs r1, r1, lsl #16 + beq no_work_pending ++do_slower_path: + UNWIND(.fnend ) + ENDPROC(ret_fast_syscall) + +diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c +index a3a38d0a4c85..f04ccf19ab1f 100644 +--- a/arch/arm/kernel/signal.c ++++ b/arch/arm/kernel/signal.c +@@ -649,7 +649,8 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall) + */ + trace_hardirqs_off(); + do { +- if (likely(thread_flags & _TIF_NEED_RESCHED)) { ++ if (likely(thread_flags & (_TIF_NEED_RESCHED | ++ _TIF_NEED_RESCHED_LAZY))) { + schedule(); + } else { + if (unlikely(!user_mode(regs))) +-- +2.43.0 + diff --git a/debian/patches-rt/0262-powerpc-Add-support-for-lazy-preemption.patch b/debian/patches-rt/0262-powerpc-Add-support-for-lazy-preemption.patch new file mode 100644 index 000000000..259c9928b --- /dev/null +++ b/debian/patches-rt/0262-powerpc-Add-support-for-lazy-preemption.patch @@ -0,0 +1,267 @@ +From 8e5e88a7528b5f0b0f2b9835deaf8b6db693500f Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Thu, 1 Nov 2012 10:14:11 +0100 +Subject: [PATCH 262/323] powerpc: Add support for lazy preemption +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Implement the powerpc pieces for lazy preempt. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Signed-off-by: Luis Claudio R. Goncalves <lgoncalv@redhat.com> +--- + arch/powerpc/Kconfig | 1 + + arch/powerpc/include/asm/thread_info.h | 16 ++++++++++++---- + arch/powerpc/kernel/asm-offsets.c | 1 + + arch/powerpc/kernel/entry_32.S | 23 ++++++++++++++++------- + arch/powerpc/kernel/exceptions-64e.S | 16 ++++++++++++---- + arch/powerpc/kernel/syscall_64.c | 10 +++++++--- + 6 files changed, 49 insertions(+), 18 deletions(-) + +diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig +index b3ab6c2d9f66..6d8ce54ad6dd 100644 +--- a/arch/powerpc/Kconfig ++++ b/arch/powerpc/Kconfig +@@ -230,6 +230,7 @@ config PPC + select HAVE_HARDLOCKUP_DETECTOR_PERF if PERF_EVENTS && HAVE_PERF_EVENTS_NMI && !HAVE_HARDLOCKUP_DETECTOR_ARCH + select HAVE_PERF_REGS + select HAVE_PERF_USER_STACK_DUMP ++ select HAVE_PREEMPT_LAZY + select MMU_GATHER_RCU_TABLE_FREE + select MMU_GATHER_PAGE_SIZE + select HAVE_REGS_AND_STACK_ACCESS_API +diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h +index ff31d2fa2140..23bfe231fda3 100644 +--- a/arch/powerpc/include/asm/thread_info.h ++++ b/arch/powerpc/include/asm/thread_info.h +@@ -54,6 +54,8 @@ + struct thread_info { + int preempt_count; /* 0 => preemptable, + <0 => BUG */ ++ int preempt_lazy_count; /* 0 => preemptable, ++ <0 => BUG */ + unsigned long local_flags; /* private flags for thread */ + #ifdef CONFIG_LIVEPATCH + unsigned long *livepatch_sp; +@@ -104,11 +106,12 @@ void arch_setup_new_exec(void); + #define TIF_SINGLESTEP 8 /* singlestepping active */ + #define TIF_NOHZ 9 /* in adaptive nohz mode */ + #define TIF_SECCOMP 10 /* secure computing */ +-#define TIF_RESTOREALL 11 /* Restore all regs (implies NOERROR) */ +-#define TIF_NOERROR 12 /* Force successful syscall return */ ++ ++#define TIF_NEED_RESCHED_LAZY 11 /* lazy rescheduling necessary */ ++#define TIF_SYSCALL_TRACEPOINT 12 /* syscall tracepoint instrumentation */ ++ + #define TIF_NOTIFY_RESUME 13 /* callback before returning to user */ + #define TIF_UPROBE 14 /* breakpointed or single-stepping */ +-#define TIF_SYSCALL_TRACEPOINT 15 /* syscall tracepoint instrumentation */ + #define TIF_EMULATE_STACK_STORE 16 /* Is an instruction emulation + for stack store? */ + #define TIF_MEMDIE 17 /* is terminating due to OOM killer */ +@@ -117,6 +120,9 @@ void arch_setup_new_exec(void); + #endif + #define TIF_POLLING_NRFLAG 19 /* true if poll_idle() is polling TIF_NEED_RESCHED */ + #define TIF_32BIT 20 /* 32 bit binary */ ++#define TIF_RESTOREALL 21 /* Restore all regs (implies NOERROR) */ ++#define TIF_NOERROR 22 /* Force successful syscall return */ ++ + + /* as above, but as bit values */ + #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE) +@@ -137,6 +143,7 @@ void arch_setup_new_exec(void); + #define _TIF_SYSCALL_TRACEPOINT (1<<TIF_SYSCALL_TRACEPOINT) + #define _TIF_EMULATE_STACK_STORE (1<<TIF_EMULATE_STACK_STORE) + #define _TIF_NOHZ (1<<TIF_NOHZ) ++#define _TIF_NEED_RESCHED_LAZY (1<<TIF_NEED_RESCHED_LAZY) + #define _TIF_SYSCALL_EMU (1<<TIF_SYSCALL_EMU) + #define _TIF_SYSCALL_DOTRACE (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ + _TIF_SECCOMP | _TIF_SYSCALL_TRACEPOINT | \ +@@ -145,8 +152,9 @@ void arch_setup_new_exec(void); + #define _TIF_USER_WORK_MASK (_TIF_SIGPENDING | _TIF_NEED_RESCHED | \ + _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ + _TIF_RESTORE_TM | _TIF_PATCH_PENDING | \ +- _TIF_NOTIFY_SIGNAL) ++ _TIF_NEED_RESCHED_LAZY | _TIF_NOTIFY_SIGNAL) + #define _TIF_PERSYSCALL_MASK (_TIF_RESTOREALL|_TIF_NOERROR) ++#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY) + + /* Bits in local_flags */ + /* Don't move TLF_NAPPING without adjusting the code in entry_32.S */ +diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c +index 5c125255571c..597379121407 100644 +--- a/arch/powerpc/kernel/asm-offsets.c ++++ b/arch/powerpc/kernel/asm-offsets.c +@@ -189,6 +189,7 @@ int main(void) + OFFSET(TI_FLAGS, thread_info, flags); + OFFSET(TI_LOCAL_FLAGS, thread_info, local_flags); + OFFSET(TI_PREEMPT, thread_info, preempt_count); ++ OFFSET(TI_PREEMPT_LAZY, thread_info, preempt_lazy_count); + + #ifdef CONFIG_PPC64 + OFFSET(DCACHEL1BLOCKSIZE, ppc64_caches, l1d.block_size); +diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S +index 459f5d00b990..fc9517a97640 100644 +--- a/arch/powerpc/kernel/entry_32.S ++++ b/arch/powerpc/kernel/entry_32.S +@@ -414,7 +414,9 @@ ret_from_syscall: + mtmsr r10 + lwz r9,TI_FLAGS(r2) + li r8,-MAX_ERRNO +- andi. r0,r9,(_TIF_SYSCALL_DOTRACE|_TIF_SINGLESTEP|_TIF_USER_WORK_MASK|_TIF_PERSYSCALL_MASK) ++ lis r0,(_TIF_SYSCALL_DOTRACE|_TIF_SINGLESTEP|_TIF_USER_WORK_MASK|_TIF_PERSYSCALL_MASK)@h ++ ori r0,r0, (_TIF_SYSCALL_DOTRACE|_TIF_SINGLESTEP|_TIF_USER_WORK_MASK|_TIF_PERSYSCALL_MASK)@l ++ and. r0,r9,r0 + bne- syscall_exit_work + cmplw 0,r3,r8 + blt+ syscall_exit_cont +@@ -530,13 +532,13 @@ syscall_dotrace: + b syscall_dotrace_cont + + syscall_exit_work: +- andi. r0,r9,_TIF_RESTOREALL ++ andis. r0,r9,_TIF_RESTOREALL@h + beq+ 0f + REST_NVGPRS(r1) + b 2f + 0: cmplw 0,r3,r8 + blt+ 1f +- andi. r0,r9,_TIF_NOERROR ++ andis. r0,r9,_TIF_NOERROR@h + bne- 1f + lwz r11,_CCR(r1) /* Load CR */ + neg r3,r3 +@@ -545,12 +547,12 @@ syscall_exit_work: + + 1: stw r6,RESULT(r1) /* Save result */ + stw r3,GPR3(r1) /* Update return value */ +-2: andi. r0,r9,(_TIF_PERSYSCALL_MASK) ++2: andis. r0,r9,(_TIF_PERSYSCALL_MASK)@h + beq 4f + + /* Clear per-syscall TIF flags if any are set. */ + +- li r11,_TIF_PERSYSCALL_MASK ++ lis r11,(_TIF_PERSYSCALL_MASK)@h + addi r12,r2,TI_FLAGS + 3: lwarx r8,0,r12 + andc r8,r8,r11 +@@ -927,7 +929,14 @@ resume_kernel: + cmpwi 0,r0,0 /* if non-zero, just restore regs and return */ + bne restore_kuap + andi. r8,r8,_TIF_NEED_RESCHED ++ bne+ 1f ++ lwz r0,TI_PREEMPT_LAZY(r2) ++ cmpwi 0,r0,0 /* if non-zero, just restore regs and return */ ++ bne restore_kuap ++ lwz r0,TI_FLAGS(r2) ++ andi. r0,r0,_TIF_NEED_RESCHED_LAZY + beq+ restore_kuap ++1: + lwz r3,_MSR(r1) + andi. r0,r3,MSR_EE /* interrupts off? */ + beq restore_kuap /* don't schedule if so */ +@@ -1248,7 +1257,7 @@ global_dbcr0: + #endif /* !(CONFIG_4xx || CONFIG_BOOKE) */ + + do_work: /* r10 contains MSR_KERNEL here */ +- andi. r0,r9,_TIF_NEED_RESCHED ++ andi. r0,r9,_TIF_NEED_RESCHED_MASK + beq do_user_signal + + do_resched: /* r10 contains MSR_KERNEL here */ +@@ -1267,7 +1276,7 @@ recheck: + LOAD_REG_IMMEDIATE(r10,MSR_KERNEL) + mtmsr r10 /* disable interrupts */ + lwz r9,TI_FLAGS(r2) +- andi. r0,r9,_TIF_NEED_RESCHED ++ andi. r0,r9,_TIF_NEED_RESCHED_MASK + bne- do_resched + andi. r0,r9,_TIF_USER_WORK_MASK + beq restore_user +diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S +index f579ce46eef2..715ff292a8f8 100644 +--- a/arch/powerpc/kernel/exceptions-64e.S ++++ b/arch/powerpc/kernel/exceptions-64e.S +@@ -1080,7 +1080,7 @@ _GLOBAL(ret_from_except_lite) + li r10, -1 + mtspr SPRN_DBSR,r10 + b restore +-1: andi. r0,r4,_TIF_NEED_RESCHED ++1: andi. r0,r4,_TIF_NEED_RESCHED_MASK + beq 2f + bl restore_interrupts + SCHEDULE_USER +@@ -1132,12 +1132,20 @@ resume_kernel: + bne- 0b + 1: + +-#ifdef CONFIG_PREEMPT ++#ifdef CONFIG_PREEMPTION + /* Check if we need to preempt */ ++ lwz r8,TI_PREEMPT(r9) ++ cmpwi 0,r8,0 /* if non-zero, just restore regs and return */ ++ bne restore + andi. r0,r4,_TIF_NEED_RESCHED ++ bne+ check_count ++ ++ andi. r0,r4,_TIF_NEED_RESCHED_LAZY + beq+ restore ++ lwz r8,TI_PREEMPT_LAZY(r9) ++ + /* Check that preempt_count() == 0 and interrupts are enabled */ +- lwz r8,TI_PREEMPT(r9) ++check_count: + cmpwi cr0,r8,0 + bne restore + ld r0,SOFTE(r1) +@@ -1158,7 +1166,7 @@ resume_kernel: + * interrupted after loading SRR0/1. + */ + wrteei 0 +-#endif /* CONFIG_PREEMPT */ ++#endif /* CONFIG_PREEMPTION */ + + restore: + /* +diff --git a/arch/powerpc/kernel/syscall_64.c b/arch/powerpc/kernel/syscall_64.c +index 310bcd768cd5..ae3212dcf562 100644 +--- a/arch/powerpc/kernel/syscall_64.c ++++ b/arch/powerpc/kernel/syscall_64.c +@@ -193,7 +193,7 @@ notrace unsigned long syscall_exit_prepare(unsigned long r3, + ti_flags = READ_ONCE(*ti_flagsp); + while (unlikely(ti_flags & (_TIF_USER_WORK_MASK & ~_TIF_RESTORE_TM))) { + local_irq_enable(); +- if (ti_flags & _TIF_NEED_RESCHED) { ++ if (ti_flags & _TIF_NEED_RESCHED_MASK) { + schedule(); + } else { + /* +@@ -277,7 +277,7 @@ notrace unsigned long interrupt_exit_user_prepare(struct pt_regs *regs, unsigned + ti_flags = READ_ONCE(*ti_flagsp); + while (unlikely(ti_flags & (_TIF_USER_WORK_MASK & ~_TIF_RESTORE_TM))) { + local_irq_enable(); /* returning to user: may enable */ +- if (ti_flags & _TIF_NEED_RESCHED) { ++ if (ti_flags & _TIF_NEED_RESCHED_MASK) { + schedule(); + } else { + if (ti_flags & _TIF_SIGPENDING) +@@ -361,11 +361,15 @@ notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs, unsign + /* Returning to a kernel context with local irqs enabled. */ + WARN_ON_ONCE(!(regs->msr & MSR_EE)); + again: +- if (IS_ENABLED(CONFIG_PREEMPT)) { ++ if (IS_ENABLED(CONFIG_PREEMPTION)) { + /* Return to preemptible kernel context */ + if (unlikely(*ti_flagsp & _TIF_NEED_RESCHED)) { + if (preempt_count() == 0) + preempt_schedule_irq(); ++ } else if (unlikely(*ti_flagsp & _TIF_NEED_RESCHED_LAZY)) { ++ if ((preempt_count() == 0) && ++ (current_thread_info()->preempt_lazy_count == 0)) ++ preempt_schedule_irq(); + } + } + +-- +2.43.0 + diff --git a/debian/patches-rt/0263-arch-arm64-Add-lazy-preempt-support.patch b/debian/patches-rt/0263-arch-arm64-Add-lazy-preempt-support.patch new file mode 100644 index 000000000..bf46cf898 --- /dev/null +++ b/debian/patches-rt/0263-arch-arm64-Add-lazy-preempt-support.patch @@ -0,0 +1,168 @@ +From 7499fc472caecd0270f69d5857ca8a53cc6dad83 Mon Sep 17 00:00:00 2001 +From: Anders Roxell <anders.roxell@linaro.org> +Date: Thu, 14 May 2015 17:52:17 +0200 +Subject: [PATCH 263/323] arch/arm64: Add lazy preempt support +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +arm64 is missing support for PREEMPT_RT. The main feature which is +lacking is support for lazy preemption. The arch-specific entry code, +thread information structure definitions, and associated data tables +have to be extended to provide this support. Then the Kconfig file has +to be extended to indicate the support is available, and also to +indicate that support for full RT preemption is now available. + +Signed-off-by: Anders Roxell <anders.roxell@linaro.org> +Signed-off-by: Luis Claudio R. Goncalves <lgoncalv@redhat.com> +--- + arch/arm64/Kconfig | 1 + + arch/arm64/include/asm/preempt.h | 25 ++++++++++++++++++++++++- + arch/arm64/include/asm/thread_info.h | 6 +++++- + arch/arm64/kernel/asm-offsets.c | 1 + + arch/arm64/kernel/entry.S | 13 +++++++++++-- + arch/arm64/kernel/signal.c | 2 +- + 6 files changed, 43 insertions(+), 5 deletions(-) + +diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig +index 13cf137da999..7681d7b65c7d 100644 +--- a/arch/arm64/Kconfig ++++ b/arch/arm64/Kconfig +@@ -173,6 +173,7 @@ config ARM64 + select HAVE_PERF_EVENTS + select HAVE_PERF_REGS + select HAVE_PERF_USER_STACK_DUMP ++ select HAVE_PREEMPT_LAZY + select HAVE_REGS_AND_STACK_ACCESS_API + select HAVE_FUNCTION_ARG_ACCESS_API + select HAVE_FUTEX_CMPXCHG if FUTEX +diff --git a/arch/arm64/include/asm/preempt.h b/arch/arm64/include/asm/preempt.h +index f1486b32502c..7a5770d825b9 100644 +--- a/arch/arm64/include/asm/preempt.h ++++ b/arch/arm64/include/asm/preempt.h +@@ -70,13 +70,36 @@ static inline bool __preempt_count_dec_and_test(void) + * interrupt occurring between the non-atomic READ_ONCE/WRITE_ONCE + * pair. + */ +- return !pc || !READ_ONCE(ti->preempt_count); ++ if (!pc || !READ_ONCE(ti->preempt_count)) ++ return true; ++#ifdef CONFIG_PREEMPT_LAZY ++ if ((pc & ~PREEMPT_NEED_RESCHED)) ++ return false; ++ if (current_thread_info()->preempt_lazy_count) ++ return false; ++ return test_thread_flag(TIF_NEED_RESCHED_LAZY); ++#else ++ return false; ++#endif + } + + static inline bool should_resched(int preempt_offset) + { ++#ifdef CONFIG_PREEMPT_LAZY ++ u64 pc = READ_ONCE(current_thread_info()->preempt_count); ++ if (pc == preempt_offset) ++ return true; ++ ++ if ((pc & ~PREEMPT_NEED_RESCHED) != preempt_offset) ++ return false; ++ ++ if (current_thread_info()->preempt_lazy_count) ++ return false; ++ return test_thread_flag(TIF_NEED_RESCHED_LAZY); ++#else + u64 pc = READ_ONCE(current_thread_info()->preempt_count); + return pc == preempt_offset; ++#endif + } + + #ifdef CONFIG_PREEMPTION +diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h +index cdcf307764aa..6eb36a2126e8 100644 +--- a/arch/arm64/include/asm/thread_info.h ++++ b/arch/arm64/include/asm/thread_info.h +@@ -29,6 +29,7 @@ struct thread_info { + #ifdef CONFIG_ARM64_SW_TTBR0_PAN + u64 ttbr0; /* saved TTBR0_EL1 */ + #endif ++ int preempt_lazy_count; /* 0 => preemptable, <0 => bug */ + union { + u64 preempt_count; /* 0 => preemptible, <0 => bug */ + struct { +@@ -74,6 +75,7 @@ void arch_release_task_struct(struct task_struct *tsk); + #define TIF_SYSCALL_TRACEPOINT 10 /* syscall tracepoint for ftrace */ + #define TIF_SECCOMP 11 /* syscall secure computing */ + #define TIF_SYSCALL_EMU 12 /* syscall emulation active */ ++#define TIF_NEED_RESCHED_LAZY 13 + #define TIF_MEMDIE 18 /* is terminating due to OOM killer */ + #define TIF_FREEZE 19 + #define TIF_RESTORE_SIGMASK 20 +@@ -99,13 +101,15 @@ void arch_release_task_struct(struct task_struct *tsk); + #define _TIF_32BIT (1 << TIF_32BIT) + #define _TIF_SVE (1 << TIF_SVE) + #define _TIF_MTE_ASYNC_FAULT (1 << TIF_MTE_ASYNC_FAULT) ++#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY) + #define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) + + #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ + _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \ + _TIF_UPROBE | _TIF_FSCHECK | _TIF_MTE_ASYNC_FAULT | \ +- _TIF_NOTIFY_SIGNAL) ++ _TIF_NEED_RESCHED_LAZY | _TIF_NOTIFY_SIGNAL) + ++#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY) + #define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ + _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \ + _TIF_SYSCALL_EMU) +diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c +index 7d32fc959b1a..b2f29bd2ae87 100644 +--- a/arch/arm64/kernel/asm-offsets.c ++++ b/arch/arm64/kernel/asm-offsets.c +@@ -30,6 +30,7 @@ int main(void) + BLANK(); + DEFINE(TSK_TI_FLAGS, offsetof(struct task_struct, thread_info.flags)); + DEFINE(TSK_TI_PREEMPT, offsetof(struct task_struct, thread_info.preempt_count)); ++ DEFINE(TSK_TI_PREEMPT_LAZY, offsetof(struct task_struct, thread_info.preempt_lazy_count)); + DEFINE(TSK_TI_ADDR_LIMIT, offsetof(struct task_struct, thread_info.addr_limit)); + #ifdef CONFIG_ARM64_SW_TTBR0_PAN + DEFINE(TSK_TI_TTBR0, offsetof(struct task_struct, thread_info.ttbr0)); +diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S +index a94acea770c7..0bfa079043f8 100644 +--- a/arch/arm64/kernel/entry.S ++++ b/arch/arm64/kernel/entry.S +@@ -523,9 +523,18 @@ alternative_if ARM64_HAS_IRQ_PRIO_MASKING + mrs x0, daif + orr x24, x24, x0 + alternative_else_nop_endif +- cbnz x24, 1f // preempt count != 0 || NMI return path +- bl arm64_preempt_schedule_irq // irq en/disable is done inside ++ ++ cbz x24, 1f // (need_resched + count) == 0 ++ cbnz w24, 2f // count != 0 ++ ++ ldr w24, [tsk, #TSK_TI_PREEMPT_LAZY] // get preempt lazy count ++ cbnz w24, 2f // preempt lazy count != 0 ++ ++ ldr x0, [tsk, #TSK_TI_FLAGS] // get flags ++ tbz x0, #TIF_NEED_RESCHED_LAZY, 2f // needs rescheduling? + 1: ++ bl arm64_preempt_schedule_irq // irq en/disable is done inside ++2: + #endif + + mov x0, sp +diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c +index b6fbbd527dd7..94eed0dc3afc 100644 +--- a/arch/arm64/kernel/signal.c ++++ b/arch/arm64/kernel/signal.c +@@ -921,7 +921,7 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, + /* Check valid user FS if needed */ + addr_limit_user_check(); + +- if (thread_flags & _TIF_NEED_RESCHED) { ++ if (thread_flags & _TIF_NEED_RESCHED_MASK) { + /* Unmask Debug and SError for the next task */ + local_daif_restore(DAIF_PROCCTX_NOIRQ); + +-- +2.43.0 + diff --git a/debian/patches-rt/0264-jump-label-disable-if-stop_machine-is-used.patch b/debian/patches-rt/0264-jump-label-disable-if-stop_machine-is-used.patch new file mode 100644 index 000000000..168745160 --- /dev/null +++ b/debian/patches-rt/0264-jump-label-disable-if-stop_machine-is-used.patch @@ -0,0 +1,42 @@ +From 915e18f01ac0c97a6dc374b940a96a105bf57c62 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Wed, 8 Jul 2015 17:14:48 +0200 +Subject: [PATCH 264/323] jump-label: disable if stop_machine() is used +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Some architectures are using stop_machine() while switching the opcode which +leads to latency spikes. +The architectures which use stop_machine() atm: +- ARM stop machine +- s390 stop machine + +The architecures which use other sorcery: +- MIPS +- X86 +- powerpc +- sparc +- arm64 + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +[bigeasy: only ARM for now] +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + arch/arm/Kconfig | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig +index ee2cbe8ad502..73c55cb2f296 100644 +--- a/arch/arm/Kconfig ++++ b/arch/arm/Kconfig +@@ -67,7 +67,7 @@ config ARM + select HARDIRQS_SW_RESEND + select HAVE_ARCH_AUDITSYSCALL if AEABI && !OABI_COMPAT + select HAVE_ARCH_BITREVERSE if (CPU_32v7M || CPU_32v7) && !CPU_32v6 +- select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU ++ select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU && !PREEMPT_RT + select HAVE_ARCH_KGDB if !CPU_ENDIAN_BE32 && MMU + select HAVE_ARCH_MMAP_RND_BITS if MMU + select HAVE_ARCH_SECCOMP +-- +2.43.0 + diff --git a/debian/patches-rt/0265-leds-trigger-disable-CPU-trigger-on-RT.patch b/debian/patches-rt/0265-leds-trigger-disable-CPU-trigger-on-RT.patch new file mode 100644 index 000000000..87bc1d736 --- /dev/null +++ b/debian/patches-rt/0265-leds-trigger-disable-CPU-trigger-on-RT.patch @@ -0,0 +1,40 @@ +From 524ea188033506aaa1b70534af78b30fe718ae98 Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Thu, 23 Jan 2014 14:45:59 +0100 +Subject: [PATCH 265/323] leds: trigger: disable CPU trigger on -RT +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +as it triggers: +|CPU: 0 PID: 0 Comm: swapper Not tainted 3.12.8-rt10 #141 +|[<c0014aa4>] (unwind_backtrace+0x0/0xf8) from [<c0012788>] (show_stack+0x1c/0x20) +|[<c0012788>] (show_stack+0x1c/0x20) from [<c043c8dc>] (dump_stack+0x20/0x2c) +|[<c043c8dc>] (dump_stack+0x20/0x2c) from [<c004c5e8>] (__might_sleep+0x13c/0x170) +|[<c004c5e8>] (__might_sleep+0x13c/0x170) from [<c043f270>] (__rt_spin_lock+0x28/0x38) +|[<c043f270>] (__rt_spin_lock+0x28/0x38) from [<c043fa00>] (rt_read_lock+0x68/0x7c) +|[<c043fa00>] (rt_read_lock+0x68/0x7c) from [<c036cf74>] (led_trigger_event+0x2c/0x5c) +|[<c036cf74>] (led_trigger_event+0x2c/0x5c) from [<c036e0bc>] (ledtrig_cpu+0x54/0x5c) +|[<c036e0bc>] (ledtrig_cpu+0x54/0x5c) from [<c000ffd8>] (arch_cpu_idle_exit+0x18/0x1c) +|[<c000ffd8>] (arch_cpu_idle_exit+0x18/0x1c) from [<c00590b8>] (cpu_startup_entry+0xa8/0x234) +|[<c00590b8>] (cpu_startup_entry+0xa8/0x234) from [<c043b2cc>] (rest_init+0xb8/0xe0) +|[<c043b2cc>] (rest_init+0xb8/0xe0) from [<c061ebe0>] (start_kernel+0x2c4/0x380) + +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + drivers/leds/trigger/Kconfig | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/drivers/leds/trigger/Kconfig b/drivers/leds/trigger/Kconfig +index ce9429ca6dde..29ccbd6acf43 100644 +--- a/drivers/leds/trigger/Kconfig ++++ b/drivers/leds/trigger/Kconfig +@@ -64,6 +64,7 @@ config LEDS_TRIGGER_BACKLIGHT + + config LEDS_TRIGGER_CPU + bool "LED CPU Trigger" ++ depends on !PREEMPT_RT + help + This allows LEDs to be controlled by active CPUs. This shows + the active CPUs across an array of LEDs so you can see which +-- +2.43.0 + diff --git a/debian/patches-rt/0266-tty-serial-omap-Make-the-locking-RT-aware.patch b/debian/patches-rt/0266-tty-serial-omap-Make-the-locking-RT-aware.patch new file mode 100644 index 000000000..e926c5ed8 --- /dev/null +++ b/debian/patches-rt/0266-tty-serial-omap-Make-the-locking-RT-aware.patch @@ -0,0 +1,49 @@ +From c436953e83166f6588469c76032a7038a50cc585 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Thu, 28 Jul 2011 13:32:57 +0200 +Subject: [PATCH 266/323] tty/serial/omap: Make the locking RT aware +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +The lock is a sleeping lock and local_irq_save() is not the +optimsation we are looking for. Redo it to make it work on -RT and +non-RT. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +--- + drivers/tty/serial/omap-serial.c | 12 ++++-------- + 1 file changed, 4 insertions(+), 8 deletions(-) + +diff --git a/drivers/tty/serial/omap-serial.c b/drivers/tty/serial/omap-serial.c +index 84e8158088cd..342005ed5ebf 100644 +--- a/drivers/tty/serial/omap-serial.c ++++ b/drivers/tty/serial/omap-serial.c +@@ -1311,13 +1311,10 @@ serial_omap_console_write(struct console *co, const char *s, + + pm_runtime_get_sync(up->dev); + +- local_irq_save(flags); +- if (up->port.sysrq) +- locked = 0; +- else if (oops_in_progress) +- locked = spin_trylock(&up->port.lock); ++ if (up->port.sysrq || oops_in_progress) ++ locked = spin_trylock_irqsave(&up->port.lock, flags); + else +- spin_lock(&up->port.lock); ++ spin_lock_irqsave(&up->port.lock, flags); + + /* + * First save the IER then disable the interrupts +@@ -1346,8 +1343,7 @@ serial_omap_console_write(struct console *co, const char *s, + pm_runtime_mark_last_busy(up->dev); + pm_runtime_put_autosuspend(up->dev); + if (locked) +- spin_unlock(&up->port.lock); +- local_irq_restore(flags); ++ spin_unlock_irqrestore(&up->port.lock, flags); + } + + static int __init +-- +2.43.0 + diff --git a/debian/patches-rt/0267-tty-serial-pl011-Make-the-locking-work-on-RT.patch b/debian/patches-rt/0267-tty-serial-pl011-Make-the-locking-work-on-RT.patch new file mode 100644 index 000000000..73b68ab8d --- /dev/null +++ b/debian/patches-rt/0267-tty-serial-pl011-Make-the-locking-work-on-RT.patch @@ -0,0 +1,60 @@ +From 822739997ff16caa50f619a00344655f21d65a4b Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Tue, 8 Jan 2013 21:36:51 +0100 +Subject: [PATCH 267/323] tty/serial/pl011: Make the locking work on RT +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +The lock is a sleeping lock and local_irq_save() is not the optimsation +we are looking for. Redo it to make it work on -RT and non-RT. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +--- + drivers/tty/serial/amba-pl011.c | 17 +++++++++++------ + 1 file changed, 11 insertions(+), 6 deletions(-) + +diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c +index d4a93a94b4ca..275c625e3eb9 100644 +--- a/drivers/tty/serial/amba-pl011.c ++++ b/drivers/tty/serial/amba-pl011.c +@@ -2207,18 +2207,24 @@ pl011_console_write(struct console *co, const char *s, unsigned int count) + { + struct uart_amba_port *uap = amba_ports[co->index]; + unsigned int old_cr = 0, new_cr; +- unsigned long flags; ++ unsigned long flags = 0; + int locked = 1; + + clk_enable(uap->clk); + +- local_irq_save(flags); ++ /* ++ * local_irq_save(flags); ++ * ++ * This local_irq_save() is nonsense. If we come in via sysrq ++ * handling then interrupts are already disabled. Aside of ++ * that the port.sysrq check is racy on SMP regardless. ++ */ + if (uap->port.sysrq) + locked = 0; + else if (oops_in_progress) +- locked = spin_trylock(&uap->port.lock); ++ locked = spin_trylock_irqsave(&uap->port.lock, flags); + else +- spin_lock(&uap->port.lock); ++ spin_lock_irqsave(&uap->port.lock, flags); + + /* + * First save the CR then disable the interrupts +@@ -2244,8 +2250,7 @@ pl011_console_write(struct console *co, const char *s, unsigned int count) + pl011_write(old_cr, uap, REG_CR); + + if (locked) +- spin_unlock(&uap->port.lock); +- local_irq_restore(flags); ++ spin_unlock_irqrestore(&uap->port.lock, flags); + + clk_disable(uap->clk); + } +-- +2.43.0 + diff --git a/debian/patches-rt/0268-ARM-enable-irq-in-translation-section-permission-fau.patch b/debian/patches-rt/0268-ARM-enable-irq-in-translation-section-permission-fau.patch new file mode 100644 index 000000000..35e0764d9 --- /dev/null +++ b/debian/patches-rt/0268-ARM-enable-irq-in-translation-section-permission-fau.patch @@ -0,0 +1,95 @@ +From 0a5ad9dd4773a61fe0f0d6928ef603081390df21 Mon Sep 17 00:00:00 2001 +From: "Yadi.hu" <yadi.hu@windriver.com> +Date: Wed, 10 Dec 2014 10:32:09 +0800 +Subject: [PATCH 268/323] ARM: enable irq in translation/section permission + fault handlers +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Probably happens on all ARM, with +CONFIG_PREEMPT_RT +CONFIG_DEBUG_ATOMIC_SLEEP + +This simple program.... + +int main() { + *((char*)0xc0001000) = 0; +}; + +[ 512.742724] BUG: sleeping function called from invalid context at kernel/rtmutex.c:658 +[ 512.743000] in_atomic(): 0, irqs_disabled(): 128, pid: 994, name: a +[ 512.743217] INFO: lockdep is turned off. +[ 512.743360] irq event stamp: 0 +[ 512.743482] hardirqs last enabled at (0): [< (null)>] (null) +[ 512.743714] hardirqs last disabled at (0): [<c0426370>] copy_process+0x3b0/0x11c0 +[ 512.744013] softirqs last enabled at (0): [<c0426370>] copy_process+0x3b0/0x11c0 +[ 512.744303] softirqs last disabled at (0): [< (null)>] (null) +[ 512.744631] [<c041872c>] (unwind_backtrace+0x0/0x104) +[ 512.745001] [<c09af0c4>] (dump_stack+0x20/0x24) +[ 512.745355] [<c0462490>] (__might_sleep+0x1dc/0x1e0) +[ 512.745717] [<c09b6770>] (rt_spin_lock+0x34/0x6c) +[ 512.746073] [<c0441bf0>] (do_force_sig_info+0x34/0xf0) +[ 512.746457] [<c0442668>] (force_sig_info+0x18/0x1c) +[ 512.746829] [<c041d880>] (__do_user_fault+0x9c/0xd8) +[ 512.747185] [<c041d938>] (do_bad_area+0x7c/0x94) +[ 512.747536] [<c041d990>] (do_sect_fault+0x40/0x48) +[ 512.747898] [<c040841c>] (do_DataAbort+0x40/0xa0) +[ 512.748181] Exception stack(0xecaa1fb0 to 0xecaa1ff8) + +Oxc0000000 belongs to kernel address space, user task can not be +allowed to access it. For above condition, correct result is that +test case should receive a “segment fault” and exits but not stacks. + +the root cause is commit 02fe2845d6a8 ("avoid enabling interrupts in +prefetch/data abort handlers"),it deletes irq enable block in Data +abort assemble code and move them into page/breakpiont/alignment fault +handlers instead. But author does not enable irq in translation/section +permission fault handlers. ARM disables irq when it enters exception/ +interrupt mode, if kernel doesn't enable irq, it would be still disabled +during translation/section permission fault. + +We see the above splat because do_force_sig_info is still called with +IRQs off, and that code eventually does a: + + spin_lock_irqsave(&t->sighand->siglock, flags); + +As this is architecture independent code, and we've not seen any other +need for other arch to have the siglock converted to raw lock, we can +conclude that we should enable irq for ARM translation/section +permission exception. + +Signed-off-by: Yadi.hu <yadi.hu@windriver.com> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + arch/arm/mm/fault.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c +index af5177801fb1..1de016008e2e 100644 +--- a/arch/arm/mm/fault.c ++++ b/arch/arm/mm/fault.c +@@ -400,6 +400,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr, + if (addr < TASK_SIZE) + return do_page_fault(addr, fsr, regs); + ++ if (interrupts_enabled(regs)) ++ local_irq_enable(); ++ + if (user_mode(regs)) + goto bad_area; + +@@ -470,6 +473,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr, + static int + do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) + { ++ if (interrupts_enabled(regs)) ++ local_irq_enable(); ++ + do_bad_area(addr, fsr, regs); + return 0; + } +-- +2.43.0 + diff --git a/debian/patches-rt/0269-genirq-update-irq_set_irqchip_state-documentation.patch b/debian/patches-rt/0269-genirq-update-irq_set_irqchip_state-documentation.patch new file mode 100644 index 000000000..aae472078 --- /dev/null +++ b/debian/patches-rt/0269-genirq-update-irq_set_irqchip_state-documentation.patch @@ -0,0 +1,32 @@ +From 0eeda295a1e21ff716f482a4115822e9789e305f Mon Sep 17 00:00:00 2001 +From: Josh Cartwright <joshc@ni.com> +Date: Thu, 11 Feb 2016 11:54:00 -0600 +Subject: [PATCH 269/323] genirq: update irq_set_irqchip_state documentation +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +On -rt kernels, the use of migrate_disable()/migrate_enable() is +sufficient to guarantee a task isn't moved to another CPU. Update the +irq_set_irqchip_state() documentation to reflect this. + +Signed-off-by: Josh Cartwright <joshc@ni.com> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + kernel/irq/manage.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c +index 5ec167fb77b9..228699d9c85c 100644 +--- a/kernel/irq/manage.c ++++ b/kernel/irq/manage.c +@@ -2756,7 +2756,7 @@ EXPORT_SYMBOL_GPL(irq_get_irqchip_state); + * This call sets the internal irqchip state of an interrupt, + * depending on the value of @which. + * +- * This function should be called with preemption disabled if the ++ * This function should be called with migration disabled if the + * interrupt controller has per-cpu registers. + */ + int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which, +-- +2.43.0 + diff --git a/debian/patches-rt/0270-KVM-arm-arm64-downgrade-preempt_disable-d-region-to-.patch b/debian/patches-rt/0270-KVM-arm-arm64-downgrade-preempt_disable-d-region-to-.patch new file mode 100644 index 000000000..e9a9de0ad --- /dev/null +++ b/debian/patches-rt/0270-KVM-arm-arm64-downgrade-preempt_disable-d-region-to-.patch @@ -0,0 +1,59 @@ +From bbb0b6e29c89355e0fb77bef3b06badef0ccc946 Mon Sep 17 00:00:00 2001 +From: Josh Cartwright <joshc@ni.com> +Date: Thu, 11 Feb 2016 11:54:01 -0600 +Subject: [PATCH 270/323] KVM: arm/arm64: downgrade preempt_disable()d region + to migrate_disable() +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +kvm_arch_vcpu_ioctl_run() disables the use of preemption when updating +the vgic and timer states to prevent the calling task from migrating to +another CPU. It does so to prevent the task from writing to the +incorrect per-CPU GIC distributor registers. + +On -rt kernels, it's possible to maintain the same guarantee with the +use of migrate_{disable,enable}(), with the added benefit that the +migrate-disabled region is preemptible. Update +kvm_arch_vcpu_ioctl_run() to do so. + +Cc: Christoffer Dall <christoffer.dall@linaro.org> +Reported-by: Manish Jaggi <Manish.Jaggi@caviumnetworks.com> +Signed-off-by: Josh Cartwright <joshc@ni.com> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + arch/arm64/kvm/arm.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c +index 4d63fcd7574b..06fced951e75 100644 +--- a/arch/arm64/kvm/arm.c ++++ b/arch/arm64/kvm/arm.c +@@ -708,7 +708,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) + * involves poking the GIC, which must be done in a + * non-preemptible context. + */ +- preempt_disable(); ++ migrate_disable(); + + kvm_pmu_flush_hwstate(vcpu); + +@@ -757,7 +757,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) + kvm_timer_sync_user(vcpu); + kvm_vgic_sync_hwstate(vcpu); + local_irq_enable(); +- preempt_enable(); ++ migrate_enable(); + continue; + } + +@@ -829,7 +829,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) + /* Exit types that need handling before we can be preempted */ + handle_exit_early(vcpu, ret); + +- preempt_enable(); ++ migrate_enable(); + + /* + * The ARMv8 architecture doesn't give the hypervisor +-- +2.43.0 + diff --git a/debian/patches-rt/0271-arm64-fpsimd-Delay-freeing-memory-in-fpsimd_flush_th.patch b/debian/patches-rt/0271-arm64-fpsimd-Delay-freeing-memory-in-fpsimd_flush_th.patch new file mode 100644 index 000000000..733617e78 --- /dev/null +++ b/debian/patches-rt/0271-arm64-fpsimd-Delay-freeing-memory-in-fpsimd_flush_th.patch @@ -0,0 +1,66 @@ +From d0fe894184b649a8d383fa8d2db7ae3233b04428 Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Wed, 25 Jul 2018 14:02:38 +0200 +Subject: [PATCH 271/323] arm64: fpsimd: Delay freeing memory in + fpsimd_flush_thread() +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +fpsimd_flush_thread() invokes kfree() via sve_free() within a preempt disabled +section which is not working on -RT. + +Delay freeing of memory until preemption is enabled again. + +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + arch/arm64/kernel/fpsimd.c | 14 +++++++++++++- + 1 file changed, 13 insertions(+), 1 deletion(-) + +diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c +index a9bbfb800ec2..aa631771e0dc 100644 +--- a/arch/arm64/kernel/fpsimd.c ++++ b/arch/arm64/kernel/fpsimd.c +@@ -226,6 +226,16 @@ static void sve_free(struct task_struct *task) + __sve_free(task); + } + ++static void *sve_free_atomic(struct task_struct *task) ++{ ++ void *sve_state = task->thread.sve_state; ++ ++ WARN_ON(test_tsk_thread_flag(task, TIF_SVE)); ++ ++ task->thread.sve_state = NULL; ++ return sve_state; ++} ++ + /* + * TIF_SVE controls whether a task can use SVE without trapping while + * in userspace, and also the way a task's FPSIMD/SVE state is stored +@@ -1022,6 +1032,7 @@ void fpsimd_thread_switch(struct task_struct *next) + void fpsimd_flush_thread(void) + { + int vl, supported_vl; ++ void *mem = NULL; + + if (!system_supports_fpsimd()) + return; +@@ -1034,7 +1045,7 @@ void fpsimd_flush_thread(void) + + if (system_supports_sve()) { + clear_thread_flag(TIF_SVE); +- sve_free(current); ++ mem = sve_free_atomic(current); + + /* + * Reset the task vector length as required. +@@ -1068,6 +1079,7 @@ void fpsimd_flush_thread(void) + } + + put_cpu_fpsimd_context(); ++ kfree(mem); + } + + /* +-- +2.43.0 + diff --git a/debian/patches-rt/0272-x86-Enable-RT-also-on-32bit.patch b/debian/patches-rt/0272-x86-Enable-RT-also-on-32bit.patch new file mode 100644 index 000000000..e7c39e069 --- /dev/null +++ b/debian/patches-rt/0272-x86-Enable-RT-also-on-32bit.patch @@ -0,0 +1,34 @@ +From b4def70f635385c0a02c2261e1915671363eeccd Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Thu, 7 Nov 2019 17:49:20 +0100 +Subject: [PATCH 272/323] x86: Enable RT also on 32bit +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + arch/x86/Kconfig | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index b9f68b01a8c8..ca79a831f611 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -27,7 +27,6 @@ config X86_64 + # Options that are inherently 64-bit kernel only: + select ARCH_HAS_GIGANTIC_PAGE + select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 +- select ARCH_SUPPORTS_RT + select ARCH_USE_CMPXCHG_LOCKREF + select HAVE_ARCH_SOFT_DIRTY + select MODULES_USE_ELF_RELA +@@ -96,6 +95,7 @@ config X86 + select ARCH_SUPPORTS_ACPI + select ARCH_SUPPORTS_ATOMIC_RMW + select ARCH_SUPPORTS_NUMA_BALANCING if X86_64 ++ select ARCH_SUPPORTS_RT + select ARCH_USE_BUILTIN_BSWAP + select ARCH_USE_QUEUED_RWLOCKS + select ARCH_USE_QUEUED_SPINLOCKS +-- +2.43.0 + diff --git a/debian/patches-rt/0273-ARM-Allow-to-enable-RT.patch b/debian/patches-rt/0273-ARM-Allow-to-enable-RT.patch new file mode 100644 index 000000000..9392f0977 --- /dev/null +++ b/debian/patches-rt/0273-ARM-Allow-to-enable-RT.patch @@ -0,0 +1,36 @@ +From 98a5d5b92ddcd0df7aa708accc20a07b6008d86a Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Fri, 11 Oct 2019 13:14:29 +0200 +Subject: [PATCH 273/323] ARM: Allow to enable RT +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Allow to select RT. + +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + arch/arm/Kconfig | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig +index 73c55cb2f296..adf9347724ed 100644 +--- a/arch/arm/Kconfig ++++ b/arch/arm/Kconfig +@@ -32,6 +32,7 @@ config ARM + select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX + select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT if CPU_V7 + select ARCH_SUPPORTS_ATOMIC_RMW ++ select ARCH_SUPPORTS_RT if HAVE_POSIX_CPU_TIMERS_TASK_WORK + select ARCH_USE_BUILTIN_BSWAP + select ARCH_USE_CMPXCHG_LOCKREF + select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU +@@ -123,6 +124,7 @@ config ARM + select OLD_SIGSUSPEND3 + select PCI_SYSCALL if PCI + select PERF_USE_VMALLOC ++ select HAVE_POSIX_CPU_TIMERS_TASK_WORK if !KVM + select RTC_LIB + select SET_FS + select SYS_SUPPORTS_APM_EMULATION +-- +2.43.0 + diff --git a/debian/patches-rt/0274-ARM64-Allow-to-enable-RT.patch b/debian/patches-rt/0274-ARM64-Allow-to-enable-RT.patch new file mode 100644 index 000000000..5176f9521 --- /dev/null +++ b/debian/patches-rt/0274-ARM64-Allow-to-enable-RT.patch @@ -0,0 +1,36 @@ +From 27aef9eb166603124a97edf3a14035b406f36096 Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Fri, 11 Oct 2019 13:14:35 +0200 +Subject: [PATCH 274/323] ARM64: Allow to enable RT +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Allow to select RT. + +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + arch/arm64/Kconfig | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig +index 7681d7b65c7d..ab8b86b23190 100644 +--- a/arch/arm64/Kconfig ++++ b/arch/arm64/Kconfig +@@ -76,6 +76,7 @@ config ARM64 + select ARCH_SUPPORTS_ATOMIC_RMW + select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 && (GCC_VERSION >= 50000 || CC_IS_CLANG) + select ARCH_SUPPORTS_NUMA_BALANCING ++ select ARCH_SUPPORTS_RT if HAVE_POSIX_CPU_TIMERS_TASK_WORK + select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT + select ARCH_WANT_DEFAULT_BPF_JIT + select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT +@@ -195,6 +196,7 @@ config ARM64 + select PCI_DOMAINS_GENERIC if PCI + select PCI_ECAM if (ACPI && PCI) + select PCI_SYSCALL if PCI ++ select HAVE_POSIX_CPU_TIMERS_TASK_WORK if !KVM + select POWER_RESET + select POWER_SUPPLY + select SET_FS +-- +2.43.0 + diff --git a/debian/patches-rt/0275-powerpc-traps-Use-PREEMPT_RT.patch b/debian/patches-rt/0275-powerpc-traps-Use-PREEMPT_RT.patch new file mode 100644 index 000000000..75ece9602 --- /dev/null +++ b/debian/patches-rt/0275-powerpc-traps-Use-PREEMPT_RT.patch @@ -0,0 +1,39 @@ +From 25b6e91e812b8fe81cda3732be60b3d8402fee43 Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Fri, 26 Jul 2019 11:30:49 +0200 +Subject: [PATCH 275/323] powerpc: traps: Use PREEMPT_RT +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Add PREEMPT_RT to the backtrace if enabled. + +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + arch/powerpc/kernel/traps.c | 7 ++++++- + 1 file changed, 6 insertions(+), 1 deletion(-) + +diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c +index d39a4a6b4ddf..75a6c83299ad 100644 +--- a/arch/powerpc/kernel/traps.c ++++ b/arch/powerpc/kernel/traps.c +@@ -259,12 +259,17 @@ static char *get_mmu_str(void) + + static int __die(const char *str, struct pt_regs *regs, long err) + { ++ const char *pr = ""; ++ + printk("Oops: %s, sig: %ld [#%d]\n", str, err, ++die_counter); + ++ if (IS_ENABLED(CONFIG_PREEMPTION)) ++ pr = IS_ENABLED(CONFIG_PREEMPT_RT) ? " PREEMPT_RT" : " PREEMPT"; ++ + printk("%s PAGE_SIZE=%luK%s%s%s%s%s%s %s\n", + IS_ENABLED(CONFIG_CPU_LITTLE_ENDIAN) ? "LE" : "BE", + PAGE_SIZE / 1024, get_mmu_str(), +- IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "", ++ pr, + IS_ENABLED(CONFIG_SMP) ? " SMP" : "", + IS_ENABLED(CONFIG_SMP) ? (" NR_CPUS=" __stringify(NR_CPUS)) : "", + debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "", +-- +2.43.0 + diff --git a/debian/patches-rt/0276-powerpc-pseries-iommu-Use-a-locallock-instead-local_.patch b/debian/patches-rt/0276-powerpc-pseries-iommu-Use-a-locallock-instead-local_.patch new file mode 100644 index 000000000..fbd35f4c7 --- /dev/null +++ b/debian/patches-rt/0276-powerpc-pseries-iommu-Use-a-locallock-instead-local_.patch @@ -0,0 +1,117 @@ +From afebcd1022d509a6887f066b6601caf561ee1ee5 Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Tue, 26 Mar 2019 18:31:54 +0100 +Subject: [PATCH 276/323] powerpc/pseries/iommu: Use a locallock instead + local_irq_save() +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +The locallock protects the per-CPU variable tce_page. The function +attempts to allocate memory while tce_page is protected (by disabling +interrupts). + +Use local_irq_save() instead of local_irq_disable(). + +Cc: stable-rt@vger.kernel.org +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + arch/powerpc/platforms/pseries/iommu.c | 31 +++++++++++++++++--------- + 1 file changed, 20 insertions(+), 11 deletions(-) + +diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c +index 245f1f8df656..f05555dde8e7 100644 +--- a/arch/powerpc/platforms/pseries/iommu.c ++++ b/arch/powerpc/platforms/pseries/iommu.c +@@ -24,6 +24,7 @@ + #include <linux/of.h> + #include <linux/iommu.h> + #include <linux/rculist.h> ++#include <linux/local_lock.h> + #include <asm/io.h> + #include <asm/prom.h> + #include <asm/rtas.h> +@@ -190,7 +191,13 @@ static int tce_build_pSeriesLP(unsigned long liobn, long tcenum, long tceshift, + return ret; + } + +-static DEFINE_PER_CPU(__be64 *, tce_page); ++struct tce_page { ++ __be64 * page; ++ local_lock_t lock; ++}; ++static DEFINE_PER_CPU(struct tce_page, tce_page) = { ++ .lock = INIT_LOCAL_LOCK(lock), ++}; + + static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, + long npages, unsigned long uaddr, +@@ -212,9 +219,10 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, + direction, attrs); + } + +- local_irq_save(flags); /* to protect tcep and the page behind it */ ++ /* to protect tcep and the page behind it */ ++ local_lock_irqsave(&tce_page.lock, flags); + +- tcep = __this_cpu_read(tce_page); ++ tcep = __this_cpu_read(tce_page.page); + + /* This is safe to do since interrupts are off when we're called + * from iommu_alloc{,_sg}() +@@ -223,12 +231,12 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, + tcep = (__be64 *)__get_free_page(GFP_ATOMIC); + /* If allocation fails, fall back to the loop implementation */ + if (!tcep) { +- local_irq_restore(flags); ++ local_unlock_irqrestore(&tce_page.lock, flags); + return tce_build_pSeriesLP(tbl->it_index, tcenum, + tbl->it_page_shift, + npages, uaddr, direction, attrs); + } +- __this_cpu_write(tce_page, tcep); ++ __this_cpu_write(tce_page.page, tcep); + } + + rpn = __pa(uaddr) >> TCE_SHIFT; +@@ -258,7 +266,7 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, + tcenum += limit; + } while (npages > 0 && !rc); + +- local_irq_restore(flags); ++ local_unlock_irqrestore(&tce_page.lock, flags); + + if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) { + ret = (int)rc; +@@ -429,16 +437,17 @@ static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn, + DMA_BIDIRECTIONAL, 0); + } + +- local_irq_disable(); /* to protect tcep and the page behind it */ +- tcep = __this_cpu_read(tce_page); ++ /* to protect tcep and the page behind it */ ++ local_lock_irq(&tce_page.lock); ++ tcep = __this_cpu_read(tce_page.page); + + if (!tcep) { + tcep = (__be64 *)__get_free_page(GFP_ATOMIC); + if (!tcep) { +- local_irq_enable(); ++ local_unlock_irq(&tce_page.lock); + return -ENOMEM; + } +- __this_cpu_write(tce_page, tcep); ++ __this_cpu_write(tce_page.page, tcep); + } + + proto_tce = TCE_PCI_READ | TCE_PCI_WRITE; +@@ -481,7 +490,7 @@ static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn, + + /* error cleanup: caller will clear whole range */ + +- local_irq_enable(); ++ local_unlock_irq(&tce_page.lock); + return rc; + } + +-- +2.43.0 + diff --git a/debian/patches-rt/0277-powerpc-kvm-Disable-in-kernel-MPIC-emulation-for-PRE.patch b/debian/patches-rt/0277-powerpc-kvm-Disable-in-kernel-MPIC-emulation-for-PRE.patch new file mode 100644 index 000000000..1d465fba1 --- /dev/null +++ b/debian/patches-rt/0277-powerpc-kvm-Disable-in-kernel-MPIC-emulation-for-PRE.patch @@ -0,0 +1,45 @@ +From f6ff6354df6d4609ce786969f2fd16afeaf88fba Mon Sep 17 00:00:00 2001 +From: Bogdan Purcareata <bogdan.purcareata@freescale.com> +Date: Fri, 24 Apr 2015 15:53:13 +0000 +Subject: [PATCH 277/323] powerpc/kvm: Disable in-kernel MPIC emulation for + PREEMPT_RT +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +While converting the openpic emulation code to use a raw_spinlock_t enables +guests to run on RT, there's still a performance issue. For interrupts sent in +directed delivery mode with a multiple CPU mask, the emulated openpic will loop +through all of the VCPUs, and for each VCPUs, it call IRQ_check, which will loop +through all the pending interrupts for that VCPU. This is done while holding the +raw_lock, meaning that in all this time the interrupts and preemption are +disabled on the host Linux. A malicious user app can max both these number and +cause a DoS. + +This temporary fix is sent for two reasons. First is so that users who want to +use the in-kernel MPIC emulation are aware of the potential latencies, thus +making sure that the hardware MPIC and their usage scenario does not involve +interrupts sent in directed delivery mode, and the number of possible pending +interrupts is kept small. Secondly, this should incentivize the development of a +proper openpic emulation that would be better suited for RT. + +Acked-by: Scott Wood <scottwood@freescale.com> +Signed-off-by: Bogdan Purcareata <bogdan.purcareata@freescale.com> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + arch/powerpc/kvm/Kconfig | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig +index 549591d9aaa2..efb5bfe93f70 100644 +--- a/arch/powerpc/kvm/Kconfig ++++ b/arch/powerpc/kvm/Kconfig +@@ -178,6 +178,7 @@ config KVM_E500MC + config KVM_MPIC + bool "KVM in-kernel MPIC emulation" + depends on KVM && E500 ++ depends on !PREEMPT_RT + select HAVE_KVM_IRQCHIP + select HAVE_KVM_IRQFD + select HAVE_KVM_IRQ_ROUTING +-- +2.43.0 + diff --git a/debian/patches-rt/0278-powerpc-stackprotector-work-around-stack-guard-init-.patch b/debian/patches-rt/0278-powerpc-stackprotector-work-around-stack-guard-init-.patch new file mode 100644 index 000000000..a46f2fadf --- /dev/null +++ b/debian/patches-rt/0278-powerpc-stackprotector-work-around-stack-guard-init-.patch @@ -0,0 +1,36 @@ +From fdb2b0b4898a26e8910152ed1ab6fa6748d1b839 Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Tue, 26 Mar 2019 18:31:29 +0100 +Subject: [PATCH 278/323] powerpc/stackprotector: work around stack-guard init + from atomic +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +This is invoked from the secondary CPU in atomic context. On x86 we use +tsc instead. On Power we XOR it against mftb() so lets use stack address +as the initial value. + +Cc: stable-rt@vger.kernel.org +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + arch/powerpc/include/asm/stackprotector.h | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/arch/powerpc/include/asm/stackprotector.h b/arch/powerpc/include/asm/stackprotector.h +index 1c8460e23583..b1653c160bab 100644 +--- a/arch/powerpc/include/asm/stackprotector.h ++++ b/arch/powerpc/include/asm/stackprotector.h +@@ -24,7 +24,11 @@ static __always_inline void boot_init_stack_canary(void) + unsigned long canary; + + /* Try to get a semi random initial value. */ ++#ifdef CONFIG_PREEMPT_RT ++ canary = (unsigned long)&canary; ++#else + canary = get_random_canary(); ++#endif + canary ^= mftb(); + canary ^= LINUX_VERSION_CODE; + canary &= CANARY_MASK; +-- +2.43.0 + diff --git a/debian/patches-rt/0279-powerpc-Avoid-recursive-header-includes.patch b/debian/patches-rt/0279-powerpc-Avoid-recursive-header-includes.patch new file mode 100644 index 000000000..64bf1e545 --- /dev/null +++ b/debian/patches-rt/0279-powerpc-Avoid-recursive-header-includes.patch @@ -0,0 +1,48 @@ +From 725e44480dc7376b55d990292c119883ddc51288 Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Fri, 8 Jan 2021 19:48:21 +0100 +Subject: [PATCH 279/323] powerpc: Avoid recursive header includes +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +- The include of bug.h leads to an include of printk.h which gets back + to spinlock.h and complains then about missing xchg(). + Remove bug.h and add bits.h which is needed for BITS_PER_BYTE. + +- Avoid the "please don't include this file directly" error from + rwlock-rt. Allow an include from/with rtmutex.h. + +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + arch/powerpc/include/asm/cmpxchg.h | 2 +- + arch/powerpc/include/asm/simple_spinlock_types.h | 2 +- + 2 files changed, 2 insertions(+), 2 deletions(-) + +diff --git a/arch/powerpc/include/asm/cmpxchg.h b/arch/powerpc/include/asm/cmpxchg.h +index cf091c4c22e5..7371f7e23c35 100644 +--- a/arch/powerpc/include/asm/cmpxchg.h ++++ b/arch/powerpc/include/asm/cmpxchg.h +@@ -5,7 +5,7 @@ + #ifdef __KERNEL__ + #include <linux/compiler.h> + #include <asm/synch.h> +-#include <linux/bug.h> ++#include <linux/bits.h> + + #ifdef __BIG_ENDIAN + #define BITOFF_CAL(size, off) ((sizeof(u32) - size - off) * BITS_PER_BYTE) +diff --git a/arch/powerpc/include/asm/simple_spinlock_types.h b/arch/powerpc/include/asm/simple_spinlock_types.h +index 0f3cdd8faa95..d45561e9e6ba 100644 +--- a/arch/powerpc/include/asm/simple_spinlock_types.h ++++ b/arch/powerpc/include/asm/simple_spinlock_types.h +@@ -2,7 +2,7 @@ + #ifndef _ASM_POWERPC_SIMPLE_SPINLOCK_TYPES_H + #define _ASM_POWERPC_SIMPLE_SPINLOCK_TYPES_H + +-#ifndef __LINUX_SPINLOCK_TYPES_H ++#if !defined(__LINUX_SPINLOCK_TYPES_H) && !defined(__LINUX_RT_MUTEX_H) + # error "please don't include this file directly" + #endif + +-- +2.43.0 + diff --git a/debian/patches-rt/0280-POWERPC-Allow-to-enable-RT.patch b/debian/patches-rt/0280-POWERPC-Allow-to-enable-RT.patch new file mode 100644 index 000000000..955781532 --- /dev/null +++ b/debian/patches-rt/0280-POWERPC-Allow-to-enable-RT.patch @@ -0,0 +1,36 @@ +From 934b061dd0089023163d44f7670bc3b0a4ebb7b1 Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Fri, 11 Oct 2019 13:14:41 +0200 +Subject: [PATCH 280/323] POWERPC: Allow to enable RT +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Allow to select RT. + +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + arch/powerpc/Kconfig | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig +index 6d8ce54ad6dd..065ed526be1a 100644 +--- a/arch/powerpc/Kconfig ++++ b/arch/powerpc/Kconfig +@@ -146,6 +146,7 @@ config PPC + select ARCH_MIGHT_HAVE_PC_SERIO + select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX + select ARCH_SUPPORTS_ATOMIC_RMW ++ select ARCH_SUPPORTS_RT if HAVE_POSIX_CPU_TIMERS_TASK_WORK + select ARCH_USE_BUILTIN_BSWAP + select ARCH_USE_CMPXCHG_LOCKREF if PPC64 + select ARCH_USE_QUEUED_RWLOCKS if PPC_QUEUED_SPINLOCKS +@@ -238,6 +239,7 @@ config PPC + select HAVE_SYSCALL_TRACEPOINTS + select HAVE_VIRT_CPU_ACCOUNTING + select HAVE_IRQ_TIME_ACCOUNTING ++ select HAVE_POSIX_CPU_TIMERS_TASK_WORK if !KVM + select HAVE_RSEQ + select IOMMU_HELPER if PPC64 + select IRQ_DOMAIN +-- +2.43.0 + diff --git a/debian/patches-rt/0281-drivers-block-zram-Replace-bit-spinlocks-with-rtmute.patch b/debian/patches-rt/0281-drivers-block-zram-Replace-bit-spinlocks-with-rtmute.patch new file mode 100644 index 000000000..4c370a764 --- /dev/null +++ b/debian/patches-rt/0281-drivers-block-zram-Replace-bit-spinlocks-with-rtmute.patch @@ -0,0 +1,93 @@ +From b7d25d7dc9f6a1923426c6e235061e825434e4fd Mon Sep 17 00:00:00 2001 +From: Mike Galbraith <umgwanakikbuti@gmail.com> +Date: Thu, 31 Mar 2016 04:08:28 +0200 +Subject: [PATCH 281/323] drivers/block/zram: Replace bit spinlocks with + rtmutex for -rt +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +They're nondeterministic, and lead to ___might_sleep() splats in -rt. +OTOH, they're a lot less wasteful than an rtmutex per page. + +Signed-off-by: Mike Galbraith <umgwanakikbuti@gmail.com> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + drivers/block/zram/zram_drv.c | 36 +++++++++++++++++++++++++++++++++++ + drivers/block/zram/zram_drv.h | 1 + + 2 files changed, 37 insertions(+) + +diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c +index 0636df6b67db..1a7523cefbe9 100644 +--- a/drivers/block/zram/zram_drv.c ++++ b/drivers/block/zram/zram_drv.c +@@ -59,6 +59,40 @@ static void zram_free_page(struct zram *zram, size_t index); + static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, + u32 index, int offset, struct bio *bio); + ++#ifdef CONFIG_PREEMPT_RT ++static void zram_meta_init_table_locks(struct zram *zram, size_t num_pages) ++{ ++ size_t index; ++ ++ for (index = 0; index < num_pages; index++) ++ spin_lock_init(&zram->table[index].lock); ++} ++ ++static int zram_slot_trylock(struct zram *zram, u32 index) ++{ ++ int ret; ++ ++ ret = spin_trylock(&zram->table[index].lock); ++ if (ret) ++ __set_bit(ZRAM_LOCK, &zram->table[index].flags); ++ return ret; ++} ++ ++static void zram_slot_lock(struct zram *zram, u32 index) ++{ ++ spin_lock(&zram->table[index].lock); ++ __set_bit(ZRAM_LOCK, &zram->table[index].flags); ++} ++ ++static void zram_slot_unlock(struct zram *zram, u32 index) ++{ ++ __clear_bit(ZRAM_LOCK, &zram->table[index].flags); ++ spin_unlock(&zram->table[index].lock); ++} ++ ++#else ++ ++static void zram_meta_init_table_locks(struct zram *zram, size_t num_pages) { } + + static int zram_slot_trylock(struct zram *zram, u32 index) + { +@@ -74,6 +108,7 @@ static void zram_slot_unlock(struct zram *zram, u32 index) + { + bit_spin_unlock(ZRAM_LOCK, &zram->table[index].flags); + } ++#endif + + static inline bool init_done(struct zram *zram) + { +@@ -1165,6 +1200,7 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize) + + if (!huge_class_size) + huge_class_size = zs_huge_class_size(zram->mem_pool); ++ zram_meta_init_table_locks(zram, num_pages); + return true; + } + +diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h +index f2fd46daa760..7e4dd447e1dd 100644 +--- a/drivers/block/zram/zram_drv.h ++++ b/drivers/block/zram/zram_drv.h +@@ -63,6 +63,7 @@ struct zram_table_entry { + unsigned long element; + }; + unsigned long flags; ++ spinlock_t lock; + #ifdef CONFIG_ZRAM_MEMORY_TRACKING + ktime_t ac_time; + #endif +-- +2.43.0 + diff --git a/debian/patches-rt/0282-tpm_tis-fix-stall-after-iowrite-s.patch b/debian/patches-rt/0282-tpm_tis-fix-stall-after-iowrite-s.patch new file mode 100644 index 000000000..8452ac49c --- /dev/null +++ b/debian/patches-rt/0282-tpm_tis-fix-stall-after-iowrite-s.patch @@ -0,0 +1,84 @@ +From ae8783c450876b505b7c6066ccda3b68e8d08c58 Mon Sep 17 00:00:00 2001 +From: Haris Okanovic <haris.okanovic@ni.com> +Date: Tue, 15 Aug 2017 15:13:08 -0500 +Subject: [PATCH 282/323] tpm_tis: fix stall after iowrite*()s +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +ioread8() operations to TPM MMIO addresses can stall the cpu when +immediately following a sequence of iowrite*()'s to the same region. + +For example, cyclitest measures ~400us latency spikes when a non-RT +usermode application communicates with an SPI-based TPM chip (Intel Atom +E3940 system, PREEMPT_RT kernel). The spikes are caused by a +stalling ioread8() operation following a sequence of 30+ iowrite8()s to +the same address. I believe this happens because the write sequence is +buffered (in cpu or somewhere along the bus), and gets flushed on the +first LOAD instruction (ioread*()) that follows. + +The enclosed change appears to fix this issue: read the TPM chip's +access register (status code) after every iowrite*() operation to +amortize the cost of flushing data to chip across multiple instructions. + +Signed-off-by: Haris Okanovic <haris.okanovic@ni.com> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + drivers/char/tpm/tpm_tis.c | 29 +++++++++++++++++++++++++++-- + 1 file changed, 27 insertions(+), 2 deletions(-) + +diff --git a/drivers/char/tpm/tpm_tis.c b/drivers/char/tpm/tpm_tis.c +index 3e1bb28b7efd..c26ed0ce6b34 100644 +--- a/drivers/char/tpm/tpm_tis.c ++++ b/drivers/char/tpm/tpm_tis.c +@@ -50,6 +50,31 @@ static inline struct tpm_tis_tcg_phy *to_tpm_tis_tcg_phy(struct tpm_tis_data *da + return container_of(data, struct tpm_tis_tcg_phy, priv); + } + ++#ifdef CONFIG_PREEMPT_RT ++/* ++ * Flushes previous write operations to chip so that a subsequent ++ * ioread*()s won't stall a cpu. ++ */ ++static inline void tpm_tis_flush(void __iomem *iobase) ++{ ++ ioread8(iobase + TPM_ACCESS(0)); ++} ++#else ++#define tpm_tis_flush(iobase) do { } while (0) ++#endif ++ ++static inline void tpm_tis_iowrite8(u8 b, void __iomem *iobase, u32 addr) ++{ ++ iowrite8(b, iobase + addr); ++ tpm_tis_flush(iobase); ++} ++ ++static inline void tpm_tis_iowrite32(u32 b, void __iomem *iobase, u32 addr) ++{ ++ iowrite32(b, iobase + addr); ++ tpm_tis_flush(iobase); ++} ++ + static int interrupts = -1; + module_param(interrupts, int, 0444); + MODULE_PARM_DESC(interrupts, "Enable interrupts"); +@@ -186,7 +211,7 @@ static int tpm_tcg_write_bytes(struct tpm_tis_data *data, u32 addr, u16 len, + struct tpm_tis_tcg_phy *phy = to_tpm_tis_tcg_phy(data); + + while (len--) +- iowrite8(*value++, phy->iobase + addr); ++ tpm_tis_iowrite8(*value++, phy->iobase, addr); + + return 0; + } +@@ -213,7 +238,7 @@ static int tpm_tcg_write32(struct tpm_tis_data *data, u32 addr, u32 value) + { + struct tpm_tis_tcg_phy *phy = to_tpm_tis_tcg_phy(data); + +- iowrite32(value, phy->iobase + addr); ++ tpm_tis_iowrite32(value, phy->iobase, addr); + + return 0; + } +-- +2.43.0 + diff --git a/debian/patches-rt/0283-signals-Allow-rt-tasks-to-cache-one-sigqueue-struct.patch b/debian/patches-rt/0283-signals-Allow-rt-tasks-to-cache-one-sigqueue-struct.patch new file mode 100644 index 000000000..a34a3538f --- /dev/null +++ b/debian/patches-rt/0283-signals-Allow-rt-tasks-to-cache-one-sigqueue-struct.patch @@ -0,0 +1,212 @@ +From 1f9d07b4cf227f0cf0800f96c10f9ef143b5d663 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Fri, 3 Jul 2009 08:44:56 -0500 +Subject: [PATCH 283/323] signals: Allow rt tasks to cache one sigqueue struct +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +To avoid allocation allow rt tasks to cache one sigqueue struct in +task struct. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +--- + include/linux/sched.h | 1 + + include/linux/signal.h | 1 + + kernel/exit.c | 2 +- + kernel/fork.c | 1 + + kernel/signal.c | 69 +++++++++++++++++++++++++++++++++++++++--- + 5 files changed, 69 insertions(+), 5 deletions(-) + +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 665a17e4f69b..a73528e8235d 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -992,6 +992,7 @@ struct task_struct { + /* Signal handlers: */ + struct signal_struct *signal; + struct sighand_struct __rcu *sighand; ++ struct sigqueue *sigqueue_cache; + sigset_t blocked; + sigset_t real_blocked; + /* Restored if set_restore_sigmask() was used: */ +diff --git a/include/linux/signal.h b/include/linux/signal.h +index b256f9c65661..ebf6c515a7b2 100644 +--- a/include/linux/signal.h ++++ b/include/linux/signal.h +@@ -265,6 +265,7 @@ static inline void init_sigpending(struct sigpending *sig) + } + + extern void flush_sigqueue(struct sigpending *queue); ++extern void flush_task_sigqueue(struct task_struct *tsk); + + /* Test if 'sig' is valid signal. Use this instead of testing _NSIG directly */ + static inline int valid_signal(unsigned long sig) +diff --git a/kernel/exit.c b/kernel/exit.c +index bacdaf980933..b86f388d3e64 100644 +--- a/kernel/exit.c ++++ b/kernel/exit.c +@@ -199,7 +199,7 @@ static void __exit_signal(struct task_struct *tsk) + * Do this under ->siglock, we can race with another thread + * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals. + */ +- flush_sigqueue(&tsk->pending); ++ flush_task_sigqueue(tsk); + tsk->sighand = NULL; + spin_unlock(&sighand->siglock); + +diff --git a/kernel/fork.c b/kernel/fork.c +index 2a11bf5f9e30..dfefb6e7e082 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -2046,6 +2046,7 @@ static __latent_entropy struct task_struct *copy_process( + spin_lock_init(&p->alloc_lock); + + init_sigpending(&p->pending); ++ p->sigqueue_cache = NULL; + + p->utime = p->stime = p->gtime = 0; + #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME +diff --git a/kernel/signal.c b/kernel/signal.c +index e8819aabe3cd..e1f263cbcf09 100644 +--- a/kernel/signal.c ++++ b/kernel/signal.c +@@ -20,6 +20,7 @@ + #include <linux/sched/task.h> + #include <linux/sched/task_stack.h> + #include <linux/sched/cputime.h> ++#include <linux/sched/rt.h> + #include <linux/file.h> + #include <linux/fs.h> + #include <linux/proc_fs.h> +@@ -404,13 +405,30 @@ void task_join_group_stop(struct task_struct *task) + task_set_jobctl_pending(task, mask | JOBCTL_STOP_PENDING); + } + ++static inline struct sigqueue *get_task_cache(struct task_struct *t) ++{ ++ struct sigqueue *q = t->sigqueue_cache; ++ ++ if (cmpxchg(&t->sigqueue_cache, q, NULL) != q) ++ return NULL; ++ return q; ++} ++ ++static inline int put_task_cache(struct task_struct *t, struct sigqueue *q) ++{ ++ if (cmpxchg(&t->sigqueue_cache, NULL, q) == NULL) ++ return 0; ++ return 1; ++} ++ + /* + * allocate a new signal queue record + * - this may be called without locks if and only if t == current, otherwise an + * appropriate lock must be held to stop the target task from exiting + */ + static struct sigqueue * +-__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit) ++__sigqueue_do_alloc(int sig, struct task_struct *t, gfp_t flags, ++ int override_rlimit, int fromslab) + { + struct sigqueue *q = NULL; + struct user_struct *user; +@@ -432,7 +450,10 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi + rcu_read_unlock(); + + if (override_rlimit || likely(sigpending <= task_rlimit(t, RLIMIT_SIGPENDING))) { +- q = kmem_cache_alloc(sigqueue_cachep, flags); ++ if (!fromslab) ++ q = get_task_cache(t); ++ if (!q) ++ q = kmem_cache_alloc(sigqueue_cachep, flags); + } else { + print_dropped_signal(sig); + } +@@ -449,6 +470,13 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi + return q; + } + ++static struct sigqueue * ++__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, ++ int override_rlimit) ++{ ++ return __sigqueue_do_alloc(sig, t, flags, override_rlimit, 0); ++} ++ + static void __sigqueue_free(struct sigqueue *q) + { + if (q->flags & SIGQUEUE_PREALLOC) +@@ -458,6 +486,21 @@ static void __sigqueue_free(struct sigqueue *q) + kmem_cache_free(sigqueue_cachep, q); + } + ++static void sigqueue_free_current(struct sigqueue *q) ++{ ++ struct user_struct *up; ++ ++ if (q->flags & SIGQUEUE_PREALLOC) ++ return; ++ ++ up = q->user; ++ if (rt_prio(current->normal_prio) && !put_task_cache(current, q)) { ++ atomic_dec(&up->sigpending); ++ free_uid(up); ++ } else ++ __sigqueue_free(q); ++} ++ + void flush_sigqueue(struct sigpending *queue) + { + struct sigqueue *q; +@@ -470,6 +513,21 @@ void flush_sigqueue(struct sigpending *queue) + } + } + ++/* ++ * Called from __exit_signal. Flush tsk->pending and ++ * tsk->sigqueue_cache ++ */ ++void flush_task_sigqueue(struct task_struct *tsk) ++{ ++ struct sigqueue *q; ++ ++ flush_sigqueue(&tsk->pending); ++ ++ q = get_task_cache(tsk); ++ if (q) ++ kmem_cache_free(sigqueue_cachep, q); ++} ++ + /* + * Flush all pending signals for this kthread. + */ +@@ -594,7 +652,7 @@ static void collect_signal(int sig, struct sigpending *list, kernel_siginfo_t *i + (info->si_code == SI_TIMER) && + (info->si_sys_private); + +- __sigqueue_free(first); ++ sigqueue_free_current(first); + } else { + /* + * Ok, it wasn't in the queue. This must be +@@ -631,6 +689,8 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, kernel_siginfo_t *in + bool resched_timer = false; + int signr; + ++ WARN_ON_ONCE(tsk != current); ++ + /* We only dequeue private signals from ourselves, we don't let + * signalfd steal them + */ +@@ -1835,7 +1895,8 @@ EXPORT_SYMBOL(kill_pid); + */ + struct sigqueue *sigqueue_alloc(void) + { +- struct sigqueue *q = __sigqueue_alloc(-1, current, GFP_KERNEL, 0); ++ /* Preallocated sigqueue objects always from the slabcache ! */ ++ struct sigqueue *q = __sigqueue_do_alloc(-1, current, GFP_KERNEL, 0, 1); + + if (q) + q->flags |= SIGQUEUE_PREALLOC; +-- +2.43.0 + diff --git a/debian/patches-rt/0284-signal-Prevent-double-free-of-user-struct.patch b/debian/patches-rt/0284-signal-Prevent-double-free-of-user-struct.patch new file mode 100644 index 000000000..8a50b4089 --- /dev/null +++ b/debian/patches-rt/0284-signal-Prevent-double-free-of-user-struct.patch @@ -0,0 +1,52 @@ +From 9a8853e64ab5f1c44db176464996da3ba86d2a8e Mon Sep 17 00:00:00 2001 +From: Matt Fleming <matt@codeblueprint.co.uk> +Date: Tue, 7 Apr 2020 10:54:13 +0100 +Subject: [PATCH 284/323] signal: Prevent double-free of user struct +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +The way user struct reference counting works changed significantly with, + + fda31c50292a ("signal: avoid double atomic counter increments for user accounting") + +Now user structs are only freed once the last pending signal is +dequeued. Make sigqueue_free_current() follow this new convention to +avoid freeing the user struct multiple times and triggering this +warning: + + refcount_t: underflow; use-after-free. + WARNING: CPU: 0 PID: 6794 at lib/refcount.c:288 refcount_dec_not_one+0x45/0x50 + Call Trace: + refcount_dec_and_lock_irqsave+0x16/0x60 + free_uid+0x31/0xa0 + __dequeue_signal+0x17c/0x190 + dequeue_signal+0x5a/0x1b0 + do_sigtimedwait+0x208/0x250 + __x64_sys_rt_sigtimedwait+0x6f/0xd0 + do_syscall_64+0x72/0x200 + entry_SYSCALL_64_after_hwframe+0x49/0xbe + +Signed-off-by: Matt Fleming <matt@codeblueprint.co.uk> +Reported-by: Daniel Wagner <wagi@monom.org> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + kernel/signal.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/signal.c b/kernel/signal.c +index e1f263cbcf09..a1eeecb4922d 100644 +--- a/kernel/signal.c ++++ b/kernel/signal.c +@@ -495,8 +495,8 @@ static void sigqueue_free_current(struct sigqueue *q) + + up = q->user; + if (rt_prio(current->normal_prio) && !put_task_cache(current, q)) { +- atomic_dec(&up->sigpending); +- free_uid(up); ++ if (atomic_dec_and_test(&up->sigpending)) ++ free_uid(up); + } else + __sigqueue_free(q); + } +-- +2.43.0 + diff --git a/debian/patches-rt/0285-genirq-Disable-irqpoll-on-rt.patch b/debian/patches-rt/0285-genirq-Disable-irqpoll-on-rt.patch new file mode 100644 index 000000000..8033dbbab --- /dev/null +++ b/debian/patches-rt/0285-genirq-Disable-irqpoll-on-rt.patch @@ -0,0 +1,43 @@ +From 4795907a92ad1bf9383b0c748facb436cac71e36 Mon Sep 17 00:00:00 2001 +From: Ingo Molnar <mingo@elte.hu> +Date: Fri, 3 Jul 2009 08:29:57 -0500 +Subject: [PATCH 285/323] genirq: Disable irqpoll on -rt +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Creates long latencies for no value + +Signed-off-by: Ingo Molnar <mingo@elte.hu> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +--- + kernel/irq/spurious.c | 8 ++++++++ + 1 file changed, 8 insertions(+) + +diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c +index f865e5f4d382..dc7311dd74b1 100644 +--- a/kernel/irq/spurious.c ++++ b/kernel/irq/spurious.c +@@ -443,6 +443,10 @@ MODULE_PARM_DESC(noirqdebug, "Disable irq lockup detection when true"); + + static int __init irqfixup_setup(char *str) + { ++#ifdef CONFIG_PREEMPT_RT ++ pr_warn("irqfixup boot option not supported w/ CONFIG_PREEMPT_RT\n"); ++ return 1; ++#endif + irqfixup = 1; + printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n"); + printk(KERN_WARNING "This may impact system performance.\n"); +@@ -455,6 +459,10 @@ module_param(irqfixup, int, 0644); + + static int __init irqpoll_setup(char *str) + { ++#ifdef CONFIG_PREEMPT_RT ++ pr_warn("irqpoll boot option not supported w/ CONFIG_PREEMPT_RT\n"); ++ return 1; ++#endif + irqfixup = 2; + printk(KERN_WARNING "Misrouted IRQ fixup and polling support " + "enabled\n"); +-- +2.43.0 + diff --git a/debian/patches-rt/0286-sysfs-Add-sys-kernel-realtime-entry.patch b/debian/patches-rt/0286-sysfs-Add-sys-kernel-realtime-entry.patch new file mode 100644 index 000000000..e28616641 --- /dev/null +++ b/debian/patches-rt/0286-sysfs-Add-sys-kernel-realtime-entry.patch @@ -0,0 +1,54 @@ +From e7909a8ac08579fb76c51cde86c1171c8d1c6c36 Mon Sep 17 00:00:00 2001 +From: Clark Williams <williams@redhat.com> +Date: Sat, 30 Jul 2011 21:55:53 -0500 +Subject: [PATCH 286/323] sysfs: Add /sys/kernel/realtime entry +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Add a /sys/kernel entry to indicate that the kernel is a +realtime kernel. + +Clark says that he needs this for udev rules, udev needs to evaluate +if its a PREEMPT_RT kernel a few thousand times and parsing uname +output is too slow or so. + +Are there better solutions? Should it exist and return 0 on !-rt? + +Signed-off-by: Clark Williams <williams@redhat.com> +Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> +--- + kernel/ksysfs.c | 12 ++++++++++++ + 1 file changed, 12 insertions(+) + +diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c +index e20c19e3ba49..777168d58f02 100644 +--- a/kernel/ksysfs.c ++++ b/kernel/ksysfs.c +@@ -143,6 +143,15 @@ KERNEL_ATTR_RO(vmcoreinfo); + + #endif /* CONFIG_CRASH_CORE */ + ++#if defined(CONFIG_PREEMPT_RT) ++static ssize_t realtime_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%d\n", 1); ++} ++KERNEL_ATTR_RO(realtime); ++#endif ++ + /* whether file capabilities are enabled */ + static ssize_t fscaps_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +@@ -233,6 +242,9 @@ static struct attribute * kernel_attrs[] = { + #ifndef CONFIG_TINY_RCU + &rcu_expedited_attr.attr, + &rcu_normal_attr.attr, ++#endif ++#ifdef CONFIG_PREEMPT_RT ++ &realtime_attr.attr, + #endif + NULL + }; +-- +2.43.0 + diff --git a/debian/patches-rt/0287-Add-localversion-for-RT-release.patch b/debian/patches-rt/0287-Add-localversion-for-RT-release.patch new file mode 100644 index 000000000..ee5bf247a --- /dev/null +++ b/debian/patches-rt/0287-Add-localversion-for-RT-release.patch @@ -0,0 +1,22 @@ +From 180a0118d4485cb909de0be0f7ebb61adce763e1 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Fri, 8 Jul 2011 20:25:16 +0200 +Subject: [PATCH 287/323] Add localversion for -RT release +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +--- + localversion-rt | 1 + + 1 file changed, 1 insertion(+) + create mode 100644 localversion-rt + +diff --git a/localversion-rt b/localversion-rt +new file mode 100644 +index 000000000000..21988f9ad53f +--- /dev/null ++++ b/localversion-rt +@@ -0,0 +1 @@ ++-rt34 +-- +2.43.0 + diff --git a/debian/patches-rt/0288-net-xfrm-Use-sequence-counter-with-associated-spinlo.patch b/debian/patches-rt/0288-net-xfrm-Use-sequence-counter-with-associated-spinlo.patch new file mode 100644 index 000000000..df8a062a3 --- /dev/null +++ b/debian/patches-rt/0288-net-xfrm-Use-sequence-counter-with-associated-spinlo.patch @@ -0,0 +1,43 @@ +From 97138dbb6886695284b96e3680f1533beae6e43a Mon Sep 17 00:00:00 2001 +From: "Ahmed S. Darwish" <a.darwish@linutronix.de> +Date: Tue, 16 Mar 2021 11:56:30 +0100 +Subject: [PATCH 288/323] net: xfrm: Use sequence counter with associated + spinlock +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +A sequence counter write section must be serialized or its internal +state can get corrupted. A plain seqcount_t does not contain the +information of which lock must be held to guaranteee write side +serialization. + +For xfrm_state_hash_generation, use seqcount_spinlock_t instead of plain +seqcount_t. This allows to associate the spinlock used for write +serialization with the sequence counter. It thus enables lockdep to +verify that the write serialization lock is indeed held before entering +the sequence counter write section. + +If lockdep is disabled, this lock association is compiled out and has +neither storage size nor runtime overhead. + +Signed-off-by: Ahmed S. Darwish <a.darwish@linutronix.de> +Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com> +--- + include/net/netns/xfrm.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h +index 7b87da22b295..fef22e029c0d 100644 +--- a/include/net/netns/xfrm.h ++++ b/include/net/netns/xfrm.h +@@ -77,7 +77,7 @@ struct netns_xfrm { + struct dst_ops xfrm6_dst_ops; + #endif + spinlock_t xfrm_state_lock; +- seqcount_t xfrm_state_hash_generation; ++ seqcount_spinlock_t xfrm_state_hash_generation; + seqcount_spinlock_t xfrm_policy_hash_generation; + + spinlock_t xfrm_policy_lock; +-- +2.43.0 + diff --git a/debian/patches-rt/0289-sched-Fix-migration_cpu_stop-requeueing.patch b/debian/patches-rt/0289-sched-Fix-migration_cpu_stop-requeueing.patch new file mode 100644 index 000000000..54a798b25 --- /dev/null +++ b/debian/patches-rt/0289-sched-Fix-migration_cpu_stop-requeueing.patch @@ -0,0 +1,147 @@ +From 013f5d7501b106952e04d54b0803c2d1ae9a4876 Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra <peterz@infradead.org> +Date: Tue, 8 Jun 2021 00:37:30 -0400 +Subject: [PATCH 289/323] sched: Fix migration_cpu_stop() requeueing +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +commit 8a6edb5257e2a84720fe78cb179eca58ba76126f upstream. + +When affine_move_task(p) is called on a running task @p, which is not +otherwise already changing affinity, we'll first set +p->migration_pending and then do: + + stop_one_cpu(cpu_of_rq(rq), migration_cpu_stop, &arg); + +This then gets us to migration_cpu_stop() running on the CPU that was +previously running our victim task @p. + +If we find that our task is no longer on that runqueue (this can +happen because of a concurrent migration due to load-balance etc.), +then we'll end up at the: + + } else if (dest_cpu < 1 || pending) { + +branch. Which we'll take because we set pending earlier. Here we first +check if the task @p has already satisfied the affinity constraints, +if so we bail early [A]. Otherwise we'll reissue migration_cpu_stop() +onto the CPU that is now hosting our task @p: + + stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop, + &pending->arg, &pending->stop_work); + +Except, we've never initialized pending->arg, which will be all 0s. + +This then results in running migration_cpu_stop() on the next CPU with +arg->p == NULL, which gives the by now obvious result of fireworks. + +The cure is to change affine_move_task() to always use pending->arg, +furthermore we can use the exact same pattern as the +SCA_MIGRATE_ENABLE case, since we'll block on the pending->done +completion anyway, no point in adding yet another completion in +stop_one_cpu(). + +This then gives a clear distinction between the two +migration_cpu_stop() use cases: + + - sched_exec() / migrate_task_to() : arg->pending == NULL + - affine_move_task() : arg->pending != NULL; + +And we can have it ignore p->migration_pending when !arg->pending. Any +stop work from sched_exec() / migrate_task_to() is in addition to stop +works from affine_move_task(), which will be sufficient to issue the +completion. + +Fixes: 6d337eab041d ("sched: Fix migrate_disable() vs set_cpus_allowed_ptr()") +Cc: stable@kernel.org +Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> +Signed-off-by: Ingo Molnar <mingo@kernel.org> +Reviewed-by: Valentin Schneider <valentin.schneider@arm.com> +Link: https://lkml.kernel.org/r/20210224131355.357743989@infradead.org +Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com> +Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org> +--- + kernel/sched/core.c | 39 ++++++++++++++++++++++++++++----------- + 1 file changed, 28 insertions(+), 11 deletions(-) + +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index c847d17e3b04..df9c4c3838c0 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -1974,6 +1974,24 @@ static int migration_cpu_stop(void *data) + rq_lock(rq, &rf); + + pending = p->migration_pending; ++ if (pending && !arg->pending) { ++ /* ++ * This happens from sched_exec() and migrate_task_to(), ++ * neither of them care about pending and just want a task to ++ * maybe move about. ++ * ++ * Even if there is a pending, we can ignore it, since ++ * affine_move_task() will have it's own stop_work's in flight ++ * which will manage the completion. ++ * ++ * Notably, pending doesn't need to match arg->pending. This can ++ * happen when tripple concurrent affine_move_task() first sets ++ * pending, then clears pending and eventually sets another ++ * pending. ++ */ ++ pending = NULL; ++ } ++ + /* + * If task_rq(p) != rq, it cannot be migrated here, because we're + * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because +@@ -2246,10 +2264,6 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag + int dest_cpu, unsigned int flags) + { + struct set_affinity_pending my_pending = { }, *pending = NULL; +- struct migration_arg arg = { +- .task = p, +- .dest_cpu = dest_cpu, +- }; + bool complete = false; + + /* Can the task run on the task's current CPU? If so, we're done */ +@@ -2287,6 +2301,12 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag + /* Install the request */ + refcount_set(&my_pending.refs, 1); + init_completion(&my_pending.done); ++ my_pending.arg = (struct migration_arg) { ++ .task = p, ++ .dest_cpu = -1, /* any */ ++ .pending = &my_pending, ++ }; ++ + p->migration_pending = &my_pending; + } else { + pending = p->migration_pending; +@@ -2317,12 +2337,6 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag + p->migration_flags &= ~MDF_PUSH; + task_rq_unlock(rq, p, rf); + +- pending->arg = (struct migration_arg) { +- .task = p, +- .dest_cpu = -1, +- .pending = pending, +- }; +- + stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop, + &pending->arg, &pending->stop_work); + +@@ -2335,8 +2349,11 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag + * is_migration_disabled(p) checks to the stopper, which will + * run on the same CPU as said p. + */ ++ refcount_inc(&pending->refs); /* pending->{arg,stop_work} */ + task_rq_unlock(rq, p, rf); +- stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); ++ ++ stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop, ++ &pending->arg, &pending->stop_work); + + } else { + +-- +2.43.0 + diff --git a/debian/patches-rt/0290-sched-Simplify-migration_cpu_stop.patch b/debian/patches-rt/0290-sched-Simplify-migration_cpu_stop.patch new file mode 100644 index 000000000..4d0841707 --- /dev/null +++ b/debian/patches-rt/0290-sched-Simplify-migration_cpu_stop.patch @@ -0,0 +1,142 @@ +From 4f468832c2a5bb5c0bb23b453038ff4fed722061 Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra <peterz@infradead.org> +Date: Tue, 8 Jun 2021 00:37:31 -0400 +Subject: [PATCH 290/323] sched: Simplify migration_cpu_stop() +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +commit c20cf065d4a619d394d23290093b1002e27dff86 upstream. + +When affine_move_task() issues a migration_cpu_stop(), the purpose of +that function is to complete that @pending, not any random other +p->migration_pending that might have gotten installed since. + +This realization much simplifies migration_cpu_stop() and allows +further necessary steps to fix all this as it provides the guarantee +that @pending's stopper will complete @pending (and not some random +other @pending). + +Fixes: 6d337eab041d ("sched: Fix migrate_disable() vs set_cpus_allowed_ptr()") +Cc: stable@kernel.org +Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> +Signed-off-by: Ingo Molnar <mingo@kernel.org> +Reviewed-by: Valentin Schneider <valentin.schneider@arm.com> +Link: https://lkml.kernel.org/r/20210224131355.430014682@infradead.org +Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com> +Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org> +--- + kernel/sched/core.c | 56 +++++++-------------------------------------- + 1 file changed, 8 insertions(+), 48 deletions(-) + +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index df9c4c3838c0..31fa925852e5 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -1950,8 +1950,8 @@ static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf, + */ + static int migration_cpu_stop(void *data) + { +- struct set_affinity_pending *pending; + struct migration_arg *arg = data; ++ struct set_affinity_pending *pending = arg->pending; + struct task_struct *p = arg->task; + int dest_cpu = arg->dest_cpu; + struct rq *rq = this_rq(); +@@ -1973,25 +1973,6 @@ static int migration_cpu_stop(void *data) + raw_spin_lock(&p->pi_lock); + rq_lock(rq, &rf); + +- pending = p->migration_pending; +- if (pending && !arg->pending) { +- /* +- * This happens from sched_exec() and migrate_task_to(), +- * neither of them care about pending and just want a task to +- * maybe move about. +- * +- * Even if there is a pending, we can ignore it, since +- * affine_move_task() will have it's own stop_work's in flight +- * which will manage the completion. +- * +- * Notably, pending doesn't need to match arg->pending. This can +- * happen when tripple concurrent affine_move_task() first sets +- * pending, then clears pending and eventually sets another +- * pending. +- */ +- pending = NULL; +- } +- + /* + * If task_rq(p) != rq, it cannot be migrated here, because we're + * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because +@@ -2002,31 +1983,20 @@ static int migration_cpu_stop(void *data) + goto out; + + if (pending) { +- p->migration_pending = NULL; ++ if (p->migration_pending == pending) ++ p->migration_pending = NULL; + complete = true; + } + +- /* migrate_enable() -- we must not race against SCA */ +- if (dest_cpu < 0) { +- /* +- * When this was migrate_enable() but we no longer +- * have a @pending, a concurrent SCA 'fixed' things +- * and we should be valid again. Nothing to do. +- */ +- if (!pending) { +- WARN_ON_ONCE(!cpumask_test_cpu(task_cpu(p), &p->cpus_mask)); +- goto out; +- } +- ++ if (dest_cpu < 0) + dest_cpu = cpumask_any_distribute(&p->cpus_mask); +- } + + if (task_on_rq_queued(p)) + rq = __migrate_task(rq, &rf, p, dest_cpu); + else + p->wake_cpu = dest_cpu; + +- } else if (dest_cpu < 0 || pending) { ++ } else if (pending) { + /* + * This happens when we get migrated between migrate_enable()'s + * preempt_enable() and scheduling the stopper task. At that +@@ -2041,22 +2011,13 @@ static int migration_cpu_stop(void *data) + * ->pi_lock, so the allowed mask is stable - if it got + * somewhere allowed, we're done. + */ +- if (pending && cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) { +- p->migration_pending = NULL; ++ if (cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) { ++ if (p->migration_pending == pending) ++ p->migration_pending = NULL; + complete = true; + goto out; + } + +- /* +- * When this was migrate_enable() but we no longer have an +- * @pending, a concurrent SCA 'fixed' things and we should be +- * valid again. Nothing to do. +- */ +- if (!pending) { +- WARN_ON_ONCE(!cpumask_test_cpu(task_cpu(p), &p->cpus_mask)); +- goto out; +- } +- + /* + * When migrate_enable() hits a rq mis-match we can't reliably + * determine is_migration_disabled() and so have to chase after +@@ -2074,7 +2035,6 @@ static int migration_cpu_stop(void *data) + complete_all(&pending->done); + + /* For pending->{arg,stop_work} */ +- pending = arg->pending; + if (pending && refcount_dec_and_test(&pending->refs)) + wake_up_var(&pending->refs); + +-- +2.43.0 + diff --git a/debian/patches-rt/0291-sched-Collate-affine_move_task-stoppers.patch b/debian/patches-rt/0291-sched-Collate-affine_move_task-stoppers.patch new file mode 100644 index 000000000..b73fb91be --- /dev/null +++ b/debian/patches-rt/0291-sched-Collate-affine_move_task-stoppers.patch @@ -0,0 +1,69 @@ +From e62b758a0149648256ed0fa194d9e107e294d8aa Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra <peterz@infradead.org> +Date: Tue, 8 Jun 2021 00:37:32 -0400 +Subject: [PATCH 291/323] sched: Collate affine_move_task() stoppers +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +commit 58b1a45086b5f80f2b2842aa7ed0da51a64a302b upstream. + +The SCA_MIGRATE_ENABLE and task_running() cases are almost identical, +collapse them to avoid further duplication. + +Fixes: 6d337eab041d ("sched: Fix migrate_disable() vs set_cpus_allowed_ptr()") +Cc: stable@kernel.org +Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> +Signed-off-by: Ingo Molnar <mingo@kernel.org> +Reviewed-by: Valentin Schneider <valentin.schneider@arm.com> +Link: https://lkml.kernel.org/r/20210224131355.500108964@infradead.org +Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com> +Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org> +--- + kernel/sched/core.c | 23 ++++++++--------------- + 1 file changed, 8 insertions(+), 15 deletions(-) + +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 31fa925852e5..3130289baf79 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -2291,30 +2291,23 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag + return -EINVAL; + } + +- if (flags & SCA_MIGRATE_ENABLE) { +- +- refcount_inc(&pending->refs); /* pending->{arg,stop_work} */ +- p->migration_flags &= ~MDF_PUSH; +- task_rq_unlock(rq, p, rf); +- +- stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop, +- &pending->arg, &pending->stop_work); +- +- return 0; +- } +- + if (task_running(rq, p) || p->state == TASK_WAKING) { + /* +- * Lessen races (and headaches) by delegating +- * is_migration_disabled(p) checks to the stopper, which will +- * run on the same CPU as said p. ++ * MIGRATE_ENABLE gets here because 'p == current', but for ++ * anything else we cannot do is_migration_disabled(), punt ++ * and have the stopper function handle it all race-free. + */ ++ + refcount_inc(&pending->refs); /* pending->{arg,stop_work} */ ++ if (flags & SCA_MIGRATE_ENABLE) ++ p->migration_flags &= ~MDF_PUSH; + task_rq_unlock(rq, p, rf); + + stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop, + &pending->arg, &pending->stop_work); + ++ if (flags & SCA_MIGRATE_ENABLE) ++ return 0; + } else { + + if (!is_migration_disabled(p)) { +-- +2.43.0 + diff --git a/debian/patches-rt/0292-sched-Optimize-migration_cpu_stop.patch b/debian/patches-rt/0292-sched-Optimize-migration_cpu_stop.patch new file mode 100644 index 000000000..87ca79ca8 --- /dev/null +++ b/debian/patches-rt/0292-sched-Optimize-migration_cpu_stop.patch @@ -0,0 +1,58 @@ +From b1353ab8df1e6dd3bf9afaef3096749db2a193f1 Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra <peterz@infradead.org> +Date: Tue, 8 Jun 2021 00:37:33 -0400 +Subject: [PATCH 292/323] sched: Optimize migration_cpu_stop() +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +commit 3f1bc119cd7fc987c8ed25ffb717f99403bb308c upstream. + +When the purpose of migration_cpu_stop() is to migrate the task to +'any' valid CPU, don't migrate the task when it's already running on a +valid CPU. + +Fixes: 6d337eab041d ("sched: Fix migrate_disable() vs set_cpus_allowed_ptr()") +Cc: stable@kernel.org +Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> +Signed-off-by: Ingo Molnar <mingo@kernel.org> +Reviewed-by: Valentin Schneider <valentin.schneider@arm.com> +Link: https://lkml.kernel.org/r/20210224131355.569238629@infradead.org +Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com> +Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org> +--- + kernel/sched/core.c | 13 ++++++++++++- + 1 file changed, 12 insertions(+), 1 deletion(-) + +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 3130289baf79..e28fb33afa95 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -1988,14 +1988,25 @@ static int migration_cpu_stop(void *data) + complete = true; + } + +- if (dest_cpu < 0) ++ if (dest_cpu < 0) { ++ if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) ++ goto out; ++ + dest_cpu = cpumask_any_distribute(&p->cpus_mask); ++ } + + if (task_on_rq_queued(p)) + rq = __migrate_task(rq, &rf, p, dest_cpu); + else + p->wake_cpu = dest_cpu; + ++ /* ++ * XXX __migrate_task() can fail, at which point we might end ++ * up running on a dodgy CPU, AFAICT this can only happen ++ * during CPU hotplug, at which point we'll get pushed out ++ * anyway, so it's probably not a big deal. ++ */ ++ + } else if (pending) { + /* + * This happens when we get migrated between migrate_enable()'s +-- +2.43.0 + diff --git a/debian/patches-rt/0293-sched-Fix-affine_move_task-self-concurrency.patch b/debian/patches-rt/0293-sched-Fix-affine_move_task-self-concurrency.patch new file mode 100644 index 000000000..4a906f7cb --- /dev/null +++ b/debian/patches-rt/0293-sched-Fix-affine_move_task-self-concurrency.patch @@ -0,0 +1,96 @@ +From e9faaf024fdd553b55aaed31855385da7e9d505a Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra <peterz@infradead.org> +Date: Tue, 8 Jun 2021 00:37:34 -0400 +Subject: [PATCH 293/323] sched: Fix affine_move_task() self-concurrency +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +commit 9e81889c7648d48dd5fe13f41cbc99f3c362484a upstream. + +Consider: + + sched_setaffinity(p, X); sched_setaffinity(p, Y); + +Then the first will install p->migration_pending = &my_pending; and +issue stop_one_cpu_nowait(pending); and the second one will read +p->migration_pending and _also_ issue: stop_one_cpu_nowait(pending), +the _SAME_ @pending. + +This causes stopper list corruption. + +Add set_affinity_pending::stop_pending, to indicate if a stopper is in +progress. + +Fixes: 6d337eab041d ("sched: Fix migrate_disable() vs set_cpus_allowed_ptr()") +Cc: stable@kernel.org +Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> +Signed-off-by: Ingo Molnar <mingo@kernel.org> +Reviewed-by: Valentin Schneider <valentin.schneider@arm.com> +Link: https://lkml.kernel.org/r/20210224131355.649146419@infradead.org +Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com> +Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org> +--- + kernel/sched/core.c | 15 ++++++++++++--- + 1 file changed, 12 insertions(+), 3 deletions(-) + +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index e28fb33afa95..76fa3daf1f60 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -1916,6 +1916,7 @@ struct migration_arg { + + struct set_affinity_pending { + refcount_t refs; ++ unsigned int stop_pending; + struct completion done; + struct cpu_stop_work stop_work; + struct migration_arg arg; +@@ -2034,12 +2035,15 @@ static int migration_cpu_stop(void *data) + * determine is_migration_disabled() and so have to chase after + * it. + */ ++ WARN_ON_ONCE(!pending->stop_pending); + task_rq_unlock(rq, p, &rf); + stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop, + &pending->arg, &pending->stop_work); + return 0; + } + out: ++ if (pending) ++ pending->stop_pending = false; + task_rq_unlock(rq, p, &rf); + + if (complete) +@@ -2235,7 +2239,7 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag + int dest_cpu, unsigned int flags) + { + struct set_affinity_pending my_pending = { }, *pending = NULL; +- bool complete = false; ++ bool stop_pending, complete = false; + + /* Can the task run on the task's current CPU? If so, we're done */ + if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) { +@@ -2308,14 +2312,19 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag + * anything else we cannot do is_migration_disabled(), punt + * and have the stopper function handle it all race-free. + */ ++ stop_pending = pending->stop_pending; ++ if (!stop_pending) ++ pending->stop_pending = true; + + refcount_inc(&pending->refs); /* pending->{arg,stop_work} */ + if (flags & SCA_MIGRATE_ENABLE) + p->migration_flags &= ~MDF_PUSH; + task_rq_unlock(rq, p, rf); + +- stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop, +- &pending->arg, &pending->stop_work); ++ if (!stop_pending) { ++ stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop, ++ &pending->arg, &pending->stop_work); ++ } + + if (flags & SCA_MIGRATE_ENABLE) + return 0; +-- +2.43.0 + diff --git a/debian/patches-rt/0294-sched-Simplify-set_affinity_pending-refcounts.patch b/debian/patches-rt/0294-sched-Simplify-set_affinity_pending-refcounts.patch new file mode 100644 index 000000000..1230259b8 --- /dev/null +++ b/debian/patches-rt/0294-sched-Simplify-set_affinity_pending-refcounts.patch @@ -0,0 +1,129 @@ +From 6681b566fe9ffe9365121a790537ada2e4ef97ba Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra <peterz@infradead.org> +Date: Tue, 8 Jun 2021 00:37:35 -0400 +Subject: [PATCH 294/323] sched: Simplify set_affinity_pending refcounts +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +commit 50caf9c14b1498c90cf808dbba2ca29bd32ccba4 upstream. + +Now that we have set_affinity_pending::stop_pending to indicate if a +stopper is in progress, and we have the guarantee that if that stopper +exists, it will (eventually) complete our @pending we can simplify the +refcount scheme by no longer counting the stopper thread. + +Fixes: 6d337eab041d ("sched: Fix migrate_disable() vs set_cpus_allowed_ptr()") +Cc: stable@kernel.org +Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> +Signed-off-by: Ingo Molnar <mingo@kernel.org> +Reviewed-by: Valentin Schneider <valentin.schneider@arm.com> +Link: https://lkml.kernel.org/r/20210224131355.724130207@infradead.org +Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com> +Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org> +--- + kernel/sched/core.c | 32 ++++++++++++++++++++------------ + 1 file changed, 20 insertions(+), 12 deletions(-) + +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 76fa3daf1f60..d3b9d69171a2 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -1914,6 +1914,10 @@ struct migration_arg { + struct set_affinity_pending *pending; + }; + ++/* ++ * @refs: number of wait_for_completion() ++ * @stop_pending: is @stop_work in use ++ */ + struct set_affinity_pending { + refcount_t refs; + unsigned int stop_pending; +@@ -2049,10 +2053,6 @@ static int migration_cpu_stop(void *data) + if (complete) + complete_all(&pending->done); + +- /* For pending->{arg,stop_work} */ +- if (pending && refcount_dec_and_test(&pending->refs)) +- wake_up_var(&pending->refs); +- + return 0; + } + +@@ -2251,12 +2251,16 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag + push_task = get_task_struct(p); + } + ++ /* ++ * If there are pending waiters, but no pending stop_work, ++ * then complete now. ++ */ + pending = p->migration_pending; +- if (pending) { +- refcount_inc(&pending->refs); ++ if (pending && !pending->stop_pending) { + p->migration_pending = NULL; + complete = true; + } ++ + task_rq_unlock(rq, p, rf); + + if (push_task) { +@@ -2265,7 +2269,7 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag + } + + if (complete) +- goto do_complete; ++ complete_all(&pending->done); + + return 0; + } +@@ -2316,9 +2320,9 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag + if (!stop_pending) + pending->stop_pending = true; + +- refcount_inc(&pending->refs); /* pending->{arg,stop_work} */ + if (flags & SCA_MIGRATE_ENABLE) + p->migration_flags &= ~MDF_PUSH; ++ + task_rq_unlock(rq, p, rf); + + if (!stop_pending) { +@@ -2334,12 +2338,13 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag + if (task_on_rq_queued(p)) + rq = move_queued_task(rq, rf, p, dest_cpu); + +- p->migration_pending = NULL; +- complete = true; ++ if (!pending->stop_pending) { ++ p->migration_pending = NULL; ++ complete = true; ++ } + } + task_rq_unlock(rq, p, rf); + +-do_complete: + if (complete) + complete_all(&pending->done); + } +@@ -2347,7 +2352,7 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag + wait_for_completion(&pending->done); + + if (refcount_dec_and_test(&pending->refs)) +- wake_up_var(&pending->refs); ++ wake_up_var(&pending->refs); /* No UaF, just an address */ + + /* + * Block the original owner of &pending until all subsequent callers +@@ -2355,6 +2360,9 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag + */ + wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs)); + ++ /* ARGH */ ++ WARN_ON_ONCE(my_pending.stop_pending); ++ + return 0; + } + +-- +2.43.0 + diff --git a/debian/patches-rt/0295-sched-Don-t-defer-CPU-pick-to-migration_cpu_stop.patch b/debian/patches-rt/0295-sched-Don-t-defer-CPU-pick-to-migration_cpu_stop.patch new file mode 100644 index 000000000..e05440c01 --- /dev/null +++ b/debian/patches-rt/0295-sched-Don-t-defer-CPU-pick-to-migration_cpu_stop.patch @@ -0,0 +1,100 @@ +From b625852d41e17fb13b4caf7192734866534d9799 Mon Sep 17 00:00:00 2001 +From: Valentin Schneider <valentin.schneider@arm.com> +Date: Tue, 8 Jun 2021 00:37:36 -0400 +Subject: [PATCH 295/323] sched: Don't defer CPU pick to migration_cpu_stop() +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +commit 475ea6c60279e9f2ddf7e4cf2648cd8ae0608361 upstream. + +Will reported that the 'XXX __migrate_task() can fail' in migration_cpu_stop() +can happen, and it *is* sort of a big deal. Looking at it some more, one +will note there is a glaring hole in the deferred CPU selection: + + (w/ CONFIG_CPUSET=n, so that the affinity mask passed via taskset doesn't + get AND'd with cpu_online_mask) + + $ taskset -pc 0-2 $PID + # offline CPUs 3-4 + $ taskset -pc 3-5 $PID + `\ + $PID may stay on 0-2 due to the cpumask_any_distribute() picking an + offline CPU and __migrate_task() refusing to do anything due to + cpu_is_allowed(). + +set_cpus_allowed_ptr() goes to some length to pick a dest_cpu that matches +the right constraints vs affinity and the online/active state of the +CPUs. Reuse that instead of discarding it in the affine_move_task() case. + +Fixes: 6d337eab041d ("sched: Fix migrate_disable() vs set_cpus_allowed_ptr()") +Reported-by: Will Deacon <will@kernel.org> +Signed-off-by: Valentin Schneider <valentin.schneider@arm.com> +Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> +Link: https://lkml.kernel.org/r/20210526205751.842360-2-valentin.schneider@arm.com +Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com> +Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org> +--- + kernel/sched/core.c | 20 ++++++++++++-------- + 1 file changed, 12 insertions(+), 8 deletions(-) + +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index d3b9d69171a2..81b342d6629b 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -1958,7 +1958,6 @@ static int migration_cpu_stop(void *data) + struct migration_arg *arg = data; + struct set_affinity_pending *pending = arg->pending; + struct task_struct *p = arg->task; +- int dest_cpu = arg->dest_cpu; + struct rq *rq = this_rq(); + bool complete = false; + struct rq_flags rf; +@@ -1991,19 +1990,15 @@ static int migration_cpu_stop(void *data) + if (p->migration_pending == pending) + p->migration_pending = NULL; + complete = true; +- } + +- if (dest_cpu < 0) { + if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) + goto out; +- +- dest_cpu = cpumask_any_distribute(&p->cpus_mask); + } + + if (task_on_rq_queued(p)) +- rq = __migrate_task(rq, &rf, p, dest_cpu); ++ rq = __migrate_task(rq, &rf, p, arg->dest_cpu); + else +- p->wake_cpu = dest_cpu; ++ p->wake_cpu = arg->dest_cpu; + + /* + * XXX __migrate_task() can fail, at which point we might end +@@ -2282,7 +2277,7 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag + init_completion(&my_pending.done); + my_pending.arg = (struct migration_arg) { + .task = p, +- .dest_cpu = -1, /* any */ ++ .dest_cpu = dest_cpu, + .pending = &my_pending, + }; + +@@ -2290,6 +2285,15 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag + } else { + pending = p->migration_pending; + refcount_inc(&pending->refs); ++ /* ++ * Affinity has changed, but we've already installed a ++ * pending. migration_cpu_stop() *must* see this, else ++ * we risk a completion of the pending despite having a ++ * task on a disallowed CPU. ++ * ++ * Serialized by p->pi_lock, so this is safe. ++ */ ++ pending->arg.dest_cpu = dest_cpu; + } + } + pending = p->migration_pending; +-- +2.43.0 + diff --git a/debian/patches-rt/0296-printk-Enhance-the-condition-check-of-msleep-in-pr_f.patch b/debian/patches-rt/0296-printk-Enhance-the-condition-check-of-msleep-in-pr_f.patch new file mode 100644 index 000000000..72589519a --- /dev/null +++ b/debian/patches-rt/0296-printk-Enhance-the-condition-check-of-msleep-in-pr_f.patch @@ -0,0 +1,48 @@ +From 6e1b154dfc59c96068eafef9fafbda28b723d3e6 Mon Sep 17 00:00:00 2001 +From: Chao Qin <chao.qin@intel.com> +Date: Mon, 19 Jul 2021 10:26:50 +0800 +Subject: [PATCH 296/323] printk: Enhance the condition check of msleep in + pr_flush() +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +[ Upstream commit 83e9288d9c4295d1195e9d780fcbc42c72ba4a83 ] + +There is msleep in pr_flush(). If call WARN() in the early boot +stage such as in early_initcall, pr_flush() will run into msleep +when process scheduler is not ready yet. And then the system will +sleep forever. + +Before the system_state is SYSTEM_RUNNING, make sure DO NOT sleep +in pr_flush(). + +Fixes: c0b395bd0fe3("printk: add pr_flush()") +Signed-off-by: Chao Qin <chao.qin@intel.com> +Signed-off-by: Lili Li <lili.li@intel.com> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Reviewed-by: John Ogness <john.ogness@linutronix.de> +Signed-off-by: John Ogness <john.ogness@linutronix.de> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Link: https://lore.kernel.org/lkml/20210719022649.3444072-1-chao.qin@intel.com +Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org> +--- + kernel/printk/printk.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c +index 14cb111fe9f0..d2205872304d 100644 +--- a/kernel/printk/printk.c ++++ b/kernel/printk/printk.c +@@ -3554,7 +3554,9 @@ bool pr_flush(int timeout_ms, bool reset_on_progress) + u64 diff; + u64 seq; + +- may_sleep = (preemptible() && !in_softirq()); ++ may_sleep = (preemptible() && ++ !in_softirq() && ++ system_state >= SYSTEM_RUNNING); + + seq = prb_next_seq(prb); + +-- +2.43.0 + diff --git a/debian/patches-rt/0297-locking-rwsem-rt-Remove-might_sleep-in-__up_read.patch b/debian/patches-rt/0297-locking-rwsem-rt-Remove-might_sleep-in-__up_read.patch new file mode 100644 index 000000000..91a9ede63 --- /dev/null +++ b/debian/patches-rt/0297-locking-rwsem-rt-Remove-might_sleep-in-__up_read.patch @@ -0,0 +1,31 @@ +From 7131b777e7276a566838fdfb4a0b2ddc44ad2eca Mon Sep 17 00:00:00 2001 +From: Andrew Halaney <ahalaney@redhat.com> +Date: Tue, 6 Apr 2021 17:19:52 -0500 +Subject: [PATCH 297/323] locking/rwsem-rt: Remove might_sleep() in __up_read() +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +There's no chance of sleeping here, the reader is giving up the +lock and possibly waking up the writer who is waiting on it. + +Reported-by: Chunyu Hu <chuhu@redhat.com> +Signed-off-by: Andrew Halaney <ahalaney@redhat.com> +Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org> +--- + kernel/locking/rwsem-rt.c | 1 - + 1 file changed, 1 deletion(-) + +diff --git a/kernel/locking/rwsem-rt.c b/kernel/locking/rwsem-rt.c +index 274172d5bb3a..b61edc4dcb73 100644 +--- a/kernel/locking/rwsem-rt.c ++++ b/kernel/locking/rwsem-rt.c +@@ -198,7 +198,6 @@ void __up_read(struct rw_semaphore *sem) + if (!atomic_dec_and_test(&sem->readers)) + return; + +- might_sleep(); + raw_spin_lock_irq(&m->wait_lock); + /* + * Wake the writer, i.e. the rtmutex owner. It might release the +-- +2.43.0 + diff --git a/debian/patches-rt/0298-mm-zsmalloc-Convert-zsmalloc_handle.lock-to-spinlock.patch b/debian/patches-rt/0298-mm-zsmalloc-Convert-zsmalloc_handle.lock-to-spinlock.patch new file mode 100644 index 000000000..d26ff0ea0 --- /dev/null +++ b/debian/patches-rt/0298-mm-zsmalloc-Convert-zsmalloc_handle.lock-to-spinlock.patch @@ -0,0 +1,82 @@ +From b4a9c84408720dd6da0cdb52fc3e7070aef9c4fe Mon Sep 17 00:00:00 2001 +From: Mike Galbraith <efault@gmx.de> +Date: Tue, 24 Aug 2021 13:08:14 +0200 +Subject: [PATCH 298/323] mm, zsmalloc: Convert zsmalloc_handle.lock to + spinlock_t +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +local_lock_t becoming a synonym of spinlock_t had consequences for the RT +mods to zsmalloc, which were taking a mutex while holding a local_lock, +inspiring a lockdep "BUG: Invalid wait context" gripe. + +Converting zsmalloc_handle.lock to a spinlock_t restored lockdep silence. + +Cc: stable-rt@vger.kernel.org +Signed-off-by: Mike Galbraith <efault@gmx.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org> +--- + mm/zsmalloc.c | 12 ++++++------ + 1 file changed, 6 insertions(+), 6 deletions(-) + +diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c +index 7dad2ff3e778..16ce2b05df90 100644 +--- a/mm/zsmalloc.c ++++ b/mm/zsmalloc.c +@@ -82,7 +82,7 @@ + + struct zsmalloc_handle { + unsigned long addr; +- struct mutex lock; ++ spinlock_t lock; + }; + + #define ZS_HANDLE_ALLOC_SIZE (sizeof(struct zsmalloc_handle)) +@@ -370,7 +370,7 @@ static unsigned long cache_alloc_handle(struct zs_pool *pool, gfp_t gfp) + if (p) { + struct zsmalloc_handle *zh = p; + +- mutex_init(&zh->lock); ++ spin_lock_init(&zh->lock); + } + #endif + return (unsigned long)p; +@@ -930,7 +930,7 @@ static inline int testpin_tag(unsigned long handle) + #ifdef CONFIG_PREEMPT_RT + struct zsmalloc_handle *zh = zs_get_pure_handle(handle); + +- return mutex_is_locked(&zh->lock); ++ return spin_is_locked(&zh->lock); + #else + return bit_spin_is_locked(HANDLE_PIN_BIT, (unsigned long *)handle); + #endif +@@ -941,7 +941,7 @@ static inline int trypin_tag(unsigned long handle) + #ifdef CONFIG_PREEMPT_RT + struct zsmalloc_handle *zh = zs_get_pure_handle(handle); + +- return mutex_trylock(&zh->lock); ++ return spin_trylock(&zh->lock); + #else + return bit_spin_trylock(HANDLE_PIN_BIT, (unsigned long *)handle); + #endif +@@ -952,7 +952,7 @@ static void pin_tag(unsigned long handle) __acquires(bitlock) + #ifdef CONFIG_PREEMPT_RT + struct zsmalloc_handle *zh = zs_get_pure_handle(handle); + +- return mutex_lock(&zh->lock); ++ return spin_lock(&zh->lock); + #else + bit_spin_lock(HANDLE_PIN_BIT, (unsigned long *)handle); + #endif +@@ -963,7 +963,7 @@ static void unpin_tag(unsigned long handle) __releases(bitlock) + #ifdef CONFIG_PREEMPT_RT + struct zsmalloc_handle *zh = zs_get_pure_handle(handle); + +- return mutex_unlock(&zh->lock); ++ return spin_unlock(&zh->lock); + #else + bit_spin_unlock(HANDLE_PIN_BIT, (unsigned long *)handle); + #endif +-- +2.43.0 + diff --git a/debian/patches-rt/0299-sched-Fix-get_push_task-vs-migrate_disable.patch b/debian/patches-rt/0299-sched-Fix-get_push_task-vs-migrate_disable.patch new file mode 100644 index 000000000..f59c122c0 --- /dev/null +++ b/debian/patches-rt/0299-sched-Fix-get_push_task-vs-migrate_disable.patch @@ -0,0 +1,46 @@ +From 93c4258bc009109bee258c30fa2a53680c8b1f9a Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Thu, 26 Aug 2021 15:37:38 +0200 +Subject: [PATCH 299/323] sched: Fix get_push_task() vs migrate_disable() +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +push_rt_task() attempts to move the currently running task away if the +next runnable task has migration disabled and therefore is pinned on the +current CPU. + +The current task is retrieved via get_push_task() which only checks for +nr_cpus_allowed == 1, but does not check whether the task has migration +disabled and therefore cannot be moved either. The consequence is a +pointless invocation of the migration thread which correctly observes +that the task cannot be moved. + +Return NULL if the task has migration disabled and cannot be moved to +another CPU. + +Cc: stable-rt@vger.kernel.org +Fixes: a7c81556ec4d3 ("sched: Fix migrate_disable() vs rt/dl balancing") +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> +Link: https://lkml.kernel.org/r/20210826133738.yiotqbtdaxzjsnfj@linutronix.de +Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org> +--- + kernel/sched/sched.h | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index ad854a670701..bda12827832e 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -1949,6 +1949,9 @@ static inline struct task_struct *get_push_task(struct rq *rq) + if (p->nr_cpus_allowed == 1) + return NULL; + ++ if (p->migration_disabled) ++ return NULL; ++ + rq->push_busy = true; + return get_task_struct(p); + } +-- +2.43.0 + diff --git a/debian/patches-rt/0300-sched-Switch-wait_task_inactive-to-HRTIMER_MODE_REL_.patch b/debian/patches-rt/0300-sched-Switch-wait_task_inactive-to-HRTIMER_MODE_REL_.patch new file mode 100644 index 000000000..720b08fc7 --- /dev/null +++ b/debian/patches-rt/0300-sched-Switch-wait_task_inactive-to-HRTIMER_MODE_REL_.patch @@ -0,0 +1,48 @@ +From f29a6f7bb4568b4ac2d5a9226d503db7a2c69bab Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Tue, 24 Aug 2021 22:47:37 +0200 +Subject: [PATCH 300/323] sched: Switch wait_task_inactive to + HRTIMER_MODE_REL_HARD +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +With PREEMPT_RT enabled all hrtimers callbacks will be invoked in +softirq mode unless they are explicitly marked as HRTIMER_MODE_HARD. +During boot kthread_bind() is used for the creation of per-CPU threads +and then hangs in wait_task_inactive() if the ksoftirqd is not +yet up and running. +The hang disappeared since commit + 26c7295be0c5e ("kthread: Do not preempt current task if it is going to call schedule()") + +but enabling function trace on boot reliably leads to the freeze on boot +behaviour again. +The timer in wait_task_inactive() can not be directly used by an user +interface to abuse it and create a mass wake of several tasks at the +same time which would to long sections with disabled interrupts. +Therefore it is safe to make the timer HRTIMER_MODE_REL_HARD. + +Switch the timer to HRTIMER_MODE_REL_HARD. + +Cc: stable-rt@vger.kernel.org +Link: https://lkml.kernel.org/r/20210826170408.vm7rlj7odslshwch@linutronix.de +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org> +--- + kernel/sched/core.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 81b342d6629b..6735872a8508 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -2739,7 +2739,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) + ktime_t to = NSEC_PER_SEC / HZ; + + set_current_state(TASK_UNINTERRUPTIBLE); +- schedule_hrtimeout(&to, HRTIMER_MODE_REL); ++ schedule_hrtimeout(&to, HRTIMER_MODE_REL_HARD); + continue; + } + +-- +2.43.0 + diff --git a/debian/patches-rt/0301-preempt-Move-preempt_enable_no_resched-to-the-RT-blo.patch b/debian/patches-rt/0301-preempt-Move-preempt_enable_no_resched-to-the-RT-blo.patch new file mode 100644 index 000000000..41de0ba3d --- /dev/null +++ b/debian/patches-rt/0301-preempt-Move-preempt_enable_no_resched-to-the-RT-blo.patch @@ -0,0 +1,45 @@ +From 8cc1a32ca44ae3484956295c75afcc2234e809ae Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Wed, 22 Sep 2021 19:34:40 +0200 +Subject: [PATCH 301/323] preempt: Move preempt_enable_no_resched() to the RT + block +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +preempt_enable_no_resched() should point to preempt_enable() on +PREEMPT_RT so nobody is playing any preempt tricks and enables +preemption without checking for the need-resched flag. + +This was misplaced in v3.14.0-rt1 und remained unnoticed until now. + +Point preempt_enable_no_resched() and preempt_enable() on RT. + +Cc: stable-rt@vger.kernel.org +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org> +--- + include/linux/preempt.h | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/include/linux/preempt.h b/include/linux/preempt.h +index af39859f02ee..7b5b2ed55531 100644 +--- a/include/linux/preempt.h ++++ b/include/linux/preempt.h +@@ -208,12 +208,12 @@ do { \ + preempt_count_dec(); \ + } while (0) + +-#ifdef CONFIG_PREEMPT_RT ++#ifndef CONFIG_PREEMPT_RT + # define preempt_enable_no_resched() sched_preempt_enable_no_resched() +-# define preempt_check_resched_rt() preempt_check_resched() ++# define preempt_check_resched_rt() barrier(); + #else + # define preempt_enable_no_resched() preempt_enable() +-# define preempt_check_resched_rt() barrier(); ++# define preempt_check_resched_rt() preempt_check_resched() + #endif + + #define preemptible() (preempt_count() == 0 && !irqs_disabled()) +-- +2.43.0 + diff --git a/debian/patches-rt/0302-mm-Disable-NUMA_BALANCING_DEFAULT_ENABLED-and-TRANSP.patch b/debian/patches-rt/0302-mm-Disable-NUMA_BALANCING_DEFAULT_ENABLED-and-TRANSP.patch new file mode 100644 index 000000000..e9b579280 --- /dev/null +++ b/debian/patches-rt/0302-mm-Disable-NUMA_BALANCING_DEFAULT_ENABLED-and-TRANSP.patch @@ -0,0 +1,51 @@ +From 4e8d89996a692bd5d4a094e55c0a88044993057f Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Fri, 29 Oct 2021 10:07:11 +0200 +Subject: [PATCH 302/323] mm: Disable NUMA_BALANCING_DEFAULT_ENABLED and + TRANSPARENT_HUGEPAGE on PREEMPT_RT +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +TRANSPARENT_HUGEPAGE: +There are potential non-deterministic delays to an RT thread if a critical +memory region is not THP-aligned and a non-RT buffer is located in the same +hugepage-aligned region. It's also possible for an unrelated thread to migrate +pages belonging to an RT task incurring unexpected page faults due to memory +defragmentation even if khugepaged is disabled. + +Regular HUGEPAGEs are not affected by this can be used. + +NUMA_BALANCING: +There is a non-deterministic delay to mark PTEs PROT_NONE to gather NUMA fault +samples, increased page faults of regions even if mlocked and non-deterministic +delays when migrating pages. + +[Mel Gorman worded 99% of the commit description]. + +Link: https://lore.kernel.org/all/20200304091159.GN3818@techsingularity.net/ +Link: https://lore.kernel.org/all/20211026165100.ahz5bkx44lrrw5pt@linutronix.de/ +Cc: stable-rt@vger.kernel.org +Cc: Mel Gorman <mgorman@techsingularity.net> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Acked-by: Mel Gorman <mgorman@techsingularity.net> +Link: https://lore.kernel.org/r/20211028143327.hfbxjze7palrpfgp@linutronix.de +Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org> +--- + init/Kconfig | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/init/Kconfig b/init/Kconfig +index c5f276d782de..a62d2961064b 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -878,7 +878,7 @@ config NUMA_BALANCING + bool "Memory placement aware NUMA scheduler" + depends on ARCH_SUPPORTS_NUMA_BALANCING + depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY +- depends on SMP && NUMA && MIGRATION ++ depends on SMP && NUMA && MIGRATION && !PREEMPT_RT + help + This option adds support for automatic NUMA aware memory/task placement. + The mechanism is quite primitive and is based on migrating memory when +-- +2.43.0 + diff --git a/debian/patches-rt/0303-fscache-Use-only-one-fscache_object_cong_wait.patch b/debian/patches-rt/0303-fscache-Use-only-one-fscache_object_cong_wait.patch new file mode 100644 index 000000000..a2f67e27c --- /dev/null +++ b/debian/patches-rt/0303-fscache-Use-only-one-fscache_object_cong_wait.patch @@ -0,0 +1,129 @@ +From 9ebcbc0c3bbeaa6e13b623ff2d84d3b29e0a1431 Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Thu, 28 Oct 2021 17:30:50 +0200 +Subject: [PATCH 303/323] fscache: Use only one fscache_object_cong_wait. +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +In the commit mentioned below, fscache was converted from slow-work to +workqueue. slow_work_enqueue() and slow_work_sleep_till_thread_needed() +did not use a per-CPU workqueue. They choose from two global waitqueues +depending on the SLOW_WORK_VERY_SLOW bit which was not set so it always +one waitqueue. + +I can't find out how it is ensured that a waiter on certain CPU is woken +up be the other side. My guess is that the timeout in schedule_timeout() +ensures that it does not wait forever (or a random wake up). + +fscache_object_sleep_till_congested() must be invoked from preemptible +context in order for schedule() to work. In this case this_cpu_ptr() +should complain with CONFIG_DEBUG_PREEMPT enabled except the thread is +bound to one CPU. + +wake_up() wakes only one waiter and I'm not sure if it is guaranteed +that only one waiter exists. + +Replace the per-CPU waitqueue with one global waitqueue. + +Fixes: 8b8edefa2fffb ("fscache: convert object to use workqueue instead of slow-work") +Reported-by: Gregor Beck <gregor.beck@gmail.com> +Cc: stable-rt@vger.kernel.org +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org> +--- + fs/fscache/internal.h | 1 - + fs/fscache/main.c | 6 ------ + fs/fscache/object.c | 11 +++++------ + 3 files changed, 5 insertions(+), 13 deletions(-) + +diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h +index 64aa552b296d..7dae569dafb9 100644 +--- a/fs/fscache/internal.h ++++ b/fs/fscache/internal.h +@@ -95,7 +95,6 @@ extern unsigned fscache_debug; + extern struct kobject *fscache_root; + extern struct workqueue_struct *fscache_object_wq; + extern struct workqueue_struct *fscache_op_wq; +-DECLARE_PER_CPU(wait_queue_head_t, fscache_object_cong_wait); + + extern unsigned int fscache_hash(unsigned int salt, unsigned int *data, unsigned int n); + +diff --git a/fs/fscache/main.c b/fs/fscache/main.c +index 4207f98e405f..85f8cf3a323d 100644 +--- a/fs/fscache/main.c ++++ b/fs/fscache/main.c +@@ -41,8 +41,6 @@ struct kobject *fscache_root; + struct workqueue_struct *fscache_object_wq; + struct workqueue_struct *fscache_op_wq; + +-DEFINE_PER_CPU(wait_queue_head_t, fscache_object_cong_wait); +- + /* these values serve as lower bounds, will be adjusted in fscache_init() */ + static unsigned fscache_object_max_active = 4; + static unsigned fscache_op_max_active = 2; +@@ -138,7 +136,6 @@ unsigned int fscache_hash(unsigned int salt, unsigned int *data, unsigned int n) + static int __init fscache_init(void) + { + unsigned int nr_cpus = num_possible_cpus(); +- unsigned int cpu; + int ret; + + fscache_object_max_active = +@@ -161,9 +158,6 @@ static int __init fscache_init(void) + if (!fscache_op_wq) + goto error_op_wq; + +- for_each_possible_cpu(cpu) +- init_waitqueue_head(&per_cpu(fscache_object_cong_wait, cpu)); +- + ret = fscache_proc_init(); + if (ret < 0) + goto error_proc; +diff --git a/fs/fscache/object.c b/fs/fscache/object.c +index cb2146e02cd5..55158f30d093 100644 +--- a/fs/fscache/object.c ++++ b/fs/fscache/object.c +@@ -807,6 +807,8 @@ void fscache_object_destroy(struct fscache_object *object) + } + EXPORT_SYMBOL(fscache_object_destroy); + ++static DECLARE_WAIT_QUEUE_HEAD(fscache_object_cong_wait); ++ + /* + * enqueue an object for metadata-type processing + */ +@@ -815,12 +817,10 @@ void fscache_enqueue_object(struct fscache_object *object) + _enter("{OBJ%x}", object->debug_id); + + if (fscache_get_object(object, fscache_obj_get_queue) >= 0) { +- wait_queue_head_t *cong_wq = +- &get_cpu_var(fscache_object_cong_wait); + + if (queue_work(fscache_object_wq, &object->work)) { + if (fscache_object_congested()) +- wake_up(cong_wq); ++ wake_up(&fscache_object_cong_wait); + } else + fscache_put_object(object, fscache_obj_put_queue); + +@@ -842,16 +842,15 @@ void fscache_enqueue_object(struct fscache_object *object) + */ + bool fscache_object_sleep_till_congested(signed long *timeoutp) + { +- wait_queue_head_t *cong_wq = this_cpu_ptr(&fscache_object_cong_wait); + DEFINE_WAIT(wait); + + if (fscache_object_congested()) + return true; + +- add_wait_queue_exclusive(cong_wq, &wait); ++ add_wait_queue_exclusive(&fscache_object_cong_wait, &wait); + if (!fscache_object_congested()) + *timeoutp = schedule_timeout(*timeoutp); +- finish_wait(cong_wq, &wait); ++ finish_wait(&fscache_object_cong_wait, &wait); + + return fscache_object_congested(); + } +-- +2.43.0 + diff --git a/debian/patches-rt/0304-fscache-Use-only-one-fscache_object_cong_wait.patch b/debian/patches-rt/0304-fscache-Use-only-one-fscache_object_cong_wait.patch new file mode 100644 index 000000000..ca5c5cbd4 --- /dev/null +++ b/debian/patches-rt/0304-fscache-Use-only-one-fscache_object_cong_wait.patch @@ -0,0 +1,31 @@ +From 5fe656c51fcb73a09cf33665bcee0341edaf14c3 Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Tue, 2 Nov 2021 11:52:05 +0100 +Subject: [PATCH 304/323] fscache: Use only one fscache_object_cong_wait. +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +This is an update of the original patch, removing put_cpu_var() which +was overseen in the initial patch. + +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org> +--- + fs/fscache/object.c | 2 -- + 1 file changed, 2 deletions(-) + +diff --git a/fs/fscache/object.c b/fs/fscache/object.c +index 55158f30d093..fb9794dce721 100644 +--- a/fs/fscache/object.c ++++ b/fs/fscache/object.c +@@ -823,8 +823,6 @@ void fscache_enqueue_object(struct fscache_object *object) + wake_up(&fscache_object_cong_wait); + } else + fscache_put_object(object, fscache_obj_put_queue); +- +- put_cpu_var(fscache_object_cong_wait); + } + } + +-- +2.43.0 + diff --git a/debian/patches-rt/0305-locking-Drop-might_resched-from-might_sleep_no_state.patch b/debian/patches-rt/0305-locking-Drop-might_resched-from-might_sleep_no_state.patch new file mode 100644 index 000000000..f424e077f --- /dev/null +++ b/debian/patches-rt/0305-locking-Drop-might_resched-from-might_sleep_no_state.patch @@ -0,0 +1,40 @@ +From 8eddd2dc82853a3df908061f6ddf71bde523917b Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Mon, 6 Sep 2021 18:19:16 +0200 +Subject: [PATCH 305/323] locking: Drop might_resched() from + might_sleep_no_state_check() +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +might_sleep_no_state_check() serves the same purpose as might_sleep() +except it is used before sleeping locks are acquired and therefore does +not check task_struct::state because the state is preserved. + +That state is preserved in the locking slow path so we must not schedule +at the begin of the locking function because the state will be lost and +not preserved at that time. + +Remove might_resched() from might_sleep_no_state_check() to avoid losing the +state before it is preserved. + +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org> +--- + include/linux/kernel.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/include/linux/kernel.h b/include/linux/kernel.h +index 7b4fdd5b2f7b..4e88ae0b2578 100644 +--- a/include/linux/kernel.h ++++ b/include/linux/kernel.h +@@ -225,7 +225,7 @@ extern void __cant_migrate(const char *file, int line); + do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0) + + # define might_sleep_no_state_check() \ +- do { ___might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0) ++ do { ___might_sleep(__FILE__, __LINE__, 0); } while (0) + + /** + * cant_sleep - annotation for functions that cannot sleep +-- +2.43.0 + diff --git a/debian/patches-rt/0306-drm-i915-gt-Queue-and-wait-for-the-irq_work-item.patch b/debian/patches-rt/0306-drm-i915-gt-Queue-and-wait-for-the-irq_work-item.patch new file mode 100644 index 000000000..c7e1e5838 --- /dev/null +++ b/debian/patches-rt/0306-drm-i915-gt-Queue-and-wait-for-the-irq_work-item.patch @@ -0,0 +1,49 @@ +From 45cb435eff7cd17311c87552c32c3a202b09bf51 Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Tue, 26 Oct 2021 13:40:53 +0200 +Subject: [PATCH 306/323] drm/i915/gt: Queue and wait for the irq_work item. +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Disabling interrupts and invoking the irq_work function directly breaks +on PREEMPT_RT. +PREEMPT_RT does not invoke all irq_work from hardirq context because +some of the user have spinlock_t locking in the callback function. +These locks are then turned into a sleeping locks which can not be +acquired with disabled interrupts. + +Using irq_work_queue() has the benefit that the irqwork will be invoked +in the regular context. In general there is "no" delay between enqueuing +the callback and its invocation because the interrupt is raised right +away on architectures which support it (which includes x86). + +Use irq_work_queue() + irq_work_sync() instead invoking the callback +directly. + +Reported-by: Clark Williams <williams@redhat.com> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Reviewed-by: Maarten Lankhorst <maarten.lankhorst@linux.intel.com> +Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org> +--- + drivers/gpu/drm/i915/gt/intel_breadcrumbs.c | 5 ++--- + 1 file changed, 2 insertions(+), 3 deletions(-) + +diff --git a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c +index 0040b4765a54..3f4f854786f2 100644 +--- a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c ++++ b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c +@@ -342,10 +342,9 @@ void intel_breadcrumbs_park(struct intel_breadcrumbs *b) + /* Kick the work once more to drain the signalers */ + irq_work_sync(&b->irq_work); + while (unlikely(READ_ONCE(b->irq_armed))) { +- local_irq_disable(); +- signal_irq_work(&b->irq_work); +- local_irq_enable(); ++ irq_work_queue(&b->irq_work); + cond_resched(); ++ irq_work_sync(&b->irq_work); + } + GEM_BUG_ON(!list_empty(&b->signalers)); + } +-- +2.43.0 + diff --git a/debian/patches-rt/0307-irq_work-Allow-irq_work_sync-to-sleep-if-irq_work-no.patch b/debian/patches-rt/0307-irq_work-Allow-irq_work_sync-to-sleep-if-irq_work-no.patch new file mode 100644 index 000000000..2c0ed1db3 --- /dev/null +++ b/debian/patches-rt/0307-irq_work-Allow-irq_work_sync-to-sleep-if-irq_work-no.patch @@ -0,0 +1,100 @@ +From cf4bb976aef7af85b7134aa7cffdfa8e058dc5c6 Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Wed, 24 Nov 2021 17:12:19 +0100 +Subject: [PATCH 307/323] irq_work: Allow irq_work_sync() to sleep if + irq_work() no IRQ support. +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +irq_work() triggers instantly an interrupt if supported by the +architecture. Otherwise the work will be processed on the next timer +tick. In worst case irq_work_sync() could spin up to a jiffy. + +irq_work_sync() is usually used in tear down context which is fully +preemptible. Based on review irq_work_sync() is invoked from preemptible +context and there is one waiter at a time. This qualifies it to use +rcuwait for synchronisation. + +Let irq_work_sync() synchronize with rcuwait if the architecture +processes irqwork via the timer tick. + +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> +Link: https://lkml.kernel.org/r/20211006111852.1514359-3-bigeasy@linutronix.de +Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org> +--- + include/linux/irq_work.h | 10 +++++++++- + kernel/irq_work.c | 10 ++++++++++ + 2 files changed, 19 insertions(+), 1 deletion(-) + +diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h +index f941f2d7d71c..3c6d3a96bca0 100644 +--- a/include/linux/irq_work.h ++++ b/include/linux/irq_work.h +@@ -3,6 +3,7 @@ + #define _LINUX_IRQ_WORK_H + + #include <linux/smp_types.h> ++#include <linux/rcuwait.h> + + /* + * An entry can be in one of four states: +@@ -22,6 +23,7 @@ struct irq_work { + }; + }; + void (*func)(struct irq_work *); ++ struct rcuwait irqwait; + }; + + static inline +@@ -29,13 +31,19 @@ void init_irq_work(struct irq_work *work, void (*func)(struct irq_work *)) + { + atomic_set(&work->flags, 0); + work->func = func; ++ rcuwait_init(&work->irqwait); + } + + #define DEFINE_IRQ_WORK(name, _f) struct irq_work name = { \ + .flags = ATOMIC_INIT(0), \ +- .func = (_f) \ ++ .func = (_f), \ ++ .irqwait = __RCUWAIT_INITIALIZER(irqwait), \ + } + ++static inline bool irq_work_is_busy(struct irq_work *work) ++{ ++ return atomic_read(&work->flags) & IRQ_WORK_BUSY; ++} + + bool irq_work_queue(struct irq_work *work); + bool irq_work_queue_on(struct irq_work *work, int cpu); +diff --git a/kernel/irq_work.c b/kernel/irq_work.c +index 8183d30e1bb1..8969aff790e2 100644 +--- a/kernel/irq_work.c ++++ b/kernel/irq_work.c +@@ -165,6 +165,9 @@ void irq_work_single(void *arg) + */ + flags &= ~IRQ_WORK_PENDING; + (void)atomic_cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY); ++ ++ if (!arch_irq_work_has_interrupt()) ++ rcuwait_wake_up(&work->irqwait); + } + + static void irq_work_run_list(struct llist_head *list) +@@ -231,6 +234,13 @@ void irq_work_tick_soft(void) + void irq_work_sync(struct irq_work *work) + { + lockdep_assert_irqs_enabled(); ++ might_sleep(); ++ ++ if (!arch_irq_work_has_interrupt()) { ++ rcuwait_wait_event(&work->irqwait, !irq_work_is_busy(work), ++ TASK_UNINTERRUPTIBLE); ++ return; ++ } + + while (atomic_read(&work->flags) & IRQ_WORK_BUSY) + cpu_relax(); +-- +2.43.0 + diff --git a/debian/patches-rt/0308-irq_work-Handle-some-irq_work-in-a-per-CPU-thread-on.patch b/debian/patches-rt/0308-irq_work-Handle-some-irq_work-in-a-per-CPU-thread-on.patch new file mode 100644 index 000000000..a6aea41c0 --- /dev/null +++ b/debian/patches-rt/0308-irq_work-Handle-some-irq_work-in-a-per-CPU-thread-on.patch @@ -0,0 +1,307 @@ +From 2b803272d74039863a77523d79f79cc938eff7cf Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Wed, 24 Nov 2021 17:12:20 +0100 +Subject: [PATCH 308/323] irq_work: Handle some irq_work in a per-CPU thread on + PREEMPT_RT +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +The irq_work callback is invoked in hard IRQ context. By default all +callbacks are scheduled for invocation right away (given supported by +the architecture) except for the ones marked IRQ_WORK_LAZY which are +delayed until the next timer-tick. + +While looking over the callbacks, some of them may acquire locks +(spinlock_t, rwlock_t) which are transformed into sleeping locks on +PREEMPT_RT and must not be acquired in hard IRQ context. +Changing the locks into locks which could be acquired in this context +will lead to other problems such as increased latencies if everything +in the chain has IRQ-off locks. This will not solve all the issues as +one callback has been noticed which invoked kref_put() and its callback +invokes kfree() and this can not be invoked in hardirq context. + +Some callbacks are required to be invoked in hardirq context even on +PREEMPT_RT to work properly. This includes for instance the NO_HZ +callback which needs to be able to observe the idle context. + +The callbacks which require to be run in hardirq have already been +marked. Use this information to split the callbacks onto the two lists +on PREEMPT_RT: +- lazy_list + Work items which are not marked with IRQ_WORK_HARD_IRQ will be added + to this list. Callbacks on this list will be invoked from a per-CPU + thread. + The handler here may acquire sleeping locks such as spinlock_t and + invoke kfree(). + +- raised_list + Work items which are marked with IRQ_WORK_HARD_IRQ will be added to + this list. They will be invoked in hardirq context and must not + acquire any sleeping locks. + +The wake up of the per-CPU thread occurs from irq_work handler/ +hardirq context. The thread runs with lowest RT priority to ensure it +runs before any SCHED_OTHER tasks do. + +[bigeasy: melt tglx's irq_work_tick_soft() which splits irq_work_tick() into a + hard and soft variant. Collected fixes over time from Steven + Rostedt and Mike Galbraith. Move to per-CPU threads instead of + softirq as suggested by PeterZ.] + +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> +Link: https://lkml.kernel.org/r/20211007092646.uhshe3ut2wkrcfzv@linutronix.de +Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org> +--- + include/linux/irq_work.h | 16 +++-- + kernel/irq_work.c | 131 ++++++++++++++++++++++++++++----------- + kernel/time/timer.c | 2 - + 3 files changed, 106 insertions(+), 43 deletions(-) + +diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h +index 3c6d3a96bca0..f551ba9c99d4 100644 +--- a/include/linux/irq_work.h ++++ b/include/linux/irq_work.h +@@ -40,6 +40,16 @@ void init_irq_work(struct irq_work *work, void (*func)(struct irq_work *)) + .irqwait = __RCUWAIT_INITIALIZER(irqwait), \ + } + ++#define __IRQ_WORK_INIT(_func, _flags) (struct irq_work){ \ ++ .flags = ATOMIC_INIT(_flags), \ ++ .func = (_func), \ ++ .irqwait = __RCUWAIT_INITIALIZER(irqwait), \ ++} ++ ++#define IRQ_WORK_INIT(_func) __IRQ_WORK_INIT(_func, 0) ++#define IRQ_WORK_INIT_LAZY(_func) __IRQ_WORK_INIT(_func, IRQ_WORK_LAZY) ++#define IRQ_WORK_INIT_HARD(_func) __IRQ_WORK_INIT(_func, IRQ_WORK_HARD_IRQ) ++ + static inline bool irq_work_is_busy(struct irq_work *work) + { + return atomic_read(&work->flags) & IRQ_WORK_BUSY; +@@ -63,10 +73,4 @@ static inline void irq_work_run(void) { } + static inline void irq_work_single(void *arg) { } + #endif + +-#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_PREEMPT_RT) +-void irq_work_tick_soft(void); +-#else +-static inline void irq_work_tick_soft(void) { } +-#endif +- + #endif /* _LINUX_IRQ_WORK_H */ +diff --git a/kernel/irq_work.c b/kernel/irq_work.c +index 8969aff790e2..03d09d779ee1 100644 +--- a/kernel/irq_work.c ++++ b/kernel/irq_work.c +@@ -18,12 +18,37 @@ + #include <linux/cpu.h> + #include <linux/notifier.h> + #include <linux/smp.h> ++#include <linux/smpboot.h> + #include <linux/interrupt.h> + #include <asm/processor.h> + + + static DEFINE_PER_CPU(struct llist_head, raised_list); + static DEFINE_PER_CPU(struct llist_head, lazy_list); ++static DEFINE_PER_CPU(struct task_struct *, irq_workd); ++ ++static void wake_irq_workd(void) ++{ ++ struct task_struct *tsk = __this_cpu_read(irq_workd); ++ ++ if (!llist_empty(this_cpu_ptr(&lazy_list)) && tsk) ++ wake_up_process(tsk); ++} ++ ++#ifdef CONFIG_SMP ++static void irq_work_wake(struct irq_work *entry) ++{ ++ wake_irq_workd(); ++} ++ ++static DEFINE_PER_CPU(struct irq_work, irq_work_wakeup) = ++ IRQ_WORK_INIT_HARD(irq_work_wake); ++#endif ++ ++static int irq_workd_should_run(unsigned int cpu) ++{ ++ return !llist_empty(this_cpu_ptr(&lazy_list)); ++} + + /* + * Claim the entry so that no one else will poke at it. +@@ -54,20 +79,28 @@ void __weak arch_irq_work_raise(void) + static void __irq_work_queue_local(struct irq_work *work) + { + struct llist_head *list; +- bool lazy_work, realtime = IS_ENABLED(CONFIG_PREEMPT_RT); +- +- lazy_work = atomic_read(&work->flags) & IRQ_WORK_LAZY; +- +- /* If the work is "lazy", handle it from next tick if any */ +- if (lazy_work || (realtime && !(atomic_read(&work->flags) & IRQ_WORK_HARD_IRQ))) ++ bool rt_lazy_work = false; ++ bool lazy_work = false; ++ int work_flags; ++ ++ work_flags = atomic_read(&work->flags); ++ if (work_flags & IRQ_WORK_LAZY) ++ lazy_work = true; ++ else if (IS_ENABLED(CONFIG_PREEMPT_RT) && ++ !(work_flags & IRQ_WORK_HARD_IRQ)) ++ rt_lazy_work = true; ++ ++ if (lazy_work || rt_lazy_work) + list = this_cpu_ptr(&lazy_list); + else + list = this_cpu_ptr(&raised_list); + +- if (llist_add(&work->llnode, list)) { +- if (!lazy_work || tick_nohz_tick_stopped()) +- arch_irq_work_raise(); +- } ++ if (!llist_add(&work->llnode, list)) ++ return; ++ ++ /* If the work is "lazy", handle it from next tick if any */ ++ if (!lazy_work || tick_nohz_tick_stopped()) ++ arch_irq_work_raise(); + } + + /* Enqueue the irq work @work on the current CPU */ +@@ -110,15 +143,27 @@ bool irq_work_queue_on(struct irq_work *work, int cpu) + /* Arch remote IPI send/receive backend aren't NMI safe */ + WARN_ON_ONCE(in_nmi()); + +- if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(atomic_read(&work->flags) & IRQ_WORK_HARD_IRQ)) { +- if (llist_add(&work->llnode, &per_cpu(lazy_list, cpu))) +- arch_send_call_function_single_ipi(cpu); +- } else { +- __smp_call_single_queue(cpu, &work->llnode); ++ /* ++ * On PREEMPT_RT the items which are not marked as ++ * IRQ_WORK_HARD_IRQ are added to the lazy list and a HARD work ++ * item is used on the remote CPU to wake the thread. ++ */ ++ if (IS_ENABLED(CONFIG_PREEMPT_RT) && ++ !(atomic_read(&work->flags) & IRQ_WORK_HARD_IRQ)) { ++ ++ if (!llist_add(&work->llnode, &per_cpu(lazy_list, cpu))) ++ goto out; ++ ++ work = &per_cpu(irq_work_wakeup, cpu); ++ if (!irq_work_claim(work)) ++ goto out; + } ++ ++ __smp_call_single_queue(cpu, &work->llnode); + } else { + __irq_work_queue_local(work); + } ++out: + preempt_enable(); + + return true; +@@ -175,12 +220,13 @@ static void irq_work_run_list(struct llist_head *list) + struct irq_work *work, *tmp; + struct llist_node *llnode; + +-#ifndef CONFIG_PREEMPT_RT + /* +- * nort: On RT IRQ-work may run in SOFTIRQ context. ++ * On PREEMPT_RT IRQ-work which is not marked as HARD will be processed ++ * in a per-CPU thread in preemptible context. Only the items which are ++ * marked as IRQ_WORK_HARD_IRQ will be processed in hardirq context. + */ +- BUG_ON(!irqs_disabled()); +-#endif ++ BUG_ON(!irqs_disabled() && !IS_ENABLED(CONFIG_PREEMPT_RT)); ++ + if (llist_empty(list)) + return; + +@@ -196,16 +242,10 @@ static void irq_work_run_list(struct llist_head *list) + void irq_work_run(void) + { + irq_work_run_list(this_cpu_ptr(&raised_list)); +- if (IS_ENABLED(CONFIG_PREEMPT_RT)) { +- /* +- * NOTE: we raise softirq via IPI for safety, +- * and execute in irq_work_tick() to move the +- * overhead from hard to soft irq context. +- */ +- if (!llist_empty(this_cpu_ptr(&lazy_list))) +- raise_softirq(TIMER_SOFTIRQ); +- } else ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + irq_work_run_list(this_cpu_ptr(&lazy_list)); ++ else ++ wake_irq_workd(); + } + EXPORT_SYMBOL_GPL(irq_work_run); + +@@ -218,15 +258,10 @@ void irq_work_tick(void) + + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + irq_work_run_list(this_cpu_ptr(&lazy_list)); ++ else ++ wake_irq_workd(); + } + +-#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_PREEMPT_RT) +-void irq_work_tick_soft(void) +-{ +- irq_work_run_list(this_cpu_ptr(&lazy_list)); +-} +-#endif +- + /* + * Synchronize against the irq_work @entry, ensures the entry is not + * currently in use. +@@ -246,3 +281,29 @@ void irq_work_sync(struct irq_work *work) + cpu_relax(); + } + EXPORT_SYMBOL_GPL(irq_work_sync); ++ ++static void run_irq_workd(unsigned int cpu) ++{ ++ irq_work_run_list(this_cpu_ptr(&lazy_list)); ++} ++ ++static void irq_workd_setup(unsigned int cpu) ++{ ++ sched_set_fifo_low(current); ++} ++ ++static struct smp_hotplug_thread irqwork_threads = { ++ .store = &irq_workd, ++ .setup = irq_workd_setup, ++ .thread_should_run = irq_workd_should_run, ++ .thread_fn = run_irq_workd, ++ .thread_comm = "irq_work/%u", ++}; ++ ++static __init int irq_work_init_threads(void) ++{ ++ if (IS_ENABLED(CONFIG_PREEMPT_RT)) ++ BUG_ON(smpboot_register_percpu_thread(&irqwork_threads)); ++ return 0; ++} ++early_initcall(irq_work_init_threads); +diff --git a/kernel/time/timer.c b/kernel/time/timer.c +index 1cad0efd635c..a4fdc7cfb723 100644 +--- a/kernel/time/timer.c ++++ b/kernel/time/timer.c +@@ -1770,8 +1770,6 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h) + { + struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); + +- irq_work_tick_soft(); +- + __run_timers(base); + if (IS_ENABLED(CONFIG_NO_HZ_COMMON)) + __run_timers(this_cpu_ptr(&timer_bases[BASE_DEF])); +-- +2.43.0 + diff --git a/debian/patches-rt/0309-irq_work-Also-rcuwait-for-IRQ_WORK_HARD_IRQ-on-PREEM.patch b/debian/patches-rt/0309-irq_work-Also-rcuwait-for-IRQ_WORK_HARD_IRQ-on-PREEM.patch new file mode 100644 index 000000000..ed320f6b7 --- /dev/null +++ b/debian/patches-rt/0309-irq_work-Also-rcuwait-for-IRQ_WORK_HARD_IRQ-on-PREEM.patch @@ -0,0 +1,65 @@ +From ca685c962ea2301291d3459ec8717208395b3cf9 Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Wed, 24 Nov 2021 17:12:21 +0100 +Subject: [PATCH 309/323] irq_work: Also rcuwait for !IRQ_WORK_HARD_IRQ on + PREEMPT_RT +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +On PREEMPT_RT most items are processed as LAZY via softirq context. +Avoid to spin-wait for them because irq_work_sync() could have higher +priority and not allow the irq-work to be completed. + +Wait additionally for !IRQ_WORK_HARD_IRQ irq_work items on PREEMPT_RT. + +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> +Link: https://lkml.kernel.org/r/20211006111852.1514359-5-bigeasy@linutronix.de +Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org> +--- + include/linux/irq_work.h | 5 +++++ + kernel/irq_work.c | 6 ++++-- + 2 files changed, 9 insertions(+), 2 deletions(-) + +diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h +index f551ba9c99d4..2c0059340871 100644 +--- a/include/linux/irq_work.h ++++ b/include/linux/irq_work.h +@@ -55,6 +55,11 @@ static inline bool irq_work_is_busy(struct irq_work *work) + return atomic_read(&work->flags) & IRQ_WORK_BUSY; + } + ++static inline bool irq_work_is_hard(struct irq_work *work) ++{ ++ return atomic_read(&work->flags) & IRQ_WORK_HARD_IRQ; ++} ++ + bool irq_work_queue(struct irq_work *work); + bool irq_work_queue_on(struct irq_work *work, int cpu); + +diff --git a/kernel/irq_work.c b/kernel/irq_work.c +index 03d09d779ee1..cbec10c32ead 100644 +--- a/kernel/irq_work.c ++++ b/kernel/irq_work.c +@@ -211,7 +211,8 @@ void irq_work_single(void *arg) + flags &= ~IRQ_WORK_PENDING; + (void)atomic_cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY); + +- if (!arch_irq_work_has_interrupt()) ++ if ((IS_ENABLED(CONFIG_PREEMPT_RT) && !irq_work_is_hard(work)) || ++ !arch_irq_work_has_interrupt()) + rcuwait_wake_up(&work->irqwait); + } + +@@ -271,7 +272,8 @@ void irq_work_sync(struct irq_work *work) + lockdep_assert_irqs_enabled(); + might_sleep(); + +- if (!arch_irq_work_has_interrupt()) { ++ if ((IS_ENABLED(CONFIG_PREEMPT_RT) && !irq_work_is_hard(work)) || ++ !arch_irq_work_has_interrupt()) { + rcuwait_wait_event(&work->irqwait, !irq_work_is_busy(work), + TASK_UNINTERRUPTIBLE); + return; +-- +2.43.0 + diff --git a/debian/patches-rt/0310-eventfd-Make-signal-recursion-protection-a-task-bit.patch b/debian/patches-rt/0310-eventfd-Make-signal-recursion-protection-a-task-bit.patch new file mode 100644 index 000000000..719316f76 --- /dev/null +++ b/debian/patches-rt/0310-eventfd-Make-signal-recursion-protection-a-task-bit.patch @@ -0,0 +1,150 @@ +From 17c41196e72418f94cf308a8b23fe478612a1610 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner <tglx@linutronix.de> +Date: Fri, 17 Dec 2021 11:32:09 +0100 +Subject: [PATCH 310/323] eventfd: Make signal recursion protection a task bit +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Upstream commit b542e383d8c005f06a131e2b40d5889b812f19c6 + +The recursion protection for eventfd_signal() is based on a per CPU +variable and relies on the !RT semantics of spin_lock_irqsave() for +protecting this per CPU variable. On RT kernels spin_lock_irqsave() neither +disables preemption nor interrupts which allows the spin lock held section +to be preempted. If the preempting task invokes eventfd_signal() as well, +then the recursion warning triggers. + +Paolo suggested to protect the per CPU variable with a local lock, but +that's heavyweight and actually not necessary. The goal of this protection +is to prevent the task stack from overflowing, which can be achieved with a +per task recursion protection as well. + +Replace the per CPU variable with a per task bit similar to other recursion +protection bits like task_struct::in_page_owner. This works on both !RT and +RT kernels and removes as a side effect the extra per CPU storage. + +No functional change for !RT kernels. + +Reported-by: Daniel Bristot de Oliveira <bristot@redhat.com> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Tested-by: Daniel Bristot de Oliveira <bristot@redhat.com> +Acked-by: Jason Wang <jasowang@redhat.com> +Cc: Al Viro <viro@zeniv.linux.org.uk> +Link: https://lore.kernel.org/r/87wnp9idso.ffs@tglx +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Signed-off-by: Luis Claudio R. Goncalves <lgoncalv@redhat.com> +--- + fs/aio.c | 2 +- + fs/eventfd.c | 12 +++++------- + include/linux/eventfd.h | 11 +++++------ + include/linux/sched.h | 4 ++++ + 4 files changed, 15 insertions(+), 14 deletions(-) + +diff --git a/fs/aio.c b/fs/aio.c +index c90e045a37bc..e1181dc37be1 100644 +--- a/fs/aio.c ++++ b/fs/aio.c +@@ -1765,7 +1765,7 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, + list_del_init(&req->wait.entry); + list_del(&iocb->ki_list); + iocb->ki_res.res = mangle_poll(mask); +- if (iocb->ki_eventfd && eventfd_signal_count()) { ++ if (iocb->ki_eventfd && eventfd_signal_allowed()) { + iocb = NULL; + INIT_WORK(&req->work, aio_poll_put_work); + schedule_work(&req->work); +diff --git a/fs/eventfd.c b/fs/eventfd.c +index 3673eb8de035..ce9ca07d6c00 100644 +--- a/fs/eventfd.c ++++ b/fs/eventfd.c +@@ -25,8 +25,6 @@ + #include <linux/idr.h> + #include <linux/uio.h> + +-DEFINE_PER_CPU(int, eventfd_wake_count); +- + static DEFINE_IDA(eventfd_ida); + + struct eventfd_ctx { +@@ -53,21 +51,21 @@ __u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, unsigned mask) + * Deadlock or stack overflow issues can happen if we recurse here + * through waitqueue wakeup handlers. If the caller users potentially + * nested waitqueues with custom wakeup handlers, then it should +- * check eventfd_signal_count() before calling this function. If +- * it returns true, the eventfd_signal() call should be deferred to a ++ * check eventfd_signal_allowed() before calling this function. If ++ * it returns false, the eventfd_signal() call should be deferred to a + * safe context. + */ +- if (WARN_ON_ONCE(this_cpu_read(eventfd_wake_count))) ++ if (WARN_ON_ONCE(current->in_eventfd_signal)) + return 0; + + spin_lock_irqsave(&ctx->wqh.lock, flags); +- this_cpu_inc(eventfd_wake_count); ++ current->in_eventfd_signal = 1; + if (ULLONG_MAX - ctx->count < n) + n = ULLONG_MAX - ctx->count; + ctx->count += n; + if (waitqueue_active(&ctx->wqh)) + wake_up_locked_poll(&ctx->wqh, EPOLLIN | mask); +- this_cpu_dec(eventfd_wake_count); ++ current->in_eventfd_signal = 0; + spin_unlock_irqrestore(&ctx->wqh.lock, flags); + + return n; +diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h +index c1bd4883e2fa..842d223dfe17 100644 +--- a/include/linux/eventfd.h ++++ b/include/linux/eventfd.h +@@ -14,6 +14,7 @@ + #include <linux/err.h> + #include <linux/percpu-defs.h> + #include <linux/percpu.h> ++#include <linux/sched.h> + + /* + * CAREFUL: Check include/uapi/asm-generic/fcntl.h when defining +@@ -44,11 +45,9 @@ int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *w + __u64 *cnt); + void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt); + +-DECLARE_PER_CPU(int, eventfd_wake_count); +- +-static inline bool eventfd_signal_count(void) ++static inline bool eventfd_signal_allowed(void) + { +- return this_cpu_read(eventfd_wake_count); ++ return !current->in_eventfd_signal; + } + + #else /* CONFIG_EVENTFD */ +@@ -85,9 +84,9 @@ static inline int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, + return -ENOSYS; + } + +-static inline bool eventfd_signal_count(void) ++static inline bool eventfd_signal_allowed(void) + { +- return false; ++ return true; + } + + static inline void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt) +diff --git a/include/linux/sched.h b/include/linux/sched.h +index a73528e8235d..13d4957189bb 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -848,6 +848,10 @@ struct task_struct { + /* Stalled due to lack of memory */ + unsigned in_memstall:1; + #endif ++#ifdef CONFIG_EVENTFD ++ /* Recursion prevention for eventfd_signal() */ ++ unsigned in_eventfd_signal:1; ++#endif + + unsigned long atomic_flags; /* Flags requiring atomic access. */ + +-- +2.43.0 + diff --git a/debian/patches-rt/0311-stop_machine-Remove-this_cpu_ptr-from-print_stop_inf.patch b/debian/patches-rt/0311-stop_machine-Remove-this_cpu_ptr-from-print_stop_inf.patch new file mode 100644 index 000000000..653909e6e --- /dev/null +++ b/debian/patches-rt/0311-stop_machine-Remove-this_cpu_ptr-from-print_stop_inf.patch @@ -0,0 +1,38 @@ +From ce2910a453196930e7083aaf13240adcec160afd Mon Sep 17 00:00:00 2001 +From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Date: Fri, 17 Dec 2021 11:32:08 +0100 +Subject: [PATCH 311/323] stop_machine: Remove this_cpu_ptr() from + print_stop_info(). +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +This aligns the patch ("stop_machine: Add function and caller debug +info) with commit + a8b62fd085050 ("stop_machine: Add function and caller debug info") + +that was merged upstream and is slightly different. + +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + kernel/stop_machine.c | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c +index dbf585cf4b9f..971d8acceaec 100644 +--- a/kernel/stop_machine.c ++++ b/kernel/stop_machine.c +@@ -51,7 +51,11 @@ static bool stop_machine_initialized = false; + + void print_stop_info(const char *log_lvl, struct task_struct *task) + { +- struct cpu_stopper *stopper = this_cpu_ptr(&cpu_stopper); ++ /* ++ * If @task is a stopper task, it cannot migrate and task_cpu() is ++ * stable. ++ */ ++ struct cpu_stopper *stopper = per_cpu_ptr(&cpu_stopper, task_cpu(task)); + + if (task != stopper->thread) + return; +-- +2.43.0 + diff --git a/debian/patches-rt/0312-aio-Fix-incorrect-usage-of-eventfd_signal_allowed.patch b/debian/patches-rt/0312-aio-Fix-incorrect-usage-of-eventfd_signal_allowed.patch new file mode 100644 index 000000000..978a2de7d --- /dev/null +++ b/debian/patches-rt/0312-aio-Fix-incorrect-usage-of-eventfd_signal_allowed.patch @@ -0,0 +1,38 @@ +From fd840c98e310adf2a03ee242969a9597a909a672 Mon Sep 17 00:00:00 2001 +From: Xie Yongji <xieyongji@bytedance.com> +Date: Mon, 13 Sep 2021 19:19:28 +0800 +Subject: [PATCH 312/323] aio: Fix incorrect usage of eventfd_signal_allowed() +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +commit 4b3749865374899e115aa8c48681709b086fe6d3 upstream. + +We should defer eventfd_signal() to the workqueue when +eventfd_signal_allowed() return false rather than return +true. + +Fixes: b542e383d8c0 ("eventfd: Make signal recursion protection a task bit") +Signed-off-by: Xie Yongji <xieyongji@bytedance.com> +Link: https://lore.kernel.org/r/20210913111928.98-1-xieyongji@bytedance.com +Reviewed-by: Eric Biggers <ebiggers@google.com> +Signed-off-by: Eric Biggers <ebiggers@google.com> +Signed-off-by: Luis Claudio R. Goncalves <lgoncalv@redhat.com> +--- + fs/aio.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/aio.c b/fs/aio.c +index e1181dc37be1..e88fd9b58f3f 100644 +--- a/fs/aio.c ++++ b/fs/aio.c +@@ -1765,7 +1765,7 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, + list_del_init(&req->wait.entry); + list_del(&iocb->ki_list); + iocb->ki_res.res = mangle_poll(mask); +- if (iocb->ki_eventfd && eventfd_signal_allowed()) { ++ if (iocb->ki_eventfd && !eventfd_signal_allowed()) { + iocb = NULL; + INIT_WORK(&req->work, aio_poll_put_work); + schedule_work(&req->work); +-- +2.43.0 + diff --git a/debian/patches-rt/0313-rt-remove-extra-parameter-from-__trace_stack.patch b/debian/patches-rt/0313-rt-remove-extra-parameter-from-__trace_stack.patch new file mode 100644 index 000000000..d688d1f6d --- /dev/null +++ b/debian/patches-rt/0313-rt-remove-extra-parameter-from-__trace_stack.patch @@ -0,0 +1,41 @@ +From 38a85604759ae488654bab5c1cd9f23cd0d77ce0 Mon Sep 17 00:00:00 2001 +From: "Luis Claudio R. Goncalves" <lgoncalv@redhat.com> +Date: Thu, 12 May 2022 23:25:33 -0300 +Subject: [PATCH 313/323] rt: remove extra parameter from __trace_stack() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Fix the build error below while keeping the current PREEMPT_RT code: + +kernel/trace/trace_events_trigger.c: In function ‘stacktrace_trigger’: +kernel/trace/trace_events_trigger.c:1227:3: error: too many arguments to function ‘__trace_stack’ + __trace_stack(file->tr, flags, STACK_SKIP, preempt_count()); + ^~~~~~~~~~~~~ +In file included from kernel/trace/trace_events_trigger.c:15: +kernel/trace/trace.h:826:6: note: declared here + void __trace_stack(struct trace_array *tr, unsigned int trace_ctx, int skip); + ^~~~~~~~~~~~~ + +Signed-off-by: Luis Claudio R. Goncalves <lgoncalv@redhat.com> +--- + kernel/trace/trace_events_trigger.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c +index 4bc90965abb2..75fef9fcfd0f 100644 +--- a/kernel/trace/trace_events_trigger.c ++++ b/kernel/trace/trace_events_trigger.c +@@ -1224,7 +1224,7 @@ stacktrace_trigger(struct event_trigger_data *data, void *rec, + + if (file) { + local_save_flags(flags); +- __trace_stack(file->tr, flags, STACK_SKIP, preempt_count()); ++ __trace_stack(file->tr, STACK_SKIP, preempt_count()); + } else + trace_dump_stack(STACK_SKIP); + } +-- +2.43.0 + diff --git a/debian/patches-rt/0314-locking-rtmutex-switch-to-EXPORT_SYMBOL-for-ww_mutex.patch b/debian/patches-rt/0314-locking-rtmutex-switch-to-EXPORT_SYMBOL-for-ww_mutex.patch new file mode 100644 index 000000000..4cb76be05 --- /dev/null +++ b/debian/patches-rt/0314-locking-rtmutex-switch-to-EXPORT_SYMBOL-for-ww_mutex.patch @@ -0,0 +1,43 @@ +From a02b1841d962977869e02fff684931b53c8d4232 Mon Sep 17 00:00:00 2001 +From: Yajun Deng <yajun.deng@linux.dev> +Date: Wed, 3 Aug 2022 14:24:30 +0800 +Subject: [PATCH 314/323] locking/rtmutex: switch to EXPORT_SYMBOL() for + ww_mutex_lock{,_interruptible}() +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +We can use EXPORT_SYMBOL() instead of EXPORT_SYMBOL_GPL() in +ww_mutex_lock_interruptible() and ww_mutex_lock(). That match +ww_mutex_unlock() well. And also good for 3rd kernel modules. + +Link: https://lore.kernel.org/r/20220803062430.1307312-1-yajun.deng@linux.dev +Signed-off-by: Yajun Deng <yajun.deng@linux.dev> +Signed-off-by: Luis Claudio R. Goncalves <lgoncalv@redhat.com> +--- + kernel/locking/rtmutex.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c +index 31b374d36d04..b110fc57f733 100644 +--- a/kernel/locking/rtmutex.c ++++ b/kernel/locking/rtmutex.c +@@ -2513,7 +2513,7 @@ ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) + + return ret; + } +-EXPORT_SYMBOL_GPL(ww_mutex_lock_interruptible); ++EXPORT_SYMBOL(ww_mutex_lock_interruptible); + + int __sched + ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +@@ -2533,7 +2533,7 @@ ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) + + return ret; + } +-EXPORT_SYMBOL_GPL(ww_mutex_lock); ++EXPORT_SYMBOL(ww_mutex_lock); + + void __sched ww_mutex_unlock(struct ww_mutex *lock) + { +-- +2.43.0 + diff --git a/debian/patches-rt/0315-ftrace-Fix-improper-usage-of-__trace_stack-function.patch b/debian/patches-rt/0315-ftrace-Fix-improper-usage-of-__trace_stack-function.patch new file mode 100644 index 000000000..9d6f96753 --- /dev/null +++ b/debian/patches-rt/0315-ftrace-Fix-improper-usage-of-__trace_stack-function.patch @@ -0,0 +1,50 @@ +From 54eb98b6f77e491259ac08a1172d36fd6ccf5284 Mon Sep 17 00:00:00 2001 +From: Anand Je Saipureddy <s.anandje1@gmail.com> +Date: Sat, 23 Jul 2022 12:19:43 +0530 +Subject: [PATCH 315/323] ftrace: Fix improper usage of __trace_stack() + function. +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +In kernel/trace/trace_events_trigger.c --> stacktrace_trigger() --> +__trace_stack() is not defined as per the function definition. + +With commit edbaaa13a660 +("tracing: Merge irqflags + preemt counter, add RT bits") +the irqflags(flags) and preemption counter(preempt_count()) are +now should be evaluated early by tracing_gen_ctx(). + +This patch replaces the irqflags and preemption counter +with tracing_gen_ctx(). + +Fixes: 5e8446e3820c ("tracing: Dump stacktrace trigger to the corresponding instance") +Link: https://lore.kernel.org/r/20220723064943.16532-1-s.anandje1@gmail.com +Signed-off-by: Anand Je Saipureddy <s.anandje1@gmail.com> +Reviewed-by: Corey Minyard <cminyard@mvista.com> +Signed-off-by: Luis Claudio R. Goncalves <lgoncalv@redhat.com> +--- + kernel/trace/trace_events_trigger.c | 8 +++----- + 1 file changed, 3 insertions(+), 5 deletions(-) + +diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c +index 75fef9fcfd0f..3c6229f16e81 100644 +--- a/kernel/trace/trace_events_trigger.c ++++ b/kernel/trace/trace_events_trigger.c +@@ -1220,12 +1220,10 @@ stacktrace_trigger(struct event_trigger_data *data, void *rec, + struct ring_buffer_event *event) + { + struct trace_event_file *file = data->private_data; +- unsigned long flags; + +- if (file) { +- local_save_flags(flags); +- __trace_stack(file->tr, STACK_SKIP, preempt_count()); +- } else ++ if (file) ++ __trace_stack(file->tr, tracing_gen_ctx(), STACK_SKIP); ++ else + trace_dump_stack(STACK_SKIP); + } + +-- +2.43.0 + diff --git a/debian/patches-rt/0316-rt-arm64-make-_TIF_WORK_MASK-bits-contiguous.patch b/debian/patches-rt/0316-rt-arm64-make-_TIF_WORK_MASK-bits-contiguous.patch new file mode 100644 index 000000000..38664d0ff --- /dev/null +++ b/debian/patches-rt/0316-rt-arm64-make-_TIF_WORK_MASK-bits-contiguous.patch @@ -0,0 +1,57 @@ +From 599ec4565144185541405febb71e81cf36cad908 Mon Sep 17 00:00:00 2001 +From: Salvatore Bonaccorso <carnil@debian.org> +Date: Fri, 20 Jan 2023 19:23:03 +0100 +Subject: [PATCH 316/323] rt: arm64: make _TIF_WORK_MASK bits contiguous +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +As same as in commit 870d16757ba8 ("arm64: make _TIF_WORK_MASK bits +contiguous") in mainline, we need to make the bits of _TIF_WORK_MASK to +be contiguous in order to use this as an immediate argument to an AND +instruction in entry.S. + +We shuffle these bits down-by-one keeping the existing contiguity after +inserting TIF_NEED_RESCHED_LAZY in the preempt-rt patch series. + +Otherwise, omitting this change will result in a build failure as below: + + arch/arm64/kernel/entry.S: Assembler messages: + arch/arm64/kernel/entry.S:763: Error: immediate out of range at operand 3 -- `and x2,x19,#((1<<1)|(1<<0)|(1<<2)|(1<<3)|(1<<4)|(1<<5)|(1<<6)|(1<<13)|(1<<7))' + +Reported-by: Vignesh Raghavendra <vigneshr@ti.com> +Reported-by: Pavel Machek <pavel@denx.de> +Cc: Mark Rutland <mark.rutland@arm.com> +Cc: Catalin Marinas <catalin.marinas@arm.com> +Cc: Will Deacon <will@kernel.org> +Link: https://lore.kernel.org/lkml/40de655e-26f3-aa7b-f1ec-6877396a9f1e@ti.com/ +Signed-off-by: Salvatore Bonaccorso <carnil@debian.org> +Signed-off-by: Luis Claudio R. Goncalves <lgoncalv@redhat.com> +--- + arch/arm64/include/asm/thread_info.h | 12 ++++++------ + 1 file changed, 6 insertions(+), 6 deletions(-) + +diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h +index 6eb36a2126e8..2afd9ceb66c9 100644 +--- a/arch/arm64/include/asm/thread_info.h ++++ b/arch/arm64/include/asm/thread_info.h +@@ -70,12 +70,12 @@ void arch_release_task_struct(struct task_struct *tsk); + #define TIF_FSCHECK 5 /* Check FS is USER_DS on return */ + #define TIF_MTE_ASYNC_FAULT 6 /* MTE Asynchronous Tag Check Fault */ + #define TIF_NOTIFY_SIGNAL 7 /* signal notifications exist */ +-#define TIF_SYSCALL_TRACE 8 /* syscall trace active */ +-#define TIF_SYSCALL_AUDIT 9 /* syscall auditing */ +-#define TIF_SYSCALL_TRACEPOINT 10 /* syscall tracepoint for ftrace */ +-#define TIF_SECCOMP 11 /* syscall secure computing */ +-#define TIF_SYSCALL_EMU 12 /* syscall emulation active */ +-#define TIF_NEED_RESCHED_LAZY 13 ++#define TIF_NEED_RESCHED_LAZY 8 ++#define TIF_SYSCALL_TRACE 9 /* syscall trace active */ ++#define TIF_SYSCALL_AUDIT 10 /* syscall auditing */ ++#define TIF_SYSCALL_TRACEPOINT 11 /* syscall tracepoint for ftrace */ ++#define TIF_SECCOMP 12 /* syscall secure computing */ ++#define TIF_SYSCALL_EMU 13 /* syscall emulation active */ + #define TIF_MEMDIE 18 /* is terminating due to OOM killer */ + #define TIF_FREEZE 19 + #define TIF_RESTORE_SIGMASK 20 +-- +2.43.0 + diff --git a/debian/patches-rt/0317-printk-ignore-consoles-without-write-callback.patch b/debian/patches-rt/0317-printk-ignore-consoles-without-write-callback.patch new file mode 100644 index 000000000..a0942062a --- /dev/null +++ b/debian/patches-rt/0317-printk-ignore-consoles-without-write-callback.patch @@ -0,0 +1,50 @@ +From 7efd125170f344b124a00390d8d7b714beac1698 Mon Sep 17 00:00:00 2001 +From: John Ogness <john.ogness@linutronix.de> +Date: Fri, 17 Feb 2023 09:53:44 +0106 +Subject: [PATCH 317/323] printk: ignore consoles without write() callback +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +The ttynull driver does not provide an implementation for the write() +callback. This leads to a NULL pointer dereference in the related +printing kthread, which assumes it can call that callback. + +Do not create kthreads for consoles that do not implement the write() +callback. Also, for pr_flush(), ignore consoles that do not implement +write() or write_atomic(), since there is no way those consoles can +flush their output. + +Link: https://lore.kernel.org/lkml/1831554214.546921.1676479103702.JavaMail.zimbra@hale.at +Reported-by: Michael Thalmeier <michael.thalmeier@hale.at> +Signed-off-by: John Ogness <john.ogness@linutronix.de> +Signed-off-by: Luis Claudio R. Goncalves <lgoncalv@redhat.com> +--- + kernel/printk/printk.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c +index d2205872304d..64747c72fbea 100644 +--- a/kernel/printk/printk.c ++++ b/kernel/printk/printk.c +@@ -2267,6 +2267,10 @@ static int printk_kthread_func(void *data) + /* Must be called within console_lock(). */ + static void start_printk_kthread(struct console *con) + { ++ /* No need to start a printing thread if the console cannot print. */ ++ if (!con->write) ++ return; ++ + con->thread = kthread_run(printk_kthread_func, con, + "pr/%s%d", con->name, con->index); + if (IS_ERR(con->thread)) { +@@ -3566,6 +3570,8 @@ bool pr_flush(int timeout_ms, bool reset_on_progress) + for_each_console(con) { + if (!(con->flags & CON_ENABLED)) + continue; ++ if (!con->write && !con->write_atomic) ++ continue; + printk_seq = atomic64_read(&con->printk_seq); + if (printk_seq < seq) + diff += seq - printk_seq; +-- +2.43.0 + diff --git a/debian/patches-rt/0318-kernel-fork-set-wake_q_sleeper.next-NULL-again-in-du.patch b/debian/patches-rt/0318-kernel-fork-set-wake_q_sleeper.next-NULL-again-in-du.patch new file mode 100644 index 000000000..21e81955f --- /dev/null +++ b/debian/patches-rt/0318-kernel-fork-set-wake_q_sleeper.next-NULL-again-in-du.patch @@ -0,0 +1,61 @@ +From de3bf8893c5067f2c4d7ef3c57144c134e30ad7f Mon Sep 17 00:00:00 2001 +From: Steffen Dirkwinkel <s.dirkwinkel@beckhoff.com> +Date: Mon, 20 Mar 2023 09:03:47 +0100 +Subject: [PATCH 318/323] kernel: fork: set wake_q_sleeper.next=NULL again in + dup_task_struct +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Without this we get system hangs within a couple of days. +It's also reproducible in minutes with "stress-ng --exec 20". + +Example error in dmesg: +INFO: task stress-ng:163916 blocked for more than 120 seconds. + Not tainted 5.10.168-rt83 #2 +"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. +task:stress-ng state:D stack: 0 pid:163916 ppid: 72833 flags:0x00004000 +Call Trace: + __schedule+0x2bd/0x940 + preempt_schedule_lock+0x23/0x50 + rt_spin_lock_slowlock_locked+0x117/0x2c0 + rt_spin_lock_slowlock+0x51/0x80 + rt_write_lock+0x1e/0x1c0 + do_exit+0x3ac/0xb20 + do_group_exit+0x39/0xb0 + get_signal+0x145/0x960 + ? wake_up_new_task+0x21f/0x3c0 + arch_do_signal_or_restart+0xf1/0x830 + ? __x64_sys_futex+0x146/0x1d0 + exit_to_user_mode_prepare+0x116/0x1a0 + syscall_exit_to_user_mode+0x28/0x190 + entry_SYSCALL_64_after_hwframe+0x61/0xc6 +RIP: 0033:0x7f738d9074a7 +RSP: 002b:00007ffdafda3cb0 EFLAGS: 00000246 ORIG_RAX: 00000000000000ca +RAX: fffffffffffffe00 RBX: 00000000000000ca RCX: 00007f738d9074a7 +RDX: 0000000000028051 RSI: 0000000000000000 RDI: 00007f738be949d0 +RBP: 00007ffdafda3d88 R08: 0000000000000000 R09: 00007f738be94700 +R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000028051 +R13: 00007f738be949d0 R14: 00007ffdafda51e0 R15: 00007f738be94700 + +Fixes: 1ba44dcf789d ("Merge tag 'v5.10.162' into v5.10-rt") +Acked-by: Joe Korty <joe.korty@concurrent-rt.com> +Signed-off-by: Steffen Dirkwinkel <s.dirkwinkel@beckhoff.com> +Signed-off-by: Luis Claudio R. Goncalves <lgoncalv@redhat.com> +--- + kernel/fork.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/kernel/fork.c b/kernel/fork.c +index dfefb6e7e082..5dc7abedf08f 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -960,6 +960,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) + tsk->splice_pipe = NULL; + tsk->task_frag.page = NULL; + tsk->wake_q.next = NULL; ++ tsk->wake_q_sleeper.next = NULL; + tsk->pf_io_worker = NULL; + + account_kernel_stack(tsk, 1); +-- +2.43.0 + diff --git a/debian/patches-rt/0319-Revert-mm-page_alloc-fix-potential-deadlock-on-zonel.patch b/debian/patches-rt/0319-Revert-mm-page_alloc-fix-potential-deadlock-on-zonel.patch new file mode 100644 index 000000000..1872b2fc8 --- /dev/null +++ b/debian/patches-rt/0319-Revert-mm-page_alloc-fix-potential-deadlock-on-zonel.patch @@ -0,0 +1,63 @@ +From 59fc9fa59a8a6eaa1456e7a3035b10152a9e6d94 Mon Sep 17 00:00:00 2001 +From: "Luis Claudio R. Goncalves" <lgoncalv@redhat.com> +Date: Thu, 8 Jun 2023 19:47:25 -0300 +Subject: [PATCH 319/323] Revert "mm/page_alloc: fix potential deadlock on + zonelist_update_seqseqlock" +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +This reverts commit a992c387b41186ab968fd176ca26b432b05c53ec. + +The support for deferred printing was removed in v5.10-rc1-rt1 by commit +9153e3c5cb0c9 ("printk: remove deferred printing") because: + + Since printing occurs either atomically or from the printing + kthread, there is no need for any deferring or tracking possible + recursion paths. Remove all printk context tracking. + +Also, disabling interrupts in __build_all_zonelists() should produce warnings +once that code path is hit. + +Fixes: a992c387b4118 ("mm/page_alloc: fix potential deadlock on zonelist_update_seq seqlock") +Signed-off-by: Luis Claudio R. Goncalves <lgoncalv@redhat.com> +--- + mm/page_alloc.c | 16 ---------------- + 1 file changed, 16 deletions(-) + +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index c5eb7d6844ae..39d1782b398f 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -6043,21 +6043,7 @@ static void __build_all_zonelists(void *data) + int nid; + int __maybe_unused cpu; + pg_data_t *self = data; +- unsigned long flags; + +- /* +- * Explicitly disable this CPU's interrupts before taking seqlock +- * to prevent any IRQ handler from calling into the page allocator +- * (e.g. GFP_ATOMIC) that could hit zonelist_iter_begin and livelock. +- */ +- local_irq_save(flags); +- /* +- * Explicitly disable this CPU's synchronous printk() before taking +- * seqlock to prevent any printk() from trying to hold port->lock, for +- * tty_insert_flip_string_and_push_buffer() on other CPU might be +- * calling kmalloc(GFP_ATOMIC | __GFP_NOWARN) with port->lock held. +- */ +- printk_deferred_enter(); + write_seqlock(&zonelist_update_seq); + + #ifdef CONFIG_NUMA +@@ -6092,8 +6078,6 @@ static void __build_all_zonelists(void *data) + } + + write_sequnlock(&zonelist_update_seq); +- printk_deferred_exit(); +- local_irq_restore(flags); + } + + static noinline void __init +-- +2.43.0 + diff --git a/debian/patches-rt/0320-Revert-printk-declare-printk_deferred_-enter-safe-in.patch b/debian/patches-rt/0320-Revert-printk-declare-printk_deferred_-enter-safe-in.patch new file mode 100644 index 000000000..b52e5aa42 --- /dev/null +++ b/debian/patches-rt/0320-Revert-printk-declare-printk_deferred_-enter-safe-in.patch @@ -0,0 +1,53 @@ +From f59d43768ad6d22235cbc9ec14f6307867aefefe Mon Sep 17 00:00:00 2001 +From: "Luis Claudio R. Goncalves" <lgoncalv@redhat.com> +Date: Thu, 8 Jun 2023 19:47:25 -0300 +Subject: [PATCH 320/323] Revert "printk: declare + printk_deferred_{enter,safe}() in include/linux/printk.h" +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +This reverts commit 32232bcd4e5300e678718d5c29da4dfa07ade01e. + +The support for deferred printing was removed in v5.10-rc1-rt1 by commit +9153e3c5cb0c9 ("printk: remove deferred printing") because: + + Since printing occurs either atomically or from the printing + kthread, there is no need for any deferring or tracking possible + recursion paths. Remove all printk context tracking. + +Fixes: 32232bcd4e530 ("printk: declare printk_deferred_{enter,safe}() in include/linux/printk.h") +Signed-off-by: Luis Claudio R. Goncalves <lgoncalv@redhat.com> +--- + include/linux/printk.h | 19 ------------------- + 1 file changed, 19 deletions(-) + +diff --git a/include/linux/printk.h b/include/linux/printk.h +index 83c7734e9802..7e4352467d83 100644 +--- a/include/linux/printk.h ++++ b/include/linux/printk.h +@@ -609,23 +609,4 @@ static inline void print_hex_dump_debug(const char *prefix_str, int prefix_type, + #define print_hex_dump_bytes(prefix_str, prefix_type, buf, len) \ + print_hex_dump_debug(prefix_str, prefix_type, 16, 1, buf, len, true) + +-#ifdef CONFIG_PRINTK +-extern void __printk_safe_enter(void); +-extern void __printk_safe_exit(void); +-/* +- * The printk_deferred_enter/exit macros are available only as a hack for +- * some code paths that need to defer all printk console printing. Interrupts +- * must be disabled for the deferred duration. +- */ +-#define printk_deferred_enter __printk_safe_enter +-#define printk_deferred_exit __printk_safe_exit +-#else +-static inline void printk_deferred_enter(void) +-{ +-} +-static inline void printk_deferred_exit(void) +-{ +-} +-#endif +- + #endif +-- +2.43.0 + diff --git a/debian/patches-rt/0321-arm64-signal-Use-ARCH_RT_DELAYS_SIGNAL_SEND.patch b/debian/patches-rt/0321-arm64-signal-Use-ARCH_RT_DELAYS_SIGNAL_SEND.patch new file mode 100644 index 000000000..0c86c88b7 --- /dev/null +++ b/debian/patches-rt/0321-arm64-signal-Use-ARCH_RT_DELAYS_SIGNAL_SEND.patch @@ -0,0 +1,89 @@ +From 2074b2bfdbed96348cbb99a1a3920fa7a9e4df46 Mon Sep 17 00:00:00 2001 +From: Wang Yong <wang.yong12@zte.com.cn> +Date: Tue, 12 Sep 2023 15:14:28 +0800 +Subject: [PATCH 321/323] arm64: signal: Use ARCH_RT_DELAYS_SIGNAL_SEND +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +The ltp test prompts the following bug information under the 5.10 kernel: +BUG: sleeping function called from invalid context at kernel/locking/rtmutex.c:969 +in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 796, name: cat +Preemption disabled at: +[<ffffffe40f433980>] do_debug_exception+0x60/0x180 +CPU: 3 PID: 796 Comm: cat Not tainted 5.10.59-rt52-KERNEL_VERSION #38 +Hardware name: linux,dummy-virt (DT) +Call trace: + dump_backtrace+0x0/0x198 + show_stack+0x20/0x30 + dump_stack+0xf0/0x13c + ___might_sleep+0x140/0x178 + rt_spin_lock+0x30/0x90 + force_sig_info_to_task+0x30/0xe0 + force_sig_fault_to_task+0x54/0x78 + force_sig_fault+0x1c/0x28 + arm64_force_sig_fault+0x48/0x78 + send_user_sigtrap+0x4c/0x80 + brk_handler+0x3c/0x68 + do_debug_exception+0xac/0x180 + el0_dbg+0x34/0x58 + el0_sync_handler+0x50/0xb8 + el0_sync+0x180/0x1c0 + +It has been fixed by +0c34700de5e7 ("arm64: signal: Use ARCH_RT_DELAYS_SIGNAL_SEND.") in +higher versions of the kernel. This patch needs to be compatible with 5.10. +5.10 kernel does not have signal.h file, so adding signal.h file to +define ARCH_RT_DELAYS_SIGNAL_SEND. + +Link: https://lore.kernel.org/r/202309121514283793475@zte.com.cn +Signed-off-by: Wang Yong <wang.yong12@zte.com.cn> +Cc: Xuexin Jiang <jiang.xuexin@zte.com.cn> +Cc: Yang Yang <yang.yang29@zte.com.cn> +Cc: Xiaokai Ran <ran.xiaokai@zte.com.cn> +Signed-off-by: Luis Claudio R. Goncalves <lgoncalv@redhat.com> +--- + arch/arm64/include/asm/signal.h | 12 ++++++++++++ + arch/arm64/kernel/signal.c | 9 +++++++++ + 2 files changed, 21 insertions(+) + create mode 100644 arch/arm64/include/asm/signal.h + +diff --git a/arch/arm64/include/asm/signal.h b/arch/arm64/include/asm/signal.h +new file mode 100644 +index 000000000000..0fb418cf4c17 +--- /dev/null ++++ b/arch/arm64/include/asm/signal.h +@@ -0,0 +1,12 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef __ARM64_ASM_SIGNAL_H ++#define __ARM64_ASM_SIGNAL_H ++ ++#include <uapi/asm/signal.h> ++#include <uapi/asm/siginfo.h> ++ ++#if defined(CONFIG_PREEMPT_RT) ++#define ARCH_RT_DELAYS_SIGNAL_SEND ++#endif ++ ++#endif +diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c +index 94eed0dc3afc..5b08f55ec85d 100644 +--- a/arch/arm64/kernel/signal.c ++++ b/arch/arm64/kernel/signal.c +@@ -929,6 +929,15 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, + } else { + local_daif_restore(DAIF_PROCCTX); + ++#ifdef ARCH_RT_DELAYS_SIGNAL_SEND ++ if (unlikely(current->forced_info.si_signo)) { ++ struct task_struct *t = current; ++ ++ force_sig_info(&t->forced_info); ++ t->forced_info.si_signo = 0; ++ } ++#endif ++ + if (thread_flags & _TIF_UPROBE) + uprobe_notify_resume(regs); + +-- +2.43.0 + diff --git a/debian/patches-rt/0322-rt-mm-page_alloc-backport-missing-bits-from-__build_.patch b/debian/patches-rt/0322-rt-mm-page_alloc-backport-missing-bits-from-__build_.patch new file mode 100644 index 000000000..8c13d7c63 --- /dev/null +++ b/debian/patches-rt/0322-rt-mm-page_alloc-backport-missing-bits-from-__build_.patch @@ -0,0 +1,49 @@ +From dbe86f5017c79c8abb6f42a6f0f1ad8d97dbae46 Mon Sep 17 00:00:00 2001 +From: "Luis Claudio R. Goncalves" <lgoncalv@redhat.com> +Date: Wed, 20 Dec 2023 10:20:48 -0300 +Subject: [PATCH 322/323] rt: mm/page_alloc: backport missing bits from + __build_all_zonelists() fix +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +A while ago upstream landed commit a2ebb51575828 ("mm/page_alloc: use +write_seqlock_irqsave() instead write_seqlock() + local_irq_save().") +to fix a problem that had already been worked on v5.10-rt via commit +7bdd3bd5143a4 ("Revert "mm/page_alloc: fix potential deadlock on +zonelist_update_seqseqlock""). Sebastian pointed out it was important +to backport the missing elements of a2ebb51575828 for code consistency. + +Fixes: 7bdd3bd5143a4 ("Revert "mm/page_alloc: fix potential deadlock on zonelist_update_seqseqlock"") +Cc: stable-rt@vger.kernel.org +Suggested-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Signed-off-by: Luis Claudio R. Goncalves <lgoncalv@redhat.com> +--- + mm/page_alloc.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index 39d1782b398f..cd1e8d0b2269 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -6043,8 +6043,9 @@ static void __build_all_zonelists(void *data) + int nid; + int __maybe_unused cpu; + pg_data_t *self = data; ++ unsigned long flags; + +- write_seqlock(&zonelist_update_seq); ++ write_seqlock_irqsave(&zonelist_update_seq, flags); + + #ifdef CONFIG_NUMA + memset(node_load, 0, sizeof(node_load)); +@@ -6077,7 +6078,7 @@ static void __build_all_zonelists(void *data) + #endif + } + +- write_sequnlock(&zonelist_update_seq); ++ write_sequnlock_irqrestore(&zonelist_update_seq, flags); + } + + static noinline void __init +-- +2.43.0 + diff --git a/debian/patches-rt/0323-Linux-5.10.204-rt100-REBASE.patch b/debian/patches-rt/0323-Linux-5.10.204-rt100-REBASE.patch new file mode 100644 index 000000000..3173940fe --- /dev/null +++ b/debian/patches-rt/0323-Linux-5.10.204-rt100-REBASE.patch @@ -0,0 +1,21 @@ +From 0a50987f60b76f392050410e35609fb4361dcca7 Mon Sep 17 00:00:00 2001 +From: "Luis Claudio R. Goncalves" <lgoncalv@redhat.com> +Date: Fri, 22 Dec 2023 17:30:57 -0300 +Subject: [PATCH 323/323] Linux 5.10.204-rt100 REBASE +Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/5.10/older/patches-5.10.204-rt100.tar.xz + +Signed-off-by: Luis Claudio R. Goncalves <lgoncalv@redhat.com> +--- + localversion-rt | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/localversion-rt b/localversion-rt +index 21988f9ad53f..79d3e2b9f550 100644 +--- a/localversion-rt ++++ b/localversion-rt +@@ -1 +1 @@ +--rt34 ++-rt100 +-- +2.43.0 + diff --git a/debian/patches-rt/series b/debian/patches-rt/series new file mode 100644 index 000000000..70c05ce6b --- /dev/null +++ b/debian/patches-rt/series @@ -0,0 +1,323 @@ +0001-z3fold-remove-preempt-disabled-sections-for-RT.patch +0002-stop_machine-Add-function-and-caller-debug-info.patch +0003-sched-Fix-balance_callback.patch +0004-sched-hotplug-Ensure-only-per-cpu-kthreads-run-durin.patch +0005-sched-core-Wait-for-tasks-being-pushed-away-on-hotpl.patch +0006-workqueue-Manually-break-affinity-on-hotplug.patch +0007-sched-hotplug-Consolidate-task-migration-on-CPU-unpl.patch +0008-sched-Fix-hotplug-vs-CPU-bandwidth-control.patch +0009-sched-Massage-set_cpus_allowed.patch +0010-sched-Add-migrate_disable.patch +0011-sched-Fix-migrate_disable-vs-set_cpus_allowed_ptr.patch +0012-sched-core-Make-migrate-disable-and-CPU-hotplug-coop.patch +0013-sched-rt-Use-cpumask_any-_distribute.patch +0014-sched-rt-Use-the-full-cpumask-for-balancing.patch +0015-sched-lockdep-Annotate-pi_lock-recursion.patch +0016-sched-Fix-migrate_disable-vs-rt-dl-balancing.patch +0017-sched-proc-Print-accurate-cpumask-vs-migrate_disable.patch +0018-sched-Add-migrate_disable-tracepoints.patch +0019-sched-Deny-self-issued-__set_cpus_allowed_ptr-when-m.patch +0020-sched-Comment-affine_move_task.patch +0021-sched-Unlock-the-rq-in-affine_move_task-error-path.patch +0022-sched-Fix-migration_cpu_stop-WARN.patch +0023-sched-core-Add-missing-completion-for-affine_move_ta.patch +0024-mm-highmem-Un-EXPORT-__kmap_atomic_idx.patch +0025-highmem-Remove-unused-functions.patch +0026-fs-Remove-asm-kmap_types.h-includes.patch +0027-sh-highmem-Remove-all-traces-of-unused-cruft.patch +0028-asm-generic-Provide-kmap_size.h.patch +0029-highmem-Provide-generic-variant-of-kmap_atomic.patch +0030-highmem-Make-DEBUG_HIGHMEM-functional.patch +0031-x86-mm-highmem-Use-generic-kmap-atomic-implementatio.patch +0032-arc-mm-highmem-Use-generic-kmap-atomic-implementatio.patch +0033-ARM-highmem-Switch-to-generic-kmap-atomic.patch +0034-csky-mm-highmem-Switch-to-generic-kmap-atomic.patch +0035-microblaze-mm-highmem-Switch-to-generic-kmap-atomic.patch +0036-mips-mm-highmem-Switch-to-generic-kmap-atomic.patch +0037-nds32-mm-highmem-Switch-to-generic-kmap-atomic.patch +0038-powerpc-mm-highmem-Switch-to-generic-kmap-atomic.patch +0039-sparc-mm-highmem-Switch-to-generic-kmap-atomic.patch +0040-xtensa-mm-highmem-Switch-to-generic-kmap-atomic.patch +0041-highmem-Get-rid-of-kmap_types.h.patch +0042-mm-highmem-Remove-the-old-kmap_atomic-cruft.patch +0043-io-mapping-Cleanup-atomic-iomap.patch +0044-Documentation-io-mapping-Remove-outdated-blurb.patch +0045-highmem-High-implementation-details-and-document-API.patch +0046-sched-Make-migrate_disable-enable-independent-of-RT.patch +0047-sched-highmem-Store-local-kmaps-in-task-struct.patch +0048-mm-highmem-Provide-kmap_local.patch +0049-io-mapping-Provide-iomap_local-variant.patch +0050-x86-crashdump-32-Simplify-copy_oldmem_page.patch +0051-mips-crashdump-Simplify-copy_oldmem_page.patch +0052-ARM-mm-Replace-kmap_atomic_pfn.patch +0053-highmem-Remove-kmap_atomic_pfn.patch +0054-drm-ttm-Replace-kmap_atomic-usage.patch +0055-drm-vmgfx-Replace-kmap_atomic.patch +0056-highmem-Remove-kmap_atomic_prot.patch +0057-drm-qxl-Replace-io_mapping_map_atomic_wc.patch +0058-drm-nouveau-device-Replace-io_mapping_map_atomic_wc.patch +0059-drm-i915-Replace-io_mapping_map_atomic_wc.patch +0060-io-mapping-Remove-io_mapping_map_atomic_wc.patch +0061-mm-highmem-Take-kmap_high_get-properly-into-account.patch +0062-highmem-Don-t-disable-preemption-on-RT-in-kmap_atomi.patch +0063-blk-mq-Don-t-complete-on-a-remote-CPU-in-force-threa.patch +0064-blk-mq-Always-complete-remote-completions-requests-i.patch +0065-blk-mq-Use-llist_head-for-blk_cpu_done.patch +0066-lib-test_lockup-Minimum-fix-to-get-it-compiled-on-PR.patch +0067-timers-Don-t-block-on-expiry_lock-for-TIMER_IRQSAFE.patch +0068-kthread-Move-prio-affinite-change-into-the-newly-cre.patch +0069-genirq-Move-prio-assignment-into-the-newly-created-t.patch +0070-notifier-Make-atomic_notifiers-use-raw_spinlock.patch +0071-rcu-Make-RCU_BOOST-default-on-CONFIG_PREEMPT_RT.patch +0072-rcu-Unconditionally-use-rcuc-threads-on-PREEMPT_RT.patch +0073-rcu-Enable-rcu_normal_after_boot-unconditionally-for.patch +0074-doc-Update-RCU-s-requirements-page-about-the-PREEMPT.patch +0075-doc-Use-CONFIG_PREEMPTION.patch +0076-tracing-Merge-irqflags-preempt-counter.patch +0077-tracing-Inline-tracing_gen_ctx_flags.patch +0078-tracing-Use-in_serving_softirq-to-deduct-softirq-sta.patch +0079-tracing-Remove-NULL-check-from-current-in-tracing_ge.patch +0080-printk-inline-log_output-log_store-in-vprintk_store.patch +0081-printk-remove-logbuf_lock-writer-protection-of-ringb.patch +0082-printk-limit-second-loop-of-syslog_print_all.patch +0083-printk-kmsg_dump-remove-unused-fields.patch +0084-printk-refactor-kmsg_dump_get_buffer.patch +0085-printk-consolidate-kmsg_dump_get_buffer-syslog_print.patch +0086-printk-introduce-CONSOLE_LOG_MAX-for-improved-multi-.patch +0087-printk-use-seqcount_latch-for-clear_seq.patch +0088-printk-use-atomic64_t-for-devkmsg_user.seq.patch +0089-printk-add-syslog_lock.patch +0090-printk-introduce-a-kmsg_dump-iterator.patch +0091-um-synchronize-kmsg_dumper.patch +0092-printk-remove-logbuf_lock.patch +0093-printk-kmsg_dump-remove-_nolock-variants.patch +0094-printk-kmsg_dump-use-kmsg_dump_rewind.patch +0095-printk-console-remove-unnecessary-safe-buffer-usage.patch +0096-printk-track-limit-recursion.patch +0097-printk-remove-safe-buffers.patch +0098-printk-convert-syslog_lock-to-spin_lock.patch +0099-console-add-write_atomic-interface.patch +0100-serial-8250-implement-write_atomic.patch +0101-printk-relocate-printk_delay-and-vprintk_default.patch +0102-printk-combine-boot_delay_msec-into-printk_delay.patch +0103-printk-change-console_seq-to-atomic64_t.patch +0104-printk-introduce-kernel-sync-mode.patch +0105-printk-move-console-printing-to-kthreads.patch +0106-printk-remove-deferred-printing.patch +0107-printk-add-console-handover.patch +0108-printk-add-pr_flush.patch +0109-cgroup-use-irqsave-in-cgroup_rstat_flush_locked.patch +0110-mm-workingset-replace-IRQ-off-check-with-a-lockdep-a.patch +0111-tpm-remove-tpm_dev_wq_lock.patch +0112-shmem-Use-raw_spinlock_t-for-stat_lock.patch +0113-net-Move-lockdep-where-it-belongs.patch +0114-parisc-Remove-bogus-__IRQ_STAT-macro.patch +0115-sh-Get-rid-of-nmi_count.patch +0116-irqstat-Get-rid-of-nmi_count-and-__IRQ_STAT.patch +0117-um-irqstat-Get-rid-of-the-duplicated-declarations.patch +0118-ARM-irqstat-Get-rid-of-duplicated-declaration.patch +0119-arm64-irqstat-Get-rid-of-duplicated-declaration.patch +0120-asm-generic-irqstat-Add-optional-__nmi_count-member.patch +0121-sh-irqstat-Use-the-generic-irq_cpustat_t.patch +0122-irqstat-Move-declaration-into-asm-generic-hardirq.h.patch +0123-preempt-Cleanup-the-macro-maze-a-bit.patch +0124-softirq-Move-related-code-into-one-section.patch +0125-sh-irq-Add-missing-closing-parentheses-in-arch_show_.patch +0126-sched-cputime-Remove-symbol-exports-from-IRQ-time-ac.patch +0127-s390-vtime-Use-the-generic-IRQ-entry-accounting.patch +0128-sched-vtime-Consolidate-IRQ-time-accounting.patch +0129-irqtime-Move-irqtime-entry-accounting-after-irq-offs.patch +0130-irq-Call-tick_irq_enter-inside-HARDIRQ_OFFSET.patch +0131-smp-Wake-ksoftirqd-on-PREEMPT_RT-instead-do_softirq.patch +0132-tasklets-Replace-barrier-with-cpu_relax-in-tasklet_u.patch +0133-tasklets-Use-static-inlines-for-stub-implementations.patch +0134-tasklets-Provide-tasklet_disable_in_atomic.patch +0135-tasklets-Use-spin-wait-in-tasklet_disable-temporaril.patch +0136-tasklets-Replace-spin-wait-in-tasklet_unlock_wait.patch +0137-tasklets-Replace-spin-wait-in-tasklet_kill.patch +0138-tasklets-Prevent-tasklet_unlock_spin_wait-deadlock-o.patch +0139-net-jme-Replace-link-change-tasklet-with-work.patch +0140-net-sundance-Use-tasklet_disable_in_atomic.patch +0141-ath9k-Use-tasklet_disable_in_atomic.patch +0142-atm-eni-Use-tasklet_disable_in_atomic-in-the-send-ca.patch +0143-PCI-hv-Use-tasklet_disable_in_atomic.patch +0144-firewire-ohci-Use-tasklet_disable_in_atomic-where-re.patch +0145-tasklets-Switch-tasklet_disable-to-the-sleep-wait-va.patch +0146-softirq-Add-RT-specific-softirq-accounting.patch +0147-irqtime-Make-accounting-correct-on-RT.patch +0148-softirq-Move-various-protections-into-inline-helpers.patch +0149-softirq-Make-softirq-control-and-processing-RT-aware.patch +0150-tick-sched-Prevent-false-positive-softirq-pending-wa.patch +0151-rcu-Prevent-false-positive-softirq-warning-on-RT.patch +0152-chelsio-cxgb-Replace-the-workqueue-with-threaded-int.patch +0153-chelsio-cxgb-Disable-the-card-on-error-in-threaded-i.patch +0154-x86-fpu-Simplify-fpregs_-un-lock.patch +0155-x86-fpu-Make-kernel-FPU-protection-RT-friendly.patch +0156-locking-rtmutex-Remove-cruft.patch +0157-locking-rtmutex-Remove-output-from-deadlock-detector.patch +0158-locking-rtmutex-Move-rt_mutex_init-outside-of-CONFIG.patch +0159-locking-rtmutex-Remove-rt_mutex_timed_lock.patch +0160-locking-rtmutex-Handle-the-various-new-futex-race-co.patch +0161-futex-Fix-bug-on-when-a-requeued-RT-task-times-out.patch +0162-locking-rtmutex-Make-lock_killable-work.patch +0163-locking-spinlock-Split-the-lock-types-header.patch +0164-locking-rtmutex-Avoid-include-hell.patch +0165-lockdep-Reduce-header-files-in-debug_locks.h.patch +0166-locking-split-out-the-rbtree-definition.patch +0167-locking-rtmutex-Provide-rt_mutex_slowlock_locked.patch +0168-locking-rtmutex-export-lockdep-less-version-of-rt_mu.patch +0169-sched-Add-saved_state-for-tasks-blocked-on-sleeping-.patch +0170-locking-rtmutex-add-sleeping-lock-implementation.patch +0171-locking-rtmutex-Allow-rt_mutex_trylock-on-PREEMPT_RT.patch +0172-locking-rtmutex-add-mutex-implementation-based-on-rt.patch +0173-locking-rtmutex-add-rwsem-implementation-based-on-rt.patch +0174-locking-rtmutex-add-rwlock-implementation-based-on-r.patch +0175-locking-rtmutex-wire-up-RT-s-locking.patch +0176-locking-rtmutex-add-ww_mutex-addon-for-mutex-rt.patch +0177-locking-rtmutex-Use-custom-scheduling-function-for-s.patch +0178-signal-Revert-ptrace-preempt-magic.patch +0179-preempt-Provide-preempt_-_-no-rt-variants.patch +0180-mm-vmstat-Protect-per-cpu-variables-with-preempt-dis.patch +0181-mm-memcontrol-Disable-preemption-in-__mod_memcg_lruv.patch +0182-xfrm-Use-sequence-counter-with-associated-spinlock.patch +0183-u64_stats-Disable-preemption-on-32bit-UP-SMP-with-RT.patch +0184-fs-dcache-use-swait_queue-instead-of-waitqueue.patch +0185-fs-dcache-disable-preemption-on-i_dir_seq-s-write-si.patch +0186-net-Qdisc-use-a-seqlock-instead-seqcount.patch +0187-net-Properly-annotate-the-try-lock-for-the-seqlock.patch +0188-kconfig-Disable-config-options-which-are-not-RT-comp.patch +0189-mm-Allow-only-SLUB-on-RT.patch +0190-sched-Disable-CONFIG_RT_GROUP_SCHED-on-RT.patch +0191-net-core-disable-NET_RX_BUSY_POLL-on-RT.patch +0192-efi-Disable-runtime-services-on-RT.patch +0193-efi-Allow-efi-runtime.patch +0194-rt-Add-local-irq-locks.patch +0195-signal-x86-Delay-calling-signals-in-atomic.patch +0196-Split-IRQ-off-and-zone-lock-while-freeing-pages-from.patch +0197-Split-IRQ-off-and-zone-lock-while-freeing-pages-from.patch +0198-mm-SLxB-change-list_lock-to-raw_spinlock_t.patch +0199-mm-SLUB-delay-giving-back-empty-slubs-to-IRQ-enabled.patch +0200-mm-slub-Always-flush-the-delayed-empty-slubs-in-flus.patch +0201-mm-slub-Don-t-resize-the-location-tracking-cache-on-.patch +0202-mm-page_alloc-Use-migrate_disable-in-drain_local_pag.patch +0203-mm-page_alloc-rt-friendly-per-cpu-pages.patch +0204-mm-slub-Make-object_map_lock-a-raw_spinlock_t.patch +0205-slub-Enable-irqs-for-__GFP_WAIT.patch +0206-slub-Disable-SLUB_CPU_PARTIAL.patch +0207-mm-memcontrol-Provide-a-local_lock-for-per-CPU-memcg.patch +0208-mm-memcontrol-Don-t-call-schedule_work_on-in-preempt.patch +0209-mm-memcontrol-Replace-local_irq_disable-with-local-l.patch +0210-mm-zsmalloc-copy-with-get_cpu_var-and-locking.patch +0211-mm-zswap-Use-local-lock-to-protect-per-CPU-data.patch +0212-x86-kvm-Require-const-tsc-for-RT.patch +0213-wait.h-include-atomic.h.patch +0214-sched-Limit-the-number-of-task-migrations-per-batch.patch +0215-sched-Move-mmdrop-to-RCU-on-RT.patch +0216-kernel-sched-move-stack-kprobe-clean-up-to-__put_tas.patch +0217-sched-Do-not-account-rcu_preempt_depth-on-RT-in-migh.patch +0218-sched-Disable-TTWU_QUEUE-on-RT.patch +0219-softirq-Check-preemption-after-reenabling-interrupts.patch +0220-softirq-Disable-softirq-stacks-for-RT.patch +0221-net-core-use-local_bh_disable-in-netif_rx_ni.patch +0222-pid.h-include-atomic.h.patch +0223-ptrace-fix-ptrace-vs-tasklist_lock-race.patch +0224-ptrace-fix-ptrace_unfreeze_traced-race-with-rt-lock.patch +0225-kernel-sched-add-put-get-_cpu_light.patch +0226-trace-Add-migrate-disabled-counter-to-tracing-output.patch +0227-locking-don-t-check-for-__LINUX_SPINLOCK_TYPES_H-on-.patch +0228-locking-Make-spinlock_t-and-rwlock_t-a-RCU-section-o.patch +0229-mm-vmalloc-Another-preempt-disable-region-which-suck.patch +0230-block-mq-do-not-invoke-preempt_disable.patch +0231-md-raid5-Make-raid5_percpu-handling-RT-aware.patch +0232-scsi-fcoe-Make-RT-aware.patch +0233-sunrpc-Make-svc_xprt_do_enqueue-use-get_cpu_light.patch +0234-rt-Introduce-cpu_chill.patch +0235-fs-namespace-Use-cpu_chill-in-trylock-loops.patch +0236-net-Use-skbufhead-with-raw-lock.patch +0237-net-Dequeue-in-dev_cpu_dead-without-the-lock.patch +0238-net-dev-always-take-qdisc-s-busylock-in-__dev_xmit_s.patch +0239-irqwork-push-most-work-into-softirq-context.patch +0240-x86-crypto-Reduce-preempt-disabled-regions.patch +0241-crypto-Reduce-preempt-disabled-regions-more-algos.patch +0242-crypto-limit-more-FPU-enabled-sections.patch +0243-panic-skip-get_random_bytes-for-RT_FULL-in-init_oops.patch +0244-x86-stackprotector-Avoid-random-pool-on-rt.patch +0245-net-Remove-preemption-disabling-in-netif_rx.patch +0246-lockdep-Make-it-RT-aware.patch +0247-lockdep-selftest-Only-do-hardirq-context-test-for-ra.patch +0248-lockdep-selftest-fix-warnings-due-to-missing-PREEMPT.patch +0249-lockdep-disable-self-test.patch +0250-drm-radeon-i915-Use-preempt_disable-enable_rt-where-.patch +0251-drm-i915-Don-t-disable-interrupts-on-PREEMPT_RT-duri.patch +0252-drm-i915-disable-tracing-on-RT.patch +0253-drm-i915-skip-DRM_I915_LOW_LEVEL_TRACEPOINTS-with-NO.patch +0254-drm-i915-gt-Only-disable-interrupts-for-the-timeline.patch +0255-cpuset-Convert-callback_lock-to-raw_spinlock_t.patch +0256-x86-Allow-to-enable-RT.patch +0257-mm-scatterlist-Do-not-disable-irqs-on-RT.patch +0258-sched-Add-support-for-lazy-preemption.patch +0259-x86-entry-Use-should_resched-in-idtentry_exit_cond_r.patch +0260-x86-Support-for-lazy-preemption.patch +0261-arm-Add-support-for-lazy-preemption.patch +0262-powerpc-Add-support-for-lazy-preemption.patch +0263-arch-arm64-Add-lazy-preempt-support.patch +0264-jump-label-disable-if-stop_machine-is-used.patch +0265-leds-trigger-disable-CPU-trigger-on-RT.patch +0266-tty-serial-omap-Make-the-locking-RT-aware.patch +0267-tty-serial-pl011-Make-the-locking-work-on-RT.patch +0268-ARM-enable-irq-in-translation-section-permission-fau.patch +0269-genirq-update-irq_set_irqchip_state-documentation.patch +0270-KVM-arm-arm64-downgrade-preempt_disable-d-region-to-.patch +0271-arm64-fpsimd-Delay-freeing-memory-in-fpsimd_flush_th.patch +0272-x86-Enable-RT-also-on-32bit.patch +0273-ARM-Allow-to-enable-RT.patch +0274-ARM64-Allow-to-enable-RT.patch +0275-powerpc-traps-Use-PREEMPT_RT.patch +0276-powerpc-pseries-iommu-Use-a-locallock-instead-local_.patch +0277-powerpc-kvm-Disable-in-kernel-MPIC-emulation-for-PRE.patch +0278-powerpc-stackprotector-work-around-stack-guard-init-.patch +0279-powerpc-Avoid-recursive-header-includes.patch +0280-POWERPC-Allow-to-enable-RT.patch +0281-drivers-block-zram-Replace-bit-spinlocks-with-rtmute.patch +0282-tpm_tis-fix-stall-after-iowrite-s.patch +0283-signals-Allow-rt-tasks-to-cache-one-sigqueue-struct.patch +0284-signal-Prevent-double-free-of-user-struct.patch +0285-genirq-Disable-irqpoll-on-rt.patch +0286-sysfs-Add-sys-kernel-realtime-entry.patch +0287-Add-localversion-for-RT-release.patch +0288-net-xfrm-Use-sequence-counter-with-associated-spinlo.patch +0289-sched-Fix-migration_cpu_stop-requeueing.patch +0290-sched-Simplify-migration_cpu_stop.patch +0291-sched-Collate-affine_move_task-stoppers.patch +0292-sched-Optimize-migration_cpu_stop.patch +0293-sched-Fix-affine_move_task-self-concurrency.patch +0294-sched-Simplify-set_affinity_pending-refcounts.patch +0295-sched-Don-t-defer-CPU-pick-to-migration_cpu_stop.patch +0296-printk-Enhance-the-condition-check-of-msleep-in-pr_f.patch +0297-locking-rwsem-rt-Remove-might_sleep-in-__up_read.patch +0298-mm-zsmalloc-Convert-zsmalloc_handle.lock-to-spinlock.patch +0299-sched-Fix-get_push_task-vs-migrate_disable.patch +0300-sched-Switch-wait_task_inactive-to-HRTIMER_MODE_REL_.patch +0301-preempt-Move-preempt_enable_no_resched-to-the-RT-blo.patch +0302-mm-Disable-NUMA_BALANCING_DEFAULT_ENABLED-and-TRANSP.patch +0303-fscache-Use-only-one-fscache_object_cong_wait.patch +0304-fscache-Use-only-one-fscache_object_cong_wait.patch +0305-locking-Drop-might_resched-from-might_sleep_no_state.patch +0306-drm-i915-gt-Queue-and-wait-for-the-irq_work-item.patch +0307-irq_work-Allow-irq_work_sync-to-sleep-if-irq_work-no.patch +0308-irq_work-Handle-some-irq_work-in-a-per-CPU-thread-on.patch +0309-irq_work-Also-rcuwait-for-IRQ_WORK_HARD_IRQ-on-PREEM.patch +0310-eventfd-Make-signal-recursion-protection-a-task-bit.patch +0311-stop_machine-Remove-this_cpu_ptr-from-print_stop_inf.patch +0312-aio-Fix-incorrect-usage-of-eventfd_signal_allowed.patch +0313-rt-remove-extra-parameter-from-__trace_stack.patch +0314-locking-rtmutex-switch-to-EXPORT_SYMBOL-for-ww_mutex.patch +0315-ftrace-Fix-improper-usage-of-__trace_stack-function.patch +0316-rt-arm64-make-_TIF_WORK_MASK-bits-contiguous.patch +0317-printk-ignore-consoles-without-write-callback.patch +0318-kernel-fork-set-wake_q_sleeper.next-NULL-again-in-du.patch +0319-Revert-mm-page_alloc-fix-potential-deadlock-on-zonel.patch +0320-Revert-printk-declare-printk_deferred_-enter-safe-in.patch +0321-arm64-signal-Use-ARCH_RT_DELAYS_SIGNAL_SEND.patch +0322-rt-mm-page_alloc-backport-missing-bits-from-__build_.patch +0323-Linux-5.10.204-rt100-REBASE.patch |