Diffstat (limited to 'debian/patches-rt/0002-net-Allow-to-use-SMP-threads-for-backlog-NAPI.patch')
-rw-r--r-- | debian/patches-rt/0002-net-Allow-to-use-SMP-threads-for-backlog-NAPI.patch | 330
1 file changed, 330 insertions, 0 deletions
diff --git a/debian/patches-rt/0002-net-Allow-to-use-SMP-threads-for-backlog-NAPI.patch b/debian/patches-rt/0002-net-Allow-to-use-SMP-threads-for-backlog-NAPI.patch
new file mode 100644
index 0000000000..acdfbddf5a
--- /dev/null
+++ b/debian/patches-rt/0002-net-Allow-to-use-SMP-threads-for-backlog-NAPI.patch
@@ -0,0 +1,330 @@
+From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+Date: Sat, 9 Mar 2024 10:05:10 +0100
+Subject: [PATCH 2/4] net: Allow to use SMP threads for backlog NAPI.
+Origin: https://www.kernel.org/pub/linux/kernel/projects/rt/6.8/older/patches-6.8.2-rt11.tar.xz
+
+Backlog NAPI is a per-CPU NAPI struct only (with no device behind it)
+used by drivers which don't do NAPI themselves, by RPS, and by parts of
+the stack which need to avoid recursive deadlocks while processing a
+packet.
+
+Non-NAPI drivers use the CPU-local backlog NAPI. If RPS is enabled, a
+flow is computed for the skb and, based on the flow, the skb can be
+enqueued on a remote CPU. Scheduling/raising the softirq (for backlog's
+NAPI) on the remote CPU isn't trivial because the softirq is only
+scheduled on the local CPU and performed after the hardirq is done.
+In order to schedule a softirq on the remote CPU, an IPI is sent to the
+remote CPU which schedules the backlog NAPI on the then-local CPU.
+
+On PREEMPT_RT, interrupts are force-threaded. The soft interrupts are
+raised within the interrupt thread and processed after the interrupt
+handler has completed, still within the context of the interrupt
+thread. The softirq is thus handled in the context where it originated.
+
+With force-threaded interrupts enabled, ksoftirqd is woken up if a
+softirq is raised from hardirq context. This is the case if it is
+raised from an IPI. Additionally there is a warning on PREEMPT_RT if
+the softirq is raised from the idle thread.
+This was done for two reasons:
+- With threaded interrupts the processing should happen in thread
+  context (where it originated) and ksoftirqd is the only thread for
+  this context if raised from hardirq. Using the currently running task
+  instead would "punish" a random task.
+- Once ksoftirqd is active it consumes all further softirqs until it
+  stops running. This changed recently and is no longer the case.
+
+Instead of keeping the backlog NAPI in ksoftirqd (in force-threaded/
+PREEMPT_RT setups) I am proposing NAPI threads for backlog.
+The "proper" setup with threaded NAPI is not doable because the threads
+are not pinned to an individual CPU and can be modified by the user.
+Additionally a dummy network device would have to be assigned. Also
+CPU-hotplug has to be considered if additional CPUs show up.
+All this can probably be done/solved, but the smpboot threads already
+provide this infrastructure.
+
+Sending UDP packets over loopback expects that the packet is processed
+within the call. Delaying it by handing it over to the thread hurts
+performance. It makes no difference to the outcome whether the context
+switch happens immediately after enqueue or after a while, to process a
+few packets in a batch.
+There is no need to always use the thread if the backlog NAPI is
+requested on the local CPU. This restores the loopback throughput. The
+performance drops to mostly the same value after enabling RPS on the
+loopback, comparing the IPI and the thread results.
+
+Create NAPI threads for backlog if requested during boot. The thread
+runs the inner loop from napi_threaded_poll(); the wait part is
+different. It checks for NAPI_STATE_SCHED (the backlog NAPI can not be
+disabled).
+
+The NAPI threads for backlog are optional; they have to be enabled via
+the boot argument "thread_backlog_napi". It is mandatory for PREEMPT_RT
+to avoid the wakeup of ksoftirqd from the IPI.
+
+Acked-by: Jakub Kicinski <kuba@kernel.org>
+Link: https://lore.kernel.org/r/20240309090824.2956805-3-bigeasy@linutronix.de
+Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+---
+ net/core/dev.c | 152 +++++++++++++++++++++++++++++++++++++++++++--------------
+ 1 file changed, 115 insertions(+), 37 deletions(-)
+
+--- a/net/core/dev.c
++++ b/net/core/dev.c
+@@ -78,6 +78,7 @@
+ #include <linux/slab.h>
+ #include <linux/sched.h>
+ #include <linux/sched/mm.h>
++#include <linux/smpboot.h>
+ #include <linux/mutex.h>
+ #include <linux/rwsem.h>
+ #include <linux/string.h>
+@@ -216,6 +217,31 @@ static inline struct hlist_head *dev_ind
+         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
+ }
+
++#ifndef CONFIG_PREEMPT_RT
++
++static DEFINE_STATIC_KEY_FALSE(use_backlog_threads_key);
++
++static int __init setup_backlog_napi_threads(char *arg)
++{
++        static_branch_enable(&use_backlog_threads_key);
++        return 0;
++}
++early_param("thread_backlog_napi", setup_backlog_napi_threads);
++
++static bool use_backlog_threads(void)
++{
++        return static_branch_unlikely(&use_backlog_threads_key);
++}
++
++#else
++
++static bool use_backlog_threads(void)
++{
++        return true;
++}
++
++#endif
++
+ static inline void rps_lock_irqsave(struct softnet_data *sd,
+                                     unsigned long *flags)
+ {
+@@ -4420,6 +4446,7 @@ EXPORT_SYMBOL(__dev_direct_xmit);
+ /*************************************************************************
+  *                      Receiver routines
+  *************************************************************************/
++static DEFINE_PER_CPU(struct task_struct *, backlog_napi);
+
+ int netdev_max_backlog __read_mostly = 1000;
+ EXPORT_SYMBOL(netdev_max_backlog);
+@@ -4452,12 +4479,16 @@ static inline void ____napi_schedule(str
+          */
+         thread = READ_ONCE(napi->thread);
+         if (thread) {
++                if (use_backlog_threads() && thread == raw_cpu_read(backlog_napi))
++                        goto use_local_napi;
++
+                 set_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
+                 wake_up_process(thread);
+                 return;
+         }
+         }
+
++use_local_napi:
+         list_add_tail(&napi->poll_list, &sd->poll_list);
+         WRITE_ONCE(napi->list_owner, smp_processor_id());
+         /* If not called from net_rx_action()
+@@ -4703,6 +4734,11 @@ static void napi_schedule_rps(struct sof
+
+ #ifdef CONFIG_RPS
+         if (sd != mysd) {
++                if (use_backlog_threads()) {
++                        __napi_schedule_irqoff(&sd->backlog);
++                        return;
++                }
++
+                 sd->rps_ipi_next = mysd->rps_ipi_list;
+                 mysd->rps_ipi_list = sd;
+
+@@ -5926,7 +5962,7 @@ static void net_rps_action_and_irq_enabl
+ #ifdef CONFIG_RPS
+         struct softnet_data *remsd = sd->rps_ipi_list;
+
+-        if (remsd) {
++        if (!use_backlog_threads() && remsd) {
+                 sd->rps_ipi_list = NULL;
+
+                 local_irq_enable();
+@@ -5941,7 +5977,7 @@ static void net_rps_action_and_irq_enabl
+ static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
+ {
+ #ifdef CONFIG_RPS
+-        return sd->rps_ipi_list != NULL;
++        return !use_backlog_threads() && sd->rps_ipi_list;
+ #else
+         return false;
+ #endif
+@@ -5985,7 +6021,7 @@ static int process_backlog(struct napi_s
+                          * We can use a plain write instead of clear_bit(),
+                          * and we dont need an smp_mb() memory barrier.
+                          */
+-                        napi->state = 0;
++                        napi->state &= NAPIF_STATE_THREADED;
+                         again = false;
+                 } else {
+                         skb_queue_splice_tail_init(&sd->input_pkt_queue,
+@@ -6691,43 +6727,48 @@ static void skb_defer_free_flush(struct
+         }
+ }
+
+-static int napi_threaded_poll(void *data)
++static void napi_threaded_poll_loop(struct napi_struct *napi)
+ {
+-        struct napi_struct *napi = data;
+         struct softnet_data *sd;
+-        void *have;
++        unsigned long last_qs = jiffies;
+
+-        while (!napi_thread_wait(napi)) {
+-                unsigned long last_qs = jiffies;
++        for (;;) {
++                bool repoll = false;
++                void *have;
+
+-                for (;;) {
+-                        bool repoll = false;
++                local_bh_disable();
++                sd = this_cpu_ptr(&softnet_data);
++                sd->in_napi_threaded_poll = true;
+
+-                        local_bh_disable();
+-                        sd = this_cpu_ptr(&softnet_data);
+-                        sd->in_napi_threaded_poll = true;
+-
+-                        have = netpoll_poll_lock(napi);
+-                        __napi_poll(napi, &repoll);
+-                        netpoll_poll_unlock(have);
+-
+-                        sd->in_napi_threaded_poll = false;
+-                        barrier();
+-
+-                        if (sd_has_rps_ipi_waiting(sd)) {
+-                                local_irq_disable();
+-                                net_rps_action_and_irq_enable(sd);
+-                        }
+-                        skb_defer_free_flush(sd);
+-                        local_bh_enable();
++                have = netpoll_poll_lock(napi);
++                __napi_poll(napi, &repoll);
++                netpoll_poll_unlock(have);
++
++                sd->in_napi_threaded_poll = false;
++                barrier();
++
++                if (sd_has_rps_ipi_waiting(sd)) {
++                        local_irq_disable();
++                        net_rps_action_and_irq_enable(sd);
++                }
++                skb_defer_free_flush(sd);
++                local_bh_enable();
+
+-                        if (!repoll)
+-                                break;
++                if (!repoll)
++                        break;
+
+-                        rcu_softirq_qs_periodic(last_qs);
+-                        cond_resched();
+-                }
++                rcu_softirq_qs_periodic(last_qs);
++                cond_resched();
+         }
++}
++
++static int napi_threaded_poll(void *data)
++{
++        struct napi_struct *napi = data;
++
++        while (!napi_thread_wait(napi))
++                napi_threaded_poll_loop(napi);
++
+         return 0;
+ }
+
+@@ -11326,7 +11367,7 @@ static int dev_cpu_dead(unsigned int old
+
+                 list_del_init(&napi->poll_list);
+                 if (napi->poll == process_backlog)
+-                        napi->state = 0;
++                        napi->state &= NAPIF_STATE_THREADED;
+                 else
+                         ____napi_schedule(sd, napi);
+         }
+@@ -11334,12 +11375,14 @@ static int dev_cpu_dead(unsigned int old
+         raise_softirq_irqoff(NET_TX_SOFTIRQ);
+         local_irq_enable();
+
++        if (!use_backlog_threads()) {
+ #ifdef CONFIG_RPS
+-        remsd = oldsd->rps_ipi_list;
+-        oldsd->rps_ipi_list = NULL;
++                remsd = oldsd->rps_ipi_list;
++                oldsd->rps_ipi_list = NULL;
+ #endif
+-        /* send out pending IPI's on offline CPU */
+-        net_rps_send_ipi(remsd);
++                /* send out pending IPI's on offline CPU */
++                net_rps_send_ipi(remsd);
++        }
+
+         /* Process offline CPU's input_pkt_queue */
+         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
+@@ -11659,6 +11702,38 @@ static void __init net_dev_struct_check(
+  *
+  */
+
++static int backlog_napi_should_run(unsigned int cpu)
++{
++        struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu);
++        struct napi_struct *napi = &sd->backlog;
++
++        return test_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
++}
++
++static void run_backlog_napi(unsigned int cpu)
++{
++        struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu);
++
++        napi_threaded_poll_loop(&sd->backlog);
++}
++
++static void backlog_napi_setup(unsigned int cpu)
++{
++        struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu);
++        struct napi_struct *napi = &sd->backlog;
++
++        napi->thread = this_cpu_read(backlog_napi);
++        set_bit(NAPI_STATE_THREADED, &napi->state);
++}
++
++static struct smp_hotplug_thread backlog_threads = {
++        .store                  = &backlog_napi,
++        .thread_should_run      = backlog_napi_should_run,
++        .thread_fn              = run_backlog_napi,
++        .thread_comm            = "backlog_napi/%u",
"backlog_napi/%u", ++ .setup = backlog_napi_setup, ++}; ++ + /* + * This is called single threaded during boot, so no need + * to take the rtnl semaphore. +@@ -11711,7 +11786,10 @@ static int __init net_dev_init(void) + init_gro_hash(&sd->backlog); + sd->backlog.poll = process_backlog; + sd->backlog.weight = weight_p; ++ INIT_LIST_HEAD(&sd->backlog.poll_list); + } ++ if (use_backlog_threads()) ++ smpboot_register_percpu_thread(&backlog_threads); + + dev_boot_phase = 0; + |