[PATCH v5 2/3] sched: Avoid placing RT threads on cores handling long softirqs
From: John Stultz
Date: Wed Nov 16 2022 - 02:59:51 EST
From: Connor O'Brien <connoro@xxxxxxxxxx>
In certain audio use cases, scheduling RT threads on cores that
are handling softirqs can lead to glitches. Prevent this
behavior in cases where the softirq is likely to take a long
time. To avoid unnecessary migrations, the old behavior is
preserved for RCU, SCHED and TIMER irqs which are expected to be
relatively quick.
This patch reworks and combines two related changes originally
by John Dias <joaodias@xxxxxxxxxx>
Cc: John Dias <joaodias@xxxxxxxxxx>
Cc: Connor O'Brien <connoro@xxxxxxxxxx>
Cc: Rick Yiu <rickyiu@xxxxxxxxxx>
Cc: John Kacur <jkacur@xxxxxxxxxx>
Cc: Qais Yousef <qyousef@xxxxxxxxxx>
Cc: Chris Redpath <chris.redpath@xxxxxxx>
Cc: Abhijeet Dharmapurikar <adharmap@xxxxxxxxxxx>
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Cc: Ingo Molnar <mingo@xxxxxxxxxx>
Cc: Juri Lelli <juri.lelli@xxxxxxxxxx>
Cc: Vincent Guittot <vincent.guittot@xxxxxxxxxx>
Cc: Dietmar Eggemann <dietmar.eggemann@xxxxxxx>
Cc: Steven Rostedt <rostedt@xxxxxxxxxxx>
Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Cc: Heiko Carstens <hca@xxxxxxxxxxxxx>
Cc: Vasily Gorbik <gor@xxxxxxxxxxxxx>
Cc: Joel Fernandes <joel@xxxxxxxxxxxxxxxxx>
Cc: Alexander Gordeev <agordeev@xxxxxxxxxxxxx>
Cc: kernel-team@xxxxxxxxxxx
Signed-off-by: John Dias <joaodias@xxxxxxxxxx>
[elavila: Port to mainline, amend commit text]
Signed-off-by: J. Avila <elavila@xxxxxxxxxx>
[connoro: Reworked, simplified, and merged two patches together]
Signed-off-by: Connor O'Brien <connoro@xxxxxxxxxx>
[jstultz: Further simplified and fixed issues, reworded commit
message, removed arm64-isms]
Signed-off-by: John Stultz <jstultz@xxxxxxxxxx>
---
v2:
* Reformatted Kconfig entry to match coding style
(Reported-by: Randy Dunlap <rdunlap@xxxxxxxxxxxxx>)
* Made rt_task_fits_capacity_and_may_preempt static to
avoid warnings (Reported-by: kernel test robot <lkp@xxxxxxxxx>)
* Rework to use preempt_count and drop kconfig dependency on ARM64
v3:
* Use introduced __cpu_softirq_pending() to avoid s390 build
issues (Reported-by: kernel test robot <lkp@xxxxxxxxx>)
v4:
* Drop TASKLET_SOFTIRQ from LONG_SOFTIRQS (suggested by Qais)
* Depend on !PREEMPT_RT (Suggested by Qais)
* Larger simplification of logic (suggested by Qais)
* Rework LONG_SOFTIRQS to use BIT() macros
* Rename task_may_preempt() to cpu_busy_with_softirqs()
v5:
* Conditionalize active_softirqs handling (suggested by Alexander
Gordeev <agordeev@xxxxxxxxxxxxx>)
* Reorder rt_task_fits_cpu to have the "fast" function first
(Suggested by Alexander Gordeev <agordeev@xxxxxxxxxxxxx>)
* Fix bug I introduced in v2 condensing
task_thread_info(task)->preempt_count to preempt_count()
(Reported-by: Alexander Gordeev <agordeev@xxxxxxxxxxxxx>)
* Tweak comment description to remove the vague "slow"
descriptor of softirqs being run by ksoftirqd
(Suggested by Alexander Gordeev <agordeev@xxxxxxxxxxxxx>)
* Switch to using CONFIG_RT_SOFTIRQ_AWARE_SCHED (suggested by
Joel Fernandes <joel@xxxxxxxxxxxxxxxxx>)
* Simplify cpu_busy_with_softirqs() logic as pointed out by
Alexander Gordeev <agordeev@xxxxxxxxxxxxx>
* Switch to using IS_ENABLED rather than defining my own macro
(suggested by Joel Fernandes <joel@xxxxxxxxxxxxxxxxx>)
---
include/linux/interrupt.h | 9 +++++++
init/Kconfig | 10 ++++++++
kernel/sched/rt.c | 49 ++++++++++++++++++++++++++++++++-------
kernel/softirq.c | 17 ++++++++++++++
4 files changed, 76 insertions(+), 9 deletions(-)
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index a749a8663841..7d09eb998d4c 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -582,6 +582,11 @@ enum
* _ IRQ_POLL: irq_poll_cpu_dead() migrates the queue
*/
#define SOFTIRQ_HOTPLUG_SAFE_MASK (BIT(RCU_SOFTIRQ) | BIT(IRQ_POLL_SOFTIRQ))
+/* Softirqs where the handling might be long: */
+#define LONG_SOFTIRQ_MASK (BIT(NET_TX_SOFTIRQ) | \
+ BIT(NET_RX_SOFTIRQ) | \
+ BIT(BLOCK_SOFTIRQ) | \
+ BIT(IRQ_POLL_SOFTIRQ))
/* map softirq index to softirq name. update 'softirq_to_name' in
* kernel/softirq.c when adding a new softirq.
@@ -618,6 +623,10 @@ extern void raise_softirq(unsigned int nr);
DECLARE_PER_CPU(struct task_struct *, ksoftirqd);
+#ifdef CONFIG_RT_SOFTIRQ_AWARE_SCHED
+DECLARE_PER_CPU(u32, active_softirqs);
+#endif
+
static inline struct task_struct *this_cpu_ksoftirqd(void)
{
return this_cpu_read(ksoftirqd);
diff --git a/init/Kconfig b/init/Kconfig
index abf65098f1b6..ce0f0be5759c 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1291,6 +1291,16 @@ config SCHED_AUTOGROUP
desktop applications. Task group autogeneration is currently based
upon task session.
+config RT_SOFTIRQ_AWARE_SCHED
+ bool "Improve RT scheduling during long softirq execution"
+ depends on SMP && !PREEMPT_RT
+ default n
+ help
+ Enable an optimization which tries to avoid placing RT tasks on CPUs
+ occupied by nonpreemptible tasks, such as a long softirq or CPUs
+ which may soon block preemptions, such as a CPU running a ksoftirq
+ thread which handles slow softirqs.
+
config SYSFS_DEPRECATED
bool "Enable deprecated sysfs features to support old userspace tools"
depends on SYSFS
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index ed2a47e4ddae..152347c4394c 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1595,6 +1595,32 @@ static void yield_task_rt(struct rq *rq)
#ifdef CONFIG_SMP
static int find_lowest_rq(struct task_struct *task);
+#ifdef CONFIG_RT_SOFTIRQ_AWARE_SCHED
+/*
+ * Return whether the given cpu is currently non-preemptible
+ * while handling a potentially long softirq, or if the current
+ * task is likely to block preemptions soon because it is a
+ * ksoftirq thread that is handling softirqs.
+ */
+static bool cpu_busy_with_softirqs(int cpu)
+{
+ u32 softirqs = per_cpu(active_softirqs, cpu) |
+ __cpu_softirq_pending(cpu);
+
+ return softirqs & LONG_SOFTIRQ_MASK;
+}
+#else
+static bool cpu_busy_with_softirqs(int cpu)
+{
+ return false;
+}
+#endif /* CONFIG_RT_SOFTIRQ_AWARE_SCHED */
+
+static bool rt_task_fits_cpu(struct task_struct *p, int cpu)
+{
+ return rt_task_fits_capacity(p, cpu) && !cpu_busy_with_softirqs(cpu);
+}
+
static int
select_task_rq_rt(struct task_struct *p, int cpu, int flags)
{
@@ -1633,22 +1659,24 @@ select_task_rq_rt(struct task_struct *p, int cpu, int flags)
* This test is optimistic, if we get it wrong the load-balancer
* will have to sort it out.
*
- * We take into account the capacity of the CPU to ensure it fits the
- * requirement of the task - which is only important on heterogeneous
- * systems like big.LITTLE.
+ * We use rt_task_fits_cpu() to evaluate if the CPU is busy with
+ * potentially long-running softirq work, as well as take into
+ * account the capacity of the CPU to ensure it fits the
+ * requirement of the task - which is only important on
+ * heterogeneous systems like big.LITTLE.
*/
test = curr &&
unlikely(rt_task(curr)) &&
(curr->nr_cpus_allowed < 2 || curr->prio <= p->prio);
- if (test || !rt_task_fits_capacity(p, cpu)) {
+ if (test || !rt_task_fits_cpu(p, cpu)) {
int target = find_lowest_rq(p);
/*
* Bail out if we were forcing a migration to find a better
* fitting CPU but our search failed.
*/
- if (!test && target != -1 && !rt_task_fits_capacity(p, target))
+ if (!test && target != -1 && !rt_task_fits_cpu(p, target))
goto out_unlock;
/*
@@ -1890,14 +1918,17 @@ static int find_lowest_rq(struct task_struct *task)
return -1; /* No other targets possible */
/*
- * If we're on asym system ensure we consider the different capacities
- * of the CPUs when searching for the lowest_mask.
+ * If we're using the softirq optimization or if we are
+ * on asym system, ensure we consider the softirq processing
+ * or different capacities of the CPUs when searching for the
+ * lowest_mask.
*/
- if (sched_asym_cpucap_active()) {
+ if (IS_ENABLED(CONFIG_RT_SOFTIRQ_AWARE_SCHED) ||
+ sched_asym_cpucap_active()) {
ret = cpupri_find_fitness(&task_rq(task)->rd->cpupri,
task, lowest_mask,
- rt_task_fits_capacity);
+ rt_task_fits_cpu);
} else {
ret = cpupri_find(&task_rq(task)->rd->cpupri,
diff --git a/kernel/softirq.c b/kernel/softirq.c
index c8a6913c067d..dd92ce8f771b 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -60,6 +60,21 @@ static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp
DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
+#ifdef CONFIG_RT_SOFTIRQ_AWARE_SCHED
+/*
+ * active_softirqs -- per cpu, a mask of softirqs that are being handled,
+ * with the expectation that approximate answers are acceptable and therefore
+ * no synchronization.
+ */
+DEFINE_PER_CPU(u32, active_softirqs);
+static inline void set_active_softirqs(u32 pending)
+{
+ __this_cpu_write(active_softirqs, pending);
+}
+#else /* CONFIG_RT_SOFTIRQ_AWARE_SCHED */
+static inline void set_active_softirqs(u32 pending) {};
+#endif /* CONFIG_RT_SOFTIRQ_AWARE_SCHED */
+
const char * const softirq_to_name[NR_SOFTIRQS] = {
"HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "IRQ_POLL",
"TASKLET", "SCHED", "HRTIMER", "RCU"
@@ -551,6 +566,7 @@ asmlinkage __visible void __softirq_entry __do_softirq(void)
restart:
/* Reset the pending bitmask before enabling irqs */
set_softirq_pending(0);
+ set_active_softirqs(pending);
local_irq_enable();
@@ -580,6 +596,7 @@ asmlinkage __visible void __softirq_entry __do_softirq(void)
pending >>= softirq_bit;
}
+ set_active_softirqs(0);
if (!IS_ENABLED(CONFIG_PREEMPT_RT) &&
__this_cpu_read(ksoftirqd) == current)
rcu_softirq_qs();
--
2.38.1.431.g37b22c650d-goog