[PATCH 24/30] sched: NUMA-aware per-memory-map concurrency ID

From: Mathieu Desnoyers
Date: Tue Nov 22 2022 - 15:42:29 EST

Next message: Mathieu Desnoyers: "[PATCH 29/30] selftests/rseq: Implement mm_numa_cid tests"
Previous message: Mathieu Desnoyers: "[PATCH 26/30] selftests/rseq: x86: Implement rseq_load_u32_u32"
In reply to: Mathieu Desnoyers: "[PATCH 26/30] selftests/rseq: x86: Implement rseq_load_u32_u32"
Next in thread: Mathieu Desnoyers: "[PATCH 29/30] selftests/rseq: Implement mm_numa_cid tests"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]

Keep track of a NUMA-aware concurrency ID. On NUMA systems, when a
NUMA-aware concurrency ID is observed by user-space to be associated
with a NUMA node, it is guaranteed to never change NUMA node unless a
kernel-level NUMA configuration change happens.

Exposing a NUMA-aware concurrency ID is useful in situations where a
process or a set of processes belonging to a cpuset are pinned to a set
of cores which belong to a subset of the system's NUMA nodes. In those
situations, it is possible to benefit from the compactness of
concurrency IDs over CPU IDs, while keeping NUMA locality, for indexing
a per-CPU data structure which takes into account NUMA locality.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@xxxxxxxxxxxx>
---
include/linux/mm.h | 18 +++++
include/linux/mm_types.h | 68 +++++++++++++++-
include/linux/sched.h | 3 +
kernel/fork.c | 3 +
kernel/sched/core.c | 8 +-
kernel/sched/sched.h | 168 +++++++++++++++++++++++++++++++++++----
6 files changed, 245 insertions(+), 23 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index e0fba52de3e2..c7dfdf4c9d08 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3484,6 +3484,20 @@ static inline int task_mm_cid(struct task_struct *t)
{
return t->mm_cid;
}
+#ifdef CONFIG_NUMA
+static inline int task_mm_numa_cid(struct task_struct *t)
+{
+ if (num_possible_nodes() == 1)
+ return task_mm_cid(t);
+ else
+ return t->mm_numa_cid;
+}
+#else
+static inline int task_mm_numa_cid(struct task_struct *t)
+{
+ return task_mm_cid(t);
+}
+#endif
#else
static inline void sched_mm_cid_before_execve(struct task_struct *t) { }
static inline void sched_mm_cid_after_execve(struct task_struct *t) { }
@@ -3498,6 +3512,10 @@ static inline int task_mm_cid(struct task_struct *t)
*/
return raw_smp_processor_id();
}
+static inline int task_mm_numa_cid(struct task_struct *t)
+{
+ return task_mm_cid(t);
+}
#endif

#endif /* _LINUX_MM_H */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index dabb42d26bb9..8c9afe8ce603 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -18,6 +18,7 @@
#include <linux/page-flags-layout.h>
#include <linux/workqueue.h>
#include <linux/seqlock.h>
+#include <linux/nodemask.h>

#include <asm/mmu.h>

@@ -847,15 +848,80 @@ static inline cpumask_t *mm_cidmask(struct mm_struct *mm)
return (struct cpumask *)cid_bitmap;
}

+#ifdef CONFIG_NUMA
+/*
+ * Layout of node cidmasks:
+ * - mm_numa cidmask: cpumask of the currently used mm_numa cids.
+ * - node_alloc cidmask: cpumask tracking which cids were
+ * allocated (across nodes) in this
+ * memory map.
+ * - node cidmask[nr_node_ids]: per-node cpumask tracking which cids
+ * were allocated in this memory map.
+ */
+static inline cpumask_t *mm_numa_cidmask(struct mm_struct *mm)
+{
+ unsigned long cid_bitmap = (unsigned long)mm_cidmask(mm);
+
+ /* Skip mm_cidmask */
+ cid_bitmap += cpumask_size();
+ return (struct cpumask *)cid_bitmap;
+}
+
+static inline cpumask_t *mm_node_alloc_cidmask(struct mm_struct *mm)
+{
+ unsigned long cid_bitmap = (unsigned long)mm_numa_cidmask(mm);
+
+ /* Skip mm_numa_cidmask */
+ cid_bitmap += cpumask_size();
+ return (struct cpumask *)cid_bitmap;
+}
+
+static inline cpumask_t *mm_node_cidmask(struct mm_struct *mm, unsigned int node)
+{
+ unsigned long cid_bitmap = (unsigned long)mm_node_alloc_cidmask(mm);
+
+ /* Skip node alloc cidmask */
+ cid_bitmap += cpumask_size();
+ cid_bitmap += node * cpumask_size();
+ return (struct cpumask *)cid_bitmap;
+}
+
+static inline void mm_init_node_cidmask(struct mm_struct *mm)
+{
+ unsigned int node;
+
+ if (num_possible_nodes() == 1)
+ return;
+ cpumask_clear(mm_numa_cidmask(mm));
+ cpumask_clear(mm_node_alloc_cidmask(mm));
+ for (node = 0; node < nr_node_ids; node++)
+ cpumask_clear(mm_node_cidmask(mm, node));
+}
+
+static inline unsigned int mm_node_cidmask_size(void)
+{
+ if (num_possible_nodes() == 1)
+ return 0;
+ return (nr_node_ids + 2) * cpumask_size();
+}
+#else /* CONFIG_NUMA */
+static inline void mm_init_node_cidmask(struct mm_struct *mm) { }
+static inline unsigned int mm_node_cidmask_size(void)
+{
+ return 0;
+}
+#endif /* CONFIG_NUMA */
+
static inline void mm_init_cid(struct mm_struct *mm)
{
raw_spin_lock_init(&mm->cid_lock);
cpumask_clear(mm_cidmask(mm));
+ mm_init_node_cidmask(mm);
}

static inline unsigned int mm_cid_size(void)
{
- return cpumask_size();
+ return cpumask_size() + mm_node_cidmask_size();
}
#else /* CONFIG_SCHED_MM_CID */
static inline void mm_init_cid(struct mm_struct *mm) { }
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c7e3c27e0e2e..990ef3d4b22b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1317,6 +1317,9 @@ struct task_struct {
#ifdef CONFIG_SCHED_MM_CID
int mm_cid; /* Current cid in mm */
int mm_cid_active; /* Whether cid bitmap is active */
+#ifdef CONFIG_NUMA
+ int mm_numa_cid; /* Current numa_cid in mm */
+#endif
#endif

struct tlbflush_unmap_batch tlb_ubc;
diff --git a/kernel/fork.c b/kernel/fork.c
index d48dedc4be75..364f4c62b1a4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1050,6 +1050,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
#ifdef CONFIG_SCHED_MM_CID
tsk->mm_cid = -1;
tsk->mm_cid_active = 0;
+#ifdef CONFIG_NUMA
+ tsk->mm_numa_cid = -1;
+#endif
#endif
return tsk;

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ef0cc40cca6b..095b5eb35d3d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -11284,8 +11284,7 @@ void sched_mm_cid_exit_signals(struct task_struct *t)
if (!mm)
return;
local_irq_save(flags);
- mm_cid_put(mm, t->mm_cid);
- t->mm_cid = -1;
+ mm_cid_put(mm, t);
t->mm_cid_active = 0;
local_irq_restore(flags);
}
@@ -11298,8 +11297,7 @@ void sched_mm_cid_before_execve(struct task_struct *t)
if (!mm)
return;
local_irq_save(flags);
- mm_cid_put(mm, t->mm_cid);
- t->mm_cid = -1;
+ mm_cid_put(mm, t);
t->mm_cid_active = 0;
local_irq_restore(flags);
}
@@ -11312,7 +11310,7 @@ void sched_mm_cid_after_execve(struct task_struct *t)
WARN_ON_ONCE((t->flags & PF_KTHREAD) || !t->mm);

local_irq_save(flags);
- t->mm_cid = mm_cid_get(mm);
+ mm_cid_get(mm, t);
t->mm_cid_active = 1;
local_irq_restore(flags);
rseq_set_notify_resume(t);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 0096dc22926e..87f61f926e88 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3262,38 +3262,174 @@ static inline void update_current_exec_runtime(struct task_struct *curr,
}

#ifdef CONFIG_SCHED_MM_CID
-static inline int __mm_cid_get(struct mm_struct *mm)
+#ifdef CONFIG_NUMA
+static inline void __mm_numa_cid_get(struct mm_struct *mm, struct task_struct *t)
+{
+ struct cpumask *cpumask = mm_numa_cidmask(mm),
+ *node_cpumask = mm_node_cidmask(mm, numa_node_id()),
+ *node_alloc_cpumask = mm_node_alloc_cidmask(mm);
+ unsigned int node;
+ int cid;
+
+ if (num_possible_nodes() == 1) {
+ cid = -1;
+ goto end;
+ }
+
+ /*
+ * Try to reserve lowest available cid number within those already
+ * reserved for this NUMA node.
+ */
+ cid = cpumask_first_andnot(node_cpumask, cpumask);
+ if (cid >= nr_cpu_ids)
+ goto alloc_numa;
+ __cpumask_set_cpu(cid, cpumask);
+ goto end;
+
+alloc_numa:
+ /*
+ * Try to reserve lowest available cid number within those not already
+ * allocated for numa nodes.
+ */
+ cid = cpumask_first_notandnot(node_alloc_cpumask, cpumask);
+ if (cid >= nr_cpu_ids)
+ goto numa_update;
+ __cpumask_set_cpu(cid, cpumask);
+ __cpumask_set_cpu(cid, node_cpumask);
+ __cpumask_set_cpu(cid, node_alloc_cpumask);
+ goto end;
+
+numa_update:
+ /*
+ * NUMA node id configuration changed for at least one CPU in the system.
+ * We need to steal a currently unused cid from an overprovisioned
+ * node for our current node. Userspace must handle the fact that the
+ * node id associated with this cid may change due to node ID
+ * reconfiguration.
+ *
+ * Count how many possible cpus are attached to each (other) node id,
+ * and compare this with the per-mm node cidmask cpu count. Find one
+ * which has too many cpus in its mask to steal from.
+ */
+ for (node = 0; node < nr_node_ids; node++) {
+ struct cpumask *iter_cpumask;
+
+ if (node == numa_node_id())
+ continue;
+ iter_cpumask = mm_node_cidmask(mm, node);
+ if (nr_cpus_node(node) < cpumask_weight(iter_cpumask)) {
+ /* Try to steal from this node. */
+ cid = cpumask_first_andnot(iter_cpumask, cpumask);
+ if (cid >= nr_cpu_ids)
+ goto steal_fail;
+ __cpumask_set_cpu(cid, cpumask);
+ __cpumask_clear_cpu(cid, iter_cpumask);
+ __cpumask_set_cpu(cid, node_cpumask);
+ goto end;
+ }
+ }
+
+steal_fail:
+ /*
+ * Our attempt at gracefully stealing a cid from another
+ * overprovisioned NUMA node failed. Fallback to grabbing the first
+ * available cid.
+ */
+ cid = cpumask_first_zero(cpumask);
+ if (cid >= nr_cpu_ids) {
+ cid = -1;
+ goto end;
+ }
+ __cpumask_set_cpu(cid, cpumask);
+ /* Steal cid from its numa node mask. */
+ for (node = 0; node < nr_node_ids; node++) {
+ struct cpumask *iter_cpumask;
+
+ if (node == numa_node_id())
+ continue;
+ iter_cpumask = mm_node_cidmask(mm, node);
+ if (cpumask_test_cpu(cid, iter_cpumask)) {
+ __cpumask_clear_cpu(cid, iter_cpumask);
+ break;
+ }
+ }
+ __cpumask_set_cpu(cid, node_cpumask);
+end:
+ t->mm_numa_cid = cid;
+}
+
+static inline void __mm_numa_cid_put(struct mm_struct *mm, struct task_struct *t)
+{
+ int cid = t->mm_numa_cid;
+
+ if (num_possible_nodes() == 1)
+ return;
+ if (cid < 0)
+ return;
+ __cpumask_clear_cpu(cid, mm_numa_cidmask(mm));
+ t->mm_numa_cid = -1;
+}
+
+static inline void mm_numa_transfer_cid_prev_next(struct task_struct *prev, struct task_struct *next)
+{
+ next->mm_numa_cid = prev->mm_numa_cid;
+ prev->mm_numa_cid = -1;
+}
+#else
+static inline void __mm_numa_cid_get(struct mm_struct *mm, struct task_struct *t) { }
+static inline void __mm_numa_cid_put(struct mm_struct *mm, struct task_struct *t) { }
+static inline void mm_numa_transfer_cid_prev_next(struct task_struct *prev, struct task_struct *next) { }
+#endif
+
+static inline void __mm_cid_get(struct mm_struct *mm, struct task_struct *t)
{
struct cpumask *cpumask;
int cid;

cpumask = mm_cidmask(mm);
cid = cpumask_first_zero(cpumask);
- if (cid >= nr_cpu_ids)
- return -1;
+ if (cid >= nr_cpu_ids) {
+ cid = -1;
+ goto end;
+ }
__cpumask_set_cpu(cid, cpumask);
- return cid;
+end:
+ t->mm_cid = cid;
}

-static inline void mm_cid_put(struct mm_struct *mm, int cid)
+static inline void mm_cid_get(struct mm_struct *mm, struct task_struct *t)
{
lockdep_assert_irqs_disabled();
- if (cid < 0)
- return;
raw_spin_lock(&mm->cid_lock);
- __cpumask_clear_cpu(cid, mm_cidmask(mm));
+ __mm_cid_get(mm, t);
+ __mm_numa_cid_get(mm, t);
raw_spin_unlock(&mm->cid_lock);
}

-static inline int mm_cid_get(struct mm_struct *mm)
+static inline void __mm_cid_put(struct mm_struct *mm, struct task_struct *t)
{
- int ret;
+ int cid = t->mm_cid;
+
+ if (cid < 0)
+ return;
+ __cpumask_clear_cpu(cid, mm_cidmask(mm));
+ t->mm_cid = -1;
+}

+static inline void mm_cid_put(struct mm_struct *mm, struct task_struct *t)
+{
lockdep_assert_irqs_disabled();
raw_spin_lock(&mm->cid_lock);
- ret = __mm_cid_get(mm);
+ __mm_cid_put(mm, t);
+ __mm_numa_cid_put(mm, t);
raw_spin_unlock(&mm->cid_lock);
- return ret;
+}
+
+static inline void mm_transfer_cid_prev_next(struct task_struct *prev, struct task_struct *next)
+{
+ next->mm_cid = prev->mm_cid;
+ prev->mm_cid = -1;
+ mm_numa_transfer_cid_prev_next(prev, next);
}

static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next)
@@ -3304,15 +3440,13 @@ static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *n
* Context switch between threads in same mm, hand over
* the mm_cid from prev to next.
*/
- next->mm_cid = prev->mm_cid;
- prev->mm_cid = -1;
+ mm_transfer_cid_prev_next(prev, next);
return;
}
- mm_cid_put(prev->mm, prev->mm_cid);
- prev->mm_cid = -1;
+ mm_cid_put(prev->mm, prev);
}
if (next->mm_cid_active)
- next->mm_cid = mm_cid_get(next->mm);
+ mm_cid_get(next->mm, next);
}

#else
--
2.25.1

Next message: Mathieu Desnoyers: "[PATCH 29/30] selftests/rseq: Implement mm_numa_cid tests"
Previous message: Mathieu Desnoyers: "[PATCH 26/30] selftests/rseq: x86: Implement rseq_load_u32_u32"
In reply to: Mathieu Desnoyers: "[PATCH 26/30] selftests/rseq: x86: Implement rseq_load_u32_u32"
Next in thread: Mathieu Desnoyers: "[PATCH 29/30] selftests/rseq: Implement mm_numa_cid tests"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]