Re: [External] Re: [PATCH v2] mm: add new syscall pidfd_set_mempolicy().
From: Zhongkun He
Date: Thu Nov 17 2022 - 02:19:30 EST
Hi Michal, thanks for your replay.
It would be better to add the patch that has been tested.
OK.
One way to deal with that would be to use a similar model as css_tryget
Percpu_ref is a good way to reduce memory footprint in fast path.But it
has the potential to make mempolicy heavy. the sizeof mempolicy is 32
bytes and it may not have a long life time, which duplicated from the
parent in fork().If we modify atomic_t to percpu_ref, the efficiency of
reading in fastpath will increase, the efficiency of creation and
deletion will decrease, and the occupied space will increase
significantly.I am not really sure it is worth it.
atomic_t; 4
sizeof(percpu_ref + percpu_ref_data + cpus* unsigned long)
16+56+cpus*8
Btw. have you tried to profile those slowdowns to identify hotspots?
Thanks
Yes, it will degrade performance about 2%-%3 may because of the
task_lock and atomic operations on the reference count as shown
in the previous email.
new hotspots in perf.
1.34% [kernel] [k] __mpol_put
0.53% [kernel] [k] _raw_spin_lock
0.44% [kernel] [k] get_task_policy
Tested patch.
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 8a74cdcc9af0..3f1b5c8329a8 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -105,10 +105,7 @@ static void hold_task_mempolicy(struct
proc_maps_private *priv)
{
struct task_struct *task = priv->task;
- task_lock(task);
priv->task_mempolicy = get_task_policy(task);
- mpol_get(priv->task_mempolicy);
- task_unlock(task);
}
static void release_task_mempolicy(struct proc_maps_private *priv)
{
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index d232de7cdc56..786481d7abfd 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -62,7 +62,7 @@ struct mempolicy {
extern void __mpol_put(struct mempolicy *pol);
static inline void mpol_put(struct mempolicy *pol)
{
- if (pol)
+ if (pol && !(pol->flags & MPOL_F_STATIC))
__mpol_put(pol);
}
diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h
index 046d0ccba4cd..7c2068163a0c 100644
--- a/include/uapi/linux/mempolicy.h
+++ b/include/uapi/linux/mempolicy.h
@@ -63,7 +63,7 @@ enum {
#define MPOL_F_SHARED (1 << 0) /* identify shared policies */
#define MPOL_F_MOF (1 << 3) /* this policy wants migrate on fault */
#define MPOL_F_MORON (1 << 4) /* Migrate On protnone Reference On
Node */
-
+#define MPOL_F_STATIC (1 << 5)
/*
* These bit locations are exposed in the vm.zone_reclaim_mode sysctl
* ABI. New bits are OK, but existing bits can never change.
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 546df97c31e4..4cca96a40d04 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1247,6 +1247,7 @@ static struct page *dequeue_huge_page_vma(struct
hstate *h,
}
mpol_cond_put(mpol);
+ mpol_put(mpol);
return page;
err:
@@ -2316,6 +2317,7 @@ struct page
*alloc_buddy_huge_page_with_mpol(struct hstate *h,
if (!page)
page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask);
mpol_cond_put(mpol);
+ mpol_put(mpol);
return page;
}
@@ -2352,6 +2354,7 @@ struct page *alloc_huge_page_vma(struct hstate *h,
struct vm_area_struct *vma,
node = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
page = alloc_huge_page_nodemask(h, node, nodemask, gfp_mask);
mpol_cond_put(mpol);
+ mpol_put(mpol);
return page;
}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 61aa9aedb728..ea670db6881f 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -126,6 +126,7 @@ enum zone_type policy_zone = 0;
static struct mempolicy default_policy = {
.refcnt = ATOMIC_INIT(1), /* never free it */
.mode = MPOL_LOCAL,
+ .flags = MPOL_F_STATIC
};
static struct mempolicy preferred_node_policy[MAX_NUMNODES];
@@ -160,11 +161,19 @@ EXPORT_SYMBOL_GPL(numa_map_to_online_node);
struct mempolicy *get_task_policy(struct task_struct *p)
{
- struct mempolicy *pol = p->mempolicy;
+ struct mempolicy *pol;
int node;
- if (pol)
- return pol;
+ if (p->mempolicy)
+ {
+ task_lock(p);
+ pol = p->mempolicy;
+ mpol_get(pol);
+ task_unlock(p);
+
+ if(pol)
+ return pol;
+ }
node = numa_node_id();
if (node != NUMA_NO_NODE) {
@@ -1764,10 +1773,12 @@ struct mempolicy *__get_vma_policy(struct
vm_area_struct *vma,
* a pseudo vma whose vma->vm_ops=NULL. Take a
reference
* count on these policies which will be dropped by
* mpol_cond_put() later
+ *
+ * if (mpol_needs_cond_ref(pol))
+ * mpol_get(pol);
*/
- if (mpol_needs_cond_ref(pol))
- mpol_get(pol);
}
+ mpol_get(pol);
}
return pol;
@@ -1799,9 +1810,9 @@ static struct mempolicy *get_vma_policy(struct
vm_area_struct *vma,
bool vma_policy_mof(struct vm_area_struct *vma)
{
struct mempolicy *pol;
+ bool ret = false;
if (vma->vm_ops && vma->vm_ops->get_policy) {
- bool ret = false;
pol = vma->vm_ops->get_policy(vma, vma->vm_start);
if (pol && (pol->flags & MPOL_F_MOF))
@@ -1812,10 +1823,13 @@ bool vma_policy_mof(struct vm_area_struct *vma)
}
pol = vma->vm_policy;
+ mpol_get(pol);
if (!pol)
pol = get_task_policy(current);
+ ret = pol && (pol->flags & MPOL_F_MOF);
+ mpol_put(pol);
- return pol->flags & MPOL_F_MOF;
+ return ret;
}
bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
@@ -2179,7 +2193,6 @@ struct folio *vma_alloc_folio(gfp_t gfp, int
order, struct vm_area_struct *vma,
unsigned nid;
nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
- mpol_cond_put(pol);
gfp |= __GFP_COMP;
page = alloc_page_interleave(gfp, order, nid);
if (page && order > 1)
@@ -2194,7 +2207,6 @@ struct folio *vma_alloc_folio(gfp_t gfp, int
order, struct vm_area_struct *vma,
node = policy_node(gfp, pol, node);
gfp |= __GFP_COMP;
page = alloc_pages_preferred_many(gfp, order, node, pol);
- mpol_cond_put(pol);
if (page && order > 1)
prep_transhuge_page(page);
folio = (struct folio *)page;
@@ -2219,7 +2231,6 @@ struct folio *vma_alloc_folio(gfp_t gfp, int
order, struct vm_area_struct *vma,
nmask = policy_nodemask(gfp, pol);
if (!nmask || node_isset(hpage_node, *nmask)) {
- mpol_cond_put(pol);
/*
* First, try to allocate THP only on local
node, but
* don't reclaim unnecessarily, just compact.
@@ -2244,8 +2255,9 @@ struct folio *vma_alloc_folio(gfp_t gfp, int
order, struct vm_area_struct *vma,
nmask = policy_nodemask(gfp, pol);
preferred_nid = policy_node(gfp, pol, node);
folio = __folio_alloc(gfp, order, preferred_nid, nmask);
- mpol_cond_put(pol);
out:
+ mpol_cond_put(pol);
+ mpol_put(pol);
return folio;
}
EXPORT_SYMBOL(vma_alloc_folio);
@@ -2286,6 +2298,7 @@ struct page *alloc_pages(gfp_t gfp, unsigned order)
policy_node(gfp, pol, numa_node_id()),
policy_nodemask(gfp, pol));
+ mpol_put(pol);
return page;
}
EXPORT_SYMBOL(alloc_pages);
@@ -2365,21 +2378,23 @@ unsigned long
alloc_pages_bulk_array_mempolicy(gfp_t gfp,
unsigned long nr_pages, struct page **page_array)
{
struct mempolicy *pol = &default_policy;
+ unsigned long allocated;
if (!in_interrupt() && !(gfp & __GFP_THISNODE))
pol = get_task_policy(current);
- if (pol->mode == MPOL_INTERLEAVE)
- return alloc_pages_bulk_array_interleave(gfp, pol,
+ if (pol->mode == MPOL_INTERLEAVE) {
+ allocated = alloc_pages_bulk_array_interleave(gfp, pol,
nr_pages,
page_array);
-
- if (pol->mode == MPOL_PREFERRED_MANY)
- return alloc_pages_bulk_array_preferred_many(gfp,
+ } else if (pol->mode == MPOL_PREFERRED_MANY)
+ allocated = alloc_pages_bulk_array_preferred_many(gfp,
numa_node_id(), pol, nr_pages, page_array);
-
- return __alloc_pages_bulk(gfp, policy_node(gfp, pol,
numa_node_id()),
+ else
+ allocated = __alloc_pages_bulk(gfp, policy_node(gfp, pol,
numa_node_id()),
policy_nodemask(gfp, pol), nr_pages,
NULL,
page_array);
+ mpol_put(pol);
+ return allocated;
}
int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
@@ -2636,6 +2651,7 @@ int mpol_misplaced(struct page *page, struct
vm_area_struct *vma, unsigned long
ret = polnid;
out:
mpol_cond_put(pol);
+ mpol_put(pol);
return ret;
}
@@ -2917,7 +2933,7 @@ void __init numa_policy_init(void)
preferred_node_policy[nid] = (struct mempolicy) {
.refcnt = ATOMIC_INIT(1),
.mode = MPOL_PREFERRED,
- .flags = MPOL_F_MOF | MPOL_F_MORON,
+ .flags = MPOL_F_MOF | MPOL_F_MORON | MPOL_F_STATIC,
.nodes = nodemask_of_node(nid),
};
}