Re: Low TCP throughput due to vmpressure with swap enabled
From: Ivan Babrou
Date: Tue Nov 22 2022 - 17:12:12 EST
On Tue, Nov 22, 2022 at 12:05 PM Johannes Weiner <hannes@xxxxxxxxxxx> wrote:
>
> On Mon, Nov 21, 2022 at 04:53:43PM -0800, Ivan Babrou wrote:
> > Hello,
> >
> > We have observed a negative TCP throughput behavior from the following commit:
> >
> > * 8e8ae645249b mm: memcontrol: hook up vmpressure to socket pressure
> >
> > It landed back in 2016 in v4.5, so it's not exactly a new issue.
> >
> > The crux of the issue is that in some cases with swap present the
> > workload can be unfairly throttled in terms of TCP throughput.
>
> Thanks for the detailed analysis, Ivan.
>
> Originally, we pushed back on sockets only when regular page reclaim
> had completely failed and we were about to OOM. This patch was an
> attempt to be smarter about it and equalize pressure more smoothly
> between socket memory, file cache, anonymous pages.
>
> After a recent discussion with Shakeel, I'm no longer quite sure the
> kernel is the right place to attempt this sort of balancing. It kind
> of depends on the workload which type of memory is more imporant. And
> your report shows that vmpressure is a flawed mechanism to implement
> this, anyway.
>
> So I'm thinking we should delete the vmpressure thing, and go back to
> socket throttling only if an OOM is imminent. This is in line with
> what we do at the system level: sockets get throttled only after
> reclaim fails and we hit hard limits. It's then up to the users and
> sysadmin to allocate a reasonable amount of buffers given the overall
> memory budget.
>
> Cgroup accounting, limiting and OOM enforcement is still there for the
> socket buffers, so misbehaving groups will be contained either way.
>
> What do you think? Something like the below patch?
The idea sounds very reasonable to me. I can't really speak for the
patch contents with any sort of authority, but it looks ok to my
non-expert eyes.
There were some conflicts when cherry-picking this into v5.15. I think
the only real one was for the "!sc->proactive" condition not being
present there. For the rest I just accepted the incoming change.
I'm going to be away from my work computer until December 5th, but
I'll try to expedite my backported patch to a production machine today
to confirm that it makes the difference. If I can get some approvals
on my internal PRs, I should be able to provide the results by EOD
tomorrow.
>
> ---
>
> From 67757f78d8b4412b72fe1583ebaf13cfd0fc03b0 Mon Sep 17 00:00:00 2001
> From: Johannes Weiner <hannes@xxxxxxxxxxx>
> Date: Tue, 22 Nov 2022 14:40:50 -0500
> Subject: [PATCH] Revert "mm: memcontrol: hook up vmpressure to socket
> pressure"
>
> This reverts commit 8e8ae645249b85c8ed6c178557f8db8613a6bcc7.
>
> NOT-Signed-off-by: Johannes Weiner <hannes@xxxxxxxxxxx>
> ---
> include/linux/memcontrol.h | 6 ++--
> include/linux/vmpressure.h | 7 ++---
> mm/memcontrol.c | 19 +++++++++----
> mm/vmpressure.c | 58 ++++++--------------------------------
> mm/vmscan.c | 15 +---------
> 5 files changed, 29 insertions(+), 76 deletions(-)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index e1644a24009c..e7369363d4c2 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -283,11 +283,11 @@ struct mem_cgroup {
> atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS];
> atomic_long_t memory_events_local[MEMCG_NR_MEMORY_EVENTS];
>
> + /* Socket memory allocations have failed */
> unsigned long socket_pressure;
>
> /* Legacy tcp memory accounting */
> bool tcpmem_active;
> - int tcpmem_pressure;
>
> #ifdef CONFIG_MEMCG_KMEM
> int kmemcg_id;
> @@ -1701,10 +1701,10 @@ void mem_cgroup_sk_alloc(struct sock *sk);
> void mem_cgroup_sk_free(struct sock *sk);
> static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
> {
> - if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_pressure)
> + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->socket_pressure)
> return true;
> do {
> - if (time_before(jiffies, READ_ONCE(memcg->socket_pressure)))
> + if (memcg->socket_pressure)
> return true;
> } while ((memcg = parent_mem_cgroup(memcg)));
> return false;
> diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h
> index 6a2f51ebbfd3..20d93de37a17 100644
> --- a/include/linux/vmpressure.h
> +++ b/include/linux/vmpressure.h
> @@ -11,9 +11,6 @@
> #include <linux/eventfd.h>
>
> struct vmpressure {
> - unsigned long scanned;
> - unsigned long reclaimed;
> -
> unsigned long tree_scanned;
> unsigned long tree_reclaimed;
> /* The lock is used to keep the scanned/reclaimed above in sync. */
> @@ -30,7 +27,7 @@ struct vmpressure {
> struct mem_cgroup;
>
> #ifdef CONFIG_MEMCG
> -extern void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
> +extern void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
> unsigned long scanned, unsigned long reclaimed);
> extern void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio);
>
> @@ -44,7 +41,7 @@ extern int vmpressure_register_event(struct mem_cgroup *memcg,
> extern void vmpressure_unregister_event(struct mem_cgroup *memcg,
> struct eventfd_ctx *eventfd);
> #else
> -static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
> +static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
> unsigned long scanned, unsigned long reclaimed) {}
> static inline void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg,
> int prio) {}
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 2d8549ae1b30..066166aebbef 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -7195,10 +7195,10 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages,
> struct page_counter *fail;
>
> if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) {
> - memcg->tcpmem_pressure = 0;
> + memcg->socket_pressure = 0;
> return true;
> }
> - memcg->tcpmem_pressure = 1;
> + memcg->socket_pressure = 1;
> if (gfp_mask & __GFP_NOFAIL) {
> page_counter_charge(&memcg->tcpmem, nr_pages);
> return true;
> @@ -7206,12 +7206,21 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages,
> return false;
> }
>
> - if (try_charge(memcg, gfp_mask, nr_pages) == 0) {
> - mod_memcg_state(memcg, MEMCG_SOCK, nr_pages);
> - return true;
> + if (try_charge(memcg, gfp_mask & ~__GFP_NOFAIL, nr_pages) == 0) {
> + memcg->socket_pressure = 0;
> + goto success;
> + }
> + memcg->socket_pressure = 1;
> + if (gfp_mask & __GFP_NOFAIL) {
> + try_charge(memcg, gfp_mask, nr_pages);
> + goto success;
> }
>
> return false;
> +
> +success:
> + mod_memcg_state(memcg, MEMCG_SOCK, nr_pages);
> + return true;
> }
>
> /**
> diff --git a/mm/vmpressure.c b/mm/vmpressure.c
> index b52644771cc4..4cec90711cf4 100644
> --- a/mm/vmpressure.c
> +++ b/mm/vmpressure.c
> @@ -219,7 +219,6 @@ static void vmpressure_work_fn(struct work_struct *work)
> * vmpressure() - Account memory pressure through scanned/reclaimed ratio
> * @gfp: reclaimer's gfp mask
> * @memcg: cgroup memory controller handle
> - * @tree: legacy subtree mode
> * @scanned: number of pages scanned
> * @reclaimed: number of pages reclaimed
> *
> @@ -227,16 +226,9 @@ static void vmpressure_work_fn(struct work_struct *work)
> * "instantaneous" memory pressure (scanned/reclaimed ratio). The raw
> * pressure index is then further refined and averaged over time.
> *
> - * If @tree is set, vmpressure is in traditional userspace reporting
> - * mode: @memcg is considered the pressure root and userspace is
> - * notified of the entire subtree's reclaim efficiency.
> - *
> - * If @tree is not set, reclaim efficiency is recorded for @memcg, and
> - * only in-kernel users are notified.
> - *
> * This function does not return any value.
> */
> -void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
> +void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
> unsigned long scanned, unsigned long reclaimed)
> {
> struct vmpressure *vmpr;
> @@ -271,46 +263,14 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
> if (!scanned)
> return;
>
> - if (tree) {
> - spin_lock(&vmpr->sr_lock);
> - scanned = vmpr->tree_scanned += scanned;
> - vmpr->tree_reclaimed += reclaimed;
> - spin_unlock(&vmpr->sr_lock);
> -
> - if (scanned < vmpressure_win)
> - return;
> - schedule_work(&vmpr->work);
> - } else {
> - enum vmpressure_levels level;
> -
> - /* For now, no users for root-level efficiency */
> - if (!memcg || mem_cgroup_is_root(memcg))
> - return;
> -
> - spin_lock(&vmpr->sr_lock);
> - scanned = vmpr->scanned += scanned;
> - reclaimed = vmpr->reclaimed += reclaimed;
> - if (scanned < vmpressure_win) {
> - spin_unlock(&vmpr->sr_lock);
> - return;
> - }
> - vmpr->scanned = vmpr->reclaimed = 0;
> - spin_unlock(&vmpr->sr_lock);
> + spin_lock(&vmpr->sr_lock);
> + scanned = vmpr->tree_scanned += scanned;
> + vmpr->tree_reclaimed += reclaimed;
> + spin_unlock(&vmpr->sr_lock);
>
> - level = vmpressure_calc_level(scanned, reclaimed);
> -
> - if (level > VMPRESSURE_LOW) {
> - /*
> - * Let the socket buffer allocator know that
> - * we are having trouble reclaiming LRU pages.
> - *
> - * For hysteresis keep the pressure state
> - * asserted for a second in which subsequent
> - * pressure events can occur.
> - */
> - WRITE_ONCE(memcg->socket_pressure, jiffies + HZ);
> - }
> - }
> + if (scanned < vmpressure_win)
> + return;
> + schedule_work(&vmpr->work);
> }
>
> /**
> @@ -340,7 +300,7 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
> * to the vmpressure() basically means that we signal 'critical'
> * level.
> */
> - vmpressure(gfp, memcg, true, vmpressure_win, 0);
> + vmpressure(gfp, memcg, vmpressure_win, 0);
> }
>
> #define MAX_VMPRESSURE_ARGS_LEN (strlen("critical") + strlen("hierarchy") + 2)
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 04d8b88e5216..d348366d58d4 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -6035,8 +6035,6 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
> memcg = mem_cgroup_iter(target_memcg, NULL, NULL);
> do {
> struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
> - unsigned long reclaimed;
> - unsigned long scanned;
>
> /*
> * This loop can become CPU-bound when target memcgs
> @@ -6068,20 +6066,9 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
> memcg_memory_event(memcg, MEMCG_LOW);
> }
>
> - reclaimed = sc->nr_reclaimed;
> - scanned = sc->nr_scanned;
> -
> shrink_lruvec(lruvec, sc);
> -
> shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
> sc->priority);
> -
> - /* Record the group's reclaim efficiency */
> - if (!sc->proactive)
> - vmpressure(sc->gfp_mask, memcg, false,
> - sc->nr_scanned - scanned,
> - sc->nr_reclaimed - reclaimed);
> -
> } while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL)));
> }
>
> @@ -6111,7 +6098,7 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
>
> /* Record the subtree's reclaim efficiency */
> if (!sc->proactive)
> - vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
> + vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
> sc->nr_scanned - nr_scanned,
> sc->nr_reclaimed - nr_reclaimed);
>
> --
> 2.38.1
>