Re: [PATCH] mm: thp: move deferred split queue to memcg's nodeinfo

Michal Hocko Wed, 02 Oct 2019 01:43:30 -0700

On Wed 02-10-19 06:16:43, Yang Shi wrote:
> The commit 87eaceb3faa59b9b4d940ec9554ce251325d83fe ("mm: thp: make
> deferred split shrinker memcg aware") makes deferred split queue per
> memcg to resolve memcg pre-mature OOM problem.  But, all nodes end up
> sharing the same queue instead of one queue per-node before the commit.
> It is not a big deal for memcg limit reclaim, but it may cause global
> kswapd shrink THPs from a different node.
> 
> And, 0-day testing reported -19.6% regression of stress-ng's madvise
> test [1].  I didn't see that much regression on my test box (24 threads,
> 48GB memory, 2 nodes), with the same test (stress-ng --timeout 1
> --metrics-brief --sequential 72  --class vm --exclude spawn,exec), I saw
> average -3% (run the same test 10 times then calculate the average since
> the test itself may have most 15% variation according to my test)
> regression sometimes (not every time, sometimes I didn't see regression
> at all).
> 
> This might be caused by deferred split queue lock contention.  With some
> configuration (i.e. just one root memcg) the lock contention my be worse
> than before (given 2 nodes, two locks are reduced to one lock).
> 
> So, moving deferred split queue to memcg's nodeinfo to make it NUMA
> aware again.
> 
> With this change stress-ng's madvise test shows average 4% improvement
> sometimes and I didn't see degradation anymore.


My concern about this getting more and more complex
(http://lkml.kernel.org/r/[email protected]) holds
here even more. Can we step back and reconsider the whole thing please?

> [1]: https://lore.kernel.org/lkml/20190930084604.GC17687@shao2-debian/
> 
> Cc: Kirill A. Shutemov <[email protected]>
> Cc: Kirill Tkhai <[email protected]>
> Cc: Johannes Weiner <[email protected]>
> Cc: Michal Hocko <[email protected]>
> Cc: Hugh Dickins <[email protected]>
> Cc: Shakeel Butt <[email protected]>
> Cc: David Rientjes <[email protected]>
> Signed-off-by: Yang Shi <[email protected]>
> ---
>  include/linux/memcontrol.h |  8 ++++----
>  mm/huge_memory.c           | 15 +++++++++------
>  mm/memcontrol.c            | 29 +++++++++++++++++------------
>  3 files changed, 30 insertions(+), 22 deletions(-)
> 
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 9b60863..4b5c791 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -137,6 +137,10 @@ struct mem_cgroup_per_node {
>       bool                    congested;      /* memcg has many dirty pages */
>                                               /* backed by a congested BDI */
>  
> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> +     struct deferred_split deferred_split_queue;
> +#endif
> +
>       struct mem_cgroup       *memcg;         /* Back pointer, we cannot */
>                                               /* use container_of        */
>  };
> @@ -330,10 +334,6 @@ struct mem_cgroup {
>       struct list_head event_list;
>       spinlock_t event_list_lock;
>  
> -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> -     struct deferred_split deferred_split_queue;
> -#endif
> -
>       struct mem_cgroup_per_node *nodeinfo[0];
>       /* WARNING: nodeinfo must be the last member here */
>  };
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index c5cb6dc..3b78910 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -500,10 +500,11 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct 
> vm_area_struct *vma)
>  static inline struct deferred_split *get_deferred_split_queue(struct page 
> *page)
>  {
>       struct mem_cgroup *memcg = compound_head(page)->mem_cgroup;
> -     struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
> +     int nid = page_to_nid(page);
> +     struct pglist_data *pgdat = NODE_DATA(nid);
>  
>       if (memcg)
> -             return &memcg->deferred_split_queue;
> +             return &memcg->nodeinfo[nid]->deferred_split_queue;
>       else
>               return &pgdat->deferred_split_queue;
>  }
> @@ -2882,12 +2883,13 @@ void deferred_split_huge_page(struct page *page)
>  static unsigned long deferred_split_count(struct shrinker *shrink,
>               struct shrink_control *sc)
>  {
> -     struct pglist_data *pgdata = NODE_DATA(sc->nid);
> +     int nid = sc->nid;
> +     struct pglist_data *pgdata = NODE_DATA(nid);
>       struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
>  
>  #ifdef CONFIG_MEMCG
>       if (sc->memcg)
> -             ds_queue = &sc->memcg->deferred_split_queue;
> +             ds_queue = &sc->memcg->nodeinfo[nid]->deferred_split_queue;
>  #endif
>       return READ_ONCE(ds_queue->split_queue_len);
>  }
> @@ -2895,7 +2897,8 @@ static unsigned long deferred_split_count(struct 
> shrinker *shrink,
>  static unsigned long deferred_split_scan(struct shrinker *shrink,
>               struct shrink_control *sc)
>  {
> -     struct pglist_data *pgdata = NODE_DATA(sc->nid);
> +     int nid = sc->nid;
> +     struct pglist_data *pgdata = NODE_DATA(nid);
>       struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
>       unsigned long flags;
>       LIST_HEAD(list), *pos, *next;
> @@ -2904,7 +2907,7 @@ static unsigned long deferred_split_scan(struct 
> shrinker *shrink,
>  
>  #ifdef CONFIG_MEMCG
>       if (sc->memcg)
> -             ds_queue = &sc->memcg->deferred_split_queue;
> +             ds_queue = &sc->memcg->nodeinfo[nid]->deferred_split_queue;
>  #endif
>  
>       spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index c313c49..19d4295 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -4989,6 +4989,12 @@ static int alloc_mem_cgroup_per_node_info(struct 
> mem_cgroup *memcg, int node)
>       pn->on_tree = false;
>       pn->memcg = memcg;
>  
> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> +     spin_lock_init(&pn->deferred_split_queue.split_queue_lock);
> +     INIT_LIST_HEAD(&pn->deferred_split_queue.split_queue);
> +     pn->deferred_split_queue.split_queue_len = 0;
> +#endif
> +
>       memcg->nodeinfo[node] = pn;
>       return 0;
>  }
> @@ -5081,11 +5087,6 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
>               memcg->cgwb_frn[i].done =
>                       __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
>  #endif
> -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> -     spin_lock_init(&memcg->deferred_split_queue.split_queue_lock);
> -     INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
> -     memcg->deferred_split_queue.split_queue_len = 0;
> -#endif
>       idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
>       return memcg;
>  fail:
> @@ -5419,6 +5420,8 @@ static int mem_cgroup_move_account(struct page *page,
>       unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
>       int ret;
>       bool anon;
> +     struct deferred_split *ds_queue;
> +     int nid = page_to_nid(page);
>  
>       VM_BUG_ON(from == to);
>       VM_BUG_ON_PAGE(PageLRU(page), page);
> @@ -5466,10 +5469,11 @@ static int mem_cgroup_move_account(struct page *page,
>  
>  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
>       if (compound && !list_empty(page_deferred_list(page))) {
> -             spin_lock(&from->deferred_split_queue.split_queue_lock);
> +             ds_queue = &from->nodeinfo[nid]->deferred_split_queue;
> +             spin_lock(&ds_queue->split_queue_lock);
>               list_del_init(page_deferred_list(page));
> -             from->deferred_split_queue.split_queue_len--;
> -             spin_unlock(&from->deferred_split_queue.split_queue_lock);
> +             ds_queue->split_queue_len--;
> +             spin_unlock(&ds_queue->split_queue_lock);
>       }
>  #endif
>       /*
> @@ -5483,11 +5487,12 @@ static int mem_cgroup_move_account(struct page *page,
>  
>  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
>       if (compound && list_empty(page_deferred_list(page))) {
> -             spin_lock(&to->deferred_split_queue.split_queue_lock);
> +             ds_queue = &to->nodeinfo[nid]->deferred_split_queue;
> +             spin_lock(&ds_queue->split_queue_lock);
>               list_add_tail(page_deferred_list(page),
> -                           &to->deferred_split_queue.split_queue);
> -             to->deferred_split_queue.split_queue_len++;
> -             spin_unlock(&to->deferred_split_queue.split_queue_lock);
> +                           &ds_queue->split_queue);
> +             ds_queue->split_queue_len++;
> +             spin_unlock(&ds_queue->split_queue_lock);
>       }
>  #endif
>  
> -- 
> 1.8.3.1

-- 
Michal Hocko
SUSE Labs

Re: [PATCH] mm: thp: move deferred split queue to memcg's nodeinfo

Reply via email to