Do we really need this if the deferred list is going to be shrunk more
proactively, as already discussed? I am sorry I do not have a link handy,
but in short, the deferred list would be drained from a kworker context
proactively rather than waiting for memory pressure to happen.

On Mon 23-09-19 15:09:02, Andrew Morton wrote:
> ------------------------------------------------------
> From: Yang Shi <[email protected]>
> Subject: mm: thp: extract split_queue_* into a struct
> 
> Patch series "Make deferred split shrinker memcg aware", v6.
> 
> Currently THP deferred split shrinker is not memcg aware, this may cause
> premature OOM with some configuration.  For example the below test would
> run into premature OOM easily:
> 
> $ cgcreate -g memory:thp
> $ echo 4G > /sys/fs/cgroup/memory/thp/memory.limit_in_bytes
> $ cgexec -g memory:thp transhuge-stress 4000
> 
> transhuge-stress comes from kernel selftest.
> 
> It is easy to hit OOM while there are still a lot of THPs on the deferred
> split queue; memcg direct reclaim can't touch them since the deferred
> split shrinker is not memcg aware.
> 
> Make the deferred split shrinker memcg aware by introducing a per-memcg
> deferred split queue.  A THP should be on the per-memcg deferred split
> queue if it belongs to a memcg, and on the per-node queue otherwise.
> When the page is migrated to another memcg, it will be migrated to the
> target memcg's deferred split queue too.
> 
> Reuse the second tail page's deferred_list for per memcg list since the
> same THP can't be on multiple deferred split queues.
> 
> Make deferred split shrinker not depend on memcg kmem since it is not
> slab.  It doesn't make sense to not shrink THP even though memcg kmem is
> disabled.
> 
> With the above change the test demonstrated above doesn't trigger OOM even
> though with cgroup.memory=nokmem.
> 
> 
> This patch (of 4):
> 
> Put split_queue, split_queue_lock and split_queue_len into a struct in
> order to reduce code duplication when we convert deferred_split to memcg
> aware in the later patches.
> 
> Link: 
> http://lkml.kernel.org/r/[email protected]
> Signed-off-by: Yang Shi <[email protected]>
> Suggested-by: "Kirill A . Shutemov" <[email protected]>
> Acked-by: Kirill A. Shutemov <[email protected]>
> Reviewed-by: Kirill Tkhai <[email protected]>
> Cc: Johannes Weiner <[email protected]>
> Cc: Michal Hocko <[email protected]>
> Cc: Hugh Dickins <[email protected]>
> Cc: Shakeel Butt <[email protected]>
> Cc: David Rientjes <[email protected]>
> Cc: Qian Cai <[email protected]>
> Cc: Vladimir Davydov <[email protected]>
> Signed-off-by: Andrew Morton <[email protected]>
> ---
> 
>  include/linux/mmzone.h |   12 +++++++---
>  mm/huge_memory.c       |   45 +++++++++++++++++++++------------------
>  mm/page_alloc.c        |    8 ++++--
>  3 files changed, 39 insertions(+), 26 deletions(-)
> 
> --- a/include/linux/mmzone.h~mm-thp-extract-split_queue_-into-a-struct
> +++ a/include/linux/mmzone.h
> @@ -679,6 +679,14 @@ struct zonelist {
>  extern struct page *mem_map;
>  #endif
>  
> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> +struct deferred_split {
> +     spinlock_t split_queue_lock;
> +     struct list_head split_queue;
> +     unsigned long split_queue_len;
> +};
> +#endif
> +
>  /*
>   * On NUMA machines, each NUMA node would have a pg_data_t to describe
>   * it's memory layout. On UMA machines there is a single pglist_data which
> @@ -758,9 +766,7 @@ typedef struct pglist_data {
>  #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
>  
>  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
> -     spinlock_t split_queue_lock;
> -     struct list_head split_queue;
> -     unsigned long split_queue_len;
> +     struct deferred_split deferred_split_queue;
>  #endif
>  
>       /* Fields commonly accessed by the page reclaim scanner */
> --- a/mm/huge_memory.c~mm-thp-extract-split_queue_-into-a-struct
> +++ a/mm/huge_memory.c
> @@ -2691,6 +2691,7 @@ int split_huge_page_to_list(struct page
>  {
>       struct page *head = compound_head(page);
>       struct pglist_data *pgdata = NODE_DATA(page_to_nid(head));
> +     struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
>       struct anon_vma *anon_vma = NULL;
>       struct address_space *mapping = NULL;
>       int count, mapcount, extra_pins, ret;
> @@ -2777,17 +2778,17 @@ int split_huge_page_to_list(struct page
>       }
>  
>       /* Prevent deferred_split_scan() touching ->_refcount */
> -     spin_lock(&pgdata->split_queue_lock);
> +     spin_lock(&ds_queue->split_queue_lock);
>       count = page_count(head);
>       mapcount = total_mapcount(head);
>       if (!mapcount && page_ref_freeze(head, 1 + extra_pins)) {
>               if (!list_empty(page_deferred_list(head))) {
> -                     pgdata->split_queue_len--;
> +                     ds_queue->split_queue_len--;
>                       list_del(page_deferred_list(head));
>               }
>               if (mapping)
>                       __dec_node_page_state(page, NR_SHMEM_THPS);
> -             spin_unlock(&pgdata->split_queue_lock);
> +             spin_unlock(&ds_queue->split_queue_lock);
>               __split_huge_page(page, list, end, flags);
>               if (PageSwapCache(head)) {
>                       swp_entry_t entry = { .val = page_private(head) };
> @@ -2804,7 +2805,7 @@ int split_huge_page_to_list(struct page
>                       dump_page(page, "total_mapcount(head) > 0");
>                       BUG();
>               }
> -             spin_unlock(&pgdata->split_queue_lock);
> +             spin_unlock(&ds_queue->split_queue_lock);
>  fail:                if (mapping)
>                       xa_unlock(&mapping->i_pages);
>               spin_unlock_irqrestore(&pgdata->lru_lock, flags);
> @@ -2827,52 +2828,56 @@ out:
>  void free_transhuge_page(struct page *page)
>  {
>       struct pglist_data *pgdata = NODE_DATA(page_to_nid(page));
> +     struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
>       unsigned long flags;
>  
> -     spin_lock_irqsave(&pgdata->split_queue_lock, flags);
> +     spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
>       if (!list_empty(page_deferred_list(page))) {
> -             pgdata->split_queue_len--;
> +             ds_queue->split_queue_len--;
>               list_del(page_deferred_list(page));
>       }
> -     spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
> +     spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
>       free_compound_page(page);
>  }
>  
>  void deferred_split_huge_page(struct page *page)
>  {
>       struct pglist_data *pgdata = NODE_DATA(page_to_nid(page));
> +     struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
>       unsigned long flags;
>  
>       VM_BUG_ON_PAGE(!PageTransHuge(page), page);
>  
> -     spin_lock_irqsave(&pgdata->split_queue_lock, flags);
> +     spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
>       if (list_empty(page_deferred_list(page))) {
>               count_vm_event(THP_DEFERRED_SPLIT_PAGE);
> -             list_add_tail(page_deferred_list(page), &pgdata->split_queue);
> -             pgdata->split_queue_len++;
> +             list_add_tail(page_deferred_list(page), &ds_queue->split_queue);
> +             ds_queue->split_queue_len++;
>       }
> -     spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
> +     spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
>  }
>  
>  static unsigned long deferred_split_count(struct shrinker *shrink,
>               struct shrink_control *sc)
>  {
>       struct pglist_data *pgdata = NODE_DATA(sc->nid);
> -     return READ_ONCE(pgdata->split_queue_len);
> +     struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
> +     return READ_ONCE(ds_queue->split_queue_len);
>  }
>  
>  static unsigned long deferred_split_scan(struct shrinker *shrink,
>               struct shrink_control *sc)
>  {
>       struct pglist_data *pgdata = NODE_DATA(sc->nid);
> +     struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
>       unsigned long flags;
>       LIST_HEAD(list), *pos, *next;
>       struct page *page;
>       int split = 0;
>  
> -     spin_lock_irqsave(&pgdata->split_queue_lock, flags);
> +     spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
>       /* Take pin on all head pages to avoid freeing them under us */
> -     list_for_each_safe(pos, next, &pgdata->split_queue) {
> +     list_for_each_safe(pos, next, &ds_queue->split_queue) {
>               page = list_entry((void *)pos, struct page, mapping);
>               page = compound_head(page);
>               if (get_page_unless_zero(page)) {
> @@ -2880,12 +2885,12 @@ static unsigned long deferred_split_scan
>               } else {
>                       /* We lost race with put_compound_page() */
>                       list_del_init(page_deferred_list(page));
> -                     pgdata->split_queue_len--;
> +                     ds_queue->split_queue_len--;
>               }
>               if (!--sc->nr_to_scan)
>                       break;
>       }
> -     spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
> +     spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
>  
>       list_for_each_safe(pos, next, &list) {
>               page = list_entry((void *)pos, struct page, mapping);
> @@ -2899,15 +2904,15 @@ next:
>               put_page(page);
>       }
>  
> -     spin_lock_irqsave(&pgdata->split_queue_lock, flags);
> -     list_splice_tail(&list, &pgdata->split_queue);
> -     spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
> +     spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
> +     list_splice_tail(&list, &ds_queue->split_queue);
> +     spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
>  
>       /*
>        * Stop shrinker if we didn't split any page, but the queue is empty.
>        * This can happen if pages were freed under us.
>        */
> -     if (!split && list_empty(&pgdata->split_queue))
> +     if (!split && list_empty(&ds_queue->split_queue))
>               return SHRINK_STOP;
>       return split;
>  }
> --- a/mm/page_alloc.c~mm-thp-extract-split_queue_-into-a-struct
> +++ a/mm/page_alloc.c
> @@ -6646,9 +6646,11 @@ static unsigned long __init calc_memmap_
>  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
>  static void pgdat_init_split_queue(struct pglist_data *pgdat)
>  {
> -     spin_lock_init(&pgdat->split_queue_lock);
> -     INIT_LIST_HEAD(&pgdat->split_queue);
> -     pgdat->split_queue_len = 0;
> +     struct deferred_split *ds_queue = &pgdat->deferred_split_queue;
> +
> +     spin_lock_init(&ds_queue->split_queue_lock);
> +     INIT_LIST_HEAD(&ds_queue->split_queue);
> +     ds_queue->split_queue_len = 0;
>  }
>  #else
>  static void pgdat_init_split_queue(struct pglist_data *pgdat) {}
> _
> 
> Patches currently in -mm which might be from [email protected] are
> 
> mmthp-add-read-only-thp-support-for-non-shmem-fs.patch
> mm-thp-extract-split_queue_-into-a-struct.patch
> mm-move-mem_cgroup_uncharge-out-of-__page_cache_release.patch
> mm-shrinker-make-shrinker-not-depend-on-memcg-kmem.patch
> mm-shrinker-make-shrinker-not-depend-on-memcg-kmem-v6.patch
> mm-thp-make-deferred-split-shrinker-memcg-aware.patch
> mm-thp-make-deferred-split-shrinker-memcg-aware-v6.patch

-- 
Michal Hocko
SUSE Labs

Reply via email to