Do we really need this if the deferred list is going to be shrunk more pro-actively, as already discussed? Sorry, I do not have a link handy, but in short the idea was that the deferred list would be drained from a kworker context pro-actively rather than waiting for memory pressure to happen.
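To make that idea concrete, here is a minimal sketch of what such kworker-based draining could look like, written as if it lived in mm/huge_memory.c so the existing helpers (page_deferred_list(), split_huge_page()) are in scope. The work item, its scheduling point and the batch size are made up for illustration and are not from any posted patch; the list/lock handling just mirrors deferred_split_scan():

/*
 * Sketch only: drain the per-node deferred split queue from a kworker
 * instead of waiting for the shrinker to run under memory pressure.
 * Assumes pglist_data grew a hypothetical deferred_split_work member.
 */
#define DEFERRED_SPLIT_DRAIN_BATCH	16	/* arbitrary for this sketch */

static void deferred_split_drain_workfn(struct work_struct *work)
{
	struct pglist_data *pgdata =
		container_of(work, struct pglist_data, deferred_split_work);
	struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
	unsigned long flags;
	LIST_HEAD(list), *pos, *next;
	struct page *page;
	int batch = DEFERRED_SPLIT_DRAIN_BATCH;

	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
	/* Pin up to a batch of head pages so they cannot be freed under us. */
	list_for_each_safe(pos, next, &ds_queue->split_queue) {
		page = compound_head(list_entry((void *)pos, struct page, mapping));
		if (get_page_unless_zero(page)) {
			list_move(page_deferred_list(page), &list);
		} else {
			/* Lost the race with the last put of the compound page. */
			list_del_init(page_deferred_list(page));
			ds_queue->split_queue_len--;
		}
		if (!--batch)
			break;
	}
	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);

	/* Try to split what we pinned; pages that fail just stay on the list. */
	list_for_each_safe(pos, next, &list) {
		page = compound_head(list_entry((void *)pos, struct page, mapping));
		if (trylock_page(page)) {
			split_huge_page(page);
			unlock_page(page);
		}
		put_page(page);
	}

	/* Whatever did not split goes back on the queue for a later pass. */
	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
	list_splice_tail(&list, &ds_queue->split_queue);
	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
}

/*
 * Hypothetical scheduling point: kick the worker once the queue grows,
 * e.g. from deferred_split_huge_page() after the list_add_tail().
 */
static void deferred_split_kick(struct pglist_data *pgdata)
{
	if (READ_ONCE(pgdata->deferred_split_queue.split_queue_len) >
	    DEFERRED_SPLIT_DRAIN_BATCH)
		schedule_work(&pgdata->deferred_split_work);
}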
On Mon 23-09-19 15:09:02, Andrew Morton wrote:
> ------------------------------------------------------
> From: Yang Shi <[email protected]>
> Subject: mm: thp: extract split_queue_* into a struct
>
> Patch series "Make deferred split shrinker memcg aware", v6.
>
> Currently THP deferred split shrinker is not memcg aware, this may cause
> premature OOM with some configuration. For example the below test would
> run into premature OOM easily:
>
> $ cgcreate -g memory:thp
> $ echo 4G > /sys/fs/cgroup/memory/thp/memory/limit_in_bytes
> $ cgexec -g memory:thp transhuge-stress 4000
>
> transhuge-stress comes from kernel selftest.
>
> It is easy to hit OOM, but there are still a lot THP on the deferred split
> queue, memcg direct reclaim can't touch them since the deferred split
> shrinker is not memcg aware.
>
> Convert deferred split shrinker memcg aware by introducing per memcg
> deferred split queue. The THP should be on either per node or per memcg
> deferred split queue if it belongs to a memcg. When the page is
> immigrated to the other memcg, it will be immigrated to the target memcg's
> deferred split queue too.
>
> Reuse the second tail page's deferred_list for per memcg list since the
> same THP can't be on multiple deferred split queues.
>
> Make deferred split shrinker not depend on memcg kmem since it is not
> slab. It doesn't make sense to not shrink THP even though memcg kmem is
> disabled.
>
> With the above change the test demonstrated above doesn't trigger OOM even
> though with cgroup.memory=nokmem.
>
>
> This patch (of 4):
>
> Put split_queue, split_queue_lock and split_queue_len into a struct in
> order to reduce code duplication when we convert deferred_split to memcg
> aware in the later patches.
>
> Link: http://lkml.kernel.org/r/[email protected]
> Signed-off-by: Yang Shi <[email protected]>
> Suggested-by: "Kirill A . Shutemov" <[email protected]>
> Acked-by: Kirill A. Shutemov <[email protected]>
> Reviewed-by: Kirill Tkhai <[email protected]>
> Cc: Johannes Weiner <[email protected]>
> Cc: Michal Hocko <[email protected]>
> Cc: Hugh Dickins <[email protected]>
> Cc: Shakeel Butt <[email protected]>
> Cc: David Rientjes <[email protected]>
> Cc: Qian Cai <[email protected]>
> Cc: Vladimir Davydov <[email protected]>
> Signed-off-by: Andrew Morton <[email protected]>
> ---
>
> include/linux/mmzone.h | 12 +++++++---
> mm/huge_memory.c       | 45 +++++++++++++++++++++------------------
> mm/page_alloc.c        |  8 ++++--
> 3 files changed, 39 insertions(+), 26 deletions(-)
>
> --- a/include/linux/mmzone.h~mm-thp-extract-split_queue_-into-a-struct
> +++ a/include/linux/mmzone.h
> @@ -679,6 +679,14 @@ struct zonelist {
> extern struct page *mem_map;
> #endif
>
> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> +struct deferred_split {
> +	spinlock_t split_queue_lock;
> +	struct list_head split_queue;
> +	unsigned long split_queue_len;
> +};
> +#endif
> +
> /*
>  * On NUMA machines, each NUMA node would have a pg_data_t to describe
>  * it's memory layout. On UMA machines there is a single pglist_data which
> @@ -758,9 +766,7 @@ typedef struct pglist_data {
> #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
>
> #ifdef CONFIG_TRANSPARENT_HUGEPAGE
> -	spinlock_t split_queue_lock;
> -	struct list_head split_queue;
> -	unsigned long split_queue_len;
> +	struct deferred_split deferred_split_queue;
> #endif
>
> 	/* Fields commonly accessed by the page reclaim scanner */
> --- a/mm/huge_memory.c~mm-thp-extract-split_queue_-into-a-struct
> +++ a/mm/huge_memory.c
> @@ -2691,6 +2691,7 @@ int split_huge_page_to_list(struct page
> {
> 	struct page *head = compound_head(page);
> 	struct pglist_data *pgdata = NODE_DATA(page_to_nid(head));
> +	struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
> 	struct anon_vma *anon_vma = NULL;
> 	struct address_space *mapping = NULL;
> 	int count, mapcount, extra_pins, ret;
> @@ -2777,17 +2778,17 @@ int split_huge_page_to_list(struct page
> 	}
>
> 	/* Prevent deferred_split_scan() touching ->_refcount */
> -	spin_lock(&pgdata->split_queue_lock);
> +	spin_lock(&ds_queue->split_queue_lock);
> 	count = page_count(head);
> 	mapcount = total_mapcount(head);
> 	if (!mapcount && page_ref_freeze(head, 1 + extra_pins)) {
> 		if (!list_empty(page_deferred_list(head))) {
> -			pgdata->split_queue_len--;
> +			ds_queue->split_queue_len--;
> 			list_del(page_deferred_list(head));
> 		}
> 		if (mapping)
> 			__dec_node_page_state(page, NR_SHMEM_THPS);
> -		spin_unlock(&pgdata->split_queue_lock);
> +		spin_unlock(&ds_queue->split_queue_lock);
> 		__split_huge_page(page, list, end, flags);
> 		if (PageSwapCache(head)) {
> 			swp_entry_t entry = { .val = page_private(head) };
> @@ -2804,7 +2805,7 @@ int split_huge_page_to_list(struct page
> 			dump_page(page, "total_mapcount(head) > 0");
> 			BUG();
> 		}
> -		spin_unlock(&pgdata->split_queue_lock);
> +		spin_unlock(&ds_queue->split_queue_lock);
> fail:		if (mapping)
> 			xa_unlock(&mapping->i_pages);
> 		spin_unlock_irqrestore(&pgdata->lru_lock, flags);
> @@ -2827,52 +2828,56 @@ out:
> void free_transhuge_page(struct page *page)
> {
> 	struct pglist_data *pgdata = NODE_DATA(page_to_nid(page));
> +	struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
> 	unsigned long flags;
>
> -	spin_lock_irqsave(&pgdata->split_queue_lock, flags);
> +	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
> 	if (!list_empty(page_deferred_list(page))) {
> -		pgdata->split_queue_len--;
> +		ds_queue->split_queue_len--;
> 		list_del(page_deferred_list(page));
> 	}
> -	spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
> +	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
> 	free_compound_page(page);
> }
>
> void deferred_split_huge_page(struct page *page)
> {
> 	struct pglist_data *pgdata = NODE_DATA(page_to_nid(page));
> +	struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
> 	unsigned long flags;
>
> 	VM_BUG_ON_PAGE(!PageTransHuge(page), page);
>
> -	spin_lock_irqsave(&pgdata->split_queue_lock, flags);
> +	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
> 	if (list_empty(page_deferred_list(page))) {
> 		count_vm_event(THP_DEFERRED_SPLIT_PAGE);
> -		list_add_tail(page_deferred_list(page), &pgdata->split_queue);
> -		pgdata->split_queue_len++;
> +		list_add_tail(page_deferred_list(page), &ds_queue->split_queue);
> +		ds_queue->split_queue_len++;
> 	}
> -	spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
> +	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
> }
>
> static unsigned long deferred_split_count(struct shrinker *shrink,
> 		struct shrink_control *sc)
> {
> 	struct pglist_data *pgdata = NODE_DATA(sc->nid);
> -	return READ_ONCE(pgdata->split_queue_len);
> +	struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
> +	return READ_ONCE(ds_queue->split_queue_len);
> }
>
> static unsigned long deferred_split_scan(struct shrinker *shrink,
> 		struct shrink_control *sc)
> {
> 	struct pglist_data *pgdata = NODE_DATA(sc->nid);
> +	struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
> 	unsigned long flags;
> 	LIST_HEAD(list), *pos, *next;
> 	struct page *page;
> 	int split = 0;
>
> -	spin_lock_irqsave(&pgdata->split_queue_lock, flags);
> +	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
> 	/* Take pin on all head pages to avoid freeing them under us */
> -	list_for_each_safe(pos, next, &pgdata->split_queue) {
> +	list_for_each_safe(pos, next, &ds_queue->split_queue) {
> 		page = list_entry((void *)pos, struct page, mapping);
> 		page = compound_head(page);
> 		if (get_page_unless_zero(page)) {
> @@ -2880,12 +2885,12 @@ static unsigned long deferred_split_scan
> 		} else {
> 			/* We lost race with put_compound_page() */
> 			list_del_init(page_deferred_list(page));
> -			pgdata->split_queue_len--;
> +			ds_queue->split_queue_len--;
> 		}
> 		if (!--sc->nr_to_scan)
> 			break;
> 	}
> -	spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
> +	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
>
> 	list_for_each_safe(pos, next, &list) {
> 		page = list_entry((void *)pos, struct page, mapping);
> @@ -2899,15 +2904,15 @@ next:
> 		put_page(page);
> 	}
>
> -	spin_lock_irqsave(&pgdata->split_queue_lock, flags);
> -	list_splice_tail(&list, &pgdata->split_queue);
> -	spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
> +	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
> +	list_splice_tail(&list, &ds_queue->split_queue);
> +	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
>
> 	/*
> 	 * Stop shrinker if we didn't split any page, but the queue is empty.
> 	 * This can happen if pages were freed under us.
> 	 */
> -	if (!split && list_empty(&pgdata->split_queue))
> +	if (!split && list_empty(&ds_queue->split_queue))
> 		return SHRINK_STOP;
> 	return split;
> }
> --- a/mm/page_alloc.c~mm-thp-extract-split_queue_-into-a-struct
> +++ a/mm/page_alloc.c
> @@ -6646,9 +6646,11 @@ static unsigned long __init calc_memmap_
> #ifdef CONFIG_TRANSPARENT_HUGEPAGE
> static void pgdat_init_split_queue(struct pglist_data *pgdat)
> {
> -	spin_lock_init(&pgdat->split_queue_lock);
> -	INIT_LIST_HEAD(&pgdat->split_queue);
> -	pgdat->split_queue_len = 0;
> +	struct deferred_split *ds_queue = &pgdat->deferred_split_queue;
> +
> +	spin_lock_init(&ds_queue->split_queue_lock);
> +	INIT_LIST_HEAD(&ds_queue->split_queue);
> +	ds_queue->split_queue_len = 0;
> }
> #else
> static void pgdat_init_split_queue(struct pglist_data *pgdat) {}
> _
>
> Patches currently in -mm which might be from [email protected] are
>
> mmthp-add-read-only-thp-support-for-non-shmem-fs.patch
> mm-thp-extract-split_queue_-into-a-struct.patch
> mm-move-mem_cgroup_uncharge-out-of-__page_cache_release.patch
> mm-shrinker-make-shrinker-not-depend-on-memcg-kmem.patch
> mm-shrinker-make-shrinker-not-depend-on-memcg-kmem-v6.patch
> mm-thp-make-deferred-split-shrinker-memcg-aware.patch
> mm-thp-make-deferred-split-shrinker-memcg-aware-v6.patch

-- 
Michal Hocko
SUSE Labs
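For completeness, the direction the rest of the series takes on top of this cleanup: with the three fields wrapped in struct deferred_split, the memcg-aware patches only need to decide which instance a given page belongs to. A rough sketch, assuming struct mem_cgroup gains its own deferred_split_queue as the cover letter describes (the helper name and its placement here are illustrative, not a quote from the later patches):

#ifdef CONFIG_MEMCG
/*
 * Sketch: pick the deferred split queue for a THP.  Assumes the later
 * patches add a struct deferred_split to struct mem_cgroup; charged pages
 * use their memcg's queue, everything else falls back to the node queue.
 */
static struct deferred_split *get_deferred_split_queue(struct page *page)
{
	struct mem_cgroup *memcg = compound_head(page)->mem_cgroup;
	struct pglist_data *pgdata = NODE_DATA(page_to_nid(page));

	if (memcg)
		return &memcg->deferred_split_queue;
	return &pgdata->deferred_split_queue;
}
#else
static struct deferred_split *get_deferred_split_queue(struct page *page)
{
	return &NODE_DATA(page_to_nid(page))->deferred_split_queue;
}
#endif

Call sites such as deferred_split_huge_page() and free_transhuge_page() then operate on whatever queue such a helper returns, which is exactly why folding the three fields into one struct first keeps the later conversion small.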

