On Wed, Jul 27, 2016 at 04:13:33PM +0100, Mel Gorman wrote:
> On Wed, Jul 27, 2016 at 03:22:26PM +0100, Mel Gorman wrote:
> > ---8<---
> > From: Mel Gorman <[email protected]>
> > Subject: [PATCH] mm, vmscan: Wait on a waitqueue when too many pages are
> >  isolated
> > 
> 
> This is potentially a much better version as it avoids wakeup storms and
> does a better job of handling the case where pages could not be reclaimed.
> 
> ---8<---
> mm, vmscan: Wait on a waitqueue when too many pages are isolated
> 
> When too many pages are isolated, direct reclaim waits on congestion to
> clear for up to a tenth of a second. There is no reason to believe that too
> many pages are isolated due to dirty pages, reclaim efficiency or congestion.
> It may simply be because an extremely large number of processes have entered
> direct reclaim at the same time.
> 
> This patch has processes wait on a waitqueue when too many pages are
> isolated.  When parallel reclaimers finish shrink_page_list, they wake the
> waiters to recheck whether too many pages are isolated. While it is difficult
> > to trigger this corner case, it's possible by launching an extremely large
> > number of hackbench processes on a 32-bit system with limited memory. Without
> > the patch, a large number of processes wait uselessly; with the patch
> > applied, I was unable to stall the system.
> 
> Signed-off-by: Mel Gorman <[email protected]>
> ---
>  include/linux/mmzone.h |  1 +
>  mm/page_alloc.c        |  1 +
>  mm/vmscan.c            | 24 +++++++++++++++---------
>  3 files changed, 17 insertions(+), 9 deletions(-)
> 
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index d572b78b65e1..467878d7af33 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -653,6 +653,7 @@ typedef struct pglist_data {
>       int node_id;
>       wait_queue_head_t kswapd_wait;
>       wait_queue_head_t pfmemalloc_wait;
> +     wait_queue_head_t isolated_wait;
>       struct task_struct *kswapd;     /* Protected by
>                                          mem_hotplug_begin/end() */
>       int kswapd_order;
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index fbd329e61bf6..3800972f240e 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> > @@ -5859,6 +5859,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
>  #endif
>       init_waitqueue_head(&pgdat->kswapd_wait);
>       init_waitqueue_head(&pgdat->pfmemalloc_wait);
> +     init_waitqueue_head(&pgdat->isolated_wait);
>  #ifdef CONFIG_COMPACTION
>       init_waitqueue_head(&pgdat->kcompactd_wait);
>  #endif
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 9c0b2b0fc164..e264fcb7556b 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -1554,16 +1554,16 @@ int isolate_lru_page(struct page *page)
> >   * the LRU list will go small and be scanned faster than necessary, leading to
>   * unnecessary swapping, thrashing and OOM.
>   */
> -static int too_many_isolated(struct pglist_data *pgdat, int file,
> +static bool safe_to_isolate(struct pglist_data *pgdat, int file,
>               struct scan_control *sc)
>  {
>       unsigned long inactive, isolated;
>  
>       if (current_is_kswapd())
> -             return 0;
> +             return true;
>  
> -     if (!sane_reclaim(sc))
> -             return 0;
> > +     if (!sane_reclaim(sc))
> +             return true;
>  
>       if (file) {
>               inactive = node_page_state(pgdat, NR_INACTIVE_FILE);
> > @@ -1581,7 +1581,7 @@ static int too_many_isolated(struct pglist_data *pgdat, int file,
>       if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
>               inactive >>= 3;
>  
> -     return isolated > inactive;
> > +     return isolated <= inactive;
>  }
>  
>  static noinline_for_stack void
> > @@ -1701,12 +1701,15 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
>       if (!inactive_reclaimable_pages(lruvec, sc, lru))
>               return 0;
>  
> -     while (unlikely(too_many_isolated(pgdat, file, sc))) {
> -             congestion_wait(BLK_RW_ASYNC, HZ/10);
> +     if (!safe_to_isolate(pgdat, file, sc)) {
> +             wait_event_killable(pgdat->isolated_wait,
> +                     safe_to_isolate(pgdat, file, sc));
>  
>               /* We are about to die and free our memory. Return now. */
> -             if (fatal_signal_pending(current))
> -                     return SWAP_CLUSTER_MAX;
> +             if (fatal_signal_pending(current)) {
> +                     nr_reclaimed = SWAP_CLUSTER_MAX;
> +                     goto out;
> +             }
>       }
>  
>       lru_add_drain();
> > @@ -1819,6 +1822,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
>       trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
>                       nr_scanned, nr_reclaimed,
>                       sc->priority, file);
> +
> +out:
> +     wake_up(&pgdat->isolated_wait);

Isolation can happen from migrate/khugepaged as well as reclaim, so a waiter
here could sleep forever unless every place that isolates pages also wakes
the queue when it puts them back.
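
To make that concrete, a minimal sketch (hypothetical helper name, not part
of the posted patch) of what the other isolation sites would need:

/*
 * Hypothetical sketch: every path that returns isolated pages to the
 * LRU -- putback_movable_pages() in mm/migrate.c, compaction's putback
 * paths, khugepaged's collapse-failure path -- would call this so that
 * tasks sleeping on isolated_wait are woken even when no direct
 * reclaimer is running.
 */
static void wake_isolated_waiters(struct pglist_data *pgdat)
{
	/* Avoid taking the waitqueue lock in the common, uncontended case. */
	if (waitqueue_active(&pgdat->isolated_wait))
		wake_up(&pgdat->isolated_wait);
}

Otherwise a task that saw safe_to_isolate() fail while, say, compaction held
the pages isolated could block indefinitely.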
