The commit is pushed to "branch-rh7-3.10.0-514.26.1.vz7.33.x-ovz" and will
appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-514.26.1.vz7.33.6
------>
commit 107a8bf713c484ac3ef055afc165acf1f3c29d29
Author: Andrey Ryabinin <[email protected]>
Date: Thu Jul 13 19:41:24 2017 +0400
ms/mm: vmscan: fix IO/refault regression in cache workingset transition
This is a simplified (w/o memcg-awareness) version of the upstream commit
below.
Simplified because we don't have cgroup-aware workingset yet, and
backporting it would be a lot of work.
commit 2a2e48854d704214dac7546e87ae0e4daa0e61a0
Author: Johannes Weiner <[email protected]>
Date: Wed May 3 14:55:03 2017 -0700
mm: vmscan: fix IO/refault regression in cache workingset transition
Since commit 59dc76b0d4df ("mm: vmscan: reduce size of inactive file
list") we noticed bigger IO spikes during changes in cache access
patterns.
The patch in question shrunk the inactive list size to leave more room
for the current workingset in the presence of streaming IO. However,
workingset transitions that previously happened on the inactive list are
now pushed out of memory and incur more refaults to complete.
This patch disables active list protection when refaults are being
observed. This accelerates workingset transitions, and allows more of
the new set to establish itself from memory, without eating into the
ability to protect the established workingset during stable periods.
The workloads that were measurably affected for us were hit pretty bad
by it, with refault/majfault rates doubling and tripling during cache
transitions, and the machines sustaining half-hour periods of 100% IO
utilization, where they'd previously have sub-minute peaks at 60-90%.
Stateful services that handle user data tend to be more conservative
with kernel upgrades. As a result we hit most page cache issues with
some delay, as was the case here.
The severity seemed to warrant a stable tag.
Fixes: 59dc76b0d4df ("mm: vmscan: reduce size of inactive file list")
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Johannes Weiner <[email protected]>
Cc: Rik van Riel <[email protected]>
Cc: Mel Gorman <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: Vladimir Davydov <[email protected]>
Cc: <[email protected]> [4.7+]
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
https://jira.sw.ru/browse/PSBM-68029
Signed-off-by: Andrey Ryabinin <[email protected]>
---
include/linux/mmzone.h | 3 +++
mm/vmscan.c | 52 ++++++++++++++++++++++++++++++++++++++++----------
2 files changed, 45 insertions(+), 10 deletions(-)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 386569f..e27fb5e 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -430,6 +430,9 @@ struct zone {
*/
unsigned int inactive_ratio;
+ /* Refaults at the time of last reclaim cycle */
+ unsigned long refaults;
+
#ifdef CONFIG_MEMCG
bool force_scan;
#endif
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 26e7620..b9e77c3 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1824,6 +1824,8 @@ static void shrink_active_list(unsigned long nr_to_scan,
* Both inactive lists should also be large enough that each inactive
* page has a chance to be referenced again before it is reclaimed.
*
+ * If that fails and refaulting is observed, the inactive list grows.
+ *
* The inactive_ratio is the target ratio of ACTIVE to INACTIVE pages
* on this LRU, maintained by the pageout code. A zone->inactive_ratio
* of 3 means 3:1 or 25% of the pages are kept on the inactive list.
@@ -1839,12 +1841,15 @@ static void shrink_active_list(unsigned long nr_to_scan,
* 1TB 101 10GB
* 10TB 320 32GB
*/
-static int inactive_list_is_low(struct lruvec *lruvec, bool file)
+static int inactive_list_is_low(struct lruvec *lruvec, bool file,
+ struct mem_cgroup *memcg, bool actual_reclaim)
{
+ struct zone *zone = lruvec_zone(lruvec);
unsigned long inactive_ratio;
unsigned long inactive;
unsigned long active;
unsigned long gb;
+ unsigned long refaults;
/*
* If we don't have swap space, anonymous page deactivation
@@ -1856,12 +1861,20 @@ static int inactive_list_is_low(struct lruvec *lruvec, bool file)
inactive = get_lru_size(lruvec, file * LRU_FILE);
active = get_lru_size(lruvec, file * LRU_FILE + LRU_ACTIVE);
- gb = (inactive + active) >> (30 - PAGE_SHIFT);
- if (gb)
- inactive_ratio = int_sqrt(10 * gb);
- else
- inactive_ratio = 1;
+ if (memcg)
+ refaults = zone->refaults; /* we don't support per-cgroup workingset */
+ else
+ refaults = zone_page_state(zone, WORKINGSET_ACTIVATE);
+ if (file && actual_reclaim && zone->refaults != refaults) {
+ inactive_ratio = 0;
+ } else {
+ gb = (inactive + active) >> (30 - PAGE_SHIFT);
+ if (gb)
+ inactive_ratio = int_sqrt(10 * gb);
+ else
+ inactive_ratio = 1;
+ }
return inactive * inactive_ratio < active;
}
@@ -1869,7 +1882,8 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
struct lruvec *lruvec, struct scan_control *sc)
{
if (is_active_lru(lru)) {
- if (inactive_list_is_low(lruvec, is_file_lru(lru)))
+ if (inactive_list_is_low(lruvec, is_file_lru(lru),
+ sc->target_mem_cgroup, true))
shrink_active_list(nr_to_scan, lruvec, sc, lru);
return 0;
}
@@ -2034,7 +2048,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
* There is enough inactive page cache, do not reclaim
* anything from the anonymous working set right now.
*/
- if (!inactive_list_is_low(lruvec, true) &&
+ if (!inactive_list_is_low(lruvec, true, sc->target_mem_cgroup, false) &&
get_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority > 0) {
scan_balance = SCAN_FILE;
goto out;
@@ -2258,7 +2272,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc,
* Even if we did not try to evict anon pages at all, we want to
* rebalance the anon lru active/inactive ratio.
*/
- if (inactive_list_is_low(lruvec, false))
+ if (inactive_list_is_low(lruvec, false, sc->target_mem_cgroup, true))
shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
sc, LRU_ACTIVE_ANON);
@@ -2619,6 +2633,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
unsigned long total_scanned = 0;
unsigned long writeback_threshold;
bool aborted_reclaim;
+ struct zone *zone;
+ struct zoneref *z;
retry:
{KSTAT_PERF_ENTER(ttfp);
@@ -2663,6 +2679,11 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
} while (--sc->priority >= 0 && !aborted_reclaim);
out:
+ if (!sc->target_mem_cgroup)
+ for_each_zone_zonelist_nodemask(zone, z, zonelist,
+ gfp_zone(sc->gfp_mask), sc->nodemask)
+ zone->refaults = zone_page_state(zone, WORKINGSET_ACTIVATE);
+
delayacct_freepages_end();
KSTAT_PERF_LEAVE(ttfp);}
@@ -2953,7 +2974,8 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc)
do {
struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
- if (inactive_list_is_low(lruvec, false))
+ if (inactive_list_is_low(lruvec, false,
+ sc->target_mem_cgroup, true))
shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
sc, LRU_ACTIVE_ANON);
@@ -3338,6 +3360,16 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
!pgdat_balanced(pgdat, order, *classzone_idx));
out:
+
+ for (i = pgdat->nr_zones - 1; i >= 0; i--) {
+ struct zone *zone = pgdat->node_zones + i;
+
+ if (!populated_zone(zone))
+ continue;
+
+ zone->refaults = zone_page_state(zone, WORKINGSET_ACTIVATE);
+ }
+
/*
* Return the order we were reclaiming at so prepare_kswapd_sleep()
* makes a decision on the order we were last reclaiming at. However,
_______________________________________________
Devel mailing list
[email protected]
https://lists.openvz.org/mailman/listinfo/devel