The commit is pushed to "branch-rh7-3.10.0-514.26.1.vz7.33.x-ovz" and will
appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-514.26.1.vz7.33.6
------>
commit 107a8bf713c484ac3ef055afc165acf1f3c29d29
Author: Andrey Ryabinin <[email protected]>
Date: Thu Jul 13 19:41:24 2017 +0400
ms/mm: vmscan: fix IO/refault regression in cache workingset transition
This is a simplified (w/o memcg-awareness) version of the upstream commit
below.
Simplified because we don't have cgroup-aware workingset yet, and
backporting it would be a lot of work.
commit 2a2e48854d704214dac7546e87ae0e4daa0e61a0
Author: Johannes Weiner <[email protected]>
Date: Wed May 3 14:55:03 2017 -0700
mm: vmscan: fix IO/refault regression in cache workingset transition
Since commit 59dc76b0d4df ("mm: vmscan: reduce size of inactive file
list") we noticed bigger IO spikes during changes in cache access
patterns.
The patch in question shrunk the inactive list size to leave more room
for the current workingset in the presence of streaming IO. However,
workingset transitions that previously happened on the inactive list are
now pushed out of memory and incur more refaults to complete.
This patch disables active list protection when refaults are being
observed. This accelerates workingset transitions, and allows more of
the new set to establish itself from memory, without eating into the
ability to protect the established workingset during stable periods.
The workloads that were measurably affected for us were hit pretty bad
by it, with refault/majfault rates doubling and tripling during cache
transitions, and the machines sustaining half-hour periods of 100% IO
utilization, where they'd previously have sub-minute peaks at 60-90%.
Stateful services that handle user data tend to be more conservative
with kernel upgrades. As a result we hit most page cache issues with
some delay, as was the case here.
The severity seemed to warrant a stable tag.
Fixes: 59dc76b0d4df ("mm: vmscan: reduce size of inactive file list")
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Johannes Weiner <[email protected]>
Cc: Rik van Riel <[email protected]>
Cc: Mel Gorman <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: Vladimir Davydov <[email protected]>
Cc: <[email protected]> [4.7+]
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
https://jira.sw.ru/browse/PSBM-68029
Signed-off-by: Andrey Ryabinin <[email protected]>
---
include/linux/mmzone.h | 3 +++
mm/vmscan.c | 52 ++++++++++++++++++++++++++++++++++++++++----------
2 files changed, 45 insertions(+), 10 deletions(-)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 386569f..e27fb5e 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -430,6 +430,9 @@ struct zone {
*/
unsigned int inactive_ratio;
+ /* Refaults at the time of last reclaim cycle */
+ unsigned long refaults;
+
#ifdef CONFIG_MEMCG
bool force_scan;
#endif
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 26e7620..b9e77c3 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1824,6 +1824,8 @@ static void shrink_active_list(unsigned long nr_to_scan,
* Both inactive lists should also be large enough that each inactive
* page has a chance to be referenced again before it is reclaimed.
*
+ * If that fails and refaulting is observed, the inactive list grows.
+ *
* The inactive_ratio is the target ratio of ACTIVE to INACTIVE pages
* on this LRU, maintained by the pageout code. A zone->inactive_ratio
* of 3 means 3:1 or 25% of the pages are kept on the inactive list.
@@ -1839,12 +1841,15 @@ static void shrink_active_list(unsigned long nr_to_scan,
* 1TB 101 10GB
* 10TB 320 32GB
*/
-static int inactive_list_is_low(struct lruvec *lruvec, bool file)
+static int inactive_list_is_low(struct lruvec *lruvec, bool file,
+ struct mem_cgroup *memcg, bool actual_reclaim)
{
+ struct zone *zone = lruvec_zone(lruvec);
unsigned long inactive_ratio;
unsigned long inactive;
unsigned long active;
unsigned long gb;
+ unsigned long refaults;
/*
* If we don't have swap space, anonymous page deactivation
@@ -1856,12 +1861,20 @@ static int inactive_list_is_low(struct lruvec *lruvec, bool file)
inactive = get_lru_size(lruvec, file * LRU_FILE);
active = get_lru_size(lruvec, file * LRU_FILE + LRU_ACTIVE);
- gb = (inactive + active) >> (30 - PAGE_SHIFT);
- if (gb)
- inactive_ratio = int_sqrt(10 * gb);
- else
- inactive_ratio = 1;
+ if (memcg)
+ refaults = zone->refaults; /* we don't support per-cgroup workingset */
+ else
+ refaults = zone_page_state(zone, WORKINGSET_ACTIVATE);
+ if (file && actual_reclaim && zone->refaults != refaults) {
+ inactive_ratio = 0;
+ } else {
+ gb = (inactive + active) >> (30 - PAGE_SHIFT);
+ if (gb)
+ inactive_ratio = int_sqrt(10 * gb);
+ else
+ inactive_ratio = 1;
+ }
return inactive * inactive_ratio < active;
}
@@ -1869,7 +1882,8 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
struct lruvec *lruvec, struct scan_control *sc)
{
if (is_active_lru(lru)) {
- if (inactive_list_is_low(lruvec, is_file_lru(lru)))
+ if (inactive_list_is_low(lruvec, is_file_lru(lru),
+ sc->target_mem_cgroup, true))
shrink_active_list(nr_to_scan, lruvec, sc, lru);
return 0;
}
@@ -2034,7 +2048,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
* There is enough inactive page cache, do not reclaim
* anything from the anonymous working set right now.
*/
- if (!inactive_list_is_low(lruvec, true) &&
+ if (!inactive_list_is_low(lruvec, true, sc->target_mem_cgroup, false) &&
get_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority > 0) {
scan_balance = SCAN_FILE;
goto out;
@@ -2258,7 +2272,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc,
* Even if we did not try to evict anon pages at all, we want to
* rebalance the anon lru active/inactive ratio.
*/
- if (inactive_list_is_low(lruvec, false))
+ if (inactive_list_is_low(lruvec, false, sc->target_mem_cgroup, true))
shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
sc, LRU_ACTIVE_ANON);
@@ -2619,6 +2633,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
unsigned long total_scanned = 0;
unsigned long writeback_threshold;
bool aborted_reclaim;
+ struct zone *zone;
+ struct zoneref *z;
retry:
{KSTAT_PERF_ENTER(ttfp);
@@ -2663,6 +2679,11 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
} while (--sc->priority >= 0 && !aborted_reclaim);
out:
+ if (!sc->target_mem_cgroup)
+ for_each_zone_zonelist_nodemask(zone, z, zonelist,
+ gfp_zone(sc->gfp_mask), sc->nodemask)
+ zone->refaults = zone_page_state(zone, WORKINGSET_ACTIVATE);
+
delayacct_freepages_end();
KSTAT_PERF_LEAVE(ttfp);}
@@ -2953,7 +2974,8 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc)
do {
struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
- if (inactive_list_is_low(lruvec, false))
+ if (inactive_list_is_low(lruvec, false,
+ sc->target_mem_cgroup, true))
shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
sc, LRU_ACTIVE_ANON);
@@ -3338,6 +3360,16 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
!pgdat_balanced(pgdat, order, *classzone_idx));
out:
+
+ for (i = pgdat->nr_zones - 1; i >= 0; i--) {
+ struct zone *zone = pgdat->node_zones + i;
+
+ if (!populated_zone(zone))
+ continue;
+
+ zone->refaults = zone_page_state(zone, WORKINGSET_ACTIVATE);
+ }
+
/*
* Return the order we were reclaiming at so prepare_kswapd_sleep()
* makes a decision on the order we were last reclaiming at. However,
_______________________________________________
Devel mailing list
[email protected]
https://lists.openvz.org/mailman/listinfo/devel