With the per-zone LRU stats, we can see that the normal zone has 86M of
anonymous memory, yet OOM happened on a non-atomic order-0 allocation.
Most of the anonymous pages in the normal zone are on the active list.

gfp_mask=0x26004c0(GFP_KERNEL|__GFP_REPEAT|__GFP_NOTRACK), order=0
Call Trace:
 [<c51a76e2>] __alloc_pages_nodemask+0xe52/0xe60
 [<c51f31dc>] ? new_slab+0x39c/0x3b0
 [<c51f31dc>] new_slab+0x39c/0x3b0
 [<c51f4eca>] ___slab_alloc.constprop.87+0x6da/0x840
 [<c563e6fc>] ? __alloc_skb+0x3c/0x260
 [<c50b8e93>] ? enqueue_task_fair+0x73/0xbf0
 [<c5219ee0>] ? poll_select_copy_remaining+0x140/0x140
 [<c5201645>] __slab_alloc.isra.81.constprop.86+0x40/0x6d
 [<c563e6fc>] ? __alloc_skb+0x3c/0x260
 [<c51f525c>] kmem_cache_alloc+0x22c/0x260
 [<c563e6fc>] ? __alloc_skb+0x3c/0x260
 [<c563e6fc>] __alloc_skb+0x3c/0x260
 [<c563eece>] alloc_skb_with_frags+0x4e/0x1a0
 [<c5638d6a>] sock_alloc_send_pskb+0x16a/0x1b0
 [<c570b581>] ? wait_for_unix_gc+0x31/0x90
 [<c57084dd>] unix_stream_sendmsg+0x28d/0x340
 [<c5634dad>] sock_sendmsg+0x2d/0x40
 [<c5634e2c>] sock_write_iter+0x6c/0xc0
 [<c5204a90>] __vfs_write+0xc0/0x120
 [<c52053ab>] vfs_write+0x9b/0x1a0
 [<c51cc4a9>] ? __might_fault+0x49/0xa0
 [<c52062c4>] SyS_write+0x44/0x90
 [<c50036c6>] do_fast_syscall_32+0xa6/0x1e0

Mem-Info:
active_anon:101103 inactive_anon:102219 isolated_anon:0
 active_file:503 inactive_file:544 isolated_file:0
 unevictable:0 dirty:0 writeback:34 unstable:0
 slab_reclaimable:6298 slab_unreclaimable:74669
 mapped:863 shmem:0 pagetables:100998 bounce:0
 free:23573 free_pcp:1861 free_cma:0
Node 0 active_anon:404412kB inactive_anon:409040kB active_file:2012kB 
inactive_file:2176kB unevictable:0kB isolated(anon):0kB isolated(file):0kB 
mapped:3452kB dirty:0kB writeback:136kB shmem:0kB writeback_tmp:0kB 
unstable:0kB pages_scanned:1320845 all_unreclaimable? yes
DMA free:3296kB min:68kB low:84kB high:100kB active_anon:5540kB 
inactive_anon:0kB active_file:0kB inactive_file:0kB present:15992kB 
managed:15916kB mlocked:0kB slab_reclaimable:248kB slab_unreclaimable:2628kB 
kernel_stack:792kB pagetables:2316kB bounce:0kB free_pcp:0kB local_pcp:0kB 
free_cma:0kB
lowmem_reserve[]: 0 809 1965 1965
Normal free:3600kB min:3604kB low:4504kB high:5404kB active_anon:86304kB 
inactive_anon:0kB active_file:160kB inactive_file:376kB present:897016kB 
managed:858524kB mlocked:0kB slab_reclaimable:24944kB 
slab_unreclaimable:296048kB kernel_stack:163832kB pagetables:35892kB bounce:0kB 
free_pcp:3076kB local_pcp:656kB free_cma:0kB
lowmem_reserve[]: 0 0 9247 9247
HighMem free:86156kB min:512kB low:1796kB high:3080kB active_anon:312852kB 
inactive_anon:410024kB active_file:1924kB inactive_file:2012kB 
present:1183736kB managed:1183736kB mlocked:0kB slab_reclaimable:0kB 
slab_unreclaimable:0kB kernel_stack:0kB pagetables:365784kB bounce:0kB 
free_pcp:3868kB local_pcp:720kB free_cma:0kB
lowmem_reserve[]: 0 0 0 0
DMA: 8*4kB (UM) 8*8kB (UM) 4*16kB (M) 2*32kB (UM) 2*64kB (UM) 1*128kB (M) 
3*256kB (UME) 2*512kB (UE) 1*1024kB (E) 0*2048kB 0*4096kB = 3296kB
Normal: 240*4kB (UME) 160*8kB (UME) 23*16kB (ME) 3*32kB (UE) 3*64kB (UME) 
2*128kB (ME) 1*256kB (U) 0*512kB 0*1024kB 0*2048kB 0*4096kB = 3408kB
HighMem: 10942*4kB (UM) 3102*8kB (UM) 866*16kB (UM) 76*32kB (UM) 11*64kB (UM) 
4*128kB (UM) 1*256kB (M) 0*512kB 0*1024kB 0*2048kB 0*4096kB = 86344kB
Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=2048kB
54409 total pagecache pages
53215 pages in swap cache
Swap cache stats: add 300982, delete 247765, find 157978/226539
Free swap  = 3803244kB
Total swap = 4192252kB
524186 pages RAM
295934 pages HighMem/MovableOnly
9642 pages reserved
0 pages cma reserved

The problem was the current deactivation logic (i.e., inactive_list_is_low).

        Node 0 active_anon:404412kB inactive_anon:409040kB

IOW, (inactive_anon of node * inactive_ratio > active_anon of node) holds
because of the highmem anonymous stats, so inactive_list_is_low returns
false and the VM never deactivates the normal zone's anonymous pages.

This patch makes inactive_list_is_low aware of the per-zone inactive ratio,
so that if the LRU list of the requested zone or of any lower zone cannot
meet the ratio, the VM starts to deactivate pages.

Signed-off-by: Minchan Kim <[email protected]>
---
 mm/vmscan.c | 41 ++++++++++++++++++++++++++++-------------
 1 file changed, 28 insertions(+), 13 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index dffa7e0..e60329b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1955,12 +1955,16 @@ static void shrink_active_list(unsigned long nr_to_scan,
  *    1TB     101        10GB
  *   10TB     320        32GB
  */
-static bool inactive_list_is_low(struct lruvec *lruvec, bool file)
+static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
+                                               int classzone_idx)
 {
        unsigned long inactive_ratio;
        unsigned long inactive;
        unsigned long active;
        unsigned long gb;
+       struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+       struct zone *zone;
+       int zone_idx;
 
        /*
         * If we don't have swap space, anonymous page deactivation
@@ -1969,23 +1973,34 @@ static bool inactive_list_is_low(struct lruvec *lruvec, 
bool file)
        if (!file && !total_swap_pages)
                return false;
 
-       inactive = lruvec_lru_size(lruvec, file * LRU_FILE);
-       active = lruvec_lru_size(lruvec, file * LRU_FILE + LRU_ACTIVE);
+       for (zone_idx = classzone_idx; zone_idx > 0; zone_idx--) {
+               zone = &pgdat->node_zones[zone_idx];
+               if (!populated_zone(zone))
+                       continue;
 
-       gb = (inactive + active) >> (30 - PAGE_SHIFT);
-       if (gb)
-               inactive_ratio = int_sqrt(10 * gb);
-       else
-               inactive_ratio = 1;
+               inactive = zone_page_state(zone,
+                               NR_ZONE_LRU_BASE + file * LRU_FILE);
+               active = zone_page_state(zone,
+                               NR_ZONE_LRU_BASE + file * LRU_FILE + 
LRU_ACTIVE);
 
-       return inactive * inactive_ratio < active;
+               gb = (inactive + active) >> (30 - PAGE_SHIFT);
+               if (gb)
+                       inactive_ratio = int_sqrt(10 * gb);
+               else
+                       inactive_ratio = 1;
+
+               if ((inactive * inactive_ratio) < active)
+                       return true;
+       }
+
+       return false;
 }
 
 static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
                                 struct lruvec *lruvec, struct scan_control *sc)
 {
        if (is_active_lru(lru)) {
-               if (inactive_list_is_low(lruvec, is_file_lru(lru)))
+               if (inactive_list_is_low(lruvec, is_file_lru(lru), 
sc->reclaim_idx))
                        shrink_active_list(nr_to_scan, lruvec, sc, lru);
                return 0;
        }
@@ -2116,7 +2131,7 @@ static void get_scan_count(struct lruvec *lruvec, struct 
mem_cgroup *memcg,
         * lruvec even if it has plenty of old anonymous pages unless the
         * system is under heavy pressure.
         */
-       if (!inactive_list_is_low(lruvec, true) &&
+       if (!inactive_list_is_low(lruvec, true, sc->reclaim_idx) &&
            lruvec_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) {
                scan_balance = SCAN_FILE;
                goto out;
@@ -2358,7 +2373,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, 
struct mem_cgroup *memc
         * Even if we did not try to evict anon pages at all, we want to
         * rebalance the anon lru active/inactive ratio.
         */
-       if (inactive_list_is_low(lruvec, false))
+       if (inactive_list_is_low(lruvec, false, sc->reclaim_idx))
                shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
                                   sc, LRU_ACTIVE_ANON);
 
@@ -3011,7 +3026,7 @@ static void age_active_anon(struct pglist_data *pgdat,
        do {
                struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg);
 
-               if (inactive_list_is_low(lruvec, false))
+               if (inactive_list_is_low(lruvec, false, sc->reclaim_idx))
                        shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
                                           sc, LRU_ACTIVE_ANON);
 
-- 
1.9.1

Reply via email to