Re: [PATCH v17 17/21] mm/lru: replace pgdat lru_lock with lruvec lock

2020-08-06 Thread Alex Shi
Hi Johannes, Michal,

>From page to its lruvec, a few memory access under lock cause extra cost.
Would you like to save the per memcg lruvec pointer to page->private?

Thanks
Alex



在 2020/7/25 下午8:59, Alex Shi 写道:
>  /**
>   * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
>   * @page: the page
> @@ -1215,7 +1228,8 @@ struct lruvec *mem_cgroup_page_lruvec(struct page 
> *page, struct pglist_data *pgd
>   goto out;
>   }
>  
> - memcg = page->mem_cgroup;
> + VM_BUG_ON_PAGE(PageTail(page), page);
> + memcg = READ_ONCE(page->mem_cgroup);
>   /*
>* Swapcache readahead pages are added to the LRU - and
>* possibly migrated - before they are charged.
> @@ -1236,6 +1250,51 @@ struct lruvec *mem_cgroup_page_lruvec(struct page 
> *page, struct pglist_data *pgd
>   return lruvec;
>  }
>  
> +struct lruvec *lock_page_lruvec(struct page *page)
> +{
> + struct lruvec *lruvec;
> + struct pglist_data *pgdat = page_pgdat(page);
> +
> + rcu_read_lock();
> + lruvec = mem_cgroup_page_lruvec(page, pgdat);
> + spin_lock(&lruvec->lru_lock);
> + rcu_read_unlock();
> +
> + lruvec_memcg_debug(lruvec, page);
> +
> + return lruvec;
> +}
> +


Re: [PATCH v17 17/21] mm/lru: replace pgdat lru_lock with lruvec lock

2020-07-28 Thread Alex Shi
rewrite the commit log.

>From 5e9340444632d69cf10c8db521577d0637819c5f Mon Sep 17 00:00:00 2001
From: Alex Shi 
Date: Tue, 26 May 2020 17:27:52 +0800
Subject: [PATCH v17 17/23] mm/lru: replace pgdat lru_lock with lruvec lock

This patch moves the per-node lru_lock into lruvec, thus bringing an
lru_lock for each memcg per node. So on a large machine, each memcg
doesn't have to suffer from per-node pgdat->lru_lock contention; each
can go fast with its own lru_lock.

After move memcg charge before lru inserting, page isolation could
serialize page's memcg, then per memcg lruvec lock is stable and could
replace per node lru lock.

In the function isolate_migratepages_block, compact_unlock_should_abort
is open-coded, and the lock_page_lruvec logic is embedded to tighten the
process. Also add a debug function in the locking path which may give
some clues if something gets out of hand.

According to Daniel Jordan's suggestion, I ran 208 'dd' tasks in 104
containers on a 2s * 26cores * HT box with a modified case:
https://git.kernel.org/pub/scm/linux/kernel/git/wfg/vm-scalability.git/tree/case-lru-file-readtwice

With this and later patches, the readtwice performance increases about
80% within concurrent containers.

On a large machine without memcg, the extra recheck of whether a page's
lruvec has changed, needed in a few places, increases lock holding time
a little, causing a small regression.

Hugh Dickins helped on patch polish, thanks!

Reported-by: kernel test robot 
Signed-off-by: Alex Shi 
Cc: Hugh Dickins 
Cc: Andrew Morton 
Cc: Johannes Weiner 
Cc: Michal Hocko 
Cc: Vladimir Davydov 
Cc: Yang Shi 
Cc: Matthew Wilcox 
Cc: Konstantin Khlebnikov 
Cc: Tejun Heo 
Cc: linux-kernel@vger.kernel.org
Cc: linux...@kvack.org
Cc: cgro...@vger.kernel.org
---
 include/linux/memcontrol.h |  58 +
 include/linux/mmzone.h |   2 +
 mm/compaction.c|  67 ++---
 mm/huge_memory.c   |  11 ++---
 mm/memcontrol.c|  63 ++-
 mm/mlock.c |  47 +---
 mm/mmzone.c|   1 +
 mm/swap.c  | 104 +
 mm/vmscan.c|  70 --
 9 files changed, 288 insertions(+), 135 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index e77197a62809..258901021c6c 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -411,6 +411,19 @@ static inline struct lruvec *mem_cgroup_lruvec(struct 
mem_cgroup *memcg,
 
 struct mem_cgroup *get_mem_cgroup_from_page(struct page *page);
 
+struct lruvec *lock_page_lruvec(struct page *page);
+struct lruvec *lock_page_lruvec_irq(struct page *page);
+struct lruvec *lock_page_lruvec_irqsave(struct page *page,
+   unsigned long *flags);
+
+#ifdef CONFIG_DEBUG_VM
+void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page);
+#else
+static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
+{
+}
+#endif
+
 static inline
 struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){
return css ? container_of(css, struct mem_cgroup, css) : NULL;
@@ -892,6 +905,31 @@ static inline void mem_cgroup_put(struct mem_cgroup *memcg)
 {
 }
 
+static inline struct lruvec *lock_page_lruvec(struct page *page)
+{
+   struct pglist_data *pgdat = page_pgdat(page);
+
+   spin_lock(&pgdat->__lruvec.lru_lock);
+   return &pgdat->__lruvec;
+}
+
+static inline struct lruvec *lock_page_lruvec_irq(struct page *page)
+{
+   struct pglist_data *pgdat = page_pgdat(page);
+
+   spin_lock_irq(&pgdat->__lruvec.lru_lock);
+   return &pgdat->__lruvec;
+}
+
+static inline struct lruvec *lock_page_lruvec_irqsave(struct page *page,
+   unsigned long *flagsp)
+{
+   struct pglist_data *pgdat = page_pgdat(page);
+
+   spin_lock_irqsave(&pgdat->__lruvec.lru_lock, *flagsp);
+   return &pgdat->__lruvec;
+}
+
 static inline struct mem_cgroup *
 mem_cgroup_iter(struct mem_cgroup *root,
struct mem_cgroup *prev,
@@ -1126,6 +1164,10 @@ static inline void count_memcg_page_event(struct page 
*page,
 void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx)
 {
 }
+
+static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
+{
+}
 #endif /* CONFIG_MEMCG */
 
 /* idx can be of type enum memcg_stat_item or node_stat_item */
@@ -1255,6 +1297,22 @@ static inline struct lruvec *parent_lruvec(struct lruvec 
*lruvec)
return mem_cgroup_lruvec(memcg, lruvec_pgdat(lruvec));
 }
 
+static inline void unlock_page_lruvec(struct lruvec *lruvec)
+{
+   spin_unlock(&lruvec->lru_lock);
+}
+
+static inline void unlock_page_lruvec_irq(struct lruvec *lruvec)
+{
+   spin_unlock_irq(&lruvec->lru_lock);
+}
+
+static inline void unlock_page_lruvec_irqrestore(struct lruvec *lruvec,
+   unsigned long flags)
+{
+   spin_unlock_irqrestore(&lruvec->lru_lock, flags);
+}
+
 #ifdef 

Re: [PATCH v17 17/21] mm/lru: replace pgdat lru_lock with lruvec lock

2020-07-28 Thread Alex Shi



在 2020/7/29 上午9:27, Alexander Duyck 写道:
> On Tue, Jul 28, 2020 at 6:00 PM Alex Shi  wrote:
>>
>>
>>
>> 在 2020/7/28 下午10:54, Alexander Duyck 写道:
>>> On Tue, Jul 28, 2020 at 4:20 AM Alex Shi  wrote:



 在 2020/7/28 上午7:34, Alexander Duyck 写道:
>> @@ -1876,6 +1876,12 @@ static unsigned noinline_for_stack 
>> move_pages_to_lru(struct lruvec *lruvec,
>>  *
>> list_add(>lru,)
>>  * list_add(>lru,) //corrupt
>>  */
>> +   new_lruvec = mem_cgroup_page_lruvec(page, 
>> page_pgdat(page));
>> +   if (new_lruvec != lruvec) {
>> +   if (lruvec)
>> +   spin_unlock_irq(>lru_lock);
>> +   lruvec = lock_page_lruvec_irq(page);
>> +   }
>> SetPageLRU(page);
>>
>> if (unlikely(put_page_testzero(page))) {
> I was going through the code of the entire patch set and I noticed
> these changes in move_pages_to_lru. What is the reason for adding the
> new_lruvec logic? My understanding is that we are moving the pages to
> the lruvec provided are we not?If so why do we need to add code to get
> a new lruvec? The code itself seems to stand out from the rest of the
> patch as it is introducing new code instead of replacing existing
> locking code, and it doesn't match up with the description of what
> this function is supposed to do since it changes the lruvec.

 this new_lruvec is the replacement of removed line, as following code:
>> -   lruvec = mem_cgroup_page_lruvec(page, pgdat);
 This recheck is for the page move the root memcg, otherwise it cause the 
 bug:
>>>
>>> Okay, now I see where the issue is. You moved this code so now it has
>>> a different effect than it did before. You are relocking things before
>>> you needed to. Don't forget that when you came into this function you
>>> already had the lock. In addition the patch is broken as it currently
>>> stands as you aren't using similar logic in the code just above this
>>> addition if you encounter an evictable page. As a result this is
>>> really difficult to review as there are subtle bugs here.
>>
>> Why you think its a bug? the relock only happens if locked lruvec is 
>> different.
>> and unlock the old one.
> 
> The section I am talking about with the bug is this section here:
>while (!list_empty(list)) {
> +   struct lruvec *new_lruvec = NULL;
> +
> page = lru_to_page(list);
> VM_BUG_ON_PAGE(PageLRU(page), page);
> list_del(>lru);
> if (unlikely(!page_evictable(page))) {
> -   spin_unlock_irq(>lru_lock);
> +   spin_unlock_irq(>lru_lock);
> putback_lru_page(page);
> -   spin_lock_irq(>lru_lock);
> +   spin_lock_irq(>lru_lock);

It would be still fine. The lruvec->lru_lock will be checked again before
we take and use it. 
And this lock will optimized in patch 19th which did by Hugh Dickins.

> continue;
> }
> 
> Basically it probably is not advisable to be retaking the
> lruvec->lru_lock directly as the lruvec may have changed so it
> wouldn't be correct for the next page. It would make more sense to be
> using your API and calling unlock_page_lruvec_irq and
> lock_page_lruvec_irq instead of using the lock directly.
> 
>>>
>>> I suppose the correct fix is to get rid of this line, but  it should
>>> be placed everywhere the original function was calling
>>> spin_lock_irq().
>>>
>>> In addition I would consider changing the arguments/documentation for
>>> move_pages_to_lru. You aren't moving the pages to lruvec, so there is
>>> probably no need to pass that as an argument. Instead I would pass
>>> pgdat since that isn't going to be moving and is the only thing you
>>> actually derive based on the original lruvec.
>>
>> yes, The comments should be changed with the line was introduced from long 
>> ago. :)
>> Anyway, I am wondering if it worth a v18 version resend?
> 
> So I have been looking over the function itself and I wonder if it
> isn't worth looking at rewriting this to optimize the locking behavior
> to minimize the number of times we have to take the LRU lock. I have
> some code I am working on that I plan to submit as an RFC in the next
> day or so after I can get it smoke tested. The basic idea would be to
> defer returning the evictiable pages or freeing the compound pages
> until after we have processed the pages that can be moved while still
> holding the lock. I would think it should reduce the lock contention
> significantly while improving the throughput.
> 

I tried that once, but the page freeing crosses over into release_pages,
which is hard to deal with.
I am very glad to 

Re: [PATCH v17 17/21] mm/lru: replace pgdat lru_lock with lruvec lock

2020-07-28 Thread Alexander Duyck
On Tue, Jul 28, 2020 at 6:00 PM Alex Shi  wrote:
>
>
>
> 在 2020/7/28 下午10:54, Alexander Duyck 写道:
> > On Tue, Jul 28, 2020 at 4:20 AM Alex Shi  wrote:
> >>
> >>
> >>
> >> 在 2020/7/28 上午7:34, Alexander Duyck 写道:
>  @@ -1876,6 +1876,12 @@ static unsigned noinline_for_stack 
>  move_pages_to_lru(struct lruvec *lruvec,
>   *
>  list_add(>lru,)
>   * list_add(>lru,) //corrupt
>   */
>  +   new_lruvec = mem_cgroup_page_lruvec(page, 
>  page_pgdat(page));
>  +   if (new_lruvec != lruvec) {
>  +   if (lruvec)
>  +   spin_unlock_irq(>lru_lock);
>  +   lruvec = lock_page_lruvec_irq(page);
>  +   }
>  SetPageLRU(page);
> 
>  if (unlikely(put_page_testzero(page))) {
> >>> I was going through the code of the entire patch set and I noticed
> >>> these changes in move_pages_to_lru. What is the reason for adding the
> >>> new_lruvec logic? My understanding is that we are moving the pages to
> >>> the lruvec provided are we not?If so why do we need to add code to get
> >>> a new lruvec? The code itself seems to stand out from the rest of the
> >>> patch as it is introducing new code instead of replacing existing
> >>> locking code, and it doesn't match up with the description of what
> >>> this function is supposed to do since it changes the lruvec.
> >>
> >> this new_lruvec is the replacement of removed line, as following code:
>  -   lruvec = mem_cgroup_page_lruvec(page, pgdat);
> >> This recheck is for the page move the root memcg, otherwise it cause the 
> >> bug:
> >
> > Okay, now I see where the issue is. You moved this code so now it has
> > a different effect than it did before. You are relocking things before
> > you needed to. Don't forget that when you came into this function you
> > already had the lock. In addition the patch is broken as it currently
> > stands as you aren't using similar logic in the code just above this
> > addition if you encounter an evictable page. As a result this is
> > really difficult to review as there are subtle bugs here.
>
> Why you think its a bug? the relock only happens if locked lruvec is 
> different.
> and unlock the old one.

The section I am talking about with the bug is this section here:
   while (!list_empty(list)) {
+   struct lruvec *new_lruvec = NULL;
+
page = lru_to_page(list);
VM_BUG_ON_PAGE(PageLRU(page), page);
list_del(>lru);
if (unlikely(!page_evictable(page))) {
-   spin_unlock_irq(>lru_lock);
+   spin_unlock_irq(>lru_lock);
putback_lru_page(page);
-   spin_lock_irq(>lru_lock);
+   spin_lock_irq(>lru_lock);
continue;
}

Basically it probably is not advisable to be retaking the
lruvec->lru_lock directly as the lruvec may have changed so it
wouldn't be correct for the next page. It would make more sense to be
using your API and calling unlock_page_lruvec_irq and
lock_page_lruvec_irq instead of using the lock directly.

> >
> > I suppose the correct fix is to get rid of this line, but  it should
> > be placed everywhere the original function was calling
> > spin_lock_irq().
> >
> > In addition I would consider changing the arguments/documentation for
> > move_pages_to_lru. You aren't moving the pages to lruvec, so there is
> > probably no need to pass that as an argument. Instead I would pass
> > pgdat since that isn't going to be moving and is the only thing you
> > actually derive based on the original lruvec.
>
> yes, The comments should be changed with the line was introduced from long 
> ago. :)
> Anyway, I am wondering if it worth a v18 version resend?

So I have been looking over the function itself and I wonder if it
isn't worth looking at rewriting this to optimize the locking behavior
to minimize the number of times we have to take the LRU lock. I have
some code I am working on that I plan to submit as an RFC in the next
day or so after I can get it smoke tested. The basic idea would be to
defer returning the evictiable pages or freeing the compound pages
until after we have processed the pages that can be moved while still
holding the lock. I would think it should reduce the lock contention
significantly while improving the throughput.


Re: [PATCH v17 17/21] mm/lru: replace pgdat lru_lock with lruvec lock

2020-07-28 Thread Alex Shi



在 2020/7/28 下午10:54, Alexander Duyck 写道:
> On Tue, Jul 28, 2020 at 4:20 AM Alex Shi  wrote:
>>
>>
>>
>> 在 2020/7/28 上午7:34, Alexander Duyck 写道:
 @@ -1876,6 +1876,12 @@ static unsigned noinline_for_stack 
 move_pages_to_lru(struct lruvec *lruvec,
  *
 list_add(>lru,)
  * list_add(>lru,) //corrupt
  */
 +   new_lruvec = mem_cgroup_page_lruvec(page, 
 page_pgdat(page));
 +   if (new_lruvec != lruvec) {
 +   if (lruvec)
 +   spin_unlock_irq(>lru_lock);
 +   lruvec = lock_page_lruvec_irq(page);
 +   }
 SetPageLRU(page);

 if (unlikely(put_page_testzero(page))) {
>>> I was going through the code of the entire patch set and I noticed
>>> these changes in move_pages_to_lru. What is the reason for adding the
>>> new_lruvec logic? My understanding is that we are moving the pages to
>>> the lruvec provided are we not?If so why do we need to add code to get
>>> a new lruvec? The code itself seems to stand out from the rest of the
>>> patch as it is introducing new code instead of replacing existing
>>> locking code, and it doesn't match up with the description of what
>>> this function is supposed to do since it changes the lruvec.
>>
>> this new_lruvec is the replacement of removed line, as following code:
 -   lruvec = mem_cgroup_page_lruvec(page, pgdat);
>> This recheck is for the page move the root memcg, otherwise it cause the bug:
> 
> Okay, now I see where the issue is. You moved this code so now it has
> a different effect than it did before. You are relocking things before
> you needed to. Don't forget that when you came into this function you
> already had the lock. In addition the patch is broken as it currently
> stands as you aren't using similar logic in the code just above this
> addition if you encounter an evictable page. As a result this is
> really difficult to review as there are subtle bugs here.

Why do you think it's a bug? The relock only happens if the locked
lruvec is different, and it unlocks the old one.

> 
> I suppose the correct fix is to get rid of this line, but  it should
> be placed everywhere the original function was calling
> spin_lock_irq().
> 
> In addition I would consider changing the arguments/documentation for
> move_pages_to_lru. You aren't moving the pages to lruvec, so there is
> probably no need to pass that as an argument. Instead I would pass
> pgdat since that isn't going to be moving and is the only thing you
> actually derive based on the original lruvec.

yes, The comments should be changed with the line was introduced from long ago. 
:)
Anyway, I am wondering if it worth a v18 version resend?

> 


Re: [PATCH v17 17/21] mm/lru: replace pgdat lru_lock with lruvec lock

2020-07-28 Thread Alex Shi



在 2020/7/28 下午11:55, Alexander Duyck 写道:
>>  /*
>> @@ -511,11 +511,11 @@ static bool compact_lock_irqsave(spinlock_t *lock, 
>> unsigned long *flags,
>>   * scheduled)
>>   */
>>  static bool compact_unlock_should_abort(spinlock_t *lock,
>> -   unsigned long flags, bool *locked, struct compact_control 
>> *cc)
>> +   unsigned long flags, void **locked, struct compact_control 
>> *cc)
> Instead of passing both a void pointer and the lock why not just pass
> the pointer to the lock pointer? You could combine lock and locked
> into a single argument and save yourself some extra effort.
> 

The passed 'locked' pointer could be rewritten inside the function,
which is unacceptable if it is a lock that could be used elsewhere.

And it is already dangerous to NULL out a local pointer. In fact, I
prefer the original version: not so smart, but robust enough for future
changes, right?

Thanks
Alex


>>  {
>> if (*locked) {
>> spin_unlock_irqrestore(lock, flags);
>> -   *locked = false;
>> +   *locked = NULL;
>> }
>>
>> if (fatal_signal_pending(current)) {


Re: [PATCH v17 17/21] mm/lru: replace pgdat lru_lock with lruvec lock

2020-07-28 Thread Alexander Duyck
On Tue, Jul 28, 2020 at 8:40 AM Alex Shi  wrote:
>
>
>
> 在 2020/7/28 上午7:34, Alexander Duyck 写道:
> > It might make more sense to look at modifying
> > compact_unlock_should_abort and compact_lock_irqsave (which always
> > returns true so should probably be a void) to address the deficiencies
> > they have that make them unusable for you.
>
> One of possible reuse for the func compact_unlock_should_abort, could be
> like the following, the locked parameter reused different in 2 places.
> but, it's seems no this style usage in kernel, isn't it?
>
> Thanks
> Alex
>
> From 41d5ce6562f20f74bc6ac2db83e226ac28d56e90 Mon Sep 17 00:00:00 2001
> From: Alex Shi 
> Date: Tue, 28 Jul 2020 21:19:32 +0800
> Subject: [PATCH] compaction polishing
>
> Signed-off-by: Alex Shi 
> ---
>  mm/compaction.c | 71 
> -
>  1 file changed, 30 insertions(+), 41 deletions(-)
>
> diff --git a/mm/compaction.c b/mm/compaction.c
> index c28a43481f01..36fce988de3e 100644
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -479,20 +479,20 @@ static bool test_and_set_skip(struct compact_control 
> *cc, struct page *page,
>   *
>   * Always returns true which makes it easier to track lock state in callers.
>   */
> -static bool compact_lock_irqsave(spinlock_t *lock, unsigned long *flags,
> +static void compact_lock_irqsave(spinlock_t *lock, unsigned long *flags,
> struct compact_control *cc)
> __acquires(lock)
>  {
> /* Track if the lock is contended in async mode */
> if (cc->mode == MIGRATE_ASYNC && !cc->contended) {
> if (spin_trylock_irqsave(lock, *flags))
> -   return true;
> +   return;
>
> cc->contended = true;
> }
>
> spin_lock_irqsave(lock, *flags);
> -   return true;
> +   return;
>  }
>
>  /*
> @@ -511,11 +511,11 @@ static bool compact_lock_irqsave(spinlock_t *lock, 
> unsigned long *flags,
>   * scheduled)
>   */
>  static bool compact_unlock_should_abort(spinlock_t *lock,
> -   unsigned long flags, bool *locked, struct compact_control *cc)
> +   unsigned long flags, void **locked, struct compact_control 
> *cc)

Instead of passing both a void pointer and the lock why not just pass
the pointer to the lock pointer? You could combine lock and locked
into a single argument and save yourself some extra effort.

>  {
> if (*locked) {
> spin_unlock_irqrestore(lock, flags);
> -   *locked = false;
> +   *locked = NULL;
> }
>
> if (fatal_signal_pending(current)) {
> @@ -543,7 +543,7 @@ static unsigned long isolate_freepages_block(struct 
> compact_control *cc,
> int nr_scanned = 0, total_isolated = 0;
> struct page *cursor;
> unsigned long flags = 0;
> -   bool locked = false;
> +   struct compact_control *locked = NULL;
> unsigned long blockpfn = *start_pfn;
> unsigned int order;
>
> @@ -565,7 +565,7 @@ static unsigned long isolate_freepages_block(struct 
> compact_control *cc,
>  */
> if (!(blockpfn % SWAP_CLUSTER_MAX)
> && compact_unlock_should_abort(>zone->lock, flags,
> -   , cc))
> +   (void**), cc))
> break;
>
> nr_scanned++;
> @@ -599,8 +599,8 @@ static unsigned long isolate_freepages_block(struct 
> compact_control *cc,
>  * recheck as well.
>  */
> if (!locked) {
> -   locked = compact_lock_irqsave(>zone->lock,
> -   , cc);
> +   compact_lock_irqsave(>zone->lock, , cc);
> +   locked = cc;
>
> /* Recheck this is a buddy page under lock */
> if (!PageBuddy(page))

If you have to provide a pointer you might as well just provide a
pointer to the zone lock since that is the thing that is actually
holding the lock at this point and would be consistent with your other
uses of the locked value. One possibility would be to change the
return type so that you return a pointer to the lock you are using.
Then the code would look closer to the lruvec code you are already
using.

> @@ -787,7 +787,7 @@ static bool too_many_isolated(pg_data_t *pgdat)
> unsigned long nr_scanned = 0, nr_isolated = 0;
> struct lruvec *lruvec;
> unsigned long flags = 0;
> -   struct lruvec *locked_lruvec = NULL;
> +   struct lruvec *locked = NULL;
> struct page *page = NULL, *valid_page = NULL;
> unsigned long start_pfn = low_pfn;
> bool skip_on_failure = false;
> @@ -847,21 +847,11 @@ static bool too_many_isolated(pg_data_t *pgdat)
>  

Re: [PATCH v17 17/21] mm/lru: replace pgdat lru_lock with lruvec lock

2020-07-28 Thread Alex Shi



在 2020/7/28 上午7:34, Alexander Duyck 写道:
> It might make more sense to look at modifying
> compact_unlock_should_abort and compact_lock_irqsave (which always
> returns true so should probably be a void) to address the deficiencies
> they have that make them unusable for you.

One of possible reuse for the func compact_unlock_should_abort, could be
like the following, the locked parameter reused different in 2 places.
but, it's seems no this style usage in kernel, isn't it?

Thanks
Alex

>From 41d5ce6562f20f74bc6ac2db83e226ac28d56e90 Mon Sep 17 00:00:00 2001
From: Alex Shi 
Date: Tue, 28 Jul 2020 21:19:32 +0800
Subject: [PATCH] compaction polishing

Signed-off-by: Alex Shi 
---
 mm/compaction.c | 71 -
 1 file changed, 30 insertions(+), 41 deletions(-)

diff --git a/mm/compaction.c b/mm/compaction.c
index c28a43481f01..36fce988de3e 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -479,20 +479,20 @@ static bool test_and_set_skip(struct compact_control *cc, 
struct page *page,
  *
  * Always returns true which makes it easier to track lock state in callers.
  */
-static bool compact_lock_irqsave(spinlock_t *lock, unsigned long *flags,
+static void compact_lock_irqsave(spinlock_t *lock, unsigned long *flags,
struct compact_control *cc)
__acquires(lock)
 {
/* Track if the lock is contended in async mode */
if (cc->mode == MIGRATE_ASYNC && !cc->contended) {
if (spin_trylock_irqsave(lock, *flags))
-   return true;
+   return;
 
cc->contended = true;
}
 
spin_lock_irqsave(lock, *flags);
-   return true;
+   return;
 }
 
 /*
@@ -511,11 +511,11 @@ static bool compact_lock_irqsave(spinlock_t *lock, 
unsigned long *flags,
  * scheduled)
  */
 static bool compact_unlock_should_abort(spinlock_t *lock,
-   unsigned long flags, bool *locked, struct compact_control *cc)
+   unsigned long flags, void **locked, struct compact_control *cc)
 {
if (*locked) {
spin_unlock_irqrestore(lock, flags);
-   *locked = false;
+   *locked = NULL;
}
 
if (fatal_signal_pending(current)) {
@@ -543,7 +543,7 @@ static unsigned long isolate_freepages_block(struct 
compact_control *cc,
int nr_scanned = 0, total_isolated = 0;
struct page *cursor;
unsigned long flags = 0;
-   bool locked = false;
+   struct compact_control *locked = NULL;
unsigned long blockpfn = *start_pfn;
unsigned int order;
 
@@ -565,7 +565,7 @@ static unsigned long isolate_freepages_block(struct 
compact_control *cc,
 */
if (!(blockpfn % SWAP_CLUSTER_MAX)
&& compact_unlock_should_abort(>zone->lock, flags,
-   , cc))
+   (void**), cc))
break;
 
nr_scanned++;
@@ -599,8 +599,8 @@ static unsigned long isolate_freepages_block(struct 
compact_control *cc,
 * recheck as well.
 */
if (!locked) {
-   locked = compact_lock_irqsave(>zone->lock,
-   , cc);
+   compact_lock_irqsave(>zone->lock, , cc);
+   locked = cc;
 
/* Recheck this is a buddy page under lock */
if (!PageBuddy(page))
@@ -787,7 +787,7 @@ static bool too_many_isolated(pg_data_t *pgdat)
unsigned long nr_scanned = 0, nr_isolated = 0;
struct lruvec *lruvec;
unsigned long flags = 0;
-   struct lruvec *locked_lruvec = NULL;
+   struct lruvec *locked = NULL;
struct page *page = NULL, *valid_page = NULL;
unsigned long start_pfn = low_pfn;
bool skip_on_failure = false;
@@ -847,21 +847,11 @@ static bool too_many_isolated(pg_data_t *pgdat)
 * contention, to give chance to IRQs. Abort completely if
 * a fatal signal is pending.
 */
-   if (!(low_pfn % SWAP_CLUSTER_MAX)) {
-   if (locked_lruvec) {
-   unlock_page_lruvec_irqrestore(locked_lruvec,
-   flags);
-   locked_lruvec = NULL;
-   }
-
-   if (fatal_signal_pending(current)) {
-   cc->contended = true;
-
-   low_pfn = 0;
-   goto fatal_pending;
-   }
-
-   cond_resched();
+   if (!(low_pfn % SWAP_CLUSTER_MAX)
+   && 

Re: [PATCH v17 17/21] mm/lru: replace pgdat lru_lock with lruvec lock

2020-07-28 Thread Alexander Duyck
On Tue, Jul 28, 2020 at 4:20 AM Alex Shi  wrote:
>
>
>
> 在 2020/7/28 上午7:34, Alexander Duyck 写道:
> >> @@ -1876,6 +1876,12 @@ static unsigned noinline_for_stack 
> >> move_pages_to_lru(struct lruvec *lruvec,
> >>  *
> >> list_add(>lru,)
> >>  * list_add(>lru,) //corrupt
> >>  */
> >> +   new_lruvec = mem_cgroup_page_lruvec(page, 
> >> page_pgdat(page));
> >> +   if (new_lruvec != lruvec) {
> >> +   if (lruvec)
> >> +   spin_unlock_irq(>lru_lock);
> >> +   lruvec = lock_page_lruvec_irq(page);
> >> +   }
> >> SetPageLRU(page);
> >>
> >> if (unlikely(put_page_testzero(page))) {
> > I was going through the code of the entire patch set and I noticed
> > these changes in move_pages_to_lru. What is the reason for adding the
> > new_lruvec logic? My understanding is that we are moving the pages to
> > the lruvec provided are we not?If so why do we need to add code to get
> > a new lruvec? The code itself seems to stand out from the rest of the
> > patch as it is introducing new code instead of replacing existing
> > locking code, and it doesn't match up with the description of what
> > this function is supposed to do since it changes the lruvec.
>
> this new_lruvec is the replacement of removed line, as following code:
> >> -   lruvec = mem_cgroup_page_lruvec(page, pgdat);
> This recheck is for the page move the root memcg, otherwise it cause the bug:

Okay, now I see where the issue is. You moved this code so now it has
a different effect than it did before. You are relocking things before
you needed to. Don't forget that when you came into this function you
already had the lock. In addition the patch is broken as it currently
stands as you aren't using similar logic in the code just above this
addition if you encounter an evictable page. As a result this is
really difficult to review as there are subtle bugs here.

I suppose the correct fix is to get rid of this line, but  it should
be placed everywhere the original function was calling
spin_lock_irq().

In addition I would consider changing the arguments/documentation for
move_pages_to_lru. You aren't moving the pages to lruvec, so there is
probably no need to pass that as an argument. Instead I would pass
pgdat since that isn't going to be moving and is the only thing you
actually derive based on the original lruvec.


Re: [PATCH v17 17/21] mm/lru: replace pgdat lru_lock with lruvec lock

2020-07-28 Thread Alex Shi



在 2020/7/28 上午7:34, Alexander Duyck 写道:
>> @@ -1876,6 +1876,12 @@ static unsigned noinline_for_stack 
>> move_pages_to_lru(struct lruvec *lruvec,
>>  *
>> list_add(>lru,)
>>  * list_add(>lru,) //corrupt
>>  */
>> +   new_lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
>> +   if (new_lruvec != lruvec) {
>> +   if (lruvec)
>> +   spin_unlock_irq(>lru_lock);
>> +   lruvec = lock_page_lruvec_irq(page);
>> +   }
>> SetPageLRU(page);
>>
>> if (unlikely(put_page_testzero(page))) {
> I was going through the code of the entire patch set and I noticed
> these changes in move_pages_to_lru. What is the reason for adding the
> new_lruvec logic? My understanding is that we are moving the pages to
> the lruvec provided are we not?If so why do we need to add code to get
> a new lruvec? The code itself seems to stand out from the rest of the
> patch as it is introducing new code instead of replacing existing
> locking code, and it doesn't match up with the description of what
> this function is supposed to do since it changes the lruvec.

this new_lruvec is the replacement of removed line, as following code:
>> -   lruvec = mem_cgroup_page_lruvec(page, pgdat);
This recheck is for the page move the root memcg, otherwise it cause the bug:

[ 2081.240795] BUG: kernel NULL pointer dereference, address: 
[ 2081.248125] #PF: supervisor read access in kernel mode
[ 2081.253627] #PF: error_code(0x) - not-present page
[ 2081.259124] PGD 800044cb0067 P4D 800044cb0067 PUD 95c9067 PMD 0
[ 2081.266193] Oops:  [#1] PREEMPT SMP PTI
[ 2081.270740] CPU: 5 PID: 131 Comm: kswapd0 Kdump: loaded Tainted: GW  
   5.8.0-rc6-00025-gc708f8a0db47 #45
[ 2081.281960] Hardware name: Alibaba X-Dragon CN 01/20G4B, BIOS 1ALSP016 
05/21/2018
[ 2081.290054] RIP: 0010:do_raw_spin_trylock+0x5/0x40
[ 2081.295209] Code: 76 82 48 89 df e8 bb fe ff ff eb 8c 89 c6 48 89 df e8 4f 
dd ff ff 66 90 eb 8b 90 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 <8b> 07 85 
c0 75 28 ba 01 00 00 00 f0 0f b1 17 75 1d 65 8b 05 03 6a
[ 2081.314832] RSP: 0018:c92ebac8 EFLAGS: 00010082
[ 2081.320410] RAX:  RBX: 0018 RCX: 
[ 2081.327907] RDX: 888035833480 RSI:  RDI: 
[ 2081.335407] RBP:  R08: 0001 R09: 0001
[ 2081.342907] R10:  R11:  R12: 0001
[ 2081.350405] R13: dead0100 R14:  R15: c92ebbb0
[ 2081.357908] FS:  () GS:88807a20() 
knlGS:
[ 2081.366619] CS:  0010 DS:  ES:  CR0: 80050033
[ 2081.372717] CR2:  CR3: 31228005 CR4: 003606e0
[ 2081.380215] DR0:  DR1:  DR2: 
[ 2081.387713] DR3:  DR6: fffe0ff0 DR7: 0400
[ 2081.395198] Call Trace:
[ 2081.398008]  _raw_spin_lock_irq+0x47/0x80
[ 2081.402387]  ? move_pages_to_lru+0x566/0xb80
[ 2081.407028]  move_pages_to_lru+0x566/0xb80
[ 2081.411495]  shrink_active_list+0x355/0xa70
[ 2081.416054]  shrink_lruvec+0x4f7/0x810
[ 2081.420176]  ? mem_cgroup_iter+0xb6/0x410
[ 2081.424558]  shrink_node+0x1cc/0x8d0
[ 2081.428510]  balance_pgdat+0x3cf/0x760
[ 2081.432634]  kswapd+0x232/0x660
[ 2081.436147]  ? finish_wait+0x80/0x80
[ 2081.440093]  ? balance_pgdat+0x760/0x760
[ 2081.444382]  kthread+0x17e/0x1b0
[ 2081.447975]  ? kthread_park+0xc0/0xc0
[ 2081.452005]  ret_from_fork+0x22/0x30

Thanks!
Alex
> 
>> @@ -1883,16 +1889,15 @@ static unsigned noinline_for_stack 
>> move_pages_to_lru(struct lruvec *lruvec,
>> __ClearPageActive(page);
>>
>> if (unlikely(PageCompound(page))) {
>> -   spin_unlock_irq(>lru_lock);
>> +   spin_unlock_irq(>lru_lock);
>> destroy_compound_page(page);
>> -   spin_lock_irq(>lru_lock);
>> +   spin_lock_irq(>lru_lock);
>> } else
>> list_add(>lru, _to_free);
>>
>> continue;
>> }
>>
>> -   lruvec = mem_cgroup_page_lruvec(page, pgdat);
>> lru = page_lru(page);
>> nr_pages = hpage_nr_pages(page);


Re: [PATCH v17 17/21] mm/lru: replace pgdat lru_lock with lruvec lock

2020-07-28 Thread Alex Shi



在 2020/7/28 上午7:34, Alexander Duyck 写道:

>> @@ -847,11 +847,21 @@ static bool too_many_isolated(pg_data_t *pgdat)
>>  * contention, to give chance to IRQs. Abort completely if
>>  * a fatal signal is pending.
>>  */
>> -   if (!(low_pfn % SWAP_CLUSTER_MAX)
>> -   && compact_unlock_should_abort(>lru_lock,
>> -   flags, , cc)) {
>> -   low_pfn = 0;
>> -   goto fatal_pending;
>> +   if (!(low_pfn % SWAP_CLUSTER_MAX)) {
>> +   if (locked_lruvec) {
>> +   unlock_page_lruvec_irqrestore(locked_lruvec,
>> +   
>> flags);
>> +   locked_lruvec = NULL;
>> +   }
>> +
>> +   if (fatal_signal_pending(current)) {
>> +   cc->contended = true;
>> +
>> +   low_pfn = 0;
>> +   goto fatal_pending;
>> +   }
>> +
>> +   cond_resched();
>> }
>>
>> if (!pfn_valid_within(low_pfn))
> 
> I'm noticing this patch introduces a bunch of noise. What is the
> reason for getting rid of compact_unlock_should_abort? It seems like
> you just open coded it here. If there is some sort of issue with it
> then it might be better to replace it as part of a preparatory patch
> before you introduce this one as changes like this make it harder to
> review.

Thanks for comments, Alex.

The function compact_unlock_should_abort should be removed since one of
its parameters changed from 'bool *locked' to 'struct lruvec *lruvec',
so it's not applicable now. I have to open-code it here instead of
keeping a function with only one user.

> 
> It might make more sense to look at modifying
> compact_unlock_should_abort and compact_lock_irqsave (which always
> returns true so should probably be a void) to address the deficiencies
> they have that make them unusable for you.

I am wondering if people would like a preparation patch which just
open-codes the compact_unlock_should_abort function and changes the bool
return to void — would you like that?


>> @@ -966,10 +975,20 @@ static bool too_many_isolated(pg_data_t *pgdat)
>> if (!TestClearPageLRU(page))
>> goto isolate_fail_put;
>>
>> +   rcu_read_lock();
>> +   lruvec = mem_cgroup_page_lruvec(page, pgdat);
>> +
>> /* If we already hold the lock, we can skip some rechecking 
>> */
>> -   if (!locked) {
>> -   locked = compact_lock_irqsave(&pgdat->lru_lock,
>> -   &flags, cc);
>> +   if (lruvec != locked_lruvec) {
>> +   if (locked_lruvec)
>> +   unlock_page_lruvec_irqrestore(locked_lruvec,
>> +   flags);
>> +
>> +   compact_lock_irqsave(&lruvec->lru_lock, &flags, cc);
>> +   locked_lruvec = lruvec;
>> +   rcu_read_unlock();
>> +
>> +   lruvec_memcg_debug(lruvec, page);
>>
>> /* Try get exclusive access under lock */
>> if (!skip_updated) {
> 
> So this bit makes things a bit complicated. From what I can can tell
> the comment about exclusive access under the lock is supposed to apply
> to the pageblock via the lru_lock. However you are having to retest
> the lock for each page because it is possible the page was moved to
> another memory cgroup while the lru_lock was released correct? So in

The pageblock is aligned by pfn, so the pages in it may not be on the same memcg
originally. And yes, a page's memcg may also be changed.

> this case is the lru vector lock really providing any protection for
> the skip_updated portion of this code block if the lock isn't
> exclusive to the pageblock? In theory this would probably make more
> sense to have protected the skip bits under the zone lock, but I
> imagine that was avoided due to the additional overhead.

When we change to lruvec->lru_lock, it does the same thing as pgdat->lru_lock,
just with a bit more chance to get here, find out this is a skippable
pageblock and quit.
Yes, logically, the pgdat lru_lock seems better, but since we are already holding
the lru_lock, it's fine not to bother with more locks.

> 
>> @@ -1876,6 +1876,12 @@ static unsigned noinline_for_stack 
>> move_pages_to_lru(struct lruvec *lruvec,
>>  * list_add(&page->lru,)
>>  * list_add(&page->lru,) //corrupt
>>  */
>> +   new_lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
>> +   if (new_lruvec != lruvec) {
>> +   if (lruvec)
>> +  

Re: [PATCH v17 17/21] mm/lru: replace pgdat lru_lock with lruvec lock

2020-07-27 Thread Alexander Duyck
On Sat, Jul 25, 2020 at 6:01 AM Alex Shi  wrote:
>
> This patch moves per node lru_lock into lruvec, thus bring a lru_lock for
> each of memcg per node. So on a large machine, each of memcg don't
> have to suffer from per node pgdat->lru_lock competition. They could go
> fast with their self lru_lock.
>
> After move memcg charge before lru inserting, page isolation could
> serialize page's memcg, then per memcg lruvec lock is stable and could
> replace per node lru lock.
>
> According to Daniel Jordan's suggestion, I run 208 'dd' with on 104
> containers on a 2s * 26cores * HT box with a modified case:
> https://git.kernel.org/pub/scm/linux/kernel/git/wfg/vm-scalability.git/tree/case-lru-file-readtwice
>
> With this and later patches, the readtwice performance increases about
> 80% within concurrent containers.
>
> Also add a debug function in locking which may give some clues if
> something gets out of hand.
>
> Hugh Dickins helped on patch polish, thanks!
>
> Reported-by: kernel test robot 
> Signed-off-by: Alex Shi 
> Cc: Hugh Dickins 
> Cc: Andrew Morton 
> Cc: Johannes Weiner 
> Cc: Michal Hocko 
> Cc: Vladimir Davydov 
> Cc: Yang Shi 
> Cc: Matthew Wilcox 
> Cc: Konstantin Khlebnikov 
> Cc: Tejun Heo 
> Cc: linux-kernel@vger.kernel.org
> Cc: linux...@kvack.org
> Cc: cgro...@vger.kernel.org
> ---
>  include/linux/memcontrol.h |  58 +
>  include/linux/mmzone.h |   2 +
>  mm/compaction.c|  67 ++---
>  mm/huge_memory.c   |  11 ++---
>  mm/memcontrol.c|  63 ++-
>  mm/mlock.c |  47 +---
>  mm/mmzone.c|   1 +
>  mm/swap.c  | 104 
> +
>  mm/vmscan.c|  70 --
>  9 files changed, 288 insertions(+), 135 deletions(-)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index e77197a62809..258901021c6c 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -411,6 +411,19 @@ static inline struct lruvec *mem_cgroup_lruvec(struct 
> mem_cgroup *memcg,
>
>  struct mem_cgroup *get_mem_cgroup_from_page(struct page *page);
>
> +struct lruvec *lock_page_lruvec(struct page *page);
> +struct lruvec *lock_page_lruvec_irq(struct page *page);
> +struct lruvec *lock_page_lruvec_irqsave(struct page *page,
> +   unsigned long *flags);
> +
> +#ifdef CONFIG_DEBUG_VM
> +void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page);
> +#else
> +static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page 
> *page)
> +{
> +}
> +#endif
> +
>  static inline
>  struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){
> return css ? container_of(css, struct mem_cgroup, css) : NULL;
> @@ -892,6 +905,31 @@ static inline void mem_cgroup_put(struct mem_cgroup 
> *memcg)
>  {
>  }
>
> +static inline struct lruvec *lock_page_lruvec(struct page *page)
> +{
> +   struct pglist_data *pgdat = page_pgdat(page);
> +
> +   spin_lock(&pgdat->__lruvec.lru_lock);
> +   return &pgdat->__lruvec;
> +}
> +
> +static inline struct lruvec *lock_page_lruvec_irq(struct page *page)
> +{
> +   struct pglist_data *pgdat = page_pgdat(page);
> +
> +   spin_lock_irq(&pgdat->__lruvec.lru_lock);
> +   return &pgdat->__lruvec;
> +}
> +
> +static inline struct lruvec *lock_page_lruvec_irqsave(struct page *page,
> +   unsigned long *flagsp)
> +{
> +   struct pglist_data *pgdat = page_pgdat(page);
> +
> +   spin_lock_irqsave(&pgdat->__lruvec.lru_lock, *flagsp);
> +   return &pgdat->__lruvec;
> +}
> +
>  static inline struct mem_cgroup *
>  mem_cgroup_iter(struct mem_cgroup *root,
> struct mem_cgroup *prev,
> @@ -1126,6 +1164,10 @@ static inline void count_memcg_page_event(struct page 
> *page,
>  void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx)
>  {
>  }
> +
> +static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page 
> *page)
> +{
> +}
>  #endif /* CONFIG_MEMCG */
>
>  /* idx can be of type enum memcg_stat_item or node_stat_item */
> @@ -1255,6 +1297,22 @@ static inline struct lruvec *parent_lruvec(struct 
> lruvec *lruvec)
> return mem_cgroup_lruvec(memcg, lruvec_pgdat(lruvec));
>  }
>
> +static inline void unlock_page_lruvec(struct lruvec *lruvec)
> +{
> +   spin_unlock(&lruvec->lru_lock);
> +}
> +
> +static inline void unlock_page_lruvec_irq(struct lruvec *lruvec)
> +{
> +   spin_unlock_irq(&lruvec->lru_lock);
> +}
> +
> +static inline void unlock_page_lruvec_irqrestore(struct lruvec *lruvec,
> +   unsigned long flags)
> +{
> +   spin_unlock_irqrestore(&lruvec->lru_lock, flags);
> +}
> +
>  #ifdef CONFIG_CGROUP_WRITEBACK
>
>  struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb);
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 14c668b7e793..30b961a9a749 100644
> --- 

[PATCH v17 17/21] mm/lru: replace pgdat lru_lock with lruvec lock

2020-07-25 Thread Alex Shi
This patch moves per node lru_lock into lruvec, thus bring a lru_lock for
each of memcg per node. So on a large machine, each of memcg don't
have to suffer from per node pgdat->lru_lock competition. They could go
fast with their self lru_lock.

After move memcg charge before lru inserting, page isolation could
serialize page's memcg, then per memcg lruvec lock is stable and could
replace per node lru lock.

According to Daniel Jordan's suggestion, I run 208 'dd' with on 104
containers on a 2s * 26cores * HT box with a modified case:
https://git.kernel.org/pub/scm/linux/kernel/git/wfg/vm-scalability.git/tree/case-lru-file-readtwice

With this and later patches, the readtwice performance increases about
80% within concurrent containers.

Also add a debug function in locking which may give some clues if
something gets out of hand.

Hugh Dickins helped on patch polish, thanks!

Reported-by: kernel test robot 
Signed-off-by: Alex Shi 
Cc: Hugh Dickins 
Cc: Andrew Morton 
Cc: Johannes Weiner 
Cc: Michal Hocko 
Cc: Vladimir Davydov 
Cc: Yang Shi 
Cc: Matthew Wilcox 
Cc: Konstantin Khlebnikov 
Cc: Tejun Heo 
Cc: linux-kernel@vger.kernel.org
Cc: linux-mm@kvack.org
Cc: cgroups@vger.kernel.org
---
 include/linux/memcontrol.h |  58 +
 include/linux/mmzone.h |   2 +
 mm/compaction.c|  67 ++---
 mm/huge_memory.c   |  11 ++---
 mm/memcontrol.c|  63 ++-
 mm/mlock.c |  47 +---
 mm/mmzone.c|   1 +
 mm/swap.c  | 104 +
 mm/vmscan.c|  70 --
 9 files changed, 288 insertions(+), 135 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index e77197a62809..258901021c6c 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -411,6 +411,19 @@ static inline struct lruvec *mem_cgroup_lruvec(struct 
mem_cgroup *memcg,
 
 struct mem_cgroup *get_mem_cgroup_from_page(struct page *page);
 
+struct lruvec *lock_page_lruvec(struct page *page);
+struct lruvec *lock_page_lruvec_irq(struct page *page);
+struct lruvec *lock_page_lruvec_irqsave(struct page *page,
+   unsigned long *flags);
+
+#ifdef CONFIG_DEBUG_VM
+void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page);
+#else
+static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
+{
+}
+#endif
+
 static inline
 struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){
return css ? container_of(css, struct mem_cgroup, css) : NULL;
@@ -892,6 +905,31 @@ static inline void mem_cgroup_put(struct mem_cgroup *memcg)
 {
 }
 
+static inline struct lruvec *lock_page_lruvec(struct page *page)
+{
+   struct pglist_data *pgdat = page_pgdat(page);
+
+   spin_lock(&pgdat->__lruvec.lru_lock);
+   return &pgdat->__lruvec;
+}
+
+static inline struct lruvec *lock_page_lruvec_irq(struct page *page)
+{
+   struct pglist_data *pgdat = page_pgdat(page);
+
+   spin_lock_irq(&pgdat->__lruvec.lru_lock);
+   return &pgdat->__lruvec;
+}
+
+static inline struct lruvec *lock_page_lruvec_irqsave(struct page *page,
+   unsigned long *flagsp)
+{
+   struct pglist_data *pgdat = page_pgdat(page);
+
+   spin_lock_irqsave(&pgdat->__lruvec.lru_lock, *flagsp);
+   return &pgdat->__lruvec;
+}
+
 static inline struct mem_cgroup *
 mem_cgroup_iter(struct mem_cgroup *root,
struct mem_cgroup *prev,
@@ -1126,6 +1164,10 @@ static inline void count_memcg_page_event(struct page 
*page,
 void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx)
 {
 }
+
+static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
+{
+}
 #endif /* CONFIG_MEMCG */
 
 /* idx can be of type enum memcg_stat_item or node_stat_item */
@@ -1255,6 +1297,22 @@ static inline struct lruvec *parent_lruvec(struct lruvec 
*lruvec)
return mem_cgroup_lruvec(memcg, lruvec_pgdat(lruvec));
 }
 
+static inline void unlock_page_lruvec(struct lruvec *lruvec)
+{
+   spin_unlock(&lruvec->lru_lock);
+}
+
+static inline void unlock_page_lruvec_irq(struct lruvec *lruvec)
+{
+   spin_unlock_irq(&lruvec->lru_lock);
+}
+
+static inline void unlock_page_lruvec_irqrestore(struct lruvec *lruvec,
+   unsigned long flags)
+{
+   spin_unlock_irqrestore(&lruvec->lru_lock, flags);
+}
+
 #ifdef CONFIG_CGROUP_WRITEBACK
 
 struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb);
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 14c668b7e793..30b961a9a749 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -249,6 +249,8 @@ enum lruvec_flags {
 };
 
 struct lruvec {
+   /* per lruvec lru_lock for memcg */
+   spinlock_t  lru_lock;
struct list_headlists[NR_LRU_LISTS];
/*
 * These track the cost of reclaiming one LRU - file or