Re: [PATCH] arm: kprobe: replace patch_lock to raw lock

2016-12-05 Thread Shi, Yang

On 12/1/2016 6:13 AM, Sebastian Andrzej Siewior wrote:

On 2016-11-10 16:17:55 [-0800], Yang Shi wrote:


Since patch_text_stop_machine() is called from stop_machine(), which disables IRQs,
a sleepable lock must not be used in this atomic context, so replace patch_lock
with a raw lock.

Signed-off-by: Yang Shi <yang@linaro.org>
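
For reference, the conversion amounts to something like the sketch below
(illustrative only; the lock is the one defined in arch/arm/kernel/patch.c,
but the function name here is hypothetical). A raw_spinlock_t never becomes
a sleeping lock (e.g. under PREEMPT_RT), so it is safe to take in the
IRQs-disabled stop_machine() context described above:

#include <linux/spinlock.h>

static DEFINE_RAW_SPINLOCK(patch_lock);	/* was DEFINE_SPINLOCK(patch_lock) */

/* Hypothetical caller showing the locking pattern, not the literal patch */
static void patch_text_critical_section(void)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&patch_lock, flags);	/* was spin_lock_irqsave() */
	/* ... patch the instruction stream ... */
	raw_spin_unlock_irqrestore(&patch_lock, flags);	/* was spin_unlock_irqrestore() */
}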


This can also go upstream.
Acked-by: Sebastian Andrzej Siewior <bige...@linutronix.de>


Yes, thanks for acking.

Russell,

Could you please consider this patch?

Thanks,
Yang



Sebastian





Re: [v2 PATCH] arm64: kasan: instrument user memory access API

2016-06-06 Thread Shi, Yang

Hi Will & Catalin,

Any comment for this patch?

Thanks,
Yang


On 5/27/2016 2:01 PM, Yang Shi wrote:

The upstream commit 1771c6e1a567ea0ba20a4ffe68a1419fd8ef
("x86/kasan: instrument user memory access API") added KASAN instrumentation to
the x86 user memory access API, so add the same instrumentation to ARM64 too.

Define __copy_to/from_user() in C in order to add the kasan_check_read/write()
calls, and rename the assembly implementations to __arch_copy_to/from_user().

Tested with the test_kasan module.
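
As an illustration of what the instrumentation catches, here is a snippet in
the spirit of the test_kasan module (a hypothetical standalone sketch, not
the module's actual code):

#include <linux/slab.h>
#include <linux/uaccess.h>

/* An out-of-bounds copy_from_user() destination: the kasan_check_write()
 * added by this patch should flag a slab-out-of-bounds write on kmem. */
static noinline void copy_user_oob_sketch(const void __user *usermem)
{
	char *kmem = kmalloc(10, GFP_KERNEL);
	unsigned long ret;

	if (!kmem)
		return;
	ret = copy_from_user(kmem, usermem, 11);	/* 11 > 10: KASAN report */
	(void)ret;
	kfree(kmem);
}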

Signed-off-by: Yang Shi <yang@linaro.org>
---
v2:
 Adopted the comments from Andrey and Mark to add kasan_check_read/write() into
 __copy_to/from_user().

 arch/arm64/include/asm/uaccess.h | 25 +
 arch/arm64/kernel/arm64ksyms.c   |  4 ++--
 arch/arm64/lib/copy_from_user.S  |  4 ++--
 arch/arm64/lib/copy_to_user.S|  4 ++--
 4 files changed, 27 insertions(+), 10 deletions(-)

diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h
index 0685d74..4dc9a8f 100644
--- a/arch/arm64/include/asm/uaccess.h
+++ b/arch/arm64/include/asm/uaccess.h
@@ -23,6 +23,7 @@
  */
 #include 
 #include 
+#include <linux/kasan-checks.h>

 #include 
 #include 
@@ -269,15 +270,29 @@ do {								\
		-EFAULT;					\
 })

-extern unsigned long __must_check __copy_from_user(void *to, const void __user 
*from, unsigned long n);
-extern unsigned long __must_check __copy_to_user(void __user *to, const void 
*from, unsigned long n);
+extern unsigned long __must_check __arch_copy_from_user(void *to, const void 
__user *from, unsigned long n);
+extern unsigned long __must_check __arch_copy_to_user(void __user *to, const 
void *from, unsigned long n);
 extern unsigned long __must_check __copy_in_user(void __user *to, const void 
__user *from, unsigned long n);
 extern unsigned long __must_check __clear_user(void __user *addr, unsigned 
long n);

+static inline unsigned long __must_check __copy_from_user(void *to, const void 
__user *from, unsigned long n)
+{
+   kasan_check_write(to, n);
+   return  __arch_copy_from_user(to, from, n);
+}
+
+static inline unsigned long __must_check __copy_to_user(void __user *to, const 
void *from, unsigned long n)
+{
+   kasan_check_read(from, n);
+   return  __arch_copy_to_user(to, from, n);
+}
+
 static inline unsigned long __must_check copy_from_user(void *to, const void 
__user *from, unsigned long n)
 {
+   kasan_check_write(to, n);
+
if (access_ok(VERIFY_READ, from, n))
-   n = __copy_from_user(to, from, n);
+   n = __arch_copy_from_user(to, from, n);
else /* security hole - plug it */
memset(to, 0, n);
return n;
@@ -285,8 +300,10 @@ static inline unsigned long __must_check 
copy_from_user(void *to, const void __u

 static inline unsigned long __must_check copy_to_user(void __user *to, const 
void *from, unsigned long n)
 {
+   kasan_check_read(from, n);
+
if (access_ok(VERIFY_WRITE, to, n))
-   n = __copy_to_user(to, from, n);
+   n = __arch_copy_to_user(to, from, n);
return n;
 }

diff --git a/arch/arm64/kernel/arm64ksyms.c b/arch/arm64/kernel/arm64ksyms.c
index 678f30b0..2dc4440 100644
--- a/arch/arm64/kernel/arm64ksyms.c
+++ b/arch/arm64/kernel/arm64ksyms.c
@@ -34,8 +34,8 @@ EXPORT_SYMBOL(copy_page);
 EXPORT_SYMBOL(clear_page);

/* user mem (segment) */
-EXPORT_SYMBOL(__copy_from_user);
-EXPORT_SYMBOL(__copy_to_user);
+EXPORT_SYMBOL(__arch_copy_from_user);
+EXPORT_SYMBOL(__arch_copy_to_user);
 EXPORT_SYMBOL(__clear_user);
 EXPORT_SYMBOL(__copy_in_user);

diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S
index 17e8306..0b90497 100644
--- a/arch/arm64/lib/copy_from_user.S
+++ b/arch/arm64/lib/copy_from_user.S
@@ -66,7 +66,7 @@
.endm

 end.reqx5
-ENTRY(__copy_from_user)
+ENTRY(__arch_copy_from_user)
 ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \
CONFIG_ARM64_PAN)
add end, x0, x2
@@ -75,7 +75,7 @@ ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), 
ARM64_ALT_PAN_NOT_UAO, \
CONFIG_ARM64_PAN)
mov x0, #0  // Nothing to copy
ret
-ENDPROC(__copy_from_user)
+ENDPROC(__arch_copy_from_user)

.section .fixup,"ax"
.align  2
diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S
index 21faae6..7a7efe2 100644
--- a/arch/arm64/lib/copy_to_user.S
+++ b/arch/arm64/lib/copy_to_user.S
@@ -65,7 +65,7 @@
.endm

 end.reqx5
-ENTRY(__copy_to_user)
+ENTRY(__arch_copy_to_user)
 ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \
CONFIG_ARM64_PAN)
add end, x0, x2
@@ -74,7 +74,7 @@ ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), 
ARM64_ALT_PAN_NOT_UAO, \
CONFIG_ARM64_PAN)
	mov	x0, #0
	ret
-ENDPROC(__copy_to_user)
+ENDPROC(__arch_copy_to_user)

Re: [PATCH] mm: check the return value of lookup_page_ext for all call sites

2016-06-02 Thread Shi, Yang

On 6/1/2016 10:00 PM, Minchan Kim wrote:

On Wed, Jun 01, 2016 at 01:40:48PM -0700, Shi, Yang wrote:

On 5/29/2016 11:11 PM, Minchan Kim wrote:

On Fri, May 27, 2016 at 11:16:41AM -0700, Shi, Yang wrote:





If we go this way, how do we guard against this race?


Thanks for pointing this out. It sounds reasonable. However, this
should only be possible on 32 bit, since only the 32-bit
version of page_is_idle() calls lookup_page_ext(); the 64-bit version
doesn't.

And such a race condition should exist regardless of whether DEBUG_VM
is enabled or not, right?

RCU might be good enough to protect it.

A quick fix may look like:

diff --git a/include/linux/page_idle.h b/include/linux/page_idle.h
index 8f5d4ad..bf0cd6a 100644
--- a/include/linux/page_idle.h
+++ b/include/linux/page_idle.h
@@ -77,8 +77,12 @@ static inline bool
test_and_clear_page_young(struct page *page)
static inline bool page_is_idle(struct page *page)
{
   struct page_ext *page_ext;
+
+   rcu_read_lock();
   page_ext = lookup_page_ext(page);
+   rcu_read_unlock();
+
if (unlikely(!page_ext))
   return false;

diff --git a/mm/page_ext.c b/mm/page_ext.c
index 56b160f..94927c9 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -183,7 +183,6 @@ struct page_ext *lookup_page_ext(struct page *page)
{
   unsigned long pfn = page_to_pfn(page);
   struct mem_section *section = __pfn_to_section(pfn);
-#if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAGE_POISONING)
   /*
* The sanity checks the page allocator does upon freeing a
* page can reach here before the page_ext arrays are
@@ -195,7 +194,7 @@ struct page_ext *lookup_page_ext(struct page *page)
*/
   if (!section->page_ext)
   return NULL;
-#endif
+
   return section->page_ext + pfn;
}

@@ -279,7 +278,8 @@ static void __free_page_ext(unsigned long pfn)
   return;
   base = ms->page_ext + pfn;
   free_page_ext(base);
-   ms->page_ext = NULL;
+   rcu_assign_pointer(ms->page_ext, NULL);
+   synchronize_rcu();


How does it fix the problem?
I cannot understand your point.


Assigning the NULL pointer to page_ext will be blocked until
the rcu_read_lock() critical section is done, so the lookup and write
operations will be serialized. And rcu_read_lock() disables preemption
too.


I meant your rcu_read_lock in page_idle should cover test_bit op.


Yes, definitely. Thanks for catching it.


One more thing, you should use rcu_dereference.


I will check which one is best, since I saw some places use rcu_assign_pointer().
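
Folding both comments in, the reader side might look like this sketch
(illustrative only; it assumes lookup_page_ext() is taught to use
rcu_dereference() internally, per the comment above):

static inline bool page_is_idle(struct page *page)
{
	struct page_ext *page_ext;
	bool idle = false;

	rcu_read_lock();
	page_ext = lookup_page_ext(page);
	if (likely(page_ext))
		/* the test_bit() is now inside the read-side critical section */
		idle = test_bit(PAGE_EXT_IDLE, &page_ext->flags);
	rcu_read_unlock();

	return idle;
}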



Also, please cover the memory onlining case I mentioned in another
thread, as well as memory offlining.


I will look into it too.

Thanks,
Yang



Anyway, to me, every caller of page_ext should be prepared for
lookup_page_ext() to return NULL at any time, and they should use
rcu_read_[un]lock, which is not good. :(






Re: [PATCH] mm: check the return value of lookup_page_ext for all call sites

2016-06-01 Thread Shi, Yang

On 5/29/2016 11:08 PM, Minchan Kim wrote:

On Mon, May 30, 2016 at 02:39:06PM +0900, Joonsoo Kim wrote:

On Fri, May 27, 2016 at 05:11:08PM +0900, Minchan Kim wrote:

On Fri, May 27, 2016 at 03:08:39PM +0900, Joonsoo Kim wrote:

On Fri, May 27, 2016 at 02:14:32PM +0900, Minchan Kim wrote:

On Thu, May 26, 2016 at 04:15:28PM -0700, Shi, Yang wrote:

On 5/25/2016 5:37 PM, Minchan Kim wrote:

On Tue, May 24, 2016 at 11:58:11AM +0900, Minchan Kim wrote:

On Mon, May 23, 2016 at 10:16:08AM -0700, Yang Shi wrote:

Per the discussion with Joonsoo Kim [1], we need to check the return value of
lookup_page_ext() for all call sites, since it might return NULL in some cases,
although it is unlikely, e.g. memory hotplug.

Tested with ltp with "page_owner=0".

[1] http://lkml.kernel.org/r/20160519002809.GA10245@js1304-P5Q-DELUXE

Signed-off-by: Yang Shi <yang@linaro.org>


I didn't read the code in detail to see how the page_ext memory space is
allocated at boot and during memory hotplug, but to me it's not good
to check for NULL whenever we call lookup_page_ext().

The more dangerous thing is that page_ext is now used by optional features (i.e. not
critical for system stability), but if we want to use page_ext as
another important tool for the system in the future,
it could be a serious problem.


Hello, Minchan.


Hi Joonsoo,



I wonder how pages that aren't managed by the kernel yet could cause a serious
problem. Until onlining, these pages are out of our scope. Any
information about them would be useless until they are actually
activated. I guess that returning NULL for those pages will not hurt
any functionality. Do you have any possible scenario where this causes a
serious problem?


I don't have any specific use case now. That's why I said "in the future".
And I don't want to argue whether there is a possible scenario or not
to make the feature useful, but if you want, I could write a novel.
One example that pops into my mind: xen, hv and even memory_hotplug itself
might want to use page_ext as a functionality extension to hook
guest pages.


There is no detail, so I can't guess how it would be used or how it would
cause a serious problem. But we can do that when it is really needed.



My opinion is that page_ext is an extension of struct page, so it would
be better to allow any operation on struct page without limitation
if we can do it. Whether it's useful or useless depends on the
use case, and we don't need to impose that limit from the beginning.


If there were no drawback, it would be a better design. But we have a
trade-off: for the case where memory is added but not
onlined, there is a memory saving if we allocate page_ext later.
So, in the current situation where no user requires such a
guarantee, I don't think it's worth doing right now.


However, the current design allows deferred page_ext population, so any user
of page_ext should keep that in mind and should either have a fallback plan
or not use page_ext for those cases. If we decide to go this way through
discussion, at least we should state that limitation more clearly
somewhere while we're at it, maybe around the page_ext_operation->need comment.


Agreed.


Okay, we realized from this discussion that, by design, a user of page_ext
at the moment should know that page_ext access from a page can fail,
so every caller should be prepared for it.

Shi, Yang, please include a comment about that in your patch to keep
further reviewers from wasting time repeating the same discussion,
and so clients of page_ext know the limitation.




My comment's point is that we should at least consider that approach. It's
worth discussing the pros and cons, what's best, and what makes us
hesitate to go that way if we can't.


Yes, your suggestion would be a good future direction, but, for now,
I think that inserting NULL checks at all callers is the right fix.

1) The current design, where page_ext is allocated at online time, is a design
decision of page_ext to save as much memory as possible. Fixing a
possible problem within this design decision looks good to me.


Okay. Shi Yang, please include this comment in your patch, too.


There is already such a comment in lookup_page_ext():

 * The sanity checks the page allocator does upon freeing a
 * page can reach here before the page_ext arrays are
 * allocated when feeding a range of pages to the allocator
 * for the first time during bootup or memory hotplug.

I will add more details, e.g. newly added but not onlined memory could
reach here too.
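
A sketch of the expanded comment (wording illustrative, not the posted
patch):

/*
 * The sanity checks the page allocator does upon freeing a page can
 * reach here before the page_ext arrays are allocated when feeding a
 * range of pages to the allocator for the first time during bootup or
 * memory hotplug.  Memory that has been hot-added but not yet onlined
 * can reach here as well, since its page_ext is only allocated when
 * the section is onlined.
 */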






2) Maybe we need to backport fixes because it would crash older
kernels. In this case, the NULL-check fix is easy to backport.


Agreed.

Then Shi Yang needs to mark the patch for stable.


Agreed.


Shi, please resend your patch with thorough testing and a better
description, marking it for stable.


Will come with an incremental patch to remove the CONFIG_DEBUG_VM #ifdef and
improve the comments, since the comments are covered by it.


Thanks,
Yang


And I know another race problem about Shi's patch.
I will reply to the thread.

Re: [PATCH] mm: check the return value of lookup_page_ext for all call sites

2016-06-01 Thread Shi, Yang

On 5/29/2016 11:11 PM, Minchan Kim wrote:

On Fri, May 27, 2016 at 11:16:41AM -0700, Shi, Yang wrote:





If we go this way, how do we guard against this race?


Thanks for pointing this out. It sounds reasonable. However, this
should only be possible on 32 bit, since only the 32-bit
version of page_is_idle() calls lookup_page_ext(); the 64-bit version
doesn't.

And such a race condition should exist regardless of whether DEBUG_VM
is enabled or not, right?

RCU might be good enough to protect it.

A quick fix may look like:

diff --git a/include/linux/page_idle.h b/include/linux/page_idle.h
index 8f5d4ad..bf0cd6a 100644
--- a/include/linux/page_idle.h
+++ b/include/linux/page_idle.h
@@ -77,8 +77,12 @@ static inline bool
test_and_clear_page_young(struct page *page)
 static inline bool page_is_idle(struct page *page)
 {
struct page_ext *page_ext;
+
+   rcu_read_lock();
page_ext = lookup_page_ext(page);
+   rcu_read_unlock();
+
if (unlikely(!page_ext))
return false;

diff --git a/mm/page_ext.c b/mm/page_ext.c
index 56b160f..94927c9 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -183,7 +183,6 @@ struct page_ext *lookup_page_ext(struct page *page)
 {
unsigned long pfn = page_to_pfn(page);
struct mem_section *section = __pfn_to_section(pfn);
-#if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAGE_POISONING)
/*
 * The sanity checks the page allocator does upon freeing a
 * page can reach here before the page_ext arrays are
@@ -195,7 +194,7 @@ struct page_ext *lookup_page_ext(struct page *page)
 */
if (!section->page_ext)
return NULL;
-#endif
+
return section->page_ext + pfn;
 }

@@ -279,7 +278,8 @@ static void __free_page_ext(unsigned long pfn)
return;
base = ms->page_ext + pfn;
free_page_ext(base);
-   ms->page_ext = NULL;
+   rcu_assign_pointer(ms->page_ext, NULL);
+   synchronize_rcu();


How does it fix the problem?
I cannot understand your point.


Assigning the NULL pointer to page_ext will be blocked until the rcu_read_lock()
critical section is done, so the lookup and write operations will be
serialized. And rcu_read_lock() disables preemption too.
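
In canonical RCU terms, the pairing being aimed at is roughly the following
(a sketch of the semantics, not a further patch; the function names are
hypothetical, ms/base are from the quoted diff, and note that RCU also
expects the pointer to be unpublished before the backing memory is freed,
so free_page_ext() would have to move after synchronize_rcu()):

/* Writer (offline path): unpublish, wait out readers, then free */
static void free_page_ext_sketch(struct mem_section *ms, struct page_ext *base)
{
	rcu_assign_pointer(ms->page_ext, NULL);	/* unpublish first */
	synchronize_rcu();			/* no reader still holds the old pointer */
	free_page_ext(base);			/* now safe to free */
}

/* Reader: every dereference stays inside the critical section */
static bool page_ext_idle_sketch(struct mem_section *ms, unsigned long pfn)
{
	struct page_ext *page_ext;
	bool idle = false;

	rcu_read_lock();
	page_ext = rcu_dereference(ms->page_ext);
	if (page_ext)
		idle = test_bit(PAGE_EXT_IDLE, &(page_ext + pfn)->flags);
	rcu_read_unlock();

	return idle;
}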


Yang




 }

 static int __meminit online_page_ext(unsigned long start_pfn,

Thanks,
Yang



CPU A (reader)                          CPU B (hotplug)
kpageflags_read
  stable_page_flags
    page_is_idle
      lookup_page_ext
        section = __pfn_to_section(pfn)
                                        offline_pages
                                          memory_notify(MEM_OFFLINE)
                                            offline_page_ext
                                              ms->page_ext = NULL
        section->page_ext + pfn



Thanks.







Re: [PATCH] mm: check the return value of lookup_page_ext for all call sites

2016-05-27 Thread Shi, Yang

On 5/27/2016 1:02 PM, Andrew Morton wrote:

On Thu, 26 May 2016 16:15:28 -0700 "Shi, Yang" <yang@linaro.org> wrote:



I hope we consider this direction, too.


Yang, Could you think about this?


Thanks a lot for the suggestion. Sorry for the late reply, I was busy
preparing patches. I do agree this is a direction we should look into,
but I haven't had time to think about it more deeply. I hope Joonsoo can
chime in too, since he is the original author of page extension.



Also, your patch was broken, I think.
It doesn't work with !CONFIG_DEBUG_VM && !CONFIG_PAGE_POISONING, because
lookup_page_ext doesn't return NULL in that case.


Actually, I think the #ifdef should be removed if lookup_page_ext() can
return NULL. It makes no sense to return NULL only
when DEBUG_VM is enabled; it should return NULL no matter which debug
config is selected. If Joonsoo agrees with me, I'm going to come up with
a patch to fix it.



I've lost the plot here.  What is the status of this patch?

Latest version:


Yes, this is the latest version. We are discussing some future
optimizations.


Also, Minchan Kim pointed out a possible race condition that exists even
before this patch. I proposed a quick fix; once they are happy with
the fix, I will post it to the mailing list.


Thanks,
Yang



From: Yang Shi <yang@linaro.org>
Subject: mm: check the return value of lookup_page_ext for all call sites

Per the discussion with Joonsoo Kim [1], we need to check the return value of
lookup_page_ext() for all call sites, since it might return NULL in some
cases, although it is unlikely, e.g. memory hotplug.

Tested with ltp with "page_owner=0".

[1] http://lkml.kernel.org/r/20160519002809.GA10245@js1304-P5Q-DELUXE

[a...@linux-foundation.org: fix build-breaking typos]
[a...@arndb.de: fix build problems from lookup_page_ext]
  Link: http://lkml.kernel.org/r/6285269.2CksypHdYp@wuerfel
[a...@linux-foundation.org: coding-style fixes]
Link: 
http://lkml.kernel.org/r/1464023768-31025-1-git-send-email-yang....@linaro.org
Signed-off-by: Yang Shi <yang@linaro.org>
Signed-off-by: Arnd Bergmann <a...@arndb.de>
Cc: Joonsoo Kim <iamjoonsoo@lge.com>
Signed-off-by: Andrew Morton <a...@linux-foundation.org>
---

 include/linux/page_idle.h |   43 ++--
 mm/page_alloc.c   |6 +
 mm/page_owner.c   |   26 +
 mm/page_poison.c  |8 +-
 mm/vmstat.c   |2 +
 5 files changed, 77 insertions(+), 8 deletions(-)

diff -puN 
include/linux/page_idle.h~mm-check-the-return-value-of-lookup_page_ext-for-all-call-sites
 include/linux/page_idle.h
--- 
a/include/linux/page_idle.h~mm-check-the-return-value-of-lookup_page_ext-for-all-call-sites
+++ a/include/linux/page_idle.h
@@ -46,33 +46,62 @@ extern struct page_ext_operations page_i

 static inline bool page_is_young(struct page *page)
 {
-	return test_bit(PAGE_EXT_YOUNG, &lookup_page_ext(page)->flags);
+	struct page_ext *page_ext = lookup_page_ext(page);
+
+	if (unlikely(!page_ext))
+		return false;
+
+	return test_bit(PAGE_EXT_YOUNG, &page_ext->flags);
 }

 static inline void set_page_young(struct page *page)
 {
-	set_bit(PAGE_EXT_YOUNG, &lookup_page_ext(page)->flags);
+	struct page_ext *page_ext = lookup_page_ext(page);
+
+	if (unlikely(!page_ext))
+		return;
+
+	set_bit(PAGE_EXT_YOUNG, &page_ext->flags);
 }

 static inline bool test_and_clear_page_young(struct page *page)
 {
-	return test_and_clear_bit(PAGE_EXT_YOUNG,
-				  &lookup_page_ext(page)->flags);
+	struct page_ext *page_ext = lookup_page_ext(page);
+
+	if (unlikely(!page_ext))
+		return false;
+
+	return test_and_clear_bit(PAGE_EXT_YOUNG, &page_ext->flags);
 }

 static inline bool page_is_idle(struct page *page)
 {
-	return test_bit(PAGE_EXT_IDLE, &lookup_page_ext(page)->flags);
+	struct page_ext *page_ext = lookup_page_ext(page);
+
+	if (unlikely(!page_ext))
+		return false;
+
+	return test_bit(PAGE_EXT_IDLE, &page_ext->flags);
 }

 static inline void set_page_idle(struct page *page)
 {
-	set_bit(PAGE_EXT_IDLE, &lookup_page_ext(page)->flags);
+	struct page_ext *page_ext = lookup_page_ext(page);
+
+	if (unlikely(!page_ext))
+		return;
+
+	set_bit(PAGE_EXT_IDLE, &page_ext->flags);
 }

 static inline void clear_page_idle(struct page *page)
 {
-	clear_bit(PAGE_EXT_IDLE, &lookup_page_ext(page)->flags);
+	struct page_ext *page_ext = lookup_page_ext(page);
+
+	if (unlikely(!page_ext))
+		return;
+
+	clear_bit(PAGE_EXT_IDLE, &page_ext->flags);
 }
 #endif /* CONFIG_64BIT */

diff -puN 
mm/page_alloc.c~mm-check-the-return-value-of-lookup_page_ext-for-all-call-sites 
mm/page_alloc.c
--- 
a/mm/page_alloc.c~mm-check-the-return-value-of-lookup_page_ext-for-all-call-sites
+++ a/mm/page_alloc.c
@@ -656,6 +656,9 @@ static inline void set_page_guard(struct
	return;

	page_ext = lookup_page_ex


Re: [PATCH] mm: check the return value of lookup_page_ext for all call sites

2016-05-27 Thread Shi, Yang

On 5/27/2016 1:11 AM, Minchan Kim wrote:

On Fri, May 27, 2016 at 03:08:39PM +0900, Joonsoo Kim wrote:

On Fri, May 27, 2016 at 02:14:32PM +0900, Minchan Kim wrote:

On Thu, May 26, 2016 at 04:15:28PM -0700, Shi, Yang wrote:

On 5/25/2016 5:37 PM, Minchan Kim wrote:

On Tue, May 24, 2016 at 11:58:11AM +0900, Minchan Kim wrote:

On Mon, May 23, 2016 at 10:16:08AM -0700, Yang Shi wrote:

Per the discussion with Joonsoo Kim [1], we need to check the return value of
lookup_page_ext() for all call sites, since it might return NULL in some cases,
although it is unlikely, e.g. memory hotplug.

Tested with ltp with "page_owner=0".

[1] http://lkml.kernel.org/r/20160519002809.GA10245@js1304-P5Q-DELUXE

Signed-off-by: Yang Shi <yang@linaro.org>


I didn't read the code in detail to see how the page_ext memory space is
allocated at boot and during memory hotplug, but to me it's not good
to check for NULL whenever we call lookup_page_ext().

The more dangerous thing is that page_ext is now used by optional features (i.e. not
critical for system stability), but if we want to use page_ext as
another important tool for the system in the future,
it could be a serious problem.


Hello, Minchan.


Hi Joonsoo,



I wonder how pages that aren't managed by the kernel yet could cause a serious
problem. Until onlining, these pages are out of our scope. Any
information about them would be useless until they are actually
activated. I guess that returning NULL for those pages will not hurt
any functionality. Do you have any possible scenario where this causes a
serious problem?


I don't have any specific use case now. That's why I said "in the future".
And I don't want to argue whether there is a possible scenario or not
to make the feature useful, but if you want, I could write a novel.
One example that pops into my mind: xen, hv and even memory_hotplug itself
might want to use page_ext as a functionality extension to hook
guest pages.

My opinion is that page_ext is an extension of struct page, so it would
be better to allow any operation on struct page without limitation
if we can do it. Whether it's useful or useless depends on the
use case, and we don't need to impose that limit from the beginning.

However, the current design allows deferred page_ext population, so any user
of page_ext should keep that in mind and should either have a fallback plan
or not use page_ext for those cases. If we decide to go this way through
discussion, at least we should state that limitation more clearly
somewhere while we're at it, maybe around the page_ext_operation->need comment.

My comment's point is that we should at least consider that approach. It's
worth discussing the pros and cons, what's best, and what makes us
hesitate to go that way if we can't.



And allocating such memory space doesn't come for free. If someone
just adds the memory device and doesn't online it, this memory will be

Here are several questions.
CCed the hotplug guys.

1.
If someone just adds the memory device without onlining it, can kernel code
return pfn_valid == true on the offlined page?

2.
If so, it means the memmap for offline memory is already populated somewhere.
Where is the memmap allocated? Part of the offlined memory space, or other memory?

3. Could we allocate page_ext in part of the offline memory space so that
it doesn't consume online memory?


wasted. I don't know if there is such a use case, but it's a possible
scenario.






Can we put some hooks for page_ext into memory hotplug to guarantee
that the page_ext memory space is allocated together with the memmap
space? IOW, once any PFN walker finds a page is valid, its page_ext
is valid too, so lookup_page_ext never returns NULL on a valid page
by design.

I hope we consider this direction, too.


Yang, Could you think about this?


Thanks a lot for the suggestion. Sorry for the late reply, I was
busy preparing patches. I do agree this is a direction we should
look into, but I haven't had time to think about it more deeply. I hope
Joonsoo can chime in too, since he is the original author of page
extension.



Also, your patch was broken, I think.
It doesn't work with !CONFIG_DEBUG_VM && !CONFIG_PAGE_POISONING, because
lookup_page_ext doesn't return NULL in that case.


Actually, I think the #ifdef should be removed if lookup_page_ext()
can return NULL. It makes no sense to return NULL
only when DEBUG_VM is enabled; it should return NULL no matter which
debug config is selected. If Joonsoo agrees with me, I'm going to
come up with a patch to fix it.


Agreed but let's wait for Minchan's response.


If we go this way, how do we guard against this race?


Thanks for pointing this out. It sounds reasonable. However, this should
only be possible on 32 bit, since only the 32-bit version of
page_is_idle() calls lookup_page_ext(); the 64-bit version doesn't.


And such a race condition should exist regardless of whether DEBUG_VM is
enabled or not, right?


RCU might be good enough to protect it.

A quick fix may look like:

diff --git a/include/linux/page_idle.h b/include/linux/page_idle.h


Re: [PATCH] arm64: kasan: instrument user memory access API

2016-05-27 Thread Shi, Yang

On 5/27/2016 10:46 AM, Mark Rutland wrote:

On Fri, May 27, 2016 at 09:34:03AM -0700, Shi, Yang wrote:

On 5/27/2016 4:02 AM, Andrey Ryabinin wrote:



On 05/26/2016 09:43 PM, Yang Shi wrote:

The upstream commit 1771c6e1a567ea0ba20a4ffe68a1419fd8ef
("x86/kasan: instrument user memory access API") added KASAN instrumentation to
the x86 user memory access API, so add the same instrumentation to ARM64 too.

Tested with the test_kasan module.

Signed-off-by: Yang Shi <yang@linaro.org>
---
arch/arm64/include/asm/uaccess.h | 18 --
1 file changed, 16 insertions(+), 2 deletions(-)


Please, cover __copy_from_user() and __copy_to_user() too.
Unlike x86, your patch doesn't instrument these two.


Argh, I missed those when reviewing. My bad.


I should have elaborated on this in my review. Yes, I did think about it,
but unlike x86, __copy_to/from_user are implemented in asm code on
ARM64. If I add kasan_check_read/write into them, I have to move the
registers around to prepare the parameters for the kasan calls, then
restore them after the call; for example, the code below for
__copy_to_user:

mov x9, x0
mov x10, x1
mov x11, x2
mov x0, x10
mov x1, x11
bl  kasan_check_read
mov x0, x9
mov x1, x10


There's no need to alter the assembly.

Rename the functions (e.g. have __arch_raw_copy_from_user), and add
static inline wrappers in uaccess.h that do the kasan calls before
calling the assembly functions.

That gives the compiler the freedom to do the right thing, and avoids
horrible ifdeffery in the assembly code.
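
Concretely, the suggestion amounts to the shape below (a sketch; the real
v2 patch quoted earlier in this digest does exactly this for both
directions):

/* asm entry point, renamed so that C can wrap it */
extern unsigned long __arch_copy_to_user(void __user *to, const void *from,
					 unsigned long n);

static inline unsigned long __copy_to_user(void __user *to, const void *from,
					   unsigned long n)
{
	kasan_check_read(from, n);	/* instrument before entering asm */
	return __arch_copy_to_user(to, from, n);
}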


Thanks for the suggestion, will address in v2.

Yang




So I'm wondering whether it is worth it, since __copy_to/from_user
are only called in a couple of places, e.g. sctp, a couple of
drivers, etc., and not used much.


[mark@leverpostej:~/src/linux]% git grep -w __copy_to_user -- ^arch | wc -l
63
[mark@leverpostej:~/src/linux]% git grep -w __copy_from_user -- ^arch | wc -l
47

That's a reasonable number of callsites.

If we're going to bother adding this, it should be complete. So please
do update __copy_from_user and __copy_to_user.


Actually, I think some of them
could be replaced by __copy_to/from_user_inatomic.


Given the number of existing callers outside of arch code, I think we'll
get far more traction reworking the arm64 parts for now.

Thanks,
Mark.





Re: [PATCH] arm64: kasan: instrument user memory access API

2016-05-27 Thread Shi, Yang

On 5/27/2016 4:02 AM, Andrey Ryabinin wrote:



On 05/26/2016 09:43 PM, Yang Shi wrote:

The upstream commit 1771c6e1a567ea0ba20a4ffe68a1419fd8ef
("x86/kasan: instrument user memory access API") added KASAN instrumentation to
the x86 user memory access API, so add the same instrumentation to ARM64 too.

Tested with the test_kasan module.

Signed-off-by: Yang Shi <yang@linaro.org>
---
 arch/arm64/include/asm/uaccess.h | 18 --
 1 file changed, 16 insertions(+), 2 deletions(-)


Please, cover __copy_from_user() and __copy_to_user() too.
Unlike x86, your patch doesn't instrument these two.


I should have elaborated on this in my review. Yes, I did think about it, but
unlike x86, __copy_to/from_user are implemented in asm code on ARM64. If
I add kasan_check_read/write into them, I have to move the registers
around to prepare the parameters for the kasan calls, then restore them
after the call; for example, the code below for __copy_to_user:


mov x9, x0
mov x10, x1
mov x11, x2
mov x0, x10
mov x1, x11
bl  kasan_check_read
mov x0, x9
mov x1, x10


So I'm wondering whether it is worth it, since __copy_to/from_user are
only called in a couple of places, e.g. sctp, a couple of drivers, etc.,
and not used much. Actually, I think some of them could be replaced
by __copy_to/from_user_inatomic.


Any ideas are appreciated.

Thanks,
Yang








Re: [PATCH] mm: check the return value of lookup_page_ext for all call sites

2016-05-26 Thread Shi, Yang

On 5/25/2016 5:37 PM, Minchan Kim wrote:

On Tue, May 24, 2016 at 11:58:11AM +0900, Minchan Kim wrote:

On Mon, May 23, 2016 at 10:16:08AM -0700, Yang Shi wrote:

Per the discussion with Joonsoo Kim [1], we need to check the return value of
lookup_page_ext() at all call sites, since it might return NULL in some cases,
although it is unlikely, e.g. memory hotplug.

Tested with ltp with "page_owner=0".

[1] http://lkml.kernel.org/r/20160519002809.GA10245@js1304-P5Q-DELUXE

Signed-off-by: Yang Shi 


I didn't read the code in detail to see how the page_ext memory space is
allocated in boot code and memory hotplug, but to me it's not good
to check for NULL whenever we call lookup_page_ext.

More dangerous is that page_ext is now used by optional features (i.e., not
critical for system stability), but if we want to use page_ext as
another important tool for the system in the future,
it could be a serious problem.

Can we put some page_ext hooks into memory hotplug to guarantee
that the page_ext memory space is allocated at the same time as the
memmap space? IOW, once any PFN walker finds a page is valid, page_ext
is valid too, so lookup_page_ext never returns NULL on a valid page
by design.

I hope we consider this direction, too.


Yang, Could you think about this?


Thanks a lot for the suggestion. Sorry for the late reply, I was busy 
preparing patches. I do agree this is a direction we should look into, 
but I haven't had time to think about it more deeply. I hope Joonsoo could 
chime in too since he is the original author of page extension.
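
(For reference, mm/page_ext.c of that era already registers a memory-hotplug
notifier along these lines; the sketch below is paraphrased from memory and
details such as the exact action cases may differ. The guarantee Minchan asks
for amounts to having this path allocate page_ext for every pfn range before
any pfn walker can see the pages:)

	static int __meminit page_ext_callback(struct notifier_block *self,
					       unsigned long action, void *arg)
	{
		struct memory_notify *mn = arg;
		int ret = 0;

		switch (action) {
		case MEM_GOING_ONLINE:
			/* allocate page_ext before the range goes live */
			ret = online_page_ext(mn->start_pfn,
					      mn->nr_pages, mn->status_change_nid);
			break;
		case MEM_OFFLINE:
		case MEM_CANCEL_ONLINE:
			offline_page_ext(mn->start_pfn,
					 mn->nr_pages, mn->status_change_nid);
			break;
		default:
			break;
		}

		return notifier_from_errno(ret);
	}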




Also, your patch is broken, I think:
it doesn't work with !CONFIG_DEBUG_VM && !CONFIG_PAGE_POISONING, because
lookup_page_ext doesn't return NULL in that case.


Actually, I think the #ifdef should be removed if lookup_page_ext() can 
return NULL. It does not make sense to return NULL only 
when DEBUG_VM is enabled; it should return NULL no matter what debug 
config is selected. If Joonsoo agrees with me I'm going to come up with 
a patch to fix it.
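
(For reference, the asymmetry being objected to comes from the shape of
lookup_page_ext() in mm/page_ext.c at the time; a simplified sketch of the
SPARSEMEM variant, paraphrased from memory -- the FLATMEM one is analogous:)

	struct page_ext *lookup_page_ext(struct page *page)
	{
		unsigned long pfn = page_to_pfn(page);
		struct mem_section *section = __pfn_to_section(pfn);
	#if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAGE_POISONING)
		/*
		 * Freeing can reach here before the page_ext arrays are
		 * allocated during bootup or memory hotplug, but the NULL
		 * return is compiled in only for these configs.
		 */
		if (!section->page_ext)
			return NULL;
	#endif
		return section->page_ext + pfn;
	}

(Removing the #ifdef, as suggested above, would make the NULL return
unconditional, so the callers' NULL checks become meaningful in every
configuration.)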


Regards,
Yang





Thanks.


---
 include/linux/page_idle.h | 43 ---
 mm/page_alloc.c   |  6 ++
 mm/page_owner.c   | 27 +++
 mm/page_poison.c  |  8 +++-
 mm/vmstat.c   |  2 ++
 5 files changed, 78 insertions(+), 8 deletions(-)

diff --git a/include/linux/page_idle.h b/include/linux/page_idle.h
index bf268fa..8f5d4ad 100644
--- a/include/linux/page_idle.h
+++ b/include/linux/page_idle.h
@@ -46,33 +46,62 @@ extern struct page_ext_operations page_idle_ops;

 static inline bool page_is_young(struct page *page)
 {
-	return test_bit(PAGE_EXT_YOUNG, &lookup_page_ext(page)->flags);
+	struct page_ext *page_ext;
+	page_ext = lookup_page_ext(page);
+	if (unlikely(!page_ext)
+		return false;
+
+	return test_bit(PAGE_EXT_YOUNG, &page_ext->flags);
 }

 static inline void set_page_young(struct page *page)
 {
-	set_bit(PAGE_EXT_YOUNG, &lookup_page_ext(page)->flags);
+	struct page_ext *page_ext;
+	page_ext = lookup_page_ext(page);
+	if (unlikely(!page_ext)
+		return;
+
+	set_bit(PAGE_EXT_YOUNG, &page_ext->flags);
 }

 static inline bool test_and_clear_page_young(struct page *page)
 {
-	return test_and_clear_bit(PAGE_EXT_YOUNG,
-				  &lookup_page_ext(page)->flags);
+	struct page_ext *page_ext;
+	page_ext = lookup_page_ext(page);
+	if (unlikely(!page_ext)
+		return false;
+
+	return test_and_clear_bit(PAGE_EXT_YOUNG, &page_ext->flags);
 }

 static inline bool page_is_idle(struct page *page)
 {
-	return test_bit(PAGE_EXT_IDLE, &lookup_page_ext(page)->flags);
+	struct page_ext *page_ext;
+	page_ext = lookup_page_ext(page);
+	if (unlikely(!page_ext)
+		return false;
+
+	return test_bit(PAGE_EXT_IDLE, &page_ext->flags);
 }

 static inline void set_page_idle(struct page *page)
 {
-	set_bit(PAGE_EXT_IDLE, &lookup_page_ext(page)->flags);
+	struct page_ext *page_ext;
+	page_ext = lookup_page_ext(page);
+	if (unlikely(!page_ext)
+		return;
+
+	set_bit(PAGE_EXT_IDLE, &page_ext->flags);
 }

 static inline void clear_page_idle(struct page *page)
 {
-	clear_bit(PAGE_EXT_IDLE, &lookup_page_ext(page)->flags);
+	struct page_ext *page_ext;
+	page_ext = lookup_page_ext(page);
+	if (unlikely(!page_ext)
+		return;
+
+	clear_bit(PAGE_EXT_IDLE, &page_ext->flags);
 }
 #endif /* CONFIG_64BIT */

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f8f3bfc..d27e8b9 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -656,6 +656,9 @@ static inline void set_page_guard(struct zone *zone, struct page *page,
 		return;

 	page_ext = lookup_page_ext(page);
+	if (unlikely(!page_ext))
+		return;
+
 	__set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);

 	INIT_LIST_HEAD(&page->lru);
@@ -673,6 +676,9 @@ static inline void 

Re: [PATCH] mm: use early_pfn_to_nid in register_page_bootmem_info_node

2016-05-25 Thread Shi, Yang

On 5/25/2016 3:23 PM, Andrew Morton wrote:

On Wed, 25 May 2016 14:00:07 -0700 Yang Shi <yang@linaro.org> wrote:


register_page_bootmem_info_node() is invoked in mem_init(), so it will be
called before page_alloc_init_late() if CONFIG_DEFERRED_STRUCT_PAGE_INIT
is enabled. But pfn_to_nid() depends on the memmap, which won't be fully set up
until page_alloc_init_late() is done, so replace pfn_to_nid() with
early_pfn_to_nid().


What are the runtime effects of this fix?


I didn't experience any problem without the fix. While working on the 
page_ext_init() fix (replacing pfn_to_nid() with early_pfn_to_nid()), I added a printk 
before each pfn_to_nid() call to check which ones might be called before 
page_alloc_init_late(), and this one was caught.


From the code perspective, it does not look right, since 
register_page_bootmem_info_section() may miss some pfns when 
CONFIG_DEFERRED_STRUCT_PAGE_INIT is enabled, just like the problem 
that happened in page_ext_init().


Thanks,
Yang
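
(For context, the change under discussion is a one-liner in the section loop
of register_page_bootmem_info_node() in mm/memory_hotplug.c; roughly -- a
sketch, the exact context lines may differ:)

	-		if (pfn_valid(pfn) && (pfn_to_nid(pfn) == node))
	+		if (pfn_valid(pfn) && (early_pfn_to_nid(pfn) == node))
	 			register_page_bootmem_info_section(pfn);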









Re: [PATCH] Fixing compilation error from file include/linux/page_idle.h

2016-05-25 Thread Shi, Yang

On 5/25/2016 3:11 AM, shakilk1...@gmail.com wrote:

From: shakil khan 


Thanks for the patch, this issue has already been fixed by Arnd.

Yang



---
 include/linux/page_idle.h | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/include/linux/page_idle.h b/include/linux/page_idle.h
index 569c3a1..fec4027 100644
--- a/include/linux/page_idle.h
+++ b/include/linux/page_idle.h
@@ -48,7 +48,7 @@ static inline bool page_is_young(struct page *page)
 {
struct page_ext *page_ext = lookup_page_ext(page);

-   if (unlikely(!page_ext)
+   if (unlikely(!page_ext))
return false;

	return test_bit(PAGE_EXT_YOUNG, &page_ext->flags);
@@ -58,7 +58,7 @@ static inline void set_page_young(struct page *page)
 {
struct page_ext *page_ext = lookup_page_ext(page);

-   if (unlikely(!page_ext)
+   if (unlikely(!page_ext))
return;

	set_bit(PAGE_EXT_YOUNG, &page_ext->flags);
@@ -68,7 +68,7 @@ static inline bool test_and_clear_page_young(struct page 
*page)
 {
struct page_ext *page_ext = lookup_page_ext(page);

-   if (unlikely(!page_ext)
+   if (unlikely(!page_ext))
return false;

	return test_and_clear_bit(PAGE_EXT_YOUNG, &page_ext->flags);
@@ -78,7 +78,7 @@ static inline bool page_is_idle(struct page *page)
 {
struct page_ext *page_ext = lookup_page_ext(page);

-   if (unlikely(!page_ext)
+   if (unlikely(!page_ext))
return false;

	return test_bit(PAGE_EXT_IDLE, &page_ext->flags);
@@ -88,7 +88,7 @@ static inline void set_page_idle(struct page *page)
 {
struct page_ext *page_ext = lookup_page_ext(page);

-   if (unlikely(!page_ext)
+   if (unlikely(!page_ext))
return;

	set_bit(PAGE_EXT_IDLE, &page_ext->flags);
@@ -98,7 +98,7 @@ static inline void clear_page_idle(struct page *page)
 {
struct page_ext *page_ext = lookup_page_ext(page);

-   if (unlikely(!page_ext)
+   if (unlikely(!page_ext))
return;

	clear_bit(PAGE_EXT_IDLE, &page_ext->flags);










Re: [PATCH] mm: page_is_guard return false when page_ext arrays are not allocated yet

2016-05-24 Thread Shi, Yang

On 5/23/2016 10:37 PM, Joonsoo Kim wrote:

On Fri, May 20, 2016 at 10:00:06AM -0700, Shi, Yang wrote:

On 5/19/2016 7:40 PM, Joonsoo Kim wrote:

2016-05-20 2:18 GMT+09:00 Shi, Yang <yang@linaro.org>:

On 5/18/2016 5:28 PM, Joonsoo Kim wrote:


Vlastiml, thanks for ccing me on original bug report.

On Wed, May 18, 2016 at 03:23:45PM -0700, Yang Shi wrote:


When enabling the below kernel configs:

CONFIG_DEFERRED_STRUCT_PAGE_INIT
CONFIG_DEBUG_PAGEALLOC
CONFIG_PAGE_EXTENSION
CONFIG_DEBUG_VM

kernel bootup may fail due to the following oops:

BUG: unable to handle kernel NULL pointer dereference at   (null)
IP: [] free_pcppages_bulk+0x2d2/0x8d0
PGD 0
Oops:  [#1] PREEMPT SMP DEBUG_PAGEALLOC
Modules linked in:
CPU: 11 PID: 106 Comm: pgdatinit1 Not tainted 4.6.0-rc5-next-20160427 #26
Hardware name: Intel Corporation S5520HC/S5520HC, BIOS
S5500.86B.01.10.0025.030220091519 03/02/2009
task: 88017c080040 ti: 88017c084000 task.ti: 88017c084000
RIP: 0010:[]  []
free_pcppages_bulk+0x2d2/0x8d0
RSP: :88017c087c48  EFLAGS: 00010046
RAX:  RBX:  RCX: 0001
RDX: 0980 RSI: 0080 RDI: 00660401
RBP: 88017c087cd0 R08: 0401 R09: 0009
R10: 88017c080040 R11: 000a R12: 0400
R13: ea001981 R14: ea0019810040 R15: 88066cfe6080
FS:  () GS:88066cd4()
knlGS:
CS:  0010 DS:  ES:  CR0: 80050033
CR2:  CR3: 02406000 CR4: 06e0
Stack:
88066cd5bbd8 88066cfe6640  
001f001f 88066cd5bbe8 ea001981 8118f53e
0009 0401 000a 0001
Call Trace:
[] free_hot_cold_page+0x192/0x1d0
[] __free_pages+0x5c/0x90
[] __free_pages_boot_core+0x11a/0x14e
[] deferred_free_range+0x50/0x62
[] deferred_init_memmap+0x220/0x3c3
[] ? setup_per_cpu_pageset+0x35/0x35
[] kthread+0xf8/0x110
[] ret_from_fork+0x22/0x40
[] ? kthread_create_on_node+0x200/0x200
Code: 49 89 d4 48 c1 e0 06 49 01 c5 e9 de fe ff ff 4c 89 f7 44 89 4d b8
4c 89 45 c0 44 89 5d c8 48 89 4d d0 e8 62 c7 07 00 48 8b 4d d0 <48> 8b 00 44
8b 5d c8 4c 8b 45 c0 44 8b 4d b8 a8 02 0f 84 05 ff
RIP  [] free_pcppages_bulk+0x2d2/0x8d0
RSP 
CR2: 

The problem is lookup_page_ext() returns NULL then page_is_guard() tried
to
access it in page freeing.

page_is_guard() depends on PAGE_EXT_DEBUG_GUARD bit of page extension
flag, but
freeing page might reach here before the page_ext arrays are allocated
when
feeding a range of pages to the allocator for the first time during
bootup or
memory hotplug.



The patch itself looks fine to me because I also found that this kind of
problem happens during memory hotplug. So, we need to fix more sites,
all callers of lookup_page_ext().



Yes, I agree. I will come up with a patch or a couple of patches to check
the return value of lookup_page_ext().



But, I'd like to know how your problem occurs during bootup.
debug_guardpage_enabled() is turned to 'enable' after page_ext is
initialized. Before that, page_is_guard() unconditionally returns
false so I think that the problem what you mentioned can't happen.

Could you check that when debug_guardpage_enabled() returns 'enable'
and init_section_page_ext() is called?



I think the problem is that I have CONFIG_DEFERRED_STRUCT_PAGE_INIT enabled,
which defers some struct page initialization to a "pgdatinitX" kernel
thread in page_alloc_init_late(). But page_ext_init() is called before it.
So it leads debug_guardpage_enabled() to return true, while page extension is
not allocated yet for the struct pages initialized by "pgdatinitX".


No. After page_ext_init(), it is ensured that all page extension is initialized.


It sounds like page_ext_init() should be called after page_alloc_init_late(). Or
it should just be incompatible with CONFIG_DEFERRED_STRUCT_PAGE_INIT.

I will try to move the init call around.


We need to investigate more. I guess that the problem is introduced by
CONFIG_DEFERRED_STRUCT_PAGE_INIT. It makes pfn_to_nid() invalid
until page_alloc_init_late() is done. That is a big side effect. If
there is a pfn walker
that uses pfn_to_nid() between memmap_init_zone() and page_alloc_init_late(),
it also has the same problem. So, we need to think about how to fix it more
carefully.


Thanks for the analysis. I think you are correct: pfn_to_nid()
depends on the memmap, which has not been fully set up
until page_alloc_init_late() is done.

So, for such a use case early_pfn_to_nid() should be used.
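
(The reason early_pfn_to_nid() is safe here while pfn_to_nid() is not: the
two helpers read the node id from different places. A simplified sketch of
the SPARSEMEM helper from include/linux/mmzone.h; the early variant lives in
mm/page_alloc.c and consults the memblock-based early node map, which is
complete long before the deferred memmap init finishes:)

	/*
	 * SPARSEMEM pfn_to_nid(): reads the node id out of page->flags,
	 * which deferred struct page init has not written yet.
	 */
	static inline int pfn_to_nid(unsigned long pfn)
	{
		return page_to_nid(pfn_to_page(pfn));
	}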



Anyway, to make sure that my assumption is true, could you confirm that
below change fix your problem?


Yes, it does.



Thanks.

->8--
diff --git a/mm/page_ext.c b/mm/page_ext.c
index 2d864e6..cac5dc9 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -391,7 +391,7 @@ void __init page_ext_init(void)
   


Re: [PATCH] mm: fix build problems from lookup_page_ext

2016-05-24 Thread Shi, Yang

Hi Arnd,

Thanks a lot for the patch. My bad, sorry for the inconvenience. I 
omitted that the specific page_idle change is for 32-bit only.

And my host compiler is old (still 4.8), so it might not have caught 
the warning. I will update my compiler.
the warning. I will update my compiler.


Regards,
Yang


On 5/24/2016 3:08 AM, Arnd Bergmann wrote:

A patch for lookup_page_ext introduced several build errors and
warnings, e.g.

mm/page_owner.c: In function '__set_page_owner':
mm/page_owner.c:71:2: error: ISO C90 forbids mixed declarations and code 
[-Werror=declaration-after-statement]
include/linux/page_idle.h: In function 'set_page_young':
include/linux/page_idle.h:62:3: error: expected ')' before 'return'

This fixes all of them. Please fold into the original patch.

Signed-off-by: Arnd Bergmann 
Fixes: 38c4fffbad3c ("mm: check the return value of lookup_page_ext for all call 
sites")

diff --git a/include/linux/page_idle.h b/include/linux/page_idle.h
index 569c3a180625..fec40271339f 100644
--- a/include/linux/page_idle.h
+++ b/include/linux/page_idle.h
@@ -48,7 +48,7 @@ static inline bool page_is_young(struct page *page)
 {
struct page_ext *page_ext = lookup_page_ext(page);

-   if (unlikely(!page_ext)
+   if (unlikely(!page_ext))
return false;

	return test_bit(PAGE_EXT_YOUNG, &page_ext->flags);
@@ -58,7 +58,7 @@ static inline void set_page_young(struct page *page)
 {
struct page_ext *page_ext = lookup_page_ext(page);

-   if (unlikely(!page_ext)
+   if (unlikely(!page_ext))
return;

	set_bit(PAGE_EXT_YOUNG, &page_ext->flags);
@@ -68,7 +68,7 @@ static inline bool test_and_clear_page_young(struct page 
*page)
 {
struct page_ext *page_ext = lookup_page_ext(page);

-   if (unlikely(!page_ext)
+   if (unlikely(!page_ext))
return false;

	return test_and_clear_bit(PAGE_EXT_YOUNG, &page_ext->flags);
@@ -78,7 +78,7 @@ static inline bool page_is_idle(struct page *page)
 {
struct page_ext *page_ext = lookup_page_ext(page);

-   if (unlikely(!page_ext)
+   if (unlikely(!page_ext))
return false;

	return test_bit(PAGE_EXT_IDLE, &page_ext->flags);
@@ -88,7 +88,7 @@ static inline void set_page_idle(struct page *page)
 {
struct page_ext *page_ext = lookup_page_ext(page);

-   if (unlikely(!page_ext)
+   if (unlikely(!page_ext))
return;

	set_bit(PAGE_EXT_IDLE, &page_ext->flags);
@@ -98,7 +98,7 @@ static inline void clear_page_idle(struct page *page)
 {
struct page_ext *page_ext = lookup_page_ext(page);

-   if (unlikely(!page_ext)
+   if (unlikely(!page_ext))
return;

	clear_bit(PAGE_EXT_IDLE, &page_ext->flags);
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 902e39813295..c6cda3e36212 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -65,9 +65,6 @@ void __set_page_owner(struct page *page, unsigned int order, 
gfp_t gfp_mask)
 {
struct page_ext *page_ext = lookup_page_ext(page);

-   if (unlikely(!page_ext))
-   return;
-
struct stack_trace trace = {
.nr_entries = 0,
.max_entries = ARRAY_SIZE(page_ext->trace_entries),
@@ -75,6 +72,9 @@ void __set_page_owner(struct page *page, unsigned int order, 
gfp_t gfp_mask)
.skip = 3,
};

+   if (unlikely(!page_ext))
+   return;
+
	save_stack_trace(&trace);

page_ext->order = order;
@@ -111,12 +111,11 @@ void __copy_page_owner(struct page *oldpage, struct page 
*newpage)
 {
struct page_ext *old_ext = lookup_page_ext(oldpage);
struct page_ext *new_ext = lookup_page_ext(newpage);
+   int i;

if (unlikely(!old_ext || !new_ext))
return;

-   int i;
-
new_ext->order = old_ext->order;
new_ext->gfp_mask = old_ext->gfp_mask;
new_ext->nr_entries = old_ext->nr_entries;
@@ -204,11 +203,6 @@ err:
 void __dump_page_owner(struct page *page)
 {
struct page_ext *page_ext = lookup_page_ext(page);
-   if (unlikely(!page_ext)) {
-   pr_alert("There is not page extension available.\n");
-   return;
-   }
-
struct stack_trace trace = {
.nr_entries = page_ext->nr_entries,
		.entries = &page_ext->trace_entries[0],
@@ -216,6 +210,11 @@ void __dump_page_owner(struct page *page)
gfp_t gfp_mask = page_ext->gfp_mask;
int mt = gfpflags_to_migratetype(gfp_mask);

+   if (unlikely(!page_ext)) {
+   pr_alert("There is not page extension available.\n");
+   return;
+   }
+
	if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) {
pr_alert("page_owner info is not active (free page?)\n");
return;
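
(The reshuffling in the page_owner.c hunks above is needed because the kernel
builds with -Werror=declaration-after-statement, i.e. C90 rules: once a
statement such as the early return appears in a block, no further declarations
may follow in it. A minimal illustration with hypothetical names:)

	void f(struct page *page)
	{
		struct page_ext *page_ext = lookup_page_ext(page);

		if (unlikely(!page_ext))
			return;		/* a statement ...                  */

		int i;			/* ... so this declaration is an    */
	}				/* error under C90 rules            */

(Moving the declaration up with the others, as the patch does, keeps the NULL
check while satisfying the rule.)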






Re: [PATCH] mm: make CONFIG_DEFERRED_STRUCT_PAGE_INIT depends on !FLATMEM explicitly

2016-05-23 Thread Shi, Yang

On 5/23/2016 11:22 AM, Michal Hocko wrote:

On Mon 23-05-16 09:54:31, Yang Shi wrote:

Per the suggestion from Michal Hocko [1], CONFIG_DEFERRED_STRUCT_PAGE_INIT
should be incompatible with FLATMEM; make this explicit in Kconfig.


I guess the changelog could benefit from some clarification. What
do you think about the following:

"
DEFERRED_STRUCT_PAGE_INIT requires some ordering wrt other
initialization operations, e.g. page_ext_init has to happen after the
whole memmap is initialized properly. For SPARSEMEM this requires to
wait for page_alloc_init_late. Other memory models (e.g. flatmem) might
have different initialization layouts (page_ext_init_flatmem). Currently
DEFERRED_STRUCT_PAGE_INIT depends on MEMORY_HOTPLUG which in turn
depends on SPARSEMEM || X86_64_ACPI_NUMA
depends on ARCH_ENABLE_MEMORY_HOTPLUG

and X86_64_ACPI_NUMA depends on NUMA which in turn disable FLATMEM
memory model:
config ARCH_FLATMEM_ENABLE
def_bool y
depends on X86_32 && !NUMA

so FLATMEM is ruled out via dependency maze. Be explicit and disable
FLATMEM for DEFERRED_STRUCT_PAGE_INIT so that we do not reintroduce
subtle initialization bugs
"


Thanks a lot. It sounds way better. Will address in V2.

Yang





[1] http://lkml.kernel.org/r/20160523073157.gd2...@dhcp22.suse.cz

Signed-off-by: Yang Shi <yang@linaro.org>
---
 mm/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mm/Kconfig b/mm/Kconfig
index 2664c11..22fa818 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -649,6 +649,7 @@ config DEFERRED_STRUCT_PAGE_INIT
default n
depends on ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
depends on MEMORY_HOTPLUG
+   depends on !FLATMEM
help
  Ordinarily all struct pages are initialised during early boot in a
  single thread. On very large machines this can take a considerable
--
2.0.2







Re: [v2 PATCH] mm: move page_ext_init after all struct pages are initialized

2016-05-23 Thread Shi, Yang

On 5/23/2016 12:31 AM, Michal Hocko wrote:

On Fri 20-05-16 08:41:09, Shi, Yang wrote:

On 5/20/2016 6:16 AM, Michal Hocko wrote:

On Thu 19-05-16 15:13:26, Yang Shi wrote:
[...]

diff --git a/init/main.c b/init/main.c
index b3c6e36..2075faf 100644
--- a/init/main.c
+++ b/init/main.c
@@ -606,7 +606,6 @@ asmlinkage __visible void __init start_kernel(void)
initrd_start = 0;
}
 #endif
-   page_ext_init();
debug_objects_mem_init();
kmemleak_init();
setup_per_cpu_pageset();
@@ -1004,6 +1003,8 @@ static noinline void __init kernel_init_freeable(void)
sched_init_smp();

page_alloc_init_late();
+	/* Initialize page ext after all struct pages are initialized */
+   page_ext_init();

do_basic_setup();


I might be missing something but don't we have the same problem with
CONFIG_FLATMEM? page_ext_init_flatmem is called way earlier. Or
CONFIG_DEFERRED_STRUCT_PAGE_INIT is never enabled for CONFIG_FLATMEM?


Yes, CONFIG_DEFERRED_STRUCT_PAGE_INIT depends on MEMORY_HOTPLUG which
depends on SPARSEMEM. So, this config is not valid for FLATMEM at all.


Well
config MEMORY_HOTPLUG
bool "Allow for memory hot-add"
depends on SPARSEMEM || X86_64_ACPI_NUMA
depends on ARCH_ENABLE_MEMORY_HOTPLUG

I wasn't really sure about X86_64_ACPI_NUMA dependency branch which
depends on X86_64 && NUMA && ACPI && PCI and that didn't sound like
SPARSEMEM only. If the FLATMEM shouldn't exist with


Actually, FLATMEM depends on !NUMA.


CONFIG_DEFERRED_STRUCT_PAGE_INIT can we make that explicit please?


Sure, it makes the condition clearer and more readable.

Thanks,
Yang








Re: [PATCH] mm: page_is_guard return false when page_ext arrays are not allocated yet

2016-05-20 Thread Shi, Yang

On 5/19/2016 7:40 PM, Joonsoo Kim wrote:

2016-05-20 2:18 GMT+09:00 Shi, Yang <yang@linaro.org>:

On 5/18/2016 5:28 PM, Joonsoo Kim wrote:


Vlastiml, thanks for ccing me on original bug report.

On Wed, May 18, 2016 at 03:23:45PM -0700, Yang Shi wrote:


When enabling the below kernel configs:

CONFIG_DEFERRED_STRUCT_PAGE_INIT
CONFIG_DEBUG_PAGEALLOC
CONFIG_PAGE_EXTENSION
CONFIG_DEBUG_VM

kernel bootup may fail due to the following oops:

BUG: unable to handle kernel NULL pointer dereference at   (null)
IP: [] free_pcppages_bulk+0x2d2/0x8d0
PGD 0
Oops:  [#1] PREEMPT SMP DEBUG_PAGEALLOC
Modules linked in:
CPU: 11 PID: 106 Comm: pgdatinit1 Not tainted 4.6.0-rc5-next-20160427 #26
Hardware name: Intel Corporation S5520HC/S5520HC, BIOS
S5500.86B.01.10.0025.030220091519 03/02/2009
task: 88017c080040 ti: 88017c084000 task.ti: 88017c084000
RIP: 0010:[]  []
free_pcppages_bulk+0x2d2/0x8d0
RSP: :88017c087c48  EFLAGS: 00010046
RAX:  RBX:  RCX: 0001
RDX: 0980 RSI: 0080 RDI: 00660401
RBP: 88017c087cd0 R08: 0401 R09: 0009
R10: 88017c080040 R11: 000a R12: 0400
R13: ea001981 R14: ea0019810040 R15: 88066cfe6080
FS:  () GS:88066cd4()
knlGS:
CS:  0010 DS:  ES:  CR0: 80050033
CR2:  CR3: 02406000 CR4: 06e0
Stack:
 88066cd5bbd8 88066cfe6640  
 001f001f 88066cd5bbe8 ea001981 8118f53e
 0009 0401 000a 0001
Call Trace:
 [] free_hot_cold_page+0x192/0x1d0
 [] __free_pages+0x5c/0x90
 [] __free_pages_boot_core+0x11a/0x14e
 [] deferred_free_range+0x50/0x62
 [] deferred_init_memmap+0x220/0x3c3
 [] ? setup_per_cpu_pageset+0x35/0x35
 [] kthread+0xf8/0x110
 [] ret_from_fork+0x22/0x40
 [] ? kthread_create_on_node+0x200/0x200
Code: 49 89 d4 48 c1 e0 06 49 01 c5 e9 de fe ff ff 4c 89 f7 44 89 4d b8
4c 89 45 c0 44 89 5d c8 48 89 4d d0 e8 62 c7 07 00 48 8b 4d d0 <48> 8b 00 44
8b 5d c8 4c 8b 45 c0 44 8b 4d b8 a8 02 0f 84 05 ff
RIP  [] free_pcppages_bulk+0x2d2/0x8d0
 RSP 
CR2: 

The problem is lookup_page_ext() returns NULL then page_is_guard() tried
to
access it in page freeing.

page_is_guard() depends on PAGE_EXT_DEBUG_GUARD bit of page extension
flag, but
freeing page might reach here before the page_ext arrays are allocated
when
feeding a range of pages to the allocator for the first time during
bootup or
memory hotplug.



The patch itself looks fine to me because I also found that this kind of
problem happens during memory hotplug. So, we need to fix more sites,
all callers of lookup_page_ext().



Yes, I agree. I will come up with a patch or a couple of patches to check
the return value of lookup_page_ext().



But, I'd like to know how your problem occurs during bootup.
debug_guardpage_enabled() is turned to 'enable' after page_ext is
initialized. Before that, page_is_guard() unconditionally returns
false so I think that the problem what you mentioned can't happen.

Could you check that when debug_guardpage_enabled() returns 'enable'
and init_section_page_ext() is called?



I think the problem is that I have CONFIG_DEFERRED_STRUCT_PAGE_INIT enabled,
which defers some struct page initialization to a "pgdatinitX" kernel
thread in page_alloc_init_late(). But page_ext_init() is called before it.
So it leads debug_guardpage_enabled() to return true, while page extension is
not allocated yet for the struct pages initialized by "pgdatinitX".


No. After page_ext_init(), it is ensured that all page extension is initialized.


It sounds like page_ext_init() should be called after page_alloc_init_late(). Or
it should just be incompatible with CONFIG_DEFERRED_STRUCT_PAGE_INIT.

I will try to move the init call around.


We need to investigate more. I guess that the problem is introduced by
CONFIG_DEFERRED_STRUCT_PAGE_INIT. It makes pfn_to_nid() invalid
until page_alloc_init_late() is done. That is a big side effect. If
there is a pfn walker
that uses pfn_to_nid() between memmap_init_zone() and page_alloc_init_late(),
it also has the same problem. So, we need to think about how to fix it more
carefully.


Thanks for the analysis. I think you are correct: pfn_to_nid() 
depends on the memmap, which has not been fully set up 
until page_alloc_init_late() is done.

So, for such a use case early_pfn_to_nid() should be used.



Anyway, to make sure that my assumption is true, could you confirm that
below change fix your problem?


Yes, it does.



Thanks.

->8--
diff --git a/mm/page_ext.c b/mm/page_ext.c
index 2d864e6..cac5dc9 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -391,7 +391,7 @@ void __init page_ext_init(void)
 * -pfn-->
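
(The quoted hunk is cut off above. Judging from the surrounding discussion,
it is the pfn loop in page_ext_init(), where the truncated comment is the
nodes-layout diagram and the node check becomes, roughly:)

	 		/*
	 		 * Nodes' pfns can be overlapping.
	 		 * We know some arch can have a nodes layout such as
	 		 * -------------pfn-------------->
	 		 * N0 | N1 | N2 | N0 | N1 | N2 | ...
	 		 */
	-		if (pfn_to_nid(pfn) != nid)
	+		if (early_pfn_to_nid(pfn) != nid)
	 			continue;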


Re: [v2 PATCH] mm: move page_ext_init after all struct pages are initialized

2016-05-20 Thread Shi, Yang

On 5/20/2016 6:16 AM, Michal Hocko wrote:

On Thu 19-05-16 15:13:26, Yang Shi wrote:
[...]

diff --git a/init/main.c b/init/main.c
index b3c6e36..2075faf 100644
--- a/init/main.c
+++ b/init/main.c
@@ -606,7 +606,6 @@ asmlinkage __visible void __init start_kernel(void)
initrd_start = 0;
}
 #endif
-   page_ext_init();
debug_objects_mem_init();
kmemleak_init();
setup_per_cpu_pageset();
@@ -1004,6 +1003,8 @@ static noinline void __init kernel_init_freeable(void)
sched_init_smp();

page_alloc_init_late();
+	/* Initialize page ext after all struct pages are initialized */
+   page_ext_init();

do_basic_setup();


I might be missing something but don't we have the same problem with
CONFIG_FLATMEM? page_ext_init_flatmem is called way earlier. Or
CONFIG_DEFERRED_STRUCT_PAGE_INIT is never enabled for CONFIG_FLATMEM?


Yes, CONFIG_DEFERRED_STRUCT_PAGE_INIT depends on MEMORY_HOTPLUG which 
depends on SPARSEMEM. So, this config is not valid for FLATMEM at all.


Thanks,
Yang








Re: [PATCH] mm: move page_ext_init after all struct pages are initialized

2016-05-19 Thread Shi, Yang

On 5/19/2016 4:21 PM, Andrew Morton wrote:

On Thu, 19 May 2016 15:35:15 -0700 "Shi, Yang" <yang@linaro.org> wrote:


On 5/19/2016 3:30 PM, Andrew Morton wrote:

On Thu, 19 May 2016 14:29:05 -0700 Yang Shi <yang@linaro.org> wrote:


When DEFERRED_STRUCT_PAGE_INIT is enabled, just a subset of the memmap is
initialized at boot; the rest is initialized in parallel by starting a one-off
"pgdatinitX" kernel thread for each node X.

If page_ext_init is called before that, some pages will not have a valid extension,
so move page_ext_init() after it.



When fixing a bug, please fully describe the end-user impact
of that bug


The kernel ran into the oops below, which is the same as the oops reported
in
http://ozlabs.org/~akpm/mmots/broken-out/mm-page_is_guard-return-false-when-page_ext-arrays-are-not-allocated-yet.patch.


So this patch makes
mm-page_is_guard-return-false-when-page_ext-arrays-are-not-allocated-yet.patch
obsolete?


Actually, no. Checking the return value of lookup_page_ext() is still 
needed. But the commit log needs to be amended, since that bootup oops 
won't happen anymore with this patch applied.


Thanks,
Yang








Re: [PATCH] mm: move page_ext_init after all struct pages are initialized

2016-05-19 Thread Shi, Yang

On 5/19/2016 3:30 PM, Andrew Morton wrote:

On Thu, 19 May 2016 14:29:05 -0700 Yang Shi <yang@linaro.org> wrote:


When DEFERRED_STRUCT_PAGE_INIT is enabled, just a subset of the memmap is
initialized at boot; the rest is initialized in parallel by starting a one-off
"pgdatinitX" kernel thread for each node X.

If page_ext_init is called before that, some pages will not have a valid extension,
so move page_ext_init() after it.



When fixing a bug, please fully describe the end-user impact
of that bug


The kernel ran into the oops below, which is the same as the oops reported 
in 
http://ozlabs.org/~akpm/mmots/broken-out/mm-page_is_guard-return-false-when-page_ext-arrays-are-not-allocated-yet.patch.


BUG: unable to handle kernel NULL pointer dereference at   (null)
IP: [] free_pcppages_bulk+0x2d2/0x8d0
PGD 0
Oops:  [#1] PREEMPT SMP DEBUG_PAGEALLOC
Modules linked in:
CPU: 11 PID: 106 Comm: pgdatinit1 Not tainted 4.6.0-rc5-next-20160427 #26
Hardware name: Intel Corporation S5520HC/S5520HC, BIOS 
S5500.86B.01.10.0025.030220091519 03/02/2009

task: 88017c080040 ti: 88017c084000 task.ti: 88017c084000
RIP: 0010:[]  [] 
free_pcppages_bulk+0x2d2/0x8d0

RSP: :88017c087c48  EFLAGS: 00010046
RAX:  RBX:  RCX: 0001
RDX: 0980 RSI: 0080 RDI: 00660401
RBP: 88017c087cd0 R08: 0401 R09: 0009
R10: 88017c080040 R11: 000a R12: 0400
R13: ea001981 R14: ea0019810040 R15: 88066cfe6080
FS:  () GS:88066cd4() knlGS:
CS:  0010 DS:  ES:  CR0: 80050033
CR2:  CR3: 02406000 CR4: 06e0
Stack:
 88066cd5bbd8 88066cfe6640  
 001f001f 88066cd5bbe8 ea001981 8118f53e
 0009 0401 000a 0001
Call Trace:
 [] free_hot_cold_page+0x192/0x1d0
 [] __free_pages+0x5c/0x90
 [] __free_pages_boot_core+0x11a/0x14e
 [] deferred_free_range+0x50/0x62
 [] deferred_init_memmap+0x220/0x3c3
 [] ? setup_per_cpu_pageset+0x35/0x35
 [] kthread+0xf8/0x110
 [] ret_from_fork+0x22/0x40
 [] ? kthread_create_on_node+0x200/0x200
Code: 49 89 d4 48 c1 e0 06 49 01 c5 e9 de fe ff ff 4c 89 f7 44 89 4d b8 
4c 89 45 c0 44 89 5d c8 48 89 4d d0 e8 62 c7 07 00 48 8b 4d d0 <48> 8b 
00 44 8b 5d c8 4c 8b 45 c0 44 8b 4d b8 a8 02 0f 84 05 ff

RIP  [] free_pcppages_bulk+0x2d2/0x8d0
 RSP 
CR2: 


I will add the oops info into the commit log in V2.

Thanks,
Yang







Re: [PATCH] mm: page_is_guard return false when page_ext arrays are not allocated yet

2016-05-19 Thread Shi, Yang

On 5/18/2016 5:28 PM, Joonsoo Kim wrote:

Vlastiml, thanks for ccing me on original bug report.

On Wed, May 18, 2016 at 03:23:45PM -0700, Yang Shi wrote:

When enabling the below kernel configs:

CONFIG_DEFERRED_STRUCT_PAGE_INIT
CONFIG_DEBUG_PAGEALLOC
CONFIG_PAGE_EXTENSION
CONFIG_DEBUG_VM

kernel bootup may fail due to the following oops:

BUG: unable to handle kernel NULL pointer dereference at   (null)
IP: [] free_pcppages_bulk+0x2d2/0x8d0
PGD 0
Oops:  [#1] PREEMPT SMP DEBUG_PAGEALLOC
Modules linked in:
CPU: 11 PID: 106 Comm: pgdatinit1 Not tainted 4.6.0-rc5-next-20160427 #26
Hardware name: Intel Corporation S5520HC/S5520HC, BIOS 
S5500.86B.01.10.0025.030220091519 03/02/2009
task: 88017c080040 ti: 88017c084000 task.ti: 88017c084000
RIP: 0010:[]  [] 
free_pcppages_bulk+0x2d2/0x8d0
RSP: :88017c087c48  EFLAGS: 00010046
RAX:  RBX:  RCX: 0001
RDX: 0980 RSI: 0080 RDI: 00660401
RBP: 88017c087cd0 R08: 0401 R09: 0009
R10: 88017c080040 R11: 000a R12: 0400
R13: ea001981 R14: ea0019810040 R15: 88066cfe6080
FS:  () GS:88066cd4() knlGS:
CS:  0010 DS:  ES:  CR0: 80050033
CR2:  CR3: 02406000 CR4: 06e0
Stack:
 88066cd5bbd8 88066cfe6640  
 001f001f 88066cd5bbe8 ea001981 8118f53e
 0009 0401 000a 0001
Call Trace:
 [] free_hot_cold_page+0x192/0x1d0
 [] __free_pages+0x5c/0x90
 [] __free_pages_boot_core+0x11a/0x14e
 [] deferred_free_range+0x50/0x62
 [] deferred_init_memmap+0x220/0x3c3
 [] ? setup_per_cpu_pageset+0x35/0x35
 [] kthread+0xf8/0x110
 [] ret_from_fork+0x22/0x40
 [] ? kthread_create_on_node+0x200/0x200
Code: 49 89 d4 48 c1 e0 06 49 01 c5 e9 de fe ff ff 4c 89 f7 44 89 4d b8 4c 89 45 c0 
44 89 5d c8 48 89 4d d0 e8 62 c7 07 00 48 8b 4d d0 <48> 8b 00 44 8b 5d c8 4c 8b 
45 c0 44 8b 4d b8 a8 02 0f 84 05 ff
RIP  [] free_pcppages_bulk+0x2d2/0x8d0
 RSP 
CR2: 

The problem is that lookup_page_ext() returns NULL, and page_is_guard() then
tries to dereference it when freeing pages.

page_is_guard() depends on the PAGE_EXT_DEBUG_GUARD bit of the page extension
flags, but a page being freed might reach here before the page_ext arrays are
allocated, when feeding a range of pages to the allocator for the first time
during bootup or memory hotplug.


The patch itself looks fine to me because I also found that this kind of
problem happens during memory hotplug. So, we need to fix more sites:
all callers of lookup_page_ext().


Yes, I agree. I will come up with a patch or a couple of patches to 
check the return value of lookup_page_ext().
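
The caller-side fix ends up looking roughly like this for the page_is_guard()
case (a sketch; the same NULL-check pattern applies to the other
lookup_page_ext() callers):

    static inline bool page_is_guard(struct page *page)
    {
            struct page_ext *page_ext;

            if (!debug_guardpage_enabled())
                    return false;

            /* page_ext arrays may not be allocated yet for this page */
            page_ext = lookup_page_ext(page);
            if (unlikely(!page_ext))
                    return false;

            return test_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
    }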




But, I'd like to know how your problem occurs during bootup.
debug_guardpage_enabled() is turned to 'enable' after page_ext is
initialized. Before that, page_is_guard() unconditionally returns
false, so I think the problem you mentioned can't happen.

Could you check when debug_guardpage_enabled() starts returning 'enable'
relative to when init_section_page_ext() is called?


I think the problem is that I have CONFIG_DEFERRED_STRUCT_PAGE_INIT enabled, 
which defers initialization of some struct pages to the "pgdatinitX" kernel 
threads started in page_alloc_init_late(). But page_ext_init() is called before it.


So, debug_guardpage_enabled() returns true, but the page extension arrays 
are not allocated yet for the struct pages initialized by "pgdatinitX".


It sounds like page_ext_init() should be called after page_alloc_init_late(), 
or it should just be made incompatible with CONFIG_DEFERRED_STRUCT_PAGE_INIT.


I will try to move the init call around.

Thanks,
Yang



And, the above comment would be stale because it dates from when memcg used
this struct page extension functionality. Now memcg doesn't use it and there
have been some changes in this area, so I'm not sure it is still true.

Thanks.





Re: [BUG] Null pointer dereference when freeing pages on 4.6-rc6

2016-05-18 Thread Shi, Yang

On 5/16/2016 5:44 AM, Vlastimil Babka wrote:

[+CC Joonsoo based on git blame]

On 05/05/2016 11:13 PM, Shi, Yang wrote:

Hi folks,

When I enable the below kernel configs on 4.6-rc6, I came across a null
pointer dereference issue during boot.

CONFIG_SPARSEMEM
CONFIG_DEFERRED_STRUCT_PAGE_INIT
CONFIG_DEBUG_PAGEALLOC
CONFIG_PAGE_EXTENSION
CONFIG_DEBUG_VM


The splat is:

BUG: unable to handle kernel NULL pointer dereference at   (null)
IP: [] page_is_buddy+0x7b/0xe0
PGD 0
Oops:  [#1] PREEMPT SMP DEBUG_PAGEALLOC
Modules linked in:
CPU: 3 PID: 106 Comm: pgdatinit1 Not tainted 4.6.0-rc6 #8
Hardware name: Intel Corporation S5520HC/S5520HC, BIOS
S5500.86B.01.10.0025.030220091519 03/02/2009
task: 88017c1d0040 ti: 88017c1d4000 task.ti: 88017c1d4000
RIP: 0010:[]  []
page_is_buddy+0x7b/0xe0
RSP: :88017c1d7bf0  EFLAGS: 00010046
RAX:  RBX: ea0019810040 RCX: 
RDX:  RSI: ea0019810040 RDI: 00660401
RBP: 88017c1d7c08 R08: 0001 R09: 
R10: 01af R11: 0001 R12: ea001981
R13:  R14: 0009 R15: ea001981
FS:  () GS:88066cc4()
knlGS:
CS:  0010 DS:  ES:  CR0: 80050033
CR2:  CR3: 02406000 CR4: 06e0
Stack:
   1981  88066cfe6080 88017c1d7c70
   8118bfea 000a 1600 0001
   0401 ea0019810040 0400 88066cc5aca8
Call Trace:
   [] __free_one_page+0x23a/0x450
   [] free_pcppages_bulk+0x136/0x360
   [] free_hot_cold_page+0x168/0x1b0
   [] __free_pages+0x5c/0x90
   [] __free_pages_boot_core.isra.70+0x11a/0x14d
   [] deferred_free_range+0x50/0x62
   [] deferred_init_memmap+0x220/0x3c3
   [] ? deferred_free_range+0x62/0x62
   [] kthread+0xf8/0x110
   [] ret_from_fork+0x22/0x40
   [] ? kthread_create_on_node+0x200/0x200
Code: 75 7b 48 89 d8 8b 40 1c 85 c0 74 50 48 c7 c6 38 bd 0d 82 48 89 df
e8 25 e2 02 00 0f 0b 48 89 f7 89 55 ec e8 18 cb 07 00 8b 55 ec <48> 8b
00 a8 02 74 9d 3b 53 30 75 98 49 8b 14 24 48 8b 03 48 c1
RIP  [] page_is_buddy+0x7b/0xe0
   RSP 
CR2: 
---[ end trace e0c05a86b43d97f9 ]---
note: pgdatinit1[106] exited with preempt_count 1


I changed page_is_buddy and __free_one_page to non-inline to get more
accurate stack trace.


Then I did some investigation on it with printing the address of page
and buddy, please see the below log:

@@__free_one_page:715: page is at ea0005f05c00 buddy is at
ea0005f05c80, order is 1
@@__free_one_page:715: page is at ea0005f05c00 buddy is at
ea0005f05d00, order is 2
@@__free_one_page:715: page is at ea0005f05c00 buddy is at
ea0005f05e00, order is 3
@@__free_one_page:715: page is at ea001981 buddy is at
ea0019810040, order is 0

call trace splat

@@__free_one_page:715: page is at ea0005f05bc0 buddy is at
ea0005f05b80, order is 0
@@__free_one_page:715: page is at ea0005f05b80 buddy is at
ea0005f05bc0, order is 0
@@__free_one_page:715: page is at ea0005f05b80 buddy is at
ea0005f05b00, order is 1
@@__free_one_page:715: page is at ea0005f05b40 buddy is at
ea0005f05b00, order is 0
@@__free_one_page:715: page is at ea0005f05b00 buddy is at
ea0005f05b40, order is 0

It shows that just before the call trace splat, the page address jumped from
ea0005f05xxx to ea001981; I'm not sure why this is happening.
Any hint is appreciated.


I think the page address didn't jump, it was that the previous call of
__free_one_page() finished at order=3 and now another one was invoked
that started at order=0 page ea001981 and immediately hit the
splat.


And, reading the code leads me to the below call path:

page_is_buddy()
--> page_is_guard()
--> lookup_page_ext()


From a quick decodecode I guess that's indeed the path that's crashed.
You could try verifying with e.g. addr2line on your vmlinux?


I tracked down there via gdb.




Then lookup_page_ext() just returns NULL due to the below code:

#if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAGE_POISONING)
  /*
   * The sanity checks the page allocator does upon freeing a
   * page can reach here before the page_ext arrays are
   * allocated when feeding a range of pages to the allocator
   * for the first time during bootup or memory hotplug.
   *
   * This check is also necessary for ensuring page poisoning
   * works as expected when enabled
   */
  if (!section->page_ext)
  return NULL;
#endif

So, according to the comment, it looks like there should be a WARN or BUG if
it returns NULL?


The comment reads like it's unlikely, but possible, so a WARN/BUG doesn't
sound right.


And, almost no code checks whether the returned pointer is null after 
lookup_page_ext() is called.

Re: [PATCH net-next] bpf: arm64: remove callee-save registers use for tmp registers

2016-05-16 Thread Shi, Yang

On 5/16/2016 4:45 PM, Z Lim wrote:

Hi Yang,

On Mon, May 16, 2016 at 4:09 PM, Yang Shi <yang@linaro.org> wrote:

In the current implementation of the ARM64 eBPF JIT, R23 and R24 are used as
tmp registers; both are callee-saved registers. This leads to a variable size
of the JIT prologue and epilogue. The latest blinding constant change prefers a
constant size of prologue and epilogue. AAPCS reserves R9 ~ R15 as temp
registers which need not be saved/restored across function calls. So, replace
R23 and R24 with R10 and R11, and remove the tmp_used flag.

CC: Zi Shen Lim <zlim@gmail.com>
CC: Daniel Borkmann <dan...@iogearbox.net>
Signed-off-by: Yang Shi <yang@linaro.org>
---


Couple suggestions, but otherwise:
Acked-by: Zi Shen Lim <zlim@gmail.com>

1. Update the diagram. I think it should now be:

-* BPF fp register => -80:+-+ <= (BPF_FP)
+* BPF fp register => -64:+-+ <= (BPF_FP)


Nice catch. I forgot the stack diagram.



2. Add a comment in commit log along the lines of: this is an
optimization saving 2 instructions per jited BPF program.


Sure, will address in V2.
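
For reference, the register remap amounts to roughly the following in the
bpf2a64 map in arch/arm64/net/bpf_jit_comp.c (a sketch of the end state, not
the full diff):

    /* temporary registers for internal BPF JIT */
    [TMP_REG_1] = A64_R(10),    /* was A64_R(23), callee-saved */
    [TMP_REG_2] = A64_R(11),    /* was A64_R(24), callee-saved */

Since x10/x11 are caller-saved scratch registers under AAPCS64, the prologue
and epilogue no longer need to save and restore them, which is the two
instructions saved per jited program mentioned above.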

Thanks,
Yang



Thanks :)

z


Apply on top of Daniel's blinding constant patchset.

 arch/arm64/net/bpf_jit_comp.c | 32 
 1 file changed, 4 insertions(+), 28 deletions(-)





Re: [PATCH] mm: slab: remove ZONE_DMA_FLAG

2016-05-09 Thread Shi, Yang

On 5/5/2016 4:49 AM, Michal Hocko wrote:

On Wed 04-05-16 10:01:37, Yang Shi wrote:

Now we have the IS_ENABLED() helper to check whether a Kconfig option is
enabled, so ZONE_DMA_FLAG is no longer useful.

And, the use of ZONE_DMA_FLAG in slab looks pointless according to the
comment [1] from Johannes Weiner, so remove it; ORing the passed-in flags with
the cache GFP flags is already done in kmem_getpages().

[1] https://lkml.org/lkml/2014/9/25/553


I haven't checked the patch but I have a formal suggestion.
lkml.org tends to break and forget, please use
http://lkml.kernel.org/r/$msg-id instead. In this case
http://lkml.kernel.org/r/20140925185047.ga21...@cmpxchg.org


Thanks for the suggestion. Will use msg-id in later post.
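
For what it's worth, the mechanical part of the change is replacing the
config-derived macro with the helper, roughly (a sketch; the exact macro
location differs in the tree):

    /* before: a macro defined per-config */
    #ifdef CONFIG_ZONE_DMA
    #define ZONE_DMA_FLAG 1
    #else
    #define ZONE_DMA_FLAG 0
    #endif

    if (ZONE_DMA_FLAG) ...

    /* after: let the compiler eliminate the dead branch directly */
    if (IS_ENABLED(CONFIG_ZONE_DMA)) ...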

Regards,
Yang



Thanks!





[BUG] Null pointer dereference when freeing pages on 4.6-rc6

2016-05-05 Thread Shi, Yang

Hi folks,

When I enable the below kernel configs on 4.6-rc6, I came across a null 
pointer dereference issue during boot.


CONFIG_SPARSEMEM
CONFIG_DEFERRED_STRUCT_PAGE_INIT
CONFIG_DEBUG_PAGEALLOC
CONFIG_PAGE_EXTENSION
CONFIG_DEBUG_VM


The splat is:

BUG: unable to handle kernel NULL pointer dereference at   (null)
IP: [] page_is_buddy+0x7b/0xe0
PGD 0
Oops:  [#1] PREEMPT SMP DEBUG_PAGEALLOC
Modules linked in:
CPU: 3 PID: 106 Comm: pgdatinit1 Not tainted 4.6.0-rc6 #8
Hardware name: Intel Corporation S5520HC/S5520HC, BIOS 
S5500.86B.01.10.0025.030220091519 03/02/2009

task: 88017c1d0040 ti: 88017c1d4000 task.ti: 88017c1d4000
RIP: 0010:[]  [] page_is_buddy+0x7b/0xe0
RSP: :88017c1d7bf0  EFLAGS: 00010046
RAX:  RBX: ea0019810040 RCX: 
RDX:  RSI: ea0019810040 RDI: 00660401
RBP: 88017c1d7c08 R08: 0001 R09: 
R10: 01af R11: 0001 R12: ea001981
R13:  R14: 0009 R15: ea001981
FS:  () GS:88066cc4() knlGS:
CS:  0010 DS:  ES:  CR0: 80050033
CR2:  CR3: 02406000 CR4: 06e0
Stack:
 1981  88066cfe6080 88017c1d7c70
 8118bfea 000a 1600 0001
 0401 ea0019810040 0400 88066cc5aca8
Call Trace:
 [] __free_one_page+0x23a/0x450
 [] free_pcppages_bulk+0x136/0x360
 [] free_hot_cold_page+0x168/0x1b0
 [] __free_pages+0x5c/0x90
 [] __free_pages_boot_core.isra.70+0x11a/0x14d
 [] deferred_free_range+0x50/0x62
 [] deferred_init_memmap+0x220/0x3c3
 [] ? deferred_free_range+0x62/0x62
 [] kthread+0xf8/0x110
 [] ret_from_fork+0x22/0x40
 [] ? kthread_create_on_node+0x200/0x200
Code: 75 7b 48 89 d8 8b 40 1c 85 c0 74 50 48 c7 c6 38 bd 0d 82 48 89 df 
e8 25 e2 02 00 0f 0b 48 89 f7 89 55 ec e8 18 cb 07 00 8b 55 ec <48> 8b 
00 a8 02 74 9d 3b 53 30 75 98 49 8b 14 24 48 8b 03 48 c1

RIP  [] page_is_buddy+0x7b/0xe0
 RSP 
CR2: 
---[ end trace e0c05a86b43d97f9 ]---
note: pgdatinit1[106] exited with preempt_count 1


I changed page_is_buddy and __free_one_page to non-inline to get a more 
accurate stack trace.



Then I did some investigation on it with printing the address of page 
and buddy, please see the below log:


@@__free_one_page:715: page is at ea0005f05c00 buddy is at 
ea0005f05c80, order is 1
@@__free_one_page:715: page is at ea0005f05c00 buddy is at 
ea0005f05d00, order is 2
@@__free_one_page:715: page is at ea0005f05c00 buddy is at 
ea0005f05e00, order is 3
@@__free_one_page:715: page is at ea001981 buddy is at 
ea0019810040, order is 0


call trace splat

@@__free_one_page:715: page is at ea0005f05bc0 buddy is at 
ea0005f05b80, order is 0
@@__free_one_page:715: page is at ea0005f05b80 buddy is at 
ea0005f05bc0, order is 0
@@__free_one_page:715: page is at ea0005f05b80 buddy is at 
ea0005f05b00, order is 1
@@__free_one_page:715: page is at ea0005f05b40 buddy is at 
ea0005f05b00, order is 0
@@__free_one_page:715: page is at ea0005f05b00 buddy is at 
ea0005f05b40, order is 0


It shows that just before the call trace splat, the page address jumped from 
ea0005f05xxx to ea001981; I'm not sure why this is happening. 
Any hint is appreciated.


And, reading the code leads me to the below call path:

page_is_buddy()
--> page_is_guard()
--> lookup_page_ext()

Then lookup_page_ext() just returns NULL due to the below code:

#if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAGE_POISONING)
/*
 * The sanity checks the page allocator does upon freeing a
 * page can reach here before the page_ext arrays are
 * allocated when feeding a range of pages to the allocator
 * for the first time during bootup or memory hotplug.
 *
 * This check is also necessary for ensuring page poisoning
 * works as expected when enabled
 */
if (!section->page_ext)
return NULL;
#endif

So, according to the comment, it looks like there should be a WARN or BUG if 
it returns NULL? And, almost no code checks whether the returned pointer is 
null after lookup_page_ext() is called.


Thanks,
Yang


Re: [PATCH] mm: move huge_pmd_set_accessed out of huge_memory.c

2016-04-29 Thread Shi, Yang

On 4/22/2016 2:48 AM, Kirill A. Shutemov wrote:

On Thu, Apr 21, 2016 at 03:56:07PM -0700, Shi, Yang wrote:

On 4/21/2016 12:30 AM, Kirill A. Shutemov wrote:

On Wed, Apr 20, 2016 at 02:00:11PM -0700, Shi, Yang wrote:

Hi folks,

I didn't realize pmd_* functions are protected by
CONFIG_TRANSPARENT_HUGEPAGE on most architectures before I made this
change.

Before I fix all the affected architectures' code, I want to check whether you
think this change is worthwhile.

Thanks,
Yang

On 4/20/2016 11:24 AM, Yang Shi wrote:

huge_pmd_set_accessed() is only called by __handle_mm_fault() in memory.c, so
move the definition to memory.c and make it static, like create_huge_pmd() and
wp_huge_pmd().

Signed-off-by: Yang Shi <yang@linaro.org>


On pte side we have the same functionality open-coded. Should we do the
same for pmd? Or change pte side the same way?


Sorry, I don't quite understand you. Do you mean pte_* functions?


See handle_pte_fault(), we do the same for pte there what
huge_pmd_set_accessed() does for pmd.


Thanks for directing to this code.



I think we should be consistent here: either both are abstracted into
functions or both open-coded.


I suppose functions sound better. However, do_wp_page() has to be 
called with the pte lock acquired, so the abstracted function has to call it.
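
For illustration, a hypothetical pte-side helper mirroring
huge_pmd_set_accessed() could look like this (pte_set_accessed() is a made-up
name; it assumes the pte lock is held on entry, and the write-protect path
would still have to call do_wp_page() under that lock, which is what makes the
abstraction awkward):

    static void pte_set_accessed(struct vm_area_struct *vma,
                                 unsigned long address, pte_t *pte,
                                 spinlock_t *ptl, pte_t entry, int write)
    {
            /* must be entered with the pte lock held */
            entry = pte_mkyoung(entry);
            if (ptep_set_access_flags(vma, address, pte, entry, write))
                    update_mmu_cache(vma, address, pte);
            pte_unmap_unlock(pte, ptl);
    }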


Thanks,
Yang








Re: [BUG linux-next] Kernel panic found with linux-next-20160414

2016-04-28 Thread Shi, Yang

On 4/27/2016 1:14 AM, Hugh Dickins wrote:

On Wed, 20 Apr 2016, Shi, Yang wrote:

On 4/20/2016 1:01 AM, Hugh Dickins wrote:

On Tue, 19 Apr 2016, Shi, Yang wrote:

Hi folks,

When I ran ltp on linux-next-20160414 on my ARM64 machine, I got the
below
kernel panic:

Unable to handle kernel paging request at virtual address
ffc007846000
pgd = ffc01e21d000
[ffc007846000] *pgd=, *pud=
Internal error: Oops: 9647 [#11] PREEMPT SMP
Modules linked in: loop
CPU: 7 PID: 274 Comm: systemd-journal Tainted: G  D
4.6.0-rc3-next-20160414-WR8.0.0.0_standard+ #9
Hardware name: Freescale Layerscape 2085a RDB Board (DT)
task: ffc01e3fcf80 ti: ffc01ea8c000 task.ti: ffc01ea8c000
PC is at copy_page+0x38/0x120
LR is at migrate_page_copy+0x604/0x1660
pc : [] lr : [] pstate: 2145
sp : ffc01ea8ecd0
x29: ffc01ea8ecd0 x28: 
x27: 1ff7b80240f8 x26: ffc018196f20
x25: ffbdc01e1180 x24: ffbdc01e1180
x23:  x22: ffc01e3fcf80
x21: ffc00481f000 x20: ff900a31d000
x19: ffbdc01207c0 x18: 0f00
x17:  x16: 
x15:  x14: 
x13:  x12: 
x11:  x10: 
x9 :  x8 : 
x7 :  x6 : 
x5 :  x4 : 
x3 :  x2 : 
x1 : ffc00481f080 x0 : ffc007846000

Call trace:
Exception stack(0xffc021fc2ed0 to 0xffc021fc2ff0)
2ec0:   ffbdc00887c0 ff900a31d000
2ee0: ffc021fc30f0 ff9008ff2318 2145 0025
2f00: ffbdc025a280 ffc020adc4c0 41b58ab3 ff900a085fd0
2f20: ff9008200658   ffbdc00887c0
2f40: ff900b0f1320 ffc021fc3078 41b58ab3 ff900a0864f8
2f60: ff9008210010 ffc021fb8960 ff900867bacc 1ff8043f712d
2f80: ffc021fc2fb0 ff9008210564 ffc021fc3070 ffc021fb8940
2fa0: 08221f78 ff900862f9c8 ffc021fc2fe0 ff9008215dc8
2fc0: 1ff8043f8602 ffc021fc ffc00968a000 ffc00221f080
2fe0: f9407e11d1f0 d61f02209103e210
[] copy_page+0x38/0x120
[] migrate_page+0x74/0x98
[] nfs_migrate_page+0x58/0x80
[] move_to_new_page+0x15c/0x4d8
[] migrate_pages+0x7c8/0x11f0
[] compact_zone+0xdfc/0x2570
[] compact_zone_order+0xe0/0x170
[] try_to_compact_pages+0x2e8/0x8f8
[] __alloc_pages_direct_compact+0x100/0x540
[] __alloc_pages_nodemask+0xc40/0x1c58
[] khugepaged+0x468/0x19c8
[] kthread+0x248/0x2c0
[] ret_from_fork+0x10/0x40
Code: d281f012 91020021 f1020252 d503201f (a8000c02)


I did some initial investigation and found it is caused by
DEBUG_PAGEALLOC
and CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT. And, mainline 4.6-rc3 works
well.

It should not be arch-specific, although I caught it on ARM64. I
suspect
this might be caused by Hugh's huge tmpfs patches.


Thanks for testing.  It might be caused by my patches, but I don't think
that's very likely.  This is page migraton for compaction, in the service
of anon THP's khugepaged; and I wonder if you were even exercising huge
tmpfs when running LTP here (it certainly can be done: I like to mount a
huge tmpfs on /opt/ltp and install there, with shmem_huge 2 so any other
tmpfs mounts are also huge).


Some further investigation shows I get the panic even though I don't have
tmpfs mounted with huge=1 or shmem_huge set to 2.



There are compaction changes in linux-next too, but I don't see any
reason why they'd cause this.  I don't know arm64 traces enough to know
whether it's the source page or the destination page for the copy, but
it looks as if it has been freed (and DEBUG_PAGEALLOC unmapped) before
reaching migration's copy.


The fault address is passed in x0, which is dest in the implementation of
copy_page, so it is the destination page.
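
For context, CONFIG_DEBUG_PAGEALLOC catches exactly this kind of
use-after-free by unmapping pages from the kernel linear mapping when they are
freed, via the arch hook; conceptually (the real implementation varies per
architecture):

    /* on free: remove the pages from the kernel page tables */
    kernel_map_pages(page, 1 << order, 0);

    /* on allocation: map them back in */
    kernel_map_pages(page, 1 << order, 1);

So a migration copy into a page that has already been freed faults
immediately instead of silently corrupting memory.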



Needs more debugging, I'm afraid: is it reproducible?


Yes, as long as I enable those two PAGEALLOC debug options, I can get the
panic once I run LTP. But it is not caused by any specific LTP test case
directly; the panic happens randomly while LTP is running.


Your ping on the crash in release_freepages() reminded me to take another
look at this one.  And found that I only needed to enable DEBUG_PAGEALLOC
and run LTP to get it on x86_64 too, as you suspected.

It's another of those compaction errors, in mmotm and linux-next of a
week or two ago, whose patch has since been withdrawn (but huge tmpfs
has also been withdrawn for now, so you're right to stick with the
older linux-next for testing it).


Yes, I saw the discussion from LSF/MM 2016, and the patches are gone from my 
latest linux-next update. I will stick to 20160420 for the huge 
tmpfs testing.




I believe the patch below fixes it; but I've not done full diligence
on it - if I had more time, I'd want to check that all of the things

Re: [BUG] set_pte_at: racy dirty state clearing warning

2016-04-27 Thread Shi, Yang

On 4/21/2016 1:49 AM, Catalin Marinas wrote:

On Wed, Apr 20, 2016 at 04:01:39PM -0700, Shi, Yang wrote:

When I enable memory compaction via

# echo 1 > /proc/sys/vm/compact_memory

I got the below WARNING:

set_pte_at: racy dirty state clearing: 0x006899371bd3 ->
0x006899371fd3
[ cut here ]
WARNING: CPU: 5 PID: 294 at ./arch/arm64/include/asm/pgtable.h:227
ptep_set_access_flags+0x138/0x1b8
Modules linked in:


Do you have this patch applied:

http://article.gmane.org/gmane.linux.ports.arm.kernel/492239

It's also queued into -next as commit 66dbd6e61a52.


No, but I just applied it, it works.

Thanks,
Yang




My kernel has ARM64_HW_AFDBM enabled, but the LS2085 is not ARMv8.1.

The code shows it just checks whether ARM64_HW_AFDBM is enabled, but
doesn't check whether the CPU really has that capability.

So, it might be better to have the capability checked at runtime?


The warnings are there to spot any incorrect uses of the pte accessors
even before you run on AF/DBM-capable hardware.
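
For context, the warning comes from a DEBUG_VM check in the arm64
set_pte_at() path that catches overwriting a valid pte in a way that could
race with hardware AF/DBM updates; it is roughly of this shape (a sketch, not
the exact source):

    #if defined(CONFIG_ARM64_HW_AFDBM) && defined(CONFIG_DEBUG_VM)
            if (pte_valid(*ptep)) {
                    VM_WARN_ONCE(!pte_young(pte),
                                 "%s: racy access flag clearing: 0x%016llx -> 0x%016llx",
                                 __func__, pte_val(*ptep), pte_val(pte));
                    VM_WARN_ONCE(pte_write(*ptep) && !pte_dirty(pte),
                                 "%s: racy dirty state clearing: 0x%016llx -> 0x%016llx",
                                 __func__, pte_val(*ptep), pte_val(pte));
            }
    #endif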





Re: [BUG linux-next] kernel NULL pointer dereference on linux-next-20160420

2016-04-26 Thread Shi, Yang

Did some preliminary investigation on this one.

The problem is that the cc->freepages list is empty but cc->nr_freepages > 
0; it looks like the list and nr_freepages get out of sync somewhere.


Any hint is appreciated.
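
One way to catch the divergence earlier would be a debug-only sanity check
before the free loop, e.g. (a sketch; count_freepages() is a hypothetical
helper):

    #ifdef CONFIG_DEBUG_VM
    static unsigned long count_freepages(struct list_head *freelist)
    {
            struct page *page;
            unsigned long nr = 0;

            list_for_each_entry(page, freelist, lru)
                    nr++;
            return nr;
    }
    #endif

    /* before releasing: the list and the counter should agree */
    VM_BUG_ON(cc->nr_freepages != count_freepages(&cc->freepages));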

Thanks,
Yang


On 4/21/2016 5:38 PM, Shi, Yang wrote:

Hi folks,

I did the below test with huge tmpfs on linux-next-20160420:

# mount -t tmpfs huge=1 tmpfs /mnt
# cd /mnt
Then clone linux kernel

Then I got the below bug; the same test works well on non-huge tmpfs.

BUG: unable to handle kernel NULL pointer dereference at   (null)
IP: [] release_freepages+0x18/0xa0
PGD 0
Oops:  [#1] PREEMPT SMP
Modules linked in:
CPU: 6 PID: 110 Comm: kcompactd0 Not tainted
4.6.0-rc4-next-20160420-WR7.0.0.0_standard #4
Hardware name: Intel Corporation S5520HC/S5520HC, BIOS
S5500.86B.01.10.0025.030220091519 03/02/2009
task: 880361708040 ti: 880361704000 task.ti: 880361704000
RIP: 0010:[]  []
release_freepages+0x18/0xa0
RSP: 0018:880361707cf8  EFLAGS: 00010282
RAX:  RBX: 88036ffde7c0 RCX: 0009
RDX: 1bf1 RSI: 000f RDI: 880361707dd0
RBP: 880361707d20 R08: 0007 R09: 1600
R10: 88036ffde7c0 R11:  R12: 
R13: 880361707dd0 R14: 880361707dc0 R15: 880361707db0
FS:  () GS:880363cc()
knlGS:
CS:  0010 DS:  ES:  CR0: 80050033
CR2:  CR3: 02206000 CR4: 06e0
Stack:
  88036ffde7c0  1a00 880361707dc0
  880361707db0 880361707da0 8119f13d 81196239
   880361708040 0001 0010
Call Trace:
  [] compact_zone+0x55d/0x9f0
  [] ? fragmentation_index+0x19/0x70
  [] kcompactd_do_work+0x10f/0x230
  [] kcompactd+0x90/0x1e0
  [] ? wait_woken+0xa0/0xa0
  [] ? kcompactd_do_work+0x230/0x230
  [] kthread+0xdd/0x100
  [] ret_from_fork+0x22/0x40
  [] ? kthread_create_on_node+0x180/0x180
Code: c1 fa 06 31 f6 e8 a9 9b fd ff eb 98 0f 1f 80 00 00 00 00 66 66 66
66 90 55 48 89 e5 41 57 41 56 41 55 49 89 fd 41 54 53 48 8b 07 <48> 8b
10 48 8d 78 e0 49 39 c5 4c 8d 62 e0 74 70 49 be 00 00 00
RIP  [] release_freepages+0x18/0xa0
  RSP 
CR2: 
---[ end trace 855da7e142f7311f ]---




Re: [PATCH] panic: lockdep: correct lock debugging state check

2016-04-26 Thread Shi, Yang

On 4/26/2016 5:39 AM, Peter Zijlstra wrote:

On Mon, Apr 25, 2016 at 08:36:37PM -0700, Yang Shi wrote:

When a kernel oops happens, lock debugging is turned off by debug_locks_off()
in oops_enter(), which calls __debug_locks_off() to set debug_locks to 0
via xchg(). But calling __debug_locks_off() again to check the lock debugging
state in add_taint(), invoked from oops_end(), will always return false, since
xchg() returns the old value of debug_locks, which was already cleared in
oops_enter().

This prevents add_taint() from printing the lock-debugging-disabled message
even though LOCKDEP_NOW_UNRELIABLE is passed to it.

Check the lock debugging state via !debug_locks to fix this. Although
!__debug_locks_off() could do the same thing, it may look confusing.


What are you smoking? This is the second completely insane patch you
send this week.

This breaks add_taint() and gains us nothing except trivialities. Who


I apologize in advance if I misunderstand the code, and please ignore 
all the bullshit below.


In my understanding, add_taint() should call that pr_warn if 
LOCKDEP_NOW_UNRELIABLE is passed and lock debugging is disabled. This is 
what the code tells me.


LOCKDEP_NOW_UNRELIABLE is passed via the lockdep_ok parameter, and lock 
debugging has already been turned off by debug_locks_off(), so it should 
print something, but it doesn't, since __debug_locks_off() always returns 0 there.
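
To make the sequence concrete, the relevant pieces look roughly like this
(paraphrased from include/linux/debug_locks.h and kernel/panic.c):

    static inline int __debug_locks_off(void)
    {
            /* returns the *old* value of debug_locks */
            return xchg(&debug_locks, 0);
    }

    void add_taint(unsigned flag, enum lockdep_ok lockdep_ok)
    {
            if (lockdep_ok == LOCKDEP_NOW_UNRELIABLE && __debug_locks_off())
                    pr_warn("Disabling lock debugging due to kernel taint\n");
            set_bit(flag, &tainted_mask);
    }

Since oops_enter() has already zeroed debug_locks, the xchg() in add_taint()
returns 0 and the pr_warn() is skipped.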


So, it looks like the if statement logic is broken.

There are alternative ways to fix it; I may have picked the less ideal one.


bloody cares about that print if you've just had an OOPS.


I do agree that not many people care about that print, and the information is 
too trivial to draw much attention. However, that doesn't mean the oops output 
is a good place to hide something wrong. I just happened to find this while 
checking the oops information for clues about another issue, and thought it 
was a quick fix worth making to improve the kernel.


Thanks,
Yang







Re: [linux-next PATCH] sched: cgroup: enable interrupt before calling threadgroup_change_begin

2016-04-25 Thread Shi, Yang

On 4/25/2016 10:35 AM, Shi, Yang wrote:

On 4/23/2016 2:14 AM, Peter Zijlstra wrote:

On Fri, Apr 22, 2016 at 08:56:28PM -0700, Yang Shi wrote:

When a kernel oops happens in some kernel thread, e.g. kcompactd in this
test,
the below bug might be triggered by the oops handler:


What are you trying to fix? You already oopsed the thing is wrecked.


Actually, I ran into the below kernel BUG:

BUG: unable to handle kernel NULL pointer dereference at   (null)
IP: [] release_freepages+0x18/0xa0
PGD 0
Oops:  [#1] PREEMPT SMP
Modules linked in:
CPU: 6 PID: 110 Comm: kcompactd0 Not tainted 4.6.0-rc4-next-20160420 #4
Hardware name: Intel Corporation S5520HC/S5520HC, BIOS
S5500.86B.01.10.0025.030220091519 03/02/2009
task: 880361732680 ti: 88036173c000 task.ti: 88036173c000
RIP: 0010:[]  []
release_freepages+0x18/0xa0
RSP: 0018:88036173fcf8  EFLAGS: 00010282
RAX:  RBX: 88036ffde7c0 RCX: 0009
RDX: 1bf1 RSI: 000f RDI: 88036173fdd0
RBP: 88036173fd20 R08: 0007 R09: 1600
R10: 88036ffde7c0 R11:  R12: 
R13: 88036173fdd0 R14: 88036173fdc0 R15: 88036173fdb0
FS:  () GS:880363cc()
knlGS:
CS:  0010 DS:  ES:  CR0: 80050033
CR2:  CR3: 02206000 CR4: 06e0
Stack:
  88036ffde7c0  1a00 88036173fdc0
  88036173fdb0 88036173fda0 8119f13d 81196239
   880361732680 0001 0010
Call Trace:
  [] compact_zone+0x55d/0x9f0
  [] ? fragmentation_index+0x19/0x70
  [] kcompactd_do_work+0x10f/0x230
  [] kcompactd+0x90/0x1e0
  [] ? wait_woken+0xa0/0xa0
  [] ? kcompactd_do_work+0x230/0x230
  [] kthread+0xdd/0x100
  [] ret_from_fork+0x22/0x40
  [] ? kthread_create_on_node+0x180/0x180
Code: c1 fa 06 31 f6 e8 a9 9b fd ff eb 98 0f 1f 80 00 00 00 00 66 66 66
66 90 55 48 89 e5 41 57 41 56 41 55 49 89 fd 41 54 53 48 8b 07 <48> 8b
10 48 8d 78 e0 49 39 c5 4c 8d 62 e0 74 70 49 be 00 00 00
RIP  [] release_freepages+0x18/0xa0
  RSP 
CR2: 
---[ end trace 2e96d09e0ba6342f ]---

Then the "schedule in atomic context" bug is triggered which cause the
system hang. But, the system is still alive without the "schedule in
atomic context" bug. The previous null pointer deference issue doesn't
bring the system down other than killing the compactd kthread.


BTW, I don't have "panic on oops" set. So, the kernel doesn't panic.

Thanks,
Yang



Thanks,
Yang









Re: [linux-next PATCH] sched: cgroup: enable interrupt before calling threadgroup_change_begin

2016-04-25 Thread Shi, Yang

On 4/23/2016 2:14 AM, Peter Zijlstra wrote:

On Fri, Apr 22, 2016 at 08:56:28PM -0700, Yang Shi wrote:

When kernel oops happens in some kernel thread, i.e. kcompactd in the test,
the below bug might be triggered by the oops handler:


What are you trying to fix? You already oopsed; the thing is wrecked.


Actually, I ran into the below kernel BUG:

BUG: unable to handle kernel NULL pointer dereference at   (null)
IP: [] release_freepages+0x18/0xa0
PGD 0
Oops:  [#1] PREEMPT SMP
Modules linked in:
CPU: 6 PID: 110 Comm: kcompactd0 Not tainted 4.6.0-rc4-next-20160420 #4
Hardware name: Intel Corporation S5520HC/S5520HC, BIOS 
S5500.86B.01.10.0025.030220091519 03/02/2009

task: 880361732680 ti: 88036173c000 task.ti: 88036173c000
RIP: 0010:[]  [] 
release_freepages+0x18/0xa0

RSP: 0018:88036173fcf8  EFLAGS: 00010282
RAX:  RBX: 88036ffde7c0 RCX: 0009
RDX: 1bf1 RSI: 000f RDI: 88036173fdd0
RBP: 88036173fd20 R08: 0007 R09: 1600
R10: 88036ffde7c0 R11:  R12: 
R13: 88036173fdd0 R14: 88036173fdc0 R15: 88036173fdb0
FS:  () GS:880363cc() knlGS:
CS:  0010 DS:  ES:  CR0: 80050033
CR2:  CR3: 02206000 CR4: 06e0
Stack:
 88036ffde7c0  1a00 88036173fdc0
 88036173fdb0 88036173fda0 8119f13d 81196239
  880361732680 0001 0010
Call Trace:
 [] compact_zone+0x55d/0x9f0
 [] ? fragmentation_index+0x19/0x70
 [] kcompactd_do_work+0x10f/0x230
 [] kcompactd+0x90/0x1e0
 [] ? wait_woken+0xa0/0xa0
 [] ? kcompactd_do_work+0x230/0x230
 [] kthread+0xdd/0x100
 [] ret_from_fork+0x22/0x40
 [] ? kthread_create_on_node+0x180/0x180
Code: c1 fa 06 31 f6 e8 a9 9b fd ff eb 98 0f 1f 80 00 00 00 00 66 66 66 
66 90 55 48 89 e5 41 57 41 56 41 55 49 89 fd 41 54 53 48 8b 07 <48> 8b 
10 48 8d 78 e0 49 39 c5 4c 8d 62 e0 74 70 49 be 00 00 00

RIP  [] release_freepages+0x18/0xa0
 RSP 
CR2: 
---[ end trace 2e96d09e0ba6342f ]---

Then the "schedule in atomic context" bug is triggered which cause the 
system hang. But, the system is still alive without the "schedule in 
atomic context" bug. The previous null pointer deference issue doesn't 
bring the system down other than killing the compactd kthread.


Thanks,
Yang







Re: Warning triggered by lockdep checks for sock_owned_by_user on linux-next-20160420

2016-04-25 Thread Shi, Yang

On 4/22/2016 9:50 PM, Eric Dumazet wrote:

On Fri, 2016-04-22 at 21:02 -0700, Shi, Yang wrote:

Hi David,

When I ran some test on a nfs mounted rootfs, I got the below warning
with LOCKDEP enabled on linux-next-20160420:

WARNING: CPU: 9 PID: 0 at include/net/sock.h:1408
udp_queue_rcv_skb+0x3d0/0x660
Modules linked in:
CPU: 9 PID: 0 Comm: swapper/9 Tainted: G  D
4.6.0-rc4-next-20160420-WR7.0.0.0_standard+ #6
Hardware name: Intel Corporation S5520HC/S5520HC, BIOS
S5500.86B.01.10.0025.030220091519 03/02/2009
    88066fd03a70 8155855f 
    88066fd03ab0 81062803 058061318ec8
   88065d1e39c0 880661318e40  880661318ec8
Call Trace:
 [] dump_stack+0x67/0x98
   [] __warn+0xd3/0xf0
   [] warn_slowpath_null+0x1d/0x20
   [] udp_queue_rcv_skb+0x3d0/0x660
   [] __udp4_lib_rcv+0x4dc/0xc00
   [] udp_rcv+0x1a/0x20
   [] ip_local_deliver_finish+0xd1/0x2e0
   [] ? ip_local_deliver_finish+0x3f/0x2e0
   [] ip_local_deliver+0xc2/0xd0
   [] ip_rcv_finish+0x1e2/0x5a0
   [] ip_rcv+0x2dc/0x410
   [] ? __pskb_pull_tail+0x82/0x400
   [] __netif_receive_skb_core+0x3a8/0xa80
   [] ? netif_receive_skb_internal+0x1b/0xf0
   [] __netif_receive_skb+0x1d/0x60
   [] netif_receive_skb_internal+0x55/0xf0
   [] ? netif_receive_skb_internal+0x1b/0xf0
   [] napi_gro_receive+0xc2/0x180
   [] igb_poll+0x5ea/0xdf0
   [] net_rx_action+0x15c/0x3d0
   [] __do_softirq+0x161/0x413
   [] irq_exit+0xd1/0x110
   [] do_IRQ+0x62/0xf0
   [] common_interrupt+0x8e/0x8e
 [] ? cpuidle_enter_state+0xc6/0x290
   [] cpuidle_enter+0x17/0x20
   [] call_cpuidle+0x33/0x50
   [] cpu_startup_entry+0x229/0x3b0
   [] start_secondary+0x144/0x150
---[ end trace ba508c424f0d52bf ]---


The warning is triggered by commit
fafc4e1ea1a4c1eb13a30c9426fb799f5efacbc3 ("sock: tigthen lockdep checks
for sock_owned_by_user"), which checks if slock is held before locking
"owned".

It looks fine for lock_sock, which just calls lock_sock_nested. But
bh_lock_sock is different: it just calls spin_lock, so it doesn't
touch dep_map, and then the check fails even though the lock is held.


?? spin_lock() definitely is lockdep friendly.


Yes, this is what I thought too. But, I didn't figure out why the 
warning was still reported even though spin_lock is called.






So, I'm wondering what the right fix should be:

1. Replace bh_lock_sock with bh_lock_sock_nested in the protocol
implementations, but there are a lot of places calling it.

2. Just like lock_sock, call bh_lock_sock_nested instead of spin_lock.

Or are both approaches wrong, or not ideal?


I sent a patch yesterday, I am not sure what the status is.


Thanks for the patch. I just found your original patch and the
discussion with Valdis. I think I ran into the same problem: a kernel
BUG is triggered before the warning, but the "lockdep is off"
information is not printed out, although lockdep really is off.


Just tried your patch, it works for me.

Thanks,
Yang



diff --git a/include/net/sock.h b/include/net/sock.h
index d997ec13a643..db8301c76d50 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1350,7 +1350,8 @@ static inline bool lockdep_sock_is_held(const struct sock *csk)
 {
 	struct sock *sk = (struct sock *)csk;
 
-	return lockdep_is_held(&sk->sk_lock) ||
+	return !debug_locks ||
+	       lockdep_is_held(&sk->sk_lock) ||
 	       lockdep_is_held(&sk->sk_lock.slock);
 }
 #endif
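
The !debug_locks term matters because an earlier BUG/oops turns lockdep
off, after which lockdep_is_held() can no longer be trusted and reports
false even for locks that are actually held. For reference, the caller
that trips the warning looks roughly like this (paraphrased from
include/net/sock.h of that kernel; a sketch, not a verbatim quote):

static inline void sock_owned_by_me(const struct sock *sk)
{
#ifdef CONFIG_LOCKDEP
        /* Fires when neither sk_lock nor sk_lock.slock appears held */
        WARN_ON(!lockdep_sock_is_held(sk));
#endif
}

static inline bool sock_owned_by_user(const struct sock *sk)
{
        sock_owned_by_me(sk);
        return sk->sk_lock.owned;
}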









Warning triggered by lockdep checks for sock_owned_by_user on linux-next-20160420

2016-04-22 Thread Shi, Yang

Hi David,

When I ran some test on a nfs mounted rootfs, I got the below warning 
with LOCKDEP enabled on linux-next-20160420:


WARNING: CPU: 9 PID: 0 at include/net/sock.h:1408 
udp_queue_rcv_skb+0x3d0/0x660

Modules linked in:
CPU: 9 PID: 0 Comm: swapper/9 Tainted: G  D 
4.6.0-rc4-next-20160420-WR7.0.0.0_standard+ #6
Hardware name: Intel Corporation S5520HC/S5520HC, BIOS 
S5500.86B.01.10.0025.030220091519 03/02/2009

  88066fd03a70 8155855f 
  88066fd03ab0 81062803 058061318ec8
 88065d1e39c0 880661318e40  880661318ec8
Call Trace:
   [] dump_stack+0x67/0x98
 [] __warn+0xd3/0xf0
 [] warn_slowpath_null+0x1d/0x20
 [] udp_queue_rcv_skb+0x3d0/0x660
 [] __udp4_lib_rcv+0x4dc/0xc00
 [] udp_rcv+0x1a/0x20
 [] ip_local_deliver_finish+0xd1/0x2e0
 [] ? ip_local_deliver_finish+0x3f/0x2e0
 [] ip_local_deliver+0xc2/0xd0
 [] ip_rcv_finish+0x1e2/0x5a0
 [] ip_rcv+0x2dc/0x410
 [] ? __pskb_pull_tail+0x82/0x400
 [] __netif_receive_skb_core+0x3a8/0xa80
 [] ? netif_receive_skb_internal+0x1b/0xf0
 [] __netif_receive_skb+0x1d/0x60
 [] netif_receive_skb_internal+0x55/0xf0
 [] ? netif_receive_skb_internal+0x1b/0xf0
 [] napi_gro_receive+0xc2/0x180
 [] igb_poll+0x5ea/0xdf0
 [] net_rx_action+0x15c/0x3d0
 [] __do_softirq+0x161/0x413
 [] irq_exit+0xd1/0x110
 [] do_IRQ+0x62/0xf0
 [] common_interrupt+0x8e/0x8e
   [] ? cpuidle_enter_state+0xc6/0x290
 [] cpuidle_enter+0x17/0x20
 [] call_cpuidle+0x33/0x50
 [] cpu_startup_entry+0x229/0x3b0
 [] start_secondary+0x144/0x150
---[ end trace ba508c424f0d52bf ]---


The warning is triggered by commit 
fafc4e1ea1a4c1eb13a30c9426fb799f5efacbc3 ("sock: tigthen lockdep checks 
for sock_owned_by_user"), which checks if slock is held before locking 
"owned".


It looks fine for lock_sock, which just calls lock_sock_nested. But
bh_lock_sock is different: it just calls spin_lock, so it doesn't
touch dep_map, and then the check fails even though the lock is held.
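
For reference, the two locking paths look roughly like this (paraphrased
from include/net/sock.h and net/core/sock.c; a sketch, not a verbatim
quote):

/* bh_lock_sock() takes the slock directly; lockdep does track the
 * spin_lock itself, but nothing marks the socket as "owned". */
#define bh_lock_sock(__sk)      spin_lock(&((__sk)->sk_lock.slock))

/* lock_sock() goes through lock_sock_nested(), which takes the slock,
 * sets sk_lock.owned, records the ownership for lockdep via
 * mutex_acquire(), and then drops the slock again. */
static inline void lock_sock(struct sock *sk)
{
        lock_sock_nested(sk, 0);
}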


So, I'm wondering what the right fix should be:

1. Replace bh_lock_sock with bh_lock_sock_nested in the protocol
implementations, but there are a lot of places calling it.

2. Just like lock_sock, call bh_lock_sock_nested instead of spin_lock.

Or are both approaches wrong, or not ideal?
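
For what it's worth, the bh_lock_sock_nested of option 1 already exists
and is just the lockdep-nesting variant; paraphrased from
include/net/sock.h, as a sketch:

#define bh_lock_sock_nested(__sk)                               \
        spin_lock_nested(&((__sk)->sk_lock.slock),              \
                         SINGLE_DEPTH_NESTING)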

Thanks,
Yang


[BUG linux-next] kernel NULL pointer dereference on linux-next-20160420

2016-04-21 Thread Shi, Yang

Hi folks,

I did the below test with huge tmpfs on linux-next-20160420:

# mount -t tmpfs -o huge=1 tmpfs /mnt
# cd /mnt
Then clone the Linux kernel tree.

Then I got the bug below; the same test works fine on non-huge tmpfs.

BUG: unable to handle kernel NULL pointer dereference at   (null)
IP: [] release_freepages+0x18/0xa0
PGD 0
Oops:  [#1] PREEMPT SMP
Modules linked in:
CPU: 6 PID: 110 Comm: kcompactd0 Not tainted 
4.6.0-rc4-next-20160420-WR7.0.0.0_standard #4
Hardware name: Intel Corporation S5520HC/S5520HC, BIOS 
S5500.86B.01.10.0025.030220091519 03/02/2009

task: 880361708040 ti: 880361704000 task.ti: 880361704000
RIP: 0010:[]  [] 
release_freepages+0x18/0xa0

RSP: 0018:880361707cf8  EFLAGS: 00010282
RAX:  RBX: 88036ffde7c0 RCX: 0009
RDX: 1bf1 RSI: 000f RDI: 880361707dd0
RBP: 880361707d20 R08: 0007 R09: 1600
R10: 88036ffde7c0 R11:  R12: 
R13: 880361707dd0 R14: 880361707dc0 R15: 880361707db0
FS:  () GS:880363cc() knlGS:
CS:  0010 DS:  ES:  CR0: 80050033
CR2:  CR3: 02206000 CR4: 06e0
Stack:
 88036ffde7c0  1a00 880361707dc0
 880361707db0 880361707da0 8119f13d 81196239
  880361708040 0001 0010
Call Trace:
 [] compact_zone+0x55d/0x9f0
 [] ? fragmentation_index+0x19/0x70
 [] kcompactd_do_work+0x10f/0x230
 [] kcompactd+0x90/0x1e0
 [] ? wait_woken+0xa0/0xa0
 [] ? kcompactd_do_work+0x230/0x230
 [] kthread+0xdd/0x100
 [] ret_from_fork+0x22/0x40
 [] ? kthread_create_on_node+0x180/0x180
Code: c1 fa 06 31 f6 e8 a9 9b fd ff eb 98 0f 1f 80 00 00 00 00 66 66 66 
66 90 55 48 89 e5 41 57 41 56 41 55 49 89 fd 41 54 53 48 8b 07 <48> 8b 
10 48 8d 78 e0 49 39 c5 4c 8d 62 e0 74 70 49 be 00 00 00

RIP  [] release_freepages+0x18/0xa0
 RSP 
CR2: 
---[ end trace 855da7e142f7311f ]---


Re: [PATCH] mm: move huge_pmd_set_accessed out of huge_memory.c

2016-04-21 Thread Shi, Yang

On 4/21/2016 2:15 AM, Hugh Dickins wrote:

On Wed, 20 Apr 2016, Shi, Yang wrote:


Hi folks,

I didn't realize that pmd_* functions are protected by CONFIG_TRANSPARENT_HUGEPAGE
on most architectures before I made this change.

Before I fix all the affected architecture code, I want to check whether you
guys think this change is worthwhile.


Thanks for asking: no, it is not worthwhile.

I would much prefer not to have to consider these trivial cleanups
in the huge memory area at this time.  Kirill and I have urgent work
to do in this area, and coping with patch conflicts between different
versions of the source will not help any of us.


Thanks for your suggestion. I will consider putting such cleanup work on
the back burner.


Yang



Thanks,
Hugh



Thanks,
Yang

On 4/20/2016 11:24 AM, Yang Shi wrote:

huge_pmd_set_accessed is only called by __handle_mm_fault from memory.c,
move the definition to memory.c and make it static like create_huge_pmd and
wp_huge_pmd.

Signed-off-by: Yang Shi <yang@linaro.org>
---
   include/linux/huge_mm.h |  4 ----
   mm/huge_memory.c        | 23 -----------------------
   mm/memory.c             | 23 +++++++++++++++++++++++
   3 files changed, 23 insertions(+), 27 deletions(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 7008623..c218ab7b 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -8,10 +8,6 @@ extern int do_huge_pmd_anonymous_page(struct mm_struct
*mm,
   extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct
*src_mm,
 pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
 struct vm_area_struct *vma);
-extern void huge_pmd_set_accessed(struct mm_struct *mm,
- struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmd,
- pmd_t orig_pmd, int dirty);
   extern int do_huge_pmd_wp_page(struct mm_struct *mm, struct
vm_area_struct *vma,
   unsigned long address, pmd_t *pmd,
   pmd_t orig_pmd);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index fecbbc5..6c14cb6 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1137,29 +1137,6 @@ out:
return ret;
   }

-void huge_pmd_set_accessed(struct mm_struct *mm,
-  struct vm_area_struct *vma,
-  unsigned long address,
-  pmd_t *pmd, pmd_t orig_pmd,
-  int dirty)
-{
-   spinlock_t *ptl;
-   pmd_t entry;
-   unsigned long haddr;
-
-   ptl = pmd_lock(mm, pmd);
-   if (unlikely(!pmd_same(*pmd, orig_pmd)))
-   goto unlock;
-
-   entry = pmd_mkyoung(orig_pmd);
-   haddr = address & HPAGE_PMD_MASK;
-   if (pmdp_set_access_flags(vma, haddr, pmd, entry, dirty))
-   update_mmu_cache_pmd(vma, address, pmd);
-
-unlock:
-   spin_unlock(ptl);
-}
-
   static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long address,
diff --git a/mm/memory.c b/mm/memory.c
index 93897f2..6ced4eb 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3287,6 +3287,29 @@ static int wp_huge_pmd(struct mm_struct *mm, struct
vm_area_struct *vma,
return VM_FAULT_FALLBACK;
   }

+static void huge_pmd_set_accessed(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ unsigned long address,
+ pmd_t *pmd, pmd_t orig_pmd,
+ int dirty)
+{
+   spinlock_t *ptl;
+   pmd_t entry;
+   unsigned long haddr;
+
+   ptl = pmd_lock(mm, pmd);
+   if (unlikely(!pmd_same(*pmd, orig_pmd)))
+   goto unlock;
+
+   entry = pmd_mkyoung(orig_pmd);
+   haddr = address & HPAGE_PMD_MASK;
+   if (pmdp_set_access_flags(vma, haddr, pmd, entry, dirty))
+   update_mmu_cache_pmd(vma, address, pmd);
+
+unlock:
+   spin_unlock(ptl);
+}
+
   /*
* These routines also need to handle stuff like marking pages dirty
* and/or accessed for architectures that don't do it in hardware (most




Re: [PATCH] mm: move huge_pmd_set_accessed out of huge_memory.c

2016-04-21 Thread Shi, Yang

On 4/21/2016 12:30 AM, Kirill A. Shutemov wrote:

On Wed, Apr 20, 2016 at 02:00:11PM -0700, Shi, Yang wrote:

Hi folks,

I didn't realize that pmd_* functions are protected by
CONFIG_TRANSPARENT_HUGEPAGE on most architectures before I made this
change.

Before I fix all the affected architecture code, I want to check whether
you guys think this change is worthwhile.

Thanks,
Yang

On 4/20/2016 11:24 AM, Yang Shi wrote:

huge_pmd_set_accessed is only called by __handle_mm_fault from memory.c,
move the definition to memory.c and make it static like create_huge_pmd and
wp_huge_pmd.

Signed-off-by: Yang Shi <yang@linaro.org>


On pte side we have the same functionality open-coded. Should we do the
same for pmd? Or change pte side the same way?


Sorry, I don't quite understand you. Do you mean pte_* functions?

Thanks,
Yang
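
For context, the pte-side open-coded equivalent Kirill refers to is the
access-flag update in handle_pte_fault(); it looks roughly like this
(paraphrased from mm/memory.c of that era; a sketch, not a verbatim
quote):

        /* Open-coded pte-side counterpart of huge_pmd_set_accessed() */
        entry = pte_mkyoung(entry);
        if (ptep_set_access_flags(vma, address, pte, entry,
                                  flags & FAULT_FLAG_WRITE))
                update_mmu_cache(vma, address, pte);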







[BUG] set_pte_at: racy dirty state clearing warning

2016-04-20 Thread Shi, Yang

Hi Will and Catalin,

When I enable memory comact via

# echo 1 > /proc/sys/vm/compact_memory

I got the below WARNING:

set_pte_at: racy dirty state clearing: 0x006899371bd3 -> 
0x006899371fd3

[ cut here ]
WARNING: CPU: 5 PID: 294 at ./arch/arm64/include/asm/pgtable.h:227 
ptep_set_access_flags+0x138/0x1b8

Modules linked in:

CPU: 5 PID: 294 Comm: systemd-journal Not tainted 
4.6.0-rc3-next-20160414 #13

Hardware name: Freescale Layerscape 2085a RDB Board (DT)
task: 80001e4f8080 ti: 80001e8b4000 task.ti: 80001e8b4000
PC is at ptep_set_access_flags+0x138/0x1b8
LR is at ptep_set_access_flags+0x138/0x1b8
pc : [] lr : [] pstate: 2145
sp : 80001e8b7bc0
x29: 80001e8b7bc0 x28: 80001e843ac8
x27: 0040 x26: 80001e9ae0d8
x25: 29901000 x24: 80001f32a938
x23: 0001 x22: 80001e9ae088
x21: 7eb59000 x20: 80001e843ac8
x19: 006899371fd3 x18: fe09
x17: 7ea48c88 x16: cb5afb20
x15: 003b9aca x14: 307830203e2d2033
x13: 6462313733393930 x12: 3030303836303078
x11: 30203a676e697261 x10: 656c632065746174
x9 : 7320797472696420 x8 : 79636172203a7461
x7 : 5f6574705f746573 x6 : 28300ab8
x5 : 0003 x4 : 
x3 : 0003 x2 : 13d16f64
x1 : dfff2000 x0 : 004f

---[ end trace d75cd9bb88364c80 ]---
Call trace:
Exception stack(0x80001e8b79a0 to 0x80001e8b7ac0)
79a0: 006899371fd3 80001e843ac8 80001e8b7bc0 28497b70
79c0: 2145 003d 29901000 28301558
79e0: 41b58ab3 296870d0 28200668 292b0e40
7a00: 0001 80001f32a938 29901000 80001e9ae0d8
7a20: 0040 80001e843ac8 29901000 80001e9ae0d8
7a40: 2a9dcf60  0a6d9320 2000
7a60: 80001e8b7bc0 80001e8b7bc0 80001e8b7b80 ffc8
7a80: 80001e8b7ad0 28415418 80001e8b4000 13d16f64
7aa0: 004f dfff2000 13d16f64 0003
[] ptep_set_access_flags+0x138/0x1b8
[] handle_mm_fault+0xa24/0xfa0
[] do_page_fault+0x3d4/0x4c0
[] do_mem_abort+0xac/0x140


My kernel has ARM64_HW_AFDBM enabled, but LS2085 is not ARMv8.1.

The code shows it just checks whether ARM64_HW_AFDBM is enabled or not, but
doesn't check whether the CPU really has that capability.


So, it might be better to have the capability checked at runtime?
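
A runtime check could read the HADBS field of ID_AA64MMFR1_EL1, which
reports hardware access-flag management at 1 and hardware dirty-state
management at 2. A minimal sketch, assuming the read_cpuid() helper and
the ID_AA64MMFR1_HADBS_SHIFT definition from the arm64 headers of that
era (an illustration only, not the fix that actually went upstream):

static bool cpu_has_hw_afdbm(void)
{
        u64 mmfr1 = read_cpuid(ID_AA64MMFR1_EL1);

        /* HADBS: 0 = none, 1 = access flag only, 2 = flag + dirty */
        return ((mmfr1 >> ID_AA64MMFR1_HADBS_SHIFT) & 0xf) >= 2;
}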

Thanks,
Yang


Re: [PATCH] mm: move huge_pmd_set_accessed out of huge_memory.c

2016-04-20 Thread Shi, Yang

Hi folks,

I didn't realize that pmd_* functions are protected by
CONFIG_TRANSPARENT_HUGEPAGE on most architectures before I made this
change.


Before I fix all the affected architecture code, I want to check whether
you guys think this change is worthwhile.


Thanks,
Yang

On 4/20/2016 11:24 AM, Yang Shi wrote:

huge_pmd_set_accessed is only called by __handle_mm_fault from memory.c,
move the definition to memory.c and make it static like create_huge_pmd and
wp_huge_pmd.

Signed-off-by: Yang Shi <yang@linaro.org>
---
  include/linux/huge_mm.h |  4 ----
  mm/huge_memory.c        | 23 -----------------------
  mm/memory.c             | 23 +++++++++++++++++++++++
  3 files changed, 23 insertions(+), 27 deletions(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 7008623..c218ab7b 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -8,10 +8,6 @@ extern int do_huge_pmd_anonymous_page(struct mm_struct *mm,
  extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
 struct vm_area_struct *vma);
-extern void huge_pmd_set_accessed(struct mm_struct *mm,
- struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmd,
- pmd_t orig_pmd, int dirty);
  extern int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct 
*vma,
   unsigned long address, pmd_t *pmd,
   pmd_t orig_pmd);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index fecbbc5..6c14cb6 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1137,29 +1137,6 @@ out:
return ret;
  }

-void huge_pmd_set_accessed(struct mm_struct *mm,
-  struct vm_area_struct *vma,
-  unsigned long address,
-  pmd_t *pmd, pmd_t orig_pmd,
-  int dirty)
-{
-   spinlock_t *ptl;
-   pmd_t entry;
-   unsigned long haddr;
-
-   ptl = pmd_lock(mm, pmd);
-   if (unlikely(!pmd_same(*pmd, orig_pmd)))
-   goto unlock;
-
-   entry = pmd_mkyoung(orig_pmd);
-   haddr = address & HPAGE_PMD_MASK;
-   if (pmdp_set_access_flags(vma, haddr, pmd, entry, dirty))
-   update_mmu_cache_pmd(vma, address, pmd);
-
-unlock:
-   spin_unlock(ptl);
-}
-
  static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long address,
diff --git a/mm/memory.c b/mm/memory.c
index 93897f2..6ced4eb 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3287,6 +3287,29 @@ static int wp_huge_pmd(struct mm_struct *mm, struct 
vm_area_struct *vma,
return VM_FAULT_FALLBACK;
  }

+static void huge_pmd_set_accessed(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ unsigned long address,
+ pmd_t *pmd, pmd_t orig_pmd,
+ int dirty)
+{
+   spinlock_t *ptl;
+   pmd_t entry;
+   unsigned long haddr;
+
+   ptl = pmd_lock(mm, pmd);
+   if (unlikely(!pmd_same(*pmd, orig_pmd)))
+   goto unlock;
+
+   entry = pmd_mkyoung(orig_pmd);
+   haddr = address & HPAGE_PMD_MASK;
+   if (pmdp_set_access_flags(vma, haddr, pmd, entry, dirty))
+   update_mmu_cache_pmd(vma, address, pmd);
+
+unlock:
+   spin_unlock(ptl);
+}
+
  /*
   * These routines also need to handle stuff like marking pages dirty
   * and/or accessed for architectures that don't do it in hardware (most





[BUG linux-next] KASAN bug is raised on linux-next-20160414 with huge tmpfs on

2016-04-20 Thread Shi, Yang

Hi folks,

When I run the below test on my ARM64 machine with an NFS-mounted rootfs,
I get the KASAN bug report below. The test runs fine if /mnt is not
mounted with "huge=1".


# mount -t tmpfs -o huge=1 tmpfs /mnt
# cp -a /opt/ltp /mnt/

BUG: KASAN: use-after-free in nfs_readdir+0x2c4/0x848 at addr 
8b7f4000

Read of size 4 by task crond/446
page:7bffc02dfd00 count:2 mapcount:0 mapping:80001c2cae98 index:0x0
flags: 0x6c(referenced|uptodate|lru|active)
page dumped because: kasan: bad access detected
page->mem_cgroup:80002402da80
CPU: 0 PID: 446 Comm: crond Tainted: GW 
4.6.0-rc3-next-20160414-WR8.0.0.0_standard+ #13

Hardware name: Freescale Layerscape 2085a RDB Board (DT)
Call trace:
[] dump_backtrace+0x0/0x2b8
[] show_stack+0x24/0x30
[] dump_stack+0xb0/0xe8
[] kasan_report_error+0x518/0x5c0
[] kasan_report+0x60/0x70
[] __asan_load4+0x64/0x80
[] nfs_readdir+0x2c4/0x848
[] iterate_dir+0x120/0x1d8
[] SyS_getdents64+0xdc/0x170
[] __sys_trace_return+0x0/0x4
Memory state around the buggy address:
 8b7f3f00: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
 8b7f3f80: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
>8b7f4000: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
   ^
 8b7f4080: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
 8b7f4100: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff

BUG: KASAN: use-after-free in nfs_do_filldir+0x88/0x298 at addr 
8b7f4000

Read of size 4 by task crond/446
page:7bffc02dfd00 count:2 mapcount:0 mapping:80001c2cae98 index:0x0
flags: 0x6c(referenced|uptodate|lru|active)
page dumped because: kasan: bad access detected
page->mem_cgroup:80002402da80
CPU: 0 PID: 446 Comm: crond Tainted: GB   W 
4.6.0-rc3-next-20160414-WR8.0.0.0_standard+ #13

Hardware name: Freescale Layerscape 2085a RDB Board (DT)
Call trace:
[] dump_backtrace+0x0/0x2b8
[] show_stack+0x24/0x30
[] dump_stack+0xb0/0xe8
[] kasan_report_error+0x518/0x5c0
[] kasan_report+0x60/0x70
[] __asan_load4+0x64/0x80
[] nfs_do_filldir+0x88/0x298
[] nfs_readdir+0x488/0x848
[] iterate_dir+0x120/0x1d8
[] SyS_getdents64+0xdc/0x170
[] __sys_trace_return+0x0/0x4
Memory state around the buggy address:
 8b7f3f00: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
 8b7f3f80: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
>8b7f4000: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
   ^
 8b7f4080: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
 8b7f4100: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff

Thanks,
Yang



Re: [BUG linux-next] Kernel panic found with linux-next-20160414

2016-04-20 Thread Shi, Yang

On 4/20/2016 1:01 AM, Hugh Dickins wrote:

On Tue, 19 Apr 2016, Shi, Yang wrote:

Hi folks,

When I ran ltp on linux-next-20160414 on my ARM64 machine, I got the below
kernel panic:

Unable to handle kernel paging request at virtual address ffc007846000
pgd = ffc01e21d000
[ffc007846000] *pgd=, *pud=
Internal error: Oops: 9647 [#11] PREEMPT SMP
Modules linked in: loop
CPU: 7 PID: 274 Comm: systemd-journal Tainted: G  D
4.6.0-rc3-next-20160414-WR8.0.0.0_standard+ #9
Hardware name: Freescale Layerscape 2085a RDB Board (DT)
task: ffc01e3fcf80 ti: ffc01ea8c000 task.ti: ffc01ea8c000
PC is at copy_page+0x38/0x120
LR is at migrate_page_copy+0x604/0x1660
pc : [] lr : [] pstate: 2145
sp : ffc01ea8ecd0
x29: ffc01ea8ecd0 x28: 
x27: 1ff7b80240f8 x26: ffc018196f20
x25: ffbdc01e1180 x24: ffbdc01e1180
x23:  x22: ffc01e3fcf80
x21: ffc00481f000 x20: ff900a31d000
x19: ffbdc01207c0 x18: 0f00
x17:  x16: 
x15:  x14: 
x13:  x12: 
x11:  x10: 
x9 :  x8 : 
x7 :  x6 : 
x5 :  x4 : 
x3 :  x2 : 
x1 : ffc00481f080 x0 : ffc007846000

Call trace:
Exception stack(0xffc021fc2ed0 to 0xffc021fc2ff0)
2ec0:   ffbdc00887c0 ff900a31d000
2ee0: ffc021fc30f0 ff9008ff2318 2145 0025
2f00: ffbdc025a280 ffc020adc4c0 41b58ab3 ff900a085fd0
2f20: ff9008200658   ffbdc00887c0
2f40: ff900b0f1320 ffc021fc3078 41b58ab3 ff900a0864f8
2f60: ff9008210010 ffc021fb8960 ff900867bacc 1ff8043f712d
2f80: ffc021fc2fb0 ff9008210564 ffc021fc3070 ffc021fb8940
2fa0: 08221f78 ff900862f9c8 ffc021fc2fe0 ff9008215dc8
2fc0: 1ff8043f8602 ffc021fc ffc00968a000 ffc00221f080
2fe0: f9407e11d1f0 d61f02209103e210
[] copy_page+0x38/0x120
[] migrate_page+0x74/0x98
[] nfs_migrate_page+0x58/0x80
[] move_to_new_page+0x15c/0x4d8
[] migrate_pages+0x7c8/0x11f0
[] compact_zone+0xdfc/0x2570
[] compact_zone_order+0xe0/0x170
[] try_to_compact_pages+0x2e8/0x8f8
[] __alloc_pages_direct_compact+0x100/0x540
[] __alloc_pages_nodemask+0xc40/0x1c58
[] khugepaged+0x468/0x19c8
[] kthread+0x248/0x2c0
[] ret_from_fork+0x10/0x40
Code: d281f012 91020021 f1020252 d503201f (a8000c02)


I did some initial investigation and found it is caused by DEBUG_PAGEALLOC
and CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT. And, mainline 4.6-rc3 works well.

It should not be arch specific although I caught it on ARM64. I suspect
this might be caused by Hugh's huge tmpfs patches.


Thanks for testing.  It might be caused by my patches, but I don't think
that's very likely.  This is page migration for compaction, in the service
of anon THP's khugepaged; and I wonder if you were even exercising huge
tmpfs when running LTP here (it certainly can be done: I like to mount a
huge tmpfs on /opt/ltp and install there, with shmem_huge 2 so any other
tmpfs mounts are also huge).


Some further investigation shows I got the panic even though I don't
have tmpfs mounted with huge=1 and haven't set shmem_huge to 2.




There are compaction changes in linux-next too, but I don't see any
reason why they'd cause this.  I don't know arm64 traces enough to know
whether it's the source page or the destination page for the copy, but
it looks as if it has been freed (and DEBUG_PAGEALLOC unmapped) before
reaching migration's copy.


The fault address is passed in x0, which is the destination in the
implementation of copy_page, so it is the destination page.
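
For reference, the arm64 prototype (per arch/arm64/include/asm/page.h)
is:

extern void copy_page(void *to, void *from);

and under the AAPCS64 calling convention the first argument, "to",
arrives in x0, so a fault on the address in x0 is a fault on the
destination page.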




Needs more debugging, I'm afraid: is it reproducible?


Yes, as long as I enable those two PAGEALLOC debug options, I can get 
the panic once I run ltp. But, it is not caused any specific ltp test 
case directly, the panic happens randomly during ltp is running.


Thanks,
Yang



Hugh





Re: [PATCHv7 00/29] THP-enabled tmpfs/shmem using compound pages

2016-04-19 Thread Shi, Yang

On 4/19/2016 9:50 AM, Andrea Arcangeli wrote:

Hello,

On Mon, Apr 18, 2016 at 03:55:44PM -0700, Shi, Yang wrote:

Hi Kirill,

Finally, I got some time to look into and try yours and Hugh's patches,
and hit two problems.


One thing that comes to mind to test is this: qemu with -machine
accel=kvm -mem-path=/dev/shm/,share=on.


Thanks for the suggestion, I will definitely have a try with KVM.

It would be better if Kirill and Hugh could share what benchmark they
ran and how much improvement they got, since my test case is very simple
and may just cover a small part of it.


Yang



The THP Compound approach in tmpfs may just happen to work already
with KVM (or at worst it'd require minor adjustments) because it uses
the exact same model KVM is already aware of from THP in anonymous
memory; an example from arch/x86/kvm/mmu.c:

static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
gfn_t *gfnp, kvm_pfn_t *pfnp,
int *levelp)
{
kvm_pfn_t pfn = *pfnp;
gfn_t gfn = *gfnp;
int level = *levelp;

/*
 * Check if it's a transparent hugepage. If this would be an
 * hugetlbfs page, level wouldn't be set to
 * PT_PAGE_TABLE_LEVEL and there would be no adjustment done
 * here.
 */
if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
level == PT_PAGE_TABLE_LEVEL &&
PageTransCompound(pfn_to_page(pfn)) &&
!mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) {

Not using two different models between THP in tmpfs and THP in anon is
essential not just to significantly reduce the size of the kernel
code, but also because THP knowledge can't be self contained in the
mm/shmem.c file. Having to support two different models would
complicate things for secondary MMU drivers (i.e. mmu notifer users)
like KVM who also need to create huge mapping in the shadow pagetable
layer in arch/x86/kvm if the primary MMU allows for it.


x86-64 and ARM64 with yours and Hugh's patches (linux-next tree), I got
the program execution time reduced by ~12% on x86-64; it looks very
impressive.


Agreed, both patchset are impressive works and achieving amazing
results!

My view is that in terms of long-lived computation from userland point
of view, both models are malleable enough and could achieve everything
we need in the end, but as far as the overall kernel efficiency is
concerned the compound model will always retain a slight advantage in
performance by leveraging a native THP compound refcounting that
requires just one atomic_inc/dec per THP mapcount instead of 512 of
them. Another advantage of the compound model is that it's half the code
size despite already including khugepaged (i.e. the same
split_huge_page works for both tmpfs and anon), and as said above it
won't introduce many complications for drivers like KVM since the model
didn't change.

Thanks,
Andrea
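
To make the refcounting point concrete, here is a schematic comparison
(an illustration only, not actual kernel code; compound_mapcount_ptr()
and HPAGE_PMD_NR are borrowed from the mm headers just for the sketch):

/* Compound THP: mapping the whole 2MB extent costs one atomic op on
 * the head page's compound mapcount. */
static void map_compound_thp(struct page *head)
{
        atomic_inc(compound_mapcount_ptr(head));
}

/* Team of 512 small pages: the same 2MB costs one atomic op per 4KB
 * page. */
static void map_team_pages(struct page *page)
{
        int i;

        for (i = 0; i < HPAGE_PMD_NR; i++)
                atomic_inc(&page[i]._mapcount);
}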





[BUG linux-next] Kernel panic found with linux-next-20160414

2016-04-19 Thread Shi, Yang

Hi folks,

When I ran LTP on linux-next-20160414 on my ARM64 machine, I got the 
kernel panic below:


Unable to handle kernel paging request at virtual address ffc007846000
pgd = ffc01e21d000
[ffc007846000] *pgd=, *pud=
Internal error: Oops: 9647 [#11] PREEMPT SMP
Modules linked in: loop
CPU: 7 PID: 274 Comm: systemd-journal Tainted: G  D 
4.6.0-rc3-next-20160414-WR8.0.0.0_standard+ #9

Hardware name: Freescale Layerscape 2085a RDB Board (DT)
task: ffc01e3fcf80 ti: ffc01ea8c000 task.ti: ffc01ea8c000
PC is at copy_page+0x38/0x120
LR is at migrate_page_copy+0x604/0x1660
pc : [] lr : [] pstate: 2145
sp : ffc01ea8ecd0
x29: ffc01ea8ecd0 x28: 
x27: 1ff7b80240f8 x26: ffc018196f20
x25: ffbdc01e1180 x24: ffbdc01e1180
x23:  x22: ffc01e3fcf80
x21: ffc00481f000 x20: ff900a31d000
x19: ffbdc01207c0 x18: 0f00
x17:  x16: 
x15:  x14: 
x13:  x12: 
x11:  x10: 
x9 :  x8 : 
x7 :  x6 : 
x5 :  x4 : 
x3 :  x2 : 
x1 : ffc00481f080 x0 : ffc007846000

Call trace:
Exception stack(0xffc021fc2ed0 to 0xffc021fc2ff0)
2ec0:   ffbdc00887c0 ff900a31d000
2ee0: ffc021fc30f0 ff9008ff2318 2145 0025
2f00: ffbdc025a280 ffc020adc4c0 41b58ab3 ff900a085fd0
2f20: ff9008200658   ffbdc00887c0
2f40: ff900b0f1320 ffc021fc3078 41b58ab3 ff900a0864f8
2f60: ff9008210010 ffc021fb8960 ff900867bacc 1ff8043f712d
2f80: ffc021fc2fb0 ff9008210564 ffc021fc3070 ffc021fb8940
2fa0: 08221f78 ff900862f9c8 ffc021fc2fe0 ff9008215dc8
2fc0: 1ff8043f8602 ffc021fc ffc00968a000 ffc00221f080
2fe0: f9407e11d1f0 d61f02209103e210
[] copy_page+0x38/0x120
[] migrate_page+0x74/0x98
[] nfs_migrate_page+0x58/0x80
[] move_to_new_page+0x15c/0x4d8
[] migrate_pages+0x7c8/0x11f0
[] compact_zone+0xdfc/0x2570
[] compact_zone_order+0xe0/0x170
[] try_to_compact_pages+0x2e8/0x8f8
[] __alloc_pages_direct_compact+0x100/0x540
[] __alloc_pages_nodemask+0xc40/0x1c58
[] khugepaged+0x468/0x19c8
[] kthread+0x248/0x2c0
[] ret_from_fork+0x10/0x40
Code: d281f012 91020021 f1020252 d503201f (a8000c02)


I did some initial investigation and found it is triggered by 
DEBUG_PAGEALLOC together with CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT; 
mainline 4.6-rc3 works well.


It should not be arch specific although I caught it on ARM64. I 
suspect this might be caused by Hugh's huge tmpfs patches.


Thanks,
Yang




KASAN can't change FRAME_WARN

2016-04-19 Thread Shi, Yang

Hi Andrey,

When I enable KASAN for 4.5 and 4.6 (I didn't try older versions), I 
get FRAME_WARN warnings for frames whose size exceeds 2048 bytes.


Then I found the kconfig looks like:

range 0 8192
default 0 if KASAN
default 1024 if !64BIT
default 2048 if 64BIT

In my understanding, FRAME_WARN should default to 0 once KASAN is 
enabled, but it is still 2048. I tried a couple of fixes, e.g.


default 0 if KASAN
default 1024 if (!KASAN && !64BIT)
default 2048 if (!KASAN && 64BIT)

But nothing works, so I had to add "depends on !KASAN" to disable 
FRAME_WARN completely, which causes the kernel image size to increase.
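
A possible explanation, from general Kconfig semantics rather than
anything confirmed in this thread: Kconfig applies the first "default"
whose condition holds, so "default 0 if KASAN" listed first should
already win on a freshly generated config; however, a value already
recorded in .config is kept by oldconfig/olddefconfig, so an existing
CONFIG_FRAME_WARN=2048 would override every default. A minimal sketch of
clearing the stale value, assuming that is the culprit:

# drop the recorded value so the KASAN default gets re-evaluated
sed -i '/^CONFIG_FRAME_WARN=/d' .config
make olddefconfig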


Any hint is appreciated.

Thanks,
Yang




Re: [PATCHv7 00/29] THP-enabled tmpfs/shmem using compound pages

2016-04-19 Thread Shi, Yang

On 4/19/2016 7:33 AM, Jerome Marchand wrote:

On 04/19/2016 12:55 AM, Shi, Yang wrote:

2. I ran my THP test (generated a program with 4MB text section) on both
x86-64 and ARM64 with yours and Hugh's patches (linux-next tree), I got
the program execution time reduced by ~12% on x86-64, it looks very
impressive.

But, on ARM64, there is just ~3% change, and sometimes huge tmpfs may
show even worse data than non-hugepage.

Both yours and Hugh's patches has the same behavior.

Any idea?


Just a shot in the dark, but what page size do you use? If you use 4k
pages, then hugepage size should be the same as on x86 and a similar


I do use 4K pages for both x86-64 and ARM64 in my testing.

Thanks,
Yang


behavior could be expected. Otherwise, hugepages would be too big to be
taken advantage of by your test program.

Jerome
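
For completeness, the arithmetic behind Jerome's point: a PMD-level huge
page covers one page table's worth of base pages, so its size grows with
the base page size. A small standalone sketch (8-byte page-table entries
assumed):

#include <stdio.h>

int main(void)
{
	unsigned long base[] = { 4096, 16384, 65536 };	/* arm64 granules */

	for (int i = 0; i < 3; i++) {
		unsigned long ptes = base[i] / 8;	/* entries per table */

		printf("base %lu KB -> PMD huge page %lu MB\n",
		       base[i] / 1024, (base[i] * ptes) >> 20);
	}
	return 0;	/* prints 2 MB, 32 MB and 512 MB respectively */
}

With 64K base pages a PMD huge page would be 512MB, far too big for a
4MB text section to benefit from; with 4K pages both arm64 and x86-64
get the same 2MB huge pages.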






Re: [PATCHv7 00/29] THP-enabled tmpfs/shmem using compound pages

2016-04-18 Thread Shi, Yang

Hi Kirill,

Finally, I got some time to look into and try yours and Hugh's patches, 
and ran into two problems.


1. A quick boot up test on my ARM64 machine with your v7 tree shows some 
unexpected error:


systemd-journald[285]: Failed to save stream data 
/run/systemd/journal/streams/8:16863: No space left on device
systemd-journald[285]: Failed to save stream data 
/run/systemd/journal/streams/8:16865: No space left on device
 Starting DNS forwarder and DHCP server.systemd-journald[285]: 
Failed to save stream data /run/systemd/journal/streams/8:16867: No 
space left on device

..
systemd-journald[285]: Failed to save stream data 
/run/systemd/journal/streams/8:16869: No space left on device

 Starting Postfix Mail Transport Agent...
systemd-journald[285]: Failed to save stream data 
/run/systemd/journal/streams/8:16871: No space left on device

 Starting Berkeley Internet Name Domain (DNS)...
 Starting Wait for Network to be Configured...
systemd-journald[285]: Failed to save stream data 
/run/systemd/journal/streams/8:2422: No space left on device

[  OK  ] Started /etc/rc.local Compatibility.
[FAILED] Failed to start DNS forwarder and DHCP server.
See 'systemctl status dnsmasq.service' for details.
systemd-journald[285]: Failed to save stream data 
/run/systemd/journal/streams/8:2425: No space left on device

[  OK  ] Started Serial Getty on ttyS1.
[  OK  ] Started Serial Getty on ttyS0.
[  OK  ] Started Getty on tty1.
systemd-journald[285]: Failed to save stream data 
/run/systemd/journal/streams/8:2433: No space left on device

[FAILED] Failed to start Berkeley Internet Name Domain (DNS).
See 'systemctl status named.service' for details.


The /run dir is mounted as tmpfs.

The x86 boot doesn't produce such errors, and Hugh's patches don't have 
this problem.


2. I ran my THP test (generated a program with 4MB text section) on both 
x86-64 and ARM64 with yours and Hugh's patches (linux-next tree), I got 
the program execution time reduced by ~12% on x86-64, it looks very 
impressive.


But on ARM64 there is just a ~3% change, and sometimes huge tmpfs even 
shows worse numbers than non-hugepage.


Both yours and Hugh's patches have the same behavior.

Any idea?

Thanks,
Yang


On 4/15/2016 5:23 PM, Kirill A. Shutemov wrote:

This is probably the last update before the mm summit. Main focus is on
khugepaged stability.

khugepaged is in more reasonable shape now. I missed quite a few corner
cases on the first try. I have run this version through LTP, trinity and
syzkaller without crashes so far.

The patchset is on top of v4.6-rc3 plus Hugh's "easy preliminaries to
THPagecache" and Ebru's khugepaged swapin patches from the -mm tree.

Git tree:

git://git.kernel.org/pub/scm/linux/kernel/git/kas/linux.git hugetmpfs/v7

== Changelog ==

v7:
   - khugepaged updates:
 + fix page leak/page cache corruption on collapse fail;
  + filter out VMAs not suitable for huge pages due to misaligned vm_pgoff;
 + fix build without CONFIG_SHMEM;
 + drop few over-protective checks;
   - fix bogus VM_BUG_ON() in __delete_from_page_cache();

v6:
   - experimental collapse support;
   - fix swapout mapped huge pages;
   - fix page leak in faultaround code;
   - fix excessive huge page allocation with huge=within_size;
   - rename VM_NO_THP to VM_NO_KHUGEPAGED;
   - fix condition in hugepage_madvise();
   - accounting reworked again;

v5:
   - add FileHugeMapped to /proc/PID/smaps;
   - make FileHugeMapped in meminfo aligned with other fields;
   - Documentation/vm/transhuge.txt updated;

v4:
   - first four patch were applied to -mm tree;
   - drop pages beyond i_size on split_huge_pages;
   - few small random bugfixes;

v3:
   - the huge= mount option can now have the values always, within_size,
 advice and never;
   - sysctl handle is replaced with sysfs knob;
   - MADV_HUGEPAGE/MADV_NOHUGEPAGE is now respected on page allocation via
 page fault;
   - mlock() handling had been fixed;
   - bunch of smaller bugfixes and cleanups.

== Design overview ==

Huge pages are allocated by shmem when it's allowed (by mount option) and
there are no entries for the range in the radix-tree. A huge page is
represented by HPAGE_PMD_NR entries in the radix-tree.

MM core maps a page with a PMD if ->fault() returns a huge page and the
VMA is suitable for huge pages (size, alignment). There's no need for two
requests to the filesystem: the filesystem returns a huge page if it can,
with graceful fallback to small pages otherwise.
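
A minimal userspace sketch of that suitability check (this is not the
kernel's own helper; 4K base pages assumed, and the real check must also
require a suitably aligned vm_pgoff for file mappings, as the changelog
above notes):

#include <stdbool.h>
#include <stdint.h>

#define HPAGE_PMD_SIZE ((uint64_t)2 << 20)	/* 2MB with 4K base pages */

/* Can the faulting address be mapped by a PMD inside this VMA? */
static bool vma_suitable_for_pmd(uint64_t vm_start, uint64_t vm_end,
				 uint64_t fault_addr)
{
	uint64_t haddr = fault_addr & ~(HPAGE_PMD_SIZE - 1);

	return haddr >= vm_start && haddr + HPAGE_PMD_SIZE <= vm_end;
}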

As with DAX, split_huge_pmd() is implemented by unmapping the PMD: we can
re-fault the page with PTEs later.

Basic scheme for split_huge_page() is the same as for anon-THP.
Few differences:

   - File pages are on the radix-tree, so we have head->_count offset by
 HPAGE_PMD_NR. The count gets distributed to the small pages during split.

   - mapping->tree_lock prevents non-lockless access to pages under split
 over radix-tree;

   - Lockless access is prevented by setting the head->_count to 0 during
 split, so 


Re: [PATCH] locktorture: make verbose writable and control stats print

2016-04-15 Thread Shi, Yang

On 4/15/2016 5:09 PM, Paul E. McKenney wrote:

On Fri, Apr 15, 2016 at 04:45:32PM -0700, Shi, Yang wrote:

On 4/15/2016 4:26 PM, Paul E. McKenney wrote:

On Fri, Apr 15, 2016 at 01:28:11PM -0700, Yang Shi wrote:

When the locktorture test is built into the kernel image, it keeps
printing out stats information even though no lock type is specified.

There is already a verbose parameter to control printing, but it is
read-only, so it can't be changed at runtime. Make verbose read-write
and let it control the stats printing.

Signed-off-by: Yang Shi <yang@linaro.org>


Interesting change!

But just out of curiosity, when you boot with locktorture built in,
do you specify the shutdown_secs boot parameter?  If so, another


No, just use the default value, which is 0 for shutdown_secs.


approach would be to shutdown immediately upon detecting an error
during initialization.


In my case, it looks like there is no error involved.


You said that there is no lock type specified, but that should mean that
the default ("spin_lock") is chosen.  If so, I would expect it to just


Yes, spin_lock is chosen by default.


do the test, at least if locktorture.torture_runnable has been set.


But the default value of torture_runnable is 0, and it is a read-only 
parameter too. This prevents the torture test from running when it is 
built into the kernel instead of built as a module.
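
For what it's worth, a module_param with 0444 permissions can still be
set on the kernel command line at boot; only runtime changes via sysfs
are refused. So booting with something like the following should start
the built-in test:

locktorture.torture_runnable=1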


Actually, I'm confused about why there is no LOCK_TORTURE_TEST_RUNNABLE 
Kconfig option like RCU torture has.


Thanks,
Yang



Either way, the usual way to make locktorture shut up would be to boot
with locktorture.stat_interval=0.


If not, I would like to know more about your use case.


In my test, I just built the locktorture test into the kernel instead of
a module and then checked how it behaves, no specific purpose.

It sounds like that is not the normal way to use it.


Agreed, I do believe that this is a case of "working as designed".

Thanx, Paul






Re: [PATCH] locktorture: make verbose writable and control stats print

2016-04-15 Thread Shi, Yang

On 4/15/2016 4:26 PM, Paul E. McKenney wrote:

On Fri, Apr 15, 2016 at 01:28:11PM -0700, Yang Shi wrote:

When the locktorture test is built into the kernel image, it keeps
printing out stats information even though no lock type is specified.

There is already a verbose parameter to control printing, but it is
read-only, so it can't be changed at runtime. Make verbose read-write
and let it control the stats printing.

Signed-off-by: Yang Shi <yang@linaro.org>


Interesting change!

But just out of curiosity, when you boot with locktorture built in,
do you specify the shutdown_secs boot parameter?  If so, another


No, just use the default value, which is 0 for shutdown_secs.


approach would be to shutdown immediately upon detecting an error
during initialization.


In my case, it looks like there is no error involved.



If not, I would like to know more about your use case.


In my test, I just built the locktorture test into the kernel instead of
a module and then checked how it behaves, no specific purpose.


It sounds like that is not the normal way to use it.

Thanks,
Yang



Thanx, Paul


---
  include/linux/torture.h      |  2 ++
  kernel/locking/locktorture.c | 11 +++++++----
  2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/include/linux/torture.h b/include/linux/torture.h
index 7759fc3..86d6e54 100644
--- a/include/linux/torture.h
+++ b/include/linux/torture.h
@@ -48,6 +48,8 @@
 	do { if (verbose) pr_alert("%s" TORTURE_FLAG " %s\n", torture_type, s); } while (0)
 #define VERBOSE_TOROUT_ERRSTRING(s) \
 	do { if (verbose) pr_alert("%s" TORTURE_FLAG "!!! %s\n", torture_type, s); } while (0)
+#define VERBOSE_STRING(s) \
+	do { if (verbose) pr_alert("%s", s); } while (0)
 
 /* Definitions for online/offline exerciser. */
 int torture_onoff_init(long ooholdoff, long oointerval);
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index 8ef1919..d9838a3 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -55,8 +55,11 @@ torture_param(int, shutdown_secs, 0, "Shutdown time (j), <= zero to disable.");
 torture_param(int, stat_interval, 60,
 	      "Number of seconds between stats printk()s");
 torture_param(int, stutter, 5, "Number of jiffies to run/halt test, 0=disable");
-torture_param(bool, verbose, true,
-	      "Enable verbose debugging printk()s");
+
+static bool verbose = true;
+module_param(verbose, bool, 0644);
+MODULE_PARM_DESC(verbose,
+		 "Enable verbose debugging printk()s");
 
 static char *torture_type = "spin_lock";
 module_param(torture_type, charp, 0444);
@@ -693,7 +696,7 @@ static void lock_torture_stats_print(void)
 	}
 
 	__torture_print_stats(buf, cxt.lwsa, true);
-	pr_alert("%s", buf);
+	VERBOSE_STRING(buf);
 	kfree(buf);
 
 	if (cxt.cur_ops->readlock) {
@@ -705,7 +708,7 @@ static void lock_torture_stats_print(void)
 	}
 
 	__torture_print_stats(buf, cxt.lrsa, false);
-	pr_alert("%s", buf);
+	VERBOSE_STRING(buf);
 	kfree(buf);
 	}
 }
--
2.0.2
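
With verbose switched to mode 0644 as in the patch above, the knob
should also become writable at runtime through sysfs, e.g. (a sketch;
the path assumes the usual locktorture module name, which also applies
when the code is built in):

echo 0 > /sys/module/locktorture/parameters/verbose   # silence stats
echo 1 > /sys/module/locktorture/parameters/verbose   # re-enable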








Re: [PATCH] arm64: Kconfig: make SCHED_MC and SCHED_SMT depend on SMP

2016-04-14 Thread Shi, Yang

On 4/14/2016 1:47 AM, Will Deacon wrote:

On Wed, Apr 13, 2016 at 05:54:12PM -0700, Yang Shi wrote:

SCHED_MC and SCHED_SMT are pointless when SMP is disabled.
Although SMP is rarely disabled for ARM64, it looks more consistent to
have such a dependency in Kconfig.


You can't disable CONFIG_SMP for arm64 -- we select it unconditionally
in the kconfig.


Thanks Will. I didn't realize ARM64 has SMP selected unconditionally; it 
looks like the patch is pointless.


A follow-up question: I know ARM64 has no UP implementation now, so it 
makes sense to have SMP selected unconditionally; however, it might be 
more flexible to make SMP configurable as x86 and other architectures 
do. It may also help to find more bugs in !SMP builds.


Thanks,
Yang



Will






Re: [RFC V4 PATCH] trace: writeback: replace cgroup path to cgroup ino

2016-03-06 Thread Shi, Yang

Hi Steven,

Any comment on this one? I suppose it will be taken through your tracing tree?

Thanks,
Yang


On 3/4/2016 4:52 AM, Tejun Heo wrote:

On Thu, Mar 03, 2016 at 01:08:57AM -0800, Yang Shi wrote:

commit 5634cc2aa9aebc77bc862992e7805469dcf83dac ("writeback: update writeback
tracepoints to report cgroup") made writeback tracepoints print out the cgroup
path when CGROUP_WRITEBACK is enabled, but it may trigger the below bug on -rt
kernels since kernfs_path and kernfs_path_len are called from tracepoints, and
they acquire a spinlock that is sleepable on -rt kernels.
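
A minimal sketch of the idea with stubbed types rather than the real
kernel structures (in the kernel this is essentially cgroup_ino() on
wb->memcg_css->cgroup): reporting the inode number is a plain field
read, while building the path with kernfs_path() walks the kernfs tree
under a lock that can sleep on -rt:

struct kernfs_node { unsigned long ino; };
struct cgroup { struct kernfs_node *kn; };
struct cgroup_subsys_state { struct cgroup *cgroup; };
struct bdi_writeback { struct cgroup_subsys_state *memcg_css; };

/* tracepoint-safe: no locks taken, just pointer chasing and one load */
static unsigned long wb_trace_cgroup_ino(struct bdi_writeback *wb)
{
	return wb->memcg_css->cgroup->kn->ino;
}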


Acked-by: Tejun Heo 

Thanks.






