Re: [PATCH] arm: kprobe: replace patch_lock to raw lock
On 12/1/2016 6:13 AM, Sebastian Andrzej Siewior wrote: On 2016-11-10 16:17:55 [-0800], Yang Shi wrote: Since patch_text_stop_machine() is called in stop_machine() which disables IRQ, sleepable lock should be not used in this atomic context, so replace patch_lock to raw lock. Signed-off-by: Yang Shi <yang@linaro.org> This can also go upstream. Acked-by: Sebastian Andrzej Siewior <bige...@linutronix.de> Yes, thanks for acking. Russell, Could you please consider this patch? Thanks, Yang Sebastian
Re: [PATCH] arm: kprobe: replace patch_lock to raw lock
On 12/1/2016 6:13 AM, Sebastian Andrzej Siewior wrote: On 2016-11-10 16:17:55 [-0800], Yang Shi wrote: Since patch_text_stop_machine() is called in stop_machine() which disables IRQ, sleepable lock should be not used in this atomic context, so replace patch_lock to raw lock. Signed-off-by: Yang Shi This can also go upstream. Acked-by: Sebastian Andrzej Siewior Yes, thanks for acking. Russell, Could you please consider this patch? Thanks, Yang Sebastian
Re: [v2 PATCH] arm64: kasan: instrument user memory access API
Hi Will & Catalin, Any comment for this patch? Thanks, Yang On 5/27/2016 2:01 PM, Yang Shi wrote: The upstream commit 1771c6e1a567ea0ba20a4ffe68a1419fd8ef ("x86/kasan: instrument user memory access API") added KASAN instrument to x86 user memory access API, so added such instrument to ARM64 too. Define __copy_to/from_user in C in order to add kasan_check_read/write call, rename assembly implementation to __arch_copy_to/from_user. Tested by test_kasan module. Signed-off-by: Yang Shi <yang@linaro.org> --- v2: Adopted the comment from Andrey and Mark to add kasan_check_read/write into __copy_to/from_user. arch/arm64/include/asm/uaccess.h | 25 + arch/arm64/kernel/arm64ksyms.c | 4 ++-- arch/arm64/lib/copy_from_user.S | 4 ++-- arch/arm64/lib/copy_to_user.S| 4 ++-- 4 files changed, 27 insertions(+), 10 deletions(-) diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index 0685d74..4dc9a8f 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -23,6 +23,7 @@ */ #include #include +#include #include #include @@ -269,15 +270,29 @@ do { \ -EFAULT;\ }) -extern unsigned long __must_check __copy_from_user(void *to, const void __user *from, unsigned long n); -extern unsigned long __must_check __copy_to_user(void __user *to, const void *from, unsigned long n); +extern unsigned long __must_check __arch_copy_from_user(void *to, const void __user *from, unsigned long n); +extern unsigned long __must_check __arch_copy_to_user(void __user *to, const void *from, unsigned long n); extern unsigned long __must_check __copy_in_user(void __user *to, const void __user *from, unsigned long n); extern unsigned long __must_check __clear_user(void __user *addr, unsigned long n); +static inline unsigned long __must_check __copy_from_user(void *to, const void __user *from, unsigned long n) +{ + kasan_check_write(to, n); + return __arch_copy_from_user(to, from, n); +} + +static inline unsigned long __must_check __copy_to_user(void __user 
*to, const void *from, unsigned long n) +{ + kasan_check_read(from, n); + return __arch_copy_to_user(to, from, n); +} + static inline unsigned long __must_check copy_from_user(void *to, const void __user *from, unsigned long n) { + kasan_check_write(to, n); + if (access_ok(VERIFY_READ, from, n)) - n = __copy_from_user(to, from, n); + n = __arch_copy_from_user(to, from, n); else /* security hole - plug it */ memset(to, 0, n); return n; @@ -285,8 +300,10 @@ static inline unsigned long __must_check copy_from_user(void *to, const void __u static inline unsigned long __must_check copy_to_user(void __user *to, const void *from, unsigned long n) { + kasan_check_read(from, n); + if (access_ok(VERIFY_WRITE, to, n)) - n = __copy_to_user(to, from, n); + n = __arch_copy_to_user(to, from, n); return n; } diff --git a/arch/arm64/kernel/arm64ksyms.c b/arch/arm64/kernel/arm64ksyms.c index 678f30b0..2dc4440 100644 --- a/arch/arm64/kernel/arm64ksyms.c +++ b/arch/arm64/kernel/arm64ksyms.c @@ -34,8 +34,8 @@ EXPORT_SYMBOL(copy_page); EXPORT_SYMBOL(clear_page); /* user mem (segment) */ -EXPORT_SYMBOL(__copy_from_user); -EXPORT_SYMBOL(__copy_to_user); +EXPORT_SYMBOL(__arch_copy_from_user); +EXPORT_SYMBOL(__arch_copy_to_user); EXPORT_SYMBOL(__clear_user); EXPORT_SYMBOL(__copy_in_user); diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S index 17e8306..0b90497 100644 --- a/arch/arm64/lib/copy_from_user.S +++ b/arch/arm64/lib/copy_from_user.S @@ -66,7 +66,7 @@ .endm end.reqx5 -ENTRY(__copy_from_user) +ENTRY(__arch_copy_from_user) ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \ CONFIG_ARM64_PAN) add end, x0, x2 @@ -75,7 +75,7 @@ ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \ CONFIG_ARM64_PAN) mov x0, #0 // Nothing to copy ret -ENDPROC(__copy_from_user) +ENDPROC(__arch_copy_from_user) .section .fixup,"ax" .align 2 diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S index 21faae6..7a7efe2 
100644 --- a/arch/arm64/lib/copy_to_user.S +++ b/arch/arm64/lib/copy_to_user.S @@ -65,7 +65,7 @@ .endm end.reqx5 -ENTRY(__copy_to_user) +ENTRY(__arch_copy_to_user) ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \ CONFIG_ARM64_PAN) add end, x0, x2 @@ -74,7 +74,7 @@ ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \
Re: [v2 PATCH] arm64: kasan: instrument user memory access API
Hi Will & Catalin, Any comment for this patch? Thanks, Yang On 5/27/2016 2:01 PM, Yang Shi wrote: The upstream commit 1771c6e1a567ea0ba20a4ffe68a1419fd8ef ("x86/kasan: instrument user memory access API") added KASAN instrument to x86 user memory access API, so added such instrument to ARM64 too. Define __copy_to/from_user in C in order to add kasan_check_read/write call, rename assembly implementation to __arch_copy_to/from_user. Tested by test_kasan module. Signed-off-by: Yang Shi --- v2: Adopted the comment from Andrey and Mark to add kasan_check_read/write into __copy_to/from_user. arch/arm64/include/asm/uaccess.h | 25 + arch/arm64/kernel/arm64ksyms.c | 4 ++-- arch/arm64/lib/copy_from_user.S | 4 ++-- arch/arm64/lib/copy_to_user.S| 4 ++-- 4 files changed, 27 insertions(+), 10 deletions(-) diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index 0685d74..4dc9a8f 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -23,6 +23,7 @@ */ #include #include +#include #include #include @@ -269,15 +270,29 @@ do { \ -EFAULT;\ }) -extern unsigned long __must_check __copy_from_user(void *to, const void __user *from, unsigned long n); -extern unsigned long __must_check __copy_to_user(void __user *to, const void *from, unsigned long n); +extern unsigned long __must_check __arch_copy_from_user(void *to, const void __user *from, unsigned long n); +extern unsigned long __must_check __arch_copy_to_user(void __user *to, const void *from, unsigned long n); extern unsigned long __must_check __copy_in_user(void __user *to, const void __user *from, unsigned long n); extern unsigned long __must_check __clear_user(void __user *addr, unsigned long n); +static inline unsigned long __must_check __copy_from_user(void *to, const void __user *from, unsigned long n) +{ + kasan_check_write(to, n); + return __arch_copy_from_user(to, from, n); +} + +static inline unsigned long __must_check __copy_to_user(void __user *to, const void 
*from, unsigned long n) +{ + kasan_check_read(from, n); + return __arch_copy_to_user(to, from, n); +} + static inline unsigned long __must_check copy_from_user(void *to, const void __user *from, unsigned long n) { + kasan_check_write(to, n); + if (access_ok(VERIFY_READ, from, n)) - n = __copy_from_user(to, from, n); + n = __arch_copy_from_user(to, from, n); else /* security hole - plug it */ memset(to, 0, n); return n; @@ -285,8 +300,10 @@ static inline unsigned long __must_check copy_from_user(void *to, const void __u static inline unsigned long __must_check copy_to_user(void __user *to, const void *from, unsigned long n) { + kasan_check_read(from, n); + if (access_ok(VERIFY_WRITE, to, n)) - n = __copy_to_user(to, from, n); + n = __arch_copy_to_user(to, from, n); return n; } diff --git a/arch/arm64/kernel/arm64ksyms.c b/arch/arm64/kernel/arm64ksyms.c index 678f30b0..2dc4440 100644 --- a/arch/arm64/kernel/arm64ksyms.c +++ b/arch/arm64/kernel/arm64ksyms.c @@ -34,8 +34,8 @@ EXPORT_SYMBOL(copy_page); EXPORT_SYMBOL(clear_page); /* user mem (segment) */ -EXPORT_SYMBOL(__copy_from_user); -EXPORT_SYMBOL(__copy_to_user); +EXPORT_SYMBOL(__arch_copy_from_user); +EXPORT_SYMBOL(__arch_copy_to_user); EXPORT_SYMBOL(__clear_user); EXPORT_SYMBOL(__copy_in_user); diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S index 17e8306..0b90497 100644 --- a/arch/arm64/lib/copy_from_user.S +++ b/arch/arm64/lib/copy_from_user.S @@ -66,7 +66,7 @@ .endm end.reqx5 -ENTRY(__copy_from_user) +ENTRY(__arch_copy_from_user) ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \ CONFIG_ARM64_PAN) add end, x0, x2 @@ -75,7 +75,7 @@ ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \ CONFIG_ARM64_PAN) mov x0, #0 // Nothing to copy ret -ENDPROC(__copy_from_user) +ENDPROC(__arch_copy_from_user) .section .fixup,"ax" .align 2 diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S index 21faae6..7a7efe2 100644 --- 
a/arch/arm64/lib/copy_to_user.S +++ b/arch/arm64/lib/copy_to_user.S @@ -65,7 +65,7 @@ .endm end.reqx5 -ENTRY(__copy_to_user) +ENTRY(__arch_copy_to_user) ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \ CONFIG_ARM64_PAN) add end, x0, x2 @@ -74,7 +74,7 @@ ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \ CONFIG_ARM64_PAN) mov x0, #0 ret -ENDPROC(__copy_to_user)
Re: [PATCH] mm: check the return value of lookup_page_ext for all call sites
On 6/1/2016 10:00 PM, Minchan Kim wrote: On Wed, Jun 01, 2016 at 01:40:48PM -0700, Shi, Yang wrote: On 5/29/2016 11:11 PM, Minchan Kim wrote: On Fri, May 27, 2016 at 11:16:41AM -0700, Shi, Yang wrote: If we goes this way, how to guarantee this race? Thanks for pointing out this. It sounds reasonable. However, this should be only possible to happen on 32 bit since just 32 bit version page_is_idle() calls lookup_page_ext(), it doesn't do it on 64 bit. And, such race condition should exist regardless of whether DEBUG_VM is enabled or not, right? rcu might be good enough to protect it. A quick fix may look like: diff --git a/include/linux/page_idle.h b/include/linux/page_idle.h index 8f5d4ad..bf0cd6a 100644 --- a/include/linux/page_idle.h +++ b/include/linux/page_idle.h @@ -77,8 +77,12 @@ static inline bool test_and_clear_page_young(struct page *page) static inline bool page_is_idle(struct page *page) { struct page_ext *page_ext; + + rcu_read_lock(); page_ext = lookup_page_ext(page); + rcu_read_unlock(); + if (unlikely(!page_ext)) return false; diff --git a/mm/page_ext.c b/mm/page_ext.c index 56b160f..94927c9 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -183,7 +183,6 @@ struct page_ext *lookup_page_ext(struct page *page) { unsigned long pfn = page_to_pfn(page); struct mem_section *section = __pfn_to_section(pfn); -#if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAGE_POISONING) /* * The sanity checks the page allocator does upon freeing a * page can reach here before the page_ext arrays are @@ -195,7 +194,7 @@ struct page_ext *lookup_page_ext(struct page *page) */ if (!section->page_ext) return NULL; -#endif + return section->page_ext + pfn; } @@ -279,7 +278,8 @@ static void __free_page_ext(unsigned long pfn) return; base = ms->page_ext + pfn; free_page_ext(base); - ms->page_ext = NULL; + rcu_assign_pointer(ms->page_ext, NULL); + synchronize_rcu(); How does it fix the problem? I cannot understand your point. 
Assigning NULL pointer to page_ext will be blocked until rcu_read_lock critical section is done, so the lookup and writing operations will be serialized. And, rcu_read_lock disables preempt too. I meant your rcu_read_lock in page_idle should cover test_bit op. Yes, definitely. Thanks for catching it. One more thing, you should use rcu_dereference. I will check which one is the best since I saw some use rcu_assign_pointer. As well, please cover memory onlining case I mentioned in another thread as well as memory offlining. I will look into it too. Thanks, Yang Anyway, to me, every caller of page_ext should prepare lookup_page_ext can return NULL anytime and they should use rcu_read_[un]lock, which is not good. :(
Re: [PATCH] mm: check the return value of lookup_page_ext for all call sites
On 6/1/2016 10:00 PM, Minchan Kim wrote: On Wed, Jun 01, 2016 at 01:40:48PM -0700, Shi, Yang wrote: On 5/29/2016 11:11 PM, Minchan Kim wrote: On Fri, May 27, 2016 at 11:16:41AM -0700, Shi, Yang wrote: If we goes this way, how to guarantee this race? Thanks for pointing out this. It sounds reasonable. However, this should be only possible to happen on 32 bit since just 32 bit version page_is_idle() calls lookup_page_ext(), it doesn't do it on 64 bit. And, such race condition should exist regardless of whether DEBUG_VM is enabled or not, right? rcu might be good enough to protect it. A quick fix may look like: diff --git a/include/linux/page_idle.h b/include/linux/page_idle.h index 8f5d4ad..bf0cd6a 100644 --- a/include/linux/page_idle.h +++ b/include/linux/page_idle.h @@ -77,8 +77,12 @@ static inline bool test_and_clear_page_young(struct page *page) static inline bool page_is_idle(struct page *page) { struct page_ext *page_ext; + + rcu_read_lock(); page_ext = lookup_page_ext(page); + rcu_read_unlock(); + if (unlikely(!page_ext)) return false; diff --git a/mm/page_ext.c b/mm/page_ext.c index 56b160f..94927c9 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -183,7 +183,6 @@ struct page_ext *lookup_page_ext(struct page *page) { unsigned long pfn = page_to_pfn(page); struct mem_section *section = __pfn_to_section(pfn); -#if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAGE_POISONING) /* * The sanity checks the page allocator does upon freeing a * page can reach here before the page_ext arrays are @@ -195,7 +194,7 @@ struct page_ext *lookup_page_ext(struct page *page) */ if (!section->page_ext) return NULL; -#endif + return section->page_ext + pfn; } @@ -279,7 +278,8 @@ static void __free_page_ext(unsigned long pfn) return; base = ms->page_ext + pfn; free_page_ext(base); - ms->page_ext = NULL; + rcu_assign_pointer(ms->page_ext, NULL); + synchronize_rcu(); How does it fix the problem? I cannot understand your point. 
Assigning NULL pointer to page_ext will be blocked until rcu_read_lock critical section is done, so the lookup and writing operations will be serialized. And, rcu_read_lock disables preempt too. I meant your rcu_read_lock in page_idle should cover test_bit op. Yes, definitely. Thanks for catching it. One more thing, you should use rcu_dereference. I will check which one is the best since I saw some use rcu_assign_pointer. As well, please cover memory onlining case I mentioned in another thread as well as memory offlining. I will look into it too. Thanks, Yang Anyway, to me, every caller of page_ext should prepare lookup_page_ext can return NULL anytime and they should use rcu_read_[un]lock, which is not good. :(
Re: [PATCH] mm: check the return value of lookup_page_ext for all call sites
On 5/29/2016 11:08 PM, Minchan Kim wrote: On Mon, May 30, 2016 at 02:39:06PM +0900, Joonsoo Kim wrote: On Fri, May 27, 2016 at 05:11:08PM +0900, Minchan Kim wrote: On Fri, May 27, 2016 at 03:08:39PM +0900, Joonsoo Kim wrote: On Fri, May 27, 2016 at 02:14:32PM +0900, Minchan Kim wrote: On Thu, May 26, 2016 at 04:15:28PM -0700, Shi, Yang wrote: On 5/25/2016 5:37 PM, Minchan Kim wrote: On Tue, May 24, 2016 at 11:58:11AM +0900, Minchan Kim wrote: On Mon, May 23, 2016 at 10:16:08AM -0700, Yang Shi wrote: Per the discussion with Joonsoo Kim [1], we need check the return value of lookup_page_ext() for all call sites since it might return NULL in some cases, although it is unlikely, i.e. memory hotplug. Tested with ltp with "page_owner=0". [1] http://lkml.kernel.org/r/20160519002809.GA10245@js1304-P5Q-DELUXE Signed-off-by: Yang Shi <yang@linaro.org> I didn't read code code in detail to see how page_ext memory space allocated in boot code and memory hotplug but to me, it's not good to check NULL whenever we calls lookup_page_ext. More dangerous thing is now page_ext is used by optionable feature(ie, not critical for system stability) but if we want to use page_ext as another important tool for the system in future, it could be a serious problem. Hello, Minchan. Hi Joonsoo, I wonder how pages that isn't managed by kernel yet will cause serious problem. Until onlining, these pages are out of our scope. Any information about them would be useless until it is actually activated. I guess that returning NULL for those pages will not hurt any functionality. Do you have any possible scenario that this causes the serious problem? I don't have any specific usecase now. That's why I said "in future". And I don't want to argue whether there is possible scenario or not to make the feature useful but if you want, I should write novel. One of example, pop up my mind, xen, hv and even memory_hotplug itself might want to use page_ext for their functionality extension to hook guest pages. 
There is no detail so I can't guess how to use it and how it causes the serious problem. But, we can do it when it is really needed. My opinion is that page_ext is extension of struct page so it would be better to allow any operation on struct page without any limitation if we can do it. Whether it's useful or useless depend on random usecase and we don't need to limit that way from the beginning. If there is no drawback, it would be a better design. But, we have trade-off that for some case that the memory is added but not onlined, there is memory saving if we allocate page_ext later. So, in current situation that there is no user to require such guarantee, I don't think it's worth doing right now. However, current design allows deferred page_ext population so any user of page_ext should keep it in mind and should either make fallback plan or don't use page_ext for those cases. If we decide go this way through discussion, at least, we should make such limitation more clear to somewhere in this chance, maybe around page_ext_operation->need comment. Agreed. Okay, We realized from this discussion that by design, guest of page_ext at the meoment should know his page_ext access from the page can be failed so every caller should prepare for it. Shi, Yang, Please include some comment about that in your patch to prevent further reviewer waste his time with repeating same discussion and client of page_ext can know the limitation. My comment's point is that we should consider that way at least. It's worth to discuss pros and cons, what's the best and what makes that way hesitate if we can't. Yes, your suggestion would be good for future direction, but, for now, I think that inserting NULL to all callers is right fix. 1) Current design that page_ext is allocated when online is design decision of page_ext to save memory as much as possible. Fixing possible problem within this design decision looks good to me. Okay. Shi Yang, please include this comment in your patch, too. 
There is already such comment in lookup_page_ext: * The sanity checks the page allocator does upon freeing a * page can reach here before the page_ext arrays are * allocated when feeding a range of pages to the allocator * for the first time during bootup or memory hotplug. I will add more details, i.e. newly added but not onlined memory could reach here too. 2) Maybe, we need to backport fixes because it would crash older kernels. In this case, fix with NULL is easy to backport. Agreed. Then, Shi Yang need to mark the page as stable. Agreed. Shi, Please resend your patch with hard testing and more better description with marking it as stable. Will come with an incremental patch to remove CONFIG_DEBUG #ifdef and fix and improve the comments since the comments are included by it. Thanks, Yang And I know another race problem about Shi's patch.
Re: [PATCH] mm: check the return value of lookup_page_ext for all call sites
On 5/29/2016 11:08 PM, Minchan Kim wrote: On Mon, May 30, 2016 at 02:39:06PM +0900, Joonsoo Kim wrote: On Fri, May 27, 2016 at 05:11:08PM +0900, Minchan Kim wrote: On Fri, May 27, 2016 at 03:08:39PM +0900, Joonsoo Kim wrote: On Fri, May 27, 2016 at 02:14:32PM +0900, Minchan Kim wrote: On Thu, May 26, 2016 at 04:15:28PM -0700, Shi, Yang wrote: On 5/25/2016 5:37 PM, Minchan Kim wrote: On Tue, May 24, 2016 at 11:58:11AM +0900, Minchan Kim wrote: On Mon, May 23, 2016 at 10:16:08AM -0700, Yang Shi wrote: Per the discussion with Joonsoo Kim [1], we need check the return value of lookup_page_ext() for all call sites since it might return NULL in some cases, although it is unlikely, i.e. memory hotplug. Tested with ltp with "page_owner=0". [1] http://lkml.kernel.org/r/20160519002809.GA10245@js1304-P5Q-DELUXE Signed-off-by: Yang Shi I didn't read code code in detail to see how page_ext memory space allocated in boot code and memory hotplug but to me, it's not good to check NULL whenever we calls lookup_page_ext. More dangerous thing is now page_ext is used by optionable feature(ie, not critical for system stability) but if we want to use page_ext as another important tool for the system in future, it could be a serious problem. Hello, Minchan. Hi Joonsoo, I wonder how pages that isn't managed by kernel yet will cause serious problem. Until onlining, these pages are out of our scope. Any information about them would be useless until it is actually activated. I guess that returning NULL for those pages will not hurt any functionality. Do you have any possible scenario that this causes the serious problem? I don't have any specific usecase now. That's why I said "in future". And I don't want to argue whether there is possible scenario or not to make the feature useful but if you want, I should write novel. One of example, pop up my mind, xen, hv and even memory_hotplug itself might want to use page_ext for their functionality extension to hook guest pages. 
There is no detail so I can't guess how to use it and how it causes the serious problem. But, we can do it when it is really needed. My opinion is that page_ext is extension of struct page so it would be better to allow any operation on struct page without any limitation if we can do it. Whether it's useful or useless depend on random usecase and we don't need to limit that way from the beginning. If there is no drawback, it would be a better design. But, we have trade-off that for some case that the memory is added but not onlined, there is memory saving if we allocate page_ext later. So, in current situation that there is no user to require such guarantee, I don't think it's worth doing right now. However, current design allows deferred page_ext population so any user of page_ext should keep it in mind and should either make fallback plan or don't use page_ext for those cases. If we decide go this way through discussion, at least, we should make such limitation more clear to somewhere in this chance, maybe around page_ext_operation->need comment. Agreed. Okay, We realized from this discussion that by design, guest of page_ext at the meoment should know his page_ext access from the page can be failed so every caller should prepare for it. Shi, Yang, Please include some comment about that in your patch to prevent further reviewer waste his time with repeating same discussion and client of page_ext can know the limitation. My comment's point is that we should consider that way at least. It's worth to discuss pros and cons, what's the best and what makes that way hesitate if we can't. Yes, your suggestion would be good for future direction, but, for now, I think that inserting NULL to all callers is right fix. 1) Current design that page_ext is allocated when online is design decision of page_ext to save memory as much as possible. Fixing possible problem within this design decision looks good to me. Okay. Shi Yang, please include this comment in your patch, too. 
There is already such comment in lookup_page_ext: * The sanity checks the page allocator does upon freeing a * page can reach here before the page_ext arrays are * allocated when feeding a range of pages to the allocator * for the first time during bootup or memory hotplug. I will add more details, i.e. newly added but not onlined memory could reach here too. 2) Maybe, we need to backport fixes because it would crash older kernels. In this case, fix with NULL is easy to backport. Agreed. Then, Shi Yang need to mark the page as stable. Agreed. Shi, Please resend your patch with hard testing and more better description with marking it as stable. Will come with an incremental patch to remove CONFIG_DEBUG #ifdef and fix and improve the comments since the comments are included by it. Thanks, Yang And I know another race problem about Shi's patch. I will reply to the thread.
Re: [PATCH] mm: check the return value of lookup_page_ext for all call sites
On 5/29/2016 11:11 PM, Minchan Kim wrote: On Fri, May 27, 2016 at 11:16:41AM -0700, Shi, Yang wrote: If we goes this way, how to guarantee this race? Thanks for pointing out this. It sounds reasonable. However, this should be only possible to happen on 32 bit since just 32 bit version page_is_idle() calls lookup_page_ext(), it doesn't do it on 64 bit. And, such race condition should exist regardless of whether DEBUG_VM is enabled or not, right? rcu might be good enough to protect it. A quick fix may look like: diff --git a/include/linux/page_idle.h b/include/linux/page_idle.h index 8f5d4ad..bf0cd6a 100644 --- a/include/linux/page_idle.h +++ b/include/linux/page_idle.h @@ -77,8 +77,12 @@ static inline bool test_and_clear_page_young(struct page *page) static inline bool page_is_idle(struct page *page) { struct page_ext *page_ext; + + rcu_read_lock(); page_ext = lookup_page_ext(page); + rcu_read_unlock(); + if (unlikely(!page_ext)) return false; diff --git a/mm/page_ext.c b/mm/page_ext.c index 56b160f..94927c9 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -183,7 +183,6 @@ struct page_ext *lookup_page_ext(struct page *page) { unsigned long pfn = page_to_pfn(page); struct mem_section *section = __pfn_to_section(pfn); -#if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAGE_POISONING) /* * The sanity checks the page allocator does upon freeing a * page can reach here before the page_ext arrays are @@ -195,7 +194,7 @@ struct page_ext *lookup_page_ext(struct page *page) */ if (!section->page_ext) return NULL; -#endif + return section->page_ext + pfn; } @@ -279,7 +278,8 @@ static void __free_page_ext(unsigned long pfn) return; base = ms->page_ext + pfn; free_page_ext(base); - ms->page_ext = NULL; + rcu_assign_pointer(ms->page_ext, NULL); + synchronize_rcu(); How does it fix the problem? I cannot understand your point. Assigning NULL pointer to page_Ext will be blocked until rcu_read_lock critical section is done, so the lookup and writing operations will be serialized. 
And, rcu_read_lock disables preempt too. Yang } static int __meminit online_page_ext(unsigned long start_pfn, Thanks, Yang kpageflags_read stable_page_flags page_is_idle lookup_page_ext section = __pfn_to_section(pfn) offline_pages memory_notify(MEM_OFFLINE) offline_page_ext ms->page_ext = NULL section->page_ext + pfn Thanks.
Re: [PATCH] mm: check the return value of lookup_page_ext for all call sites
On 5/29/2016 11:11 PM, Minchan Kim wrote: On Fri, May 27, 2016 at 11:16:41AM -0700, Shi, Yang wrote: If we goes this way, how to guarantee this race? Thanks for pointing out this. It sounds reasonable. However, this should be only possible to happen on 32 bit since just 32 bit version page_is_idle() calls lookup_page_ext(), it doesn't do it on 64 bit. And, such race condition should exist regardless of whether DEBUG_VM is enabled or not, right? rcu might be good enough to protect it. A quick fix may look like: diff --git a/include/linux/page_idle.h b/include/linux/page_idle.h index 8f5d4ad..bf0cd6a 100644 --- a/include/linux/page_idle.h +++ b/include/linux/page_idle.h @@ -77,8 +77,12 @@ static inline bool test_and_clear_page_young(struct page *page) static inline bool page_is_idle(struct page *page) { struct page_ext *page_ext; + + rcu_read_lock(); page_ext = lookup_page_ext(page); + rcu_read_unlock(); + if (unlikely(!page_ext)) return false; diff --git a/mm/page_ext.c b/mm/page_ext.c index 56b160f..94927c9 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -183,7 +183,6 @@ struct page_ext *lookup_page_ext(struct page *page) { unsigned long pfn = page_to_pfn(page); struct mem_section *section = __pfn_to_section(pfn); -#if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAGE_POISONING) /* * The sanity checks the page allocator does upon freeing a * page can reach here before the page_ext arrays are @@ -195,7 +194,7 @@ struct page_ext *lookup_page_ext(struct page *page) */ if (!section->page_ext) return NULL; -#endif + return section->page_ext + pfn; } @@ -279,7 +278,8 @@ static void __free_page_ext(unsigned long pfn) return; base = ms->page_ext + pfn; free_page_ext(base); - ms->page_ext = NULL; + rcu_assign_pointer(ms->page_ext, NULL); + synchronize_rcu(); How does it fix the problem? I cannot understand your point. Assigning NULL pointer to page_Ext will be blocked until rcu_read_lock critical section is done, so the lookup and writing operations will be serialized. 
And, rcu_read_lock disables preempt too. Yang } static int __meminit online_page_ext(unsigned long start_pfn, Thanks, Yang kpageflags_read stable_page_flags page_is_idle lookup_page_ext section = __pfn_to_section(pfn) offline_pages memory_notify(MEM_OFFLINE) offline_page_ext ms->page_ext = NULL section->page_ext + pfn Thanks.
Re: [PATCH] mm: check the return value of lookup_page_ext for all call sites
On 5/27/2016 1:02 PM, Andrew Morton wrote: On Thu, 26 May 2016 16:15:28 -0700 "Shi, Yang" <yang@linaro.org> wrote: I hope we consider this direction, too. Yang, Could you think about this? Thanks a lot for the suggestion. Sorry for the late reply, I was busy on preparing patches. I do agree this is a direction we should look into, but I haven't got time to think about it deeper. I hope Joonsoo could chime in too since he is the original author for page extension. Even, your patch was broken, I think. It doesn't work with !CONFIG_DEBUG_VM && !CONFIG_PAGE_POISONING because lookup_page_ext doesn't return NULL in that case. Actually, I think the #ifdef should be removed if lookup_page_ext() is possible to return NULL. It sounds not make sense returning NULL only when DEBUG_VM is enabled. It should return NULL no matter what debug config is selected. If Joonsoo agrees with me I'm going to come up with a patch to fix it. I've lost the plot here. What is the status of this patch? Latest version: Yes, this is the latest version. We are discussing about some future optimization. And, Minchan Kim pointed out a possible race condition which exists even before this patch. I proposed a quick fix, as long as they are happy to the fix, I will post it to the mailing list. Thanks, Yang From: Yang Shi <yang@linaro.org> Subject: mm: check the return value of lookup_page_ext for all call sites Per the discussion with Joonsoo Kim [1], we need check the return value of lookup_page_ext() for all call sites since it might return NULL in some cases, although it is unlikely, i.e. memory hotplug. Tested with ltp with "page_owner=0". 
[1] http://lkml.kernel.org/r/20160519002809.GA10245@js1304-P5Q-DELUXE [a...@linux-foundation.org: fix build-breaking typos] [a...@arndb.de: fix build problems from lookup_page_ext] Link: http://lkml.kernel.org/r/6285269.2CksypHdYp@wuerfel [a...@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/1464023768-31025-1-git-send-email-yang....@linaro.org Signed-off-by: Yang Shi <yang@linaro.org> Signed-off-by: Arnd Bergmann <a...@arndb.de> Cc: Joonsoo Kim <iamjoonsoo@lge.com> Signed-off-by: Andrew Morton <a...@linux-foundation.org> --- include/linux/page_idle.h | 43 ++-- mm/page_alloc.c |6 + mm/page_owner.c | 26 + mm/page_poison.c |8 +- mm/vmstat.c |2 + 5 files changed, 77 insertions(+), 8 deletions(-) diff -puN include/linux/page_idle.h~mm-check-the-return-value-of-lookup_page_ext-for-all-call-sites include/linux/page_idle.h --- a/include/linux/page_idle.h~mm-check-the-return-value-of-lookup_page_ext-for-all-call-sites +++ a/include/linux/page_idle.h @@ -46,33 +46,62 @@ extern struct page_ext_operations page_i static inline bool page_is_young(struct page *page) { - return test_bit(PAGE_EXT_YOUNG, _page_ext(page)->flags); + struct page_ext *page_ext = lookup_page_ext(page); + + if (unlikely(!page_ext)) + return false; + + return test_bit(PAGE_EXT_YOUNG, _ext->flags); } static inline void set_page_young(struct page *page) { - set_bit(PAGE_EXT_YOUNG, _page_ext(page)->flags); + struct page_ext *page_ext = lookup_page_ext(page); + + if (unlikely(!page_ext)) + return; + + set_bit(PAGE_EXT_YOUNG, _ext->flags); } static inline bool test_and_clear_page_young(struct page *page) { - return test_and_clear_bit(PAGE_EXT_YOUNG, - _page_ext(page)->flags); + struct page_ext *page_ext = lookup_page_ext(page); + + if (unlikely(!page_ext)) + return false; + + return test_and_clear_bit(PAGE_EXT_YOUNG, _ext->flags); } static inline bool page_is_idle(struct page *page) { - return test_bit(PAGE_EXT_IDLE, _page_ext(page)->flags); + struct page_ext *page_ext = 
lookup_page_ext(page); + + if (unlikely(!page_ext)) + return false; + + return test_bit(PAGE_EXT_IDLE, _ext->flags); } static inline void set_page_idle(struct page *page) { - set_bit(PAGE_EXT_IDLE, _page_ext(page)->flags); + struct page_ext *page_ext = lookup_page_ext(page); + + if (unlikely(!page_ext)) + return; + + set_bit(PAGE_EXT_IDLE, _ext->flags); } static inline void clear_page_idle(struct page *page) { - clear_bit(PAGE_EXT_IDLE, _page_ext(page)->flags); + struct page_ext *page_ext = lookup_page_ext(page); + + if (unlikely(!page_ext)) + return; + + clear_bit(PAGE_EXT_IDLE, _ext->flags); } #endif /* CONFIG_64BIT */ diff -puN mm/page_alloc.c~mm-check-the-return-value-of-lookup_page_ext-for-all-call-sites mm/page_alloc.c --- a/mm/page_alloc.c~mm-check-the-return-value-of-lookup_page_ext-f
Re: [PATCH] mm: check the return value of lookup_page_ext for all call sites
On 5/27/2016 1:02 PM, Andrew Morton wrote: On Thu, 26 May 2016 16:15:28 -0700 "Shi, Yang" wrote: I hope we consider this direction, too. Yang, Could you think about this? Thanks a lot for the suggestion. Sorry for the late reply, I was busy on preparing patches. I do agree this is a direction we should look into, but I haven't got time to think about it deeper. I hope Joonsoo could chime in too since he is the original author for page extension. Even, your patch was broken, I think. It doesn't work with !CONFIG_DEBUG_VM && !CONFIG_PAGE_POISONING because lookup_page_ext doesn't return NULL in that case. Actually, I think the #ifdef should be removed if lookup_page_ext() is possible to return NULL. It sounds not make sense returning NULL only when DEBUG_VM is enabled. It should return NULL no matter what debug config is selected. If Joonsoo agrees with me I'm going to come up with a patch to fix it. I've lost the plot here. What is the status of this patch? Latest version: Yes, this is the latest version. We are discussing about some future optimization. And, Minchan Kim pointed out a possible race condition which exists even before this patch. I proposed a quick fix, as long as they are happy to the fix, I will post it to the mailing list. Thanks, Yang From: Yang Shi Subject: mm: check the return value of lookup_page_ext for all call sites Per the discussion with Joonsoo Kim [1], we need check the return value of lookup_page_ext() for all call sites since it might return NULL in some cases, although it is unlikely, i.e. memory hotplug. Tested with ltp with "page_owner=0". 
[1] http://lkml.kernel.org/r/20160519002809.GA10245@js1304-P5Q-DELUXE [a...@linux-foundation.org: fix build-breaking typos] [a...@arndb.de: fix build problems from lookup_page_ext] Link: http://lkml.kernel.org/r/6285269.2CksypHdYp@wuerfel [a...@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/1464023768-31025-1-git-send-email-yang@linaro.org Signed-off-by: Yang Shi Signed-off-by: Arnd Bergmann Cc: Joonsoo Kim Signed-off-by: Andrew Morton --- include/linux/page_idle.h | 43 ++-- mm/page_alloc.c |6 + mm/page_owner.c | 26 + mm/page_poison.c |8 +- mm/vmstat.c |2 + 5 files changed, 77 insertions(+), 8 deletions(-) diff -puN include/linux/page_idle.h~mm-check-the-return-value-of-lookup_page_ext-for-all-call-sites include/linux/page_idle.h --- a/include/linux/page_idle.h~mm-check-the-return-value-of-lookup_page_ext-for-all-call-sites +++ a/include/linux/page_idle.h @@ -46,33 +46,62 @@ extern struct page_ext_operations page_i static inline bool page_is_young(struct page *page) { - return test_bit(PAGE_EXT_YOUNG, _page_ext(page)->flags); + struct page_ext *page_ext = lookup_page_ext(page); + + if (unlikely(!page_ext)) + return false; + + return test_bit(PAGE_EXT_YOUNG, _ext->flags); } static inline void set_page_young(struct page *page) { - set_bit(PAGE_EXT_YOUNG, _page_ext(page)->flags); + struct page_ext *page_ext = lookup_page_ext(page); + + if (unlikely(!page_ext)) + return; + + set_bit(PAGE_EXT_YOUNG, _ext->flags); } static inline bool test_and_clear_page_young(struct page *page) { - return test_and_clear_bit(PAGE_EXT_YOUNG, - _page_ext(page)->flags); + struct page_ext *page_ext = lookup_page_ext(page); + + if (unlikely(!page_ext)) + return false; + + return test_and_clear_bit(PAGE_EXT_YOUNG, _ext->flags); } static inline bool page_is_idle(struct page *page) { - return test_bit(PAGE_EXT_IDLE, _page_ext(page)->flags); + struct page_ext *page_ext = lookup_page_ext(page); + + if (unlikely(!page_ext)) + return false; + + return 
test_bit(PAGE_EXT_IDLE, _ext->flags); } static inline void set_page_idle(struct page *page) { - set_bit(PAGE_EXT_IDLE, _page_ext(page)->flags); + struct page_ext *page_ext = lookup_page_ext(page); + + if (unlikely(!page_ext)) + return; + + set_bit(PAGE_EXT_IDLE, _ext->flags); } static inline void clear_page_idle(struct page *page) { - clear_bit(PAGE_EXT_IDLE, _page_ext(page)->flags); + struct page_ext *page_ext = lookup_page_ext(page); + + if (unlikely(!page_ext)) + return; + + clear_bit(PAGE_EXT_IDLE, _ext->flags); } #endif /* CONFIG_64BIT */ diff -puN mm/page_alloc.c~mm-check-the-return-value-of-lookup_page_ext-for-all-call-sites mm/page_alloc.c --- a/mm/page_alloc.c~mm-check-the-return-value-of-lookup_page_ext-for-all-call-sites +++ a/mm/page_alloc.c @@ -656,6 +656,9 @@ static inline void set_page_guard(struct return; page_ext = lookup_page_ex
Re: [PATCH] mm: check the return value of lookup_page_ext for all call sites
On 5/27/2016 1:11 AM, Minchan Kim wrote: On Fri, May 27, 2016 at 03:08:39PM +0900, Joonsoo Kim wrote: On Fri, May 27, 2016 at 02:14:32PM +0900, Minchan Kim wrote: On Thu, May 26, 2016 at 04:15:28PM -0700, Shi, Yang wrote: On 5/25/2016 5:37 PM, Minchan Kim wrote: On Tue, May 24, 2016 at 11:58:11AM +0900, Minchan Kim wrote: On Mon, May 23, 2016 at 10:16:08AM -0700, Yang Shi wrote: Per the discussion with Joonsoo Kim [1], we need check the return value of lookup_page_ext() for all call sites since it might return NULL in some cases, although it is unlikely, i.e. memory hotplug. Tested with ltp with "page_owner=0". [1] http://lkml.kernel.org/r/20160519002809.GA10245@js1304-P5Q-DELUXE Signed-off-by: Yang Shi <yang@linaro.org> I didn't read code code in detail to see how page_ext memory space allocated in boot code and memory hotplug but to me, it's not good to check NULL whenever we calls lookup_page_ext. More dangerous thing is now page_ext is used by optionable feature(ie, not critical for system stability) but if we want to use page_ext as another important tool for the system in future, it could be a serious problem. Hello, Minchan. Hi Joonsoo, I wonder how pages that isn't managed by kernel yet will cause serious problem. Until onlining, these pages are out of our scope. Any information about them would be useless until it is actually activated. I guess that returning NULL for those pages will not hurt any functionality. Do you have any possible scenario that this causes the serious problem? I don't have any specific usecase now. That's why I said "in future". And I don't want to argue whether there is possible scenario or not to make the feature useful but if you want, I should write novel. One of example, pop up my mind, xen, hv and even memory_hotplug itself might want to use page_ext for their functionality extension to hook guest pages. 
My opinion is that page_ext is extension of struct page so it would be better to allow any operation on struct page without any limitation if we can do it. Whether it's useful or useless depend on random usecase and we don't need to limit that way from the beginning. However, current design allows deferred page_ext population so any user of page_ext should keep it in mind and should either make fallback plan or don't use page_ext for those cases. If we decide go this way through discussion, at least, we should make such limitation more clear to somewhere in this chance, maybe around page_ext_operation->need comment. My comment's point is that we should consider that way at least. It's worth to discuss pros and cons, what's the best and what makes that way hesitate if we can't. And, allocation such memory space doesn't come from free. If someone just add the memory device and don't online it, these memory will be Here goes several questions. Cced hotplug guys 1. If someone just add the memory device without onlining, kernel code can return pfn_valid == true on the offlined page? 2. If so, it means memmap on offline memory is already populated somewhere. Where is the memmap allocated? part of offlined memory space or other memory? 3. Could we allocate page_ext in part of offline memory space so that it doesn't consume online memory. wasted. I don't know if there is such a usecase but it's possible scenario. Can we put some hooks of page_ext into memory-hotplug so guarantee that page_ext memory space is allocated with memmap space at the same time? IOW, once every PFN wakers find a page is valid, page_ext is valid, too so lookup_page_ext never returns NULL on valid page by design. I hope we consider this direction, too. Yang, Could you think about this? Thanks a lot for the suggestion. Sorry for the late reply, I was busy on preparing patches. I do agree this is a direction we should look into, but I haven't got time to think about it deeper. 
I hope Joonsoo could chime in too since he is the original author for page extension. Even, your patch was broken, I think. It doesn't work with !CONFIG_DEBUG_VM && !CONFIG_PAGE_POISONING because lookup_page_ext doesn't return NULL in that case. Actually, I think the #ifdef should be removed if lookup_page_ext() is possible to return NULL. It sounds not make sense returning NULL only when DEBUG_VM is enabled. It should return NULL no matter what debug config is selected. If Joonsoo agrees with me I'm going to come up with a patch to fix it. Agreed but let's wait for Minchan's response. If we goes this way, how to guarantee this race? Thanks for pointing out this. It sounds reasonable. However, this should be only possible to happen on 32 bit since just 32 bit version page_is_idle() calls lookup_page_ext(), it doesn't do it on 64 bit. And, such race condition should exist regardless of whether DEBUG_VM is enabled or not, right? rcu might be good enough to protect it. A quick fix may look like: diff --git a/include/linux/pa
Re: [PATCH] mm: check the return value of lookup_page_ext for all call sites
On 5/27/2016 1:11 AM, Minchan Kim wrote: On Fri, May 27, 2016 at 03:08:39PM +0900, Joonsoo Kim wrote: On Fri, May 27, 2016 at 02:14:32PM +0900, Minchan Kim wrote: On Thu, May 26, 2016 at 04:15:28PM -0700, Shi, Yang wrote: On 5/25/2016 5:37 PM, Minchan Kim wrote: On Tue, May 24, 2016 at 11:58:11AM +0900, Minchan Kim wrote: On Mon, May 23, 2016 at 10:16:08AM -0700, Yang Shi wrote: Per the discussion with Joonsoo Kim [1], we need check the return value of lookup_page_ext() for all call sites since it might return NULL in some cases, although it is unlikely, i.e. memory hotplug. Tested with ltp with "page_owner=0". [1] http://lkml.kernel.org/r/20160519002809.GA10245@js1304-P5Q-DELUXE Signed-off-by: Yang Shi I didn't read code code in detail to see how page_ext memory space allocated in boot code and memory hotplug but to me, it's not good to check NULL whenever we calls lookup_page_ext. More dangerous thing is now page_ext is used by optionable feature(ie, not critical for system stability) but if we want to use page_ext as another important tool for the system in future, it could be a serious problem. Hello, Minchan. Hi Joonsoo, I wonder how pages that isn't managed by kernel yet will cause serious problem. Until onlining, these pages are out of our scope. Any information about them would be useless until it is actually activated. I guess that returning NULL for those pages will not hurt any functionality. Do you have any possible scenario that this causes the serious problem? I don't have any specific usecase now. That's why I said "in future". And I don't want to argue whether there is possible scenario or not to make the feature useful but if you want, I should write novel. One of example, pop up my mind, xen, hv and even memory_hotplug itself might want to use page_ext for their functionality extension to hook guest pages. 
My opinion is that page_ext is extension of struct page so it would be better to allow any operation on struct page without any limitation if we can do it. Whether it's useful or useless depend on random usecase and we don't need to limit that way from the beginning. However, current design allows deferred page_ext population so any user of page_ext should keep it in mind and should either make fallback plan or don't use page_ext for those cases. If we decide go this way through discussion, at least, we should make such limitation more clear to somewhere in this chance, maybe around page_ext_operation->need comment. My comment's point is that we should consider that way at least. It's worth to discuss pros and cons, what's the best and what makes that way hesitate if we can't. And, allocation such memory space doesn't come from free. If someone just add the memory device and don't online it, these memory will be Here goes several questions. Cced hotplug guys 1. If someone just add the memory device without onlining, kernel code can return pfn_valid == true on the offlined page? 2. If so, it means memmap on offline memory is already populated somewhere. Where is the memmap allocated? part of offlined memory space or other memory? 3. Could we allocate page_ext in part of offline memory space so that it doesn't consume online memory. wasted. I don't know if there is such a usecase but it's possible scenario. Can we put some hooks of page_ext into memory-hotplug so guarantee that page_ext memory space is allocated with memmap space at the same time? IOW, once every PFN wakers find a page is valid, page_ext is valid, too so lookup_page_ext never returns NULL on valid page by design. I hope we consider this direction, too. Yang, Could you think about this? Thanks a lot for the suggestion. Sorry for the late reply, I was busy on preparing patches. I do agree this is a direction we should look into, but I haven't got time to think about it deeper. 
I hope Joonsoo could chime in too since he is the original author for page extension. Even, your patch was broken, I think. It doesn't work with !CONFIG_DEBUG_VM && !CONFIG_PAGE_POISONING because lookup_page_ext doesn't return NULL in that case. Actually, I think the #ifdef should be removed if lookup_page_ext() is possible to return NULL. It sounds not make sense returning NULL only when DEBUG_VM is enabled. It should return NULL no matter what debug config is selected. If Joonsoo agrees with me I'm going to come up with a patch to fix it. Agreed but let's wait for Minchan's response. If we goes this way, how to guarantee this race? Thanks for pointing out this. It sounds reasonable. However, this should be only possible to happen on 32 bit since just 32 bit version page_is_idle() calls lookup_page_ext(), it doesn't do it on 64 bit. And, such race condition should exist regardless of whether DEBUG_VM is enabled or not, right? rcu might be good enough to protect it. A quick fix may look like: diff --git a/include/linux/page_idle.h b/include/linux/pag
Re: [PATCH] arm64: kasan: instrument user memory access API
On 5/27/2016 10:46 AM, Mark Rutland wrote: On Fri, May 27, 2016 at 09:34:03AM -0700, Shi, Yang wrote: On 5/27/2016 4:02 AM, Andrey Ryabinin wrote: On 05/26/2016 09:43 PM, Yang Shi wrote: The upstream commit 1771c6e1a567ea0ba20a4ffe68a1419fd8ef ("x86/kasan: instrument user memory access API") added KASAN instrument to x86 user memory access API, so added such instrument to ARM64 too. Tested by test_kasan module. Signed-off-by: Yang Shi <yang@linaro.org> --- arch/arm64/include/asm/uaccess.h | 18 -- 1 file changed, 16 insertions(+), 2 deletions(-) Please, cover __copy_from_user() and __copy_to_user() too. Unlike x86, your patch doesn't instrument these two. Argh, I missed those when reviewing. My bad. I should elaborated this in my review. Yes, I did think about it, but unlike x86, __copy_to/from_user are implemented by asm code on ARM64. If I add kasan_check_read/write into them, I have to move the registers around to prepare the parameters for kasan calls, then restore them after the call, for example the below code for __copy_to_user: mov x9, x0 mov x10, x1 mov x11, x2 mov x0, x10 mov x1, x11 bl kasan_check_read mov x0, x9 mov x1, x10 There's no need to alter the assembly. Rename the functions (e.g. have __arch_raw_copy_from_user), and add static inline wrappers in uaccess.h that do the kasan calls before calling the assembly functions. That gives the compiler the freedom to do the right thing, and avoids horrible ifdeffery in the assembly code. Thanks for the suggestion, will address in v2. Yang So, I'm wondering if it is worth or not since __copy_to/from_user are just called at a couple of places, i.e. sctp, a couple of drivers, etc and not used too much. [mark@leverpostej:~/src/linux]% git grep -w __copy_to_user -- ^arch | wc -l 63 [mark@leverpostej:~/src/linux]% git grep -w __copy_from_user -- ^arch | wc -l 47 That's a reasonable number of callsites. If we're going to bother adding this, it should be complete. 
So please do update __copy_from_user and __copy_to_user. Actually, I think some of them could be replaced by __copy_to/from_user_inatomic. Given the number of existing callers outside of arch code, I think we'll get far more traction reworking the arm64 parts for now. Thanks, Mark.
Re: [PATCH] arm64: kasan: instrument user memory access API
On 5/27/2016 10:46 AM, Mark Rutland wrote: On Fri, May 27, 2016 at 09:34:03AM -0700, Shi, Yang wrote: On 5/27/2016 4:02 AM, Andrey Ryabinin wrote: On 05/26/2016 09:43 PM, Yang Shi wrote: The upstream commit 1771c6e1a567ea0ba20a4ffe68a1419fd8ef ("x86/kasan: instrument user memory access API") added KASAN instrument to x86 user memory access API, so added such instrument to ARM64 too. Tested by test_kasan module. Signed-off-by: Yang Shi --- arch/arm64/include/asm/uaccess.h | 18 -- 1 file changed, 16 insertions(+), 2 deletions(-) Please, cover __copy_from_user() and __copy_to_user() too. Unlike x86, your patch doesn't instrument these two. Argh, I missed those when reviewing. My bad. I should elaborated this in my review. Yes, I did think about it, but unlike x86, __copy_to/from_user are implemented by asm code on ARM64. If I add kasan_check_read/write into them, I have to move the registers around to prepare the parameters for kasan calls, then restore them after the call, for example the below code for __copy_to_user: mov x9, x0 mov x10, x1 mov x11, x2 mov x0, x10 mov x1, x11 bl kasan_check_read mov x0, x9 mov x1, x10 There's no need to alter the assembly. Rename the functions (e.g. have __arch_raw_copy_from_user), and add static inline wrappers in uaccess.h that do the kasan calls before calling the assembly functions. That gives the compiler the freedom to do the right thing, and avoids horrible ifdeffery in the assembly code. Thanks for the suggestion, will address in v2. Yang So, I'm wondering if it is worth or not since __copy_to/from_user are just called at a couple of places, i.e. sctp, a couple of drivers, etc and not used too much. [mark@leverpostej:~/src/linux]% git grep -w __copy_to_user -- ^arch | wc -l 63 [mark@leverpostej:~/src/linux]% git grep -w __copy_from_user -- ^arch | wc -l 47 That's a reasonable number of callsites. If we're going to bother adding this, it should be complete. So please do update __copy_from_user and __copy_to_user. 
Actually, I think some of them could be replaced by __copy_to/from_user_inatomic. Given the number of existing callers outside of arch code, I think we'll get far more traction reworking the arm64 parts for now. Thanks, Mark.
Re: [PATCH] arm64: kasan: instrument user memory access API
On 5/27/2016 4:02 AM, Andrey Ryabinin wrote: On 05/26/2016 09:43 PM, Yang Shi wrote: The upstream commit 1771c6e1a567ea0ba20a4ffe68a1419fd8ef ("x86/kasan: instrument user memory access API") added KASAN instrument to x86 user memory access API, so added such instrument to ARM64 too. Tested by test_kasan module. Signed-off-by: Yang Shi <yang@linaro.org> --- arch/arm64/include/asm/uaccess.h | 18 -- 1 file changed, 16 insertions(+), 2 deletions(-) Please, cover __copy_from_user() and __copy_to_user() too. Unlike x86, your patch doesn't instrument these two. I should have elaborated on this in my review. Yes, I did think about it, but unlike x86, __copy_to/from_user are implemented by asm code on ARM64. If I add kasan_check_read/write into them, I have to move the registers around to prepare the parameters for kasan calls, then restore them after the call, for example the below code for __copy_to_user: mov x9, x0 mov x10, x1 mov x11, x2 mov x0, x10 mov x1, x11 bl kasan_check_read mov x0, x9 mov x1, x10 So, I'm wondering whether it is worth it or not, since __copy_to/from_user are just called at a couple of places, i.e. sctp, a couple of drivers, etc and not used too much. Actually, I think some of them could be replaced by __copy_to/from_user_inatomic. Any idea is appreciated. Thanks, Yang
Re: [PATCH] arm64: kasan: instrument user memory access API
On 5/27/2016 4:02 AM, Andrey Ryabinin wrote: On 05/26/2016 09:43 PM, Yang Shi wrote: The upstream commit 1771c6e1a567ea0ba20a4ffe68a1419fd8ef ("x86/kasan: instrument user memory access API") added KASAN instrument to x86 user memory access API, so added such instrument to ARM64 too. Tested by test_kasan module. Signed-off-by: Yang Shi --- arch/arm64/include/asm/uaccess.h | 18 -- 1 file changed, 16 insertions(+), 2 deletions(-) Please, cover __copy_from_user() and __copy_to_user() too. Unlike x86, your patch doesn't instrument these two. I should elaborated this in my review. Yes, I did think about it, but unlike x86, __copy_to/from_user are implemented by asm code on ARM64. If I add kasan_check_read/write into them, I have to move the registers around to prepare the parameters for kasan calls, then restore them after the call, for example the below code for __copy_to_user: mov x9, x0 mov x10, x1 mov x11, x2 mov x0, x10 mov x1, x11 bl kasan_check_read mov x0, x9 mov x1, x10 So, I'm wondering if it is worth or not since __copy_to/from_user are just called at a couple of places, i.e. sctp, a couple of drivers, etc and not used too much. Actually, I think some of them could be replaced by __copy_to/from_user_inatomic. Any idea is appreciated. Thanks, Yang
Re: [PATCH] mm: check the return value of lookup_page_ext for all call sites
On 5/25/2016 5:37 PM, Minchan Kim wrote: On Tue, May 24, 2016 at 11:58:11AM +0900, Minchan Kim wrote: On Mon, May 23, 2016 at 10:16:08AM -0700, Yang Shi wrote: Per the discussion with Joonsoo Kim [1], we need to check the return value of lookup_page_ext() for all call sites since it might return NULL in some cases, although it is unlikely, i.e. memory hotplug. Tested with ltp with "page_owner=0". [1] http://lkml.kernel.org/r/20160519002809.GA10245@js1304-P5Q-DELUXE Signed-off-by: Yang Shi <yang@linaro.org> I didn't read the code in detail to see how page_ext memory space is allocated in boot code and memory hotplug, but to me, it's not good to check NULL whenever we call lookup_page_ext. A more dangerous thing is that now page_ext is used by an optional feature (i.e. not critical for system stability), but if we want to use page_ext as another important tool for the system in future, it could be a serious problem. Can we put some hooks of page_ext into memory-hotplug to guarantee that page_ext memory space is allocated with memmap space at the same time? IOW, once every PFN walker finds a page is valid, page_ext is valid too, so lookup_page_ext never returns NULL on a valid page by design. I hope we consider this direction, too. Yang, Could you think about this? Thanks a lot for the suggestion. Sorry for the late reply, I was busy on preparing patches. I do agree this is a direction we should look into, but I haven't got time to think about it deeper. I hope Joonsoo could chime in too since he is the original author for page extension. Even, your patch was broken, I think. It doesn't work with !CONFIG_DEBUG_VM && !CONFIG_PAGE_POISONING because lookup_page_ext doesn't return NULL in that case. Actually, I think the #ifdef should be removed if lookup_page_ext() is possible to return NULL. It does not make sense returning NULL only when DEBUG_VM is enabled. It should return NULL no matter what debug config is selected. 
If Joonsoo agrees with me I'm going to come up with a patch to fix it. Regards, Yang Thanks. --- include/linux/page_idle.h | 43 --- mm/page_alloc.c | 6 ++ mm/page_owner.c | 27 +++ mm/page_poison.c | 8 +++- mm/vmstat.c | 2 ++ 5 files changed, 78 insertions(+), 8 deletions(-) diff --git a/include/linux/page_idle.h b/include/linux/page_idle.h index bf268fa..8f5d4ad 100644 --- a/include/linux/page_idle.h +++ b/include/linux/page_idle.h @@ -46,33 +46,62 @@ extern struct page_ext_operations page_idle_ops; static inline bool page_is_young(struct page *page) { - return test_bit(PAGE_EXT_YOUNG, _page_ext(page)->flags); + struct page_ext *page_ext; + page_ext = lookup_page_ext(page); + if (unlikely(!page_ext) + return false; + + return test_bit(PAGE_EXT_YOUNG, _ext->flags); } static inline void set_page_young(struct page *page) { - set_bit(PAGE_EXT_YOUNG, _page_ext(page)->flags); + struct page_ext *page_ext; + page_ext = lookup_page_ext(page); + if (unlikely(!page_ext) + return; + + set_bit(PAGE_EXT_YOUNG, _ext->flags); } static inline bool test_and_clear_page_young(struct page *page) { - return test_and_clear_bit(PAGE_EXT_YOUNG, - _page_ext(page)->flags); + struct page_ext *page_ext; + page_ext = lookup_page_ext(page); + if (unlikely(!page_ext) + return false; + + return test_and_clear_bit(PAGE_EXT_YOUNG, _ext->flags); } static inline bool page_is_idle(struct page *page) { - return test_bit(PAGE_EXT_IDLE, _page_ext(page)->flags); + struct page_ext *page_ext; + page_ext = lookup_page_ext(page); + if (unlikely(!page_ext) + return false; + + return test_bit(PAGE_EXT_IDLE, _ext->flags); } static inline void set_page_idle(struct page *page) { - set_bit(PAGE_EXT_IDLE, _page_ext(page)->flags); + struct page_ext *page_ext; + page_ext = lookup_page_ext(page); + if (unlikely(!page_ext) + return; + + set_bit(PAGE_EXT_IDLE, _ext->flags); } static inline void clear_page_idle(struct page *page) { - clear_bit(PAGE_EXT_IDLE, _page_ext(page)->flags); + struct page_ext *page_ext; + page_ext 
= lookup_page_ext(page); + if (unlikely(!page_ext) + return; + + clear_bit(PAGE_EXT_IDLE, _ext->flags); } #endif /* CONFIG_64BIT */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f8f3bfc..d27e8b9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -656,6 +656,9 @@ static inline void set_page_guard(struct zone *zone, struct page *page, return; page_ext = lookup_page_ext(page); + if (unlikely(!page_ext)) + return; + __set_bit(PAGE_EXT_DEBUG_GUARD, _
Re: [PATCH] mm: check the return value of lookup_page_ext for all call sites
On 5/25/2016 5:37 PM, Minchan Kim wrote: On Tue, May 24, 2016 at 11:58:11AM +0900, Minchan Kim wrote: On Mon, May 23, 2016 at 10:16:08AM -0700, Yang Shi wrote: Per the discussion with Joonsoo Kim [1], we need check the return value of lookup_page_ext() for all call sites since it might return NULL in some cases, although it is unlikely, i.e. memory hotplug. Tested with ltp with "page_owner=0". [1] http://lkml.kernel.org/r/20160519002809.GA10245@js1304-P5Q-DELUXE Signed-off-by: Yang Shi I didn't read code code in detail to see how page_ext memory space allocated in boot code and memory hotplug but to me, it's not good to check NULL whenever we calls lookup_page_ext. More dangerous thing is now page_ext is used by optionable feature(ie, not critical for system stability) but if we want to use page_ext as another important tool for the system in future, it could be a serious problem. Can we put some hooks of page_ext into memory-hotplug so guarantee that page_ext memory space is allocated with memmap space at the same time? IOW, once every PFN wakers find a page is valid, page_ext is valid, too so lookup_page_ext never returns NULL on valid page by design. I hope we consider this direction, too. Yang, Could you think about this? Thanks a lot for the suggestion. Sorry for the late reply, I was busy on preparing patches. I do agree this is a direction we should look into, but I haven't got time to think about it deeper. I hope Joonsoo could chime in too since he is the original author for page extension. Even, your patch was broken, I think. It doesn't work with !CONFIG_DEBUG_VM && !CONFIG_PAGE_POISONING because lookup_page_ext doesn't return NULL in that case. Actually, I think the #ifdef should be removed if lookup_page_ext() is possible to return NULL. It sounds not make sense returning NULL only when DEBUG_VM is enabled. It should return NULL no matter what debug config is selected. If Joonsoo agrees with me I'm going to come up with a patch to fix it. 
Regards, Yang Thanks. --- include/linux/page_idle.h | 43 --- mm/page_alloc.c | 6 ++ mm/page_owner.c | 27 +++ mm/page_poison.c | 8 +++- mm/vmstat.c | 2 ++ 5 files changed, 78 insertions(+), 8 deletions(-) diff --git a/include/linux/page_idle.h b/include/linux/page_idle.h index bf268fa..8f5d4ad 100644 --- a/include/linux/page_idle.h +++ b/include/linux/page_idle.h @@ -46,33 +46,62 @@ extern struct page_ext_operations page_idle_ops; static inline bool page_is_young(struct page *page) { - return test_bit(PAGE_EXT_YOUNG, _page_ext(page)->flags); + struct page_ext *page_ext; + page_ext = lookup_page_ext(page); + if (unlikely(!page_ext) + return false; + + return test_bit(PAGE_EXT_YOUNG, _ext->flags); } static inline void set_page_young(struct page *page) { - set_bit(PAGE_EXT_YOUNG, _page_ext(page)->flags); + struct page_ext *page_ext; + page_ext = lookup_page_ext(page); + if (unlikely(!page_ext) + return; + + set_bit(PAGE_EXT_YOUNG, _ext->flags); } static inline bool test_and_clear_page_young(struct page *page) { - return test_and_clear_bit(PAGE_EXT_YOUNG, - _page_ext(page)->flags); + struct page_ext *page_ext; + page_ext = lookup_page_ext(page); + if (unlikely(!page_ext) + return false; + + return test_and_clear_bit(PAGE_EXT_YOUNG, _ext->flags); } static inline bool page_is_idle(struct page *page) { - return test_bit(PAGE_EXT_IDLE, _page_ext(page)->flags); + struct page_ext *page_ext; + page_ext = lookup_page_ext(page); + if (unlikely(!page_ext) + return false; + + return test_bit(PAGE_EXT_IDLE, _ext->flags); } static inline void set_page_idle(struct page *page) { - set_bit(PAGE_EXT_IDLE, _page_ext(page)->flags); + struct page_ext *page_ext; + page_ext = lookup_page_ext(page); + if (unlikely(!page_ext) + return; + + set_bit(PAGE_EXT_IDLE, _ext->flags); } static inline void clear_page_idle(struct page *page) { - clear_bit(PAGE_EXT_IDLE, _page_ext(page)->flags); + struct page_ext *page_ext; + page_ext = lookup_page_ext(page); + if (unlikely(!page_ext) + return; + + 
clear_bit(PAGE_EXT_IDLE, _ext->flags); } #endif /* CONFIG_64BIT */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f8f3bfc..d27e8b9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -656,6 +656,9 @@ static inline void set_page_guard(struct zone *zone, struct page *page, return; page_ext = lookup_page_ext(page); + if (unlikely(!page_ext)) + return; + __set_bit(PAGE_EXT_DEBUG_GUARD, _ext->flags); INIT_LIST_HEAD(>lru); @@ -673,6 +676,9 @@ static inline void
Re: [PATCH] mm: use early_pfn_to_nid in register_page_bootmem_info_node
On 5/25/2016 3:23 PM, Andrew Morton wrote: On Wed, 25 May 2016 14:00:07 -0700 Yang Shi <yang@linaro.org> wrote: register_page_bootmem_info_node() is invoked in mem_init(), so it will be called before page_alloc_init_late() if CONFIG_DEFERRED_STRUCT_PAGE_INIT is enabled. But, pfn_to_nid() depends on memmap which won't be fully setup until page_alloc_init_late() is done, so replace pfn_to_nid() by early_pfn_to_nid(). What are the runtime effects of this fix? I didn't experience any problem without the fix. During working on the page_ext_init() fix (replace to early_pfn_to_nid()), I added printk before each pfn_to_nid() calls to check which one might be called before page_alloc_init_late(), then this one is caught. From the code perspective, it sounds not right since register_page_bootmem_info_section() may miss some pfns when CONFIG_DEFERRED_STRUCT_PAGE_INIT is enabled, just like the problem happened in page_ext_init(). Thanks, Yang
Re: [PATCH] mm: use early_pfn_to_nid in register_page_bootmem_info_node
On 5/25/2016 3:23 PM, Andrew Morton wrote: On Wed, 25 May 2016 14:00:07 -0700 Yang Shi wrote: register_page_bootmem_info_node() is invoked in mem_init(), so it will be called before page_alloc_init_late() if CONFIG_DEFERRED_STRUCT_PAGE_INIT is enabled. But, pfn_to_nid() depends on memmap which won't be fully setup until page_alloc_init_late() is done, so replace pfn_to_nid() by early_pfn_to_nid(). What are the runtime effects of this fix? I didn't experience any problem without the fix. During working on the page_ext_init() fix (replace to early_pfn_to_nid()), I added printk before each pfn_to_nid() calls to check which one might be called before page_alloc_init_late(), then this one is caught. From the code perspective, it sounds not right since register_page_bootmem_info_section() may miss some pfns when CONFIG_DEFERRED_STRUCT_PAGE_INIT is enabled, just like the problem happened in page_ext_init(). Thanks, Yang
Re: [PATCH] Fixing compilation error from file include/linux/page_idle.h
On 5/25/2016 3:11 AM, shakilk1...@gmail.com wrote: From: shakil khanThanks for the patch, this issue had been fixed by Arnd. Yang --- include/linux/page_idle.h | 12 ++-- 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/linux/page_idle.h b/include/linux/page_idle.h index 569c3a1..fec4027 100644 --- a/include/linux/page_idle.h +++ b/include/linux/page_idle.h @@ -48,7 +48,7 @@ static inline bool page_is_young(struct page *page) { struct page_ext *page_ext = lookup_page_ext(page); - if (unlikely(!page_ext) + if (unlikely(!page_ext)) return false; return test_bit(PAGE_EXT_YOUNG, _ext->flags); @@ -58,7 +58,7 @@ static inline void set_page_young(struct page *page) { struct page_ext *page_ext = lookup_page_ext(page); - if (unlikely(!page_ext) + if (unlikely(!page_ext)) return; set_bit(PAGE_EXT_YOUNG, _ext->flags); @@ -68,7 +68,7 @@ static inline bool test_and_clear_page_young(struct page *page) { struct page_ext *page_ext = lookup_page_ext(page); - if (unlikely(!page_ext) + if (unlikely(!page_ext)) return false; return test_and_clear_bit(PAGE_EXT_YOUNG, _ext->flags); @@ -78,7 +78,7 @@ static inline bool page_is_idle(struct page *page) { struct page_ext *page_ext = lookup_page_ext(page); - if (unlikely(!page_ext) + if (unlikely(!page_ext)) return false; return test_bit(PAGE_EXT_IDLE, _ext->flags); @@ -88,7 +88,7 @@ static inline void set_page_idle(struct page *page) { struct page_ext *page_ext = lookup_page_ext(page); - if (unlikely(!page_ext) + if (unlikely(!page_ext)) return; set_bit(PAGE_EXT_IDLE, _ext->flags); @@ -98,7 +98,7 @@ static inline void clear_page_idle(struct page *page) { struct page_ext *page_ext = lookup_page_ext(page); - if (unlikely(!page_ext) + if (unlikely(!page_ext)) return; clear_bit(PAGE_EXT_IDLE, _ext->flags);
Re: [PATCH] Fixing compilation error from file include/linux/page_idle.h
On 5/25/2016 3:11 AM, shakilk1...@gmail.com wrote: From: shakil khan Thanks for the patch, this issue had been fixed by Arnd. Yang --- include/linux/page_idle.h | 12 ++-- 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/linux/page_idle.h b/include/linux/page_idle.h index 569c3a1..fec4027 100644 --- a/include/linux/page_idle.h +++ b/include/linux/page_idle.h @@ -48,7 +48,7 @@ static inline bool page_is_young(struct page *page) { struct page_ext *page_ext = lookup_page_ext(page); - if (unlikely(!page_ext) + if (unlikely(!page_ext)) return false; return test_bit(PAGE_EXT_YOUNG, _ext->flags); @@ -58,7 +58,7 @@ static inline void set_page_young(struct page *page) { struct page_ext *page_ext = lookup_page_ext(page); - if (unlikely(!page_ext) + if (unlikely(!page_ext)) return; set_bit(PAGE_EXT_YOUNG, _ext->flags); @@ -68,7 +68,7 @@ static inline bool test_and_clear_page_young(struct page *page) { struct page_ext *page_ext = lookup_page_ext(page); - if (unlikely(!page_ext) + if (unlikely(!page_ext)) return false; return test_and_clear_bit(PAGE_EXT_YOUNG, _ext->flags); @@ -78,7 +78,7 @@ static inline bool page_is_idle(struct page *page) { struct page_ext *page_ext = lookup_page_ext(page); - if (unlikely(!page_ext) + if (unlikely(!page_ext)) return false; return test_bit(PAGE_EXT_IDLE, _ext->flags); @@ -88,7 +88,7 @@ static inline void set_page_idle(struct page *page) { struct page_ext *page_ext = lookup_page_ext(page); - if (unlikely(!page_ext) + if (unlikely(!page_ext)) return; set_bit(PAGE_EXT_IDLE, _ext->flags); @@ -98,7 +98,7 @@ static inline void clear_page_idle(struct page *page) { struct page_ext *page_ext = lookup_page_ext(page); - if (unlikely(!page_ext) + if (unlikely(!page_ext)) return; clear_bit(PAGE_EXT_IDLE, _ext->flags);
Re: [PATCH] mm: page_is_guard return false when page_ext arrays are not allocated yet
On 5/23/2016 10:37 PM, Joonsoo Kim wrote: On Fri, May 20, 2016 at 10:00:06AM -0700, Shi, Yang wrote: On 5/19/2016 7:40 PM, Joonsoo Kim wrote: 2016-05-20 2:18 GMT+09:00 Shi, Yang <yang@linaro.org>: On 5/18/2016 5:28 PM, Joonsoo Kim wrote: Vlastiml, thanks for ccing me on original bug report. On Wed, May 18, 2016 at 03:23:45PM -0700, Yang Shi wrote: When enabling the below kernel configs: CONFIG_DEFERRED_STRUCT_PAGE_INIT CONFIG_DEBUG_PAGEALLOC CONFIG_PAGE_EXTENSION CONFIG_DEBUG_VM kernel bootup may fail due to the following oops: BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] free_pcppages_bulk+0x2d2/0x8d0 PGD 0 Oops: [#1] PREEMPT SMP DEBUG_PAGEALLOC Modules linked in: CPU: 11 PID: 106 Comm: pgdatinit1 Not tainted 4.6.0-rc5-next-20160427 #26 Hardware name: Intel Corporation S5520HC/S5520HC, BIOS S5500.86B.01.10.0025.030220091519 03/02/2009 task: 88017c080040 ti: 88017c084000 task.ti: 88017c084000 RIP: 0010:[] [] free_pcppages_bulk+0x2d2/0x8d0 RSP: :88017c087c48 EFLAGS: 00010046 RAX: RBX: RCX: 0001 RDX: 0980 RSI: 0080 RDI: 00660401 RBP: 88017c087cd0 R08: 0401 R09: 0009 R10: 88017c080040 R11: 000a R12: 0400 R13: ea001981 R14: ea0019810040 R15: 88066cfe6080 FS: () GS:88066cd4() knlGS: CS: 0010 DS: ES: CR0: 80050033 CR2: CR3: 02406000 CR4: 06e0 Stack: 88066cd5bbd8 88066cfe6640 001f001f 88066cd5bbe8 ea001981 8118f53e 0009 0401 000a 0001 Call Trace: [] free_hot_cold_page+0x192/0x1d0 [] __free_pages+0x5c/0x90 [] __free_pages_boot_core+0x11a/0x14e [] deferred_free_range+0x50/0x62 [] deferred_init_memmap+0x220/0x3c3 [] ? setup_per_cpu_pageset+0x35/0x35 [] kthread+0xf8/0x110 [] ret_from_fork+0x22/0x40 [] ? 
kthread_create_on_node+0x200/0x200 Code: 49 89 d4 48 c1 e0 06 49 01 c5 e9 de fe ff ff 4c 89 f7 44 89 4d b8 4c 89 45 c0 44 89 5d c8 48 89 4d d0 e8 62 c7 07 00 48 8b 4d d0 <48> 8b 00 44 8b 5d c8 4c 8b 45 c0 44 8b 4d b8 a8 02 0f 84 05 ff RIP [] free_pcppages_bulk+0x2d2/0x8d0 RSP CR2: The problem is lookup_page_ext() returns NULL then page_is_guard() tried to access it in page freeing. page_is_guard() depends on PAGE_EXT_DEBUG_GUARD bit of page extension flag, but freeing page might reach here before the page_ext arrays are allocated when feeding a range of pages to the allocator for the first time during bootup or memory hotplug. Patch itself looks find to me because I also found that this kind of problem happens during memory hotplug. So, we need to fix more sites, all callers of lookup_page_ext(). Yes, I agree. I will come up with a patch or a couple of patches to check the return value of lookup_page_ext(). But, I'd like to know how your problem occurs during bootup. debug_guardpage_enabled() is turned to 'enable' after page_ext is initialized. Before that, page_is_guard() unconditionally returns false so I think that the problem what you mentioned can't happen. Could you check that when debug_guardpage_enabled() returns 'enable' and init_section_page_ext() is called? I think the problem is I have CONFIG_DEFERRED_STRUCT_PAGE_INIT enabled, which will defer some struct pages initialization to "pgdatinitX" kernel thread in page_alloc_init_late(). But, page_ext_init() is called before it. So, it leads debug_guardpage_enabled() return true, but page extension is not allocated yet for the struct pages initialized by "pgdatinitX". No. After page_ext_init(), it is ensured that all page extension is initialized. It sounds page_ext_init() should be called after page_alloc_init_late(). Or it should be just incompatible with CONFIG_DEFERRED_STRUCT_PAGE_INIT. I will try to move the init call around. We need to investigate more. 
I guess that problem is introduced by CONFIG_DEFERRED_STRUCT_PAGE_INIT. It makes pfn_to_nid() invalid until page_alloc_init_late() is done. That is a big side-effect. If there is pfn walker and it uses pfn_to_nid() between memmap_init_zone() and page_alloc_init_late(), it also has same problem. So, we need to think how to fix it more carefully. Thanks for the analysis. I think you are correct. Since pfn_to_nid() depends on memmap which has not been fully setup yet until page_alloc_init_late() is done. So, for such usecase early_pfn_to_nid() should be used. Anyway, to make sure that my assumption is true, could you confirm that below change fix your problem? Yes, it does. Thanks. ->8-- diff --git a/mm/page_ext.c b/mm/page_ext.c index 2d864e6..cac5dc9 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -391,7 +391,7 @@ void __init page_ext_init(void)
Re: [PATCH] mm: page_is_guard return false when page_ext arrays are not allocated yet
On 5/23/2016 10:37 PM, Joonsoo Kim wrote: On Fri, May 20, 2016 at 10:00:06AM -0700, Shi, Yang wrote: On 5/19/2016 7:40 PM, Joonsoo Kim wrote: 2016-05-20 2:18 GMT+09:00 Shi, Yang : On 5/18/2016 5:28 PM, Joonsoo Kim wrote: Vlastiml, thanks for ccing me on original bug report. On Wed, May 18, 2016 at 03:23:45PM -0700, Yang Shi wrote: When enabling the below kernel configs: CONFIG_DEFERRED_STRUCT_PAGE_INIT CONFIG_DEBUG_PAGEALLOC CONFIG_PAGE_EXTENSION CONFIG_DEBUG_VM kernel bootup may fail due to the following oops: BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] free_pcppages_bulk+0x2d2/0x8d0 PGD 0 Oops: [#1] PREEMPT SMP DEBUG_PAGEALLOC Modules linked in: CPU: 11 PID: 106 Comm: pgdatinit1 Not tainted 4.6.0-rc5-next-20160427 #26 Hardware name: Intel Corporation S5520HC/S5520HC, BIOS S5500.86B.01.10.0025.030220091519 03/02/2009 task: 88017c080040 ti: 88017c084000 task.ti: 88017c084000 RIP: 0010:[] [] free_pcppages_bulk+0x2d2/0x8d0 RSP: :88017c087c48 EFLAGS: 00010046 RAX: RBX: RCX: 0001 RDX: 0980 RSI: 0080 RDI: 00660401 RBP: 88017c087cd0 R08: 0401 R09: 0009 R10: 88017c080040 R11: 000a R12: 0400 R13: ea001981 R14: ea0019810040 R15: 88066cfe6080 FS: () GS:88066cd4() knlGS: CS: 0010 DS: ES: CR0: 80050033 CR2: CR3: 02406000 CR4: 06e0 Stack: 88066cd5bbd8 88066cfe6640 001f001f 88066cd5bbe8 ea001981 8118f53e 0009 0401 000a 0001 Call Trace: [] free_hot_cold_page+0x192/0x1d0 [] __free_pages+0x5c/0x90 [] __free_pages_boot_core+0x11a/0x14e [] deferred_free_range+0x50/0x62 [] deferred_init_memmap+0x220/0x3c3 [] ? setup_per_cpu_pageset+0x35/0x35 [] kthread+0xf8/0x110 [] ret_from_fork+0x22/0x40 [] ? 
kthread_create_on_node+0x200/0x200 Code: 49 89 d4 48 c1 e0 06 49 01 c5 e9 de fe ff ff 4c 89 f7 44 89 4d b8 4c 89 45 c0 44 89 5d c8 48 89 4d d0 e8 62 c7 07 00 48 8b 4d d0 <48> 8b 00 44 8b 5d c8 4c 8b 45 c0 44 8b 4d b8 a8 02 0f 84 05 ff RIP [] free_pcppages_bulk+0x2d2/0x8d0 RSP CR2: The problem is lookup_page_ext() returns NULL then page_is_guard() tried to access it in page freeing. page_is_guard() depends on PAGE_EXT_DEBUG_GUARD bit of page extension flag, but freeing page might reach here before the page_ext arrays are allocated when feeding a range of pages to the allocator for the first time during bootup or memory hotplug. Patch itself looks find to me because I also found that this kind of problem happens during memory hotplug. So, we need to fix more sites, all callers of lookup_page_ext(). Yes, I agree. I will come up with a patch or a couple of patches to check the return value of lookup_page_ext(). But, I'd like to know how your problem occurs during bootup. debug_guardpage_enabled() is turned to 'enable' after page_ext is initialized. Before that, page_is_guard() unconditionally returns false so I think that the problem what you mentioned can't happen. Could you check that when debug_guardpage_enabled() returns 'enable' and init_section_page_ext() is called? I think the problem is I have CONFIG_DEFERRED_STRUCT_PAGE_INIT enabled, which will defer some struct pages initialization to "pgdatinitX" kernel thread in page_alloc_init_late(). But, page_ext_init() is called before it. So, it leads debug_guardpage_enabled() return true, but page extension is not allocated yet for the struct pages initialized by "pgdatinitX". No. After page_ext_init(), it is ensured that all page extension is initialized. It sounds page_ext_init() should be called after page_alloc_init_late(). Or it should be just incompatible with CONFIG_DEFERRED_STRUCT_PAGE_INIT. I will try to move the init call around. We need to investigate more. 
I guess that problem is introduced by CONFIG_DEFERRED_STRUCT_PAGE_INIT. It makes pfn_to_nid() invalid until page_alloc_init_late() is done. That is a big side-effect. If there is pfn walker and it uses pfn_to_nid() between memmap_init_zone() and page_alloc_init_late(), it also has same problem. So, we need to think how to fix it more carefully. Thanks for the analysis. I think you are correct. Since pfn_to_nid() depends on memmap which has not been fully setup yet until page_alloc_init_late() is done. So, for such usecase early_pfn_to_nid() should be used. Anyway, to make sure that my assumption is true, could you confirm that below change fix your problem? Yes, it does. Thanks. ->8-- diff --git a/mm/page_ext.c b/mm/page_ext.c index 2d864e6..cac5dc9 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -391,7 +391,7 @@ void __init page_ext_init(void)
Re: [PATCH] mm: fix build problems from lookup_page_ext
Hi Arnd, Thanks a lot for the patch. My bad, sorry for the inconvenience. I omitted the specific page_idle change is for 32 bit only. And, my host compiler looks old which is still 4.8 so it might not catch the warning. I will update my compiler. Regards, Yang On 5/24/2016 3:08 AM, Arnd Bergmann wrote: A patch for lookup_page_ext introduced several build errors and warnings, e.g. mm/page_owner.c: In function '__set_page_owner': mm/page_owner.c:71:2: error: ISO C90 forbids mixed declarations and code [-Werror=declaration-after-statement] include/linux/page_idle.h: In function 'set_page_young': include/linux/page_idle.h:62:3: error: expected ')' before 'return' This fixes all of them. Please fold into the original patch. Signed-off-by: Arnd BergmannFixes: 38c4fffbad3c ("mm: check the return value of lookup_page_ext for all call sites") diff --git a/include/linux/page_idle.h b/include/linux/page_idle.h index 569c3a180625..fec40271339f 100644 --- a/include/linux/page_idle.h +++ b/include/linux/page_idle.h @@ -48,7 +48,7 @@ static inline bool page_is_young(struct page *page) { struct page_ext *page_ext = lookup_page_ext(page); - if (unlikely(!page_ext) + if (unlikely(!page_ext)) return false; return test_bit(PAGE_EXT_YOUNG, _ext->flags); @@ -58,7 +58,7 @@ static inline void set_page_young(struct page *page) { struct page_ext *page_ext = lookup_page_ext(page); - if (unlikely(!page_ext) + if (unlikely(!page_ext)) return; set_bit(PAGE_EXT_YOUNG, _ext->flags); @@ -68,7 +68,7 @@ static inline bool test_and_clear_page_young(struct page *page) { struct page_ext *page_ext = lookup_page_ext(page); - if (unlikely(!page_ext) + if (unlikely(!page_ext)) return false; return test_and_clear_bit(PAGE_EXT_YOUNG, _ext->flags); @@ -78,7 +78,7 @@ static inline bool page_is_idle(struct page *page) { struct page_ext *page_ext = lookup_page_ext(page); - if (unlikely(!page_ext) + if (unlikely(!page_ext)) return false; return test_bit(PAGE_EXT_IDLE, _ext->flags); @@ -88,7 +88,7 @@ static inline 
void set_page_idle(struct page *page) { struct page_ext *page_ext = lookup_page_ext(page); - if (unlikely(!page_ext) + if (unlikely(!page_ext)) return; set_bit(PAGE_EXT_IDLE, _ext->flags); @@ -98,7 +98,7 @@ static inline void clear_page_idle(struct page *page) { struct page_ext *page_ext = lookup_page_ext(page); - if (unlikely(!page_ext) + if (unlikely(!page_ext)) return; clear_bit(PAGE_EXT_IDLE, _ext->flags); diff --git a/mm/page_owner.c b/mm/page_owner.c index 902e39813295..c6cda3e36212 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -65,9 +65,6 @@ void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask) { struct page_ext *page_ext = lookup_page_ext(page); - if (unlikely(!page_ext)) - return; - struct stack_trace trace = { .nr_entries = 0, .max_entries = ARRAY_SIZE(page_ext->trace_entries), @@ -75,6 +72,9 @@ void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask) .skip = 3, }; + if (unlikely(!page_ext)) + return; + save_stack_trace(); page_ext->order = order; @@ -111,12 +111,11 @@ void __copy_page_owner(struct page *oldpage, struct page *newpage) { struct page_ext *old_ext = lookup_page_ext(oldpage); struct page_ext *new_ext = lookup_page_ext(newpage); + int i; if (unlikely(!old_ext || !new_ext)) return; - int i; - new_ext->order = old_ext->order; new_ext->gfp_mask = old_ext->gfp_mask; new_ext->nr_entries = old_ext->nr_entries; @@ -204,11 +203,6 @@ err: void __dump_page_owner(struct page *page) { struct page_ext *page_ext = lookup_page_ext(page); - if (unlikely(!page_ext)) { - pr_alert("There is not page extension available.\n"); - return; - } - struct stack_trace trace = { .nr_entries = page_ext->nr_entries, .entries = _ext->trace_entries[0], @@ -216,6 +210,11 @@ void __dump_page_owner(struct page *page) gfp_t gfp_mask = page_ext->gfp_mask; int mt = gfpflags_to_migratetype(gfp_mask); + if (unlikely(!page_ext)) { + pr_alert("There is not page extension available.\n"); + return; + } + if (!test_bit(PAGE_EXT_OWNER, 
&page_ext->flags)) { pr_alert("page_owner info is not active (free page?)\n"); return;
Re: [PATCH] mm: fix build problems from lookup_page_ext
Hi Arnd, Thanks a lot for the patch. My bad, sorry for the inconvenience. I omitted the specific page_idle change is for 32 bit only. And, my host compiler looks old which is still 4.8 so it might not catch the warning. I will update my compiler. Regards, Yang On 5/24/2016 3:08 AM, Arnd Bergmann wrote: A patch for lookup_page_ext introduced several build errors and warnings, e.g. mm/page_owner.c: In function '__set_page_owner': mm/page_owner.c:71:2: error: ISO C90 forbids mixed declarations and code [-Werror=declaration-after-statement] include/linux/page_idle.h: In function 'set_page_young': include/linux/page_idle.h:62:3: error: expected ')' before 'return' This fixes all of them. Please fold into the original patch. Signed-off-by: Arnd Bergmann Fixes: 38c4fffbad3c ("mm: check the return value of lookup_page_ext for all call sites") diff --git a/include/linux/page_idle.h b/include/linux/page_idle.h index 569c3a180625..fec40271339f 100644 --- a/include/linux/page_idle.h +++ b/include/linux/page_idle.h @@ -48,7 +48,7 @@ static inline bool page_is_young(struct page *page) { struct page_ext *page_ext = lookup_page_ext(page); - if (unlikely(!page_ext) + if (unlikely(!page_ext)) return false; return test_bit(PAGE_EXT_YOUNG, _ext->flags); @@ -58,7 +58,7 @@ static inline void set_page_young(struct page *page) { struct page_ext *page_ext = lookup_page_ext(page); - if (unlikely(!page_ext) + if (unlikely(!page_ext)) return; set_bit(PAGE_EXT_YOUNG, _ext->flags); @@ -68,7 +68,7 @@ static inline bool test_and_clear_page_young(struct page *page) { struct page_ext *page_ext = lookup_page_ext(page); - if (unlikely(!page_ext) + if (unlikely(!page_ext)) return false; return test_and_clear_bit(PAGE_EXT_YOUNG, _ext->flags); @@ -78,7 +78,7 @@ static inline bool page_is_idle(struct page *page) { struct page_ext *page_ext = lookup_page_ext(page); - if (unlikely(!page_ext) + if (unlikely(!page_ext)) return false; return test_bit(PAGE_EXT_IDLE, _ext->flags); @@ -88,7 +88,7 @@ static 
inline void set_page_idle(struct page *page) { struct page_ext *page_ext = lookup_page_ext(page); - if (unlikely(!page_ext) + if (unlikely(!page_ext)) return; set_bit(PAGE_EXT_IDLE, _ext->flags); @@ -98,7 +98,7 @@ static inline void clear_page_idle(struct page *page) { struct page_ext *page_ext = lookup_page_ext(page); - if (unlikely(!page_ext) + if (unlikely(!page_ext)) return; clear_bit(PAGE_EXT_IDLE, _ext->flags); diff --git a/mm/page_owner.c b/mm/page_owner.c index 902e39813295..c6cda3e36212 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -65,9 +65,6 @@ void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask) { struct page_ext *page_ext = lookup_page_ext(page); - if (unlikely(!page_ext)) - return; - struct stack_trace trace = { .nr_entries = 0, .max_entries = ARRAY_SIZE(page_ext->trace_entries), @@ -75,6 +72,9 @@ void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask) .skip = 3, }; + if (unlikely(!page_ext)) + return; + save_stack_trace(); page_ext->order = order; @@ -111,12 +111,11 @@ void __copy_page_owner(struct page *oldpage, struct page *newpage) { struct page_ext *old_ext = lookup_page_ext(oldpage); struct page_ext *new_ext = lookup_page_ext(newpage); + int i; if (unlikely(!old_ext || !new_ext)) return; - int i; - new_ext->order = old_ext->order; new_ext->gfp_mask = old_ext->gfp_mask; new_ext->nr_entries = old_ext->nr_entries; @@ -204,11 +203,6 @@ err: void __dump_page_owner(struct page *page) { struct page_ext *page_ext = lookup_page_ext(page); - if (unlikely(!page_ext)) { - pr_alert("There is not page extension available.\n"); - return; - } - struct stack_trace trace = { .nr_entries = page_ext->nr_entries, .entries = _ext->trace_entries[0], @@ -216,6 +210,11 @@ void __dump_page_owner(struct page *page) gfp_t gfp_mask = page_ext->gfp_mask; int mt = gfpflags_to_migratetype(gfp_mask); + if (unlikely(!page_ext)) { + pr_alert("There is not page extension available.\n"); + return; + } + if 
(!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) { pr_alert("page_owner info is not active (free page?)\n"); return;
Re: [PATCH] mm: make CONFIG_DEFERRED_STRUCT_PAGE_INIT depends on !FLATMEM explicitly
On 5/23/2016 11:22 AM, Michal Hocko wrote: On Mon 23-05-16 09:54:31, Yang Shi wrote: Per the suggestion from Michal Hocko [1], CONFIG_DEFERRED_STRUCT_PAGE_INIT should be incompatible with FLATMEM, make this explicitly in Kconfig. I guess the changelog could benefit from some clarification. What do you think about the following: " DEFERRED_STRUCT_PAGE_INIT requires some ordering wrt other initialization operations, e.g. page_ext_init has to happen after the whole memmap is initialized properly. For SPARSEMEM this requires to wait for page_alloc_init_late. Other memory models (e.g. flatmem) might have different initialization layouts (page_ext_init_flatmem). Currently DEFERRED_STRUCT_PAGE_INIT depends on MEMORY_HOTPLUG which in turn depends on SPARSEMEM || X86_64_ACPI_NUMA depends on ARCH_ENABLE_MEMORY_HOTPLUG and X86_64_ACPI_NUMA depends on NUMA which in turn disable FLATMEM memory model: config ARCH_FLATMEM_ENABLE def_bool y depends on X86_32 && !NUMA so FLATMEM is ruled out via dependency maze. Be explicit and disable FLATMEM for DEFERRED_STRUCT_PAGE_INIT so that we do not reintroduce subtle initialization bugs " Thanks a lot. It sounds way better. Will address in V2. Yang [1] http://lkml.kernel.org/r/20160523073157.gd2...@dhcp22.suse.cz Signed-off-by: Yang Shi <yang@linaro.org> --- mm/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/Kconfig b/mm/Kconfig index 2664c11..22fa818 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -649,6 +649,7 @@ config DEFERRED_STRUCT_PAGE_INIT default n depends on ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT depends on MEMORY_HOTPLUG + depends on !FLATMEM help Ordinarily all struct pages are initialised during early boot in a single thread. On very large machines this can take a considerable -- 2.0.2
Re: [PATCH] mm: make CONFIG_DEFERRED_STRUCT_PAGE_INIT depends on !FLATMEM explicitly
On 5/23/2016 11:22 AM, Michal Hocko wrote: On Mon 23-05-16 09:54:31, Yang Shi wrote: Per the suggestion from Michal Hocko [1], CONFIG_DEFERRED_STRUCT_PAGE_INIT should be incompatible with FLATMEM, make this explicitly in Kconfig. I guess the changelog could benefit from some clarification. What do you think about the following: " DEFERRED_STRUCT_PAGE_INIT requires some ordering wrt other initialization operations, e.g. page_ext_init has to happen after the whole memmap is initialized properly. For SPARSEMEM this requires to wait for page_alloc_init_late. Other memory models (e.g. flatmem) might have different initialization layouts (page_ext_init_flatmem). Currently DEFERRED_STRUCT_PAGE_INIT depends on MEMORY_HOTPLUG which in turn depends on SPARSEMEM || X86_64_ACPI_NUMA depends on ARCH_ENABLE_MEMORY_HOTPLUG and X86_64_ACPI_NUMA depends on NUMA which in turn disable FLATMEM memory model: config ARCH_FLATMEM_ENABLE def_bool y depends on X86_32 && !NUMA so FLATMEM is ruled out via dependency maze. Be explicit and disable FLATMEM for DEFERRED_STRUCT_PAGE_INIT so that we do not reintroduce subtle initialization bugs " Thanks a lot. It sounds way better. Will address in V2. Yang [1] http://lkml.kernel.org/r/20160523073157.gd2...@dhcp22.suse.cz Signed-off-by: Yang Shi --- mm/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/Kconfig b/mm/Kconfig index 2664c11..22fa818 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -649,6 +649,7 @@ config DEFERRED_STRUCT_PAGE_INIT default n depends on ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT depends on MEMORY_HOTPLUG + depends on !FLATMEM help Ordinarily all struct pages are initialised during early boot in a single thread. On very large machines this can take a considerable -- 2.0.2
Re: [v2 PATCH] mm: move page_ext_init after all struct pages are initialized
On 5/23/2016 12:31 AM, Michal Hocko wrote: On Fri 20-05-16 08:41:09, Shi, Yang wrote: On 5/20/2016 6:16 AM, Michal Hocko wrote: On Thu 19-05-16 15:13:26, Yang Shi wrote: [...] diff --git a/init/main.c b/init/main.c index b3c6e36..2075faf 100644 --- a/init/main.c +++ b/init/main.c @@ -606,7 +606,6 @@ asmlinkage __visible void __init start_kernel(void) initrd_start = 0; } #endif - page_ext_init(); debug_objects_mem_init(); kmemleak_init(); setup_per_cpu_pageset(); @@ -1004,6 +1003,8 @@ static noinline void __init kernel_init_freeable(void) sched_init_smp(); page_alloc_init_late(); + /* Initialize page ext after all struct pages are initializaed */ + page_ext_init(); do_basic_setup(); I might be missing something but don't we have the same problem with CONFIG_FLATMEM? page_ext_init_flatmem is called way earlier. Or CONFIG_DEFERRED_STRUCT_PAGE_INIT is never enabled for CONFIG_FLATMEM? Yes, CONFIG_DEFERRED_STRUCT_PAGE_INIT depends on MEMORY_HOTPLUG which depends on SPARSEMEM. So, this config is not valid for FLATMEM at all. Well config MEMORY_HOTPLUG bool "Allow for memory hot-add" depends on SPARSEMEM || X86_64_ACPI_NUMA depends on ARCH_ENABLE_MEMORY_HOTPLUG I wasn't really sure about X86_64_ACPI_NUMA dependency branch which depends on X86_64 && NUMA && ACPI && PCI and that didn't sound like SPARSEMEM only. If the FLATMEM shouldn't exist with Actually, FLATMEMT depends on !NUMA. CONFIG_DEFERRED_STRUCT_PAGE_INIT can we make that explicit please? Sure, it makes the condition clearer and more readable. Thanks, Yang
Re: [v2 PATCH] mm: move page_ext_init after all struct pages are initialized
On 5/23/2016 12:31 AM, Michal Hocko wrote: On Fri 20-05-16 08:41:09, Shi, Yang wrote: On 5/20/2016 6:16 AM, Michal Hocko wrote: On Thu 19-05-16 15:13:26, Yang Shi wrote: [...] diff --git a/init/main.c b/init/main.c index b3c6e36..2075faf 100644 --- a/init/main.c +++ b/init/main.c @@ -606,7 +606,6 @@ asmlinkage __visible void __init start_kernel(void) initrd_start = 0; } #endif - page_ext_init(); debug_objects_mem_init(); kmemleak_init(); setup_per_cpu_pageset(); @@ -1004,6 +1003,8 @@ static noinline void __init kernel_init_freeable(void) sched_init_smp(); page_alloc_init_late(); + /* Initialize page ext after all struct pages are initializaed */ + page_ext_init(); do_basic_setup(); I might be missing something but don't we have the same problem with CONFIG_FLATMEM? page_ext_init_flatmem is called way earlier. Or CONFIG_DEFERRED_STRUCT_PAGE_INIT is never enabled for CONFIG_FLATMEM? Yes, CONFIG_DEFERRED_STRUCT_PAGE_INIT depends on MEMORY_HOTPLUG which depends on SPARSEMEM. So, this config is not valid for FLATMEM at all. Well config MEMORY_HOTPLUG bool "Allow for memory hot-add" depends on SPARSEMEM || X86_64_ACPI_NUMA depends on ARCH_ENABLE_MEMORY_HOTPLUG I wasn't really sure about X86_64_ACPI_NUMA dependency branch which depends on X86_64 && NUMA && ACPI && PCI and that didn't sound like SPARSEMEM only. If the FLATMEM shouldn't exist with Actually, FLATMEMT depends on !NUMA. CONFIG_DEFERRED_STRUCT_PAGE_INIT can we make that explicit please? Sure, it makes the condition clearer and more readable. Thanks, Yang
Re: [PATCH] mm: page_is_guard return false when page_ext arrays are not allocated yet
On 5/19/2016 7:40 PM, Joonsoo Kim wrote: 2016-05-20 2:18 GMT+09:00 Shi, Yang <yang@linaro.org>: On 5/18/2016 5:28 PM, Joonsoo Kim wrote: Vlastiml, thanks for ccing me on original bug report. On Wed, May 18, 2016 at 03:23:45PM -0700, Yang Shi wrote: When enabling the below kernel configs: CONFIG_DEFERRED_STRUCT_PAGE_INIT CONFIG_DEBUG_PAGEALLOC CONFIG_PAGE_EXTENSION CONFIG_DEBUG_VM kernel bootup may fail due to the following oops: BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] free_pcppages_bulk+0x2d2/0x8d0 PGD 0 Oops: [#1] PREEMPT SMP DEBUG_PAGEALLOC Modules linked in: CPU: 11 PID: 106 Comm: pgdatinit1 Not tainted 4.6.0-rc5-next-20160427 #26 Hardware name: Intel Corporation S5520HC/S5520HC, BIOS S5500.86B.01.10.0025.030220091519 03/02/2009 task: 88017c080040 ti: 88017c084000 task.ti: 88017c084000 RIP: 0010:[] [] free_pcppages_bulk+0x2d2/0x8d0 RSP: :88017c087c48 EFLAGS: 00010046 RAX: RBX: RCX: 0001 RDX: 0980 RSI: 0080 RDI: 00660401 RBP: 88017c087cd0 R08: 0401 R09: 0009 R10: 88017c080040 R11: 000a R12: 0400 R13: ea001981 R14: ea0019810040 R15: 88066cfe6080 FS: () GS:88066cd4() knlGS: CS: 0010 DS: ES: CR0: 80050033 CR2: CR3: 02406000 CR4: 06e0 Stack: 88066cd5bbd8 88066cfe6640 001f001f 88066cd5bbe8 ea001981 8118f53e 0009 0401 000a 0001 Call Trace: [] free_hot_cold_page+0x192/0x1d0 [] __free_pages+0x5c/0x90 [] __free_pages_boot_core+0x11a/0x14e [] deferred_free_range+0x50/0x62 [] deferred_init_memmap+0x220/0x3c3 [] ? setup_per_cpu_pageset+0x35/0x35 [] kthread+0xf8/0x110 [] ret_from_fork+0x22/0x40 [] ? kthread_create_on_node+0x200/0x200 Code: 49 89 d4 48 c1 e0 06 49 01 c5 e9 de fe ff ff 4c 89 f7 44 89 4d b8 4c 89 45 c0 44 89 5d c8 48 89 4d d0 e8 62 c7 07 00 48 8b 4d d0 <48> 8b 00 44 8b 5d c8 4c 8b 45 c0 44 8b 4d b8 a8 02 0f 84 05 ff RIP [] free_pcppages_bulk+0x2d2/0x8d0 RSP CR2: The problem is lookup_page_ext() returns NULL then page_is_guard() tried to access it in page freeing. 
page_is_guard() depends on PAGE_EXT_DEBUG_GUARD bit of page extension flag, but freeing page might reach here before the page_ext arrays are allocated when feeding a range of pages to the allocator for the first time during bootup or memory hotplug. Patch itself looks find to me because I also found that this kind of problem happens during memory hotplug. So, we need to fix more sites, all callers of lookup_page_ext(). Yes, I agree. I will come up with a patch or a couple of patches to check the return value of lookup_page_ext(). But, I'd like to know how your problem occurs during bootup. debug_guardpage_enabled() is turned to 'enable' after page_ext is initialized. Before that, page_is_guard() unconditionally returns false so I think that the problem what you mentioned can't happen. Could you check that when debug_guardpage_enabled() returns 'enable' and init_section_page_ext() is called? I think the problem is I have CONFIG_DEFERRED_STRUCT_PAGE_INIT enabled, which will defer some struct pages initialization to "pgdatinitX" kernel thread in page_alloc_init_late(). But, page_ext_init() is called before it. So, it leads debug_guardpage_enabled() return true, but page extension is not allocated yet for the struct pages initialized by "pgdatinitX". No. After page_ext_init(), it is ensured that all page extension is initialized. It sounds page_ext_init() should be called after page_alloc_init_late(). Or it should be just incompatible with CONFIG_DEFERRED_STRUCT_PAGE_INIT. I will try to move the init call around. We need to investigate more. I guess that problem is introduced by CONFIG_DEFERRED_STRUCT_PAGE_INIT. It makes pfn_to_nid() invalid until page_alloc_init_late() is done. That is a big side-effect. If there is pfn walker and it uses pfn_to_nid() between memmap_init_zone() and page_alloc_init_late(), it also has same problem. So, we need to think how to fix it more carefully. Thanks for the analysis. I think you are correct. 
Since pfn_to_nid() depends on memmap which has not been fully setup yet until page_alloc_init_late() is done. So, for such usecase early_pfn_to_nid() should be used. Anyway, to make sure that my assumption is true, could you confirm that below change fix your problem? Yes, it does. Thanks. ->8-- diff --git a/mm/page_ext.c b/mm/page_ext.c index 2d864e6..cac5dc9 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -391,7 +391,7 @@ void __init page_ext_init(void) * -pfn-->
Re: [PATCH] mm: page_is_guard return false when page_ext arrays are not allocated yet
On 5/19/2016 7:40 PM, Joonsoo Kim wrote: 2016-05-20 2:18 GMT+09:00 Shi, Yang : On 5/18/2016 5:28 PM, Joonsoo Kim wrote: Vlastiml, thanks for ccing me on original bug report. On Wed, May 18, 2016 at 03:23:45PM -0700, Yang Shi wrote: When enabling the below kernel configs: CONFIG_DEFERRED_STRUCT_PAGE_INIT CONFIG_DEBUG_PAGEALLOC CONFIG_PAGE_EXTENSION CONFIG_DEBUG_VM kernel bootup may fail due to the following oops: BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] free_pcppages_bulk+0x2d2/0x8d0 PGD 0 Oops: [#1] PREEMPT SMP DEBUG_PAGEALLOC Modules linked in: CPU: 11 PID: 106 Comm: pgdatinit1 Not tainted 4.6.0-rc5-next-20160427 #26 Hardware name: Intel Corporation S5520HC/S5520HC, BIOS S5500.86B.01.10.0025.030220091519 03/02/2009 task: 88017c080040 ti: 88017c084000 task.ti: 88017c084000 RIP: 0010:[] [] free_pcppages_bulk+0x2d2/0x8d0 RSP: :88017c087c48 EFLAGS: 00010046 RAX: RBX: RCX: 0001 RDX: 0980 RSI: 0080 RDI: 00660401 RBP: 88017c087cd0 R08: 0401 R09: 0009 R10: 88017c080040 R11: 000a R12: 0400 R13: ea001981 R14: ea0019810040 R15: 88066cfe6080 FS: () GS:88066cd4() knlGS: CS: 0010 DS: ES: CR0: 80050033 CR2: CR3: 02406000 CR4: 06e0 Stack: 88066cd5bbd8 88066cfe6640 001f001f 88066cd5bbe8 ea001981 8118f53e 0009 0401 000a 0001 Call Trace: [] free_hot_cold_page+0x192/0x1d0 [] __free_pages+0x5c/0x90 [] __free_pages_boot_core+0x11a/0x14e [] deferred_free_range+0x50/0x62 [] deferred_init_memmap+0x220/0x3c3 [] ? setup_per_cpu_pageset+0x35/0x35 [] kthread+0xf8/0x110 [] ret_from_fork+0x22/0x40 [] ? kthread_create_on_node+0x200/0x200 Code: 49 89 d4 48 c1 e0 06 49 01 c5 e9 de fe ff ff 4c 89 f7 44 89 4d b8 4c 89 45 c0 44 89 5d c8 48 89 4d d0 e8 62 c7 07 00 48 8b 4d d0 <48> 8b 00 44 8b 5d c8 4c 8b 45 c0 44 8b 4d b8 a8 02 0f 84 05 ff RIP [] free_pcppages_bulk+0x2d2/0x8d0 RSP CR2: The problem is lookup_page_ext() returns NULL then page_is_guard() tried to access it in page freeing. 
page_is_guard() depends on PAGE_EXT_DEBUG_GUARD bit of page extension flag, but freeing page might reach here before the page_ext arrays are allocated when feeding a range of pages to the allocator for the first time during bootup or memory hotplug. Patch itself looks fine to me because I also found that this kind of problem happens during memory hotplug. So, we need to fix more sites, all callers of lookup_page_ext(). Yes, I agree. I will come up with a patch or a couple of patches to check the return value of lookup_page_ext(). But, I'd like to know how your problem occurs during bootup. debug_guardpage_enabled() is turned to 'enable' after page_ext is initialized. Before that, page_is_guard() unconditionally returns false so I think that the problem what you mentioned can't happen. Could you check that when debug_guardpage_enabled() returns 'enable' and init_section_page_ext() is called? I think the problem is I have CONFIG_DEFERRED_STRUCT_PAGE_INIT enabled, which will defer some struct pages initialization to "pgdatinitX" kernel thread in page_alloc_init_late(). But, page_ext_init() is called before it. So, it leads debug_guardpage_enabled() return true, but page extension is not allocated yet for the struct pages initialized by "pgdatinitX". No. After page_ext_init(), it is ensured that all page extension is initialized. It sounds page_ext_init() should be called after page_alloc_init_late(). Or it should be just incompatible with CONFIG_DEFERRED_STRUCT_PAGE_INIT. I will try to move the init call around. We need to investigate more. I guess that problem is introduced by CONFIG_DEFERRED_STRUCT_PAGE_INIT. It makes pfn_to_nid() invalid until page_alloc_init_late() is done. That is a big side-effect. If there is pfn walker and it uses pfn_to_nid() between memmap_init_zone() and page_alloc_init_late(), it also has same problem. So, we need to think how to fix it more carefully. Thanks for the analysis. I think you are correct. 
Since pfn_to_nid() depends on memmap which has not been fully setup yet until page_alloc_init_late() is done. So, for such usecase early_pfn_to_nid() should be used. Anyway, to make sure that my assumption is true, could you confirm that below change fix your problem? Yes, it does. Thanks. ->8-- diff --git a/mm/page_ext.c b/mm/page_ext.c index 2d864e6..cac5dc9 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -391,7 +391,7 @@ void __init page_ext_init(void) * -pfn-->
Re: [v2 PATCH] mm: move page_ext_init after all struct pages are initialized
On 5/20/2016 6:16 AM, Michal Hocko wrote: On Thu 19-05-16 15:13:26, Yang Shi wrote: [...] diff --git a/init/main.c b/init/main.c index b3c6e36..2075faf 100644 --- a/init/main.c +++ b/init/main.c @@ -606,7 +606,6 @@ asmlinkage __visible void __init start_kernel(void) initrd_start = 0; } #endif - page_ext_init(); debug_objects_mem_init(); kmemleak_init(); setup_per_cpu_pageset(); @@ -1004,6 +1003,8 @@ static noinline void __init kernel_init_freeable(void) sched_init_smp(); page_alloc_init_late(); + /* Initialize page ext after all struct pages are initializaed */ + page_ext_init(); do_basic_setup(); I might be missing something but don't we have the same problem with CONFIG_FLATMEM? page_ext_init_flatmem is called way earlier. Or CONFIG_DEFERRED_STRUCT_PAGE_INIT is never enabled for CONFIG_FLATMEM? Yes, CONFIG_DEFERRED_STRUCT_PAGE_INIT depends on MEMORY_HOTPLUG which depends on SPARSEMEM. So, this config is not valid for FLATMEM at all. Thanks, Yang
Re: [v2 PATCH] mm: move page_ext_init after all struct pages are initialized
On 5/20/2016 6:16 AM, Michal Hocko wrote: On Thu 19-05-16 15:13:26, Yang Shi wrote: [...] diff --git a/init/main.c b/init/main.c index b3c6e36..2075faf 100644 --- a/init/main.c +++ b/init/main.c @@ -606,7 +606,6 @@ asmlinkage __visible void __init start_kernel(void) initrd_start = 0; } #endif - page_ext_init(); debug_objects_mem_init(); kmemleak_init(); setup_per_cpu_pageset(); @@ -1004,6 +1003,8 @@ static noinline void __init kernel_init_freeable(void) sched_init_smp(); page_alloc_init_late(); + /* Initialize page ext after all struct pages are initializaed */ + page_ext_init(); do_basic_setup(); I might be missing something but don't we have the same problem with CONFIG_FLATMEM? page_ext_init_flatmem is called way earlier. Or CONFIG_DEFERRED_STRUCT_PAGE_INIT is never enabled for CONFIG_FLATMEM? Yes, CONFIG_DEFERRED_STRUCT_PAGE_INIT depends on MEMORY_HOTPLUG which depends on SPARSEMEM. So, this config is not valid for FLATMEM at all. Thanks, Yang
Re: [PATCH] mm: move page_ext_init after all struct pages are initialized
On 5/19/2016 4:21 PM, Andrew Morton wrote: On Thu, 19 May 2016 15:35:15 -0700 "Shi, Yang" <yang@linaro.org> wrote: On 5/19/2016 3:30 PM, Andrew Morton wrote: On Thu, 19 May 2016 14:29:05 -0700 Yang Shi <yang@linaro.org> wrote: When DEFERRED_STRUCT_PAGE_INIT is enabled, just a subset of memmap at boot are initialized, then the rest are initialized in parallel by starting one-off "pgdatinitX" kernel thread for each node X. If page_ext_init is called before it, some pages will not have valid extension, so move page_ext_init() after it. When fixing a bug, please fully describe the end-user impact of that bug The kernel ran into the below oops which is same with the oops reported in http://ozlabs.org/~akpm/mmots/broken-out/mm-page_is_guard-return-false-when-page_ext-arrays-are-not-allocated-yet.patch. So this patch makes mm-page_is_guard-return-false-when-page_ext-arrays-are-not-allocated-yet.patch obsolete? Actually, no. Checking the return value for lookup_page_ext() is still needed. But, the commit log need to be amended since that bootup oops won't happen anymore with this patch applied. Thanks, Yang
Re: [PATCH] mm: move page_ext_init after all struct pages are initialized
On 5/19/2016 4:21 PM, Andrew Morton wrote: On Thu, 19 May 2016 15:35:15 -0700 "Shi, Yang" wrote: On 5/19/2016 3:30 PM, Andrew Morton wrote: On Thu, 19 May 2016 14:29:05 -0700 Yang Shi wrote: When DEFERRED_STRUCT_PAGE_INIT is enabled, just a subset of memmap at boot are initialized, then the rest are initialized in parallel by starting one-off "pgdatinitX" kernel thread for each node X. If page_ext_init is called before it, some pages will not have valid extension, so move page_ext_init() after it. When fixing a bug, please fully describe the end-user impact of that bug The kernel ran into the below oops which is same with the oops reported in http://ozlabs.org/~akpm/mmots/broken-out/mm-page_is_guard-return-false-when-page_ext-arrays-are-not-allocated-yet.patch. So this patch makes mm-page_is_guard-return-false-when-page_ext-arrays-are-not-allocated-yet.patch obsolete? Actually, no. Checking the return value for lookup_page_ext() is still needed. But, the commit log need to be amended since that bootup oops won't happen anymore with this patch applied. Thanks, Yang
Re: [PATCH] mm: move page_ext_init after all struct pages are initialized
On 5/19/2016 3:30 PM, Andrew Morton wrote: On Thu, 19 May 2016 14:29:05 -0700 Yang Shi <yang@linaro.org> wrote: When DEFERRED_STRUCT_PAGE_INIT is enabled, just a subset of memmap at boot are initialized, then the rest are initialized in parallel by starting one-off "pgdatinitX" kernel thread for each node X. If page_ext_init is called before it, some pages will not have valid extension, so move page_ext_init() after it. When fixing a bug, please fully describe the end-user impact of that bug The kernel ran into the below oops which is same with the oops reported in http://ozlabs.org/~akpm/mmots/broken-out/mm-page_is_guard-return-false-when-page_ext-arrays-are-not-allocated-yet.patch. BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] free_pcppages_bulk+0x2d2/0x8d0 PGD 0 Oops: [#1] PREEMPT SMP DEBUG_PAGEALLOC Modules linked in: CPU: 11 PID: 106 Comm: pgdatinit1 Not tainted 4.6.0-rc5-next-20160427 #26 Hardware name: Intel Corporation S5520HC/S5520HC, BIOS S5500.86B.01.10.0025.030220091519 03/02/2009 task: 88017c080040 ti: 88017c084000 task.ti: 88017c084000 RIP: 0010:[] [] free_pcppages_bulk+0x2d2/0x8d0 RSP: :88017c087c48 EFLAGS: 00010046 RAX: RBX: RCX: 0001 RDX: 0980 RSI: 0080 RDI: 00660401 RBP: 88017c087cd0 R08: 0401 R09: 0009 R10: 88017c080040 R11: 000a R12: 0400 R13: ea001981 R14: ea0019810040 R15: 88066cfe6080 FS: () GS:88066cd4() knlGS: CS: 0010 DS: ES: CR0: 80050033 CR2: CR3: 02406000 CR4: 06e0 Stack: 88066cd5bbd8 88066cfe6640 001f001f 88066cd5bbe8 ea001981 8118f53e 0009 0401 000a 0001 Call Trace: [] free_hot_cold_page+0x192/0x1d0 [] __free_pages+0x5c/0x90 [] __free_pages_boot_core+0x11a/0x14e [] deferred_free_range+0x50/0x62 [] deferred_init_memmap+0x220/0x3c3 [] ? setup_per_cpu_pageset+0x35/0x35 [] kthread+0xf8/0x110 [] ret_from_fork+0x22/0x40 [] ? 
kthread_create_on_node+0x200/0x200 Code: 49 89 d4 48 c1 e0 06 49 01 c5 e9 de fe ff ff 4c 89 f7 44 89 4d b8 4c 89 45 c0 44 89 5d c8 48 89 4d d0 e8 62 c7 07 00 48 8b 4d d0 <48> 8b 00 44 8b 5d c8 4c 8b 45 c0 44 8b 4d b8 a8 02 0f 84 05 ff RIP [] free_pcppages_bulk+0x2d2/0x8d0 RSP CR2: I will add the oops info into the commit log in V2. Thanks, Yang
Re: [PATCH] mm: move page_ext_init after all struct pages are initialized
On 5/19/2016 3:30 PM, Andrew Morton wrote: On Thu, 19 May 2016 14:29:05 -0700 Yang Shi wrote: When DEFERRED_STRUCT_PAGE_INIT is enabled, just a subset of memmap at boot are initialized, then the rest are initialized in parallel by starting one-off "pgdatinitX" kernel thread for each node X. If page_ext_init is called before it, some pages will not have valid extension, so move page_ext_init() after it. When fixing a bug, please fully describe the end-user impact of that bug The kernel ran into the below oops which is same with the oops reported in http://ozlabs.org/~akpm/mmots/broken-out/mm-page_is_guard-return-false-when-page_ext-arrays-are-not-allocated-yet.patch. BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] free_pcppages_bulk+0x2d2/0x8d0 PGD 0 Oops: [#1] PREEMPT SMP DEBUG_PAGEALLOC Modules linked in: CPU: 11 PID: 106 Comm: pgdatinit1 Not tainted 4.6.0-rc5-next-20160427 #26 Hardware name: Intel Corporation S5520HC/S5520HC, BIOS S5500.86B.01.10.0025.030220091519 03/02/2009 task: 88017c080040 ti: 88017c084000 task.ti: 88017c084000 RIP: 0010:[] [] free_pcppages_bulk+0x2d2/0x8d0 RSP: :88017c087c48 EFLAGS: 00010046 RAX: RBX: RCX: 0001 RDX: 0980 RSI: 0080 RDI: 00660401 RBP: 88017c087cd0 R08: 0401 R09: 0009 R10: 88017c080040 R11: 000a R12: 0400 R13: ea001981 R14: ea0019810040 R15: 88066cfe6080 FS: () GS:88066cd4() knlGS: CS: 0010 DS: ES: CR0: 80050033 CR2: CR3: 02406000 CR4: 06e0 Stack: 88066cd5bbd8 88066cfe6640 001f001f 88066cd5bbe8 ea001981 8118f53e 0009 0401 000a 0001 Call Trace: [] free_hot_cold_page+0x192/0x1d0 [] __free_pages+0x5c/0x90 [] __free_pages_boot_core+0x11a/0x14e [] deferred_free_range+0x50/0x62 [] deferred_init_memmap+0x220/0x3c3 [] ? setup_per_cpu_pageset+0x35/0x35 [] kthread+0xf8/0x110 [] ret_from_fork+0x22/0x40 [] ? 
kthread_create_on_node+0x200/0x200 Code: 49 89 d4 48 c1 e0 06 49 01 c5 e9 de fe ff ff 4c 89 f7 44 89 4d b8 4c 89 45 c0 44 89 5d c8 48 89 4d d0 e8 62 c7 07 00 48 8b 4d d0 <48> 8b 00 44 8b 5d c8 4c 8b 45 c0 44 8b 4d b8 a8 02 0f 84 05 ff RIP [] free_pcppages_bulk+0x2d2/0x8d0 RSP CR2: I will add the oops info into the commit log in V2. Thanks, Yang
Re: [PATCH] mm: page_is_guard return false when page_ext arrays are not allocated yet
On 5/18/2016 5:28 PM, Joonsoo Kim wrote: Vlastiml, thanks for ccing me on original bug report. On Wed, May 18, 2016 at 03:23:45PM -0700, Yang Shi wrote: When enabling the below kernel configs: CONFIG_DEFERRED_STRUCT_PAGE_INIT CONFIG_DEBUG_PAGEALLOC CONFIG_PAGE_EXTENSION CONFIG_DEBUG_VM kernel bootup may fail due to the following oops: BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] free_pcppages_bulk+0x2d2/0x8d0 PGD 0 Oops: [#1] PREEMPT SMP DEBUG_PAGEALLOC Modules linked in: CPU: 11 PID: 106 Comm: pgdatinit1 Not tainted 4.6.0-rc5-next-20160427 #26 Hardware name: Intel Corporation S5520HC/S5520HC, BIOS S5500.86B.01.10.0025.030220091519 03/02/2009 task: 88017c080040 ti: 88017c084000 task.ti: 88017c084000 RIP: 0010:[] [] free_pcppages_bulk+0x2d2/0x8d0 RSP: :88017c087c48 EFLAGS: 00010046 RAX: RBX: RCX: 0001 RDX: 0980 RSI: 0080 RDI: 00660401 RBP: 88017c087cd0 R08: 0401 R09: 0009 R10: 88017c080040 R11: 000a R12: 0400 R13: ea001981 R14: ea0019810040 R15: 88066cfe6080 FS: () GS:88066cd4() knlGS: CS: 0010 DS: ES: CR0: 80050033 CR2: CR3: 02406000 CR4: 06e0 Stack: 88066cd5bbd8 88066cfe6640 001f001f 88066cd5bbe8 ea001981 8118f53e 0009 0401 000a 0001 Call Trace: [] free_hot_cold_page+0x192/0x1d0 [] __free_pages+0x5c/0x90 [] __free_pages_boot_core+0x11a/0x14e [] deferred_free_range+0x50/0x62 [] deferred_init_memmap+0x220/0x3c3 [] ? setup_per_cpu_pageset+0x35/0x35 [] kthread+0xf8/0x110 [] ret_from_fork+0x22/0x40 [] ? kthread_create_on_node+0x200/0x200 Code: 49 89 d4 48 c1 e0 06 49 01 c5 e9 de fe ff ff 4c 89 f7 44 89 4d b8 4c 89 45 c0 44 89 5d c8 48 89 4d d0 e8 62 c7 07 00 48 8b 4d d0 <48> 8b 00 44 8b 5d c8 4c 8b 45 c0 44 8b 4d b8 a8 02 0f 84 05 ff RIP [] free_pcppages_bulk+0x2d2/0x8d0 RSP CR2: The problem is lookup_page_ext() returns NULL then page_is_guard() tried to access it in page freeing. 
page_is_guard() depends on PAGE_EXT_DEBUG_GUARD bit of page extension flag, but freeing page might reach here before the page_ext arrays are allocated when feeding a range of pages to the allocator for the first time during bootup or memory hotplug. Patch itself looks fine to me because I also found that this kind of problem happens during memory hotplug. So, we need to fix more sites, all callers of lookup_page_ext(). Yes, I agree. I will come up with a patch or a couple of patches to check the return value of lookup_page_ext(). But, I'd like to know how your problem occurs during bootup. debug_guardpage_enabled() is turned to 'enable' after page_ext is initialized. Before that, page_is_guard() unconditionally returns false so I think that the problem what you mentioned can't happen. Could you check that when debug_guardpage_enabled() returns 'enable' and init_section_page_ext() is called? I think the problem is I have CONFIG_DEFERRED_STRUCT_PAGE_INIT enabled, which will defer some struct pages initialization to "pgdatinitX" kernel thread in page_alloc_init_late(). But, page_ext_init() is called before it. So, it leads debug_guardpage_enabled() return true, but page extension is not allocated yet for the struct pages initialized by "pgdatinitX". It sounds page_ext_init() should be called after page_alloc_init_late(). Or it should be just incompatible with CONFIG_DEFERRED_STRUCT_PAGE_INIT. I will try to move the init call around. Thanks, Yang And, above comment would be stale because it comes from when memcg uses this struct page extension functionality. Now, memcg doesn't use it and there are some changes on this area so I'm not sure that is still true. Thanks.
Re: [PATCH] mm: page_is_guard return false when page_ext arrays are not allocated yet
On 5/18/2016 5:28 PM, Joonsoo Kim wrote: Vlastiml, thanks for ccing me on original bug report. On Wed, May 18, 2016 at 03:23:45PM -0700, Yang Shi wrote: When enabling the below kernel configs: CONFIG_DEFERRED_STRUCT_PAGE_INIT CONFIG_DEBUG_PAGEALLOC CONFIG_PAGE_EXTENSION CONFIG_DEBUG_VM kernel bootup may fail due to the following oops: BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] free_pcppages_bulk+0x2d2/0x8d0 PGD 0 Oops: [#1] PREEMPT SMP DEBUG_PAGEALLOC Modules linked in: CPU: 11 PID: 106 Comm: pgdatinit1 Not tainted 4.6.0-rc5-next-20160427 #26 Hardware name: Intel Corporation S5520HC/S5520HC, BIOS S5500.86B.01.10.0025.030220091519 03/02/2009 task: 88017c080040 ti: 88017c084000 task.ti: 88017c084000 RIP: 0010:[] [] free_pcppages_bulk+0x2d2/0x8d0 RSP: :88017c087c48 EFLAGS: 00010046 RAX: RBX: RCX: 0001 RDX: 0980 RSI: 0080 RDI: 00660401 RBP: 88017c087cd0 R08: 0401 R09: 0009 R10: 88017c080040 R11: 000a R12: 0400 R13: ea001981 R14: ea0019810040 R15: 88066cfe6080 FS: () GS:88066cd4() knlGS: CS: 0010 DS: ES: CR0: 80050033 CR2: CR3: 02406000 CR4: 06e0 Stack: 88066cd5bbd8 88066cfe6640 001f001f 88066cd5bbe8 ea001981 8118f53e 0009 0401 000a 0001 Call Trace: [] free_hot_cold_page+0x192/0x1d0 [] __free_pages+0x5c/0x90 [] __free_pages_boot_core+0x11a/0x14e [] deferred_free_range+0x50/0x62 [] deferred_init_memmap+0x220/0x3c3 [] ? setup_per_cpu_pageset+0x35/0x35 [] kthread+0xf8/0x110 [] ret_from_fork+0x22/0x40 [] ? kthread_create_on_node+0x200/0x200 Code: 49 89 d4 48 c1 e0 06 49 01 c5 e9 de fe ff ff 4c 89 f7 44 89 4d b8 4c 89 45 c0 44 89 5d c8 48 89 4d d0 e8 62 c7 07 00 48 8b 4d d0 <48> 8b 00 44 8b 5d c8 4c 8b 45 c0 44 8b 4d b8 a8 02 0f 84 05 ff RIP [] free_pcppages_bulk+0x2d2/0x8d0 RSP CR2: The problem is lookup_page_ext() returns NULL then page_is_guard() tried to access it in page freeing. 
page_is_guard() depends on PAGE_EXT_DEBUG_GUARD bit of page extension flag, but freeing page might reach here before the page_ext arrays are allocated when feeding a range of pages to the allocator for the first time during bootup or memory hotplug. Patch itself looks fine to me because I also found that this kind of problem happens during memory hotplug. So, we need to fix more sites, all callers of lookup_page_ext(). Yes, I agree. I will come up with a patch or a couple of patches to check the return value of lookup_page_ext(). But, I'd like to know how your problem occurs during bootup. debug_guardpage_enabled() is turned to 'enable' after page_ext is initialized. Before that, page_is_guard() unconditionally returns false so I think that the problem what you mentioned can't happen. Could you check that when debug_guardpage_enabled() returns 'enable' and init_section_page_ext() is called? I think the problem is I have CONFIG_DEFERRED_STRUCT_PAGE_INIT enabled, which will defer some struct pages initialization to "pgdatinitX" kernel thread in page_alloc_init_late(). But, page_ext_init() is called before it. So, it leads debug_guardpage_enabled() return true, but page extension is not allocated yet for the struct pages initialized by "pgdatinitX". It sounds page_ext_init() should be called after page_alloc_init_late(). Or it should be just incompatible with CONFIG_DEFERRED_STRUCT_PAGE_INIT. I will try to move the init call around. Thanks, Yang And, above comment would be stale because it comes from when memcg uses this struct page extension functionality. Now, memcg doesn't use it and there are some changes on this area so I'm not sure that is still true. Thanks.
Re: [BUG] Null pointer dereference when freeing pages on 4.6-rc6
On 5/16/2016 5:44 AM, Vlastimil Babka wrote: [+CC Joonsoo based on git blame] On 05/05/2016 11:13 PM, Shi, Yang wrote: Hi folks, When I enable the below kernel configs on 4.6-rc6, I came across null pointer deference issue in boot stage. CONFIG_SPARSEMEM CONFIG_DEFERRED_STRUCT_PAGE_INIT CONFIG_DEBUG_PAGEALLOC CONFIG_PAGE_EXTENSION CONFIG_DEBUG_VM The splat is: BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] page_is_buddy+0x7b/0xe0 PGD 0 Oops: [#1] PREEMPT SMP DEBUG_PAGEALLOC Modules linked in: CPU: 3 PID: 106 Comm: pgdatinit1 Not tainted 4.6.0-rc6 #8 Hardware name: Intel Corporation S5520HC/S5520HC, BIOS S5500.86B.01.10.0025.030220091519 03/02/2009 task: 88017c1d0040 ti: 88017c1d4000 task.ti: 88017c1d4000 RIP: 0010:[] [] page_is_buddy+0x7b/0xe0 RSP: :88017c1d7bf0 EFLAGS: 00010046 RAX: RBX: ea0019810040 RCX: RDX: RSI: ea0019810040 RDI: 00660401 RBP: 88017c1d7c08 R08: 0001 R09: R10: 01af R11: 0001 R12: ea001981 R13: R14: 0009 R15: ea001981 FS: () GS:88066cc4() knlGS: CS: 0010 DS: ES: CR0: 80050033 CR2: CR3: 02406000 CR4: 06e0 Stack: 1981 88066cfe6080 88017c1d7c70 8118bfea 000a 1600 0001 0401 ea0019810040 0400 88066cc5aca8 Call Trace: [] __free_one_page+0x23a/0x450 [] free_pcppages_bulk+0x136/0x360 [] free_hot_cold_page+0x168/0x1b0 [] __free_pages+0x5c/0x90 [] __free_pages_boot_core.isra.70+0x11a/0x14d [] deferred_free_range+0x50/0x62 [] deferred_init_memmap+0x220/0x3c3 [] ? deferred_free_range+0x62/0x62 [] kthread+0xf8/0x110 [] ret_from_fork+0x22/0x40 [] ? kthread_create_on_node+0x200/0x200 Code: 75 7b 48 89 d8 8b 40 1c 85 c0 74 50 48 c7 c6 38 bd 0d 82 48 89 df e8 25 e2 02 00 0f 0b 48 89 f7 89 55 ec e8 18 cb 07 00 8b 55 ec <48> 8b 00 a8 02 74 9d 3b 53 30 75 98 49 8b 14 24 48 8b 03 48 c1 RIP [] page_is_buddy+0x7b/0xe0 RSP CR2: ---[ end trace e0c05a86b43d97f9 ]--- note: pgdatinit1[106] exited with preempt_count 1 I changed page_is_buddy and __free_one_page to non-inline to get more accurate stack trace. 
Then I did some investigation on it with printing the address of page and buddy, please see the below log: @@__free_one_page:715: page is at ea0005f05c00 buddy is at ea0005f05c80, order is 1 @@__free_one_page:715: page is at ea0005f05c00 buddy is at ea0005f05d00, order is 2 @@__free_one_page:715: page is at ea0005f05c00 buddy is at ea0005f05e00, order is 3 @@__free_one_page:715: page is at ea001981 buddy is at ea0019810040, order is 0 call trace splat @@__free_one_page:715: page is at ea0005f05bc0 buddy is at ea0005f05b80, order is 0 @@__free_one_page:715: page is at ea0005f05b80 buddy is at ea0005f05bc0, order is 0 @@__free_one_page:715: page is at ea0005f05b80 buddy is at ea0005f05b00, order is 1 @@__free_one_page:715: page is at ea0005f05b40 buddy is at ea0005f05b00, order is 0 @@__free_one_page:715: page is at ea0005f05b00 buddy is at ea0005f05b40, order is 0 It shows just before the call trace splat, the page address jumped to ea001981 from ea0005f05xxx, not sure why this is happening. Any hint is appreciated. I think the page address didn't jump, it was that the previous call of __free_one_page() finished at order=3 and now another one was invoked that started at order=0 page ea001981 and immediately hit the splat. And, reading the code leads me to the below call path: page_is_buddy() --> page_is_guard() --> lookup_page_ext() From a quick decodecode I guess that's indeed the path that's crashed. You could try verifying with e.g. addr2line on your vmlinux? I tracked down there via gdb. Then lookup_page_ext() just returns null due to the below code: #if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAGE_POISONING) /* * The sanity checks the page allocator does upon freeing a * page can reach here before the page_ext arrays are * allocated when feeding a range of pages to the allocator * for the first time during bootup or memory hotplug. 
* * This check is also necessary for ensuring page poisoning * works as expected when enabled */ if (!section->page_ext) return NULL; #endif So, according to the comment, it looks there should be a WARN or BUG if it returns NULL? The comment reads like it's unlikely, but possible, so WARN/BUG doesn't sounds right. And, almost no codes check if the return
Re: [BUG] Null pointer dereference when freeing pages on 4.6-rc6
On 5/16/2016 5:44 AM, Vlastimil Babka wrote: [+CC Joonsoo based on git blame] On 05/05/2016 11:13 PM, Shi, Yang wrote: Hi folks, When I enable the below kernel configs on 4.6-rc6, I came across null pointer deference issue in boot stage. CONFIG_SPARSEMEM CONFIG_DEFERRED_STRUCT_PAGE_INIT CONFIG_DEBUG_PAGEALLOC CONFIG_PAGE_EXTENSION CONFIG_DEBUG_VM The splat is: BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] page_is_buddy+0x7b/0xe0 PGD 0 Oops: [#1] PREEMPT SMP DEBUG_PAGEALLOC Modules linked in: CPU: 3 PID: 106 Comm: pgdatinit1 Not tainted 4.6.0-rc6 #8 Hardware name: Intel Corporation S5520HC/S5520HC, BIOS S5500.86B.01.10.0025.030220091519 03/02/2009 task: 88017c1d0040 ti: 88017c1d4000 task.ti: 88017c1d4000 RIP: 0010:[] [] page_is_buddy+0x7b/0xe0 RSP: :88017c1d7bf0 EFLAGS: 00010046 RAX: RBX: ea0019810040 RCX: RDX: RSI: ea0019810040 RDI: 00660401 RBP: 88017c1d7c08 R08: 0001 R09: R10: 01af R11: 0001 R12: ea001981 R13: R14: 0009 R15: ea001981 FS: () GS:88066cc4() knlGS: CS: 0010 DS: ES: CR0: 80050033 CR2: CR3: 02406000 CR4: 06e0 Stack: 1981 88066cfe6080 88017c1d7c70 8118bfea 000a 1600 0001 0401 ea0019810040 0400 88066cc5aca8 Call Trace: [] __free_one_page+0x23a/0x450 [] free_pcppages_bulk+0x136/0x360 [] free_hot_cold_page+0x168/0x1b0 [] __free_pages+0x5c/0x90 [] __free_pages_boot_core.isra.70+0x11a/0x14d [] deferred_free_range+0x50/0x62 [] deferred_init_memmap+0x220/0x3c3 [] ? deferred_free_range+0x62/0x62 [] kthread+0xf8/0x110 [] ret_from_fork+0x22/0x40 [] ? kthread_create_on_node+0x200/0x200 Code: 75 7b 48 89 d8 8b 40 1c 85 c0 74 50 48 c7 c6 38 bd 0d 82 48 89 df e8 25 e2 02 00 0f 0b 48 89 f7 89 55 ec e8 18 cb 07 00 8b 55 ec <48> 8b 00 a8 02 74 9d 3b 53 30 75 98 49 8b 14 24 48 8b 03 48 c1 RIP [] page_is_buddy+0x7b/0xe0 RSP CR2: ---[ end trace e0c05a86b43d97f9 ]--- note: pgdatinit1[106] exited with preempt_count 1 I changed page_is_buddy and __free_one_page to non-inline to get more accurate stack trace. 
Then I did some investigation on it with printing the address of page and buddy, please see the below log: @@__free_one_page:715: page is at ea0005f05c00 buddy is at ea0005f05c80, order is 1 @@__free_one_page:715: page is at ea0005f05c00 buddy is at ea0005f05d00, order is 2 @@__free_one_page:715: page is at ea0005f05c00 buddy is at ea0005f05e00, order is 3 @@__free_one_page:715: page is at ea001981 buddy is at ea0019810040, order is 0 call trace splat @@__free_one_page:715: page is at ea0005f05bc0 buddy is at ea0005f05b80, order is 0 @@__free_one_page:715: page is at ea0005f05b80 buddy is at ea0005f05bc0, order is 0 @@__free_one_page:715: page is at ea0005f05b80 buddy is at ea0005f05b00, order is 1 @@__free_one_page:715: page is at ea0005f05b40 buddy is at ea0005f05b00, order is 0 @@__free_one_page:715: page is at ea0005f05b00 buddy is at ea0005f05b40, order is 0 It shows just before the call trace splat, the page address jumped to ea001981 from ea0005f05xxx, not sure why this is happening. Any hint is appreciated. I think the page address didn't jump, it was that the previous call of __free_one_page() finished at order=3 and now another one was invoked that started at order=0 page ea001981 and immediately hit the splat. And, reading the code leads me to the below call path: page_is_buddy() --> page_is_guard() --> lookup_page_ext() From a quick decodecode I guess that's indeed the path that's crashed. You could try verifying with e.g. addr2line on your vmlinux? I tracked down there via gdb. Then lookup_page_ext() just returns null due to the below code: #if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAGE_POISONING) /* * The sanity checks the page allocator does upon freeing a * page can reach here before the page_ext arrays are * allocated when feeding a range of pages to the allocator * for the first time during bootup or memory hotplug. 
* * This check is also necessary for ensuring page poisoning * works as expected when enabled */ if (!section->page_ext) return NULL; #endif So, according to the comment, it looks there should be a WARN or BUG if it returns NULL? The comment reads like it's unlikely, but possible, so WARN/BUG doesn't sounds right. And, almost no codes check if the return
Re: [PATCH net-next] bpf: arm64: remove callee-save registers use for tmp registers
On 5/16/2016 4:45 PM, Z Lim wrote: Hi Yang, On Mon, May 16, 2016 at 4:09 PM, Yang Shi <yang@linaro.org> wrote: In the current implementation of ARM64 eBPF JIT, R23 and R24 are used for tmp registers, which are callee-saved registers. This leads to variable size of JIT prologue and epilogue. The latest blinding constant change prefers to constant size of prologue and epilogue. AAPCS reserves R9 ~ R15 for temp registers which not need to be saved/restored during function call. So, replace R23 and R24 to R10 and R11, and remove tmp_used flag. CC: Zi Shen Lim <zlim@gmail.com> CC: Daniel Borkmann <dan...@iogearbox.net> Signed-off-by: Yang Shi <yang@linaro.org> --- Couple suggestions, but otherwise: Acked-by: Zi Shen Lim <zlim@gmail.com> 1. Update the diagram. I think it should now be: -* BPF fp register => -80:+-+ <= (BPF_FP) +* BPF fp register => -64:+-+ <= (BPF_FP) Nice catch. I forgot the stack diagram. 2. Add a comment in commit log along the lines of: this is an optimization saving 2 instructions per jited BPF program. Sure, will address in V2. Thanks, Yang Thanks :) z Apply on top of Daniel's blinding constant patchset. arch/arm64/net/bpf_jit_comp.c | 32 1 file changed, 4 insertions(+), 28 deletions(-)
Re: [PATCH net-next] bpf: arm64: remove callee-save registers use for tmp registers
On 5/16/2016 4:45 PM, Z Lim wrote: Hi Yang, On Mon, May 16, 2016 at 4:09 PM, Yang Shi wrote: In the current implementation of ARM64 eBPF JIT, R23 and R24 are used for tmp registers, which are callee-saved registers. This leads to variable size of JIT prologue and epilogue. The latest blinding constant change prefers to constant size of prologue and epilogue. AAPCS reserves R9 ~ R15 for temp registers which not need to be saved/restored during function call. So, replace R23 and R24 to R10 and R11, and remove tmp_used flag. CC: Zi Shen Lim CC: Daniel Borkmann Signed-off-by: Yang Shi --- Couple suggestions, but otherwise: Acked-by: Zi Shen Lim 1. Update the diagram. I think it should now be: -* BPF fp register => -80:+-+ <= (BPF_FP) +* BPF fp register => -64:+-+ <= (BPF_FP) Nice catch. I forgot the stack diagram. 2. Add a comment in commit log along the lines of: this is an optimization saving 2 instructions per jited BPF program. Sure, will address in V2. Thanks, Yang Thanks :) z Apply on top of Daniel's blinding constant patchset. arch/arm64/net/bpf_jit_comp.c | 32 1 file changed, 4 insertions(+), 28 deletions(-)
Re: [PATCH] mm: slab: remove ZONE_DMA_FLAG
On 5/5/2016 4:49 AM, Michal Hocko wrote: On Wed 04-05-16 10:01:37, Yang Shi wrote: Now we have IS_ENABLED helper to check if a Kconfig option is enabled or not, so ZONE_DMA_FLAG sounds no longer useful. And, the use of ZONE_DMA_FLAG in slab looks pointless according to the comment [1] from Johannes Weiner, so remove them and ORing passed in flags with the cache gfp flags has been done in kmem_getpages(). [1] https://lkml.org/lkml/2014/9/25/553 I haven't checked the patch but I have a formal suggestion. lkml.org tends to break and forget, please use http://lkml.kernel.org/r/$msg-id instead. In this case http://lkml.kernel.org/r/20140925185047.ga21...@cmpxchg.org Thanks for the suggestion. Will use msg-id in later post. Regards, Yang Thanks!
Re: [PATCH] mm: slab: remove ZONE_DMA_FLAG
On 5/5/2016 4:49 AM, Michal Hocko wrote: On Wed 04-05-16 10:01:37, Yang Shi wrote: Now we have IS_ENABLED helper to check if a Kconfig option is enabled or not, so ZONE_DMA_FLAG sounds no longer useful. And, the use of ZONE_DMA_FLAG in slab looks pointless according to the comment [1] from Johannes Weiner, so remove them and ORing passed in flags with the cache gfp flags has been done in kmem_getpages(). [1] https://lkml.org/lkml/2014/9/25/553 I haven't checked the patch but I have a formal suggestion. lkml.org tends to break and forget, please use http://lkml.kernel.org/r/$msg-id instead. In this case http://lkml.kernel.org/r/20140925185047.ga21...@cmpxchg.org Thanks for the suggestion. Will use msg-id in later post. Regards, Yang Thanks!
[BUG] Null pointer dereference when freeing pages on 4.6-rc6
Hi folks, When I enable the below kernel configs on 4.6-rc6, I came across null pointer deference issue in boot stage. CONFIG_SPARSEMEM CONFIG_DEFERRED_STRUCT_PAGE_INIT CONFIG_DEBUG_PAGEALLOC CONFIG_PAGE_EXTENSION CONFIG_DEBUG_VM The splat is: BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] page_is_buddy+0x7b/0xe0 PGD 0 Oops: [#1] PREEMPT SMP DEBUG_PAGEALLOC Modules linked in: CPU: 3 PID: 106 Comm: pgdatinit1 Not tainted 4.6.0-rc6 #8 Hardware name: Intel Corporation S5520HC/S5520HC, BIOS S5500.86B.01.10.0025.030220091519 03/02/2009 task: 88017c1d0040 ti: 88017c1d4000 task.ti: 88017c1d4000 RIP: 0010:[] [] page_is_buddy+0x7b/0xe0 RSP: :88017c1d7bf0 EFLAGS: 00010046 RAX: RBX: ea0019810040 RCX: RDX: RSI: ea0019810040 RDI: 00660401 RBP: 88017c1d7c08 R08: 0001 R09: R10: 01af R11: 0001 R12: ea001981 R13: R14: 0009 R15: ea001981 FS: () GS:88066cc4() knlGS: CS: 0010 DS: ES: CR0: 80050033 CR2: CR3: 02406000 CR4: 06e0 Stack: 1981 88066cfe6080 88017c1d7c70 8118bfea 000a 1600 0001 0401 ea0019810040 0400 88066cc5aca8 Call Trace: [] __free_one_page+0x23a/0x450 [] free_pcppages_bulk+0x136/0x360 [] free_hot_cold_page+0x168/0x1b0 [] __free_pages+0x5c/0x90 [] __free_pages_boot_core.isra.70+0x11a/0x14d [] deferred_free_range+0x50/0x62 [] deferred_init_memmap+0x220/0x3c3 [] ? deferred_free_range+0x62/0x62 [] kthread+0xf8/0x110 [] ret_from_fork+0x22/0x40 [] ? kthread_create_on_node+0x200/0x200 Code: 75 7b 48 89 d8 8b 40 1c 85 c0 74 50 48 c7 c6 38 bd 0d 82 48 89 df e8 25 e2 02 00 0f 0b 48 89 f7 89 55 ec e8 18 cb 07 00 8b 55 ec <48> 8b 00 a8 02 74 9d 3b 53 30 75 98 49 8b 14 24 48 8b 03 48 c1 RIP [] page_is_buddy+0x7b/0xe0 RSP CR2: ---[ end trace e0c05a86b43d97f9 ]--- note: pgdatinit1[106] exited with preempt_count 1 I changed page_is_buddy and __free_one_page to non-inline to get more accurate stack trace. 
Then I did some investigation on it with printing the address of page and buddy, please see the below log: @@__free_one_page:715: page is at ea0005f05c00 buddy is at ea0005f05c80, order is 1 @@__free_one_page:715: page is at ea0005f05c00 buddy is at ea0005f05d00, order is 2 @@__free_one_page:715: page is at ea0005f05c00 buddy is at ea0005f05e00, order is 3 @@__free_one_page:715: page is at ea001981 buddy is at ea0019810040, order is 0 call trace splat @@__free_one_page:715: page is at ea0005f05bc0 buddy is at ea0005f05b80, order is 0 @@__free_one_page:715: page is at ea0005f05b80 buddy is at ea0005f05bc0, order is 0 @@__free_one_page:715: page is at ea0005f05b80 buddy is at ea0005f05b00, order is 1 @@__free_one_page:715: page is at ea0005f05b40 buddy is at ea0005f05b00, order is 0 @@__free_one_page:715: page is at ea0005f05b00 buddy is at ea0005f05b40, order is 0 It shows just before the call trace splat, the page address jumped to ea001981 from ea0005f05xxx, not sure why this is happening. Any hint is appreciated. And, reading the code leads me to the below call path: page_is_buddy() --> page_is_guard() --> lookup_page_ext() Then lookup_page_ext() just returns null due to the below code: #if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAGE_POISONING) /* * The sanity checks the page allocator does upon freeing a * page can reach here before the page_ext arrays are * allocated when feeding a range of pages to the allocator * for the first time during bootup or memory hotplug. * * This check is also necessary for ensuring page poisoning * works as expected when enabled */ if (!section->page_ext) return NULL; #endif So, according to the comment, it looks there should be a WARN or BUG if it returns NULL? And, almost no codes check if the return pointer is null or not after lookup_page_ext() is called. Thanks, Yang
[BUG] Null pointer dereference when freeing pages on 4.6-rc6
Hi folks, When I enable the below kernel configs on 4.6-rc6, I came across null pointer deference issue in boot stage. CONFIG_SPARSEMEM CONFIG_DEFERRED_STRUCT_PAGE_INIT CONFIG_DEBUG_PAGEALLOC CONFIG_PAGE_EXTENSION CONFIG_DEBUG_VM The splat is: BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] page_is_buddy+0x7b/0xe0 PGD 0 Oops: [#1] PREEMPT SMP DEBUG_PAGEALLOC Modules linked in: CPU: 3 PID: 106 Comm: pgdatinit1 Not tainted 4.6.0-rc6 #8 Hardware name: Intel Corporation S5520HC/S5520HC, BIOS S5500.86B.01.10.0025.030220091519 03/02/2009 task: 88017c1d0040 ti: 88017c1d4000 task.ti: 88017c1d4000 RIP: 0010:[] [] page_is_buddy+0x7b/0xe0 RSP: :88017c1d7bf0 EFLAGS: 00010046 RAX: RBX: ea0019810040 RCX: RDX: RSI: ea0019810040 RDI: 00660401 RBP: 88017c1d7c08 R08: 0001 R09: R10: 01af R11: 0001 R12: ea001981 R13: R14: 0009 R15: ea001981 FS: () GS:88066cc4() knlGS: CS: 0010 DS: ES: CR0: 80050033 CR2: CR3: 02406000 CR4: 06e0 Stack: 1981 88066cfe6080 88017c1d7c70 8118bfea 000a 1600 0001 0401 ea0019810040 0400 88066cc5aca8 Call Trace: [] __free_one_page+0x23a/0x450 [] free_pcppages_bulk+0x136/0x360 [] free_hot_cold_page+0x168/0x1b0 [] __free_pages+0x5c/0x90 [] __free_pages_boot_core.isra.70+0x11a/0x14d [] deferred_free_range+0x50/0x62 [] deferred_init_memmap+0x220/0x3c3 [] ? deferred_free_range+0x62/0x62 [] kthread+0xf8/0x110 [] ret_from_fork+0x22/0x40 [] ? kthread_create_on_node+0x200/0x200 Code: 75 7b 48 89 d8 8b 40 1c 85 c0 74 50 48 c7 c6 38 bd 0d 82 48 89 df e8 25 e2 02 00 0f 0b 48 89 f7 89 55 ec e8 18 cb 07 00 8b 55 ec <48> 8b 00 a8 02 74 9d 3b 53 30 75 98 49 8b 14 24 48 8b 03 48 c1 RIP [] page_is_buddy+0x7b/0xe0 RSP CR2: ---[ end trace e0c05a86b43d97f9 ]--- note: pgdatinit1[106] exited with preempt_count 1 I changed page_is_buddy and __free_one_page to non-inline to get more accurate stack trace. 
Then I did some investigation on it with printing the address of page and buddy, please see the below log: @@__free_one_page:715: page is at ea0005f05c00 buddy is at ea0005f05c80, order is 1 @@__free_one_page:715: page is at ea0005f05c00 buddy is at ea0005f05d00, order is 2 @@__free_one_page:715: page is at ea0005f05c00 buddy is at ea0005f05e00, order is 3 @@__free_one_page:715: page is at ea001981 buddy is at ea0019810040, order is 0 call trace splat @@__free_one_page:715: page is at ea0005f05bc0 buddy is at ea0005f05b80, order is 0 @@__free_one_page:715: page is at ea0005f05b80 buddy is at ea0005f05bc0, order is 0 @@__free_one_page:715: page is at ea0005f05b80 buddy is at ea0005f05b00, order is 1 @@__free_one_page:715: page is at ea0005f05b40 buddy is at ea0005f05b00, order is 0 @@__free_one_page:715: page is at ea0005f05b00 buddy is at ea0005f05b40, order is 0 It shows just before the call trace splat, the page address jumped to ea001981 from ea0005f05xxx, not sure why this is happening. Any hint is appreciated. And, reading the code leads me to the below call path: page_is_buddy() --> page_is_guard() --> lookup_page_ext() Then lookup_page_ext() just returns null due to the below code: #if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAGE_POISONING) /* * The sanity checks the page allocator does upon freeing a * page can reach here before the page_ext arrays are * allocated when feeding a range of pages to the allocator * for the first time during bootup or memory hotplug. * * This check is also necessary for ensuring page poisoning * works as expected when enabled */ if (!section->page_ext) return NULL; #endif So, according to the comment, it looks there should be a WARN or BUG if it returns NULL? And, almost no codes check if the return pointer is null or not after lookup_page_ext() is called. Thanks, Yang
Re: [PATCH] mm: move huge_pmd_set_accessed out of huge_memory.c
On 4/22/2016 2:48 AM, Kirill A. Shutemov wrote: On Thu, Apr 21, 2016 at 03:56:07PM -0700, Shi, Yang wrote: On 4/21/2016 12:30 AM, Kirill A. Shutemov wrote: On Wed, Apr 20, 2016 at 02:00:11PM -0700, Shi, Yang wrote: Hi folks, I didn't realize pmd_* functions are protected by CONFIG_TRANSPARENT_HUGEPAGE on the most architectures before I made this change. Before I fix all the affected architectures code, I want to check if you guys think this change is worth or not? Thanks, Yang On 4/20/2016 11:24 AM, Yang Shi wrote: huge_pmd_set_accessed is only called by __handle_mm_fault from memory.c, move the definition to memory.c and make it static like create_huge_pmd and wp_huge_pmd. Signed-off-by: Yang Shi <yang@linaro.org> On pte side we have the same functionality open-coded. Should we do the same for pmd? Or change pte side the same way? Sorry, I don't quite understand you. Do you mean pte_* functions? See handle_pte_fault(), we do the same for pte there what huge_pmd_set_accessed() does for pmd. Thanks for directing to this code. I think we should be consistent here: either both are abstracted into functions or both open-coded. I suppose functions sound better. However, do_wp_page has to be called with pte lock acquired. So, the abstracted function has to call it. Thanks, Yang
Re: [PATCH] mm: move huge_pmd_set_accessed out of huge_memory.c
On 4/22/2016 2:48 AM, Kirill A. Shutemov wrote: On Thu, Apr 21, 2016 at 03:56:07PM -0700, Shi, Yang wrote: On 4/21/2016 12:30 AM, Kirill A. Shutemov wrote: On Wed, Apr 20, 2016 at 02:00:11PM -0700, Shi, Yang wrote: Hi folks, I didn't realize pmd_* functions are protected by CONFIG_TRANSPARENT_HUGEPAGE on the most architectures before I made this change. Before I fix all the affected architectures code, I want to check if you guys think this change is worth or not? Thanks, Yang On 4/20/2016 11:24 AM, Yang Shi wrote: huge_pmd_set_accessed is only called by __handle_mm_fault from memory.c, move the definition to memory.c and make it static like create_huge_pmd and wp_huge_pmd. Signed-off-by: Yang Shi On pte side we have the same functionality open-coded. Should we do the same for pmd? Or change pte side the same way? Sorry, I don't quite understand you. Do you mean pte_* functions? See handle_pte_fault(), we do the same for pte there what huge_pmd_set_accessed() does for pmd. Thanks for directing to this code. I think we should be consistent here: either both are abstracted into functions or both open-coded. I suppose functions sound better. However, do_wp_page has to be called with pte lock acquired. So, the abstracted function has to call it. Thanks, Yang
Re: [BUG linux-next] Kernel panic found with linux-next-20160414
On 4/27/2016 1:14 AM, Hugh Dickins wrote: On Wed, 20 Apr 2016, Shi, Yang wrote: On 4/20/2016 1:01 AM, Hugh Dickins wrote: On Tue, 19 Apr 2016, Shi, Yang wrote: Hi folks, When I ran ltp on linux-next-20160414 on my ARM64 machine, I got the below kernel panic: Unable to handle kernel paging request at virtual address ffc007846000 pgd = ffc01e21d000 [ffc007846000] *pgd=, *pud= Internal error: Oops: 9647 [#11] PREEMPT SMP Modules linked in: loop CPU: 7 PID: 274 Comm: systemd-journal Tainted: G D 4.6.0-rc3-next-20160414-WR8.0.0.0_standard+ #9 Hardware name: Freescale Layerscape 2085a RDB Board (DT) task: ffc01e3fcf80 ti: ffc01ea8c000 task.ti: ffc01ea8c000 PC is at copy_page+0x38/0x120 LR is at migrate_page_copy+0x604/0x1660 pc : [] lr : [] pstate: 2145 sp : ffc01ea8ecd0 x29: ffc01ea8ecd0 x28: x27: 1ff7b80240f8 x26: ffc018196f20 x25: ffbdc01e1180 x24: ffbdc01e1180 x23: x22: ffc01e3fcf80 x21: ffc00481f000 x20: ff900a31d000 x19: ffbdc01207c0 x18: 0f00 x17: x16: x15: x14: x13: x12: x11: x10: x9 : x8 : x7 : x6 : x5 : x4 : x3 : x2 : x1 : ffc00481f080 x0 : ffc007846000 Call trace: Exception stack(0xffc021fc2ed0 to 0xffc021fc2ff0) 2ec0: ffbdc00887c0 ff900a31d000 2ee0: ffc021fc30f0 ff9008ff2318 2145 0025 2f00: ffbdc025a280 ffc020adc4c0 41b58ab3 ff900a085fd0 2f20: ff9008200658 ffbdc00887c0 2f40: ff900b0f1320 ffc021fc3078 41b58ab3 ff900a0864f8 2f60: ff9008210010 ffc021fb8960 ff900867bacc 1ff8043f712d 2f80: ffc021fc2fb0 ff9008210564 ffc021fc3070 ffc021fb8940 2fa0: 08221f78 ff900862f9c8 ffc021fc2fe0 ff9008215dc8 2fc0: 1ff8043f8602 ffc021fc ffc00968a000 ffc00221f080 2fe0: f9407e11d1f0 d61f02209103e210 [] copy_page+0x38/0x120 [] migrate_page+0x74/0x98 [] nfs_migrate_page+0x58/0x80 [] move_to_new_page+0x15c/0x4d8 [] migrate_pages+0x7c8/0x11f0 [] compact_zone+0xdfc/0x2570 [] compact_zone_order+0xe0/0x170 [] try_to_compact_pages+0x2e8/0x8f8 [] __alloc_pages_direct_compact+0x100/0x540 [] __alloc_pages_nodemask+0xc40/0x1c58 [] khugepaged+0x468/0x19c8 [] kthread+0x248/0x2c0 [] 
ret_from_fork+0x10/0x40 Code: d281f012 91020021 f1020252 d503201f (a8000c02) I did some initial investigation and found it is caused by DEBUG_PAGEALLOC and CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT. And, mainline 4.6-rc3 works well. It should be not arch specific although I got it caught on ARM64. I suspect this might be caused by Hugh's huge tmpfs patches. Thanks for testing. It might be caused by my patches, but I don't think that's very likely. This is page migraton for compaction, in the service of anon THP's khugepaged; and I wonder if you were even exercising huge tmpfs when running LTP here (it certainly can be done: I like to mount a huge tmpfs on /opt/ltp and install there, with shmem_huge 2 so any other tmpfs mounts are also huge). Some further investigation shows I got the panic even though I don't have tmpfs mounted with huge=1 or set shmem_huge to 2. There are compaction changes in linux-next too, but I don't see any reason why they'd cause this. I don't know arm64 traces enough to know whether it's the source page or the destination page for the copy, but it looks as if it has been freed (and DEBUG_PAGEALLOC unmapped) before reaching migration's copy. The fault address is passed by x0, which is dest in the implementation of copy_page, so it is the destination page. Needs more debugging, I'm afraid: is it reproducible? Yes, as long as I enable those two PAGEALLOC debug options, I can get the panic once I run ltp. But, it is not caused any specific ltp test case directly, the panic happens randomly during ltp is running. Your ping on the crash in release_freepages() reminded me to take another look at this one. And found that I only needed to enable DEBUG_PAGEALLOC and run LTP to get it on x86_64 too, as you suspected. It's another of those compaction errors, in mmotm and linux-next of a week or two ago, whose patch has since been withdrawn (but huge tmpfs has also been withdrawn for now, so you're right to stick with the older linux-next for testing it). 
Yes, I saw the discussion on LSFMM 2016 and the patches have gone in my latest update from linux-next. I will stick to 20160420 for the huge tmpfs testing. I believe the patch below fixes it; but I've not done full diligence on it - if I had more time, I'd want to check that all of the things
Re: [BUG linux-next] Kernel panic found with linux-next-20160414
On 4/27/2016 1:14 AM, Hugh Dickins wrote: On Wed, 20 Apr 2016, Shi, Yang wrote: On 4/20/2016 1:01 AM, Hugh Dickins wrote: On Tue, 19 Apr 2016, Shi, Yang wrote: Hi folks, When I ran ltp on linux-next-20160414 on my ARM64 machine, I got the below kernel panic: Unable to handle kernel paging request at virtual address ffc007846000 pgd = ffc01e21d000 [ffc007846000] *pgd=, *pud= Internal error: Oops: 9647 [#11] PREEMPT SMP Modules linked in: loop CPU: 7 PID: 274 Comm: systemd-journal Tainted: G D 4.6.0-rc3-next-20160414-WR8.0.0.0_standard+ #9 Hardware name: Freescale Layerscape 2085a RDB Board (DT) task: ffc01e3fcf80 ti: ffc01ea8c000 task.ti: ffc01ea8c000 PC is at copy_page+0x38/0x120 LR is at migrate_page_copy+0x604/0x1660 pc : [] lr : [] pstate: 2145 sp : ffc01ea8ecd0 x29: ffc01ea8ecd0 x28: x27: 1ff7b80240f8 x26: ffc018196f20 x25: ffbdc01e1180 x24: ffbdc01e1180 x23: x22: ffc01e3fcf80 x21: ffc00481f000 x20: ff900a31d000 x19: ffbdc01207c0 x18: 0f00 x17: x16: x15: x14: x13: x12: x11: x10: x9 : x8 : x7 : x6 : x5 : x4 : x3 : x2 : x1 : ffc00481f080 x0 : ffc007846000 Call trace: Exception stack(0xffc021fc2ed0 to 0xffc021fc2ff0) 2ec0: ffbdc00887c0 ff900a31d000 2ee0: ffc021fc30f0 ff9008ff2318 2145 0025 2f00: ffbdc025a280 ffc020adc4c0 41b58ab3 ff900a085fd0 2f20: ff9008200658 ffbdc00887c0 2f40: ff900b0f1320 ffc021fc3078 41b58ab3 ff900a0864f8 2f60: ff9008210010 ffc021fb8960 ff900867bacc 1ff8043f712d 2f80: ffc021fc2fb0 ff9008210564 ffc021fc3070 ffc021fb8940 2fa0: 08221f78 ff900862f9c8 ffc021fc2fe0 ff9008215dc8 2fc0: 1ff8043f8602 ffc021fc ffc00968a000 ffc00221f080 2fe0: f9407e11d1f0 d61f02209103e210 [] copy_page+0x38/0x120 [] migrate_page+0x74/0x98 [] nfs_migrate_page+0x58/0x80 [] move_to_new_page+0x15c/0x4d8 [] migrate_pages+0x7c8/0x11f0 [] compact_zone+0xdfc/0x2570 [] compact_zone_order+0xe0/0x170 [] try_to_compact_pages+0x2e8/0x8f8 [] __alloc_pages_direct_compact+0x100/0x540 [] __alloc_pages_nodemask+0xc40/0x1c58 [] khugepaged+0x468/0x19c8 [] kthread+0x248/0x2c0 [] 
ret_from_fork+0x10/0x40 Code: d281f012 91020021 f1020252 d503201f (a8000c02) I did some initial investigation and found it is caused by DEBUG_PAGEALLOC and CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT. And, mainline 4.6-rc3 works well. It should be not arch specific although I got it caught on ARM64. I suspect this might be caused by Hugh's huge tmpfs patches. Thanks for testing. It might be caused by my patches, but I don't think that's very likely. This is page migraton for compaction, in the service of anon THP's khugepaged; and I wonder if you were even exercising huge tmpfs when running LTP here (it certainly can be done: I like to mount a huge tmpfs on /opt/ltp and install there, with shmem_huge 2 so any other tmpfs mounts are also huge). Some further investigation shows I got the panic even though I don't have tmpfs mounted with huge=1 or set shmem_huge to 2. There are compaction changes in linux-next too, but I don't see any reason why they'd cause this. I don't know arm64 traces enough to know whether it's the source page or the destination page for the copy, but it looks as if it has been freed (and DEBUG_PAGEALLOC unmapped) before reaching migration's copy. The fault address is passed by x0, which is dest in the implementation of copy_page, so it is the destination page. Needs more debugging, I'm afraid: is it reproducible? Yes, as long as I enable those two PAGEALLOC debug options, I can get the panic once I run ltp. But, it is not caused any specific ltp test case directly, the panic happens randomly during ltp is running. Your ping on the crash in release_freepages() reminded me to take another look at this one. And found that I only needed to enable DEBUG_PAGEALLOC and run LTP to get it on x86_64 too, as you suspected. It's another of those compaction errors, in mmotm and linux-next of a week or two ago, whose patch has since been withdrawn (but huge tmpfs has also been withdrawn for now, so you're right to stick with the older linux-next for testing it). 
Yes, I saw the discussion on LSFMM 2016 and the patches have gone in my latest update from linux-next. I will stick to 20160420 for the huge tmpfs testing. I believe the patch below fixes it; but I've not done full diligence on it - if I had more time, I'd want to check that all of the things
Re: [BUG] set_pte_at: racy dirty state clearing warning
On 4/21/2016 1:49 AM, Catalin Marinas wrote: On Wed, Apr 20, 2016 at 04:01:39PM -0700, Shi, Yang wrote: When I enable memory compaction via # echo 1 > /proc/sys/vm/compact_memory I got the below WARNING: set_pte_at: racy dirty state clearing: 0x006899371bd3 -> 0x006899371fd3 [ cut here ] WARNING: CPU: 5 PID: 294 at ./arch/arm64/include/asm/pgtable.h:227 ptep_set_access_flags+0x138/0x1b8 Modules linked in: Do you have this patch applied: http://article.gmane.org/gmane.linux.ports.arm.kernel/492239 It's also queued into -next as commit 66dbd6e61a52. No, but I just applied it, it works. Thanks, Yang My kernel has ARM64_HW_AFDBM enabled, but LS2085 is not ARMv8.1. The code shows it just checks if ARM64_HW_AFDBM is enabled or not, but doesn't check if the CPU really has such capability. So, it might be better to have the capability checked at runtime? The warnings are there to spot any incorrect uses of the pte accessors even before you run on AF/DBM-capable hardware.
Re: [BUG] set_pte_at: racy dirty state clearing warning
On 4/21/2016 1:49 AM, Catalin Marinas wrote: On Wed, Apr 20, 2016 at 04:01:39PM -0700, Shi, Yang wrote: When I enable memory compaction via # echo 1 > /proc/sys/vm/compact_memory I got the below WARNING: set_pte_at: racy dirty state clearing: 0x006899371bd3 -> 0x006899371fd3 [ cut here ] WARNING: CPU: 5 PID: 294 at ./arch/arm64/include/asm/pgtable.h:227 ptep_set_access_flags+0x138/0x1b8 Modules linked in: Do you have this patch applied: http://article.gmane.org/gmane.linux.ports.arm.kernel/492239 It's also queued into -next as commit 66dbd6e61a52. No, but I just applied it, it works. Thanks, Yang My kernel has ARM64_HW_AFDBM enabled, but LS2085 is not ARMv8.1. The code shows it just checks if ARM64_HW_AFDBM is enabled or not, but doesn't check if the CPU really has such capability. So, it might be better to have the capability checked at runtime? The warnings are there to spot any incorrect uses of the pte accessors even before you run on AF/DBM-capable hardware.
Re: [BUG linux-next] kernel NULL pointer dereference on linux-next-20160420
Did some preliminary investigation on this one. The problem is the cc->freepages list is empty, but cc->nr_freepages > 0, it looks the list and nr_freepages get out-of-sync somewhere. Any hint is appreciated. Thanks, Yang On 4/21/2016 5:38 PM, Shi, Yang wrote: Hi folks, I did the below test with huge tmpfs on linux-next-20160420: # mount -t tmpfs huge=1 tmpfs /mnt # cd /mnt Then clone linux kernel Then I got the below bug, such test works well on non-huge tmpfs. BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] release_freepages+0x18/0xa0 PGD 0 Oops: [#1] PREEMPT SMP Modules linked in: CPU: 6 PID: 110 Comm: kcompactd0 Not tainted 4.6.0-rc4-next-20160420-WR7.0.0.0_standard #4 Hardware name: Intel Corporation S5520HC/S5520HC, BIOS S5500.86B.01.10.0025.030220091519 03/02/2009 task: 880361708040 ti: 880361704000 task.ti: 880361704000 RIP: 0010:[] [] release_freepages+0x18/0xa0 RSP: 0018:880361707cf8 EFLAGS: 00010282 RAX: RBX: 88036ffde7c0 RCX: 0009 RDX: 1bf1 RSI: 000f RDI: 880361707dd0 RBP: 880361707d20 R08: 0007 R09: 1600 R10: 88036ffde7c0 R11: R12: R13: 880361707dd0 R14: 880361707dc0 R15: 880361707db0 FS: () GS:880363cc() knlGS: CS: 0010 DS: ES: CR0: 80050033 CR2: CR3: 02206000 CR4: 06e0 Stack: 88036ffde7c0 1a00 880361707dc0 880361707db0 880361707da0 8119f13d 81196239 880361708040 0001 0010 Call Trace: [] compact_zone+0x55d/0x9f0 [] ? fragmentation_index+0x19/0x70 [] kcompactd_do_work+0x10f/0x230 [] kcompactd+0x90/0x1e0 [] ? wait_woken+0xa0/0xa0 [] ? kcompactd_do_work+0x230/0x230 [] kthread+0xdd/0x100 [] ret_from_fork+0x22/0x40 [] ? kthread_create_on_node+0x180/0x180 Code: c1 fa 06 31 f6 e8 a9 9b fd ff eb 98 0f 1f 80 00 00 00 00 66 66 66 66 90 55 48 89 e5 41 57 41 56 41 55 49 89 fd 41 54 53 48 8b 07 <48> 8b 10 48 8d 78 e0 49 39 c5 4c 8d 62 e0 74 70 49 be 00 00 00 RIP [] release_freepages+0x18/0xa0 RSP CR2: ---[ end trace 855da7e142f7311f ]---
Re: [BUG linux-next] kernel NULL pointer dereference on linux-next-20160420
Did some preliminary investigation on this one. The problem is the cc->freepages list is empty, but cc->nr_freepages > 0, it looks the list and nr_freepages get out-of-sync somewhere. Any hint is appreciated. Thanks, Yang On 4/21/2016 5:38 PM, Shi, Yang wrote: Hi folks, I did the below test with huge tmpfs on linux-next-20160420: # mount -t tmpfs huge=1 tmpfs /mnt # cd /mnt Then clone linux kernel Then I got the below bug, such test works well on non-huge tmpfs. BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] release_freepages+0x18/0xa0 PGD 0 Oops: [#1] PREEMPT SMP Modules linked in: CPU: 6 PID: 110 Comm: kcompactd0 Not tainted 4.6.0-rc4-next-20160420-WR7.0.0.0_standard #4 Hardware name: Intel Corporation S5520HC/S5520HC, BIOS S5500.86B.01.10.0025.030220091519 03/02/2009 task: 880361708040 ti: 880361704000 task.ti: 880361704000 RIP: 0010:[] [] release_freepages+0x18/0xa0 RSP: 0018:880361707cf8 EFLAGS: 00010282 RAX: RBX: 88036ffde7c0 RCX: 0009 RDX: 1bf1 RSI: 000f RDI: 880361707dd0 RBP: 880361707d20 R08: 0007 R09: 1600 R10: 88036ffde7c0 R11: R12: R13: 880361707dd0 R14: 880361707dc0 R15: 880361707db0 FS: () GS:880363cc() knlGS: CS: 0010 DS: ES: CR0: 80050033 CR2: CR3: 02206000 CR4: 06e0 Stack: 88036ffde7c0 1a00 880361707dc0 880361707db0 880361707da0 8119f13d 81196239 880361708040 0001 0010 Call Trace: [] compact_zone+0x55d/0x9f0 [] ? fragmentation_index+0x19/0x70 [] kcompactd_do_work+0x10f/0x230 [] kcompactd+0x90/0x1e0 [] ? wait_woken+0xa0/0xa0 [] ? kcompactd_do_work+0x230/0x230 [] kthread+0xdd/0x100 [] ret_from_fork+0x22/0x40 [] ? kthread_create_on_node+0x180/0x180 Code: c1 fa 06 31 f6 e8 a9 9b fd ff eb 98 0f 1f 80 00 00 00 00 66 66 66 66 90 55 48 89 e5 41 57 41 56 41 55 49 89 fd 41 54 53 48 8b 07 <48> 8b 10 48 8d 78 e0 49 39 c5 4c 8d 62 e0 74 70 49 be 00 00 00 RIP [] release_freepages+0x18/0xa0 RSP CR2: ---[ end trace 855da7e142f7311f ]---
Re: [PATCH] panic: lockdep: correct lock debugging state check
On 4/26/2016 5:39 AM, Peter Zijlstra wrote: On Mon, Apr 25, 2016 at 08:36:37PM -0700, Yang Shi wrote: When kernel oops happens, lock debugging is turned off by debug_locks_off() in oops_enter() via calling __debug_locks_off() which set debug_locks to 0 via xchg(). But, calling to __debug_locks_off() to check lock debugging state in add_taint() called by oops_end() will always return false since xchg() returns the old value of debug_locks which is cleared in oops_enter() already. This prevents add_taint() from printing out lock debugging disable information although LOCKDEP_NOW_UNRELIABLE is passed to it. Check lock debugging state via !debug_locks to fix this. Although !__debug_locks_off() could do the same thing, it may look confusing. What are you smoking? This is the second completely insane patch you send this week. This breaks add_taint() and gains us nothing except trivialities. Who I apologize in advance, if I misunderstand the code and please ignore all the bullshit below. In my understanding, add_taint() should call that pr_warn if LOCKDEP_NOW_UNRELIABLE is passed and lock debugging is disabled. This is what the code tells me. LOCKDEP_NOW_UNRELIABLE is passed via lock_ok parameter, lock debugging is turned off by debug_locks_off() already, so it should print out something, but it doesn't since __debug_locks_off() always returns 0. So, it looks the if statement logic is broken. There are alternatives to fix it, I may pick up the not ideal one. bloody cares about that print if you've just had an OOPS. I do agree not too many people care about that print and such information is too trivial to draw attention from people. However, it doesn't mean oops print is a perfect place to hide something wrong. I just happened to find this by checking the oops information to try to get some clue for another issue. Then I thought it is just a quick fix, why not I should do that to make kernel better. Thanks, Yang
Re: [PATCH] panic: lockdep: correct lock debugging state check
On 4/26/2016 5:39 AM, Peter Zijlstra wrote: On Mon, Apr 25, 2016 at 08:36:37PM -0700, Yang Shi wrote: When kernel oops happens, lock debugging is turned off by debug_locks_off() in oops_enter() via calling __debug_locks_off() which set debug_locks to 0 via xchg(). But, calling to __debug_locks_off() to check lock debugging state in add_taint() called by oops_end() will always return false since xchg() returns the old value of debug_locks which is cleared in oops_enter() already. This prevents add_taint() from printing out lock debugging disable information although LOCKDEP_NOW_UNRELIABLE is passed to it. Check lock debugging state via !debug_locks to fix this. Although !__debug_locks_off() could do the same thing, it may look confusing. What are you smoking? This is the second completely insane patch you send this week. This breaks add_taint() and gains us nothing except trivialities. Who I apologize in advance, if I misunderstand the code and please ignore all the bullshit below. In my understanding, add_taint() should call that pr_warn if LOCKDEP_NOW_UNRELIABLE is passed and lock debugging is disabled. This is what the code tells me. LOCKDEP_NOW_UNRELIABLE is passed via lock_ok parameter, lock debugging is turned off by debug_locks_off() already, so it should print out something, but it doesn't since __debug_locks_off() always returns 0. So, it looks the if statement logic is broken. There are alternatives to fix it, I may pick up the not ideal one. bloody cares about that print if you've just had an OOPS. I do agree not too many people care about that print and such information is too trivial to draw attention from people. However, it doesn't mean oops print is a perfect place to hide something wrong. I just happened to find this by checking the oops information to try to get some clue for another issue. Then I thought it is just a quick fix, why not I should do that to make kernel better. Thanks, Yang
Re: [linux-next PATCH] sched: cgroup: enable interrupt before calling threadgroup_change_begin
On 4/25/2016 10:35 AM, Shi, Yang wrote: On 4/23/2016 2:14 AM, Peter Zijlstra wrote: On Fri, Apr 22, 2016 at 08:56:28PM -0700, Yang Shi wrote: When kernel oops happens in some kernel thread, i.e. kcompactd in the test, the below bug might be triggered by the oops handler: What are you trying to fix? You already oopsed the thing is wrecked. Actually, I ran into the below kernel BUG: BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] release_freepages+0x18/0xa0 PGD 0 Oops: [#1] PREEMPT SMP Modules linked in: CPU: 6 PID: 110 Comm: kcompactd0 Not tainted 4.6.0-rc4-next-20160420 #4 Hardware name: Intel Corporation S5520HC/S5520HC, BIOS S5500.86B.01.10.0025.030220091519 03/02/2009 task: 880361732680 ti: 88036173c000 task.ti: 88036173c000 RIP: 0010:[] [] release_freepages+0x18/0xa0 RSP: 0018:88036173fcf8 EFLAGS: 00010282 RAX: RBX: 88036ffde7c0 RCX: 0009 RDX: 1bf1 RSI: 000f RDI: 88036173fdd0 RBP: 88036173fd20 R08: 0007 R09: 1600 R10: 88036ffde7c0 R11: R12: R13: 88036173fdd0 R14: 88036173fdc0 R15: 88036173fdb0 FS: () GS:880363cc() knlGS: CS: 0010 DS: ES: CR0: 80050033 CR2: CR3: 02206000 CR4: 06e0 Stack: 88036ffde7c0 1a00 88036173fdc0 88036173fdb0 88036173fda0 8119f13d 81196239 880361732680 0001 0010 Call Trace: [] compact_zone+0x55d/0x9f0 [] ? fragmentation_index+0x19/0x70 [] kcompactd_do_work+0x10f/0x230 [] kcompactd+0x90/0x1e0 [] ? wait_woken+0xa0/0xa0 [] ? kcompactd_do_work+0x230/0x230 [] kthread+0xdd/0x100 [] ret_from_fork+0x22/0x40 [] ? kthread_create_on_node+0x180/0x180 Code: c1 fa 06 31 f6 e8 a9 9b fd ff eb 98 0f 1f 80 00 00 00 00 66 66 66 66 90 55 48 89 e5 41 57 41 56 41 55 49 89 fd 41 54 53 48 8b 07 <48> 8b 10 48 8d 78 e0 49 39 c5 4c 8d 62 e0 74 70 49 be 00 00 00 RIP [] release_freepages+0x18/0xa0 RSP CR2: ---[ end trace 2e96d09e0ba6342f ]--- Then the "schedule in atomic context" bug is triggered which cause the system hang. But, the system is still alive without the "schedule in atomic context" bug. 
The previous null pointer dereference issue doesn't bring the system down other than killing the kcompactd kthread. BTW, I don't have "panic on oops" set. So, the kernel doesn't panic. Thanks, Yang Thanks, Yang
Re: [linux-next PATCH] sched: cgroup: enable interrupt before calling threadgroup_change_begin
On 4/25/2016 10:35 AM, Shi, Yang wrote: On 4/23/2016 2:14 AM, Peter Zijlstra wrote: On Fri, Apr 22, 2016 at 08:56:28PM -0700, Yang Shi wrote: When kernel oops happens in some kernel thread, i.e. kcompactd in the test, the below bug might be triggered by the oops handler: What are you trying to fix? You already oopsed the thing is wrecked. Actually, I ran into the below kernel BUG: BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] release_freepages+0x18/0xa0 PGD 0 Oops: [#1] PREEMPT SMP Modules linked in: CPU: 6 PID: 110 Comm: kcompactd0 Not tainted 4.6.0-rc4-next-20160420 #4 Hardware name: Intel Corporation S5520HC/S5520HC, BIOS S5500.86B.01.10.0025.030220091519 03/02/2009 task: 880361732680 ti: 88036173c000 task.ti: 88036173c000 RIP: 0010:[] [] release_freepages+0x18/0xa0 RSP: 0018:88036173fcf8 EFLAGS: 00010282 RAX: RBX: 88036ffde7c0 RCX: 0009 RDX: 1bf1 RSI: 000f RDI: 88036173fdd0 RBP: 88036173fd20 R08: 0007 R09: 1600 R10: 88036ffde7c0 R11: R12: R13: 88036173fdd0 R14: 88036173fdc0 R15: 88036173fdb0 FS: () GS:880363cc() knlGS: CS: 0010 DS: ES: CR0: 80050033 CR2: CR3: 02206000 CR4: 06e0 Stack: 88036ffde7c0 1a00 88036173fdc0 88036173fdb0 88036173fda0 8119f13d 81196239 880361732680 0001 0010 Call Trace: [] compact_zone+0x55d/0x9f0 [] ? fragmentation_index+0x19/0x70 [] kcompactd_do_work+0x10f/0x230 [] kcompactd+0x90/0x1e0 [] ? wait_woken+0xa0/0xa0 [] ? kcompactd_do_work+0x230/0x230 [] kthread+0xdd/0x100 [] ret_from_fork+0x22/0x40 [] ? kthread_create_on_node+0x180/0x180 Code: c1 fa 06 31 f6 e8 a9 9b fd ff eb 98 0f 1f 80 00 00 00 00 66 66 66 66 90 55 48 89 e5 41 57 41 56 41 55 49 89 fd 41 54 53 48 8b 07 <48> 8b 10 48 8d 78 e0 49 39 c5 4c 8d 62 e0 74 70 49 be 00 00 00 RIP [] release_freepages+0x18/0xa0 RSP CR2: ---[ end trace 2e96d09e0ba6342f ]--- Then the "schedule in atomic context" bug is triggered which cause the system hang. But, the system is still alive without the "schedule in atomic context" bug. 
The previous null pointer dereference issue doesn't bring the system down other than killing the kcompactd kthread. BTW, I don't have "panic on oops" set. So, the kernel doesn't panic. Thanks, Yang Thanks, Yang
Re: [linux-next PATCH] sched: cgroup: enable interrupt before calling threadgroup_change_begin
On 4/23/2016 2:14 AM, Peter Zijlstra wrote: On Fri, Apr 22, 2016 at 08:56:28PM -0700, Yang Shi wrote: When kernel oops happens in some kernel thread, i.e. kcompactd in the test, the below bug might be triggered by the oops handler: What are you trying to fix? You already oopsed the thing is wrecked. Actually, I ran into the below kernel BUG: BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] release_freepages+0x18/0xa0 PGD 0 Oops: [#1] PREEMPT SMP Modules linked in: CPU: 6 PID: 110 Comm: kcompactd0 Not tainted 4.6.0-rc4-next-20160420 #4 Hardware name: Intel Corporation S5520HC/S5520HC, BIOS S5500.86B.01.10.0025.030220091519 03/02/2009 task: 880361732680 ti: 88036173c000 task.ti: 88036173c000 RIP: 0010:[] [] release_freepages+0x18/0xa0 RSP: 0018:88036173fcf8 EFLAGS: 00010282 RAX: RBX: 88036ffde7c0 RCX: 0009 RDX: 1bf1 RSI: 000f RDI: 88036173fdd0 RBP: 88036173fd20 R08: 0007 R09: 1600 R10: 88036ffde7c0 R11: R12: R13: 88036173fdd0 R14: 88036173fdc0 R15: 88036173fdb0 FS: () GS:880363cc() knlGS: CS: 0010 DS: ES: CR0: 80050033 CR2: CR3: 02206000 CR4: 06e0 Stack: 88036ffde7c0 1a00 88036173fdc0 88036173fdb0 88036173fda0 8119f13d 81196239 880361732680 0001 0010 Call Trace: [] compact_zone+0x55d/0x9f0 [] ? fragmentation_index+0x19/0x70 [] kcompactd_do_work+0x10f/0x230 [] kcompactd+0x90/0x1e0 [] ? wait_woken+0xa0/0xa0 [] ? kcompactd_do_work+0x230/0x230 [] kthread+0xdd/0x100 [] ret_from_fork+0x22/0x40 [] ? kthread_create_on_node+0x180/0x180 Code: c1 fa 06 31 f6 e8 a9 9b fd ff eb 98 0f 1f 80 00 00 00 00 66 66 66 66 90 55 48 89 e5 41 57 41 56 41 55 49 89 fd 41 54 53 48 8b 07 <48> 8b 10 48 8d 78 e0 49 39 c5 4c 8d 62 e0 74 70 49 be 00 00 00 RIP [] release_freepages+0x18/0xa0 RSP CR2: ---[ end trace 2e96d09e0ba6342f ]--- Then the "schedule in atomic context" bug is triggered which cause the system hang. But, the system is still alive without the "schedule in atomic context" bug. 
The previous null pointer dereference issue doesn't bring the system down other than killing the kcompactd kthread. Thanks, Yang
Re: [linux-next PATCH] sched: cgroup: enable interrupt before calling threadgroup_change_begin
On 4/23/2016 2:14 AM, Peter Zijlstra wrote: On Fri, Apr 22, 2016 at 08:56:28PM -0700, Yang Shi wrote: When kernel oops happens in some kernel thread, i.e. kcompactd in the test, the below bug might be triggered by the oops handler: What are you trying to fix? You already oopsed the thing is wrecked. Actually, I ran into the below kernel BUG: BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] release_freepages+0x18/0xa0 PGD 0 Oops: [#1] PREEMPT SMP Modules linked in: CPU: 6 PID: 110 Comm: kcompactd0 Not tainted 4.6.0-rc4-next-20160420 #4 Hardware name: Intel Corporation S5520HC/S5520HC, BIOS S5500.86B.01.10.0025.030220091519 03/02/2009 task: 880361732680 ti: 88036173c000 task.ti: 88036173c000 RIP: 0010:[] [] release_freepages+0x18/0xa0 RSP: 0018:88036173fcf8 EFLAGS: 00010282 RAX: RBX: 88036ffde7c0 RCX: 0009 RDX: 1bf1 RSI: 000f RDI: 88036173fdd0 RBP: 88036173fd20 R08: 0007 R09: 1600 R10: 88036ffde7c0 R11: R12: R13: 88036173fdd0 R14: 88036173fdc0 R15: 88036173fdb0 FS: () GS:880363cc() knlGS: CS: 0010 DS: ES: CR0: 80050033 CR2: CR3: 02206000 CR4: 06e0 Stack: 88036ffde7c0 1a00 88036173fdc0 88036173fdb0 88036173fda0 8119f13d 81196239 880361732680 0001 0010 Call Trace: [] compact_zone+0x55d/0x9f0 [] ? fragmentation_index+0x19/0x70 [] kcompactd_do_work+0x10f/0x230 [] kcompactd+0x90/0x1e0 [] ? wait_woken+0xa0/0xa0 [] ? kcompactd_do_work+0x230/0x230 [] kthread+0xdd/0x100 [] ret_from_fork+0x22/0x40 [] ? kthread_create_on_node+0x180/0x180 Code: c1 fa 06 31 f6 e8 a9 9b fd ff eb 98 0f 1f 80 00 00 00 00 66 66 66 66 90 55 48 89 e5 41 57 41 56 41 55 49 89 fd 41 54 53 48 8b 07 <48> 8b 10 48 8d 78 e0 49 39 c5 4c 8d 62 e0 74 70 49 be 00 00 00 RIP [] release_freepages+0x18/0xa0 RSP CR2: ---[ end trace 2e96d09e0ba6342f ]--- Then the "schedule in atomic context" bug is triggered which cause the system hang. But, the system is still alive without the "schedule in atomic context" bug. 
The previous null pointer dereference issue doesn't bring the system down other than killing the kcompactd kthread. Thanks, Yang
Re: Warning triggered by lockdep checks for sock_owned_by_user on linux-next-20160420
On 4/22/2016 9:50 PM, Eric Dumazet wrote: On Fri, 2016-04-22 at 21:02 -0700, Shi, Yang wrote: Hi David, When I ran some test on a nfs mounted rootfs, I got the below warning with LOCKDEP enabled on linux-next-20160420: WARNING: CPU: 9 PID: 0 at include/net/sock.h:1408 udp_queue_rcv_skb+0x3d0/0x660 Modules linked in: CPU: 9 PID: 0 Comm: swapper/9 Tainted: G D 4.6.0-rc4-next-20160420-WR7.0.0.0_standard+ #6 Hardware name: Intel Corporation S5520HC/S5520HC, BIOS S5500.86B.01.10.0025.030220091519 03/02/2009 88066fd03a70 8155855f 88066fd03ab0 81062803 058061318ec8 88065d1e39c0 880661318e40 880661318ec8 Call Trace: [] dump_stack+0x67/0x98 Checking out fil [] __warn+0xd3/0xf0 [] warn_slowpath_null+0x1d/0x20 [] udp_queue_rcv_skb+0x3d0/0x660 [] __udp4_lib_rcv+0x4dc/0xc00 [] udp_rcv+0x1a/0x20 [] ip_local_deliver_finish+0xd1/0x2e0 es: 57% (30585/ [] ? ip_local_deliver_finish+0x3f/0x2e0 [] ip_local_deliver+0xc2/0xd0 [] ip_rcv_finish+0x1e2/0x5a0 [] ip_rcv+0x2dc/0x410 [] ? __pskb_pull_tail+0x82/0x400 [] __netif_receive_skb_core+0x3a8/0xa80 [] ? netif_receive_skb_internal+0x1b/0xf0 [] __netif_receive_skb+0x1d/0x60 [] netif_receive_skb_internal+0x55/0xf0 [] ? netif_receive_skb_internal+0x1b/0xf0 [] napi_gro_receive+0xc2/0x180 [] igb_poll+0x5ea/0xdf0 [] net_rx_action+0x15c/0x3d0 [] __do_softirq+0x161/0x413 [] irq_exit+0xd1/0x110 [] do_IRQ+0x62/0xf0 [] common_interrupt+0x8e/0x8e [] ? cpuidle_enter_state+0xc6/0x290 [] cpuidle_enter+0x17/0x20 [] call_cpuidle+0x33/0x50 [] cpu_startup_entry+0x229/0x3b0 [] start_secondary+0x144/0x150 ---[ end trace ba508c424f0d52bf ]--- The warning is triggered by commit fafc4e1ea1a4c1eb13a30c9426fb799f5efacbc3 ("sock: tigthen lockdep checks for sock_owned_by_user"), which checks if slock is held before locking "owned". It looks good to lock_sock which is just called lock_sock_nested. But, bh_lock_sock is different, which just calls spin_lock so it doesn't touch dep_map then the check will fail even though it is locked. ?? 
spin_lock() definitely is lockdep friendly. Yes, this is what I thought too. But, I didn't figure out why the warning was still reported even though spin_lock is called. So, I'm wondering what a right fix for it should be: 1. Replace bh_lock_sock with bh_lock_sock_nested in the protocols implementation, but there are a lot of places calling it. 2. Just like lock_sock, just call bh_lock_sock_nested instead of spin_lock. Or are both approaches wrong or not ideal? I sent a patch yesterday, I am not sure what the status is. Thanks for the patch. I just found your original patch and the discussion with Valdis. I think I ran into the same problem. There is a kernel BUG triggered before the warning, but "lockdep is off" information is not printed out, although it is really off. Just tried your patch, it works for me. Thanks, Yang diff --git a/include/net/sock.h b/include/net/sock.h index d997ec13a643..db8301c76d50 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1350,7 +1350,8 @@ static inline bool lockdep_sock_is_held(const struct sock *csk) { struct sock *sk = (struct sock *)csk; - return lockdep_is_held(&sk->sk_lock) || + return !debug_locks || + lockdep_is_held(&sk->sk_lock) || lockdep_is_held(&sk->sk_lock.slock); } #endif
Re: Warning triggered by lockdep checks for sock_owned_by_user on linux-next-20160420
On 4/22/2016 9:50 PM, Eric Dumazet wrote: On Fri, 2016-04-22 at 21:02 -0700, Shi, Yang wrote: Hi David, When I ran some test on a nfs mounted rootfs, I got the below warning with LOCKDEP enabled on linux-next-20160420: WARNING: CPU: 9 PID: 0 at include/net/sock.h:1408 udp_queue_rcv_skb+0x3d0/0x660 Modules linked in: CPU: 9 PID: 0 Comm: swapper/9 Tainted: G D 4.6.0-rc4-next-20160420-WR7.0.0.0_standard+ #6 Hardware name: Intel Corporation S5520HC/S5520HC, BIOS S5500.86B.01.10.0025.030220091519 03/02/2009 88066fd03a70 8155855f 88066fd03ab0 81062803 058061318ec8 88065d1e39c0 880661318e40 880661318ec8 Call Trace: [] dump_stack+0x67/0x98 Checking out fil [] __warn+0xd3/0xf0 [] warn_slowpath_null+0x1d/0x20 [] udp_queue_rcv_skb+0x3d0/0x660 [] __udp4_lib_rcv+0x4dc/0xc00 [] udp_rcv+0x1a/0x20 [] ip_local_deliver_finish+0xd1/0x2e0 es: 57% (30585/ [] ? ip_local_deliver_finish+0x3f/0x2e0 [] ip_local_deliver+0xc2/0xd0 [] ip_rcv_finish+0x1e2/0x5a0 [] ip_rcv+0x2dc/0x410 [] ? __pskb_pull_tail+0x82/0x400 [] __netif_receive_skb_core+0x3a8/0xa80 [] ? netif_receive_skb_internal+0x1b/0xf0 [] __netif_receive_skb+0x1d/0x60 [] netif_receive_skb_internal+0x55/0xf0 [] ? netif_receive_skb_internal+0x1b/0xf0 [] napi_gro_receive+0xc2/0x180 [] igb_poll+0x5ea/0xdf0 [] net_rx_action+0x15c/0x3d0 [] __do_softirq+0x161/0x413 [] irq_exit+0xd1/0x110 [] do_IRQ+0x62/0xf0 [] common_interrupt+0x8e/0x8e [] ? cpuidle_enter_state+0xc6/0x290 [] cpuidle_enter+0x17/0x20 [] call_cpuidle+0x33/0x50 [] cpu_startup_entry+0x229/0x3b0 [] start_secondary+0x144/0x150 ---[ end trace ba508c424f0d52bf ]--- The warning is triggered by commit fafc4e1ea1a4c1eb13a30c9426fb799f5efacbc3 ("sock: tigthen lockdep checks for sock_owned_by_user"), which checks if slock is held before locking "owned". It looks good to lock_sock which is just called lock_sock_nested. But, bh_lock_sock is different, which just calls spin_lock so it doesn't touch dep_map then the check will fail even though it is locked. ?? 
spin_lock() definitely is lockdep friendly. Yes, this is what I thought too. But, I didn't figure out why the warning was still reported even though spin_lock is called. So, I'm wondering what a right fix for it should be: 1. Replace bh_lock_sock with bh_lock_sock_nested in the protocols implementation, but there are a lot of places calling it. 2. Just like lock_sock, just call bh_lock_sock_nested instead of spin_lock. Or are both approaches wrong or not ideal? I sent a patch yesterday, I am not sure what the status is. Thanks for the patch. I just found your original patch and the discussion with Valdis. I think I ran into the same problem. There is a kernel BUG triggered before the warning, but "lockdep is off" information is not printed out, although it is really off. Just tried your patch, it works for me. Thanks, Yang diff --git a/include/net/sock.h b/include/net/sock.h index d997ec13a643..db8301c76d50 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1350,7 +1350,8 @@ static inline bool lockdep_sock_is_held(const struct sock *csk) { struct sock *sk = (struct sock *)csk; - return lockdep_is_held(&sk->sk_lock) || + return !debug_locks || + lockdep_is_held(&sk->sk_lock) || lockdep_is_held(&sk->sk_lock.slock); } #endif
Warning triggered by lockdep checks for sock_owned_by_user on linux-next-20160420
Hi David, When I ran some test on a nfs mounted rootfs, I got the below warning with LOCKDEP enabled on linux-next-20160420: WARNING: CPU: 9 PID: 0 at include/net/sock.h:1408 udp_queue_rcv_skb+0x3d0/0x660 Modules linked in: CPU: 9 PID: 0 Comm: swapper/9 Tainted: G D 4.6.0-rc4-next-20160420-WR7.0.0.0_standard+ #6 Hardware name: Intel Corporation S5520HC/S5520HC, BIOS S5500.86B.01.10.0025.030220091519 03/02/2009 88066fd03a70 8155855f 88066fd03ab0 81062803 058061318ec8 88065d1e39c0 880661318e40 880661318ec8 Call Trace: [] dump_stack+0x67/0x98 Checking out fil [] __warn+0xd3/0xf0 [] warn_slowpath_null+0x1d/0x20 [] udp_queue_rcv_skb+0x3d0/0x660 [] __udp4_lib_rcv+0x4dc/0xc00 [] udp_rcv+0x1a/0x20 [] ip_local_deliver_finish+0xd1/0x2e0 es: 57% (30585/ [] ? ip_local_deliver_finish+0x3f/0x2e0 [] ip_local_deliver+0xc2/0xd0 [] ip_rcv_finish+0x1e2/0x5a0 [] ip_rcv+0x2dc/0x410 [] ? __pskb_pull_tail+0x82/0x400 [] __netif_receive_skb_core+0x3a8/0xa80 [] ? netif_receive_skb_internal+0x1b/0xf0 [] __netif_receive_skb+0x1d/0x60 [] netif_receive_skb_internal+0x55/0xf0 [] ? netif_receive_skb_internal+0x1b/0xf0 [] napi_gro_receive+0xc2/0x180 [] igb_poll+0x5ea/0xdf0 [] net_rx_action+0x15c/0x3d0 [] __do_softirq+0x161/0x413 [] irq_exit+0xd1/0x110 [] do_IRQ+0x62/0xf0 [] common_interrupt+0x8e/0x8e [] ? cpuidle_enter_state+0xc6/0x290 [] cpuidle_enter+0x17/0x20 [] call_cpuidle+0x33/0x50 [] cpu_startup_entry+0x229/0x3b0 [] start_secondary+0x144/0x150 ---[ end trace ba508c424f0d52bf ]--- The warning is triggered by commit fafc4e1ea1a4c1eb13a30c9426fb799f5efacbc3 ("sock: tigthen lockdep checks for sock_owned_by_user"), which checks if slock is held before locking "owned". It looks good to lock_sock which is just called lock_sock_nested. But, bh_lock_sock is different, which just calls spin_lock so it doesn't touch dep_map then the check will fail even though it is locked. So, I'm wondering what a right fix for it should be: 1. 
Replace bh_lock_sock with bh_lock_sock_nested in the protocols implementation, but there are a lot of places calling it. 2. Just like lock_sock, just call bh_lock_sock_nested instead of spin_lock. Or are both approaches wrong or not ideal? Thanks, Yang
Warning triggered by lockdep checks for sock_owned_by_user on linux-next-20160420
Hi David, When I ran some test on a nfs mounted rootfs, I got the below warning with LOCKDEP enabled on linux-next-20160420: WARNING: CPU: 9 PID: 0 at include/net/sock.h:1408 udp_queue_rcv_skb+0x3d0/0x660 Modules linked in: CPU: 9 PID: 0 Comm: swapper/9 Tainted: G D 4.6.0-rc4-next-20160420-WR7.0.0.0_standard+ #6 Hardware name: Intel Corporation S5520HC/S5520HC, BIOS S5500.86B.01.10.0025.030220091519 03/02/2009 88066fd03a70 8155855f 88066fd03ab0 81062803 058061318ec8 88065d1e39c0 880661318e40 880661318ec8 Call Trace: [] dump_stack+0x67/0x98 Checking out fil [] __warn+0xd3/0xf0 [] warn_slowpath_null+0x1d/0x20 [] udp_queue_rcv_skb+0x3d0/0x660 [] __udp4_lib_rcv+0x4dc/0xc00 [] udp_rcv+0x1a/0x20 [] ip_local_deliver_finish+0xd1/0x2e0 es: 57% (30585/ [] ? ip_local_deliver_finish+0x3f/0x2e0 [] ip_local_deliver+0xc2/0xd0 [] ip_rcv_finish+0x1e2/0x5a0 [] ip_rcv+0x2dc/0x410 [] ? __pskb_pull_tail+0x82/0x400 [] __netif_receive_skb_core+0x3a8/0xa80 [] ? netif_receive_skb_internal+0x1b/0xf0 [] __netif_receive_skb+0x1d/0x60 [] netif_receive_skb_internal+0x55/0xf0 [] ? netif_receive_skb_internal+0x1b/0xf0 [] napi_gro_receive+0xc2/0x180 [] igb_poll+0x5ea/0xdf0 [] net_rx_action+0x15c/0x3d0 [] __do_softirq+0x161/0x413 [] irq_exit+0xd1/0x110 [] do_IRQ+0x62/0xf0 [] common_interrupt+0x8e/0x8e [] ? cpuidle_enter_state+0xc6/0x290 [] cpuidle_enter+0x17/0x20 [] call_cpuidle+0x33/0x50 [] cpu_startup_entry+0x229/0x3b0 [] start_secondary+0x144/0x150 ---[ end trace ba508c424f0d52bf ]--- The warning is triggered by commit fafc4e1ea1a4c1eb13a30c9426fb799f5efacbc3 ("sock: tigthen lockdep checks for sock_owned_by_user"), which checks if slock is held before locking "owned". It looks good to lock_sock which is just called lock_sock_nested. But, bh_lock_sock is different, which just calls spin_lock so it doesn't touch dep_map then the check will fail even though it is locked. So, I'm wondering what a right fix for it should be: 1. 
Replace bh_lock_sock with bh_lock_sock_nested in the protocols implementation, but there are a lot of places calling it. 2. Just like lock_sock, just call bh_lock_sock_nested instead of spin_lock. Or are both approaches wrong or not ideal? Thanks, Yang
[BUG linux-next] kernel NULL pointer dereference on linux-next-20160420
Hi folks, I did the below test with huge tmpfs on linux-next-20160420: # mount -t tmpfs huge=1 tmpfs /mnt # cd /mnt Then clone linux kernel Then I got the below bug, such test works well on non-huge tmpfs. BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] release_freepages+0x18/0xa0 PGD 0 Oops: [#1] PREEMPT SMP Modules linked in: CPU: 6 PID: 110 Comm: kcompactd0 Not tainted 4.6.0-rc4-next-20160420-WR7.0.0.0_standard #4 Hardware name: Intel Corporation S5520HC/S5520HC, BIOS S5500.86B.01.10.0025.030220091519 03/02/2009 task: 880361708040 ti: 880361704000 task.ti: 880361704000 RIP: 0010:[] [] release_freepages+0x18/0xa0 RSP: 0018:880361707cf8 EFLAGS: 00010282 RAX: RBX: 88036ffde7c0 RCX: 0009 RDX: 1bf1 RSI: 000f RDI: 880361707dd0 RBP: 880361707d20 R08: 0007 R09: 1600 R10: 88036ffde7c0 R11: R12: R13: 880361707dd0 R14: 880361707dc0 R15: 880361707db0 FS: () GS:880363cc() knlGS: CS: 0010 DS: ES: CR0: 80050033 CR2: CR3: 02206000 CR4: 06e0 Stack: 88036ffde7c0 1a00 880361707dc0 880361707db0 880361707da0 8119f13d 81196239 880361708040 0001 0010 Call Trace: [] compact_zone+0x55d/0x9f0 [] ? fragmentation_index+0x19/0x70 [] kcompactd_do_work+0x10f/0x230 [] kcompactd+0x90/0x1e0 [] ? wait_woken+0xa0/0xa0 [] ? kcompactd_do_work+0x230/0x230 [] kthread+0xdd/0x100 [] ret_from_fork+0x22/0x40 [] ? kthread_create_on_node+0x180/0x180 Code: c1 fa 06 31 f6 e8 a9 9b fd ff eb 98 0f 1f 80 00 00 00 00 66 66 66 66 90 55 48 89 e5 41 57 41 56 41 55 49 89 fd 41 54 53 48 8b 07 <48> 8b 10 48 8d 78 e0 49 39 c5 4c 8d 62 e0 74 70 49 be 00 00 00 RIP [] release_freepages+0x18/0xa0 RSP CR2: ---[ end trace 855da7e142f7311f ]---
[BUG linux-next] kernel NULL pointer dereference on linux-next-20160420
Hi folks, I did the below test with huge tmpfs on linux-next-20160420: # mount -t tmpfs huge=1 tmpfs /mnt # cd /mnt Then clone linux kernel Then I got the below bug, such test works well on non-huge tmpfs. BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] release_freepages+0x18/0xa0 PGD 0 Oops: [#1] PREEMPT SMP Modules linked in: CPU: 6 PID: 110 Comm: kcompactd0 Not tainted 4.6.0-rc4-next-20160420-WR7.0.0.0_standard #4 Hardware name: Intel Corporation S5520HC/S5520HC, BIOS S5500.86B.01.10.0025.030220091519 03/02/2009 task: 880361708040 ti: 880361704000 task.ti: 880361704000 RIP: 0010:[] [] release_freepages+0x18/0xa0 RSP: 0018:880361707cf8 EFLAGS: 00010282 RAX: RBX: 88036ffde7c0 RCX: 0009 RDX: 1bf1 RSI: 000f RDI: 880361707dd0 RBP: 880361707d20 R08: 0007 R09: 1600 R10: 88036ffde7c0 R11: R12: R13: 880361707dd0 R14: 880361707dc0 R15: 880361707db0 FS: () GS:880363cc() knlGS: CS: 0010 DS: ES: CR0: 80050033 CR2: CR3: 02206000 CR4: 06e0 Stack: 88036ffde7c0 1a00 880361707dc0 880361707db0 880361707da0 8119f13d 81196239 880361708040 0001 0010 Call Trace: [] compact_zone+0x55d/0x9f0 [] ? fragmentation_index+0x19/0x70 [] kcompactd_do_work+0x10f/0x230 [] kcompactd+0x90/0x1e0 [] ? wait_woken+0xa0/0xa0 [] ? kcompactd_do_work+0x230/0x230 [] kthread+0xdd/0x100 [] ret_from_fork+0x22/0x40 [] ? kthread_create_on_node+0x180/0x180 Code: c1 fa 06 31 f6 e8 a9 9b fd ff eb 98 0f 1f 80 00 00 00 00 66 66 66 66 90 55 48 89 e5 41 57 41 56 41 55 49 89 fd 41 54 53 48 8b 07 <48> 8b 10 48 8d 78 e0 49 39 c5 4c 8d 62 e0 74 70 49 be 00 00 00 RIP [] release_freepages+0x18/0xa0 RSP CR2: ---[ end trace 855da7e142f7311f ]---
Re: [PATCH] mm: move huge_pmd_set_accessed out of huge_memory.c
On 4/21/2016 2:15 AM, Hugh Dickins wrote: On Wed, 20 Apr 2016, Shi, Yang wrote: Hi folks, I didn't realize pmd_* functions are protected by CONFIG_TRANSPARENT_HUGEPAGE on the most architectures before I made this change. Before I fix all the affected architectures code, I want to check if you guys think this change is worth or not? Thanks for asking: no, it is not worthwhile. I would much prefer not to have to consider these trivial cleanups in the huge memory area at this time. Kirill and I have urgent work to do in this area, and coping with patch conflicts between different versions of the source will not help any of us. Thanks for your suggestion. I would consider to put such cleanup work on the back burner. Yang Thanks, Hugh Thanks, Yang On 4/20/2016 11:24 AM, Yang Shi wrote: huge_pmd_set_accessed is only called by __handle_mm_fault from memory.c, move the definition to memory.c and make it static like create_huge_pmd and wp_huge_pmd. Signed-off-by: Yang Shi <yang@linaro.org> --- include/linux/huge_mm.h | 4 mm/huge_memory.c| 23 --- mm/memory.c | 23 +++ 3 files changed, 23 insertions(+), 27 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 7008623..c218ab7b 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -8,10 +8,6 @@ extern int do_huge_pmd_anonymous_page(struct mm_struct *mm, extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, struct vm_area_struct *vma); -extern void huge_pmd_set_accessed(struct mm_struct *mm, - struct vm_area_struct *vma, - unsigned long address, pmd_t *pmd, - pmd_t orig_pmd, int dirty); extern int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, pmd_t orig_pmd); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index fecbbc5..6c14cb6 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1137,29 +1137,6 @@ out: return ret; } -void 
huge_pmd_set_accessed(struct mm_struct *mm, - struct vm_area_struct *vma, - unsigned long address, - pmd_t *pmd, pmd_t orig_pmd, - int dirty) -{ - spinlock_t *ptl; - pmd_t entry; - unsigned long haddr; - - ptl = pmd_lock(mm, pmd); - if (unlikely(!pmd_same(*pmd, orig_pmd))) - goto unlock; - - entry = pmd_mkyoung(orig_pmd); - haddr = address & HPAGE_PMD_MASK; - if (pmdp_set_access_flags(vma, haddr, pmd, entry, dirty)) - update_mmu_cache_pmd(vma, address, pmd); - -unlock: - spin_unlock(ptl); -} - static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, diff --git a/mm/memory.c b/mm/memory.c index 93897f2..6ced4eb 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3287,6 +3287,29 @@ static int wp_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma, return VM_FAULT_FALLBACK; } +static void huge_pmd_set_accessed(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long address, + pmd_t *pmd, pmd_t orig_pmd, + int dirty) +{ + spinlock_t *ptl; + pmd_t entry; + unsigned long haddr; + + ptl = pmd_lock(mm, pmd); + if (unlikely(!pmd_same(*pmd, orig_pmd))) + goto unlock; + + entry = pmd_mkyoung(orig_pmd); + haddr = address & HPAGE_PMD_MASK; + if (pmdp_set_access_flags(vma, haddr, pmd, entry, dirty)) + update_mmu_cache_pmd(vma, address, pmd); + +unlock: + spin_unlock(ptl); +} + /* * These routines also need to handle stuff like marking pages dirty * and/or accessed for architectures that don't do it in hardware (most
Re: [PATCH] mm: move huge_pmd_set_accessed out of huge_memory.c
On 4/21/2016 2:15 AM, Hugh Dickins wrote: On Wed, 20 Apr 2016, Shi, Yang wrote: Hi folks, I didn't realize pmd_* functions are protected by CONFIG_TRANSPARENT_HUGEPAGE on the most architectures before I made this change. Before I fix all the affected architectures code, I want to check if you guys think this change is worth or not? Thanks for asking: no, it is not worthwhile. I would much prefer not to have to consider these trivial cleanups in the huge memory area at this time. Kirill and I have urgent work to do in this area, and coping with patch conflicts between different versions of the source will not help any of us. Thanks for your suggestion. I would consider to put such cleanup work on the back burner. Yang Thanks, Hugh Thanks, Yang On 4/20/2016 11:24 AM, Yang Shi wrote: huge_pmd_set_accessed is only called by __handle_mm_fault from memory.c, move the definition to memory.c and make it static like create_huge_pmd and wp_huge_pmd. Signed-off-by: Yang Shi --- include/linux/huge_mm.h | 4 mm/huge_memory.c| 23 --- mm/memory.c | 23 +++ 3 files changed, 23 insertions(+), 27 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 7008623..c218ab7b 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -8,10 +8,6 @@ extern int do_huge_pmd_anonymous_page(struct mm_struct *mm, extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, struct vm_area_struct *vma); -extern void huge_pmd_set_accessed(struct mm_struct *mm, - struct vm_area_struct *vma, - unsigned long address, pmd_t *pmd, - pmd_t orig_pmd, int dirty); extern int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, pmd_t orig_pmd); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index fecbbc5..6c14cb6 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1137,29 +1137,6 @@ out: return ret; } -void huge_pmd_set_accessed(struct mm_struct *mm, 
- struct vm_area_struct *vma, - unsigned long address, - pmd_t *pmd, pmd_t orig_pmd, - int dirty) -{ - spinlock_t *ptl; - pmd_t entry; - unsigned long haddr; - - ptl = pmd_lock(mm, pmd); - if (unlikely(!pmd_same(*pmd, orig_pmd))) - goto unlock; - - entry = pmd_mkyoung(orig_pmd); - haddr = address & HPAGE_PMD_MASK; - if (pmdp_set_access_flags(vma, haddr, pmd, entry, dirty)) - update_mmu_cache_pmd(vma, address, pmd); - -unlock: - spin_unlock(ptl); -} - static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, diff --git a/mm/memory.c b/mm/memory.c index 93897f2..6ced4eb 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3287,6 +3287,29 @@ static int wp_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma, return VM_FAULT_FALLBACK; } +static void huge_pmd_set_accessed(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long address, + pmd_t *pmd, pmd_t orig_pmd, + int dirty) +{ + spinlock_t *ptl; + pmd_t entry; + unsigned long haddr; + + ptl = pmd_lock(mm, pmd); + if (unlikely(!pmd_same(*pmd, orig_pmd))) + goto unlock; + + entry = pmd_mkyoung(orig_pmd); + haddr = address & HPAGE_PMD_MASK; + if (pmdp_set_access_flags(vma, haddr, pmd, entry, dirty)) + update_mmu_cache_pmd(vma, address, pmd); + +unlock: + spin_unlock(ptl); +} + /* * These routines also need to handle stuff like marking pages dirty * and/or accessed for architectures that don't do it in hardware (most
Re: [PATCH] mm: move huge_pmd_set_accessed out of huge_memory.c
On 4/21/2016 12:30 AM, Kirill A. Shutemov wrote: On Wed, Apr 20, 2016 at 02:00:11PM -0700, Shi, Yang wrote: Hi folks, I didn't realize pmd_* functions are protected by CONFIG_TRANSPARENT_HUGEPAGE on the most architectures before I made this change. Before I fix all the affected architectures code, I want to check if you guys think this change is worth or not? Thanks, Yang On 4/20/2016 11:24 AM, Yang Shi wrote: huge_pmd_set_accessed is only called by __handle_mm_fault from memory.c, move the definition to memory.c and make it static like create_huge_pmd and wp_huge_pmd. Signed-off-by: Yang Shi <yang@linaro.org> On pte side we have the same functionality open-coded. Should we do the same for pmd? Or change pte side the same way? Sorry, I don't quite understand you. Do you mean pte_* functions? Thanks, Yang
Re: [PATCH] mm: move huge_pmd_set_accessed out of huge_memory.c
On 4/21/2016 12:30 AM, Kirill A. Shutemov wrote: On Wed, Apr 20, 2016 at 02:00:11PM -0700, Shi, Yang wrote: Hi folks, I didn't realize pmd_* functions are protected by CONFIG_TRANSPARENT_HUGEPAGE on the most architectures before I made this change. Before I fix all the affected architectures code, I want to check if you guys think this change is worth or not? Thanks, Yang On 4/20/2016 11:24 AM, Yang Shi wrote: huge_pmd_set_accessed is only called by __handle_mm_fault from memory.c, move the definition to memory.c and make it static like create_huge_pmd and wp_huge_pmd. Signed-off-by: Yang Shi On pte side we have the same functionality open-coded. Should we do the same for pmd? Or change pte side the same way? Sorry, I don't quite understand you. Do you mean pte_* functions? Thanks, Yang
[BUG] set_pte_at: racy dirty state clearing warning
Hi Will and Catalin, When I enable memory compaction via # echo 1 > /proc/sys/vm/compact_memory I got the below WARNING: set_pte_at: racy dirty state clearing: 0x006899371bd3 -> 0x006899371fd3 [ cut here ] WARNING: CPU: 5 PID: 294 at ./arch/arm64/include/asm/pgtable.h:227 ptep_set_access_flags+0x138/0x1b8 Modules linked in: CPU: 5 PID: 294 Comm: systemd-journal Not tainted 4.6.0-rc3-next-20160414 #13 Hardware name: Freescale Layerscape 2085a RDB Board (DT) task: 80001e4f8080 ti: 80001e8b4000 task.ti: 80001e8b4000 PC is at ptep_set_access_flags+0x138/0x1b8 LR is at ptep_set_access_flags+0x138/0x1b8 pc : [] lr : [] pstate: 2145 sp : 80001e8b7bc0 x29: 80001e8b7bc0 x28: 80001e843ac8 x27: 0040 x26: 80001e9ae0d8 x25: 29901000 x24: 80001f32a938 x23: 0001 x22: 80001e9ae088 x21: 7eb59000 x20: 80001e843ac8 x19: 006899371fd3 x18: fe09 x17: 7ea48c88 x16: cb5afb20 x15: 003b9aca x14: 307830203e2d2033 x13: 6462313733393930 x12: 3030303836303078 x11: 30203a676e697261 x10: 656c632065746174 x9 : 7320797472696420 x8 : 79636172203a7461 x7 : 5f6574705f746573 x6 : 28300ab8 x5 : 0003 x4 : x3 : 0003 x2 : 13d16f64 x1 : dfff2000 x0 : 004f ---[ end trace d75cd9bb88364c80 ]--- Call trace: Exception stack(0x80001e8b79a0 to 0x80001e8b7ac0) 79a0: 006899371fd3 80001e843ac8 80001e8b7bc0 28497b70 79c0: 2145 003d 29901000 28301558 79e0: 41b58ab3 296870d0 28200668 292b0e40 7a00: 0001 80001f32a938 29901000 80001e9ae0d8 7a20: 0040 80001e843ac8 29901000 80001e9ae0d8 7a40: 2a9dcf60 0a6d9320 2000 7a60: 80001e8b7bc0 80001e8b7bc0 80001e8b7b80 ffc8 7a80: 80001e8b7ad0 28415418 80001e8b4000 13d16f64 7aa0: 004f dfff2000 13d16f64 0003 [] ptep_set_access_flags+0x138/0x1b8 [] handle_mm_fault+0xa24/0xfa0 [] do_page_fault+0x3d4/0x4c0 [] do_mem_abort+0xac/0x140 My kernel has ARM64_HW_AFDBM enabled, but LS2085 is not ARMv8.1. The code shows it just checks if ARM64_HW_AFDBM is enabled or not, but doesn't check if the CPU really has such capability. So, it might be better to have the capability checked at runtime? Thanks, Yang
[BUG] set_pte_at: racy dirty state clearing warning
Hi Will and Catalin, When I enable memory comact via # echo 1 > /proc/sys/vm/compact_memory I got the below WARNING: set_pte_at: racy dirty state clearing: 0x006899371bd3 -> 0x006899371fd3 [ cut here ] WARNING: CPU: 5 PID: 294 at ./arch/arm64/include/asm/pgtable.h:227 ptep_set_access_flags+0x138/0x1b8 Modules linked in: CPU: 5 PID: 294 Comm: systemd-journal Not tainted 4.6.0-rc3-next-20160414 #13 Hardware name: Freescale Layerscape 2085a RDB Board (DT) task: 80001e4f8080 ti: 80001e8b4000 task.ti: 80001e8b4000 PC is at ptep_set_access_flags+0x138/0x1b8 LR is at ptep_set_access_flags+0x138/0x1b8 pc : [] lr : [] pstate: 2145 sp : 80001e8b7bc0 x29: 80001e8b7bc0 x28: 80001e843ac8 x27: 0040 x26: 80001e9ae0d8 x25: 29901000 x24: 80001f32a938 x23: 0001 x22: 80001e9ae088 x21: 7eb59000 x20: 80001e843ac8 x19: 006899371fd3 x18: fe09 x17: 7ea48c88 x16: cb5afb20 x15: 003b9aca x14: 307830203e2d2033 x13: 6462313733393930 x12: 3030303836303078 x11: 30203a676e697261 x10: 656c632065746174 x9 : 7320797472696420 x8 : 79636172203a7461 x7 : 5f6574705f746573 x6 : 28300ab8 x5 : 0003 x4 : x3 : 0003 x2 : 13d16f64 x1 : dfff2000 x0 : 004f ---[ end trace d75cd9bb88364c80 ]--- Call trace: Exception stack(0x80001e8b79a0 to 0x80001e8b7ac0) 79a0: 006899371fd3 80001e843ac8 80001e8b7bc0 28497b70 79c0: 2145 003d 29901000 28301558 79e0: 41b58ab3 296870d0 28200668 292b0e40 7a00: 0001 80001f32a938 29901000 80001e9ae0d8 7a20: 0040 80001e843ac8 29901000 80001e9ae0d8 7a40: 2a9dcf60 0a6d9320 2000 7a60: 80001e8b7bc0 80001e8b7bc0 80001e8b7b80 ffc8 7a80: 80001e8b7ad0 28415418 80001e8b4000 13d16f64 7aa0: 004f dfff2000 13d16f64 0003 [] ptep_set_access_flags+0x138/0x1b8 [] handle_mm_fault+0xa24/0xfa0 [] do_page_fault+0x3d4/0x4c0 [] do_mem_abort+0xac/0x140 My kernel has ARM64_HW_AFDBM enabled, but LS2085 is not ARMv8.1. The code shows it just check if ARM64_HW_AFDBM is enabled or not, but doesn't check if the CPU really has such capability. So, it might be better to have the capability checked runtime? Thanks, Yang
Re: [PATCH] mm: move huge_pmd_set_accessed out of huge_memory.c
Hi folks, I didn't realize pmd_* functions are protected by CONFIG_TRANSPARENT_HUGEPAGE on the most architectures before I made this change. Before I fix all the affected architectures code, I want to check if you guys think this change is worth or not? Thanks, Yang On 4/20/2016 11:24 AM, Yang Shi wrote: huge_pmd_set_accessed is only called by __handle_mm_fault from memory.c, move the definition to memory.c and make it static like create_huge_pmd and wp_huge_pmd. Signed-off-by: Yang Shi <yang@linaro.org> --- include/linux/huge_mm.h | 4 mm/huge_memory.c| 23 --- mm/memory.c | 23 +++ 3 files changed, 23 insertions(+), 27 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 7008623..c218ab7b 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -8,10 +8,6 @@ extern int do_huge_pmd_anonymous_page(struct mm_struct *mm, extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, struct vm_area_struct *vma); -extern void huge_pmd_set_accessed(struct mm_struct *mm, - struct vm_area_struct *vma, - unsigned long address, pmd_t *pmd, - pmd_t orig_pmd, int dirty); extern int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, pmd_t orig_pmd); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index fecbbc5..6c14cb6 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1137,29 +1137,6 @@ out: return ret; } -void huge_pmd_set_accessed(struct mm_struct *mm, - struct vm_area_struct *vma, - unsigned long address, - pmd_t *pmd, pmd_t orig_pmd, - int dirty) -{ - spinlock_t *ptl; - pmd_t entry; - unsigned long haddr; - - ptl = pmd_lock(mm, pmd); - if (unlikely(!pmd_same(*pmd, orig_pmd))) - goto unlock; - - entry = pmd_mkyoung(orig_pmd); - haddr = address & HPAGE_PMD_MASK; - if (pmdp_set_access_flags(vma, haddr, pmd, entry, dirty)) - update_mmu_cache_pmd(vma, address, pmd); - -unlock: - spin_unlock(ptl); -} - static int 
do_huge_pmd_wp_page_fallback(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, diff --git a/mm/memory.c b/mm/memory.c index 93897f2..6ced4eb 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3287,6 +3287,29 @@ static int wp_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma, return VM_FAULT_FALLBACK; } +static void huge_pmd_set_accessed(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long address, + pmd_t *pmd, pmd_t orig_pmd, + int dirty) +{ + spinlock_t *ptl; + pmd_t entry; + unsigned long haddr; + + ptl = pmd_lock(mm, pmd); + if (unlikely(!pmd_same(*pmd, orig_pmd))) + goto unlock; + + entry = pmd_mkyoung(orig_pmd); + haddr = address & HPAGE_PMD_MASK; + if (pmdp_set_access_flags(vma, haddr, pmd, entry, dirty)) + update_mmu_cache_pmd(vma, address, pmd); + +unlock: + spin_unlock(ptl); +} + /* * These routines also need to handle stuff like marking pages dirty * and/or accessed for architectures that don't do it in hardware (most
Re: [PATCH] mm: move huge_pmd_set_accessed out of huge_memory.c
Hi folks, I didn't realize pmd_* functions are protected by CONFIG_TRANSPARENT_HUGEPAGE on the most architectures before I made this change. Before I fix all the affected architectures code, I want to check if you guys think this change is worth or not? Thanks, Yang On 4/20/2016 11:24 AM, Yang Shi wrote: huge_pmd_set_accessed is only called by __handle_mm_fault from memory.c, move the definition to memory.c and make it static like create_huge_pmd and wp_huge_pmd. Signed-off-by: Yang Shi --- include/linux/huge_mm.h | 4 mm/huge_memory.c| 23 --- mm/memory.c | 23 +++ 3 files changed, 23 insertions(+), 27 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 7008623..c218ab7b 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -8,10 +8,6 @@ extern int do_huge_pmd_anonymous_page(struct mm_struct *mm, extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, struct vm_area_struct *vma); -extern void huge_pmd_set_accessed(struct mm_struct *mm, - struct vm_area_struct *vma, - unsigned long address, pmd_t *pmd, - pmd_t orig_pmd, int dirty); extern int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, pmd_t orig_pmd); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index fecbbc5..6c14cb6 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1137,29 +1137,6 @@ out: return ret; } -void huge_pmd_set_accessed(struct mm_struct *mm, - struct vm_area_struct *vma, - unsigned long address, - pmd_t *pmd, pmd_t orig_pmd, - int dirty) -{ - spinlock_t *ptl; - pmd_t entry; - unsigned long haddr; - - ptl = pmd_lock(mm, pmd); - if (unlikely(!pmd_same(*pmd, orig_pmd))) - goto unlock; - - entry = pmd_mkyoung(orig_pmd); - haddr = address & HPAGE_PMD_MASK; - if (pmdp_set_access_flags(vma, haddr, pmd, entry, dirty)) - update_mmu_cache_pmd(vma, address, pmd); - -unlock: - spin_unlock(ptl); -} - static int 
do_huge_pmd_wp_page_fallback(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, diff --git a/mm/memory.c b/mm/memory.c index 93897f2..6ced4eb 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3287,6 +3287,29 @@ static int wp_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma, return VM_FAULT_FALLBACK; } +static void huge_pmd_set_accessed(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long address, + pmd_t *pmd, pmd_t orig_pmd, + int dirty) +{ + spinlock_t *ptl; + pmd_t entry; + unsigned long haddr; + + ptl = pmd_lock(mm, pmd); + if (unlikely(!pmd_same(*pmd, orig_pmd))) + goto unlock; + + entry = pmd_mkyoung(orig_pmd); + haddr = address & HPAGE_PMD_MASK; + if (pmdp_set_access_flags(vma, haddr, pmd, entry, dirty)) + update_mmu_cache_pmd(vma, address, pmd); + +unlock: + spin_unlock(ptl); +} + /* * These routines also need to handle stuff like marking pages dirty * and/or accessed for architectures that don't do it in hardware (most
[BUG linux-next] KASAN bug is raised on linux-next-20160414 with huge tmpfs on
Hi folks, When I run the below test on my ARM64 machine with NFS mounted rootfs, I got KASAN bug report. The test runs well if mnt is not mounted with "huge=1". # mount -t tmpfs -o huge=1 tmpfs /mnt # cp -a /opt/ltp /mnt/ BUG: KASAN: use-after-free in nfs_readdir+0x2c4/0x848 at addr 8b7f4000 Read of size 4 by task crond/446 page:7bffc02dfd00 count:2 mapcount:0 mapping:80001c2cae98 index:0x0 flags: 0x6c(referenced|uptodate|lru|active) page dumped because: kasan: bad access detected page->mem_cgroup:80002402da80 CPU: 0 PID: 446 Comm: crond Tainted: GW 4.6.0-rc3-next-20160414-WR8.0.0.0_standard+ #13 Hardware name: Freescale Layerscape 2085a RDB Board (DT) Call trace: [] dump_backtrace+0x0/0x2b8 [] show_stack+0x24/0x30 [] dump_stack+0xb0/0xe8 [] kasan_report_error+0x518/0x5c0 [] kasan_report+0x60/0x70 [] __asan_load4+0x64/0x80 [] nfs_readdir+0x2c4/0x848 [] iterate_dir+0x120/0x1d8 [] SyS_getdents64+0xdc/0x170 [] __sys_trace_return+0x0/0x4 Memory state around the buggy address: 8b7f3f00: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff 8b7f3f80: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff >8b7f4000: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ^ 8b7f4080: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff 8b7f4100: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff BUG: KASAN: use-after-free in nfs_do_filldir+0x88/0x298 at addr 8b7f4000 Read of size 4 by task crond/446 page:7bffc02dfd00 count:2 mapcount:0 mapping:80001c2cae98 index:0x0 flags: 0x6c(referenced|uptodate|lru|active) page dumped because: kasan: bad access detected page->mem_cgroup:80002402da80 CPU: 0 PID: 446 Comm: crond Tainted: GB W 4.6.0-rc3-next-20160414-WR8.0.0.0_standard+ #13 Hardware name: Freescale Layerscape 2085a RDB Board (DT) Call trace: [] dump_backtrace+0x0/0x2b8 [] show_stack+0x24/0x30 [] dump_stack+0xb0/0xe8 [] kasan_report_error+0x518/0x5c0 [] kasan_report+0x60/0x70 [] __asan_load4+0x64/0x80 [] nfs_do_filldir+0x88/0x298 [] nfs_readdir+0x488/0x848 [] iterate_dir+0x120/0x1d8 [] 
SyS_getdents64+0xdc/0x170 [] __sys_trace_return+0x0/0x4 Memory state around the buggy address: 8b7f3f00: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff 8b7f3f80: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff >8b7f4000: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ^ 8b7f4080: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff 8b7f4100: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff Thanks, Yang
[BUG linux-next] KASAN bug is raised on linux-next-20160414 with huge tmpfs on
Hi folks, When I run the below test on my ARM64 machine with NFS mounted rootfs, I got KASAN bug report. The test runs well if mnt is not mounted with "huge=1". # mount -t tmpfs -o huge=1 tmpfs /mnt # cp -a /opt/ltp /mnt/ BUG: KASAN: use-after-free in nfs_readdir+0x2c4/0x848 at addr 8b7f4000 Read of size 4 by task crond/446 page:7bffc02dfd00 count:2 mapcount:0 mapping:80001c2cae98 index:0x0 flags: 0x6c(referenced|uptodate|lru|active) page dumped because: kasan: bad access detected page->mem_cgroup:80002402da80 CPU: 0 PID: 446 Comm: crond Tainted: GW 4.6.0-rc3-next-20160414-WR8.0.0.0_standard+ #13 Hardware name: Freescale Layerscape 2085a RDB Board (DT) Call trace: [] dump_backtrace+0x0/0x2b8 [] show_stack+0x24/0x30 [] dump_stack+0xb0/0xe8 [] kasan_report_error+0x518/0x5c0 [] kasan_report+0x60/0x70 [] __asan_load4+0x64/0x80 [] nfs_readdir+0x2c4/0x848 [] iterate_dir+0x120/0x1d8 [] SyS_getdents64+0xdc/0x170 [] __sys_trace_return+0x0/0x4 Memory state around the buggy address: 8b7f3f00: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff 8b7f3f80: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff >8b7f4000: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ^ 8b7f4080: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff 8b7f4100: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff BUG: KASAN: use-after-free in nfs_do_filldir+0x88/0x298 at addr 8b7f4000 Read of size 4 by task crond/446 page:7bffc02dfd00 count:2 mapcount:0 mapping:80001c2cae98 index:0x0 flags: 0x6c(referenced|uptodate|lru|active) page dumped because: kasan: bad access detected page->mem_cgroup:80002402da80 CPU: 0 PID: 446 Comm: crond Tainted: GB W 4.6.0-rc3-next-20160414-WR8.0.0.0_standard+ #13 Hardware name: Freescale Layerscape 2085a RDB Board (DT) Call trace: [] dump_backtrace+0x0/0x2b8 [] show_stack+0x24/0x30 [] dump_stack+0xb0/0xe8 [] kasan_report_error+0x518/0x5c0 [] kasan_report+0x60/0x70 [] __asan_load4+0x64/0x80 [] nfs_do_filldir+0x88/0x298 [] nfs_readdir+0x488/0x848 [] iterate_dir+0x120/0x1d8 [] 
SyS_getdents64+0xdc/0x170 [] __sys_trace_return+0x0/0x4 Memory state around the buggy address: 8b7f3f00: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff 8b7f3f80: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff >8b7f4000: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ^ 8b7f4080: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff 8b7f4100: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff Thanks, Yang
Re: [BUG linux-next] Kernel panic found with linux-next-20160414
On 4/20/2016 1:01 AM, Hugh Dickins wrote: On Tue, 19 Apr 2016, Shi, Yang wrote: Hi folks, When I ran ltp on linux-next-20160414 on my ARM64 machine, I got the below kernel panic: Unable to handle kernel paging request at virtual address ffc007846000 pgd = ffc01e21d000 [ffc007846000] *pgd=, *pud= Internal error: Oops: 9647 [#11] PREEMPT SMP Modules linked in: loop CPU: 7 PID: 274 Comm: systemd-journal Tainted: G D 4.6.0-rc3-next-20160414-WR8.0.0.0_standard+ #9 Hardware name: Freescale Layerscape 2085a RDB Board (DT) task: ffc01e3fcf80 ti: ffc01ea8c000 task.ti: ffc01ea8c000 PC is at copy_page+0x38/0x120 LR is at migrate_page_copy+0x604/0x1660 pc : [] lr : [] pstate: 2145 sp : ffc01ea8ecd0 x29: ffc01ea8ecd0 x28: x27: 1ff7b80240f8 x26: ffc018196f20 x25: ffbdc01e1180 x24: ffbdc01e1180 x23: x22: ffc01e3fcf80 x21: ffc00481f000 x20: ff900a31d000 x19: ffbdc01207c0 x18: 0f00 x17: x16: x15: x14: x13: x12: x11: x10: x9 : x8 : x7 : x6 : x5 : x4 : x3 : x2 : x1 : ffc00481f080 x0 : ffc007846000 Call trace: Exception stack(0xffc021fc2ed0 to 0xffc021fc2ff0) 2ec0: ffbdc00887c0 ff900a31d000 2ee0: ffc021fc30f0 ff9008ff2318 2145 0025 2f00: ffbdc025a280 ffc020adc4c0 41b58ab3 ff900a085fd0 2f20: ff9008200658 ffbdc00887c0 2f40: ff900b0f1320 ffc021fc3078 41b58ab3 ff900a0864f8 2f60: ff9008210010 ffc021fb8960 ff900867bacc 1ff8043f712d 2f80: ffc021fc2fb0 ff9008210564 ffc021fc3070 ffc021fb8940 2fa0: 08221f78 ff900862f9c8 ffc021fc2fe0 ff9008215dc8 2fc0: 1ff8043f8602 ffc021fc ffc00968a000 ffc00221f080 2fe0: f9407e11d1f0 d61f02209103e210 [] copy_page+0x38/0x120 [] migrate_page+0x74/0x98 [] nfs_migrate_page+0x58/0x80 [] move_to_new_page+0x15c/0x4d8 [] migrate_pages+0x7c8/0x11f0 [] compact_zone+0xdfc/0x2570 [] compact_zone_order+0xe0/0x170 [] try_to_compact_pages+0x2e8/0x8f8 [] __alloc_pages_direct_compact+0x100/0x540 [] __alloc_pages_nodemask+0xc40/0x1c58 [] khugepaged+0x468/0x19c8 [] kthread+0x248/0x2c0 [] ret_from_fork+0x10/0x40 Code: d281f012 91020021 f1020252 d503201f (a8000c02) I did some 
initial investigation and found it is caused by DEBUG_PAGEALLOC and CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT. And, mainline 4.6-rc3 works well. It should be not arch specific although I got it caught on ARM64. I suspect this might be caused by Hugh's huge tmpfs patches. Thanks for testing. It might be caused by my patches, but I don't think that's very likely. This is page migraton for compaction, in the service of anon THP's khugepaged; and I wonder if you were even exercising huge tmpfs when running LTP here (it certainly can be done: I like to mount a huge tmpfs on /opt/ltp and install there, with shmem_huge 2 so any other tmpfs mounts are also huge). Some further investigation shows I got the panic even though I don't have tmpfs mounted with huge=1 or set shmem_huge to 2. There are compaction changes in linux-next too, but I don't see any reason why they'd cause this. I don't know arm64 traces enough to know whether it's the source page or the destination page for the copy, but it looks as if it has been freed (and DEBUG_PAGEALLOC unmapped) before reaching migration's copy. The fault address is passed by x0, which is dest in the implementation of copy_page, so it is the destination page. Needs more debugging, I'm afraid: is it reproducible? Yes, as long as I enable those two PAGEALLOC debug options, I can get the panic once I run ltp. But, it is not caused any specific ltp test case directly, the panic happens randomly during ltp is running. Thanks, Yang Hugh
Re: [BUG linux-next] Kernel panic found with linux-next-20160414
On 4/20/2016 1:01 AM, Hugh Dickins wrote: On Tue, 19 Apr 2016, Shi, Yang wrote: Hi folks, When I ran ltp on linux-next-20160414 on my ARM64 machine, I got the below kernel panic: Unable to handle kernel paging request at virtual address ffc007846000 pgd = ffc01e21d000 [ffc007846000] *pgd=, *pud= Internal error: Oops: 9647 [#11] PREEMPT SMP Modules linked in: loop CPU: 7 PID: 274 Comm: systemd-journal Tainted: G D 4.6.0-rc3-next-20160414-WR8.0.0.0_standard+ #9 Hardware name: Freescale Layerscape 2085a RDB Board (DT) task: ffc01e3fcf80 ti: ffc01ea8c000 task.ti: ffc01ea8c000 PC is at copy_page+0x38/0x120 LR is at migrate_page_copy+0x604/0x1660 pc : [] lr : [] pstate: 2145 sp : ffc01ea8ecd0 x29: ffc01ea8ecd0 x28: x27: 1ff7b80240f8 x26: ffc018196f20 x25: ffbdc01e1180 x24: ffbdc01e1180 x23: x22: ffc01e3fcf80 x21: ffc00481f000 x20: ff900a31d000 x19: ffbdc01207c0 x18: 0f00 x17: x16: x15: x14: x13: x12: x11: x10: x9 : x8 : x7 : x6 : x5 : x4 : x3 : x2 : x1 : ffc00481f080 x0 : ffc007846000 Call trace: Exception stack(0xffc021fc2ed0 to 0xffc021fc2ff0) 2ec0: ffbdc00887c0 ff900a31d000 2ee0: ffc021fc30f0 ff9008ff2318 2145 0025 2f00: ffbdc025a280 ffc020adc4c0 41b58ab3 ff900a085fd0 2f20: ff9008200658 ffbdc00887c0 2f40: ff900b0f1320 ffc021fc3078 41b58ab3 ff900a0864f8 2f60: ff9008210010 ffc021fb8960 ff900867bacc 1ff8043f712d 2f80: ffc021fc2fb0 ff9008210564 ffc021fc3070 ffc021fb8940 2fa0: 08221f78 ff900862f9c8 ffc021fc2fe0 ff9008215dc8 2fc0: 1ff8043f8602 ffc021fc ffc00968a000 ffc00221f080 2fe0: f9407e11d1f0 d61f02209103e210 [] copy_page+0x38/0x120 [] migrate_page+0x74/0x98 [] nfs_migrate_page+0x58/0x80 [] move_to_new_page+0x15c/0x4d8 [] migrate_pages+0x7c8/0x11f0 [] compact_zone+0xdfc/0x2570 [] compact_zone_order+0xe0/0x170 [] try_to_compact_pages+0x2e8/0x8f8 [] __alloc_pages_direct_compact+0x100/0x540 [] __alloc_pages_nodemask+0xc40/0x1c58 [] khugepaged+0x468/0x19c8 [] kthread+0x248/0x2c0 [] ret_from_fork+0x10/0x40 Code: d281f012 91020021 f1020252 d503201f (a8000c02) I did some 
initial investigation and found it is caused by DEBUG_PAGEALLOC and CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT. And, mainline 4.6-rc3 works well. It should be not arch specific although I got it caught on ARM64. I suspect this might be caused by Hugh's huge tmpfs patches. Thanks for testing. It might be caused by my patches, but I don't think that's very likely. This is page migraton for compaction, in the service of anon THP's khugepaged; and I wonder if you were even exercising huge tmpfs when running LTP here (it certainly can be done: I like to mount a huge tmpfs on /opt/ltp and install there, with shmem_huge 2 so any other tmpfs mounts are also huge). Some further investigation shows I got the panic even though I don't have tmpfs mounted with huge=1 or set shmem_huge to 2. There are compaction changes in linux-next too, but I don't see any reason why they'd cause this. I don't know arm64 traces enough to know whether it's the source page or the destination page for the copy, but it looks as if it has been freed (and DEBUG_PAGEALLOC unmapped) before reaching migration's copy. The fault address is passed by x0, which is dest in the implementation of copy_page, so it is the destination page. Needs more debugging, I'm afraid: is it reproducible? Yes, as long as I enable those two PAGEALLOC debug options, I can get the panic once I run ltp. But, it is not caused any specific ltp test case directly, the panic happens randomly during ltp is running. Thanks, Yang Hugh
Re: [PATCHv7 00/29] THP-enabled tmpfs/shmem using compound pages
On 4/19/2016 9:50 AM, Andrea Arcangeli wrote: Hello, On Mon, Apr 18, 2016 at 03:55:44PM -0700, Shi, Yang wrote: Hi Kirill, Finally, I got some time to look into and try yours and Hugh's patches, got two problems. One thing that come to mind to test is this: qemu with -machine accel=kvm -mem-path=/dev/shm/,share=on . Thanks for the suggestion, I will definitely have a try with KVM. It would be better if Kirill and Hugh could share what benchmark they ran and how much they got improved since my test case is very simple and may just cover a small part of it. Yang The THP Compound approach in tmpfs may just happen to work already with KVM (or at worst it'd require minor adjustments) because it uses the exact same model KVM is already aware about from THP in anonymous memory, example from arch/x86/kvm/mmu.c: static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t *gfnp, kvm_pfn_t *pfnp, int *levelp) { kvm_pfn_t pfn = *pfnp; gfn_t gfn = *gfnp; int level = *levelp; /* * Check if it's a transparent hugepage. If this would be an * hugetlbfs page, level wouldn't be set to * PT_PAGE_TABLE_LEVEL and there would be no adjustment done * here. */ if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL && PageTransCompound(pfn_to_page(pfn)) && !mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) { Not using two different models between THP in tmpfs and THP in anon is essential not just to significantly reduce the size of the kernel code, but also because THP knowledge can't be self contained in the mm/shmem.c file. Having to support two different models would complicate things for secondary MMU drivers (i.e. mmu notifer users) like KVM who also need to create huge mapping in the shadow pagetable layer in arch/x86/kvm if the primary MMU allows for it. x86-64 and ARM64 with yours and Hugh's patches (linux-next tree), I got the program execution time reduced by ~12% on x86-64, it looks very impressive. 
Agreed, both patchset are impressive works and achieving amazing results! My view is that in terms of long-lived computation from userland point of view, both models are malleable enough and could achieve everything we need in the end, but as far as the overall kernel efficiency is concerned the compound model will always retain a slight advantage in performance by leveraging a native THP compound refcounting that requires just one atomic_inc/dec per THP mapcount instead of 512 of them. Other advantages of the compound model is that it's half in code size despite already including khugepaged (i.e. the same split_huge_page works for both tmpfs and anon) and like said above it won't introduce much complications for drivers like KVM as the model didn't change. Thanks, Andrea
Re: [PATCHv7 00/29] THP-enabled tmpfs/shmem using compound pages
On 4/19/2016 9:50 AM, Andrea Arcangeli wrote: Hello, On Mon, Apr 18, 2016 at 03:55:44PM -0700, Shi, Yang wrote: Hi Kirill, Finally, I got some time to look into and try yours and Hugh's patches, got two problems. One thing that come to mind to test is this: qemu with -machine accel=kvm -mem-path=/dev/shm/,share=on . Thanks for the suggestion, I will definitely have a try with KVM. It would be better if Kirill and Hugh could share what benchmark they ran and how much they got improved since my test case is very simple and may just cover a small part of it. Yang The THP Compound approach in tmpfs may just happen to work already with KVM (or at worst it'd require minor adjustments) because it uses the exact same model KVM is already aware about from THP in anonymous memory, example from arch/x86/kvm/mmu.c: static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t *gfnp, kvm_pfn_t *pfnp, int *levelp) { kvm_pfn_t pfn = *pfnp; gfn_t gfn = *gfnp; int level = *levelp; /* * Check if it's a transparent hugepage. If this would be an * hugetlbfs page, level wouldn't be set to * PT_PAGE_TABLE_LEVEL and there would be no adjustment done * here. */ if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL && PageTransCompound(pfn_to_page(pfn)) && !mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) { Not using two different models between THP in tmpfs and THP in anon is essential not just to significantly reduce the size of the kernel code, but also because THP knowledge can't be self contained in the mm/shmem.c file. Having to support two different models would complicate things for secondary MMU drivers (i.e. mmu notifer users) like KVM who also need to create huge mapping in the shadow pagetable layer in arch/x86/kvm if the primary MMU allows for it. x86-64 and ARM64 with yours and Hugh's patches (linux-next tree), I got the program execution time reduced by ~12% on x86-64, it looks very impressive. 
Agreed, both patchset are impressive works and achieving amazing results! My view is that in terms of long-lived computation from userland point of view, both models are malleable enough and could achieve everything we need in the end, but as far as the overall kernel efficiency is concerned the compound model will always retain a slight advantage in performance by leveraging a native THP compound refcounting that requires just one atomic_inc/dec per THP mapcount instead of 512 of them. Other advantages of the compound model is that it's half in code size despite already including khugepaged (i.e. the same split_huge_page works for both tmpfs and anon) and like said above it won't introduce much complications for drivers like KVM as the model didn't change. Thanks, Andrea
[BUG linux-next] Kernel panic found with linux-next-20160414
Hi folks, When I ran ltp on linux-next-20160414 on my ARM64 machine, I got the below kernel panic: Unable to handle kernel paging request at virtual address ffc007846000 pgd = ffc01e21d000 [ffc007846000] *pgd=, *pud= Internal error: Oops: 9647 [#11] PREEMPT SMP Modules linked in: loop CPU: 7 PID: 274 Comm: systemd-journal Tainted: G D 4.6.0-rc3-next-20160414-WR8.0.0.0_standard+ #9 Hardware name: Freescale Layerscape 2085a RDB Board (DT) task: ffc01e3fcf80 ti: ffc01ea8c000 task.ti: ffc01ea8c000 PC is at copy_page+0x38/0x120 LR is at migrate_page_copy+0x604/0x1660 pc : [] lr : [] pstate: 2145 sp : ffc01ea8ecd0 x29: ffc01ea8ecd0 x28: x27: 1ff7b80240f8 x26: ffc018196f20 x25: ffbdc01e1180 x24: ffbdc01e1180 x23: x22: ffc01e3fcf80 x21: ffc00481f000 x20: ff900a31d000 x19: ffbdc01207c0 x18: 0f00 x17: x16: x15: x14: x13: x12: x11: x10: x9 : x8 : x7 : x6 : x5 : x4 : x3 : x2 : x1 : ffc00481f080 x0 : ffc007846000 Call trace: Exception stack(0xffc021fc2ed0 to 0xffc021fc2ff0) 2ec0: ffbdc00887c0 ff900a31d000 2ee0: ffc021fc30f0 ff9008ff2318 2145 0025 2f00: ffbdc025a280 ffc020adc4c0 41b58ab3 ff900a085fd0 2f20: ff9008200658 ffbdc00887c0 2f40: ff900b0f1320 ffc021fc3078 41b58ab3 ff900a0864f8 2f60: ff9008210010 ffc021fb8960 ff900867bacc 1ff8043f712d 2f80: ffc021fc2fb0 ff9008210564 ffc021fc3070 ffc021fb8940 2fa0: 08221f78 ff900862f9c8 ffc021fc2fe0 ff9008215dc8 2fc0: 1ff8043f8602 ffc021fc ffc00968a000 ffc00221f080 2fe0: f9407e11d1f0 d61f02209103e210 [] copy_page+0x38/0x120 [] migrate_page+0x74/0x98 [] nfs_migrate_page+0x58/0x80 [] move_to_new_page+0x15c/0x4d8 [] migrate_pages+0x7c8/0x11f0 [] compact_zone+0xdfc/0x2570 [] compact_zone_order+0xe0/0x170 [] try_to_compact_pages+0x2e8/0x8f8 [] __alloc_pages_direct_compact+0x100/0x540 [] __alloc_pages_nodemask+0xc40/0x1c58 [] khugepaged+0x468/0x19c8 [] kthread+0x248/0x2c0 [] ret_from_fork+0x10/0x40 Code: d281f012 91020021 f1020252 d503201f (a8000c02) I did some initial investigation and found it is caused by DEBUG_PAGEALLOC and 
CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT. And, mainline 4.6-rc3 works well. It should be not arch specific although I got it caught on ARM64. I suspect this might be caused by Hugh's huge tmpfs patches. Thanks, Yang
[BUG linux-next] Kernel panic found with linux-next-20160414
Hi folks, When I ran ltp on linux-next-20160414 on my ARM64 machine, I got the below kernel panic: Unable to handle kernel paging request at virtual address ffc007846000 pgd = ffc01e21d000 [ffc007846000] *pgd=, *pud= Internal error: Oops: 9647 [#11] PREEMPT SMP Modules linked in: loop CPU: 7 PID: 274 Comm: systemd-journal Tainted: G D 4.6.0-rc3-next-20160414-WR8.0.0.0_standard+ #9 Hardware name: Freescale Layerscape 2085a RDB Board (DT) task: ffc01e3fcf80 ti: ffc01ea8c000 task.ti: ffc01ea8c000 PC is at copy_page+0x38/0x120 LR is at migrate_page_copy+0x604/0x1660 pc : [] lr : [] pstate: 2145 sp : ffc01ea8ecd0 x29: ffc01ea8ecd0 x28: x27: 1ff7b80240f8 x26: ffc018196f20 x25: ffbdc01e1180 x24: ffbdc01e1180 x23: x22: ffc01e3fcf80 x21: ffc00481f000 x20: ff900a31d000 x19: ffbdc01207c0 x18: 0f00 x17: x16: x15: x14: x13: x12: x11: x10: x9 : x8 : x7 : x6 : x5 : x4 : x3 : x2 : x1 : ffc00481f080 x0 : ffc007846000 Call trace: Exception stack(0xffc021fc2ed0 to 0xffc021fc2ff0) 2ec0: ffbdc00887c0 ff900a31d000 2ee0: ffc021fc30f0 ff9008ff2318 2145 0025 2f00: ffbdc025a280 ffc020adc4c0 41b58ab3 ff900a085fd0 2f20: ff9008200658 ffbdc00887c0 2f40: ff900b0f1320 ffc021fc3078 41b58ab3 ff900a0864f8 2f60: ff9008210010 ffc021fb8960 ff900867bacc 1ff8043f712d 2f80: ffc021fc2fb0 ff9008210564 ffc021fc3070 ffc021fb8940 2fa0: 08221f78 ff900862f9c8 ffc021fc2fe0 ff9008215dc8 2fc0: 1ff8043f8602 ffc021fc ffc00968a000 ffc00221f080 2fe0: f9407e11d1f0 d61f02209103e210 [] copy_page+0x38/0x120 [] migrate_page+0x74/0x98 [] nfs_migrate_page+0x58/0x80 [] move_to_new_page+0x15c/0x4d8 [] migrate_pages+0x7c8/0x11f0 [] compact_zone+0xdfc/0x2570 [] compact_zone_order+0xe0/0x170 [] try_to_compact_pages+0x2e8/0x8f8 [] __alloc_pages_direct_compact+0x100/0x540 [] __alloc_pages_nodemask+0xc40/0x1c58 [] khugepaged+0x468/0x19c8 [] kthread+0x248/0x2c0 [] ret_from_fork+0x10/0x40 Code: d281f012 91020021 f1020252 d503201f (a8000c02) I did some initial investigation and found it is caused by DEBUG_PAGEALLOC and 
CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT. And, mainline 4.6-rc3 works well. It should be not arch specific although I got it caught on ARM64. I suspect this might be caused by Hugh's huge tmpfs patches. Thanks, Yang
KASAN can't change FRAME_WARN
Hi Andrey, When I enable KASAN for 4.5 and 4.6 (I didn't try with older versions), I got a FRAME_WARN warning because the frame size exceeds 2048 bytes. Then I found the kconfig looks like: range 0 8192 default 0 if KASAN default 1024 if !64BIT default 2048 if 64BIT In my understanding, FRAME_WARN should be 0 once KASAN is enabled, but it is still 2048. I tried a couple of fixes, i.e. default 0 if KASAN default 1024 if (!KASAN && !64BIT) default 2048 if (!KASAN && 64BIT) But, nothing works, so I have to add "depends on !KASAN" to disable FRAME_WARN completely, but it causes the kernel image size to increase. Any hint is appreciated. Thanks, Yang
KASAN can't change FRAME_WARN
Hi Andrey, When I enable KASAN for 4.5 and 4.6 (I didn't try with older versions), I got a FRAME_WARN warning because the frame size exceeds 2048 bytes. Then I found the kconfig looks like: range 0 8192 default 0 if KASAN default 1024 if !64BIT default 2048 if 64BIT In my understanding, FRAME_WARN should be 0 once KASAN is enabled, but it is still 2048. I tried a couple of fixes, i.e. default 0 if KASAN default 1024 if (!KASAN && !64BIT) default 2048 if (!KASAN && 64BIT) But, nothing works, so I have to add "depends on !KASAN" to disable FRAME_WARN completely, but it causes the kernel image size to increase. Any hint is appreciated. Thanks, Yang
Re: [PATCHv7 00/29] THP-enabled tmpfs/shmem using compound pages
On 4/19/2016 7:33 AM, Jerome Marchand wrote: On 04/19/2016 12:55 AM, Shi, Yang wrote: 2. I ran my THP test (generated a program with 4MB text section) on both x86-64 and ARM64 with yours and Hugh's patches (linux-next tree), I got the program execution time reduced by ~12% on x86-64, it looks very impressive. But, on ARM64, there is just ~3% change, and sometimes huge tmpfs may show even worse data than non-hugepage. Both yours and Hugh's patches have the same behavior. Any idea? Just a shot in the dark, but what page size do you use? If you use 4k pages, then hugepage size should be the same as on x86 and a similar I do use 4K pages for both x86-64 and ARM64 in my testing. Thanks, Yang behavior could be expected. Otherwise, hugepages would be too big to be taken advantage of by your test program. Jerome
Re: [PATCHv7 00/29] THP-enabled tmpfs/shmem using compound pages
On 4/19/2016 7:33 AM, Jerome Marchand wrote: On 04/19/2016 12:55 AM, Shi, Yang wrote: 2. I ran my THP test (generated a program with 4MB text section) on both x86-64 and ARM64 with yours and Hugh's patches (linux-next tree), I got the program execution time reduced by ~12% on x86-64, it looks very impressive. But, on ARM64, there is just ~3% change, and sometimes huge tmpfs may show even worse data than non-hugepage. Both yours and Hugh's patches have the same behavior. Any idea? Just a shot in the dark, but what page size do you use? If you use 4k pages, then hugepage size should be the same as on x86 and a similar I do use 4K pages for both x86-64 and ARM64 in my testing. Thanks, Yang behavior could be expected. Otherwise, hugepages would be too big to be taken advantage of by your test program. Jerome
Re: [PATCHv7 00/29] THP-enabled tmpfs/shmem using compound pages
Hi Kirill, Finally, I got some time to look into and try yours and Hugh's patches, got two problems. 1. A quick boot up test on my ARM64 machine with your v7 tree shows some unexpected error: systemd-journald[285]: Failed to save stream data /run/systemd/journal/streams/8:16863: No space left on device systemd-journald[285]: Failed to save stream data /run/systemd/journal/streams/8:16865: No space left on device Starting DNS forwarder and DHCP server.systemd-journald[285]: Failed to save stream data /run/systemd/journal/streams/8:16867: No space left on device .. systemd-journald[285]: Failed to save stream data /run/systemd/journal/streams/8:16869: No space left on device Starting Postfix Mail Transport Agent... systemd-journald[285]: Failed to save stream data /run/systemd/journal/streams/8:16871: No space left on device Starting Berkeley Internet Name Domain (DNS)... Starting Wait for Network to be Configured... systemd-journald[285]: Failed to save stream data /run/systemd/journal/streams/8:2422: No space left on device [ OK ] Started /etc/rc.local Compatibility. [FAILED] Failed to start DNS forwarder and DHCP server. See 'systemctl status dnsmasq.service' for details. systemd-journald[285]: Failed to save stream data /run/systemd/journal/streams/8:2425: No space left on device [ OK ] Started Serial Getty on ttyS1. [ OK ] Started Serial Getty on ttyS0. [ OK ] Started Getty on tty1. systemd-journald[285]: Failed to save stream data /run/systemd/journal/streams/8:2433: No space left on device [FAILED] Failed to start Berkeley Internet Name Domain (DNS). See 'systemctl status named.service' for details. The /run dir is mounted as tmpfs. x86 boot doesn't get such error. And, Hugh's patches don't have such problem. 2. I ran my THP test (generated a program with 4MB text section) on both x86-64 and ARM64 with yours and Hugh's patches (linux-next tree), I got the program execution time reduced by ~12% on x86-64, it looks very impressive. 
But, on ARM64, there is just ~3% change, and sometimes huge tmpfs may show even worse data than non-hugepage. Both yours and Hugh's patches have the same behavior. Any idea? Thanks, Yang On 4/15/2016 5:23 PM, Kirill A. Shutemov wrote: This is probably the last update before the mm summit. Main focus is on khugepaged stability. khugepaged is in more reasonable shape now. I missed quite a few corner cases on first try. I run this version via LTP, trinity and syzkaller without crashes so far. The patchset is on top of v4.6-rc3 plus Hugh's "easy preliminaries to THP pagecache" and Ebru's khugepaged swapin patches from the -mm tree. Git tree: git://git.kernel.org/pub/scm/linux/kernel/git/kas/linux.git hugetmpfs/v7 == Changelog == v7: - khugepaged updates: + fix page leak/page cache corruption on collapse fail; + filter out VMAs not suitable for huge pages due to misaligned vm_pgoff; + fix build without CONFIG_SHMEM; + drop few over-protective checks; - fix bogus VM_BUG_ON() in __delete_from_page_cache(); v6: - experimental collapse support; - fix swapout mapped huge pages; - fix page leak in faultaround code; - fix excessive huge page allocation with huge=within_size; - rename VM_NO_THP to VM_NO_KHUGEPAGED; - fix condition in hugepage_madvise(); - accounting reworked again; v5: - add FileHugeMapped to /proc/PID/smaps; - make FileHugeMapped in meminfo aligned with other fields; - Documentation/vm/transhuge.txt updated; v4: - first four patch were applied to -mm tree; - drop pages beyond i_size on split_huge_pages; - few small random bugfixes; v3: - huge= mount option now can have values always, within_size, advice and never; - sysctl handle is replaced with sysfs knob; - MADV_HUGEPAGE/MADV_NOHUGEPAGE is now respected on page allocation via page fault; - mlock() handling had been fixed; - bunch of smaller bugfixes and cleanups. == Design overview == Huge pages are allocated by shmem when it's allowed (by mount option) and there are no entries for the range in radix-tree. 
Huge page is represented by HPAGE_PMD_NR entries in radix-tree. MM core maps a page with PMD if ->fault() returns huge page and the VMA is suitable for huge pages (size, alignment). There's no need in two requests to the file system: the filesystem returns a huge page if it can, with graceful fallback to small pages otherwise. As with DAX, split_huge_pmd() is implemented by unmapping the PMD: we can re-fault the page with PTEs later. Basic scheme for split_huge_page() is the same as for anon-THP. Few differences: - File pages are on radix-tree, so we have head->_count offset by HPAGE_PMD_NR. The count got distributed to small pages during split. - mapping->tree_lock prevents non-lockless access to pages under split over radix-tree; - Lockless access is prevented by setting the head->_count to 0 during split, so
Re: [PATCHv7 00/29] THP-enabled tmpfs/shmem using compound pages
Hi Kirill, Finally, I got some time to look into and try yours and Hugh's patches, got two problems. 1. A quick boot up test on my ARM64 machine with your v7 tree shows some unexpected error: systemd-journald[285]: Failed to save stream data /run/systemd/journal/streams/8:16863: No space left on device systemd-journald[285]: Failed to save stream data /run/systemd/journal/streams/8:16865: No space left on device Starting DNS forwarder and DHCP server.systemd-journald[285]: Failed to save stream data /run/systemd/journal/streams/8:16867: No space left on device .. systemd-journald[285]: Failed to save stream data /run/systemd/journal/streams/8:16869: No space left on device Starting Postfix Mail Transport Agent... systemd-journald[285]: Failed to save stream data /run/systemd/journal/streams/8:16871: No space left on device Starting Berkeley Internet Name Domain (DNS)... Starting Wait for Network to be Configured... systemd-journald[285]: Failed to save stream data /run/systemd/journal/streams/8:2422: No space left on device [ OK ] Started /etc/rc.local Compatibility. [FAILED] Failed to start DNS forwarder and DHCP server. See 'systemctl status dnsmasq.service' for details. systemd-journald[285]: Failed to save stream data /run/systemd/journal/streams/8:2425: No space left on device [ OK ] Started Serial Getty on ttyS1. [ OK ] Started Serial Getty on ttyS0. [ OK ] Started Getty on tty1. systemd-journald[285]: Failed to save stream data /run/systemd/journal/streams/8:2433: No space left on device [FAILED] Failed to start Berkeley Internet Name Domain (DNS). See 'systemctl status named.service' for details. The /run dir is mounted as tmpfs. x86 boot doesn't get such error. And, Hugh's patches don't have such problem. 2. I ran my THP test (generated a program with 4MB text section) on both x86-64 and ARM64 with yours and Hugh's patches (linux-next tree), I got the program execution time reduced by ~12% on x86-64, it looks very impressive. 
But, on ARM64, there is just ~3% change, and sometimes huge tmpfs may show even worse data than non-hugepage. Both yours and Hugh's patches have the same behavior. Any idea? Thanks, Yang On 4/15/2016 5:23 PM, Kirill A. Shutemov wrote: This is probably the last update before the mm summit. Main focus is on khugepaged stability. khugepaged is in more reasonable shape now. I missed quite a few corner cases on first try. I run this version via LTP, trinity and syzkaller without crashes so far. The patchset is on top of v4.6-rc3 plus Hugh's "easy preliminaries to THP pagecache" and Ebru's khugepaged swapin patches from the -mm tree. Git tree: git://git.kernel.org/pub/scm/linux/kernel/git/kas/linux.git hugetmpfs/v7 == Changelog == v7: - khugepaged updates: + fix page leak/page cache corruption on collapse fail; + filter out VMAs not suitable for huge pages due to misaligned vm_pgoff; + fix build without CONFIG_SHMEM; + drop few over-protective checks; - fix bogus VM_BUG_ON() in __delete_from_page_cache(); v6: - experimental collapse support; - fix swapout mapped huge pages; - fix page leak in faultaround code; - fix excessive huge page allocation with huge=within_size; - rename VM_NO_THP to VM_NO_KHUGEPAGED; - fix condition in hugepage_madvise(); - accounting reworked again; v5: - add FileHugeMapped to /proc/PID/smaps; - make FileHugeMapped in meminfo aligned with other fields; - Documentation/vm/transhuge.txt updated; v4: - first four patch were applied to -mm tree; - drop pages beyond i_size on split_huge_pages; - few small random bugfixes; v3: - huge= mount option now can have values always, within_size, advice and never; - sysctl handle is replaced with sysfs knob; - MADV_HUGEPAGE/MADV_NOHUGEPAGE is now respected on page allocation via page fault; - mlock() handling had been fixed; - bunch of smaller bugfixes and cleanups. == Design overview == Huge pages are allocated by shmem when it's allowed (by mount option) and there are no entries for the range in radix-tree. 
Huge page is represented by HPAGE_PMD_NR entries in radix-tree. MM core maps a page with PMD if ->fault() returns huge page and the VMA is suitable for huge pages (size, alignment). There's no need in two requests to the file system: the filesystem returns a huge page if it can, with graceful fallback to small pages otherwise. As with DAX, split_huge_pmd() is implemented by unmapping the PMD: we can re-fault the page with PTEs later. Basic scheme for split_huge_page() is the same as for anon-THP. Few differences: - File pages are on radix-tree, so we have head->_count offset by HPAGE_PMD_NR. The count got distributed to small pages during split. - mapping->tree_lock prevents non-lockless access to pages under split over radix-tree; - Lockless access is prevented by setting the head->_count to 0 during split, so
Re: [PATCH] locktorture: make verbose writable and control stats print
On 4/15/2016 5:09 PM, Paul E. McKenney wrote: On Fri, Apr 15, 2016 at 04:45:32PM -0700, Shi, Yang wrote: On 4/15/2016 4:26 PM, Paul E. McKenney wrote: On Fri, Apr 15, 2016 at 01:28:11PM -0700, Yang Shi wrote: When building locktorture test into kernel image, it keeps printing out stats information even though there is no lock type specified. There is already verbose parameter to control print, but it is read-only, so it can't be changed at runtime. Make verbose read-write and control stats print. Signed-off-by: Yang Shi <yang@linaro.org> Interesting change! But just out of curiosity, when you boot with locktorture built in, do you specify the shutdown_secs boot parameter? If so, another No, just use the default value, which is 0 for shutdown_secs. approach would be to shutdown immediately upon detecting an error during initialization. In my case, it looks there is not error involved. You said that there is no lock type specified, but that should mean that the default ("spin_lock") is chosen. If so, I would expect it to just Yes, spin_lock is chosen by default. do the test, at least if locktorture.torture_runnable has been set. But, the default value of torture_runnable is 0. And, it is readonly parameter too. This prevents torture from running if it is built into kernel instead of module. Actually, I'm confused why there is not LOCK_TORTURE_TEST_RUNNABLE Kconfig like RCU torture? Thanks, Yang Either way, the usual way to make locktorture shut up would be to boot with locktorture.stat_interval=0. If not, I would like to know more about your use case. In my test, I just built locktorture test into kernel instead of a module then check how it behaves, no specific purpose. It sounds like not a normal approach to use it. Agreed, I do believe that this is a case of "working as designed". Thanx, Paul
Re: [PATCH] locktorture: make verbose writable and control stats print
On 4/15/2016 5:09 PM, Paul E. McKenney wrote: On Fri, Apr 15, 2016 at 04:45:32PM -0700, Shi, Yang wrote: On 4/15/2016 4:26 PM, Paul E. McKenney wrote: On Fri, Apr 15, 2016 at 01:28:11PM -0700, Yang Shi wrote: When building locktorture test into kernel image, it keeps printing out stats information even though there is no lock type specified. There is already verbose parameter to control print, but it is read-only, so it can't be changed at runtime. Make verbose read-write and control stats print. Signed-off-by: Yang Shi Interesting change! But just out of curiosity, when you boot with locktorture built in, do you specify the shutdown_secs boot parameter? If so, another No, just use the default value, which is 0 for shutdown_secs. approach would be to shutdown immediately upon detecting an error during initialization. In my case, it looks there is not error involved. You said that there is no lock type specified, but that should mean that the default ("spin_lock") is chosen. If so, I would expect it to just Yes, spin_lock is chosen by default. do the test, at least if locktorture.torture_runnable has been set. But, the default value of torture_runnable is 0. And, it is readonly parameter too. This prevents torture from running if it is built into kernel instead of module. Actually, I'm confused why there is not LOCK_TORTURE_TEST_RUNNABLE Kconfig like RCU torture? Thanks, Yang Either way, the usual way to make locktorture shut up would be to boot with locktorture.stat_interval=0. If not, I would like to know more about your use case. In my test, I just built locktorture test into kernel instead of a module then check how it behaves, no specific purpose. It sounds like not a normal approach to use it. Agreed, I do believe that this is a case of "working as designed". Thanx, Paul
Re: [PATCH] locktorture: make verbose writable and control stats print
On 4/15/2016 4:26 PM, Paul E. McKenney wrote: On Fri, Apr 15, 2016 at 01:28:11PM -0700, Yang Shi wrote: When building locktorture test into kernel image, it keeps printing out stats information even though there is no lock type specified. There is already verbose parameter to control print, but it is read-only, so it can't be changed at runtime. Make verbose read-write and control stats print. Signed-off-by: Yang Shi <yang@linaro.org> Interesting change! But just out of curiosity, when you boot with locktorture built in, do you specify the shutdown_secs boot parameter? If so, another No, just use the default value, which is 0 for shutdown_secs. approach would be to shutdown immediately upon detecting an error during initialization. In my case, it looks there is not error involved. If not, I would like to know more about your use case. In my test, I just built locktorture test into kernel instead of a module then check how it behaves, no specific purpose. It sounds like not a normal approach to use it. Thanks, Yang Thanx, Paul --- include/linux/torture.h | 2 ++ kernel/locking/locktorture.c | 11 +++ 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/include/linux/torture.h b/include/linux/torture.h index 7759fc3..86d6e54 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -48,6 +48,8 @@ do { if (verbose) pr_alert("%s" TORTURE_FLAG " %s\n", torture_type, s); } while (0) #define VERBOSE_TOROUT_ERRSTRING(s) \ do { if (verbose) pr_alert("%s" TORTURE_FLAG "!!! %s\n", torture_type, s); } while (0) +#define VERBOSE_STRING(s) \ + do { if (verbose) pr_alert("%s", s); } while (0) /* Definitions for online/offline exerciser. 
*/ int torture_onoff_init(long ooholdoff, long oointerval); diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 8ef1919..d9838a3 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -55,8 +55,11 @@ torture_param(int, shutdown_secs, 0, "Shutdown time (j), <= zero to disable."); torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s"); torture_param(int, stutter, 5, "Number of jiffies to run/halt test, 0=disable"); -torture_param(bool, verbose, true, -"Enable verbose debugging printk()s"); + +static bool verbose = true; +module_param(verbose, bool, 0644); +MODULE_PARM_DESC(verbose, +"Enable verbose debugging printk()s"); static char *torture_type = "spin_lock"; module_param(torture_type, charp, 0444); @@ -693,7 +696,7 @@ static void lock_torture_stats_print(void) } __torture_print_stats(buf, cxt.lwsa, true); - pr_alert("%s", buf); + VERBOSE_STRING(buf); kfree(buf); if (cxt.cur_ops->readlock) { @@ -705,7 +708,7 @@ static void lock_torture_stats_print(void) } __torture_print_stats(buf, cxt.lrsa, false); - pr_alert("%s", buf); + VERBOSE_STRING(buf); kfree(buf); } } -- 2.0.2
Re: [PATCH] locktorture: make verbose writable and control stats print
On 4/15/2016 4:26 PM, Paul E. McKenney wrote: On Fri, Apr 15, 2016 at 01:28:11PM -0700, Yang Shi wrote: When building locktorture test into kernel image, it keeps printing out stats information even though there is no lock type specified. There is already verbose parameter to control print, but it is read-only, so it can't be changed at runtime. Make verbose read-write and control stats print. Signed-off-by: Yang Shi Interesting change! But just out of curiosity, when you boot with locktorture built in, do you specify the shutdown_secs boot parameter? If so, another No, just use the default value, which is 0 for shutdown_secs. approach would be to shutdown immediately upon detecting an error during initialization. In my case, it looks there is not error involved. If not, I would like to know more about your use case. In my test, I just built locktorture test into kernel instead of a module then check how it behaves, no specific purpose. It sounds like not a normal approach to use it. Thanks, Yang Thanx, Paul --- include/linux/torture.h | 2 ++ kernel/locking/locktorture.c | 11 +++ 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/include/linux/torture.h b/include/linux/torture.h index 7759fc3..86d6e54 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -48,6 +48,8 @@ do { if (verbose) pr_alert("%s" TORTURE_FLAG " %s\n", torture_type, s); } while (0) #define VERBOSE_TOROUT_ERRSTRING(s) \ do { if (verbose) pr_alert("%s" TORTURE_FLAG "!!! %s\n", torture_type, s); } while (0) +#define VERBOSE_STRING(s) \ + do { if (verbose) pr_alert("%s", s); } while (0) /* Definitions for online/offline exerciser. 
*/ int torture_onoff_init(long ooholdoff, long oointerval); diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 8ef1919..d9838a3 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -55,8 +55,11 @@ torture_param(int, shutdown_secs, 0, "Shutdown time (j), <= zero to disable."); torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s"); torture_param(int, stutter, 5, "Number of jiffies to run/halt test, 0=disable"); -torture_param(bool, verbose, true, -"Enable verbose debugging printk()s"); + +static bool verbose = true; +module_param(verbose, bool, 0644); +MODULE_PARM_DESC(verbose, +"Enable verbose debugging printk()s"); static char *torture_type = "spin_lock"; module_param(torture_type, charp, 0444); @@ -693,7 +696,7 @@ static void lock_torture_stats_print(void) } __torture_print_stats(buf, cxt.lwsa, true); - pr_alert("%s", buf); + VERBOSE_STRING(buf); kfree(buf); if (cxt.cur_ops->readlock) { @@ -705,7 +708,7 @@ static void lock_torture_stats_print(void) } __torture_print_stats(buf, cxt.lrsa, false); - pr_alert("%s", buf); + VERBOSE_STRING(buf); kfree(buf); } } -- 2.0.2
Re: [PATCH] arm64: Kconfig: make SCHED_MC and SCHED_SMT depend on SMP
On 4/14/2016 1:47 AM, Will Deacon wrote: On Wed, Apr 13, 2016 at 05:54:12PM -0700, Yang Shi wrote: SCHED_MC and SCHED_SMT are pointless when SMP is disabled. Although SMP is rarely disabled for ARM64, it looks more consistent to have such a dependency in Kconfig. You can't disable CONFIG_SMP for arm64 -- we select it unconditionally in the kconfig. Thanks Will. I didn't realize ARM64 has SMP selected unconditionally, so it looks like the patch is pointless. A follow-up question: I know ARM64 has no UP implementation now, so it sounds like it makes sense to have SMP selected unconditionally; however, it might be more flexible to have SMP configurable like x86 and other architectures. And, it may also help to find more bugs when !SMP. Thanks, Yang Will
Re: [PATCH] arm64: Kconfig: make SCHED_MC and SCHED_SMT depend on SMP
On 4/14/2016 1:47 AM, Will Deacon wrote: On Wed, Apr 13, 2016 at 05:54:12PM -0700, Yang Shi wrote: SCHED_MC and SCHED_SMT are pointless when SMP is disabled. Although SMP is rarely disabled for ARM64, it looks more consistent to have such a dependency in Kconfig. You can't disable CONFIG_SMP for arm64 -- we select it unconditionally in the kconfig. Thanks Will. I didn't realize ARM64 has SMP selected unconditionally, so it looks like the patch is pointless. A follow-up question: I know ARM64 has no UP implementation now, so it sounds like it makes sense to have SMP selected unconditionally; however, it might be more flexible to have SMP configurable like x86 and other architectures. And, it may also help to find more bugs when !SMP. Thanks, Yang Will
Re: [RFC V4 PATCH] trace: writeback: replace cgroup path to cgroup ino
Hi Steven, Any comment on this one? I suppose it will be taken by your tracing tree? Thanks, Yang On 3/4/2016 4:52 AM, Tejun Heo wrote: On Thu, Mar 03, 2016 at 01:08:57AM -0800, Yang Shi wrote: commit 5634cc2aa9aebc77bc862992e7805469dcf83dac ("writeback: update writeback tracepoints to report cgroup") made writeback tracepoints print out cgroup path when CGROUP_WRITEBACK is enabled, but it may trigger the below bug on -rt kernel since kernfs_path and kernfs_path_len are called by tracepoints, which acquire spin lock that is sleepable on -rt kernel. Acked-by: Tejun Heo Thanks.
Re: [RFC V4 PATCH] trace: writeback: replace cgroup path to cgroup ino
Hi Steven, Any comment on this one? I suppose it will be taken by your tracing tree? Thanks, Yang On 3/4/2016 4:52 AM, Tejun Heo wrote: On Thu, Mar 03, 2016 at 01:08:57AM -0800, Yang Shi wrote: commit 5634cc2aa9aebc77bc862992e7805469dcf83dac ("writeback: update writeback tracepoints to report cgroup") made writeback tracepoints print out cgroup path when CGROUP_WRITEBACK is enabled, but it may trigger the below bug on -rt kernel since kernfs_path and kernfs_path_len are called by tracepoints, which acquire spin lock that is sleepable on -rt kernel. Acked-by: Tejun Heo Thanks.