Re: Re: [PATCH v2] arm64: kasan: add interceptors for strcmp/strncmp functions
On 2018-09-05 오전 1:24, Andrey Ryabinin wrote: On 09/04/2018 01:10 PM, Andrey Ryabinin wrote: > > > On 09/04/2018 09:59 AM, Kyeongdon Kim wrote: > >>>> +#undef strncmp >>>> +int strncmp(const char *cs, const char *ct, size_t len) >>>> +{ >>>> + check_memory_region((unsigned long)cs, len, false, _RET_IP_); >>>> + check_memory_region((unsigned long)ct, len, false, _RET_IP_); >>> >>> This will cause false positives. Both 'cs', and 'ct' could be less than len bytes. >>> >>> There is no need in these interceptors, just use the C implementations from lib/string.c >>> like you did in your first patch. >>> The only thing that was wrong in the first patch is that assembly implementations >>> were compiled out instead of being declared weak. >>> >> Well, at first I thought so.. >> I would remove diff code in /mm/kasan/kasan.c then use C implementations in lib/string.c >> w/ assem implementations as weak : >> >> diff --git a/lib/string.c b/lib/string.c >> index 2c0900a..a18b18f 100644 >> --- a/lib/string.c >> +++ b/lib/string.c >> @@ -312,7 +312,7 @@ size_t strlcat(char *dest, const char *src, size_t count) >> EXPORT_SYMBOL(strlcat); >> #endif >> >> -#ifndef __HAVE_ARCH_STRCMP >> +#if (defined(CONFIG_ARM64) && defined(CONFIG_KASAN)) || !defined(__HAVE_ARCH_STRCMP) > > No. What part of "like you did in your first patch" is unclear to you? Just to be absolutely clear, I meant #ifdef out __HAVE_ARCH_* defines like it has been done in this patch http://lkml.kernel.org/r/<1534233322-106271-1-git-send-email-kyeongdon@lge.com> I understood what you're saying, but I might think the wrong patch. So, thinking about the other way as below: can pick up assem variant or c one, declare them as weak. 
--- diff --git a/arch/arm64/include/asm/string.h b/arch/arm64/include/asm/string.h index dd95d33..53a2ae0 100644 --- a/arch/arm64/include/asm/string.h +++ b/arch/arm64/include/asm/string.h @@ -22,11 +22,22 @@ extern char *strrchr(const char *, int c); #define __HAVE_ARCH_STRCHR extern char *strchr(const char *, int c); +#ifdef CONFIG_KASAN +extern int __strcmp(const char *, const char *); +extern int __strncmp(const char *, const char *, __kernel_size_t); + +#ifndef __SANITIZE_ADDRESS__ +#define strcmp(cs, ct) __strcmp(cs, ct) +#define strncmp(cs, ct, n) __strncmp(cs, ct, n) +#endif + +#else #define __HAVE_ARCH_STRCMP extern int strcmp(const char *, const char *); #define __HAVE_ARCH_STRNCMP extern int strncmp(const char *, const char *, __kernel_size_t); +#endif #define __HAVE_ARCH_STRLEN extern __kernel_size_t strlen(const char *); diff --git a/arch/arm64/kernel/arm64ksyms.c b/arch/arm64/kernel/arm64ksyms.c index d894a20..9aeffd5 100644 --- a/arch/arm64/kernel/arm64ksyms.c +++ b/arch/arm64/kernel/arm64ksyms.c @@ -50,6 +50,10 @@ EXPORT_SYMBOL(strcmp); EXPORT_SYMBOL(strncmp); EXPORT_SYMBOL(strlen); EXPORT_SYMBOL(strnlen); +#ifdef CONFIG_KASAN +EXPORT_SYMBOL(__strcmp); +EXPORT_SYMBOL(__strncmp); +#endif EXPORT_SYMBOL(memset); EXPORT_SYMBOL(memcpy); EXPORT_SYMBOL(memmove); diff --git a/arch/arm64/kernel/image.h b/arch/arm64/kernel/image.h index a820ed0..5ef7a57 100644 --- a/arch/arm64/kernel/image.h +++ b/arch/arm64/kernel/image.h @@ -110,6 +110,8 @@ __efistub___flush_dcache_area = KALLSYMS_HIDE(__pi___flush_dcache_area); __efistub___memcpy = KALLSYMS_HIDE(__pi_memcpy); __efistub___memmove = KALLSYMS_HIDE(__pi_memmove); __efistub___memset = KALLSYMS_HIDE(__pi_memset); +__efistub___strcmp = KALLSYMS_HIDE(__pi_strcmp); +__efistub___strncmp = KALLSYMS_HIDE(__pi_strncmp); #endif __efistub__text = KALLSYMS_HIDE(_text); diff --git a/arch/arm64/lib/strcmp.S b/arch/arm64/lib/strcmp.S index 471fe61..0dffef7 100644 --- a/arch/arm64/lib/strcmp.S +++ b/arch/arm64/lib/strcmp.S @@ 
-60,6 +60,8 @@ tmp3 .req x9 zeroones .req x10 pos .req x11 +.weak strcmp +ENTRY(__strcmp) ENTRY(strcmp) eor tmp1, src1, src2 mov zeroones, #REP8_01 @@ -232,3 +234,4 @@ CPU_BE( orr syndrome, diff, has_nul ) sub result, data1, data2, lsr #56 ret ENDPIPROC(strcmp) +ENDPROC(__strcmp) diff --git a/arch/arm64/lib/strncmp.S b/arch/arm64/lib/strncmp.S index e267044..b2648c7 100644 --- a/arch/arm64/lib/strncmp.S +++ b/arch/arm64/lib/strncmp.S @@ -64,6 +64,8 @@ limit_wd .req x13 mask .req x14 endloop .req x15 +.weak strncmp +ENTRY(__strncmp) ENTRY(strncmp) cbz limit, .Lret0 eor tmp1, src1, src2 @@ -308,3 +310,4 @@ CPU_BE( orr syndrome, diff, has_nul ) mov result, #0 ret ENDPIPROC(strncmp) +ENDPROC(__strncmp) -- Could you review this diff?
Re: Re: [PATCH v2] arm64: kasan: add interceptors for strcmp/strncmp functions
On 2018-09-05 오전 1:24, Andrey Ryabinin wrote: On 09/04/2018 01:10 PM, Andrey Ryabinin wrote: > > > On 09/04/2018 09:59 AM, Kyeongdon Kim wrote: > >>>> +#undef strncmp >>>> +int strncmp(const char *cs, const char *ct, size_t len) >>>> +{ >>>> + check_memory_region((unsigned long)cs, len, false, _RET_IP_); >>>> + check_memory_region((unsigned long)ct, len, false, _RET_IP_); >>> >>> This will cause false positives. Both 'cs', and 'ct' could be less than len bytes. >>> >>> There is no need in these interceptors, just use the C implementations from lib/string.c >>> like you did in your first patch. >>> The only thing that was wrong in the first patch is that assembly implementations >>> were compiled out instead of being declared weak. >>> >> Well, at first I thought so.. >> I would remove diff code in /mm/kasan/kasan.c then use C implementations in lib/string.c >> w/ assem implementations as weak : >> >> diff --git a/lib/string.c b/lib/string.c >> index 2c0900a..a18b18f 100644 >> --- a/lib/string.c >> +++ b/lib/string.c >> @@ -312,7 +312,7 @@ size_t strlcat(char *dest, const char *src, size_t count) >> EXPORT_SYMBOL(strlcat); >> #endif >> >> -#ifndef __HAVE_ARCH_STRCMP >> +#if (defined(CONFIG_ARM64) && defined(CONFIG_KASAN)) || !defined(__HAVE_ARCH_STRCMP) > > No. What part of "like you did in your first patch" is unclear to you? Just to be absolutely clear, I meant #ifdef out __HAVE_ARCH_* defines like it has been done in this patch http://lkml.kernel.org/r/<1534233322-106271-1-git-send-email-kyeongdon@lge.com> I understood what you're saying, but I might think the wrong patch. So, thinking about the other way as below: can pick up assem variant or c one, declare them as weak. 
--- diff --git a/arch/arm64/include/asm/string.h b/arch/arm64/include/asm/string.h index dd95d33..53a2ae0 100644 --- a/arch/arm64/include/asm/string.h +++ b/arch/arm64/include/asm/string.h @@ -22,11 +22,22 @@ extern char *strrchr(const char *, int c); #define __HAVE_ARCH_STRCHR extern char *strchr(const char *, int c); +#ifdef CONFIG_KASAN +extern int __strcmp(const char *, const char *); +extern int __strncmp(const char *, const char *, __kernel_size_t); + +#ifndef __SANITIZE_ADDRESS__ +#define strcmp(cs, ct) __strcmp(cs, ct) +#define strncmp(cs, ct, n) __strncmp(cs, ct, n) +#endif + +#else #define __HAVE_ARCH_STRCMP extern int strcmp(const char *, const char *); #define __HAVE_ARCH_STRNCMP extern int strncmp(const char *, const char *, __kernel_size_t); +#endif #define __HAVE_ARCH_STRLEN extern __kernel_size_t strlen(const char *); diff --git a/arch/arm64/kernel/arm64ksyms.c b/arch/arm64/kernel/arm64ksyms.c index d894a20..9aeffd5 100644 --- a/arch/arm64/kernel/arm64ksyms.c +++ b/arch/arm64/kernel/arm64ksyms.c @@ -50,6 +50,10 @@ EXPORT_SYMBOL(strcmp); EXPORT_SYMBOL(strncmp); EXPORT_SYMBOL(strlen); EXPORT_SYMBOL(strnlen); +#ifdef CONFIG_KASAN +EXPORT_SYMBOL(__strcmp); +EXPORT_SYMBOL(__strncmp); +#endif EXPORT_SYMBOL(memset); EXPORT_SYMBOL(memcpy); EXPORT_SYMBOL(memmove); diff --git a/arch/arm64/kernel/image.h b/arch/arm64/kernel/image.h index a820ed0..5ef7a57 100644 --- a/arch/arm64/kernel/image.h +++ b/arch/arm64/kernel/image.h @@ -110,6 +110,8 @@ __efistub___flush_dcache_area = KALLSYMS_HIDE(__pi___flush_dcache_area); __efistub___memcpy = KALLSYMS_HIDE(__pi_memcpy); __efistub___memmove = KALLSYMS_HIDE(__pi_memmove); __efistub___memset = KALLSYMS_HIDE(__pi_memset); +__efistub___strcmp = KALLSYMS_HIDE(__pi_strcmp); +__efistub___strncmp = KALLSYMS_HIDE(__pi_strncmp); #endif __efistub__text = KALLSYMS_HIDE(_text); diff --git a/arch/arm64/lib/strcmp.S b/arch/arm64/lib/strcmp.S index 471fe61..0dffef7 100644 --- a/arch/arm64/lib/strcmp.S +++ b/arch/arm64/lib/strcmp.S @@ 
-60,6 +60,8 @@ tmp3 .req x9 zeroones .req x10 pos .req x11 +.weak strcmp +ENTRY(__strcmp) ENTRY(strcmp) eor tmp1, src1, src2 mov zeroones, #REP8_01 @@ -232,3 +234,4 @@ CPU_BE( orr syndrome, diff, has_nul ) sub result, data1, data2, lsr #56 ret ENDPIPROC(strcmp) +ENDPROC(__strcmp) diff --git a/arch/arm64/lib/strncmp.S b/arch/arm64/lib/strncmp.S index e267044..b2648c7 100644 --- a/arch/arm64/lib/strncmp.S +++ b/arch/arm64/lib/strncmp.S @@ -64,6 +64,8 @@ limit_wd .req x13 mask .req x14 endloop .req x15 +.weak strncmp +ENTRY(__strncmp) ENTRY(strncmp) cbz limit, .Lret0 eor tmp1, src1, src2 @@ -308,3 +310,4 @@ CPU_BE( orr syndrome, diff, has_nul ) mov result, #0 ret ENDPIPROC(strncmp) +ENDPROC(__strncmp) -- Could you review this diff?
Re: Re: [PATCH v2] arm64: kasan: add interceptors for strcmp/strncmp functions
Hello Andrey, Thanks for your review. On 2018-09-03 오후 6:40, Andrey Ryabinin wrote: On 08/23/2018 11:56 AM, Kyeongdon Kim wrote: > diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c > index c3bd520..61ad7f1 100644 > --- a/mm/kasan/kasan.c > +++ b/mm/kasan/kasan.c > @@ -304,6 +304,29 @@ void *memcpy(void *dest, const void *src, size_t len) > > return __memcpy(dest, src, len); > } > +#ifdef CONFIG_ARM64 > +/* > + * Arch arm64 use assembly variant for strcmp/strncmp, > + * xtensa use inline asm operations and x86_64 use c one, > + * so now this interceptors only for arm64 kasan. > + */ > +#undef strcmp > +int strcmp(const char *cs, const char *ct) > +{ > + check_memory_region((unsigned long)cs, 1, false, _RET_IP_); > + check_memory_region((unsigned long)ct, 1, false, _RET_IP_); > + Well this is definitely wrong. strcmp() often accesses far more than one byte. > + return __strcmp(cs, ct); > +} > +#undef strncmp > +int strncmp(const char *cs, const char *ct, size_t len) > +{ > + check_memory_region((unsigned long)cs, len, false, _RET_IP_); > + check_memory_region((unsigned long)ct, len, false, _RET_IP_); This will cause false positives. Both 'cs', and 'ct' could be less than len bytes. There is no need in these interceptors, just use the C implementations from lib/string.c like you did in your first patch. The only thing that was wrong in the first patch is that assembly implementations were compiled out instead of being declared weak. Well, at first I thought so.. 
I would remove diff code in /mm/kasan/kasan.c then use C implementations in lib/string.c w/ assem implementations as weak : diff --git a/lib/string.c b/lib/string.c index 2c0900a..a18b18f 100644 --- a/lib/string.c +++ b/lib/string.c @@ -312,7 +312,7 @@ size_t strlcat(char *dest, const char *src, size_t count) EXPORT_SYMBOL(strlcat); #endif -#ifndef __HAVE_ARCH_STRCMP +#if (defined(CONFIG_ARM64) && defined(CONFIG_KASAN)) || !defined(__HAVE_ARCH_STRCMP) /** * strcmp - Compare two strings * @cs: One string @@ -336,7 +336,7 @@ int strcmp(const char *cs, const char *ct) EXPORT_SYMBOL(strcmp); #endif -#ifndef __HAVE_ARCH_STRNCMP +#if (defined(CONFIG_ARM64) && defined(CONFIG_KASAN)) || !defined(__HAVE_ARCH_STRNCMP) /** * strncmp - Compare two length-limited strings Can I get your opinion wrt this ? Thanks,
Re: Re: [PATCH v2] arm64: kasan: add interceptors for strcmp/strncmp functions
Hello Andrey, Thanks for your review. On 2018-09-03 오후 6:40, Andrey Ryabinin wrote: On 08/23/2018 11:56 AM, Kyeongdon Kim wrote: > diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c > index c3bd520..61ad7f1 100644 > --- a/mm/kasan/kasan.c > +++ b/mm/kasan/kasan.c > @@ -304,6 +304,29 @@ void *memcpy(void *dest, const void *src, size_t len) > > return __memcpy(dest, src, len); > } > +#ifdef CONFIG_ARM64 > +/* > + * Arch arm64 use assembly variant for strcmp/strncmp, > + * xtensa use inline asm operations and x86_64 use c one, > + * so now this interceptors only for arm64 kasan. > + */ > +#undef strcmp > +int strcmp(const char *cs, const char *ct) > +{ > + check_memory_region((unsigned long)cs, 1, false, _RET_IP_); > + check_memory_region((unsigned long)ct, 1, false, _RET_IP_); > + Well this is definitely wrong. strcmp() often accesses far more than one byte. > + return __strcmp(cs, ct); > +} > +#undef strncmp > +int strncmp(const char *cs, const char *ct, size_t len) > +{ > + check_memory_region((unsigned long)cs, len, false, _RET_IP_); > + check_memory_region((unsigned long)ct, len, false, _RET_IP_); This will cause false positives. Both 'cs', and 'ct' could be less than len bytes. There is no need in these interceptors, just use the C implementations from lib/string.c like you did in your first patch. The only thing that was wrong in the first patch is that assembly implementations were compiled out instead of being declared weak. Well, at first I thought so.. 
I would remove diff code in /mm/kasan/kasan.c then use C implementations in lib/string.c w/ assem implementations as weak : diff --git a/lib/string.c b/lib/string.c index 2c0900a..a18b18f 100644 --- a/lib/string.c +++ b/lib/string.c @@ -312,7 +312,7 @@ size_t strlcat(char *dest, const char *src, size_t count) EXPORT_SYMBOL(strlcat); #endif -#ifndef __HAVE_ARCH_STRCMP +#if (defined(CONFIG_ARM64) && defined(CONFIG_KASAN)) || !defined(__HAVE_ARCH_STRCMP) /** * strcmp - Compare two strings * @cs: One string @@ -336,7 +336,7 @@ int strcmp(const char *cs, const char *ct) EXPORT_SYMBOL(strcmp); #endif -#ifndef __HAVE_ARCH_STRNCMP +#if (defined(CONFIG_ARM64) && defined(CONFIG_KASAN)) || !defined(__HAVE_ARCH_STRNCMP) /** * strncmp - Compare two length-limited strings Can I get your opinion wrt this ? Thanks,
Re: Re: [PATCH v2] arm64: kasan: add interceptors for strcmp/strncmp functions
Hello Dmitry, On 2018-09-03 오후 6:13, Dmitry Vyukov wrote: On Mon, Sep 3, 2018 at 11:02 AM, Kyeongdon Kim wrote: > Dear all, > > Could anyone review this and provide me appropriate approach ? > I think str[n]cmp are frequently used functions so I believe very useful w/ > arm64 KASAN. Hi Kyeongdon, Please add tests for this to lib/test_kasan.c. I'll add tests for this patch after next version upload. Thanks, >> >> This patch declares strcmp/strncmp as weak symbols. >> (2 of them are the most used string operations) >> >> Original functions declared as weak and >> strong ones in mm/kasan/kasan.c could replace them. >> >> Assembly optimized strcmp/strncmp functions cannot detect KASan bug. >> But, now we can detect them like the call trace below. >> >> == >> BUG: KASAN: use-after-free in platform_match+0x1c/0x5c at addr >> ffc0ad313500 >> Read of size 1 by task swapper/0/1 >> CPU: 3 PID: 1 Comm: swapper/0 Tainted: G B 4.9.77+ #1 >> Hardware name: Generic (DT) based system >> Call trace: >> dump_backtrace+0x0/0x2e0 >> show_stack+0x14/0x1c >> dump_stack+0x88/0xb0 >> kasan_object_err+0x24/0x7c >> kasan_report+0x2f0/0x484 >> check_memory_region+0x20/0x14c >> strcmp+0x1c/0x5c >> platform_match+0x40/0xe4 >> __driver_attach+0x40/0x130 >> bus_for_each_dev+0xc4/0xe0 >> driver_attach+0x30/0x3c >> bus_add_driver+0x2dc/0x328 >> driver_register+0x118/0x160 >> __platform_driver_register+0x7c/0x88 >> alarmtimer_init+0x154/0x1e4 >> do_one_initcall+0x184/0x1a4 >> kernel_init_freeable+0x2ec/0x2f0 >> kernel_init+0x18/0x10c >> ret_from_fork+0x10/0x50 >> >> In case of xtensa and x86_64 kasan, no need to use this patch now. 
>> >> Signed-off-by: Kyeongdon Kim >> --- >> arch/arm64/include/asm/string.h | 5 + >> arch/arm64/kernel/arm64ksyms.c | 2 ++ >> arch/arm64/kernel/image.h | 2 ++ >> arch/arm64/lib/strcmp.S | 3 +++ >> arch/arm64/lib/strncmp.S | 3 +++ >> mm/kasan/kasan.c | 23 +++ >> 6 files changed, 38 insertions(+) >> >> diff --git a/arch/arm64/include/asm/string.h >> b/arch/arm64/include/asm/string.h >> index dd95d33..ab60349 100644 >> --- a/arch/arm64/include/asm/string.h >> +++ b/arch/arm64/include/asm/string.h >> @@ -24,9 +24,11 @@ extern char *strchr(const char *, int c); >> >> #define __HAVE_ARCH_STRCMP >> extern int strcmp(const char *, const char *); >> +extern int __strcmp(const char *, const char *); >> >> #define __HAVE_ARCH_STRNCMP >> extern int strncmp(const char *, const char *, __kernel_size_t); >> +extern int __strncmp(const char *, const char *, __kernel_size_t); >> >> #define __HAVE_ARCH_STRLEN >> extern __kernel_size_t strlen(const char *); >> @@ -68,6 +70,9 @@ void memcpy_flushcache(void *dst, const void *src, >> size_t cnt); >> #define memmove(dst, src, len) __memmove(dst, src, len) >> #define memset(s, c, n) __memset(s, c, n) >> >> +#define strcmp(cs, ct) __strcmp(cs, ct) >> +#define strncmp(cs, ct, n) __strncmp(cs, ct, n) >> + >> #ifndef __NO_FORTIFY >> #define __NO_FORTIFY /* FORTIFY_SOURCE uses __builtin_memcpy, etc. 
*/ >> #endif >> diff --git a/arch/arm64/kernel/arm64ksyms.c >> b/arch/arm64/kernel/arm64ksyms.c >> index d894a20..10b1164 100644 >> --- a/arch/arm64/kernel/arm64ksyms.c >> +++ b/arch/arm64/kernel/arm64ksyms.c >> @@ -50,6 +50,8 @@ EXPORT_SYMBOL(strcmp); >> EXPORT_SYMBOL(strncmp); >> EXPORT_SYMBOL(strlen); >> EXPORT_SYMBOL(strnlen); >> +EXPORT_SYMBOL(__strcmp); >> +EXPORT_SYMBOL(__strncmp); >> EXPORT_SYMBOL(memset); >> EXPORT_SYMBOL(memcpy); >> EXPORT_SYMBOL(memmove); >> diff --git a/arch/arm64/kernel/image.h b/arch/arm64/kernel/image.h >> index a820ed0..5ef7a57 100644 >> --- a/arch/arm64/kernel/image.h >> +++ b/arch/arm64/kernel/image.h >> @@ -110,6 +110,8 @@ __efistub___flush_dcache_area = >> KALLSYMS_HIDE(__pi___flush_dcache_area); >> __efistub___memcpy = KALLSYMS_HIDE(__pi_memcpy); >> __efistub___memmove = KALLSYMS_HIDE(__pi_memmove); >> __efistub___memset = KALLSYMS_HIDE(__pi_memset); >> +__efistub___strcmp = KALLSYMS_HIDE(__pi_strcmp); >> +__efistub___strncmp = KALLSYMS_HIDE(__pi_strncmp); >> #endif >> >> __efistub__text = KALLSYMS_HIDE(_text); >> diff --git a/arch/arm64/lib/strcmp.S b/arch/arm64/lib/strcmp.S >> index 471fe61..0dffef7 100644 >> --- a/arch/arm64/lib/strcmp.S >> +++ b/arch/arm64/lib/strcmp.S >> @@ -60,6 +
Re: Re: [PATCH v2] arm64: kasan: add interceptors for strcmp/strncmp functions
Hello Dmitry, On 2018-09-03 오후 6:13, Dmitry Vyukov wrote: On Mon, Sep 3, 2018 at 11:02 AM, Kyeongdon Kim wrote: > Dear all, > > Could anyone review this and provide me appropriate approach ? > I think str[n]cmp are frequently used functions so I believe very useful w/ > arm64 KASAN. Hi Kyeongdon, Please add tests for this to lib/test_kasan.c. I'll add tests for this patch after next version upload. Thanks, >> >> This patch declares strcmp/strncmp as weak symbols. >> (2 of them are the most used string operations) >> >> Original functions declared as weak and >> strong ones in mm/kasan/kasan.c could replace them. >> >> Assembly optimized strcmp/strncmp functions cannot detect KASan bug. >> But, now we can detect them like the call trace below. >> >> == >> BUG: KASAN: use-after-free in platform_match+0x1c/0x5c at addr >> ffc0ad313500 >> Read of size 1 by task swapper/0/1 >> CPU: 3 PID: 1 Comm: swapper/0 Tainted: G B 4.9.77+ #1 >> Hardware name: Generic (DT) based system >> Call trace: >> dump_backtrace+0x0/0x2e0 >> show_stack+0x14/0x1c >> dump_stack+0x88/0xb0 >> kasan_object_err+0x24/0x7c >> kasan_report+0x2f0/0x484 >> check_memory_region+0x20/0x14c >> strcmp+0x1c/0x5c >> platform_match+0x40/0xe4 >> __driver_attach+0x40/0x130 >> bus_for_each_dev+0xc4/0xe0 >> driver_attach+0x30/0x3c >> bus_add_driver+0x2dc/0x328 >> driver_register+0x118/0x160 >> __platform_driver_register+0x7c/0x88 >> alarmtimer_init+0x154/0x1e4 >> do_one_initcall+0x184/0x1a4 >> kernel_init_freeable+0x2ec/0x2f0 >> kernel_init+0x18/0x10c >> ret_from_fork+0x10/0x50 >> >> In case of xtensa and x86_64 kasan, no need to use this patch now. 
>> >> Signed-off-by: Kyeongdon Kim >> --- >> arch/arm64/include/asm/string.h | 5 + >> arch/arm64/kernel/arm64ksyms.c | 2 ++ >> arch/arm64/kernel/image.h | 2 ++ >> arch/arm64/lib/strcmp.S | 3 +++ >> arch/arm64/lib/strncmp.S | 3 +++ >> mm/kasan/kasan.c | 23 +++ >> 6 files changed, 38 insertions(+) >> >> diff --git a/arch/arm64/include/asm/string.h >> b/arch/arm64/include/asm/string.h >> index dd95d33..ab60349 100644 >> --- a/arch/arm64/include/asm/string.h >> +++ b/arch/arm64/include/asm/string.h >> @@ -24,9 +24,11 @@ extern char *strchr(const char *, int c); >> >> #define __HAVE_ARCH_STRCMP >> extern int strcmp(const char *, const char *); >> +extern int __strcmp(const char *, const char *); >> >> #define __HAVE_ARCH_STRNCMP >> extern int strncmp(const char *, const char *, __kernel_size_t); >> +extern int __strncmp(const char *, const char *, __kernel_size_t); >> >> #define __HAVE_ARCH_STRLEN >> extern __kernel_size_t strlen(const char *); >> @@ -68,6 +70,9 @@ void memcpy_flushcache(void *dst, const void *src, >> size_t cnt); >> #define memmove(dst, src, len) __memmove(dst, src, len) >> #define memset(s, c, n) __memset(s, c, n) >> >> +#define strcmp(cs, ct) __strcmp(cs, ct) >> +#define strncmp(cs, ct, n) __strncmp(cs, ct, n) >> + >> #ifndef __NO_FORTIFY >> #define __NO_FORTIFY /* FORTIFY_SOURCE uses __builtin_memcpy, etc. 
*/ >> #endif >> diff --git a/arch/arm64/kernel/arm64ksyms.c >> b/arch/arm64/kernel/arm64ksyms.c >> index d894a20..10b1164 100644 >> --- a/arch/arm64/kernel/arm64ksyms.c >> +++ b/arch/arm64/kernel/arm64ksyms.c >> @@ -50,6 +50,8 @@ EXPORT_SYMBOL(strcmp); >> EXPORT_SYMBOL(strncmp); >> EXPORT_SYMBOL(strlen); >> EXPORT_SYMBOL(strnlen); >> +EXPORT_SYMBOL(__strcmp); >> +EXPORT_SYMBOL(__strncmp); >> EXPORT_SYMBOL(memset); >> EXPORT_SYMBOL(memcpy); >> EXPORT_SYMBOL(memmove); >> diff --git a/arch/arm64/kernel/image.h b/arch/arm64/kernel/image.h >> index a820ed0..5ef7a57 100644 >> --- a/arch/arm64/kernel/image.h >> +++ b/arch/arm64/kernel/image.h >> @@ -110,6 +110,8 @@ __efistub___flush_dcache_area = >> KALLSYMS_HIDE(__pi___flush_dcache_area); >> __efistub___memcpy = KALLSYMS_HIDE(__pi_memcpy); >> __efistub___memmove = KALLSYMS_HIDE(__pi_memmove); >> __efistub___memset = KALLSYMS_HIDE(__pi_memset); >> +__efistub___strcmp = KALLSYMS_HIDE(__pi_strcmp); >> +__efistub___strncmp = KALLSYMS_HIDE(__pi_strncmp); >> #endif >> >> __efistub__text = KALLSYMS_HIDE(_text); >> diff --git a/arch/arm64/lib/strcmp.S b/arch/arm64/lib/strcmp.S >> index 471fe61..0dffef7 100644 >> --- a/arch/arm64/lib/strcmp.S >> +++ b/arch/arm64/lib/strcmp.S >> @@ -60,6 +
Re: [PATCH v2] arm64: kasan: add interceptors for strcmp/strncmp functions
Dear all, Could anyone review this and provide me appropriate approach ? I think str[n]cmp are frequently used functions so I believe very useful w/ arm64 KASAN. Best Regards, Kyeongdon Kim On 2018-08-23 오후 5:56, Kyeongdon Kim wrote: This patch declares strcmp/strncmp as weak symbols. (2 of them are the most used string operations) Original functions declared as weak and strong ones in mm/kasan/kasan.c could replace them. Assembly optimized strcmp/strncmp functions cannot detect KASan bug. But, now we can detect them like the call trace below. == BUG: KASAN: use-after-free in platform_match+0x1c/0x5c at addr ffc0ad313500 Read of size 1 by task swapper/0/1 CPU: 3 PID: 1 Comm: swapper/0 Tainted: G B 4.9.77+ #1 Hardware name: Generic (DT) based system Call trace: dump_backtrace+0x0/0x2e0 show_stack+0x14/0x1c dump_stack+0x88/0xb0 kasan_object_err+0x24/0x7c kasan_report+0x2f0/0x484 check_memory_region+0x20/0x14c strcmp+0x1c/0x5c platform_match+0x40/0xe4 __driver_attach+0x40/0x130 bus_for_each_dev+0xc4/0xe0 driver_attach+0x30/0x3c bus_add_driver+0x2dc/0x328 driver_register+0x118/0x160 __platform_driver_register+0x7c/0x88 alarmtimer_init+0x154/0x1e4 do_one_initcall+0x184/0x1a4 kernel_init_freeable+0x2ec/0x2f0 kernel_init+0x18/0x10c ret_from_fork+0x10/0x50 In case of xtensa and x86_64 kasan, no need to use this patch now. 
Signed-off-by: Kyeongdon Kim --- arch/arm64/include/asm/string.h | 5 + arch/arm64/kernel/arm64ksyms.c | 2 ++ arch/arm64/kernel/image.h | 2 ++ arch/arm64/lib/strcmp.S | 3 +++ arch/arm64/lib/strncmp.S | 3 +++ mm/kasan/kasan.c | 23 +++ 6 files changed, 38 insertions(+) diff --git a/arch/arm64/include/asm/string.h b/arch/arm64/include/asm/string.h index dd95d33..ab60349 100644 --- a/arch/arm64/include/asm/string.h +++ b/arch/arm64/include/asm/string.h @@ -24,9 +24,11 @@ extern char *strchr(const char *, int c); #define __HAVE_ARCH_STRCMP extern int strcmp(const char *, const char *); +extern int __strcmp(const char *, const char *); #define __HAVE_ARCH_STRNCMP extern int strncmp(const char *, const char *, __kernel_size_t); +extern int __strncmp(const char *, const char *, __kernel_size_t); #define __HAVE_ARCH_STRLEN extern __kernel_size_t strlen(const char *); @@ -68,6 +70,9 @@ void memcpy_flushcache(void *dst, const void *src, size_t cnt); #define memmove(dst, src, len) __memmove(dst, src, len) #define memset(s, c, n) __memset(s, c, n) +#define strcmp(cs, ct) __strcmp(cs, ct) +#define strncmp(cs, ct, n) __strncmp(cs, ct, n) + #ifndef __NO_FORTIFY #define __NO_FORTIFY /* FORTIFY_SOURCE uses __builtin_memcpy, etc. 
*/ #endif diff --git a/arch/arm64/kernel/arm64ksyms.c b/arch/arm64/kernel/arm64ksyms.c index d894a20..10b1164 100644 --- a/arch/arm64/kernel/arm64ksyms.c +++ b/arch/arm64/kernel/arm64ksyms.c @@ -50,6 +50,8 @@ EXPORT_SYMBOL(strcmp); EXPORT_SYMBOL(strncmp); EXPORT_SYMBOL(strlen); EXPORT_SYMBOL(strnlen); +EXPORT_SYMBOL(__strcmp); +EXPORT_SYMBOL(__strncmp); EXPORT_SYMBOL(memset); EXPORT_SYMBOL(memcpy); EXPORT_SYMBOL(memmove); diff --git a/arch/arm64/kernel/image.h b/arch/arm64/kernel/image.h index a820ed0..5ef7a57 100644 --- a/arch/arm64/kernel/image.h +++ b/arch/arm64/kernel/image.h @@ -110,6 +110,8 @@ __efistub___flush_dcache_area = KALLSYMS_HIDE(__pi___flush_dcache_area); __efistub___memcpy = KALLSYMS_HIDE(__pi_memcpy); __efistub___memmove = KALLSYMS_HIDE(__pi_memmove); __efistub___memset = KALLSYMS_HIDE(__pi_memset); +__efistub___strcmp = KALLSYMS_HIDE(__pi_strcmp); +__efistub___strncmp = KALLSYMS_HIDE(__pi_strncmp); #endif __efistub__text = KALLSYMS_HIDE(_text); diff --git a/arch/arm64/lib/strcmp.S b/arch/arm64/lib/strcmp.S index 471fe61..0dffef7 100644 --- a/arch/arm64/lib/strcmp.S +++ b/arch/arm64/lib/strcmp.S @@ -60,6 +60,8 @@ tmp3 .req x9 zeroones .req x10 pos .req x11 +.weak strcmp +ENTRY(__strcmp) ENTRY(strcmp) eor tmp1, src1, src2 mov zeroones, #REP8_01 @@ -232,3 +234,4 @@ CPU_BE( orr syndrome, diff, has_nul ) sub result, data1, data2, lsr #56 ret ENDPIPROC(strcmp) +ENDPROC(__strcmp) diff --git a/arch/arm64/lib/strncmp.S b/arch/arm64/lib/strncmp.S index e267044..b2648c7 100644 --- a/arch/arm64/lib/strncmp.S +++ b/arch/arm64/lib/strncmp.S @@ -64,6 +64,8 @@ limit_wd .req x13 mask .req x14 endloop .req x15 +.weak strncmp +ENTRY(__strncmp) ENTRY(strncmp) cbz limit, .Lret0 eor tmp1, src1, src2 @@ -308,3 +310,4 @@ CPU_BE( orr syndrome, diff, has_nul ) mov result, #0 ret ENDPIPROC(strncmp) +ENDPROC(__strncmp) diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index c3bd520..61ad7f1 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -304,6 +304,29 @@ void 
*memcpy(void *dest, const void *src, size_t len) return __memcpy(dest, src, len); } +#ifdef CONFIG_ARM64 +/* + * Arch arm64 use assembly variant for strcmp/strncmp, + * xtensa use inline asm operations and x86_64 use c one, + * so now this interceptors only for arm64 kasan. + */ +#undef strcmp +int strcmp(const
Re: [PATCH v2] arm64: kasan: add interceptors for strcmp/strncmp functions
Dear all, Could anyone review this and provide me appropriate approach ? I think str[n]cmp are frequently used functions so I believe very useful w/ arm64 KASAN. Best Regards, Kyeongdon Kim On 2018-08-23 오후 5:56, Kyeongdon Kim wrote: This patch declares strcmp/strncmp as weak symbols. (2 of them are the most used string operations) Original functions declared as weak and strong ones in mm/kasan/kasan.c could replace them. Assembly optimized strcmp/strncmp functions cannot detect KASan bug. But, now we can detect them like the call trace below. == BUG: KASAN: use-after-free in platform_match+0x1c/0x5c at addr ffc0ad313500 Read of size 1 by task swapper/0/1 CPU: 3 PID: 1 Comm: swapper/0 Tainted: G B 4.9.77+ #1 Hardware name: Generic (DT) based system Call trace: dump_backtrace+0x0/0x2e0 show_stack+0x14/0x1c dump_stack+0x88/0xb0 kasan_object_err+0x24/0x7c kasan_report+0x2f0/0x484 check_memory_region+0x20/0x14c strcmp+0x1c/0x5c platform_match+0x40/0xe4 __driver_attach+0x40/0x130 bus_for_each_dev+0xc4/0xe0 driver_attach+0x30/0x3c bus_add_driver+0x2dc/0x328 driver_register+0x118/0x160 __platform_driver_register+0x7c/0x88 alarmtimer_init+0x154/0x1e4 do_one_initcall+0x184/0x1a4 kernel_init_freeable+0x2ec/0x2f0 kernel_init+0x18/0x10c ret_from_fork+0x10/0x50 In case of xtensa and x86_64 kasan, no need to use this patch now. 
Signed-off-by: Kyeongdon Kim --- arch/arm64/include/asm/string.h | 5 + arch/arm64/kernel/arm64ksyms.c | 2 ++ arch/arm64/kernel/image.h | 2 ++ arch/arm64/lib/strcmp.S | 3 +++ arch/arm64/lib/strncmp.S | 3 +++ mm/kasan/kasan.c | 23 +++ 6 files changed, 38 insertions(+) diff --git a/arch/arm64/include/asm/string.h b/arch/arm64/include/asm/string.h index dd95d33..ab60349 100644 --- a/arch/arm64/include/asm/string.h +++ b/arch/arm64/include/asm/string.h @@ -24,9 +24,11 @@ extern char *strchr(const char *, int c); #define __HAVE_ARCH_STRCMP extern int strcmp(const char *, const char *); +extern int __strcmp(const char *, const char *); #define __HAVE_ARCH_STRNCMP extern int strncmp(const char *, const char *, __kernel_size_t); +extern int __strncmp(const char *, const char *, __kernel_size_t); #define __HAVE_ARCH_STRLEN extern __kernel_size_t strlen(const char *); @@ -68,6 +70,9 @@ void memcpy_flushcache(void *dst, const void *src, size_t cnt); #define memmove(dst, src, len) __memmove(dst, src, len) #define memset(s, c, n) __memset(s, c, n) +#define strcmp(cs, ct) __strcmp(cs, ct) +#define strncmp(cs, ct, n) __strncmp(cs, ct, n) + #ifndef __NO_FORTIFY #define __NO_FORTIFY /* FORTIFY_SOURCE uses __builtin_memcpy, etc. 
*/ #endif diff --git a/arch/arm64/kernel/arm64ksyms.c b/arch/arm64/kernel/arm64ksyms.c index d894a20..10b1164 100644 --- a/arch/arm64/kernel/arm64ksyms.c +++ b/arch/arm64/kernel/arm64ksyms.c @@ -50,6 +50,8 @@ EXPORT_SYMBOL(strcmp); EXPORT_SYMBOL(strncmp); EXPORT_SYMBOL(strlen); EXPORT_SYMBOL(strnlen); +EXPORT_SYMBOL(__strcmp); +EXPORT_SYMBOL(__strncmp); EXPORT_SYMBOL(memset); EXPORT_SYMBOL(memcpy); EXPORT_SYMBOL(memmove); diff --git a/arch/arm64/kernel/image.h b/arch/arm64/kernel/image.h index a820ed0..5ef7a57 100644 --- a/arch/arm64/kernel/image.h +++ b/arch/arm64/kernel/image.h @@ -110,6 +110,8 @@ __efistub___flush_dcache_area = KALLSYMS_HIDE(__pi___flush_dcache_area); __efistub___memcpy = KALLSYMS_HIDE(__pi_memcpy); __efistub___memmove = KALLSYMS_HIDE(__pi_memmove); __efistub___memset = KALLSYMS_HIDE(__pi_memset); +__efistub___strcmp = KALLSYMS_HIDE(__pi_strcmp); +__efistub___strncmp = KALLSYMS_HIDE(__pi_strncmp); #endif __efistub__text = KALLSYMS_HIDE(_text); diff --git a/arch/arm64/lib/strcmp.S b/arch/arm64/lib/strcmp.S index 471fe61..0dffef7 100644 --- a/arch/arm64/lib/strcmp.S +++ b/arch/arm64/lib/strcmp.S @@ -60,6 +60,8 @@ tmp3 .req x9 zeroones .req x10 pos .req x11 +.weak strcmp +ENTRY(__strcmp) ENTRY(strcmp) eor tmp1, src1, src2 mov zeroones, #REP8_01 @@ -232,3 +234,4 @@ CPU_BE( orr syndrome, diff, has_nul ) sub result, data1, data2, lsr #56 ret ENDPIPROC(strcmp) +ENDPROC(__strcmp) diff --git a/arch/arm64/lib/strncmp.S b/arch/arm64/lib/strncmp.S index e267044..b2648c7 100644 --- a/arch/arm64/lib/strncmp.S +++ b/arch/arm64/lib/strncmp.S @@ -64,6 +64,8 @@ limit_wd .req x13 mask .req x14 endloop .req x15 +.weak strncmp +ENTRY(__strncmp) ENTRY(strncmp) cbz limit, .Lret0 eor tmp1, src1, src2 @@ -308,3 +310,4 @@ CPU_BE( orr syndrome, diff, has_nul ) mov result, #0 ret ENDPIPROC(strncmp) +ENDPROC(__strncmp) diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index c3bd520..61ad7f1 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -304,6 +304,29 @@ void 
*memcpy(void *dest, const void *src, size_t len) return __memcpy(dest, src, len); } +#ifdef CONFIG_ARM64 +/* + * Arch arm64 use assembly variant for strcmp/strncmp, + * xtensa use inline asm operations and x86_64 use c one, + * so now this interceptors only for arm64 kasan. + */ +#undef strcmp +int strcmp(const
[PATCH v2] arm64: kasan: add interceptors for strcmp/strncmp functions
This patch declares strcmp/strncmp as weak symbols. (2 of them are the most used string operations) Original functions declared as weak and strong ones in mm/kasan/kasan.c could replace them. Assembly optimized strcmp/strncmp functions cannot detect KASan bug. But, now we can detect them like the call trace below. == BUG: KASAN: use-after-free in platform_match+0x1c/0x5c at addr ffc0ad313500 Read of size 1 by task swapper/0/1 CPU: 3 PID: 1 Comm: swapper/0 Tainted: GB 4.9.77+ #1 Hardware name: Generic (DT) based system Call trace: dump_backtrace+0x0/0x2e0 show_stack+0x14/0x1c dump_stack+0x88/0xb0 kasan_object_err+0x24/0x7c kasan_report+0x2f0/0x484 check_memory_region+0x20/0x14c strcmp+0x1c/0x5c platform_match+0x40/0xe4 __driver_attach+0x40/0x130 bus_for_each_dev+0xc4/0xe0 driver_attach+0x30/0x3c bus_add_driver+0x2dc/0x328 driver_register+0x118/0x160 __platform_driver_register+0x7c/0x88 alarmtimer_init+0x154/0x1e4 do_one_initcall+0x184/0x1a4 kernel_init_freeable+0x2ec/0x2f0 kernel_init+0x18/0x10c ret_from_fork+0x10/0x50 In case of xtensa and x86_64 kasan, no need to use this patch now. 
Signed-off-by: Kyeongdon Kim --- arch/arm64/include/asm/string.h | 5 + arch/arm64/kernel/arm64ksyms.c | 2 ++ arch/arm64/kernel/image.h | 2 ++ arch/arm64/lib/strcmp.S | 3 +++ arch/arm64/lib/strncmp.S| 3 +++ mm/kasan/kasan.c| 23 +++ 6 files changed, 38 insertions(+) diff --git a/arch/arm64/include/asm/string.h b/arch/arm64/include/asm/string.h index dd95d33..ab60349 100644 --- a/arch/arm64/include/asm/string.h +++ b/arch/arm64/include/asm/string.h @@ -24,9 +24,11 @@ extern char *strchr(const char *, int c); #define __HAVE_ARCH_STRCMP extern int strcmp(const char *, const char *); +extern int __strcmp(const char *, const char *); #define __HAVE_ARCH_STRNCMP extern int strncmp(const char *, const char *, __kernel_size_t); +extern int __strncmp(const char *, const char *, __kernel_size_t); #define __HAVE_ARCH_STRLEN extern __kernel_size_t strlen(const char *); @@ -68,6 +70,9 @@ void memcpy_flushcache(void *dst, const void *src, size_t cnt); #define memmove(dst, src, len) __memmove(dst, src, len) #define memset(s, c, n) __memset(s, c, n) +#define strcmp(cs, ct) __strcmp(cs, ct) +#define strncmp(cs, ct, n) __strncmp(cs, ct, n) + #ifndef __NO_FORTIFY #define __NO_FORTIFY /* FORTIFY_SOURCE uses __builtin_memcpy, etc. 
*/ #endif diff --git a/arch/arm64/kernel/arm64ksyms.c b/arch/arm64/kernel/arm64ksyms.c index d894a20..10b1164 100644 --- a/arch/arm64/kernel/arm64ksyms.c +++ b/arch/arm64/kernel/arm64ksyms.c @@ -50,6 +50,8 @@ EXPORT_SYMBOL(strcmp); EXPORT_SYMBOL(strncmp); EXPORT_SYMBOL(strlen); EXPORT_SYMBOL(strnlen); +EXPORT_SYMBOL(__strcmp); +EXPORT_SYMBOL(__strncmp); EXPORT_SYMBOL(memset); EXPORT_SYMBOL(memcpy); EXPORT_SYMBOL(memmove); diff --git a/arch/arm64/kernel/image.h b/arch/arm64/kernel/image.h index a820ed0..5ef7a57 100644 --- a/arch/arm64/kernel/image.h +++ b/arch/arm64/kernel/image.h @@ -110,6 +110,8 @@ __efistub___flush_dcache_area = KALLSYMS_HIDE(__pi___flush_dcache_area); __efistub___memcpy = KALLSYMS_HIDE(__pi_memcpy); __efistub___memmove= KALLSYMS_HIDE(__pi_memmove); __efistub___memset = KALLSYMS_HIDE(__pi_memset); +__efistub___strcmp = KALLSYMS_HIDE(__pi_strcmp); +__efistub___strncmp= KALLSYMS_HIDE(__pi_strncmp); #endif __efistub__text= KALLSYMS_HIDE(_text); diff --git a/arch/arm64/lib/strcmp.S b/arch/arm64/lib/strcmp.S index 471fe61..0dffef7 100644 --- a/arch/arm64/lib/strcmp.S +++ b/arch/arm64/lib/strcmp.S @@ -60,6 +60,8 @@ tmp3 .reqx9 zeroones .reqx10 pos.reqx11 +.weak strcmp +ENTRY(__strcmp) ENTRY(strcmp) eor tmp1, src1, src2 mov zeroones, #REP8_01 @@ -232,3 +234,4 @@ CPU_BE( orr syndrome, diff, has_nul ) sub result, data1, data2, lsr #56 ret ENDPIPROC(strcmp) +ENDPROC(__strcmp) diff --git a/arch/arm64/lib/strncmp.S b/arch/arm64/lib/strncmp.S index e267044..b2648c7 100644 --- a/arch/arm64/lib/strncmp.S +++ b/arch/arm64/lib/strncmp.S @@ -64,6 +64,8 @@ limit_wd .reqx13 mask .reqx14 endloop.reqx15 +.weak strncmp +ENTRY(__strncmp) ENTRY(strncmp) cbz limit, .Lret0 eor tmp1, src1, src2 @@ -308,3 +310,4 @@ CPU_BE( orr syndrome, diff, has_nul ) mov result, #0 ret ENDPIPROC(strncmp) +ENDPROC(__strncmp) diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index c3bd520..61ad7f1 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -304,6 +304,29 @@ void *memcpy(void 
*dest, const void *src, size_t len) return __memcpy(dest, src, len); } +#ifdef CONFIG_ARM64 +/* + * Arch arm64 use assembly variant for strcmp/strncmp, + * xtensa use
[PATCH v2] arm64: kasan: add interceptors for strcmp/strncmp functions
This patch declares strcmp/strncmp as weak symbols. (2 of them are the most used string operations) Original functions declared as weak and strong ones in mm/kasan/kasan.c could replace them. Assembly optimized strcmp/strncmp functions cannot detect KASan bug. But, now we can detect them like the call trace below. == BUG: KASAN: use-after-free in platform_match+0x1c/0x5c at addr ffc0ad313500 Read of size 1 by task swapper/0/1 CPU: 3 PID: 1 Comm: swapper/0 Tainted: GB 4.9.77+ #1 Hardware name: Generic (DT) based system Call trace: dump_backtrace+0x0/0x2e0 show_stack+0x14/0x1c dump_stack+0x88/0xb0 kasan_object_err+0x24/0x7c kasan_report+0x2f0/0x484 check_memory_region+0x20/0x14c strcmp+0x1c/0x5c platform_match+0x40/0xe4 __driver_attach+0x40/0x130 bus_for_each_dev+0xc4/0xe0 driver_attach+0x30/0x3c bus_add_driver+0x2dc/0x328 driver_register+0x118/0x160 __platform_driver_register+0x7c/0x88 alarmtimer_init+0x154/0x1e4 do_one_initcall+0x184/0x1a4 kernel_init_freeable+0x2ec/0x2f0 kernel_init+0x18/0x10c ret_from_fork+0x10/0x50 In case of xtensa and x86_64 kasan, no need to use this patch now. 
Signed-off-by: Kyeongdon Kim --- arch/arm64/include/asm/string.h | 5 + arch/arm64/kernel/arm64ksyms.c | 2 ++ arch/arm64/kernel/image.h | 2 ++ arch/arm64/lib/strcmp.S | 3 +++ arch/arm64/lib/strncmp.S| 3 +++ mm/kasan/kasan.c| 23 +++ 6 files changed, 38 insertions(+) diff --git a/arch/arm64/include/asm/string.h b/arch/arm64/include/asm/string.h index dd95d33..ab60349 100644 --- a/arch/arm64/include/asm/string.h +++ b/arch/arm64/include/asm/string.h @@ -24,9 +24,11 @@ extern char *strchr(const char *, int c); #define __HAVE_ARCH_STRCMP extern int strcmp(const char *, const char *); +extern int __strcmp(const char *, const char *); #define __HAVE_ARCH_STRNCMP extern int strncmp(const char *, const char *, __kernel_size_t); +extern int __strncmp(const char *, const char *, __kernel_size_t); #define __HAVE_ARCH_STRLEN extern __kernel_size_t strlen(const char *); @@ -68,6 +70,9 @@ void memcpy_flushcache(void *dst, const void *src, size_t cnt); #define memmove(dst, src, len) __memmove(dst, src, len) #define memset(s, c, n) __memset(s, c, n) +#define strcmp(cs, ct) __strcmp(cs, ct) +#define strncmp(cs, ct, n) __strncmp(cs, ct, n) + #ifndef __NO_FORTIFY #define __NO_FORTIFY /* FORTIFY_SOURCE uses __builtin_memcpy, etc. 
*/ #endif diff --git a/arch/arm64/kernel/arm64ksyms.c b/arch/arm64/kernel/arm64ksyms.c index d894a20..10b1164 100644 --- a/arch/arm64/kernel/arm64ksyms.c +++ b/arch/arm64/kernel/arm64ksyms.c @@ -50,6 +50,8 @@ EXPORT_SYMBOL(strcmp); EXPORT_SYMBOL(strncmp); EXPORT_SYMBOL(strlen); EXPORT_SYMBOL(strnlen); +EXPORT_SYMBOL(__strcmp); +EXPORT_SYMBOL(__strncmp); EXPORT_SYMBOL(memset); EXPORT_SYMBOL(memcpy); EXPORT_SYMBOL(memmove); diff --git a/arch/arm64/kernel/image.h b/arch/arm64/kernel/image.h index a820ed0..5ef7a57 100644 --- a/arch/arm64/kernel/image.h +++ b/arch/arm64/kernel/image.h @@ -110,6 +110,8 @@ __efistub___flush_dcache_area = KALLSYMS_HIDE(__pi___flush_dcache_area); __efistub___memcpy = KALLSYMS_HIDE(__pi_memcpy); __efistub___memmove= KALLSYMS_HIDE(__pi_memmove); __efistub___memset = KALLSYMS_HIDE(__pi_memset); +__efistub___strcmp = KALLSYMS_HIDE(__pi_strcmp); +__efistub___strncmp= KALLSYMS_HIDE(__pi_strncmp); #endif __efistub__text= KALLSYMS_HIDE(_text); diff --git a/arch/arm64/lib/strcmp.S b/arch/arm64/lib/strcmp.S index 471fe61..0dffef7 100644 --- a/arch/arm64/lib/strcmp.S +++ b/arch/arm64/lib/strcmp.S @@ -60,6 +60,8 @@ tmp3 .reqx9 zeroones .reqx10 pos.reqx11 +.weak strcmp +ENTRY(__strcmp) ENTRY(strcmp) eor tmp1, src1, src2 mov zeroones, #REP8_01 @@ -232,3 +234,4 @@ CPU_BE( orr syndrome, diff, has_nul ) sub result, data1, data2, lsr #56 ret ENDPIPROC(strcmp) +ENDPROC(__strcmp) diff --git a/arch/arm64/lib/strncmp.S b/arch/arm64/lib/strncmp.S index e267044..b2648c7 100644 --- a/arch/arm64/lib/strncmp.S +++ b/arch/arm64/lib/strncmp.S @@ -64,6 +64,8 @@ limit_wd .reqx13 mask .reqx14 endloop.reqx15 +.weak strncmp +ENTRY(__strncmp) ENTRY(strncmp) cbz limit, .Lret0 eor tmp1, src1, src2 @@ -308,3 +310,4 @@ CPU_BE( orr syndrome, diff, has_nul ) mov result, #0 ret ENDPIPROC(strncmp) +ENDPROC(__strncmp) diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index c3bd520..61ad7f1 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -304,6 +304,29 @@ void *memcpy(void 
*dest, const void *src, size_t len) return __memcpy(dest, src, len); } +#ifdef CONFIG_ARM64 +/* + * Arch arm64 use assembly variant for strcmp/strncmp, + * xtensa use
[PATCH] arm64: kasan: add interceptors for strcmp/strncmp functions
This patch declares strcmp/strncmp as weak symbols. (2 of them are the most used string operations) Original functions declared as weak symbols and strong ones in mm/kasan/kasan.c could replace them. Assembly optimized strcmp/strncmp functions cannot detect KASan bug. But, now we can detect them. == BUG: KASAN: use-after-free in platform_match+0x1c/0x5c at addr ffc0ad313500 Read of size 1 by task swapper/0/1 CPU: 3 PID: 1 Comm: swapper/0 Tainted: GB 4.9.77+ #1 Hardware name: Generic (DT) based system Call trace: dump_backtrace+0x0/0x2e0 show_stack+0x14/0x1c dump_stack+0x88/0xb0 kasan_object_err+0x24/0x7c kasan_report+0x2f0/0x484 check_memory_region+0x20/0x14c strcmp+0x1c/0x5c platform_match+0x40/0xe4 __driver_attach+0x40/0x130 bus_for_each_dev+0xc4/0xe0 driver_attach+0x30/0x3c bus_add_driver+0x2dc/0x328 driver_register+0x118/0x160 __platform_driver_register+0x7c/0x88 alarmtimer_init+0x154/0x1e4 do_one_initcall+0x184/0x1a4 kernel_init_freeable+0x2ec/0x2f0 kernel_init+0x18/0x10c ret_from_fork+0x10/0x50 Signed-off-by: Kyeongdon Kim --- arch/arm64/include/asm/string.h | 4 arch/arm64/kernel/arm64ksyms.c | 2 ++ arch/arm64/kernel/image.h | 2 ++ arch/arm64/lib/strcmp.S | 3 +++ arch/arm64/lib/strncmp.S| 3 +++ mm/kasan/kasan.c| 15 +++ 6 files changed, 29 insertions(+) diff --git a/arch/arm64/include/asm/string.h b/arch/arm64/include/asm/string.h index dd95d33..d546605 100644 --- a/arch/arm64/include/asm/string.h +++ b/arch/arm64/include/asm/string.h @@ -24,9 +24,11 @@ extern char *strchr(const char *, int c); #define __HAVE_ARCH_STRCMP extern int strcmp(const char *, const char *); +extern int __strcmp(const char *, const char *); #define __HAVE_ARCH_STRNCMP extern int strncmp(const char *, const char *, __kernel_size_t); +extern int __strncmp(const char *, const char *, __kernel_size_t); #define __HAVE_ARCH_STRLEN extern __kernel_size_t strlen(const char *); @@ -67,6 +69,8 @@ void memcpy_flushcache(void *dst, const void *src, size_t cnt); #define memcpy(dst, src, len) 
__memcpy(dst, src, len) #define memmove(dst, src, len) __memmove(dst, src, len) #define memset(s, c, n) __memset(s, c, n) +#define strcmp(cs, ct) __strcmp(cs, ct) +#define strncmp(cs, ct, n) __strncmp(cs, ct, n) #ifndef __NO_FORTIFY #define __NO_FORTIFY /* FORTIFY_SOURCE uses __builtin_memcpy, etc. */ diff --git a/arch/arm64/kernel/arm64ksyms.c b/arch/arm64/kernel/arm64ksyms.c index d894a20..10b1164 100644 --- a/arch/arm64/kernel/arm64ksyms.c +++ b/arch/arm64/kernel/arm64ksyms.c @@ -50,6 +50,8 @@ EXPORT_SYMBOL(strcmp); EXPORT_SYMBOL(strncmp); EXPORT_SYMBOL(strlen); EXPORT_SYMBOL(strnlen); +EXPORT_SYMBOL(__strcmp); +EXPORT_SYMBOL(__strncmp); EXPORT_SYMBOL(memset); EXPORT_SYMBOL(memcpy); EXPORT_SYMBOL(memmove); diff --git a/arch/arm64/kernel/image.h b/arch/arm64/kernel/image.h index a820ed0..5ef7a57 100644 --- a/arch/arm64/kernel/image.h +++ b/arch/arm64/kernel/image.h @@ -110,6 +110,8 @@ __efistub___flush_dcache_area = KALLSYMS_HIDE(__pi___flush_dcache_area); __efistub___memcpy = KALLSYMS_HIDE(__pi_memcpy); __efistub___memmove= KALLSYMS_HIDE(__pi_memmove); __efistub___memset = KALLSYMS_HIDE(__pi_memset); +__efistub___strcmp = KALLSYMS_HIDE(__pi_strcmp); +__efistub___strncmp= KALLSYMS_HIDE(__pi_strncmp); #endif __efistub__text= KALLSYMS_HIDE(_text); diff --git a/arch/arm64/lib/strcmp.S b/arch/arm64/lib/strcmp.S index 471fe61..0dffef7 100644 --- a/arch/arm64/lib/strcmp.S +++ b/arch/arm64/lib/strcmp.S @@ -60,6 +60,8 @@ tmp3 .reqx9 zeroones .reqx10 pos.reqx11 +.weak strcmp +ENTRY(__strcmp) ENTRY(strcmp) eor tmp1, src1, src2 mov zeroones, #REP8_01 @@ -232,3 +234,4 @@ CPU_BE( orr syndrome, diff, has_nul ) sub result, data1, data2, lsr #56 ret ENDPIPROC(strcmp) +ENDPROC(__strcmp) diff --git a/arch/arm64/lib/strncmp.S b/arch/arm64/lib/strncmp.S index e267044..b2648c7 100644 --- a/arch/arm64/lib/strncmp.S +++ b/arch/arm64/lib/strncmp.S @@ -64,6 +64,8 @@ limit_wd .reqx13 mask .reqx14 endloop.reqx15 +.weak strncmp +ENTRY(__strncmp) ENTRY(strncmp) cbz limit, .Lret0 eor tmp1, 
src1, src2 @@ -308,3 +310,4 @@ CPU_BE( orr syndrome, diff, has_nul ) mov result, #0 ret ENDPIPROC(strncmp) +ENDPROC(__strncmp) diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index c3bd520..eae43e6 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -304,7 +304,22 @@ void *memcpy(void *dest, const void *src, size_t len) return __memcpy(dest, src, len); } +#undef strcmp +int strcmp(const char *cs, const char *ct) +{ + check_memory_region((unsigned long)cs, 1, false, _RET_IP_
[PATCH] arm64: kasan: add interceptors for strcmp/strncmp functions
This patch declares strcmp/strncmp as weak symbols. (2 of them are the most used string operations) Original functions declared as weak symbols and strong ones in mm/kasan/kasan.c could replace them. Assembly optimized strcmp/strncmp functions cannot detect KASan bug. But, now we can detect them. == BUG: KASAN: use-after-free in platform_match+0x1c/0x5c at addr ffc0ad313500 Read of size 1 by task swapper/0/1 CPU: 3 PID: 1 Comm: swapper/0 Tainted: GB 4.9.77+ #1 Hardware name: Generic (DT) based system Call trace: dump_backtrace+0x0/0x2e0 show_stack+0x14/0x1c dump_stack+0x88/0xb0 kasan_object_err+0x24/0x7c kasan_report+0x2f0/0x484 check_memory_region+0x20/0x14c strcmp+0x1c/0x5c platform_match+0x40/0xe4 __driver_attach+0x40/0x130 bus_for_each_dev+0xc4/0xe0 driver_attach+0x30/0x3c bus_add_driver+0x2dc/0x328 driver_register+0x118/0x160 __platform_driver_register+0x7c/0x88 alarmtimer_init+0x154/0x1e4 do_one_initcall+0x184/0x1a4 kernel_init_freeable+0x2ec/0x2f0 kernel_init+0x18/0x10c ret_from_fork+0x10/0x50 Signed-off-by: Kyeongdon Kim --- arch/arm64/include/asm/string.h | 4 arch/arm64/kernel/arm64ksyms.c | 2 ++ arch/arm64/kernel/image.h | 2 ++ arch/arm64/lib/strcmp.S | 3 +++ arch/arm64/lib/strncmp.S| 3 +++ mm/kasan/kasan.c| 15 +++ 6 files changed, 29 insertions(+) diff --git a/arch/arm64/include/asm/string.h b/arch/arm64/include/asm/string.h index dd95d33..d546605 100644 --- a/arch/arm64/include/asm/string.h +++ b/arch/arm64/include/asm/string.h @@ -24,9 +24,11 @@ extern char *strchr(const char *, int c); #define __HAVE_ARCH_STRCMP extern int strcmp(const char *, const char *); +extern int __strcmp(const char *, const char *); #define __HAVE_ARCH_STRNCMP extern int strncmp(const char *, const char *, __kernel_size_t); +extern int __strncmp(const char *, const char *, __kernel_size_t); #define __HAVE_ARCH_STRLEN extern __kernel_size_t strlen(const char *); @@ -67,6 +69,8 @@ void memcpy_flushcache(void *dst, const void *src, size_t cnt); #define memcpy(dst, src, len) 
__memcpy(dst, src, len) #define memmove(dst, src, len) __memmove(dst, src, len) #define memset(s, c, n) __memset(s, c, n) +#define strcmp(cs, ct) __strcmp(cs, ct) +#define strncmp(cs, ct, n) __strncmp(cs, ct, n) #ifndef __NO_FORTIFY #define __NO_FORTIFY /* FORTIFY_SOURCE uses __builtin_memcpy, etc. */ diff --git a/arch/arm64/kernel/arm64ksyms.c b/arch/arm64/kernel/arm64ksyms.c index d894a20..10b1164 100644 --- a/arch/arm64/kernel/arm64ksyms.c +++ b/arch/arm64/kernel/arm64ksyms.c @@ -50,6 +50,8 @@ EXPORT_SYMBOL(strcmp); EXPORT_SYMBOL(strncmp); EXPORT_SYMBOL(strlen); EXPORT_SYMBOL(strnlen); +EXPORT_SYMBOL(__strcmp); +EXPORT_SYMBOL(__strncmp); EXPORT_SYMBOL(memset); EXPORT_SYMBOL(memcpy); EXPORT_SYMBOL(memmove); diff --git a/arch/arm64/kernel/image.h b/arch/arm64/kernel/image.h index a820ed0..5ef7a57 100644 --- a/arch/arm64/kernel/image.h +++ b/arch/arm64/kernel/image.h @@ -110,6 +110,8 @@ __efistub___flush_dcache_area = KALLSYMS_HIDE(__pi___flush_dcache_area); __efistub___memcpy = KALLSYMS_HIDE(__pi_memcpy); __efistub___memmove= KALLSYMS_HIDE(__pi_memmove); __efistub___memset = KALLSYMS_HIDE(__pi_memset); +__efistub___strcmp = KALLSYMS_HIDE(__pi_strcmp); +__efistub___strncmp= KALLSYMS_HIDE(__pi_strncmp); #endif __efistub__text= KALLSYMS_HIDE(_text); diff --git a/arch/arm64/lib/strcmp.S b/arch/arm64/lib/strcmp.S index 471fe61..0dffef7 100644 --- a/arch/arm64/lib/strcmp.S +++ b/arch/arm64/lib/strcmp.S @@ -60,6 +60,8 @@ tmp3 .reqx9 zeroones .reqx10 pos.reqx11 +.weak strcmp +ENTRY(__strcmp) ENTRY(strcmp) eor tmp1, src1, src2 mov zeroones, #REP8_01 @@ -232,3 +234,4 @@ CPU_BE( orr syndrome, diff, has_nul ) sub result, data1, data2, lsr #56 ret ENDPIPROC(strcmp) +ENDPROC(__strcmp) diff --git a/arch/arm64/lib/strncmp.S b/arch/arm64/lib/strncmp.S index e267044..b2648c7 100644 --- a/arch/arm64/lib/strncmp.S +++ b/arch/arm64/lib/strncmp.S @@ -64,6 +64,8 @@ limit_wd .reqx13 mask .reqx14 endloop.reqx15 +.weak strncmp +ENTRY(__strncmp) ENTRY(strncmp) cbz limit, .Lret0 eor tmp1, 
src1, src2 @@ -308,3 +310,4 @@ CPU_BE( orr syndrome, diff, has_nul ) mov result, #0 ret ENDPIPROC(strncmp) +ENDPROC(__strncmp) diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index c3bd520..eae43e6 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -304,7 +304,22 @@ void *memcpy(void *dest, const void *src, size_t len) return __memcpy(dest, src, len); } +#undef strcmp +int strcmp(const char *cs, const char *ct) +{ + check_memory_region((unsigned long)cs, 1, false, _RET_IP_
[PATCH] arm64: lib: use c string functions for KASAN support
Assembly optimized string functions cannot detect KASan bug. This might have been the intention of the original author. (not too much important to catch) But, I found the obvious uaf problem in strcmp() function. - in this case, using 32bit KASan patchset helps Since I used c string function, I believe I could find this bug. After using the patch, can see the report & backtrace the below: == BUG: KASAN: use-after-free in strcmp+0x1c/0x5c at addr ffc0ad313500 Read of size 1 by task swapper/0/1 CPU: 3 PID: 1 Comm: swapper/0 Tainted: GB 4.9.77+ #1 Hardware name: Generic (DT) based system Call trace: [] dump_backtrace+0x0/0x2e0 [] show_stack+0x14/0x1c [] dump_stack+0x88/0xb0 [] kasan_object_err+0x24/0x7c [] kasan_report+0x2f0/0x484 [] __asan_load1+0x24/0x50 [] strcmp+0x1c/0x5c [] platform_match+0x40/0xe4 [] __driver_attach+0x40/0x130 [] bus_for_each_dev+0xc4/0xe0 [] driver_attach+0x30/0x3c [] bus_add_driver+0x2dc/0x328 [] driver_register+0x118/0x160 [] __platform_driver_register+0x7c/0x88 [] alarmtimer_init+0x154/0x1e4 [] do_one_initcall+0x184/0x1a4 [] kernel_init_freeable+0x2ec/0x2f0 [] kernel_init+0x18/0x10c [] ret_from_fork+0x10/0x50 Object at ffc0ad313500, in cache kmalloc-64 size: 64 Allocated: PID = 1 save_stack_trace_tsk+0x0/0x194 save_stack_trace+0x18/0x20 kasan_kmalloc+0xa8/0x154 kasan_slab_alloc+0x14/0x1c __kmalloc_track_caller+0x178/0x2a0 kvasprintf+0x80/0x104 kvasprintf_const+0xcc/0xd0 kobject_set_name_vargs+0x54/0xd4 dev_set_name+0x64/0x84 of_device_make_bus_id+0xc4/0x140 of_device_alloc+0x1e0/0x200 of_platform_device_create_pdata+0x70/0xf4 of_platform_bus_create+0x448/0x508 of_platform_populate+0xf4/0x104 of_platform_default_populate+0x20/0x28 of_platform_default_populate_init+0x68/0x78 Freed: PID = 1 save_stack_trace_tsk+0x0/0x194 save_stack_trace+0x18/0x20 kasan_slab_free+0xa0/0x14c kfree+0x174/0x288 kfree_const+0x2c/0x38 kobject_rename+0x12c/0x160 device_rename+0xa8/0x110 mt_usb_probe+0x218/0x760 platform_drv_probe+0x74/0xd0 
driver_probe_device+0x3d4/0x614 __driver_attach+0xc8/0x130 bus_for_each_dev+0xc4/0xe0 driver_attach+0x30/0x3c bus_add_driver+0x2dc/0x328 driver_register+0x118/0x160 __platform_driver_register+0x7c/0x88 Memory state around the buggy address: ffc0ad313300: fb fb fb fb fc fc fc fc fc fc fc fc fc fc fc fc ffc0ad313400: 00 00 00 00 fc fc fc fc fc fc fc fc fc fc fc fc >ffc0ad313500: fb fb fb fb fc fc fc fc fc fc fc fc fc fc fc fc ^ ffc0ad313600: 00 00 00 00 fc fc fc fc fc fc fc fc fc fc fc fc ffc0ad313700: 00 00 00 00 fc fc fc fc fc fc fc fc fc fc fc fc == Signed-off-by: Kyeongdon Kim --- arch/arm64/include/asm/string.h | 2 ++ arch/arm64/kernel/arm64ksyms.c | 2 ++ arch/arm64/lib/Makefile | 8 +--- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/string.h b/arch/arm64/include/asm/string.h index dd95d33..5c5219a 100644 --- a/arch/arm64/include/asm/string.h +++ b/arch/arm64/include/asm/string.h @@ -16,6 +16,7 @@ #ifndef __ASM_STRING_H #define __ASM_STRING_H +#if !defined(CONFIG_KASAN) #define __HAVE_ARCH_STRRCHR extern char *strrchr(const char *, int c); @@ -33,6 +34,7 @@ extern __kernel_size_t strlen(const char *); #define __HAVE_ARCH_STRNLEN extern __kernel_size_t strnlen(const char *, __kernel_size_t); +#endif #define __HAVE_ARCH_MEMCPY extern void *memcpy(void *, const void *, __kernel_size_t); diff --git a/arch/arm64/kernel/arm64ksyms.c b/arch/arm64/kernel/arm64ksyms.c index d894a20..eb9bf20 100644 --- a/arch/arm64/kernel/arm64ksyms.c +++ b/arch/arm64/kernel/arm64ksyms.c @@ -44,12 +44,14 @@ EXPORT_SYMBOL(__arch_copy_in_user); EXPORT_SYMBOL(memstart_addr); /* string / mem functions */ +#if !defined(CONFIG_KASAN) EXPORT_SYMBOL(strchr); EXPORT_SYMBOL(strrchr); EXPORT_SYMBOL(strcmp); EXPORT_SYMBOL(strncmp); EXPORT_SYMBOL(strlen); EXPORT_SYMBOL(strnlen); +#endif EXPORT_SYMBOL(memset); EXPORT_SYMBOL(memcpy); EXPORT_SYMBOL(memmove); diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile index 68755fd..aa2d457 100644 --- 
a/arch/arm64/lib/Makefile +++ b/arch/arm64/lib/Makefile @@ -2,9 +2,11 @@ lib-y := clear_user.o delay.o copy_from_user.o\ copy_to_user.o copy_in_user.o copy_page.o\ clear_page.o memchr.o memcpy.o memmove.o memset.o\ - memcmp.o strcmp.o strncmp.o strlen.o strnlen.o \ - strchr.o strrchr.o tishift.o - + memcmp.o tishift.o +ifndef CONFIG_KASAN +lib-y := strcmp.o strncmp.o strlen.o strnlen.o\ + strchr.o strrchr.o +endif # Tell the compiler to treat all general purpose registers (with the # exception of the IP registers, which are already handled by the caller # in case o
[PATCH] arm64: lib: use c string functions for KASAN support
Assembly optimized string functions cannot detect KASan bug. This might have been the intention of the original author. (not too much important to catch) But, I found the obvious uaf problem in strcmp() function. - in this case, using 32bit KASan patchset helps Since I used c string function, I believe I could find this bug. After using the patch, can see the report & backtrace the below: == BUG: KASAN: use-after-free in strcmp+0x1c/0x5c at addr ffc0ad313500 Read of size 1 by task swapper/0/1 CPU: 3 PID: 1 Comm: swapper/0 Tainted: GB 4.9.77+ #1 Hardware name: Generic (DT) based system Call trace: [] dump_backtrace+0x0/0x2e0 [] show_stack+0x14/0x1c [] dump_stack+0x88/0xb0 [] kasan_object_err+0x24/0x7c [] kasan_report+0x2f0/0x484 [] __asan_load1+0x24/0x50 [] strcmp+0x1c/0x5c [] platform_match+0x40/0xe4 [] __driver_attach+0x40/0x130 [] bus_for_each_dev+0xc4/0xe0 [] driver_attach+0x30/0x3c [] bus_add_driver+0x2dc/0x328 [] driver_register+0x118/0x160 [] __platform_driver_register+0x7c/0x88 [] alarmtimer_init+0x154/0x1e4 [] do_one_initcall+0x184/0x1a4 [] kernel_init_freeable+0x2ec/0x2f0 [] kernel_init+0x18/0x10c [] ret_from_fork+0x10/0x50 Object at ffc0ad313500, in cache kmalloc-64 size: 64 Allocated: PID = 1 save_stack_trace_tsk+0x0/0x194 save_stack_trace+0x18/0x20 kasan_kmalloc+0xa8/0x154 kasan_slab_alloc+0x14/0x1c __kmalloc_track_caller+0x178/0x2a0 kvasprintf+0x80/0x104 kvasprintf_const+0xcc/0xd0 kobject_set_name_vargs+0x54/0xd4 dev_set_name+0x64/0x84 of_device_make_bus_id+0xc4/0x140 of_device_alloc+0x1e0/0x200 of_platform_device_create_pdata+0x70/0xf4 of_platform_bus_create+0x448/0x508 of_platform_populate+0xf4/0x104 of_platform_default_populate+0x20/0x28 of_platform_default_populate_init+0x68/0x78 Freed: PID = 1 save_stack_trace_tsk+0x0/0x194 save_stack_trace+0x18/0x20 kasan_slab_free+0xa0/0x14c kfree+0x174/0x288 kfree_const+0x2c/0x38 kobject_rename+0x12c/0x160 device_rename+0xa8/0x110 mt_usb_probe+0x218/0x760 platform_drv_probe+0x74/0xd0 
driver_probe_device+0x3d4/0x614 __driver_attach+0xc8/0x130 bus_for_each_dev+0xc4/0xe0 driver_attach+0x30/0x3c bus_add_driver+0x2dc/0x328 driver_register+0x118/0x160 __platform_driver_register+0x7c/0x88 Memory state around the buggy address: ffc0ad313300: fb fb fb fb fc fc fc fc fc fc fc fc fc fc fc fc ffc0ad313400: 00 00 00 00 fc fc fc fc fc fc fc fc fc fc fc fc >ffc0ad313500: fb fb fb fb fc fc fc fc fc fc fc fc fc fc fc fc ^ ffc0ad313600: 00 00 00 00 fc fc fc fc fc fc fc fc fc fc fc fc ffc0ad313700: 00 00 00 00 fc fc fc fc fc fc fc fc fc fc fc fc == Signed-off-by: Kyeongdon Kim --- arch/arm64/include/asm/string.h | 2 ++ arch/arm64/kernel/arm64ksyms.c | 2 ++ arch/arm64/lib/Makefile | 8 +--- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/string.h b/arch/arm64/include/asm/string.h index dd95d33..5c5219a 100644 --- a/arch/arm64/include/asm/string.h +++ b/arch/arm64/include/asm/string.h @@ -16,6 +16,7 @@ #ifndef __ASM_STRING_H #define __ASM_STRING_H +#if !defined(CONFIG_KASAN) #define __HAVE_ARCH_STRRCHR extern char *strrchr(const char *, int c); @@ -33,6 +34,7 @@ extern __kernel_size_t strlen(const char *); #define __HAVE_ARCH_STRNLEN extern __kernel_size_t strnlen(const char *, __kernel_size_t); +#endif #define __HAVE_ARCH_MEMCPY extern void *memcpy(void *, const void *, __kernel_size_t); diff --git a/arch/arm64/kernel/arm64ksyms.c b/arch/arm64/kernel/arm64ksyms.c index d894a20..eb9bf20 100644 --- a/arch/arm64/kernel/arm64ksyms.c +++ b/arch/arm64/kernel/arm64ksyms.c @@ -44,12 +44,14 @@ EXPORT_SYMBOL(__arch_copy_in_user); EXPORT_SYMBOL(memstart_addr); /* string / mem functions */ +#if !defined(CONFIG_KASAN) EXPORT_SYMBOL(strchr); EXPORT_SYMBOL(strrchr); EXPORT_SYMBOL(strcmp); EXPORT_SYMBOL(strncmp); EXPORT_SYMBOL(strlen); EXPORT_SYMBOL(strnlen); +#endif EXPORT_SYMBOL(memset); EXPORT_SYMBOL(memcpy); EXPORT_SYMBOL(memmove); diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile index 68755fd..aa2d457 100644 --- 
a/arch/arm64/lib/Makefile +++ b/arch/arm64/lib/Makefile @@ -2,9 +2,11 @@ lib-y := clear_user.o delay.o copy_from_user.o\ copy_to_user.o copy_in_user.o copy_page.o\ clear_page.o memchr.o memcpy.o memmove.o memset.o\ - memcmp.o strcmp.o strncmp.o strlen.o strnlen.o \ - strchr.o strrchr.o tishift.o - + memcmp.o tishift.o +ifndef CONFIG_KASAN +lib-y := strcmp.o strncmp.o strlen.o strnlen.o\ + strchr.o strrchr.o +endif # Tell the compiler to treat all general purpose registers (with the # exception of the IP registers, which are already handled by the caller # in case o
[PATCH v2] ksm : use checksum and memcmp for rb_tree
The current ksm is using memcmp to insert and search 'rb_tree'. It does cause very expensive computation cost. In order to reduce the time of this operation, we have added a checksum to traverse. Nearly all 'rb_node' in stable_tree_insert() function can be inserted as a checksum, most of it is possible in unstable_tree_search_insert() function. In stable_tree_search() function, the checksum may be an additional. But, checksum check duration is extremely small. Considering the time of the whole cmp_and_merge_page() function, it requires very little cost on average. Using this patch, we compared the time of ksm_do_scan() function by adding kernel trace at the start-end position of operation. (ARM 32bit target android device, over 1000 sample time gap stamps average) On original KSM scan avg duration = 0.0166893 sec 14991.975619 : ksm_do_scan_start: START: ksm_do_scan 14991.990975 : ksm_do_scan_end: END: ksm_do_scan 14992.008989 : ksm_do_scan_start: START: ksm_do_scan 14992.016839 : ksm_do_scan_end: END: ksm_do_scan ... On patch KSM scan avg duration = 0.0041157 sec 41081.46131 : ksm_do_scan_start : START: ksm_do_scan 41081.46636 : ksm_do_scan_end : END: ksm_do_scan 41081.48476 : ksm_do_scan_start : START: ksm_do_scan 41081.48795 : ksm_do_scan_end : END: ksm_do_scan ... We have tested randomly so many times for the stability and couldn't see any abnormal issue until now. Also, we found out this patch can make some good advantage for the power consumption than KSM default enable. 
v1 -> v2 - add comment for oldchecksum value - move the oldchecksum value out of union - remove check code regarding checksum 0 in stable_tree_search() link to v1 : https://lkml.org/lkml/2017/10/30/251 Signed-off-by: Kyeongdon Kim <kyeongdon@lge.com> --- mm/ksm.c | 48 1 file changed, 44 insertions(+), 4 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index be8f457..9280569 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -134,6 +134,7 @@ struct ksm_scan { * @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid) * @chain_prune_time: time of the last full garbage collection * @rmap_hlist_len: number of rmap_item entries in hlist or STABLE_NODE_CHAIN + * @oldchecksum: previous checksum of the page about a stable_node * @nid: NUMA node id of stable tree in which linked (may not match kpfn) */ struct stable_node { @@ -159,6 +160,7 @@ struct stable_node { */ #define STABLE_NODE_CHAIN -1024 int rmap_hlist_len; + u32 oldchecksum; #ifdef CONFIG_NUMA int nid; #endif @@ -1522,7 +1524,7 @@ static __always_inline struct page *chain(struct stable_node **s_n_d, * This function returns the stable tree node of identical content if found, * NULL otherwise. */ -static struct page *stable_tree_search(struct page *page) +static struct page *stable_tree_search(struct page *page, u32 checksum) { int nid; struct rb_root *root; @@ -1550,6 +1552,18 @@ static struct page *stable_tree_search(struct page *page) cond_resched(); stable_node = rb_entry(*new, struct stable_node, node); + + /* first make rb_tree by checksum */ + if (checksum < stable_node->oldchecksum) { + parent = *new; + new = &parent->rb_left; + continue; + } else if (checksum > stable_node->oldchecksum) { + parent = *new; + new = &parent->rb_right; + continue; + } + stable_node_any = NULL; tree_page = chain_prune(&stable_node_dup, &stable_node, root); /* @@ -1768,7 +1782,7 @@ static struct page *stable_tree_search(struct page *page) * This function returns the stable tree node just allocated on success, * NULL otherwise. 
*/ -static struct stable_node *stable_tree_insert(struct page *kpage) +static struct stable_node *stable_tree_insert(struct page *kpage, u32 checksum) { int nid; unsigned long kpfn; @@ -1792,6 +1806,18 @@ static struct stable_node *stable_tree_insert(struct page *kpage) cond_resched(); stable_node = rb_entry(*new, struct stable_node, node); stable_node_any = NULL; + + /* first make rb_tree by checksum */ + if (checksum < stable_node->oldchecksum) { + parent = *new; + new = &parent->rb_left; + continue; + } else if (checksum > stable_node->oldchecksum) { + parent = *new; + new = &parent->rb_right; + continue; + } + tree_page = chain(&stable_node_dup, stable_node, root); if (!stable_node_dup) { /* @@ -1850,6 +1876,7 @@ static struct stable_node *stable_tree_insert(s
[PATCH v2] ksm : use checksum and memcmp for rb_tree
The current ksm is using memcmp to insert and search 'rb_tree'. It does cause very expensive computation cost. In order to reduce the time of this operation, we have added a checksum to traverse. Nearly all 'rb_node' in stable_tree_insert() function can be inserted as a checksum, most of it is possible in unstable_tree_search_insert() function. In stable_tree_search() function, the checksum may be an additional. But, checksum check duration is extremely small. Considering the time of the whole cmp_and_merge_page() function, it requires very little cost on average. Using this patch, we compared the time of ksm_do_scan() function by adding kernel trace at the start-end position of operation. (ARM 32bit target android device, over 1000 sample time gap stamps average) On original KSM scan avg duration = 0.0166893 sec 14991.975619 : ksm_do_scan_start: START: ksm_do_scan 14991.990975 : ksm_do_scan_end: END: ksm_do_scan 14992.008989 : ksm_do_scan_start: START: ksm_do_scan 14992.016839 : ksm_do_scan_end: END: ksm_do_scan ... On patch KSM scan avg duration = 0.0041157 sec 41081.46131 : ksm_do_scan_start : START: ksm_do_scan 41081.46636 : ksm_do_scan_end : END: ksm_do_scan 41081.48476 : ksm_do_scan_start : START: ksm_do_scan 41081.48795 : ksm_do_scan_end : END: ksm_do_scan ... We have tested randomly so many times for the stability and couldn't see any abnormal issue until now. Also, we found out this patch can make some good advantage for the power consumption than KSM default enable. 
v1 -> v2 - add comment for oldchecksum value - move the oldchecksum value out of union - remove check code regarding checksum 0 in stable_tree_search() link to v1 : https://lkml.org/lkml/2017/10/30/251 Signed-off-by: Kyeongdon Kim --- mm/ksm.c | 48 1 file changed, 44 insertions(+), 4 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index be8f457..9280569 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -134,6 +134,7 @@ struct ksm_scan { * @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid) * @chain_prune_time: time of the last full garbage collection * @rmap_hlist_len: number of rmap_item entries in hlist or STABLE_NODE_CHAIN + * @oldchecksum: previous checksum of the page about a stable_node * @nid: NUMA node id of stable tree in which linked (may not match kpfn) */ struct stable_node { @@ -159,6 +160,7 @@ struct stable_node { */ #define STABLE_NODE_CHAIN -1024 int rmap_hlist_len; + u32 oldchecksum; #ifdef CONFIG_NUMA int nid; #endif @@ -1522,7 +1524,7 @@ static __always_inline struct page *chain(struct stable_node **s_n_d, * This function returns the stable tree node of identical content if found, * NULL otherwise. */ -static struct page *stable_tree_search(struct page *page) +static struct page *stable_tree_search(struct page *page, u32 checksum) { int nid; struct rb_root *root; @@ -1550,6 +1552,18 @@ static struct page *stable_tree_search(struct page *page) cond_resched(); stable_node = rb_entry(*new, struct stable_node, node); + + /* first make rb_tree by checksum */ + if (checksum < stable_node->oldchecksum) { + parent = *new; + new = >rb_left; + continue; + } else if (checksum > stable_node->oldchecksum) { + parent = *new; + new = >rb_right; + continue; + } + stable_node_any = NULL; tree_page = chain_prune(_node_dup, _node, root); /* @@ -1768,7 +1782,7 @@ static struct page *stable_tree_search(struct page *page) * This function returns the stable tree node just allocated on success, * NULL otherwise. 
*/ -static struct stable_node *stable_tree_insert(struct page *kpage) +static struct stable_node *stable_tree_insert(struct page *kpage, u32 checksum) { int nid; unsigned long kpfn; @@ -1792,6 +1806,18 @@ static struct stable_node *stable_tree_insert(struct page *kpage) cond_resched(); stable_node = rb_entry(*new, struct stable_node, node); stable_node_any = NULL; + + /* first make rb_tree by checksum */ + if (checksum < stable_node->oldchecksum) { + parent = *new; + new = >rb_left; + continue; + } else if (checksum > stable_node->oldchecksum) { + parent = *new; + new = >rb_right; + continue; + } + tree_page = chain(_node_dup, stable_node, root); if (!stable_node_dup) { /* @@ -1850,6 +1876,7 @@ static struct stable_node *stable_tree_insert(struct page *kpage)
Re: Re: [PATCH] ksm: use checksum and memcmp for rb_tree
Sorry, re-send this email because of the Delivery failed message (to linux-kernel) On 2017-10-30 오후 10:22, Timofey Titovets wrote: 2017-10-30 15:03 GMT+03:00 Kyeongdon Kim <kyeongdon@lge.com>: > The current ksm is using memcmp to insert and search 'rb_tree'. > It does cause very expensive computation cost. > In order to reduce the time of this operation, > we have added a checksum to traverse before memcmp operation. > > Nearly all 'rb_node' in stable_tree_insert() function > can be inserted as a checksum, most of it is possible > in unstable_tree_search_insert() function. > In stable_tree_search() function, the checksum may be an additional. > But, checksum check duration is extremely small. > Considering the time of the whole cmp_and_merge_page() function, > it requires very little cost on average. > > Using this patch, we compared the time of ksm_do_scan() function > by adding kernel trace at the start-end position of operation. > (ARM 32bit target android device, > over 1000 sample time gap stamps average) > > On original KSM scan avg duration = 0.0166893 sec > 24991.975619 : ksm_do_scan_start: START: ksm_do_scan > 24991.990975 : ksm_do_scan_end: END: ksm_do_scan > 24992.008989 : ksm_do_scan_start: START: ksm_do_scan > 24992.016839 : ksm_do_scan_end: END: ksm_do_scan > ... > > On patch KSM scan avg duration = 0.0041157 sec > 41081.461312 : ksm_do_scan_start: START: ksm_do_scan > 41081.466364 : ksm_do_scan_end: END: ksm_do_scan > 41081.484767 : ksm_do_scan_start: START: ksm_do_scan > 41081.487951 : ksm_do_scan_end: END: ksm_do_scan > ... > > We have tested randomly so many times for the stability > and couldn't see any abnormal issue until now. > Also, we found out this patch can make some good advantage > for the power consumption than KSM default enable. 
> > Signed-off-by: Kyeongdon Kim <kyeongdon@lge.com> > --- > mm/ksm.c | 49 + > 1 file changed, 45 insertions(+), 4 deletions(-) > > diff --git a/mm/ksm.c b/mm/ksm.c > index be8f457..66ab4f4 100644 > --- a/mm/ksm.c > +++ b/mm/ksm.c > @@ -150,6 +150,7 @@ struct stable_node { > struct hlist_head hlist; > union { > unsigned long kpfn; > + u32 oldchecksum; > unsigned long chain_prune_time; > }; > /* May be just checksum? i.e. that's can be "old", where checksum can change, in stable tree, checksum also stable. Also, as checksum are stable, may be that make a sense to move it out of union? (I'm afraid of clashes) Also, you miss update comment above struct stable_node, about checksum var. Thanks for your comment, and we may change those lines like below : + * @oldchecksum: previous checksum of the page about a stable_node * @nid: NUMA node id of stable tree in which linked (may not match kpfn) */ struct stable_node { @@ -159,6 +160,7 @@ struct stable_node { */ #define STABLE_NODE_CHAIN -1024 int rmap_hlist_len; + u32 oldchecksum; #ifdef CONFIG_NUMA And I think if checksum are matched, then we can use original memcmp logic in stable tree. the worst case that I imagine is no page merging(just in that moment). But, in my humble opinion, there will be no critical memory issue. but just return. (as I said, we tested a lot to check some abnormal memory operation, but so far, so good - only performance improvement) > @@ -1522,7 +1523,7 @@ static __always_inline struct page *chain(struct stable_node **s_n_d, > * This function returns the stable tree node of identical content if found, > * NULL otherwise. 
> */ > -static struct page *stable_tree_search(struct page *page) > +static struct page *stable_tree_search(struct page *page, u32 checksum) > { > int nid; > struct rb_root *root; > @@ -1540,6 +1541,8 @@ static struct page *stable_tree_search(struct page *page) > > nid = get_kpfn_nid(page_to_pfn(page)); > root = root_stable_tree + nid; > + if (!checksum) > + return NULL; That's not a pointer, and 0x0 - is a valid checksum. Also, jhash2 not so collision free, i.e.: jhash2((uint32_t *) , 2, 17); Example of collisions, where hash = 0x0: hash: 0x0 - num: 610041898 hash: 0x0 - num: 4893164379 hash: 0x0 - num: 16423540221 hash: 0x0 - num: 29036382188 You also compare values, so hash = 0, is a acceptable checksum. well, if then, I can remove this check line. Thanks, anyway in general idea looks good. Reviewed-by: Timofey Titovets <nefelim...@gmail.com> -- Have a nice day, Timofey. Thanks a lot :) Actually, our organization want to use this KSM feature in general, but, current logic needs too high cost. So I wish to change more light version. Please kindly give your opinion on this idea. Thanks, Kyeongdon Kim
Re: Re: [PATCH] ksm: use checksum and memcmp for rb_tree
Sorry, re-send this email because of the Delivery failed message (to linux-kernel) On 2017-10-30 오후 10:22, Timofey Titovets wrote: 2017-10-30 15:03 GMT+03:00 Kyeongdon Kim : > The current ksm is using memcmp to insert and search 'rb_tree'. > It does cause very expensive computation cost. > In order to reduce the time of this operation, > we have added a checksum to traverse before memcmp operation. > > Nearly all 'rb_node' in stable_tree_insert() function > can be inserted as a checksum, most of it is possible > in unstable_tree_search_insert() function. > In stable_tree_search() function, the checksum may be an additional. > But, checksum check duration is extremely small. > Considering the time of the whole cmp_and_merge_page() function, > it requires very little cost on average. > > Using this patch, we compared the time of ksm_do_scan() function > by adding kernel trace at the start-end position of operation. > (ARM 32bit target android device, > over 1000 sample time gap stamps average) > > On original KSM scan avg duration = 0.0166893 sec > 24991.975619 : ksm_do_scan_start: START: ksm_do_scan > 24991.990975 : ksm_do_scan_end: END: ksm_do_scan > 24992.008989 : ksm_do_scan_start: START: ksm_do_scan > 24992.016839 : ksm_do_scan_end: END: ksm_do_scan > ... > > On patch KSM scan avg duration = 0.0041157 sec > 41081.461312 : ksm_do_scan_start: START: ksm_do_scan > 41081.466364 : ksm_do_scan_end: END: ksm_do_scan > 41081.484767 : ksm_do_scan_start: START: ksm_do_scan > 41081.487951 : ksm_do_scan_end: END: ksm_do_scan > ... > > We have tested randomly so many times for the stability > and couldn't see any abnormal issue until now. > Also, we found out this patch can make some good advantage > for the power consumption than KSM default enable. 
> > Signed-off-by: Kyeongdon Kim > --- > mm/ksm.c | 49 + > 1 file changed, 45 insertions(+), 4 deletions(-) > > diff --git a/mm/ksm.c b/mm/ksm.c > index be8f457..66ab4f4 100644 > --- a/mm/ksm.c > +++ b/mm/ksm.c > @@ -150,6 +150,7 @@ struct stable_node { > struct hlist_head hlist; > union { > unsigned long kpfn; > + u32 oldchecksum; > unsigned long chain_prune_time; > }; > /* May be just checksum? i.e. that's can be "old", where checksum can change, in stable tree, checksum also stable. Also, as checksum are stable, may be that make a sense to move it out of union? (I'm afraid of clashes) Also, you miss update comment above struct stable_node, about checksum var. Thanks for your comment, and we may change those lines like below : + * @oldchecksum: previous checksum of the page about a stable_node * @nid: NUMA node id of stable tree in which linked (may not match kpfn) */ struct stable_node { @@ -159,6 +160,7 @@ struct stable_node { */ #define STABLE_NODE_CHAIN -1024 int rmap_hlist_len; + u32 oldchecksum; #ifdef CONFIG_NUMA And I think if checksum are matched, then we can use original memcmp logic in stable tree. the worst case that I imagine is no page merging(just in that moment). But, in my humble opinion, there will be no critical memory issue. but just return. (as I said, we tested a lot to check some abnormal memory operation, but so far, so good - only performance improvement) > @@ -1522,7 +1523,7 @@ static __always_inline struct page *chain(struct stable_node **s_n_d, > * This function returns the stable tree node of identical content if found, > * NULL otherwise. 
> */ > -static struct page *stable_tree_search(struct page *page) > +static struct page *stable_tree_search(struct page *page, u32 checksum) > { > int nid; > struct rb_root *root; > @@ -1540,6 +1541,8 @@ static struct page *stable_tree_search(struct page *page) > > nid = get_kpfn_nid(page_to_pfn(page)); > root = root_stable_tree + nid; > + if (!checksum) > + return NULL; That's not a pointer, and 0x0 - is a valid checksum. Also, jhash2 not so collision free, i.e.: jhash2((uint32_t *) , 2, 17); Example of collisions, where hash = 0x0: hash: 0x0 - num: 610041898 hash: 0x0 - num: 4893164379 hash: 0x0 - num: 16423540221 hash: 0x0 - num: 29036382188 You also compare values, so hash = 0, is a acceptable checksum. well, if then, I can remove this check line. Thanks, anyway in general idea looks good. Reviewed-by: Timofey Titovets -- Have a nice day, Timofey. Thanks a lot :) Actually, our organization want to use this KSM feature in general, but, current logic needs too high cost. So I wish to change more light version. Please kindly give your opinion on this idea. Thanks, Kyeongdon Kim
[PATCH] ksm: use checksum and memcmp for rb_tree
The current ksm is using memcmp to insert and search 'rb_tree'. It does cause very expensive computation cost. In order to reduce the time of this operation, we have added a checksum to traverse before memcmp operation. Nearly all 'rb_node' in stable_tree_insert() function can be inserted as a checksum, most of it is possible in unstable_tree_search_insert() function. In stable_tree_search() function, the checksum may be an additional. But, checksum check duration is extremely small. Considering the time of the whole cmp_and_merge_page() function, it requires very little cost on average. Using this patch, we compared the time of ksm_do_scan() function by adding kernel trace at the start-end position of operation. (ARM 32bit target android device, over 1000 sample time gap stamps average) On original KSM scan avg duration = 0.0166893 sec 24991.975619 : ksm_do_scan_start: START: ksm_do_scan 24991.990975 : ksm_do_scan_end: END: ksm_do_scan 24992.008989 : ksm_do_scan_start: START: ksm_do_scan 24992.016839 : ksm_do_scan_end: END: ksm_do_scan ... On patch KSM scan avg duration = 0.0041157 sec 41081.461312 : ksm_do_scan_start: START: ksm_do_scan 41081.466364 : ksm_do_scan_end: END: ksm_do_scan 41081.484767 : ksm_do_scan_start: START: ksm_do_scan 41081.487951 : ksm_do_scan_end: END: ksm_do_scan ... We have tested randomly so many times for the stability and couldn't see any abnormal issue until now. Also, we found out this patch can make some good advantage for the power consumption than KSM default enable. 
Signed-off-by: Kyeongdon Kim <kyeongdon@lge.com> --- mm/ksm.c | 49 + 1 file changed, 45 insertions(+), 4 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index be8f457..66ab4f4 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -150,6 +150,7 @@ struct stable_node { struct hlist_head hlist; union { unsigned long kpfn; + u32 oldchecksum; unsigned long chain_prune_time; }; /* @@ -1522,7 +1523,7 @@ static __always_inline struct page *chain(struct stable_node **s_n_d, * This function returns the stable tree node of identical content if found, * NULL otherwise. */ -static struct page *stable_tree_search(struct page *page) +static struct page *stable_tree_search(struct page *page, u32 checksum) { int nid; struct rb_root *root; @@ -1540,6 +1541,8 @@ static struct page *stable_tree_search(struct page *page) nid = get_kpfn_nid(page_to_pfn(page)); root = root_stable_tree + nid; + if (!checksum) + return NULL; again: new = >rb_node; parent = NULL; @@ -1550,6 +1553,18 @@ static struct page *stable_tree_search(struct page *page) cond_resched(); stable_node = rb_entry(*new, struct stable_node, node); + + /* first make rb_tree by checksum */ + if (checksum < stable_node->oldchecksum) { + parent = *new; + new = >rb_left; + continue; + } else if (checksum > stable_node->oldchecksum) { + parent = *new; + new = >rb_right; + continue; + } + stable_node_any = NULL; tree_page = chain_prune(_node_dup, _node, root); /* @@ -1768,7 +1783,7 @@ static struct page *stable_tree_search(struct page *page) * This function returns the stable tree node just allocated on success, * NULL otherwise. 
*/ -static struct stable_node *stable_tree_insert(struct page *kpage) +static struct stable_node *stable_tree_insert(struct page *kpage, u32 checksum) { int nid; unsigned long kpfn; @@ -1792,6 +1807,18 @@ static struct stable_node *stable_tree_insert(struct page *kpage) cond_resched(); stable_node = rb_entry(*new, struct stable_node, node); stable_node_any = NULL; + + /* first make rb_tree by checksum */ + if (checksum < stable_node->oldchecksum) { + parent = *new; + new = >rb_left; + continue; + } else if (checksum > stable_node->oldchecksum) { + parent = *new; + new = >rb_right; + continue; + } + tree_page = chain(_node_dup, stable_node, root); if (!stable_node_dup) { /* @@ -1850,6 +1877,7 @@ static struct stable_node *stable_tree_insert(struct page *kpage) INIT_HLIST_HEAD(_node_dup->hlist); stable_node_dup->kpfn = kpfn; + stable_node_dup->oldchecksum = checksum; set_page_stable_node(kpage, stable_node_dup); stable_node_dup->rmap_hlist_len = 0; DO_NUMA(stable_node
[PATCH] ksm: use checksum and memcmp for rb_tree
The current ksm is using memcmp to insert and search 'rb_tree'. It does cause very expensive computation cost. In order to reduce the time of this operation, we have added a checksum to traverse before memcmp operation. Nearly all 'rb_node' in stable_tree_insert() function can be inserted as a checksum, most of it is possible in unstable_tree_search_insert() function. In stable_tree_search() function, the checksum may be an additional. But, checksum check duration is extremely small. Considering the time of the whole cmp_and_merge_page() function, it requires very little cost on average. Using this patch, we compared the time of ksm_do_scan() function by adding kernel trace at the start-end position of operation. (ARM 32bit target android device, over 1000 sample time gap stamps average) On original KSM scan avg duration = 0.0166893 sec 24991.975619 : ksm_do_scan_start: START: ksm_do_scan 24991.990975 : ksm_do_scan_end: END: ksm_do_scan 24992.008989 : ksm_do_scan_start: START: ksm_do_scan 24992.016839 : ksm_do_scan_end: END: ksm_do_scan ... On patch KSM scan avg duration = 0.0041157 sec 41081.461312 : ksm_do_scan_start: START: ksm_do_scan 41081.466364 : ksm_do_scan_end: END: ksm_do_scan 41081.484767 : ksm_do_scan_start: START: ksm_do_scan 41081.487951 : ksm_do_scan_end: END: ksm_do_scan ... We have tested randomly so many times for the stability and couldn't see any abnormal issue until now. Also, we found out this patch can make some good advantage for the power consumption than KSM default enable. 
Signed-off-by: Kyeongdon Kim --- mm/ksm.c | 49 + 1 file changed, 45 insertions(+), 4 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index be8f457..66ab4f4 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -150,6 +150,7 @@ struct stable_node { struct hlist_head hlist; union { unsigned long kpfn; + u32 oldchecksum; unsigned long chain_prune_time; }; /* @@ -1522,7 +1523,7 @@ static __always_inline struct page *chain(struct stable_node **s_n_d, * This function returns the stable tree node of identical content if found, * NULL otherwise. */ -static struct page *stable_tree_search(struct page *page) +static struct page *stable_tree_search(struct page *page, u32 checksum) { int nid; struct rb_root *root; @@ -1540,6 +1541,8 @@ static struct page *stable_tree_search(struct page *page) nid = get_kpfn_nid(page_to_pfn(page)); root = root_stable_tree + nid; + if (!checksum) + return NULL; again: new = >rb_node; parent = NULL; @@ -1550,6 +1553,18 @@ static struct page *stable_tree_search(struct page *page) cond_resched(); stable_node = rb_entry(*new, struct stable_node, node); + + /* first make rb_tree by checksum */ + if (checksum < stable_node->oldchecksum) { + parent = *new; + new = >rb_left; + continue; + } else if (checksum > stable_node->oldchecksum) { + parent = *new; + new = >rb_right; + continue; + } + stable_node_any = NULL; tree_page = chain_prune(_node_dup, _node, root); /* @@ -1768,7 +1783,7 @@ static struct page *stable_tree_search(struct page *page) * This function returns the stable tree node just allocated on success, * NULL otherwise. 
*/ -static struct stable_node *stable_tree_insert(struct page *kpage) +static struct stable_node *stable_tree_insert(struct page *kpage, u32 checksum) { int nid; unsigned long kpfn; @@ -1792,6 +1807,18 @@ static struct stable_node *stable_tree_insert(struct page *kpage) cond_resched(); stable_node = rb_entry(*new, struct stable_node, node); stable_node_any = NULL; + + /* first make rb_tree by checksum */ + if (checksum < stable_node->oldchecksum) { + parent = *new; + new = >rb_left; + continue; + } else if (checksum > stable_node->oldchecksum) { + parent = *new; + new = >rb_right; + continue; + } + tree_page = chain(_node_dup, stable_node, root); if (!stable_node_dup) { /* @@ -1850,6 +1877,7 @@ static struct stable_node *stable_tree_insert(struct page *kpage) INIT_HLIST_HEAD(_node_dup->hlist); stable_node_dup->kpfn = kpfn; + stable_node_dup->oldchecksum = checksum; set_page_stable_node(kpage, stable_node_dup); stable_node_dup->rmap_hlist_len = 0; DO_NUMA(stable_node_dup->nid = nid); @@ -1907,6 +19
Re: Re: [PATCH] mm/vmstats: add counters for the page frag cache
On 2017-09-04 오후 5:30, Konstantin Khlebnikov wrote: On 04.09.2017 04:35, Kyeongdon Kim wrote: > Thanks for your reply, > But I couldn't find "NR_FRAGMENT_PAGES" in linux-next.git .. is that vmstat counter? or others? > I mean rather than adding bunch vmstat counters for operations it might be worth to add page counter which will show current amount of these pages. But this seems too low-level for tracking, common counters for all network buffers would be more useful but much harder to implement. Ok, thanks for the comment. As I can see page owner is able to save stacktrace where allocation happened, this makes debugging mostly trivial without any counters. If it adds too much overhead - just track random 1% of pages, should be enough for finding leak. As I said, we already used page owner tools to resolve the leak issue. But that's extremely difficult to figure it out, too much callstack and too much allocation orders(0 or more). We couldn't easily get a clue event if we track 1% of pages.. In fact, I was writing another email to send a new patch with debug config. We added a hash function to pick out address with callstack by using debugfs, It could be showing the only page_frag_cache leak with owner. for exmaple code : +++ /mm/page_alloc.c @@ -4371,7 +4371,9 @@ void *page_frag_alloc(struct page_frag_cache *nc, nc->pagecnt_bias--; nc->offset = offset; +#ifdef CONFIG_PGFRAG_DEBUG + page_frag_debug_alloc(nc->va + offset); +#endif return nc->va + offset; } EXPORT_SYMBOL(page_frag_alloc); @@ -4382,7 +4384,9 @@ EXPORT_SYMBOL(page_frag_alloc); void page_frag_free(void *addr) { struct page *page = virt_to_head_page(addr); +#ifdef CONFIG_PGFRAG_DEBUG + page_frag_debug_free(addr); +#endif if (unlikely(put_page_testzero(page))) Those counters that I added may be too much for the linux server or something. However, I think the other systems may need to simple debugging method. 
(like Android OS) So if you can accept the patch with debug feature, I will send it including counters. but still thinking those counters don't need. I won't. Anyway, I'm grateful for your feedback, means a lot to me. Thanks, Kyeongdon Kim > As you know, page_frag_alloc() directly calls __alloc_pages_nodemask() function, > so that makes too difficult to see memory usage in real time even though we have "/meminfo or /slabinfo.." information. > If there was a way already to figure out the memory leakage from page_frag_cache in mainline, I agree your opinion > but I think we don't have it now. > > If those counters too much in my patch, > I can say two values (pgfrag_alloc and pgfrag_free) are enough to guess what will happen > and would remove pgfrag_alloc_calls and pgfrag_free_calls. > > Thanks, > Kyeongdon Kim > >> IMHO that's too much counters. >> Per-node NR_FRAGMENT_PAGES should be enough for guessing what's going on. >> Perf probes provides enough features for furhter debugging. >> >> On 01.09.2017 02:37, Kyeongdon Kim wrote: >> > There was a memory leak problem when we did stressful test >> > on Android device. >> > The root cause of this was from page_frag_cache alloc >> > and it was very hard to find out. >> > >> > We add to count the page frag allocation and free with function call. >> > The gap between pgfrag_alloc and pgfrag_free is good to to calculate >> > for the amount of page. >> > The gap between pgfrag_alloc_calls and pgfrag_free_calls is for >> > sub-indicator. >> > They can see trends of memory usage during the test. >> > Without it, it's difficult to check page frag usage so I believe we >> > should add it. >> > >> > Signed-off-by: Kyeongdon Kim <kyeongdon@lge.com> >> > ---
Re: Re: [PATCH] mm/vmstats: add counters for the page frag cache
On 2017-09-04 오후 5:30, Konstantin Khlebnikov wrote: On 04.09.2017 04:35, Kyeongdon Kim wrote: > Thanks for your reply, > But I couldn't find "NR_FRAGMENT_PAGES" in linux-next.git .. is that vmstat counter? or others? > I mean rather than adding bunch vmstat counters for operations it might be worth to add page counter which will show current amount of these pages. But this seems too low-level for tracking, common counters for all network buffers would be more useful but much harder to implement. Ok, thanks for the comment. As I can see page owner is able to save stacktrace where allocation happened, this makes debugging mostly trivial without any counters. If it adds too much overhead - just track random 1% of pages, should be enough for finding leak. As I said, we already used page owner tools to resolve the leak issue. But that's extremely difficult to figure it out, too much callstack and too much allocation orders(0 or more). We couldn't easily get a clue event if we track 1% of pages.. In fact, I was writing another email to send a new patch with debug config. We added a hash function to pick out address with callstack by using debugfs, It could be showing the only page_frag_cache leak with owner. for exmaple code : +++ /mm/page_alloc.c @@ -4371,7 +4371,9 @@ void *page_frag_alloc(struct page_frag_cache *nc, nc->pagecnt_bias--; nc->offset = offset; +#ifdef CONFIG_PGFRAG_DEBUG + page_frag_debug_alloc(nc->va + offset); +#endif return nc->va + offset; } EXPORT_SYMBOL(page_frag_alloc); @@ -4382,7 +4384,9 @@ EXPORT_SYMBOL(page_frag_alloc); void page_frag_free(void *addr) { struct page *page = virt_to_head_page(addr); +#ifdef CONFIG_PGFRAG_DEBUG + page_frag_debug_free(addr); +#endif if (unlikely(put_page_testzero(page))) Those counters that I added may be too much for the linux server or something. However, I think the other systems may need to simple debugging method. 
(like Android OS) So if you can accept the patch with debug feature, I will send it including counters. but still thinking those counters don't need. I won't. Anyway, I'm grateful for your feedback, means a lot to me. Thanks, Kyeongdon Kim > As you know, page_frag_alloc() directly calls __alloc_pages_nodemask() function, > so that makes too difficult to see memory usage in real time even though we have "/meminfo or /slabinfo.." information. > If there was a way already to figure out the memory leakage from page_frag_cache in mainline, I agree your opinion > but I think we don't have it now. > > If those counters too much in my patch, > I can say two values (pgfrag_alloc and pgfrag_free) are enough to guess what will happen > and would remove pgfrag_alloc_calls and pgfrag_free_calls. > > Thanks, > Kyeongdon Kim > >> IMHO that's too much counters. >> Per-node NR_FRAGMENT_PAGES should be enough for guessing what's going on. >> Perf probes provides enough features for furhter debugging. >> >> On 01.09.2017 02:37, Kyeongdon Kim wrote: >> > There was a memory leak problem when we did stressful test >> > on Android device. >> > The root cause of this was from page_frag_cache alloc >> > and it was very hard to find out. >> > >> > We add to count the page frag allocation and free with function call. >> > The gap between pgfrag_alloc and pgfrag_free is good to to calculate >> > for the amount of page. >> > The gap between pgfrag_alloc_calls and pgfrag_free_calls is for >> > sub-indicator. >> > They can see trends of memory usage during the test. >> > Without it, it's difficult to check page frag usage so I believe we >> > should add it. >> > >> > Signed-off-by: Kyeongdon Kim >> > ---
[PATCH] selinux: Use kmem_cache for hashtab_node
During random test as own device to check slub account, we found some slack memory from hashtab_node(kmalloc-64). By using kzalloc(), middle of test result like below: allocated size 240768 request size 45144 slack size 195624 allocation count 3762 So, we want to use kmem_cache_zalloc() and that can reduce memory size 52byte(slack size/alloc count) per each struct. Signed-off-by: Kyeongdon Kim <kyeongdon@lge.com> --- security/selinux/ss/hashtab.c | 17 +++-- security/selinux/ss/hashtab.h | 4 security/selinux/ss/services.c | 4 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/security/selinux/ss/hashtab.c b/security/selinux/ss/hashtab.c index 686c391..bef7577 100644 --- a/security/selinux/ss/hashtab.c +++ b/security/selinux/ss/hashtab.c @@ -9,6 +9,8 @@ #include #include "hashtab.h" +static struct kmem_cache *hashtab_node_cachep; + struct hashtab *hashtab_create(u32 (*hash_value)(struct hashtab *h, const void *key), int (*keycmp)(struct hashtab *h, const void *key1, const void *key2), u32 size) @@ -57,7 +59,7 @@ int hashtab_insert(struct hashtab *h, void *key, void *datum) if (cur && (h->keycmp(h, key, cur->key) == 0)) return -EEXIST; - newnode = kzalloc(sizeof(*newnode), GFP_KERNEL); + newnode = kmem_cache_zalloc(hashtab_node_cachep, GFP_KERNEL); if (!newnode) return -ENOMEM; newnode->key = key; @@ -106,7 +108,7 @@ void hashtab_destroy(struct hashtab *h) while (cur) { temp = cur; cur = cur->next; - kfree(temp); + kmem_cache_free(hashtab_node_cachep, temp); } h->htable[i] = NULL; } @@ -166,3 +168,14 @@ void hashtab_stat(struct hashtab *h, struct hashtab_info *info) info->slots_used = slots_used; info->max_chain_len = max_chain_len; } +void hashtab_cache_init(void) +{ + hashtab_node_cachep = kmem_cache_create("hashtab_node", + sizeof(struct hashtab_node), + 0, SLAB_PANIC, NULL); +} + +void hashtab_cache_destroy(void) +{ + kmem_cache_destroy(hashtab_node_cachep); +} diff --git a/security/selinux/ss/hashtab.h b/security/selinux/ss/hashtab.h index 
009fb5e..d6883d3 100644 --- a/security/selinux/ss/hashtab.h +++ b/security/selinux/ss/hashtab.h @@ -84,4 +84,8 @@ int hashtab_map(struct hashtab *h, /* Fill info with some hash table statistics */ void hashtab_stat(struct hashtab *h, struct hashtab_info *info); +/* Use kmem_cache for hashtab_node */ +void hashtab_cache_init(void); +void hashtab_cache_destroy(void); + #endif /* _SS_HASHTAB_H */ diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c index e4a1c0d..33cfe5d 100644 --- a/security/selinux/ss/services.c +++ b/security/selinux/ss/services.c @@ -2060,10 +2060,12 @@ int security_load_policy(void *data, size_t len) if (!ss_initialized) { avtab_cache_init(); ebitmap_cache_init(); + hashtab_cache_init(); rc = policydb_read(, fp); if (rc) { avtab_cache_destroy(); ebitmap_cache_destroy(); + hashtab_cache_destroy(); goto out; } @@ -2075,6 +2077,7 @@ int security_load_policy(void *data, size_t len) policydb_destroy(); avtab_cache_destroy(); ebitmap_cache_destroy(); + hashtab_cache_destroy(); goto out; } @@ -2083,6 +2086,7 @@ int security_load_policy(void *data, size_t len) policydb_destroy(); avtab_cache_destroy(); ebitmap_cache_destroy(); + hashtab_cache_destroy(); goto out; } -- 2.6.2
[PATCH] selinux: Use kmem_cache for hashtab_node
During random test as own device to check slub account, we found some slack memory from hashtab_node(kmalloc-64). By using kzalloc(), middle of test result like below: allocated size 240768 request size 45144 slack size 195624 allocation count 3762 So, we want to use kmem_cache_zalloc() and that can reduce memory size 52byte(slack size/alloc count) per each struct. Signed-off-by: Kyeongdon Kim --- security/selinux/ss/hashtab.c | 17 +++-- security/selinux/ss/hashtab.h | 4 security/selinux/ss/services.c | 4 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/security/selinux/ss/hashtab.c b/security/selinux/ss/hashtab.c index 686c391..bef7577 100644 --- a/security/selinux/ss/hashtab.c +++ b/security/selinux/ss/hashtab.c @@ -9,6 +9,8 @@ #include #include "hashtab.h" +static struct kmem_cache *hashtab_node_cachep; + struct hashtab *hashtab_create(u32 (*hash_value)(struct hashtab *h, const void *key), int (*keycmp)(struct hashtab *h, const void *key1, const void *key2), u32 size) @@ -57,7 +59,7 @@ int hashtab_insert(struct hashtab *h, void *key, void *datum) if (cur && (h->keycmp(h, key, cur->key) == 0)) return -EEXIST; - newnode = kzalloc(sizeof(*newnode), GFP_KERNEL); + newnode = kmem_cache_zalloc(hashtab_node_cachep, GFP_KERNEL); if (!newnode) return -ENOMEM; newnode->key = key; @@ -106,7 +108,7 @@ void hashtab_destroy(struct hashtab *h) while (cur) { temp = cur; cur = cur->next; - kfree(temp); + kmem_cache_free(hashtab_node_cachep, temp); } h->htable[i] = NULL; } @@ -166,3 +168,14 @@ void hashtab_stat(struct hashtab *h, struct hashtab_info *info) info->slots_used = slots_used; info->max_chain_len = max_chain_len; } +void hashtab_cache_init(void) +{ + hashtab_node_cachep = kmem_cache_create("hashtab_node", + sizeof(struct hashtab_node), + 0, SLAB_PANIC, NULL); +} + +void hashtab_cache_destroy(void) +{ + kmem_cache_destroy(hashtab_node_cachep); +} diff --git a/security/selinux/ss/hashtab.h b/security/selinux/ss/hashtab.h index 009fb5e..d6883d3 100644 --- 
a/security/selinux/ss/hashtab.h +++ b/security/selinux/ss/hashtab.h @@ -84,4 +84,8 @@ int hashtab_map(struct hashtab *h, /* Fill info with some hash table statistics */ void hashtab_stat(struct hashtab *h, struct hashtab_info *info); +/* Use kmem_cache for hashtab_node */ +void hashtab_cache_init(void); +void hashtab_cache_destroy(void); + #endif /* _SS_HASHTAB_H */ diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c index e4a1c0d..33cfe5d 100644 --- a/security/selinux/ss/services.c +++ b/security/selinux/ss/services.c @@ -2060,10 +2060,12 @@ int security_load_policy(void *data, size_t len) if (!ss_initialized) { avtab_cache_init(); ebitmap_cache_init(); + hashtab_cache_init(); rc = policydb_read(, fp); if (rc) { avtab_cache_destroy(); ebitmap_cache_destroy(); + hashtab_cache_destroy(); goto out; } @@ -2075,6 +2077,7 @@ int security_load_policy(void *data, size_t len) policydb_destroy(); avtab_cache_destroy(); ebitmap_cache_destroy(); + hashtab_cache_destroy(); goto out; } @@ -2083,6 +2086,7 @@ int security_load_policy(void *data, size_t len) policydb_destroy(); avtab_cache_destroy(); ebitmap_cache_destroy(); + hashtab_cache_destroy(); goto out; } -- 2.6.2
Re: Re: [PATCH] mm/vmstats: add counters for the page frag cache
Thanks for your reply, We have already used other interfaces like page_owner and kmemleak to resolve memory leakage issues. But page_owner only allows guessing and cannot intuitively show memory usage regarding page_frag_cache. And kmemleak cannot be used (because __alloc_pages_nodemask() is called directly). Additionally, some embedded Linux systems like Android are not always able to use kmemleak & page_owner because of runtime performance deterioration. However, the root cause of this memory issue is a net device like wireless. In short, wireless must always be in use on the device, but those memory debug tools cannot be used. That's why those counters are needed, and to make it much cheaper I can remove pgfrag_alloc_calls and pgfrag_free_calls. Thanks, Kyeongdon Kim On 2017-09-01 오후 6:21, Michal Hocko wrote: On Fri 01-09-17 12:12:36, Konstantin Khlebnikov wrote: > IMHO that's too much counters. > Per-node NR_FRAGMENT_PAGES should be enough for guessing what's going on. > Perf probes provides enough features for furhter debugging. I would tend to agree. Adding a counter based on a single debugging instance sounds like an overkill to me. Counters should be pretty cheep but this is way too specialized API to export to the userspace. We have other interfaces to debug memory leaks like page_owner. -- Michal Hocko SUSE Labs
Re: Re: [PATCH] mm/vmstats: add counters for the page frag cache
Thanks for your reply, We have already used other interfaces like page_owner and kmemleak to resolve memory leakage issues. But page_owner only allows guessing and cannot intuitively show memory usage regarding page_frag_cache. And kmemleak cannot be used (because __alloc_pages_nodemask() is called directly). Additionally, some embedded Linux systems like Android are not always able to use kmemleak & page_owner because of runtime performance deterioration. However, the root cause of this memory issue is a net device like wireless. In short, wireless must always be in use on the device, but those memory debug tools cannot be used. That's why those counters are needed, and to make it much cheaper I can remove pgfrag_alloc_calls and pgfrag_free_calls. Thanks, Kyeongdon Kim On 2017-09-01 오후 6:21, Michal Hocko wrote: On Fri 01-09-17 12:12:36, Konstantin Khlebnikov wrote: > IMHO that's too much counters. > Per-node NR_FRAGMENT_PAGES should be enough for guessing what's going on. > Perf probes provides enough features for furhter debugging. I would tend to agree. Adding a counter based on a single debugging instance sounds like an overkill to me. Counters should be pretty cheep but this is way too specialized API to export to the userspace. We have other interfaces to debug memory leaks like page_owner. -- Michal Hocko SUSE Labs
Re: Re: [PATCH] mm/vmstats: add counters for the page frag cache
Thanks for your reply, But I couldn't find "NR_FRAGMENT_PAGES" in linux-next.git .. is that a vmstat counter? or something else? As you know, page_frag_alloc() directly calls the __alloc_pages_nodemask() function, so that makes it too difficult to see memory usage in real time even though we have "/meminfo or /slabinfo.." information. If there were already a way to figure out the memory leakage from page_frag_cache in mainline, I would agree with your opinion, but I think we don't have one now. If those counters are too much in my patch, I can say two values (pgfrag_alloc and pgfrag_free) are enough to guess what will happen and would remove pgfrag_alloc_calls and pgfrag_free_calls. Thanks, Kyeongdon Kim On 2017-09-01 오후 6:12, Konstantin Khlebnikov wrote: IMHO that's too much counters. Per-node NR_FRAGMENT_PAGES should be enough for guessing what's going on. Perf probes provides enough features for furhter debugging. On 01.09.2017 02:37, Kyeongdon Kim wrote: > There was a memory leak problem when we did stressful test > on Android device. > The root cause of this was from page_frag_cache alloc > and it was very hard to find out. > > We add to count the page frag allocation and free with function call. > The gap between pgfrag_alloc and pgfrag_free is good to to calculate > for the amount of page. > The gap between pgfrag_alloc_calls and pgfrag_free_calls is for > sub-indicator. > They can see trends of memory usage during the test. > Without it, it's difficult to check page frag usage so I believe we > should add it. > > Signed-off-by: Kyeongdon Kim <kyeongdon@lge.com> > ---
Re: Re: [PATCH] mm/vmstats: add counters for the page frag cache
Thanks for your reply, But I couldn't find "NR_FRAGMENT_PAGES" in linux-next.git .. is that a vmstat counter? or something else? As you know, page_frag_alloc() directly calls the __alloc_pages_nodemask() function, so that makes it too difficult to see memory usage in real time even though we have "/meminfo or /slabinfo.." information. If there were already a way to figure out the memory leakage from page_frag_cache in mainline, I would agree with your opinion, but I think we don't have one now. If those counters are too much in my patch, I can say two values (pgfrag_alloc and pgfrag_free) are enough to guess what will happen and would remove pgfrag_alloc_calls and pgfrag_free_calls. Thanks, Kyeongdon Kim On 2017-09-01 오후 6:12, Konstantin Khlebnikov wrote: IMHO that's too much counters. Per-node NR_FRAGMENT_PAGES should be enough for guessing what's going on. Perf probes provides enough features for furhter debugging. On 01.09.2017 02:37, Kyeongdon Kim wrote: > There was a memory leak problem when we did stressful test > on Android device. > The root cause of this was from page_frag_cache alloc > and it was very hard to find out. > > We add to count the page frag allocation and free with function call. > The gap between pgfrag_alloc and pgfrag_free is good to to calculate > for the amount of page. > The gap between pgfrag_alloc_calls and pgfrag_free_calls is for > sub-indicator. > They can see trends of memory usage during the test. > Without it, it's difficult to check page frag usage so I believe we > should add it. > > Signed-off-by: Kyeongdon Kim > ---
[PATCH] mm/vmstats: add counters for the page frag cache
There was a memory leak problem when we did stressful test on Android device. The root cause of this was from page_frag_cache alloc and it was very hard to find out. We add to count the page frag allocation and free with function call. The gap between pgfrag_alloc and pgfrag_free is good to to calculate for the amount of page. The gap between pgfrag_alloc_calls and pgfrag_free_calls is for sub-indicator. They can see trends of memory usage during the test. Without it, it's difficult to check page frag usage so I believe we should add it. Signed-off-by: Kyeongdon Kim <kyeongdon@lge.com> --- include/linux/vm_event_item.h | 4 mm/page_alloc.c | 9 +++-- mm/vmstat.c | 4 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index d77bc35..75425d4 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -110,6 +110,10 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, SWAP_RA, SWAP_RA_HIT, #endif + PGFRAG_ALLOC, + PGFRAG_FREE, + PGFRAG_ALLOC_CALLS, + PGFRAG_FREE_CALLS, NR_VM_EVENT_ITEMS }; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index db2d25f..b3ddd76 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4322,6 +4322,7 @@ void __page_frag_cache_drain(struct page *page, unsigned int count) free_hot_cold_page(page, false); else __free_pages_ok(page, order); + __count_vm_events(PGFRAG_FREE, 1 << order); } } EXPORT_SYMBOL(__page_frag_cache_drain); @@ -4338,7 +4339,7 @@ void *page_frag_alloc(struct page_frag_cache *nc, page = __page_frag_cache_refill(nc, gfp_mask); if (!page) return NULL; - + __count_vm_events(PGFRAG_ALLOC, 1 << compound_order(page)); #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) /* if size can vary use size else just use PAGE_SIZE */ size = nc->size; @@ -4375,6 +4376,7 @@ void *page_frag_alloc(struct page_frag_cache *nc, nc->pagecnt_bias--; nc->offset = offset; + __count_vm_event(PGFRAG_ALLOC_CALLS); return nc->va + offset; } @@ -4387,8 +4389,11 @@ void 
page_frag_free(void *addr) { struct page *page = virt_to_head_page(addr); - if (unlikely(put_page_testzero(page))) + if (unlikely(put_page_testzero(page))) { + __count_vm_events(PGFRAG_FREE, 1 << compound_order(page)); __free_pages_ok(page, compound_order(page)); + } + __count_vm_event(PGFRAG_FREE_CALLS); } EXPORT_SYMBOL(page_frag_free); diff --git a/mm/vmstat.c b/mm/vmstat.c index 4bb13e7..c00fe05 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1217,6 +1217,10 @@ const char * const vmstat_text[] = { "swap_ra", "swap_ra_hit", #endif + "pgfrag_alloc", + "pgfrag_free", + "pgfrag_alloc_calls", + "pgfrag_free_calls", #endif /* CONFIG_VM_EVENTS_COUNTERS */ }; #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */ -- 2.6.2
[PATCH] mm/vmstats: add counters for the page frag cache
There was a memory leak problem when we did stressful test on Android device. The root cause of this was from page_frag_cache alloc and it was very hard to find out. We add to count the page frag allocation and free with function call. The gap between pgfrag_alloc and pgfrag_free is good to to calculate for the amount of page. The gap between pgfrag_alloc_calls and pgfrag_free_calls is for sub-indicator. They can see trends of memory usage during the test. Without it, it's difficult to check page frag usage so I believe we should add it. Signed-off-by: Kyeongdon Kim --- include/linux/vm_event_item.h | 4 mm/page_alloc.c | 9 +++-- mm/vmstat.c | 4 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index d77bc35..75425d4 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -110,6 +110,10 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, SWAP_RA, SWAP_RA_HIT, #endif + PGFRAG_ALLOC, + PGFRAG_FREE, + PGFRAG_ALLOC_CALLS, + PGFRAG_FREE_CALLS, NR_VM_EVENT_ITEMS }; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index db2d25f..b3ddd76 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4322,6 +4322,7 @@ void __page_frag_cache_drain(struct page *page, unsigned int count) free_hot_cold_page(page, false); else __free_pages_ok(page, order); + __count_vm_events(PGFRAG_FREE, 1 << order); } } EXPORT_SYMBOL(__page_frag_cache_drain); @@ -4338,7 +4339,7 @@ void *page_frag_alloc(struct page_frag_cache *nc, page = __page_frag_cache_refill(nc, gfp_mask); if (!page) return NULL; - + __count_vm_events(PGFRAG_ALLOC, 1 << compound_order(page)); #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) /* if size can vary use size else just use PAGE_SIZE */ size = nc->size; @@ -4375,6 +4376,7 @@ void *page_frag_alloc(struct page_frag_cache *nc, nc->pagecnt_bias--; nc->offset = offset; + __count_vm_event(PGFRAG_ALLOC_CALLS); return nc->va + offset; } @@ -4387,8 +4389,11 @@ void page_frag_free(void 
*addr) { struct page *page = virt_to_head_page(addr); - if (unlikely(put_page_testzero(page))) + if (unlikely(put_page_testzero(page))) { + __count_vm_events(PGFRAG_FREE, 1 << compound_order(page)); __free_pages_ok(page, compound_order(page)); + } + __count_vm_event(PGFRAG_FREE_CALLS); } EXPORT_SYMBOL(page_frag_free); diff --git a/mm/vmstat.c b/mm/vmstat.c index 4bb13e7..c00fe05 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1217,6 +1217,10 @@ const char * const vmstat_text[] = { "swap_ra", "swap_ra_hit", #endif + "pgfrag_alloc", + "pgfrag_free", + "pgfrag_alloc_calls", + "pgfrag_free_calls", #endif /* CONFIG_VM_EVENTS_COUNTERS */ }; #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */ -- 2.6.2
Re: Re: [PATCH] zram: revive swap_slot_free_notify
sis->bdev, >> - offset); >> - } >> - } >> - } >> - >> + swap_slot_free_notify(page); >> out: >> unlock_page(page); >> bio_put(bio); >> @@ -347,6 +353,7 @@ int swap_readpage(struct page *page) >> >> ret = bdev_read_page(sis->bdev, swap_page_sector(page), page); >> if (!ret) { >> + swap_slot_free_notify(page); >> count_vm_event(PSWPIN); >> return 0; >> } >> -- >> 1.9.1 >> > > Kyeongdon, Could you try this patch? > > From 09497ba48f5621cd2737f72ac4d15e6b3e5e3d14 Mon Sep 17 00:00:00 2001 > From: Minchan Kim <minc...@kernel.org> > Date: Mon, 28 Mar 2016 23:08:14 +0900 > Subject: [PATCH] mm: call swap_slot_free_notify with holding page lock > > Kyeongdon reported below error which is BUG_ON(!PageSwapCache(page)) > in page_swap_info. > The reason is that page_endio in rw_page unlocks the page if read I/O > is completed so we need to hold a PG_lock again to check PageSwapCache. > Otherwise, the page can be removed from swapcache and trigger below > BUG_ON. > > [27833.995833] [ cut here ] > [27833.995853] Kernel BUG at c00f9040 [verbose debug info unavailable] > [27833.995865] Internal error: Oops - BUG: 0 [#1] PREEMPT SMP ARM > [27833.995876] Modules linked in: > [27833.995892] CPU: 4 PID: 13446 Comm: RenderThread Tainted: G W > 3.10.84-g9f14aec-dirty #73 > [27833.995903] task: c3b73200 ti: dd192000 task.ti: dd192000 > [27833.995922] PC is at page_swap_info+0x10/0x2c > [27833.995934] LR is at swap_slot_free_notify+0x18/0x6c > [27833.995946] pc : [] lr : [] psr: 400f0113 > [27833.995946] sp : dd193d78 ip : c2deb1e4 fp : da015180 > [27833.995959] r10: r9 : 000200da r8 : c120fe08 > [27833.995968] r7 : r6 : r5 : c249a6c0 r4 : = > c249a6c0 > [27833.995979] r3 : r2 : 40080009 r1 : 200f0113 r0 : = > c249a6c0 > .. 
> [27833.996273] [] (page_swap_info+0x10/0x2c) from [] > (swap_slot_free_notify+0x18/0x6c) > [27833.996288] [] (swap_slot_free_notify+0x18/0x6c) from > [] (swap_readpage+0x90/0x11c) > [27833.996302] [] (swap_readpage+0x90/0x11c) from [] > (read_swap_cache_async+0x134/0x1ac) > [27833.996317] [] (read_swap_cache_async+0x134/0x1ac) from > [] (swapin_readahead+0x70/0xb0) > [27833.996334] [] (swapin_readahead+0x70/0xb0) from = > [] > (handle_pte_fault+0x320/0x6fc) > [27833.996348] [] (handle_pte_fault+0x320/0x6fc) from > [] (handle_mm_fault+0xc0/0xf0) > [27833.996363] [] (handle_mm_fault+0xc0/0xf0) from = > [] > (do_page_fault+0x11c/0x36c) > [27833.996378] [] (do_page_fault+0x11c/0x36c) from = > [] > (do_DataAbort+0x34/0x118) > [27833.996392] [] (do_DataAbort+0x34/0x118) from [] > (__dabt_usr+0x34/0x40) > > Reported-by: Kyeongdon Kim <kyeongdon@lge.com> > Signed-off-by: Minchan Kim <minc...@kernel.org> I tested this patch for 48 hours and can't reproduce the problem. Tested-by: Kyeongdon Kim <kyeongdon@lge.com> Thanks.
Re: Re: [PATCH] zram: revive swap_slot_free_notify
>> - } >> - } >> - } >> - >> + swap_slot_free_notify(page); >> out: >> unlock_page(page); >> bio_put(bio); >> @@ -347,6 +353,7 @@ int swap_readpage(struct page *page) >> >> ret = bdev_read_page(sis->bdev, swap_page_sector(page), page); >> if (!ret) { >> + swap_slot_free_notify(page); >> count_vm_event(PSWPIN); >> return 0; >> } >> -- >> 1.9.1 >> > > Kyeongdon, Could you try this patch? > > From 09497ba48f5621cd2737f72ac4d15e6b3e5e3d14 Mon Sep 17 00:00:00 2001 > From: Minchan Kim > Date: Mon, 28 Mar 2016 23:08:14 +0900 > Subject: [PATCH] mm: call swap_slot_free_notify with holding page lock > > Kyeongdon reported below error which is BUG_ON(!PageSwapCache(page)) > in page_swap_info. > The reason is that page_endio in rw_page unlocks the page if read I/O > is completed so we need to hold a PG_lock again to check PageSwapCache. > Otherwise, the page can be removed from swapcache and trigger below > BUG_ON. > > [27833.995833] [ cut here ] > [27833.995853] Kernel BUG at c00f9040 [verbose debug info unavailable] > [27833.995865] Internal error: Oops - BUG: 0 [#1] PREEMPT SMP ARM > [27833.995876] Modules linked in: > [27833.995892] CPU: 4 PID: 13446 Comm: RenderThread Tainted: G W > 3.10.84-g9f14aec-dirty #73 > [27833.995903] task: c3b73200 ti: dd192000 task.ti: dd192000 > [27833.995922] PC is at page_swap_info+0x10/0x2c > [27833.995934] LR is at swap_slot_free_notify+0x18/0x6c > [27833.995946] pc : [] lr : [] psr: 400f0113 > [27833.995946] sp : dd193d78 ip : c2deb1e4 fp : da015180 > [27833.995959] r10: r9 : 000200da r8 : c120fe08 > [27833.995968] r7 : r6 : r5 : c249a6c0 r4 : = > c249a6c0 > [27833.995979] r3 : r2 : 40080009 r1 : 200f0113 r0 : = > c249a6c0 > .. 
> [27833.996273] [] (page_swap_info+0x10/0x2c) from [] > (swap_slot_free_notify+0x18/0x6c) > [27833.996288] [] (swap_slot_free_notify+0x18/0x6c) from > [] (swap_readpage+0x90/0x11c) > [27833.996302] [] (swap_readpage+0x90/0x11c) from [] > (read_swap_cache_async+0x134/0x1ac) > [27833.996317] [] (read_swap_cache_async+0x134/0x1ac) from > [] (swapin_readahead+0x70/0xb0) > [27833.996334] [] (swapin_readahead+0x70/0xb0) from = > [] > (handle_pte_fault+0x320/0x6fc) > [27833.996348] [] (handle_pte_fault+0x320/0x6fc) from > [] (handle_mm_fault+0xc0/0xf0) > [27833.996363] [] (handle_mm_fault+0xc0/0xf0) from = > [] > (do_page_fault+0x11c/0x36c) > [27833.996378] [] (do_page_fault+0x11c/0x36c) from = > [] > (do_DataAbort+0x34/0x118) > [27833.996392] [] (do_DataAbort+0x34/0x118) from [] > (__dabt_usr+0x34/0x40) > > Reported-by: Kyeongdon Kim > Signed-off-by: Minchan Kim I tested this patch for 48 hours and can't reproduce the problem. Tested-by: Kyeongdon Kim Thanks.
Re: Re: [PATCH v4 2/2] zram: try vmalloc() after kmalloc()
On 2015-12-18 오전 9:28, Sergey Senozhatsky wrote: > On (12/01/15 21:36), Sergey Senozhatsky wrote: >> When we're using LZ4 multi compression streams for zram swap, we found > out >> page allocation failure message in system running test. That was not only >> once, but a few(2 - 5 times per test). Also, some failure cases were >> continually occurring to try allocation order 3. >> >> In order to make parallel compression private data, we should call >> kzalloc() with order 2/3 in runtime(lzo/lz4). But if there is no order >> 2/3 size memory to allocate in that time, page allocation fails. This >> patch makes to use vmalloc() as fallback of kmalloc(), this prevents page >> alloc failure warning. >> >> After using this, we never found warning message in running test, also It >> could reduce process startup latency about 60-120ms in each case. >> > > Hello Kyeongdon, > > just to make sure, the patch works fine for you and we can move > forward and Cc -stable. correct? > > -ss > Hello Sergey, I was on vacation so I checked your email a moment ago, sorry about it. We're using this patch set. and we couldn't find any issue from this one until now. 
Thanks, Kyeongdon Kim >> For reference a call trace : >> >> Binder_1: page allocation failure: order:3, mode:0x10c0d0 >> CPU: 0 PID: 424 Comm: Binder_1 Tainted: GW 3.10.49-perf-g991d02b-dirty > #20 >> Call trace: >> [] dump_backtrace+0x0/0x270 >> [] show_stack+0x10/0x1c >> [] dump_stack+0x1c/0x28 >> [] warn_alloc_failed+0xfc/0x11c >> [] __alloc_pages_nodemask+0x724/0x7f0 >> [] __get_free_pages+0x14/0x5c >> [] kmalloc_order_trace+0x38/0xd8 >> [] zcomp_lz4_create+0x2c/0x38 >> [] zcomp_strm_alloc+0x34/0x78 >> [] zcomp_strm_multi_find+0x124/0x1ec >> [] zcomp_strm_find+0xc/0x18 >> [] zram_bvec_rw+0x2fc/0x780 >> [] zram_make_request+0x25c/0x2d4 >> [] generic_make_request+0x80/0xbc >> [] submit_bio+0xa4/0x15c >> [] __swap_writepage+0x218/0x230 >> [] swap_writepage+0x3c/0x4c >> [] shrink_page_list+0x51c/0x8d0 >> [] shrink_inactive_list+0x3f8/0x60c >> [] shrink_lruvec+0x33c/0x4cc >> [] shrink_zone+0x3c/0x100 >> [] try_to_free_pages+0x2b8/0x54c >> [] __alloc_pages_nodemask+0x514/0x7f0 >> [] __get_free_pages+0x14/0x5c >> [] proc_info_read+0x50/0xe4 >> [] vfs_read+0xa0/0x12c >> [] SyS_read+0x44/0x74 >> DMA: 3397*4kB (MC) 26*8kB (RC) 0*16kB 0*32kB 0*64kB 0*128kB 0*256kB >> 0*512kB 0*1024kB 0*2048kB 0*4096kB = 13796kB >> >> [minc...@kernel.org: change vmalloc gfp and adding comment about gfp] >> [sergey.senozhat...@gmail.com: tweak comments and styles] >> Signed-off-by: Kyeongdon Kim >> Signed-off-by: Minchan Kim >> Acked-by: Sergey Senozhatsky >> --- >> drivers/block/zram/zcomp_lz4.c | 23 +-- >> drivers/block/zram/zcomp_lzo.c | 23 +-- >> 2 files changed, 42 insertions(+), 4 deletions(-) >> >> diff --git a/drivers/block/zram/zcomp_lz4.c > b/drivers/block/zram/zcomp_lz4.c >> index ee44b51..dd60831 100644 >> --- a/drivers/block/zram/zcomp_lz4.c >> +++ b/drivers/block/zram/zcomp_lz4.c >> @@ -10,17 +10,36 @@ >> #include >> #include >> #include >> +#include >> +#include >> >> #include "zcomp_lz4.h" >> >> static void *zcomp_lz4_create(void) >> { >> - return kzalloc(LZ4_MEM_COMPRESS, 
GFP_NOIO); >> + void *ret; >> + >> + /* >> + * This function can be called in swapout/fs write path >> + * so we can't use GFP_FS|IO. And it assumes we already >> + * have at least one stream in zram initialization so we >> + * don't do best effort to allocate more stream in here. >> + * A default stream will work well without further multiple >> + * streams. That's why we use NORETRY | NOWARN. >> + */ >> + ret = kzalloc(LZ4_MEM_COMPRESS, GFP_NOIO | __GFP_NORETRY | >> + __GFP_NOWARN); >> + if (!ret) >> + ret = __vmalloc(LZ4_MEM_COMPRESS, >> + GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN | >> + __GFP_ZERO | __GFP_HIGHMEM, >> + PAGE_KERNEL); >> + return ret; >> } >> >> static void zcomp_lz4_destroy(void *private) >> { >> - kfree(private); >> + kvfree(private); >> } >> >> static int zcomp_lz4_compress(const unsigned char *src, unsigned char > *dst, >> diff --git a/drivers/block/zram/zcomp_lzo.c > b/drivers/block/zram/zcomp_lzo.c >> index 683ce04..edc5499 100644 >> --- a/drivers/block/zram/zcomp_lzo.c >> +++ b/drivers/block/zram/zcomp_lzo.c >> @@ -10,17 +10,36 @@ >> #
Re: Re: [PATCH v4 2/2] zram: try vmalloc() after kmalloc()
On 2015-12-18 오전 9:28, Sergey Senozhatsky wrote: > On (12/01/15 21:36), Sergey Senozhatsky wrote: >> When we're using LZ4 multi compression streams for zram swap, we found > out >> page allocation failure message in system running test. That was not only >> once, but a few(2 - 5 times per test). Also, some failure cases were >> continually occurring to try allocation order 3. >> >> In order to make parallel compression private data, we should call >> kzalloc() with order 2/3 in runtime(lzo/lz4). But if there is no order >> 2/3 size memory to allocate in that time, page allocation fails. This >> patch makes to use vmalloc() as fallback of kmalloc(), this prevents page >> alloc failure warning. >> >> After using this, we never found warning message in running test, also It >> could reduce process startup latency about 60-120ms in each case. >> > > Hello Kyeongdon, > > just to make sure, the patch works fine for you and we can move > forward and Cc -stable. correct? > > -ss > Hello Sergey, I was on vacation so I checked your email a moment ago, sorry about it. We're using this patch set. and we couldn't find any issue from this one until now. 
Thanks, Kyeongdon Kim >> For reference a call trace : >> >> Binder_1: page allocation failure: order:3, mode:0x10c0d0 >> CPU: 0 PID: 424 Comm: Binder_1 Tainted: GW 3.10.49-perf-g991d02b-dirty > #20 >> Call trace: >> [] dump_backtrace+0x0/0x270 >> [] show_stack+0x10/0x1c >> [] dump_stack+0x1c/0x28 >> [] warn_alloc_failed+0xfc/0x11c >> [] __alloc_pages_nodemask+0x724/0x7f0 >> [] __get_free_pages+0x14/0x5c >> [] kmalloc_order_trace+0x38/0xd8 >> [] zcomp_lz4_create+0x2c/0x38 >> [] zcomp_strm_alloc+0x34/0x78 >> [] zcomp_strm_multi_find+0x124/0x1ec >> [] zcomp_strm_find+0xc/0x18 >> [] zram_bvec_rw+0x2fc/0x780 >> [] zram_make_request+0x25c/0x2d4 >> [] generic_make_request+0x80/0xbc >> [] submit_bio+0xa4/0x15c >> [] __swap_writepage+0x218/0x230 >> [] swap_writepage+0x3c/0x4c >> [] shrink_page_list+0x51c/0x8d0 >> [] shrink_inactive_list+0x3f8/0x60c >> [] shrink_lruvec+0x33c/0x4cc >> [] shrink_zone+0x3c/0x100 >> [] try_to_free_pages+0x2b8/0x54c >> [] __alloc_pages_nodemask+0x514/0x7f0 >> [] __get_free_pages+0x14/0x5c >> [] proc_info_read+0x50/0xe4 >> [] vfs_read+0xa0/0x12c >> [] SyS_read+0x44/0x74 >> DMA: 3397*4kB (MC) 26*8kB (RC) 0*16kB 0*32kB 0*64kB 0*128kB 0*256kB >> 0*512kB 0*1024kB 0*2048kB 0*4096kB = 13796kB >> >> [minc...@kernel.org: change vmalloc gfp and adding comment about gfp] >> [sergey.senozhat...@gmail.com: tweak comments and styles] >> Signed-off-by: Kyeongdon Kim <kyeongdon@lge.com> >> Signed-off-by: Minchan Kim <minc...@kernel.org> >> Acked-by: Sergey Senozhatsky <sergey.senozhat...@gmail.com> >> --- >> drivers/block/zram/zcomp_lz4.c | 23 +-- >> drivers/block/zram/zcomp_lzo.c | 23 +-- >> 2 files changed, 42 insertions(+), 4 deletions(-) >> >> diff --git a/drivers/block/zram/zcomp_lz4.c > b/drivers/block/zram/zcomp_lz4.c >> index ee44b51..dd60831 100644 >> --- a/drivers/block/zram/zcomp_lz4.c >> +++ b/drivers/block/zram/zcomp_lz4.c >> @@ -10,17 +10,36 @@ >> #include >> #include >> #include >> +#include >> +#include >> >> #include "zcomp_lz4.h" >> >> static 
void *zcomp_lz4_create(void) >> { >> - return kzalloc(LZ4_MEM_COMPRESS, GFP_NOIO); >> + void *ret; >> + >> + /* >> + * This function can be called in swapout/fs write path >> + * so we can't use GFP_FS|IO. And it assumes we already >> + * have at least one stream in zram initialization so we >> + * don't do best effort to allocate more stream in here. >> + * A default stream will work well without further multiple >> + * streams. That's why we use NORETRY | NOWARN. >> + */ >> + ret = kzalloc(LZ4_MEM_COMPRESS, GFP_NOIO | __GFP_NORETRY | >> + __GFP_NOWARN); >> + if (!ret) >> + ret = __vmalloc(LZ4_MEM_COMPRESS, >> + GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN | >> + __GFP_ZERO | __GFP_HIGHMEM, >> + PAGE_KERNEL); >> + return ret; >> } >> >> static void zcomp_lz4_destroy(void *private) >> { >> - kfree(private); >> + kvfree(private); >> } >> >> static int zcomp_lz4_compress(const unsigned char *src, unsigned char > *dst, >> diff --git a/drivers/block/zram/zcomp_lzo.c > b/drivers/block/zram/zcomp_lzo.c >> index 683ce04..edc5499 100644 >> --- a/drivers/block/zram/zcomp
Re: Re: [PATCH v3 2/2] zram: try vmalloc() after kmalloc()
On 2015-12-01 오후 2:16, Sergey Senozhatsky wrote: > On (12/01/15 13:55), Minchan Kim wrote: > [..] >> To clear my opinion, >> >> lzo_create(gfp_t flags) >> { >> void * ret = kmalloc(LZO1X_MEM_COMPRESS, flags); >> if (!ret) >> ret = vmalloc(LZO1X_MEM_COMPRESS, flasgs | GFP_NOMEMALLOC); >> return ret; >> } > > ah, ok, I see. I've a question. > > we had > kmalloc(f | __GFP_NOMEMALLOC) > __vmalloc(f | __GFP_NOMEMALLOC) > > which produced high failure rates for both kmalloc() and __vmalloc() > > test #1 > >> > > log message : > [..] >> > > [ 352.230608][0] zcomp_lz4_create: 32: ret = (null) >> > > [ 352.230619][0] zcomp_lz4_create: 38: ret = (null) >> > > [ 352.230888][0] zcomp_lz4_create: 32: ret = (null) >> > > [ 352.230902][0] zcomp_lz4_create: 38: ret = (null) >> > > [ 352.231406][0] zcomp_lz4_create: 32: ret = ffc002088000 >> > > [ 352.234024][0] zcomp_lz4_create: 32: ret = (null) >> > > [ 352.234060][0] zcomp_lz4_create: 38: ret = (null) >> > > [ 352.234359][0] zcomp_lz4_create: 32: ret = (null) > [..] >> > > [ 352.234384][0] zcomp_lz4_create: 38: ret = (null) >> > > [ 352.234618][0] zcomp_lz4_create: 32: ret = (null) >> > > [ 352.234639][0] zcomp_lz4_create: 38: ret = (null) >> > > [ 352.234667][0] zcomp_lz4_create: 32: ret = (null) >> > > [ 352.235179][0] zcomp_lz4_create: 38: ret = ff80016a4000 > > > > Kyeongdon, do I understand correctly, that for the second test you > removed '__GFP_NOMEMALLOC' from both kmalloc() and __vmalloc()? 
> > iow: > kmalloc(f & ~__GFP_NOMEMALLOC) > vmalloc(f & ~__GFP_NOMEMALLOC) > > test #2 : almost always failing kmalloc() and !NULL __vmalloc() > >> > > log message : >> > > <4>[ 2288.954934][0] KDKIM: zcomp_lz4_create: 24: ret = (null) >> > > <4>[ 2288.954972][0] KDKIM: zcomp_lz4_create: 30: ret = > ff800287e000 >> > > >> > > <4>[ 2289.092411][0] KDKIM: zcomp_lz4_create: 24: ret = (null) >> > > <4>[ 2289.092546][0] KDKIM: zcomp_lz4_create: 30: ret = > ff80028b5000 >> > > >> > > <4>[ 2289.135628][0] KDKIM: zcomp_lz4_create: 24: ret = (null) >> > > <4>[ 2289.135642][0] KDKIM: zcomp_lz4_create: 24: ret = (null) >> > > <4>[ 2289.135729][0] KDKIM: zcomp_lz4_create: 30: ret = > ff80028be000 >> > > <4>[ 2289.135732][0] KDKIM: zcomp_lz4_create: 30: ret = > ff80028c7000 > > > if this is the case (__GFP_NOMEMALLOC removed from both kmalloc and > __vmalloc), > then proposed > > kmalloc(f & ~__GFP_NOMEMALLOC) > __vmalloc(f | __GFP_NOMEMALLOC) > > > can be very close to 'test #1 && test #2': > > kmalloc() fails (as in test #2) > __vmalloc() fails (as in test #1) > > isn't it? > > -ss Let me give you a simple code of it. 
@test #1 (previous shared log) kmalloc(f | __GFP_NOMEMALLOC) __vmalloc(f | __GFP_NOMEMALLOC) // can find failure both @test #2 (previous shared log) kmalloc(f | __GFP_NOMEMALLOC) __vmalloc(f) // removed '__GFP_NOMEMALLOC' from vmalloc() only, and cannot find failure from vmalloc() And like you said, I made a quick check to see a failure about kmalloc() without the flag : @test #3 kmalloc(f) __vmalloc(f | __GFP_NOMEMALLOC) // removed '__GFP_NOMEMALLOC' from zmalloc() only // and cannot find failure from zmalloc(), but in this case, it's hard to find failure from vmalloc() because of already allocation mostly from zsmalloc() log message (test #3) : <4>[ 186.763605][1] KDKIM: zcomp_lz4_create: 24: ret = ffc00203 <4>[ 186.776652][1] KDKIM: zcomp_lz4_create: 24: ret = ffc0020f <4>[ 186.811423][1] KDKIM: zcomp_lz4_create: 24: ret = ffc002108000 <4>[ 186.816744][1] KDKIM: zcomp_lz4_create: 24: ret = ffc00200 <4>[ 186.816796][1] KDKIM: zcomp_lz4_create: 24: ret = ffc002008000 @test #4 kmalloc(f) __vmalloc(f) // cannot find failure both until now log message (test #4) : <4>[ 641.440468][7] KDKIM: zcomp_lz4_create: 24: ret = ffc00219 <4>[ 922.182980][7] KDKIM: zcomp_lz4_create: 24: ret = ffc002208000 <4>[ 923.197593][7] KDKIM: zcomp_lz4_create: 24: ret = ffc00202 <4>[ 939.813499][7] KDKIM: zcomp_lz4_create: 24: ret = ffc0020a So,is there another problem if we remove the flag from both sides? Thanks, Kyeongdon Kim -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: Re: [PATCH v3 2/2] zram: try vmalloc() after kmalloc()
On 2015-12-01 오후 2:16, Sergey Senozhatsky wrote: > On (12/01/15 13:55), Minchan Kim wrote: > [..] >> To clear my opinion, >> >> lzo_create(gfp_t flags) >> { >> void * ret = kmalloc(LZO1X_MEM_COMPRESS, flags); >> if (!ret) >> ret = vmalloc(LZO1X_MEM_COMPRESS, flasgs | GFP_NOMEMALLOC); >> return ret; >> } > > ah, ok, I see. I've a question. > > we had > kmalloc(f | __GFP_NOMEMALLOC) > __vmalloc(f | __GFP_NOMEMALLOC) > > which produced high failure rates for both kmalloc() and __vmalloc() > > test #1 > >> > > log message : > [..] >> > > [ 352.230608][0] zcomp_lz4_create: 32: ret = (null) >> > > [ 352.230619][0] zcomp_lz4_create: 38: ret = (null) >> > > [ 352.230888][0] zcomp_lz4_create: 32: ret = (null) >> > > [ 352.230902][0] zcomp_lz4_create: 38: ret = (null) >> > > [ 352.231406][0] zcomp_lz4_create: 32: ret = ffc002088000 >> > > [ 352.234024][0] zcomp_lz4_create: 32: ret = (null) >> > > [ 352.234060][0] zcomp_lz4_create: 38: ret = (null) >> > > [ 352.234359][0] zcomp_lz4_create: 32: ret = (null) > [..] >> > > [ 352.234384][0] zcomp_lz4_create: 38: ret = (null) >> > > [ 352.234618][0] zcomp_lz4_create: 32: ret = (null) >> > > [ 352.234639][0] zcomp_lz4_create: 38: ret = (null) >> > > [ 352.234667][0] zcomp_lz4_create: 32: ret = (null) >> > > [ 352.235179][0] zcomp_lz4_create: 38: ret = ff80016a4000 > > > > Kyeongdon, do I understand correctly, that for the second test you > removed '__GFP_NOMEMALLOC' from both kmalloc() and __vmalloc()? 
> > iow: > kmalloc(f & ~__GFP_NOMEMALLOC) > vmalloc(f & ~__GFP_NOMEMALLOC) > > test #2 : almost always failing kmalloc() and !NULL __vmalloc() > >> > > log message : >> > > <4>[ 2288.954934][0] KDKIM: zcomp_lz4_create: 24: ret = (null) >> > > <4>[ 2288.954972][0] KDKIM: zcomp_lz4_create: 30: ret = > ff800287e000 >> > > >> > > <4>[ 2289.092411][0] KDKIM: zcomp_lz4_create: 24: ret = (null) >> > > <4>[ 2289.092546][0] KDKIM: zcomp_lz4_create: 30: ret = > ff80028b5000 >> > > >> > > <4>[ 2289.135628][0] KDKIM: zcomp_lz4_create: 24: ret = (null) >> > > <4>[ 2289.135642][0] KDKIM: zcomp_lz4_create: 24: ret = (null) >> > > <4>[ 2289.135729][0] KDKIM: zcomp_lz4_create: 30: ret = > ff80028be000 >> > > <4>[ 2289.135732][0] KDKIM: zcomp_lz4_create: 30: ret = > ff80028c7000 > > > if this is the case (__GFP_NOMEMALLOC removed from both kmalloc and > __vmalloc), > then proposed > > kmalloc(f & ~__GFP_NOMEMALLOC) > __vmalloc(f | __GFP_NOMEMALLOC) > > > can be very close to 'test #1 && test #2': > > kmalloc() fails (as in test #2) > __vmalloc() fails (as in test #1) > > isn't it? > > -ss Let me give you a simple code of it. 
@test #1 (previous shared log) kmalloc(f | __GFP_NOMEMALLOC) __vmalloc(f | __GFP_NOMEMALLOC) // can find failure both @test #2 (previous shared log) kmalloc(f | __GFP_NOMEMALLOC) __vmalloc(f) // removed '__GFP_NOMEMALLOC' from vmalloc() only, and cannot find failure from vmalloc() And like you said, I made a quick check to see a failure about kmalloc() without the flag : @test #3 kmalloc(f) __vmalloc(f | __GFP_NOMEMALLOC) // removed '__GFP_NOMEMALLOC' from kmalloc() only // and cannot find failure from kmalloc(), but in this case, it's hard to find failure from vmalloc() because most memory is already allocated from zsmalloc() log message (test #3) : <4>[ 186.763605][1] KDKIM: zcomp_lz4_create: 24: ret = ffc00203 <4>[ 186.776652][1] KDKIM: zcomp_lz4_create: 24: ret = ffc0020f <4>[ 186.811423][1] KDKIM: zcomp_lz4_create: 24: ret = ffc002108000 <4>[ 186.816744][1] KDKIM: zcomp_lz4_create: 24: ret = ffc00200 <4>[ 186.816796][1] KDKIM: zcomp_lz4_create: 24: ret = ffc002008000 @test #4 kmalloc(f) __vmalloc(f) // cannot find failure both until now log message (test #4) : <4>[ 641.440468][7] KDKIM: zcomp_lz4_create: 24: ret = ffc00219 <4>[ 922.182980][7] KDKIM: zcomp_lz4_create: 24: ret = ffc002208000 <4>[ 923.197593][7] KDKIM: zcomp_lz4_create: 24: ret = ffc00202 <4>[ 939.813499][7] KDKIM: zcomp_lz4_create: 24: ret = ffc0020a So, is there another problem if we remove the flag from both sides? Thanks, Kyeongdon Kim -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: Re: [PATCH v3] zram: try vmalloc() after kmalloc()
On 2015-11-24 오전 8:28, Minchan Kim wrote: > Hello Andrew, > > On Mon, Nov 23, 2015 at 02:52:26PM -0800, Andrew Morton wrote: >> On Mon, 23 Nov 2015 15:21:15 +0900 Kyeongdon Kim > wrote: >> >> > When we're using LZ4 multi compression streams for zram swap, >> > we found out page allocation failure message in system running test. >> > That was not only once, but a few(2 - 5 times per test). >> > Also, some failure cases were continually occurring to try allocation >> > order 3. >> > >> > In order to make parallel compression private data, we should call >> > kzalloc() with order 2/3 in runtime(lzo/lz4). But if there is no order >> > 2/3 size memory to allocate in that time, page allocation fails. >> > This patch makes to use vmalloc() as fallback of kmalloc(), this >> > prevents page alloc failure warning. >> > >> > After using this, we never found warning message in running test, also >> > It could reduce process startup latency about 60-120ms in each case. >> > >> > ... >> > >> > --- a/drivers/block/zram/zcomp_lz4.c >> > +++ b/drivers/block/zram/zcomp_lz4.c >> > @@ -10,17 +10,25 @@ >> > #include >> > #include >> > #include >> > +#include >> > +#include >> > >> > #include "zcomp_lz4.h" >> > >> > static void *zcomp_lz4_create(void) >> > { >> > - return kzalloc(LZ4_MEM_COMPRESS, GFP_KERNEL); >> > + void *ret; >> > + >> > + ret = kzalloc(LZ4_MEM_COMPRESS, >> > + __GFP_NORETRY|__GFP_NOWARN|__GFP_NOMEMALLOC); >> > + if (!ret) >> > + ret = vzalloc(LZ4_MEM_COMPRESS); >> > + return ret; >> > } >> >> What's the reasoning behind the modification to the gfp flags? >> >> It clears __GFP_FS, __GFP_IO and even __GFP_WAIT. I suspect the latter >> two (at least) can be retained. And given that vmalloc() uses > > This function is used in swapout and fs write path so we couldn't use > those flags. > >> GFP_KERNEL, what's the point in clearing those flags for the kmalloc() >> case? 
> > When I reviewed this patch, I wanted to fix it with another patch > because we should handle another places in zcomp and Sergey sent it > today. But I think Sergey's patch is stable material so I hope > Kyeongdon resend this patch with fixing vmalloc part. > >> >> If this change (or something like it) remains in place, it should have >> a comment which fully explains the reasons, please. > > Kyeongdon, Could you resend this patch with fixing vzalloc part and > adding comment? > Sorry for the delay in replying, I just checked your comments and patch set. [PATCH 1/3] zram/zcomp: use GFP_NOIO to allocate stream [PATCH 2/3] zram: try vmalloc() after kmalloc() [RFC 3/3] zram: pass gfp from zcomp frontend to backend First of all, thanks for your summary and organized code set, and If Sergey agrees with that, I have no question about the patch 2/3. >> > > -- > Kind regards, > Minchan Kim -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: Re: [PATCH v3] zram: try vmalloc() after kmalloc()
On 2015-11-24 오전 8:28, Minchan Kim wrote: > Hello Andrew, > > On Mon, Nov 23, 2015 at 02:52:26PM -0800, Andrew Morton wrote: >> On Mon, 23 Nov 2015 15:21:15 +0900 Kyeongdon Kim > <kyeongdon@lge.com> wrote: >> >> > When we're using LZ4 multi compression streams for zram swap, >> > we found out page allocation failure message in system running test. >> > That was not only once, but a few(2 - 5 times per test). >> > Also, some failure cases were continually occurring to try allocation >> > order 3. >> > >> > In order to make parallel compression private data, we should call >> > kzalloc() with order 2/3 in runtime(lzo/lz4). But if there is no order >> > 2/3 size memory to allocate in that time, page allocation fails. >> > This patch makes to use vmalloc() as fallback of kmalloc(), this >> > prevents page alloc failure warning. >> > >> > After using this, we never found warning message in running test, also >> > It could reduce process startup latency about 60-120ms in each case. >> > >> > ... >> > >> > --- a/drivers/block/zram/zcomp_lz4.c >> > +++ b/drivers/block/zram/zcomp_lz4.c >> > @@ -10,17 +10,25 @@ >> > #include >> > #include >> > #include >> > +#include >> > +#include >> > >> > #include "zcomp_lz4.h" >> > >> > static void *zcomp_lz4_create(void) >> > { >> > - return kzalloc(LZ4_MEM_COMPRESS, GFP_KERNEL); >> > + void *ret; >> > + >> > + ret = kzalloc(LZ4_MEM_COMPRESS, >> > + __GFP_NORETRY|__GFP_NOWARN|__GFP_NOMEMALLOC); >> > + if (!ret) >> > + ret = vzalloc(LZ4_MEM_COMPRESS); >> > + return ret; >> > } >> >> What's the reasoning behind the modification to the gfp flags? >> >> It clears __GFP_FS, __GFP_IO and even __GFP_WAIT. I suspect the latter >> two (at least) can be retained. And given that vmalloc() uses > > This function is used in swapout and fs write path so we couldn't use > those flags. > >> GFP_KERNEL, what's the point in clearing those flags for the kmalloc() >> case? 
> > When I reviewed this patch, I wanted to fix it with another patch > because we should handle another places in zcomp and Sergey sent it > today. But I think Sergey's patch is stable material so I hope > Kyeongdon resend this patch with fixing vmalloc part. > >> >> If this change (or something like it) remains in place, it should have >> a comment which fully explains the reasons, please. > > Kyeongdon, Could you resend this patch with fixing vzalloc part and > adding comment? > Sorry for the delay in replying, I just checked your comments and patch set. [PATCH 1/3] zram/zcomp: use GFP_NOIO to allocate stream [PATCH 2/3] zram: try vmalloc() after kmalloc() [RFC 3/3] zram: pass gfp from zcomp frontend to backend First of all, thanks for your summary and organized code set, and If Sergey agrees with that, I have no question about the patch 2/3. >> > > -- > Kind regards, > Minchan Kim -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: Re: [PATCH v3] zram: try vmalloc() after kmalloc()
Hello Andrew, On 2015-11-24 오전 7:52, Andrew Morton wrote: > On Mon, 23 Nov 2015 15:21:15 +0900 Kyeongdon Kim > wrote: > >> When we're using LZ4 multi compression streams for zram swap, >> we found out page allocation failure message in system running test. >> That was not only once, but a few(2 - 5 times per test). >> Also, some failure cases were continually occurring to try allocation >> order 3. >> >> In order to make parallel compression private data, we should call >> kzalloc() with order 2/3 in runtime(lzo/lz4). But if there is no order >> 2/3 size memory to allocate in that time, page allocation fails. >> This patch makes to use vmalloc() as fallback of kmalloc(), this >> prevents page alloc failure warning. >> >> After using this, we never found warning message in running test, also >> It could reduce process startup latency about 60-120ms in each case. >> >> ... >> >> --- a/drivers/block/zram/zcomp_lz4.c >> +++ b/drivers/block/zram/zcomp_lz4.c >> @@ -10,17 +10,25 @@ >> #include >> #include >> #include >> +#include >> +#include >> >> #include "zcomp_lz4.h" >> >> static void *zcomp_lz4_create(void) >> { >> - return kzalloc(LZ4_MEM_COMPRESS, GFP_KERNEL); >> + void *ret; >> + >> + ret = kzalloc(LZ4_MEM_COMPRESS, >> + __GFP_NORETRY|__GFP_NOWARN|__GFP_NOMEMALLOC); >> + if (!ret) >> + ret = vzalloc(LZ4_MEM_COMPRESS); >> + return ret; >> } > > What's the reasoning behind the modification to the gfp flags? > > It clears __GFP_FS, __GFP_IO and even __GFP_WAIT. I suspect the latter > two (at least) can be retained. And given that vmalloc() uses > GFP_KERNEL, what's the point in clearing those flags for the kmalloc() > case? > > If this change (or something like it) remains in place, it should have > a comment which fully explains the reasons, please. Sorry for the delay in replying, I just tried to remove that warning message. If there are more rightable gfp flags(like a code in Minchan's patch), we can use it. 
Thanks, -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: Re: [PATCH v3] zram: try vmalloc() after kmalloc()
Hello Andrew, On 2015-11-24 오전 7:52, Andrew Morton wrote: > On Mon, 23 Nov 2015 15:21:15 +0900 Kyeongdon Kim <kyeongdon@lge.com> > wrote: > >> When we're using LZ4 multi compression streams for zram swap, >> we found out page allocation failure message in system running test. >> That was not only once, but a few(2 - 5 times per test). >> Also, some failure cases were continually occurring to try allocation >> order 3. >> >> In order to make parallel compression private data, we should call >> kzalloc() with order 2/3 in runtime(lzo/lz4). But if there is no order >> 2/3 size memory to allocate in that time, page allocation fails. >> This patch makes to use vmalloc() as fallback of kmalloc(), this >> prevents page alloc failure warning. >> >> After using this, we never found warning message in running test, also >> It could reduce process startup latency about 60-120ms in each case. >> >> ... >> >> --- a/drivers/block/zram/zcomp_lz4.c >> +++ b/drivers/block/zram/zcomp_lz4.c >> @@ -10,17 +10,25 @@ >> #include >> #include >> #include >> +#include >> +#include >> >> #include "zcomp_lz4.h" >> >> static void *zcomp_lz4_create(void) >> { >> - return kzalloc(LZ4_MEM_COMPRESS, GFP_KERNEL); >> + void *ret; >> + >> + ret = kzalloc(LZ4_MEM_COMPRESS, >> + __GFP_NORETRY|__GFP_NOWARN|__GFP_NOMEMALLOC); >> + if (!ret) >> + ret = vzalloc(LZ4_MEM_COMPRESS); >> + return ret; >> } > > What's the reasoning behind the modification to the gfp flags? > > It clears __GFP_FS, __GFP_IO and even __GFP_WAIT. I suspect the latter > two (at least) can be retained. And given that vmalloc() uses > GFP_KERNEL, what's the point in clearing those flags for the kmalloc() > case? > > If this change (or something like it) remains in place, it should have > a comment which fully explains the reasons, please. Sorry for the delay in replying, I just tried to remove that warning message. If there are more rightable gfp flags(like a code in Minchan's patch), we can use it. 
Thanks, -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v3] zram: try vmalloc() after kmalloc()
When we're using LZ4 multi compression streams for zram swap, we found out page allocation failure message in system running test. That was not only once, but a few(2 - 5 times per test). Also, some failure cases were continually occurring to try allocation order 3. In order to make parallel compression private data, we should call kzalloc() with order 2/3 in runtime(lzo/lz4). But if there is no order 2/3 size memory to allocate in that time, page allocation fails. This patch makes to use vmalloc() as fallback of kmalloc(), this prevents page alloc failure warning. After using this, we never found warning message in running test, also It could reduce process startup latency about 60-120ms in each case. For reference a call trace : Binder_1: page allocation failure: order:3, mode:0x10c0d0 CPU: 0 PID: 424 Comm: Binder_1 Tainted: GW 3.10.49-perf-g991d02b-dirty #20 Call trace: [] dump_backtrace+0x0/0x270 [] show_stack+0x10/0x1c [] dump_stack+0x1c/0x28 [] warn_alloc_failed+0xfc/0x11c [] __alloc_pages_nodemask+0x724/0x7f0 [] __get_free_pages+0x14/0x5c [] kmalloc_order_trace+0x38/0xd8 [] zcomp_lz4_create+0x2c/0x38 [] zcomp_strm_alloc+0x34/0x78 [] zcomp_strm_multi_find+0x124/0x1ec [] zcomp_strm_find+0xc/0x18 [] zram_bvec_rw+0x2fc/0x780 [] zram_make_request+0x25c/0x2d4 [] generic_make_request+0x80/0xbc [] submit_bio+0xa4/0x15c [] __swap_writepage+0x218/0x230 [] swap_writepage+0x3c/0x4c [] shrink_page_list+0x51c/0x8d0 [] shrink_inactive_list+0x3f8/0x60c [] shrink_lruvec+0x33c/0x4cc [] shrink_zone+0x3c/0x100 [] try_to_free_pages+0x2b8/0x54c [] __alloc_pages_nodemask+0x514/0x7f0 [] __get_free_pages+0x14/0x5c [] proc_info_read+0x50/0xe4 [] vfs_read+0xa0/0x12c [] SyS_read+0x44/0x74 DMA: 3397*4kB (MC) 26*8kB (RC) 0*16kB 0*32kB 0*64kB 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 13796kB Signed-off-by: Kyeongdon Kim --- drivers/block/zram/zcomp_lz4.c | 12 ++-- drivers/block/zram/zcomp_lzo.c | 12 ++-- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git 
a/drivers/block/zram/zcomp_lz4.c b/drivers/block/zram/zcomp_lz4.c index f2afb7e..0cc4799 100644 --- a/drivers/block/zram/zcomp_lz4.c +++ b/drivers/block/zram/zcomp_lz4.c @@ -10,17 +10,25 @@ #include #include #include +#include +#include #include "zcomp_lz4.h" static void *zcomp_lz4_create(void) { - return kzalloc(LZ4_MEM_COMPRESS, GFP_KERNEL); + void *ret; + + ret = kzalloc(LZ4_MEM_COMPRESS, + __GFP_NORETRY|__GFP_NOWARN|__GFP_NOMEMALLOC); + if (!ret) + ret = vzalloc(LZ4_MEM_COMPRESS); + return ret; } static void zcomp_lz4_destroy(void *private) { - kfree(private); + kvfree(private); } static int zcomp_lz4_compress(const unsigned char *src, unsigned char *dst, diff --git a/drivers/block/zram/zcomp_lzo.c b/drivers/block/zram/zcomp_lzo.c index da1bc47..59b8aa4 100644 --- a/drivers/block/zram/zcomp_lzo.c +++ b/drivers/block/zram/zcomp_lzo.c @@ -10,17 +10,25 @@ #include #include #include +#include +#include #include "zcomp_lzo.h" static void *lzo_create(void) { - return kzalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL); + void *ret; + + ret = kzalloc(LZO1X_MEM_COMPRESS, + __GFP_NORETRY|__GFP_NOWARN|__GFP_NOMEMALLOC); + if (!ret) + ret = vzalloc(LZO1X_MEM_COMPRESS); + return ret; } static void lzo_destroy(void *private) { - kfree(private); + kvfree(private); } static int lzo_compress(const unsigned char *src, unsigned char *dst, -- 1.8.2 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v3] zram: try vmalloc() after kmalloc()
When we're using LZ4 multi compression streams for zram swap, we found out page allocation failure message in system running test. That was not only once, but a few(2 - 5 times per test). Also, some failure cases were continually occurring to try allocation order 3. In order to make parallel compression private data, we should call kzalloc() with order 2/3 in runtime(lzo/lz4). But if there is no order 2/3 size memory to allocate in that time, page allocation fails. This patch makes to use vmalloc() as fallback of kmalloc(), this prevents page alloc failure warning. After using this, we never found warning message in running test, also It could reduce process startup latency about 60-120ms in each case. For reference a call trace : Binder_1: page allocation failure: order:3, mode:0x10c0d0 CPU: 0 PID: 424 Comm: Binder_1 Tainted: GW 3.10.49-perf-g991d02b-dirty #20 Call trace: [] dump_backtrace+0x0/0x270 [] show_stack+0x10/0x1c [] dump_stack+0x1c/0x28 [] warn_alloc_failed+0xfc/0x11c [] __alloc_pages_nodemask+0x724/0x7f0 [] __get_free_pages+0x14/0x5c [] kmalloc_order_trace+0x38/0xd8 [] zcomp_lz4_create+0x2c/0x38 [] zcomp_strm_alloc+0x34/0x78 [] zcomp_strm_multi_find+0x124/0x1ec [] zcomp_strm_find+0xc/0x18 [] zram_bvec_rw+0x2fc/0x780 [] zram_make_request+0x25c/0x2d4 [] generic_make_request+0x80/0xbc [] submit_bio+0xa4/0x15c [] __swap_writepage+0x218/0x230 [] swap_writepage+0x3c/0x4c [] shrink_page_list+0x51c/0x8d0 [] shrink_inactive_list+0x3f8/0x60c [] shrink_lruvec+0x33c/0x4cc [] shrink_zone+0x3c/0x100 [] try_to_free_pages+0x2b8/0x54c [] __alloc_pages_nodemask+0x514/0x7f0 [] __get_free_pages+0x14/0x5c [] proc_info_read+0x50/0xe4 [] vfs_read+0xa0/0x12c [] SyS_read+0x44/0x74 DMA: 3397*4kB (MC) 26*8kB (RC) 0*16kB 0*32kB 0*64kB 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 13796kB Signed-off-by: Kyeongdon Kim <kyeongdon@lge.com> --- drivers/block/zram/zcomp_lz4.c | 12 ++-- drivers/block/zram/zcomp_lzo.c | 12 ++-- 2 files changed, 20 insertions(+), 4 deletions(-) diff 
--git a/drivers/block/zram/zcomp_lz4.c b/drivers/block/zram/zcomp_lz4.c index f2afb7e..0cc4799 100644 --- a/drivers/block/zram/zcomp_lz4.c +++ b/drivers/block/zram/zcomp_lz4.c @@ -10,17 +10,25 @@ #include #include #include +#include +#include #include "zcomp_lz4.h" static void *zcomp_lz4_create(void) { - return kzalloc(LZ4_MEM_COMPRESS, GFP_KERNEL); + void *ret; + + ret = kzalloc(LZ4_MEM_COMPRESS, + __GFP_NORETRY|__GFP_NOWARN|__GFP_NOMEMALLOC); + if (!ret) + ret = vzalloc(LZ4_MEM_COMPRESS); + return ret; } static void zcomp_lz4_destroy(void *private) { - kfree(private); + kvfree(private); } static int zcomp_lz4_compress(const unsigned char *src, unsigned char *dst, diff --git a/drivers/block/zram/zcomp_lzo.c b/drivers/block/zram/zcomp_lzo.c index da1bc47..59b8aa4 100644 --- a/drivers/block/zram/zcomp_lzo.c +++ b/drivers/block/zram/zcomp_lzo.c @@ -10,17 +10,25 @@ #include #include #include +#include +#include #include "zcomp_lzo.h" static void *lzo_create(void) { - return kzalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL); + void *ret; + + ret = kzalloc(LZO1X_MEM_COMPRESS, + __GFP_NORETRY|__GFP_NOWARN|__GFP_NOMEMALLOC); + if (!ret) + ret = vzalloc(LZO1X_MEM_COMPRESS); + return ret; } static void lzo_destroy(void *private) { - kfree(private); + kvfree(private); } static int lzo_compress(const unsigned char *src, unsigned char *dst, -- 1.8.2 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v2] zram: Prevent page allocation failure during zcomp_strm_alloc
When we're using LZ4 multi compression streams for zram swap, we found out page allocation failure message in system running test. That was not only once, but a few(2 - 5 times per test). Also, some failure cases were continually occurring to try allocation order 3. In order to make parallel compression private data, we should call kzalloc() with order 2/3 in runtime(lzo/lz4). But if there is no order 2/3 size memory in that time, page allocation fails. This patch makes to use vmalloc() as fallback of kmalloc(), this prevents page alloc failure warning. After this, we never found warning message in running test, also It could reduce process startup latency about 60-120ms in each case. For reference a call trace : Binder_1: page allocation failure: order:3, mode:0x10c0d0 CPU: 0 PID: 424 Comm: Binder_1 Tainted: GW 3.10.49-perf-g991d02b-dirty #20 Call trace: [] dump_backtrace+0x0/0x270 [] show_stack+0x10/0x1c [] dump_stack+0x1c/0x28 [] warn_alloc_failed+0xfc/0x11c [] __alloc_pages_nodemask+0x724/0x7f0 [] __get_free_pages+0x14/0x5c [] kmalloc_order_trace+0x38/0xd8 [] zcomp_lz4_create+0x2c/0x38 [] zcomp_strm_alloc+0x34/0x78 [] zcomp_strm_multi_find+0x124/0x1ec [] zcomp_strm_find+0xc/0x18 [] zram_bvec_rw+0x2fc/0x780 [] zram_make_request+0x25c/0x2d4 [] generic_make_request+0x80/0xbc [] submit_bio+0xa4/0x15c [] __swap_writepage+0x218/0x230 [] swap_writepage+0x3c/0x4c [] shrink_page_list+0x51c/0x8d0 [] shrink_inactive_list+0x3f8/0x60c [] shrink_lruvec+0x33c/0x4cc [] shrink_zone+0x3c/0x100 [] try_to_free_pages+0x2b8/0x54c [] __alloc_pages_nodemask+0x514/0x7f0 [] __get_free_pages+0x14/0x5c [] proc_info_read+0x50/0xe4 [] vfs_read+0xa0/0x12c [] SyS_read+0x44/0x74 DMA: 3397*4kB (MC) 26*8kB (RC) 0*16kB 0*32kB 0*64kB 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 13796kB Signed-off-by: Kyeongdon Kim --- drivers/block/zram/zcomp_lz4.c | 15 +-- drivers/block/zram/zcomp_lzo.c | 15 +-- 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/drivers/block/zram/zcomp_lz4.c 
b/drivers/block/zram/zcomp_lz4.c index f2afb7e..0477894 100644 --- a/drivers/block/zram/zcomp_lz4.c +++ b/drivers/block/zram/zcomp_lz4.c @@ -10,17 +10,28 @@ #include #include #include +#include +#include #include "zcomp_lz4.h" static void *zcomp_lz4_create(void) { - return kzalloc(LZ4_MEM_COMPRESS, GFP_KERNEL); + void *ret; + + ret = kzalloc(LZ4_MEM_COMPRESS, + __GFP_NORETRY|__GFP_NOWARN|__GFP_NOMEMALLOC); + if (!ret) + ret = vzalloc(LZ4_MEM_COMPRESS); + return ret; } static void zcomp_lz4_destroy(void *private) { - kfree(private); + if (is_vmalloc_addr(private)) + vfree(private); + else + kfree(private); } static int zcomp_lz4_compress(const unsigned char *src, unsigned char *dst, diff --git a/drivers/block/zram/zcomp_lzo.c b/drivers/block/zram/zcomp_lzo.c index da1bc47..613fbac 100644 --- a/drivers/block/zram/zcomp_lzo.c +++ b/drivers/block/zram/zcomp_lzo.c @@ -10,17 +10,28 @@ #include #include #include +#include +#include #include "zcomp_lzo.h" static void *lzo_create(void) { - return kzalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL); + void *ret; + + ret = kzalloc(LZO1X_MEM_COMPRESS, + __GFP_NORETRY|__GFP_NOWARN|__GFP_NOMEMALLOC); + if (!ret) + ret = vzalloc(LZO1X_MEM_COMPRESS); + return ret; } static void lzo_destroy(void *private) { - kfree(private); + if (is_vmalloc_addr(private)) + vfree(private); + else + kfree(private); } static int lzo_compress(const unsigned char *src, unsigned char *dst, -- 1.8.2 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v2] zram: Prevent page allocation failure during zcomp_strm_alloc
When we're using LZ4 multi compression streams for zram swap, we found out page allocation failure message in system running test. That was not only once, but a few(2 - 5 times per test). Also, some failure cases were continually occurring to try allocation order 3. In order to make parallel compression private data, we should call kzalloc() with order 2/3 in runtime(lzo/lz4). But if there is no order 2/3 size memory in that time, page allocation fails. This patch makes to use vmalloc() as fallback of kmalloc(), this prevents page alloc failure warning. After this, we never found warning message in running test, also It could reduce process startup latency about 60-120ms in each case. For reference a call trace : Binder_1: page allocation failure: order:3, mode:0x10c0d0 CPU: 0 PID: 424 Comm: Binder_1 Tainted: GW 3.10.49-perf-g991d02b-dirty #20 Call trace: [] dump_backtrace+0x0/0x270 [] show_stack+0x10/0x1c [] dump_stack+0x1c/0x28 [] warn_alloc_failed+0xfc/0x11c [] __alloc_pages_nodemask+0x724/0x7f0 [] __get_free_pages+0x14/0x5c [] kmalloc_order_trace+0x38/0xd8 [] zcomp_lz4_create+0x2c/0x38 [] zcomp_strm_alloc+0x34/0x78 [] zcomp_strm_multi_find+0x124/0x1ec [] zcomp_strm_find+0xc/0x18 [] zram_bvec_rw+0x2fc/0x780 [] zram_make_request+0x25c/0x2d4 [] generic_make_request+0x80/0xbc [] submit_bio+0xa4/0x15c [] __swap_writepage+0x218/0x230 [] swap_writepage+0x3c/0x4c [] shrink_page_list+0x51c/0x8d0 [] shrink_inactive_list+0x3f8/0x60c [] shrink_lruvec+0x33c/0x4cc [] shrink_zone+0x3c/0x100 [] try_to_free_pages+0x2b8/0x54c [] __alloc_pages_nodemask+0x514/0x7f0 [] __get_free_pages+0x14/0x5c [] proc_info_read+0x50/0xe4 [] vfs_read+0xa0/0x12c [] SyS_read+0x44/0x74 DMA: 3397*4kB (MC) 26*8kB (RC) 0*16kB 0*32kB 0*64kB 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 13796kB Signed-off-by: Kyeongdon Kim <kyeongdon@lge.com> --- drivers/block/zram/zcomp_lz4.c | 15 +-- drivers/block/zram/zcomp_lzo.c | 15 +-- 2 files changed, 26 insertions(+), 4 deletions(-) diff --git 
a/drivers/block/zram/zcomp_lz4.c b/drivers/block/zram/zcomp_lz4.c index f2afb7e..0477894 100644 --- a/drivers/block/zram/zcomp_lz4.c +++ b/drivers/block/zram/zcomp_lz4.c @@ -10,17 +10,28 @@ #include #include #include +#include +#include #include "zcomp_lz4.h" static void *zcomp_lz4_create(void) { - return kzalloc(LZ4_MEM_COMPRESS, GFP_KERNEL); + void *ret; + + ret = kzalloc(LZ4_MEM_COMPRESS, + __GFP_NORETRY|__GFP_NOWARN|__GFP_NOMEMALLOC); + if (!ret) + ret = vzalloc(LZ4_MEM_COMPRESS); + return ret; } static void zcomp_lz4_destroy(void *private) { - kfree(private); + if (is_vmalloc_addr(private)) + vfree(private); + else + kfree(private); } static int zcomp_lz4_compress(const unsigned char *src, unsigned char *dst, diff --git a/drivers/block/zram/zcomp_lzo.c b/drivers/block/zram/zcomp_lzo.c index da1bc47..613fbac 100644 --- a/drivers/block/zram/zcomp_lzo.c +++ b/drivers/block/zram/zcomp_lzo.c @@ -10,17 +10,28 @@ #include #include #include +#include +#include #include "zcomp_lzo.h" static void *lzo_create(void) { - return kzalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL); + void *ret; + + ret = kzalloc(LZO1X_MEM_COMPRESS, + __GFP_NORETRY|__GFP_NOWARN|__GFP_NOMEMALLOC); + if (!ret) + ret = vzalloc(LZO1X_MEM_COMPRESS); + return ret; } static void lzo_destroy(void *private) { - kfree(private); + if (is_vmalloc_addr(private)) + vfree(private); + else + kfree(private); } static int lzo_compress(const unsigned char *src, unsigned char *dst, -- 1.8.2 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/