Re: Re: [PATCH v2] arm64: kasan: add interceptors for strcmp/strncmp functions

2018-09-05 Thread Kyeongdon Kim




On 2018-09-05 오전 1:24, Andrey Ryabinin wrote:



On 09/04/2018 01:10 PM, Andrey Ryabinin wrote:
>
>
> On 09/04/2018 09:59 AM, Kyeongdon Kim wrote:
>
>>>> +#undef strncmp
>>>> +int strncmp(const char *cs, const char *ct, size_t len)
>>>> +{
>>>> + check_memory_region((unsigned long)cs, len, false, _RET_IP_);
>>>> + check_memory_region((unsigned long)ct, len, false, _RET_IP_);
>>>
>>> This will cause false positives. Both 'cs', and 'ct' could be less 
than len bytes.

>>>
>>> There is no need in these interceptors, just use the C 
implementations from lib/string.c

>>> like you did in your first patch.
>>> The only thing that was wrong in the first patch is that assembly 
implementations

>>> were compiled out instead of being declared weak.
>>>
>> Well, at first I thought so..
>> I would remove diff code in /mm/kasan/kasan.c then use C 
implementations in lib/string.c

>> w/ assem implementations as weak :
>>
>> diff --git a/lib/string.c b/lib/string.c
>> index 2c0900a..a18b18f 100644
>> --- a/lib/string.c
>> +++ b/lib/string.c
>> @@ -312,7 +312,7 @@ size_t strlcat(char *dest, const char *src, 
size_t count)

>>  EXPORT_SYMBOL(strlcat);
>>  #endif
>>
>> -#ifndef __HAVE_ARCH_STRCMP
>> +#if (defined(CONFIG_ARM64) && defined(CONFIG_KASAN)) || 
!defined(__HAVE_ARCH_STRCMP)

>
> No. What part of "like you did in your first patch" is unclear to you?

Just to be absolutely clear, I meant #ifdef out __HAVE_ARCH_* defines 
like it has been done in this patch
http://lkml.kernel.org/r/<1534233322-106271-1-git-send-email-kyeongdon@lge.com> 


I understood what you're saying, but I may have been thinking of the wrong patch.

So, thinking about another approach, as below:
we can pick up either the assembly variant or the C one, declaring them as weak.
---
diff --git a/arch/arm64/include/asm/string.h 
b/arch/arm64/include/asm/string.h

index dd95d33..53a2ae0 100644
--- a/arch/arm64/include/asm/string.h
+++ b/arch/arm64/include/asm/string.h
@@ -22,11 +22,22 @@ extern char *strrchr(const char *, int c);
 #define __HAVE_ARCH_STRCHR
 extern char *strchr(const char *, int c);

+#ifdef CONFIG_KASAN
+extern int __strcmp(const char *, const char *);
+extern int __strncmp(const char *, const char *, __kernel_size_t);
+
+#ifndef __SANITIZE_ADDRESS__
+#define strcmp(cs, ct) __strcmp(cs, ct)
+#define strncmp(cs, ct, n) __strncmp(cs, ct, n)
+#endif
+
+#else
 #define __HAVE_ARCH_STRCMP
 extern int strcmp(const char *, const char *);

 #define __HAVE_ARCH_STRNCMP
 extern int strncmp(const char *, const char *, __kernel_size_t);
+#endif

 #define __HAVE_ARCH_STRLEN
 extern __kernel_size_t strlen(const char *);
diff --git a/arch/arm64/kernel/arm64ksyms.c b/arch/arm64/kernel/arm64ksyms.c
index d894a20..9aeffd5 100644
--- a/arch/arm64/kernel/arm64ksyms.c
+++ b/arch/arm64/kernel/arm64ksyms.c
@@ -50,6 +50,10 @@ EXPORT_SYMBOL(strcmp);
 EXPORT_SYMBOL(strncmp);
 EXPORT_SYMBOL(strlen);
 EXPORT_SYMBOL(strnlen);
+#ifdef CONFIG_KASAN
+EXPORT_SYMBOL(__strcmp);
+EXPORT_SYMBOL(__strncmp);
+#endif
 EXPORT_SYMBOL(memset);
 EXPORT_SYMBOL(memcpy);
 EXPORT_SYMBOL(memmove);
diff --git a/arch/arm64/kernel/image.h b/arch/arm64/kernel/image.h
index a820ed0..5ef7a57 100644
--- a/arch/arm64/kernel/image.h
+++ b/arch/arm64/kernel/image.h
@@ -110,6 +110,8 @@ __efistub___flush_dcache_area    = 
KALLSYMS_HIDE(__pi___flush_dcache_area);

 __efistub___memcpy        = KALLSYMS_HIDE(__pi_memcpy);
 __efistub___memmove        = KALLSYMS_HIDE(__pi_memmove);
 __efistub___memset        = KALLSYMS_HIDE(__pi_memset);
+__efistub___strcmp        = KALLSYMS_HIDE(__pi_strcmp);
+__efistub___strncmp        = KALLSYMS_HIDE(__pi_strncmp);
 #endif

 __efistub__text            = KALLSYMS_HIDE(_text);
diff --git a/arch/arm64/lib/strcmp.S b/arch/arm64/lib/strcmp.S
index 471fe61..0dffef7 100644
--- a/arch/arm64/lib/strcmp.S
+++ b/arch/arm64/lib/strcmp.S
@@ -60,6 +60,8 @@ tmp3        .req    x9
 zeroones    .req    x10
 pos        .req    x11

+.weak strcmp
+ENTRY(__strcmp)
 ENTRY(strcmp)
 eor    tmp1, src1, src2
 mov    zeroones, #REP8_01
@@ -232,3 +234,4 @@ CPU_BE(    orr    syndrome, diff, has_nul )
 sub    result, data1, data2, lsr #56
 ret
 ENDPIPROC(strcmp)
+ENDPROC(__strcmp)
diff --git a/arch/arm64/lib/strncmp.S b/arch/arm64/lib/strncmp.S
index e267044..b2648c7 100644
--- a/arch/arm64/lib/strncmp.S
+++ b/arch/arm64/lib/strncmp.S
@@ -64,6 +64,8 @@ limit_wd    .req    x13
 mask        .req    x14
 endloop        .req    x15

+.weak strncmp
+ENTRY(__strncmp)
 ENTRY(strncmp)
 cbz    limit, .Lret0
 eor    tmp1, src1, src2
@@ -308,3 +310,4 @@ CPU_BE( orr    syndrome, diff, has_nul )
 mov    result, #0
 ret
 ENDPIPROC(strncmp)
+ENDPROC(__strncmp)
--
Could you review this diff?


Re: Re: [PATCH v2] arm64: kasan: add interceptors for strcmp/strncmp functions

2018-09-05 Thread Kyeongdon Kim




On 2018-09-05 오전 1:24, Andrey Ryabinin wrote:



On 09/04/2018 01:10 PM, Andrey Ryabinin wrote:
>
>
> On 09/04/2018 09:59 AM, Kyeongdon Kim wrote:
>
>>>> +#undef strncmp
>>>> +int strncmp(const char *cs, const char *ct, size_t len)
>>>> +{
>>>> + check_memory_region((unsigned long)cs, len, false, _RET_IP_);
>>>> + check_memory_region((unsigned long)ct, len, false, _RET_IP_);
>>>
>>> This will cause false positives. Both 'cs', and 'ct' could be less 
than len bytes.

>>>
>>> There is no need in these interceptors, just use the C 
implementations from lib/string.c

>>> like you did in your first patch.
>>> The only thing that was wrong in the first patch is that assembly 
implementations

>>> were compiled out instead of being declared weak.
>>>
>> Well, at first I thought so..
>> I would remove diff code in /mm/kasan/kasan.c then use C 
implementations in lib/string.c

>> w/ assem implementations as weak :
>>
>> diff --git a/lib/string.c b/lib/string.c
>> index 2c0900a..a18b18f 100644
>> --- a/lib/string.c
>> +++ b/lib/string.c
>> @@ -312,7 +312,7 @@ size_t strlcat(char *dest, const char *src, 
size_t count)

>>  EXPORT_SYMBOL(strlcat);
>>  #endif
>>
>> -#ifndef __HAVE_ARCH_STRCMP
>> +#if (defined(CONFIG_ARM64) && defined(CONFIG_KASAN)) || 
!defined(__HAVE_ARCH_STRCMP)

>
> No. What part of "like you did in your first patch" is unclear to you?

Just to be absolutely clear, I meant #ifdef out __HAVE_ARCH_* defines 
like it has been done in this patch
http://lkml.kernel.org/r/<1534233322-106271-1-git-send-email-kyeongdon@lge.com> 


I understood what you're saying, but I may have been thinking of the wrong patch.

So, thinking about another approach, as below:
we can pick up either the assembly variant or the C one, declaring them as weak.
---
diff --git a/arch/arm64/include/asm/string.h 
b/arch/arm64/include/asm/string.h

index dd95d33..53a2ae0 100644
--- a/arch/arm64/include/asm/string.h
+++ b/arch/arm64/include/asm/string.h
@@ -22,11 +22,22 @@ extern char *strrchr(const char *, int c);
 #define __HAVE_ARCH_STRCHR
 extern char *strchr(const char *, int c);

+#ifdef CONFIG_KASAN
+extern int __strcmp(const char *, const char *);
+extern int __strncmp(const char *, const char *, __kernel_size_t);
+
+#ifndef __SANITIZE_ADDRESS__
+#define strcmp(cs, ct) __strcmp(cs, ct)
+#define strncmp(cs, ct, n) __strncmp(cs, ct, n)
+#endif
+
+#else
 #define __HAVE_ARCH_STRCMP
 extern int strcmp(const char *, const char *);

 #define __HAVE_ARCH_STRNCMP
 extern int strncmp(const char *, const char *, __kernel_size_t);
+#endif

 #define __HAVE_ARCH_STRLEN
 extern __kernel_size_t strlen(const char *);
diff --git a/arch/arm64/kernel/arm64ksyms.c b/arch/arm64/kernel/arm64ksyms.c
index d894a20..9aeffd5 100644
--- a/arch/arm64/kernel/arm64ksyms.c
+++ b/arch/arm64/kernel/arm64ksyms.c
@@ -50,6 +50,10 @@ EXPORT_SYMBOL(strcmp);
 EXPORT_SYMBOL(strncmp);
 EXPORT_SYMBOL(strlen);
 EXPORT_SYMBOL(strnlen);
+#ifdef CONFIG_KASAN
+EXPORT_SYMBOL(__strcmp);
+EXPORT_SYMBOL(__strncmp);
+#endif
 EXPORT_SYMBOL(memset);
 EXPORT_SYMBOL(memcpy);
 EXPORT_SYMBOL(memmove);
diff --git a/arch/arm64/kernel/image.h b/arch/arm64/kernel/image.h
index a820ed0..5ef7a57 100644
--- a/arch/arm64/kernel/image.h
+++ b/arch/arm64/kernel/image.h
@@ -110,6 +110,8 @@ __efistub___flush_dcache_area    = 
KALLSYMS_HIDE(__pi___flush_dcache_area);

 __efistub___memcpy        = KALLSYMS_HIDE(__pi_memcpy);
 __efistub___memmove        = KALLSYMS_HIDE(__pi_memmove);
 __efistub___memset        = KALLSYMS_HIDE(__pi_memset);
+__efistub___strcmp        = KALLSYMS_HIDE(__pi_strcmp);
+__efistub___strncmp        = KALLSYMS_HIDE(__pi_strncmp);
 #endif

 __efistub__text            = KALLSYMS_HIDE(_text);
diff --git a/arch/arm64/lib/strcmp.S b/arch/arm64/lib/strcmp.S
index 471fe61..0dffef7 100644
--- a/arch/arm64/lib/strcmp.S
+++ b/arch/arm64/lib/strcmp.S
@@ -60,6 +60,8 @@ tmp3        .req    x9
 zeroones    .req    x10
 pos        .req    x11

+.weak strcmp
+ENTRY(__strcmp)
 ENTRY(strcmp)
 eor    tmp1, src1, src2
 mov    zeroones, #REP8_01
@@ -232,3 +234,4 @@ CPU_BE(    orr    syndrome, diff, has_nul )
 sub    result, data1, data2, lsr #56
 ret
 ENDPIPROC(strcmp)
+ENDPROC(__strcmp)
diff --git a/arch/arm64/lib/strncmp.S b/arch/arm64/lib/strncmp.S
index e267044..b2648c7 100644
--- a/arch/arm64/lib/strncmp.S
+++ b/arch/arm64/lib/strncmp.S
@@ -64,6 +64,8 @@ limit_wd    .req    x13
 mask        .req    x14
 endloop        .req    x15

+.weak strncmp
+ENTRY(__strncmp)
 ENTRY(strncmp)
 cbz    limit, .Lret0
 eor    tmp1, src1, src2
@@ -308,3 +310,4 @@ CPU_BE( orr    syndrome, diff, has_nul )
 mov    result, #0
 ret
 ENDPIPROC(strncmp)
+ENDPROC(__strncmp)
--
Could you review this diff?


Re: Re: [PATCH v2] arm64: kasan: add interceptors for strcmp/strncmp functions

2018-09-04 Thread Kyeongdon Kim

Hello Andrey,

Thanks for your review.

On 2018-09-03 오후 6:40, Andrey Ryabinin wrote:



On 08/23/2018 11:56 AM, Kyeongdon Kim wrote:

> diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
> index c3bd520..61ad7f1 100644
> --- a/mm/kasan/kasan.c
> +++ b/mm/kasan/kasan.c
> @@ -304,6 +304,29 @@ void *memcpy(void *dest, const void *src, 
size_t len)

>
> return __memcpy(dest, src, len);
> }
> +#ifdef CONFIG_ARM64
> +/*
> + * Arch arm64 uses an assembly variant for strcmp/strncmp,
> + * xtensa uses inline asm operations and x86_64 uses the C one,
> + * so for now these interceptors are only for arm64 KASAN.
> + */
> +#undef strcmp
> +int strcmp(const char *cs, const char *ct)
> +{
> + check_memory_region((unsigned long)cs, 1, false, _RET_IP_);
> + check_memory_region((unsigned long)ct, 1, false, _RET_IP_);
> +

Well this is definitely wrong. strcmp() often accesses far more than 
one byte.


> + return __strcmp(cs, ct);
> +}
> +#undef strncmp
> +int strncmp(const char *cs, const char *ct, size_t len)
> +{
> + check_memory_region((unsigned long)cs, len, false, _RET_IP_);
> + check_memory_region((unsigned long)ct, len, false, _RET_IP_);

This will cause false positives. Both 'cs', and 'ct' could be less 
than len bytes.


There is no need in these interceptors, just use the C implementations 
from lib/string.c

like you did in your first patch.
The only thing that was wrong in the first patch is that assembly 
implementations

were compiled out instead of being declared weak.


Well, at first I thought so..
I would remove diff code in /mm/kasan/kasan.c then use C implementations 
in lib/string.c

w/ assem implementations as weak :

diff --git a/lib/string.c b/lib/string.c
index 2c0900a..a18b18f 100644
--- a/lib/string.c
+++ b/lib/string.c
@@ -312,7 +312,7 @@ size_t strlcat(char *dest, const char *src, size_t 
count)

 EXPORT_SYMBOL(strlcat);
 #endif

-#ifndef __HAVE_ARCH_STRCMP
+#if (defined(CONFIG_ARM64) && defined(CONFIG_KASAN)) || 
!defined(__HAVE_ARCH_STRCMP)

 /**
  * strcmp - Compare two strings
  * @cs: One string
@@ -336,7 +336,7 @@ int strcmp(const char *cs, const char *ct)
 EXPORT_SYMBOL(strcmp);
 #endif

-#ifndef __HAVE_ARCH_STRNCMP
+#if (defined(CONFIG_ARM64) && defined(CONFIG_KASAN)) || 
!defined(__HAVE_ARCH_STRNCMP)

 /**
  * strncmp - Compare two length-limited strings

Can I get your opinion wrt this ?

Thanks,



Re: Re: [PATCH v2] arm64: kasan: add interceptors for strcmp/strncmp functions

2018-09-04 Thread Kyeongdon Kim

Hello Andrey,

Thanks for your review.

On 2018-09-03 오후 6:40, Andrey Ryabinin wrote:



On 08/23/2018 11:56 AM, Kyeongdon Kim wrote:

> diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
> index c3bd520..61ad7f1 100644
> --- a/mm/kasan/kasan.c
> +++ b/mm/kasan/kasan.c
> @@ -304,6 +304,29 @@ void *memcpy(void *dest, const void *src, 
size_t len)

>
> return __memcpy(dest, src, len);
> }
> +#ifdef CONFIG_ARM64
> +/*
> + * Arch arm64 uses an assembly variant for strcmp/strncmp,
> + * xtensa uses inline asm operations and x86_64 uses the C one,
> + * so for now these interceptors are only for arm64 KASAN.
> + */
> +#undef strcmp
> +int strcmp(const char *cs, const char *ct)
> +{
> + check_memory_region((unsigned long)cs, 1, false, _RET_IP_);
> + check_memory_region((unsigned long)ct, 1, false, _RET_IP_);
> +

Well this is definitely wrong. strcmp() often accesses far more than 
one byte.


> + return __strcmp(cs, ct);
> +}
> +#undef strncmp
> +int strncmp(const char *cs, const char *ct, size_t len)
> +{
> + check_memory_region((unsigned long)cs, len, false, _RET_IP_);
> + check_memory_region((unsigned long)ct, len, false, _RET_IP_);

This will cause false positives. Both 'cs', and 'ct' could be less 
than len bytes.


There is no need in these interceptors, just use the C implementations 
from lib/string.c

like you did in your first patch.
The only thing that was wrong in the first patch is that assembly 
implementations

were compiled out instead of being declared weak.


Well, at first I thought so..
I would remove diff code in /mm/kasan/kasan.c then use C implementations 
in lib/string.c

w/ assem implementations as weak :

diff --git a/lib/string.c b/lib/string.c
index 2c0900a..a18b18f 100644
--- a/lib/string.c
+++ b/lib/string.c
@@ -312,7 +312,7 @@ size_t strlcat(char *dest, const char *src, size_t 
count)

 EXPORT_SYMBOL(strlcat);
 #endif

-#ifndef __HAVE_ARCH_STRCMP
+#if (defined(CONFIG_ARM64) && defined(CONFIG_KASAN)) || 
!defined(__HAVE_ARCH_STRCMP)

 /**
  * strcmp - Compare two strings
  * @cs: One string
@@ -336,7 +336,7 @@ int strcmp(const char *cs, const char *ct)
 EXPORT_SYMBOL(strcmp);
 #endif

-#ifndef __HAVE_ARCH_STRNCMP
+#if (defined(CONFIG_ARM64) && defined(CONFIG_KASAN)) || 
!defined(__HAVE_ARCH_STRNCMP)

 /**
  * strncmp - Compare two length-limited strings

Can I get your opinion wrt this ?

Thanks,



Re: Re: [PATCH v2] arm64: kasan: add interceptors for strcmp/strncmp functions

2018-09-04 Thread Kyeongdon Kim

Hello Dmitry,


On 2018-09-03 오후 6:13, Dmitry Vyukov wrote:
On Mon, Sep 3, 2018 at 11:02 AM, Kyeongdon Kim  
wrote:

> Dear all,
>
> Could anyone review this and provide me appropriate approach ?
> I think str[n]cmp are frequently used functions so I believe very 
useful w/

> arm64 KASAN.

Hi Kyeongdon,

Please add tests for this to lib/test_kasan.c.


I'll add tests for this patch after next version upload.

Thanks,


>>
>> This patch declares strcmp/strncmp as weak symbols.
>> (2 of them are the most used string operations)
>>
>> Original functions declared as weak and
>> strong ones in mm/kasan/kasan.c could replace them.
>>
>> Assembly optimized strcmp/strncmp functions cannot detect KASan bug.
>> But, now we can detect them like the call trace below.
>>
>> ==
>> BUG: KASAN: use-after-free in platform_match+0x1c/0x5c at addr
>> ffc0ad313500
>> Read of size 1 by task swapper/0/1
>> CPU: 3 PID: 1 Comm: swapper/0 Tainted: G B 4.9.77+ #1
>> Hardware name: Generic (DT) based system
>> Call trace:
>> dump_backtrace+0x0/0x2e0
>> show_stack+0x14/0x1c
>> dump_stack+0x88/0xb0
>> kasan_object_err+0x24/0x7c
>> kasan_report+0x2f0/0x484
>> check_memory_region+0x20/0x14c
>> strcmp+0x1c/0x5c
>> platform_match+0x40/0xe4
>> __driver_attach+0x40/0x130
>> bus_for_each_dev+0xc4/0xe0
>> driver_attach+0x30/0x3c
>> bus_add_driver+0x2dc/0x328
>> driver_register+0x118/0x160
>> __platform_driver_register+0x7c/0x88
>> alarmtimer_init+0x154/0x1e4
>> do_one_initcall+0x184/0x1a4
>> kernel_init_freeable+0x2ec/0x2f0
>> kernel_init+0x18/0x10c
>> ret_from_fork+0x10/0x50
>>
>> In case of xtensa and x86_64 kasan, no need to use this patch now.
>>
>> Signed-off-by: Kyeongdon Kim 
>> ---
>> arch/arm64/include/asm/string.h | 5 +
>> arch/arm64/kernel/arm64ksyms.c | 2 ++
>> arch/arm64/kernel/image.h | 2 ++
>> arch/arm64/lib/strcmp.S | 3 +++
>> arch/arm64/lib/strncmp.S | 3 +++
>> mm/kasan/kasan.c | 23 +++
>> 6 files changed, 38 insertions(+)
>>
>> diff --git a/arch/arm64/include/asm/string.h
>> b/arch/arm64/include/asm/string.h
>> index dd95d33..ab60349 100644
>> --- a/arch/arm64/include/asm/string.h
>> +++ b/arch/arm64/include/asm/string.h
>> @@ -24,9 +24,11 @@ extern char *strchr(const char *, int c);
>>
>> #define __HAVE_ARCH_STRCMP
>> extern int strcmp(const char *, const char *);
>> +extern int __strcmp(const char *, const char *);
>>
>> #define __HAVE_ARCH_STRNCMP
>> extern int strncmp(const char *, const char *, __kernel_size_t);
>> +extern int __strncmp(const char *, const char *, __kernel_size_t);
>>
>> #define __HAVE_ARCH_STRLEN
>> extern __kernel_size_t strlen(const char *);
>> @@ -68,6 +70,9 @@ void memcpy_flushcache(void *dst, const void *src,
>> size_t cnt);
>> #define memmove(dst, src, len) __memmove(dst, src, len)
>> #define memset(s, c, n) __memset(s, c, n)
>>
>> +#define strcmp(cs, ct) __strcmp(cs, ct)
>> +#define strncmp(cs, ct, n) __strncmp(cs, ct, n)
>> +
>> #ifndef __NO_FORTIFY
>> #define __NO_FORTIFY /* FORTIFY_SOURCE uses __builtin_memcpy, etc. */
>> #endif
>> diff --git a/arch/arm64/kernel/arm64ksyms.c
>> b/arch/arm64/kernel/arm64ksyms.c
>> index d894a20..10b1164 100644
>> --- a/arch/arm64/kernel/arm64ksyms.c
>> +++ b/arch/arm64/kernel/arm64ksyms.c
>> @@ -50,6 +50,8 @@ EXPORT_SYMBOL(strcmp);
>> EXPORT_SYMBOL(strncmp);
>> EXPORT_SYMBOL(strlen);
>> EXPORT_SYMBOL(strnlen);
>> +EXPORT_SYMBOL(__strcmp);
>> +EXPORT_SYMBOL(__strncmp);
>> EXPORT_SYMBOL(memset);
>> EXPORT_SYMBOL(memcpy);
>> EXPORT_SYMBOL(memmove);
>> diff --git a/arch/arm64/kernel/image.h b/arch/arm64/kernel/image.h
>> index a820ed0..5ef7a57 100644
>> --- a/arch/arm64/kernel/image.h
>> +++ b/arch/arm64/kernel/image.h
>> @@ -110,6 +110,8 @@ __efistub___flush_dcache_area =
>> KALLSYMS_HIDE(__pi___flush_dcache_area);
>> __efistub___memcpy = KALLSYMS_HIDE(__pi_memcpy);
>> __efistub___memmove = KALLSYMS_HIDE(__pi_memmove);
>> __efistub___memset = KALLSYMS_HIDE(__pi_memset);
>> +__efistub___strcmp = KALLSYMS_HIDE(__pi_strcmp);
>> +__efistub___strncmp = KALLSYMS_HIDE(__pi_strncmp);
>> #endif
>>
>> __efistub__text = KALLSYMS_HIDE(_text);
>> diff --git a/arch/arm64/lib/strcmp.S b/arch/arm64/lib/strcmp.S
>> index 471fe61..0dffef7 100644
>> --- a/arch/arm64/lib/strcmp.S
>> +++ b/arch/arm64/lib/strcmp.S
>> @@ -60,6 +

Re: Re: [PATCH v2] arm64: kasan: add interceptors for strcmp/strncmp functions

2018-09-04 Thread Kyeongdon Kim

Hello Dmitry,


On 2018-09-03 오후 6:13, Dmitry Vyukov wrote:
On Mon, Sep 3, 2018 at 11:02 AM, Kyeongdon Kim  
wrote:

> Dear all,
>
> Could anyone review this and provide me appropriate approach ?
> I think str[n]cmp are frequently used functions so I believe very 
useful w/

> arm64 KASAN.

Hi Kyeongdon,

Please add tests for this to lib/test_kasan.c.


I'll add tests for this patch after next version upload.

Thanks,


>>
>> This patch declares strcmp/strncmp as weak symbols.
>> (2 of them are the most used string operations)
>>
>> Original functions declared as weak and
>> strong ones in mm/kasan/kasan.c could replace them.
>>
>> Assembly optimized strcmp/strncmp functions cannot detect KASan bug.
>> But, now we can detect them like the call trace below.
>>
>> ==
>> BUG: KASAN: use-after-free in platform_match+0x1c/0x5c at addr
>> ffc0ad313500
>> Read of size 1 by task swapper/0/1
>> CPU: 3 PID: 1 Comm: swapper/0 Tainted: G B 4.9.77+ #1
>> Hardware name: Generic (DT) based system
>> Call trace:
>> dump_backtrace+0x0/0x2e0
>> show_stack+0x14/0x1c
>> dump_stack+0x88/0xb0
>> kasan_object_err+0x24/0x7c
>> kasan_report+0x2f0/0x484
>> check_memory_region+0x20/0x14c
>> strcmp+0x1c/0x5c
>> platform_match+0x40/0xe4
>> __driver_attach+0x40/0x130
>> bus_for_each_dev+0xc4/0xe0
>> driver_attach+0x30/0x3c
>> bus_add_driver+0x2dc/0x328
>> driver_register+0x118/0x160
>> __platform_driver_register+0x7c/0x88
>> alarmtimer_init+0x154/0x1e4
>> do_one_initcall+0x184/0x1a4
>> kernel_init_freeable+0x2ec/0x2f0
>> kernel_init+0x18/0x10c
>> ret_from_fork+0x10/0x50
>>
>> In case of xtensa and x86_64 kasan, no need to use this patch now.
>>
>> Signed-off-by: Kyeongdon Kim 
>> ---
>> arch/arm64/include/asm/string.h | 5 +
>> arch/arm64/kernel/arm64ksyms.c | 2 ++
>> arch/arm64/kernel/image.h | 2 ++
>> arch/arm64/lib/strcmp.S | 3 +++
>> arch/arm64/lib/strncmp.S | 3 +++
>> mm/kasan/kasan.c | 23 +++
>> 6 files changed, 38 insertions(+)
>>
>> diff --git a/arch/arm64/include/asm/string.h
>> b/arch/arm64/include/asm/string.h
>> index dd95d33..ab60349 100644
>> --- a/arch/arm64/include/asm/string.h
>> +++ b/arch/arm64/include/asm/string.h
>> @@ -24,9 +24,11 @@ extern char *strchr(const char *, int c);
>>
>> #define __HAVE_ARCH_STRCMP
>> extern int strcmp(const char *, const char *);
>> +extern int __strcmp(const char *, const char *);
>>
>> #define __HAVE_ARCH_STRNCMP
>> extern int strncmp(const char *, const char *, __kernel_size_t);
>> +extern int __strncmp(const char *, const char *, __kernel_size_t);
>>
>> #define __HAVE_ARCH_STRLEN
>> extern __kernel_size_t strlen(const char *);
>> @@ -68,6 +70,9 @@ void memcpy_flushcache(void *dst, const void *src,
>> size_t cnt);
>> #define memmove(dst, src, len) __memmove(dst, src, len)
>> #define memset(s, c, n) __memset(s, c, n)
>>
>> +#define strcmp(cs, ct) __strcmp(cs, ct)
>> +#define strncmp(cs, ct, n) __strncmp(cs, ct, n)
>> +
>> #ifndef __NO_FORTIFY
>> #define __NO_FORTIFY /* FORTIFY_SOURCE uses __builtin_memcpy, etc. */
>> #endif
>> diff --git a/arch/arm64/kernel/arm64ksyms.c
>> b/arch/arm64/kernel/arm64ksyms.c
>> index d894a20..10b1164 100644
>> --- a/arch/arm64/kernel/arm64ksyms.c
>> +++ b/arch/arm64/kernel/arm64ksyms.c
>> @@ -50,6 +50,8 @@ EXPORT_SYMBOL(strcmp);
>> EXPORT_SYMBOL(strncmp);
>> EXPORT_SYMBOL(strlen);
>> EXPORT_SYMBOL(strnlen);
>> +EXPORT_SYMBOL(__strcmp);
>> +EXPORT_SYMBOL(__strncmp);
>> EXPORT_SYMBOL(memset);
>> EXPORT_SYMBOL(memcpy);
>> EXPORT_SYMBOL(memmove);
>> diff --git a/arch/arm64/kernel/image.h b/arch/arm64/kernel/image.h
>> index a820ed0..5ef7a57 100644
>> --- a/arch/arm64/kernel/image.h
>> +++ b/arch/arm64/kernel/image.h
>> @@ -110,6 +110,8 @@ __efistub___flush_dcache_area =
>> KALLSYMS_HIDE(__pi___flush_dcache_area);
>> __efistub___memcpy = KALLSYMS_HIDE(__pi_memcpy);
>> __efistub___memmove = KALLSYMS_HIDE(__pi_memmove);
>> __efistub___memset = KALLSYMS_HIDE(__pi_memset);
>> +__efistub___strcmp = KALLSYMS_HIDE(__pi_strcmp);
>> +__efistub___strncmp = KALLSYMS_HIDE(__pi_strncmp);
>> #endif
>>
>> __efistub__text = KALLSYMS_HIDE(_text);
>> diff --git a/arch/arm64/lib/strcmp.S b/arch/arm64/lib/strcmp.S
>> index 471fe61..0dffef7 100644
>> --- a/arch/arm64/lib/strcmp.S
>> +++ b/arch/arm64/lib/strcmp.S
>> @@ -60,6 +

Re: [PATCH v2] arm64: kasan: add interceptors for strcmp/strncmp functions

2018-09-03 Thread Kyeongdon Kim

Dear all,

Could anyone review this and provide me appropriate approach ?
I think str[n]cmp are frequently used functions so I believe very useful 
w/ arm64 KASAN.


Best Regards,
Kyeongdon Kim


On 2018-08-23 오후 5:56, Kyeongdon Kim wrote:

This patch declares strcmp/strncmp as weak symbols.
(2 of them are the most used string operations)

Original functions declared as weak and
strong ones in mm/kasan/kasan.c could replace them.

Assembly optimized strcmp/strncmp functions cannot detect KASan bug.
But, now we can detect them like the call trace below.

==
BUG: KASAN: use-after-free in platform_match+0x1c/0x5c at addr 
ffc0ad313500

Read of size 1 by task swapper/0/1
CPU: 3 PID: 1 Comm: swapper/0 Tainted: G B 4.9.77+ #1
Hardware name: Generic (DT) based system
Call trace:
dump_backtrace+0x0/0x2e0
show_stack+0x14/0x1c
dump_stack+0x88/0xb0
kasan_object_err+0x24/0x7c
kasan_report+0x2f0/0x484
check_memory_region+0x20/0x14c
strcmp+0x1c/0x5c
platform_match+0x40/0xe4
__driver_attach+0x40/0x130
bus_for_each_dev+0xc4/0xe0
driver_attach+0x30/0x3c
bus_add_driver+0x2dc/0x328
driver_register+0x118/0x160
__platform_driver_register+0x7c/0x88
alarmtimer_init+0x154/0x1e4
do_one_initcall+0x184/0x1a4
kernel_init_freeable+0x2ec/0x2f0
kernel_init+0x18/0x10c
ret_from_fork+0x10/0x50

In case of xtensa and x86_64 kasan, no need to use this patch now.

Signed-off-by: Kyeongdon Kim 
---
arch/arm64/include/asm/string.h | 5 +
arch/arm64/kernel/arm64ksyms.c | 2 ++
arch/arm64/kernel/image.h | 2 ++
arch/arm64/lib/strcmp.S | 3 +++
arch/arm64/lib/strncmp.S | 3 +++
mm/kasan/kasan.c | 23 +++
6 files changed, 38 insertions(+)

diff --git a/arch/arm64/include/asm/string.h 
b/arch/arm64/include/asm/string.h

index dd95d33..ab60349 100644
--- a/arch/arm64/include/asm/string.h
+++ b/arch/arm64/include/asm/string.h
@@ -24,9 +24,11 @@ extern char *strchr(const char *, int c);

#define __HAVE_ARCH_STRCMP
extern int strcmp(const char *, const char *);
+extern int __strcmp(const char *, const char *);

#define __HAVE_ARCH_STRNCMP
extern int strncmp(const char *, const char *, __kernel_size_t);
+extern int __strncmp(const char *, const char *, __kernel_size_t);

#define __HAVE_ARCH_STRLEN
extern __kernel_size_t strlen(const char *);
@@ -68,6 +70,9 @@ void memcpy_flushcache(void *dst, const void *src, 
size_t cnt);

#define memmove(dst, src, len) __memmove(dst, src, len)
#define memset(s, c, n) __memset(s, c, n)

+#define strcmp(cs, ct) __strcmp(cs, ct)
+#define strncmp(cs, ct, n) __strncmp(cs, ct, n)
+
#ifndef __NO_FORTIFY
#define __NO_FORTIFY /* FORTIFY_SOURCE uses __builtin_memcpy, etc. */
#endif
diff --git a/arch/arm64/kernel/arm64ksyms.c 
b/arch/arm64/kernel/arm64ksyms.c

index d894a20..10b1164 100644
--- a/arch/arm64/kernel/arm64ksyms.c
+++ b/arch/arm64/kernel/arm64ksyms.c
@@ -50,6 +50,8 @@ EXPORT_SYMBOL(strcmp);
EXPORT_SYMBOL(strncmp);
EXPORT_SYMBOL(strlen);
EXPORT_SYMBOL(strnlen);
+EXPORT_SYMBOL(__strcmp);
+EXPORT_SYMBOL(__strncmp);
EXPORT_SYMBOL(memset);
EXPORT_SYMBOL(memcpy);
EXPORT_SYMBOL(memmove);
diff --git a/arch/arm64/kernel/image.h b/arch/arm64/kernel/image.h
index a820ed0..5ef7a57 100644
--- a/arch/arm64/kernel/image.h
+++ b/arch/arm64/kernel/image.h
@@ -110,6 +110,8 @@ __efistub___flush_dcache_area = 
KALLSYMS_HIDE(__pi___flush_dcache_area);

__efistub___memcpy = KALLSYMS_HIDE(__pi_memcpy);
__efistub___memmove = KALLSYMS_HIDE(__pi_memmove);
__efistub___memset = KALLSYMS_HIDE(__pi_memset);
+__efistub___strcmp = KALLSYMS_HIDE(__pi_strcmp);
+__efistub___strncmp = KALLSYMS_HIDE(__pi_strncmp);
#endif

__efistub__text = KALLSYMS_HIDE(_text);
diff --git a/arch/arm64/lib/strcmp.S b/arch/arm64/lib/strcmp.S
index 471fe61..0dffef7 100644
--- a/arch/arm64/lib/strcmp.S
+++ b/arch/arm64/lib/strcmp.S
@@ -60,6 +60,8 @@ tmp3 .req x9
zeroones .req x10
pos .req x11

+.weak strcmp
+ENTRY(__strcmp)
ENTRY(strcmp)
eor tmp1, src1, src2
mov zeroones, #REP8_01
@@ -232,3 +234,4 @@ CPU_BE( orr syndrome, diff, has_nul )
sub result, data1, data2, lsr #56
ret
ENDPIPROC(strcmp)
+ENDPROC(__strcmp)
diff --git a/arch/arm64/lib/strncmp.S b/arch/arm64/lib/strncmp.S
index e267044..b2648c7 100644
--- a/arch/arm64/lib/strncmp.S
+++ b/arch/arm64/lib/strncmp.S
@@ -64,6 +64,8 @@ limit_wd .req x13
mask .req x14
endloop .req x15

+.weak strncmp
+ENTRY(__strncmp)
ENTRY(strncmp)
cbz limit, .Lret0
eor tmp1, src1, src2
@@ -308,3 +310,4 @@ CPU_BE( orr syndrome, diff, has_nul )
mov result, #0
ret
ENDPIPROC(strncmp)
+ENDPROC(__strncmp)
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index c3bd520..61ad7f1 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -304,6 +304,29 @@ void *memcpy(void *dest, const void *src, size_t 
len)


return __memcpy(dest, src, len);
}
+#ifdef CONFIG_ARM64
+/*
+ * Arch arm64 uses an assembly variant for strcmp/strncmp,
+ * xtensa uses inline asm operations and x86_64 uses the C one,
+ * so for now these interceptors are only for arm64 KASAN.
+ */
+#undef strcmp
+int strcmp(const

Re: [PATCH v2] arm64: kasan: add interceptors for strcmp/strncmp functions

2018-09-03 Thread Kyeongdon Kim

Dear all,

Could anyone review this and provide me appropriate approach ?
I think str[n]cmp are frequently used functions so I believe very useful 
w/ arm64 KASAN.


Best Regards,
Kyeongdon Kim


On 2018-08-23 오후 5:56, Kyeongdon Kim wrote:

This patch declares strcmp/strncmp as weak symbols.
(2 of them are the most used string operations)

Original functions declared as weak and
strong ones in mm/kasan/kasan.c could replace them.

Assembly optimized strcmp/strncmp functions cannot detect KASan bug.
But, now we can detect them like the call trace below.

==
BUG: KASAN: use-after-free in platform_match+0x1c/0x5c at addr 
ffc0ad313500

Read of size 1 by task swapper/0/1
CPU: 3 PID: 1 Comm: swapper/0 Tainted: G B 4.9.77+ #1
Hardware name: Generic (DT) based system
Call trace:
dump_backtrace+0x0/0x2e0
show_stack+0x14/0x1c
dump_stack+0x88/0xb0
kasan_object_err+0x24/0x7c
kasan_report+0x2f0/0x484
check_memory_region+0x20/0x14c
strcmp+0x1c/0x5c
platform_match+0x40/0xe4
__driver_attach+0x40/0x130
bus_for_each_dev+0xc4/0xe0
driver_attach+0x30/0x3c
bus_add_driver+0x2dc/0x328
driver_register+0x118/0x160
__platform_driver_register+0x7c/0x88
alarmtimer_init+0x154/0x1e4
do_one_initcall+0x184/0x1a4
kernel_init_freeable+0x2ec/0x2f0
kernel_init+0x18/0x10c
ret_from_fork+0x10/0x50

In case of xtensa and x86_64 kasan, no need to use this patch now.

Signed-off-by: Kyeongdon Kim 
---
arch/arm64/include/asm/string.h | 5 +
arch/arm64/kernel/arm64ksyms.c | 2 ++
arch/arm64/kernel/image.h | 2 ++
arch/arm64/lib/strcmp.S | 3 +++
arch/arm64/lib/strncmp.S | 3 +++
mm/kasan/kasan.c | 23 +++
6 files changed, 38 insertions(+)

diff --git a/arch/arm64/include/asm/string.h 
b/arch/arm64/include/asm/string.h

index dd95d33..ab60349 100644
--- a/arch/arm64/include/asm/string.h
+++ b/arch/arm64/include/asm/string.h
@@ -24,9 +24,11 @@ extern char *strchr(const char *, int c);

#define __HAVE_ARCH_STRCMP
extern int strcmp(const char *, const char *);
+extern int __strcmp(const char *, const char *);

#define __HAVE_ARCH_STRNCMP
extern int strncmp(const char *, const char *, __kernel_size_t);
+extern int __strncmp(const char *, const char *, __kernel_size_t);

#define __HAVE_ARCH_STRLEN
extern __kernel_size_t strlen(const char *);
@@ -68,6 +70,9 @@ void memcpy_flushcache(void *dst, const void *src, 
size_t cnt);

#define memmove(dst, src, len) __memmove(dst, src, len)
#define memset(s, c, n) __memset(s, c, n)

+#define strcmp(cs, ct) __strcmp(cs, ct)
+#define strncmp(cs, ct, n) __strncmp(cs, ct, n)
+
#ifndef __NO_FORTIFY
#define __NO_FORTIFY /* FORTIFY_SOURCE uses __builtin_memcpy, etc. */
#endif
diff --git a/arch/arm64/kernel/arm64ksyms.c 
b/arch/arm64/kernel/arm64ksyms.c

index d894a20..10b1164 100644
--- a/arch/arm64/kernel/arm64ksyms.c
+++ b/arch/arm64/kernel/arm64ksyms.c
@@ -50,6 +50,8 @@ EXPORT_SYMBOL(strcmp);
EXPORT_SYMBOL(strncmp);
EXPORT_SYMBOL(strlen);
EXPORT_SYMBOL(strnlen);
+EXPORT_SYMBOL(__strcmp);
+EXPORT_SYMBOL(__strncmp);
EXPORT_SYMBOL(memset);
EXPORT_SYMBOL(memcpy);
EXPORT_SYMBOL(memmove);
diff --git a/arch/arm64/kernel/image.h b/arch/arm64/kernel/image.h
index a820ed0..5ef7a57 100644
--- a/arch/arm64/kernel/image.h
+++ b/arch/arm64/kernel/image.h
@@ -110,6 +110,8 @@ __efistub___flush_dcache_area = 
KALLSYMS_HIDE(__pi___flush_dcache_area);

__efistub___memcpy = KALLSYMS_HIDE(__pi_memcpy);
__efistub___memmove = KALLSYMS_HIDE(__pi_memmove);
__efistub___memset = KALLSYMS_HIDE(__pi_memset);
+__efistub___strcmp = KALLSYMS_HIDE(__pi_strcmp);
+__efistub___strncmp = KALLSYMS_HIDE(__pi_strncmp);
#endif

__efistub__text = KALLSYMS_HIDE(_text);
diff --git a/arch/arm64/lib/strcmp.S b/arch/arm64/lib/strcmp.S
index 471fe61..0dffef7 100644
--- a/arch/arm64/lib/strcmp.S
+++ b/arch/arm64/lib/strcmp.S
@@ -60,6 +60,8 @@ tmp3 .req x9
zeroones .req x10
pos .req x11

+.weak strcmp
+ENTRY(__strcmp)
ENTRY(strcmp)
eor tmp1, src1, src2
mov zeroones, #REP8_01
@@ -232,3 +234,4 @@ CPU_BE( orr syndrome, diff, has_nul )
sub result, data1, data2, lsr #56
ret
ENDPIPROC(strcmp)
+ENDPROC(__strcmp)
diff --git a/arch/arm64/lib/strncmp.S b/arch/arm64/lib/strncmp.S
index e267044..b2648c7 100644
--- a/arch/arm64/lib/strncmp.S
+++ b/arch/arm64/lib/strncmp.S
@@ -64,6 +64,8 @@ limit_wd .req x13
mask .req x14
endloop .req x15

+.weak strncmp
+ENTRY(__strncmp)
ENTRY(strncmp)
cbz limit, .Lret0
eor tmp1, src1, src2
@@ -308,3 +310,4 @@ CPU_BE( orr syndrome, diff, has_nul )
mov result, #0
ret
ENDPIPROC(strncmp)
+ENDPROC(__strncmp)
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index c3bd520..61ad7f1 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -304,6 +304,29 @@ void *memcpy(void *dest, const void *src, size_t 
len)


return __memcpy(dest, src, len);
}
+#ifdef CONFIG_ARM64
+/*
+ * Arch arm64 use assembly variant for strcmp/strncmp,
+ * xtensa use inline asm operations and x86_64 use c one,
+ * so now this interceptors only for arm64 kasan.
+ */
+#undef strcmp
+int strcmp(const

[PATCH v2] arm64: kasan: add interceptors for strcmp/strncmp functions

2018-08-23 Thread Kyeongdon Kim
This patch declares strcmp/strncmp as weak symbols.
(2 of them are the most used string operations)

Original functions declared as weak and
strong ones in mm/kasan/kasan.c could replace them.

Assembly optimized strcmp/strncmp functions cannot detect KASan bug.
But, now we can detect them like the call trace below.

==
BUG: KASAN: use-after-free in platform_match+0x1c/0x5c at addr ffc0ad313500
Read of size 1 by task swapper/0/1
CPU: 3 PID: 1 Comm: swapper/0 Tainted: GB   4.9.77+ #1
Hardware name: Generic (DT) based system
Call trace:
 dump_backtrace+0x0/0x2e0
 show_stack+0x14/0x1c
 dump_stack+0x88/0xb0
 kasan_object_err+0x24/0x7c
 kasan_report+0x2f0/0x484
 check_memory_region+0x20/0x14c
 strcmp+0x1c/0x5c
 platform_match+0x40/0xe4
 __driver_attach+0x40/0x130
 bus_for_each_dev+0xc4/0xe0
 driver_attach+0x30/0x3c
 bus_add_driver+0x2dc/0x328
 driver_register+0x118/0x160
 __platform_driver_register+0x7c/0x88
 alarmtimer_init+0x154/0x1e4
 do_one_initcall+0x184/0x1a4
 kernel_init_freeable+0x2ec/0x2f0
 kernel_init+0x18/0x10c
 ret_from_fork+0x10/0x50

In case of xtensa and x86_64 kasan, no need to use this patch now.

Signed-off-by: Kyeongdon Kim 
---
 arch/arm64/include/asm/string.h |  5 +
 arch/arm64/kernel/arm64ksyms.c  |  2 ++
 arch/arm64/kernel/image.h   |  2 ++
 arch/arm64/lib/strcmp.S |  3 +++
 arch/arm64/lib/strncmp.S|  3 +++
 mm/kasan/kasan.c| 23 +++
 6 files changed, 38 insertions(+)

diff --git a/arch/arm64/include/asm/string.h b/arch/arm64/include/asm/string.h
index dd95d33..ab60349 100644
--- a/arch/arm64/include/asm/string.h
+++ b/arch/arm64/include/asm/string.h
@@ -24,9 +24,11 @@ extern char *strchr(const char *, int c);
 
 #define __HAVE_ARCH_STRCMP
 extern int strcmp(const char *, const char *);
+extern int __strcmp(const char *, const char *);
 
 #define __HAVE_ARCH_STRNCMP
 extern int strncmp(const char *, const char *, __kernel_size_t);
+extern int __strncmp(const char *, const char *, __kernel_size_t);
 
 #define __HAVE_ARCH_STRLEN
 extern __kernel_size_t strlen(const char *);
@@ -68,6 +70,9 @@ void memcpy_flushcache(void *dst, const void *src, size_t 
cnt);
 #define memmove(dst, src, len) __memmove(dst, src, len)
 #define memset(s, c, n) __memset(s, c, n)
 
+#define strcmp(cs, ct) __strcmp(cs, ct)
+#define strncmp(cs, ct, n) __strncmp(cs, ct, n)
+
 #ifndef __NO_FORTIFY
 #define __NO_FORTIFY /* FORTIFY_SOURCE uses __builtin_memcpy, etc. */
 #endif
diff --git a/arch/arm64/kernel/arm64ksyms.c b/arch/arm64/kernel/arm64ksyms.c
index d894a20..10b1164 100644
--- a/arch/arm64/kernel/arm64ksyms.c
+++ b/arch/arm64/kernel/arm64ksyms.c
@@ -50,6 +50,8 @@ EXPORT_SYMBOL(strcmp);
 EXPORT_SYMBOL(strncmp);
 EXPORT_SYMBOL(strlen);
 EXPORT_SYMBOL(strnlen);
+EXPORT_SYMBOL(__strcmp);
+EXPORT_SYMBOL(__strncmp);
 EXPORT_SYMBOL(memset);
 EXPORT_SYMBOL(memcpy);
 EXPORT_SYMBOL(memmove);
diff --git a/arch/arm64/kernel/image.h b/arch/arm64/kernel/image.h
index a820ed0..5ef7a57 100644
--- a/arch/arm64/kernel/image.h
+++ b/arch/arm64/kernel/image.h
@@ -110,6 +110,8 @@ __efistub___flush_dcache_area   = 
KALLSYMS_HIDE(__pi___flush_dcache_area);
 __efistub___memcpy = KALLSYMS_HIDE(__pi_memcpy);
 __efistub___memmove= KALLSYMS_HIDE(__pi_memmove);
 __efistub___memset = KALLSYMS_HIDE(__pi_memset);
+__efistub___strcmp = KALLSYMS_HIDE(__pi_strcmp);
+__efistub___strncmp= KALLSYMS_HIDE(__pi_strncmp);
 #endif
 
 __efistub__text= KALLSYMS_HIDE(_text);
diff --git a/arch/arm64/lib/strcmp.S b/arch/arm64/lib/strcmp.S
index 471fe61..0dffef7 100644
--- a/arch/arm64/lib/strcmp.S
+++ b/arch/arm64/lib/strcmp.S
@@ -60,6 +60,8 @@ tmp3  .reqx9
 zeroones   .reqx10
 pos.reqx11
 
+.weak strcmp
+ENTRY(__strcmp)
 ENTRY(strcmp)
eor tmp1, src1, src2
mov zeroones, #REP8_01
@@ -232,3 +234,4 @@ CPU_BE( orr syndrome, diff, has_nul )
sub result, data1, data2, lsr #56
ret
 ENDPIPROC(strcmp)
+ENDPROC(__strcmp)
diff --git a/arch/arm64/lib/strncmp.S b/arch/arm64/lib/strncmp.S
index e267044..b2648c7 100644
--- a/arch/arm64/lib/strncmp.S
+++ b/arch/arm64/lib/strncmp.S
@@ -64,6 +64,8 @@ limit_wd  .reqx13
 mask   .reqx14
 endloop.reqx15
 
+.weak strncmp
+ENTRY(__strncmp)
 ENTRY(strncmp)
cbz limit, .Lret0
eor tmp1, src1, src2
@@ -308,3 +310,4 @@ CPU_BE( orr syndrome, diff, has_nul )
mov result, #0
ret
 ENDPIPROC(strncmp)
+ENDPROC(__strncmp)
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index c3bd520..61ad7f1 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -304,6 +304,29 @@ void *memcpy(void *dest, const void *src, size_t len)
 
return __memcpy(dest, src, len);
 }
+#ifdef CONFIG_ARM64
+/*
+ * Arch arm64 use assembly variant for strcmp/strncmp,
+ * xtensa use

[PATCH v2] arm64: kasan: add interceptors for strcmp/strncmp functions

2018-08-23 Thread Kyeongdon Kim
This patch declares strcmp/strncmp as weak symbols.
(2 of them are the most used string operations)

Original functions declared as weak and
strong ones in mm/kasan/kasan.c could replace them.

Assembly optimized strcmp/strncmp functions cannot detect KASan bug.
But, now we can detect them like the call trace below.

==
BUG: KASAN: use-after-free in platform_match+0x1c/0x5c at addr ffc0ad313500
Read of size 1 by task swapper/0/1
CPU: 3 PID: 1 Comm: swapper/0 Tainted: GB   4.9.77+ #1
Hardware name: Generic (DT) based system
Call trace:
 dump_backtrace+0x0/0x2e0
 show_stack+0x14/0x1c
 dump_stack+0x88/0xb0
 kasan_object_err+0x24/0x7c
 kasan_report+0x2f0/0x484
 check_memory_region+0x20/0x14c
 strcmp+0x1c/0x5c
 platform_match+0x40/0xe4
 __driver_attach+0x40/0x130
 bus_for_each_dev+0xc4/0xe0
 driver_attach+0x30/0x3c
 bus_add_driver+0x2dc/0x328
 driver_register+0x118/0x160
 __platform_driver_register+0x7c/0x88
 alarmtimer_init+0x154/0x1e4
 do_one_initcall+0x184/0x1a4
 kernel_init_freeable+0x2ec/0x2f0
 kernel_init+0x18/0x10c
 ret_from_fork+0x10/0x50

In case of xtensa and x86_64 kasan, no need to use this patch now.

Signed-off-by: Kyeongdon Kim 
---
 arch/arm64/include/asm/string.h |  5 +
 arch/arm64/kernel/arm64ksyms.c  |  2 ++
 arch/arm64/kernel/image.h   |  2 ++
 arch/arm64/lib/strcmp.S |  3 +++
 arch/arm64/lib/strncmp.S|  3 +++
 mm/kasan/kasan.c| 23 +++
 6 files changed, 38 insertions(+)

diff --git a/arch/arm64/include/asm/string.h b/arch/arm64/include/asm/string.h
index dd95d33..ab60349 100644
--- a/arch/arm64/include/asm/string.h
+++ b/arch/arm64/include/asm/string.h
@@ -24,9 +24,11 @@ extern char *strchr(const char *, int c);
 
 #define __HAVE_ARCH_STRCMP
 extern int strcmp(const char *, const char *);
+extern int __strcmp(const char *, const char *);
 
 #define __HAVE_ARCH_STRNCMP
 extern int strncmp(const char *, const char *, __kernel_size_t);
+extern int __strncmp(const char *, const char *, __kernel_size_t);
 
 #define __HAVE_ARCH_STRLEN
 extern __kernel_size_t strlen(const char *);
@@ -68,6 +70,9 @@ void memcpy_flushcache(void *dst, const void *src, size_t 
cnt);
 #define memmove(dst, src, len) __memmove(dst, src, len)
 #define memset(s, c, n) __memset(s, c, n)
 
+#define strcmp(cs, ct) __strcmp(cs, ct)
+#define strncmp(cs, ct, n) __strncmp(cs, ct, n)
+
 #ifndef __NO_FORTIFY
 #define __NO_FORTIFY /* FORTIFY_SOURCE uses __builtin_memcpy, etc. */
 #endif
diff --git a/arch/arm64/kernel/arm64ksyms.c b/arch/arm64/kernel/arm64ksyms.c
index d894a20..10b1164 100644
--- a/arch/arm64/kernel/arm64ksyms.c
+++ b/arch/arm64/kernel/arm64ksyms.c
@@ -50,6 +50,8 @@ EXPORT_SYMBOL(strcmp);
 EXPORT_SYMBOL(strncmp);
 EXPORT_SYMBOL(strlen);
 EXPORT_SYMBOL(strnlen);
+EXPORT_SYMBOL(__strcmp);
+EXPORT_SYMBOL(__strncmp);
 EXPORT_SYMBOL(memset);
 EXPORT_SYMBOL(memcpy);
 EXPORT_SYMBOL(memmove);
diff --git a/arch/arm64/kernel/image.h b/arch/arm64/kernel/image.h
index a820ed0..5ef7a57 100644
--- a/arch/arm64/kernel/image.h
+++ b/arch/arm64/kernel/image.h
@@ -110,6 +110,8 @@ __efistub___flush_dcache_area   = 
KALLSYMS_HIDE(__pi___flush_dcache_area);
 __efistub___memcpy = KALLSYMS_HIDE(__pi_memcpy);
 __efistub___memmove= KALLSYMS_HIDE(__pi_memmove);
 __efistub___memset = KALLSYMS_HIDE(__pi_memset);
+__efistub___strcmp = KALLSYMS_HIDE(__pi_strcmp);
+__efistub___strncmp= KALLSYMS_HIDE(__pi_strncmp);
 #endif
 
 __efistub__text= KALLSYMS_HIDE(_text);
diff --git a/arch/arm64/lib/strcmp.S b/arch/arm64/lib/strcmp.S
index 471fe61..0dffef7 100644
--- a/arch/arm64/lib/strcmp.S
+++ b/arch/arm64/lib/strcmp.S
@@ -60,6 +60,8 @@ tmp3  .reqx9
 zeroones   .reqx10
 pos.reqx11
 
+.weak strcmp
+ENTRY(__strcmp)
 ENTRY(strcmp)
eor tmp1, src1, src2
mov zeroones, #REP8_01
@@ -232,3 +234,4 @@ CPU_BE( orr syndrome, diff, has_nul )
sub result, data1, data2, lsr #56
ret
 ENDPIPROC(strcmp)
+ENDPROC(__strcmp)
diff --git a/arch/arm64/lib/strncmp.S b/arch/arm64/lib/strncmp.S
index e267044..b2648c7 100644
--- a/arch/arm64/lib/strncmp.S
+++ b/arch/arm64/lib/strncmp.S
@@ -64,6 +64,8 @@ limit_wd  .reqx13
 mask   .reqx14
 endloop.reqx15
 
+.weak strncmp
+ENTRY(__strncmp)
 ENTRY(strncmp)
cbz limit, .Lret0
eor tmp1, src1, src2
@@ -308,3 +310,4 @@ CPU_BE( orr syndrome, diff, has_nul )
mov result, #0
ret
 ENDPIPROC(strncmp)
+ENDPROC(__strncmp)
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index c3bd520..61ad7f1 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -304,6 +304,29 @@ void *memcpy(void *dest, const void *src, size_t len)
 
return __memcpy(dest, src, len);
 }
+#ifdef CONFIG_ARM64
+/*
+ * Arch arm64 use assembly variant for strcmp/strncmp,
+ * xtensa use

[PATCH] arm64: kasan: add interceptors for strcmp/strncmp functions

2018-08-17 Thread Kyeongdon Kim
This patch declares strcmp/strncmp as weak symbols.
(2 of them are the most used string operations)

Original functions declared as weak symbols and
strong ones in mm/kasan/kasan.c could replace them.

Assembly optimized strcmp/strncmp functions cannot detect KASan bug.
But, now we can detect them.

==
BUG: KASAN: use-after-free in platform_match+0x1c/0x5c at addr ffc0ad313500
Read of size 1 by task swapper/0/1
CPU: 3 PID: 1 Comm: swapper/0 Tainted: GB   4.9.77+ #1
Hardware name: Generic (DT) based system
Call trace:
 dump_backtrace+0x0/0x2e0
 show_stack+0x14/0x1c
 dump_stack+0x88/0xb0
 kasan_object_err+0x24/0x7c
 kasan_report+0x2f0/0x484
 check_memory_region+0x20/0x14c
 strcmp+0x1c/0x5c
 platform_match+0x40/0xe4
 __driver_attach+0x40/0x130
 bus_for_each_dev+0xc4/0xe0
 driver_attach+0x30/0x3c
 bus_add_driver+0x2dc/0x328
 driver_register+0x118/0x160
 __platform_driver_register+0x7c/0x88
 alarmtimer_init+0x154/0x1e4
 do_one_initcall+0x184/0x1a4
 kernel_init_freeable+0x2ec/0x2f0
 kernel_init+0x18/0x10c
 ret_from_fork+0x10/0x50

Signed-off-by: Kyeongdon Kim 
---
 arch/arm64/include/asm/string.h |  4 
 arch/arm64/kernel/arm64ksyms.c  |  2 ++
 arch/arm64/kernel/image.h   |  2 ++
 arch/arm64/lib/strcmp.S |  3 +++
 arch/arm64/lib/strncmp.S|  3 +++
 mm/kasan/kasan.c| 15 +++
 6 files changed, 29 insertions(+)

diff --git a/arch/arm64/include/asm/string.h b/arch/arm64/include/asm/string.h
index dd95d33..d546605 100644
--- a/arch/arm64/include/asm/string.h
+++ b/arch/arm64/include/asm/string.h
@@ -24,9 +24,11 @@ extern char *strchr(const char *, int c);
 
 #define __HAVE_ARCH_STRCMP
 extern int strcmp(const char *, const char *);
+extern int __strcmp(const char *, const char *);
 
 #define __HAVE_ARCH_STRNCMP
 extern int strncmp(const char *, const char *, __kernel_size_t);
+extern int __strncmp(const char *, const char *, __kernel_size_t);
 
 #define __HAVE_ARCH_STRLEN
 extern __kernel_size_t strlen(const char *);
@@ -67,6 +69,8 @@ void memcpy_flushcache(void *dst, const void *src, size_t 
cnt);
 #define memcpy(dst, src, len) __memcpy(dst, src, len)
 #define memmove(dst, src, len) __memmove(dst, src, len)
 #define memset(s, c, n) __memset(s, c, n)
+#define strcmp(cs, ct) __strcmp(cs, ct)
+#define strncmp(cs, ct, n) __strncmp(cs, ct, n)
 
 #ifndef __NO_FORTIFY
 #define __NO_FORTIFY /* FORTIFY_SOURCE uses __builtin_memcpy, etc. */
diff --git a/arch/arm64/kernel/arm64ksyms.c b/arch/arm64/kernel/arm64ksyms.c
index d894a20..10b1164 100644
--- a/arch/arm64/kernel/arm64ksyms.c
+++ b/arch/arm64/kernel/arm64ksyms.c
@@ -50,6 +50,8 @@ EXPORT_SYMBOL(strcmp);
 EXPORT_SYMBOL(strncmp);
 EXPORT_SYMBOL(strlen);
 EXPORT_SYMBOL(strnlen);
+EXPORT_SYMBOL(__strcmp);
+EXPORT_SYMBOL(__strncmp);
 EXPORT_SYMBOL(memset);
 EXPORT_SYMBOL(memcpy);
 EXPORT_SYMBOL(memmove);
diff --git a/arch/arm64/kernel/image.h b/arch/arm64/kernel/image.h
index a820ed0..5ef7a57 100644
--- a/arch/arm64/kernel/image.h
+++ b/arch/arm64/kernel/image.h
@@ -110,6 +110,8 @@ __efistub___flush_dcache_area   = 
KALLSYMS_HIDE(__pi___flush_dcache_area);
 __efistub___memcpy = KALLSYMS_HIDE(__pi_memcpy);
 __efistub___memmove= KALLSYMS_HIDE(__pi_memmove);
 __efistub___memset = KALLSYMS_HIDE(__pi_memset);
+__efistub___strcmp = KALLSYMS_HIDE(__pi_strcmp);
+__efistub___strncmp= KALLSYMS_HIDE(__pi_strncmp);
 #endif
 
 __efistub__text= KALLSYMS_HIDE(_text);
diff --git a/arch/arm64/lib/strcmp.S b/arch/arm64/lib/strcmp.S
index 471fe61..0dffef7 100644
--- a/arch/arm64/lib/strcmp.S
+++ b/arch/arm64/lib/strcmp.S
@@ -60,6 +60,8 @@ tmp3  .reqx9
 zeroones   .reqx10
 pos.reqx11
 
+.weak strcmp
+ENTRY(__strcmp)
 ENTRY(strcmp)
eor tmp1, src1, src2
mov zeroones, #REP8_01
@@ -232,3 +234,4 @@ CPU_BE( orr syndrome, diff, has_nul )
sub result, data1, data2, lsr #56
ret
 ENDPIPROC(strcmp)
+ENDPROC(__strcmp)
diff --git a/arch/arm64/lib/strncmp.S b/arch/arm64/lib/strncmp.S
index e267044..b2648c7 100644
--- a/arch/arm64/lib/strncmp.S
+++ b/arch/arm64/lib/strncmp.S
@@ -64,6 +64,8 @@ limit_wd  .reqx13
 mask   .reqx14
 endloop.reqx15
 
+.weak strncmp
+ENTRY(__strncmp)
 ENTRY(strncmp)
cbz limit, .Lret0
eor tmp1, src1, src2
@@ -308,3 +310,4 @@ CPU_BE( orr syndrome, diff, has_nul )
mov result, #0
ret
 ENDPIPROC(strncmp)
+ENDPROC(__strncmp)
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index c3bd520..eae43e6 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -304,7 +304,22 @@ void *memcpy(void *dest, const void *src, size_t len)
 
return __memcpy(dest, src, len);
 }
+#undef strcmp
+int strcmp(const char *cs, const char *ct)
+{
+   check_memory_region((unsigned long)cs, 1, false, _RET_IP_

[PATCH] arm64: kasan: add interceptors for strcmp/strncmp functions

2018-08-17 Thread Kyeongdon Kim
This patch declares strcmp/strncmp as weak symbols.
(2 of them are the most used string operations)

Original functions declared as weak symbols and
strong ones in mm/kasan/kasan.c could replace them.

Assembly optimized strcmp/strncmp functions cannot detect KASan bug.
But, now we can detect them.

==
BUG: KASAN: use-after-free in platform_match+0x1c/0x5c at addr ffc0ad313500
Read of size 1 by task swapper/0/1
CPU: 3 PID: 1 Comm: swapper/0 Tainted: GB   4.9.77+ #1
Hardware name: Generic (DT) based system
Call trace:
 dump_backtrace+0x0/0x2e0
 show_stack+0x14/0x1c
 dump_stack+0x88/0xb0
 kasan_object_err+0x24/0x7c
 kasan_report+0x2f0/0x484
 check_memory_region+0x20/0x14c
 strcmp+0x1c/0x5c
 platform_match+0x40/0xe4
 __driver_attach+0x40/0x130
 bus_for_each_dev+0xc4/0xe0
 driver_attach+0x30/0x3c
 bus_add_driver+0x2dc/0x328
 driver_register+0x118/0x160
 __platform_driver_register+0x7c/0x88
 alarmtimer_init+0x154/0x1e4
 do_one_initcall+0x184/0x1a4
 kernel_init_freeable+0x2ec/0x2f0
 kernel_init+0x18/0x10c
 ret_from_fork+0x10/0x50

Signed-off-by: Kyeongdon Kim 
---
 arch/arm64/include/asm/string.h |  4 
 arch/arm64/kernel/arm64ksyms.c  |  2 ++
 arch/arm64/kernel/image.h   |  2 ++
 arch/arm64/lib/strcmp.S |  3 +++
 arch/arm64/lib/strncmp.S|  3 +++
 mm/kasan/kasan.c| 15 +++
 6 files changed, 29 insertions(+)

diff --git a/arch/arm64/include/asm/string.h b/arch/arm64/include/asm/string.h
index dd95d33..d546605 100644
--- a/arch/arm64/include/asm/string.h
+++ b/arch/arm64/include/asm/string.h
@@ -24,9 +24,11 @@ extern char *strchr(const char *, int c);
 
 #define __HAVE_ARCH_STRCMP
 extern int strcmp(const char *, const char *);
+extern int __strcmp(const char *, const char *);
 
 #define __HAVE_ARCH_STRNCMP
 extern int strncmp(const char *, const char *, __kernel_size_t);
+extern int __strncmp(const char *, const char *, __kernel_size_t);
 
 #define __HAVE_ARCH_STRLEN
 extern __kernel_size_t strlen(const char *);
@@ -67,6 +69,8 @@ void memcpy_flushcache(void *dst, const void *src, size_t 
cnt);
 #define memcpy(dst, src, len) __memcpy(dst, src, len)
 #define memmove(dst, src, len) __memmove(dst, src, len)
 #define memset(s, c, n) __memset(s, c, n)
+#define strcmp(cs, ct) __strcmp(cs, ct)
+#define strncmp(cs, ct, n) __strncmp(cs, ct, n)
 
 #ifndef __NO_FORTIFY
 #define __NO_FORTIFY /* FORTIFY_SOURCE uses __builtin_memcpy, etc. */
diff --git a/arch/arm64/kernel/arm64ksyms.c b/arch/arm64/kernel/arm64ksyms.c
index d894a20..10b1164 100644
--- a/arch/arm64/kernel/arm64ksyms.c
+++ b/arch/arm64/kernel/arm64ksyms.c
@@ -50,6 +50,8 @@ EXPORT_SYMBOL(strcmp);
 EXPORT_SYMBOL(strncmp);
 EXPORT_SYMBOL(strlen);
 EXPORT_SYMBOL(strnlen);
+EXPORT_SYMBOL(__strcmp);
+EXPORT_SYMBOL(__strncmp);
 EXPORT_SYMBOL(memset);
 EXPORT_SYMBOL(memcpy);
 EXPORT_SYMBOL(memmove);
diff --git a/arch/arm64/kernel/image.h b/arch/arm64/kernel/image.h
index a820ed0..5ef7a57 100644
--- a/arch/arm64/kernel/image.h
+++ b/arch/arm64/kernel/image.h
@@ -110,6 +110,8 @@ __efistub___flush_dcache_area   = 
KALLSYMS_HIDE(__pi___flush_dcache_area);
 __efistub___memcpy = KALLSYMS_HIDE(__pi_memcpy);
 __efistub___memmove= KALLSYMS_HIDE(__pi_memmove);
 __efistub___memset = KALLSYMS_HIDE(__pi_memset);
+__efistub___strcmp = KALLSYMS_HIDE(__pi_strcmp);
+__efistub___strncmp= KALLSYMS_HIDE(__pi_strncmp);
 #endif
 
 __efistub__text= KALLSYMS_HIDE(_text);
diff --git a/arch/arm64/lib/strcmp.S b/arch/arm64/lib/strcmp.S
index 471fe61..0dffef7 100644
--- a/arch/arm64/lib/strcmp.S
+++ b/arch/arm64/lib/strcmp.S
@@ -60,6 +60,8 @@ tmp3  .reqx9
 zeroones   .reqx10
 pos.reqx11
 
+.weak strcmp
+ENTRY(__strcmp)
 ENTRY(strcmp)
eor tmp1, src1, src2
mov zeroones, #REP8_01
@@ -232,3 +234,4 @@ CPU_BE( orr syndrome, diff, has_nul )
sub result, data1, data2, lsr #56
ret
 ENDPIPROC(strcmp)
+ENDPROC(__strcmp)
diff --git a/arch/arm64/lib/strncmp.S b/arch/arm64/lib/strncmp.S
index e267044..b2648c7 100644
--- a/arch/arm64/lib/strncmp.S
+++ b/arch/arm64/lib/strncmp.S
@@ -64,6 +64,8 @@ limit_wd  .reqx13
 mask   .reqx14
 endloop.reqx15
 
+.weak strncmp
+ENTRY(__strncmp)
 ENTRY(strncmp)
cbz limit, .Lret0
eor tmp1, src1, src2
@@ -308,3 +310,4 @@ CPU_BE( orr syndrome, diff, has_nul )
mov result, #0
ret
 ENDPIPROC(strncmp)
+ENDPROC(__strncmp)
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index c3bd520..eae43e6 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -304,7 +304,22 @@ void *memcpy(void *dest, const void *src, size_t len)
 
return __memcpy(dest, src, len);
 }
+#undef strcmp
+int strcmp(const char *cs, const char *ct)
+{
+   check_memory_region((unsigned long)cs, 1, false, _RET_IP_

[PATCH] arm64: lib: use c string functions for KASAN support

2018-08-14 Thread Kyeongdon Kim
Assembly optimized string functions cannot detect KASan bug.
This might have been the intention of the original author.
(not too much important to catch)

But, I found the obvious uaf problem in strcmp() function.
 - in this case, using 32bit KASan patchset helps

Since I used c string function, I believe I could find this bug.
After using the patch, can see the report & backtrace the below:

==
BUG: KASAN: use-after-free in strcmp+0x1c/0x5c at addr ffc0ad313500
Read of size 1 by task swapper/0/1
CPU: 3 PID: 1 Comm: swapper/0 Tainted: GB   4.9.77+ #1
Hardware name: Generic (DT) based system
Call trace:
[] dump_backtrace+0x0/0x2e0
[] show_stack+0x14/0x1c
[] dump_stack+0x88/0xb0
[] kasan_object_err+0x24/0x7c
[] kasan_report+0x2f0/0x484
[] __asan_load1+0x24/0x50
[] strcmp+0x1c/0x5c
[] platform_match+0x40/0xe4
[] __driver_attach+0x40/0x130
[] bus_for_each_dev+0xc4/0xe0
[] driver_attach+0x30/0x3c
[] bus_add_driver+0x2dc/0x328
[] driver_register+0x118/0x160
[] __platform_driver_register+0x7c/0x88
[] alarmtimer_init+0x154/0x1e4
[] do_one_initcall+0x184/0x1a4
[] kernel_init_freeable+0x2ec/0x2f0
[] kernel_init+0x18/0x10c
[] ret_from_fork+0x10/0x50
Object at ffc0ad313500, in cache kmalloc-64 size: 64
Allocated:
PID = 1
 save_stack_trace_tsk+0x0/0x194
 save_stack_trace+0x18/0x20
 kasan_kmalloc+0xa8/0x154
 kasan_slab_alloc+0x14/0x1c
 __kmalloc_track_caller+0x178/0x2a0
 kvasprintf+0x80/0x104
 kvasprintf_const+0xcc/0xd0
 kobject_set_name_vargs+0x54/0xd4
 dev_set_name+0x64/0x84
 of_device_make_bus_id+0xc4/0x140
 of_device_alloc+0x1e0/0x200
 of_platform_device_create_pdata+0x70/0xf4
 of_platform_bus_create+0x448/0x508
 of_platform_populate+0xf4/0x104
 of_platform_default_populate+0x20/0x28
 of_platform_default_populate_init+0x68/0x78
Freed:
PID = 1
 save_stack_trace_tsk+0x0/0x194
 save_stack_trace+0x18/0x20
 kasan_slab_free+0xa0/0x14c
 kfree+0x174/0x288
 kfree_const+0x2c/0x38
 kobject_rename+0x12c/0x160
 device_rename+0xa8/0x110
 mt_usb_probe+0x218/0x760
 platform_drv_probe+0x74/0xd0
 driver_probe_device+0x3d4/0x614
 __driver_attach+0xc8/0x130
 bus_for_each_dev+0xc4/0xe0
 driver_attach+0x30/0x3c
 bus_add_driver+0x2dc/0x328
 driver_register+0x118/0x160
 __platform_driver_register+0x7c/0x88
Memory state around the buggy address:
 ffc0ad313300: fb fb fb fb fc fc fc fc fc fc fc fc fc fc fc fc
 ffc0ad313400: 00 00 00 00 fc fc fc fc fc fc fc fc fc fc fc fc
>ffc0ad313500: fb fb fb fb fc fc fc fc fc fc fc fc fc fc fc fc
   ^
 ffc0ad313600: 00 00 00 00 fc fc fc fc fc fc fc fc fc fc fc fc
 ffc0ad313700: 00 00 00 00 fc fc fc fc fc fc fc fc fc fc fc fc
==

Signed-off-by: Kyeongdon Kim 
---
 arch/arm64/include/asm/string.h | 2 ++
 arch/arm64/kernel/arm64ksyms.c  | 2 ++
 arch/arm64/lib/Makefile | 8 +---
 3 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/arch/arm64/include/asm/string.h b/arch/arm64/include/asm/string.h
index dd95d33..5c5219a 100644
--- a/arch/arm64/include/asm/string.h
+++ b/arch/arm64/include/asm/string.h
@@ -16,6 +16,7 @@
 #ifndef __ASM_STRING_H
 #define __ASM_STRING_H
 
+#if !defined(CONFIG_KASAN)
 #define __HAVE_ARCH_STRRCHR
 extern char *strrchr(const char *, int c);
 
@@ -33,6 +34,7 @@ extern __kernel_size_t strlen(const char *);
 
 #define __HAVE_ARCH_STRNLEN
 extern __kernel_size_t strnlen(const char *, __kernel_size_t);
+#endif
 
 #define __HAVE_ARCH_MEMCPY
 extern void *memcpy(void *, const void *, __kernel_size_t);
diff --git a/arch/arm64/kernel/arm64ksyms.c b/arch/arm64/kernel/arm64ksyms.c
index d894a20..eb9bf20 100644
--- a/arch/arm64/kernel/arm64ksyms.c
+++ b/arch/arm64/kernel/arm64ksyms.c
@@ -44,12 +44,14 @@ EXPORT_SYMBOL(__arch_copy_in_user);
 EXPORT_SYMBOL(memstart_addr);
 
/* string / mem functions */
+#if !defined(CONFIG_KASAN)
 EXPORT_SYMBOL(strchr);
 EXPORT_SYMBOL(strrchr);
 EXPORT_SYMBOL(strcmp);
 EXPORT_SYMBOL(strncmp);
 EXPORT_SYMBOL(strlen);
 EXPORT_SYMBOL(strnlen);
+#endif
 EXPORT_SYMBOL(memset);
 EXPORT_SYMBOL(memcpy);
 EXPORT_SYMBOL(memmove);
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
index 68755fd..aa2d457 100644
--- a/arch/arm64/lib/Makefile
+++ b/arch/arm64/lib/Makefile
@@ -2,9 +2,11 @@
 lib-y  := clear_user.o delay.o copy_from_user.o\
   copy_to_user.o copy_in_user.o copy_page.o\
   clear_page.o memchr.o memcpy.o memmove.o memset.o\
-  memcmp.o strcmp.o strncmp.o strlen.o strnlen.o   \
-  strchr.o strrchr.o tishift.o
-
+  memcmp.o tishift.o
+ifndef CONFIG_KASAN
+lib-y  := strcmp.o strncmp.o strlen.o strnlen.o\
+  strchr.o strrchr.o
+endif
 # Tell the compiler to treat all general purpose registers (with the
 # exception of the IP registers, which are already handled by the caller
 # in case o

[PATCH] arm64: lib: use c string functions for KASAN support

2018-08-14 Thread Kyeongdon Kim
Assembly optimized string functions cannot detect KASan bug.
This might have been the intention of the original author.
(not too much important to catch)

But, I found the obvious uaf problem in strcmp() function.
 - in this case, using 32bit KASan patchset helps

Since I used c string function, I believe I could find this bug.
After using the patch, can see the report & backtrace the below:

==
BUG: KASAN: use-after-free in strcmp+0x1c/0x5c at addr ffc0ad313500
Read of size 1 by task swapper/0/1
CPU: 3 PID: 1 Comm: swapper/0 Tainted: GB   4.9.77+ #1
Hardware name: Generic (DT) based system
Call trace:
[] dump_backtrace+0x0/0x2e0
[] show_stack+0x14/0x1c
[] dump_stack+0x88/0xb0
[] kasan_object_err+0x24/0x7c
[] kasan_report+0x2f0/0x484
[] __asan_load1+0x24/0x50
[] strcmp+0x1c/0x5c
[] platform_match+0x40/0xe4
[] __driver_attach+0x40/0x130
[] bus_for_each_dev+0xc4/0xe0
[] driver_attach+0x30/0x3c
[] bus_add_driver+0x2dc/0x328
[] driver_register+0x118/0x160
[] __platform_driver_register+0x7c/0x88
[] alarmtimer_init+0x154/0x1e4
[] do_one_initcall+0x184/0x1a4
[] kernel_init_freeable+0x2ec/0x2f0
[] kernel_init+0x18/0x10c
[] ret_from_fork+0x10/0x50
Object at ffc0ad313500, in cache kmalloc-64 size: 64
Allocated:
PID = 1
 save_stack_trace_tsk+0x0/0x194
 save_stack_trace+0x18/0x20
 kasan_kmalloc+0xa8/0x154
 kasan_slab_alloc+0x14/0x1c
 __kmalloc_track_caller+0x178/0x2a0
 kvasprintf+0x80/0x104
 kvasprintf_const+0xcc/0xd0
 kobject_set_name_vargs+0x54/0xd4
 dev_set_name+0x64/0x84
 of_device_make_bus_id+0xc4/0x140
 of_device_alloc+0x1e0/0x200
 of_platform_device_create_pdata+0x70/0xf4
 of_platform_bus_create+0x448/0x508
 of_platform_populate+0xf4/0x104
 of_platform_default_populate+0x20/0x28
 of_platform_default_populate_init+0x68/0x78
Freed:
PID = 1
 save_stack_trace_tsk+0x0/0x194
 save_stack_trace+0x18/0x20
 kasan_slab_free+0xa0/0x14c
 kfree+0x174/0x288
 kfree_const+0x2c/0x38
 kobject_rename+0x12c/0x160
 device_rename+0xa8/0x110
 mt_usb_probe+0x218/0x760
 platform_drv_probe+0x74/0xd0
 driver_probe_device+0x3d4/0x614
 __driver_attach+0xc8/0x130
 bus_for_each_dev+0xc4/0xe0
 driver_attach+0x30/0x3c
 bus_add_driver+0x2dc/0x328
 driver_register+0x118/0x160
 __platform_driver_register+0x7c/0x88
Memory state around the buggy address:
 ffc0ad313300: fb fb fb fb fc fc fc fc fc fc fc fc fc fc fc fc
 ffc0ad313400: 00 00 00 00 fc fc fc fc fc fc fc fc fc fc fc fc
>ffc0ad313500: fb fb fb fb fc fc fc fc fc fc fc fc fc fc fc fc
   ^
 ffc0ad313600: 00 00 00 00 fc fc fc fc fc fc fc fc fc fc fc fc
 ffc0ad313700: 00 00 00 00 fc fc fc fc fc fc fc fc fc fc fc fc
==

Signed-off-by: Kyeongdon Kim 
---
 arch/arm64/include/asm/string.h | 2 ++
 arch/arm64/kernel/arm64ksyms.c  | 2 ++
 arch/arm64/lib/Makefile | 8 +---
 3 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/arch/arm64/include/asm/string.h b/arch/arm64/include/asm/string.h
index dd95d33..5c5219a 100644
--- a/arch/arm64/include/asm/string.h
+++ b/arch/arm64/include/asm/string.h
@@ -16,6 +16,7 @@
 #ifndef __ASM_STRING_H
 #define __ASM_STRING_H
 
+#if !defined(CONFIG_KASAN)
 #define __HAVE_ARCH_STRRCHR
 extern char *strrchr(const char *, int c);
 
@@ -33,6 +34,7 @@ extern __kernel_size_t strlen(const char *);
 
 #define __HAVE_ARCH_STRNLEN
 extern __kernel_size_t strnlen(const char *, __kernel_size_t);
+#endif
 
 #define __HAVE_ARCH_MEMCPY
 extern void *memcpy(void *, const void *, __kernel_size_t);
diff --git a/arch/arm64/kernel/arm64ksyms.c b/arch/arm64/kernel/arm64ksyms.c
index d894a20..eb9bf20 100644
--- a/arch/arm64/kernel/arm64ksyms.c
+++ b/arch/arm64/kernel/arm64ksyms.c
@@ -44,12 +44,14 @@ EXPORT_SYMBOL(__arch_copy_in_user);
 EXPORT_SYMBOL(memstart_addr);
 
/* string / mem functions */
+#if !defined(CONFIG_KASAN)
 EXPORT_SYMBOL(strchr);
 EXPORT_SYMBOL(strrchr);
 EXPORT_SYMBOL(strcmp);
 EXPORT_SYMBOL(strncmp);
 EXPORT_SYMBOL(strlen);
 EXPORT_SYMBOL(strnlen);
+#endif
 EXPORT_SYMBOL(memset);
 EXPORT_SYMBOL(memcpy);
 EXPORT_SYMBOL(memmove);
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
index 68755fd..aa2d457 100644
--- a/arch/arm64/lib/Makefile
+++ b/arch/arm64/lib/Makefile
@@ -2,9 +2,11 @@
 lib-y  := clear_user.o delay.o copy_from_user.o\
   copy_to_user.o copy_in_user.o copy_page.o\
   clear_page.o memchr.o memcpy.o memmove.o memset.o\
-  memcmp.o strcmp.o strncmp.o strlen.o strnlen.o   \
-  strchr.o strrchr.o tishift.o
-
+  memcmp.o tishift.o
+ifndef CONFIG_KASAN
+lib-y  := strcmp.o strncmp.o strlen.o strnlen.o\
+  strchr.o strrchr.o
+endif
 # Tell the compiler to treat all general purpose registers (with the
 # exception of the IP registers, which are already handled by the caller
 # in case o

[PATCH v2] ksm : use checksum and memcmp for rb_tree

2017-11-14 Thread Kyeongdon Kim
The current ksm is using memcmp to insert and search 'rb_tree'.
It does cause very expensive computation cost.
In order to reduce the time of this operation,
we have added a checksum to traverse.

Nearly all 'rb_node' in stable_tree_insert() function
can be inserted as a checksum, most of it is possible
in unstable_tree_search_insert() function.
In stable_tree_search() function, the checksum may be an additional.
But, checksum check duration is extremely small.
Considering the time of the whole cmp_and_merge_page() function,
it requires very little cost on average.

Using this patch, we compared the time of ksm_do_scan() function
by adding kernel trace at the start-end position of operation.
(ARM 32bit target android device,
over 1000 sample time gap stamps average)

On original KSM scan avg duration = 0.0166893 sec
14991.975619 : ksm_do_scan_start: START: ksm_do_scan
14991.990975 : ksm_do_scan_end: END: ksm_do_scan
14992.008989 : ksm_do_scan_start: START: ksm_do_scan
14992.016839 : ksm_do_scan_end: END: ksm_do_scan
...

On patch KSM scan avg duration = 0.0041157 sec
41081.46131 : ksm_do_scan_start : START: ksm_do_scan
41081.46636 : ksm_do_scan_end : END: ksm_do_scan
41081.48476 : ksm_do_scan_start : START: ksm_do_scan
41081.48795 : ksm_do_scan_end : END: ksm_do_scan
...

We have tested randomly so many times for the stability
and couldn't see any abnormal issue until now.
Also, we found out this patch can make some good advantage
for the power consumption than KSM default enable.

v1 -> v2
- add comment for oldchecksum value
- move the oldchecksum value out of union
- remove check code regarding checksum 0 in stable_tree_search()

link to v1 : https://lkml.org/lkml/2017/10/30/251

Signed-off-by: Kyeongdon Kim <kyeongdon@lge.com>
---
 mm/ksm.c | 48 
 1 file changed, 44 insertions(+), 4 deletions(-)

diff --git a/mm/ksm.c b/mm/ksm.c
index be8f457..9280569 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -134,6 +134,7 @@ struct ksm_scan {
  * @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid)
  * @chain_prune_time: time of the last full garbage collection
  * @rmap_hlist_len: number of rmap_item entries in hlist or STABLE_NODE_CHAIN
+ * @oldchecksum: previous checksum of the page about a stable_node
  * @nid: NUMA node id of stable tree in which linked (may not match kpfn)
  */
 struct stable_node {
@@ -159,6 +160,7 @@ struct stable_node {
 */
 #define STABLE_NODE_CHAIN -1024
int rmap_hlist_len;
+   u32 oldchecksum;
 #ifdef CONFIG_NUMA
int nid;
 #endif
@@ -1522,7 +1524,7 @@ static __always_inline struct page *chain(struct 
stable_node **s_n_d,
  * This function returns the stable tree node of identical content if found,
  * NULL otherwise.
  */
-static struct page *stable_tree_search(struct page *page)
+static struct page *stable_tree_search(struct page *page, u32 checksum)
 {
int nid;
struct rb_root *root;
@@ -1550,6 +1552,18 @@ static struct page *stable_tree_search(struct page *page)
 
cond_resched();
stable_node = rb_entry(*new, struct stable_node, node);
+
+   /* first make rb_tree by checksum */
+   if (checksum < stable_node->oldchecksum) {
+   parent = *new;
+   new = &parent->rb_left;
+   continue;
+   } else if (checksum > stable_node->oldchecksum) {
+   parent = *new;
+   new = &parent->rb_right;
+   continue;
+   }
+
stable_node_any = NULL;
	tree_page = chain_prune(&stable_node_dup, &stable_node, root);
/*
@@ -1768,7 +1782,7 @@ static struct page *stable_tree_search(struct page *page)
  * This function returns the stable tree node just allocated on success,
  * NULL otherwise.
  */
-static struct stable_node *stable_tree_insert(struct page *kpage)
+static struct stable_node *stable_tree_insert(struct page *kpage, u32 checksum)
 {
int nid;
unsigned long kpfn;
@@ -1792,6 +1806,18 @@ static struct stable_node *stable_tree_insert(struct 
page *kpage)
cond_resched();
stable_node = rb_entry(*new, struct stable_node, node);
stable_node_any = NULL;
+
+   /* first make rb_tree by checksum */
+   if (checksum < stable_node->oldchecksum) {
+   parent = *new;
+   new = &parent->rb_left;
+   continue;
+   } else if (checksum > stable_node->oldchecksum) {
+   parent = *new;
+   new = &parent->rb_right;
+   continue;
+   }
+
	tree_page = chain(&stable_node_dup, stable_node, root);
if (!stable_node_dup) {
/*
@@ -1850,6 +1876,7 @@ static struct stable_node *stable_tree_insert(s

[PATCH v2] ksm : use checksum and memcmp for rb_tree

2017-11-14 Thread Kyeongdon Kim
The current ksm is using memcmp to insert and search 'rb_tree'.
It does cause very expensive computation cost.
In order to reduce the time of this operation,
we have added a checksum to traverse.

Nearly all 'rb_node' in stable_tree_insert() function
can be inserted as a checksum, most of it is possible
in unstable_tree_search_insert() function.
In stable_tree_search() function, the checksum may be an additional.
But, checksum check duration is extremely small.
Considering the time of the whole cmp_and_merge_page() function,
it requires very little cost on average.

Using this patch, we compared the time of ksm_do_scan() function
by adding kernel trace at the start-end position of operation.
(ARM 32bit target android device,
over 1000 sample time gap stamps average)

On original KSM scan avg duration = 0.0166893 sec
14991.975619 : ksm_do_scan_start: START: ksm_do_scan
14991.990975 : ksm_do_scan_end: END: ksm_do_scan
14992.008989 : ksm_do_scan_start: START: ksm_do_scan
14992.016839 : ksm_do_scan_end: END: ksm_do_scan
...

On patch KSM scan avg duration = 0.0041157 sec
41081.46131 : ksm_do_scan_start : START: ksm_do_scan
41081.46636 : ksm_do_scan_end : END: ksm_do_scan
41081.48476 : ksm_do_scan_start : START: ksm_do_scan
41081.48795 : ksm_do_scan_end : END: ksm_do_scan
...

We have tested randomly so many times for the stability
and couldn't see any abnormal issue until now.
Also, we found out this patch can make some good advantage
for the power consumption than KSM default enable.

v1 -> v2
- add comment for oldchecksum value
- move the oldchecksum value out of union
- remove check code regarding checksum 0 in stable_tree_search()

link to v1 : https://lkml.org/lkml/2017/10/30/251

Signed-off-by: Kyeongdon Kim 
---
 mm/ksm.c | 48 
 1 file changed, 44 insertions(+), 4 deletions(-)

diff --git a/mm/ksm.c b/mm/ksm.c
index be8f457..9280569 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -134,6 +134,7 @@ struct ksm_scan {
  * @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid)
  * @chain_prune_time: time of the last full garbage collection
  * @rmap_hlist_len: number of rmap_item entries in hlist or STABLE_NODE_CHAIN
+ * @oldchecksum: previous checksum of the page about a stable_node
  * @nid: NUMA node id of stable tree in which linked (may not match kpfn)
  */
 struct stable_node {
@@ -159,6 +160,7 @@ struct stable_node {
 */
 #define STABLE_NODE_CHAIN -1024
int rmap_hlist_len;
+   u32 oldchecksum;
 #ifdef CONFIG_NUMA
int nid;
 #endif
@@ -1522,7 +1524,7 @@ static __always_inline struct page *chain(struct 
stable_node **s_n_d,
  * This function returns the stable tree node of identical content if found,
  * NULL otherwise.
  */
-static struct page *stable_tree_search(struct page *page)
+static struct page *stable_tree_search(struct page *page, u32 checksum)
 {
int nid;
struct rb_root *root;
@@ -1550,6 +1552,18 @@ static struct page *stable_tree_search(struct page *page)
 
cond_resched();
stable_node = rb_entry(*new, struct stable_node, node);
+
+   /* first make rb_tree by checksum */
+   if (checksum < stable_node->oldchecksum) {
+   parent = *new;
+   new = &parent->rb_left;
+   continue;
+   } else if (checksum > stable_node->oldchecksum) {
+   parent = *new;
+   new = &parent->rb_right;
+   continue;
+   }
+
stable_node_any = NULL;
	tree_page = chain_prune(&stable_node_dup, &stable_node, root);
/*
@@ -1768,7 +1782,7 @@ static struct page *stable_tree_search(struct page *page)
  * This function returns the stable tree node just allocated on success,
  * NULL otherwise.
  */
-static struct stable_node *stable_tree_insert(struct page *kpage)
+static struct stable_node *stable_tree_insert(struct page *kpage, u32 checksum)
 {
int nid;
unsigned long kpfn;
@@ -1792,6 +1806,18 @@ static struct stable_node *stable_tree_insert(struct 
page *kpage)
cond_resched();
stable_node = rb_entry(*new, struct stable_node, node);
stable_node_any = NULL;
+
+   /* first make rb_tree by checksum */
+   if (checksum < stable_node->oldchecksum) {
+   parent = *new;
+   new = &parent->rb_left;
+   continue;
+   } else if (checksum > stable_node->oldchecksum) {
+   parent = *new;
+   new = &parent->rb_right;
+   continue;
+   }
+
	tree_page = chain(&stable_node_dup, stable_node, root);
if (!stable_node_dup) {
/*
@@ -1850,6 +1876,7 @@ static struct stable_node *stable_tree_insert(struct page 
*kpage)
 
 

Re: Re: [PATCH] ksm : use checksum and memcmp for rb_tree

2017-11-06 Thread Kyeongdon Kim
Sorry, re-send this email because of the Delivery failed message (to 
linux-kernel)


On 2017-10-30 오후 10:22, Timofey Titovets wrote:

2017-10-30 15:03 GMT+03:00 Kyeongdon Kim <kyeongdon@lge.com>:
> The current ksm is using memcmp to insert and search 'rb_tree'.
> It does cause very expensive computation cost.
> In order to reduce the time of this operation,
> we have added a checksum to traverse before memcmp operation.
>
> Nearly all 'rb_node' in stable_tree_insert() function
> can be inserted as a checksum, most of it is possible
> in unstable_tree_search_insert() function.
> In stable_tree_search() function, the checksum may be an additional.
> But, checksum check duration is extremely small.
> Considering the time of the whole cmp_and_merge_page() function,
> it requires very little cost on average.
>
> Using this patch, we compared the time of ksm_do_scan() function
> by adding kernel trace at the start-end position of operation.
> (ARM 32bit target android device,
> over 1000 sample time gap stamps average)
>
> On original KSM scan avg duration = 0.0166893 sec
> 24991.975619 : ksm_do_scan_start: START: ksm_do_scan
> 24991.990975 : ksm_do_scan_end: END: ksm_do_scan
> 24992.008989 : ksm_do_scan_start: START: ksm_do_scan
> 24992.016839 : ksm_do_scan_end: END: ksm_do_scan
> ...
>
> On patch KSM scan avg duration = 0.0041157 sec
> 41081.461312 : ksm_do_scan_start: START: ksm_do_scan
> 41081.466364 : ksm_do_scan_end: END: ksm_do_scan
> 41081.484767 : ksm_do_scan_start: START: ksm_do_scan
> 41081.487951 : ksm_do_scan_end: END: ksm_do_scan
> ...
>
> We have tested randomly so many times for the stability
> and couldn't see any abnormal issue until now.
> Also, we found out this patch can make some good advantage
> for the power consumption than KSM default enable.
>
> Signed-off-by: Kyeongdon Kim <kyeongdon@lge.com>
> ---
> mm/ksm.c | 49 +
> 1 file changed, 45 insertions(+), 4 deletions(-)
>
> diff --git a/mm/ksm.c b/mm/ksm.c
> index be8f457..66ab4f4 100644
> --- a/mm/ksm.c
> +++ b/mm/ksm.c
> @@ -150,6 +150,7 @@ struct stable_node {
> struct hlist_head hlist;
> union {
> unsigned long kpfn;
> + u32 oldchecksum;
> unsigned long chain_prune_time;
> };
> /*

May be just checksum? i.e. that's can be "old", where checksum can 
change,

in stable tree, checksum also stable.

Also, as checksum are stable, may be that make a sense to move it out
of union? (I'm afraid of clashes)

Also, you miss update comment above struct stable_node, about checksum 
var.



Thanks for your comment, and we may change those lines like below :

+ * @oldchecksum: previous checksum of the page about a stable_node
  * @nid: NUMA node id of stable tree in which linked (may not match kpfn)
  */
 struct stable_node {
@@ -159,6 +160,7 @@ struct stable_node {
 */
 #define STABLE_NODE_CHAIN -1024
    int rmap_hlist_len;
+   u32 oldchecksum;
 #ifdef CONFIG_NUMA

And I think if checksum are matched,
then we can use original memcmp logic in stable tree.
the worst case that I imagine is no page merging(just in that moment).
But, in my humble opinion, there will be no critical memory issue. but 
just return.

(as I said, we tested a lot to check some abnormal memory operation,
but so far, so good - only performance improvement)
> @@ -1522,7 +1523,7 @@ static __always_inline struct page 
*chain(struct stable_node **s_n_d,
> * This function returns the stable tree node of identical content if 
found,

> * NULL otherwise.
> */
> -static struct page *stable_tree_search(struct page *page)
> +static struct page *stable_tree_search(struct page *page, u32 
checksum)

> {
> int nid;
> struct rb_root *root;
> @@ -1540,6 +1541,8 @@ static struct page *stable_tree_search(struct 
page *page)

>
> nid = get_kpfn_nid(page_to_pfn(page));
> root = root_stable_tree + nid;
> + if (!checksum)
> + return NULL;

That's not a pointer, and 0x0 - is a valid checksum.
Also, jhash2 not so collision free, i.e.:
jhash2((uint32_t *) &num, 2, 17);

Example of collisions, where hash = 0x0:
hash: 0x0 - num: 610041898
hash: 0x0 - num: 4893164379
hash: 0x0 - num: 16423540221
hash: 0x0 - num: 29036382188

You also compare values, so hash = 0, is a acceptable checksum.


well, if then, I can remove this check line.


Thanks,
anyway in general idea looks good.

Reviewed-by: Timofey Titovets <nefelim...@gmail.com>

--
Have a nice day,
Timofey.

Thanks a lot :)
Actually, our organization want to use this KSM feature in general,
but, current logic needs too high cost.
So I wish to change more light version.
Please kindly give your opinion on this idea.

Thanks,
Kyeongdon Kim


Re: Re: [PATCH] ksm : use checksum and memcmp for rb_tree

2017-11-06 Thread Kyeongdon Kim
Sorry, re-send this email because of the Delivery failed message (to 
linux-kernel)


On 2017-10-30 오후 10:22, Timofey Titovets wrote:

2017-10-30 15:03 GMT+03:00 Kyeongdon Kim :
> The current ksm is using memcmp to insert and search 'rb_tree'.
> It does cause very expensive computation cost.
> In order to reduce the time of this operation,
> we have added a checksum to traverse before memcmp operation.
>
> Nearly all 'rb_node' in stable_tree_insert() function
> can be inserted as a checksum, most of it is possible
> in unstable_tree_search_insert() function.
> In stable_tree_search() function, the checksum may be an additional.
> But, checksum check duration is extremely small.
> Considering the time of the whole cmp_and_merge_page() function,
> it requires very little cost on average.
>
> Using this patch, we compared the time of ksm_do_scan() function
> by adding kernel trace at the start-end position of operation.
> (ARM 32bit target android device,
> over 1000 sample time gap stamps average)
>
> On original KSM scan avg duration = 0.0166893 sec
> 24991.975619 : ksm_do_scan_start: START: ksm_do_scan
> 24991.990975 : ksm_do_scan_end: END: ksm_do_scan
> 24992.008989 : ksm_do_scan_start: START: ksm_do_scan
> 24992.016839 : ksm_do_scan_end: END: ksm_do_scan
> ...
>
> On patch KSM scan avg duration = 0.0041157 sec
> 41081.461312 : ksm_do_scan_start: START: ksm_do_scan
> 41081.466364 : ksm_do_scan_end: END: ksm_do_scan
> 41081.484767 : ksm_do_scan_start: START: ksm_do_scan
> 41081.487951 : ksm_do_scan_end: END: ksm_do_scan
> ...
>
> We have tested randomly so many times for the stability
> and couldn't see any abnormal issue until now.
> Also, we found out this patch can make some good advantage
> for the power consumption than KSM default enable.
>
> Signed-off-by: Kyeongdon Kim 
> ---
> mm/ksm.c | 49 +
> 1 file changed, 45 insertions(+), 4 deletions(-)
>
> diff --git a/mm/ksm.c b/mm/ksm.c
> index be8f457..66ab4f4 100644
> --- a/mm/ksm.c
> +++ b/mm/ksm.c
> @@ -150,6 +150,7 @@ struct stable_node {
> struct hlist_head hlist;
> union {
> unsigned long kpfn;
> + u32 oldchecksum;
> unsigned long chain_prune_time;
> };
> /*

May be just checksum? i.e. that's can be "old", where checksum can 
change,

in stable tree, checksum also stable.

Also, as checksum are stable, may be that make a sense to move it out
of union? (I'm afraid of clashes)

Also, you miss update comment above struct stable_node, about checksum 
var.



Thanks for your comment, and we may change those lines like below :

+ * @oldchecksum: previous checksum of the page about a stable_node
  * @nid: NUMA node id of stable tree in which linked (may not match kpfn)
  */
 struct stable_node {
@@ -159,6 +160,7 @@ struct stable_node {
 */
 #define STABLE_NODE_CHAIN -1024
    int rmap_hlist_len;
+   u32 oldchecksum;
 #ifdef CONFIG_NUMA

And I think if checksum are matched,
then we can use original memcmp logic in stable tree.
the worst case that I imagine is no page merging(just in that moment).
But, in my humble opinion, there will be no critical memory issue. but 
just return.

(as I said, we tested a lot to check some abnormal memory operation,
but so far, so good - only performance improvement)
> @@ -1522,7 +1523,7 @@ static __always_inline struct page 
*chain(struct stable_node **s_n_d,
> * This function returns the stable tree node of identical content if 
found,

> * NULL otherwise.
> */
> -static struct page *stable_tree_search(struct page *page)
> +static struct page *stable_tree_search(struct page *page, u32 
checksum)

> {
> int nid;
> struct rb_root *root;
> @@ -1540,6 +1541,8 @@ static struct page *stable_tree_search(struct 
page *page)

>
> nid = get_kpfn_nid(page_to_pfn(page));
> root = root_stable_tree + nid;
> + if (!checksum)
> + return NULL;

That's not a pointer, and 0x0 - is a valid checksum.
Also, jhash2 not so collision free, i.e.:
jhash2((uint32_t *) &num, 2, 17);

Example of collisions, where hash = 0x0:
hash: 0x0 - num: 610041898
hash: 0x0 - num: 4893164379
hash: 0x0 - num: 16423540221
hash: 0x0 - num: 29036382188

You also compare values, so hash = 0, is a acceptable checksum.


well, if then, I can remove this check line.


Thanks,
anyway in general idea looks good.

Reviewed-by: Timofey Titovets 

--
Have a nice day,
Timofey.

Thanks a lot :)
Actually, our organization want to use this KSM feature in general,
but, current logic needs too high cost.
So I wish to change more light version.
Please kindly give your opinion on this idea.

Thanks,
Kyeongdon Kim


[PATCH] ksm : use checksum and memcmp for rb_tree

2017-10-30 Thread Kyeongdon Kim
The current ksm is using memcmp to insert and search 'rb_tree'.
It does cause very expensive computation cost.
In order to reduce the time of this operation,
we have added a checksum to traverse before memcmp operation.

Nearly all 'rb_node' in stable_tree_insert() function
can be inserted as a checksum, most of it is possible
in unstable_tree_search_insert() function.
In stable_tree_search() function, the checksum may be an additional.
But, checksum check duration is extremely small.
Considering the time of the whole cmp_and_merge_page() function,
it requires very little cost on average.

Using this patch, we compared the time of ksm_do_scan() function
by adding kernel trace at the start-end position of operation.
(ARM 32bit target android device,
over 1000 sample time gap stamps average)

On original KSM scan avg duration = 0.0166893 sec
24991.975619 : ksm_do_scan_start: START: ksm_do_scan
24991.990975 : ksm_do_scan_end: END: ksm_do_scan
24992.008989 : ksm_do_scan_start: START: ksm_do_scan
24992.016839 : ksm_do_scan_end: END: ksm_do_scan
...

On patch KSM scan avg duration = 0.0041157 sec
41081.461312 : ksm_do_scan_start: START: ksm_do_scan
41081.466364 : ksm_do_scan_end: END: ksm_do_scan
41081.484767 : ksm_do_scan_start: START: ksm_do_scan
41081.487951 : ksm_do_scan_end: END: ksm_do_scan
...

We have tested randomly so many times for the stability
and couldn't see any abnormal issue until now.
Also, we found out this patch can make some good advantage
for the power consumption than KSM default enable.

Signed-off-by: Kyeongdon Kim <kyeongdon@lge.com>
---
 mm/ksm.c | 49 +
 1 file changed, 45 insertions(+), 4 deletions(-)

diff --git a/mm/ksm.c b/mm/ksm.c
index be8f457..66ab4f4 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -150,6 +150,7 @@ struct stable_node {
struct hlist_head hlist;
union {
unsigned long kpfn;
+   u32 oldchecksum;
unsigned long chain_prune_time;
};
/*
@@ -1522,7 +1523,7 @@ static __always_inline struct page *chain(struct 
stable_node **s_n_d,
  * This function returns the stable tree node of identical content if found,
  * NULL otherwise.
  */
-static struct page *stable_tree_search(struct page *page)
+static struct page *stable_tree_search(struct page *page, u32 checksum)
 {
int nid;
struct rb_root *root;
@@ -1540,6 +1541,8 @@ static struct page *stable_tree_search(struct page *page)
 
nid = get_kpfn_nid(page_to_pfn(page));
root = root_stable_tree + nid;
+   if (!checksum)
+   return NULL;
 again:
	new = &root->rb_node;
parent = NULL;
@@ -1550,6 +1553,18 @@ static struct page *stable_tree_search(struct page *page)
 
cond_resched();
stable_node = rb_entry(*new, struct stable_node, node);
+
+   /* first make rb_tree by checksum */
+   if (checksum < stable_node->oldchecksum) {
+   parent = *new;
+   new = &parent->rb_left;
+   continue;
+   } else if (checksum > stable_node->oldchecksum) {
+   parent = *new;
+   new = &parent->rb_right;
+   continue;
+   }
+
stable_node_any = NULL;
	tree_page = chain_prune(&stable_node_dup, &stable_node, root);
/*
@@ -1768,7 +1783,7 @@ static struct page *stable_tree_search(struct page *page)
  * This function returns the stable tree node just allocated on success,
  * NULL otherwise.
  */
-static struct stable_node *stable_tree_insert(struct page *kpage)
+static struct stable_node *stable_tree_insert(struct page *kpage, u32 checksum)
 {
int nid;
unsigned long kpfn;
@@ -1792,6 +1807,18 @@ static struct stable_node *stable_tree_insert(struct 
page *kpage)
cond_resched();
stable_node = rb_entry(*new, struct stable_node, node);
stable_node_any = NULL;
+
+   /* first make rb_tree by checksum */
+   if (checksum < stable_node->oldchecksum) {
+   parent = *new;
+   new = &parent->rb_left;
+   continue;
+   } else if (checksum > stable_node->oldchecksum) {
+   parent = *new;
+   new = &parent->rb_right;
+   continue;
+   }
+
	tree_page = chain(&stable_node_dup, stable_node, root);
if (!stable_node_dup) {
/*
@@ -1850,6 +1877,7 @@ static struct stable_node *stable_tree_insert(struct page 
*kpage)
 
	INIT_HLIST_HEAD(&stable_node_dup->hlist);
stable_node_dup->kpfn = kpfn;
+   stable_node_dup->oldchecksum = checksum;
set_page_stable_node(kpage, stable_node_dup);
stable_node_dup->rmap_hlist_len = 0;
DO_NUMA(stable_node

[PATCH] ksm : use checksum and memcmp for rb_tree

2017-10-30 Thread Kyeongdon Kim
The current ksm is using memcmp to insert and search 'rb_tree'.
It does cause very expensive computation cost.
In order to reduce the time of this operation,
we have added a checksum to traverse before memcmp operation.

Nearly all 'rb_node' in stable_tree_insert() function
can be inserted as a checksum, most of it is possible
in unstable_tree_search_insert() function.
In stable_tree_search() function, the checksum may be an additional.
But, checksum check duration is extremely small.
Considering the time of the whole cmp_and_merge_page() function,
it requires very little cost on average.

Using this patch, we compared the time of ksm_do_scan() function
by adding kernel trace at the start-end position of operation.
(ARM 32bit target android device,
over 1000 sample time gap stamps average)

On original KSM scan avg duration = 0.0166893 sec
24991.975619 : ksm_do_scan_start: START: ksm_do_scan
24991.990975 : ksm_do_scan_end: END: ksm_do_scan
24992.008989 : ksm_do_scan_start: START: ksm_do_scan
24992.016839 : ksm_do_scan_end: END: ksm_do_scan
...

On patch KSM scan avg duration = 0.0041157 sec
41081.461312 : ksm_do_scan_start: START: ksm_do_scan
41081.466364 : ksm_do_scan_end: END: ksm_do_scan
41081.484767 : ksm_do_scan_start: START: ksm_do_scan
41081.487951 : ksm_do_scan_end: END: ksm_do_scan
...

We have tested randomly so many times for the stability
and couldn't see any abnormal issue until now.
Also, we found out this patch can make some good advantage
for the power consumption than KSM default enable.

Signed-off-by: Kyeongdon Kim 
---
 mm/ksm.c | 49 +
 1 file changed, 45 insertions(+), 4 deletions(-)

diff --git a/mm/ksm.c b/mm/ksm.c
index be8f457..66ab4f4 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -150,6 +150,7 @@ struct stable_node {
struct hlist_head hlist;
union {
unsigned long kpfn;
+   u32 oldchecksum;
unsigned long chain_prune_time;
};
/*
@@ -1522,7 +1523,7 @@ static __always_inline struct page *chain(struct 
stable_node **s_n_d,
  * This function returns the stable tree node of identical content if found,
  * NULL otherwise.
  */
-static struct page *stable_tree_search(struct page *page)
+static struct page *stable_tree_search(struct page *page, u32 checksum)
 {
int nid;
struct rb_root *root;
@@ -1540,6 +1541,8 @@ static struct page *stable_tree_search(struct page *page)
 
nid = get_kpfn_nid(page_to_pfn(page));
root = root_stable_tree + nid;
+   if (!checksum)
+   return NULL;
 again:
	new = &root->rb_node;
parent = NULL;
@@ -1550,6 +1553,18 @@ static struct page *stable_tree_search(struct page *page)
 
cond_resched();
stable_node = rb_entry(*new, struct stable_node, node);
+
+   /* first make rb_tree by checksum */
+   if (checksum < stable_node->oldchecksum) {
+   parent = *new;
+   new = &parent->rb_left;
+   continue;
+   } else if (checksum > stable_node->oldchecksum) {
+   parent = *new;
+   new = &parent->rb_right;
+   continue;
+   }
+
stable_node_any = NULL;
	tree_page = chain_prune(&stable_node_dup, &stable_node, root);
/*
@@ -1768,7 +1783,7 @@ static struct page *stable_tree_search(struct page *page)
  * This function returns the stable tree node just allocated on success,
  * NULL otherwise.
  */
-static struct stable_node *stable_tree_insert(struct page *kpage)
+static struct stable_node *stable_tree_insert(struct page *kpage, u32 checksum)
 {
int nid;
unsigned long kpfn;
@@ -1792,6 +1807,18 @@ static struct stable_node *stable_tree_insert(struct 
page *kpage)
cond_resched();
stable_node = rb_entry(*new, struct stable_node, node);
stable_node_any = NULL;
+
+   /* first make rb_tree by checksum */
+   if (checksum < stable_node->oldchecksum) {
+   parent = *new;
+   new = &parent->rb_left;
+   continue;
+   } else if (checksum > stable_node->oldchecksum) {
+   parent = *new;
+   new = &parent->rb_right;
+   continue;
+   }
+
	tree_page = chain(&stable_node_dup, stable_node, root);
if (!stable_node_dup) {
/*
@@ -1850,6 +1877,7 @@ static struct stable_node *stable_tree_insert(struct page 
*kpage)
 
	INIT_HLIST_HEAD(&stable_node_dup->hlist);
stable_node_dup->kpfn = kpfn;
+   stable_node_dup->oldchecksum = checksum;
set_page_stable_node(kpage, stable_node_dup);
stable_node_dup->rmap_hlist_len = 0;
DO_NUMA(stable_node_dup->nid = nid);
@@ -1907,6 +19

Re: Re: [PATCH] mm/vmstats: add counters for the page frag cache

2017-09-08 Thread Kyeongdon Kim

On 2017-09-04 오후 5:30, Konstantin Khlebnikov wrote:

On 04.09.2017 04:35, Kyeongdon Kim wrote:
> Thanks for your reply,
> But I couldn't find "NR_FRAGMENT_PAGES" in linux-next.git .. is that 
vmstat counter? or others?

>

I mean rather than adding bunch vmstat counters for operations it 
might be

worth to add page counter which will show current amount of these pages.
But this seems too low-level for tracking, common counters for all 
network

buffers would be more useful but much harder to implement.


Ok, thanks for the comment.
As I can see page owner is able to save stacktrace where allocation 
happened,
this makes debugging mostly trivial without any counters. If it adds 
too much
overhead - just track random 1% of pages, should be enough for finding 
leak.



As I said, we already used page owner tools to resolve the leak issue.
But that's extremely difficult to figure it out,
too much callstack and too much allocation orders(0 or more).
We couldn't easily get a clue event if we track 1% of pages..

In fact, I was writing another email to send a new patch with debug config.
We added a hash function to pick out address with callstack by using 
debugfs,

It could be showing the only page_frag_cache leak with owner.

for exmaple code :
+++ /mm/page_alloc.c
@@ -4371,7 +4371,9 @@ void *page_frag_alloc(struct page_frag_cache *nc,

    nc->pagecnt_bias--;
    nc->offset = offset;
+#ifdef CONFIG_PGFRAG_DEBUG
+   page_frag_debug_alloc(nc->va + offset);
+#endif
    return nc->va + offset;
 }
 EXPORT_SYMBOL(page_frag_alloc);
@@ -4382,7 +4384,9 @@ EXPORT_SYMBOL(page_frag_alloc);
 void page_frag_free(void *addr)
 {
    struct page *page = virt_to_head_page(addr);
+#ifdef CONFIG_PGFRAG_DEBUG
+   page_frag_debug_free(addr);
+#endif
    if (unlikely(put_page_testzero(page)))

Those counters that I added may be too much for the linux server or 
something.

However, I think the other systems may need to simple debugging method.
(like Android OS)

So if you can accept the patch with debug feature,
I will send it including counters.
but still thinking those counters don't need. I won't.

Anyway, I'm grateful for your feedback, means a lot to me.

Thanks,
Kyeongdon Kim
> As you know, page_frag_alloc() directly calls 
__alloc_pages_nodemask() function,
> so that makes too difficult to see memory usage in real time even 
though we have "/meminfo or /slabinfo.." information.
> If there was a way already to figure out the memory leakage from 
page_frag_cache in mainline, I agree your opinion

> but I think we don't have it now.
>
> If those counters too much in my patch,
> I can say two values (pgfrag_alloc and pgfrag_free) are enough to 
guess what will happen

> and would remove pgfrag_alloc_calls and pgfrag_free_calls.
>
> Thanks,
> Kyeongdon Kim
>

>> IMHO that's too much counters.
>> Per-node NR_FRAGMENT_PAGES should be enough for guessing what's 
going on.

>> Perf probes provides enough features for furhter debugging.
>>
>> On 01.09.2017 02:37, Kyeongdon Kim wrote:
>> > There was a memory leak problem when we did stressful test
>> > on Android device.
>> > The root cause of this was from page_frag_cache alloc
>> > and it was very hard to find out.
>> >
>> > We add to count the page frag allocation and free with function 
call.

>> > The gap between pgfrag_alloc and pgfrag_free is good to calculate
>> > for the amount of page.
>> > The gap between pgfrag_alloc_calls and pgfrag_free_calls is for
>> > sub-indicator.
>> > They can see trends of memory usage during the test.
>> > Without it, it's difficult to check page frag usage so I believe we
>> > should add it.
>> >
>> > Signed-off-by: Kyeongdon Kim <kyeongdon@lge.com>
>> > ---




Re: Re: [PATCH] mm/vmstats: add counters for the page frag cache

2017-09-08 Thread Kyeongdon Kim

On 2017-09-04 오후 5:30, Konstantin Khlebnikov wrote:

On 04.09.2017 04:35, Kyeongdon Kim wrote:
> Thanks for your reply,
> But I couldn't find "NR_FRAGMENT_PAGES" in linux-next.git .. is that 
vmstat counter? or others?

>

I mean rather than adding bunch vmstat counters for operations it 
might be

worth to add page counter which will show current amount of these pages.
But this seems too low-level for tracking, common counters for all 
network

buffers would be more useful but much harder to implement.


Ok, thanks for the comment.
As I can see page owner is able to save stacktrace where allocation 
happened,
this makes debugging mostly trivial without any counters. If it adds 
too much
overhead - just track random 1% of pages, should be enough for finding 
leak.



As I said, we already used page owner tools to resolve the leak issue.
But that's extremely difficult to figure it out,
too much callstack and too much allocation orders(0 or more).
We couldn't easily get a clue even if we track 1% of pages..

In fact, I was writing another email to send a new patch with debug config.
We added a hash function to pick out address with callstack by using 
debugfs,

It could be showing the only page_frag_cache leak with owner.

for exmaple code :
+++ /mm/page_alloc.c
@@ -4371,7 +4371,9 @@ void *page_frag_alloc(struct page_frag_cache *nc,

    nc->pagecnt_bias--;
    nc->offset = offset;
+#ifdef CONFIG_PGFRAG_DEBUG
+   page_frag_debug_alloc(nc->va + offset);
+#endif
    return nc->va + offset;
 }
 EXPORT_SYMBOL(page_frag_alloc);
@@ -4382,7 +4384,9 @@ EXPORT_SYMBOL(page_frag_alloc);
 void page_frag_free(void *addr)
 {
    struct page *page = virt_to_head_page(addr);
+#ifdef CONFIG_PGFRAG_DEBUG
+   page_frag_debug_free(addr);
+#endif
    if (unlikely(put_page_testzero(page)))

Those counters that I added may be too much for the linux server or 
something.

However, I think the other systems may need to simple debugging method.
(like Android OS)

So if you can accept the patch with debug feature,
I will send it including counters.
but still thinking those counters don't need. I won't.

Anyway, I'm grateful for your feedback, means a lot to me.

Thanks,
Kyeongdon Kim
> As you know, page_frag_alloc() directly calls 
__alloc_pages_nodemask() function,
> so that makes too difficult to see memory usage in real time even 
though we have "/meminfo or /slabinfo.." information.
> If there was a way already to figure out the memory leakage from 
page_frag_cache in mainline, I agree your opinion

> but I think we don't have it now.
>
> If those counters too much in my patch,
> I can say two values (pgfrag_alloc and pgfrag_free) are enough to 
guess what will happen

> and would remove pgfrag_alloc_calls and pgfrag_free_calls.
>
> Thanks,
> Kyeongdon Kim
>

>> IMHO that's too much counters.
>> Per-node NR_FRAGMENT_PAGES should be enough for guessing what's 
going on.

>> Perf probes provides enough features for furhter debugging.
>>
>> On 01.09.2017 02:37, Kyeongdon Kim wrote:
>> > There was a memory leak problem when we did stressful test
>> > on Android device.
>> > The root cause of this was from page_frag_cache alloc
>> > and it was very hard to find out.
>> >
>> > We add to count the page frag allocation and free with function 
call.

>> > The gap between pgfrag_alloc and pgfrag_free is good to calculate
>> > for the amount of page.
>> > The gap between pgfrag_alloc_calls and pgfrag_free_calls is for
>> > sub-indicator.
>> > They can see trends of memory usage during the test.
>> > Without it, it's difficult to check page frag usage so I believe we
>> > should add it.
>> >
>> > Signed-off-by: Kyeongdon Kim 
>> > ---




[PATCH] selinux: Use kmem_cache for hashtab_node

2017-09-06 Thread Kyeongdon Kim
During random test as own device to check slub account,
we found some slack memory from hashtab_node(kmalloc-64).
By using kzalloc(), middle of test result like below:
allocated size 240768
request size 45144
slack size 195624
allocation count 3762

So, we want to use kmem_cache_zalloc() and that
can reduce memory size 52byte(slack size/alloc count) per each struct.

Signed-off-by: Kyeongdon Kim <kyeongdon@lge.com>
---
 security/selinux/ss/hashtab.c  | 17 +++--
 security/selinux/ss/hashtab.h  |  4 
 security/selinux/ss/services.c |  4 
 3 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/security/selinux/ss/hashtab.c b/security/selinux/ss/hashtab.c
index 686c391..bef7577 100644
--- a/security/selinux/ss/hashtab.c
+++ b/security/selinux/ss/hashtab.c
@@ -9,6 +9,8 @@
 #include 
 #include "hashtab.h"
 
+static struct kmem_cache *hashtab_node_cachep;
+
 struct hashtab *hashtab_create(u32 (*hash_value)(struct hashtab *h, const void 
*key),
   int (*keycmp)(struct hashtab *h, const void 
*key1, const void *key2),
   u32 size)
@@ -57,7 +59,7 @@ int hashtab_insert(struct hashtab *h, void *key, void *datum)
if (cur && (h->keycmp(h, key, cur->key) == 0))
return -EEXIST;
 
-   newnode = kzalloc(sizeof(*newnode), GFP_KERNEL);
+   newnode = kmem_cache_zalloc(hashtab_node_cachep, GFP_KERNEL);
if (!newnode)
return -ENOMEM;
newnode->key = key;
@@ -106,7 +108,7 @@ void hashtab_destroy(struct hashtab *h)
while (cur) {
temp = cur;
cur = cur->next;
-   kfree(temp);
+   kmem_cache_free(hashtab_node_cachep, temp);
}
h->htable[i] = NULL;
}
@@ -166,3 +168,14 @@ void hashtab_stat(struct hashtab *h, struct hashtab_info 
*info)
info->slots_used = slots_used;
info->max_chain_len = max_chain_len;
 }
+void hashtab_cache_init(void)
+{
+   hashtab_node_cachep = kmem_cache_create("hashtab_node",
+   sizeof(struct hashtab_node),
+   0, SLAB_PANIC, NULL);
+}
+
+void hashtab_cache_destroy(void)
+{
+   kmem_cache_destroy(hashtab_node_cachep);
+}
diff --git a/security/selinux/ss/hashtab.h b/security/selinux/ss/hashtab.h
index 009fb5e..d6883d3 100644
--- a/security/selinux/ss/hashtab.h
+++ b/security/selinux/ss/hashtab.h
@@ -84,4 +84,8 @@ int hashtab_map(struct hashtab *h,
 /* Fill info with some hash table statistics */
 void hashtab_stat(struct hashtab *h, struct hashtab_info *info);
 
+/* Use kmem_cache for hashtab_node */
+void hashtab_cache_init(void);
+void hashtab_cache_destroy(void);
+
 #endif /* _SS_HASHTAB_H */
diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c
index e4a1c0d..33cfe5d 100644
--- a/security/selinux/ss/services.c
+++ b/security/selinux/ss/services.c
@@ -2060,10 +2060,12 @@ int security_load_policy(void *data, size_t len)
if (!ss_initialized) {
avtab_cache_init();
ebitmap_cache_init();
+   hashtab_cache_init();
rc = policydb_read(&policydb, fp);
if (rc) {
avtab_cache_destroy();
ebitmap_cache_destroy();
+   hashtab_cache_destroy();
goto out;
}
 
@@ -2075,6 +2077,7 @@ int security_load_policy(void *data, size_t len)
policydb_destroy();
avtab_cache_destroy();
ebitmap_cache_destroy();
+   hashtab_cache_destroy();
goto out;
}
 
@@ -2083,6 +2086,7 @@ int security_load_policy(void *data, size_t len)
policydb_destroy();
avtab_cache_destroy();
ebitmap_cache_destroy();
+   hashtab_cache_destroy();
goto out;
}
 
-- 
2.6.2



[PATCH] selinux: Use kmem_cache for hashtab_node

2017-09-06 Thread Kyeongdon Kim
During a random test on our own device to check the slub account,
we found some slack memory from hashtab_node(kmalloc-64).
By using kzalloc(), the middle-of-test result is as below:
allocated size 240768
request size 45144
slack size 195624
allocation count 3762

So, we want to use kmem_cache_zalloc() and that
can reduce the memory size by 52 bytes (slack size/alloc count) per struct.

Signed-off-by: Kyeongdon Kim 
---
 security/selinux/ss/hashtab.c  | 17 +++--
 security/selinux/ss/hashtab.h  |  4 
 security/selinux/ss/services.c |  4 
 3 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/security/selinux/ss/hashtab.c b/security/selinux/ss/hashtab.c
index 686c391..bef7577 100644
--- a/security/selinux/ss/hashtab.c
+++ b/security/selinux/ss/hashtab.c
@@ -9,6 +9,8 @@
 #include 
 #include "hashtab.h"
 
+static struct kmem_cache *hashtab_node_cachep;
+
 struct hashtab *hashtab_create(u32 (*hash_value)(struct hashtab *h, const void 
*key),
   int (*keycmp)(struct hashtab *h, const void 
*key1, const void *key2),
   u32 size)
@@ -57,7 +59,7 @@ int hashtab_insert(struct hashtab *h, void *key, void *datum)
if (cur && (h->keycmp(h, key, cur->key) == 0))
return -EEXIST;
 
-   newnode = kzalloc(sizeof(*newnode), GFP_KERNEL);
+   newnode = kmem_cache_zalloc(hashtab_node_cachep, GFP_KERNEL);
if (!newnode)
return -ENOMEM;
newnode->key = key;
@@ -106,7 +108,7 @@ void hashtab_destroy(struct hashtab *h)
while (cur) {
temp = cur;
cur = cur->next;
-   kfree(temp);
+   kmem_cache_free(hashtab_node_cachep, temp);
}
h->htable[i] = NULL;
}
@@ -166,3 +168,14 @@ void hashtab_stat(struct hashtab *h, struct hashtab_info 
*info)
info->slots_used = slots_used;
info->max_chain_len = max_chain_len;
 }
+void hashtab_cache_init(void)
+{
+   hashtab_node_cachep = kmem_cache_create("hashtab_node",
+   sizeof(struct hashtab_node),
+   0, SLAB_PANIC, NULL);
+}
+
+void hashtab_cache_destroy(void)
+{
+   kmem_cache_destroy(hashtab_node_cachep);
+}
diff --git a/security/selinux/ss/hashtab.h b/security/selinux/ss/hashtab.h
index 009fb5e..d6883d3 100644
--- a/security/selinux/ss/hashtab.h
+++ b/security/selinux/ss/hashtab.h
@@ -84,4 +84,8 @@ int hashtab_map(struct hashtab *h,
 /* Fill info with some hash table statistics */
 void hashtab_stat(struct hashtab *h, struct hashtab_info *info);
 
+/* Use kmem_cache for hashtab_node */
+void hashtab_cache_init(void);
+void hashtab_cache_destroy(void);
+
 #endif /* _SS_HASHTAB_H */
diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c
index e4a1c0d..33cfe5d 100644
--- a/security/selinux/ss/services.c
+++ b/security/selinux/ss/services.c
@@ -2060,10 +2060,12 @@ int security_load_policy(void *data, size_t len)
if (!ss_initialized) {
avtab_cache_init();
ebitmap_cache_init();
+   hashtab_cache_init();
rc = policydb_read(&policydb, fp);
if (rc) {
avtab_cache_destroy();
ebitmap_cache_destroy();
+   hashtab_cache_destroy();
goto out;
}
 
@@ -2075,6 +2077,7 @@ int security_load_policy(void *data, size_t len)
policydb_destroy();
avtab_cache_destroy();
ebitmap_cache_destroy();
+   hashtab_cache_destroy();
goto out;
}
 
@@ -2083,6 +2086,7 @@ int security_load_policy(void *data, size_t len)
policydb_destroy();
avtab_cache_destroy();
ebitmap_cache_destroy();
+   hashtab_cache_destroy();
goto out;
}
 
-- 
2.6.2



Re: Re: [PATCH] mm/vmstats: add counters for the page frag cache

2017-09-03 Thread Kyeongdon Kim

Thanks for your reply,

We already used other i/f like page_owner and kmemleak to resolve memory 
leakage issue.
But, page_owner can only for guess but cannot find intuitively memory 
usage regarding page_frag_cache.
And kmemleak cannot use (because of calling directly 
__alloc_pages_nodemask()).


Additionally, some embedded linux like Android or something..
is not able to always use kmemleak & page_owner because of runtime 
performance deterioration.
However, the root cause of this memory issue is from net device like 
wireless.
In short, should always use wireless on device but, cannot use those 
memory debug tools.


That's why those counters need..
and for much cheaper I can remove pgfrag_alloc_calls and pgfrag_free_calls.

Thanks,
Kyeongdon Kim

On 2017-09-01 오후 6:21, Michal Hocko wrote:

On Fri 01-09-17 12:12:36, Konstantin Khlebnikov wrote:
> IMHO that's too much counters.
> Per-node NR_FRAGMENT_PAGES should be enough for guessing what's 
going on.

> Perf probes provide enough features for further debugging.

I would tend to agree. Adding a counter based on a single debugging
instance sounds like an overkill to me. Counters should be pretty cheap
but this is way too specialized API to export to the userspace.

We have other interfaces to debug memory leaks like page_owner.
--
Michal Hocko
SUSE Labs




Re: Re: [PATCH] mm/vmstats: add counters for the page frag cache

2017-09-03 Thread Kyeongdon Kim

Thanks for your reply,

We already used other i/f like page_owner and kmemleak to resolve memory 
leakage issue.
But, page_owner can only for guess but cannot find intuitively memory 
usage regarding page_frag_cache.
And kmemleak cannot use (because of calling directly 
__alloc_pages_nodemask()).


Additionally, some embedded linux like Android or something..
is not able to always use kmemleak & page_owner because of runtime 
performance deterioration.
However, the root cause of this memory issue is from net device like 
wireless.
In short, should always use wireless on device but, cannot use those 
memory debug tools.


That's why those counters need..
and for much cheaper I can remove pgfrag_alloc_calls and pgfrag_free_calls.

Thanks,
Kyeongdon Kim

On 2017-09-01 오후 6:21, Michal Hocko wrote:

On Fri 01-09-17 12:12:36, Konstantin Khlebnikov wrote:
> IMHO that's too much counters.
> Per-node NR_FRAGMENT_PAGES should be enough for guessing what's 
going on.

> Perf probes provide enough features for further debugging.

I would tend to agree. Adding a counter based on a single debugging
instance sounds like an overkill to me. Counters should be pretty cheap
but this is way too specialized API to export to the userspace.

We have other interfaces to debug memory leaks like page_owner.
--
Michal Hocko
SUSE Labs




Re: Re: [PATCH] mm/vmstats: add counters for the page frag cache

2017-09-03 Thread Kyeongdon Kim

Thanks for your reply,
But I couldn't find "NR_FRAGMENT_PAGES" in linux-next.git .. is that 
vmstat counter? or others?


As you know, page_frag_alloc() directly calls __alloc_pages_nodemask() 
function,
so that makes too difficult to see memory usage in real time even though 
we have "/meminfo or /slabinfo.." information.
If there was a way already to figure out the memory leakage from 
page_frag_cache in mainline, I agree your opinion

but I think we don't have it now.

If those counters are too much in my patch,
I can say two values (pgfrag_alloc and pgfrag_free) are enough to guess 
what will happen

and would remove pgfrag_alloc_calls and pgfrag_free_calls.

Thanks,
Kyeongdon Kim

On 2017-09-01 오후 6:12, Konstantin Khlebnikov wrote:

IMHO that's too much counters.
Per-node NR_FRAGMENT_PAGES should be enough for guessing what's going on.
Perf probes provide enough features for further debugging.

On 01.09.2017 02:37, Kyeongdon Kim wrote:
> There was a memory leak problem when we did stressful test
> on Android device.
> The root cause of this was from page_frag_cache alloc
> and it was very hard to find out.
>
> We add to count the page frag allocation and free with function call.
> The gap between pgfrag_alloc and pgfrag_free is good to calculate
> for the amount of page.
> The gap between pgfrag_alloc_calls and pgfrag_free_calls is for
> sub-indicator.
> They can see trends of memory usage during the test.
> Without it, it's difficult to check page frag usage so I believe we
> should add it.
>
> Signed-off-by: Kyeongdon Kim <kyeongdon@lge.com>
> ---


Re: Re: [PATCH] mm/vmstats: add counters for the page frag cache

2017-09-03 Thread Kyeongdon Kim

Thanks for your reply,
But I couldn't find "NR_FRAGMENT_PAGES" in linux-next.git .. is that 
vmstat counter? or others?


As you know, page_frag_alloc() directly calls __alloc_pages_nodemask() 
function,
so that makes too difficult to see memory usage in real time even though 
we have "/meminfo or /slabinfo.." information.
If there was a way already to figure out the memory leakage from 
page_frag_cache in mainline, I agree your opinion

but I think we don't have it now.

If those counters are too much in my patch,
I can say two values (pgfrag_alloc and pgfrag_free) are enough to guess 
what will happen

and would remove pgfrag_alloc_calls and pgfrag_free_calls.

Thanks,
Kyeongdon Kim

On 2017-09-01 오후 6:12, Konstantin Khlebnikov wrote:

IMHO that's too much counters.
Per-node NR_FRAGMENT_PAGES should be enough for guessing what's going on.
Perf probes provide enough features for further debugging.

On 01.09.2017 02:37, Kyeongdon Kim wrote:
> There was a memory leak problem when we did stressful test
> on Android device.
> The root cause of this was from page_frag_cache alloc
> and it was very hard to find out.
>
> We add to count the page frag allocation and free with function call.
> The gap between pgfrag_alloc and pgfrag_free is good to calculate
> for the amount of page.
> The gap between pgfrag_alloc_calls and pgfrag_free_calls is for
> sub-indicator.
> They can see trends of memory usage during the test.
> Without it, it's difficult to check page frag usage so I believe we
> should add it.
>
> Signed-off-by: Kyeongdon Kim 
> ---


[PATCH] mm/vmstats: add counters for the page frag cache

2017-08-31 Thread Kyeongdon Kim
There was a memory leak problem when we did stressful test
on Android device.
The root cause of this was from page_frag_cache alloc
and it was very hard to find out.

We add to count the page frag allocation and free with function call.
The gap between pgfrag_alloc and pgfrag_free is good to calculate
for the amount of page.
The gap between pgfrag_alloc_calls and pgfrag_free_calls is for
sub-indicator.
They can see trends of memory usage during the test.
Without it, it's difficult to check page frag usage so I believe we
should add it.

Signed-off-by: Kyeongdon Kim <kyeongdon@lge.com>
---
 include/linux/vm_event_item.h | 4 
 mm/page_alloc.c   | 9 +++--
 mm/vmstat.c   | 4 
 3 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index d77bc35..75425d4 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -110,6 +110,10 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
SWAP_RA,
SWAP_RA_HIT,
 #endif
+   PGFRAG_ALLOC,
+   PGFRAG_FREE,
+   PGFRAG_ALLOC_CALLS,
+   PGFRAG_FREE_CALLS,
NR_VM_EVENT_ITEMS
 };
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index db2d25f..b3ddd76 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4322,6 +4322,7 @@ void __page_frag_cache_drain(struct page *page, unsigned 
int count)
free_hot_cold_page(page, false);
else
__free_pages_ok(page, order);
+   __count_vm_events(PGFRAG_FREE, 1 << order);
}
 }
 EXPORT_SYMBOL(__page_frag_cache_drain);
@@ -4338,7 +4339,7 @@ void *page_frag_alloc(struct page_frag_cache *nc,
page = __page_frag_cache_refill(nc, gfp_mask);
if (!page)
return NULL;
-
+   __count_vm_events(PGFRAG_ALLOC, 1 << compound_order(page));
 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
/* if size can vary use size else just use PAGE_SIZE */
size = nc->size;
@@ -4375,6 +4376,7 @@ void *page_frag_alloc(struct page_frag_cache *nc,
 
nc->pagecnt_bias--;
nc->offset = offset;
+   __count_vm_event(PGFRAG_ALLOC_CALLS);
 
return nc->va + offset;
 }
@@ -4387,8 +4389,11 @@ void page_frag_free(void *addr)
 {
struct page *page = virt_to_head_page(addr);
 
-   if (unlikely(put_page_testzero(page)))
+   if (unlikely(put_page_testzero(page))) {
+   __count_vm_events(PGFRAG_FREE, 1 << compound_order(page));
__free_pages_ok(page, compound_order(page));
+   }
+   __count_vm_event(PGFRAG_FREE_CALLS);
 }
 EXPORT_SYMBOL(page_frag_free);
 
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 4bb13e7..c00fe05 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1217,6 +1217,10 @@ const char * const vmstat_text[] = {
"swap_ra",
"swap_ra_hit",
 #endif
+   "pgfrag_alloc",
+   "pgfrag_free",
+   "pgfrag_alloc_calls",
+   "pgfrag_free_calls",
 #endif /* CONFIG_VM_EVENTS_COUNTERS */
 };
 #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */
-- 
2.6.2



[PATCH] mm/vmstats: add counters for the page frag cache

2017-08-31 Thread Kyeongdon Kim
There was a memory leak problem when we did stressful test
on Android device.
The root cause of this was from page_frag_cache alloc
and it was very hard to find out.

We add to count the page frag allocation and free with function call.
The gap between pgfrag_alloc and pgfrag_free is good to calculate
for the amount of page.
The gap between pgfrag_alloc_calls and pgfrag_free_calls is for
sub-indicator.
They can see trends of memory usage during the test.
Without it, it's difficult to check page frag usage so I believe we
should add it.

Signed-off-by: Kyeongdon Kim 
---
 include/linux/vm_event_item.h | 4 
 mm/page_alloc.c   | 9 +++--
 mm/vmstat.c   | 4 
 3 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index d77bc35..75425d4 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -110,6 +110,10 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
SWAP_RA,
SWAP_RA_HIT,
 #endif
+   PGFRAG_ALLOC,
+   PGFRAG_FREE,
+   PGFRAG_ALLOC_CALLS,
+   PGFRAG_FREE_CALLS,
NR_VM_EVENT_ITEMS
 };
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index db2d25f..b3ddd76 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4322,6 +4322,7 @@ void __page_frag_cache_drain(struct page *page, unsigned 
int count)
free_hot_cold_page(page, false);
else
__free_pages_ok(page, order);
+   __count_vm_events(PGFRAG_FREE, 1 << order);
}
 }
 EXPORT_SYMBOL(__page_frag_cache_drain);
@@ -4338,7 +4339,7 @@ void *page_frag_alloc(struct page_frag_cache *nc,
page = __page_frag_cache_refill(nc, gfp_mask);
if (!page)
return NULL;
-
+   __count_vm_events(PGFRAG_ALLOC, 1 << compound_order(page));
 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
/* if size can vary use size else just use PAGE_SIZE */
size = nc->size;
@@ -4375,6 +4376,7 @@ void *page_frag_alloc(struct page_frag_cache *nc,
 
nc->pagecnt_bias--;
nc->offset = offset;
+   __count_vm_event(PGFRAG_ALLOC_CALLS);
 
return nc->va + offset;
 }
@@ -4387,8 +4389,11 @@ void page_frag_free(void *addr)
 {
struct page *page = virt_to_head_page(addr);
 
-   if (unlikely(put_page_testzero(page)))
+   if (unlikely(put_page_testzero(page))) {
+   __count_vm_events(PGFRAG_FREE, 1 << compound_order(page));
__free_pages_ok(page, compound_order(page));
+   }
+   __count_vm_event(PGFRAG_FREE_CALLS);
 }
 EXPORT_SYMBOL(page_frag_free);
 
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 4bb13e7..c00fe05 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1217,6 +1217,10 @@ const char * const vmstat_text[] = {
"swap_ra",
"swap_ra_hit",
 #endif
+   "pgfrag_alloc",
+   "pgfrag_free",
+   "pgfrag_alloc_calls",
+   "pgfrag_free_calls",
 #endif /* CONFIG_VM_EVENTS_COUNTERS */
 };
 #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */
-- 
2.6.2



Re: Re: [PATCH] zram: revive swap_slot_free_notify

2016-03-31 Thread Kyeongdon Kim
sis->bdev,
>> - offset);
>> - }
>> - }
>> - }
>> -
>> + swap_slot_free_notify(page);
>> out:
>> unlock_page(page);
>> bio_put(bio);
>> @@ -347,6 +353,7 @@ int swap_readpage(struct page *page)
>>
>> ret = bdev_read_page(sis->bdev, swap_page_sector(page), page);
>> if (!ret) {
>> + swap_slot_free_notify(page);
>> count_vm_event(PSWPIN);
>> return 0;
>> }
>> --
>> 1.9.1
>>
> 
> Kyeongdon, Could you try this patch?
> 
> From 09497ba48f5621cd2737f72ac4d15e6b3e5e3d14 Mon Sep 17 00:00:00 2001
> From: Minchan Kim <minc...@kernel.org>
> Date: Mon, 28 Mar 2016 23:08:14 +0900
> Subject: [PATCH] mm: call swap_slot_free_notify with holding page lock
> 
> Kyeongdon reported below error which is BUG_ON(!PageSwapCache(page))
> in page_swap_info.
> The reason is that page_endio in rw_page unlocks the page if read I/O
> is completed so we need to hold a PG_lock again to check PageSwapCache.
> Otherwise, the page can be removed from swapcache and trigger below
> BUG_ON.
> 
> [27833.995833] [ cut here ]
> [27833.995853] Kernel BUG at c00f9040 [verbose debug info unavailable]
> [27833.995865] Internal error: Oops - BUG: 0 [#1] PREEMPT SMP ARM
> [27833.995876] Modules linked in:
> [27833.995892] CPU: 4 PID: 13446 Comm: RenderThread Tainted: G W
> 3.10.84-g9f14aec-dirty #73
> [27833.995903] task: c3b73200 ti: dd192000 task.ti: dd192000
> [27833.995922] PC is at page_swap_info+0x10/0x2c
> [27833.995934] LR is at swap_slot_free_notify+0x18/0x6c
> [27833.995946] pc : [] lr : [] psr: 400f0113
> [27833.995946] sp : dd193d78 ip : c2deb1e4 fp : da015180
> [27833.995959] r10:  r9 : 000200da r8 : c120fe08
> [27833.995968] r7 :  r6 :  r5 : c249a6c0 r4 : c249a6c0
> [27833.995979] r3 :  r2 : 40080009 r1 : 200f0113 r0 : c249a6c0
> ..
> [27833.996273] [] (page_swap_info+0x10/0x2c) from []
> (swap_slot_free_notify+0x18/0x6c)
> [27833.996288] [] (swap_slot_free_notify+0x18/0x6c) from
> [] (swap_readpage+0x90/0x11c)
> [27833.996302] [] (swap_readpage+0x90/0x11c) from []
> (read_swap_cache_async+0x134/0x1ac)
> [27833.996317] [] (read_swap_cache_async+0x134/0x1ac) from
> [] (swapin_readahead+0x70/0xb0)
> [27833.996334] [] (swapin_readahead+0x70/0xb0) from =
> []
> (handle_pte_fault+0x320/0x6fc)
> [27833.996348] [] (handle_pte_fault+0x320/0x6fc) from
> [] (handle_mm_fault+0xc0/0xf0)
> [27833.996363] [] (handle_mm_fault+0xc0/0xf0) from =
> []
> (do_page_fault+0x11c/0x36c)
> [27833.996378] [] (do_page_fault+0x11c/0x36c) from =
> []
> (do_DataAbort+0x34/0x118)
> [27833.996392] [] (do_DataAbort+0x34/0x118) from []
> (__dabt_usr+0x34/0x40)
> 
> Reported-by: Kyeongdon Kim <kyeongdon@lge.com>
> Signed-off-by: Minchan Kim <minc...@kernel.org>

I tested this patch for 48 hours and can't reproduce the problem.

Tested-by: Kyeongdon Kim <kyeongdon@lge.com>

Thanks.


Re: Re: [PATCH] zram: revive swap_slot_free_notify

2016-03-31 Thread Kyeongdon Kim
>> - }
>> - }
>> - }
>> -
>> + swap_slot_free_notify(page);
>> out:
>> unlock_page(page);
>> bio_put(bio);
>> @@ -347,6 +353,7 @@ int swap_readpage(struct page *page)
>>
>> ret = bdev_read_page(sis->bdev, swap_page_sector(page), page);
>> if (!ret) {
>> + swap_slot_free_notify(page);
>> count_vm_event(PSWPIN);
>> return 0;
>> }
>> --
>> 1.9.1
>>
> 
> Kyeongdon, Could you try this patch?
> 
> From 09497ba48f5621cd2737f72ac4d15e6b3e5e3d14 Mon Sep 17 00:00:00 2001
> From: Minchan Kim 
> Date: Mon, 28 Mar 2016 23:08:14 +0900
> Subject: [PATCH] mm: call swap_slot_free_notify with holding page lock
> 
> Kyeongdon reported below error which is BUG_ON(!PageSwapCache(page))
> in page_swap_info.
> The reason is that page_endio in rw_page unlocks the page if read I/O
> is completed so we need to hold a PG_lock again to check PageSwapCache.
> Otherwise, the page can be removed from swapcache and trigger below
> BUG_ON.
> 
> [27833.995833] [ cut here ]
> [27833.995853] Kernel BUG at c00f9040 [verbose debug info unavailable]
> [27833.995865] Internal error: Oops - BUG: 0 [#1] PREEMPT SMP ARM
> [27833.995876] Modules linked in:
> [27833.995892] CPU: 4 PID: 13446 Comm: RenderThread Tainted: G W
> 3.10.84-g9f14aec-dirty #73
> [27833.995903] task: c3b73200 ti: dd192000 task.ti: dd192000
> [27833.995922] PC is at page_swap_info+0x10/0x2c
> [27833.995934] LR is at swap_slot_free_notify+0x18/0x6c
> [27833.995946] pc : [] lr : [] psr: 400f0113
> [27833.995946] sp : dd193d78 ip : c2deb1e4 fp : da015180
> [27833.995959] r10:  r9 : 000200da r8 : c120fe08
> [27833.995968] r7 :  r6 :  r5 : c249a6c0 r4 : c249a6c0
> [27833.995979] r3 :  r2 : 40080009 r1 : 200f0113 r0 : c249a6c0
> ..
> [27833.996273] [] (page_swap_info+0x10/0x2c) from []
> (swap_slot_free_notify+0x18/0x6c)
> [27833.996288] [] (swap_slot_free_notify+0x18/0x6c) from
> [] (swap_readpage+0x90/0x11c)
> [27833.996302] [] (swap_readpage+0x90/0x11c) from []
> (read_swap_cache_async+0x134/0x1ac)
> [27833.996317] [] (read_swap_cache_async+0x134/0x1ac) from
> [] (swapin_readahead+0x70/0xb0)
> [27833.996334] [] (swapin_readahead+0x70/0xb0) from =
> []
> (handle_pte_fault+0x320/0x6fc)
> [27833.996348] [] (handle_pte_fault+0x320/0x6fc) from
> [] (handle_mm_fault+0xc0/0xf0)
> [27833.996363] [] (handle_mm_fault+0xc0/0xf0) from =
> []
> (do_page_fault+0x11c/0x36c)
> [27833.996378] [] (do_page_fault+0x11c/0x36c) from =
> []
> (do_DataAbort+0x34/0x118)
> [27833.996392] [] (do_DataAbort+0x34/0x118) from []
> (__dabt_usr+0x34/0x40)
> 
> Reported-by: Kyeongdon Kim 
> Signed-off-by: Minchan Kim 

I tested this patch for 48 hours and can't reproduce the problem.

Tested-by: Kyeongdon Kim 

Thanks.


Re: Re: [PATCH v4 2/2] zram: try vmalloc() after kmalloc()

2015-12-21 Thread Kyeongdon Kim

On 2015-12-18 오전 9:28, Sergey Senozhatsky wrote:
> On (12/01/15 21:36), Sergey Senozhatsky wrote:
>> When we're using LZ4 multi compression streams for zram swap, we found
> out
>> page allocation failure message in system running test. That was not only
>> once, but a few(2 - 5 times per test). Also, some failure cases were
>> continually occurring to try allocation order 3.
>>
>> In order to make parallel compression private data, we should call
>> kzalloc() with order 2/3 in runtime(lzo/lz4). But if there is no order
>> 2/3 size memory to allocate in that time, page allocation fails. This
>> patch makes to use vmalloc() as fallback of kmalloc(), this prevents page
>> alloc failure warning.
>>
>> After using this, we never found warning message in running test, also It
>> could reduce process startup latency about 60-120ms in each case.
>>
> 
> Hello Kyeongdon,
> 
> just to make sure, the patch works fine for you and we can move
> forward and Cc -stable. correct?
> 
> -ss
> 
Hello Sergey,

I was on vacation so I checked your email a moment ago, sorry about it.
We're using this patch set. and we couldn't find any issue from this one
until now.

Thanks,
Kyeongdon Kim

>> For reference a call trace :
>>
>> Binder_1: page allocation failure: order:3, mode:0x10c0d0
>> CPU: 0 PID: 424 Comm: Binder_1 Tainted: GW 3.10.49-perf-g991d02b-dirty
> #20
>> Call trace:
>> [] dump_backtrace+0x0/0x270
>> [] show_stack+0x10/0x1c
>> [] dump_stack+0x1c/0x28
>> [] warn_alloc_failed+0xfc/0x11c
>> [] __alloc_pages_nodemask+0x724/0x7f0
>> [] __get_free_pages+0x14/0x5c
>> [] kmalloc_order_trace+0x38/0xd8
>> [] zcomp_lz4_create+0x2c/0x38
>> [] zcomp_strm_alloc+0x34/0x78
>> [] zcomp_strm_multi_find+0x124/0x1ec
>> [] zcomp_strm_find+0xc/0x18
>> [] zram_bvec_rw+0x2fc/0x780
>> [] zram_make_request+0x25c/0x2d4
>> [] generic_make_request+0x80/0xbc
>> [] submit_bio+0xa4/0x15c
>> [] __swap_writepage+0x218/0x230
>> [] swap_writepage+0x3c/0x4c
>> [] shrink_page_list+0x51c/0x8d0
>> [] shrink_inactive_list+0x3f8/0x60c
>> [] shrink_lruvec+0x33c/0x4cc
>> [] shrink_zone+0x3c/0x100
>> [] try_to_free_pages+0x2b8/0x54c
>> [] __alloc_pages_nodemask+0x514/0x7f0
>> [] __get_free_pages+0x14/0x5c
>> [] proc_info_read+0x50/0xe4
>> [] vfs_read+0xa0/0x12c
>> [] SyS_read+0x44/0x74
>> DMA: 3397*4kB (MC) 26*8kB (RC) 0*16kB 0*32kB 0*64kB 0*128kB 0*256kB
>> 0*512kB 0*1024kB 0*2048kB 0*4096kB = 13796kB
>>
>> [minc...@kernel.org: change vmalloc gfp and adding comment about gfp]
>> [sergey.senozhat...@gmail.com: tweak comments and styles]
>> Signed-off-by: Kyeongdon Kim 
>> Signed-off-by: Minchan Kim 
>> Acked-by: Sergey Senozhatsky 
>> ---
>> drivers/block/zram/zcomp_lz4.c | 23 +--
>> drivers/block/zram/zcomp_lzo.c | 23 +--
>> 2 files changed, 42 insertions(+), 4 deletions(-)
>>
>> diff --git a/drivers/block/zram/zcomp_lz4.c
> b/drivers/block/zram/zcomp_lz4.c
>> index ee44b51..dd60831 100644
>> --- a/drivers/block/zram/zcomp_lz4.c
>> +++ b/drivers/block/zram/zcomp_lz4.c
>> @@ -10,17 +10,36 @@
>> #include 
>> #include 
>> #include 
>> +#include 
>> +#include 
>>
>> #include "zcomp_lz4.h"
>>
>> static void *zcomp_lz4_create(void)
>> {
>> - return kzalloc(LZ4_MEM_COMPRESS, GFP_NOIO);
>> + void *ret;
>> +
>> + /*
>> + * This function can be called in swapout/fs write path
>> + * so we can't use GFP_FS|IO. And it assumes we already
>> + * have at least one stream in zram initialization so we
>> + * don't do best effort to allocate more stream in here.
>> + * A default stream will work well without further multiple
>> + * streams. That's why we use NORETRY | NOWARN.
>> + */
>> + ret = kzalloc(LZ4_MEM_COMPRESS, GFP_NOIO | __GFP_NORETRY |
>> + __GFP_NOWARN);
>> + if (!ret)
>> + ret = __vmalloc(LZ4_MEM_COMPRESS,
>> + GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN |
>> + __GFP_ZERO | __GFP_HIGHMEM,
>> + PAGE_KERNEL);
>> + return ret;
>> }
>>
>> static void zcomp_lz4_destroy(void *private)
>> {
>> - kfree(private);
>> + kvfree(private);
>> }
>>
>> static int zcomp_lz4_compress(const unsigned char *src, unsigned char
> *dst,
>> diff --git a/drivers/block/zram/zcomp_lzo.c
> b/drivers/block/zram/zcomp_lzo.c
>> index 683ce04..edc5499 100644
>> --- a/drivers/block/zram/zcomp_lzo.c
>> +++ b/drivers/block/zram/zcomp_lzo.c
>> @@ -10,17 +10,36 @@
>> #

Re: Re: [PATCH v4 2/2] zram: try vmalloc() after kmalloc()

2015-12-21 Thread Kyeongdon Kim

On 2015-12-18 오전 9:28, Sergey Senozhatsky wrote:
> On (12/01/15 21:36), Sergey Senozhatsky wrote:
>> When we're using LZ4 multi compression streams for zram swap, we found
> out
>> page allocation failure message in system running test. That was not only
>> once, but a few(2 - 5 times per test). Also, some failure cases were
>> continually occurring to try allocation order 3.
>>
>> In order to make parallel compression private data, we should call
>> kzalloc() with order 2/3 in runtime(lzo/lz4). But if there is no order
>> 2/3 size memory to allocate in that time, page allocation fails. This
>> patch makes to use vmalloc() as fallback of kmalloc(), this prevents page
>> alloc failure warning.
>>
>> After using this, we never found warning message in running test, also It
>> could reduce process startup latency about 60-120ms in each case.
>>
> 
> Hello Kyeongdon,
> 
> just to make sure, the patch works fine for you and we can move
> forward and Cc -stable. correct?
> 
> -ss
> 
Hello Sergey,

I was on vacation so I checked your email a moment ago, sorry about it.
We're using this patch set. and we couldn't find any issue from this one
until now.

Thanks,
Kyeongdon Kim

>> For reference a call trace :
>>
>> Binder_1: page allocation failure: order:3, mode:0x10c0d0
>> CPU: 0 PID: 424 Comm: Binder_1 Tainted: GW 3.10.49-perf-g991d02b-dirty
> #20
>> Call trace:
>> [] dump_backtrace+0x0/0x270
>> [] show_stack+0x10/0x1c
>> [] dump_stack+0x1c/0x28
>> [] warn_alloc_failed+0xfc/0x11c
>> [] __alloc_pages_nodemask+0x724/0x7f0
>> [] __get_free_pages+0x14/0x5c
>> [] kmalloc_order_trace+0x38/0xd8
>> [] zcomp_lz4_create+0x2c/0x38
>> [] zcomp_strm_alloc+0x34/0x78
>> [] zcomp_strm_multi_find+0x124/0x1ec
>> [] zcomp_strm_find+0xc/0x18
>> [] zram_bvec_rw+0x2fc/0x780
>> [] zram_make_request+0x25c/0x2d4
>> [] generic_make_request+0x80/0xbc
>> [] submit_bio+0xa4/0x15c
>> [] __swap_writepage+0x218/0x230
>> [] swap_writepage+0x3c/0x4c
>> [] shrink_page_list+0x51c/0x8d0
>> [] shrink_inactive_list+0x3f8/0x60c
>> [] shrink_lruvec+0x33c/0x4cc
>> [] shrink_zone+0x3c/0x100
>> [] try_to_free_pages+0x2b8/0x54c
>> [] __alloc_pages_nodemask+0x514/0x7f0
>> [] __get_free_pages+0x14/0x5c
>> [] proc_info_read+0x50/0xe4
>> [] vfs_read+0xa0/0x12c
>> [] SyS_read+0x44/0x74
>> DMA: 3397*4kB (MC) 26*8kB (RC) 0*16kB 0*32kB 0*64kB 0*128kB 0*256kB
>> 0*512kB 0*1024kB 0*2048kB 0*4096kB = 13796kB
>>
>> [minc...@kernel.org: change vmalloc gfp and adding comment about gfp]
>> [sergey.senozhat...@gmail.com: tweak comments and styles]
>> Signed-off-by: Kyeongdon Kim <kyeongdon@lge.com>
>> Signed-off-by: Minchan Kim <minc...@kernel.org>
>> Acked-by: Sergey Senozhatsky <sergey.senozhat...@gmail.com>
>> ---
>> drivers/block/zram/zcomp_lz4.c | 23 +--
>> drivers/block/zram/zcomp_lzo.c | 23 +--
>> 2 files changed, 42 insertions(+), 4 deletions(-)
>>
>> diff --git a/drivers/block/zram/zcomp_lz4.c
> b/drivers/block/zram/zcomp_lz4.c
>> index ee44b51..dd60831 100644
>> --- a/drivers/block/zram/zcomp_lz4.c
>> +++ b/drivers/block/zram/zcomp_lz4.c
>> @@ -10,17 +10,36 @@
>> #include 
>> #include 
>> #include 
>> +#include 
>> +#include 
>>
>> #include "zcomp_lz4.h"
>>
>> static void *zcomp_lz4_create(void)
>> {
>> - return kzalloc(LZ4_MEM_COMPRESS, GFP_NOIO);
>> + void *ret;
>> +
>> + /*
>> + * This function can be called in swapout/fs write path
>> + * so we can't use GFP_FS|IO. And it assumes we already
>> + * have at least one stream in zram initialization so we
>> + * don't do best effort to allocate more stream in here.
>> + * A default stream will work well without further multiple
>> + * streams. That's why we use NORETRY | NOWARN.
>> + */
>> + ret = kzalloc(LZ4_MEM_COMPRESS, GFP_NOIO | __GFP_NORETRY |
>> + __GFP_NOWARN);
>> + if (!ret)
>> + ret = __vmalloc(LZ4_MEM_COMPRESS,
>> + GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN |
>> + __GFP_ZERO | __GFP_HIGHMEM,
>> + PAGE_KERNEL);
>> + return ret;
>> }
>>
>> static void zcomp_lz4_destroy(void *private)
>> {
>> - kfree(private);
>> + kvfree(private);
>> }
>>
>> static int zcomp_lz4_compress(const unsigned char *src, unsigned char
> *dst,
>> diff --git a/drivers/block/zram/zcomp_lzo.c
> b/drivers/block/zram/zcomp_lzo.c
>> index 683ce04..edc5499 100644
>> --- a/drivers/block/zram/zcomp

Re: Re: [PATCH v3 2/2] zram: try vmalloc() after kmalloc()

2015-11-30 Thread Kyeongdon Kim

On 2015-12-01 오후 2:16, Sergey Senozhatsky wrote:
> On (12/01/15 13:55), Minchan Kim wrote:
> [..]
>> To clear my opinion,
>>
>> lzo_create(gfp_t flags)
>> {
>> void * ret = kmalloc(LZO1X_MEM_COMPRESS, flags);
>> if (!ret)
>> ret = vmalloc(LZO1X_MEM_COMPRESS, flasgs | GFP_NOMEMALLOC);
>> return ret;
>> }
> 
> ah, ok, I see. I've a question.
> 
> we had
> kmalloc(f | __GFP_NOMEMALLOC)
> __vmalloc(f | __GFP_NOMEMALLOC)
> 
> which produced high failure rates for both kmalloc() and __vmalloc()
> 
> test #1
> 
>> > > log message :
> [..]
>> > > [ 352.230608][0] zcomp_lz4_create: 32: ret = (null)
>> > > [ 352.230619][0] zcomp_lz4_create: 38: ret = (null)
>> > > [ 352.230888][0] zcomp_lz4_create: 32: ret = (null)
>> > > [ 352.230902][0] zcomp_lz4_create: 38: ret = (null)
>> > > [ 352.231406][0] zcomp_lz4_create: 32: ret = ffc002088000
>> > > [ 352.234024][0] zcomp_lz4_create: 32: ret = (null)
>> > > [ 352.234060][0] zcomp_lz4_create: 38: ret = (null)
>> > > [ 352.234359][0] zcomp_lz4_create: 32: ret = (null)
> [..]
>> > > [ 352.234384][0] zcomp_lz4_create: 38: ret = (null)
>> > > [ 352.234618][0] zcomp_lz4_create: 32: ret = (null)
>> > > [ 352.234639][0] zcomp_lz4_create: 38: ret = (null)
>> > > [ 352.234667][0] zcomp_lz4_create: 32: ret = (null)
>> > > [ 352.235179][0] zcomp_lz4_create: 38: ret = ff80016a4000
> 
> 
> 
> Kyeongdon, do I understand correctly, that for the second test you
> removed '__GFP_NOMEMALLOC' from both kmalloc() and __vmalloc()?
> 
> iow:
> kmalloc(f & ~__GFP_NOMEMALLOC)
> vmalloc(f & ~__GFP_NOMEMALLOC)
> 
> test #2 : almost always failing kmalloc() and !NULL __vmalloc()
> 
>> > > log message :
>> > > <4>[ 2288.954934][0] KDKIM: zcomp_lz4_create: 24: ret = (null)
>> > > <4>[ 2288.954972][0] KDKIM: zcomp_lz4_create: 30: ret =
> ff800287e000
>> > > 
>> > > <4>[ 2289.092411][0] KDKIM: zcomp_lz4_create: 24: ret = (null)
>> > > <4>[ 2289.092546][0] KDKIM: zcomp_lz4_create: 30: ret =
> ff80028b5000
>> > > 
>> > > <4>[ 2289.135628][0] KDKIM: zcomp_lz4_create: 24: ret = (null)
>> > > <4>[ 2289.135642][0] KDKIM: zcomp_lz4_create: 24: ret = (null)
>> > > <4>[ 2289.135729][0] KDKIM: zcomp_lz4_create: 30: ret =
> ff80028be000
>> > > <4>[ 2289.135732][0] KDKIM: zcomp_lz4_create: 30: ret =
> ff80028c7000
> 
> 
> if this is the case (__GFP_NOMEMALLOC removed from both kmalloc and
> __vmalloc),
> then proposed
> 
> kmalloc(f & ~__GFP_NOMEMALLOC)
> __vmalloc(f | __GFP_NOMEMALLOC)
> 
> 
> can be very close to 'test #1 && test #2':
> 
> kmalloc() fails (as in test #2)
> __vmalloc() fails (as in test #1)
> 
> isn't it?
> 
> -ss

Let me give you a simple code of it.

@test #1 (previous shared log)
 kmalloc(f | __GFP_NOMEMALLOC)
  __vmalloc(f | __GFP_NOMEMALLOC)
// can find failure both

@test #2 (previous shared log)
 kmalloc(f | __GFP_NOMEMALLOC)
__vmalloc(f)
// removed '__GFP_NOMEMALLOC' from vmalloc() only, and cannot find
failure from vmalloc()

And like you said, I made a quick check to see a failure about kmalloc()
without the flag :

@test #3
 kmalloc(f)
__vmalloc(f | __GFP_NOMEMALLOC)
// removed '__GFP_NOMEMALLOC' from kmalloc() only
// and cannot find a failure from kmalloc(); but in this case, it's hard
to find a failure from vmalloc() because allocations mostly come from
zsmalloc() already

log message (test #3) :
<4>[  186.763605][1] KDKIM: zcomp_lz4_create: 24: ret = ffc00203
<4>[  186.776652][1] KDKIM: zcomp_lz4_create: 24: ret = ffc0020f
<4>[  186.811423][1] KDKIM: zcomp_lz4_create: 24: ret = ffc002108000
<4>[  186.816744][1] KDKIM: zcomp_lz4_create: 24: ret = ffc00200
<4>[  186.816796][1] KDKIM: zcomp_lz4_create: 24: ret = ffc002008000

@test #4
 kmalloc(f)
__vmalloc(f)
// cannot find failure both until now

log message (test #4) :
<4>[  641.440468][7] KDKIM: zcomp_lz4_create: 24: ret = ffc00219

<4>[  922.182980][7] KDKIM: zcomp_lz4_create: 24: ret = ffc002208000

<4>[  923.197593][7] KDKIM: zcomp_lz4_create: 24: ret = ffc00202

<4>[  939.813499][7] KDKIM: zcomp_lz4_create: 24: ret = ffc0020a

So, is there another problem if we remove the flag from both sides?

Thanks,
Kyeongdon Kim
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: Re: [PATCH v3 2/2] zram: try vmalloc() after kmalloc()

2015-11-30 Thread Kyeongdon Kim

On 2015-12-01 오후 2:16, Sergey Senozhatsky wrote:
> On (12/01/15 13:55), Minchan Kim wrote:
> [..]
>> To clear my opinion,
>>
>> lzo_create(gfp_t flags)
>> {
>> void * ret = kmalloc(LZO1X_MEM_COMPRESS, flags);
>> if (!ret)
>> ret = vmalloc(LZO1X_MEM_COMPRESS, flasgs | GFP_NOMEMALLOC);
>> return ret;
>> }
> 
> ah, ok, I see. I've a question.
> 
> we had
> kmalloc(f | __GFP_NOMEMALLOC)
> __vmalloc(f | __GFP_NOMEMALLOC)
> 
> which produced high failure rates for both kmalloc() and __vmalloc()
> 
> test #1
> 
>> > > log message :
> [..]
>> > > [ 352.230608][0] zcomp_lz4_create: 32: ret = (null)
>> > > [ 352.230619][0] zcomp_lz4_create: 38: ret = (null)
>> > > [ 352.230888][0] zcomp_lz4_create: 32: ret = (null)
>> > > [ 352.230902][0] zcomp_lz4_create: 38: ret = (null)
>> > > [ 352.231406][0] zcomp_lz4_create: 32: ret = ffc002088000
>> > > [ 352.234024][0] zcomp_lz4_create: 32: ret = (null)
>> > > [ 352.234060][0] zcomp_lz4_create: 38: ret = (null)
>> > > [ 352.234359][0] zcomp_lz4_create: 32: ret = (null)
> [..]
>> > > [ 352.234384][0] zcomp_lz4_create: 38: ret = (null)
>> > > [ 352.234618][0] zcomp_lz4_create: 32: ret = (null)
>> > > [ 352.234639][0] zcomp_lz4_create: 38: ret = (null)
>> > > [ 352.234667][0] zcomp_lz4_create: 32: ret = (null)
>> > > [ 352.235179][0] zcomp_lz4_create: 38: ret = ff80016a4000
> 
> 
> 
> Kyeongdon, do I understand correctly, that for the second test you
> removed '__GFP_NOMEMALLOC' from both kmalloc() and __vmalloc()?
> 
> iow:
> kmalloc(f & ~__GFP_NOMEMALLOC)
> vmalloc(f & ~__GFP_NOMEMALLOC)
> 
> test #2 : almost always failing kmalloc() and !NULL __vmalloc()
> 
>> > > log message :
>> > > <4>[ 2288.954934][0] KDKIM: zcomp_lz4_create: 24: ret = (null)
>> > > <4>[ 2288.954972][0] KDKIM: zcomp_lz4_create: 30: ret =
> ff800287e000
>> > > 
>> > > <4>[ 2289.092411][0] KDKIM: zcomp_lz4_create: 24: ret = (null)
>> > > <4>[ 2289.092546][0] KDKIM: zcomp_lz4_create: 30: ret =
> ff80028b5000
>> > > 
>> > > <4>[ 2289.135628][0] KDKIM: zcomp_lz4_create: 24: ret = (null)
>> > > <4>[ 2289.135642][0] KDKIM: zcomp_lz4_create: 24: ret = (null)
>> > > <4>[ 2289.135729][0] KDKIM: zcomp_lz4_create: 30: ret =
> ff80028be000
>> > > <4>[ 2289.135732][0] KDKIM: zcomp_lz4_create: 30: ret =
> ff80028c7000
> 
> 
> if this is the case (__GFP_NOMEMALLOC removed from both kmalloc and
> __vmalloc),
> then proposed
> 
> kmalloc(f & ~__GFP_NOMEMALLOC)
> __vmalloc(f | __GFP_NOMEMALLOC)
> 
> 
> can be very close to 'test #1 && test #2':
> 
> kmalloc() fails (as in test #2)
> __vmalloc() fails (as in test #1)
> 
> isn't it?
> 
> -ss

Let me give you a simple code of it.

@test #1 (previous shared log)
 kmalloc(f | __GFP_NOMEMALLOC)
  __vmalloc(f | __GFP_NOMEMALLOC)
// can find failure both

@test #2 (previous shared log)
 kmalloc(f | __GFP_NOMEMALLOC)
__vmalloc(f)
// removed '__GFP_NOMEMALLOC' from vmalloc() only, and cannot find
failure from vmalloc()

And like you said, I made a quick check to see a failure about kmalloc()
without the flag :

@test #3
 kmalloc(f)
__vmalloc(f | __GFP_NOMEMALLOC)
// removed '__GFP_NOMEMALLOC' from kmalloc() only
// and cannot find a failure from kmalloc(); but in this case, it's hard
to find a failure from vmalloc() because allocations mostly come from
zsmalloc() already

log message (test #3) :
<4>[  186.763605][1] KDKIM: zcomp_lz4_create: 24: ret = ffc00203
<4>[  186.776652][1] KDKIM: zcomp_lz4_create: 24: ret = ffc0020f
<4>[  186.811423][1] KDKIM: zcomp_lz4_create: 24: ret = ffc002108000
<4>[  186.816744][1] KDKIM: zcomp_lz4_create: 24: ret = ffc00200
<4>[  186.816796][1] KDKIM: zcomp_lz4_create: 24: ret = ffc002008000

@test #4
 kmalloc(f)
__vmalloc(f)
// cannot find failure both until now

log message (test #4) :
<4>[  641.440468][7] KDKIM: zcomp_lz4_create: 24: ret = ffc00219

<4>[  922.182980][7] KDKIM: zcomp_lz4_create: 24: ret = ffc002208000

<4>[  923.197593][7] KDKIM: zcomp_lz4_create: 24: ret = ffc00202

<4>[  939.813499][7] KDKIM: zcomp_lz4_create: 24: ret = ffc0020a

So, is there another problem if we remove the flag from both sides?

Thanks,
Kyeongdon Kim
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: Re: [PATCH v3] zram: try vmalloc() after kmalloc()

2015-11-24 Thread Kyeongdon Kim
On 2015-11-24 오전 8:28, Minchan Kim wrote:
> Hello Andrew,
> 
> On Mon, Nov 23, 2015 at 02:52:26PM -0800, Andrew Morton wrote:
>> On Mon, 23 Nov 2015 15:21:15 +0900 Kyeongdon Kim
>  wrote:
>>
>> > When we're using LZ4 multi compression streams for zram swap,
>> > we found out page allocation failure message in system running test.
>> > That was not only once, but a few(2 - 5 times per test).
>> > Also, some failure cases were continually occurring to try allocation
>> > order 3.
>> >
>> > In order to make parallel compression private data, we should call
>> > kzalloc() with order 2/3 in runtime(lzo/lz4). But if there is no order
>> > 2/3 size memory to allocate in that time, page allocation fails.
>> > This patch makes to use vmalloc() as fallback of kmalloc(), this
>> > prevents page alloc failure warning.
>> >
>> > After using this, we never found warning message in running test, also
>> > It could reduce process startup latency about 60-120ms in each case.
>> >
>> > ...
>> >
>> > --- a/drivers/block/zram/zcomp_lz4.c
>> > +++ b/drivers/block/zram/zcomp_lz4.c
>> > @@ -10,17 +10,25 @@
>> > #include 
>> > #include 
>> > #include 
>> > +#include 
>> > +#include 
>> >
>> > #include "zcomp_lz4.h"
>> >
>> > static void *zcomp_lz4_create(void)
>> > {
>> > - return kzalloc(LZ4_MEM_COMPRESS, GFP_KERNEL);
>> > + void *ret;
>> > +
>> > + ret = kzalloc(LZ4_MEM_COMPRESS,
>> > + __GFP_NORETRY|__GFP_NOWARN|__GFP_NOMEMALLOC);
>> > + if (!ret)
>> > + ret = vzalloc(LZ4_MEM_COMPRESS);
>> > + return ret;
>> > }
>>
>> What's the reasoning behind the modification to the gfp flags?
>>
>> It clears __GFP_FS, __GFP_IO and even __GFP_WAIT. I suspect the latter
>> two (at least) can be retained. And given that vmalloc() uses
> 
> This function is used in swapout and fs write path so we couldn't use
> those flags.
> 
>> GFP_KERNEL, what's the point in clearing those flags for the kmalloc()
>> case?
> 
> When I reviewed this patch, I wanted to fix it with another patch
> because we should handle another places in zcomp and Sergey sent it
> today. But I think Sergey's patch is stable material so I hope
> Kyeongdon resend this patch with fixing vmalloc part.
> 
>>
>> If this change (or something like it) remains in place, it should have
>> a comment which fully explains the reasons, please.
> 
> Kyeongdon, Could you resend this patch with fixing vzalloc part and
> adding comment?
> 
Sorry for the delay in replying,
I just checked your comments and patch set.

[PATCH 1/3] zram/zcomp: use GFP_NOIO to allocate stream
[PATCH 2/3] zram: try vmalloc() after kmalloc()
[RFC 3/3] zram: pass gfp from zcomp frontend to backend

First of all, thanks for your summary and organized code set, and
If Sergey agrees with that, I have no question about the patch 2/3.

>>
> 
> -- 
> Kind regards,
> Minchan Kim

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: Re: [PATCH v3] zram: try vmalloc() after kmalloc()

2015-11-24 Thread Kyeongdon Kim
On 2015-11-24 오전 8:28, Minchan Kim wrote:
> Hello Andrew,
> 
> On Mon, Nov 23, 2015 at 02:52:26PM -0800, Andrew Morton wrote:
>> On Mon, 23 Nov 2015 15:21:15 +0900 Kyeongdon Kim
> <kyeongdon@lge.com> wrote:
>>
>> > When we're using LZ4 multi compression streams for zram swap,
>> > we found out page allocation failure message in system running test.
>> > That was not only once, but a few(2 - 5 times per test).
>> > Also, some failure cases were continually occurring to try allocation
>> > order 3.
>> >
>> > In order to make parallel compression private data, we should call
>> > kzalloc() with order 2/3 in runtime(lzo/lz4). But if there is no order
>> > 2/3 size memory to allocate in that time, page allocation fails.
>> > This patch makes to use vmalloc() as fallback of kmalloc(), this
>> > prevents page alloc failure warning.
>> >
>> > After using this, we never found warning message in running test, also
>> > It could reduce process startup latency about 60-120ms in each case.
>> >
>> > ...
>> >
>> > --- a/drivers/block/zram/zcomp_lz4.c
>> > +++ b/drivers/block/zram/zcomp_lz4.c
>> > @@ -10,17 +10,25 @@
>> > #include 
>> > #include 
>> > #include 
>> > +#include 
>> > +#include 
>> >
>> > #include "zcomp_lz4.h"
>> >
>> > static void *zcomp_lz4_create(void)
>> > {
>> > - return kzalloc(LZ4_MEM_COMPRESS, GFP_KERNEL);
>> > + void *ret;
>> > +
>> > + ret = kzalloc(LZ4_MEM_COMPRESS,
>> > + __GFP_NORETRY|__GFP_NOWARN|__GFP_NOMEMALLOC);
>> > + if (!ret)
>> > + ret = vzalloc(LZ4_MEM_COMPRESS);
>> > + return ret;
>> > }
>>
>> What's the reasoning behind the modification to the gfp flags?
>>
>> It clears __GFP_FS, __GFP_IO and even __GFP_WAIT. I suspect the latter
>> two (at least) can be retained. And given that vmalloc() uses
> 
> This function is used in swapout and fs write path so we couldn't use
> those flags.
> 
>> GFP_KERNEL, what's the point in clearing those flags for the kmalloc()
>> case?
> 
> When I reviewed this patch, I wanted to fix it with another patch
> because we should handle another places in zcomp and Sergey sent it
> today. But I think Sergey's patch is stable material so I hope
> Kyeongdon resend this patch with fixing vmalloc part.
> 
>>
>> If this change (or something like it) remains in place, it should have
>> a comment which fully explains the reasons, please.
> 
> Kyeongdon, Could you resend this patch with fixing vzalloc part and
> adding comment?
> 
Sorry for the delay in replying,
I just checked your comments and patch set.

[PATCH 1/3] zram/zcomp: use GFP_NOIO to allocate stream
[PATCH 2/3] zram: try vmalloc() after kmalloc()
[RFC 3/3] zram: pass gfp from zcomp frontend to backend

First of all, thanks for your summary and organized code set, and
If Sergey agrees with that, I have no question about the patch 2/3.

>>
> 
> -- 
> Kind regards,
> Minchan Kim

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: Re: [PATCH v3] zram: try vmalloc() after kmalloc()

2015-11-23 Thread Kyeongdon Kim
Hello Andrew,
On 2015-11-24 오전 7:52, Andrew Morton wrote:
> On Mon, 23 Nov 2015 15:21:15 +0900 Kyeongdon Kim 
> wrote:
> 
>> When we're using LZ4 multi compression streams for zram swap,
>> we found out page allocation failure message in system running test.
>> That was not only once, but a few(2 - 5 times per test).
>> Also, some failure cases were continually occurring to try allocation
>> order 3.
>>
>> In order to make parallel compression private data, we should call
>> kzalloc() with order 2/3 in runtime(lzo/lz4). But if there is no order
>> 2/3 size memory to allocate in that time, page allocation fails.
>> This patch makes to use vmalloc() as fallback of kmalloc(), this
>> prevents page alloc failure warning.
>>
>> After using this, we never found warning message in running test, also
>> It could reduce process startup latency about 60-120ms in each case.
>>
>> ...
>>
>> --- a/drivers/block/zram/zcomp_lz4.c
>> +++ b/drivers/block/zram/zcomp_lz4.c
>> @@ -10,17 +10,25 @@
>> #include 
>> #include 
>> #include 
>> +#include 
>> +#include 
>>
>> #include "zcomp_lz4.h"
>>
>> static void *zcomp_lz4_create(void)
>> {
>> - return kzalloc(LZ4_MEM_COMPRESS, GFP_KERNEL);
>> + void *ret;
>> +
>> + ret = kzalloc(LZ4_MEM_COMPRESS,
>> + __GFP_NORETRY|__GFP_NOWARN|__GFP_NOMEMALLOC);
>> + if (!ret)
>> + ret = vzalloc(LZ4_MEM_COMPRESS);
>> + return ret;
>> }
> 
> What's the reasoning behind the modification to the gfp flags?
> 
> It clears __GFP_FS, __GFP_IO and even __GFP_WAIT. I suspect the latter
> two (at least) can be retained. And given that vmalloc() uses
> GFP_KERNEL, what's the point in clearing those flags for the kmalloc()
> case?
> 
> If this change (or something like it) remains in place, it should have
> a comment which fully explains the reasons, please.

Sorry for the delay in replying,
I just tried to remove that warning message. If there are more suitable
gfp flags (like the code in Minchan's patch), we can use them.

Thanks,
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: Re: [PATCH v3] zram: try vmalloc() after kmalloc()

2015-11-23 Thread Kyeongdon Kim
Hello Andrew,
On 2015-11-24 오전 7:52, Andrew Morton wrote:
> On Mon, 23 Nov 2015 15:21:15 +0900 Kyeongdon Kim <kyeongdon@lge.com>
> wrote:
> 
>> When we're using LZ4 multi compression streams for zram swap,
>> we found out page allocation failure message in system running test.
>> That was not only once, but a few(2 - 5 times per test).
>> Also, some failure cases were continually occurring to try allocation
>> order 3.
>>
>> In order to make parallel compression private data, we should call
>> kzalloc() with order 2/3 in runtime(lzo/lz4). But if there is no order
>> 2/3 size memory to allocate in that time, page allocation fails.
>> This patch makes to use vmalloc() as fallback of kmalloc(), this
>> prevents page alloc failure warning.
>>
>> After using this, we never found warning message in running test, also
>> It could reduce process startup latency about 60-120ms in each case.
>>
>> ...
>>
>> --- a/drivers/block/zram/zcomp_lz4.c
>> +++ b/drivers/block/zram/zcomp_lz4.c
>> @@ -10,17 +10,25 @@
>> #include 
>> #include 
>> #include 
>> +#include 
>> +#include 
>>
>> #include "zcomp_lz4.h"
>>
>> static void *zcomp_lz4_create(void)
>> {
>> - return kzalloc(LZ4_MEM_COMPRESS, GFP_KERNEL);
>> + void *ret;
>> +
>> + ret = kzalloc(LZ4_MEM_COMPRESS,
>> + __GFP_NORETRY|__GFP_NOWARN|__GFP_NOMEMALLOC);
>> + if (!ret)
>> + ret = vzalloc(LZ4_MEM_COMPRESS);
>> + return ret;
>> }
> 
> What's the reasoning behind the modification to the gfp flags?
> 
> It clears __GFP_FS, __GFP_IO and even __GFP_WAIT. I suspect the latter
> two (at least) can be retained. And given that vmalloc() uses
> GFP_KERNEL, what's the point in clearing those flags for the kmalloc()
> case?
> 
> If this change (or something like it) remains in place, it should have
> a comment which fully explains the reasons, please.

Sorry for the delay in replying,
I just tried to remove that warning message. If there are more suitable
gfp flags (like the code in Minchan's patch), we can use them.

Thanks,
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v3] zram: try vmalloc() after kmalloc()

2015-11-22 Thread Kyeongdon Kim
When we're using LZ4 multi compression streams for zram swap,
we found page allocation failure messages while running system tests.
That happened not just once, but a few times (2 - 5 times per test).
Also, some failure cases kept occurring when trying order-3
allocations.

In order to make parallel compression private data, we should call
kzalloc() with order 2/3 in runtime(lzo/lz4). But if there is no order
 2/3 size memory to allocate in that time, page allocation fails.
This patch makes to use vmalloc() as fallback of kmalloc(), this
prevents page alloc failure warning.

After using this, we never found warning message in running test, also
It could reduce process startup latency about 60-120ms in each case.

For reference a call trace :

Binder_1: page allocation failure: order:3, mode:0x10c0d0
CPU: 0 PID: 424 Comm: Binder_1 Tainted: GW 3.10.49-perf-g991d02b-dirty #20
Call trace:
[] dump_backtrace+0x0/0x270
[] show_stack+0x10/0x1c
[] dump_stack+0x1c/0x28
[] warn_alloc_failed+0xfc/0x11c
[] __alloc_pages_nodemask+0x724/0x7f0
[] __get_free_pages+0x14/0x5c
[] kmalloc_order_trace+0x38/0xd8
[] zcomp_lz4_create+0x2c/0x38
[] zcomp_strm_alloc+0x34/0x78
[] zcomp_strm_multi_find+0x124/0x1ec
[] zcomp_strm_find+0xc/0x18
[] zram_bvec_rw+0x2fc/0x780
[] zram_make_request+0x25c/0x2d4
[] generic_make_request+0x80/0xbc
[] submit_bio+0xa4/0x15c
[] __swap_writepage+0x218/0x230
[] swap_writepage+0x3c/0x4c
[] shrink_page_list+0x51c/0x8d0
[] shrink_inactive_list+0x3f8/0x60c
[] shrink_lruvec+0x33c/0x4cc
[] shrink_zone+0x3c/0x100
[] try_to_free_pages+0x2b8/0x54c
[] __alloc_pages_nodemask+0x514/0x7f0
[] __get_free_pages+0x14/0x5c
[] proc_info_read+0x50/0xe4
[] vfs_read+0xa0/0x12c
[] SyS_read+0x44/0x74
DMA: 3397*4kB (MC) 26*8kB (RC) 0*16kB 0*32kB 0*64kB 0*128kB 0*256kB
 0*512kB 0*1024kB 0*2048kB 0*4096kB = 13796kB

Signed-off-by: Kyeongdon Kim 
---
 drivers/block/zram/zcomp_lz4.c | 12 ++--
 drivers/block/zram/zcomp_lzo.c | 12 ++--
 2 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/drivers/block/zram/zcomp_lz4.c b/drivers/block/zram/zcomp_lz4.c
index f2afb7e..0cc4799 100644
--- a/drivers/block/zram/zcomp_lz4.c
+++ b/drivers/block/zram/zcomp_lz4.c
@@ -10,17 +10,25 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 #include "zcomp_lz4.h"
 
 static void *zcomp_lz4_create(void)
 {
-   return kzalloc(LZ4_MEM_COMPRESS, GFP_KERNEL);
+   void *ret;
+
+   ret = kzalloc(LZ4_MEM_COMPRESS,
+   __GFP_NORETRY|__GFP_NOWARN|__GFP_NOMEMALLOC);
+   if (!ret)
+   ret = vzalloc(LZ4_MEM_COMPRESS);
+   return ret;
 }
 
 static void zcomp_lz4_destroy(void *private)
 {
-   kfree(private);
+   kvfree(private);
 }
 
 static int zcomp_lz4_compress(const unsigned char *src, unsigned char *dst,
diff --git a/drivers/block/zram/zcomp_lzo.c b/drivers/block/zram/zcomp_lzo.c
index da1bc47..59b8aa4 100644
--- a/drivers/block/zram/zcomp_lzo.c
+++ b/drivers/block/zram/zcomp_lzo.c
@@ -10,17 +10,25 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 #include "zcomp_lzo.h"
 
 static void *lzo_create(void)
 {
-   return kzalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL);
+   void *ret;
+
+   ret = kzalloc(LZO1X_MEM_COMPRESS,
+   __GFP_NORETRY|__GFP_NOWARN|__GFP_NOMEMALLOC);
+   if (!ret)
+   ret = vzalloc(LZO1X_MEM_COMPRESS);
+   return ret;
 }
 
 static void lzo_destroy(void *private)
 {
-   kfree(private);
+   kvfree(private);
 }
 
 static int lzo_compress(const unsigned char *src, unsigned char *dst,
-- 
1.8.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v3] zram: try vmalloc() after kmalloc()

2015-11-22 Thread Kyeongdon Kim
When we're using LZ4 multi compression streams for zram swap,
we found page allocation failure messages while running system tests.
That happened not just once, but a few times (2 - 5 times per test).
Also, some failure cases kept occurring when trying order-3
allocations.

In order to make parallel compression private data, we should call
kzalloc() with order 2/3 in runtime(lzo/lz4). But if there is no order
 2/3 size memory to allocate in that time, page allocation fails.
This patch makes to use vmalloc() as fallback of kmalloc(), this
prevents page alloc failure warning.

After using this, we never found warning message in running test, also
It could reduce process startup latency about 60-120ms in each case.

For reference a call trace :

Binder_1: page allocation failure: order:3, mode:0x10c0d0
CPU: 0 PID: 424 Comm: Binder_1 Tainted: GW 3.10.49-perf-g991d02b-dirty #20
Call trace:
[] dump_backtrace+0x0/0x270
[] show_stack+0x10/0x1c
[] dump_stack+0x1c/0x28
[] warn_alloc_failed+0xfc/0x11c
[] __alloc_pages_nodemask+0x724/0x7f0
[] __get_free_pages+0x14/0x5c
[] kmalloc_order_trace+0x38/0xd8
[] zcomp_lz4_create+0x2c/0x38
[] zcomp_strm_alloc+0x34/0x78
[] zcomp_strm_multi_find+0x124/0x1ec
[] zcomp_strm_find+0xc/0x18
[] zram_bvec_rw+0x2fc/0x780
[] zram_make_request+0x25c/0x2d4
[] generic_make_request+0x80/0xbc
[] submit_bio+0xa4/0x15c
[] __swap_writepage+0x218/0x230
[] swap_writepage+0x3c/0x4c
[] shrink_page_list+0x51c/0x8d0
[] shrink_inactive_list+0x3f8/0x60c
[] shrink_lruvec+0x33c/0x4cc
[] shrink_zone+0x3c/0x100
[] try_to_free_pages+0x2b8/0x54c
[] __alloc_pages_nodemask+0x514/0x7f0
[] __get_free_pages+0x14/0x5c
[] proc_info_read+0x50/0xe4
[] vfs_read+0xa0/0x12c
[] SyS_read+0x44/0x74
DMA: 3397*4kB (MC) 26*8kB (RC) 0*16kB 0*32kB 0*64kB 0*128kB 0*256kB
 0*512kB 0*1024kB 0*2048kB 0*4096kB = 13796kB

Signed-off-by: Kyeongdon Kim <kyeongdon@lge.com>
---
 drivers/block/zram/zcomp_lz4.c | 12 ++--
 drivers/block/zram/zcomp_lzo.c | 12 ++--
 2 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/drivers/block/zram/zcomp_lz4.c b/drivers/block/zram/zcomp_lz4.c
index f2afb7e..0cc4799 100644
--- a/drivers/block/zram/zcomp_lz4.c
+++ b/drivers/block/zram/zcomp_lz4.c
@@ -10,17 +10,25 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 #include "zcomp_lz4.h"
 
 static void *zcomp_lz4_create(void)
 {
-   return kzalloc(LZ4_MEM_COMPRESS, GFP_KERNEL);
+   void *ret;
+
+   ret = kzalloc(LZ4_MEM_COMPRESS,
+   __GFP_NORETRY|__GFP_NOWARN|__GFP_NOMEMALLOC);
+   if (!ret)
+   ret = vzalloc(LZ4_MEM_COMPRESS);
+   return ret;
 }
 
 static void zcomp_lz4_destroy(void *private)
 {
-   kfree(private);
+   kvfree(private);
 }
 
 static int zcomp_lz4_compress(const unsigned char *src, unsigned char *dst,
diff --git a/drivers/block/zram/zcomp_lzo.c b/drivers/block/zram/zcomp_lzo.c
index da1bc47..59b8aa4 100644
--- a/drivers/block/zram/zcomp_lzo.c
+++ b/drivers/block/zram/zcomp_lzo.c
@@ -10,17 +10,25 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 #include "zcomp_lzo.h"
 
 static void *lzo_create(void)
 {
-   return kzalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL);
+   void *ret;
+
+   ret = kzalloc(LZO1X_MEM_COMPRESS,
+   __GFP_NORETRY|__GFP_NOWARN|__GFP_NOMEMALLOC);
+   if (!ret)
+   ret = vzalloc(LZO1X_MEM_COMPRESS);
+   return ret;
 }
 
 static void lzo_destroy(void *private)
 {
-   kfree(private);
+   kvfree(private);
 }
 
 static int lzo_compress(const unsigned char *src, unsigned char *dst,
-- 
1.8.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2] zram: Prevent page allocation failure during zcomp_strm_alloc

2015-11-20 Thread Kyeongdon Kim
When we're using LZ4 multi compression streams for zram swap,
we found page allocation failure messages while running system tests.
That happened not just once, but a few times (2 - 5 times per test).
Also, some failure cases kept occurring when trying order-3
allocations.

In order to make parallel compression private data, we should call
kzalloc() with order 2/3 in runtime(lzo/lz4). But if there is no order
 2/3 size memory in that time, page allocation fails.
This patch makes to use vmalloc() as fallback of kmalloc(), this
prevents page alloc failure warning.

After this, we never found warning message in running test, also
It could reduce process startup latency about 60-120ms in each case.

For reference a call trace :

Binder_1: page allocation failure: order:3, mode:0x10c0d0
CPU: 0 PID: 424 Comm: Binder_1 Tainted: GW 3.10.49-perf-g991d02b-dirty #20
Call trace:
[] dump_backtrace+0x0/0x270
[] show_stack+0x10/0x1c
[] dump_stack+0x1c/0x28
[] warn_alloc_failed+0xfc/0x11c
[] __alloc_pages_nodemask+0x724/0x7f0
[] __get_free_pages+0x14/0x5c
[] kmalloc_order_trace+0x38/0xd8
[] zcomp_lz4_create+0x2c/0x38
[] zcomp_strm_alloc+0x34/0x78
[] zcomp_strm_multi_find+0x124/0x1ec
[] zcomp_strm_find+0xc/0x18
[] zram_bvec_rw+0x2fc/0x780
[] zram_make_request+0x25c/0x2d4
[] generic_make_request+0x80/0xbc
[] submit_bio+0xa4/0x15c
[] __swap_writepage+0x218/0x230
[] swap_writepage+0x3c/0x4c
[] shrink_page_list+0x51c/0x8d0
[] shrink_inactive_list+0x3f8/0x60c
[] shrink_lruvec+0x33c/0x4cc
[] shrink_zone+0x3c/0x100
[] try_to_free_pages+0x2b8/0x54c
[] __alloc_pages_nodemask+0x514/0x7f0
[] __get_free_pages+0x14/0x5c
[] proc_info_read+0x50/0xe4
[] vfs_read+0xa0/0x12c
[] SyS_read+0x44/0x74
DMA: 3397*4kB (MC) 26*8kB (RC) 0*16kB 0*32kB 0*64kB 0*128kB 0*256kB
 0*512kB 0*1024kB 0*2048kB 0*4096kB = 13796kB

Signed-off-by: Kyeongdon Kim 
---
 drivers/block/zram/zcomp_lz4.c | 15 +--
 drivers/block/zram/zcomp_lzo.c | 15 +--
 2 files changed, 26 insertions(+), 4 deletions(-)

diff --git a/drivers/block/zram/zcomp_lz4.c b/drivers/block/zram/zcomp_lz4.c
index f2afb7e..0477894 100644
--- a/drivers/block/zram/zcomp_lz4.c
+++ b/drivers/block/zram/zcomp_lz4.c
@@ -10,17 +10,28 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 #include "zcomp_lz4.h"
 
 static void *zcomp_lz4_create(void)
 {
-   return kzalloc(LZ4_MEM_COMPRESS, GFP_KERNEL);
+   void *ret;
+
+   ret = kzalloc(LZ4_MEM_COMPRESS,
+   __GFP_NORETRY|__GFP_NOWARN|__GFP_NOMEMALLOC);
+   if (!ret)
+   ret = vzalloc(LZ4_MEM_COMPRESS);
+   return ret;
 }
 
 static void zcomp_lz4_destroy(void *private)
 {
-   kfree(private);
+   if (is_vmalloc_addr(private))
+   vfree(private);
+   else
+   kfree(private);
 }
 
 static int zcomp_lz4_compress(const unsigned char *src, unsigned char *dst,
diff --git a/drivers/block/zram/zcomp_lzo.c b/drivers/block/zram/zcomp_lzo.c
index da1bc47..613fbac 100644
--- a/drivers/block/zram/zcomp_lzo.c
+++ b/drivers/block/zram/zcomp_lzo.c
@@ -10,17 +10,28 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 #include "zcomp_lzo.h"
 
 static void *lzo_create(void)
 {
-   return kzalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL);
+   void *ret;
+
+   ret = kzalloc(LZO1X_MEM_COMPRESS,
+   __GFP_NORETRY|__GFP_NOWARN|__GFP_NOMEMALLOC);
+   if (!ret)
+   ret = vzalloc(LZO1X_MEM_COMPRESS);
+   return ret;
 }
 
 static void lzo_destroy(void *private)
 {
-   kfree(private);
+   if (is_vmalloc_addr(private))
+   vfree(private);
+   else
+   kfree(private);
 }
 
 static int lzo_compress(const unsigned char *src, unsigned char *dst,
-- 
1.8.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2] zram: Prevent page allocation failure during zcomp_strm_alloc

2015-11-20 Thread Kyeongdon Kim
When we're using LZ4 multi compression streams for zram swap,
we found page allocation failure messages while running system tests.
That happened not just once, but a few times (2 - 5 times per test).
Also, some failure cases kept occurring when trying order-3
allocations.

In order to make parallel compression private data, we should call
kzalloc() with order 2/3 in runtime(lzo/lz4). But if there is no order
 2/3 size memory in that time, page allocation fails.
This patch makes to use vmalloc() as fallback of kmalloc(), this
prevents page alloc failure warning.

After this, we never found warning message in running test, also
It could reduce process startup latency about 60-120ms in each case.

For reference a call trace :

Binder_1: page allocation failure: order:3, mode:0x10c0d0
CPU: 0 PID: 424 Comm: Binder_1 Tainted: GW 3.10.49-perf-g991d02b-dirty #20
Call trace:
[] dump_backtrace+0x0/0x270
[] show_stack+0x10/0x1c
[] dump_stack+0x1c/0x28
[] warn_alloc_failed+0xfc/0x11c
[] __alloc_pages_nodemask+0x724/0x7f0
[] __get_free_pages+0x14/0x5c
[] kmalloc_order_trace+0x38/0xd8
[] zcomp_lz4_create+0x2c/0x38
[] zcomp_strm_alloc+0x34/0x78
[] zcomp_strm_multi_find+0x124/0x1ec
[] zcomp_strm_find+0xc/0x18
[] zram_bvec_rw+0x2fc/0x780
[] zram_make_request+0x25c/0x2d4
[] generic_make_request+0x80/0xbc
[] submit_bio+0xa4/0x15c
[] __swap_writepage+0x218/0x230
[] swap_writepage+0x3c/0x4c
[] shrink_page_list+0x51c/0x8d0
[] shrink_inactive_list+0x3f8/0x60c
[] shrink_lruvec+0x33c/0x4cc
[] shrink_zone+0x3c/0x100
[] try_to_free_pages+0x2b8/0x54c
[] __alloc_pages_nodemask+0x514/0x7f0
[] __get_free_pages+0x14/0x5c
[] proc_info_read+0x50/0xe4
[] vfs_read+0xa0/0x12c
[] SyS_read+0x44/0x74
DMA: 3397*4kB (MC) 26*8kB (RC) 0*16kB 0*32kB 0*64kB 0*128kB 0*256kB
 0*512kB 0*1024kB 0*2048kB 0*4096kB = 13796kB

Signed-off-by: Kyeongdon Kim <kyeongdon@lge.com>
---
 drivers/block/zram/zcomp_lz4.c | 15 +--
 drivers/block/zram/zcomp_lzo.c | 15 +--
 2 files changed, 26 insertions(+), 4 deletions(-)

diff --git a/drivers/block/zram/zcomp_lz4.c b/drivers/block/zram/zcomp_lz4.c
index f2afb7e..0477894 100644
--- a/drivers/block/zram/zcomp_lz4.c
+++ b/drivers/block/zram/zcomp_lz4.c
@@ -10,17 +10,28 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 #include "zcomp_lz4.h"
 
 static void *zcomp_lz4_create(void)
 {
-   return kzalloc(LZ4_MEM_COMPRESS, GFP_KERNEL);
+   void *ret;
+
+   ret = kzalloc(LZ4_MEM_COMPRESS,
+   __GFP_NORETRY|__GFP_NOWARN|__GFP_NOMEMALLOC);
+   if (!ret)
+   ret = vzalloc(LZ4_MEM_COMPRESS);
+   return ret;
 }
 
 static void zcomp_lz4_destroy(void *private)
 {
-   kfree(private);
+   if (is_vmalloc_addr(private))
+   vfree(private);
+   else
+   kfree(private);
 }
 
 static int zcomp_lz4_compress(const unsigned char *src, unsigned char *dst,
diff --git a/drivers/block/zram/zcomp_lzo.c b/drivers/block/zram/zcomp_lzo.c
index da1bc47..613fbac 100644
--- a/drivers/block/zram/zcomp_lzo.c
+++ b/drivers/block/zram/zcomp_lzo.c
@@ -10,17 +10,28 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 #include "zcomp_lzo.h"
 
 static void *lzo_create(void)
 {
-   return kzalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL);
+   void *ret;
+
+   ret = kzalloc(LZO1X_MEM_COMPRESS,
+   __GFP_NORETRY|__GFP_NOWARN|__GFP_NOMEMALLOC);
+   if (!ret)
+   ret = vzalloc(LZO1X_MEM_COMPRESS);
+   return ret;
 }
 
 static void lzo_destroy(void *private)
 {
-   kfree(private);
+   if (is_vmalloc_addr(private))
+   vfree(private);
+   else
+   kfree(private);
 }
 
 static int lzo_compress(const unsigned char *src, unsigned char *dst,
-- 
1.8.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/