[PATCH v2 4/4] kernel hacking: new config CC_OPTIMIZE_FOR_DEBUGGING to apply GCC -Og optimization

2018-10-19 Thread Du Changbin
This will apply GCC '-Og' optimization level which is supported
since GCC 4.8. This optimization level offers a reasonable level
of optimization while maintaining fast compilation and a good
debugging experience. It is similar to '-O1' while preferring
to keep debuggability over runtime speed.

If enabling this option breaks your kernel, you should either
disable it or find a fix (most likely in the arch code). Currently
this option has only been tested on the x86_64 and ARM platforms.

This option can satisfy people who were searching for a method
to disable compiler optimizations so as to achieve a better kernel
debugging experience with kgdb or qemu.

The main problem with '-Og' is that we must not use __attribute__((error(msg))).
The compiler will report the error even though the call to the error function
could still be optimized out. So we must fall back to the array trick.

Comparison of vmlinux size: a bit smaller.

w/o CONFIG_CC_OPTIMIZE_FOR_DEBUGGING
$ size vmlinux
    text     data     bss      dec     hex filename
22665554  9709674 2920908 35296136 21a9388 vmlinux

w/ CONFIG_CC_OPTIMIZE_FOR_DEBUGGING
$ size vmlinux
    text     data     bss      dec     hex filename
21499032 10102758 2920908 34522698 20ec64a vmlinux

Comparison of system performance: a slight drop (~6%).
This benchmark of kernel compilation is suggested by Ingo Molnar.
https://lkml.org/lkml/2018/5/2/74

Preparation: Set cpufreq to 'performance'.
for ((cpu=0; cpu<120; cpu++)); do
  G=/sys/devices/system/cpu/cpu$cpu/cpufreq/scaling_governor
  [ -f $G ] && echo performance > $G
done

w/o CONFIG_CC_OPTIMIZE_FOR_DEBUGGING
$ perf stat --repeat 5 --null --pre '\
cp -a kernel ../kernel.copy.$(date +%s); \
rm -rf *;\
git checkout .;  \
echo 1 > /proc/sys/vm/drop_caches;   \
find ../kernel* -type f | xargs cat >/dev/null;  \
make -j kernel >/dev/null;   \
make clean >/dev/null 2>&1;  \
sync'\
 \
make -j8 >/dev/null

Performance counter stats for 'make -j8' (5 runs):

219.764246652 seconds time elapsed   ( +-  0.78% )

w/ CONFIG_CC_OPTIMIZE_FOR_DEBUGGING
$ perf stat --repeat 5 --null --pre '\
cp -a kernel ../kernel.copy.$(date +%s); \
rm -rf *;\
git checkout .;  \
echo 1 > /proc/sys/vm/drop_caches;   \
find ../kernel* -type f | xargs cat >/dev/null;  \
make -j kernel >/dev/null;   \
make clean >/dev/null 2>&1;  \
sync'\
 \
make -j8 >/dev/null

Performance counter stats for 'make -j8' (5 runs):

 233.574187771 seconds time elapsed  ( +-  0.19% )

Signed-off-by: Du Changbin 
Acked-by: Steven Rostedt (VMware) 
---
 Makefile |  5 +
 include/linux/compiler-gcc.h |  2 +-
 include/linux/compiler.h |  2 +-
 init/Kconfig | 19 +++
 4 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index 757d6507cb5c..ea908cfe8594 100644
--- a/Makefile
+++ b/Makefile
@@ -657,6 +657,10 @@ KBUILD_CFLAGS  += $(call cc-disable-warning, 
format-truncation)
 KBUILD_CFLAGS  += $(call cc-disable-warning, format-overflow)
 KBUILD_CFLAGS  += $(call cc-disable-warning, int-in-bool-context)
 
+ifdef CONFIG_CC_OPTIMIZE_FOR_DEBUGGING
+KBUILD_CFLAGS  += $(call cc-option, -Og)
+KBUILD_CFLAGS  += $(call cc-disable-warning,maybe-uninitialized,)
+else
 ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE
 KBUILD_CFLAGS  += $(call cc-option,-Oz,-Os)
 KBUILD_CFLAGS  += $(call cc-disable-warning,maybe-uninitialized,)
@@ -667,6 +671,7 @@ else
 KBUILD_CFLAGS   += -O2
 endif
 endif
+endif
 
 KBUILD_CFLAGS += $(call cc-ifversion, -lt, 0409, \
$(call cc-disable-warning,maybe-uninitialized,))
diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index 4d36b27214fd..2a76f7c64b54 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -85,7 +85,7 @@
 
 #define __compiletime_object_size(obj) __builtin_object_size(obj, 0)
 
-#ifndef __CHECKER__
+#if !defined(__CHECKER__) && !defined(CONFIG_CC_OPTIMIZE_FOR_DEBUGGING)
 #define __compiletime_warning(message) __attribute__((warning(message)))
 #define __compiletime_error(message) __attribute__((error(message)))
 
diff --git a/include

[PATCH v2 3/4] ARM: mm: fix build error in fix_to_virt with CONFIG_CC_OPTIMIZE_FOR_DEBUGGING

2018-10-19 Thread Du Changbin
With the '-Og' optimization level, GCC does not optimize a loop counter
into a constant value. But BUILD_BUG_ON() only accepts compile-time constant
values. Let's use __fix_to_virt() to avoid the error.

arch/arm/mm/mmu.o: In function `fix_to_virt':
/home/changbin/work/linux/./include/asm-generic/fixmap.h:31: undefined 
reference to `__compiletime_assert_31'
Makefile:1051: recipe for target 'vmlinux' failed
make: *** [vmlinux] Error 1

Signed-off-by: Du Changbin 
Acked-by: Steven Rostedt (VMware) 
---
 arch/arm/mm/mmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
index e46a6a446cdd..c08d74e76714 100644
--- a/arch/arm/mm/mmu.c
+++ b/arch/arm/mm/mmu.c
@@ -1599,7 +1599,7 @@ static void __init early_fixmap_shutdown(void)
pte_t *pte;
struct map_desc map;
 
-   map.virtual = fix_to_virt(i);
+   map.virtual = __fix_to_virt(i);
pte = pte_offset_early_fixmap(pmd_off_k(map.virtual), 
map.virtual);
 
/* Only i/o device mappings are supported ATM */
-- 
2.17.1



[PATCH v2 2/4] kernel hacking: new config NO_AUTO_INLINE to disable compiler auto-inline optimizations

2018-10-19 Thread Du Changbin
This patch adds a new kernel hacking option NO_AUTO_INLINE. Selecting
this option will prevent the compiler from optimizing the kernel by
auto-inlining functions not marked with the inline keyword.

With this option, only functions explicitly marked with "inline" will
be inlined. This will allow the function tracer to trace more functions
because it only traces functions that the compiler has not inlined.

Signed-off-by: Du Changbin 
Acked-by: Steven Rostedt (VMware) 
---
 Makefile  |  6 ++
 lib/Kconfig.debug | 17 +
 2 files changed, 23 insertions(+)

diff --git a/Makefile b/Makefile
index e8b599b4dcde..757d6507cb5c 100644
--- a/Makefile
+++ b/Makefile
@@ -749,6 +749,12 @@ KBUILD_CFLAGS  += $(call cc-option, 
-femit-struct-debug-baseonly) \
   $(call cc-option,-fno-var-tracking)
 endif
 
+ifdef CONFIG_NO_AUTO_INLINE
+KBUILD_CFLAGS   += $(call cc-option, -fno-inline-functions) \
+  $(call cc-option, -fno-inline-small-functions) \
+  $(call cc-option, -fno-inline-functions-called-once)
+endif
+
 ifdef CONFIG_FUNCTION_TRACER
 ifdef CONFIG_FTRACE_MCOUNT_RECORD
   # gcc 5 supports generating the mcount tables directly
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 4966c4fbe7f7..c7c28ee01dfc 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -211,6 +211,23 @@ config GDB_SCRIPTS
  instance. See Documentation/dev-tools/gdb-kernel-debugging.rst
  for further details.
 
+config NO_AUTO_INLINE
+   bool "Disable compiler auto-inline optimizations"
+   help
+ This will prevent the compiler from optimizing the kernel by
+ auto-inlining functions not marked with the inline keyword.
+ With this option, only functions explicitly marked with
+ "inline" will be inlined. This will allow the function tracer
+ to trace more functions because it only traces functions that
+ the compiler has not inlined.
+
+ Enabling this function can help debugging a kernel if using
+ the function tracer. But it can also change how the kernel
+ works, because inlining functions may change the timing,
+ which could make it difficult while debugging race conditions.
+
+ If unsure, select N.
+
 config ENABLE_MUST_CHECK
bool "Enable __must_check logic"
default y
-- 
2.17.1



[PATCH v2 0/4] kernel hacking: GCC optimization for better debug experience (-Og)

2018-10-19 Thread Du Changbin
Hi all,
I posted this series several months ago but was interrupted by personal
affairs. Now I have time to complete this task. Thanks to all of the
reviewers.

I know some kernel developers were searching for a method to disable GCC
optimizations; probably they want to apply the GCC '-O0' option. But since the
Linux kernel relies on GCC optimization to remove some dead code, '-O0' just
breaks the build. They do need this because they want to debug the kernel with
qemu, simics, kgtp or kgdb.

Fortunately, GCC 4.8 introduced the '-Og' optimization level, which
offers a reasonable level of optimization while maintaining fast compilation
and a good debugging experience. It is similar to '-O1' while preferring to
keep debuggability over runtime speed. With '-Og', we can build a kernel with
better debuggability and little performance drop after some simple changes.

This series first introduces a new config CONFIG_NO_AUTO_INLINE, after two
fixes for this new option. With this option, only functions explicitly marked
with "inline" will be inlined. This will allow the function tracer to trace
more functions because it only traces functions that the compiler has not
inlined.

It then introduces a new config CC_OPTIMIZE_FOR_DEBUGGING, which applies the
'-Og' optimization level to the whole kernel, with a simple fix in fix_to_virt().
Currently I have only tested this option on the x86 and ARM platforms. Other
platforms should also work but will probably need some compile fixes like those
done in this series. I leave that to whoever wants to try this debug
option.

Comparison of vmlinux size: a bit smaller.

w/o CONFIG_CC_OPTIMIZE_FOR_DEBUGGING
$ size vmlinux
    text     data     bss      dec     hex filename
22665554  9709674 2920908 35296136 21a9388 vmlinux

w/ CONFIG_CC_OPTIMIZE_FOR_DEBUGGING
$ size vmlinux
    text     data     bss      dec     hex filename
21499032 10102758 2920908 34522698 20ec64a vmlinux


Comparison of system performance: a slight drop (~6%).
This benchmark of kernel compilation is suggested by Ingo Molnar.
https://lkml.org/lkml/2018/5/2/74

Preparation: Set cpufreq to 'performance'.
for ((cpu=0; cpu<120; cpu++)); do
  G=/sys/devices/system/cpu/cpu$cpu/cpufreq/scaling_governor
  [ -f $G ] && echo performance > $G
done

w/o CONFIG_CC_OPTIMIZE_FOR_DEBUGGING
$ perf stat --repeat 5 --null --pre '\
cp -a kernel ../kernel.copy.$(date +%s); \
rm -rf *;\
git checkout .;  \
echo 1 > /proc/sys/vm/drop_caches;   \
find ../kernel* -type f | xargs cat >/dev/null;  \
make -j kernel >/dev/null;   \
make clean >/dev/null 2>&1;  \
sync'\
 \
make -j8 >/dev/null

 Performance counter stats for 'make -j8' (5 runs):

219.764246652 seconds time elapsed   ( +-  0.78% )

w/ CONFIG_CC_OPTIMIZE_FOR_DEBUGGING
$ perf stat --repeat 5 --null --pre '\
cp -a kernel ../kernel.copy.$(date +%s); \
rm -rf *;\
git checkout .;  \
echo 1 > /proc/sys/vm/drop_caches;   \
find ../kernel* -type f | xargs cat >/dev/null;  \
make -j kernel >/dev/null;   \
make clean >/dev/null 2>&1;  \
sync'\
 \
make -j8 >/dev/null

Performance counter stats for 'make -j8' (5 runs):

 233.574187771 seconds time elapsed  ( +-  0.19% )


Du Changbin (4):
  x86/mm: surround level4_kernel_pgt with #ifdef
CONFIG_X86_5LEVEL...#endif
  kernel hacking: new config NO_AUTO_INLINE to disable compiler
auto-inline optimizations
  ARM: mm: fix build error in fix_to_virt with
CONFIG_CC_OPTIMIZE_FOR_DEBUGGING
  kernel hacking: new config CC_OPTIMIZE_FOR_DEBUGGING to apply GCC -Og
optimization

 Makefile  | 11 +++
 arch/arm/mm/mmu.c |  2 +-
 arch/x86/include/asm/pgtable_64.h |  2 ++
 arch/x86/kernel/head64.c  | 13 ++---
 include/linux/compiler-gcc.h  |  2 +-
 include/linux/compiler.h  |  2 +-
 init/Kconfig  | 19 +++
 lib/Kconfig.debug | 17 +
 8 files changed, 58 insertions(+), 10 deletions(-)

-- 
2.17.1



[PATCH v2 1/4] x86/mm: surround level4_kernel_pgt with #ifdef CONFIG_X86_5LEVEL...#endif

2018-10-19 Thread Du Changbin
The level4_kernel_pgt is only defined when X86_5LEVEL is enabled. So
surround level4_kernel_pgt with #ifdef CONFIG_X86_5LEVEL...#endif to
make code correct.

Signed-off-by: Du Changbin 
Acked-by: Steven Rostedt (VMware) 
---
 arch/x86/include/asm/pgtable_64.h |  2 ++
 arch/x86/kernel/head64.c  | 13 ++---
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/pgtable_64.h 
b/arch/x86/include/asm/pgtable_64.h
index 9c85b54bf03c..9333f7fa5bdb 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -16,7 +16,9 @@
 #include 
 #include 
 
+#ifdef CONFIG_X86_5LEVEL
 extern p4d_t level4_kernel_pgt[512];
+#endif
 extern p4d_t level4_ident_pgt[512];
 extern pud_t level3_kernel_pgt[512];
 extern pud_t level3_ident_pgt[512];
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index ddee1f0870c4..4a59ef93c258 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -151,16 +151,15 @@ unsigned long __head __startup_64(unsigned long physaddr,
 
pgd = fixup_pointer(&early_top_pgt, physaddr);
p = pgd + pgd_index(__START_KERNEL_map);
-   if (la57)
-   *p = (unsigned long)level4_kernel_pgt;
-   else
-   *p = (unsigned long)level3_kernel_pgt;
-   *p += _PAGE_TABLE_NOENC - __START_KERNEL_map + load_delta;
-
+#ifdef CONFIG_X86_5LEVEL
if (la57) {
+   *p = (unsigned long)level4_kernel_pgt;
p4d = fixup_pointer(&level4_kernel_pgt, physaddr);
p4d[511] += load_delta;
-   }
+   } else
+#endif
+   *p = (unsigned long)level3_kernel_pgt;
+   *p += _PAGE_TABLE_NOENC - __START_KERNEL_map + load_delta;
 
pud = fixup_pointer(&level3_kernel_pgt, physaddr);
pud[510] += load_delta;
-- 
2.17.1



Re: [PATCH 2/4] kernel hacking: new config NO_AUTO_INLINE to disable compiler auto-inline optimizations

2018-10-18 Thread Du Changbin
On Thu, Oct 18, 2018 at 12:59:48PM -0400, Steven Rostedt wrote:
> On Thu, 18 Oct 2018 16:25:46 +
> Du Changbin  wrote:
> 
> > From: Changbin Du 
> > 
> > This patch add a new kernel hacking option NO_AUTO_INLINE. Selecting
> > this option will prevent the compiler from optimizing the kernel by
> > auto-inlining functions not marked with the inline keyword.
> > 
> > With this option, only functions explicitly marked with "inline" will
> > be inlined. This will allow the function tracer to trace more functions
> > because it only traces functions that the compiler has not inlined.
> > 
> > Signed-off-by: Changbin Du 
> > Acked-by: Steven Rostedt (VMware) 
> 
> I have acked patch this before, but this particular patch has extra
> changes that I have not acked.
>
Steven, no extra changes were made. I just wrongly rebased it on top of mainline.
:)

> 
> 
> > +config ENABLE_WARN_DEPRECATED
> > +   bool "Enable __deprecated logic"
> > +   default y
> > +   help
> > + Enable the __deprecated logic in the kernel build.
> > + Disable this to suppress the "warning: 'foo' is deprecated
> > + (declared at kernel/power/somefile.c:1234)" messages.
> > +
> 
> What is that?
> 
> -- Steve

-- 
Thanks,
Du Changbin


Re: [PATCH 2/4] kernel hacking: new config NO_AUTO_INLINE to disable compiler auto-inline optimizations

2018-10-18 Thread Du Changbin
On Thu, Oct 18, 2018 at 05:57:56PM +0100, Robin Murphy wrote:
> On 18/10/18 17:25, Du Changbin wrote:
> > From: Changbin Du 
> > 
> > This patch add a new kernel hacking option NO_AUTO_INLINE. Selecting
> > this option will prevent the compiler from optimizing the kernel by
> > auto-inlining functions not marked with the inline keyword.
> > 
> > With this option, only functions explicitly marked with "inline" will
> > be inlined. This will allow the function tracer to trace more functions
> > because it only traces functions that the compiler has not inlined.
> > 
> > Signed-off-by: Changbin Du 
> > Acked-by: Steven Rostedt (VMware) 
> > ---
> >   Makefile  |  6 ++
> >   lib/Kconfig.debug | 25 +
> >   2 files changed, 31 insertions(+)
> > 
> > diff --git a/Makefile b/Makefile
> > index e8b599b4dcde..757d6507cb5c 100644
> > --- a/Makefile
> > +++ b/Makefile
> > @@ -749,6 +749,12 @@ KBUILD_CFLAGS  += $(call cc-option, 
> > -femit-struct-debug-baseonly) \
> >$(call cc-option,-fno-var-tracking)
> >   endif
> > +ifdef CONFIG_NO_AUTO_INLINE
> > +KBUILD_CFLAGS   += $(call cc-option, -fno-inline-functions) \
> > +  $(call cc-option, -fno-inline-small-functions) \
> > +  $(call cc-option, -fno-inline-functions-called-once)
> > +endif
> > +
> >   ifdef CONFIG_FUNCTION_TRACER
> >   ifdef CONFIG_FTRACE_MCOUNT_RECORD
> > # gcc 5 supports generating the mcount tables directly
> > diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
> > index 4966c4fbe7f7..0f9b4fa78b1c 100644
> > --- a/lib/Kconfig.debug
> > +++ b/lib/Kconfig.debug
> > @@ -211,6 +211,31 @@ config GDB_SCRIPTS
> >   instance. See Documentation/dev-tools/gdb-kernel-debugging.rst
> >   for further details.
> > +config NO_AUTO_INLINE
> > +   bool "Disable compiler auto-inline optimizations"
> > +   help
> > + This will prevent the compiler from optimizing the kernel by
> > + auto-inlining functions not marked with the inline keyword.
> > + With this option, only functions explicitly marked with
> > + "inline" will be inlined. This will allow the function tracer
> > + to trace more functions because it only traces functions that
> > + the compiler has not inlined.
> > +
> > + Enabling this function can help debugging a kernel if using
> > + the function tracer. But it can also change how the kernel
> > + works, because inlining functions may change the timing,
> > + which could make it difficult while debugging race conditions.
> > +
> > + If unsure, select N.
> > +
> > +config ENABLE_WARN_DEPRECATED
> 
> This part doesn't look like it belongs in this patch, and judging by the
> commit message in 771c035372a0 wouldn't be welcome back anyway.
> 
Oops, this is a rebasing mistake. Let me update it. Thanks.

> Robin.
> 
> > +   bool "Enable __deprecated logic"
> > +   default y
> > +   help
> > + Enable the __deprecated logic in the kernel build.
> > + Disable this to suppress the "warning: 'foo' is deprecated
> > + (declared at kernel/power/somefile.c:1234)" messages.
> > +
> >   config ENABLE_MUST_CHECK
> > bool "Enable __must_check logic"
> > default y
> > 

-- 
Thanks,
Du Changbin


[PATCH 3/4] ARM: mm: fix build error in fix_to_virt with CONFIG_CC_OPTIMIZE_FOR_DEBUGGING

2018-10-18 Thread Du Changbin
From: Changbin Du 

With the '-Og' optimization level, GCC does not optimize a loop counter
into a constant value. But BUILD_BUG_ON() only accepts compile-time constant
values. Let's use __fix_to_virt() to avoid the error.

arch/arm/mm/mmu.o: In function `fix_to_virt':
/home/changbin/work/linux/./include/asm-generic/fixmap.h:31: undefined 
reference to `__compiletime_assert_31'
Makefile:1051: recipe for target 'vmlinux' failed
make: *** [vmlinux] Error 1

Signed-off-by: Changbin Du 
Acked-by: Steven Rostedt (VMware) 
---
 arch/arm/mm/mmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
index e46a6a446cdd..c08d74e76714 100644
--- a/arch/arm/mm/mmu.c
+++ b/arch/arm/mm/mmu.c
@@ -1599,7 +1599,7 @@ static void __init early_fixmap_shutdown(void)
pte_t *pte;
struct map_desc map;
 
-   map.virtual = fix_to_virt(i);
+   map.virtual = __fix_to_virt(i);
pte = pte_offset_early_fixmap(pmd_off_k(map.virtual), 
map.virtual);
 
/* Only i/o device mappings are supported ATM */
-- 
2.17.1



[PATCH 4/4] kernel hacking: new config CC_OPTIMIZE_FOR_DEBUGGING to apply GCC -Og optimization

2018-10-18 Thread Du Changbin
From: Changbin Du 

This will apply GCC '-Og' optimization level which is supported
since GCC 4.8. This optimization level offers a reasonable level
of optimization while maintaining fast compilation and a good
debugging experience. It is similar to '-O1' while preferring
to keep debuggability over runtime speed.

If enabling this option breaks your kernel, you should either
disable it or find a fix (most likely in the arch code). Currently
this option has only been tested on the x86_64 and ARM platforms.

This option can satisfy people who were searching for a method
to disable compiler optimizations so as to achieve a better kernel
debugging experience with kgdb or qemu.

The main problem with '-Og' is that we must not use __attribute__((error(msg))).
The compiler will report the error even though the call to the error function
could still be optimized out. So we must fall back to the array trick.

Comparison of vmlinux size: a bit smaller.

w/o CONFIG_CC_OPTIMIZE_FOR_DEBUGGING
$ size vmlinux
    text     data     bss      dec     hex filename
22665554  9709674 2920908 35296136 21a9388 vmlinux

w/ CONFIG_CC_OPTIMIZE_FOR_DEBUGGING
$ size vmlinux
    text     data     bss      dec     hex filename
21499032 10102758 2920908 34522698 20ec64a vmlinux

Comparison of system performance: a slight drop (~6%).
This benchmark of kernel compilation is suggested by Ingo Molnar.
https://lkml.org/lkml/2018/5/2/74

Preparation: Set cpufreq to 'performance'.
for ((cpu=0; cpu<120; cpu++)); do
  G=/sys/devices/system/cpu/cpu$cpu/cpufreq/scaling_governor
  [ -f $G ] && echo performance > $G
done

w/o CONFIG_CC_OPTIMIZE_FOR_DEBUGGING
$ perf stat --repeat 5 --null --pre '\
cp -a kernel ../kernel.copy.$(date +%s); \
rm -rf *;\
git checkout .;  \
echo 1 > /proc/sys/vm/drop_caches;   \
find ../kernel* -type f | xargs cat >/dev/null;  \
make -j kernel >/dev/null;   \
make clean >/dev/null 2>&1;  \
sync'\
 \
make -j8 >/dev/null

Performance counter stats for 'make -j8' (5 runs):

219.764246652 seconds time elapsed   ( +-  0.78% )

w/ CONFIG_CC_OPTIMIZE_FOR_DEBUGGING
$ perf stat --repeat 5 --null --pre '\
cp -a kernel ../kernel.copy.$(date +%s); \
rm -rf *;\
git checkout .;  \
echo 1 > /proc/sys/vm/drop_caches;   \
find ../kernel* -type f | xargs cat >/dev/null;  \
make -j kernel >/dev/null;   \
make clean >/dev/null 2>&1;  \
sync'\
 \
make -j8 >/dev/null

Performance counter stats for 'make -j8' (5 runs):

 233.574187771 seconds time elapsed  ( +-  0.19% )

Signed-off-by: Changbin Du 
Acked-by: Steven Rostedt (VMware) 
---
 Makefile |  5 +
 include/linux/compiler-gcc.h |  2 +-
 include/linux/compiler.h |  2 +-
 init/Kconfig | 19 +++
 4 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index 757d6507cb5c..0e747fafb53b 100644
--- a/Makefile
+++ b/Makefile
@@ -657,6 +657,10 @@ KBUILD_CFLAGS  += $(call cc-disable-warning, 
format-truncation)
 KBUILD_CFLAGS  += $(call cc-disable-warning, format-overflow)
 KBUILD_CFLAGS  += $(call cc-disable-warning, int-in-bool-context)
 
+ifdef CONFIG_CC_OPTIMIZE_FOR_DEBUGGING
+KBUILD_CFLAGS  += $(call cc-option, -Og)
+KBUILD_CFLAGS   += $(call cc-disable-warning,maybe-uninitialized,)
+else
 ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE
 KBUILD_CFLAGS  += $(call cc-option,-Oz,-Os)
 KBUILD_CFLAGS  += $(call cc-disable-warning,maybe-uninitialized,)
@@ -667,6 +671,7 @@ else
 KBUILD_CFLAGS   += -O2
 endif
 endif
+endif
 
 KBUILD_CFLAGS += $(call cc-ifversion, -lt, 0409, \
$(call cc-disable-warning,maybe-uninitialized,))
diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index 4d36b27214fd..2a76f7c64b54 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -85,7 +85,7 @@
 
 #define __compiletime_object_size(obj) __builtin_object_size(obj, 0)
 
-#ifndef __CHECKER__
+#if !defined(__CHECKER__) && !defined(CONFIG_CC_OPTIMIZE_FOR_DEBUGGING)
 #define __compiletime_warning(message) __attribute__((warning(message)))
 #define __compiletime_error(message) __attribute__((error(message)))
 
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 681d866efb1e..9385c62e9f00 100644
--- a/include/linux/compiler.h
+++ b/include/l

[PATCH 1/4] x86/mm: surround level4_kernel_pgt with #ifdef CONFIG_X86_5LEVEL...#endif

2018-10-18 Thread Du Changbin
From: Changbin Du 

The level4_kernel_pgt is only defined when X86_5LEVEL is enabled. So
surround level4_kernel_pgt with #ifdef CONFIG_X86_5LEVEL...#endif to
make code correct.

Signed-off-by: Changbin Du 
Acked-by: Steven Rostedt (VMware) 
---
 arch/x86/include/asm/pgtable_64.h |  2 ++
 arch/x86/kernel/head64.c  | 13 ++---
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/pgtable_64.h 
b/arch/x86/include/asm/pgtable_64.h
index 9c85b54bf03c..9333f7fa5bdb 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -16,7 +16,9 @@
 #include 
 #include 
 
+#ifdef CONFIG_X86_5LEVEL
 extern p4d_t level4_kernel_pgt[512];
+#endif
 extern p4d_t level4_ident_pgt[512];
 extern pud_t level3_kernel_pgt[512];
 extern pud_t level3_ident_pgt[512];
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index ddee1f0870c4..4a59ef93c258 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -151,16 +151,15 @@ unsigned long __head __startup_64(unsigned long physaddr,
 
pgd = fixup_pointer(&early_top_pgt, physaddr);
p = pgd + pgd_index(__START_KERNEL_map);
-   if (la57)
-   *p = (unsigned long)level4_kernel_pgt;
-   else
-   *p = (unsigned long)level3_kernel_pgt;
-   *p += _PAGE_TABLE_NOENC - __START_KERNEL_map + load_delta;
-
+#ifdef CONFIG_X86_5LEVEL
if (la57) {
+   *p = (unsigned long)level4_kernel_pgt;
p4d = fixup_pointer(&level4_kernel_pgt, physaddr);
p4d[511] += load_delta;
-   }
+   } else
+#endif
+   *p = (unsigned long)level3_kernel_pgt;
+   *p += _PAGE_TABLE_NOENC - __START_KERNEL_map + load_delta;
 
pud = fixup_pointer(&level3_kernel_pgt, physaddr);
pud[510] += load_delta;
-- 
2.17.1



[PATCH 2/4] kernel hacking: new config NO_AUTO_INLINE to disable compiler auto-inline optimizations

2018-10-18 Thread Du Changbin
From: Changbin Du 

This patch adds a new kernel hacking option NO_AUTO_INLINE. Selecting
this option will prevent the compiler from optimizing the kernel by
auto-inlining functions not marked with the inline keyword.

With this option, only functions explicitly marked with "inline" will
be inlined. This will allow the function tracer to trace more functions
because it only traces functions that the compiler has not inlined.

Signed-off-by: Changbin Du 
Acked-by: Steven Rostedt (VMware) 
---
 Makefile  |  6 ++
 lib/Kconfig.debug | 25 +
 2 files changed, 31 insertions(+)

diff --git a/Makefile b/Makefile
index e8b599b4dcde..757d6507cb5c 100644
--- a/Makefile
+++ b/Makefile
@@ -749,6 +749,12 @@ KBUILD_CFLAGS  += $(call cc-option, 
-femit-struct-debug-baseonly) \
   $(call cc-option,-fno-var-tracking)
 endif
 
+ifdef CONFIG_NO_AUTO_INLINE
+KBUILD_CFLAGS   += $(call cc-option, -fno-inline-functions) \
+  $(call cc-option, -fno-inline-small-functions) \
+  $(call cc-option, -fno-inline-functions-called-once)
+endif
+
 ifdef CONFIG_FUNCTION_TRACER
 ifdef CONFIG_FTRACE_MCOUNT_RECORD
   # gcc 5 supports generating the mcount tables directly
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 4966c4fbe7f7..0f9b4fa78b1c 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -211,6 +211,31 @@ config GDB_SCRIPTS
  instance. See Documentation/dev-tools/gdb-kernel-debugging.rst
  for further details.
 
+config NO_AUTO_INLINE
+   bool "Disable compiler auto-inline optimizations"
+   help
+ This will prevent the compiler from optimizing the kernel by
+ auto-inlining functions not marked with the inline keyword.
+ With this option, only functions explicitly marked with
+ "inline" will be inlined. This will allow the function tracer
+ to trace more functions because it only traces functions that
+ the compiler has not inlined.
+
+ Enabling this function can help debugging a kernel if using
+ the function tracer. But it can also change how the kernel
+ works, because inlining functions may change the timing,
+ which could make it difficult while debugging race conditions.
+
+ If unsure, select N.
+
+config ENABLE_WARN_DEPRECATED
+   bool "Enable __deprecated logic"
+   default y
+   help
+ Enable the __deprecated logic in the kernel build.
+ Disable this to suppress the "warning: 'foo' is deprecated
+ (declared at kernel/power/somefile.c:1234)" messages.
+
 config ENABLE_MUST_CHECK
bool "Enable __must_check logic"
default y
-- 
2.17.1



[PATCH 0/4] kernel hacking: GCC optimization for better debug experience (-Og)

2018-10-18 Thread Du Changbin
Hi all,
I posted this series several months ago but was interrupted by personal
affairs. Now I have time to complete this task. Thanks to all of the
reviewers.

I know some kernel developers were searching for a method to disable GCC
optimizations; probably they want to apply the GCC '-O0' option. But since the
Linux kernel relies on GCC optimization to remove some dead code, '-O0' just
breaks the build. They do need this because they want to debug the kernel with
qemu, simics, kgtp or kgdb.

Fortunately, GCC 4.8 introduced the '-Og' optimization level, which
offers a reasonable level of optimization while maintaining fast compilation
and a good debugging experience. It is similar to '-O1' while preferring to
keep debuggability over runtime speed. With '-Og', we can build a kernel with
better debuggability and little performance drop after some simple changes.

This series first introduces a new config CONFIG_NO_AUTO_INLINE, after two
fixes for this new option. With this option, only functions explicitly marked
with "inline" will be inlined. This will allow the function tracer to trace
more functions because it only traces functions that the compiler has not
inlined.

It then introduces a new config CC_OPTIMIZE_FOR_DEBUGGING, which applies the
'-Og' optimization level to the whole kernel, with a simple fix in fix_to_virt().
Currently I have only tested this option on the x86 and ARM platforms. Other
platforms should also work but will probably need some compile fixes like those
done in this series. I leave that to whoever wants to try this debug
option.

Comparison of vmlinux size: a bit smaller.

w/o CONFIG_CC_OPTIMIZE_FOR_DEBUGGING
$ size vmlinux
    text     data     bss      dec     hex filename
22665554  9709674 2920908 35296136 21a9388 vmlinux

w/ CONFIG_CC_OPTIMIZE_FOR_DEBUGGING
$ size vmlinux
    text     data     bss      dec     hex filename
21499032 10102758 2920908 34522698 20ec64a vmlinux


Comparison of system performance: a slight drop (~6%).
This benchmark of kernel compilation is suggested by Ingo Molnar.
https://lkml.org/lkml/2018/5/2/74

Preparation: Set cpufreq to 'performance'.
for ((cpu=0; cpu<120; cpu++)); do
  G=/sys/devices/system/cpu/cpu$cpu/cpufreq/scaling_governor
  [ -f $G ] && echo performance > $G
done

w/o CONFIG_CC_OPTIMIZE_FOR_DEBUGGING
$ perf stat --repeat 5 --null --pre '\
cp -a kernel ../kernel.copy.$(date +%s); \
rm -rf *;\
git checkout .;  \
echo 1 > /proc/sys/vm/drop_caches;   \
find ../kernel* -type f | xargs cat >/dev/null;  \
make -j kernel >/dev/null;   \
make clean >/dev/null 2>&1;  \
sync'\
 \
make -j8 >/dev/null

 Performance counter stats for 'make -j8' (5 runs):

219.764246652 seconds time elapsed   ( +-  0.78% )

w/ CONFIG_CC_OPTIMIZE_FOR_DEBUGGING
$ perf stat --repeat 5 --null --pre '\
cp -a kernel ../kernel.copy.$(date +%s); \
rm -rf *;\
git checkout .;  \
echo 1 > /proc/sys/vm/drop_caches;   \
find ../kernel* -type f | xargs cat >/dev/null;  \
make -j kernel >/dev/null;   \
make clean >/dev/null 2>&1;  \
sync'\
 \
make -j8 >/dev/null

Performance counter stats for 'make -j8' (5 runs):

 233.574187771 seconds time elapsed  ( +-  0.19% )


Changbin Du (4):
  x86/mm: surround level4_kernel_pgt with #ifdef
CONFIG_X86_5LEVEL...#endif
  kernel hacking: new config NO_AUTO_INLINE to disable compiler
auto-inline optimizations
  ARM: mm: fix build error in fix_to_virt with
CONFIG_CC_OPTIMIZE_FOR_DEBUGGING
  kernel hacking: new config CC_OPTIMIZE_FOR_DEBUGGING to apply GCC -Og
optimization

 Makefile  | 11 +++
 arch/arm/mm/mmu.c |  2 +-
 arch/x86/include/asm/pgtable_64.h |  2 ++
 arch/x86/kernel/head64.c  | 13 ++---
 include/linux/compiler-gcc.h  |  2 +-
 include/linux/compiler.h  |  2 +-
 init/Kconfig  | 19 +++
 lib/Kconfig.debug | 25 +
 8 files changed, 66 insertions(+), 10 deletions(-)

-- 
2.17.1



[PATCH v3] scripts/gdb: fix lx-version

2018-10-17 Thread Du Changbin
With current gdb versions (tested with 7.3 and 8.1), 'lx-version'
only prints one character.
(gdb) lx-version
L(gdb)

This can be fixed by casting 'linux_banner' to (char *).
(gdb) lx-version
Linux version 4.19.0-rc1+ (changbin@acer) (gcc version 7.3.0 (Ubuntu 
7.3.0-16ubuntu3)) #21 SMP Sat Sep 1 21:43:30 CST 2018

Signed-off-by: Du Changbin 
---
 scripts/gdb/linux/proc.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/gdb/linux/proc.py b/scripts/gdb/linux/proc.py
index 086d27223c0c..0aebd7565b03 100644
--- a/scripts/gdb/linux/proc.py
+++ b/scripts/gdb/linux/proc.py
@@ -41,7 +41,7 @@ class LxVersion(gdb.Command):
 
 def invoke(self, arg, from_tty):
 # linux_banner should contain a newline
-gdb.write(gdb.parse_and_eval("linux_banner").string())
+gdb.write(gdb.parse_and_eval("(char *)linux_banner").string())
 
 LxVersion()
 
-- 
2.17.1



Re: [PATCH] PCI: make pci_size() return real size

2018-10-17 Thread Du Changbin
Hi Bjorn. Have you checked this little improvement? The idea here is that
this is not a hotspot, so readability matters more than tricks. Thanks!

On Sat, Oct 13, 2018 at 08:49:19AM +0800, changbin...@gmail.com wrote:
> From: Du Changbin 
> 
> Currently, the pci_size() function actually returns 'size-1'.
> Make it return the real size to avoid confusion.
> 
> Signed-off-by: Du Changbin 
> ---
>  drivers/pci/probe.c | 8 
>  1 file changed, 4 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
> index 201f9e5ff55c..8ff2b1413865 100644
> --- a/drivers/pci/probe.c
> +++ b/drivers/pci/probe.c
> @@ -121,13 +121,13 @@ static u64 pci_size(u64 base, u64 maxbase, u64 mask)
>* Get the lowest of them to find the decode size, and from that
>* the extent.
>*/
> - size = (size & ~(size-1)) - 1;
> + size = size & ~(size-1);
>  
>   /*
>* base == maxbase can be valid only if the BAR has already been
>* programmed with all 1s.
>*/
> - if (base == maxbase && ((base | size) & mask) != mask)
> + if (base == maxbase && ((base | (size - 1)) & mask) != mask)
>   return 0;
>  
>   return size;
> @@ -278,7 +278,7 @@ int __pci_read_base(struct pci_dev *dev, enum 
> pci_bar_type type,
>   /* Above 32-bit boundary; try to reallocate */
>   res->flags |= IORESOURCE_UNSET;
>   res->start = 0;
> - res->end = sz64;
> + res->end = sz64 - 1;
>   pci_info(dev, "reg 0x%x: can't handle BAR above 4GB 
> (bus address %#010llx)\n",
>pos, (unsigned long long)l64);
>   goto out;
> @@ -286,7 +286,7 @@ int __pci_read_base(struct pci_dev *dev, enum 
> pci_bar_type type,
>   }
>  
>   region.start = l64;
> - region.end = l64 + sz64;
> + region.end = l64 + sz64 - 1;
>  
>   pcibios_bus_to_resource(dev->bus, res, &region);
>   pcibios_resource_to_bus(dev->bus, &inverted_region, res);
> -- 
> 2.17.1
> 

-- 
Thanks,
Du Changbin


[PATCH v2] scripts/gdb: fix lx-version for gdb 7.3-

2018-10-17 Thread Du Changbin
For gdb version less than 7.3, lx-version only prints one character.
(gdb) lx-version
L(gdb)

This can be fixed by casting 'linux_banner' as (char *).
(gdb) lx-version
Linux version 4.19.0-rc1+ (changbin@acer) (gcc version 7.3.0 (Ubuntu 
7.3.0-16ubuntu3)) #21 SMP Sat Sep 1 21:43:30 CST 2018

gdb 7.4 does not seem to have this issue.

Signed-off-by: Du Changbin 
---
 scripts/gdb/linux/proc.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/gdb/linux/proc.py b/scripts/gdb/linux/proc.py
index 086d27223c0c..0aebd7565b03 100644
--- a/scripts/gdb/linux/proc.py
+++ b/scripts/gdb/linux/proc.py
@@ -41,7 +41,7 @@ class LxVersion(gdb.Command):
 
 def invoke(self, arg, from_tty):
 # linux_banner should contain a newline
-gdb.write(gdb.parse_and_eval("linux_banner").string())
+gdb.write(gdb.parse_and_eval("(char *)linux_banner").string())
 
 LxVersion()
 
-- 
2.17.1



[PATCH] scripts/gdb: fix lx-version for gdb 7.3-

2018-10-16 Thread Du Changbin
For gdb version less than 7.3, lx-version only prints one character.
(gdb) lx-version
L(gdb)

This can be fixed by casting 'linux_banner' as (char *).
(gdb) lx-version
Linux version 4.19.0-rc1+ (changbin@acer) (gcc version 7.3.0 (Ubuntu 
7.3.0-16ubuntu3)) #21 SMP Sat Sep 1 21:43:30 CST 2018

gdb 7.4 does not seem to have this issue.

Signed-off-by: Du Changbin 
---
 scripts/gdb/linux/proc.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/gdb/linux/proc.py b/scripts/gdb/linux/proc.py
index 086d27223c0c..0aebd7565b03 100644
--- a/scripts/gdb/linux/proc.py
+++ b/scripts/gdb/linux/proc.py
@@ -41,7 +41,7 @@ class LxVersion(gdb.Command):
 
 def invoke(self, arg, from_tty):
 # linux_banner should contain a newline
-gdb.write(gdb.parse_and_eval("linux_banner").string())
+gdb.write(gdb.parse_and_eval("(char *)linux_banner").string())
 
 LxVersion()
 
-- 
2.17.1



Re: kernel BUG at arch/x86/kvm/x86.c:LINE! (2)

2018-10-10 Thread Du Changbin
try/entry_64.S:993
RIP: 0010:kvm_spurious_fault+0x9/0x10 arch/x86/kvm/x86.c:353
Code: 45 10 50 e8 e9 44 7c 00 58 5a 48 8d 65 d8 5b 41 5c 41 5d 41 5e 41 5f
5d c3 0f 1f 84 00 00 00 00 00 55 48 89 e5 e8 97 03 73 00 <0f> 0b 0f 1f 44 00
00 55 48 89 e5 41 57 41 56 41 55 41 89 fd 41 54
RSP: 0018:88018b397448 EFLAGS: 00010212
RAX: 0004 RBX: 110031672e8d RCX: c9000628e000
RDX: 0417 RSI: 810bd1f9 RDI: 88018b397488
RBP: 88018b397448 R08: 8801cc2f2180 R09: 8801c308d000
R10: ed0038611bff R11: 8801c308dfff R12: 88018b3974c8
R13: dc00 R14: 8801c308d000 R15: 88018b397488
 kvm_fastop_exception+0x50b/0x5455
 loaded_vmcs_init arch/x86/kvm/vmx.c:2129 [inline]
 alloc_loaded_vmcs+0x7f/0x280 arch/x86/kvm/vmx.c:4766
 vmx_create_vcpu+0x20e/0x25e0 arch/x86/kvm/vmx.c:11025
 kvm_arch_vcpu_create+0xe5/0x220 arch/x86/kvm/x86.c:8471
 kvm_vm_ioctl_create_vcpu arch/x86/kvm/../../../virt/kvm/kvm_main.c:2476
[inline]
 kvm_vm_ioctl+0x470/0x1d40 arch/x86/kvm/../../../virt/kvm/kvm_main.c:2977
 kvm_vm_compat_ioctl+0x143/0x430
arch/x86/kvm/../../../virt/kvm/kvm_main.c:3170
 __do_compat_sys_ioctl fs/compat_ioctl.c:1419 [inline]
 __se_compat_sys_ioctl fs/compat_ioctl.c:1365 [inline]
 __ia32_compat_sys_ioctl+0x20e/0x630 fs/compat_ioctl.c:1365
 do_syscall_32_irqs_on arch/x86/entry/common.c:326 [inline]
 do_fast_syscall_32+0x34d/0xfb2 arch/x86/entry/common.c:397
 entry_SYSENTER_compat+0x70/0x7f arch/x86/entry/entry_64_compat.S:139
RIP: 0023:0xf7fecca9
Code: 85 d2 74 02 89 0a 5b 5d c3 8b 04 24 c3 8b 0c 24 c3 8b 1c 24 c3 90 90
90 90 90 90 90 90 90 90 90 90 51 52 55 89 e5 0f 34 cd 80 <5d> 5a 59 c3 90 90
90 90 eb 0d 90 90 90 90 90 90 90 90 90 90 90 90
RSP: 002b:f5fa60cc EFLAGS: 0296 ORIG_RAX: 0036
RAX: ffda RBX: 000c RCX: ae41
RDX:  RSI:  RDI: 
RBP:  R08:  R09: 
R10:  R11:  R12: 
R13:  R14:  R15: 
Modules linked in:
---[ end trace 1c8fec48833612c0 ]---
RIP: 0010:kvm_spurious_fault+0x9/0x10 arch/x86/kvm/x86.c:353
Code: 45 10 50 e8 e9 44 7c 00 58 5a 48 8d 65 d8 5b 41 5c 41 5d 41 5e 41 5f
5d c3 0f 1f 84 00 00 00 00 00 55 48 89 e5 e8 97 03 73 00 <0f> 0b 0f 1f 44 00
00 55 48 89 e5 41 57 41 56 41 55 41 89 fd 41 54
RSP: 0018:8801dae07bd8 EFLAGS: 00010006
RAX: 8801cc2f2180 RBX: 11003b5c0f7f RCX: 81385bcc
RDX: 0001 RSI: 810bd1f9 RDI: 8801dae07c18
RBP: 8801dae07bd8 R08: 8801cc2f2180 R09: ed003b5c5ba0
R10: ed003b5c5ba0 R11: 8801dae2dd07 R12: 8801dae07c58
R13: dc00 R14: 8801beccc000 R15: 8801dae07c18
FS:  () GS:8801dae0(0063) knlGS:f5fa6b40
CS:  0010 DS: 002b ES: 002b CR0: 80050033
CR2: 8801dae07c18 CR3: 0001cc8d1000 CR4: 001426f0
DR0:  DR1:  DR2: 
DR3:  DR6: fffe0ff0 DR7: 0400


---
This bug is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at syzkal...@googlegroups.com.

syzbot will keep track of this bug report. See:
https://goo.gl/tpsmEJ#bug-status-tracking for how to communicate with
syzbot.


--
Thanks,
Du Changbin


Re: [PATCH v5 2/4] kernel hacking: new config NO_AUTO_INLINE to disable compiler auto-inline optimizations

2018-06-07 Thread Du, Changbin
Hi,
On Thu, Jun 07, 2018 at 09:47:18AM +0530, Viresh Kumar wrote:
> +Greg/Alex,
> 
> @Fegguang/build-bot: I do see mention of Greg and /me in your initial email's
> body saying TO: Viresh, CC: Greg, but I don't see any of us getting cc'd in 
> your
> email. Bug ?
> 
> On 06-06-18, 14:26, Steven Rostedt wrote:
> > On Wed, 6 Jun 2018 16:26:00 +0200
> > Johan Hovold  wrote:
> > 
> > > Looks like the greybus code above is working as intended by checking for
> > > unterminated string after the strncpy, even if this does now triggers
> > > the truncation warning.
> 
> So why exactly are we generating a warning here ? Is it because it is possible
> that the first n bytes of src may not have the null terminating byte and the
> dest may not be null terminated eventually ?
> 
> Maybe I should just use memcpy here then ?
> 
I think if the destination is not a null terminated string (If I understand your
description below), memcpy can be used to get rid of such a warning. The warning
makes sense in general as explained in the manual. Thanks!

> But AFAIR, I used strncpy() specifically because it also sets all the 
> remaining
> bytes after the null terminating byte with the null terminating byte. And so 
> it
> is pretty easy for me to check if the final string is null terminated by
> checking [max - 1] byte against '\0', which the code is doing right now.
> 
> I am not sure what would the best way to get around this incorrect-warning.
> 
> And I am wondering on why buildbot reported the warning only for two instances
> in that file, while I have done the same thing at 4 places.
> 
> > Ah, yes I now see that. Thanks for pointing it out. But perhaps it
> > should also add the "- 1" to the strncpy() so that gcc doesn't think
> > it's a mistake.
> 
> The src string is passed on from a firmware entity and we need to make sure 
> the
> protocol (greybus) is implemented properly by the other end. For example, in 
> the
> current case if the firmware sends "HELLOWORLD", its an error as it should 
> have
> sent "HELLWORLD\0". But with what you are saying we will forcefully make dest 
> as
> "HELLWORLD\0", which wouldn't be the right thing to do as we will miss the bug
> present in firmware.
> 
> -- 
> viresh

-- 
Thanks,
Changbin Du


Re: [PATCH] scripts/faddr2line: show the code context

2018-06-03 Thread Du, Changbin
On Tue, May 29, 2018 at 06:03:32PM +0200, Peter Zijlstra wrote:
> On Mon, Mar 19, 2018 at 03:23:25PM +0800, changbin...@intel.com wrote:
> > From: Changbin Du 
> > 
> > Inspired by gdb command 'list', show the code context of target lines.
> > Here is a example:
> > 
> > $ scripts/faddr2line vmlinux native_write_msr+0x6
> > native_write_msr+0x6/0x20:
> > arch_static_branch at arch/x86/include/asm/msr.h:105
> > 100 return EAX_EDX_VAL(val, low, high);
> > 101 }
> > 102
> > 103 static inline void notrace __wrmsr(unsigned int msr, u32 low, u32 
> > high)
> > 104 {
> > 105 asm volatile("1: wrmsr\n"
> > 106  "2:\n"
> > 107  _ASM_EXTABLE_HANDLE(1b, 2b, 
> > ex_handler_wrmsr_unsafe)
> > 108  : : "c" (msr), "a"(low), "d" (high) : 
> > "memory");
> > 109 }
> > 110
> > (inlined by) static_key_false at include/linux/jump_label.h:142
> > 137 #define JUMP_TYPE_LINKED2UL
> > 138 #define JUMP_TYPE_MASK  3UL
> > 139
> > 140 static __always_inline bool static_key_false(struct static_key *key)
> > 141 {
> > 142 return arch_static_branch(key, false);
> > 143 }
> > 144
> > 145 static __always_inline bool static_key_true(struct static_key *key)
> > 146 {
> > 147 return !arch_static_branch(key, true);
> > (inlined by) native_write_msr at arch/x86/include/asm/msr.h:150
> > 145 static inline void notrace
> > 146 native_write_msr(unsigned int msr, u32 low, u32 high)
> > 147 {
> > 148 __wrmsr(msr, low, high);
> > 149
> > 150 if (msr_tracepoint_active(__tracepoint_write_msr))
> > 151 do_trace_write_msr(msr, ((u64)high << 32 | low), 0);
> > 152 }
> > 153
> > 154 /* Can be uninlined because referenced by paravirt */
> > 155 static inline int notrace
> 
> Not a fan of this :-/ And you didn't even make it optional. Nor did you
> Cc the original author of the tool.
Yeah, I understand your compatibility concern, and thanks for your improvement.
I only added people from 'scripts/get_maintainer.pl'.


-- 
Thanks,
Changbin Du


Re: [PATCH] scripts/faddr2line: show the code context

2018-06-03 Thread Du, Changbin
On Wed, May 30, 2018 at 08:01:48AM +1000, NeilBrown wrote:
> On Tue, May 29 2018, Peter Zijlstra wrote:
> 
> > On Tue, May 29, 2018 at 12:07:10PM -0500, Josh Poimboeuf wrote:
> >> Yeah, this change really should have been an optional arg.  It hurt the
> >> readability and compactness of the output.  The above looks good to me.
> >> 
> >> Care to send a proper patch?  If you send it to Linus he might apply it
> >> directly as he did with my original patches.
> >
> > ---
> > From: Peter Zijlstra (Intel) 
> >
> > Commit 6870c0165feaa5 ("scripts/faddr2line: show the code context")
> > radically altered the output format of the faddr2line tool. And while
> > the new list output format might have merit, it broke my vim usage and
> > was hard to read.
> >
> > Make the new format optional; using a '--list' argument and attempt to
> > make the output slightly easier to read by adding a little whitespace to
> > separate the different files and explicitly mark the line in question.
> 
> Not commenting on your code but on the original patch.
> I've recently noticed that ADDR2LINE sometimes outputs
>   (discriminator 2)
> or similar at the end of the line.  This messes up the parsing.
> 
> I hacked it to work so I could keep debugging with
> 
> - local file_lines=$(${ADDR2LINE} -fpie $objfile $addr | sed "s; 
> $dir_prefix\(\./\)*; ;")
> + local file_lines=$(${ADDR2LINE} -fpie $objfile $addr | sed -e 
> "s; $dir_prefix\(\./\)*; ;" -e "s/(discriminator [0-9]*)//")
> 
> but someone should probably find out exactly what sort of messages
> ADDR2LINE produces, and make sure they are all parsed correctly.
> (maybe that someone should be me, but not today).
> 
Hi, I have fixed it by commit 78eb0c6356 ("scripts/faddr2line: fix error when
addr2line output contains discriminator") and it is already in the mainline now.
Thank you!

> Thanks,
> NeilBrown
> 
> 
> >
> > Cc: Changbin Du 
> > Acked-by: Josh Poimboeuf 
> > Fixes: 6870c0165feaa5 ("scripts/faddr2line: show the code context")
> > Signed-off-by: Peter Zijlstra (Intel) 
> > ---
> >  scripts/faddr2line | 18 --
> >  1 file changed, 16 insertions(+), 2 deletions(-)
> >
> > diff --git a/scripts/faddr2line b/scripts/faddr2line
> > index 1876a741087c..a0149db00be7 100755
> > --- a/scripts/faddr2line
> > +++ b/scripts/faddr2line
> > @@ -56,7 +56,7 @@ command -v ${SIZE} >/dev/null 2>&1 || die "size isn't 
> > installed"
> >  command -v ${NM} >/dev/null 2>&1 || die "nm isn't installed"
> >  
> >  usage() {
> > -   echo "usage: faddr2line   ..." 
> > >&2
> > +   echo "usage: faddr2line [--list]   
> > ..." >&2
> > exit 1
> >  }
> >  
> > @@ -166,15 +166,25 @@ __faddr2line() {
> > local file_lines=$(${ADDR2LINE} -fpie $objfile $addr | sed "s; 
> > $dir_prefix\(\./\)*; ;")
> > [[ -z $file_lines ]] && return
> >  
> > +   if [[ $LIST = 0 ]]; then
> > +   echo "$file_lines" | while read -r line
> > +   do
> > +   echo $line
> > +   done
> > +   DONE=1;
> > +   return
> > +   fi
> > +
> > # show each line with context
> > echo "$file_lines" | while read -r line
> > do
> > +   echo
> > echo $line
> > n=$(echo $line | sed 's/.*:\([0-9]\+\).*/\1/g')
> > n1=$[$n-5]
> > n2=$[$n+5]
> > f=$(echo $line | sed 's/.*at \(.\+\):.*/\1/g')
> > -   awk 'NR>=strtonum("'$n1'") && NR<=strtonum("'$n2'") 
> > {printf("%d\t%s\n", NR, $0)}' $f
> > +   awk 'NR>=strtonum("'$n1'") && NR<=strtonum("'$n2'") { 
> > if (NR=='$n') printf(">%d<", NR); else printf(" %d ", NR); printf("\t%s\n", 
> > $0)}' $f
> > done
> >  
> > DONE=1
> > @@ -185,6 +195,10 @@ __faddr2line() {
> >  [[ $# -lt 2 ]] && usage
> >  
> >  objfile=$1
> > +
> > +LIST=0
> > +[[ "$objfile" == "--list" ]] && LIST=1 && shift && objfile=$1
> > +
> >  [[ ! -f $objfile ]] && die "can't find objfile $objfile"
> >  shift
> >  



-- 
Thanks,
Changbin Du


Re: [PATCH v4 4/4] asm-generic: fix build error in fix_to_virt with CONFIG_CC_OPTIMIZE_FOR_DEBUGGING

2018-05-10 Thread Du, Changbin
On Wed, May 09, 2018 at 08:52:24AM -0400, Steven Rostedt wrote:
> On Wed,  9 May 2018 16:43:16 +0800
> changbin...@intel.com wrote:
> 
> > From: Changbin Du 
> > 
> > With '-Og' optimization level, GCC would not optimize a count for a loop
> > as a constant value. But BUILD_BUG_ON() only accept compile-time constant
> > values. Let's use __fix_to_virt() to avoid the error.
> > 
> > arch/arm/mm/mmu.o: In function `fix_to_virt':
> > /home/changbin/work/linux/./include/asm-generic/fixmap.h:31: undefined 
> > reference to `__compiletime_assert_31'
> > Makefile:1051: recipe for target 'vmlinux' failed
> > make: *** [vmlinux] Error 1
> 
> Perhaps we should put this patch ahead of patch 3. Why allow it to
> break?
> 
Agree, let me exchange the last two patches.

> Anyway, besides that, I think the series looks good.
> 
> For the series: Acked-by: Steven Rostedt (VMware) 
> 
> -- Steve
> 
> 
> > 
> > Signed-off-by: Changbin Du 
> > 
> > ---
> > v2: use __fix_to_virt() to fix the issue.
> > ---
> >  arch/arm/mm/mmu.c | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> > 
> > diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
> > index e46a6a4..c08d74e 100644
> > --- a/arch/arm/mm/mmu.c
> > +++ b/arch/arm/mm/mmu.c
> > @@ -1599,7 +1599,7 @@ static void __init early_fixmap_shutdown(void)
> > pte_t *pte;
> > struct map_desc map;
> >  
> > -   map.virtual = fix_to_virt(i);
> > +   map.virtual = __fix_to_virt(i);
> > pte = pte_offset_early_fixmap(pmd_off_k(map.virtual), 
> > map.virtual);
> >  
> > /* Only i/o device mappings are supported ATM */
> 

-- 
Thanks,
Changbin Du


Re: [PATCH v3 2/5] regulator: add dummy function of_find_regulator_by_node

2018-05-09 Thread Du, Changbin
On Wed, May 09, 2018 at 05:21:14PM +0900, Mark Brown wrote:
> On Sun, May 06, 2018 at 08:20:13AM +0800, changbin...@intel.com wrote:
> > From: Changbin Du 
> > 
> > If device tree is not enabled, of_find_regulator_by_node() should have
> > a dummy function since the function call is still there.
> 
> Please do not submit new versions of already applied patches, please
> submit incremental updates to the existing code.  Modifying existing
> commits creates problems for other users building on top of those
> commits so it's best practice to only change published git commits if
> absolutely essential.

Hmm, I saw your merging notification too late. Let me refresh the series. Sorry
for the confusion.

-- 
Thanks,
Changbin Du


Re: [PATCH v2 4/5] kernel hacking: new config DEBUG_EXPERIENCE to apply GCC -Og optimization

2018-05-05 Thread Du, Changbin
On Thu, May 03, 2018 at 10:28:23AM -0400, Steven Rostedt wrote:
> On Thu, 3 May 2018 21:45:46 +0800
> "Du, Changbin"  wrote:
> 
> > > With that gcc comment, I still think CONFIG_OPTIMIZE_DEBUG is more
> > > inline with what it is and understandable than
> > > CONFIG_DEBUG_EXPERIENCE. The "OPTIMIZE" is the key word there.
> > > 
> > > -- Steve  
> > > What about CONFIG_CC_OPTIMIZE_FOR_DEBUGGING? We already have
> > CONFIG_CC_OPTIMIZE_FOR_SIZE and CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE.
> 
> Yes I like that much better.
> 
> > 
> > And do we need to move it to existing configuration menu "General setup->
> > Compiler optimization level"? But I also want it appear in "kernel hacking"
> > since this is a debug option.
> 
> I understand why you would want it by debugging, but I think it does
> make more sense to be included with the above two other options, as
> they are all mutually exclusive.
> 
> This brings up the topic of creating config paradigms. That is, a way
> of saying "I want a debug kernel" and select one option that selects
> everything you would expect. Or perhaps we should have a:
> 
>  make debug_config
> 
Agree, I accomplish this by running script scripts/kconfig/merge_config.sh.

> that does it.
> 
> But that's a different topic. For now, I would just included it in
> init/Kconfig, and not worry about it not showing up in kernel hacking.
> 
> 
> -- Steve

-- 
Thanks,
Changbin Du


Re: [PATCH v2 0/5] kernel hacking: GCC optimization for debug experience (-Og)

2018-05-03 Thread Du, Changbin
On Wed, May 02, 2018 at 03:56:31PM +0100, Daniel Thompson wrote:
> On Wed, May 02, 2018 at 09:44:55PM +0800, changbin...@intel.com wrote:
> > From: Changbin Du 
> > 
> > Hi all,
> > I know some kernel developers were searching for a method to disable GCC
> > optimizations, probably they want to apply GCC '-O0' option. But since Linux
> > kernel relies on GCC optimization to remove some dead code, '-O0' just
> > breaks the build. They do need this because they want to debug kernel with
> > qemu, simics, kgtp or kgdb.
> > 
> > Thanks for the GCC '-Og' optimization level introduced in GCC 4.8, which
> > offers a reasonable level of optimization while maintaining fast compilation
> > and a good debugging experience. It is similar to '-O1' while preferring to keep
> > debug ability over runtime speed. With '-Og', we can build a kernel with
> > better debug ability and little performance drop after some simple change.
> > 
> > In this series, firstly introduce a new config CONFIG_NO_AUTO_INLINE after 
> > two
> > fixes for this new option. With this option, only functions explicitly 
> > marked
> > with "inline" will  be inlined. This will allow the function tracer to trace
> > more functions because it only traces functions that the compiler has not
> > inlined.
> > 
> > Then introduce new config CONFIG_DEBUG_EXPERIENCE which apply '-Og'
> > optimization level for whole kernel, with a simple fix in fix_to_virt().
> > Currently this option is only tested on a QEMU guest and it works fine.
> > 
> > 
> > Comparison of vmlinux size: a bit smaller.
> > 
> > w/o CONFIG_DEBUG_EXPERIENCE
> > $ size vmlinux
> >     text     data     bss      dec     hex filename
> > 22665554  9709674 2920908 35296136 21a9388 vmlinux
> > 
> > w/ CONFIG_DEBUG_EXPERIENCE
> > $ size vmlinux
> >     text     data     bss      dec     hex filename
> > 21499032 10102758 2920908 34522698 20ec64a vmlinux
> > 
> > 
> > Comparison of system performance: a bit drop (~6%).
> > This benchmark of kernel compilation is suggested by Ingo Molnar.
> > https://lkml.org/lkml/2018/5/2/74
> 
> In my mind was the opposite question. When running on the same kernel
> does a kernel whose config contains CONFIG_DEBUG_EXPERIENCE build faster
> than one without (due to the disabled optimization passes).
> 
> To be honest this is more curiosity than a review comment though... if
> you have the figures please share, if not then don't sweat it on my
> account!
> 
> 
> Daniel.
>
Sorry I don't have the data yet. Per the comment in GCC, I think it should be a
little faster but not obviously.

> 
-- 
Thanks,
Changbin Du


Re: [PATCH v2 4/5] kernel hacking: new config DEBUG_EXPERIENCE to apply GCC -Og optimization

2018-05-03 Thread Du, Changbin
On Wed, May 02, 2018 at 09:19:56PM -0400, Steven Rostedt wrote:
> On Wed, 2 May 2018 13:45:58 -0700
> Andrew Morton  wrote:
> 
> > On Wed, 2 May 2018 10:17:07 -0400 Steven Rostedt  
> > wrote:
> > 
> > > > Comparison of vmlinux size: a bit smaller.
> > > > 
> > > > w/o CONFIG_DEBUG_EXPERIENCE  
> > > 
> > > I hate the config name.
> > > 
> > > I probably can't come up with better ones but let's try:
> > > 
> > >  CONFIG_DEBUG_OPTIMIZE ?
> > >  CONFIG_OPTIMIZE_DEBUG ?
> > > 
> > > But "EXPERIENCE" sounds like I'm on some DEBUG LSD.  
> > 
> > Metoo, but the gcc people decided on "-Og: Optimize debugging
> > experience ..." and I think there are benefits if the kernel is to
> > align the naming with that.
> 
> I still see that as "Optimize debugging" and "experience" is just the
> platform of what was done.
> 
> With that gcc comment, I still think CONFIG_OPTIMIZE_DEBUG is more
> inline with what it is and understandable than
> CONFIG_DEBUG_EXPERIENCE. The "OPTIMIZE" is the key word there.
> 
> -- Steve
What about CONFIG_CC_OPTIMIZE_FOR_DEBUGGING? We already have
CONFIG_CC_OPTIMIZE_FOR_SIZE and CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE.

And do we need to move it to existing configuration menu "General setup->
Compiler optimization level"? But I also want it appear in "kernel hacking"
since this is a debug option.

-- 
Thanks,
Changbin Du


Re: [PATCH v2 5/5] asm-generic: fix build error in fix_to_virt with CONFIG_DEBUG_EXPERIENCE

2018-05-03 Thread Du, Changbin
On Wed, May 02, 2018 at 10:19:30AM -0400, Steven Rostedt wrote:
> On Wed,  2 May 2018 21:45:00 +0800
> changbin...@intel.com wrote:
> 
> > From: Changbin Du 
> > 
> > With '-Og' optimization level, GCC would not optimize a count for a loop
> > as a constant value. But BUILD_BUG_ON() only accept compile-time constant
> > values.
> > 
> > arch/arm/mm/mmu.o: In function `fix_to_virt':
> > /home/changbin/work/linux/./include/asm-generic/fixmap.h:31: undefined 
> > reference to `__compiletime_assert_31'
> > Makefile:1051: recipe for target 'vmlinux' failed
> > make: *** [vmlinux] Error 1
> > 
> > Signed-off-by: Changbin Du 
> > ---
> >  include/asm-generic/fixmap.h | 3 ++-
> >  1 file changed, 2 insertions(+), 1 deletion(-)
> > 
> > diff --git a/include/asm-generic/fixmap.h b/include/asm-generic/fixmap.h
> > index 827e4d3..a6576d4 100644
> > --- a/include/asm-generic/fixmap.h
> > +++ b/include/asm-generic/fixmap.h
> > @@ -28,7 +28,8 @@
> >   */
> >  static __always_inline unsigned long fix_to_virt(const unsigned int idx)
> >  {
> > -   BUILD_BUG_ON(idx >= __end_of_fixed_addresses);
> > +   BUILD_BUG_ON(__builtin_constant_p(idx) &&
> > +idx >= __end_of_fixed_addresses);
> 
> Hmm, this changes the check slightly. Perhaps we should only do this
> when your config is active:
> 
> {
>   BUILD_BUG_ON(
> /* CONFIG_DEBUG_OPTIMIZE may cause idx not to be constant */
> #ifdef CONFIG_DEBUG_OPTIMIZE
>   __builtin_constant_p(idx) &&
> #endif
>   idx >= __end_of_fixed_addresses);
> 
> }
I think fix_to_virt() is designed for constant idx only. So I think we should
fix it at the caller side by replacing it with __fix_to_virt().

--- a/arch/arm/mm/mmu.c
+++ b/arch/arm/mm/mmu.c
@@ -1599,7 +1599,7 @@ static void __init early_fixmap_shutdown(void)
pte_t *pte;
struct map_desc map;

-   map.virtual = fix_to_virt(i);
+   map.virtual = __fix_to_virt(i);
pte = pte_offset_early_fixmap(pmd_off_k(map.virtual), 
map.virtual);

> 
> -- Steve
> 
> > return __fix_to_virt(idx);
> >  }
> >  
> 

-- 
Thanks,
Changbin Du


Re: [PATCH v2 3/5] kernel hacking: new config NO_AUTO_INLINE to disable compiler auto-inline optimizations

2018-05-03 Thread Du, Changbin
On Wed, May 02, 2018 at 04:27:47PM -0400, Arnd Bergmann wrote:
> On Wed, May 2, 2018 at 9:44 AM,   wrote:
> > From: Changbin Du 
> >
> > This patch add a new kernel hacking option NO_AUTO_INLINE. Selecting
> > this option will prevent the compiler from optimizing the kernel by
> > auto-inlining functions not marked with the inline keyword.
> >
> > With this option, only functions explicitly marked with "inline" will
> > be inlined. This will allow the function tracer to trace more functions
> > because it only traces functions that the compiler has not inlined.
> >
> > Signed-off-by: Changbin Du 
> > Cc: Steven Rostedt 
> 
> Should this be closer to CONFIG_OPTIMIZE_INLINING or
> possibly mutually exclusive with it?
>
They are not related I think. CONFIG_OPTIMIZE_INLINING only has effect on
functions which are explicitly marked as inline.

>Arnd

-- 
Thanks,
Changbin Du


Re: [PATCH 3/5] kernel hacking: new config NO_AUTO_INLINE to disable compiler atuo-inline optimizations

2018-05-02 Thread Du, Changbin
On Tue, May 01, 2018 at 10:54:20AM -0400, Steven Rostedt wrote:
> On Tue,  1 May 2018 21:00:12 +0800
> changbin...@intel.com wrote:
> 
> > From: Changbin Du 
> > 
> > This patch add a new kernel hacking option NO_AUTO_INLINE. Selecting
> > this option will make compiler not auto-inline kernel functions. By
> > enabling this option, all the kernel functions (including static ones)
> > will not be optimized out except those marked as inline or always_inline.
> > This is useful when you are using ftrace to understand the control flow
> > of kernel code or tracing some static functions.
> 
> I'm not against this patch, but it's up to others if this gets included
> or not.
> 
> > 
> > Signed-off-by: Changbin Du 
> > Cc: Steven Rostedt 
> > ---
> >  Makefile  |  6 ++
> >  lib/Kconfig.debug | 13 +
> >  2 files changed, 19 insertions(+)
> > 
> > diff --git a/Makefile b/Makefile
> > index 619a85a..eb694f6 100644
> > --- a/Makefile
> > +++ b/Makefile
> > @@ -775,6 +775,12 @@ KBUILD_CFLAGS  += $(call cc-option, 
> > -femit-struct-debug-baseonly) \
> >$(call cc-option,-fno-var-tracking)
> >  endif
> >  
> > +ifdef CONFIG_NO_AUTO_INLINE
> > +KBUILD_CFLAGS   += $(call cc-option, -fno-inline-functions) \
> > +  $(call cc-option, -fno-inline-small-functions) \
> > +  $(call cc-option, -fno-inline-functions-called-once)
> > +endif
> > +
> >  ifdef CONFIG_FUNCTION_TRACER
> >  ifndef CC_FLAGS_FTRACE
> >  CC_FLAGS_FTRACE := -pg
> > diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
> > index c40c7b7..90f35ad 100644
> > --- a/lib/Kconfig.debug
> > +++ b/lib/Kconfig.debug
> > @@ -198,6 +198,19 @@ config GDB_SCRIPTS
> >   instance. See Documentation/dev-tools/gdb-kernel-debugging.rst
> >   for further details.
> >  
> > +config NO_AUTO_INLINE
> > +   bool "Disable compiler atuo-inline optimizations"
> 
> typo: s/atuo/auto/
> 
> > +   default n
> > +   help
> > + This will make compiler not auto-inline kernel functions for
> > + optimization. By enabling this option, all the kernel functions
> > + (including static ones) will not be optimized out except those
> > + marked as inline or always_inline. This is useful when you are
> > + using ftrace to understand the control flow of kernel code or
> > + tracing some static functions.
> 
> Some grammar updates:
> 
> This will prevent the compiler from optimizing the kernel by
> auto-inlining functions not marked with the inline keyword.
> With this option, only functions explicitly marked with
> "inline" will be inlined. This will allow the function tracer
> to trace more functions because it only traces functions that
> the compiler has not inlined.
> 
> Enabling this function can help debugging a kernel if using
> the function tracer. But it can also change how the kernel
> works, because inlining functions may change the timing,
> which could make it difficult while debugging race conditions.
> 

Thanks for your kind grammar updates. I will update them. :)

> > +
> > + Use only if you want to debug the kernel.
> 
> The proper way to say the above is:
> 
> If unsure, select N
>
Agree.

> -- Steve
> 
> > +
> >  config ENABLE_WARN_DEPRECATED
> > bool "Enable __deprecated logic"
> > default y
> 


-- 
Thanks,
Changbin Du


Re: [PATCH 4/5] kernel hacking: new config DEBUG_EXPERIENCE to apply GCC -Og optimization

2018-05-02 Thread Du, Changbin
On Tue, May 01, 2018 at 08:25:27AM -0700, Randy Dunlap wrote:
> Good morning.
> 
> On 05/01/2018 06:00 AM, changbin...@intel.com wrote:
> > From: Changbin Du 
> > 
> > 
> > Signed-off-by: Changbin Du 
> > ---
> >  Makefile |  4 
> >  include/linux/compiler-gcc.h |  2 +-
> >  include/linux/compiler.h |  2 +-
> >  lib/Kconfig.debug| 21 +
> >  4 files changed, 27 insertions(+), 2 deletions(-)
> > 
> 
> > diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
> > index 90f35ad..2432e77d 100644
> > --- a/lib/Kconfig.debug
> > +++ b/lib/Kconfig.debug
> > @@ -211,6 +211,27 @@ config NO_AUTO_INLINE
> >  
> >   Use only if you want to debug the kernel.
> >  
> > +config DEBUG_EXPERIENCE
> > +   bool "Optimize for better debugging experience (-Og)"
> > +   default n
> > +   select NO_AUTO_INLINE
> > +   depends on !CC_OPTIMIZE_FOR_SIZE
> > +   help
> > + This will apply GCC '-Og' optimization level get supported from
> 
>  which is supported since
> 
> > + GCC 4.8. This optimization level offers a reasonable level of
> > + optimization while maintaining fast compilation and a good
> > + debugging experience. It is similar to '-O1' while perfer keeping
> 
>  while preferring to keep
> 
> > + debug ability over runtime speed. The overall performance will
> > + drop a bit.
> > +
> > + If enabling this option break your kernel, you should either
> 
> breaks
> 
> > + disable this or find a fix (mostly in the arch code). Currently
> > + this option has only be tested in qemu x86_64 guest.
> > +
> > + Use only if you want to debug the kernel, especially if you want
> > + to have better kernel debugging experience with gdb facilities
> > + like kgdb and qemu.
> > +
> >  config ENABLE_WARN_DEPRECATED
> > bool "Enable __deprecated logic"
> > default y
> > 
> 
> thanks,
> -- 
> ~Randy

Thanks for your correction, I will update.

-- 
Thanks,
Changbin Du


Re: [PATCH 2/5] regulator: add dummy of_find_regulator_by_node

2018-05-02 Thread Du, Changbin
On Wed, May 02, 2018 at 05:40:36AM +0900, Mark Brown wrote:
> On Tue, May 01, 2018 at 09:00:11PM +0800, changbin...@intel.com wrote:
> > From: Changbin Du 
> > 
> > If device tree is not enabled, of_find_regulator_by_node() should have
> > a dummy function since the function call is still there.
> > 
> > Signed-off-by: Changbin Du 
> 
> This appears to have no obvious connection with the cover letter for the
> series...  The first question here is if this is something best fixed
> with a stub or by fixing the users - is the lack of a stub pointing out
> some bugs in them?  I'm a bit worried about how we've been managing to
> avoid any build test issues here though, surely the various builders
> would have spotted a problem?

This is to fix build error after NO_AUTO_INLINE is introduced. If this option
is enabled, GCC will not auto-inline functions that are not explicitly marked
as inline.

In this case (no CONFIG_OF), the compiler will report an error in
regulator_dev_lookup().
W/o NO_AUTO_INLINE, function of_get_regulator() is auto-inlined and then the 
call
to of_find_regulator_by_node() is optimized out since of_get_regulator() always
return NULL. W/ NO_AUTO_INLINE, the return value of of_get_regulator() is a 
variable
so the call to of_find_regulator_by_node() cannot be optimized out.

static struct regulator_dev *regulator_dev_lookup(struct device *dev,
  const char *supply)
{
struct regulator_dev *r = NULL;
struct device_node *node;
struct regulator_map *map;
const char *devname = NULL;

regulator_supply_alias(&dev, &supply);

/* first do a dt based lookup */
if (dev && dev->of_node) {
node = of_get_regulator(dev, supply);
if (node) {
r = of_find_regulator_by_node(node);
if (r)
return r;


It is safe to just provide a stub of_find_regulator_by_node() when CONFIG_OF is not set.

-- 
Thanks,
Changbin Du


Re: [PATCH 0/5] kernel hacking: GCC optimization for debug experience (-Og)

2018-05-02 Thread Du, Changbin
On Wed, May 02, 2018 at 09:33:15AM +0200, Ingo Molnar wrote:
> 
> * changbin...@intel.com  wrote:
> 
> > Comparison of system performance: a bit drop.
> > 
> > w/o CONFIG_DEBUG_EXPERIENCE
> > $ time make -j4
> > real6m43.619s
> > user19m5.160s
> > sys 2m20.287s
> > 
> > w/ CONFIG_DEBUG_EXPERIENCE
> > $ time make -j4
> > real6m55.054s
> > user19m11.129s
> > sys 2m36.345s
> 
> Sorry, that's not a proper kbuild performance measurement - there's no noise 
> estimation at all.
> 
> Below is a description that should produce more reliable numbers.
> 
> Thanks,
> 
>   Ingo
>
Thanks for your suggestion, I will try your tips to eliminate noise. Since it is
tested in a KVM guest, I just reboot the guest before testing. But on the host
side I still need to consider this noise.

> 
> =>
> 
> So here's a pretty reliable way to measure kernel build time, which tries to 
> avoid 
> the various pitfalls of caching.
> 
> First I make sure that cpufreq is set to 'performance':
> 
>   for ((cpu=0; cpu<120; cpu++)); do
> G=/sys/devices/system/cpu/cpu$cpu/cpufreq/scaling_governor
> [ -f $G ] && echo performance > $G
>   done
> 
> [ ... because it can be *really* annoying to discover that an ostensible 
>   performance regression was a cpufreq artifact ... again. ;-) ]
> 
> Then I copy a kernel tree to /tmp (ramfs) as root:
> 
>   cd /tmp
>   rm -rf linux
>   git clone ~/linux linux
>   cd linux
>   make defconfig >/dev/null
>   
> ... and then we can build the kernel in such a loop (as root again):
> 
>   perf stat --repeat 10 --null --pre  '\
>   cp -a kernel ../kernel.copy.$(date +%s); \
>   rm -rf *;\
>   git checkout .;  \
>   echo 1 > /proc/sys/vm/drop_caches;   \
>   find ../kernel* -type f | xargs cat >/dev/null;  \
>   make -j kernel >/dev/null;   \
>   make clean >/dev/null 2>&1;  \
>   sync'\
>\
>   make -j16 >/dev/null
> 
> ( I have tested these by pasting them into a terminal. Adjust the ~/linux 
> source 
>   git tree and the '-j16' to your system. )
> 
> Notes:
> 
>  - the 'pre' script portion is not timed by 'perf stat', only the raw build 
> times
> 
>  - we flush all caches via drop_caches and re-establish everything again, but:
> 
>  - we also introduce an intentional memory leak by slowly filling up ramfs 
> with 
>copies of 'kernel/', thus continuously changing the layout of free memory, 
>cached data such as compiler binaries and the source code hierarchy. (Note 
>that the leak is about 8MB per iteration, so it isn't massive.)
> 
> With 10 iterations this is the statistical stability I get this on a big box:
> 
>  Performance counter stats for 'make -j128 kernel' (10 runs):
> 
>   26.346436425 seconds time elapsed(+- 0.19%)
> 
> ... which, despite a high iteration count of 10, is still surprisingly noisy, 
> right?
> 
> A 0.2% stddev is probably not enough to call a 0.7% regression with good 
> confidence, so I had to use *30* iterations to make measurement noise to be 
> about 
> an order of magnitude lower than the effect I'm trying to measure:
> 
>  Performance counter stats for 'make -j128' (30 runs):
> 
>   26.334767571 seconds time elapsed(+- 0.09% )
> 
> i.e. "26.334 +- 0.023" seconds is a number we can have pretty high confidence 
> in, 
> on this system.
> 
> And just to demonstrate that it's all real, I repeated the whole 30-iteration 
> measurement again:
> 
>  Performance counter stats for 'make -j128' (30 runs):
> 
>   26.311166142 seconds time elapsed(+- 0.07%)
> 

-- 
Thanks,
Changbin Du


Re: [PATCH] iommu/vt-d: fix shift-out-of-bounds in bug checking

2018-04-26 Thread Du, Changbin
Hello, any reviewer? Thanks!

On Fri, Apr 20, 2018 at 01:29:55PM +0800, changbin...@intel.com wrote:
> From: Changbin Du 
> 
> It allows to flush more than 4GB of device TLBs. So the mask should be
> 64bit wide. UBSAN captured this fault as below.
> 
> [3.760024] 
> 
> [3.768440] UBSAN: Undefined behaviour in drivers/iommu/dmar.c:1348:3
> [3.774864] shift exponent 64 is too large for 32-bit type 'int'
> [3.780853] CPU: 2 PID: 0 Comm: swapper/2 Tainted: G U
> 4.17.0-rc1+ #89
> [3.788661] Hardware name: Dell Inc. OptiPlex 7040/0Y7WYT, BIOS 1.2.8 
> 01/26/2016
> [3.796034] Call Trace:
> [3.798472]  
> [3.800479]  dump_stack+0x90/0xfb
> [3.803787]  ubsan_epilogue+0x9/0x40
> [3.807353]  __ubsan_handle_shift_out_of_bounds+0x10e/0x170
> [3.812916]  ? qi_flush_dev_iotlb+0x124/0x180
> [3.817261]  qi_flush_dev_iotlb+0x124/0x180
> [3.821437]  iommu_flush_dev_iotlb+0x94/0xf0
> [3.825698]  iommu_flush_iova+0x10b/0x1c0
> [3.829699]  ? fq_ring_free+0x1d0/0x1d0
> [3.833527]  iova_domain_flush+0x25/0x40
> [3.837448]  fq_flush_timeout+0x55/0x160
> [3.841368]  ? fq_ring_free+0x1d0/0x1d0
> [3.845200]  ? fq_ring_free+0x1d0/0x1d0
> [3.849034]  call_timer_fn+0xbe/0x310
> [3.852696]  ? fq_ring_free+0x1d0/0x1d0
> [3.856530]  run_timer_softirq+0x223/0x6e0
> [3.860625]  ? sched_clock+0x5/0x10
> [3.864108]  ? sched_clock+0x5/0x10
> [3.867594]  __do_softirq+0x1b5/0x6f5
> [3.871250]  irq_exit+0xd4/0x130
> [3.874470]  smp_apic_timer_interrupt+0xb8/0x2f0
> [3.879075]  apic_timer_interrupt+0xf/0x20
> [3.883159]  
> [3.885255] RIP: 0010:poll_idle+0x60/0xe7
> [3.889252] RSP: 0018:b1b201943e30 EFLAGS: 0246 ORIG_RAX: 
> ff13
> [3.896802] RAX: 8020 RBX: 008e RCX: 
> 001f
> [3.903918] RDX:  RSI: 2819aa06 RDI: 
> 
> [3.911031] RBP: 9e93c6b33280 R08: 0010f717d567 R09: 
> 0010d205
> [3.918146] R10: b1b201943df8 R11: 0001 R12: 
> e01b169d
> [3.925260] R13:  R14: b12aa400 R15: 
> 
> [3.932382]  cpuidle_enter_state+0xb4/0x470
> [3.936558]  do_idle+0x222/0x310
> [3.939779]  cpu_startup_entry+0x78/0x90
> [3.943693]  start_secondary+0x205/0x2e0
> [3.947607]  secondary_startup_64+0xa5/0xb0
> [3.951783] 
> 
> 
> Signed-off-by: Changbin Du 
> ---
>  drivers/iommu/dmar.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c
> index accf5838..e4ae600 100644
> --- a/drivers/iommu/dmar.c
> +++ b/drivers/iommu/dmar.c
> @@ -1345,7 +1345,7 @@ void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 
> sid, u16 qdep,
>   struct qi_desc desc;
>  
>   if (mask) {
> - BUG_ON(addr & ((1 << (VTD_PAGE_SHIFT + mask)) - 1));
> + BUG_ON(addr & ((1ULL << (VTD_PAGE_SHIFT + mask)) - 1));
>   addr |= (1ULL << (VTD_PAGE_SHIFT + mask - 1)) - 1;
>   desc.high = QI_DEV_IOTLB_ADDR(addr) | QI_DEV_IOTLB_SIZE;
>   } else
> -- 
> 2.7.4
> 

-- 
Thanks,
Changbin Du


Re: [PATCH] Documentation: fix reST markup error in driver-api/usb/typec.rst

2018-04-08 Thread Du, Changbin
On Sun, Apr 08, 2018 at 09:19:58AM +0200, Greg KH wrote:
> On Sun, Apr 08, 2018 at 10:47:12AM +0800, changbin...@intel.com wrote:
> > From: Changbin Du 
> > 
> > There is a format error in driver-api/usb/typec.rst that breaks sphinx
> > docs building.
> > 
> > reST markup error:
> > /home/changbin/work/linux/Documentation/driver-api/usb/typec.rst:215: 
> > (SEVERE/4) Unexpected section title or transition.
> > 
> > 
> > Documentation/Makefile:68: recipe for target 'htmldocs' failed
> > make[1]: *** [htmldocs] Error 1
> > Makefile:1527: recipe for target 'htmldocs' failed
> > make: *** [htmldocs] Error 2
> > 
> > Signed-off-by: Changbin Du 
> > ---
> >  Documentation/driver-api/usb/typec.rst | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> Thanks, someone else already sent this, sorry.  I'll be sending it
> onward after 4.17-rc1 is out.
>
No problem. Thanks for your quick checking!

> greg k-h

-- 
Thanks,
Changbin Du


Re: [PATCH] perf trace: remove redundant ')'

2018-04-03 Thread Du, Changbin
On Tue, Apr 03, 2018 at 04:19:07PM -0300, Arnaldo Carvalho de Melo wrote:
> Em Wed, Mar 28, 2018 at 03:26:31PM +0800, Du, Changbin escreveu:
> > Hi Arnaldo,
> > Just a kind reminder. Hope you didn't forget this.
> 
> Ok, applied.
> 
> - Arnaldo
>  
Got it, thanks!


Re: [PATCH] perf trace: remove redundant ')'

2018-03-28 Thread Du, Changbin
Hi Arnaldo,
Just a kind reminder. Hope you didn't forget this.

On Fri, Mar 16, 2018 at 09:50:45AM -0300, Arnaldo Carvalho de Melo wrote:
> Em Fri, Mar 16, 2018 at 03:51:09PM +0800, Du, Changbin escreveu:
> > Hi Arnaldo, How about this simple one? Thanks.
> > 
> > On Tue, Mar 13, 2018 at 06:40:01PM +0800, changbin...@intel.com wrote:
> > > From: Changbin Du 
> > > 
> > > There is a redundant ')' at the tail of each event. So remove it.
> > > $ sudo perf trace --no-syscalls -e 'kmem:*' -a
> > >899.342 kmem:kfree:(vfs_writev+0xb9) call_site=9c453979 
> > > ptr=(nil))
> > >899.344 kmem:kfree:(___sys_recvmsg+0x188) call_site=9c9b8b88 
> > > ptr=(nil))
> > > 
> > > Signed-off-by: Changbin Du 
> > > ---
> > >  tools/perf/builtin-trace.c | 2 +-
> > >  1 file changed, 1 insertion(+), 1 deletion(-)
> > > 
> > > diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
> > > index e7f1b18..7273f5f 100644
> > > --- a/tools/perf/builtin-trace.c
> > > +++ b/tools/perf/builtin-trace.c
> > > @@ -1959,7 +1959,7 @@ static int trace__event_handler(struct trace 
> > > *trace, struct perf_evsel *evsel,
> > > trace->output);
> > >   }
> > >  
> > > - fprintf(trace->output, ")\n");
> > > + fprintf(trace->output, "\n");
> 
> It looks simple on the surface, but I couldn't quickly recall why this
> ')' was put there in the first place... So I left for later to do a 'git
> blame' on this file, etc.
> 
> - Arnaldo
> 
> > >   if (callchain_ret > 0)
> > >   trace__fprintf_callchain(trace, sample);
> > > -- 
> > > 2.7.4
> > > 
> > 
> > -- 
> > Thanks,
> > Changbin Du

-- 
Thanks,
Changbin Du


Re: [PATCH v2 1/4] selftests/Makefile: append a slash to env variable OUTPUT

2018-03-27 Thread Du, Changbin
On Tue, Mar 27, 2018 at 03:19:26PM -0600, Shuah Khan wrote:
> On 03/26/2018 09:11 PM, changbin...@intel.com wrote:
> > From: Changbin Du 
> > 
> > The tools/build/Makefile.build use 'OUTPUT' variable as below example:
> > objprefix:= $(subst ./,,$(OUTPUT)$(dir)/)
> > 
> > So it requires the 'OUTPUT' already has a slash at the end.
> > 
> > This patch can kill below odd paths:
> > make[3]: Entering directory '/home/changbin/work/linux/tools/gpio'
> >   CC   /home/changbin/work/linux/tools/testing/selftests/gpiolsgpio.o
> >   CC   
> > /home/changbin/work/linux/tools/testing/selftests/gpiogpio-utils.o
> >   LD   /home/changbin/work/linux/tools/testing/selftests/gpiolsgpio-in.o
> > 
> > A correct path should be:
> > /home/changbin/work/linux/tools/testing/selftests/gpio/lsgpio.o
> > 
> > Signed-off-by: Changbin Du 
> 
> Are you seeing this when you run "make kselftest" - if gpio is the
> only test compile that fails, it should be fixed in gpio/Makefile,
> not is the common Makefile.
>
I only saw the error in gpio, but I also saw some kselftest Makefiles using
string concatenation such as '$(OUTPUT)$(dir)'. So the rule is not applied
consistently everywhere. They just haven't produced any errors so far.

By the way, is there a basic test for the kselftest infrastructure? It seems to
always report errors when building it :(

> thanks,
> -- Shuah

-- 
Thanks,
Changbin Du


Re: [PATCH 4/4] selftests/bpf: fix compiling errors

2018-03-27 Thread Du, Changbin
On Tue, Mar 27, 2018 at 11:52:27AM +0200, Daniel Borkmann wrote:
> On 03/27/2018 11:00 AM, Du, Changbin wrote:
> > On Tue, Mar 27, 2018 at 10:52:57AM +0200, Daniel Borkmann wrote:
> >> On 03/27/2018 05:06 AM, Du, Changbin wrote:
> >>> On Mon, Mar 26, 2018 at 08:02:30PM -0700, Alexei Starovoitov wrote:
> >>>> On Tue, Mar 27, 2018 at 10:20:10AM +0800, Du, Changbin wrote:
> >>>>> On Mon, Mar 26, 2018 at 07:55:13AM -0700, Alexei Starovoitov wrote:
> >>>>>> On Mon, Mar 26, 2018 at 05:23:28PM +0800, changbin...@intel.com wrote:
> >>>>>>> Signed-off-by: Changbin Du 
> >>>>>>> ---
> >>>>>>>  tools/testing/selftests/bpf/Makefile | 5 +++--
> >>>>>>>  1 file changed, 3 insertions(+), 2 deletions(-)
> >>>>>>>
> >>>>>>> diff --git a/tools/testing/selftests/bpf/Makefile 
> >>>>>>> b/tools/testing/selftests/bpf/Makefile
> >>>>>>> index 5c43c18..dc0fdc8 100644
> >>>>>>> --- a/tools/testing/selftests/bpf/Makefile
> >>>>>>> +++ b/tools/testing/selftests/bpf/Makefile
> >>>>>>> @@ -10,7 +10,8 @@ ifneq ($(wildcard $(GENHDR)),)
> >>>>>>>GENFLAGS := -DHAVE_GENHDR
> >>>>>>>  endif
> >>>>>>>  
> >>>>>>> -CFLAGS += -Wall -O2 -I$(APIDIR) -I$(LIBDIR) -I$(GENDIR) $(GENFLAGS) 
> >>>>>>> -I../../../include
> >>>>>>> +CFLAGS += -Wall -O2 -I$(APIDIR) -I$(LIBDIR) -I$(GENDIR) $(GENFLAGS) \
> >>>>>>> +   -I../../../include -I../../../../usr/include
> >>>>>>>  LDLIBS += -lcap -lelf -lrt -lpthread
> >>>>>>>  
> >>>>>>>  # Order correspond to 'make run_tests' order
> >>>>>>> @@ -62,7 +63,7 @@ else
> >>>>>>>CPU ?= generic
> >>>>>>>  endif
> >>>>>>>  
> >>>>>>> -CLANG_FLAGS = -I. -I./include/uapi -I../../../include/uapi \
> >>>>>>> +CLANG_FLAGS = -I. -I./include/uapi -I../../../include/uapi 
> >>>>>>> -I../../../../usr/include \
> >>>>>>> -Wno-compare-distinct-pointer-types
> >>>>>>
> >>>>>> Nack.
> >>>>>> I suspect that will break the build for everyone else who's doing it 
> >>>>>> in the directory
> >>>>>> itself instead of the outer one.
> >>>>>
> >>>>> This one? But I didn't see any problem.
> >>>>
> >>>> because the build was lucky and additional path ../../../../usr/include 
> >>>> didn't point
> >>>> to a valid dir?
> >>
> >> Agree.
> >>
> >>> I am sorry but I don't understand what you mean by *lucky*. Of course, the 
> >>> path is valid.
> >>
> >> The problem is that this suddenly requires users to do a 'make 
> >> headers_install' in
> >> order to populate usr/include/ directory in the first place. While it's 
> >> annoying
> >> enough for BPF samples where this is needed, I absolutely don't want to 
> >> introduce
> >> this for BPF kselftests. It's the wrong approach. Besides, in tools infra, 
> >> there is
> >> a tools/arch/*/include/uapi/asm/bitsperlong.h header copy already, so we 
> >> really need
> >> to use that instead. Please adapt your patch accordingly and respin. 
> >> Please also Cc
> >> us and net...@vger.kernel.org for BPF kselftests changes.
> >>
> > Thanks for the explanation. So we expect that tools/arch/*/include is in 
> > the include search path, right?
> > The current makefile does not seem to have it. How do you get this built?
> 
> E.g. take a look at tools/include/asm/barrier.h or 
> tools/include/uapi/asm/bpf_perf_event.h
> just to name two examples. We'd need something similar to this which then 
> points to the
> arch specific includes.
> 
> Thanks,
> Daniel
>
OK, I see. But I am going to skip this fix this time, because once one error
gets fixed, another appears.
IMHO, it doesn't sound like a good idea to sync all these files manually. We
should have a better solution, I think.

clang -I. -I./include/uapi -I../../../include/uapi 
-Wno-compare-distinct-pointer-types \
 -O2 -target bpf -emit-llvm -c test_pkt_access.c -o - |  \
llc -march=bpf -mcpu=generic -filetype=obj -o 
/home/changbin/work/linux/tools/testing/selftests/bpf/test_pkt_access.o
In file included from test_pkt_access.c:12:
/usr/include/linux/ip.h:20:10: fatal error: 'asm/byteorder.h' file not found
#include <asm/byteorder.h>

 
> > changbin@gvt-dell-host:~/work/linux/tools/testing/selftests/bpf$ make -p
> > [...]
> > clang -I. -I./include/uapi -I../../../include/uapi 
> > -Wno-compare-distinct-pointer-types \
> >  -O2 -target bpf -emit-llvm -c test_pkt_access.c -o - |  \
> > llc -march=bpf -mcpu=generic -filetype=obj -o 
> > /home/changbin/work/linux/tools/testing/selftests/bpf/test_pkt_access.o
> > In file included from test_pkt_access.c:9:
> > In file included from ../../../include/uapi/linux/bpf.h:11:
> > In file included from ./include/uapi/linux/types.h:5:
> > /usr/include/asm-generic/int-ll64.h:11:10: fatal error: 'asm/bitsperlong.h' 
> > file not found
> > #include <asm/bitsperlong.h>
> > 
> > 
> 

-- 
Thanks,
Changbin Du


Re: [PATCH 4/4] selftests/bpf: fix compiling errors

2018-03-27 Thread Du, Changbin
On Tue, Mar 27, 2018 at 10:52:57AM +0200, Daniel Borkmann wrote:
> On 03/27/2018 05:06 AM, Du, Changbin wrote:
> > On Mon, Mar 26, 2018 at 08:02:30PM -0700, Alexei Starovoitov wrote:
> >> On Tue, Mar 27, 2018 at 10:20:10AM +0800, Du, Changbin wrote:
> >>> On Mon, Mar 26, 2018 at 07:55:13AM -0700, Alexei Starovoitov wrote:
> >>>> On Mon, Mar 26, 2018 at 05:23:28PM +0800, changbin...@intel.com wrote:
> >>>>> Signed-off-by: Changbin Du 
> >>>>> ---
> >>>>>  tools/testing/selftests/bpf/Makefile | 5 +++--
> >>>>>  1 file changed, 3 insertions(+), 2 deletions(-)
> >>>>>
> >>>>> diff --git a/tools/testing/selftests/bpf/Makefile 
> >>>>> b/tools/testing/selftests/bpf/Makefile
> >>>>> index 5c43c18..dc0fdc8 100644
> >>>>> --- a/tools/testing/selftests/bpf/Makefile
> >>>>> +++ b/tools/testing/selftests/bpf/Makefile
> >>>>> @@ -10,7 +10,8 @@ ifneq ($(wildcard $(GENHDR)),)
> >>>>>GENFLAGS := -DHAVE_GENHDR
> >>>>>  endif
> >>>>>  
> >>>>> -CFLAGS += -Wall -O2 -I$(APIDIR) -I$(LIBDIR) -I$(GENDIR) $(GENFLAGS) 
> >>>>> -I../../../include
> >>>>> +CFLAGS += -Wall -O2 -I$(APIDIR) -I$(LIBDIR) -I$(GENDIR) $(GENFLAGS) \
> >>>>> + -I../../../include -I../../../../usr/include
> >>>>>  LDLIBS += -lcap -lelf -lrt -lpthread
> >>>>>  
> >>>>>  # Order correspond to 'make run_tests' order
> >>>>> @@ -62,7 +63,7 @@ else
> >>>>>CPU ?= generic
> >>>>>  endif
> >>>>>  
> >>>>> -CLANG_FLAGS = -I. -I./include/uapi -I../../../include/uapi \
> >>>>> +CLANG_FLAGS = -I. -I./include/uapi -I../../../include/uapi 
> >>>>> -I../../../../usr/include \
> >>>>>   -Wno-compare-distinct-pointer-types
> >>>>
> >>>> Nack.
> >>>> I suspect that will break the build for everyone else who's doing it in 
> >>>> the directory
> >>>> itself instead of the outer one.
> >>>
> >>> This one? But I didn't see any problem.
> >>
> >> because the build was lucky and additional path ../../../../usr/include 
> >> didn't point
> >> to a valid dir?
> 
> Agree.
> 
> > I am sorry but I don't understand what you mean by *lucky*. Of course, the path 
> > is valid.
> 
> The problem is that this suddenly requires users to do a 'make 
> headers_install' in
> order to populate usr/include/ directory in the first place. While it's 
> annoying
> enough for BPF samples where this is needed, I absolutely don't want to 
> introduce
> this for BPF kselftests. It's the wrong approach. Besides, in tools infra, 
> there is
> a tools/arch/*/include/uapi/asm/bitsperlong.h header copy already, so we 
> really need
> to use that instead. Please adapt your patch accordingly and respin. Please 
> also Cc
> us and net...@vger.kernel.org for BPF kselftests changes.
> 
> Thanks,
> Daniel
Thanks for the explanation. So we expect that tools/arch/*/include is in the
include search path, right?
The current makefile does not seem to have it. How do you get this built?

changbin@gvt-dell-host:~/work/linux/tools/testing/selftests/bpf$ make -p
[...]
clang -I. -I./include/uapi -I../../../include/uapi 
-Wno-compare-distinct-pointer-types \
 -O2 -target bpf -emit-llvm -c test_pkt_access.c -o - |  \
llc -march=bpf -mcpu=generic -filetype=obj -o 
/home/changbin/work/linux/tools/testing/selftests/bpf/test_pkt_access.o
In file included from test_pkt_access.c:9:
In file included from ../../../include/uapi/linux/bpf.h:11:
In file included from ./include/uapi/linux/types.h:5:
/usr/include/asm-generic/int-ll64.h:11:10: fatal error: 'asm/bitsperlong.h' 
file not found
#include <asm/bitsperlong.h>


-- 
Thanks,
Changbin Du


Re: [PATCH 4/4] selftests/bpf: fix compiling errors

2018-03-26 Thread Du, Changbin
On Mon, Mar 26, 2018 at 08:02:30PM -0700, Alexei Starovoitov wrote:
> On Tue, Mar 27, 2018 at 10:20:10AM +0800, Du, Changbin wrote:
> > On Mon, Mar 26, 2018 at 07:55:13AM -0700, Alexei Starovoitov wrote:
> > > On Mon, Mar 26, 2018 at 05:23:28PM +0800, changbin...@intel.com wrote:
> > > > Signed-off-by: Changbin Du 
> > > > ---
> > > >  tools/testing/selftests/bpf/Makefile | 5 +++--
> > > >  1 file changed, 3 insertions(+), 2 deletions(-)
> > > > 
> > > > diff --git a/tools/testing/selftests/bpf/Makefile 
> > > > b/tools/testing/selftests/bpf/Makefile
> > > > index 5c43c18..dc0fdc8 100644
> > > > --- a/tools/testing/selftests/bpf/Makefile
> > > > +++ b/tools/testing/selftests/bpf/Makefile
> > > > @@ -10,7 +10,8 @@ ifneq ($(wildcard $(GENHDR)),)
> > > >GENFLAGS := -DHAVE_GENHDR
> > > >  endif
> > > >  
> > > > -CFLAGS += -Wall -O2 -I$(APIDIR) -I$(LIBDIR) -I$(GENDIR) $(GENFLAGS) 
> > > > -I../../../include
> > > > +CFLAGS += -Wall -O2 -I$(APIDIR) -I$(LIBDIR) -I$(GENDIR) $(GENFLAGS) \
> > > > + -I../../../include -I../../../../usr/include
> > > >  LDLIBS += -lcap -lelf -lrt -lpthread
> > > >  
> > > >  # Order correspond to 'make run_tests' order
> > > > @@ -62,7 +63,7 @@ else
> > > >CPU ?= generic
> > > >  endif
> > > >  
> > > > -CLANG_FLAGS = -I. -I./include/uapi -I../../../include/uapi \
> > > > +CLANG_FLAGS = -I. -I./include/uapi -I../../../include/uapi 
> > > > -I../../../../usr/include \
> > > >   -Wno-compare-distinct-pointer-types
> > > 
> > > Nack.
> > > I suspect that will break the build for everyone else who's doing it in 
> > > the directory
> > > itself instead of the outer one.
> > >
> > 
> > This one? But I didn't see any problem.
> 
> because the build was lucky and additional path ../../../../usr/include 
> didn't point
> to a valid dir?
I am sorry but I don't understand what you mean by *lucky*. Of course, the path is
valid.

> Please test with in-source and out-of-source builds.
> 
agree.

-- 
Thanks,
Changbin Du


Re: [PATCH 4/4] selftests/bpf: fix compiling errors

2018-03-26 Thread Du, Changbin
Hi Starovoitov,

This one does have the issue you mentioned.
[PATCH 2/4] selftests/gpio: fix paths in Makefile

And can be fixed by:

--- a/tools/testing/selftests/gpio/Makefile
+++ b/tools/testing/selftests/gpio/Makefile
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0

+OUTPUT ?= $(shell pwd)
 TEST_PROGS := gpio-mockup.sh
 TEST_FILES := gpio-mockup-sysfs.sh $(BINARIES)
 BINARIES := gpio-mockup-chardev
@@ -24,7 +25,7 @@ LDLIBS += -lmount -I/usr/include/libmount
 $(BINARIES): gpio-utils.o ../../../../usr/include/linux/gpio.h

 gpio-utils.o:
-   make ARCH=$(ARCH) CROSS_COMPILE=$(CROSS_COMPILE) -C ../../../gpio
+   make ARCH=$(ARCH) CROSS_COMPILE=$(CROSS_COMPILE) OUTPUT=$(OUTPUT)/ -C 
../../../gpio

 ../../../../usr/include/linux/gpio.h:


I will update it later.

On Mon, Mar 26, 2018 at 07:55:13AM -0700, Alexei Starovoitov wrote:
> On Mon, Mar 26, 2018 at 05:23:28PM +0800, changbin...@intel.com wrote:
> > From: Changbin Du 
> > 
> > This patch fixes the errors below, which are caused by missing header files.
> > 
> > tools/testing/selftests$ make
> > ...
> > clang -I. -I./include/uapi -I../../../include/uapi 
> > -Wno-compare-distinct-pointer-types \
> >  -O2 -target bpf -emit-llvm -c test_pkt_access.c -o - |  \
> > llc -march=bpf -mcpu=generic -filetype=obj -o 
> > /home/changbin/work/linux/tools/testing/selftests/bpf//test_pkt_access.o
> > In file included from test_pkt_access.c:9:
> > In file included from ../../../include/uapi/linux/bpf.h:11:
> > In file included from ./include/uapi/linux/types.h:5:
> > /usr/include/asm-generic/int-ll64.h:11:10: fatal error: 'asm/bitsperlong.h' 
> > file not found
> >  #include <asm/bitsperlong.h>
> >  ^
> > 1 error generated.
> > clang -I. -I./include/uapi -I../../../include/uapi 
> > -Wno-compare-distinct-pointer-types \
> >  -O2 -target bpf -emit-llvm -c test_xdp.c -o - |  \
> > llc -march=bpf -mcpu=generic -filetype=obj -o 
> > /home/changbin/work/linux/tools/testing/selftests/bpf//test_xdp.o
> > In file included from test_xdp.c:9:
> > In file included from ../../../include/uapi/linux/bpf.h:11:
> > In file included from ./include/uapi/linux/types.h:5:
> > /usr/include/asm-generic/int-ll64.h:11:10: fatal error: 'asm/bitsperlong.h' 
> > file not found
> >  #include <asm/bitsperlong.h>
> >  ^
> > 1 error generated.
> > clang -I. -I./include/uapi -I../../../include/uapi 
> > -Wno-compare-distinct-pointer-types \
> >  -O2 -target bpf -emit-llvm -c test_l4lb.c -o - |  \
> > llc -march=bpf -mcpu=generic -filetype=obj -o 
> > /home/changbin/work/linux/tools/testing/selftests/bpf//test_l4lb.o
> > In file included from test_l4lb.c:10:
> > In file included from /usr/include/linux/pkt_cls.h:4:
> > In file included from ./include/uapi/linux/types.h:5:
> > /usr/include/asm-generic/int-ll64.h:11:10: fatal error: 'asm/bitsperlong.h' 
> > file not found
> >  #include <asm/bitsperlong.h>
> >  ^
> > 1 error generated.
> > clang -I. -I./include/uapi -I../../../include/uapi 
> > -Wno-compare-distinct-pointer-types \
> >  -O2 -target bpf -emit-llvm -c test_tcp_estats.c -o - |  \
> > llc -march=bpf -mcpu=generic -filetype=obj -o 
> > /home/changbin/work/linux/tools/testing/selftests/bpf//test_tcp_estats.o
> > In file included from test_tcp_estats.c:35:
> > In file included from ../../../include/uapi/linux/bpf.h:11:
> > In file included from ./include/uapi/linux/types.h:5:
> > /usr/include/asm-generic/int-ll64.h:11:10: fatal error: 'asm/bitsperlong.h' 
> > file not found
> >  #include <asm/bitsperlong.h>
> > ...
> > 
> > Signed-off-by: Changbin Du 
> > ---
> >  tools/testing/selftests/bpf/Makefile | 5 +++--
> >  1 file changed, 3 insertions(+), 2 deletions(-)
> > 
> > diff --git a/tools/testing/selftests/bpf/Makefile 
> > b/tools/testing/selftests/bpf/Makefile
> > index 5c43c18..dc0fdc8 100644
> > --- a/tools/testing/selftests/bpf/Makefile
> > +++ b/tools/testing/selftests/bpf/Makefile
> > @@ -10,7 +10,8 @@ ifneq ($(wildcard $(GENHDR)),)
> >GENFLAGS := -DHAVE_GENHDR
> >  endif
> >  
> > -CFLAGS += -Wall -O2 -I$(APIDIR) -I$(LIBDIR) -I$(GENDIR) $(GENFLAGS) 
> > -I../../../include
> > +CFLAGS += -Wall -O2 -I$(APIDIR) -I$(LIBDIR) -I$(GENDIR) $(GENFLAGS) \
> > + -I../../../include -I../../../../usr/include
> >  LDLIBS += -lcap -lelf -lrt -lpthread
> >  
> >  # Order correspond to 'make run_tests' order
> > @@ -62,7 +63,7 @@ else
> >CPU ?= generic
> >  endif
> >  
> > -CLANG_FLAGS = -I. -I./include/uapi -I../../../include/uapi \
> > +CLANG_FLAGS = -I. -I./include/uapi -I../../../include/uapi 
> > -I../../../../usr/include \
> >   -Wno-compare-distinct-pointer-types
> 
> Nack.
> I suspect that will break the build for everyone else who's doing it in the 
> directory
> itself instead of the outer one.
> 

-- 
Thanks,
Changbin Du


Re: [PATCH 4/4] selftests/bpf: fix compiling errors

2018-03-26 Thread Du, Changbin
On Mon, Mar 26, 2018 at 07:55:13AM -0700, Alexei Starovoitov wrote:
> On Mon, Mar 26, 2018 at 05:23:28PM +0800, changbin...@intel.com wrote:
> > Signed-off-by: Changbin Du 
> > ---
> >  tools/testing/selftests/bpf/Makefile | 5 +++--
> >  1 file changed, 3 insertions(+), 2 deletions(-)
> > 
> > diff --git a/tools/testing/selftests/bpf/Makefile 
> > b/tools/testing/selftests/bpf/Makefile
> > index 5c43c18..dc0fdc8 100644
> > --- a/tools/testing/selftests/bpf/Makefile
> > +++ b/tools/testing/selftests/bpf/Makefile
> > @@ -10,7 +10,8 @@ ifneq ($(wildcard $(GENHDR)),)
> >GENFLAGS := -DHAVE_GENHDR
> >  endif
> >  
> > -CFLAGS += -Wall -O2 -I$(APIDIR) -I$(LIBDIR) -I$(GENDIR) $(GENFLAGS) 
> > -I../../../include
> > +CFLAGS += -Wall -O2 -I$(APIDIR) -I$(LIBDIR) -I$(GENDIR) $(GENFLAGS) \
> > + -I../../../include -I../../../../usr/include
> >  LDLIBS += -lcap -lelf -lrt -lpthread
> >  
> >  # Order correspond to 'make run_tests' order
> > @@ -62,7 +63,7 @@ else
> >CPU ?= generic
> >  endif
> >  
> > -CLANG_FLAGS = -I. -I./include/uapi -I../../../include/uapi \
> > +CLANG_FLAGS = -I. -I./include/uapi -I../../../include/uapi 
> > -I../../../../usr/include \
> >   -Wno-compare-distinct-pointer-types
> 
> Nack.
> I suspect that will break the build for everyone else who's doing it in the 
> directory
> itself instead of the outer one.
>

This one? But I didn't see any problem.

changbin@gvt-dell-host:~/work/linux/tools/testing/selftests/bpf$ make
make -C ../../../lib/bpf 
OUTPUT=/home/changbin/work/linux/tools/testing/selftests/bpf/
make[1]: Entering directory '/home/changbin/work/linux/tools/lib/bpf'
  HOSTCC   /home/changbin/work/linux/tools/testing/selftests/bpf/fixdep.o
  HOSTLD   /home/changbin/work/linux/tools/testing/selftests/bpf/fixdep-in.o
  LINK /home/changbin/work/linux/tools/testing/selftests/bpf/fixdep
  CC   /home/changbin/work/linux/tools/testing/selftests/bpf/libbpf.o
  CC   /home/changbin/work/linux/tools/testing/selftests/bpf/bpf.o
  CC   /home/changbin/work/linux/tools/testing/selftests/bpf/nlattr.o
  LD   /home/changbin/work/linux/tools/testing/selftests/bpf/libbpf-in.o
  LINK /home/changbin/work/linux/tools/testing/selftests/bpf/libbpf.a
  LINK /home/changbin/work/linux/tools/testing/selftests/bpf/libbpf.so
make[1]: Leaving directory '/home/changbin/work/linux/tools/lib/bpf'
make -C ../../../lib/bpf 
OUTPUT=/home/changbin/work/linux/tools/testing/selftests/bpf/
make[1]: Entering directory '/home/changbin/work/linux/tools/lib/bpf'
make[1]: Leaving directory '/home/changbin/work/linux/tools/lib/bpf'
gcc -Wall -O2 -I../../../include/uapi -I../../../lib 
-I../../../../include/generated -DHAVE_GENHDR -I../../../include 
-I../../../../usr/includetest_verifier.c 
/home/changbin/work/linux/tools/testing/selftests/bpf/libbpf.a cgroup_helpers.c 
-lcap -lelf -lrt -lpthread -o 
/home/changbin/work/linux/tools/testing/selftests/bpf/test_verifier
gcc -Wall -O2 -I../../../include/uapi -I../../../lib 
-I../../../../include/generated -DHAVE_GENHDR -I../../../include 
-I../../../../usr/includetest_tag.c 
/home/changbin/work/linux/tools/testing/selftests/bpf/libbpf.a cgroup_helpers.c 
-lcap -lelf -lrt -lpthread -o 
/home/changbin/work/linux/tools/testing/selftests/bpf/test_tag
gcc -Wall -O2 -I../../../include/uapi -I../../../lib 
-I../../../../include/generated -DHAVE_GENHDR -I../../../include 
-I../../../../usr/includetest_maps.c 
/home/changbin/work/linux/tools/testing/selftests/bpf/libbpf.a cgroup_helpers.c 
-lcap -lelf -lrt -lpthread -o 
/home/changbin/work/linux/tools/testing/selftests/bpf/test_maps
gcc -Wall -O2 -I../../../include/uapi -I../../../lib 
-I../../../../include/generated -DHAVE_GENHDR -I../../../include 
-I../../../../usr/includetest_lru_map.c 
/home/changbin/work/linux/tools/testing/selftests/bpf/libbpf.a cgroup_helpers.c 
-lcap -lelf -lrt -lpthread -o 
/home/changbin/work/linux/tools/testing/selftests/bpf/test_lru_map
gcc -Wall -O2 -I../../../include/uapi -I../../../lib 
-I../../../../include/generated -DHAVE_GENHDR -I../../../include 
-I../../../../usr/includetest_lpm_map.c 
/home/changbin/work/linux/tools/testing/selftests/bpf/libbpf.a cgroup_helpers.c 
-lcap -lelf -lrt -lpthread -o 
/home/changbin/work/linux/tools/testing/selftests/bpf/test_lpm_map
gcc -Wall -O2 -I../../../include/uapi -I../../../lib 
-I../../../../include/generated -DHAVE_GENHDR -I../../../include 
-I../../../../usr/includetest_progs.c 
/home/changbin/work/linux/tools/testing/selftests/bpf/libbpf.a cgroup_helpers.c 
-lcap -lelf -lrt -lpthread -o 
/home/changbin/work/linux/tools/testing/selftests/bpf/test_progs
gcc -Wall -O2 -I../../../include/uapi -I../../../lib 
-I../../../../include/generated -DHAVE_GENHDR -I../../../include 
-I../../../../usr/includetest_align.c 
/home/changbin/work/linux/tools/testing/selftests/bpf/libbpf.a cgroup_helpers.c 
-lcap -lelf -lrt -lpthread -o 
/home/changbin/work/linux/tools/testing/selftests/bpf/test_align
gcc -Wall -O2 

Re: [PATCH] perf trace: remove redundant ')'

2018-03-16 Thread Du, Changbin
Hi Arnaldo, How about this simple one? Thanks.

On Tue, Mar 13, 2018 at 06:40:01PM +0800, changbin...@intel.com wrote:
> From: Changbin Du 
> 
> There is a redundant ')' at the tail of each event. So remove it.
> $ sudo perf trace --no-syscalls -e 'kmem:*' -a
>899.342 kmem:kfree:(vfs_writev+0xb9) call_site=9c453979 ptr=(nil))
>899.344 kmem:kfree:(___sys_recvmsg+0x188) call_site=9c9b8b88 
> ptr=(nil))
> 
> Signed-off-by: Changbin Du 
> ---
>  tools/perf/builtin-trace.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
> index e7f1b18..7273f5f 100644
> --- a/tools/perf/builtin-trace.c
> +++ b/tools/perf/builtin-trace.c
> @@ -1959,7 +1959,7 @@ static int trace__event_handler(struct trace *trace, 
> struct perf_evsel *evsel,
> trace->output);
>   }
>  
> - fprintf(trace->output, ")\n");
> + fprintf(trace->output, "\n");
>  
>   if (callchain_ret > 0)
>   trace__fprintf_callchain(trace, sample);
> -- 
> 2.7.4
> 

-- 
Thanks,
Changbin Du


[Q] How does linux kernel lockdep record lock-class dependency?

2018-03-14 Thread Du, Changbin

Hello everyone,
I got the warning below, which reports an AB-BA deadlock issue. But I don't understand
how the 'existing dependency' happened.

It looks like: kvm_read_guest() held (&mm->mmap_sem), then reading userspace memory
(which is not ready yet) caused page_fault() to be invoked, and then i915_gem_fault()
tried to take (&dev->struct_mutex).

But this sequence cannot have happened. Otherwise, a double-lock would already have happened,
since intel_vgpu_create_workload() already holds (&dev->struct_mutex):

  (&dev->struct_mutex)->(&mm->mmap_sem)->(&dev->struct_mutex)

So how could lockdep find such 'existing dependency'? Thanks!

[  163.179109] ==
[  163.185306] WARNING: possible circular locking dependency detected
[  163.191504] 4.16.0-rc5+ #44 Tainted: G U
[  163.196655] --
[  163.202854] qemu-system-x86/4514 is trying to acquire lock:
[  163.208443]  (&mm->mmap_sem){}, at: [] 
__might_fault+0x36/0x80
[  163.216230]
   but task is already holding lock:
[  163.222090]  (&dev->struct_mutex){+.+.}, at: [] 
copy_gma_to_hva+0xe5/0x140 [i915]
[  163.231205]
   which lock already depends on the new lock.

[  163.239421]
   the existing dependency chain (in reverse order) is:
[  163.246925]
   -> #1 (&dev->struct_mutex){+.+.}:
[  163.252792]i915_mutex_lock_interruptible+0x66/0x170 [i915]
[  163.259005]i915_gem_fault+0x1e0/0x630 [i915]
[  163.263985]__do_fault+0x19/0xed
[  163.267830]__handle_mm_fault+0x9fa/0x1140
[  163.272550]handle_mm_fault+0x1a7/0x390
[  163.277006]__do_page_fault+0x286/0x530
[  163.281462]page_fault+0x45/0x50
[  163.285307]
   -> #0 (&mm->mmap_sem){}:
[  163.290722]__might_fault+0x60/0x80
[  163.294839]__kvm_read_guest_page+0x3d/0x80 [kvm]
[  163.300173]kvm_read_guest+0x47/0x80 [kvm]
[  163.304891]kvmgt_rw_gpa+0x9d/0x110 [kvmgt]
[  163.309714]intel_gvt_scan_and_shadow_workload+0x1be/0x480 [i915]
[  163.316448]intel_vgpu_create_workload+0x3d9/0x550 [i915]
[  163.322488]intel_vgpu_submit_execlist+0xc0/0x2a0 [i915]
[  163.328440]elsp_mmio_write+0xcb/0x140 [i915]
[  163.333448]intel_vgpu_mmio_reg_rw+0x250/0x4f0 [i915]
[  163.339138]intel_vgpu_emulate_mmio_write+0xaa/0x240 [i915]
[  163.345337]intel_vgpu_rw+0x200/0x250 [kvmgt]
[  163.350319]intel_vgpu_write+0x164/0x1f0 [kvmgt]
[  163.38]__vfs_write+0x33/0x170
[  163.359580]vfs_write+0xc5/0x1c0
[  163.363427]SyS_pwrite64+0x90/0xb0
[  163.367447]do_syscall_64+0x70/0x1c0
[  163.371642]entry_SYSCALL_64_after_hwframe+0x42/0xb7
[  163.377230]
   other info that might help us debug this:

[  163.385258]  Possible unsafe locking scenario:

[  163.391196]CPU0CPU1
[  163.395737]
[  163.400280]   lock(&dev->struct_mutex);
[  163.404125]lock(&mm->mmap_sem);
[  163.410062]lock(&dev->struct_mutex);
[  163.416436]   lock(&mm->mmap_sem);
[  163.419846]
*** DEADLOCK ***

[  163.425780] 3 locks held by qemu-system-x86/4514:
[  163.430496]  #0:  (&gvt->lock){+.+.}, at: [] 
intel_vgpu_emulate_mmio_write+0x64/0x240 [i915]
[  163.440544]  #1:  (&dev->struct_mutex){+.+.}, at: [] 
copy_gma_to_hva+0xe5/0x140 [i915]
[  163.450068]  #2:  (&kvm->srcu){}, at: [] 
kvmgt_rw_gpa+0x4c/0x110 [kvmgt]
[  163.458721]
   stack backtrace:
[  163.463097] CPU: 0 PID: 4514 Comm: qemu-system-x86 Tainted: G U  
 4.16.0-rc5+ #44
[  163.471663] Hardware name: Dell Inc. OptiPlex 7040/0Y7WYT, BIOS 1.2.8 
01/26/2016
[  163.479093] Call Trace:
[  163.481547]  dump_stack+0x7c/0xbe
[  163.484872]  print_circular_bug.isra.33+0x21b/0x228
[  163.489765]  __lock_acquire+0xf7d/0x1470
[  163.493700]  ? lock_acquire+0xec/0x1e0
[  163.497459]  lock_acquire+0xec/0x1e0
[  163.501046]  ? __might_fault+0x36/0x80
[  163.504805]  __might_fault+0x60/0x80
[  163.508389]  ? __might_fault+0x36/0x80
[  163.512155]  __kvm_read_guest_page+0x3d/0x80 [kvm]
[  163.516966]  kvm_read_guest+0x47/0x80 [kvm]
[  163.521161]  kvmgt_rw_gpa+0x9d/0x110 [kvmgt]
[  163.525459]  intel_gvt_scan_and_shadow_workload+0x1be/0x480 [i915]
[  163.531675]  intel_vgpu_create_workload+0x3d9/0x550 [i915]
[  163.537192]  intel_vgpu_submit_execlist+0xc0/0x2a0 [i915]
[  163.542621]  elsp_mmio_write+0xcb/0x140 [i915]
[  163.547093]  intel_vgpu_mmio_reg_rw+0x250/0x4f0 [i915]
[  163.552261]  intel_vgpu_emulate_mmio_write+0xaa/0x240 [i915]
[  163.557938]  intel_vgpu_rw+0x200/0x250 [kvmgt]
[  163.562396]  intel_vgpu_write+0x164/0x1f0 [kvmgt]
[  163.567114]  __vfs_write+0x33/0x170
[  163.570614]  ? common_file_perm+0x68/0x250
[  163.

Re: [PATCH 00/17] Include linux trace docs to Sphinx TOC tree

2018-03-07 Thread Du, Changbin
On Wed, Mar 07, 2018 at 10:46:49AM -0700, Jonathan Corbet wrote:
> On Tue, 27 Feb 2018 17:43:37 -0500
> Steven Rostedt  wrote:
> 
> > On Tue, 27 Feb 2018 17:34:22 +0800
> > "Du, Changbin"  wrote:
> >
> > > > Ten days have passed; will you accept this series? Thank you!
> > 
> > Currently I'm very overloaded with other code that needs to get done
> > ASAP, and I need to balance what is critical and what is not. I don't
> > have time to review this, as this isn't critical, and can wait.
> > 
> > If Jon can review it to make sure that it doesn't change the
> > readability of the text, then I'll trust his judgment. 
> 
> So I've spent a while working through the patches.  I think it's a
> well-done RST conversion carried out with a light hand; I do not believe
> there are any readability issues with the resulting text files.
> 
> I will note that the series adds some new build warnings:
> 
> > Documentation/trace/events.rst:45: WARNING: Inline emphasis start-string 
> > without end-string.
> > Documentation/trace/events.rst:49: WARNING: Inline emphasis start-string 
> > without end-string.
> > Documentation/trace/events.rst:193: WARNING: Inline emphasis start-string 
> > without end-string.
> > Documentation/trace/events.rst:114: WARNING: Unknown target name: "common".
> > Documentation/trace/ftrace.rst:2620: WARNING: Inline emphasis start-string 
> > without end-string.
> 
> These point to places where the resulting formatted docs are, in fact,
> incorrect.  I had to append the attached patch to the series to make those
> problems go away.  The warnings are there for a purpose!
> 
> Anyway, with that, the patch series is applied.  Thanks for helping to
> improve the docs, and my apologies for taking so long to get to this.
> 
> jon
> 
I also appreciate your review. And I am glad to see these docs can appear
in the new beautiful HTML documentation! Thank you.

- changbin


Re: [PATCH v2 0/2] perf sched map: re-annotate shortname if thread comm changed

2018-03-06 Thread Du, Changbin
On Tue, Mar 06, 2018 at 11:17:07AM -0300, Arnaldo Carvalho de Melo wrote:
> Em Tue, Mar 06, 2018 at 08:53:02AM +0100, Jiri Olsa escreveu:
> > On Tue, Mar 06, 2018 at 11:37:35AM +0800, changbin...@intel.com wrote:
> > > From: Changbin Du 
> > > 
> > > v2:
> > >   o add a patch to move thread::shortname to thread_runtime
> > >   o add function perf_sched__process_comm() to process PERF_RECORD_COMM 
> > > event.
> > > 
> > > Changbin Du (2):
> > >   perf sched: move thread::shortname to thread_runtime
> > >   perf sched map: re-annotate shortname if thread comm changed
> > 
> > Acked-by: Jiri Olsa 
> 
> Thanks, applied both, the final layout for 'struct thread_runtime':
> 
> [root@jouet perf]# pahole -C thread_runtime ~/bin/perf
> struct thread_runtime {
>   u64last_time;/* 0 8 */
>   u64dt_run;   /* 8 8 */
>   u64dt_sleep; /*16 8 */
>   u64dt_iowait;/*24 8 */
>   u64dt_preempt;   /*32 8 */
>   u64dt_delay; /*40 8 */
>   u64ready_to_run; /*48 8 */
>   struct stats   run_stats;/*5640 */
>   /* --- cacheline 1 boundary (64 bytes) was 32 bytes ago --- */
>   u64total_run_time;   /*96 8 */
>   u64total_sleep_time; /*   104 8 */
>   u64total_iowait_time;/*   112 8 */
>   u64total_preempt_time;   /*   120 8 */
>   /* --- cacheline 2 boundary (128 bytes) --- */
>   u64total_delay_time; /*   128 8 */
>   intlast_state;   /*   136 4 */
>   char   shortname[3]; /*   140 3 */
>   _Bool  comm_changed; /*   143 1 */
>   u64migrations;   /*   144 8 */
> 
>   /* size: 152, cachelines: 3, members: 17 */
>   /* last cacheline: 24 bytes */
> };
> [root@jouet perf]#

Hi Arnaldo, thanks for your patient optimization for this!

-- 
Thanks,
Changbin Du


Re: [RESEND PATCH] perf sched map: re-annotate shortname if thread comm changed

2018-03-05 Thread Du, Changbin
I just finished the final version, please check v2. Thanks for your comments!

On Mon, Mar 05, 2018 at 11:37:54PM +0100, Jiri Olsa wrote:
> On Mon, Mar 05, 2018 at 03:11:36PM +0800, Du, Changbin wrote:
> 
> SNIP
> 
> > > > on the other hand it's simple enough and looks
> > > > like generic solution would be more tricky
> > > 
> > > What about adding perf_sched__process_comm() to set it in the
> > > thread::priv?
> > >
> > > It can be done; then thread->comm_changed moves to
> > > thread_runtime->comm_changed.
> > > Draft code is below. It is also a little tricky.
> > 
> > +int perf_sched__process_comm(struct perf_tool *tool __maybe_unused,
> > +union perf_event *event,
> > +struct perf_sample *sample,
> > +struct machine *machine)
> > +{
> > +   struct thread *thread;
> > +   struct thread_runtime *r;
> > +
> > +   perf_event__process_comm(tool, event, sample, machine);
> > +
> > +   thread = machine__findnew_thread(machine, pid, tid);
> 
> should you use machine__find_thread in here?
> 
> > +   if (thread) {
> > +   r = thread__priv(thread);
> > +   if (r)
> > +   r->comm_changed = true;
> > +   thread__put(thread);
> > +   }
> > +}
> > +
> >  static int perf_sched__read_events(struct perf_sched *sched)
> >  {
> > const struct perf_evsel_str_handler handlers[] = {
> > @@ -3291,7 +3311,7 @@ int cmd_sched(int argc, const char **argv)
> > struct perf_sched sched = {
> > .tool = {
> > .sample  = 
> > perf_sched__process_tracepoint_sample,
> > -   .comm= perf_event__process_comm,
> > +   .comm= perf_sched__process_comm,
> > 
> > 
> > > But I'd keep 'comm_changed' where 'shortname' is defined. I think they
> > > should appear together. And 'shortname' is only used by the sched command, too.
> 
> they can both go to struct thread_runtime then
> 
> > 
> > > So I still prefer my previous simpler change. Thanks!
> 
> > I was wrong thinking that the amount of code
> > making it sched specific would be bigger
> 
> we're trying to keep the core structs generic,
> so this one fits better 
> 
> thanks,
> jirka

-- 
Thanks,
Changbin Du


Re: [RESEND PATCH] perf sched map: re-annotate shortname if thread comm changed

2018-03-04 Thread Du, Changbin
On Fri, Mar 02, 2018 at 11:43:12AM -0300, Arnaldo Carvalho de Melo wrote:
> Em Fri, Feb 23, 2018 at 07:40:40PM +0800, changbin...@intel.com escreveu:
> > From: Changbin Du 
> > 
> > > This is to show the real name of a thread that was created via fork-exec.
> > See below example for shortname *A0*.
> 
> > Can you elaborate a bit more and perhaps provide before and after
> > outputs?
> 
> - Arnaldo
>
Arnaldo, please see the diff of the output below.
  *A0   80393.050639 secs A0 => perf:22368
  *.   A0   80393.050748 secs .  => swapper:0
   .  *.80393.050887 secs
  *B0  .   .80393.052735 secs B0 => rcu_sched:8
  *.   .   .80393.052743 secs
   .  *C0  .80393.056264 secs C0 => kworker/2:1H:287
   .  *A0  .80393.056270 secs
   .  *D0  .80393.056769 secs D0 => ksoftirqd/2:22
-  .  *A0  .80393.056804 secs
+  .  *A0  .80393.056804 secs A0 => pi:22368
   .  *.   .80393.056854 secs


> > $ sudo ./perf sched map
> >   *A0   80393.050639 secs A0 => perf:22368
> >   *.   A0   80393.050748 secs .  => swapper:0
> >.  *.80393.050887 secs
> >   *B0  .   .80393.052735 secs B0 => rcu_sched:8
> >   *.   .   .80393.052743 secs
> >.  *C0  .80393.056264 secs C0 => kworker/2:1H:287
> >.  *A0  .80393.056270 secs
> >.  *D0  .80393.056769 secs D0 => ksoftirqd/2:22
> >.  *A0  .80393.056804 secs A0 => pi:22368
> >.  *.   .80393.056854 secs
> >   *B0  .   .80393.060727 secs
> >   ...
> > 
> > Signed-off-by: Changbin Du 
> > ---
> >  tools/perf/builtin-sched.c | 4 +++-
> >  tools/perf/util/thread.c   | 1 +
> >  tools/perf/util/thread.h   | 1 +
> >  3 files changed, 5 insertions(+), 1 deletion(-)
> > 
> > diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
> > index 83283fe..53bb8df 100644
> > --- a/tools/perf/builtin-sched.c
> > +++ b/tools/perf/builtin-sched.c
> > @@ -1580,7 +1580,7 @@ static int map_switch_event(struct perf_sched *sched, 
> > struct perf_evsel *evsel,
> >  
> > timestamp__scnprintf_usec(timestamp, stimestamp, sizeof(stimestamp));
> > color_fprintf(stdout, color, "  %12s secs ", stimestamp);
> > -   if (new_shortname || (verbose > 0 && sched_in->tid)) {
> > +   if (new_shortname || sched_in->comm_changed || (verbose > 0 && 
> > sched_in->tid)) {
> > const char *pid_color = color;
> >  
> > if (thread__has_color(sched_in))
> > @@ -1588,6 +1588,8 @@ static int map_switch_event(struct perf_sched *sched, 
> > struct perf_evsel *evsel,
> >  
> > color_fprintf(stdout, pid_color, "%s => %s:%d",
> >sched_in->shortname, thread__comm_str(sched_in), 
> > sched_in->tid);
> > +
> > +   sched_in->comm_changed = false;
> > }
> >  
> > if (sched->map.comp && new_cpu)
> > diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c
> > index 68b65b1..c660fe6 100644
> > --- a/tools/perf/util/thread.c
> > +++ b/tools/perf/util/thread.c
> > @@ -212,6 +212,7 @@ static int thread__set_comm(struct thread *thread, 
> > const char *str,
> > unwind__flush_access(thread);
> > }
> >  
> > +   thread->comm_changed = true;
> > thread->comm_set = true;
> >  
> > return 0;
> > diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h
> > index 40cfa36..b9a328b 100644
> > --- a/tools/perf/util/thread.h
> > +++ b/tools/perf/util/thread.h
> > @@ -27,6 +27,7 @@ struct thread {
> > int cpu;
> > refcount_t  refcnt;
> > charshortname[3];
> > +   boolcomm_changed;
> > boolcomm_set;
> > int comm_len;
> > booldead; /* if set thread has exited */
> > -- 
> > 2.7.4

-- 
Thanks,
Changbin Du


Re: [RESEND PATCH] perf sched map: re-annotate shortname if thread comm changed

2018-03-04 Thread Du, Changbin
Hi,
On Fri, Mar 02, 2018 at 11:47:32PM +0900, Namhyung Kim wrote:
> Hi,
> 
> On Fri, Mar 02, 2018 at 12:38:45PM +0100, Jiri Olsa wrote:
> > On Fri, Mar 02, 2018 at 06:52:54PM +0800, Du, Changbin wrote:
> > > Hello, any comment?
> > 
> > sry, overlooked this one
> > 
> > SNIP
> > 
> > > > diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c
> > > > index 68b65b1..c660fe6 100644
> > > > --- a/tools/perf/util/thread.c
> > > > +++ b/tools/perf/util/thread.c
> > > > @@ -212,6 +212,7 @@ static int thread__set_comm(struct thread 
> > > > *thread, const char *str,
> > > > unwind__flush_access(thread);
> > > > }
> > > >  
> > > > +   thread->comm_changed = true;
> > > > thread->comm_set = true;
> > > >  
> > > > return 0;
> > > > diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h
> > > > index 40cfa36..b9a328b 100644
> > > > --- a/tools/perf/util/thread.h
> > > > +++ b/tools/perf/util/thread.h
> > > > @@ -27,6 +27,7 @@ struct thread {
> > > > int cpu;
> > > > refcount_t  refcnt;
> > > > charshortname[3];
> > > > +   boolcomm_changed;
> > 
> > I don't like that it's in struct thread and set by generic function,
> > and just one command (sched) checks/sets it back.. I'd rather see it
> > in thread::priv area..
> 
> 100% agreed.
> 
> 
> > on the other hand it's simple enough and looks
> > like generic solution would be more tricky
> 
> What about adding perf_sched__process_comm() to set it in the
> thread::priv?
>
It can be done; then thread->comm_changed moves to thread_runtime->comm_changed.
Draft code is below. It is also a little tricky.

+int perf_sched__process_comm(struct perf_tool *tool __maybe_unused,
+union perf_event *event,
+struct perf_sample *sample,
+struct machine *machine)
+{
+   struct thread *thread;
+   struct thread_runtime *r;
+
+   perf_event__process_comm(tool, event, sample, machine);
+
+   thread = machine__findnew_thread(machine, pid, tid);
+   if (thread) {
+   r = thread__priv(thread);
+   if (r)
+   r->comm_changed = true;
+   thread__put(thread);
+   }
+}
+
 static int perf_sched__read_events(struct perf_sched *sched)
 {
const struct perf_evsel_str_handler handlers[] = {
@@ -3291,7 +3311,7 @@ int cmd_sched(int argc, const char **argv)
struct perf_sched sched = {
.tool = {
.sample  = 
perf_sched__process_tracepoint_sample,
-   .comm= perf_event__process_comm,
+   .comm= perf_sched__process_comm,


But I'd keep 'comm_changed' where 'shortname' is defined. I think they should
appear together. And 'shortname' is only used by the sched command, too.

So I still prefer my previous simpler change. Thanks!

> Thanks,
> Namhyung

-- 
Thanks,
Changbin Du


Re: [RESEND PATCH] perf sched map: re-annotate shortname if thread comm changed

2018-03-02 Thread Du, Changbin
Hello, any comment?

On Fri, Feb 23, 2018 at 07:40:40PM +0800, changbin...@intel.com wrote:
> From: Changbin Du 
> 
> > This is to show the real name of a thread that was created via fork-exec.
> See below example for shortname *A0*.
> 
> $ sudo ./perf sched map
>   *A0   80393.050639 secs A0 => perf:22368
>   *.   A0   80393.050748 secs .  => swapper:0
>.  *.80393.050887 secs
>   *B0  .   .80393.052735 secs B0 => rcu_sched:8
>   *.   .   .80393.052743 secs
>.  *C0  .80393.056264 secs C0 => kworker/2:1H:287
>.  *A0  .80393.056270 secs
>.  *D0  .80393.056769 secs D0 => ksoftirqd/2:22
>.  *A0  .80393.056804 secs A0 => pi:22368
>.  *.   .80393.056854 secs
>   *B0  .   .80393.060727 secs
>   ...
> 
> Signed-off-by: Changbin Du 
> ---
>  tools/perf/builtin-sched.c | 4 +++-
>  tools/perf/util/thread.c   | 1 +
>  tools/perf/util/thread.h   | 1 +
>  3 files changed, 5 insertions(+), 1 deletion(-)
> 
> diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
> index 83283fe..53bb8df 100644
> --- a/tools/perf/builtin-sched.c
> +++ b/tools/perf/builtin-sched.c
> @@ -1580,7 +1580,7 @@ static int map_switch_event(struct perf_sched *sched, 
> struct perf_evsel *evsel,
>  
>   timestamp__scnprintf_usec(timestamp, stimestamp, sizeof(stimestamp));
>   color_fprintf(stdout, color, "  %12s secs ", stimestamp);
> - if (new_shortname || (verbose > 0 && sched_in->tid)) {
> + if (new_shortname || sched_in->comm_changed || (verbose > 0 && 
> sched_in->tid)) {
>   const char *pid_color = color;
>  
>   if (thread__has_color(sched_in))
> @@ -1588,6 +1588,8 @@ static int map_switch_event(struct perf_sched *sched, 
> struct perf_evsel *evsel,
>  
>   color_fprintf(stdout, pid_color, "%s => %s:%d",
>  sched_in->shortname, thread__comm_str(sched_in), 
> sched_in->tid);
> +
> + sched_in->comm_changed = false;
>   }
>  
>   if (sched->map.comp && new_cpu)
> diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c
> index 68b65b1..c660fe6 100644
> --- a/tools/perf/util/thread.c
> +++ b/tools/perf/util/thread.c
> @@ -212,6 +212,7 @@ static int thread__set_comm(struct thread *thread, 
> const char *str,
>   unwind__flush_access(thread);
>   }
>  
> + thread->comm_changed = true;
>   thread->comm_set = true;
>  
>   return 0;
> diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h
> index 40cfa36..b9a328b 100644
> --- a/tools/perf/util/thread.h
> +++ b/tools/perf/util/thread.h
> @@ -27,6 +27,7 @@ struct thread {
>   int cpu;
>   refcount_t  refcnt;
>   charshortname[3];
> + boolcomm_changed;
>   boolcomm_set;
>   int comm_len;
>   booldead; /* if set thread has exited */
> -- 
> 2.7.4
> 

-- 
Thanks,
Changbin Du


Re: [PATCH v2] tracing/power: Polish the tracepoints cpu_idle and cpu_frequency

2018-03-02 Thread Du, Changbin
On Fri, Mar 02, 2018 at 11:39:16AM +0100, Rafael J. Wysocki wrote:
> On 3/2/2018 11:15 AM, Du, Changbin wrote:
> > On Fri, Mar 02, 2018 at 11:18:10AM +0100, Rafael J. Wysocki wrote:
> > > On Fri, Mar 2, 2018 at 10:41 AM, Du, Changbin  
> > > wrote:
> > > > > > > That rather isn't the case if negative values are ever passed to 
> > > > > > > the
> > > > > > > tracepoint, right?
> > > > > > > 
> > > > > > yes.
> > > > > > > Which seems to be the reason why you want to make this change, 
> > > > > > > isn't it?
> > > > > > > 
> > > > > > yes, to improve readability.
> > > > > > 
> > > > > > > So maybe fix the code using the tracepoint(s) to avoid passing
> > > > > > > negative values to it(them)?
> > > > > > > For cpu_idle event, [0, CPUIDLE_STATE_MAX) are used to index the
> > > > > > > idle state array,
> > > > > > > so I think an appropriate value for PWR_EVENT_EXIT is -1 (defined in
> > > > > > > include/trace/events/power.h).
> > > > > > Or do you have a better idea? Thanks!
> > > > > Sorry, I'm not sure what you mean.
> > > > > 
> > > > > I'm saying that the code using the CPU PM tracepoints is not expected
> > > > > to pass -1 as the CPU number to them.  IOW, neither -1 nor its UL
> > > > > representation should ever appear in the output of these tracepoints.
> > > > > If that happens, it is a problem with the code using the tracepoints
> > > > > which needs to be fixed.  Users should not see any of these values.
> > > > > This patch only changed the 'state' field, not cpuid. For the cpu_idle event,
> > > > > 'state' is a signed value, but for cpu_frequency it is unsigned.
> > > > > The cpuid is always an unsigned value. So no one passes -1 as the CPU number.
> > > You are right, 'state' not 'cpuid', sorry.
> > > 
> > > Negative 'state' should not be passed to these tracepoints too, though.
> > > The current situation is that 'state' can be negative for event cpu_idle :(.
> > > This is why I made this change.
> > 
> And which is why I said that IMO it would be better to change the current
> situation.
> 
> Your patch makes the results of it slightly less confusing to a human reader
> of the tracepoint output, but the situation is still unchanged after it.
> 
> And what if someone has a script built around these tracepoints that knows
> how to handle the UL representation of -1, but doesn't know how to parse
> "-1"?  They would need to update the script after your change, wouldn't
> they?  And why would it be OK to inflict that work on them just to improve
> the readability of the output for humans?
>
Yeah, I can guarantee that all in-kernel tools get updated, but not people's private scripts.
For me, I just read the raw event for debugging purposes. It is fair enough to leave
the code as it was, considering users' private tools based on this event.

> 
> 

-- 
Thanks,
Changbin Du


Re: [PATCH v2] tracing/power: Polish the tracepoints cpu_idle and cpu_frequency

2018-03-02 Thread Du, Changbin
On Fri, Mar 02, 2018 at 11:18:10AM +0100, Rafael J. Wysocki wrote:
> On Fri, Mar 2, 2018 at 10:41 AM, Du, Changbin  wrote:
> >> >> That rather isn't the case if negative values are ever passed to the
> >> >> tracepoint, right?
> >> >>
> >> > yes.
> >> >> Which seems to be the reason why you want to make this change, isn't it?
> >> >>
> >> > yes, to improve readability.
> >> >
> >> >> So maybe fix the code using the tracepoint(s) to avoid passing
> >> >> negative values to it(them)?
> > >> > For cpu_idle event, [0, CPUIDLE_STATE_MAX) are used to index the idle
> > >> > state array,
> > >> > so I think an appropriate value for PWR_EVENT_EXIT is -1 (defined in
> > >> > include/trace/events/power.h).
> >> > Or do you have a better idea? Thanks!
> >>
> >> Sorry, I'm not sure what you mean.
> >>
> >> I'm saying that the code using the CPU PM tracepoints is not expected
> >> to pass -1 as the CPU number to them.  IOW, neither -1 nor its UL
> >> representation should ever appear in the output of these tracepoints.
> >> If that happens, it is a problem with the code using the tracepoints
> >> which needs to be fixed.  Users should not see any of these values.
> >
> > > This patch only changed the 'state' field, not cpuid. For the cpu_idle event,
> > > 'state' is a signed value, but for cpu_frequency it is unsigned.
> > > The cpuid is always an unsigned value. So no one passes -1 as the CPU number.
> 
> You are right, 'state' not 'cpuid', sorry.
> 
> Negative 'state' should not be passed to these tracepoints too, though.

The current situation is that 'state' can be negative for event cpu_idle :(.
This is why I made this change.

-- 
Thanks,
Changbin Du


Re: [PATCH v2] tracing/power: Polish the tracepoints cpu_idle and cpu_frequency

2018-03-02 Thread Du, Changbin
> >> That rather isn't the case if negative values are ever passed to the
> >> tracepoint, right?
> >>
> > yes.
> >> Which seems to be the reason why you want to make this change, isn't it?
> >>
> > yes, to improve readability.
> >
> >> So maybe fix the code using the tracepoint(s) to avoid passing
> >> negative values to it(them)?
> > > For cpu_idle event, [0, CPUIDLE_STATE_MAX) are used to index the idle state
> > > array,
> > > so I think an appropriate value for PWR_EVENT_EXIT is -1 (defined in
> > > include/trace/events/power.h).
> > Or do you have a better idea? Thanks!
> 
> Sorry, I'm not sure what you mean.
> 
> I'm saying that the code using the CPU PM tracepoints is not expected
> to pass -1 as the CPU number to them.  IOW, neither -1 nor its UL
> representation should ever appear in the output of these tracepoints.
> If that happens, it is a problem with the code using the tracepoints
> which needs to be fixed.  Users should not see any of these values.

This patch only changed the 'state' field, not cpuid. For the cpu_idle event, 'state' is
a signed value, but for cpu_frequency it is unsigned.
The cpuid is always an unsigned value. So no one passes -1 as the CPU number.

-- 
Thanks,
Changbin Du


Re: [PATCH v2] tracing/power: Polish the tracepoints cpu_idle and cpu_frequency

2018-02-28 Thread Du, Changbin
On Wed, Feb 28, 2018 at 10:14:41AM +0100, Rafael J. Wysocki wrote:
> On 2/28/2018 3:45 AM, Du, Changbin wrote:
> > On Tue, Feb 27, 2018 at 05:39:38PM -0500, Steven Rostedt wrote:
> > > On Tue, 27 Feb 2018 17:35:27 +0800
> > > "Du, Changbin"  wrote:
> > > 
> > > > >  From the tracing perspective:
> > > > > 
> > > > > Acked-by: Steven Rostedt (VMware) 
> > > > > 
> > > > > -- Steve
> > > > > Hi Steve, will you pick this up, or will someone else?
> > > I maintain the tracing infrastructure, but the tracing use cases are
> > > maintained by the maintainers of the users of the trace events. That
> > > is, who added these trace events? They are the ones most affected by
> > > these changes.
> > > 
> > > For example, it looks like Rafael J. Wysocki, is the one that added
> > > trace_cpu_frequency(). He's the one that is affected by this change,
> > > and is the one that you need to have take it.
> > > 
> > Got it, thanks!
> > 
> > Hi Wysocki, could you take a look?
> 
> Please send the patch(es) to linux...@vger.kernel.org with a CC to me and I
> will take care of them.
> 
sure~

> Thanks,
> Rafael
> 

-- 
Thanks,
Changbin Du


Re: [PATCH v2] tracing/power: Polish the tracepoints cpu_idle and cpu_frequency

2018-02-27 Thread Du, Changbin
On Tue, Feb 27, 2018 at 05:39:38PM -0500, Steven Rostedt wrote:
> On Tue, 27 Feb 2018 17:35:27 +0800
> "Du, Changbin"  wrote:
> 
> > > From the tracing perspective:
> > > 
> > > Acked-by: Steven Rostedt (VMware) 
> > > 
> > > -- Steve
> > >  
> > > Hi Steve, will you pick this up, or will someone else?
> 
> I maintain the tracing infrastructure, but the tracing use cases are
> maintained by the maintainers of the users of the trace events. That
> is, who added these trace events? They are the ones most affected by
> these changes.
> 
> For example, it looks like Rafael J. Wysocki, is the one that added
> trace_cpu_frequency(). He's the one that is affected by this change,
> and is the one that you need to have take it.
>
Got it, thanks!

Hi Wysocki, could you take a look?

> -- Steve
> 

-- 
Thanks,
Changbin Du


Re: [PATCH 00/17] Include linux trace docs to Sphinx TOC tree

2018-02-27 Thread Du, Changbin
Hello Steven and Corbet,
Ten days have passed; will you accept this series? Thank you!

On Sat, Feb 17, 2018 at 01:39:33PM +0800, changbin...@intel.com wrote:
> From: Changbin Du 
> 
> Hi All,
> > The Linux tracers are so useful that I want to make the docs better. The kernel
> > now uses Sphinx to generate intelligent and beautiful documentation from
> > reStructuredText files. I converted most of the Linux trace docs to rst format
> > in this series.
> 
> For you to preview, please visit below url:
> http://docservice.askxiong.com/linux-kernel/trace/index.html
> 
> Thank you!
> 
> Changbin Du (17):
>   Documentation: add Linux tracing to Sphinx TOC tree
>   trace doc: convert trace/ftrace-design.txt to rst format
>   trace doc: add ftrace-uses.rst to doc tree
>   trace doc: convert trace/tracepoint-analysis.txt to rst format
>   trace doc: convert trace/ftrace.txt to rst format
>   trace doc: convert trace/kprobetrace.txt to rst format
>   trace doc: convert trace/uprobetracer.txt to rst format
>   trace doc: convert trace/tracepoints.txt to rst format
>   trace doc: convert trace/events.txt to rst format
>   trace doc: convert trace/events-kmem.txt to rst format
>   trace doc: convert trace/events-power.txt to rst format
>   trace doc: convert trace/events-nmi.txt to rst format
>   trace doc: convert trace/events-msr.txt to rst format
>   trace doc: convert trace/mmiotrace.txt to rst format
>   trace doc: convert trace/hwlat_detector.txt to rst fromat
>   trace doc: convert trace/intel_th.txt to rst format
>   trace doc: convert trace/stm.txt to rst format
> 
>  Documentation/index.rst|1 +
>  .../trace/{events-kmem.txt => events-kmem.rst} |   50 +-
>  Documentation/trace/events-msr.rst |   40 +
>  Documentation/trace/events-msr.txt |   37 -
>  Documentation/trace/events-nmi.rst |   45 +
>  Documentation/trace/events-nmi.txt |   43 -
>  .../trace/{events-power.txt => events-power.rst}   |   52 +-
>  Documentation/trace/{events.txt => events.rst} |  669 ++--
>  .../trace/{ftrace-design.txt => ftrace-design.rst} |  252 +-
>  Documentation/trace/ftrace-uses.rst|   23 +-
>  Documentation/trace/ftrace.rst | 3332 
> 
>  Documentation/trace/ftrace.txt | 3220 ---
>  .../{hwlat_detector.txt => hwlat_detector.rst} |   26 +-
>  Documentation/trace/index.rst  |   23 +
>  Documentation/trace/{intel_th.txt => intel_th.rst} |   43 +-
>  .../trace/{kprobetrace.txt => kprobetrace.rst} |  100 +-
>  .../trace/{mmiotrace.txt => mmiotrace.rst} |   86 +-
>  Documentation/trace/{stm.txt => stm.rst}   |   23 +-
>  ...epoint-analysis.txt => tracepoint-analysis.rst} |   41 +-
>  .../trace/{tracepoints.txt => tracepoints.rst} |   77 +-
>  .../trace/{uprobetracer.txt => uprobetracer.rst}   |   44 +-
>  21 files changed, 4237 insertions(+), 3990 deletions(-)
>  rename Documentation/trace/{events-kmem.txt => events-kmem.rst} (76%)
>  create mode 100644 Documentation/trace/events-msr.rst
>  delete mode 100644 Documentation/trace/events-msr.txt
>  create mode 100644 Documentation/trace/events-nmi.rst
>  delete mode 100644 Documentation/trace/events-nmi.txt
>  rename Documentation/trace/{events-power.txt => events-power.rst} (65%)
>  rename Documentation/trace/{events.txt => events.rst} (82%)
>  rename Documentation/trace/{ftrace-design.txt => ftrace-design.rst} (74%)
>  create mode 100644 Documentation/trace/ftrace.rst
>  delete mode 100644 Documentation/trace/ftrace.txt
>  rename Documentation/trace/{hwlat_detector.txt => hwlat_detector.rst} (83%)
>  create mode 100644 Documentation/trace/index.rst
>  rename Documentation/trace/{intel_th.txt => intel_th.rst} (82%)
>  rename Documentation/trace/{kprobetrace.txt => kprobetrace.rst} (63%)
>  rename Documentation/trace/{mmiotrace.txt => mmiotrace.rst} (78%)
>  rename Documentation/trace/{stm.txt => stm.rst} (91%)
>  rename Documentation/trace/{tracepoint-analysis.txt => 
> tracepoint-analysis.rst} (93%)
>  rename Documentation/trace/{tracepoints.txt => tracepoints.rst} (74%)
>  rename Documentation/trace/{uprobetracer.txt => uprobetracer.rst} (86%)
> 
> -- 
> 2.7.4
> 

-- 
Thanks,
Changbin Du


Re: [PATCH 05/17] trace doc: convert trace/ftrace.txt to rst format

2018-02-20 Thread Du, Changbin
Hi,
On Tue, Feb 20, 2018 at 08:28:24AM +0100, Philippe Ombredanne wrote:
> Changbin, Steven,
> 
> On Sat, Feb 17, 2018 at 6:39 AM,   wrote:
> > From: Changbin Du 
> >
> > This converts the plain text documentation to reStructuredText format and
> > add it into Sphinx TOC tree. No essential content change.
> >
> > Cc: Steven Rostedt 
> > Signed-off-by: Changbin Du 
> > ---
> >  Documentation/trace/ftrace.rst | 3332 
> > 
> >  Documentation/trace/ftrace.txt | 3220 
> > --
> >  Documentation/trace/index.rst  |1 +
> >  3 files changed,  insertions(+), 3220 deletions(-)
> >  create mode 100644 Documentation/trace/ftrace.rst
> >  delete mode 100644 Documentation/trace/ftrace.txt
> >
> > diff --git a/Documentation/trace/ftrace.rst b/Documentation/trace/ftrace.rst
> > new file mode 100644
> > index 000..636aa9bf
> > --- /dev/null
> > +++ b/Documentation/trace/ftrace.rst
> > @@ -0,0 +1,3332 @@
> > +
> > +ftrace - Function Tracer
> > +
> > +
> > +Copyright 2008 Red Hat Inc.
> > +
> > +:Author:   Steven Rostedt 
> > +:License:  The GNU Free Documentation License, Version 1.2
> > +  (dual licensed under the GPL v2)
> 
> 
> Do you mind using an SPDX id per [1] rather that this?
> 
> Steven, are you OK with this? Can you ack?
> 
> [1] 
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/process/license-rules.rst
> -- 
> Cordially
> Philippe Ombredanne

I leave this to Steven, since I just converted the format of this doc.

-- 
Thanks,
Changbin Du


Re: [PATCH v3] perf ftrace: Append an EOL when write tracing files

2018-02-18 Thread Du, Changbin
On Mon, Feb 19, 2018 at 10:21:34AM +0900, Namhyung Kim wrote:
> Hello,
> 
> On Wed, Feb 14, 2018 at 10:44:24AM +0800, changbin...@intel.com wrote:
> > From: Changbin Du 
> > 
> > Before this change, the '--graph-funcs', '--nograph-funcs' and
> > '--trace-funcs' options didn't work as expected when the  doesn't
> > exist. Because the kernel side hid possible errors.
> > 
> > $ sudo ./perf ftrace -a --graph-depth 1 --graph-funcs abcdefg
> >  0)   0.140 us|  rcu_all_qs();
> >  3)   0.304 us|  mutex_unlock();
> >  0)   0.153 us|  find_vma();
> >  3)   0.088 us|  __fsnotify_parent();
> >  0)   6.145 us|  handle_mm_fault();
> >  3)   0.089 us|  fsnotify();
> >  3)   0.161 us|  __sb_end_write();
> >  3)   0.710 us|  SyS_close();
> >  3)   7.848 us|  exit_to_usermode_loop();
> > 
> > On above example, I specified function filter 'abcdefg' but all functions
> > are enabled. The expected error is hidden.
> > 
> > The original fix is to make the kernel support '\0' as end of string:
> > https://lkml.org/lkml/2018/1/16/116
> > 
> > But above fix cannot be compatible with old kernels. Then Namhyung Kim
> > suggest adding a space after function name.
> > 
> > This patch will append an '\n' when write tracing file. After this fix,
> > the perf will report correct error state. Also let it print an error if
> > reset_tracing_files() fails.
> > 
> > Cc: Namhyung Kim 
> > Signed-off-by: Changbin Du 
> > 
> > ---
> > v3: Took Kim's suggestion that add a space after function name.
> > v2: Rebase.
> > ---
> >  tools/perf/builtin-ftrace.c | 15 +--
> >  1 file changed, 13 insertions(+), 2 deletions(-)
> > 
> > diff --git a/tools/perf/builtin-ftrace.c b/tools/perf/builtin-ftrace.c
> > index 25a42ac..9ffd748 100644
> > --- a/tools/perf/builtin-ftrace.c
> > +++ b/tools/perf/builtin-ftrace.c
> > @@ -72,6 +72,7 @@ static int __write_tracing_file(const char *name, const 
> > char *val, bool append)
> > ssize_t size = strlen(val);
> > int flags = O_WRONLY;
> > char errbuf[512];
> > +   char *val_copy;
> >  
> > file = get_tracing_file(name);
> > if (!file) {
> > @@ -91,12 +92,20 @@ static int __write_tracing_file(const char *name, const 
> > char *val, bool append)
> > goto out;
> > }
> >  
> > -   if (write(fd, val, size) == size)
> > +   /*
> > +* Copy the original value and append a '\n'. Without this,
> > +* the kernel can hide possible errors.
> > +*/
> > +   val_copy = strdup(val);
> 
> Please check the return value.
>
Thanks, I will update.
 
> Thanks,
> Namhyung
> 
> 
> > +   val_copy[size] = '\n';
> > +
> > +   if (write(fd, val_copy, size + 1) == size + 1)
> > ret = 0;
> > else
> > pr_debug("write '%s' to tracing/%s failed: %s\n",
> >  val, name, str_error_r(errno, errbuf, sizeof(errbuf)));
> >  
> > +   free(val_copy);
> > close(fd);
> >  out:
> > put_tracing_file(file);
> > @@ -280,8 +289,10 @@ static int __cmd_ftrace(struct perf_ftrace *ftrace, 
> > int argc, const char **argv)
> > signal(SIGCHLD, sig_handler);
> > signal(SIGPIPE, sig_handler);
> >  
> > -   if (reset_tracing_files(ftrace) < 0)
> > +   if (reset_tracing_files(ftrace) < 0) {
> > +   pr_err("failed to reset ftrace\n");
> > goto out;
> > +   }
> >  
> > /* reset ftrace buffer */
> > if (write_tracing_file("trace", "0") < 0)
> > -- 
> > 2.7.4
> > 

-- 
Thanks,
Changbin Du


Re: [PATCH 2/3] Documentation: convert trace/ftrace-design.txt to rst format

2018-02-16 Thread Du, Changbin
On Fri, Feb 16, 2018 at 12:36:29PM -0500, Steven Rostedt wrote:
> On Fri, 16 Feb 2018 05:49:52 -0700
> Jonathan Corbet  wrote:
> 
> > On Thu, 15 Feb 2018 22:57:05 -0500
> > Steven Rostedt  wrote:
> > 
> > > This document is out of date, and I rather have it updated before we
> > > make it more "available" elsewhere.  
> > 
> > Imagine that, an out-of-date doc in the kernel :)
> > 
> > Seriously, though, I'd argue that (1) it's already highly available, and
> > (2) it's useful now.  And (3) who knows when that update will happen?
> > Unless we have reason to believe that a new version is waiting on the
> > wings, I don't really see why we would want to delay this work.
> 
> Actually, some of these documents I was thinking of labeling as
> "obsolete" or simply removing them. The ftrace-design one is about
> how to port ftrace to other architectures, and I already had to correct
> people that based their work on it.
> 
> Yeah, I really need to get some time to update them, but like everyone
> else, that's just the 90th thing I have to do.
> 
> -- Steve
Reading this doc, I think most of the information is still useful for
understanding the implementation. So how about just putting a caution at the
beginning of the doc, as below, before it gets updated?
http://docservice.askxiong.com/linux-kernel/trace/ftrace-design.html

Anyway, I just converted them all. I will send them out. Please comment if some
of them should be removed.

-- 
Thanks,
Changbin Du


Re: [PATCH 2/3] Documentation: convert trace/ftrace-design.txt to rst format

2018-02-16 Thread Du, Changbin
On Thu, Feb 15, 2018 at 10:57:05PM -0500, Steven Rostedt wrote:
> On Fri, 16 Feb 2018 11:12:18 +0800
> changbin...@intel.com wrote:
> 
> > From: Changbin Du 
> > 
> > Signed-off-by: Changbin Du 
> > ---
> >  .../trace/{ftrace-design.txt => ftrace-design.rst} | 248 
> > +++--
> >  Documentation/trace/index.rst  |   2 +
> >  2 files changed, 137 insertions(+), 113 deletions(-)
> >  rename Documentation/trace/{ftrace-design.txt => ftrace-design.rst} (75%)
> > 
> > diff --git a/Documentation/trace/ftrace-design.txt 
> > b/Documentation/trace/ftrace-design.rst
> > similarity index 75%
> > rename from Documentation/trace/ftrace-design.txt
> > rename to Documentation/trace/ftrace-design.rst
> > index a273dd0..0f32a85 100644
> > --- a/Documentation/trace/ftrace-design.txt
> > +++ b/Documentation/trace/ftrace-design.rst
> > @@ -1,6 +1,8 @@
> > -   function tracer guts
> > -   
> > -   By Mike Frysinger
> > +==
> > +Function Tracer Design
> > +==
> > +
> > +:Author: Mike Frysinger
> >  
> 
> This document is out of date, and I rather have it updated before we
> make it more "available" elsewhere.
>
Got you. I plan to convert below docs. Are they out of date, too?

events-msr.txt, events.txt, mmiotrace.txt, stm.txt, uprobetracer.txt
events-nmi.txt, intel_th.txt, tracepoint-analysis.txt, events-kmem.txt,
events-power.txt, ftrace.txt, hwlat_detector.txt, kprobetrace.txt,
tracepoints.txt.
 
> -- Steve

-- 
Thanks,
Changbin Du


Re: [PATCH] tracing/power: Don't share template for cpu_idle and cpu_frequency

2018-02-12 Thread Du, Changbin
Thanks, I will improve this change in v2. And also update related docs.

On Mon, Feb 12, 2018 at 12:04:52PM -0500, Steven Rostedt wrote:
> On Sun, 11 Feb 2018 18:50:04 +0800
> "Du, Changbin"  wrote:
> 
> > Steve, How abount DEFINE_EVENT_PRINT as below?
> 
> Yes, DEFINE_EVENT_PRINT is better.
> 
> > 
> > diff --git a/include/trace/events/power.h b/include/trace/events/power.h
> > index 908977d..e71ce98 100644
> > --- a/include/trace/events/power.h
> > +++ b/include/trace/events/power.h
> > @@ -14,12 +14,12 @@
> > 
> >  DECLARE_EVENT_CLASS(cpu,
> > 
> > -   TP_PROTO(unsigned int state, unsigned int cpu_id),
> > +   TP_PROTO(int state, unsigned int cpu_id),
> > 
> > TP_ARGS(state, cpu_id),
> > 
> > TP_STRUCT__entry(
> > -   __field(u32,state   )
> > +   __field(s32,state   )
> > __field(u32,cpu_id  )
> > ),
> > 
> > @@ -28,13 +28,12 @@ DECLARE_EVENT_CLASS(cpu,
> > __entry->cpu_id = cpu_id;
> > ),
> > 
> > -   TP_printk("state=%lu cpu_id=%lu", (unsigned long)__entry->state,
> > - (unsigned long)__entry->cpu_id)
> 
> You still need the type casting, because s32/u32 on 32 bit machines
> can be defined as "long".
> 
> -- Steve
> 
> > +   TP_printk("state=%d cpu_id=%u", __entry->state, __entry->cpu_id)
> >  );
> > 
> >  DEFINE_EVENT(cpu, cpu_idle,
> > 
> > -   TP_PROTO(unsigned int state, unsigned int cpu_id),
> > +   TP_PROTO(int state, unsigned int cpu_id),
> > 
> > TP_ARGS(state, cpu_id)
> >  );
> > @@ -141,11 +140,13 @@ TRACE_EVENT(pstate_sample,
> > { PM_EVENT_RESTORE, "restore" }, \
> > { PM_EVENT_RECOVER, "recover" })
> > 
> > -DEFINE_EVENT(cpu, cpu_frequency,
> > +DEFINE_EVENT_PRINT(cpu, cpu_frequency,
> > 
> > -   TP_PROTO(unsigned int frequency, unsigned int cpu_id),
> > +   TP_PROTO(int state, unsigned int cpu_id),
> > 
> > -   TP_ARGS(frequency, cpu_id)
> > +   TP_ARGS(state, cpu_id),
> > +
> > +   TP_printk("frequency=%u cpu_id=%lu", __entry->state, 
> > __entry->cpu_id)
> >  );
> 

-- 
Thanks,
Changbin Du


Re: [PATCH v2] perf ftrace: Fix the buffer size in __write_tracing_file

2018-02-12 Thread Du, Changbin
On Sun, Feb 11, 2018 at 10:15:10PM -0800, Namhyung Kim wrote:
> On Mon, Feb 12, 2018 at 12:48:15PM +0800, Du, Changbin wrote:
> > Hi,
> > 
> > On Mon, Feb 12, 2018 at 10:55:27AM +0900, Namhyung Kim wrote:
> > > Hello,
> > > 
> > > On Thu, Feb 08, 2018 at 04:13:20PM +0800, changbin...@intel.com wrote:
> > > > From: Changbin Du 
> > > > 
> > > > The terminal character '\0' should take into account into size of the
> > > > string buffer. Without this fix, the '--graph-funcs', '--nograph-funcs'
> > > > and '--trace-funcs' options didn't work as expected when the 
> > > > doesn't exist. If usersapce writes a non-terminated string, the kernel
> > > > side will always return success but actually no filter applied.
> > > > 
> > > > As discussed before, the kernel now support '\0' to mark the end of 
> > > > string:
> > > > https://lkml.org/lkml/2018/1/16/116
> > > > 
> > > > After this fix in userspace, the perf will report correct error state. 
> > > > Also
> > > > let it print an error if reset_tracing_files() fails.
> > > 
> > > But what about old kernels?  IIRC there was an error with this change.
> > >
> > Yes, you're right. I can't find a good compitable change. So what is the 
> > compatibilty policy for perf?
> > If it must work with recent kernel, I think the only idea is leave as it 
> > was.
> 
> It should support *both* of kernels.  I suggest adding a space after
> function name.  You can simply call write(fd, " ", 1) IMHO.
> 
Hmm, I see. I will try it later, maybe tomorrow. Thanks!

> Thanks,
> Namhyung
> 
> 
> > > 
> > > 
> > > > 
> > > > The problem:
> > > > $ sudo ./perf ftrace -a --graph-depth 1 --graph-funcs abcdefg
> > > >  0)   0.140 us|  rcu_all_qs();
> > > >  3)   0.304 us|  mutex_unlock();
> > > >  0)   0.153 us|  find_vma();
> > > >  3)   0.088 us|  __fsnotify_parent();
> > > >  0)   6.145 us|  handle_mm_fault();
> > > >  3)   0.089 us|  fsnotify();
> > > >  3)   0.161 us|  __sb_end_write();
> > > >  3)   0.710 us|  SyS_close();
> > > >  3)   7.848 us|  exit_to_usermode_loop();
> > > > 
> > > > On above example, I specified function filter 'abcdefg' but all 
> > > > functions
> > > > are enabled. The expected error is hidden.
> > > > 
> > > > Signed-off-by: Changbin Du 
> > > > ---
> > > >  tools/perf/builtin-ftrace.c | 6 --
> > > >  1 file changed, 4 insertions(+), 2 deletions(-)
> > > > 
> > > > diff --git a/tools/perf/builtin-ftrace.c b/tools/perf/builtin-ftrace.c
> > > > index 25a42ac..a87e9b3 100644
> > > > --- a/tools/perf/builtin-ftrace.c
> > > > +++ b/tools/perf/builtin-ftrace.c
> > > > @@ -69,7 +69,7 @@ static int __write_tracing_file(const char *name, 
> > > > const char *val, bool append)
> > > >  {
> > > > char *file;
> > > > int fd, ret = -1;
> > > > -   ssize_t size = strlen(val);
> > > > +   ssize_t size = strlen(val) + 1;
> > > > int flags = O_WRONLY;
> > > > char errbuf[512];
> > > >  
> > > > @@ -280,8 +280,10 @@ static int __cmd_ftrace(struct perf_ftrace 
> > > > *ftrace, int argc, const char **argv)
> > > > signal(SIGCHLD, sig_handler);
> > > > signal(SIGPIPE, sig_handler);
> > > >  
> > > > -   if (reset_tracing_files(ftrace) < 0)
> > > > +   if (reset_tracing_files(ftrace) < 0) {
> > > > +   pr_err("failed to reset ftrace\n");
> > > > goto out;
> > > > +   }
> > > >  
> > > > /* reset ftrace buffer */
> > > > if (write_tracing_file("trace", "0") < 0)
> > > > -- 
> > > > 2.7.4
> > > > 
> > 
> > -- 
> > Thanks,
> > Changbin Du

-- 
Thanks,
Changbin Du


Re: [PATCH v2] perf ftrace: Fix the buffer size in __write_tracing_file

2018-02-11 Thread Du, Changbin
Hi,

On Mon, Feb 12, 2018 at 10:55:27AM +0900, Namhyung Kim wrote:
> Hello,
> 
> On Thu, Feb 08, 2018 at 04:13:20PM +0800, changbin...@intel.com wrote:
> > From: Changbin Du 
> > 
> > The terminal character '\0' should take into account into size of the
> > string buffer. Without this fix, the '--graph-funcs', '--nograph-funcs'
> > and '--trace-funcs' options didn't work as expected when the 
> > doesn't exist. If usersapce writes a non-terminated string, the kernel
> > side will always return success but actually no filter applied.
> > 
> > As discussed before, the kernel now support '\0' to mark the end of string:
> > https://lkml.org/lkml/2018/1/16/116
> > 
> > After this fix in userspace, the perf will report correct error state. Also
> > let it print an error if reset_tracing_files() fails.
> 
> But what about old kernels?  IIRC there was an error with this change.
>
Yes, you're right. I can't find a good compatible change. So what is the
compatibility policy for perf?
If it must work with recent kernels, I think the only option is to leave it as
it was.
 
> Thanks,
> Namhyung
> 
> 
> > 
> > The problem:
> > $ sudo ./perf ftrace -a --graph-depth 1 --graph-funcs abcdefg
> >  0)   0.140 us|  rcu_all_qs();
> >  3)   0.304 us|  mutex_unlock();
> >  0)   0.153 us|  find_vma();
> >  3)   0.088 us|  __fsnotify_parent();
> >  0)   6.145 us|  handle_mm_fault();
> >  3)   0.089 us|  fsnotify();
> >  3)   0.161 us|  __sb_end_write();
> >  3)   0.710 us|  SyS_close();
> >  3)   7.848 us|  exit_to_usermode_loop();
> > 
> > On above example, I specified function filter 'abcdefg' but all functions
> > are enabled. The expected error is hidden.
> > 
> > Signed-off-by: Changbin Du 
> > ---
> >  tools/perf/builtin-ftrace.c | 6 --
> >  1 file changed, 4 insertions(+), 2 deletions(-)
> > 
> > diff --git a/tools/perf/builtin-ftrace.c b/tools/perf/builtin-ftrace.c
> > index 25a42ac..a87e9b3 100644
> > --- a/tools/perf/builtin-ftrace.c
> > +++ b/tools/perf/builtin-ftrace.c
> > @@ -69,7 +69,7 @@ static int __write_tracing_file(const char *name, const 
> > char *val, bool append)
> >  {
> > char *file;
> > int fd, ret = -1;
> > -   ssize_t size = strlen(val);
> > +   ssize_t size = strlen(val) + 1;
> > int flags = O_WRONLY;
> > char errbuf[512];
> >  
> > @@ -280,8 +280,10 @@ static int __cmd_ftrace(struct perf_ftrace *ftrace, 
> > int argc, const char **argv)
> > signal(SIGCHLD, sig_handler);
> > signal(SIGPIPE, sig_handler);
> >  
> > -   if (reset_tracing_files(ftrace) < 0)
> > +   if (reset_tracing_files(ftrace) < 0) {
> > +   pr_err("failed to reset ftrace\n");
> > goto out;
> > +   }
> >  
> > /* reset ftrace buffer */
> > if (write_tracing_file("trace", "0") < 0)
> > -- 
> > 2.7.4
> > 

-- 
Thanks,
Changbin Du


Re: [PATCH] tracing/fgraph: Missed irq return mark for leaf entry

2018-02-11 Thread Du, Changbin
Hi Rostedt,
What about this fix? Thanks!

On Wed, Jan 31, 2018 at 11:48:49PM +0800, changbin...@intel.com wrote:
> From: Changbin Du 
> 
> The fgraph forget to print irq return mark for leaf entry. Then we can see
> unbalanced irq mark in the trace. This patch fix this.
> 
> Before:
>  1)   |  SyS_write() {
>  1)   |__fdget_pos() {
>  1)   0.061 us|  __fget_light();
>  1)   0.289 us|}
>  1)   |vfs_write() {
>  1)   0.049 us|  rw_verify_area();
>  1) + 15.424 us   |  __vfs_write();
>  1)   ==> |
>  1)   6.003 us|  smp_apic_timer_interrupt();
>  1)   0.055 us|  __fsnotify_parent();
>  1)   0.073 us|  fsnotify();
>  1) + 23.665 us   |}
>  1) + 24.501 us   |  }
> 
> After:
>  0)   |  SyS_write() {
>  0)   |__fdget_pos() {
>  0)   0.052 us|  __fget_light();
>  0)   0.328 us|}
>  0)   |vfs_write() {
>  0)   0.057 us|  rw_verify_area();
>  0)   |  __vfs_write() {
>  0)   ==> |
>  0)   8.548 us|  smp_apic_timer_interrupt();
>  0)   <== |
>  0) + 36.507 us   |  } /* __vfs_write */
>  0)   0.049 us|  __fsnotify_parent();
>  0)   0.066 us|  fsnotify();
>  0) + 50.064 us   |}
>  0) + 50.952 us   |  }
> 
> Signed-off-by: Changbin Du 
> ---
>  kernel/trace/trace_functions_graph.c | 5 -
>  1 file changed, 4 insertions(+), 1 deletion(-)
> 
> diff --git a/kernel/trace/trace_functions_graph.c 
> b/kernel/trace/trace_functions_graph.c
> index 23c0b0c..169b3c4 100644
> --- a/kernel/trace/trace_functions_graph.c
> +++ b/kernel/trace/trace_functions_graph.c
> @@ -831,6 +831,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,
>   struct ftrace_graph_ret *graph_ret;
>   struct ftrace_graph_ent *call;
>   unsigned long long duration;
> + int cpu = iter->cpu;
>   int i;
>  
>   graph_ret = &ret_entry->ret;
> @@ -839,7 +840,6 @@ print_graph_entry_leaf(struct trace_iterator *iter,
>  
>   if (data) {
>   struct fgraph_cpu_data *cpu_data;
> - int cpu = iter->cpu;
>  
>   cpu_data = per_cpu_ptr(data->cpu_data, cpu);
>  
> @@ -869,6 +869,9 @@ print_graph_entry_leaf(struct trace_iterator *iter,
>  
>   trace_seq_printf(s, "%ps();\n", (void *)call->func);
>  
> + print_graph_irq(iter, graph_ret->func, TRACE_GRAPH_RET,
> + cpu, iter->ent->pid, flags);
> +
>   return trace_handle_return(s);
>  }
>  
> -- 
> 2.7.4
> 

-- 
Thanks,
Changbin Du


Re: [PATCH] tracing/power: Don't share template for cpu_idle and cpu_frequency

2018-02-11 Thread Du, Changbin
On Fri, Feb 09, 2018 at 09:44:58PM -0500, Steven Rostedt wrote:
> On Sat, 10 Feb 2018 09:37:04 +0800
> changbin...@intel.com wrote:
> 
> > From: Changbin Du 
> > 
> > The type of state is signed int, convert it to unsigned int looks weird.
> > (-1 become 4294967295)
> >932.123 power:cpu_idle:state=1 cpu_id=0)
> >932.125 power:cpu_idle:state=4294967295 cpu_id=0)
> >932.132 power:cpu_idle:state=1 cpu_id=0)
> >932.133 power:cpu_idle:state=4294967295 cpu_id=0)
> > 
> > Similarly for cpu_frequency as "state=%lu cpu_id=%lu". User need to read
> > the code to understand what 'state' means.
> > 
> > No functional change in this patch.
> 
> That's not true. You split a class into two TRACE_EVENTS. Each
> TRACE_EVENT adds approximately 5k of code and data. A DEFINE_EVENT()
> adds around 300 bytes. There's better ways to do this,
> 
> Please don't add this patch.
> 
> -- Steve

Steve, how about DEFINE_EVENT_PRINT as below?

diff --git a/include/trace/events/power.h b/include/trace/events/power.h
index 908977d..e71ce98 100644
--- a/include/trace/events/power.h
+++ b/include/trace/events/power.h
@@ -14,12 +14,12 @@

 DECLARE_EVENT_CLASS(cpu,

-   TP_PROTO(unsigned int state, unsigned int cpu_id),
+   TP_PROTO(int state, unsigned int cpu_id),

TP_ARGS(state, cpu_id),

TP_STRUCT__entry(
-   __field(u32,state   )
+   __field(s32,state   )
__field(u32,cpu_id  )
),

@@ -28,13 +28,12 @@ DECLARE_EVENT_CLASS(cpu,
__entry->cpu_id = cpu_id;
),

-   TP_printk("state=%lu cpu_id=%lu", (unsigned long)__entry->state,
- (unsigned long)__entry->cpu_id)
+   TP_printk("state=%d cpu_id=%u", __entry->state, __entry->cpu_id)
 );

 DEFINE_EVENT(cpu, cpu_idle,

-   TP_PROTO(unsigned int state, unsigned int cpu_id),
+   TP_PROTO(int state, unsigned int cpu_id),

TP_ARGS(state, cpu_id)
 );
@@ -141,11 +140,13 @@ TRACE_EVENT(pstate_sample,
{ PM_EVENT_RESTORE, "restore" }, \
{ PM_EVENT_RECOVER, "recover" })

-DEFINE_EVENT(cpu, cpu_frequency,
+DEFINE_EVENT_PRINT(cpu, cpu_frequency,

-   TP_PROTO(unsigned int frequency, unsigned int cpu_id),
+   TP_PROTO(int state, unsigned int cpu_id),

-   TP_ARGS(frequency, cpu_id)
+   TP_ARGS(state, cpu_id),
+
+   TP_printk("frequency=%u cpu_id=%lu", __entry->state, __entry->cpu_id)
 );



Re: A problem about 'perf sched latency'

2018-02-07 Thread Du, Changbin
On Thu, Feb 08, 2018 at 09:57:24AM +0900, Namhyung Kim wrote:
> Hello,
> 
[snip]
> > Does anyone know why? Thank you! :)
> 
> It seems your data doesn't have wakeup event which is required by the
> 'perf sched latency'.
> 
> Thanks,
> Namhyung
> 
Hi Kim,
Thanks for your reply. I thought 'Switches' was the total number of sched-in
events.
Now I know it only counts wakeup sched-in events. It is not documented, so it
confused me.

> 
> > 
> > -- 
> > Thanks,
> > Changbin Du
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-perf-users" 
> > in
> > the body of a message to majord...@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html

-- 
Thanks,
Changbin Du


A problem about 'perf sched latency'

2018-02-06 Thread Du, Changbin
Hello all,
I am using the perf sched tool to analyze sched data on my machine. But it seems
that 'perf sched latency' reports an incorrect 'Switches' count. What I did is
as below.

First, record...
$ sudo perf sched record time pi 200 > /dev/null

Then check the statistics. It says 'pi' was sched-in only 1 time.
$ sudo ./perf sched latency | grep pi
  compiz:4054   |  6.389 ms |   30 | avg:0.023 ms | max:
0.122 ms | max at:  94546.021080 s
  pi:4059   |   2083.241 ms |1 | avg:0.007 ms | max:
0.007 ms | max at:  94545.201435 s

But the sched events show 'pi' has been sched-in 7 times.
$ sudo ./perf sched script | grep pi:
  pi  4059 [001] 94545.820858:   sched:sched_switch: pi:4059 
[120] S ==> kworker/1:1:32237 [120]
 kworker/1:1 32237 [001] 94545.820868:   sched:sched_switch: 
kworker/1:1:32237 [120] R ==> pi:4059 [120]
  pi  4059 [001] 94545.885412:   sched:sched_switch: pi:4059 
[120] S ==> kworker/1:1H:320 [100]
kworker/1:1H   320 [001] 94545.885419:   sched:sched_switch: 
kworker/1:1H:320 [100] R ==> pi:4059 [120]
  pi  4059 [001] 94545.907869:   sched:sched_switch: pi:4059 
[120] S ==> kworker/1:1H:320 [100]
kworker/1:1H   320 [001] 94545.907875:   sched:sched_switch: 
kworker/1:1H:320 [100] R ==> pi:4059 [120]
  pi  4059 [001] 94545.908104:   sched:sched_switch: pi:4059 
[120] S ==> kworker/1:1H:320 [100]
kworker/1:1H   320 [001] 94545.908108:   sched:sched_switch: 
kworker/1:1H:320 [100] R ==> pi:4059 [120]
  pi  4059 [001] 94545.916135:   sched:sched_switch: pi:4059 
[120] S ==> kworker/1:1H:320 [100]
kworker/1:1H   320 [001] 94545.916154:   sched:sched_switch: 
kworker/1:1H:320 [100] R ==> pi:4059 [120]
  pi  4059 [001] 94546.812856:   sched:sched_switch: pi:4059 
[120] S ==> kworker/1:1:32237 [120]
 kworker/1:1 32237 [001] 94546.813148:   sched:sched_switch: 
kworker/1:1:32237 [120] R ==> pi:4059 [120]
  pi  4059 [001] 94546.885227:   sched:sched_switch: pi:4059 
[120] S ==> i915/signal:1:207 [98]
   i915/signal:1   207 [001] 94546.885232:   sched:sched_switch: 
i915/signal:1:207 [98] D ==> pi:4059 [120]
  pi  4059 [001] 94547.285049:   sched:sched_switch: pi:4059 
[120] x ==> swapper/1:0 [120]

Does anyone know why? Thank you! :)

-- 
Thanks,
Changbin Du


Re: [PATCH v3 0/3] tracing: Fix the parser when processing strings w/ or w/o terminated '\0'

2018-01-16 Thread Du, Changbin
On Wed, Jan 17, 2018 at 02:45:24PM +0900, Namhyung Kim wrote:
> Hello,
> 
> On Wed, Jan 17, 2018 at 12:54:34PM +0800, Du, Changbin wrote:
> > On Tue, Jan 16, 2018 at 12:42:26PM -0500, Steven Rostedt wrote:
> > > On Tue, 16 Jan 2018 17:02:27 +0800
> > > changbin...@intel.com wrote:
> > > 
> > > > From: Changbin Du 
> > > > 
> > > > I found there are some problems in the tracing parser when I investiage 
> > > > the root
> > > > cause of issues mentioned in below patch.
> > > > https://patchwork.kernel.org/patch/10132953/
> > > 
> > > I pulled in your patches and tweaked the change logs of the other two
> > > patches as well. You can see my temporary git tree here, but it may
> > > rebase.
> > > 
> > > git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-trace.git
> > > 
> > >  ftrace/core
> > > 
> > > -- Steve
> > Got it. Thank you!
> > 
> > Hi Olsa, so the perf patch 'perf ftrace: Fix the buffer size 
> > in__write_tracing_file'
> > is still needed. I will resend you at appropriate time.
> 
> But it will work on the future kernels only, right?  For tools to be
> compatible with old kernels, you'd better writing a whitespace after
> the function name IMHO.
> 
Yes, it needs to write a space if it doesn't want possible errors to be hidden.

> Thanks,
> Namhyung

-- 
Thanks,
Changbin Du


Re: [PATCH v3 0/3] tracing: Fix the parser when processing strings w/ or w/o terminated '\0'

2018-01-16 Thread Du, Changbin
On Tue, Jan 16, 2018 at 12:42:26PM -0500, Steven Rostedt wrote:
> On Tue, 16 Jan 2018 17:02:27 +0800
> changbin...@intel.com wrote:
> 
> > From: Changbin Du 
> > 
> > I found there are some problems in the tracing parser when I investiage the 
> > root
> > cause of issues mentioned in below patch.
> > https://patchwork.kernel.org/patch/10132953/
> 
> I pulled in your patches and tweaked the change logs of the other two
> patches as well. You can see my temporary git tree here, but it may
> rebase.
> 
> git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-trace.git
> 
>  ftrace/core
> 
> -- Steve
Got it. Thank you!

Hi Olsa, so the perf patch 'perf ftrace: Fix the buffer size in
__write_tracing_file' is still needed. I will resend it to you at an
appropriate time.

-- 
Thanks,
Changbin Du


Re: [PATCH v2 1/3] tracing: detect the string termination character when parsing user input string

2018-01-15 Thread Du, Changbin
Hi Rostedt,
Thanks for your polish, let me update commit msg with your words.

On Mon, Jan 15, 2018 at 06:20:00PM -0500, Steven Rostedt wrote:
> On Mon, 15 Jan 2018 19:41:12 +0800
> changbin...@intel.com wrote:
> 
> > From: Changbin Du 
> > 
> > The usersapce can give a '\0' terminated C string in the input buffer.
> > Before this change, trace_get_user() will return a parsed string "\0" in
> > below case which is not expected (expects it skip all inputs) and cause the
> > caller failed.
> 
> The above totally does not parse (no pun intended).
> 
> Are you trying to say:
> 
> "User space can pass in a C nul character '\0' along with its input.
> The function trace_get_user() will try to process it as a normal
> character, and that will fail to parse.
> 
> > 
> > open("/sys/kernel/debug/tracing//set_ftrace_pid", O_WRONLY|O_TRUNC) = 3
> > write(3, " \0", 2)  = -1 EINVAL (Invalid argument)
> > 
> > while parse can handle spaces, so below works.
> > 
> > $ echo "" > set_ftrace_pid
> > $ echo " " > set_ftrace_pid
> > $ echo -n " " > set_ftrace_pid
> > 
> > This patch try to make the parser '\0' aware to fix such issue. When parser
> > sees a '\0' it stops further parsing. With this change, write(3, " \0", 2)
> > will work.
> 
> The above should be something like:
> 
> "Have the parser stop on '\0' and cease any further parsing. Only
> process the characters up to the nul '\0' character and do not process
> it."
> 
> -- Steve
> 
> 
> > 
> > Signed-off-by: Changbin Du 
> > 
> > ---
> >   v2: Stop parsing when '\0' found.
> > ---
> >  kernel/trace/trace.c | 6 +++---
> >  1 file changed, 3 insertions(+), 3 deletions(-)
> > 
> > diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
> > index 2a8d8a2..144d08e 100644
> > --- a/kernel/trace/trace.c
> > +++ b/kernel/trace/trace.c
> > @@ -1237,7 +1237,7 @@ int trace_get_user(struct trace_parser *parser, const 
> > char __user *ubuf,
> > }
> >  
> > /* only spaces were written */
> > -   if (isspace(ch)) {
> > +   if (isspace(ch) || !ch) {
> > *ppos += read;
> > ret = read;
> > goto out;
> > @@ -1247,7 +1247,7 @@ int trace_get_user(struct trace_parser *parser, const 
> > char __user *ubuf,
> > }
> >  
> > /* read the non-space input */
> > -   while (cnt && !isspace(ch)) {
> > +   while (cnt && !isspace(ch) && ch) {
> > if (parser->idx < parser->size - 1)
> > parser->buffer[parser->idx++] = ch;
> > else {
> > @@ -1262,7 +1262,7 @@ int trace_get_user(struct trace_parser *parser, const 
> > char __user *ubuf,
> > }
> >  
> > /* We either got finished input or we have to wait for another call. */
> > -   if (isspace(ch)) {
> > +   if (isspace(ch) || !ch) {
> > parser->buffer[parser->idx] = 0;
> > parser->cont = false;
> > } else if (parser->idx < parser->size - 1) {
> 

-- 
Thanks,
Changbin Du


Re: [PATCH 2/3] tracing: make sure the parsed string always terminates with '\0'

2018-01-15 Thread Du, Changbin
On Tue, Jan 09, 2018 at 11:10:22PM -0500, Steven Rostedt wrote:
> On Wed, 10 Jan 2018 11:02:06 +0800
> "Du, Changbin"  wrote:
> 
> > On Tue, Jan 09, 2018 at 06:02:58PM -0500, Steven Rostedt wrote:
> > > On Tue,  9 Jan 2018 17:55:47 +0800
> > > changbin...@intel.com wrote:
> > >   
> > > > From: Changbin Du 
> > > > 
> > > > The parser parse every string into parser.buffer. And some of the 
> > > > callers
> > > > assume that parser.buffer contains a C string. So it is dangerous that 
> > > > the
> > > > parser returns a unterminated string. The userspace can leverage this to
> > > > attack the kernel.  
> > > 
> > > Is this only a bug if we apply your first patch?
> > >  
> > I don't think so. Seems it is there already.
> >  
> 
> OK. I'll have to take a deeper look into this so that I completely
> understand the problem and your solution. I'm currently traveling and
> may not get to do that this week. Please ping me next week if you don't
> hear back from me on this issue.
> 
> Thanks!
> 
> -- Steve

I checked every trace_get_user() client again and found it is not an issue in
the current kernel. The clients either check trace_parser_cont() before using
the parsed string or append a '\0'.

I still want to make the parser return a '\0'-terminated string. Then we don't
require the clients to append it. I think this would be better since we are
dealing with strings.

-- 
Thanks,
Changbin Du


Re: [PATCH 3/3] tracing: don't set parser->cont if it has reached the end of input buffer

2018-01-13 Thread Du, Changbin
On Fri, Jan 12, 2018 at 10:31:08AM -0500, Steven Rostedt wrote:
[...]
> > Thanks, so now I unstand why below corner case. The userspace try to set the
> > filter with a unrecognized symbole name (e.g "abcdefg").
> > open("/sys/kernel/debug/tracing/set_ftrace_filter", O_WRONLY|O_TRUNC) = 3
> > write(3, "abcdefg", 7)
> > 
> > Since "abcdefg" is not in the symbole list, so we would expect the write 
> > return
> > -EINVAL, right? As below:
> > # echo abcdefg > set_ftrace_filter
> > bash: echo: write error: Invalid argument
> 
> The write itself doesn't finish the operation. There may be another
> write. In other words:
> 
>   write(3, "do_", 3);
>   write(3, "IRQ\n", 4);
> 
> Should both return success, even though it only enabled do_IRQ.
> 
> > 
> > But the above mechanism hide the error. It return success actually no 
> > filter is
> > apllied at all.
> > # echo -n abcdefg > set_ftrace_filter
> > 
> > I think in this case kernel may request the userspace append a '\0' or 
> > space to the
> > string buffer so everything can work.
> > 
> > Also there is another corner case. Below write dosn't work.
> > open("/sys/kernel/debug/tracing//set_ftrace_pid", O_WRONLY|O_TRUNC) = 3
> > write(3, " \0", 2)  = -1 EINVAL (Invalid argument)
> > 
> > While these works:
> > # echo "" > set_ftrace_pid
> > # echo " " > set_ftrace_pid
> > # echo -n " " > set_ftrace_pid
> > 
> > These is the reason why I think '\0' should be recognized by the parser.
> 
> Hmm, thinking about this more, I do partially agree with you. We should
> accept '\0' but I disagree that it should be treated as a space. I
> don't want hidden code.
> 
> It should be treated as a terminator. And carefully as well.
> 
>   write(3, "do_IRQ", 7);
> 
> Which will send to the kernel 'd' 'o' '_' 'I' 'R' 'Q' '\0' when the
> kernel sees the '\0', and the write has not sent anything else, it
> should go ahead and execute 'do_IRQ'
> 
> This will allow for this to work:
> 
>   char *funcs[] = { "do_IRQ", "schedule", NULL };
> 
>   for (i = 0; funcs[i]; i++) {
>   ret = write(3, funcs[i], strlen(funcs[i]) + 1);
>   if (ret < 0)
>   exit(-1);
>   }
> 
> 
> Now if someone were to write:
> 
>   write(3, "do_IRQ\0schedule", 16);
> 
> That should return an error.
> 
> Why?
> 
> Because these are strings, and most tools treat '\0' as a nul
> terminator to a string. If we allow for tools to send data after that
> nul terminator, we are opening up a way for those interacting with
> these tools to sneak in strings that are not visible.
> 
> Say we have some admin tools that is doing tracing, and takes input.
> And all the input is logged. And say the tool does something like:
> 
> 
>   r = read(0, buf, sizeof(buf));
>   if (r < 0 || r > sizeof(buf) - 1)
>   return -1;
>   log("Adding to output %s\n", buf);
>   write(3, buf, r);
> 
> The "Adding to output" would only show up to the '\0', but if we allow
> that write to process after the '\0' then we just allowed the user to
> circumvent the log.
> 
> -- Steve
I agree with your concern. So I will revise this series and drop the last patch.

-- 
Thanks,
Changbin Du


Re: [PATCH 3/3] tracing: don't set parser->cont if it has reached the end of input buffer

2018-01-11 Thread Du, Changbin

Hi Rostedt,
On Tue, Jan 09, 2018 at 11:19:36PM -0500, Steven Rostedt wrote:
> On Wed, 10 Jan 2018 11:18:23 +0800
> "Du, Changbin"  wrote:
> 
> > write(3, "abcdefg", 7)  
> > > 
> > > From my point of view, the above isn't done writing the function name
> > > yet and we SHOULD continue waiting for more input.
> > >   
> > hmm, thanks for the background. Your above case is a postive use case. So by
> > this design, instead of write(3, "abcdefg", 7), it should be
> > write(3, "abcdefg\0", 8), right?
> 
> BTW, gcc would translate the above string to 'abcdefg\0\0'. When
> defining strings with "", gcc (and all C compilers) append a '\0' to
> the end.
> 
I should clarify the notation here first. :) All the strings here are meant to
express the whole content of a string buffer, including the compiler-appended
'\0' (just like the output of 'strace').
If this description is still not clear, please let me know!

> But I replied to the first patch, saying that allowing \0 as whitespace
> may be OK, given the usecase I showed.
> 
> > 
> > If true, it means kernel expect userspace write every string terminated with
> > '\0'. So to fix this issue:
> > open("/sys/kernel/debug/tracing//set_ftrace_pid", O_WRONLY|O_TRUNC) = 3
> > write(3, " \0", 2)  = -1 EINVAL (Invalid argument)
> > 
> > Fix would be:
> > write(3, "\0", 1)?
> > 
> > So far, I am still confused. Some of the tracing debugfs entry accept '\0'
> > while some not. AFIK, 'echo xxx > ' always has a '\0'
> > terminated.
> 
> I don't believe that's true.
> 
>  $ echo -n abc > /tmp/abc
>  $ wc /tmp/abc
>  0 1 3 /tmp/abc
> 
> Echo writes only the characters you put on the line, nothing more.
> 
Sorry, I misunderstood it. The extra character is '\n'.
  $ echo abc > set_ftrace_filter
0.000 probe:ftrace_filter_write_line0:(a7b8db80) ubuf=0xc77408 
cnt=0x4)
  $ echo -n abc > set_ftrace_filter
8889.832 probe:ftrace_filter_write_line0:(a7b8db80) ubuf=0xc77408 
cnt=0x3)

> Note, when the file descriptor is closed, the code also executes on
> what was written but not terminated. That is:
> 
>   write(fd, "abc", 3);
>   close(fd);
> 
> Will keep the "abc" in the continue buffer, but the closing of the file
> descriptor will flush it, and execute it.
> 
Thanks, so now I understand the corner case below. The userspace tries to set the
filter with an unrecognized symbol name (e.g. "abcdefg").
open("/sys/kernel/debug/tracing/set_ftrace_filter", O_WRONLY|O_TRUNC) = 3
write(3, "abcdefg", 7)

Since "abcdefg" is not in the symbol list, we would expect the write to return
-EINVAL, right? As below:
# echo abcdefg > set_ftrace_filter
bash: echo: write error: Invalid argument

But the above mechanism hides the error. It returns success although no filter is
applied at all.
# echo -n abcdefg > set_ftrace_filter

I think in this case the kernel may require userspace to append a '\0' or a space
to the string buffer so that everything can work.

Also there is another corner case. The write below doesn't work.
open("/sys/kernel/debug/tracing//set_ftrace_pid", O_WRONLY|O_TRUNC) = 3
write(3, " \0", 2)  = -1 EINVAL (Invalid argument)

While these work:
# echo "" > set_ftrace_pid
# echo " " > set_ftrace_pid
# echo -n " " > set_ftrace_pid

This is the reason why I think '\0' should be recognized by the parser.

> -- Steve

-- 
Thanks,
Changbin Du


Re: [PATCH 3/3] tracing: don't set parser->cont if it has reached the end of input buffer

2018-01-09 Thread Du, Changbin
On Tue, Jan 09, 2018 at 06:12:41PM -0500, Steven Rostedt wrote:
> On Tue,  9 Jan 2018 17:55:48 +0800
> changbin...@intel.com wrote:
> 
> > From: Changbin Du 
> > 
> > We should not set parser->cont if it has reached the end of input buffer.
> > And since some callers (like ftrace_graph_write()) treat it as an error
> > condition if trace_parser_cont() returns true.
> 
> This will break existing use cases. In fact you are removing the entire
> point of this code. It NEEDS to continue if it reached the end of the
> input buffer.
> 
> I do things like:
> 
>  # cat file > set_ftrace_filter
> 
> where the file has a list of function names. It writes in blocks, and
> it could very well have a function name split between two writes where
> the write is at the end of the buffer but not finished writing the
> function name.
>
> > 
> > For example, if userspace set 'set_ftrace_filter' by writing:
> > write(3, "abcdefg", 7)
> 
> From my point of view, the above isn't done writing the function name
> yet and we SHOULD continue waiting for more input.
> 
Hmm, thanks for the background. Your case above is a positive use case. So by
this design, instead of write(3, "abcdefg", 7), it should be
write(3, "abcdefg\0", 8), right?

If true, it means the kernel expects userspace to write every string terminated
with '\0'. So to fix this issue:
open("/sys/kernel/debug/tracing//set_ftrace_pid", O_WRONLY|O_TRUNC) = 3
write(3, " \0", 2)  = -1 EINVAL (Invalid argument)

Fix would be:
write(3, "\0", 1)?

So far, I am still confused. Some of the tracing debugfs entries accept '\0'
while some do not. AFAIK, 'echo xxx > <file>' always writes a '\0'-terminated
string.

> BIG NACK on this patch. Sorry.
> 
> I'm guessing you have some program that writes only the strlen() of
> these strings. That's wrong, you need to write "strlen()+1". Write some
> real white space between calls, it will work. Add a "write(fd, " ", 1)"
> between calls if you need to. Please don't change the kernel to fix
> some bad use case. Especially when your fix will break existing use
> cases.
> 
> -- Steve
> 
> > 
> > Then in the kernel function ftrace_regex_write(), ftrace_process_regex()
> > will not be executed. The result is that the given filter will not be
> > applied at all.
> > 
> > ftrace_regex_write() {
> > ...
> > read = trace_get_user(parser, ubuf, cnt, ppos);
> > if (read >= 0 && trace_parser_loaded(parser) &&
> > !trace_parser_cont(parser)) {
> > ret = ftrace_process_regex(iter, parser->buffer,
> >parser->idx, enable);
> > ...
> > }
> > ...
> > }
> > 
> > Signed-off-by: Changbin Du 

-- 
Thanks,
Changbin Du


Re: [PATCH 2/3] tracing: make sure the parsed string always terminates with '\0'

2018-01-09 Thread Du, Changbin
On Tue, Jan 09, 2018 at 06:02:58PM -0500, Steven Rostedt wrote:
> On Tue,  9 Jan 2018 17:55:47 +0800
> changbin...@intel.com wrote:
> 
> > From: Changbin Du 
> > 
> > The parser parse every string into parser.buffer. And some of the callers
> > assume that parser.buffer contains a C string. So it is dangerous that the
> > parser returns a unterminated string. The userspace can leverage this to
> > attack the kernel.
> 
> Is this only a bug if we apply your first patch?
>
I don't think so. Seems it is there already.
 
> -- Steve
> 

-- 
Thanks,
Changbin Du


Re: [PATCH 1/3] tracing: detect the string termination character when parsing user input string

2018-01-09 Thread Du, Changbin
hi Rostedt,

On Tue, Jan 09, 2018 at 05:54:34PM -0500, Steven Rostedt wrote:
> On Tue,  9 Jan 2018 17:55:46 +0800
> changbin...@intel.com wrote:
> 
> > From: Changbin Du 
> > 
> > The usersapce can give a '\0' terminated C string or even has '\0' at the
> > middle of input buffer. We need handle both these two cases correctly.
> 
> What do you define as correctly. Because I'm not seeing it.
>
Sorry, I don't fully understand your question. What I meant is that I want to make
clear how the tracing parser will handle the strings below:
  "", "  ",  "\0", " \0 ", "aa\0bb"
The parser may only recognize certain formats, but whatever the behaviour is, it
should be clear and coherent for all tracing interfaces.

> > 
> > Before this change, trace_get_user() will return a parsed string "\0" in
> > below case. It is not expected (expects it skip all inputs) and cause the
> > caller failed.
> > 
> > open("/sys/kernel/debug/tracing//set_ftrace_pid", O_WRONLY|O_TRUNC) = 3
> > write(3, " \0", 2)  = -1 EINVAL (Invalid argument)
> 
> That looks more like a feature and not a bug.
> 
I point this out because I think the parser should take this as an empty string
per the comments of trace_get_user().
/*
 * trace_get_user - reads the user input string separated by  space
 * (matched by isspace(ch))
 *
 * For each string found the 'struct trace_parser' is updated,
 * and the function returns.
 *
 * Returns number of bytes read.
 *
 * See kernel/trace/trace.h for 'struct trace_parser' details.
 */

> > 
> > This patch try to make the parser '\0' aware to fix such issue.
> 
> Why?
> 

> > 
> > Since the caller expects trace_get_user() to parse whole input buffer, so
> > this patch treat '\0' as a separator as whitespace.
> 
> It looks more like we are trying to fix a userspace bug via the kernel.
> 

> I'm not liking this. So NACK.
> 
> -- Steve
> 
> > 
> > Signed-off-by: Changbin Du 
> > ---
> >  kernel/trace/trace.c | 17 +++--
> >  1 file changed, 11 insertions(+), 6 deletions(-)
> > 
> > diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
> > index 2a8d8a2..18526a1 100644
> > --- a/kernel/trace/trace.c
> > +++ b/kernel/trace/trace.c
> > @@ -1194,9 +1194,14 @@ void trace_parser_put(struct trace_parser *parser)
> > parser->buffer = NULL;
> >  }
> >  
> > +static inline bool is_space_or_zero(char ch)
> > +{
> > +   return isspace(ch) || !ch;
> > +}
> > +
> >  /*
> > - * trace_get_user - reads the user input string separated by  space
> > - * (matched by isspace(ch))
> > + * trace_get_user - reads the user input string separated by space or '\0'
> > + * (matched by is_space_or_zero(ch))
> >   *
> >   * For each string found the 'struct trace_parser' is updated,
> >   * and the function returns.
> > @@ -1228,7 +1233,7 @@ int trace_get_user(struct trace_parser *parser, const 
> > char __user *ubuf,
> >  */
> > if (!parser->cont) {
> > /* skip white space */
> > -   while (cnt && isspace(ch)) {
> > +   while (cnt && is_space_or_zero(ch)) {
> > ret = get_user(ch, ubuf++);
> > if (ret)
> > goto out;
> > @@ -1237,7 +1242,7 @@ int trace_get_user(struct trace_parser *parser, const 
> > char __user *ubuf,
> > }
> >  
> > /* only spaces were written */
> > -   if (isspace(ch)) {
> > +   if (is_space_or_zero(ch)) {
> > *ppos += read;
> > ret = read;
> > goto out;
> > @@ -1247,7 +1252,7 @@ int trace_get_user(struct trace_parser *parser, const 
> > char __user *ubuf,
> > }
> >  
> > /* read the non-space input */
> > -   while (cnt && !isspace(ch)) {
> > +   while (cnt && !is_space_or_zero(ch)) {
> > if (parser->idx < parser->size - 1)
> > parser->buffer[parser->idx++] = ch;
> > else {
> > @@ -1262,7 +1267,7 @@ int trace_get_user(struct trace_parser *parser, const 
> > char __user *ubuf,
> > }
> >  
> > /* We either got finished input or we have to wait for another call. */
> > -   if (isspace(ch)) {
> > +   if (is_space_or_zero(ch)) {
> > parser->buffer[parser->idx] = 0;
> > parser->cont = false;
> > } else if (parser->idx < parser->size - 1) {
> 

-- 
Thanks,
Changbin Du


Re: [PATCH] perf ftrace: Fix the buffer size in __write_tracing_file

2018-01-08 Thread Du, Changbin
On Mon, Jan 08, 2018 at 03:34:57PM +0100, Jiri Olsa wrote:
> On Mon, Jan 08, 2018 at 11:05:12AM +0800, Du, Changbin wrote:
> > Hi Olsa,
> > What about this fix now? Thanks!
> > 
> > On Tue, Dec 26, 2017 at 05:26:56PM +0800, changbin...@intel.com wrote:
> > > From: Changbin Du 
> > > 
> > > The terminal character '\0' should take into account as size of the string
> > > buffer. Without this fix, the '--graph-funcs', '--nograph-funcs' and
> > > '--trace-funcs' options didn't work as expected when the <func> doesn't
> > > exist.
> > > 
> > > I didn't dive into kernel ftrace fops, but strace shows that if usersapce
> > > writes a non-terminated string, the kernel side will return success but
> > > no filter applied. After this fix in userspace, the kernel will return an
> > > error.
> > > 
> > > $ sudo ./perf ftrace -a --graph-depth 1 --graph-funcs abcdefg
> > >  0)   0.140 us|  rcu_all_qs();
> > >  3)   0.304 us|  mutex_unlock();
> > >  0)   0.153 us|  find_vma();
> > >  3)   0.088 us|  __fsnotify_parent();
> > >  0)   6.145 us|  handle_mm_fault();
> > >  3)   0.089 us|  fsnotify();
> > >  3)   0.161 us|  __sb_end_write();
> > >  3)   0.710 us|  SyS_close();
> > >  3)   7.848 us|  exit_to_usermode_loop();
> > > 
> > > On above example, I specified function filter 'abcdefg' but all functions
> > > are enabled.
> 
> hum, haven't checked, but looks like the filter is not working at all now:
> 
> [root@krava perf]# ./perf ftrace -vv -a --graph-depth 1 --graph-funcs 
> proc_sys_read
> write ' ' to tracing/set_ftrace_pid failed: Invalid argument
> [root@krava perf]# ./perf ftrace -vv -a --graph-depth 1 --graph-funcs SyS_read
> write ' ' to tracing/set_ftrace_pid failed: Invalid argument
> [root@krava perf]# ./perf ftrace -vv -a --graph-depth 1 --graph-funcs fsnotify
> write ' ' to tracing/set_ftrace_pid failed: Invalid argument
>
Thanks for your test. I forgot to test the normal case and thought the error was
expected...

This time I dived into the kernel side, and found 3 issues (if I am right) in the
kernel function trace_get_user(). This function has problems processing both
complete C strings and unterminated ones.

I will send the kernel patches and Cc you guys. And I still think it is better
to let perf write a complete C string.

Thanks!
Changbin Du

> jirka
>
[...] 


Re: [PATCH] perf ftrace: Fix the buffer size in __write_tracing_file

2018-01-07 Thread Du, Changbin
Hi Olsa,
What about this fix now? Thanks!

On Tue, Dec 26, 2017 at 05:26:56PM +0800, changbin...@intel.com wrote:
> From: Changbin Du 
> 
> The terminal character '\0' should take into account as size of the string
> buffer. Without this fix, the '--graph-funcs', '--nograph-funcs' and
> '--trace-funcs' options didn't work as expected when the <func> doesn't
> exist.
> 
> I didn't dive into kernel ftrace fops, but strace shows that if usersapce
> writes a non-terminated string, the kernel side will return success but
> no filter applied. After this fix in userspace, the kernel will return an
> error.
> 
> $ sudo ./perf ftrace -a --graph-depth 1 --graph-funcs abcdefg
>  0)   0.140 us|  rcu_all_qs();
>  3)   0.304 us|  mutex_unlock();
>  0)   0.153 us|  find_vma();
>  3)   0.088 us|  __fsnotify_parent();
>  0)   6.145 us|  handle_mm_fault();
>  3)   0.089 us|  fsnotify();
>  3)   0.161 us|  __sb_end_write();
>  3)   0.710 us|  SyS_close();
>  3)   7.848 us|  exit_to_usermode_loop();
> 
> On above example, I specified function filter 'abcdefg' but all functions
> are enabled.
> 
> Signed-off-by: Changbin Du 
> ---
>  tools/perf/builtin-ftrace.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/tools/perf/builtin-ftrace.c b/tools/perf/builtin-ftrace.c
> index 25a42ac..2604a64 100644
> --- a/tools/perf/builtin-ftrace.c
> +++ b/tools/perf/builtin-ftrace.c
> @@ -69,7 +69,7 @@ static int __write_tracing_file(const char *name, const 
> char *val, bool append)
>  {
>   char *file;
>   int fd, ret = -1;
> - ssize_t size = strlen(val);
> + ssize_t size = strlen(val) + 1;
>   int flags = O_WRONLY;
>   char errbuf[512];
>  
> -- 
> 2.7.4
> 

-- 
Thanks,
Changbin Du


Re: [PATCH] ACPI / sysfs: fix shift-overflow in GPE flooding quirk mechanism

2018-01-02 Thread Du, Changbin
On Tue, Jan 02, 2018 at 11:18:31AM +0100, Rafael J. Wysocki wrote:
> On Tue, Jan 2, 2018 at 7:36 AM, Du, Changbin  wrote:
> > Hi Wysocki and Brown,
> > May I know wether you have checked this? Thanks!
> 
> There's a commit changing this queued up already, see
> https://patchwork.kernel.org/patch/10085579/
> 
Got it, Thanks for your reply.

> Thanks!

-- 
Thanks,
Changbin Du


Re: [PATCH] ACPI / sysfs: fix shift-overflow in GPE flooding quirk mechanism

2018-01-01 Thread Du, Changbin
Hi Wysocki and Brown,
May I know whether you have checked this? Thanks!

On Fri, Dec 22, 2017 at 11:11:10PM +0800, changbin...@intel.com wrote:
> From: Changbin Du 
> 
> The ACPI_MASKABLE_GPE_MAX is larger than the number of bits that u64 can
> represent. This result in shift-overflow. So actually we need a bitmap.
> 
> [1.003153] 
> ==
> [1.003257] UBSAN: Undefined behaviour in drivers/acpi/sysfs.c:849:33
> [1.003314] shift exponent 64 is too large for 64-bit type 'long long 
> unsigned int'
> [1.003381] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.15.0-rc4+ #40
> [1.003436] Hardware name: LENOVO 20HAS02515/20HAS02515, BIOS N1VET36W 
> (1.26 ) 10/03/2017
> [1.003504] Call Trace:
> [1.003542]  dump_stack+0xe3/0x177
> [1.003582]  ? _atomic_dec_and_lock+0x219/0x219
> [1.003653]  ubsan_epilogue+0xd/0x4e
> [1.003695]  __ubsan_handle_shift_out_of_bounds+0x1f8/0x23d
> [1.003754]  ? __ubsan_handle_load_invalid_value+0x13b/0x13b
> [1.003817]  ? trace_hardirqs_on_caller+0x1f3/0x370
> [1.003868]  ? trace_hardirqs_on+0xd/0x10
> [1.003917]  ? up+0xe9/0x160
> [1.003957]  ? sugov_should_update_freq+0xa1/0x1f0
> [1.004000]  ? trace_hardirqs_on+0xd/0x10
> [1.004000]  acpi_gpe_apply_masked_gpes+0xa4/0x125
> [1.004000]  ? acpi_gpe_apply_masked_gpes+0xa4/0x125
> [1.004000]  ? acpi_gpe_set_masked_gpes+0xe3/0xe3
> [1.004000]  ? acpi_get_table+0x111/0x127
> [1.004000]  acpi_scan_init+0x299/0x598
> [1.004000]  ? acpi_match_madt+0xae/0xae
> [1.004000]  ? sysfs_add_file_mode_ns+0x160/0x320
> [1.004000]  ? kobject_put+0x23/0x220
> [1.004000]  ? bus_create_file+0x75/0x90
> [1.004000]  ? bus_register+0x44a/0x540
> [1.004000]  ? subsys_register.part.1+0x140/0x140
> [1.004000]  acpi_init+0x532/0x5d8
> [1.004000]  ? acpi_sleep_proc_init+0x36/0x36
> [1.004000]  ? console_trylock+0x60/0x60
> [1.004000]  ? sysfs_add_file_mode_ns+0x160/0x320
> [1.004000]  ? sysfs_create_file_ns+0x56/0x80
> [1.004000]  ? video_setup+0x13c/0x13c
> [1.004000]  ? fb_console_init+0x16c/0x1fc
> [1.004000]  ? acpi_sleep_proc_init+0x36/0x36
> [1.004000]  do_one_initcall+0xae/0x282
> [1.004000]  ? initcall_blacklisted+0x1c0/0x1c0
> [1.004000]  ? up_write+0x92/0x100
> [1.004000]  ? down_write_nested+0x110/0x110
> [1.004000]  ? kasan_unpoison_shadow+0x35/0x50
> [1.004000]  kernel_init_freeable+0x4af/0x573
> [1.004000]  ? start_kernel+0x6b1/0x6b1
> [1.004000]  ? rest_init+0x100/0x100
> [1.004000]  kernel_init+0x13/0x13d
> [1.004000]  ? rest_init+0x100/0x100
> [1.004000]  ? rest_init+0x100/0x100
> [1.004000]  ret_from_fork+0x24/0x30
> [1.004000] 
> ==
> 
> Fixes: 9c4aa1eecb48 ("ACPI / sysfs: Provide quirk mechanism to prevent GPE 
> flooding")
> Signed-off-by: Changbin Du 
> ---
>  drivers/acpi/sysfs.c | 7 ---
>  1 file changed, 4 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/acpi/sysfs.c b/drivers/acpi/sysfs.c
> index 06a150b..60ade0b 100644
> --- a/drivers/acpi/sysfs.c
> +++ b/drivers/acpi/sysfs.c
> @@ -823,7 +823,7 @@ static ssize_t counter_set(struct kobject *kobj,
>   */
>  #define ACPI_MASKABLE_GPE_MAX0x80
>  
> -static u64 __initdata acpi_masked_gpes;
> +static __initdata DECLARE_BITMAP(acpi_masked_gpes, ACPI_MASKABLE_GPE_MAX);
>  
>  static int __init acpi_gpe_set_masked_gpes(char *val)
>  {
> @@ -831,7 +831,8 @@ static int __init acpi_gpe_set_masked_gpes(char *val)
>  
>   if (kstrtou8(val, 0, &gpe) || gpe > ACPI_MASKABLE_GPE_MAX)
>   return -EINVAL;
> - acpi_masked_gpes |= ((u64)1<<gpe);
> +
> + set_bit(gpe, acpi_masked_gpes);
>  
>   return 1;
>  }
> @@ -846,7 +847,7 @@ void __init acpi_gpe_apply_masked_gpes(void)
>   for (gpe = 0;
>gpe < min_t(u8, ACPI_MASKABLE_GPE_MAX, acpi_current_gpe_count);
>gpe++) {
> - if (acpi_masked_gpes & ((u64)1<<gpe)) {
> + if (test_bit(gpe, acpi_masked_gpes)) {
>   status = acpi_get_gpe_device(gpe, &handle);
>   if (ACPI_SUCCESS(status)) {
>   pr_info("Masking GPE 0x%x.\n", gpe);
> -- 
> 2.7.4
> 

-- 
Thanks,
Changbin Du


Re: [PATCH v4] tracing: Allocate mask_str buffer dynamically

2017-12-13 Thread Du, Changbin
On Wed, Dec 13, 2017 at 03:17:24PM -0500, Steven Rostedt wrote:
> On Tue, 12 Dec 2017 19:15:53 +0800
> "Du, Changbin"  wrote:
> 
> > Hi Rostedt, How about this version?
> > 
> 
> I've pulled it, but due to traveling, I haven't been able to push it to
> Linus yet. I'm hoping to do that soon.
> 
> -- Steve

Thank you. And have a nice trip!

-- 
Thanks,
Changbin Du


Re: [PATCH v4] tracing: Allocate mask_str buffer dynamically

2017-12-12 Thread Du, Changbin

Hi Rostedt, How about this version?

On Thu, Nov 30, 2017 at 11:39:43AM +0800, changbin...@intel.com wrote:
> From: Changbin Du 
> 
> The default NR_CPUS can be very large, but actual possible nr_cpu_ids
> usually is very small. For my x86 distribution, the NR_CPUS is 8192 and
> nr_cpu_ids is 4. About 2 pages are wasted.
> 
> Most machines don't have so many CPUs, so define a array with NR_CPUS
> just wastes memory. So let's allocate the buffer dynamically when need.
> 
> With this change, the mutext tracing_cpumask_update_lock also can be
> removed now, which was used to protect mask_str.
> 
> Signed-off-by: Changbin Du 
> Cc: Steven Rostedt 
> 
> ---
> v4:
>   - calculate the buffer size using snprintf. (Rostedt)
> v3:
>   - remove tracing_cpumask_update_lock which was used to protect mask_str. 
> (Rostedt)
> v2:
>   - remove 'static' declaration.
>   - fix buffer size.
> ---
>  kernel/trace/trace.c | 29 +
>  1 file changed, 9 insertions(+), 20 deletions(-)
> 
> diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
> index 73e67b6..53d29c7 100644
> --- a/kernel/trace/trace.c
> +++ b/kernel/trace/trace.c
> @@ -4178,37 +4178,30 @@ static const struct file_operations show_traces_fops 
> = {
>   .llseek = seq_lseek,
>  };
>  
> -/*
> - * The tracer itself will not take this lock, but still we want
> - * to provide a consistent cpumask to user-space:
> - */
> -static DEFINE_MUTEX(tracing_cpumask_update_lock);
> -
> -/*
> - * Temporary storage for the character representation of the
> - * CPU bitmask (and one more byte for the newline):
> - */
> -static char mask_str[NR_CPUS + 1];
> -
>  static ssize_t
>  tracing_cpumask_read(struct file *filp, char __user *ubuf,
>size_t count, loff_t *ppos)
>  {
>   struct trace_array *tr = file_inode(filp)->i_private;
> + char *mask_str;
>   int len;
>  
> - mutex_lock(&tracing_cpumask_update_lock);
> + len = snprintf(NULL, 0, "%*pb\n",
> +cpumask_pr_args(tr->tracing_cpumask)) + 1;
> + mask_str = kmalloc(len, GFP_KERNEL);
> + if (!mask_str)
> + return -ENOMEM;
>  
> - len = snprintf(mask_str, count, "%*pb\n",
> + len = snprintf(mask_str, len, "%*pb\n",
>  cpumask_pr_args(tr->tracing_cpumask));
>   if (len >= count) {
>   count = -EINVAL;
>   goto out_err;
>   }
> - count = simple_read_from_buffer(ubuf, count, ppos, mask_str, NR_CPUS+1);
> + count = simple_read_from_buffer(ubuf, count, ppos, mask_str, len);
>  
>  out_err:
> - mutex_unlock(&tracing_cpumask_update_lock);
> + kfree(mask_str);
>  
>   return count;
>  }
> @@ -4228,8 +4221,6 @@ tracing_cpumask_write(struct file *filp, const char 
> __user *ubuf,
>   if (err)
>   goto err_unlock;
>  
> - mutex_lock(&tracing_cpumask_update_lock);
> -
>   local_irq_disable();
>   arch_spin_lock(&tr->max_lock);
>   for_each_tracing_cpu(cpu) {
> @@ -4252,8 +4243,6 @@ tracing_cpumask_write(struct file *filp, const char 
> __user *ubuf,
>   local_irq_enable();
>  
>   cpumask_copy(tr->tracing_cpumask, tracing_cpumask_new);
> -
> - mutex_unlock(&tracing_cpumask_update_lock);
>   free_cpumask_var(tracing_cpumask_new);
>  
>   return count;
> -- 
> 2.7.4
> 

-- 
Thanks,
Changbin Du


Re: [PATCH v4] mm, thp: introduce generic transparent huge page allocation interfaces

2017-12-08 Thread Du, Changbin
On Fri, Dec 08, 2017 at 09:27:37AM +0100, Michal Hocko wrote:
> On Fri 08-12-17 12:42:55, changbin...@intel.com wrote:
> > From: Changbin Du 
> > 
> > This patch introduced 4 new interfaces to allocate a prepared transparent
> > huge page. These interfaces merge distributed two-step allocation as simple
> > single step. And they can avoid issue like forget to call 
> > prep_transhuge_page()
> > or call it on wrong page. A real fix:
> > 40a899e ("mm: migrate: fix an incorrect call of prep_transhuge_page()")
> > 
> > Anyway, I just want to prove that expose direct allocation interfaces is
> > better than a interface only do the second part of it.
> > 
> > These are similar to alloc_hugepage_xxx which are for hugetlbfs pages. New
> > interfaces are:
> >   - alloc_transhuge_page_vma
> >   - alloc_transhuge_page_nodemask
> >   - alloc_transhuge_page_node
> >   - alloc_transhuge_page
> > 
> > These interfaces implicitly add __GFP_COMP gfp mask which is the minimum
> > flags used for huge page allocation. More flags leave to the callers.
> > 
> > This patch does below changes:
> >   - define alloc_transhuge_page_xxx interfaces
> >   - apply them to all existing code
> >   - declare prep_transhuge_page as static since no others use it
> >   - remove alloc_hugepage_vma definition since it no longer has users
> 
> I am not really convinced this is a huge win, to be honest. Just look at
> the diffstat. Very few callsites get marginally simpler while we add a
> lot of stubs and the code churn.
>
I know we should write less code, but that is not the only rule. Sometimes we need
to add a little more code because the compiler requires it, but that doesn't mean
the compiler will generate worse/more machine code. Besides this, I really want
to know whether you have any other considerations. Thanks.
 
> >  mm/mempolicy.c  | 14 +++---
> >  mm/migrate.c| 14 --
> >  mm/shmem.c  |  6 ++
> >  8 files changed, 90 insertions(+), 56 deletions(-)
> -- 
> Michal Hocko
> SUSE Labs

-- 
Thanks,
Changbin Du


Re: [PATCH v3] mm, thp: introduce generic transparent huge page allocation interfaces

2017-12-07 Thread Du, Changbin

Hi Andrew,
On Thu, Dec 07, 2017 at 03:45:19PM -0800, Andrew Morton wrote:
> On Thu,  7 Dec 2017 18:54:19 +0800 changbin...@intel.com wrote:
> 
> > From: Changbin Du 
[snip]
 > -static inline void prep_transhuge_page(struct page *page) {}
> > +#define alloc_transhuge_page_vma(gfp_mask, vma, addr) NULL
> > +#define alloc_transhuge_page_nodemask(gfp_mask, preferred_nid, nmask) NULL
> > +#define alloc_transhuge_page_node(nid, gfp_maskg) NULL
> > +#define alloc_transhuge_page(gfp_mask) NULL
> 
> Ugly.  And such things can cause unused-variable warnings in calling
> code.  Whereas
> 
> static inline struct page *alloc_transhuge_page_vma(gfp_t gfp_mask,
>   struct vm_area_struct *vma, unsigned long addr)
> {
>   return NULL;
> }
> 
> will avoid such warnings.
>
Thanks for pointing out, I will update it.
   
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majord...@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: mailto:"d...@kvack.org";> em...@kvack.org 

-- 
Thanks,
Changbin Du


BUG: KASAN: global-out-of-bounds in cppc_get_perf_caps+0xf3/0x3b0

2017-12-04 Thread Du, Changbin
Hi Cherian,
Your patch 'ACPI / CPPC: Make CPPC ACPI driver aware of PCC subspace IDs'
introduced an out-of-bounds bug in the kernel. The code needs to check
cpu_pcc_subspace_idx before using it since it can be -1. Thanks.

[   15.113449] 
==
[   15.116983] BUG: KASAN: global-out-of-bounds in cppc_get_perf_caps+0xf3/0x3b0
[   15.116983] Read of size 8 at addr b9a5c0d8 by task swapper/0/1

[   15.116983] CPU: 3 PID: 1 Comm: swapper/0 Not tainted 4.15.0-rc2+ #2
[   15.116983] Hardware name: Dell Inc. OptiPlex 7040/0Y7WYT, BIOS 1.2.8 
01/26/2016
[   15.116983] Call Trace:
[   15.116983]  dump_stack+0x7c/0xbb
[   15.116983]  print_address_description+0x1df/0x290
[   15.116983]  kasan_report+0x28a/0x370
[   15.116983]  ? cppc_get_perf_caps+0xf3/0x3b0
[   15.116983]  cppc_get_perf_caps+0xf3/0x3b0
[   15.116983]  ? cpc_read+0x210/0x210
[   15.116983]  ? __rdmsr_on_cpu+0x90/0x90
[   15.116983]  ? rdmsrl_on_cpu+0xa9/0xe0
[   15.116983]  ? rdmsr_on_cpu+0x100/0x100
[   15.116983]  ? wrmsrl_on_cpu+0x9c/0xd0
[   15.116983]  ? wrmsrl_on_cpu+0x9c/0xd0
[   15.116983]  ? wrmsr_on_cpu+0xe0/0xe0
[   15.116983]  __intel_pstate_cpu_init.part.16+0x3a2/0x530
[   15.116983]  ? intel_pstate_init_cpu+0x197/0x390
[   15.116983]  ? show_no_turbo+0xe0/0xe0
[   15.116983]  ? __lockdep_init_map+0xa0/0x290
[   15.116983]  intel_pstate_cpu_init+0x30/0x60
[   15.116983]  cpufreq_online+0x155/0xac0
[   15.116983]  cpufreq_add_dev+0x9b/0xb0
[   15.116983]  subsys_interface_register+0x1ae/0x290
[   15.116983]  ? bus_unregister_notifier+0x40/0x40
[   15.116983]  ? mark_held_locks+0x83/0xb0
[   15.116983]  ? _raw_write_unlock_irqrestore+0x32/0x60
[   15.116983]  ? intel_pstate_setup+0xc/0x104
[   15.116983]  ? intel_pstate_setup+0xc/0x104
[   15.116983]  ? cpufreq_register_driver+0x1ce/0x2b0
[   15.116983]  cpufreq_register_driver+0x1ce/0x2b0
[   15.116983]  ? intel_pstate_setup+0x104/0x104
[   15.116983]  intel_pstate_register_driver+0x3a/0xa0
[   15.116983]  intel_pstate_init+0x3c4/0x434
[   15.116983]  ? intel_pstate_setup+0x104/0x104
[   15.116983]  ? intel_pstate_setup+0x104/0x104
[   15.116983]  do_one_initcall+0x9c/0x206
[   15.116983]  ? parameq+0xa0/0xa0
[   15.116983]  ? initcall_blacklisted+0x150/0x150
[   15.116983]  ? lock_downgrade+0x2c0/0x2c0
[   15.116983]  kernel_init_freeable+0x327/0x3f0
[   15.116983]  ? start_kernel+0x612/0x612
[   15.116983]  ? _raw_spin_unlock_irq+0x29/0x40
[   15.116983]  ? finish_task_switch+0xdd/0x320
[   15.116983]  ? finish_task_switch+0x8e/0x320
[   15.116983]  ? rest_init+0xd0/0xd0
[   15.116983]  kernel_init+0xf/0x11a
[   15.116983]  ? rest_init+0xd0/0xd0
[   15.116983]  ret_from_fork+0x24/0x30

[   15.116983] The buggy address belongs to the variable:
[   15.116983]  __key.36299+0x38/0x40

[   15.116983] Memory state around the buggy address:
[   15.116983]  b9a5bf80: fa fa fa fa 00 fa fa fa fa fa fa fa 00 fa fa fa
[   15.116983]  b9a5c000: fa fa fa fa 00 fa fa fa fa fa fa fa 00 fa fa fa
[   15.116983] >b9a5c080: fa fa fa fa 00 fa fa fa fa fa fa fa 00 00 00 00
[   15.116983] ^
[   15.116983]  b9a5c100: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
[   15.116983]  b9a5c180: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
[   15.116983] 
==

-- 
Thanks,
Changbin Du


Re: [RESEND PATCH v3] tracing: Allocate mask_str buffer dynamically

2017-11-29 Thread Du, Changbin
On Wed, Nov 29, 2017 at 10:12:09PM -0500, Steven Rostedt wrote:
> On Wed, 29 Nov 2017 12:42:45 +0800
> changbin...@intel.com wrote:
> 
> > From: Changbin Du 
> > 
> > The default NR_CPUS can be very large, but actual possible nr_cpu_ids
> > usually is very small. For my x86 distribution, the NR_CPUS is 8192 and
> > nr_cpu_ids is 4. About 2 pages are wasted.
> > 
> > Most machines don't have so many CPUs, so define a array with NR_CPUS
> > just wastes memory. So let's allocate the buffer dynamically when need.
> > 
> > The exact buffer size should be:
> >   DIV_ROUND_UP(nr_cpu_ids, 4) + nr_cpu_ids/32 + 2;
> > 
> > Example output:
> >   ff,
> 
> Um, what if there's more than 64 CPUs, where I have booted several
> boxes that have more. There's going to be more than 1 comma.
>
The commas are accounted for by the formula (the nr_cpu_ids/32 term).

> > 
> > With this change, the mutext tracing_cpumask_update_lock also can be
> > removed now, which was used to protect mask_str.
> > 
> > Signed-off-by: Changbin Du 
> > Cc: Steven Rostedt 
> > 
> > ---
> > v3:
> >   - remove tracing_cpumask_update_lock which was used to protect mask_str. 
> > (Rostedt)
> > v2:
> >   - remove 'static' declaration.
> >   - fix buffer size.
> > ---
> >  kernel/trace/trace.c | 29 +
> >  1 file changed, 9 insertions(+), 20 deletions(-)
> > 
> > diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
> > index 73e67b6..6750d05 100644
> > --- a/kernel/trace/trace.c
> > +++ b/kernel/trace/trace.c
> > @@ -4178,37 +4178,30 @@ static const struct file_operations 
> > show_traces_fops = {
> > .llseek = seq_lseek,
> >  };
> >  
> > -/*
> > - * The tracer itself will not take this lock, but still we want
> > - * to provide a consistent cpumask to user-space:
> > - */
> > -static DEFINE_MUTEX(tracing_cpumask_update_lock);
> > -
> > -/*
> > - * Temporary storage for the character representation of the
> > - * CPU bitmask (and one more byte for the newline):
> > - */
> > -static char mask_str[NR_CPUS + 1];
> > -
> >  static ssize_t
> >  tracing_cpumask_read(struct file *filp, char __user *ubuf,
> >  size_t count, loff_t *ppos)
> >  {
> > struct trace_array *tr = file_inode(filp)->i_private;
> > +   char *mask_str;
> > int len;
> >  
> > -   mutex_lock(&tracing_cpumask_update_lock);
> > +   /* Bitmap, ',' and two more bytes for the newline and '\0'. */
> > +   len = DIV_ROUND_UP(nr_cpu_ids, 4) + nr_cpu_ids/32 + 2;
> 
> This is broken. Instead do:
> 
>   len = snprintf(NULL, 0, "%*pb\n",
>  cpumask_pr_args(tr->tracing_cpumask)) + 1;
> 
>   mask_str = kmalloc(len, GFP_KERNEL);
>   [..]
>   len = snprintf(mask_str, len, "%*pb\n",
>  cpumask_pr_args(tr->tracing_cpumask));
> 
> -- Steve
> 
Hmm, I never knew that snprintf could be used this way. This is much better
than calculating the size with a formula.

> > +   mask_str = kmalloc(len, GFP_KERNEL);
> > +   if (!mask_str)
> > +   return -ENOMEM;
> >  
> > -   len = snprintf(mask_str, count, "%*pb\n",
> > +   len = snprintf(mask_str, len, "%*pb\n",
> >cpumask_pr_args(tr->tracing_cpumask));
> > if (len >= count) {
> > count = -EINVAL;
> > goto out_err;
> > }
> > -   count = simple_read_from_buffer(ubuf, count, ppos, mask_str, NR_CPUS+1);
> > +   count = simple_read_from_buffer(ubuf, count, ppos, mask_str, len);
> >  
> >  out_err:
> > -   mutex_unlock(&tracing_cpumask_update_lock);
> > +   kfree(mask_str);
> >  
> > return count;
> >  }
> > @@ -4228,8 +4221,6 @@ tracing_cpumask_write(struct file *filp, const char 
> > __user *ubuf,
> > if (err)
> > goto err_unlock;
> >  
> > -   mutex_lock(&tracing_cpumask_update_lock);
> > -
> > local_irq_disable();
> > arch_spin_lock(&tr->max_lock);
> > for_each_tracing_cpu(cpu) {
> > @@ -4252,8 +4243,6 @@ tracing_cpumask_write(struct file *filp, const char 
> > __user *ubuf,
> > local_irq_enable();
> >  
> > cpumask_copy(tr->tracing_cpumask, tracing_cpumask_new);
> > -
> > -   mutex_unlock(&tracing_cpumask_update_lock);
> > free_cpumask_var(tracing_cpumask_new);
> >  
> > return count;
> 

-- 
Thanks,
Changbin Du


Re: [PATCH v3] tracing: Allocate mask_str buffer dynamically

2017-11-20 Thread Du, Changbin
Hi Steven,
Have you picked up this patch, or does it need more polish? Thanks.

On Wed, Nov 01, 2017 at 11:28:08AM +0800, changbin...@intel.com wrote:
> From: Changbin Du 
> 
> The default NR_CPUS can be very large, but actual possible nr_cpu_ids
> usually is very small. For my x86 distribution, the NR_CPUS is 8192 and
> nr_cpu_ids is 4. About 2 pages are wasted.
> 
> Most machines don't have so many CPUs, so define a array with NR_CPUS
> just wastes memory. So let's allocate the buffer dynamically when need.
> 
> The exact buffer size should be:
>   DIV_ROUND_UP(nr_cpu_ids, 4) + nr_cpu_ids/32 + 2;
> 
> Example output:
>   ff,
> 
> With this change, the mutext tracing_cpumask_update_lock also can be
> removed now, which was used to protect mask_str.
> 
> Signed-off-by: Changbin Du 
> Cc: Steven Rostedt 
> 
> ---
> v3:
>   - remove tracing_cpumask_update_lock which was used to protect mask_str. 
> (Rostedt)
> v2:
>   - remove 'static' declaration.
>   - fix buffer size.
> ---
>  kernel/trace/trace.c | 29 +
>  1 file changed, 9 insertions(+), 20 deletions(-)
> 
> diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
> index 752e5da..5d2ec80 100644
> --- a/kernel/trace/trace.c
> +++ b/kernel/trace/trace.c
> @@ -4178,37 +4178,30 @@ static const struct file_operations show_traces_fops 
> = {
>   .llseek = seq_lseek,
>  };
>  
> -/*
> - * The tracer itself will not take this lock, but still we want
> - * to provide a consistent cpumask to user-space:
> - */
> -static DEFINE_MUTEX(tracing_cpumask_update_lock);
> -
> -/*
> - * Temporary storage for the character representation of the
> - * CPU bitmask (and one more byte for the newline):
> - */
> -static char mask_str[NR_CPUS + 1];
> -
>  static ssize_t
>  tracing_cpumask_read(struct file *filp, char __user *ubuf,
>size_t count, loff_t *ppos)
>  {
>   struct trace_array *tr = file_inode(filp)->i_private;
> + char *mask_str;
>   int len;
>  
> - mutex_lock(&tracing_cpumask_update_lock);
> + /* Bitmap, ',' and two more bytes for the newline and '\0'. */
> + len = DIV_ROUND_UP(nr_cpu_ids, 4) + nr_cpu_ids/32 + 2;
> + mask_str = kmalloc(len, GFP_KERNEL);
> + if (!mask_str)
> + return -ENOMEM;
>  
> - len = snprintf(mask_str, count, "%*pb\n",
> + len = snprintf(mask_str, len, "%*pb\n",
>  cpumask_pr_args(tr->tracing_cpumask));
>   if (len >= count) {
>   count = -EINVAL;
>   goto out_err;
>   }
> - count = simple_read_from_buffer(ubuf, count, ppos, mask_str, NR_CPUS+1);
> + count = simple_read_from_buffer(ubuf, count, ppos, mask_str, len);
>  
>  out_err:
> - mutex_unlock(&tracing_cpumask_update_lock);
> + kfree(mask_str);
>  
>   return count;
>  }
> @@ -4228,8 +4221,6 @@ tracing_cpumask_write(struct file *filp, const char 
> __user *ubuf,
>   if (err)
>   goto err_unlock;
>  
> - mutex_lock(&tracing_cpumask_update_lock);
> -
>   local_irq_disable();
>   arch_spin_lock(&tr->max_lock);
>   for_each_tracing_cpu(cpu) {
> @@ -4252,8 +4243,6 @@ tracing_cpumask_write(struct file *filp, const char 
> __user *ubuf,
>   local_irq_enable();
>  
>   cpumask_copy(tr->tracing_cpumask, tracing_cpumask_new);
> -
> - mutex_unlock(&tracing_cpumask_update_lock);
>   free_cpumask_var(tracing_cpumask_new);
>  
>   return count;
> -- 
> 2.7.4
> 

-- 
Thanks,
Changbin Du


signature.asc
Description: PGP signature


Re: [PATCH] x86, build: Make genimage.sh little more quite

2017-11-08 Thread Du, Changbin
On Wed, Nov 08, 2017 at 10:01:37AM +0100, Ingo Molnar wrote:
> 
> * changbin...@intel.com  wrote:
> 
> > From: Changbin Du 
> > 
> > This change suppresses the 'dd' output and adds '-quite' parameter
> > to mkisofs tool. None of the messages matter to the user. Now:
> > 
> > $ make isoimage
> > ...
> > Kernel: arch/x86/boot/bzImage is ready  (#75)
> >   GENIMAGE arch/x86/boot/image.iso
> > Using /usr/lib/ISOLINUX/isolinux.bin
> > Using /usr/lib/syslinux/modules/bios/ldlinux.c32
> > Kernel: arch/x86/boot/image.iso is ready
> 
> Could you please also remove the other unnecessary lines as well, from the 
> default 
> build log:
> 
> > Using /usr/lib/ISOLINUX/isolinux.bin
> > Using /usr/lib/syslinux/modules/bios/ldlinux.c32
> > Kernel: arch/x86/boot/image.iso is ready
> 
> The build process is using a ton of files and we don't list them. Maybe print 
> them 
> when V=1, but not by default.
> 
> The only lines printed should be:
> 
> > Kernel: arch/x86/boot/bzImage is ready  (#75)
> >   GENIMAGE arch/x86/boot/image.iso
> 
> Users are totally not interested in the details. They want to see a single 
> line of 
> output about which file is being built, but that's it.
>
I agree with you. So I will remove the 'Using ...' lines, and enable 'set -x' if V=1.

> Thanks,
> 
>   Ingo

-- 
Thanks,
Changbin Du


signature.asc
Description: PGP signature


Re: [PATCH v2 1/4] x86, build: Factor out fdimage/isoimage generation commands to standalone script

2017-11-07 Thread Du, Changbin
On Tue, Nov 07, 2017 at 11:20:29AM +0100, Ingo Molnar wrote:
> 
> * changbin...@intel.com  wrote:
> 
> > From: Changbin Du 
> > 
> > The build message for fdimage/isoimage are pretty unstructured. The raw
> > shell command blocks are printed. We can improve them as regular build
> > system messages. Besides, writing commands in a shell script is much more
> > easy than in a Makefile.
> > 
> > See Ingo's suggestion here https://lkml.org/lkml/2017/10/31/124.
> > 
> > This patch factors out the commands used for fdimage/isoimage generation
> > from arch/x86/boot/Makefile to a new script arch/x86/boot/genimage.sh.
> > Then it adds the new kbuild command 'genimage' which invokes the new script.
> > All fdimages/isoimage files are now generated by a call to 'genimage' with
> > different parameters.
> > 
> > Now 'make isoimage' becomes:
> > ...
> > Kernel: arch/x86/boot/bzImage is ready  (#30)
> >   GENIMAGE arch/x86/boot/image.iso
> > Size of boot image is 4 sectors -> No emulation
> >  15.37% done, estimate finish Sun Nov  5 23:36:57 2017
> >  30.68% done, estimate finish Sun Nov  5 23:36:57 2017
> >  46.04% done, estimate finish Sun Nov  5 23:36:57 2017
> >  61.35% done, estimate finish Sun Nov  5 23:36:57 2017
> >  76.69% done, estimate finish Sun Nov  5 23:36:57 2017
> >  92.00% done, estimate finish Sun Nov  5 23:36:57 2017
> > Total translation table size: 2048
> > Total rockridge attributes bytes: 659
> > Total directory bytes: 0
> > Path table size(bytes): 10
> > Max brk space used 0
> > 32608 extents written (63 MB)
> > Kernel: arch/x86/boot/image.iso is ready
> 
> Could we please also do another patch, to make it emit only this by default:
> 
> >   GENIMAGE arch/x86/boot/image.iso
> 
> None of the other messages really matter to the user.
> 
> Maybe emit them when building with V=1 - but none of that info is really 
> interesting IMHO.
> 
Ingo, do you mean omitting the messages below?
Size of boot image is 4 sectors -> No emulation
15.37% done, estimate finish Sun Nov  5 23:36:57 2017
...
32608 extents written (63 MB)

These are printed by mkisofs. How about just adding the '-quiet' parameter to it?
Kernel: arch/x86/boot/bzImage is ready  (#75)
GENIMAGE arch/x86/boot/image.iso
Using /usr/lib/ISOLINUX/isolinux.bin
Using /usr/lib/syslinux/modules/bios/ldlinux.c32
Kernel: arch/x86/boot/image.iso is ready

> Thanks,
> 
>   Ingo

-- 
Thanks,
Changbin Du


signature.asc
Description: PGP signature


Re: [PATCH 2/4] x86, build: Add new paths for isolinux.bin and ldlinux.c32

2017-11-05 Thread Du, Changbin
Hi Ingo,
On Sun, Nov 05, 2017 at 10:33:53AM +0100, Ingo Molnar wrote:
> 
> * changbin...@intel.com  wrote:
> 
> > From: Changbin Du 
> > 
> > Recently I failed to build isoimage target, because the path of isolinux.bin
> > changed to /usr/xxx/ISOLINUX/isolinux.bin, as well as ldlinux.c32 which
> > changed to /usr/xxx/syslinux/modules/bios/ldlinux.c32.
> > 
> > This patch has a improvement of the file search:
> >   - Show a error message instead of silent fail.
> >   - Add above new paths.
> 
> How about:
> 
>   This patch improves the file search logic:
> - Show an error message instead of failing silently
> - Add the new paths listed above.
> 
> 
> > +   if [ $i = end -a -z "$isolinux" ] ; then
> > +   echo 'Need isolinux.bin, please install 
> > syslinux/isolinux'
> 
> How about:
> 
>   echo 'Need an isolinux.bin file, please install 
> syslinux/isolinux'
>
The new description is much better, will update. Thanks.

> Thanks,
> 
>   Ingo

-- 
Thanks,
Changbin Du


signature.asc
Description: PGP signature


Re: [PATCH 1/4] x86, build: Fact out fdimage/isoimage generation commands to standalone script

2017-11-05 Thread Du, Changbin
On Sun, Nov 05, 2017 at 10:32:08AM +0100, Ingo Molnar wrote:
> 
> A few spelling fixes:
> 
> in the title:
> 
> s/Fact out
>  /Factor out
> 
> * changbin...@intel.com  wrote:
> 
> > From: Changbin Du 
> > 
> > The build message for fdimage/isoimage are pretty unstructured. The raw
> > shell command blocks are printed. We can improve them as regular build
> > system messages. Besides, writing commands in shell script is much more
> > easier than in a Makefile.
> 
> s/much more easier
>  /much more easy
> 
> > 
> > See Ingo's suggestion here https://lkml.org/lkml/2017/10/31/124.
> > 
> > This patch fact out the commands used for fdimage/isoimage generation from
> > arch/x86/boot/Makefile to new script arch/x86/boot/genimage.sh. Then add a
> > new kbuild command 'genimage' which invokes the new script. All
> > fdimages/isoimage now is generated by call to 'genimage' with different
> > parameters.
> 
> s/fact out
>  /factors out
> 
> s/to new script
>   to a new script
> 
> s/Then add
>  /Then it adds
> 
> s/a new kbuild command 'genimage'
>  /the new 'genimage' kbuild command
> 
> s/All fdimages/isoimage now is generated by call to
>  /All fdimage/isoimage files are now generated by a call to
> 
Sorry for these grammar errors. I always forget to write complete sentences in
English. :)

> > +#   $3 - kernel bzImage file
> > +#   $4 - mtool configuration file
> > +#   $5 - kernel cmdline
> > +#   $6 - inird image file
> > +#
> 
> The new script is much easier to read!
> 
> Thanks,
> 
>   Ingo

-- 
Thanks,
Changbin Du


signature.asc
Description: PGP signature


Re: [PATCH] x86, build: Improve the isolinux searching of isoimage generation

2017-11-01 Thread Du, Changbin
Hi Ingo and Yamada,
Thanks for your suggestions. I'll give it a try, though I am not familiar with
the kbuild system.

On Wed, Nov 01, 2017 at 12:17:50PM +0900, Masahiro Yamada wrote:
> 2017-10-31 18:39 GMT+09:00 Ingo Molnar :
> >
> > * changbin...@intel.com  wrote:
> >
> >> From: Changbin Du 
> >>
> >> Recently I failed to build isoimage target, because the path of 
> >> isolinux.bin
> >> changed to /usr/xxx/ISOLINUX/isolinux.bin, as well as ldlinux.c32 which
> >> changed to /usr/xxx/syslinux/modules/bios/ldlinux.c32.
> >>
> >> This patch has a improvement of the file search:
> >>   - Don't print the raw shell commands. It doesn't make sense to show the
> >> entire big block.
> >>   - Show a error message instead of silent fail.
> >>   - Add above new paths.
> >>
> >> Now it becomes:
> >> Kernel: arch/x86/boot/bzImage is ready  (#62)
> >> rm -rf arch/x86/boot/isoimage
> >> mkdir arch/x86/boot/isoimage
> >> Using /usr/lib/ISOLINUX/isolinux.bin
> >> Using /usr/lib/syslinux/modules/bios/ldlinux.c32
> >> cp arch/x86/boot/bzImage arch/x86/boot/isoimage/linux
> >> ...
> >>
> >> Before:
> >> Kernel: arch/x86/boot/bzImage is ready  (#63)
> >> rm -rf arch/x86/boot/isoimage
> >> mkdir arch/x86/boot/isoimage
> >> for i in lib lib64 share end ; do \
> >>   if [ -f /usr/$i/syslinux/isolinux.bin ] ; then \
> >>   cp /usr/$i/syslinux/isolinux.bin arch/x86/boot/isoimage ; \
> >>   if [ -f /usr/$i/syslinux/ldlinux.c32 ]; then \
> >>   cp /usr/$i/syslinux/ldlinux.c32 
> >> arch/x86/boot/isoimage ; \
> >>   fi ; \
> >>   break ; \
> >>   fi ; \
> >>   if [ $i = end ] ; then exit 1 ; fi ; \
> >> done
> >> arch/x86/boot/Makefile:161: recipe for target 'isoimage' failed
> >> make[1]: *** [isoimage] Error 1
> >
> > I like these changes. Could we please further improve it: for example the 
> > boot
> > image build messages are still pretty unstructured, while regular build 
> > system
> > messages come in the following format:
> >
> >   CC  arch/x86/events/msr.o
> >   RELOCS  arch/x86/realmode/rm/realmode.relocs
> >   OBJCOPY arch/x86/realmode/rm/realmode.bin
> >   CC  arch/x86/kernel/signal.o
> >   AS  arch/x86/realmode/rmpiggy.o
> >   CC  ipc/msg.o
> >   AR  arch/x86/ia32/built-in.o
> >   CC  arch/x86/events/amd/iommu.o
> >   CC  init/do_mounts.o
> >   AR  arch/x86/realmode/built-in.o
> >
> > So instead of:
> >
> >> Kernel: arch/x86/boot/bzImage is ready  (#62)
> >> rm -rf arch/x86/boot/isoimage
> >> mkdir arch/x86/boot/isoimage
> >> Using /usr/lib/ISOLINUX/isolinux.bin
> >> Using /usr/lib/syslinux/modules/bios/ldlinux.c32
> >> cp arch/x86/boot/bzImage arch/x86/boot/isoimage/linux
> >
> > Could we make it something more streamlined and similar to the rest of the 
> > build
> > as well, like:
> >
> >   GEN arch/x86/boot/bzImage
> >   GEN arch/x86/boot/isoimage
> >   GEN arch/x86/boot/isoimage/linux
> >
> > I.e. only mention the new files built, with an appropriate prefix.
> >
> > I've Cc:-ed the kbuild maintainers, maybe they have a better suggestion 
> > instead of
> > the 'GEN' abbreviation?
> >
> 
> Generally, the abbreviation is the tool that has processed the target,
> but if you do not find an appropriate one, 'GEN' is fine.
> 
> 
> 
> 
> -- 
> Best Regards
> Masahiro Yamada

-- 
Thanks,
Changbin Du


signature.asc
Description: PGP signature


Re: [PATCH v2] tracing: Allocate mask_str buffer dynamically

2017-10-31 Thread Du, Changbin
Hi Rostedt,
On Tue, Oct 31, 2017 at 12:19:58PM -0400, Steven Rostedt wrote:
> On Thu, 26 Oct 2017 00:20:28 +0800
> changbin...@intel.com wrote:
> 
> > From: Changbin Du 
> > 
> > The default NR_CPUS can be very large, but actual possible nr_cpu_ids
> > usually is very small. For my x86 distribution, the NR_CPUS is 8192 and
> > nr_cpu_ids is 4. About 2 pages are wasted.
> > 
> > Most machines don't have so many CPUs, so define a array with NR_CPUS
> > just wastes memory. So let's allocate the buffer dynamically when need.
> > 
> > The exact buffer size should be:
> >   DIV_ROUND_UP(nr_cpu_ids, 4) + nr_cpu_ids/32 + 2;
> > 
> > Example output:
> >   ff,
> > 
> > Signed-off-by: Changbin Du 
> > 
> > ---
> > v2:
> >   - remove 'static' declaration.
> >   - fix buffer size.
> > ---
> >  kernel/trace/trace.c | 18 ++
> >  1 file changed, 10 insertions(+), 8 deletions(-)
> > 
> > diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
> > index 752e5da..6b70648 100644
> > --- a/kernel/trace/trace.c
> > +++ b/kernel/trace/trace.c
> > @@ -4184,31 +4184,33 @@ static const struct file_operations 
> > show_traces_fops = {
> >   */
> >  static DEFINE_MUTEX(tracing_cpumask_update_lock);
> 
> The above mutex was used to protect mask_str.
> 
> >  
> > -/*
> > - * Temporary storage for the character representation of the
> > - * CPU bitmask (and one more byte for the newline):
> > - */
> > -static char mask_str[NR_CPUS + 1];
> > -
> >  static ssize_t
> >  tracing_cpumask_read(struct file *filp, char __user *ubuf,
> >  size_t count, loff_t *ppos)
> >  {
> > struct trace_array *tr = file_inode(filp)->i_private;
> > +   char *mask_str;
> > int len;
> >  
> > +   /* Bitmap, ',' and two more bytes for the newline and '\0'. */
> > +   len = DIV_ROUND_UP(nr_cpu_ids, 4) + nr_cpu_ids/32 + 2;
> > +   mask_str = kmalloc(len, GFP_KERNEL);
> > +   if (!mask_str)
> > +   return -ENOMEM;
> > +
> > mutex_lock(&tracing_cpumask_update_lock);
> 
> This patch can remove the mutex as well, since there's no sharing of
> the mask anymore.
> 
> -- Steve
>
ok, let me remove it in v3.

> >  
> > -   len = snprintf(mask_str, count, "%*pb\n",
> > +   len = snprintf(mask_str, len, "%*pb\n",
> >cpumask_pr_args(tr->tracing_cpumask));
> > if (len >= count) {
> > count = -EINVAL;
> > goto out_err;
> > }
> > -   count = simple_read_from_buffer(ubuf, count, ppos, mask_str, NR_CPUS+1);
> > +   count = simple_read_from_buffer(ubuf, count, ppos, mask_str, len);
> >  
> >  out_err:
> > mutex_unlock(&tracing_cpumask_update_lock);
> > +   kfree(mask_str);
> >  
> > return count;
> >  }
> 

-- 
Thanks,
Changbin Du


signature.asc
Description: PGP signature


Re: [PATCH] tracing: Allocate mask_str buffer dynamically

2017-10-25 Thread Du, Changbin
On Wed, Oct 25, 2017 at 07:24:36PM +0800, changbin...@intel.com wrote:
> From: Changbin Du 
> 
> The default NR_CPUS can be very large, but actual possible nr_cpu_ids
> usually is very small. For my x86 distribution, the NR_CPUS is 8192 and
> nr_cpu_ids is 4. About 2 pages are wasted.
> 
> Most machines don't have so many CPUs, so define a array with NR_CPUS
> just wastes memory. So let's allocate the buffer dynamically when need.
> 
> Signed-off-by: Changbin Du 
> ---
>  kernel/trace/trace.c | 15 ---
>  1 file changed, 8 insertions(+), 7 deletions(-)
> 
> diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
> index 752e5da..d1b3f11 100644
> --- a/kernel/trace/trace.c
> +++ b/kernel/trace/trace.c
> @@ -4184,19 +4184,18 @@ static const struct file_operations show_traces_fops 
> = {
>   */
>  static DEFINE_MUTEX(tracing_cpumask_update_lock);
>  
> -/*
> - * Temporary storage for the character representation of the
> - * CPU bitmask (and one more byte for the newline):
> - */
> -static char mask_str[NR_CPUS + 1];
> -
>  static ssize_t
>  tracing_cpumask_read(struct file *filp, char __user *ubuf,
>size_t count, loff_t *ppos)
>  {
>   struct trace_array *tr = file_inode(filp)->i_private;
> + static char *mask_str;
Ah, I need to remove 'static'.

>   int len;
>  
> + mask_str = kmalloc(nr_cpu_ids + 1, GFP_KERNEL);
> + if (!mask_str)
> + return -ENOMEM;
> +
>   mutex_lock(&tracing_cpumask_update_lock);
>  
>   len = snprintf(mask_str, count, "%*pb\n",
> @@ -4205,10 +4204,12 @@ tracing_cpumask_read(struct file *filp, char __user 
> *ubuf,
>   count = -EINVAL;
>   goto out_err;
>   }
> - count = simple_read_from_buffer(ubuf, count, ppos, mask_str, NR_CPUS+1);
> + count = simple_read_from_buffer(ubuf, count, ppos,
> + mask_str, nr_cpu_ids+1);
>  
>  out_err:
>   mutex_unlock(&tracing_cpumask_update_lock);
> + kfree(mask_str);
>  
>   return count;
>  }
> -- 
> 2.7.4
> 

-- 
Thanks,
Changbin Du


signature.asc
Description: PGP signature


Re: [PATCH v2 1/2] mm, thp: introduce dedicated transparent huge page allocation interfaces

2017-10-22 Thread Du, Changbin
Hi Lameter,
On Fri, Oct 20, 2017 at 06:35:44AM -0500, Christopher Lameter wrote:
> On Fri, 20 Oct 2017, changbin...@intel.com wrote:
> 
> > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > index 269b5df..2a960fc 100644
> > --- a/mm/huge_memory.c
> > +++ b/mm/huge_memory.c
> > @@ -501,6 +501,43 @@ void prep_transhuge_page(struct page *page)
> > set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
> >  }
> >
> > +struct page *alloc_transhuge_page_vma(gfp_t gfp_mask,
> > +   struct vm_area_struct *vma, unsigned long addr)
> > +{
> > +   struct page *page;
> > +
> > +   page = alloc_pages_vma(gfp_mask | __GFP_COMP, HPAGE_PMD_ORDER,
> > +  vma, addr, numa_node_id(), true);
> > +   if (unlikely(!page))
> > +   return NULL;
> > +   prep_transhuge_page(page);
> > +   return page;
> > +}
> > +
> > +struct page *alloc_transhuge_page_nodemask(gfp_t gfp_mask,
> > +   int preferred_nid, nodemask_t *nmask)
> > +{
> > +   struct page *page;
> > +
> > +   page = __alloc_pages_nodemask(gfp_mask | __GFP_COMP, HPAGE_PMD_ORDER,
> > + preferred_nid, nmask);
> > +   if (unlikely(!page))
> > +   return NULL;
> > +   prep_transhuge_page(page);
> > +   return page;
> > +}
> > +
> > +struct page *alloc_transhuge_page(gfp_t gfp_mask)
> > +{
> > +   struct page *page;
> > +
> > +   page = alloc_pages(gfp_mask | __GFP_COMP, HPAGE_PMD_ORDER);
> > +   if (unlikely(!page))
> > +   return NULL;
> > +   prep_transhuge_page(page);
> > +   return page;
> > +}
> > +
> 
> These look pretty similar to the code used for huge pages (aside from the
> call to prep_transhuge_page(). Maybe we can have common allocation
> primitives for huge pages?
> 
Yes, they are similar to each other, but the allocation approaches are quite
different: hugetlbfs allocates pages from reserved memory, while thp just gets
pages directly from the page allocator.

I don't think it makes much sense to provide a unified API for both of them,
because the hugetlb allocation primitives are only used within hugetlbfs code,
while thp allocation is more common system-wide. If we unify them, then all the
APIs need one more parameter to distinguish which kind of huge page is going to
be allocated.

-- 
Thanks,
Changbin Du


signature.asc
Description: PGP signature


Re: [PATCH 1/2] mm, thp: introduce dedicated transparent huge page allocation interfaces

2017-10-20 Thread Du, Changbin
Hi Hocko,
On Thu, Oct 19, 2017 at 02:49:31PM +0200, Michal Hocko wrote:
> On Wed 18-10-17 19:00:26, Du, Changbin wrote:
> > Hi Hocko,
> > 
> > On Tue, Oct 17, 2017 at 12:20:52PM +0200, Michal Hocko wrote:
> > > [CC Kirill]
> > > 
> > > On Mon 16-10-17 17:19:16, changbin...@intel.com wrote:
> > > > From: Changbin Du 
> > > > 
> > > > This patch introduced 4 new interfaces to allocate a prepared
> > > > transparent huge page.
> > > >   - alloc_transhuge_page_vma
> > > >   - alloc_transhuge_page_nodemask
> > > >   - alloc_transhuge_page_node
> > > >   - alloc_transhuge_page
> > > > 
> > > > The aim is to remove duplicated code and simplify transparent
> > > > huge page allocation. These are similar to alloc_hugepage_xxx
> > > > which are for hugetlbfs pages. This patch does below changes:
> > > >   - define alloc_transhuge_page_xxx interfaces
> > > >   - apply them to all existing code
> > > >   - declare prep_transhuge_page as static since no others use it
> > > >   - remove alloc_hugepage_vma definition since it no longer has users
> > > 
> > > So what exactly is the advantage of the new API? The diffstat doesn't
> > > sound very convincing to me.
> > >
> > The caller only need one step to allocate thp. Several LOCs removed for all 
> > the
> > caller side with this change. So it's little more convinent.
> 
> Yeah, but the overall result is more code. So I am not really convinced. 
Yes, but some of the code is there just to make the compiler happy
(declarations). These are just simple, light wrappers, the same as other
functions in the kernel. At least the code readability is improved by this: the
two-step allocation is merged into one, so the duplicated logic is removed.

> -- 
> Michal Hocko
> SUSE Labs

-- 
Thanks,
Changbin Du


signature.asc
Description: PGP signature


Re: [PATCH 0/2] mm, thp: introduce dedicated transparent huge page allocation interfaces

2017-10-18 Thread Du, Changbin
Hi Morton,
On Tue, Oct 17, 2017 at 04:28:16PM -0700, Andrew Morton wrote:
> On Mon, 16 Oct 2017 17:19:15 +0800 changbin...@intel.com wrote:
> 
> > The first one introduce new interfaces, the second one kills naming 
> > confusion.
> > The aim is to remove duplicated code and simplify transparent huge page
> > allocation.
> 
> These introduce various allnoconfig build errors.
Thanks, I will fix them and do more testing.

-- 
Thanks,
Changbin Du


signature.asc
Description: PGP signature


Re: [PATCH 1/2] mm, thp: introduce dedicated transparent huge page allocation interfaces

2017-10-18 Thread Du, Changbin
On Tue, Oct 17, 2017 at 02:12:46PM +0300, Kirill A. Shutemov wrote:
> On Mon, Oct 16, 2017 at 05:19:16PM +0800, changbin...@intel.com wrote:
> > @@ -501,6 +501,45 @@ void prep_transhuge_page(struct page *page)
> > set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
> >  }
> >  
> > +struct page *alloc_transhuge_page_vma(gfp_t gfp_mask,
> > +   struct vm_area_struct *vma, unsigned long addr)
> > +{
> > +   struct page *page;
> > +
> > +   page = alloc_pages_vma(gfp_mask | __GFP_COMP, HPAGE_PMD_ORDER,
> > +  vma, addr, numa_node_id(), true);
> > +   if (unlikely(!page))
> > +   return NULL;
> > +   prep_transhuge_page(page);
> > +   return page;
> > +}
> > +
> > +struct page *alloc_transhuge_page_nodemask(gfp_t gfp_mask,
> > +   int preferred_nid, nodemask_t *nmask)
> > +{
> > +   struct page *page;
> > +
> > +   page = __alloc_pages_nodemask(gfp_mask | __GFP_COMP, HPAGE_PMD_ORDER,
> > + preferred_nid, nmask);
> > +   if (unlikely(!page))
> > +   return NULL;
> > +   prep_transhuge_page(page);
> > +   return page;
> > +}
> > +
> > +struct page *alloc_transhuge_page(gfp_t gfp_mask)
> > +{
> > +   struct page *page;
> > +
> > +   VM_BUG_ON(!(gfp_mask & __GFP_COMP));
> 
> Why do you check for __GFP_COMP only in this helper?
> 
> > +   page = alloc_pages(gfp_mask | __GFP_COMP, HPAGE_PMD_ORDER);
> 
> And still apply __GFP_COMP anyway?
>
This is a mistake; it will be removed. Thanks.

> > +   if (unlikely(!page))
> > +   return NULL;
> > +   prep_transhuge_page(page);
> > +   return page;
> > +}
> > +
> >  unsigned long __thp_get_unmapped_area(struct file *filp, unsigned long len,
> > loff_t off, unsigned long flags, unsigned long size)
> >  {
> 
> -- 
>  Kirill A. Shutemov

-- 
Thanks,
Changbin Du


signature.asc
Description: PGP signature


Re: [PATCH 1/2] mm, thp: introduce dedicated transparent huge page allocation interfaces

2017-10-18 Thread Du, Changbin
Hi Hocko,

On Tue, Oct 17, 2017 at 12:20:52PM +0200, Michal Hocko wrote:
> [CC Kirill]
> 
> On Mon 16-10-17 17:19:16, changbin...@intel.com wrote:
> > From: Changbin Du 
> > 
> > This patch introduced 4 new interfaces to allocate a prepared
> > transparent huge page.
> >   - alloc_transhuge_page_vma
> >   - alloc_transhuge_page_nodemask
> >   - alloc_transhuge_page_node
> >   - alloc_transhuge_page
> > 
> > The aim is to remove duplicated code and simplify transparent
> > huge page allocation. These are similar to alloc_hugepage_xxx
> > which are for hugetlbfs pages. This patch does below changes:
> >   - define alloc_transhuge_page_xxx interfaces
> >   - apply them to all existing code
> >   - declare prep_transhuge_page as static since no others use it
> >   - remove alloc_hugepage_vma definition since it no longer has users
> 
> So what exactly is the advantage of the new API? The diffstat doesn't
> sound very convincing to me.
>
The caller only needs one step to allocate a thp. Several LOCs are removed on
the caller side with this change, so it's a little more convenient.

> > Signed-off-by: Changbin Du 
> > ---
> >  include/linux/gfp.h |  4 
> >  include/linux/huge_mm.h | 13 -
> >  include/linux/migrate.h | 14 +-
> >  mm/huge_memory.c| 50 
> > ++---
> >  mm/khugepaged.c | 11 ++-
> >  mm/mempolicy.c  | 10 +++---
> >  mm/migrate.c| 12 
> >  mm/shmem.c  |  6 ++
> >  8 files changed, 71 insertions(+), 49 deletions(-)
> > 
> > diff --git a/include/linux/gfp.h b/include/linux/gfp.h
> > index f780718..855c72e 100644
> > --- a/include/linux/gfp.h
> > +++ b/include/linux/gfp.h
> > @@ -507,15 +507,11 @@ alloc_pages(gfp_t gfp_mask, unsigned int order)
> >  extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order,
> > struct vm_area_struct *vma, unsigned long addr,
> > int node, bool hugepage);
> > -#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \
> > -   alloc_pages_vma(gfp_mask, order, vma, addr, numa_node_id(), true)
> >  #else
> >  #define alloc_pages(gfp_mask, order) \
> > alloc_pages_node(numa_node_id(), gfp_mask, order)
> >  #define alloc_pages_vma(gfp_mask, order, vma, addr, node, false)\
> > alloc_pages(gfp_mask, order)
> > -#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \
> > -   alloc_pages(gfp_mask, order)
> >  #endif
> >  #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
> >  #define alloc_page_vma(gfp_mask, vma, addr)\
> > diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> > index 14bc21c..1dd2c33 100644
> > --- a/include/linux/huge_mm.h
> > +++ b/include/linux/huge_mm.h
> > @@ -130,9 +130,20 @@ extern unsigned long thp_get_unmapped_area(struct file 
> > *filp,
> > unsigned long addr, unsigned long len, unsigned long pgoff,
> > unsigned long flags);
> >  
> > -extern void prep_transhuge_page(struct page *page);
> >  extern void free_transhuge_page(struct page *page);
> >  
> > +struct page *alloc_transhuge_page_vma(gfp_t gfp_mask,
> > +   struct vm_area_struct *vma, unsigned long addr);
> > +struct page *alloc_transhuge_page_nodemask(gfp_t gfp_mask,
> > +   int preferred_nid, nodemask_t *nmask);
> > +
> > +static inline struct page *alloc_transhuge_page_node(int nid, gfp_t 
> > gfp_mask)
> > +{
> > +   return alloc_transhuge_page_nodemask(gfp_mask, nid, NULL);
> > +}
> > +
> > +struct page *alloc_transhuge_page(gfp_t gfp_mask);
> > +
> >  bool can_split_huge_page(struct page *page, int *pextra_pins);
> >  int split_huge_page_to_list(struct page *page, struct list_head *list);
> >  static inline int split_huge_page(struct page *page)
> > diff --git a/include/linux/migrate.h b/include/linux/migrate.h
> > index 643c7ae..70a00f3 100644
> > --- a/include/linux/migrate.h
> > +++ b/include/linux/migrate.h
> > @@ -42,19 +42,15 @@ static inline struct page *new_page_nodemask(struct 
> > page *page,
> > return 
> > alloc_huge_page_nodemask(page_hstate(compound_head(page)),
> > preferred_nid, nodemask);
> >  
> > -   if (thp_migration_supported() && PageTransHuge(page)) {
> > -   order = HPAGE_PMD_ORDER;
> > -   gfp_mask |= GFP_TRANSHUGE;
> > -   }
> > -
> > if (PageHighMem(page) || (zone_idx(page_zone(page)) == ZONE_MOVABLE))
> > gfp_mask |= __GFP_HIGHMEM;
> >  
> > -   new_page = __alloc_pages_nodemask(gfp_mask, order,
> > +   if (thp_migration_supported() && PageTransHuge(page))
> > +   return alloc_transhuge_page_nodemask(gfp_mask | GFP_TRANSHUGE,
> > +   preferred_nid, nodemask);
> > +   else
> > +   return __alloc_pages_nodemask(gfp_mask, order,
> > preferred_nid, nodemask);
> > -
> > -   if (new_page && PageTransHuge(page))
> > -   prep_transhuge_pag

Re: [PATCH 2/2] mm: rename page dtor functions to {compound,huge,transhuge}_page__dtor

2017-10-17 Thread Du, Changbin
Hi Khandual,
> > long freed);
> >  bool isolate_huge_page(struct page *page, struct list_head *list);
> >  void putback_active_hugepage(struct page *page);
> > -void free_huge_page(struct page *page);
> > +void huge_page_dtor(struct page *page);
> >  void hugetlb_fix_reserve_counts(struct inode *inode);
> >  extern struct mutex *hugetlb_fault_mutex_table;
> >  u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
> > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > index 065d99d..adfa906 100644
> > --- a/include/linux/mm.h
> > +++ b/include/linux/mm.h
> > @@ -616,7 +616,7 @@ void split_page(struct page *page, unsigned int order);
> >   * prototype for that function and accessor functions.
> >   * These are _only_ valid on the head of a compound page.
> >   */
> > -typedef void compound_page_dtor(struct page *);
> > +typedef void compound_page_dtor_t(struct page *);
> 
> Why changing this ? I understand _t kind of specifies it more
> like a type def but this patch is just to rename the compound
> page destructor functions. Not sure we should change datatype
> here as well in this patch.
>
It is because of a name conflict. I think you already got that from the comments
below. I will describe it in the commit message.

> >  
> >  /* Keep the enum in sync with compound_page_dtors array in mm/page_alloc.c 
> > */
> >  enum compound_dtor_id {
> > @@ -630,7 +630,7 @@ enum compound_dtor_id {
> >  #endif
> > NR_COMPOUND_DTORS,
> >  };
> > -extern compound_page_dtor * const compound_page_dtors[];
> > +extern compound_page_dtor_t * const compound_page_dtors[];
> >  
> >  static inline void set_compound_page_dtor(struct page *page,
> > enum compound_dtor_id compound_dtor)
> > @@ -639,7 +639,7 @@ static inline void set_compound_page_dtor(struct page 
> > *page,
> > page[1].compound_dtor = compound_dtor;
> >  }
> >  
> > -static inline compound_page_dtor *get_compound_page_dtor(struct page *page)
> > +static inline compound_page_dtor_t *get_compound_page_dtor(struct page 
> > *page)
> 
> Which is adding these kind of changes to the patch without
> having a corresponding description in the commit message.
> 
> >  {
> > VM_BUG_ON_PAGE(page[1].compound_dtor >= NR_COMPOUND_DTORS, page);
> > return compound_page_dtors[page[1].compound_dtor];
> > @@ -657,7 +657,7 @@ static inline void set_compound_order(struct page 
> > *page, unsigned int order)
> > page[1].compound_order = order;
> >  }
> >  
> > -void free_compound_page(struct page *page);
> > +void compound_page_dtor(struct page *page);
> >  
> >  #ifdef CONFIG_MMU
> >  /*
> > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > index e267488..a01125b 100644
> > --- a/mm/huge_memory.c
> > +++ b/mm/huge_memory.c
> > @@ -2717,7 +2717,7 @@ fail: if (mapping)
> > return ret;
> >  }
> >  
> > -void free_transhuge_page(struct page *page)
> > +void transhuge_page_dtor(struct page *page)
> >  {
> > struct pglist_data *pgdata = NODE_DATA(page_to_nid(page));
> > unsigned long flags;
> > @@ -2728,7 +2728,7 @@ void free_transhuge_page(struct page *page)
> > list_del(page_deferred_list(page));
> > }
> > spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
> > -   free_compound_page(page);
> > +   compound_page_dtor(page);
> >  }
> >  
> >  void deferred_split_huge_page(struct page *page)
> > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > index 424b0ef..1af2c4e7 100644
> > --- a/mm/hugetlb.c
> > +++ b/mm/hugetlb.c
> > @@ -1250,7 +1250,7 @@ static void clear_page_huge_active(struct page *page)
> > ClearPagePrivate(&page[1]);
> >  }
> >  
> > -void free_huge_page(struct page *page)
> > +void huge_page_dtor(struct page *page)
> >  {
> > /*
> >  * Can't pass hstate in here because it is called from the
> > @@ -1363,7 +1363,7 @@ int PageHeadHuge(struct page *page_head)
> > if (!PageHead(page_head))
> > return 0;
> >  
> > -   return get_compound_page_dtor(page_head) == free_huge_page;
> > +   return get_compound_page_dtor(page_head) == huge_page_dtor;
> >  }
> >  
> >  pgoff_t __basepage_index(struct page *page)
> > @@ -1932,11 +1932,11 @@ static long vma_add_reservation(struct hstate *h,
> >   * specific error paths, a huge page was allocated (via alloc_huge_page)
> >   * and is about to be freed.  If a reservation for the page existed,
> >   * alloc_huge_page would have consumed the reservation and set PagePrivate
> > - * in the newly allocated page.  When the page is freed via free_huge_page,
> > + * in the newly allocated page.  When the page is freed via huge_page_dtor,
> >   * the global reservation count will be incremented if PagePrivate is set.
> > - * However, free_huge_page can not adjust the reserve map.  Adjust the
> > + * However, huge_page_dtor can not adjust the reserve map.  Adjust the
> >   * reserve map here to be consistent with global reserve count adjustments
> > - * to be made by free_huge_page.
> > + * to be made

Re: [PATCH 1/2] mm, thp: introduce dedicated transparent huge page allocation interfaces

2017-10-17 Thread Du, Changbin
Hi Khandual,
Thanks for your review.

On Tue, Oct 17, 2017 at 01:38:07PM +0530, Anshuman Khandual wrote:
> On 10/16/2017 02:49 PM, changbin...@intel.com wrote:
> > From: Changbin Du 
> > 
> > This patch introduced 4 new interfaces to allocate a prepared
> > transparent huge page.
> >   - alloc_transhuge_page_vma
> >   - alloc_transhuge_page_nodemask
> >   - alloc_transhuge_page_node
> >   - alloc_transhuge_page
> > 
> 
> If we are trying to match HugeTLB helpers, then it should have
> format something like alloc_transhugepage_xxx instead of
> alloc_transhuge_page_XXX. But I think its okay.
>
HugeTLB helpers are named like alloc_huge_page, so I think
alloc_transhuge_page matches them. And existing names already use the
*transhuge_page* style.

> > diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> > index 14bc21c..1dd2c33 100644
> > --- a/include/linux/huge_mm.h
> > +++ b/include/linux/huge_mm.h
> > @@ -130,9 +130,20 @@ extern unsigned long thp_get_unmapped_area(struct file 
> > *filp,
> > unsigned long addr, unsigned long len, unsigned long pgoff,
> > unsigned long flags);
> >  
> > -extern void prep_transhuge_page(struct page *page);
> >  extern void free_transhuge_page(struct page *page);
> >  
> > +struct page *alloc_transhuge_page_vma(gfp_t gfp_mask,
> > +   struct vm_area_struct *vma, unsigned long addr);
> > +struct page *alloc_transhuge_page_nodemask(gfp_t gfp_mask,
> > +   int preferred_nid, nodemask_t *nmask);
> 
> Would not they require 'extern' here ?
>
Needed or not, function declarations are implicitly 'extern'. I will add it to
align with the existing code.

> > +
> > +static inline struct page *alloc_transhuge_page_node(int nid, gfp_t 
> > gfp_mask)
> > +{
> > +   return alloc_transhuge_page_nodemask(gfp_mask, nid, NULL);
> > +}
> > +
> > +struct page *alloc_transhuge_page(gfp_t gfp_mask);
> > +
> >  bool can_split_huge_page(struct page *page, int *pextra_pins);
> >  int split_huge_page_to_list(struct page *page, struct list_head *list);
> >  static inline int split_huge_page(struct page *page)
> > diff --git a/include/linux/migrate.h b/include/linux/migrate.h
> > index 643c7ae..70a00f3 100644
> > --- a/include/linux/migrate.h
> > +++ b/include/linux/migrate.h
> > @@ -42,19 +42,15 @@ static inline struct page *new_page_nodemask(struct 
> > page *page,
> > return 
> > alloc_huge_page_nodemask(page_hstate(compound_head(page)),
> > preferred_nid, nodemask);
> >  
> > -   if (thp_migration_supported() && PageTransHuge(page)) {
> > -   order = HPAGE_PMD_ORDER;
> > -   gfp_mask |= GFP_TRANSHUGE;
> > -   }
> > -
> > if (PageHighMem(page) || (zone_idx(page_zone(page)) == ZONE_MOVABLE))
> > gfp_mask |= __GFP_HIGHMEM;
> >  
> > -   new_page = __alloc_pages_nodemask(gfp_mask, order,
> > +   if (thp_migration_supported() && PageTransHuge(page))
> > +   return alloc_transhuge_page_nodemask(gfp_mask | GFP_TRANSHUGE,
> > +   preferred_nid, nodemask);
> > +   else
> > +   return __alloc_pages_nodemask(gfp_mask, order,
> > preferred_nid, nodemask);
> > -
> > -   if (new_page && PageTransHuge(page))
> > -   prep_transhuge_page(new_page);
> 
> This makes sense, calling prep_transhuge_page() inside the
> function alloc_transhuge_page_nodemask() is better I guess.
> 
> >  
> > return new_page;
> >  }
> > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > index 269b5df..e267488 100644
> > --- a/mm/huge_memory.c
> > +++ b/mm/huge_memory.c
> > @@ -490,7 +490,7 @@ static inline struct list_head 
> > *page_deferred_list(struct page *page)
> > return (struct list_head *)&page[2].mapping;
> >  }
> >  
> > -void prep_transhuge_page(struct page *page)
> > +static void prep_transhuge_page(struct page *page)
> 
> Right. It wont be used outside huge page allocation context and
> you have already mentioned about it.
> 
> >  {
> > /*
> >  * we use page->mapping and page->indexlru in second tail page
> > @@ -501,6 +501,45 @@ void prep_transhuge_page(struct page *page)
> > set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
> >  }
> >  
> > +struct page *alloc_transhuge_page_vma(gfp_t gfp_mask,
> > +   struct vm_area_struct *vma, unsigned long addr)
> > +{
> > +   struct page *page;
> > +
> > +   page = alloc_pages_vma(gfp_mask | __GFP_COMP, HPAGE_PMD_ORDER,
> > +  vma, addr, numa_node_id(), true);
> > +   if (unlikely(!page))
> > +   return NULL;
> > +   prep_transhuge_page(page);
> > +   return page;
> > +}
> 
> __GFP_COMP and HPAGE_PMD_ORDER are the minimum flags which will be used
> for huge page allocation and preparation. Any thing else depending upon
> the context will be passed by the caller. Makes sense.
> 
yes, thanks.

> > +
> > +struct page *alloc_transhuge_page_nodemask(gfp_t gfp_mask,
> > +   int preferred_nid, nodemask_t *nmask)
> > +{
> > +   struct page *

  1   2   3   >