[PATCH] powerpc: Convert out of line __arch_hweight to inline

2013-08-06 Thread Madhavan Srinivasan
This patch attempts to improve the performance of the __arch_hweight
functions by making them inline instead of the current out-of-line
implementation.
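
For reference, when the POPCNT instructions are unavailable these helpers
fall back to the kernel's generic software population count. A minimal C
sketch of that classic bit-trick algorithm (modelled on the generic
lib/hweight.c approach, not the exact kernel source):

	/* Sketch of a software popcount, in the style of __sw_hweight32(). */
	static unsigned int sw_hweight32_sketch(unsigned int w)
	{
		w -= (w >> 1) & 0x55555555;                      /* per 2-bit pair */
		w  = (w & 0x33333333) + ((w >> 2) & 0x33333333); /* per nibble */
		w  = (w + (w >> 4)) & 0x0f0f0f0f;                /* per byte */
		return (w * 0x01010101) >> 24;                   /* sum of bytes */
	}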

The test case disables/enables SMT on a large (192-thread) POWER7 LPAR,
using ppc64_cpu with the --smt=[off/on] option. Here is the perf output;
in this case, __arch_hweight64 is called by __bitmap_weight.

Without patch (ppc64_cpu --smt=off):

 17.60%  ppc64_cpu  [kernel.kallsyms]   [k] .deactivate_slab
  4.85%  ppc64_cpu  [kernel.kallsyms]   [k] .__bitmap_weight
  1.36%  ppc64_cpu  [kernel.kallsyms]   [k] .__disable_runtime
  1.29%  ppc64_cpu  [kernel.kallsyms]   [k] .__arch_hweight64


With patch (ppc64_cpu --smt=off):

 17.29%  ppc64_cpu  [kernel.kallsyms]   [k] .deactivate_slab
  3.71%  ppc64_cpu  [kernel.kallsyms]   [k] .__bitmap_weight
  3.26%  ppc64_cpu  [kernel.kallsyms]   [k] .build_overlap_sched_groups


Without patch (ppc64_cpu --smt=on):

  8.35%  ppc64_cpu  [kernel.kallsyms]   [k] .strlen
  7.00%  ppc64_cpu  [kernel.kallsyms]   [k] .memset
  6.78%  ppc64_cpu  [kernel.kallsyms]   [k] .__bitmap_weight
  4.23%  ppc64_cpu  [kernel.kallsyms]   [k] .deactivate_slab
  1.58%  ppc64_cpu  [kernel.kallsyms]   [k] .refresh_zone_stat_thresholds
  1.57%  ppc64_cpu  [kernel.kallsyms]   [k] .__arch_hweight64
  1.54%  ppc64_cpu  [kernel.kallsyms]   [k] .__enable_runtime


With patch (ppc64_cpu --smt=on):

  9.44%  ppc64_cpu  [kernel.kallsyms]   [k] .strlen
  6.43%  ppc64_cpu  [kernel.kallsyms]   [k] .memset
  5.48%  ppc64_cpu  [kernel.kallsyms]   [k] .__bitmap_weight
  4.59%  ppc64_cpu  [kernel.kallsyms]   [k] .insert_entry
  4.29%  ppc64_cpu  [kernel.kallsyms]   [k] .deactivate_slab


Signed-off-by: Madhavan Srinivasan <ma...@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/bitops.h     |  130 +++++++++++++++++++++++++++++-
 arch/powerpc/include/asm/ppc-opcode.h |    6 ++
 arch/powerpc/lib/Makefile             |    2 +-
 3 files changed, 133 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/bitops.h b/arch/powerpc/include/asm/bitops.h
index 910194e..136fe6a 100644
--- a/arch/powerpc/include/asm/bitops.h
+++ b/arch/powerpc/include/asm/bitops.h
@@ -43,8 +43,10 @@
 #endif
 
 #include <linux/compiler.h>
+#include <linux/types.h>
 #include <asm/asm-compat.h>
 #include <asm/synch.h>
+#include <asm/cputable.h>
 
 /*
  * clear_bit doesn't imply a memory barrier
@@ -263,10 +265,130 @@ static __inline__ int fls64(__u64 x)
 #endif /* __powerpc64__ */
 
 #ifdef CONFIG_PPC64
-unsigned int __arch_hweight8(unsigned int w);
-unsigned int __arch_hweight16(unsigned int w);
-unsigned int __arch_hweight32(unsigned int w);
-unsigned long __arch_hweight64(__u64 w);
+
+static inline unsigned int __arch_hweight8(unsigned int w)
+{
+   unsigned int register iop asm("r3") = w;
+   unsigned int register tmp asm("r4");
+   __asm__ __volatile__ (
+   stringify_in_c(BEGIN_FTR_SECTION)
+   "bl .__sw_hweight8;"
+   "nop;"
+   stringify_in_c(FTR_SECTION_ELSE)
+   PPC_POPCNTB_M(%1,%2) ";"
+   "clrldi %0,%1,64-8;"
+   stringify_in_c(ALT_FTR_SECTION_END_IFCLR((%3)))
+   : "=r" (iop), "=r" (tmp)
+   : "r" (iop), "i" (CPU_FTR_POPCNTB)
+   : "r0", "r1", "r5", "r6", "r7", "r8", "r9",
+   "r10", "r11", "r12", "r13", "r31", "lr", "cr0", "xer");
+
+   return iop;
+}
+
+static inline unsigned int __arch_hweight16(unsigned int w)
+{
+   unsigned int register iop asm("r3") = w;
+   unsigned int register tmp asm("r4");
+   __asm__ __volatile__ (
+   stringify_in_c(BEGIN_FTR_SECTION)
+   "bl .__sw_hweight16;"
+   "nop;"
+   "nop;"
+   "nop;"
+   "nop;"
+   stringify_in_c(FTR_SECTION_ELSE)
+   stringify_in_c(BEGIN_FTR_SECTION_NESTED(50))
+   PPC_POPCNTB_M(%0,%2) ";"
+   "srdi %1,%0,8;"
+   "add %0,%1,%0;"
+   "clrldi %0,%0,64-8;"
+   stringify_in_c(FTR_SECTION_ELSE_NESTED(50))
+   "clrlwi %0,%2,16;"
+   PPC_POPCNTW_M(%1,%0) ";"
+   "clrldi %0,%1,64-8;"
+   stringify_in_c(ALT_FTR_SECTION_END_NESTED_IFCLR(%4,50))
+   stringify_in_c(ALT_FTR_SECTION_END_IFCLR((%3)))
+   : "=r" (iop), "=r" (tmp)
+   : "r" (iop), "i" (CPU_FTR_POPCNTB), "i" (CPU_FTR_POPCNTD)
+   : "r0", "r1", "r5", "r6", "r7", "r8", "r9",
+   "r10", "r11", "r12", "r13", "r31", "lr", "cr0", "xer");
+
+   return iop;
+}
+
+static inline unsigned int __arch_hweight32(unsigned int w)
+{
+   unsigned int register iop asm("r3") = w;
+   unsigned int register tmp asm("r4");
+   __asm__ __volatile__ (
+   stringify_in_c(BEGIN_FTR_SECTION)
+   "bl .__sw_hweight32;"
+   "nop;"
+   "nop;"
+   "nop;"
+   "nop;"
+   "nop;"
+   "nop;"
+   stringify_in_c(FTR_SECTION_ELSE)
+   

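For context, the BEGIN_FTR_SECTION/FTR_SECTION_ELSE/ALT_FTR_SECTION_END_IFCLR
macros used above emit both code sequences into the kernel image, and the
boot-time feature-fixup code patches in the one appropriate for the running
CPU, so there is no runtime branch. A rough C-level sketch of the selection
__arch_hweight8() resolves to (illustrative only; popcntb_path() is a
hypothetical stand-in for the inline popcntb sequence):

	/* Sketch: what the boot-time patching effectively selects between. */
	static inline unsigned int arch_hweight8_sketch(unsigned int w)
	{
		if (cpu_has_feature(CPU_FTR_POPCNTB))
			return popcntb_path(w);  /* hypothetical: hw popcntb path */
		return __sw_hweight8(w);         /* generic software fallback */
	}
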
Re: [PATCH] powerpc: Convert out of line __arch_hweight to inline

2013-08-06 Thread Anshuman Khandual
 
  obj-$(CONFIG_PPC64)  += copypage_64.o copyuser_64.o \
  memcpy_64.o usercopy_64.o mem_64.o string.o \
 -checksum_wrappers_64.o hweight_64.o \
 +checksum_wrappers_64.o \
  copyuser_power7.o string_64.o copypage_power7.o \
  memcpy_power7.o
  obj-$(CONFIG_PPC_EMULATE_SSTEP)  += sstep.o ldstfp.o
 

As you have moved all the code out of the hweight_64.S file and removed it
from the compilation list in the Makefile, you need to delete the file from
the tree as well.
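
For example, regenerating the patch after removing the file would record the
deletion in the diff itself (a sketch; the exact output depends on how the
patch is regenerated):

	$ git rm arch/powerpc/lib/hweight_64.S
	$ git commit --amend
	# the resulting patch then carries a line like:
	#   delete mode 100644 arch/powerpc/lib/hweight_64.S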
