Patch attempts to improve the performance of the __arch_hweight functions by
making them inline instead of the current out-of-line implementation.
Testcase is to disable/enable SMT on a large (192 thread) POWER7 lpar.
Program used for SMT disable/enable is ppc64_cpu with --smt=[off/on]
option. Here are the perf output. In this case, __arch_hweight64 is
called by __bitmap_weight.
Without patch (ppc64_cpu --smt=off):
17.60% ppc64_cpu [kernel.kallsyms] [k] .deactivate_slab
4.85% ppc64_cpu [kernel.kallsyms] [k] .__bitmap_weight
1.36% ppc64_cpu [kernel.kallsyms] [k] .__disable_runtime
1.29% ppc64_cpu [kernel.kallsyms] [k] .__arch_hweight64
With patch (ppc64_cpu --smt=off):
17.29% ppc64_cpu [kernel.kallsyms] [k] .deactivate_slab
3.71% ppc64_cpu [kernel.kallsyms] [k] .__bitmap_weight
3.26% ppc64_cpu [kernel.kallsyms] [k]
.build_overlap_sched_groups
Without patch (ppc64_cpu --smt=on):
8.35% ppc64_cpu [kernel.kallsyms] [k] .strlen
7.00% ppc64_cpu [kernel.kallsyms] [k] .memset
6.78% ppc64_cpu [kernel.kallsyms] [k] .__bitmap_weight
4.23% ppc64_cpu [kernel.kallsyms] [k] .deactivate_slab
1.58% ppc64_cpu [kernel.kallsyms] [k]
.refresh_zone_stat_thresholds
1.57% ppc64_cpu [kernel.kallsyms] [k] .__arch_hweight64
1.54% ppc64_cpu [kernel.kallsyms] [k] .__enable_runtime
With patch (ppc64_cpu --smt=on):
9.44% ppc64_cpu [kernel.kallsyms] [k] .strlen
6.43% ppc64_cpu [kernel.kallsyms] [k] .memset
5.48% ppc64_cpu [kernel.kallsyms] [k] .__bitmap_weight
4.59% ppc64_cpu [kernel.kallsyms] [k] .insert_entry
4.29% ppc64_cpu [kernel.kallsyms] [k] .deactivate_slab
Signed-off-by: Madhavan Srinivasan ma...@linux.vnet.ibm.com
---
arch/powerpc/include/asm/bitops.h | 130 -
arch/powerpc/include/asm/ppc-opcode.h |6 ++
arch/powerpc/lib/Makefile |2 +-
3 files changed, 133 insertions(+), 5 deletions(-)
diff --git a/arch/powerpc/include/asm/bitops.h
b/arch/powerpc/include/asm/bitops.h
index 910194e..136fe6a 100644
--- a/arch/powerpc/include/asm/bitops.h
+++ b/arch/powerpc/include/asm/bitops.h
@@ -43,8 +43,10 @@
#endif
#include <linux/compiler.h>
+#include <linux/types.h>
#include <asm/asm-compat.h>
#include <asm/synch.h>
+#include <asm/cputable.h>
/*
* clear_bit doesn't imply a memory barrier
@@ -263,10 +265,130 @@ static __inline__ int fls64(__u64 x)
#endif /* __powerpc64__ */
#ifdef CONFIG_PPC64
-unsigned int __arch_hweight8(unsigned int w);
-unsigned int __arch_hweight16(unsigned int w);
-unsigned int __arch_hweight32(unsigned int w);
-unsigned long __arch_hweight64(__u64 w);
+
+static inline unsigned int __arch_hweight8(unsigned int w)
+{
+	unsigned int register iop asm("r3") = w;
+	unsigned int register tmp asm("r4");
+	__asm__ __volatile__ (
+	stringify_in_c(BEGIN_FTR_SECTION)
+	"bl .__sw_hweight8;"
+	"nop;"
+	stringify_in_c(FTR_SECTION_ELSE)
+	PPC_POPCNTB_M(%1,%2) ";"
+	"clrldi %0,%1,64-8;"
+	stringify_in_c(ALT_FTR_SECTION_END_IFCLR((%3)))
+	: "=r" (iop), "=r" (tmp)
+	: "r" (iop), "i" (CPU_FTR_POPCNTB)
+	: "r0", "r1", "r5", "r6", "r7", "r8", "r9",
+	  "r10", "r11", "r12", "r13", "r31", "lr", "cr0", "xer");
+
+	return iop;
+}
+
+static inline unsigned int __arch_hweight16(unsigned int w)
+{
+	unsigned int register iop asm("r3") = w;
+	unsigned int register tmp asm("r4");
+	__asm__ __volatile__ (
+	stringify_in_c(BEGIN_FTR_SECTION)
+	"bl .__sw_hweight16;"
+	"nop;"
+	"nop;"
+	"nop;"
+	"nop;"
+	stringify_in_c(FTR_SECTION_ELSE)
+	stringify_in_c(BEGIN_FTR_SECTION_NESTED(50))
+	PPC_POPCNTB_M(%0,%2) ";"
+	"srdi %1,%0,8;"
+	"add %0,%1,%0;"
+	"clrldi %0,%0,64-8;"
+	stringify_in_c(FTR_SECTION_ELSE_NESTED(50))
+	"clrlwi %0,%2,16;"
+	PPC_POPCNTW_M(%1,%0) ";"
+	"clrldi %0,%1,64-8;"
+	stringify_in_c(ALT_FTR_SECTION_END_NESTED_IFCLR(%4,50))
+	stringify_in_c(ALT_FTR_SECTION_END_IFCLR((%3)))
+	: "=r" (iop), "=r" (tmp)
+	: "r" (iop), "i" (CPU_FTR_POPCNTB), "i" (CPU_FTR_POPCNTD)
+	: "r0", "r1", "r5", "r6", "r7", "r8", "r9",
+	  "r10", "r11", "r12", "r13", "r31", "lr", "cr0", "xer");
+
+	return iop;
+}
+
+static inline unsigned int __arch_hweight32(unsigned int w)
+{
+	unsigned int register iop asm("r3") = w;
+	unsigned int register tmp asm("r4");
+	__asm__ __volatile__ (
+	stringify_in_c(BEGIN_FTR_SECTION)
+	"bl .__sw_hweight32;"
+	"nop;"
+	"nop;"
+	"nop;"
+	"nop;"
+	"nop;"
+	"nop;"
+	stringify_in_c(FTR_SECTION_ELSE)
+