From: Jiri Kosina <[email protected]>

The open-coded versions of __sw_hweight32() and __sw_hweight64() are 
broken; the variable <-> register mapping in the comments doesn't match 
the registers actually being used, resulting in wrong calculations.
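
For reference, the population-count math these routines are supposed to 
implement is below; this is a from-memory sketch of the 
CONFIG_ARCH_HAS_FAST_MULTIPLIER variant in lib/hweight.c, so treat the 
exact shape there as an approximation:

        unsigned int __sw_hweight32(unsigned int w)
        {
                w -= (w >> 1) & 0x55555555;        /* per-2-bit counts */
                w  = (w & 0x33333333) +
                     ((w >> 2) & 0x33333333);      /* per-nibble counts */
                w  = (w + (w >> 4)) & 0x0f0f0f0f;  /* per-byte counts */
                return (w * 0x01010101) >> 24;     /* sum bytes into top byte */
        }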

This has the potential to manifest itself through various symptoms on 
machines where this code is actually used to compute the hweight (due 
to the lack of the popcnt insn). On my core2 system, it shows up as

 NMI watchdog: enabled on all CPUs, permanently consumes one hw-PMU counter.
 unchecked MSR access error: WRMSR to 0xdf (tried to write 0x000000ff80000001) 
 at rIP: 0xffffffff90004acc (x86_perf_event_set_period+0xdc/0x190)
  ffff8e39f941b000 ffff8e39fbc0a440 000000000000001e ffff8e39fbc0a660
  ffff8e39f9523b78 ffffffff90004bd0 ffff8e39f941b000 ffff8e39fbc0a760
  ffff8e39fbc0a440 ffff8e39f9523bc0 ffffffff900053ef 00000000f951c2c0
 Call Trace:
  [<ffffffff90004bd0>] x86_pmu_start+0x50/0x110
  [<ffffffff900053ef>] x86_pmu_enable+0x27f/0x2f0
  [<ffffffff90175642>] perf_pmu_enable+0x22/0x30
  [<ffffffff901766b1>] ctx_resched+0x51/0x60
  [<ffffffff901768a0>] __perf_event_enable+0x1e0/0x240
  [<ffffffff9016e5f9>] event_function+0xa9/0x180
  [<ffffffff901766c0>] ? ctx_resched+0x60/0x60
  [<ffffffff9016fcef>] remote_function+0x3f/0x50
  [<ffffffff901012b6>] generic_exec_single+0xb6/0x140
  [<ffffffff9016fcb0>] ? perf_cgroup_attach+0x50/0x50
  [<ffffffff9016fcb0>] ? perf_cgroup_attach+0x50/0x50
  [<ffffffff901013f7>] smp_call_function_single+0xb7/0x110
  [<ffffffff9016e984>] cpu_function_call+0x34/0x40
  [<ffffffff9016e550>] ? list_del_event+0x150/0x150
  [<ffffffff9016ecda>] event_function_call+0x11a/0x120
  [<ffffffff901766c0>] ? ctx_resched+0x60/0x60
  [<ffffffff9016ed79>] _perf_event_enable+0x49/0x70
  [<ffffffff901736ac>] perf_event_enable+0x1c/0x40
  [<ffffffff9013cad2>] watchdog_enable+0x132/0x1d0
  [<ffffffff90092440>] smpboot_thread_fn+0xe0/0x1d0
  [<ffffffff90092360>] ? sort_range+0x30/0x30
  [<ffffffff9008e7e2>] kthread+0xf2/0x110
  [<ffffffff9069e611>] ? wait_for_completion+0xe1/0x110
  [<ffffffff906a3b2f>] ret_from_fork+0x1f/0x40
  [<ffffffff9008e6f0>] ? kthread_create_on_node+0x220/0x220

because FIXED_EVENT_CONSTRAINT() doesn't produce correct results, 
followed by a panic in x86_pmu_stop(). YMMV.

Fix this by rewriting the code so that it actually matches the math in 
lib/hweight.c. While at it, nuke the original comments altogether; the 
"variable" names don't really match what's being used in the C version 
of the function, and I find them more misleading than helpful.
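
The 64-bit variant is the same math with widened masks; again a 
from-memory sketch of what lib/hweight.c does (plain C types stand in 
for the kernel's __u64):

        unsigned long long __sw_hweight64(unsigned long long w)
        {
                w -= (w >> 1) & 0x5555555555555555ULL;
                w  = (w & 0x3333333333333333ULL) +
                     ((w >> 2) & 0x3333333333333333ULL);
                w  = (w + (w >> 4)) & 0x0f0f0f0f0f0f0f0fULL;
                return (w * 0x0101010101010101ULL) >> 56;  /* byte sum */
        }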

This version of gcc:

        gcc (SUSE Linux) 4.8.5

produces identical assembly code when compiling lib/hweight.c (modulo 
frame pointer maths).

Fixes: f5967101e9d ("x86/hweight: Get rid of the special calling convention")
Signed-off-by: Jiri Kosina <[email protected]>
---

v1 -> v2: remove the comments altogether; put "approved by gcc" note into 
          changelog

 arch/x86/lib/hweight.S | 69 +++++++++++++++++++++++++-------------------------
 1 file changed, 34 insertions(+), 35 deletions(-)

diff --git a/arch/x86/lib/hweight.S b/arch/x86/lib/hweight.S
index 02de3d7..e1cea44f 100644
--- a/arch/x86/lib/hweight.S
+++ b/arch/x86/lib/hweight.S
@@ -8,56 +8,55 @@
  */
 ENTRY(__sw_hweight32)
 
-#ifdef CONFIG_X86_64
-       movl %edi, %eax                         # w
-#endif
        __ASM_SIZE(push,) %__ASM_REG(dx)
-       movl %eax, %edx                         # w -> t
-       shrl %edx                               # t >>= 1
-       andl $0x55555555, %edx                  # t &= 0x55555555
-       subl %edx, %eax                         # w -= t
 
-       movl %eax, %edx                         # w -> t
-       shrl $2, %eax                           # w_tmp >>= 2
-       andl $0x33333333, %edx                  # t     &= 0x33333333
-       andl $0x33333333, %eax                  # w_tmp &= 0x33333333
-       addl %edx, %eax                         # w = w_tmp + t
+       movl %edi, %edx
+
+       shrl %edx
+       andl $0x55555555, %edx
+       subl %edx, %edi
+       movl %edi, %eax
+
+       shrl $2, %edi
+       andl $0x33333333, %eax
+       andl $0x33333333, %edi
+       addl %eax, %edi
 
-       movl %eax, %edx                         # w -> t
-       shrl $4, %edx                           # t >>= 4
-       addl %edx, %eax                         # w_tmp += t
-       andl  $0x0f0f0f0f, %eax                 # w_tmp &= 0x0f0f0f0f
-       imull $0x01010101, %eax, %eax           # w_tmp *= 0x01010101
-       shrl $24, %eax                          # w = w_tmp >> 24
+       movl %edi, %eax
+       shrl $4, %eax
+       addl %edi, %eax
+       andl  $0x0f0f0f0f, %eax
+       imull $0x01010101, %eax, %eax
+       shrl $24, %eax
        __ASM_SIZE(pop,) %__ASM_REG(dx)
        ret
 ENDPROC(__sw_hweight32)
 
 ENTRY(__sw_hweight64)
-#ifdef CONFIG_X86_64
+
        pushq   %rdx
 
-       movq    %rdi, %rdx                      # w -> t
+       movq    %rdi, %rdx
        movabsq $0x5555555555555555, %rax
-       shrq    %rdx                            # t >>= 1
-       andq    %rdx, %rax                      # t &= 0x5555555555555555
+       shrq    %rdx
+       andq    %rax, %rdx
+       subq    %rdx, %rdi
        movabsq $0x3333333333333333, %rdx
-       subq    %rax, %rdi                      # w -= t
-
-       movq    %rdi, %rax                      # w -> t
-       shrq    $2, %rdi                        # w_tmp >>= 2
-       andq    %rdx, %rax                      # t     &= 0x3333333333333333
-       andq    %rdi, %rdx                      # w_tmp &= 0x3333333333333333
-       addq    %rdx, %rax                      # w = w_tmp + t
 
-       movq    %rax, %rdx                      # w -> t
-       shrq    $4, %rdx                        # t >>= 4
-       addq    %rdx, %rax                      # w_tmp += t
+       movq    %rdi, %rax
+       shrq    $2, %rdi
+       andq    %rdx, %rax
+       andq    %rdx, %rdi
        movabsq $0x0f0f0f0f0f0f0f0f, %rdx
-       andq    %rdx, %rax                      # w_tmp &= 0x0f0f0f0f0f0f0f0f
+       addq    %rax, %rdi
+
+       movq    %rdi, %rax
+       shrq    $4, %rax
+       addq    %rdi, %rax
+       andq    %rdx, %rax
        movabsq $0x0101010101010101, %rdx
-       imulq   %rdx, %rax                      # w_tmp *= 0x0101010101010101
-       shrq    $56, %rax                       # w = w_tmp >> 56
+       imulq   %rdx, %rax
+       shrq    $56, %rax
 
        popq    %rdx
        ret
-- 
Jiri Kosina
SUSE Labs
