regression from 6.3 with while (__builtin_ffs(x) && x)

vegard.nossum at oracle dot com Thu, 12 Apr 2018 04:16:50 -0700

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85375


            Bug ID: 85375
           Summary: possible missed optimisation / regression from 6.3
                    with while (__builtin_ffs(x) && x)
           Product: gcc
           Version: 8.0.1
            Status: UNCONFIRMED
          Keywords: missed-optimization
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: vegard.nossum at oracle dot com
  Target Milestone: ---

Input:

extern int a;

int f(int x)
{
    while (__builtin_ffs(x) && x)
        x -= a;

    return x;
}

gcc 6.3.0 with -O3 compiled this as:

f(int):
  movl %edi, %eax
  movl a(%rip), %esi
  movl $-1, %ecx
  jmp .L3
.L11:
  testl %eax, %eax
  je .L2
  subl %esi, %eax
.L3:
  bsfl %eax, %edx
  cmove %ecx, %edx
  cmpl $-1, %edx
  jne .L11
.L2:
  rep ret

whereas current trunk (also with -O3) compiles it as:

f(int):
  movl $-1, %ecx
  bsfl %edi, %eax
  cmove %ecx, %eax
  cmpl %ecx, %eax
  je .L5
  testl %edi, %edi
  je .L6
  movl a(%rip), %esi
  movl %edi, %eax
  jmp .L3
.L4:
  testl %eax, %eax
  je .L1
.L3:
  subl %esi, %eax
  bsfl %eax, %edx
  cmove %ecx, %edx
  cmpl $-1, %edx
  jne .L4
  ret
.L6:
  xorl %eax, %eax
.L1:
  ret
.L5:
  movl %edi, %eax
  ret

There are fewer instructions overall for the case where x is 0 on entry, but
trunk still has longer code overall even if we change the while-condition to
__builtin_expect(!!__builtin_ffs(x) && x, 1) which ideally should pessimise
this case.

For the same input, clang trunk with -O3 gives:

f(int): # @f(int)
  test edi, edi
  je .LBB0_3
  mov eax, dword ptr [rip + a]
  neg edi
.LBB0_2: # =>This Inner Loop Header: Depth=1
  add edi, eax
  jne .LBB0_2
.LBB0_3:
  xor eax, eax
  ret

This seems to rely simply on the fact that (__builtin_ffs(x) == 0) and (x == 0)
are equivalent.

If you simplify the while-condition to simply __builtin_ffs(x), then the
difference is smaller but still there:

6.3.0:

f(int):
  movl %edi, %eax
  movl a(%rip), %esi
  movl $-1, %ecx
  jmp .L3
.L5:
  subl %esi, %eax
.L3:
  bsfl %eax, %edx
  cmove %ecx, %edx
  cmpl $-1, %edx
  jne .L5
  rep ret

trunk:

f(int):
  movl $-1, %ecx
  bsfl %edi, %eax
  cmove %ecx, %eax
  cmpl %ecx, %eax
  je .L4
  movl a(%rip), %esi
  movl %edi, %eax
.L3:
  subl %esi, %eax
  bsfl %eax, %edx
  cmove %ecx, %edx
  cmpl $-1, %edx
  jne .L3
  ret
.L4:
  movl %edi, %eax
  ret

[Bug tree-optimization/85375] New: possible missed optimisation / regression from 6.3 with while (__builtin_ffs(x) && x)

Reply via email to