https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95752

            Bug ID: 95752
           Summary: Failure to optimize complicated usage of __builtin_ctz
                    with conditionals properly
           Product: gcc
           Version: 11.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: gabravier at gmail dot com
  Target Milestone: ---

/*
 * Test case: trailing-zero count of a 64-bit value built from two 32-bit
 * __builtin_ctz calls.
 *
 * - Low 32 bits all zero: result = ctz(high 32 bits) + 32.
 * - Otherwise:            result = ctz(low 32 bits).
 *
 * `result` has no initializer and is not assigned when the inner `if` is
 * false.  Assuming a 32-bit `unsigned int`, the inner condition repeats the
 * outer `else` condition, so that path cannot actually be taken.
 */
unsigned long f(uint64_t value)
{
    unsigned int result;

    if ((value & 0xFFFFFFFF) == 0)
    {
        /* Low half is zero: count in the high half and add the 32 low bits. */
        result = __builtin_ctz(value >> 32) + 32;
    }
    else
    {
        /* No `else` here: `result` stays unassigned if this test fails. */
        if ((unsigned int)value != 0)
            result = __builtin_ctz((unsigned int)value);
    }

    return result;
}

With -O3 -mbmi, LLVM outputs this :

# Branchless version: both 32-bit counts are computed, then one is selected.
# Reads the argument from rdi/edi and leaves the result in eax.
f(unsigned long):
  mov rax, rdi
  shr rax, 32        # eax = high 32 bits of the argument
  tzcnt ecx, eax     # ecx = trailing-zero count of the high half
  or ecx, 32         # the "+ 32"; same as an add while the count is below 32
  tzcnt eax, edi     # eax = trailing-zero count of the low half; CF = (edi == 0)
  cmovb eax, ecx     # CF set (low half is zero): take the high-half result
  ret

GCC outputs this :

# Branching version: tests the low 32 bits (edi) and handles each case on a
# separate path, each ending in its own ret.
f(unsigned long):
  test edi, edi
  jne .L2            # low 32 bits non-zero: go to the low-half path

  shr rdi, 32        # edi = high 32 bits
  xor eax, eax       # presumably breaks tzcnt's dependency on its destination - TODO confirm
  tzcnt eax, edi
  add eax, 32
  mov eax, eax       # redundant: the 32-bit add already zeroed the upper half of rax
  ret

.L2:
  xor edx, edx
  mov eax, 0         # value that would be returned if the cmovne below were not taken
  tzcnt edx, edi
  test edi, edi      # repeats the test at function entry; edi is unchanged and non-zero here
  cmovne eax, edx    # therefore always taken on this path
  mov eax, eax       # redundant zero-extension, as above
  ret

This may be related to how GCC handles undefined behaviour in relation to
`__builtin_ctz` and uninitialized variables, but the generated code still seems
like it could be optimized much further. At the very least, GCC could emit
something like this if a `cmovcc` is not the best choice here:

# Suggested output: one tzcnt per path, with no redundant re-test of edi and
# no extra zero-extending moves.
# Reads the argument from rdi/edi and leaves the result in eax.
f(unsigned long):
  test edi, edi
  jne .L2            # low 32 bits non-zero: count them directly

  shr rdi, 32        # edi = high 32 bits
  tzcnt eax, edi
  add eax, 32        # account for the 32 zero bits of the low half
  ret

.L2:
  tzcnt eax, edi     # the 32-bit write to eax also zeroes the upper half of rax
  ret

Using this code :

/*
 * Variant of the test case in which the path that would leave `result`
 * unassigned is marked with __builtin_unreachable().
 *
 * - Low 32 bits all zero: result = ctz(high 32 bits) + 32.
 * - Otherwise:            result = ctz(low 32 bits).
 */
unsigned long f(uint64_t value)
{
    unsigned int result;

    if ((value & 0xFFFFFFFF) == 0)
    {
        /* Low half is zero: count in the high half and add the 32 low bits. */
        result = __builtin_ctz(value >> 32) + 32;
    }
    else
    {
        if ((unsigned int)value != 0)
            result = __builtin_ctz((unsigned int)value);
        else
            /* Tells the compiler this path, which assigns nothing to
               `result`, is never executed. */
            __builtin_unreachable();
    }

    return result;
}

(i.e. adding `__builtin_unreachable()` on the path where `result` would
otherwise be left undefined) generates better code:

# Low-half count is computed up front; the branch only skips the high-half
# fix-up when the low 32 bits (edi) are non-zero.
f(unsigned long):
  xor eax, eax       # presumably breaks tzcnt's dependency on its destination - TODO confirm
  tzcnt eax, edi     # eax = trailing-zero count of the low half
  test edi, edi
  jne .L3            # low half non-zero: eax already holds the result
  shr rdi, 32        # edi = high 32 bits
  tzcnt edi, edi
  lea eax, [rdi+32]  # eax = trailing-zero count of the high half + 32
.L3:
  mov eax, eax       # redundant: every write to eax above already zeroed the upper half of rax
  ret

This looks like something the tree-ssa optimizers could do (inserting
__builtin_unreachable on paths that invoke UB by using an undefined value).
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94861 indicates that GCC currently
doesn't do this even for the simplest cases (and, looking at the tree dumps,
tree-ssa doesn't appear to make any assumptions about the initial value of
variables).

Reply via email to