https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103462

            Bug ID: 103462
           Summary: vectorizer failed to reduce bit_clear in loop.
           Product: gcc
           Version: 12.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: crazylht at gmail dot com
  Target Milestone: ---
              Host: x86_64-pc-linux-gnu

The testcase is taken from PR47769:

/* Clears every third bit of tmp, starting at bit 0 (bits 0, 3, 6, ..., 63),
   and returns the result.  The loop bounds are compile-time constants
   (22 iterations), so the whole loop is equivalent to one AND with a
   constant mask. */
unsigned long cfunc_one(unsigned long tmp) {
    for (unsigned long bit = 0; bit < 64; bit += 3) {
        tmp &= ~(1UL << bit);   /* bit_clear: drop bit number `bit` */
    }
    return tmp;
}

With -O3 -march=skylake -funroll-loops,
GCC generates:
# GCC output for cfunc_one (Intel syntax, x86-64 SysV: tmp arrives in rdi).
# rax = running value of tmp, rdx = bit index at the start of each pass.
# The source loop is unrolled 11x: each pass clears bits rdx, rdx+3, ..., rdx+30,
# then advances rdx by 33.  Two passes (rdx = 0 and rdx = 33) cover all 22 bits.
cfunc_one:
        mov     rax, rdi                # rax = tmp
        xor     edx, edx                # bit = 0
.L2:
        lea     rcx, [rdx+3]
        btr     rax, rdx                # clear bit
        lea     rsi, [rdx+6]
        btr     rax, rcx                # clear bit+3
        lea     rdi, [rdx+9]            # rdi is free to reuse: tmp was copied to rax
        btr     rax, rsi                # clear bit+6
        btr     rax, rdi                # clear bit+9
        lea     r8, [rdx+12]
        lea     r9, [rdx+15]
        btr     rax, r8                 # clear bit+12
        lea     r10, [rdx+18]
        btr     rax, r9                 # clear bit+15
        lea     r11, [rdx+21]
        btr     rax, r10                # clear bit+18
        lea     rcx, [rdx+24]
        btr     rax, r11                # clear bit+21
        lea     rsi, [rdx+27]
        btr     rax, rcx                # clear bit+24
        lea     rdi, [rdx+30]
        btr     rax, rsi                # clear bit+27
        add     rdx, 33                 # bit += 11 * 3
        btr     rax, rdi                # clear bit+30 (rdi was computed before the add)
        cmp     rdx, 66                 # 66 = 2 * 33: stop after the second pass
        jne     .L2
        ret

while clang generates:

cfunc_one(unsigned long):                          # @cfunc_one(unsigned long)
        movabs  rax, 7905747460161236406           # rax = 0x6DB6DB6DB6DB6DB6: all ones except bits 0, 3, ..., 63
        and     rax, rdi                           # result = tmp & mask; the loop is folded away
        ret

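7905747460161236406 (0x6DB6DB6DB6DB6DB6) is the 64-bit all-ones value with bits {0, 3, 6, 9, ..., 63} cleared, so clang folds the entire loop into a single AND with that constant.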

Reply via email to