https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114162

            Bug ID: 114162
           Summary: Missing Optimization: Loop is vectorized instead of
                    removed
           Product: gcc
           Version: 14.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: carnet at student dot ethz.ch
  Target Milestone: ---

This simple for-loop is compiled to complex vectorized code instead of
being optimized away, as clang is able to do.

https://godbolt.org/z/c3fjWbWEz

Source:
// Global loop variable; file-scope object, so it starts out as 0.
unsigned short a;
// Reproducer: returns how many times the loop body runs before a reaches 0.
int main() {
    int counter = 0;
    // n and a are multiples of 4
    int n = 4;
    a *= n;
    // a is an unsigned short, so a += n wraps around (modulo 65536 with a
    // 16-bit unsigned short, as on x86). Because a is a multiple of 4 it
    // eventually wraps to exactly 0, which ends the loop.
    for (; a > 0; a += n)
        counter++;
    return counter;
}
x86 -O3 Assembly:
# GCC -O3 output for main() (x86-64, AT&T syntax).
# The return value is produced in %eax; every exit path stores the final
# 16-bit value back to a before returning.
main:
        # %dx = a * 4 in 16-bit arithmetic (the "a *= n" statement).
        movzwl  a(%rip), %edx
        salw    $2, %dx
        # Product is 0: the loop body never runs.
        je      .L24
        # %cx = -4 - a (mod 2^16) = 4 * (iteration count - 1).
        movl    $-4, %ecx
        subl    %edx, %ecx
        # At most 14 iterations: use the scalar path at .L15.
        cmpw    $52, %cx
        jbe     .L15
        # Low 16 bits of %ecx become the iteration count (after the addl below).
        shrw    $2, %cx
        # Broadcast a into all eight word lanes of %xmm0 (movd/punpcklwd/pshufd).
        movd    %edx, %xmm0
        movl    $8, %edi
        xorl    %eax, %eax
        addl    $1, %ecx
        punpcklwd       %xmm0, %xmm0
        # %xmm3 = 8 in every dword lane (per-pass step for %xmm1).
        movd    %edi, %xmm3
        movdqa  .LC0(%rip), %xmm1
        movl    %ecx, %esi
        pshufd  $0, %xmm0, %xmm0
        # 2097184 = 0x00200020: 32 in every word lane (per-pass step for %xmm0).
        movl    $2097184, %edi
        pshufd  $0, %xmm3, %xmm3
        # %esi = iteration count / 8 = number of passes through .L5.
        shrw    $3, %si
        paddw   .LC1(%rip), %xmm0
        movd    %edi, %xmm2
        movzwl  %si, %esi
        pshufd  $0, %xmm2, %xmm2
# Vector loop: %xmm1 (dword lanes) and %xmm0 (word lanes) are stepped once per
# pass; %xmm4/%xmm5 keep the values from before the step. %eax counts passes.
.L5:
        addl    $1, %eax
        movdqa  %xmm1, %xmm4
        movdqa  %xmm0, %xmm5
        paddd   %xmm3, %xmm1
        paddw   %xmm2, %xmm0
        cmpl    %eax, %esi
        jne     .L5
        # %eax = iterations covered by the vector loop (count rounded down to a
        # multiple of 8); %ecx = leftover iterations (count & 7);
        # %edx = a advanced by 4 * the covered iterations.
        movl    %ecx, %esi
        movl    %ecx, %eax
        andl    $-8, %esi
        andl    $32760, %eax
        andl    $7, %ecx
        leal    (%rdx,%rsi,4), %edx
        # No leftover iterations (flags still come from the andl $7 above).
        je      .L6
        # 1..7 leftover iterations: dispatch on how far %dx is from wrapping to 0.
        cmpw    $-4, %dx
        je      .L25
        cmpw    $-8, %dx
        je      .L26
        cmpw    $-12, %dx
        je      .L27
        cmpw    $-16, %dx
        je      .L28
        cmpw    $-20, %dx
        je      .L29
        cmpw    $-24, %dx
        je      .L30
        # Fall-through case: 7 leftover iterations.
        addl    $7, %eax
        addl    $28, %edx
# Common exit: store the final value of a and return %eax.
.L14:
        movw    %dx, a(%rip)
        ret
.L26:
        addl    $2, %eax
        xorl    %edx, %edx
        jmp     .L14
# a * 4 was 0 on entry: store 0 to a and return 0.
.L24:
        movw    $0, a(%rip)
        xorl    %eax, %eax
        ret
# Scalar path for short trip counts. Bit 2 of %cl is set when the iteration
# count is even; otherwise one iteration is done here first so that the
# two-at-a-time loop at .L4 lands exactly on 0.
.L15:
        xorl    %eax, %eax
        andb    $4, %cl
        jne     .L4
        movl    $1, %eax
        addw    $4, %dx
        je      .L14
# Scalar loop, two source iterations per pass, until %dx wraps to 0.
.L4:
        addl    $2, %eax
        addw    $8, %dx
        jne     .L4
        jmp     .L14
# No scalar remainder: take the results from the last lanes of the saved
# vectors (dword lane 3 of %xmm4 + 5 -> %eax, word lane 7 of %xmm5 + 4 -> %edx).
.L6:
        paddd   .LC5(%rip), %xmm4
        paddw   .LC7(%rip), %xmm5
        pshufd  $255, %xmm4, %xmm0
        pextrw  $7, %xmm5, %edx
        movd    %xmm0, %eax
        jmp     .L14
# .L25-.L30 (and .L26 above): add the 1..6 leftover iterations to %eax and
# set the final value of a to 0.
.L25:
        addl    $1, %eax
        xorl    %edx, %edx
        jmp     .L14
.L27:
        addl    $3, %eax
        xorl    %edx, %edx
        jmp     .L14
.L28:
        addl    $4, %eax
        xorl    %edx, %edx
        jmp     .L14
.L29:
        addl    $5, %eax
        xorl    %edx, %edx
        jmp     .L14
.L30:
        addl    $6, %eax
        xorl    %edx, %edx
        jmp     .L14
# Storage for the global a: two zero bytes.
a:
        .zero   2
# Initial dword lanes {0, 1, 2, 3} loaded into %xmm1 before the vector loop.
.LC0:
        .long   0
        .long   1
        .long   2
        .long   3
# Word-lane offsets {0, 4, ..., 28} added (paddw) to the broadcast value of a
# in %xmm0 before the vector loop.
.LC1:
        .value  0
        .value  4
        .value  8
        .value  12
        .value  16
        .value  20
        .value  24
        .value  28
# 5 in every dword lane; added (paddd) to %xmm4 at .L6.
.LC5:
        .long   5
        .long   5
        .long   5
        .long   5
# 262148 = 0x00040004, i.e. 4 in every word lane; added (paddw) to %xmm5 at .L6.
.LC7:
        .long   262148
        .long   262148
        .long   262148
        .long   262148

Reply via email to