https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87505

            Bug ID: 87505
           Summary: Vectorizer generates a lot of code for a small loop
           Product: gcc
           Version: unknown
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: hiraditya at msn dot com
  Target Milestone: ---

test.cpp

#include <cstdint>

// Reproducer: accumulates v[i] into an int for i starting at base, looping
// while i < base + 4, and returns the total.
//
// v    - pointer to the ints being summed; indexed with i, no bounds checking here.
// base - starting index, given as an unsigned std::size_t.
int bar(int* v, std::size_t base) {
    int sum = 0;
    // i is a signed int initialised from the unsigned base (narrowing
    // conversion), while the bound base + 4 is computed as std::size_t; for the
    // comparison i is converted to that unsigned type.
    // NOTE(review): this signed/unsigned mix is presumably what the report's
    // generated code has to cope with -- confirm against the bug discussion.
    for (int i = base; i < base + 4; ++i) {
        sum += v[i];
    }
    return sum;
}


$ gcc-8.2 -std=c++17 -O3 -DNDEBUG test.cpp

# Compiler output for bar(int*, unsigned long), x86-64 AT&T syntax.
# Assumes the System V x86-64 calling convention (v in %rdi, base in %rsi,
# result in %eax); the addressing modes below are consistent with that.
bar(int*, unsigned long):
        # Entry: %rcx = base truncated to 32 bits and sign-extended (the int i),
        # %r8 = base + 4 (the loop bound), %edx = 32-bit copy of i.
        movslq  %esi, %rcx
        leaq    4(%rsi), %r8
        movl    %esi, %edx
        # Unsigned compare: if sign-extended i >= base + 4, return 0 via .L7.
        cmpq    %r8, %rcx
        jnb     .L7
        # %rax = base + 3 - i, %r9 = base + 4 - i (presumably the trip count).
        leaq    3(%rsi), %rax
        movq    %r8, %r9
        subq    %rcx, %rax
        subq    %rcx, %r9
        # If base + 3 - i <= 3 (unsigned), skip the vector loop: .L8 zeroes the
        # sum and enters the scalar code at .L3.
        cmpq    $3, %rax
        jbe     .L8
        # Vector loop setup: %rax = &v[i], %xmm0 = zeroed accumulator,
        # %rdx = %rax + (%r9 / 4) * 16 bytes (end pointer for the loop).
        movq    %r9, %rdx
        leaq    (%rdi,%rcx,4), %rax
        pxor    %xmm0, %xmm0
        shrq    $2, %rdx
        salq    $4, %rdx
        addq    %rax, %rdx
.L5:
        # Vector loop: unaligned 16-byte load (four 32-bit ints), packed 32-bit
        # add into %xmm0, advance 16 bytes until the end pointer is reached.
        movdqu  (%rax), %xmm2
        addq    $16, %rax
        paddd   %xmm2, %xmm0
        cmpq    %rdx, %rax
        jne     .L5
        # Horizontal reduction of the four lanes of %xmm0 into %eax (shift by
        # 8 bytes and add, then shift by 4 bytes and add), interleaved with
        # bookkeeping: %r10 = %r9 rounded down to a multiple of 4,
        # %rcx and %edx advanced by %r10 for the scalar tail.
        movdqa  %xmm0, %xmm1
        movq    %r9, %r10
        psrldq  $8, %xmm1
        andq    $-4, %r10
        paddd   %xmm1, %xmm0
        addq    %r10, %rcx
        leal    (%rsi,%r10), %edx
        movdqa  %xmm0, %xmm1
        psrldq  $4, %xmm1
        paddd   %xmm1, %xmm0
        movd    %xmm0, %eax
        # If %r9 was already a multiple of 4 there is no remainder: return.
        cmpq    %r10, %r9
        je      .L10
.L3:
        # Scalar code, unrolled: adds v[%rcx], then up to three more elements.
        # Before each further add, the next 32-bit index is sign-extended and
        # compared (unsigned) against the bound in %r8; return once it is
        # reached.
        addl    (%rdi,%rcx,4), %eax
        leal    1(%rdx), %ecx
        movslq  %ecx, %rcx
        cmpq    %r8, %rcx
        jnb     .L1
        addl    (%rdi,%rcx,4), %eax
        leal    2(%rdx), %ecx
        movslq  %ecx, %rcx
        cmpq    %rcx, %r8
        jbe     .L1
        addl    $3, %edx
        addl    (%rdi,%rcx,4), %eax
        movslq  %edx, %rdx
        cmpq    %rdx, %r8
        jbe     .L1
        addl    (%rdi,%rdx,4), %eax
        ret
.L7:
        # Loop body never runs: result is 0 (falls through to the ret at .L1).
        xorl    %eax, %eax
.L1:
        ret
.L10:
        ret
.L8:
        # Vector loop skipped: start the scalar code with sum = 0.
        xorl    %eax, %eax
        jmp     .L3

Reply via email to