https://gcc.gnu.org/bugzilla/show_bug.cgi?id=123225

Richard Biener <rguenth at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
           Priority|P3                          |P1

--- Comment #4 from Richard Biener <rguenth at gcc dot gnu.org> ---
I think this ultimately requires targets to punt on some cases.  On x86 with
-march=znver5 we get

foo:
.LFB0:
        .cfi_startproc
        leaq    2(%rdi), %rcx
        movq    %rdi, %rax
        shrq    %rcx
        negq    %rcx
        andl    $31, %ecx
        je      .L2     
        xorl    %edx, %edx
        .p2align 5      
        .p2align 4
        .p2align 3
.L4:    // alignment loop
        addq    $2, %rax
        cmpw    $0, (%rax)
        je      .L1
        incq    %rdx
        cmpq    %rdx, %rcx
        jne     .L4

.L2:
        leaq    2(%rdi,%rcx,2), %rsi
        xorl    %edx, %edx
        .p2align 5
        .p2align 4
        .p2align 3
.L5: // vector loop
        vpxor   %xmm0, %xmm0, %xmm0
        movq    %rdx, %rcx
        vpcmpeqw        (%rsi,%rdx,2), %zmm0, %k0
        addq    $32, %rdx
        kortestd        %k0, %k0
        je      .L5
        leaq    (%rax,%rcx,2), %rax
        .p2align 4
        .p2align 4
        .p2align 3
.L6: // early-break epilog
        addq    $2, %rax
        cmpw    $0, (%rax)
        jne     .L6
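
For reference, the scalar source is presumably just a zero-terminator scan
over 16-bit elements; a minimal sketch of such a function (an assumption on
my part, the actual testcase is not quoted here):

unsigned short *
foo (unsigned short *p)
{
  /* Hypothetical reconstruction: each scalar iteration is one 2-byte
     load plus one compare against zero, matching the addq $2 / cmpw $0
     pattern in the loops above.  */
  do
    ++p;
  while (*p != 0);
  return p;
}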

The assembly above is what we get, at least until we fix things so that no
epilog is required.  The thing to observe is that in the alignment loop the
alignment check is of comparable complexity to the work of the original
scalar iteration, so an alignment loop covering up to 63 elements is much
slower than the plain scalar loop.  Possibly only loops with much more
actual work per scalar iteration are worth vectorizing this way (unless we
can elide the alignment loop).
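
For contrast, a hedged sketch (illustrative only, not from this PR) of a loop
where each scalar iteration carries much more work than the alignment check,
so peeling for alignment could still pay off:

double
bar (const double *a, const double *b)
{
  /* Illustrative only: several FP operations per scalar iteration, so an
     alignment prologue of up to a vector's worth of scalar iterations
     costs comparatively little next to the work it saves.  */
  double sum = 0.0;
  for (long i = 0; a[i] != 0.0; ++i)
    sum += a[i] * b[i] + a[i] / b[i];
  return sum;
}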
