https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96275

            Bug ID: 96275
           Summary: Vectorizer doesn't take into account bitmask condition
                    from branch conditions.
           Product: gcc
           Version: 11.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: c
          Assignee: unassigned at gcc dot gnu.org
          Reporter: witold.baryluk+gcc at gmail dot com
  Target Milestone: ---

https://godbolt.org/z/Gfebjd

With gcc trunk 20200720

If the loop to be vectorized is inside an `if` whose condition constrains the
loop count, or is preceded by an assert / early function return on such a
condition, gcc seems to forget about that constraint and does not take it into
account in the optimizer / vectorizer: it still emits the fallback scalar code
to take care of stragglers, despite that code being dead.

#include "assert.h"

/*
 * Reproducer with the multiple-of-32 constraint folded into the loop bound.
 *
 * Writes a[i] = b[0] * c[i] for every i below (N & ~31u), i.e. below N with
 * its low five bits cleared, so the trip count is always a multiple of 32.
 * Elements of a at or beyond that bound are left untouched.
 *
 * N: requested element count (only its value rounded down to 32 is used).
 * a: destination array; b: only b[0] is read; c: source array.
 */
void fillArray(const unsigned int N, float * restrict a, const float* restrict
b, const float* restrict c) {
    //assert(N >= 1024);
    /* i is int while the bound is unsigned, so the comparison is unsigned. */
    for (int i = 0; i < (N & ~31u); i++) {
        a[i] = b[0] * c[i];
    }
}


produces:

# Compiler output for the fillArray whose loop bound is (N & ~31u).
# Register roles assume the System V AMD64 argument order for (N, a, b, c):
#   edi = N, rsi = a, rdx = b, rcx = c   -- TODO confirm against the target ABI.
# Only an 8-float (32-byte) vector loop is emitted; there is no scalar tail.
fillArray:
        # edi = N & ~31; the AND also sets ZF, so a zero trip count returns.
        and     edi, -32
        je      .L8
        # edi = number of 8-float vectors to process.
        shr     edi, 3
        # ymm1 = the float at [rdx] replicated into all eight lanes.
        vbroadcastss    ymm1, DWORD PTR [rdx]
        # rax = running byte offset into both arrays, starting at 0.
        xor     eax, eax
        # rdx = end byte offset = vector count * 32 (the b pointer is dead now).
        mov     edx, edi
        sal     rdx, 5
.L3:
        # One iteration: multiply 8 floats at [rcx+rax] by ymm1, store at [rsi+rax].
        vmulps  ymm0, ymm1, YMMWORD PTR [rcx+rax]
        vmovups YMMWORD PTR [rsi+rax], ymm0
        add     rax, 32
        cmp     rax, rdx
        jne     .L3
        # Clear the upper YMM halves before returning to the caller.
        vzeroupper
.L8:
        ret




but:

#include "assert.h"

/*
 * Reproducer with the multiple-of-32 constraint expressed as a guard.
 *
 * When the low five bits of N are all zero, writes a[i] = b[0] * c[i] for
 * every i below N. When any of those bits is set, the function does nothing.
 * Inside the guarded loop the trip count is therefore always a multiple of 32.
 *
 * N: element count; a: destination array; b: only b[0] is read; c: source array.
 */
void fillArray(const unsigned int N, float * restrict a, const float* restrict
b, const float* restrict c) {
    //assert(N >= 1024);
    if ((N & 31u) == 0) {
        /* i is int while N is unsigned, so the comparison is unsigned. */
        for (int i = 0; i < N; i++) {
            a[i] = b[0] * c[i];
        }
    }
}

produces this sub-optimal code:

# Compiler output for the fillArray whose loop is guarded by (N & 31u) == 0.
# Register roles assume the System V AMD64 argument order for (N, a, b, c):
#   edi = N, rsi = a, rdx = b, rcx = c   -- TODO confirm against the target ABI.
# Layout: entry checks, 8-float vector loop (.L4), then an epilogue (.L3/.L6)
# made of one 4-float step and up to three scalar steps.
fillArray:
        # Return immediately unless the low five bits of N are zero.
        mov     eax, edi
        and     eax, 31
        jne     .L14
        # Return immediately when N == 0.
        test    edi, edi
        je      .L14
        # From here on N is a non-zero multiple of 32, and eax == 0.
        # r8d = N - 1.
        lea     r8d, [rdi-1]
        # xmm1 = scalar float loaded from [rdx].
        vmovss  xmm1, DWORD PTR [rdx]
        # Skip the 8-wide loop when N - 1 <= 6 (N < 8). With N >= 32 established
        # by the entry checks, this branch cannot be taken.
        cmp     r8d, 6
        jbe     .L8
        mov     edx, edi
        # ymm2 = xmm1's scalar replicated into all eight lanes.
        vbroadcastss    ymm2, xmm1
        # rax = running byte offset, starting at 0.
        xor     eax, eax
        # rdx = end byte offset = (N / 8) * 32.
        shr     edx, 3
        sal     rdx, 5
.L4:
        # One iteration: multiply 8 floats at [rcx+rax] by ymm2, store at [rsi+rax].
        vmulps  ymm0, ymm2, YMMWORD PTR [rcx+rax]
        vmovups YMMWORD PTR [rsi+rax], ymm0
        add     rax, 32
        cmp     rdx, rax
        jne     .L4
        # eax = edx = N & ~7, the number of elements the vector loop handled.
        mov     eax, edi
        and     eax, -8
        mov     edx, eax
        # No remainder: return via .L16. For N a multiple of 32 this always
        # holds, so everything from here through .L6 is unreachable.
        cmp     edi, eax
        je      .L16
        vzeroupper
.L3:
        # Epilogue entry: eax = elements already done, edx = next element index.
        mov     r9d, edi
        # r8d = (N - 1) - done; r9d = N - done = elements remaining.
        sub     r8d, eax
        sub     r9d, eax
        # Three or fewer elements remaining: go straight to the scalar steps.
        cmp     r8d, 2
        jbe     .L6
        # Zero-extend the index into rax for use in the address below.
        mov     eax, eax
        # xmm0 = element 0 of xmm1 replicated into all four lanes.
        vshufps xmm0, xmm1, xmm1, 0
        # One 4-float step at element index rax.
        vmulps  xmm0, xmm0, XMMWORD PTR [rcx+rax*4]
        vmovups XMMWORD PTR [rsi+rax*4], xmm0
        # eax = remaining count rounded down to a multiple of 4; advance edx by it.
        mov     eax, r9d
        and     eax, -4
        add     edx, eax
        # Nothing left after the 4-float step: return.
        cmp     r9d, eax
        je      .L14
.L6:
        # Scalar step 1 at index edx.
        movsx   rax, edx
        vmulss  xmm0, xmm1, DWORD PTR [rcx+rax*4]
        vmovss  DWORD PTR [rsi+rax*4], xmm0
        # Return when N <= edx + 1 (unsigned compare).
        lea     eax, [rdx+1]
        cmp     edi, eax
        jbe     .L14
        # Scalar step 2 at index edx + 1 (sign-extended into rax by cdqe).
        cdqe
        add     edx, 2
        vmulss  xmm0, xmm1, DWORD PTR [rcx+rax*4]
        vmovss  DWORD PTR [rsi+rax*4], xmm0
        # Return when N <= edx + 2 (unsigned compare; edx was advanced above).
        cmp     edi, edx
        jbe     .L14
        # Scalar step 3 at the advanced index.
        movsx   rdx, edx
        vmulss  xmm1, xmm1, DWORD PTR [rcx+rdx*4]
        vmovss  DWORD PTR [rsi+rdx*4], xmm1
.L14:
        ret
.L16:
        # Exit taken after the vector loop when there is no remainder.
        vzeroupper
        ret
.L8:
        # Vector loop skipped: start the epilogue at index 0 (eax is already 0
        # from the entry check).
        xor     edx, edx
        jmp     .L3


Adding `assert(N == (N & ~31u));` doesn't help.

Reply via email to