https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88570

            Bug ID: 88570
           Summary: Missing or ineffective vectorization of scatter load
           Product: gcc
           Version: 8.2.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: middle-end
          Assignee: unassigned at gcc dot gnu.org
          Reporter: bugzi...@poradnik-webmastera.com
  Target Milestone: ---

[code]
void test1(int*__restrict n1, int*__restrict n2,
    int*__restrict n3, int*__restrict n4)
{
    for (int n = 0; n < 8; ++n)
    {
        if (n1[n] > 0)
            n2[n] = n3[n];
        else
            n2[n] = n4[n];
    }
}

void test2(double*__restrict d1, double*__restrict d2,
    double*__restrict d3, double*__restrict d4)
{
    for (int n = 0; n < 4; ++n)
    {
        if (d1[n] > 0.0)
            d2[n] = d3[n];
        else
            d2[n] = d4[n];
    }
}
[/code]

Code like the above is vectorized properly when global variables are used. However,
when the code has to work on pointers passed as function arguments, vectorization
is either not performed or is performed ineffectively.
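
For comparison, a global-variable variant of test1() that, per the description
above, does get vectorized with plain loads and a blend might look like the
sketch below; the array names g1..g4 are made up for illustration.

[code]
// Hypothetical global-array form of test1() for comparison. With
// fixed-size globals the compiler can see that all eight elements of
// every array exist, which presumably is why it loads both sources
// unconditionally and blends instead of emitting masked loads.
int g1[8], g2[8], g3[8], g4[8];

void test1_globals()
{
    for (int n = 0; n < 8; ++n)
    {
        if (g1[n] > 0)
            g2[n] = g3[n];
        else
            g2[n] = g4[n];
    }
}
[/code]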

1. Compilation with -O3 -msse2: no vectorization at all; scalar code is
generated. It is long, so I do not paste it here.

2. Compilation with -O3 -msse4.1: still no vectorization at all.

3. Compilation with -O3 -mavx or -march=sandybridge: code for test1() is still
not vectorized (somewhat expected, as the integer operations need AVX2). The
output for test2() is below. As you can see, the generated code performs masked
loads for d3 and d4 and then uses a blend to create the final result. When
global variables are used, masked loads are not emitted, only the blend.
Additionally, the xor mask is loaded from memory instead of being generated
with a cmpeq instruction.

[asm]
test2(double*, double*, double*, double*):
        vmovupd xmm3, XMMWORD PTR [rdi]
        vinsertf128     ymm1, ymm3, XMMWORD PTR [rdi+16], 0x1
        vxorpd  xmm0, xmm0, xmm0
        vcmpltpd        ymm1, ymm0, ymm1
        vmaskmovpd      ymm2, ymm1, YMMWORD PTR [rdx]
        vxorps  ymm0, ymm1, YMMWORD PTR .LC0[rip]
        vmaskmovpd      ymm0, ymm0, YMMWORD PTR [rcx]
        vblendvpd       ymm0, ymm0, ymm2, ymm1
        vmovups XMMWORD PTR [rsi], xmm0
        vextractf128    XMMWORD PTR [rsi+16], ymm0, 0x1
        vzeroupper
        ret
.LC0:
        .quad   -1
        .quad   -1
        .quad   -1
        .quad   -1
[/asm]
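
For reference, here is a minimal AVX intrinsics sketch of the blend-only code
one might expect for test2(), assuming d3 and d4 are both fully readable for
all four lanes (which the scalar loop does not actually guarantee for pointer
arguments, but which holds in the global-variable case). The name test2_blend
is made up.

[code]
#include <immintrin.h>

// Hypothetical blend-only AVX variant of test2(): unconditional loads of
// both sources, one compare, one blend, no masked loads and no mask
// constant loaded from memory. Only valid if d3[0..3] and d4[0..3] are
// all dereferenceable.
void test2_blend(double*__restrict d1, double*__restrict d2,
    double*__restrict d3, double*__restrict d4)
{
    __m256d v1 = _mm256_loadu_pd(d1);
    __m256d m  = _mm256_cmp_pd(v1, _mm256_setzero_pd(), _CMP_GT_OS); // d1[n] > 0.0
    __m256d v3 = _mm256_loadu_pd(d3);
    __m256d v4 = _mm256_loadu_pd(d4);
    _mm256_storeu_pd(d2, _mm256_blendv_pd(v4, v3, m)); // d3 where mask set, else d4
}
[/code]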

4. Compilation with -O3 -march=haswell: code similar to the above, with both
masked loads and a blend. This time the compiler generates vpcmpeqd to create
the xor mask. The same happens when -mavx2 is used instead of -march=haswell.

[asm]
test1(int*, int*, int*, int*):
        vmovdqu ymm1, YMMWORD PTR [rdi]
        vpxor   xmm0, xmm0, xmm0
        vpcmpgtd        ymm1, ymm1, ymm0
        vpmaskmovd      ymm2, ymm1, YMMWORD PTR [rdx]
        vpcmpeqd        ymm0, ymm1, ymm0
        vpmaskmovd      ymm0, ymm0, YMMWORD PTR [rcx]
        vpblendvb       ymm0, ymm0, ymm2, ymm1
        vmovdqu YMMWORD PTR [rsi], ymm0
        vzeroupper
        ret
test2(double*, double*, double*, double*):
        vxorpd  xmm0, xmm0, xmm0
        vcmpltpd        ymm1, ymm0, YMMWORD PTR [rdi]
        vpcmpeqd        ymm0, ymm0, ymm0
        vmaskmovpd      ymm2, ymm1, YMMWORD PTR [rdx]
        vpxor   ymm0, ymm0, ymm1
        vmaskmovpd      ymm0, ymm0, YMMWORD PTR [rcx]
        vblendvpd       ymm0, ymm0, ymm2, ymm1
        vmovupd YMMWORD PTR [rsi], ymm0
        vzeroupper
        ret
[/asm]
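
Under the same assumption as above (n3 and n4 fully readable for all eight
lanes), an AVX2 blend-only sketch for test1() could look like this;
test1_blend is an illustrative name.

[code]
#include <immintrin.h>

// Hypothetical blend-only AVX2 variant of test1(): both sources loaded
// unconditionally, then selected with vpblendvb on the comparison result.
// Only valid if n3[0..7] and n4[0..7] are all dereferenceable.
void test1_blend(int*__restrict n1, int*__restrict n2,
    int*__restrict n3, int*__restrict n4)
{
    __m256i v1 = _mm256_loadu_si256((const __m256i*)n1);
    __m256i m  = _mm256_cmpgt_epi32(v1, _mm256_setzero_si256()); // n1[n] > 0
    __m256i v3 = _mm256_loadu_si256((const __m256i*)n3);
    __m256i v4 = _mm256_loadu_si256((const __m256i*)n4);
    _mm256_storeu_si256((__m256i*)n2, _mm256_blendv_epi8(v4, v3, m));
}
[/code]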

5. Compilation with -O3 -march=skylake-avx512: masked loads and a blend are
used again. This time the masked loads use kN registers to hold the mask.
test1() performs the comparison twice to obtain the negated mask. test2() uses
a single comparison, but to negate it the mask is moved to eax and back (I will
log a separate bug for this part, as it has other implications). The code which
uses global variables uses only a blend with the mask in a ymm register.

[asm]
test1(int*, int*, int*, int*):
        vmovdqu32       ymm0, YMMWORD PTR [rdi]
        vpxor   xmm2, xmm2, xmm2
        vpcmpd  k1, ymm0, ymm2, 6
        vpcmpgtd        ymm3, ymm0, ymm2
        vmovdqu32       ymm1{k1}{z}, YMMWORD PTR [rdx]
        vpcmpd  k1, ymm0, ymm2, 2
        vmovdqu32       ymm0{k1}{z}, YMMWORD PTR [rcx]
        vpblendvb       ymm0, ymm0, ymm1, ymm3
        vmovdqu32       YMMWORD PTR [rsi], ymm0
        vzeroupper
        ret
test2(double*, double*, double*, double*):
        vmovupd ymm0, YMMWORD PTR [rdi]
        vxorpd  xmm1, xmm1, xmm1
        vcmppd  k1, ymm0, ymm1, 14
        vcmpltpd        ymm1, ymm1, ymm0
        kmovb   eax, k1
        not     eax
        vmovupd ymm2{k1}{z}, YMMWORD PTR [rdx]
        kmovb   k2, eax
        vmovupd ymm0{k2}{z}, YMMWORD PTR [rcx]
        vblendvpd       ymm0, ymm0, ymm2, ymm1
        vmovupd YMMWORD PTR [rsi], ymm0
        vzeroupper
        ret
[/asm]
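
As a hedged sketch only: with AVX-512DQ (present on skylake-avx512), the
negated predicate can stay in a mask register via _knot_mask8 instead of the
kmovb/not/kmovb round-trip through eax, and the select can be done with a
k-masked blend. test2_kmask is an illustrative name, not something the
compiler currently emits.

[code]
#include <immintrin.h>

// Hypothetical AVX-512VL/DQ variant of test2(): one compare into a k
// register, knot for the complement, zero-masked loads of both sources,
// and a mask blend (vblendmpd) instead of vblendvpd with a ymm mask.
void test2_kmask(double*__restrict d1, double*__restrict d2,
    double*__restrict d3, double*__restrict d4)
{
    __m256d v1  = _mm256_loadu_pd(d1);
    __mmask8 m  = _mm256_cmp_pd_mask(v1, _mm256_setzero_pd(), _CMP_GT_OS);
    __mmask8 nm = _knot_mask8(m);               // stays in a k register, no GPR round-trip
    __m256d v3  = _mm256_maskz_loadu_pd(m,  d3);
    __m256d v4  = _mm256_maskz_loadu_pd(nm, d4);
    _mm256_storeu_pd(d2, _mm256_mask_blend_pd(m, v4, v3)); // d3 where d1[n] > 0.0
}
[/code]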

6. I tried to compile this code with icc and got the output below. As you can
see, it uses a masked move instead of a blend. I did not check whether this
offers better performance or not.

[asm]
test1(int*, int*, int*, int*):
        vpxor     ymm0, ymm0, ymm0                              #6.21
        vpcmpd    k2, ymm0, YMMWORD PTR [rdi], 1                #6.21
        knotw     k1, k2                                        #9.21
        vmovdqu32 ymm1{k2}{z}, YMMWORD PTR [rdx]                #9.21
        vmovdqu32 ymm2{k1}{z}, YMMWORD PTR [rcx]                #9.21
        vmovdqa32 ymm2{k2}, ymm1                                #9.21
        vmovdqu   YMMWORD PTR [rsi], ymm2                       #7.13
        vzeroupper                                              #11.1
        ret                                                     #11.1
test2(double*, double*, double*, double*):
        vxorpd    ymm0, ymm0, ymm0                              #18.21
        vcmppd    k2, ymm0, YMMWORD PTR [rdi], 1                #18.21
        knotw     k1, k2                                        #18.21
        vmovupd   ymm1{k2}{z}, YMMWORD PTR [rdx]                #19.21
        vmovupd   ymm2{k1}{z}, YMMWORD PTR [rcx]                #21.21
        vmovapd   ymm2{k2}, ymm1                                #21.21
        vmovupd   YMMWORD PTR [rsi], ymm2                       #19.13
        vzeroupper                                              #23.1
        ret                                                     #23.1
[/asm]

7. It is possible to eliminate knotw and one vmovapd from icc's output. This is
also not benchmarked. icc's version loads both values from memory first and
then merges them with a masked register move. In my code below I use a
non-masked load for the first value and a masked (merging) load for the second.
This will probably create a dependency and decrease performance. Anyway, there
is a possibility that knotw could be removed.

[code]
#include <immintrin.h>

void test5(double*__restrict d1, double*__restrict d2,
    double*__restrict d3, double*__restrict d4)
{
    __m256d v = _mm256_loadu_pd(d1);
    __m256d v0 = _mm256_setzero_pd();
    // Mask is set where d1[n] < 0.0 (note: test2() tests d1[n] > 0.0;
    // use _CMP_GT_OS here to match it exactly).
    __mmask8 m = _mm256_cmp_pd_mask(v, v0, _CMP_LT_OS);

    v = _mm256_loadu_pd(d4);            // unmasked load of the "else" source
    v = _mm256_mask_loadu_pd(v, m, d3); // merge d3 into the lanes where the mask is set

    _mm256_storeu_pd(d2, v);
}
[/code]

[asm]
test5(double*, double*, double*, double*):
        vmovupd ymm0, YMMWORD PTR [rdi]
        vxorpd  xmm1, xmm1, xmm1
        vcmppd  k1, ymm0, ymm1, 1
        vmovupd ymm0, YMMWORD PTR [rcx]
        vmovupd ymm0{k1}, YMMWORD PTR [rdx]
        vmovupd YMMWORD PTR [rsi], ymm0
        vzeroupper
        ret
[/asm]
