https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88571

            Bug ID: 88571
           Summary: AVX512: when calculating logical expression with all
                    values in kN registers, do not use GPRs
           Product: gcc
           Version: 8.2.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: bugzi...@poradnik-webmastera.com
  Target Milestone: ---

This is a side effect of finding Bug 88570. I have noticed that when gcc has to
generate code for a logical expression whose values are already stored in kN
registers, it moves them to GPRs, performs the calculation there, and moves the
result back. Such a situation may arise as a side effect of optimizations in
gcc. It is also more convenient to write expressions with C/C++ operators
instead of intrinsics, so some people may prefer to do so. It can probably also
happen when code optimized by gcc interacts with user code.

When a logical expression is written using intrinsics, the values stay in kN
registers as expected.
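
For comparison, here is a sketch of test1 and test2 with the mask logic written
through intrinsics (assuming the AVX512DQ mask intrinsics _knot_mask8 and
_kor_mask8 are available; -march=skylake-avx512 implies -mavx512dq, though
compiler support for these particular intrinsics may vary):

[code]
#include <immintrin.h>

// Sketch: same logic as test1/test2 below, but the mask operations go
// through intrinsics, so the masks never have to leave the kN registers.
void test1_intrin(int*__restrict n1, int*__restrict n2)
{
    __m256i v = _mm256_loadu_si256((__m256i*)n1);
    __mmask8 m = _mm256_cmpgt_epi32_mask(v, _mm256_set1_epi32(1));
    m = _knot_mask8(m);  // expected to compile to a single knotb
    _mm256_mask_storeu_epi32((__m256i*)n2, m, v);
}

void test2_intrin(int*__restrict n1, int*__restrict n2)
{
    __m256i v1 = _mm256_loadu_si256((__m256i*)n1);
    __mmask8 m1 = _mm256_cmpgt_epi32_mask(v1, _mm256_set1_epi32(1));
    __mmask8 m2 = _mm256_cmpgt_epi32_mask(v1, _mm256_set1_epi32(2));
    __mmask8 m = _knot_mask8(_kor_mask8(m1, m2));  // korb + knotb
    _mm256_mask_storeu_epi32((__m256i*)n2, m, v1);
}
[/code]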

The code below was compiled with -O3 -march=skylake-avx512. test1 and test2 are
examples of code using C/C++ operators. test3 is an example where the mask
operations are not written by the user but introduced by gcc itself during
optimization. This last example also appears in Bug 88570, which I logged for
the inefficient optimizations there.

[code]
#include <immintrin.h>

void test1(int*__restrict n1, int*__restrict n2,
    int*__restrict n3, int*__restrict n4)
{
    __m256i v = _mm256_loadu_si256((__m256i*)n1);
    __mmask8 m = _mm256_cmpgt_epi32_mask(v, _mm256_set1_epi32(1));
    m = ~m;
    _mm256_mask_storeu_epi32((__m256i*)n2, m, v);
}

void test2(int*__restrict n1, int*__restrict n2,
    int*__restrict n3, int*__restrict n4)
{
    __m256i v1 = _mm256_loadu_si256((__m256i*)n1);
    __m256i v2 = _mm256_loadu_si256((__m256i*)n1);
    __m256i v0 = _mm256_set1_epi32(2);
    __mmask8 m1 = _mm256_cmpgt_epi32_mask(v1, _mm256_set1_epi32(1));
    __mmask8 m2 = _mm256_cmpgt_epi32_mask(v2, _mm256_set1_epi32(2));
    __mmask8 m = ~(m1 | m2);
    _mm256_mask_storeu_epi32((__m256i*)n2, m, v1);
}

void test3(double*__restrict d1, double*__restrict d2,
    double*__restrict d3, double*__restrict d4)
{
    for (int n = 0; n < 4; ++n)
    {
        if (d1[n] > 0.0)
            d2[n] = d3[n];
        else
            d2[n] = d4[n];
    }
}
[/code]
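
For reference, the assembly below was produced with an invocation along these
lines (test.cc is a hypothetical file name holding the code above):

[code]
g++ -O3 -march=skylake-avx512 -S -masm=intel test.cc
[/code]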

[asm]
test1(int*, int*, int*, int*):
        vmovdqu64       ymm0, YMMWORD PTR [rdi]
        vpcmpgtd        k1, ymm0, YMMWORD PTR .LC0[rip]
        kmovb   eax, k1
        not     eax
        kmovb   k2, eax
        vmovdqu32       YMMWORD PTR [rsi]{k2}, ymm0
        vzeroupper
        ret
test2(int*, int*, int*, int*):
        vmovdqu64       ymm1, YMMWORD PTR [rdi]
        vpcmpgtd        k1, ymm1, YMMWORD PTR .LC0[rip]
        vpcmpgtd        k2, ymm1, YMMWORD PTR .LC1[rip]
        kmovb   edx, k1
        kmovb   eax, k2
        or      eax, edx
        not     eax
        kmovb   k3, eax
        vmovdqu32       YMMWORD PTR [rsi]{k3}, ymm1
        vzeroupper
        ret
test3(double*, double*, double*, double*):
        vmovupd ymm0, YMMWORD PTR [rdi]
        vxorpd  xmm1, xmm1, xmm1
        vcmppd  k1, ymm0, ymm1, 14
        vcmpltpd        ymm1, ymm1, ymm0
        kmovb   eax, k1
        not     eax
        vmovupd ymm2{k1}{z}, YMMWORD PTR [rdx]
        kmovb   k2, eax
        vmovupd ymm0{k2}{z}, YMMWORD PTR [rcx]
        vblendvpd       ymm0, ymm0, ymm2, ymm1
        vmovupd YMMWORD PTR [rsi], ymm0
        vzeroupper
        ret
.LC0:
        .long   1
        .long   1
        .long   1
        .long   1
        .long   1
        .long   1
        .long   1
        .long   1
.LC1:
        .long   2
        .long   2
        .long   2
        .long   2
        .long   2
        .long   2
        .long   2
        .long   2
[/asm]
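
The expected code for test1 would keep the inversion in the mask registers,
along these lines (a sketch; knotb is the AVX512DQ mask-register NOT):

[asm]
test1(int*, int*, int*, int*):
        vmovdqu64       ymm0, YMMWORD PTR [rdi]
        vpcmpgtd        k1, ymm0, YMMWORD PTR .LC0[rip]
        knotb   k2, k1
        vmovdqu32       YMMWORD PTR [rsi]{k2}, ymm0
        vzeroupper
        ret
[/asm]

test2 would similarly combine the masks with korb and knotb instead of the
kmovb/or/not/kmovb sequence.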
