https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106704

            Bug ID: 106704
           Summary: avx intrinsic not generating vblendvps instruction with
                    -mavx
           Product: gcc
           Version: 12.1.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: c
          Assignee: unassigned at gcc dot gnu.org
          Reporter: mindmark at gmail dot com
  Target Milestone: ---

I'm getting some really strange code generation on x86_64 when using the
_mm256_blendv_ps intrinsic in gcc 12 with just the -mavx flag. 

https://godbolt.org/z/YbEK4nvEd

It should generate a single vblendvps instruction, but instead creates a lot of
branchy scalar code.

gcc 11 doesn't appear to exhibit this behavior.
Also, using -mavx2 on gcc 12 generates the single instruction; the branchy code
only appears with plain -mavx, for whatever reason.

#include <immintrin.h>

__m256 bend_stuff( __m256 a, __m256 b, __m256 mask)
{
    return _mm256_blendv_ps(a, b, mask);
}

gcc -O2 -mavx -c blend.c -masm=intel -S -o out.s

in GCC 12 it generates:

bend_stuff:
        vmovd   eax, xmm2
        vmovaps ymm3, ymm0
        vmovdqa xmm4, xmm2
        test    eax, eax
        jns     .L3
        vmovaps xmm0, xmm1
.L3:
        vpextrd eax, xmm4, 1
        vshufps xmm9, xmm3, xmm3, 85
        test    eax, eax
        jns     .L5
        vshufps xmm9, xmm1, xmm1, 85
.L5:
        vpextrd eax, xmm4, 2
        vunpckhps       xmm5, xmm3, xmm3
        test    eax, eax
        jns     .L7
        vunpckhps       xmm5, xmm1, xmm1
.L7:
        vpextrd eax, xmm4, 3
        vshufps xmm7, xmm3, xmm3, 255
        test    eax, eax
        jns     .L9
        vshufps xmm7, xmm1, xmm1, 255
.L9:
        vextractf128    xmm2, ymm2, 0x1
        vextractf128    xmm4, ymm3, 0x1
        vmovd   eax, xmm2
        test    eax, eax
        jns     .L11
        vextractf128    xmm4, ymm1, 0x1
.L11:
        vextractf128    xmm8, ymm3, 0x1
        vpextrd eax, xmm2, 1
        vshufps xmm8, xmm8, xmm8, 85
        test    eax, eax
        jns     .L13
        vextractf128    xmm8, ymm1, 0x1
        vshufps xmm8, xmm8, xmm8, 85
.L13:
        vextractf128    xmm6, ymm3, 0x1
        vpextrd eax, xmm2, 2
        vunpckhps       xmm6, xmm6, xmm6
        test    eax, eax
        jns     .L15
        vextractf128    xmm6, ymm1, 0x1
        vunpckhps       xmm6, xmm6, xmm6
.L15:
        vextractf128    xmm3, ymm3, 0x1
        vpextrd eax, xmm2, 3
        vshufps xmm3, xmm3, xmm3, 255
        test    eax, eax
        jns     .L17
        vextractf128    xmm1, ymm1, 0x1
        vshufps xmm3, xmm1, xmm1, 255
.L17:
        vunpcklps       xmm6, xmm6, xmm3
        vunpcklps       xmm4, xmm4, xmm8
        vunpcklps       xmm5, xmm5, xmm7
        vunpcklps       xmm0, xmm0, xmm9
        vmovlhps        xmm4, xmm4, xmm6
        vmovlhps        xmm0, xmm0, xmm5
        vinsertf128     ymm0, ymm0, xmm4, 0x1
        ret

in GCC 11.3 it generates:

bend_stuff:
        vblendvps       ymm0, ymm0, ymm1, ymm2
        ret

Reply via email to