https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82370

            Bug ID: 82370
           Summary: AVX512 can use a memory operand for immediate-count
                    vpsrlw, but gcc doesn't.
           Product: gcc
           Version: 8.0
            Status: UNCONFIRMED
          Keywords: missed-optimization, ssemmx
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: peter at cordes dot ca
  Target Milestone: ---
            Target: x86_64-*-*, i?86-*-*

#include <immintrin.h>
#include <stdint.h>
#include <stddef.h>
void pack_high8_alignhack(uint8_t *restrict dst, const uint8_t *restrict src,
                          size_t bytes) {
  uint8_t *end_dst = dst + bytes;
  do{
     __m128i v0 = _mm_loadu_si128((__m128i*)src);
     __m128i v1_offset = _mm_loadu_si128(1+(__m128i*)(src-1)); // unaligned load at src+15 (the 15(%rsi) below)
     v0 = _mm_srli_epi16(v0, 8);
     __m128i v1 = _mm_and_si128(v1_offset, _mm_set1_epi16(0x00FF));
     __m128i pack = _mm_packus_epi16(v0, v1);
     _mm_storeu_si128((__m128i*)dst, pack);
     dst += 16;
     src += 32;  // 32 bytes
  } while(dst < end_dst);
}

pack_high8_alignhack:
        vmovdqa64       .LC0(%rip), %xmm2    # pointless EVEX when VEX is shorter
        addq    %rdi, %rdx
.L18:
        vmovdqu64       (%rsi), %xmm0
        vpandq  15(%rsi), %xmm2, %xmm1       # pointless EVEX vs. VPAND
        addq    $16, %rdi
        addq    $32, %rsi
        vpsrlw  $8, %xmm0, %xmm0             # could use a memory source.
        vpackuswb       %xmm1, %xmm0, %xmm0
        vmovups %xmm0, -16(%rdi)
        cmpq    %rdi, %rdx
        ja      .L18
        ret

There's no benefit to using VPANDQ (4-byte EVEX prefix) instead of VPAND
(2-byte VEX prefix).  Same for VMOVDQA64.  We should only use the EVEX
encoding when we actually need masking, the ZMM register size, or
xmm/ymm16-31.
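
For contrast, here's a minimal sketch (not from the report) of a case that
genuinely needs EVEX: the zero-masked shift has no VEX form, so it has to be
an EVEX vpsrlw with {k}{z}, while the unmasked shift can keep the short VEX
encoding.  Build with -mavx512bw -mavx512vl; the function name and the mask
argument are just illustrative.

#include <immintrin.h>

__m128i srli16_plain_and_masked(__m128i v, __mmask8 k) {
    __m128i plain  = _mm_srli_epi16(v, 8);           // VEX vpsrlw is enough here
    __m128i masked = _mm_maskz_srli_epi16(k, v, 8);  // needs EVEX (AVX512BW+AVX512VL)
    return _mm_add_epi16(plain, masked);
}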

Or, as in this case, to use the AVX512VL+AVX512BW form that lets us fold the
load into the shift as a memory source operand:  VPSRLW xmm1 {k1}{z},
xmm2/m128, imm8
(https://hjlebbink.github.io/x86doc/html/PSRLW_PSRLD_PSRLQ.html).  IACA 2.3
says the folded load micro-fuses, so it's definitely worth it.

Clang gets everything right and emits:

pack_high8_alignhack:
        addq    %rdi, %rdx
        vmovdqa .LCPI2_0(%rip), %xmm0    # Plain AVX (VEX prefix)
.LBB2_1:
        vpsrlw  $8, (%rsi), %xmm1        # load folded into AVX512BW version
        vpand   15(%rsi), %xmm0, %xmm2   # AVX-128 VEX encoding.
        vpackuswb       %xmm2, %xmm1, %xmm1
        vmovdqu %xmm1, (%rdi)
        addq    $16, %rdi
        addq    $32, %rsi
        cmpq    %rdx, %rdi
        jb      .LBB2_1
        retq

vmovdqu is the same length as vmovups, so there's no benefit either way.  But
AFAIK there's also no downside on any CPU to always using FP stores on the
results of vector-integer ALU instructions.
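
At the source level the same choice would look something like this (purely
illustrative; the report is about which store mnemonic the compiler picks,
and the cast is a free reinterpret, not a conversion):

#include <immintrin.h>
#include <stdint.h>

static inline void store_int_vector_via_fp(uint8_t *dst, __m128i v) {
    _mm_storeu_ps((float*)dst, _mm_castsi128_ps(v));   // typically vmovups
}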

(There isn't a separate mnemonic for EVEX vmovups, so the assembler uses the
VEX encoding whenever the instruction is encodable that way.  Or maybe, for
medium-size displacements that are multiples of the vector width, it could
save a byte by using EVEX + compressed disp8 instead of VEX + disp32.)
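
To make the displacement case concrete (the offset and byte counts here are
my own illustration, not from the report): with 16-byte operands, EVEX scales
an 8-bit displacement by 16, so any multiple of 16 up to 2032 still fits in
one byte, while VEX needs a 4-byte disp32 once the offset is outside
-128..127.  The 4-byte field shrinking to 1 more than pays for the EVEX
prefix being two bytes longer than a 2-byte VEX prefix.

#include <immintrin.h>
#include <stdint.h>

/* Illustration only: 256 is a multiple of 16 and within 127*16, so an EVEX
   store could encode it as disp8*N (one displacement byte); the VEX encoding
   needs a disp32, since 256 doesn't fit in a signed 8-bit displacement. */
void store_at_offset_256(uint8_t *dst, __m128i v) {
    _mm_storeu_si128((__m128i *)(dst + 256), v);
}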
