https://gcc.gnu.org/bugzilla/show_bug.cgi?id=122320

            Bug ID: 122320
           Summary: gcc generates suboptimal code for avx512 with constant
                    propagation
           Product: gcc
           Version: 16.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: c++
          Assignee: unassigned at gcc dot gnu.org
          Reporter: rockeet at gmail dot com
  Target Milestone: ---

#include <stddef.h>
#include <stdint.h>
#include <immintrin.h>
inline bool MemoryEqual(const void* vx, const void* vy, size_t n) {
  auto px = (const unsigned char*)vx;
  auto py = (const unsigned char*)vy;
  for (; n >= 64; n -= 64, px += 64, py += 64) {
    __m512i   xxx = _mm512_loadu_epi8(px);
    __m512i   yyy = _mm512_loadu_epi8(py);
    __mmask64 neq = _mm512_cmpneq_epi8_mask(xxx, yyy);
    if (0 != neq)
      return false;
  }
  // n is unlikely to be a multiple of 64, so skipping an n != 0 check is harmless
  // if (n) // with this check present, gcc is ok (a guarded variant is sketched after the testcase)
  {
    __mmask64 msk = _bzhi_u64(-1, n);
    __m512i   xxx = _mm512_maskz_loadu_epi8(msk, px);
    __m512i   yyy = _mm512_maskz_loadu_epi8(msk, py);
    __mmask64 neq = _mm512_cmpneq_epi8_mask(xxx, yyy);
    return 0 == neq;
  }
  return true;
}
bool MemoryEqual_64(const void* vx, const void* vy) {
    return MemoryEqual(vx, vy, 64);
}
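
As the comment in the tail notes, adding an if (n) guard avoids the problem. A minimal sketch of that guarded variant (hypothetical name MemoryEqual_guarded, same headers and intrinsics as the testcase above; not part of the filed testcase):

// Hypothetical guarded variant (not part of the filed testcase): identical to
// MemoryEqual except for the if (n) check before the masked tail, which per
// the comment above makes gcc generate good code.
inline bool MemoryEqual_guarded(const void* vx, const void* vy, size_t n) {
  auto px = (const unsigned char*)vx;
  auto py = (const unsigned char*)vy;
  for (; n >= 64; n -= 64, px += 64, py += 64) {
    __m512i   xxx = _mm512_loadu_epi8(px);
    __m512i   yyy = _mm512_loadu_epi8(py);
    if (0 != _mm512_cmpneq_epi8_mask(xxx, yyy))
      return false;
  }
  if (n) { // guard: with a constant n == 64 the whole tail is now provably dead
    __mmask64 msk = _bzhi_u64(-1, n);
    __m512i   xxx = _mm512_maskz_loadu_epi8(msk, px);
    __m512i   yyy = _mm512_maskz_loadu_epi8(msk, py);
    return 0 == _mm512_cmpneq_epi8_mask(xxx, yyy);
  }
  return true;
}
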
--------------------
gcc generates suboptimal code, clang is ok (https://godbolt.org/z/jTdqxrr1j). For MemoryEqual_64, gcc keeps a redundant vpxor/vpcmpb/kortestq sequence that compares a zeroed register against itself:
--------------------
MemoryEqual_64(void const*, void const*):
        vmovdqu8        zmm0, ZMMWORD PTR [rdi]
        vpcmpb  k0, zmm0, ZMMWORD PTR [rsi], 4
        kortestq        k0, k0
        jne     .L3
        vpxor   xmm0, xmm0, xmm0
        vpcmpb  k0, zmm0, zmm0, 4
        kortestq        k0, k0
        sete    al
.L1:
        vzeroupper
        ret
.L3:
        xor     eax, eax
        jmp     .L1
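
For reference, the reason the tail should fold away: with n == 64 constant-propagated into MemoryEqual, the loop body runs exactly once and the tail executes with n == 0, so _bzhi_u64(-1, 0) yields an empty mask, both maskz loads return all-zero vectors, and the cmpneq mask is necessarily 0. The vpxor/vpcmpb/kortestq triple in the output above is therefore dead. A minimal standalone sketch of that reasoning (hypothetical test program, not part of the report; assumes AVX-512BW and BMI2, e.g. -march=x86-64-v4):

// Hypothetical standalone check (not part of the report) of why the tail
// folds to "equal" once n == 64 has been constant-propagated.
#include <stddef.h>
#include <assert.h>
#include <immintrin.h>
int main() {
  size_t n = 0;                      // value of n after the single loop iteration
  __mmask64 msk = _bzhi_u64(-1, n);  // BZHI with index 0 clears every bit
  assert(msk == 0);
  // A zero mask makes both maskz loads return all-zero vectors without touching
  // memory, so the not-equal mask is always 0 and the tail returns true.
  __m512i zero = _mm512_setzero_si512();
  assert(0 == _mm512_cmpneq_epi8_mask(zero, zero));
  // Hence only the first vpcmpb + kortestq in MemoryEqual_64 is needed; the
  // vpxor/vpcmpb/kortestq triple gcc emits afterwards is dead code.
  return 0;
}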
