https://gcc.gnu.org/bugzilla/show_bug.cgi?id=122320
            Bug ID: 122320
           Summary: gcc generates suboptimal code for avx512 with constant
                    propagation
           Product: gcc
           Version: 16.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: c++
          Assignee: unassigned at gcc dot gnu.org
          Reporter: rockeet at gmail dot com
  Target Milestone: ---

#include <stddef.h>
#include <stdint.h>
#include <immintrin.h>
inline bool MemoryEqual(const void* vx, const void* vy, size_t n) {
    auto px = (const unsigned char*)vx;
    auto py = (const unsigned char*)vy;
    for (; n >= 64; n -= 64, px += 64, py += 64) {
        __m512i xxx = _mm512_loadu_epi8(px);
        __m512i yyy = _mm512_loadu_epi8(py);
        __mmask64 neq = _mm512_cmpneq_epi8_mask(xxx, yyy);
        if (0 != neq)
            return false;
    }
    // n is unlikely to be a multiple of 64, so skipping the check on n does no harm
    // if (n) // if this check is present, gcc is ok (see the sketch after this listing)
    {
        __mmask64 msk = _bzhi_u64(-1, n);
        __m512i xxx = _mm512_maskz_loadu_epi8(msk, px);
        __m512i yyy = _mm512_maskz_loadu_epi8(msk, py);
        __mmask64 neq = _mm512_cmpneq_epi8_mask(xxx, yyy);
        return 0 == neq;
    }
    return true;
}
bool MemoryEqual_64(const void* vx, const void* vy) {
    return MemoryEqual(vx, vy, 64);
}
--------------------
gcc generates suboptimal code; clang is ok (https://godbolt.org/z/jTdqxrr1j):
--------------------
MemoryEqual_64(void const*, void const*):
        vmovdqu8        zmm0, ZMMWORD PTR [rdi]
        vpcmpb  k0, zmm0, ZMMWORD PTR [rsi], 4
        kortestq        k0, k0
        jne     .L3
        vpxor   xmm0, xmm0, xmm0
        vpcmpb  k0, zmm0, zmm0, 4
        kortestq        k0, k0
        sete    al
.L1:
        vzeroupper
        ret
.L3:
        xor     eax, eax
        jmp     .L1
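--------------------
The redundant part is the second compare: for n == 64 the loop body runs exactly once and leaves n == 0, so _bzhi_u64(-1, 0) is 0 and both masked loads yield all-zero vectors. gcc does fold the masked loads to a zero vector (the vpxor), but it fails to fold the compare of that zero vector with itself, which is always equal, so the vpcmpb/kortestq/sete sequence after the vpxor just materializes a constant true. Constant propagation should reduce MemoryEqual_64 to a single full-width compare, roughly the sketch below (MemoryEqual_64_folded is a made-up name for illustration):

#include <immintrin.h>

// Sketch: what MemoryEqual_64 should boil down to once the trivially-true
// masked tail compare is folded away.
bool MemoryEqual_64_folded(const void* vx, const void* vy) {
    __m512i xxx = _mm512_loadu_epi8(vx);
    __m512i yyy = _mm512_loadu_epi8(vy);
    return 0 == _mm512_cmpneq_epi8_mask(xxx, yyy);
}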