On Sunday, 7 March 2021 at 13:26:37 UTC, z wrote:
On Thursday, 25 February 2021 at 11:28:14 UTC, z wrote:
However, AVX512 support seems limited to being able to use the
16 other YMM registers, rather than using the same code
template but changed to use ZMM registers and double the
offsets to take advantage of the new size.
Compiled with «-g -enable-unsafe-fp-math
-enable-no-infs-fp-math -ffast-math -O -release -mcpu=skylake» :
You're not compiling with AVX512 enabled. You would need to use
-mcpu=skylake-avx512.
However, LLVM's code generation for AVX512 seems to be pretty
terrible still, so you'll need to either use some inline ASM, or
stick with AVX2. Here's a structure-of-arrays-style example:
import std.meta : Repeat;
/**
 * Computes, lane by lane, the Euclidean distance between two sets of 3-D
 * points stored in structure-of-arrays form.
 *
 * Each of the three `a` vectors holds one coordinate for `V.length` points,
 * and likewise for `b`; lane `k` of `result` receives
 * `sqrt((a[0][k]-b[0][k])^2 + (a[1][k]-b[1][k])^2 + (a[2][k]-b[2][k])^2)`.
 *
 * Params:
 *   a      = the three coordinate vectors of the first point set
 *   b      = the three coordinate vectors of the second point set
 *   result = receives the per-lane distances
 *
 * Only instantiable for `__vector(float[N])` types, and only when compiled
 * with LDC for x86-64, because the square root is issued through LDC inline
 * assembly (`vsqrtps`).
 */
void euclideanDistanceFixedSizeArray(V)(ref Repeat!(3, const(V))
a, ref Repeat!(3, const(V)) b, out V result)
if(is(V : __vector(float[length]), size_t length))
{
    // Start from a mutable copy of `a`, then turn each component into the
    // squared difference (a[i] - b[i])^2.
    Repeat!(3, V) diffSq = a;
    static foreach(i; 0 .. 3) {
        diffSq[i] -= b[i];
        diffSq[i] *= diffSq[i];
    }

    // Sum the three squared differences. The accumulator is seeded with
    // component 0, so the loop must start at 1; starting at 0 would count
    // the first component twice.
    result = diffSq[0];
    static foreach(i; 1 .. 3)
        result += diffSq[i];

    version(LDC) { version(X86_64) {
        enum isSupportedPlatform = true;
        import ldc.llvmasm : __asm;
        // AT&T operand order: $1 is the input register, $0 the output.
        result = __asm!V(`vsqrtps $1, $0`, `=x, x`, result);
    } }
    // Fail with a readable message (instead of "undefined identifier")
    // when the inline-asm path above was not compiled in.
    static assert(is(typeof(isSupportedPlatform)),
        "euclideanDistanceFixedSizeArray requires LDC targeting x86-64");
}
Resulting asm with is(V == __vector(float[16])):
# Constant pool entry: 0x7fc00000 is the IEEE-754 single-precision quiet-NaN
# bit pattern.
.LCPI1_0:
.long 0x7fc00000
# Compiler output for the float[16] (512-bit, ZMM) instantiation.
# rdi is the only pointer written through, so it addresses `result`; the
# other six register/stack pointers are the input vectors.
# NOTE(review): exact register-to-parameter mapping depends on LDC's D
# calling convention — confirm before relying on it.
pure nothrow @nogc void
app.euclideanDistanceFixedSizeArray!(__vector(float[16])).euclideanDistanceFixedSizeArray(ref const(__vector(float[16])), ref const(__vector(float[16])), ref const(__vector(float[16])), ref const(__vector(float[16])), ref const(__vector(float[16])), ref const(__vector(float[16])), out __vector(float[16])):
# Fetch the one pointer argument that was passed on the stack.
mov rax, qword ptr [rsp + 8]
# Fill all 16 lanes of the output with NaN (presumably the default
# initialisation of the `out` parameter).
vbroadcastss zmm0, dword ptr [rip + .LCPI1_0]
vmovaps zmmword ptr [rdi], zmm0
# Load the three vectors of one operand set.
vmovaps zmm0, zmmword ptr [rax]
vmovaps zmm1, zmmword ptr [r9]
vmovaps zmm2, zmmword ptr [r8]
# First component: difference, then square.
vsubps zmm0, zmm0, zmmword ptr [rcx]
vmulps zmm0, zmm0, zmm0
# Differences for the remaining two components.
vsubps zmm1, zmm1, zmmword ptr [rdx]
vsubps zmm2, zmm2, zmmword ptr [rsi]
# zmm0 = zmm0 + zmm0: the first squared difference is DOUBLED here, i.e. it
# is counted twice in the sum.
vaddps zmm0, zmm0, zmm0
# Fused multiply-add: zmm0 += zmm1*zmm1, then zmm0 += zmm2*zmm2.
vfmadd231ps zmm0, zmm1, zmm1
vfmadd231ps zmm0, zmm2, zmm2
# Store the sum before the square root.
vmovaps zmmword ptr [rdi], zmm0
# Per-lane square root, then store the final value to the output.
vsqrtps zmm0, zmm0
vmovaps zmmword ptr [rdi], zmm0
# Clear the upper vector-register bits to avoid AVX/SSE transition penalties.
vzeroupper
ret