On Sunday, 7 March 2021 at 13:26:37 UTC, z wrote:
On Thursday, 25 February 2021 at 11:28:14 UTC, z wrote:
However, AVX512 support seems limited to being able to use the 16 other YMM registers, rather than using the same code template but changed to use ZMM registers and double the offsets to take advantage of the new size. Compiled with «-g -enable-unsafe-fp-math -enable-no-infs-fp-math -ffast-math -O -release -mcpu=skylake» :

You're not compiling with AVX512 enabled. You would need to use -mcpu=skylake-avx512.

However, LLVM's code generation for AVX512 seems to be pretty terrible still, so you'll need to either use some inline ASM, or stick with AVX2. Here's a structure of arrays style example:

import std.meta : Repeat;
/**
 * Computes lane-wise Euclidean distances between two sets of 3-component
 * points stored in structure-of-arrays form.
 *
 * Each of the three `a` vectors (and likewise the three `b` vectors) holds one
 * component for `V.length` points, so lane `k` of `result` receives
 * `sqrt((a[0][k]-b[0][k])^2 + (a[1][k]-b[1][k])^2 + (a[2][k]-b[2][k])^2)`.
 *
 * Params:
 *   a      = three SIMD vectors, one per component, of the first point set
 *   b      = three SIMD vectors, one per component, of the second point set
 *   result = receives the per-lane distances
 *
 * Only compiles with LDC targeting x86-64, because the square root is issued
 * through LDC's inline-asm expression (`vsqrtps`).
 */
void euclideanDistanceFixedSizeArray(V)(ref Repeat!(3, const(V)) a, ref Repeat!(3, const(V)) b, out V result)
    if(is(V : __vector(float[length]), size_t length))
{
    // diffSq[i] = (a[i] - b[i])^2 for each of the three components.
    Repeat!(3, V) diffSq = a;
    static foreach(i; 0 .. 3) {
        diffSq[i] -= b[i];
        diffSq[i] *= diffSq[i];
    }

    // Component 0 seeds the sum, so accumulation starts at 1; starting at 0
    // would count the first squared difference twice.
    result = diffSq[0];
    static foreach(i; 1 .. 3)
        result += diffSq[i];

    version(LDC) { version(X86_64) {
        enum isSupportedPlatform = true;
        import ldc.llvmasm : __asm;
        // Packed single-precision square root over the whole vector register.
        result = __asm!V(`vsqrtps $1, $0`, `=x, x`, result);
    } }
    // The enum only exists when the version blocks above were compiled in;
    // checking for its existence yields a readable error elsewhere instead of
    // an "undefined identifier" diagnostic.
    static assert(is(typeof(isSupportedPlatform)),
        "euclideanDistanceFixedSizeArray requires LDC targeting x86-64");
}

Resulting asm with is(V == __vector(float[16])):

# Compiler-generated listing (Intel syntax) for the float[16] instantiation.
# NOTE(review): which register carries which parameter is inferred below from
# how each pointer is used in this listing, not from a documented ABI - confirm.
.LCPI1_0:
        # 0x7fc00000 is the bit pattern of a single-precision quiet NaN.
        .long   0x7fc00000
pure nothrow @nogc void app.euclideanDistanceFixedSizeArray!(__vector(float[16])).euclideanDistanceFixedSizeArray(ref const(__vector(float[16])), ref const(__vector(float[16])), ref const(__vector(float[16])), ref const(__vector(float[16])), ref const(__vector(float[16])), ref const(__vector(float[16])), out __vector(float[16])):
        # Fetch a pointer argument that was passed on the stack.
        mov     rax, qword ptr [rsp + 8]
        # Broadcast the NaN constant to all 16 lanes and store it through rdi;
        # presumably the default-initialisation of the `out` parameter - confirm.
        vbroadcastss    zmm0, dword ptr [rip + .LCPI1_0]
        vmovaps zmmword ptr [rdi], zmm0
        # Load the three vectors that act as minuends in the subtractions below.
        vmovaps zmm0, zmmword ptr [rax]
        vmovaps zmm1, zmmword ptr [r9]
        vmovaps zmm2, zmmword ptr [r8]
        # zmm0 = ([rax] - [rcx])^2
        vsubps  zmm0, zmm0, zmmword ptr [rcx]
        vmulps  zmm0, zmm0, zmm0
        # zmm1 = [r9] - [rdx], zmm2 = [r8] - [rsi]
        vsubps  zmm1, zmm1, zmmword ptr [rdx]
        vsubps  zmm2, zmm2, zmmword ptr [rsi]
        # zmm0 = zmm0 + zmm0: the first squared difference is doubled here.
        vaddps  zmm0, zmm0, zmm0
        # Fused multiply-adds: zmm0 += zmm1*zmm1, then zmm0 += zmm2*zmm2.
        vfmadd231ps     zmm0, zmm1, zmm1
        vfmadd231ps     zmm0, zmm2, zmm2
        # Store the sum through rdi, take the per-lane square root, store again.
        vmovaps zmmword ptr [rdi], zmm0
        vsqrtps zmm0, zmm0
        vmovaps zmmword ptr [rdi], zmm0
        # Zero the upper bits of the vector registers before returning.
        vzeroupper
        ret

Reply via email to