Mir Algorithm and Mir GLAS (GLAS is experimental) were added to https://d.godbolt.org
by Johan Engelen. Thank you, Johan!

Try it:
1. Select mir-algorithm 0.6.13 from the libraries list (after Intel button)
2. Select LDC 1.4.0
3. Add compiler flags: -O -release -mcpu=cannonlake -linkonce-templates -betterC
4. Add your code
5. Enjoy AVX512 instructions with fused math :)
// Euclidean norm
import mir.ndslice;
import mir.math.common;

/// Returns the square root of the sum of squares of the elements of `x`.
///
/// The sum is accumulated with `reduce`, starting from the seed `0.0` and
/// applying `a + b * b` for each element `b`; `sqrt` is applied to the result.
/// `@fastmath` presumably is what lets the compiler fuse and reorder the
/// floating-point operations (see the FMA instructions in the generated code).
@fastmath double norm2(ContiguousVector!double x) {
    return 0.0.reduce!"a + b * b"(x).sqrt;
}


# Compiler output for norm2 (x86-64, Intel syntax, AVX-512).
# NOTE(review): the posted listing had lost its label lines and `ret`
# instructions; they are restored here from the branch targets.
#
# Register roles, as used below:
#   rax  = element count (first qword of the by-value slice on the stack)
#   rcx  = pointer to the doubles (second qword of the slice)
#   rsi  = r8 = count rounded down to a multiple of 32
#   xmm0 = running sum / return value
double example.norm2(mir.ndslice.slice.Slice!(2, [1], double*).Slice):
  mov rax, qword ptr [rsp + 8]
  test rax, rax
  # Empty input: return sqrt(0).
  je .LBB0_1
  lea rcx, [rsp + 8]
  mov rcx, qword ptr [rcx + 8]
  vxorpd xmm0, xmm0, xmm0
  # Fewer than 32 elements: only the scalar loop runs.
  cmp rax, 32
  jb .LBB0_12
  mov r8, rax
  and r8, -32
  mov rsi, rax
  and rsi, -32
  je .LBB0_12
  # rdx = (rsi - 32) / 32. Bit 5 of rsi - 32 is the low bit of rdx and
  # selects whether a leading group of 32 elements is handled up front.
  lea rdi, [rsi - 32]
  mov rdx, rdi
  shr rdx, 5
  bt edi, 5
  jb .LBB0_5
  # Leading group: square the first 32 doubles into the four accumulators.
  vmovupd zmm0, zmmword ptr [rcx]
  vmovupd zmm1, zmmword ptr [rcx + 64]
  vmovupd zmm2, zmmword ptr [rcx + 128]
  vmovupd zmm3, zmmword ptr [rcx + 192]
  vmulpd zmm0, zmm0, zmm0
  vmulpd zmm1, zmm1, zmm1
  vmulpd zmm2, zmm2, zmm2
  vmulpd zmm3, zmm3, zmm3
  # r9 = number of elements already consumed.
  mov r9d, 32
  test rdx, rdx
  jne .LBB0_8
  jmp .LBB0_10
.LBB0_1:
  vxorps xmm0, xmm0, xmm0
  vsqrtsd xmm0, xmm0, xmm0
  ret
.LBB0_5:
  # No leading group: start with zeroed accumulators at element 0.
  vxorpd zmm0, zmm0, zmm0
  xor r9d, r9d
  vxorpd zmm1, zmm1, zmm1
  vxorpd zmm2, zmm2, zmm2
  vxorpd zmm3, zmm3, zmm3
  test rdx, rdx
  je .LBB0_10
.LBB0_8:
  # rdi = elements left for the vector loop; rdx = address of the last of
  # the eight 64-byte loads of the first iteration (hence the +448 bias).
  mov rdi, rsi
  sub rdi, r9
  lea rdx, [rcx + 8*r9 + 448]
.LBB0_9:
  # Vector loop: 64 doubles per iteration, acc = x * x + acc via FMA.
  # The accumulators alternate between zmm0-3 and zmm4-7.
  vmovupd zmm4, zmmword ptr [rdx - 448]
  vmovupd zmm5, zmmword ptr [rdx - 384]
  vmovupd zmm6, zmmword ptr [rdx - 320]
  vmovupd zmm7, zmmword ptr [rdx - 256]
  vfmadd213pd zmm4, zmm4, zmm0
  vfmadd213pd zmm5, zmm5, zmm1
  vfmadd213pd zmm6, zmm6, zmm2
  vfmadd213pd zmm7, zmm7, zmm3
  vmovupd zmm0, zmmword ptr [rdx - 192]
  vmovupd zmm1, zmmword ptr [rdx - 128]
  vmovupd zmm2, zmmword ptr [rdx - 64]
  vmovupd zmm3, zmmword ptr [rdx]
  vfmadd213pd zmm0, zmm0, zmm4
  vfmadd213pd zmm1, zmm1, zmm5
  vfmadd213pd zmm2, zmm2, zmm6
  vfmadd213pd zmm3, zmm3, zmm7
  add rdx, 512
  add rdi, -64
  jne .LBB0_9
.LBB0_10:
  # Combine the four accumulators, then fold the lanes of zmm0 so that
  # the low lane (xmm0) holds the total.
  vaddpd zmm0, zmm0, zmm2
  vaddpd zmm1, zmm1, zmm3
  vaddpd zmm0, zmm0, zmm1
  vshuff64x2 zmm1, zmm0, zmm0, 14
  vaddpd zmm0, zmm0, zmm1
  vpermpd zmm1, zmm0, 238
  vaddpd zmm0, zmm0, zmm1
  vpermilpd zmm1, zmm0, 1
  vaddpd zmm0, zmm0, zmm1
  # Count was an exact multiple of 32: nothing left for the scalar loop.
  cmp rax, rsi
  je .LBB0_13
  # rax = leftover element count; rcx = address of the first leftover.
  sub rax, r8
  lea rcx, [rcx + 8*rsi]
.LBB0_12:
  # Scalar loop: xmm0 = x * x + xmm0, one double per iteration.
  vmovsd xmm1, qword ptr [rcx]
  vfmadd231sd xmm0, xmm1, xmm1
  add rcx, 8
  add rax, -1
  jne .LBB0_12
.LBB0_13:
  vsqrtsd xmm0, xmm0, xmm0
  ret

Best regards,

Reply via email to