My matrix multiplication kernels reach the speed of industry grade pure 
Assembly libraries in pure Nim:

[https://github.com/numforge/laser/tree/d1e6ae61/laser/primitives/matrix_multiplication](https://github.com/numforge/laser/tree/d1e6ae61/laser/primitives/matrix_multiplication)

Adding a new type or CPU architecture is very easy thanks to macros:
    
    
    ukernel_generator(
        x86_AVX512,
        typ = float32,
        vectype = m512,
        nb_scalars = 16,
        simd_setZero = mm512_setzero_ps,
        simd_broadcast_value = mm512_set1_ps,
        simd_load_aligned = mm512_load_ps,
        simd_load_unaligned = mm512_loadu_ps,
        simd_store_unaligned = mm512_storeu_ps,
        simd_mul = mm512_mul_ps,
        simd_add = mm512_add_ps,
        simd_fma = mm512_fmadd_ps
      )
    
    
    Run

In comparison, OpenBLAS has to create a 7000-line assembly file for each 
architecture 
([https://github.com/xianyi/OpenBLAS/blob/1a6ea8ee6/kernel/x86_64/sgemm_kernel_16x4_skylakex.S](https://github.com/xianyi/OpenBLAS/blob/1a6ea8ee6/kernel/x86_64/sgemm_kernel_16x4_skylakex.S))
 and the repo hosts a lot of those: 
[https://github.com/xianyi/OpenBLAS/tree/develop/kernel/x86_64](https://github.com/xianyi/OpenBLAS/tree/develop/kernel/x86_64).

For a more recent example, XNNPACK was started a year ago and uses C "templates" 
[https://github.com/google/XNNPACK/tree/73ccfb4e/src/f32-gemm](https://github.com/google/XNNPACK/tree/73ccfb4e/src/f32-gemm)
 that a Python build script transforms into actual C code later: 
[https://github.com/google/XNNPACK/tree/73ccfb4e/src/f32-gemm/gen-inc](https://github.com/google/XNNPACK/tree/73ccfb4e/src/f32-gemm/gen-inc)

Another example: the tensor slicing syntax in Arraymancer is impossible in 
a statically typed language without macros, and it is probably one of the main 
reasons scientists are not using raw C++ or any other new language for scientific 
computing:
    
    
    macro `[]`*[T](t: AnyTensor[T], args: varargs[untyped]): untyped =
      ## Slice a Tensor or a CudaTensor
      ## Input:
      ##   - a Tensor or a CudaTensor
      ##   - and:
      ##     - specific coordinates (``varargs[int]``)
      ##     - or a slice (cf. tutorial)
      ## Returns:
      ##   - a value or a tensor corresponding to the slice
      ## Warning ⚠ CudaTensor temporary default:
      ##   For CudaTensor only, this is a no-copy operation, data is shared 
with the input.
      ##   This proc does not guarantee that a ``let`` value is immutable.
      ## Usage:
      ##    - Basic indexing - foo[2, 3]
      ##    - Basic indexing - foo[1+1, 2*2*1]
      ##    - Basic slicing - foo[1..2, 3]
      ##    - Basic slicing - foo[1+1..4, 3-2..2]
      ##    - Span slices - foo[_, 3]
      ##    - Span slices - foo[1.._, 3]
      ##    - Span slices - foo[_..3, 3]
      ##    - Span slices - foo[_.._, 3]
      ##    - Stepping - foo[1..3|2, 3]
      ##    - Span stepping - foo[_.._|2, 3]
      ##    - Span stepping - foo[_.._|+2, 3]
      ##    - Span stepping - foo[1.._|1, 2..3]
      ##    - Span stepping - foo[_..<4|2, 3]
      ##    - Slicing until at n from the end - foo[0..^4, 3]
      ##    - Span Slicing until at n from the end - foo[_..^2, 3]
      ##    - Stepped Slicing until at n from the end - foo[1..^1|2, 3]
      ##    - Slice from the end - foo[^1..0|-1, 3]
      ##    - Slice from the end - expect non-negative step error - foo[^1..0, 
3]
      ##    - Slice from the end - foo[^(2*2)..2*2, 3]
      ##    - Slice from the end - foo[^3..^2, 3]
    
    
    Run

A last example from the scientific domain, this neural network declaration 
syntax would not be possible without macros (yes neural networks can learn 
FizzBuzz ;))
    
    
    const
      NumDigits = 10
      NumHidden = 100
    
    let ctx = newContext Tensor[float32]
    
    network ctx, FizzBuzzNet:
      layers:
        hidden: Linear(NumDigits, NumHidden)
        output: Linear(NumHidden, 4)
      forward x:
        x.hidden.relu.output
    
    let model = ctx.init(FizzBuzzNet)
    let optim = model.optimizerSGD(0.05'f32)
    
    # ....
    echo answer
    # @["1", "2", "fizz", "4", "buzz", "6", "7", "8", "fizz", "10",
    #   "11", "12", "13", "14", "15", "16", "17", "fizz", "19", "buzz",
    #   "fizz", "22", "23", "24", "buzz", "26", "fizz", "28", "29", "30",
    #   "31", "32", "fizz", "34", "buzz", "36", "37", "38", "39", "40",
    #   "41", "fizz", "43", "44", "fizzbuzz", "46", "47", "fizz", "49", "50",
    #   "fizz", "52","53", "54", "buzz", "56", "fizz", "58", "59", "fizzbuzz",
    #   "61", "62", "63", "64", "buzz", "fizz", "67", "68", "fizz", "buzz",
    #   "71", "fizz", "73", "74", "75", "76", "77","fizz", "79", "buzz",
    #   "fizz", "82", "83", "fizz", "buzz", "86", "fizz", "88", "89", "90",
    #   "91", "92", "fizz", "94", "buzz", "fizz", "97", "98", "fizz", "buzz"]
    
    
    Run

Now outside of the scientific computing, macros are great to generate 
instructions tables for an Assembler or a Jit, for example 
[https://github.com/numforge/laser/blob/d1e6ae61/laser/photon_jit/x86_64/x86_64_ops.nim](https://github.com/numforge/laser/blob/d1e6ae61/laser/photon_jit/x86_64/x86_64_ops.nim)
    
    
    op_generator:
      op MOV: # MOV(dst, src) load/copy src into destination
        ## Copy 64-bit register content to another register
        [dst64, src64]: [rex(w=1), 0x89, modrm(Direct, reg = src64, rm = dst64)]
        ## Copy 32-bit register content to another register
        [dst32, src32]: [          0x89, modrm(Direct, reg = src32, rm = dst32)]
        ## Copy 16-bit register content to another register
        [dst16, src16]: [    0x66, 0x89, modrm(Direct, reg = src16, rm = dst16)]
        
        ## Copy  8-bit register content to another register
        [dst8,  src8]:  [          0x88, modrm(Direct, reg = src8, rm = dst8)]
        
        ## Copy 64-bit immediate value into register
        [dst64, imm64]: [rex(w=1), 0xB8 + dst64] & imm64
        ## Copy 32-bit immediate value into register
        [dst64, imm32]: [          0xB8 + dst64] & imm32
        ## Copy 16-bit immediate value into register
        [dst64, imm16]: [    0x66, 0xB8 + dst64] & imm16
        
        ## Copy 32-bit immediate value into register
        [dst32, imm32]: [          0xB8 + dst32] & imm32
        ## Copy 16-bit immediate value into register
        [dst32, imm16]: [    0x66, 0xB8 + dst32] & imm16
        
        ## Copy 16-bit immediate value into register
        [dst16, imm16]: [    0x66, 0xB8 + dst16] & imm16
        ## Copy  8-bit immediate value into register
        [dst8,  imm8]:  [          0xB0 + dst8, imm8]
      
      op LEA:
        ## Load effective address of the target label into a register
        [dst64, label]: [rex(w=1), 0x8D, modrm(Direct, reg = dst64, rm = rbp)]
      
      op CMP:
        ## Compare 32-bit immediate with 32-bit int at memory location stored 
in adr register
        [adr, imm64]: [ rex(w=1), 0x81, modrm(Indirect, opcode_ext = 7, rm = 
adr[0])] & imm64
        ## Compare 32-bit immediate with 32-bit int at memory location stored 
in adr register
        [adr, imm32]: [           0x81, modrm(Indirect, opcode_ext = 7, rm = 
adr[0])] & imm32
        ## Compare 16-bit immediate with 16-bit int at memory location stored 
in adr register
        [adr, imm16]: [     0x66, 0x81, modrm(Indirect, opcode_ext = 7, rm = 
adr[0])] & imm16
        ## Compare 8-bit immediate with byte at memory location stored in adr 
register
        [adr, imm8]:  [           0x80, modrm(Indirect, opcode_ext = 7, rm = 
adr[0]), imm8]
      
      op JZ:
        ## Jump to label if zero flag is set
        [label]: [0x0F, 0x84]
      op JNZ:
        ## Jump to label if zero flag is not set
        [label]: [0x0F, 0x85]
    
    
    Run

The two major JIT assemblers today are AsmJit, which uses JavaScript as a prepass 
before C++ 
[https://github.com/asmjit/asmjit/blob/ac77dfcd/tools/tablegen.js](https://github.com/asmjit/asmjit/blob/ac77dfcd/tools/tablegen.js)
 and Xbyak which uses C++ to generate C++ 
[https://github.com/herumi/xbyak/blob/7dac9f61/gen/gen_code.cpp](https://github.com/herumi/xbyak/blob/7dac9f61/gen/gen_code.cpp)

And a last example from the low-level world: this is how you could define an 
instruction table for an emulator 
([https://github.com/mratsim/glyph/blob/8b278c5e/glyph/snes/opcodes.nim#L16-L85](https://github.com/mratsim/glyph/blob/8b278c5e/glyph/snes/opcodes.nim#L16-L85))
    
    
    genOpcTable:
      op ADC: # Add with Carry
        0x61: cycles 6, {Ecc1_m16bit, EccDirectLowNonZero}                  , 
DirectXIndirect
        0x63: cycles 4, {Ecc1_m16bit}                                       , 
StackRelative
        0x65: cycles 3, {Ecc1_m16bit, EccDirectLowNonZero}                  , 
Direct
        0x67: cycles 6, {Ecc1_m16bit, EccDirectLowNonZero}                  , 
DirectIndirectLong
        0x69: cycles 2, {Ecc1_m16bit}                                       , 
Immediate
        0x6D: cycles 4, {Ecc1_m16bit}                                       , 
Absolute
        0x6F: cycles 5, {Ecc1_m16bit}                                       , 
AbsoluteLong
        0x71: cycles 5, {Ecc1_m16bit, EccDirectLowNonZero, EccCrossBoundary}, 
DirectIndirectY
        0x72: cycles 5, {Ecc1_m16bit, EccDirectLowNonZero}                  , 
DirectIndirect
        0x73: cycles 7, {Ecc1_m16bit}                                       , 
StackRelativeIndirectY
        0x75: cycles 4, {Ecc1_m16bit, EccDirectLowNonZero}                  , 
DirectX
        0x77: cycles 6, {Ecc1_m16bit, EccDirectLowNonZero}                  , 
DirectIndirectLongY
        0x79: cycles 4, {Ecc1_m16bit, EccCrossBoundary}                     , 
AbsoluteY
        0x7D: cycles 4, {Ecc1_m16bit, EccCrossBoundary}                     , 
AbsoluteX
        0x7F: cycles 5, {Ecc1_m16bit}                                       , 
AbsoluteLongX
        
        implementation:
          # 
###################################################################################
          template adcImpl(sys: Sys, T: typedesc[uint8 or uint16], carry, 
overflow: var bool) =
            # Implement uint8 and uint16 mode
            
            template A {.dirty.} =
              # Alias for accumulator depending on mode
              when T is uint16: sys.cpu.regs.A
              else: sys.cpu.regs.A.lo
            
            func add(x, y: T, carry, overflow: var bool): T {.inline.} =
              # Add function helper
              # Carry edge cases on uint8:
              #   - x =   0, y =   0, P.carry = 0 --> result =   0, carry = 0
              #   - x = 255, y =   0, P.carry = 1 --> result =   0, carry = 1
              #   - x =   0, y = 255, P.carry = 1 --> result =   0, carry = 1
              #   - x = 127, y = 128, P.carry = 1 --> result =   0, carry = 1
              #   - x = 128, y = 127, P.carry = 1 --> result =   0, carry = 1
              #   - x = 255, y =   0, P.carry = 0 --> result = 255, carry = 0
              #   - x =   0, y = 255, P.carry = 0 --> result = 255, carry = 0
              #   - x = 127, y = 128, P.carry = 0 --> result = 255, carry = 0
              #   - x = 128, y = 127, P.carry = 0 --> result = 255, carry = 0
              result = x + y
              carry = carry or result < x
              overflow =  overflow or
                          not(result.isMsbSet xor x.isMsbSet) or
                          not(result.isMsbSet xor y.isMsbSet)
            
            # Fetch data.
            # `addressingMode` and `extraCycleCosts` are injected by 
"implementation"
            let val = sys.`addressingMode`(T, `extraCycleCosts`{.inject.})
            
            # Computation
            # TODO: Decimal mode
            A = add(A, val, carry, overflow)
            A = add(A, T(P.carry), carry, overflow)
          # 
###################################################################################
          
          var carry, overflow = false
          
          if P.emulation_mode:
            sys.adcImpl(uint8, carry, overflow)
          else:
            sys.adcImpl(uint16, carry, overflow)
          
          # Sets the flags
          P.carry    = carry
          P.overflow = overflow
          P.negative = A.isMsbSet
          P.zero     = A == 0
      
      op AND: # AND Accumulator with memory
        ...
    
    
    Run

Alternative emulators either use Python or Ruby code generation as an intermediate 
step, write opcodes in raw hex directly, or are very verbose.

And lastly, I doubt any static language besides Nim is able to write something 
like [npeg](https://github.com/zevv/npeg) (i.e. writing a custom parser without 
having to first compile the parser before use) or writing a compiler in macros 
to optimize linear algebra 
([https://github.com/numforge/laser/tree/d1e6ae61/laser/lux_compiler/core](https://github.com/numforge/laser/tree/d1e6ae61/laser/lux_compiler/core))
    
    
    import npeg, strutils, tables
    
    var words = initTable[string, int]()
    
    let parser = peg "pairs":
      pairs <- pair * *(',' * pair) * !1
      word <- +Alpha
      number <- +Digit
      pair <- >word * '=' * >number:
        words[$1] = parseInt($2)
    
    doAssert parser.match("one=1,two=2,three=3,four=4").ok
    echo words # {"two": 2, "three": 3, "one": 1, "four": 4}
    
    
    Run

[https://github.com/numforge/laser/tree/d1e6ae61/laser/lux_compiler/core](https://github.com/numforge/laser/tree/d1e6ae61/laser/lux_compiler/core)
    
    
    proc transpose(A: Function): Function =
      # Generator for the transposition function
      var i, j: Domain
      var B: Function
      
      # Definition of the result function
      B[j, i] = A[i, j]
      
      return B # Transposition function
    
    
    Run

is compiled transparently to 
    
    
    proc transpose(A: Tensor[float32]): Tensor[float32] =
      # Transposition kernel
      let dim0 = A.shape[0]
      let dim1 = A.shape[1]
      
      result = newTensor[float32](dim0, dim1)
      
      for j in 0 ..< dim1:
        for i in 0 ..< dim0:
          result[j, i] = A[i, j]
    
    
    Run

with the goal of adding parallelism and SIMD vectorization support seamlessly while 
allowing the user to stay with a representation close to the science. 

Reply via email to