I would also like to have RDTSC-based tests to catch performance regressions; see:
[https://github.com/mratsim/Arraymancer/issues/135](https://github.com/mratsim/Arraymancer/issues/135)

@Stefan_Salewski, note that it's easy to wrap RDTSC; see this gist:
[https://gist.github.com/edubart/f6c92b1fdfca1c1e15ec34bb45f88595](https://gist.github.com/edubart/f6c92b1fdfca1c1e15ec34bb45f88595)
    
    
    import sequtils, random
    
    # FFI bindings for the C and Intel MKL allocators measured by this benchmark.
    # Pragma lists that do not fit on one line continue on an indented line; a
    # continuation that starts at column 0 is rejected by Nim as invalid
    # indentation.
    # `malloc` from <stdlib.h>: request `size` bytes.
    proc c_malloc(size: csize): pointer {.importc: "malloc",
        header: "<stdlib.h>".}
    # `aligned_alloc` from <stdlib.h>: request `size` bytes at `alignment`.
    proc c_aligned_alloc(alignment, size: csize): pointer {.
        importc: "aligned_alloc", header: "<stdlib.h>".}
    # `free` from <stdlib.h>: releases memory from the two procs above.
    proc c_free(p: pointer) {.importc: "free", header: "<stdlib.h>".}
    # `mkl_malloc` from <mkl.h>: request `size` bytes with alignment `align`.
    proc mkl_malloc(size: csize, align: int): pointer {.importc: "mkl_malloc",
        header: "<mkl.h>".}
    # `mkl_free` from <mkl.h>: releases memory obtained from `mkl_malloc`.
    proc mkl_free(p: pointer) {.importc: "mkl_free", header: "<mkl.h>".}
    
    # Extra linker flags: link against the Intel MKL libraries named below
    # (mkl_intel_lp64, mkl_sequential, mkl_core) and the C math library.
    {.passL:"-lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lm".}
    
    proc rdtsc(): int64 =
      ## Executes the x86 `rdtsc` instruction and returns the 64-bit value
      ## assembled from the two 32-bit halves it produces.
      var lowBits, highBits: uint32
      # Extended asm output constraints: "=a" binds `lowBits`, "=d" binds
      # `highBits`.
      asm """
        rdtsc
        :"=a"(`lowBits`), "=d"(`highBits`)
      """
      # Shift the high half into the upper 32 bits, then merge in the low half.
      let upper = int64(highBits) shl 32
      result = upper or int64(lowBits)
    # Minimal stopwatch that measures elapsed time in `rdtsc` ticks.
    type
      CycleCounter = object
        start_cycles: int64 # counter value recorded by the latest `restart`
    proc restart(self: var CycleCounter) {.inline.} =
      ## Records the current counter value as the new reference point.
      let now = rdtsc()
      self.start_cycles = now
    proc elapsedCycles(self: CycleCounter): int64 {.inline.} =
      ## Returns the number of ticks elapsed since the latest `restart`.
      let now = rdtsc()
      result = now - self.start_cycles
    proc iterationCycles(self: CycleCounter, iters: int64): float {.inline.} =
      ## Returns the elapsed ticks divided by `iters`, i.e. the average cost of
      ## one iteration.
      let total = float(self.elapsedCycles())
      result = total / float(iters)
    
    # Benchmark parameters.
    const
      ALIGNMENT = 64           # granularity of the generated sizes
      MAX_SIZE = 128*128*128   # exclusive upper bound of the random draw
      MAX_ITERS = 1000000      # number of sizes generated
    # Pre-generated sizes: each entry is a random value below MAX_SIZE rounded
    # down to a multiple of ALIGNMENT, and never smaller than ALIGNMENT.
    # `rand(n)` is inclusive of `n`, so `MAX_SIZE - 1` keeps the draw in
    # 0 ..< MAX_SIZE, the range the deprecated `random(MAX_SIZE)` produced.
    var alloc_sizes = newSeqWith(MAX_ITERS,
      max((rand(MAX_SIZE - 1) div ALIGNMENT) * ALIGNMENT, ALIGNMENT))
    
    proc main() =
      ## Runs every allocator over all entries of `alloc_sizes` and prints the
      ## average number of cycles per iteration for each of them.
      var
        timer: CycleCounter
        acc = 0.0f # sum of the first element of every buffer, printed last

      template firstFloat(p: pointer): float32 =
        # View the raw buffer as a float32 array and read element 0.
        cast[ptr UncheckedArray[float32]](p)[0]

      template measure(label: string, body: untyped) =
        # Time `body` once per entry of `alloc_sizes` (exposed to it as `size`),
        # then print the average cycles per iteration after `label`.
        timer.restart()
        for size {.inject.} in alloc_sizes:
          body
        echo label, timer.iterationCycles(MAX_ITERS), " cycles"

      measure("C malloc "):
        let buf = c_malloc(size*sizeof(float32))
        acc += firstFloat(buf)
        c_free(buf)

      measure("C aligned_alloc "):
        let buf = c_aligned_alloc(ALIGNMENT, size*sizeof(float32))
        acc += firstFloat(buf)
        c_free(buf)

      measure("nim allocShared "):
        let buf = allocShared(size*sizeof(float32))
        acc += firstFloat(buf)
        deallocShared(buf)

      measure("nim alloc "):
        let buf = alloc(size*sizeof(float32))
        acc += firstFloat(buf)
        dealloc(buf)

      measure("nim seq "):
        var items = newSeqOfCap[float32](size)
        items.setLen(size)
        acc += items[0]

      measure("mkl_alloc "):
        let buf = mkl_malloc(size*sizeof(float32), ALIGNMENT)
        acc += firstFloat(buf)
        mkl_free(buf)

      # Printing the accumulator keeps the reads above observable.
      echo "just printed to foll gcc: ", acc
    
    # Run the whole benchmark twice, printing a banner before each pass.
    for banner in ["--- initial ---", "--- after warmup ---"]:
      echo banner
      main()
    
    
    Run

Reply via email to