First of all, I never used the parallel pragma, I don't know what it does under 
the hood. However I see critical issues with your benchmark:

  * When you report "CPU Time" are you using "cpuTime", "epochTime" or a 
monotonic clock?

> cpuTime or monotonic clocks are tricky to use for multithreading because 
> cpuTime counts the number of cycles spent by the CPU, but if you have a 
> parallel workload that takes 1000 cycles per core on 10 cores, you might get 
> a report of 10000 cycles (divided by CpuFreq). And with multithreading 
> overhead and such you might report a total of 12000 cycles instead of 10000 
> cycles for serial processing on a single core, and then it seems slower.

  * As I said, I don't know how "parallel" works, but I fear you are doing the 
same work on each core instead of splitting the work across cores.
  * Unless your "benchmark" template has a special trick, in the serial case 
the compiler will see that your computation is side-effect free and will 
**optimize it away**, so you are effectively benchmarking an empty statement.



Now, for your use-case I would use either OpenMP or cook up a parallel-for 
using raw spawn. Here you go, comments are inline, make sure to create a bench 
that the compiler cannot optimize away.
    
    
    import cpuinfo, math, strformat, threadpool, times
    import std/monotimes
    
    # Bench script
    # 
----------------------------------------------------------------------------------------------------
    
    template benchmark(benchName: string, body: untyped) =
      ## Measure the wall-clock time of `body` and echo it.
      ##
      ## A monotonic clock is used on purpose:
      ## - unlike epochTime, it cannot jump backwards or forwards when the
      ##   system clock is adjusted (NTP), so short measurements stay sane;
      ## - unlike cpuTime, it does not add up the time spent on every core,
      ##   which would make a parallel run look *slower* than a serial one.
      let start = getMonoTime()
      body
      let stop = getMonoTime()
      # Duration -> seconds as float for human-readable output.
      let elapsed = (stop - start).inNanoseconds.float / 1e9
      # strformat interpolation inside templates was unreliable in older Nim
      # versions, so stick to plain echo:
      # echo &"Wall time for {benchName:<20}: {elapsed:3.4f}s"
      echo "Wall time for ", benchName, ": ", round(elapsed, 3), " s"
    
    # OpenMP
    # 
----------------------------------------------------------------------------------------------------
    
    # Add OpenMP to compilation flags
    # (GCC/Clang syntax; MSVC would need "/openmp" instead — TODO confirm toolchain)
    {.passC:"-fopenmp".}
    {.passL:"-fopenmp".}
    
    # Nim native threading
    # 
----------------------------------------------------------------------------------------------------
    
    template parallelChunks(start, stop: int, chunkOffset, chunkSize: untyped{ident}, body: untyped): untyped =
      ## In-place declare and define "chunkOffset" and "chunkSize"
      ## that correspond to a slice of the start..stop range
      ## that will be processed on the same core.
      ##
      ## Note: this template only computes chunk bounds and runs `body` once
      ## per chunk on the *calling* thread — any parallelism must come from
      ## `body` itself (e.g. by calling `spawn`).
      ##
      ## NOTE(review): `numIters = stop - start` treats the range as half-open
      ## (start ..< stop), so `stop` itself is never part of any chunk —
      ## confirm this matches the caller's expectation.
      let
        numIters = (stop - start)
        numChunks = countProcessors()   # one chunk per logical core
        baseChunkSize = numIters div numChunks
        remainder = numIters mod numChunks
      
      # The following simple chunking scheme can lead to severe load imbalance
      #
      # `chunkOffset`{.inject.} = chunkSize * threadId
      # `chunkSize`{.inject.} =  if threadId < nb_chunks - 1: chunkSize
      #                          else: numIters - chunkOffset # remainder if division isn't exact
      #
      # For example dividing 40 items on 12 threads will lead to
      # a base_chunk_size of 40/12 = 3 so work on the first 11 threads
      # will be 3 * 11 = 33, and the remainder 7 on the last thread.
      
      # Instead of dividing 40 work items on 12 cores into:
      # 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7 = 3*11 + 7 = 40
      # the following scheme will divide into
      # 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3 = 4*4 + 3*8 = 40
      
      # Injected so that `body` can see and capture both identifiers.
      var chunkOffset {.inject.}, chunkSize {.inject.}: Natural
      
      for threadID in 0 ..< numChunks:
        # The first `remainder` chunks each take one extra item, which keeps
        # chunk sizes within 1 of each other (balanced load).
        if threadID < remainder:
          chunkOffset = start + (baseChunkSize + 1) * threadID
          chunkSize = baseChunkSize + 1
        else:
          chunkOffset = start + baseChunkSize * threadID + remainder
          chunkSize = baseChunkSize
        
        block: body
    
    # Benches
    # 
----------------------------------------------------------------------------------------------------
    
    let maxLim = 1_000_000
    
    # Global sink the per-item work is accumulated into. Without an observable
    # side effect the compiler can prove the loop body does nothing and delete
    # it, so every benchmark would be timing an empty loop (the original
    # `let k = i*2` was exactly such dead code).
    var benchSink: int
    
    proc doSth(i: int) =
      ## Minimal per-item workload with an observable result, so the
      ## optimizer must actually perform it. `atomicInc` keeps the
      ## accumulation safe when called from several threads (OpenMP / spawn).
      atomicInc(benchSink, i*2)
    
    benchmark "normal loop":
      # Serial baseline: plain inclusive loop 1..maxLim on a single core.
      for i in 1..maxLim:
        doSth(i)
    
    # Don't interleave stacktraces, they require to allocate strings
    # and OpenMP doesn't create a GC or would require "setupForeignThreadGc()"
    {.push stackTrace:off.}
    benchmark "parallelized OpenMP":
      # `||` is the system OpenMP parallel-for iterator; it only actually
      # parallelizes when compiled with the "-fopenmp" flags set above,
      # otherwise it degrades to a normal serial loop.
      for i in 1||maxLim:
        doSth(i)
    {.pop.}
    
    proc chunkedDoSth(chunkOffset, chunkSize: Natural) =
      ## Process one chunk of `chunkSize` items starting at `chunkOffset`;
      ## a whole chunk is processed on the same core.
      # Bug fix: `chunkSize` is a length, not an end bound. The original
      # `chunkOffset ..< chunkSize` iterated the wrong range — empty for every
      # chunk whose offset is past its size, and too large for the first ones.
      for i in chunkOffset ..< chunkOffset + chunkSize:
        doSth(i)
    
    benchmark "parallelized Nim spawn":
      parallelChunks(1, maxLim, chunkOffset, chunkSize):
        # Spawn a task per chunk; spawn returns immediately after queueing.
        spawn chunkedDoSth(chunkOffset, chunkSize)
      # Bug fix: wait for all spawned tasks before stopping the clock.
      # Without sync() we would only measure task *submission* time, and the
      # program could even exit before the worker threads finish.
      sync()
    
    
    Run

Reply via email to