You're mistaken, there is no OpenMP on OSX; Apple wants to force everyone to 
use Grand Central Dispatch (which — as someone writing a multithreading 
runtime, I must say — is a fine library, but not portable to Windows at the moment).

So if you're benchmarking on OSX you should get GCC or Clang from Homebrew.

To wait for all you can add `sync()` at the end.

To wait for only some of them, you unfortunately must return a dummy value at 
the moment, due to 
[https://github.com/nim-lang/Nim/issues/8040](https://github.com/nim-lang/Nim/issues/8040).

So without further ado
    
    
    import cpuinfo, times, math, threadpool
    
    # Bench script
    # ----------------------------------------------------------------------------------------------------

    template benchmark(benchName: string, body: untyped) =
      ## Run `body` once and echo the wall-clock time it took, labelled
      ## with `benchName`.
      ## Wall-clock (not CPU) time is deliberate: when multithreading,
      ## CPU time would sum the time spent on every processor.
      let wallStart = epochTime()
      body
      let elapsed = epochTime() - wallStart
      echo "Wall time for ", benchName, ": ", round(elapsed, 3), " s"
    
    # OpenMP
    # ----------------------------------------------------------------------------------------------------

    # Add OpenMP to the C compile and link flags.
    # NOTE(review): this requires a compiler with OpenMP support (GCC, or
    # Clang with libomp) — Apple's system Clang on OSX does not ship OpenMP.
    {.passC:"-fopenmp".}
    {.passL:"-fopenmp".}
    
    # Nim native threading
    # ----------------------------------------------------------------------------------------------------

    template parallelChunks(start, stop: int, chunkOffset, chunkSize, threadID: untyped{ident}, body: untyped): untyped =
      ## Splits the half-open range `start ..< stop` into one chunk per core
      ## and expands `body` in a loop, once per chunk, with `chunkOffset`,
      ## `chunkSize` and `threadID` declared (injected) for use inside `body`.
      ## Each chunk covers `chunkOffset ..< chunkOffset + chunkSize`, so that
      ## a whole chunk can be processed on the same core.
      let
        numIters = (stop - start)
        # Guard against countProcessors() returning 0 (unknown CPU count),
        # which would otherwise cause a division by zero below.
        numChunks = max(1, countProcessors())
        baseChunkSize = numIters div numChunks
        remainder = numIters mod numChunks

      # A naive chunking scheme
      #   chunkOffset = chunkSize * threadID
      #   chunkSize   = base size, remainder dumped on the last thread
      # can lead to severe load imbalance: dividing 40 items on 12 threads
      # gives a base chunk of 40 div 12 = 3, i.e.
      #   3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7 = 3*11 + 7 = 40
      #
      # Instead, spread the remainder over the first `remainder` threads:
      #   4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3 = 4*4 + 3*8 = 40

      var chunkOffset {.inject.}, chunkSize {.inject.}: Natural

      for threadID {.inject.} in 0 ..< numChunks:
        if threadID < remainder:
          # The first `remainder` chunks each take one extra item.
          chunkOffset = start + (baseChunkSize + 1) * threadID
          chunkSize = baseChunkSize + 1
        else:
          chunkOffset = start + baseChunkSize * threadID + remainder
          chunkSize = baseChunkSize

        block: body
    
    # Benches
    # ----------------------------------------------------------------------------------------------------

    # Number of iterations for every benchmark below.
    # `const` instead of `let`: the value is known at compile time.
    const maxLim = 1_000_000

    proc doSth(i: int) =
      ## Dummy per-item workload for the benchmarks.
      ## `discard` instead of an unused `let k`: same no-op computation,
      ## without an unused-variable warning.
      discard i*2
    
    benchmark "normal loop":
      # Single-threaded baseline: maxLim sequential calls on one core.
      for i in 1..maxLim:
        doSth(i)
    
    # Don't interleave stacktraces, they require to allocate strings
    # and OpenMP doesn't create a GC or would require "setupForeignThreadGc()"
    {.push stackTrace:off.}
    benchmark "parallelized OpenMP":
      # `||` is system's OpenMP parallel-for iterator: iterations may run
      # on different OpenMP threads (no effect if OpenMP is unavailable).
      for i in 1||maxLim:
        doSth(i)
    {.pop.}
    
    type Dummy = bool
      ## Dummy return type so void `spawn`s can be waited on with `^`:
      ## https://github.com/nim-lang/Nim/issues/8040

    proc chunkedDoSth(chunkOffset, chunkSize: Natural): Dummy =
      ## Processes one chunk on the same core.
      ## It returns a dummy value so that we can wait on the FlowVar.
      # Bug fix: the original iterated `chunkOffset ..< chunkSize`, which
      # walks the wrong items — and does nothing at all once
      # chunkOffset >= chunkSize (true for every chunk but the first).
      # A chunk covers `chunkOffset ..< chunkOffset + chunkSize`.
      for i in chunkOffset ..< chunkOffset + chunkSize:
        doSth(i)
    
    # Let's not use heap allocation for this, but we need to wrap in a proc:
    # alloca allocates on the caller's C stack frame, so the memory lives
    # only until the enclosing proc returns.
    when defined(windows):
      proc alloca(size: csize): pointer {.header: "<malloc.h>".}
    else:
      proc alloca(size: csize): pointer {.header: "<alloca.h>".}
    
    proc parNimSpawn() =
      ## Runs the chunked workload on Nim's threadpool via `spawn`,
      ## then blocks until every chunk has completed.
      # Stack-allocate the FlowVar array with alloca: no heap allocation,
      # and the memory stays valid until this proc returns — i.e. after
      # all the spawns have been waited on below.
      var tasks: ptr UncheckedArray[FlowVar[Dummy]]
      tasks = cast[type tasks](alloca(countProcessors() * sizeof(FlowVar[Dummy])))

      # Transforming this into a nice "parallel_for" is left as an exercise
      # to the reader.
      # `parallelChunks` treats the range as half-open ([start, stop)), so
      # pass maxLim + 1 to cover 1..maxLim like the other benchmarks
      # (the original passed maxLim and processed one item too few).
      parallelChunks(1, maxLim + 1, chunkOffset, chunkSize, threadID):
        # Spawn a task for each chunk
        tasks[threadID] = spawn chunkedDoSth(chunkOffset, chunkSize)

      # Wait for all tasks; `^` blocks until the FlowVar is ready.
      # `discard` instead of an unused `let dummy`.
      for i in 0 ..< countProcessors():
        discard ^tasks[i]
    
    benchmark "parallelized Nim spawn":
      # Threadpool version; all spawned chunks are awaited inside parNimSpawn,
      # so the wall-clock measurement covers the full parallel run.
      parNimSpawn()
    
    
    Run

Reply via email to