You're mistaken: there is no OpenMP on OSX. Apple wants to force everyone to use Grand Central Dispatch (which, as someone writing a multithreading runtime, I must say is a fine library, but it is not portable to Windows at the moment).
# So if you're benchmarking on OSX you should get GCC or Clang from Homebrew.
# To wait for all you can add `sync()` at the end. To wait for some,
# unfortunately you must return a dummy value at the moment due to
# [https://github.com/nim-lang/Nim/issues/8040](https://github.com/nim-lang/Nim/issues/8040).
# So without further ado

import cpuinfo, times, math, threadpool

# Bench script
# ----------------------------------------------------------------------------------------------------

template benchmark(benchName: string, body: untyped) =
  ## Runs `body` once and echoes the elapsed wall-clock time in seconds
  ## (rounded to 3 decimals), labelled with `benchName`.
  # When multithreading, make sure to measure wall-clock time
  # If you use CPU time you might measure the cumulated CPU time on each processor.
  let start = epochTime()
  body
  let stop = epochTime()
  let elapsed = stop-start
  echo "Wall time for ", benchName, ": ", round(elapsed, 3), " s"

# OpenMP
# ----------------------------------------------------------------------------------------------------

# Add OpenMP to compilation flags
{.passC:"-fopenmp".}
{.passL:"-fopenmp".}

# Nim native threading
# ----------------------------------------------------------------------------------------------------

template parallelChunks(start, stop: int, chunkOffset, chunkSize, threadID: untyped{ident}, body: untyped): untyped =
  ## Splits the half-open range `start ..< stop` into one contiguous chunk
  ## per detected processor and runs `body` once per chunk.
  ##
  ## In-place declares and defines `chunkOffset` (first index of the chunk),
  ## `chunkSize` (number of items in the chunk) and `threadID` (chunk index),
  ## all visible inside `body`. A chunk corresponds to a slice of the range
  ## that is meant to be processed on the same core.
  let
    numIters = (stop - start)
    # countProcessors() returns 0 when detection fails; clamp to avoid a
    # division by zero below.
    numChunks = max(countProcessors(), 1)
    baseChunkSize = numIters div numChunks
    remainder = numIters mod numChunks

  # The following simple chunking scheme can lead to severe load imbalance
  #
  #   chunkOffset = baseChunkSize * threadID
  #   chunkSize   = if threadID < numChunks - 1: baseChunkSize
  #                 else: numIters - chunkOffset # remainder if division isn't exact
  #
  # For example dividing 40 items on 12 threads will lead to
  # a baseChunkSize of 40/12 = 3 so work on the first 11 threads
  # will be 3 * 11 = 33, and the remainder 7 on the last thread.
  #
  # Instead of dividing 40 work items on 12 cores into:
  # 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7 = 3*11 + 7 = 40
  # the following scheme will divide into
  # 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3 = 4*4 + 3*8 = 40
  var chunkOffset {.inject.}, chunkSize {.inject.}: Natural

  for threadID{.inject.} in 0 ..< numChunks:
    if threadID < remainder:
      # The first `remainder` chunks each take one extra item
      chunkOffset = start + (baseChunkSize + 1) * threadID
      chunkSize = baseChunkSize + 1
    else:
      chunkOffset = start + baseChunkSize * threadID + remainder
      chunkSize = baseChunkSize

    block:
      body

# Benches
# ----------------------------------------------------------------------------------------------------

let maxLim = 1_000_000

proc doSth(i: int) =
  ## Placeholder workload for the benchmarks.
  let k = i*2

benchmark "normal loop":
  for i in 1..maxLim:
    doSth(i)

# Don't interleave stacktraces, they require to allocate strings
# and OpenMP doesn't create a GC or would require "setupForeignThreadGc()"
{.push stackTrace:off.}
benchmark "parallelized OpenMP":
  for i in 1||maxLim:
    doSth(i)
{.pop.}

type Dummy = bool
  ## Allow waiting on void spawns: https://github.com/nim-lang/Nim/issues/8040

proc chunkedDoSth(chunkOffset, chunkSize: Natural): Dummy =
  ## A chunk is processed on the same core.
  ## Processes the `chunkSize` items starting at index `chunkOffset`.
  ## It returns a dummy value so that we can wait on it.
  # `chunkSize` is a length, not an end index: the upper bound is offset + size.
  for i in chunkOffset ..< chunkOffset + chunkSize:
    doSth(i)

# Let's not use heap allocation for this, but we need to wrap in a proc
when defined(windows):
  proc alloca(size: csize): pointer {.header: "<malloc.h>".}
else:
  proc alloca(size: csize): pointer {.header: "<alloca.h>".}

proc parNimSpawn() =
  ## Spawns one `chunkedDoSth` task per chunk of `1 .. maxLim` on the Nim
  ## threadpool, then blocks until every task has completed.
  # Same clamped count as `parallelChunks` uses, so that the buffer size,
  # the number of spawned tasks and the wait loop all agree.
  let numTasks = max(countProcessors(), 1)
  let numBytes = numTasks * sizeof(FlowVar[Dummy])

  var tasks: ptr UncheckedArray[FlowVar[Dummy]]
  tasks = cast[type tasks](alloca(numBytes))
  # alloca returns uninitialized memory; zero it so that assigning a FlowVar
  # never sees a garbage reference as the slot's previous value.
  zeroMem(tasks, numBytes)

  # Transforming this into a nice "parallel_for" is left as an exercise to the reader
  # `parallelChunks` takes a half-open range, hence `maxLim + 1` to cover
  # 1 .. maxLim like the other benchmarks.
  parallelChunks(1, maxLim + 1, chunkOffset, chunkSize, threadID):
    # Spawn a task for each chunk
    tasks[threadID] = spawn chunkedDoSth(chunkOffset, chunkSize)

  # Wait all
  for i in 0 ..< numTasks:
    discard ^tasks[i]

benchmark "parallelized Nim spawn":
  parNimSpawn()
