First of all, I never used the parallel pragma, I don't know what it does under
the hood. However I see critical issues with your benchmark:
* When you report "CPU Time" are you using "cpuTime", "epochTime" or a
monotonic clock?
> cpuTime or monotonic clocks are tricky to use for multithreading because it
> counts the number of cycle spent by the CPU, but if you have a parallel
> workload that takes 1000 cycles per cores on 10 cores, you might get a report
> of 10000 cycles (divided by CpuFreq). And with multithreading overhead and
> such you might report a total of 12000 cycles instead of 10000 cycles for a
> serial processing on a single core and then it seems slower.
* As I said I don't know how "parallel" works, but I fear you are doing the
same work on each core instead of splitting the work across cores.
* Unless your "benchmark" template has special handling, in the serial case
the compiler will see that your computation is side-effect free and will
**optimize it away** so you are effectively benchmarking an empty statement.
Now, for your use-case I would use either OpenMP or cook up a parallel-for
using raw spawn. Here you go, comments are inline, make sure to create a bench
that the compiler cannot optimize away.
import cpuinfo, times, strformat, math, threadpool
# Bench script
# ----------------------------------------------------------------------------------------------------
template benchmark(benchName: string, body: untyped) =
  ## Run `body` once and report its wall-clock duration on stdout.
  # When multithreading, make sure to measure wall-clock time.
  # If you use CPU time you might measure the CPU time on each processor
  # and then add them together.
  let start = epochTime()
  body
  let stop = epochTime()
  let elapsed = stop - start
  # strformat is again broken in templates :/
  # echo &"Wall time for {benchName:<20}: {elapsed:3.4f}s"
  echo "Wall time for ", benchName, ": ", round(elapsed, 3), " s"
# OpenMP
# ----------------------------------------------------------------------------------------------------
# Add OpenMP to compilation flags.
# passC affects C compilation, passL affects linking; both are needed.
# NOTE(review): requires a backend compiler with OpenMP support (e.g. GCC);
# Apple Clang without libomp will fail to build — confirm your toolchain.
{.passC:"-fopenmp".}
{.passL:"-fopenmp".}
# Nim native threading
# ----------------------------------------------------------------------------------------------------
template parallelChunks(start, stop: int, chunkOffset, chunkSize: untyped{ident},
                        body: untyped): untyped =
  ## In-place declares and defines `chunkOffset` and `chunkSize`,
  ## which correspond to a slice of the start..stop range
  ## that will be processed on the same core.
  ## `body` is executed once per chunk (i.e. once per detected processor),
  ## with `chunkOffset`/`chunkSize` set for that chunk.
  let
    numIters = (stop - start)
    numChunks = countProcessors()
    baseChunkSize = numIters div numChunks
    remainder = numIters mod numChunks
  # The following simple chunking scheme can lead to severe load imbalance:
  #
  #   `chunkOffset`{.inject.} = chunkSize * threadId
  #   `chunkSize`{.inject.} = if threadId < nb_chunks - 1: chunkSize
  #                           else: numIters - chunkOffset # remainder if division isn't exact
  #
  # For example dividing 40 items on 12 threads will lead to
  # a base_chunk_size of 40/12 = 3 so work on the first 11 threads
  # will be 3 * 11 = 33, and the remainder 7 on the last thread.
  # Instead of dividing 40 work items on 12 cores into:
  # 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7 = 3*11 + 7 = 40
  # the following scheme will divide into
  # 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3 = 4*4 + 3*8 = 40
  var chunkOffset {.inject.}, chunkSize {.inject.}: Natural
  for threadID in 0 ..< numChunks:
    if threadID < remainder:
      # The first `remainder` chunks each take one extra item
      chunkOffset = start + (baseChunkSize + 1) * threadID
      chunkSize = baseChunkSize + 1
    else:
      chunkOffset = start + baseChunkSize * threadID + remainder
      chunkSize = baseChunkSize
    block: body
# Benches
# ----------------------------------------------------------------------------------------------------
# Number of iterations for every benchmark below
let maxLim = 1_000_000

proc doSth(i: int) =
  ## Dummy per-item workload.
  ## NOTE(review): `k` is unused and this proc has no side effects, so an
  ## optimizing build may delete the call entirely — exactly the pitfall
  ## described in the prose above ("the compiler will optimize it away").
  ## Give the workload an observable effect to benchmark real work.
  let k = i*2
# Serial baseline: one call to doSth per index, on a single core.
benchmark "normal loop":
  for idx in 1 .. maxLim:
    doSth(idx)
# Don't interleave stacktraces, they require to allocate strings
# and OpenMP doesn't create a GC or would require "setupForeignThreadGc()"
{.push stackTrace: off.}
# `||` is Nim's OpenMP parallel-for operator: iterations are split
# across OpenMP threads by the C compiler's runtime.
benchmark "parallelized OpenMP":
  for idx in 1 || maxLim:
    doSth(idx)
{.pop.}
proc chunkedDoSth(chunkOffset, chunkSize: Natural) =
  ## Process one chunk on the same core: the `chunkSize` items
  ## starting at index `chunkOffset`.
  # BUG FIX: the original iterated `chunkOffset ..< chunkSize`, i.e. it
  # treated the size as an end index. Any chunk with offset >= size did no
  # work at all (and the rest processed the wrong items). The correct
  # half-open range is [chunkOffset, chunkOffset + chunkSize).
  for i in chunkOffset ..< chunkOffset + chunkSize:
    doSth(i)
benchmark "parallelized Nim spawn":
  parallelChunks(1, maxLim, chunkOffset, chunkSize):
    # Spawn a task for each chunk
    spawn chunkedDoSth(chunkOffset, chunkSize)
  # BUG FIX: wait for all spawned tasks to finish before the benchmark
  # stops the clock — without sync() we only measure how long it takes
  # to *enqueue* the tasks, not to complete the work.
  sync()
# (removed stray forum "Run" button artifact)