@jackmott
Something like _multiply_avr_ below does the trick.
It gives a 5x speedup on my computer when compiling in _normal_ mode.
When using _-d:release_, _multiply_avr_ doesn't run any faster than _multiply_. The C compiler seems to be
smart enough to use SIMD on its own for the very simple loop in _multiply_.
import times, x86_avx
const
  N = 8_000    ## Length of each benchmark vector.
  M = 100_000  ## Number of timed calls made per implementation.
proc multiply(a, b, d: var seq[float32]) =
  ## Scalar reference implementation: stores the element-wise product
  ## `a[i] * b[i]` in `d[i]` for every index of `a`.
  ##
  ## `b` and `d` must be at least as long as `a`.
  # Bound the loop by the input itself rather than a global constant so
  # the proc is correct for any length.
  for ix in 0 ..< a.len:
    d[ix] = a[ix] * b[ix]
proc multiply_avr(a, b, d: var seq[float32]) =
  ## Element-wise product `d[i] = a[i] * b[i]`, processed eight `float32`
  ## values per step with the `loadu_ps_256` / `mul_ps` / `storeu_ps`
  ## wrappers from `x86_avx`.
  ##
  ## `b` and `d` must be at least as long as `a`.
  const lanes = 8  # float32 values handled per vector step
  let
    n = a.len
    vecEnd = n - n mod lanes  # end of the range covered by whole vectors
  for ix in countup(0, vecEnd - 1, lanes):
    let
      av = loadu_ps_256(addr a[ix])
      bv = loadu_ps_256(addr b[ix])
      rv = mul_ps(av, bv)
    storeu_ps(addr d[ix], rv)
  # Scalar tail: when the length is not a multiple of `lanes`, another
  # full-width load/store would reach past the end of the buffers.
  for ix in vecEnd ..< n:
    d[ix] = a[ix] * b[ix]
proc test(f: proc (a, b, d: var seq[float32])) =
  ## Benchmarks `f`: builds two `N`-element inputs holding `0, 1, 2, ...`
  ## plus an `N`-element output buffer, calls `f` on them `M` times and
  ## prints the CPU time the calls took.
  var
    a = newSeq[float32](N)
    b = newSeq[float32](N)
    d = newSeq[float32](N)
  for ix in 0 ..< N:
    a[ix] = float32(ix)
    b[ix] = float32(ix)
  # Only the repeated calls are timed; allocation and fill are excluded.
  let t0 = cpuTime()
  for _ in 1 .. M:
    f(a, b, d)
  let tt = cpuTime() - t0
  echo("Elapsed time: ", tt, " seconds")
# Benchmark driver: time the plain loop first, then the `x86_avx` variant.
echo("--- normal multiply ---")
test multiply
echo("--- avr multiply ---")
test multiply_avr
echo("---")