@jackmott

Something like _multiply_avr_ below does the trick.

It gives a 5x speedup on my computer when compiling in _normal_ (non-release) mode.

When using _-d:release_ the AVX version doesn't run any faster than the plain
loop. The compiler seems to be smart enough to use SIMD for the very simple loop in _multiply_.
    
    
    import times, x86_avx
    
    const
      # Number of float32 elements in each of the seqs allocated by `test`.
      N =   8_000
      # Number of times `test` calls the proc being timed.
      M = 100_000
    
    proc multiply(a, b, d: var seq[float32]) =
      ## Element-wise product: stores `a[i] * b[i]` in `d[i]`.
      ##
      ## The loop covers the common prefix of the three seqs (the shortest of
      ## their lengths) instead of a global size constant, so the proc works
      ## for seqs of any length and never indexes out of bounds.
      ##
      ## `a` and `b` are only read; they remain `var` so that the proc's type
      ## still matches the callback parameter of `test`.
      let n = min(d.len, min(a.len, b.len))
      for ix in 0 ..< n:
        d[ix] = a[ix] * b[ix]
    
    proc multiply_avr(a, b, d: var seq[float32]) =
      ## Element-wise product `d[i] = a[i] * b[i]`, processed 8 float32 values
      ## at a time through the 256-bit intrinsics of `x86_avx`.
      ##
      ## The work is bounded by the common length of the three seqs. The part
      ## that is a whole multiple of 8 goes through the vector path; any
      ## leftover elements are handled by a scalar loop, so the vector loads
      ## and stores never run past the end of a buffer when the length is not
      ## a multiple of 8.
      ##
      ## `a` and `b` are only read; they remain `var` so that the proc's type
      ## still matches the callback parameter of `test`.
      const lanes = 8  # float32 values per 256-bit register
      let
        n = min(d.len, min(a.len, b.len))
        vecEnd = n - n mod lanes  # first index not covered by a full vector
      for ix in countup(0, vecEnd - 1, lanes):
        let
          av = loadu_ps_256(addr a[ix])
          bv = loadu_ps_256(addr b[ix])
          rv = mul_ps(av, bv)
        storeu_ps(addr d[ix], rv)
      # Scalar tail for the remaining (fewer than 8) elements.
      for ix in vecEnd ..< n:
        d[ix] = a[ix] * b[ix]
    
    proc test(f: proc (a, b, d: var seq[float32])) =
      ## Benchmarks `f`: allocates three seqs of `N` float32 values, fills the
      ## two inputs with `0, 1, 2, ...`, calls `f` on them `M` times and prints
      ## the CPU time spent in those calls.
      var
        a = newSeq[float32](N)
        b = newSeq[float32](N)
        d = newSeq[float32](N)
      for i in 0 .. N - 1:
        let v = float32(i)
        a[i] = v
        b[i] = v
      # Only the repeated calls are timed, not the setup above.
      let started = cpuTime()
      for rep in 1 .. M:
        f(a, b, d)
      echo "Elapsed time: ", cpuTime() - started, " seconds"
    
    # Time the plain loop first, then the AVX variant.
    echo("--- normal multiply ---")
    test multiply
    echo("--- avr multiply ---")
    test multiply_avr
    echo("---")
    

Reply via email to