jxy - I did look at that, and perhaps I am reading the code wrong, but I think
that one makes a compile-time decision about which SIMD feature set is
available, not a runtime one. Is that correct?
I'm looking to be able to build one exe, send it to a computer with SSE or AVX
or AVX512 and have it use the appropriate instructions at runtime.
I have a working prototype now that might illustrate what I am after, but
perhaps there are much better ways of going about it. Also, I haven't yet
checked what the runtime performance is like at all:
import rdstdin, strutils,x86_sse,x86_avx
# Instruction-set flags read by the runtime dispatch in `simd_block`.
# They are hard-coded in this prototype; nothing here probes the CPU.
var
  has_sse: bool = true
  has_sse2: bool = true
  has_avx: bool = false
proc load(a: var m128; s: var seq[float32]; index: int) {.inline.} =
  ## Overwrites `a` with the result of an unaligned load (`loadu_ps`) that
  ## starts at element `index` of `s`.
  ##
  ## `s` is taken as `var` so that `addr` can be applied to its element.
  # NOTE(review): presumably reads 4 consecutive float32 values, so the
  # caller must guarantee they exist - confirm against x86_sse.
  let src = addr s[index]
  a = loadu_ps(src)
proc load(a: var m256; s: var seq[float32]; index: int) {.inline.} =
  ## Overwrites `a` with the result of an unaligned load (`loadu_ps_256`)
  ## that starts at element `index` of `s`.
  ##
  ## `s` is taken as `var` so that `addr` can be applied to its element.
  # NOTE(review): presumably reads 8 consecutive float32 values, so the
  # caller must guarantee they exist - confirm against x86_avx.
  let src = addr s[index]
  a = loadu_ps_256(src)
template simd_block(s: seq[float32], a: untyped, count: untyped,
                    body: untyped) =
  ## Expands `body` once per supported vector width and picks the copy to
  ## execute at runtime from the `has_avx` / `has_sse` flags.
  ##
  ## Inside `body` the caller-chosen identifiers are bound as follows:
  ## * `a`     - a vector register variable (`m256` for AVX, `m128` for SSE)
  ## * `count` - the lane count used for that width (8 for AVX, 4 for SSE)
  ##
  ## AVX is preferred over SSE. When neither flag is set, `body` is not
  ## executed at all. The `s` parameter is not referenced by the template
  ## itself; `body` refers to the caller's own variable.
  if has_avx:
    var
      a: m256
      count = 8
    body
  elif has_sse:
    var
      a: m128
      count = 4
    body
# Demo driver: doubles every element of `s` in place using whichever vector
# width `simd_block` selects at runtime, then prints the result.
var s = @[1.0'f32,2.0'f32,3.0'f32,4.0'f32,1.0'f32,2.0'f32,3.0'f32,4.0'f32]
simd_block(s,a,count):
  # Visit only positions where a full vector of `count` elements fits.
  # The upper bound `s.len - count` replaces the deprecated unary `<`
  # operator and keeps the last load/store from running past the end of
  # the seq when its length is not a multiple of `count`.
  for i in countup(0, s.len - count, count):
    a.load(s,i)
    a = add_ps(a,a)
    storeu_ps(addr s[i],a)
  # Scalar tail: handle the leftover elements (fewer than `count`) that the
  # vector loop above skipped. Empty when the length divides evenly.
  let tailStart = (s.len div count) * count
  for i in tailStart ..< s.len:
    s[i] = s[i] + s[i]
echo s #result is correct!