Since I only care about square matrices, something like this was the
fastest of the things I tried:
# Look up element (i, j) of a square matrix stored column-major as a flat tuple.
@generated function get_index{N}(t::NTuple{N}, i::Int, j::Int)
    rows = Int(sqrt(N))
    return quote
        @inbounds v = t[(j-1) * $rows + i]
        return v
    end
end

# Dot product of row i of A with column j of B for flat-tuple storage,
# with the row count spliced into the generated loop.
@generated function loopdot_flatten{N, T}(A::NTuple{N, T}, B::NTuple{N, T}, i, j)
    rows = Int(sqrt(N))
    return quote
        s = zero(T) * zero(T)
        @inbounds @simd for k = 1:$rows
            s += get_index(A, i, k) * get_index(B, k, j)
        end
        s
    end
end

# Build the result tuple with one loopdot_flatten call per entry.
@generated function unrolled_matmult_flatten{N}(A::NTuple{N}, B::NTuple{N})
    rows = Int(sqrt(N))
    return Expr(:tuple, [:(loopdot_flatten(A, B, $i, $j)) for i = 1:rows, j = 1:rows]...)
end

function bench_flatten(A)
    @time for i in 1:10^5
        unrolled_matmult_flatten(A, A)
    end
end
For 9x9 matrices this ends up being about 2x slower than a simple loop over ordinary vectors.
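For context, the loop baseline I mean is just a plain triple loop over regular arrays, roughly like the sketch below (illustrative only; loop_matmult!, bench_loop, and the 9x9 setup are made up for the example, not the exact code I timed):

# Sketch of a plain-loop baseline on ordinary arrays (names are illustrative).
function loop_matmult!(C, A, B)
    rows = size(A, 1)
    for j = 1:rows, i = 1:rows
        s = zero(eltype(A)) * zero(eltype(B))
        @inbounds @simd for k = 1:rows
            s += A[i, k] * B[k, j]
        end
        @inbounds C[i, j] = s
    end
    return C
end

function bench_loop(A)
    C = similar(A)
    @time for i in 1:10^5
        loop_matmult!(C, A, A)
    end
end

# Example comparison on a 9x9 matrix:
# A = rand(9, 9)
# bench_loop(A)
# bench_flatten(tuple(A...))   # column-major flattening matches get_index above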
On Friday, February 26, 2016 at 8:27:51 PM UTC+1, Kristoffer Carlsson wrote:
>
> The non inlined version is 5 times slower on 0.5 than 0.4 as well.