Since I only care about square matrices, something like this was the 
fastest of the things I tried:

"""
    get_index(t::NTuple{N}, i::Int, j::Int) where {N}

Return element `(i, j)` of a square matrix stored column-major in the flat
tuple `t`, where `N` must be a perfect square (`N == rows^2`).

The side length is derived from the type parameter `N` at compile time, so
the generated body contains only a single constant-stride tuple index.
"""
@generated function get_index(t::NTuple{N}, i::Int, j::Int) where {N}
    # isqrt gives the exact integer square root; Int(sqrt(N)) goes through
    # floating point and can mis-round for large N.
    rows = isqrt(N)
    return quote
        # Column-major layout: column j starts at offset (j-1)*rows.
        @inbounds v = t[(j - 1) * $rows + i]
        return v
    end
end

"""
    loopdot_flatten(A::NTuple{N,T}, B::NTuple{N,T}, i, j) where {N, T}

Compute the dot product of row `i` of `A` with column `j` of `B`, where both
are square matrices stored column-major as flat `N`-tuples (`N == rows^2`).

The loop bound is baked in from the type parameter `N` at compile time so the
`@simd` loop has a constant trip count.
"""
@generated function loopdot_flatten(A::NTuple{N, T}, B::NTuple{N, T},
                                    i, j) where {N, T}
    # Exact integer square root of the element count gives the side length.
    rows = isqrt(N)
    return quote
        # zero(T)*zero(T) seeds the accumulator with the product type,
        # keeping the sum type-stable.
        s = zero(T) * zero(T)
        @inbounds @simd for k = 1:$rows
            s += get_index(A, i, k) * get_index(B, k, j)
        end
        s
    end
end

"""
    unrolled_matmult_flatten(A::NTuple{N}, B::NTuple{N}) where {N}

Multiply two square matrices stored column-major as flat `N`-tuples
(`N == rows^2`), returning the product in the same flat-tuple layout.

The generated body is a fully unrolled tuple expression with one
`loopdot_flatten` call per output element.
"""
@generated function unrolled_matmult_flatten(A::NTuple{N}, B::NTuple{N}) where {N}
    # Exact integer square root of the element count gives the side length.
    rows = isqrt(N)
    # The i=1:rows, j=1:rows comprehension enumerates (row, col) pairs in
    # column-major order, matching the storage layout of the inputs.
    return Expr(:tuple, [:(loopdot_flatten(A, B, $i, $j)) for i = 1:rows,
                                                              j = 1:rows]...)
end


"""
    bench_flatten(A)

Report (via `@time`) the cost of 10^5 repeated tuple-based matrix
multiplications of `A` with itself.
"""
function bench_flatten(A)
    @time for _ in 1:100_000
        unrolled_matmult_flatten(A, A)
    end
end

It ends up being about 2x slower for 9x9 matrices than a simple loop over 
vectors.

On Friday, February 26, 2016 at 8:27:51 PM UTC+1, Kristoffer Carlsson wrote:
>
> The non inlined version is 5 times slower on 0.5 than 0.4 as well.

Reply via email to