Try:
function foo_old!(a)
for i in 1:size(a, 2)
a[:, i] /= norm(a[:, i])
end
return a
end
function foo_new!(a)
for i in 1:size(a, 2)
s = zero(eltype(a))
@simd for j = 1:size(a,1)
@inbounds s += abs2(a[j, i])
end
scale_factor = 1 / sqrt(s)
@simd for j = 1:size(a,1)
@inbounds a[j, i] *= scale_factor
end
end
return a
end
a = rand(1000,10000);
@time foo_old!(a);
@time foo_old!(a);
@time foo_old!(a);
a = rand(1000,10000);
@time foo_new!(a);
@time foo_new!(a);
@time foo_new!(a);
On my machine, foo_new! is at least 10x faster than foo_old!, and also
avoids all the allocations for the slices a[:,i].