Writing line #13 as a BLAS call further reduced memory allocation. Thanks for all the tips (BLAS, NumericExtensions, InplaceOps, etc.).
using ArrayViews
using LinearAlgebra
"""
    mgs(X)

Orthogonalize the columns of `X` with the modified Gram-Schmidt process,
without normalizing them.

`X` is overwritten in place: on return its columns are mutually orthogonal.
`R` is a unit upper-triangular matrix with the same element type as `X` such
that the original input equals `X * R` (up to rounding).

A column whose squared norm is exactly zero is skipped: the corresponding row
of `R` keeps its zero off-diagonal entries and the remaining columns are left
untouched, instead of being filled with `NaN` by a division by zero.

# Arguments
- `X`: a strided matrix with element type `Float32` or `Float64` (the update
  steps call BLAS directly). It is modified in place.

# Returns
- `(X, R)`: the same array object that was passed in, and the
  `size(X, 2) × size(X, 2)` coefficient matrix.

# Throws
- `ArgumentError` if `eltype(X)` is not `Float32` or `Float64`.
"""
function mgs(X)
    nobs, nvars = size(X)
    T = eltype(X)
    T <: Union{Float32,Float64} ||
        throw(ArgumentError("mgs requires a Float32 or Float64 matrix, got eltype $T"))

    R = Matrix{T}(I, nvars, nvars)
    # One contiguous scratch buffer reused by every iteration, so the loop body
    # does not allocate a new coefficient vector each time.
    coeffs = zeros(T, max(nvars - 1, 0))

    for l in 1:(nvars - 1)
        rest = (l + 1):nvars
        s = @view X[:, l]       # current pivot column
        V = @view X[:, rest]    # columns still to be orthogonalized against s

        ss = sum(abs2, s)
        # A zero pivot column has no direction to project onto.
        iszero(ss) && continue

        r = @view coeffs[1:length(rest)]
        # r = (V' * s) / (s' * s)  -- projection coefficients (originally "line 13")
        BLAS.gemv!('T', inv(ss), V, s, zero(T), r)
        R[l, rest] = r
        # V -= s * r'  -- rank-1 update done in place on X (originally "line 14")
        BLAS.ger!(-one(T), s, r, V)
    end

    return X, R
end
