Hi,
Trying to write custom and using BLAS functions implementation of
Gram-Schmidt algorithm I got more than 2-times slower performance for Julia
0.4.3 on my computer Intel i7 6700HQ (on older processor i7 5500 the
performance gain is 1.2-times). The code below is a bit longer, but I got
the slow performance in the whole context. Trying to profile parts of the
algorithm I got only slightly different performance.
Custom implementation:
function rank1update!(DST, A, R, k)
rows, cols = size(A)
for j = k+1:cols
@simd for i = 1:rows
@inbounds DST[i,j] -= A[i, k] * R[k, j]
end
end
end
function mygemv!(DST, A, k, alpha)
rows, cols = size(A)
for j in k+1:cols
s = 0.0
@simd for i in 1:rows
@inbounds s += A[i, k] * A[i, j]
end
DST[k, j] = s * alpha
end
end
function mgsf(M)
rows, cols = size(M)
Q = copy(M)
R = eye(cols)
for k in 1:cols
alpha = 1.0 / sumabs2(sub(Q, :, k))
mygemv!(R, Q, k, alpha)
rank1update!(Q, Q, R, k)
end
Q, R
end
Implementation using BLAS functions:
function mgs_blas(M)
cols = size(M, 2)
Q = copy(M)
R = eye(cols)
for k in 1:cols
q_k = sub(Q, :, k)
Q_sub = sub(Q, :, k+1:cols)
R_sub = sub(R, k, k+1:cols)
alpha = 1.0 / sumabs2(q_k)
R[k, k+1:cols] = BLAS.gemv('T', alpha, Q_sub, q_k)
BLAS.ger!(-1.0, q_k, vec(R_sub), Q_sub)
end
Q, R
end
And results; using BLAS the performance gain is ~2.6 times:
# custom implementation
Q2, R2 = @time mgsf(T);
0.714916 seconds (4.99 k allocations: 15.411 MB, 0.08% gc time)
# implementation using BLAS functions
Q5, R5 = @time mgs_blas(T);
0.339278 seconds (16.45 k allocations: 23.521 MB, 0.76% gc time)
A hint: Looking at performance graph in the Task Manager it seems BLAS uses
more cores.
The question that remains is: what is going on?
Thanks for explanation.