Hi,

Trying to write both a custom implementation and a BLAS-based implementation of the
modified Gram-Schmidt algorithm, I got more than 2-times slower performance from the
custom version on Julia 0.4.3 on my Intel i7 6700HQ (on an older i7 5500 processor
the BLAS version is only about 1.2-times faster). The code below is a bit longer, but
the slow performance only shows up in the whole context; profiling parts of the
algorithm separately, I get only slightly different performance.
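
The matrix T used in the timings below is just a dense Float64 matrix; its size is
not shown here, so as a reproducible stand-in assume something like:

T = randn(2000, 2000)   # hypothetical test matrix; the real T is some dense Float64 matrix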

Custom implementation:

function rank1update!(DST, A, R, k)
    # In-place rank-1 update of the trailing columns:
    # DST[:, j] -= A[:, k] * R[k, j] for j = k+1:cols
    rows, cols = size(A)
    for j = k+1:cols
        @simd for i = 1:rows
            @inbounds DST[i, j] -= A[i, k] * R[k, j]
        end
    end
end

function mygemv!(DST, A, k, alpha)
    # Fill row k of R: DST[k, j] = alpha * dot(A[:, k], A[:, j]) for j = k+1:cols
    rows, cols = size(A)
    for j in k+1:cols
        s = 0.0
        @simd for i in 1:rows
            @inbounds s += A[i, k] * A[i, j]
        end
        DST[k, j] = s * alpha
    end
end

function mgsf(M)
    rows, cols = size(M)
    Q = copy(M)
    R = eye(cols)

    for k in 1:cols
        alpha = 1.0 / sumabs2(sub(Q, :, k))   # 1 / ||q_k||^2
        mygemv!(R, Q, k, alpha)               # row k of R
        rank1update!(Q, Q, R, k)              # orthogonalize remaining columns against q_k
    end
    Q, R
end
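
As a quick sanity check (a sketch, using the stand-in T from above): the factors
should satisfy T ≈ Q*R, and the columns of Q should be mutually orthogonal (though
not normalized), so the strict upper triangle of Q'Q should be near zero:

Q2, R2 = mgsf(T)
println(maximum(abs(Q2*R2 - T)))          # should be round-off small
println(maximum(abs(triu(Q2' * Q2, 1))))  # mutual orthogonality of the columns of Q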

Implementation using BLAS functions:

function mgs_blas(M)
    cols = size(M, 2)
    Q = copy(M)
    R = eye(cols)

    for k in 1:cols
        q_k = sub(Q, :, k)
        Q_sub = sub(Q, :, k+1:cols)
        R_sub = sub(R, k, k+1:cols)

        alpha = 1.0 / sumabs2(q_k)
        R[k, k+1:cols] = BLAS.gemv('T', alpha, Q_sub, q_k)  # row k of R: alpha * Q_sub' * q_k
        BLAS.ger!(-1.0, q_k, vec(R_sub), Q_sub)             # rank-1 update: Q_sub -= q_k * R_sub'
    end

    Q, R
end

And the results; using BLAS the performance gain is about 2.1 times:

# custom implementation
Q2, R2 = @time mgsf(T);

  0.714916 seconds (4.99 k allocations: 15.411 MB, 0.08% gc time)

# implementation using BLAS functions
Q5, R5 = @time mgs_blas(T);

  0.339278 seconds (16.45 k allocations: 23.521 MB, 0.76% gc time)



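Both implementations return the same factors up to round-off, which can be checked with:

println(maximum(abs(Q2 - Q5)))   # should be round-off small
println(maximum(abs(R2 - R5)))
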
A hint: looking at the performance graph in the Task Manager, it seems BLAS uses
more cores (a quick way to test this is sketched below).
The question that remains is: what is going on?
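
One way to test the multithreading hypothesis is to pin BLAS to a single thread and
repeat the timing (on 0.4 the function is blas_set_num_threads; later versions rename
it to BLAS.set_num_threads):

blas_set_num_threads(1)        # force single-threaded BLAS
Q5, R5 = @time mgs_blas(T);    # compare against the multi-threaded timing above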

Thanks for any explanation.
