Thanks a lot for this hint Tim. I think this is indeed exactly what I need. 
I have implemented it more or less successfully in the sense that it works 
and computes the correct matrix:

## parallel helpers, taken from:
## http://stackoverflow.com/questions/27677399/julia-how-to-copy-data-to-another-processor-in-julia

# Define global variables in module Main on process `p`.
#
# Every keyword argument `name = value` is assigned as `Main.name = value` on
# the target process, e.g. `sendto(2, n = 10, x = rand(3))`.
#
# The call blocks until all assignments have run on `p`, so the variables
# exist there by the time it returns. Returns `nothing`.
function sendto(p::Int; args...)
    # @sync waits for every @spawnat issued inside it; without it the
    # assignments are fire-and-forget and the caller can race ahead of them
    # (e.g. start a timer or a remote call before the data has arrived).
    @sync begin
        for (nm, val) in args
            @spawnat(p, eval(Main, Expr(:(=), nm, val)))
        end
    end
    return nothing
end

# Define the same global variables on several processes: forwards the keyword
# arguments unchanged to the single-process `sendto` method, one process id at
# a time, in the order given by `ps`.
function sendto(ps::Vector{Int}; args...)
    for pid in ps
        sendto(pid; args...)
    end
    return nothing
end

# Fetch the value bound to `nm` in module `mod` (Main by default) on process
# `p` and return it to the caller.
function getfrom(p::Int, nm::Symbol; mod=Main)
    ref = @spawnat(p, getfield(mod, nm))
    return fetch(ref)
end

# Copy the object bound to `nm` in `from_mod` on process `src` into a variable
# of the same name in `to_mod` on every process listed in `target`.
#
# The value is published once through a RemoteRef owned by `src`; each target
# fetches it from there and assigns it locally.
#
# The call blocks until the value has been published and every target has
# performed its assignment. Returns `nothing`.
function passobj(src::Int, target::Vector{Int}, nm::Symbol;
                 from_mod=Main, to_mod=Main)
    r = RemoteRef(src)
    # Each target-side task blocks in fetch(r) until `src` has published the
    # value. Without @sync the caller could issue further remote calls that
    # run on a target before `nm` has been assigned there.
    @sync begin
        @spawnat(src, put!(r, getfield(from_mod, nm)))
        for to in target
            @spawnat(to, eval(to_mod, Expr(:(=), nm, fetch(r))))
        end
    end
    nothing
end

# Convenience method for a single target process: wraps `target` in a
# one-element vector and delegates to the vector-of-targets method.
passobj(src::Int, target::Int, nm::Symbol; from_mod=Main, to_mod=Main) =
    passobj(src, Int[target], nm; from_mod=from_mod, to_mod=to_mod)

# Pass several named objects from `src` to `target`: calls the single-name
# `passobj` once per symbol in `nms`, in order, with the same modules.
function passobj(src::Int, target, nms::Vector{Symbol};
                 from_mod=Main, to_mod=Main)
    for name in nms
        passobj(src, target, name; from_mod=from_mod, to_mod=to_mod)
    end
    return nothing
end

## variables

# problem size: mat_b is an m-by-n matrix of uniform random numbers
m = 1_000
n = 10_000
mat_b = rand(m, n)

## sequential

# Compute the entries i..n of row `i` of the result: for every column j in
# i:n, the mean over the rows of mat_b[:, i] .* mat_b[:, j].
# Returns a 1-by-(n - i + 1) matrix.
function compute_row_sequential(mat_b, i, n)
    col_i = mat_b[:, i]
    trailing = mat_b[:, i:n]
    products = col_i .* trailing   # col_i is broadcast across the columns
    return mean(products, 1)
end

# reference result: only the upper triangle (columns row:n of each row) is
# written, everything below the diagonal stays zero
mat_a = zeros(n, n)

# time the row-by-row sequential fill
tic()
for row in 1:n
    mat_a[row, row:n] = compute_row_sequential(mat_b, row, n)
end
toc()

## parallel

# launch 3 additional worker processes for the parallel run below
addprocs(3)

# Kernel defined on every process: for each row index in `irange`, write the
# entries row..n of that row of `smat_a` in place. Entry (row, j) is the mean
# over the rows of mat_b[:, row] .* mat_b[:, j].
@everywhere function compute_row_shared!(smat_a, mat_b, irange, n)
    for row in irange
        cols = row:n
        smat_a[row, cols] = mean(mat_b[:, row] .* mat_b[:, cols], 1)
    end
end

# make `n` and `mat_b` available as globals in Main on every worker
sendto(workers(), n = n, mat_b = mat_b)

# n-by-n Float64 result matrix shared among the worker processes
smat_a = SharedArray(Float64, (n, n); pids = workers())

# Time the parallel fill: every process that maps smat_a handles an
# interleaved set of rows (k, k + nprocs, k + 2*nprocs, ...).
tic()
@sync begin
    # Take the starting row from the process's position in procs(smat_a),
    # not from its id: `p - 1` is only a valid starting row when the
    # participating ids happen to be exactly 2, 3, ..., nprocs + 1.
    for (k, p) in enumerate(procs(smat_a))
        @async begin
            irange = k:length(procs(smat_a)):n
            # NOTE(review): mat_b is passed as a remote-call argument here,
            # so it is presumably shipped to each worker again inside the
            # timed region although sendto already copied it - confirm
            # whether the worker-side copy could be used instead.
            remotecall_wait(p, compute_row_shared!, smat_a, mat_b, irange, n)
        end
    end
end
toc()

# sanity check: parallel result must equal the sequential one
println(mat_a == smat_a)

The last line prints true, but I tried different values of m and n and 
could not find a case where the parallel implementation is faster than 
the sequential one. So either I'm doing something wrong, or this is not 
the best approach for this case...

I'll keep trying to improve the efficiency; any advice is again welcome :)

Thank you.

Reply via email to