Thanks a lot for this hint, Tim. I think this is indeed exactly what I need. I have implemented it more or less successfully, in the sense that it works and computes the correct matrix:
## parallel helpers, from there: http://stackoverflow.com/questions/27677399/julia-how-to-copy-data-to-another-processor-in-julia

# Assign each keyword argument `nm = val` as a variable in `Main` on process `p`.
# The `@spawnat` result is neither fetched nor waited on here, so this call
# returns without confirming that the assignment has completed on `p`.
function sendto(p::Int; args...)
    for (nm, val) in args
        @spawnat(p, eval(Main, Expr(:(=), nm, val)))
    end
end

# Same as above, applied to every process id in `ps`.
function sendto(ps::Vector{Int}; args...)
    for p in ps
        sendto(p; args...)
    end
end

# Fetch the value bound to `nm` in module `mod` on process `p`.
getfrom(p::Int, nm::Symbol; mod=Main) = fetch(@spawnat(p, getfield(mod, nm)))

# Copy the variable `nm` from module `from_mod` on process `src` into module
# `to_mod` on every process in `target`. The value is put into a RemoteRef
# created on `src`; each target then assigns `nm` to the result of fetching it.
# Returns `nothing`.
function passobj(src::Int, target::Vector{Int}, nm::Symbol; from_mod=Main, to_mod=Main)
    r = RemoteRef(src)
    @spawnat(src, put!(r, getfield(from_mod, nm)))
    for to in target
        @spawnat(to, eval(to_mod, Expr(:(=), nm, fetch(r))))
    end
    nothing
end

# Single-target convenience method: wraps `target` in a one-element vector.
function passobj(src::Int, target::Int, nm::Symbol; from_mod=Main, to_mod=Main)
    passobj(src, [target], nm; from_mod=from_mod, to_mod=to_mod)
end

# Multi-variable convenience method: passes each name in `nms` in turn.
function passobj(src::Int, target, nms::Vector{Symbol}; from_mod=Main, to_mod=Main)
    for nm in nms
        passobj(src, target, nm; from_mod=from_mod, to_mod=to_mod)
    end
end

## variables
m = Int(1e3)          # number of rows of mat_b
n = Int(1e4)          # number of columns of mat_b; mat_a is n-by-n
mat_b = rand(m, n)

## sequential

# Multiply column `i` of `mat_b` elementwise with each of the columns `i:n`,
# then take the mean along dimension 1.
function compute_row_sequential(mat_b, i, n)
    return mean(mat_b[:,i] .* mat_b[:,i:n], 1)
end

mat_a = zeros(n, n)

# Only the entries (i, i:n) of each row are written; the remaining entries
# keep the zero they were initialised with.
tic()
for i = 1:n
    mat_a[i,i:n] = compute_row_sequential(mat_b, i, n)
end
toc()

## parallel
addprocs(3)

# Same per-row computation as the sequential loop body, applied to every row
# index in `irange` and written into `smat_a`. Defined on all processes via
# `@everywhere`. It reads only its own arguments, not any global variables.
@everywhere function compute_row_shared!(smat_a, mat_b, irange, n)
    for i in irange
        smat_a[i,i:n] = mean(mat_b[:,i] .* mat_b[:,i:n], 1)
    end
end

# NOTE(review): these calls assign globals `n` and `mat_b` on the workers, but
# `compute_row_shared!` only uses the arguments it is called with below, so
# the globals do not appear to be used by the parallel run — confirm whether
# these two calls are still needed.
sendto(workers(), n = n)
sendto(workers(), mat_b = mat_b)

smat_a = SharedArray(Float64, (n,n), pids = workers())

tic()
@sync begin
    for p in procs(smat_a)
        @async begin
            # Rows are dealt out in an interleaved fashion: start at p-1 and
            # step by the number of participating processes.
            # NOTE(review): this covers every row 1:n exactly once only if the
            # process ids in procs(smat_a) are 2, 3, ..., k+1 — presumably the
            # case right after addprocs(3) in a fresh session; confirm.
            irange = p-1:length(procs(smat_a)):n
            # NOTE(review): `mat_b` is passed as an ordinary argument, so it is
            # presumably transferred to each worker inside the timed region;
            # worth checking as a contributor to the measured parallel time.
            remotecall_wait(p, compute_row_shared!, smat_a, mat_b, irange, n)
        end
    end
end
toc()

# Check that the parallel result matches the sequential one.
println(mat_a == smat_a)

the last line returns true, but I tried different values of m and n and I could not find a case where the parallel implementation is more efficient than the sequential one. So obviously, either I'm doing something wrong or it's not the best approach for this case... 
I'll keep trying to improve the efficiency; any advice is again welcome :) Thank you.
