Hello: I recently started exploring the parallel capabilities of Julia and
I need some help in understanding and improving the performance a very
elementary parallel code using DArrays (I use Julia
version 0.4.0-dev+2431). The code pasted below (based essentially on
plife.jl) solves u''(x) = 0, x \in [0,1] with u(0) and u(1) specified,
using the 2nd order central difference approximation. The parallel version
of the code runs significantly slower than the serial version. It would be
nice if someone could point out ways to improve this and/or suggest an
alternative efficient version.
function laplace_1D_serial(u::Array{Float64})
N = length(u) - 2
u_new = zeros(N)
for i = 1:N
u_new[i] = 0.5(u[i] + u[i + 2])
end
u_new
end
function serial_iterate(u::Array{Float64})
u_new = laplace_1D_serial(u)
for i = 1:length(u_new)
u[i + 1] = u_new[i]
end
end
function parallel_iterate(u::DArray)
DArray(size(u), procs(u)) do I
J = I[1]
if myid() == 2
local_array = zeros(length(J) + 1)
for i = J[1] : J[end] + 1
local_array[i - J[1] + 1] = u[i]
end
append!([float(u[1])], laplace_1D_serial(local_array))
elseif myid() == length(procs(u)) + 1
local_array = zeros(length(J) + 1)
for i = J[1] - 1 : J[end]
local_array[i - J[1] + 2] = u[i]
end
append!(laplace_1D_serial(local_array), [float(u[end])])
else
local_array = zeros(length(J) + 2)
for i = J[1] - 1 : J[end] + 1
local_array[i - J[1] + 2] = u[i]
end
laplace_1D_serial(local_array)
end
end
end
A sample run on my laptop with 4 processors:
julia> u = zeros(1000); u[end] = 1.0; u_distributed = distribute(u);
julia> @time for i = 1:1000
serial_iterate(u)
end
elapsed time: 0.011452192 seconds (8300112 bytes allocated)
julia> @time for i = 1:1000
u_distributed = parallel_iterate(u_distributed)
end
elapsed time: 4.461922218 seconds (190565036 bytes allocated, 10.17% gc
time)
Thanks for your help!
Cheers,
Amuthan