Hi everyone,

My adventures with parallel programming with Julia continue. Here is a 
different issue from other threads: My parallel function is 8300x slower 
than my serial function even though I am running on 4 processes on a 
multi-core machine.

julia> nprocs()
4

I have Julia 0.3.8. Here is my program in its entirety (not very long).

function main()
    
nbig::Int16 = 7
nbod::Int16 = nbig
bod  = Float64[
        0       1      2      3      4      5      6  # x position
        0       0      0      0      0      0      0  # y position
        0       0      0      0      0      0      0  # z position
        0       0      0      0      0      0      0  # x velocity
        0       0      0      0      0      0      0  # y velocity
        0       0      0      0      0      0      0  # z velocity
        1       1      1      1      1      1      1  # Mass
    ]

    a = zeros(3,nbod)
    
    @time for k = 1:1000
        gravity_1!(bod, nbig, nbod, a)
    end
    println(a[1,:])

    @time for k = 1:1000
        gravity_2!(bod, nbig, nbod, a)
end
    println(a[1,:])
end

function gravity_1!(bod, nbig, nbod, a)
    
    for i = 1:nbod
        a[1,i] = 0.0
        a[2,i] = 0.0
        a[3,i] = 0.0
    end
    
    @inbounds for i = 1:nbig
        for j = (i + 1):nbod
            
            dx = bod[1,j] - bod[1,i]
            dy = bod[2,j] - bod[2,i]
            dz = bod[3,j] - bod[3,i]
            
            s_1 = 1.0 / sqrt(dx*dx+dy*dy+dz*dz)
            s_3 = s_1 * s_1 * s_1
            
            tmp1 = s_3 * bod[7,i]
            tmp2 = s_3 * bod[7,j]
            
            a[1,j] = a[1,j] - tmp1*dx
            a[2,j] = a[2,j] - tmp1*dy
            a[3,j] = a[3,j] - tmp1*dz
            
            a[1,i] = a[1,i] + tmp2*dx
            a[2,i] = a[2,i] + tmp2*dy
            a[3,i] = a[3,i] + tmp2*dz
        end
    end
    return a
end

function gravity_2!(bod, nbig, nbod, a)
    
    for i = 1:nbod
        a[1,i] = 0.0
        a[2,i] = 0.0
        a[3,i] = 0.0
    end
    
    @inbounds @sync @parallel for i = 1:nbig
        for j = (i + 1):nbod
            
            dx = bod[1,j] - bod[1,i]
            dy = bod[2,j] - bod[2,i]
            dz = bod[3,j] - bod[3,i]
            
            s_1 = 1.0 / sqrt(dx*dx+dy*dy+dz*dz)
            s_3 = s_1 * s_1 * s_1
            
            tmp1 = s_3 * bod[7,i]
            tmp2 = s_3 * bod[7,j]
            
            a[1,j] = a[1,j] - tmp1*dx
            a[2,j] = a[2,j] - tmp1*dy
            a[3,j] = a[3,j] - tmp1*dz
            
            a[1,i] = a[1,i] + tmp2*dx
            a[2,i] = a[2,i] + tmp2*dy
            a[3,i] = a[3,i] + tmp2*dz
        end
    end
    return a
end



So this is a straight forward N-body gravity calculation. Yes, I realize 
that gravity_2!() is wrong, but that's fine. Right now I'm just talking 
about the CPU time. When I run this on my computer I get:

julia> main()
elapsed time: 0.000475294 seconds (0 bytes allocated)
[1.4913888888888889 0.4636111111111111 0.1736111111111111 
-5.551115123125783e-17 -0.17361111111111116 -0.4636111111111112 
-1.4913888888888889]
elapsed time: 3.953546654 seconds (126156320 bytes allocated, 13.49% gc 
time)
[0.0 0.0 0.0 0.0 0.0 0.0 0.0]


So, the serial version takes 0.000475 seconds and the parallel takes 3.95 
seconds. Furthermore, the parallel version is calling the garbage 
collector. I suspect that the problem has something to do with the memory 
access. Maybe the parallel code is wasting a lot of time copying variables 
in memory. But whatever the reason, this is bad. The documentation says 
that @parallel is supposed to be fast, even for very small loops, but 
that's not what I'm seeing. A non-buggy implementation will be even slower.

Have I missed something? Is there an obvious error in how I'm using the 
parallel constructs?

I would appreciate any guidance you may offer.

Cheers,
Daniel.

Reply via email to