Hi, all. I'm doing a small test case of Julia's speed & code length for 
numerical integration of PDEs, in comparison to Matlab, Python, and C++. 
The test case is the 1d Kuramoto-Sivashinsky equation on a periodic 
domain, using Fourier expansions for spatial discretization and simple 
implicit-explicit finite-difference time-stepping for temporal 
discretization. 

My results so far indicate that (for large data sets) a naive Julia code 
runs slightly slower than the line-for-line equivalent Matlab code, faster 
than Python by a factor of two, and slower than an optimized C++ code by a 
factor of two. That's no big deal; the naive code creates lots of temporary 
arrays in the innermost loops. 

The trouble is when I try to optimize the Julia by unrolling array 
operations and eliminating the temporary variables, performance is worse. 
Execution is slower and more memory is allocated.

And confusingly, when I do a simplest possible mock-up of the issue with a 
few arrays, the opposite is true. The naive code is slower and allocative, 
the unrolled code is fast and tight. 

Here's the mock-up code in file "vecsum.jl" (the operation in the for-loop 
matches the critical operation in the PDE code).
function vecsumNaive(u,a,b,c,d)
    uf = copy(u);
    for n=1:1000
      uf = b .* (a .* uf + 1.5*c - 0.5*d)
    end
    sum(uf)
end


function vecsumUnrolled(u,a,b,c,d)

    uf = copy(u);
    N = length(a);
    for n=1:1000
        for i=1:N
            @fastmath @inbounds uf[i] = b[i]*(a[i]*uf[i] + 1.5*c[i] - 
0.5*d[i])
        end
    end
    sum(uf)
end

N = 512;
eps = .01;

# Assign random variables of the same types as appear in the KS integration 
code
a = eps*randn(N);
b = eps*randn(N);

u = eps*(randn(N)+1im*randn(N));
c = eps*(randn(N)+1im*randn(N));
d = eps*(randn(N)+1im*randn(N));

Running this with Julia 0.4.2 on Linux gives
julia> @time vecsumNaive(u,a,b,c,d);
  0.433867 seconds (517.62 k allocations: 68.861 MB, 9.93% gc time)

julia> @time vecsumNaive(u,a,b,c,d);
  0.031010 seconds (60.01 k allocations: 48.684 MB, 18.60% gc time)

julia> @time vecsumNaive(u,a,b,c,d);
  0.031556 seconds (60.01 k allocations: 48.684 MB, 12.79% gc time)

julia> 

julia> @time vecsumUnrolled(u,a,b,c,d);
  0.025642 seconds (10.56 k allocations: 510.544 KB)

julia> @time vecsumUnrolled(u,a,b,c,d);
  0.003166 seconds (6 allocations: 8.266 KB)

julia> @time vecsumUnrolled(u,a,b,c,d);
  0.003533 seconds (6 allocations: 8.266 KB)


Great, just as you would expect. The unrolled operation is 10 times faster 
with almost no allocation. But now for the PDE code (file "ksintegrate.jl")
# The Kuramoto-Sivashinsky eqn is (subscripts indicate differentiation)
#
#     u_t = -u*u_x - u_xx - u_xxxx on domain [0,Lx] with periodic BCs 
# or  
#     u_t = L(u) + N(u) 
#
# where L(u) = -u_xx - u_xxxx and N(u) = -u u_x 

# The numerical integration scheme: Discretize u in space with a Fourier expansion
#
# u(x,t) = sum_k uf[k] exp(2pi i k x/Lx)
#
# Discretize u in time with the Crank-Nicolson/Adams-Bashforth time-stepping formula
# (I - dt/2 L) u^{n+1} = (I + dt/2 L) u^n + 3dt/2 N^n - dt/2 N^{n-1}
#
# u^{n+1} = (I - dt/2 L)^{-1} [(I + dt/2 L) u^n + 3dt/2 N^n - dt/2 N^{n-1}]
#
# let A = (I + dt/2 L)
# let B = (I - dt/2 L)^{-1}, then
# 
# u^{n+1} = B ( A u^n + 3dt/2 N^n - dt/2 N^{n-1})


function ksintegrateNaive(u, Lx, dt, Nt);
# integrate Kuramoto-Shivashinsky eqn for inputs
#   u  = initial condition, array of gridpoint values of u(x) on uniform 
grid
#   Lx = periodic domain length
#   dt = time step
#   Nt = number of time steps

    Nx = length(u)                      # number of gridpoints
    kx = vcat(0:Nx/2-1, 0, -Nx/2+1:-1)  # integer wavenumbers: 
exp(2*pi*kx*x/L)
    alpha = 2*pi*kx/Lx                  # real wavenumbers:    exp(alpha*x)
    D = 1im*alpha;                      # spectral D = d/dx operator 
    L = alpha.^2 - alpha.^4             # spectral L = -D^2 - D^4 operator
    G = -0.5*D                          # spectral -1/2 D operator, to eval 
-u u_x = 1/2 d/dx u^2

    # convenience variables
    dt2  = dt/2;
    dt32 = 3*dt/2;
    A =  ones(Nx) + dt2*L;
    B = (ones(Nx) - dt2*L).^(-1);

    # evaluate nonlinear term 
    Nnf  = G.*fft(u.^2); # Nnf == -1/2 d/dx (u^2) = -u u_x, spectral
    Nn1f = Nnf;          # use Nnf1 = Nnf at first time step

    # compute Fourier coeffs of u
    uf  = fft(u);

    # timestepping loop
    for n = 0:Nt

        # compute new nonlinear term 
        Nn1f = copy(Nnf);                 # shift nonlinear term in time: 
N^{n-1} <- N^n
        Nnf  = G.*fft(real(ifft(uf)).^2); # compute Nn = -u u_x

        ##############################################################
        # use lots of temporary arrays to evaluate timestepping formula
        uf = B .* (A .* uf + dt32*Nnf - dt2*Nn1f); 

    end
    u = real(ifft(u))
end


function ksintegrateUnrolled(u, Lx, dt, Nt);
# integrate Kuramoto-Shivashinsky eqn for inputs
#   u  = initial condition, array of gridpoint values of u(x) on uniform 
grid
#   Lx = periodic domain length
#   dt = time step
#   Nt = number of time steps

    Nx = length(u)                      # number of gridpoints
    kx = vcat(0:Nx/2-1, 0, -Nx/2+1:-1)  # integer wavenumbers: 
exp(2*pi*kx*x/L)
    alpha = 2*pi*kx/Lx                  # real wavenumbers:    exp(alpha*x)
    D = 1im*alpha;                      # spectral D = d/dx operator 
    L = alpha.^2 - alpha.^4             # spectral L = -D^2 - D^4 operator
    G = -0.5*D                          # spectral -1/2 D operator, to eval 
-u u_x = 1/2 d/dx u^2

    # convenience variables
    dt2  = dt/2;
    dt32 = 3*dt/2;
    A =  ones(Nx) + dt2*L;
    B = (ones(Nx) - dt2*L).^(-1);

    # compute uf == Fourier coeffs of u and Nnf == Fourier coeffs of -u u_x
    uf  = fft(u);
    Nnf  = G.*fft(u.^2); # Nnf == -1/2 d/dx (u^2) = -u u_x, spectral
    Nn1f = Nnf;          # use Nnf1 = Nnf at first time step

    # timestepping loop
    for n = 0:Nt

        Nn1f = copy(Nnf);                 # shift nonlinear term in time: 
N^{n-1} <- N^n
        Nnf  = G.*fft(real(ifft(uf)).^2); # compute Nn = -u u_x

        ##############################################################
        # unroll array operations to compute timestepping formula
        for n = 1:length(uf)
            @fastmath @inbounds uf[n] = B[n]* (A[n] * uf[n] + dt32*Nnf[n] - 
dt2*Nn1f[n]);
        end


    end
    u = real(ifft(u))
end


Nx = 512;
Lx = Nx/16*pi             # spatial domain [0, L] periodic
dt = 1/16;                # discrete time step 
T  = 200;                 # integrate from t=0 to t=T
nplot = round(Int,1/dt);  # save every nploth time step
Nt = round(Int, T/dt);    # total number of timesteps
x = Lx*(0:Nx-1)/Nx;
u = cos(x/16).*(1+2*sin(x/16)).*(1+0.01*sin(x/32)); # an initial condition


which when run gives

julia> include("ksintegrate.jl");

julia> @time ksintegrateNaive(u,Lx,dt,Nt);
  1.776498 seconds (3.86 M allocations: 466.410 MB, 2.35% gc time)

julia> @time ksintegrateNaive(u,Lx,dt,Nt);
  0.287348 seconds (653.28 k allocations: 328.565 MB, 8.34% gc time)

julia> @time ksintegrateNaive(u,Lx,dt,Nt);
  0.287859 seconds (653.28 k allocations: 328.565 MB, 8.37% gc time)

julia> 

julia> @time ksintegrateUnrolled(u,Lx,dt,Nt);
  0.823053 seconds (23.44 M allocations: 774.411 MB, 7.56% gc time)

julia> @time ksintegrateUnrolled(u,Lx,dt,Nt);
  0.836033 seconds (23.41 M allocations: 773.040 MB, 7.52% gc time)

julia> @time ksintegrateUnrolled(u,Lx,dt,Nt);
  0.800421 seconds (23.41 M allocations: 773.040 MB, 7.91% gc time)


The unrolled code is slower by a factor of two and more memory-intensive. I 
don't get it.

(note: I do know my FFTW calls are suboptimal, will tackle that next)

--John

Reply via email to