Hi, all. I'm doing a small test case of Julia's speed & code length for
numerical integration of PDEs, in comparison to Matlab, Python, and C++.
The test case is the 1d Kuramoto-Sivashinksy equation on a 1d periodic
domain, using Fourier expansions for spatial discretization and simple
implicit-explicit finite-difference time-stepping for temporal
discretization.
My results so far indicate that (for large data sets) a naive Julia code
runs slightly slower than the line-for-line equivalent Matlab code, faster
than Python by a factor of two, and slower than an optimized C++ code by a
factor of two. That's no big deal; the naive code creates lots of temporary
arrays in the innermost loops.
The trouble is when I try to optimize the Julia by unrolling array
operations and eliminating the temporary variables, performance is worse.
Execution is slower and more memory is allocated.
And confusingly, when I do a simplest possible mock-up of the issue with a
few arrays, the opposite is true. The naive code is slower and allocative,
the unrolled code is fast and tight.
Here's the mock-up code in file "vecsum.jl" (the operation in the for- loop
matches the critical operation in the PDE code).
function vecsumNaive(u,a,b,c,d)
uf = copy(u);
for n=1:1000
uf = b .* (a .* uf + 1.5*c - 0.5*d)
end
sum(uf)
end
function vecsumUnrolled(u,a,b,c,d)
uf = copy(u);
N = length(a);
for n=1:1000
for i=1:N
@fastmath @inbounds uf[i] = b[i]*(a[i]*uf[i] + 1.5*c[i] -
0.5*d[i])
end
end
sum(uf)
end
N = 512;
eps = .01;
# Assign random variables of the same types as appear in the KS integration
code
a = eps*randn(N);
b = eps*randn(N);
u = eps*(randn(N)+1im*randn(N));
c = eps*(randn(N)+1im*randn(N));
d = eps*(randn(N)+1im*randn(N));
Running this with Julia 0.4.2 on Linux gives
julia> @time vecsumNaive(u,a,b,c,d);
0.433867 seconds (517.62 k allocations: 68.861 MB, 9.93% gc time)
julia> @time vecsumNaive(u,a,b,c,d);
0.031010 seconds (60.01 k allocations: 48.684 MB, 18.60% gc time)
julia> @time vecsumNaive(u,a,b,c,d);
0.031556 seconds (60.01 k allocations: 48.684 MB, 12.79% gc time)
julia>
julia> @time vecsumUnrolled(u,a,b,c,d);
0.025642 seconds (10.56 k allocations: 510.544 KB)
julia> @time vecsumUnrolled(u,a,b,c,d);
0.003166 seconds (6 allocations: 8.266 KB)
julia> @time vecsumUnrolled(u,a,b,c,d);
0.003533 seconds (6 allocations: 8.266 KB)
Great, just as you would expect. The unrolled operation is 10 times faster
with almost no allocation. But now for the PDE code (file "ksintegrate.jl")
# The Kuramoto-Shivashinksy eqn is (subscripts indicate differentiation)
#
# u_t = -u*u_x - u_xx - u_xxxx on domain [0,Lx] with periodic BCs
# or
# u_t = L(u) + N(u)
#
# where L(u) = -u_xx - u_xxxx and N(u) = -u u_x
# The numerical integration scheme: Discretize u in space with a Fourier
expansion
#
# u(x,t) = sum_k uf[k] exp(2pi i k x/Lx)
#
# Discretize u in time with Crank-Nicolson Adams-Bashforth time stepping
formula
# (I - dt/2 L) u^{n+1} = (I + dt/2 L) u^n + 3dt/2 N^n - dt/2 N^{n-1}
#
# u^{n+1} = (I - dt/2 L)^{-1} [(I + dt/2 L) u^n + 3dt/2 N^n - dt/2 N^{n-1}]
#
# let A = (I + dt/2 L)
# let B = (I - dt/2 L)^{-1}, then
#
# u^{n+1} = B ( A u^n + 3dt/2 N^n - dt/2 N^{n-1})
function ksintegrateNaive(u, Lx, dt, Nt);
# integrate Kuramoto-Shivashinsky eqn for inputs
# u = initial condition, array of gridpoint values of u(x) on uniform
grid
# Lx = periodic domain length
# dt = time step
# Nt = number of time steps
Nx = length(u) # number of gridpoints
kx = vcat(0:Nx/2-1, 0, -Nx/2+1:-1) # integer wavenumbers:
exp(2*pi*kx*x/L)
alpha = 2*pi*kx/Lx # real wavenumbers: exp(alpha*x)
D = 1im*alpha; # spectral D = d/dx operator
L = alpha.^2 - alpha.^4 # spectral L = -D^2 - D^4 operator
G = -0.5*D # spectral -1/2 D operator, to eval
-u u_x = 1/2 d/dx u^2
# convenience variables
dt2 = dt/2;
dt32 = 3*dt/2;
A = ones(Nx) + dt2*L;
B = (ones(Nx) - dt2*L).^(-1);
# evaluate nonlinear term
Nnf = G.*fft(u.^2); # Nnf == -1/2 d/dx (u^2) = -u u_x, spectral
Nn1f = Nnf; # use Nnf1 = Nnf at first time step
# compute Fourier coeffs of u
uf = fft(u);
# timestepping loop
for n = 0:Nt
# compute new nonlinear term
Nn1f = copy(Nnf); # shift nonlinear term in time:
N^{n-1} <- N^n
Nnf = G.*fft(real(ifft(uf)).^2); # compute Nn = -u u_x
##############################################################
# use lots of temporary arrays to evaluate timestepping formula
uf = B .* (A .* uf + dt32*Nnf - dt2*Nn1f);
end
u = real(ifft(u))
end
function ksintegrateUnrolled(u, Lx, dt, Nt);
# integrate Kuramoto-Shivashinsky eqn for inputs
# u = initial condition, array of gridpoint values of u(x) on uniform
grid
# Lx = periodic domain length
# dt = time step
# Nt = number of time steps
Nx = length(u) # number of gridpoints
kx = vcat(0:Nx/2-1, 0, -Nx/2+1:-1) # integer wavenumbers:
exp(2*pi*kx*x/L)
alpha = 2*pi*kx/Lx # real wavenumbers: exp(alpha*x)
D = 1im*alpha; # spectral D = d/dx operator
L = alpha.^2 - alpha.^4 # spectral L = -D^2 - D^4 operator
G = -0.5*D # spectral -1/2 D operator, to eval
-u u_x = 1/2 d/dx u^2
# convenience variables
dt2 = dt/2;
dt32 = 3*dt/2;
A = ones(Nx) + dt2*L;
B = (ones(Nx) - dt2*L).^(-1);
# compute uf == Fourier coeffs of u and Nnf == Fourier coeffs of -u u_x
uf = fft(u);
Nnf = G.*fft(u.^2); # Nnf == -1/2 d/dx (u^2) = -u u_x, spectral
Nn1f = Nnf; # use Nnf1 = Nnf at first time step
# timestepping loop
for n = 0:Nt
Nn1f = copy(Nnf); # shift nonlinear term in time:
N^{n-1} <- N^n
Nnf = G.*fft(real(ifft(uf)).^2); # compute Nn = -u u_x
##############################################################
# unroll array operations to compute timestepping formula
for n = 1:length(uf)
@fastmath @inbounds uf[n] = B[n]* (A[n] * uf[n] + dt32*Nnf[n] -
dt2*Nn1f[n]);
end
end
u = real(ifft(u))
end
Nx = 512;
Lx = Nx/16*pi # spatial domain [0, L] periodic
dt = 1/16; # discrete time step
T = 200; # integrate from t=0 to t=T
nplot = round(Int,1/dt); # save every nploth time step
Nt = round(Int, T/dt); # total number of timesteps
x = Lx*(0:Nx-1)/Nx;
u = cos(x/16).*(1+2*sin(x/16)).*(1+0.01*sin(x/32)); # an initial condition
which when runs gives
ulia> include("ksintegrate.jl");
julia> @time ksintegrateNaive(u,Lx,dt,Nt);
1.776498 seconds (3.86 M allocations: 466.410 MB, 2.35% gc time)
julia> @time ksintegrateNaive(u,Lx,dt,Nt);
0.287348 seconds (653.28 k allocations: 328.565 MB, 8.34% gc time)
julia> @time ksintegrateNaive(u,Lx,dt,Nt);
0.287859 seconds (653.28 k allocations: 328.565 MB, 8.37% gc time)
julia>
julia> @time ksintegrateUnrolled(u,Lx,dt,Nt);
0.823053 seconds (23.44 M allocations: 774.411 MB, 7.56% gc time)
julia> @time ksintegrateUnrolled(u,Lx,dt,Nt);
0.836033 seconds (23.41 M allocations: 773.040 MB, 7.52% gc time)
julia> @time ksintegrateUnrolled(u,Lx,dt,Nt);
0.800421 seconds (23.41 M allocations: 773.040 MB, 7.91% gc time)
The unrolled code is slower by a factor of two and more memory-intensive. I
don't get it.
(note: I do know my FFTW calls are suboptimal, will tackle that next)
--John