Not sure how valuable this post is but for fun I tried the new Yeppp.jl
wrapper (https://github.com/JuliaLang/Yeppp.jl) and got it down from 15 to
~ 4 seconds without anything running in parallel. I didn't bother sending
in the buffer arrays but instead just put them as consts outside the
function
using Yeppp
const n = 10000000
const p = Array(Float64, n)
const p_cos = Array(Float64, n)
const p_sin = Array(Float64, n)
function innerloop(x_real::Array{Float64}, x_imag::Array{Float64},
y::Array{Complex{Float64}}, scaler::Float64)
n = length(x_real)
assert(length(y) == n)
Yeppp.exp!(p, x_real)
Yeppp.cos!(p_cos, x_imag)
Yeppp.sin!(p_sin, x_imag)
s = 0.0 + 0.0im
@inbounds for i in 1:n
s += y[i] * Complex(p[i] * scaler * p_cos[i], p[i] * scaler * p_sin[i])
end
return s
end
function timeit(n::Int, reps::Int)
x_real = fill(0.0, n)
x_imag = fill(-1.0, n)
y = fill(0.0+1.0im, n)
res = Array(Complex{Float64}, reps)
for k in 1:reps
scaler = convert(Float64, k)
res[k] = innerloop(x_real, x_imag, y, scaler)
end
end
timeit(100, 1)
@time timeit(n, 50)
On Wednesday, April 15, 2015 at 8:32:19 PM UTC+2, SixString wrote:
>
> I've been advised to write loops in pure Julia for speed, contrary to the
> strategy of using library functions in Python. After spending a couple
> hundred hours learning Julia, I am having trouble matching Python's speed
> with Julia. Here is code for a benchmark in Python:
>
> import numpy as np
>
> import numexpr as ne
>
> from scipy.linalg.blas import zdotu
>
> from timeit import default_timer as timer
>
> n = 10000000
>
> reps = 50
>
> x = -1J * np.ones(n)
>
> y = 1J * np.ones(n)
>
> res = np.empty(reps, dtype=np.complex128)
>
> start = timer()
>
> for k in range(reps):
>
> scaler = float(k)
>
> cexp = ne.evaluate('exp(scaler * x)')
>
> res[k] = zdotu(cexp, y)
>
> exec_time=(timer() - start)
>
> print
>
> print("Execution took", str(round(exec_time, 1)), "seconds")
>
> On my Intel Core-i7 Windows PC with 64-bit Python 3.4, this took 4.5
> seconds to execute. I note that Python's numexpr library function uses all
> 8 CPU threads, while BLAS' zdotu() is single-threaded.
>
> Here is my single-threaded Julia code for the same benchmark:
> function innerloop(x::Array{Complex{Float64}}, y::Array{Complex{Float64}},
> scaler::Float64)
>
> n = length(x)
>
> assert(length(y) == n)
>
> s = 0.0 + 0.0im
>
> @simd for i in 1:n
>
> @inbounds s += y[i] * exp(scaler * x[i])
>
> end
>
> s
>
> end
>
>
> function timeit(n::Int, reps::Int)
>
> x = fill(0.0-1.0im, n)
>
> y = fill(0.0+1.0im, n)
>
> res = Array(Complex{Float64}, reps)
>
> @time for k in 1:reps
>
> scaler = convert(Float64, k)
>
> res[k] = innerloop(x, y, scaler)
>
> end
>
> end
>
>
> timeit(100, 1)
>
> timeit(10000000, 50)
>
> On the same PC with 64-bit Julia 0.3.7, the second call to timeit() took
> 17.6 seconds and allocated 0 bytes. That is 3.9 times slower than Python.
> I note that the @simd decorator didn't speed up the execution.
>
> Python performance seems in reach if I can get a parallelized Julia
> implementation. Here is my attempt:
> function innerloop(x::Array{Complex{Float64}}, y::Array{Complex{Float64}},
> scaler::Float64)
>
> n = length(x)
>
> assert(length(y) == n)
>
> s = @parallel (+) for i in 1:n
>
> y[i] * exp(scaler * x[i])
>
> end
>
> s
>
> end
>
> The timeit() function remains the same. Running Julia with the "-p 8" on
> the command line, the second call to timeit() took 233 seconds and
> allocated 303GB. The huge allocation is my clue that something is really
> wrong here. All my attempts at adding @inbounds created errors.
>
> I should say that this benchmark is representative of a real problem that
> I am trying to solve. My actual Python code takes 10 minutes to execute
> because n and reps are larger. However, the types of the vectors match my
> actual code, and scaler changes with each rep.
>
> Please show me by example how to modify innerloop() to be faster than the
> single-threaded version. My preference is for a fix with Julia 0.3.7, but
> I am also interested if someone provides Julia 0.4-pre code including a
> speed comparison with the single-threaded version on their machine. If
> possible, I would like to avoid SharedArrays because (I believe) they can't
> be used with BLAS calls in other functions.
>