Not sure how valuable this post is but for fun I tried the new Yeppp.jl 
wrapper (https://github.com/JuliaLang/Yeppp.jl) and got it down from 15 to 
~ 4 seconds without anything running in parallel. I didn't bother sending 
in the buffer arrays but instead just put them as consts outside the 
function

using Yeppp
const n = 10000000
const p = Array(Float64, n)
const p_cos = Array(Float64, n)
const p_sin = Array(Float64, n)


function innerloop(x_real::Array{Float64}, x_imag::Array{Float64},
                   y::Array{Complex{Float64}}, scaler::Float64)

   n = length(x_real)
   assert(length(y) == n)

   Yeppp.exp!(p, x_real)
   Yeppp.cos!(p_cos, x_imag)
   Yeppp.sin!(p_sin, x_imag)


   s = 0.0 + 0.0im
  @inbounds for i in 1:n
    s += y[i] * Complex(p[i] * scaler * p_cos[i], p[i] * scaler * p_sin[i])
  end
  return s
end


function timeit(n::Int, reps::Int)
   x_real = fill(0.0, n)
   x_imag = fill(-1.0, n)
   y = fill(0.0+1.0im, n)
   res = Array(Complex{Float64}, reps)
   for k in 1:reps
       scaler = convert(Float64, k)
       res[k] = innerloop(x_real, x_imag, y, scaler)
   end
end


timeit(100, 1)


@time timeit(n, 50)




On Wednesday, April 15, 2015 at 8:32:19 PM UTC+2, SixString wrote:
>
> I've been advised to write loops in pure Julia for speed, contrary to the 
> strategy of using library functions in Python.  After spending a couple 
> hundred hours learning Julia, I am having trouble matching Python's speed 
> with Julia.  Here is code for a benchmark in Python:
>
> import numpy as np
>
> import numexpr as ne
>
> from scipy.linalg.blas import zdotu
>
> from timeit import default_timer as timer
>
> n = 10000000
>
> reps = 50
>
> x = -1J * np.ones(n)
>
> y = 1J * np.ones(n)
>
> res = np.empty(reps, dtype=np.complex128)
>
> start = timer()
>
> for k in range(reps):
>
>    scaler = float(k)
>
>    cexp = ne.evaluate('exp(scaler * x)')
>
>    res[k] = zdotu(cexp, y)
>
> exec_time=(timer() - start)
>
> print
>
> print("Execution took", str(round(exec_time, 1)), "seconds")
>
> On my Intel Core-i7 Windows PC with 64-bit Python 3.4, this took 4.5 
> seconds to execute.  I note that Python's numexpr library function uses all 
> 8 CPU threads, while BLAS' zdotu() is single-threaded.
>
> Here is my single-threaded Julia code for the same benchmark:
> function innerloop(x::Array{Complex{Float64}}, y::Array{Complex{Float64}}, 
> scaler::Float64)
>
>    n = length(x)
>
>    assert(length(y) == n)
>
>    s = 0.0 + 0.0im
>
>    @simd for i in 1:n
>
>        @inbounds s += y[i] * exp(scaler * x[i])
>
>    end
>
>    s
>
> end
>
>
> function timeit(n::Int, reps::Int)
>
>    x = fill(0.0-1.0im, n)
>
>    y = fill(0.0+1.0im, n)
>
>    res = Array(Complex{Float64}, reps)
>
>    @time for k in 1:reps
>
>        scaler = convert(Float64, k)
>
>        res[k] = innerloop(x, y, scaler)
>
>    end
>
> end
>
>
> timeit(100, 1)
>
> timeit(10000000, 50)
>
> On the same PC with 64-bit Julia 0.3.7, the second call to timeit() took 
> 17.6 seconds and allocated 0 bytes.  That is 3.9 times slower than Python.  
> I note that the @simd decorator didn't speed up the execution.
>
> Python performance seems in reach if I can get a parallelized Julia 
> implementation.  Here is my attempt:
> function innerloop(x::Array{Complex{Float64}}, y::Array{Complex{Float64}}, 
> scaler::Float64)
>
>    n = length(x)
>
>    assert(length(y) == n)
>
>    s = @parallel (+) for i in 1:n
>
>        y[i] * exp(scaler * x[i])
>
>    end
>
>    s
>
> end
>
> The timeit() function remains the same.  Running Julia with the "-p 8" on 
> the command line, the second call to timeit() took 233 seconds and 
> allocated 303GB.  The huge allocation is my clue that something is really 
> wrong here.  All my attempts at adding @inbounds created errors.
>
> I should say that this benchmark is representative of a real problem that 
> I am trying to solve.  My actual Python code takes 10 minutes to execute 
> because n and reps are larger.  However, the types of the vectors match my 
> actual code, and scaler changes with each rep.
>
> Please show me by example how to modify innerloop() to be faster than the 
> single-threaded version.  My preference is for a fix with Julia 0.3.7, but 
> I am also interested if someone provides Julia 0.4-pre code including a 
> speed comparison with the single-threaded version on their machine.  If 
> possible, I would like to avoid SharedArrays because (I believe) they can't 
> be used with BLAS calls in other functions.
>

Reply via email to