Chris

To get good performance, you need to put your code into a function.
You seem to be evaluating it directly at the REPL -- this will be
slow. See the "Performance Tips" in the manual.

The LLVM code you see is not your kernel code. Instead, it contains a
"call" statement, presumably to a function that contains all the code
in the @parallel region. You can try creating a function for your
actual kernel, which goes from "tmp = 0" to "tmp" near the bottom, and
running "@code_llvm" on it.

-erik


On Thu, Feb 4, 2016 at 4:29 PM, Chris Rackauckas <[email protected]> wrote:
> Hi, I am trying to optimize a code which just adds a bunch of things. My
> first instinct was to unravel the loops and run it as SIMD, like so:
>
> dx = 1/400
> addprocs(3)
> imin = -6
> jmin = -2
>
> #Some SIMD
> @time res = @sync @parallel (+) for i = imin:dx:0
>   tmp = 0
>   for j=jmin:dx:0
>     ans = 0
>     @simd for k=1:24
>       @inbounds ans += coefs[k]*(i^(powz[k]))*(j^(poww[k]))
>     end
>     tmp += abs(ans)<1
>   end
>   tmp
> end
> res = res*(12/(length(imin:dx:0)*length(jmin:dx:0)))
> println(res)#Mathematica gives 2.98
>
> (Coefficients arrays posted at the bottom for completeness). On a 4 core i7
> this runs in ~4.68 seconds. I was able to beat this by optimizing the power
> calculations like so:
>
> ## Full unravel
> @time res = @sync @parallel (+) for i = imin:dx:0
>   tmp = 0
>   isq2 = i*i; isq3 = i*isq2; isq4 = isq2*isq2; isq5 = i*isq4
>   isq6 = isq4*isq2; isq7 = i*isq6; isq8 = isq4*isq4
>   for j=jmin:dx:0
>     jsq2 = j*j; jsq4 = jsq2*jsq2; jsq6 = jsq2*jsq4; jsq8 = jsq4*jsq4
>     @inbounds tmp += abs(coefs[1]*(jsq2) + coefs[2]*(jsq4) + coefs[3]*(jsq6)
> + coefs[4]*(jsq8) + coefs[5]*(i) + coefs[6]*(i)*(jsq2) + coefs[7]*(i)*(jsq4)
> + coefs[8]*(i)*(jsq6) +
>     coefs[9]*(isq2) + coefs[10]*(isq2)*(jsq2) + coefs[11]*(isq2)*(jsq4) +
> coefs[12]*(isq2)*(jsq6) + coefs[13]*(isq3) + coefs[14]*(isq3)*(jsq2) +
> coefs[15]*(isq3)*(jsq4) +
>     coefs[16]*(isq4) + coefs[17]*(isq4)*(jsq2) + coefs[18]*(isq4)*(jsq4) +
> coefs[19]*(isq5) + coefs[20]*(isq5)*(jsq2) + coefs[21]*(isq6) +
> coefs[22]*(isq6)*(jsq2) +
>     coefs[23]*(isq7) + coefs[24]*(isq8))<1
>   end
>   tmp
> end
> res = res*(12/(length(imin:dx:0)*length(jmin:dx:0)))
> println(res)#Mathematica gives 2.98
>
> This is the best version and gets ~1.83 seconds (FYI it beats a version where
> I distributed out by isq). However, when I put this into a function and call
> it with @code_llvm, I get:
>
> define %jl_value_t* @julia_simdCheck_3161() {
> top:
>   %0 = alloca [8 x %jl_value_t*], align 8
>   %.sub = getelementptr inbounds [8 x %jl_value_t*]* %0, i64 0, i64 0
>   %1 = getelementptr [8 x %jl_value_t*]* %0, i64 0, i64 2
>   %2 = getelementptr [8 x %jl_value_t*]* %0, i64 0, i64 3
>   store %jl_value_t* inttoptr (i64 12 to %jl_value_t*), %jl_value_t** %.sub,
> align 8
>   %3 = getelementptr [8 x %jl_value_t*]* %0, i64 0, i64 1
>   %4 = load %jl_value_t*** @jl_pgcstack, align 8
>   %.c = bitcast %jl_value_t** %4 to %jl_value_t*
>   store %jl_value_t* %.c, %jl_value_t** %3, align 8
>   store %jl_value_t** %.sub, %jl_value_t*** @jl_pgcstack, align 8
>   store %jl_value_t* null, %jl_value_t** %1, align 8
>   store %jl_value_t* null, %jl_value_t** %2, align 8
>   %5 = getelementptr [8 x %jl_value_t*]* %0, i64 0, i64 4
>   store %jl_value_t* null, %jl_value_t** %5, align 8
>   %6 = getelementptr [8 x %jl_value_t*]* %0, i64 0, i64 5
>   store %jl_value_t* null, %jl_value_t** %6, align 8
>   %7 = getelementptr [8 x %jl_value_t*]* %0, i64 0, i64 6
>   store %jl_value_t* null, %jl_value_t** %7, align 8
>   %8 = getelementptr [8 x %jl_value_t*]* %0, i64 0, i64 7
>   store %jl_value_t* null, %jl_value_t** %8, align 8
>   %9 = call %jl_value_t* @julia_sync_begin_535()
>   store %jl_value_t* inttoptr (i64 2274649264 to %jl_value_t*),
> %jl_value_t** %2, align 8
>   %10 = load %jl_value_t* (%jl_value_t*, %jl_value_t**, i32)** inttoptr (i64
> 2274649264 to %jl_value_t* (%jl_value_t*, %jl_value_t**, i32)**), align 16
>   %11 = load %jl_value_t** inttoptr (i64 2212282536 to %jl_value_t**), align
> 8
>   store %jl_value_t* %11, %jl_value_t** %5, align 8
>   %12 = load %jl_value_t** inttoptr (i64 2197909240 to %jl_value_t**), align
> 8
>   store %jl_value_t* %12, %jl_value_t** %6, align 8
>   %13 = load %jl_value_t** inttoptr (i64 2197909288 to %jl_value_t**), align
> 8
>   store %jl_value_t* %13, %jl_value_t** %7, align 8
>   %14 = load %jl_value_t** inttoptr (i64 2212418552 to %jl_value_t**), align
> 8
>   store %jl_value_t* %14, %jl_value_t** %8, align 8
>   %15 = call %jl_value_t* %10(%jl_value_t* inttoptr (i64 2274649264 to
> %jl_value_t*), %jl_value_t** %5, i32 4)
>   store %jl_value_t* %15, %jl_value_t** %1, align 8
>   call void @julia_sync_end_541()
>   %16 = load %jl_value_t** %3, align 8
>   %17 = getelementptr inbounds %jl_value_t* %16, i64 0, i32 0
>   store %jl_value_t** %17, %jl_value_t*** @jl_pgcstack, align 8
>   ret %jl_value_t* %15
> }
>
> This shows me that LLVM is not auto-vectorizing the inner part. Is there
> a way to tell LLVM to SIMD-vectorize the big summation in this more
> optimized form?
>
> For completeness, here are the coefficient arrays (note: coefs should be
> treated as variable. In principle those zeros could change so there's no
> manual optimizing to be done there):
>
> coefs
> Vector Float64, 24
> 1.3333333333333328
> 0.16666666666666669
> 0.0
> 0.0
> 2.0
> 2.333333333333335
> 0.06250000000000001
> 0.0
> 2.0
> 1.1666666666666659
> 0.019097222222222224
> 0.0
> 1.0
> 3.469446951953614e-18
> 0.0
> 0.25
> 0.010416666666666666
> 0.0
> 0.0
> 0.0
> 0.0
> 0.0
> 0.0
> 0.0
>
> powz
> Vector Float64, 24
> 0.0
> 0.0
> 0.0
> 0.0
> 1.0
> 1.0
> 1.0
> 1.0
> 2.0
> 2.0
> 2.0
> 2.0
> 3.0
> 3.0
> 3.0
> 4.0
> 4.0
> 4.0
> 5.0
> 5.0
> 6.0
> 6.0
> 7.0
> 8.0
>
> poww
>
> Vector Float64, 24
>
> 2.0
> 4.0
> 6.0
> 8.0
> 0.0
> 2.0
> 4.0
> 6.0
> 0.0
> 2.0
> 4.0
> 6.0
> 0.0
> 2.0
> 4.0
> 0.0
> 2.0
> 4.0
> 0.0
> 2.0
> 0.0
> 2.0
> 0.0
> 0.0



-- 
Erik Schnetter <[email protected]>
http://www.perimeterinstitute.ca/personal/eschnetter/

Reply via email to