Hi, I am trying to optimize a code which just adds a bunch of things. My
first instinct was to unroll the loops and run it as SIMD, like so:
# Grid scan: counts grid points (i, j) where the 24-term polynomial
# sum_k coefs[k] * i^powz[k] * j^poww[k] has absolute value < 1,
# reduced with (+) across workers.
# NOTE(review): `coefs`, `powz`, `poww` (posted at the bottom of the
# question) must be visible on every worker, e.g. via @everywhere.
dx = 1/400
addprocs(3)
imin = -6
jmin = -2
#Some SIMD
@time res = @sync @parallel (+) for i = imin:dx:0
    tmp = 0                 # Int count; `Bool` from the comparison promotes to Int
    for j = jmin:dx:0
        # Was `ans = 0`: starting at Int and accumulating Float64 makes the
        # @simd loop type-unstable, which blocks vectorization. Start at 0.0.
        # (Also renamed: `ans` shadows the REPL's special `ans` binding.)
        acc = 0.0
        @simd for k = 1:24
            @inbounds acc += coefs[k]*(i^(powz[k]))*(j^(poww[k]))
        end
        tmp += abs(acc) < 1
    end
    tmp
end
# Rescale the raw count by the cell area over the scanned quadrant.
res = res*(12/(length(imin:dx:0)*length(jmin:dx:0)))
println(res)#Mathematica gives 2.98
(Coefficients arrays posted at the bottom for completeness). On a 4 core i7
this runs in ~4.68 seconds. I was able to beat this by optimizing the power
calculations like so:
## Full unravel
# Same computation with all powers written out explicitly: powers of i are
# hoisted out of the j loop (loop-invariant), powers of j computed once per
# inner iteration. Terms with powz/poww entries of 0 collapse to plain coefs.
# NOTE(review): the original post's line wrapping split `coefs` from `[15]`
# and `(` from `isq6)`, which does not parse; rewrapped after `+` operators
# so every continuation is unambiguous. Semantics unchanged.
@time res = @sync @parallel (+) for i = imin:dx:0
    tmp = 0
    # Powers of i, computed once per outer iteration.
    isq2 = i*i; isq3 = i*isq2; isq4 = isq2*isq2; isq5 = i*isq4
    isq6 = isq4*isq2; isq7 = i*isq6; isq8 = isq4*isq4
    for j = jmin:dx:0
        # Powers of j, computed once per inner iteration.
        jsq2 = j*j; jsq4 = jsq2*jsq2; jsq6 = jsq2*jsq4; jsq8 = jsq4*jsq4
        @inbounds tmp += abs(
            coefs[1]*jsq2 + coefs[2]*jsq4 + coefs[3]*jsq6 + coefs[4]*jsq8 +
            coefs[5]*i + coefs[6]*i*jsq2 + coefs[7]*i*jsq4 + coefs[8]*i*jsq6 +
            coefs[9]*isq2 + coefs[10]*isq2*jsq2 + coefs[11]*isq2*jsq4 +
            coefs[12]*isq2*jsq6 +
            coefs[13]*isq3 + coefs[14]*isq3*jsq2 + coefs[15]*isq3*jsq4 +
            coefs[16]*isq4 + coefs[17]*isq4*jsq2 + coefs[18]*isq4*jsq4 +
            coefs[19]*isq5 + coefs[20]*isq5*jsq2 +
            coefs[21]*isq6 + coefs[22]*isq6*jsq2 +
            coefs[23]*isq7 + coefs[24]*isq8) < 1
    end
    tmp
end
# Rescale the raw count by the cell area over the scanned quadrant.
res = res*(12/(length(imin:dx:0)*length(jmin:dx:0)))
println(res)#Mathematica gives 2.98
This is the best version and gets ~1.83 seconds (FYI it beats a version
where I distributed out by isq). However, when I put this into a function
and call it with @code_llvm, I get:
define %jl_value_t* @julia_simdCheck_3161() {
top:
%0 = alloca [8 x %jl_value_t*], align 8
%.sub = getelementptr inbounds [8 x %jl_value_t*]* %0, i64 0, i64 0
%1 = getelementptr [8 x %jl_value_t*]* %0, i64 0, i64 2
%2 = getelementptr [8 x %jl_value_t*]* %0, i64 0, i64 3
store %jl_value_t* inttoptr (i64 12 to %jl_value_t*), %jl_value_t** %.sub,
align 8
%3 = getelementptr [8 x %jl_value_t*]* %0, i64 0, i64 1
%4 = load %jl_value_t*** @jl_pgcstack, align 8
%.c = bitcast %jl_value_t** %4 to %jl_value_t*
store %jl_value_t* %.c, %jl_value_t** %3, align 8
store %jl_value_t** %.sub, %jl_value_t*** @jl_pgcstack, align 8
store %jl_value_t* null, %jl_value_t** %1, align 8
store %jl_value_t* null, %jl_value_t** %2, align 8
%5 = getelementptr [8 x %jl_value_t*]* %0, i64 0, i64 4
store %jl_value_t* null, %jl_value_t** %5, align 8
%6 = getelementptr [8 x %jl_value_t*]* %0, i64 0, i64 5
store %jl_value_t* null, %jl_value_t** %6, align 8
%7 = getelementptr [8 x %jl_value_t*]* %0, i64 0, i64 6
store %jl_value_t* null, %jl_value_t** %7, align 8
%8 = getelementptr [8 x %jl_value_t*]* %0, i64 0, i64 7
store %jl_value_t* null, %jl_value_t** %8, align 8
%9 = call %jl_value_t* @julia_sync_begin_535()
store %jl_value_t* inttoptr (i64 2274649264 to %jl_value_t*), %jl_value_t
** %2, align 8
%10 = load %jl_value_t* (%jl_value_t*, %jl_value_t**, i32)** inttoptr (i64
2274649264 to %jl_value_t* (%jl_value_t*, %jl_value_t**, i32)**), align 16
%11 = load %jl_value_t** inttoptr (i64 2212282536 to %jl_value_t**),
align 8
store %jl_value_t* %11, %jl_value_t** %5, align 8
%12 = load %jl_value_t** inttoptr (i64 2197909240 to %jl_value_t**),
align 8
store %jl_value_t* %12, %jl_value_t** %6, align 8
%13 = load %jl_value_t** inttoptr (i64 2197909288 to %jl_value_t**),
align 8
store %jl_value_t* %13, %jl_value_t** %7, align 8
%14 = load %jl_value_t** inttoptr (i64 2212418552 to %jl_value_t**),
align 8
store %jl_value_t* %14, %jl_value_t** %8, align 8
%15 = call %jl_value_t* %10(%jl_value_t* inttoptr (i64 2274649264 to %
jl_value_t*), %jl_value_t** %5, i32 4)
store %jl_value_t* %15, %jl_value_t** %1, align 8
call void @julia_sync_end_541()
%16 = load %jl_value_t** %3, align 8
%17 = getelementptr inbounds %jl_value_t* %16, i64 0, i32 0
store %jl_value_t** %17, %jl_value_t*** @jl_pgcstack, align 8
ret %jl_value_t* %15
}
This shows me that LLVM is not auto-vectorizing the inner part. *Is
there a way to tell LLVM to SIMD-vectorize the big summation in this
more optimized form?*
For completeness, here are the coefficient arrays (note: coefs should be
treated as variable. In principle those zeros could change so there's no
manual optimizing to be done there):
coefs
Vector Float64, 24
1.3333333333333328
0.16666666666666669
0.0
0.0
2.0
2.333333333333335
0.06250000000000001
0.0
2.0
1.1666666666666659
0.019097222222222224
0.0
1.0
3.469446951953614e-18
0.0
0.25
0.010416666666666666
0.0
0.0
0.0
0.0
0.0
0.0
0.0
powz
Vector Float64, 24
0.0
0.0
0.0
0.0
1.0
1.0
1.0
1.0
2.0
2.0
2.0
2.0
3.0
3.0
3.0
4.0
4.0
4.0
5.0
5.0
6.0
6.0
7.0
8.0
poww
Vector Float64, 24
2.0
4.0
6.0
8.0
0.0
2.0
4.0
6.0
0.0
2.0
4.0
6.0
0.0
2.0
4.0
0.0
2.0
4.0
0.0
2.0
0.0
2.0
0.0
0.0