Hi DNF,
I get the results below on Julia 0.5 (home-built) and Julia 0.4 (downloaded).
A clear difference is the presence of a vector block in the output of
'code_llvm(innersimd, Tuple{Vector{Float32},Vector{Float32}})'.
Regards,
Rob
> On Nov 6, 2015, at 3:53 AM, DNF <[email protected]> wrote:
>
> On Friday, November 6, 2015 at 12:20:38 PM UTC+1, Giuseppe Ragusa wrote:
> I am pretty sure it must be something specific to your installation.
>
> Do you mean my Julia installation?
Julia Version 0.5.0-dev+1158
Commit 20786d2* (2015-11-05 14:13 UTC)
Platform Info:
System: Darwin (x86_64-apple-darwin15.0.0)
CPU: Intel(R) Core(TM) i7-4980HQ CPU @ 2.80GHz
WORD_SIZE: 64
BLAS: libopenblas (USE64BITINT DYNAMIC_ARCH NO_AFFINITY Haswell)
LAPACK: libopenblas64_
LIBM: libopenlibm
LLVM: libLLVM-3.3
First call to timeit(1000,1000):
GFlop = 2.4158674171961443
GFlop (SIMD) = 14.63560990245366
Second call to timeit(1000,1000):
GFlop = 2.3526062760477626
GFlop (SIMD) = 16.769379113738314
define float @julia_innersimd_23136(%jl_value_t*, %jl_value_t*) {
L:
%2 = getelementptr inbounds %jl_value_t* %0, i64 1
%3 = bitcast %jl_value_t* %2 to i64*
%4 = load i64* %3, align 8
%5 = icmp sgt i64 %4, 0
%6 = select i1 %5, i64 %4, i64 0
%7 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %6, i64 1)
%8 = extractvalue { i64, i1 } %7, 1
br i1 %8, label %fail, label %pass
fail: ; preds = %L
%9 = load %jl_value_t** @jl_overflow_exception, align 8
call void @jl_throw(%jl_value_t* %9)
unreachable
pass: ; preds = %L
%10 = extractvalue { i64, i1 } %7, 0
%11 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %10, i64 1)
%12 = extractvalue { i64, i1 } %11, 1
br i1 %12, label %fail1, label %pass2
fail1: ; preds = %pass
%13 = load %jl_value_t** @jl_overflow_exception, align 8
call void @jl_throw(%jl_value_t* %13)
unreachable
pass2: ; preds = %pass
%14 = extractvalue { i64, i1 } %11, 0
%15 = icmp slt i64 %14, 1
br i1 %15, label %L11, label %if3
if3: ; preds = %pass2
%16 = bitcast %jl_value_t* %1 to i8**
%17 = bitcast %jl_value_t* %0 to i8**
%18 = load i8** %17, align 8
%19 = load i8** %16, align 8
%n.mod.vf = urem i64 %14, 24
%cmp.zero = icmp eq i64 %14, %n.mod.vf
br i1 %cmp.zero, label %middle.block, label %vector.ph
vector.ph: ; preds = %if3
%n.vec = sub i64 %14, %n.mod.vf
%20 = sub i64 %n.mod.vf, %14
br label %vector.body
vector.body: ; preds = %vector.body,
%vector.ph
%lsr.iv41 = phi i64 [ %lsr.iv.next42, %vector.body ], [ 0, %vector.ph ]
%vec.phi = phi <8 x float> [ zeroinitializer, %vector.ph ], [ %29,
%vector.body ]
%vec.phi12 = phi <8 x float> [ zeroinitializer, %vector.ph ], [ %30,
%vector.body ]
%vec.phi13 = phi <8 x float> [ zeroinitializer, %vector.ph ], [ %31,
%vector.body ]
%21 = mul i64 %lsr.iv41, -4
%uglygep60 = getelementptr i8* %18, i64 %21
%uglygep6061 = bitcast i8* %uglygep60 to <8 x float>*
%wide.load = load <8 x float>* %uglygep6061, align 4
%22 = mul i64 %lsr.iv41, -4
%sunkaddr = ptrtoint i8* %18 to i64
%sunkaddr62 = add i64 %sunkaddr, %22
%sunkaddr63 = add i64 %sunkaddr62, 32
%sunkaddr64 = inttoptr i64 %sunkaddr63 to <8 x float>*
%wide.load16 = load <8 x float>* %sunkaddr64, align 4
%23 = mul i64 %lsr.iv41, -4
%sunkaddr65 = ptrtoint i8* %18 to i64
%sunkaddr66 = add i64 %sunkaddr65, %23
%sunkaddr67 = add i64 %sunkaddr66, 64
%sunkaddr68 = inttoptr i64 %sunkaddr67 to <8 x float>*
%wide.load17 = load <8 x float>* %sunkaddr68, align 4
%24 = mul i64 %lsr.iv41, -4
%uglygep = getelementptr i8* %19, i64 %24
%uglygep43 = bitcast i8* %uglygep to <8 x float>*
%wide.load18 = load <8 x float>* %uglygep43, align 4
%sunkaddr69 = ptrtoint i8* %19 to i64
%sunkaddr70 = add i64 %sunkaddr69, %24
%sunkaddr71 = add i64 %sunkaddr70, 32
%sunkaddr72 = inttoptr i64 %sunkaddr71 to <8 x float>*
%wide.load19 = load <8 x float>* %sunkaddr72, align 4
%25 = mul i64 %lsr.iv41, -4
%sunkaddr73 = ptrtoint i8* %19 to i64
%sunkaddr74 = add i64 %sunkaddr73, %25
%sunkaddr75 = add i64 %sunkaddr74, 64
%sunkaddr76 = inttoptr i64 %sunkaddr75 to <8 x float>*
%wide.load20 = load <8 x float>* %sunkaddr76, align 4
%26 = fmul <8 x float> %wide.load, %wide.load18
%27 = fmul <8 x float> %wide.load16, %wide.load19
%28 = fmul <8 x float> %wide.load17, %wide.load20
%29 = fadd <8 x float> %vec.phi, %26
%30 = fadd <8 x float> %vec.phi12, %27
%31 = fadd <8 x float> %vec.phi13, %28
%lsr.iv.next42 = add i64 %lsr.iv41, -24
%32 = icmp eq i64 %20, %lsr.iv.next42
br i1 %32, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body, %if3
%resume.val = phi i64 [ 0, %if3 ], [ %n.vec, %vector.body ]
%rdx.vec.exit.phi = phi <8 x float> [ zeroinitializer, %if3 ], [ %29,
%vector.body ]
%rdx.vec.exit.phi23 = phi <8 x float> [ zeroinitializer, %if3 ], [ %30,
%vector.body ]
%rdx.vec.exit.phi24 = phi <8 x float> [ zeroinitializer, %if3 ], [ %31,
%vector.body ]
%bin.rdx = fadd <8 x float> %rdx.vec.exit.phi23, %rdx.vec.exit.phi
%bin.rdx25 = fadd <8 x float> %rdx.vec.exit.phi24, %bin.rdx
%rdx.shuf = shufflevector <8 x float> %bin.rdx25, <8 x float> undef, <8 x
i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
%bin.rdx26 = fadd <8 x float> %bin.rdx25, %rdx.shuf
%rdx.shuf27 = shufflevector <8 x float> %bin.rdx26, <8 x float> undef, <8 x
i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32
undef>
%bin.rdx28 = fadd <8 x float> %bin.rdx26, %rdx.shuf27
%rdx.shuf29 = shufflevector <8 x float> %bin.rdx28, <8 x float> undef, <8 x
i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef,
i32 undef>
%bin.rdx30 = fadd <8 x float> %bin.rdx28, %rdx.shuf29
%33 = extractelement <8 x float> %bin.rdx30, i32 0
%cmp.n = icmp eq i64 %14, %resume.val
br i1 %cmp.n, label %L11, label %L5.preheader
L5.preheader: ; preds = %middle.block
%34 = mul i64 %resume.val, 4
%scevgep = getelementptr i8* %19, i64 %34
%scevgep36 = getelementptr i8* %18, i64 %34
<SNIPPED>
-----------------------------------------------------------------------------------------------
Julia Version 0.4.0
Commit 0ff703b* (2015-10-08 06:20 UTC)
Platform Info:
System: Darwin (x86_64-apple-darwin13.4.0)
CPU: Intel(R) Core(TM) i7-4980HQ CPU @ 2.80GHz
WORD_SIZE: 64
BLAS: libopenblas (USE64BITINT DYNAMIC_ARCH NO_AFFINITY Haswell)
LAPACK: libopenblas64_
LIBM: libopenlibm
LLVM: libLLVM-3.3
First call to timeit(1000,1000):
GFlop = 2.552215131317849
GFlop (SIMD) = 13.911108019753776
Second call to timeit(1000,1000):
GFlop = 2.553179538308544
GFlop (SIMD) = 14.156285390713476
define float @julia_innersimd_24595(%jl_value_t*, %jl_value_t*) {
L:
%2 = getelementptr inbounds %jl_value_t* %0, i64 1
%3 = bitcast %jl_value_t* %2 to i64*
%4 = load i64* %3, align 8
%5 = icmp sgt i64 %4, 0
%6 = select i1 %5, i64 %4, i64 0
%7 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %6, i64 1)
%8 = extractvalue { i64, i1 } %7, 1
br i1 %8, label %fail, label %pass
fail: ; preds = %L
%9 = load %jl_value_t** @jl_overflow_exception, align 8
call void @jl_throw_with_superfluous_argument(%jl_value_t* %9, i32 67)
unreachable
pass: ; preds = %L
%10 = extractvalue { i64, i1 } %7, 0
%11 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %10, i64 1)
%12 = extractvalue { i64, i1 } %11, 1
br i1 %12, label %fail1, label %pass2
fail1: ; preds = %pass
%13 = load %jl_value_t** @jl_overflow_exception, align 8
call void @jl_throw_with_superfluous_argument(%jl_value_t* %13, i32 67)
unreachable
pass2: ; preds = %pass
%14 = extractvalue { i64, i1 } %11, 0
%15 = icmp slt i64 %14, 1
br i1 %15, label %L11, label %if3
if3: ; preds = %pass2
%16 = bitcast %jl_value_t* %1 to i8**
%17 = bitcast %jl_value_t* %0 to i8**
%18 = load i8** %17, align 8
%19 = load i8** %16, align 8
%n.vec = and i64 %14, -8
%cmp.zero = icmp eq i64 %n.vec, 0
br i1 %cmp.zero, label %middle.block, label %vector.body.preheader
vector.body.preheader: ; preds = %if3
br label %vector.body
vector.body: ; preds = %vector.body,
%vector.body.preheader
%lsr.iv32 = phi i64 [ 0, %vector.body.preheader ], [ %lsr.iv.next33,
%vector.body ]
%vec.phi = phi <4 x float> [ %25, %vector.body ], [ zeroinitializer,
%vector.body.preheader ]
%vec.phi13 = phi <4 x float> [ %26, %vector.body ], [ zeroinitializer,
%vector.body.preheader ]
%20 = mul i64 %lsr.iv32, -4
%uglygep43 = getelementptr i8* %18, i64 %20
%uglygep4344 = bitcast i8* %uglygep43 to <4 x float>*
%wide.load = load <4 x float>* %uglygep4344, align 4
%21 = mul i64 %lsr.iv32, -4
%sunkaddr = ptrtoint i8* %18 to i64
%sunkaddr45 = add i64 %sunkaddr, %21
%sunkaddr46 = add i64 %sunkaddr45, 16
%sunkaddr47 = inttoptr i64 %sunkaddr46 to <4 x float>*
%wide.load14 = load <4 x float>* %sunkaddr47, align 4
%22 = mul i64 %lsr.iv32, -4
%uglygep = getelementptr i8* %19, i64 %22
%uglygep34 = bitcast i8* %uglygep to <4 x float>*
%wide.load15 = load <4 x float>* %uglygep34, align 4
%sunkaddr48 = ptrtoint i8* %19 to i64
%sunkaddr49 = add i64 %sunkaddr48, %22
%sunkaddr50 = add i64 %sunkaddr49, 16
%sunkaddr51 = inttoptr i64 %sunkaddr50 to <4 x float>*
%wide.load16 = load <4 x float>* %sunkaddr51, align 4
%23 = fmul <4 x float> %wide.load, %wide.load15
%24 = fmul <4 x float> %wide.load14, %wide.load16
%25 = fadd <4 x float> %vec.phi, %23
%26 = fadd <4 x float> %vec.phi13, %24
%lsr.iv.next33 = add i64 %lsr.iv32, -8
%27 = add i64 %n.vec, %lsr.iv.next33
%28 = icmp eq i64 %27, 0
br i1 %28, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body, %if3
%resume.val = phi i64 [ 0, %if3 ], [ %n.vec, %vector.body ]
%rdx.vec.exit.phi = phi <4 x float> [ zeroinitializer, %if3 ], [ %25,
%vector.body ]
%rdx.vec.exit.phi19 = phi <4 x float> [ zeroinitializer, %if3 ], [ %26,
%vector.body ]
%bin.rdx = fadd <4 x float> %rdx.vec.exit.phi19, %rdx.vec.exit.phi
%rdx.shuf = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32>
<i32 2, i32 3, i32 undef, i32 undef>
%bin.rdx20 = fadd <4 x float> %bin.rdx, %rdx.shuf
%rdx.shuf21 = shufflevector <4 x float> %bin.rdx20, <4 x float> undef, <4 x
i32> <i32 1, i32 undef, i32 undef, i32 undef>
%bin.rdx22 = fadd <4 x float> %bin.rdx20, %rdx.shuf21
%29 = extractelement <4 x float> %bin.rdx22, i32 0
%cmp.n = icmp eq i64 %14, %resume.val
br i1 %cmp.n, label %L11, label %L5.preheader
L5.preheader: ; preds = %middle.block
%30 = mul i64 %resume.val, 4
%scevgep = getelementptr i8* %19, i64 %30
<SNIPPED>