Hi DNF,

I get the results below on Julia 0.5 (home-built) and Julia 0.4 (downloaded).

A clear difference is the presence of a vector block in the output of
'code_llvm(innersimd, Tuple{Vector{Float32},Vector{Float32}})'.

Regards,
Rob


> On Nov 6, 2015, at 3:53 AM, DNF <[email protected]> wrote:
> 
> On Friday, November 6, 2015 at 12:20:38 PM UTC+1, Giuseppe Ragusa wrote:
> I am pretty sure it must be something specific to your installation.
> 
> Do you mean my Julia installation? 


Julia Version 0.5.0-dev+1158
Commit 20786d2* (2015-11-05 14:13 UTC)
Platform Info:
  System: Darwin (x86_64-apple-darwin15.0.0)
  CPU: Intel(R) Core(TM) i7-4980HQ CPU @ 2.80GHz
  WORD_SIZE: 64
  BLAS: libopenblas (USE64BITINT DYNAMIC_ARCH NO_AFFINITY Haswell)
  LAPACK: libopenblas64_
  LIBM: libopenlibm
  LLVM: libLLVM-3.3

First call to timeit(1000,1000):

GFlop        = 2.4158674171961443
GFlop (SIMD) = 14.63560990245366

Second call to timeit(1000,1000):

GFlop        = 2.3526062760477626
GFlop (SIMD) = 16.769379113738314


define float @julia_innersimd_23136(%jl_value_t*, %jl_value_t*) {
L:
  %2 = getelementptr inbounds %jl_value_t* %0, i64 1
  %3 = bitcast %jl_value_t* %2 to i64*
  %4 = load i64* %3, align 8
  %5 = icmp sgt i64 %4, 0
  %6 = select i1 %5, i64 %4, i64 0
  %7 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %6, i64 1)
  %8 = extractvalue { i64, i1 } %7, 1
  br i1 %8, label %fail, label %pass

fail:                                             ; preds = %L
  %9 = load %jl_value_t** @jl_overflow_exception, align 8
  call void @jl_throw(%jl_value_t* %9)
  unreachable

pass:                                             ; preds = %L
  %10 = extractvalue { i64, i1 } %7, 0
  %11 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %10, i64 1)
  %12 = extractvalue { i64, i1 } %11, 1
  br i1 %12, label %fail1, label %pass2

fail1:                                            ; preds = %pass
  %13 = load %jl_value_t** @jl_overflow_exception, align 8
  call void @jl_throw(%jl_value_t* %13)
  unreachable

pass2:                                            ; preds = %pass
  %14 = extractvalue { i64, i1 } %11, 0
  %15 = icmp slt i64 %14, 1
  br i1 %15, label %L11, label %if3

if3:                                              ; preds = %pass2
  %16 = bitcast %jl_value_t* %1 to i8**
  %17 = bitcast %jl_value_t* %0 to i8**
  %18 = load i8** %17, align 8
  %19 = load i8** %16, align 8
  %n.mod.vf = urem i64 %14, 24
  %cmp.zero = icmp eq i64 %14, %n.mod.vf
  br i1 %cmp.zero, label %middle.block, label %vector.ph

vector.ph:                                        ; preds = %if3
  %n.vec = sub i64 %14, %n.mod.vf
  %20 = sub i64 %n.mod.vf, %14
  br label %vector.body

vector.body:                                      ; preds = %vector.body, 
%vector.ph
  %lsr.iv41 = phi i64 [ %lsr.iv.next42, %vector.body ], [ 0, %vector.ph ]
  %vec.phi = phi <8 x float> [ zeroinitializer, %vector.ph ], [ %29, 
%vector.body ]
  %vec.phi12 = phi <8 x float> [ zeroinitializer, %vector.ph ], [ %30, 
%vector.body ]
  %vec.phi13 = phi <8 x float> [ zeroinitializer, %vector.ph ], [ %31, 
%vector.body ]
  %21 = mul i64 %lsr.iv41, -4
  %uglygep60 = getelementptr i8* %18, i64 %21
  %uglygep6061 = bitcast i8* %uglygep60 to <8 x float>*
  %wide.load = load <8 x float>* %uglygep6061, align 4
  %22 = mul i64 %lsr.iv41, -4
  %sunkaddr = ptrtoint i8* %18 to i64
  %sunkaddr62 = add i64 %sunkaddr, %22
  %sunkaddr63 = add i64 %sunkaddr62, 32
  %sunkaddr64 = inttoptr i64 %sunkaddr63 to <8 x float>*
  %wide.load16 = load <8 x float>* %sunkaddr64, align 4
  %23 = mul i64 %lsr.iv41, -4
  %sunkaddr65 = ptrtoint i8* %18 to i64
  %sunkaddr66 = add i64 %sunkaddr65, %23
  %sunkaddr67 = add i64 %sunkaddr66, 64
  %sunkaddr68 = inttoptr i64 %sunkaddr67 to <8 x float>*
  %wide.load17 = load <8 x float>* %sunkaddr68, align 4
  %24 = mul i64 %lsr.iv41, -4
  %uglygep = getelementptr i8* %19, i64 %24
  %uglygep43 = bitcast i8* %uglygep to <8 x float>*
  %wide.load18 = load <8 x float>* %uglygep43, align 4
  %sunkaddr69 = ptrtoint i8* %19 to i64
  %sunkaddr70 = add i64 %sunkaddr69, %24
  %sunkaddr71 = add i64 %sunkaddr70, 32
  %sunkaddr72 = inttoptr i64 %sunkaddr71 to <8 x float>*
  %wide.load19 = load <8 x float>* %sunkaddr72, align 4
  %25 = mul i64 %lsr.iv41, -4
  %sunkaddr73 = ptrtoint i8* %19 to i64
  %sunkaddr74 = add i64 %sunkaddr73, %25
  %sunkaddr75 = add i64 %sunkaddr74, 64
  %sunkaddr76 = inttoptr i64 %sunkaddr75 to <8 x float>*
  %wide.load20 = load <8 x float>* %sunkaddr76, align 4
  %26 = fmul <8 x float> %wide.load, %wide.load18
  %27 = fmul <8 x float> %wide.load16, %wide.load19
  %28 = fmul <8 x float> %wide.load17, %wide.load20
  %29 = fadd <8 x float> %vec.phi, %26
  %30 = fadd <8 x float> %vec.phi12, %27
  %31 = fadd <8 x float> %vec.phi13, %28
  %lsr.iv.next42 = add i64 %lsr.iv41, -24
  %32 = icmp eq i64 %20, %lsr.iv.next42
  br i1 %32, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body, %if3
  %resume.val = phi i64 [ 0, %if3 ], [ %n.vec, %vector.body ]
  %rdx.vec.exit.phi = phi <8 x float> [ zeroinitializer, %if3 ], [ %29, 
%vector.body ]
  %rdx.vec.exit.phi23 = phi <8 x float> [ zeroinitializer, %if3 ], [ %30, 
%vector.body ]
  %rdx.vec.exit.phi24 = phi <8 x float> [ zeroinitializer, %if3 ], [ %31, 
%vector.body ]
  %bin.rdx = fadd <8 x float> %rdx.vec.exit.phi23, %rdx.vec.exit.phi
  %bin.rdx25 = fadd <8 x float> %rdx.vec.exit.phi24, %bin.rdx
  %rdx.shuf = shufflevector <8 x float> %bin.rdx25, <8 x float> undef, <8 x 
i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx26 = fadd <8 x float> %bin.rdx25, %rdx.shuf
  %rdx.shuf27 = shufflevector <8 x float> %bin.rdx26, <8 x float> undef, <8 x 
i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 
undef>
  %bin.rdx28 = fadd <8 x float> %bin.rdx26, %rdx.shuf27
  %rdx.shuf29 = shufflevector <8 x float> %bin.rdx28, <8 x float> undef, <8 x 
i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, 
i32 undef>
  %bin.rdx30 = fadd <8 x float> %bin.rdx28, %rdx.shuf29
  %33 = extractelement <8 x float> %bin.rdx30, i32 0
  %cmp.n = icmp eq i64 %14, %resume.val
  br i1 %cmp.n, label %L11, label %L5.preheader

L5.preheader:                                     ; preds = %middle.block
  %34 = mul i64 %resume.val, 4
  %scevgep = getelementptr i8* %19, i64 %34
  %scevgep36 = getelementptr i8* %18, i64 %34

<SNIPPED>

-----------------------------------------------------------------------------------------------

Julia Version 0.4.0
Commit 0ff703b* (2015-10-08 06:20 UTC)
Platform Info:
  System: Darwin (x86_64-apple-darwin13.4.0)
  CPU: Intel(R) Core(TM) i7-4980HQ CPU @ 2.80GHz
  WORD_SIZE: 64
  BLAS: libopenblas (USE64BITINT DYNAMIC_ARCH NO_AFFINITY Haswell)
  LAPACK: libopenblas64_
  LIBM: libopenlibm
  LLVM: libLLVM-3.3

First call to timeit(1000,1000):

GFlop        = 2.552215131317849
GFlop (SIMD) = 13.911108019753776

Second call to timeit(1000,1000):

GFlop        = 2.553179538308544
GFlop (SIMD) = 14.156285390713476


define float @julia_innersimd_24595(%jl_value_t*, %jl_value_t*) {
L:
  %2 = getelementptr inbounds %jl_value_t* %0, i64 1
  %3 = bitcast %jl_value_t* %2 to i64*
  %4 = load i64* %3, align 8
  %5 = icmp sgt i64 %4, 0
  %6 = select i1 %5, i64 %4, i64 0
  %7 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %6, i64 1)
  %8 = extractvalue { i64, i1 } %7, 1
  br i1 %8, label %fail, label %pass

fail:                                             ; preds = %L
  %9 = load %jl_value_t** @jl_overflow_exception, align 8
  call void @jl_throw_with_superfluous_argument(%jl_value_t* %9, i32 67)
  unreachable

pass:                                             ; preds = %L
  %10 = extractvalue { i64, i1 } %7, 0
  %11 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %10, i64 1)
  %12 = extractvalue { i64, i1 } %11, 1
  br i1 %12, label %fail1, label %pass2

fail1:                                            ; preds = %pass
  %13 = load %jl_value_t** @jl_overflow_exception, align 8
  call void @jl_throw_with_superfluous_argument(%jl_value_t* %13, i32 67)
  unreachable

pass2:                                            ; preds = %pass
  %14 = extractvalue { i64, i1 } %11, 0
  %15 = icmp slt i64 %14, 1
  br i1 %15, label %L11, label %if3

if3:                                              ; preds = %pass2
  %16 = bitcast %jl_value_t* %1 to i8**
  %17 = bitcast %jl_value_t* %0 to i8**
  %18 = load i8** %17, align 8
  %19 = load i8** %16, align 8
  %n.vec = and i64 %14, -8
  %cmp.zero = icmp eq i64 %n.vec, 0
  br i1 %cmp.zero, label %middle.block, label %vector.body.preheader

vector.body.preheader:                            ; preds = %if3
  br label %vector.body

vector.body:                                      ; preds = %vector.body, 
%vector.body.preheader
  %lsr.iv32 = phi i64 [ 0, %vector.body.preheader ], [ %lsr.iv.next33, 
%vector.body ]
  %vec.phi = phi <4 x float> [ %25, %vector.body ], [ zeroinitializer, 
%vector.body.preheader ]
  %vec.phi13 = phi <4 x float> [ %26, %vector.body ], [ zeroinitializer, 
%vector.body.preheader ]
  %20 = mul i64 %lsr.iv32, -4
  %uglygep43 = getelementptr i8* %18, i64 %20
  %uglygep4344 = bitcast i8* %uglygep43 to <4 x float>*
  %wide.load = load <4 x float>* %uglygep4344, align 4
  %21 = mul i64 %lsr.iv32, -4
  %sunkaddr = ptrtoint i8* %18 to i64
  %sunkaddr45 = add i64 %sunkaddr, %21
  %sunkaddr46 = add i64 %sunkaddr45, 16
  %sunkaddr47 = inttoptr i64 %sunkaddr46 to <4 x float>*
  %wide.load14 = load <4 x float>* %sunkaddr47, align 4
  %22 = mul i64 %lsr.iv32, -4
  %uglygep = getelementptr i8* %19, i64 %22
  %uglygep34 = bitcast i8* %uglygep to <4 x float>*
  %wide.load15 = load <4 x float>* %uglygep34, align 4
  %sunkaddr48 = ptrtoint i8* %19 to i64
  %sunkaddr49 = add i64 %sunkaddr48, %22
  %sunkaddr50 = add i64 %sunkaddr49, 16
  %sunkaddr51 = inttoptr i64 %sunkaddr50 to <4 x float>*
  %wide.load16 = load <4 x float>* %sunkaddr51, align 4
  %23 = fmul <4 x float> %wide.load, %wide.load15
  %24 = fmul <4 x float> %wide.load14, %wide.load16
  %25 = fadd <4 x float> %vec.phi, %23
  %26 = fadd <4 x float> %vec.phi13, %24
  %lsr.iv.next33 = add i64 %lsr.iv32, -8
  %27 = add i64 %n.vec, %lsr.iv.next33
  %28 = icmp eq i64 %27, 0
  br i1 %28, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body, %if3
  %resume.val = phi i64 [ 0, %if3 ], [ %n.vec, %vector.body ]
  %rdx.vec.exit.phi = phi <4 x float> [ zeroinitializer, %if3 ], [ %25, 
%vector.body ]
  %rdx.vec.exit.phi19 = phi <4 x float> [ zeroinitializer, %if3 ], [ %26, 
%vector.body ]
  %bin.rdx = fadd <4 x float> %rdx.vec.exit.phi19, %rdx.vec.exit.phi
  %rdx.shuf = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> 
<i32 2, i32 3, i32 undef, i32 undef>
  %bin.rdx20 = fadd <4 x float> %bin.rdx, %rdx.shuf
  %rdx.shuf21 = shufflevector <4 x float> %bin.rdx20, <4 x float> undef, <4 x 
i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx22 = fadd <4 x float> %bin.rdx20, %rdx.shuf21
  %29 = extractelement <4 x float> %bin.rdx22, i32 0
  %cmp.n = icmp eq i64 %14, %resume.val
  br i1 %cmp.n, label %L11, label %L5.preheader

L5.preheader:                                     ; preds = %middle.block
  %30 = mul i64 %resume.val, 4
  %scevgep = getelementptr i8* %19, i64 %30

<SNIPPED>

Reply via email to