Re: [julia-users] Telling if code is vectorised.

2016-09-14 Thread Yichao Yu
On Wed, Sep 14, 2016 at 10:33 AM, Ben Ward  wrote:

> Hi,
>
> I've written a simple function:
>
> function testfun2()
> a = 0
> @inbounds @simd for i in UInt64(1):UInt64(1000)
> i = i - ((i >> 1) & 0x5555555555555555)
> a += ((i & 0x3333333333333333) + ((i >> 2) & 0x3333333333333333))
> end
> return a
> end
>
>

Run your code through code_warntype first


> It applies the same set of bit operations to a series of UInt64's and
> accumulates a result.
>
> And I know from the Intel blog on vectorisation that, in the llvm code
> generated by julia, there is a set of block labels to look out for that
> indicate vectorised code: vector.body and vector.ph:
>
> I'm wondering why I don't see those instructions in the llvm generated for
> this function, I don't think I've violated any of the rules of writing
> loops that can be vectorised:
>
> julia> @code_llvm testfun2()
>
>
> define %jl_value_t* @julia_testfun2_70900() #0 {
>
> top:
>
>   %0 = call %jl_value_t*** @jl_get_ptls_states() #1
>
>   %1 = alloca [11 x %jl_value_t*], align 8
>
>   %.sub = getelementptr inbounds [11 x %jl_value_t*], [11 x %jl_value_t*]*
> %1, i64 0, i64 0
>
>   %2 = getelementptr [11 x %jl_value_t*], [11 x %jl_value_t*]* %1, i64 0,
> i64 8
>
>   %3 = getelementptr [11 x %jl_value_t*], [11 x %jl_value_t*]* %1, i64 0,
> i64 2
>
>   %a = getelementptr [11 x %jl_value_t*], [11 x %jl_value_t*]* %1, i64 0,
> i64 7
>
>   %4 = bitcast %jl_value_t** %2 to i8*
>
>   call void @llvm.memset.p0i8.i32(i8* %4, i8 0, i32 24, i32 8, i1 false)
>
>   %5 = bitcast [11 x %jl_value_t*]* %1 to i64*
>
>   %6 = bitcast %jl_value_t** %3 to i8*
>
>   call void @llvm.memset.p0i8.i64(i8* %6, i8 0, i64 40, i32 8, i1 false)
>
>   store i64 18, i64* %5, align 8
>
>   %7 = bitcast %jl_value_t*** %0 to i64*
>
>   %8 = load i64, i64* %7, align 8
>
>   %9 = getelementptr [11 x %jl_value_t*], [11 x %jl_value_t*]* %1, i64 0,
> i64 1
>
>   %10 = bitcast %jl_value_t** %9 to i64*
>
>   store i64 %8, i64* %10, align 8
>
>   store %jl_value_t** %.sub, %jl_value_t*** %0, align 8
>
>   %"r#274" = alloca %UnitRange.6, align 8
>
>   store %jl_value_t* inttoptr (i64 4400218208 to %jl_value_t*),
> %jl_value_t** %a, align 8
>
>   %11 = getelementptr inbounds %UnitRange.6, %UnitRange.6* %"r#274", i64
> 0, i32 0
>
>   store i64 1, i64* %11, align 8
>
>   %12 = getelementptr inbounds %UnitRange.6, %UnitRange.6* %"r#274", i64
> 0, i32 1
>
>   store i64 1000, i64* %12, align 8
>
>   %13 = call i64 @julia_simd_inner_length_70896(%UnitRange.6* nonnull
> %"r#274", i64 0) #0
>
>   %14 = icmp eq i64 %13, 0
>
>   br i1 %14, label %L.backedge, label %if26.lr.ph
>
>
> L8:   ; preds = %if26
>
>   %15 = load %jl_value_t*, %jl_value_t** %a, align 8
>
>   store %jl_value_t* %15, %jl_value_t** %40, align 8
>
>   %16 = getelementptr inbounds %jl_value_t, %jl_value_t* %15, i64 -1, i32 0
>
>   %17 = bitcast %jl_value_t** %16 to i64*
>
>   %18 = load i64, i64* %17, align 8
>
>   %19 = and i64 %18, -16
>
>   %20 = inttoptr i64 %19 to %jl_value_t*
>
>   %21 = icmp eq %jl_value_t* %20, inttoptr (i64 4400088496 to %jl_value_t*)
>
>   br i1 %21, label %L11, label %L10
>
>
> L10:  ; preds = %L8
>
>   %22 = load i64, i64* %46, align 8
>
>   store i64 %22, i64* %47, align 8
>
>   %23 = and i64 %53, 3689348814741910323
>
>   %24 = lshr i64 %53, 2
>
>   %25 = and i64 %24, 3689348814741910323
>
>   %26 = add nuw nsw i64 %25, %23
>
>   store %jl_value_t* inttoptr (i64 4408276712 to %jl_value_t*),
> %jl_value_t** %2, align 8
>
>   %27 = call %jl_value_t* @jl_box_uint64(i64 zeroext %26)
>
>   store %jl_value_t* %27, %jl_value_t** %44, align 8
>
>   %28 = call %jl_value_t* @jl_apply_generic(%jl_value_t** %2, i32 3)
>
>   store %jl_value_t* %28, %jl_value_t** %41, align 8
>
>   br label %L12
>
>
> L11:  ; preds = %L8, %if26
>
>   %"#temp#3.0" = phi %jl_value_t* [ inttoptr (i64 4409008592 to
> %jl_value_t*), %if26 ], [ inttoptr (i64 4423075536 to %jl_value_t*), %L8 ]
>
>   store %jl_value_t* %"#temp#3.0", %jl_value_t** %42, align 8
>
>   %29 = load i64, i64* %46, align 8
>
>   store i64 %29, i64* %47, align 8
>
>   %30 = and i64 %53, 3689348814741910323
>
>   %31 = lshr i64 %53, 2
>
>   %32 = and i64 %31, 3689348814741910323
>
>   %33 = add nuw nsw i64 %32, %30
>
>   store %jl_value_t* inttoptr (i64 4408276712 to %jl_value_t*),
> %jl_value_t** %2, align 8
>
>   %34 = call %jl_value_t* @jl_box_uint64(i64 zeroext %33)
>
>   store %jl_value_t* %34, %jl_value_t** %44, align 8
>
>   %35 = call %jl_value_t* @jl_invoke(%jl_value_t* %"#temp#3.0",
> %jl_value_t** %2, i32 3)
>
>   store %jl_value_t* %35, %jl_value_t** %43, align 8
>
>   br label %L12
>
>
> L12:  ; preds = %L11, %L10
>
>   %storemerge.in.in = phi %jl_value_t* [ %28, %L10 ], [ %35, %L11 ]
>
>   %storemerge.in = bitcast %jl_value_t* 

[julia-users] Telling if code is vectorised.

2016-09-14 Thread Ben Ward
Hi,

I've written a simple function:

function testfun2()
a = 0
@inbounds @simd for i in UInt64(1):UInt64(1000)
i = i - ((i >> 1) & 0x5555555555555555)
a += ((i & 0x3333333333333333) + ((i >> 2) & 0x3333333333333333))
end
return a
end

It applies the same set of bit operations to a series of UInt64's and
accumulates a result.

And I know from the Intel blog on vectorisation that, in the llvm code
generated by julia, there is a set of block labels to look out for that
indicate vectorised code: vector.body and vector.ph:

I'm wondering why I don't see those instructions in the llvm generated for 
this function, I don't think I've violated any of the rules of writing 
loops that can be vectorised:

julia> @code_llvm testfun2()


define %jl_value_t* @julia_testfun2_70900() #0 {

top:

  %0 = call %jl_value_t*** @jl_get_ptls_states() #1

  %1 = alloca [11 x %jl_value_t*], align 8

  %.sub = getelementptr inbounds [11 x %jl_value_t*], [11 x %jl_value_t*]* 
%1, i64 0, i64 0

  %2 = getelementptr [11 x %jl_value_t*], [11 x %jl_value_t*]* %1, i64 0, 
i64 8

  %3 = getelementptr [11 x %jl_value_t*], [11 x %jl_value_t*]* %1, i64 0, 
i64 2

  %a = getelementptr [11 x %jl_value_t*], [11 x %jl_value_t*]* %1, i64 0, 
i64 7

  %4 = bitcast %jl_value_t** %2 to i8*

  call void @llvm.memset.p0i8.i32(i8* %4, i8 0, i32 24, i32 8, i1 false)

  %5 = bitcast [11 x %jl_value_t*]* %1 to i64*

  %6 = bitcast %jl_value_t** %3 to i8*

  call void @llvm.memset.p0i8.i64(i8* %6, i8 0, i64 40, i32 8, i1 false)

  store i64 18, i64* %5, align 8

  %7 = bitcast %jl_value_t*** %0 to i64*

  %8 = load i64, i64* %7, align 8

  %9 = getelementptr [11 x %jl_value_t*], [11 x %jl_value_t*]* %1, i64 0, 
i64 1

  %10 = bitcast %jl_value_t** %9 to i64*

  store i64 %8, i64* %10, align 8

  store %jl_value_t** %.sub, %jl_value_t*** %0, align 8

  %"r#274" = alloca %UnitRange.6, align 8

  store %jl_value_t* inttoptr (i64 4400218208 to %jl_value_t*), 
%jl_value_t** %a, align 8

  %11 = getelementptr inbounds %UnitRange.6, %UnitRange.6* %"r#274", i64 0, 
i32 0

  store i64 1, i64* %11, align 8

  %12 = getelementptr inbounds %UnitRange.6, %UnitRange.6* %"r#274", i64 0, 
i32 1

  store i64 1000, i64* %12, align 8

  %13 = call i64 @julia_simd_inner_length_70896(%UnitRange.6* nonnull 
%"r#274", i64 0) #0

  %14 = icmp eq i64 %13, 0

  br i1 %14, label %L.backedge, label %if26.lr.ph


L8:   ; preds = %if26

  %15 = load %jl_value_t*, %jl_value_t** %a, align 8

  store %jl_value_t* %15, %jl_value_t** %40, align 8

  %16 = getelementptr inbounds %jl_value_t, %jl_value_t* %15, i64 -1, i32 0

  %17 = bitcast %jl_value_t** %16 to i64*

  %18 = load i64, i64* %17, align 8

  %19 = and i64 %18, -16

  %20 = inttoptr i64 %19 to %jl_value_t*

  %21 = icmp eq %jl_value_t* %20, inttoptr (i64 4400088496 to %jl_value_t*)

  br i1 %21, label %L11, label %L10


L10:  ; preds = %L8

  %22 = load i64, i64* %46, align 8

  store i64 %22, i64* %47, align 8

  %23 = and i64 %53, 3689348814741910323

  %24 = lshr i64 %53, 2

  %25 = and i64 %24, 3689348814741910323

  %26 = add nuw nsw i64 %25, %23

  store %jl_value_t* inttoptr (i64 4408276712 to %jl_value_t*), 
%jl_value_t** %2, align 8

  %27 = call %jl_value_t* @jl_box_uint64(i64 zeroext %26)

  store %jl_value_t* %27, %jl_value_t** %44, align 8

  %28 = call %jl_value_t* @jl_apply_generic(%jl_value_t** %2, i32 3)

  store %jl_value_t* %28, %jl_value_t** %41, align 8

  br label %L12


L11:  ; preds = %L8, %if26

  %"#temp#3.0" = phi %jl_value_t* [ inttoptr (i64 4409008592 to 
%jl_value_t*), %if26 ], [ inttoptr (i64 4423075536 to %jl_value_t*), %L8 ]

  store %jl_value_t* %"#temp#3.0", %jl_value_t** %42, align 8

  %29 = load i64, i64* %46, align 8

  store i64 %29, i64* %47, align 8

  %30 = and i64 %53, 3689348814741910323

  %31 = lshr i64 %53, 2

  %32 = and i64 %31, 3689348814741910323

  %33 = add nuw nsw i64 %32, %30

  store %jl_value_t* inttoptr (i64 4408276712 to %jl_value_t*), 
%jl_value_t** %2, align 8

  %34 = call %jl_value_t* @jl_box_uint64(i64 zeroext %33)

  store %jl_value_t* %34, %jl_value_t** %44, align 8

  %35 = call %jl_value_t* @jl_invoke(%jl_value_t* %"#temp#3.0", 
%jl_value_t** %2, i32 3)

  store %jl_value_t* %35, %jl_value_t** %43, align 8

  br label %L12


L12:  ; preds = %L11, %L10

  %storemerge.in.in = phi %jl_value_t* [ %28, %L10 ], [ %35, %L11 ]

  %storemerge.in = bitcast %jl_value_t* %storemerge.in.in to i64*

  %storemerge = load i64, i64* %storemerge.in, align 1

  %36 = call %jl_value_t* @jl_box_uint64(i64 zeroext %storemerge)

  store %jl_value_t* %36, %jl_value_t** %a, align 8

  %37 = add nuw i64 %"i#277.040", 1

  %exitcond = icmp eq i64 %37, %13

  br i1 %exitcond, label %L.backedge.loopexit, label %if26


L.backedge.loopexit:  ; preds = %L12