This is on Julia 0.5. Let's first look at a simple example:

# Build the N-tuple (1 + 1, 1 + 2, ..., 1 + N).
# `v` is intentionally unused in this variant.
function f{N}(::Type{Val{N}}, v::Vector{Int})
    return ntuple(Val{N}) do i
        1 + i
    end
end

The generated code for this looks simple and nice:

julia> @code_llvm f(Val{4}, [1,2,3,4])


define void @julia_f_50045([4 x i64]* sret, %jl_value_t*, %jl_value_t*) #0 {
top:
  %3 = alloca [4 x i64], align 8
  call void @julia_ntuple_50046([4 x i64]* nonnull sret %3, %jl_value_t* 
inttoptr (i64 140673751236272 to %jl_value_t*)) #0
  %.fca.0.gep = getelementptr inbounds [4 x i64], [4 x i64]* %3, i64 0, i64 
0
  %.fca.0.load = load i64, i64* %.fca.0.gep, align 8
  %.fca.0.insert = insertvalue [4 x i64] undef, i64 %.fca.0.load, 0
  %.fca.1.gep = getelementptr inbounds [4 x i64], [4 x i64]* %3, i64 0, i64 
1
  %.fca.1.load = load i64, i64* %.fca.1.gep, align 8
  %.fca.1.insert = insertvalue [4 x i64] %.fca.0.insert, i64 %.fca.1.load, 1
  %.fca.2.gep = getelementptr inbounds [4 x i64], [4 x i64]* %3, i64 0, i64 
2
  %.fca.2.load = load i64, i64* %.fca.2.gep, align 8
  %.fca.2.insert = insertvalue [4 x i64] %.fca.1.insert, i64 %.fca.2.load, 2
  %.fca.3.gep = getelementptr inbounds [4 x i64], [4 x i64]* %3, i64 0, i64 
3
  %.fca.3.load = load i64, i64* %.fca.3.gep, align 8
  %.fca.3.insert = insertvalue [4 x i64] %.fca.2.insert, i64 %.fca.3.load, 3
  store [4 x i64] %.fca.3.insert, [4 x i64]* %0, align 8
  ret void
}

On the other hand if we change f a little so it accesses v:


# Build the N-tuple (1 + v[1], ..., 1 + v[N]).
# Unlike `f`, the anonymous function passed to `ntuple` reads `v`.
function f2{N}(::Type{Val{N}}, v::Vector{Int})
    return ntuple(Val{N}) do i
        1 + v[i]
    end
end

We then get:

julia> @code_llvm f2(Val{4}, [1,2,3,4])

define void @julia_f2_50063([4 x i64]* sret, %jl_value_t*, %jl_value_t*) #0 
{
top:
  %3 = alloca [4 x i64], align 8
  %4 = call %jl_value_t*** @jl_get_ptls_states()
  %5 = alloca [5 x %jl_value_t*], align 8
  %.sub = getelementptr inbounds [5 x %jl_value_t*], [5 x %jl_value_t*]* 
%5, i64 0, i64 0
  %6 = getelementptr [5 x %jl_value_t*], [5 x %jl_value_t*]* %5, i64 0, i64 
2
  store %jl_value_t* null, %jl_value_t** %6, align 8
  %7 = getelementptr [5 x %jl_value_t*], [5 x %jl_value_t*]* %5, i64 0, i64 
3
  store %jl_value_t* null, %jl_value_t** %7, align 8
  %8 = getelementptr [5 x %jl_value_t*], [5 x %jl_value_t*]* %5, i64 0, i64 
4
  store %jl_value_t* null, %jl_value_t** %8, align 8
  %9 = bitcast [5 x %jl_value_t*]* %5 to i64*
  store i64 6, i64* %9, align 8
  %10 = getelementptr [5 x %jl_value_t*], [5 x %jl_value_t*]* %5, i64 0, 
i64 1
  %11 = bitcast %jl_value_t*** %4 to i64*
  %12 = load i64, i64* %11, align 8
  %13 = bitcast %jl_value_t** %10 to i64*
  store i64 %12, i64* %13, align 8
  store %jl_value_t** %.sub, %jl_value_t*** %4, align 8
  %14 = call %jl_value_t* @jl_gc_alloc_1w()
  store %jl_value_t* %14, %jl_value_t** %6, align 8
  %15 = getelementptr inbounds %jl_value_t, %jl_value_t* %14, i64 -1, i32 0
  store %jl_value_t* inttoptr (i64 140673762914912 to %jl_value_t*), 
%jl_value_t** %15, align 8
  store %jl_value_t* %2, %jl_value_t** %7, align 8
  %16 = getelementptr inbounds %jl_value_t, %jl_value_t* %14, i64 0, i32 0
  store %jl_value_t* %2, %jl_value_t** %16, align 8
  store %jl_value_t* %14, %jl_value_t** %8, align 8
*  call void @julia_ntuple_50064([4 x i64]* nonnull sret %3, %jl_value_t* 
%14, %jl_value_t* inttoptr (i64 140673751236272 to %jl_value_t*)) #0*
 * %.fca.0.gep = getelementptr inbounds [4 x i64], [4 x i64]* %3, i64 0, 
i64 0*
*  %.fca.0.load = load i64, i64* %.fca.0.gep, align 8*
*  %.fca.0.insert = insertvalue [4 x i64] undef, i64 %.fca.0.load, 0*
*  %.fca.1.gep = getelementptr inbounds [4 x i64], [4 x i64]* %3, i64 0, 
i64 1*
*  %.fca.1.load = load i64, i64* %.fca.1.gep, align 8*
*  %.fca.1.insert = insertvalue [4 x i64] %.fca.0.insert, i64 %.fca.1.load, 
1*
*  %.fca.2.gep = getelementptr inbounds [4 x i64], [4 x i64]* %3, i64 0, 
i64 2*
*  %.fca.2.load = load i64, i64* %.fca.2.gep, align 8*
*  %.fca.2.insert = insertvalue [4 x i64] %.fca.1.insert, i64 %.fca.2.load, 
2*
*  %.fca.3.gep = getelementptr inbounds [4 x i64], [4 x i64]* %3, i64 0, 
i64 3*
*  %.fca.3.load = load i64, i64* %.fca.3.gep, align 8*
*  %.fca.3.insert = insertvalue [4 x i64] %.fca.2.insert, i64 %.fca.3.load, 
3*
*  store [4 x i64] %.fca.3.insert, [4 x i64]* %0, align 8*
*  %17 = load i64, i64* %13, align 8*
*  store i64 %17, i64* %11, align 8*
*  ret void*
}

The bold text (marked with asterisks) is very similar to the previous 
function, but there is a large chunk of allocations and stores above it, as 
well as a call to jl_get_ptls_states().

A generated function that does the same works well and does not allocate:

# Generated equivalent of the ntuple-based `f2`: unrolls at compile time to the
# tuple expression (1 + v[1], 1 + v[2], ..., 1 + v[N]), so no closure over `v`
# is created.
@generated function f2{N}(::Type{Val{N}}, v::Vector{Int})
    # Each element must be `1 + v[i]` (not `i + v[i]`) to match the ntuple version.
    return Expr(:tuple, [:(1 + v[$i]) for i in 1:N]...)
end

but I would like to avoid @generated by using ntuple instead. Does anyone 
have experience with how to use ntuple like this without getting 
allocations?

Thanks!

// Kristoffer


Reply via email to