This is on Julia 0.5. Let's first look at a simple example:
# Minimal baseline: build the N-tuple (1 + 1, 1 + 2, ..., 1 + N) with `ntuple`,
# passing the length in the type domain as `Val{N}`.
# The anonymous function only uses its own argument `i`; it captures nothing.
function f{N}(::Type{Val{N}}, v::Vector{Int}) # yes v is unused here
# The assignment is the last expression, so the tuple is also the return value.
q = ntuple(i -> 1 + i, Val{N})
end
The generated code for this looks simple and nice:
julia> @code_llvm f(Val{4}, [1,2,3,4])
define void @julia_f_50045([4 x i64]* sret, %jl_value_t*, %jl_value_t*) #0 {
top:
%3 = alloca [4 x i64], align 8
call void @julia_ntuple_50046([4 x i64]* nonnull sret %3, %jl_value_t*
inttoptr (i64 140673751236272 to %jl_value_t*)) #0
%.fca.0.gep = getelementptr inbounds [4 x i64], [4 x i64]* %3, i64 0, i64
0
%.fca.0.load = load i64, i64* %.fca.0.gep, align 8
%.fca.0.insert = insertvalue [4 x i64] undef, i64 %.fca.0.load, 0
%.fca.1.gep = getelementptr inbounds [4 x i64], [4 x i64]* %3, i64 0, i64
1
%.fca.1.load = load i64, i64* %.fca.1.gep, align 8
%.fca.1.insert = insertvalue [4 x i64] %.fca.0.insert, i64 %.fca.1.load, 1
%.fca.2.gep = getelementptr inbounds [4 x i64], [4 x i64]* %3, i64 0, i64
2
%.fca.2.load = load i64, i64* %.fca.2.gep, align 8
%.fca.2.insert = insertvalue [4 x i64] %.fca.1.insert, i64 %.fca.2.load, 2
%.fca.3.gep = getelementptr inbounds [4 x i64], [4 x i64]* %3, i64 0, i64
3
%.fca.3.load = load i64, i64* %.fca.3.gep, align 8
%.fca.3.insert = insertvalue [4 x i64] %.fca.2.insert, i64 %.fca.3.load, 3
store [4 x i64] %.fca.3.insert, [4 x i64]* %0, align 8
ret void
}
On the other hand, if we change f a little so that it accesses v:
# Variant of `f` whose anonymous function reads `v[i]`, i.e. it is a closure
# that captures the argument `v`. Builds (1 + v[1], 1 + v[2], ..., 1 + v[N]).
function f2{N}(::Type{Val{N}}, v::Vector{Int})
# The assignment is the last expression, so the tuple is also the return value.
q = ntuple(i -> 1 + v[i], Val{N})
end
We then get:
julia> @code_llvm f2(Val{4}, [1,2,3,4])
define void @julia_f2_50063([4 x i64]* sret, %jl_value_t*, %jl_value_t*) #0
{
top:
%3 = alloca [4 x i64], align 8
%4 = call %jl_value_t*** @jl_get_ptls_states()
%5 = alloca [5 x %jl_value_t*], align 8
%.sub = getelementptr inbounds [5 x %jl_value_t*], [5 x %jl_value_t*]*
%5, i64 0, i64 0
%6 = getelementptr [5 x %jl_value_t*], [5 x %jl_value_t*]* %5, i64 0, i64
2
store %jl_value_t* null, %jl_value_t** %6, align 8
%7 = getelementptr [5 x %jl_value_t*], [5 x %jl_value_t*]* %5, i64 0, i64
3
store %jl_value_t* null, %jl_value_t** %7, align 8
%8 = getelementptr [5 x %jl_value_t*], [5 x %jl_value_t*]* %5, i64 0, i64
4
store %jl_value_t* null, %jl_value_t** %8, align 8
%9 = bitcast [5 x %jl_value_t*]* %5 to i64*
store i64 6, i64* %9, align 8
%10 = getelementptr [5 x %jl_value_t*], [5 x %jl_value_t*]* %5, i64 0,
i64 1
%11 = bitcast %jl_value_t*** %4 to i64*
%12 = load i64, i64* %11, align 8
%13 = bitcast %jl_value_t** %10 to i64*
store i64 %12, i64* %13, align 8
store %jl_value_t** %.sub, %jl_value_t*** %4, align 8
%14 = call %jl_value_t* @jl_gc_alloc_1w()
store %jl_value_t* %14, %jl_value_t** %6, align 8
%15 = getelementptr inbounds %jl_value_t, %jl_value_t* %14, i64 -1, i32 0
store %jl_value_t* inttoptr (i64 140673762914912 to %jl_value_t*),
%jl_value_t** %15, align 8
store %jl_value_t* %2, %jl_value_t** %7, align 8
%16 = getelementptr inbounds %jl_value_t, %jl_value_t* %14, i64 0, i32 0
store %jl_value_t* %2, %jl_value_t** %16, align 8
store %jl_value_t* %14, %jl_value_t** %8, align 8
* call void @julia_ntuple_50064([4 x i64]* nonnull sret %3, %jl_value_t*
%14, %jl_value_t* inttoptr (i64 140673751236272 to %jl_value_t*)) #0*
* %.fca.0.gep = getelementptr inbounds [4 x i64], [4 x i64]* %3, i64 0,
i64 0*
* %.fca.0.load = load i64, i64* %.fca.0.gep, align 8*
* %.fca.0.insert = insertvalue [4 x i64] undef, i64 %.fca.0.load, 0*
* %.fca.1.gep = getelementptr inbounds [4 x i64], [4 x i64]* %3, i64 0,
i64 1*
* %.fca.1.load = load i64, i64* %.fca.1.gep, align 8*
* %.fca.1.insert = insertvalue [4 x i64] %.fca.0.insert, i64 %.fca.1.load,
1*
* %.fca.2.gep = getelementptr inbounds [4 x i64], [4 x i64]* %3, i64 0,
i64 2*
* %.fca.2.load = load i64, i64* %.fca.2.gep, align 8*
* %.fca.2.insert = insertvalue [4 x i64] %.fca.1.insert, i64 %.fca.2.load,
2*
* %.fca.3.gep = getelementptr inbounds [4 x i64], [4 x i64]* %3, i64 0,
i64 3*
* %.fca.3.load = load i64, i64* %.fca.3.gep, align 8*
* %.fca.3.insert = insertvalue [4 x i64] %.fca.2.insert, i64 %.fca.3.load,
3*
* store [4 x i64] %.fca.3.insert, [4 x i64]* %0, align 8*
* %17 = load i64, i64* %13, align 8*
* store i64 %17, i64* %11, align 8*
* ret void*
}
The bold text (the lines wrapped in asterisks) is very similar to the previous
function, but there is a large chunk of allocations and stores above it, as
well as a call to jl_get_ptls_states().
A generated function that does the same works well and does not allocate:
# Unrolled counterpart of the `ntuple`-based `f2`: for a given `Val{N}` it
# returns the literal tuple expression `(1 + v[1], 1 + v[2], ..., 1 + v[N])`,
# so no closure over `v` is constructed.
#
# Arguments:
#   ::Type{Val{N}}  -- tuple length, supplied in the type domain
#   v::Vector{Int}  -- vector whose first N elements are read
# Returns: an N-tuple whose i-th element is `1 + v[i]`.
@generated function f2{N}(::Type{Val{N}}, v::Vector{Int})
    # `N` is a plain integer while the generator runs, so the comprehension is
    # evaluated at code-generation time; only `$i` is spliced in as a literal.
    # Inside the quoted expression `v` refers to the runtime vector.
    # Each element is `1 + v[i]` (not `i + v[i]`) to match the ntuple version.
    return Expr(:tuple, [:(1 + v[$i]) for i = 1:N]...)
end
However, I am trying to avoid @generated by using ntuple instead. Does anyone
have experience with using ntuple like this without getting
allocations?
Thanks!
// Kristoffer