Note: This looks long, but really just has a lot of LLVM IR!
I have recently been digging into the question of the best way to enclose
parameters with a function
<https://github.com/ChrisRackauckas/DifferentialEquations.jl/issues/41>.
This is an issue that comes up a lot with scientific codes, and so I was
hoping to try and get it right. However, the results of my experiments
aren't looking too good, and so I was hoping to find out whether I am
running into some bug or simply just not finding the optimal solution.
The example is as follows (with LLVM IR included to show exactly how
everything is compiling). Say the user wants me to do a bunch of things
with the function f(u,t)=α*u where α is some parameter. They don't
necessarily want to declare α as a constant since they may change it
around a bit, but every time this function is given to me, I can treat α
as a constant. If they were willing to treat it as a constant, then they
could take this function:
# Reference kernel: scales the state `u` by the parameter `α` (`t` is unused).
function k(u::Float64, t::Float64, α)
    return α * u
end
println("Standard k definition")
@code_llvm k(1.0, 2.0, 1.01)
#Result
define double @julia_k_70163(double, double, double) #0 {
top:
%3 = fmul double %0, %2
ret double %3
}
and enclose the constant:
# Enclose the parameter at top level in three different ways and print the
# LLVM IR generated for each closure.

# G: the parameter is written into the closure as a literal.
G = (u,t) -> k(u,t,1.01)
# G2: the parameter is read from the non-constant global α. The global is
# defined here so that the snippet is self-contained and G2 can be called
# without raising an UndefVarError.
α = 1.01
G2 = (u,t)->k(u,t,α)
println("Top level inlined k")
@code_llvm G(1.0,2.0)
println("Top level not inlined k")
@code_llvm G2(1.0,2.0)
# G3: the parameter is read from the constant global β.
const β = 1.01
G3 = (u,t)->k(u,t,β)
println("Top level not inlined but const k")
@code_llvm G3(1.0,2.0)
#Results
Top level inlined k
define double @"julia_#159_70165"(double, double) #0 {
top:
%2 = fmul double %0, 1.010000e+00
ret double %2
}
Top level not inlined k
define %jl_value_t* @"julia_#161_70167"(double, double) #0 {
top:
%thread_ptr = call i8* asm "movq %fs:0, $0", "=r"() #2
%ptls_i8 = getelementptr i8, i8* %thread_ptr, i64 -2672
%ptls = bitcast i8* %ptls_i8 to %jl_value_t***
%2 = alloca [5 x %jl_value_t*], align 8
%.sub = getelementptr inbounds [5 x %jl_value_t*], [5 x %jl_value_t*]*
%2, i64 0, i64 0
%3 = getelementptr [5 x %jl_value_t*], [5 x %jl_value_t*]* %2, i64 0, i64
2
%4 = bitcast %jl_value_t** %3 to i8*
call void @llvm.memset.p0i8.i32(i8* %4, i8 0, i32 24, i32 8, i1 false)
%5 = bitcast [5 x %jl_value_t*]* %2 to i64*
store i64 6, i64* %5, align 8
%6 = getelementptr [5 x %jl_value_t*], [5 x %jl_value_t*]* %2, i64 0, i64
1
%7 = bitcast i8* %ptls_i8 to i64*
%8 = load i64, i64* %7, align 8
%9 = bitcast %jl_value_t** %6 to i64*
store i64 %8, i64* %9, align 8
store %jl_value_t** %.sub, %jl_value_t*** %ptls, align 8
%10 = getelementptr [5 x %jl_value_t*], [5 x %jl_value_t*]* %2, i64 0,
i64 4
%11 = getelementptr [5 x %jl_value_t*], [5 x %jl_value_t*]* %2, i64 0,
i64 3
%12 = load i64, i64* inttoptr (i64 139896404414328 to i64*), align 8
%13 = bitcast %jl_value_t** %11 to i64*
store i64 %12, i64* %13, align 8
store %jl_value_t* inttoptr (i64 139896327403528 to %jl_value_t*),
%jl_value_t** %3, align 8
%14 = call %jl_value_t* @jl_gc_pool_alloc(i8* %ptls_i8, i32 1432, i32 16)
%15 = getelementptr inbounds %jl_value_t, %jl_value_t* %14, i64 -1, i32 0
store %jl_value_t* inttoptr (i64 139896322417392 to %jl_value_t*),
%jl_value_t** %15, align 8
%16 = bitcast %jl_value_t* %14 to double*
store double %0, double* %16, align 8
store %jl_value_t* %14, %jl_value_t** %10, align 8
%17 = call %jl_value_t* @jl_apply_generic(%jl_value_t** %3, i32 3)
%18 = load i64, i64* %9, align 8
store i64 %18, i64* %7, align 8
ret %jl_value_t* %17
}
Top level not inlined but const k
define double @"julia_#163_70169"(double, double) #0 {
top:
%2 = fmul double %0, 1.010000e+00
ret double %2
}
It's clear from these results that the user would have to treat the values
they enclose as constants. This is a known performance issue due to
non-constant global variables.
So let's say I will be doing the enclosing on my end. The parameters end
up inside my function, and I am willing to wrap that into another function
which holds all the parameters (this can be necessary for calling some C
libraries). However, I ran into issues finding any option which was
performant. Here are my attempts:
# Some helpers
# Immutable wrapper around a single numeric parameter α; the field type is
# given by the type parameter T.
immutable ParameterHolder{T<:Number}
    α::T
end
# Build and return a named inner function g(u, t) that forwards to
# f(u, t, α) with α captured from this call; u is restricted to Float64.
function translator(f, α)
    Base.@propagate_inbounds function g(u::Float64, t)
        return f(u, t, α)
    end
    g
end
# Same idea as wrapping f(u, t, α) in a two-argument function, but the
# wrapper is an anonymous function created under @inbounds; u is restricted
# to Float64.
function translator2(f, α)
    @inbounds g = function (u::Float64, t)
        return f(u, t, α)
    end
    g
end
# The "main" function the user will call
function code_test()
    # Prints the LLVM IR generated for several ways of enclosing the
    # parameter value 1.01 in a function that is built inside another
    # function, so that the variants can be compared side by side.
    local u::Float64 = 1.0

    # Parameter passed explicitly through a ParameterHolder instance.
    const pconst = ParameterHolder(1.01)
    p = ParameterHolder(1.01)
    f(u,t,p) = @inbounds return p.α*u
    println("Inside using ParameterHolder Constant")
    @code_llvm f(1.0,2.0,pconst)
    println("Inside using ParameterHolder")
    @code_llvm f(1.0,2.0,p)

    # Closures over a locally defined three-argument function h.
    h(u,t,α) = α*u
    α = 1.01
    g = (u,t) -> h(u,t,α)
    println("Inside using closure with variable")
    @code_llvm g(u,2.0)
    const β = 1.01
    l = (u,t) -> h(u,t,β)
    println("Inside using closure with const variable")
    @code_llvm l(u,2.0)

    # Closures over k, which is defined outside this function.
    m = (u,t) -> k(u,t,β)
    println("Inside using outside function closure with variable")
    @code_llvm m(u,2.0)
    # Same idea, with every argument type-asserted.
    J = (u::Float64,t::Float64) -> k(u::Float64,t::Float64,α::Float64)
    println("Inside using outside function closure with variable")
    @code_llvm J(u,2.0)

    # Parameter written into the wrapper as a literal.
    @inline J2(u::Float64,t::Float64) = k(u::Float64,t::Float64,1.01::Float64)
    println("Inside using closure inlined")
    @code_llvm J2(u::Float64,2.0)

    # Wrappers produced by translator (named inner function).
    J3 = translator(k,1.01)
    println("Inside using translator")
    @code_llvm J3(u::Float64,2.0)
    println(J3(u,2.0))
    # Wrap the locally defined three-argument h: the translator calls
    # f(u,t,α), so the two-argument closure g cannot be used here.
    J4 = translator(h,1.01)
    println("Inside using translator of inside")
    @code_llvm J4(u::Float64,2.0)

    # Wrappers produced by translator2 (anonymous function), matching the
    # "translator 2" labels printed below.
    J5 = translator2(k,1.01)
    println("Inside using translator 2")
    @code_llvm J5(u::Float64,2.0)
    println(J5(u,2.0))
    J6 = translator2(h,1.01)
    println("Inside using translator 2 of inside")
    @code_llvm J6(u::Float64,2.0)
end
code_test()
Let's walk through all of the results. If the user gave me the parameters
in an immutable ParameterHolder, I can get functions like:
Inside using ParameterHolder Constant
define double @julia_f_70172(double, double, %ParameterHolder*) #0 {
top:
%3 = getelementptr inbounds %ParameterHolder, %ParameterHolder* %2, i64
0, i32 0
%4 = load double, double* %3, align 8
%5 = fmul double %4, %0
ret double %5
}
Inside using ParameterHolder
define double @julia_f_70172(double, double, %ParameterHolder*) #0 {
top:
%3 = getelementptr inbounds %ParameterHolder, %ParameterHolder* %2, i64
0, i32 0
%4 = load double, double* %3, align 8
%5 = fmul double %4, %0
ret double %5
}
This has more steps than necessary, but is okay. The approaches using a
closure fare rather poorly. For some reason, even though u is type-stable,
it compiles functions for jl_value_t's:
Inside using closure
define %jl_value_t* @"julia_#146_70175"(%jl_value_t*, double, double) #0 {
top:
%thread_ptr = call i8* asm "movq %fs:0, $0", "=r"() #2
%ptls_i8 = getelementptr i8, i8* %thread_ptr, i64 -2672
%ptls = bitcast i8* %ptls_i8 to %jl_value_t***
%3 = alloca [10 x %jl_value_t*], align 8
%.sub = getelementptr inbounds [10 x %jl_value_t*], [10 x %jl_value_t*]*
%3, i64 0, i64 0
%4 = getelementptr [10 x %jl_value_t*], [10 x %jl_value_t*]* %3, i64 0,
i64 2
%5 = getelementptr [10 x %jl_value_t*], [10 x %jl_value_t*]* %3, i64 0,
i64 8
%6 = bitcast %jl_value_t** %4 to i8*
call void @llvm.memset.p0i8.i32(i8* %6, i8 0, i32 64, i32 8, i1 false)
%7 = bitcast [10 x %jl_value_t*]* %3 to i64*
store i64 16, i64* %7, align 8
%8 = getelementptr [10 x %jl_value_t*], [10 x %jl_value_t*]* %3, i64 0,
i64 1
%9 = bitcast i8* %ptls_i8 to i64*
%10 = load i64, i64* %9, align 8
%11 = bitcast %jl_value_t** %8 to i64*
store i64 %10, i64* %11, align 8
store %jl_value_t** %.sub, %jl_value_t*** %ptls, align 8
%12 = getelementptr [10 x %jl_value_t*], [10 x %jl_value_t*]* %3, i64 0,
i64 6
%13 = getelementptr [10 x %jl_value_t*], [10 x %jl_value_t*]* %3, i64 0,
i64 5
%14 = getelementptr [10 x %jl_value_t*], [10 x %jl_value_t*]* %3, i64 0,
i64 4
%15 = getelementptr [10 x %jl_value_t*], [10 x %jl_value_t*]* %3, i64 0,
i64 3
%16 = getelementptr [10 x %jl_value_t*], [10 x %jl_value_t*]* %3, i64 0,
i64 7
%17 = getelementptr [10 x %jl_value_t*], [10 x %jl_value_t*]* %3, i64 0,
i64 9
%18 = getelementptr inbounds %jl_value_t, %jl_value_t* %0, i64 1, i32 0
%19 = bitcast %jl_value_t** %18 to i64*
%20 = load i64, i64* %19, align 8
%21 = bitcast %jl_value_t** %5 to i64*
store i64 %20, i64* %21, align 8
store %jl_value_t* inttoptr (i64 139896320318056 to %jl_value_t*),
%jl_value_t** %17, align 8
%22 = call %jl_value_t* @jl_f_getfield(%jl_value_t* null, %jl_value_t**
%5, i32 2)
store %jl_value_t* %22, %jl_value_t** %4, align 8
%23 = bitcast %jl_value_t* %0 to i64*
%24 = load i64, i64* %23, align 8
%25 = bitcast %jl_value_t** %12 to i64*
store i64 %24, i64* %25, align 8
store %jl_value_t* inttoptr (i64 139896320318056 to %jl_value_t*),
%jl_value_t** %16, align 8
%26 = call %jl_value_t* @jl_f_getfield(%jl_value_t* null, %jl_value_t**
%12, i32 2)
store %jl_value_t* %26, %jl_value_t** %13, align 8
%27 = call %jl_value_t* @jl_gc_pool_alloc(i8* %ptls_i8, i32 1432, i32 16)
%28 = getelementptr inbounds %jl_value_t, %jl_value_t* %27, i64 -1, i32 0
store %jl_value_t* inttoptr (i64 139896322417392 to %jl_value_t*),
%jl_value_t** %28, align 8
%29 = bitcast %jl_value_t* %27 to double*
store double %1, double* %29, align 8
store %jl_value_t* %27, %jl_value_t** %15, align 8
%30 = call %jl_value_t* @jl_gc_pool_alloc(i8* %ptls_i8, i32 1432, i32 16)
%31 = getelementptr inbounds %jl_value_t, %jl_value_t* %30, i64 -1, i32 0
store %jl_value_t* inttoptr (i64 139896322417392 to %jl_value_t*),
%jl_value_t** %31, align 8
%32 = bitcast %jl_value_t* %30 to double*
store double %2, double* %32, align 8
store %jl_value_t* %30, %jl_value_t** %14, align 8
%33 = call %jl_value_t* @jl_apply_generic(%jl_value_t** %4, i32 4)
%34 = load i64, i64* %11, align 8
store i64 %34, i64* %9, align 8
ret %jl_value_t* %33
}
Inside using closure with variable
define %jl_value_t* @"julia_#147_70177"(%jl_value_t*, double, double) #0 {
top:
%thread_ptr = call i8* asm "movq %fs:0, $0", "=r"() #2
%ptls_i8 = getelementptr i8, i8* %thread_ptr, i64 -2672
%ptls = bitcast i8* %ptls_i8 to %jl_value_t***
%3 = alloca [10 x %jl_value_t*], align 8
%.sub = getelementptr inbounds [10 x %jl_value_t*], [10 x %jl_value_t*]*
%3, i64 0, i64 0
%4 = getelementptr [10 x %jl_value_t*], [10 x %jl_value_t*]* %3, i64 0,
i64 2
%5 = getelementptr [10 x %jl_value_t*], [10 x %jl_value_t*]* %3, i64 0,
i64 8
%6 = bitcast %jl_value_t** %4 to i8*
call void @llvm.memset.p0i8.i32(i8* %6, i8 0, i32 64, i32 8, i1 false)
%7 = bitcast [10 x %jl_value_t*]* %3 to i64*
store i64 16, i64* %7, align 8
%8 = getelementptr [10 x %jl_value_t*], [10 x %jl_value_t*]* %3, i64 0,
i64 1
%9 = bitcast i8* %ptls_i8 to i64*
%10 = load i64, i64* %9, align 8
%11 = bitcast %jl_value_t** %8 to i64*
store i64 %10, i64* %11, align 8
store %jl_value_t** %.sub, %jl_value_t*** %ptls, align 8
%12 = getelementptr [10 x %jl_value_t*], [10 x %jl_value_t*]* %3, i64 0,
i64 6
%13 = getelementptr [10 x %jl_value_t*], [10 x %jl_value_t*]* %3, i64 0,
i64 5
%14 = getelementptr [10 x %jl_value_t*], [10 x %jl_value_t*]* %3, i64 0,
i64 4
%15 = getelementptr [10 x %jl_value_t*], [10 x %jl_value_t*]* %3, i64 0,
i64 3
%16 = getelementptr [10 x %jl_value_t*], [10 x %jl_value_t*]* %3, i64 0,
i64 7
%17 = getelementptr [10 x %jl_value_t*], [10 x %jl_value_t*]* %3, i64 0,
i64 9
%18 = bitcast %jl_value_t* %0 to i64*
%19 = load i64, i64* %18, align 8
%20 = bitcast %jl_value_t** %5 to i64*
store i64 %19, i64* %20, align 8
store %jl_value_t* inttoptr (i64 139896320318056 to %jl_value_t*),
%jl_value_t** %17, align 8
%21 = call %jl_value_t* @jl_f_getfield(%jl_value_t* null, %jl_value_t**
%5, i32 2)
store %jl_value_t* %21, %jl_value_t** %4, align 8
%22 = getelementptr inbounds %jl_value_t, %jl_value_t* %0, i64 1, i32 0
%23 = bitcast %jl_value_t** %22 to i64*
%24 = load i64, i64* %23, align 8
%25 = bitcast %jl_value_t** %12 to i64*
store i64 %24, i64* %25, align 8
store %jl_value_t* inttoptr (i64 139896320318056 to %jl_value_t*),
%jl_value_t** %16, align 8
%26 = call %jl_value_t* @jl_f_getfield(%jl_value_t* null, %jl_value_t**
%12, i32 2)
store %jl_value_t* %26, %jl_value_t** %13, align 8
%27 = call %jl_value_t* @jl_gc_pool_alloc(i8* %ptls_i8, i32 1432, i32 16)
%28 = getelementptr inbounds %jl_value_t, %jl_value_t* %27, i64 -1, i32 0
store %jl_value_t* inttoptr (i64 139896322417392 to %jl_value_t*),
%jl_value_t** %28, align 8
%29 = bitcast %jl_value_t* %27 to double*
store double %1, double* %29, align 8
store %jl_value_t* %27, %jl_value_t** %15, align 8
%30 = call %jl_value_t* @jl_gc_pool_alloc(i8* %ptls_i8, i32 1432, i32 16)
%31 = getelementptr inbounds %jl_value_t, %jl_value_t* %30, i64 -1, i32 0
store %jl_value_t* inttoptr (i64 139896322417392 to %jl_value_t*),
%jl_value_t** %31, align 8
%32 = bitcast %jl_value_t* %30 to double*
store double %2, double* %32, align 8
store %jl_value_t* %30, %jl_value_t** %14, align 8
%33 = call %jl_value_t* @jl_apply_generic(%jl_value_t** %4, i32 4)
%34 = load i64, i64* %11, align 8
store i64 %34, i64* %9, align 8
ret %jl_value_t* %33
}
Inside using closure with const variable
define %jl_value_t* @"julia_#148_70179"(%jl_value_t*, double, double) #0 {
top:
%thread_ptr = call i8* asm "movq %fs:0, $0", "=r"() #2
%ptls_i8 = getelementptr i8, i8* %thread_ptr, i64 -2672
%ptls = bitcast i8* %ptls_i8 to %jl_value_t***
%3 = alloca [10 x %jl_value_t*], align 8
%.sub = getelementptr inbounds [10 x %jl_value_t*], [10 x %jl_value_t*]*
%3, i64 0, i64 0
%4 = getelementptr [10 x %jl_value_t*], [10 x %jl_value_t*]* %3, i64 0,
i64 2
%5 = getelementptr [10 x %jl_value_t*], [10 x %jl_value_t*]* %3, i64 0,
i64 8
%6 = bitcast %jl_value_t** %4 to i8*
call void @llvm.memset.p0i8.i32(i8* %6, i8 0, i32 64, i32 8, i1 false)
%7 = bitcast [10 x %jl_value_t*]* %3 to i64*
store i64 16, i64* %7, align 8
%8 = getelementptr [10 x %jl_value_t*], [10 x %jl_value_t*]* %3, i64 0,
i64 1
%9 = bitcast i8* %ptls_i8 to i64*
%10 = load i64, i64* %9, align 8
%11 = bitcast %jl_value_t** %8 to i64*
store i64 %10, i64* %11, align 8
store %jl_value_t** %.sub, %jl_value_t*** %ptls, align 8
%12 = getelementptr [10 x %jl_value_t*], [10 x %jl_value_t*]* %3, i64 0,
i64 6
%13 = getelementptr [10 x %jl_value_t*], [10 x %jl_value_t*]* %3, i64 0,
i64 5
%14 = getelementptr [10 x %jl_value_t*], [10 x %jl_value_t*]* %3, i64 0,
i64 4
%15 = getelementptr [10 x %jl_value_t*], [10 x %jl_value_t*]* %3, i64 0,
i64 3
%16 = getelementptr [10 x %jl_value_t*], [10 x %jl_value_t*]* %3, i64 0,
i64 7
%17 = getelementptr [10 x %jl_value_t*], [10 x %jl_value_t*]* %3, i64 0,
i64 9
%18 = getelementptr inbounds %jl_value_t, %jl_value_t* %0, i64 1, i32 0
%19 = bitcast %jl_value_t** %18 to i64*
%20 = load i64, i64* %19, align 8
%21 = bitcast %jl_value_t** %5 to i64*
store i64 %20, i64* %21, align 8
store %jl_value_t* inttoptr (i64 139896320318056 to %jl_value_t*),
%jl_value_t** %17, align 8
%22 = call %jl_value_t* @jl_f_getfield(%jl_value_t* null, %jl_value_t**
%5, i32 2)
store %jl_value_t* %22, %jl_value_t** %4, align 8
%23 = bitcast %jl_value_t* %0 to i64*
%24 = load i64, i64* %23, align 8
%25 = bitcast %jl_value_t** %12 to i64*
store i64 %24, i64* %25, align 8
store %jl_value_t* inttoptr (i64 139896320318056 to %jl_value_t*),
%jl_value_t** %16, align 8
%26 = call %jl_value_t* @jl_f_getfield(%jl_value_t* null, %jl_value_t**
%12, i32 2)
store %jl_value_t* %26, %jl_value_t** %13, align 8
%27 = call %jl_value_t* @jl_gc_pool_alloc(i8* %ptls_i8, i32 1432, i32 16)
%28 = getelementptr inbounds %jl_value_t, %jl_value_t* %27, i64 -1, i32 0
store %jl_value_t* inttoptr (i64 139896322417392 to %jl_value_t*),
%jl_value_t** %28, align 8
%29 = bitcast %jl_value_t* %27 to double*
store double %1, double* %29, align 8
store %jl_value_t* %27, %jl_value_t** %15, align 8
%30 = call %jl_value_t* @jl_gc_pool_alloc(i8* %ptls_i8, i32 1432, i32 16)
%31 = getelementptr inbounds %jl_value_t, %jl_value_t* %30, i64 -1, i32 0
store %jl_value_t* inttoptr (i64 139896322417392 to %jl_value_t*),
%jl_value_t** %31, align 8
%32 = bitcast %jl_value_t* %30 to double*
store double %2, double* %32, align 8
store %jl_value_t* %30, %jl_value_t** %14, align 8
%33 = call %jl_value_t* @jl_apply_generic(%jl_value_t** %4, i32 4)
%34 = load i64, i64* %11, align 8
store i64 %34, i64* %9, align 8
ret %jl_value_t* %33
}
Inside using outside function closure with variable
define %jl_value_t* @"julia_#149_70181"(%jl_value_t*, double, double) #0 {
top:
%thread_ptr = call i8* asm "movq %fs:0, $0", "=r"() #2
%ptls_i8 = getelementptr i8, i8* %thread_ptr, i64 -2672
%ptls = bitcast i8* %ptls_i8 to %jl_value_t***
%3 = alloca [7 x %jl_value_t*], align 8
%.sub = getelementptr inbounds [7 x %jl_value_t*], [7 x %jl_value_t*]*
%3, i64 0, i64 0
%4 = getelementptr [7 x %jl_value_t*], [7 x %jl_value_t*]* %3, i64 0, i64
2
%5 = getelementptr [7 x %jl_value_t*], [7 x %jl_value_t*]* %3, i64 0, i64
5
%6 = bitcast %jl_value_t** %4 to i8*
call void @llvm.memset.p0i8.i32(i8* %6, i8 0, i32 40, i32 8, i1 false)
%7 = bitcast [7 x %jl_value_t*]* %3 to i64*
store i64 10, i64* %7, align 8
%8 = getelementptr [7 x %jl_value_t*], [7 x %jl_value_t*]* %3, i64 0, i64
1
%9 = bitcast i8* %ptls_i8 to i64*
%10 = load i64, i64* %9, align 8
%11 = bitcast %jl_value_t** %8 to i64*
store i64 %10, i64* %11, align 8
store %jl_value_t** %.sub, %jl_value_t*** %ptls, align 8
%12 = getelementptr [7 x %jl_value_t*], [7 x %jl_value_t*]* %3, i64 0,
i64 4
%13 = getelementptr [7 x %jl_value_t*], [7 x %jl_value_t*]* %3, i64 0,
i64 3
%14 = getelementptr [7 x %jl_value_t*], [7 x %jl_value_t*]* %3, i64 0,
i64 6
%15 = bitcast %jl_value_t* %0 to i64*
%16 = load i64, i64* %15, align 8
%17 = bitcast %jl_value_t** %5 to i64*
store i64 %16, i64* %17, align 8
store %jl_value_t* inttoptr (i64 139896320318056 to %jl_value_t*),
%jl_value_t** %14, align 8
%18 = call %jl_value_t* @jl_f_getfield(%jl_value_t* null, %jl_value_t**
%5, i32 2)
store %jl_value_t* %18, %jl_value_t** %13, align 8
store %jl_value_t* inttoptr (i64 139896327403528 to %jl_value_t*),
%jl_value_t** %4, align 8
%19 = call %jl_value_t* @jl_gc_pool_alloc(i8* %ptls_i8, i32 1432, i32 16)
%20 = getelementptr inbounds %jl_value_t, %jl_value_t* %19, i64 -1, i32 0
store %jl_value_t* inttoptr (i64 139896322417392 to %jl_value_t*),
%jl_value_t** %20, align 8
%21 = bitcast %jl_value_t* %19 to double*
store double %1, double* %21, align 8
store %jl_value_t* %19, %jl_value_t** %12, align 8
%22 = call %jl_value_t* @jl_apply_generic(%jl_value_t** %4, i32 3)
%23 = load i64, i64* %11, align 8
store i64 %23, i64* %9, align 8
ret %jl_value_t* %22
}
Inside using outside function closure with variable
define double @"julia_#150_70183"(%jl_value_t*, double, double) #0 {
top:
%thread_ptr = call i8* asm "movq %fs:0, $0", "=r"() #3
%ptls_i8 = getelementptr i8, i8* %thread_ptr, i64 -2672
%ptls = bitcast i8* %ptls_i8 to %jl_value_t***
%3 = alloca [5 x %jl_value_t*], align 8
%.sub = getelementptr inbounds [5 x %jl_value_t*], [5 x %jl_value_t*]*
%3, i64 0, i64 0
%4 = getelementptr [5 x %jl_value_t*], [5 x %jl_value_t*]* %3, i64 0, i64
3
%5 = getelementptr [5 x %jl_value_t*], [5 x %jl_value_t*]* %3, i64 0, i64
2
%6 = bitcast %jl_value_t** %4 to i8*
call void @llvm.memset.p0i8.i32(i8* %6, i8 0, i32 16, i32 8, i1 false)
%7 = bitcast [5 x %jl_value_t*]* %3 to i64*
store i64 6, i64* %7, align 8
%8 = getelementptr [5 x %jl_value_t*], [5 x %jl_value_t*]* %3, i64 0, i64
1
%9 = bitcast i8* %ptls_i8 to i64*
%10 = load i64, i64* %9, align 8
%11 = bitcast %jl_value_t** %8 to i64*
store i64 %10, i64* %11, align 8
store %jl_value_t** %.sub, %jl_value_t*** %ptls, align 8
store %jl_value_t* null, %jl_value_t** %5, align 8
%12 = getelementptr [5 x %jl_value_t*], [5 x %jl_value_t*]* %3, i64 0,
i64 4
%13 = bitcast %jl_value_t* %0 to i64*
%14 = load i64, i64* %13, align 8
%15 = bitcast %jl_value_t** %4 to i64*
store i64 %14, i64* %15, align 8
store %jl_value_t* inttoptr (i64 139896320318056 to %jl_value_t*),
%jl_value_t** %12, align 8
%16 = call %jl_value_t* @jl_f_getfield(%jl_value_t* null, %jl_value_t**
%4, i32 2)
store %jl_value_t* %16, %jl_value_t** %5, align 8
%17 = getelementptr inbounds %jl_value_t, %jl_value_t* %16, i64 -1, i32 0
%18 = bitcast %jl_value_t** %17 to i64*
%19 = load i64, i64* %18, align 8
%20 = and i64 %19, -16
%21 = inttoptr i64 %20 to %jl_value_t*
%22 = icmp eq %jl_value_t* %21, inttoptr (i64 139896322417392 to
%jl_value_t*)
br i1 %22, label %pass, label %fail
fail: ; preds = %top
call void @jl_type_error_rt(i8* inttoptr (i64 116848560 to i8*), i8*
inttoptr (i64 64818736 to i8*), %jl_value_t* inttoptr (i64 139896322417392
to %jl_value_t*), %jl_value_t* %16)
unreachable
pass: ; preds = %top
%23 = bitcast %jl_value_t* %16 to double*
%24 = load double, double* %23, align 16
%25 = fmul double %24, %1
%26 = load i64, i64* %11, align 8
store i64 %26, i64* %9, align 8
ret double %25
}
The only way to fix this is to manually inline the number as in J2:
Inside using closure inlined
define double @julia_J2_70185(double, double) #0 {
top:
%2 = fmul double %0, 1.010000e+00
ret double %2
}
Note that even @inline failed to generate suitable code. What's interesting
is that using the translator function tended to work okay. But the results
show that this trick is only good for externally defined functions:
Inside using translator
define double @julia_g_70187(%"#g#143"*, double, double) #0 {
top:
%3 = getelementptr inbounds %"#g#143", %"#g#143"* %0, i64 0, i32 1
%4 = load double, double* %3, align 8
%5 = fmul double %4, %1
ret double %5
}
1.01
Inside using translator of inside
define %jl_value_t* @julia_g_70316(%jl_value_t*, double, double) #0 {
top:
%thread_ptr = call i8* asm "movq %fs:0, $0", "=r"() #2
%ptls_i8 = getelementptr i8, i8* %thread_ptr, i64 -2672
%ptls = bitcast i8* %ptls_i8 to %jl_value_t***
%3 = alloca [6 x %jl_value_t*], align 8
%.sub = getelementptr inbounds [6 x %jl_value_t*], [6 x %jl_value_t*]*
%3, i64 0, i64 0
%4 = getelementptr [6 x %jl_value_t*], [6 x %jl_value_t*]* %3, i64 0, i64
2
%5 = bitcast %jl_value_t** %4 to i8*
call void @llvm.memset.p0i8.i32(i8* %5, i8 0, i32 32, i32 8, i1 false)
%6 = bitcast [6 x %jl_value_t*]* %3 to i64*
store i64 8, i64* %6, align 8
%7 = getelementptr [6 x %jl_value_t*], [6 x %jl_value_t*]* %3, i64 0, i64
1
%8 = bitcast i8* %ptls_i8 to i64*
%9 = load i64, i64* %8, align 8
%10 = bitcast %jl_value_t** %7 to i64*
store i64 %9, i64* %10, align 8
store %jl_value_t** %.sub, %jl_value_t*** %ptls, align 8
%11 = getelementptr [6 x %jl_value_t*], [6 x %jl_value_t*]* %3, i64 0,
i64 5
%12 = getelementptr [6 x %jl_value_t*], [6 x %jl_value_t*]* %3, i64 0,
i64 4
%13 = getelementptr [6 x %jl_value_t*], [6 x %jl_value_t*]* %3, i64 0,
i64 3
%14 = bitcast %jl_value_t* %0 to i64*
%15 = load i64, i64* %14, align 8
%16 = getelementptr %jl_value_t, %jl_value_t* %0, i64 1
%17 = bitcast %jl_value_t* %16 to i64*
%18 = load i64, i64* %17, align 8
%19 = bitcast %jl_value_t** %4 to i64*
store i64 %15, i64* %19, align 8
%20 = call %jl_value_t* @jl_gc_pool_alloc(i8* %ptls_i8, i32 1432, i32 16)
%21 = getelementptr inbounds %jl_value_t, %jl_value_t* %20, i64 -1, i32 0
store %jl_value_t* inttoptr (i64 139896322417392 to %jl_value_t*),
%jl_value_t** %21, align 8
%22 = bitcast %jl_value_t* %20 to double*
store double %1, double* %22, align 8
store %jl_value_t* %20, %jl_value_t** %13, align 8
%23 = call %jl_value_t* @jl_gc_pool_alloc(i8* %ptls_i8, i32 1432, i32 16)
%24 = getelementptr inbounds %jl_value_t, %jl_value_t* %23, i64 -1, i32 0
store %jl_value_t* inttoptr (i64 139896322417392 to %jl_value_t*),
%jl_value_t** %24, align 8
%25 = bitcast %jl_value_t* %23 to double*
store double %2, double* %25, align 8
store %jl_value_t* %23, %jl_value_t** %12, align 8
%26 = call %jl_value_t* @jl_gc_pool_alloc(i8* %ptls_i8, i32 1432, i32 16)
%27 = getelementptr inbounds %jl_value_t, %jl_value_t* %26, i64 -1, i32 0
store %jl_value_t* inttoptr (i64 139896322417392 to %jl_value_t*),
%jl_value_t** %27, align 8
%28 = bitcast %jl_value_t* %26 to i64*
store i64 %18, i64* %28, align 8
store %jl_value_t* %26, %jl_value_t** %11, align 8
%29 = call %jl_value_t* @jl_apply_generic(%jl_value_t** %4, i32 4)
%30 = load i64, i64* %10, align 8
store i64 %30, i64* %8, align 8
ret %jl_value_t* %29
}
Inside using translator 2
define double @julia_g_70187(%"#g#143"*, double, double) #0 {
top:
%3 = getelementptr inbounds %"#g#143", %"#g#143"* %0, i64 0, i32 1
%4 = load double, double* %3, align 8
%5 = fmul double %4, %1
ret double %5
}
1.01
Inside using translator 2 of inside
define %jl_value_t* @julia_g_70316(%jl_value_t*, double, double) #0 {
top:
%thread_ptr = call i8* asm "movq %fs:0, $0", "=r"() #2
%ptls_i8 = getelementptr i8, i8* %thread_ptr, i64 -2672
%ptls = bitcast i8* %ptls_i8 to %jl_value_t***
%3 = alloca [6 x %jl_value_t*], align 8
%.sub = getelementptr inbounds [6 x %jl_value_t*], [6 x %jl_value_t*]*
%3, i64 0, i64 0
%4 = getelementptr [6 x %jl_value_t*], [6 x %jl_value_t*]* %3, i64 0, i64
2
%5 = bitcast %jl_value_t** %4 to i8*
call void @llvm.memset.p0i8.i32(i8* %5, i8 0, i32 32, i32 8, i1 false)
%6 = bitcast [6 x %jl_value_t*]* %3 to i64*
store i64 8, i64* %6, align 8
%7 = getelementptr [6 x %jl_value_t*], [6 x %jl_value_t*]* %3, i64 0, i64
1
%8 = bitcast i8* %ptls_i8 to i64*
%9 = load i64, i64* %8, align 8
%10 = bitcast %jl_value_t** %7 to i64*
store i64 %9, i64* %10, align 8
store %jl_value_t** %.sub, %jl_value_t*** %ptls, align 8
%11 = getelementptr [6 x %jl_value_t*], [6 x %jl_value_t*]* %3, i64 0,
i64 5
%12 = getelementptr [6 x %jl_value_t*], [6 x %jl_value_t*]* %3, i64 0,
i64 4
%13 = getelementptr [6 x %jl_value_t*], [6 x %jl_value_t*]* %3, i64 0,
i64 3
%14 = bitcast %jl_value_t* %0 to i64*
%15 = load i64, i64* %14, align 8
%16 = getelementptr %jl_value_t, %jl_value_t* %0, i64 1
%17 = bitcast %jl_value_t* %16 to i64*
%18 = load i64, i64* %17, align 8
%19 = bitcast %jl_value_t** %4 to i64*
store i64 %15, i64* %19, align 8
%20 = call %jl_value_t* @jl_gc_pool_alloc(i8* %ptls_i8, i32 1432, i32 16)
%21 = getelementptr inbounds %jl_value_t, %jl_value_t* %20, i64 -1, i32 0
store %jl_value_t* inttoptr (i64 139896322417392 to %jl_value_t*),
%jl_value_t** %21, align 8
%22 = bitcast %jl_value_t* %20 to double*
store double %1, double* %22, align 8
store %jl_value_t* %20, %jl_value_t** %13, align 8
%23 = call %jl_value_t* @jl_gc_pool_alloc(i8* %ptls_i8, i32 1432, i32 16)
%24 = getelementptr inbounds %jl_value_t, %jl_value_t* %23, i64 -1, i32 0
store %jl_value_t* inttoptr (i64 139896322417392 to %jl_value_t*),
%jl_value_t** %24, align 8
%25 = bitcast %jl_value_t* %23 to double*
store double %2, double* %25, align 8
store %jl_value_t* %23, %jl_value_t** %12, align 8
%26 = call %jl_value_t* @jl_gc_pool_alloc(i8* %ptls_i8, i32 1432, i32 16)
%27 = getelementptr inbounds %jl_value_t, %jl_value_t* %26, i64 -1, i32 0
store %jl_value_t* inttoptr (i64 139896322417392 to %jl_value_t*),
%jl_value_t** %27, align 8
%28 = bitcast %jl_value_t* %26 to i64*
store i64 %18, i64* %28, align 8
store %jl_value_t* %26, %jl_value_t** %11, align 8
%29 = call %jl_value_t* @jl_apply_generic(%jl_value_t** %4, i32 4)
%30 = load i64, i64* %10, align 8
store i64 %30, i64* %8, align 8
ret %jl_value_t* %29
}
So in the end, I couldn't find a way within a function to enclose the
parameter α and compile a function which actually treats α as a constant
and optimizes it all the way. The ParameterHolder and translator
results are getting pretty close, but I can't seem to get rid of the bounds
checking.
Does anyone else have a better solution? Or is this supposed to "act nicer"
by default?