Hi,
I'm trying to figure out how Gandiva works by tracing unit test
TestSimpleArichmetic[1].
I met with a problem about Gandiva IR generator and optimizer, would like to
seek for
help from community.
I'm focusing on case "b+1", which adds 1 to each element of an int32 vector. I
see
there's a precompiled IR for simple int32+int32 scalar sum [2]. The vector sum
IR
code is generated at runtime in LLVMGenerator::CodeGenExprValue [3], which
injects IR
of loops and other controls. Then the IR goes optimization pass to generate
optimized
IR [4], and finally machine code and relocation.
For the above vector sum case, I think compiler should do loop vectorize.
According
optimization options aer enabled in Gandiva. But I didn't see vectorized code
in both
Gandiva IR and machine code. I'm wondering what's the problem.
Below is my debug logs. Pseudo C code [5] is the according C code Gandiva
generates IR.
I pasted links to clang-7 generated assembly and IR code for comparison. clang
generated
code has loop vectorization. I also dumped IR genereted by Gandiva, before and
after
optimization, and Gandiva generated assembly code. They don't have loop
vectorization.
====Pseudo C code [5]==========================================================
int expr_0_0(int64_t *addrs, int64_t *local_bitmaps,
int64_t execution_context_ptr, int64_t nrecords) {
int *outVec = (int *) addrs[5];
int *c0Vec = (int *) addrs[1];
int *c1Vec = (int *) addrs[3];
for (int loop_var = 0; loop_var < nrecords; ++loop_var) {
int c0 = c0Vec[loop_var];
int c1 = c1Vec[loop_var];
int out = c0 + c1;
outVec[loop_var] = out;
}
}
====clang-7.0.0 generated code(has loop vectorize)=============================
assembly code: https://godbolt.org/z/_bAzSs
IR code: https://pastebin.com/NGc8eCMq
====Gandiva generated IR(before optimize)======================================
Full IR: https://pastebin.com/YEKDCNVR
define internal i32 @add_int32_int32(i32, i32) local_unnamed_addr #0 {
%3 = add nsw i32 %1, %0
ret i32 %3
}
define i32 @expr_0_0(i64* %args, i64* %arg_addr_offsets, i64* %local_bitmaps,
i16* %selection_vector, i64 %context_ptr, i64 %nrecords) {
entry:
%"b+1_mem_addr" = getelementptr i64, i64* %args, i32 0
%"b+1_mem" = load i64, i64* %"b+1_mem_addr"
%"b+1_darray" = inttoptr i64 %"b+1_mem" to i32*
%"b+1_mem_addr1" = getelementptr i64, i64* %args, i32 2
%"b+1_mem2" = load i64, i64* %"b+1_mem_addr1"
%"b+1_buf_ptr" = inttoptr i64 %"b+1_mem2" to i8*
%"b+1_mem_addr3" = getelementptr i64, i64* %args, i32 -1
%"b+1_mem4" = load i64, i64* %"b+1_mem_addr3"
%"b+1_oarray" = inttoptr i64 %"b+1_mem4" to i32*
%b_mem_addr = getelementptr i64, i64* %args, i32 3
%b_mem = load i64, i64* %b_mem_addr
%b_darray = inttoptr i64 %b_mem to i32*
br label %loop
loop: ; preds = %loop, %entry
%loop_var = phi i64 [ 0, %entry ], [ %"loop_var+1", %loop ]
%b_offset_addr = getelementptr i64, i64* %arg_addr_offsets, i32 3
%b_addr = load i64, i64* %b_offset_addr
%0 = add i64 %loop_var, %b_addr
%1 = getelementptr i32, i32* %b_darray, i64 %0
%b = load i32, i32* %1
%add_int32_int32 = call i32 @add_int32_int32(i32 %b, i32 1)
%2 = getelementptr i32, i32* %"b+1_darray", i64 %loop_var
store i32 %add_int32_int32, i32* %2
%"loop_var+1" = add i64 %loop_var, 1
%"loop_var < nrec" = icmp slt i64 %"loop_var+1", %nrecords
br i1 %"loop_var < nrec", label %loop, label %exit
exit: ; preds = %loop
ret i32 0
}
====Gandiva generated IR(after optimize)=======================================
Full IR: https://pastebin.com/a0CRS0Ud
define i32 @expr_0_0(i64* nocapture readonly %args, i64* nocapture readonly
%arg_addr_offsets, i64* nocapture readnone %local_bitmaps, i16* nocapture
readnone %selection_vector, i64 %context_ptr, i64 %nrecords) local_unnamed_addr
#0 {
entry:
%0 = bitcast i64* %args to i32**
%"b+1_mem5" = load i32*, i32** %0, align 8
%b_mem_addr = getelementptr i64, i64* %args, i64 3
%1 = bitcast i64* %b_mem_addr to i32**
%b_mem6 = load i32*, i32** %1, align 8
%b_offset_addr = getelementptr i64, i64* %arg_addr_offsets, i64 3
br label %loop
loop: ; preds = %loop, %entry
%loop_var = phi i64 [ 0, %entry ], [ %"loop_var+1", %loop ]
%b_addr = load i64, i64* %b_offset_addr, align 8
%2 = add i64 %b_addr, %loop_var
%3 = getelementptr i32, i32* %b_mem6, i64 %2
%b = load i32, i32* %3, align 4
%4 = add nsw i32 %b, 1
%5 = getelementptr i32, i32* %"b+1_mem5", i64 %loop_var
store i32 %4, i32* %5, align 4
%"loop_var+1" = add nuw nsw i64 %loop_var, 1
%"loop_var < nrec" = icmp slt i64 %"loop_var+1", %nrecords
br i1 %"loop_var < nrec", label %loop, label %exit
exit: ; preds = %loop
ret i32 0
}
====Gandiva generated assembly code (no loop vectorize)========================
Dump of assembler code for function expr_0_0:
0x00007ffff7ff4000 <+0>: mov (%rdi),%rax
0x00007ffff7ff4003 <+3>: mov 0x18(%rdi),%rcx
0x00007ffff7ff4007 <+7>: xor %edx,%edx
0x00007ffff7ff4009 <+9>: nop
0x00007ffff7ff400a <+10>: nop
0x00007ffff7ff400b <+11>: nop
0x00007ffff7ff400c <+12>: nop
0x00007ffff7ff400d <+13>: nop
0x00007ffff7ff400e <+14>: nop
0x00007ffff7ff400f <+15>: nop
0x00007ffff7ff4010 <+16>: mov 0x18(%rsi),%rdi
0x00007ffff7ff4014 <+20>: add %rdx,%rdi
0x00007ffff7ff4017 <+23>: mov (%rcx,%rdi,4),%edi
0x00007ffff7ff401a <+26>: inc %edi
0x00007ffff7ff401c <+28>: mov %edi,(%rax,%rdx,4)
0x00007ffff7ff401f <+31>: inc %rdx
0x00007ffff7ff4022 <+34>: cmp %r9,%rdx
0x00007ffff7ff4025 <+37>: jl 0x7ffff7ff4010 <expr_0_0+16>
0x00007ffff7ff4027 <+39>: xor %eax,%eax
0x00007ffff7ff4029 <+41>: retq
[1]
https://github.com/apache/arrow/blob/master/cpp/src/gandiva/tests/literal_test.cc#L42
[2]
https://github.com/apache/arrow/blob/master/cpp/src/gandiva/precompiled/arithmetic_ops.cc#L65
[3]
https://github.com/apache/arrow/blob/master/cpp/src/gandiva/llvm_generator.cc#L247
[4]
https://github.com/apache/arrow/blob/master/cpp/src/gandiva/engine.cc#L207-L216
[5]
https://github.com/apache/arrow/blob/master/cpp/src/gandiva/llvm_generator.cc#L204-L215