[26/50] [abbrv] incubator-impala git commit: IMPALA-1430, IMPALA-4108: codegen all builtin aggregate functions

tarmstrong Thu, 17 Nov 2016 08:09:56 -0800

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/d7246d64/be/src/exec/partitioned-aggregation-node.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/partitioned-aggregation-node.cc 
b/be/src/exec/partitioned-aggregation-node.cc
index 17fbb6c..56db26f 100644
--- a/be/src/exec/partitioned-aggregation-node.cc
+++ b/be/src/exec/partitioned-aggregation-node.cc
@@ -17,21 +17,20 @@
 
 #include "exec/partitioned-aggregation-node.h"
 
-#include <algorithm>
 #include <math.h>
+#include <algorithm>
 #include <set>
 #include <sstream>
-#include <gutil/strings/substitute.h>
-#include <thrift/protocol/TDebugProtocol.h>
 
 #include "codegen/codegen-anyval.h"
 #include "codegen/llvm-codegen.h"
 #include "exec/hash-table.inline.h"
 #include "exprs/agg-fn-evaluator.h"
 #include "exprs/anyval-util.h"
-#include "exprs/expr.h"
 #include "exprs/expr-context.h"
+#include "exprs/expr.h"
 #include "exprs/slot-ref.h"
+#include "gutil/strings/substitute.h"
 #include "runtime/buffered-tuple-stream.inline.h"
 #include "runtime/descriptors.h"
 #include "runtime/mem-pool.h"
@@ -40,8 +39,8 @@
 #include "runtime/row-batch.h"
 #include "runtime/runtime-state.h"
 #include "runtime/string-value.inline.h"
-#include "runtime/tuple.h"
 #include "runtime/tuple-row.h"
+#include "runtime/tuple.h"
 #include "udf/udf-internal.h"
 #include "util/debug-util.h"
 #include "util/runtime-profile-counters.h"
@@ -150,21 +149,20 @@ Status PartitionedAggregationNode::Init(const TPlanNode& 
tnode, RuntimeState* st
       Expr::CreateExprTrees(pool_, tnode.agg_node.grouping_exprs, 
&grouping_expr_ctxs_));
   for (int i = 0; i < tnode.agg_node.aggregate_functions.size(); ++i) {
     AggFnEvaluator* evaluator;
-    RETURN_IF_ERROR(AggFnEvaluator::Create(
-        pool_, tnode.agg_node.aggregate_functions[i], &evaluator));
+    RETURN_IF_ERROR(
+        AggFnEvaluator::Create(pool_, tnode.agg_node.aggregate_functions[i], 
&evaluator));
     aggregate_evaluators_.push_back(evaluator);
-    ExprContext* agg_expr_ctx;
-    if (evaluator->input_expr_ctxs().size() == 1) {
-      agg_expr_ctx = evaluator->input_expr_ctxs()[0];
+    ExprContext* const* agg_expr_ctxs;
+    if (evaluator->input_expr_ctxs().size() > 0) {
+      agg_expr_ctxs = evaluator->input_expr_ctxs().data();
     } else {
-      // CodegenUpdateSlot() can only support aggregate operator with only one 
ExprContext
-      // so it doesn't support operator such as group_concat. There are also 
aggregate
-      // operators with no ExprContext (e.g. count(*)). In cases above, 
'agg_expr_ctxs_'
-      // will contain NULL for that entry.
+      // Some aggregate functions have no input expressions and therefore no 
ExprContext
+      // (e.g. count(*)). In those cases, 'agg_expr_ctxs_' will contain NULL 
for that
+      // entry.
       DCHECK(evaluator->agg_op() == AggFnEvaluator::OTHER || 
evaluator->is_count_star());
-      agg_expr_ctx = NULL;
+      agg_expr_ctxs = NULL;
     }
-    agg_expr_ctxs_.push_back(agg_expr_ctx);
+    agg_expr_ctxs_.push_back(agg_expr_ctxs);
   }
   return Status::OK();
 }
@@ -1059,32 +1057,31 @@ void PartitionedAggregationNode::InitAggSlots(
       intermediate_tuple_desc_->slots().begin() + grouping_expr_ctxs_.size();
   for (int i = 0; i < aggregate_evaluators_.size(); ++i, ++slot_desc) {
     AggFnEvaluator* evaluator = aggregate_evaluators_[i];
+    // To minimize branching on the UpdateTuple path, initialize the result 
value so that
+    // the Add() UDA function can ignore the NULL bit of its destination 
value. E.g. for
+    // SUM(), if we initialize the destination value to 0 (with the NULL bit 
set), we can
+    // just start adding to the destination value (rather than repeatedly 
checking the
+    // destination NULL bit. The codegen'd version of UpdateSlot() exploits 
this to
+    // eliminate a branch per value.
+    //
+    // For boolean and numeric types, the default values are false/0, so the 
nullable
+    // aggregate functions SUM() and AVG() produce the correct result. For 
MIN()/MAX(),
+    // initialize the value to max/min possible value for the same effect.
     evaluator->Init(agg_fn_ctxs[i], intermediate_tuple);
-    // Codegen specific path for min/max.
-    // To minimize branching on the UpdateTuple path, initialize the result 
value
-    // so that UpdateTuple doesn't have to check if the aggregation
-    // dst slot is null.
-    // TODO: remove when we don't use the irbuilder for codegen here.  This 
optimization
-    // will no longer be necessary when all aggregates are implemented with 
the UDA
-    // interface.
-    if ((*slot_desc)->type().type != TYPE_STRING &&
-        (*slot_desc)->type().type != TYPE_VARCHAR &&
-        (*slot_desc)->type().type != TYPE_TIMESTAMP &&
-        (*slot_desc)->type().type != TYPE_CHAR) {
+
+    AggFnEvaluator::AggregationOp agg_op = evaluator->agg_op();
+    if ((agg_op == AggFnEvaluator::MIN || agg_op == AggFnEvaluator::MAX)
+        && !evaluator->intermediate_type().IsStringType()
+        && !evaluator->intermediate_type().IsTimestampType()) {
       ExprValue default_value;
       void* default_value_ptr = NULL;
-      switch (evaluator->agg_op()) {
-        case AggFnEvaluator::MIN:
-          default_value_ptr = default_value.SetToMax((*slot_desc)->type());
-          RawValue::Write(default_value_ptr, intermediate_tuple, *slot_desc, 
NULL);
-          break;
-        case AggFnEvaluator::MAX:
-          default_value_ptr = default_value.SetToMin((*slot_desc)->type());
-          RawValue::Write(default_value_ptr, intermediate_tuple, *slot_desc, 
NULL);
-          break;
-        default:
-          break;
+      if (evaluator->agg_op() == AggFnEvaluator::MIN) {
+        default_value_ptr = default_value.SetToMax((*slot_desc)->type());
+      } else {
+        DCHECK_EQ(evaluator->agg_op(), AggFnEvaluator::MAX);
+        default_value_ptr = default_value.SetToMin((*slot_desc)->type());
       }
+      RawValue::Write(default_value_ptr, intermediate_tuple, *slot_desc, NULL);
     }
   }
 }
@@ -1450,111 +1447,123 @@ Status 
PartitionedAggregationNode::QueryMaintenance(RuntimeState* state) {
 // void UpdateSlot(FunctionContext* agg_fn_ctx, ExprContext* agg_expr_ctx,
 //     AggTuple* agg_tuple, char** row)
 //
-// The IR for sum(double_col) is:
+// The IR for sum(double_col), which is constructed directly with the 
IRBuilder, is:
 //
 // define void @UpdateSlot(%"class.impala_udf::FunctionContext"* %agg_fn_ctx,
-//                         %"class.impala::ExprContext"* %agg_expr_ctx,
-//                         { i8, [7 x i8], double }* %agg_tuple,
-//                         %"class.impala::TupleRow"* %row) #34 {
-//
+//    %"class.impala::ExprContext"** %agg_expr_ctxs,
+//    { i8, [7 x i8], double }* %agg_tuple, %"class.impala::TupleRow"* %row) 
#34 {
 // entry:
-//   %src = call { i8, double } @GetSlotRef(%"class.impala::ExprContext"* 
%agg_expr_ctx,
-//       %"class.impala::TupleRow"* %row)
-//   %0 = extractvalue { i8, double } %src, 0
-//   %is_null = trunc i8 %0 to i1
-//   br i1 %is_null, label %ret, label %src_not_null
-//
-// src_not_null:                                     ; preds = %entry
+//   %expr_ctx_ptr = getelementptr %"class.impala::ExprContext"*,
+//      %"class.impala::ExprContext"** %agg_expr_ctxs, i32 0
+//   %expr_ctx = load %"class.impala::ExprContext"*,
+//      %"class.impala::ExprContext"** %expr_ctx_ptr
+//   %input0 = call { i8, double } @GetSlotRef(%"class.impala::ExprContext"* 
%expr_ctx,
+//      %"class.impala::TupleRow"* %row)
 //   %dst_slot_ptr = getelementptr inbounds { i8, [7 x i8], double },
 //       { i8, [7 x i8], double }* %agg_tuple, i32 0, i32 2
-//   call void @SetNotNull({ i8, [7 x i8], double }* %agg_tuple)
 //   %dst_val = load double, double* %dst_slot_ptr
-//   %val = extractvalue { i8, double } %src, 1
+//   %0 = extractvalue { i8, double } %input0, 0
+//   %is_null = trunc i8 %0 to i1
+//   br i1 %is_null, label %ret, label %not_null
+//
+// ret:                                              ; preds = %not_null, 
%entry
+//   ret void
+//
+// not_null:                                         ; preds = %entry
+//   %val = extractvalue { i8, double } %input0, 1
 //   %1 = fadd double %dst_val, %val
+//   %2 = bitcast { i8, [7 x i8], double }* %agg_tuple to i8*
+//   %null_byte_ptr = getelementptr i8, i8* %2, i32 0
+//   %null_byte = load i8, i8* %null_byte_ptr
+//   %null_bit_cleared = and i8 %null_byte, -2
+//   store i8 %null_bit_cleared, i8* %null_byte_ptr
 //   store double %1, double* %dst_slot_ptr
 //   br label %ret
-//
-// ret:                                              ; preds = %src_not_null, 
%entry
-//   ret void
 // }
 //
-// The IR for ndv(double_col) is:
+// The IR for min(timestamp_col), which uses the UDA interface, is:
 //
 // define void @UpdateSlot(%"class.impala_udf::FunctionContext"* %agg_fn_ctx,
-//                         %"class.impala::ExprContext"* %agg_expr_ctx,
-//                         { i8, [7 x i8], %"struct.impala::StringValue" }* 
%agg_tuple,
-//                         %"class.impala::TupleRow"* %row) #34 {
+//      %"class.impala::ExprContext"** %agg_expr_ctxs,
+//      { i8, [7 x i8], %"class.impala::TimestampValue" }* %agg_tuple,
+//      %"class.impala::TupleRow"* %row) #34 {
 // entry:
-//   %dst_lowered_ptr = alloca { i64, i8* }
-//   %src_lowered_ptr = alloca { i8, double }
-//   %src = call { i8, double } @GetSlotRef(%"class.impala::ExprContext"* 
%agg_expr_ctx,
-//       %"class.impala::TupleRow"* %row)
-//   %0 = extractvalue { i8, double } %src, 0
-//   %is_null = trunc i8 %0 to i1
-//   br i1 %is_null, label %ret, label %src_not_null
-//
-// src_not_null:                                     ; preds = %entry
-//   %dst_slot_ptr = getelementptr inbounds { i8, [7 x i8], 
%"struct.impala::StringValue" },
-//       { i8, [7 x i8], %"struct.impala::StringValue" }* %agg_tuple, i32 0, 
i32 2
-//   call void @SetNotNull({ i8, [7 x i8], %"struct.impala::StringValue" }* 
%agg_tuple)
-//   %dst_val =
-//       load %"struct.impala::StringValue", %"struct.impala::StringValue"* 
%dst_slot_ptr
-//   store { i8, double } %src, { i8, double }* %src_lowered_ptr
-//   %src_unlowered_ptr =
-//       bitcast { i8, double }* %src_lowered_ptr to 
%"struct.impala_udf::DoubleVal"*
-//   %ptr = extractvalue %"struct.impala::StringValue" %dst_val, 0
-//   %dst = insertvalue { i64, i8* } zeroinitializer, i8* %ptr, 1
-//   %len = extractvalue %"struct.impala::StringValue" %dst_val, 1
-//   %1 = extractvalue { i64, i8* } %dst, 0
-//   %2 = zext i32 %len to i64
-//   %3 = shl i64 %2, 32
-//   %4 = and i64 %1, 4294967295
-//   %5 = or i64 %4, %3
-//   %dst1 = insertvalue { i64, i8* } %dst, i64 %5, 0
-//   store { i64, i8* } %dst1, { i64, i8* }* %dst_lowered_ptr
-//   %dst_unlowered_ptr =
-//       bitcast { i64, i8* }* %dst_lowered_ptr to 
%"struct.impala_udf::StringVal"*
-//   call void @HllUpdate(%"class.impala_udf::FunctionContext"* %agg_fn_ctx,
-//                        %"struct.impala_udf::DoubleVal"* %src_unlowered_ptr,
-//                        %"struct.impala_udf::StringVal"* %dst_unlowered_ptr)
-//   %anyval_result = load { i64, i8* }, { i64, i8* }* %dst_lowered_ptr
-//   %6 = extractvalue { i64, i8* } %anyval_result, 0
-//   %7 = ashr i64 %6, 32
-//   %8 = trunc i64 %7 to i32
-//   %9 = insertvalue %"struct.impala::StringValue" zeroinitializer, i32 %8, 1
-//   %10 = extractvalue { i64, i8* } %anyval_result, 1
-//   %11 = insertvalue %"struct.impala::StringValue" %9, i8* %10, 0
-//   store %"struct.impala::StringValue" %11, %"struct.impala::StringValue"* 
%dst_slot_ptr
+//   %dst_lowered_ptr = alloca { i64, i64 }
+//   %input_lowered_ptr = alloca { i64, i64 }
+//   %expr_ctx_ptr = getelementptr %"class.impala::ExprContext"*,
+//        %"class.impala::ExprContext"** %agg_expr_ctxs, i32 0
+//   %expr_ctx = load %"class.impala::ExprContext"*,
+//        %"class.impala::ExprContext"** %expr_ctx_ptr
+//   %input0 = call { i64, i64 } @GetSlotRef(%"class.impala::ExprContext"* 
%expr_ctx,
+//        %"class.impala::TupleRow"* %row)
+//   %dst_slot_ptr = getelementptr inbounds { i8, [7 x i8],
+//        %"class.impala::TimestampValue" }, { i8, [7 x i8],
+//        %"class.impala::TimestampValue" }* %agg_tuple, i32 0, i32 2
+//   %dst_val = load %"class.impala::TimestampValue",
+//        %"class.impala::TimestampValue"* %dst_slot_ptr
+//   %0 = bitcast { i8, [7 x i8], %"class.impala::TimestampValue" }* 
%agg_tuple to i8*
+//   %null_byte_ptr = getelementptr i8, i8* %0, i32 0
+//   %null_byte = load i8, i8* %null_byte_ptr
+//   %null_mask = and i8 %null_byte, 1
+//   %is_null = icmp ne i8 %null_mask, 0
+//   %is_null_ext = zext i1 %is_null to i64
+//   %1 = or i64 0, %is_null_ext
+//   %dst = insertvalue { i64, i64 } zeroinitializer, i64 %1, 0
+//   %time_of_day = extractvalue %"class.impala::TimestampValue" %dst_val, 0, 
0, 0, 0
+//   %dst1 = insertvalue { i64, i64 } %dst, i64 %time_of_day, 1
+//   %date = extractvalue %"class.impala::TimestampValue" %dst_val, 1, 0, 0
+//   %2 = extractvalue { i64, i64 } %dst1, 0
+//   %3 = zext i32 %date to i64
+//   %4 = shl i64 %3, 32
+//   %5 = and i64 %2, 4294967295
+//   %6 = or i64 %5, %4
+//   %dst2 = insertvalue { i64, i64 } %dst1, i64 %6, 0
+//   store { i64, i64 } %input0, { i64, i64 }* %input_lowered_ptr
+//   %input_unlowered_ptr = bitcast { i64, i64 }* %input_lowered_ptr
+//        to %"struct.impala_udf::TimestampVal"*
+//   store { i64, i64 } %dst2, { i64, i64 }* %dst_lowered_ptr
+//   %dst_unlowered_ptr = bitcast { i64, i64 }* %dst_lowered_ptr
+//        to %"struct.impala_udf::TimestampVal"*
+//   call void
+//        
@_ZN6impala18AggregateFunctions3MinIN10impala_udf12TimestampValEEEvPNS2_15FunctionContextERKT_PS6_.2(
+//        %"class.impala_udf::FunctionContext"* %agg_fn_ctx,
+//        %"struct.impala_udf::TimestampVal"* %input_unlowered_ptr,
+//        %"struct.impala_udf::TimestampVal"* %dst_unlowered_ptr)
+//   %anyval_result = load { i64, i64 }, { i64, i64 }* %dst_lowered_ptr
+//   %7 = extractvalue { i64, i64 } %anyval_result, 1
+//   %8 = insertvalue %"class.impala::TimestampValue" zeroinitializer, i64 %7, 
0, 0, 0, 0
+//   %9 = extractvalue { i64, i64 } %anyval_result, 0
+//   %10 = ashr i64 %9, 32
+//   %11 = trunc i64 %10 to i32
+//   %12 = insertvalue %"class.impala::TimestampValue" %8, i32 %11, 1, 0, 0
+//   %13 = extractvalue { i64, i64 } %anyval_result, 0
+//   %result_is_null = trunc i64 %13 to i1
+//   %14 = bitcast { i8, [7 x i8], %"class.impala::TimestampValue" }* 
%agg_tuple to i8*
+//   %null_byte_ptr3 = getelementptr i8, i8* %14, i32 0
+//   %null_byte4 = load i8, i8* %null_byte_ptr3
+//   %null_bit_cleared = and i8 %null_byte4, -2
+//   %15 = sext i1 %result_is_null to i8
+//   %null_bit = and i8 %15, 1
+//   %null_bit_set = or i8 %null_bit_cleared, %null_bit
+//   store i8 %null_bit_set, i8* %null_byte_ptr3
+//   store %"class.impala::TimestampValue" %12,
+//        %"class.impala::TimestampValue"* %dst_slot_ptr
 //   br label %ret
 //
-// ret:                                              ; preds = %src_not_null, 
%entry
+// ret:                                              ; preds = %entry
 //   ret void
 // }
+//
 Status PartitionedAggregationNode::CodegenUpdateSlot(LlvmCodeGen* codegen,
     AggFnEvaluator* evaluator, SlotDescriptor* slot_desc, Function** fn) {
-  // TODO: Fix this DCHECK and Init() once CodegenUpdateSlot() can handle 
AggFnEvaluator
-  // with multiple input expressions (e.g. group_concat).
-  DCHECK_EQ(evaluator->input_expr_ctxs().size(), 1);
-  ExprContext* agg_expr_ctx = evaluator->input_expr_ctxs()[0];
-  Expr* agg_expr = agg_expr_ctx->root();
-
-  // TODO: implement timestamp
-  if (agg_expr->type().type == TYPE_TIMESTAMP &&
-      evaluator->agg_op() != AggFnEvaluator::AVG) {
-    return Status("PartitionedAggregationNode::CodegenUpdateSlot(): timestamp 
input type "
-        "NYI");
-  }
-
-  Function* agg_expr_fn;
-  RETURN_IF_ERROR(agg_expr->GetCodegendComputeFn(codegen, &agg_expr_fn));
-
   PointerType* fn_ctx_type =
       codegen->GetPtrType(FunctionContextImpl::LLVM_FUNCTIONCONTEXT_NAME);
-  PointerType* expr_ctx_type = 
codegen->GetPtrType(ExprContext::LLVM_CLASS_NAME);
+  PointerType* expr_ctxs_type =
+      codegen->GetPtrPtrType(codegen->GetType(ExprContext::LLVM_CLASS_NAME));
   StructType* tuple_struct = intermediate_tuple_desc_->GetLlvmStruct(codegen);
   if (tuple_struct == NULL) {
     return Status("PartitionedAggregationNode::CodegenUpdateSlot(): failed to 
generate "
-        "intermediate tuple desc");
+                  "intermediate tuple desc");
   }
   PointerType* tuple_ptr_type = codegen->GetPtrType(tuple_struct);
   PointerType* tuple_row_ptr_type = 
codegen->GetPtrType(TupleRow::LLVM_CLASS_NAME);
@@ -1562,129 +1571,129 @@ Status 
PartitionedAggregationNode::CodegenUpdateSlot(LlvmCodeGen* codegen,
   // Create UpdateSlot prototype
   LlvmCodeGen::FnPrototype prototype(codegen, "UpdateSlot", 
codegen->void_type());
   prototype.AddArgument(LlvmCodeGen::NamedVariable("agg_fn_ctx", fn_ctx_type));
-  prototype.AddArgument(LlvmCodeGen::NamedVariable("agg_expr_ctx", 
expr_ctx_type));
+  prototype.AddArgument(LlvmCodeGen::NamedVariable("agg_expr_ctxs", 
expr_ctxs_type));
   prototype.AddArgument(LlvmCodeGen::NamedVariable("agg_tuple", 
tuple_ptr_type));
   prototype.AddArgument(LlvmCodeGen::NamedVariable("row", tuple_row_ptr_type));
 
-  LlvmCodeGen::LlvmBuilder builder(codegen->context());
+  LlvmBuilder builder(codegen->context());
   Value* args[4];
   *fn = prototype.GeneratePrototype(&builder, &args[0]);
   Value* agg_fn_ctx_arg = args[0];
-  Value* agg_expr_ctx_arg = args[1];
+  Value* agg_expr_ctxs_arg = args[1];
   Value* agg_tuple_arg = args[2];
   Value* row_arg = args[3];
 
-  BasicBlock* src_not_null_block =
-      BasicBlock::Create(codegen->context(), "src_not_null", *fn);
-  BasicBlock* ret_block = BasicBlock::Create(codegen->context(), "ret", *fn);
-
-  // Call expr function to get src slot value
-  Value* agg_expr_fn_args[] = { agg_expr_ctx_arg, row_arg };
-  CodegenAnyVal src = CodegenAnyVal::CreateCallWrapped(
-      codegen, &builder, agg_expr->type(), agg_expr_fn, agg_expr_fn_args, 
"src");
+  DCHECK_GE(evaluator->input_expr_ctxs().size(), 1);
+  vector<CodegenAnyVal> input_vals;
+  for (int i = 0; i < evaluator->input_expr_ctxs().size(); ++i) {
+    ExprContext* agg_expr_ctx = evaluator->input_expr_ctxs()[i];
+    Expr* agg_expr = agg_expr_ctx->root();
+    Function* agg_expr_fn;
+    RETURN_IF_ERROR(agg_expr->GetCodegendComputeFn(codegen, &agg_expr_fn));
+    DCHECK(agg_expr_fn != NULL);
+
+    // Call expr function with the matching expr context to get src slot value.
+    Value* expr_ctx_ptr = builder.CreateInBoundsGEP(
+        agg_expr_ctxs_arg, codegen->GetIntConstant(TYPE_INT, i), 
"expr_ctx_ptr");
+    Value* expr_ctx = builder.CreateLoad(expr_ctx_ptr, "expr_ctx");
+    string input_name = Substitute("input$0", i);
+    input_vals.push_back(
+        CodegenAnyVal::CreateCallWrapped(codegen, &builder, agg_expr->type(), 
agg_expr_fn,
+            ArrayRef<Value*>({expr_ctx, row_arg}), input_name.c_str()));
+  }
+
+  AggFnEvaluator::AggregationOp agg_op = evaluator->agg_op();
+  const ColumnType& dst_type = evaluator->intermediate_type();
+  bool dst_is_int_or_float_or_bool = dst_type.IsIntegerType()
+      || dst_type.IsFloatingPointType() || dst_type.IsBooleanType();
+  bool dst_is_numeric_or_bool = dst_is_int_or_float_or_bool || 
dst_type.IsDecimalType();
 
-  Value* src_is_null = src.GetIsNull();
-  builder.CreateCondBr(src_is_null, ret_block, src_not_null_block);
+  BasicBlock* ret_block = BasicBlock::Create(codegen->context(), "ret", *fn);
 
-  // Src slot is not null, update dst_slot
-  builder.SetInsertPoint(src_not_null_block);
-  Value* dst_ptr = builder.CreateStructGEP(NULL, agg_tuple_arg, 
slot_desc->llvm_field_idx(),
-      "dst_slot_ptr");
+  // Emit the code to compute 'result' and set the NULL indicator if needed. 
First check
+  // for special cases where we can emit a very simple instruction sequence, 
then fall
+  // back to the general-purpose approach of calling the cross-compiled 
builtin UDA.
+  CodegenAnyVal& src = input_vals[0];
+  // 'dst_slot_ptr' points to the slot in the aggregate tuple to update.
+  Value* dst_slot_ptr = builder.CreateStructGEP(
+      NULL, agg_tuple_arg, slot_desc->llvm_field_idx(), "dst_slot_ptr");
   Value* result = NULL;
-
-  if (slot_desc->is_nullable()) {
-    // Dst is NULL, just update dst slot to src slot and clear null bit
-    Function* clear_null_fn = slot_desc->GetUpdateNullFn(codegen, false);
-    builder.CreateCall(clear_null_fn, ArrayRef<Value*>({agg_tuple_arg}));
-  }
-
-  // Update the slot
-  Value* dst_value = builder.CreateLoad(dst_ptr, "dst_val");
-  switch (evaluator->agg_op()) {
-    case AggFnEvaluator::COUNT:
-      if (evaluator->is_merge()) {
-        result = builder.CreateAdd(dst_value, src.GetVal(), "count_sum");
-      } else {
-        result = builder.CreateAdd(dst_value,
-            codegen->GetIntConstant(TYPE_BIGINT, 1), "count_inc");
-      }
-      break;
-    case AggFnEvaluator::MIN: {
-      Function* min_fn = codegen->CodegenMinMax(slot_desc->type(), true);
-      Value* min_args[] = { dst_value, src.GetVal() };
-      result = builder.CreateCall(min_fn, min_args, "min_value");
-      break;
+  Value* dst_value = builder.CreateLoad(dst_slot_ptr, "dst_val");
+  if (agg_op == AggFnEvaluator::COUNT) {
+    src.CodegenBranchIfNull(&builder, ret_block);
+    if (evaluator->is_merge()) {
+      result = builder.CreateAdd(dst_value, src.GetVal(), "count_sum");
+    } else {
+      result = builder.CreateAdd(
+          dst_value, codegen->GetIntConstant(TYPE_BIGINT, 1), "count_inc");
     }
-    case AggFnEvaluator::MAX: {
-      Function* max_fn = codegen->CodegenMinMax(slot_desc->type(), false);
-      Value* max_args[] = { dst_value, src.GetVal() };
-      result = builder.CreateCall(max_fn, max_args, "max_value");
-      break;
+    DCHECK(!slot_desc->is_nullable());
+  } else if ((agg_op == AggFnEvaluator::MIN || agg_op == AggFnEvaluator::MAX)
+      && dst_is_numeric_or_bool) {
+    bool is_min = agg_op == AggFnEvaluator::MIN;
+    src.CodegenBranchIfNull(&builder, ret_block);
+    Function* min_max_fn = codegen->CodegenMinMax(slot_desc->type(), is_min);
+    Value* min_max_args[] = {dst_value, src.GetVal()};
+    result =
+        builder.CreateCall(min_max_fn, min_max_args, is_min ? "min_value" : 
"max_value");
+    // Dst may have been NULL, make sure to unset the NULL bit.
+    DCHECK(slot_desc->is_nullable());
+    slot_desc->CodegenSetNullIndicator(
+        codegen, &builder, agg_tuple_arg, codegen->false_value());
+  } else if (agg_op == AggFnEvaluator::SUM && dst_is_int_or_float_or_bool) {
+    src.CodegenBranchIfNull(&builder, ret_block);
+    if (dst_type.IsFloatingPointType()) {
+      result = builder.CreateFAdd(dst_value, src.GetVal());
+    } else {
+      result = builder.CreateAdd(dst_value, src.GetVal());
     }
-    case AggFnEvaluator::SUM:
-      if (slot_desc->type().type != TYPE_DECIMAL) {
-        if (slot_desc->type().type == TYPE_FLOAT ||
-            slot_desc->type().type == TYPE_DOUBLE) {
-          result = builder.CreateFAdd(dst_value, src.GetVal());
-        } else {
-          result = builder.CreateAdd(dst_value, src.GetVal());
-        }
-        break;
+    // Dst may have been NULL, make sure to unset the NULL bit.
+    DCHECK(slot_desc->is_nullable());
+    slot_desc->CodegenSetNullIndicator(
+        codegen, &builder, agg_tuple_arg, codegen->false_value());
+  } else {
+    // The remaining cases are implemented using the UDA interface.
+    // Create intermediate argument 'dst' from 'dst_value'
+    CodegenAnyVal dst = CodegenAnyVal::GetNonNullVal(codegen, &builder, 
dst_type, "dst");
+
+    // For a subset of builtins we generate a different code sequence that 
exploits two
+    // properties of the builtins. First, NULL input values can be skipped. 
Second, the
+    // value of the slot was initialized in the right way in InitAggSlots() 
(e.g. 0 for
+    // SUM) that we get the right result if UpdateSlot() pretends that the 
NULL bit of
+    // 'dst' is unset. Empirically this optimisation makes TPC-H Q1 5-10% 
faster.
+    bool special_null_handling = !evaluator->intermediate_type().IsStringType()
+        && !evaluator->intermediate_type().IsTimestampType()
+        && (agg_op == AggFnEvaluator::MIN || agg_op == AggFnEvaluator::MAX
+               || agg_op == AggFnEvaluator::SUM || agg_op == 
AggFnEvaluator::AVG
+               || agg_op == AggFnEvaluator::NDV);
+    if (slot_desc->is_nullable()) {
+      if (special_null_handling) {
+        src.CodegenBranchIfNull(&builder, ret_block);
+        slot_desc->CodegenSetNullIndicator(
+            codegen, &builder, agg_tuple_arg, codegen->false_value());
+      } else {
+        dst.SetIsNull(slot_desc->CodegenIsNull(codegen, &builder, 
agg_tuple_arg));
       }
-      DCHECK_EQ(slot_desc->type().type, TYPE_DECIMAL);
-      // Fall through to xcompiled case
-    case AggFnEvaluator::AVG:
-    case AggFnEvaluator::NDV: {
-      // Get xcompiled update/merge function from IR module
-      const string& symbol = evaluator->is_merge() ?
-                             evaluator->merge_symbol() : 
evaluator->update_symbol();
-      const ColumnType& dst_type = evaluator->intermediate_type();
-      Function* ir_fn = codegen->GetFunction(symbol);
-      DCHECK(ir_fn != NULL);
-
-      // Clone and replace constants.
-      ir_fn = codegen->CloneFunction(ir_fn);
-      vector<FunctionContext::TypeDesc> arg_types;
-      arg_types.push_back(AnyValUtil::ColumnTypeToTypeDesc(agg_expr->type()));
-      Expr::InlineConstants(AnyValUtil::ColumnTypeToTypeDesc(dst_type), 
arg_types,
-          codegen, ir_fn);
-
-      // Create pointer to src to pass to ir_fn. We must use the unlowered 
type.
-      Value* src_lowered_ptr = codegen->CreateEntryBlockAlloca(
-          *fn, LlvmCodeGen::NamedVariable("src_lowered_ptr", 
src.value()->getType()));
-      builder.CreateStore(src.value(), src_lowered_ptr);
-      Type* unlowered_ptr_type =
-          CodegenAnyVal::GetUnloweredPtrType(codegen, agg_expr->type());
-      Value* src_unlowered_ptr =
-          builder.CreateBitCast(src_lowered_ptr, unlowered_ptr_type, 
"src_unlowered_ptr");
-
-      // Create intermediate argument 'dst' from 'dst_value'
-      CodegenAnyVal dst = CodegenAnyVal::GetNonNullVal(
-          codegen, &builder, dst_type, "dst");
-      dst.SetFromRawValue(dst_value);
-      // Create pointer to dst to pass to ir_fn. We must use the unlowered 
type.
-      Value* dst_lowered_ptr = codegen->CreateEntryBlockAlloca(
-          *fn, LlvmCodeGen::NamedVariable("dst_lowered_ptr", 
dst.value()->getType()));
-      builder.CreateStore(dst.value(), dst_lowered_ptr);
-      unlowered_ptr_type = CodegenAnyVal::GetUnloweredPtrType(codegen, 
dst_type);
-      Value* dst_unlowered_ptr =
-          builder.CreateBitCast(dst_lowered_ptr, unlowered_ptr_type, 
"dst_unlowered_ptr");
-
-      // Call 'ir_fn'
-      builder.CreateCall(ir_fn,
-          ArrayRef<Value*>({agg_fn_ctx_arg, src_unlowered_ptr, 
dst_unlowered_ptr}));
-
-      // Convert StringVal intermediate 'dst_arg' back to StringValue
-      Value* anyval_result = builder.CreateLoad(dst_lowered_ptr, 
"anyval_result");
-      result = CodegenAnyVal(codegen, &builder, dst_type, 
anyval_result).ToNativeValue();
-      break;
     }
-    default:
-      DCHECK(false) << "bad aggregate operator: " << evaluator->agg_op();
+    dst.SetFromRawValue(dst_value);
+
+    // Call the UDA to update/merge 'src' into 'dst', with the result stored in
+    // 'updated_dst_val'.
+    CodegenAnyVal updated_dst_val;
+    RETURN_IF_ERROR(CodegenCallUda(
+        codegen, &builder, evaluator, agg_fn_ctx_arg, input_vals, dst, 
&updated_dst_val));
+    result = updated_dst_val.ToNativeValue();
+
+    if (slot_desc->is_nullable() && !special_null_handling) {
+      // Set NULL bit in the slot based on the return value.
+      Value* result_is_null = updated_dst_val.GetIsNull("result_is_null");
+      slot_desc->CodegenSetNullIndicator(
+          codegen, &builder, agg_tuple_arg, result_is_null);
+    }
   }
 
   // TODO: Store to register in the loop and store once to memory at the end 
of the loop.
-  builder.CreateStore(result, dst_ptr);
+  builder.CreateStore(result, dst_slot_ptr);
   builder.CreateBr(ret_block);
 
   builder.SetInsertPoint(ret_block);
@@ -1698,6 +1707,58 @@ Status 
PartitionedAggregationNode::CodegenUpdateSlot(LlvmCodeGen* codegen,
   return Status::OK();
 }
 
+Status PartitionedAggregationNode::CodegenCallUda(LlvmCodeGen* codegen,
+    LlvmBuilder* builder, AggFnEvaluator* evaluator, Value* agg_fn_ctx,
+    const vector<CodegenAnyVal>& input_vals, const CodegenAnyVal& dst,
+    CodegenAnyVal* updated_dst_val) {
+  DCHECK_EQ(evaluator->input_expr_ctxs().size(), input_vals.size());
+  const string& symbol =
+      evaluator->is_merge() ? evaluator->merge_symbol() : 
evaluator->update_symbol();
+  const ColumnType& dst_type = evaluator->intermediate_type();
+
+  // TODO: to support actual UDAs, not just builtin functions using the UDA 
interface,
+  // we need to load the function at this point.
+  Function* uda_fn = codegen->GetFunction(symbol, true);
+  DCHECK(uda_fn != NULL);
+
+  vector<FunctionContext::TypeDesc> arg_types;
+  for (int i = 0; i < evaluator->input_expr_ctxs().size(); ++i) {
+    arg_types.push_back(AnyValUtil::ColumnTypeToTypeDesc(
+        evaluator->input_expr_ctxs()[i]->root()->type()));
+  }
+  Expr::InlineConstants(
+      AnyValUtil::ColumnTypeToTypeDesc(dst_type), arg_types, codegen, uda_fn);
+
+  // Set up arguments for call to UDA, which are the FunctionContext*, 
followed by
+  // pointers to all input values, followed by a pointer to the destination 
value.
+  vector<Value*> uda_fn_args;
+  uda_fn_args.push_back(agg_fn_ctx);
+
+  // Create pointers to input args to pass to uda_fn. We must use the 
unlowered type,
+  // e.g. IntVal, because the UDA interface expects the values to be passed as 
const
+  // references to the classes.
+  for (int i = 0; i < evaluator->input_expr_ctxs().size(); ++i) {
+    
uda_fn_args.push_back(input_vals[i].GetUnloweredPtr("input_unlowered_ptr"));
+  }
+
+  // Create pointer to dst to pass to uda_fn. We must use the unlowered type 
for the
+  // same reason as above.
+  Value* dst_lowered_ptr = dst.GetLoweredPtr("dst_lowered_ptr");
+  Type* dst_unlowered_ptr_type = CodegenAnyVal::GetUnloweredPtrType(codegen, 
dst_type);
+  Value* dst_unlowered_ptr = builder->CreateBitCast(
+      dst_lowered_ptr, dst_unlowered_ptr_type, "dst_unlowered_ptr");
+  uda_fn_args.push_back(dst_unlowered_ptr);
+
+  // Call 'uda_fn'
+  builder->CreateCall(uda_fn, uda_fn_args);
+
+  // Convert intermediate 'dst_arg' back to the native type.
+  Value* anyval_result = builder->CreateLoad(dst_lowered_ptr, "anyval_result");
+
+  *updated_dst_val = CodegenAnyVal(codegen, builder, dst_type, anyval_result);
+  return Status::OK();
+}
+
 // IR codegen for the UpdateTuple loop.  This loop is query specific and based 
on the
 // aggregate functions.  The function signature must match the non- codegen'd 
UpdateTuple
 // exactly.
@@ -1706,78 +1767,60 @@ Status 
PartitionedAggregationNode::CodegenUpdateSlot(LlvmCodeGen* codegen,
 //
 // ; Function Attrs: alwaysinline
 // define void @UpdateTuple(%"class.impala::PartitionedAggregationNode"* 
%this_ptr,
-//                          %"class.impala_udf::FunctionContext"** 
%agg_fn_ctxs,
-//                          %"class.impala::Tuple"* %tuple,
-//                          %"class.impala::TupleRow"* %row,
-//                          i1 %is_merge) #34 {
+//      %"class.impala_udf::FunctionContext"** %agg_fn_ctxs, 
%"class.impala::Tuple"*
+//      %tuple,
+//      %"class.impala::TupleRow"* %row, i1 %is_merge) #34 {
 // entry:
 //   %tuple1 =
-//       bitcast %"class.impala::Tuple"* %tuple to { i8, [7 x i8], i64, i64, 
double }*
+//      bitcast %"class.impala::Tuple"* %tuple to { i8, [7 x i8], i64, i64, 
double }*
 //   %src_slot = getelementptr inbounds { i8, [7 x i8], i64, i64, double },
-//       { i8, [7 x i8], i64, i64, double }* %tuple1, i32 0, i32 2
+//      { i8, [7 x i8], i64, i64, double }* %tuple1, i32 0, i32 2
 //   %count_star_val = load i64, i64* %src_slot
 //   %count_star_inc = add i64 %count_star_val, 1
 //   store i64 %count_star_inc, i64* %src_slot
 //   %0 = getelementptr %"class.impala_udf::FunctionContext"*,
-//       %"class.impala_udf::FunctionContext"** %agg_fn_ctxs, i32 1
+//      %"class.impala_udf::FunctionContext"** %agg_fn_ctxs, i32 1
 //   %agg_fn_ctx = load %"class.impala_udf::FunctionContext"*,
-//       %"class.impala_udf::FunctionContext"** %0
-//   %1 = call %"class.impala::ExprContext"*
-//       @_ZNK6impala26PartitionedAggregationNode17GetAggExprContextEi(
-//           %"class.impala::PartitionedAggregationNode"* %this_ptr, i32 1)
+//      %"class.impala_udf::FunctionContext"** %0
+//   %1 = call %"class.impala::ExprContext"**
+//      @_ZNK6impala26PartitionedAggregationNode18GetAggExprContextsEi(
+//      %"class.impala::PartitionedAggregationNode"* %this_ptr, i32 1)
 //   call void @UpdateSlot(%"class.impala_udf::FunctionContext"* %agg_fn_ctx,
-//                         %"class.impala::ExprContext"* %1,
-//                         { i8, [7 x i8], i64, i64, double }* %tuple1,
-//                         %"class.impala::TupleRow"* %row)
+//      %"class.impala::ExprContext"** %1, { i8, [7 x i8], i64, i64, double }* 
%tuple1,
+//      %"class.impala::TupleRow"* %row)
 //   %2 = getelementptr %"class.impala_udf::FunctionContext"*,
-//       %"class.impala_udf::FunctionContext"** %agg_fn_ctxs, i32 2
+//      %"class.impala_udf::FunctionContext"** %agg_fn_ctxs, i32 2
 //   %agg_fn_ctx2 = load %"class.impala_udf::FunctionContext"*,
-//       %"class.impala_udf::FunctionContext"** %2
-//   %3 = call %"class.impala::ExprContext"*
-//       @_ZNK6impala26PartitionedAggregationNode17GetAggExprContextEi(
-//           %"class.impala::PartitionedAggregationNode"* %this_ptr, i32 2)
-//   call void @UpdateSlot.3(%"class.impala_udf::FunctionContext"* 
%agg_fn_ctx2,
-//                          %"class.impala::ExprContext"* %3,
-//                          { i8, [7 x i8], i64, i64, double }* %tuple1,
-//                          %"class.impala::TupleRow"* %row)
+//      %"class.impala_udf::FunctionContext"** %2
+//   %3 = call %"class.impala::ExprContext"**
+//      @_ZNK6impala26PartitionedAggregationNode18GetAggExprContextsEi(
+//      %"class.impala::PartitionedAggregationNode"* %this_ptr, i32 2)
+//   call void @UpdateSlot.4(%"class.impala_udf::FunctionContext"* 
%agg_fn_ctx2,
+//      %"class.impala::ExprContext"** %3, { i8, [7 x i8], i64, i64, double }* 
%tuple1,
+//      %"class.impala::TupleRow"* %row)
 //   ret void
 // }
-Status PartitionedAggregationNode::CodegenUpdateTuple(LlvmCodeGen* codegen,
-    Function** fn) {
+Status PartitionedAggregationNode::CodegenUpdateTuple(
+    LlvmCodeGen* codegen, Function** fn) {
   SCOPED_TIMER(codegen->codegen_timer());
 
-  int j = grouping_expr_ctxs_.size();
-  for (int i = 0; i < aggregate_evaluators_.size(); ++i, ++j) {
-    SlotDescriptor* slot_desc = intermediate_tuple_desc_->slots()[j];
-    AggFnEvaluator* evaluator = aggregate_evaluators_[i];
-
-    // Don't codegen things that aren't builtins (for now)
-    if (!evaluator->is_builtin()) {
-      return Status("PartitionedAggregationNode::CodegenUpdateTuple(): UDA 
codegen NYI");
-    }
-
-    bool supported = true;
-    AggFnEvaluator::AggregationOp op = evaluator->agg_op();
-    PrimitiveType type = slot_desc->type().type;
-    // Char and timestamp intermediates aren't supported
-    if (type == TYPE_TIMESTAMP || type == TYPE_CHAR) supported = false;
-    // Only AVG and NDV support string intermediates
-    if ((type == TYPE_STRING || type == TYPE_VARCHAR) &&
-        !(op == AggFnEvaluator::AVG || op == AggFnEvaluator::NDV)) {
-      supported = false;
-    }
-    if (!supported) {
-      stringstream ss;
-      ss << "Could not codegen PartitionedAggregationNode::UpdateTuple because 
"
-         << "intermediate type " << slot_desc->type() << " is not yet 
supported for "
-         << "aggregate function \"" << evaluator->fn_name() << "()\"";
-      return Status(ss.str());
+  for (const SlotDescriptor* slot_desc : intermediate_tuple_desc_->slots()) {
+    if (slot_desc->type().type == TYPE_CHAR) {
+      return Status("PartitionedAggregationNode::CodegenUpdateTuple(): cannot 
codegen"
+                    "CHAR in aggregations");
     }
   }
 
   if (intermediate_tuple_desc_->GetLlvmStruct(codegen) == NULL) {
     return Status("PartitionedAggregationNode::CodegenUpdateTuple(): failed to 
generate "
-        "intermediate tuple desc");
+                  "intermediate tuple desc");
+  }
+
+  for (AggFnEvaluator* evaluator : aggregate_evaluators_) {
+    // Don't codegen things that aren't builtins (for now)
+    if (!evaluator->is_builtin()) {
+      return Status("PartitionedAggregationNode::CodegenUpdateTuple(): UDA 
codegen NYI");
+    }
   }
 
   // Get the types to match the UpdateTuple signature
@@ -1800,7 +1843,7 @@ Status 
PartitionedAggregationNode::CodegenUpdateTuple(LlvmCodeGen* codegen,
   prototype.AddArgument(LlvmCodeGen::NamedVariable("row", tuple_row_ptr_type));
   prototype.AddArgument(LlvmCodeGen::NamedVariable("is_merge", 
codegen->boolean_type()));
 
-  LlvmCodeGen::LlvmBuilder builder(codegen->context());
+  LlvmBuilder builder(codegen->context());
   Value* args[5];
   *fn = prototype.GeneratePrototype(&builder, &args[0]);
   Value* this_arg = args[0];
@@ -1812,13 +1855,13 @@ Status 
PartitionedAggregationNode::CodegenUpdateTuple(LlvmCodeGen* codegen,
   // TODO: get rid of this by using right type in function signature
   tuple_arg = builder.CreateBitCast(tuple_arg, tuple_ptr, "tuple");
 
-  Function* get_expr_ctx_fn =
-      codegen->GetFunction(IRFunction::PART_AGG_NODE_GET_EXPR_CTX, false);
-  DCHECK(get_expr_ctx_fn != NULL);
+  Function* get_expr_ctxs_fn =
+      codegen->GetFunction(IRFunction::PART_AGG_NODE_GET_EXPR_CTXS, false);
+  DCHECK(get_expr_ctxs_fn != NULL);
 
   // Loop over each expr and generate the IR for that slot.  If the expr is not
   // count(*), generate a helper IR function to update the slot and call that.
-  j = grouping_expr_ctxs_.size();
+  int j = grouping_expr_ctxs_.size();
   for (int i = 0; i < aggregate_evaluators_.size(); ++i, ++j) {
     SlotDescriptor* slot_desc = intermediate_tuple_desc_->slots()[j];
     AggFnEvaluator* evaluator = aggregate_evaluators_[i];
@@ -1838,9 +1881,9 @@ Status 
PartitionedAggregationNode::CodegenUpdateTuple(LlvmCodeGen* codegen,
       Value* agg_fn_ctx = builder.CreateLoad(agg_fn_ctx_ptr, "agg_fn_ctx");
       // Call GetExprCtx() to get the expression context.
       DCHECK(agg_expr_ctxs_[i] != NULL);
-      Value* get_expr_ctx_args[] = { this_arg, 
codegen->GetIntConstant(TYPE_INT, i) };
-      Value* agg_expr_ctx = builder.CreateCall(get_expr_ctx_fn, 
get_expr_ctx_args);
-      Value* update_slot_args[] = { agg_fn_ctx, agg_expr_ctx, tuple_arg, 
row_arg };
+      Value* get_expr_ctxs_args[] = {this_arg, 
codegen->GetIntConstant(TYPE_INT, i)};
+      Value* agg_expr_ctxs = builder.CreateCall(get_expr_ctxs_fn, 
get_expr_ctxs_args);
+      Value* update_slot_args[] = {agg_fn_ctx, agg_expr_ctxs, tuple_arg, 
row_arg};
       builder.CreateCall(update_slot_fn, update_slot_args);
     }
   }
@@ -1849,8 +1892,8 @@ Status 
PartitionedAggregationNode::CodegenUpdateTuple(LlvmCodeGen* codegen,
   // CodegenProcessBatch() does the final optimizations.
   *fn = codegen->FinalizeFunction(*fn);
   if (*fn == NULL) {
-    return Status("PartitionedAggregationNode::CodegeUpdateTuple(): codegen'd "
-        "UpdateTuple() function failed verification, see log");
+    return Status("PartitionedAggregationNode::CodegenUpdateTuple(): codegen'd 
"
+                  "UpdateTuple() function failed verification, see log");
   }
   return Status::OK();
 }


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/d7246d64/be/src/exec/partitioned-aggregation-node.h
----------------------------------------------------------------------
diff --git a/be/src/exec/partitioned-aggregation-node.h 
b/be/src/exec/partitioned-aggregation-node.h
index 9e14e66..4287cea 100644
--- a/be/src/exec/partitioned-aggregation-node.h
+++ b/be/src/exec/partitioned-aggregation-node.h
@@ -30,13 +30,17 @@
 #include "runtime/string-value.h"
 
 namespace llvm {
-  class Function;
+class BasicBlock;
+class Function;
+class Value;
 }
 
 namespace impala {
 
 class AggFnEvaluator;
+class CodegenAnyVal;
 class LlvmCodeGen;
+class LlvmBuilder;
 class RowBatch;
 class RuntimeState;
 struct StringValue;
@@ -203,7 +207,7 @@ class PartitionedAggregationNode : public ExecNode {
   /// version of UpdateTuple() to avoid loading aggregate_evaluators_[i] at 
runtime.
   /// An entry is NULL if the aggregate evaluator is not codegen'ed or there 
is no Expr
   /// in the aggregate evaluator (e.g. count(*)).
-  std::vector<ExprContext*> agg_expr_ctxs_;
+  std::vector<ExprContext* const*> agg_expr_ctxs_;
 
   /// FunctionContext for each aggregate function and backing MemPool. String 
data
   /// returned by the aggregate functions is allocated via these contexts.
@@ -520,12 +524,13 @@ class PartitionedAggregationNode : public ExecNode {
   /// This function processes each individual row in ProcessBatch(). Must be 
inlined into
   /// ProcessBatch for codegen to substitute function calls with codegen'd 
versions.
   /// May spill partitions if not enough memory is available.
-  template<bool AGGREGATED_ROWS>
+  template <bool AGGREGATED_ROWS>
   Status IR_ALWAYS_INLINE ProcessRow(TupleRow* row, HashTableCtx* ht_ctx);
 
-  /// Accessor for the expression context of an AggFnEvaluator. Used only in 
codegen'ed
+  /// Accessor for the expression contexts of an AggFnEvaluator. Returns an 
array of
+  /// pointers the the AggFnEvaluator's expression contexts. Used only in 
codegen'ed
   /// version of UpdateTuple().
-  ExprContext* IR_ALWAYS_INLINE GetAggExprContext(int i) const;
+  ExprContext* const* IR_ALWAYS_INLINE GetAggExprContexts(int i) const;
 
   /// Create a new intermediate tuple in partition, initialized with row. 
ht_ctx is
   /// the context for the partition's hash table and hash is the precomputed 
hash of
@@ -640,6 +645,16 @@ class PartitionedAggregationNode : public ExecNode {
   Status CodegenUpdateSlot(LlvmCodeGen* codegen, AggFnEvaluator* evaluator,
       SlotDescriptor* slot_desc, llvm::Function** fn);
 
+  /// Codegen a call to a function implementing the UDA interface with input 
values
+  /// from 'input_vals'. 'dst_val' should contain the previous value of the 
aggregate
+  /// function, and 'updated_dst_val' is set to the new value after the Update 
or Merge
+  /// operation is applied. The instruction sequence for the UDA call is 
inserted at
+  /// the insert position of 'builder'.
+  Status CodegenCallUda(LlvmCodeGen* codegen, LlvmBuilder* builder,
+      AggFnEvaluator* evaluator, llvm::Value* agg_fn_ctx,
+      const std::vector<CodegenAnyVal>& input_vals, const CodegenAnyVal& 
dst_val,
+      CodegenAnyVal* updated_dst_val);
+
   /// Codegen UpdateTuple(). Returns non-OK status if codegen is unsuccessful.
   Status CodegenUpdateTuple(LlvmCodeGen* codegen, llvm::Function** fn);
 

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/d7246d64/be/src/exec/partitioned-hash-join-node.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/partitioned-hash-join-node.cc 
b/be/src/exec/partitioned-hash-join-node.cc
index 116ac02..c3211af 100644
--- a/be/src/exec/partitioned-hash-join-node.cc
+++ b/be/src/exec/partitioned-hash-join-node.cc
@@ -1144,7 +1144,7 @@ Status 
PartitionedHashJoinNode::CodegenCreateOutputRow(LlvmCodeGen* codegen,
   prototype.AddArgument(LlvmCodeGen::NamedVariable("build_arg", 
tuple_row_ptr_type));
 
   LLVMContext& context = codegen->context();
-  LlvmCodeGen::LlvmBuilder builder(context);
+  LlvmBuilder builder(context);
   Value* args[4];
   *fn = prototype.GeneratePrototype(&builder, args);
   Value* out_row_arg = builder.CreateBitCast(args[1], tuple_row_working_type, 
"out");
@@ -1156,8 +1156,9 @@ Status 
PartitionedHashJoinNode::CodegenCreateOutputRow(LlvmCodeGen* codegen,
 
   // Copy probe row
   codegen->CodegenMemcpy(&builder, out_row_arg, probe_row_arg, 
probe_tuple_row_size_);
-  Value* build_row_idx[] = { codegen->GetIntConstant(TYPE_INT, 
num_probe_tuples) };
-  Value* build_row_dst = builder.CreateGEP(out_row_arg, build_row_idx, 
"build_dst_ptr");
+  Value* build_row_idx[] = {codegen->GetIntConstant(TYPE_INT, 
num_probe_tuples)};
+  Value* build_row_dst =
+      builder.CreateInBoundsGEP(out_row_arg, build_row_idx, "build_dst_ptr");
 
   // Copy build row.
   BasicBlock* build_not_null_block = BasicBlock::Create(context, 
"build_not_null", *fn);
@@ -1176,9 +1177,8 @@ Status 
PartitionedHashJoinNode::CodegenCreateOutputRow(LlvmCodeGen* codegen,
     // to work.
     builder.SetInsertPoint(build_null_block);
     for (int i = 0; i < num_build_tuples; ++i) {
-      Value* array_idx[] =
-          { codegen->GetIntConstant(TYPE_INT, i + num_probe_tuples) };
-      Value* dst = builder.CreateGEP(out_row_arg, array_idx, "dst_tuple_ptr");
+      Value* array_idx[] = {codegen->GetIntConstant(TYPE_INT, i + 
num_probe_tuples)};
+      Value* dst = builder.CreateInBoundsGEP(out_row_arg, array_idx, 
"dst_tuple_ptr");
       builder.CreateStore(codegen->null_ptr_value(), dst);
     }
     builder.CreateRetVoid();

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/d7246d64/be/src/exec/text-converter.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/text-converter.cc b/be/src/exec/text-converter.cc
index d38d662..cb5d70c 100644
--- a/be/src/exec/text-converter.cc
+++ b/be/src/exec/text-converter.cc
@@ -128,19 +128,13 @@ Function* TextConverter::CodegenWriteSlot(LlvmCodeGen* 
codegen,
   if (tuple_type == NULL) return NULL;
   PointerType* tuple_ptr_type = tuple_type->getPointerTo();
 
-  Function* set_null_fn = slot_desc->GetUpdateNullFn(codegen, true);
-  if (set_null_fn == NULL) {
-    LOG(ERROR) << "Could not codegen WriteSlot because slot update codegen 
failed.";
-    return NULL;
-  }
-
   LlvmCodeGen::FnPrototype prototype(
       codegen, "WriteSlot", codegen->GetType(TYPE_BOOLEAN));
   prototype.AddArgument(LlvmCodeGen::NamedVariable("tuple_arg", 
tuple_ptr_type));
   prototype.AddArgument(LlvmCodeGen::NamedVariable("data", 
codegen->ptr_type()));
   prototype.AddArgument(LlvmCodeGen::NamedVariable("len", 
codegen->GetType(TYPE_INT)));
 
-  LlvmCodeGen::LlvmBuilder builder(codegen->context());
+  LlvmBuilder builder(codegen->context());
   Value* args[3];
   Function* fn = prototype.GeneratePrototype(&builder, &args[0]);
 
@@ -267,13 +261,13 @@ Function* TextConverter::CodegenWriteSlot(LlvmCodeGen* 
codegen,
 
     // Parse failed, set slot to null and return false
     builder.SetInsertPoint(parse_failed_block);
-    builder.CreateCall(set_null_fn, ArrayRef<Value*>({args[0]}));
+    slot_desc->CodegenSetNullIndicator(codegen, &builder, args[0], 
codegen->true_value());
     builder.CreateRet(codegen->false_value());
   }
 
   // Case where data is \N or len == 0 and it is not a string col
   builder.SetInsertPoint(set_null_block);
-  builder.CreateCall(set_null_fn, ArrayRef<Value*>({args[0]}));
+  slot_desc->CodegenSetNullIndicator(codegen, &builder, args[0], 
codegen->true_value());
   builder.CreateRet(codegen->true_value());
 
   return codegen->FinalizeFunction(fn);

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/d7246d64/be/src/exprs/aggregate-functions-ir.cc
----------------------------------------------------------------------
diff --git a/be/src/exprs/aggregate-functions-ir.cc 
b/be/src/exprs/aggregate-functions-ir.cc
index c07d1db..7ed6386 100644
--- a/be/src/exprs/aggregate-functions-ir.cc
+++ b/be/src/exprs/aggregate-functions-ir.cc
@@ -164,12 +164,10 @@ StringVal ToStringVal(FunctionContext* context, T val) {
   stringstream ss;
   ss << val;
   const string &str = ss.str();
-  return StringVal::CopyFrom(context, reinterpret_cast<const 
uint8_t*>(str.c_str()), str.size());
+  return StringVal::CopyFrom(
+      context, reinterpret_cast<const uint8_t*>(str.c_str()), str.size());
 }
 
-// Delimiter to use if the separator is NULL.
-static const StringVal DEFAULT_STRING_CONCAT_DELIM((uint8_t*)", ", 2);
-
 constexpr int AggregateFunctions::HLL_PRECISION;
 constexpr int AggregateFunctions::HLL_LEN;
 
@@ -627,15 +625,21 @@ void AggregateFunctions::Max(FunctionContext*,
 // StringConcatUpdate().
 typedef int StringConcatHeader;
 
-void AggregateFunctions::StringConcatUpdate(FunctionContext* ctx,
-    const StringVal& src, StringVal* result) {
-  StringConcatUpdate(ctx, src, DEFAULT_STRING_CONCAT_DELIM, result);
+// Delimiter to use if the separator is not provided.
+static inline StringVal ALWAYS_INLINE DefaultStringConcatDelim() {
+  return StringVal(reinterpret_cast<uint8_t*>(const_cast<char*>(", ")), 2);
+}
+
+void AggregateFunctions::StringConcatUpdate(
+    FunctionContext* ctx, const StringVal& src, StringVal* result) {
+  StringConcatUpdate(ctx, src, DefaultStringConcatDelim(), result);
 }
 
-void AggregateFunctions::StringConcatUpdate(FunctionContext* ctx,
-    const StringVal& src, const StringVal& separator, StringVal* result) {
+void AggregateFunctions::StringConcatUpdate(FunctionContext* ctx, const 
StringVal& src,
+    const StringVal& separator, StringVal* result) {
   if (src.is_null) return;
-  const StringVal* sep = separator.is_null ? &DEFAULT_STRING_CONCAT_DELIM : 
&separator;
+  const StringVal default_delim = DefaultStringConcatDelim();
+  const StringVal* sep = separator.is_null ? &default_delim : &separator;
   if (result->is_null) {
     // Header of the intermediate state holds the length of the first 
separator.
     const int header_len = sizeof(StringConcatHeader);

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/d7246d64/be/src/exprs/case-expr.cc
----------------------------------------------------------------------
diff --git a/be/src/exprs/case-expr.cc b/be/src/exprs/case-expr.cc
index f139d64..3c0db79 100644
--- a/be/src/exprs/case-expr.cc
+++ b/be/src/exprs/case-expr.cc
@@ -196,7 +196,7 @@ Status CaseExpr::GetCodegendComputeFn(LlvmCodeGen* codegen, 
Function** fn) {
   }
 
   LLVMContext& context = codegen->context();
-  LlvmCodeGen::LlvmBuilder builder(context);
+  LlvmBuilder builder(context);
 
   Value* args[2];
   Function* function = CreateIrFunctionPrototype(codegen, "CaseExpr", &args);

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/d7246d64/be/src/exprs/compound-predicates.cc
----------------------------------------------------------------------
diff --git a/be/src/exprs/compound-predicates.cc 
b/be/src/exprs/compound-predicates.cc
index 531a70c..d367e5a 100644
--- a/be/src/exprs/compound-predicates.cc
+++ b/be/src/exprs/compound-predicates.cc
@@ -131,7 +131,7 @@ Status CompoundPredicate::CodegenComputeFn(
   RETURN_IF_ERROR(children()[1]->GetCodegendComputeFn(codegen, &rhs_function));
 
   LLVMContext& context = codegen->context();
-  LlvmCodeGen::LlvmBuilder builder(context);
+  LlvmBuilder builder(context);
   Value* args[2];
   Function* function = CreateIrFunctionPrototype(codegen, "CompoundPredicate", 
&args);
 
@@ -140,8 +140,7 @@ Status CompoundPredicate::CodegenComputeFn(
 
   // Control blocks for aggregating results
   BasicBlock* lhs_null_block = BasicBlock::Create(context, "lhs_null", 
function);
-  BasicBlock* lhs_not_null_block =
-      BasicBlock::Create(context, "lhs_not_null", function);
+  BasicBlock* lhs_not_null_block = BasicBlock::Create(context, "lhs_not_null", 
function);
   BasicBlock* lhs_null_rhs_not_null_block =
       BasicBlock::Create(context, "lhs_null_rhs_not_null", function);
   BasicBlock* lhs_not_null_rhs_null_block =
@@ -232,7 +231,7 @@ Status CompoundPredicate::CodegenComputeFn(
   CodegenAnyVal ret(codegen, &builder, TYPE_BOOLEAN, NULL, "ret");
   ret.SetIsNull(is_null_phi);
   ret.SetVal(val_phi);
-  builder.CreateRet(ret.value());
+  builder.CreateRet(ret.GetLoweredValue());
 
   *fn = codegen->FinalizeFunction(function);
   DCHECK(*fn != NULL);

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/d7246d64/be/src/exprs/expr-codegen-test.cc
----------------------------------------------------------------------
diff --git a/be/src/exprs/expr-codegen-test.cc 
b/be/src/exprs/expr-codegen-test.cc
index 800a440..6ccbbf4 100644
--- a/be/src/exprs/expr-codegen-test.cc
+++ b/be/src/exprs/expr-codegen-test.cc
@@ -252,7 +252,7 @@ TEST_F(ExprCodegenTest, TestInlineConstants) {
   test_udf_file << getenv("IMPALA_HOME") << 
"/be/build/latest/exprs/expr-codegen-test.ll";
   scoped_ptr<LlvmCodeGen> codegen;
   ASSERT_OK(LlvmCodeGen::CreateFromFile(&pool, test_udf_file.str(), "test", 
&codegen));
-  Function* fn = codegen->GetFunction(TEST_GET_CONSTANT_SYMBOL);
+  Function* fn = codegen->GetFunction(TEST_GET_CONSTANT_SYMBOL, false);
   ASSERT_TRUE(fn != NULL);
 
   // Function verification should fail because we haven't inlined 
GetConstant() calls

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/d7246d64/be/src/exprs/expr.cc
----------------------------------------------------------------------
diff --git a/be/src/exprs/expr.cc b/be/src/exprs/expr.cc
index 0141bae..045c030 100644
--- a/be/src/exprs/expr.cc
+++ b/be/src/exprs/expr.cc
@@ -684,10 +684,10 @@ Status Expr::GetCodegendComputeFnWrapper(LlvmCodeGen* 
codegen, Function** fn) {
   ir_compute_fn_ = CreateIrFunctionPrototype(codegen, 
"CodegenComputeFnWrapper", &args);
   BasicBlock* entry_block =
       BasicBlock::Create(codegen->context(), "entry", ir_compute_fn_);
-  LlvmCodeGen::LlvmBuilder builder(entry_block);
+  LlvmBuilder builder(entry_block);
   Value* this_ptr =
       codegen->CastPtrToLlvmPtr(codegen->GetPtrType(Expr::LLVM_CLASS_NAME), 
this);
-  Value* compute_fn_args[] = { this_ptr, args[0], args[1] };
+  Value* compute_fn_args[] = {this_ptr, args[0], args[1]};
   Value* ret = CodegenAnyVal::CreateCall(
       codegen, &builder, static_getval_fn, compute_fn_args, "ret");
   builder.CreateRet(ret);

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/d7246d64/be/src/exprs/literal.cc
----------------------------------------------------------------------
diff --git a/be/src/exprs/literal.cc b/be/src/exprs/literal.cc
index 03bdd50..5b075f6 100644
--- a/be/src/exprs/literal.cc
+++ b/be/src/exprs/literal.cc
@@ -385,7 +385,7 @@ Status Literal::GetCodegendComputeFn(LlvmCodeGen* codegen, 
llvm::Function** fn)
   Value* args[2];
   *fn = CreateIrFunctionPrototype(codegen, "Literal", &args);
   BasicBlock* entry_block = BasicBlock::Create(codegen->context(), "entry", 
*fn);
-  LlvmCodeGen::LlvmBuilder builder(entry_block);
+  LlvmBuilder builder(entry_block);
 
   CodegenAnyVal v = CodegenAnyVal::GetNonNullVal(codegen, &builder, type_);
   switch (type_.type) {
@@ -439,7 +439,7 @@ Status Literal::GetCodegendComputeFn(LlvmCodeGen* codegen, 
llvm::Function** fn)
       return Status(ss.str());
   }
 
-  builder.CreateRet(v.value());
+  builder.CreateRet(v.GetLoweredValue());
   *fn = codegen->FinalizeFunction(*fn);
   ir_compute_fn_ = *fn;
   return Status::OK();

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/d7246d64/be/src/exprs/null-literal.cc
----------------------------------------------------------------------
diff --git a/be/src/exprs/null-literal.cc b/be/src/exprs/null-literal.cc
index a8f4241..6985c5b 100644
--- a/be/src/exprs/null-literal.cc
+++ b/be/src/exprs/null-literal.cc
@@ -101,7 +101,7 @@ Status NullLiteral::GetCodegendComputeFn(LlvmCodeGen* 
codegen, llvm::Function**
   Value* args[2];
   *fn = CreateIrFunctionPrototype(codegen, "NullLiteral", &args);
   BasicBlock* entry_block = BasicBlock::Create(codegen->context(), "entry", 
*fn);
-  LlvmCodeGen::LlvmBuilder builder(entry_block);
+  LlvmBuilder builder(entry_block);
 
   Value* v = CodegenAnyVal::GetNullVal(codegen, type());
   builder.CreateRet(v);

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/d7246d64/be/src/exprs/scalar-fn-call.cc
----------------------------------------------------------------------
diff --git a/be/src/exprs/scalar-fn-call.cc b/be/src/exprs/scalar-fn-call.cc
index 3a7ffb1..5457ced 100644
--- a/be/src/exprs/scalar-fn-call.cc
+++ b/be/src/exprs/scalar-fn-call.cc
@@ -319,7 +319,7 @@ Status ScalarFnCall::GetCodegendComputeFn(LlvmCodeGen* 
codegen, Function** fn) {
   Value* expr_ctx = args[0];
   Value* row = args[1];
   BasicBlock* block = BasicBlock::Create(codegen->context(), "entry", *fn);
-  LlvmCodeGen::LlvmBuilder builder(block);
+  LlvmBuilder builder(block);
 
   // Populate UDF arguments
   vector<Value*> udf_args;
@@ -482,7 +482,7 @@ Status ScalarFnCall::GetUdf(LlvmCodeGen* codegen, 
Function** udf) {
   } else if (fn_.binary_type == TFunctionBinaryType::BUILTIN) {
     // In this path, we're running a builtin with the UDF interface. The IR is
     // in the llvm module.
-    *udf = codegen->GetFunction(fn_.scalar_fn.symbol);
+    *udf = codegen->GetFunction(fn_.scalar_fn.symbol, false);
     if (*udf == NULL) {
       // Builtins symbols should exist unless there is a version mismatch.
       stringstream ss;
@@ -502,11 +502,11 @@ Status ScalarFnCall::GetUdf(LlvmCodeGen* codegen, 
Function** udf) {
   } else {
     // We're running an IR UDF.
     DCHECK_EQ(fn_.binary_type, TFunctionBinaryType::IR);
-    *udf = codegen->GetFunction(fn_.scalar_fn.symbol);
+    *udf = codegen->GetFunction(fn_.scalar_fn.symbol, false);
     if (*udf == NULL) {
       stringstream ss;
-      ss << "Unable to locate function " << fn_.scalar_fn.symbol
-         << " from LLVM module " << fn_.hdfs_location;
+      ss << "Unable to locate function " << fn_.scalar_fn.symbol << " from 
LLVM module "
+         << fn_.hdfs_location;
       return Status(ss.str());
     }
     *udf = codegen->FinalizeFunction(*udf);
@@ -527,7 +527,7 @@ Status ScalarFnCall::GetFunction(RuntimeState* state, const 
string& symbol, void
     DCHECK_EQ(fn_.binary_type, TFunctionBinaryType::IR);
     LlvmCodeGen* codegen = state->codegen();
     DCHECK(codegen != NULL);
-    Function* ir_fn = codegen->GetFunction(symbol);
+    Function* ir_fn = codegen->GetFunction(symbol, false);
     if (ir_fn == NULL) {
       stringstream ss;
       ss << "Unable to locate function " << symbol << " from LLVM module "

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/d7246d64/be/src/exprs/slot-ref.cc
----------------------------------------------------------------------
diff --git a/be/src/exprs/slot-ref.cc b/be/src/exprs/slot-ref.cc
index eeba42f..3a70b6c 100644
--- a/be/src/exprs/slot-ref.cc
+++ b/be/src/exprs/slot-ref.cc
@@ -134,11 +134,11 @@ string SlotRef::DebugString() const {
 //   br label %check_slot_null
 //
 // check_slot_null:                                  ; preds = %entry
-//   %null_ptr = getelementptr i8* %tuple_ptr, i32 0
+//   %null_byte_ptr = getelementptr i8* %tuple_ptr, i32 0
 //   %null_byte = load i8* %null_ptr
 //   %null_byte_set = and i8 %null_byte, 2
-//   %slot_is_null = icmp ne i8 %null_byte_set, 0
-//   br i1 %slot_is_null, label %ret, label %get_slot
+//   %is_null = icmp ne i8 %null_byte_set, 0
+//   br i1 %is_null, label %ret, label %get_slot
 //
 // get_slot:                                         ; preds = %check_slot_null
 //   %slot_addr = getelementptr i8* %tuple_ptr, i32 8
@@ -189,28 +189,24 @@ Status SlotRef::GetCodegendComputeFn(LlvmCodeGen* 
codegen, llvm::Function** fn)
   Value* row_ptr = args[1];
 
   Value* tuple_offset = ConstantInt::get(codegen->int_type(), tuple_idx_);
-  Value* null_byte_offset =
-    ConstantInt::get(codegen->int_type(), null_indicator_offset_.byte_offset);
   Value* slot_offset = ConstantInt::get(codegen->int_type(), slot_offset_);
-  Value* null_mask =
-      ConstantInt::get(codegen->tinyint_type(), 
null_indicator_offset_.bit_mask);
   Value* zero = ConstantInt::get(codegen->GetType(TYPE_TINYINT), 0);
   Value* one = ConstantInt::get(codegen->GetType(TYPE_TINYINT), 1);
 
   BasicBlock* entry_block = BasicBlock::Create(context, "entry", *fn);
+  bool slot_is_nullable = null_indicator_offset_.bit_mask != 0;
   BasicBlock* check_slot_null_indicator_block = NULL;
-  if (null_indicator_offset_.bit_mask != 0) {
-    check_slot_null_indicator_block =
-        BasicBlock::Create(context, "check_slot_null", *fn);
+  if (slot_is_nullable) {
+    check_slot_null_indicator_block = BasicBlock::Create(context, 
"check_slot_null", *fn);
   }
   BasicBlock* get_slot_block = BasicBlock::Create(context, "get_slot", *fn);
   BasicBlock* ret_block = BasicBlock::Create(context, "ret", *fn);
 
-  LlvmCodeGen::LlvmBuilder builder(entry_block);
+  LlvmBuilder builder(entry_block);
   // Get the tuple offset addr from the row
   Value* cast_row_ptr = builder.CreateBitCast(
       row_ptr, PointerType::get(codegen->ptr_type(), 0), "cast_row_ptr");
-  Value* tuple_addr = builder.CreateGEP(cast_row_ptr, tuple_offset, 
"tuple_addr");
+  Value* tuple_addr = builder.CreateInBoundsGEP(cast_row_ptr, tuple_offset, 
"tuple_addr");
   // Load the tuple*
   Value* tuple_ptr = builder.CreateLoad(tuple_addr, "tuple_ptr");
 
@@ -218,32 +214,30 @@ Status SlotRef::GetCodegendComputeFn(LlvmCodeGen* 
codegen, llvm::Function** fn)
   if (tuple_is_nullable_) {
     Value* tuple_is_null = builder.CreateIsNull(tuple_ptr, "tuple_is_null");
     // Check slot is null only if the null indicator bit is set
-    if (null_indicator_offset_.bit_mask == 0) {
-      builder.CreateCondBr(tuple_is_null, ret_block, get_slot_block);
-    } else {
+    if (slot_is_nullable) {
       builder.CreateCondBr(tuple_is_null, ret_block, 
check_slot_null_indicator_block);
+    } else {
+      builder.CreateCondBr(tuple_is_null, ret_block, get_slot_block);
     }
   } else {
-    if (null_indicator_offset_.bit_mask == 0) {
-      builder.CreateBr(get_slot_block);
-    } else {
+    if (slot_is_nullable) {
       builder.CreateBr(check_slot_null_indicator_block);
+    } else {
+      builder.CreateBr(get_slot_block);
     }
   }
 
   // Branch for tuple* != NULL.  Need to check if null-indicator is set
-  if (check_slot_null_indicator_block != NULL) {
+  if (slot_is_nullable) {
     builder.SetInsertPoint(check_slot_null_indicator_block);
-    Value* null_addr = builder.CreateGEP(tuple_ptr, null_byte_offset, 
"null_ptr");
-    Value* null_val = builder.CreateLoad(null_addr, "null_byte");
-    Value* slot_null_mask = builder.CreateAnd(null_val, null_mask, 
"null_byte_set");
-    Value* is_slot_null = builder.CreateICmpNE(slot_null_mask, zero, 
"slot_is_null");
+    Value* is_slot_null = SlotDescriptor::CodegenIsNull(
+        codegen, &builder, null_indicator_offset_, tuple_ptr);
     builder.CreateCondBr(is_slot_null, ret_block, get_slot_block);
   }
 
   // Branch for slot != NULL
   builder.SetInsertPoint(get_slot_block);
-  Value* slot_ptr = builder.CreateGEP(tuple_ptr, slot_offset, "slot_addr");
+  Value* slot_ptr = builder.CreateInBoundsGEP(tuple_ptr, slot_offset, 
"slot_addr");
   Value* val_ptr = builder.CreateBitCast(slot_ptr, codegen->GetPtrType(type_), 
"val_ptr");
   // Depending on the type, load the values we need
   Value* val = NULL;
@@ -313,7 +307,7 @@ Status SlotRef::GetCodegendComputeFn(LlvmCodeGen* codegen, 
llvm::Function** fn)
     result.SetIsNull(is_null_phi);
     result.SetPtr(ptr_phi);
     result.SetLen(len_phi);
-    builder.CreateRet(result.value());
+    builder.CreateRet(result.GetLoweredValue());
   } else if (type() == TYPE_TIMESTAMP) {
     DCHECK(time_of_day != NULL);
     DCHECK(date != NULL);
@@ -343,7 +337,7 @@ Status SlotRef::GetCodegendComputeFn(LlvmCodeGen* codegen, 
llvm::Function** fn)
     result.SetIsNull(is_null_phi);
     result.SetTimeOfDay(time_of_day_phi);
     result.SetDate(date_phi);
-    builder.CreateRet(result.value());
+    builder.CreateRet(result.GetLoweredValue());
   } else {
     DCHECK(val != NULL);
     PHINode* val_phi = builder.CreatePHI(val->getType(), 2, "val_phi");
@@ -360,7 +354,7 @@ Status SlotRef::GetCodegendComputeFn(LlvmCodeGen* codegen, 
llvm::Function** fn)
         CodegenAnyVal::GetNonNullVal(codegen, &builder, type(), "result");
     result.SetIsNull(is_null_phi);
     result.SetVal(val_phi);
-    builder.CreateRet(result.value());
+    builder.CreateRet(result.GetLoweredValue());
   }
 
   *fn = codegen->FinalizeFunction(*fn);

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/d7246d64/be/src/runtime/descriptors.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/descriptors.cc b/be/src/runtime/descriptors.cc
index 41485f9..f8ec79c 100644
--- a/be/src/runtime/descriptors.cc
+++ b/be/src/runtime/descriptors.cc
@@ -77,9 +77,8 @@ ostream& operator<<(ostream& os, const NullIndicatorOffset& 
null_indicator) {
   return os;
 }
 
-SlotDescriptor::SlotDescriptor(
-    const TSlotDescriptor& tdesc, const TupleDescriptor* parent,
-    const TupleDescriptor* collection_item_descriptor)
+SlotDescriptor::SlotDescriptor(const TSlotDescriptor& tdesc,
+    const TupleDescriptor* parent, const TupleDescriptor* 
collection_item_descriptor)
   : id_(tdesc.id),
     type_(ColumnType::FromThrift(tdesc.slotType)),
     parent_(parent),
@@ -89,9 +88,7 @@ SlotDescriptor::SlotDescriptor(
     null_indicator_offset_(tdesc.nullIndicatorByte, tdesc.nullIndicatorBit),
     slot_idx_(tdesc.slotIdx),
     slot_size_(type_.GetSlotSize()),
-    llvm_field_idx_(-1),
-    set_not_null_fn_(NULL),
-    set_null_fn_(NULL) {
+    llvm_field_idx_(-1) {
   DCHECK_NE(type_.type, TYPE_STRUCT);
   DCHECK(parent_ != NULL) << tdesc.parent;
   if (type_.IsCollectionType()) {
@@ -569,60 +566,79 @@ void 
DescriptorTbl::GetTupleDescs(vector<TupleDescriptor*>* descs) const {
   }
 }
 
-// Generate function to set a slot to be null or not-null.  The resulting IR
-// for SetNotNull looks like:
-// (in this case the tuple contains only a nullable double)
-// define void @SetNotNull({ i8, double }* %tuple) {
-// entry:
-//   %null_byte_ptr = getelementptr inbounds { i8, double }* %tuple, i32 0, 
i32 0
-//   %null_byte = load i8* %null_byte_ptr
-//   %0 = and i8 %null_byte, -2
-//   store i8 %0, i8* %null_byte_ptr
-//   ret void
-// }
-Function* SlotDescriptor::GetUpdateNullFn(LlvmCodeGen* codegen, bool set_null) 
const {
-  if (set_null && set_null_fn_ != NULL) return set_null_fn_;
-  if (!set_null && set_not_null_fn_ != NULL) return set_not_null_fn_;
-
-  StructType* tuple_type = parent()->GetLlvmStruct(codegen);
-  PointerType* tuple_ptr_type = tuple_type->getPointerTo();
-  LlvmCodeGen::FnPrototype prototype(codegen, (set_null) ? "SetNull" 
:"SetNotNull",
-      codegen->void_type());
-  prototype.AddArgument(LlvmCodeGen::NamedVariable("tuple", tuple_ptr_type));
-
-  LlvmCodeGen::LlvmBuilder builder(codegen->context());
-  Value* tuple_arg;
-  Function* fn = prototype.GeneratePrototype(&builder, &tuple_arg);
-
-  Value* tuple_int8_ptr =
-      builder.CreateBitCast(tuple_arg, codegen->ptr_type(), "tuple_int8_ptr");
-  Value* null_byte_offset =
-      ConstantInt::get(codegen->int_type(), 
null_indicator_offset_.byte_offset);
-  Value* null_byte_ptr =
-      builder.CreateInBoundsGEP(tuple_int8_ptr, null_byte_offset, 
"null_byte_ptr");
-  Value* null_byte = builder.CreateLoad(null_byte_ptr, "null_byte");
-
+Value* SlotDescriptor::CodegenIsNull(
+    LlvmCodeGen* codegen, LlvmBuilder* builder, Value* tuple) const {
+  return CodegenIsNull(codegen, builder, null_indicator_offset_, tuple);
+}
+
+// Example IR for getting the first null bit:
+//  %0 = bitcast { i8, [7 x i8], %"class.impala::TimestampValue" }* %agg_tuple 
to i8*
+//  %null_byte_ptr = getelementptr i8, i8* %0, i32 0
+//  %null_byte = load i8, i8* %null_byte_ptr
+//  %null_mask = and i8 %null_byte, 1
+//  %is_null = icmp ne i8 %null_mask, 0
+Value* SlotDescriptor::CodegenIsNull(LlvmCodeGen* codegen, LlvmBuilder* 
builder,
+    const NullIndicatorOffset& null_indicator_offset, Value* tuple) {
+  Value* null_byte =
+      CodegenGetNullByte(codegen, builder, null_indicator_offset, tuple, NULL);
+  Constant* mask =
+      ConstantInt::get(codegen->tinyint_type(), 
null_indicator_offset.bit_mask);
+  Value* null_mask = builder->CreateAnd(null_byte, mask, "null_mask");
+  Constant* zero = ConstantInt::get(codegen->tinyint_type(), 0);
+  return builder->CreateICmpNE(null_mask, zero, "is_null");
+}
+
+// Example IR for setting the first null bit to a non-constant 'is_null' value:
+//  %14 = bitcast { i8, [7 x i8], %"class.impala::TimestampValue" }* 
%agg_tuple to i8*
+//  %null_byte_ptr3 = getelementptr i8, i8* %14, i32 0
+//  %null_byte4 = load i8, i8* %null_byte_ptr3
+//  %null_bit_cleared = and i8 %null_byte4, -2
+//  %15 = sext i1 %result_is_null to i8
+//  %null_bit = and i8 %15, 1
+//  %null_bit_set = or i8 %null_bit_cleared, %null_bit
+//  store i8 %null_bit_set, i8* %null_byte_ptr3
+void SlotDescriptor::CodegenSetNullIndicator(
+    LlvmCodeGen* codegen, LlvmBuilder* builder, Value* tuple, Value* is_null) 
const {
+  DCHECK_EQ(is_null->getType(), codegen->boolean_type());
+  Value* null_byte_ptr;
+  Value* null_byte =
+      CodegenGetNullByte(codegen, builder, null_indicator_offset_, tuple, 
&null_byte_ptr);
+  Constant* mask =
+      ConstantInt::get(codegen->tinyint_type(), 
null_indicator_offset_.bit_mask);
+  Constant* not_mask =
+      ConstantInt::get(codegen->tinyint_type(), 
~null_indicator_offset_.bit_mask);
+
+  ConstantInt* constant_is_null = dyn_cast<ConstantInt>(is_null);
   Value* result = NULL;
-  if (set_null) {
-    Value* null_set = codegen->GetIntConstant(
-        TYPE_TINYINT, null_indicator_offset_.bit_mask);
-    result = builder.CreateOr(null_byte, null_set);
-  } else {
-    Value* null_clear_val =
-        codegen->GetIntConstant(TYPE_TINYINT, 
~null_indicator_offset_.bit_mask);
-    result = builder.CreateAnd(null_byte, null_clear_val);
-  }
-
-  builder.CreateStore(result, null_byte_ptr);
-  builder.CreateRetVoid();
-
-  fn = codegen->FinalizeFunction(fn);
-  if (set_null) {
-    set_null_fn_ = fn;
+  if (constant_is_null != NULL) {
+    if (constant_is_null->isOne()) {
+      result = builder->CreateOr(null_byte, mask, "null_bit_set");
+    } else {
+      DCHECK(constant_is_null->isZero());
+      result = builder->CreateAnd(null_byte, not_mask, "null_bit_cleared");
+    }
   } else {
-    set_not_null_fn_ = fn;
-  }
-  return fn;
+    // Avoid branching by computing the new byte as:
+    // (null_byte & ~mask) | (-null & mask);
+    Value* byte_with_cleared_bit =
+        builder->CreateAnd(null_byte, not_mask, "null_bit_cleared");
+    Value* sign_extended_null = builder->CreateSExt(is_null, 
codegen->tinyint_type());
+    Value* bit_only = builder->CreateAnd(sign_extended_null, mask, "null_bit");
+    result = builder->CreateOr(byte_with_cleared_bit, bit_only, 
"null_bit_set");
+  }
+
+  builder->CreateStore(result, null_byte_ptr);
+}
+
+Value* SlotDescriptor::CodegenGetNullByte(LlvmCodeGen* codegen, LlvmBuilder* 
builder,
+    const NullIndicatorOffset& null_indicator_offset, Value* tuple,
+    Value** null_byte_ptr) {
+  Constant* byte_offset =
+      ConstantInt::get(codegen->int_type(), null_indicator_offset.byte_offset);
+  Value* tuple_bytes = builder->CreateBitCast(tuple, codegen->ptr_type());
+  Value* byte_ptr = builder->CreateInBoundsGEP(tuple_bytes, byte_offset, 
"null_byte_ptr");
+  if (null_byte_ptr != NULL) *null_byte_ptr = byte_ptr;
+  return builder->CreateLoad(byte_ptr, "null_byte");
 }
 
 StructType* TupleDescriptor::GetLlvmStruct(LlvmCodeGen* codegen) const {

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/d7246d64/be/src/runtime/descriptors.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/descriptors.h b/be/src/runtime/descriptors.h
index 8122653..4a50800 100644
--- a/be/src/runtime/descriptors.h
+++ b/be/src/runtime/descriptors.h
@@ -35,19 +35,21 @@ namespace llvm {
   class Function;
   class PointerType;
   class StructType;
+  class Value;
 };
 
 namespace impala {
 
+class Expr;
+class ExprContext;
+class LlvmBuilder;
 class LlvmCodeGen;
 class ObjectPool;
+class RuntimeState;
 class TDescriptorTable;
 class TSlotDescriptor;
 class TTable;
 class TTupleDescriptor;
-class Expr;
-class ExprContext;
-class RuntimeState;
 
 /// A path into a table schema (e.g. a vector of ColumnTypes) pointing to a 
particular
 /// column/field. The i-th element of the path is the ordinal position of the 
column/field
@@ -133,9 +135,22 @@ class SlotDescriptor {
 
   std::string DebugString() const;
 
-  /// Codegen for: void SetNull(Tuple* tuple) / void SetNotNull(Tuple* tuple)
-  /// The codegen'd IR function is cached.
-  llvm::Function* GetUpdateNullFn(LlvmCodeGen*, bool set_null) const;
+  /// Generate LLVM code at the insert position of 'builder' that returns a 
boolean value
+  /// represented as a LLVM i1 indicating whether this slot is null in 'tuple'.
+  llvm::Value* CodegenIsNull(
+      LlvmCodeGen* codegen, LlvmBuilder* builder, llvm::Value* tuple) const;
+
+  /// Generate LLVM code at the insert position of 'builder' that returns a 
boolean value
+  /// represented as a LLVM i1 with value of the NULL bit at 
'null_indicator_offset' in
+  /// 'tuple'.
+  static llvm::Value* CodegenIsNull(LlvmCodeGen* codegen, LlvmBuilder* builder,
+      const NullIndicatorOffset& null_indicator_offset, llvm::Value* tuple);
+
+  /// Generate LLVM code at the insert position of 'builder' to set this slot's
+  /// NULL bit in the given 'tuple' to the value 'is_null'. 'tuple' is a 
pointer
+  /// to the tuple, and 'is_null' is an boolean value represented as a LLVM i1.
+  void CodegenSetNullIndicator(LlvmCodeGen* codegen, LlvmBuilder* builder,
+      llvm::Value* tuple, llvm::Value* is_null) const;
 
  private:
   friend class DescriptorTbl;
@@ -163,13 +178,16 @@ class SlotDescriptor {
   /// any padding bytes.
   int llvm_field_idx_;
 
-  /// Cached codegen'd functions
-  mutable llvm::Function* set_not_null_fn_;
-  mutable llvm::Function* set_null_fn_;
-
   /// collection_item_descriptor should be non-NULL iff this is a collection 
slot
   SlotDescriptor(const TSlotDescriptor& tdesc, const TupleDescriptor* parent,
-                 const TupleDescriptor* collection_item_descriptor);
+      const TupleDescriptor* collection_item_descriptor);
+
+  /// Generate LLVM code at the insert position of 'builder' to get the i8 
value of the
+  /// the byte containing 'null_indicator_offset' in 'tuple'. If 
'null_byte_ptr' is
+  /// non-NULL, sets that to a pointer to the null byte.
+  static llvm::Value* CodegenGetNullByte(LlvmCodeGen* codegen, LlvmBuilder* 
builder,
+      const NullIndicatorOffset& null_indicator_offset, llvm::Value* tuple,
+      llvm::Value** null_byte_ptr);
 };
 
 class ColumnDescriptor {

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/d7246d64/be/src/runtime/tuple.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/tuple.cc b/be/src/runtime/tuple.cc
index 7fa8b2b..dc83922 100644
--- a/be/src/runtime/tuple.cc
+++ b/be/src/runtime/tuple.cc
@@ -349,7 +349,7 @@ Status Tuple::CodegenMaterializeExprs(LlvmCodeGen* codegen, 
bool collect_string_
   prototype.AddArgument("total_string_lengths", int_ptr_type);
   prototype.AddArgument("num_non_null_string_values", int_ptr_type);
 
-  LlvmCodeGen::LlvmBuilder builder(context);
+  LlvmBuilder builder(context);
   Value* args[8];
   *fn = prototype.GeneratePrototype(&builder, args);
   Value* opaque_tuple_arg = args[0];
@@ -363,6 +363,9 @@ Status Tuple::CodegenMaterializeExprs(LlvmCodeGen* codegen, 
bool collect_string_
 
   // Cast the opaque Tuple* argument to the generated struct type
   Type* tuple_struct_type = desc.GetLlvmStruct(codegen);
+  if (tuple_struct_type == NULL) {
+    return Status("CodegenMaterializeExprs(): failed to generate tuple desc");
+  }
   PointerType* tuple_type = codegen->GetPtrType(tuple_struct_type);
   Value* tuple = builder.CreateBitCast(opaque_tuple_arg, tuple_type, "tuple");
 

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/d7246d64/be/src/runtime/types.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/types.h b/be/src/runtime/types.h
index 9558792..b0835bf 100644
--- a/be/src/runtime/types.h
+++ b/be/src/runtime/types.h
@@ -163,13 +163,28 @@ struct ColumnType {
     return thrift_type;
   }
 
+  inline bool IsBooleanType() const { return type == TYPE_BOOLEAN; }
+
+  inline bool IsIntegerType() const {
+    return type == TYPE_TINYINT || type == TYPE_SMALLINT || type == TYPE_INT
+        || type == TYPE_BIGINT;
+  }
+
+  inline bool IsFloatingPointType() const {
+    return type == TYPE_FLOAT || type == TYPE_DOUBLE;
+  }
+
+  inline bool IsDecimalType() const { return type == TYPE_DECIMAL; }
+
   inline bool IsStringType() const {
     return type == TYPE_STRING || type == TYPE_VARCHAR || type == TYPE_CHAR;
   }
 
+  inline bool IsTimestampType() const { return type == TYPE_TIMESTAMP; }
+
   inline bool IsVarLenStringType() const {
-    return type == TYPE_STRING || type == TYPE_VARCHAR ||
-        (type == TYPE_CHAR && len > MAX_CHAR_INLINE_LENGTH);
+    return type == TYPE_STRING || type == TYPE_VARCHAR
+        || (type == TYPE_CHAR && len > MAX_CHAR_INLINE_LENGTH);
   }
 
   inline bool IsComplexType() const {

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/d7246d64/be/src/testutil/test-udas.cc
----------------------------------------------------------------------
diff --git a/be/src/testutil/test-udas.cc b/be/src/testutil/test-udas.cc
index 569853e..0097500 100644
--- a/be/src/testutil/test-udas.cc
+++ b/be/src/testutil/test-udas.cc
@@ -130,3 +130,39 @@ void ArgIsConstUpdate(FunctionContext* context, const 
IntVal& val,
 void ArgIsConstMerge(FunctionContext* context, const BooleanVal& src, 
BooleanVal* dst) {
   dst->val |= src.val;
 }
+
+// Defines aggregate function for testing NULL handling. Returns NULL if an 
even number
+// of non-NULL inputs are consumed or 1 if an odd number of non-NULL inputs 
are consumed.
+void ToggleNullInit(FunctionContext* context, IntVal* total) {
+  *total = IntVal::null();
+}
+
+void ToggleNullUpdate(FunctionContext* context, const IntVal& val, IntVal* 
total) {
+  if (total->is_null) {
+    *total = IntVal(1);
+  } else {
+    *total = IntVal::null();
+  }
+}
+
+void ToggleNullMerge(FunctionContext* context, const IntVal& src, IntVal* dst) 
{
+  if (src.is_null != dst->is_null) {
+    *dst = IntVal(1);
+  } else {
+    *dst = IntVal::null();
+  }
+}
+
+// Defines aggregate function for testing input NULL handling. Returns the 
number of NULL
+// input values.
+void CountNullsInit(FunctionContext* context, BigIntVal* total) {
+  *total = BigIntVal(0);
+}
+
+void CountNullsUpdate(FunctionContext* context, const BigIntVal& val, 
BigIntVal* total) {
+  if (val.is_null) ++total->val;
+}
+
+void CountNullsMerge(FunctionContext* context, const BigIntVal& src, 
BigIntVal* dst) {
+  dst->val += src.val;
+}

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/d7246d64/be/src/testutil/test-udfs.cc
----------------------------------------------------------------------
diff --git a/be/src/testutil/test-udfs.cc b/be/src/testutil/test-udfs.cc
index efdfa0a..56d4524 100644
--- a/be/src/testutil/test-udfs.cc
+++ b/be/src/testutil/test-udfs.cc
@@ -56,11 +56,7 @@ IntVal AllTypes(
 StringVal NoArgs(FunctionContext* context) {
   const char* result = "string";
   StringVal ret(context, strlen(result));
-  // TODO: llvm 3.3 seems to have a bug if we use memcpy here making
-  // the IR udf crash. This is fixed in 3.3.1. Fix this when we upgrade.
-  //memcpy(ret.ptr, result, strlen(result));
-  // IMPALA-775
-  for (int i = 0; i < strlen(result); ++i) ret.ptr[i] = result[i];
+  memcpy(ret.ptr, result, strlen(result));
   return ret;
 }
 

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/d7246d64/be/src/util/tuple-row-compare.cc
----------------------------------------------------------------------
diff --git a/be/src/util/tuple-row-compare.cc b/be/src/util/tuple-row-compare.cc
index 69892ae..c159853 100644
--- a/be/src/util/tuple-row-compare.cc
+++ b/be/src/util/tuple-row-compare.cc
@@ -205,7 +205,7 @@ Status TupleRowComparator::CodegenCompare(LlvmCodeGen* 
codegen, Function** fn) {
   prototype.AddArgument("lhs", tuple_row_type);
   prototype.AddArgument("rhs", tuple_row_type);
 
-  LlvmCodeGen::LlvmBuilder builder(context);
+  LlvmBuilder builder(context);
   Value* args[4];
   *fn = prototype.GeneratePrototype(&builder, args);
   Value* lhs_ctxs_arg = args[0];

[26/50] [abbrv] incubator-impala git commit: IMPALA-1430, IMPALA-4108: codegen all builtin aggregate functions

Reply via email to