IMPALA-2548: Codegen Tuple::MaterializeExprs() and use in TopN node

For the following benchmark query:
 select count(*) from (select l_orderkey from biglineitem order by l_orderkey limit 1000) a

The overall query time goes from 2.74s to 1.74s, with the top-n node
time going from 2.2s to 1.0s. There is no effect on sort node time.

The overall approach of this patch is to move the
TopNNode::InsertTupleRow() call into a cross-compiled batched function
(InsertBatch()), and then replace the MaterializeExprs() calls with
new functions built using the IRBuilder. This involves new codegen
utilities, such as CodegenAnyVal::WriteToSlot() and the ability to
hardcode in a MemPool pointer from which to make varlen data
allocations. This patch also adds a new timer measuring the time spent
inserting tuple rows.

The existing TestQueries::test_top_n and TestQueries::test_sort tests
pass with this patch.

Change-Id: Ib422a8d50303c21c6a228675157bf867e8619444
Reviewed-on: http://gerrit.cloudera.org:8080/1901
Reviewed-by: Skye Wanderman-Milne <[email protected]>
Tested-by: Internal Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/8e8df2f2
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/8e8df2f2
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/8e8df2f2

Branch: refs/heads/master
Commit: 8e8df2f2f60f72cf05bf7b9f8a1c5e2839691a87
Parents: da49a37
Author: Skye Wanderman-Milne <[email protected]>
Authored: Wed May 11 18:22:50 2016 -0700
Committer: Tim Armstrong <[email protected]>
Committed: Thu May 12 14:18:03 2016 -0700

----------------------------------------------------------------------
 be/src/codegen/codegen-anyval.cc      |  76 ++++++++++-
 be/src/codegen/codegen-anyval.h       |  25 +++-
 be/src/codegen/gen_ir_descriptions.py |   3 +
 be/src/codegen/impala-ir.cc           |   2 +
 be/src/codegen/llvm-codegen.cc        |  51 ++++---
 be/src/codegen/llvm-codegen.h         |  16 ++-
 be/src/exec/CMakeLists.txt            |   1 +
 be/src/exec/topn-node-ir.cc           |  49 +++++++
 be/src/exec/topn-node.cc              |  75 +++++++----
 be/src/exec/topn-node.h               |  17 ++-
 be/src/runtime/descriptors.cc         |   2 +
 be/src/runtime/descriptors.h          |   2 +
 be/src/runtime/mem-pool.h             |   4 +
 be/src/runtime/raw-value.cc           |   3 +
 be/src/runtime/sorter.cc              |   2 +-
 be/src/runtime/tuple.cc               | 207 ++++++++++++++++++++++++++---
 be/src/runtime/tuple.h                |  81 +++++++++--
 17 files changed, 519 insertions(+), 97 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/8e8df2f2/be/src/codegen/codegen-anyval.cc
----------------------------------------------------------------------
diff --git a/be/src/codegen/codegen-anyval.cc b/be/src/codegen/codegen-anyval.cc
index 0eb8094..5466c8e 100644
--- a/be/src/codegen/codegen-anyval.cc
+++ b/be/src/codegen/codegen-anyval.cc
@@ -151,8 +151,8 @@ Value* CodegenAnyVal::CreateCall(
 
 CodegenAnyVal CodegenAnyVal::CreateCallWrapped(
     LlvmCodeGen* cg, LlvmCodeGen::LlvmBuilder* builder, const ColumnType& type,
-    Function* fn, ArrayRef<Value*> args, const char* name, Value* result_ptr) {
-  Value* v = CreateCall(cg, builder, fn, args, name, result_ptr);
+    Function* fn, ArrayRef<Value*> args, const char* name) {
+  Value* v = CreateCall(cg, builder, fn, args, name);
   return CodegenAnyVal(cg, builder, type, v, name);
 }
 
@@ -514,15 +514,24 @@ void CodegenAnyVal::SetFromRawValue(Value* raw_val) {
   }
 }
 
-Value* CodegenAnyVal::ToNativeValue() {
+Value* CodegenAnyVal::ToNativeValue(MemPool* pool) {
   Type* raw_type = codegen_->GetType(type_);
   Value* raw_val = Constant::getNullValue(raw_type);
   switch (type_.type) {
     case TYPE_STRING:
     case TYPE_VARCHAR: {
       // Convert StringVal to StringValue
-      raw_val = builder_->CreateInsertValue(raw_val, GetPtr(), 0);
-      raw_val = builder_->CreateInsertValue(raw_val, GetLen(), 1);
+      Value* len = GetLen();
+      raw_val = builder_->CreateInsertValue(raw_val, len, 1);
+      if (pool == NULL) {
+        // Set raw_val.ptr from this->ptr
+        raw_val = builder_->CreateInsertValue(raw_val, GetPtr(), 0);
+      } else {
+        // Allocate raw_val.ptr from 'pool' and copy this->ptr
+        Value* new_ptr = codegen_->CodegenAllocate(builder_, pool, len, 
"new_ptr");
+        codegen_->CodegenMemcpy(builder_, new_ptr, GetPtr(), len);
+        raw_val = builder_->CreateInsertValue(raw_val, new_ptr, 0);
+      }
       break;
     }
     case TYPE_TIMESTAMP: {
@@ -554,8 +563,8 @@ Value* CodegenAnyVal::ToNativeValue() {
   return raw_val;
 }
 
-Value* CodegenAnyVal::ToNativePtr(Value* native_ptr) {
-  Value* v = ToNativeValue();
+Value* CodegenAnyVal::ToNativePtr(Value* native_ptr, MemPool* pool) {
+  Value* v = ToNativeValue(pool);
   if (native_ptr == NULL) {
     native_ptr = codegen_->CreateEntryBlockAlloca(*builder_, v->getType());
   }
@@ -563,6 +572,59 @@ Value* CodegenAnyVal::ToNativePtr(Value* native_ptr) {
   return native_ptr;
 }
 
+// Example output for materializing an int slot:
+//
+//   ; [insert point starts here]
+//   %is_null = trunc i64 %src to i1
+//   br i1 %is_null, label %null, label %non_null ;
+//
+// non_null:                                         ; preds = %entry
+//   %slot = getelementptr inbounds { i8, i32, %"struct.impala::StringValue" 
}* %tuple,
+//       i32 0, i32 1
+//   %2 = ashr i64 %src, 32
+//   %3 = trunc i64 %2 to i32
+//   store i32 %3, i32* %slot
+//   br label %end_write
+//
+// null:                                             ; preds = %entry
+//   call void @SetNull6({ i8, i32, %"struct.impala::StringValue" }* %tuple)
+//   br label %end_write
+//
+// end_write:                                        ; preds = %null, %non_null
+//   ; [insert point ends here]
+void CodegenAnyVal::WriteToSlot(const SlotDescriptor& slot_desc, Value* tuple,
+    MemPool* pool, BasicBlock* insert_before) {
+  DCHECK(tuple->getType()->isPointerTy());
+  DCHECK(tuple->getType()->getPointerElementType()->isStructTy());
+  LLVMContext& context = codegen_->context();
+  Function* fn = builder_->GetInsertBlock()->getParent();
+
+  // Create new block that will come after conditional blocks if necessary
+  if (insert_before == NULL) insert_before = BasicBlock::Create(context, 
"end_write", fn);
+
+  // Create new basic blocks and br instruction
+  BasicBlock* non_null_block = BasicBlock::Create(context, "non_null", fn, 
insert_before);
+  BasicBlock* null_block = BasicBlock::Create(context, "null", fn, 
insert_before);
+  builder_->CreateCondBr(GetIsNull(), null_block, non_null_block);
+
+  // Non-null block: write slot
+  builder_->SetInsertPoint(non_null_block);
+  Value* slot = builder_->CreateStructGEP(NULL, tuple, 
slot_desc.llvm_field_idx(),
+      "slot");
+  ToNativePtr(slot, pool);
+  builder_->CreateBr(insert_before);
+
+  // Null block: set null bit
+  builder_->SetInsertPoint(null_block);
+  Function* set_null_fn = slot_desc.GetUpdateNullFn(codegen_, true);
+  DCHECK(set_null_fn != NULL);
+  builder_->CreateCall(set_null_fn, tuple);
+  builder_->CreateBr(insert_before);
+
+  // Leave builder_ after conditional blocks
+  builder_->SetInsertPoint(insert_before);
+}
+
 Value* CodegenAnyVal::Eq(CodegenAnyVal* other) {
   DCHECK_EQ(type_, other->type_);
   switch (type_.type) {

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/8e8df2f2/be/src/codegen/codegen-anyval.h
----------------------------------------------------------------------
diff --git a/be/src/codegen/codegen-anyval.h b/be/src/codegen/codegen-anyval.h
index 67e84e7..cafe522 100644
--- a/be/src/codegen/codegen-anyval.h
+++ b/be/src/codegen/codegen-anyval.h
@@ -80,8 +80,7 @@ class CodegenAnyVal {
   /// Same as above but wraps the result in a CodegenAnyVal.
   static CodegenAnyVal CreateCallWrapped(LlvmCodeGen* cg,
       LlvmCodeGen::LlvmBuilder* builder, const ColumnType& type, 
llvm::Function* fn,
-      llvm::ArrayRef<llvm::Value*> args, const char* name = "",
-      llvm::Value* result_ptr = NULL);
+      llvm::ArrayRef<llvm::Value*> args, const char* name = "");
 
   /// Returns the lowered AnyVal type associated with 'type'.
   /// E.g.: TYPE_BOOLEAN (which corresponds to a BooleanVal) => i16
@@ -191,13 +190,31 @@ class CodegenAnyVal {
 
   /// Converts this *Val's value to a native type, StringValue, 
TimestampValue, etc.
   /// This should only be used if this *Val is not null.
-  llvm::Value* ToNativeValue();
+  ///
+  /// If 'pool' is non-NULL, var-len data will be copied into 'pool'.
+  llvm::Value* ToNativeValue(MemPool* pool = NULL);
 
   /// Sets 'native_ptr' to this *Val's value. If non-NULL, 'native_ptr' should 
be a
   /// pointer to a native type, StringValue, TimestampValue, etc. If NULL, a 
pointer is
   /// alloca'd. In either case the pointer is returned. This should only be 
used if this
   /// *Val is not null.
-  llvm::Value* ToNativePtr(llvm::Value* native_ptr = NULL);
+  ///
+  /// If 'pool' is non-NULL, var-len data will be copied into 'pool'.
+  llvm::Value* ToNativePtr(llvm::Value* native_ptr = NULL, MemPool* pool = 
NULL);
+
+  /// Writes this *Val's value to the appropriate slot in 'tuple' if non-null, 
or sets the
+  /// appropriate null bit if null. This assumes null bits are initialized to 
0. Analogous
+  /// to RawValue::Write(void* value, Tuple*, SlotDescriptor*, MemPool*). 
'tuple' should
+  /// be a pointer to the generated LLVM struct type, not an opaque Tuple*.
+  ///
+  /// Creates new basic blocks in order to branch on the 'is_null' fields, and 
leaves
+  /// builder_'s insert point at the block after these new blocks. This block 
will be
+  /// 'insert_before' if specified, or a new basic block created at the end of 
the
+  /// function if 'insert_before' is NULL.
+  ///
+  /// If 'pool' is non-NULL, var-len data will be copied into 'pool'.
+  void WriteToSlot(const SlotDescriptor& slot_desc, llvm::Value* tuple,
+      MemPool* pool = NULL, llvm::BasicBlock* insert_before = NULL);
 
   /// Returns the i1 result of this == other. this and other must be non-null.
   llvm::Value* Eq(CodegenAnyVal* other);

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/8e8df2f2/be/src/codegen/gen_ir_descriptions.py
----------------------------------------------------------------------
diff --git a/be/src/codegen/gen_ir_descriptions.py 
b/be/src/codegen/gen_ir_descriptions.py
index eb9d16e..71fb1a7 100755
--- a/be/src/codegen/gen_ir_descriptions.py
+++ b/be/src/codegen/gen_ir_descriptions.py
@@ -114,6 +114,9 @@ ir_functions = [
   ["IS_NULL_STRING", "IrIsNullString"],
   ["GENERIC_IS_NULL_STRING", "IrGenericIsNullString"],
   ["RAW_VALUE_COMPARE", "8RawValue7Compare"],
+  ["TOPN_NODE_INSERT_BATCH", "TopNNode11InsertBatch"],
+  ["MEMPOOL_ALLOCATE", "MemPool8AllocateILb0"],
+  ["MEMPOOL_CHECKED_ALLOCATE", "MemPool8AllocateILb1"],
 ]
 
 enums_preamble = '\

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/8e8df2f2/be/src/codegen/impala-ir.cc
----------------------------------------------------------------------
diff --git a/be/src/codegen/impala-ir.cc b/be/src/codegen/impala-ir.cc
index 7f4faba..c419efd 100644
--- a/be/src/codegen/impala-ir.cc
+++ b/be/src/codegen/impala-ir.cc
@@ -26,6 +26,7 @@
 #include "exec/hdfs-scanner-ir.cc"
 #include "exec/partitioned-aggregation-node-ir.cc"
 #include "exec/partitioned-hash-join-node-ir.cc"
+#include "exec/topn-node-ir.cc"
 #include "exprs/aggregate-functions-ir.cc"
 #include "exprs/cast-functions-ir.cc"
 #include "exprs/compound-predicates-ir.cc"
@@ -42,6 +43,7 @@
 #include "exprs/timestamp-functions-ir.cc"
 #include "exprs/udf-builtins-ir.cc"
 #include "exprs/utility-functions-ir.cc"
+#include "runtime/mem-pool.h"
 #include "runtime/raw-value-ir.cc"
 #include "udf/udf-ir.cc"
 #include "util/hash-util-ir.cc"

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/8e8df2f2/be/src/codegen/llvm-codegen.cc
----------------------------------------------------------------------
diff --git a/be/src/codegen/llvm-codegen.cc b/be/src/codegen/llvm-codegen.cc
index bbfd419..2b3e245 100644
--- a/be/src/codegen/llvm-codegen.cc
+++ b/be/src/codegen/llvm-codegen.cc
@@ -954,26 +954,39 @@ Status LlvmCodeGen::LoadIntrinsics() {
 void LlvmCodeGen::CodegenMemcpy(LlvmBuilder* builder, Value* dst, Value* src, 
int size) {
   DCHECK_GE(size, 0);
   if (size == 0) return;
+  Value* size_val = GetIntConstant(TYPE_BIGINT, size);
+  CodegenMemcpy(builder, dst, src, size_val);
+}
 
-  // Cast src/dst to int8_t*.  If they already are, this will get optimized 
away
-  DCHECK(PointerType::classof(dst->getType()));
-  DCHECK(PointerType::classof(src->getType()));
-  dst = builder->CreateBitCast(dst, ptr_type());
-  src = builder->CreateBitCast(src, ptr_type());
-
-  // Get intrinsic function.
-  Function* memcpy_fn = llvm_intrinsics_[Intrinsic::memcpy];
-  DCHECK(memcpy_fn != NULL);
-
-  // The fourth argument is the alignment.  For non-zero values, the caller
-  // must guarantee that the src and dst values are aligned to that byte 
boundary.
-  // TODO: We should try to take advantage of this since our tuples are well 
aligned.
-  Value* args[] = {
-    dst, src, GetIntConstant(TYPE_INT, size),
-    GetIntConstant(TYPE_INT, 0),
-    false_value()                       // is_volatile.
-  };
-  builder->CreateCall(memcpy_fn, args);
+void LlvmCodeGen::CodegenMemcpy(LlvmBuilder* builder, Value* dst, Value* src,
+    Value* size) {
+  DCHECK(dst->getType()->isPointerTy()) << Print(dst);
+  DCHECK(src->getType()->isPointerTy()) << Print(src);
+  builder->CreateMemCpy(dst, src, size, /* no alignment */ 0);
+}
+
+void LlvmCodeGen::CodegenMemset(LlvmBuilder* builder, Value* dst, int value, 
int size) {
+  DCHECK(dst->getType()->isPointerTy()) << Print(dst);
+  DCHECK_GE(size, 0);
+  if (size == 0) return;
+  Value* value_const = GetIntConstant(TYPE_TINYINT, value);
+  builder->CreateMemSet(dst, value_const, size, /* no alignment */ 0);
+}
+
+Value* LlvmCodeGen::CodegenAllocate(LlvmBuilder* builder, MemPool* pool, 
Value* size,
+    const char* name) {
+  DCHECK(pool != NULL);
+  DCHECK(size->getType()->isIntegerTy());
+  DCHECK_LE(size->getType()->getIntegerBitWidth(), 64);
+  // Extend 'size' to i64 if necessary
+  if (size->getType()->getIntegerBitWidth() < 64) {
+    size = builder->CreateSExt(size, bigint_type());
+  }
+  Function* allocate_fn = GetFunction(IRFunction::MEMPOOL_ALLOCATE, false);
+  PointerType* pool_type = GetPtrType(MemPool::LLVM_CLASS_NAME);
+  Value* pool_val = CastPtrToLlvmPtr(pool_type, pool);
+  Value* fn_args[] = { pool_val, size };
+  return builder->CreateCall(allocate_fn, fn_args, name);
 }
 
 Value* LlvmCodeGen::CodegenArrayAt(LlvmBuilder* builder, Value* array, int idx,

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/8e8df2f2/be/src/codegen/llvm-codegen.h
----------------------------------------------------------------------
diff --git a/be/src/codegen/llvm-codegen.h b/be/src/codegen/llvm-codegen.h
index dcb6cc6..b4b5223 100644
--- a/be/src/codegen/llvm-codegen.h
+++ b/be/src/codegen/llvm-codegen.h
@@ -249,8 +249,8 @@ class LlvmCodeGen {
   /// false, the module will not be optimized before compilation.
   Status FinalizeModule();
 
-  /// Replaces all instructions in 'caller' that call 'target_name' with a 
call instruction
-  /// to 'new_fn'.  Returns the number of call sites updated.
+  /// Replaces all instructions in 'caller' that call 'target_name' with a call
+  /// instruction to 'new_fn'. Returns the number of call sites updated.
   ///
   /// 'target_name' must be a substring of the mangled symbol of the function 
to be
   /// replaced. This usually means that the unmangled function name is 
sufficient.
@@ -386,7 +386,17 @@ class LlvmCodeGen {
   /// Codegen to call llvm memcpy intrinsic at the current builder location
   /// dst & src must be pointer types. size is the number of bytes to copy.
   /// No-op if size is zero.
-  void CodegenMemcpy(LlvmBuilder*, llvm::Value* dst, llvm::Value* src, int 
size);
+  void CodegenMemcpy(LlvmBuilder* builder, llvm::Value* dst, llvm::Value* src, 
int size);
+  void CodegenMemcpy(LlvmBuilder* builder, llvm::Value* dst, llvm::Value* src,
+      llvm::Value* size);
+
+  /// Codegen to call llvm memset intrinsic at the current builder location. 
'dst' should
+  /// be a pointer. No-op if size is zero.
+  void CodegenMemset(LlvmBuilder* builder, llvm::Value* dst, int value, int 
size);
+
+  /// Codegen to call pool->Allocate(size).
+  llvm::Value* CodegenAllocate(LlvmBuilder* builder, MemPool* pool, 
llvm::Value* size,
+      const char* name = "");
 
   /// Codegens IR to load array[idx] and returns the loaded value. 'array' 
should be a
   /// C-style array (e.g. i32*) or an IR array (e.g. [10 x i32]). This 
function does not

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/8e8df2f2/be/src/exec/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/be/src/exec/CMakeLists.txt b/be/src/exec/CMakeLists.txt
index af86306..c3208fc 100644
--- a/be/src/exec/CMakeLists.txt
+++ b/be/src/exec/CMakeLists.txt
@@ -80,6 +80,7 @@ add_library(Exec
   subplan-node.cc
   text-converter.cc
   topn-node.cc
+  topn-node-ir.cc
   union-node.cc
   unnest-node.cc
 )

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/8e8df2f2/be/src/exec/topn-node-ir.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/topn-node-ir.cc b/be/src/exec/topn-node-ir.cc
new file mode 100644
index 0000000..e56090e
--- /dev/null
+++ b/be/src/exec/topn-node-ir.cc
@@ -0,0 +1,49 @@
+// Copyright 2016 Cloudera Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "exec/topn-node.h"
+
+using namespace impala;
+
+void TopNNode::InsertBatch(RowBatch* batch) {
+  for (int i = 0; i < batch->num_rows(); ++i) {
+    InsertTupleRow(batch->GetRow(i));
+  }
+}
+
+// Insert if either not at the limit or it's a new TopN tuple_row
+void TopNNode::InsertTupleRow(TupleRow* input_row) {
+  Tuple* insert_tuple = NULL;
+
+  if (priority_queue_->size() < limit_ + offset_) {
+    insert_tuple = reinterpret_cast<Tuple*>(
+        tuple_pool_->Allocate(materialized_tuple_desc_->byte_size()));
+    insert_tuple->MaterializeExprs<false, false>(input_row, 
*materialized_tuple_desc_,
+        sort_exec_exprs_.sort_tuple_slot_expr_ctxs(), tuple_pool_.get());
+  } else {
+    DCHECK(!priority_queue_->empty());
+    Tuple* top_tuple = priority_queue_->top();
+    tmp_tuple_->MaterializeExprs<false, true>(input_row, 
*materialized_tuple_desc_,
+        sort_exec_exprs_.sort_tuple_slot_expr_ctxs(), NULL);
+    if (tuple_row_less_than_->Less(tmp_tuple_, top_tuple)) {
+      // TODO: DeepCopy() will allocate new buffers for the string data. This 
needs
+      // to be fixed to use a freelist
+      tmp_tuple_->DeepCopy(top_tuple, *materialized_tuple_desc_, 
tuple_pool_.get());
+      insert_tuple = top_tuple;
+      priority_queue_->pop();
+    }
+  }
+
+  if (insert_tuple != NULL) priority_queue_->push(insert_tuple);
+}

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/8e8df2f2/be/src/exec/topn-node.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/topn-node.cc b/be/src/exec/topn-node.cc
index a0ad825..a73c3b5 100644
--- a/be/src/exec/topn-node.cc
+++ b/be/src/exec/topn-node.cc
@@ -16,6 +16,7 @@
 
 #include <sstream>
 
+#include "codegen/llvm-codegen.h"
 #include "exprs/expr.h"
 #include "runtime/descriptors.h"
 #include "runtime/mem-pool.h"
@@ -33,6 +34,7 @@
 
 using std::priority_queue;
 using namespace impala;
+using namespace llvm;
 
 TopNNode::TopNNode(ObjectPool* pool, const TPlanNode& tnode, const 
DescriptorTbl& descs)
   : ExecNode(pool, tnode, descs),
@@ -41,6 +43,7 @@ TopNNode::TopNNode(ObjectPool* pool, const TPlanNode& tnode, 
const DescriptorTbl
     tuple_row_less_than_(NULL),
     tmp_tuple_(NULL),
     tuple_pool_(NULL),
+    codegend_insert_batch_fn_(NULL),
     num_rows_skipped_(0),
     priority_queue_(NULL) {
 }
@@ -57,10 +60,44 @@ Status TopNNode::Init(const TPlanNode& tnode, RuntimeState* 
state) {
   return Status::OK();
 }
 
+Status TopNNode::Codegen(RuntimeState* state) {
+  DCHECK(materialized_tuple_desc_ != NULL);
+  LlvmCodeGen* codegen;
+  RETURN_IF_ERROR(state->GetCodegen(&codegen));
+  Function* insert_batch_fn =
+      codegen->GetFunction(IRFunction::TOPN_NODE_INSERT_BATCH, true);
+
+  // Generate two MaterializeExprs() functions, one using tuple_pool_ and one 
with no
+  // pool.
+  Function* materialize_exprs_tuple_pool_fn;
+  RETURN_IF_ERROR(Tuple::CodegenMaterializeExprs(state, false, 
*materialized_tuple_desc_,
+      sort_exec_exprs_.sort_tuple_slot_expr_ctxs(), tuple_pool_.get(),
+      &materialize_exprs_tuple_pool_fn));
+
+  Function* materialize_exprs_no_pool_fn;
+  RETURN_IF_ERROR(Tuple::CodegenMaterializeExprs(state, false, 
*materialized_tuple_desc_,
+      sort_exec_exprs_.sort_tuple_slot_expr_ctxs(), NULL, 
&materialize_exprs_no_pool_fn));
+
+  int replaced = codegen->ReplaceCallSites(insert_batch_fn,
+      materialize_exprs_tuple_pool_fn, Tuple::MATERIALIZE_EXPRS_SYMBOL);
+  DCHECK_EQ(replaced, 1) << LlvmCodeGen::Print(insert_batch_fn);
+
+  replaced = codegen->ReplaceCallSites(insert_batch_fn, 
materialize_exprs_no_pool_fn,
+      Tuple::MATERIALIZE_EXPRS_NULL_POOL_SYMBOL);
+  DCHECK_EQ(replaced, 1) << LlvmCodeGen::Print(insert_batch_fn);
+
+  insert_batch_fn = codegen->FinalizeFunction(insert_batch_fn);
+  DCHECK(insert_batch_fn != NULL);
+  codegen->AddFunctionToJit(insert_batch_fn,
+      reinterpret_cast<void**>(&codegend_insert_batch_fn_));
+  return Status::OK();
+}
+
 Status TopNNode::Prepare(RuntimeState* state) {
   SCOPED_TIMER(runtime_profile_->total_time_counter());
   RETURN_IF_ERROR(ExecNode::Prepare(state));
   tuple_pool_.reset(new MemPool(mem_tracker()));
+  materialized_tuple_desc_ = row_descriptor_.tuple_descriptors()[0];
   RETURN_IF_ERROR(sort_exec_exprs_.Prepare(
       state, child(0)->row_desc(), row_descriptor_, expr_mem_tracker()));
   AddExprCtxsToFree(sort_exec_exprs_);
@@ -69,13 +106,16 @@ Status TopNNode::Prepare(RuntimeState* state) {
   bool codegen_enabled = false;
   Status codegen_status;
   if (state->codegen_enabled()) {
+    // TODO: inline tuple_row_less_than_->Compare()
     codegen_status = tuple_row_less_than_->Codegen(state);
+    codegen_status.MergeStatus(Codegen(state));
     codegen_enabled = codegen_status.ok();
   }
   AddCodegenExecOption(codegen_enabled, codegen_status);
   priority_queue_.reset(new priority_queue<Tuple*, vector<Tuple*>,
       ComparatorWrapper<TupleRowComparator> >(*tuple_row_less_than_));
   materialized_tuple_desc_ = row_descriptor_.tuple_descriptors()[0];
+  insert_batch_timer_ = ADD_TIMER(runtime_profile(), "InsertBatchTime");
   return Status::OK();
 }
 
@@ -99,8 +139,13 @@ Status TopNNode::Open(RuntimeState* state) {
     do {
       batch.Reset();
       RETURN_IF_ERROR(child(0)->GetNext(state, &batch, &eos));
-      for (int i = 0; i < batch.num_rows(); ++i) {
-        InsertTupleRow(batch.GetRow(i));
+      {
+        SCOPED_TIMER(insert_batch_timer_);
+        if (codegend_insert_batch_fn_ != NULL) {
+          codegend_insert_batch_fn_(this, &batch);
+        } else {
+          InsertBatch(&batch);
+        }
       }
       RETURN_IF_CANCELLED(state);
       RETURN_IF_ERROR(QueryMaintenance(state));
@@ -161,32 +206,6 @@ void TopNNode::Close(RuntimeState* state) {
   ExecNode::Close(state);
 }
 
-// Insert if either not at the limit or it's a new TopN tuple_row
-void TopNNode::InsertTupleRow(TupleRow* input_row) {
-  Tuple* insert_tuple = NULL;
-
-  if (priority_queue_->size() < limit_ + offset_) {
-    insert_tuple = reinterpret_cast<Tuple*>(
-        tuple_pool_->Allocate(materialized_tuple_desc_->byte_size()));
-    insert_tuple->MaterializeExprs<false>(input_row, *materialized_tuple_desc_,
-        sort_exec_exprs_.sort_tuple_slot_expr_ctxs(), tuple_pool_.get());
-  } else {
-    DCHECK(!priority_queue_->empty());
-    Tuple* top_tuple = priority_queue_->top();
-    tmp_tuple_->MaterializeExprs<false>(input_row, *materialized_tuple_desc_,
-            sort_exec_exprs_.sort_tuple_slot_expr_ctxs(), NULL);
-    if (tuple_row_less_than_->Less(tmp_tuple_, top_tuple)) {
-      // TODO: DeepCopy() will allocate new buffers for the string data. This 
needs
-      // to be fixed to use a freelist
-      tmp_tuple_->DeepCopy(top_tuple, *materialized_tuple_desc_, 
tuple_pool_.get());
-      insert_tuple = top_tuple;
-      priority_queue_->pop();
-    }
-  }
-
-  if (insert_tuple != NULL) priority_queue_->push(insert_tuple);
-}
-
 // Reverse the order of the tuples in the priority queue
 void TopNNode::PrepareForOutput() {
   sorted_top_n_.resize(priority_queue_->size());

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/8e8df2f2/be/src/exec/topn-node.h
----------------------------------------------------------------------
diff --git a/be/src/exec/topn-node.h b/be/src/exec/topn-node.h
index a9e0bd9..29c89e0 100644
--- a/be/src/exec/topn-node.h
+++ b/be/src/exec/topn-node.h
@@ -19,6 +19,7 @@
 #include <queue>
 #include <boost/scoped_ptr.hpp>
 
+#include "codegen/impala-ir.h"
 #include "exec/exec-node.h"
 #include "exec/sort-exec-exprs.h"
 #include "runtime/descriptors.h"  // for TupleId
@@ -53,9 +54,15 @@ class TopNNode : public ExecNode {
 
   friend class TupleLessThan;
 
+  /// Creates a codegen'd version of InsertBatch() that is used in Open().
+  Status Codegen(RuntimeState* state);
+
+  /// Inserts all the rows in 'batch' into the queue.
+  void InsertBatch(RowBatch* batch);
+
   /// Inserts a tuple row into the priority queue if it's in the TopN.  
Creates a deep
   /// copy of tuple_row, which it stores in tuple_pool_.
-  void InsertTupleRow(TupleRow* tuple_row);
+  void IR_ALWAYS_INLINE InsertTupleRow(TupleRow* tuple_row);
 
   /// Flatten and reverse the priority queue.
   void PrepareForOutput();
@@ -86,9 +93,15 @@ class TopNNode : public ExecNode {
   /// Stores everything referenced in priority_queue_.
   boost::scoped_ptr<MemPool> tuple_pool_;
 
-  // Iterator over elements in sorted_top_n_.
+  /// Iterator over elements in sorted_top_n_.
   std::vector<Tuple*>::iterator get_next_iter_;
 
+  typedef void (*InsertBatchFn)(TopNNode*, RowBatch*);
+  InsertBatchFn codegend_insert_batch_fn_;
+
+  /// Timer for time spent in InsertBatch() function (or codegen'd version)
+  RuntimeProfile::Counter* insert_batch_timer_;
+
   /////////////////////////////////////////
   /// BEGIN: Members that must be Reset()
 

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/8e8df2f2/be/src/runtime/descriptors.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/descriptors.cc b/be/src/runtime/descriptors.cc
index dd45a3e..387d377 100644
--- a/be/src/runtime/descriptors.cc
+++ b/be/src/runtime/descriptors.cc
@@ -60,6 +60,8 @@ namespace impala {
 
 const int RowDescriptor::INVALID_IDX;
 
+const char* TupleDescriptor::LLVM_CLASS_NAME = "class.impala::TupleDescriptor";
+
 string NullIndicatorOffset::DebugString() const {
   stringstream out;
   out << "(offset=" << byte_offset

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/8e8df2f2/be/src/runtime/descriptors.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/descriptors.h b/be/src/runtime/descriptors.h
index 45b4183..18d835c 100644
--- a/be/src/runtime/descriptors.h
+++ b/be/src/runtime/descriptors.h
@@ -395,6 +395,8 @@ class TupleDescriptor {
   /// The resulting struct definition is cached.
   llvm::StructType* GetLlvmStruct(LlvmCodeGen* codegen) const;
 
+  static const char* LLVM_CLASS_NAME;
+
  protected:
   friend class DescriptorTbl;
 

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/8e8df2f2/be/src/runtime/mem-pool.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/mem-pool.h b/be/src/runtime/mem-pool.h
index 5079447..9e22e9f 100644
--- a/be/src/runtime/mem-pool.h
+++ b/be/src/runtime/mem-pool.h
@@ -229,6 +229,10 @@ class MemPool {
   }
 };
 
+// Stamp out templated implementations here so they're included in IR module
+template uint8_t* MemPool::Allocate<false>(int64_t size);
+template uint8_t* MemPool::Allocate<true>(int64_t size);
+
 }
 
 #endif

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/8e8df2f2/be/src/runtime/raw-value.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/raw-value.cc b/be/src/runtime/raw-value.cc
index 2874ee1..b8be788 100644
--- a/be/src/runtime/raw-value.cc
+++ b/be/src/runtime/raw-value.cc
@@ -151,6 +151,9 @@ void RawValue::Write(const void* value, void* dst, const 
ColumnType& type,
       dest->len = src->len;
       if (type.type == TYPE_VARCHAR) DCHECK_LE(dest->len, type.len);
       if (pool != NULL) {
+        // Note: if this changes to TryAllocate(), 
CodegenAnyVal::WriteToSlot() will need
+        // to reflect this change as well (the codegen'd Allocate() call is 
actually
+        // generated in CodegenAnyVal::ToNativeValue()).
         dest->ptr = reinterpret_cast<char*>(pool->Allocate(dest->len));
         memcpy(dest->ptr, src->ptr, dest->len);
       } else {

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/8e8df2f2/be/src/runtime/sorter.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/sorter.cc b/be/src/runtime/sorter.cc
index 4640374..96c27df 100644
--- a/be/src/runtime/sorter.cc
+++ b/be/src/runtime/sorter.cc
@@ -454,7 +454,7 @@ Status Sorter::Run::AddBatch(RowBatch* batch, int 
start_index, int* num_processe
       TupleRow* input_row = batch->GetRow(cur_input_index);
       Tuple* new_tuple = 
cur_fixed_len_block->Allocate<Tuple>(sort_tuple_size_);
       if (materialize_slots_) {
-        new_tuple->MaterializeExprs<has_var_len_data>(input_row, 
*sort_tuple_desc_,
+        new_tuple->MaterializeExprs<has_var_len_data, true>(input_row, 
*sort_tuple_desc_,
             sorter_->sort_tuple_slot_expr_ctxs_, NULL, &string_values, 
&total_var_len);
         if (total_var_len > sorter_->block_mgr_->max_block_size()) {
           return Status(ErrorMsg(TErrorCode::INTERNAL_ERROR, Substitute(

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/8e8df2f2/be/src/runtime/tuple.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/tuple.cc b/be/src/runtime/tuple.cc
index c2fda93..4142241 100644
--- a/be/src/runtime/tuple.cc
+++ b/be/src/runtime/tuple.cc
@@ -15,22 +15,31 @@
 #include "runtime/tuple.h"
 
 #include <vector>
+#include "llvm/IR/Function.h"
 
+#include "codegen/codegen-anyval.h"
+#include "codegen/llvm-codegen.h"
 #include "exprs/expr.h"
 #include "exprs/expr-context.h"
 #include "runtime/collection-value.h"
 #include "runtime/descriptors.h"
 #include "runtime/mem-pool.h"
 #include "runtime/raw-value.h"
-#include "runtime/tuple-row.h"
+#include "runtime/runtime-state.h"
 #include "runtime/string-value.h"
+#include "runtime/tuple-row.h"
 #include "util/debug-util.h"
 
 #include "common/names.h"
 
+using namespace llvm;
+
 namespace impala {
 
-  const char* Tuple::LLVM_CLASS_NAME = "class.impala::Tuple";
+const char* Tuple::LLVM_CLASS_NAME = "class.impala::Tuple";
+
+const char* Tuple::MATERIALIZE_EXPRS_SYMBOL = "MaterializeExprsILb0ELb0";
+const char* Tuple::MATERIALIZE_EXPRS_NULL_POOL_SYMBOL = 
"MaterializeExprsILb0ELb1";
 
 int64_t Tuple::TotalByteSize(const TupleDescriptor& desc) const {
   int64_t result = desc.byte_size();
@@ -189,18 +198,13 @@ void Tuple::ConvertOffsetsToPointers(const 
TupleDescriptor& desc, uint8_t* tuple
   }
 }
 
-template <bool collect_string_vals>
+template <bool COLLECT_STRING_VALS, bool NO_POOL>
 void Tuple::MaterializeExprs(
-    TupleRow* row, const TupleDescriptor& desc,
-    const vector<ExprContext*>& materialize_expr_ctxs, MemPool* pool,
-    vector<StringValue*>* non_null_string_values, int* total_string) {
-  DCHECK_EQ(materialize_expr_ctxs.size(), desc.slots().size());
-  if (collect_string_vals) {
-    non_null_string_values->clear();
-    *total_string = 0;
-  }
+    TupleRow* row, const TupleDescriptor& desc, ExprContext* const* 
materialize_expr_ctxs,
+    MemPool* pool, StringValue** non_null_string_values, int* 
total_string_lengths,
+    int* num_non_null_string_values) {
   memset(this, 0, desc.num_null_bytes());
-  // Evaluate the output_slot_exprs and place the results in the tuples.
+  // Evaluate the materialize_expr_ctxs and place the results in the tuple.
   for (int i = 0; i < desc.slots().size(); ++i) {
     SlotDescriptor* slot_desc = desc.slots()[i];
     // The FE ensures we don't get any TYPE_NULL expressions by picking an 
arbitrary type
@@ -212,10 +216,11 @@ void Tuple::MaterializeExprs(
     if (src != NULL) {
       void* dst = GetSlot(slot_desc->tuple_offset());
       RawValue::Write(src, dst, slot_desc->type(), pool);
-      if (collect_string_vals && slot_desc->type().IsVarLenStringType()) {
+      if (COLLECT_STRING_VALS && slot_desc->type().IsVarLenStringType()) {
         StringValue* string_val = reinterpret_cast<StringValue*>(dst);
-        non_null_string_values->push_back(string_val);
-        *total_string += string_val->len;
+        *(non_null_string_values++) = string_val;
+        *total_string_lengths += string_val->len;
+        ++(*num_non_null_string_values);
       }
     } else {
       SetNull(slot_desc->null_indicator_offset());
@@ -223,11 +228,171 @@ void Tuple::MaterializeExprs(
   }
 }
 
-template void Tuple::MaterializeExprs<false>(TupleRow* row, const 
TupleDescriptor& desc,
-    const vector<ExprContext*>& materialize_expr_ctxs, MemPool* pool,
-    vector<StringValue*>* non_null_var_values, int* total_var_len);
+// Codegens an unrolled version of MaterializeExprs(). Uses codegen'd exprs 
and slot
+// writes. If 'pool' is non-NULL, string data is copied into it. Note that the 
generated
+// function ignores its 'pool' arg; instead we hardcode the pointer in the IR.
+//
+// Example IR for materializing an int column and a string column with 
non-NULL 'pool':
+//
+// ; Function Attrs: alwaysinline
+// define void @MaterializeExprs(%"class.impala::Tuple"* %opaque_tuple,
+//     %"class.impala::TupleRow"* %row, %"class.impala::TupleDescriptor"* 
%desc,
+//     %"class.impala::ExprContext"** %materialize_expr_ctxs,
+//     %"class.impala::MemPool"* %pool,
+//     %"struct.impala::StringValue"** %non_null_string_values,
+//     i32* %total_string_lengths, i32* %num_non_null_string_values) #20 {
+// entry:
+//   %tuple = bitcast %"class.impala::Tuple"* %opaque_tuple to
+//       { i8, i32, %"struct.impala::StringValue" }*
+//   %0 = bitcast { i8, i32, %"struct.impala::StringValue" }* %tuple to i8*
+//   call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 1, i32 0, i1 false)
+//   %1 = getelementptr %"class.impala::ExprContext"** %materialize_expr_ctxs, 
i32 0
+//   %expr_ctx = load %"class.impala::ExprContext"** %1
+//   %src = call i64 @GetSlotRef4(%"class.impala::ExprContext"* %expr_ctx,
+//       %"class.impala::TupleRow"* %row)
+//   ; ----- generated by CodegenAnyVal::WriteToSlot() 
----------------------------------
+//   %is_null = trunc i64 %src to i1
+//   br i1 %is_null, label %null, label %non_null
+//
+// non_null:                                         ; preds = %entry
+//   %slot = getelementptr inbounds { i8, i32, %"struct.impala::StringValue" 
}* %tuple,
+//       i32 0, i32 1
+//   %2 = ashr i64 %src, 32
+//   %3 = trunc i64 %2 to i32
+//   store i32 %3, i32* %slot
+//   br label %end_write
+//
+// null:                                             ; preds = %entry
+//   call void @SetNull6({ i8, i32, %"struct.impala::StringValue" }* %tuple)
+//   br label %end_write
+//
+// end_write:                                        ; preds = %null, %non_null
+//   ; ----- end CodegenAnyVal::WriteToSlot() 
-------------------------------------------
+//   %4 = getelementptr %"class.impala::ExprContext"** %materialize_expr_ctxs, 
i32 1
+//   %expr_ctx1 = load %"class.impala::ExprContext"** %4
+//   %src2 = call { i64, i8* } @GetSlotRef5(%"class.impala::ExprContext"* 
%expr_ctx1,
+//       %"class.impala::TupleRow"* %row)
+//   ; ----- generated by CodegenAnyVal::WriteToSlot() 
----------------------------------
+//   %5 = extractvalue { i64, i8* } %src2, 0
+//   %is_null5 = trunc i64 %5 to i1
+//   br i1 %is_null5, label %null4, label %non_null3
+//
+// non_null3:                                        ; preds = %end_write
+//   %slot7 = getelementptr inbounds { i8, i32, %"struct.impala::StringValue" 
}* %tuple,
+//       i32 0, i32 2
+//   %6 = extractvalue { i64, i8* } %src2, 0
+//   %7 = ashr i64 %6, 32
+//   %8 = trunc i64 %7 to i32
+//   %9 = insertvalue %"struct.impala::StringValue" zeroinitializer, i32 %8, 1
+//   %new_ptr = call i8* @_ZN6impala7MemPool8AllocateILb0EEEPhi(
+//       %"class.impala::MemPool"* inttoptr (i64 159661008 to 
%"class.impala::MemPool"*),
+//       i32 %8)
+//   %src8 = extractvalue { i64, i8* } %src2, 1
+//   call void @llvm.memcpy.p0i8.p0i8.i32(i8* %new_ptr, i8* %src8, i32 %8, i32 
0,
+//       i1 false)
+//   %10 = insertvalue %"struct.impala::StringValue" %9, i8* %new_ptr, 0
+//   store %"struct.impala::StringValue" %10, %"struct.impala::StringValue"* 
%slot7
+//   br label %end_write6
+//
+// null4:                                            ; preds = %end_write
+//   call void @SetNull7({ i8, i32, %"struct.impala::StringValue" }* %tuple)
+//   br label %end_write6
+//
+// end_write6:                                       ; preds = %null4, 
%non_null3
+//   ; ----- end CodegenAnyVal::WriteToSlot() 
-------------------------------------------
+//   ret void
+// }
+Status Tuple::CodegenMaterializeExprs(RuntimeState* state, bool 
collect_string_vals,
+    const TupleDescriptor& desc, const vector<ExprContext*>& 
materialize_expr_ctxs,
+    MemPool* pool, Function** fn) {
+  DCHECK(!collect_string_vals) << "CodegenMaterializeExprs: 
collect_string_vals NYI";
+  LlvmCodeGen* codegen;
+  RETURN_IF_ERROR(state->GetCodegen(&codegen));
+  SCOPED_TIMER(codegen->codegen_timer());
+  LLVMContext& context = codegen->context();
+
+  // Codegen each compute function from materialize_expr_ctxs
+  Function* materialize_expr_fns[materialize_expr_ctxs.size()];
+  for (int i = 0; i < materialize_expr_ctxs.size(); ++i) {
+    Status status = 
materialize_expr_ctxs[i]->root()->GetCodegendComputeFn(state,
+        &materialize_expr_fns[i]);
+    if (!status.ok()) {
+      stringstream ss;
+      ss << "Could not codegen CodegenMaterializeExprs: " << 
status.GetDetail();
+      return Status(ss.str());
+    }
+  }
+
+  // Construct function signature (this must exactly match the actual 
signature since it's
+  // used in xcompiled IR). With 'pool':
+  // void MaterializeExprs(Tuple* tuple, TupleRow* row, TupleDescriptor* desc,
+  //     ExprContext** materialize_expr_ctxs, MemPool* pool,
+  //     StringValue** non_null_string_values, int* total_string_lengths,
+  //     int* num_non_null_string_values)
+  PointerType* opaque_tuple_type = codegen->GetPtrType(Tuple::LLVM_CLASS_NAME);
+  PointerType* row_type = codegen->GetPtrType(TupleRow::LLVM_CLASS_NAME);
+  PointerType* desc_type = 
codegen->GetPtrType(TupleDescriptor::LLVM_CLASS_NAME);
+  PointerType* expr_ctxs_type =
+      codegen->GetPtrType(codegen->GetPtrType(ExprContext::LLVM_CLASS_NAME));
+  PointerType* pool_type = codegen->GetPtrType(MemPool::LLVM_CLASS_NAME);
+  PointerType* string_values_type =
+      codegen->GetPtrType(codegen->GetPtrType(StringValue::LLVM_CLASS_NAME));
+  PointerType* int_ptr_type = codegen->GetPtrType(TYPE_INT);
+  LlvmCodeGen::FnPrototype prototype(codegen, "MaterializeExprs", 
codegen->void_type());
+  prototype.AddArgument("opaque_tuple", opaque_tuple_type);
+  prototype.AddArgument("row", row_type);
+  prototype.AddArgument("desc", desc_type);
+  prototype.AddArgument("materialize_expr_ctxs", expr_ctxs_type);
+  prototype.AddArgument("pool", pool_type);
+  prototype.AddArgument("non_null_string_values", string_values_type);
+  prototype.AddArgument("total_string_lengths", int_ptr_type);
+  prototype.AddArgument("num_non_null_string_values", int_ptr_type);
+
+  LlvmCodeGen::LlvmBuilder builder(context);
+  // Must have one element per AddArgument() call above (eight), since this
+  // array is handed to GeneratePrototype() to receive the argument values.
+  Value* args[8];
+  *fn = prototype.GeneratePrototype(&builder, args);
+  Value* opaque_tuple_arg = args[0];
+  Value* row_arg = args[1];
+  Value* expr_ctxs_arg = args[3];
+  // 'desc', 'pool', 'non_null_string_values', 'total_string_lengths', and
+  // 'num_non_null_string_values' are unused
+
+  // Cast the opaque Tuple* argument to the generated struct type
+  Type* tuple_struct_type = desc.GetLlvmStruct(codegen);
+  PointerType* tuple_type = codegen->GetPtrType(tuple_struct_type);
+  Value* tuple = builder.CreateBitCast(opaque_tuple_arg, tuple_type, "tuple");
+
+  // Memset tuple's null bytes
+  codegen->CodegenMemset(&builder, tuple, 0, desc.num_null_bytes());
+
+  // Evaluate the materialize_expr_ctxs and place the results in the tuple.
+  for (int i = 0; i < desc.slots().size(); ++i) {
+    SlotDescriptor* slot_desc = desc.slots()[i];
+    DCHECK(slot_desc->type().type == TYPE_NULL ||
+        slot_desc->type() == materialize_expr_ctxs[i]->root()->type());
+
+    // Call materialize_expr_fns[i](materialize_expr_ctxs[i], row)
+    Value* expr_ctx = codegen->CodegenArrayAt(&builder, expr_ctxs_arg, i, 
"expr_ctx");
+    Value* expr_args[] = { expr_ctx, row_arg };
+    CodegenAnyVal src = CodegenAnyVal::CreateCallWrapped(codegen, &builder,
+        materialize_expr_ctxs[i]->root()->type(),
+        materialize_expr_fns[i], expr_args, "src");
+
+    // Write expr result 'src' to slot
+    src.WriteToSlot(*slot_desc, tuple, pool);
+  }
+  builder.CreateRetVoid();
+  // TODO: if pool != NULL, OptimizeFunctionWithExprs() is inlining the 
Allocate()
+  // call. Investigate if this is a good thing.
+  *fn = codegen->FinalizeFunction(*fn);
+  return Status::OK();
+}
 
-template void Tuple::MaterializeExprs<true>(TupleRow* row, const 
TupleDescriptor& desc,
-    const vector<ExprContext*>& materialize_expr_ctxs, MemPool* pool,
-    vector<StringValue*>* non_null_var_values, int* total_var_len);
+template void Tuple::MaterializeExprs<false, false>(TupleRow*, const 
TupleDescriptor&,
+    ExprContext* const*, MemPool*, StringValue**, int*, int*);
+template void Tuple::MaterializeExprs<false, true>(TupleRow*, const 
TupleDescriptor&,
+    ExprContext* const*, MemPool*, StringValue**, int*, int*);
+template void Tuple::MaterializeExprs<true, false>(TupleRow*, const 
TupleDescriptor&,
+    ExprContext* const*, MemPool*, StringValue**, int*, int*);
+template void Tuple::MaterializeExprs<true, true>(TupleRow*, const 
TupleDescriptor&,
+    ExprContext* const*, MemPool*, StringValue**, int*, int*);
 }

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/8e8df2f2/be/src/runtime/tuple.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/tuple.h b/be/src/runtime/tuple.h
index 6151cb4..c8a3e07 100644
--- a/be/src/runtime/tuple.h
+++ b/be/src/runtime/tuple.h
@@ -17,11 +17,16 @@
 #define IMPALA_RUNTIME_TUPLE_H
 
 #include <cstring>
+#include "codegen/impala-ir.h"
 #include "common/logging.h"
 #include "gutil/macros.h"
 #include "runtime/descriptors.h"
 #include "runtime/mem-pool.h"
 
+namespace llvm {
+class Function;
+}
+
 namespace impala {
 
 struct CollectionValue;
@@ -103,19 +108,63 @@ class Tuple {
   /// regardless of this tuple's offset in 'tuple_data'.
   void ConvertOffsetsToPointers(const TupleDescriptor& desc, uint8_t* 
tuple_data);
 
-  /// Materialize this by evaluating the expressions in materialize_exprs
-  /// over the specified 'row'. 'pool' is used to allocate var-length data.
-  /// (Memory for this tuple itself must already be allocated.)
-  /// If collect_string_vals is true, the materialized non-NULL string value
-  /// slots and the total length of the string slots are returned in var_values
-  /// and total_string.
+  /// Materialize 'this' by evaluating the expressions in 
'materialize_expr_ctxs' over
+  /// the specified 'row'.
+  ///
+  /// If non-NULL, 'pool' is used to allocate var-length data, otherwise 
var-length data
+  /// isn't copied. (Memory for this tuple itself must already be allocated.) 
'NULL_POOL'
+  /// should be true if 'pool' is NULL and false otherwise. The template 
parameter serves
+  /// only to differentiate the NULL vs. non-NULL pool cases when we replace 
the function
+  /// calls during codegen; the parameter means there are two different 
function symbols.
+  ///
+  /// If 'COLLECT_STRING_VALS' is true, the materialized non-NULL string value 
slots and
+  /// the total length of the string slots are returned in 
'non_null_string_values' and
+  /// 'total_string_lengths'. 'non_null_string_values' and 
'total_string_lengths' must be
+  /// non-NULL in this case. 'non_null_string_values' does not need to be 
empty; its
+  /// original contents will be overwritten.
+
   /// TODO: this function does not collect other var-len types such as 
collections.
-  template <bool collect_string_vals>
-  void MaterializeExprs(
-      TupleRow* row, const TupleDescriptor& desc,
-      const std::vector<ExprContext*>& materialize_expr_ctxs, MemPool* pool,
-      std::vector<StringValue*>* non_null_string_values = NULL,
-      int* total_string = NULL);
+  template <bool COLLECT_STRING_VALS, bool NULL_POOL>
+  inline void IR_ALWAYS_INLINE MaterializeExprs(TupleRow* row,
+      const TupleDescriptor& desc, const std::vector<ExprContext*>& 
materialize_expr_ctxs,
+      MemPool* pool, std::vector<StringValue*>* non_null_string_values = NULL,
+      int* total_string_lengths = NULL) {
+    DCHECK_EQ(NULL_POOL, pool == NULL);
+    DCHECK_EQ(materialize_expr_ctxs.size(), desc.slots().size());
+    StringValue** non_null_string_values_array = NULL;
+    int num_non_null_string_values = 0;
+    if (COLLECT_STRING_VALS) {
+      DCHECK(non_null_string_values != NULL);
+      DCHECK(total_string_lengths != NULL);
+      // vector::resize() will zero-initialize any new values, so we resize to 
the largest
+      // possible size here, then truncate the vector below once we know the 
actual size
+      // (which preserves already-written values).
+      non_null_string_values->resize(desc.string_slots().size());
+      non_null_string_values_array = non_null_string_values->data();
+      *total_string_lengths = 0;
+    }
+    MaterializeExprs<COLLECT_STRING_VALS, NULL_POOL>(row, desc,
+        materialize_expr_ctxs.data(), pool, non_null_string_values_array,
+        total_string_lengths, &num_non_null_string_values);
+    if (COLLECT_STRING_VALS) 
non_null_string_values->resize(num_non_null_string_values);
+  }
+
+  /// Symbols (or substrings of the symbols) of MaterializeExprs(). These can 
be passed to
+  /// LlvmCodeGen::ReplaceCallSites().
+  static const char* MATERIALIZE_EXPRS_SYMBOL;
+  static const char* MATERIALIZE_EXPRS_NULL_POOL_SYMBOL;
+
+  /// Generates an IR version of MaterializeExprs(), returned in 'fn'. 
Currently only
+  /// 'collect_string_vals' = false is implemented.
+  ///
+  /// 'pool' may be NULL, in which case no pool-related code is generated. 
Otherwise
+  /// 'pool's address is used directly in the IR. Note that this requires 
generating
+  /// separate functions for the non-NULL and NULL cases, i.e., the 'pool' 
argument of the
+  /// generated function is ignored. There are two different MaterializeExprs 
symbols to
+  /// differentiate these cases when we replace the function calls during 
codegen.
+  static Status CodegenMaterializeExprs(RuntimeState* state, bool 
collect_string_vals,
+      const TupleDescriptor& desc, const vector<ExprContext*>& 
materialize_expr_ctxs,
+      MemPool* pool, llvm::Function** fn);
 
   /// Turn null indicator bit on. For non-nullable slots, the mask will be 0 
and
   /// this is a no-op (but we don't have to branch to check is slots are 
nulalble).
@@ -183,6 +232,14 @@ class Tuple {
   /// and referenced collection and string data.
   void DeepCopyVarlenData(const TupleDescriptor& desc, char** data, int* 
offset,
       bool convert_ptrs);
+
+  /// Implementation of MaterializeExprs(). This function is replaced during
+  /// codegen. 'num_non_null_string_values' must be initialized by the caller.
+  template <bool COLLECT_STRING_VALS, bool NULL_POOL>
+  void IR_NO_INLINE MaterializeExprs(TupleRow* row, const TupleDescriptor& 
desc,
+      ExprContext* const* materialize_expr_ctxs, MemPool* pool,
+      StringValue** non_null_string_values, int* total_string_lengths,
+      int* num_non_null_string_values);
 };
 
 }

Reply via email to